options(digits = 2, width = 60)
# Select the smallest diamond in each colour
ddply(diamonds, .(color), subset, carat == min(carat))
# Select the two smallest diamonds
ddply(diamonds, .(color), subset, order(carat) <= 2)
# Select the 1% largest diamonds in each group
ddply(diamonds, .(color), subset, carat >
quantile(carat, 0.99))
# Select all diamonds bigger than the group average
ddply(diamonds, .(color), subset, price > mean(price))
# Within each colour, scale price to mean 0 and variance 1
ddply(diamonds, .(color), transform, price = scale(price))
# Subtract off group mean
ddply(diamonds, .(color), transform,
price = price - mean(price))
nmissing <- function(x) sum(is.na(x))
nmissing(msleep$name)
nmissing(msleep$brainwt)
nmissing_df <- colwise(nmissing)
nmissing_df(msleep)
# This is shorthand for the previous two steps
colwise(nmissing)(msleep)
msleep2 <- msleep[, -6] # Remove a column to save space
numcolwise(median)(msleep2, na.rm = T)
numcolwise(quantile)(msleep2, na.rm = T)
numcolwise(quantile)(msleep2, probs = c(0.25, 0.75),
na.rm = T)
ddply(msleep2, .(vore), numcolwise(median), na.rm = T)
ddply(msleep2, .(vore), numcolwise(mean), na.rm = T)
my_summary <- function(df) {
with(df, data.frame(
pc_cor = cor(price, carat, method = "spearman"),
lpc_cor = cor(log(price), log(carat))
))
}
ddply(diamonds, .(cut), my_summary)
ddply(diamonds, .(color), my_summary)
# A plot showing the smoothed trends for price vs. carat for each
# colour of diamonds. With the full range of carats (left), the
# standard errors balloon after around two carats because there are
# relatively few diamonds of that size. Restricting attention to
# diamonds of less than two carats (right) focuses on the region where
# we have plenty of data.
qplot(carat, price, data = diamonds, geom = "smooth",
colour = color)
dense <- subset(diamonds, carat < 2)
qplot(carat, price, data = dense, geom = "smooth",
colour = color, fullrange = TRUE)
# Figure~\ref{fig:smooth} with all statistical calculations performed
# by hand. The predicted values (left), and with standard errors
# (right).
library(mgcv)
smooth <- function(df) {
mod <- gam(price ~ s(carat, bs = "cs"), data = df)
grid <- data.frame(carat = seq(0.2, 2, length = 50))
pred <- predict(mod, grid, se = T)
grid$price <- as.vector(pred$fit)
grid$se <- as.vector(pred$se.fit)
grid
}
smoothes <- ddply(dense, .(color), smooth)
qplot(carat, price, data = smoothes, colour = color,
geom = "line")
qplot(carat, price, data = smoothes, colour = color,
geom = "smooth", ymax = price + 2 * se, ymin = price - 2 * se)
mod <- gam(price ~ s(carat, bs = "cs") + color, data = dense)
grid <- with(diamonds, expand.grid(
carat = seq(0.2, 2, length = 50),
color = levels(color)
))
grid$pred <- predict(mod, grid)
qplot(carat, pred, data = grid, colour = color, geom = "line")
# When the economics dataset is stored in wide format, it is easy to
# create separate time series plots for each variable (left and
# centre), and easy to create scatterplots comparing them (right).
qplot(date, uempmed, data = economics, geom = "line")
qplot(date, unemploy, data = economics, geom = "line")
qplot(unemploy, uempmed, data = economics) + geom_smooth()
# The two methods of displaying both series on a single plot produce
# identical plots, but using long data is much easier when you have
# many variables. The series have radically different scales, so we
# only see the pattern in the \code{unemploy} variable. You might not
# even notice \code{uempmed} unless you're paying close attention: it's
# the line at the bottom of the plot.
ggplot(economics, aes(date)) +
geom_line(aes(y = unemploy, colour = "unemploy")) +
geom_line(aes(y = uempmed, colour = "uempmed")) +
scale_colour_hue("variable")
emp <- melt(economics, id = "date",
measure = c("unemploy", "uempmed"))
qplot(date, value, data = emp, geom = "line", colour = variable)
# When the series have very different scales we have two alternatives:
# left, rescale the variables to a common scale, or right, display the
# variables on separate facets and using free scales.
range01 <- function(x) {
rng <- range(x, na.rm = TRUE)
(x - rng[1]) / diff(rng)
}
emp2 <- ddply(emp, .(variable), transform, value = range01(value))
qplot(date, value, data = emp2, geom = "line",
colour = variable, linetype = variable)
qplot(date, value, data = emp, geom = "line") +
facet_grid(variable ~ ., scales = "free_y")
popular <- subset(movies, votes > 1e4)
ratings <- popular[, 7:16]
ratings$.row <- rownames(ratings)
molten <- melt(ratings, id = ".row")
# Variants on the parallel coordinates plot to better display the
# patterns in this highly discrete data. To improve the default pcp
# (top left) we experiment with alpha blending (top right), jittering
# (bottom left) and then both together (bottom right).
pcp <- ggplot(molten, aes(variable, value, group = .row))
pcp + geom_line()
pcp + geom_line(colour = alpha("black", 1 / 20))
jit <- position_jitter(width = 0.25, height = 2.5)
pcp + geom_line(position = jit)
pcp + geom_line(colour = alpha("black", 1 / 20), position = jit)
cl <- kmeans(ratings[1:10], 6)
ratings$cluster <- reorder(factor(cl$cluster), popular$rating)
levels(ratings$cluster) <- seq_along(levels(ratings$cluster))
molten <- melt(ratings, id = c(".row", "cluster"))
# Displaying cluster membership on a parallel coordinates plot. (Left)
# Individual movies coloured by group membership and (right) group
# means.
pcp_cl <- ggplot(molten,
aes(variable, value, group = .row, colour = cluster))
pcp_cl + geom_line(position = jit, alpha = 1/5)
pcp_cl + stat_summary(aes(group = cluster), fun.y = mean,
geom = "line")
# Faceting allows us to display each group in its own panel,
# highlighting the fact that there seems to be considerable variation
# within each group, and suggesting that we need more groups in our
# clustering.
pcp_cl + geom_line(position = jit, colour = alpha("black", 1/5)) +
facet_wrap(~ cluster)
# A simple linear model that doesn't fit the data very well.
qplot(displ, cty, data = mpg) + geom_smooth(method = "lm")
mpgmod <- lm(cty ~ displ, data = mpg)
# (Left) Basic fitted values-residual plot. (Middle) With standardised
# residuals. (Right) With size proportional to Cook's distance. It is
# easy to modify the basic plots when we have access to all of the
# data.
mod <- lm(cty ~ displ, data = mpg)
basic <- ggplot(mod, aes(.fitted, .resid)) +
geom_hline(yintercept = 0, colour = "grey50", size = 0.5) +
geom_point() +
geom_smooth(size = 0.5, se = F)
basic
basic + aes(y = .stdresid)
basic + aes(size = .cooksd) + scale_area("Cook's distance")
# Adding variables from the original data can be enlightening. Here
# when we add the number of cylinders we see that instead of a
# curvi-linear relationship between displacement and city mpg, it is
# essentially linear, conditional on the number of cylinders.
full <- basic %+% fortify(mod, mpg)
full + aes(colour = factor(cyl))
full + aes(displ, colour = factor(cyl))
fortify.Image <- function(model, data, ...) {
colours <- channel(model, "x11")[,,]
colours <- colours[, rev(seq_len(ncol(colours)))]
melt(colours, c("x", "y"))
}
library(EBImage)
img <- readImage("http://had.co.nz/me.jpg", TrueColor)
qplot(x, y, data = img, fill = value, geom="tile") +
scale_fill_identity() + coord_equal()