The content here is based on R for Data Science. See Chapter 3 (Data Visualization) and Chapter 5 (Data Transformation, i.e. data manipulation with dplyr) for more details.
Here, I'd like to cover data visualization with ggplot2 and data manipulation with dplyr.
First, load the tidyverse library:
library(tidyverse)
We will play with mpg, a data set containing observations collected by the US Environmental Protection Agency on 38 models of car.
# Preview the first five rows of mpg.
head(mpg, n = 5)
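General template (replace each <...> placeholder with your data set and variable names; aesthetics you do not need can be omitted):
ggplot(data = <DATA>, mapping = aes(x = <X>, y = <Y>, color = <COLOR>, size = <SIZE>, alpha = <ALPHA>, shape = <SHAPE>))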
# Baseline scatterplot of hwy against displ.
mpg %>%
  ggplot(aes(displ, hwy)) +
  geom_point()

# Same scatterplot with `class` mapped to the point colour.
mpg %>%
  ggplot(aes(displ, hwy, colour = class)) +
  geom_point()

# `cyl` mapped to the point size.
mpg %>%
  ggplot(aes(displ, hwy, size = cyl)) +
  geom_point()

# `cyl` mapped to the point transparency.
mpg %>%
  ggplot(aes(displ, hwy, alpha = cyl)) +
  geom_point()

# `cyl` mapped to the point shape; it is converted to a factor first so that
# each distinct value gets its own shape.
mpg %>%
  ggplot(aes(displ, hwy, shape = factor(cyl))) +
  geom_point()
Split your plot into facets (facet_wrap, facet_grid)
# One panel per value of `class`, wrapped onto two rows.
mpg %>%
  ggplot(aes(displ, hwy)) +
  geom_point() +
  facet_wrap(~class, nrow = 2)

# Grid of panels: `drv` defines the rows and `cyl` the columns.
mpg %>%
  ggplot(aes(displ, hwy)) +
  geom_point() +
  facet_grid(drv ~ cyl)
Geometric objects: the layer functions whose names start with geom_ (for example geom_point, geom_smooth, geom_bar)
# Points only.
ggplot(mpg, aes(displ, hwy)) +
  geom_point()

# Points with a smoothed trend line layered on top (geom_smooth() defaults).
ggplot(mpg, aes(displ, hwy)) +
  geom_point() +
  geom_smooth()

# Bar chart of `cyl`; only x is supplied, so geom_bar() derives the bar
# heights itself.
ggplot(mpg, aes(cyl)) +
  geom_bar()
# Small hand-written table of frequencies, one row per cut.
demo <- tribble(
  ~cut,        ~freq,
  "Fair",       1610,
  "Good",       4906,
  "Very Good", 12082,
  "Premium",   13791,
  "Ideal",     21551
)

# Print the table.
demo
# Bar heights are taken directly from `freq`. geom_col() is the dedicated geom
# for pre-computed heights and replaces geom_bar(stat = "identity").
ggplot(demo, aes(x = cut, y = freq)) +
  geom_col()

# Same chart with the black-and-white theme.
ggplot(demo, aes(x = cut, y = freq)) +
  geom_col() +
  theme_bw()

# Same chart with further theme tweaks on top of theme_bw(): no panel border,
# black axis lines, no minor grid lines, and larger text.
ggplot(demo, aes(x = cut, y = freq)) +
  geom_col() +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    # NOTE(review): recent ggplot2 releases prefer `linewidth` over `size` for
    # line elements; `size` is kept so the code also runs on older versions.
    # Confirm the installed version before switching.
    axis.line = element_line(size = 0.8, color = "black"),
    panel.grid.minor = element_blank(),
    text = element_text(size = 25)
  )
The pipe operator: x %>% f(y) turns into f(x, y), and x %>% f(y) %>% g(z) turns into g(f(x, y), z).
Filter rows with filter()
# Look at the first ten rows before filtering.
head(mpg, n = 10)

# Keep only the rows whose year is 1999.
mpg %>%
  filter(year == 1999)

# Comma-separated conditions must all hold: year 1999 and model "passat".
mpg %>%
  filter(year == 1999, model == "passat")

# %in% matches any of several values: year 1999 and either of two models.
mpg %>%
  filter(year == 1999, model %in% c("passat", "new beetle"))
Return a subset of the columns with select()
# Filter the rows first, then keep only the three columns of interest.
mpg %>%
  filter(year == 1999, model %in% c("passat", "new beetle")) %>%
  select(model, displ, class)
Reorder the rows according to one or more variables with arrange()
# Sort the filtered subset by displ, smallest first.
mpg %>%
  filter(year == 1999, model %in% c("passat", "new beetle")) %>%
  select(model, displ, class) %>%
  arrange(displ)

# Same subset, sorted by displ in descending order.
mpg %>%
  filter(year == 1999, model %in% c("passat", "new beetle")) %>%
  select(model, displ, class) %>%
  arrange(desc(displ))
Add new columns computed from existing ones with mutate()
# Derive new columns from the filtered, sorted subset.
mpg %>%
  filter(year == 1999, model %in% c("passat", "new beetle")) %>%
  select(model, displ, class) %>%
  arrange(desc(displ)) %>%
  mutate(
    # displ multiplied by ten.
    displ10 = 10 * displ,
    # Short codes for the two models: passat -> p, new beetle -> nb.
    model2 = car::recode(model, "'passat'='p'; 'new beetle'='nb' "),
    # Recode displ 2.0 and 2.8; every other value becomes NA.
    ex_missing = car::recode(displ, "2.0 = '20'; 2.8 = '28'; else = NA")
  )

# Same pipeline, additionally turning ex_missing into a factor whose missing
# values form an explicit level.
mpg %>%
  filter(year == 1999, model %in% c("passat", "new beetle")) %>%
  select(model, displ, class) %>%
  arrange(desc(displ)) %>%
  mutate(
    displ10 = 10 * displ,
    model2 = car::recode(model, "'passat'='p'; 'new beetle'='nb' "),
    ex_missing = car::recode(displ, "2.0 = '20'; 2.8 = '28'; else = NA"),
    # Factor with level order "28" then "20", then NA made an explicit level.
    # NOTE(review): recent forcats releases deprecate fct_explicit_na() in
    # favour of fct_na_value_to_level(); kept here so older installations
    # still work. Confirm the installed version before switching.
    ex_missing = fct_explicit_na(factor(ex_missing, levels = c("28", "20")))
  )
Produce summary statistics for each group (using group_by() and summarize())
# Per-model summary of displ for the filtered subset: row count, mean and
# standard deviation.
mpg %>%
  filter(year == 1999, model %in% c("passat", "new beetle")) %>%
  select(model, displ, class) %>%
  arrange(desc(displ)) %>%
  mutate(
    displ10 = 10 * displ,
    model2 = car::recode(model, "'passat'='p'; 'new beetle'='nb' "),
    ex_missing = car::recode(displ, "2.0 = '20'; 2.8 = '28'; else = NA"),
    # NOTE(review): recent forcats releases deprecate fct_explicit_na() in
    # favour of fct_na_value_to_level(); confirm the installed version before
    # switching.
    ex_missing = fct_explicit_na(factor(ex_missing, levels = c("28", "20")))
  ) %>%
  # One output row per model.
  group_by(model) %>%
  summarise(
    n = n(),
    mean_displ = mean(displ),
    sd_displ = sd(displ)
  )
# Summarise displ per model (count, mean, standard deviation) and plot the
# means as bars with error bars of one standard deviation either side.
mpg %>%
  filter(year == 1999, model %in% c("passat", "new beetle")) %>%
  select(model, displ, class) %>%
  arrange(desc(displ)) %>%
  mutate(
    displ10 = displ * 10,
    model2 = car::recode(model, "'passat'='p'; 'new beetle'='nb' "),
    ex_missing = car::recode(displ, "2.0 = '20'; 2.8 = '28'; else = NA"),
    ex_missing = factor(ex_missing, levels = c("28", "20")),
    # NOTE(review): recent forcats releases deprecate fct_explicit_na() in
    # favour of fct_na_value_to_level(); kept so older installations still
    # work. Confirm the installed version before switching.
    ex_missing = fct_explicit_na(ex_missing)
  ) %>%
  group_by(model) %>%
  summarize(
    n = n(),
    mean_displ = mean(displ),
    sd_displ = sd(displ)
  ) %>%
  ggplot(aes(x = model, y = mean_displ)) +
  # The bar heights are the pre-computed means, so geom_col() is used in place
  # of geom_bar(stat = "identity").
  geom_col() +
  geom_errorbar(
    aes(ymin = mean_displ - sd_displ, ymax = mean_displ + sd_displ),
    width = 0.2
  ) +
  theme_bw() +
  theme(text = element_text(size = 25)) +
  xlab("Model") +
  ylab("Displ")