R for Data Science¶

The content here is based on R for Data Science. See chapter 3, Data Visualization, and chapter 5, Data Manipulation, for more details.

Here, I'd like to cover

Data Visualization using ggplot
Data Manipulation using dplyr

Data Visualization¶

First, load the tidyverse library.

library(tidyverse)

-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
√ ggplot2 3.2.1     √ purrr   0.3.2
√ tibble  2.1.3     √ dplyr   0.8.3
√ tidyr   0.8.3     √ stringr 1.4.0
√ readr   1.3.1     √ forcats 0.4.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()

We will work with the mpg data set, which contains observations collected by the US Environmental Protection Agency on 38 models of car.

# Show the first five rows of mpg.
head(mpg, 5)

Aesthetic Mappings¶

ggplot(data = <DATA>, mapping = aes(x = <X>, y = <Y>, color = <COLOR>, size = <SIZE>, alpha = <ALPHA>, shape = <SHAPE>))

x and y¶

# Scatter plot with displ on the x axis and hwy on the y axis.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point()

color¶

# Same scatter plot, with class mapped to the point color.
ggplot(mpg, aes(x = displ, y = hwy, color = class)) +
  geom_point()

size¶

# Same scatter plot, with cyl mapped to the point size.
ggplot(mpg, aes(x = displ, y = hwy, size = cyl)) +
  geom_point()

alpha¶

# Same scatter plot, with cyl mapped to the point transparency.
ggplot(mpg, aes(x = displ, y = hwy, alpha = cyl)) +
  geom_point()

shape¶

# Same scatter plot, with cyl mapped to the point shape. cyl is wrapped in
# factor() so that it is handled as a discrete variable.
ggplot(mpg, aes(x = displ, y = hwy, shape = factor(cyl))) +
  geom_point()

Split your plot into facets (facet_wrap, facet_grid)

facet_wrap for 1d split
facet_grid for 2d split

facet_wrap¶

# One panel per class, wrapped into two rows.
mpg %>%
  ggplot(aes(x = displ, y = hwy)) +
  geom_point() +
  facet_wrap(~class, nrow = 2)

facet_grid¶

# Grid of panels: drv defines the rows, cyl defines the columns.
mpg %>%
  ggplot(aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(drv ~ cyl)

Geometric objects¶

Functions start with geom_

Plot geometrical objects

# A single point layer.
ggplot(mpg, aes(displ, hwy)) +
  geom_point()

# Point layer with a smoothed line layer added on top.
ggplot(mpg, aes(displ, hwy)) +
  geom_point() +
  geom_smooth()

`geom_smooth()` using method = 'loess' and formula 'y ~ x'

geom_bar¶

geom_bar

# Only x is mapped, so geom_bar() counts the rows for each cyl value.
ggplot(mpg, aes(cyl)) +
  geom_bar()

# Small table of pre-computed frequencies, built column by column.
demo <- tibble(
  cut = c("Fair", "Good", "Very Good", "Premium", "Ideal"),
  freq = c(1610, 4906, 12082, 13791, 21551)
)
demo

# The bar heights are already stored in freq, so use geom_col(), the
# ggplot2 shortcut for geom_bar(stat = "identity").
ggplot(demo, aes(x = cut, y = freq)) +
  geom_col()

Theme¶

# Bar chart of the pre-computed frequencies with the black-and-white theme.
# geom_col() is the ggplot2 shortcut for geom_bar(stat = "identity").
ggplot(demo, aes(x = cut, y = freq)) +
  geom_col() +
  theme_bw()

# Bar chart of the pre-computed frequencies with a customised theme_bw().
# geom_col() is the ggplot2 shortcut for geom_bar(stat = "identity").
ggplot(demo, aes(x = cut, y = freq)) +
  geom_col() +
  theme_bw() +
  theme(
    # Remove the box drawn around the panel.
    panel.border = element_blank(),
    # Draw the axis lines explicitly.
    # NOTE: ggplot2 >= 3.4.0 deprecates `size` for lines in favour of
    # `linewidth`; `size` is kept to match the ggplot2 3.2.1 session used here.
    axis.line = element_line(size = .8, color = "black"),
    # Hide the minor grid lines.
    panel.grid.minor = element_blank(),
    # Enlarge all text in the plot.
    text = element_text(size = 25)
  )

Data manipulation¶

x %>% f(y) turns into f(x,y), and x %>% f(y) %>% g(z) turns into g(f(x,y), z)

pipe operator¶

pipe

filter¶

filtering rows

# Show the first ten rows of mpg.
head(mpg, 10)

# Keep only the rows whose year is 1999.
mpg %>%
  filter(year == 1999)

# Keep the rows that satisfy both conditions: year 1999 and model "passat".
mpg %>%
  filter(year == 1999, model == "passat")

# Keep the 1999 rows whose model is one of the listed values.
mpg %>%
  filter(year == 1999, model %in% c("passat", "new beetle"))

select¶

Return a subset of the columns

# Filter the rows first, then keep only three columns.
mpg %>%
  filter(year == 1999, model %in% c("passat", "new beetle")) %>%
  select(model, displ, class)

arrange¶

Reorders the rows according to single or multiple variables

# Sort the filtered rows by displ, smallest first.
mpg %>%
  filter(year == 1999, model %in% c("passat", "new beetle")) %>%
  select(model, displ, class) %>%
  arrange(displ)

# Sort the filtered rows by displ, largest first.
mpg %>%
  filter(year == 1999, model %in% c("passat", "new beetle")) %>%
  select(model, displ, class) %>%
  arrange(desc(displ))

mutate¶

Add columns from existing data

# Add three columns derived from existing ones.
mpg %>%
  filter(year == 1999, model %in% c("passat", "new beetle")) %>%
  select(model, displ, class) %>%
  arrange(desc(displ)) %>%
  mutate(
    # displ multiplied by ten
    displ10 = displ * 10,
    # short labels for the two model names
    model2 = car::recode(model, "'passat'='p'; 'new beetle'='nb' "),
    # labels for displ 2.0 and 2.8; every other value becomes NA
    ex_missing = car::recode(displ, "2.0 = '20'; 2.8 = '28'; else = NA")
  )

# As above, then turn ex_missing into a factor whose missing values get an
# explicit level.
mpg %>%
  filter(year == 1999, model %in% c("passat", "new beetle")) %>%
  select(model, displ, class) %>%
  arrange(desc(displ)) %>%
  mutate(
    displ10 = displ * 10,
    model2 = car::recode(model, "'passat'='p'; 'new beetle'='nb' "),
    ex_missing = car::recode(displ, "2.0 = '20'; 2.8 = '28'; else = NA"),
    # fix the level order: "28" first, then "20"
    ex_missing = factor(ex_missing, levels = c("28", "20")),
    # NOTE: forcats >= 1.0.0 deprecates fct_explicit_na() in favour of
    # fct_na_value_to_level(); kept to match the forcats 0.4.0 session used here.
    ex_missing = fct_explicit_na(ex_missing)
  )

summarize¶

Produce summary statistics for each group (using group_by())

# Same pipeline as before, finished with a per-model summary.
mpg %>%
  filter(year == 1999, model %in% c("passat", "new beetle")) %>%
  select(model, displ, class) %>%
  arrange(desc(displ)) %>%
  mutate(
    displ10 = displ * 10,
    model2 = car::recode(model, "'passat'='p'; 'new beetle'='nb' "),
    ex_missing = car::recode(displ, "2.0 = '20'; 2.8 = '28'; else = NA"),
    ex_missing = factor(ex_missing, levels = c("28", "20")),
    ex_missing = fct_explicit_na(ex_missing)
  ) %>%
  # one group per model
  group_by(model) %>%
  # row count, mean and standard deviation of displ within each group
  summarize(
    n = n(),
    mean_displ = mean(displ),
    sd_displ = sd(displ)
  )

dplyr manipulation followed by ggplot¶

# Run the full dplyr pipeline, then pipe the per-model summary straight into
# ggplot to draw the mean displ per model with +/- one standard deviation.
mpg %>%
  filter(year == 1999, model %in% c("passat", "new beetle")) %>%
  select(model, displ, class) %>%
  arrange(desc(displ)) %>%
  mutate(
    displ10 = displ * 10,
    model2 = car::recode(model, "'passat'='p'; 'new beetle'='nb' "),
    ex_missing = car::recode(displ, "2.0 = '20'; 2.8 = '28'; else = NA"),
    ex_missing = factor(ex_missing, levels = c("28", "20")),
    ex_missing = fct_explicit_na(ex_missing)
  ) %>%
  group_by(model) %>%
  summarize(
    n = n(),
    mean_displ = mean(displ),
    sd_displ = sd(displ)
  ) %>%
  # From here on the layers are combined with `+`, not `%>%`.
  ggplot(aes(x = model, y = mean_displ)) +
  # The heights are already computed, so use geom_col(), the ggplot2 shortcut
  # for geom_bar(stat = "identity").
  geom_col() +
  geom_errorbar(
    aes(ymin = mean_displ - sd_displ, ymax = mean_displ + sd_displ),
    width = .2
  ) +
  theme_bw() +
  theme(text = element_text(size = 25)) +
  labs(x = "Model", y = "Displ")

manufacturer	model	displ	year	cyl	trans	drv	cty	hwy	fl	class
audi	a4	1.8	1999	4	auto(l5)	f	18	29	p	compact
audi	a4	1.8	1999	4	manual(m5)	f	21	29	p	compact
audi	a4	2.0	2008	4	manual(m6)	f	20	31	p	compact
audi	a4	2.0	2008	4	auto(av)	f	21	30	p	compact
audi	a4	2.8	1999	6	auto(l5)	f	16	26	p	compact

manufacturer	model	displ	year	cyl	trans	drv	cty	hwy	fl	class
audi	a4	1.8	1999	4	auto(l5)	f	18	29	p	compact
audi	a4	1.8	1999	4	manual(m5)	f	21	29	p	compact
audi	a4	2.0	2008	4	manual(m6)	f	20	31	p	compact
audi	a4	2.0	2008	4	auto(av)	f	21	30	p	compact
audi	a4	2.8	1999	6	auto(l5)	f	16	26	p	compact
audi	a4	2.8	1999	6	manual(m5)	f	18	26	p	compact
audi	a4	3.1	2008	6	auto(av)	f	18	27	p	compact
audi	a4 quattro	1.8	1999	4	manual(m5)	4	18	26	p	compact
audi	a4 quattro	1.8	1999	4	auto(l5)	4	16	25	p	compact
audi	a4 quattro	2.0	2008	4	manual(m6)	4	20	28	p	compact

manufacturer	model	displ	year	cyl	trans	drv	cty	hwy	fl	class
audi	a4	1.8	1999	4	auto(l5)	f	18	29	p	compact
audi	a4	1.8	1999	4	manual(m5)	f	21	29	p	compact
audi	a4	2.8	1999	6	auto(l5)	f	16	26	p	compact
audi	a4	2.8	1999	6	manual(m5)	f	18	26	p	compact
audi	a4 quattro	1.8	1999	4	manual(m5)	4	18	26	p	compact
audi	a4 quattro	1.8	1999	4	auto(l5)	4	16	25	p	compact
audi	a4 quattro	2.8	1999	6	auto(l5)	4	15	25	p	compact
audi	a4 quattro	2.8	1999	6	manual(m5)	4	17	25	p	compact
audi	a6 quattro	2.8	1999	6	auto(l5)	4	15	24	p	midsize
chevrolet	c1500 suburban 2wd	5.7	1999	8	auto(l4)	r	13	17	r	suv
chevrolet	corvette	5.7	1999	8	manual(m6)	r	16	26	p	2seater
chevrolet	corvette	5.7	1999	8	auto(l4)	r	15	23	p	2seater
chevrolet	k1500 tahoe 4wd	5.7	1999	8	auto(l4)	4	11	15	r	suv
chevrolet	k1500 tahoe 4wd	6.5	1999	8	auto(l4)	4	14	17	d	suv
chevrolet	malibu	2.4	1999	4	auto(l4)	f	19	27	r	midsize
chevrolet	malibu	3.1	1999	6	auto(l4)	f	18	26	r	midsize
dodge	caravan 2wd	2.4	1999	4	auto(l3)	f	18	24	r	minivan
dodge	caravan 2wd	3.0	1999	6	auto(l4)	f	17	24	r	minivan
dodge	caravan 2wd	3.3	1999	6	auto(l4)	f	16	22	r	minivan
dodge	caravan 2wd	3.3	1999	6	auto(l4)	f	16	22	r	minivan
dodge	caravan 2wd	3.8	1999	6	auto(l4)	f	15	22	r	minivan
dodge	caravan 2wd	3.8	1999	6	auto(l4)	f	15	21	r	minivan
dodge	dakota pickup 4wd	3.9	1999	6	auto(l4)	4	13	17	r	pickup
dodge	dakota pickup 4wd	3.9	1999	6	manual(m5)	4	14	17	r	pickup
dodge	dakota pickup 4wd	5.2	1999	8	manual(m5)	4	11	17	r	pickup
dodge	dakota pickup 4wd	5.2	1999	8	auto(l4)	4	11	15	r	pickup
dodge	durango 4wd	3.9	1999	6	auto(l4)	4	13	17	r	suv
dodge	durango 4wd	5.2	1999	8	auto(l4)	4	11	16	r	suv
dodge	durango 4wd	5.9	1999	8	auto(l4)	4	11	15	r	suv
dodge	ram 1500 pickup 4wd	5.2	1999	8	auto(l4)	4	11	15	r	pickup
...	...	...	...	...	...	...	...	...	...	...
toyota	camry	3.0	1999	6	auto(l4)	f	18	26	r	midsize
toyota	camry	3.0	1999	6	manual(m5)	f	18	26	r	midsize
toyota	camry solara	2.2	1999	4	auto(l4)	f	21	27	r	compact
toyota	camry solara	2.2	1999	4	manual(m5)	f	21	29	r	compact
toyota	camry solara	3.0	1999	6	auto(l4)	f	18	26	r	compact
toyota	camry solara	3.0	1999	6	manual(m5)	f	18	26	r	compact
toyota	corolla	1.8	1999	4	auto(l3)	f	24	30	r	compact
toyota	corolla	1.8	1999	4	auto(l4)	f	24	33	r	compact
toyota	corolla	1.8	1999	4	manual(m5)	f	26	35	r	compact
toyota	land cruiser wagon 4wd	4.7	1999	8	auto(l4)	4	11	15	r	suv
toyota	toyota tacoma 4wd	2.7	1999	4	manual(m5)	4	15	20	r	pickup
toyota	toyota tacoma 4wd	2.7	1999	4	auto(l4)	4	16	20	r	pickup
toyota	toyota tacoma 4wd	3.4	1999	6	manual(m5)	4	15	17	r	pickup
toyota	toyota tacoma 4wd	3.4	1999	6	auto(l4)	4	15	19	r	pickup
volkswagen	gti	2.0	1999	4	manual(m5)	f	21	29	r	compact
volkswagen	gti	2.0	1999	4	auto(l4)	f	19	26	r	compact
volkswagen	gti	2.8	1999	6	manual(m5)	f	17	24	r	compact
volkswagen	jetta	1.9	1999	4	manual(m5)	f	33	44	d	compact
volkswagen	jetta	2.0	1999	4	manual(m5)	f	21	29	r	compact
volkswagen	jetta	2.0	1999	4	auto(l4)	f	19	26	r	compact
volkswagen	jetta	2.8	1999	6	auto(l4)	f	16	23	r	compact
volkswagen	jetta	2.8	1999	6	manual(m5)	f	17	24	r	compact
volkswagen	new beetle	1.9	1999	4	manual(m5)	f	35	44	d	subcompact
volkswagen	new beetle	1.9	1999	4	auto(l4)	f	29	41	d	subcompact
volkswagen	new beetle	2.0	1999	4	manual(m5)	f	21	29	r	subcompact
volkswagen	new beetle	2.0	1999	4	auto(l4)	f	19	26	r	subcompact
volkswagen	passat	1.8	1999	4	manual(m5)	f	21	29	p	midsize
volkswagen	passat	1.8	1999	4	auto(l5)	f	18	29	p	midsize
volkswagen	passat	2.8	1999	6	auto(l5)	f	16	26	p	midsize
volkswagen	passat	2.8	1999	6	manual(m5)	f	18	26	p	midsize

manufacturer	model	displ	year	cyl	trans	drv	cty	hwy	fl	class
volkswagen	passat	1.8	1999	4	manual(m5)	f	21	29	p	midsize
volkswagen	passat	1.8	1999	4	auto(l5)	f	18	29	p	midsize
volkswagen	passat	2.8	1999	6	auto(l5)	f	16	26	p	midsize
volkswagen	passat	2.8	1999	6	manual(m5)	f	18	26	p	midsize

manufacturer	model	displ	year	cyl	trans	drv	cty	hwy	fl	class
volkswagen	new beetle	1.9	1999	4	manual(m5)	f	35	44	d	subcompact
volkswagen	new beetle	1.9	1999	4	auto(l4)	f	29	41	d	subcompact
volkswagen	new beetle	2.0	1999	4	manual(m5)	f	21	29	r	subcompact
volkswagen	new beetle	2.0	1999	4	auto(l4)	f	19	26	r	subcompact
volkswagen	passat	1.8	1999	4	manual(m5)	f	21	29	p	midsize
volkswagen	passat	1.8	1999	4	auto(l5)	f	18	29	p	midsize
volkswagen	passat	2.8	1999	6	auto(l5)	f	16	26	p	midsize
volkswagen	passat	2.8	1999	6	manual(m5)	f	18	26	p	midsize

cut	freq
Fair	1610
Good	4906
Very Good	12082
Premium	13791
Ideal	21551