Contingency Table
library(readr)
comics <- read_csv('https://assets.datacamp.com/production/course_1796/datasets/comics.csv')
comics[1,]
## # A tibble: 1 x 11
## name id align eye hair gender gsm alive appearances
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <int>
## 1 Spider-Man~ Secret Good Hazel~ Brown~ Male <NA> Living ~ 4043
## # ... with 2 more variables: first_appear <chr>, publisher <chr>
levels(comics$align)
## NULL
table(comics$align, comics$gender)
##
## Female Male Other
## Bad 1573 7561 32
## Good 2490 4809 17
## Neutral 836 1799 17
## Reformed Criminals 1 2 0
Dropping Levels
The contingency table from the last exercise revealed that there are some levels that have very low counts.
Notice that after dropping levels, ‘Reformed Criminals’ doesn’t exist anymore.
library(dplyr)
#Remove align level
comics <- comics %>%
filter(align != 'Reformed Criminals') %>%
droplevels()
table(comics$align, comics$gender)
##
## Female Male Other
## Bad 1573 7561 32
## Good 2490 4809 17
## Neutral 836 1799 17
Graph: Side-by-side Barcharts
While a contingency table represents the counts numerically, it’s often more useful to represent them graphically.
library(ggplot2)
ggplot(comics, aes(x = gender, fill = align)) +
geom_bar(position = "dodge") + #Decide whether to stack the graph or not.
theme(axis.text.x = element_text(angle = 0 ))#Decide the element text of x axis displaying angle.
Besides counts graphs, we can also make proportional graphs.
tab <- table(comics$align, comics$gender)
options(scipen = 999, digits = 3) # Print fewer digits
prop.table(tab) #Joint proportions
##
## Female Male Other
## Bad 0.082210 0.395160 0.001672
## Good 0.130135 0.251333 0.000888
## Neutral 0.043692 0.094021 0.000888
prop.table(tab, 2) #'2' means Conditional on columns, '1' means Condintional on rows.
##
## Female Male Other
## Bad 0.321 0.534 0.485
## Good 0.508 0.339 0.258
## Neutral 0.171 0.127 0.258
By adding position = “fill” to geom_bar(), you are saying you want the bars to fill the entire height of the plotting window, thus displaying proportions and not raw counts.
ggplot(comics, aes(x = align, fill = gender)) +
geom_bar(position = 'fill') +
ylab("proportion")#Rename y axis.
Distribution of One Categorical Variable
Notice the difference between geom_bar() & geom_col(). geom_bar() is ONE variable graph. However geom_col is TWO variable graph.
* Use facet_wrap(~*) to breakdown a barchart.
#Change the order of the levels in align
comics$align <- factor(comics$align,
levels = c('Bad', 'Neutral', 'Good'))
#Create plot of align
ggplot(comics, aes(x = align)) +
geom_bar()
#Plot of alignment broken down by gender
ggplot(comics, aes(x = align)) +
geom_bar() +
facet_wrap(~ gender)
Faceted Histogram
library(readr)
cars <- read_csv('https://assets.datacamp.com/production/course_1796/datasets/cars04.csv')
cars[1,]
## # A tibble: 1 x 19
## name sports_car suv wagon minivan pickup all_wheel rear_wheel msrp
## <chr> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <int>
## 1 Chevro~ FALSE FALSE FALSE FALSE FALSE FALSE FALSE 11690
## # ... with 10 more variables: dealer_cost <int>, eng_size <dbl>,
## # ncyl <int>, horsepwr <int>, city_mpg <int>, hwy_mpg <int>,
## # weight <int>, wheel_base <int>, length <int>, width <int>
#Filter cars with 4, 6, 8 cylinders
common_cyl <- filter(cars, ncyl %in% c(4, 6, 8))
#Create box plots of city mpg by ncyl
ggplot(common_cyl, aes(x = as.factor(ncyl), y = city_mpg)) +
geom_boxplot()
#Create overlaid density plots for same data
ggplot(common_cyl, aes(x = city_mpg, fill = as.factor(ncyl))) +
geom_density(alpha = .3)
According to this graph, we can conclude that:
Distribution of One Numerical Variable
#Create hist of horsepwr
cars %>%
ggplot(aes(x = horsepwr)) +
geom_histogram() +
ggtitle('horsepower')
#Create hist of horsepwr for affordable cars
cars %>%
filter(msrp < 25000) %>%
ggplot(aes(x = horsepwr)) +
geom_histogram() +
xlim(c(90, 550)) +
ggtitle('horsepower')
Compare these two graphs, notice that afordable cars tend to has less horesepower. NO cars in the less expensive range exceed 250 horsepower.
BinWidth
As binwidth becomes larger, the graphh becomes smoother. It’s crucial to derermine how large is the binwidth.
#Create hist of horsepwr with binwidth of 3
cars %>%
ggplot(aes(x = horsepwr)) +
geom_histogram(binwidth = 3) +
ggtitle('Horsepower binwidth = 3')
#Change binwidth to 30
cars %>%
ggplot(aes(x = horsepwr)) +
geom_histogram(binwidth = 30) +
ggtitle('Horsepower binwidth = 30')
#Change binwidth to 60
cars %>%
ggplot(aes(x = horsepwr)) +
geom_histogram(binwidth = 60) +
ggtitle('Horsepower binwidth = 60')
Density & BoxPlots
#Exclude outliers from data
cars_no_out <- cars %>%
filter(msrp < 100000)
#Construct box plot of msrp using the reduced dataset
cars_no_out %>%
ggplot(aes(x = 1, y = msrp)) +
geom_boxplot()
#Create plot of city_mpg
cars %>%
ggplot(aes(x = 1, y = city_mpg)) +
geom_boxplot()
#Create plot of width
cars %>%
ggplot(aes(x = width)) +
geom_density()
Because the city_mpg variable has a much wider range with its outliers, it’s best to display its distribution as a box plot.
Higher Dimensions
common_cyl %>%
ggplot(aes(x = hwy_mpg)) +
geom_histogram() +
facet_grid(ncyl ~ suv) +
ggtitle('ncyl&suv')
Across both SUVs and non-SUVs, mileage tends to decrease as the number of cylinders increases according to the picture.
Center Measures: Mean, Median &Mode
library(gapminder)
#Compute groupwise mean and median lifeExp
gap2007 <- filter(gapminder, year == 2007)
gap2007 %>%
group_by(continent) %>%
summarize(mean(lifeExp),
median(lifeExp))
## # A tibble: 5 x 3
## continent `mean(lifeExp)` `median(lifeExp)`
## <fct> <dbl> <dbl>
## 1 Africa 54.8 52.9
## 2 Americas 73.6 72.9
## 3 Asia 70.7 72.4
## 4 Europe 77.6 78.6
## 5 Oceania 80.7 80.7
#Generate box plots of lifeExp for each continent
gap2007 %>%
ggplot(aes(x = continent, y = lifeExp)) +
geom_boxplot()
Variance Measures: sd, var,diff & IQR
sum((x - mean(x))^2)/(n-1)
#Compute groupwise measures of spread
gap2007 %>%
group_by(continent) %>%
summarize(sd(lifeExp),
IQR(lifeExp),
n())
## # A tibble: 5 x 4
## continent `sd(lifeExp)` `IQR(lifeExp)` `n()`
## <fct> <dbl> <dbl> <int>
## 1 Africa 9.63 11.6 52
## 2 Americas 4.44 4.63 25
## 3 Asia 7.96 10.2 33
## 4 Europe 2.98 4.78 30
## 5 Oceania 0.729 0.516 2
#Generate overlaid density plots
gap2007 %>%
ggplot(aes(x = lifeExp, fill = continent)) +
geom_density(alpha = 0.3)#alpha is about the transparency.
Shapes & Transformation
#Create density plot of population
gap2007 %>%
ggplot(aes(x = pop)) +
geom_density()
#Create density plot of log(pop)
gap2007 %>%
ggplot(aes(x = log(pop))) +
geom_density()
The difference is much easier to see on a log scale.
Outliers
#Outliers
gap_asia <- gap2007 %>%
filter(continent == 'Asia') %>%
mutate(is_outlier = lifeExp < 50)
#Create box plot of lifeExp with outliers
gap_asia %>%
ggplot(aes(x = 1, y = lifeExp)) +
geom_boxplot()
#Create box plot of lifeExp without outliers
gap_asia %>%
filter(!is_outlier) %>%
ggplot(aes(x = 1, y = lifeExp)) +
geom_boxplot()
Spam & Num_char
library(openintro)
head(email, 3)
## spam to_multiple from cc sent_email time image attach
## 1 0 0 1 0 0 2012-01-01 14:16:41 0 0
## 2 0 0 1 0 0 2012-01-01 15:03:59 0 0
## 3 0 0 1 0 0 2012-01-02 00:00:32 0 0
## dollar winner inherit viagra password num_char line_breaks format
## 1 0 no 0 0 0 11.37 202 1
## 2 0 no 0 0 0 10.50 202 1
## 3 4 no 1 0 0 7.77 192 1
## re_subj exclaim_subj urgent_subj exclaim_mess number
## 1 0 0 0 0 big
## 2 0 0 0 1 small
## 3 0 0 0 6 small
#Compute summary statistics
email %>%
group_by(spam) %>%
summarize(median(num_char), IQR(num_char))
## # A tibble: 2 x 3
## spam `median(num_char)` `IQR(num_char)`
## <dbl> <dbl> <dbl>
## 1 0 6.83 13.6
## 2 1 1.05 2.82
#Create plot
email %>%
mutate(log_num_char = log(num_char)) %>%
ggplot(aes(x = spam, y = log_num_char, group = spam)) +
geom_boxplot()
According to the graphs, the median length of not-spam emails is greater than that of spam emails.
Spam & exclaim_mess
#Compute center and spread for exclaim_mess by spam
email %>%
group_by(spam) %>%
summarize(median(exclaim_mess), IQR(exclaim_mess))
## # A tibble: 2 x 3
## spam `median(exclaim_mess)` `IQR(exclaim_mess)`
## <dbl> <dbl> <dbl>
## 1 0 1 5
## 2 1 0 1
#Create plot for spam and exclaim_mess
email %>%
mutate(log_exclaim_mess = log(exclaim_mess)) %>%
ggplot(aes(x = log_exclaim_mess)) +
geom_histogram() +
facet_wrap(~ spam)
#Create plot of proportion of spam by image
email %>%
mutate(has_image = image > 0) %>%
ggplot(aes(x = has_image, fill = spam, group = spam)) +
geom_bar(position = 'fill')
Obviously, an email without an image is more likely to be not-spam than spam.
Strategy for Testing
#Test if images count as attachments
sum(email$image > email$attach)
## [1] 0
Since image is never greater than attach, we can infer that images are counted as attachments.
Question 1:
For emails containing the word “dollar”, does the typical spam email contain a greater number of occurrences of the word than the typical non-spam email?
Create a summary statistic that answers this question.
email %>%
filter(dollar > 0) %>%
group_by(spam) %>%
summarize(median(dollar))
## # A tibble: 2 x 2
## spam `median(dollar)`
## <dbl> <dbl>
## 1 0 4
## 2 1 2
Question 2:
If you encounter an email with greater than 10 occurrences of the word “dollar”, is it more likely to be spam or not-spam?
Create a barchart that answers this question.
email %>%
filter(dollar > 10) %>%
ggplot(aes(x = spam)) +
geom_bar()
Spam & NUmbers
# Reorder levels
email$number = factor(email$number, order = T, levels = c('none','small','big'))
# Construct plot of number
ggplot(email, aes(x = number)) +
geom_bar() +
facet_wrap(~spam)