Introduction to R

Arithmetic

3 + 3 # addition
## [1] 6
3 - 3 # subtraction
## [1] 0
3 * 3 # multiplication
## [1] 9
3 / 3 # division
## [1] 1
3 %% 3 # modulo (remainder of the division)
## [1] 0

Variables

變數種類

class(3.5)   # a number with a decimal point is "numeric"
## [1] "numeric"
class(3L) # Capital 'L' after an integer forces it to be stored as an integer.
## [1] "integer"
class(TRUE)    # TRUE/FALSE values are "logical"
## [1] "logical"
class('R studio')    # quoted text is "character"
## [1] "character"

變數運算

3.5 + 3L # numeric + integer is allowed
## [1] 6.5
3.5 + 'three' # numeric + character raises an error (non-numeric argument to binary operator)
## Error in 3.5 + "three": 二元運算子中有非數值引數

不同型別的變數型態之間的某些操作是不被允許的,例如數字與字串相加。

Vector

向量運算

vector1 <- c(100, 300, 500)
vector2 <- c(200, 200, -100)
# Vectors of the same length are added element by element.
vector3 <- vector1 + vector2
# sum() adds up all elements of the vector.
total <- sum(vector3)
# mean() computes the arithmetic mean. The result is stored as `average`
# rather than `mean`, so the variable does not mask the name of the base
# function (with a numeric `mean` in scope, args(mean) would inspect the
# number and return NULL instead of showing the function's arguments).
average <- mean(vector3)
vector3
## [1] 300 500 400
total
## [1] 1200
average
## [1] 400

向量命名

names(vector3) <- c('America','England','Japan') # use names() to label the elements of the vector
vector3
## America England   Japan 
##     300     500     400

向量選取

selection <- vector3 > 350
selection # comparing a vector with a number returns one logical value per element
## America England   Japan 
##   FALSE    TRUE    TRUE
select_vec1 <- vector3[selection] # use square brackets to select; TRUE positions are kept
select_vec1
## England   Japan 
##     500     400
select_vec2 <- vector3[1:2] # selection by position also works; note that 1 is the first element
select_vec2
## America England 
##     300     500

Matrix

建立矩陣

# Star Wars box office figures, one vector per film
new_hope <- c(460.998, 314.4)
empire_strikes <- c(290.475, 247.900)
return_jedi <- c(309.306, 165.8)
# Combine the three films into a single vector: box_office
box_office <- c(new_hope, empire_strikes, return_jedi)
# Build star_wars_matrix: a matrix with 3 rows (nrow = 3) whose values are
# filled in row by row (byrow = TRUE), so each film occupies one row.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
star_wars_matrix <- matrix(box_office, byrow = TRUE, nrow = 3)
star_wars_matrix
##         [,1]  [,2]
## [1,] 460.998 314.4
## [2,] 290.475 247.9
## [3,] 309.306 165.8

矩陣命名

# Vectors used for naming: region (columns) and titles (rows)
region <- c("US", "non-US")
titles <- c("A New Hope", "The Empire Strikes Back", "Return of the Jedi")
# colnames() sets the column names
colnames(star_wars_matrix) <- region
# rownames() sets the row names
rownames(star_wars_matrix) <- titles
star_wars_matrix
##                              US non-US
## A New Hope              460.998  314.4
## The Empire Strikes Back 290.475  247.9
## Return of the Jedi      309.306  165.8

運算合併

# rowSums() computes the total of each row (colSums() does the same per column)
worldwide <- rowSums(star_wars_matrix)
# cbind() appends columns (rbind() appends rows)
all_wars_matrix <- cbind(star_wars_matrix, worldwide)
all_wars_matrix
##                              US non-US worldwide
## A New Hope              460.998  314.4   775.398
## The Empire Strikes Back 290.475  247.9   538.375
## Return of the Jedi      309.306  165.8   475.106

矩陣選取:使用[列,欄]來選取,連續選取以冒號連結,要全選的維度則留白。

# Select the non-US revenue of all films: rows left blank (= all rows), column 2
non_us_all <- all_wars_matrix[,2]
non_us_all
##              A New Hope The Empire Strikes Back      Return of the Jedi 
##                   314.4                   247.9                   165.8
# Select the non-US revenue of the first two films: rows 1 to 2, column 2
non_us_some <- all_wars_matrix[1:2,2]
non_us_some
##              A New Hope The Empire Strikes Back 
##                   314.4                   247.9

Factor

因素向量是用來儲存類別型變數的統計資料,類別型變數與連續型變數最主要的差異在於類別型變數有類別個數的上限,而連續型變數則會有無窮多的個數。用factor顯示級別。

# Nominal categorical variable: categories with no inherent ranking, e.g. animals.
animals_vector <- c("Elephant", "Giraffe", "Donkey", "Horse")
factor_animals_vector <- factor(animals_vector)
factor_animals_vector
## [1] Elephant Giraffe  Donkey   Horse   
## Levels: Donkey Elephant Giraffe Horse
# Ordinal categorical variable: categories with a natural order, e.g. temperature.
temperature_vector <- c("High", "Low", "High", "Low", "Medium")
# `levels` lists the categories from lowest to highest; `ordered = TRUE` makes
# the factor ordered. The argument name is written in full instead of relying
# on partial matching of `order`.
factor_temperature_vector <- factor(
  temperature_vector,
  ordered = TRUE,
  levels = c("Low", "Medium", "High")
)
factor_temperature_vector
## [1] High   Low    High   Low    Medium
## Levels: Low < Medium < High

級別命名

survey_vector <- c("M", "F", "F", "M", "M")
factor_survey_vector <- factor(survey_vector)
# Rename the levels with levels(). Levels are sorted alphabetically by default
# ("F" before "M"), so the new names must be given in that order.
levels(factor_survey_vector) <- c('Female', 'Male')
factor_survey_vector
## [1] Male   Female Female Male   Male  
## Levels: Female Male
# summary() produces a summary. For the plain character vector it only reports
# length, class and mode; for the factor it counts the elements per level.
summary(survey_vector)
##    Length     Class      Mode 
##         5 character character
summary(factor_survey_vector)
## Female   Male 
##      2      3

選取比較

# Create an ordered factor with factor(); levels run from slowest to fastest
speed_vector <- c("fast", "slow", "slow", "fast", "insane")
factor_speed_vector <- factor(speed_vector, ordered = TRUE, levels = c("slow", "fast", "insane"))
# Pick out individual elements
da2 <- factor_speed_vector[2]
da5 <- factor_speed_vector[5]
# Is data analyst 2 faster than data analyst 5?
da2 > da5
## [1] FALSE

Dataframe

矩陣中所有的元素都是相同類型的;資料框則將觀測值儲存為列,將變數儲存為欄,通常包含不同類型的資料。使用data.frame()將多個等長度向量組成資料框。選取內容的方式與矩陣相同,但[]內可直接使用列名稱或欄名稱,也可以用$加上欄名稱選取整欄。

head(mtcars) # head() shows the first six rows of a data frame
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
tail(mtcars) # tail() shows the last six rows of a data frame
##                 mpg cyl  disp  hp drat    wt qsec vs am gear carb
## Porsche 914-2  26.0   4 120.3  91 4.43 2.140 16.7  0  1    5    2
## Lotus Europa   30.4   4  95.1 113 3.77 1.513 16.9  1  1    5    2
## Ford Pantera L 15.8   8 351.0 264 4.22 3.170 14.5  0  1    5    4
## Ferrari Dino   19.7   6 145.0 175 3.62 2.770 15.5  0  1    5    6
## Maserati Bora  15.0   8 301.0 335 3.54 3.570 14.6  0  1    5    8
## Volvo 142E     21.4   4 121.0 109 4.11 2.780 18.6  1  1    4    2
str(mtcars) # str() gives a quick overview of a data frame's structure
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
mtcars$qsec # `$` followed by a column name extracts that column of the data frame
##  [1] 16.46 17.02 18.61 19.44 17.02 20.22 15.84 20.00 22.90 18.30 18.90
## [12] 17.40 17.60 18.00 17.98 17.82 17.42 19.47 18.52 19.90 20.01 16.87
## [23] 17.30 15.41 17.05 18.90 16.70 16.90 14.50 15.50 14.60 18.60
subset(mtcars, subset = qsec > 20) # subset() keeps only the rows that satisfy the condition
##                mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Valiant       18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Merc 230      22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Toyota Corona 21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1

List

清單可以蒐集多樣性的物件,包含矩陣、向量、資料框甚至清單,這些物件甚至不需要跟彼此相關。 你可以將清單視為一種超級資料類型,基本上你可以將任何資訊都儲存在清單中!

# Use list() to bundle objects of different kinds, naming each component
# (vec = a vector, mat = a matrix, df = a data frame)
my_vector <- 1:10
my_matrix <- matrix(1:9, ncol = 3)
my_df <- mtcars[1:3,]
my_list <- list(vec = my_vector, mat = my_matrix, df = my_df)
my_list
## $vec
##  [1]  1  2  3  4  5  6  7  8  9 10
## 
## $mat
##      [,1] [,2] [,3]
## [1,]    1    4    7
## [2,]    2    5    8
## [3,]    3    6    9
## 
## $df
##                mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4     21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710    22.8   4  108  93 3.85 2.320 18.61  1  1    4    1

清單選擇

# Use [[ ]] to extract one component of the list, then [ ] to select within
# that component (here: row 2 of the data frame stored under "df")
my_list[['df']][2,]
##               mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4 Wag  21   6  160 110  3.9 2.875 17.02  0  1    4    4

Intermediate R

Conditionals and Control Flow

Logical Operators

TRUE > FALSE # in comparisons TRUE counts as 1 and FALSE as 0
## [1] TRUE
c(TRUE, TRUE, FALSE) & c(TRUE, FALSE, FALSE) # element-wise AND
## [1]  TRUE FALSE FALSE
c(TRUE, TRUE, FALSE) | c(TRUE, FALSE, FALSE) # element-wise OR
## [1]  TRUE  TRUE FALSE
!c(TRUE, TRUE, FALSE) # element-wise NOT
## [1] FALSE FALSE  TRUE
# && and || differ from & and |: they compare single values only and return a
# single value. Older R versions silently used just the first element of longer
# vectors; since R 4.3 passing a longer vector is an error, so the first
# elements are selected explicitly here.
c(TRUE, TRUE, FALSE)[1] && c(TRUE, FALSE, FALSE)[1]
## [1] TRUE
c(TRUE, TRUE, FALSE)[1] || c(TRUE, FALSE, FALSE)[1]
## [1] TRUE

Conditional Statements
注意if對齊,以及if位置層級關係。

# Classify `number` into a size label stored in `result`.
# The inner if/else is nested inside the first branch, so it is only reached
# when number < 10; the `else if` / `else` belong to the outer condition.
number <- 4
if (number < 10) {
  if (number < 5) {
    result <- "extra small"
  } else {
    result <- "small"
  }
} else if (number < 100) {
  result <- "medium"
} else {
  result <- "large"
}
print(result)
## [1] "extra small"

Loops

Indefinite Loops
While 迴圈的執行次數不固定(indefinite),會一直重複到條件不成立為止;記得加入跳出迴圈的條件,或是使用break結束,否則會變成無窮迴圈。

# Print the multiples of 3 for i = 1..10, stopping early at the first
# multiple that is divisible by 8.
i <- 1
while (i <= 10) {
  print(3 * i)
  # %% binds tighter than *, so the product is parenthesised to test the
  # printed value itself rather than 3 * (i %% 8).
  if ((3 * i) %% 8 == 0) {
    break
  }
  i <- i + 1
}
## [1] 3
## [1] 6
## [1] 9
## [1] 12
## [1] 15
## [1] 18
## [1] 21
## [1] 24

Definite Loops
For,兩種寫法。

# A list with three components of different types: a number, a character
# vector and a logical.
nyc <- list(pop = 8405837, 
            boroughs = c("Manhattan", "Bronx", "Brooklyn", "Queens", "Staten Island"), 
            capital = FALSE)
# Loop version 1: iterate over the components themselves; p is each component in turn
for (p in nyc) {
  print(p)
}
## [1] 8405837
## [1] "Manhattan"     "Bronx"         "Brooklyn"      "Queens"       
## [5] "Staten Island"
## [1] FALSE
# Loop version 2: iterate over the positions and extract each component with
# [[ ]]. seq_along() is used instead of 1:length(nyc) so that an empty list
# yields zero iterations (1:0 would count down through 1 and 0).
for (i in seq_along(nyc)) {
  print(nyc[[i]])
}
## [1] 8405837
## [1] "Manhattan"     "Bronx"         "Brooklyn"      "Queens"       
## [5] "Staten Island"
## [1] FALSE

For loops for Matrix
使用print()、paste()重組句子。

# Build a 3 x 3 tic-tac-toe board, filled row by row; NA marks an empty cell.
ttt <- matrix(
  c("O", NA, "X", NA, "O", "O", "X", NA, "X"),
  nrow = 3,
  byrow = TRUE
)
ttt
##      [,1] [,2] [,3]
## [1,] "O"  NA   "X" 
## [2,] NA   "O"  "O" 
## [3,] "X"  NA   "X"
for (i in 1) { # only the first row is visited
  # seq_len(ncol(ttt)) walks over every column and stays empty for a
  # zero-column matrix, unlike 1:ncol(ttt).
  for (j in seq_len(ncol(ttt))) {
    # paste() joins the pieces with single spaces into one sentence
    print(paste("On row", i, "and column", j, "the board contains", ttt[i, j]))
  }
}
## [1] "On row 1 and column 1 the board contains O"
## [1] "On row 1 and column 2 the board contains NA"
## [1] "On row 1 and column 3 the board contains X"

抓句中字母

rquote <- "rSTUDIO needs lots of practice!"
# strsplit() cuts the string at every match of `split`; an empty split ("")
# yields the individual characters. [[1]] extracts them from the returned list.
chars <- strsplit(rquote, split = "")[[1]]
rcount <- 0
# Count the lowercase "r" characters, stopping at the first lowercase "u".
# NOTE(review): the comparison is case-sensitive and rquote only contains an
# uppercase "U", so the break never fires for this input - confirm intent.
for (char in chars) {
  if (char == "r") {
    rcount <- rcount + 1
  }
  if (char == "u") {
    break
  }
}
rcount
## [1] 2

Functions

  • Functions work like a black box.
  • Argument matching:By position or by name.
  • Function arguments can have defaults.
  • Use help(), args() for more details.
# base::mean is named explicitly so that a variable called `mean` elsewhere in
# the session cannot hide the function: args() returns only NULL when it is
# handed something that is not a function.
args(base::mean)
## function (x, ...) 
## NULL
linkedin <- c(16, 9, 13, 5, NA, 17, 14)
facebook <- c(17, NA, 5, 16, 8, 13, 14)
# na.rm = TRUE drops the NA values before the mean is computed
mean(abs(linkedin - facebook), na.rm = TRUE)
## [1] 4.8

Writing Functions

# Roll two six-sided dice and return the sum of the two faces.
# Takes no arguments; the result is random.
two_dice <- function() {
  faces <- 1:6
  # sample() picks one face at random for each die, one draw after the other
  rolls <- vapply(
    1:2,
    function(die) sample(faces, size = 1),
    integer(1)
  )
  sum(rolls)
}
two_dice() # the result differs from call to call
## [1] 3

R Packages

  • Where do built in functions like mean() and list() come from?
  • R Packages > install.packages("ggvis") > library("ggvis")
  • require("ggvis") loads the package, too.
  • Calling require() on a package that is not installed returns FALSE (with a warning) instead of stopping with an error.
  • To look at the currently attached packages > search()
  • Now you can use these pre-compiled functions.
# Install ggplot2 only when it is missing. requireNamespace() checks whether
# the package can be loaded without attaching it; library() then attaches it
# and stops with an error if the package still cannot be loaded.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2", repos = "https://cran.us.r-project.org")
}
library(ggplot2)
search()
##  [1] ".GlobalEnv"        "package:ggplot2"   "package:stats"    
##  [4] "package:graphics"  "package:grDevices" "package:utils"    
##  [7] "package:datasets"  "package:methods"   "Autoloads"        
## [10] "package:base"

Lapply, Sapply & Vapply

Lapply

  • lapply(list/vector, function)
  • Apply function over list or vector.
  • Always returns a list.
  • unlist(lapply()) to return a vector.
nyc # print the list that the lapply() examples below operate on
## $pop
## [1] 8405837
## 
## $boroughs
## [1] "Manhattan"     "Bronx"         "Brooklyn"      "Queens"       
## [5] "Staten Island"
## 
## $capital
## [1] FALSE
lapply(nyc, class) # apply class() to every component; the result is a list
## $pop
## [1] "numeric"
## 
## $boroughs
## [1] "character"
## 
## $capital
## [1] "logical"
unlist(lapply(nyc, class)) # unlist() flattens the list into a named vector
##         pop    boroughs     capital 
##   "numeric" "character"   "logical"
# Using an anonymous function: each element is multiplied by `factor`.
# The extra argument factor = 3 is handed on by lapply() to the function.
lapply(
  list(1, 2, 3),
  function(x, factor) x * factor,
  factor = 3
)
## [[1]]
## [1] 3
## 
## [[2]]
## [1] 6
## 
## [[3]]
## [1] 9

Sapply

  • sapply(list/vector, function)
  • Apply function over list or vector.
  • Returns an array that is a simplified version of lapply().
  • Similar to unlist(lapply()).
  • The ‘vector-version’ of a list of NULL’s would simply be a NULL. (一串NULL等於一個NULL)
# For each of two random vectors (10 uniform draws each) compute a named
# vector of min, mean and max; lapply() returns the results as a list.
lapply(
  list(runif(10), runif(10)),
  function(x) {
    c(min = min(x), mean = mean(x), max = max(x))
  }
)
## [[1]]
##       min      mean       max 
## 0.0422843 0.4691537 0.9274780 
## 
## [[2]]
##        min       mean        max 
## 0.03935985 0.58862369 0.98087867
# Same computation as with lapply(), but sapply() simplifies the results into
# a matrix: one column per input vector, one row per statistic.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
sapply(
  list(runif(10), runif(10)),
  function(x) c(min = min(x), mean = mean(x), max = max(x)),
  USE.NAMES = TRUE
)
##           [,1]      [,2]
## min  0.2321807 0.0414638
## mean 0.5003465 0.4242402
## max  0.7638852 0.8803119

Vapply

  • vapply(list/vector, function, format)
  • Apply function over list or vector.
  • explicitly specify output format.
# numeric(3) is the template every result must match:
# "numeric" is the required type and 3 the required length
# (here the three values min, mean and max).
vapply(
  list(runif(10), runif(10)),
  function(x) c(min = min(x), mean = mean(x), max = max(x)),
  numeric(3)
)
##            [,1]         [,2]
## min  0.01576775 0.0009929538
## mean 0.66297311 0.4849325555
## max  0.99883821 0.9324300122

Utilities

Useful Functions

  • Mathematical utilities

    • abs(): Calculate the absolute value.
    • sum(): Calculate the sum of all the values in a data structure.
    • mean(): Calculate the arithmetic mean.
    • round(): Round the values to 0 decimal places by default.

  • Data Utilities

    • seq(): Generate sequences, by specifying the from, to, and by arguments.
    • rep(): Replicate elements of vectors and lists.
    • sort(): Sort a vector in ascending order. Works on numerics, but also on character strings and logicals.
    • rev(): Reverse the elements in a data structures for which reversal is defined.
    • str(): Display the structure of any R object.
    • append(): Merge vectors or lists.
    • is.*(): Check for the class of an R object.
    • as.*(): Convert an R object from one class to another.
    • unlist(): Flatten (possibly embedded) lists to produce a vector.
# Chain the data utilities listed above; `<-` is used for assignment and TRUE
# is written out in full.
seq1 <- seq(1, 7, by = 2)               # 1 3 5 7
rep1 <- rep(seq1, times = 2)            # the whole sequence repeated twice
sort1 <- sort(rep1, decreasing = TRUE)  # largest values first
rev1 <- rev(sort1)                      # reversed order, smallest values first
append1 <- append(sort1, rev1)          # sort1 followed by rev1
append1
##  [1] 7 7 5 5 3 3 1 1 1 1 3 3 5 5 7 7

Regular Expressions

  • Grepl & Grep
    • grepl(): Returns TRUE when a pattern is found in the corresponding character string.
    • grep(): Returns a vector of indices of the character strings that contains the pattern.
    • Use ‘^’ and ‘$’ to match the start and the end of a string, respectively.
emails <- c("john.doe@ivyleague.edu", "education@world.gov", "dalai.lama@peace.org",
            "invalid.edu", "quant@bigdatacollege.edu", "cookie.monster@sesame.tv", "kiara@@fakemail.edu")
# The pattern "@.*\\.edu$" matches an "@", followed by any characters (".*"),
# followed by a literal "." ("\\.") and "edu" at the end of the string ("$").
# The "@" may occur anywhere in the string, so "kiara@@fakemail.edu" matches
# as well. grep() returns the indices of the matching elements.
hits <- grep(pattern = "@.*\\.edu$", x = emails)
emails[hits]
## [1] "john.doe@ivyleague.edu"   "quant@bigdatacollege.edu"
## [3] "kiara@@fakemail.edu"
  • Sub & Gsub
    • sub(): Replaces a ‘pattern’ with a ‘replacement’; only the first match in each string is replaced.
    • gsub(): Replaces all matches.
# sub(pattern, replacement, object): everything from the first "@" up to the
# final ".edu" is replaced by "@datacamp.edu"; strings without a match are
# returned unchanged.
sub("@.*\\.edu$", "@datacamp.edu", emails)
## [1] "john.doe@datacamp.edu"    "education@world.gov"     
## [3] "dalai.lama@peace.org"     "invalid.edu"             
## [5] "quant@datacamp.edu"       "cookie.monster@sesame.tv"
## [7] "kiara@datacamp.edu"
awards <- c(
  "Won 1 Oscar.",
  "Won 1 Oscar. Another 9 wins & 24 nominations.",
  "1 win and 2 nominations.",
  "2 wins & 3 nominations.",
  "Nominated for 2 Golden Globes. 1 more win & 2 nominations.",
  "4 wins & 1 nomination."
)
# Capture the number directly in front of "nomination" and replace the whole
# string with that capture ("\\1"); strings without a match stay unchanged.
sub(".*\\s([0-9]+)\\snomination.*$", "\\1", awards)
## [1] "Won 1 Oscar." "24"           "2"            "3"           
## [5] "2"            "1"
  • Times and Dates

    • Sys.Date(): To know the date.
    • Sys.time(): To know the time.
    • unclass(): See how a date/time is stored under the hood.
    • The 1st of January in 1970 is the common origin for representing times and dates in a wide range of programming languages.
str1 <- "2012-03-15"
class(str1) # a quoted date is still just a character string
## [1] "character"
date1 <- as.Date(str1, format = '%Y-%m-%d') # parse it as year-month-day
class(date1)
## [1] "Date"
str2 <- "2012-3-12 14:23:08"
class(str2)
## [1] "character"
time2 <- as.POSIXct(str2, format = '%Y-%m-%d %H:%M:%S') # parse date plus hour:minute:second
class(time2)
## [1] "POSIXct" "POSIXt"
format(time2, '%I:%M%p') # 12-hour clock with AM/PM marker; the recorded "下午" (PM) presumably reflects the session locale
## [1] "02:23下午"
# Calculations with dates and times: subtracting two dates/times gives a time difference
as.Date("2015-03-12") - as.Date("2015-02-27")
## Time difference of 13 days
birth <- as.POSIXct("1879-03-14 14:37:23")
death <- as.POSIXct("1955-04-18 03:47:12")
einstein <- death - birth
einstein
## Time difference of 27792.51 days

Introduction to the Tidyverse

Data Wrangling

Load Packages

# install.packages("gapminder")
# Install each package only when it is missing. requireNamespace() checks for
# the package without attaching it; library() then attaches it and stops with
# an error if the package still cannot be loaded.
if (!requireNamespace("gapminder", quietly = TRUE)) {
  install.packages("gapminder", repos = "https://cran.us.r-project.org")
}
library(gapminder)
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr", repos = "https://cran.us.r-project.org")
}
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Filter()

  • The gapminder itself is unchanged.
# Filter for China in 2002
# Both conditions given to filter() must hold for a row to be kept.
gapminder %>%
  filter(country == 'China', year == 2002)
## # A tibble: 1 x 6
##   country continent  year lifeExp        pop gdpPercap
##   <fct>   <fct>     <int>   <dbl>      <int>     <dbl>
## 1 China   Asia       2002    72.0 1280400000     3119.

Arrange()

  • The gapminder itself is unchanged.
  • arrange() gives you a new, sorted dataset.
  • Combining multiple steps with the ‘%>%’ operator.
# Keep the rows for 1957, then sort them by population, largest first (desc())
gapminder %>%
  filter(year == 1957) %>%
  arrange(desc(pop))
## # A tibble: 142 x 6
##    country        continent  year lifeExp       pop gdpPercap
##    <fct>          <fct>     <int>   <dbl>     <int>     <dbl>
##  1 China          Asia       1957    50.5 637408000      576.
##  2 India          Asia       1957    40.2 409000000      590.
##  3 United States  Americas   1957    69.5 171984000    14847.
##  4 Japan          Asia       1957    65.5  91563009     4318.
##  5 Indonesia      Asia       1957    39.9  90124000      859.
##  6 Germany        Europe     1957    69.1  71019069    10188.
##  7 Brazil         Americas   1957    53.3  65551171     2487.
##  8 United Kingdom Europe     1957    70.4  51430000    11283.
##  9 Bangladesh     Asia       1957    39.3  51365468      662.
## 10 Italy          Europe     1957    67.8  49182000     6249.
## # ... with 132 more rows

Mutate()

  • Use mutate() to change or add a variable.
# For 2007, add a GDP column (population times GDP per capita) and sort the
# rows by it, largest first
gapminder %>%
  filter(year == 2007) %>%
  mutate(GDP = pop * gdpPercap) %>%
  arrange(desc(GDP))
## # A tibble: 142 x 7
##    country        continent  year lifeExp        pop gdpPercap     GDP
##    <fct>          <fct>     <int>   <dbl>      <int>     <dbl>   <dbl>
##  1 United States  Americas   2007    78.2  301139947    42952. 1.29e13
##  2 China          Asia       2007    73.0 1318683096     4959. 6.54e12
##  3 Japan          Asia       2007    82.6  127467972    31656. 4.04e12
##  4 India          Asia       2007    64.7 1110396331     2452. 2.72e12
##  5 Germany        Europe     2007    79.4   82400996    32170. 2.65e12
##  6 United Kingdom Europe     2007    79.4   60776238    33203. 2.02e12
##  7 France         Europe     2007    80.7   61083916    30470. 1.86e12
##  8 Brazil         Americas   2007    72.4  190010647     9066. 1.72e12
##  9 Italy          Europe     2007    80.5   58147733    28570. 1.66e12
## 10 Mexico         Americas   2007    76.2  108700891    11978. 1.30e12
## # ... with 132 more rows

Data Visualization

GGPLOT2 for Visualization

library(ggplot2)
# Keep only the observations from 1952
gapminder_1952 <- gapminder %>%
  filter(year == 1952)
# Create a scatter plot with pop on the x-axis and lifeExp on the y-axis
ggplot(gapminder_1952, aes(x = pop, y = lifeExp)) +
  geom_point()

Log Scales

  • To see more detail in the dense regions of the plot,
  • put x or y on a log scale.
  • On a log10 scale, each equal step along the axis represents a tenfold increase.
# scale_x_log10() puts the x-axis (pop) on a log10 scale
ggplot(gapminder_1952, aes(x = pop, y = lifeExp)) +
  geom_point() +
  scale_x_log10()

# Compare this graph with the previous one: the points are more spread out along the x-axis here.
# It's now easier to see that there isn't a correlation between population and life expectancy.

Additional Aesthetics

  • Use color & size for more information.
# Map continent to the point color and gdpPercap to the point size
ggplot(gapminder_1952, aes(x = pop, y = lifeExp, color = continent, size = gdpPercap)) +
  geom_point() +
  scale_x_log10()

Faceting

  • **facet_wrap(~*)**: the ‘~’ (tilde) key is at the upper left of your keyboard; here it means ‘separate by’.
  • Separate graphs respectively for easier understanding.
# facet_wrap(~ continent) draws one panel per continent
ggplot(gapminder_1952, aes(x = pop, y = lifeExp)) +
  geom_point() +
  scale_x_log10() +
  facet_wrap(~ continent)

Grouping and Summarizing

Summarize()

  • Summarize(): Turns many rows into one.
# Filter for 1957 then summarize the median life expectancy and the maximum GDP per capita
# Without grouping, summarize() collapses all remaining rows into a single row.
gapminder %>%
  filter(year == 1957) %>%
  summarize(medianLifeExp = median(lifeExp), maxGdpPercap = max(gdpPercap))
## # A tibble: 1 x 2
##   medianLifeExp maxGdpPercap
##           <dbl>        <dbl>
## 1          48.4      113523.

Group_by()

  • group_by(): When used before summarize(), each group is turned into one row.
# Summarize per year and continent: one output row for each combination
gapminder %>%
  group_by(year, continent) %>%
  summarize(medianLifeExp = median(lifeExp), maxGdpPercap = max(gdpPercap))
## # A tibble: 60 x 4
## # Groups:   year [?]
##     year continent medianLifeExp maxGdpPercap
##    <int> <fct>             <dbl>        <dbl>
##  1  1952 Africa             38.8        4725.
##  2  1952 Americas           54.7       13990.
##  3  1952 Asia               44.9      108382.
##  4  1952 Europe             65.9       14734.
##  5  1952 Oceania            69.3       10557.
##  6  1957 Africa             40.6        5487.
##  7  1957 Americas           56.1       14847.
##  8  1957 Asia               48.3      113523.
##  9  1957 Europe             67.6       17909.
## 10  1957 Oceania            70.3       12247.
## # ... with 50 more rows

Visualizing Summarized Data

  • expand_limits(y = 0): Make sure y-axis starts with 0.
# Summarize medianGdpPercap within each continent within each year: by_year_continent
# summarize() leaves the result grouped by year; ungroup() removes that
# leftover grouping so later operations on by_year_continent are not applied
# per group by accident.
by_year_continent <- gapminder %>%
  group_by(year, continent) %>%
  summarize(medianGdpPercap = median(gdpPercap)) %>%
  ungroup()
by_year_continent
## # A tibble: 60 x 3
##     year continent medianGdpPercap
##    <int> <fct>               <dbl>
##  1  1952 Africa               987.
##  2  1952 Americas            3048.
##  3  1952 Asia                1207.
##  4  1952 Europe              5142.
##  5  1952 Oceania            10298.
##  6  1957 Africa              1024.
##  7  1957 Americas            3781.
##  8  1957 Asia                1548.
##  9  1957 Europe              6067.
## 10  1957 Oceania            11599.
## # ... with 50 more rows
# Plot the change in medianGdpPercap in each continent over time;
# expand_limits(y = 0) makes sure the y-axis includes 0
ggplot(by_year_continent, aes(x= year, y = medianGdpPercap,color = continent)) +
  geom_point() +
  expand_limits(y = 0)

Types of Visualizations

Line Plot

  • Use geom_line() to plot a line plot.
# Continues with by_year_continent from the previous section.
# Create a line plot showing the change in medianGdpPercap by continent over time
ggplot(by_year_continent, aes(x = year, y = medianGdpPercap, color = continent)) +
  geom_line() +
  expand_limits(y = 0)

Bar Plot

  • Use geom_col() to plot a bar plot.
# Summarize the median gdpPercap by continent for the year 1952
# (grouping by a single variable, so the summarized result has one row per
# continent)
by_continent <- gapminder %>%
  filter(year == 1952) %>%
  group_by(continent) %>%
  summarize(medianGdpPercap = median(gdpPercap))
by_continent
## # A tibble: 5 x 2
##   continent medianGdpPercap
##   <fct>               <dbl>
## 1 Africa               987.
## 2 Americas            3048.
## 3 Asia                1207.
## 4 Europe              5142.
## 5 Oceania            10298.
# Create a bar plot showing medianGdpPercap by continent;
# geom_col() uses the y values as the bar heights
ggplot(by_continent, aes(x = continent, y = medianGdpPercap)) +
  geom_col()

Histogram

  • Use geom_histogram() to plot a histogram.
# Keep only the observations from 1952
gapminder_1952 <- gapminder %>%
  filter(year == 1952)
# Create a histogram of population (pop), with x on a log scale.
# bins = 30 is the value stat_bin() otherwise falls back to with a message;
# stating it makes the choice explicit and silences that message.
ggplot(gapminder_1952, aes(x = pop)) +
  geom_histogram(bins = 30) +
  scale_x_log10()

Boxplot

  • Use geom_boxplot() to plot a boxplot.
# Boxplot of gdpPercap per continent with a log10 y-axis;
# ggtitle() adds a title to the plot.
ggplot(gapminder_1952, aes(x = continent, y = gdpPercap)) +
  geom_boxplot() +
  scale_y_log10() +
  ggtitle("Comparing GDP per capita across continents")