Exploring Raw Data

Functions That Help You Understand Raw Data

# Load the BMI dataset: one row per country, one column per year.
# stringsAsFactors = TRUE is spelled out because R >= 4.0.0 defaults to FALSE,
# which would load Country as character and no longer match the str() and
# summary() output recorded in this document.
bmi <- read.csv(
  'https://assets.datacamp.com/production/repositories/34/datasets/a0a569ebbb34500d11979eba95360125127e6434/bmi_clean.csv',
  stringsAsFactors = TRUE
)
# Getting a feel for your data: its class, its dimensions, and its column names.
class(bmi)
## [1] "data.frame"
dim(bmi)
## [1] 199  30
names(bmi)
##  [1] "Country" "Y1980"   "Y1981"   "Y1982"   "Y1983"   "Y1984"   "Y1985"  
##  [8] "Y1986"   "Y1987"   "Y1988"   "Y1989"   "Y1990"   "Y1991"   "Y1992"  
## [15] "Y1993"   "Y1994"   "Y1995"   "Y1996"   "Y1997"   "Y1998"   "Y1999"  
## [22] "Y2000"   "Y2001"   "Y2002"   "Y2003"   "Y2004"   "Y2005"   "Y2006"  
## [29] "Y2007"   "Y2008"
# View the structure of the data: each column's type and its first few values.
str(bmi)
## 'data.frame':    199 obs. of  30 variables:
##  $ Country: Factor w/ 199 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Y1980  : num  21.5 25.2 22.3 25.7 20.9 ...
##  $ Y1981  : num  21.5 25.2 22.3 25.7 20.9 ...
##  $ Y1982  : num  21.5 25.3 22.4 25.7 20.9 ...
##  $ Y1983  : num  21.4 25.3 22.5 25.8 20.9 ...
##  $ Y1984  : num  21.4 25.3 22.6 25.8 20.9 ...
##  $ Y1985  : num  21.4 25.3 22.7 25.9 20.9 ...
##  $ Y1986  : num  21.4 25.3 22.8 25.9 21 ...
##  $ Y1987  : num  21.4 25.3 22.8 25.9 21 ...
##  $ Y1988  : num  21.3 25.3 22.9 26 21 ...
##  $ Y1989  : num  21.3 25.3 23 26 21.1 ...
##  $ Y1990  : num  21.2 25.3 23 26.1 21.1 ...
##  $ Y1991  : num  21.2 25.3 23.1 26.2 21.1 ...
##  $ Y1992  : num  21.1 25.2 23.2 26.2 21.1 ...
##  $ Y1993  : num  21.1 25.2 23.3 26.3 21.1 ...
##  $ Y1994  : num  21 25.2 23.3 26.4 21.1 ...
##  $ Y1995  : num  20.9 25.3 23.4 26.4 21.2 ...
##  $ Y1996  : num  20.9 25.3 23.5 26.5 21.2 ...
##  $ Y1997  : num  20.8 25.3 23.5 26.6 21.2 ...
##  $ Y1998  : num  20.8 25.4 23.6 26.7 21.3 ...
##  $ Y1999  : num  20.8 25.5 23.7 26.8 21.3 ...
##  $ Y2000  : num  20.7 25.6 23.8 26.8 21.4 ...
##  $ Y2001  : num  20.6 25.7 23.9 26.9 21.4 ...
##  $ Y2002  : num  20.6 25.8 24 27 21.5 ...
##  $ Y2003  : num  20.6 25.9 24.1 27.1 21.6 ...
##  $ Y2004  : num  20.6 26 24.2 27.2 21.7 ...
##  $ Y2005  : num  20.6 26.1 24.3 27.3 21.8 ...
##  $ Y2006  : num  20.6 26.2 24.4 27.4 21.9 ...
##  $ Y2007  : num  20.6 26.3 24.5 27.5 22.1 ...
##  $ Y2008  : num  20.6 26.4 24.6 27.6 22.3 ...
# Attach dplyr, which provides glimpse().
library(dplyr)
# glimpse() prints one line per column: its name, its type, and leading values.
glimpse(bmi)
## Observations: 199
## Variables: 30
## $ Country <fct> Afghanistan, Albania, Algeria, Andorra, Angola, Antigu...
## $ Y1980   <dbl> 21.48678, 25.22533, 22.25703, 25.66652, 20.94876, 23.3...
## $ Y1981   <dbl> 21.46552, 25.23981, 22.34745, 25.70868, 20.94371, 23.3...
## $ Y1982   <dbl> 21.45145, 25.25636, 22.43647, 25.74681, 20.93754, 23.4...
## $ Y1983   <dbl> 21.43822, 25.27176, 22.52105, 25.78250, 20.93187, 23.5...
## $ Y1984   <dbl> 21.42734, 25.27901, 22.60633, 25.81874, 20.93569, 23.6...
## $ Y1985   <dbl> 21.41222, 25.28669, 22.69501, 25.85236, 20.94857, 23.7...
## $ Y1986   <dbl> 21.40132, 25.29451, 22.76979, 25.89089, 20.96030, 23.8...
## $ Y1987   <dbl> 21.37679, 25.30217, 22.84096, 25.93414, 20.98025, 23.9...
## $ Y1988   <dbl> 21.34018, 25.30450, 22.90644, 25.98477, 21.01375, 24.0...
## $ Y1989   <dbl> 21.29845, 25.31944, 22.97931, 26.04450, 21.05269, 24.1...
## $ Y1990   <dbl> 21.24818, 25.32357, 23.04600, 26.10936, 21.09007, 24.2...
## $ Y1991   <dbl> 21.20269, 25.28452, 23.11333, 26.17912, 21.12136, 24.3...
## $ Y1992   <dbl> 21.14238, 25.23077, 23.18776, 26.24017, 21.14987, 24.4...
## $ Y1993   <dbl> 21.06376, 25.21192, 23.25764, 26.30356, 21.13938, 24.5...
## $ Y1994   <dbl> 20.97987, 25.22115, 23.32273, 26.36793, 21.14186, 24.6...
## $ Y1995   <dbl> 20.91132, 25.25874, 23.39526, 26.43569, 21.16022, 24.6...
## $ Y1996   <dbl> 20.85155, 25.31097, 23.46811, 26.50769, 21.19076, 24.7...
## $ Y1997   <dbl> 20.81307, 25.33988, 23.54160, 26.58255, 21.22621, 24.7...
## $ Y1998   <dbl> 20.78591, 25.39116, 23.61592, 26.66337, 21.27082, 24.8...
## $ Y1999   <dbl> 20.75469, 25.46555, 23.69486, 26.75078, 21.31954, 24.9...
## $ Y2000   <dbl> 20.69521, 25.55835, 23.77659, 26.83179, 21.37480, 24.9...
## $ Y2001   <dbl> 20.62643, 25.66701, 23.86256, 26.92373, 21.43664, 25.0...
## $ Y2002   <dbl> 20.59848, 25.77167, 23.95294, 27.02525, 21.51765, 25.1...
## $ Y2003   <dbl> 20.58706, 25.87274, 24.05243, 27.12481, 21.59924, 25.2...
## $ Y2004   <dbl> 20.57759, 25.98136, 24.15957, 27.23107, 21.69218, 25.2...
## $ Y2005   <dbl> 20.58084, 26.08939, 24.27001, 27.32827, 21.80564, 25.3...
## $ Y2006   <dbl> 20.58749, 26.20867, 24.38270, 27.43588, 21.93881, 25.5...
## $ Y2007   <dbl> 20.60246, 26.32753, 24.48846, 27.53363, 22.08962, 25.6...
## $ Y2008   <dbl> 20.62058, 26.44657, 24.59620, 27.63048, 22.25083, 25.7...
# Summarise every column: level counts for the Country factor, and
# min / quartiles / mean / max for each numeric year column.
summary(bmi)
##                 Country        Y1980           Y1981           Y1982      
##  Afghanistan        :  1   Min.   :19.01   Min.   :19.04   Min.   :19.07  
##  Albania            :  1   1st Qu.:21.27   1st Qu.:21.31   1st Qu.:21.36  
##  Algeria            :  1   Median :23.31   Median :23.39   Median :23.46  
##  Andorra            :  1   Mean   :23.15   Mean   :23.21   Mean   :23.26  
##  Angola             :  1   3rd Qu.:24.82   3rd Qu.:24.89   3rd Qu.:24.94  
##  Antigua and Barbuda:  1   Max.   :28.12   Max.   :28.36   Max.   :28.58  
##  (Other)            :193                                                  
##      Y1983           Y1984           Y1985           Y1986      
##  Min.   :19.10   Min.   :19.13   Min.   :19.16   Min.   :19.20  
##  1st Qu.:21.42   1st Qu.:21.45   1st Qu.:21.47   1st Qu.:21.49  
##  Median :23.57   Median :23.64   Median :23.73   Median :23.82  
##  Mean   :23.32   Mean   :23.37   Mean   :23.42   Mean   :23.48  
##  3rd Qu.:25.02   3rd Qu.:25.06   3rd Qu.:25.11   3rd Qu.:25.20  
##  Max.   :28.82   Max.   :29.05   Max.   :29.28   Max.   :29.52  
##                                                                 
##      Y1987           Y1988           Y1989           Y1990      
##  Min.   :19.23   Min.   :19.27   Min.   :19.31   Min.   :19.35  
##  1st Qu.:21.50   1st Qu.:21.52   1st Qu.:21.55   1st Qu.:21.57  
##  Median :23.87   Median :23.93   Median :24.03   Median :24.14  
##  Mean   :23.53   Mean   :23.59   Mean   :23.65   Mean   :23.71  
##  3rd Qu.:25.27   3rd Qu.:25.34   3rd Qu.:25.37   3rd Qu.:25.39  
##  Max.   :29.75   Max.   :29.98   Max.   :30.20   Max.   :30.42  
##                                                                 
##      Y1991           Y1992           Y1993           Y1994      
##  Min.   :19.40   Min.   :19.45   Min.   :19.51   Min.   :19.59  
##  1st Qu.:21.60   1st Qu.:21.65   1st Qu.:21.74   1st Qu.:21.76  
##  Median :24.20   Median :24.19   Median :24.27   Median :24.36  
##  Mean   :23.76   Mean   :23.82   Mean   :23.88   Mean   :23.94  
##  3rd Qu.:25.42   3rd Qu.:25.48   3rd Qu.:25.54   3rd Qu.:25.62  
##  Max.   :30.64   Max.   :30.85   Max.   :31.04   Max.   :31.23  
##                                                                 
##      Y1995           Y1996           Y1997           Y1998      
##  Min.   :19.67   Min.   :19.71   Min.   :19.74   Min.   :19.77  
##  1st Qu.:21.83   1st Qu.:21.89   1st Qu.:21.94   1st Qu.:22.00  
##  Median :24.41   Median :24.42   Median :24.50   Median :24.49  
##  Mean   :24.00   Mean   :24.07   Mean   :24.14   Mean   :24.21  
##  3rd Qu.:25.70   3rd Qu.:25.78   3rd Qu.:25.85   3rd Qu.:25.94  
##  Max.   :31.41   Max.   :31.59   Max.   :31.77   Max.   :31.95  
##                                                                 
##      Y1999           Y2000           Y2001           Y2002      
##  Min.   :19.80   Min.   :19.83   Min.   :19.86   Min.   :19.84  
##  1st Qu.:22.04   1st Qu.:22.12   1st Qu.:22.22   1st Qu.:22.29  
##  Median :24.61   Median :24.66   Median :24.73   Median :24.81  
##  Mean   :24.29   Mean   :24.36   Mean   :24.44   Mean   :24.52  
##  3rd Qu.:26.01   3rd Qu.:26.09   3rd Qu.:26.19   3rd Qu.:26.30  
##  Max.   :32.13   Max.   :32.32   Max.   :32.51   Max.   :32.70  
##                                                                 
##      Y2003           Y2004           Y2005           Y2006      
##  Min.   :19.81   Min.   :19.79   Min.   :19.79   Min.   :19.80  
##  1st Qu.:22.37   1st Qu.:22.45   1st Qu.:22.54   1st Qu.:22.63  
##  Median :24.89   Median :25.00   Median :25.11   Median :25.24  
##  Mean   :24.61   Mean   :24.70   Mean   :24.79   Mean   :24.89  
##  3rd Qu.:26.38   3rd Qu.:26.47   3rd Qu.:26.53   3rd Qu.:26.59  
##  Max.   :32.90   Max.   :33.10   Max.   :33.30   Max.   :33.49  
##                                                                 
##      Y2007           Y2008      
##  Min.   :19.83   Min.   :19.87  
##  1st Qu.:22.73   1st Qu.:22.83  
##  Median :25.36   Median :25.50  
##  Mean   :24.99   Mean   :25.10  
##  3rd Qu.:26.66   3rd Qu.:26.82  
##  Max.   :33.69   Max.   :33.90  
## 
# Histogram of the 2008 BMI values (one value per country).
hist(bmi$Y2008)

# Scatter plot of each country's 1980 BMI (x-axis) against its 2008 BMI (y-axis).
plot(bmi$Y1980, bmi$Y2008)

Tidying Data

Principles of Tidy Data

Common Symptoms of Messy Data

Tidyr: Gather & Spread

library(tidyr)
# Wide -> long: stack every year column into a (year, bmi_val) pair,
# keeping Country as the identifier column.
bmi_long <- gather(bmi, key = year, value = bmi_val, -Country)
# Preview the first rows of the long table.
head(bmi_long)
##               Country  year  bmi_val
## 1         Afghanistan Y1980 21.48678
## 2             Albania Y1980 25.22533
## 3             Algeria Y1980 22.25703
## 4             Andorra Y1980 25.66652
## 5              Angola Y1980 20.94876
## 6 Antigua and Barbuda Y1980 23.31424
# Long -> wide: turn each distinct year back into its own column of bmi_val.
bmi_wide <- spread(bmi_long, key = year, value = bmi_val)
# Preview the first rows of the wide table.
head(bmi_wide)
##               Country    Y1980    Y1981    Y1982    Y1983    Y1984
## 1         Afghanistan 21.48678 21.46552 21.45145 21.43822 21.42734
## 2             Albania 25.22533 25.23981 25.25636 25.27176 25.27901
## 3             Algeria 22.25703 22.34745 22.43647 22.52105 22.60633
## 4             Andorra 25.66652 25.70868 25.74681 25.78250 25.81874
## 5              Angola 20.94876 20.94371 20.93754 20.93187 20.93569
## 6 Antigua and Barbuda 23.31424 23.39054 23.45883 23.53735 23.63584
##      Y1985    Y1986    Y1987    Y1988    Y1989    Y1990    Y1991    Y1992
## 1 21.41222 21.40132 21.37679 21.34018 21.29845 21.24818 21.20269 21.14238
## 2 25.28669 25.29451 25.30217 25.30450 25.31944 25.32357 25.28452 25.23077
## 3 22.69501 22.76979 22.84096 22.90644 22.97931 23.04600 23.11333 23.18776
## 4 25.85236 25.89089 25.93414 25.98477 26.04450 26.10936 26.17912 26.24017
## 5 20.94857 20.96030 20.98025 21.01375 21.05269 21.09007 21.12136 21.14987
## 6 23.73109 23.83449 23.93649 24.05364 24.16347 24.26782 24.36568 24.45644
##      Y1993    Y1994    Y1995    Y1996    Y1997    Y1998    Y1999    Y2000
## 1 21.06376 20.97987 20.91132 20.85155 20.81307 20.78591 20.75469 20.69521
## 2 25.21192 25.22115 25.25874 25.31097 25.33988 25.39116 25.46555 25.55835
## 3 23.25764 23.32273 23.39526 23.46811 23.54160 23.61592 23.69486 23.77659
## 4 26.30356 26.36793 26.43569 26.50769 26.58255 26.66337 26.75078 26.83179
## 5 21.13938 21.14186 21.16022 21.19076 21.22621 21.27082 21.31954 21.37480
## 6 24.54096 24.60945 24.66461 24.72544 24.78714 24.84936 24.91721 24.99158
##      Y2001    Y2002    Y2003    Y2004    Y2005    Y2006    Y2007    Y2008
## 1 20.62643 20.59848 20.58706 20.57759 20.58084 20.58749 20.60246 20.62058
## 2 25.66701 25.77167 25.87274 25.98136 26.08939 26.20867 26.32753 26.44657
## 3 23.86256 23.95294 24.05243 24.15957 24.27001 24.38270 24.48846 24.59620
## 4 26.92373 27.02525 27.12481 27.23107 27.32827 27.43588 27.53363 27.63048
## 5 21.43664 21.51765 21.59924 21.69218 21.80564 21.93881 22.08962 22.25083
## 6 25.05857 25.13039 25.20713 25.29898 25.39965 25.51382 25.64247 25.76602

Compare bmi, bmi_long, and bmi_wide: notice that bmi_wide and bmi are indeed the same.

Tidyr: Separate & Unite

Preparing Data for Analysis

Types Conversion

library(lubridate)
# Load the students data. stringsAsFactors = TRUE is spelled out because
# R >= 4.0.0 defaults to FALSE; without it dob and Grades would already be
# character and the "before" output below would not be reproduced.
students <- read.csv(
  'https://assets.datacamp.com/production/repositories/34/datasets/f75a87dbbdf2cf79e2286f97b2af22146cb717b1/students_with_dates.csv',
  stringsAsFactors = TRUE
)
# Before conversion: both columns are factors.
str(students$dob)
##  Factor w/ 345 levels "1996-11-02","1996-11-03",..: 313 267 113 100 162 224 79 147 292 44 ...
str(students$Grades)
##  Factor w/ 197 levels "10/0/0","10/10/0",..: 124 123 154 86 128 88 46 131 104 77 ...
# Grades holds slash-separated text, so store it as plain character.
students$Grades <- as.character(students$Grades)
# dob holds year-month-day strings; parse them into Date objects.
students$dob <- ymd(students$dob)
# After conversion: dob is a Date and Grades is character.
str(students$dob)
##  Date[1:395], format: "2000-06-05" "1999-11-25" "1998-02-02" "1997-12-20" "1998-10-04" ...
str(students$Grades)
##  chr [1:395] "5/6/6" "5/5/6" "7/8/10" "15/14/15" "6/10/10" "15/15/15" ...

String Manipulation

library(stringr)
# Strip leading and trailing whitespace from each element.
str_trim(string = c("   Filip ", "Nick  ", " Jonathan"))
## [1] "Filip"    "Nick"     "Jonathan"
# Left-pad each ID with zeros up to a total width of 9 characters.
str_pad(string = c("23485W", "8823453Q", "994Z"), width = 9, side = "left", pad = "0")
## [1] "00023485W" "08823453Q" "00000994Z"
# First few dates of birth.
head(students$dob)
## [1] "2000-06-05" "1999-11-25" "1998-02-02" "1997-12-20" "1998-10-04"
## [6] "1999-06-16"
# Flag the dates of birth that contain "1997".
head(str_detect(string = students$dob, pattern = "1997"))
## [1] FALSE FALSE FALSE  TRUE FALSE FALSE
# Inspect the sex column.
str(students$sex)
##  Factor w/ 2 levels "F","M": 1 1 1 1 1 2 2 1 2 2 ...
# Expand the one-letter codes into full labels. The patterns are anchored
# (^...$) so only values that are exactly "F" or "M" are rewritten; an
# unanchored 'F' would also match the "F" in an already-expanded "Female"
# and corrupt it if this step were ever run a second time.
students$sex <- str_replace(students$sex, '^F$', 'Female')
students$sex <- str_replace(students$sex, '^M$', 'Male')
# str_replace() returns character, so sex is now a character vector.
str(students$sex)
##  chr [1:395] "Female" "Female" "Female" "Female" "Female" "Male" ...

Missing & Special Values

Outliers & Obvious Errors

A simple histogram or boxplot, displaying the distribution of a variable’s values across all the observations, can be key to identifying potential outliers as early as possible.

# Histogram of absences: shows the overall shape of the distribution.
hist(students$absences)

# Boxplot of the same variable, a second view for spotting potential outliers.
boxplot(students$absences)

Case Study

So, let’s start with understanding the structure of your data.

# Read the serialized weather data frame from disk.
# NOTE(review): this is an absolute path on one machine; adjust it before
# running anywhere else.
weather <- readRDS(file = "D:/Downloads/weather.rds")
# Confirm that weather is a data.frame.
class(weather)
## [1] "data.frame"
# Inspect the number of rows and columns.
dim(weather)
## [1] 286  35
# List the column names.
names(weather)
##  [1] "X"       "year"    "month"   "measure" "X1"      "X2"      "X3"     
##  [8] "X4"      "X5"      "X6"      "X7"      "X8"      "X9"      "X10"    
## [15] "X11"     "X12"     "X13"     "X14"     "X15"     "X16"     "X17"    
## [22] "X18"     "X19"     "X20"     "X21"     "X22"     "X23"     "X24"    
## [29] "X25"     "X26"     "X27"     "X28"     "X29"     "X30"     "X31"

Then, we’re going to look at your data.

# View the structure of the data (note that the day columns X1..X31 are
# stored as character).
str(weather)
## 'data.frame':    286 obs. of  35 variables:
##  $ X      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ year   : int  2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
##  $ month  : int  12 12 12 12 12 12 12 12 12 12 ...
##  $ measure: chr  "Max.TemperatureF" "Mean.TemperatureF" "Min.TemperatureF" "Max.Dew.PointF" ...
##  $ X1     : chr  "64" "52" "39" "46" ...
##  $ X2     : chr  "42" "38" "33" "40" ...
##  $ X3     : chr  "51" "44" "37" "49" ...
##  $ X4     : chr  "43" "37" "30" "24" ...
##  $ X5     : chr  "42" "34" "26" "37" ...
##  $ X6     : chr  "45" "42" "38" "45" ...
##  $ X7     : chr  "38" "30" "21" "36" ...
##  $ X8     : chr  "29" "24" "18" "28" ...
##  $ X9     : chr  "49" "39" "29" "49" ...
##  $ X10    : chr  "48" "43" "38" "45" ...
##  $ X11    : chr  "39" "36" "32" "37" ...
##  $ X12    : chr  "39" "35" "31" "28" ...
##  $ X13    : chr  "42" "37" "32" "28" ...
##  $ X14    : chr  "45" "39" "33" "29" ...
##  $ X15    : chr  "42" "37" "32" "33" ...
##  $ X16    : chr  "44" "40" "35" "42" ...
##  $ X17    : chr  "49" "45" "41" "46" ...
##  $ X18    : chr  "44" "40" "36" "34" ...
##  $ X19    : chr  "37" "33" "29" "25" ...
##  $ X20    : chr  "36" "32" "27" "30" ...
##  $ X21    : chr  "36" "33" "30" "30" ...
##  $ X22    : chr  "44" "39" "33" "39" ...
##  $ X23    : chr  "47" "45" "42" "45" ...
##  $ X24    : chr  "46" "44" "41" "46" ...
##  $ X25    : chr  "59" "52" "44" "58" ...
##  $ X26    : chr  "50" "44" "37" "31" ...
##  $ X27    : chr  "52" "45" "38" "34" ...
##  $ X28    : chr  "52" "46" "40" "42" ...
##  $ X29    : chr  "41" "36" "30" "26" ...
##  $ X30    : chr  "30" "26" "22" "10" ...
##  $ X31    : chr  "30" "25" "20" "8" ...
# Look at the structure again, this time using dplyr's glimpse().
library(dplyr)
glimpse(weather)
## Observations: 286
## Variables: 35
## $ X       <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ year    <int> 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, ...
## $ month   <int> 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12...
## $ measure <chr> "Max.TemperatureF", "Mean.TemperatureF", "Min.Temperat...
## $ X1      <chr> "64", "52", "39", "46", "40", "26", "74", "63", "52", ...
## $ X2      <chr> "42", "38", "33", "40", "27", "17", "92", "72", "51", ...
## $ X3      <chr> "51", "44", "37", "49", "42", "24", "100", "79", "57",...
## $ X4      <chr> "43", "37", "30", "24", "21", "13", "69", "54", "39", ...
## $ X5      <chr> "42", "34", "26", "37", "25", "12", "85", "66", "47", ...
## $ X6      <chr> "45", "42", "38", "45", "40", "36", "100", "93", "85",...
## $ X7      <chr> "38", "30", "21", "36", "20", "-3", "92", "61", "29", ...
## $ X8      <chr> "29", "24", "18", "28", "16", "3", "92", "70", "47", "...
## $ X9      <chr> "49", "39", "29", "49", "41", "28", "100", "93", "86",...
## $ X10     <chr> "48", "43", "38", "45", "39", "37", "100", "95", "89",...
## $ X11     <chr> "39", "36", "32", "37", "31", "27", "92", "87", "82", ...
## $ X12     <chr> "39", "35", "31", "28", "27", "25", "85", "75", "64", ...
## $ X13     <chr> "42", "37", "32", "28", "26", "24", "75", "65", "55", ...
## $ X14     <chr> "45", "39", "33", "29", "27", "25", "82", "68", "53", ...
## $ X15     <chr> "42", "37", "32", "33", "29", "27", "89", "75", "60", ...
## $ X16     <chr> "44", "40", "35", "42", "36", "30", "96", "85", "73", ...
## $ X17     <chr> "49", "45", "41", "46", "41", "32", "100", "85", "70",...
## $ X18     <chr> "44", "40", "36", "34", "30", "26", "89", "73", "57", ...
## $ X19     <chr> "37", "33", "29", "25", "22", "20", "69", "63", "56", ...
## $ X20     <chr> "36", "32", "27", "30", "24", "20", "89", "79", "69", ...
## $ X21     <chr> "36", "33", "30", "30", "27", "25", "85", "77", "69", ...
## $ X22     <chr> "44", "39", "33", "39", "34", "25", "89", "79", "69", ...
## $ X23     <chr> "47", "45", "42", "45", "42", "37", "100", "91", "82",...
## $ X24     <chr> "46", "44", "41", "46", "44", "41", "100", "98", "96",...
## $ X25     <chr> "59", "52", "44", "58", "43", "29", "100", "75", "49",...
## $ X26     <chr> "50", "44", "37", "31", "29", "28", "70", "60", "49", ...
## $ X27     <chr> "52", "45", "38", "34", "31", "29", "70", "60", "50", ...
## $ X28     <chr> "52", "46", "40", "42", "35", "27", "76", "65", "53", ...
## $ X29     <chr> "41", "36", "30", "26", "20", "10", "64", "51", "37", ...
## $ X30     <chr> "30", "26", "22", "10", "4", "-6", "50", "38", "26", "...
## $ X31     <chr> "30", "25", "20", "8", "5", "1", "57", "44", "31", "30...
# View a summary of the data (character columns only report length, class,
# and mode).
summary(weather)
##        X               year          month          measure         
##  Min.   :  1.00   Min.   :2014   Min.   : 1.000   Length:286        
##  1st Qu.: 72.25   1st Qu.:2015   1st Qu.: 4.000   Class :character  
##  Median :143.50   Median :2015   Median : 7.000   Mode  :character  
##  Mean   :143.50   Mean   :2015   Mean   : 6.923                     
##  3rd Qu.:214.75   3rd Qu.:2015   3rd Qu.:10.000                     
##  Max.   :286.00   Max.   :2015   Max.   :12.000                     
##       X1                 X2                 X3           
##  Length:286         Length:286         Length:286        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##       X4                 X5                 X6           
##  Length:286         Length:286         Length:286        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##       X7                 X8                 X9           
##  Length:286         Length:286         Length:286        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##      X10                X11                X12           
##  Length:286         Length:286         Length:286        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##      X13                X14                X15           
##  Length:286         Length:286         Length:286        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##      X16                X17                X18           
##  Length:286         Length:286         Length:286        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##      X19                X20                X21           
##  Length:286         Length:286         Length:286        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##      X22                X23                X24           
##  Length:286         Length:286         Length:286        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##      X25                X26                X27           
##  Length:286         Length:286         Length:286        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##      X28                X29                X30           
##  Length:286         Length:286         Length:286        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##      X31           
##  Length:286        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Take a closer look at your data: print the first six rows.
head(weather)
##   X year month           measure X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## 1 1 2014    12  Max.TemperatureF 64 42 51 43 42 45 38 29 49  48  39  39
## 2 2 2014    12 Mean.TemperatureF 52 38 44 37 34 42 30 24 39  43  36  35
## 3 3 2014    12  Min.TemperatureF 39 33 37 30 26 38 21 18 29  38  32  31
## 4 4 2014    12    Max.Dew.PointF 46 40 49 24 37 45 36 28 49  45  37  28
## 5 5 2014    12    MeanDew.PointF 40 27 42 21 25 40 20 16 41  39  31  27
## 6 6 2014    12     Min.DewpointF 26 17 24 13 12 36 -3  3 28  37  27  25
##   X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30
## 1  42  45  42  44  49  44  37  36  36  44  47  46  59  50  52  52  41  30
## 2  37  39  37  40  45  40  33  32  33  39  45  44  52  44  45  46  36  26
## 3  32  33  32  35  41  36  29  27  30  33  42  41  44  37  38  40  30  22
## 4  28  29  33  42  46  34  25  30  30  39  45  46  58  31  34  42  26  10
## 5  26  27  29  36  41  30  22  24  27  34  42  44  43  29  31  35  20   4
## 6  24  25  27  30  32  26  20  20  25  25  37  41  29  28  29  27  10  -6
##   X31
## 1  30
## 2  25
## 3  20
## 4   8
## 5   5
## 6   1

Well done! Not surprisingly, this dataset is pretty messy…for now :)
It’s time to tidy it up!

# Gather the day columns (X1 through X31) into day/value pairs.
library(tidyr)
# na.rm = TRUE drops the rows whose gathered value is NA.
weather2 <- gather(weather, key = day, value = value, X1:X31, na.rm = TRUE)
# Preview the gathered data.
head(weather2)
##   X year month           measure day value
## 1 1 2014    12  Max.TemperatureF  X1    64
## 2 2 2014    12 Mean.TemperatureF  X1    52
## 3 3 2014    12  Min.TemperatureF  X1    39
## 4 4 2014    12    Max.Dew.PointF  X1    46
## 5 5 2014    12    MeanDew.PointF  X1    40
## 6 6 2014    12     Min.DewpointF  X1    26
# Drop the first column (X, which only holds the original row numbers).
weather2 <- weather2[-1]
# Spread the measure/value pairs out into one column per measure.
weather3 <- spread(weather2, key = measure, value = value)
# Preview the result.
head(weather3)
##   year month day CloudCover    Events Max.Dew.PointF Max.Gust.SpeedMPH
## 1 2014    12  X1          6      Rain             46                29
## 2 2014    12 X10          8      Rain             45                29
## 3 2014    12 X11          8 Rain-Snow             37                28
## 4 2014    12 X12          7      Snow             28                21
## 5 2014    12 X13          5                       28                23
## 6 2014    12 X14          4                       29                20
##   Max.Humidity Max.Sea.Level.PressureIn Max.TemperatureF
## 1           74                    30.45               64
## 2          100                    29.58               48
## 3           92                    29.81               39
## 4           85                    29.88               39
## 5           75                    29.86               42
## 6           82                    29.91               45
##   Max.VisibilityMiles Max.Wind.SpeedMPH Mean.Humidity
## 1                  10                22            63
## 2                  10                23            95
## 3                  10                21            87
## 4                  10                16            75
## 5                  10                17            65
## 6                  10                15            68
##   Mean.Sea.Level.PressureIn Mean.TemperatureF Mean.VisibilityMiles
## 1                     30.13                52                   10
## 2                      29.5                43                    3
## 3                     29.61                36                    7
## 4                     29.85                35                   10
## 5                     29.82                37                   10
## 6                     29.83                39                   10
##   Mean.Wind.SpeedMPH MeanDew.PointF Min.DewpointF Min.Humidity
## 1                 13             40            26           52
## 2                 13             39            37           89
## 3                 13             31            27           82
## 4                 11             27            25           64
## 5                 12             26            24           55
## 6                 10             27            25           53
##   Min.Sea.Level.PressureIn Min.TemperatureF Min.VisibilityMiles
## 1                    30.01               39                  10
## 2                    29.43               38                   1
## 3                    29.44               32                   1
## 4                    29.81               31                   7
## 5                    29.78               32                  10
## 6                    29.78               33                  10
##   PrecipitationIn WindDirDegrees
## 1            0.01            268
## 2            0.28            357
## 3            0.02            230
## 4               T            286
## 5               T            298
## 6            0.00            306
# Strip the "X" prefix from the day labels.
weather3$day <- str_replace(weather3$day, pattern = "X", replacement = "")
# Combine year, month, and day into a single "date" column joined by "-".
weather4 <- unite(weather3, col = date, year, month, day, sep = "-")
# Parse the date strings into Date objects with lubridate's ymd().
weather4$date <- ymd(weather4$date)
# Reorder the columns with dplyr's select(): date, then Events, then the rest.
weather5 <- select(weather4, date, Events, CloudCover:WindDirDegrees)
# Preview the first rows of weather5.
head(weather5)
##         date    Events CloudCover Max.Dew.PointF Max.Gust.SpeedMPH
## 1 2014-12-01      Rain          6             46                29
## 2 2014-12-10      Rain          8             45                29
## 3 2014-12-11 Rain-Snow          8             37                28
## 4 2014-12-12      Snow          7             28                21
## 5 2014-12-13                    5             28                23
## 6 2014-12-14                    4             29                20
##   Max.Humidity Max.Sea.Level.PressureIn Max.TemperatureF
## 1           74                    30.45               64
## 2          100                    29.58               48
## 3           92                    29.81               39
## 4           85                    29.88               39
## 5           75                    29.86               42
## 6           82                    29.91               45
##   Max.VisibilityMiles Max.Wind.SpeedMPH Mean.Humidity
## 1                  10                22            63
## 2                  10                23            95
## 3                  10                21            87
## 4                  10                16            75
## 5                  10                17            65
## 6                  10                15            68
##   Mean.Sea.Level.PressureIn Mean.TemperatureF Mean.VisibilityMiles
## 1                     30.13                52                   10
## 2                      29.5                43                    3
## 3                     29.61                36                    7
## 4                     29.85                35                   10
## 5                     29.82                37                   10
## 6                     29.83                39                   10
##   Mean.Wind.SpeedMPH MeanDew.PointF Min.DewpointF Min.Humidity
## 1                 13             40            26           52
## 2                 13             39            37           89
## 3                 13             31            27           82
## 4                 11             27            25           64
## 5                 12             26            24           55
## 6                 10             27            25           53
##   Min.Sea.Level.PressureIn Min.TemperatureF Min.VisibilityMiles
## 1                    30.01               39                  10
## 2                    29.43               38                   1
## 3                    29.44               32                   1
## 4                    29.81               31                   7
## 5                    29.78               32                  10
## 6                    29.78               33                  10
##   PrecipitationIn WindDirDegrees
## 1            0.01            268
## 2            0.28            357
## 3            0.02            230
## 4               T            286
## 5               T            298
## 6            0.00            306

And now, it’s time for strange & missing values.

#Look for strange values in PrecipitationIn
#Start by inspecting the structure of weather5
str(object = weather5)
## 'data.frame':    366 obs. of  23 variables:
##  $ date                     : Date, format: "2014-12-01" "2014-12-10" ...
##  $ Events                   : chr  "Rain" "Rain" "Rain-Snow" "Snow" ...
##  $ CloudCover               : chr  "6" "8" "8" "7" ...
##  $ Max.Dew.PointF           : chr  "46" "45" "37" "28" ...
##  $ Max.Gust.SpeedMPH        : chr  "29" "29" "28" "21" ...
##  $ Max.Humidity             : chr  "74" "100" "92" "85" ...
##  $ Max.Sea.Level.PressureIn : chr  "30.45" "29.58" "29.81" "29.88" ...
##  $ Max.TemperatureF         : chr  "64" "48" "39" "39" ...
##  $ Max.VisibilityMiles      : chr  "10" "10" "10" "10" ...
##  $ Max.Wind.SpeedMPH        : chr  "22" "23" "21" "16" ...
##  $ Mean.Humidity            : chr  "63" "95" "87" "75" ...
##  $ Mean.Sea.Level.PressureIn: chr  "30.13" "29.5" "29.61" "29.85" ...
##  $ Mean.TemperatureF        : chr  "52" "43" "36" "35" ...
##  $ Mean.VisibilityMiles     : chr  "10" "3" "7" "10" ...
##  $ Mean.Wind.SpeedMPH       : chr  "13" "13" "13" "11" ...
##  $ MeanDew.PointF           : chr  "40" "39" "31" "27" ...
##  $ Min.DewpointF            : chr  "26" "37" "27" "25" ...
##  $ Min.Humidity             : chr  "52" "89" "82" "64" ...
##  $ Min.Sea.Level.PressureIn : chr  "30.01" "29.43" "29.44" "29.81" ...
##  $ Min.TemperatureF         : chr  "39" "38" "32" "31" ...
##  $ Min.VisibilityMiles      : chr  "10" "1" "1" "7" ...
##  $ PrecipitationIn          : chr  "0.01" "0.28" "0.02" "T" ...
##  $ WindDirDegrees           : chr  "268" "357" "230" "286" ...
#Print the first 20 rows of weather5 to see whether the character columns mostly hold numbers
head(weather5, n = 20)
##          date    Events CloudCover Max.Dew.PointF Max.Gust.SpeedMPH
## 1  2014-12-01      Rain          6             46                29
## 2  2014-12-10      Rain          8             45                29
## 3  2014-12-11 Rain-Snow          8             37                28
## 4  2014-12-12      Snow          7             28                21
## 5  2014-12-13                    5             28                23
## 6  2014-12-14                    4             29                20
## 7  2014-12-15                    2             33                21
## 8  2014-12-16      Rain          8             42                10
## 9  2014-12-17      Rain          8             46                26
## 10 2014-12-18      Rain          7             34                30
## 11 2014-12-19                    4             25                23
## 12 2014-12-02 Rain-Snow          7             40                29
## 13 2014-12-20      Snow          6             30                26
## 14 2014-12-21      Snow          8             30                20
## 15 2014-12-22      Rain          7             39                22
## 16 2014-12-23      Rain          8             45                25
## 17 2014-12-24  Fog-Rain          8             46                15
## 18 2014-12-25      Rain          6             58                40
## 19 2014-12-26                    1             31                25
## 20 2014-12-27                    3             34                21
##    Max.Humidity Max.Sea.Level.PressureIn Max.TemperatureF
## 1            74                    30.45               64
## 2           100                    29.58               48
## 3            92                    29.81               39
## 4            85                    29.88               39
## 5            75                    29.86               42
## 6            82                    29.91               45
## 7            89                    30.15               42
## 8            96                    30.17               44
## 9           100                    29.91               49
## 10           89                    29.87               44
## 11           69                    30.15               37
## 12           92                    30.71               42
## 13           89                    30.31               36
## 14           85                    30.37               36
## 15           89                     30.4               44
## 16          100                    30.31               47
## 17          100                    30.13               46
## 18          100                    29.96               59
## 19           70                    30.16               50
## 20           70                    30.22               52
##    Max.VisibilityMiles Max.Wind.SpeedMPH Mean.Humidity
## 1                   10                22            63
## 2                   10                23            95
## 3                   10                21            87
## 4                   10                16            75
## 5                   10                17            65
## 6                   10                15            68
## 7                   10                15            75
## 8                   10                 8            85
## 9                   10                20            85
## 10                  10                23            73
## 11                  10                17            63
## 12                  10                24            72
## 13                  10                21            79
## 14                  10                16            77
## 15                  10                18            79
## 16                  10                20            91
## 17                   2                13            98
## 18                  10                28            75
## 19                  10                18            60
## 20                  10                17            60
##    Mean.Sea.Level.PressureIn Mean.TemperatureF Mean.VisibilityMiles
## 1                      30.13                52                   10
## 2                       29.5                43                    3
## 3                      29.61                36                    7
## 4                      29.85                35                   10
## 5                      29.82                37                   10
## 6                      29.83                39                   10
## 7                      30.05                37                   10
## 8                      30.09                40                    9
## 9                      29.75                45                    6
## 10                     29.78                40                   10
## 11                     29.98                33                   10
## 12                     30.59                38                    8
## 13                     30.26                32                   10
## 14                     30.32                33                    9
## 15                     30.35                39                   10
## 16                     30.23                45                    5
## 17                      29.9                44                    1
## 18                     29.63                52                    8
## 19                     30.11                44                   10
## 20                     30.14                45                   10
##    Mean.Wind.SpeedMPH MeanDew.PointF Min.DewpointF Min.Humidity
## 1                  13             40            26           52
## 2                  13             39            37           89
## 3                  13             31            27           82
## 4                  11             27            25           64
## 5                  12             26            24           55
## 6                  10             27            25           53
## 7                   6             29            27           60
## 8                   4             36            30           73
## 9                  11             41            32           70
## 10                 14             30            26           57
## 11                 11             22            20           56
## 12                 15             27            17           51
## 13                 10             24            20           69
## 14                  9             27            25           69
## 15                  8             34            25           69
## 16                 13             42            37           82
## 17                  6             44            41           96
## 18                 14             43            29           49
## 19                 11             29            28           49
## 20                  9             31            29           50
##    Min.Sea.Level.PressureIn Min.TemperatureF Min.VisibilityMiles
## 1                     30.01               39                  10
## 2                     29.43               38                   1
## 3                     29.44               32                   1
## 4                     29.81               31                   7
## 5                     29.78               32                  10
## 6                     29.78               33                  10
## 7                     29.91               32                  10
## 8                     29.92               35                   5
## 9                     29.69               41                   1
## 10                    29.71               36                  10
## 11                    29.86               29                  10
## 12                     30.4               33                   2
## 13                    30.17               27                   7
## 14                    30.28               30                   6
## 15                     30.3               33                   4
## 16                    30.16               42                   1
## 17                    29.55               41                   0
## 18                    29.47               44                   1
## 19                    29.99               37                  10
## 20                    30.03               38                  10
##    PrecipitationIn WindDirDegrees
## 1             0.01            268
## 2             0.28            357
## 3             0.02            230
## 4                T            286
## 5                T            298
## 6             0.00            306
## 7             0.00            324
## 8                T             79
## 9             0.43            311
## 10            0.01            281
## 11            0.00            305
## 12            0.10             62
## 13               T            350
## 14               T              2
## 15            0.05             24
## 16            0.25             63
## 17            0.56             12
## 18            0.14            250
## 19            0.00            255
## 20            0.00            251
#Replace the non-numeric "T" entries in PrecipitationIn with "0" so that the
#column can be converted to numeric below
weather5$PrecipitationIn <- str_replace(weather5$PrecipitationIn, "T", "0")
#Or using sub(): 'weather5$PrecipitationIn <- sub("T", "0", weather5$PrecipitationIn)'
#Convert characters to numerics (every column from CloudCover to WindDirDegrees).
#as.numeric is passed directly: funs() is deprecated since dplyr 0.8.0
weather6 <- mutate_at(weather5, vars(CloudCover:WindDirDegrees), as.numeric)
#check the dataset
str(weather6)
## 'data.frame':    366 obs. of  23 variables:
##  $ date                     : Date, format: "2014-12-01" "2014-12-10" ...
##  $ Events                   : chr  "Rain" "Rain" "Rain-Snow" "Snow" ...
##  $ CloudCover               : num  6 8 8 7 5 4 2 8 8 7 ...
##  $ Max.Dew.PointF           : num  46 45 37 28 28 29 33 42 46 34 ...
##  $ Max.Gust.SpeedMPH        : num  29 29 28 21 23 20 21 10 26 30 ...
##  $ Max.Humidity             : num  74 100 92 85 75 82 89 96 100 89 ...
##  $ Max.Sea.Level.PressureIn : num  30.4 29.6 29.8 29.9 29.9 ...
##  $ Max.TemperatureF         : num  64 48 39 39 42 45 42 44 49 44 ...
##  $ Max.VisibilityMiles      : num  10 10 10 10 10 10 10 10 10 10 ...
##  $ Max.Wind.SpeedMPH        : num  22 23 21 16 17 15 15 8 20 23 ...
##  $ Mean.Humidity            : num  63 95 87 75 65 68 75 85 85 73 ...
##  $ Mean.Sea.Level.PressureIn: num  30.1 29.5 29.6 29.9 29.8 ...
##  $ Mean.TemperatureF        : num  52 43 36 35 37 39 37 40 45 40 ...
##  $ Mean.VisibilityMiles     : num  10 3 7 10 10 10 10 9 6 10 ...
##  $ Mean.Wind.SpeedMPH       : num  13 13 13 11 12 10 6 4 11 14 ...
##  $ MeanDew.PointF           : num  40 39 31 27 26 27 29 36 41 30 ...
##  $ Min.DewpointF            : num  26 37 27 25 24 25 27 30 32 26 ...
##  $ Min.Humidity             : num  52 89 82 64 55 53 60 73 70 57 ...
##  $ Min.Sea.Level.PressureIn : num  30 29.4 29.4 29.8 29.8 ...
##  $ Min.TemperatureF         : num  39 38 32 31 32 33 32 35 41 36 ...
##  $ Min.VisibilityMiles      : num  10 1 1 7 10 10 10 5 1 10 ...
##  $ PrecipitationIn          : num  0.01 0.28 0.02 0 0 0 0 0 0.43 0.01 ...
##  $ WindDirDegrees           : num  268 357 230 286 298 306 324 79 311 281 ...
#Summarise every column of weather6 to spot implausible values
summary(object = weather6)
##       date               Events            CloudCover    Max.Dew.PointF 
##  Min.   :2014-12-01   Length:366         Min.   :0.000   Min.   :-6.00  
##  1st Qu.:2015-03-02   Class :character   1st Qu.:3.000   1st Qu.:32.00  
##  Median :2015-06-01   Mode  :character   Median :5.000   Median :47.50  
##  Mean   :2015-06-01                      Mean   :4.708   Mean   :45.48  
##  3rd Qu.:2015-08-31                      3rd Qu.:7.000   3rd Qu.:61.00  
##  Max.   :2015-12-01                      Max.   :8.000   Max.   :75.00  
##                                                                         
##  Max.Gust.SpeedMPH  Max.Humidity     Max.Sea.Level.PressureIn
##  Min.   : 0.00     Min.   :  39.00   Min.   :29.58           
##  1st Qu.:21.00     1st Qu.:  73.25   1st Qu.:30.00           
##  Median :25.50     Median :  86.00   Median :30.14           
##  Mean   :26.99     Mean   :  85.69   Mean   :30.16           
##  3rd Qu.:31.25     3rd Qu.:  93.00   3rd Qu.:30.31           
##  Max.   :94.00     Max.   :1000.00   Max.   :30.88           
##  NA's   :6                                                   
##  Max.TemperatureF Max.VisibilityMiles Max.Wind.SpeedMPH Mean.Humidity  
##  Min.   :18.00    Min.   : 2.000      Min.   : 8.00     Min.   :28.00  
##  1st Qu.:42.00    1st Qu.:10.000      1st Qu.:16.00     1st Qu.:56.00  
##  Median :60.00    Median :10.000      Median :20.00     Median :66.00  
##  Mean   :58.93    Mean   : 9.907      Mean   :20.62     Mean   :66.02  
##  3rd Qu.:76.00    3rd Qu.:10.000      3rd Qu.:24.00     3rd Qu.:76.75  
##  Max.   :96.00    Max.   :10.000      Max.   :38.00     Max.   :98.00  
##                                                                        
##  Mean.Sea.Level.PressureIn Mean.TemperatureF Mean.VisibilityMiles
##  Min.   :29.49             Min.   : 8.00     Min.   :-1.000      
##  1st Qu.:29.87             1st Qu.:36.25     1st Qu.: 8.000      
##  Median :30.03             Median :53.50     Median :10.000      
##  Mean   :30.04             Mean   :51.40     Mean   : 8.861      
##  3rd Qu.:30.19             3rd Qu.:68.00     3rd Qu.:10.000      
##  Max.   :30.77             Max.   :84.00     Max.   :10.000      
##                                                                  
##  Mean.Wind.SpeedMPH MeanDew.PointF   Min.DewpointF     Min.Humidity  
##  Min.   : 4.00      Min.   :-11.00   Min.   :-18.00   Min.   :16.00  
##  1st Qu.: 8.00      1st Qu.: 24.00   1st Qu.: 16.25   1st Qu.:35.00  
##  Median :10.00      Median : 41.00   Median : 35.00   Median :46.00  
##  Mean   :10.68      Mean   : 38.96   Mean   : 32.25   Mean   :48.31  
##  3rd Qu.:13.00      3rd Qu.: 56.00   3rd Qu.: 51.00   3rd Qu.:60.00  
##  Max.   :22.00      Max.   : 71.00   Max.   : 68.00   Max.   :96.00  
##                                                                      
##  Min.Sea.Level.PressureIn Min.TemperatureF Min.VisibilityMiles
##  Min.   :29.16            Min.   :-3.00    Min.   : 0.000     
##  1st Qu.:29.76            1st Qu.:30.00    1st Qu.: 2.000     
##  Median :29.94            Median :46.00    Median :10.000     
##  Mean   :29.93            Mean   :43.33    Mean   : 6.716     
##  3rd Qu.:30.09            3rd Qu.:60.00    3rd Qu.:10.000     
##  Max.   :30.64            Max.   :74.00    Max.   :10.000     
##                                                               
##  PrecipitationIn  WindDirDegrees 
##  Min.   :0.0000   Min.   :  1.0  
##  1st Qu.:0.0000   1st Qu.:113.0  
##  Median :0.0000   Median :222.0  
##  Mean   :0.1016   Mean   :200.1  
##  3rd Qu.:0.0400   3rd Qu.:275.0  
##  Max.   :2.9000   Max.   :360.0  
## 

Looking at the summary, besides the strange values in PrecipitationIn, there are also problems in Max.Gust.SpeedMPH (6 NAs), Max.Humidity (a maximum of 1000) and Mean.VisibilityMiles (a minimum of -1).

#Missing values in Max.Gust.SpeedMPH
#Total number of NA cells anywhere in weather6
sum(is.na(weather6))
## [1] 6
#Row positions where Max.Gust.SpeedMPH is NA
ind <- which(is.na(weather6[["Max.Gust.SpeedMPH"]]))
#Show every column for the rows missing Max.Gust.SpeedMPH
weather6[ind, ]
##           date Events CloudCover Max.Dew.PointF Max.Gust.SpeedMPH
## 161 2015-05-18    Fog          6             52                NA
## 205 2015-06-03                 7             48                NA
## 273 2015-08-08                 4             61                NA
## 275 2015-09-01                 1             63                NA
## 308 2015-10-12                 0             56                NA
## 358 2015-11-03                 1             44                NA
##     Max.Humidity Max.Sea.Level.PressureIn Max.TemperatureF
## 161          100                    30.30               58
## 205           93                    30.31               56
## 273           87                    30.02               76
## 275           78                    30.06               79
## 308           89                    29.86               76
## 358           82                    30.25               73
##     Max.VisibilityMiles Max.Wind.SpeedMPH Mean.Humidity
## 161                  10                16            79
## 205                  10                14            82
## 273                  10                14            68
## 275                  10                15            65
## 308                  10                15            65
## 358                  10                16            57
##     Mean.Sea.Level.PressureIn Mean.TemperatureF Mean.VisibilityMiles
## 161                     30.23                54                    8
## 205                     30.24                52                   10
## 273                     29.99                69                   10
## 275                     30.02                74                   10
## 308                     29.80                64                   10
## 358                     30.13                60                   10
##     Mean.Wind.SpeedMPH MeanDew.PointF Min.DewpointF Min.Humidity
## 161                 10             48            43           57
## 205                  7             45            43           71
## 273                  6             57            54           49
## 275                  9             62            59           52
## 308                  8             51            48           41
## 358                  8             42            40           31
##     Min.Sea.Level.PressureIn Min.TemperatureF Min.VisibilityMiles
## 161                    30.12               49                   0
## 205                    30.19               47                  10
## 273                    29.95               61                  10
## 275                    29.96               69                  10
## 308                    29.74               51                  10
## 358                    30.06               47                  10
##     PrecipitationIn WindDirDegrees
## 161               0             72
## 205               0             90
## 273               0             45
## 275               0             54
## 308               0            199
## 358               0            281
#It's unclear why these values are missing and there doesn't appear to be any obvious pattern to their missingness, so we'll leave them alone for now. 

#Locate rows whose Max.Humidity is 1000 or more
ind <- which(weather6[["Max.Humidity"]] >= 1000)
#Show the complete record for the matching row(s)
weather6[ind, ]
##           date                Events CloudCover Max.Dew.PointF
## 135 2015-04-21 Fog-Rain-Thunderstorm          6             57
##     Max.Gust.SpeedMPH Max.Humidity Max.Sea.Level.PressureIn
## 135                94         1000                    29.75
##     Max.TemperatureF Max.VisibilityMiles Max.Wind.SpeedMPH Mean.Humidity
## 135               65                  10                20            71
##     Mean.Sea.Level.PressureIn Mean.TemperatureF Mean.VisibilityMiles
## 135                      29.6                56                    5
##     Mean.Wind.SpeedMPH MeanDew.PointF Min.DewpointF Min.Humidity
## 135                 10             49            36           42
##     Min.Sea.Level.PressureIn Min.TemperatureF Min.VisibilityMiles
## 135                    29.53               46                   0
##     PrecipitationIn WindDirDegrees
## 135            0.54            184
#Overwrite the 1000 reading with 100 (presumably an extra zero was typed - confirm against the source data)
weather6[["Max.Humidity"]][ind] <- 100

#Five-number summary (plus mean) of Mean.VisibilityMiles
summary(weather6[["Mean.VisibilityMiles"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -1.000   8.000  10.000   8.861  10.000  10.000
#Row position(s) where Mean.VisibilityMiles equals -1
ind <- which(weather6[["Mean.VisibilityMiles"]] == -1)
#Show every column for that row
weather6[ind, ]
##           date Events CloudCover Max.Dew.PointF Max.Gust.SpeedMPH
## 192 2015-06-18                 5             54                23
##     Max.Humidity Max.Sea.Level.PressureIn Max.TemperatureF
## 192           72                    30.14               76
##     Max.VisibilityMiles Max.Wind.SpeedMPH Mean.Humidity
## 192                  10                17            59
##     Mean.Sea.Level.PressureIn Mean.TemperatureF Mean.VisibilityMiles
## 192                     30.04                67                   -1
##     Mean.Wind.SpeedMPH MeanDew.PointF Min.DewpointF Min.Humidity
## 192                 10             49            45           46
##     Min.Sea.Level.PressureIn Min.TemperatureF Min.VisibilityMiles
## 192                    29.93               57                  10
##     PrecipitationIn WindDirDegrees
## 192               0            189
#Max.VisibilityMiles and Min.VisibilityMiles are both 10 for this row, so the mean must be 10 as well
weather6[["Mean.VisibilityMiles"]][ind] <- 10

Finally, let’s take one last look at our data.

#Replace empty cells in the Events column with the explicit label "None"
weather6$Events[weather6$Events == ""] <- "None"
#Print the first 6 rows of weather6
head(weather6)
##         date    Events CloudCover Max.Dew.PointF Max.Gust.SpeedMPH
## 1 2014-12-01      Rain          6             46                29
## 2 2014-12-10      Rain          8             45                29
## 3 2014-12-11 Rain-Snow          8             37                28
## 4 2014-12-12      Snow          7             28                21
## 5 2014-12-13      None          5             28                23
## 6 2014-12-14      None          4             29                20
##   Max.Humidity Max.Sea.Level.PressureIn Max.TemperatureF
## 1           74                    30.45               64
## 2          100                    29.58               48
## 3           92                    29.81               39
## 4           85                    29.88               39
## 5           75                    29.86               42
## 6           82                    29.91               45
##   Max.VisibilityMiles Max.Wind.SpeedMPH Mean.Humidity
## 1                  10                22            63
## 2                  10                23            95
## 3                  10                21            87
## 4                  10                16            75
## 5                  10                17            65
## 6                  10                15            68
##   Mean.Sea.Level.PressureIn Mean.TemperatureF Mean.VisibilityMiles
## 1                     30.13                52                   10
## 2                     29.50                43                    3
## 3                     29.61                36                    7
## 4                     29.85                35                   10
## 5                     29.82                37                   10
## 6                     29.83                39                   10
##   Mean.Wind.SpeedMPH MeanDew.PointF Min.DewpointF Min.Humidity
## 1                 13             40            26           52
## 2                 13             39            37           89
## 3                 13             31            27           82
## 4                 11             27            25           64
## 5                 12             26            24           55
## 6                 10             27            25           53
##   Min.Sea.Level.PressureIn Min.TemperatureF Min.VisibilityMiles
## 1                    30.01               39                  10
## 2                    29.43               38                   1
## 3                    29.44               32                   1
## 4                    29.81               31                   7
## 5                    29.78               32                  10
## 6                    29.78               33                  10
##   PrecipitationIn WindDirDegrees
## 1            0.01            268
## 2            0.28            357
## 3            0.02            230
## 4            0.00            286
## 5            0.00            298
## 6            0.00            306