Functions Helping Understanding Raw Data
dplyr
head(data, n = row): Lets you quickly see the top of a dataset. The default is row = 6.
tail(data, n = row): Lets you quickly see the bottom of a dataset. The default is row = 6.
hist(dataframe$column): Visualize data with a histogram.
plot(dataframe$column, dataframe$column): Visualize data as a series of (x, y) coordinates on a two-dimensional plane with a scatter plot.
bmi <- read.csv('https://assets.datacamp.com/production/repositories/34/datasets/a0a569ebbb34500d11979eba95360125127e6434/bmi_clean.csv')
#Getting a feel for your data
class(bmi)
## [1] "data.frame"
dim(bmi)
## [1] 199 30
names(bmi)
## [1] "Country" "Y1980" "Y1981" "Y1982" "Y1983" "Y1984" "Y1985"
## [8] "Y1986" "Y1987" "Y1988" "Y1989" "Y1990" "Y1991" "Y1992"
## [15] "Y1993" "Y1994" "Y1995" "Y1996" "Y1997" "Y1998" "Y1999"
## [22] "Y2000" "Y2001" "Y2002" "Y2003" "Y2004" "Y2005" "Y2006"
## [29] "Y2007" "Y2008"
#Viewing the structure of your data.
str(bmi)
## 'data.frame': 199 obs. of 30 variables:
## $ Country: Factor w/ 199 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Y1980 : num 21.5 25.2 22.3 25.7 20.9 ...
## $ Y1981 : num 21.5 25.2 22.3 25.7 20.9 ...
## $ Y1982 : num 21.5 25.3 22.4 25.7 20.9 ...
## $ Y1983 : num 21.4 25.3 22.5 25.8 20.9 ...
## $ Y1984 : num 21.4 25.3 22.6 25.8 20.9 ...
## $ Y1985 : num 21.4 25.3 22.7 25.9 20.9 ...
## $ Y1986 : num 21.4 25.3 22.8 25.9 21 ...
## $ Y1987 : num 21.4 25.3 22.8 25.9 21 ...
## $ Y1988 : num 21.3 25.3 22.9 26 21 ...
## $ Y1989 : num 21.3 25.3 23 26 21.1 ...
## $ Y1990 : num 21.2 25.3 23 26.1 21.1 ...
## $ Y1991 : num 21.2 25.3 23.1 26.2 21.1 ...
## $ Y1992 : num 21.1 25.2 23.2 26.2 21.1 ...
## $ Y1993 : num 21.1 25.2 23.3 26.3 21.1 ...
## $ Y1994 : num 21 25.2 23.3 26.4 21.1 ...
## $ Y1995 : num 20.9 25.3 23.4 26.4 21.2 ...
## $ Y1996 : num 20.9 25.3 23.5 26.5 21.2 ...
## $ Y1997 : num 20.8 25.3 23.5 26.6 21.2 ...
## $ Y1998 : num 20.8 25.4 23.6 26.7 21.3 ...
## $ Y1999 : num 20.8 25.5 23.7 26.8 21.3 ...
## $ Y2000 : num 20.7 25.6 23.8 26.8 21.4 ...
## $ Y2001 : num 20.6 25.7 23.9 26.9 21.4 ...
## $ Y2002 : num 20.6 25.8 24 27 21.5 ...
## $ Y2003 : num 20.6 25.9 24.1 27.1 21.6 ...
## $ Y2004 : num 20.6 26 24.2 27.2 21.7 ...
## $ Y2005 : num 20.6 26.1 24.3 27.3 21.8 ...
## $ Y2006 : num 20.6 26.2 24.4 27.4 21.9 ...
## $ Y2007 : num 20.6 26.3 24.5 27.5 22.1 ...
## $ Y2008 : num 20.6 26.4 24.6 27.6 22.3 ...
library(dplyr)
glimpse(bmi)
## Observations: 199
## Variables: 30
## $ Country <fct> Afghanistan, Albania, Algeria, Andorra, Angola, Antigu...
## $ Y1980 <dbl> 21.48678, 25.22533, 22.25703, 25.66652, 20.94876, 23.3...
## $ Y1981 <dbl> 21.46552, 25.23981, 22.34745, 25.70868, 20.94371, 23.3...
## $ Y1982 <dbl> 21.45145, 25.25636, 22.43647, 25.74681, 20.93754, 23.4...
## $ Y1983 <dbl> 21.43822, 25.27176, 22.52105, 25.78250, 20.93187, 23.5...
## $ Y1984 <dbl> 21.42734, 25.27901, 22.60633, 25.81874, 20.93569, 23.6...
## $ Y1985 <dbl> 21.41222, 25.28669, 22.69501, 25.85236, 20.94857, 23.7...
## $ Y1986 <dbl> 21.40132, 25.29451, 22.76979, 25.89089, 20.96030, 23.8...
## $ Y1987 <dbl> 21.37679, 25.30217, 22.84096, 25.93414, 20.98025, 23.9...
## $ Y1988 <dbl> 21.34018, 25.30450, 22.90644, 25.98477, 21.01375, 24.0...
## $ Y1989 <dbl> 21.29845, 25.31944, 22.97931, 26.04450, 21.05269, 24.1...
## $ Y1990 <dbl> 21.24818, 25.32357, 23.04600, 26.10936, 21.09007, 24.2...
## $ Y1991 <dbl> 21.20269, 25.28452, 23.11333, 26.17912, 21.12136, 24.3...
## $ Y1992 <dbl> 21.14238, 25.23077, 23.18776, 26.24017, 21.14987, 24.4...
## $ Y1993 <dbl> 21.06376, 25.21192, 23.25764, 26.30356, 21.13938, 24.5...
## $ Y1994 <dbl> 20.97987, 25.22115, 23.32273, 26.36793, 21.14186, 24.6...
## $ Y1995 <dbl> 20.91132, 25.25874, 23.39526, 26.43569, 21.16022, 24.6...
## $ Y1996 <dbl> 20.85155, 25.31097, 23.46811, 26.50769, 21.19076, 24.7...
## $ Y1997 <dbl> 20.81307, 25.33988, 23.54160, 26.58255, 21.22621, 24.7...
## $ Y1998 <dbl> 20.78591, 25.39116, 23.61592, 26.66337, 21.27082, 24.8...
## $ Y1999 <dbl> 20.75469, 25.46555, 23.69486, 26.75078, 21.31954, 24.9...
## $ Y2000 <dbl> 20.69521, 25.55835, 23.77659, 26.83179, 21.37480, 24.9...
## $ Y2001 <dbl> 20.62643, 25.66701, 23.86256, 26.92373, 21.43664, 25.0...
## $ Y2002 <dbl> 20.59848, 25.77167, 23.95294, 27.02525, 21.51765, 25.1...
## $ Y2003 <dbl> 20.58706, 25.87274, 24.05243, 27.12481, 21.59924, 25.2...
## $ Y2004 <dbl> 20.57759, 25.98136, 24.15957, 27.23107, 21.69218, 25.2...
## $ Y2005 <dbl> 20.58084, 26.08939, 24.27001, 27.32827, 21.80564, 25.3...
## $ Y2006 <dbl> 20.58749, 26.20867, 24.38270, 27.43588, 21.93881, 25.5...
## $ Y2007 <dbl> 20.60246, 26.32753, 24.48846, 27.53363, 22.08962, 25.6...
## $ Y2008 <dbl> 20.62058, 26.44657, 24.59620, 27.63048, 22.25083, 25.7...
summary(bmi)
## Country Y1980 Y1981 Y1982
## Afghanistan : 1 Min. :19.01 Min. :19.04 Min. :19.07
## Albania : 1 1st Qu.:21.27 1st Qu.:21.31 1st Qu.:21.36
## Algeria : 1 Median :23.31 Median :23.39 Median :23.46
## Andorra : 1 Mean :23.15 Mean :23.21 Mean :23.26
## Angola : 1 3rd Qu.:24.82 3rd Qu.:24.89 3rd Qu.:24.94
## Antigua and Barbuda: 1 Max. :28.12 Max. :28.36 Max. :28.58
## (Other) :193
## Y1983 Y1984 Y1985 Y1986
## Min. :19.10 Min. :19.13 Min. :19.16 Min. :19.20
## 1st Qu.:21.42 1st Qu.:21.45 1st Qu.:21.47 1st Qu.:21.49
## Median :23.57 Median :23.64 Median :23.73 Median :23.82
## Mean :23.32 Mean :23.37 Mean :23.42 Mean :23.48
## 3rd Qu.:25.02 3rd Qu.:25.06 3rd Qu.:25.11 3rd Qu.:25.20
## Max. :28.82 Max. :29.05 Max. :29.28 Max. :29.52
##
## Y1987 Y1988 Y1989 Y1990
## Min. :19.23 Min. :19.27 Min. :19.31 Min. :19.35
## 1st Qu.:21.50 1st Qu.:21.52 1st Qu.:21.55 1st Qu.:21.57
## Median :23.87 Median :23.93 Median :24.03 Median :24.14
## Mean :23.53 Mean :23.59 Mean :23.65 Mean :23.71
## 3rd Qu.:25.27 3rd Qu.:25.34 3rd Qu.:25.37 3rd Qu.:25.39
## Max. :29.75 Max. :29.98 Max. :30.20 Max. :30.42
##
## Y1991 Y1992 Y1993 Y1994
## Min. :19.40 Min. :19.45 Min. :19.51 Min. :19.59
## 1st Qu.:21.60 1st Qu.:21.65 1st Qu.:21.74 1st Qu.:21.76
## Median :24.20 Median :24.19 Median :24.27 Median :24.36
## Mean :23.76 Mean :23.82 Mean :23.88 Mean :23.94
## 3rd Qu.:25.42 3rd Qu.:25.48 3rd Qu.:25.54 3rd Qu.:25.62
## Max. :30.64 Max. :30.85 Max. :31.04 Max. :31.23
##
## Y1995 Y1996 Y1997 Y1998
## Min. :19.67 Min. :19.71 Min. :19.74 Min. :19.77
## 1st Qu.:21.83 1st Qu.:21.89 1st Qu.:21.94 1st Qu.:22.00
## Median :24.41 Median :24.42 Median :24.50 Median :24.49
## Mean :24.00 Mean :24.07 Mean :24.14 Mean :24.21
## 3rd Qu.:25.70 3rd Qu.:25.78 3rd Qu.:25.85 3rd Qu.:25.94
## Max. :31.41 Max. :31.59 Max. :31.77 Max. :31.95
##
## Y1999 Y2000 Y2001 Y2002
## Min. :19.80 Min. :19.83 Min. :19.86 Min. :19.84
## 1st Qu.:22.04 1st Qu.:22.12 1st Qu.:22.22 1st Qu.:22.29
## Median :24.61 Median :24.66 Median :24.73 Median :24.81
## Mean :24.29 Mean :24.36 Mean :24.44 Mean :24.52
## 3rd Qu.:26.01 3rd Qu.:26.09 3rd Qu.:26.19 3rd Qu.:26.30
## Max. :32.13 Max. :32.32 Max. :32.51 Max. :32.70
##
## Y2003 Y2004 Y2005 Y2006
## Min. :19.81 Min. :19.79 Min. :19.79 Min. :19.80
## 1st Qu.:22.37 1st Qu.:22.45 1st Qu.:22.54 1st Qu.:22.63
## Median :24.89 Median :25.00 Median :25.11 Median :25.24
## Mean :24.61 Mean :24.70 Mean :24.79 Mean :24.89
## 3rd Qu.:26.38 3rd Qu.:26.47 3rd Qu.:26.53 3rd Qu.:26.59
## Max. :32.90 Max. :33.10 Max. :33.30 Max. :33.49
##
## Y2007 Y2008
## Min. :19.83 Min. :19.87
## 1st Qu.:22.73 1st Qu.:22.83
## Median :25.36 Median :25.50
## Mean :24.99 Mean :25.10
## 3rd Qu.:26.66 3rd Qu.:26.82
## Max. :33.69 Max. :33.90
##
#Histogram of BMIs from 2008
hist(bmi$Y2008)
#Scatter plot comparing BMIs from 1980 to those from 2008
plot(bmi$Y1980, bmi$Y2008)
Principles of Tidy Data
Common Symptoms of Messy Data
Tidyr: Gather & Spread
gather(wide_df, my_key, my_val, -col): Gather columns into key-value pairs. Makes wide datasets long.
spread(long_df, my_key, my_val): Spread key-value pairs into columns. Makes long datasets wide.
library(tidyr)
bmi_long <- gather(bmi, year, bmi_val, -Country)
head(bmi_long)
## Country year bmi_val
## 1 Afghanistan Y1980 21.48678
## 2 Albania Y1980 25.22533
## 3 Algeria Y1980 22.25703
## 4 Andorra Y1980 25.66652
## 5 Angola Y1980 20.94876
## 6 Antigua and Barbuda Y1980 23.31424
bmi_wide <- spread(bmi_long, year, bmi_val)
head(bmi_wide)
## Country Y1980 Y1981 Y1982 Y1983 Y1984
## 1 Afghanistan 21.48678 21.46552 21.45145 21.43822 21.42734
## 2 Albania 25.22533 25.23981 25.25636 25.27176 25.27901
## 3 Algeria 22.25703 22.34745 22.43647 22.52105 22.60633
## 4 Andorra 25.66652 25.70868 25.74681 25.78250 25.81874
## 5 Angola 20.94876 20.94371 20.93754 20.93187 20.93569
## 6 Antigua and Barbuda 23.31424 23.39054 23.45883 23.53735 23.63584
## Y1985 Y1986 Y1987 Y1988 Y1989 Y1990 Y1991 Y1992
## 1 21.41222 21.40132 21.37679 21.34018 21.29845 21.24818 21.20269 21.14238
## 2 25.28669 25.29451 25.30217 25.30450 25.31944 25.32357 25.28452 25.23077
## 3 22.69501 22.76979 22.84096 22.90644 22.97931 23.04600 23.11333 23.18776
## 4 25.85236 25.89089 25.93414 25.98477 26.04450 26.10936 26.17912 26.24017
## 5 20.94857 20.96030 20.98025 21.01375 21.05269 21.09007 21.12136 21.14987
## 6 23.73109 23.83449 23.93649 24.05364 24.16347 24.26782 24.36568 24.45644
## Y1993 Y1994 Y1995 Y1996 Y1997 Y1998 Y1999 Y2000
## 1 21.06376 20.97987 20.91132 20.85155 20.81307 20.78591 20.75469 20.69521
## 2 25.21192 25.22115 25.25874 25.31097 25.33988 25.39116 25.46555 25.55835
## 3 23.25764 23.32273 23.39526 23.46811 23.54160 23.61592 23.69486 23.77659
## 4 26.30356 26.36793 26.43569 26.50769 26.58255 26.66337 26.75078 26.83179
## 5 21.13938 21.14186 21.16022 21.19076 21.22621 21.27082 21.31954 21.37480
## 6 24.54096 24.60945 24.66461 24.72544 24.78714 24.84936 24.91721 24.99158
## Y2001 Y2002 Y2003 Y2004 Y2005 Y2006 Y2007 Y2008
## 1 20.62643 20.59848 20.58706 20.57759 20.58084 20.58749 20.60246 20.62058
## 2 25.66701 25.77167 25.87274 25.98136 26.08939 26.20867 26.32753 26.44657
## 3 23.86256 23.95294 24.05243 24.15957 24.27001 24.38270 24.48846 24.59620
## 4 26.92373 27.02525 27.12481 27.23107 27.32827 27.43588 27.53363 27.63048
## 5 21.43664 21.51765 21.59924 21.69218 21.80564 21.93881 22.08962 22.25083
## 6 25.05857 25.13039 25.20713 25.29898 25.39965 25.51382 25.64247 25.76602
Compare bmi, bmi_long, and bmi_wide: notice that bmi_wide and bmi are indeed the same.
Tidyr: Separate & Unite
separate(data, col, into = c("col1", "col2"), sep = '*'): Separate one column into multiple.
unite(data, col, ...(bare names of columns), sep = '*'): Unite multiple columns into one.
Types Conversion
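as.class(): Change the class of an object (e.g. as.character(), as.numeric()).
lubridate: Parse dates and times with functions whose names combine the letters y, m, d, h, m, s, which stand for year, month, day, hour, minute, and second, respectively (e.g. ymd()).
library(lubridate)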
students <- read.csv('https://assets.datacamp.com/production/repositories/34/datasets/f75a87dbbdf2cf79e2286f97b2af22146cb717b1/students_with_dates.csv')
str(students$dob)
## Factor w/ 345 levels "1996-11-02","1996-11-03",..: 313 267 113 100 162 224 79 147 292 44 ...
str(students$Grades)
## Factor w/ 197 levels "10/0/0","10/10/0",..: 124 123 154 86 128 88 46 131 104 77 ...
#Turn the Grades factor into plain character strings
students[["Grades"]] <- as.character(students[["Grades"]])
#Parse the year-month-day values in dob into Date values with lubridate's ymd()
students[["dob"]] <- ymd(students[["dob"]])
str(students$dob)
## Date[1:395], format: "2000-06-05" "1999-11-25" "1998-02-02" "1997-12-20" "1998-10-04" ...
str(students$Grades)
## chr [1:395] "5/6/6" "5/5/6" "7/8/10" "15/14/15" "6/10/10" "15/15/15" ...
String Manipulation
stringr:
str_detect(input, str): Detect a pattern.
str_replace(input, replaced str, new str): Find & replace a pattern.
grepl(): base R function for detecting a pattern in a string.
sub(): base R function for replacing a pattern in a string.
base:
library(stringr)
str_trim(c(" Filip ", "Nick ", " Jonathan"))
## [1] "Filip" "Nick" "Jonathan"
str_pad(c("23485W", "8823453Q", "994Z"), width = 9, side = "left", pad = "0")
## [1] "00023485W" "08823453Q" "00000994Z"
head(students$dob)
## [1] "2000-06-05" "1999-11-25" "1998-02-02" "1997-12-20" "1998-10-04"
## [6] "1999-06-16"
head(str_detect(students$dob, '1997'))
## [1] FALSE FALSE FALSE TRUE FALSE FALSE
str(students$sex)
## Factor w/ 2 levels "F","M": 1 1 1 1 1 2 2 1 2 2 ...
#Recode the single-letter sex codes as full words.
#The patterns are anchored (^...$) so only values that are exactly 'F' or 'M'
#are rewritten. Unanchored patterns would also match the capital letter inside
#an already recoded "Female"/"Male" if this chunk is run a second time.
students$sex <- str_replace(students$sex, '^F$', 'Female')
students$sex <- str_replace(students$sex, '^M$', 'Male')
str(students$sex)
## chr [1:395] "Female" "Female" "Female" "Female" "Female" "Male" ...
Missing & Special Values
is.na(): Missing values are usually encoded as NA; this function will help you find them.
Outliers & Obvious Errors
A simple histogram or boxplot displaying the distribution of a variable’s values across all observations can be key to identifying potential outliers as early as possible.
hist(students$absences)
boxplot(students$absences)
So, let’s start with understanding the structure of your data.
weather <- readRDS('D:/Downloads/weather.rds')
#Verify that weather is a data.frame.
class(weather)
## [1] "data.frame"
#Check the dimensions.
dim(weather)
## [1] 286 35
#View the column names.
names(weather)
## [1] "X" "year" "month" "measure" "X1" "X2" "X3"
## [8] "X4" "X5" "X6" "X7" "X8" "X9" "X10"
## [15] "X11" "X12" "X13" "X14" "X15" "X16" "X17"
## [22] "X18" "X19" "X20" "X21" "X22" "X23" "X24"
## [29] "X25" "X26" "X27" "X28" "X29" "X30" "X31"
Then, we’re going to look at your data.
#View the structure of the data.
str(weather)
## 'data.frame': 286 obs. of 35 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ year : int 2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
## $ month : int 12 12 12 12 12 12 12 12 12 12 ...
## $ measure: chr "Max.TemperatureF" "Mean.TemperatureF" "Min.TemperatureF" "Max.Dew.PointF" ...
## $ X1 : chr "64" "52" "39" "46" ...
## $ X2 : chr "42" "38" "33" "40" ...
## $ X3 : chr "51" "44" "37" "49" ...
## $ X4 : chr "43" "37" "30" "24" ...
## $ X5 : chr "42" "34" "26" "37" ...
## $ X6 : chr "45" "42" "38" "45" ...
## $ X7 : chr "38" "30" "21" "36" ...
## $ X8 : chr "29" "24" "18" "28" ...
## $ X9 : chr "49" "39" "29" "49" ...
## $ X10 : chr "48" "43" "38" "45" ...
## $ X11 : chr "39" "36" "32" "37" ...
## $ X12 : chr "39" "35" "31" "28" ...
## $ X13 : chr "42" "37" "32" "28" ...
## $ X14 : chr "45" "39" "33" "29" ...
## $ X15 : chr "42" "37" "32" "33" ...
## $ X16 : chr "44" "40" "35" "42" ...
## $ X17 : chr "49" "45" "41" "46" ...
## $ X18 : chr "44" "40" "36" "34" ...
## $ X19 : chr "37" "33" "29" "25" ...
## $ X20 : chr "36" "32" "27" "30" ...
## $ X21 : chr "36" "33" "30" "30" ...
## $ X22 : chr "44" "39" "33" "39" ...
## $ X23 : chr "47" "45" "42" "45" ...
## $ X24 : chr "46" "44" "41" "46" ...
## $ X25 : chr "59" "52" "44" "58" ...
## $ X26 : chr "50" "44" "37" "31" ...
## $ X27 : chr "52" "45" "38" "34" ...
## $ X28 : chr "52" "46" "40" "42" ...
## $ X29 : chr "41" "36" "30" "26" ...
## $ X30 : chr "30" "26" "22" "10" ...
## $ X31 : chr "30" "25" "20" "8" ...
#Look at the structure using dplyr's glimpse().
library(dplyr)
glimpse(weather)
## Observations: 286
## Variables: 35
## $ X <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ year <int> 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, ...
## $ month <int> 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12...
## $ measure <chr> "Max.TemperatureF", "Mean.TemperatureF", "Min.Temperat...
## $ X1 <chr> "64", "52", "39", "46", "40", "26", "74", "63", "52", ...
## $ X2 <chr> "42", "38", "33", "40", "27", "17", "92", "72", "51", ...
## $ X3 <chr> "51", "44", "37", "49", "42", "24", "100", "79", "57",...
## $ X4 <chr> "43", "37", "30", "24", "21", "13", "69", "54", "39", ...
## $ X5 <chr> "42", "34", "26", "37", "25", "12", "85", "66", "47", ...
## $ X6 <chr> "45", "42", "38", "45", "40", "36", "100", "93", "85",...
## $ X7 <chr> "38", "30", "21", "36", "20", "-3", "92", "61", "29", ...
## $ X8 <chr> "29", "24", "18", "28", "16", "3", "92", "70", "47", "...
## $ X9 <chr> "49", "39", "29", "49", "41", "28", "100", "93", "86",...
## $ X10 <chr> "48", "43", "38", "45", "39", "37", "100", "95", "89",...
## $ X11 <chr> "39", "36", "32", "37", "31", "27", "92", "87", "82", ...
## $ X12 <chr> "39", "35", "31", "28", "27", "25", "85", "75", "64", ...
## $ X13 <chr> "42", "37", "32", "28", "26", "24", "75", "65", "55", ...
## $ X14 <chr> "45", "39", "33", "29", "27", "25", "82", "68", "53", ...
## $ X15 <chr> "42", "37", "32", "33", "29", "27", "89", "75", "60", ...
## $ X16 <chr> "44", "40", "35", "42", "36", "30", "96", "85", "73", ...
## $ X17 <chr> "49", "45", "41", "46", "41", "32", "100", "85", "70",...
## $ X18 <chr> "44", "40", "36", "34", "30", "26", "89", "73", "57", ...
## $ X19 <chr> "37", "33", "29", "25", "22", "20", "69", "63", "56", ...
## $ X20 <chr> "36", "32", "27", "30", "24", "20", "89", "79", "69", ...
## $ X21 <chr> "36", "33", "30", "30", "27", "25", "85", "77", "69", ...
## $ X22 <chr> "44", "39", "33", "39", "34", "25", "89", "79", "69", ...
## $ X23 <chr> "47", "45", "42", "45", "42", "37", "100", "91", "82",...
## $ X24 <chr> "46", "44", "41", "46", "44", "41", "100", "98", "96",...
## $ X25 <chr> "59", "52", "44", "58", "43", "29", "100", "75", "49",...
## $ X26 <chr> "50", "44", "37", "31", "29", "28", "70", "60", "49", ...
## $ X27 <chr> "52", "45", "38", "34", "31", "29", "70", "60", "50", ...
## $ X28 <chr> "52", "46", "40", "42", "35", "27", "76", "65", "53", ...
## $ X29 <chr> "41", "36", "30", "26", "20", "10", "64", "51", "37", ...
## $ X30 <chr> "30", "26", "22", "10", "4", "-6", "50", "38", "26", "...
## $ X31 <chr> "30", "25", "20", "8", "5", "1", "57", "44", "31", "30...
#View a summary of the data.
summary(weather)
## X year month measure
## Min. : 1.00 Min. :2014 Min. : 1.000 Length:286
## 1st Qu.: 72.25 1st Qu.:2015 1st Qu.: 4.000 Class :character
## Median :143.50 Median :2015 Median : 7.000 Mode :character
## Mean :143.50 Mean :2015 Mean : 6.923
## 3rd Qu.:214.75 3rd Qu.:2015 3rd Qu.:10.000
## Max. :286.00 Max. :2015 Max. :12.000
## X1 X2 X3
## Length:286 Length:286 Length:286
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## X4 X5 X6
## Length:286 Length:286 Length:286
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## X7 X8 X9
## Length:286 Length:286 Length:286
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## X10 X11 X12
## Length:286 Length:286 Length:286
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## X13 X14 X15
## Length:286 Length:286 Length:286
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## X16 X17 X18
## Length:286 Length:286 Length:286
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## X19 X20 X21
## Length:286 Length:286 Length:286
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## X22 X23 X24
## Length:286 Length:286 Length:286
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## X25 X26 X27
## Length:286 Length:286 Length:286
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## X28 X29 X30
## Length:286 Length:286 Length:286
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## X31
## Length:286
## Class :character
## Mode :character
##
##
##
#Take a closer look at your data.
head(weather)
## X year month measure X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## 1 1 2014 12 Max.TemperatureF 64 42 51 43 42 45 38 29 49 48 39 39
## 2 2 2014 12 Mean.TemperatureF 52 38 44 37 34 42 30 24 39 43 36 35
## 3 3 2014 12 Min.TemperatureF 39 33 37 30 26 38 21 18 29 38 32 31
## 4 4 2014 12 Max.Dew.PointF 46 40 49 24 37 45 36 28 49 45 37 28
## 5 5 2014 12 MeanDew.PointF 40 27 42 21 25 40 20 16 41 39 31 27
## 6 6 2014 12 Min.DewpointF 26 17 24 13 12 36 -3 3 28 37 27 25
## X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30
## 1 42 45 42 44 49 44 37 36 36 44 47 46 59 50 52 52 41 30
## 2 37 39 37 40 45 40 33 32 33 39 45 44 52 44 45 46 36 26
## 3 32 33 32 35 41 36 29 27 30 33 42 41 44 37 38 40 30 22
## 4 28 29 33 42 46 34 25 30 30 39 45 46 58 31 34 42 26 10
## 5 26 27 29 36 41 30 22 24 27 34 42 44 43 29 31 35 20 4
## 6 24 25 27 30 32 26 20 20 25 25 37 41 29 28 29 27 10 -6
## X31
## 1 30
## 2 25
## 3 20
## 4 8
## 5 5
## 6 1
Well done! Not surprisingly, this dataset is pretty messy…for now :)
It’s time to tidy it up!
# Gather the day columns X1 to X31 into key-value pairs (day, value)
library(tidyr)
# na.rm decides whether rows whose value is NA are dropped
weather2 <- gather(weather, key = day, value = value, X1:X31, na.rm = TRUE)
head(weather2)
## X year month measure day value
## 1 1 2014 12 Max.TemperatureF X1 64
## 2 2 2014 12 Mean.TemperatureF X1 52
## 3 3 2014 12 Min.TemperatureF X1 39
## 4 4 2014 12 Max.Dew.PointF X1 46
## 5 5 2014 12 MeanDew.PointF X1 40
## 6 6 2014 12 Min.DewpointF X1 26
#Drop the first column (X), which only repeats the row numbers
weather2 <- weather2[-1]
#Spread the measure/value pairs so that every measure becomes its own column
weather3 <- spread(weather2, key = measure, value = value)
#View the head
head(weather3)
## year month day CloudCover Events Max.Dew.PointF Max.Gust.SpeedMPH
## 1 2014 12 X1 6 Rain 46 29
## 2 2014 12 X10 8 Rain 45 29
## 3 2014 12 X11 8 Rain-Snow 37 28
## 4 2014 12 X12 7 Snow 28 21
## 5 2014 12 X13 5 28 23
## 6 2014 12 X14 4 29 20
## Max.Humidity Max.Sea.Level.PressureIn Max.TemperatureF
## 1 74 30.45 64
## 2 100 29.58 48
## 3 92 29.81 39
## 4 85 29.88 39
## 5 75 29.86 42
## 6 82 29.91 45
## Max.VisibilityMiles Max.Wind.SpeedMPH Mean.Humidity
## 1 10 22 63
## 2 10 23 95
## 3 10 21 87
## 4 10 16 75
## 5 10 17 65
## 6 10 15 68
## Mean.Sea.Level.PressureIn Mean.TemperatureF Mean.VisibilityMiles
## 1 30.13 52 10
## 2 29.5 43 3
## 3 29.61 36 7
## 4 29.85 35 10
## 5 29.82 37 10
## 6 29.83 39 10
## Mean.Wind.SpeedMPH MeanDew.PointF Min.DewpointF Min.Humidity
## 1 13 40 26 52
## 2 13 39 37 89
## 3 13 31 27 82
## 4 11 27 25 64
## 5 12 26 24 55
## 6 10 27 25 53
## Min.Sea.Level.PressureIn Min.TemperatureF Min.VisibilityMiles
## 1 30.01 39 10
## 2 29.43 38 1
## 3 29.44 32 1
## 4 29.81 31 7
## 5 29.78 32 10
## 6 29.78 33 10
## PrecipitationIn WindDirDegrees
## 1 0.01 268
## 2 0.28 357
## 3 0.02 230
## 4 T 286
## 5 T 298
## 6 0.00 306
#Strip the 'X' prefix from the day labels (e.g. "X10" becomes "10")
weather3$day <- sub('X', '', weather3$day)
#Combine year, month, and day into one "year-month-day" column named date
weather4 <- unite(weather3, col = date, year, month, day, sep = "-")
#Parse the combined column into Date values using lubridate's ymd()
weather4$date <- ymd(weather4$date)
#Move date and Events to the front using dplyr's select()
weather5 <- select(weather4, date, Events, CloudCover:WindDirDegrees)
#View the head of weather5
head(weather5)
## date Events CloudCover Max.Dew.PointF Max.Gust.SpeedMPH
## 1 2014-12-01 Rain 6 46 29
## 2 2014-12-10 Rain 8 45 29
## 3 2014-12-11 Rain-Snow 8 37 28
## 4 2014-12-12 Snow 7 28 21
## 5 2014-12-13 5 28 23
## 6 2014-12-14 4 29 20
## Max.Humidity Max.Sea.Level.PressureIn Max.TemperatureF
## 1 74 30.45 64
## 2 100 29.58 48
## 3 92 29.81 39
## 4 85 29.88 39
## 5 75 29.86 42
## 6 82 29.91 45
## Max.VisibilityMiles Max.Wind.SpeedMPH Mean.Humidity
## 1 10 22 63
## 2 10 23 95
## 3 10 21 87
## 4 10 16 75
## 5 10 17 65
## 6 10 15 68
## Mean.Sea.Level.PressureIn Mean.TemperatureF Mean.VisibilityMiles
## 1 30.13 52 10
## 2 29.5 43 3
## 3 29.61 36 7
## 4 29.85 35 10
## 5 29.82 37 10
## 6 29.83 39 10
## Mean.Wind.SpeedMPH MeanDew.PointF Min.DewpointF Min.Humidity
## 1 13 40 26 52
## 2 13 39 37 89
## 3 13 31 27 82
## 4 11 27 25 64
## 5 12 26 24 55
## 6 10 27 25 53
## Min.Sea.Level.PressureIn Min.TemperatureF Min.VisibilityMiles
## 1 30.01 39 10
## 2 29.43 38 1
## 3 29.44 32 1
## 4 29.81 31 7
## 5 29.78 32 10
## 6 29.78 33 10
## PrecipitationIn WindDirDegrees
## 1 0.01 268
## 2 0.28 357
## 3 0.02 230
## 4 T 286
## 5 T 298
## 6 0.00 306
And now, it’s time for strange & missing values.
#Search for strange values in PrecipitationIn
## View the structure of weather5
str(weather5)
## 'data.frame': 366 obs. of 23 variables:
## $ date : Date, format: "2014-12-01" "2014-12-10" ...
## $ Events : chr "Rain" "Rain" "Rain-Snow" "Snow" ...
## $ CloudCover : chr "6" "8" "8" "7" ...
## $ Max.Dew.PointF : chr "46" "45" "37" "28" ...
## $ Max.Gust.SpeedMPH : chr "29" "29" "28" "21" ...
## $ Max.Humidity : chr "74" "100" "92" "85" ...
## $ Max.Sea.Level.PressureIn : chr "30.45" "29.58" "29.81" "29.88" ...
## $ Max.TemperatureF : chr "64" "48" "39" "39" ...
## $ Max.VisibilityMiles : chr "10" "10" "10" "10" ...
## $ Max.Wind.SpeedMPH : chr "22" "23" "21" "16" ...
## $ Mean.Humidity : chr "63" "95" "87" "75" ...
## $ Mean.Sea.Level.PressureIn: chr "30.13" "29.5" "29.61" "29.85" ...
## $ Mean.TemperatureF : chr "52" "43" "36" "35" ...
## $ Mean.VisibilityMiles : chr "10" "3" "7" "10" ...
## $ Mean.Wind.SpeedMPH : chr "13" "13" "13" "11" ...
## $ MeanDew.PointF : chr "40" "39" "31" "27" ...
## $ Min.DewpointF : chr "26" "37" "27" "25" ...
## $ Min.Humidity : chr "52" "89" "82" "64" ...
## $ Min.Sea.Level.PressureIn : chr "30.01" "29.43" "29.44" "29.81" ...
## $ Min.TemperatureF : chr "39" "38" "32" "31" ...
## $ Min.VisibilityMiles : chr "10" "1" "1" "7" ...
## $ PrecipitationIn : chr "0.01" "0.28" "0.02" "T" ...
## $ WindDirDegrees : chr "268" "357" "230" "286" ...
#Examine the first 20 rows of weather5. Are most of the characters numeric?
head(weather5, 20)
## date Events CloudCover Max.Dew.PointF Max.Gust.SpeedMPH
## 1 2014-12-01 Rain 6 46 29
## 2 2014-12-10 Rain 8 45 29
## 3 2014-12-11 Rain-Snow 8 37 28
## 4 2014-12-12 Snow 7 28 21
## 5 2014-12-13 5 28 23
## 6 2014-12-14 4 29 20
## 7 2014-12-15 2 33 21
## 8 2014-12-16 Rain 8 42 10
## 9 2014-12-17 Rain 8 46 26
## 10 2014-12-18 Rain 7 34 30
## 11 2014-12-19 4 25 23
## 12 2014-12-02 Rain-Snow 7 40 29
## 13 2014-12-20 Snow 6 30 26
## 14 2014-12-21 Snow 8 30 20
## 15 2014-12-22 Rain 7 39 22
## 16 2014-12-23 Rain 8 45 25
## 17 2014-12-24 Fog-Rain 8 46 15
## 18 2014-12-25 Rain 6 58 40
## 19 2014-12-26 1 31 25
## 20 2014-12-27 3 34 21
## Max.Humidity Max.Sea.Level.PressureIn Max.TemperatureF
## 1 74 30.45 64
## 2 100 29.58 48
## 3 92 29.81 39
## 4 85 29.88 39
## 5 75 29.86 42
## 6 82 29.91 45
## 7 89 30.15 42
## 8 96 30.17 44
## 9 100 29.91 49
## 10 89 29.87 44
## 11 69 30.15 37
## 12 92 30.71 42
## 13 89 30.31 36
## 14 85 30.37 36
## 15 89 30.4 44
## 16 100 30.31 47
## 17 100 30.13 46
## 18 100 29.96 59
## 19 70 30.16 50
## 20 70 30.22 52
## Max.VisibilityMiles Max.Wind.SpeedMPH Mean.Humidity
## 1 10 22 63
## 2 10 23 95
## 3 10 21 87
## 4 10 16 75
## 5 10 17 65
## 6 10 15 68
## 7 10 15 75
## 8 10 8 85
## 9 10 20 85
## 10 10 23 73
## 11 10 17 63
## 12 10 24 72
## 13 10 21 79
## 14 10 16 77
## 15 10 18 79
## 16 10 20 91
## 17 2 13 98
## 18 10 28 75
## 19 10 18 60
## 20 10 17 60
## Mean.Sea.Level.PressureIn Mean.TemperatureF Mean.VisibilityMiles
## 1 30.13 52 10
## 2 29.5 43 3
## 3 29.61 36 7
## 4 29.85 35 10
## 5 29.82 37 10
## 6 29.83 39 10
## 7 30.05 37 10
## 8 30.09 40 9
## 9 29.75 45 6
## 10 29.78 40 10
## 11 29.98 33 10
## 12 30.59 38 8
## 13 30.26 32 10
## 14 30.32 33 9
## 15 30.35 39 10
## 16 30.23 45 5
## 17 29.9 44 1
## 18 29.63 52 8
## 19 30.11 44 10
## 20 30.14 45 10
## Mean.Wind.SpeedMPH MeanDew.PointF Min.DewpointF Min.Humidity
## 1 13 40 26 52
## 2 13 39 37 89
## 3 13 31 27 82
## 4 11 27 25 64
## 5 12 26 24 55
## 6 10 27 25 53
## 7 6 29 27 60
## 8 4 36 30 73
## 9 11 41 32 70
## 10 14 30 26 57
## 11 11 22 20 56
## 12 15 27 17 51
## 13 10 24 20 69
## 14 9 27 25 69
## 15 8 34 25 69
## 16 13 42 37 82
## 17 6 44 41 96
## 18 14 43 29 49
## 19 11 29 28 49
## 20 9 31 29 50
## Min.Sea.Level.PressureIn Min.TemperatureF Min.VisibilityMiles
## 1 30.01 39 10
## 2 29.43 38 1
## 3 29.44 32 1
## 4 29.81 31 7
## 5 29.78 32 10
## 6 29.78 33 10
## 7 29.91 32 10
## 8 29.92 35 5
## 9 29.69 41 1
## 10 29.71 36 10
## 11 29.86 29 10
## 12 30.4 33 2
## 13 30.17 27 7
## 14 30.28 30 6
## 15 30.3 33 4
## 16 30.16 42 1
## 17 29.55 41 0
## 18 29.47 44 1
## 19 29.99 37 10
## 20 30.03 38 10
## PrecipitationIn WindDirDegrees
## 1 0.01 268
## 2 0.28 357
## 3 0.02 230
## 4 T 286
## 5 T 298
## 6 0.00 306
## 7 0.00 324
## 8 T 79
## 9 0.43 311
## 10 0.01 281
## 11 0.00 305
## 12 0.10 62
## 13 T 350
## 14 T 2
## 15 0.05 24
## 16 0.25 63
## 17 0.56 12
## 18 0.14 250
## 19 0.00 255
## 20 0.00 251
#Replace the non-numeric marker 'T' in PrecipitationIn with '0' so that the
#column can be converted to numeric below
#NOTE(review): 'T' presumably stands for trace precipitation - confirm
weather5$PrecipitationIn <- str_replace(weather5$PrecipitationIn, 'T', '0')
#Or using sub(): 'weather5$PrecipitationIn <- sub('T', 0, weather5$PrecipitationIn)'
#Convert characters to numerics
#as.numeric is passed to mutate_at() directly: the funs() wrapper has been
#deprecated since dplyr 0.8.0, and a bare function gives the same result
weather6 <- mutate_at(weather5, vars(CloudCover:WindDirDegrees), as.numeric)
#check the dataset
str(weather6)
## 'data.frame': 366 obs. of 23 variables:
## $ date : Date, format: "2014-12-01" "2014-12-10" ...
## $ Events : chr "Rain" "Rain" "Rain-Snow" "Snow" ...
## $ CloudCover : num 6 8 8 7 5 4 2 8 8 7 ...
## $ Max.Dew.PointF : num 46 45 37 28 28 29 33 42 46 34 ...
## $ Max.Gust.SpeedMPH : num 29 29 28 21 23 20 21 10 26 30 ...
## $ Max.Humidity : num 74 100 92 85 75 82 89 96 100 89 ...
## $ Max.Sea.Level.PressureIn : num 30.4 29.6 29.8 29.9 29.9 ...
## $ Max.TemperatureF : num 64 48 39 39 42 45 42 44 49 44 ...
## $ Max.VisibilityMiles : num 10 10 10 10 10 10 10 10 10 10 ...
## $ Max.Wind.SpeedMPH : num 22 23 21 16 17 15 15 8 20 23 ...
## $ Mean.Humidity : num 63 95 87 75 65 68 75 85 85 73 ...
## $ Mean.Sea.Level.PressureIn: num 30.1 29.5 29.6 29.9 29.8 ...
## $ Mean.TemperatureF : num 52 43 36 35 37 39 37 40 45 40 ...
## $ Mean.VisibilityMiles : num 10 3 7 10 10 10 10 9 6 10 ...
## $ Mean.Wind.SpeedMPH : num 13 13 13 11 12 10 6 4 11 14 ...
## $ MeanDew.PointF : num 40 39 31 27 26 27 29 36 41 30 ...
## $ Min.DewpointF : num 26 37 27 25 24 25 27 30 32 26 ...
## $ Min.Humidity : num 52 89 82 64 55 53 60 73 70 57 ...
## $ Min.Sea.Level.PressureIn : num 30 29.4 29.4 29.8 29.8 ...
## $ Min.TemperatureF : num 39 38 32 31 32 33 32 35 41 36 ...
## $ Min.VisibilityMiles : num 10 1 1 7 10 10 10 5 1 10 ...
## $ PrecipitationIn : num 0.01 0.28 0.02 0 0 0 0 0 0.43 0.01 ...
## $ WindDirDegrees : num 268 357 230 286 298 306 324 79 311 281 ...
summary(weather6)
## date Events CloudCover Max.Dew.PointF
## Min. :2014-12-01 Length:366 Min. :0.000 Min. :-6.00
## 1st Qu.:2015-03-02 Class :character 1st Qu.:3.000 1st Qu.:32.00
## Median :2015-06-01 Mode :character Median :5.000 Median :47.50
## Mean :2015-06-01 Mean :4.708 Mean :45.48
## 3rd Qu.:2015-08-31 3rd Qu.:7.000 3rd Qu.:61.00
## Max. :2015-12-01 Max. :8.000 Max. :75.00
##
## Max.Gust.SpeedMPH Max.Humidity Max.Sea.Level.PressureIn
## Min. : 0.00 Min. : 39.00 Min. :29.58
## 1st Qu.:21.00 1st Qu.: 73.25 1st Qu.:30.00
## Median :25.50 Median : 86.00 Median :30.14
## Mean :26.99 Mean : 85.69 Mean :30.16
## 3rd Qu.:31.25 3rd Qu.: 93.00 3rd Qu.:30.31
## Max. :94.00 Max. :1000.00 Max. :30.88
## NA's :6
## Max.TemperatureF Max.VisibilityMiles Max.Wind.SpeedMPH Mean.Humidity
## Min. :18.00 Min. : 2.000 Min. : 8.00 Min. :28.00
## 1st Qu.:42.00 1st Qu.:10.000 1st Qu.:16.00 1st Qu.:56.00
## Median :60.00 Median :10.000 Median :20.00 Median :66.00
## Mean :58.93 Mean : 9.907 Mean :20.62 Mean :66.02
## 3rd Qu.:76.00 3rd Qu.:10.000 3rd Qu.:24.00 3rd Qu.:76.75
## Max. :96.00 Max. :10.000 Max. :38.00 Max. :98.00
##
## Mean.Sea.Level.PressureIn Mean.TemperatureF Mean.VisibilityMiles
## Min. :29.49 Min. : 8.00 Min. :-1.000
## 1st Qu.:29.87 1st Qu.:36.25 1st Qu.: 8.000
## Median :30.03 Median :53.50 Median :10.000
## Mean :30.04 Mean :51.40 Mean : 8.861
## 3rd Qu.:30.19 3rd Qu.:68.00 3rd Qu.:10.000
## Max. :30.77 Max. :84.00 Max. :10.000
##
## Mean.Wind.SpeedMPH MeanDew.PointF Min.DewpointF Min.Humidity
## Min. : 4.00 Min. :-11.00 Min. :-18.00 Min. :16.00
## 1st Qu.: 8.00 1st Qu.: 24.00 1st Qu.: 16.25 1st Qu.:35.00
## Median :10.00 Median : 41.00 Median : 35.00 Median :46.00
## Mean :10.68 Mean : 38.96 Mean : 32.25 Mean :48.31
## 3rd Qu.:13.00 3rd Qu.: 56.00 3rd Qu.: 51.00 3rd Qu.:60.00
## Max. :22.00 Max. : 71.00 Max. : 68.00 Max. :96.00
##
## Min.Sea.Level.PressureIn Min.TemperatureF Min.VisibilityMiles
## Min. :29.16 Min. :-3.00 Min. : 0.000
## 1st Qu.:29.76 1st Qu.:30.00 1st Qu.: 2.000
## Median :29.94 Median :46.00 Median :10.000
## Mean :29.93 Mean :43.33 Mean : 6.716
## 3rd Qu.:30.09 3rd Qu.:60.00 3rd Qu.:10.000
## Max. :30.64 Max. :74.00 Max. :10.000
##
## PrecipitationIn WindDirDegrees
## Min. :0.0000 Min. : 1.0
## 1st Qu.:0.0000 1st Qu.:113.0
## Median :0.0000 Median :222.0
## Mean :0.1016 Mean :200.1
## 3rd Qu.:0.0400 3rd Qu.:275.0
## Max. :2.9000 Max. :360.0
##
Looking at the summary, besides the strange values in PrecipitationIn, there are also problems in Max.Gust.SpeedMPH, Max.Humidity & Mean.VisibilityMiles.
#Count the missing values across the whole data frame
sum(is.na(weather6))
## [1] 6
#Find the row indices where Max.Gust.SpeedMPH is NA
ind <- which(is.na(weather6[["Max.Gust.SpeedMPH"]]))
#Show the complete records for the rows missing Max.Gust.SpeedMPH
weather6[ind, ]
## date Events CloudCover Max.Dew.PointF Max.Gust.SpeedMPH
## 161 2015-05-18 Fog 6 52 NA
## 205 2015-06-03 7 48 NA
## 273 2015-08-08 4 61 NA
## 275 2015-09-01 1 63 NA
## 308 2015-10-12 0 56 NA
## 358 2015-11-03 1 44 NA
## Max.Humidity Max.Sea.Level.PressureIn Max.TemperatureF
## 161 100 30.30 58
## 205 93 30.31 56
## 273 87 30.02 76
## 275 78 30.06 79
## 308 89 29.86 76
## 358 82 30.25 73
## Max.VisibilityMiles Max.Wind.SpeedMPH Mean.Humidity
## 161 10 16 79
## 205 10 14 82
## 273 10 14 68
## 275 10 15 65
## 308 10 15 65
## 358 10 16 57
## Mean.Sea.Level.PressureIn Mean.TemperatureF Mean.VisibilityMiles
## 161 30.23 54 8
## 205 30.24 52 10
## 273 29.99 69 10
## 275 30.02 74 10
## 308 29.80 64 10
## 358 30.13 60 10
## Mean.Wind.SpeedMPH MeanDew.PointF Min.DewpointF Min.Humidity
## 161 10 48 43 57
## 205 7 45 43 71
## 273 6 57 54 49
## 275 9 62 59 52
## 308 8 51 48 41
## 358 8 42 40 31
## Min.Sea.Level.PressureIn Min.TemperatureF Min.VisibilityMiles
## 161 30.12 49 0
## 205 30.19 47 10
## 273 29.95 61 10
## 275 29.96 69 10
## 308 29.74 51 10
## 358 30.06 47 10
## PrecipitationIn WindDirDegrees
## 161 0 72
## 205 0 90
## 273 0 45
## 275 0 54
## 308 0 199
## 358 0 281
#It's unclear why these values are missing and there doesn't appear to be any obvious pattern to their missingness, so we'll leave them alone for now.
#Locate rows whose Max.Humidity reading is 1000 or more
ind <- which(weather6[["Max.Humidity"]] >= 1000)
#Show the complete record for the flagged row(s)
weather6[ind, ]
## date Events CloudCover Max.Dew.PointF
## 135 2015-04-21 Fog-Rain-Thunderstorm 6 57
## Max.Gust.SpeedMPH Max.Humidity Max.Sea.Level.PressureIn
## 135 94 1000 29.75
## Max.TemperatureF Max.VisibilityMiles Max.Wind.SpeedMPH Mean.Humidity
## 135 65 10 20 71
## Mean.Sea.Level.PressureIn Mean.TemperatureF Mean.VisibilityMiles
## 135 29.6 56 5
## Mean.Wind.SpeedMPH MeanDew.PointF Min.DewpointF Min.Humidity
## 135 10 49 36 42
## Min.Sea.Level.PressureIn Min.TemperatureF Min.VisibilityMiles
## 135 29.53 46 0
## PrecipitationIn WindDirDegrees
## 135 0.54 184
#Overwrite Max.Humidity with 100 in the rows indexed by ind
weather6[["Max.Humidity"]][ind] <- 100
#Summarize the distribution of Mean.VisibilityMiles
summary(weather6[["Mean.VisibilityMiles"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.000 8.000 10.000 8.861 10.000 10.000
#Find the row(s) where Mean.VisibilityMiles equals -1
ind <- which(weather6[["Mean.VisibilityMiles"]] == -1)
#Show the complete record for the flagged row(s)
weather6[ind, ]
## date Events CloudCover Max.Dew.PointF Max.Gust.SpeedMPH
## 192 2015-06-18 5 54 23
## Max.Humidity Max.Sea.Level.PressureIn Max.TemperatureF
## 192 72 30.14 76
## Max.VisibilityMiles Max.Wind.SpeedMPH Mean.Humidity
## 192 10 17 59
## Mean.Sea.Level.PressureIn Mean.TemperatureF Mean.VisibilityMiles
## 192 30.04 67 -1
## Mean.Wind.SpeedMPH MeanDew.PointF Min.DewpointF Min.Humidity
## 192 10 49 45 46
## Min.Sea.Level.PressureIn Min.TemperatureF Min.VisibilityMiles
## 192 29.93 57 10
## PrecipitationIn WindDirDegrees
## 192 0 189
#Overwrite Mean.VisibilityMiles with 10 in the rows indexed by ind
weather6[["Mean.VisibilityMiles"]][ind] <- 10
Finally, let’s take one last look at our data.
#Replace empty strings in the Events column with "None"
#(uses `<-` for assignment, consistent with the other assignments in this file)
weather6$Events[weather6$Events == ""] <- "None"
#Print the first 6 rows of weather6
head(weather6)
## date Events CloudCover Max.Dew.PointF Max.Gust.SpeedMPH
## 1 2014-12-01 Rain 6 46 29
## 2 2014-12-10 Rain 8 45 29
## 3 2014-12-11 Rain-Snow 8 37 28
## 4 2014-12-12 Snow 7 28 21
## 5 2014-12-13 None 5 28 23
## 6 2014-12-14 None 4 29 20
## Max.Humidity Max.Sea.Level.PressureIn Max.TemperatureF
## 1 74 30.45 64
## 2 100 29.58 48
## 3 92 29.81 39
## 4 85 29.88 39
## 5 75 29.86 42
## 6 82 29.91 45
## Max.VisibilityMiles Max.Wind.SpeedMPH Mean.Humidity
## 1 10 22 63
## 2 10 23 95
## 3 10 21 87
## 4 10 16 75
## 5 10 17 65
## 6 10 15 68
## Mean.Sea.Level.PressureIn Mean.TemperatureF Mean.VisibilityMiles
## 1 30.13 52 10
## 2 29.50 43 3
## 3 29.61 36 7
## 4 29.85 35 10
## 5 29.82 37 10
## 6 29.83 39 10
## Mean.Wind.SpeedMPH MeanDew.PointF Min.DewpointF Min.Humidity
## 1 13 40 26 52
## 2 13 39 37 89
## 3 13 31 27 82
## 4 11 27 25 64
## 5 12 26 24 55
## 6 10 27 25 53
## Min.Sea.Level.PressureIn Min.TemperatureF Min.VisibilityMiles
## 1 30.01 39 10
## 2 29.43 38 1
## 3 29.44 32 1
## 4 29.81 31 7
## 5 29.78 32 10
## 6 29.78 33 10
## PrecipitationIn WindDirDegrees
## 1 0.01 268
## 2 0.28 357
## 3 0.02 230
## 4 0.00 286
## 5 0.00 298
## 6 0.00 306