3/19/2017
setwd('/path/where/your/data/located')
命令列介面:`>` or `+`;愛惜生命,常用 `tab` 和 `?`
程式碼編輯介面
All RStudio keyboard shortcuts
功能 | Windows & Linux | Mac |
---|---|---|
顯示快捷鍵 | Alt+Shift+K | Option+Shift+K |
自動補完 | Tab or Ctrl+Space | Tab or Command+Space |
執行(單行/選取範圍) | Ctrl+Enter | Command+Enter |
註解(單行/選取範圍) | Ctrl+Shift+C | Command+Shift+C |
存檔 | Ctrl+S | Command+S |
縮排 | Ctrl+I | Command+I |
1 + 1
## [1] 2
sin(2017)
## [1] 0.09736191
pi
## [1] 3.141593
# Kolmogorov-Smirnov test: compare the distributions of sepal and petal length.
# The iris measurements contain ties, so R warns that the p-value is approximate.
ks.test(iris$Sepal.Length, iris$Petal.Length)
## Warning in ks.test(iris$Sepal.Length, iris$Petal.Length): p-value will be ## approximate in the presence of ties
## ## Two-sample Kolmogorov-Smirnov test ## ## data: iris$Sepal.Length and iris$Petal.Length ## D = 0.56, p-value < 2.2e-16 ## alternative hypothesis: two-sided
# Overlay the two kernel density estimates. The x-axis is widened to the
# combined range of both variables so the second curve is not clipped.
plot(density(iris$Sepal.Length),
     xlim = range(c(iris$Sepal.Length, iris$Petal.Length)),
     main = "Sample PDF")
lines(density(iris$Petal.Length), col = 2)
# install.packages("binom")
library(binom)

# Binomial confidence intervals for 10/3000 and 30/50000, using the "exact"
# method only.
binom.confint(c(10, 30), c(3000, 50000), methods = "exact")
## method x n mean lower upper ## 1 exact 10 3000 0.003333333 0.0015995846 0.0061215478 ## 2 exact 30 50000 0.000600000 0.0004048529 0.0008564274
dplyr
c(1, 2, 3, 4) + 1
## [1] 2 3 4 5
c(1, 2, 3, 4) + c(2, 3, 4, 5)
## [1] 3 5 7 9
c(1, 2, 3, 4) + c(2, 10)
## [1] 3 12 5 14
To understand computations in R, two slogans are helpful:
Everything that exists is an object.
Everything that happens is a function call.
— John Chambers
`+`
## function (e1, e2) .Primitive("+")
`<-`
## .Primitive("<-")
`[`
## .Primitive("[")
`c`
## function (...) .Primitive("c")
function call
?1 + 1
## [1] 2
`+`(1, 1)
## [1] 2
Dataframe
就好
# NOTE(review): Pitching is presumably the Lahman package dataset — confirm
# that library(Lahman) and dplyr are loaded before this snippet.
# Keep one player's rows, most recent season first.
wang <- Pitching %>%
  filter(playerID == "wangch01") %>%
  arrange(desc(yearID))
wang
## playerID yearID stint teamID lgID W L G GS CG SHO SV IPouts H ER HR ## 1 wangch01 2013 1 TOR AL 1 2 6 6 0 0 0 81 40 23 5 ## 2 wangch01 2012 1 WAS NL 2 3 10 5 0 0 0 97 50 24 5 ## 3 wangch01 2011 1 WAS NL 4 3 11 11 0 0 0 187 67 28 8 ## 4 wangch01 2009 1 NYA AL 1 6 12 9 0 0 0 126 66 45 7 ## 5 wangch01 2008 1 NYA AL 8 2 15 15 1 0 0 285 90 43 4 ## 6 wangch01 2007 1 NYA AL 19 7 30 30 1 0 0 598 199 82 9 ## 7 wangch01 2006 1 NYA AL 19 6 34 33 2 1 1 654 233 88 12 ## 8 wangch01 2005 1 NYA AL 8 5 18 17 0 0 0 349 113 52 9 ## BB SO BAOpp ERA IBB WP HBP BK BFP GF R SH SF GIDP ## 1 9 14 0.351 7.67 0 2 0 0 123 0 24 0 0 NA ## 2 15 15 0.376 6.68 0 5 3 0 158 0 24 4 3 NA ## 3 13 25 0.272 4.04 0 2 1 0 264 0 35 2 2 NA ## 4 19 29 0.365 9.64 1 3 2 0 206 2 46 3 1 NA ## 5 35 54 0.249 4.07 1 0 3 0 402 0 44 0 3 NA ## 6 59 104 0.265 3.70 1 9 8 1 823 0 84 2 3 NA ## 7 52 76 0.277 3.63 4 6 2 1 900 1 92 3 2 NA ## 8 32 47 0.256 4.02 3 3 6 0 486 0 58 3 4 NA
懶人包
🎒👝👛👜💼tidyverse
# install.packages("tidyverse")
library(tidyverse)
## Loading tidyverse: ggplot2 ## Loading tidyverse: tibble ## Loading tidyverse: tidyr ## Loading tidyverse: readr ## Loading tidyverse: purrr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats ## lag(): dplyr, stats
readr::read_csv()
factor
(因子)來處理
data.table::fread()
略慢,不過語法較簡單
# With no col_types given, read_csv() guesses the column types and prints the
# specification it chose.
traffic <- read_csv("ggplot2-slides/traffic_eng.csv")
## Parsed with column specification: ## cols( ## time.year = col_integer(), ## time.month = col_integer(), ## time.day = col_integer(), ## time.hour = col_integer(), ## time.minute = col_integer(), ## event.level = col_character(), ## location.district = col_character(), ## location.address = col_character(), ## number.dead = col_integer(), ## number.injury = col_integer(), ## party.sn = col_integer(), ## vehicle.type = col_character(), ## party.gender = col_character(), ## party.age = col_integer(), ## party.injury = col_character(), ## location.weather = col_character(), ## location.speed.limit = col_integer(), ## location.road.type = col_character(), ## location.type = col_character() ## )
readr::read_csv()
head(traffic, n=3)
## # A tibble: 3 × 19 ## time.year time.month time.day time.hour time.minute event.level ## <int> <int> <int> <int> <int> <chr> ## 1 104 1 1 0 18 一般 ## 2 104 1 1 0 18 一般 ## 3 104 1 1 0 18 一般 ## # ... with 13 more variables: location.district <chr>, ## # location.address <chr>, number.dead <int>, number.injury <int>, ## # party.sn <int>, vehicle.type <chr>, party.gender <chr>, ## # party.age <int>, party.injury <chr>, location.weather <chr>, ## # location.speed.limit <int>, location.road.type <chr>, ## # location.type <chr>
readr::read_csv()
# Read the traffic data with an explicit column specification, so the column
# types are fixed up front rather than guessed from the file.
traffic <- read_csv(
  "ggplot2-slides/traffic_eng.csv",
  col_types = cols(
    time.year            = col_integer(),
    time.month           = col_integer(),
    time.day             = col_integer(),
    time.hour            = col_integer(),
    time.minute          = col_integer(),
    event.level          = col_character(),
    location.district    = col_character(),
    location.address     = col_character(),
    number.dead          = col_integer(),
    number.injury        = col_integer(),
    party.sn             = col_integer(),
    vehicle.type         = col_character(),
    party.gender         = col_character(),
    party.age            = col_integer(),
    party.injury         = col_character(),
    location.weather     = col_character(),
    location.speed.limit = col_integer(),
    location.road.type   = col_character(),
    location.type        = col_character()
  )
)
readr::read_csv()
head(traffic, n=3)
## # A tibble: 3 × 19 ## time.year time.month time.day time.hour time.minute event.level ## <int> <int> <int> <int> <int> <chr> ## 1 104 1 1 0 18 一般 ## 2 104 1 1 0 18 一般 ## 3 104 1 1 0 18 一般 ## # ... with 13 more variables: location.district <chr>, ## # location.address <chr>, number.dead <int>, number.injury <int>, ## # party.sn <int>, vehicle.type <chr>, party.gender <chr>, ## # party.age <int>, party.injury <chr>, location.weather <chr>, ## # location.speed.limit <int>, location.road.type <chr>, ## # location.type <chr>
dplyr::data_frame()
factor
(因子)來處理
data.table::data.table()
略慢,不過語法較簡單
# NOTE(review): data_frame() is deprecated in current dplyr/tibble in favour of
# tibble(); it is kept here to match the slide heading — confirm the target
# package version.
# The length-1 columns are recycled to the three rows given by time.minute.
traffic <- data_frame(
  time.year   = c(104),
  time.month  = c(1),
  time.day    = c(1),
  time.hour   = c(0),
  time.minute = c(0, 1, 2),
  event.level = c("一般")
)
head(traffic, n = 3)
## # A tibble: 3 × 6 ## time.year time.month time.day time.hour time.minute event.level ## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> ## 1 104 1 1 0 0 一般 ## 2 104 1 1 0 1 一般 ## 3 104 1 1 0 2 一般
magrittr
(pipe %>%
)串接
起來,簡化開發邏輯跟程式碼
# Plain nested-call form, shown for comparison with the pipe form.
head(traffic, n = 3)
## # A tibble: 3 × 6 ## time.year time.month time.day time.hour time.minute event.level ## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> ## 1 104 1 1 0 0 一般 ## 2 104 1 1 0 1 一般 ## 3 104 1 1 0 2 一般
magrittr
(pipe %>%
)串接
起來,簡化開發邏輯跟程式碼
# Pipe form: traffic becomes the first argument of head().
traffic %>% head(3)
## # A tibble: 3 × 6 ## time.year time.month time.day time.hour time.minute event.level ## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> ## 1 104 1 1 0 0 一般 ## 2 104 1 1 0 1 一般 ## 3 104 1 1 0 2 一般
dplyr::select()
wang %>% select(yearID, teamID, W, L, ERA)
## yearID teamID W L ERA ## 1 2013 TOR 1 2 7.67 ## 2 2012 WAS 2 3 6.68 ## 3 2011 WAS 4 3 4.04 ## 4 2009 NYA 1 6 9.64 ## 5 2008 NYA 8 2 4.07 ## 6 2007 NYA 19 7 3.70 ## 7 2006 NYA 19 6 3.63 ## 8 2005 NYA 8 5 4.02
dplyr::filter()
wang %>% filter(GS > 10)
## playerID yearID stint teamID lgID W L G GS CG SHO SV IPouts H ER HR ## 1 wangch01 2011 1 WAS NL 4 3 11 11 0 0 0 187 67 28 8 ## 2 wangch01 2008 1 NYA AL 8 2 15 15 1 0 0 285 90 43 4 ## 3 wangch01 2007 1 NYA AL 19 7 30 30 1 0 0 598 199 82 9 ## 4 wangch01 2006 1 NYA AL 19 6 34 33 2 1 1 654 233 88 12 ## 5 wangch01 2005 1 NYA AL 8 5 18 17 0 0 0 349 113 52 9 ## BB SO BAOpp ERA IBB WP HBP BK BFP GF R SH SF GIDP ## 1 13 25 0.272 4.04 0 2 1 0 264 0 35 2 2 NA ## 2 35 54 0.249 4.07 1 0 3 0 402 0 44 0 3 NA ## 3 59 104 0.265 3.70 1 9 8 1 823 0 84 2 3 NA ## 4 52 76 0.277 3.63 4 6 2 1 900 1 92 3 2 NA ## 5 32 47 0.256 4.02 3 3 6 0 486 0 58 3 4 NA
dplyr::summarise()
wang %>% group_by(lgID) %>% summarise(mean(ERA))
## # A tibble: 2 × 2 ## lgID `mean(ERA)` ## <fctr> <dbl> ## 1 AL 5.455 ## 2 NL 5.360
ggplot2
library(tidyverse)
# Load the traffic data from the working directory and preview the first rows.
traffic <- read_csv("traffic_eng.csv")
head(traffic, n = 3)
## # A tibble: 3 × 19 ## time.year time.month time.day time.hour time.minute event.level ## <int> <int> <int> <int> <int> <chr> ## 1 104 1 1 0 18 一般 ## 2 104 1 1 0 18 一般 ## 3 104 1 1 0 18 一般 ## # ... with 13 more variables: location.district <chr>, ## # location.address <chr>, number.dead <int>, number.injury <int>, ## # party.sn <int>, vehicle.type <chr>, party.gender <chr>, ## # party.age <int>, party.injury <chr>, location.weather <chr>, ## # location.speed.limit <int>, location.road.type <chr>, ## # location.type <chr>
traffic %>% select(time.month, time.day, time.hour, time.minute) %>% summary()
## time.month time.day time.hour time.minute ## Min. : 1.00 Min. : 1.00 Min. : 0.00 Min. : 0.0 ## 1st Qu.: 4.00 1st Qu.: 8.00 1st Qu.: 9.00 1st Qu.:13.0 ## Median : 7.00 Median :16.00 Median :14.00 Median :30.0 ## Mean : 6.79 Mean :15.67 Mean :13.65 Mean :27.6 ## 3rd Qu.:10.00 3rd Qu.:23.00 3rd Qu.:18.00 3rd Qu.:42.0 ## Max. :12.00 Max. :31.00 Max. :23.00 Max. :59.0
# Count rows per hour of day: one row per time.hour, with the count in column n.
pivot.time.hour <- traffic %>%
  group_by(time.hour) %>%
  tally()
# Alternative to tally(): count()
pivot.time.hour %>% head(5)
## # A tibble: 5 × 2 ## time.hour n ## <int> <int> ## 1 0 849 ## 2 1 513 ## 3 2 360 ## 4 3 282 ## 5 4 298
# Bar chart of the hourly counts. stat = "identity" uses the precomputed n
# as the bar height instead of counting rows again.
qq <- ggplot(data = pivot.time.hour)
qq <- qq + geom_bar(aes(x = time.hour, y = n), stat = "identity")
print(qq)
# Bar chart of the hourly counts. stat = "identity" uses the precomputed n
# as the bar height instead of counting rows again.
qq <- ggplot(data = pivot.time.hour)
qq <- qq + geom_bar(aes(x = time.hour, y = n), stat = "identity")
print(qq)
# Same hourly counts drawn as a line chart.
qq <- ggplot(data = pivot.time.hour)
qq <- qq + geom_line(aes(x = time.hour, y = n))
print(qq)
# Line chart with a point layer added on top to mark each hourly value.
qq <- ggplot(data = pivot.time.hour)
qq <- qq + geom_line(aes(x = time.hour, y = n))
qq <- qq + geom_point(aes(x = time.hour, y = n))
print(qq)
# Count rows per (hour, gender) pair. The printed result is still grouped by
# time.hour after tally().
pivot.hour.gender <- traffic %>%
  group_by(time.hour, party.gender) %>%
  tally()
# Alternative to tally(): count()
pivot.hour.gender %>% head(3)
## Source: local data frame [3 x 3] ## Groups: time.hour [1] ## ## time.hour party.gender n ## (int) (chr) (int) ## 1 0 女 195 ## 2 0 無或物(動物、堆置物) 25 ## 3 0 男 629
# One line per party.gender, distinguished by colour.
qq <- ggplot(data = pivot.hour.gender)
qq <- qq + geom_line(aes(x = time.hour, y = n, colour = party.gender))
print(qq)
# Set the font family for all plot text (presumably so the Chinese legend
# labels render — confirm the font is installed). The commented line is the
# alternative font family.
qq <- ggplot(data = pivot.hour.gender)
qq <- qq + geom_line(aes(x = time.hour, y = n, colour = party.gender))
# qq <- qq + theme(text=element_text(family = "Microsoft JhengHei",
qq <- qq + theme(text = element_text(family = "STHeiti",
                                     colour = "red"))
print(qq)
# Final version of the hourly-by-gender chart: font, horizontal y-axis title,
# legend at the bottom, titles, x-axis ticks every 3 hours, and a legend name.
# The commented line is the alternative font family.
qq <- ggplot(data = pivot.hour.gender)
qq <- qq + geom_line(aes(x = time.hour, y = n, colour = party.gender))
# qq <- qq + theme(text=element_text(family = "Microsoft JhengHei"),
qq <- qq + theme(text = element_text(family = "STHeiti"),
                 axis.title.y = element_text(angle = 0, vjust = 0.5),
                 legend.position = "bottom")
qq <- qq + labs(title = "台北市每時交通事故人數,按性別分", x = "時間", y = "人數")
qq <- qq + scale_x_continuous(breaks = seq(0, 24, 3))
qq <- qq + scale_colour_discrete(name = "當事人性別")
print(qq)
# Load the admission data and show the five rows with the largest n.
college.admission <- read_csv("college_admission.csv")
college.admission %>%
  arrange(desc(n)) %>%
  head(5)
## # A tibble: 5 × 6 ## year first.name first.pinyin n national.colleges top.5.colleges ## <int> <chr> <chr> <dbl> <dbl> <dbl> ## 1 1999 怡君 yijun 407.0708 126.86109 44.61049 ## 2 2000 怡君 yijun 398.6149 112.73957 36.23772 ## 3 2001 怡君 yijun 379.4943 109.54474 28.69029 ## 4 2002 雅婷 yating 379.2310 113.12653 28.28163 ## 5 2003 雅婷 yating 352.9057 90.54816 19.73486
QGIS
shapefile
地圖檔(可以在 QGIS
先處理完)
# install.packages(c("rgdal", "ggmap"))
# brew install gdal
# NOTE(review): rgdal was retired from CRAN in 2023; sf::st_read() is the usual
# replacement — confirm before running this on a current R installation.
library(rgdal)
library(ggplot2)
library(ggmap)

# Read the layer named by the second argument from the "shapefile" directory.
shapefile <- readOGR("shapefile", "Town_MOI_1041215_C_Name_臺北市")
## OGR data source with driver: ESRI Shapefile ## Source: "shapefile", layer: "Town_MOI_1041215_C_Name_臺北市" ## with 12 features ## It has 10 fields
ggplot2::fortify()
將讀進來的 shapefile
轉換成為 DataFrame
shapefile_df <- fortify(shapefile)
## Regions defined for each Polygons
shapefile_df %>% head()
## long lat order hole piece id group ## 1 121.5714 25.07429 1 FALSE 1 0 0.1 ## 2 121.5715 25.07424 2 FALSE 1 0 0.1 ## 3 121.5715 25.07398 3 FALSE 1 0 0.1 ## 4 121.5717 25.07361 4 FALSE 1 0 0.1 ## 5 121.5717 25.07344 5 FALSE 1 0 0.1 ## 6 121.5718 25.07321 6 FALSE 1 0 0.1
ggplot2::geom_path()
畫地圖
# Draw the outlines as paths. group = group keeps each polygon piece as its
# own path, so separate pieces are not joined by stray lines.
map <- ggplot() +
  geom_path(data = shapefile_df,
            aes(x = long, y = lat, group = group),
            color = 'gray', size = 1)
print(map)
# Add a map projection so the outline is not drawn on raw lon/lat axes.
# NOTE(review): coord_map() presumably needs the mapproj package installed —
# confirm.
map_projected <- map + coord_map()
print(map_projected)