September, 2017
# Attach rvest; when it is not installed yet, install it first and then attach.
if (!require(rvest)) {
  install.packages("rvest")
  library(rvest)
}
## Loading required package: rvest
## Loading required package: xml2
# Address of the channel page to scrape.
url_tvcast <- "http://tvcast.naver.com/jtbc.youth"

# Download the page and parse it as HTML, decoding the bytes as UTF-8.
html_tvcast <- read_html(x = url_tvcast, encoding = "UTF-8")
html_nodes
# html_nodes() stores what sits under the selected tags of the referenced web
# page; the selector below picks the <a> links inside elements of class "title".
html_tvcast %>%
  html_nodes(".title a") %>%
  head(n = 3)
## {xml_nodeset (3)} ## [1] <a href="http://tv.naver.com/v/1081631" onclick="clickcr(this, 'chh. ... ## [2] <a href="/v/1081631" onclick="clickcr(this, 'chh.vtit', '', '', ''); ... ## [3] <a href="/v/1081613" onclick="clickcr(this, 'chh.vtit', '', '', ''); ...
title
class 아래 쪽 하이퍼링크 태그 <a> 사이에 문자열을 모은다. html_text
# html_text() gathers the text string information held by the referenced nodes.
html_tvcast %>%
  html_nodes(".title a") %>%
  html_text() %>%
  head(n = 3)
## [1] "[워맨스 스페셜]나도 이런 연ㅇ..아니 우정 하고 싶다..." ## [2] "[워맨스 스페셜]나도 이런 연ㅇ..아니 우정 하고 싶다..." ## [3] "[메이킹] '똥차' 고두영 시원하게 때려잡는 하메들!"
왕은 사랑한다
# Exercise: 'http://tv.naver.com/mbc.kingloves' is the introduction page of the
# drama named just above; collect the per-episode title strings from it.
url_tvcast <- "http://tv.naver.com/mbc.kingloves"
html_tvcast <- read_html(x = url_tvcast, encoding = "UTF-8")

# Keep the pipe into data.frame(): it is what names the single column ".".
html_tvcast %>%
  html_nodes(".title a") %>%
  html_text() %>%
  data.frame() %>%
  head(n = 3)
## . ## 1 임시완, 윤아·홍종현에 대한 '애절한 그리움'으로 엔딩 ## 2 《메이킹》 '왕은 사랑한다' 178일간의 대장정, 그들의 이야기 ## 3 《메이킹》 임시완·임윤아·홍종현, '원산린' 우리 이제 안녕
class
가 wikitable
# (Continuing the sentence above: confirm that the table's class is "wikitable".)
url_wiki <- "https://en.wikipedia.org/wiki/Student%27s_t-distribution"

# The encoding name was misspelled "UFT-8"; the valid name is "UTF-8".
html_wiki <- read_html(x = url_wiki, encoding = "UTF-8")
# Select the nodes of class "wikitable", convert them with html_table(),
# combine the result into a data frame and show its first three rows.
html_nodes(html_wiki, ".wikitable") %>%
  html_table() %>%
  data.frame() %>%
  head(n = 3)
## One.sided X75. X80. X85. X90. X95. X97.5. X99. X99.5. X99.75. ## 1 Two-sided 50% 60% 70% 80% 90% 95% 98% 99% 99.5% ## 2 1 1.000 1.376 1.963 3.078 6.314 12.71 31.82 63.66 127.3 ## 3 2 0.816 1.080 1.386 1.886 2.920 4.303 6.965 9.925 14.09 ## X99.9. X99.95. ## 1 99.8% 99.9% ## 2 318.3 636.6 ## 3 22.33 31.60
# Attach rvest, installing it first when it is missing.
if (!require(rvest)) {
  install.packages("rvest")
  library(rvest)
}

# Download and parse the 2017 MLB league page.
url <- "http://www.baseball-reference.com/leagues/MLB/2017.shtml"
webpage <- read_html(url)
# Take the table inside the div with id "div_teams_standard_batting", convert
# it with html_table(), and show the first three rows as a data frame.
html_nodes(webpage, "div#div_teams_standard_batting table") %>%
  html_table() %>%
  data.frame() %>%
  head(n = 3)
## Tm X.Bat BatAge R.G G PA AB R H X2B X3B HR RBI SB CS BB ## 1 ARI 45 0.0 5.01 162 6224 5525 812 1405 314 39 220 776 103 30 578 ## 2 ATL 49 0.0 4.52 162 6216 5584 732 1467 289 26 165 706 77 31 474 ## 3 BAL 50 0.0 4.59 162 6140 5650 743 1469 269 12 232 713 32 13 392 ## SO BA OBP SLG OPS OPS. TB GDP HBP SH SF IBB LOB ## 1 1456 .254 .329 .445 .774 2457 106 54 39 27 44 1118 ## 2 1184 .263 .326 .412 .738 2303 137 66 59 32 57 1127 ## 3 1412 .260 .312 .435 .747 2458 138 50 10 37 12 1041
tm=2017.09.06.13
을 변경하여 URL 창에 붙여넣기
<table class="table_develop3" summary="기상실황표로 지점, 날씨, 기온, 강수, 바람, 기압등을 안내한 표입니다.">
# Current-weather observation page; the tm= query parameter carries the
# observation time.
url <- "http://www.weather.go.kr/weather/observation/currentweather.jsp?auto_man=m&type=t99&tm=2017.09.06.13%3A00&x=19&y=3"

# Parse the page, decoding it as EUC-KR.
webpage <- read_html(url, encoding = "EUC-KR")

# Switch the session locale before extracting the table.
# NOTE(review): "English" looks like a Windows-style locale name — confirm on
# other platforms.
Sys.setlocale("LC_ALL", "English")

# Print the matched table node.
webpage %>% html_nodes("table.table_develop3")

# Convert the table to a data frame; header = FALSE keeps the header row as
# data and fill = TRUE pads short rows.
tmp <- webpage %>%
  html_nodes("table.table_develop3") %>%
  html_table(header = FALSE, fill = TRUE) %>%
  data.frame()
head(tmp)
# Switch the session locale to Korean before repairing the text.
# NOTE(review): "Korean" looks like a Windows-style locale name — confirm on
# other platforms.
Sys.setlocale("LC_ALL", "Korean")

# Run rvest::repair_encoding() over every column of the scraped table.
# seq_len() makes the loop a no-op when tmp has no columns (1:0 would not be).
for (i in seq_len(ncol(tmp))) {
  tmp[, i] <- rvest::repair_encoding(tmp[, i])
}
head(tmp)
# Attach httr, installing it first when it is missing.
if (!require(httr)) {
  install.packages("httr")
  library(httr)
}

# First page of the review listing.
url <- "https://movie.naver.com/movie/point/af/list.nhn?&page=1"

# The argument is `encoding` (lower case); `Encoding` does not match it because
# R argument names are case-sensitive. "CP949" is the encoding the paging loops
# in this document use for the same listing.
mov_html <- read_html(GET(url), encoding = "CP949")

# Text of every node of class "title".
content <- html_nodes(mov_html, ".title") %>% html_text()

# Drop newlines, tabs, leftover tags and double quotes.
content <- gsub('\n|\t|<.*?>|"', "", content)

# Split each entry on carriage returns, discard empty pieces and the "신고"
# label, then bind the pieces row-wise into a data frame.
sub_con <- strsplit(content, "\r")
data.frame(do.call(rbind, lapply(sub_con, function(x) {
  x[x != "" & x != "신고"]
}))) %>%
  head(3)
# Scrape pages 1..10 of the review listing and stack the parsed rows into
# total_con. Pages are collected in a preallocated list and bound once at the
# end instead of growing a data frame with rbind() on every iteration.
n_pages <- 10
page_list <- vector("list", n_pages)
for (i in seq_len(n_pages)) {
  url <- paste0("https://movie.naver.com/movie/point/af/list.nhn?&page=", i)
  mov_html <- read_html(GET(url), encoding = "CP949")

  # Text of every node of class "title", with newlines, tabs, leftover tags
  # and double quotes removed.
  content <- html_nodes(mov_html, ".title") %>% html_text()
  content <- gsub('\n|\t|<.*?>|"', "", content)

  # Split on carriage returns, drop empty pieces and the "신고" label, and
  # bind the pieces row-wise.
  part_con <- data.frame(do.call(rbind, lapply(strsplit(content, "\r"), function(x) {
    x[x != "" & x != "신고"]
  })))
  page_list[[i]] <- part_con

  # Progress indicator.
  cat(i, "\n")
}
total_con <- do.call(rbind, page_list)
# Scrape pages 1..10 of the review listing, this time keeping the text of the
# ".point" nodes next to the title pieces, and stack everything into total_dat.
# Pages are collected in a preallocated list and bound once at the end instead
# of growing a data frame with rbind() on every iteration.
n_pages <- 10
page_list <- vector("list", n_pages)
for (i in seq_len(n_pages)) {
  url <- paste0("https://movie.naver.com/movie/point/af/list.nhn?&page=", i)
  mov_html <- read_html(GET(url), encoding = "CP949")

  # Text of every node of class "title", with newlines, tabs, leftover tags
  # and double quotes removed.
  content <- html_nodes(mov_html, ".title") %>% html_text()
  content <- gsub('\n|\t|<.*?>|"', "", content)

  # Text of every node of class "point".
  point <- html_nodes(mov_html, ".point") %>% html_text()

  # Split on carriage returns, drop empty pieces and the "신고" label, bind
  # the pieces row-wise and attach the point column.
  part_con <- data.frame(
    do.call(rbind, lapply(strsplit(content, "\r"), function(x) {
      x[x != "" & x != "신고"]
    })),
    point = point
  )
  page_list[[i]] <- part_con

  # Progress indicator.
  cat(i, "\n")
}
total_dat <- do.call(rbind, page_list)
# Credentials for the Naver open API; the '???' placeholders are to be
# replaced with real values.
client_id <- "???"
client_secret <- "???"

# Request headers carrying the two credentials.
header <- httr::add_headers(
  "X-Naver-Client-Id" = client_id,
  "X-Naver-Client-Secret" = client_secret
)
?query= 부분을 확인한다. query, display, start, sort에 대해서는 아래와 같이 URL을 작성하면 될 것이다.
iconv
# Using the iconv function: percent-encode the search term for use in a URL.
query <- "새우깡"

# Change the encoding: convert to UTF-8 and ask for the raw bytes.
# (iconv(query, to = "UTF-8", toRaw = FALSE) is the variant without raw bytes.)
query <- iconv(query, to = "UTF-8", toRaw = TRUE)

# Join the bytes as %xx%xx..., then upper-case the hex digits.
query <- paste0("%", paste(unlist(query), collapse = "%"))
query <- toupper(query)
# Attach httr, installing it first when it is missing.
if (!require(httr)) {
  install.packages("httr")
  library(httr)
}

# Paging setup: start offsets 1, 101, ... up to end_num in steps of
# display_num.
end_num <- 1000
display_num <- 100
start_point <- seq(1, end_num, display_num)

# Request the first page only.
i <- 1
url <- paste0(
  "https://openapi.naver.com/v1/search/blog.xml?query=", query,
  "&display=", display_num,
  "&start=", start_point[i],
  "&sort=sim"
)

# Send the request with the credential headers and parse the XML response.
url_body <- read_xml(GET(url, header))
# Pull each field of the search result out of the parsed XML.
# html_nodes() replaces xml_nodes(), a deprecated rvest alias for it; the
# selectors are unchanged.
title <- url_body %>%
  html_nodes("item title") %>%
  xml_text()
bloggername <- url_body %>%
  html_nodes("item bloggername") %>%
  xml_text()
postdate <- url_body %>%
  html_nodes("postdate") %>%
  xml_text()
link <- url_body %>%
  html_nodes("item link") %>%
  xml_text()
description <- url_body %>%
  html_nodes("item description") %>%
  html_text()
# Fetch every result page listed in start_point and stack the extracted fields
# into final_dat. Pages are collected in a preallocated list and bound once at
# the end; seq_along() keeps the loop empty when start_point is empty.
page_list <- vector("list", length(start_point))
for (i in seq_along(start_point)) {
  # Request in XML format.
  url <- paste0(
    "https://openapi.naver.com/v1/search/blog.xml?query=", query,
    "&display=", display_num,
    "&start=", start_point[i],
    "&sort=sim"
  )

  # The credential headers are passed as a request option.
  url_body <- read_xml(GET(url, header), encoding = "UTF-8")

  # html_nodes() replaces xml_nodes(), a deprecated rvest alias for it.
  title <- url_body %>%
    html_nodes("item title") %>%
    xml_text()
  bloggername <- url_body %>%
    html_nodes("item bloggername") %>%
    xml_text()
  postdate <- url_body %>%
    html_nodes("postdate") %>%
    xml_text()
  link <- url_body %>%
    html_nodes("item link") %>%
    xml_text()
  description <- url_body %>%
    html_nodes("item description") %>%
    html_text()

  temp_dat <- cbind(title, bloggername, postdate, link, description)
  page_list[[i]] <- temp_dat

  # Progress indicator.
  cat(i, "\n")
}

final_dat <- do.call(rbind, page_list)
final_dat <- data.frame(final_dat, stringsAsFactors = FALSE)
head(final_dat)
# Pieces of the Seoul open-API request; the "???" key is a placeholder to be
# replaced with a real one.
api_key <- "???"
service <- "CardSubwayTime"
start <- 1
end <- 40
query <- iconv("2호선", to = "UTF-8")
date <- 201603

# Assemble the request URL from the pieces; the response format segment is
# fixed to "xml".
url <- paste0(
  "http://openapi.seoul.go.kr:8088/", api_key,
  "/xml/", service,
  "/", start,
  "/", end,
  "/", date,
  "/", query
)
# Attach XML, installing it first when it is missing.
if (!require(XML)) {
  install.packages("XML")
  library(XML)
}

# Fetch and parse the API response, then take the document's root node.
raw.data <- xmlTreeParse(url, useInternalNodes = TRUE, encoding = "UTF-8")
rootNode <- xmlRoot(raw.data)

# Every child of the root from the third one on becomes one record.
# NOTE(review): the first two children are presumably response metadata —
# confirm against the API response.
# Negative indexing yields an empty sequence when there are fewer than three
# children, where 3:length(...) would count backwards.
row_idx <- seq_along(names(rootNode))[-(1:2)]
list_dat <- vector("list", length(row_idx))
for (i in seq_along(row_idx)) {
  list_dat[[i]] <- xmlSApply(rootNode[[row_idx[i]]], xmlValue)
}
total_set <- data.frame(do.call(rbind, list_dat), stringsAsFactors = FALSE)

# Columns from the fourth on are converted to numeric; the first three stay
# character. The guard avoids the backwards 4:ncol() sequence on narrow input.
if (ncol(total_set) >= 4) {
  for (i in 4:ncol(total_set)) {
    total_set[, i] <- as.numeric(total_set[, i])
  }
}

head(total_set)
## USE_MON LINE_NUM SUB_STA_NM FOUR_RIDE_NUM FOUR_ALIGHT_NUM ## 1 201603 2호선 시청 45 2 ## 2 201603 2호선 을지로입구 170 10 ## 3 201603 2호선 을지로3가 6 1 ## 4 201603 2호선 을지로4가 7 1 ## 5 201603 2호선 동대문역사문화공원 298 6 ## 6 201603 2호선 신당 32 0 ## FIVE_RIDE_NUM FIVE_ALIGHT_NUM SIX_RIDE_NUM SIX_ALIGHT_NUM SEVEN_RIDE_NUM ## 1 1625 2147 3071 23933 6447 ## 2 3947 3225 4121 31555 9676 ## 3 1210 1676 2989 20549 5710 ## 4 707 793 1712 9063 3368 ## 5 11498 1098 7824 11483 9914 ## 6 7417 1520 13119 14420 32984 ## SEVEN_ALIGHT_NUM EIGHT_RIDE_NUM EIGHT_ALIGHT_NUM NINE_RIDE_NUM ## 1 117316 10418 228778 13803 ## 2 156210 18777 366246 25769 ## 3 70654 11834 164763 16181 ## 4 26746 6042 90334 10725 ## 5 36806 15046 74621 16107 ## 6 22267 44810 55590 28475 ## NINE_ALIGHT_NUM TEN_RIDE_NUM TEN_ALIGHT_NUM ELEVEN_RIDE_NUM ## 1 87635 18883 34537 25241 ## 2 172126 34208 96346 44029 ## 3 78132 18772 38927 22216 ## 4 53611 15083 29855 16984 ## 5 45056 17431 41366 22103 ## 6 31332 20998 23822 22102 ## ELEVEN_ALIGHT_NUM TWELVE_RIDE_NUM TWELVE_ALIGHT_NUM THIRTEEN_RIDE_NUM ## 1 33180 26032 31111 33603 ## 2 102424 50409 82347 67344 ## 3 31461 24372 26366 28559 ## 4 27270 19969 25650 23019 ## 5 38832 24771 39766 29632 ## 6 23434 23879 25203 26299 ## THIRTEEN_ALIGHT_NUM FOURTEEN_RIDE_NUM FOURTEEN_ALIGHT_NUM ## 1 36488 38652 33337 ## 2 85996 77650 76381 ## 3 28921 33154 26826 ## 4 24607 27486 25270 ## 5 43317 35721 43036 ## 6 27528 27316 28023 ## FIFTEEN_RIDE_NUM FIFTEEN_ALIGHT_NUM SIXTEEN_RIDE_NUM SIXTEEN_ALIGHT_NUM ## 1 45321 29169 49778 26542 ## 2 93403 76436 102669 73118 ## 3 35314 23028 39686 22413 ## 4 29904 21492 32574 18962 ## 5 46038 39786 52843 36949 ## 6 27884 28585 32731 31169 ## SEVENTEEN_RIDE_NUM SEVENTEEN_ALIGHT_NUM EIGHTEEN_RIDE_NUM ## 1 77646 27006 175561 ## 2 132600 76404 283613 ## 3 56709 22739 123603 ## 4 41986 15884 76223 ## 5 57605 38981 68661 ## 6 37732 36113 46717 ## EIGHTEEN_ALIGHT_NUM NINETEEN_RIDE_NUM NINETEEN_ALIGHT_NUM ## 1 29218 101820 18696 ## 2 87578 200094 61555 ## 3 24452 78249 16792 
## 4 14412 52441 9099 ## 5 43661 52560 55806 ## 6 44799 32993 38999 ## TWENTY_RIDE_NUM TWENTY_ALIGHT_NUM TWENTY_ONE_RIDE_NUM ## 1 64862 9503 64315 ## 2 174840 31900 152765 ## 3 45841 8497 43484 ## 4 22870 5215 17018 ## 5 42045 38291 40882 ## 6 23295 25147 21359 ## TWENTY_ONE_ALIGHT_NUM TWENTY_TWO_RIDE_NUM TWENTY_TWO_ALIGHT_NUM ## 1 8068 41971 7575 ## 2 24551 111340 20863 ## 3 7524 33183 6769 ## 4 5122 12406 5030 ## 5 29326 37792 23015 ## 6 23058 16500 23682 ## TWENTY_THREE_RIDE_NUM TWENTY_THREE_ALIGHT_NUM MIDNIGHT_RIDE_NUM ## 1 17526 5271 2486 ## 2 50891 12548 6165 ## 3 15776 5232 2662 ## 4 5710 3702 1061 ## 5 24098 16951 4542 ## 6 8704 14686 1217 ## MIDNIGHT_ALIGHT_NUM ONE_RIDE_NUM ONE_ALIGHT_NUM TWO_RIDE_NUM ## 1 1760 2 76 0 ## 2 4700 18 1145 5 ## 3 1552 3 65 0 ## 4 1057 2 47 0 ## 5 5903 8 161 0 ## 6 4914 1 48 0 ## TWO_ALIGHT_NUM THREE_RIDE_NUM THREE_ALIGHT_NUM WORK_DT ## 1 0 0 0 20160408 ## 2 5 7 8 20160408 ## 3 0 0 2 20160408 ## 4 0 0 0 20160408 ## 5 0 0 0 20160408 ## 6 0 0 0 20160408