# Download all thread HTML pages
# Build the list of monthly "thread" pages of the R-devel pipermail archive
# and cache each page as ./tmp/<year>-<month>.html.
archive.url = "https://stat.ethz.ch/pipermail/r-devel/"
index.lines = readLines(archive.url)

# Keep only the lines that mention a thread page (literal match, so the dot in
# "thread.html" is not treated as a regex wildcard).
thread.lines = grep("thread.html", index.lines, value = TRUE, fixed = TRUE)

# Extract the relative "<year>-<month>/thread.html" path from each line.
# "(19|20)" is a real alternation; the previous "[19|20]" was a bracket
# expression matching any single one of the characters 1, 9, |, 2, 0.
# Lines without such a path are dropped by regmatches().
thread.rel = regmatches(
  thread.lines,
  regexpr("(19|20)[0-9]{2}-[A-Za-z]+/thread\\.html", thread.lines)
)
if (length(thread.rel) == 0) {
  stop("No thread pages found in the archive index at ", archive.url,
       call. = FALSE)
}
thread.l = paste0(archive.url, thread.rel)

# One local file per month, named after the month directory in the URL.
outf = paste0("./tmp/", basename(dirname(thread.l)), ".html")

# The cache directory may already exist from a previous run; that is fine.
dir.create("./tmp/", showWarnings = FALSE)

# Download only the pages that are not cached yet.
# NOTE(review): a page cached while its month was still in progress is never
# refreshed -- delete the file to force a re-download.
for (i in seq_along(thread.l)) {
  if (!file.exists(outf[i])) {
    download_html(thread.l[i], file = outf[i])
  }
}
# Data
# Paths of the cached thread pages. Restricting the listing to *.html keeps
# stray files in ./tmp/ (editor backups, partial downloads under another name,
# ...) from being handed to the HTML parser.
thread.l.local = list.files("./tmp/", pattern = "\\.html$", full.names = TRUE)
# Scrape it
# Parse one cached thread page into a table with one row per thread.
#
# Arguments:
#   file_thread  Path to a saved thread page. The file name without its
#                ".html" extension is used as the month directory of the
#                archive (e.g. "./tmp/<year>-<month>.html").
#
# Returns a tibble with the columns
#   title     text of the first link of each thread (whitespace-trimmed),
#   link      absolute URL of that first message (NA if the thread has no link),
#   reps      number of <li> elements nested below the thread entry,
#   year_mon  the month directory derived from the file name.
scrap_thread_loc = function(file_thread) {
  # Strip only a trailing, literal ".html".
  year_mon = sub("\\.html$", "", basename(file_thread))
  month_url = paste0("https://stat.ethz.ch/pipermail/r-devel/", year_mon, "/")

  # NOTE(review): assumes the threads are the <li> children of the second
  # <ul> in <body> -- confirm against the archive's page layout.
  threads = xml_find_all(read_html(file_thread), "//body/ul[2]/li")

  # xml_find_first() yields exactly one (possibly missing) node per thread,
  # so everything derived from it stays aligned with `threads`.
  first_anchor = xml_find_first(threads, ".//a")

  # xml_text() returns one decoded string per node; flattening the anchors'
  # children instead could produce more (or fewer) entries than threads.
  title = xml_text(first_anchor, trim = TRUE)

  # month_url already ends in "/", so the href is appended directly.
  href = xml_attr(first_anchor, "href")
  link = rep(NA_character_, length(href))
  has_href = !is.na(href)
  link[has_href] = paste0(month_url, href[has_href])

  # vapply() guarantees an integer vector even when the page has no threads.
  reps = vapply(
    threads,
    function(node) length(xml_find_all(node, ".//li")),
    integer(1)
  )

  tibble(title, link, reps, year_mon = year_mon)
}
# Scrape all
# Run the scraper over every cached page (pblapply is the progress-bar
# variant of lapply), giving a list with one table per file.
all.rslt = pbapply::pblapply(X = thread.l.local, FUN = scrap_thread_loc)

# Stack the per-file tables row-wise into a single data frame.
all.rslt.df = do.call(what = rbind, args = all.rslt)
# Clean: top 10 threads by reply count
# Print the ten rows with the highest `reps` as a table:
# sort by reps (descending), keep rows 1-10, render with kable.
knitr::kable(
  slice(
    arrange(all.rslt.df, desc(reps)),
    seq_len(10)
  )
)
# Combine threads with the same title
# Merge threads that share a title: add up their reply counts and keep the
# link and month of the most recent occurrence, then print the ten titles
# with the most replies.
#
# A plain max() on `link` / `year_mon` compares strings, so within one year
# "September" would win over "October". Instead each month is ranked
# chronologically using the English month names in base R's `month.name`.
# NOTE(review): assumes year_mon looks like "<4-digit year>-<English month
# name>"; values in any other format get an NA rank and are sorted last.
all.rslt.df %>%
  mutate(
    .month_rank = as.integer(substr(year_mon, 1, 4)) * 12L +
      match(sub("^[0-9]{4}-", "", year_mon), month.name)
  ) %>%
  # Newest month first, so first() below picks the latest occurrence.
  arrange(desc(.month_rank)) %>%
  group_by(title) %>%
  summarise(
    link = first(link),
    reps = sum(reps),
    year_mon = first(year_mon)
  ) %>%
  arrange(desc(reps)) %>%
  slice(1:10) %>%
  knitr::kable()