scrap_r_mailist

Background

https://d.cosx.org/d/420739-r

Download all thread HTML pages

# Read the r-devel archive index and build the absolute URL of every monthly
# "thread.html" page it links to.
thread.l <- readLines("https://stat.ethz.ch/pipermail/r-devel/") %>%
  grep("thread.html", ., value = TRUE, fixed = TRUE) %>%
  # "(19|20)" is a real alternation on the century prefix; the previous
  # "[19|20]" was a character class matching any single one of 1, 9, |, 2, 0.
  # "[^\"]*" keeps the capture inside the quoted href value.
  sub(
    ".*?((19|20)[^\"]*html).*",
    "https://stat.ethz.ch/pipermail/r-devel/\\1",
    .
  )

# Local cache path for each monthly thread page, named after the month
# directory of its URL (".../2019-June/thread.html" -> "./tmp/2019-June.html").
outf <- paste0("./tmp/", basename(dirname(thread.l)), ".html")

# Re-running the script with an existing ./tmp is expected, so do not warn
# when the directory is already there.
dir.create("./tmp/", showWarnings = FALSE)

## Warning in dir.create("./tmp/"): './tmp' already exists

# Download every thread page that is not cached locally yet.
# seq_along() gives an empty sequence when thread.l is empty, whereas
# 1:length(thread.l) would iterate over c(1, 0) and index past the vector.
for (i in seq_along(thread.l)) {
  if (!file.exists(outf[i])) {
    download_html(thread.l[i], file = outf[i])
  }
}

Data

# Cached thread pages to parse. The pattern restricts the listing to *.html
# files so that any other file left in ./tmp/ is not handed to the HTML parser.
thread.l.local <- list.files("./tmp/", pattern = "\\.html$", full.names = TRUE)

Scrape it

#' Extract the top-level threads from one cached monthly thread page.
#'
#' @param file_thread Path to a locally saved thread page whose file name is
#'   the archive month followed by ".html" (e.g. "./tmp/2014-March.html").
#' @return A tibble with one row per `<li>` directly under the second `<ul>`
#'   of the page body, with columns `title` (contents of the item's first
#'   link), `link` (absolute URL built from that link's `href`), `reps`
#'   (number of `<li>` elements nested inside the item, used as the reply
#'   count) and `year_mon` (the month taken from the file name).
scrap_thread_loc <- function(file_thread) {
  # Remove only a literal, trailing ".html"; the previous pattern ".html" left
  # the dot unescaped and unanchored.
  year_mon <- sub("\\.html$", "", basename(file_thread))

  # The base URL already ends in "/", so hrefs are appended to it directly.
  # Pasting an extra "/" in between produced ".../2014-March//068548.html".
  dir <- paste0("https://stat.ethz.ch/pipermail/r-devel/", year_mon, "/")

  tmpl <- read_html(file_thread) %>%
    xml_find_all("//body/ul[2]/li")

  # First link of each top-level item; it supplies both the title and the href.
  first_link <- xml_find_first(tmpl, ".//a")

  title <- first_link %>%
    xml_contents() %>%
    as.character()

  href <- xml_attr(first_link, "href")
  # paste0() drops zero-length arguments, so guard the empty page explicitly to
  # keep `link` the same length as `title`.
  link <- if (length(href) > 0) paste0(dir, href) else character(0)

  # vapply() pins the result to an integer vector even when the page has no
  # threads (sapply() would return an empty list in that case).
  reps <- vapply(
    tmpl,
    function(node) length(xml_find_all(node, ".//li")),
    integer(1)
  )

  tibble(title, link, reps, year_mon = year_mon)
}

Scrape all downloaded thread pages

# Parse every cached page (with a progress bar): one tibble per monthly page.
all.rslt <- pbapply::pblapply(thread.l.local, scrap_thread_loc)

# Stack the per-month tibbles into a single table. bind_rows() takes the list
# directly and returns an empty tibble for an empty list, where
# do.call(rbind, list()) would return NULL and break the steps below.
all.rslt.df <- dplyr::bind_rows(all.rslt)

Clean: top 10 threads by number of replies

# Show the ten threads with the most replies: sort by `reps` in descending
# order, keep the first ten rows, and render them as a table.
knitr::kable(
  slice(
    arrange(all.rslt.df, desc(reps)),
    1:10
  )
)

title	link	reps	year_mon
[Rd] [RFC] A case for freezing CRAN	https://stat.ethz.ch/pipermail/r-devel/2014-March//068548.html	69	2014-March
[Rd] CRAN policies	https://stat.ethz.ch/pipermail/r-devel/2012-March//063678.html	51	2012-March
[Rd] surprising behaviour of names<-	https://stat.ethz.ch/pipermail/r-devel/2009-March//052522.html	48	2009-March
[Rd] legitimate use of :::	https://stat.ethz.ch/pipermail/r-devel/2013-August//067180.html	44	2013-August
[Rd] if(–as-cran)?	https://stat.ethz.ch/pipermail/r-devel/2012-September//064760.html	42	2012-September
[Rd] declaring package dependencies	https://stat.ethz.ch/pipermail/r-devel/2013-September//067446.html	42	2013-September
[Rd] Bias in R’s random integers?	https://stat.ethz.ch/pipermail/r-devel/2018-September//076817.html	37	2018-September
[Rd] R 3.0, Rtools3.0,l Windows7 64-bit, and permission agony	https://stat.ethz.ch/pipermail/r-devel/2013-April//066388.html	36	2013-April
[Rd] PATCH: Add fields argument to installed.packages and available.packages	https://stat.ethz.ch/pipermail/r-devel/2006-August//042397.html	35	2006-August
[Rd] Suggestion: help(<package name>)	https://stat.ethz.ch/pipermail/r-devel/2005-June//033480.html	34	2005-June

Combine threads with the same title

# Merge rows that share a title (a thread can appear in more than one monthly
# page), add up their reply counts, and show the ten largest as a table.
# local() keeps the intermediate tables out of the global environment.
local({
  # `link` and `year_mon` are character columns, so max() picks the largest
  # string in sort order.
  # NOTE(review): that is not necessarily the most recent month - confirm
  # this is the intended representative row.
  merged <- summarise(
    group_by(all.rslt.df, title),
    link = max(link),
    reps = sum(reps),
    year_mon = max(year_mon)
  )
  ranked <- arrange(merged, desc(reps))
  knitr::kable(slice(ranked, 1:10))
})

title	link	reps	year_mon
[Rd] [RFC] A case for freezing CRAN	https://stat.ethz.ch/pipermail/r-devel/2014-March//068548.html	69	2014-March
[Rd] Wish list	https://stat.ethz.ch/pipermail/r-devel/2007-January//044134.html	62	2007-January
[Rd] CRAN policies	https://stat.ethz.ch/pipermail/r-devel/2012-March//063678.html	51	2012-March
[Rd] legitimate use of :::	https://stat.ethz.ch/pipermail/r-devel/2014-May//069031.html	48	2014-May
[Rd] NEWS.md support on CRAN	https://stat.ethz.ch/pipermail/r-devel/2015-May//071215.html	48	2015-May
[Rd] surprising behaviour of names<-	https://stat.ethz.ch/pipermail/r-devel/2009-March//052522.html	48	2009-March
[Rd] declaring package dependencies	https://stat.ethz.ch/pipermail/r-devel/2013-September//067446.html	42	2013-September
[Rd] if(–as-cran)?	https://stat.ethz.ch/pipermail/r-devel/2012-September//064760.html	42	2012-September
[Rd] Bias in R’s random integers?	https://stat.ethz.ch/pipermail/r-devel/2018-September//076817.html	37	2018-September
[Rd] R 3.0, Rtools3.0,l Windows7 64-bit, and permission agony	https://stat.ethz.ch/pipermail/r-devel/2013-April//066388.html	36	2013-April

scrap_r_mailist

TC

6/14/2019

Background

Download all thread html

Data

Scrape it

scrape all

clean

Combine threads with the same title