Background

https://d.cosx.org/d/420739-r

Download all thread HTML pages

# Collect the URL of every monthly "thread.html" index linked from the
# r-devel pipermail archive front page:
#   1. read the front page line by line;
#   2. keep only the lines that mention "thread.html" (literal match);
#   3. cut the relative link "<year>-<Month>/thread.html" out of each line
#      and prefix it with the archive URL.
# The year prefix is written as the alternation (?:19|20). The bracket form
# [19|20] is a character class that matches a single one of the characters
# 1, 9, |, 2, 0, so it anchors on the first such character in the line
# rather than on a year. perl = TRUE is needed for the (?: ) group.
thread.l <- readLines("https://stat.ethz.ch/pipermail/r-devel/") %>%
  grep("thread.html", ., value = TRUE, fixed = TRUE) %>%
  sub(".*?((?:19|20)[0-9]{2}-[A-Za-z]+/thread\\.html).*",
      "https://stat.ethz.ch/pipermail/r-devel/\\1", ., perl = TRUE)

# Local cache path for each monthly index: ./tmp/<archive directory>.html,
# where the archive directory is the folder the thread.html lives in.
outf <- paste0("./tmp/", basename(dirname(thread.l)), ".html")

# Make sure the cache directory exists. showWarnings = FALSE keeps re-runs
# quiet when ./tmp/ is already there.
dir.create("./tmp/", showWarnings = FALSE)

# Download only the pages that are not cached yet.
# seq_along() gives an empty sequence when no links were found, whereas
# 1:length(x) would iterate over 1 and 0.
for (i in seq_along(thread.l)) {
  if (!file.exists(outf[i])) {
    download_html(thread.l[i], file = outf[i])
  }
}

Data

# Paths of all cached monthly index pages. Restricting the listing to
# "*.html" keeps any other file that ends up in ./tmp/ away from the parser.
thread.l.local <- list.files("./tmp/", pattern = "\\.html$", full.names = TRUE)

Scrape it

# Parse one cached monthly thread index into a table of its threads.
#
# Args:
#   file_thread: path to a cached index page; the file name without its
#     ".html" suffix is used as the archive month (e.g. "2014-March").
#
# Returns:
#   A tibble with one row per top-level <li> of the second <ul> in <body>:
#     title    - text of the first <a> inside the item
#     link     - archive URL built from that <a>'s href
#     reps     - number of <li> elements nested below the item
#                (presumably the thread's reply count)
#     year_mon - archive month taken from the file name
scrap_thread_loc <- function(file_thread) {
  # Strip only a trailing, literal ".html" (the unescaped "." in ".html"
  # would match any character, anywhere in the name).
  year_mon <- sub("\\.html$", "", basename(file_thread))

  # No trailing slash here: the separator is added once when the link is
  # assembled, so URLs no longer contain "<month>//<message>.html".
  month_url <- paste0("https://stat.ethz.ch/pipermail/r-devel/", year_mon)

  threads <- xml_find_all(read_html(file_thread), "//body/ul[2]/li")

  # One node per thread, aligned with `threads`.
  first_link <- xml_find_first(threads, ".//a")

  # vapply() always returns an integer vector, even for a page without
  # threads, where sapply() would return an empty list.
  reps <- vapply(
    threads,
    function(node) length(xml_find_all(node, ".//li")),
    integer(1)
  )

  tibble(
    # xml_text() yields exactly one decoded string per link; serialising the
    # child nodes instead keeps entities escaped and can change the length.
    title = xml_text(first_link),
    link = paste0(month_url, "/", xml_attr(first_link, "href")),
    reps = reps,
    year_mon = year_mon
  )
}

Scrape all cached pages

# Parse every cached index page into its own table, then stack the
# per-page tables row-wise into a single data frame.
all.rslt <- pbapply::pblapply(X = thread.l.local, FUN = scrap_thread_loc)
all.rslt.df <- do.call(what = rbind, args = all.rslt)

Top 10 threads by number of replies

# Print the ten threads with the largest reply count as a table:
# sort by reps (descending), keep the first ten rows, format with kable.
knitr::kable(
  head(arrange(all.rslt.df, desc(reps)), 10)
)
| title | link | reps | year_mon |
|:------|:-----|-----:|:---------|
| [Rd] [RFC] A case for freezing CRAN | https://stat.ethz.ch/pipermail/r-devel/2014-March//068548.html | 69 | 2014-March |
| [Rd] CRAN policies | https://stat.ethz.ch/pipermail/r-devel/2012-March//063678.html | 51 | 2012-March |
| [Rd] surprising behaviour of names<- | https://stat.ethz.ch/pipermail/r-devel/2009-March//052522.html | 48 | 2009-March |
| [Rd] legitimate use of ::: | https://stat.ethz.ch/pipermail/r-devel/2013-August//067180.html | 44 | 2013-August |
| [Rd] if(--as-cran)? | https://stat.ethz.ch/pipermail/r-devel/2012-September//064760.html | 42 | 2012-September |
| [Rd] declaring package dependencies | https://stat.ethz.ch/pipermail/r-devel/2013-September//067446.html | 42 | 2013-September |
| [Rd] Bias in R’s random integers? | https://stat.ethz.ch/pipermail/r-devel/2018-September//076817.html | 37 | 2018-September |
| [Rd] R 3.0, Rtools3.0,l Windows7 64-bit, and permission agony | https://stat.ethz.ch/pipermail/r-devel/2013-April//066388.html | 36 | 2013-April |
| [Rd] PATCH: Add fields argument to installed.packages and available.packages | https://stat.ethz.ch/pipermail/r-devel/2006-August//042397.html | 35 | 2006-August |
| [Rd] Suggestion: help(<package name>) | https://stat.ethz.ch/pipermail/r-devel/2005-June//033480.html | 34 | 2005-June |

Combine threads with the same title

# Merge threads that share a title: reply counts are summed, and the link
# and month reported are those of the chronologically latest thread.
#
# year_mon has the form "<year>-<English month name>" (e.g. "2014-March"),
# so comparing it as a string, as max() does, orders months alphabetically
# ("2014-September" would beat "2014-October"). A numeric month index is
# built instead and the rows are sorted by it before grouping; within one
# month the link is the tie-breaker, as the original max(link) was.
all.rslt.df %>%
  mutate(
    month_idx = 12L * as.integer(sub("-.*$", "", year_mon)) +
      match(sub("^[^-]*-", "", year_mon), month.name)
  ) %>%
  arrange(month_idx, link) %>%
  group_by(title) %>%
  summarise(link = last(link),
            reps = sum(reps),
            year_mon = last(year_mon)) %>%
  arrange(desc(reps)) %>%
  slice(1:10) %>%
  knitr::kable()
| title | link | reps | year_mon |
|:------|:-----|-----:|:---------|
| [Rd] [RFC] A case for freezing CRAN | https://stat.ethz.ch/pipermail/r-devel/2014-March//068548.html | 69 | 2014-March |
| [Rd] Wish list | https://stat.ethz.ch/pipermail/r-devel/2007-January//044134.html | 62 | 2007-January |
| [Rd] CRAN policies | https://stat.ethz.ch/pipermail/r-devel/2012-March//063678.html | 51 | 2012-March |
| [Rd] legitimate use of ::: | https://stat.ethz.ch/pipermail/r-devel/2014-May//069031.html | 48 | 2014-May |
| [Rd] NEWS.md support on CRAN | https://stat.ethz.ch/pipermail/r-devel/2015-May//071215.html | 48 | 2015-May |
| [Rd] surprising behaviour of names<- | https://stat.ethz.ch/pipermail/r-devel/2009-March//052522.html | 48 | 2009-March |
| [Rd] declaring package dependencies | https://stat.ethz.ch/pipermail/r-devel/2013-September//067446.html | 42 | 2013-September |
| [Rd] if(--as-cran)? | https://stat.ethz.ch/pipermail/r-devel/2012-September//064760.html | 42 | 2012-September |
| [Rd] Bias in R’s random integers? | https://stat.ethz.ch/pipermail/r-devel/2018-September//076817.html | 37 | 2018-September |
| [Rd] R 3.0, Rtools3.0,l Windows7 64-bit, and permission agony | https://stat.ethz.ch/pipermail/r-devel/2013-April//066388.html | 36 | 2013-April |