Sunday, October 25, 2020

First steps in text scraping with rvest

A small exercise in text scraping with rvest

pacman::p_load(tidyverse, rvest, textclean)

Scrape one page

# Post to scrape, and the XPath of the container that holds its body
html_address <- "https://www.r-bloggers.com/2020/10/daylight-charts-with-r/"
xpath1 <- "//article/div"

sometext <-
  html_address %>%
  # download and parse the whole page
  read_html() %>%
  # narrow down to the nodes matched by the XPath
  html_nodes(xpath = xpath1) %>%
  # inside them, pick the paragraphs and list items
  html_nodes(css = "p,li") %>%
  # keep only the text content of each node
  html_text() %>%
  # deal with non-ASCII characters
  replace_non_ascii2() %>%
  # clean up any HTML markup left in the text
  replace_html()

Build a function to do it

#' Download the text of a web page
#'
#' Reads the page at `html_address`, selects the nodes matching
#' `xpath_to_text`, and collects the text of the paragraphs (`p`) and list
#' items (`li`) found inside them. The text is then passed through
#' `replace_non_ascii2()` and `replace_html()`.
#'
#' @param html_address URL of the page.
#' @param xpath_to_text XPath leading to the text; prefix it with `//` to
#'   search the whole document.
#'
#' @return text extracted from page
#' @export
#'
#' @examples fn_sciagnij_tekst(html_address = "https://www.r-bloggers.com/2020/10/rapid-internationalization-of-shiny-apps-shiny-i18n-version-0-2/")
#'
fn_sciagnij_tekst <- function(html_address, xpath_to_text = "//article/div") {
  html_address %>%
    read_html() %>%
    html_nodes(xpath = xpath_to_text) %>%
    html_nodes(css = "p,li") %>%
    html_text() %>%
    replace_non_ascii2() %>%
    replace_html()
}

Test the function

# Try the function on a single post; only the address is supplied.
one_read <- fn_sciagnij_tekst(
  html_address = "https://www.r-bloggers.com/2020/10/rapid-internationalization-of-shiny-apps-shiny-i18n-version-0-2/"
)

Test it on more URLs

# Addresses of the posts to scrape (note: the first one has no "www." prefix)
adresy <- c(
  "https://r-bloggers.com/2020/10/rapid-internationalization-of-shiny-apps-shiny-i18n-version-0-2/",
  "https://www.r-bloggers.com/2018/07/pca-vs-autoencoders-for-dimensionality-reduction/",
  "https://www.r-bloggers.com/2020/10/little-useless-useful-r-function-r-jobs-title-generator/"
)

Use `for` loop syntax.

# Scrape each address in turn. Rows are collected in a pre-sized list and
# bound together once at the end, so `df` is not re-copied on every iteration.
rows <- vector("list", length(adresy))

for (i in seq_along(adresy)) {
  rows[[i]] <- data.frame(
    adresy = adresy[i],
    # collapse the text pieces returned for the page into one string
    zawartosc = paste0(fn_sciagnij_tekst(adresy[i]), collapse = "\n")
  )
}

# One row per address: `adresy` (url) and `zawartosc` (page text)
df <- bind_rows(rows)

Use `map` syntax.

# Same scrape with purrr: each address yields a named pair (`adresy`, `teksty`)
# and map_df() row-binds the pairs into a single data frame.
df <- map_df(
  adresy,
  function(url) {
    page_text <- paste0(fn_sciagnij_tekst(url), collapse = "\n")
    c(adresy = url, teksty = page_text)
  }
)
# TODO: this needs some error handling, e.g. with `possibly`, as shown here:
https://stackoverflow.com/questions/50486527/how-to-use-map-with-possibly
Even better would be to use `safely` and then rectangle the embedded lists into regular columns `result` and `error`. Suitable functions:
https://tidyr.tidyverse.org/reference/hoist.html
https://purrr.tidyverse.org/reference/flatten.html

No comments:

Post a Comment

An example of a bat file that shows dialogues

@echo off setlocal :: Prompt user for input file names set /p jpgfile="Enter the name of the JPG file: " set /p archive="Ent...