# A small exercise in text scraping with rvest
pacman::p_load(tidyverse, rvest, textclean)
# Scrape one page ----
# Address of the post to scrape and the XPath of the nodes wrapping its body.
html_address <- "https://www.r-bloggers.com/2020/10/daylight-charts-with-r/"
xpath1 <- "//article/div"

# Fetch the page, keep the <p> and <li> nodes found under the XPath matches,
# extract their text and pass it through the two textclean replacers.
sometext <- html_address %>%
  read_html() %>%
  html_nodes(xpath = xpath1) %>%
  html_nodes(css = "p,li") %>%
  html_text() %>%
  replace_non_ascii2() %>%
  replace_html()
# Build a function to do it ----
#' Download the text of a web page ("sciagnij tekst" = "download text")
#'
#' Reads the page at `html_address`, selects the nodes matched by
#' `xpath_to_text`, and collects the text of every `<p>` and `<li>` node found
#' beneath them. The text is then passed through
#' `textclean::replace_non_ascii2()` and `textclean::replace_html()`.
#'
#' @param html_address Address of the page, handed to `rvest::read_html()`.
#' @param xpath_to_text XPath expression selecting the container nodes whose
#'   paragraphs and list items should be extracted.
#' @return The cleaned text, one element per matched `<p>`/`<li>` node.
fn_sciagnij_tekst <- function(html_address, xpath_to_text = "//article/div") {
  page <- read_html(x = html_address)
  containers <- html_nodes(page, xpath = xpath_to_text)
  text_nodes <- html_nodes(containers, "p,li")
  raw_text <- html_text(text_nodes)
  ascii_text <- replace_non_ascii2(raw_text)
  replace_html(ascii_text)
}
# Test the function ----
# Smoke test: scrape a single post, relying on the default XPath.
one_read <- fn_sciagnij_tekst(
  "https://www.r-bloggers.com/2020/10/rapid-internationalization-of-shiny-apps-shiny-i18n-version-0-2/"
)
# Test it on more URLs ----
# Addresses ("adresy") of the posts scraped in the examples that follow.
adresy <- c(
  "https://r-bloggers.com/2020/10/rapid-internationalization-of-shiny-apps-shiny-i18n-version-0-2/",
  "https://www.r-bloggers.com/2018/07/pca-vs-autoencoders-for-dimensionality-reduction/",
  "https://www.r-bloggers.com/2020/10/little-useless-useful-r-function-r-jobs-title-generator/"
)
# Use `for` syntax ----
# Scrape every address in `adresy` and collect one row per page:
#   adresy    - the address that was scraped
#   zawartosc - the page text ("zawartosc" = "content"), with the pieces
#               returned by fn_sciagnij_tekst() joined by newlines
#
# Rows are gathered in a preallocated list and bound once after the loop,
# rather than growing `df` with bind_rows() on every iteration.
# The former `df_temp <- data_frame()` placeholder is gone: it was overwritten
# on the very next line and its only effect was tibble's deprecation warning
# for `data_frame()`, so that warning transcript no longer applies.
wiersze <- vector("list", length(adresy))
for (i in seq_along(adresy)) {
  wiersze[[i]] <- data.frame(
    adresy = adresy[i],
    zawartosc = paste0(fn_sciagnij_tekst(adresy[i]), collapse = "\n")
  )
}
df <- bind_rows(wiersze)
# Use `map` syntax ----
# Same scrape written with purrr: each address yields a named character
# vector (address + newline-joined page text) and map_df() binds those
# vectors into the rows of `df`.
df <- map_df(
  adresy,
  function(adres) {
    c(
      adresy = adres,
      teksty = paste0(fn_sciagnij_tekst(adres), collapse = "\n")
    )
  }
)
# TODO: this needs some error handling, e.g. with `possibly()` as shown here:
# https://stackoverflow.com/questions/50486527/how-to-use-map-with-possibly
# Even better would be to use `safely()` and then rectangle the embedded lists
# into regular columns `result` and `error`. Suitable functions:
# https://tidyr.tidyverse.org/reference/hoist.html
# https://purrr.tidyverse.org/reference/flatten.html
# No comments:
# Post a Comment