library(purrr)
library(tidyverse)
library(stringr)
library(rvest)

Note: This was initially a collaborative project, but I made some adjustments independently. There are two GitHub links below: one contains the initial project, and one contains the finalized assignment. However, the final project's code is listed here.
View the collaborative code on GitHub
Goal: Scrape information from https://www.cheese.com to obtain a dataset of characteristics about different cheeses, and gain deeper insight into your coding process. 🪤
Part 1: Locate and examine the robots.txt file for this website. Summarize what you learn from it.
https://www.cheese.com/robots.txt
User-agent: *
Sitemap: https://www.cheese.com/sitemap.xml
- User-agent: *: anyone is allowed to scrape
- No Crawl-delay: no wait time is required between each page scraped
- No Visit-time entry: no restrictions on time that scraping is allowed
- No Request-rate entry: no restrictions on simultaneous requests
- No mention of Disallow sections
Part 2: Obtain the following information for all cheeses in the database:
- cheese name
- URL for the cheese’s webpage (e.g., https://www.cheese.com/gouda/)
- whether or not the cheese has a picture (e.g., gouda has a picture, but bianco does not).
To be kind to the website owners, I added a 1 second pause between page queries. (Note that you can view 100 cheeses at a time.)
# Return the text content of every element on `page` that matches
# `css_selector` (a character vector, one entry per matched element).
get_text_from_page <- function(page, css_selector) {
  matched_elements <- html_elements(page, css_selector)
  html_text(matched_elements)
}
# Scrape one cheese.com listing page and return a tibble with one row per
# cheese listed on it:
#   name    - the cheese's display name (text of each h3 heading)
#   url     - absolute URL of the cheese's detail page
#   has_pic - TRUE when the listing image carries the "image-exists" class
scrape_page <- function(url) {
  # Validate input: a single character string that looks like an http(s) URL
  # grepl reference: https://www.educative.io/answers/what-is-the-grepl-function-in-r
  if (!is.character(url) || length(url) != 1 || !grepl("^https?://", url)) {
    stop("inputted url must be a valid single URL string.")
  }
  # Be kind to the site: 1 second pause between page queries
  Sys.sleep(1)
  # Read the page
  page <- read_html(url)
  # Grab cheese name from the page
  cheese_name <- get_text_from_page(page, "h3")
  # Grab the relative link from each cheese heading.
  # html_elements() is the current rvest API; html_nodes() is deprecated.
  cheese_href <- page |>
    html_elements("h3 a") |>
    html_attr("href")
  # Turn the relative href into an absolute URL
  cheese_url <- paste0("https://cheese.com", cheese_href)
  # Grab the main body image elements
  cheese_pic_reference <- page |>
    html_elements("#main-body img")
  # TRUE when the image element's class contains "image-exists"
  cheese_has_pic <- cheese_pic_reference |>
    html_attr("class") |>
    str_detect("image-exists")
  # One row per cheese on this listing page
  tibble(
    name = cheese_name,
    url = cheese_url,
    has_pic = cheese_has_pic
  )
}

base_url <- "https://www.cheese.com/alphabetical/?per_page=100"
# Build the URL of every listing page (the base page plus pages 1 through 21),
# scrape each one, and stack the per-page tibbles into one data frame.
urls_all_pages <- c(
  base_url,
  paste0(base_url, "&page=", 1:21)
)
pages <- lapply(urls_all_pages, scrape_page)
df_cheeses <- bind_rows(pages)
head(df_cheeses)

# A tibble: 6 × 3
name url has_pic
<chr> <chr> <lgl>
1 2 Year Aged Cumin Gouda https://cheese.com/2-year-aged-cumi… TRUE
2 3-Cheese Italian Blend https://cheese.com/3-cheese-italian… FALSE
3 30 Month Aged Parmigiano Reggiano https://cheese.com/30-month-aged-pa… TRUE
4 3yrs Aged Vintage Gouda https://cheese.com/3yrs-aged-vintag… TRUE
5 Aarewasser https://cheese.com/aarewasser/ TRUE
6 Abbaye de Belloc https://cheese.com/abbaye-de-belloc/ TRUE
Part 3: When you go to a particular cheese’s page (like gouda), you’ll see more detailed information about the cheese. Obtain the following detailed information:
- milk information
- country of origin
- family
- type
- flavour
(Only 10 cheeses were used to avoid overtaxing the website! A 1 second pause was added between page queries.)
#Extract a certain amount of cheese links per page
# Collect the detail-page URLs for the first n cheeses on a cheese.com
# listing page.
#
# main_url: a single http(s) URL of a listing page.
# n:        how many cheese links to return (positive whole number, default 10).
# Returns a character vector of absolute cheese-page URLs.
extract_links <- function(main_url, n = 10) {
  # Validate the URL the same way scrape_page() does
  if (!is.character(main_url) || length(main_url) != 1 ||
      !grepl("^https?://", main_url)) {
    stop("main_url must be a valid single URL string.")
  }
  # n must be a single positive whole number
  if (!is.numeric(n) || length(n) != 1 || n <= 0 || n != as.integer(n)) {
    stop("n must be a positive integer.")
  }
  # Be kind to the site: 1 second pause between page queries
  Sys.sleep(1)
  # Read the page
  page <- read_html(main_url)
  # Get the first n hrefs from the cheese headings.
  # html_elements() is the current rvest API; html_nodes() is deprecated.
  cheese_href <- page |>
    html_elements("h3 a") |>
    head(n) |>
    html_attr("href")
  # Turn the relative hrefs into absolute URLs
  paste0("https://cheese.com", cheese_href)
}
#Helper function to extract the specific cheese information
extract_field <- function(field_name, cheese_text) {
#Input check
if (!is.character(field_name) || length(field_name) != 1) {
stop("field_name must be a single string.")
}
# Search for lines starting with field
field <- cheese_text[str_detect(cheese_text, paste0("^", field_name))]
if (length(field) == 0) return(NA)
trimws(str_remove(field, field_name))
}
#Scrape each individual cheese page
# Scrape one cheese's detail page and return a one-row tibble of its
# milk, country of origin, family, type and flavour (NA when a field
# is absent from the page).
cheese_scrape <- function(cheese_url) {
  # Validate input the same way scrape_page() does
  if (!is.character(cheese_url) || length(cheese_url) != 1 ||
      !grepl("^https?://", cheese_url)) {
    stop("cheese_url must be a valid single URL string.")
  }
  # Be kind to the site: 1 second pause between page queries
  Sys.sleep(1)
  # Read the page
  page <- read_html(cheese_url)
  # Pull the text of every <p> inside an <li> — the labelled fact lines
  cheese_info_items <- page |>
    html_elements("li p") |>
    html_text()
  # One row; extract_field() returns NA for any label that is missing.
  # Note "Made from" has no colon on the site, unlike the other labels.
  tibble(
    milk = extract_field("Made from", cheese_info_items),
    country = extract_field("Country of origin:", cheese_info_items),
    family = extract_field("Family:", cheese_info_items),
    type = extract_field("Type:", cheese_info_items),
    flavour = extract_field("Flavour:", cheese_info_items)
  )
}

cheese_urls <- extract_links("https://www.cheese.com/alphabetical/")
pages2 <- map(cheese_urls, cheese_scrape)
df_cheeses2 <- bind_rows(pages2)
df_cheeses2

# A tibble: 10 × 5
milk country family type flavour
<chr> <chr> <chr> <chr> <chr>
1 pasteurized cow's milk Netherlands <NA> semi-hard sharp
2 pasteurized cow's milk Italy Parmesan semi-soft, artisan butter…
3 unpasteurized cow's milk Italy <NA> hard <NA>
4 pasteurized cow's milk Netherlands <NA> hard strong
5 unpasteurized cow's milk Switzerland <NA> semi-soft sweet
6 unpasteurized sheep's milk France <NA> semi-hard, artisan burnt …
7 cow's milk France <NA> semi-hard <NA>
8 unpasteurized cow's milk France <NA> semi-soft, artisan, … acidic…
9 unpasteurized cow's milk France <NA> soft, artisan fruity…
10 pasteurized cow's milk France <NA> semi-hard salty,…