library(purrr)
library(tidyverse)
library(stringr)
library(rvest)
Note: This was initially a collaborative project, but I made some adjustments independently. There are two GitHub links below: one containing the initial collaborative project, and one containing the finalized assignment. The final project's code is what is listed here.
View the collaborative code on GitHub
Goal: Scrape information from https://www.cheese.com to obtain a dataset of characteristics about different cheeses, and gain deeper insight into your coding process. 🪤
Part 1: Locate and examine the robots.txt
file for this website. Summarize what you learn from it.
https://www.cheese.com/robots.txt
User-agent: *
Sitemap: https://www.cheese.com/sitemap.xml
- User-agent: *: anyone is allowed to scrape
- No Crawl-delay: no wait time is required between each page scraped
- No Visit-time entry: no restrictions on time that scraping is allowed
- No Request-rate entry: no restrictions on simultaneous requests
- No mention of Disallow sections
Part 2: Obtain the following information for all cheeses in the database:
- cheese name
- URL for the cheese’s webpage (e.g., https://www.cheese.com/gouda/)
- whether or not the cheese has a picture (e.g., gouda has a picture, but bianco does not).
To be kind to the website owners, I added a 1 second pause between page queries. (Note that you can view 100 cheeses at a time.)
# Extract the text of every element matching a CSS selector on a page.
#
# @param page A parsed HTML document (as returned by rvest::read_html()).
# @param css_selector A single CSS selector string.
# @return A character vector with the text of each matched element.
get_text_from_page <- function(page, css_selector) {
  page |>
    html_elements(css_selector) |>
    html_text()
}
# Scrape one listing page of cheese.com: each cheese's name, the URL of
# its detail page, and whether its listing entry has a real photo.
#
# @param url A single listing-page URL (e.g. the alphabetical index).
# @return A tibble with columns `name` (chr), `url` (chr), `has_pic` (lgl).
scrape_page <- function(url) {
  # Input check: must be one character string that looks like an http(s) URL.
  # grepl reference: https://www.educative.io/answers/what-is-the-grepl-function-in-r
  if (!is.character(url) || length(url) != 1 || !grepl("^https?://", url)) {
    stop("inputted url must be a valid single URL string.")
  }

  # 1 second pause between page queries, to be kind to the website owners
  Sys.sleep(1)

  # Read the page
  page <- read_html(url)

  # Grab cheese names from the page (each listing entry is an <h3>)
  cheese_name <- get_text_from_page(page, "h3")

  # Grab the relative link from each cheese's <h3><a> node
  cheese_href <- page |>
    html_elements("h3 a") |>
    html_attr("href")

  # Turn the relative href into an absolute URL
  cheese_url <- paste0("https://cheese.com", cheese_href)

  # Grab the main-body image elements; entries with an actual photo carry
  # the "image-exists" class on their <img> tag
  cheese_pic_reference <- page |>
    html_elements("#main-body img")

  # TRUE when the image's class contains "image-exists", FALSE otherwise
  cheese_pic_ifelse <- cheese_pic_reference |>
    html_attr("class") |>
    str_detect("image-exists")

  # Assemble one row per cheese on this listing page
  tibble(
    name = cheese_name,
    url = cheese_url,
    has_pic = cheese_pic_ifelse
  )
}
# Alphabetical index, 100 cheeses per page
base_url <- "https://www.cheese.com/alphabetical/?per_page=100"

# Listing pages 1 through 21. Note: the bare base_url already shows
# page 1, so only the explicit "&page=" URLs are used — including the
# bare base_url as well would scrape page 1 twice.
urls_all_pages <- str_c(base_url, "&page=", 1:21)

# Scrape every listing page (scrape_page sleeps 1 s per request)
pages <- map(urls_all_pages, scrape_page)

# Stack the per-page tibbles into one dataset
df_cheeses <- bind_rows(pages)

head(df_cheeses)
# A tibble: 6 × 3
name url has_pic
<chr> <chr> <lgl>
1 2 Year Aged Cumin Gouda https://cheese.com/2-year-aged-cumi… TRUE
2 3-Cheese Italian Blend https://cheese.com/3-cheese-italian… FALSE
3 30 Month Aged Parmigiano Reggiano https://cheese.com/30-month-aged-pa… TRUE
4 3yrs Aged Vintage Gouda https://cheese.com/3yrs-aged-vintag… TRUE
5 Aarewasser https://cheese.com/aarewasser/ TRUE
6 Abbaye de Belloc https://cheese.com/abbaye-de-belloc/ TRUE
Part 3: When you go to a particular cheese’s page (like gouda), you’ll see more detailed information about the cheese. Obtain the following detailed information:
- milk information
- country of origin
- family
- type
- flavour
(Only 10 cheeses were used to avoid overtaxing the website! A 1 second pause was added between page queries.)
# Extract the first n cheese-detail links from a listing page.
#
# @param main_url The listing-page URL to read.
# @param n Number of links to return (positive whole number; default 10).
# @return A character vector of n absolute cheese-page URLs.
extract_links <- function(main_url, n = 10) {
  # Input check: n must be a positive whole number
  if (!is.numeric(n) || n <= 0 || n != as.integer(n)) {
    stop("n must be a positive integer.")
  }

  # 1 second pause between page queries
  Sys.sleep(1)

  # Read the page
  page <- read_html(main_url)

  # Get the first n relative hrefs for cheeses
  cheese_href <- page |>
    html_elements("h3 a") |>
    head(n) |>
    html_attr("href")

  # Build the absolute cheese URLs
  paste0("https://cheese.com", cheese_href)
}
# Helper function to extract one labelled field from a cheese page's text.
#
# Finds the line(s) of `cheese_text` that begin with `field_name` (a
# literal prefix such as "Country of origin:"), strips the label, and
# returns the trimmed remainder.
#
# @param field_name A single string: the literal label prefix to match.
# @param cheese_text Character vector of lines scraped from the page.
# @return The field value(s) as character, or NA_character_ if absent.
extract_field <- function(field_name, cheese_text) {
  # Input check
  if (!is.character(field_name) || length(field_name) != 1) {
    stop("field_name must be a single string.")
  }

  # Literal prefix match (startsWith), so labels containing regex
  # metacharacters cannot be misinterpreted as patterns.
  field <- cheese_text[startsWith(cheese_text, field_name)]

  # NA_character_ keeps the column type stable when rows are bound later
  if (length(field) == 0) return(NA_character_)

  # Drop the literal label, then trim surrounding whitespace
  trimws(sub(field_name, "", field, fixed = TRUE))
}
# Scrape one cheese's detail page for its characteristics.
#
# @param cheese_url The URL of a single cheese's page.
# @return A one-row tibble: milk, country, family, type, flavour
#   (NA where the page does not list the field).
cheese_scrape <- function(cheese_url) {
  # 1 second pause between page queries
  Sys.sleep(1)

  # Read the page
  page <- read_html(cheese_url)

  # The summary facts appear as "<li><p>Label: value</p></li>" items
  cheese_info_items <- page |>
    html_elements("li p") |>
    html_text()

  # Pull each labelled field out of the fact list
  tibble(
    milk = extract_field("Made from", cheese_info_items),
    country = extract_field("Country of origin:", cheese_info_items),
    family = extract_field("Family:", cheese_info_items),
    type = extract_field("Type:", cheese_info_items),
    flavour = extract_field("Flavour:", cheese_info_items)
  )
}
# First 10 cheese detail-page URLs from the alphabetical index
cheese_urls <- extract_links("https://www.cheese.com/alphabetical/")

# Scrape each detail page (cheese_scrape sleeps 1 s per request)
pages2 <- map(cheese_urls, cheese_scrape)

# One row per cheese
df_cheeses2 <- bind_rows(pages2)

df_cheeses2
# A tibble: 10 × 5
milk country family type flavour
<chr> <chr> <chr> <chr> <chr>
1 pasteurized cow's milk Netherlands <NA> semi-hard sharp
2 pasteurized cow's milk Italy Parmesan semi-soft, artisan butter…
3 unpasteurized cow's milk Italy <NA> hard <NA>
4 pasteurized cow's milk Netherlands <NA> hard strong
5 unpasteurized cow's milk Switzerland <NA> semi-soft sweet
6 unpasteurized sheep's milk France <NA> semi-hard, artisan burnt …
7 cow's milk France <NA> semi-hard <NA>
8 unpasteurized cow's milk France <NA> semi-soft, artisan, … acidic…
9 unpasteurized cow's milk France <NA> soft, artisan fruity…
10 pasteurized cow's milk France <NA> semi-hard salty,…