# bookstore-chatbot / src/scrape.py
# amgadhasan's picture
# First commit
# 60a49e6
import os
import re
import sqlite3
import time
from typing import Dict, List, Union
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
def get_total_pages_and_products(
    base_url: str, *, timeout: float = 10.0
) -> tuple[int, int]:
    """
    Dynamically determine the total number of pages and products from the home page.

    Both numbers are read from text on the page: the product count from the
    ``form-horizontal`` form (text matching ``"<N> results"``) and the page
    count from the ``li.current`` element (text matching ``"Page <i> of <N>"``).
    A number whose element or pattern is missing is reported as 0.

    Args:
        base_url (str): The URL of the home page.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        tuple[int, int]: A tuple containing the total number of pages and total number of products.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(base_url, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than parsing an error page into (0, 0).
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Total products, taken from the results summary text.
    form = soup.find("form", class_="form-horizontal")
    results_text = form.get_text(strip=True) if form else ""
    products_match = re.search(r"(\d+)\s*results", results_text)
    total_products = int(products_match.group(1)) if products_match else 0

    # Total pages, taken from the pager's "current page" text.
    page_text_elem = soup.find("li", class_="current")
    page_text = page_text_elem.text.strip() if page_text_elem else ""
    pages_match = re.search(r"Page \d+ of (\d+)", page_text)
    total_pages = int(pages_match.group(1)) if pages_match else 0

    return total_pages, total_products
def scrape_book_details(
    book_url: str, *, timeout: float = 10.0
) -> tuple[str, str, int]:
    """
    Scrape detailed information for a specific book.

    Missing page elements do not raise; each field falls back to a default
    instead: ``"No description"``, ``"Unknown"`` and ``0`` respectively.

    Args:
        book_url (str): The URL of the book's detail page.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        tuple[str, str, int]: A tuple containing the book's description, category, and stock quantity.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(book_url, timeout=timeout)
    # An error page would otherwise be parsed into the default values below
    # and stored as if it were real book data.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Description: the first <p> that follows the "product_description" header.
    description = "No description"
    description_header = soup.find("div", id="product_description")
    if description_header is not None:
        paragraph = description_header.find_next("p")
        # find_next returns None when no <p> follows the header.
        if paragraph is not None:
            description = paragraph.text

    # Category: the third link in the breadcrumb trail.
    category = "Unknown"
    breadcrumb = soup.find("ul", class_="breadcrumb")
    if breadcrumb is not None:
        links = breadcrumb.find_all("a")
        if len(links) > 2:
            category = links[2].text

    # Stock quantity, parsed from text like "In stock (22 available)".
    availability_elem = soup.find("p", class_="instock availability")
    stock_text = availability_elem.text.strip() if availability_elem else ""
    stock_match = re.search(r"In stock \((\d+) available\)", stock_text)
    stock_quantity = int(stock_match.group(1)) if stock_match else 0

    return description, category, stock_quantity
def scrape_book_page(
    url: str, *, timeout: float = 10.0
) -> List[Dict[str, Union[str, float, int]]]:
    """
    Scrape details for books on a single page.

    Every ``article.product_pod`` on the listing page yields one dictionary
    with the keys ``title``, ``price``, ``star_rating``, ``availability``,
    ``description``, ``category`` and ``quantity``. The last three come from a
    second request to the book's own detail page. A book whose detail request
    fails is reported on stdout and skipped, so the rest of the page is kept.

    Args:
        url (str): The URL of the page to scrape.
        timeout (float): Seconds to wait for the listing page before giving up.

    Returns:
        list[dict[str, str | float | int]]: A list of dictionaries containing book details.

    Raises:
        requests.RequestException: If the listing page request fails, times
            out, or returns an HTTP error status.
        ValueError: If a book's price text contains no number.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than silently returning no books.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Word used in the star-rating CSS class -> numeric rating.
    rating_map = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}

    books = []
    for book in soup.find_all("article", class_="product_pod"):
        # Basic book information
        link = book.h3.a
        title = link["title"]

        # Pull the number out of the price text instead of assuming the
        # currency symbol is exactly one character long.
        price_text = book.find("p", class_="price_color").text
        price_match = re.search(r"\d+(?:\.\d+)?", price_text)
        if price_match is None:
            raise ValueError(f"Could not parse a price from {price_text!r}")
        price = float(price_match.group())

        # The rating is the second CSS class, e.g. class="star-rating Three".
        star_class = book.find("p", class_="star-rating")["class"][1]
        star_rating = rating_map.get(star_class, 0)

        availability = book.find("p", class_="instock availability").text.strip()

        # Resolve the (relative) link against the page it was found on, so
        # the result is correct whatever directory the listing page is in.
        book_page_url = urljoin(url, link["href"])

        try:
            description, category, quantity = scrape_book_details(book_page_url)
        except requests.RequestException as e:
            # One unreachable detail page should not cost the whole listing.
            print(f"Error scraping details for {title!r}: {e}")
            continue

        books.append(
            {
                "title": title,
                "price": price,
                "star_rating": star_rating,
                "availability": availability,
                "description": description,
                "category": category,
                "quantity": quantity,
            }
        )
    return books
def scrape_all_books(
    base_url: str, page_url_template: str, *, delay: float = 0.1
) -> List[Dict[str, Union[str, float, int]]]:
    """
    Scrape books from all pages.

    The number of pages is read from ``base_url``; pages ``1..total_pages``
    are then fetched by formatting ``page_url_template`` with the page number.
    The crawl is best-effort: a page that raises is reported on stdout and
    skipped, and the remaining pages are still scraped.

    Args:
        base_url (str): The base URL of the website.
        page_url_template (str): The URL template for pagination.
        delay (float): Non-negative number of seconds to pause after each page.

    Returns:
        list[dict[str, str | float | int]]: A list of dictionaries containing all scraped book details.
    """
    total_pages, total_products = get_total_pages_and_products(base_url)
    print(f"Total Products: {total_products}")
    print(f"Total Pages: {total_pages}")

    all_books = []
    for page_num in range(1, total_pages + 1):
        url = page_url_template.format(page_num)
        print(f"Scraping page {page_num}")
        try:
            page_books = scrape_book_page(url)
        except Exception as e:
            # Deliberately broad: this is the crawl's top-level boundary and
            # one bad page must not abort the remaining ones.
            print(f"Error scraping page {page_num}: {e}")
        else:
            all_books.extend(page_books)
        # Polite scraping: pause after every page, including failed ones, so
        # a struggling server is not hit again immediately.
        time.sleep(delay)
    return all_books
def save_to_sqlite(data: List[Dict[str, Union[str, float, int]]], db_path: str) -> None:
    """
    Save the scraped data to an SQLite database.

    The ``books`` table is created if it does not exist and one row is
    appended per item in ``data``. All rows are written in a single
    transaction, so either every row is stored or none is. The directory
    containing ``db_path`` is created when missing.

    Args:
        data (list[dict[str, str | float | int]]): The data to save. Every
            item must provide the keys ``title``, ``price``, ``star_rating``,
            ``availability``, ``description``, ``category`` and ``quantity``.
        db_path (str): The path to the SQLite database file.

    Raises:
        KeyError: If an item in ``data`` lacks one of the required keys; the
            database is not touched in that case.
        sqlite3.Error: If the database cannot be opened or written.
    """
    columns = (
        "title",
        "price",
        "star_rating",
        "availability",
        "description",
        "category",
        "quantity",
    )
    # Build every row up front so a malformed item fails before any I/O.
    rows = [tuple(book[column] for column in columns) for book in data]

    # sqlite3 creates the file but not its directory; make sure it exists.
    parent_dir = os.path.dirname(db_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    conn = sqlite3.connect(db_path)
    try:
        # "with conn" commits on success and rolls back if anything raises.
        with conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS books (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    title TEXT NOT NULL,
                    price REAL NOT NULL,
                    star_rating INTEGER NOT NULL,
                    availability TEXT NOT NULL,
                    description TEXT NOT NULL,
                    category TEXT NOT NULL,
                    quantity INTEGER NOT NULL
                )
            """)
            conn.executemany(
                """
                INSERT INTO books (title, price, star_rating, availability, description, category, quantity)
                VALUES (?, ?, ?, ?, ?, ?, ?)
                """,
                rows,
            )
    finally:
        # Always release the file handle, even when the write failed.
        conn.close()
    print(f"Data saved to {db_path}")
def main() -> None:
    """Scrape every book listed on books.toscrape.com and save the result to SQLite."""
    base_url = "https://books.toscrape.com/index.html"
    page_url_template = "https://books.toscrape.com/catalogue/page-{0}.html"
    db_path = "data/books_data.db"

    books_data = scrape_all_books(base_url, page_url_template)
    save_to_sqlite(books_data, db_path)
    # Report the path that was actually written; the message used to name
    # "books.db", which is not the file this script produces.
    print(f"Scraped {len(books_data)} books. Data saved to {db_path}")


if __name__ == "__main__":
    main()