import os
import re
import sqlite3
import time
from typing import Dict, List, Union

import requests
from bs4 import BeautifulSoup


def get_total_pages_and_products(base_url: str) -> tuple[int, int]:
    """
    Dynamically determine the total number of pages and products from the home page.

    Args:
        base_url (str): The URL of the home page.

    Returns:
        tuple[int, int]: A tuple containing the total number of pages and total number of products.
    """
    response = requests.get(base_url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the results summary text, e.g. "1000 results - showing 1 to 20"
    form = soup.find("form", class_="form-horizontal")
    results_text = form.get_text(strip=True) if form else ""

    # Extract the total number of products
    match = re.search(r"(\d+)\s*results", results_text)
    total_products = int(match.group(1)) if match else 0

    # Find the page summary text, e.g. "Page 1 of 50"
    page_text_elem = soup.find("li", class_="current")
    page_text = page_text_elem.text.strip() if page_text_elem else ""

    # Extract the total number of pages
    match = re.search(r"Page \d+ of (\d+)", page_text)
    total_pages = int(match.group(1)) if match else 0

    return total_pages, total_products
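
# Minimal usage sketch for the helper above (hedged: assumes the live
# books.toscrape.com markup, where the home page reports "1000 results"
# and a pager reading "Page 1 of 50"); uncomment to try it standalone:
#
#   pages, products = get_total_pages_and_products("https://books.toscrape.com/index.html")
#   print(pages, products)  # on the live site this prints: 50 1000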


def scrape_book_details(book_url: str) -> tuple[str, str, int]:
    """
    Scrape detailed information for a specific book.

    Args:
        book_url (str): The URL of the book's detail page.

    Returns:
        tuple[str, str, int]: A tuple containing the book's description, category, and stock quantity.
    """
    response = requests.get(book_url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract the description: the paragraph that follows the
    # "product_description" anchor div
    description_elem = soup.find("div", id="product_description")
    description = (
        description_elem.find_next("p").text if description_elem else "No description"
    )

    # Extract the category from the breadcrumb trail
    # (Home > Books > <category> > <title>), i.e. the third link
    breadcrumb = soup.find("ul", class_="breadcrumb")
    category = (
        breadcrumb.find_all("a")[2].text
        if breadcrumb and len(breadcrumb.find_all("a")) > 2
        else "Unknown"
    )

    # Extract the stock quantity from text like "In stock (22 available)"
    availability_elem = soup.find("p", class_="instock availability")
    stock_text = availability_elem.text.strip() if availability_elem else ""
    match = re.search(r"In stock \((\d+) available\)", stock_text)
    stock_quantity = int(match.group(1)) if match else 0

    return description, category, stock_quantity
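
# Hedged example of what the detail scraper returns (values observed on the
# live site's first book and subject to change):
#
#   desc, cat, qty = scrape_book_details(
#       "https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html"
#   )
#   # cat == "Poetry", qty == 22 at the time of writing; desc is a long string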


def scrape_book_page(url: str) -> List[Dict[str, Union[str, float, int]]]:
    """
    Scrape details for books on a single page.

    Args:
        url (str): The URL of the page to scrape.

    Returns:
        list[dict[str, str | float | int]]: A list of dictionaries containing book details.
    """
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    books = []
    book_elements = soup.find_all("article", class_="product_pod")
    for book in book_elements:
        # Basic book information
        title = book.h3.a["title"]
        price = book.find("p", class_="price_color").text[1:]  # Remove the £ symbol

        # Map the star-rating CSS class (e.g. "star-rating Three") to an integer
        star_class = book.find("p", class_="star-rating")["class"][1]
        rating_map = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}
        star_rating = rating_map.get(star_class, 0)

        # Get availability text, e.g. "In stock"
        availability = book.find("p", class_="instock availability").text.strip()

        # Build the absolute URL of the book's detail page from the relative href
        relative_href = book.h3.a["href"].replace("../", "")
        book_page_url = "https://books.toscrape.com/catalogue/" + relative_href

        # Scrape additional details from the detail page
        description, category, quantity = scrape_book_details(book_page_url)

        books.append(
            {
                "title": title,
                "price": float(price),
                "star_rating": star_rating,
                "availability": availability,
                "description": description,
                "category": category,
                "quantity": quantity,
            }
        )
    return books
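
# Each dictionary produced above looks roughly like this (illustrative values
# from the live site, which may change):
#
#   {
#       "title": "A Light in the Attic",
#       "price": 51.77,
#       "star_rating": 3,
#       "availability": "In stock",
#       "description": "...",
#       "category": "Poetry",
#       "quantity": 22,
#   }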


def scrape_all_books(
    base_url: str, page_url_template: str
) -> List[Dict[str, Union[str, float, int]]]:
    """
    Scrape books from all pages.

    Args:
        base_url (str): The base URL of the website.
        page_url_template (str): The URL template for pagination.

    Returns:
        list[dict[str, str | float | int]]: A list of dictionaries containing all scraped book details.
    """
    total_pages, total_products = get_total_pages_and_products(base_url)
    print(f"Total Products: {total_products}")
    print(f"Total Pages: {total_pages}")

    all_books = []
    for page_num in range(1, total_pages + 1):
        url = page_url_template.format(page_num)
        print(f"Scraping page {page_num}")
        try:
            page_books = scrape_book_page(url)
            all_books.extend(page_books)
            time.sleep(0.1)  # Polite scraping: add a delay between requests
        except Exception as e:
            print(f"Error scraping page {page_num}: {e}")
    return all_books
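
# The pagination template simply interpolates the page number; for example,
# with the template used in main() below:
#
#   "https://books.toscrape.com/catalogue/page-{0}.html".format(2)
#   # -> "https://books.toscrape.com/catalogue/page-2.html"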


def save_to_sqlite(data: List[Dict[str, Union[str, float, int]]], db_path: str) -> None:
    """
    Save the scraped data to an SQLite database.

    Args:
        data (list[dict[str, str | float | int]]): The data to save.
        db_path (str): The path to the SQLite database file.
    """
    # Ensure the target directory exists; sqlite3 will not create it
    db_dir = os.path.dirname(db_path)
    if db_dir:
        os.makedirs(db_dir, exist_ok=True)

    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    # Create table
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS books (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            title TEXT NOT NULL,
            price REAL NOT NULL,
            star_rating INTEGER NOT NULL,
            availability TEXT NOT NULL,
            description TEXT NOT NULL,
            category TEXT NOT NULL,
            quantity INTEGER NOT NULL
        )
    """)
    # Insert data
    for book in data:
        cursor.execute(
            """
            INSERT INTO books (title, price, star_rating, availability, description, category, quantity)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            (
                book["title"],
                book["price"],
                book["star_rating"],
                book["availability"],
                book["description"],
                book["category"],
                book["quantity"],
            ),
        )
    conn.commit()
    conn.close()
    print(f"Data saved to {db_path}")


def main() -> None:
    base_url = "https://books.toscrape.com/index.html"
    page_url_template = "https://books.toscrape.com/catalogue/page-{0}.html"

    books_data = scrape_all_books(base_url, page_url_template)
    save_to_sqlite(books_data, "data/books_data.db")
    print(f"Scraped {len(books_data)} books.")


if __name__ == "__main__":
    main()