import re
import sqlite3
import time
from typing import Dict, List, Union
import requests
from bs4 import BeautifulSoup
def get_total_pages_and_products(base_url: str) -> tuple[int, int]:
    """Determine the total number of pages and products from the home page.

    Args:
        base_url (str): The URL of the home page.

    Returns:
        tuple[int, int]: A (total_pages, total_products) tuple; each value
        falls back to 0 when the corresponding summary text is missing or
        does not match the expected pattern.
    """
    # Timeout so a stalled server cannot hang the scraper indefinitely.
    response = requests.get(base_url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    # The "... N results ..." summary lives inside the horizontal filter form.
    form = soup.find("form", class_="form-horizontal")
    results_text = form.get_text(strip=True) if form else ""
    match = re.search(r"(\d+)\s*results", results_text)
    total_products = int(match.group(1)) if match else 0

    # The pager element reads "Page X of Y"; Y is the total page count.
    page_text_elem = soup.find("li", class_="current")
    page_text = page_text_elem.text.strip() if page_text_elem else ""
    match = re.search(r"Page \d+ of (\d+)", page_text)
    total_pages = int(match.group(1)) if match else 0

    return total_pages, total_products
def scrape_book_details(book_url: str) -> tuple[str, str, int]:
    """Scrape detailed information for a specific book.

    Args:
        book_url (str): The URL of the book's detail page.

    Returns:
        tuple[str, str, int]: A (description, category, stock_quantity)
        tuple. Missing fields fall back to "No description", "Unknown",
        and 0 respectively.
    """
    # Timeout so a stalled server cannot hang the scraper indefinitely.
    response = requests.get(book_url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    # The description is the first <p> following the "product_description"
    # heading div.
    description_elem = soup.find("div", id="product_description")
    description = (
        description_elem.find_next("p").text if description_elem else "No description"
    )

    # Breadcrumb is Home > Books > <category> > <title>; the third anchor
    # (index 2) is the category. Collect the anchors once instead of calling
    # find_all twice as the original did.
    breadcrumb = soup.find("ul", class_="breadcrumb")
    anchors = breadcrumb.find_all("a") if breadcrumb else []
    category = anchors[2].text if len(anchors) > 2 else "Unknown"

    # Availability text reads like "In stock (22 available)".
    availability_elem = soup.find("p", class_="instock availability")
    stock_text = availability_elem.text.strip() if availability_elem else ""
    match = re.search(r"In stock \((\d+) available\)", stock_text)
    stock_quantity = int(match.group(1)) if match else 0

    return description, category, stock_quantity
def scrape_book_page(url: str) -> List[Dict[str, Union[str, float, int]]]:
    """Scrape details for books on a single page.

    Args:
        url (str): The URL of the catalogue page to scrape.

    Returns:
        list[dict[str, str | float | int]]: A list of dictionaries
        containing book details (title, price, star_rating, availability,
        description, category, quantity).
    """
    # Word -> numeric star rating; the site encodes the rating as a CSS class.
    # Hoisted out of the loop (it is loop-invariant).
    rating_map = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}

    # Timeout so a stalled server cannot hang the scraper indefinitely.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    books: List[Dict[str, Union[str, float, int]]] = []
    for book in soup.find_all("article", class_="product_pod"):
        title = book.h3.a["title"]

        # Extract the numeric part of the price with a regex instead of
        # slicing off one leading character: robust against multi-byte or
        # mis-decoded currency symbols (e.g. "Â£51.77"), which made
        # float(text[1:]) raise ValueError.
        price_text = book.find("p", class_="price_color").text
        price_match = re.search(r"\d+(?:\.\d+)?", price_text)
        price = float(price_match.group()) if price_match else 0.0

        # The second CSS class on the star-rating element is the word rating.
        star_class = book.find("p", class_="star-rating")["class"][1]
        star_rating = rating_map.get(star_class, 0)

        availability = book.find("p", class_="instock availability").text.strip()

        # Listing hrefs are relative ("../../<slug>/index.html"); rebase them
        # onto the catalogue root to get an absolute detail-page URL.
        book_page_url = "https://books.toscrape.com/catalogue/" + book.h3.a[
            "href"
        ].replace("../", "")

        # Fetch description/category/stock from the book's own page.
        description, category, quantity = scrape_book_details(book_page_url)

        books.append(
            {
                "title": title,
                "price": price,
                "star_rating": star_rating,
                "availability": availability,
                "description": description,
                "category": category,
                "quantity": quantity,
            }
        )
    return books
def scrape_all_books(
    base_url: str, page_url_template: str
) -> List[Dict[str, Union[str, float, int]]]:
    """Scrape books from every catalogue page.

    Args:
        base_url (str): The base URL of the website.
        page_url_template (str): The URL template for pagination, with one
            positional slot for the page number.

    Returns:
        list[dict[str, str | float | int]]: All scraped book records.
    """
    total_pages, total_products = get_total_pages_and_products(base_url)
    print(f"Total Products: {total_products}")
    print(f"Total Pages: {total_pages}")

    collected: List[Dict[str, Union[str, float, int]]] = []
    page_num = 1
    while page_num <= total_pages:
        page_url = page_url_template.format(page_num)
        print(f"Scraping page {page_num}")
        try:
            collected.extend(scrape_book_page(page_url))
            time.sleep(0.1)  # Polite scraping: add a delay between requests
        except Exception as e:
            # A single bad page should not abort the whole crawl.
            print(f"Error scraping page {page_num}: {e}")
        page_num += 1
    return collected
def save_to_sqlite(data: List[Dict[str, Union[str, float, int]]], db_path: str) -> None:
    """Save the scraped data to an SQLite database.

    Args:
        data (list[dict[str, str | float | int]]): Book records to persist;
            each dict must contain the keys title, price, star_rating,
            availability, description, category, and quantity.
        db_path (str): The path to the SQLite database file.
    """
    conn = sqlite3.connect(db_path)
    # try/finally guarantees the connection is closed even if an insert
    # raises; the original leaked the connection on error.
    try:
        cursor = conn.cursor()
        # Create table
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS books (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT NOT NULL,
                price REAL NOT NULL,
                star_rating INTEGER NOT NULL,
                availability TEXT NOT NULL,
                description TEXT NOT NULL,
                category TEXT NOT NULL,
                quantity INTEGER NOT NULL
            )
        """)
        # executemany inserts all rows through one prepared statement
        # instead of re-executing per row.
        cursor.executemany(
            """
            INSERT INTO books (title, price, star_rating, availability, description, category, quantity)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            [
                (
                    book["title"],
                    book["price"],
                    book["star_rating"],
                    book["availability"],
                    book["description"],
                    book["category"],
                    book["quantity"],
                )
                for book in data
            ],
        )
        conn.commit()
    finally:
        conn.close()
    print(f"Data saved to {db_path}")
def main() -> None:
    """Entry point: scrape the whole catalogue and persist it to SQLite."""
    from pathlib import Path

    base_url = "https://books.toscrape.com/index.html"
    page_url_template = "https://books.toscrape.com/catalogue/page-{0}.html"

    books_data = scrape_all_books(base_url, page_url_template)

    # Ensure the output directory exists before SQLite tries to create the
    # file (connect() fails if the parent directory is missing).
    db_path = "data/books_data.db"
    Path(db_path).parent.mkdir(parents=True, exist_ok=True)
    save_to_sqlite(books_data, db_path)

    # Report the actual output path; the original message named "books.db"
    # although the data was written to data/books_data.db.
    print(f"Scraped {len(books_data)} books. Data saved to {db_path}")


if __name__ == "__main__":
    main()