import re
import sqlite3
import time
from typing import Dict, List, Union
import requests
from bs4 import BeautifulSoup
def get_total_pages_and_products(base_url: str) -> tuple[int, int]:
    """Determine the total number of pages and products from the home page.

    Args:
        base_url (str): The URL of the home page.

    Returns:
        tuple[int, int]: A (total_pages, total_products) tuple; each value
        falls back to 0 when the corresponding summary text is missing or
        does not match the expected pattern.
    """
    # Timeout so a stalled server cannot hang the scraper indefinitely.
    response = requests.get(base_url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    # The "... N results ..." summary lives inside the horizontal filter form.
    form = soup.find("form", class_="form-horizontal")
    results_text = form.get_text(strip=True) if form else ""
    match = re.search(r"(\d+)\s*results", results_text)
    total_products = int(match.group(1)) if match else 0

    # The pager element reads "Page X of Y"; Y is the total page count.
    page_text_elem = soup.find("li", class_="current")
    page_text = page_text_elem.text.strip() if page_text_elem else ""
    match = re.search(r"Page \d+ of (\d+)", page_text)
    total_pages = int(match.group(1)) if match else 0

    return total_pages, total_products
def scrape_book_details(book_url: str) -> tuple[str, str, int]:
    """Scrape detailed information for a specific book.

    Args:
        book_url (str): The URL of the book's detail page.

    Returns:
        tuple[str, str, int]: A (description, category, stock_quantity)
        tuple. Missing fields fall back to "No description", "Unknown",
        and 0 respectively.
    """
    # Timeout so a stalled server cannot hang the scraper indefinitely.
    response = requests.get(book_url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    # The description is the first <p> following the "product_description"
    # heading div.
    description_elem = soup.find("div", id="product_description")
    description = (
        description_elem.find_next("p").text if description_elem else "No description"
    )

    # Breadcrumb is Home > Books > <category> > <title>; the third anchor
    # (index 2) is the category. Collect the anchors once instead of calling
    # find_all twice as the original did.
    breadcrumb = soup.find("ul", class_="breadcrumb")
    anchors = breadcrumb.find_all("a") if breadcrumb else []
    category = anchors[2].text if len(anchors) > 2 else "Unknown"

    # Availability text reads like "In stock (22 available)".
    availability_elem = soup.find("p", class_="instock availability")
    stock_text = availability_elem.text.strip() if availability_elem else ""
    match = re.search(r"In stock \((\d+) available\)", stock_text)
    stock_quantity = int(match.group(1)) if match else 0

    return description, category, stock_quantity
def scrape_book_page(url: str) -> List[Dict[str, Union[str, float, int]]]:
    """Scrape details for books on a single page.

    Args:
        url (str): The URL of the catalogue page to scrape.

    Returns:
        list[dict[str, str | float | int]]: A list of dictionaries
        containing book details (title, price, star_rating, availability,
        description, category, quantity).
    """
    # Word -> numeric star rating; the site encodes the rating as a CSS class.
    # Hoisted out of the loop (it is loop-invariant).
    rating_map = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}

    # Timeout so a stalled server cannot hang the scraper indefinitely.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    books: List[Dict[str, Union[str, float, int]]] = []
    for book in soup.find_all("article", class_="product_pod"):
        title = book.h3.a["title"]

        # Extract the numeric part of the price with a regex instead of
        # slicing off one leading character: robust against multi-byte or
        # mis-decoded currency symbols (e.g. "Â£51.77"), which made
        # float(text[1:]) raise ValueError.
        price_text = book.find("p", class_="price_color").text
        price_match = re.search(r"\d+(?:\.\d+)?", price_text)
        price = float(price_match.group()) if price_match else 0.0

        # The second CSS class on the star-rating element is the word rating.
        star_class = book.find("p", class_="star-rating")["class"][1]
        star_rating = rating_map.get(star_class, 0)

        availability = book.find("p", class_="instock availability").text.strip()

        # Listing hrefs are relative ("../../<slug>/index.html"); rebase them
        # onto the catalogue root to get an absolute detail-page URL.
        book_page_url = "https://books.toscrape.com/catalogue/" + book.h3.a[
            "href"
        ].replace("../", "")

        # Fetch description/category/stock from the book's own page.
        description, category, quantity = scrape_book_details(book_page_url)

        books.append(
            {
                "title": title,
                "price": price,
                "star_rating": star_rating,
                "availability": availability,
                "description": description,
                "category": category,
                "quantity": quantity,
            }
        )
    return books
def scrape_all_books(
    base_url: str, page_url_template: str
) -> List[Dict[str, Union[str, float, int]]]:
    """Scrape books from every catalogue page.

    Args:
        base_url (str): The base URL of the website.
        page_url_template (str): The URL template for pagination, with one
            positional slot for the page number.

    Returns:
        list[dict[str, str | float | int]]: All scraped book records.
    """
    total_pages, total_products = get_total_pages_and_products(base_url)
    print(f"Total Products: {total_products}")
    print(f"Total Pages: {total_pages}")

    collected: List[Dict[str, Union[str, float, int]]] = []
    page_num = 1
    while page_num <= total_pages:
        page_url = page_url_template.format(page_num)
        print(f"Scraping page {page_num}")
        try:
            collected.extend(scrape_book_page(page_url))
            time.sleep(0.1)  # Polite scraping: add a delay between requests
        except Exception as e:
            # A single bad page should not abort the whole crawl.
            print(f"Error scraping page {page_num}: {e}")
        page_num += 1
    return collected
def save_to_sqlite(data: List[Dict[str, Union[str, float, int]]], db_path: str) -> None:
    """Save the scraped data to an SQLite database.

    Args:
        data (list[dict[str, str | float | int]]): Book records to persist;
            each dict must contain the keys title, price, star_rating,
            availability, description, category, and quantity.
        db_path (str): The path to the SQLite database file.
    """
    conn = sqlite3.connect(db_path)
    # try/finally guarantees the connection is closed even if an insert
    # raises; the original leaked the connection on error.
    try:
        cursor = conn.cursor()
        # Create table
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS books (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT NOT NULL,
                price REAL NOT NULL,
                star_rating INTEGER NOT NULL,
                availability TEXT NOT NULL,
                description TEXT NOT NULL,
                category TEXT NOT NULL,
                quantity INTEGER NOT NULL
            )
        """)
        # executemany inserts all rows through one prepared statement
        # instead of re-executing per row.
        cursor.executemany(
            """
            INSERT INTO books (title, price, star_rating, availability, description, category, quantity)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            [
                (
                    book["title"],
                    book["price"],
                    book["star_rating"],
                    book["availability"],
                    book["description"],
                    book["category"],
                    book["quantity"],
                )
                for book in data
            ],
        )
        conn.commit()
    finally:
        conn.close()
    print(f"Data saved to {db_path}")
def main() -> None:
    """Entry point: scrape the whole catalogue and persist it to SQLite."""
    from pathlib import Path

    base_url = "https://books.toscrape.com/index.html"
    page_url_template = "https://books.toscrape.com/catalogue/page-{0}.html"

    books_data = scrape_all_books(base_url, page_url_template)

    # Ensure the output directory exists before SQLite tries to create the
    # file (connect() fails if the parent directory is missing).
    db_path = "data/books_data.db"
    Path(db_path).parent.mkdir(parents=True, exist_ok=True)
    save_to_sqlite(books_data, db_path)

    # Report the actual output path; the original message named "books.db"
    # although the data was written to data/books_data.db.
    print(f"Scraped {len(books_data)} books. Data saved to {db_path}")


if __name__ == "__main__":
    main()