Spaces:

amgadhasan
/

bookstore-chatbot

Sleeping

App Files Files Community

bookstore-chatbot / src /scrape.py

amgadhasan

First commit

60a49e6 about 1 year ago

raw

history blame contribute delete

7.08 kB

	import re
	import sqlite3
	import time
	from typing import Dict, List, Union

	import requests
	from bs4 import BeautifulSoup


	def get_total_pages_and_products(base_url: str, timeout: float = 10.0) -> tuple[int, int]:
	    """
	    Determine the total number of pages and products from the home page.

	    Args:
	        base_url (str): The URL of the home page.
	        timeout (float): Seconds to wait for the server before giving up.

	    Returns:
	        tuple[int, int]: ``(total_pages, total_products)``. A value is 0 when
	        its summary text cannot be found in the page.

	    Raises:
	        requests.RequestException: If the request fails, times out, or the
	        server answers with an HTTP error status.
	    """
	    # Without a timeout requests may block forever on an unresponsive server.
	    response = requests.get(base_url, timeout=timeout)
	    # Fail loudly on 4xx/5xx instead of parsing an error page into (0, 0).
	    response.raise_for_status()
	    soup = BeautifulSoup(response.content, "html.parser")

	    # Results summary, e.g. "1000 results - showing 1 to 20."
	    form = soup.find("form", class_="form-horizontal")
	    results_text = form.get_text(strip=True) if form else ""
	    # \s* because get_text(strip=True) can glue the number to "results".
	    match = re.search(r"(\d+)\s*results", results_text)
	    total_products = int(match.group(1)) if match else 0

	    # Pager summary, e.g. "Page 1 of 50"
	    page_text_elem = soup.find("li", class_="current")
	    page_text = page_text_elem.text.strip() if page_text_elem else ""
	    match = re.search(r"Page \d+ of (\d+)", page_text)
	    total_pages = int(match.group(1)) if match else 0

	    return total_pages, total_products


	def scrape_book_details(book_url: str, timeout: float = 10.0) -> tuple[str, str, int]:
	    """
	    Scrape detailed information for a specific book.

	    Missing page elements never raise; each field falls back to a default.

	    Args:
	        book_url (str): The URL of the book's detail page.
	        timeout (float): Seconds to wait for the server before giving up.

	    Returns:
	        tuple[str, str, int]: The book's description ("No description" when
	        absent), category ("Unknown" when absent) and stock quantity
	        (0 when it cannot be parsed).

	    Raises:
	        requests.RequestException: If the request fails or times out.
	    """
	    # Without a timeout requests may block forever on an unresponsive server.
	    response = requests.get(book_url, timeout=timeout)
	    soup = BeautifulSoup(response.content, "html.parser")

	    # The description is the first <p> after the #product_description block.
	    description = "No description"
	    description_elem = soup.find("div", id="product_description")
	    if description_elem is not None:
	        paragraph = description_elem.find_next("p")
	        # find_next returns None when no <p> follows; keep the default then.
	        if paragraph is not None:
	            description = paragraph.text

	    # The category is the third link of the breadcrumb trail.
	    breadcrumb = soup.find("ul", class_="breadcrumb")
	    breadcrumb_links = breadcrumb.find_all("a") if breadcrumb else []
	    category = breadcrumb_links[2].text if len(breadcrumb_links) > 2 else "Unknown"

	    # Stock quantity, e.g. "In stock (22 available)"
	    availability_elem = soup.find("p", class_="instock availability")
	    stock_text = availability_elem.text.strip() if availability_elem else ""
	    match = re.search(r"In stock \((\d+) available\)", stock_text)
	    stock_quantity = int(match.group(1)) if match else 0

	    return description, category, stock_quantity


	def scrape_book_page(url: str, timeout: float = 10.0) -> List[Dict[str, Union[str, float, int]]]:
	    """
	    Scrape details for books on a single listing page.

	    Each book's detail page is fetched as well, via scrape_book_details.

	    Args:
	        url (str): The URL of the page to scrape.
	        timeout (float): Seconds to wait for the listing page before giving up.

	    Returns:
	        list[dict[str, str | float | int]]: One dictionary per book with the
	        keys "title", "price", "star_rating", "availability", "description",
	        "category" and "quantity".

	    Raises:
	        requests.RequestException: If the listing request fails, times out,
	        or the server answers with an HTTP error status.
	        ValueError: If a price cannot be converted to a float.
	    """
	    # Function-scope import so the module's import block stays untouched.
	    from urllib.parse import urljoin

	    # Without a timeout requests may block forever on an unresponsive server.
	    response = requests.get(url, timeout=timeout)
	    # An error page has no products; raise instead of returning [] silently.
	    response.raise_for_status()
	    soup = BeautifulSoup(response.content, "html.parser")

	    # Loop-invariant: the rating is encoded as a CSS class name.
	    rating_map = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}

	    books = []
	    for book in soup.find_all("article", class_="product_pod"):
	        link = book.h3.a
	        title = link["title"]

	        # Drop every leading non-digit (currency symbol, stray encoding
	        # bytes) rather than assuming exactly one character.
	        price_text = book.find("p", class_="price_color").text
	        price = float(re.sub(r"^\D+", "", price_text.strip()))

	        # Second class token carries the rating word, e.g. "star-rating Three".
	        star_class = book.find("p", class_="star-rating")["class"][1]
	        star_rating = rating_map.get(star_class, 0)

	        availability = book.find("p", class_="instock availability").text.strip()

	        # Resolve the relative href against the page it came from, so that
	        # "../" segments and index-page hrefs both resolve correctly.
	        book_page_url = urljoin(url, link["href"])

	        description, category, quantity = scrape_book_details(book_page_url)

	        books.append(
	            {
	                "title": title,
	                "price": price,
	                "star_rating": star_rating,
	                "availability": availability,
	                "description": description,
	                "category": category,
	                "quantity": quantity,
	            }
	        )

	    return books


	def scrape_all_books(
	    base_url: str, page_url_template: str
	) -> List[Dict[str, Union[str, float, int]]]:
	    """
	    Walk every listing page and collect the books found on each one.

	    Args:
	        base_url (str): Home page URL used to discover the page/product totals.
	        page_url_template (str): Pagination URL with a positional ``{}``
	            placeholder that receives the 1-based page number.

	    Returns:
	        list[dict[str, str | float | int]]: Book records from all pages that
	        were scraped successfully; failed pages are reported and skipped.
	    """
	    page_count, product_count = get_total_pages_and_products(base_url)

	    print(f"Total Products: {product_count}")
	    print(f"Total Pages: {page_count}")

	    collected = []
	    page_num = 1
	    while page_num <= page_count:
	        page_url = page_url_template.format(page_num)
	        print(f"Scraping page {page_num}")

	        try:
	            collected += scrape_book_page(page_url)
	            # Polite scraping: short pause after each successful page.
	            time.sleep(0.1)
	        except Exception as e:
	            # Best effort: report the failure and move on to the next page.
	            print(f"Error scraping page {page_num}: {e}")

	        page_num += 1

	    return collected


	def save_to_sqlite(data: List[Dict[str, Union[str, float, int]]], db_path: str) -> None:
	    """
	    Save the scraped data to an SQLite database.

	    The ``books`` table is created if it does not exist and the rows are
	    appended to it. All inserts run in one transaction: either every row is
	    stored or, if anything fails, none are.

	    Args:
	        data (list[dict[str, str | float | int]]): The data to save. Each
	            dictionary must provide the keys "title", "price",
	            "star_rating", "availability", "description", "category" and
	            "quantity".
	        db_path (str): The path to the SQLite database file.

	    Raises:
	        KeyError: If a record lacks one of the required keys.
	        sqlite3.Error: If the database cannot be opened or written.
	    """
	    columns = (
	        "title",
	        "price",
	        "star_rating",
	        "availability",
	        "description",
	        "category",
	        "quantity",
	    )

	    conn = sqlite3.connect(db_path)
	    try:
	        # "with conn" commits on success and rolls back on any exception.
	        with conn:
	            conn.execute("""
	                CREATE TABLE IF NOT EXISTS books (
	                    id INTEGER PRIMARY KEY AUTOINCREMENT,
	                    title TEXT NOT NULL,
	                    price REAL NOT NULL,
	                    star_rating INTEGER NOT NULL,
	                    availability TEXT NOT NULL,
	                    description TEXT NOT NULL,
	                    category TEXT NOT NULL,
	                    quantity INTEGER NOT NULL
	                )
	            """)

	            # Build all parameter tuples first so a malformed record raises
	            # before any row is written.
	            rows = [tuple(book[column] for column in columns) for book in data]
	            conn.executemany(
	                """
	                INSERT INTO books (title, price, star_rating, availability, description, category, quantity)
	                VALUES (?, ?, ?, ?, ?, ?, ?)
	                """,
	                rows,
	            )
	    finally:
	        # The transaction context manager does not close the connection.
	        conn.close()

	    print(f"Data saved to {db_path}")


	def main() -> None:
	    """Scrape the whole books.toscrape.com catalogue and store it in SQLite."""
	    # Function-scope import so the module's import block stays untouched.
	    import os

	    base_url = "https://books.toscrape.com/index.html"
	    page_url_template = "https://books.toscrape.com/catalogue/page-{0}.html"
	    db_path = "data/books_data.db"

	    # sqlite3 does not create missing parent directories. Create the folder
	    # before the long scrape so a missing directory cannot discard the results.
	    os.makedirs(os.path.dirname(db_path), exist_ok=True)

	    books_data = scrape_all_books(base_url, page_url_template)

	    save_to_sqlite(books_data, db_path)

	    # Report the path actually written (the old message named "books.db").
	    print(f"Scraped {len(books_data)} books. Data saved to {db_path}")


	# Script entry point: run the scraper only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	    main()