File size: 7,078 Bytes
60a49e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
import re
import sqlite3
import time
from pathlib import Path
from typing import Dict, List, Union

import requests
from bs4 import BeautifulSoup


def get_total_pages_and_products(base_url: str) -> tuple[int, int]:
    """
    Dynamically determine the total number of pages and products from the home page.

    Args:
        base_url (str): The URL of the home page.

    Returns:
        tuple[int, int]: (total pages, total products). Either value is 0 when
            the corresponding summary text cannot be found or parsed.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not respond within 10 seconds.
    """
    # A timeout guards against a hung connection; raise_for_status prevents
    # silently parsing an error page as if it were the catalogue.
    response = requests.get(base_url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # The results summary (e.g. "... 1000 results ...") lives in the filter form.
    form = soup.find("form", class_="form-horizontal")
    results_text = form.get_text(strip=True) if form else ""

    match = re.search(r"(\d+)\s*results", results_text)
    total_products = int(match.group(1)) if match else 0

    # The pager element reads like "Page 1 of 50".
    page_text_elem = soup.find("li", class_="current")
    page_text = page_text_elem.text.strip() if page_text_elem else ""

    match = re.search(r"Page \d+ of (\d+)", page_text)
    total_pages = int(match.group(1)) if match else 0

    return total_pages, total_products


def scrape_book_details(book_url: str) -> tuple[str, str, int]:
    """
    Scrape detailed information for a specific book.

    Args:
        book_url (str): The URL of the book's detail page.

    Returns:
        tuple[str, str, int]: The book's description ("No description" when
            absent), category ("Unknown" when absent), and stock quantity
            (0 when not parseable).

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not respond within 10 seconds.
    """
    # A timeout guards against a hung connection; raise_for_status prevents
    # silently parsing an error page as if it were a product page.
    response = requests.get(book_url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # The description is the first <p> following the #product_description
    # heading. Guard both lookups: find_next() can return None, and the
    # original unguarded .text would raise AttributeError in that case.
    description = "No description"
    description_elem = soup.find("div", id="product_description")
    if description_elem:
        description_p = description_elem.find_next("p")
        if description_p:
            description = description_p.text

    # Category is the third breadcrumb link: Home > Books > <category> > title.
    # Query the links once instead of calling find_all twice.
    category = "Unknown"
    breadcrumb = soup.find("ul", class_="breadcrumb")
    if breadcrumb:
        links = breadcrumb.find_all("a")
        if len(links) > 2:
            category = links[2].text

    # Stock line looks like "In stock (22 available)".
    availability_elem = soup.find("p", class_="instock availability")
    stock_text = availability_elem.text.strip() if availability_elem else ""
    match = re.search(r"In stock \((\d+) available\)", stock_text)
    stock_quantity = int(match.group(1)) if match else 0

    return description, category, stock_quantity


def scrape_book_page(url: str) -> List[Dict[str, Union[str, float, int]]]:
    """
    Scrape details for books on a single listing page.

    Args:
        url (str): The URL of the page to scrape.

    Returns:
        list[dict[str, str | float | int]]: One dict per book with keys
            title, price, star_rating, availability, description, category,
            and quantity.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not respond within 10 seconds.
    """
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    books = []
    # Hoisted out of the loop: the map is loop-invariant.
    rating_map = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}

    for book in soup.find_all("article", class_="product_pod"):
        title = book.h3.a["title"]

        # Extract the numeric price with a regex instead of slicing off one
        # leading character: when the parser mis-decodes "£" as two
        # characters ("Â£"), text[1:] leaves a non-numeric prefix and
        # float() raises ValueError.
        price_text = book.find("p", class_="price_color").text
        price_match = re.search(r"[\d.]+", price_text)
        price = float(price_match.group()) if price_match else 0.0

        # The rating is encoded as a CSS class, e.g. <p class="star-rating Three">.
        star_class = book.find("p", class_="star-rating")["class"][1]
        star_rating = rating_map.get(star_class, 0)

        availability = book.find("p", class_="instock availability").text.strip()

        # Listing links are relative ("../../..."); rebase onto the catalogue root.
        book_page_url = "https://books.toscrape.com/catalogue/" + book.h3.a[
            "href"
        ].replace("../", "")

        # Follow the detail page for description, category, and stock count.
        description, category, quantity = scrape_book_details(book_page_url)

        books.append(
            {
                "title": title,
                "price": price,
                "star_rating": star_rating,
                "availability": availability,
                "description": description,
                "category": category,
                "quantity": quantity,
            }
        )

    return books


def scrape_all_books(
    base_url: str, page_url_template: str
) -> List[Dict[str, Union[str, float, int]]]:
    """
    Walk every catalogue page and collect the details of all books.

    Args:
        base_url (str): The base URL of the website.
        page_url_template (str): The URL template for pagination.

    Returns:
        list[dict[str, str | float | int]]: Details for every scraped book.
    """
    total_pages, total_products = get_total_pages_and_products(base_url)

    print(f"Total Products: {total_products}")
    print(f"Total Pages: {total_pages}")

    all_books: List[Dict[str, Union[str, float, int]]] = []

    page_num = 1
    while page_num <= total_pages:
        page_url = page_url_template.format(page_num)
        print(f"Scraping page {page_num}")

        try:
            all_books.extend(scrape_book_page(page_url))
            time.sleep(0.1)  # Polite scraping: small delay between requests
        except Exception as e:
            # A broken page should not abort the whole crawl; report and move on.
            print(f"Error scraping page {page_num}: {e}")

        page_num += 1

    return all_books


def save_to_sqlite(data: List[Dict[str, Union[str, float, int]]], db_path: str) -> None:
    """
    Save the scraped data to an SQLite database.

    Creates the `books` table if it does not already exist, then appends one
    row per book record.

    Args:
        data (list[dict[str, str | float | int]]): The book records to save.
        db_path (str): The path to the SQLite database file.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Create table
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS books (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT NOT NULL,
                price REAL NOT NULL,
                star_rating INTEGER NOT NULL,
                availability TEXT NOT NULL,
                description TEXT NOT NULL,
                category TEXT NOT NULL,
                quantity INTEGER NOT NULL
            )
        """)

        # executemany batches all inserts in one C-level loop instead of a
        # Python-level execute() call per row.
        cursor.executemany(
            """
            INSERT INTO books (title, price, star_rating, availability, description, category, quantity)
            VALUES (?, ?, ?, ?, ?, ?, ?)
        """,
            (
                (
                    book["title"],
                    book["price"],
                    book["star_rating"],
                    book["availability"],
                    book["description"],
                    book["category"],
                    book["quantity"],
                )
                for book in data
            ),
        )

        conn.commit()
    finally:
        # Close even when an insert raises, so the db file is not left locked.
        conn.close()
    print(f"Data saved to {db_path}")


def main() -> None:
    """Scrape every book from books.toscrape.com and persist them to SQLite."""
    base_url = "https://books.toscrape.com/index.html"
    page_url_template = "https://books.toscrape.com/catalogue/page-{0}.html"

    books_data = scrape_all_books(base_url, page_url_template)

    # sqlite3.connect cannot create intermediate directories, so on a fresh
    # checkout a missing data/ directory would make the save crash.
    db_path = "data/books_data.db"
    Path(db_path).parent.mkdir(parents=True, exist_ok=True)

    save_to_sqlite(books_data, db_path)

    # Report the actual destination (the old message claimed "books.db").
    print(f"Scraped {len(books_data)} books. Data saved to {db_path}")


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()