amgadhasan committed
Commit 60a49e6 · 1 Parent(s): afd6f8b

First commit
Dockerfile ADDED
@@ -0,0 +1,28 @@
+ ##############
+ # Base Image #
+ ##############
+ FROM ghcr.io/astral-sh/uv:python3.10-bookworm-slim AS base
+
+ WORKDIR /app
+
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     --mount=type=bind,source=uv.lock,target=uv.lock \
+     --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
+     uv sync --frozen --no-install-project
+
+ #####################
+ # Development Image #
+ #####################
+ FROM base AS development
+
+ COPY --from=base /app/.venv /app/.venv
+
+ COPY . /app
+
+ RUN uv sync --frozen
+
+ RUN uv run python src/scrape.py
+
+ ENV GRADIO_SERVER_NAME="0.0.0.0"
+
+ CMD ["uv", "run", "python", "-m", "src.app"]
README.md CHANGED
@@ -1,13 +1,11 @@
  ---
  title: Bookstore Chatbot
- emoji: 📉
- colorFrom: indigo
- colorTo: red
- sdk: gradio
- sdk_version: 5.15.0
- app_file: app.py
+ emoji: 👁
+ colorFrom: red
+ colorTo: indigo
+ sdk: docker
  pinned: false
- short_description: An online bookstore chatbot
+ short_description: A chatbot for an online bookstore
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # store-data-chatbot
+ A chatbot to ask questions about products in online stores.
api.log ADDED
The diff for this file is too large to render.
 
pyproject.toml ADDED
@@ -0,0 +1,12 @@
+ [project]
+ name = "store-data-chatbot"
+ version = "0.1.0"
+ description = "A chatbot to ask questions about products in online stores."
+ readme = "README.md"
+ requires-python = ">=3.10"
+ dependencies = [
+     "beautifulsoup4>=4.13.3",
+     "gradio>=5.15.0",
+     "openai>=1.61.1",
+     "requests>=2.32.3",
+ ]
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (207 Bytes)
src/__pycache__/app.cpython-310.pyc ADDED
Binary file (390 Bytes)
src/__pycache__/llm.cpython-310.pyc ADDED
Binary file (2.58 kB)
src/__pycache__/prompts.cpython-310.pyc ADDED
Binary file (1.27 kB)
src/__pycache__/scrape.cpython-310.pyc ADDED
Binary file (4.93 kB)
src/__pycache__/tools.cpython-310.pyc ADDED
Binary file (971 Bytes)
src/__pycache__/utils.cpython-310.pyc ADDED
Binary file (5.4 kB)
src/app.py ADDED
@@ -0,0 +1,14 @@
+ import gradio as gr
+
+ from src.llm import process_user_question
+
+ app = gr.Interface(
+     fn=process_user_question,
+     inputs="text",
+     outputs="text",
+     title="Bookstore Chatbot",
+     description="A simple chatbot interface. Ask me anything about the books!",
+ )
+
+ # Launch the app
+ app.launch()
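
The Dockerfile sets `GRADIO_SERVER_NAME="0.0.0.0"` so the interface is reachable from outside the container; Gradio reads this environment variable when `launch()` is called. A minimal sketch of the equivalent explicit form (`server_name` is a standard `launch()` parameter; the `echo` function is a made-up stand-in for `process_user_question`):

```python
import os

import gradio as gr


def echo(text: str) -> str:
    # Stand-in for process_user_question, just for illustration.
    return text


demo = gr.Interface(fn=echo, inputs="text", outputs="text")
# Passing server_name explicitly is equivalent to setting GRADIO_SERVER_NAME.
demo.launch(server_name=os.environ.get("GRADIO_SERVER_NAME", "127.0.0.1"))
```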
src/llm.py ADDED
@@ -0,0 +1,93 @@
+ import os
+
+ import openai
+
+ from src.prompts import (
+     ORDER_SYSTEM_MESSAGE,
+ )
+ from src.tools import (
+     books_dataset_tools,
+     llm_tools_map,
+ )
+ from src.utils import create_logger, handle_function_calls, log_execution_time
+
+ TEMPERATURE = 0
+ MAX_COMPLETION_TOKENS = 2048
+
+ logger = create_logger(logger_name="llm", log_file="api.log", log_level="info")
+
+ model = os.environ.get("CHAT_MODEL")
+ if not model:
+     logger.error("CHAT_MODEL environment variable is not set.")
+     raise ValueError("CHAT_MODEL environment variable is not set.")
+
+ client = openai.OpenAI()
+
+
+ @log_execution_time(logger=logger)
+ def create_completion(
+     thread: list,
+     system_message: str = ORDER_SYSTEM_MESSAGE,
+     tools: list = books_dataset_tools,
+ ):
+     """
+     Creates a completion response from the language model based on the system and user messages.
+
+     Args:
+         thread (list): The conversation thread containing all messages.
+         system_message (str): The system message to be included in the prompt.
+         tools (list): A list of tools available for the LLM to use.
+
+     Returns:
+         The response message generated by the LLM.
+
+     Raises:
+         Exception: If the LLM fails to generate a response.
+     """
+     try:
+         logger.debug(f" create_completion | {system_message = }\n\n{thread = }")
+         completion = client.chat.completions.create(
+             messages=[
+                 {"role": "system", "content": system_message},
+                 *thread,
+             ],
+             model=model,
+             n=1,
+             temperature=TEMPERATURE,
+             max_tokens=MAX_COMPLETION_TOKENS,
+             tools=tools,
+             tool_choice="auto",
+         )
+
+         logger.debug(f" create_completion | {completion = }")
+         response = completion.choices[0].message
+         return response
+     except Exception as e:
+         logger.error(
+             f" create_completion | Error generating responses for chat thread '{thread}': {e}"
+         )
+         raise
+
+
+ @log_execution_time(logger=logger)
+ def process_user_question(user_question: str) -> str:
+     """Handles user questions using an LLM."""
+     thread = [{"role": "user", "content": user_question}]
+     try:
+         while True:
+             logger.info(f"Processing user input: {thread}")
+             response = create_completion(thread=thread, tools=books_dataset_tools)
+             if not response.tool_calls:
+                 break
+             thread.append(response)
+             thread = handle_function_calls(
+                 function_map=llm_tools_map,
+                 response_message=response,
+                 thread=thread,
+             )
+         return response.content
+     except Exception as e:
+         logger.error(
+             f"Error generating responses for user input:\n'{user_question}'\n\nError:\n{e}"
+         )
+         raise
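
`process_user_question` implements a standard tool-calling loop: call the model, execute any tool calls and append their results to the thread, and exit once a response arrives with no tool calls. A hypothetical invocation (assumes `OPENAI_API_KEY` and `CHAT_MODEL` are set and that `data/books_data.db` was produced by `src/scrape.py`; the question text is made up):

```python
from src.llm import process_user_question

# The model may issue one or more query_books_database calls before answering.
answer = process_user_question("Which Travel books cost less than 20 pounds?")
print(answer)
```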
src/prompts.py ADDED
@@ -0,0 +1,17 @@
+ from textwrap import dedent
+
+ ORDER_SYSTEM_MESSAGE = dedent("""\
+     You are an online bookstore AI assistant that helps users with their queries.
+     Your responses should be eloquent, concise and succinct.
+     You can use the provided tools to get relevant information that helps you assist the user.
+     One of these tools, `query_books_database`, gives you access to a SQL database. You can use this tool to run SQL queries against it.
+     If there are no entries for the given SQL query, the tool returns an empty list (`[]`).
+     This means there are no books for the given conditions. Your response to the user question should reflect this.
+     The final answer should directly answer the user's question as well as cite the sources used for generating the answer.
+     Citing a source could be, for example, listing the SQL query (or queries) used to get the results.
+     The final response should be in the following format:
+     # Answer:
+     <answer goes here>
+     # References:
+     <references go here>
+     """)
src/scrape.py ADDED
@@ -0,0 +1,224 @@
+ import re
+ import sqlite3
+ import time
+ from typing import Dict, List, Union
+
+ import requests
+ from bs4 import BeautifulSoup
+
+
+ def get_total_pages_and_products(base_url: str) -> tuple[int, int]:
+     """
+     Dynamically determine the total number of pages and products from the home page.
+
+     Args:
+         base_url (str): The URL of the home page.
+
+     Returns:
+         tuple[int, int]: A tuple containing the total number of pages and the total number of products.
+     """
+     response = requests.get(base_url)
+     soup = BeautifulSoup(response.content, "html.parser")
+
+     # Find the results summary text
+     form = soup.find("form", class_="form-horizontal")
+     results_text = form.get_text(strip=True) if form else ""
+
+     # Extract the total number of products
+     match = re.search(r"(\d+)\s*results", results_text)
+     total_products = int(match.group(1)) if match else 0
+
+     # Find the page summary text
+     page_text_elem = soup.find("li", class_="current")
+     page_text = page_text_elem.text.strip() if page_text_elem else ""
+
+     # Extract the total number of pages
+     match = re.search(r"Page \d+ of (\d+)", page_text)
+     total_pages = int(match.group(1)) if match else 0
+
+     return total_pages, total_products
+
+
+ def scrape_book_details(book_url: str) -> tuple[str, str, int]:
+     """
+     Scrape detailed information for a specific book.
+
+     Args:
+         book_url (str): The URL of the book's detail page.
+
+     Returns:
+         tuple[str, str, int]: A tuple containing the book's description, category, and stock quantity.
+     """
+     response = requests.get(book_url)
+     soup = BeautifulSoup(response.content, "html.parser")
+
+     # Extract the description
+     description_elem = soup.find("div", id="product_description")
+     description = (
+         description_elem.find_next("p").text if description_elem else "No description"
+     )
+
+     # Extract the category from the breadcrumb trail
+     breadcrumb = soup.find("ul", class_="breadcrumb")
+     category = (
+         breadcrumb.find_all("a")[2].text
+         if breadcrumb and len(breadcrumb.find_all("a")) > 2
+         else "Unknown"
+     )
+
+     # Extract the stock quantity
+     availability_elem = soup.find("p", class_="instock availability")
+     stock_text = availability_elem.text.strip() if availability_elem else ""
+     match = re.search(r"In stock \((\d+) available\)", stock_text)
+     stock_quantity = int(match.group(1)) if match else 0
+
+     return description, category, stock_quantity
+
+
+ def scrape_book_page(url: str) -> List[Dict[str, Union[str, float, int]]]:
+     """
+     Scrape details for books on a single page.
+
+     Args:
+         url (str): The URL of the page to scrape.
+
+     Returns:
+         list[dict[str, str | float | int]]: A list of dictionaries containing book details.
+     """
+     response = requests.get(url)
+     soup = BeautifulSoup(response.content, "html.parser")
+
+     books = []
+     book_elements = soup.find_all("article", class_="product_pod")
+
+     for book in book_elements:
+         # Basic book information
+         title = book.h3.a["title"]
+         price = book.find("p", class_="price_color").text[1:]  # Remove the £ symbol
+
+         # Get the star rating
+         star_class = book.find("p", class_="star-rating")["class"][1]
+         rating_map = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}
+         star_rating = rating_map.get(star_class, 0)
+
+         # Get availability
+         availability = book.find("p", class_="instock availability").text.strip()
+
+         # Get the book page URL to scrape more details
+         book_page_url = "https://books.toscrape.com/catalogue/" + book.h3.a[
+             "href"
+         ].replace("../", "")
+
+         # Scrape additional details
+         description, category, quantity = scrape_book_details(book_page_url)
+
+         books.append(
+             {
+                 "title": title,
+                 "price": float(price),
+                 "star_rating": star_rating,
+                 "availability": availability,
+                 "description": description,
+                 "category": category,
+                 "quantity": quantity,
+             }
+         )
+
+     return books
+
+
+ def scrape_all_books(
+     base_url: str, page_url_template: str
+ ) -> List[Dict[str, Union[str, float, int]]]:
+     """
+     Scrape books from all pages.
+
+     Args:
+         base_url (str): The base URL of the website.
+         page_url_template (str): The URL template for pagination.
+
+     Returns:
+         list[dict[str, str | float | int]]: A list of dictionaries containing all scraped book details.
+     """
+     total_pages, total_products = get_total_pages_and_products(base_url)
+
+     print(f"Total Products: {total_products}")
+     print(f"Total Pages: {total_pages}")
+
+     all_books = []
+
+     for page_num in range(1, total_pages + 1):
+         url = page_url_template.format(page_num)
+         print(f"Scraping page {page_num}")
+
+         try:
+             page_books = scrape_book_page(url)
+             all_books.extend(page_books)
+             time.sleep(0.1)  # Polite scraping: add a delay between requests
+         except Exception as e:
+             print(f"Error scraping page {page_num}: {e}")
+
+     return all_books
+
+
+ def save_to_sqlite(data: List[Dict[str, Union[str, float, int]]], db_path: str) -> None:
+     """
+     Save the scraped data to an SQLite database.
+
+     Args:
+         data (list[dict[str, str | float | int]]): The data to save.
+         db_path (str): The path to the SQLite database file.
+     """
+     conn = sqlite3.connect(db_path)
+     cursor = conn.cursor()
+
+     # Create the table
+     cursor.execute("""
+         CREATE TABLE IF NOT EXISTS books (
+             id INTEGER PRIMARY KEY AUTOINCREMENT,
+             title TEXT NOT NULL,
+             price REAL NOT NULL,
+             star_rating INTEGER NOT NULL,
+             availability TEXT NOT NULL,
+             description TEXT NOT NULL,
+             category TEXT NOT NULL,
+             quantity INTEGER NOT NULL
+         )
+     """)
+
+     # Insert the data
+     for book in data:
+         cursor.execute(
+             """
+             INSERT INTO books (title, price, star_rating, availability, description, category, quantity)
+             VALUES (?, ?, ?, ?, ?, ?, ?)
+             """,
+             (
+                 book["title"],
+                 book["price"],
+                 book["star_rating"],
+                 book["availability"],
+                 book["description"],
+                 book["category"],
+                 book["quantity"],
+             ),
+         )
+
+     conn.commit()
+     conn.close()
+     print(f"Data saved to {db_path}")
+
+
+ def main() -> None:
+     base_url = "https://books.toscrape.com/index.html"
+     page_url_template = "https://books.toscrape.com/catalogue/page-{0}.html"
+
+     books_data = scrape_all_books(base_url, page_url_template)
+
+     save_to_sqlite(books_data, "data/books_data.db")
+
+     print(f"Scraped {len(books_data)} books.")
+
+
+ if __name__ == "__main__":
+     main()
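
Once the scraper has run, the `books` table can be inspected directly with the standard-library `sqlite3` module. A quick sanity-check sketch (assumes the default `data/books_data.db` path used in `main()`):

```python
import sqlite3

# Open read-only so a typo here can't modify the scraped data.
con = sqlite3.connect("file:data/books_data.db?mode=ro", uri=True)
for title, price, category in con.execute(
    "SELECT title, price, category FROM books LIMIT 5"
):
    print(f"{title!r} | £{price:.2f} | {category}")
con.close()
```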
src/tools.py ADDED
@@ -0,0 +1,25 @@
+ from src import utils
+
+ books_dataset_tools = [
+     {
+         "type": "function",
+         "function": {
+             "name": "query_books_database",
+             "description": "Queries the books database using SQL. Returns an empty list if there are no entries for the provided query. The database has the following table:```sql\nCREATE TABLE IF NOT EXISTS books (\n    id INTEGER PRIMARY KEY AUTOINCREMENT,\n    title TEXT NOT NULL,\n    price REAL NOT NULL,\n    star_rating INTEGER NOT NULL,\n    availability TEXT NOT NULL,\n    description TEXT NOT NULL,\n    category TEXT NOT NULL,\n    quantity INTEGER NOT NULL\n)\n```",
+             "parameters": {
+                 "type": "object",
+                 "properties": {
+                     "sql_query": {
+                         "type": "string",
+                         "description": "The SQL query to run against the SQLite database. Must use SQLite syntax.",
+                     }
+                 },
+                 "required": ["sql_query"],
+             },
+         },
+     },
+ ]
+
+ llm_tools_map = {
+     "query_books_database": utils.query_books_database,
+ }
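
The schema in `books_dataset_tools` is what the model sees; `llm_tools_map` is what the app actually executes. A sketch of the dispatch that `handle_function_calls` performs, using made-up arguments in the shape the Chat Completions API returns (`tool_call.function.arguments` is a JSON string; assumes `data/books_data.db` exists):

```python
import json

from src.tools import llm_tools_map

# What the API would hand back in tool_call.function.arguments:
raw_arguments = json.dumps({"sql_query": "SELECT title, price FROM books LIMIT 3"})

# Resolve the function by name and call it with the decoded arguments.
fn = llm_tools_map["query_books_database"]
rows = fn(**json.loads(raw_arguments))
print(rows)
```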
src/utils.py ADDED
@@ -0,0 +1,131 @@
+ import json
+ import logging
+ import sqlite3
+ import time
+ from contextlib import closing
+
+
+ def query_books_database(
+     sql_query: str, db_url: str = "data/books_data.db"
+ ) -> list[dict]:
+     """
+     Execute a read-only SQL query on the books database and return the results.
+
+     Args:
+         sql_query (str): SQL query string to execute.
+         db_url (str): Path to the database file, defaults to "data/books_data.db".
+
+     Returns:
+         list[dict[str, str | float | int]]: A list of rows as dictionaries.
+     """
+     with closing(sqlite3.connect(f"file:{db_url}?mode=ro", uri=True)) as connection:
+         connection.row_factory = lambda cursor, row: {
+             col[0]: row[i] for i, col in enumerate(cursor.description)
+         }
+         with closing(connection.cursor()) as cursor:
+             rows = cursor.execute(sql_query).fetchall()
+             return rows
+
+
+ def handle_function_calls(
+     function_map: dict, response_message, thread: list
+ ) -> list:
+     """
+     Handle function tool calls and map them to actual function executions.
+
+     Arguments:
+         function_map (dict): A dictionary mapping function names to function objects.
+         response_message: The message containing tool call information.
+         thread (list): List to append results of function calls to.
+
+     Returns:
+         list: Updated list of messages.
+
+     Raises:
+         ValueError: If no tool calls are present in the response message.
+         KeyError: If a function mapping is not found.
+     """
+     if not response_message.tool_calls:
+         raise ValueError("No tool calls found in the response message.")
+
+     for tool_call in response_message.tool_calls:
+         function_name = tool_call.function.name
+         if function_name in function_map:
+             function_args = json.loads(tool_call.function.arguments)
+             print(f"Function arguments: {function_args}")
+
+             function_to_call = function_map[function_name]
+             try:
+                 function_response = function_to_call(**function_args)
+             except Exception as e:
+                 function_response = str(e)
+
+             thread.append(
+                 {
+                     "tool_call_id": tool_call.id,
+                     "role": "tool",
+                     "name": function_name,
+                     "content": str(function_response),
+                 }
+             )
+         else:
+             print(f"Function {function_name} not found.")
+             raise KeyError(f"Function {function_name} not found in function map.")
+     # Return only after every tool call in the message has been handled.
+     return thread
+
+
+ def create_logger(logger_name: str, log_file: str, log_level: str) -> logging.Logger:
+     """
+     Create and configure a logger with the specified name, log file, and log level.
+
+     Arguments:
+         logger_name (str): Name of the logger.
+         log_file (str): Path to the log file.
+         log_level (str): Logging level as a string (e.g., 'INFO', 'DEBUG').
+
+     Returns:
+         logging.Logger: Configured logger object.
+     """
+     LOG_FORMAT = "[%(asctime)s | %(name)s | %(levelname)s | %(funcName)s | %(message)s]"
+     log_level = getattr(logging, log_level.upper())
+
+     logger = logging.getLogger(logger_name)
+     logger.setLevel(logging.DEBUG)
+
+     file_handler = logging.FileHandler(log_file)
+     file_handler.setLevel(logging.DEBUG)
+     file_handler.setFormatter(logging.Formatter(LOG_FORMAT))
+     logger.addHandler(file_handler)
+
+     console_handler = logging.StreamHandler()
+     console_handler.setLevel(log_level)  # Honor the requested level on the console
+     console_handler.setFormatter(logging.Formatter(LOG_FORMAT))
+     logger.addHandler(console_handler)
+
+     return logger
+
+
+ def log_execution_time(logger: logging.Logger):
+     """
+     Decorator factory to log the execution time of a function using the given logger.
+
+     Arguments:
+         logger (logging.Logger): Logger object used for logging.
+
+     Returns:
+         function: A decorator that logs the execution time of the wrapped function.
+     """
+
+     def decorator(func):
+         def wrapper(*args, **kwargs):
+             start_time = time.time()
+             result = func(*args, **kwargs)
+             end_time = time.time()
+             execution_time = end_time - start_time
+             logger.info(f"Executing {func.__name__} took {execution_time:.4f} seconds")
+             return result
+
+         return wrapper
+
+     return decorator
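
`create_logger` and `log_execution_time` are designed to be used together, as `src/llm.py` does. A self-contained illustration (the logger name, log file, and function are arbitrary examples):

```python
from src.utils import create_logger, log_execution_time

logger = create_logger(logger_name="demo", log_file="demo.log", log_level="info")


@log_execution_time(logger=logger)
def slow_square(n: int) -> int:
    # Trivial workload; the decorator logs how long the call took.
    return n * n


slow_square(12)  # logs something like: Executing slow_square took 0.0000 seconds
```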
uv.lock ADDED
The diff for this file is too large to render.