Spaces:
Sleeping
Sleeping
Commit ·
60a49e6
1
Parent(s): afd6f8b
First commit
Browse files- Dockerfile +28 -0
- README.md +7 -9
- api.log +0 -0
- pyproject.toml +12 -0
- src/__init__.py +0 -0
- src/__pycache__/__init__.cpython-310.pyc +0 -0
- src/__pycache__/app.cpython-310.pyc +0 -0
- src/__pycache__/llm.cpython-310.pyc +0 -0
- src/__pycache__/prompts.cpython-310.pyc +0 -0
- src/__pycache__/scrape.cpython-310.pyc +0 -0
- src/__pycache__/tools.cpython-310.pyc +0 -0
- src/__pycache__/utils.cpython-310.pyc +0 -0
- src/app.py +14 -0
- src/llm.py +93 -0
- src/prompts.py +17 -0
- src/scrape.py +225 -0
- src/tools.py +25 -0
- src/utils.py +132 -0
- uv.lock +0 -0
Dockerfile
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
##############
# Base Image #
##############
FROM ghcr.io/astral-sh/uv:python3.10-bookworm-slim AS base

WORKDIR /app

# Install only third-party dependencies (cached layer): the lockfile and
# project manifest are bind-mounted so this layer is invalidated only when
# dependencies actually change, not on every source edit.
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=uv.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    uv sync --frozen --no-install-project

#####################
# Development Image #
#####################
FROM base AS development

# NOTE: the previous `COPY --from=base /app/.venv /app/.venv` was removed —
# this stage is built FROM base, so /app/.venv is already present and the
# copy was a no-op.

COPY . /app

# Now install the project itself on top of the pre-installed dependencies.
RUN uv sync --frozen

# Scrape the book catalogue at build time so the SQLite DB ships in the image.
RUN uv run python src/scrape.py

# Bind Gradio to all interfaces so the app is reachable from outside the container.
ENV GRADIO_SERVER_NAME="0.0.0.0"

CMD ["uv", "run", "python", "-m", "src.app"]
|
README.md
CHANGED
|
@@ -1,13 +1,11 @@
|
|
| 1 |
---
|
| 2 |
title: Bookstore Chatbot
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk:
|
| 7 |
-
sdk_version: 5.15.0
|
| 8 |
-
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
-
short_description:
|
| 11 |
---
|
| 12 |
-
|
| 13 |
-
|
|
|
|
| 1 |
---
|
| 2 |
title: Bookstore Chatbot
|
| 3 |
+
emoji: 👁
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: docker
|
|
|
|
|
|
|
| 7 |
pinned: false
|
| 8 |
+
short_description: A chatbot for an online bookstore
|
| 9 |
---
|
| 10 |
+
# store-data-chatbot
|
| 11 |
+
A chatbot to ask questions about products in online stores.
|
api.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
pyproject.toml
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "store-data-chatbot"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Add your description here"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.10"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"beautifulsoup4>=4.13.3",
|
| 9 |
+
"gradio>=5.15.0",
|
| 10 |
+
"openai>=1.61.1",
|
| 11 |
+
"requests>=2.32.3",
|
| 12 |
+
]
|
src/__init__.py
ADDED
|
File without changes
|
src/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (207 Bytes). View file
|
|
|
src/__pycache__/app.cpython-310.pyc
ADDED
|
Binary file (390 Bytes). View file
|
|
|
src/__pycache__/llm.cpython-310.pyc
ADDED
|
Binary file (2.58 kB). View file
|
|
|
src/__pycache__/prompts.cpython-310.pyc
ADDED
|
Binary file (1.27 kB). View file
|
|
|
src/__pycache__/scrape.cpython-310.pyc
ADDED
|
Binary file (4.93 kB). View file
|
|
|
src/__pycache__/tools.cpython-310.pyc
ADDED
|
Binary file (971 Bytes). View file
|
|
|
src/__pycache__/utils.cpython-310.pyc
ADDED
|
Binary file (5.4 kB). View file
|
|
|
src/app.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import gradio as gr

from src.llm import process_user_question

# Simple single-turn UI: one text box in, one text box out.
app = gr.Interface(
    fn=process_user_question,
    inputs="text",
    outputs="text",
    title="Bookstore Chatbot",
    description="A simple chatbot interface. Ask me anything about the books!",
)

# Launch only when executed as the entry point (`python -m src.app`), so that
# importing this module (e.g. from tests) does not start a server as a side effect.
if __name__ == "__main__":
    app.launch()
|
src/llm.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os

import openai

from src.prompts import (
    ORDER_SYSTEM_MESSAGE,
)
from src.tools import (
    books_dataset_tools,
    llm_tools_map,
)
from src.utils import create_logger, handle_function_calls, log_execution_time

# Deterministic sampling and a fixed output budget for every completion call.
TEMPERATURE = 0
MAX_COMPLETION_TOKENS = 2048

# Module-level logger: DEBUG detail goes to api.log, INFO and above to console.
logger = create_logger(logger_name="llm", log_file="api.log", log_level="info")

# Fail fast at import time if the chat model is not configured — every
# completion call below depends on `model` being a valid model name.
model = os.environ.get("CHAT_MODEL")
if not model:
    logger.error("CHAT_MODEL environment variable is not set.")
    raise ValueError("CHAT_MODEL environment variable is not set.")

# Client credentials/endpoint come from the environment (OPENAI_API_KEY, and
# presumably OPENAI_BASE_URL for non-OpenAI backends — confirm deployment config).
client = openai.OpenAI()
@log_execution_time(logger=logger)
def create_completion(
    thread: list,
    system_message: str = ORDER_SYSTEM_MESSAGE,
    tools: list = books_dataset_tools,
):
    """
    Creates a completion response from the language model based on the system
    message and the conversation thread.

    Args:
        thread (list): The conversation thread containing all messages so far.
        system_message (str): The system message prepended to the thread.
        tools (list): Tool definitions the LLM is allowed to call.

    Returns:
        The assistant message of the first (only) choice; it may carry
        `tool_calls` instead of, or in addition to, text content.

    Raises:
        Exception: Re-raises any error from the completions API after logging it.
    """
    try:
        logger.debug(f" create_completion |{system_message = }\n\n{thread = }")
        completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_message},
                *thread,
            ],
            model=model,
            n=1,
            temperature=TEMPERATURE,
            max_tokens=MAX_COMPLETION_TOKENS,
            tools=tools,
            # Let the model decide whether to answer directly or call a tool.
            tool_choice="auto",
        )

        logger.debug(f" create_completion | {completion = }")
        # n=1, so the single choice's message is the whole result.
        response = completion.choices[0].message
        return response
    except Exception as e:
        logger.error(
            f" create_completion | Error generating responses for chat thread '{thread}': {e}"
        )
        raise
| 70 |
+
|
| 71 |
+
|
| 72 |
+
@log_execution_time(logger=logger)
def process_user_question(user_question: str) -> str:
    """
    Handles a user question using the LLM, resolving tool calls in a loop.

    Each round asks the model for a completion; if the model requests tool
    calls, they are executed and their results appended to the thread, and the
    model is queried again. The loop ends when the model answers without tools.

    Args:
        user_question (str): The raw question typed by the user.

    Returns:
        str: The model's final text answer.

    Raises:
        RuntimeError: If the model keeps requesting tools past the safety cap.
        Exception: Any error from completion or tool execution, after logging.
    """
    # Safety cap so a misbehaving model cannot drive an infinite tool loop.
    max_tool_rounds = 10
    thread = [{"role": "user", "content": user_question}]
    try:
        for _ in range(max_tool_rounds):
            logger.info(f"Processing user input: {thread}")
            response = create_completion(thread=thread, tools=books_dataset_tools)
            if not response.tool_calls:
                return response.content
            # Record the assistant turn that requested the tools, then the
            # tool results, so the next completion sees the full exchange.
            thread.append(response)
            thread = handle_function_calls(
                function_map=llm_tools_map,
                response_message=response,
                thread=thread,
            )
        raise RuntimeError(
            f"LLM did not produce a final answer within {max_tool_rounds} tool rounds."
        )
    except Exception as e:
        logger.error(
            f"Error generating responses for user input:\n'{user_question}'\n\nError:\n{e}"
        )
        raise
|
src/prompts.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from textwrap import dedent

# System prompt for the bookstore assistant. It fixes the assistant's tone,
# explains the `query_books_database` tool (including the exact sentinel text
# returned for empty result sets, which the implementation in src/utils.py
# must keep in sync), and mandates the "Answer / References" output format.
ORDER_SYSTEM_MESSAGE = dedent("""\
You are an online bookstore AI assistant that helps users with their queries.
Your responses should be eloquent, concise and succinct.
You can use the provided tools to get relevant information that help you in assisting the user.
One of these tools, `query_books_database`, gives you access to a SQL database. You can use this tool to run SQL queries against it.
If there are no entries for the given SQL query, the tool responds with "No rows found for this sql query. This usually means there are no entries with the specified conditions.".
This means there are no books for the given conditions. Your response to the user question should reflect this.
The final answer should directly answer the user's question as well as cite the sources used for generating the answer.
Citing a source could be, for example, listing the SQL query (or queries) used to get the results.
The final response should be in the following format:
# Answer:
<answer goes here>
# References:
<references go here>
""")
|
src/scrape.py
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import sqlite3
|
| 3 |
+
import time
|
| 4 |
+
from typing import Dict, List, Union
|
| 5 |
+
|
| 6 |
+
import requests
|
| 7 |
+
from bs4 import BeautifulSoup
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def get_total_pages_and_products(base_url: str) -> tuple[int, int]:
    """
    Dynamically determine the total number of pages and products from the home page.

    Args:
        base_url (str): The URL of the home page.

    Returns:
        tuple[int, int]: A tuple containing the total number of pages and total
        number of products. Either value falls back to 0 when the expected
        markup is missing or has changed.
    """
    # Timeout prevents the scraper from hanging indefinitely on a stalled connection.
    response = requests.get(base_url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    # The results summary (e.g. "1000 results") lives inside the search form.
    form = soup.find("form", class_="form-horizontal")
    results_text = form.get_text(strip=True) if form else ""

    # Extract total products
    match = re.search(r"(\d+)\s*results", results_text)
    total_products = int(match.group(1)) if match else 0

    # The pager text (e.g. "Page 1 of 50") lives in the li.current element.
    page_text_elem = soup.find("li", class_="current")
    page_text = page_text_elem.text.strip() if page_text_elem else ""

    # Extract total pages
    match = re.search(r"Page \d+ of (\d+)", page_text)
    total_pages = int(match.group(1)) if match else 0

    return total_pages, total_products
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def scrape_book_details(book_url: str) -> tuple[str, str, int]:
    """
    Scrape detailed information for a specific book.

    Args:
        book_url (str): The URL of the book's detail page.

    Returns:
        tuple[str, str, int]: The book's description ("No description" if
        absent), category ("Unknown" if the breadcrumb is missing), and stock
        quantity (0 when not parseable).
    """
    # Timeout prevents the scraper from hanging indefinitely on a stalled connection.
    response = requests.get(book_url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    # Description is the first <p> following the #product_description heading.
    description_elem = soup.find("div", id="product_description")
    description = (
        description_elem.find_next("p").text if description_elem else "No description"
    )

    # Category is the third breadcrumb link: Home > Books > <category> > <title>.
    breadcrumb = soup.find("ul", class_="breadcrumb")
    category = (
        breadcrumb.find_all("a")[2].text
        if breadcrumb and len(breadcrumb.find_all("a")) > 2
        else "Unknown"
    )

    # Stock quantity, e.g. "In stock (22 available)".
    availability_elem = soup.find("p", class_="instock availability")
    stock_text = availability_elem.text.strip() if availability_elem else ""
    match = re.search(r"In stock \((\d+) available\)", stock_text)
    stock_quantity = int(match.group(1)) if match else 0

    return description, category, stock_quantity
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def scrape_book_page(url: str) -> List[Dict[str, Union[str, float, int]]]:
    """
    Scrape details for books on a single page.

    Args:
        url (str): The URL of the page to scrape.

    Returns:
        list[dict[str, str | float | int]]: A list of dictionaries containing book details.
    """
    # Timeout prevents the scraper from hanging indefinitely on a stalled connection.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    books = []
    book_elements = soup.find_all("article", class_="product_pod")

    for book in book_elements:
        # Basic book information
        title = book.h3.a["title"]

        # Parse the price numerically instead of stripping the first character:
        # when the currency symbol is mis-decoded (e.g. "Â£51.77"), `text[1:]`
        # still contains a non-numeric character and float() raises.
        price_text = book.find("p", class_="price_color").text
        price_match = re.search(r"\d+(?:\.\d+)?", price_text)
        price = float(price_match.group()) if price_match else 0.0

        # Star rating is encoded as the second CSS class, e.g. "star-rating Three".
        star_class = book.find("p", class_="star-rating")["class"][1]
        rating_map = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}
        star_rating = rating_map.get(star_class, 0)

        # Get availability
        availability = book.find("p", class_="instock availability").text.strip()

        # Get book page URL to scrape more details
        book_page_url = "https://books.toscrape.com/catalogue/" + book.h3.a[
            "href"
        ].replace("../", "")

        # Scrape additional details
        description, category, quantity = scrape_book_details(book_page_url)

        books.append(
            {
                "title": title,
                "price": price,
                "star_rating": star_rating,
                "availability": availability,
                "description": description,
                "category": category,
                "quantity": quantity,
            }
        )

    return books
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def scrape_all_books(
    base_url: str, page_url_template: str
) -> List[Dict[str, Union[str, float, int]]]:
    """
    Scrape books from every catalogue page, skipping pages that fail.

    Args:
        base_url (str): The base URL of the website.
        page_url_template (str): The URL template for pagination.

    Returns:
        list[dict[str, str | float | int]]: Details for every scraped book.
    """
    total_pages, total_products = get_total_pages_and_products(base_url)

    print(f"Total Products: {total_products}")
    print(f"Total Pages: {total_pages}")

    all_books: List[Dict[str, Union[str, float, int]]] = []

    for page_num in range(1, total_pages + 1):
        url = page_url_template.format(page_num)
        print(f"Scraping page {page_num}")

        try:
            all_books.extend(scrape_book_page(url))
            # Polite scraping: small delay between successive page requests.
            time.sleep(0.1)
        except Exception as e:
            # A single broken page should not abort the whole crawl.
            print(f"Error scraping page {page_num}: {e}")

    return all_books
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def save_to_sqlite(data: List[Dict[str, Union[str, float, int]]], db_path: str) -> None:
    """
    Save the scraped data to an SQLite database.

    Creates the parent directory and the `books` table if they do not exist,
    then inserts every record in one batch.

    Args:
        data (list[dict[str, str | float | int]]): The data to save.
        db_path (str): The path to the SQLite database file.
    """
    # sqlite3.connect does not create missing directories (e.g. "data/"),
    # so ensure the parent exists before opening the database.
    Path(db_path).parent.mkdir(parents=True, exist_ok=True)

    # Context manager guarantees the connection is closed even on error;
    # the commit is issued explicitly below.
    with closing(sqlite3.connect(db_path)) as conn:
        cursor = conn.cursor()

        # Create table
        cursor.execute("""
        CREATE TABLE IF NOT EXISTS books (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            title TEXT NOT NULL,
            price REAL NOT NULL,
            star_rating INTEGER NOT NULL,
            availability TEXT NOT NULL,
            description TEXT NOT NULL,
            category TEXT NOT NULL,
            quantity INTEGER NOT NULL
        )
        """)

        # Batch insert: one executemany round-trip instead of a Python-level
        # execute per row.
        cursor.executemany(
            """
            INSERT INTO books (title, price, star_rating, availability, description, category, quantity)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            [
                (
                    book["title"],
                    book["price"],
                    book["star_rating"],
                    book["availability"],
                    book["description"],
                    book["category"],
                    book["quantity"],
                )
                for book in data
            ],
        )

        conn.commit()
    print(f"Data saved to {db_path}")
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
def main() -> None:
    """Scrape the full books.toscrape.com catalogue and persist it to SQLite."""
    base_url = "https://books.toscrape.com/index.html"
    page_url_template = "https://books.toscrape.com/catalogue/page-{0}.html"
    # Single source of truth for the output path; the old final message
    # incorrectly claimed the data was saved to "books.db".
    db_path = "data/books_data.db"

    books_data = scrape_all_books(base_url, page_url_template)

    save_to_sqlite(books_data, db_path)

    print(f"Scraped {len(books_data)} books. Data saved to {db_path}")


if __name__ == "__main__":
    main()
|
src/tools.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from src import utils

# OpenAI function-calling tool schema exposed to the LLM. The description is
# model-facing text: it documents the table schema so the model can write
# valid SQLite against it. (Typos fixed: "Quries" -> "Queries"; the garbled
# "if no there are no entries" clause; the inaccurate claim of a `None` return.)
books_dataset_tools = [
    {
        "type": "function",
        "function": {
            "name": "query_books_database",
            "description": "Queries the books database using SQL. Indicates if there are no entries for the provided query. The database has the following table:```sql\nCREATE TABLE IF NOT EXISTS books (\n id INTEGER PRIMARY KEY AUTOINCREMENT,\n title TEXT NOT NULL,\n price REAL NOT NULL,\n star_rating INTEGER NOT NULL,\n availability TEXT NOT NULL,\n description TEXT NOT NULL,\n category TEXT NOT NULL,\n quantity INTEGER NOT NULL\n )\n```",
            "parameters": {
                "type": "object",
                "properties": {
                    "sql_query": {
                        "type": "string",
                        "description": "The sql query to run against the SQLite database. Must be in the SQLite format.",
                    }
                },
                "required": ["sql_query"],
            },
        },
    },
]

# Maps tool names (as declared above) to their Python implementations.
llm_tools_map = {
    "query_books_database": utils.query_books_database,
}
|
src/utils.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import logging
|
| 3 |
+
import os
|
| 4 |
+
import sqlite3
|
| 5 |
+
import time
|
| 6 |
+
from contextlib import closing
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def query_books_database(
    sql_query: str, db_url: str = "data/books_data.db"
) -> "list[dict] | str":
    """
    Execute a read-only SQL query on the books database and return the results.

    Args:
        sql_query (str): SQL query string to execute.
        db_url (str): Path to the database file, defaults to "data/books_data.db".

    Returns:
        list[dict] | str: A list of rows as dictionaries, or — when the query
        matches no rows — the explanatory message that the system prompt in
        src/prompts.py promises the LLM (previously this function returned an
        empty list, contradicting that prompt).
    """
    # mode=ro opens the database read-only: the SQL comes from the LLM, so the
    # tool must not be able to mutate the data.
    with closing(sqlite3.connect(f"file:{db_url}?mode=ro", uri=True)) as connection:
        # Return each row as a {column_name: value} dict instead of a tuple.
        connection.row_factory = lambda cursor, row: {
            col[0]: row[i] for i, col in enumerate(cursor.description)
        }
        with closing(connection.cursor()) as cursor:
            rows = cursor.execute(sql_query).fetchall()
    if not rows:
        # Exact sentinel text documented in ORDER_SYSTEM_MESSAGE — keep in sync.
        return (
            "No rows found for this sql query. This usually means there are "
            "no entries with the specified conditions."
        )
    return rows
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def handle_function_calls(
    function_map: dict, response_message, thread: list
) -> list:
    """
    Handle function tool calls and map them to actual function executions.

    Each tool call in the response is executed and its (stringified) result is
    appended to the thread as a "tool" message, so the LLM can see all results
    on the next completion round.

    Arguments:
        function_map (dict): A dictionary mapping function names to function objects.
        response_message: The assistant message containing tool call information.
        thread (list): List to append results of function calls.

    Returns:
        list: Updated list of messages with one "tool" entry per tool call.

    Raises:
        ValueError: If no tool calls are present in the response message.
        KeyError: If a function mapping is not found.
    """
    if not response_message.tool_calls:
        raise ValueError("No tool calls found in the response message.")

    for tool_call in response_message.tool_calls:
        function_name = tool_call.function.name
        if function_name not in function_map:
            print(f"Function {function_name} not found.")
            raise KeyError(f"Function {function_name} not found in function map.")

        function_args = json.loads(tool_call.function.arguments)
        print(f"Function arguments: {function_args}")

        function_to_call = function_map[function_name]
        try:
            function_response = function_to_call(**function_args)
        except Exception as e:
            # Surface tool errors to the LLM as the tool result instead of
            # crashing the whole chat turn.
            function_response = str(e)

        thread.append(
            {
                "tool_call_id": tool_call.id,
                "role": "tool",
                "name": function_name,
                "content": str(function_response),
            }
        )

    # BUG FIX: the return previously sat inside the loop, so only the first
    # tool call was executed and every subsequent one was silently dropped.
    return thread
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def create_logger(logger_name: str, log_file: str, log_level: str) -> logging.Logger:
    """
    Create and configure a logger with specified name, log file, and log level.

    The file handler always records at DEBUG so the log file keeps full detail;
    the console handler honours `log_level`. Safe to call repeatedly with the
    same name: handlers are attached only once.

    Arguments:
        logger_name (str): Name of the logger.
        log_file (str): Path to the log file.
        log_level (str): Console logging level as a string (e.g. 'INFO', 'DEBUG').

    Returns:
        logging.Logger: Configured logger object.
    """
    LOG_FORMAT = "[%(asctime)s | %(name)s | %(levelname)s | %(funcName)s | %(message)s]"
    console_level = getattr(logging, log_level.upper())

    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.DEBUG)

    # Guard against duplicate handlers (and thus duplicated log lines) when
    # this is called more than once for the same logger name.
    if not logger.handlers:
        file_handler = logging.FileHandler(log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(logging.Formatter(LOG_FORMAT))
        logger.addHandler(file_handler)

        console_handler = logging.StreamHandler()
        # BUG FIX: `log_level` was previously parsed but never applied — the
        # console handler was hard-coded to INFO regardless of the argument.
        console_handler.setLevel(console_level)
        console_handler.setFormatter(logging.Formatter(LOG_FORMAT))
        logger.addHandler(console_handler)

    return logger
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def log_execution_time(logger: logging.Logger):
    """
    Decorator factory to log the execution time of a function using a specified logger.

    Arguments:
        logger (logging.Logger): Logger object used for logging.

    Returns:
        function: A decorator that logs the execution time of the wrapped function.
    """
    # Local import keeps this block self-contained; functools is stdlib.
    from functools import wraps

    def decorator(func):
        @wraps(func)  # preserve __name__/__doc__ so logs and introspection stay accurate
        def wrapper(*args, **kwargs):
            # perf_counter is monotonic; time.time can jump with clock changes.
            start_time = time.perf_counter()
            result = func(*args, **kwargs)
            execution_time = time.perf_counter() - start_time
            logger.info(f"Executing {func.__name__} took {execution_time:.4f} seconds")
            return result

        return wrapper

    return decorator
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|