Spaces:
Sleeping
Sleeping
Initial app setup
Browse files- .DS_Store +0 -0
- .dockerignore +46 -0
- .env.example +10 -0
- .gitignore +48 -0
- Dockerfile +31 -0
- Makefile +23 -0
- README.md +119 -12
- aimakerspace/.DS_Store +0 -0
- aimakerspace/__init__.py +0 -0
- aimakerspace/__pycache__/__init__.cpython-311.pyc +0 -0
- aimakerspace/__pycache__/text_utils.cpython-311.pyc +0 -0
- aimakerspace/__pycache__/vectordatabase.cpython-311.pyc +0 -0
- aimakerspace/openai_utils/__init__.py +0 -0
- aimakerspace/openai_utils/__pycache__/__init__.cpython-311.pyc +0 -0
- aimakerspace/openai_utils/__pycache__/embedding.cpython-311.pyc +0 -0
- aimakerspace/openai_utils/chatmodel.py +0 -45
- aimakerspace/openai_utils/embedding.py +0 -59
- aimakerspace/openai_utils/prompts.py +0 -78
- aimakerspace/text_utils.py +0 -146
- aimakerspace/vectordatabase.py +0 -81
- app.py +295 -0
- chainlit.json +18 -0
- chainlit.md +28 -0
- docker-compose.yml +16 -0
- huggingface-space.yml +20 -0
- notebook_version/.DS_Store +0 -0
- notebook_version/{Evaluating_RAG_with_Ragas_(2025)_AI_Makerspace.ipynb → AB_Testing_RAG_Agent.ipynb} +0 -0
- requirements.txt +13 -0
- scripts/__init__.py +3 -0
- scripts/prepare_data.py +40 -0
.DS_Store
CHANGED
|
Binary files a/.DS_Store and b/.DS_Store differ
|
|
|
.dockerignore
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Git
|
| 2 |
+
.git
|
| 3 |
+
.gitignore
|
| 4 |
+
|
| 5 |
+
# Python
|
| 6 |
+
__pycache__/
|
| 7 |
+
*.py[cod]
|
| 8 |
+
*$py.class
|
| 9 |
+
*.so
|
| 10 |
+
.Python
|
| 11 |
+
env/
|
| 12 |
+
build/
|
| 13 |
+
develop-eggs/
|
| 14 |
+
dist/
|
| 15 |
+
downloads/
|
| 16 |
+
eggs/
|
| 17 |
+
.eggs/
|
| 18 |
+
lib/
|
| 19 |
+
lib64/
|
| 20 |
+
parts/
|
| 21 |
+
sdist/
|
| 22 |
+
var/
|
| 23 |
+
wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
|
| 28 |
+
# Virtual Environment
|
| 29 |
+
venv/
|
| 30 |
+
ENV/
|
| 31 |
+
.env
|
| 32 |
+
|
| 33 |
+
# IDE specific files
|
| 34 |
+
.idea/
|
| 35 |
+
.vscode/
|
| 36 |
+
*.swp
|
| 37 |
+
*.swo
|
| 38 |
+
|
| 39 |
+
# OS specific
|
| 40 |
+
.DS_Store
|
| 41 |
+
|
| 42 |
+
# Notebook version (keep in repo but not in Docker)
|
| 43 |
+
notebook_version/
|
| 44 |
+
|
| 45 |
+
# Ignore PDF data files (will be mounted at runtime)
|
| 46 |
+
data/*.pdf
|
.env.example
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# OpenAI API key
|
| 2 |
+
OPENAI_API_KEY=your_openai_api_key
|
| 3 |
+
|
| 4 |
+
# Optional backup key if primary is not set
|
| 5 |
+
OPENAI_API_KEY_BACKUP=
|
| 6 |
+
|
| 7 |
+
# Chainlit configuration
|
| 8 |
+
CHAINLIT_MAX_STEPS_HISTORY=10
|
| 9 |
+
CHAINLIT_AUTH_SECRET=
|
| 10 |
+
CHAINLIT_SERVER_URL=
|
.gitignore
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
build/
|
| 8 |
+
develop-eggs/
|
| 9 |
+
dist/
|
| 10 |
+
downloads/
|
| 11 |
+
eggs/
|
| 12 |
+
.eggs/
|
| 13 |
+
lib/
|
| 14 |
+
lib64/
|
| 15 |
+
parts/
|
| 16 |
+
sdist/
|
| 17 |
+
var/
|
| 18 |
+
wheels/
|
| 19 |
+
*.egg-info/
|
| 20 |
+
.installed.cfg
|
| 21 |
+
*.egg
|
| 22 |
+
|
| 23 |
+
# Environment and secrets
|
| 24 |
+
.env
|
| 25 |
+
.venv
|
| 26 |
+
venv/
|
| 27 |
+
ENV/
|
| 28 |
+
env/
|
| 29 |
+
.env.local
|
| 30 |
+
.env.development.local
|
| 31 |
+
.env.test.local
|
| 32 |
+
.env.production.local
|
| 33 |
+
|
| 34 |
+
# Chainlit
|
| 35 |
+
.chainlit/
|
| 36 |
+
.chainlit/cache/
|
| 37 |
+
|
| 38 |
+
# IDE specific files
|
| 39 |
+
.idea/
|
| 40 |
+
.vscode/
|
| 41 |
+
*.swp
|
| 42 |
+
*.swo
|
| 43 |
+
|
| 44 |
+
# Mac OS
|
| 45 |
+
.DS_Store
|
| 46 |
+
|
| 47 |
+
# Local data
|
| 48 |
+
/data/*.pdf
|
Dockerfile
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
# Install system dependencies
|
| 6 |
+
RUN apt-get update && apt-get install -y \
|
| 7 |
+
build-essential \
|
| 8 |
+
python3-dev \
|
| 9 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 10 |
+
|
| 11 |
+
# Copy requirements file
|
| 12 |
+
COPY requirements.txt .
|
| 13 |
+
|
| 14 |
+
# Install Python dependencies
|
| 15 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 16 |
+
|
| 17 |
+
# Copy application code
|
| 18 |
+
COPY . .
|
| 19 |
+
|
| 20 |
+
# Create data directory for PDFs
|
| 21 |
+
RUN mkdir -p data
|
| 22 |
+
|
| 23 |
+
# Expose port for Chainlit
|
| 24 |
+
EXPOSE 8000
|
| 25 |
+
|
| 26 |
+
# Set environment variables
|
| 27 |
+
ENV PYTHONPATH=/app
|
| 28 |
+
ENV OPENAI_API_KEY=${OPENAI_API_KEY}
|
| 29 |
+
|
| 30 |
+
# Command to run the application
|
| 31 |
+
CMD ["chainlit", "run", "app.py", "--port", "8000", "--host", "0.0.0.0"]
|
Makefile
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.PHONY: setup run docker-build docker-run clean
|
| 2 |
+
|
| 3 |
+
setup:
|
| 4 |
+
python -m pip install -r requirements.txt
|
| 5 |
+
|
| 6 |
+
run:
|
| 7 |
+
chainlit run app.py
|
| 8 |
+
|
| 9 |
+
docker-build:
|
| 10 |
+
docker build -t ab-testing-rag-agent .
|
| 11 |
+
|
| 12 |
+
docker-run:
|
| 13 |
+
docker run -p 8000:8000 --env-file .env -v $(PWD)/data:/app/data ab-testing-rag-agent
|
| 14 |
+
|
| 15 |
+
docker-compose-up:
|
| 16 |
+
docker-compose up -d
|
| 17 |
+
|
| 18 |
+
docker-compose-down:
|
| 19 |
+
docker-compose down
|
| 20 |
+
|
| 21 |
+
clean:
|
| 22 |
+
rm -rf __pycache__
|
| 23 |
+
rm -rf .chainlit
|
README.md
CHANGED
|
@@ -1,12 +1,119 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AB Testing RAG Agent
|
| 2 |
+
|
| 3 |
+
An application that helps answer questions about AB Testing using a collection of PDF documents and a RAG (Retrieval Augmented Generation) approach. This agent was built using LangChain, OpenAI, Qdrant, and Chainlit.
|
| 4 |
+
|
| 5 |
+
## Features
|
| 6 |
+
|
| 7 |
+
- **Document Retrieval**: Retrieves relevant information from AB Testing documents
|
| 8 |
+
- **Query Rephrasing**: Improves retrieval by rephrasing your query for better results
|
| 9 |
+
- **Source References**: Shows the exact document sources used to answer your question
|
| 10 |
+
- **Streaming Interface**: See the response as it's being generated
|
| 11 |
+
|
| 12 |
+
## Prerequisites
|
| 13 |
+
|
| 14 |
+
- Python 3.9+
|
| 15 |
+
- Docker and Docker Compose (optional)
|
| 16 |
+
- OpenAI API key
|
| 17 |
+
|
| 18 |
+
## Setup
|
| 19 |
+
|
| 20 |
+
### Local Development
|
| 21 |
+
|
| 22 |
+
1. Clone the repository
|
| 23 |
+
```bash
|
| 24 |
+
git clone https://github.com/yourusername/AB_Testing_RAG_Agent.git
|
| 25 |
+
cd AB_Testing_RAG_Agent
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
2. Create a virtual environment
|
| 29 |
+
```bash
|
| 30 |
+
python -m venv venv
|
| 31 |
+
source venv/bin/activate # On Windows: venv\Scripts\activate
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
3. Install dependencies
|
| 35 |
+
```bash
|
| 36 |
+
pip install -r requirements.txt
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
4. Create a `.env` file from the example
|
| 40 |
+
```bash
|
| 41 |
+
cp .env.example .env
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
5. Add your OpenAI API key to the `.env` file
|
| 45 |
+
```bash
|
| 46 |
+
OPENAI_API_KEY=your_openai_api_key
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
6. Add PDF files to the `data` directory
|
| 50 |
+
```bash
|
| 51 |
+
mkdir -p data
|
| 52 |
+
# Copy your AB Testing PDFs to the data directory
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
7. Run the application
|
| 56 |
+
```bash
|
| 57 |
+
chainlit run app.py
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
8. Open your browser to `http://localhost:8000`
|
| 61 |
+
|
| 62 |
+
### Using Docker
|
| 63 |
+
|
| 64 |
+
1. Clone the repository
|
| 65 |
+
```bash
|
| 66 |
+
git clone https://github.com/yourusername/AB_Testing_RAG_Agent.git
|
| 67 |
+
cd AB_Testing_RAG_Agent
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
2. Create a `.env` file from the example
|
| 71 |
+
```bash
|
| 72 |
+
cp .env.example .env
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
3. Add your OpenAI API key to the `.env` file
|
| 76 |
+
```bash
|
| 77 |
+
OPENAI_API_KEY=your_openai_api_key
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
4. Add PDF files to the `data` directory
|
| 81 |
+
```bash
|
| 82 |
+
mkdir -p data
|
| 83 |
+
# Copy your AB Testing PDFs to the data directory
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
5. Build and run with Docker Compose
|
| 87 |
+
```bash
|
| 88 |
+
docker-compose up -d
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
6. Open your browser to `http://localhost:8000`
|
| 92 |
+
|
| 93 |
+
## Deploying to Hugging Face Spaces
|
| 94 |
+
|
| 95 |
+
1. Create a new Hugging Face Space with Docker deployment
|
| 96 |
+
2. Set the following environment variables in your Space settings:
|
| 97 |
+
- `OPENAI_API_KEY`: Your OpenAI API key
|
| 98 |
+
3. Push your code to the Space's repository
|
| 99 |
+
|
| 100 |
+
## How It Works
|
| 101 |
+
|
| 102 |
+
1. The application loads and processes PDF documents using LangChain's document loaders
|
| 103 |
+
2. Documents are chunked and embedded using OpenAI's embedding model
|
| 104 |
+
3. Qdrant vector store is used for semantic search
|
| 105 |
+
4. When a user asks a question:
|
| 106 |
+
- The query is rephrased to be more specific for better retrieval
|
| 107 |
+
- Relevant document chunks are retrieved
|
| 108 |
+
- OpenAI's GPT model generates an answer based on the retrieved context
|
| 109 |
+
- Source references are tracked and displayed alongside the answer
|
| 110 |
+
|
| 111 |
+
## License
|
| 112 |
+
|
| 113 |
+
MIT
|
| 114 |
+
|
| 115 |
+
## Acknowledgements
|
| 116 |
+
|
| 117 |
+
- Based on the notebook version in `notebook_version/AB_Testing_RAG_Agent.ipynb`
|
| 118 |
+
- Thanks to the creators of LangChain, Chainlit, and Qdrant for their excellent tools
|
| 119 |
+
- Special thanks to all the AB Testing papers and documents used as the knowledge base
|
aimakerspace/.DS_Store
DELETED
|
Binary file (6.15 kB)
|
|
|
aimakerspace/__init__.py
DELETED
|
File without changes
|
aimakerspace/__pycache__/__init__.cpython-311.pyc
DELETED
|
Binary file (177 Bytes)
|
|
|
aimakerspace/__pycache__/text_utils.cpython-311.pyc
DELETED
|
Binary file (10.8 kB)
|
|
|
aimakerspace/__pycache__/vectordatabase.cpython-311.pyc
DELETED
|
Binary file (5.69 kB)
|
|
|
aimakerspace/openai_utils/__init__.py
DELETED
|
File without changes
|
aimakerspace/openai_utils/__pycache__/__init__.cpython-311.pyc
DELETED
|
Binary file (190 Bytes)
|
|
|
aimakerspace/openai_utils/__pycache__/embedding.cpython-311.pyc
DELETED
|
Binary file (8.09 kB)
|
|
|
aimakerspace/openai_utils/chatmodel.py
DELETED
|
@@ -1,45 +0,0 @@
|
|
| 1 |
-
from openai import OpenAI, AsyncOpenAI
|
| 2 |
-
from dotenv import load_dotenv
|
| 3 |
-
import os
|
| 4 |
-
|
| 5 |
-
load_dotenv()
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
class ChatOpenAI:
|
| 9 |
-
def __init__(self, model_name: str = "gpt-4o-mini"):
|
| 10 |
-
self.model_name = model_name
|
| 11 |
-
self.openai_api_key = os.getenv("OPENAI_API_KEY")
|
| 12 |
-
if self.openai_api_key is None:
|
| 13 |
-
raise ValueError("OPENAI_API_KEY is not set")
|
| 14 |
-
|
| 15 |
-
def run(self, messages, text_only: bool = True, **kwargs):
|
| 16 |
-
if not isinstance(messages, list):
|
| 17 |
-
raise ValueError("messages must be a list")
|
| 18 |
-
|
| 19 |
-
client = OpenAI()
|
| 20 |
-
response = client.chat.completions.create(
|
| 21 |
-
model=self.model_name, messages=messages, **kwargs
|
| 22 |
-
)
|
| 23 |
-
|
| 24 |
-
if text_only:
|
| 25 |
-
return response.choices[0].message.content
|
| 26 |
-
|
| 27 |
-
return response
|
| 28 |
-
|
| 29 |
-
async def astream(self, messages, **kwargs):
|
| 30 |
-
if not isinstance(messages, list):
|
| 31 |
-
raise ValueError("messages must be a list")
|
| 32 |
-
|
| 33 |
-
client = AsyncOpenAI()
|
| 34 |
-
|
| 35 |
-
stream = await client.chat.completions.create(
|
| 36 |
-
model=self.model_name,
|
| 37 |
-
messages=messages,
|
| 38 |
-
stream=True,
|
| 39 |
-
**kwargs
|
| 40 |
-
)
|
| 41 |
-
|
| 42 |
-
async for chunk in stream:
|
| 43 |
-
content = chunk.choices[0].delta.content
|
| 44 |
-
if content is not None:
|
| 45 |
-
yield content
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
aimakerspace/openai_utils/embedding.py
DELETED
|
@@ -1,59 +0,0 @@
|
|
| 1 |
-
from dotenv import load_dotenv
|
| 2 |
-
from openai import AsyncOpenAI, OpenAI
|
| 3 |
-
import openai
|
| 4 |
-
from typing import List
|
| 5 |
-
import os
|
| 6 |
-
import asyncio
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
class EmbeddingModel:
|
| 10 |
-
def __init__(self, embeddings_model_name: str = "text-embedding-3-small"):
|
| 11 |
-
load_dotenv()
|
| 12 |
-
self.openai_api_key = os.getenv("OPENAI_API_KEY")
|
| 13 |
-
self.async_client = AsyncOpenAI()
|
| 14 |
-
self.client = OpenAI()
|
| 15 |
-
|
| 16 |
-
if self.openai_api_key is None:
|
| 17 |
-
raise ValueError(
|
| 18 |
-
"OPENAI_API_KEY environment variable is not set. Please set it to your OpenAI API key."
|
| 19 |
-
)
|
| 20 |
-
openai.api_key = self.openai_api_key
|
| 21 |
-
self.embeddings_model_name = embeddings_model_name
|
| 22 |
-
|
| 23 |
-
async def async_get_embeddings(self, list_of_text: List[str]) -> List[List[float]]:
|
| 24 |
-
embedding_response = await self.async_client.embeddings.create(
|
| 25 |
-
input=list_of_text, model=self.embeddings_model_name
|
| 26 |
-
)
|
| 27 |
-
|
| 28 |
-
return [embeddings.embedding for embeddings in embedding_response.data]
|
| 29 |
-
|
| 30 |
-
async def async_get_embedding(self, text: str) -> List[float]:
|
| 31 |
-
embedding = await self.async_client.embeddings.create(
|
| 32 |
-
input=text, model=self.embeddings_model_name
|
| 33 |
-
)
|
| 34 |
-
|
| 35 |
-
return embedding.data[0].embedding
|
| 36 |
-
|
| 37 |
-
def get_embeddings(self, list_of_text: List[str]) -> List[List[float]]:
|
| 38 |
-
embedding_response = self.client.embeddings.create(
|
| 39 |
-
input=list_of_text, model=self.embeddings_model_name
|
| 40 |
-
)
|
| 41 |
-
|
| 42 |
-
return [embeddings.embedding for embeddings in embedding_response.data]
|
| 43 |
-
|
| 44 |
-
def get_embedding(self, text: str) -> List[float]:
|
| 45 |
-
embedding = self.client.embeddings.create(
|
| 46 |
-
input=text, model=self.embeddings_model_name
|
| 47 |
-
)
|
| 48 |
-
|
| 49 |
-
return embedding.data[0].embedding
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
if __name__ == "__main__":
|
| 53 |
-
embedding_model = EmbeddingModel()
|
| 54 |
-
print(asyncio.run(embedding_model.async_get_embedding("Hello, world!")))
|
| 55 |
-
print(
|
| 56 |
-
asyncio.run(
|
| 57 |
-
embedding_model.async_get_embeddings(["Hello, world!", "Goodbye, world!"])
|
| 58 |
-
)
|
| 59 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
aimakerspace/openai_utils/prompts.py
DELETED
|
@@ -1,78 +0,0 @@
|
|
| 1 |
-
import re
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
class BasePrompt:
|
| 5 |
-
def __init__(self, prompt):
|
| 6 |
-
"""
|
| 7 |
-
Initializes the BasePrompt object with a prompt template.
|
| 8 |
-
|
| 9 |
-
:param prompt: A string that can contain placeholders within curly braces
|
| 10 |
-
"""
|
| 11 |
-
self.prompt = prompt
|
| 12 |
-
self._pattern = re.compile(r"\{([^}]+)\}")
|
| 13 |
-
|
| 14 |
-
def format_prompt(self, **kwargs):
|
| 15 |
-
"""
|
| 16 |
-
Formats the prompt string using the keyword arguments provided.
|
| 17 |
-
|
| 18 |
-
:param kwargs: The values to substitute into the prompt string
|
| 19 |
-
:return: The formatted prompt string
|
| 20 |
-
"""
|
| 21 |
-
matches = self._pattern.findall(self.prompt)
|
| 22 |
-
return self.prompt.format(**{match: kwargs.get(match, "") for match in matches})
|
| 23 |
-
|
| 24 |
-
def get_input_variables(self):
|
| 25 |
-
"""
|
| 26 |
-
Gets the list of input variable names from the prompt string.
|
| 27 |
-
|
| 28 |
-
:return: List of input variable names
|
| 29 |
-
"""
|
| 30 |
-
return self._pattern.findall(self.prompt)
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
class RolePrompt(BasePrompt):
|
| 34 |
-
def __init__(self, prompt, role: str):
|
| 35 |
-
"""
|
| 36 |
-
Initializes the RolePrompt object with a prompt template and a role.
|
| 37 |
-
|
| 38 |
-
:param prompt: A string that can contain placeholders within curly braces
|
| 39 |
-
:param role: The role for the message ('system', 'user', or 'assistant')
|
| 40 |
-
"""
|
| 41 |
-
super().__init__(prompt)
|
| 42 |
-
self.role = role
|
| 43 |
-
|
| 44 |
-
def create_message(self, format=True, **kwargs):
|
| 45 |
-
"""
|
| 46 |
-
Creates a message dictionary with a role and a formatted message.
|
| 47 |
-
|
| 48 |
-
:param kwargs: The values to substitute into the prompt string
|
| 49 |
-
:return: Dictionary containing the role and the formatted message
|
| 50 |
-
"""
|
| 51 |
-
if format:
|
| 52 |
-
return {"role": self.role, "content": self.format_prompt(**kwargs)}
|
| 53 |
-
|
| 54 |
-
return {"role": self.role, "content": self.prompt}
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
class SystemRolePrompt(RolePrompt):
|
| 58 |
-
def __init__(self, prompt: str):
|
| 59 |
-
super().__init__(prompt, "system")
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
class UserRolePrompt(RolePrompt):
|
| 63 |
-
def __init__(self, prompt: str):
|
| 64 |
-
super().__init__(prompt, "user")
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
class AssistantRolePrompt(RolePrompt):
|
| 68 |
-
def __init__(self, prompt: str):
|
| 69 |
-
super().__init__(prompt, "assistant")
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
if __name__ == "__main__":
|
| 73 |
-
prompt = BasePrompt("Hello {name}, you are {age} years old")
|
| 74 |
-
print(prompt.format_prompt(name="John", age=30))
|
| 75 |
-
|
| 76 |
-
prompt = SystemRolePrompt("Hello {name}, you are {age} years old")
|
| 77 |
-
print(prompt.create_message(name="John", age=30))
|
| 78 |
-
print(prompt.get_input_variables())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
aimakerspace/text_utils.py
DELETED
|
@@ -1,146 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import re
|
| 3 |
-
from typing import List
|
| 4 |
-
import PyPDF2
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
class TextFileLoader:
|
| 8 |
-
def __init__(self, path: str, encoding: str = "utf-8"):
|
| 9 |
-
self.documents = []
|
| 10 |
-
self.path = path
|
| 11 |
-
self.encoding = encoding
|
| 12 |
-
|
| 13 |
-
def load(self):
|
| 14 |
-
if os.path.isdir(self.path):
|
| 15 |
-
self.load_directory()
|
| 16 |
-
else:
|
| 17 |
-
self.load_file()
|
| 18 |
-
|
| 19 |
-
def load_file(self):
|
| 20 |
-
with open(self.path, "r", encoding=self.encoding) as f:
|
| 21 |
-
self.documents.append(f.read())
|
| 22 |
-
|
| 23 |
-
def load_directory(self):
|
| 24 |
-
for root, _, files in os.walk(self.path):
|
| 25 |
-
for file in files:
|
| 26 |
-
if file.endswith(".txt"):
|
| 27 |
-
with open(
|
| 28 |
-
os.path.join(root, file), "r", encoding=self.encoding
|
| 29 |
-
) as f:
|
| 30 |
-
self.documents.append(f.read())
|
| 31 |
-
|
| 32 |
-
def load_documents(self):
|
| 33 |
-
self.load()
|
| 34 |
-
return self.documents
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
class CharacterTextSplitter:
|
| 38 |
-
def __init__(
|
| 39 |
-
self,
|
| 40 |
-
chunk_size: int = 1000,
|
| 41 |
-
chunk_overlap: int = 200,
|
| 42 |
-
):
|
| 43 |
-
assert (
|
| 44 |
-
chunk_size > chunk_overlap
|
| 45 |
-
), "Chunk size must be greater than chunk overlap"
|
| 46 |
-
|
| 47 |
-
self.chunk_size = chunk_size
|
| 48 |
-
self.chunk_overlap = chunk_overlap
|
| 49 |
-
|
| 50 |
-
def split(self, text: str) -> List[str]:
|
| 51 |
-
chunks = []
|
| 52 |
-
for i in range(0, len(text), self.chunk_size - self.chunk_overlap):
|
| 53 |
-
chunks.append(text[i : i + self.chunk_size])
|
| 54 |
-
return chunks
|
| 55 |
-
|
| 56 |
-
def split_texts(self, texts: List[str]) -> List[str]:
|
| 57 |
-
chunks = []
|
| 58 |
-
for text in texts:
|
| 59 |
-
chunks.extend(self.split(text))
|
| 60 |
-
return chunks
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
class PDFLoader:
|
| 64 |
-
def __init__(self, path: str):
|
| 65 |
-
self.documents = []
|
| 66 |
-
self.path = path
|
| 67 |
-
print(f"PDFLoader initialized with path: {self.path}")
|
| 68 |
-
|
| 69 |
-
def load(self):
|
| 70 |
-
print(f"Loading PDF from path: {self.path}")
|
| 71 |
-
print(f"Path exists: {os.path.exists(self.path)}")
|
| 72 |
-
print(f"Is file: {os.path.isfile(self.path)}")
|
| 73 |
-
print(f"Is directory: {os.path.isdir(self.path)}")
|
| 74 |
-
print(f"File permissions: {oct(os.stat(self.path).st_mode)[-3:]}")
|
| 75 |
-
|
| 76 |
-
try:
|
| 77 |
-
# Try to open the file first to verify access
|
| 78 |
-
with open(self.path, 'rb') as test_file:
|
| 79 |
-
pass
|
| 80 |
-
|
| 81 |
-
# If we can open it, proceed with loading
|
| 82 |
-
if os.path.isdir(self.path):
|
| 83 |
-
self.load_directory()
|
| 84 |
-
else:
|
| 85 |
-
self.load_file()
|
| 86 |
-
|
| 87 |
-
except IOError as e:
|
| 88 |
-
raise ValueError(f"Cannot access file at '{self.path}': {str(e)}")
|
| 89 |
-
except Exception as e:
|
| 90 |
-
raise ValueError(f"Error processing file at '{self.path}': {str(e)}")
|
| 91 |
-
|
| 92 |
-
def load_file(self):
|
| 93 |
-
with open(self.path, 'rb') as file:
|
| 94 |
-
# Create PDF reader object
|
| 95 |
-
pdf_reader = PyPDF2.PdfReader(file)
|
| 96 |
-
|
| 97 |
-
# Extract text from each page separately
|
| 98 |
-
self.documents = [] # Clear existing documents
|
| 99 |
-
for page_num, page in enumerate(pdf_reader.pages):
|
| 100 |
-
page_text = page.extract_text() or "" # Handle None returns
|
| 101 |
-
if page_text.strip(): # Only add non-empty pages
|
| 102 |
-
self.documents.append(page_text)
|
| 103 |
-
else:
|
| 104 |
-
print(f"Warning: Page {page_num + 1} is empty in {os.path.basename(self.path)}")
|
| 105 |
-
|
| 106 |
-
print(f"Loaded {len(self.documents)} pages from {os.path.basename(self.path)}")
|
| 107 |
-
|
| 108 |
-
def load_directory(self):
|
| 109 |
-
self.documents = [] # Clear existing documents
|
| 110 |
-
for root, _, files in os.walk(self.path):
|
| 111 |
-
for file in files:
|
| 112 |
-
if file.lower().endswith('.pdf'):
|
| 113 |
-
file_path = os.path.join(root, file)
|
| 114 |
-
try:
|
| 115 |
-
with open(file_path, 'rb') as f:
|
| 116 |
-
pdf_reader = PyPDF2.PdfReader(f)
|
| 117 |
-
|
| 118 |
-
# Extract text from each page separately
|
| 119 |
-
for page_num, page in enumerate(pdf_reader.pages):
|
| 120 |
-
page_text = page.extract_text() or ""
|
| 121 |
-
if page_text.strip():
|
| 122 |
-
self.documents.append(page_text)
|
| 123 |
-
else:
|
| 124 |
-
print(f"Warning: Page {page_num + 1} is empty in {file}")
|
| 125 |
-
except Exception as e:
|
| 126 |
-
print(f"Error processing {file}: {str(e)}")
|
| 127 |
-
|
| 128 |
-
def load_documents(self):
|
| 129 |
-
if not self.documents: # Only load if not already loaded
|
| 130 |
-
self.load()
|
| 131 |
-
return self.documents
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
if __name__ == "__main__":
|
| 135 |
-
loader = TextFileLoader("data/KingLear.txt")
|
| 136 |
-
loader.load()
|
| 137 |
-
splitter = CharacterTextSplitter()
|
| 138 |
-
chunks = splitter.split_texts(loader.documents)
|
| 139 |
-
print(len(chunks))
|
| 140 |
-
print(chunks[0])
|
| 141 |
-
print("--------")
|
| 142 |
-
print(chunks[1])
|
| 143 |
-
print("--------")
|
| 144 |
-
print(chunks[-2])
|
| 145 |
-
print("--------")
|
| 146 |
-
print(chunks[-1])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
aimakerspace/vectordatabase.py
DELETED
|
@@ -1,81 +0,0 @@
|
|
| 1 |
-
import numpy as np
|
| 2 |
-
from collections import defaultdict
|
| 3 |
-
from typing import List, Tuple, Callable
|
| 4 |
-
from aimakerspace.openai_utils.embedding import EmbeddingModel
|
| 5 |
-
import asyncio
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
def cosine_similarity(vector_a: np.array, vector_b: np.array) -> float:
|
| 9 |
-
"""Computes the cosine similarity between two vectors."""
|
| 10 |
-
dot_product = np.dot(vector_a, vector_b)
|
| 11 |
-
norm_a = np.linalg.norm(vector_a)
|
| 12 |
-
norm_b = np.linalg.norm(vector_b)
|
| 13 |
-
return dot_product / (norm_a * norm_b)
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
class VectorDatabase:
|
| 17 |
-
def __init__(self, embedding_model: EmbeddingModel = None):
|
| 18 |
-
self.vectors = defaultdict(np.array)
|
| 19 |
-
self.embedding_model = embedding_model or EmbeddingModel()
|
| 20 |
-
|
| 21 |
-
def insert(self, key: str, vector: np.array) -> None:
|
| 22 |
-
self.vectors[key] = vector
|
| 23 |
-
|
| 24 |
-
def search(
|
| 25 |
-
self,
|
| 26 |
-
query_vector: np.array,
|
| 27 |
-
k: int,
|
| 28 |
-
distance_measure: Callable = cosine_similarity,
|
| 29 |
-
) -> List[Tuple[str, float]]:
|
| 30 |
-
scores = [
|
| 31 |
-
(key, distance_measure(query_vector, vector))
|
| 32 |
-
for key, vector in self.vectors.items()
|
| 33 |
-
]
|
| 34 |
-
return sorted(scores, key=lambda x: x[1], reverse=True)[:k]
|
| 35 |
-
|
| 36 |
-
def search_by_text(
|
| 37 |
-
self,
|
| 38 |
-
query_text: str,
|
| 39 |
-
k: int,
|
| 40 |
-
distance_measure: Callable = cosine_similarity,
|
| 41 |
-
return_as_text: bool = False,
|
| 42 |
-
) -> List[Tuple[str, float]]:
|
| 43 |
-
query_vector = self.embedding_model.get_embedding(query_text)
|
| 44 |
-
results = self.search(query_vector, k, distance_measure)
|
| 45 |
-
return [result[0] for result in results] if return_as_text else results
|
| 46 |
-
|
| 47 |
-
def retrieve_from_key(self, key: str) -> np.array:
|
| 48 |
-
return self.vectors.get(key, None)
|
| 49 |
-
|
| 50 |
-
async def abuild_from_list(self, list_of_text: List[str]) -> "VectorDatabase":
|
| 51 |
-
embeddings = await self.embedding_model.async_get_embeddings(list_of_text)
|
| 52 |
-
for text, embedding in zip(list_of_text, embeddings):
|
| 53 |
-
self.insert(text, np.array(embedding))
|
| 54 |
-
return self
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
if __name__ == "__main__":
|
| 58 |
-
list_of_text = [
|
| 59 |
-
"I like to eat broccoli and bananas.",
|
| 60 |
-
"I ate a banana and spinach smoothie for breakfast.",
|
| 61 |
-
"Chinchillas and kittens are cute.",
|
| 62 |
-
"My sister adopted a kitten yesterday.",
|
| 63 |
-
"Look at this cute hamster munching on a piece of broccoli.",
|
| 64 |
-
]
|
| 65 |
-
|
| 66 |
-
vector_db = VectorDatabase()
|
| 67 |
-
vector_db = asyncio.run(vector_db.abuild_from_list(list_of_text))
|
| 68 |
-
k = 2
|
| 69 |
-
|
| 70 |
-
searched_vector = vector_db.search_by_text("I think fruit is awesome!", k=k)
|
| 71 |
-
print(f"Closest {k} vector(s):", searched_vector)
|
| 72 |
-
|
| 73 |
-
retrieved_vector = vector_db.retrieve_from_key(
|
| 74 |
-
"I like to eat broccoli and bananas."
|
| 75 |
-
)
|
| 76 |
-
print("Retrieved vector:", retrieved_vector)
|
| 77 |
-
|
| 78 |
-
relevant_texts = vector_db.search_by_text(
|
| 79 |
-
"I think fruit is awesome!", k=k, return_as_text=True
|
| 80 |
-
)
|
| 81 |
-
print(f"Closest {k} text(s):", relevant_texts)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
ADDED
|
@@ -0,0 +1,295 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""AB Testing RAG Agent — Chainlit app wiring LangChain retrieval over local PDFs."""

import functools
import os
import re
from collections import defaultdict
from operator import itemgetter

import chainlit as cl
import tiktoken
from chainlit.element import Text
from langchain.schema.output_parser import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_community.vectorstores import Qdrant
from langchain_core.documents import Document
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
# BUG FIX: ChatOpenAI must come from langchain_openai (the LangChain chat
# model). The previous import pulled the Chainlit *playground provider* of
# the same name, which does not accept model kwargs or support .astream().
from langchain_openai import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings

# Fall back to a backup key so the app can still start when the primary
# OPENAI_API_KEY secret is unset (e.g. named differently on the Space).
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY_BACKUP", "")
# Document loading and processing
@cl.cache
def load_and_process_documents():
    """Load PDFs from ./data, merge pages per file, and split into token-sized
    chunks that carry a best-effort original page number.

    Returns:
        list[Document]: chunks whose metadata includes ``source``, ``title``,
        and (where a chunk could be mapped back) a 1-indexed ``page``.
    """
    # Load every PDF page as its own document.
    loader = DirectoryLoader("data/", glob="*.pdf", loader_cls=PyPDFLoader)
    all_docs = loader.load()

    # Group page-documents by their originating PDF file.
    docs_by_source = defaultdict(list)
    for doc in all_docs:
        docs_by_source[doc.metadata.get("source", "")].append(doc)

    # Merge each file's pages into one document, recording the character
    # range each page occupies so chunks can later be mapped back to pages.
    merged_docs = []
    for source, source_docs in docs_by_source.items():
        # Keep pages in order if page metadata is present.
        source_docs.sort(key=lambda d: d.metadata.get("page", 0))

        merged_content = ""
        page_ranges = []
        for doc in source_docs:
            # 1-indexed page numbers for human readability.
            page_num = doc.metadata.get("page", 0) + 1

            # Separator between pages for clarity.
            if merged_content:
                merged_content += "\n\n"

            start_pos = len(merged_content)
            merged_content += doc.page_content
            page_ranges.append({
                "start": start_pos,
                "end": len(merged_content),
                "page": page_num,
                "source": source,
            })

        merged_docs.append(
            Document(
                page_content=merged_content,
                metadata={
                    "source": source,
                    "title": source.split("/")[-1],
                    "page_count": len(source_docs),
                    "merged": True,
                    # Character ranges -> original page numbers, used below.
                    "page_ranges": page_ranges,
                },
            )
        )

    # PERF FIX: build the tokenizer once instead of once per length call.
    # ROBUSTNESS: older tiktoken releases (e.g. the pinned 0.5.2) do not know
    # "gpt-4o-mini" and raise KeyError — fall back to cl100k_base so chunking
    # still works; chunk sizes stay approximately right either way.
    try:
        encoder = tiktoken.encoding_for_model("gpt-4o-mini")
    except KeyError:
        encoder = tiktoken.get_encoding("cl100k_base")

    def tiktoken_len(text):
        # Chunk length is measured in tokens, not characters.
        return len(encoder.encode(text))

    def add_page_info_to_splits(splits):
        # Use the recorded start_index to find which page a chunk overlaps.
        for split in splits:
            start_pos = split.metadata.get("start_index", 0)
            end_pos = start_pos + len(split.page_content)
            for page_range in split.metadata.get("page_ranges", []):
                # First page range the chunk overlaps wins.
                if start_pos <= page_range["end"] and end_pos >= page_range["start"]:
                    split.metadata["page"] = page_range["page"]
                    break
        return splits

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=50,
        length_function=tiktoken_len,
        add_start_index=True,  # required to map chunks back to pages
    )

    return add_page_info_to_splits(text_splitter.split_documents(merged_docs))

# Load and process documents at startup.
docs = load_and_process_documents()
# Embeddings + vector store -------------------------------------------------
@cl.cache
def setup_vector_store():
    """Embed the document chunks and index them in an in-memory Qdrant collection."""
    embedder = OpenAIEmbeddings(model="text-embedding-3-small")
    return Qdrant.from_documents(
        docs,
        embedder,
        location=":memory:",
        collection_name="kohavi_ab_testing_pdf_collection",
    )

# Build the index once at startup and expose a top-5 similarity retriever.
vectorstore = setup_vector_store()
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
# Prompt templates -----------------------------------------------------------

# Grounded-answer prompt: the model may only use the retrieved context.
RAG_PROMPT = """
CONTEXT:
{context}

QUERY:
{question}

You are a helpful assistant. Use the available context to answer the question. Do not use your own knowledge! If you cannot answer the question based on the context, you must say "I don't know".
"""

# Query-rewriting prompt applied before retrieval to sharpen the search.
REPHRASE_QUERY_PROMPT = """
QUERY:
{question}

You are a helpful assistant. Rephrase the provided query to be more specific and to the point in order to improve retrieval in our RAG pipeline about AB Testing.
"""

# Compiled chat templates used by the chains below.
rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)
rephrase_query_prompt = ChatPromptTemplate.from_template(REPHRASE_QUERY_PROMPT)
# Chat model factory ---------------------------------------------------------
def get_openai_chat_model():
    """Return a streaming, deterministic (temperature=0) GPT-4 Turbo chat model."""
    return ChatOpenAI(
        model="gpt-4-turbo",
        temperature=0,
        streaming=True,
    )
# Rephrase-then-retrieve chain ----------------------------------------------
def setup_rephrased_retriever_chain():
    """Build a callable that rephrases a query, retrieves documents for the
    rephrased form, and packages the context plus source metadata."""
    llm = get_openai_chat_model()

    def retrieve_and_format_documents(query):
        # Chain: rephrase prompt -> chat model -> plain string.
        rephrase_chain = rephrase_query_prompt | llm | StrOutputParser()

        @cl.step(name="Rephrasing query for better retrieval")
        async def rephrase_query():
            better_query = await cl.make_async(rephrase_chain.invoke)({"question": query})
            await cl.Message(content=f"Rephrased query: {better_query}").send()
            return better_query

        # This wrapper runs synchronously (via cl.make_async upstream),
        # so drive the async step with run_sync.
        improved_query = cl.run_sync(rephrase_query())

        # Retrieve against the rephrased query, not the original.
        hits = retriever.get_relevant_documents(improved_query)

        def _describe(doc):
            # Reduce a document to displayable source attribution.
            path = doc.metadata.get("source", "")
            name = path.split("/")[-1] if "/" in path else path
            return {"title": name, "page": doc.metadata.get("page", "unknown")}

        source_info = [_describe(d) for d in hits]
        context_text = "\n\n".join(d.page_content for d in hits)

        return {"context": context_text, "sources": source_info, "question": query}

    return retrieve_and_format_documents
@cl.on_chat_start
async def on_chat_start():
    """Prepare per-session state and greet the user."""
    # Each chat session gets its own model handle and retrieval callable.
    cl.user_session.set("chat_model", get_openai_chat_model())
    cl.user_session.set("retriever_chain", setup_rephrased_retriever_chain())

    welcome_message = """
# 📊 AB Testing RAG Agent

This agent can answer questions about AB Testing using a collection of PDF documents.

**Examples of questions you can ask:**
- What is AB Testing?
- How do I interpret p-values in experiments?
- What are the best practices for running experiments with low traffic?
- How should I choose metrics for my experiments?
"""

    await cl.Message(content=welcome_message).send()
@cl.on_message
async def on_message(message: cl.Message):
    """Answer a user question: retrieve context, stream the LLM answer, show sources."""
    # Per-session objects created in on_chat_start.
    chat_model = cl.user_session.get("chat_model")
    retriever_chain = cl.user_session.get("retriever_chain")

    query = message.content

    # Step 1: retrieve documents (the chain also rephrases the query) and
    # collect source metadata for attribution.
    @cl.step(name="Retrieving relevant documents")
    async def retrieve_docs():
        return await cl.make_async(retriever_chain)(query)

    retrieval_result = await retrieve_docs()
    sources = retrieval_result.get("sources", [])

    # Step 2: stream the grounded answer token by token.
    msg = cl.Message(content="")
    await msg.send()

    prompt_text = rag_prompt.format(
        context=retrieval_result.get("context", ""),
        question=query,
    )

    # BUG FIX: `astream` returns an async iterator — it must be iterated,
    # not awaited. (The old code also passed nonexistent kwargs
    # `stream_to_message` / `append_to_message` to AsyncLangchainCallbackHandler;
    # manual token streaming replaces that.)
    async for chunk in chat_model.astream([HumanMessage(content=prompt_text)]):
        if chunk.content:
            await msg.stream_token(chunk.content)

    # Attach source references to the answer.
    if sources:
        source_elements = [
            Text(
                name=f"Source {i+1}",
                content=f"{source.get('title', 'Unknown')}, Page: {source.get('page', 'Unknown')}",
                display="inline",
            )
            for i, source in enumerate(sources)
        ]
        # BUG FIX: list.extend returns None and is not awaitable; mutate the
        # element list synchronously, then persist via msg.update().
        msg.elements.extend(source_elements)

    await msg.update()
chainlit.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "AB Testing RAG Agent",
|
| 3 |
+
"description": "Ask questions about AB Testing and get answers with sources",
|
| 4 |
+
"theme": {
|
| 5 |
+
"color": "#4F6AFF",
|
| 6 |
+
"darkMode": true
|
| 7 |
+
},
|
| 8 |
+
"features": {
|
| 9 |
+
"multi_modal": false
|
| 10 |
+
},
|
| 11 |
+
"ui": {
|
| 12 |
+
"show_header": true,
|
| 13 |
+
"show_footer": true,
|
| 14 |
+
"message_size": "large",
|
| 15 |
+
"default_expand_messages": true
|
| 16 |
+
},
|
| 17 |
+
"markdown": "chainlit.md"
|
| 18 |
+
}
|
chainlit.md
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 📊 AB Testing RAG Agent
|
| 2 |
+
|
| 3 |
+
This application helps answer questions about AB Testing using a collection of PDF documents.
|
| 4 |
+
|
| 5 |
+
## Features
|
| 6 |
+
|
| 7 |
+
- **Document Retrieval**: Retrieves relevant information from AB Testing documents
|
| 8 |
+
- **Query Rephrasing**: Improves retrieval by rephrasing your query for better results
|
| 9 |
+
- **Source References**: Shows the exact document sources used to answer your question
|
| 10 |
+
- **Streaming Interface**: See the response as it's being generated
|
| 11 |
+
|
| 12 |
+
## Example Questions
|
| 13 |
+
|
| 14 |
+
- What is AB Testing?
|
| 15 |
+
- How do I interpret p-values in experiments?
|
| 16 |
+
- What are the best practices for running experiments with low traffic?
|
| 17 |
+
- What is CUPED and when should I use it?
|
| 18 |
+
- How should I choose metrics for my experiments?
|
| 19 |
+
|
| 20 |
+
## About
|
| 21 |
+
|
| 22 |
+
This application was built using:
|
| 23 |
+
- LangChain
|
| 24 |
+
- OpenAI models
|
| 25 |
+
- Qdrant vector database
|
| 26 |
+
- Chainlit frontend
|
| 27 |
+
|
| 28 |
+
The information is sourced from various AB Testing documents including research papers and articles by experts in the field.
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version: '3'
|
| 2 |
+
|
| 3 |
+
services:
|
| 4 |
+
ab-testing-rag:
|
| 5 |
+
build:
|
| 6 |
+
context: .
|
| 7 |
+
dockerfile: Dockerfile
|
| 8 |
+
container_name: ab-testing-rag
|
| 9 |
+
ports:
|
| 10 |
+
- "8000:8000"
|
| 11 |
+
volumes:
|
| 12 |
+
- ./data:/app/data
|
| 13 |
+
- ./.env:/app/.env
|
| 14 |
+
environment:
|
| 15 |
+
- OPENAI_API_KEY=${OPENAI_API_KEY}
|
| 16 |
+
restart: unless-stopped
|
huggingface-space.yml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: AB Testing RAG Agent
|
| 3 |
+
emoji: 📊
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: blue
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
license: mit
|
| 9 |
+
app_port: 8000
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# AB Testing RAG Agent
|
| 13 |
+
|
| 14 |
+
This is a Hugging Face Spaces deployment of the AB Testing RAG Agent. The agent helps answer questions about AB Testing using Ron Kohavi's work and articles from Arxiv.
|
| 15 |
+
|
| 16 |
+
## Environment Variables
|
| 17 |
+
|
| 18 |
+
Environment variables needed:
|
| 19 |
+
|
| 20 |
+
- `OPENAI_API_KEY`: Your OpenAI API key
|
notebook_version/.DS_Store
CHANGED
|
Binary files a/notebook_version/.DS_Store and b/notebook_version/.DS_Store differ
|
|
|
notebook_version/{Evaluating_RAG_with_Ragas_(2025)_AI_Makerspace.ipynb → AB_Testing_RAG_Agent.ipynb}
RENAMED
|
File without changes
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
chainlit==1.0.301
|
| 2 |
+
langchain==0.1.12
|
| 3 |
+
langchain-community==0.0.30
|
| 4 |
+
langchain-openai==0.0.6
|
| 5 |
+
langchain-core==0.1.30
|
| 6 |
+
langchain-qdrant==0.0.1
|
| 7 |
+
qdrant-client==1.7.3
|
| 8 |
+
openai==1.12.0
|
| 9 |
+
tiktoken==0.5.2
|
| 10 |
+
python-dotenv==1.0.1
|
| 11 |
+
unstructured==0.12.5
|
| 12 |
+
pypdf==3.17.4
|
| 13 |
+
numpy==1.26.3
|
scripts/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Utility scripts for the AB Testing RAG Agent.
|
| 3 |
+
"""
|
scripts/prepare_data.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""
Utility script to copy PDF files from notebook_version/data to the data directory.
This makes it easier to set up the application with the existing data.
"""

import os
import shutil
import glob
from pathlib import Path

def main():
    """Copy every PDF from notebook_version/data into ./data, creating it if needed."""
    # Create data directory if it doesn't exist.
    data_dir = Path("data")
    data_dir.mkdir(exist_ok=True)

    # Source directory with PDF files.
    source_dir = Path("notebook_version/data")
    if not source_dir.exists():
        print(f"Error: Source directory {source_dir} does not exist.")
        return

    # Find all PDF files in the source directory.
    pdf_files = glob.glob(str(source_dir / "*.pdf"))
    if not pdf_files:
        print(f"No PDF files found in {source_dir}")
        return

    # Copy each PDF file to the data directory.
    for pdf_file in pdf_files:
        filename = os.path.basename(pdf_file)
        destination = data_dir / filename

        # BUG FIX: the progress message had no placeholder — report which
        # file is actually being copied.
        print(f"Copying {filename} to {destination}...")
        shutil.copy2(pdf_file, destination)

    print(f"Copied {len(pdf_files)} PDF files to {data_dir}")

if __name__ == "__main__":
    main()