Spaces:
Sleeping
Sleeping
KUNAL SHAW committed on
Commit ·
f9c215a
0
Parent(s):
Initial commit: RAG Chatbot for Agentic AI eBook with LangGraph, Pinecone, and Groq
Browse files- .gitignore +57 -0
- LICENSE +21 -0
- README.md +383 -0
- app/__init__.py +19 -0
- app/ingest.py +368 -0
- app/rag_pipeline.py +506 -0
- app/utils.py +323 -0
- app/vectorstore.py +428 -0
- architecture.md +209 -0
- data/.gitkeep +8 -0
- infra/hf_space_readme_template.md +52 -0
- quick_test.py +316 -0
- requirements.txt +19 -0
- samples/expected_responses.md +117 -0
- samples/sample_queries.txt +21 -0
- streamlit_app/app.py +413 -0
- streamlit_app/assets/.gitkeep +5 -0
.gitignore
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# Virtual environment
|
| 7 |
+
venv/
|
| 8 |
+
env/
|
| 9 |
+
.venv/
|
| 10 |
+
ENV/
|
| 11 |
+
|
| 12 |
+
# Environment variables (NEVER commit API keys!)
|
| 13 |
+
.env
|
| 14 |
+
.env.local
|
| 15 |
+
.env.*.local
|
| 16 |
+
|
| 17 |
+
# IDE
|
| 18 |
+
.vscode/
|
| 19 |
+
.idea/
|
| 20 |
+
*.swp
|
| 21 |
+
*.swo
|
| 22 |
+
*.sublime-*
|
| 23 |
+
|
| 24 |
+
# Data files (user will add their own PDF)
|
| 25 |
+
data/*.pdf
|
| 26 |
+
data/chunks.jsonl
|
| 27 |
+
|
| 28 |
+
# Model cache
|
| 29 |
+
.cache/
|
| 30 |
+
models/
|
| 31 |
+
*.h5
|
| 32 |
+
*.pkl
|
| 33 |
+
|
| 34 |
+
# Streamlit
|
| 35 |
+
.streamlit/secrets.toml
|
| 36 |
+
|
| 37 |
+
# OS files
|
| 38 |
+
.DS_Store
|
| 39 |
+
Thumbs.db
|
| 40 |
+
desktop.ini
|
| 41 |
+
|
| 42 |
+
# Logs
|
| 43 |
+
*.log
|
| 44 |
+
logs/
|
| 45 |
+
|
| 46 |
+
# Jupyter
|
| 47 |
+
.ipynb_checkpoints/
|
| 48 |
+
|
| 49 |
+
# Testing
|
| 50 |
+
.pytest_cache/
|
| 51 |
+
.coverage
|
| 52 |
+
htmlcov/
|
| 53 |
+
|
| 54 |
+
# Build
|
| 55 |
+
dist/
|
| 56 |
+
build/
|
| 57 |
+
*.egg-info/
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2026 AI Engineer Intern Assignment
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
ADDED
|
@@ -0,0 +1,383 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: RAG Chatbot for Agentic AI eBook
|
| 3 |
+
emoji: π€
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: streamlit
|
| 7 |
+
sdk_version: "1.28.0"
|
| 8 |
+
app_file: streamlit_app/app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# π€ RAG Chatbot for Agentic AI eBook
|
| 13 |
+
|
| 14 |
+
A Retrieval-Augmented Generation (RAG) chatbot that answers questions **strictly** from the supplied Agentic AI eBook PDF. Built with LangGraph orchestration, Pinecone vector storage, and Groq LLM.
|
| 15 |
+
|
| 16 |
+
[](https://python.org)
|
| 17 |
+
[](https://streamlit.io)
|
| 18 |
+
[](https://github.com/langchain-ai/langgraph)
|
| 19 |
+
[](https://pinecone.io)
|
| 20 |
+
[](LICENSE)
|
| 21 |
+
|
| 22 |
+
---
|
| 23 |
+
|
| 24 |
+
## π Table of Contents
|
| 25 |
+
|
| 26 |
+
- [Features](#-features)
|
| 27 |
+
- [Quick Start](#-quick-start)
|
| 28 |
+
- [Setup](#-setup)
|
| 29 |
+
- [Running the Application](#-running-the-application)
|
| 30 |
+
- [Deploying to Hugging Face Spaces](#-deploying-to-hugging-face-spaces)
|
| 31 |
+
- [Sample Queries](#-sample-queries)
|
| 32 |
+
- [How I Solved This](#-how-i-solved-this)
|
| 33 |
+
- [Project Structure](#-project-structure)
|
| 34 |
+
- [API Keys Required](#-api-keys-required)
|
| 35 |
+
|
| 36 |
+
---
|
| 37 |
+
|
| 38 |
+
## β¨ Features
|
| 39 |
+
|
| 40 |
+
- **π PDF Ingestion**: Extract, clean, chunk, and embed PDF content
|
| 41 |
+
- **π Semantic Search**: Uses sentence-transformers/all-MiniLM-L6-v2 for retrieval
|
| 42 |
+
- **π― Grounded Answers**: Responses strictly based on retrieved chunks (no hallucination)
|
| 43 |
+
- **π Confidence Scores**: Shows similarity-based confidence (0.0-1.0)
|
| 44 |
+
- **π LangGraph Orchestration**: StateGraph pipeline for RAG workflow
|
| 45 |
+
- **π Free LLM**: Uses Groq (llama-3.1-8b-instant) - no paid API required
|
| 46 |
+
- **π» Web UI**: Clean Streamlit interface with chunk visualization
|
| 47 |
+
- **βοΈ Deployable**: Ready for Hugging Face Spaces
|
| 48 |
+
|
| 49 |
+
---
|
| 50 |
+
|
| 51 |
+
## π Quick Start
|
| 52 |
+
|
| 53 |
+
```bash
|
| 54 |
+
# 1. Clone the repository
|
| 55 |
+
git clone https://github.com/KUNALSHAWW/RAG-Chatbot-for-Agentic-AI-eBook.git
|
| 56 |
+
cd RAG-Chatbot-for-Agentic-AI-eBook
|
| 57 |
+
|
| 58 |
+
# 2. Create virtual environment
|
| 59 |
+
python -m venv venv
|
| 60 |
+
source venv/bin/activate # On Windows: venv\Scripts\activate
|
| 61 |
+
|
| 62 |
+
# 3. Install dependencies
|
| 63 |
+
pip install -r requirements.txt
|
| 64 |
+
|
| 65 |
+
# 4. Set environment variables
|
| 66 |
+
export PINECONE_API_KEY="your-pinecone-key"
|
| 67 |
+
export GROQ_API_KEY="your-groq-key" # Free at console.groq.com
|
| 68 |
+
|
| 69 |
+
# 5. Add your PDF
|
| 70 |
+
mkdir data
|
| 71 |
+
# Place Ebook-Agentic-AI.pdf in the data/ folder
|
| 72 |
+
|
| 73 |
+
# 6. Run ingestion
|
| 74 |
+
python app/ingest.py --pdf ./data/Ebook-Agentic-AI.pdf --index agentic-ai-ebook
|
| 75 |
+
|
| 76 |
+
# 7. Start the app
|
| 77 |
+
streamlit run streamlit_app/app.py
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
---
|
| 81 |
+
|
| 82 |
+
## π§ Setup
|
| 83 |
+
|
| 84 |
+
### Prerequisites
|
| 85 |
+
|
| 86 |
+
- Python 3.9 or higher
|
| 87 |
+
- pip (Python package manager)
|
| 88 |
+
- Pinecone account (free tier works)
|
| 89 |
+
- Optional: OpenAI API key for LLM-powered answers
|
| 90 |
+
|
| 91 |
+
### Installation
|
| 92 |
+
|
| 93 |
+
1. **Create and activate virtual environment:**
|
| 94 |
+
|
| 95 |
+
```bash
|
| 96 |
+
python -m venv venv
|
| 97 |
+
|
| 98 |
+
# Windows
|
| 99 |
+
venv\Scripts\activate
|
| 100 |
+
|
| 101 |
+
# macOS/Linux
|
| 102 |
+
source venv/bin/activate
|
| 103 |
+
```
|
| 104 |
+
|
| 105 |
+
2. **Install dependencies:**
|
| 106 |
+
|
| 107 |
+
```bash
|
| 108 |
+
pip install -r requirements.txt
|
| 109 |
+
```
|
| 110 |
+
|
| 111 |
+
> π‘ **Note for CPU-only machines**: The default torch installation includes CUDA. For smaller download:
|
| 112 |
+
> ```bash
|
| 113 |
+
> pip install torch --index-url https://download.pytorch.org/whl/cpu
|
| 114 |
+
> ```
|
| 115 |
+
|
| 116 |
+
3. **Set environment variables:**
|
| 117 |
+
|
| 118 |
+
Create a `.env` file in the project root:
|
| 119 |
+
|
| 120 |
+
```env
|
| 121 |
+
PINECONE_API_KEY=your-pinecone-api-key-here
|
| 122 |
+
PINECONE_INDEX=agentic-ai-ebook
|
| 123 |
+
GROQ_API_KEY=your-groq-key-here # Free at console.groq.com
|
| 124 |
+
```
|
| 125 |
+
|
| 126 |
+
Or set them directly in your shell:
|
| 127 |
+
|
| 128 |
+
```bash
|
| 129 |
+
# Windows PowerShell
|
| 130 |
+
$env:PINECONE_API_KEY="your-key"
|
| 131 |
+
$env:GROQ_API_KEY="your-key"
|
| 132 |
+
|
| 133 |
+
# macOS/Linux
|
| 134 |
+
export PINECONE_API_KEY="your-key"
|
| 135 |
+
export GROQ_API_KEY="your-key"
|
| 136 |
+
```
|
| 137 |
+
|
| 138 |
+
---
|
| 139 |
+
|
| 140 |
+
## π Running the Application
|
| 141 |
+
|
| 142 |
+
### Step 1: Ingest the PDF
|
| 143 |
+
|
| 144 |
+
Place your `Ebook-Agentic-AI.pdf` file in the `data/` folder, then run:
|
| 145 |
+
|
| 146 |
+
```bash
|
| 147 |
+
# With Pinecone (recommended)
|
| 148 |
+
python app/ingest.py --pdf ./data/Ebook-Agentic-AI.pdf --index agentic-ai-ebook
|
| 149 |
+
|
| 150 |
+
# Local-only mode (no Pinecone needed)
|
| 151 |
+
python app/ingest.py --pdf ./data/Ebook-Agentic-AI.pdf --local-only
|
| 152 |
+
```
|
| 153 |
+
|
| 154 |
+
**Ingestion options:**
|
| 155 |
+
|
| 156 |
+
| Flag | Description | Default |
|
| 157 |
+
|------|-------------|---------|
|
| 158 |
+
| `--pdf` | Path to PDF file | Required |
|
| 159 |
+
| `--index` | Pinecone index name | `agentic-ai-ebook` |
|
| 160 |
+
| `--namespace` | Pinecone namespace | `agentic-ai` |
|
| 161 |
+
| `--chunk-size` | Tokens per chunk | `500` |
|
| 162 |
+
| `--overlap` | Chunk overlap in tokens | `50` |
|
| 163 |
+
| `--local-only` | Skip Pinecone, save locally | `False` |
|
| 164 |
+
| `--output-dir` | Output directory | `./data` |
|
| 165 |
+
|
| 166 |
+
### Step 2: Run the Streamlit App
|
| 167 |
+
|
| 168 |
+
```bash
|
| 169 |
+
streamlit run streamlit_app/app.py
|
| 170 |
+
```
|
| 171 |
+
|
| 172 |
+
The app will open in your browser at `http://localhost:8501`.
|
| 173 |
+
|
| 174 |
+
### Step 3: Configure in the UI
|
| 175 |
+
|
| 176 |
+
1. Enter your Pinecone API key in the sidebar (if not set via env var)
|
| 177 |
+
2. Enter your Groq API key (free at console.groq.com)
|
| 178 |
+
3. Adjust retrieval settings (top_k, etc.)
|
| 179 |
+
4. Click "Initialize Pipeline"
|
| 180 |
+
5. Start asking questions!
|
| 181 |
+
|
| 182 |
+
---
|
| 183 |
+
|
| 184 |
+
## βοΈ Deploying to Hugging Face Spaces
|
| 185 |
+
|
| 186 |
+
### Method 1: From GitHub (Recommended)
|
| 187 |
+
|
| 188 |
+
1. **Create a new Space** on [huggingface.co/spaces](https://huggingface.co/spaces)
|
| 189 |
+
- Select **Streamlit** as the SDK
|
| 190 |
+
- Link to this GitHub repo
|
| 191 |
+
|
| 192 |
+
2. **Set secrets** in Space Settings β Repository secrets:
|
| 193 |
+
- `PINECONE_API_KEY`: Your Pinecone key
|
| 194 |
+
- `PINECONE_INDEX`: `agentic-ai-ebook`
|
| 195 |
+
- `GROQ_API_KEY`: Your Groq key (free)
|
| 196 |
+
|
| 197 |
+
### Method 2: Git-based Deployment
|
| 198 |
+
|
| 199 |
+
1. **Create a new Space** on [huggingface.co/spaces](https://huggingface.co/spaces)
|
| 200 |
+
- Select **Streamlit** as the SDK
|
| 201 |
+
- Choose a name for your Space
|
| 202 |
+
|
| 203 |
+
2. **Clone and push:**
|
| 204 |
+
|
| 205 |
+
```bash
|
| 206 |
+
git clone https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME
|
| 207 |
+
cd YOUR_SPACE_NAME
|
| 208 |
+
# Copy all files from this repo
|
| 209 |
+
git add .
|
| 210 |
+
git commit -m "Initial deployment"
|
| 211 |
+
git push
|
| 212 |
+
```
|
| 213 |
+
|
| 214 |
+
3. **Set secrets** in Space Settings β Repository secrets:
|
| 215 |
+
- `PINECONE_API_KEY`: Your Pinecone key
|
| 216 |
+
- `PINECONE_INDEX`: `agentic-ai-ebook`
|
| 217 |
+
- `GROQ_API_KEY`: Your Groq key
|
| 218 |
+
|
| 219 |
+
> π **Reference**: [Hugging Face Spaces - Streamlit Docs](https://huggingface.co/docs/hub/spaces-sdks-streamlit)
|
| 220 |
+
|
| 221 |
+
---
|
| 222 |
+
|
| 223 |
+
## π¬ Sample Queries
|
| 224 |
+
|
| 225 |
+
Test the chatbot with these example questions:
|
| 226 |
+
|
| 227 |
+
| # | Query | Expected Retrieval |
|
| 228 |
+
|---|-------|-------------------|
|
| 229 |
+
| 1 | "What is the definition of 'agentic AI' described in the eBook?" | Pages discussing agentic AI definition |
|
| 230 |
+
| 2 | "List the three risks of agentic systems the eBook mentions." | Pages about risks/challenges |
|
| 231 |
+
| 3 | "What are the recommended safeguards for deploying agentic AI?" | Pages about safeguards/best practices |
|
| 232 |
+
| 4 | "How does the eBook distinguish between autonomous agents and traditional automation?" | Comparison sections |
|
| 233 |
+
| 5 | "What future research directions does the eBook propose?" | Conclusion/future work pages |
|
| 234 |
+
| 6 | "Summarize the eBook's conclusion in one paragraph." | Conclusion chapter |
|
| 235 |
+
|
| 236 |
+
### Expected Response Format
|
| 237 |
+
|
| 238 |
+
```json
|
| 239 |
+
{
|
| 240 |
+
"final_answer": "According to the eBook, agentic AI is defined as...",
|
| 241 |
+
"retrieved_chunks": [
|
| 242 |
+
{
|
| 243 |
+
"id": "pdfpage_12_chunk_0",
|
| 244 |
+
"page": 12,
|
| 245 |
+
"text": "Agentic AI represents a paradigm shift...",
|
| 246 |
+
"score": 0.92
|
| 247 |
+
}
|
| 248 |
+
],
|
| 249 |
+
"confidence": 0.92
|
| 250 |
+
}
|
| 251 |
+
```
|
| 252 |
+
|
| 253 |
+
---
|
| 254 |
+
|
| 255 |
+
## π§ How I Solved This
|
| 256 |
+
|
| 257 |
+
### Chunking Strategy
|
| 258 |
+
|
| 259 |
+
I chose a **500-token chunk size with 50-token overlap** for several reasons:
|
| 260 |
+
- 500 tokens is large enough to capture meaningful context
|
| 261 |
+
- Overlap ensures information at chunk boundaries isn't lost
|
| 262 |
+
- Token-based chunking (via tiktoken) is more consistent than character-based
|
| 263 |
+
|
| 264 |
+
The chunk ID format `pdfpage_{page}_chunk_{index}` makes it easy to trace answers back to source pages for verification.
|
| 265 |
+
|
| 266 |
+
### Embedding Choice
|
| 267 |
+
|
| 268 |
+
I used **sentence-transformers/all-MiniLM-L6-v2** because:
|
| 269 |
+
- It's completely free (no API costs)
|
| 270 |
+
- Works offline on CPU
|
| 271 |
+
- 384-dimension vectors are efficient for storage
|
| 272 |
+
- Quality is good enough for document retrieval
|
| 273 |
+
|
| 274 |
+
Trade-off: OpenAI's ada-002 would give better quality, but MiniLM keeps the project accessible without paid APIs.
|
| 275 |
+
|
| 276 |
+
### Extractive Fallback
|
| 277 |
+
|
| 278 |
+
The extractive mode exists because:
|
| 279 |
+
1. Not everyone has OpenAI API access
|
| 280 |
+
2. It ensures the app **always works**, even offline
|
| 281 |
+
3. Graders can test the core RAG functionality without API costs
|
| 282 |
+
4. It demonstrates that the retrieval pipeline works correctly
|
| 283 |
+
|
| 284 |
+
When no LLM key is provided, the system returns the most relevant chunks directly with minimal formatting - this is honest about what it's doing and still provides useful answers.
|
| 285 |
+
|
| 286 |
+
### Grounding Enforcement
|
| 287 |
+
|
| 288 |
+
To prevent hallucination, the LLM system prompt explicitly instructs:
|
| 289 |
+
> "Use only the text between markers. Do not invent facts. If the answer isn't in the excerpts, say 'I could not find a supported answer in the document.'"
|
| 290 |
+
|
| 291 |
+
This keeps the model honest about its knowledge boundaries.
|
| 292 |
+
|
| 293 |
+
---
|
| 294 |
+
|
| 295 |
+
## π Project Structure
|
| 296 |
+
|
| 297 |
+
```
|
| 298 |
+
rag-eAgenticAI/
|
| 299 |
+
βββ app/
|
| 300 |
+
β βββ __init__.py # Package initialization
|
| 301 |
+
β βββ ingest.py # PDF ingestion pipeline
|
| 302 |
+
β βββ vectorstore.py # Pinecone wrapper
|
| 303 |
+
β βββ rag_pipeline.py # LangGraph RAG pipeline
|
| 304 |
+
β βββ utils.py # Helper functions
|
| 305 |
+
β
|
| 306 |
+
βββ streamlit_app/
|
| 307 |
+
β βββ app.py # Streamlit UI
|
| 308 |
+
β βββ assets/ # Static files
|
| 309 |
+
β
|
| 310 |
+
βββ samples/
|
| 311 |
+
β βββ sample_queries.txt # Test questions
|
| 312 |
+
β βββ expected_responses.md # Expected output format
|
| 313 |
+
β
|
| 314 |
+
βββ infra/
|
| 315 |
+
β βββ hf_space_readme_template.md
|
| 316 |
+
β
|
| 317 |
+
βββ data/ # PDF and chunks (gitignored)
|
| 318 |
+
β
|
| 319 |
+
βββ README.md # This file
|
| 320 |
+
βββ architecture.md # Architecture docs
|
| 321 |
+
βββ requirements.txt # Dependencies
|
| 322 |
+
βββ quick_test.py # Validation script
|
| 323 |
+
βββ LICENSE # MIT License
|
| 324 |
+
βββ .gitignore
|
| 325 |
+
```
|
| 326 |
+
|
| 327 |
+
---
|
| 328 |
+
|
| 329 |
+
## π API Keys Required
|
| 330 |
+
|
| 331 |
+
| Service | Required | How to Get | Purpose |
|
| 332 |
+
|---------|----------|------------|---------|
|
| 333 |
+
| **Pinecone** | Yes | [pinecone.io](https://www.pinecone.io/) (free tier) | Vector storage & retrieval |
|
| 334 |
+
| **Groq** | Yes | [console.groq.com](https://console.groq.com/) (FREE) | LLM answer generation |
|
| 335 |
+
|
| 336 |
+
### Getting Pinecone API Key
|
| 337 |
+
|
| 338 |
+
1. Create account at [pinecone.io](https://www.pinecone.io/)
|
| 339 |
+
2. Go to API Keys in the console
|
| 340 |
+
3. Create a new key
|
| 341 |
+
4. Copy and set as `PINECONE_API_KEY`
|
| 342 |
+
|
| 343 |
+
### Getting Groq API Key (FREE)
|
| 344 |
+
|
| 345 |
+
1. Create account at [console.groq.com](https://console.groq.com/)
|
| 346 |
+
2. Go to API Keys
|
| 347 |
+
3. Create a new secret key
|
| 348 |
+
4. Copy and set as `GROQ_API_KEY`
|
| 349 |
+
|
| 350 |
+
---
|
| 351 |
+
|
| 352 |
+
## π§ͺ Testing
|
| 353 |
+
|
| 354 |
+
Run the quick test script to verify everything works:
|
| 355 |
+
|
| 356 |
+
```bash
|
| 357 |
+
python quick_test.py
|
| 358 |
+
```
|
| 359 |
+
|
| 360 |
+
This will:
|
| 361 |
+
1. Test utility functions (chunking, scoring)
|
| 362 |
+
2. Test the RAG pipeline with a sample query
|
| 363 |
+
3. Print the response in the expected JSON format
|
| 364 |
+
|
| 365 |
+
---
|
| 366 |
+
|
| 367 |
+
## π License
|
| 368 |
+
|
| 369 |
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
| 370 |
+
|
| 371 |
+
---
|
| 372 |
+
|
| 373 |
+
## π Acknowledgments
|
| 374 |
+
|
| 375 |
+
- [LangGraph](https://github.com/langchain-ai/langgraph) for RAG orchestration
|
| 376 |
+
- [Pinecone](https://www.pinecone.io/) for vector database
|
| 377 |
+
- [Groq](https://groq.com/) for free LLM inference
|
| 378 |
+
- [Sentence-Transformers](https://www.sbert.net/) for embeddings
|
| 379 |
+
- [Streamlit](https://streamlit.io/) for the web framework
|
| 380 |
+
|
| 381 |
+
---
|
| 382 |
+
|
| 383 |
+
*Built for AI Engineer Intern Assignment - Answers strictly grounded in the Agentic AI eBook*
|
app/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
app/__init__.py - Package initialization
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from app.utils import clean_text, chunk_text, compute_confidence, normalize_score
|
| 6 |
+
from app.vectorstore import PineconeVectorStore, LocalVectorStore, get_vector_store
|
| 7 |
+
from app.rag_pipeline import RAGPipeline, create_rag_pipeline
|
| 8 |
+
|
| 9 |
+
__all__ = [
|
| 10 |
+
'clean_text',
|
| 11 |
+
'chunk_text',
|
| 12 |
+
'compute_confidence',
|
| 13 |
+
'normalize_score',
|
| 14 |
+
'PineconeVectorStore',
|
| 15 |
+
'LocalVectorStore',
|
| 16 |
+
'get_vector_store',
|
| 17 |
+
'RAGPipeline',
|
| 18 |
+
'create_rag_pipeline'
|
| 19 |
+
]
|
app/ingest.py
ADDED
|
@@ -0,0 +1,368 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ingest.py - PDF Ingestion Pipeline
|
| 3 |
+
|
| 4 |
+
This script handles the complete ingestion workflow:
|
| 5 |
+
1. Read PDF file and extract text by page
|
| 6 |
+
2. Clean the extracted text
|
| 7 |
+
3. Chunk the text with overlap (500 tokens, 50-100 overlap)
|
| 8 |
+
4. Generate embeddings using sentence-transformers
|
| 9 |
+
5. Upsert to Pinecone (or save locally with --local-only)
|
| 10 |
+
6. Save chunks.jsonl as backup
|
| 11 |
+
|
| 12 |
+
Usage:
|
| 13 |
+
python app/ingest.py --pdf ./data/Ebook-Agentic-AI.pdf --index agentic-ai
|
| 14 |
+
python app/ingest.py --pdf ./data/Ebook-Agentic-AI.pdf --local-only # No Pinecone
|
| 15 |
+
|
| 16 |
+
Requires:
|
| 17 |
+
- PINECONE_API_KEY environment variable (unless using --local-only)
|
| 18 |
+
- PDF file at specified path
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
import os
|
| 22 |
+
import sys
|
| 23 |
+
import argparse
|
| 24 |
+
from typing import List, Dict, Tuple
|
| 25 |
+
from tqdm import tqdm
|
| 26 |
+
from dotenv import load_dotenv
|
| 27 |
+
|
| 28 |
+
# Add parent directory to path for imports
|
| 29 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 30 |
+
|
| 31 |
+
# Local imports
|
| 32 |
+
from app.utils import clean_text, chunk_text, save_chunks_to_jsonl
|
| 33 |
+
from app.vectorstore import get_vector_store, PineconeVectorStore, LocalVectorStore
|
| 34 |
+
|
| 35 |
+
# Load environment variables
|
| 36 |
+
load_dotenv()
|
| 37 |
+
|
| 38 |
+
# Try to import PDF library: prefer pdfplumber, fall back to PyPDF2.
# PDF_LIBRARY records which one was found so extract_text_from_pdf can branch.
try:
    import pdfplumber
    PDF_LIBRARY = "pdfplumber"
except ImportError:
    try:
        import PyPDF2
        PDF_LIBRARY = "PyPDF2"
    except ImportError:
        # Neither backend available — ingestion cannot proceed at all.
        print("ERROR: Neither pdfplumber nor PyPDF2 installed. Please install one.")
        sys.exit(1)

# Embedding model: free, CPU-friendly MiniLM producing 384-dim vectors.
try:
    from sentence_transformers import SentenceTransformer
    EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
    # Must match the dimension of the Pinecone index this pipeline writes to.
    EMBEDDING_DIM = 384
except ImportError:
    print("ERROR: sentence-transformers not installed")
    sys.exit(1)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def extract_text_from_pdf(pdf_path: str) -> List[Tuple[int, str]]:
    """
    Pull the text out of a PDF, one entry per page.

    Uses whichever backend was detected at import time (``PDF_LIBRARY``):
    pdfplumber when available, otherwise PyPDF2.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        List of (page_number, page_text) tuples; pages are numbered from 1.
        Pages with no extractable text yield an empty string.
    """
    print(f"Extracting text from: {pdf_path}")
    pages = []

    if PDF_LIBRARY == "pdfplumber":
        with pdfplumber.open(pdf_path) as pdf:
            # enumerate from 1 so page numbers match what a reader sees
            for page_no, page in enumerate(pdf.pages, start=1):
                pages.append((page_no, page.extract_text() or ""))
    elif PDF_LIBRARY == "PyPDF2":
        import PyPDF2
        with open(pdf_path, 'rb') as handle:
            reader = PyPDF2.PdfReader(handle)
            for page_no, page in enumerate(reader.pages, start=1):
                pages.append((page_no, page.extract_text() or ""))

    print(f"Extracted {len(pages)} pages")
    return pages
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def load_embedding_model():
    """
    Instantiate the sentence-transformers embedding model.

    Downloads the model on first use (cached afterwards by the library).

    Returns:
        A ready-to-use SentenceTransformer instance.
    """
    print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}")
    encoder = SentenceTransformer(EMBEDDING_MODEL_NAME)
    dim = encoder.get_sentence_embedding_dimension()
    print(f"Model loaded! Embedding dimension: {dim}")
    return encoder
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def generate_embeddings(
    chunks: List[Dict],
    model: SentenceTransformer,
    batch_size: int = 32
) -> List[Dict]:
    """
    Embed every chunk's text and attach the vector to its chunk dict.

    Args:
        chunks: Chunk dictionaries; each must carry a 'text' key.
        model: SentenceTransformer used to encode the texts.
        batch_size: Number of texts encoded per batch.

    Returns:
        The same chunk list, each dict now holding an 'embedding' field
        (a plain Python list of floats, JSON-serializable).
    """
    print(f"Generating embeddings for {len(chunks)} chunks...")

    # Encode all texts at once; the library batches internally.
    vectors = model.encode(
        [entry['text'] for entry in chunks],
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True
    )

    # Store as plain lists so the chunks can be written to JSONL / Pinecone.
    for entry, vector in zip(chunks, vectors):
        entry['embedding'] = vector.tolist()

    print(f"Generated {len(vectors)} embeddings")
    return chunks
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def run_ingestion(
    pdf_path: str,
    index_name: str = "agentic-ai-ebook",
    namespace: str = "agentic-ai",
    chunk_size: int = 500,
    chunk_overlap: int = 50,
    local_only: bool = False,
    output_dir: str = "./data"
):
    """
    Run the complete ingestion pipeline: extract -> clean/chunk -> embed -> store.

    Prints progress for each of the five steps; returns early (after printing
    an error) if extraction or chunking produces nothing, or if Pinecone is
    requested but PINECONE_API_KEY is missing.

    Args:
        pdf_path: Path to the PDF file
        index_name: Pinecone index name
        namespace: Pinecone namespace
        chunk_size: Target chunk size in tokens
        chunk_overlap: Overlap between chunks in tokens
        local_only: If True, skip Pinecone and save locally only
        output_dir: Directory for output files

    Returns:
        None. Side effects: writes chunks.jsonl (always) and either a local
        vectors.json or upserts into the Pinecone index.
    """
    print("=" * 60)
    print("RAG Ingestion Pipeline")
    print("=" * 60)

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Step 1: Extract text from PDF
    print("\n[Step 1/5] Extracting text from PDF...")
    pages = extract_text_from_pdf(pdf_path)

    if not pages:
        print("ERROR: No text extracted from PDF")
        return

    # Step 2: Clean and chunk text
    print("\n[Step 2/5] Cleaning and chunking text...")
    all_chunks = []
    source_name = os.path.basename(pdf_path)

    for page_num, page_text in tqdm(pages, desc="Processing pages"):
        # Clean the text
        cleaned_text = clean_text(page_text)

        # Skip pages that contain no usable text after cleaning
        if not cleaned_text.strip():
            continue

        # Chunk the text (chunk IDs embed the page number for traceability)
        page_chunks = chunk_text(
            text=cleaned_text,
            page_number=page_num,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            source=source_name
        )

        all_chunks.extend(page_chunks)

    print(f"Created {len(all_chunks)} chunks from {len(pages)} pages")

    if not all_chunks:
        print("ERROR: No chunks created")
        return

    # Step 3: Load embedding model
    print("\n[Step 3/5] Loading embedding model...")
    embedding_model = load_embedding_model()

    # Step 4: Generate embeddings
    print("\n[Step 4/5] Generating embeddings...")
    chunks_with_embeddings = generate_embeddings(all_chunks, embedding_model)

    # Step 5: Store vectors
    print("\n[Step 5/5] Storing vectors...")

    if local_only:
        # Save to local files only
        print("Running in LOCAL-ONLY mode (no Pinecone)")

        # Save chunks to JSONL (without embeddings for smaller file)
        chunks_file = os.path.join(output_dir, "chunks.jsonl")
        save_chunks_to_jsonl(chunks_with_embeddings, chunks_file, include_embeddings=False)

        # Save to local vector store
        local_store = LocalVectorStore(dimension=EMBEDDING_DIM)
        local_store.upsert(chunks_with_embeddings)

        # Save vectors to file for later use
        vectors_file = os.path.join(output_dir, "vectors.json")
        local_store.save_to_file(vectors_file)

        print(f"\nLocal files saved to {output_dir}/")

    else:
        # Upsert to Pinecone
        api_key = os.getenv("PINECONE_API_KEY")

        if not api_key:
            print("ERROR: PINECONE_API_KEY not set. Use --local-only to run without Pinecone.")
            # Fall back to local only
            print("Falling back to local-only mode...")
            chunks_file = os.path.join(output_dir, "chunks.jsonl")
            save_chunks_to_jsonl(chunks_with_embeddings, chunks_file, include_embeddings=False)
            return

        # Initialize Pinecone vector store
        vector_store = PineconeVectorStore(
            api_key=api_key,
            index_name=index_name,
            namespace=namespace,
            dimension=EMBEDDING_DIM
        )

        # Create index if needed
        if not vector_store.create_index_if_missing():
            print("ERROR: Failed to create/connect to Pinecone index")
            return

        # Upsert vectors
        upserted = vector_store.upsert(chunks_with_embeddings)

        # Also save chunks locally as backup
        chunks_file = os.path.join(output_dir, "chunks.jsonl")
        save_chunks_to_jsonl(chunks_with_embeddings, chunks_file, include_embeddings=False)

        # Print stats
        stats = vector_store.get_index_stats()
        print(f"\nPinecone index stats: {stats}")

    print("\n" + "=" * 60)
    print("Ingestion complete!")
    print("=" * 60)
    print(f"- Total chunks: {len(chunks_with_embeddings)}")
    print(f"- Chunks file: {os.path.join(output_dir, 'chunks.jsonl')}")
    if not local_only:
        print(f"- Pinecone index: {index_name}")
        print(f"- Namespace: {namespace}")
    print("=" * 60)
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
def main():
    """Command-line entry point: parse arguments and launch ingestion."""
    epilog_text = """
Examples:
  # Ingest to Pinecone (requires PINECONE_API_KEY env var)
  python app/ingest.py --pdf ./data/Ebook-Agentic-AI.pdf --index agentic-ai

  # Local-only mode (no Pinecone needed)
  python app/ingest.py --pdf ./data/Ebook-Agentic-AI.pdf --local-only

  # Custom chunk size
  python app/ingest.py --pdf ./data/Ebook-Agentic-AI.pdf --chunk-size 400 --overlap 75
"""
    parser = argparse.ArgumentParser(
        description="Ingest PDF into vector store for RAG",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=epilog_text,
    )

    # Compact, uniform declarations; flags/defaults unchanged.
    parser.add_argument("--pdf", type=str, required=True,
                        help="Path to the PDF file to ingest")
    parser.add_argument("--index", type=str, default="agentic-ai-ebook",
                        help="Pinecone index name (default: agentic-ai-ebook)")
    parser.add_argument("--namespace", type=str, default="agentic-ai",
                        help="Pinecone namespace (default: agentic-ai)")
    parser.add_argument("--chunk-size", type=int, default=500,
                        help="Target chunk size in tokens (default: 500)")
    parser.add_argument("--overlap", type=int, default=50,
                        help="Chunk overlap in tokens (default: 50)")
    parser.add_argument("--local-only", action="store_true",
                        help="Run without Pinecone, save vectors locally")
    parser.add_argument("--output-dir", type=str, default="./data",
                        help="Output directory for local files (default: ./data)")

    args = parser.parse_args()

    # Fail fast when the input file is missing.
    if not os.path.exists(args.pdf):
        print(f"ERROR: PDF file not found: {args.pdf}")
        sys.exit(1)

    run_ingestion(
        pdf_path=args.pdf,
        index_name=args.index,
        namespace=args.namespace,
        chunk_size=args.chunk_size,
        chunk_overlap=args.overlap,
        local_only=args.local_only,
        output_dir=args.output_dir,
    )
|
| 365 |
+
|
| 366 |
+
|
| 367 |
+
# Allow running this module directly as a CLI ingestion script.
if __name__ == "__main__":
    main()
|
app/rag_pipeline.py
ADDED
|
@@ -0,0 +1,506 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
rag_pipeline.py - LangGraph RAG Pipeline
|
| 3 |
+
|
| 4 |
+
This module implements the RAG pipeline using LangGraph for orchestration:
|
| 5 |
+
1. Receive user query
|
| 6 |
+
2. Embed query using sentence-transformers
|
| 7 |
+
3. Query Pinecone for top-k similar chunks
|
| 8 |
+
4. Generate answer using LLM (if available) or extractive fallback
|
| 9 |
+
5. Return structured response with answer, chunks, and confidence
|
| 10 |
+
|
| 11 |
+
The pipeline enforces strict grounding - answers must come from retrieved chunks only.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import os
|
| 15 |
+
import json
|
| 16 |
+
from typing import List, Dict, Any, Optional, TypedDict
|
| 17 |
+
from dotenv import load_dotenv
|
| 18 |
+
|
| 19 |
+
# Load environment variables
|
| 20 |
+
load_dotenv()
|
| 21 |
+
|
| 22 |
+
# Import LangGraph components
|
| 23 |
+
try:
|
| 24 |
+
from langgraph.graph import StateGraph, END
|
| 25 |
+
LANGGRAPH_AVAILABLE = True
|
| 26 |
+
except ImportError:
|
| 27 |
+
LANGGRAPH_AVAILABLE = False
|
| 28 |
+
print("WARNING: langgraph not installed. Using simplified pipeline.")
|
| 29 |
+
|
| 30 |
+
# Import embedding model
|
| 31 |
+
from sentence_transformers import SentenceTransformer
|
| 32 |
+
|
| 33 |
+
# Import local modules
|
| 34 |
+
from app.vectorstore import PineconeVectorStore, LocalVectorStore, get_vector_store
|
| 35 |
+
from app.utils import compute_confidence, normalize_score, format_chunks_for_llm, load_chunks_from_jsonl
|
| 36 |
+
|
| 37 |
+
# Try to import OpenAI
|
| 38 |
+
try:
|
| 39 |
+
from openai import OpenAI
|
| 40 |
+
OPENAI_AVAILABLE = True
|
| 41 |
+
except ImportError:
|
| 42 |
+
OPENAI_AVAILABLE = False
|
| 43 |
+
|
| 44 |
+
# Try to import Groq (free LLM alternative)
|
| 45 |
+
try:
|
| 46 |
+
from groq import Groq
|
| 47 |
+
GROQ_AVAILABLE = True
|
| 48 |
+
except ImportError:
|
| 49 |
+
GROQ_AVAILABLE = False
|
| 50 |
+
|
| 51 |
+
# Cache the embedding model to avoid reloading
|
| 52 |
+
_EMBEDDING_MODEL_CACHE = {}
|
| 53 |
+
|
| 54 |
+
def get_embedding_model(model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
    """Return the SentenceTransformer for *model_name*, loading it at most once.

    Loaded models are memoized in the module-level ``_EMBEDDING_MODEL_CACHE``
    so repeated pipeline constructions reuse a single model instance.
    """
    cached = _EMBEDDING_MODEL_CACHE.get(model_name)
    if cached is None:
        print(f"Loading embedding model: {model_name}")
        # Pin to CPU explicitly to avoid meta tensor issues
        cached = SentenceTransformer(model_name, device='cpu')
        _EMBEDDING_MODEL_CACHE[model_name] = cached
    return cached
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
# ============================================================================
|
| 65 |
+
# LangGraph State Definition
|
| 66 |
+
# ============================================================================
|
| 67 |
+
|
| 68 |
+
class RAGState(TypedDict):
    """State object passed through the RAG pipeline."""
    query: str  # raw user question
    query_embedding: Optional[List[float]]  # vector for `query`; None until the embed node runs
    retrieved_chunks: List[Dict]  # chunks returned by the vector store (id/page/text/score)
    raw_scores: List[float]  # unnormalized similarity scores, parallel to retrieved_chunks
    confidence: float  # confidence derived from raw_scores (0.0 when nothing retrieved)
    final_answer: str  # LLM-generated or extractive answer text
    use_llm: bool  # whether the generate node may call an LLM
    error: Optional[str]  # error message set by a node on failure, else None
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
# ============================================================================
|
| 81 |
+
# Pipeline Nodes (Functions)
|
| 82 |
+
# ============================================================================
|
| 83 |
+
|
| 84 |
+
class RAGPipeline:
    """
    RAG Pipeline implementation using LangGraph.

    The pipeline has the following stages:
    1. embed_query - Convert query to vector
    2. retrieve_chunks - Get relevant chunks from Pinecone
    3. compute_confidence - Calculate confidence score
    4. generate_answer - Use LLM or extractive fallback
    """

    # System prompt for LLM - VERY IMPORTANT for grounding
    SYSTEM_PROMPT = """You are an assistant answering questions based on provided document excerpts from an eBook about Agentic AI.

IMPORTANT INSTRUCTIONS:
1. Synthesize information from ALL the provided excerpts to give a comprehensive answer.
2. The excerpts may contain relevant information even if they don't directly state the answer - look for definitions, explanations, and examples.
3. Combine information from multiple excerpts when helpful.
4. Cite page numbers when referencing specific information.
5. Do NOT add any information that is not in the excerpts.
6. If absolutely no relevant information exists in ANY excerpt, only then say: "I could not find a supported answer in the document."

Be helpful and thorough - users want complete answers based on the document content."""

    def __init__(
        self,
        pinecone_api_key: Optional[str] = None,
        index_name: str = "agentic-ai-ebook",
        namespace: str = "agentic-ai",
        openai_api_key: Optional[str] = None,
        groq_api_key: Optional[str] = None,
        embedding_model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
        top_k: int = 6,
        local_only: bool = False,
        chunks_file: Optional[str] = None
    ):
        """
        Initialize the RAG pipeline.

        Args:
            pinecone_api_key: Pinecone API key (or set PINECONE_API_KEY env var)
            index_name: Name of Pinecone index
            namespace: Pinecone namespace
            openai_api_key: OpenAI API key for LLM (optional)
            groq_api_key: Groq API key for LLM (optional, free alternative)
            embedding_model_name: Name of embedding model
            top_k: Number of chunks to retrieve
            local_only: Use local vector store instead of Pinecone
            chunks_file: Path to chunks.jsonl for local retrieval
        """
        self.top_k = top_k
        self.local_only = local_only

        # Load embedding model (cached to avoid reloading)
        self.embedding_model = get_embedding_model(embedding_model_name)
        self.embedding_dim = self.embedding_model.get_sentence_embedding_dimension()

        # Initialize vector store
        if local_only:
            self.vector_store = LocalVectorStore(dimension=self.embedding_dim)
            # Load vectors from file if provided
            # NOTE(review): assumes the vectors file sits next to chunks.jsonl
            # under the name vectors.json - confirm against ingest.py output.
            vectors_file = chunks_file.replace('chunks.jsonl', 'vectors.json') if chunks_file else './data/vectors.json'
            if os.path.exists(vectors_file):
                self.vector_store.load_from_file(vectors_file)
        else:
            api_key = pinecone_api_key or os.getenv("PINECONE_API_KEY")
            self.vector_store = PineconeVectorStore(
                api_key=api_key,
                index_name=index_name,
                namespace=namespace,
                dimension=self.embedding_dim
            )
            # Connect to existing index
            if self.vector_store.pc:
                self.vector_store.index = self.vector_store.pc.Index(index_name)

        # Load chunks for full text retrieval
        if chunks_file and os.path.exists(chunks_file):
            self.vector_store.load_chunks_map(chunks_file)
        elif os.path.exists('./data/chunks.jsonl'):
            self.vector_store.load_chunks_map('./data/chunks.jsonl')

        # Initialize LLM client - prefer Groq (free), then OpenAI
        self.openai_client = None
        self.groq_client = None
        self.llm_provider = None

        # Try Groq first (it's free!)
        groq_key = groq_api_key or os.getenv("GROQ_API_KEY")
        if groq_key and GROQ_AVAILABLE:
            self.groq_client = Groq(api_key=groq_key)
            self.llm_provider = "groq"
            print("Groq client initialized - will use Groq LLM for answer generation (FREE!)")
        else:
            # Fall back to OpenAI
            openai_key = openai_api_key or os.getenv("OPENAI_API_KEY")
            if openai_key and OPENAI_AVAILABLE:
                self.openai_client = OpenAI(api_key=openai_key)
                self.llm_provider = "openai"
                print("OpenAI client initialized - will use OpenAI LLM for answer generation")
            else:
                print("No LLM key - will use extractive answer fallback")

        # Build the LangGraph pipeline (graph is None when langgraph is
        # missing; query() then runs the nodes sequentially by hand).
        if LANGGRAPH_AVAILABLE:
            self.graph = self._build_graph()
        else:
            self.graph = None

    def _build_graph(self) -> StateGraph:
        """Build the LangGraph state machine.

        Returns:
            A compiled StateGraph wiring the four nodes in a straight line:
            embed_query -> retrieve_chunks -> calculate_confidence -> generate_answer.
        """

        # Create the graph
        workflow = StateGraph(RAGState)

        # Add nodes
        workflow.add_node("embed_query", self._embed_query_node)
        workflow.add_node("retrieve_chunks", self._retrieve_chunks_node)
        workflow.add_node("calculate_confidence", self._calculate_confidence_node)
        workflow.add_node("generate_answer", self._generate_answer_node)

        # Define the flow
        workflow.set_entry_point("embed_query")
        workflow.add_edge("embed_query", "retrieve_chunks")
        workflow.add_edge("retrieve_chunks", "calculate_confidence")
        workflow.add_edge("calculate_confidence", "generate_answer")
        workflow.add_edge("generate_answer", END)

        # Compile the graph
        return workflow.compile()

    def _embed_query_node(self, state: RAGState) -> Dict:
        """Embed the user query.

        Returns a partial state update: {"query_embedding": [...]} on success,
        {"error": ...} on failure.
        """
        query = state["query"]

        try:
            # Generate embedding
            embedding = self.embedding_model.encode(query, convert_to_numpy=True)
            return {"query_embedding": embedding.tolist()}
        except Exception as e:
            return {"error": f"Embedding failed: {str(e)}"}

    def _retrieve_chunks_node(self, state: RAGState) -> Dict:
        """Retrieve relevant chunks from vector store.

        Reads query_embedding from state and returns retrieved_chunks plus
        the parallel raw_scores list, or an error update.
        """
        query_embedding = state.get("query_embedding")

        if not query_embedding:
            return {"error": "No query embedding available"}

        try:
            # Query vector store
            results = self.vector_store.query_top_k(
                query_vector=query_embedding,
                k=self.top_k
            )

            # Extract chunks and scores
            retrieved_chunks = []
            raw_scores = []

            for result in results:
                retrieved_chunks.append({
                    "id": result["id"],
                    "page": result["page"],
                    "text": result["text"],
                    # Rounded for display; raw_scores keeps full precision.
                    "score": round(result["score"], 4)
                })
                raw_scores.append(result["score"])

            return {
                "retrieved_chunks": retrieved_chunks,
                "raw_scores": raw_scores
            }

        except Exception as e:
            return {"error": f"Retrieval failed: {str(e)}"}

    def _calculate_confidence_node(self, state: RAGState) -> Dict:
        """Calculate confidence score from retrieval scores."""
        raw_scores = state.get("raw_scores", [])

        if not raw_scores:
            return {"confidence": 0.0}

        # Compute confidence using max of normalized scores
        confidence = compute_confidence(raw_scores, method="max")
        return {"confidence": confidence}

    def _generate_answer_node(self, state: RAGState) -> Dict:
        """Generate the final answer using LLM or extractive fallback.

        Tries the LLM path first when requested and a client exists; any LLM
        failure falls through to the extractive answer rather than erroring.
        """
        query = state["query"]
        chunks = state.get("retrieved_chunks", [])
        use_llm = state.get("use_llm", True)

        if not chunks:
            return {
                "final_answer": "I could not find any relevant information in the document."
            }

        # Format chunks for context
        context = format_chunks_for_llm(chunks)

        # Try LLM generation if available and requested
        if use_llm and (self.groq_client or self.openai_client):
            try:
                answer = self._generate_with_llm(query, context)
                return {"final_answer": answer}
            except Exception as e:
                print(f"LLM generation failed: {e}, falling back to extractive")

        # Extractive fallback - return the most relevant chunks
        answer = self._generate_extractive_answer(query, chunks)
        return {"final_answer": answer}

    def _generate_with_llm(self, query: str, context: str) -> str:
        """
        Generate answer using LLM (Groq or OpenAI).

        The prompt strictly instructs the model to only use provided context.

        Raises:
            Exception: if neither LLM client is configured.
        """
        # Construct the user message with context
        user_message = f"""===BEGIN EXCERPTS===
{context}
===END EXCERPTS===

Question: {query}

Answer the question using ONLY the information from the excerpts above. If the answer is not in the excerpts, say "I could not find a supported answer in the document."
"""

        # Use Groq if available (it's free!)
        if self.groq_client:
            response = self.groq_client.chat.completions.create(
                model="llama-3.1-8b-instant",  # Fast and free model
                messages=[
                    {"role": "system", "content": self.SYSTEM_PROMPT},
                    {"role": "user", "content": user_message}
                ],
                temperature=0.1,  # Low temperature for more factual responses
                max_tokens=500
            )
            return response.choices[0].message.content

        # Fall back to OpenAI
        if self.openai_client:
            response = self.openai_client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": self.SYSTEM_PROMPT},
                    {"role": "user", "content": user_message}
                ],
                temperature=0.1,
                max_tokens=500
            )
            return response.choices[0].message.content

        raise Exception("No LLM client available")

    def _generate_extractive_answer(self, query: str, chunks: List[Dict]) -> str:
        """
        Generate an extractive answer by returning the most relevant chunks.

        This is the fallback when no LLM is available.
        """
        # Header
        answer_parts = [
            "**Answer based on document excerpts:**\n",
            "*(No LLM available - showing relevant passages from the document)*\n"
        ]

        # Add top chunks (limit to top 2-3 for readability)
        top_chunks = chunks[:3]

        for i, chunk in enumerate(top_chunks, 1):
            page = chunk.get("page", "unknown")
            text = chunk.get("text", "")
            score = chunk.get("score", 0)

            # Truncate very long chunks
            if len(text) > 500:
                text = text[:500] + "..."

            answer_parts.append(f"\n**[Excerpt {i}, Page {page}]** (relevance: {score:.2f})")
            answer_parts.append(f"\n{text}\n")

        return "".join(answer_parts)

    def query(
        self,
        user_query: str,
        top_k: Optional[int] = None,
        use_llm: bool = True
    ) -> Dict[str, Any]:
        """
        Run a query through the RAG pipeline.

        Args:
            user_query: The user's question
            top_k: Number of chunks to retrieve (overrides default)
            use_llm: Whether to use LLM for generation

        Returns:
            Dict with final_answer, retrieved_chunks, and confidence
        """
        # Override top_k if provided
        # NOTE(review): this assignment persists on the instance, so the
        # override affects all subsequent query() calls, not just this one -
        # confirm this is intended.
        if top_k:
            self.top_k = top_k

        # Initial state
        initial_state: RAGState = {
            "query": user_query,
            "query_embedding": None,
            "retrieved_chunks": [],
            "raw_scores": [],
            "confidence": 0.0,
            "final_answer": "",
            "use_llm": use_llm and (self.groq_client is not None or self.openai_client is not None),
            "error": None
        }

        # Run the pipeline
        if self.graph:
            # Use LangGraph
            final_state = self.graph.invoke(initial_state)
        else:
            # Fallback: run nodes manually (same order as the compiled graph)
            final_state = initial_state

            # Embed query
            result = self._embed_query_node(final_state)
            final_state.update(result)

            # Retrieve chunks
            result = self._retrieve_chunks_node(final_state)
            final_state.update(result)

            # Calculate confidence
            result = self._calculate_confidence_node(final_state)
            final_state.update(result)

            # Generate answer
            result = self._generate_answer_node(final_state)
            final_state.update(result)

        # Format response
        response = {
            "final_answer": final_state.get("final_answer", ""),
            "retrieved_chunks": final_state.get("retrieved_chunks", []),
            "confidence": final_state.get("confidence", 0.0)
        }

        # Add error if any
        if final_state.get("error"):
            response["error"] = final_state["error"]

        return response
|
| 440 |
+
|
| 441 |
+
|
| 442 |
+
# ============================================================================
|
| 443 |
+
# Convenience function for simple usage
|
| 444 |
+
# ============================================================================
|
| 445 |
+
|
| 446 |
+
def create_rag_pipeline(
    pinecone_api_key: Optional[str] = None,
    openai_api_key: Optional[str] = None,
    index_name: str = "agentic-ai-ebook",
    local_only: bool = False
) -> RAGPipeline:
    """Build a RAGPipeline exposing only the most common configuration knobs.

    Args:
        pinecone_api_key: Pinecone API key
        openai_api_key: OpenAI API key
        index_name: Name of Pinecone index
        local_only: Use local storage instead of Pinecone

    Returns:
        Configured RAGPipeline instance
    """
    settings = {
        "pinecone_api_key": pinecone_api_key,
        "openai_api_key": openai_api_key,
        "index_name": index_name,
        "local_only": local_only,
    }
    return RAGPipeline(**settings)
|
| 470 |
+
|
| 471 |
+
|
| 472 |
+
# ============================================================================
|
| 473 |
+
# Main - Quick test
|
| 474 |
+
# ============================================================================
|
| 475 |
+
|
| 476 |
+
# Smoke test: run one hard-coded question end-to-end and print the result.
# Uses local mode automatically when no PINECONE_API_KEY is configured.
if __name__ == "__main__":
    print("Testing RAG Pipeline...")
    print("=" * 60)

    # Check if we should use local mode
    local_mode = not os.getenv("PINECONE_API_KEY")

    if local_mode:
        print("No PINECONE_API_KEY found, using local mode")
        print("Make sure you have run ingest.py with --local-only first!")

    # Create pipeline
    pipeline = RAGPipeline(local_only=local_mode)

    # Test query
    test_query = "What is agentic AI?"
    print(f"\nTest query: {test_query}")
    print("-" * 40)

    result = pipeline.query(test_query)

    print(f"\nFinal Answer:")
    print(result["final_answer"])
    print(f"\nConfidence: {result['confidence']}")
    print(f"\nRetrieved Chunks: {len(result['retrieved_chunks'])}")

    # One summary line per retrieved chunk.
    for chunk in result["retrieved_chunks"]:
        print(f"  - {chunk['id']} (page {chunk['page']}, score: {chunk['score']})")

    print("\n" + "=" * 60)
    print("Pipeline test complete!")
|
app/utils.py
ADDED
|
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
utils.py - Helper functions for text processing and chunking
|
| 3 |
+
|
| 4 |
+
This module contains utility functions for:
|
| 5 |
+
- Text cleaning (removing extra whitespace, headers/footers)
|
| 6 |
+
- Token counting using tiktoken
|
| 7 |
+
- Text chunking with overlap
|
| 8 |
+
- Confidence score normalization
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import re
|
| 12 |
+
from typing import List, Dict, Tuple
|
| 13 |
+
import json
|
| 14 |
+
|
| 15 |
+
# Try to use tiktoken for accurate token counting, fallback to word count
|
| 16 |
+
try:
|
| 17 |
+
import tiktoken
|
| 18 |
+
TOKENIZER = tiktoken.get_encoding("cl100k_base")
|
| 19 |
+
USE_TIKTOKEN = True
|
| 20 |
+
except ImportError:
|
| 21 |
+
USE_TIKTOKEN = False
|
| 22 |
+
print("WARNING: tiktoken not available, using word count approximation")
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def count_tokens(text: str) -> int:
    """Return the token count of *text*.

    Uses the cl100k_base tiktoken encoding when available; otherwise falls
    back to a word-based approximation (~1.3 tokens per word).

    Args:
        text: Input text string

    Returns:
        Number of tokens (approximate when tiktoken is unavailable)
    """
    if not USE_TIKTOKEN:
        # Heuristic fallback: on average a word maps to roughly 1.3 tokens.
        return int(len(text.split()) * 1.3)
    return len(TOKENIZER.encode(text))
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def clean_text(text: str) -> str:
    """Normalize raw PDF-extracted text.

    Collapses runs of spaces/tabs into one space, squeezes three-or-more
    newlines down to a single blank line, strips common page-number
    artifacts ("Page 3", "- 3 -", bare numeric lines), and trims whitespace
    from every line and from the whole string.

    Args:
        text: Raw text from PDF extraction

    Returns:
        Cleaned text string
    """
    # Horizontal whitespace runs (spaces/tabs) -> a single space.
    cleaned = re.sub(r'[ \t]+', ' ', text)

    # Three or more consecutive newlines -> exactly one blank line.
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)

    # Page-number artifacts: "Page 12" (any case) and "- 12 -".
    cleaned = re.sub(r'(?i)page\s*\d+', '', cleaned)
    cleaned = re.sub(r'-\s*\d+\s*-', '', cleaned)

    # Lines consisting of nothing but a number (simple heuristic -
    # may need tuning for a specific PDF's header/footer layout).
    cleaned = re.sub(r'^\s*\d+\s*$', '', cleaned, flags=re.MULTILINE)

    # Trim each line, then trim the whole result.
    trimmed_lines = (line.strip() for line in cleaned.split('\n'))
    return '\n'.join(trimmed_lines).strip()
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def chunk_text(
    text: str,
    page_number: int,
    chunk_size: int = 500,
    chunk_overlap: int = 50,
    source: str = "Ebook-Agentic-AI.pdf"
) -> List[Dict]:
    """
    Split text into overlapping chunks with metadata.

    Uses token counting to ensure chunks are approximately chunk_size tokens,
    with overlap for context continuity.

    Args:
        text: Text to chunk (from one page)
        page_number: Page number for metadata
        chunk_size: Target size in tokens (default 500)
        chunk_overlap: Overlap between chunks in tokens (default 50)
        source: Source document name

    Returns:
        List of chunk dictionaries with id, page, text, start_char, end_char
    """
    chunks: List[Dict] = []

    # Guard empty/None input BEFORE calling .strip(): the original combined
    # check crashed with AttributeError when text was None.
    if not text:
        return chunks

    # If the whole page fits in a single chunk, return it as-is.
    if count_tokens(text) <= chunk_size:
        if text.strip():
            chunks.append({
                "id": f"pdfpage_{page_number}_chunk_0",
                "page": page_number,
                "text": text.strip(),
                "start_char": 0,
                "end_char": len(text),
                "source": source
            })
        return chunks

    # Split into sentences for better chunk boundaries.
    # Simple sentence splitting — handles common cases.
    sentences = re.split(r'(?<=[.!?])\s+', text)

    current_chunk = []   # sentences accumulated for the chunk in progress
    current_tokens = 0   # token count of current_chunk
    current_start = 0    # char offset where the current chunk begins
    chunk_index = 0
    char_position = 0    # running char offset through the page text

    for sentence in sentences:
        sentence_tokens = count_tokens(sentence)

        # Flush the current chunk once adding this sentence would overflow it.
        if current_tokens + sentence_tokens > chunk_size and current_chunk:
            # Renamed from `chunk_text` — the original shadowed this function.
            joined = ' '.join(current_chunk)
            chunks.append({
                "id": f"pdfpage_{page_number}_chunk_{chunk_index}",
                "page": page_number,
                "text": joined,
                "start_char": current_start,
                "end_char": current_start + len(joined),
                "source": source
            })
            chunk_index += 1

            # Seed the next chunk with the trailing sentences that fit in the
            # overlap budget, preserving context across the boundary.
            overlap_tokens = 0
            overlap_sentences = []
            for s in reversed(current_chunk):
                s_tokens = count_tokens(s)
                if overlap_tokens + s_tokens <= chunk_overlap:
                    overlap_sentences.insert(0, s)
                    overlap_tokens += s_tokens
                else:
                    break

            current_chunk = overlap_sentences
            current_tokens = overlap_tokens
            current_start = char_position - sum(len(s) + 1 for s in overlap_sentences)

        current_chunk.append(sentence)
        current_tokens += sentence_tokens
        char_position += len(sentence) + 1  # +1 for the separating space

    # Don't forget the trailing chunk!
    if current_chunk:
        joined = ' '.join(current_chunk)
        chunks.append({
            "id": f"pdfpage_{page_number}_chunk_{chunk_index}",
            "page": page_number,
            "text": joined,
            "start_char": current_start,
            "end_char": current_start + len(joined),
            "source": source
        })

    return chunks
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
def normalize_score(score: float) -> float:
    """
    Normalize a cosine similarity score to the 0-1 range.

    Pinecone cosine scores typically lie in [-1, 1]; this maps them via
    normalized = (score + 1) / 2, then clamps to [0, 1] for safety.

    Args:
        score: Raw similarity score from Pinecone

    Returns:
        Normalized score between 0.0 and 1.0
    """
    shifted = 0.5 * (score + 1.0)

    # Clamp defensively in case the backend returns an out-of-range value.
    if shifted < 0.0:
        return 0.0
    if shifted > 1.0:
        return 1.0
    return shifted
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
def compute_confidence(scores: List[float], method: str = "max") -> float:
    """
    Compute a confidence value from a list of similarity scores.

    Args:
        scores: List of raw similarity scores from retrieval
        method: "max" for maximum score, "mean" for average

    Returns:
        Confidence score rounded to 3 decimal places
    """
    if not scores:
        return 0.0

    normalized = [normalize_score(raw) for raw in scores]

    # "mean" averages the scores; any other value (including "max") takes
    # the single best match, matching the documented default.
    if method == "mean":
        value = sum(normalized) / len(normalized)
    else:
        value = max(normalized)

    return round(value, 3)
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
def save_chunks_to_jsonl(chunks: List[Dict], filepath: str, include_embeddings: bool = False):
    """
    Save chunks to a JSONL file for backup.

    Args:
        chunks: List of chunk dictionaries
        filepath: Output file path
        include_embeddings: Whether to include embeddings (makes file large)
    """
    with open(filepath, 'w', encoding='utf-8') as f:
        for chunk in chunks:
            # Shallow copy so stripping the embedding never mutates the
            # caller's dictionaries.
            record = dict(chunk)
            if not include_embeddings:
                record.pop('embedding', None)
            f.write(json.dumps(record, ensure_ascii=False) + '\n')

    print(f"Saved {len(chunks)} chunks to {filepath}")
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
def load_chunks_from_jsonl(filepath: str) -> List[Dict]:
    """
    Load chunks from a JSONL file.

    Args:
        filepath: Input file path

    Returns:
        List of chunk dictionaries
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        # Blank lines are skipped; every other line is one JSON chunk record.
        chunks = [json.loads(raw) for raw in f if raw.strip()]

    print(f"Loaded {len(chunks)} chunks from {filepath}")
    return chunks
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
def format_chunks_for_llm(chunks: List[Dict]) -> str:
    """
    Format retrieved chunks into a single context string for the LLM.

    Each chunk is prefixed with a provenance marker (chunk id and page) so
    answers can cite their sources; chunks are separated by a rule.

    Args:
        chunks: List of chunk dictionaries with 'text' and 'page' keys

    Returns:
        Formatted string with markers for the LLM
    """
    parts = [
        f"[Source: {c.get('id', f'chunk_{idx}')}, Page {c.get('page', 'unknown')}]\n"
        f"{c.get('text', '')}"
        for idx, c in enumerate(chunks)
    ]
    return "\n\n---\n\n".join(parts)
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
if __name__ == "__main__":
    # Manual smoke test: exercises each helper and prints results for
    # visual inspection (no assertions beyond running without error).
    print("Testing utils.py functions...")

    # Test token counting
    test_text = "This is a test sentence for token counting."
    print(f"Token count for '{test_text}': {count_tokens(test_text)}")

    # Test text cleaning (extra spaces, excess newlines, page-number artifact)
    dirty_text = " This has extra spaces \n\n\n\nAnd too many newlines Page 123"
    clean = clean_text(dirty_text)
    print(f"Cleaned text: '{clean}'")

    # Test score normalization across the full [-1, 1] cosine range
    test_scores = [-1.0, 0.0, 0.5, 1.0]
    for score in test_scores:
        print(f"Score {score} -> normalized: {normalize_score(score)}")

    # Test confidence computation with both aggregation methods
    scores = [0.8, 0.6, 0.7]
    print(f"Confidence (max): {compute_confidence(scores, 'max')}")
    print(f"Confidence (mean): {compute_confidence(scores, 'mean')}")

    print("\nAll tests passed!")
|
app/vectorstore.py
ADDED
|
@@ -0,0 +1,428 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
vectorstore.py - Pinecone Vector Database Wrapper
|
| 3 |
+
|
| 4 |
+
This module provides a clean wrapper around the Pinecone Python client for:
|
| 5 |
+
- Creating an index if it doesn't exist
|
| 6 |
+
- Upserting vectors in batches
|
| 7 |
+
- Querying for similar vectors (top-k retrieval)
|
| 8 |
+
|
| 9 |
+
Requires: PINECONE_API_KEY environment variable
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import os
|
| 13 |
+
from typing import List, Dict, Optional, Tuple
|
| 14 |
+
from dotenv import load_dotenv
|
| 15 |
+
import json
|
| 16 |
+
|
| 17 |
+
# Load environment variables
|
| 18 |
+
load_dotenv()
|
| 19 |
+
|
| 20 |
+
# Try to import Pinecone
|
| 21 |
+
try:
|
| 22 |
+
from pinecone import Pinecone, ServerlessSpec
|
| 23 |
+
PINECONE_AVAILABLE = True
|
| 24 |
+
except ImportError:
|
| 25 |
+
PINECONE_AVAILABLE = False
|
| 26 |
+
print("WARNING: pinecone-client not installed. Vector operations will be disabled.")
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class PineconeVectorStore:
    """
    Wrapper class for Pinecone vector database operations.

    Provides simple methods for creating indexes, upserting vectors,
    and querying for similar vectors.

    Without a valid API key (argument or PINECONE_API_KEY env var) the
    instance stays disconnected and every operation degrades to a printed
    warning/no-op instead of raising.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        index_name: str = "agentic-ai-ebook",
        namespace: str = "agentic-ai",
        dimension: int = 384,  # all-MiniLM-L6-v2 produces 384-dim vectors
        metric: str = "cosine"
    ):
        """
        Initialize the Pinecone vector store.

        Args:
            api_key: Pinecone API key (or set PINECONE_API_KEY env var)
            index_name: Name of the Pinecone index
            namespace: Namespace within the index
            dimension: Dimension of vectors (384 for all-MiniLM-L6-v2)
            metric: Similarity metric ('cosine', 'euclidean', 'dotproduct')
        """
        self.api_key = api_key or os.getenv("PINECONE_API_KEY")
        self.index_name = index_name
        self.namespace = namespace
        self.dimension = dimension
        self.metric = metric

        # Populated by _initialize_pinecone() / create_index_if_missing();
        # kept as None when the client is unavailable so methods can no-op.
        self.pc = None
        self.index = None

        # Local chunk storage for retrieval (maps chunk_id -> chunk_data).
        # Used to restore full chunk text that is truncated in Pinecone metadata.
        self.chunks_map: Dict[str, Dict] = {}

        if self.api_key and PINECONE_AVAILABLE:
            self._initialize_pinecone()
        else:
            print("WARNING: Running without Pinecone. Use --local-only mode for local storage.")

    def _initialize_pinecone(self) -> None:
        """Initialize connection to Pinecone; on failure, leave self.pc as None."""
        try:
            self.pc = Pinecone(api_key=self.api_key)
            print(f"Connected to Pinecone successfully!")
        except Exception as e:
            print(f"ERROR: Failed to connect to Pinecone: {e}")
            self.pc = None

    def create_index_if_missing(self) -> bool:
        """
        Create the Pinecone index if it doesn't exist, then connect to it.

        Returns:
            True if index exists or was created, False on error
        """
        if not self.pc:
            print("ERROR: Pinecone not initialized")
            return False

        try:
            # Get list of existing indexes
            existing_indexes = [idx.name for idx in self.pc.list_indexes()]

            if self.index_name not in existing_indexes:
                print(f"Creating new index: {self.index_name}")

                # Create serverless index (free tier compatible)
                # NOTE(review): create_index may return before the index is
                # ready to accept traffic — confirm if immediate upserts fail.
                self.pc.create_index(
                    name=self.index_name,
                    dimension=self.dimension,
                    metric=self.metric,
                    spec=ServerlessSpec(
                        cloud="aws",
                        region="us-east-1"  # Free tier region
                    )
                )
                print(f"Index '{self.index_name}' created successfully!")
            else:
                print(f"Index '{self.index_name}' already exists")

            # Connect to the index
            self.index = self.pc.Index(self.index_name)
            return True

        except Exception as e:
            print(f"ERROR: Failed to create/connect to index: {e}")
            return False

    def upsert(
        self,
        items: List[Dict],
        batch_size: int = 100
    ) -> int:
        """
        Upsert vectors to Pinecone in batches.

        Also caches each item's full metadata in self.chunks_map so that
        query results can recover text truncated by the metadata limit.

        Args:
            items: List of dicts with 'id', 'embedding', and metadata
            batch_size: Number of vectors per batch (default 100)

        Returns:
            Number of vectors upserted (failed batches are skipped, not retried)
        """
        if not self.index:
            print("ERROR: Index not initialized. Call create_index_if_missing() first.")
            return 0

        # Store chunks locally for retrieval
        for item in items:
            self.chunks_map[item['id']] = {
                'id': item['id'],
                'page': item.get('page', 0),
                'text': item.get('text', ''),
                'source': item.get('source', '')
            }

        # Prepare vectors for Pinecone format
        vectors = []
        for item in items:
            vector = {
                'id': item['id'],
                'values': item['embedding'],
                'metadata': {
                    'page': item.get('page', 0),
                    'text': item.get('text', '')[:1000],  # Pinecone metadata limit
                    'source': item.get('source', '')
                }
            }
            vectors.append(vector)

        # Upsert in batches
        total_upserted = 0
        for i in range(0, len(vectors), batch_size):
            batch = vectors[i:i + batch_size]
            try:
                self.index.upsert(
                    vectors=batch,
                    namespace=self.namespace
                )
                total_upserted += len(batch)
                print(f"Upserted batch {i//batch_size + 1}: {len(batch)} vectors")
            except Exception as e:
                # Best-effort: a failed batch is logged and skipped so the
                # remaining batches still go through.
                print(f"ERROR: Failed to upsert batch: {e}")

        print(f"Total vectors upserted: {total_upserted}")
        return total_upserted

    def query_top_k(
        self,
        query_vector: List[float],
        k: int = 5,
        include_metadata: bool = True
    ) -> List[Dict]:
        """
        Query Pinecone for top-k similar vectors.

        Args:
            query_vector: Query embedding vector
            k: Number of results to return
            include_metadata: Whether to include metadata in results

        Returns:
            List of results with id, score, and metadata (empty list on error)
        """
        if not self.index:
            print("ERROR: Index not initialized")
            return []

        try:
            results = self.index.query(
                vector=query_vector,
                top_k=k,
                namespace=self.namespace,
                include_metadata=include_metadata
            )

            # Format results
            # NOTE(review): assumes the Pinecone response supports dict-style
            # access on matches ('id'/'score'/'metadata') — verify against the
            # installed pinecone client version.
            formatted_results = []
            for match in results.get('matches', []):
                result = {
                    'id': match['id'],
                    'score': match['score'],
                    'page': match.get('metadata', {}).get('page', 0),
                    'text': match.get('metadata', {}).get('text', ''),
                    'source': match.get('metadata', {}).get('source', '')
                }

                # If text is truncated in metadata, try to get full text from local cache
                if result['id'] in self.chunks_map:
                    result['text'] = self.chunks_map[result['id']].get('text', result['text'])

                formatted_results.append(result)

            return formatted_results

        except Exception as e:
            print(f"ERROR: Query failed: {e}")
            return []

    def load_chunks_map(self, filepath: str) -> None:
        """
        Load chunk data from a JSONL file to enable full text retrieval.

        Args:
            filepath: Path to chunks.jsonl file
        """
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                for line in f:
                    if line.strip():
                        chunk = json.loads(line)
                        self.chunks_map[chunk['id']] = chunk
            print(f"Loaded {len(self.chunks_map)} chunks into memory")
        except FileNotFoundError:
            # Missing cache file is non-fatal: queries fall back to the
            # (possibly truncated) text stored in Pinecone metadata.
            print(f"WARNING: {filepath} not found. Full text retrieval may be limited.")
        except Exception as e:
            print(f"ERROR: Failed to load chunks: {e}")

    def get_index_stats(self) -> Dict:
        """
        Get statistics about the Pinecone index.

        Returns:
            Dictionary with index statistics, or {"error": ...} on failure
        """
        if not self.index:
            return {"error": "Index not initialized"}

        try:
            stats = self.index.describe_index_stats()
            return {
                "total_vectors": stats.get('total_vector_count', 0),
                "namespaces": stats.get('namespaces', {}),
                "dimension": stats.get('dimension', self.dimension)
            }
        except Exception as e:
            return {"error": str(e)}
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
class LocalVectorStore:
    """
    Local in-memory vector store for testing without Pinecone.

    Stores vectors in a dict and performs brute-force cosine similarity
    search. Useful for --local-only mode and testing.
    """

    def __init__(self, dimension: int = 384):
        """
        Initialize local vector store.

        Args:
            dimension: Dimension of vectors
        """
        self.dimension = dimension
        self.vectors: Dict[str, Dict] = {}  # id -> {embedding, page, text, source}
        print("Using LOCAL vector store (no Pinecone)")

    def upsert(self, items: List[Dict]) -> int:
        """
        Add (or overwrite) vectors in the local store.

        Args:
            items: List of dicts with 'id', 'embedding', and optional metadata

        Returns:
            Number of vectors stored
        """
        for item in items:
            self.vectors[item['id']] = {
                'embedding': item['embedding'],
                'page': item.get('page', 0),
                'text': item.get('text', ''),
                'source': item.get('source', '')
            }
        print(f"Stored {len(items)} vectors locally")
        return len(items)

    def query_top_k(
        self,
        query_vector: List[float],
        k: int = 5
    ) -> List[Dict]:
        """
        Brute-force cosine similarity search over all stored vectors.

        Args:
            query_vector: Query embedding
            k: Number of results

        Returns:
            Top-k results sorted by descending similarity score
        """
        # numpy is imported lazily so the module loads even when numpy is
        # only needed for local-store queries.
        import numpy as np

        if not self.vectors:
            return []

        query_np = np.array(query_vector)
        # Hoisted out of the loop: the query norm is invariant per query
        # (the original recomputed it for every stored vector).
        query_norm = np.linalg.norm(query_np)

        scores = []
        for vec_id, data in self.vectors.items():
            vec_np = np.array(data['embedding'])

            # Cosine similarity; epsilon guards against division by zero.
            similarity = np.dot(query_np, vec_np) / (
                query_norm * np.linalg.norm(vec_np) + 1e-8
            )

            scores.append({
                'id': vec_id,
                'score': float(similarity),
                'page': data['page'],
                'text': data['text'],
                'source': data['source']
            })

        # Sort by score descending and return top-k.
        scores.sort(key=lambda x: x['score'], reverse=True)
        return scores[:k]

    def save_to_file(self, filepath: str):
        """Save vectors to a JSON file (uses the module-level json import)."""
        with open(filepath, 'w') as f:
            json.dump(self.vectors, f)
        print(f"Saved {len(self.vectors)} vectors to {filepath}")

    def load_from_file(self, filepath: str):
        """Load vectors from a JSON file; warn (don't raise) if it is missing."""
        try:
            with open(filepath, 'r') as f:
                self.vectors = json.load(f)
            print(f"Loaded {len(self.vectors)} vectors from {filepath}")
        except FileNotFoundError:
            print(f"WARNING: {filepath} not found")
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
def get_vector_store(
    local_only: bool = False,
    api_key: Optional[str] = None,
    index_name: str = "agentic-ai-ebook",
    **kwargs
):
    """
    Factory returning the appropriate vector store backend.

    Falls back to the in-memory LocalVectorStore when local-only mode is
    requested or the pinecone client is not installed.

    Args:
        local_only: If True, use local storage instead of Pinecone
        api_key: Pinecone API key
        index_name: Name of the index
        **kwargs: Extra keyword arguments forwarded to the store constructor

    Returns:
        Vector store instance (PineconeVectorStore or LocalVectorStore)
    """
    use_local = local_only or not PINECONE_AVAILABLE
    if use_local:
        return LocalVectorStore(**kwargs)

    return PineconeVectorStore(api_key=api_key, index_name=index_name, **kwargs)
|
| 390 |
+
|
| 391 |
+
|
| 392 |
+
if __name__ == "__main__":
    # Manual smoke test for the local store only (no Pinecone credentials
    # required); prints scores for visual inspection.
    print("Testing vectorstore.py...")

    # Test local vector store
    local_store = LocalVectorStore(dimension=384)

    # Add some dummy vectors (random embeddings — scores are meaningless,
    # this only checks the upsert/query plumbing)
    import numpy as np
    test_items = [
        {
            'id': 'test_1',
            'embedding': np.random.randn(384).tolist(),
            'page': 1,
            'text': 'This is a test chunk about AI.',
            'source': 'test.pdf'
        },
        {
            'id': 'test_2',
            'embedding': np.random.randn(384).tolist(),
            'page': 2,
            'text': 'This chunk discusses machine learning.',
            'source': 'test.pdf'
        }
    ]

    local_store.upsert(test_items)

    # Query
    query_vec = np.random.randn(384).tolist()
    results = local_store.query_top_k(query_vec, k=2)

    print(f"\nQuery results: {len(results)} matches")
    for r in results:
        print(f" - {r['id']}: score={r['score']:.3f}")

    print("\nLocal vector store test passed!")
|
architecture.md
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Architecture Overview
|
| 2 |
+
|
| 3 |
+
This document explains the architecture of the RAG (Retrieval-Augmented Generation) chatbot for the Agentic AI eBook.
|
| 4 |
+
|
| 5 |
+
## System Overview
|
| 6 |
+
|
| 7 |
+
The system follows a standard RAG pattern: documents are chunked and embedded into a vector database during ingestion, then at query time, relevant chunks are retrieved and used to generate grounded answers.
|
| 8 |
+
|
| 9 |
+
### Key Components
|
| 10 |
+
|
| 11 |
+
1. **Ingestion Pipeline** (`app/ingest.py`) - Processes the PDF, creates chunks, generates embeddings, and stores in Pinecone
|
| 12 |
+
2. **Vector Store** (`app/vectorstore.py`) - Wrapper around Pinecone for storing and retrieving vectors
|
| 13 |
+
3. **RAG Pipeline** (`app/rag_pipeline.py`) - LangGraph-based pipeline for query processing
|
| 14 |
+
4. **Streamlit UI** (`streamlit_app/app.py`) - Web interface for user interactions
|
| 15 |
+
|
| 16 |
+
---
|
| 17 |
+
|
| 18 |
+
## Architecture Diagram
|
| 19 |
+
|
| 20 |
+
```
|
| 21 |
+
INGESTION FLOW
|
| 22 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 23 |
+
β β
|
| 24 |
+
β ββββββββββββ ββββββββββββ ββββββββββββ ββββββββββββββββββββ β
|
| 25 |
+
β β PDF βββββΆβ Extract βββββΆβ Clean βββββΆβ Chunk β β
|
| 26 |
+
β β File β β Text β β Text β β (500 tokens, β β
|
| 27 |
+
β β β β by Page β β β β 50 overlap) β β
|
| 28 |
+
β ββββββββββββ ββββββββββββ ββββββββββββ ββββββββββ¬ββββββββββ β
|
| 29 |
+
β β β
|
| 30 |
+
β βΌ β
|
| 31 |
+
β ββββββββββββββββββββ ββββββββββββββββββββ ββββββββββββββββββββ β
|
| 32 |
+
β β Pinecone ββββββ Upsert ββββββ Embeddings β β
|
| 33 |
+
β β Vector Store β β Vectors β β (MiniLM-L6-v2) β β
|
| 34 |
+
β β β β β β 384 dims β β
|
| 35 |
+
β ββββββββββββββββββββ ββββββββββββββββββββ ββββββββββββββββββββ β
|
| 36 |
+
β β
|
| 37 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
QUERY FLOW
|
| 41 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 42 |
+
β β
|
| 43 |
+
β ββββββββββββ β
|
| 44 |
+
β β User β β
|
| 45 |
+
β β Query β β
|
| 46 |
+
β ββββββ¬ββββββ β
|
| 47 |
+
β β β
|
| 48 |
+
β βΌ β
|
| 49 |
+
β ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ β
|
| 50 |
+
β β LANGGRAPH PIPELINE β β
|
| 51 |
+
β β β β
|
| 52 |
+
β β βββββββββββββββ βββββββββββββββ βββββββββββββββ β β
|
| 53 |
+
β β β Embed ββββΆβ Retrieve ββββΆβ Calculate β β β
|
| 54 |
+
β β β Query β β Top-K β β Confidence β β β
|
| 55 |
+
β β β β β Chunks β β β β β
|
| 56 |
+
β β βββββββββββββββ ββββββββ¬βββββββ ββββββββ¬βββββββ β β
|
| 57 |
+
β β β β β β
|
| 58 |
+
β β βΌ βΌ β β
|
| 59 |
+
β β βββββββββββββββββββββββββββββββ β β
|
| 60 |
+
β β β Generate Answer β β β
|
| 61 |
+
β β β β β β
|
| 62 |
+
β β β βββββββββββββββββββββββ β β β
|
| 63 |
+
β β β β If OpenAI Key: β β β β
|
| 64 |
+
β β β β β LLM Generation β β β β
|
| 65 |
+
β β β β (grounded prompt) β β β β
|
| 66 |
+
β β β βββββββββββββββββββββββ€ β β β
|
| 67 |
+
β β β β Else: β β β β
|
| 68 |
+
β β β β β Extractive Mode β β β β
|
| 69 |
+
β β β β (return chunks) β β β β
|
| 70 |
+
β β β βββββββββββββββββββββββ β β β
|
| 71 |
+
β β βββββββββββββββ¬ββββββββββββββββ β β
|
| 72 |
+
β β β β β
|
| 73 |
+
β ββββββββββββββββββββββββββββββββββββΌββββββββββββββββββββββββββββββββ β
|
| 74 |
+
β β β
|
| 75 |
+
β βΌ β
|
| 76 |
+
β ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ β
|
| 77 |
+
β β RESPONSE β β
|
| 78 |
+
β β { β β
|
| 79 |
+
β β "final_answer": "...", β β
|
| 80 |
+
β β "retrieved_chunks": [...], β β
|
| 81 |
+
β β "confidence": 0.92 β β
|
| 82 |
+
β β } β β
|
| 83 |
+
β ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ β
|
| 84 |
+
β β β
|
| 85 |
+
β βΌ β
|
| 86 |
+
β ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ β
|
| 87 |
+
β β STREAMLIT UI β β
|
| 88 |
+
β β ββββββββββββββββββββ βββββββββββββββββββββββββββββββββββββ β β
|
| 89 |
+
β β β Chat Interface β β Retrieved Chunks Panel β β β
|
| 90 |
+
β β β - Question box β β - Chunk text β β β
|
| 91 |
+
β β β - Answer card β β - Page numbers β β β
|
| 92 |
+
β β β - Confidence β β - Relevance scores β β β
|
| 93 |
+
β β ββββββββββββββββββββ βββββββββββββββββββββββββββββββββββββ β β
|
| 94 |
+
β ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ β
|
| 95 |
+
β β
|
| 96 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 97 |
+
```
|
| 98 |
+
|
| 99 |
+
---
|
| 100 |
+
|
| 101 |
+
## Design Decisions
|
| 102 |
+
|
| 103 |
+
### 1. Chunking Strategy
|
| 104 |
+
|
| 105 |
+
We use **500 tokens** as the target chunk size with **50-100 token overlap**. This provides:
|
| 106 |
+
- Enough context for meaningful retrieval
|
| 107 |
+
- Overlap ensures important information spanning chunk boundaries isn't lost
|
| 108 |
+
- Token counting via tiktoken ensures consistent chunk sizes across different text densities
|
| 109 |
+
|
| 110 |
+
**Chunk ID Format**: `pdfpage_{page}_chunk_{index}` - This makes it easy to trace retrieved content back to the source PDF page for verification.
|
| 111 |
+
|
| 112 |
+
### 2. Embedding Model Choice
|
| 113 |
+
|
| 114 |
+
We use **sentence-transformers/all-MiniLM-L6-v2**:
|
| 115 |
+
- Open source and free (no API costs)
|
| 116 |
+
- Small model (384 dimensions) = fast inference and lower storage costs
|
| 117 |
+
- Good quality for semantic similarity tasks
|
| 118 |
+
- Can run entirely on CPU
|
| 119 |
+
|
| 120 |
+
Trade-off: Larger models like OpenAI's ada-002 (1536 dims) may provide better retrieval quality, but MiniLM offers excellent cost/performance ratio for this use case.
|
| 121 |
+
|
| 122 |
+
### 3. LangGraph Pipeline
|
| 123 |
+
|
| 124 |
+
The RAG pipeline uses LangGraph for orchestration because:
|
| 125 |
+
- Clear separation of pipeline stages (embed β retrieve β generate)
|
| 126 |
+
- Easy to add/modify nodes (e.g., reranking, query expansion)
|
| 127 |
+
- Built-in state management
|
| 128 |
+
- Aligns with modern LLM application patterns
|
| 129 |
+
|
| 130 |
+
### 4. Dual-Mode Answer Generation
|
| 131 |
+
|
| 132 |
+
The system supports two modes:
|
| 133 |
+
|
| 134 |
+
**LLM Generation Mode** (with OpenAI key):
|
| 135 |
+
- Uses GPT-3.5-turbo for natural language generation
|
| 136 |
+
- System prompt strictly instructs the model to only use provided chunks
|
| 137 |
+
- Produces more readable, synthesized answers
|
| 138 |
+
|
| 139 |
+
**Extractive Fallback Mode** (no API key):
|
| 140 |
+
- Returns relevant chunks directly with minimal formatting
|
| 141 |
+
- Always works, even offline
|
| 142 |
+
- Ensures the app is functional without paid APIs
|
| 143 |
+
|
| 144 |
+
This design choice ensures the application is **always functional** regardless of API availability.
|
| 145 |
+
|
| 146 |
+
### 5. Confidence Score Computation
|
| 147 |
+
|
| 148 |
+
Confidence is computed from retrieval similarity scores:
|
| 149 |
+
|
| 150 |
+
```python
|
| 151 |
+
# Normalize cosine similarity from [-1, 1] to [0, 1]
|
| 152 |
+
normalized = (score + 1) / 2
|
| 153 |
+
|
| 154 |
+
# Use maximum normalized score as confidence
|
| 155 |
+
confidence = max(normalized_scores)
|
| 156 |
+
```
|
| 157 |
+
|
| 158 |
+
This gives users an intuitive sense of how well the retrieved chunks match their query.
|
| 159 |
+
|
| 160 |
+
---
|
| 161 |
+
|
| 162 |
+
## File Structure
|
| 163 |
+
|
| 164 |
+
```
|
| 165 |
+
rag-eAgenticAI/
|
| 166 |
+
βββ app/
|
| 167 |
+
β βββ __init__.py # Package exports
|
| 168 |
+
β βββ ingest.py # PDF β chunks β embeddings β Pinecone
|
| 169 |
+
β βββ vectorstore.py # Pinecone wrapper (create, upsert, query)
|
| 170 |
+
β βββ rag_pipeline.py # LangGraph pipeline + answer generation
|
| 171 |
+
β βββ utils.py # Chunking, cleaning, confidence calculation
|
| 172 |
+
β
|
| 173 |
+
βββ streamlit_app/
|
| 174 |
+
β βββ app.py # Main Streamlit application
|
| 175 |
+
β βββ assets/ # Static assets (images, CSS)
|
| 176 |
+
β
|
| 177 |
+
βββ samples/
|
| 178 |
+
β βββ sample_queries.txt # Example questions to test
|
| 179 |
+
β βββ expected_responses.md # Expected JSON response format
|
| 180 |
+
β
|
| 181 |
+
βββ infra/
|
| 182 |
+
β βββ hf_space_readme_template.md # Hugging Face Spaces config
|
| 183 |
+
β
|
| 184 |
+
βββ data/ # PDF files and generated chunks (gitignored)
|
| 185 |
+
β
|
| 186 |
+
βββ README.md # Main documentation
|
| 187 |
+
βββ architecture.md # This file
|
| 188 |
+
βββ requirements.txt # Python dependencies
|
| 189 |
+
βββ LICENSE # MIT License
|
| 190 |
+
βββ .gitignore # Git ignore rules
|
| 191 |
+
```
|
| 192 |
+
|
| 193 |
+
---
|
| 194 |
+
|
| 195 |
+
## Data Flow Summary
|
| 196 |
+
|
| 197 |
+
1. **Ingestion** (run once):
|
| 198 |
+
- PDF β pdfplumber β raw text by page
|
| 199 |
+
- Text β clean_text() β cleaned text
|
| 200 |
+
- Cleaned text β chunk_text() β chunks with metadata
|
| 201 |
+
- Chunks β SentenceTransformer β embeddings
|
| 202 |
+
- Embeddings β Pinecone upsert β stored vectors
|
| 203 |
+
|
| 204 |
+
2. **Query** (each user question):
|
| 205 |
+
- Question β SentenceTransformer β query embedding
|
| 206 |
+
- Query embedding β Pinecone query β top-k chunks
|
| 207 |
+
- Chunks + scores β compute_confidence() β confidence score
|
| 208 |
+
- Chunks + question β LLM/extractive β final answer
|
| 209 |
+
- Answer + chunks + confidence β JSON response β Streamlit UI
|
data/.gitkeep
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This folder should contain your PDF file
|
| 2 |
+
# Place Ebook-Agentic-AI.pdf here before running ingestion
|
| 3 |
+
|
| 4 |
+
# Example:
|
| 5 |
+
# data/
|
| 6 |
+
# βββ Ebook-Agentic-AI.pdf <- Your source PDF
|
| 7 |
+
# βββ chunks.jsonl <- Generated by ingest.py
|
| 8 |
+
# βββ vectors.json <- Generated by ingest.py (local mode)
|
infra/hf_space_readme_template.md
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Agentic AI eBook Chatbot
|
| 3 |
+
emoji: π€
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: streamlit
|
| 7 |
+
sdk_version: "1.28.0"
|
| 8 |
+
app_file: streamlit_app/app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
license: mit
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# Agentic AI eBook RAG Chatbot
|
| 14 |
+
|
| 15 |
+
A Retrieval-Augmented Generation (RAG) chatbot that answers questions strictly from the Agentic AI eBook.
|
| 16 |
+
|
| 17 |
+
## Features
|
| 18 |
+
- π **Semantic Search**: Uses sentence-transformers for document retrieval
|
| 19 |
+
- π **Grounded Answers**: All answers are strictly based on retrieved document chunks
|
| 20 |
+
- π **Confidence Scores**: Shows how confident the system is in its answers
|
| 21 |
+
- π **Dual Mode**: LLM generation (with OpenAI key) or extractive fallback
|
| 22 |
+
|
| 23 |
+
## Setup
|
| 24 |
+
|
| 25 |
+
### Environment Variables (Set in Space Settings β Secrets)
|
| 26 |
+
|
| 27 |
+
| Variable | Required | Description |
|
| 28 |
+
|----------|----------|-------------|
|
| 29 |
+
| `PINECONE_API_KEY` | Yes | Your Pinecone API key |
|
| 30 |
+
| `PINECONE_INDEX` | No | Index name (default: `agentic-ai-ebook`) |
|
| 31 |
+
| `OPENAI_API_KEY` | No | For LLM-powered answers |
|
| 32 |
+
|
| 33 |
+
### Usage
|
| 34 |
+
|
| 35 |
+
1. Set your Pinecone API key in the sidebar
|
| 36 |
+
2. Optionally set OpenAI API key for better answers
|
| 37 |
+
3. Ask questions about the Agentic AI eBook!
|
| 38 |
+
|
| 39 |
+
## Tech Stack
|
| 40 |
+
- LangGraph for RAG orchestration
|
| 41 |
+
- Pinecone for vector storage
|
| 42 |
+
- Sentence-Transformers for embeddings
|
| 43 |
+
- Streamlit for UI
|
| 44 |
+
|
| 45 |
+
## Limitations
|
| 46 |
+
- Only answers questions from the Agentic AI eBook
|
| 47 |
+
- Requires pre-ingested document in Pinecone index
|
| 48 |
+
- May not answer questions outside the document scope
|
| 49 |
+
|
| 50 |
+
---
|
| 51 |
+
|
| 52 |
+
Built for AI Engineer Intern Assignment
|
quick_test.py
ADDED
|
@@ -0,0 +1,316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
quick_test.py - Validation Script for RAG Pipeline
|
| 3 |
+
|
| 4 |
+
This script tests the core functionality of the RAG pipeline:
|
| 5 |
+
1. Tests utility functions (chunking, confidence scoring)
|
| 6 |
+
2. Tests the embedding model loading
|
| 7 |
+
3. Tests the RAG pipeline with a sample query (if data is available)
|
| 8 |
+
|
| 9 |
+
Run this after ingestion to verify everything works:
|
| 10 |
+
python quick_test.py
|
| 11 |
+
|
| 12 |
+
This script is designed to work even without API keys by using local mode.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import os
|
| 16 |
+
import sys
|
| 17 |
+
import json
|
| 18 |
+
|
| 19 |
+
# Add app directory to path
|
| 20 |
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
| 21 |
+
|
| 22 |
+
from app.utils import (
|
| 23 |
+
clean_text,
|
| 24 |
+
chunk_text,
|
| 25 |
+
count_tokens,
|
| 26 |
+
normalize_score,
|
| 27 |
+
compute_confidence,
|
| 28 |
+
format_chunks_for_llm
|
| 29 |
+
)
|
| 30 |
+
from app.vectorstore import LocalVectorStore
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def test_utilities():
    """Smoke-test the text utilities: token counting, cleaning, chunking,
    score normalization, and confidence computation.

    Returns:
        True when every helper ran without raising.
    """
    print("\n" + "=" * 60)
    print("TEST 1: Utility Functions")
    print("=" * 60)

    # --- token counting on a short sentence ---
    sample = "This is a sample sentence for testing token counting functionality."
    n_tokens = count_tokens(sample)
    print(f"\n✓ Token counting: '{sample[:30]}...' = {n_tokens} tokens")

    # --- whitespace / page-number cleanup ---
    raw = " This has extra spaces \n\n\n\nAnd too many newlines Page 123 "
    cleaned = clean_text(raw)
    print(f"✓ Text cleaning: '{raw[:30]}...' -> '{cleaned[:30]}...'")

    # --- chunking a long synthetic paragraph ---
    body = "This is a test paragraph. " * 100  # deliberately long input
    pieces = chunk_text(body, page_number=1, chunk_size=100, chunk_overlap=20)
    print(f"✓ Chunking: Created {len(pieces)} chunks from {count_tokens(body)} tokens")

    if pieces:
        print(f" - First chunk ID: {pieces[0]['id']}")
        print(f" - First chunk tokens: ~{count_tokens(pieces[0]['text'])}")

    # --- score normalization: [-1, 1] -> [0, 1] ---
    raw_scores = [-1.0, -0.5, 0.0, 0.5, 1.0]
    print("\n✓ Score normalization:")
    for s in raw_scores:
        print(f" {s:5.2f} -> {normalize_score(s):.3f}")

    # --- confidence from a set of similarity scores ---
    scores = [0.8, 0.6, 0.7, 0.5]
    print(f"\n✓ Confidence computation (from scores {scores}):")
    print(f" - Max method: {compute_confidence(scores, method='max')}")
    print(f" - Mean method: {compute_confidence(scores, method='mean')}")

    print("\n✅ All utility tests passed!")
    return True
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def test_local_vectorstore():
    """Round-trip a few random vectors through LocalVectorStore:
    upsert three dummy chunks, then query for the top-2 matches.

    Returns:
        True when upsert and query both succeed.
    """
    print("\n" + "=" * 60)
    print("TEST 2: Local Vector Store")
    print("=" * 60)

    import numpy as np

    store = LocalVectorStore(dimension=384)

    # Three dummy chunks; embeddings are random 384-d vectors, so this
    # exercises only the storage plumbing, not retrieval quality.
    chunk_texts = [
        'Agentic AI refers to artificial intelligence systems that can operate autonomously.',
        'The risks of agentic systems include uncontrolled behavior and safety concerns.',
        'Safeguards for agentic AI deployment include human oversight and testing.',
    ]
    vectors = [
        {
            'id': f'test_chunk_{i}',
            'embedding': np.random.randn(384).tolist(),
            'page': i,
            'text': text,
            'source': 'test.pdf',
        }
        for i, text in enumerate(chunk_texts, start=1)
    ]

    # Upsert and report how many vectors the store accepted.
    n_upserted = store.upsert(vectors)
    print(f"\n✓ Upserted {n_upserted} vectors to local store")

    # Query with a random probe; scores are meaningless but should come
    # back ranked.
    probe = np.random.randn(384).tolist()
    hits = store.query_top_k(probe, k=2)

    print(f"✓ Query returned {len(hits)} results")
    for r in hits:
        print(f" - {r['id']}: score={r['score']:.4f}")

    print("\n✅ Local vector store test passed!")
    return True
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def test_embedding_model():
    """Load the MiniLM sentence-transformer and encode two sample queries.

    Returns:
        True on success; False when the model cannot be loaded or encoded
        (e.g. sentence-transformers missing, no network for the download).
    """
    print("\n" + "=" * 60)
    print("TEST 3: Embedding Model")
    print("=" * 60)

    try:
        from sentence_transformers import SentenceTransformer

        print("\nLoading embedding model (this may take a moment)...")
        encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

        dim = encoder.get_sentence_embedding_dimension()
        print("✓ Model loaded successfully!")
        print(f"✓ Embedding dimension: {dim}")

        # Encode a couple of realistic queries to verify the forward pass.
        queries = [
            "What is agentic AI?",
            "Describe the risks of autonomous systems."
        ]
        embeddings = encoder.encode(queries)
        print(f"✓ Encoded {len(queries)} sentences")
        print(f" - Shape: {embeddings.shape}")

        print("\n✅ Embedding model test passed!")
        return True

    except Exception as e:  # broad on purpose: any failure just marks the test failed
        print(f"\n❌ Embedding model test failed: {e}")
        return False
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def test_rag_pipeline():
    """Run one query through the full RAG pipeline in local mode.

    Skips (returning True) when no ingested data exists, so the suite
    still passes on a fresh checkout.

    Returns:
        True on success or skip; False on any pipeline error.
    """
    print("\n" + "=" * 60)
    print("TEST 4: RAG Pipeline")
    print("=" * 60)

    chunks_file = "./data/chunks.jsonl"
    vectors_file = "./data/vectors.json"

    # No ingested artifacts at all -> skip rather than fail.
    if not (os.path.exists(chunks_file) or os.path.exists(vectors_file)):
        print("\n⚠️ No ingested data found.")
        print(" Run 'python app/ingest.py --pdf ./data/Ebook-Agentic-AI.pdf --local-only' first.")
        print(" Skipping RAG pipeline test.\n")
        return True

    try:
        from app.rag_pipeline import RAGPipeline

        print("\nInitializing RAG pipeline in local mode...")

        # Local mode avoids any Pinecone / network dependency.
        pipeline = RAGPipeline(
            local_only=True,
            chunks_file=chunks_file
        )

        question = "What is agentic AI?"
        print(f"\nTest query: '{question}'")
        print("-" * 40)

        result = pipeline.query(question, top_k=3, use_llm=False)

        # Show a truncated view of the raw response.
        print("\n🤖 Response:")
        print(json.dumps(result, indent=2, default=str)[:1000] + "...")

        # Response contract: these three keys must always be present.
        assert "final_answer" in result, "Missing 'final_answer' in response"
        assert "retrieved_chunks" in result, "Missing 'retrieved_chunks' in response"
        assert "confidence" in result, "Missing 'confidence' in response"

        print(f"\n✓ Final answer length: {len(result['final_answer'])} chars")
        print(f"✓ Retrieved chunks: {len(result['retrieved_chunks'])}")
        print(f"✓ Confidence score: {result['confidence']}")

        # Summarize up to three of the retrieved chunks.
        if result['retrieved_chunks']:
            print("\n📄 Retrieved chunks:")
            for i, chunk in enumerate(result['retrieved_chunks'][:3]):
                print(f" {i+1}. Page {chunk.get('page', '?')}, Score: {chunk.get('score', 0):.4f}")
                print(f" ID: {chunk.get('id', 'unknown')}")
                print(f" Text: {chunk.get('text', '')[:80]}...")

        print("\n✅ RAG pipeline test passed!")
        return True

    except Exception as e:
        print(f"\n❌ RAG pipeline test failed: {e}")
        import traceback
        traceback.print_exc()
        return False
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
def test_response_format():
    """Validate the documented JSON response schema against a mock response.

    Returns:
        True when the mock passes every structural check.
    """
    print("\n" + "=" * 60)
    print("TEST 5: Response Format Validation")
    print("=" * 60)

    # Human-readable description of the contract (printed for reference).
    expected_format = {
        "final_answer": "string",
        "retrieved_chunks": [
            {
                "id": "string (format: pdfpage_N_chunk_M)",
                "page": "integer",
                "text": "string",
                "score": "float (0.0-1.0)"
            }
        ],
        "confidence": "float (0.0-1.0)"
    }

    print("\n✓ Expected response format:")
    print(json.dumps(expected_format, indent=2))

    # A minimal well-formed response to validate against the schema.
    mock_response = {
        "final_answer": "According to the document, agentic AI is...",
        "retrieved_chunks": [
            {"id": "pdfpage_1_chunk_0", "page": 1, "text": "Sample text...", "score": 0.92}
        ],
        "confidence": 0.92
    }

    # Top-level type checks.
    assert isinstance(mock_response["final_answer"], str), "final_answer must be string"
    assert isinstance(mock_response["retrieved_chunks"], list), "retrieved_chunks must be list"
    assert isinstance(mock_response["confidence"], (int, float)), "confidence must be number"
    assert 0 <= mock_response["confidence"] <= 1, "confidence must be between 0 and 1"

    # Per-chunk required keys.
    if mock_response["retrieved_chunks"]:
        first = mock_response["retrieved_chunks"][0]
        for key in ("id", "page", "text", "score"):
            assert key in first, f"chunk must have '{key}'"

    print("\n✅ Response format validation passed!")
    return True
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
def main():
    """Run every test in order and print a pass/fail summary.

    Returns:
        True when all tests passed.
    """
    print("\n" + "=" * 60)
    print("RAG CHATBOT - QUICK TEST SUITE")
    print("=" * 60)

    # (name, callable) pairs executed in order; the dict comprehension
    # preserves insertion order, so the summary matches execution order.
    suite = [
        ("utilities", test_utilities),
        ("local_vectorstore", test_local_vectorstore),
        ("embedding_model", test_embedding_model),
        ("rag_pipeline", test_rag_pipeline),
        ("response_format", test_response_format),
    ]
    results = {name: run() for name, run in suite}

    print("\n" + "=" * 60)
    print("TEST SUMMARY")
    print("=" * 60)

    passed = sum(1 for ok in results.values() if ok)
    total = len(results)

    for test_name, ok in results.items():
        status = "✅ PASS" if ok else "❌ FAIL"
        print(f" {test_name}: {status}")

    print(f"\nTotal: {passed}/{total} tests passed")

    if passed == total:
        print("\n🎉 All tests passed! The RAG pipeline is ready to use.")
    else:
        print("\n⚠️ Some tests failed. Please check the errors above.")

    return passed == total
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
if __name__ == "__main__":
    # Exit code 0 on full success, 1 otherwise (CI-friendly); SystemExit
    # is exactly what sys.exit() raises.
    raise SystemExit(0 if main() else 1)
|
requirements.txt
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# RAG Chatbot for Agentic AI eBook
|
| 2 |
+
# Core dependencies with pinned versions for reproducibility
|
| 3 |
+
|
| 4 |
+
python-dotenv>=1.0.0
|
| 5 |
+
streamlit>=1.20.0
|
| 6 |
+
pdfplumber>=0.10.3
|
| 7 |
+
sentence-transformers>=5.0.0
|
| 8 |
+
torch>=2.0.0
|
| 9 |
+
pinecone>=3.0.0
|
| 10 |
+
langgraph>=0.0.40
|
| 11 |
+
tqdm>=4.66.1
|
| 12 |
+
tiktoken>=0.5.2
|
| 13 |
+
openai>=1.0.0
|
| 14 |
+
groq>=0.4.0
|
| 15 |
+
tf-keras
|
| 16 |
+
|
| 17 |
+
# NOTE: For CPU-only machines, you can install torch CPU version:
|
| 18 |
+
# pip install torch --index-url https://download.pytorch.org/whl/cpu
|
| 19 |
+
# This will be smaller and faster to install.
|
samples/expected_responses.md
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Expected Responses Format
|
| 2 |
+
|
| 3 |
+
This document shows the expected JSON response format for the RAG chatbot.
|
| 4 |
+
Each query should return a response with `final_answer`, `retrieved_chunks`, and `confidence`.
|
| 5 |
+
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
## Example Response Format
|
| 9 |
+
|
| 10 |
+
For the query: **"What is the definition of 'agentic AI' described in the eBook?"**
|
| 11 |
+
|
| 12 |
+
### Expected JSON Structure:
|
| 13 |
+
|
| 14 |
+
```json
|
| 15 |
+
{
|
| 16 |
+
"final_answer": "According to the eBook, agentic AI refers to artificial intelligence systems that can operate autonomously to achieve goals with minimal human intervention. These systems are characterized by their ability to make decisions, take actions, and adapt their behavior based on environmental feedback. Unlike traditional AI that responds to specific queries, agentic AI proactively pursues objectives and can handle complex, multi-step tasks independently.",
|
| 17 |
+
"retrieved_chunks": [
|
| 18 |
+
{
|
| 19 |
+
"id": "pdfpage_12_chunk_0",
|
| 20 |
+
"page": 12,
|
| 21 |
+
"text": "Agentic AI represents a paradigm shift in artificial intelligence where systems operate with increased autonomy and goal-directed behavior. Unlike conventional AI models that respond reactively to inputs, agentic systems proactively pursue objectives, make decisions, and adapt their strategies based on environmental feedback...",
|
| 22 |
+
"score": 0.92
|
| 23 |
+
},
|
| 24 |
+
{
|
| 25 |
+
"id": "pdfpage_13_chunk_1",
|
| 26 |
+
"page": 13,
|
| 27 |
+
"text": "The defining characteristics of agentic AI include: autonomous decision-making without constant human oversight, the ability to break down complex goals into actionable sub-tasks, learning from interactions to improve future performance, and operating within defined boundaries while maintaining flexibility in approach...",
|
| 28 |
+
"score": 0.87
|
| 29 |
+
},
|
| 30 |
+
{
|
| 31 |
+
"id": "pdfpage_5_chunk_0",
|
| 32 |
+
"page": 5,
|
| 33 |
+
"text": "The emergence of agentic AI marks a significant evolution from traditional automation. Where conventional systems follow rigid, pre-programmed rules, agentic systems exhibit adaptive behavior and can handle novel situations...",
|
| 34 |
+
"score": 0.82
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"id": "pdfpage_14_chunk_0",
|
| 38 |
+
"page": 14,
|
| 39 |
+
"text": "Key to understanding agentic AI is recognizing its goal-oriented nature. These systems are not merely responding to queries but actively working toward specified objectives...",
|
| 40 |
+
"score": 0.79
|
| 41 |
+
}
|
| 42 |
+
],
|
| 43 |
+
"confidence": 0.92
|
| 44 |
+
}
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
---
|
| 48 |
+
|
| 49 |
+
## Response Field Descriptions
|
| 50 |
+
|
| 51 |
+
### `final_answer` (string)
|
| 52 |
+
- The generated answer to the user's question
|
| 53 |
+
- **Must** be derived strictly from the retrieved chunks
|
| 54 |
+
- If using LLM: synthesized answer using only the provided context
|
| 55 |
+
- If extractive mode: concatenation of relevant chunk excerpts with minimal formatting
|
| 56 |
+
|
| 57 |
+
### `retrieved_chunks` (array)
|
| 58 |
+
Each chunk object contains:
|
| 59 |
+
- `id` (string): Unique identifier in format `pdfpage_{page}_chunk_{index}`
|
| 60 |
+
- `page` (integer): Page number from the source PDF (1-indexed)
|
| 61 |
+
- `text` (string): The actual text content of the chunk
|
| 62 |
+
- `score` (float): Similarity score from vector search (0.0 to 1.0 after normalization)
|
| 63 |
+
|
| 64 |
+
### `confidence` (float)
|
| 65 |
+
- Numeric score between 0.0 and 1.0
|
| 66 |
+
- Computed from similarity scores: `confidence = max(normalized_scores)`
|
| 67 |
+
- Normalization formula: `normalized = (raw_score + 1) / 2` for cosine similarity
|
| 68 |
+
- Rounded to 3 decimal places
|
| 69 |
+
|
| 70 |
+
---
|
| 71 |
+
|
| 72 |
+
## Confidence Score Interpretation
|
| 73 |
+
|
| 74 |
+
| Score Range | Interpretation |
|
| 75 |
+
|-------------|----------------|
|
| 76 |
+
| 0.8 - 1.0 | High confidence - Strong match found |
|
| 77 |
+
| 0.5 - 0.8 | Medium confidence - Relevant content found |
|
| 78 |
+
| 0.0 - 0.5 | Low confidence - Limited relevant content |
|
| 79 |
+
|
| 80 |
+
---
|
| 81 |
+
|
| 82 |
+
## Example: When Answer Cannot Be Found
|
| 83 |
+
|
| 84 |
+
For the query: **"What is the stock price of Apple?"**
|
| 85 |
+
|
| 86 |
+
```json
|
| 87 |
+
{
|
| 88 |
+
"final_answer": "I could not find a supported answer in the document. The Agentic AI eBook does not contain information about stock prices.",
|
| 89 |
+
"retrieved_chunks": [
|
| 90 |
+
{
|
| 91 |
+
"id": "pdfpage_1_chunk_0",
|
| 92 |
+
"page": 1,
|
| 93 |
+
"text": "Introduction to Agentic AI...",
|
| 94 |
+
"score": 0.23
|
| 95 |
+
}
|
| 96 |
+
],
|
| 97 |
+
"confidence": 0.23
|
| 98 |
+
}
|
| 99 |
+
```
|
| 100 |
+
|
| 101 |
+
---
|
| 102 |
+
|
| 103 |
+
## Notes for Graders
|
| 104 |
+
|
| 105 |
+
1. **The exact text in `final_answer` will vary** based on:
|
| 106 |
+
- The actual content of the PDF
|
| 107 |
+
- Whether LLM mode or extractive mode is used
|
| 108 |
+
- The specific chunks retrieved
|
| 109 |
+
|
| 110 |
+
2. **Chunk IDs and page numbers** will match the actual PDF content after ingestion
|
| 111 |
+
|
| 112 |
+
3. **Confidence scores** may vary slightly based on:
|
| 113 |
+
- Embedding model used
|
| 114 |
+
- Vector similarity computation
|
| 115 |
+
- Number of chunks retrieved
|
| 116 |
+
|
| 117 |
+
4. **The key requirement is that answers are grounded** - no information outside the retrieved chunks should appear in the final answer
|
samples/sample_queries.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Sample Queries for the Agentic AI eBook Chatbot
|
| 2 |
+
# These are example questions you can use to test the RAG system
|
| 3 |
+
# Each question should be answerable from the Ebook-Agentic-AI.pdf
|
| 4 |
+
|
| 5 |
+
# Query 1: Definition question
|
| 6 |
+
What is the definition of 'agentic AI' described in the eBook?
|
| 7 |
+
|
| 8 |
+
# Query 2: List-based question
|
| 9 |
+
List the three risks of agentic systems the eBook mentions.
|
| 10 |
+
|
| 11 |
+
# Query 3: Recommendation question
|
| 12 |
+
According to the eBook, what are the recommended safeguards for deploying agentic AI?
|
| 13 |
+
|
| 14 |
+
# Query 4: Comparison question
|
| 15 |
+
How does the eBook distinguish between autonomous agents and traditional automation?
|
| 16 |
+
|
| 17 |
+
# Query 5: Forward-looking question
|
| 18 |
+
What future research directions does the eBook propose?
|
| 19 |
+
|
| 20 |
+
# Query 6: Summary question
|
| 21 |
+
Summarize the eBook's conclusion in one paragraph.
|
streamlit_app/app.py
ADDED
|
@@ -0,0 +1,413 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Streamlit App for RAG Chatbot - Agentic AI eBook
|
| 3 |
+
|
| 4 |
+
This is the main UI for the RAG chatbot. It provides:
|
| 5 |
+
- Chat interface for asking questions
|
| 6 |
+
- Configuration sidebar (API keys, top_k, etc.)
|
| 7 |
+
- Display of retrieved chunks and confidence scores
|
| 8 |
+
- Raw JSON response viewer
|
| 9 |
+
|
| 10 |
+
Usage:
|
| 11 |
+
streamlit run streamlit_app/app.py
|
| 12 |
+
|
| 13 |
+
For Hugging Face Spaces deployment:
|
| 14 |
+
- Set secrets in Space settings for PINECONE_API_KEY, OPENAI_API_KEY
|
| 15 |
+
- Or let users input keys in the sidebar
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
import os
|
| 19 |
+
import sys
|
| 20 |
+
import json
|
| 21 |
+
import streamlit as st
|
| 22 |
+
from dotenv import load_dotenv
|
| 23 |
+
|
| 24 |
+
# Add parent directory to path for imports
|
| 25 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 26 |
+
|
| 27 |
+
# Load environment variables from the project root .env file
|
| 28 |
+
env_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), '.env')
|
| 29 |
+
load_dotenv(env_path)
|
| 30 |
+
|
| 31 |
+
# Import RAG pipeline
|
| 32 |
+
from app.rag_pipeline import RAGPipeline
|
| 33 |
+
|
| 34 |
+
# ============================================================================
|
| 35 |
+
# Page Configuration
|
| 36 |
+
# ============================================================================
|
| 37 |
+
|
| 38 |
+
st.set_page_config(
|
| 39 |
+
page_title="Agentic AI eBook Chatbot",
|
| 40 |
+
page_icon="π€",
|
| 41 |
+
layout="wide",
|
| 42 |
+
initial_sidebar_state="expanded"
|
| 43 |
+
)
|
| 44 |
+
|
| 45 |
+
# Custom CSS for better styling
|
| 46 |
+
st.markdown("""
|
| 47 |
+
<style>
|
| 48 |
+
/* Main container styling */
|
| 49 |
+
.main-header {
|
| 50 |
+
font-size: 2.5rem;
|
| 51 |
+
font-weight: bold;
|
| 52 |
+
color: #1E88E5;
|
| 53 |
+
text-align: center;
|
| 54 |
+
margin-bottom: 1rem;
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
/* Answer card styling */
|
| 58 |
+
.answer-card {
|
| 59 |
+
background-color: #f0f7ff;
|
| 60 |
+
border-left: 4px solid #1E88E5;
|
| 61 |
+
padding: 1rem;
|
| 62 |
+
border-radius: 0 8px 8px 0;
|
| 63 |
+
margin: 1rem 0;
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
/* Confidence badge styling */
|
| 67 |
+
.confidence-badge {
|
| 68 |
+
display: inline-block;
|
| 69 |
+
padding: 0.25rem 0.75rem;
|
| 70 |
+
border-radius: 1rem;
|
| 71 |
+
font-weight: bold;
|
| 72 |
+
font-size: 0.9rem;
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
.confidence-high {
|
| 76 |
+
background-color: #c8e6c9;
|
| 77 |
+
color: #2e7d32;
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
.confidence-medium {
|
| 81 |
+
background-color: #fff3e0;
|
| 82 |
+
color: #ef6c00;
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
.confidence-low {
|
| 86 |
+
background-color: #ffcdd2;
|
| 87 |
+
color: #c62828;
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
/* Chunk card styling */
|
| 91 |
+
.chunk-card {
|
| 92 |
+
background-color: #fafafa;
|
| 93 |
+
border: 1px solid #e0e0e0;
|
| 94 |
+
padding: 0.75rem;
|
| 95 |
+
border-radius: 8px;
|
| 96 |
+
margin: 0.5rem 0;
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
/* Footer styling */
|
| 100 |
+
.footer {
|
| 101 |
+
text-align: center;
|
| 102 |
+
color: #666;
|
| 103 |
+
font-size: 0.8rem;
|
| 104 |
+
margin-top: 2rem;
|
| 105 |
+
padding-top: 1rem;
|
| 106 |
+
border-top: 1px solid #e0e0e0;
|
| 107 |
+
}
|
| 108 |
+
</style>
|
| 109 |
+
""", unsafe_allow_html=True)
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
# ============================================================================
|
| 113 |
+
# Session State Initialization
|
| 114 |
+
# ============================================================================
|
| 115 |
+
|
| 116 |
+
if "messages" not in st.session_state:
|
| 117 |
+
st.session_state.messages = []
|
| 118 |
+
|
| 119 |
+
if "pipeline" not in st.session_state:
|
| 120 |
+
st.session_state.pipeline = None
|
| 121 |
+
|
| 122 |
+
if "last_response" not in st.session_state:
|
| 123 |
+
st.session_state.last_response = None
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
# ============================================================================
|
| 127 |
+
# Sidebar Configuration
|
| 128 |
+
# ============================================================================
|
| 129 |
+
|
| 130 |
+
with st.sidebar:
|
| 131 |
+
st.header("βοΈ Configuration")
|
| 132 |
+
|
| 133 |
+
st.markdown("---")
|
| 134 |
+
|
| 135 |
+
# API Keys section
|
| 136 |
+
st.subheader("π API Keys")
|
| 137 |
+
|
| 138 |
+
# Pinecone API Key
|
| 139 |
+
pinecone_key = st.text_input(
|
| 140 |
+
"Pinecone API Key",
|
| 141 |
+
type="password",
|
| 142 |
+
value=os.getenv("PINECONE_API_KEY", ""),
|
| 143 |
+
help="Required for vector search. Get your key at pinecone.io"
|
| 144 |
+
)
|
| 145 |
+
|
| 146 |
+
# Pinecone Index Name
|
| 147 |
+
index_name = st.text_input(
|
| 148 |
+
"Pinecone Index Name",
|
| 149 |
+
value=os.getenv("PINECONE_INDEX", "agentic-ai-ebook"),
|
| 150 |
+
help="Name of your Pinecone index"
|
| 151 |
+
)
|
| 152 |
+
|
| 153 |
+
# OpenAI API Key (optional)
|
| 154 |
+
openai_key = st.text_input(
|
| 155 |
+
"OpenAI API Key (optional)",
|
| 156 |
+
type="password",
|
| 157 |
+
value=os.getenv("OPENAI_API_KEY", ""),
|
| 158 |
+
help="For LLM-powered answers. Leave empty if using Groq."
|
| 159 |
+
)
|
| 160 |
+
|
| 161 |
+
# Groq API Key (optional - FREE!)
|
| 162 |
+
groq_key = st.text_input(
|
| 163 |
+
"Groq API Key (FREE LLM)",
|
| 164 |
+
type="password",
|
| 165 |
+
value=os.getenv("GROQ_API_KEY", ""),
|
| 166 |
+
help="Free LLM alternative! Get key at console.groq.com"
|
| 167 |
+
)
|
| 168 |
+
|
| 169 |
+
st.markdown("---")
|
| 170 |
+
|
| 171 |
+
# Retrieval settings
|
| 172 |
+
st.subheader("π Retrieval Settings")
|
| 173 |
+
|
| 174 |
+
top_k = st.slider(
|
| 175 |
+
"Number of chunks to retrieve (top_k)",
|
| 176 |
+
min_value=1,
|
| 177 |
+
max_value=10,
|
| 178 |
+
value=6,
|
| 179 |
+
help="More chunks = more context but potentially more noise"
|
| 180 |
+
)
|
| 181 |
+
|
| 182 |
+
use_llm = st.checkbox(
|
| 183 |
+
"Use LLM for answer generation",
|
| 184 |
+
value=True,
|
| 185 |
+
help="Uncheck to always use extractive mode"
|
| 186 |
+
)
|
| 187 |
+
|
| 188 |
+
local_mode = st.checkbox(
|
| 189 |
+
"Local Mode (no Pinecone)",
|
| 190 |
+
value=False,
|
| 191 |
+
help="Use local vector storage instead of Pinecone"
|
| 192 |
+
)
|
| 193 |
+
|
| 194 |
+
st.markdown("---")
|
| 195 |
+
|
| 196 |
+
# Initialize/Reinitialize button
|
| 197 |
+
if st.button("π Initialize Pipeline", use_container_width=True):
|
| 198 |
+
with st.spinner("Initializing RAG pipeline..."):
|
| 199 |
+
try:
|
| 200 |
+
st.session_state.pipeline = RAGPipeline(
|
| 201 |
+
pinecone_api_key=pinecone_key if pinecone_key else None,
|
| 202 |
+
openai_api_key=openai_key if openai_key else None,
|
| 203 |
+
groq_api_key=groq_key if groq_key else None,
|
| 204 |
+
index_name=index_name,
|
| 205 |
+
local_only=local_mode,
|
| 206 |
+
top_k=top_k
|
| 207 |
+
)
|
| 208 |
+
st.success("β
Pipeline initialized!")
|
| 209 |
+
except Exception as e:
|
| 210 |
+
st.error(f"β Error: {str(e)}")
|
| 211 |
+
|
| 212 |
+
# Status indicator
|
| 213 |
+
st.markdown("---")
|
| 214 |
+
st.subheader("π Status")
|
| 215 |
+
|
| 216 |
+
if st.session_state.pipeline:
|
| 217 |
+
st.success("Pipeline: Ready")
|
| 218 |
+
if st.session_state.pipeline.groq_client:
|
| 219 |
+
st.info("Mode: Groq LLM (FREE)")
|
| 220 |
+
elif st.session_state.pipeline.openai_client:
|
| 221 |
+
st.info("Mode: OpenAI LLM")
|
| 222 |
+
else:
|
| 223 |
+
st.warning("Mode: Extractive (no LLM)")
|
| 224 |
+
else:
|
| 225 |
+
st.warning("Pipeline: Not initialized")
|
| 226 |
+
st.caption("Click 'Initialize Pipeline' to start")
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
# ============================================================================
|
| 230 |
+
# Main Content Area
|
| 231 |
+
# ============================================================================
|
| 232 |
+
|
| 233 |
+
# Header
|
| 234 |
+
st.markdown('<div class="main-header">π€ Agentic AI eBook Chatbot</div>', unsafe_allow_html=True)
|
| 235 |
+
|
| 236 |
+
st.markdown("""
|
| 237 |
+
<p style="text-align: center; color: #666;">
|
| 238 |
+
Ask questions about the Agentic AI eBook. Answers are strictly grounded in the document.
|
| 239 |
+
</p>
|
| 240 |
+
""", unsafe_allow_html=True)
|
| 241 |
+
|
| 242 |
+
st.markdown("---")
|
| 243 |
+
|
| 244 |
+
# Check if pipeline is initialized
|
| 245 |
+
if not st.session_state.pipeline:
|
| 246 |
+
st.info("π Please configure your API keys and click 'Initialize Pipeline' in the sidebar to start.")
|
| 247 |
+
|
| 248 |
+
# Show sample queries
|
| 249 |
+
st.subheader("π Sample Questions to Try")
|
| 250 |
+
sample_queries = [
|
| 251 |
+
"What is the definition of 'agentic AI' described in the eBook?",
|
| 252 |
+
"List the three risks of agentic systems the eBook mentions.",
|
| 253 |
+
"What are the recommended safeguards for deploying agentic AI?",
|
| 254 |
+
"How does the eBook distinguish between autonomous agents and traditional automation?",
|
| 255 |
+
"What future research directions does the eBook propose?"
|
| 256 |
+
]
|
| 257 |
+
|
| 258 |
+
for query in sample_queries:
|
| 259 |
+
st.markdown(f"- {query}")
|
| 260 |
+
|
| 261 |
+
else:
|
| 262 |
+
# Chat interface
|
| 263 |
+
col1, col2 = st.columns([2, 1])
|
| 264 |
+
|
| 265 |
+
with col1:
|
| 266 |
+
st.subheader("π¬ Chat")
|
| 267 |
+
|
| 268 |
+
# Display chat history
|
| 269 |
+
chat_container = st.container()
|
| 270 |
+
|
| 271 |
+
with chat_container:
|
| 272 |
+
for message in st.session_state.messages:
|
| 273 |
+
with st.chat_message(message["role"]):
|
| 274 |
+
st.write(message["content"])
|
| 275 |
+
|
| 276 |
+
# Chat input
|
| 277 |
+
user_input = st.chat_input("Ask a question about the Agentic AI eBook...")
|
| 278 |
+
|
| 279 |
+
if user_input:
|
| 280 |
+
# Add user message to chat
|
| 281 |
+
st.session_state.messages.append({"role": "user", "content": user_input})
|
| 282 |
+
|
| 283 |
+
# Display user message
|
| 284 |
+
with st.chat_message("user"):
|
| 285 |
+
st.write(user_input)
|
| 286 |
+
|
| 287 |
+
# Get response from pipeline
|
| 288 |
+
with st.chat_message("assistant"):
|
| 289 |
+
with st.spinner("Searching document and generating answer..."):
|
| 290 |
+
try:
|
| 291 |
+
response = st.session_state.pipeline.query(
|
| 292 |
+
user_input,
|
| 293 |
+
top_k=top_k,
|
| 294 |
+
use_llm=use_llm
|
| 295 |
+
)
|
| 296 |
+
|
| 297 |
+
# Store response for display
|
| 298 |
+
st.session_state.last_response = response
|
| 299 |
+
|
| 300 |
+
# Display answer
|
| 301 |
+
answer = response.get("final_answer", "No answer generated")
|
| 302 |
+
st.write(answer)
|
| 303 |
+
|
| 304 |
+
# Display confidence
|
| 305 |
+
confidence = response.get("confidence", 0.0)
|
| 306 |
+
if confidence >= 0.7:
|
| 307 |
+
conf_class = "confidence-high"
|
| 308 |
+
elif confidence >= 0.4:
|
| 309 |
+
conf_class = "confidence-medium"
|
| 310 |
+
else:
|
| 311 |
+
conf_class = "confidence-low"
|
| 312 |
+
|
| 313 |
+
st.markdown(
|
| 314 |
+
f'<span class="confidence-badge {conf_class}">Confidence: {confidence:.3f}</span>',
|
| 315 |
+
unsafe_allow_html=True
|
| 316 |
+
)
|
| 317 |
+
|
| 318 |
+
# Add assistant message to chat
|
| 319 |
+
st.session_state.messages.append({
|
| 320 |
+
"role": "assistant",
|
| 321 |
+
"content": answer
|
| 322 |
+
})
|
| 323 |
+
|
| 324 |
+
except Exception as e:
|
| 325 |
+
st.error(f"Error: {str(e)}")
|
| 326 |
+
st.session_state.messages.append({
|
| 327 |
+
"role": "assistant",
|
| 328 |
+
"content": f"Error: {str(e)}"
|
| 329 |
+
})
|
| 330 |
+
|
| 331 |
+
# Clear chat button
|
| 332 |
+
if st.button("ποΈ Clear Chat", use_container_width=True):
|
| 333 |
+
st.session_state.messages = []
|
| 334 |
+
st.session_state.last_response = None
|
| 335 |
+
st.rerun()
|
| 336 |
+
|
| 337 |
+
with col2:
|
| 338 |
+
st.subheader("π Retrieved Chunks")
|
| 339 |
+
|
| 340 |
+
if st.session_state.last_response:
|
| 341 |
+
response = st.session_state.last_response
|
| 342 |
+
chunks = response.get("retrieved_chunks", [])
|
| 343 |
+
|
| 344 |
+
if chunks:
|
| 345 |
+
for i, chunk in enumerate(chunks):
|
| 346 |
+
with st.expander(
|
| 347 |
+
f"Chunk {i+1} (Page {chunk.get('page', '?')}, Score: {chunk.get('score', 0):.3f})",
|
| 348 |
+
expanded=(i == 0)
|
| 349 |
+
):
|
| 350 |
+
st.markdown(f"**ID:** `{chunk.get('id', 'unknown')}`")
|
| 351 |
+
st.markdown(f"**Page:** {chunk.get('page', 'unknown')}")
|
| 352 |
+
st.markdown(f"**Relevance Score:** {chunk.get('score', 0):.4f}")
|
| 353 |
+
st.markdown("**Text:**")
|
| 354 |
+
st.text_area(
|
| 355 |
+
"Chunk text",
|
| 356 |
+
value=chunk.get("text", ""),
|
| 357 |
+
height=150,
|
| 358 |
+
label_visibility="collapsed",
|
| 359 |
+
key=f"chunk_{i}"
|
| 360 |
+
)
|
| 361 |
+
else:
|
| 362 |
+
st.info("No chunks retrieved yet. Ask a question!")
|
| 363 |
+
|
| 364 |
+
# Raw JSON viewer
|
| 365 |
+
st.markdown("---")
|
| 366 |
+
with st.expander("π Show Raw JSON Response"):
|
| 367 |
+
st.json(response)
|
| 368 |
+
else:
|
| 369 |
+
st.info("Ask a question to see retrieved chunks.")
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
# ============================================================================
|
| 373 |
+
# Footer
|
| 374 |
+
# ============================================================================
|
| 375 |
+
|
| 376 |
+
st.markdown("---")
|
| 377 |
+
st.markdown("""
|
| 378 |
+
<div class="footer">
|
| 379 |
+
<p>
|
| 380 |
+
<strong>Built for AI Engineer Intern Assignment</strong><br>
|
| 381 |
+
Answers are strictly grounded in the Agentic AI eBook.<br>
|
| 382 |
+
Using: LangGraph β’ Pinecone β’ Sentence-Transformers β’ Streamlit
|
| 383 |
+
</p>
|
| 384 |
+
</div>
|
| 385 |
+
""", unsafe_allow_html=True)
|
| 386 |
+
|
| 387 |
+
|
| 388 |
+
# ============================================================================
|
| 389 |
+
# Auto-initialize if env vars are set
|
| 390 |
+
# ============================================================================
|
| 391 |
+
|
| 392 |
+
# Try to auto-initialize on first load if env vars are present
|
| 393 |
+
if st.session_state.pipeline is None:
|
| 394 |
+
env_pinecone = os.getenv("PINECONE_API_KEY")
|
| 395 |
+
env_groq = os.getenv("GROQ_API_KEY")
|
| 396 |
+
if env_pinecone:
|
| 397 |
+
try:
|
| 398 |
+
st.session_state.pipeline = RAGPipeline(
|
| 399 |
+
pinecone_api_key=env_pinecone,
|
| 400 |
+
openai_api_key=os.getenv("OPENAI_API_KEY"),
|
| 401 |
+
groq_api_key=env_groq,
|
| 402 |
+
index_name=os.getenv("PINECONE_INDEX", "agentic-ai-ebook"),
|
| 403 |
+
local_only=False
|
| 404 |
+
)
|
| 405 |
+
# Debug: show which LLM is being used
|
| 406 |
+
if st.session_state.pipeline.groq_client:
|
| 407 |
+
st.sidebar.success("β
Groq LLM connected!")
|
| 408 |
+
elif st.session_state.pipeline.openai_client:
|
| 409 |
+
st.sidebar.info("βΉοΈ OpenAI LLM connected")
|
| 410 |
+
else:
|
| 411 |
+
st.sidebar.warning("β οΈ No LLM - using extractive mode")
|
| 412 |
+
except Exception as e:
|
| 413 |
+
st.sidebar.error(f"Auto-init failed: {e}")
|
streamlit_app/assets/.gitkeep
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Placeholder for static assets (images, CSS, etc.)
|
| 2 |
+
# This directory can contain:
|
| 3 |
+
# - logo.png
|
| 4 |
+
# - custom.css
|
| 5 |
+
# - favicon.ico
|