Spaces:
Sleeping
Sleeping
Commit
·
14f13a5
1
Parent(s):
4e73d36
Deploy DeveloperDocs-AI-Copilot-RAG to Hugging Face Space
Browse files- Dockerfile +50 -0
- README.md +242 -9
- __init__.py +4 -0
- app.py +372 -0
- ci.yml +66 -0
- docker-compose.yml +25 -0
- evaluate_rag.py +229 -0
- ingest_docs.py +360 -0
- requirements.txt +40 -0
- src/__init__.py +21 -0
- src/chunking.py +265 -0
- src/config.py +95 -0
- src/embeddings.py +113 -0
- src/prompts.py +140 -0
- src/rag_pipeline.py +219 -0
- src/retriever.py +224 -0
- test_chunking.py +97 -0
- test_retrieval.py +85 -0
Dockerfile
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.12-slim

# Set working directory
WORKDIR /app

# Install system dependencies
# build-essential: compile wheels without prebuilt binaries; git: pip VCS installs;
# curl: used by the HEALTHCHECK below. apt lists removed to keep the layer small.
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    git \
    && rm -rf /var/lib/apt/lists/*

# Create non-root user (required by HF Spaces)
RUN useradd -m -u 1000 user
ENV HOME=/home/user
ENV PATH=/home/user/.local/bin:$PATH

# Copy requirements first for better caching
# (dependency layer only rebuilds when requirements.txt changes)
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Download sentence-transformers model at build time to avoid runtime delays
RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')"

# Copy application code
COPY . .

# Create necessary directories and set ownership
RUN mkdir -p data/vectordb data/raw data/processed evals/results \
    && chown -R user:user /app

# Switch to non-root user
USER user

# Expose port
EXPOSE 7860

# Health check
# NOTE(review): this probes /health, but the Gradio app in app.py does not
# visibly register a /health route — confirm the endpoint exists, otherwise
# the container will be reported unhealthy.
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:7860/health || exit 1

# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV GRADIO_SERVER_NAME=0.0.0.0
ENV GRADIO_SERVER_PORT=7860

# Run the application
CMD ["python", "app.py"]
|
README.md
CHANGED
|
@@ -1,11 +1,244 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
---
|
| 10 |
|
| 11 |
-
|
|
|
|
| 1 |
+
# 🤖 Developer Docs Copilot
|
| 2 |
+
|
| 3 |
+
> Production-grade RAG system that answers questions using official tech-stack documentation (e.g., FastAPI)
|
| 4 |
+
|
| 5 |
+
[](https://huggingface.co/spaces)
|
| 6 |
+
[](https://www.docker.com/)
|
| 7 |
+
[](https://www.python.org/)
|
| 8 |
+
|
| 9 |
+
## 🎯 What This Project Demonstrates
|
| 10 |
+
|
| 11 |
+
This is a **production-style RAG (Retrieval-Augmented Generation)** system that showcases:
|
| 12 |
+
|
| 13 |
+
- ✅ **Professional documentation ingestion pipeline** with chunking strategies
|
| 14 |
+
- ✅ **Semantic search** using vector embeddings (ChromaDB)
|
| 15 |
+
- ✅ **Source attribution** with clickable citations
|
| 16 |
+
- ✅ **RAG evaluation metrics** (RAGAS framework)
|
| 17 |
+
- ✅ **Dockerized deployment** ready for cloud platforms
|
| 18 |
+
- ✅ **Production-grade error handling** and logging
|
| 19 |
+
|
| 20 |
+
## 🏗️ Architecture
|
| 21 |
+
|
| 22 |
+
```
|
| 23 |
+
┌─────────────┐
|
| 24 |
+
│ User │
|
| 25 |
+
│ Question │
|
| 26 |
+
└──────┬──────┘
|
| 27 |
+
│
|
| 28 |
+
▼
|
| 29 |
+
┌─────────────────────────────────────┐
|
| 30 |
+
│ 1. Query Embedding │
|
| 31 |
+
│ (sentence-transformers) │
|
| 32 |
+
└──────────┬──────────────────────────┘
|
| 33 |
+
│
|
| 34 |
+
▼
|
| 35 |
+
┌─────────────────────────────────────┐
|
| 36 |
+
│ 2. Vector Search (ChromaDB) │
|
| 37 |
+
│ - Top 5 relevant chunks │
|
| 38 |
+
│ - Metadata: source, section │
|
| 39 |
+
└──────────┬──────────────────────────┘
|
| 40 |
+
│
|
| 41 |
+
▼
|
| 42 |
+
┌─────────────────────────────────────┐
|
| 43 |
+
│ 3. Context Assembly │
|
| 44 |
+
│ - Format chunks │
|
| 45 |
+
│ - Add instructions │
|
| 46 |
+
└──────────┬──────────────────────────┘
|
| 47 |
+
│
|
| 48 |
+
▼
|
| 49 |
+
┌─────────────────────────────────────┐
|
| 50 |
+
│ 4. LLM Generation (HF Inference) │
|
| 51 |
+
│ - Answer with citations │
|
| 52 |
+
│ - Code examples preserved │
|
| 53 |
+
└──────────┬──────────────────────────┘
|
| 54 |
+
│
|
| 55 |
+
▼
|
| 56 |
+
┌─────────────────────────────────────┐
|
| 57 |
+
│ 5. Response + Source Links │
|
| 58 |
+
└─────────────────────────────────────┘
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
### Local Setup
|
| 62 |
+
|
| 63 |
+
```bash
|
| 64 |
+
# Clone the repository
|
| 65 |
+
git clone https://github.com/aishwarya30998/DeveloperDocs-AI-Copilot-RAG.git
|
| 66 |
+
cd DeveloperDocs-AI-Copilot-RAG
|
| 67 |
+
|
| 68 |
+
# Create virtual environment
|
| 69 |
+
python -m venv venv
|
| 70 |
+
source venv/bin/activate
|
| 71 |
+
# On Windows: venv\Scripts\activate
|
| 72 |
+
|
| 73 |
+
# Install dependencies
|
| 74 |
+
pip install -r requirements.txt
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
# create .env and add your HF_TOKEN
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
# Run the application
|
| 81 |
+
python app.py
|
| 82 |
+
```
|
| 83 |
+
|
| 84 |
+
Visit `http://localhost:7860` in your browser.
|
| 85 |
+
|
| 86 |
+
## 📦 Project Structure
|
| 87 |
+
|
| 88 |
+
```
|
| 89 |
+
fastapi-docs-copilot/
|
| 90 |
+
├── app.py # Gradio UI application
|
| 91 |
+
├── Dockerfile # Container configuration
|
| 92 |
+
├── docker-compose.yml # Local container orchestration
|
| 93 |
+
├── requirements.txt # Python dependencies
|
| 94 |
+
├── .env.example # Environment variables template
|
| 95 |
+
│
|
| 96 |
+
├── src/
|
| 97 |
+
│ ├── __init__.py
|
| 98 |
+
│ ├── config.py # Configuration management
|
| 99 |
+
│ ├── chunking.py # Document chunking strategies
|
| 100 |
+
│ ├── embeddings.py # Embedding generation
|
| 101 |
+
│ ├── retriever.py # Vector search logic
|
| 102 |
+
│ ├── rag_pipeline.py # Main RAG orchestration
|
| 103 |
+
│ └── prompts.py # Prompt templates
|
| 104 |
+
│
|
| 105 |
+
├── scripts/
|
| 106 |
+
│ ├── ingest_docs.py # Documentation ingestion
|
| 107 |
+
│ ├── evaluate_rag.py # RAG metrics evaluation
|
| 108 |
+
│ └── test_retrieval.py # Test retrieval quality
|
| 109 |
+
│
|
| 110 |
+
├── data/
|
| 111 |
+
│ ├── raw/ # Downloaded documentation
|
| 112 |
+
│ ├── processed/ # Chunked documents
|
| 113 |
+
│ └── vectordb/ # ChromaDB storage
|
| 114 |
+
│
|
| 115 |
+
├── tests/
|
| 116 |
+
│ ├── test_chunking.py
|
| 117 |
+
│ ├── test_retriever.py
|
| 118 |
+
│ └── test_rag_pipeline.py
|
| 119 |
+
│
|
| 120 |
+
└── evals/
|
| 121 |
+
├── test_queries.json # Evaluation dataset
|
| 122 |
+
└── results/ # Evaluation outputs
|
| 123 |
+
```
|
| 124 |
+
|
| 125 |
+
## 🎯 Key Features
|
| 126 |
+
|
| 127 |
+
### 1. Smart Chunking
|
| 128 |
+
|
| 129 |
+
- **Semantic chunking** with overlap for context preservation
|
| 130 |
+
- **Metadata enrichment** (section titles, URLs, code blocks)
|
| 131 |
+
- **Configurable chunk sizes** (300-800 tokens)
|
| 132 |
+
|
| 133 |
+
### 2. Retrieval Quality
|
| 134 |
+
|
| 135 |
+
- **Hybrid search** (semantic + keyword)
|
| 136 |
+
- **Reranking** for improved relevance
|
| 137 |
+
- **Source attribution** with confidence scores
|
| 138 |
+
|
| 139 |
+
### 3. Answer Generation
|
| 140 |
+
|
| 141 |
+
- **Code-aware formatting** (preserves indentation)
|
| 142 |
+
- **Inline citations** with source links
|
| 143 |
+
- **Fallback handling** for low-confidence results
|
| 144 |
+
|
| 145 |
+
### 4. Production Features
|
| 146 |
+
|
| 147 |
+
- **Health check endpoint** (`/health`)
|
| 148 |
+
- **Query logging** for analytics
|
| 149 |
+
- **Rate limiting** (basic throttling)
|
| 150 |
+
- **Error recovery** with graceful degradation
|
| 151 |
+
|
| 152 |
+
## 📊 RAG Evaluation
|
| 153 |
+
|
| 154 |
+
We use **RAGAS** framework to measure:
|
| 155 |
+
|
| 156 |
+
| Metric | Description | Target Score |
|
| 157 |
+
| --------------------- | --------------------------- | ------------ |
|
| 158 |
+
| **Faithfulness** | Answer accuracy vs. context | > 0.8 |
|
| 159 |
+
| **Answer Relevancy** | Response relevance to query | > 0.7 |
|
| 160 |
+
| **Context Precision** | Retrieval accuracy | > 0.75 |
|
| 161 |
+
| **Context Recall** | Context completeness | > 0.8 |
|
| 162 |
+
|
| 163 |
+
Run evaluations:
|
| 164 |
+
|
| 165 |
+
```bash
|
| 166 |
+
python evaluate_rag.py
|
| 167 |
+
```
|
| 168 |
+
|
| 169 |
+
## 🐳 Docker Deployment
|
| 170 |
+
|
| 171 |
+
### Build and run locally:
|
| 172 |
+
|
| 173 |
+
```bash
|
| 174 |
+
docker build -t developerdocs-rag .
|
| 175 |
+
docker run -p 7860:7860 --name developerdocs-rag-container developerdocs-rag
|
| 176 |
+
```
|
| 177 |
+
|
| 178 |
+
### Deploy to HuggingFace Spaces:
|
| 179 |
+
|
| 180 |
+
1. Create a new Space on HuggingFace
|
| 181 |
+
2. Enable Docker SDK
|
| 182 |
+
3. Push this repository
|
| 183 |
+
4. Add `HF_TOKEN` as a Space secret
|
| 184 |
+
5. Deploy automatically
|
| 185 |
+
|
| 186 |
+
## 🧪 Testing
|
| 187 |
+
|
| 188 |
+
```bash
|
| 189 |
+
# Run all tests
pytest -v
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
# Test chunking strategy
|
| 193 |
+
pytest test_chunking.py -v
|
| 194 |
+
|
| 195 |
+
# Test retrieval quality
|
| 196 |
+
python test_retrieval.py
|
| 197 |
+
```
|
| 198 |
+
|
| 199 |
+
## 📈 Performance Benchmarks
|
| 200 |
+
|
| 201 |
+
On HuggingFace Spaces (free tier):
|
| 202 |
+
|
| 203 |
+
- **Query latency**: ~2-3 seconds
|
| 204 |
+
- **Vector DB size**: ~150MB (FastAPI docs)
|
| 205 |
+
- **Memory usage**: ~800MB
|
| 206 |
+
- **Concurrent users**: 5-10
|
| 207 |
+
|
| 208 |
+
## 🛠️ Technology Stack
|
| 209 |
+
|
| 210 |
+
| Component | Technology | Why? |
|
| 211 |
+
| -------------- | ---------------------------------------- | ---------------------------------- |
|
| 212 |
+
| **Embeddings** | `sentence-transformers/all-MiniLM-L6-v2` | Fast, lightweight, good quality |
|
| 213 |
+
| **Vector DB** | ChromaDB | Easy setup, persistent storage |
|
| 214 |
+
| **LLM** | HuggingFace Inference API (Mistral-7B) | Free tier, good code understanding |
|
| 215 |
+
| **Framework** | LangChain | Industry standard, modular |
|
| 216 |
+
| **UI** | Gradio | Rapid prototyping, HF integration |
|
| 217 |
+
| **Deployment** | Docker + HF Spaces | Free, scalable, shareable |
|
| 218 |
+
|
| 219 |
+
## 🔮 Future Enhancements
|
| 220 |
+
|
| 221 |
+
- [ ] Multi-documentation support (React, Django, etc.)
|
| 222 |
+
- [ ] Conversation memory for follow-up questions
|
| 223 |
+
- [ ] Advanced retrieval (HyDE, Multi-Query)
|
| 224 |
+
- [ ] User feedback loop for continuous improvement
|
| 225 |
+
- [ ] Analytics dashboard for query patterns
|
| 226 |
+
|
| 227 |
+
## 📝 License
|
| 228 |
+
|
| 229 |
+
MIT License - feel free to use for your portfolio!
|
| 230 |
+
|
| 231 |
+
## 🤝 Contributing
|
| 232 |
+
|
| 233 |
+
This is a portfolio project, but suggestions are welcome via issues.
|
| 234 |
+
|
| 235 |
+
## 📧 Contact
|
| 236 |
+
|
| 237 |
+
Built by Aishwarya as a portfolio demonstration of production RAG systems.
|
| 238 |
+
|
| 239 |
+
- Portfolio: https://aishwarya30998.github.io/projects.html
|
| 240 |
+
- LinkedIn: https://www.linkedin.com/in/aishwarya-pentyala/
|
| 241 |
+
|
| 242 |
---
|
| 243 |
|
| 244 |
+
⭐ If this helped you understand production RAG, give it a star!
|
__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Developer Docs AI Copilot - RAG System
|
| 3 |
+
"""
|
| 4 |
+
__version__ = "1.0.0"
|
app.py
ADDED
|
@@ -0,0 +1,372 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Developer Docs AI Copilot - Gradio UI Application
|
| 3 |
+
|
| 4 |
+
Production-grade RAG chatbot interface for any developer documentation.
|
| 5 |
+
|
| 6 |
+
Two-tab UI:
|
| 7 |
+
Setup tab — enter a docs URL, trigger ingestion/embedding
|
| 8 |
+
Chat tab — ask questions, get answers with source citations
|
| 9 |
+
"""
|
| 10 |
+
import logging
|
| 11 |
+
import sys
|
| 12 |
+
import queue
|
| 13 |
+
import threading
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from typing import List, Tuple, Optional
|
| 16 |
+
import gradio as gr
|
| 17 |
+
from datetime import datetime
|
| 18 |
+
import json
|
| 19 |
+
from urllib.parse import urlparse
|
| 20 |
+
|
| 21 |
+
from src import create_rag_pipeline, settings
|
| 22 |
+
from src.config import RESULTS_DIR
|
| 23 |
+
from ingest_docs import run_ingestion
|
| 24 |
+
|
| 25 |
+
logging.basicConfig(
|
| 26 |
+
level=getattr(logging, settings.log_level),
|
| 27 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 28 |
+
)
|
| 29 |
+
logger = logging.getLogger(__name__)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# Global pipeline state
# rag_pipeline stays None until _try_load_pipeline() or ingest_and_stream()
# successfully builds it; UI handlers check it before answering queries.
rag_pipeline = None
# Stats dict returned by rag_pipeline.get_stats(); {} when no pipeline loaded.
pipeline_stats: dict = {}
current_docs_name: str = settings.docs_name  # may be updated after ingestion
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _try_load_pipeline():
    """Attempt to load the RAG pipeline from an existing vector DB.

    On success, populates the module-level ``rag_pipeline`` and
    ``pipeline_stats``. On any failure (typically: no vector DB exists yet),
    both globals are reset so the UI directs the user to the Setup tab.
    """
    global rag_pipeline, pipeline_stats
    try:
        pipeline = create_rag_pipeline()
        stats = pipeline.get_stats()
    except Exception as e:
        # Expected on first boot before any ingestion has run — warn, don't crash.
        logger.warning(f"Could not load pipeline on startup (run Setup first): {e}")
        rag_pipeline = None
        pipeline_stats = {}
    else:
        rag_pipeline = pipeline
        pipeline_stats = stats
        logger.info(f"Pipeline loaded. {pipeline_stats.get('total_chunks', 0)} chunks indexed.")
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
_try_load_pipeline()
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# Query logging
|
| 55 |
+
|
| 56 |
+
QUERY_LOG_FILE = RESULTS_DIR / "query_log.jsonl"
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def log_query(question: str, response: dict):
    """Append one question/answer record to the JSONL query log.

    Best-effort analytics: any failure is logged and swallowed so that a
    logging problem can never break the chat flow.
    """
    try:
        record = {
            "timestamp": datetime.now().isoformat(),
            "docs_name": current_docs_name,
            "question": question,
            "answer": response.get("answer", ""),
            "source_count": response.get("source_count", 0),
            "confidence": response.get("confidence", "unknown"),
            "chunks_retrieved": response.get("chunks_retrieved", 0),
        }
        with open(QUERY_LOG_FILE, "a", encoding="utf-8") as log_file:
            log_file.write(json.dumps(record) + "\n")
    except Exception as e:
        logger.error(f"Failed to log query: {e}")
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# Chat helpers
|
| 77 |
+
|
| 78 |
+
def format_sources(sources: List[dict]) -> str:
    """Render retrieved source metadata as a Markdown citation list.

    Each entry shows the title, an optional section in parentheses, a
    relevance percentage, and a documentation link when a real URL is
    present. An empty list yields a fallback message.
    """
    if not sources:
        return "No sources available."

    pieces = ["### Sources\n\n"]
    for idx, src in enumerate(sources, 1):
        heading = f"{idx}. **{src.get('title', 'Unknown')}**"
        section = src.get("section", "")
        if section:
            heading += f" ({section})"
        pieces.append(heading)
        pieces.append(f"\n - Relevance: {src.get('score', 0.0):.2%}\n")
        link = src.get("url", "#")
        # "#" is the placeholder for "no URL" — only emit a link for real URLs.
        if link and link != "#":
            pieces.append(f" - [View Documentation]({link})\n")
        pieces.append("\n")
    return "".join(pieces)
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def process_query(question: str, history: List[Tuple[str, str]]) -> Tuple[str, str]:
    """Answer one user question via the RAG pipeline.

    Returns a ``(answer_markdown, sources_markdown)`` pair. Guards against
    a missing pipeline and blank input; any pipeline failure is caught and
    surfaced inline instead of raising into the UI layer.
    """
    # Guard: no pipeline means ingestion has not been run yet.
    if not rag_pipeline:
        return (
            "Pipeline not ready. Please go to the **Setup** tab and ingest documentation first.",
            "No sources available.",
        )
    # Guard: empty or whitespace-only question.
    if not question or not question.strip():
        return "Please enter a question.", ""

    try:
        logger.info(f"Processing query: {question[:100]}...")
        result = rag_pipeline.query(question, top_k=5)
        log_query(question, result)

        confidence = result.get("confidence", "unknown")
        retrieved = result.get("chunks_retrieved", 0)
        footer = f"\n\n---\n*Confidence: {confidence.upper()} | Retrieved {retrieved} chunks*"
        return result["answer"] + footer, format_sources(result.get("sources", []))
    except Exception as e:
        logger.error(f"Error processing query: {e}", exc_info=True)
        return f"Error: {str(e)}", "No sources available."
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
# Ingestion helper — runs in a background thread, streams log lines via queue
|
| 125 |
+
def _derive_docs_name(url: str) -> str:
|
| 126 |
+
hostname = urlparse(url).hostname or ""
|
| 127 |
+
return hostname.split(".")[0].replace("-", " ").title()
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def ingest_and_stream(docs_url: str, docs_name: str, url_patterns_raw: str):
    """
    Generator function: runs ingestion in a background thread and streams
    status lines to the Gradio Textbox.

    Args:
        docs_url: Root URL of the documentation site to ingest.
        docs_name: Human-readable name; auto-derived from the URL if blank.
        url_patterns_raw: Comma-separated URL path patterns to include;
            empty string means no filtering.

    Yields:
        The full accumulated log text after each new status line, so the
        bound Gradio Textbox displays a growing log rather than one line.

    Side effects: on success, mutates module globals ``rag_pipeline``,
    ``pipeline_stats``, ``current_docs_name`` and the shared ``settings``.
    """
    global rag_pipeline, pipeline_stats, current_docs_name

    docs_url = docs_url.strip().rstrip("/")
    docs_name = docs_name.strip() or _derive_docs_name(docs_url)
    url_patterns = [p.strip() for p in url_patterns_raw.split(",") if p.strip()]

    if not docs_url:
        yield "Please enter a documentation URL."
        return

    # Queue used to pass log lines from the worker thread to the generator
    log_q: queue.Queue = queue.Queue()
    result_holder: dict = {}
    error_holder: dict = {}

    def worker():
        # Runs in a background thread: performs ingestion, forwarding each
        # progress message into log_q; results/errors are passed back via
        # the holder dicts (plain dicts are safe here — single writer).
        try:
            stats = run_ingestion(
                docs_url=docs_url,
                docs_name=docs_name,
                url_patterns=url_patterns or None,
                progress_callback=lambda msg: log_q.put(msg),
            )
            result_holder["stats"] = stats
        except Exception as exc:
            error_holder["error"] = str(exc)
            logger.error(f"Ingestion failed: {exc}", exc_info=True)
        finally:
            log_q.put(None)  # sentinel
    # daemon=True so a hung ingestion cannot block process shutdown.
    thread = threading.Thread(target=worker, daemon=True)
    thread.start()

    # Stream log lines as they arrive
    accumulated = ""
    while True:
        try:
            # 120 s without any progress message is treated as a stall.
            line = log_q.get(timeout=120)
        except queue.Empty:
            yield accumulated + "\n[Timed out waiting for ingestion]"
            return

        if line is None:  # sentinel → done
            break

        accumulated += line + "\n"
        yield accumulated

    thread.join(timeout=5)

    if "error" in error_holder:
        yield accumulated + f"\n\nIngestion failed: {error_holder['error']}"
        return

    # Reload the RAG pipeline with the newly ingested docs
    accumulated += "\nReloading RAG pipeline..."
    yield accumulated

    try:
        # Update settings so the pipeline and prompts use the new docs name
        settings.docs_url = docs_url
        settings.docs_name = docs_name
        current_docs_name = docs_name

        rag_pipeline = create_rag_pipeline()
        pipeline_stats = rag_pipeline.get_stats()

        accumulated += f"\nPipeline ready — {pipeline_stats.get('total_chunks', 0)} chunks indexed."
        accumulated += f"\n\nSwitch to the Chat tab and start asking questions about {docs_name}!"
        yield accumulated

    except Exception as e:
        accumulated += f"\n\nPipeline reload failed: {e}"
        yield accumulated
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
# UI
|
| 212 |
+
# UI
def create_ui():
    """Build and return the two-tab Gradio Blocks app (Setup + Chat).

    Reads module globals (rag_pipeline, pipeline_stats, current_docs_name)
    once at build time for the initial Chat-tab status line; later changes
    from ingestion are not reflected in that static Markdown.
    """
    custom_css = """
    .stats-box {
        background: #e3f2fd;
        padding: 10px;
        border-radius: 5px;
        margin: 10px 0;
    }
    """

    with gr.Blocks(
        title="Developer Docs AI Copilot",
        theme=gr.themes.Soft(),
        css=custom_css,
    ) as app:

        gr.Markdown("# Developer Docs AI Copilot")
        gr.Markdown(
            "Ingest any developer documentation and ask questions answered directly from the source."
        )

        with gr.Tabs() as tabs:

            # TAB 1 — Setup
            with gr.Tab("⚙️ Setup — Ingest Docs", id="setup"):
                gr.Markdown(
                    "Enter the URL of any developer documentation site. "
                    "The system will scrape, chunk, embed, and index it for Q&A."
                )

                with gr.Row():
                    docs_url_input = gr.Textbox(
                        label="Documentation URL",
                        placeholder="e.g. https://docs.djangoproject.com/en/stable/",
                        scale=3,
                    )
                    docs_name_input = gr.Textbox(
                        label="Docs Name (optional — auto-derived if empty)",
                        placeholder="e.g. Django",
                        scale=1,
                    )

                url_patterns_input = gr.Textbox(
                    label="URL Path Patterns to include (optional, comma-separated)",
                    placeholder="e.g. /topics,/ref,/howto — leave empty to include all pages",
                )

                ingest_btn = gr.Button("Ingest Documentation", variant="primary")

                ingest_status = gr.Textbox(
                    label="Ingestion Log",
                    lines=20,
                    interactive=False,
                    placeholder="Status will appear here when you click Ingest...",
                )

                # Wire up the button to the streaming generator
                # (ingest_and_stream is a generator, so the Textbox updates live).
                ingest_btn.click(
                    fn=ingest_and_stream,
                    inputs=[docs_url_input, docs_name_input, url_patterns_input],
                    outputs=ingest_status,
                )

                gr.Markdown("""
                **Tips:**
                - Most documentation sites (FastAPI, Django, React, Stripe, etc.) work out of the box
                - Use URL patterns to ingest only a specific section (faster)
                - Re-run ingestion any time to switch to a different documentation source
                - Default page cap is **50 pages** — sufficient for most demos
                """)

            # TAB 2 — Chat
            with gr.Tab("💬 Chat", id="chat"):

                # Live status bar — computed once when the UI is built.
                status_text = (
                    f"Ready — {pipeline_stats.get('total_chunks', 0)} chunks indexed "
                    f"({current_docs_name})"
                    if rag_pipeline
                    else "Not ready — please ingest documentation in the Setup tab first."
                )
                status_md = gr.Markdown(f"**Status:** {status_text}")

                with gr.Row():
                    with gr.Column(scale=2):
                        chatbot = gr.Chatbot(
                            label="Conversation",
                            height=420,
                            show_copy_button=True,
                        )

                        with gr.Row():
                            question_input = gr.Textbox(
                                label="Ask a question",
                                placeholder="e.g. How do I get started?",
                                lines=2,
                                scale=4,
                            )
                            submit_btn = gr.Button("Ask", variant="primary", scale=1)

                        gr.Examples(
                            examples=[
                                "How do I get started?",
                                "What are the core concepts?",
                                "Show me a basic example",
                                "How do I handle authentication?",
                                "What is the recommended project structure?",
                            ],
                            inputs=question_input,
                            label="Example Questions",
                        )

                    with gr.Column(scale=1):
                        sources_display = gr.Markdown(
                            value="Sources will appear here after asking a question."
                        )

                clear_btn = gr.Button("Clear Conversation")

                def respond(message, chat_history):
                    # Chat event handler: answer, append the turn, clear the input.
                    answer, sources = process_query(message, chat_history)
                    chat_history.append((message, answer))
                    return "", chat_history, sources

                # Both the button and Enter-in-textbox trigger the same handler.
                submit_btn.click(
                    respond,
                    inputs=[question_input, chatbot],
                    outputs=[question_input, chatbot, sources_display],
                )
                question_input.submit(
                    respond,
                    inputs=[question_input, chatbot],
                    outputs=[question_input, chatbot, sources_display],
                )
                clear_btn.click(
                    lambda: ([], "Sources will appear here after asking a question."),
                    outputs=[chatbot, sources_display],
                )

        gr.Markdown(
            "---\n*Built with: ChromaDB · Sentence Transformers · HuggingFace · Gradio*"
        )

    return app
|
| 357 |
+
|
| 358 |
+
|
| 359 |
+
def health_check():
    """Return a liveness payload: overall status plus pipeline readiness."""
    ready = rag_pipeline is not None
    return {"status": "healthy", "pipeline_ready": ready}
|
| 361 |
+
|
| 362 |
+
|
| 363 |
+
if __name__ == "__main__":
|
| 364 |
+
logger.info("Starting Developer Docs AI Copilot...")
|
| 365 |
+
app = create_ui()
|
| 366 |
+
logger.info(f"Launching on port {settings.app_port}")
|
| 367 |
+
app.launch(
|
| 368 |
+
server_name="0.0.0.0",
|
| 369 |
+
server_port=settings.app_port,
|
| 370 |
+
share=False,
|
| 371 |
+
show_error=True,
|
| 372 |
+
)
|
ci.yml
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: CI/CD Pipeline

# Run tests on every push to main/develop and on PRs targeting main;
# the Docker build job runs only on main after tests pass.
on:
  push:
    branches: [ main, develop ]
  pull_request:
    branches: [ main ]

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      # Cache pip downloads keyed on requirements.txt to speed up CI.
      - name: Cache dependencies
        uses: actions/cache@v3
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }}
          restore-keys: |
            ${{ runner.os }}-pip-

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install pytest pytest-cov

      - name: Run tests
        run: |
          pytest tests/ -v --cov=src --cov-report=term-missing

      - name: Lint code
        run: |
          pip install flake8
          flake8 src/ --max-line-length=100 --ignore=E501,W503

  build:
    runs-on: ubuntu-latest
    needs: test
    # Only build/smoke-test the image for pushes to main.
    if: github.ref == 'refs/heads/main'

    steps:
      - uses: actions/checkout@v3

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2

      - name: Build Docker image
        run: |
          docker build -t fastapi-copilot:latest .

      # Smoke test: start the container, wait, probe the health endpoint.
      # NOTE(review): curls /health — confirm the app actually serves that
      # route, otherwise this step fails even when the image is fine.
      - name: Test Docker image
        run: |
          docker run -d --name test-container -p 7860:7860 \
            -e HF_TOKEN=${{ secrets.HF_TOKEN }} \
            fastapi-copilot:latest
          sleep 10
          curl -f http://localhost:7860/health || exit 1
          docker stop test-container
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version: '3.8'
|
| 2 |
+
|
| 3 |
+
services:
|
| 4 |
+
developerdocs-copilot:
|
| 5 |
+
build: .
|
| 6 |
+
container_name: developer-docs-copilot
|
| 7 |
+
ports:
|
| 8 |
+
- "7860:7860"
|
| 9 |
+
env_file:
|
| 10 |
+
- .env
|
| 11 |
+
volumes:
|
| 12 |
+
# Mount vector DB for persistence
|
| 13 |
+
- ./data/vectordb:/app/data/vectordb
|
| 14 |
+
# Mount for live code changes during development
|
| 15 |
+
- ./src:/app/src
|
| 16 |
+
- ./app.py:/app/app.py
|
| 17 |
+
environment:
|
| 18 |
+
- PYTHONUNBUFFERED=1
|
| 19 |
+
restart: unless-stopped
|
| 20 |
+
healthcheck:
|
| 21 |
+
test: ["CMD", "curl", "-f", "http://localhost:7860/health"]
|
| 22 |
+
interval: 30s
|
| 23 |
+
timeout: 10s
|
| 24 |
+
retries: 3
|
| 25 |
+
start_period: 60s
|
evaluate_rag.py
ADDED
|
@@ -0,0 +1,229 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Evaluate RAG pipeline using RAGAS framework.
|
| 3 |
+
|
| 4 |
+
Measures:
|
| 5 |
+
- Faithfulness: Answer accuracy vs. retrieved context
|
| 6 |
+
- Answer Relevancy: How relevant the answer is to the question
|
| 7 |
+
- Context Precision: How precise the retrieved context is
|
| 8 |
+
- Context Recall: Coverage of relevant information
|
| 9 |
+
"""
|
| 10 |
+
import logging
|
| 11 |
+
import sys
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
import json
|
| 14 |
+
from typing import List, Dict, Any
|
| 15 |
+
from datetime import datetime
|
| 16 |
+
|
| 17 |
+
# Add parent directory to path
|
| 18 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 19 |
+
|
| 20 |
+
from src import create_rag_pipeline, settings
|
| 21 |
+
from src.config import EVALS_DIR, RESULTS_DIR
|
| 22 |
+
|
| 23 |
+
try:
|
| 24 |
+
from datasets import Dataset
|
| 25 |
+
from ragas import evaluate
|
| 26 |
+
from ragas.metrics import (
|
| 27 |
+
faithfulness,
|
| 28 |
+
answer_relevancy,
|
| 29 |
+
context_precision,
|
| 30 |
+
context_recall,
|
| 31 |
+
)
|
| 32 |
+
RAGAS_AVAILABLE = True
|
| 33 |
+
except ImportError:
|
| 34 |
+
RAGAS_AVAILABLE = False
|
| 35 |
+
print("WARNING: RAGAS not installed. Install with: pip install ragas")
|
| 36 |
+
|
| 37 |
+
logging.basicConfig(
|
| 38 |
+
level=logging.INFO,
|
| 39 |
+
format='%(asctime)s - %(levelname)s - %(message)s'
|
| 40 |
+
)
|
| 41 |
+
logger = logging.getLogger(__name__)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# Evaluation dataset
|
| 45 |
+
TEST_QUERIES = [
|
| 46 |
+
{
|
| 47 |
+
"question": "How do I create a FastAPI application?",
|
| 48 |
+
"ground_truth": "You create a FastAPI application by importing FastAPI and creating an instance: from fastapi import FastAPI; app = FastAPI()"
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
"question": "What are path parameters in FastAPI?",
|
| 52 |
+
"ground_truth": "Path parameters are variables in the URL path that FastAPI can extract and pass to your endpoint function."
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"question": "How do I add request validation?",
|
| 56 |
+
"ground_truth": "FastAPI uses Pydantic models for request validation. You define a model with type hints and use it as a parameter type."
|
| 57 |
+
},
|
| 58 |
+
{
|
| 59 |
+
"question": "What is dependency injection in FastAPI?",
|
| 60 |
+
"ground_truth": "Dependency injection allows you to declare dependencies that FastAPI will resolve and inject into your endpoint functions."
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"question": "How do I handle authentication in FastAPI?",
|
| 64 |
+
"ground_truth": "FastAPI provides security utilities for OAuth2, JWT tokens, and API keys. You can use dependencies to protect endpoints."
|
| 65 |
+
},
|
| 66 |
+
]
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def run_evaluation():
|
| 70 |
+
"""Run RAGAS evaluation on the RAG pipeline."""
|
| 71 |
+
|
| 72 |
+
if not RAGAS_AVAILABLE:
|
| 73 |
+
logger.error("RAGAS not available. Please install it.")
|
| 74 |
+
return
|
| 75 |
+
|
| 76 |
+
logger.info("=" * 60)
|
| 77 |
+
logger.info("RAG Evaluation with RAGAS")
|
| 78 |
+
logger.info("=" * 60)
|
| 79 |
+
|
| 80 |
+
# Initialize pipeline
|
| 81 |
+
logger.info("Initializing RAG pipeline...")
|
| 82 |
+
pipeline = create_rag_pipeline()
|
| 83 |
+
|
| 84 |
+
# Prepare evaluation data
|
| 85 |
+
logger.info(f"\nRunning evaluation on {len(TEST_QUERIES)} queries...")
|
| 86 |
+
|
| 87 |
+
evaluation_data = {
|
| 88 |
+
"question": [],
|
| 89 |
+
"answer": [],
|
| 90 |
+
"contexts": [],
|
| 91 |
+
"ground_truth": []
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
for item in TEST_QUERIES:
|
| 95 |
+
question = item["question"]
|
| 96 |
+
logger.info(f"\nProcessing: {question}")
|
| 97 |
+
|
| 98 |
+
# Get response from pipeline
|
| 99 |
+
response = pipeline.query(question, top_k=5)
|
| 100 |
+
|
| 101 |
+
# Extract data for RAGAS
|
| 102 |
+
evaluation_data["question"].append(question)
|
| 103 |
+
evaluation_data["answer"].append(response["answer"])
|
| 104 |
+
evaluation_data["ground_truth"].append(item["ground_truth"])
|
| 105 |
+
|
| 106 |
+
# Get context from retrieved chunks
|
| 107 |
+
contexts = []
|
| 108 |
+
retrieved_chunks = pipeline.retriever.retrieve(question, top_k=5)
|
| 109 |
+
for chunk in retrieved_chunks:
|
| 110 |
+
contexts.append(chunk["content"])
|
| 111 |
+
evaluation_data["contexts"].append(contexts)
|
| 112 |
+
|
| 113 |
+
logger.info(f" Answer length: {len(response['answer'])} chars")
|
| 114 |
+
logger.info(f" Contexts retrieved: {len(contexts)}")
|
| 115 |
+
|
| 116 |
+
# Create dataset
|
| 117 |
+
dataset = Dataset.from_dict(evaluation_data)
|
| 118 |
+
|
| 119 |
+
# Run evaluation
|
| 120 |
+
logger.info("\n" + "=" * 60)
|
| 121 |
+
logger.info("Running RAGAS metrics...")
|
| 122 |
+
logger.info("=" * 60)
|
| 123 |
+
|
| 124 |
+
try:
|
| 125 |
+
results = evaluate(
|
| 126 |
+
dataset,
|
| 127 |
+
metrics=[
|
| 128 |
+
faithfulness,
|
| 129 |
+
answer_relevancy,
|
| 130 |
+
context_precision,
|
| 131 |
+
context_recall,
|
| 132 |
+
],
|
| 133 |
+
)
|
| 134 |
+
|
| 135 |
+
# Display results
|
| 136 |
+
logger.info("\n" + "=" * 60)
|
| 137 |
+
logger.info("Evaluation Results")
|
| 138 |
+
logger.info("=" * 60)
|
| 139 |
+
|
| 140 |
+
metrics = {
|
| 141 |
+
"faithfulness": results["faithfulness"],
|
| 142 |
+
"answer_relevancy": results["answer_relevancy"],
|
| 143 |
+
"context_precision": results["context_precision"],
|
| 144 |
+
"context_recall": results["context_recall"],
|
| 145 |
+
}
|
| 146 |
+
|
| 147 |
+
for metric_name, score in metrics.items():
|
| 148 |
+
logger.info(f"{metric_name.replace('_', ' ').title()}: {score:.4f}")
|
| 149 |
+
|
| 150 |
+
# Overall score
|
| 151 |
+
overall_score = sum(metrics.values()) / len(metrics)
|
| 152 |
+
logger.info(f"\nOverall Score: {overall_score:.4f}")
|
| 153 |
+
|
| 154 |
+
# Interpretation
|
| 155 |
+
logger.info("\n" + "=" * 60)
|
| 156 |
+
logger.info("Interpretation")
|
| 157 |
+
logger.info("=" * 60)
|
| 158 |
+
logger.info("Scores range from 0 to 1 (higher is better)")
|
| 159 |
+
logger.info("Target scores for production:")
|
| 160 |
+
logger.info(" • Faithfulness: > 0.80 (answers are accurate)")
|
| 161 |
+
logger.info(" • Answer Relevancy: > 0.70 (answers address the question)")
|
| 162 |
+
logger.info(" • Context Precision: > 0.75 (retrieved context is relevant)")
|
| 163 |
+
logger.info(" • Context Recall: > 0.80 (all relevant info is retrieved)")
|
| 164 |
+
|
| 165 |
+
# Save results
|
| 166 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 167 |
+
results_file = RESULTS_DIR / f"ragas_eval_{timestamp}.json"
|
| 168 |
+
|
| 169 |
+
results_dict = {
|
| 170 |
+
"timestamp": timestamp,
|
| 171 |
+
"metrics": metrics,
|
| 172 |
+
"overall_score": overall_score,
|
| 173 |
+
"test_queries": TEST_QUERIES,
|
| 174 |
+
"settings": {
|
| 175 |
+
"chunk_size": settings.chunk_size,
|
| 176 |
+
"chunk_overlap": settings.chunk_overlap,
|
| 177 |
+
"top_k": 5,
|
| 178 |
+
"embedding_model": settings.embedding_model,
|
| 179 |
+
"llm_model": settings.llm_model
|
| 180 |
+
}
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
with open(results_file, 'w') as f:
|
| 184 |
+
json.dump(results_dict, f, indent=2)
|
| 185 |
+
|
| 186 |
+
logger.info(f"\nResults saved to: {results_file}")
|
| 187 |
+
|
| 188 |
+
except Exception as e:
|
| 189 |
+
logger.error(f"Evaluation failed: {e}", exc_info=True)
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def simple_accuracy_test():
|
| 193 |
+
"""Simple accuracy test without RAGAS."""
|
| 194 |
+
logger.info("Running simple accuracy test...")
|
| 195 |
+
|
| 196 |
+
pipeline = create_rag_pipeline()
|
| 197 |
+
|
| 198 |
+
correct = 0
|
| 199 |
+
total = len(TEST_QUERIES)
|
| 200 |
+
|
| 201 |
+
for item in TEST_QUERIES:
|
| 202 |
+
question = item["question"]
|
| 203 |
+
response = pipeline.query(question)
|
| 204 |
+
|
| 205 |
+
# Simple check: does answer contain key terms?
|
| 206 |
+
answer_lower = response["answer"].lower()
|
| 207 |
+
ground_truth_lower = item["ground_truth"].lower()
|
| 208 |
+
|
| 209 |
+
# Extract key terms from ground truth
|
| 210 |
+
key_terms = [term for term in ground_truth_lower.split() if len(term) > 4]
|
| 211 |
+
|
| 212 |
+
# Check if at least 50% of key terms are in answer
|
| 213 |
+
matches = sum(1 for term in key_terms if term in answer_lower)
|
| 214 |
+
if matches / len(key_terms) >= 0.5:
|
| 215 |
+
correct += 1
|
| 216 |
+
logger.info(f"✓ {question}")
|
| 217 |
+
else:
|
| 218 |
+
logger.info(f"✗ {question}")
|
| 219 |
+
|
| 220 |
+
accuracy = correct / total
|
| 221 |
+
logger.info(f"\nSimple Accuracy: {accuracy:.2%} ({correct}/{total})")
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
if __name__ == "__main__":
|
| 225 |
+
if RAGAS_AVAILABLE:
|
| 226 |
+
run_evaluation()
|
| 227 |
+
else:
|
| 228 |
+
logger.warning("RAGAS not available. Running simple test instead.")
|
| 229 |
+
simple_accuracy_test()
|
ingest_docs.py
ADDED
|
@@ -0,0 +1,360 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Ingest developer documentation into the vector database.
|
| 3 |
+
|
| 4 |
+
This script:
|
| 5 |
+
1. Scrapes documentation from any URL (via sitemap or recursive crawl)
|
| 6 |
+
2. Chunks the content semantically
|
| 7 |
+
3. Generates embeddings
|
| 8 |
+
4. Stores in ChromaDB
|
| 9 |
+
|
| 10 |
+
Usage:
|
| 11 |
+
python ingest_docs.py
|
| 12 |
+
|
| 13 |
+
Configure via environment variables (or .env):
|
| 14 |
+
DOCS_URL - Base URL of the documentation (required)
|
| 15 |
+
DOCS_NAME - auto-derived if empty
|
| 16 |
+
DOCS_URL_PATTERNS - Comma-separated path patterns to include, e.g. "/tutorial,/guide"
|
| 17 |
+
Leave empty to include all pages under the base URL.
|
| 18 |
+
COLLECTION_NAME - ChromaDB collection name
|
| 19 |
+
"""
|
| 20 |
+
import logging
|
| 21 |
+
import re
|
| 22 |
+
from pathlib import Path
|
| 23 |
+
from urllib.parse import urlparse, urljoin
|
| 24 |
+
import requests
|
| 25 |
+
from bs4 import BeautifulSoup
|
| 26 |
+
from typing import List, Dict, Any, Optional
|
| 27 |
+
from tqdm import tqdm
|
| 28 |
+
import json
|
| 29 |
+
|
| 30 |
+
from src.config import settings, RAW_DATA_DIR, PROCESSED_DATA_DIR
|
| 31 |
+
from src.chunking import create_chunker
|
| 32 |
+
from src.retriever import create_retriever
|
| 33 |
+
|
| 34 |
+
logging.basicConfig(
|
| 35 |
+
level=logging.INFO,
|
| 36 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 37 |
+
)
|
| 38 |
+
logger = logging.getLogger(__name__)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class DocsScraper:
|
| 42 |
+
"""
|
| 43 |
+
Generic documentation scraper that works with any documentation site.
|
| 44 |
+
|
| 45 |
+
Discovers pages via sitemap.xml first; falls back to recursive same-domain
|
| 46 |
+
crawling if no sitemap is available.
|
| 47 |
+
"""
|
| 48 |
+
|
| 49 |
+
def __init__(
|
| 50 |
+
self,
|
| 51 |
+
base_url: str,
|
| 52 |
+
url_patterns: Optional[List[str]] = None,
|
| 53 |
+
max_pages: int = 200,
|
| 54 |
+
):
|
| 55 |
+
"""
|
| 56 |
+
Args:
|
| 57 |
+
base_url: Root URL of the documentation site.
|
| 58 |
+
url_patterns: Optional list of path substrings to include
|
| 59 |
+
(e.g. ["/tutorial", "/guide"]). When empty/None,
|
| 60 |
+
all pages whose URL starts with base_url are included.
|
| 61 |
+
max_pages: Safety cap on the number of pages to scrape.
|
| 62 |
+
"""
|
| 63 |
+
self.base_url = base_url.rstrip("/")
|
| 64 |
+
self.url_patterns = url_patterns or []
|
| 65 |
+
self.max_pages = max_pages
|
| 66 |
+
|
| 67 |
+
parsed = urlparse(self.base_url)
|
| 68 |
+
self.base_domain = parsed.netloc
|
| 69 |
+
|
| 70 |
+
# URL discovery
|
| 71 |
+
def get_doc_urls(self) -> List[str]:
|
| 72 |
+
"""Return a deduplicated list of documentation page URLs."""
|
| 73 |
+
urls = self._urls_from_sitemap()
|
| 74 |
+
if not urls:
|
| 75 |
+
logger.warning("No sitemap found or empty — falling back to recursive crawl")
|
| 76 |
+
urls = self._urls_from_crawl()
|
| 77 |
+
|
| 78 |
+
urls = self._filter_urls(urls)
|
| 79 |
+
logger.info(f"Discovered {len(urls)} documentation pages")
|
| 80 |
+
return urls[: self.max_pages]
|
| 81 |
+
|
| 82 |
+
def _urls_from_sitemap(self) -> List[str]:
|
| 83 |
+
"""Try to fetch all URLs from sitemap.xml."""
|
| 84 |
+
sitemap_url = f"{self.base_url}/sitemap.xml"
|
| 85 |
+
logger.info(f"Fetching sitemap: {sitemap_url}")
|
| 86 |
+
try:
|
| 87 |
+
resp = requests.get(sitemap_url, timeout=10)
|
| 88 |
+
resp.raise_for_status()
|
| 89 |
+
soup = BeautifulSoup(resp.content, "xml")
|
| 90 |
+
urls = [loc.text.strip() for loc in soup.find_all("loc")]
|
| 91 |
+
logger.info(f"Found {len(urls)} URLs in sitemap")
|
| 92 |
+
return urls
|
| 93 |
+
except Exception as e:
|
| 94 |
+
logger.warning(f"Could not load sitemap: {e}")
|
| 95 |
+
return []
|
| 96 |
+
|
| 97 |
+
def _urls_from_crawl(self, start_url: Optional[str] = None) -> List[str]:
|
| 98 |
+
"""
|
| 99 |
+
Recursively crawl same-domain links starting from base_url.
|
| 100 |
+
Limited to self.max_pages pages to avoid runaway crawls.
|
| 101 |
+
"""
|
| 102 |
+
start = start_url or self.base_url
|
| 103 |
+
visited: set = set()
|
| 104 |
+
queue: List[str] = [start]
|
| 105 |
+
found: List[str] = []
|
| 106 |
+
|
| 107 |
+
while queue and len(found) < self.max_pages * 2:
|
| 108 |
+
url = queue.pop(0)
|
| 109 |
+
if url in visited:
|
| 110 |
+
continue
|
| 111 |
+
visited.add(url)
|
| 112 |
+
|
| 113 |
+
try:
|
| 114 |
+
resp = requests.get(url, timeout=10)
|
| 115 |
+
if resp.status_code != 200:
|
| 116 |
+
continue
|
| 117 |
+
soup = BeautifulSoup(resp.content, "html.parser")
|
| 118 |
+
found.append(url)
|
| 119 |
+
|
| 120 |
+
for tag in soup.find_all("a", href=True):
|
| 121 |
+
href = tag["href"].strip()
|
| 122 |
+
absolute = urljoin(url, href).split("#")[0]
|
| 123 |
+
if (
|
| 124 |
+
absolute not in visited
|
| 125 |
+
and urlparse(absolute).netloc == self.base_domain
|
| 126 |
+
and absolute.startswith("http")
|
| 127 |
+
):
|
| 128 |
+
queue.append(absolute)
|
| 129 |
+
except Exception as e:
|
| 130 |
+
logger.debug(f"Crawl error for {url}: {e}")
|
| 131 |
+
|
| 132 |
+
return found
|
| 133 |
+
|
| 134 |
+
def _filter_urls(self, urls: List[str]) -> List[str]:
|
| 135 |
+
"""
|
| 136 |
+
Keep only URLs that belong to the same domain and, if url_patterns
|
| 137 |
+
is set, match at least one pattern.
|
| 138 |
+
"""
|
| 139 |
+
filtered = []
|
| 140 |
+
for url in urls:
|
| 141 |
+
parsed = urlparse(url)
|
| 142 |
+
if parsed.netloc != self.base_domain:
|
| 143 |
+
continue
|
| 144 |
+
if self.url_patterns:
|
| 145 |
+
if not any(p in parsed.path for p in self.url_patterns):
|
| 146 |
+
continue
|
| 147 |
+
filtered.append(url)
|
| 148 |
+
seen = set()
|
| 149 |
+
unique = []
|
| 150 |
+
for u in filtered:
|
| 151 |
+
if u not in seen:
|
| 152 |
+
seen.add(u)
|
| 153 |
+
unique.append(u)
|
| 154 |
+
return unique
|
| 155 |
+
|
| 156 |
+
# Page scraping
|
| 157 |
+
def scrape_page(self, url: str) -> Dict[str, Any]:
|
| 158 |
+
"""
|
| 159 |
+
Scrape a single documentation page.
|
| 160 |
+
|
| 161 |
+
Returns a dict with keys: url, title, section, content, success.
|
| 162 |
+
"""
|
| 163 |
+
try:
|
| 164 |
+
resp = requests.get(url, timeout=10)
|
| 165 |
+
resp.raise_for_status()
|
| 166 |
+
soup = BeautifulSoup(resp.content, "html.parser")
|
| 167 |
+
|
| 168 |
+
main_content = (
|
| 169 |
+
soup.find("main")
|
| 170 |
+
or soup.find("article")
|
| 171 |
+
or soup.find(attrs={"role": "main"})
|
| 172 |
+
or soup.find("div", class_=re.compile(r"content|doc|page|main", re.I))
|
| 173 |
+
or soup.find("body")
|
| 174 |
+
)
|
| 175 |
+
|
| 176 |
+
if not main_content:
|
| 177 |
+
logger.warning(f"No content container found for {url}")
|
| 178 |
+
return {"url": url, "success": False}
|
| 179 |
+
|
| 180 |
+
# Strip navigation / chrome elements
|
| 181 |
+
for unwanted in main_content.find_all(
|
| 182 |
+
["nav", "header", "footer", "script", "style", "aside"]
|
| 183 |
+
):
|
| 184 |
+
unwanted.decompose()
|
| 185 |
+
|
| 186 |
+
text = main_content.get_text(separator="\n", strip=True)
|
| 187 |
+
|
| 188 |
+
h1 = soup.find("h1")
|
| 189 |
+
if h1:
|
| 190 |
+
title_text = h1.get_text(strip=True)
|
| 191 |
+
elif soup.title:
|
| 192 |
+
title_text = soup.title.get_text(strip=True)
|
| 193 |
+
else:
|
| 194 |
+
parts = [p for p in urlparse(url).path.split("/") if p]
|
| 195 |
+
title_text = parts[-1].replace("-", " ").replace("_", " ").title() if parts else url
|
| 196 |
+
|
| 197 |
+
path_parts = [p for p in urlparse(url).path.strip("/").split("/") if p]
|
| 198 |
+
section = path_parts[0].replace("-", " ").replace("_", " ").title() if path_parts else "General"
|
| 199 |
+
|
| 200 |
+
return {
|
| 201 |
+
"url": url,
|
| 202 |
+
"title": title_text,
|
| 203 |
+
"section": section,
|
| 204 |
+
"content": text,
|
| 205 |
+
"success": True,
|
| 206 |
+
}
|
| 207 |
+
|
| 208 |
+
except Exception as e:
|
| 209 |
+
logger.error(f"Error scraping {url}: {e}")
|
| 210 |
+
return {"url": url, "success": False, "error": str(e)}
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
# Helpers
|
| 214 |
+
|
| 215 |
+
def _safe_filename(name: str) -> str:
|
| 216 |
+
"""Convert a docs name into a safe filename prefix."""
|
| 217 |
+
return re.sub(r"[^a-zA-Z0-9_-]", "_", name).lower()
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
# Programmatic ingestion API (used by app.py UI)
|
| 221 |
+
def run_ingestion(
|
| 222 |
+
docs_url: str,
|
| 223 |
+
docs_name: str,
|
| 224 |
+
url_patterns: Optional[List[str]] = None,
|
| 225 |
+
max_pages: int = 50,
|
| 226 |
+
progress_callback=None,
|
| 227 |
+
) -> dict:
|
| 228 |
+
"""
|
| 229 |
+
Run the full ingestion pipeline programmatically.
|
| 230 |
+
|
| 231 |
+
Args:
|
| 232 |
+
docs_url: Base URL of the documentation site.
|
| 233 |
+
docs_name: Human-readable name.
|
| 234 |
+
url_patterns: Optional list of path substrings to filter pages.
|
| 235 |
+
max_pages: Maximum number of pages to scrape.
|
| 236 |
+
progress_callback: Optional callable(message: str) for live status updates.
|
| 237 |
+
|
| 238 |
+
Returns:
|
| 239 |
+
Stats dict with keys: total_chunks, collection_name, embedding_dimension,
|
| 240 |
+
metadata_fields, pages_scraped.
|
| 241 |
+
"""
|
| 242 |
+
def emit(msg: str):
|
| 243 |
+
logger.info(msg)
|
| 244 |
+
if progress_callback:
|
| 245 |
+
progress_callback(msg)
|
| 246 |
+
|
| 247 |
+
safe_name = _safe_filename(docs_name)
|
| 248 |
+
url_patterns = url_patterns or []
|
| 249 |
+
|
| 250 |
+
emit("=" * 50)
|
| 251 |
+
emit(f"Ingestion Pipeline: {docs_name}")
|
| 252 |
+
emit(f"Source: {docs_url}")
|
| 253 |
+
if url_patterns:
|
| 254 |
+
emit(f"URL patterns: {url_patterns}")
|
| 255 |
+
emit("=" * 50)
|
| 256 |
+
|
| 257 |
+
# Step 1: Scrape
|
| 258 |
+
emit(f"\n[1/4] Discovering and scraping {docs_name} documentation...")
|
| 259 |
+
scraper = DocsScraper(
|
| 260 |
+
base_url=docs_url,
|
| 261 |
+
url_patterns=url_patterns,
|
| 262 |
+
max_pages=max_pages * 4,
|
| 263 |
+
)
|
| 264 |
+
urls = scraper.get_doc_urls()
|
| 265 |
+
urls = urls[:max_pages]
|
| 266 |
+
emit(f" Scraping {len(urls)} pages...")
|
| 267 |
+
|
| 268 |
+
documents = []
|
| 269 |
+
for i, url in enumerate(urls, 1):
|
| 270 |
+
doc = scraper.scrape_page(url)
|
| 271 |
+
if doc.get("success"):
|
| 272 |
+
documents.append(doc)
|
| 273 |
+
if i % 10 == 0 or i == len(urls):
|
| 274 |
+
emit(f" Scraped {i}/{len(urls)} pages ({len(documents)} succeeded)")
|
| 275 |
+
|
| 276 |
+
emit(f"[1/4] Done — {len(documents)} pages scraped successfully")
|
| 277 |
+
|
| 278 |
+
# Save raw documents
|
| 279 |
+
raw_file = RAW_DATA_DIR / f"{safe_name}_docs_raw.json"
|
| 280 |
+
with open(raw_file, "w", encoding="utf-8") as f:
|
| 281 |
+
json.dump(documents, f, indent=2, ensure_ascii=False)
|
| 282 |
+
|
| 283 |
+
# Step 2: Chunk
|
| 284 |
+
emit(f"\n[2/4] Chunking {len(documents)} documents...")
|
| 285 |
+
chunker = create_chunker(
|
| 286 |
+
chunk_size=settings.chunk_size,
|
| 287 |
+
chunk_overlap=settings.chunk_overlap,
|
| 288 |
+
)
|
| 289 |
+
all_chunks = []
|
| 290 |
+
for doc in documents:
|
| 291 |
+
metadata = {
|
| 292 |
+
"source": doc["url"],
|
| 293 |
+
"title": doc["title"],
|
| 294 |
+
"section": doc["section"],
|
| 295 |
+
"url": doc["url"],
|
| 296 |
+
"docs_name": docs_name,
|
| 297 |
+
}
|
| 298 |
+
chunks = chunker.chunk_document(text=doc["content"], metadata=metadata)
|
| 299 |
+
all_chunks.extend(chunks)
|
| 300 |
+
|
| 301 |
+
emit(f"[2/4] Done — {len(all_chunks)} chunks created")
|
| 302 |
+
|
| 303 |
+
processed_file = PROCESSED_DATA_DIR / f"{safe_name}_docs_chunks.json"
|
| 304 |
+
with open(processed_file, "w", encoding="utf-8") as f:
|
| 305 |
+
json.dump(
|
| 306 |
+
[chunk.to_dict() for chunk in all_chunks],
|
| 307 |
+
f,
|
| 308 |
+
indent=2,
|
| 309 |
+
ensure_ascii=False,
|
| 310 |
+
)
|
| 311 |
+
|
| 312 |
+
# Step 3: Embed + store
|
| 313 |
+
emit(f"\n[3/4] Generating embeddings and storing in ChromaDB...")
|
| 314 |
+
emit(f" This may take a few minutes for large doc sets...")
|
| 315 |
+
retriever = create_retriever()
|
| 316 |
+
|
| 317 |
+
try:
|
| 318 |
+
retriever.reset_collection()
|
| 319 |
+
except Exception:
|
| 320 |
+
pass
|
| 321 |
+
|
| 322 |
+
batch_size = 100
|
| 323 |
+
total_batches = (len(all_chunks) + batch_size - 1) // batch_size
|
| 324 |
+
for idx, i in enumerate(range(0, len(all_chunks), batch_size), 1):
|
| 325 |
+
batch = all_chunks[i : i + batch_size]
|
| 326 |
+
retriever.add_documents(batch)
|
| 327 |
+
emit(f" Stored batch {idx}/{total_batches}")
|
| 328 |
+
|
| 329 |
+
# Step 4: Verify
|
| 330 |
+
emit(f"\n[4/4] Verifying ingestion...")
|
| 331 |
+
stats = retriever.get_collection_stats()
|
| 332 |
+
stats["pages_scraped"] = len(documents)
|
| 333 |
+
|
| 334 |
+
emit("\n" + "=" * 50)
|
| 335 |
+
emit("Ingestion Complete!")
|
| 336 |
+
emit(f" Pages scraped : {len(documents)}")
|
| 337 |
+
emit(f" Chunks indexed : {stats['total_chunks']}")
|
| 338 |
+
emit(f" Collection : {stats['collection_name']}")
|
| 339 |
+
emit(f" Embedding dim : {stats['embedding_dimension']}")
|
| 340 |
+
emit("=" * 50)
|
| 341 |
+
|
| 342 |
+
return stats
|
| 343 |
+
|
| 344 |
+
# CLI entry point
|
| 345 |
+
def main():
|
| 346 |
+
"""CLI entry point — reads config from settings / .env."""
|
| 347 |
+
url_patterns: List[str] = []
|
| 348 |
+
if settings.docs_url_patterns.strip():
|
| 349 |
+
url_patterns = [p.strip() for p in settings.docs_url_patterns.split(",") if p.strip()]
|
| 350 |
+
|
| 351 |
+
run_ingestion(
|
| 352 |
+
docs_url=settings.docs_url,
|
| 353 |
+
docs_name=settings.docs_name,
|
| 354 |
+
url_patterns=url_patterns,
|
| 355 |
+
)
|
| 356 |
+
logger.info("Ready to use! Run 'python app.py' to start the UI")
|
| 357 |
+
|
| 358 |
+
|
| 359 |
+
if __name__ == "__main__":
|
| 360 |
+
main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core Dependencies
|
| 2 |
+
python-dotenv==1.0.0
|
| 3 |
+
gradio==4.44.0
|
| 4 |
+
langchain==0.1.20
|
| 5 |
+
langchain-community==0.0.38
|
| 6 |
+
langchain-huggingface==0.0.1
|
| 7 |
+
|
| 8 |
+
# Vector Store & Embeddings
|
| 9 |
+
chromadb==0.4.22
|
| 10 |
+
sentence-transformers==2.6.0
|
| 11 |
+
|
| 12 |
+
# Document Processing
|
| 13 |
+
beautifulsoup4==4.12.3
|
| 14 |
+
lxml==5.1.0
|
| 15 |
+
markdownify==0.11.6
|
| 16 |
+
pypdf==3.17.4
|
| 17 |
+
|
| 18 |
+
# RAG Evaluation
|
| 19 |
+
ragas==0.1.7
|
| 20 |
+
datasets==2.16.1
|
| 21 |
+
|
| 22 |
+
# API & Monitoring
|
| 23 |
+
fastapi==0.109.2
|
| 24 |
+
uvicorn==0.27.1
|
| 25 |
+
pydantic==2.6.1
|
| 26 |
+
pydantic-settings==2.1.0
|
| 27 |
+
|
| 28 |
+
# Utilities
|
| 29 |
+
requests==2.31.0
|
| 30 |
+
tqdm==4.66.1
|
| 31 |
+
python-multipart==0.0.9
|
| 32 |
+
|
| 33 |
+
# Testing
|
| 34 |
+
pytest==7.4.4
|
| 35 |
+
pytest-asyncio==0.23.4
|
| 36 |
+
pytest-cov==4.1.0
|
| 37 |
+
|
| 38 |
+
# Hugging Face
|
| 39 |
+
huggingface-hub==0.27.0
|
| 40 |
+
transformers==4.40.0
|
src/__init__.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Developer Docs AI Copilot - src package
|
| 3 |
+
"""
|
| 4 |
+
from src.config import settings
|
| 5 |
+
from src.chunking import SemanticChunker, DocumentChunk, create_chunker
|
| 6 |
+
from src.embeddings import EmbeddingGenerator, create_embedding_generator
|
| 7 |
+
from src.retriever import DocumentRetriever, create_retriever
|
| 8 |
+
from src.rag_pipeline import RAGPipeline, create_rag_pipeline
|
| 9 |
+
|
| 10 |
+
__all__ = [
|
| 11 |
+
"settings",
|
| 12 |
+
"SemanticChunker",
|
| 13 |
+
"DocumentChunk",
|
| 14 |
+
"create_chunker",
|
| 15 |
+
"EmbeddingGenerator",
|
| 16 |
+
"create_embedding_generator",
|
| 17 |
+
"DocumentRetriever",
|
| 18 |
+
"create_retriever",
|
| 19 |
+
"RAGPipeline",
|
| 20 |
+
"create_rag_pipeline",
|
| 21 |
+
]
|
src/chunking.py
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Document chunking strategies for RAG.
|
| 3 |
+
|
| 4 |
+
Implements semantic chunking with overlap, metadata enrichment,
|
| 5 |
+
and configurable strategies for different content types.
|
| 6 |
+
"""
|
| 7 |
+
import re
|
| 8 |
+
from typing import List, Dict, Any, Optional
|
| 9 |
+
from dataclasses import dataclass
|
| 10 |
+
import logging
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@dataclass
|
| 16 |
+
class DocumentChunk:
|
| 17 |
+
"""Represents a single document chunk with metadata."""
|
| 18 |
+
content: str
|
| 19 |
+
metadata: Dict[str, Any]
|
| 20 |
+
chunk_id: str
|
| 21 |
+
|
| 22 |
+
def to_dict(self) -> Dict[str, Any]:
|
| 23 |
+
"""Convert to dictionary for storage."""
|
| 24 |
+
return {
|
| 25 |
+
"content": self.content,
|
| 26 |
+
"metadata": self.metadata,
|
| 27 |
+
"chunk_id": self.chunk_id
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class SemanticChunker:
    """
    Smart chunking that preserves semantic meaning.

    Features:
    - Splits on natural boundaries (paragraphs, then sentences)
    - Maintains context between consecutive chunks via overlap
    - Preserves code blocks intact (never split mid-block)
    - Enriches chunks with metadata (index, size, code flag)
    """

    # Placeholder pattern used while code blocks are parked aside during
    # splitting; chunks can still contain these when metadata is computed.
    _PLACEHOLDER_RE = re.compile(r"__CODE_BLOCK_\d+__")

    def __init__(
        self,
        chunk_size: int = 600,
        chunk_overlap: int = 100,
        preserve_code_blocks: bool = True
    ):
        """
        Args:
            chunk_size: Target chunk size in estimated tokens.
            chunk_overlap: Approximate overlap between consecutive chunks.
            preserve_code_blocks: Extract code blocks before splitting so
                they are never broken across chunk boundaries.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.preserve_code_blocks = preserve_code_blocks

    def chunk_document(
        self,
        text: str,
        metadata: Optional[Dict[str, Any]] = None
    ) -> List[DocumentChunk]:
        """
        Split document into semantically meaningful chunks.

        Args:
            text: Document text to chunk
            metadata: Optional metadata to attach to all chunks

        Returns:
            List of DocumentChunk objects
        """
        if not text or not text.strip():
            logger.warning("Empty text provided for chunking")
            return []

        metadata = metadata or {}

        # Extract and preserve code blocks so splitting never cuts them.
        code_blocks = []
        if self.preserve_code_blocks:
            text, code_blocks = self._extract_code_blocks(text)

        # Split into paragraphs first (natural semantic boundaries).
        paragraphs = self._split_paragraphs(text)

        # Greedily pack paragraphs into chunks of up to chunk_size tokens.
        chunks = []
        current_chunk = []
        current_size = 0

        for para in paragraphs:
            para_tokens = self._estimate_tokens(para)

            # A single paragraph exceeding chunk size: flush the current
            # chunk and split the paragraph by sentences instead.
            if para_tokens > self.chunk_size:
                if current_chunk:
                    chunks.append(self._create_chunk(
                        current_chunk,
                        metadata,
                        len(chunks)
                    ))
                    current_chunk = []
                    current_size = 0

                sentence_chunks = self._split_long_paragraph(para, metadata, len(chunks))
                chunks.extend(sentence_chunks)
                continue

            # Paragraph fits into the current chunk?
            if current_size + para_tokens <= self.chunk_size:
                current_chunk.append(para)
                current_size += para_tokens
            else:
                # Save current chunk, then start a new one seeded with
                # trailing overlap text for continuity.
                if current_chunk:
                    chunks.append(self._create_chunk(
                        current_chunk,
                        metadata,
                        len(chunks)
                    ))

                overlap_text = self._get_overlap_text(current_chunk)
                current_chunk = [overlap_text, para] if overlap_text else [para]
                current_size = self._estimate_tokens(overlap_text) + para_tokens

        # Flush the remaining chunk.
        if current_chunk:
            chunks.append(self._create_chunk(
                current_chunk,
                metadata,
                len(chunks)
            ))

        # Restore the code blocks that were parked aside.
        if code_blocks:
            chunks = self._reinsert_code_blocks(chunks, code_blocks)

        logger.info(f"Created {len(chunks)} chunks from document")
        return chunks

    def _extract_code_blocks(self, text: str) -> tuple[str, List[Dict[str, str]]]:
        """Replace fenced/inline code with placeholders so splitting skips it."""
        code_pattern = r'```[\s\S]*?```|`[^`]+`'
        code_blocks = []

        def replace_code(match):
            placeholder = f"__CODE_BLOCK_{len(code_blocks)}__"
            code_blocks.append({
                "placeholder": placeholder,
                "content": match.group(0)
            })
            return placeholder

        text_without_code = re.sub(code_pattern, replace_code, text)
        return text_without_code, code_blocks

    def _reinsert_code_blocks(
        self,
        chunks: List[DocumentChunk],
        code_blocks: List[Dict[str, str]]
    ) -> List[DocumentChunk]:
        """Swap placeholders back for the original code blocks (in place)."""
        for chunk in chunks:
            for code_block in code_blocks:
                chunk.content = chunk.content.replace(
                    code_block["placeholder"],
                    code_block["content"]
                )
        return chunks

    def _split_paragraphs(self, text: str) -> List[str]:
        """Split text into non-empty paragraphs (on blank lines)."""
        paragraphs = re.split(r'\n\s*\n', text)
        return [p.strip() for p in paragraphs if p.strip()]

    def _split_long_paragraph(
        self,
        paragraph: str,
        metadata: Dict[str, Any],
        start_idx: int
    ) -> List[DocumentChunk]:
        """Split an oversized paragraph into sentence-packed chunks."""
        sentences = re.split(r'(?<=[.!?])\s+', paragraph)

        chunks = []
        current_chunk = []
        current_size = 0

        for sentence in sentences:
            sentence_tokens = self._estimate_tokens(sentence)

            if current_size + sentence_tokens <= self.chunk_size:
                current_chunk.append(sentence)
                current_size += sentence_tokens
            else:
                if current_chunk:
                    chunks.append(self._create_chunk(
                        current_chunk,
                        metadata,
                        start_idx + len(chunks)
                    ))
                current_chunk = [sentence]
                current_size = sentence_tokens

        if current_chunk:
            chunks.append(self._create_chunk(
                current_chunk,
                metadata,
                start_idx + len(chunks)
            ))

        return chunks

    def _create_chunk(
        self,
        text_segments: List[str],
        metadata: Dict[str, Any],
        chunk_idx: int
    ) -> DocumentChunk:
        """Create a DocumentChunk from text segments, enriching metadata."""
        content = "\n\n".join(text_segments)

        # BUG FIX: when preserve_code_blocks is on, backticks have already
        # been replaced by __CODE_BLOCK_n__ placeholders at this point
        # (reinsertion happens after chunk creation), so testing for
        # backticks alone always reported has_code=False. Detect the
        # placeholder as well.
        has_code = (
            "```" in content
            or "`" in content
            or bool(self._PLACEHOLDER_RE.search(content))
        )

        # NOTE: chunk_size is measured before code blocks are reinserted,
        # so it undercounts chunks that contain code.
        enriched_metadata = {
            **metadata,
            "chunk_index": chunk_idx,
            "chunk_size": len(content),
            "has_code": has_code,
        }

        chunk_id = f"{metadata.get('source', 'unknown')}_{chunk_idx}"

        return DocumentChunk(
            content=content,
            metadata=enriched_metadata,
            chunk_id=chunk_id
        )

    def _get_overlap_text(self, chunks: List[str]) -> str:
        """Return trailing text of roughly chunk_overlap tokens."""
        if not chunks:
            return ""

        combined = " ".join(chunks[-2:])
        tokens = self._estimate_tokens(combined)

        if tokens <= self.chunk_overlap:
            return combined

        # Approximate truncation: keep the last chunk_overlap // 4 words
        # (consistent with the ~4 chars/token estimate below).
        words = combined.split()
        overlap_words = words[-(self.chunk_overlap // 4):]
        return " ".join(overlap_words)

    @staticmethod
    def _estimate_tokens(text: str) -> int:
        """Rough token estimation (1 token ≈ 4 characters)."""
        return len(text) // 4
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
def create_chunker(chunk_size: int = 600, chunk_overlap: int = 100) -> SemanticChunker:
    """Factory: build a SemanticChunker with code-block preservation enabled."""
    chunker = SemanticChunker(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        preserve_code_blocks=True,
    )
    return chunker
|
src/config.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Configuration management for Developer Docs AI Copilot.
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Optional
|
| 7 |
+
from urllib.parse import urlparse
|
| 8 |
+
from pydantic_settings import BaseSettings
|
| 9 |
+
from pydantic import Field, model_validator
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class Settings(BaseSettings):
    """Application settings loaded from environment variables (and .env)."""

    # API Keys
    # HuggingFace token used to authorize LLM inference requests.
    hf_token: str = Field(default="", alias="HF_TOKEN")

    # Model Configuration
    llm_model: str = Field(
        default="meta-llama/Llama-3.2-3B-Instruct",
        alias="LLM_MODEL"
    )
    llm_max_tokens: int = Field(default=512, alias="LLM_MAX_TOKENS")
    llm_temperature: float = Field(default=0.1, alias="LLM_TEMPERATURE")

    embedding_model: str = Field(
        default="sentence-transformers/all-MiniLM-L6-v2",
        alias="EMBEDDING_MODEL"
    )

    # Vector Database
    chroma_persist_dir: str = Field(
        default="./data/vectordb",
        alias="CHROMA_PERSIST_DIR"
    )
    collection_name: str = Field(
        default="developer_docs",
        alias="COLLECTION_NAME"
    )

    # Chunking Configuration (estimated tokens)
    chunk_size: int = Field(default=600, alias="CHUNK_SIZE")
    chunk_overlap: int = Field(default=100, alias="CHUNK_OVERLAP")

    # Retrieval Configuration
    top_k_retrieval: int = Field(default=5, alias="TOP_K_RETRIEVAL")
    min_similarity_score: float = Field(
        default=0.2,
        alias="MIN_SIMILARITY_SCORE"
    )

    # Application Settings
    app_port: int = Field(default=7860, alias="APP_PORT")
    log_level: str = Field(default="INFO", alias="LOG_LEVEL")

    # Documentation Source
    docs_url: str = Field(
        default="https://fastapi.tiangolo.com",
        alias="DOCS_URL"
    )
    # Human-readable name for the docs. it is auto-derived from URL if not set
    docs_name: str = Field(default="", alias="DOCS_NAME")

    docs_url_patterns: str = Field(default="", alias="DOCS_URL_PATTERNS")

    @model_validator(mode="after")
    def set_docs_name(self) -> "Settings":
        # Derive a display name from the URL's hostname, e.g.
        # "fastapi.tiangolo.com" -> "Fastapi".
        if not self.docs_name:
            hostname = urlparse(self.docs_url).hostname or ""
            name = hostname.split(".")[0].replace("-", " ").title()
            self.docs_name = name
        return self

    # NOTE(review): pydantic v2 prefers `model_config = SettingsConfigDict(...)`;
    # the legacy inner Config class still works but is deprecated — confirm the
    # pinned pydantic-settings version before migrating.
    class Config:
        env_file = ".env"
        env_file_encoding = "utf-8"
        case_sensitive = False
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
# Global settings instance
# Instantiated at import time: reads the environment and .env via pydantic.
settings = Settings()


# Directory paths
# Project layout anchors derived from this file's location (src/ -> project root).
PROJECT_ROOT = Path(__file__).parent.parent
DATA_DIR = PROJECT_ROOT / "data"
RAW_DATA_DIR = DATA_DIR / "raw"
PROCESSED_DATA_DIR = DATA_DIR / "processed"
VECTORDB_DIR = DATA_DIR / "vectordb"
EVALS_DIR = PROJECT_ROOT / "evals"
RESULTS_DIR = EVALS_DIR / "results"

# Ensure directories exist
# Import-time side effect: creates the data/eval directories if missing.
for directory in [RAW_DATA_DIR, PROCESSED_DATA_DIR, VECTORDB_DIR, RESULTS_DIR]:
    directory.mkdir(parents=True, exist_ok=True)
|
src/embeddings.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Embedding generation for RAG system.
|
| 3 |
+
|
| 4 |
+
Handles text-to-vector conversion using sentence-transformers.
|
| 5 |
+
"""
|
| 6 |
+
from typing import List, Union
|
| 7 |
+
import logging
|
| 8 |
+
from sentence_transformers import SentenceTransformer
|
| 9 |
+
import numpy as np
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class EmbeddingGenerator:
    """
    Turns text into dense vectors using sentence-transformers.

    Features:
    - Batch processing for efficiency
    - Model held for the object's lifetime (loaded once)
    - L2-normalized embeddings, suitable for cosine similarity
    """

    def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
        """
        Load the embedding model.

        Args:
            model_name: HuggingFace model identifier
        """
        self.model_name = model_name
        logger.info(f"Loading embedding model: {model_name}")

        try:
            self.model = SentenceTransformer(model_name)
            self.embedding_dim = self.model.get_sentence_embedding_dimension()
            logger.info(f"Model loaded. Embedding dimension: {self.embedding_dim}")
        except Exception as e:
            logger.error(f"Failed to load embedding model: {e}")
            raise

    def embed_text(self, text: Union[str, List[str]]) -> np.ndarray:
        """
        Encode one string or a batch of strings.

        Args:
            text: Single text string or list of strings

        Returns:
            Numpy array of embeddings (shape: [n_texts, embedding_dim])

        Raises:
            ValueError: if an empty list is provided.
        """
        batch = [text] if isinstance(text, str) else text

        if not batch:
            raise ValueError("No text provided for embedding")

        try:
            vectors = self.model.encode(
                batch,
                normalize_embeddings=True,  # unit vectors -> cosine similarity
                show_progress_bar=len(batch) > 10,
                batch_size=32
            )
        except Exception as e:
            logger.error(f"Embedding generation failed: {e}")
            raise

        logger.debug(f"Generated embeddings for {len(batch)} texts")
        return vectors

    def embed_query(self, query: str) -> np.ndarray:
        """
        Encode a single query string.

        Args:
            query: Query text

        Returns:
            1D numpy array of embedding
        """
        # embed_text always returns a batch dimension; take the only row.
        return self.embed_text(query)[0]

    def embed_documents(self, documents: List[str]) -> np.ndarray:
        """
        Encode a batch of documents.

        Args:
            documents: List of document texts

        Returns:
            2D numpy array of embeddings
        """
        return self.embed_text(documents)
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def create_embedding_generator(model_name: Union[str, None] = None) -> EmbeddingGenerator:
    """
    Factory function to create an embedding generator.

    Args:
        model_name: Optional model name override; falls back to
            ``settings.embedding_model`` when omitted.

    Returns:
        EmbeddingGenerator instance
    """
    # Local import avoids a module-level cycle with src.config.
    from src.config import settings

    # Fixed annotation: the parameter default is None, so the hint must be
    # Union[str, None] rather than a bare str.
    model = model_name or settings.embedding_model
    return EmbeddingGenerator(model_name=model)
|
src/prompts.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Prompt templates for the RAG system.
|
| 3 |
+
"""
|
| 4 |
+
from typing import List, Dict, Any
|
| 5 |
+
|
| 6 |
+
from src.config import settings
|
| 7 |
+
|
| 8 |
+
# Display name of the documentation set, resolved once at import time.
_DOCS_NAME = settings.docs_name
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _build_system_prompt(docs_name: str) -> str:
    """Render the assistant system prompt for a docs set named *docs_name*."""
    return f"""You are a helpful assistant specialized in {docs_name} documentation.

Your role is to answer questions ONLY using the provided context from the official {docs_name} documentation.

Guidelines:
1. Answer based ONLY on the provided context
2. If the context doesn't contain the answer, say "I don't have enough information in the documentation to answer that"
3. Preserve code formatting and indentation
4. Include code examples when available in the context
5. Cite sources by mentioning the section (e.g., "According to the Routing section...")
6. Be concise but complete
7. Use technical language appropriate for developers

If you're unsure, it's better to admit it than to make up information."""
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# Module-level system prompt, rendered once for the configured docs set.
SYSTEM_PROMPT = _build_system_prompt(_DOCS_NAME)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def create_rag_prompt(query: str, context_chunks: List[Dict[str, Any]]) -> str:
    """
    Assemble the full RAG prompt: system instructions, numbered context
    blocks, then the user question.

    Args:
        query: User's question
        context_chunks: Retrieved document chunks with metadata

    Returns:
        Formatted prompt string
    """
    # Render each retrieved chunk as a labelled context block.
    blocks = []
    for idx, chunk in enumerate(context_chunks, 1):
        meta = chunk["metadata"]
        source = meta.get("source", "Unknown")
        section = meta.get("section", "")

        if section:
            header = f"[Context {idx} - {section} from {source}]"
        else:
            header = f"[Context {idx} from {source}]"

        blocks.append(f"{header}\n{chunk['content']}\n")

    context_text = "\n".join(blocks)

    return f"""{SYSTEM_PROMPT}

---

CONTEXT FROM DOCUMENTATION:

{context_text}

---

USER QUESTION: {query}

ANSWER (based only on the context above):"""
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def create_no_context_prompt(query: str) -> str:
    """
    Create prompt when no relevant context is found.

    Args:
        query: User's question

    Returns:
        Formatted prompt string
    """
    # The fallback message is embedded in the prompt itself, steering the
    # model toward asking the user to rephrase rather than hallucinating.
    prompt = f"""{SYSTEM_PROMPT}

USER QUESTION: {query}

Unfortunately, I couldn't find relevant information in the {_DOCS_NAME} documentation to answer this question.

This could mean:
1. The question is about a topic not covered in the documentation I have access to
2. The question might need to be rephrased
3. The topic might be covered in a different section

Can you rephrase your question or provide more context?"""

    return prompt
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def format_response_with_sources(
    answer: str,
    sources: List[Dict[str, Any]]
) -> Dict[str, Any]:
    """
    Package the generated answer with de-duplicated, relevance-sorted
    source references.

    Args:
        answer: Generated answer
        sources: Retrieved source chunks

    Returns:
        Dict with ``answer``, ``sources`` (unique, sorted by descending
        score) and ``source_count``.
    """
    # De-duplicate by URL (falling back to source name), keeping the
    # first occurrence of each.
    deduped: Dict[str, Dict[str, Any]] = {}
    for item in sources:
        meta = item["metadata"]
        key = meta.get("url", meta.get("source", "Unknown"))
        if key in deduped:
            continue
        deduped[key] = {
            "url": meta.get("url", ""),
            "title": meta.get("title", ""),
            "section": meta.get("section", ""),
            "score": item.get("score", 0.0),
        }

    # Most relevant sources first.
    ordered = sorted(deduped.values(), key=lambda s: s["score"], reverse=True)

    return {
        "answer": answer,
        "sources": ordered,
        "source_count": len(ordered),
    }
|
src/rag_pipeline.py
ADDED
|
@@ -0,0 +1,219 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Main RAG pipeline orchestration.
|
| 3 |
+
|
| 4 |
+
Coordinates retrieval and generation for question answering.
|
| 5 |
+
"""
|
| 6 |
+
import logging
|
| 7 |
+
import requests
|
| 8 |
+
from typing import Dict, Any, Optional, List
|
| 9 |
+
|
| 10 |
+
from src.retriever import DocumentRetriever
|
| 11 |
+
from src.prompts import create_rag_prompt, create_no_context_prompt, format_response_with_sources
|
| 12 |
+
from src.config import settings
|
| 13 |
+
|
| 14 |
+
# HuggingFace router — OpenAI-compatible chat completions endpoint
|
| 15 |
+
_HF_API_URL = "https://router.huggingface.co/v1/chat/completions"
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class RAGPipeline:
    """
    Orchestrates the RAG pipeline: retrieve → generate → format.

    Features:
    - Smart retrieval with filtering
    - LLM generation via HuggingFace Inference API
    - Source attribution
    - Error handling with graceful degradation
    """

    def __init__(
        self,
        retriever: DocumentRetriever,
        llm_model: Optional[str] = None,
        min_similarity_score: float = 0.5
    ):
        """
        Initialize RAG pipeline.

        Args:
            retriever: Document retriever instance
            llm_model: Optional LLM model name override (falls back to
                settings.llm_model)
            min_similarity_score: Minimum score for relevant results.
                NOTE(review): this default (0.5) differs from the settings
                default (0.2); the create_rag_pipeline factory passes the
                settings value — confirm which is intended for direct use.
        """
        self.retriever = retriever
        self.llm_model = llm_model or settings.llm_model
        self.min_similarity_score = min_similarity_score

        self._api_url = _HF_API_URL
        # Static headers, reused for every chat-completions request.
        self._headers = {
            "Authorization": f"Bearer {settings.hf_token}",
            "Content-Type": "application/json",
        }
        logger.info(f"LLM endpoint: {self._api_url} model={self.llm_model}")

    def query(
        self,
        question: str,
        top_k: int = 5,
        filter_metadata: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """
        Process a user query through the RAG pipeline.

        Args:
            question: User's question
            top_k: Number of chunks to retrieve
            filter_metadata: Optional metadata filters

        Returns:
            Dictionary with answer, sources, and metadata (confidence,
            chunks_retrieved). On failure this returns an error answer
            with confidence "error" instead of raising.
        """
        try:
            logger.info(f"Processing query: {question[:100]}...")

            # Step 1: Retrieve relevant context
            retrieved_chunks = self.retriever.retrieve(
                query=question,
                top_k=top_k,
                filter_metadata=filter_metadata
            )

            # Log raw scores for diagnostics
            scores = [round(c["score"], 4) for c in retrieved_chunks]
            logger.info(f"Raw chunk scores: {scores}")

            # Filter by minimum similarity score
            relevant_chunks = [
                chunk for chunk in retrieved_chunks
                if chunk["score"] >= self.min_similarity_score
            ]

            logger.info(f"Found {len(relevant_chunks)} relevant chunks (threshold: {self.min_similarity_score})")

            # Step 2: Generate answer
            # No chunk cleared the threshold: return a canned answer rather
            # than letting the LLM answer without grounding.
            if not relevant_chunks:
                answer = f"I couldn't find relevant information in the {settings.docs_name} documentation to answer this question. Could you rephrase or ask about a different topic?"
                return {
                    "answer": answer,
                    "sources": [],
                    "source_count": 0,
                    "confidence": "low",
                    "chunks_retrieved": 0
                }

            # Create prompt
            prompt = create_rag_prompt(question, relevant_chunks)

            # Generate answer
            answer = self._generate_answer(prompt)

            # Step 3: Format response
            response = format_response_with_sources(answer, relevant_chunks)

            # Add metadata
            response["confidence"] = self._estimate_confidence(relevant_chunks)
            response["chunks_retrieved"] = len(relevant_chunks)

            logger.info("Query processed successfully")
            return response

        except Exception as e:
            # Graceful degradation: surface failures in the answer body.
            # NOTE(review): str(e) may leak internal details to end users —
            # consider a generic message in production.
            logger.error(f"Error processing query: {e}", exc_info=True)
            return {
                "answer": f"An error occurred while processing your question: {str(e)}",
                "sources": [],
                "source_count": 0,
                "confidence": "error",
                "chunks_retrieved": 0
            }

    def _generate_answer(self, prompt: str) -> str:
        """
        Generate answer using LLM.

        Args:
            prompt: Formatted prompt with context

        Returns:
            Generated answer text

        Raises:
            requests.HTTPError: on non-2xx responses (via raise_for_status);
            other exceptions are logged and re-raised.
        """
        try:
            # Use OpenAI-compatible chat completions endpoint
            payload = {
                # ":fastest" suffix — presumably a HF router provider-selection
                # hint (pick the fastest provider); verify against router docs.
                "model": f"{self.llm_model}:fastest",
                "messages": [{"role": "user", "content": prompt}],
                "max_tokens": settings.llm_max_tokens,
                "temperature": settings.llm_temperature,
                "top_p": 0.9,
            }
            response = requests.post(
                self._api_url,
                headers=self._headers,
                json=payload,
                timeout=60
            )
            response.raise_for_status()
            result = response.json()
            answer = result["choices"][0]["message"]["content"].strip()
            logger.debug(f"Generated answer ({len(answer)} chars)")

            return answer

        except Exception as e:
            logger.error(f"LLM generation failed: {e}")
            raise

    def _estimate_confidence(self, chunks: List[Dict[str, Any]]) -> str:
        """
        Estimate confidence based on retrieval scores.

        Args:
            chunks: Retrieved chunks with scores

        Returns:
            Confidence level: "high" (avg score >= 0.75), "medium"
            (avg >= 0.6), or "low" otherwise.
        """
        if not chunks:
            return "low"

        avg_score = sum(chunk["score"] for chunk in chunks) / len(chunks)

        if avg_score >= 0.75:
            return "high"
        elif avg_score >= 0.6:
            return "medium"
        else:
            return "low"

    def get_stats(self) -> Dict[str, Any]:
        """Get pipeline statistics (model, threshold, plus retriever collection stats)."""
        return {
            "llm_model": self.llm_model,
            "min_similarity_score": self.min_similarity_score,
            **self.retriever.get_collection_stats()
        }
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def create_rag_pipeline(
    retriever: Optional[DocumentRetriever] = None
) -> RAGPipeline:
    """
    Factory for a ready-to-use RAG pipeline.

    Args:
        retriever: Optional retriever override; a default one is built
            when omitted.

    Returns:
        RAGPipeline instance
    """
    # Local import avoids a module-level cycle with src.retriever.
    from src.retriever import create_retriever

    active_retriever = retriever if retriever is not None else create_retriever()
    return RAGPipeline(
        retriever=active_retriever,
        min_similarity_score=settings.min_similarity_score,
    )
|
src/retriever.py
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Vector retrieval system using ChromaDB.
|
| 3 |
+
|
| 4 |
+
Handles document storage, indexing, and semantic search.
|
| 5 |
+
"""
|
| 6 |
+
import logging
|
| 7 |
+
from typing import List, Dict, Any, Optional
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
import chromadb
|
| 10 |
+
from chromadb.config import Settings as ChromaSettings
|
| 11 |
+
from chromadb.utils import embedding_functions
|
| 12 |
+
|
| 13 |
+
from src.embeddings import EmbeddingGenerator
|
| 14 |
+
from src.chunking import DocumentChunk
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class DocumentRetriever:
    """
    Manages document storage and retrieval using ChromaDB.

    Features:
    - Persistent vector storage
    - Semantic similarity search
    - Metadata filtering
    - Source attribution
    """

    def __init__(
        self,
        persist_directory: str,
        collection_name: str,
        embedding_generator: EmbeddingGenerator
    ):
        """
        Initialize retriever.

        Args:
            persist_directory: Path to ChromaDB storage (created if missing)
            collection_name: Name of the collection
            embedding_generator: Embedding generator instance used for both
                documents and queries
        """
        self.persist_directory = Path(persist_directory)
        self.persist_directory.mkdir(parents=True, exist_ok=True)

        self.collection_name = collection_name
        self.embedding_generator = embedding_generator

        # Initialize ChromaDB client; telemetry disabled, allow_reset enables
        # reset_collection() below.
        logger.info(f"Initializing ChromaDB at {persist_directory}")
        self.client = chromadb.PersistentClient(
            path=str(self.persist_directory),
            settings=ChromaSettings(
                anonymized_telemetry=False,
                allow_reset=True
            )
        )

        # Get or create collection (cosine distance for proper similarity scores).
        # _use_cosine records which metric the collection was actually built with:
        # a pre-existing collection may use ChromaDB's default (L2) instead of
        # cosine, and retrieve() converts distances to scores accordingly.
        self.collection = self._get_or_create_collection()
        coll_meta = self.collection.metadata or {}
        self._use_cosine = coll_meta.get("hnsw:space") == "cosine"
        logger.info(f"Collection '{collection_name}' ready. Count: {self.collection.count()}. Distance: {'cosine' if self._use_cosine else 'l2'}")

    def _get_or_create_collection(self):
        """Get existing collection or create new one (cosine-distance HNSW)."""
        try:
            # Try to get existing collection
            collection = self.client.get_collection(
                name=self.collection_name
            )
            logger.info(f"Loaded existing collection: {self.collection_name}")
        except Exception:
            # NOTE(review): broad except — presumably because chromadb raises
            # different error types across versions when the collection is
            # missing; confirm against the pinned chromadb version.
            # Create new collection with cosine distance so scores stay in [0, 1]
            collection = self.client.create_collection(
                name=self.collection_name,
                metadata={"hnsw:space": "cosine", "description": "Developer documentation chunks"}
            )
            logger.info(f"Created new collection: {self.collection_name}")

        return collection

    def add_documents(self, chunks: List[DocumentChunk]) -> None:
        """
        Add document chunks to the vector store.

        Embeds all chunk contents up front, then inserts in batches of 100.

        Args:
            chunks: List of DocumentChunk objects
        """
        if not chunks:
            logger.warning("No chunks to add")
            return

        logger.info(f"Adding {len(chunks)} chunks to collection")

        # Prepare data for ChromaDB (parallel lists keyed by position)
        documents = [chunk.content for chunk in chunks]
        metadatas = [chunk.metadata for chunk in chunks]
        ids = [chunk.chunk_id for chunk in chunks]

        # Generate embeddings for every chunk in one call
        # (assumes embed_documents returns an array supporting .tolist() slices)
        embeddings = self.embedding_generator.embed_documents(documents)

        # Add to collection in batches to bound per-call payload size
        batch_size = 100
        for i in range(0, len(chunks), batch_size):
            batch_end = min(i + batch_size, len(chunks))

            self.collection.add(
                embeddings=embeddings[i:batch_end].tolist(),
                documents=documents[i:batch_end],
                metadatas=metadatas[i:batch_end],
                ids=ids[i:batch_end]
            )

            logger.debug(f"Added batch {i//batch_size + 1}")

        logger.info(f"Successfully added {len(chunks)} chunks. Total: {self.collection.count()}")

    def retrieve(
        self,
        query: str,
        top_k: int = 5,
        filter_metadata: Optional[Dict[str, Any]] = None
    ) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query.

        Args:
            query: Search query
            top_k: Number of results to return
            filter_metadata: Optional metadata filters (ChromaDB "where" clause)

        Returns:
            List of results with content, metadata, and scores (score in [0, 1],
            higher is more similar)
        """
        logger.debug(f"Retrieving top {top_k} results for query: {query[:100]}...")

        # Generate query embedding
        query_embedding = self.embedding_generator.embed_query(query)

        # Search
        results = self.collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=top_k,
            where=filter_metadata,
            include=["documents", "metadatas", "distances"]
        )

        # Format results: ChromaDB returns per-query lists, so index [0]
        # selects the (single) query's result lists.
        formatted_results = []
        if results["documents"] and results["documents"][0]:
            for i in range(len(results["documents"][0])):
                d = results["distances"][0][i]
                # Convert distance to similarity: cosine distance d -> 1 - d;
                # for L2, 1 - d^2/2 matches cosine similarity only on
                # unit-normalized embeddings — TODO confirm normalization.
                score = max(0.0, 1 - d) if self._use_cosine else max(0.0, 1 - d ** 2 / 2)
                formatted_results.append({
                    "content": results["documents"][0][i],
                    "metadata": results["metadatas"][0][i],
                    "score": score,
                    "id": results["ids"][0][i] if "ids" in results else None
                })

        logger.info(f"Retrieved {len(formatted_results)} results")
        return formatted_results

    def get_collection_stats(self) -> Dict[str, Any]:
        """Get statistics about the collection (size, metadata keys, dims)."""
        count = self.collection.count()

        # Sample a document to get metadata fields
        sample = self.collection.peek(limit=1)
        metadata_fields = list(sample["metadatas"][0].keys()) if sample["metadatas"] else []

        return {
            "total_chunks": count,
            "collection_name": self.collection_name,
            "metadata_fields": metadata_fields,
            "embedding_dimension": self.embedding_generator.embedding_dim
        }

    def delete_collection(self) -> None:
        """Delete the entire collection from the ChromaDB client."""
        logger.warning(f"Deleting collection: {self.collection_name}")
        self.client.delete_collection(name=self.collection_name)

    def reset_collection(self) -> None:
        """Reset collection (delete and recreate)."""
        logger.warning("Resetting collection")
        try:
            self.delete_collection()
        except Exception:
            # Deletion fails when the collection does not exist yet; recreation
            # below proceeds either way.
            pass
        self.collection = self._get_or_create_collection()
def create_retriever(
    persist_directory: Optional[str] = None,
    collection_name: Optional[str] = None,
    embedding_generator: Optional[EmbeddingGenerator] = None
) -> DocumentRetriever:
    """
    Factory function to create retriever.

    Any argument left as None falls back to the project settings (or, for the
    embedding generator, a freshly created default instance).

    Args:
        persist_directory: Optional directory override
        collection_name: Optional collection name override
        embedding_generator: Optional embedding generator override

    Returns:
        DocumentRetriever instance
    """
    # Imported here rather than at module scope to avoid circular imports.
    from src.config import settings
    from src.embeddings import create_embedding_generator

    return DocumentRetriever(
        persist_directory=persist_directory or settings.chroma_persist_dir,
        collection_name=collection_name or settings.collection_name,
        embedding_generator=embedding_generator or create_embedding_generator(),
    )
test_chunking.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests for document chunking functionality.
|
| 3 |
+
"""
|
| 4 |
+
import pytest
|
| 5 |
+
from src.chunking import SemanticChunker, DocumentChunk
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
@pytest.fixture
def chunker():
    """Chunker configured with small, test-sized chunks."""
    chunk_params = {"chunk_size": 200, "chunk_overlap": 50}
    return SemanticChunker(**chunk_params)
def test_basic_chunking(chunker):
    """Test basic document chunking."""
    # Small multi-paragraph document mixing prose and a bullet list.
    text = """
    FastAPI is a modern, fast (high-performance) web framework.

    It is based on standard Python type hints.

    The key features are:
    - Fast: Very high performance
    - Fast to code: Increase development speed
    - Fewer bugs: Reduce human errors
    """

    chunks = chunker.chunk_document(text)

    # At least one chunk, every item is a DocumentChunk, none are empty.
    assert len(chunks) > 0
    assert all(isinstance(chunk, DocumentChunk) for chunk in chunks)
    assert all(chunk.content for chunk in chunks)
def test_chunk_metadata(chunker):
    """Metadata passed in must be carried through onto every produced chunk."""
    source_metadata = {
        "source": "test.md",
        "title": "Test Document",
        "url": "https://example.com"
    }

    chunks = chunker.chunk_document("FastAPI is awesome.", metadata=source_metadata)

    assert len(chunks) > 0
    first = chunks[0]

    # Each supplied key survives unchanged, and the chunker adds its own index.
    for key, expected in source_metadata.items():
        assert first.metadata[key] == expected
    assert "chunk_index" in first.metadata
def test_code_block_preservation(chunker):
    """Test that code blocks are preserved."""
    # Markdown with a fenced code block between two prose sentences.
    text = """
    Here's an example:

    ```python
    from fastapi import FastAPI
    app = FastAPI()
    ```

    This creates an app.
    """

    chunks = chunker.chunk_document(text)

    # Code block should be preserved; chunks are joined in case the fence
    # was split across a chunk boundary.
    combined_content = " ".join(chunk.content for chunk in chunks)
    assert "```python" in combined_content
    assert "FastAPI" in combined_content
def test_empty_text(chunker):
    """Empty or whitespace-only input must yield no chunks."""
    # Both the empty string and a pure-whitespace string are degenerate.
    for degenerate_input in ("", " "):
        assert chunker.chunk_document(degenerate_input) == []
def test_to_dict(chunker):
    """DocumentChunk.to_dict must expose content, metadata and chunk_id."""
    produced = chunker.chunk_document("Test content", metadata={"source": "test"})
    first_chunk = produced[0]

    serialized = first_chunk.to_dict()

    # All required keys present, and content round-trips unchanged.
    for required_key in ("content", "metadata", "chunk_id"):
        assert required_key in serialized
    assert serialized["content"] == first_chunk.content
test_retrieval.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test retrieval quality independently.
|
| 4 |
+
|
| 5 |
+
Useful for debugging and tuning retrieval parameters.
|
| 6 |
+
"""
|
| 7 |
+
import logging
|
| 8 |
+
import sys
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
# Add parent directory to path
|
| 12 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 13 |
+
|
| 14 |
+
from src import create_retriever
|
| 15 |
+
|
| 16 |
+
# Script-level logger configuration: INFO level with timestamped lines.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


# Representative developer questions used to probe retrieval quality.
TEST_QUERIES = [
    "How do I create a router in FastAPI?",
    "What are dependencies?",
    "How do I handle errors?",
    "Show me authentication examples",
    "How do I validate request bodies?",
]
def test_retrieval():
    """Test retrieval with various queries.

    Runs each query in TEST_QUERIES against the persisted vector store,
    logs the top-3 hits, and reports a per-query average relevance score.
    Purely diagnostic: it logs quality tiers rather than asserting them.
    """

    logger.info("=" * 60)
    logger.info("Retrieval Quality Test")
    logger.info("=" * 60)

    # Initialize retriever from project settings (persisted ChromaDB store)
    retriever = create_retriever()
    stats = retriever.get_collection_stats()

    logger.info(f"\nVector Database Stats:")
    logger.info(f"  Total chunks: {stats['total_chunks']}")
    logger.info(f"  Collection: {stats['collection_name']}")
    logger.info(f"  Embedding dim: {stats['embedding_dimension']}")

    # Test each query
    for i, query in enumerate(TEST_QUERIES, 1):
        logger.info("\n" + "=" * 60)
        logger.info(f"Test {i}/{len(TEST_QUERIES)}")
        logger.info("=" * 60)
        logger.info(f"Query: {query}")

        # Retrieve top 3 matches for this query
        results = retriever.retrieve(query, top_k=3)

        logger.info(f"\nFound {len(results)} results:")

        for j, result in enumerate(results, 1):
            logger.info(f"\n--- Result {j} ---")
            logger.info(f"Score: {result['score']:.4f}")
            logger.info(f"Source: {result['metadata'].get('title', 'Unknown')}")
            logger.info(f"Section: {result['metadata'].get('section', 'Unknown')}")
            logger.info(f"Content preview:")
            logger.info(f"{result['content'][:200]}...")

        # Quality check: mean score over the returned hits (0 when none).
        # Thresholds mirror RAGPipeline._estimate_confidence (0.75 / 0.6).
        avg_score = sum(r['score'] for r in results) / len(results) if results else 0
        logger.info(f"\nAverage relevance score: {avg_score:.4f}")

        if avg_score >= 0.75:
            logger.info("✓ High quality results")
        elif avg_score >= 0.6:
            logger.info("⚠ Medium quality results")
        else:
            logger.info("✗ Low quality results - consider tuning")

    logger.info("\n" + "=" * 60)
    logger.info("Retrieval test complete")
    logger.info("=" * 60)


if __name__ == "__main__":
    test_retrieval()