Spaces:
Sleeping
Sleeping
Commit ·
a34068e
0
Parent(s):
Initial deploy: RagCore RAG system with hybrid search and Gradio UI
Browse files
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
- .dockerignore +14 -0
- .github/workflows/ci.yml +31 -0
- .gitignore +39 -0
- Dockerfile +19 -0
- README.md +1499 -0
- app/__init__.py +0 -0
- app/api/__init__.py +0 -0
- app/api/deps.py +53 -0
- app/api/routes/__init__.py +0 -0
- app/api/routes/health.py +42 -0
- app/api/routes/ingest.py +155 -0
- app/api/routes/query.py +128 -0
- app/config.py +58 -0
- app/core/__init__.py +0 -0
- app/core/bm25.py +124 -0
- app/core/chunker.py +60 -0
- app/core/embedder.py +43 -0
- app/core/generator.py +128 -0
- app/core/llm.py +88 -0
- app/core/metadata.py +62 -0
- app/core/query_analyzer.py +127 -0
- app/core/reranker.py +52 -0
- app/core/retriever.py +126 -0
- app/core/vectorstore.py +219 -0
- app/main.py +75 -0
- app/models/__init__.py +0 -0
- app/models/document.py +32 -0
- app/models/schemas.py +70 -0
- app/ui/__init__.py +0 -0
- app/ui/gradio_app.py +427 -0
- app/utils/__init__.py +0 -0
- app/utils/helpers.py +51 -0
- app/utils/parsers.py +76 -0
- docker-compose.yml +10 -0
- requirements.txt +18 -0
- tests/__init__.py +0 -0
- tests/conftest.py +21 -0
- tests/test_api.py +23 -0
- tests/test_chunker.py +42 -0
- tests/test_parsers.py +35 -0
- tests/test_query_analyzer.py +52 -0
- tests/test_retrieval.py +56 -0
.dockerignore
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.git
|
| 2 |
+
__pycache__
|
| 3 |
+
*.pyc
|
| 4 |
+
.env
|
| 5 |
+
.venv
|
| 6 |
+
venv
|
| 7 |
+
tests/
|
| 8 |
+
.github/
|
| 9 |
+
.pytest_cache
|
| 10 |
+
.coverage
|
| 11 |
+
htmlcov
|
| 12 |
+
*.egg-info
|
| 13 |
+
flashrank_cache/
|
| 14 |
+
.cache/
|
.github/workflows/ci.yml
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: CI
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
push:
|
| 5 |
+
branches: [main]
|
| 6 |
+
pull_request:
|
| 7 |
+
branches: [main]
|
| 8 |
+
|
| 9 |
+
jobs:
|
| 10 |
+
lint-and-test:
|
| 11 |
+
runs-on: ubuntu-latest
|
| 12 |
+
steps:
|
| 13 |
+
- uses: actions/checkout@v4
|
| 14 |
+
|
| 15 |
+
- name: Set up Python
|
| 16 |
+
uses: actions/setup-python@v5
|
| 17 |
+
with:
|
| 18 |
+
python-version: "3.12"
|
| 19 |
+
|
| 20 |
+
- name: Install dependencies
|
| 21 |
+
run: pip install -r requirements.txt
|
| 22 |
+
|
| 23 |
+
- name: Lint
|
| 24 |
+
run: ruff check .
|
| 25 |
+
|
| 26 |
+
- name: Run unit tests
|
| 27 |
+
run: pytest tests/ -v --ignore=tests/test_integration.py -x
|
| 28 |
+
env:
|
| 29 |
+
GEMINI_API_KEY: "test"
|
| 30 |
+
QDRANT_URL: "http://localhost:6333"
|
| 31 |
+
QDRANT_API_KEY: "test"
|
.gitignore
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
*.egg-info/
|
| 7 |
+
dist/
|
| 8 |
+
build/
|
| 9 |
+
*.egg
|
| 10 |
+
|
| 11 |
+
# Virtual environment
|
| 12 |
+
.venv/
|
| 13 |
+
venv/
|
| 14 |
+
env/
|
| 15 |
+
|
| 16 |
+
# Environment variables
|
| 17 |
+
.env
|
| 18 |
+
|
| 19 |
+
# IDE
|
| 20 |
+
.vscode/
|
| 21 |
+
.idea/
|
| 22 |
+
*.swp
|
| 23 |
+
*.swo
|
| 24 |
+
|
| 25 |
+
# OS
|
| 26 |
+
.DS_Store
|
| 27 |
+
Thumbs.db
|
| 28 |
+
|
| 29 |
+
# Models cache
|
| 30 |
+
flashrank_cache/
|
| 31 |
+
.cache/
|
| 32 |
+
|
| 33 |
+
# Uploads
|
| 34 |
+
uploads/
|
| 35 |
+
|
| 36 |
+
# Testing
|
| 37 |
+
.pytest_cache/
|
| 38 |
+
htmlcov/
|
| 39 |
+
.coverage
|
Dockerfile
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.12-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 6 |
+
build-essential && rm -rf /var/lib/apt/lists/*
|
| 7 |
+
|
| 8 |
+
COPY requirements.txt .
|
| 9 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 10 |
+
|
| 11 |
+
# Pre-download models so they're cached in the image layer
|
| 12 |
+
RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"
|
| 13 |
+
RUN python -c "from flashrank import Ranker; Ranker(model_name='ms-marco-MiniLM-L-12-v2')"
|
| 14 |
+
|
| 15 |
+
COPY . .
|
| 16 |
+
|
| 17 |
+
EXPOSE 7860
|
| 18 |
+
|
| 19 |
+
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
ADDED
|
@@ -0,0 +1,1499 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: RagCore
|
| 3 |
+
emoji: 🔍
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
pinned: false
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# RagCore
|
| 12 |
+
|
| 13 |
+
**A production-ready Retrieval-Augmented Generation system with hybrid search, metadata filtering, and a conversational UI.**
|
| 14 |
+
|
| 15 |
+
RagCore solves the problem of querying unstructured documents (PDFs, text files, HTML pages) using natural language. It ingests documents, splits them into semantically meaningful chunks, indexes them in both a vector database and a BM25 keyword index, then retrieves and reranks the most relevant passages to generate grounded, citation-backed answers using Google Gemini.
|
| 16 |
+
|
| 17 |
+
Unlike naive RAG implementations that rely solely on vector similarity, RagCore combines dense (semantic) and sparse (keyword) retrieval using Reciprocal Rank Fusion, applies a cross-encoder reranker to promote the most relevant passages, and uses an intelligent query analyzer that automatically extracts filters (date ranges, document types, sources) from natural language queries.
|
| 18 |
+
|
| 19 |
+
---
|
| 20 |
+
|
| 21 |
+
## Table of Contents
|
| 22 |
+
|
| 23 |
+
1. [Architecture Overview](#architecture-overview)
|
| 24 |
+
2. [Tech Stack](#tech-stack)
|
| 25 |
+
3. [Project Structure](#project-structure)
|
| 26 |
+
4. [Core Components Deep Dive](#core-components-deep-dive)
|
| 27 |
+
5. [Data Models](#data-models)
|
| 28 |
+
6. [API Reference](#api-reference)
|
| 29 |
+
7. [UI Guide](#ui-guide)
|
| 30 |
+
8. [Setup and Installation](#setup-and-installation)
|
| 31 |
+
9. [Deployment](#deployment)
|
| 32 |
+
10. [Configuration Reference](#configuration-reference)
|
| 33 |
+
11. [How It Works End-to-End](#how-it-works-end-to-end)
|
| 34 |
+
12. [Testing](#testing)
|
| 35 |
+
13. [CI/CD](#cicd)
|
| 36 |
+
14. [Performance and Limits](#performance-and-limits)
|
| 37 |
+
15. [Troubleshooting](#troubleshooting)
|
| 38 |
+
|
| 39 |
+
---
|
| 40 |
+
|
| 41 |
+
## Architecture Overview
|
| 42 |
+
|
| 43 |
+
RagCore is built as a FastAPI application with two main pipelines: **Ingestion** and **Query**. A Gradio-based UI is mounted directly onto the FastAPI app at `/ui`.
|
| 44 |
+
|
| 45 |
+
### Ingestion Pipeline
|
| 46 |
+
|
| 47 |
+
```
|
| 48 |
+
+------------------+ +----------------+ +-------------------+
|
| 49 |
+
| File Upload | --> | Parser | --> | Text Cleaner |
|
| 50 |
+
| (PDF/TXT/HTML) | | (pypdf/bs4) | | (regex cleanup) |
|
| 51 |
+
+------------------+ +----------------+ +-------------------+
|
| 52 |
+
|
|
| 53 |
+
v
|
| 54 |
+
+------------------+ +----------------+ +-------------------+
|
| 55 |
+
| Qdrant Cloud | <-- | Embedder | <-- | Chunker |
|
| 56 |
+
| (vector store) | | (MiniLM-L6-v2) | | (sentence-aware) |
|
| 57 |
+
+------------------+ +----------------+ +-------------------+
|
| 58 |
+
| |
|
| 59 |
+
| v
|
| 60 |
+
| +-------------------+
|
| 61 |
+
+------------------------------------> | BM25 Index |
|
| 62 |
+
| (in-memory) |
|
| 63 |
+
+-------------------+
|
| 64 |
+
^
|
| 65 |
+
|
|
| 66 |
+
+-------------------+
|
| 67 |
+
| Metadata Extractor|
|
| 68 |
+
| (title/dates/tags)|
|
| 69 |
+
+-------------------+
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
**Step-by-step flow:**
|
| 73 |
+
|
| 74 |
+
1. User uploads a file via the `/api/ingest` endpoint or the Gradio UI.
|
| 75 |
+
2. The **Parser** detects file type by extension and extracts raw text (pypdf for PDFs, BeautifulSoup for HTML, direct decoding for TXT).
|
| 76 |
+
3. The **Text Cleaner** normalizes whitespace, collapses blank lines, and trims each line.
|
| 77 |
+
4. The **Metadata Extractor** pulls out the document title (first non-empty line longer than 3 characters), dates (via regex patterns), and tags (frequent capitalized phrases).
|
| 78 |
+
5. The **Chunker** splits text into overlapping chunks at sentence boundaries, respecting a configurable word-count limit.
|
| 79 |
+
6. The **Embedder** encodes each chunk into a 384-dimensional vector using the `all-MiniLM-L6-v2` sentence transformer.
|
| 80 |
+
7. Chunks with their vectors and payload metadata are upserted into **Qdrant Cloud** in batches of 100.
|
| 81 |
+
8. The same chunks are added to the in-memory **BM25 index** for keyword search.
|
| 82 |
+
|
| 83 |
+
### Query Pipeline
|
| 84 |
+
|
| 85 |
+
```
|
| 86 |
+
+------------------+ +-------------------+ +------------------+
|
| 87 |
+
| User Query | --> | Query Analyzer | --> | Hybrid Retriever|
|
| 88 |
+
| "What is RAG | | (intent, filters, | | |
|
| 89 |
+
| from PDFs?" | | cleaned query) | | +----------+ |
|
| 90 |
+
+------------------+ +-------------------+ | |Dense | |
|
| 91 |
+
| |(Qdrant) | |
|
| 92 |
+
| +----------+ |
|
| 93 |
+
| | |
|
| 94 |
+
| +----------+ |
|
| 95 |
+
| |Sparse | |
|
| 96 |
+
| |(BM25) | |
|
| 97 |
+
| +----------+ |
|
| 98 |
+
| | |
|
| 99 |
+
| +----------+ |
|
| 100 |
+
| |RRF Fusion| |
|
| 101 |
+
| +----------+ |
|
| 102 |
+
+------------------+
|
| 103 |
+
|
|
| 104 |
+
v
|
| 105 |
+
+-------------------+ +------------------+
|
| 106 |
+
| Answer Generator | <-- | Reranker |
|
| 107 |
+
| (Gemini Flash) | | (FlashRank) |
|
| 108 |
+
+-------------------+ +------------------+
|
| 109 |
+
|
|
| 110 |
+
v
|
| 111 |
+
+-------------------+
|
| 112 |
+
| Cited Answer |
|
| 113 |
+
| with Sources |
|
| 114 |
+
+-------------------+
|
| 115 |
+
```
|
| 116 |
+
|
| 117 |
+
**Step-by-step flow:**
|
| 118 |
+
|
| 119 |
+
1. User submits a natural language query.
|
| 120 |
+
2. The **Query Analyzer** classifies intent (factual, summarize, comparative, list, explanatory), extracts inline filters (doc type, date range, source filename), and produces a cleaned query.
|
| 121 |
+
3. The **Hybrid Retriever** runs two parallel searches:
|
| 122 |
+
- **Dense search**: encodes the query with the same embedding model, queries Qdrant with cosine similarity, fetching `top_k * 2` results.
|
| 123 |
+
- **Sparse search**: tokenizes the query and scores all chunks via BM25Okapi, also fetching `top_k * 2` results.
|
| 124 |
+
4. Results are fused using **Reciprocal Rank Fusion (RRF)** with configurable weights (default: 0.6 dense, 0.4 sparse).
|
| 125 |
+
5. The top-K fused results are passed to the **Reranker** (FlashRank cross-encoder), which rescores and selects the best 5 passages.
|
| 126 |
+
6. The **Answer Generator** builds a prompt with numbered context passages and sends it to **Google Gemini Flash**, which generates a cited, markdown-formatted answer.
|
| 127 |
+
7. The answer is returned with source references (streaming or non-streaming).
|
| 128 |
+
|
| 129 |
+
---
|
| 130 |
+
|
| 131 |
+
## Tech Stack
|
| 132 |
+
|
| 133 |
+
| Technology | Version | Purpose |
|
| 134 |
+
|---|---|---|
|
| 135 |
+
| **Python** | 3.12 | Runtime language. Chosen for its ML/NLP ecosystem. |
|
| 136 |
+
| **FastAPI** | >=0.110 | Async web framework. High performance, automatic OpenAPI docs, dependency injection. |
|
| 137 |
+
| **Uvicorn** | >=0.29 | ASGI server for running FastAPI in production. |
|
| 138 |
+
| **Pydantic** | >=2.6 | Data validation and serialization for all request/response models. |
|
| 139 |
+
| **pydantic-settings** | >=2.2 | Environment-based configuration with `.env` file support. |
|
| 140 |
+
| **sentence-transformers** | >=2.6 | Embedding model loading and inference (`all-MiniLM-L6-v2`). Chosen for fast CPU inference and high quality at 384 dimensions. |
|
| 141 |
+
| **qdrant-client** | >=1.8 | Client for Qdrant vector database. Chosen for its generous free tier (1GB), filtering support, and payload storage. |
|
| 142 |
+
| **rank-bm25** | >=0.2.2 | BM25Okapi implementation for sparse keyword retrieval. Lightweight, pure-Python, no external dependencies. |
|
| 143 |
+
| **FlashRank** | >=0.2 | Ultra-fast cross-encoder reranker (`ms-marco-MiniLM-L-12-v2`). Runs on CPU, no GPU required. |
|
| 144 |
+
| **google-generativeai** | >=0.5 | Official Google Gemini SDK. Gemini 2.0 Flash offers a free tier with 15 RPM. |
|
| 145 |
+
| **Gradio** | >=4.20 | Web UI framework mounted directly on FastAPI. Two-tab interface for Q&A and document management. |
|
| 146 |
+
| **pypdf** | >=4.1 | PDF text extraction. Handles most PDF formats without external system dependencies. |
|
| 147 |
+
| **beautifulsoup4** | >=4.12 | HTML parsing with tag stripping (removes scripts, styles, nav, footer, header). |
|
| 148 |
+
| **httpx** | >=0.27 | Async/sync HTTP client used by the Gradio UI to call the FastAPI backend. |
|
| 149 |
+
| **python-multipart** | >=0.0.9 | Required by FastAPI for file upload support. |
|
| 150 |
+
| **python-dateutil** | >=2.9 | Fuzzy date parsing for the query analyzer's absolute date extraction. |
|
| 151 |
+
| **Ruff** | >=0.3 | Fast Python linter. Used in CI for code quality checks. |
|
| 152 |
+
| **pytest** | >=8.0 | Test framework. Unit tests for chunker, parsers, query analyzer, retrieval, and API. |
|
| 153 |
+
| **Docker** | - | Containerization. Pre-downloads ML models in the build step for fast cold starts. |
|
| 154 |
+
|
| 155 |
+
---
|
| 156 |
+
|
| 157 |
+
## Project Structure
|
| 158 |
+
|
| 159 |
+
```
|
| 160 |
+
ragcore/
|
| 161 |
+
|-- .github/
|
| 162 |
+
| +-- workflows/
|
| 163 |
+
| +-- ci.yml # GitHub Actions CI pipeline (lint + test)
|
| 164 |
+
|-- app/
|
| 165 |
+
| |-- __init__.py
|
| 166 |
+
| |-- config.py # Settings class with all env vars, setup_logging()
|
| 167 |
+
| |-- main.py # FastAPI app creation, lifespan, middleware, routing
|
| 168 |
+
| |-- api/
|
| 169 |
+
| | |-- __init__.py
|
| 170 |
+
| | |-- deps.py # Dependency injection factories for all services
|
| 171 |
+
| | +-- routes/
|
| 172 |
+
| | |-- __init__.py
|
| 173 |
+
| | |-- health.py # GET /health endpoint
|
| 174 |
+
| | |-- ingest.py # POST /api/ingest, GET /api/documents, DELETE /api/documents/{id}
|
| 175 |
+
| | +-- query.py # POST /api/search, POST /api/ask (with streaming)
|
| 176 |
+
| |-- core/
|
| 177 |
+
| | |-- __init__.py
|
| 178 |
+
| | |-- bm25.py # BM25 index: tokenization, search, rebuild from vectorstore
|
| 179 |
+
| | |-- chunker.py # Sentence-aware text chunking with overlap
|
| 180 |
+
| | |-- embedder.py # SentenceTransformer embedding service
|
| 181 |
+
| | |-- generator.py # Answer generation with prompt templates and streaming
|
| 182 |
+
| | |-- llm.py # Gemini API client with rate limiting
|
| 183 |
+
| | |-- metadata.py # Metadata extraction (title, dates, tags)
|
| 184 |
+
| | |-- query_analyzer.py # Query intent classification and filter extraction
|
| 185 |
+
| | |-- reranker.py # FlashRank cross-encoder reranking
|
| 186 |
+
| | |-- retriever.py # Hybrid retriever with RRF fusion
|
| 187 |
+
| | +-- vectorstore.py # Qdrant client wrapper (CRUD, search, filtering)
|
| 188 |
+
| |-- models/
|
| 189 |
+
| | |-- __init__.py
|
| 190 |
+
| | |-- document.py # DocumentMetadata, Chunk, Document models
|
| 191 |
+
| | +-- schemas.py # API request/response schemas (IngestResponse, QueryRequest, etc.)
|
| 192 |
+
| |-- ui/
|
| 193 |
+
| | |-- __init__.py
|
| 194 |
+
| | +-- gradio_app.py # Gradio Blocks UI (Ask tab, Documents tab)
|
| 195 |
+
| +-- utils/
|
| 196 |
+
| |-- __init__.py
|
| 197 |
+
| |-- helpers.py # generate_id, clean_text, count_words, timer, retry_with_backoff
|
| 198 |
+
| +-- parsers.py # File parsing (PDF, TXT, HTML) and page count extraction
|
| 199 |
+
|-- tests/
|
| 200 |
+
| |-- __init__.py
|
| 201 |
+
| |-- conftest.py # Shared fixtures (TestClient, sample_text)
|
| 202 |
+
| |-- test_api.py # API integration tests (health, redirect, docs)
|
| 203 |
+
| |-- test_chunker.py # Chunker unit tests (empty, single, multiple, overlap)
|
| 204 |
+
| |-- test_parsers.py # Parser unit tests (UTF-8, Latin-1, HTML, unsupported)
|
| 205 |
+
| |-- test_query_analyzer.py # Query analyzer tests (intents, filters, dates)
|
| 206 |
+
| +-- test_retrieval.py # RRF fusion tests (basic, empty, weights, filters)
|
| 207 |
+
|-- .dockerignore
|
| 208 |
+
|-- .env # Environment variables (not committed to git)
|
| 209 |
+
|-- .gitignore
|
| 210 |
+
|-- Dockerfile # Python 3.12-slim, pre-downloads ML models
|
| 211 |
+
|-- docker-compose.yml # Single-service compose with env_file
|
| 212 |
+
+-- requirements.txt # All Python dependencies with version constraints
|
| 213 |
+
```
|
| 214 |
+
|
| 215 |
+
---
|
| 216 |
+
|
| 217 |
+
## Core Components Deep Dive
|
| 218 |
+
|
| 219 |
+
### Parsers (`app/utils/parsers.py`)
|
| 220 |
+
|
| 221 |
+
**What it does:** Extracts raw text from uploaded files based on their extension.
|
| 222 |
+
|
| 223 |
+
**Supported formats:** `.pdf`, `.txt`, `.html`, `.htm`
|
| 224 |
+
|
| 225 |
+
**How it works internally:**
|
| 226 |
+
|
| 227 |
+
- `parse_document(file_bytes, filename)` is the main dispatcher. It reads the file extension and calls the appropriate parser.
|
| 228 |
+
- **PDF parsing** uses `pypdf.PdfReader` to iterate over all pages, extract text from each, and join them with double newlines.
|
| 229 |
+
- **HTML parsing** uses `BeautifulSoup` with the `html.parser` backend. Before extracting text, it decomposes `<script>`, `<style>`, `<nav>`, `<footer>`, and `<header>` tags to remove boilerplate content. Text is extracted with `get_text(separator="\n")`.
|
| 230 |
+
- **TXT parsing** attempts UTF-8 decoding first, falling back to Latin-1 for non-UTF-8 files.
|
| 231 |
+
- All parsers pass their output through `clean_text()` for normalization.
|
| 232 |
+
|
| 233 |
+
**Key functions:**
|
| 234 |
+
|
| 235 |
+
```python
|
| 236 |
+
def parse_document(file_bytes: bytes, filename: str) -> str
|
| 237 |
+
def parse_pdf(file_bytes: bytes, filename: str) -> str
|
| 238 |
+
def parse_text(file_bytes: bytes, filename: str) -> str
|
| 239 |
+
def parse_html(file_bytes: bytes, filename: str) -> str
|
| 240 |
+
def get_page_count(file_bytes: bytes, filename: str) -> int | None
|
| 241 |
+
```
|
| 242 |
+
|
| 243 |
+
**Configuration:** No direct configuration. File size is validated at the API layer (`max_file_size_mb`).
|
| 244 |
+
|
| 245 |
+
---
|
| 246 |
+
|
| 247 |
+
### Chunker (`app/core/chunker.py`)
|
| 248 |
+
|
| 249 |
+
**What it does:** Splits raw text into overlapping chunks at sentence boundaries, sized by word count.
|
| 250 |
+
|
| 251 |
+
**How it works internally:**
|
| 252 |
+
|
| 253 |
+
1. Text is split into sentences using the regex pattern `(?<=[.!?])\s+` (splits on any run of whitespace that immediately follows `.`, `!`, or `?`, so the punctuation stays with its sentence).
|
| 254 |
+
2. Whole sentences are appended to the current chunk one at a time, and the chunk's size is measured in words.
|
| 255 |
+
3. When adding the next sentence would exceed `chunk_size` words, the current chunk is finalized.
|
| 256 |
+
4. Overlap is implemented by retaining the last `chunk_overlap` words from the previous chunk as the start of the new chunk.
|
| 257 |
+
5. Each chunk records its `text`, `start_char`, `end_char`, and `chunk_index`.
|
| 258 |
+
|
| 259 |
+
**Key function:**
|
| 260 |
+
|
| 261 |
+
```python
|
| 262 |
+
def chunk_text(
|
| 263 |
+
text: str,
|
| 264 |
+
chunk_size: int = 512, # Maximum words per chunk
|
| 265 |
+
chunk_overlap: int = 50, # Number of overlapping words between consecutive chunks
|
| 266 |
+
) -> list[dict]
|
| 267 |
+
```
|
| 268 |
+
|
| 269 |
+
**Return format:** Each dict contains `{"text": str, "start_char": int, "end_char": int, "chunk_index": int}`.
|
| 270 |
+
|
| 271 |
+
**Configuration:**
|
| 272 |
+
|
| 273 |
+
| Setting | Default | Description |
|
| 274 |
+
|---|---|---|
|
| 275 |
+
| `CHUNK_SIZE` | 512 | Maximum number of words per chunk |
|
| 276 |
+
| `CHUNK_OVERLAP` | 50 | Number of overlapping words between consecutive chunks |
|
| 277 |
+
|
| 278 |
+
**Design note:** Sentence-aware splitting avoids cutting mid-sentence, which improves both retrieval relevance and answer generation quality compared to fixed-character splitting.
|
| 279 |
+
|
| 280 |
+
---
|
| 281 |
+
|
| 282 |
+
### Metadata Extractor (`app/core/metadata.py`)
|
| 283 |
+
|
| 284 |
+
**What it does:** Automatically extracts structured metadata from raw document text.
|
| 285 |
+
|
| 286 |
+
**How it works internally:**
|
| 287 |
+
|
| 288 |
+
- **Title extraction:** Scans lines from the top of the document, returning the first non-empty line with more than 3 characters (truncated to 200 chars).
|
| 289 |
+
- **Date extraction:** Searches the first 2000 characters for dates using three regex patterns:
|
| 290 |
+
- `YYYY-MM-DD` (ISO format)
|
| 291 |
+
- `MM/DD/YYYY` (US format)
|
| 292 |
+
- `Month DD, YYYY` (long format, e.g., "January 15, 2024")
|
| 293 |
+
- **Tag extraction:** Finds all capitalized phrases (e.g., "Machine Learning", "Neural Network") using regex, counts their occurrences, and returns the top 10 that appear at least twice. Tags are lowercased before returning.
|
| 294 |
+
- **Doc type:** Derived from the file extension (e.g., "pdf", "html", "txt").
|
| 295 |
+
|
| 296 |
+
**Key function:**
|
| 297 |
+
|
| 298 |
+
```python
|
| 299 |
+
def extract_metadata(raw_text: str, filename: str, page_count: int | None = None) -> DocumentMetadata
|
| 300 |
+
```
|
| 301 |
+
|
| 302 |
+
**Supporting functions:**
|
| 303 |
+
|
| 304 |
+
```python
|
| 305 |
+
def extract_title(text: str) -> str | None
|
| 306 |
+
def extract_dates(text: str) -> datetime | None
|
| 307 |
+
def extract_tags(text: str, max_tags: int = 10) -> list[str]
|
| 308 |
+
```
|
| 309 |
+
|
| 310 |
+
---
|
| 311 |
+
|
| 312 |
+
### Embedder (`app/core/embedder.py`)
|
| 313 |
+
|
| 314 |
+
**What it does:** Converts text into dense vector representations using a sentence transformer model.
|
| 315 |
+
|
| 316 |
+
**How it works internally:**
|
| 317 |
+
|
| 318 |
+
- Uses `sentence-transformers` to load the `all-MiniLM-L6-v2` model on CPU at startup.
|
| 319 |
+
- Encodes text in batches of 64 with L2 normalization enabled (so cosine similarity is equivalent to dot product).
|
| 320 |
+
- The model produces 384-dimensional embeddings.
|
| 321 |
+
- Singleton pattern via `get_embedder()` ensures the model is loaded only once.
|
| 322 |
+
|
| 323 |
+
**Key class:** `EmbedderService`
|
| 324 |
+
|
| 325 |
+
```python
|
| 326 |
+
class EmbedderService:
|
| 327 |
+
EMBEDDING_DIM = 384
|
| 328 |
+
|
| 329 |
+
def __init__(self, model_name: str)
|
| 330 |
+
def embed_texts(self, texts: list[str]) -> list[list[float]] # Batch embedding
|
| 331 |
+
def embed_query(self, query: str) -> list[float] # Single query embedding
|
| 332 |
+
```
|
| 333 |
+
|
| 334 |
+
**Configuration:**
|
| 335 |
+
|
| 336 |
+
| Setting | Default | Description |
|
| 337 |
+
|---|---|---|
|
| 338 |
+
| `EMBEDDING_MODEL` | `all-MiniLM-L6-v2` | HuggingFace sentence-transformers model name |
|
| 339 |
+
| `EMBEDDING_DIM` | 384 | Embedding vector dimensionality |
|
| 340 |
+
|
| 341 |
+
---
|
| 342 |
+
|
| 343 |
+
### Vector Store -- Qdrant (`app/core/vectorstore.py`)
|
| 344 |
+
|
| 345 |
+
**What it does:** Manages all interactions with the Qdrant vector database: collection management, upserting chunks, searching, filtering, scrolling, and deleting.
|
| 346 |
+
|
| 347 |
+
**How it works internally:**
|
| 348 |
+
|
| 349 |
+
- On initialization, connects to Qdrant Cloud using the provided URL and API key.
|
| 350 |
+
- `ensure_collection()` checks if the collection exists; if not, creates it with cosine distance and the configured vector size.
|
| 351 |
+
- **Upsert:** Chunks are uploaded in batches of 100 as `PointStruct` objects, with the chunk text and all metadata stored in the payload.
|
| 352 |
+
- **Search:** Uses `query_points()` with an optional `Filter` object built from `SearchFilters`. Over-fetches `top_k * 2` results to give the fusion step more candidates.
|
| 353 |
+
- **Filtering:** Supports exact match on `source` and `doc_type`, `MatchAny` on `tags`, and `Range` on `created_date`.
|
| 354 |
+
- **Scroll:** Iterates through all points in the collection using offset-based pagination (batch size 100). Used to rebuild the BM25 index on startup.
|
| 355 |
+
- **Document listing:** Aggregates all points by `document_id` to return a list of unique documents with chunk counts.
|
| 356 |
+
|
| 357 |
+
**Key class:** `VectorStoreService`
|
| 358 |
+
|
| 359 |
+
```python
|
| 360 |
+
class VectorStoreService:
|
| 361 |
+
def __init__(self, url: str, api_key: str, collection_name: str)
|
| 362 |
+
def ensure_collection(self, vector_size: int = 384) -> None
|
| 363 |
+
def upsert_chunks(self, chunks: list[Chunk], embeddings: list[list[float]]) -> None
|
| 364 |
+
def search(self, query_vector: list[float], limit: int = 10, filters: SearchFilters | None = None) -> list[dict]
|
| 365 |
+
def delete_document(self, document_id: str) -> int
|
| 366 |
+
def scroll_all(self, batch_size: int = 100) -> list[dict]
|
| 367 |
+
def get_document_ids(self) -> list[dict]
|
| 368 |
+
def count(self) -> int
|
| 369 |
+
```
|
| 370 |
+
|
| 371 |
+
**Payload schema stored per point:**
|
| 372 |
+
|
| 373 |
+
```json
|
| 374 |
+
{
|
| 375 |
+
"text": "chunk text content",
|
| 376 |
+
"document_id": "uuid-string",
|
| 377 |
+
"chunk_index": 0,
|
| 378 |
+
"source": "filename.pdf",
|
| 379 |
+
"doc_type": "pdf",
|
| 380 |
+
"title": "Document Title or null",
|
| 381 |
+
"created_date": "2024-01-15T00:00:00 or null",
|
| 382 |
+
"tags": ["machine learning", "neural networks"],
|
| 383 |
+
"page_count": 12
|
| 384 |
+
}
|
| 385 |
+
```
|
| 386 |
+
|
| 387 |
+
**Configuration:**
|
| 388 |
+
|
| 389 |
+
| Setting | Default | Description |
|
| 390 |
+
|---|---|---|
|
| 391 |
+
| `QDRANT_URL` | (required) | Qdrant Cloud cluster URL |
|
| 392 |
+
| `QDRANT_API_KEY` | (required) | Qdrant Cloud API key |
|
| 393 |
+
| `QDRANT_COLLECTION` | `ragcore_docs` | Collection name in Qdrant |
|
| 394 |
+
|
| 395 |
+
---
|
| 396 |
+
|
| 397 |
+
### BM25 Index (`app/core/bm25.py`)
|
| 398 |
+
|
| 399 |
+
**What it does:** Maintains an in-memory BM25 keyword index for sparse retrieval alongside the dense vector search.
|
| 400 |
+
|
| 401 |
+
**How it works internally:**
|
| 402 |
+
|
| 403 |
+
- **Tokenization:** Text is lowercased, split into words via `\b\w+\b`, then filtered to remove stop words (58 common English words) and single-character tokens.
|
| 404 |
+
- Uses `rank_bm25.BM25Okapi`, which implements the Okapi BM25 scoring formula:
|
| 405 |
+
```
|
| 406 |
+
score(D, Q) = SUM_over_q_in_Q[ IDF(q) * (f(q,D) * (k1+1)) / (f(q,D) + k1 * (1 - b + b * |D|/avgdl)) ]
|
| 407 |
+
```
|
| 408 |
+
- On startup, the index is rebuilt from all existing points in Qdrant via `rebuild_from_vectorstore()`, which scrolls through all stored chunks.
|
| 409 |
+
- When new documents are ingested, `add_documents()` appends them to the in-memory document list and rebuilds the entire BM25 index from that list (the index is not incremental).
|
| 410 |
+
- Search returns scored results filtered to only those with `score > 0`.
|
| 411 |
+
|
| 412 |
+
**Key class:** `BM25Index`
|
| 413 |
+
|
| 414 |
+
```python
|
| 415 |
+
class BM25Index:
|
| 416 |
+
def __init__(self)
|
| 417 |
+
def build_index(self, chunks: list[Chunk]) -> None
|
| 418 |
+
def add_documents(self, chunks: list[Chunk]) -> None
|
| 419 |
+
def search(self, query: str, top_k: int = 10) -> list[dict]
|
| 420 |
+
def rebuild_from_vectorstore(self, vectorstore) -> None
|
| 421 |
+
@property
|
| 422 |
+
def doc_count(self) -> int
|
| 423 |
+
```
|
| 424 |
+
|
| 425 |
+
**Tokenization function:**
|
| 426 |
+
|
| 427 |
+
```python
|
| 428 |
+
def tokenize(text: str) -> list[str]
|
| 429 |
+
```
|
| 430 |
+
|
| 431 |
+
**Design note:** The in-memory approach means the BM25 index is rebuilt on every application restart (from Qdrant data). This is acceptable for small-to-medium collections (thousands of chunks) but would need a persistent store for larger deployments.
|
| 432 |
+
|
| 433 |
+
---
|
| 434 |
+
|
| 435 |
+
### Hybrid Retriever with RRF (`app/core/retriever.py`)
|
| 436 |
+
|
| 437 |
+
**What it does:** Combines dense (vector) and sparse (BM25) retrieval results using Reciprocal Rank Fusion.
|
| 438 |
+
|
| 439 |
+
**How it works internally:**
|
| 440 |
+
|
| 441 |
+
1. Embeds the query using the same `EmbedderService`.
|
| 442 |
+
2. Runs a dense search via Qdrant, fetching `top_k * 2` candidates (over-fetch to give fusion more options).
|
| 443 |
+
3. Runs a BM25 search, also fetching `top_k * 2` candidates.
|
| 444 |
+
4. If filters were provided, applies them post-hoc to BM25 results (since BM25 does not natively support metadata filtering).
|
| 445 |
+
5. Fuses both result lists using the **RRF formula**:
|
| 446 |
+
|
| 447 |
+
```
|
| 448 |
+
RRF_score(d) = SUM_over_lists[ weight_i * 1 / (k + rank_i(d)) ]
|
| 449 |
+
```
|
| 450 |
+
|
| 451 |
+
Where `k = 60` (smoothing constant), `rank_i(d)` is the 0-indexed rank of document `d` in list `i` (so the top result of a list contributes `weight_i / 60`), and `weight_i` is the list weight (default: 0.6 for dense, 0.4 for sparse).
|
| 452 |
+
|
| 453 |
+
6. Deduplicates by `chunk_id` and returns the top-K results as `RetrievedChunk` objects.
|
| 454 |
+
|
| 455 |
+
**Key class:** `HybridRetriever`
|
| 456 |
+
|
| 457 |
+
```python
|
| 458 |
+
class HybridRetriever:
|
| 459 |
+
def __init__(self, vectorstore: VectorStoreService, bm25: BM25Index, embedder: EmbedderService)
|
| 460 |
+
def retrieve(self, query: str, top_k: int = 10, filters: SearchFilters | None = None,
|
| 461 |
+
dense_weight: float = 0.6, sparse_weight: float = 0.4) -> list[RetrievedChunk]
|
| 462 |
+
|
| 463 |
+
@staticmethod
|
| 464 |
+
def rrf_fuse(result_lists: list[list[dict]], k: int = 60,
|
| 465 |
+
weights: list[float] | None = None) -> list[dict]
|
| 466 |
+
|
| 467 |
+
@staticmethod
|
| 468 |
+
def _apply_filters(results: list[dict], filters: SearchFilters) -> list[dict]
|
| 469 |
+
```
|
| 470 |
+
|
| 471 |
+
**Configuration:**
|
| 472 |
+
|
| 473 |
+
| Setting | Default | Description |
|
| 474 |
+
|---|---|---|
|
| 475 |
+
| `TOP_K` | 10 | Number of chunks to return from retrieval |
|
| 476 |
+
| `DENSE_WEIGHT` | 0.6 | Weight for dense (vector) search in RRF |
|
| 477 |
+
| `SPARSE_WEIGHT` | 0.4 | Weight for sparse (BM25) search in RRF |
|
| 478 |
+
|
| 479 |
+
**Why RRF?** Reciprocal Rank Fusion is a score-agnostic fusion method. BM25 scores and cosine similarity scores are on different scales and cannot be compared directly; RRF avoids the problem by using only rank positions, which makes it a robust choice for combining heterogeneous retrieval signals.
|
| 480 |
+
|
| 481 |
+
---
|
| 482 |
+
|
| 483 |
+
### Reranker (`app/core/reranker.py`)
|
| 484 |
+
|
| 485 |
+
**What it does:** Rescores retrieved chunks using a cross-encoder model to improve ranking precision.
|
| 486 |
+
|
| 487 |
+
**How it works internally:**
|
| 488 |
+
|
| 489 |
+
- Uses FlashRank with the `ms-marco-MiniLM-L-12-v2` model, which is a lightweight cross-encoder trained on the MS MARCO passage ranking dataset.
|
| 490 |
+
- Unlike embedding models (which encode query and document independently), cross-encoders process the query-document pair jointly, allowing richer interaction signals.
|
| 491 |
+
- Input: the query string and a list of `RetrievedChunk` objects from the hybrid retriever.
|
| 492 |
+
- Output: the top `rerank_top_k` chunks reordered by cross-encoder score.
|
| 493 |
+
- The reranker model is cached in `./flashrank_cache/` to avoid re-downloading on each startup.
|
| 494 |
+
|
| 495 |
+
**Key class:** `RerankerService`
|
| 496 |
+
|
| 497 |
+
```python
|
| 498 |
+
class RerankerService:
|
| 499 |
+
def __init__(self)
|
| 500 |
+
def rerank(self, query: str, chunks: list[RetrievedChunk], top_k: int = 5) -> list[RetrievedChunk]
|
| 501 |
+
```
|
| 502 |
+
|
| 503 |
+
**Configuration:**
|
| 504 |
+
|
| 505 |
+
| Setting | Default | Description |
|
| 506 |
+
|---|---|---|
|
| 507 |
+
| `RERANK_TOP_K` | 5 | Number of chunks to keep after reranking |
|
| 508 |
+
|
| 509 |
+
---
|
| 510 |
+
|
| 511 |
+
### LLM Client (`app/core/llm.py`)
|
| 512 |
+
|
| 513 |
+
**What it does:** Manages all communication with the Google Gemini API, including rate limiting and streaming.
|
| 514 |
+
|
| 515 |
+
**How it works internally:**
|
| 516 |
+
|
| 517 |
+
- Configures the `google.generativeai` library with the provided API key.
|
| 518 |
+
- Instantiates a `GenerativeModel` for the configured model name (default: `gemini-2.0-flash`).
|
| 519 |
+
- **Rate limiting:** Enforces a minimum interval between API calls based on `rpm_limit`. For the free tier (15 RPM), the minimum interval is 4 seconds (60 s / 15 requests). Uses `time.sleep()` for synchronous calls and `asyncio.sleep()` for async calls.
|
| 520 |
+
- **Synchronous generation:** `generate(prompt, temperature, max_tokens)` returns the full response text.
|
| 521 |
+
- **Streaming generation:** `generate_stream(prompt, temperature, max_tokens)` is an async generator that yields text chunks as they arrive from the API.
|
| 522 |
+
|
| 523 |
+
**Key class:** `GeminiService`
|
| 524 |
+
|
| 525 |
+
```python
|
| 526 |
+
class GeminiService:
|
| 527 |
+
def __init__(self, api_key: str, model_name: str, rpm_limit: int = 15)
|
| 528 |
+
def generate(self, prompt: str, temperature: float = 0.3, max_tokens: int = 2048) -> str
|
| 529 |
+
async def generate_stream(self, prompt: str, temperature: float = 0.3,
|
| 530 |
+
max_tokens: int = 2048) -> AsyncGenerator[str, None]
|
| 531 |
+
```
|
| 532 |
+
|
| 533 |
+
**Configuration:**
|
| 534 |
+
|
| 535 |
+
| Setting | Default | Description |
|
| 536 |
+
|---|---|---|
|
| 537 |
+
| `GEMINI_API_KEY` | (required) | Google Gemini API key |
|
| 538 |
+
| `GEMINI_MODEL` | `gemini-2.0-flash` | Gemini model identifier |
|
| 539 |
+
| `GEMINI_RPM_LIMIT` | 15 | Requests per minute limit |
|
| 540 |
+
| `GEMINI_TEMPERATURE` | 0.3 | Generation temperature (lower = more deterministic) |
|
| 541 |
+
| `GEMINI_MAX_TOKENS` | 2048 | Maximum output tokens per generation |
|
| 542 |
+
|
| 543 |
+
---
|
| 544 |
+
|
| 545 |
+
### Query Analyzer (`app/core/query_analyzer.py`)
|
| 546 |
+
|
| 547 |
+
**What it does:** Parses natural language queries to extract intent, metadata filters, and a cleaned query string.
|
| 548 |
+
|
| 549 |
+
**How it works internally:**
|
| 550 |
+
|
| 551 |
+
The analyzer performs multiple regex-based extractions in sequence:
|
| 552 |
+
|
| 553 |
+
1. **Document type extraction:** Matches patterns like "PDFs", "pdf", "HTML", "text files", "txt" and sets the `doc_type` filter.
|
| 554 |
+
2. **Relative date extraction:** Matches temporal phrases like "last week", "last month", "this year", "today", "yesterday" and converts them to `date_from`/`date_to` datetime ranges.
|
| 555 |
+
3. **Absolute date extraction:** Matches "after {date}" and "before {date}" patterns. Uses `python-dateutil` for fuzzy parsing of the date string.
|
| 556 |
+
4. **Source extraction:** Matches "from {filename.ext}" patterns to filter by specific source file.
|
| 557 |
+
5. **Query cleaning:** Removes all matched filter phrases from the query, collapses whitespace, and strips dangling prepositions (about, from, in, on).
|
| 558 |
+
6. **Intent classification:** Matches the original query against patterns for five intent types:
|
| 559 |
+
- `summarize` -- "summarize", "summary", "overview"
|
| 560 |
+
- `comparative` -- "compare", "difference", "vs", "versus"
|
| 561 |
+
- `list` -- "list", "enumerate", "what are all"
|
| 562 |
+
- `explanatory` -- starts with "why", "how", "explain"
|
| 563 |
+
- `factual` -- starts with "what", "who", "when", "where", "how many/much" (default fallback)
|
| 564 |
+
7. **Confidence scoring:** Starts at 0.5 and is incremented by 0.1 for each kind of filter successfully extracted (a date range counts once, as in the example below), capped at 1.0.
|
| 565 |
+
|
| 566 |
+
**Key class:** `QueryAnalyzer`
|
| 567 |
+
|
| 568 |
+
```python
|
| 569 |
+
class QueryAnalyzer:
|
| 570 |
+
def analyze(self, query: str) -> AnalyzedQuery
|
| 571 |
+
```
|
| 572 |
+
|
| 573 |
+
**Example:**
|
| 574 |
+
|
| 575 |
+
Input: `"summarize PDFs from last month"`
|
| 576 |
+
|
| 577 |
+
Output:
|
| 578 |
+
```json
|
| 579 |
+
{
|
| 580 |
+
"original_query": "summarize PDFs from last month",
|
| 581 |
+
"clean_query": "summarize",
|
| 582 |
+
"intent": "summarize",
|
| 583 |
+
"extracted_filters": {
|
| 584 |
+
"doc_type": "pdf",
|
| 585 |
+
"date_from": "2026-02-17T00:00:00",
|
| 586 |
+
"date_to": "2026-03-17T00:00:00"
|
| 587 |
+
},
|
| 588 |
+
"confidence": 0.7
|
| 589 |
+
}
|
| 590 |
+
```
|
| 591 |
+
|
| 592 |
+
---
|
| 593 |
+
|
| 594 |
+
### Answer Generator (`app/core/generator.py`)
|
| 595 |
+
|
| 596 |
+
**What it does:** Builds a prompt from retrieved chunks and generates a cited answer using the LLM.
|
| 597 |
+
|
| 598 |
+
**How it works internally:**
|
| 599 |
+
|
| 600 |
+
1. **Reranking:** Calls the `RerankerService` to narrow the retrieved chunks to `rerank_top_k`.
|
| 601 |
+
2. **Context building:** Formats each reranked chunk as a numbered passage with its source filename:
|
| 602 |
+
```
|
| 603 |
+
[1] (Source: report.pdf)
|
| 604 |
+
Chunk text content here...
|
| 605 |
+
|
| 606 |
+
[2] (Source: notes.txt)
|
| 607 |
+
Another chunk text...
|
| 608 |
+
```
|
| 609 |
+
3. **Prompt selection:** Uses `SYSTEM_PROMPT` for most intents and `SUMMARY_PROMPT` when the intent is "summarize".
|
| 610 |
+
4. **Prompt rules instruct the LLM to:**
|
| 611 |
+
- Answer based ONLY on the provided context
|
| 612 |
+
- Cite sources inline using [1], [2], etc.
|
| 613 |
+
- Admit when context is insufficient
|
| 614 |
+
- Use markdown formatting
|
| 615 |
+
5. **Streaming:** The `generate_answer_stream()` async generator yields text chunks during generation, then yields a final `GeneratedAnswer` object with source metadata.
|
| 616 |
+
|
| 617 |
+
**Key class:** `AnswerGenerator`
|
| 618 |
+
|
| 619 |
+
```python
|
| 620 |
+
class AnswerGenerator:
|
| 621 |
+
def __init__(self, llm: GeminiService, reranker: RerankerService)
|
| 622 |
+
def generate_answer(self, query: str, chunks: list[RetrievedChunk],
|
| 623 |
+
rerank_top_k: int = 5, intent: str = "factual") -> GeneratedAnswer
|
| 624 |
+
async def generate_answer_stream(self, query: str, chunks: list[RetrievedChunk],
|
| 625 |
+
rerank_top_k: int = 5, intent: str = "factual") -> AsyncGenerator
|
| 626 |
+
```
|
| 627 |
+
|
| 628 |
+
---
|
| 629 |
+
|
| 630 |
+
## Data Models
|
| 631 |
+
|
| 632 |
+
All models are defined using Pydantic v2 and live in `app/models/`.
|
| 633 |
+
|
| 634 |
+
### Core Document Models (`app/models/document.py`)
|
| 635 |
+
|
| 636 |
+
#### `DocumentMetadata`
|
| 637 |
+
|
| 638 |
+
Stores extracted metadata for a document or chunk.
|
| 639 |
+
|
| 640 |
+
| Field | Type | Default | Description |
|
| 641 |
+
|---|---|---|---|
|
| 642 |
+
| `source` | `str` | `""` | Original filename |
|
| 643 |
+
| `doc_type` | `str` | `""` | File type without dot (e.g., "pdf", "html", "txt") |
|
| 644 |
+
| `title` | `str \| None` | `None` | Extracted title (first meaningful line) |
|
| 645 |
+
| `created_date` | `datetime \| None` | `None` | Extracted date from document content |
|
| 646 |
+
| `tags` | `list[str]` | `[]` | Auto-extracted topic tags |
|
| 647 |
+
| `page_count` | `int \| None` | `None` | Number of pages (PDFs only) |
|
| 648 |
+
|
| 649 |
+
#### `Chunk`
|
| 650 |
+
|
| 651 |
+
Represents a single text chunk derived from a document.
|
| 652 |
+
|
| 653 |
+
| Field | Type | Default | Description |
|
| 654 |
+
|---|---|---|---|
|
| 655 |
+
| `chunk_id` | `str` | `uuid4()` | Unique chunk identifier |
|
| 656 |
+
| `document_id` | `str` | `""` | Parent document identifier |
|
| 657 |
+
| `text` | `str` | `""` | Chunk text content |
|
| 658 |
+
| `metadata` | `DocumentMetadata` | `{}` | Inherited document metadata |
|
| 659 |
+
| `chunk_index` | `int` | `0` | Position of this chunk in the document |
|
| 660 |
+
| `start_char` | `int` | `0` | Start character offset in original text |
|
| 661 |
+
| `end_char` | `int` | `0` | End character offset in original text |
|
| 662 |
+
|
| 663 |
+
#### `Document`
|
| 664 |
+
|
| 665 |
+
Represents a full ingested document.
|
| 666 |
+
|
| 667 |
+
| Field | Type | Default | Description |
|
| 668 |
+
|---|---|---|---|
|
| 669 |
+
| `document_id` | `str` | `uuid4()` | Unique document identifier |
|
| 670 |
+
| `filename` | `str` | `""` | Original filename |
|
| 671 |
+
| `metadata` | `DocumentMetadata` | `{}` | Extracted metadata |
|
| 672 |
+
| `chunks` | `list[Chunk]` | `[]` | Child chunks (populated during ingestion) |
|
| 673 |
+
| `raw_text` | `str` | `""` | Full extracted text |
|
| 674 |
+
|
| 675 |
+
### API Schemas (`app/models/schemas.py`)
|
| 676 |
+
|
| 677 |
+
#### `IngestResponse`
|
| 678 |
+
|
| 679 |
+
Returned after successful document ingestion.
|
| 680 |
+
|
| 681 |
+
| Field | Type | Description |
|
| 682 |
+
|---|---|---|
|
| 683 |
+
| `document_id` | `str` | Assigned UUID |
|
| 684 |
+
| `filename` | `str` | Original filename |
|
| 685 |
+
| `num_chunks` | `int` | Number of chunks created |
|
| 686 |
+
| `message` | `str` | Human-readable success message |
|
| 687 |
+
|
| 688 |
+
#### `SearchFilters`
|
| 689 |
+
|
| 690 |
+
Used for metadata filtering in search and query operations.
|
| 691 |
+
|
| 692 |
+
| Field | Type | Default | Description |
|
| 693 |
+
|---|---|---|---|
|
| 694 |
+
| `source` | `str \| None` | `None` | Filter by exact source filename |
|
| 695 |
+
| `doc_type` | `str \| None` | `None` | Filter by document type |
|
| 696 |
+
| `date_from` | `datetime \| None` | `None` | Filter documents created on or after this date |
|
| 697 |
+
| `date_to` | `datetime \| None` | `None` | Filter documents created on or before this date |
|
| 698 |
+
| `tags` | `list[str] \| None` | `None` | Filter by any matching tag |
|
| 699 |
+
|
| 700 |
+
#### `RetrievedChunk`
|
| 701 |
+
|
| 702 |
+
A chunk returned from retrieval, with its relevance score and rank.
|
| 703 |
+
|
| 704 |
+
| Field | Type | Description |
|
| 705 |
+
|---|---|---|
|
| 706 |
+
| `chunk_id` | `str` | Chunk identifier |
|
| 707 |
+
| `document_id` | `str` | Parent document identifier |
|
| 708 |
+
| `text` | `str` | Chunk text |
|
| 709 |
+
| `score` | `float` | Relevance score (RRF-fused or reranker score) |
|
| 710 |
+
| `metadata` | `DocumentMetadata` | Chunk metadata |
|
| 711 |
+
| `rank` | `int` | Position in the result list (0-indexed) |
|
| 712 |
+
|
| 713 |
+
#### `SearchRequest`
|
| 714 |
+
|
| 715 |
+
Request body for the `/api/search` endpoint.
|
| 716 |
+
|
| 717 |
+
| Field | Type | Default | Description |
|
| 718 |
+
|---|---|---|---|
|
| 719 |
+
| `query` | `str` | (required) | Natural language search query |
|
| 720 |
+
| `top_k` | `int` | `10` | Number of results to return |
|
| 721 |
+
| `filters` | `SearchFilters \| None` | `None` | Optional explicit filters (override auto-extraction) |
|
| 722 |
+
|
| 723 |
+
#### `SearchResponse`
|
| 724 |
+
|
| 725 |
+
Response from the `/api/search` endpoint.
|
| 726 |
+
|
| 727 |
+
| Field | Type | Description |
|
| 728 |
+
|---|---|---|
|
| 729 |
+
| `query` | `str` | Original query |
|
| 730 |
+
| `results` | `list[RetrievedChunk]` | Retrieved and ranked chunks |
|
| 731 |
+
| `total_results` | `int` | Number of results returned |
|
| 732 |
+
| `search_time_ms` | `float` | Total search time in milliseconds |
|
| 733 |
+
|
| 734 |
+
#### `QueryRequest`
|
| 735 |
+
|
| 736 |
+
Request body for the `/api/ask` endpoint.
|
| 737 |
+
|
| 738 |
+
| Field | Type | Default | Description |
|
| 739 |
+
|---|---|---|---|
|
| 740 |
+
| `query` | `str` | (required) | Natural language question |
|
| 741 |
+
| `top_k` | `int` | `10` | Number of chunks to retrieve |
|
| 742 |
+
| `rerank_top_k` | `int` | `5` | Number of chunks to keep after reranking |
|
| 743 |
+
| `filters` | `SearchFilters \| None` | `None` | Optional explicit filters |
|
| 744 |
+
| `stream` | `bool` | `False` | Enable Server-Sent Events streaming |
|
| 745 |
+
|
| 746 |
+
#### `GeneratedAnswer`
|
| 747 |
+
|
| 748 |
+
Response from the `/api/ask` endpoint (non-streaming).
|
| 749 |
+
|
| 750 |
+
| Field | Type | Description |
|
| 751 |
+
|---|---|---|
|
| 752 |
+
| `query` | `str` | Original question |
|
| 753 |
+
| `answer` | `str` | Generated markdown answer with inline citations |
|
| 754 |
+
| `sources` | `list[RetrievedChunk]` | Source chunks used for generation |
|
| 755 |
+
| `generation_time_ms` | `float` | Total generation time in milliseconds |
|
| 756 |
+
| `model` | `str` | LLM model name used |
|
| 757 |
+
|
| 758 |
+
#### `AnalyzedQuery`
|
| 759 |
+
|
| 760 |
+
Internal model from the query analyzer (not directly exposed via API).
|
| 761 |
+
|
| 762 |
+
| Field | Type | Default | Description |
|
| 763 |
+
|---|---|---|---|
|
| 764 |
+
| `original_query` | `str` | - | The raw user query |
|
| 765 |
+
| `clean_query` | `str` | - | Query with filter phrases removed |
|
| 766 |
+
| `intent` | `str` | `"factual"` | Classified intent |
|
| 767 |
+
| `extracted_filters` | `SearchFilters` | `{}` | Automatically extracted filters |
|
| 768 |
+
| `confidence` | `float` | `0.5` | Confidence in filter extraction |
|
| 769 |
+
|
| 770 |
+
---
|
| 771 |
+
|
| 772 |
+
## API Reference
|
| 773 |
+
|
| 774 |
+
The FastAPI app automatically generates interactive API documentation at `/docs` (Swagger UI) and `/redoc` (ReDoc).
|
| 775 |
+
|
| 776 |
+
### Health Check
|
| 777 |
+
|
| 778 |
+
```
|
| 779 |
+
GET /health
|
| 780 |
+
```
|
| 781 |
+
|
| 782 |
+
Returns the status of all system components.
|
| 783 |
+
|
| 784 |
+
**Response:**
|
| 785 |
+
|
| 786 |
+
```json
|
| 787 |
+
{
|
| 788 |
+
"status": "ok",
|
| 789 |
+
"components": {
|
| 790 |
+
"embedder": "loaded",
|
| 791 |
+
"bm25": "142 documents",
|
| 792 |
+
"vectorstore": "connected"
|
| 793 |
+
}
|
| 794 |
+
}
|
| 795 |
+
```
|
| 796 |
+
|
| 797 |
+
**curl example:**
|
| 798 |
+
|
| 799 |
+
```bash
|
| 800 |
+
curl http://localhost:7860/health
|
| 801 |
+
```
|
| 802 |
+
|
| 803 |
+
---
|
| 804 |
+
|
| 805 |
+
### Ingest Document
|
| 806 |
+
|
| 807 |
+
```
|
| 808 |
+
POST /api/ingest
|
| 809 |
+
Content-Type: multipart/form-data
|
| 810 |
+
```
|
| 811 |
+
|
| 812 |
+
Uploads and indexes a document. The file is parsed, chunked, embedded, and stored in both the vector database and the BM25 index.
|
| 813 |
+
|
| 814 |
+
**Request:** Multipart form with a `file` field.
|
| 815 |
+
|
| 816 |
+
**Constraints:**
|
| 817 |
+
- Supported extensions: `.pdf`, `.txt`, `.html`, `.htm`
|
| 818 |
+
- Maximum file size: 10 MB (configurable via `MAX_FILE_SIZE_MB`)
|
| 819 |
+
|
| 820 |
+
**Response (200):**
|
| 821 |
+
|
| 822 |
+
```json
|
| 823 |
+
{
|
| 824 |
+
"document_id": "a1b2c3d4-e5f6-7890-abcd-ef1234567890",
|
| 825 |
+
"filename": "report.pdf",
|
| 826 |
+
"num_chunks": 47,
|
| 827 |
+
"message": "Successfully ingested 'report.pdf' with 47 chunks"
|
| 828 |
+
}
|
| 829 |
+
```
|
| 830 |
+
|
| 831 |
+
**Error responses:**
|
| 832 |
+
- `400` -- Missing filename or unsupported file type
|
| 833 |
+
- `413` -- File exceeds maximum size
|
| 834 |
+
- `422` -- Could not extract text from file
|
| 835 |
+
|
| 836 |
+
**curl example:**
|
| 837 |
+
|
| 838 |
+
```bash
|
| 839 |
+
curl -X POST http://localhost:7860/api/ingest \
|
| 840 |
+
-F "file=@/path/to/document.pdf"
|
| 841 |
+
```
|
| 842 |
+
|
| 843 |
+
---
|
| 844 |
+
|
| 845 |
+
### List Documents
|
| 846 |
+
|
| 847 |
+
```
|
| 848 |
+
GET /api/documents
|
| 849 |
+
```
|
| 850 |
+
|
| 851 |
+
Returns all indexed documents with their metadata and chunk counts.
|
| 852 |
+
|
| 853 |
+
**Response (200):**
|
| 854 |
+
|
| 855 |
+
```json
|
| 856 |
+
{
|
| 857 |
+
"documents": [
|
| 858 |
+
{
|
| 859 |
+
"document_id": "a1b2c3d4-e5f6-7890-abcd-ef1234567890",
|
| 860 |
+
"source": "report.pdf",
|
| 861 |
+
"title": "Annual Report 2024",
|
| 862 |
+
"doc_type": "pdf",
|
| 863 |
+
"num_chunks": 47
|
| 864 |
+
}
|
| 865 |
+
],
|
| 866 |
+
"total": 1
|
| 867 |
+
}
|
| 868 |
+
```
|
| 869 |
+
|
| 870 |
+
**curl example:**
|
| 871 |
+
|
| 872 |
+
```bash
|
| 873 |
+
curl http://localhost:7860/api/documents
|
| 874 |
+
```
|
| 875 |
+
|
| 876 |
+
---
|
| 877 |
+
|
| 878 |
+
### Delete Document
|
| 879 |
+
|
| 880 |
+
```
|
| 881 |
+
DELETE /api/documents/{document_id}
|
| 882 |
+
```
|
| 883 |
+
|
| 884 |
+
Removes all chunks for the given document from Qdrant and rebuilds the BM25 index.
|
| 885 |
+
|
| 886 |
+
**Response (200):**
|
| 887 |
+
|
| 888 |
+
```json
|
| 889 |
+
{
|
| 890 |
+
"message": "Document 'a1b2c3d4-e5f6-7890-abcd-ef1234567890' deleted successfully"
|
| 891 |
+
}
|
| 892 |
+
```
|
| 893 |
+
|
| 894 |
+
**curl example:**
|
| 895 |
+
|
| 896 |
+
```bash
|
| 897 |
+
curl -X DELETE http://localhost:7860/api/documents/a1b2c3d4-e5f6-7890-abcd-ef1234567890
|
| 898 |
+
```
|
| 899 |
+
|
| 900 |
+
---
|
| 901 |
+
|
| 902 |
+
### Search (Retrieval Only)
|
| 903 |
+
|
| 904 |
+
```
|
| 905 |
+
POST /api/search
|
| 906 |
+
Content-Type: application/json
|
| 907 |
+
```
|
| 908 |
+
|
| 909 |
+
Performs hybrid retrieval without LLM generation. Useful for inspecting which chunks would be retrieved for a given query.
|
| 910 |
+
|
| 911 |
+
**Request body:**
|
| 912 |
+
|
| 913 |
+
```json
|
| 914 |
+
{
|
| 915 |
+
"query": "What is retrieval-augmented generation?",
|
| 916 |
+
"top_k": 10,
|
| 917 |
+
"filters": {
|
| 918 |
+
"doc_type": "pdf",
|
| 919 |
+
"tags": ["machine learning"]
|
| 920 |
+
}
|
| 921 |
+
}
|
| 922 |
+
```
|
| 923 |
+
|
| 924 |
+
**Response (200):**
|
| 925 |
+
|
| 926 |
+
```json
|
| 927 |
+
{
|
| 928 |
+
"query": "What is retrieval-augmented generation?",
|
| 929 |
+
"results": [
|
| 930 |
+
{
|
| 931 |
+
"chunk_id": "uuid",
|
| 932 |
+
"document_id": "uuid",
|
| 933 |
+
"text": "Retrieval-Augmented Generation (RAG) is...",
|
| 934 |
+
"score": 0.0234,
|
| 935 |
+
"metadata": {
|
| 936 |
+
"source": "report.pdf",
|
| 937 |
+
"doc_type": "pdf",
|
| 938 |
+
"title": "Annual Report",
|
| 939 |
+
"created_date": null,
|
| 940 |
+
"tags": ["machine learning"],
|
| 941 |
+
"page_count": 12
|
| 942 |
+
},
|
| 943 |
+
"rank": 0
|
| 944 |
+
}
|
| 945 |
+
],
|
| 946 |
+
"total_results": 10,
|
| 947 |
+
"search_time_ms": 142.5
|
| 948 |
+
}
|
| 949 |
+
```
|
| 950 |
+
|
| 951 |
+
**curl example:**
|
| 952 |
+
|
| 953 |
+
```bash
|
| 954 |
+
curl -X POST http://localhost:7860/api/search \
|
| 955 |
+
-H "Content-Type: application/json" \
|
| 956 |
+
-d '{"query": "What is RAG?", "top_k": 5}'
|
| 957 |
+
```
|
| 958 |
+
|
| 959 |
+
---
|
| 960 |
+
|
| 961 |
+
### Ask (Full RAG Pipeline)
|
| 962 |
+
|
| 963 |
+
```
|
| 964 |
+
POST /api/ask
|
| 965 |
+
Content-Type: application/json
|
| 966 |
+
```
|
| 967 |
+
|
| 968 |
+
Runs the full pipeline: query analysis, hybrid retrieval, reranking, and LLM answer generation.
|
| 969 |
+
|
| 970 |
+
**Request body:**
|
| 971 |
+
|
| 972 |
+
```json
|
| 973 |
+
{
|
| 974 |
+
"query": "What are the key findings in the report?",
|
| 975 |
+
"top_k": 10,
|
| 976 |
+
"rerank_top_k": 5,
|
| 977 |
+
"filters": null,
|
| 978 |
+
"stream": false
|
| 979 |
+
}
|
| 980 |
+
```
|
| 981 |
+
|
| 982 |
+
**Response (200, non-streaming):**
|
| 983 |
+
|
| 984 |
+
```json
|
| 985 |
+
{
|
| 986 |
+
"query": "What are the key findings in the report?",
|
| 987 |
+
"answer": "Based on the provided documents, the key findings are:\n\n1. **Finding one** [1]...\n2. **Finding two** [2]...",
|
| 988 |
+
"sources": [
|
| 989 |
+
{
|
| 990 |
+
"chunk_id": "uuid",
|
| 991 |
+
"document_id": "uuid",
|
| 992 |
+
"text": "chunk text...",
|
| 993 |
+
"score": 0.892,
|
| 994 |
+
"metadata": { "source": "report.pdf", "..." : "..." },
|
| 995 |
+
"rank": 0
|
| 996 |
+
}
|
| 997 |
+
],
|
| 998 |
+
"generation_time_ms": 3420.5,
|
| 999 |
+
"model": "gemini-2.0-flash"
|
| 1000 |
+
}
|
| 1001 |
+
```
|
| 1002 |
+
|
| 1003 |
+
**Streaming response (`"stream": true`):**
|
| 1004 |
+
|
| 1005 |
+
Returns `text/event-stream` with Server-Sent Events:
|
| 1006 |
+
|
| 1007 |
+
```
|
| 1008 |
+
data: {"text": "Based on"}
|
| 1009 |
+
|
| 1010 |
+
data: {"text": " the provided"}
|
| 1011 |
+
|
| 1012 |
+
data: {"text": " documents..."}
|
| 1013 |
+
|
| 1014 |
+
data: {"done": true, "sources": [...], "model": "gemini-2.0-flash", "time_ms": 3420.5}
|
| 1015 |
+
```
|
| 1016 |
+
|
| 1017 |
+
**curl examples:**
|
| 1018 |
+
|
| 1019 |
+
```bash
|
| 1020 |
+
# Non-streaming
|
| 1021 |
+
curl -X POST http://localhost:7860/api/ask \
|
| 1022 |
+
-H "Content-Type: application/json" \
|
| 1023 |
+
-d '{"query": "Summarize the report", "stream": false}'
|
| 1024 |
+
|
| 1025 |
+
# Streaming
|
| 1026 |
+
curl -X POST http://localhost:7860/api/ask \
|
| 1027 |
+
-H "Content-Type: application/json" \
|
| 1028 |
+
-d '{"query": "What is RAG?", "stream": true}' \
|
| 1029 |
+
--no-buffer
|
| 1030 |
+
```
|
| 1031 |
+
|
| 1032 |
+
---
|
| 1033 |
+
|
| 1034 |
+
## UI Guide
|
| 1035 |
+
|
| 1036 |
+
RagCore includes a Gradio web interface mounted at `/ui` (the root `/` redirects there automatically).
|
| 1037 |
+
|
| 1038 |
+
### Ask Tab
|
| 1039 |
+
|
| 1040 |
+
The primary interaction surface for querying your documents.
|
| 1041 |
+
|
| 1042 |
+
**Components:**
|
| 1043 |
+
|
| 1044 |
+
- **Query input** -- A text box where you type your question in natural language. Supports pressing Enter to submit.
|
| 1045 |
+
- **Document Type filter** -- Dropdown to restrict results to a specific file type: All, PDF, TXT, or HTML.
|
| 1046 |
+
- **Stream response toggle** -- Checkbox (default: on) to enable real-time streaming of the answer as it is generated.
|
| 1047 |
+
- **Ask button** -- Submits the query.
|
| 1048 |
+
- **Answer area** -- Displays the generated answer with markdown formatting, followed by a "Sources" section listing each referenced chunk with its filename, relevance score, and a text snippet.
|
| 1049 |
+
- **Example queries** -- Pre-filled example questions you can click to populate the query input.
|
| 1050 |
+
|
| 1051 |
+
### Documents Tab
|
| 1052 |
+
|
| 1053 |
+
Manages the document collection.
|
| 1054 |
+
|
| 1055 |
+
**Components:**
|
| 1056 |
+
|
| 1057 |
+
- **File upload zone** -- Drag-and-drop or click to select a file (`.pdf`, `.txt`, `.html`, `.htm`).
|
| 1058 |
+
- **Upload & Index button** -- Triggers the ingestion pipeline. Shows a status card with filename, chunk count, and document ID on success.
|
| 1059 |
+
- **Indexed Documents table** -- Displays all ingested documents with their filename, type, chunk count, and truncated document ID. Click "Refresh" to update.
|
| 1060 |
+
- **Delete section** -- Paste a full document ID and click "Delete" to remove a document and all its chunks.
|
| 1061 |
+
|
| 1062 |
+
### Stats Bar
|
| 1063 |
+
|
| 1064 |
+
At the top of every tab, a card shows the current count of indexed documents and total chunks.
|
| 1065 |
+
|
| 1066 |
+
---
|
| 1067 |
+
|
| 1068 |
+
## Setup and Installation
|
| 1069 |
+
|
| 1070 |
+
### Prerequisites
|
| 1071 |
+
|
| 1072 |
+
- Python 3.12 or later
|
| 1073 |
+
- A Qdrant Cloud account (free tier)
|
| 1074 |
+
- A Google AI Studio account (free tier Gemini API key)
|
| 1075 |
+
- (Optional) Docker and Docker Compose
|
| 1076 |
+
|
| 1077 |
+
### Step 1: Get API Keys
|
| 1078 |
+
|
| 1079 |
+
**Qdrant Cloud (vector database):**
|
| 1080 |
+
|
| 1081 |
+
1. Go to [https://cloud.qdrant.io](https://cloud.qdrant.io) and create a free account.
|
| 1082 |
+
2. Create a new cluster (the free tier provides 1 GB of storage).
|
| 1083 |
+
3. Copy the cluster URL (e.g., `https://abc123-xyz.us-east4-0.gcp.cloud.qdrant.io:6333`).
|
| 1084 |
+
4. Generate an API key from the cluster dashboard.
|
| 1085 |
+
|
| 1086 |
+
**Google Gemini (LLM):**
|
| 1087 |
+
|
| 1088 |
+
1. Go to [https://aistudio.google.com/apikey](https://aistudio.google.com/apikey).
|
| 1089 |
+
2. Click "Create API key" and select or create a Google Cloud project.
|
| 1090 |
+
3. Copy the generated API key. The free tier allows 15 requests per minute for Gemini 2.0 Flash.
|
| 1091 |
+
|
| 1092 |
+
### Step 2: Clone and Configure
|
| 1093 |
+
|
| 1094 |
+
```bash
|
| 1095 |
+
git clone <repository-url>
|
| 1096 |
+
cd ragcore
|
| 1097 |
+
```
|
| 1098 |
+
|
| 1099 |
+
Create a `.env` file in the `ragcore/` directory:
|
| 1100 |
+
|
| 1101 |
+
```env
|
| 1102 |
+
# Required
|
| 1103 |
+
GEMINI_API_KEY=your-gemini-api-key-here
|
| 1104 |
+
QDRANT_URL=https://your-cluster.cloud.qdrant.io:6333
|
| 1105 |
+
QDRANT_API_KEY=your-qdrant-api-key-here
|
| 1106 |
+
|
| 1107 |
+
# Optional (these are the defaults)
|
| 1108 |
+
EMBEDDING_MODEL=all-MiniLM-L6-v2
|
| 1109 |
+
EMBEDDING_DIM=384
|
| 1110 |
+
QDRANT_COLLECTION=ragcore_docs
|
| 1111 |
+
CHUNK_SIZE=512
|
| 1112 |
+
CHUNK_OVERLAP=50
|
| 1113 |
+
TOP_K=10
|
| 1114 |
+
RERANK_TOP_K=5
|
| 1115 |
+
DENSE_WEIGHT=0.6
|
| 1116 |
+
SPARSE_WEIGHT=0.4
|
| 1117 |
+
GEMINI_MODEL=gemini-2.0-flash
|
| 1118 |
+
GEMINI_RPM_LIMIT=15
|
| 1119 |
+
GEMINI_TEMPERATURE=0.3
|
| 1120 |
+
GEMINI_MAX_TOKENS=2048
|
| 1121 |
+
LOG_LEVEL=INFO
|
| 1122 |
+
MAX_FILE_SIZE_MB=10
|
| 1123 |
+
```
|
| 1124 |
+
|
| 1125 |
+
### Step 3: Running Locally
|
| 1126 |
+
|
| 1127 |
+
```bash
|
| 1128 |
+
# Create and activate a virtual environment
|
| 1129 |
+
python -m venv .venv
|
| 1130 |
+
source .venv/bin/activate # On Linux/macOS
|
| 1131 |
+
# .venv\Scripts\activate # On Windows
|
| 1132 |
+
|
| 1133 |
+
# Install dependencies
|
| 1134 |
+
pip install -r requirements.txt
|
| 1135 |
+
|
| 1136 |
+
# Start the server
|
| 1137 |
+
uvicorn app.main:app --host 0.0.0.0 --port 7860
|
| 1138 |
+
```
|
| 1139 |
+
|
| 1140 |
+
The first startup will download two ML models (~90 MB for the embedding model, ~50 MB for the reranker). Subsequent startups use cached models.
|
| 1141 |
+
|
| 1142 |
+
Once running:
|
| 1143 |
+
- Web UI: [http://localhost:7860/ui](http://localhost:7860/ui)
|
| 1144 |
+
- API docs: [http://localhost:7860/docs](http://localhost:7860/docs)
|
| 1145 |
+
- Health check: [http://localhost:7860/health](http://localhost:7860/health)
|
| 1146 |
+
|
| 1147 |
+
### Step 4: Running with Docker
|
| 1148 |
+
|
| 1149 |
+
```bash
|
| 1150 |
+
# Build and run
|
| 1151 |
+
docker compose up --build
|
| 1152 |
+
|
| 1153 |
+
# Or build and run in detached mode
|
| 1154 |
+
docker compose up --build -d
|
| 1155 |
+
```
|
| 1156 |
+
|
| 1157 |
+
The Docker build pre-downloads both ML models into the image layer, so container startup is faster. The app is exposed on port 8000 (mapped from container port 7860).
|
| 1158 |
+
|
| 1159 |
+
Once running: [http://localhost:8000/ui](http://localhost:8000/ui)
|
| 1160 |
+
|
| 1161 |
+
---
|
| 1162 |
+
|
| 1163 |
+
## Deployment
|
| 1164 |
+
|
| 1165 |
+
### Deploying to HuggingFace Spaces
|
| 1166 |
+
|
| 1167 |
+
HuggingFace Spaces provides free hosting for Gradio and Docker-based applications. RagCore is pre-configured for deployment there.
|
| 1168 |
+
|
| 1169 |
+
**Step-by-step:**
|
| 1170 |
+
|
| 1171 |
+
1. **Create a HuggingFace account** at [https://huggingface.co](https://huggingface.co) if you do not have one.
|
| 1172 |
+
|
| 1173 |
+
2. **Create a new Space:**
|
| 1174 |
+
- Go to [https://huggingface.co/new-space](https://huggingface.co/new-space).
|
| 1175 |
+
- Choose a name (e.g., `ragcore`).
|
| 1176 |
+
- Select **Docker** as the SDK.
|
| 1177 |
+
- Choose the **Free** CPU basic tier.
|
| 1178 |
+
- Click "Create Space".
|
| 1179 |
+
|
| 1180 |
+
3. **Configure secrets:**
|
| 1181 |
+
- Go to your Space's Settings > Repository secrets.
|
| 1182 |
+
- Add the following secrets:
|
| 1183 |
+
- `GEMINI_API_KEY` -- your Google Gemini API key
|
| 1184 |
+
- `QDRANT_URL` -- your Qdrant Cloud cluster URL
|
| 1185 |
+
- `QDRANT_API_KEY` -- your Qdrant Cloud API key
|
| 1186 |
+
|
| 1187 |
+
4. **Push the code:**
|
| 1188 |
+
|
| 1189 |
+
```bash
|
| 1190 |
+
cd ragcore
|
| 1191 |
+
git remote add space https://huggingface.co/spaces/YOUR_USERNAME/ragcore
|
| 1192 |
+
git push space main
|
| 1193 |
+
```
|
| 1194 |
+
|
| 1195 |
+
Alternatively, upload files via the HuggingFace web interface.
|
| 1196 |
+
|
| 1197 |
+
5. **Wait for the build** -- the Docker image will be built on HuggingFace's infrastructure. The first build takes 5-10 minutes due to model downloads. The Space will show "Running" when ready.
|
| 1198 |
+
|
| 1199 |
+
6. **Access your app** at `https://YOUR_USERNAME-ragcore.hf.space`.
|
| 1200 |
+
|
| 1201 |
+
**Important notes:**
|
| 1202 |
+
- HuggingFace Spaces exposes port 7860 by default, which matches the Dockerfile's `EXPOSE 7860`.
|
| 1203 |
+
- The free tier has 2 vCPU and 16 GB RAM, which is sufficient for RagCore.
|
| 1204 |
+
- Spaces may sleep after inactivity. The first request after sleep triggers a cold start (30-60 seconds).
|
| 1205 |
+
|
| 1206 |
+
---
|
| 1207 |
+
|
| 1208 |
+
## Configuration Reference
|
| 1209 |
+
|
| 1210 |
+
All settings are managed via environment variables, loaded from a `.env` file by `pydantic-settings`.
|
| 1211 |
+
|
| 1212 |
+
| Variable | Type | Default | Description |
|
| 1213 |
+
|---|---|---|---|
|
| 1214 |
+
| `GEMINI_API_KEY` | string | `""` | **Required.** Google Gemini API key for LLM generation. |
|
| 1215 |
+
| `QDRANT_URL` | string | `""` | **Required.** Full URL of the Qdrant Cloud cluster (including port). |
|
| 1216 |
+
| `QDRANT_API_KEY` | string | `""` | **Required.** Qdrant Cloud API key for authentication. |
|
| 1217 |
+
| `EMBEDDING_MODEL` | string | `all-MiniLM-L6-v2` | HuggingFace model name for sentence-transformers. |
|
| 1218 |
+
| `EMBEDDING_DIM` | integer | `384` | Dimensionality of the embedding vectors. Must match the model. |
|
| 1219 |
+
| `QDRANT_COLLECTION` | string | `ragcore_docs` | Name of the Qdrant collection to use. Created automatically if missing. |
|
| 1220 |
+
| `CHUNK_SIZE` | integer | `512` | Maximum number of words per text chunk. |
|
| 1221 |
+
| `CHUNK_OVERLAP` | integer | `50` | Number of words overlapping between consecutive chunks. |
|
| 1222 |
+
| `TOP_K` | integer | `10` | Number of chunks retrieved by the hybrid retriever. |
|
| 1223 |
+
| `RERANK_TOP_K` | integer | `5` | Number of chunks kept after cross-encoder reranking. |
|
| 1224 |
+
| `DENSE_WEIGHT` | float | `0.6` | Weight for dense (vector) search in RRF fusion. Range: 0.0-1.0. |
|
| 1225 |
+
| `SPARSE_WEIGHT` | float | `0.4` | Weight for sparse (BM25) search in RRF fusion. Range: 0.0-1.0. |
|
| 1226 |
+
| `GEMINI_MODEL` | string | `gemini-2.0-flash` | Gemini model identifier. |
|
| 1227 |
+
| `GEMINI_RPM_LIMIT` | integer | `15` | Maximum requests per minute to the Gemini API. |
|
| 1228 |
+
| `GEMINI_TEMPERATURE` | float | `0.3` | LLM generation temperature. Lower values produce more deterministic output. |
|
| 1229 |
+
| `GEMINI_MAX_TOKENS` | integer | `2048` | Maximum number of output tokens per LLM generation. |
|
| 1230 |
+
| `LOG_LEVEL` | string | `INFO` | Logging level. Valid values: DEBUG, INFO, WARNING, ERROR, CRITICAL. |
|
| 1231 |
+
| `MAX_FILE_SIZE_MB` | integer | `10` | Maximum allowed file size for upload in megabytes. |
|
| 1232 |
+
|
| 1233 |
+
---
|
| 1234 |
+
|
| 1235 |
+
## How It Works End-to-End
|
| 1236 |
+
|
| 1237 |
+
This section traces a complete user interaction: uploading a PDF and then asking a question about it.
|
| 1238 |
+
|
| 1239 |
+
### Phase 1: Document Ingestion
|
| 1240 |
+
|
| 1241 |
+
**User action:** Uploads `annual-report-2024.pdf` (2.1 MB, 45 pages) via the Gradio Documents tab.
|
| 1242 |
+
|
| 1243 |
+
1. The Gradio UI reads the file and sends it as a multipart POST to `http://localhost:7860/api/ingest`.
|
| 1244 |
+
|
| 1245 |
+
2. **Validation** (`ingest.py`):
|
| 1246 |
+
- Filename is checked: extension `.pdf` is in `SUPPORTED_EXTENSIONS`.
|
| 1247 |
+
- File size 2.1 MB is under the 10 MB limit.
|
| 1248 |
+
|
| 1249 |
+
3. **Parsing** (`parsers.py`):
|
| 1250 |
+
- `parse_pdf()` creates a `PdfReader` from the bytes.
|
| 1251 |
+
- Iterates over all 45 pages, extracting text from each.
|
| 1252 |
+
- Joins page texts with double newlines.
|
| 1253 |
+
- `clean_text()` normalizes whitespace: collapses 3+ consecutive newlines to 2, collapses horizontal whitespace to single spaces, trims each line.
|
| 1254 |
+
- Result: ~85,000 characters of cleaned text.
|
| 1255 |
+
|
| 1256 |
+
4. **Metadata extraction** (`metadata.py`):
|
| 1257 |
+
- `extract_title()` returns `"Annual Report 2024 - Acme Corporation"` (first meaningful line).
|
| 1258 |
+
- `extract_dates()` finds `"2024-03-15"` in the first 2000 chars, parses it to `datetime(2024, 3, 15)`.
|
| 1259 |
+
- `extract_tags()` finds frequent capitalized phrases: `["acme corporation", "revenue growth", "machine learning", ...]`.
|
| 1260 |
+
- `get_page_count()` returns `45`.
|
| 1261 |
+
- Final `DocumentMetadata`: source="annual-report-2024.pdf", doc_type="pdf", title="Annual Report 2024 - Acme Corporation", created_date=2024-03-15, tags=[...], page_count=45.
|
| 1262 |
+
|
| 1263 |
+
5. **Chunking** (`chunker.py`):
|
| 1264 |
+
- Splits the ~85,000 chars into sentences via `(?<=[.!?])\s+`.
|
| 1265 |
+
- Accumulates sentences until the word count exceeds 512.
|
| 1266 |
+
- Produces ~32 chunks, each with 50-word overlap with the next.
|
| 1267 |
+
- Each chunk records start_char, end_char, and chunk_index.
|
| 1268 |
+
|
| 1269 |
+
6. **Embedding** (`embedder.py`):
|
| 1270 |
+
- `embed_texts()` encodes all 32 chunk texts in a single batch (batch_size=64).
|
| 1271 |
+
- Returns 32 vectors, each of dimension 384, L2-normalized.
|
| 1272 |
+
|
| 1273 |
+
7. **Vector storage** (`vectorstore.py`):
|
| 1274 |
+
- `upsert_chunks()` creates 32 `PointStruct` objects with the vectors and payload.
|
| 1275 |
+
- Since 32 < 100, they are uploaded in a single batch.
|
| 1276 |
+
- Each point's payload includes text, document_id, chunk_index, source, doc_type, title, created_date, tags, page_count.
|
| 1277 |
+
|
| 1278 |
+
8. **BM25 indexing** (`bm25.py`):
|
| 1279 |
+
- `add_documents()` tokenizes each chunk (lowercase, remove stop words, remove single chars).
|
| 1280 |
+
- Appends to the document list and rebuilds the full BM25Okapi index.
|
| 1281 |
+
|
| 1282 |
+
9. **Response:** Returns `IngestResponse` with document_id, filename, num_chunks=32, and success message.
|
| 1283 |
+
|
| 1284 |
+
### Phase 2: Querying
|
| 1285 |
+
|
| 1286 |
+
**User action:** Types `"What was the revenue growth last year from PDFs?"` in the Ask tab with streaming enabled.
|
| 1287 |
+
|
| 1288 |
+
1. The Gradio UI sends a POST to `http://localhost:7860/api/ask` with:
|
| 1289 |
+
```json
|
| 1290 |
+
{"query": "What was the revenue growth last year from PDFs?", "top_k": 10, "rerank_top_k": 5, "stream": true, "filters": {"doc_type": "pdf"}}
|
| 1291 |
+
```
|
| 1292 |
+
(Note: the UI sets `doc_type` filter from the dropdown if not "All".)
|
| 1293 |
+
|
| 1294 |
+
2. **Query analysis** (`query_analyzer.py`):
|
| 1295 |
+
- Doc type extraction: matches "PDFs" -> `filters.doc_type = "pdf"`.
|
| 1296 |
+
- Date extraction: matches "last year" -> `filters.date_from = 2025-03-17`, `filters.date_to = 2026-03-17`.
|
| 1297 |
+
- Clean query: removes "last year" and "PDFs" -> `"What was the revenue growth"`.
|
| 1298 |
+
- Intent: matches `^(?:what|...)` -> `"factual"`.
|
| 1299 |
+
- Confidence: 0.5 + 0.1 (doc_type) + 0.1 (date) = 0.7.
|
| 1300 |
+
|
| 1301 |
+
3. **Hybrid retrieval** (`retriever.py`):
|
| 1302 |
+
- Embeds the clean query `"What was the revenue growth"` to a 384-dim vector.
|
| 1303 |
+
- **Dense search:** Queries Qdrant with the vector, limit=20 (top_k * 2), with filters for doc_type="pdf" and date range. Returns up to 20 results ranked by cosine similarity (fewer if the filters match fewer chunks).
|
| 1304 |
+
- **Sparse search:** Tokenizes query to `["what", "revenue", "growth"]` (stop words removed), scores all BM25 documents, returns top 20 by BM25 score. Post-filters by doc_type="pdf".
|
| 1305 |
+
- **RRF fusion:** For each chunk, computes `score = 0.6 * 1/(60+dense_rank) + 0.4 * 1/(60+sparse_rank)`. Chunks appearing in both lists get boosted scores.
|
| 1306 |
+
- Deduplicates by chunk_id, takes top 10.
|
| 1307 |
+
|
| 1308 |
+
4. **Reranking** (`reranker.py`):
|
| 1309 |
+
- Creates passage pairs: (query, chunk_text) for all 10 retrieved chunks.
|
| 1310 |
+
- The FlashRank cross-encoder scores each pair jointly.
|
| 1311 |
+
- Returns the top 5 by cross-encoder score, with updated scores and ranks.
|
| 1312 |
+
|
| 1313 |
+
5. **Answer generation** (`generator.py`):
|
| 1314 |
+
- Builds context with numbered passages:
|
| 1315 |
+
```
|
| 1316 |
+
[1] (Source: annual-report-2024.pdf)
|
| 1317 |
+
Revenue increased by 23% year-over-year...
|
| 1318 |
+
|
| 1319 |
+
[2] (Source: annual-report-2024.pdf)
|
| 1320 |
+
The growth was primarily driven by...
|
| 1321 |
+
```
|
| 1322 |
+
- Constructs the SYSTEM_PROMPT with context and query.
|
| 1323 |
+
- Calls `llm.generate_stream()` which respects the rate limit, then yields text chunks.
|
| 1324 |
+
|
| 1325 |
+
6. **Streaming response** (`query.py`):
|
| 1326 |
+
- Each text chunk from Gemini is wrapped as `data: {"text": "..."}\n\n` (SSE format).
|
| 1327 |
+
- The Gradio UI accumulates text and renders it progressively in the answer area.
|
| 1328 |
+
- Final SSE event includes `{"done": true, "sources": [...], "model": "gemini-2.0-flash", "time_ms": 3420}`.
|
| 1329 |
+
- Gradio formats the sources as styled cards showing filename, score, and snippet.
|
| 1330 |
+
|
| 1331 |
+
---
|
| 1332 |
+
|
| 1333 |
+
## Testing
|
| 1334 |
+
|
| 1335 |
+
### Running Tests
|
| 1336 |
+
|
| 1337 |
+
```bash
|
| 1338 |
+
# Run all unit tests (excluding integration tests)
|
| 1339 |
+
pytest tests/ -v --ignore=tests/test_integration.py -x
|
| 1340 |
+
|
| 1341 |
+
# Run a specific test file
|
| 1342 |
+
pytest tests/test_chunker.py -v
|
| 1343 |
+
|
| 1344 |
+
# Run with coverage (install pytest-cov first)
|
| 1345 |
+
pytest tests/ -v --ignore=tests/test_integration.py --cov=app
|
| 1346 |
+
```
|
| 1347 |
+
|
| 1348 |
+
### Test Coverage
|
| 1349 |
+
|
| 1350 |
+
| Test File | Module Under Test | What Is Tested |
|
| 1351 |
+
|---|---|---|
|
| 1352 |
+
| `test_chunker.py` | `app.core.chunker` | Empty input, single sentence, multiple chunks, overlap behavior, chunk size limits |
|
| 1353 |
+
| `test_parsers.py` | `app.utils.parsers` | UTF-8 text, Latin-1 fallback, HTML tag stripping, unsupported extensions, empty files, extension-based dispatch |
|
| 1354 |
+
| `test_query_analyzer.py` | `app.core.query_analyzer` | Intent classification (factual, comparative, summarize, explanatory), doc type extraction, date extraction, clean query preservation |
|
| 1355 |
+
| `test_retrieval.py` | `app.core.retriever` | RRF fusion (basic, empty lists, single list, weighted), metadata filter application |
|
| 1356 |
+
| `test_api.py` | `app.main` (FastAPI) | Health endpoint returns 200 with components, root redirects to `/ui`, `/docs` page loads |
|
| 1357 |
+
|
| 1358 |
+
### Test Fixtures
|
| 1359 |
+
|
| 1360 |
+
Defined in `tests/conftest.py`:
|
| 1361 |
+
- `client` -- A `FastAPI TestClient` instance for API testing.
|
| 1362 |
+
- `sample_text` -- A paragraph about RAG for use in unit tests.
|
| 1363 |
+
|
| 1364 |
+
**Note:** Unit tests mock or avoid external dependencies (Qdrant, Gemini). The CI pipeline sets dummy API keys via environment variables. Integration tests (if present in `tests/test_integration.py`) are excluded from the default test run.
|
| 1365 |
+
|
| 1366 |
+
---
|
| 1367 |
+
|
| 1368 |
+
## CI/CD
|
| 1369 |
+
|
| 1370 |
+
### GitHub Actions Pipeline (`.github/workflows/ci.yml`)
|
| 1371 |
+
|
| 1372 |
+
The CI pipeline runs on every push to `main` and on every pull request targeting `main`.
|
| 1373 |
+
|
| 1374 |
+
**Pipeline steps:**
|
| 1375 |
+
|
| 1376 |
+
| Step | Description |
|
| 1377 |
+
|---|---|
|
| 1378 |
+
| Checkout | Clones the repository using `actions/checkout@v4` |
|
| 1379 |
+
| Set up Python | Installs Python 3.12 via `actions/setup-python@v5` |
|
| 1380 |
+
| Install dependencies | Runs `pip install -r requirements.txt` |
|
| 1381 |
+
| Lint | Runs `ruff check .` for code style and quality |
|
| 1382 |
+
| Unit tests | Runs `pytest tests/ -v --ignore=tests/test_integration.py -x` |
|
| 1383 |
+
|
| 1384 |
+
**Environment variables set during testing:**
|
| 1385 |
+
|
| 1386 |
+
```yaml
|
| 1387 |
+
env:
|
| 1388 |
+
GEMINI_API_KEY: "test"
|
| 1389 |
+
QDRANT_URL: "http://localhost:6333"
|
| 1390 |
+
QDRANT_API_KEY: "test"
|
| 1391 |
+
```
|
| 1392 |
+
|
| 1393 |
+
These are dummy values that allow the application to initialize its settings without connecting to real services. Tests that would require live connections are either mocked or skipped.
|
| 1394 |
+
|
| 1395 |
+
The `-x` flag causes pytest to stop on the first failure for faster feedback.
|
| 1396 |
+
|
| 1397 |
+
---
|
| 1398 |
+
|
| 1399 |
+
## Performance and Limits
|
| 1400 |
+
|
| 1401 |
+
### Free Tier Limits
|
| 1402 |
+
|
| 1403 |
+
| Service | Limit | Impact |
|
| 1404 |
+
|---|---|---|
|
| 1405 |
+
| **Qdrant Cloud** (free tier) | 1 GB storage | Approximately 500,000-700,000 chunks at 384 dimensions. More than sufficient for thousands of documents. |
|
| 1406 |
+
| **Google Gemini** (free tier) | 15 requests per minute | RagCore enforces this with built-in rate limiting (4-second minimum interval between calls). Each question costs 1 API call. |
|
| 1407 |
+
| **HuggingFace Spaces** (free tier) | 2 vCPU, 16 GB RAM | Sufficient for running the embedding model, reranker, and BM25 index concurrently. |
|
| 1408 |
+
|
| 1409 |
+
### Expected Latency
|
| 1410 |
+
|
| 1411 |
+
| Operation | Typical Latency | Notes |
|
| 1412 |
+
|---|---|---|
|
| 1413 |
+
| Document ingestion (10-page PDF) | 3-8 seconds | Dominated by embedding time on CPU |
|
| 1414 |
+
| Document ingestion (50-page PDF) | 10-20 seconds | Linear with number of chunks |
|
| 1415 |
+
| Query (hybrid retrieval only) | 100-300 ms | Embedding + Qdrant + BM25 + RRF |
|
| 1416 |
+
| Query (full RAG with answer) | 3-8 seconds | Dominated by Gemini API call |
|
| 1417 |
+
| Query (streaming, time to first token) | 1-3 seconds | Reranking + Gemini startup |
|
| 1418 |
+
| BM25 rebuild on startup | 50-500 ms | Depends on collection size (scrolls all points from Qdrant) |
|
| 1419 |
+
| Embedding model cold load | 2-5 seconds | First request only; cached thereafter |
|
| 1420 |
+
| Reranker model cold load | 1-3 seconds | First request only; cached thereafter |
|
| 1421 |
+
|
| 1422 |
+
### Capacity Guidelines
|
| 1423 |
+
|
| 1424 |
+
- **Small deployment** (< 100 documents, < 5,000 chunks): Everything runs comfortably within free tiers.
|
| 1425 |
+
- **Medium deployment** (100-1,000 documents, 5,000-50,000 chunks): BM25 index may use 50-500 MB RAM. Qdrant free tier still has ample space.
|
| 1426 |
+
- **Large deployment** (> 1,000 documents): Consider upgrading Qdrant to a paid tier and running the embedder on GPU for faster ingestion.
|
| 1427 |
+
|
| 1428 |
+
---
|
| 1429 |
+
|
| 1430 |
+
## Troubleshooting
|
| 1431 |
+
|
| 1432 |
+
### Common Errors and Fixes
|
| 1433 |
+
|
| 1434 |
+
**Error: `"Unsupported file type '.docx'"` or similar**
|
| 1435 |
+
|
| 1436 |
+
Only PDF, TXT, and HTML files are supported. Convert other formats to one of these before uploading. For DOCX files, export to PDF from your word processor.
|
| 1437 |
+
|
| 1438 |
+
---
|
| 1439 |
+
|
| 1440 |
+
**Error: `"File too large. Maximum size is 10MB"`**
|
| 1441 |
+
|
| 1442 |
+
Increase the limit by setting `MAX_FILE_SIZE_MB` in your `.env` file, or split the file into smaller parts.
|
| 1443 |
+
|
| 1444 |
+
---
|
| 1445 |
+
|
| 1446 |
+
**Error: `"Could not extract text from file"`**
|
| 1447 |
+
|
| 1448 |
+
The PDF may be image-based (scanned document) without an embedded text layer. pypdf cannot extract text from images. Use an OCR tool (e.g., Tesseract) to add a text layer first.
|
| 1449 |
+
|
| 1450 |
+
---
|
| 1451 |
+
|
| 1452 |
+
**Error: Qdrant connection timeout or `"Connection refused"`**
|
| 1453 |
+
|
| 1454 |
+
- Verify your `QDRANT_URL` includes the port (typically `:6333`).
|
| 1455 |
+
- Verify your `QDRANT_API_KEY` is correct.
|
| 1456 |
+
- Check that your Qdrant Cloud cluster is active (free clusters may be paused after inactivity).
|
| 1457 |
+
|
| 1458 |
+
---
|
| 1459 |
+
|
| 1460 |
+
**Error: `"Gemini generation failed"` or `"429 Too Many Requests"`**
|
| 1461 |
+
|
| 1462 |
+
You have exceeded the Gemini API rate limit. RagCore has built-in rate limiting, but if multiple users are sharing the same API key, collisions can occur. Solutions:
|
| 1463 |
+
- Wait a few seconds and retry.
|
| 1464 |
+
- Reduce `GEMINI_RPM_LIMIT` to add more buffer between calls.
|
| 1465 |
+
- Upgrade to a paid Gemini plan for higher limits.
|
| 1466 |
+
|
| 1467 |
+
---
|
| 1468 |
+
|
| 1469 |
+
**Warning: `"Embedder initialization deferred"`**
|
| 1470 |
+
|
| 1471 |
+
This warning during startup means the embedding model could not be loaded immediately. This usually resolves on the first request. If it persists:
|
| 1472 |
+
- Check internet connectivity (the model needs to be downloaded on first use).
|
| 1473 |
+
- Ensure sufficient disk space (~200 MB for cached models).
|
| 1474 |
+
- Check if the `EMBEDDING_MODEL` name is correct.
|
| 1475 |
+
|
| 1476 |
+
---
|
| 1477 |
+
|
| 1478 |
+
**BM25 index shows 0 documents after restart**
|
| 1479 |
+
|
| 1480 |
+
This is expected on first startup with a fresh Qdrant collection. The BM25 index rebuilds from Qdrant on startup. If Qdrant has data but BM25 shows 0, check the Qdrant connection settings.
|
| 1481 |
+
|
| 1482 |
+
---
|
| 1483 |
+
|
| 1484 |
+
**Gradio UI not loading or showing "Connecting..."**
|
| 1485 |
+
|
| 1486 |
+
- Ensure the server is running on port 7860 (or whichever port you configured).
|
| 1487 |
+
- The Gradio UI communicates with the API via `http://localhost:7860`. If running in Docker, this internal URL is correct. If running behind a reverse proxy, the UI may need adjustment.
|
| 1488 |
+
|
| 1489 |
+
---
|
| 1490 |
+
|
| 1491 |
+
**Slow first request after startup**
|
| 1492 |
+
|
| 1493 |
+
The first request triggers lazy loading of the reranker model. This is a one-time cost of 1-3 seconds. Subsequent requests are fast.
|
| 1494 |
+
|
| 1495 |
+
---
|
| 1496 |
+
|
| 1497 |
+
**Docker build fails at model download step**
|
| 1498 |
+
|
| 1499 |
+
The Dockerfile pre-downloads ML models during build. This requires internet access during `docker build`. If building behind a corporate proxy, configure Docker's proxy settings. If the download fails, the build will fail. Retrying usually resolves transient network issues.
|
app/__init__.py
ADDED
|
File without changes
|
app/api/__init__.py
ADDED
|
File without changes
|
app/api/deps.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from functools import lru_cache
|
| 3 |
+
|
| 4 |
+
from app.core.bm25 import BM25Index, get_bm25
|
| 5 |
+
from app.core.embedder import EmbedderService, get_embedder
|
| 6 |
+
from app.core.generator import AnswerGenerator
|
| 7 |
+
from app.core.llm import GeminiService, get_llm
|
| 8 |
+
from app.core.query_analyzer import QueryAnalyzer
|
| 9 |
+
from app.core.reranker import RerankerService, get_reranker
|
| 10 |
+
from app.core.retriever import HybridRetriever
|
| 11 |
+
from app.core.vectorstore import VectorStoreService, get_vectorstore
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def dep_embedder() -> EmbedderService:
    """Dependency provider: hand back whatever ``get_embedder()`` returns."""
    embedder = get_embedder()
    return embedder
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def dep_vectorstore() -> VectorStoreService:
    """Dependency provider: hand back whatever ``get_vectorstore()`` returns."""
    store = get_vectorstore()
    return store
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def dep_bm25() -> BM25Index:
    """Dependency provider: hand back whatever ``get_bm25()`` returns."""
    index = get_bm25()
    return index
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def dep_reranker() -> RerankerService:
    """Dependency provider: hand back whatever ``get_reranker()`` returns."""
    reranker = get_reranker()
    return reranker
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def dep_llm() -> GeminiService:
    """Dependency provider: hand back whatever ``get_llm()`` returns."""
    llm = get_llm()
    return llm
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
@lru_cache
def dep_query_analyzer() -> QueryAnalyzer:
    """Dependency provider for the query analyzer.

    The function takes no arguments, so ``lru_cache`` builds the
    ``QueryAnalyzer`` on the first call and returns that same instance on
    every later call.
    """
    analyzer = QueryAnalyzer()
    return analyzer
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def dep_retriever() -> HybridRetriever:
    """Dependency provider: build a ``HybridRetriever`` for the current call.

    A new retriever object is constructed on every invocation; it is wired
    with the vector store, BM25 index and embedder returned by their
    respective ``get_*`` accessors.
    """
    # Resolve collaborators in the same order the constructor receives them.
    store = get_vectorstore()
    sparse_index = get_bm25()
    embedder = get_embedder()
    return HybridRetriever(vectorstore=store, bm25=sparse_index, embedder=embedder)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def dep_generator() -> AnswerGenerator:
    """Dependency provider: build an ``AnswerGenerator`` for the current call.

    A new generator object is constructed on every invocation; it is wired
    with the LLM and reranker returned by ``get_llm()`` and ``get_reranker()``.
    """
    llm = get_llm()
    reranker = get_reranker()
    return AnswerGenerator(llm=llm, reranker=reranker)
|
app/api/routes/__init__.py
ADDED
|
File without changes
|
app/api/routes/health.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter
|
| 2 |
+
|
| 3 |
+
router = APIRouter(tags=["health"])
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
@router.get("/health")
async def health_check():
    """Report the state of the core service components.

    Returns:
        A dict with an overall ``"status"`` (``"ok"`` or ``"degraded"``) and a
        ``"components"`` mapping with one human-readable entry each for
        ``"embedder"``, ``"bm25"``, ``"vectorstore"`` and ``"llm"``.

    Each component is probed in its own ``try`` block so a failure in one
    probe does not prevent the others from being reported. Only a vector
    store failure downgrades the overall status to ``"degraded"``.
    """
    status = {"status": "ok", "components": {}}

    # The module-level instances are imported at call time so every request
    # reads the current value of the name rather than a stale snapshot.
    # Presence is tested with ``is not None``: an initialized object that
    # happens to be falsy (e.g. an empty container-like index) must still be
    # reported as present.
    try:
        from app.core.embedder import _embedder
        status["components"]["embedder"] = "loaded" if _embedder is not None else "not loaded"
    except Exception:
        status["components"]["embedder"] = "error"

    try:
        from app.core.bm25 import _bm25
        if _bm25 is not None:
            status["components"]["bm25"] = f"{_bm25.doc_count} documents"
        else:
            status["components"]["bm25"] = "not initialized"
    except Exception:
        status["components"]["bm25"] = "error"

    try:
        from app.core.vectorstore import _vectorstore
        if _vectorstore is not None:
            count = _vectorstore.count()
            status["components"]["vectorstore"] = f"connected ({count} points)"
        else:
            status["components"]["vectorstore"] = "not connected"
    except Exception as e:
        # A failing vector store is the one condition that marks the whole
        # service as degraded.
        status["components"]["vectorstore"] = f"error: {e}"
        status["status"] = "degraded"

    try:
        from app.core.llm import _llm
        status["components"]["llm"] = f"ready ({_llm.model_name})" if _llm is not None else "not initialized"
    except Exception:
        status["components"]["llm"] = "error"

    return status
|
app/api/routes/ingest.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
|
| 3 |
+
from fastapi import APIRouter, Depends, HTTPException, UploadFile
|
| 4 |
+
|
| 5 |
+
from app.api.deps import dep_bm25, dep_embedder, dep_vectorstore
|
| 6 |
+
from app.config import get_settings
|
| 7 |
+
from app.core.bm25 import BM25Index
|
| 8 |
+
from app.core.chunker import chunk_text
|
| 9 |
+
from app.core.embedder import EmbedderService
|
| 10 |
+
from app.core.metadata import extract_metadata
|
| 11 |
+
from app.core.vectorstore import VectorStoreService
|
| 12 |
+
from app.models.document import Chunk, Document, DocumentMetadata
|
| 13 |
+
from app.models.schemas import IngestResponse
|
| 14 |
+
from app.utils.helpers import generate_id
|
| 15 |
+
from app.utils.parsers import SUPPORTED_EXTENSIONS, get_page_count, parse_document
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
router = APIRouter(prefix="/api", tags=["ingest"])
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@router.post("/ingest", response_model=IngestResponse)
async def ingest_document(
    file: UploadFile,
    vectorstore: VectorStoreService = Depends(dep_vectorstore),
    embedder: EmbedderService = Depends(dep_embedder),
    bm25: BM25Index = Depends(dep_bm25),
):
    """Parse, chunk, embed and index one uploaded document.

    Raises:
        HTTPException: 400 for a missing filename or unsupported extension,
            413 when the upload exceeds ``max_file_size_mb``, 409 when a
            document with the same filename is already indexed, 422 when no
            text/chunks can be extracted, 500 when embedding or the vector
            store upsert fails.
    """
    settings = get_settings()

    # Validate file extension
    if not file.filename:
        raise HTTPException(status_code=400, detail="Filename is required")

    ext = "." + file.filename.rsplit(".", 1)[-1].lower() if "." in file.filename else ""
    if ext not in SUPPORTED_EXTENSIONS:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file type '{ext}'. Supported: {', '.join(SUPPORTED_EXTENSIONS)}",
        )

    # Read at most one byte past the limit: enough to detect an oversized
    # upload without copying the whole payload into this process first.
    max_size = settings.max_file_size_mb * 1024 * 1024
    file_bytes = await file.read(max_size + 1)
    if len(file_bytes) > max_size:
        raise HTTPException(
            status_code=413,
            detail=f"File too large. Maximum size is {settings.max_file_size_mb}MB",
        )

    # Check for duplicate document (same filename already indexed)
    for doc in vectorstore.get_document_ids():
        if doc.get("source") == file.filename:
            raise HTTPException(
                status_code=409,
                detail=f"Document '{file.filename}' is already indexed (ID: {doc['document_id'][:12]}...). "
                f"Delete it first if you want to re-upload.",
            )

    # Parse document
    try:
        raw_text = parse_document(file_bytes, file.filename)
    except Exception as e:
        logger.error(f"Failed to parse '{file.filename}': {e}", exc_info=True)
        raise HTTPException(status_code=422, detail=f"Failed to parse file: {e}") from e

    if not raw_text or not raw_text.strip():
        raise HTTPException(status_code=422, detail="Could not extract text from file")

    # Extract metadata
    page_count = get_page_count(file_bytes, file.filename)
    metadata = extract_metadata(raw_text, file.filename, page_count=page_count)

    # Create document
    document_id = generate_id()

    # Chunk text
    chunk_dicts = chunk_text(
        raw_text,
        chunk_size=settings.chunk_size,
        chunk_overlap=settings.chunk_overlap,
    )
    if not chunk_dicts:
        raise HTTPException(status_code=422, detail="Document produced no text chunks")

    chunks = [
        Chunk(
            chunk_id=generate_id(),
            document_id=document_id,
            text=c["text"],
            metadata=metadata,
            chunk_index=c["chunk_index"],
            start_char=c["start_char"],
            end_char=c["end_char"],
        )
        for c in chunk_dicts
    ]

    # Embed chunks
    try:
        embeddings = embedder.embed_texts([c.text for c in chunks])
    except Exception as e:
        logger.error(f"Embedding failed for '{file.filename}': {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Embedding failed: {e}") from e

    # Store in Qdrant
    try:
        vectorstore.upsert_chunks(chunks, embeddings)
    except Exception as e:
        logger.error(f"Vector store upsert failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Failed to store document: {e}") from e

    # Add to BM25 index
    # NOTE(review): a failure here leaves the vector store and the BM25 index
    # out of sync for this document — confirm whether a rollback is wanted.
    bm25.add_documents(chunks)

    logger.info(f"Ingested '{file.filename}': {len(chunks)} chunks")

    return IngestResponse(
        document_id=document_id,
        filename=file.filename,
        num_chunks=len(chunks),
        message=f"Successfully ingested '{file.filename}' with {len(chunks)} chunks",
    )
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
@router.get("/documents")
async def list_documents(
    vectorstore: VectorStoreService = Depends(dep_vectorstore),
):
    """Return every indexed document plus a total count.

    Raises:
        HTTPException: 500 when the vector store lookup fails.
    """
    try:
        docs = vectorstore.get_document_ids()
        return {"documents": docs, "total": len(docs)}
    except Exception as e:
        # Log the traceback (as the query routes do) and keep the cause chained.
        logger.error(f"Failed to list documents: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Failed to list documents: {e}") from e
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
@router.delete("/documents/{document_id}")
async def delete_document(
    document_id: str,
    vectorstore: VectorStoreService = Depends(dep_vectorstore),
    bm25: BM25Index = Depends(dep_bm25),
):
    """Delete a document from the vector store and rebuild the BM25 index.

    Raises:
        HTTPException: 500 when deletion or the BM25 rebuild fails.
    """
    # NOTE(review): nothing here checks that ``document_id`` exists, so an
    # unknown ID presumably also reports success — confirm against
    # VectorStoreService.delete_document.
    try:
        vectorstore.delete_document(document_id)
        bm25.rebuild_from_vectorstore(vectorstore)
        return {"message": f"Document '{document_id}' deleted successfully"}
    except Exception as e:
        # Log the traceback (as the query routes do) and keep the cause chained.
        logger.error(f"Failed to delete document '{document_id}': {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Failed to delete document: {e}") from e
|
app/api/routes/query.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import logging
|
| 3 |
+
import time
|
| 4 |
+
|
| 5 |
+
from fastapi import APIRouter, Depends, HTTPException
|
| 6 |
+
from fastapi.responses import StreamingResponse
|
| 7 |
+
|
| 8 |
+
from app.api.deps import dep_generator, dep_query_analyzer, dep_retriever
|
| 9 |
+
from app.core.generator import AnswerGenerator
|
| 10 |
+
from app.core.query_analyzer import QueryAnalyzer
|
| 11 |
+
from app.core.retriever import HybridRetriever
|
| 12 |
+
from app.models.schemas import (
|
| 13 |
+
GeneratedAnswer,
|
| 14 |
+
QueryRequest,
|
| 15 |
+
SearchRequest,
|
| 16 |
+
SearchResponse,
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
logger = logging.getLogger(__name__)
|
| 20 |
+
|
| 21 |
+
router = APIRouter(prefix="/api", tags=["query"])
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def _resolve_filters(request_filters, analyzed_filters):
|
| 25 |
+
"""Use explicit request filters if provided, otherwise use analyzed filters only if they contain values."""
|
| 26 |
+
if request_filters and request_filters.has_filters():
|
| 27 |
+
return request_filters
|
| 28 |
+
if analyzed_filters and analyzed_filters.has_filters():
|
| 29 |
+
return analyzed_filters
|
| 30 |
+
return None
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
@router.post("/search", response_model=SearchResponse)
async def search(
    request: SearchRequest,
    retriever: HybridRetriever = Depends(dep_retriever),
    analyzer: QueryAnalyzer = Depends(dep_query_analyzer),
):
    """Analyze the query, retrieve matching chunks and report the elapsed time.

    Any failure is logged with its traceback and surfaced as HTTP 500.
    """
    try:
        started_at = time.perf_counter()

        analysis = analyzer.analyze(request.query)
        active_filters = _resolve_filters(request.filters, analysis.extracted_filters)
        hits = retriever.retrieve(
            query=analysis.clean_query,
            top_k=request.top_k,
            filters=active_filters,
        )

        # perf_counter returns seconds; the response field is in milliseconds.
        elapsed_ms = (time.perf_counter() - started_at) * 1000
        return SearchResponse(
            query=request.query,
            results=hits,
            total_results=len(hits),
            search_time_ms=elapsed_ms,
        )
    except Exception as e:
        logger.error(f"Search failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Search failed: {e}")
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
@router.post("/ask")
async def ask(
    request: QueryRequest,
    retriever: HybridRetriever = Depends(dep_retriever),
    generator: AnswerGenerator = Depends(dep_generator),
    analyzer: QueryAnalyzer = Depends(dep_query_analyzer),
):
    """Answer a question from retrieved chunks.

    Returns a server-sent-event stream when ``request.stream`` is set,
    otherwise the generator's complete answer. Failures before the response
    is returned are logged and surfaced as HTTP 500.
    """
    try:
        analysis = analyzer.analyze(request.query)
        active_filters = _resolve_filters(request.filters, analysis.extracted_filters)
        retrieved = retriever.retrieve(
            query=analysis.clean_query,
            top_k=request.top_k,
            filters=active_filters,
        )

        if not request.stream:
            return generator.generate_answer(
                query=request.query,
                chunks=retrieved,
                rerank_top_k=request.rerank_top_k,
                intent=analysis.intent,
            )

        event_source = _stream_response(
            request.query, retrieved, generator, request.rerank_top_k, analysis.intent
        )
        return StreamingResponse(event_source, media_type="text/event-stream")
    except Exception as e:
        logger.error(f"Ask failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Query failed: {e}")
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
async def _stream_response(
    query: str,
    chunks,
    generator: AnswerGenerator,
    rerank_top_k: int,
    intent: str,
):
    """Yield server-sent events for a streamed answer.

    Emits one ``{"text": ...}`` event per streamed text piece, then a final
    ``{"done": true, ...}`` event with truncated sources, model name and
    timing. On failure a single ``{"error": ..., "done": true}`` event is sent
    instead of raising, because response headers have already gone out.
    """
    try:
        async for item in generator.generate_answer_stream(
            query=query,
            chunks=chunks,
            rerank_top_k=rerank_top_k,
            intent=intent,
        ):
            if isinstance(item, str):
                yield f"data: {json.dumps({'text': item})}\n\n"
            elif isinstance(item, GeneratedAnswer):
                # A terminal item may carry text that was never streamed (the
                # generator's "no relevant documents" reply). Forward it so the
                # client is not left with an empty answer. After a normal
                # stream ``answer`` is empty and nothing extra is sent.
                if item.answer:
                    yield f"data: {json.dumps({'text': item.answer})}\n\n"
                sources = [
                    {
                        "chunk_id": s.chunk_id,
                        "text": s.text[:200],  # preview only
                        "source": s.metadata.source,
                        "score": s.score,
                    }
                    for s in item.sources
                ]
                yield f"data: {json.dumps({'done': True, 'sources': sources, 'model': item.model, 'time_ms': item.generation_time_ms})}\n\n"
    except Exception as e:
        logger.error(f"Streaming failed: {e}", exc_info=True)
        yield f"data: {json.dumps({'error': str(e), 'done': True})}\n\n"
|
app/config.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from functools import lru_cache
|
| 3 |
+
|
| 4 |
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class Settings(BaseSettings):
    """Application configuration.

    Fields fall back to the defaults below; values are also loaded from a
    UTF-8 ``.env`` file, and unknown entries are ignored (``extra="ignore"``).
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # API Keys
    gemini_api_key: str = ""
    qdrant_url: str = ""
    qdrant_api_key: str = ""

    # Embedding
    embedding_model: str = "all-MiniLM-L6-v2"
    embedding_dim: int = 384

    # Qdrant
    qdrant_collection: str = "ragcore_docs"

    # Chunking (passed to chunk_text by the ingest route)
    chunk_size: int = 512
    chunk_overlap: int = 50

    # Retrieval
    top_k: int = 10
    rerank_top_k: int = 5
    dense_weight: float = 0.6
    sparse_weight: float = 0.4

    # LLM
    gemini_model: str = "gemini-2.5-flash"
    gemini_rpm_limit: int = 15
    gemini_temperature: float = 0.3
    gemini_max_tokens: int = 2048

    # App
    log_level: str = "INFO"
    max_file_size_mb: int = 10
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
@lru_cache
def get_settings() -> Settings:
    """Return the shared Settings instance, built on first call and then cached."""
    return Settings()
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def setup_logging() -> None:
    """Configure root logging from ``Settings.log_level``.

    Unknown level names fall back to INFO.
    """
    settings = get_settings()
    level = getattr(logging, settings.log_level.upper(), None)
    # The lookup can hit a non-level attribute of the logging module (e.g.
    # ``BASIC_FORMAT``); only integer level constants are valid here.
    if not isinstance(level, int):
        level = logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
|
app/core/__init__.py
ADDED
|
File without changes
|
app/core/bm25.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import re
|
| 3 |
+
import time
|
| 4 |
+
|
| 5 |
+
from rank_bm25 import BM25Okapi
|
| 6 |
+
|
| 7 |
+
from app.models.document import Chunk
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
|
| 11 |
+
STOP_WORDS = {
    "a", "an", "the",
    "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "do", "does", "did",
    "will", "would", "could", "should", "may", "might", "can", "shall",
    "to", "of", "in", "for", "on", "with", "at", "by", "from", "as",
    "into", "through", "during", "before", "after",
    "and", "but", "or", "not", "no", "if", "then", "than",
    "that", "this", "it", "its", "he", "she", "they", "we", "you",
}


def tokenize(text: str) -> list[str]:
    """Lower-case ``text`` and return its word tokens.

    Stop words and single-character tokens are dropped; order and duplicates
    are preserved.
    """
    kept: list[str] = []
    for word in re.findall(r"\b\w+\b", text.lower()):
        if len(word) <= 1 or word in STOP_WORDS:
            continue
        kept.append(word)
    return kept
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class BM25Index:
    """In-memory keyword index over chunks, backed by ``rank_bm25.BM25Okapi``.

    ``documents`` holds one record per chunk (ids, text, tokens, metadata) and
    ``index`` is rebuilt from all records whenever the corpus changes.
    """

    def __init__(self):
        self.documents: list[dict] = []
        self.index: BM25Okapi | None = None

    @staticmethod
    def _make_record(chunk_id, document_id, text, metadata) -> dict:
        """Build the stored record for one chunk, tokenizing its text."""
        return {
            "chunk_id": chunk_id,
            "document_id": document_id,
            "text": text,
            "tokens": tokenize(text),
            "metadata": metadata,
        }

    @classmethod
    def _records_from_chunks(cls, chunks: list[Chunk]) -> list[dict]:
        """Convert ``Chunk`` models into index records."""
        return [
            cls._make_record(
                chunk.chunk_id,
                chunk.document_id,
                chunk.text,
                chunk.metadata.model_dump() if chunk.metadata else {},
            )
            for chunk in chunks
        ]

    def _refresh_index(self) -> None:
        """Rebuild the BM25 model, or drop it when the corpus is empty."""
        if self.documents:
            self.index = BM25Okapi([doc["tokens"] for doc in self.documents])
        else:
            # Do not keep a stale model for documents that no longer exist.
            self.index = None

    def build_index(self, chunks: list[Chunk]) -> None:
        """Replace the whole corpus with ``chunks``."""
        self.documents = self._records_from_chunks(chunks)
        self._refresh_index()
        logger.info(f"Built BM25 index with {len(self.documents)} documents")

    def add_documents(self, chunks: list[Chunk]) -> None:
        """Append ``chunks`` to the corpus and rebuild the index over everything."""
        self.documents.extend(self._records_from_chunks(chunks))
        self._refresh_index()
        logger.info(f"BM25 index updated: {len(self.documents)} total documents")

    def search(self, query: str, top_k: int = 10) -> list[dict]:
        """Return up to ``top_k`` records with a positive score, best first.

        Returns an empty list when nothing is indexed or the query has no
        tokens left after tokenization.
        """
        if not self.index or not self.documents:
            return []

        tokens = tokenize(query)
        if not tokens:
            return []

        scores = self.index.get_scores(tokens)
        scored_docs = [
            (score, doc) for score, doc in zip(scores, self.documents) if score > 0
        ]
        scored_docs.sort(key=lambda pair: pair[0], reverse=True)

        return [
            {
                "chunk_id": doc["chunk_id"],
                "document_id": doc["document_id"],
                "text": doc["text"],
                "score": float(score),
                "metadata": doc["metadata"],
            }
            for score, doc in scored_docs[:top_k]
        ]

    def rebuild_from_vectorstore(self, vectorstore) -> None:
        """Reload the corpus from ``vectorstore.scroll_all()`` and rebuild.

        Points without text are skipped.
        """
        start = time.perf_counter()
        self.documents = [
            self._make_record(p["chunk_id"], p["document_id"], p["text"], p["metadata"])
            for p in vectorstore.scroll_all()
            if p.get("text")
        ]
        self._refresh_index()
        elapsed = (time.perf_counter() - start) * 1000
        logger.info(
            f"Rebuilt BM25 index from vectorstore: {len(self.documents)} docs in {elapsed:.0f}ms"
        )

    @property
    def doc_count(self) -> int:
        """Number of chunk records currently indexed."""
        return len(self.documents)
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
_bm25: BM25Index | None = None


def get_bm25() -> BM25Index:
    """Return the module-level BM25Index, creating it on first use."""
    global _bm25
    if _bm25 is not None:
        return _bm25
    _bm25 = BM25Index()
    return _bm25
|
app/core/chunker.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import re
|
| 3 |
+
|
| 4 |
+
logger = logging.getLogger(__name__)

SENTENCE_PATTERN = re.compile(r"(?<=[.!?])\s+")


def chunk_text(
    text: str,
    chunk_size: int = 512,
    chunk_overlap: int = 50,
) -> list[dict]:
    """Split ``text`` into overlapping, sentence-aligned chunks of words.

    Sentences are accumulated until adding the next one would exceed
    ``chunk_size`` words; the chunk is then emitted and its last
    ``chunk_overlap`` words seed the next chunk. A single sentence longer
    than ``chunk_size`` is not split.

    Returns:
        Dicts with ``text``, ``start_char``, ``end_char`` and ``chunk_index``.
        Offsets index into the whitespace-normalized text
        (``" ".join(text.split())``), so ``normalized[start:end] == text``
        for every chunk.
    """
    if not text or not text.strip():
        return []

    sentences = [s.strip() for s in SENTENCE_PATTERN.split(text) if s.strip()]
    if not sentences:
        return []

    chunks: list[dict] = []
    current_words: list[str] = []
    current_start = 0

    for sentence in sentences:
        words = sentence.split()

        if current_words and len(current_words) + len(words) > chunk_size:
            chunk_text_str = " ".join(current_words)
            chunk_end = current_start + len(chunk_text_str)
            chunks.append({
                "text": chunk_text_str,
                "start_char": current_start,
                "end_char": chunk_end,
                "chunk_index": len(chunks),
            })

            # Overlap: keep last chunk_overlap words
            overlap_words = current_words[-chunk_overlap:] if chunk_overlap > 0 else []
            if overlap_words:
                # The next chunk starts where the carried-over words begin.
                current_start = chunk_end - len(" ".join(overlap_words))
            else:
                # No overlap: skip the single space separating the two chunks.
                current_start = chunk_end + 1
            current_words = overlap_words

        current_words.extend(words)

    # Last chunk
    if current_words:
        chunk_text_str = " ".join(current_words)
        chunks.append({
            "text": chunk_text_str,
            "start_char": current_start,
            "end_char": current_start + len(chunk_text_str),
            "chunk_index": len(chunks),
        })

    logger.info(f"Chunked text into {len(chunks)} chunks (size={chunk_size}, overlap={chunk_overlap})")
    return chunks
|
app/core/embedder.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import time
|
| 3 |
+
|
| 4 |
+
from sentence_transformers import SentenceTransformer
|
| 5 |
+
|
| 6 |
+
from app.config import get_settings
|
| 7 |
+
|
| 8 |
+
logger = logging.getLogger(__name__)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class EmbedderService:
    """Wraps a SentenceTransformer model loaded on the CPU."""

    EMBEDDING_DIM = 384

    def __init__(self, model_name: str):
        load_started = time.perf_counter()
        self.model = SentenceTransformer(model_name, device="cpu")
        load_ms = (time.perf_counter() - load_started) * 1000
        logger.info(f"Loaded embedding model '{model_name}' in {load_ms:.0f}ms")

    def embed_texts(self, texts: list[str]) -> list[list[float]]:
        """Encode ``texts`` into normalized vectors; empty input gives ``[]``."""
        if texts:
            vectors = self.model.encode(
                texts,
                batch_size=64,
                show_progress_bar=False,
                normalize_embeddings=True,
            )
            return vectors.tolist()
        return []

    def embed_query(self, query: str) -> list[float]:
        """Encode a single query string and return its vector."""
        vectors = self.embed_texts([query])
        return vectors[0]
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
_embedder: EmbedderService | None = None


def get_embedder() -> EmbedderService:
    """Return the module-level EmbedderService, loading the configured model on first use."""
    global _embedder
    if _embedder is not None:
        return _embedder
    settings = get_settings()
    _embedder = EmbedderService(settings.embedding_model)
    return _embedder
|
app/core/generator.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import time
|
| 3 |
+
from collections.abc import AsyncGenerator
|
| 4 |
+
|
| 5 |
+
from app.core.llm import GeminiService
|
| 6 |
+
from app.core.reranker import RerankerService
|
| 7 |
+
from app.models.schemas import GeneratedAnswer, RetrievedChunk
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
|
| 11 |
+
SYSTEM_PROMPT = """You are a helpful assistant answering questions based on the provided context.
|
| 12 |
+
|
| 13 |
+
CONTEXT:
|
| 14 |
+
{context}
|
| 15 |
+
|
| 16 |
+
RULES:
|
| 17 |
+
- Answer based ONLY on the provided context.
|
| 18 |
+
- Cite sources using [1], [2], etc. inline after the relevant information.
|
| 19 |
+
- If the context doesn't contain enough information, say "I don't have enough information in the provided documents to answer this question."
|
| 20 |
+
- Be concise but thorough.
|
| 21 |
+
- Use markdown formatting for readability.
|
| 22 |
+
|
| 23 |
+
QUESTION: {query}
|
| 24 |
+
|
| 25 |
+
ANSWER:"""
|
| 26 |
+
|
| 27 |
+
SUMMARY_PROMPT = """You are a helpful assistant. Summarize the following context.
|
| 28 |
+
|
| 29 |
+
CONTEXT:
|
| 30 |
+
{context}
|
| 31 |
+
|
| 32 |
+
RULES:
|
| 33 |
+
- Provide a structured summary using markdown.
|
| 34 |
+
- Cite sources using [1], [2], etc.
|
| 35 |
+
- Cover the key points from all provided sources.
|
| 36 |
+
|
| 37 |
+
QUESTION: {query}
|
| 38 |
+
|
| 39 |
+
SUMMARY:"""
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class AnswerGenerator:
    """Reranks retrieved chunks and asks the LLM for a cited answer."""

    def __init__(self, llm: GeminiService, reranker: RerankerService):
        self.llm = llm
        self.reranker = reranker

    def _build_context(self, chunks: list[RetrievedChunk]) -> str:
        """Render chunks as numbered ``[i] (Source: ...)`` sections separated by blank lines."""
        sections = [
            f"[{position}] (Source: {chunk.metadata.source or 'unknown'})\n{chunk.text}"
            for position, chunk in enumerate(chunks, 1)
        ]
        return "\n\n".join(sections)

    def _build_prompt(self, query: str, chunks: list[RetrievedChunk], intent: str = "factual") -> str:
        """Fill the summary template for the ``summarize`` intent, else the Q&A template."""
        if intent == "summarize":
            template = SUMMARY_PROMPT
        else:
            template = SYSTEM_PROMPT
        return template.format(context=self._build_context(chunks), query=query)

    def _no_results_answer(self, query: str) -> GeneratedAnswer:
        """Answer returned when reranking leaves no chunks to ground a reply."""
        return GeneratedAnswer(
            query=query,
            answer="No relevant documents found to answer your question.",
            sources=[],
            generation_time_ms=0,
            model=self.llm.model_name,
        )

    def generate_answer(
        self,
        query: str,
        chunks: list[RetrievedChunk],
        rerank_top_k: int = 5,
        intent: str = "factual",
    ) -> GeneratedAnswer:
        """Rerank ``chunks``, prompt the LLM and return the answer with its sources.

        The reported time covers both reranking and generation.
        """
        started_at = time.perf_counter()

        top_chunks = self.reranker.rerank(query, chunks, top_k=rerank_top_k)
        if not top_chunks:
            return self._no_results_answer(query)

        answer_text = self.llm.generate(self._build_prompt(query, top_chunks, intent))

        elapsed_ms = (time.perf_counter() - started_at) * 1000
        logger.info(f"Generated answer in {elapsed_ms:.0f}ms")

        return GeneratedAnswer(
            query=query,
            answer=answer_text,
            sources=top_chunks,
            generation_time_ms=elapsed_ms,
            model=self.llm.model_name,
        )

    async def generate_answer_stream(
        self,
        query: str,
        chunks: list[RetrievedChunk],
        rerank_top_k: int = 5,
        intent: str = "factual",
    ) -> AsyncGenerator[str | GeneratedAnswer, None]:
        """Yield answer text pieces, then one final ``GeneratedAnswer`` with the sources.

        When reranking leaves no chunks, only a single ``GeneratedAnswer``
        carrying the "no relevant documents" message is yielded.
        """
        top_chunks = self.reranker.rerank(query, chunks, top_k=rerank_top_k)
        if not top_chunks:
            yield self._no_results_answer(query)
            return

        prompt = self._build_prompt(query, top_chunks, intent)
        # Timing here starts after reranking, so it covers generation only.
        started_at = time.perf_counter()

        async for piece in self.llm.generate_stream(prompt):
            yield piece

        elapsed_ms = (time.perf_counter() - started_at) * 1000

        # Final message with sources
        yield GeneratedAnswer(
            query=query,
            answer="",  # Full answer was streamed
            sources=top_chunks,
            generation_time_ms=elapsed_ms,
            model=self.llm.model_name,
        )
|
app/core/llm.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import logging
|
| 3 |
+
import time
|
| 4 |
+
from collections.abc import AsyncGenerator
|
| 5 |
+
|
| 6 |
+
import google.generativeai as genai
|
| 7 |
+
|
| 8 |
+
from app.config import get_settings
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class GeminiService:
|
| 14 |
+
def __init__(self, api_key: str, model_name: str, rpm_limit: int = 15):
|
| 15 |
+
genai.configure(api_key=api_key)
|
| 16 |
+
self.model = genai.GenerativeModel(model_name)
|
| 17 |
+
self.model_name = model_name
|
| 18 |
+
self._min_interval = 60.0 / rpm_limit
|
| 19 |
+
self._last_call_time = 0.0
|
| 20 |
+
logger.info(f"Initialized Gemini '{model_name}' (RPM limit: {rpm_limit})")
|
| 21 |
+
|
| 22 |
+
def _wait_for_rate_limit(self) -> None:
|
| 23 |
+
now = time.time()
|
| 24 |
+
elapsed = now - self._last_call_time
|
| 25 |
+
if elapsed < self._min_interval:
|
| 26 |
+
wait = self._min_interval - elapsed
|
| 27 |
+
logger.debug(f"Rate limiting: waiting {wait:.1f}s")
|
| 28 |
+
time.sleep(wait)
|
| 29 |
+
self._last_call_time = time.time()
|
| 30 |
+
|
| 31 |
+
async def _async_wait_for_rate_limit(self) -> None:
|
| 32 |
+
now = time.time()
|
| 33 |
+
elapsed = now - self._last_call_time
|
| 34 |
+
if elapsed < self._min_interval:
|
| 35 |
+
wait = self._min_interval - elapsed
|
| 36 |
+
logger.debug(f"Rate limiting: waiting {wait:.1f}s")
|
| 37 |
+
await asyncio.sleep(wait)
|
| 38 |
+
self._last_call_time = time.time()
|
| 39 |
+
|
| 40 |
+
def generate(self, prompt: str, temperature: float = 0.3, max_tokens: int = 2048) -> str:
|
| 41 |
+
self._wait_for_rate_limit()
|
| 42 |
+
try:
|
| 43 |
+
response = self.model.generate_content(
|
| 44 |
+
prompt,
|
| 45 |
+
generation_config=genai.types.GenerationConfig(
|
| 46 |
+
temperature=temperature,
|
| 47 |
+
max_output_tokens=max_tokens,
|
| 48 |
+
),
|
| 49 |
+
)
|
| 50 |
+
return response.text
|
| 51 |
+
except Exception as e:
|
| 52 |
+
logger.error(f"Gemini generation failed: {e}")
|
| 53 |
+
raise
|
| 54 |
+
|
| 55 |
+
async def generate_stream(
|
| 56 |
+
self, prompt: str, temperature: float = 0.3, max_tokens: int = 2048
|
| 57 |
+
) -> AsyncGenerator[str, None]:
|
| 58 |
+
await self._async_wait_for_rate_limit()
|
| 59 |
+
try:
|
| 60 |
+
response = self.model.generate_content(
|
| 61 |
+
prompt,
|
| 62 |
+
generation_config=genai.types.GenerationConfig(
|
| 63 |
+
temperature=temperature,
|
| 64 |
+
max_output_tokens=max_tokens,
|
| 65 |
+
),
|
| 66 |
+
stream=True,
|
| 67 |
+
)
|
| 68 |
+
for chunk in response:
|
| 69 |
+
if chunk.text:
|
| 70 |
+
yield chunk.text
|
| 71 |
+
except Exception as e:
|
| 72 |
+
logger.error(f"Gemini streaming failed: {e}")
|
| 73 |
+
raise
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
_llm: GeminiService | None = None
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def get_llm() -> GeminiService:
|
| 80 |
+
global _llm
|
| 81 |
+
if _llm is None:
|
| 82 |
+
settings = get_settings()
|
| 83 |
+
_llm = GeminiService(
|
| 84 |
+
api_key=settings.gemini_api_key,
|
| 85 |
+
model_name=settings.gemini_model,
|
| 86 |
+
rpm_limit=settings.gemini_rpm_limit,
|
| 87 |
+
)
|
| 88 |
+
return _llm
|
app/core/metadata.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import re
|
| 3 |
+
from collections import Counter
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
from app.models.document import DocumentMetadata
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
|
| 11 |
+
DATE_PATTERNS = [
|
| 12 |
+
re.compile(r"\b(\d{4}-\d{2}-\d{2})\b"),
|
| 13 |
+
re.compile(r"\b(\d{2}/\d{2}/\d{4})\b"),
|
| 14 |
+
re.compile(
|
| 15 |
+
r"\b((?:January|February|March|April|May|June|July|August|September|October|November|December)"
|
| 16 |
+
r"\s+\d{1,2},?\s+\d{4})\b"
|
| 17 |
+
),
|
| 18 |
+
]
|
| 19 |
+
|
| 20 |
+
DATE_FORMATS = ["%Y-%m-%d", "%m/%d/%Y", "%B %d, %Y", "%B %d %Y"]
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def extract_title(text: str) -> str | None:
|
| 24 |
+
for line in text.splitlines():
|
| 25 |
+
line = line.strip()
|
| 26 |
+
if line and len(line) > 3:
|
| 27 |
+
return line[:200]
|
| 28 |
+
return None
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def extract_dates(text: str) -> datetime | None:
|
| 32 |
+
for pattern in DATE_PATTERNS:
|
| 33 |
+
match = pattern.search(text[:2000]) # Only scan beginning
|
| 34 |
+
if match:
|
| 35 |
+
date_str = match.group(1)
|
| 36 |
+
for fmt in DATE_FORMATS:
|
| 37 |
+
try:
|
| 38 |
+
return datetime.strptime(date_str, fmt)
|
| 39 |
+
except ValueError:
|
| 40 |
+
continue
|
| 41 |
+
return None
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def extract_tags(text: str, max_tags: int = 10) -> list[str]:
    """Derive tags from capitalised words/phrases that occur at least twice.

    A candidate is a run of one or more whitespace-separated words that each
    start with an uppercase ASCII letter followed by lowercase letters.

    Args:
        text: Text to scan.
        max_tags: Maximum number of tags returned.

    Returns:
        Lower-cased candidates, most frequent first.
    """
    phrase_counts = Counter(re.findall(r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b", text))
    frequent = []
    for phrase, occurrences in phrase_counts.most_common(max_tags * 2):
        if occurrences >= 2:
            frequent.append(phrase.lower())
    return frequent[:max_tags]
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def extract_metadata(raw_text: str, filename: str, page_count: int | None = None) -> DocumentMetadata:
    """Assemble a ``DocumentMetadata`` for one document.

    The document type is the lower-cased file extension without its dot, or
    ``"unknown"`` when ``filename`` has none. Title, date and tags come from
    ``extract_title``, ``extract_dates`` and ``extract_tags`` applied to
    ``raw_text``; ``filename`` and ``page_count`` are stored as given.
    """
    suffix = Path(filename).suffix.lower().lstrip(".")
    return DocumentMetadata(
        source=filename,
        doc_type=suffix or "unknown",
        title=extract_title(raw_text),
        created_date=extract_dates(raw_text),
        tags=extract_tags(raw_text),
        page_count=page_count,
    )
|
app/core/query_analyzer.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import re
|
| 3 |
+
from datetime import datetime, timedelta
|
| 4 |
+
|
| 5 |
+
from dateutil import parser as date_parser
|
| 6 |
+
|
| 7 |
+
from app.models.schemas import AnalyzedQuery, SearchFilters
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
|
| 11 |
+
# Doc type patterns
|
| 12 |
+
DOCTYPE_PATTERNS = {
|
| 13 |
+
"pdf": re.compile(r"\bpdfs?\b", re.IGNORECASE),
|
| 14 |
+
"html": re.compile(r"\bhtml\b", re.IGNORECASE),
|
| 15 |
+
"txt": re.compile(r"\btext\s+files?\b|\btxt\b", re.IGNORECASE),
|
| 16 |
+
}
|
| 17 |
+
|
| 18 |
+
# Relative date patterns
|
| 19 |
+
RELATIVE_DATE_PATTERNS = [
|
| 20 |
+
(re.compile(r"\blast\s+week\b", re.IGNORECASE), lambda: (datetime.now() - timedelta(weeks=1), datetime.now())),
|
| 21 |
+
(re.compile(r"\blast\s+month\b", re.IGNORECASE), lambda: (datetime.now() - timedelta(days=30), datetime.now())),
|
| 22 |
+
(re.compile(r"\blast\s+year\b", re.IGNORECASE), lambda: (datetime.now() - timedelta(days=365), datetime.now())),
|
| 23 |
+
(re.compile(r"\bthis\s+week\b", re.IGNORECASE), lambda: (datetime.now() - timedelta(days=datetime.now().weekday()), datetime.now())),
|
| 24 |
+
(re.compile(r"\bthis\s+month\b", re.IGNORECASE), lambda: (datetime.now().replace(day=1), datetime.now())),
|
| 25 |
+
(re.compile(r"\bthis\s+year\b", re.IGNORECASE), lambda: (datetime.now().replace(month=1, day=1), datetime.now())),
|
| 26 |
+
(re.compile(r"\btoday\b", re.IGNORECASE), lambda: (datetime.now().replace(hour=0, minute=0, second=0), datetime.now())),
|
| 27 |
+
(re.compile(r"\byesterday\b", re.IGNORECASE), lambda: (datetime.now() - timedelta(days=1), datetime.now())),
|
| 28 |
+
]
|
| 29 |
+
|
| 30 |
+
# Absolute date patterns
|
| 31 |
+
AFTER_DATE = re.compile(r"\bafter\s+(\S+)\b", re.IGNORECASE)
|
| 32 |
+
BEFORE_DATE = re.compile(r"\bbefore\s+(\S+)\b", re.IGNORECASE)
|
| 33 |
+
FROM_SOURCE = re.compile(r"\bfrom\s+(\S+\.\w{2,4})\b", re.IGNORECASE)
|
| 34 |
+
|
| 35 |
+
# Intent patterns
|
| 36 |
+
INTENT_PATTERNS = [
|
| 37 |
+
("summarize", re.compile(r"\bsummar(?:ize|y)\b|\boverview\b", re.IGNORECASE)),
|
| 38 |
+
("comparative", re.compile(r"\bcompar[ei]\b|\bdifference\b|\bvs\.?\b|\bversus\b", re.IGNORECASE)),
|
| 39 |
+
("list", re.compile(r"\blist\b|\benumerate\b|\bwhat are all\b", re.IGNORECASE)),
|
| 40 |
+
("explanatory", re.compile(r"^(?:why|how|explain)\b", re.IGNORECASE)),
|
| 41 |
+
("factual", re.compile(r"^(?:what|who|when|where|how many|how much)\b", re.IGNORECASE)),
|
| 42 |
+
]
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class QueryAnalyzer:
    """Rule-based parser that pulls metadata filters and an intent out of a query."""

    def analyze(self, query: str) -> AnalyzedQuery:
        """Extract filters, a cleaned query and an intent label from ``query``.

        Filter phrases are located with the module-level regexes and removed
        from the query by character span. Removal by span (instead of
        ``str.replace``) never deletes text inside another word (``html`` in
        ``xhtml``) and still works when two filter phrases overlap, e.g.
        ``pdf`` inside ``from report.pdf``.

        Args:
            query: Raw user query.

        Returns:
            An ``AnalyzedQuery`` holding the original query, the query with
            filter phrases stripped (or the original if nothing is left), the
            first matching intent (default ``"factual"``), the extracted
            filters, and a confidence of 0.5 plus 0.1 per extracted filter,
            capped at 1.0.
        """
        filters = SearchFilters()
        confidence = 0.5
        # (start, end) spans of `query` that were consumed by a filter.
        consumed: list[tuple[int, int]] = []

        # Extract doc type. If several types are mentioned, the last one in
        # DOCTYPE_PATTERNS order wins and each adds to the confidence.
        for doc_type, pattern in DOCTYPE_PATTERNS.items():
            hits = [m.span() for m in pattern.finditer(query)]
            if hits:
                filters.doc_type = doc_type
                consumed.extend(hits)
                confidence += 0.1

        # Extract relative dates: only the first matching pattern is applied.
        for pattern, date_fn in RELATIVE_DATE_PATTERNS:
            hits = [m.span() for m in pattern.finditer(query)]
            if hits:
                filters.date_from, filters.date_to = date_fn()
                consumed.extend(hits)
                confidence += 0.1
                break

        # Extract absolute dates, unless a relative range already set them.
        if not filters.date_from:
            match = AFTER_DATE.search(query)
            if match:
                try:
                    filters.date_from = date_parser.parse(match.group(1))
                except (ValueError, OverflowError):
                    pass  # The word after "after" is not a date; keep it in the query.
                else:
                    consumed.append(match.span())
                    confidence += 0.1

        if not filters.date_to:
            match = BEFORE_DATE.search(query)
            if match:
                try:
                    filters.date_to = date_parser.parse(match.group(1))
                except (ValueError, OverflowError):
                    pass  # The word after "before" is not a date; keep it in the query.
                else:
                    consumed.append(match.span())
                    confidence += 0.1

        # Extract source
        match = FROM_SOURCE.search(query)
        if match:
            filters.source = match.group(1)
            consumed.append(match.span())
            confidence += 0.1

        # Clean query by removing extracted filter phrases
        clean = self._remove_spans(query, consumed)
        clean = re.sub(r"\s+", " ", clean).strip()
        # Remove dangling prepositions and leading ones
        clean = re.sub(r"\b(?:about|from|in|on)\s*$", "", clean).strip()
        clean = re.sub(r"^\b(?:about|from|in|on)\s+", "", clean).strip()

        if not clean:
            # Everything was a filter phrase; search with the original text.
            clean = query

        # Classify intent on the original query so removed phrases still count.
        intent = "factual"
        for intent_name, pattern in INTENT_PATTERNS:
            if pattern.search(query):
                intent = intent_name
                break

        confidence = min(confidence, 1.0)

        analyzed = AnalyzedQuery(
            original_query=query,
            clean_query=clean,
            intent=intent,
            extracted_filters=filters,
            confidence=confidence,
        )
        logger.info(f"Query analyzed: intent={intent}, filters={filters.model_dump(exclude_none=True)}")
        return analyzed

    @staticmethod
    def _remove_spans(text: str, spans: list[tuple[int, int]]) -> str:
        """Return ``text`` with every ``(start, end)`` span cut out; overlaps are merged."""
        kept: list[str] = []
        cursor = 0
        for start, end in sorted(spans):
            if start > cursor:
                kept.append(text[cursor:start])
            cursor = max(cursor, end)
        kept.append(text[cursor:])
        return "".join(kept)
|
app/core/reranker.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import time
|
| 3 |
+
|
| 4 |
+
from app.models.schemas import RetrievedChunk
|
| 5 |
+
|
| 6 |
+
logger = logging.getLogger(__name__)
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class RerankerService:
    """Re-scores retrieved chunks against the query with a FlashRank ``Ranker``."""

    def __init__(
        self,
        model_name: str = "ms-marco-MiniLM-L-12-v2",
        cache_dir: str = "./flashrank_cache",
    ):
        """Load the FlashRank model and log how long loading took.

        Args:
            model_name: FlashRank model to load (defaults to the previously
                hard-coded model).
            cache_dir: Directory passed to FlashRank as its model cache
                (defaults to the previously hard-coded path).
        """
        start = time.perf_counter()
        # Imported lazily so the module can be imported without flashrank.
        from flashrank import Ranker

        self.ranker = Ranker(model_name=model_name, cache_dir=cache_dir)
        elapsed = (time.perf_counter() - start) * 1000
        logger.info(f"Loaded FlashRank reranker in {elapsed:.0f}ms")

    def rerank(
        self, query: str, chunks: list[RetrievedChunk], top_k: int = 5
    ) -> list[RetrievedChunk]:
        """Return copies of the ``top_k`` best ``chunks`` for ``query``.

        Args:
            query: Query the passages are scored against.
            chunks: Candidate chunks; they are not modified.
            top_k: Maximum number of chunks returned. Values <= 0 yield ``[]``.

        Returns:
            Copies of the selected chunks in ranker order, with ``score`` set
            to the reranker score and ``rank`` set to the 0-based position in
            the returned list.
        """
        if not chunks or top_k <= 0:
            return []

        from flashrank import RerankRequest

        passages = [{"id": chunk.chunk_id, "text": chunk.text} for chunk in chunks]
        request = RerankRequest(query=query, passages=passages)
        results = self.ranker.rerank(request)

        # Map reranked scores back to chunks
        chunk_map = {chunk.chunk_id: chunk for chunk in chunks}
        reranked: list[RetrievedChunk] = []
        for result in results[:top_k]:
            original = chunk_map.get(result["id"])
            if original is None:
                continue
            chunk = original.model_copy()
            chunk.score = float(result["score"])
            # Position in the output, so ranks stay contiguous even if an id
            # returned by the ranker is skipped above.
            chunk.rank = len(reranked)
            reranked.append(chunk)

        logger.info(f"Reranked {len(chunks)} → top {len(reranked)} chunks")
        return reranked
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# Process-wide RerankerService instance, created on first use by get_reranker().
_reranker: RerankerService | None = None


def get_reranker() -> RerankerService:
    """Return the shared ``RerankerService``, constructing it on the first call."""
    global _reranker
    if _reranker is not None:
        return _reranker
    _reranker = RerankerService()
    return _reranker
|
app/core/retriever.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import time
|
| 3 |
+
from collections import defaultdict
|
| 4 |
+
|
| 5 |
+
from app.core.bm25 import BM25Index
|
| 6 |
+
from app.core.embedder import EmbedderService
|
| 7 |
+
from app.core.vectorstore import VectorStoreService
|
| 8 |
+
from app.models.document import DocumentMetadata
|
| 9 |
+
from app.models.schemas import RetrievedChunk, SearchFilters
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class HybridRetriever:
    """Hybrid search: dense vector hits and BM25 hits merged with weighted RRF."""

    def __init__(
        self,
        vectorstore: VectorStoreService,
        bm25: BM25Index,
        embedder: EmbedderService,
    ):
        """Store the three collaborating services used by ``retrieve``."""
        self.vectorstore = vectorstore
        self.bm25 = bm25
        self.embedder = embedder

    def retrieve(
        self,
        query: str,
        top_k: int = 10,
        filters: SearchFilters | None = None,
        dense_weight: float = 0.6,
        sparse_weight: float = 0.4,
    ) -> list[RetrievedChunk]:
        """Return up to ``top_k`` chunks for ``query``, best first.

        Both searches are asked for ``top_k * 2`` candidates. ``filters`` are
        passed to the vector store and applied to the BM25 hits afterwards.

        Args:
            query: Search text.
            top_k: Maximum number of results; 0 yields an empty list.
            filters: Optional metadata filters.
            dense_weight: RRF weight of the vector-search list.
            sparse_weight: RRF weight of the BM25 list.

        Returns:
            ``RetrievedChunk`` objects whose ``score`` is the fused RRF score
            and whose ``rank`` is the 0-based position.
        """
        start = time.perf_counter()

        query_vector = self.embedder.embed_query(query)

        # Dense search via Qdrant (over-fetch 2x)
        dense_results = self.vectorstore.search(
            query_vector=query_vector,
            limit=top_k * 2,
            filters=filters,
        )

        # Sparse search via BM25
        sparse_results = self.bm25.search(query, top_k=top_k * 2)

        # Post-filter BM25 results if filters are provided
        if filters and filters.has_filters():
            sparse_results = self._apply_filters(sparse_results, filters)

        # RRF fusion
        fused = self.rrf_fuse(
            [dense_results, sparse_results],
            weights=[dense_weight, sparse_weight],
        )

        # rrf_fuse already yields one entry per chunk_id, so a slice is enough.
        unique = fused[: max(top_k, 0)]

        # Convert to RetrievedChunk models
        results = [
            RetrievedChunk(
                chunk_id=item["chunk_id"],
                document_id=item.get("document_id", ""),
                text=item["text"],
                score=item["fused_score"],
                metadata=DocumentMetadata(**item.get("metadata", {})),
                rank=i,
            )
            for i, item in enumerate(unique)
        ]

        elapsed = (time.perf_counter() - start) * 1000
        logger.info(
            f"Hybrid retrieval: {len(dense_results)} dense + {len(sparse_results)} sparse "
            f"→ {len(results)} results in {elapsed:.0f}ms"
        )
        return results

    @staticmethod
    def rrf_fuse(
        result_lists: list[list[dict]],
        k: int = 60,
        weights: list[float] | None = None,
    ) -> list[dict]:
        """Merge ranked lists with weighted Reciprocal Rank Fusion.

        Each item contributes ``weight / (k + rank)`` with a 0-based ``rank``
        within its list; contributions for the same ``chunk_id`` are summed.

        Args:
            result_lists: Ranked lists of dicts that each have a ``chunk_id``.
            k: RRF smoothing constant.
            weights: One weight per list; defaults to 1.0 for every list.

        Returns:
            One dict per distinct ``chunk_id`` (the first occurrence seen),
            extended with ``fused_score`` and sorted by that score descending.

        Raises:
            ValueError: If ``weights`` is given with a different length than
                ``result_lists`` (``zip`` would otherwise drop lists silently).
        """
        if weights is None:
            weights = [1.0] * len(result_lists)
        elif len(weights) != len(result_lists):
            raise ValueError(
                f"expected {len(result_lists)} weights, got {len(weights)}"
            )

        scores: dict[str, float] = defaultdict(float)
        docs: dict[str, dict] = {}

        for result_list, weight in zip(result_lists, weights):
            for rank, item in enumerate(result_list):
                chunk_id = item["chunk_id"]
                scores[chunk_id] += weight * (1.0 / (k + rank))
                docs.setdefault(chunk_id, item)

        ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
        return [
            {**docs[chunk_id], "fused_score": score}
            for chunk_id, score in ranked
        ]

    @staticmethod
    def _apply_filters(results: list[dict], filters: SearchFilters) -> list[dict]:
        """Keep only the results whose ``metadata`` satisfies every set filter.

        Source and doc type must match exactly, tags must share at least one
        value, and ``created_date`` must lie within ``date_from``/``date_to``
        (inclusive). When a date bound is set, results without a usable
        ``created_date`` are dropped.
        """
        filtered = []
        for r in results:
            meta = r.get("metadata", {})
            if filters.source and meta.get("source") != filters.source:
                continue
            if filters.doc_type and meta.get("doc_type") != filters.doc_type:
                continue
            if filters.tags:
                doc_tags = meta.get("tags", [])
                if not any(t in doc_tags for t in filters.tags):
                    continue
            if not HybridRetriever._within_dates(
                meta.get("created_date"), filters.date_from, filters.date_to
            ):
                continue
            filtered.append(r)
        return filtered

    @staticmethod
    def _within_dates(raw, date_from, date_to) -> bool:
        """Return True if ``raw`` (datetime or ISO string) is inside the bounds."""
        if not (date_from or date_to):
            return True

        # Local import: this module does not import datetime at file level.
        from datetime import datetime

        if isinstance(raw, str):
            try:
                raw = datetime.fromisoformat(raw)
            except ValueError:
                return False
        if not isinstance(raw, datetime):
            return False
        try:
            if date_from and raw < date_from:
                return False
            if date_to and raw > date_to:
                return False
        except TypeError:
            # Naive and timezone-aware datetimes cannot be ordered.
            return False
        return True
|
app/core/vectorstore.py
ADDED
|
@@ -0,0 +1,219 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
|
| 3 |
+
from qdrant_client import QdrantClient
|
| 4 |
+
from qdrant_client.http.models import (
|
| 5 |
+
Distance,
|
| 6 |
+
FieldCondition,
|
| 7 |
+
Filter,
|
| 8 |
+
MatchAny,
|
| 9 |
+
MatchValue,
|
| 10 |
+
PayloadSchemaType,
|
| 11 |
+
PointStruct,
|
| 12 |
+
Range,
|
| 13 |
+
VectorParams,
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
from app.config import get_settings
|
| 17 |
+
from app.models.document import Chunk
|
| 18 |
+
from app.models.schemas import SearchFilters
|
| 19 |
+
|
| 20 |
+
logger = logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class VectorStoreService:
    """Wrapper around one Qdrant collection that stores chunk vectors and payloads."""

    def __init__(self, url: str, api_key: str, collection_name: str):
        """Create the Qdrant client for ``url`` and remember the collection name."""
        self.client = QdrantClient(url=url, api_key=api_key)
        self.collection_name = collection_name
        logger.info(f"Connected to Qdrant at {url}")

    def ensure_collection(self, vector_size: int = 384) -> None:
        """Create the collection (cosine distance) if it is missing, then its payload indexes.

        Args:
            vector_size: Vector dimension used when the collection is created.
        """
        collections = [c.name for c in self.client.get_collections().collections]
        if self.collection_name not in collections:
            self.client.create_collection(
                collection_name=self.collection_name,
                vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
            )
            logger.info(f"Created collection '{self.collection_name}' (dim={vector_size})")
        else:
            logger.info(f"Collection '{self.collection_name}' already exists")

        # Ensure payload indexes exist for filterable fields
        self._ensure_payload_indexes()

    def _ensure_payload_indexes(self) -> None:
        """Create payload indexes for fields used in filtering.

        Fields that already appear in the collection's payload schema are
        skipped. A failure to create one index is logged and does not stop
        the others.
        """
        index_fields = {
            "document_id": PayloadSchemaType.KEYWORD,
            "source": PayloadSchemaType.KEYWORD,
            "doc_type": PayloadSchemaType.KEYWORD,
            "tags": PayloadSchemaType.KEYWORD,
            # Range-filtered as a datetime in _build_filter, so it needs a
            # datetime index rather than a keyword one.
            "created_date": PayloadSchemaType.DATETIME,
        }
        try:
            collection_info = self.client.get_collection(self.collection_name)
            existing_indexes = set(collection_info.payload_schema or ())
        except Exception as e:
            # Best effort: fall back to attempting every index below.
            logger.warning(f"Could not read payload schema of '{self.collection_name}': {e}")
            existing_indexes = set()

        for field_name, field_type in index_fields.items():
            if field_name in existing_indexes:
                continue
            try:
                self.client.create_payload_index(
                    collection_name=self.collection_name,
                    field_name=field_name,
                    field_schema=field_type,
                )
                logger.info(f"Created payload index: {field_name} ({field_type})")
            except Exception as e:
                logger.warning(f"Could not create index for '{field_name}': {e}")

    def upsert_chunks(self, chunks: list[Chunk], embeddings: list[list[float]]) -> None:
        """Upsert ``chunks`` with their ``embeddings`` in batches of 100 points.

        Args:
            chunks: Chunks to store; ``chunk_id`` becomes the point id.
            embeddings: One vector per chunk, in the same order.

        Raises:
            ValueError: If the two lists differ in length (pairing them with
                ``zip`` would otherwise silently drop the surplus).
        """
        if len(chunks) != len(embeddings):
            raise ValueError(
                f"got {len(chunks)} chunks but {len(embeddings)} embeddings"
            )

        batch_size = 100
        for i in range(0, len(chunks), batch_size):
            batch_chunks = chunks[i : i + batch_size]
            batch_embeddings = embeddings[i : i + batch_size]
            points = [
                PointStruct(
                    id=chunk.chunk_id,
                    vector=embedding,
                    payload={
                        "text": chunk.text,
                        "document_id": chunk.document_id,
                        "chunk_index": chunk.chunk_index,
                        "source": chunk.metadata.source,
                        "doc_type": chunk.metadata.doc_type,
                        "title": chunk.metadata.title,
                        "created_date": chunk.metadata.created_date.isoformat()
                        if chunk.metadata.created_date
                        else None,
                        "tags": chunk.metadata.tags,
                        "page_count": chunk.metadata.page_count,
                    },
                )
                for chunk, embedding in zip(batch_chunks, batch_embeddings)
            ]
            self.client.upsert(collection_name=self.collection_name, points=points)
        logger.info(f"Upserted {len(chunks)} chunks to '{self.collection_name}'")

    def search(
        self,
        query_vector: list[float],
        limit: int = 10,
        filters: SearchFilters | None = None,
    ) -> list[dict]:
        """Run a vector query and return the hits as plain dicts.

        Args:
            query_vector: Embedding of the query.
            limit: Maximum number of points requested.
            filters: Optional metadata filters, translated by ``_build_filter``.

        Returns:
            Dicts with ``chunk_id``, ``text``, ``score``, ``document_id`` and a
            ``metadata`` dict built from the point payload.
        """
        qdrant_filter = self._build_filter(filters) if filters and filters.has_filters() else None
        results = self.client.query_points(
            collection_name=self.collection_name,
            query=query_vector,
            limit=limit,
            query_filter=qdrant_filter,
        ).points
        return [
            {
                "chunk_id": str(r.id),
                "text": r.payload.get("text", ""),
                "score": r.score,
                "document_id": r.payload.get("document_id", ""),
                "metadata": self._payload_metadata(r.payload),
            }
            for r in results
        ]

    def delete_document(self, document_id: str) -> int:
        """Delete every point whose ``document_id`` payload equals ``document_id``.

        Returns:
            The number of matching points counted immediately before the
            delete request (previously this always returned 0).
        """
        selector = Filter(
            must=[FieldCondition(key="document_id", match=MatchValue(value=document_id))]
        )
        deleted = self.client.count(
            collection_name=self.collection_name,
            count_filter=selector,
            exact=True,
        ).count
        self.client.delete(
            collection_name=self.collection_name,
            points_selector=selector,
        )
        logger.info(f"Deleted document '{document_id}' from '{self.collection_name}'")
        return deleted

    def scroll_all(self, batch_size: int = 100) -> list[dict]:
        """Return every point of the collection (payload only, no vectors).

        Args:
            batch_size: Page size used for each scroll request.

        Returns:
            Dicts with ``chunk_id``, ``text``, ``document_id`` and the same
            ``metadata`` keys that ``search`` returns.
        """
        all_points = []
        offset = None
        while True:
            results, next_offset = self.client.scroll(
                collection_name=self.collection_name,
                limit=batch_size,
                offset=offset,
                with_payload=True,
                with_vectors=False,
            )
            all_points.extend(
                {
                    "chunk_id": str(r.id),
                    "text": r.payload.get("text", ""),
                    "document_id": r.payload.get("document_id", ""),
                    "metadata": self._payload_metadata(r.payload),
                }
                for r in results
            )
            if next_offset is None:
                break
            offset = next_offset
        return all_points

    def get_document_ids(self) -> list[dict]:
        """Summarise the stored documents, one dict per distinct ``document_id``.

        Returns:
            Dicts with ``document_id``, ``source``, ``title``, ``doc_type``
            (taken from the first chunk seen) and ``num_chunks``.
        """
        all_points = self.scroll_all()
        docs: dict[str, dict] = {}
        for p in all_points:
            doc_id = p["document_id"]
            if doc_id not in docs:
                docs[doc_id] = {
                    "document_id": doc_id,
                    "source": p["metadata"]["source"],
                    "title": p["metadata"].get("title"),
                    "doc_type": p["metadata"]["doc_type"],
                    "num_chunks": 0,
                }
            docs[doc_id]["num_chunks"] += 1
        return list(docs.values())

    def count(self) -> int:
        """Return ``points_count`` as reported by the collection info."""
        info = self.client.get_collection(self.collection_name)
        return info.points_count

    @staticmethod
    def _payload_metadata(payload: dict) -> dict:
        """Build the ``metadata`` dict shared by ``search`` and ``scroll_all``."""
        return {
            "source": payload.get("source", ""),
            "doc_type": payload.get("doc_type", ""),
            "title": payload.get("title"),
            "created_date": payload.get("created_date"),
            "tags": payload.get("tags", []),
            "page_count": payload.get("page_count"),
        }

    @staticmethod
    def _build_filter(filters: SearchFilters) -> Filter | None:
        """Translate ``filters`` into a Qdrant ``Filter`` (all conditions ANDed).

        Returns ``None`` when no filter field is set.
        """
        conditions = []
        if filters.source:
            conditions.append(FieldCondition(key="source", match=MatchValue(value=filters.source)))
        if filters.doc_type:
            conditions.append(FieldCondition(key="doc_type", match=MatchValue(value=filters.doc_type)))
        if filters.tags:
            conditions.append(FieldCondition(key="tags", match=MatchAny(any=filters.tags)))
        if filters.date_from or filters.date_to:
            # `Range` is numeric-only; datetime bounds need DatetimeRange.
            # Imported here because the file-level import list lacks it.
            from qdrant_client.http.models import DatetimeRange

            conditions.append(
                FieldCondition(
                    key="created_date",
                    range=DatetimeRange(gte=filters.date_from, lte=filters.date_to),
                )
            )
        return Filter(must=conditions) if conditions else None
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
# Process-wide VectorStoreService instance, created on first use by get_vectorstore().
_vectorstore: VectorStoreService | None = None


def get_vectorstore() -> VectorStoreService:
    """Return the shared ``VectorStoreService``.

    On the first call the service is built from settings and
    ``ensure_collection`` is invoked with the configured embedding dimension.
    """
    global _vectorstore
    if _vectorstore is not None:
        return _vectorstore

    cfg = get_settings()
    _vectorstore = VectorStoreService(
        url=cfg.qdrant_url,
        api_key=cfg.qdrant_api_key,
        collection_name=cfg.qdrant_collection,
    )
    _vectorstore.ensure_collection(vector_size=cfg.embedding_dim)
    return _vectorstore
|
app/main.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from contextlib import asynccontextmanager
|
| 3 |
+
|
| 4 |
+
import gradio as gr
|
| 5 |
+
from fastapi import FastAPI
|
| 6 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 7 |
+
from fastapi.responses import RedirectResponse
|
| 8 |
+
|
| 9 |
+
from app.api.routes import health, ingest, query
|
| 10 |
+
from app.config import get_settings, setup_logging
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: warm up services before serving, log on shutdown.

    Warm-up is best effort. A failure in either step is logged as a warning
    and startup continues.
    """
    setup_logging()
    logger.info("RagCore starting up...")

    # The return value is not used here; the call is kept at this point
    # because the startup sequence has always made it.
    get_settings()

    # Initialize services that need warm-up
    try:
        from app.core.embedder import get_embedder

        get_embedder()
        logger.info("Embedder loaded")
    except Exception as exc:
        logger.warning(f"Embedder initialization deferred: {exc}")

    try:
        from app.core.vectorstore import get_vectorstore
        from app.core.bm25 import get_bm25

        store = get_vectorstore()
        index = get_bm25()
        index.rebuild_from_vectorstore(store)
        logger.info(f"BM25 index ready: {index.doc_count} documents")
    except Exception as exc:
        logger.warning(f"Vectorstore/BM25 initialization deferred: {exc}")

    logger.info("RagCore ready!")
    yield
    logger.info("RagCore shutting down...")
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# FastAPI application; startup/shutdown work is delegated to `lifespan`.
app = FastAPI(
    title="RagCore",
    description="RAG system with hybrid search and metadata filtering",
    version="0.1.0",
    lifespan=lifespan,
)

# NOTE(review): every origin is allowed together with credentials. Confirm
# this is intended before exposing cookie- or token-authenticated routes.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Register the API routers in a fixed order: health, ingest, query.
for _route_module in (health, ingest, query):
    app.include_router(_route_module.router)

# Mount Gradio UI
from app.ui.gradio_app import create_gradio_app

gradio_app = create_gradio_app()
app = gr.mount_gradio_app(app, gradio_app, path="/ui")
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
@app.get("/", include_in_schema=False)
async def root():
    """Redirect the bare site root to the UI mounted at ``/ui``."""
    ui_redirect = RedirectResponse(url="/ui")
    return ui_redirect
|
app/models/__init__.py
ADDED
|
File without changes
|
app/models/document.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datetime import datetime
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel, Field
|
| 4 |
+
|
| 5 |
+
from app.utils.helpers import generate_id
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
# Descriptive metadata attached to a document and copied onto each of its chunks.
class DocumentMetadata(BaseModel):
    # Name of the file the document came from.
    source: str = ""
    # Document type; filled from the file extension by the metadata extractor.
    doc_type: str = ""
    # Optional title.
    title: str | None = None
    # Optional document date.
    created_date: datetime | None = None
    # Free-form tags; each instance gets its own list.
    tags: list[str] = Field(default_factory=list)
    # Optional number of pages.
    page_count: int | None = None
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# One piece of a document's text together with its position and metadata.
class Chunk(BaseModel):
    # Identifier produced by generate_id() when not supplied.
    chunk_id: str = Field(default_factory=generate_id)
    # Identifier of the owning document.
    document_id: str = ""
    # Text content of the chunk.
    text: str = ""
    # Metadata of the owning document.
    metadata: DocumentMetadata = Field(default_factory=DocumentMetadata)
    # Position of the chunk within its document.
    chunk_index: int = 0
    # Character offsets of the chunk.
    start_char: int = 0
    end_char: int = 0
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# A whole ingested document: raw text, metadata and the chunks cut from it.
class Document(BaseModel):
    # Identifier produced by generate_id() when not supplied.
    document_id: str = Field(default_factory=generate_id)
    # Name of the uploaded file.
    filename: str = ""
    # Document-level metadata.
    metadata: DocumentMetadata = Field(default_factory=DocumentMetadata)
    # Chunks belonging to this document; each instance gets its own list.
    chunks: list[Chunk] = Field(default_factory=list)
    # Full text of the document.
    raw_text: str = ""
|
app/models/schemas.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datetime import datetime
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel, Field
|
| 4 |
+
|
| 5 |
+
from app.models.document import DocumentMetadata
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
# Response body describing the outcome of ingesting one file.
class IngestResponse(BaseModel):
    # Identifier of the ingested document.
    document_id: str
    # Name of the ingested file.
    filename: str
    # Number of chunks for the document.
    num_chunks: int
    # Human-readable status message.
    message: str
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# Optional metadata constraints for a search; an unset field is None.
class SearchFilters(BaseModel):
    source: str | None = None
    doc_type: str | None = None
    date_from: datetime | None = None
    date_to: datetime | None = None
    tags: list[str] | None = None

    def has_filters(self) -> bool:
        """Tell whether any filter field holds a truthy value.

        Empty strings and empty tag lists count as "not set".
        """
        candidates = (self.source, self.doc_type, self.date_from, self.date_to, self.tags)
        return any(candidates)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class RetrievedChunk(BaseModel):
    """One retrieved chunk together with its score and rank."""

    chunk_id: str
    document_id: str
    text: str
    # NOTE(review): scale depends on the producing stage (dense, sparse, fused, reranked) — confirm.
    score: float
    metadata: DocumentMetadata
    # Defaults to 0 when the producer does not assign a rank.
    rank: int = 0
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class SearchRequest(BaseModel):
    """Request model for a search: query text, result limit and optional filters."""

    query: str
    # Number of results requested; defaults to 10.
    top_k: int = 10
    # None means no filters were supplied.
    filters: SearchFilters | None = None
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class SearchResponse(BaseModel):
    """Response model for a search: the query, its results, a count and a timing."""

    query: str
    results: list[RetrievedChunk]
    total_results: int
    # Duration in milliseconds, per the field name.
    search_time_ms: float
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class QueryRequest(BaseModel):
    """Request model for asking a question.

    The defaults match the payload the Gradio UI sends to ``/api/ask``
    (``top_k`` 10, ``rerank_top_k`` 5).
    """

    query: str
    top_k: int = 10
    # NOTE(review): presumably the number of chunks kept after reranking — confirm against the reranker.
    rerank_top_k: int = 5
    # None means no filters were supplied.
    filters: SearchFilters | None = None
    # When True the UI consumes the answer as "data: " prefixed streamed lines.
    stream: bool = False
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class GeneratedAnswer(BaseModel):
    """A generated answer with its sources, timing and model name."""

    query: str
    answer: str
    # default_factory gives every instance its own empty list.
    sources: list[RetrievedChunk] = Field(default_factory=list)
    # Duration in milliseconds, per the field name.
    generation_time_ms: float = 0.0
    model: str = ""
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class AnalyzedQuery(BaseModel):
    """Result of query analysis: original and cleaned text, intent, filters and confidence."""

    original_query: str
    clean_query: str
    # Defaults to "factual"; the test suite also expects "comparative",
    # "summarize" and "explanatory".
    intent: str = "factual"
    # default_factory builds a separate, empty SearchFilters for every instance.
    extracted_filters: SearchFilters = Field(default_factory=SearchFilters)
    # NOTE(review): presumably a 0..1 score — no bounds are enforced here.
    confidence: float = 0.5
|
app/ui/__init__.py
ADDED
|
File without changes
|
app/ui/gradio_app.py
ADDED
|
@@ -0,0 +1,427 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import logging
|
| 3 |
+
|
| 4 |
+
import gradio as gr
|
| 5 |
+
import httpx
|
| 6 |
+
|
| 7 |
+
logger = logging.getLogger(__name__)

# Base URL the UI callbacks use to reach the HTTP API. Port 7860 is the
# container port published in docker-compose.yml ("8000:7860").
API_BASE = "http://localhost:7860"
|
| 10 |
+
|
| 11 |
+
CUSTOM_CSS = """
|
| 12 |
+
.main-header {
|
| 13 |
+
text-align: center;
|
| 14 |
+
padding: 1.5rem 0 0.5rem 0;
|
| 15 |
+
}
|
| 16 |
+
.main-header h1 {
|
| 17 |
+
font-size: 2.4rem;
|
| 18 |
+
font-weight: 700;
|
| 19 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 20 |
+
-webkit-background-clip: text;
|
| 21 |
+
-webkit-text-fill-color: transparent;
|
| 22 |
+
margin-bottom: 0.2rem;
|
| 23 |
+
}
|
| 24 |
+
.main-header p {
|
| 25 |
+
color: #6b7280;
|
| 26 |
+
font-size: 1rem;
|
| 27 |
+
margin: 0;
|
| 28 |
+
}
|
| 29 |
+
.stat-card {
|
| 30 |
+
background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
|
| 31 |
+
border-radius: 12px;
|
| 32 |
+
padding: 1rem;
|
| 33 |
+
text-align: center;
|
| 34 |
+
}
|
| 35 |
+
.answer-box {
|
| 36 |
+
border-left: 4px solid #667eea;
|
| 37 |
+
padding-left: 1rem;
|
| 38 |
+
margin-top: 0.5rem;
|
| 39 |
+
}
|
| 40 |
+
.source-card {
|
| 41 |
+
background: #f9fafb;
|
| 42 |
+
border: 1px solid #e5e7eb;
|
| 43 |
+
border-radius: 8px;
|
| 44 |
+
padding: 0.75rem;
|
| 45 |
+
margin: 0.5rem 0;
|
| 46 |
+
}
|
| 47 |
+
.upload-zone {
|
| 48 |
+
border: 2px dashed #667eea !important;
|
| 49 |
+
border-radius: 12px !important;
|
| 50 |
+
background: #f8f9ff !important;
|
| 51 |
+
}
|
| 52 |
+
.search-bar textarea {
|
| 53 |
+
font-size: 1.1rem !important;
|
| 54 |
+
border-radius: 12px !important;
|
| 55 |
+
border: 2px solid #e5e7eb !important;
|
| 56 |
+
padding: 12px 16px !important;
|
| 57 |
+
}
|
| 58 |
+
.search-bar textarea:focus {
|
| 59 |
+
border-color: #667eea !important;
|
| 60 |
+
box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.15) !important;
|
| 61 |
+
}
|
| 62 |
+
.primary-btn {
|
| 63 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
|
| 64 |
+
border: none !important;
|
| 65 |
+
border-radius: 10px !important;
|
| 66 |
+
font-weight: 600 !important;
|
| 67 |
+
font-size: 1rem !important;
|
| 68 |
+
padding: 10px 24px !important;
|
| 69 |
+
transition: transform 0.15s, box-shadow 0.15s !important;
|
| 70 |
+
}
|
| 71 |
+
.primary-btn:hover {
|
| 72 |
+
transform: translateY(-1px) !important;
|
| 73 |
+
box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4) !important;
|
| 74 |
+
}
|
| 75 |
+
.danger-btn {
|
| 76 |
+
background: linear-gradient(135deg, #ef4444 0%, #dc2626 100%) !important;
|
| 77 |
+
border: none !important;
|
| 78 |
+
border-radius: 10px !important;
|
| 79 |
+
}
|
| 80 |
+
.filter-row {
|
| 81 |
+
background: #f9fafb;
|
| 82 |
+
border-radius: 10px;
|
| 83 |
+
padding: 8px 12px;
|
| 84 |
+
}
|
| 85 |
+
.doc-table {
|
| 86 |
+
border-radius: 10px !important;
|
| 87 |
+
overflow: hidden !important;
|
| 88 |
+
}
|
| 89 |
+
footer { display: none !important; }
|
| 90 |
+
.tab-nav button {
|
| 91 |
+
font-size: 1rem !important;
|
| 92 |
+
font-weight: 600 !important;
|
| 93 |
+
padding: 10px 20px !important;
|
| 94 |
+
}
|
| 95 |
+
.tab-nav button.selected {
|
| 96 |
+
border-bottom: 3px solid #667eea !important;
|
| 97 |
+
color: #667eea !important;
|
| 98 |
+
}
|
| 99 |
+
"""
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def upload_document(file):
    """Send the selected file to the ingest endpoint and report the outcome as Markdown.

    Args:
        file: Value of the Gradio file component, or ``None`` when nothing is
            selected. Its ``name`` attribute is used as the path on disk; a
            plain path string is accepted as well.

    Returns:
        A Markdown string: a summary table on success, otherwise a message
        starting with ``**Upload failed:**``.
    """
    if file is None:
        return "Please select a file to upload."
    try:
        path = getattr(file, "name", file)
        # Drop both POSIX and Windows directory components.
        display_name = path.split("/")[-1].split("\\")[-1]
        with open(path, "rb") as f:
            response = httpx.post(
                f"{API_BASE}/api/ingest",
                files={"file": (display_name, f)},
                timeout=120,
            )
        if response.status_code == 200:
            data = response.json()
            return (
                f"### Document Uploaded\n\n"
                f"| Detail | Value |\n"
                f"|--------|-------|\n"
                f"| **File** | {data['filename']} |\n"
                f"| **Chunks** | {data['num_chunks']} |\n"
                f"| **ID** | `{data['document_id'][:12]}...` |\n"
            )
        # Error bodies are not guaranteed to be JSON objects (proxy pages,
        # plain-text 500s). Fall back to the raw text instead of surfacing a
        # JSON decoding error that hides the real cause.
        fallback = response.text or f"HTTP {response.status_code}"
        try:
            body = response.json()
        except ValueError:
            body = None
        detail = body.get("detail", fallback) if isinstance(body, dict) else fallback
        return f"**Upload failed:** {detail}"
    except Exception as e:
        return f"**Upload failed:** {e}"
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def list_documents():
    """Fetch the indexed documents and shape them as rows for the documents table.

    Returns:
        A list of four-item rows: source, upper-cased document type, chunk
        count as text, and the first 12 characters of the document ID followed
        by "...". A single placeholder row is returned when no documents
        exist, and a single error row when the request fails.
    """
    try:
        response = httpx.get(f"{API_BASE}/api/documents", timeout=30)
        if response.status_code != 200:
            return [["Error loading", "", "", ""]]
        documents = response.json().get("documents", [])
        if not documents:
            return [["—", "—", "—", "—"]]
        rows = []
        for doc in documents:
            rows.append(
                [
                    doc.get("source", ""),
                    doc.get("doc_type", "").upper(),
                    str(doc.get("num_chunks", 0)),
                    doc.get("document_id", "")[:12] + "...",
                ]
            )
        return rows
    except Exception as e:
        return [[f"Error: {e}", "", "", ""]]
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def get_doc_count():
    """Build the one-line Markdown summary shown in the stats bar.

    Returns:
        The document and chunk totals on success, "Unable to fetch stats" for
        a non-200 response, and "Connecting..." when the request raises.
    """
    try:
        response = httpx.get(f"{API_BASE}/api/documents", timeout=10)
        if response.status_code != 200:
            return "Unable to fetch stats"
        payload = response.json()
        doc_total = payload.get("total", 0)
        chunk_total = 0
        for entry in payload.get("documents", []):
            chunk_total += entry.get("num_chunks", 0)
        return f"**{doc_total}** documents | **{chunk_total}** chunks indexed"
    except Exception:
        return "Connecting..."
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def delete_document(doc_id):
    """Ask the API to delete one document and describe the result as Markdown.

    Args:
        doc_id: Document ID as typed by the user; surrounding whitespace is
            ignored. Empty or blank input is rejected without a request.

    Returns:
        A confirmation, the raw error body for a non-200 response, or a
        "Delete failed" message when the request raises.
    """
    cleaned = doc_id.strip() if doc_id else ""
    if not cleaned:
        return "Enter a document ID to delete."
    try:
        response = httpx.delete(f"{API_BASE}/api/documents/{cleaned}", timeout=30)
        if response.status_code != 200:
            return f"**Error:** {response.text}"
        return f"Document `{cleaned[:12]}...` deleted successfully."
    except Exception as e:
        return f"**Delete failed:** {e}"
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def ask_question(query, doc_type_filter, stream_mode):
    """Send a question to ``/api/ask`` and yield Markdown for the answer pane.

    This is a generator so the UI can re-render while a streamed answer grows.

    Args:
        query: Question text; blank input yields a prompt and stops.
        doc_type_filter: Dropdown value; anything other than "All" (or empty)
            is lower-cased and sent as a ``doc_type`` filter.
        stream_mode: When true, the response is consumed as "data: " prefixed
            lines, each carrying a JSON object.

    Yields:
        Markdown/HTML strings: partial answers while streaming, then the final
        answer with sources and a model/timing footer, or an ``**Error:**``
        message.
    """
    if not query or not query.strip():
        yield "Please enter a question."
        return

    payload = {
        "query": query.strip(),
        "top_k": 10,
        "rerank_top_k": 5,
        "stream": stream_mode,
    }

    if doc_type_filter and doc_type_filter != "All":
        payload["filters"] = {"doc_type": doc_type_filter.lower()}

    try:
        if stream_mode:
            with httpx.stream(
                "POST",
                f"{API_BASE}/api/ask",
                json=payload,
                timeout=120,
            ) as response:
                if response.status_code != 200:
                    # An error body has no "data: " lines, so without this
                    # check nothing would be yielded and the pane would stay
                    # blank. A streamed body must be read before .text works.
                    response.read()
                    yield f"**Error:** {response.text}"
                    return
                answer = ""
                for line in response.iter_lines():
                    if not line.startswith("data: "):
                        continue
                    data = json.loads(line[6:])
                    if "text" in data:
                        answer += data["text"]
                        yield (
                            f"<div class='answer-box'>\n\n{answer}\n\n</div>\n\n"
                            f"<sub>Generating...</sub>"
                        )
                    if data.get("done"):
                        sources = data.get("sources", [])
                        sources_text = _format_sources(sources)
                        time_ms = data.get("time_ms", 0)
                        model = data.get("model", "")
                        footer = f"\n\n<sub>{model} | {time_ms:.0f}ms</sub>"
                        yield (
                            f"<div class='answer-box'>\n\n{answer}\n\n</div>"
                            f"{sources_text}{footer}"
                        )
        else:
            response = httpx.post(
                f"{API_BASE}/api/ask",
                json=payload,
                timeout=120,
            )
            if response.status_code == 200:
                data = response.json()
                answer = data.get("answer", "No answer generated.")
                sources = data.get("sources", [])
                sources_text = _format_sources_full(sources)
                time_ms = data.get("generation_time_ms", 0)
                model = data.get("model", "")
                footer = f"\n\n<sub>{model} | {time_ms:.0f}ms</sub>"
                yield (
                    f"<div class='answer-box'>\n\n{answer}\n\n</div>"
                    f"{sources_text}{footer}"
                )
            else:
                yield f"**Error:** {response.text}"
    except Exception as e:
        yield f"**Error:** {e}"
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
def _format_sources(sources):
|
| 243 |
+
if not sources:
|
| 244 |
+
return ""
|
| 245 |
+
text = "\n\n---\n#### Sources\n\n"
|
| 246 |
+
for i, s in enumerate(sources, 1):
|
| 247 |
+
source_name = s.get("source", "unknown")
|
| 248 |
+
score = s.get("score", 0)
|
| 249 |
+
snippet = s.get("text", "")[:120].replace("\n", " ")
|
| 250 |
+
text += (
|
| 251 |
+
f"<div class='source-card'>\n\n"
|
| 252 |
+
f"**[{i}]** `{source_name}` — relevance: {score:.3f}\n\n"
|
| 253 |
+
f"> {snippet}...\n\n"
|
| 254 |
+
f"</div>\n\n"
|
| 255 |
+
)
|
| 256 |
+
return text
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
def _format_sources_full(sources):
|
| 260 |
+
if not sources:
|
| 261 |
+
return ""
|
| 262 |
+
text = "\n\n---\n#### Sources\n\n"
|
| 263 |
+
for i, s in enumerate(sources, 1):
|
| 264 |
+
meta = s.get("metadata", {})
|
| 265 |
+
source_name = meta.get("source", "unknown")
|
| 266 |
+
score = s.get("score", 0)
|
| 267 |
+
snippet = s.get("text", "")[:120].replace("\n", " ")
|
| 268 |
+
text += (
|
| 269 |
+
f"<div class='source-card'>\n\n"
|
| 270 |
+
f"**[{i}]** `{source_name}` — relevance: {score:.3f}\n\n"
|
| 271 |
+
f"> {snippet}...\n\n"
|
| 272 |
+
f"</div>\n\n"
|
| 273 |
+
)
|
| 274 |
+
return text
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
def create_gradio_app() -> gr.Blocks:
    """Build the RagCore Gradio interface.

    The layout has a header, a stats bar, an "Ask" tab (question box,
    document-type filter, streaming toggle, examples) and a "Documents" tab
    (upload, document table, delete), followed by a footer.

    The document table is filled on page load, and both the table and the
    stats bar are refreshed after every upload or delete so they do not show
    stale data.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """
    with gr.Blocks(title="RagCore — Smart Document Q&A") as demo:

        # Inject CSS via style tag since Gradio 6.x doesn't accept css in Blocks()
        gr.HTML(f"<style>{CUSTOM_CSS}</style>")

        # Header
        gr.HTML(
            """
            <div class="main-header">
                <h1>RagCore</h1>
                <p>Smart Document Q&A — Hybrid Search + Gemini Flash</p>
            </div>
            """
        )

        # Stats bar
        stats_display = gr.Markdown(value="Connecting...", elem_classes=["stat-card"])
        demo.load(fn=get_doc_count, outputs=stats_display)

        with gr.Tab("Ask", elem_id="ask-tab"):
            gr.Markdown("#### Ask your documents anything")

            with gr.Group():
                query_input = gr.Textbox(
                    placeholder="e.g. What are the key findings? / Summarize the report / Compare approaches...",
                    lines=2,
                    show_label=False,
                    elem_classes=["search-bar"],
                    container=False,
                )

                with gr.Row(elem_classes=["filter-row"]):
                    doc_type_filter = gr.Dropdown(
                        choices=["All", "PDF", "TXT", "HTML"],
                        value="All",
                        label="Document Type",
                        scale=1,
                        min_width=120,
                    )
                    stream_toggle = gr.Checkbox(
                        label="Stream response",
                        value=True,
                        scale=1,
                    )
                    ask_btn = gr.Button(
                        "Ask",
                        variant="primary",
                        scale=1,
                        min_width=120,
                        elem_classes=["primary-btn"],
                    )

            answer_output = gr.Markdown(
                value="*Upload a document and ask a question to get started.*",
            )

            # The button and the Enter key trigger the same handler.
            ask_btn.click(
                fn=ask_question,
                inputs=[query_input, doc_type_filter, stream_toggle],
                outputs=answer_output,
            )
            query_input.submit(
                fn=ask_question,
                inputs=[query_input, doc_type_filter, stream_toggle],
                outputs=answer_output,
            )

            gr.Markdown("#### Try these examples")
            gr.Examples(
                examples=[
                    ["What are the key points in the uploaded documents?"],
                    ["Summarize all documents"],
                    ["Compare the main topics across all documents"],
                    ["List the most important findings"],
                ],
                inputs=query_input,
            )

        with gr.Tab("Documents", elem_id="docs-tab"):
            gr.Markdown("#### Upload & Manage Documents")

            with gr.Row():
                with gr.Column(scale=3):
                    file_upload = gr.File(
                        label="Drop your file here",
                        file_types=[".pdf", ".txt", ".html", ".htm"],
                        elem_classes=["upload-zone"],
                    )
                with gr.Column(scale=1, min_width=160):
                    upload_btn = gr.Button(
                        "Upload & Index",
                        variant="primary",
                        elem_classes=["primary-btn"],
                        size="lg",
                    )
                    gr.Markdown(
                        "<sub>Supported: PDF, TXT, HTML</sub>"
                    )

            upload_status = gr.Markdown()
            upload_event = upload_btn.click(
                fn=upload_document,
                inputs=file_upload,
                outputs=upload_status,
            )

            gr.Markdown("---")
            gr.Markdown("#### Indexed Documents")

            doc_table = gr.Dataframe(
                headers=["Filename", "Type", "Chunks", "Document ID"],
                label="",
                interactive=False,
                wrap=True,
                elem_classes=["doc-table"],
            )
            refresh_btn = gr.Button("Refresh", size="sm")
            refresh_btn.click(fn=list_documents, outputs=doc_table)
            # Show the current documents immediately instead of an empty table.
            demo.load(fn=list_documents, outputs=doc_table)

            gr.Markdown("---")
            gr.Markdown("#### Delete a Document")
            with gr.Row():
                delete_id_input = gr.Textbox(
                    placeholder="Paste full document ID here...",
                    show_label=False,
                    scale=3,
                )
                delete_btn = gr.Button(
                    "Delete",
                    variant="stop",
                    scale=1,
                    elem_classes=["danger-btn"],
                )
            delete_status = gr.Markdown()
            delete_event = delete_btn.click(
                fn=delete_document,
                inputs=delete_id_input,
                outputs=delete_status,
            )

            # Uploads and deletes change the index, so refresh the table and
            # the stats bar once each of them has finished.
            for event in (upload_event, delete_event):
                event.then(fn=list_documents, outputs=doc_table).then(
                    fn=get_doc_count, outputs=stats_display
                )

        # Footer
        gr.HTML(
            """
            <div style="text-align:center; padding: 1rem 0 0.5rem 0; color: #9ca3af; font-size: 0.8rem;">
                RagCore v0.1.0
            </div>
            """
        )

    return demo
|
app/utils/__init__.py
ADDED
|
File without changes
|
app/utils/helpers.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import time
|
| 3 |
+
import uuid
|
| 4 |
+
import logging
|
| 5 |
+
from contextlib import contextmanager
|
| 6 |
+
from functools import wraps
|
| 7 |
+
|
| 8 |
+
logger = logging.getLogger(__name__)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def generate_id() -> str:
    """Return a fresh random (version 4) UUID in its canonical string form."""
    new_uuid = uuid.uuid4()
    return str(new_uuid)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def count_words(text: str) -> int:
    """Count the whitespace-separated tokens in *text*."""
    words = text.split()
    return len(words)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def clean_text(text: str) -> str:
    """Normalize whitespace in extracted document text.

    Runs of spaces and tabs become a single space, every line is stripped,
    line endings are normalized to ``\\n``, runs of three or more newlines are
    reduced to one blank line, and the result is stripped.

    Args:
        text: Raw text as produced by a parser.

    Returns:
        The cleaned text; an empty string for empty or whitespace-only input.
    """
    text = re.sub(r"[ \t]+", " ", text)
    stripped = "\n".join(line.strip() for line in text.splitlines())
    # Collapse blank-line runs only after stripping: whitespace-only lines and
    # CRLF endings would otherwise hide the run from the pattern and leave
    # three or more consecutive newlines in the output.
    return re.sub(r"\n{3,}", "\n\n", stripped).strip()
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@contextmanager
def timer(label: str = "operation"):
    """Time the enclosed block and log the total once it finishes.

    Yields a zero-argument callable that returns the milliseconds elapsed so
    far. The log line is written only when the block exits without raising.
    """
    started_at = time.perf_counter()

    def elapsed_ms():
        return (time.perf_counter() - started_at) * 1000

    yield elapsed_ms
    elapsed = (time.perf_counter() - started_at) * 1000
    logger.info(f"{label} completed in {elapsed:.1f}ms")
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def retry_with_backoff(retries: int = 3, base_delay: float = 1.0):
    """Build a decorator that retries a failing call with exponential backoff.

    Args:
        retries: Total number of attempts. Values below 1 are treated as 1 so
            the wrapped function is always called at least once.
        base_delay: Seconds slept after the first failure; the delay doubles
            after each further failure.

    Returns:
        A decorator. The wrapped function returns the first successful result
        and re-raises the last exception once all attempts are used up.
    """
    # range(0) would skip the call entirely and silently return None.
    attempts = max(1, retries)

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(attempts):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    if attempt == attempts - 1:
                        raise
                    delay = base_delay * (2 ** attempt)
                    logger.warning(
                        f"{func.__name__} failed (attempt {attempt + 1}/{attempts}): {e}. "
                        f"Retrying in {delay}s..."
                    )
                    time.sleep(delay)

        return wrapper

    return decorator
|
app/utils/parsers.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
from app.utils.helpers import clean_text
|
| 5 |
+
|
| 6 |
+
logger = logging.getLogger(__name__)
|
| 7 |
+
|
| 8 |
+
SUPPORTED_EXTENSIONS = {".pdf", ".txt", ".html", ".htm"}
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def parse_pdf(file_bytes: bytes, filename: str) -> str:
    """Extract and clean the text of a PDF given as raw bytes.

    Pages with no extractable text are skipped; the remaining page texts are
    joined with blank lines. Any failure is logged and turned into an empty
    string rather than raised.

    Args:
        file_bytes: The PDF file content.
        filename: Name used in log messages only.

    Returns:
        The cleaned text, or "" when parsing fails.
    """
    try:
        from pypdf import PdfReader
        from io import BytesIO

        reader = PdfReader(BytesIO(file_bytes))
        page_texts = (page.extract_text() for page in reader.pages)
        raw = "\n\n".join(content for content in page_texts if content)
        logger.info(f"Parsed PDF '{filename}': {len(reader.pages)} pages, {len(raw)} chars")
        return clean_text(raw)
    except Exception as e:
        logger.error(f"Failed to parse PDF '{filename}': {e}")
        return ""
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def parse_text(file_bytes: bytes, filename: str) -> str:
    """Decode and clean a plain-text file given as raw bytes.

    The bytes are decoded as UTF-8 (a leading byte-order mark is dropped);
    if that fails they are decoded as Latin-1, which accepts any byte.

    Args:
        file_bytes: The file content.
        filename: Name used in log messages only.

    Returns:
        The cleaned text.
    """
    try:
        # "utf-8-sig" behaves like "utf-8" but strips a leading BOM, which
        # would otherwise survive as "\ufeff" at the start of the text.
        text = file_bytes.decode("utf-8-sig")
    except UnicodeDecodeError:
        text = file_bytes.decode("latin-1")
    logger.info(f"Parsed text '{filename}': {len(text)} chars")
    return clean_text(text)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def parse_html(file_bytes: bytes, filename: str) -> str:
    """Extract and clean the text of an HTML document given as raw bytes.

    ``script``, ``style``, ``nav``, ``footer`` and ``header`` elements are
    removed before the text is collected. Any failure is logged and turned
    into an empty string rather than raised.

    Args:
        file_bytes: The HTML file content.
        filename: Name used in log messages only.

    Returns:
        The cleaned text, or "" when parsing fails.
    """
    try:
        from bs4 import BeautifulSoup

        document = BeautifulSoup(file_bytes, "html.parser")
        unwanted = document(["script", "style", "nav", "footer", "header"])
        for element in unwanted:
            element.decompose()
        visible = document.get_text(separator="\n")
        logger.info(f"Parsed HTML '{filename}': {len(visible)} chars")
        return clean_text(visible)
    except Exception as e:
        logger.error(f"Failed to parse HTML '{filename}': {e}")
        return ""
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def parse_document(file_bytes: bytes, filename: str) -> str:
|
| 55 |
+
ext = Path(filename).suffix.lower()
|
| 56 |
+
if ext == ".pdf":
|
| 57 |
+
return parse_pdf(file_bytes, filename)
|
| 58 |
+
elif ext in (".html", ".htm"):
|
| 59 |
+
return parse_html(file_bytes, filename)
|
| 60 |
+
elif ext == ".txt":
|
| 61 |
+
return parse_text(file_bytes, filename)
|
| 62 |
+
else:
|
| 63 |
+
logger.warning(f"Unsupported file type '{ext}' for '{filename}'")
|
| 64 |
+
return ""
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def get_page_count(file_bytes: bytes, filename: str) -> int | None:
|
| 68 |
+
ext = Path(filename).suffix.lower()
|
| 69 |
+
if ext == ".pdf":
|
| 70 |
+
try:
|
| 71 |
+
from pypdf import PdfReader
|
| 72 |
+
from io import BytesIO
|
| 73 |
+
return len(PdfReader(BytesIO(file_bytes)).pages)
|
| 74 |
+
except Exception:
|
| 75 |
+
return None
|
| 76 |
+
return None
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version: "3.8"
|
| 2 |
+
services:
|
| 3 |
+
ragcore:
|
| 4 |
+
build: .
|
| 5 |
+
ports:
|
| 6 |
+
- "8000:7860"
|
| 7 |
+
env_file:
|
| 8 |
+
- .env
|
| 9 |
+
environment:
|
| 10 |
+
- PYTHONUNBUFFERED=1
|
requirements.txt
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi>=0.110,<1.0
|
| 2 |
+
uvicorn[standard]>=0.29
|
| 3 |
+
python-dotenv>=1.0
|
| 4 |
+
pydantic>=2.6
|
| 5 |
+
pydantic-settings>=2.2
|
| 6 |
+
sentence-transformers>=2.6
|
| 7 |
+
qdrant-client>=1.8
|
| 8 |
+
rank-bm25>=0.2.2
|
| 9 |
+
FlashRank>=0.2
|
| 10 |
+
google-generativeai>=0.5
|
| 11 |
+
gradio>=4.20
|
| 12 |
+
pypdf>=4.1
|
| 13 |
+
beautifulsoup4>=4.12
|
| 14 |
+
httpx>=0.27
|
| 15 |
+
python-multipart>=0.0.9
|
| 16 |
+
python-dateutil>=2.9
|
| 17 |
+
ruff>=0.3
|
| 18 |
+
pytest>=8.0
|
tests/__init__.py
ADDED
|
File without changes
|
tests/conftest.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
from fastapi.testclient import TestClient
|
| 3 |
+
|
| 4 |
+
from app.main import app
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
@pytest.fixture
def client():
    """Provide a TestClient bound to the FastAPI application."""
    test_client = TestClient(app)
    return test_client
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@pytest.fixture
def sample_text():
    """Provide a short four-sentence paragraph about RAG for use as test input."""
    sentences = [
        "Retrieval-Augmented Generation (RAG) is a technique that combines "
        "information retrieval with text generation.",
        "It was introduced by Facebook AI Research in 2020.",
        "RAG systems first retrieve relevant documents from a knowledge base, "
        "then use a language model to generate answers based on those documents.",
        "This approach reduces hallucinations and provides more factual "
        "responses compared to pure generation.",
    ]
    return " ".join(sentences)
|
tests/test_api.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi.testclient import TestClient
|
| 2 |
+
|
| 3 |
+
from app.main import app
|
| 4 |
+
|
| 5 |
+
client = TestClient(app)
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def test_health():
    """The health endpoint answers 200 with status "ok" and a components entry."""
    resp = client.get("/health")
    assert resp.status_code == 200
    body = resp.json()
    assert body["status"] == "ok"
    assert "components" in body
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def test_root_redirects():
    """Requesting "/" without following redirects yields a redirect status."""
    redirect_codes = (301, 302, 307, 308)
    resp = client.get("/", follow_redirects=False)
    assert resp.status_code in redirect_codes
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def test_docs_page():
    """The "/docs" page is served with a 200 status."""
    status = client.get("/docs").status_code
    assert status == 200
|
tests/test_chunker.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.core.chunker import chunk_text
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def test_empty_text():
    """Empty and blank input both produce no chunks."""
    no_chunks = []
    assert chunk_text("") == no_chunks
    assert chunk_text(" ") == no_chunks
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def test_single_sentence():
    """A sentence shorter than the chunk size comes back as one chunk at index 0."""
    sentence = "This is a single sentence."
    result = chunk_text(sentence, chunk_size=100)
    assert len(result) == 1
    only = result[0]
    assert only["text"] == "This is a single sentence."
    assert only["chunk_index"] == 0
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def test_multiple_chunks():
    """Long input is split into several sequentially indexed, non-empty chunks."""
    ordinals = ("First", "Second", "Third", "Fourth", "Fifth")
    text = " ".join(f"{ordinal} sentence here." for ordinal in ordinals)
    result = chunk_text(text, chunk_size=5, chunk_overlap=2)
    assert len(result) > 1
    for position, piece in enumerate(result):
        assert piece["chunk_index"] == position
        assert piece["text"]
        assert piece["start_char"] >= 0
        assert piece["end_char"] > piece["start_char"]
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def test_overlap_present():
    """When more than one chunk is produced, neighbours share at least one boundary word.

    Nothing is asserted if the text ends up in a single chunk.
    """
    text = "Alpha bravo charlie delta. Echo foxtrot golf hotel. India juliet kilo lima."
    result = chunk_text(text, chunk_size=4, chunk_overlap=2)
    if len(result) <= 1:
        return
    tail_of_first = set(result[0]["text"].split()[-2:])
    head_of_second = set(result[1]["text"].split()[:2])
    shared = tail_of_first & head_of_second
    assert len(shared) > 0
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def test_chunk_size_respected():
    """Every chunk except the last stays within the size limit plus some slack."""
    text = ("word " * 100).rstrip() + "."
    result = chunk_text(text, chunk_size=20, chunk_overlap=5)
    # The last chunk can be smaller, so it is excluded from the check.
    for piece in result[:-1]:
        # Allow some slack for sentence boundaries.
        assert len(piece["text"].split()) <= 25
|
tests/test_parsers.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.utils.parsers import parse_document, parse_text, parse_html
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def test_parse_text_utf8():
    """UTF-8 encoded text is decoded and returned."""
    payload = "Hello, world! This is a test.".encode("utf-8")
    parsed = parse_text(payload, "test.txt")
    assert "Hello, world" in parsed
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def test_parse_text_latin1():
    """Latin-1 bytes that are not valid UTF-8 are decoded through the fallback.

    The whole string is compared so that mangled accented characters fail the
    test; checking only an ASCII fragment would pass regardless.
    """
    content = "Héllo wörld"
    result = parse_text(content.encode("latin-1"), "test.txt")
    assert result == content
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def test_parse_html():
    """Visible text is kept while script content is dropped."""
    markup = b"<html><body><p>Hello world</p><script>var x=1;</script></body></html>"
    parsed = parse_html(markup, "test.html")
    assert "Hello world" in parsed
    assert "var x" not in parsed
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def test_parse_document_unsupported():
    """An unknown extension yields an empty string."""
    assert parse_document(b"data", "test.xyz") == ""
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def test_parse_empty_text():
    """Empty input bytes yield an empty string."""
    assert parse_text(b"", "empty.txt") == ""
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def test_parse_document_dispatches_by_extension():
    """A ".txt" file is handled as plain text."""
    parsed = parse_document(b"Hello text file", "readme.txt")
    assert "Hello text file" in parsed
|
tests/test_query_analyzer.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.core.query_analyzer import QueryAnalyzer
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def test_intent_factual():
    """A "what is" question is classified as factual."""
    analysis = QueryAnalyzer().analyze("what is RAG?")
    assert analysis.intent == "factual"
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def test_intent_comparative():
    """A "compare" request is classified as comparative."""
    analysis = QueryAnalyzer().analyze("compare BM25 and dense search")
    assert analysis.intent == "comparative"
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def test_intent_summarize():
    """A "summarize" request is classified as summarize."""
    analysis = QueryAnalyzer().analyze("summarize the report")
    assert analysis.intent == "summarize"
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def test_intent_explanatory():
    """A "why" question is classified as explanatory."""
    analysis = QueryAnalyzer().analyze("why is RAG useful?")
    assert analysis.intent == "explanatory"
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def test_doctype_extraction():
    """Expect a query mentioning 'PDFs' to produce a doc_type filter of 'pdf'."""
    analysis = QueryAnalyzer().analyze("search PDFs about machine learning")
    assert analysis.extracted_filters.doc_type == "pdf"
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def test_no_filters():
    """Expect a plain question to yield no filters and an unchanged query."""
    analysis = QueryAnalyzer().analyze("what is machine learning?")
    filters = analysis.extracted_filters
    # Neither a document type nor a source should have been extracted.
    assert filters.doc_type is None
    assert filters.source is None
    # With nothing extracted, the cleaned query must equal the original.
    assert analysis.clean_query == analysis.original_query
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def test_date_extraction_last_month():
    """Expect 'last month' in a query to populate both date range bounds."""
    analysis = QueryAnalyzer().analyze("documents from last month")
    filters = analysis.extracted_filters
    # Only presence is checked; the exact dates depend on when the test runs.
    assert filters.date_from is not None
    assert filters.date_to is not None
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def test_clean_query_preserves_meaning():
    """Expect the cleaned query to still contain the phrase 'machine learning'."""
    analysis = QueryAnalyzer().analyze("what is machine learning?")
    assert "machine learning" in analysis.clean_query
|
tests/test_retrieval.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.core.retriever import HybridRetriever
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def test_rrf_fusion_basic():
    """Expect fusion of two overlapping rankings to dedupe and favour the overlap."""
    dense_hits = [
        {"chunk_id": "a", "text": "doc a", "score": 0.9, "metadata": {}},
        {"chunk_id": "b", "text": "doc b", "score": 0.8, "metadata": {}},
    ]
    sparse_hits = [
        {"chunk_id": "b", "text": "doc b", "score": 5.0, "metadata": {}},
        {"chunk_id": "c", "text": "doc c", "score": 4.0, "metadata": {}},
    ]
    merged = HybridRetriever.rrf_fuse([dense_hits, sparse_hits])
    ranked_ids = [hit["chunk_id"] for hit in merged]
    # Chunk "b" is the only one present in both rankings, so it must come first.
    assert ranked_ids[0] == "b"
    # Three distinct chunk ids went in ("a", "b", "c"), so three results come out.
    assert len(merged) == 3
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def test_rrf_fusion_empty():
    """Expect fusing two empty rankings to produce an empty list."""
    assert HybridRetriever.rrf_fuse([[], []]) == []
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def test_rrf_fusion_single_list():
    """Expect fusing a single one-item ranking to return that one item."""
    only_ranking = [
        {"chunk_id": "x", "text": "x", "score": 1.0, "metadata": {}},
    ]
    merged = HybridRetriever.rrf_fuse([only_ranking])
    assert len(merged) == 1
    assert merged[0]["chunk_id"] == "x"
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def test_rrf_fusion_with_weights():
    """Expect a zero weight on the second ranking to let the first ranking win."""
    dense_hits = [
        {"chunk_id": "a", "text": "a", "score": 0.9, "metadata": {}},
    ]
    sparse_hits = [
        {"chunk_id": "b", "text": "b", "score": 5.0, "metadata": {}},
    ]
    merged = HybridRetriever.rrf_fuse(
        [dense_hits, sparse_hits],
        weights=[1.0, 0.0],
    )
    # The sparse ranking carries weight 0.0, so the dense-only chunk leads.
    assert merged[0]["chunk_id"] == "a"
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def test_apply_filters():
    """Expect a doc_type='pdf' filter to keep only the PDF-typed result."""
    from app.models.schemas import SearchFilters

    candidates = [
        {"chunk_id": "1", "text": "t", "score": 1, "metadata": {"doc_type": "pdf", "source": "a.pdf", "tags": []}},
        {"chunk_id": "2", "text": "t", "score": 1, "metadata": {"doc_type": "html", "source": "b.html", "tags": []}},
    ]
    pdf_only = SearchFilters(doc_type="pdf")
    kept = HybridRetriever._apply_filters(candidates, pdf_only)
    # Only chunk "1" has doc_type "pdf" in its metadata.
    assert len(kept) == 1
    assert kept[0]["chunk_id"] == "1"
|