Spaces:
Sleeping
Sleeping
CI: sync project3-document-qa to Space
Browse files- README.md +54 -0
- app.py +38 -10
- requirements.txt +1 -0
README.md
CHANGED
|
@@ -24,6 +24,8 @@ A RAG (Retrieval-Augmented Generation) application using:
|
|
| 24 |
```bash
|
| 25 |
cd project3-document-qa
|
| 26 |
pip install -r requirements.txt
|
|
|
|
|
|
|
| 27 |
python app.py
|
| 28 |
# Then open http://localhost:7860
|
| 29 |
```
|
|
@@ -92,3 +94,55 @@ Notes:
|
|
| 92 |
- First build may take a while to download models.
|
| 93 |
- You can switch to a lighter LLM in `app.py` (e.g., `sshleifer/tiny-gpt2`) if needed.
|
| 94 |
- For production, consider larger models via Inference Endpoints or OpenAI with an API key.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
```bash
|
| 25 |
cd project3-document-qa
|
| 26 |
pip install -r requirements.txt
|
| 27 |
+
export CHROMA_PERSIST_DIR="$PWD/chroma_store" # optional; if unset, the app uses the chroma_store folder next to app.py
|
| 28 |
+
python ../scripts/refresh_embeddings.py --persist-dir "$CHROMA_PERSIST_DIR" --reset
|
| 29 |
python app.py
|
| 30 |
# Then open http://localhost:7860
|
| 31 |
```
|
|
|
|
| 94 |
- First build may take a while to download models.
|
| 95 |
- You can switch to a lighter LLM in `app.py` (e.g., `sshleifer/tiny-gpt2`) if needed.
|
| 96 |
- For production, consider larger models via Inference Endpoints or OpenAI with an API key.
|
| 97 |
+
|
| 98 |
+
## Embedding Refresh (dbt-integrated)
|
| 99 |
+
|
| 100 |
+
The RAG layer can ingest curated documents produced by the dbt data platform (`document_index` mart).
|
| 101 |
+
|
| 102 |
+
### Pre-requisites
|
| 103 |
+
1. Run dbt to build the `document_index` view:
|
| 104 |
+
```bash
|
| 105 |
+
cd data-platform/dbt
|
| 106 |
+
dbt seed && dbt run
|
| 107 |
+
```
|
| 108 |
+
2. Ensure the DuckDB file exists at `data-platform/dbt/warehouse/data.duckdb`.
|
| 109 |
+
|
| 110 |
+
### Build / Rebuild Vector Store
|
| 111 |
+
```bash
|
| 112 |
+
export CHROMA_PERSIST_DIR="project3-document-qa/chroma_store" # run from the repository root; any directory works
|
| 113 |
+
python scripts/refresh_embeddings.py --persist-dir "$CHROMA_PERSIST_DIR" --reset
|
| 114 |
+
```
|
| 115 |
+
|
| 116 |
+
Flags:
|
| 117 |
+
- `--reset` (optional): clears existing collection before loading
|
| 118 |
+
- `--limit N` (optional): ingest only first N rows for quick tests
|
| 119 |
+
- `--duckdb PATH`: override DuckDB file location
|
| 120 |
+
- `--collection NAME`: change Chroma collection name (default: `documents`)
|
| 121 |
+
|
| 122 |
+
### Launch App Using Persisted Embeddings
|
| 123 |
+
```bash
|
| 124 |
+
cd project3-document-qa
|
| 125 |
+
export CHROMA_PERSIST_DIR="$PWD/chroma_store" # must be the same directory the embeddings were built into
|
| 126 |
+
python app.py
|
| 127 |
+
```
|
| 128 |
+
|
| 129 |
+
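At startup the app checks whether the persist directory exists and is non-empty. If it is, the app opens it as a persistent Chroma store and **skips adding sample documents**, using the dbt-derived corpus instead; otherwise it falls back to an in-memory client and preloads the sample documents.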
|
| 130 |
+
|
| 131 |
+
### Updating Embeddings After dbt Changes
|
| 132 |
+
If you modify source seeds or transformation logic:
|
| 133 |
+
```bash
|
| 134 |
+
cd data-platform/dbt
|
| 135 |
+
dbt seed && dbt run
|
| 136 |
+
cd ../../
|
| 137 |
+
python scripts/refresh_embeddings.py --persist-dir "$CHROMA_PERSIST_DIR" --reset
|
| 138 |
+
```
|
| 139 |
+
|
| 140 |
+
### Common Issues
|
| 141 |
+
| Symptom | Cause | Fix |
|
| 142 |
+
|---------|-------|-----|
|
| 143 |
+
| `DuckDB file not found` | dbt not executed | Run `dbt seed && dbt run` |
|
| 144 |
+
| Empty collection after refresh | `document_index` view missing | Confirm model name and rerun dbt |
|
| 145 |
+
| App still shows sample docs | Persist dir not set or empty | Export `CHROMA_PERSIST_DIR` and rebuild embeddings |
|
| 146 |
+
| Duplicate ID errors | Re-running without `--reset` and changed IDs | Use `--reset` for full rebuild |
|
| 147 |
+
|
| 148 |
+
---
|
app.py
CHANGED
|
@@ -11,6 +11,7 @@ from transformers import pipeline, AutoTokenizer
|
|
| 11 |
from typing import List, Dict, Tuple
|
| 12 |
import os
|
| 13 |
from datetime import datetime
|
|
|
|
| 14 |
import logging
|
| 15 |
|
| 16 |
# Configure logging
|
|
@@ -39,20 +40,26 @@ class DocumentQASystem:
|
|
| 39 |
self.embedding_model = SentenceTransformer(embedding_model_name)
|
| 40 |
|
| 41 |
# Initialize ChromaDB
|
| 42 |
-
logger.info("Initializing ChromaDB...")
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
# Create or get collection
|
| 49 |
try:
|
|
|
|
|
|
|
| 50 |
self.collection = self.chroma_client.create_collection(
|
| 51 |
name=collection_name,
|
| 52 |
metadata={"description": "Document knowledge base"}
|
| 53 |
)
|
| 54 |
-
except Exception:
|
| 55 |
-
self.collection = self.chroma_client.get_collection(name=collection_name)
|
| 56 |
|
| 57 |
# Initialize LLM
|
| 58 |
logger.info("Loading language model: %s", llm_model_name)
|
|
@@ -367,8 +374,16 @@ def _preload_sample_documents_if_empty():
|
|
| 367 |
logger.warning("Could not preload sample documents: %s", e)
|
| 368 |
|
| 369 |
|
| 370 |
-
|
| 371 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
|
| 373 |
|
| 374 |
# Gradio Interface Functions
|
|
@@ -724,9 +739,22 @@ if __name__ == "__main__":
|
|
| 724 |
print("🛠️ Tech: ChromaDB + SentenceTransformers + HuggingFace")
|
| 725 |
print("=" * 60)
|
| 726 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 727 |
demo.launch(
|
| 728 |
server_name="0.0.0.0",
|
| 729 |
-
server_port=
|
| 730 |
share=False,
|
| 731 |
show_error=True
|
| 732 |
)
|
|
|
|
| 11 |
from typing import List, Dict, Tuple
|
| 12 |
import os
|
| 13 |
from datetime import datetime
|
| 14 |
+
import socket
|
| 15 |
import logging
|
| 16 |
|
| 17 |
# Configure logging
|
|
|
|
| 40 |
self.embedding_model = SentenceTransformer(embedding_model_name)
|
| 41 |
|
| 42 |
# Initialize ChromaDB
|
| 43 |
+
logger.info("Initializing ChromaDB (persistent if available)...")
|
| 44 |
+
# Resolve persistent directory: env or local folder next to this file
|
| 45 |
+
persist_dir_env = os.getenv("CHROMA_PERSIST_DIR")
|
| 46 |
+
persist_dir = persist_dir_env if persist_dir_env else os.path.join(os.path.dirname(__file__), "chroma_store")
|
| 47 |
+
use_persistent = os.path.isdir(persist_dir) and any(os.scandir(persist_dir))
|
| 48 |
+
if use_persistent:
|
| 49 |
+
logger.info("Using persistent directory: %s", persist_dir)
|
| 50 |
+
self.chroma_client = chromadb.PersistentClient(path=persist_dir, settings=Settings(anonymized_telemetry=False))
|
| 51 |
+
else:
|
| 52 |
+
logger.info("Persistent dir not found (%s); falling back to in-memory client", persist_dir)
|
| 53 |
+
self.chroma_client = chromadb.Client(Settings(anonymized_telemetry=False, allow_reset=True))
|
| 54 |
|
| 55 |
# Create or get collection
|
| 56 |
try:
|
| 57 |
+
self.collection = self.chroma_client.get_collection(collection_name)
|
| 58 |
+
except Exception:
|
| 59 |
self.collection = self.chroma_client.create_collection(
|
| 60 |
name=collection_name,
|
| 61 |
metadata={"description": "Document knowledge base"}
|
| 62 |
)
|
|
|
|
|
|
|
| 63 |
|
| 64 |
# Initialize LLM
|
| 65 |
logger.info("Loading language model: %s", llm_model_name)
|
|
|
|
| 374 |
logger.warning("Could not preload sample documents: %s", e)
|
| 375 |
|
| 376 |
|
| 377 |
+
def _persistent_store_present() -> bool:
|
| 378 |
+
persist_dir_env = os.getenv("CHROMA_PERSIST_DIR")
|
| 379 |
+
persist_dir = persist_dir_env if persist_dir_env else os.path.join(os.path.dirname(__file__), "chroma_store")
|
| 380 |
+
return os.path.isdir(persist_dir) and any(os.scandir(persist_dir))
|
| 381 |
+
|
| 382 |
+
# Execute preload only if persistent store not already populated
|
| 383 |
+
if not _persistent_store_present():
|
| 384 |
+
_preload_sample_documents_if_empty()
|
| 385 |
+
else:
|
| 386 |
+
logger.info("🔒 Persistent Chroma store detected; skipping sample preload.")
|
| 387 |
|
| 388 |
|
| 389 |
# Gradio Interface Functions
|
|
|
|
| 739 |
print("🛠️ Tech: ChromaDB + SentenceTransformers + HuggingFace")
|
| 740 |
print("=" * 60)
|
| 741 |
|
| 742 |
+
# Dynamic port selection fallback
|
| 743 |
+
desired_port = int(os.getenv("GRADIO_SERVER_PORT", "7860"))
|
| 744 |
+
port = desired_port
|
| 745 |
+
for _ in range(10): # try up to 10 sequential ports
|
| 746 |
+
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
| 747 |
+
try:
|
| 748 |
+
s.bind(("0.0.0.0", port))
|
| 749 |
+
s.close()
|
| 750 |
+
break # port is free
|
| 751 |
+
except OSError:
|
| 752 |
+
port += 1
|
| 753 |
+
if port != desired_port:
|
| 754 |
+
print(f"⚠️ Port {desired_port} busy; using fallback port {port}")
|
| 755 |
demo.launch(
|
| 756 |
server_name="0.0.0.0",
|
| 757 |
+
server_port=port,
|
| 758 |
share=False,
|
| 759 |
show_error=True
|
| 760 |
)
|
requirements.txt
CHANGED
|
@@ -12,6 +12,7 @@ sentence-transformers
|
|
| 12 |
transformers>=4.35.0
|
| 13 |
torch
|
| 14 |
huggingface_hub<1.0
|
|
|
|
| 15 |
|
| 16 |
# Data Processing
|
| 17 |
pandas
|
|
|
|
| 12 |
transformers>=4.35.0
|
| 13 |
torch
|
| 14 |
huggingface_hub<1.0
|
| 15 |
+
duckdb
|
| 16 |
|
| 17 |
# Data Processing
|
| 18 |
pandas
|