zhangju2023 commited on
Commit
7834287
·
verified ·
1 Parent(s): c9f7073

CI: sync project3-document-qa to Space

Browse files
Files changed (3) hide show
  1. README.md +54 -0
  2. app.py +38 -10
  3. requirements.txt +1 -0
README.md CHANGED
@@ -24,6 +24,8 @@ A RAG (Retrieval-Augmented Generation) application using:
24
  ```bash
25
  cd project3-document-qa
26
  pip install -r requirements.txt
 
 
27
  python app.py
28
  # Then open http://localhost:7860
29
  ```
@@ -92,3 +94,55 @@ Notes:
92
  - First build may take a while to download models.
93
  - You can switch to a lighter LLM in `app.py` (e.g., `sshleifer/tiny-gpt2`) if needed.
94
  - For production, consider larger models via Inference Endpoints or OpenAI with an API key.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  ```bash
25
  cd project3-document-qa
26
  pip install -r requirements.txt
27
+ export CHROMA_PERSIST_DIR="chroma_store" # optional; defaults internally (path is relative to this directory)
28
+ python ../scripts/refresh_embeddings.py --persist-dir "$CHROMA_PERSIST_DIR" --reset
29
  python app.py
30
  # Then open http://localhost:7860
31
  ```
 
94
  - First build may take a while to download models.
95
  - You can switch to a lighter LLM in `app.py` (e.g., `sshleifer/tiny-gpt2`) if needed.
96
  - For production, consider larger models via Inference Endpoints or OpenAI with an API key.
97
+
98
+ ## Embedding Refresh (dbt-integrated)
99
+
100
+ The RAG layer can ingest curated documents produced by the dbt data platform (`document_index` mart).
101
+
102
+ ### Prerequisites
103
+ 1. Run dbt to build the `document_index` view:
104
+ ```bash
105
+ cd data-platform/dbt
106
+ dbt seed && dbt run
107
+ ```
108
+ 2. Ensure the DuckDB file exists at `data-platform/dbt/warehouse/data.duckdb`.
109
+
110
+ ### Build / Rebuild Vector Store
111
+ ```bash
112
+ export CHROMA_PERSIST_DIR="project3-document-qa/chroma_store" # choose any directory
113
+ python scripts/refresh_embeddings.py --persist-dir "$CHROMA_PERSIST_DIR" --reset
114
+ ```
115
+
116
+ Flags:
117
+ - `--reset` (optional): clears existing collection before loading
118
+ - `--limit N` (optional): ingest only first N rows for quick tests
119
+ - `--duckdb PATH`: override DuckDB file location
120
+ - `--collection NAME`: change Chroma collection name (default: `documents`)
121
+
122
+ ### Launch App Using Persisted Embeddings
123
+ ```bash
124
+ cd project3-document-qa
125
+ export CHROMA_PERSIST_DIR="chroma_store" # relative to the current directory; matches the store built above at project3-document-qa/chroma_store
126
+ python app.py
127
+ ```
128
+
129
+ At startup the app detects a persistent store and **skips adding sample documents**, using the dbt-derived corpus instead.
130
+
131
+ ### Updating Embeddings After dbt Changes
132
+ If you modify source seeds or transformation logic:
133
+ ```bash
134
+ cd data-platform/dbt
135
+ dbt seed && dbt run
136
+ cd ../../
137
+ python scripts/refresh_embeddings.py --persist-dir "$CHROMA_PERSIST_DIR" --reset
138
+ ```
139
+
140
+ ### Common Issues
141
+ | Symptom | Cause | Fix |
142
+ |---------|-------|-----|
143
+ | `DuckDB file not found` | dbt not executed | Run `dbt seed && dbt run` |
144
+ | Empty collection after refresh | `document_index` view missing | Confirm model name and rerun dbt |
145
+ | App still shows sample docs | Persist dir not set or empty | Export `CHROMA_PERSIST_DIR` and rebuild embeddings |
146
+ | Duplicate ID errors | Re-running without `--reset` and changed IDs | Use `--reset` for full rebuild |
147
+
148
+ ---
app.py CHANGED
@@ -11,6 +11,7 @@ from transformers import pipeline, AutoTokenizer
11
  from typing import List, Dict, Tuple
12
  import os
13
  from datetime import datetime
 
14
  import logging
15
 
16
  # Configure logging
@@ -39,20 +40,26 @@ class DocumentQASystem:
39
  self.embedding_model = SentenceTransformer(embedding_model_name)
40
 
41
  # Initialize ChromaDB
42
- logger.info("Initializing ChromaDB...")
43
- self.chroma_client = chromadb.Client(Settings(
44
- anonymized_telemetry=False,
45
- allow_reset=True
46
- ))
 
 
 
 
 
 
47
 
48
  # Create or get collection
49
  try:
 
 
50
  self.collection = self.chroma_client.create_collection(
51
  name=collection_name,
52
  metadata={"description": "Document knowledge base"}
53
  )
54
- except Exception:
55
- self.collection = self.chroma_client.get_collection(name=collection_name)
56
 
57
  # Initialize LLM
58
  logger.info("Loading language model: %s", llm_model_name)
@@ -367,8 +374,16 @@ def _preload_sample_documents_if_empty():
367
  logger.warning("Could not preload sample documents: %s", e)
368
 
369
 
370
- # Execute preload at import/startup
371
- _preload_sample_documents_if_empty()
 
 
 
 
 
 
 
 
372
 
373
 
374
  # Gradio Interface Functions
@@ -724,9 +739,22 @@ if __name__ == "__main__":
724
  print("🛠️ Tech: ChromaDB + SentenceTransformers + HuggingFace")
725
  print("=" * 60)
726
 
 
 
 
 
 
 
 
 
 
 
 
 
 
727
  demo.launch(
728
  server_name="0.0.0.0",
729
- server_port=7860,
730
  share=False,
731
  show_error=True
732
  )
 
11
  from typing import List, Dict, Tuple
12
  import os
13
  from datetime import datetime
14
+ import socket
15
  import logging
16
 
17
  # Configure logging
 
40
  self.embedding_model = SentenceTransformer(embedding_model_name)
41
 
42
  # Initialize ChromaDB
43
+ logger.info("Initializing ChromaDB (persistent if available)...")
44
+ # Resolve persistent directory: env or local folder next to this file
45
+ persist_dir_env = os.getenv("CHROMA_PERSIST_DIR")
46
+ persist_dir = persist_dir_env if persist_dir_env else os.path.join(os.path.dirname(__file__), "chroma_store")
47
+ use_persistent = os.path.isdir(persist_dir) and any(os.scandir(persist_dir))
48
+ if use_persistent:
49
+ logger.info("Using persistent directory: %s", persist_dir)
50
+ self.chroma_client = chromadb.PersistentClient(path=persist_dir, settings=Settings(anonymized_telemetry=False))
51
+ else:
52
+ logger.info("Persistent dir not found (%s); falling back to in-memory client", persist_dir)
53
+ self.chroma_client = chromadb.Client(Settings(anonymized_telemetry=False, allow_reset=True))
54
 
55
  # Create or get collection
56
  try:
57
+ self.collection = self.chroma_client.get_collection(collection_name)
58
+ except Exception:
59
  self.collection = self.chroma_client.create_collection(
60
  name=collection_name,
61
  metadata={"description": "Document knowledge base"}
62
  )
 
 
63
 
64
  # Initialize LLM
65
  logger.info("Loading language model: %s", llm_model_name)
 
374
  logger.warning("Could not preload sample documents: %s", e)
375
 
376
 
377
+ def _persistent_store_present() -> bool:
378
+ persist_dir_env = os.getenv("CHROMA_PERSIST_DIR")
379
+ persist_dir = persist_dir_env if persist_dir_env else os.path.join(os.path.dirname(__file__), "chroma_store")
380
+ return os.path.isdir(persist_dir) and any(os.scandir(persist_dir))
381
+
382
+ # Execute preload only if persistent store not already populated
383
+ if not _persistent_store_present():
384
+ _preload_sample_documents_if_empty()
385
+ else:
386
+ logger.info("🔒 Persistent Chroma store detected; skipping sample preload.")
387
 
388
 
389
  # Gradio Interface Functions
 
739
  print("🛠️ Tech: ChromaDB + SentenceTransformers + HuggingFace")
740
  print("=" * 60)
741
 
742
+ # Dynamic port selection fallback
743
+ desired_port = int(os.getenv("GRADIO_SERVER_PORT", "7860"))
744
+ port = desired_port
745
+ for _ in range(10): # try up to 10 sequential ports
746
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
747
+ try:
748
+ s.bind(("0.0.0.0", port))
749
+ s.close()
750
+ break # port is free
751
+ except OSError:
752
+ port += 1
753
+ if port != desired_port:
754
+ print(f"⚠️ Port {desired_port} busy; using fallback port {port}")
755
  demo.launch(
756
  server_name="0.0.0.0",
757
+ server_port=port,
758
  share=False,
759
  show_error=True
760
  )
requirements.txt CHANGED
@@ -12,6 +12,7 @@ sentence-transformers
12
  transformers>=4.35.0
13
  torch
14
  huggingface_hub<1.0
 
15
 
16
  # Data Processing
17
  pandas
 
12
  transformers>=4.35.0
13
  torch
14
  huggingface_hub<1.0
15
+ duckdb
16
 
17
  # Data Processing
18
  pandas