Commit ·
0d0ccfc
1
Parent(s): 06f6eb8
update: switch to PersistentClient and refresh Dockerfile
Browse files- Dockerfile +8 -9
- app.py +67 -45
- code wget step; clean Dockerfile +62 -0
Dockerfile
CHANGED
|
@@ -10,26 +10,28 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
|
|
| 10 |
PIP_ROOT_USER_ACTION=ignore \
|
| 11 |
PIP_DISABLE_PIP_VERSION_CHECK=1 \
|
| 12 |
HF_HOME=/data/.huggingface \
|
| 13 |
-
RAG_DB_DIR=/
|
| 14 |
RAG_CORPUS_DIR=/data/corpus \
|
| 15 |
RAG_DATASET_ID=internationalscholarsprogram/DOC \
|
| 16 |
RAG_DATASET_REVISION=main \
|
| 17 |
RAG_PORT=7860 \
|
| 18 |
PORT=7860 \
|
| 19 |
TOKENIZERS_PARALLELISM=false \
|
| 20 |
-
CHROMA_DB_IMPL=duckdb+parquet \
|
| 21 |
-
CHROMADB_TELEMETRY=false \
|
| 22 |
-
ANONYMIZED_TELEMETRY=false \
|
| 23 |
HF_HUB_DISABLE_TELEMETRY=1 \
|
| 24 |
CUDA_VISIBLE_DEVICES="" \
|
| 25 |
-
OMP_NUM_THREADS=1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
# --- System dependencies ---
|
| 28 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 29 |
tini wget curl ca-certificates tar git \
|
| 30 |
&& rm -rf /var/lib/apt/lists/*
|
| 31 |
|
| 32 |
-
# --- Non-root user (kept for reference
|
| 33 |
RUN useradd -m -u 1000 appuser || true
|
| 34 |
|
| 35 |
WORKDIR /app
|
|
@@ -46,9 +48,6 @@ COPY . .
|
|
| 46 |
RUN mkdir -p /data/chroma_db /data/.huggingface /data/corpus /tmp/chroma_db \
|
| 47 |
&& chmod -R 777 /data /app /tmp
|
| 48 |
|
| 49 |
-
# --- Optional: bootstrap script permissions ---
|
| 50 |
-
RUN if [ -f "bootstrap.sh" ]; then chmod +x bootstrap.sh; fi
|
| 51 |
-
|
| 52 |
# Do NOT switch user; keep root so /data and /tmp are writable in Spaces
|
| 53 |
# USER appuser
|
| 54 |
|
|
|
|
| 10 |
PIP_ROOT_USER_ACTION=ignore \
|
| 11 |
PIP_DISABLE_PIP_VERSION_CHECK=1 \
|
| 12 |
HF_HOME=/data/.huggingface \
|
| 13 |
+
RAG_DB_DIR=/data/chroma_db \
|
| 14 |
RAG_CORPUS_DIR=/data/corpus \
|
| 15 |
RAG_DATASET_ID=internationalscholarsprogram/DOC \
|
| 16 |
RAG_DATASET_REVISION=main \
|
| 17 |
RAG_PORT=7860 \
|
| 18 |
PORT=7860 \
|
| 19 |
TOKENIZERS_PARALLELISM=false \
|
|
|
|
|
|
|
|
|
|
| 20 |
HF_HUB_DISABLE_TELEMETRY=1 \
|
| 21 |
CUDA_VISIBLE_DEVICES="" \
|
| 22 |
+
OMP_NUM_THREADS=1 \
|
| 23 |
+
ORT_LOG_SEVERITY_LEVEL=3
|
| 24 |
+
|
| 25 |
+
# NOTE:
|
| 26 |
+
# - Removed legacy Chroma envs (CHROMA_DB_IMPL, CHROMADB_TELEMETRY, ANONYMIZED_TELEMETRY)
|
| 27 |
+
# since the new PersistentClient doesn’t use them.
|
| 28 |
|
| 29 |
# --- System dependencies ---
|
| 30 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 31 |
tini wget curl ca-certificates tar git \
|
| 32 |
&& rm -rf /var/lib/apt/lists/*
|
| 33 |
|
| 34 |
+
# --- (Optional) Non-root user (kept for reference) ---
|
| 35 |
RUN useradd -m -u 1000 appuser || true
|
| 36 |
|
| 37 |
WORKDIR /app
|
|
|
|
| 48 |
RUN mkdir -p /data/chroma_db /data/.huggingface /data/corpus /tmp/chroma_db \
|
| 49 |
&& chmod -R 777 /data /app /tmp
|
| 50 |
|
|
|
|
|
|
|
|
|
|
| 51 |
# Do NOT switch user; keep root so /data and /tmp are writable in Spaces
|
| 52 |
# USER appuser
|
| 53 |
|
app.py
CHANGED
|
@@ -10,12 +10,14 @@ Enhancements in this version:
|
|
| 10 |
- Auto-(re)index Chroma when the dataset commit SHA changes
|
| 11 |
- /refresh endpoint to force re-pull + reindex without redeploying
|
| 12 |
- SAFE writable dir detection for Chroma with fallback to /tmp/chroma_db
|
|
|
|
| 13 |
"""
|
| 14 |
|
| 15 |
import os, sys, logging, warnings, json, shutil, time
|
| 16 |
from typing import List, Optional, Iterable, Dict, Any
|
| 17 |
|
| 18 |
-
# -------------------- Quiet warnings --------------------
|
|
|
|
| 19 |
if not sys.warnoptions:
|
| 20 |
warnings.simplefilter("ignore")
|
| 21 |
for cat in (DeprecationWarning, UserWarning, FutureWarning):
|
|
@@ -23,6 +25,21 @@ for cat in (DeprecationWarning, UserWarning, FutureWarning):
|
|
| 23 |
warnings.filterwarnings("ignore", message=".*LangChainDeprecationWarning.*")
|
| 24 |
os.environ.setdefault("PYTHONWARNINGS", "ignore")
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
logging.basicConfig(
|
| 27 |
level=logging.ERROR,
|
| 28 |
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s"
|
|
@@ -36,12 +53,14 @@ from fastapi import FastAPI, HTTPException, Header, Depends
|
|
| 36 |
from fastapi.middleware.cors import CORSMiddleware
|
| 37 |
from pydantic import BaseModel, Field
|
| 38 |
|
| 39 |
-
# Vector store
|
| 40 |
try:
|
| 41 |
from langchain_chroma import Chroma
|
| 42 |
except ImportError:
|
| 43 |
from langchain_community.vectorstores import Chroma # fallback
|
| 44 |
|
|
|
|
|
|
|
| 45 |
# LLM endpoint
|
| 46 |
try:
|
| 47 |
from langchain_huggingface import HuggingFaceEndpoint
|
|
@@ -68,17 +87,16 @@ except ImportError:
|
|
| 68 |
from langchain_core.documents import Document
|
| 69 |
from langchain_core.embeddings import Embeddings # modern base
|
| 70 |
|
| 71 |
-
#
|
| 72 |
from huggingface_hub import snapshot_download, HfApi
|
| 73 |
from langchain_community.document_loaders import PyPDFLoader
|
| 74 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 75 |
|
| 76 |
-
# NEW: explicit Chroma settings (prevents telemetry + ensures persistence)
|
| 77 |
-
from chromadb.config import Settings
|
| 78 |
-
|
| 79 |
# -------------------- Config --------------------
|
| 80 |
ENV = os.getenv
|
| 81 |
DB_DIR = ENV("RAG_DB_DIR", "/data/chroma_db") # intended Chroma dir
|
|
|
|
|
|
|
| 82 |
EMBED_PROVIDER = ENV("RAG_EMBED_PROVIDER", "bge").lower() # bge | fastembed | hf_local
|
| 83 |
EMBED_MODEL = ENV("RAG_EMBED_MODEL", "BAAI/bge-small-en-v1.5")
|
| 84 |
DEVICE = ENV("RAG_DEVICE", "cpu")
|
|
@@ -101,7 +119,7 @@ HOST = ENV("RAG_HOST", "0.0.0.0")
|
|
| 101 |
PORT = int(ENV("PORT", ENV("RAG_PORT", "7860"))) # Spaces $PORT first
|
| 102 |
CORS_ORIGINS = ENV("RAG_CORS_ORIGINS", "*")
|
| 103 |
|
| 104 |
-
#
|
| 105 |
DATASET_ID = ENV("RAG_DATASET_ID", "internationalscholarsprogram/DOC")
|
| 106 |
DATA_REV = ENV("RAG_DATASET_REVISION", "main") # tag/branch/sha, or "main"
|
| 107 |
CORPUS_DIR = ENV("RAG_CORPUS_DIR", "/data/corpus") # where PDFs are downloaded
|
|
@@ -158,7 +176,7 @@ def build_embeddings(provider: str, model: str, device: str,
|
|
| 158 |
base = HuggingFaceBgeEmbeddings(
|
| 159 |
model_name=model,
|
| 160 |
model_kwargs={"device": device},
|
| 161 |
-
encode_kwargs={
|
| 162 |
)
|
| 163 |
return BGEAdapter(base, use_prefixes=use_prefixes)
|
| 164 |
|
|
@@ -169,7 +187,7 @@ def build_embeddings(provider: str, model: str, device: str,
|
|
| 169 |
base = HFEmbeddings(
|
| 170 |
model_name=model,
|
| 171 |
model_kwargs={"device": device},
|
| 172 |
-
encode_kwargs={
|
| 173 |
)
|
| 174 |
return BGEAdapter(base, use_prefixes=("bge" in model.lower() and use_prefixes))
|
| 175 |
|
|
@@ -184,18 +202,18 @@ embeddings = build_embeddings(
|
|
| 184 |
batch_size=EMBED_BATCH,
|
| 185 |
)
|
| 186 |
|
| 187 |
-
# -------------------- Vector DB handle (
|
| 188 |
os.makedirs(DB_DIR, exist_ok=True)
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
vectordb = Chroma(
|
| 195 |
-
|
| 196 |
embedding_function=embeddings,
|
|
|
|
| 197 |
collection_metadata={"hnsw:space": "cosine"},
|
| 198 |
-
client_settings=client_settings,
|
| 199 |
)
|
| 200 |
|
| 201 |
def build_retriever(k: int):
|
|
@@ -291,48 +309,46 @@ def load_docs_from_pdfs(pdf_paths: List[str]) -> List[Document]:
|
|
| 291 |
|
| 292 |
def _reset_chroma_dir():
|
| 293 |
"""Safely reset the Chroma persist dir even if a client is holding files."""
|
| 294 |
-
# Try to
|
| 295 |
try:
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
try:
|
| 302 |
-
coll = getattr(vectordb, "_collection", None)
|
| 303 |
-
if coll and getattr(coll, "name", None):
|
| 304 |
-
client.delete_collection(coll.name)
|
| 305 |
-
except Exception:
|
| 306 |
-
pass
|
| 307 |
except Exception:
|
| 308 |
pass
|
| 309 |
|
| 310 |
-
#
|
| 311 |
for _ in range(10):
|
| 312 |
try:
|
| 313 |
if os.path.isdir(DB_DIR):
|
| 314 |
shutil.rmtree(DB_DIR)
|
| 315 |
break
|
| 316 |
except OSError:
|
| 317 |
-
time.sleep(0.2)
|
| 318 |
os.makedirs(DB_DIR, exist_ok=True)
|
| 319 |
|
| 320 |
-
def
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
vectordb = Chroma(
|
| 325 |
-
|
| 326 |
embedding_function=embeddings,
|
|
|
|
| 327 |
collection_metadata={"hnsw:space": "cosine"},
|
| 328 |
-
client_settings=client_settings,
|
| 329 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 330 |
if docs:
|
| 331 |
# Add in small batches to keep memory low
|
| 332 |
batch = 64
|
| 333 |
for i in range(0, len(docs), batch):
|
| 334 |
vectordb.add_documents(docs[i:i+batch])
|
| 335 |
-
|
| 336 |
|
| 337 |
def reindex_if_needed(force: bool = False, revision: str = DATA_REV) -> Dict[str, Any]:
|
| 338 |
"""
|
|
@@ -342,17 +358,21 @@ def reindex_if_needed(force: bool = False, revision: str = DATA_REV) -> Dict[str
|
|
| 342 |
st = _state_load()
|
| 343 |
old_sha = st.get("dataset_sha")
|
| 344 |
|
| 345 |
-
|
|
|
|
| 346 |
pdfs = list_pdf_paths(CORPUS_DIR)
|
| 347 |
docs = load_docs_from_pdfs(pdfs)
|
| 348 |
rebuild_chroma(docs)
|
| 349 |
st["dataset_sha"] = new_sha
|
| 350 |
_state_save(st)
|
| 351 |
return {"reindexed": True, "commit": new_sha, "docs": len(docs)}
|
| 352 |
-
|
|
|
|
|
|
|
|
|
|
| 353 |
|
| 354 |
# -------------------- Helpers --------------------
|
| 355 |
-
def format_docs(docs: List[Document]
|
| 356 |
parts = []
|
| 357 |
for i, d in enumerate(docs, 1):
|
| 358 |
src = d.metadata.get("source", "unknown")
|
|
@@ -392,7 +412,7 @@ def answer_question(question: str, k: int = TOP_K_DEFAULT) -> Dict[str, Any]:
|
|
| 392 |
return {"answer": answer, "citations": cits, "used_k": k}
|
| 393 |
|
| 394 |
# -------------------- FastAPI --------------------
|
| 395 |
-
app = FastAPI(title="Career GPT RAG API", version="1.
|
| 396 |
app.add_middleware(
|
| 397 |
CORSMiddleware,
|
| 398 |
allow_origins=[o.strip() for o in CORS_ORIGINS.split(",") if o.strip()],
|
|
@@ -432,7 +452,8 @@ def healthz():
|
|
| 432 |
# Best-effort count
|
| 433 |
count = 0
|
| 434 |
try:
|
| 435 |
-
|
|
|
|
| 436 |
except Exception:
|
| 437 |
meta = vectordb.get(limit=1)
|
| 438 |
count = len(meta.get("ids", []))
|
|
@@ -440,7 +461,8 @@ def healthz():
|
|
| 440 |
return {
|
| 441 |
"status": "ok",
|
| 442 |
"db_dir": DB_DIR,
|
| 443 |
-
"
|
|
|
|
| 444 |
"embed_provider": EMBED_PROVIDER,
|
| 445 |
"embed_model": EMBED_MODEL,
|
| 446 |
"llm": HF_LLM_REPO,
|
|
@@ -470,7 +492,7 @@ def ask(req: AskRequest, _ok: bool = Depends(require_api_key)):
|
|
| 470 |
log.exception("Unhandled /ask error")
|
| 471 |
raise HTTPException(status_code=500, detail=str(e))
|
| 472 |
|
| 473 |
-
# ----
|
| 474 |
@app.post("/refresh")
|
| 475 |
def refresh(_ok: bool = Depends(require_api_key)):
|
| 476 |
"""
|
|
|
|
| 10 |
- Auto-(re)index Chroma when the dataset commit SHA changes
|
| 11 |
- /refresh endpoint to force re-pull + reindex without redeploying
|
| 12 |
- SAFE writable dir detection for Chroma with fallback to /tmp/chroma_db
|
| 13 |
+
- UPDATED: Chroma migration to new client API (PersistentClient)
|
| 14 |
"""
|
| 15 |
|
| 16 |
import os, sys, logging, warnings, json, shutil, time
|
| 17 |
from typing import List, Optional, Iterable, Dict, Any
|
| 18 |
|
| 19 |
+
# -------------------- Quiet warnings & env hygiene --------------------
|
| 20 |
+
# Silence common warnings
|
| 21 |
if not sys.warnoptions:
|
| 22 |
warnings.simplefilter("ignore")
|
| 23 |
for cat in (DeprecationWarning, UserWarning, FutureWarning):
|
|
|
|
| 25 |
warnings.filterwarnings("ignore", message=".*LangChainDeprecationWarning.*")
|
| 26 |
os.environ.setdefault("PYTHONWARNINGS", "ignore")
|
| 27 |
|
| 28 |
+
# Sanitize OMP_NUM_THREADS (fixes: "libgomp: Invalid value for OMP_NUM_THREADS")
|
| 29 |
+
_omp = os.environ.get("OMP_NUM_THREADS")
|
| 30 |
+
if _omp:
|
| 31 |
+
try:
|
| 32 |
+
n = int(str(_omp).strip())
|
| 33 |
+
if n <= 0:
|
| 34 |
+
raise ValueError
|
| 35 |
+
except Exception:
|
| 36 |
+
os.environ["OMP_NUM_THREADS"] = "1"
|
| 37 |
+
|
| 38 |
+
# Optionally quiet ONNX Runtime if present
|
| 39 |
+
os.environ.setdefault("ORT_LOG_SEVERITY_LEVEL", "3") # 3 = ERROR (0=VERBOSE, 1=INFO, 2=WARNING, 3=ERROR, 4=FATAL)
|
| 40 |
+
# Default to CPU-only on CPU Spaces: this hides all GPUs unless CUDA_VISIBLE_DEVICES is already set in the environment
|
| 41 |
+
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "")
|
| 42 |
+
|
| 43 |
logging.basicConfig(
|
| 44 |
level=logging.ERROR,
|
| 45 |
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s"
|
|
|
|
| 53 |
from fastapi.middleware.cors import CORSMiddleware
|
| 54 |
from pydantic import BaseModel, Field
|
| 55 |
|
| 56 |
+
# Vector store (new Chroma client API)
|
| 57 |
try:
|
| 58 |
from langchain_chroma import Chroma
|
| 59 |
except ImportError:
|
| 60 |
from langchain_community.vectorstores import Chroma # fallback
|
| 61 |
|
| 62 |
+
from chromadb import PersistentClient # NEW: explicit client
|
| 63 |
+
|
| 64 |
# LLM endpoint
|
| 65 |
try:
|
| 66 |
from langchain_huggingface import HuggingFaceEndpoint
|
|
|
|
| 87 |
from langchain_core.documents import Document
|
| 88 |
from langchain_core.embeddings import Embeddings # modern base
|
| 89 |
|
| 90 |
+
# Dataset + PDF loading helpers
|
| 91 |
from huggingface_hub import snapshot_download, HfApi
|
| 92 |
from langchain_community.document_loaders import PyPDFLoader
|
| 93 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 94 |
|
|
|
|
|
|
|
|
|
|
| 95 |
# -------------------- Config --------------------
|
| 96 |
ENV = os.getenv
|
| 97 |
DB_DIR = ENV("RAG_DB_DIR", "/data/chroma_db") # intended Chroma dir
|
| 98 |
+
COLLECTION_NAME = ENV("RAG_COLLECTION", "career_gpt") # NEW: explicit collection name
|
| 99 |
+
|
| 100 |
EMBED_PROVIDER = ENV("RAG_EMBED_PROVIDER", "bge").lower() # bge | fastembed | hf_local
|
| 101 |
EMBED_MODEL = ENV("RAG_EMBED_MODEL", "BAAI/bge-small-en-v1.5")
|
| 102 |
DEVICE = ENV("RAG_DEVICE", "cpu")
|
|
|
|
| 119 |
PORT = int(ENV("PORT", ENV("RAG_PORT", "7860"))) # Spaces $PORT first
|
| 120 |
CORS_ORIGINS = ENV("RAG_CORS_ORIGINS", "*")
|
| 121 |
|
| 122 |
+
# Dataset sync locations
|
| 123 |
DATASET_ID = ENV("RAG_DATASET_ID", "internationalscholarsprogram/DOC")
|
| 124 |
DATA_REV = ENV("RAG_DATASET_REVISION", "main") # tag/branch/sha, or "main"
|
| 125 |
CORPUS_DIR = ENV("RAG_CORPUS_DIR", "/data/corpus") # where PDFs are downloaded
|
|
|
|
| 176 |
base = HuggingFaceBgeEmbeddings(
|
| 177 |
model_name=model,
|
| 178 |
model_kwargs={"device": device},
|
| 179 |
+
encode_kwargs={'normalize_embeddings': True},
|
| 180 |
)
|
| 181 |
return BGEAdapter(base, use_prefixes=use_prefixes)
|
| 182 |
|
|
|
|
| 187 |
base = HFEmbeddings(
|
| 188 |
model_name=model,
|
| 189 |
model_kwargs={"device": device},
|
| 190 |
+
encode_kwargs={'normalize_embeddings': True},
|
| 191 |
)
|
| 192 |
return BGEAdapter(base, use_prefixes=("bge" in model.lower() and use_prefixes))
|
| 193 |
|
|
|
|
| 202 |
batch_size=EMBED_BATCH,
|
| 203 |
)
|
| 204 |
|
| 205 |
+
# -------------------- Vector DB handle (new Chroma client) --------------------
|
| 206 |
os.makedirs(DB_DIR, exist_ok=True)
|
| 207 |
+
|
| 208 |
+
# Create a persistent Chroma **client** (NEW API)
|
| 209 |
+
_chroma_client = PersistentClient(path=DB_DIR)
|
| 210 |
+
|
| 211 |
+
# Create/open the collection via LangChain's wrapper
|
| 212 |
vectordb = Chroma(
|
| 213 |
+
collection_name=COLLECTION_NAME,
|
| 214 |
embedding_function=embeddings,
|
| 215 |
+
client=_chroma_client,
|
| 216 |
collection_metadata={"hnsw:space": "cosine"},
|
|
|
|
| 217 |
)
|
| 218 |
|
| 219 |
def build_retriever(k: int):
|
|
|
|
| 309 |
|
| 310 |
def _reset_chroma_dir():
|
| 311 |
"""Safely reset the Chroma persist dir even if a client is holding files."""
|
| 312 |
+
# Try to drop the collection cleanly
|
| 313 |
try:
|
| 314 |
+
# Delete collection if it exists
|
| 315 |
+
try:
|
| 316 |
+
_chroma_client.delete_collection(COLLECTION_NAME)
|
| 317 |
+
except Exception:
|
| 318 |
+
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 319 |
except Exception:
|
| 320 |
pass
|
| 321 |
|
| 322 |
+
# Ensure on-disk dir is clean (client keeps metadata separately)
|
| 323 |
for _ in range(10):
|
| 324 |
try:
|
| 325 |
if os.path.isdir(DB_DIR):
|
| 326 |
shutil.rmtree(DB_DIR)
|
| 327 |
break
|
| 328 |
except OSError:
|
| 329 |
+
time.sleep(0.2)
|
| 330 |
os.makedirs(DB_DIR, exist_ok=True)
|
| 331 |
|
| 332 |
+
def _open_vectordb():
|
| 333 |
+
"""(Re)create a vectordb handle bound to the persistent client."""
|
| 334 |
+
global vectordb, _chroma_client
|
| 335 |
+
_chroma_client = PersistentClient(path=DB_DIR)
|
| 336 |
vectordb = Chroma(
|
| 337 |
+
collection_name=COLLECTION_NAME,
|
| 338 |
embedding_function=embeddings,
|
| 339 |
+
client=_chroma_client,
|
| 340 |
collection_metadata={"hnsw:space": "cosine"},
|
|
|
|
| 341 |
)
|
| 342 |
+
|
| 343 |
+
def rebuild_chroma(docs: List[Document]):
|
| 344 |
+
_reset_chroma_dir()
|
| 345 |
+
_open_vectordb()
|
| 346 |
if docs:
|
| 347 |
# Add in small batches to keep memory low
|
| 348 |
batch = 64
|
| 349 |
for i in range(0, len(docs), batch):
|
| 350 |
vectordb.add_documents(docs[i:i+batch])
|
| 351 |
+
# No explicit persist() call is needed: PersistentClient writes changes to disk automatically.
|
| 352 |
|
| 353 |
def reindex_if_needed(force: bool = False, revision: str = DATA_REV) -> Dict[str, Any]:
|
| 354 |
"""
|
|
|
|
| 358 |
st = _state_load()
|
| 359 |
old_sha = st.get("dataset_sha")
|
| 360 |
|
| 361 |
+
needs_rebuild = force or (new_sha != old_sha)
|
| 362 |
+
if needs_rebuild:
|
| 363 |
pdfs = list_pdf_paths(CORPUS_DIR)
|
| 364 |
docs = load_docs_from_pdfs(pdfs)
|
| 365 |
rebuild_chroma(docs)
|
| 366 |
st["dataset_sha"] = new_sha
|
| 367 |
_state_save(st)
|
| 368 |
return {"reindexed": True, "commit": new_sha, "docs": len(docs)}
|
| 369 |
+
else:
|
| 370 |
+
# Ensure handle is open even if no rebuild was needed
|
| 371 |
+
_open_vectordb()
|
| 372 |
+
return {"reindexed": False, "commit": new_sha}
|
| 373 |
|
| 374 |
# -------------------- Helpers --------------------
|
| 375 |
+
def format_docs(docs: List[Document]) -> str:
|
| 376 |
parts = []
|
| 377 |
for i, d in enumerate(docs, 1):
|
| 378 |
src = d.metadata.get("source", "unknown")
|
|
|
|
| 412 |
return {"answer": answer, "citations": cits, "used_k": k}
|
| 413 |
|
| 414 |
# -------------------- FastAPI --------------------
|
| 415 |
+
app = FastAPI(title="Career GPT RAG API", version="1.2.0")
|
| 416 |
app.add_middleware(
|
| 417 |
CORSMiddleware,
|
| 418 |
allow_origins=[o.strip() for o in CORS_ORIGINS.split(",") if o.strip()],
|
|
|
|
| 452 |
# Best-effort count
|
| 453 |
count = 0
|
| 454 |
try:
|
| 455 |
+
# `_collection` is a private attribute of the wrapper; if it is missing, the except branch falls back to get(limit=1), which reports at most 1
|
| 456 |
+
count = getattr(vectordb, "_collection", None).count() # type: ignore[call-arg, attr-defined]
|
| 457 |
except Exception:
|
| 458 |
meta = vectordb.get(limit=1)
|
| 459 |
count = len(meta.get("ids", []))
|
|
|
|
| 461 |
return {
|
| 462 |
"status": "ok",
|
| 463 |
"db_dir": DB_DIR,
|
| 464 |
+
"collection": COLLECTION_NAME,
|
| 465 |
+
"docs_indexed_estimate": count,
|
| 466 |
"embed_provider": EMBED_PROVIDER,
|
| 467 |
"embed_model": EMBED_MODEL,
|
| 468 |
"llm": HF_LLM_REPO,
|
|
|
|
| 492 |
log.exception("Unhandled /ask error")
|
| 493 |
raise HTTPException(status_code=500, detail=str(e))
|
| 494 |
|
| 495 |
+
# ---- Manual refresh endpoint ----
|
| 496 |
@app.post("/refresh")
|
| 497 |
def refresh(_ok: bool = Depends(require_api_key)):
|
| 498 |
"""
|
code wget step; clean Dockerfile
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[1mdiff --git a/Dockerfile b/Dockerfile[m
|
| 2 |
+
[1mindex b16398c..659aa4c 100644[m
|
| 3 |
+
[1m--- a/Dockerfile[m
|
| 4 |
+
[1m+++ b/Dockerfile[m
|
| 5 |
+
[36m@@ -3,8 +3,6 @@[m
|
| 6 |
+
# ----------------------------------------[m
|
| 7 |
+
FROM python:3.11-slim-bookworm[m
|
| 8 |
+
[m
|
| 9 |
+
[31m-LABEL space.rebuild="final-fix-2025-10-22"[m
|
| 10 |
+
[31m-[m
|
| 11 |
+
ENV PYTHONDONTWRITEBYTECODE=1 \[m
|
| 12 |
+
PYTHONUNBUFFERED=1 \[m
|
| 13 |
+
PIP_NO_CACHE_DIR=1 \[m
|
| 14 |
+
[36m@@ -14,31 +12,31 @@[m [mENV PYTHONDONTWRITEBYTECODE=1 \[m
|
| 15 |
+
RAG_DATASET_ID=internationalscholarsprogram/DOC \[m
|
| 16 |
+
RAG_DATASET_REVISION=main \[m
|
| 17 |
+
RAG_PORT=7860 \[m
|
| 18 |
+
[31m- PORT=7860 \[m
|
| 19 |
+
[31m- PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"[m
|
| 20 |
+
[32m+[m[32m PORT=7860[m
|
| 21 |
+
[m
|
| 22 |
+
[31m-# ✅ Minimal system dependencies only[m
|
| 23 |
+
[32m+[m[32m# System deps (no git, no wget needed)[m
|
| 24 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \[m
|
| 25 |
+
tini curl ca-certificates \[m
|
| 26 |
+
&& rm -rf /var/lib/apt/lists/*[m
|
| 27 |
+
[m
|
| 28 |
+
[31m-# ✅ Create non-root user[m
|
| 29 |
+
[32m+[m[32m# Non-root user[m
|
| 30 |
+
RUN useradd -m -u 1000 appuser[m
|
| 31 |
+
[32m+[m
|
| 32 |
+
WORKDIR /app[m
|
| 33 |
+
[m
|
| 34 |
+
[31m-# ✅ Install dependencies[m
|
| 35 |
+
[32m+[m[32m# Python deps[m
|
| 36 |
+
COPY requirements.txt .[m
|
| 37 |
+
RUN python -m pip install --upgrade pip setuptools wheel \[m
|
| 38 |
+
&& pip install --no-cache-dir -r requirements.txt[m
|
| 39 |
+
[m
|
| 40 |
+
[31m-# ✅ Copy app files[m
|
| 41 |
+
[32m+[m[32m# Project files[m
|
| 42 |
+
COPY . .[m
|
| 43 |
+
[m
|
| 44 |
+
[31m-# ✅ Prepare /data directories[m
|
| 45 |
+
[32m+[m[32m# Persistent dirs & permissions[m
|
| 46 |
+
RUN mkdir -p /data/chroma_db /data/.huggingface /data/corpus \[m
|
| 47 |
+
&& chown -R appuser:appuser /data /app[m
|
| 48 |
+
[m
|
| 49 |
+
[31m-# ✅ Optional: make bootstrap.sh executable if it exists[m
|
| 50 |
+
[32m+[m[32m# Optional start script perms[m
|
| 51 |
+
RUN if [ -f "bootstrap.sh" ]; then chmod +x bootstrap.sh; fi[m
|
| 52 |
+
[m
|
| 53 |
+
USER appuser[m
|
| 54 |
+
[36m@@ -47,5 +45,6 @@[m [mEXPOSE 7860[m
|
| 55 |
+
HEALTHCHECK --interval=30s --timeout=5s --start-period=20s \[m
|
| 56 |
+
CMD curl -fsS "http://127.0.0.1:${PORT}/health" || exit 1[m
|
| 57 |
+
[m
|
| 58 |
+
[31m-ENTRYPOINT ["/usr/bin/tini","--"][m
|
| 59 |
+
[31m-CMD ["python","app.py"][m
|
| 60 |
+
[32m+[m[32mENTRYPOINT ["/usr/bin/tini", "--"][m
|
| 61 |
+
[32m+[m[32m# CMD ["bash", "bootstrap.sh"][m
|
| 62 |
+
[32m+[m[32mCMD ["python", "app.py"][m
|