nyakslamo committed on
Commit
573cc03
·
1 Parent(s): 732ff9a

deploy: RAG dataset sync + Docker + persistence

Browse files
Files changed (4) hide show
  1. Dockerfile +27 -20
  2. app.py +156 -28
  3. bootstrap.sh +33 -9
  4. sites.yaml +0 -21
Dockerfile CHANGED
@@ -3,50 +3,57 @@
3
  # ----------------------------------------
4
  FROM python:3.11-slim-bookworm
5
 
6
- # Core env
7
  ENV PYTHONDONTWRITEBYTECODE=1 \
8
  PYTHONUNBUFFERED=1 \
9
  PIP_NO_CACHE_DIR=1 \
10
  HF_HOME=/data/.huggingface \
11
  RAG_DB_DIR=/data/chroma_db \
 
 
 
12
  RAG_PORT=7860 \
13
  PORT=7860
14
 
15
- # System deps (add more if your loaders need them)
16
- # - tini: clean signal handling for FastAPI/uvicorn
17
- # - git, curl: handy for debugging and HF pulls
18
  RUN apt-get update && apt-get install -y --no-install-recommends \
19
- build-essential \
20
- git \
21
- curl \
22
- tini \
23
  && rm -rf /var/lib/apt/lists/*
24
 
 
 
 
25
  WORKDIR /app
26
 
27
- # Python deps
28
  COPY requirements.txt .
29
  RUN python -m pip install --upgrade pip setuptools wheel \
30
- && pip install -r requirements.txt
31
 
32
- # Project files
33
  COPY . .
34
 
35
- # Make and open writable data dirs (Space persistent storage should mount /data)
36
- RUN mkdir -p /data/chroma_db /data/.huggingface && chmod -R 777 /data
 
 
 
 
37
 
38
- # Make bootstrap.sh executable inside the container
39
- RUN chmod +x bootstrap.sh
40
 
41
- # The Space routes traffic to $PORT; exposing is optional but harmless
42
  EXPOSE 7860
43
 
44
- # Healthcheck (optional but useful)
45
  HEALTHCHECK --interval=30s --timeout=5s --start-period=20s \
46
  CMD curl -fsS "http://127.0.0.1:${PORT}/health" || exit 1
47
 
48
- # Use tini as PID 1 so SIGTERM/SIGINT cleanly stop uvicorn
49
  ENTRYPOINT ["/usr/bin/tini", "--"]
50
 
51
- # Entrypoint script
52
- CMD ["bash", "bootstrap.sh"]
 
 
 
3
  # ----------------------------------------
4
  FROM python:3.11-slim-bookworm
5
 
6
+ # ---- Core env (persistent paths + defaults) ----
7
  ENV PYTHONDONTWRITEBYTECODE=1 \
8
  PYTHONUNBUFFERED=1 \
9
  PIP_NO_CACHE_DIR=1 \
10
  HF_HOME=/data/.huggingface \
11
  RAG_DB_DIR=/data/chroma_db \
12
+ RAG_CORPUS_DIR=/data/corpus \
13
+ RAG_DATASET_ID=internationalscholarsprogram/DOC \
14
+ RAG_DATASET_REVISION=main \
15
  RAG_PORT=7860 \
16
  PORT=7860
17
 
18
+ # ---- System deps ----
 
 
19
  RUN apt-get update && apt-get install -y --no-install-recommends \
20
+ tini curl ca-certificates \
 
 
 
21
  && rm -rf /var/lib/apt/lists/*
22
 
23
+ # ---- Create a non-root user (safer) ----
24
+ RUN useradd -m -u 1000 appuser
25
+
26
  WORKDIR /app
27
 
28
+ # ---- Python deps ----
29
  COPY requirements.txt .
30
  RUN python -m pip install --upgrade pip setuptools wheel \
31
+ && pip install --no-cache-dir -r requirements.txt
32
 
33
+ # ---- Project files ----
34
  COPY . .
35
 
36
+ # ---- Create persistent dirs and give appuser ownership of /data and /app (Space mounts /data) ----
37
+ RUN mkdir -p /data/chroma_db /data/.huggingface /data/corpus \
38
+ && chown -R appuser:appuser /data /app
39
+
40
+ # If you use a start script, ensure it's executable (optional)
41
+ RUN if [ -f "bootstrap.sh" ]; then chmod +x bootstrap.sh; fi
42
 
43
+ # ---- Drop privileges ----
44
+ USER appuser
45
 
46
+ # ---- Networking ----
47
  EXPOSE 7860
48
 
49
+ # ---- Healthcheck (hits /health; NOTE(review): the app.py hunks in this commit only show a /healthz route — confirm /health exists) ----
50
  HEALTHCHECK --interval=30s --timeout=5s --start-period=20s \
51
  CMD curl -fsS "http://127.0.0.1:${PORT}/health" || exit 1
52
 
53
+ # ---- PID 1 = tini for clean shutdowns ----
54
  ENTRYPOINT ["/usr/bin/tini", "--"]
55
 
56
+ # ---- Start command ----
57
+ # If you’re using bootstrap.sh, uncomment the next line and comment out the python line.
58
+ # CMD ["bash", "bootstrap.sh"]
59
+ CMD ["python", "app.py"]
app.py CHANGED
@@ -5,13 +5,22 @@
5
  Career GPT RAG API — FastAPI over Chroma + Embeddings + HuggingFace Inference LLM
6
  Optimized for Hugging Face Spaces deployment.
7
 
8
- Key points for Spaces:
9
- - DB_DIR persists under /data/chroma_db (enable Persistent storage in Space settings)
10
- - PORT comes from $PORT (injected by Spaces), or falls back to RAG_PORT / 7860
11
- - LLM uses Hugging Face Inference API (set secret: HUGGINGFACEHUB_API_TOKEN)
 
 
 
 
 
 
 
 
 
12
  """
13
 
14
- import os, sys, logging, warnings
15
  from typing import List, Optional, Iterable, Dict, Any
16
 
17
  # -------------------- Quiet warnings --------------------
@@ -47,7 +56,7 @@ try:
47
  except ImportError:
48
  from langchain_community.llms import HuggingFaceEndpoint # fallback
49
 
50
- # Prefer BGE or FastEmbed (no sklearn/scipy). Fall back to HF Embeddings if available.
51
  from langchain_community.embeddings import (
52
  HuggingFaceBgeEmbeddings,
53
  FastEmbedEmbeddings,
@@ -55,23 +64,27 @@ from langchain_community.embeddings import (
55
  try:
56
  from langchain_huggingface import HuggingFaceEmbeddings as HFEmbeddings # optional
57
  except ImportError:
58
- HFEmbeddings = None # not strictly needed
59
 
60
  from langchain.prompts import ChatPromptTemplate
61
  from langchain_core.output_parsers import StrOutputParser
62
  try:
63
  from langchain_core.runnables import RunnableParallel
64
  except ImportError:
65
- # Very old LC versions: we can run without Parallel wiring
66
  RunnableParallel = None
67
 
68
  from langchain_core.documents import Document
69
  from langchain_core.embeddings import Embeddings # modern base
70
 
 
 
 
 
 
71
  # -------------------- Config --------------------
72
  ENV = os.getenv
73
- DB_DIR = ENV("RAG_DB_DIR", "/data/chroma_db") # persistent volume in Spaces
74
- EMBED_PROVIDER = ENV("RAG_EMBED_PROVIDER", "bge").lower() # bge | fastembed | hf_local
75
  EMBED_MODEL = ENV("RAG_EMBED_MODEL", "BAAI/bge-small-en-v1.5")
76
  DEVICE = ENV("RAG_DEVICE", "cpu")
77
  HF_TOKEN = ENV("HUGGINGFACEHUB_API_TOKEN", "")
@@ -88,11 +101,17 @@ FALLBACK_MSG = ENV(
88
  "I am Career GPT for International Scholars Program and I’m still under training. "
89
  "I hope I’ll keep learning and improve my responses next time."
90
  )
91
- API_KEY = ENV("RAG_API_KEY") # optional bearer key for /ask
92
  HOST = ENV("RAG_HOST", "0.0.0.0")
93
- PORT = int(ENV("PORT", ENV("RAG_PORT", "7860"))) # <-- Spaces $PORT first
94
  CORS_ORIGINS = ENV("RAG_CORS_ORIGINS", "*")
95
 
 
 
 
 
 
 
96
  # -------------------- Embeddings --------------------
97
  def batched(iterable: Iterable, n: int):
98
  b = []
@@ -123,9 +142,7 @@ class BGEAdapter(Embeddings):
123
  def build_embeddings(provider: str, model: str, device: str,
124
  use_prefixes: bool, hf_token: str, batch_size: int) -> Embeddings:
125
  provider = (provider or "").lower()
126
- # Preferred: BGE (no scipy/sklearn) — fast & reliable on Spaces CPU
127
  if provider in ("bge", "hf_bge", "bge_small"):
128
- log.info(f"Embedding provider: BGE ({model}) on {device}")
129
  base = HuggingFaceBgeEmbeddings(
130
  model_name=model,
131
  model_kwargs={"device": device},
@@ -133,24 +150,17 @@ def build_embeddings(provider: str, model: str, device: str,
133
  )
134
  return BGEAdapter(base, use_prefixes=use_prefixes)
135
 
136
- # Option: FastEmbed (tiny, very fast)
137
  if provider in ("fastembed", "fe"):
138
- log.info("Embedding provider: FastEmbed")
139
  return FastEmbedEmbeddings()
140
 
141
- # Fallback: classic HF Embeddings (may pull sklearn/scipy via sentence-transformers)
142
  if HFEmbeddings is not None and provider in ("hf_local", "hf", "sentence_transformers", ""):
143
- log.info(f"Embedding provider: HF local ({model}) on {device}")
144
  base = HFEmbeddings(
145
  model_name=model,
146
  model_kwargs={"device": device},
147
  encode_kwargs={"normalize_embeddings": True},
148
  )
149
- # BGEAdapter is harmless even for non-BGE models if use_prefixes=False
150
  return BGEAdapter(base, use_prefixes=("bge" in model.lower() and use_prefixes))
151
 
152
- # Last resort: FastEmbed
153
- log.warning(f"Unknown EMBED_PROVIDER '{provider}', defaulting to FastEmbed.")
154
  return FastEmbedEmbeddings()
155
 
156
  embeddings = build_embeddings(
@@ -162,10 +172,8 @@ embeddings = build_embeddings(
162
  batch_size=EMBED_BATCH,
163
  )
164
 
165
- # -------------------- Vector DB / Retriever --------------------
166
- # Ensure the persistent dir exists (first boot in Space)
167
  os.makedirs(DB_DIR, exist_ok=True)
168
-
169
  vectordb = Chroma(
170
  persist_directory=DB_DIR,
171
  embedding_function=embeddings,
@@ -203,10 +211,100 @@ prompt = ChatPromptTemplate.from_template(
203
  "Answer concisely and include source tags like [1], [2] where relevant."
204
  )
205
 
206
- # Runnable wiring: pass exactly the keys the prompt expects.
207
  parser = StrOutputParser()
208
  chain = (prompt | llm | parser)
209
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  # -------------------- Helpers --------------------
211
  def format_docs(docs: List[Document]) -> str:
212
  parts = []
@@ -248,7 +346,7 @@ def answer_question(question: str, k: int = TOP_K_DEFAULT) -> Dict[str, Any]:
248
  return {"answer": answer, "citations": cits, "used_k": k}
249
 
250
  # -------------------- FastAPI --------------------
251
- app = FastAPI(title="Career GPT RAG API", version="1.0.1")
252
  app.add_middleware(
253
  CORSMiddleware,
254
  allow_origins=[o.strip() for o in CORS_ORIGINS.split(",") if o.strip()],
@@ -275,18 +373,35 @@ class AskResponse(BaseModel):
275
  citations: list
276
  used_k: int
277
 
 
 
 
 
 
 
 
278
  @app.get("/healthz")
279
  def healthz():
280
  try:
281
- meta = vectordb.get(include=["metadatas"], limit=1)
 
 
 
 
 
 
 
282
  return {
283
  "status": "ok",
284
  "db_dir": DB_DIR,
285
- "docs_indexed": len(meta.get("ids", [])),
286
  "embed_provider": EMBED_PROVIDER,
287
  "embed_model": EMBED_MODEL,
288
  "llm": HF_LLM_REPO,
289
  "hf_token_present": bool(HF_TOKEN),
 
 
 
290
  }
291
  except Exception as e:
292
  log.exception("Health check failed")
@@ -309,6 +424,19 @@ def ask(req: AskRequest, _ok: bool = Depends(require_api_key)):
309
  log.exception("Unhandled /ask error")
310
  raise HTTPException(status_code=500, detail=str(e))
311
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
  # -------------------- Runner --------------------
313
  if __name__ == "__main__":
314
  import uvicorn
 
5
  Career GPT RAG API — FastAPI over Chroma + Embeddings + HuggingFace Inference LLM
6
  Optimized for Hugging Face Spaces deployment.
7
 
8
+ Enhancements in this version:
9
+ - Pull PDFs from a Hugging Face DATASET repo into /data/corpus (persistent storage)
10
+ - Auto-(re)index Chroma when the dataset commit SHA changes
11
+ - /refresh endpoint to force re-pull + reindex without redeploying
12
+
13
+ Space requirements:
14
+ - Enable Persistent storage in Space settings
15
+ - Set env (optional defaults shown below):
16
+ RAG_DATASET_ID=internationalscholarsprogram/DOC
17
+ RAG_DATASET_REVISION=main
18
+ RAG_DB_DIR=/data/chroma_db
19
+ RAG_CORPUS_DIR=/data/corpus
20
+ - Add to requirements.txt: huggingface_hub, pypdf, langchain (or your version)
21
  """
22
 
23
+ import os, sys, logging, warnings, json, shutil
24
  from typing import List, Optional, Iterable, Dict, Any
25
 
26
  # -------------------- Quiet warnings --------------------
 
56
  except ImportError:
57
  from langchain_community.llms import HuggingFaceEndpoint # fallback
58
 
59
+ # Embeddings
60
  from langchain_community.embeddings import (
61
  HuggingFaceBgeEmbeddings,
62
  FastEmbedEmbeddings,
 
64
  try:
65
  from langchain_huggingface import HuggingFaceEmbeddings as HFEmbeddings # optional
66
  except ImportError:
67
+ HFEmbeddings = None
68
 
69
  from langchain.prompts import ChatPromptTemplate
70
  from langchain_core.output_parsers import StrOutputParser
71
  try:
72
  from langchain_core.runnables import RunnableParallel
73
  except ImportError:
 
74
  RunnableParallel = None
75
 
76
  from langchain_core.documents import Document
77
  from langchain_core.embeddings import Embeddings # modern base
78
 
79
+ # NEW: dataset + PDF loading helpers
80
+ from huggingface_hub import snapshot_download, get_repo_info
81
+ from langchain_community.document_loaders import PyPDFLoader
82
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
83
+
84
  # -------------------- Config --------------------
85
  ENV = os.getenv
86
+ DB_DIR = ENV("RAG_DB_DIR", "/data/chroma_db") # persistent Chroma dir
87
+ EMBED_PROVIDER = ENV("RAG_EMBED_PROVIDER", "bge").lower() # bge | fastembed | hf_local
88
  EMBED_MODEL = ENV("RAG_EMBED_MODEL", "BAAI/bge-small-en-v1.5")
89
  DEVICE = ENV("RAG_DEVICE", "cpu")
90
  HF_TOKEN = ENV("HUGGINGFACEHUB_API_TOKEN", "")
 
101
  "I am Career GPT for International Scholars Program and I’m still under training. "
102
  "I hope I’ll keep learning and improve my responses next time."
103
  )
104
+ API_KEY = ENV("RAG_API_KEY") # optional bearer key for /ask
105
  HOST = ENV("RAG_HOST", "0.0.0.0")
106
+ PORT = int(ENV("PORT", ENV("RAG_PORT", "7860"))) # Spaces $PORT first
107
  CORS_ORIGINS = ENV("RAG_CORS_ORIGINS", "*")
108
 
109
+ # NEW: dataset sync locations
110
+ DATASET_ID = ENV("RAG_DATASET_ID", "internationalscholarsprogram/DOC")
111
+ DATA_REV = ENV("RAG_DATASET_REVISION", "main") # tag/branch/sha, or "main"
112
+ CORPUS_DIR = ENV("RAG_CORPUS_DIR", "/data/corpus") # where PDFs are downloaded
113
+ STATE_FILE = ENV("RAG_STATE_FILE", "/data/.state.json") # remembers last indexed commit
114
+
115
  # -------------------- Embeddings --------------------
116
  def batched(iterable: Iterable, n: int):
117
  b = []
 
142
  def build_embeddings(provider: str, model: str, device: str,
143
  use_prefixes: bool, hf_token: str, batch_size: int) -> Embeddings:
144
  provider = (provider or "").lower()
 
145
  if provider in ("bge", "hf_bge", "bge_small"):
 
146
  base = HuggingFaceBgeEmbeddings(
147
  model_name=model,
148
  model_kwargs={"device": device},
 
150
  )
151
  return BGEAdapter(base, use_prefixes=use_prefixes)
152
 
 
153
  if provider in ("fastembed", "fe"):
 
154
  return FastEmbedEmbeddings()
155
 
 
156
  if HFEmbeddings is not None and provider in ("hf_local", "hf", "sentence_transformers", ""):
 
157
  base = HFEmbeddings(
158
  model_name=model,
159
  model_kwargs={"device": device},
160
  encode_kwargs={"normalize_embeddings": True},
161
  )
 
162
  return BGEAdapter(base, use_prefixes=("bge" in model.lower() and use_prefixes))
163
 
 
 
164
  return FastEmbedEmbeddings()
165
 
166
  embeddings = build_embeddings(
 
172
  batch_size=EMBED_BATCH,
173
  )
174
 
175
+ # -------------------- Vector DB handle (created now; filled later) --------------------
 
176
  os.makedirs(DB_DIR, exist_ok=True)
 
177
  vectordb = Chroma(
178
  persist_directory=DB_DIR,
179
  embedding_function=embeddings,
 
211
  "Answer concisely and include source tags like [1], [2] where relevant."
212
  )
213
 
 
214
  parser = StrOutputParser()
215
  chain = (prompt | llm | parser)
216
 
217
+ # -------------------- Dataset sync & indexing --------------------
218
+ def _state_load() -> dict:
219
+ if os.path.exists(STATE_FILE):
220
+ try:
221
+ with open(STATE_FILE, "r") as f:
222
+ return json.load(f)
223
+ except Exception:
224
+ return {}
225
+ return {}
226
+
227
+ def _state_save(st: dict):
228
+ os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
229
+ with open(STATE_FILE, "w") as f:
230
+ json.dump(st, f)
231
+
232
+ def sync_pdfs(revision: str = DATA_REV) -> str:
233
+ """
234
+ Pull/update PDFs from the HF dataset into CORPUS_DIR and return the exact commit sha.
235
+ Uses ETag-aware snapshot_download → only changed files are fetched.
236
+ """
237
+ os.makedirs(CORPUS_DIR, exist_ok=True)
238
+ snapshot_download(
239
+ repo_id=DATASET_ID,
240
+ repo_type="dataset",
241
+ revision=revision,
242
+ local_dir=CORPUS_DIR,
243
+ local_dir_use_symlinks=False,
244
+ )
245
+ info = get_repo_info(DATASET_ID, repo_type="dataset", revision=revision)
246
+ return info.sha
247
+
248
+ def list_pdf_paths(root: str) -> List[str]:
249
+ out: List[str] = []
250
+ for r, _, files in os.walk(root):
251
+ for f in files:
252
+ if f.lower().endswith(".pdf"):
253
+ out.append(os.path.join(r, f))
254
+ return sorted(out)
255
+
256
+ def load_docs_from_pdfs(pdf_paths: List[str]) -> List[Document]:
257
+ splitter = RecursiveCharacterTextSplitter(
258
+ chunk_size=1200, chunk_overlap=200, separators=["\n\n", "\n", " ", ""]
259
+ )
260
+ docs: List[Document] = []
261
+ for path in pdf_paths:
262
+ try:
263
+ loader = PyPDFLoader(path)
264
+ pages = loader.load()
265
+ chunks = splitter.split_documents(pages)
266
+ for c in chunks:
267
+ c.metadata.setdefault("source", path)
268
+ docs.extend(chunks)
269
+ except Exception as e:
270
+ log.error(f"Failed to parse {path}: {e}")
271
+ return docs
272
+
273
+ def _reset_chroma_dir():
274
+ # safest reset: delete the dir and recreate
275
+ if os.path.isdir(DB_DIR):
276
+ shutil.rmtree(DB_DIR)
277
+ os.makedirs(DB_DIR, exist_ok=True)
278
+
279
+ def rebuild_chroma(docs: List[Document]):
280
+ global vectordb
281
+ _reset_chroma_dir()
282
+ vectordb = Chroma(
283
+ persist_directory=DB_DIR,
284
+ embedding_function=embeddings,
285
+ collection_metadata={"hnsw:space": "cosine"},
286
+ )
287
+ if docs:
288
+ vectordb.add_documents(docs)
289
+ vectordb.persist()
290
+
291
+ def reindex_if_needed(force: bool = False, revision: str = DATA_REV) -> Dict[str, Any]:
292
+ """
293
+ Pull dataset → compare commit sha → rebuild index if changed or forced.
294
+ """
295
+ new_sha = sync_pdfs(revision)
296
+ st = _state_load()
297
+ old_sha = st.get("dataset_sha")
298
+
299
+ if force or (new_sha != old_sha) or (not os.path.isdir(DB_DIR)):
300
+ pdfs = list_pdf_paths(CORPUS_DIR)
301
+ docs = load_docs_from_pdfs(pdfs)
302
+ rebuild_chroma(docs)
303
+ st["dataset_sha"] = new_sha
304
+ _state_save(st)
305
+ return {"reindexed": True, "commit": new_sha, "docs": len(docs)}
306
+ return {"reindexed": False, "commit": new_sha}
307
+
308
  # -------------------- Helpers --------------------
309
  def format_docs(docs: List[Document]) -> str:
310
  parts = []
 
346
  return {"answer": answer, "citations": cits, "used_k": k}
347
 
348
  # -------------------- FastAPI --------------------
349
+ app = FastAPI(title="Career GPT RAG API", version="1.1.0")
350
  app.add_middleware(
351
  CORSMiddleware,
352
  allow_origins=[o.strip() for o in CORS_ORIGINS.split(",") if o.strip()],
 
373
  citations: list
374
  used_k: int
375
 
376
+ # ---- Import-time warmup: sync + (re)index if dataset changed (runs when the module loads; failures are logged, not raised) ----
377
+ try:
378
+ info = reindex_if_needed(force=False, revision=DATA_REV)
379
+ log.info(f"Index warmup → {info}")
380
+ except Exception as e:
381
+ log.exception("Initial sync/index failed")
382
+
383
  @app.get("/healthz")
384
  def healthz():
385
  try:
386
+ # Best-effort count
387
+ count = 0
388
+ try:
389
+ count = vectordb._collection.count() # type: ignore[attr-defined]
390
+ except Exception:
391
+ meta = vectordb.get(limit=1)
392
+ count = len(meta.get("ids", []))
393
+ st = _state_load()
394
  return {
395
  "status": "ok",
396
  "db_dir": DB_DIR,
397
+ "docs_indexed": count,
398
  "embed_provider": EMBED_PROVIDER,
399
  "embed_model": EMBED_MODEL,
400
  "llm": HF_LLM_REPO,
401
  "hf_token_present": bool(HF_TOKEN),
402
+ "dataset": DATASET_ID,
403
+ "dataset_rev": DATA_REV,
404
+ "dataset_sha_indexed": st.get("dataset_sha"),
405
  }
406
  except Exception as e:
407
  log.exception("Health check failed")
 
424
  log.exception("Unhandled /ask error")
425
  raise HTTPException(status_code=500, detail=str(e))
426
 
427
+ # ---- NEW: manual refresh endpoint ----
428
+ @app.post("/refresh")
429
+ def refresh(_ok: bool = Depends(require_api_key)):
430
+ """
431
+ Force re-pull dataset + rebuild index (use right after pushing new PDFs).
432
+ """
433
+ try:
434
+ info = reindex_if_needed(force=True, revision=DATA_REV)
435
+ return {"status": "ok", **info}
436
+ except Exception as e:
437
+ log.exception("/refresh failed")
438
+ raise HTTPException(status_code=500, detail=str(e))
439
+
440
  # -------------------- Runner --------------------
441
  if __name__ == "__main__":
442
  import uvicorn
bootstrap.sh CHANGED
@@ -1,20 +1,44 @@
1
  #!/usr/bin/env bash
2
  set -euo pipefail
3
 
4
- # Respect Spaces’ $PORT if provided
5
  : "${PORT:=7860}"
6
  : "${RAG_PORT:=${PORT}}"
7
  : "${RAG_DB_DIR:=/data/chroma_db}"
 
 
8
 
9
- echo "[bootstrap] Using PORT=${PORT} RAG_DB_DIR=${RAG_DB_DIR}"
10
 
11
- # Optional: pre-warm embeddings or index docs on first run
12
- if [ -d "docs" ]; then
13
- echo "[bootstrap] Ingesting docs -> ${RAG_DB_DIR}"
14
- # light + SciPy-free providers recommended (bge or fastembed)
15
- python ingest.py --docs docs --db "${RAG_DB_DIR}" --embed-provider bge --device cpu || true
 
 
16
  fi
17
 
 
 
18
  echo "[bootstrap] Starting API server..."
19
- # uvicorn picks up HOST/PORT from app.py; still pass here for clarity
20
- python app.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  #!/usr/bin/env bash
2
  set -euo pipefail
3
 
4
+ # Respect Spaces’ injected PORT; default to 7860 locally
5
  : "${PORT:=7860}"
6
  : "${RAG_PORT:=${PORT}}"
7
  : "${RAG_DB_DIR:=/data/chroma_db}"
8
+ : "${RAG_CORPUS_DIR:=/data/corpus}"
9
+ : "${RAG_FORCE_REFRESH:=0}" # set to 1 to force reindex on startup
10
 
11
+ echo "[bootstrap] PORT=${PORT} RAG_DB_DIR=${RAG_DB_DIR} RAG_CORPUS_DIR=${RAG_CORPUS_DIR} FORCE_REFRESH=${RAG_FORCE_REFRESH}"
12
 
13
+ # Ensure persistent storage paths exist (Spaces mounts /data)
14
+ mkdir -p "${RAG_DB_DIR}" "${RAG_CORPUS_DIR}" /data/.huggingface || true
15
+ chmod -R 777 /data || true
16
+
17
+ # Optional: warn if HF token is missing (private dataset or Inference needs it)
18
+ if [ -z "${HUGGINGFACEHUB_API_TOKEN:-}" ]; then
19
+ echo "[bootstrap] WARNING: HUGGINGFACEHUB_API_TOKEN is not set. Private datasets or HF Inference will fail."
20
  fi
21
 
22
+ # Start the API (app.py does dataset sync + reindex automatically on startup)
23
+ # If you want to *force* a rebuild each boot, export RAG_FORCE_REFRESH=1 and we'll hit /refresh once it's up.
24
  echo "[bootstrap] Starting API server..."
25
+ python app.py &
26
+
27
+ APP_PID=$!
28
+
29
+ # Optionally trigger a forced refresh once the server is listening
30
+ if [ "${RAG_FORCE_REFRESH}" = "1" ]; then
31
+ echo "[bootstrap] Waiting for API to come up to trigger /refresh ..."
32
+ for i in {1..30}; do
33
+ if curl -fsS "http://127.0.0.1:${PORT}/health" >/dev/null 2>&1; then
34
+ echo "[bootstrap] API is up. Forcing reindex via /refresh"
35
+ # If you protect endpoints with RAG_API_KEY, add: -H "Authorization: Bearer ${RAG_API_KEY}"
36
+ curl -fsS -X POST "http://127.0.0.1:${PORT}/refresh" >/dev/null 2>&1 || true
37
+ break
38
+ fi
39
+ sleep 1
40
+ done
41
+ fi
42
+
43
+ # Block until the API process exits (NOTE(review): `wait` does not forward SIGTERM to python — add a trap if clean shutdown matters)
44
+ wait "${APP_PID}"
sites.yaml DELETED
@@ -1,21 +0,0 @@
1
-
2
- ---
3
-
4
- ## 5) (Optional) `.gitignore`
5
- Keeps local junk out of the repo (safe to include).
6
-
7
- ```gitignore
8
- __pycache__/
9
- *.pyc
10
- *.pyo
11
- *.pyd
12
- .env
13
- .venv/
14
- venv/
15
- build/
16
- dist/
17
- .DS_Store
18
- .idea/
19
- .vscode/
20
- data/
21
- chroma_db/