NimrodDev committed on
Commit
13a37a9
·
1 Parent(s): b029948
Files changed (3) hide show
  1. Dockerfile +16 -1
  2. cache_ds.py +5 -0
  3. rag.py +16 -3
Dockerfile CHANGED
@@ -1,12 +1,27 @@
1
  FROM python:3.11-slim
2
 
3
- # install faiss dependencies
4
  RUN apt-get update && apt-get install -y --no-install-recommends \
5
  build-essential \
6
  && rm -rf /var/lib/apt/lists/*
7
 
8
  WORKDIR /code
 
 
9
  COPY requirements.txt .
10
  RUN pip install --no-cache-dir -r requirements.txt
 
 
 
 
 
 
 
 
 
 
 
 
11
  COPY . .
 
12
  CMD ["gunicorn", "app:app", "-b", "0.0.0.0:7860"]
 
FROM python:3.11-slim

# install build deps (needed to compile faiss / native wheels)
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /code

# install python deps
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# ---- pre-cache dataset into /code/.cache ----
# FIX: the cache-dir env vars must be in effect BEFORE the caching RUN step.
# Previously they were set after it, so cache_ds.py wrote to the default
# ~/.cache/huggingface location while the offline runtime looked in
# /code/.cache and found nothing.
ENV HF_DATASETS_CACHE=/code/.cache
ENV TRANSFORMERS_CACHE=/code/.cache
COPY cache_ds.py .
RUN python cache_ds.py

# offline flags go AFTER the caching step (the RUN above needs network);
# at runtime HF/datasets then read only the pre-cached dir (read-only).
ENV HF_DATASETS_OFFLINE=1
ENV TRANSFORMERS_OFFLINE=1

# copy app code
COPY . .

CMD ["gunicorn", "app:app", "-b", "0.0.0.0:7860"]
cache_ds.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
# cache_ds.py
"""Pre-cache the dataset into the image at Docker build time."""
from datasets import load_dataset

# Download & cache the parquet branch. cache_dir is pinned explicitly to
# /code/.cache so the cache lands where the runtime (which runs with
# HF_DATASETS_OFFLINE=1 and HF_DATASETS_CACHE=/code/.cache) will look for
# it, regardless of which env vars happen to be set during this build step.
load_dataset(
    "NimrodDev/LD_Events2",
    revision="refs/convert/parquet",
    split="train",
    cache_dir="/code/.cache",
)
print("Dataset cached.")
rag.py CHANGED
@@ -94,20 +94,33 @@ def _fallback_answer(company: str, intent: str) -> str:
94
  # ------------------------------------------------------------------
95
  # RAM-ONLY DOCUMENT LOADER – PARQUET BRANCH
96
  # ------------------------------------------------------------------
 
 
 
97
  def load_texts() -> List[str]:
98
- ds = load_dataset(HF_DS, revision="refs/convert/parquet", split="train")
 
 
 
 
 
 
 
99
  return [row["text"] for row in ds if row.get("text")]
100
 
101
  # ------------------------------------------------------------------
102
- # SINGLE-BUILD VECTOR STORE (cached for life of worker)
103
  # ------------------------------------------------------------------
104
  @lru_cache(maxsize=1)
105
  def get_vectorstore() -> FAISS:
106
  texts = load_texts()
107
  splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=50)
108
  docs = splitter.create_documents(texts, metadatas=[{"source": HF_DS}] * len(texts))
 
 
 
109
  embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
110
- return FAISS.from_documents(docs, embeddings) # <- built ONCE
111
 
112
  # ------------------------------------------------------------------
113
  # LLM
 
94
  # ------------------------------------------------------------------
95
  # RAM-ONLY DOCUMENT LOADER – PARQUET BRANCH
96
  # ------------------------------------------------------------------
def load_texts() -> List[str]:
    """Return the non-empty ``text`` fields of the pre-cached dataset.

    Runs fully offline: the parquet branch was cached into the image at
    build time (see cache_ds.py / Dockerfile), and ``keep_in_memory=True``
    keeps the loaded split in RAM so no writable disk cache is needed at
    runtime. (The duplicate section banner the diff added above this
    function is dropped — the existing banner already labels the section.)
    """
    ds = load_dataset(
        HF_DS,
        revision="refs/convert/parquet",
        split="train",
        trust_remote_code=False,  # never execute hub-provided code
        keep_in_memory=True,      # force RAM, no disk touch
    )
    # Skip rows whose "text" field is missing or empty.
    return [row["text"] for row in ds if row.get("text")]
110
 
111
# ------------------------------------------------------------------
# SINGLE-BUILD VECTOR STORE (cached for life of worker)
# ------------------------------------------------------------------
@lru_cache(maxsize=1)
def get_vectorstore() -> FAISS:
    """Build the FAISS vector store once per worker and return it.

    Splits the cached dataset texts into 600-char chunks (50 overlap),
    embeds them with the image's pre-cached sentence-transformers model,
    and indexes them in FAISS. ``lru_cache(maxsize=1)`` makes the build
    happen exactly once per process.
    """
    texts = load_texts()
    splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=50)
    docs = splitter.create_documents(texts, metadatas=[{"source": HF_DS}] * len(texts))

    # FIX: point sentence-transformers at the model pre-cached inside the
    # image via the documented cache_folder parameter instead of mutating
    # os.environ["HF_HOME"] here. Setting env vars after the HF libraries
    # are imported may be ignored, and a cached function should not carry
    # hidden process-global side effects.
    embeddings = HuggingFaceEmbeddings(
        model_name=EMBED_MODEL,
        cache_folder="/code/.cache",
    )
    return FAISS.from_documents(docs, embeddings)  # built ONCE per worker
124
 
125
  # ------------------------------------------------------------------
126
  # LLM