NimrodDev committed on
Commit
3ab9e0a
·
1 Parent(s): 13a37a9
Files changed (2) hide show
  1. install_cache.sh +15 -0
  2. rag.py +7 -15
install_cache.sh ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # install_cache.sh –– runs ONCE during HF build (online) β†’ caches into ./.cache
3
+ set -e
4
+ python - <<'PY'
5
+ from datasets import load_dataset
6
+ from sentence_transformers import SentenceTransformer
7
+
8
+ # 1. download plain text dataset (online, build-time only)
9
+ ds = load_dataset("NimrodDev/LD_Events_TEXT", split="train")
10
+ print("βœ“ Dataset cached at build time")
11
+
12
+ # 2. download embedding model (online, build-time only)
13
+ SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
14
+ print("βœ“ Embedding model cached at build time")
15
+ PY
rag.py CHANGED
@@ -15,7 +15,7 @@ from supabase import create_client
15
  # ------------------------------------------------------------------
16
  # CONFIG
17
  # ------------------------------------------------------------------
18
- HF_DS = "NimrodDev/LD_Events2" # parquet branch auto-converted
19
  EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
20
  LLM_MODEL = "microsoft/DialoGPT-medium"
21
  SUPABASE_URL = os.getenv("SUPABASE_URL")
@@ -97,30 +97,22 @@ def _fallback_answer(company: str, intent: str) -> str:
97
  # ------------------------------------------------------------------
98
  # RAM-ONLY DOCUMENT LOADER – OFF-LINE / PRE-CACHED
99
  # ------------------------------------------------------------------
 
100
  def load_texts() -> List[str]:
101
- # offline + trust_remote_code=False -> no write, no download
102
- ds = load_dataset(
103
- HF_DS,
104
- revision="refs/convert/parquet",
105
- split="train",
106
- trust_remote_code=False,
107
- keep_in_memory=True # force RAM, no disk touch
108
- )
109
  return [row["text"] for row in ds if row.get("text")]
110
 
111
- # ------------------------------------------------------------------
112
- # SINGLE-BUILD VECTOR STORE (cached for life of worker)
113
- # ------------------------------------------------------------------
114
  @lru_cache(maxsize=1)
115
  def get_vectorstore() -> FAISS:
116
  texts = load_texts()
117
  splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=50)
118
  docs = splitter.create_documents(texts, metadatas=[{"source": HF_DS}] * len(texts))
119
 
120
- # tell sentence-transformers to use the pre-cached model inside the image
121
- os.environ["HF_HOME"] = "/code/.cache" # <-- NEW
122
  embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
123
- return FAISS.from_documents(docs, embeddings) # built ONCE per worker
124
 
125
  # ------------------------------------------------------------------
126
  # LLM
 
15
  # ------------------------------------------------------------------
16
  # CONFIG
17
  # ------------------------------------------------------------------
18
+ HF_DS = "NimrodDev/LD_Events_TEXT" # plain-text dataset pre-cached at build time
19
  EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
20
  LLM_MODEL = "microsoft/DialoGPT-medium"
21
  SUPABASE_URL = os.getenv("SUPABASE_URL")
 
97
  # ------------------------------------------------------------------
98
  # RAM-ONLY DOCUMENT LOADER – OFF-LINE / PRE-CACHED
99
  # ------------------------------------------------------------------
100
+
101
  def load_texts() -> List[str]:
102
+ # offline + in-memory β†’ no write, no download at run-time
103
+ ds = load_dataset(HF_DS, split="train", keep_in_memory=True, trust_remote_code=False)
 
 
 
 
 
 
104
  return [row["text"] for row in ds if row.get("text")]
105
 
 
 
 
106
  @lru_cache(maxsize=1)
107
  def get_vectorstore() -> FAISS:
108
  texts = load_texts()
109
  splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=50)
110
  docs = splitter.create_documents(texts, metadatas=[{"source": HF_DS}] * len(texts))
111
 
112
+ # force embeddings to use the pre-cached model dir (read-only)
113
+ os.environ["HF_HOME"] = "/code/.cache"
114
  embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
115
+ return FAISS.from_documents(docs, embeddings) # # built ONCE per worker
116
 
117
  # ------------------------------------------------------------------
118
  # LLM