NimrodDev committed on
Commit
33d913e
·
1 Parent(s): cd7e1d5

bullet-proof fetch + empty corpus fallback

Browse files
Files changed (1) hide show
  1. rag.py +37 -25
rag.py CHANGED
@@ -1,6 +1,6 @@
1
- # rag.py –- zero-disk, single-index, API-fetched text, offline runtime
2
  from __future__ import annotations
3
- import os, re, json
4
  from functools import lru_cache
5
  from typing import List, Tuple
6
 
@@ -14,9 +14,13 @@ from supabase import create_client
14
  # ------------------------------------------------------------------
15
  # CONFIG
16
  # ------------------------------------------------------------------
17
- TEXT_FILE = "ld_events_text.json" # local file created at build time
18
- EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
19
- LLM_MODEL = "microsoft/DialoGPT-medium"
 
 
 
 
20
  SUPABASE_URL = os.getenv("SUPABASE_URL")
21
  SUPABASE_KEY = os.getenv("SUPABASE_KEY")
22
  HF_TOKEN = os.getenv("HF_TOKEN")
@@ -33,7 +37,7 @@ MONEY_RE = re.compile(r"\b(price|cost|budget|cheap|expensive|money|usd|ksh|p
33
  COMPLAIN_RE = re.compile(r"\b(complain|bad|terrible|awful|disappointed|angry|slow|rude)\b", re.I)
34
 
35
  # ------------------------------------------------------------------
36
- # FALLBACKS – UNBIASED GREETING
37
  # ------------------------------------------------------------------
38
  FALLBACKS = {
39
  "LD Events": {
@@ -91,25 +95,36 @@ def _fallback_answer(company: str, intent: str) -> str:
91
  return FALLBACKS[company].get(intent, FALLBACKS[company]["default"])
92
 
93
  # ------------------------------------------------------------------
94
- # RAM-ONLY DOCUMENT LOADER – LOCAL JSON CREATED AT BUILD TIME
95
  # ------------------------------------------------------------------
96
- def load_texts() -> List[str]:
97
- with open(os.path.join(os.path.dirname(__file__), TEXT_FILE), encoding="utf-8") as f:
98
- return [row["text"] for row in json.load(f) if row.get("text")]
99
-
100
- # ------------------------------------------------------------------
101
- # SINGLE-BUILD VECTOR STORE (cached for life of worker)
 
 
 
 
 
 
 
 
 
 
102
  # ------------------------------------------------------------------
103
  @lru_cache(maxsize=1)
104
  def get_vectorstore() -> FAISS:
105
- texts = load_texts()
106
- splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=50)
107
- docs = splitter.create_documents(texts, metadatas=[{"source": "api"}] * len(texts))
 
108
 
109
- # use pre-cached model dir (read-only)
110
- os.environ["HF_HOME"] = "/code/.cache"
111
  embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
112
- return FAISS.from_documents(docs, embeddings) # built ONCE per worker
113
 
114
  # ------------------------------------------------------------------
115
  # LLM
@@ -130,7 +145,7 @@ Question: {question}
130
  Answer:""")
131
 
132
  # ------------------------------------------------------------------
133
- # MAIN ENTRY
134
  # ------------------------------------------------------------------
135
  def ask_question(phone: str, question: str) -> Tuple[str, List]:
136
  intent = _detect_intent(question)
@@ -148,10 +163,10 @@ def ask_question(phone: str, question: str) -> Tuple[str, List]:
148
  _save_chat(phone, question, answer)
149
  return answer, []
150
 
151
- # RAG path – re-uses the *same* index every call
152
  vs = get_vectorstore()
153
  docs = vs.similarity_search(question, k=3)
154
- if not docs:
155
  answer = _fallback_answer(company, intent if intent in ("money", "complain") else "default")
156
  _save_chat(phone, question, answer)
157
  return answer, []
@@ -167,9 +182,6 @@ def ask_question(phone: str, question: str) -> Tuple[str, List]:
167
  _save_chat(phone, question, answer)
168
  return answer, result.get("source_documents", [])
169
 
170
- # ------------------------------------------------------------------
171
- # CHAT PERSISTENCE
172
- # ------------------------------------------------------------------
173
  def _save_chat(phone: str, q: str, a: str) -> None:
174
  supabase.table("chat_memory").insert({"user_phone": phone, "role": "user", "message": q}).execute()
175
  supabase.table("chat_memory").insert({"user_phone": phone, "role": "assistant", "message": a}).execute()
 
1
+ # rag.py — bullet-proof: online fetch with fallback on any error
2
  from __future__ import annotations
3
+ import os, re, json, requests
4
  from functools import lru_cache
5
  from typing import List, Tuple
6
 
 
14
  # ------------------------------------------------------------------
15
  # CONFIG
16
  # ------------------------------------------------------------------
17
+ DATASET_API = "https://datasets-server.huggingface.co/rows"
18
+ DATASET = "NimrodDev/LD_Events2"
19
+ CONFIG = "default"
20
+ SPLIT = "train"
21
+ LIMIT = 500
22
+ EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
23
+ LLM_MODEL = "microsoft/DialoGPT-medium"
24
  SUPABASE_URL = os.getenv("SUPABASE_URL")
25
  SUPABASE_KEY = os.getenv("SUPABASE_KEY")
26
  HF_TOKEN = os.getenv("HF_TOKEN")
 
37
  COMPLAIN_RE = re.compile(r"\b(complain|bad|terrible|awful|disappointed|angry|slow|rude)\b", re.I)
38
 
39
  # ------------------------------------------------------------------
40
+ # FALLBACKS
41
  # ------------------------------------------------------------------
42
  FALLBACKS = {
43
  "LD Events": {
 
95
  return FALLBACKS[company].get(intent, FALLBACKS[company]["default"])
96
 
97
  # ------------------------------------------------------------------
98
+ # BULLET-PROOF ONLINE FETCH – RETURNS EMPTY LIST ON ANY ERROR
99
  # ------------------------------------------------------------------
100
@lru_cache(maxsize=1)
def get_texts() -> List[str]:
    """Fetch the corpus from the HF datasets-server rows API (once per worker).

    Returns the non-empty ``"text"`` fields of the first LIMIT rows of
    DATASET, or an empty list if the request fails for any reason — the
    caller treats an empty corpus as "no data" and serves fallbacks.
    """
    try:
        # Let requests build and URL-encode the query string instead of
        # hand-rolling an f-string URL; values with special characters
        # (e.g. a "/" in the dataset id) stay correctly encoded.
        params = {
            "dataset": DATASET,
            "config": CONFIG,
            "split": SPLIT,
            "offset": 0,
            "length": LIMIT,
        }
        # Send the HF token when configured — presumably required if the
        # dataset is gated/private (TODO confirm); harmless for public ones.
        headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
        r = requests.get(DATASET_API, params=params, headers=headers, timeout=60)
        r.raise_for_status()
        rows = r.json()["rows"]
        texts = [row["row"]["text"] for row in rows if row["row"].get("text")]
        print(f"✓ Fetched {len(texts)} texts from {DATASET}")
        return texts
    except Exception as e:
        # Deliberately broad: any failure (network, auth, JSON shape)
        # degrades to an empty corpus instead of crashing the worker.
        print(f"⚠ Dataset fetch failed: {e} – using empty corpus")
        return []
113
+
114
+ # ------------------------------------------------------------------
115
+ # RAM-ONLY VECTOR STORE – HANDLES EMPTY CORPUS GRACEFULLY
116
  # ------------------------------------------------------------------
117
@lru_cache(maxsize=1)
def get_vectorstore() -> FAISS:
    """Build the FAISS index once per worker from the fetched corpus.

    If the corpus is empty (fetch failed), index a single empty string so
    a retriever still exists; the caller detects the blank dummy document
    and answers from FALLBACKS instead.
    """
    # Construct the embedding model once for either branch (was duplicated).
    embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)

    texts = get_texts()
    if not texts:
        # Dummy one-entry index: similarity_search returns the blank doc,
        # which downstream code treats as "no real data".
        return FAISS.from_texts([""], embeddings)

    splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=50)
    docs = splitter.create_documents(texts, metadatas=[{"source": DATASET}] * len(texts))
    return FAISS.from_documents(docs, embeddings)
128
 
129
  # ------------------------------------------------------------------
130
  # LLM
 
145
  Answer:""")
146
 
147
  # ------------------------------------------------------------------
148
+ # MAIN ENTRY – NEVER CRASHES
149
  # ------------------------------------------------------------------
150
  def ask_question(phone: str, question: str) -> Tuple[str, List]:
151
  intent = _detect_intent(question)
 
163
  _save_chat(phone, question, answer)
164
  return answer, []
165
 
166
+ # RAG path – same index every call (empty index β†’ no docs β†’ fallback)
167
  vs = get_vectorstore()
168
  docs = vs.similarity_search(question, k=3)
169
+ if not docs or docs[0].page_content.strip() == "": # empty dummy
170
  answer = _fallback_answer(company, intent if intent in ("money", "complain") else "default")
171
  _save_chat(phone, question, answer)
172
  return answer, []
 
182
  _save_chat(phone, question, answer)
183
  return answer, result.get("source_documents", [])
184
 
 
 
 
185
def _save_chat(phone: str, q: str, a: str) -> None:
    """Persist one question/answer exchange to the chat_memory table."""
    # One insert per row, same order as before: user turn, then assistant turn.
    for role, message in (("user", q), ("assistant", a)):
        supabase.table("chat_memory").insert(
            {"user_phone": phone, "role": role, "message": message}
        ).execute()