NimrodDev committed on
Commit
b1dc91c
·
1 Parent(s): fa3d68f
Files changed (4) hide show
  1. README.md +3 -4
  2. create_json.py +11 -0
  3. fetch_text.py +13 -0
  4. rag.py +12 -15
README.md CHANGED
@@ -1,12 +1,11 @@
1
  ---
2
- title: RAG WhatsApp Assistant
3
  emoji: πŸ—οΈ
4
  colorFrom: blue
5
  colorTo: green
6
  sdk: docker
7
- sdk_version: 3.11
8
- app_file: app.py
9
- pinned: false
10
  ---
11
 
12
  Lightning-fast RAG webhook for **Lamaki Designs** & **LD Events**.
 
1
  ---
2
+ title: Lamaki RAG Bot
3
  emoji: πŸ—οΈ
4
  colorFrom: blue
5
  colorTo: green
6
  sdk: docker
7
+ pre_install:
8
+ - python fetch_text.py # <-- fetches text at build time
 
9
  ---
10
 
11
  Lightning-fast RAG webhook for **Lamaki Designs** & **LD Events**.
create_json.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
# create_json.py
# Build-time helper: export the plain-text column of the LD_Events2
# dataset into ld_events_text.json, which rag.py reads offline at runtime.
from datasets import load_dataset
import json

# Load from the auto-converted parquet branch of the dataset repo.
dataset = load_dataset(
    "NimrodDev/LD_Events2",
    revision="refs/convert/parquet",
    split="train",
)

# Keep only rows whose "text" field is present and non-empty,
# normalised to bare {"text": ...} records.
records = [{"text": row["text"]} for row in dataset if row.get("text")]

with open("ld_events_text.json", "w", encoding="utf-8") as fh:
    json.dump(records, fh, ensure_ascii=False, indent=2)

print("Saved", len(records), "rows to ld_events_text.json")
fetch_text.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# fetch_text.py -- runs ONCE during the HF Space build (network available).
#
# Downloads up to 1000 rows of the LD_Events2 dataset via the
# datasets-server REST API and writes the plain text to
# ld_events_text.json, which rag.py loads offline at runtime.
import json

import requests

URL = "https://datasets-server.huggingface.co/rows"
params = dict(
    dataset="NimrodDev/LD_Events2",
    config="default",
    split="train",
    offset=0,
    length=1000,
)

resp = requests.get(URL, params=params, timeout=60)
# Fail the build loudly on HTTP errors (rate limit, bad dataset name, ...)
# instead of a confusing KeyError when the error payload has no "rows" key.
resp.raise_for_status()
rows = resp.json()["rows"]

# Keep only rows whose "text" field is present and non-empty.
plain = [{"text": r["row"]["text"]} for r in rows if r["row"].get("text")]

with open("ld_events_text.json", "w", encoding="utf-8") as f:
    json.dump(plain, f, ensure_ascii=False, indent=2)

print("Fetched & saved", len(plain), "rows to ld_events_text.json")
rag.py CHANGED
@@ -1,10 +1,9 @@
1
- # rag.py – single-index, zero-disk, HF-Space-safe edition
2
  from __future__ import annotations
3
- import os, re
4
  from functools import lru_cache
5
  from typing import List, Tuple
6
 
7
- from datasets import load_dataset
8
  from langchain.text_splitter import RecursiveCharacterTextSplitter
9
  from langchain_community.vectorstores import FAISS
10
  from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
@@ -15,7 +14,7 @@ from supabase import create_client
15
  # ------------------------------------------------------------------
16
  # CONFIG
17
  # ------------------------------------------------------------------
18
- HF_DS = "NimrodDev/LD_Events_TEXT" # parquet branch auto-converted
19
  EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
20
  LLM_MODEL = "microsoft/DialoGPT-medium"
21
  SUPABASE_URL = os.getenv("SUPABASE_URL")
@@ -92,27 +91,25 @@ def _fallback_answer(company: str, intent: str) -> str:
92
  return FALLBACKS[company].get(intent, FALLBACKS[company]["default"])
93
 
94
  # ------------------------------------------------------------------
95
- # RAM-ONLY DOCUMENT LOADER – PARQUET BRANCH
96
  # ------------------------------------------------------------------
97
- # ------------------------------------------------------------------
98
- # RAM-ONLY DOCUMENT LOADER – OFF-LINE / PRE-CACHED
99
- # ------------------------------------------------------------------
100
-
101
  def load_texts() -> List[str]:
102
- # offline + in-memory β†’ no write, no download at run-time
103
- ds = load_dataset(HF_DS, split="train", keep_in_memory=True, trust_remote_code=False)
104
- return [row["text"] for row in ds if row.get("text")]
105
 
 
 
 
106
  @lru_cache(maxsize=1)
107
  def get_vectorstore() -> FAISS:
108
  texts = load_texts()
109
  splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=50)
110
- docs = splitter.create_documents(texts, metadatas=[{"source": HF_DS}] * len(texts))
111
 
112
- # force embeddings to use the pre-cached model dir (read-only)
113
  os.environ["HF_HOME"] = "/code/.cache"
114
  embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
115
- return FAISS.from_documents(docs, embeddings) # # built ONCE per worker
116
 
117
  # ------------------------------------------------------------------
118
  # LLM
 
1
+ # rag.py –- zero-disk, single-index, API-fetched text, offline runtime
2
  from __future__ import annotations
3
+ import os, re, json
4
  from functools import lru_cache
5
  from typing import List, Tuple
6
 
 
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
  from langchain_community.vectorstores import FAISS
9
  from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
 
14
  # ------------------------------------------------------------------
15
  # CONFIG
16
  # ------------------------------------------------------------------
17
+ TEXT_FILE = "ld_events_text.json" # local file created at build time
18
  EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
19
  LLM_MODEL = "microsoft/DialoGPT-medium"
20
  SUPABASE_URL = os.getenv("SUPABASE_URL")
 
91
  return FALLBACKS[company].get(intent, FALLBACKS[company]["default"])
92
 
93
  # ------------------------------------------------------------------
94
+ # RAM-ONLY DOCUMENT LOADER – LOCAL JSON CREATED AT BUILD TIME
95
  # ------------------------------------------------------------------
 
 
 
 
96
def load_texts() -> List[str]:
    """Return the non-empty text rows from the build-time JSON file.

    The file lives next to this module and was written during the image
    build (see fetch_text.py), so no network access happens at runtime.
    """
    here = os.path.dirname(__file__)
    with open(os.path.join(here, TEXT_FILE), encoding="utf-8") as fh:
        rows = json.load(fh)
    return [row["text"] for row in rows if row.get("text")]
 
99
 
100
# ------------------------------------------------------------------
# SINGLE-BUILD VECTOR STORE (cached for life of worker)
# ------------------------------------------------------------------
@lru_cache(maxsize=1)
def get_vectorstore() -> FAISS:
    """Build the in-memory FAISS index once per worker and return it.

    Splits the build-time texts into 600-char chunks (50 overlap),
    embeds them with the pre-cached sentence-transformers model, and
    caches the resulting store via lru_cache.
    """
    raw_texts = load_texts()
    chunker = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=50)
    documents = chunker.create_documents(
        raw_texts,
        metadatas=[{"source": "api"}] * len(raw_texts),
    )

    # Point HF at the read-only model cache baked into the image
    # before the embedding model is instantiated.
    os.environ["HF_HOME"] = "/code/.cache"
    embedder = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
    return FAISS.from_documents(documents, embedder)  # built ONCE per worker
113
 
114
  # ------------------------------------------------------------------
115
  # LLM