NimrodDev committed on
Commit
b1dc91c
·
1 Parent(s): fa3d68f
Files changed (4) hide show
  1. README.md +3 -4
  2. create_json.py +11 -0
  3. fetch_text.py +13 -0
  4. rag.py +12 -15
README.md CHANGED
@@ -1,12 +1,11 @@
1
  ---
2
- title: RAG WhatsApp Assistant
3
  emoji: πŸ—οΈ
4
  colorFrom: blue
5
  colorTo: green
6
  sdk: docker
7
- sdk_version: 3.11
8
- app_file: app.py
9
- pinned: false
10
  ---
11
 
12
  Lightning-fast RAG webhook for **Lamaki Designs** & **LD Events**.
 
1
  ---
2
+ title: Lamaki RAG Bot
3
  emoji: πŸ—οΈ
4
  colorFrom: blue
5
  colorTo: green
6
  sdk: docker
7
+ pre_install:
8
+ - python fetch_text.py # <-- fetches text at build time
 
9
  ---
10
 
11
  Lightning-fast RAG webhook for **Lamaki Designs** & **LD Events**.
create_json.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
# create_json.py
# Build-time helper: export the plain-text column of the LD_Events2
# dataset into ld_events_text.json, which rag.py reads offline at runtime.
from datasets import load_dataset
import json

# Load from the auto-converted parquet branch of the dataset repo.
dataset = load_dataset(
    "NimrodDev/LD_Events2",
    revision="refs/convert/parquet",
    split="train",
)

# Keep only rows whose "text" field is present and non-empty,
# normalised to bare {"text": ...} records.
records = [{"text": row["text"]} for row in dataset if row.get("text")]

with open("ld_events_text.json", "w", encoding="utf-8") as fh:
    json.dump(records, fh, ensure_ascii=False, indent=2)

print("Saved", len(records), "rows to ld_events_text.json")
fetch_text.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# fetch_text.py -- runs ONCE during the HF Space build (network available).
#
# Downloads up to 1000 rows of the LD_Events2 dataset via the
# datasets-server REST API and writes the plain text to
# ld_events_text.json, which rag.py loads offline at runtime.
import json

import requests

URL = "https://datasets-server.huggingface.co/rows"
params = dict(
    dataset="NimrodDev/LD_Events2",
    config="default",
    split="train",
    offset=0,
    length=1000,
)

resp = requests.get(URL, params=params, timeout=60)
# Fail the build loudly on HTTP errors (rate limit, bad dataset name, ...)
# instead of a confusing KeyError when the error payload has no "rows" key.
resp.raise_for_status()
rows = resp.json()["rows"]

# Keep only rows whose "text" field is present and non-empty.
plain = [{"text": r["row"]["text"]} for r in rows if r["row"].get("text")]

with open("ld_events_text.json", "w", encoding="utf-8") as f:
    json.dump(plain, f, ensure_ascii=False, indent=2)

print("Fetched & saved", len(plain), "rows to ld_events_text.json")
rag.py CHANGED
@@ -1,10 +1,9 @@
1
- # rag.py – single-index, zero-disk, HF-Space-safe edition
2
  from __future__ import annotations
3
- import os, re
4
  from functools import lru_cache
5
  from typing import List, Tuple
6
 
7
- from datasets import load_dataset
8
  from langchain.text_splitter import RecursiveCharacterTextSplitter
9
  from langchain_community.vectorstores import FAISS
10
  from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
@@ -15,7 +14,7 @@ from supabase import create_client
15
  # ------------------------------------------------------------------
16
  # CONFIG
17
  # ------------------------------------------------------------------
18
- HF_DS = "NimrodDev/LD_Events_TEXT" # parquet branch auto-converted
19
  EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
20
  LLM_MODEL = "microsoft/DialoGPT-medium"
21
  SUPABASE_URL = os.getenv("SUPABASE_URL")
@@ -92,27 +91,25 @@ def _fallback_answer(company: str, intent: str) -> str:
92
  return FALLBACKS[company].get(intent, FALLBACKS[company]["default"])
93
 
94
  # ------------------------------------------------------------------
95
- # RAM-ONLY DOCUMENT LOADER – PARQUET BRANCH
96
  # ------------------------------------------------------------------
97
- # ------------------------------------------------------------------
98
- # RAM-ONLY DOCUMENT LOADER – OFF-LINE / PRE-CACHED
99
- # ------------------------------------------------------------------
100
-
101
  def load_texts() -> List[str]:
102
- # offline + in-memory β†’ no write, no download at run-time
103
- ds = load_dataset(HF_DS, split="train", keep_in_memory=True, trust_remote_code=False)
104
- return [row["text"] for row in ds if row.get("text")]
105
 
 
 
 
106
  @lru_cache(maxsize=1)
107
  def get_vectorstore() -> FAISS:
108
  texts = load_texts()
109
  splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=50)
110
- docs = splitter.create_documents(texts, metadatas=[{"source": HF_DS}] * len(texts))
111
 
112
- # force embeddings to use the pre-cached model dir (read-only)
113
  os.environ["HF_HOME"] = "/code/.cache"
114
  embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
115
- return FAISS.from_documents(docs, embeddings) # # built ONCE per worker
116
 
117
  # ------------------------------------------------------------------
118
  # LLM
 
1
+ # rag.py –- zero-disk, single-index, API-fetched text, offline runtime
2
  from __future__ import annotations
3
+ import os, re, json
4
  from functools import lru_cache
5
  from typing import List, Tuple
6
 
 
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
  from langchain_community.vectorstores import FAISS
9
  from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
 
14
  # ------------------------------------------------------------------
15
  # CONFIG
16
  # ------------------------------------------------------------------
17
+ TEXT_FILE = "ld_events_text.json" # local file created at build time
18
  EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
19
  LLM_MODEL = "microsoft/DialoGPT-medium"
20
  SUPABASE_URL = os.getenv("SUPABASE_URL")
 
91
  return FALLBACKS[company].get(intent, FALLBACKS[company]["default"])
92
 
93
  # ------------------------------------------------------------------
94
+ # RAM-ONLY DOCUMENT LOADER – LOCAL JSON CREATED AT BUILD TIME
95
  # ------------------------------------------------------------------
 
 
 
 
96
def load_texts() -> List[str]:
    """Return the non-empty text rows from the build-time JSON file.

    The file lives next to this module and was written during the image
    build (see fetch_text.py), so no network access happens at runtime.
    """
    here = os.path.dirname(__file__)
    with open(os.path.join(here, TEXT_FILE), encoding="utf-8") as fh:
        rows = json.load(fh)
    return [row["text"] for row in rows if row.get("text")]
 
99
 
100
# ------------------------------------------------------------------
# SINGLE-BUILD VECTOR STORE (cached for life of worker)
# ------------------------------------------------------------------
@lru_cache(maxsize=1)
def get_vectorstore() -> FAISS:
    """Build the in-memory FAISS index once per worker and return it.

    Splits the build-time texts into 600-char chunks (50 overlap),
    embeds them with the pre-cached sentence-transformers model, and
    caches the resulting store via lru_cache.
    """
    raw_texts = load_texts()
    chunker = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=50)
    documents = chunker.create_documents(
        raw_texts,
        metadatas=[{"source": "api"}] * len(raw_texts),
    )

    # Point HF at the read-only model cache baked into the image
    # before the embedding model is instantiated.
    os.environ["HF_HOME"] = "/code/.cache"
    embedder = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
    return FAISS.from_documents(documents, embedder)  # built ONCE per worker
113
 
114
  # ------------------------------------------------------------------
115
  # LLM