cmc
Browse files- README.md +3 -4
- create_json.py +11 -0
- fetch_text.py +13 -0
- rag.py +12 -15
README.md
CHANGED
|
@@ -1,12 +1,11 @@
|
|
| 1 |
---
|
| 2 |
-
title: RAG
|
| 3 |
emoji: ποΈ
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: green
|
| 6 |
sdk: docker
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
pinned: false
|
| 10 |
---
|
| 11 |
|
| 12 |
Lightning-fast RAG webhook for **Lamaki Designs** & **LD Events**.
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Lamaki RAG Bot
|
| 3 |
emoji: ποΈ
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: green
|
| 6 |
sdk: docker
|
| 7 |
+
pre_install:
|
| 8 |
+
- python fetch_text.py # <-- fetches text at build time
|
|
|
|
| 9 |
---
|
| 10 |
|
| 11 |
Lightning-fast RAG webhook for **Lamaki Designs** & **LD Events**.
|
create_json.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# create_json.py -- dump the text column of NimrodDev/LD_Events2 to a local JSON file.
import json

from datasets import load_dataset

# Load the auto-converted parquet branch of the dataset (train split only).
dataset = load_dataset(
    "NimrodDev/LD_Events2",
    revision="refs/convert/parquet",
    split="train",
)

# Keep only the rows that actually carry text, wrapped as {"text": ...} records.
records = [{"text": entry["text"]} for entry in dataset if entry.get("text")]

with open("ld_events_text.json", "w", encoding="utf-8") as f:
    json.dump(records, f, ensure_ascii=False, indent=2)

print("Saved", len(records), "rows to ld_events_text.json")
|
fetch_text.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# fetch_text.py -- runs ONCE during the HF Space build (online).
#
# Fetches up to 1000 rows of NimrodDev/LD_Events2 through the datasets-server
# HTTP API and snapshots the plain text to ld_events_text.json so the runtime
# container can stay fully offline.
import json

import requests

URL = "https://datasets-server.huggingface.co/rows"
params = dict(dataset="NimrodDev/LD_Events2", config="default", split="train", offset=0, length=1000)

resp = requests.get(URL, params=params, timeout=60)
# Fail the build loudly on an HTTP error instead of dying later with a
# confusing KeyError on "rows".
resp.raise_for_status()
rows = resp.json()["rows"]

# Each API entry is wrapped as {"row": {...}}; keep only non-empty text fields.
plain = [{"text": r["row"]["text"]} for r in rows if r["row"].get("text")]

with open("ld_events_text.json", "w", encoding="utf-8") as f:
    json.dump(plain, f, ensure_ascii=False, indent=2)

print("Fetched & saved", len(plain), "rows to ld_events_text.json")
|
rag.py
CHANGED
|
@@ -1,10 +1,9 @@
|
|
| 1 |
-
# rag.py
|
| 2 |
from __future__ import annotations
|
| 3 |
-
import os, re
|
| 4 |
from functools import lru_cache
|
| 5 |
from typing import List, Tuple
|
| 6 |
|
| 7 |
-
from datasets import load_dataset
|
| 8 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 9 |
from langchain_community.vectorstores import FAISS
|
| 10 |
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
|
|
@@ -15,7 +14,7 @@ from supabase import create_client
|
|
| 15 |
# ------------------------------------------------------------------
|
| 16 |
# CONFIG
|
| 17 |
# ------------------------------------------------------------------
|
| 18 |
-
|
| 19 |
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
|
| 20 |
LLM_MODEL = "microsoft/DialoGPT-medium"
|
| 21 |
SUPABASE_URL = os.getenv("SUPABASE_URL")
|
|
@@ -92,27 +91,25 @@ def _fallback_answer(company: str, intent: str) -> str:
|
|
| 92 |
return FALLBACKS[company].get(intent, FALLBACKS[company]["default"])
|
| 93 |
|
| 94 |
# ------------------------------------------------------------------
|
| 95 |
-
# RAM-ONLY DOCUMENT LOADER —
|
| 96 |
# ------------------------------------------------------------------
|
| 97 |
-
# ------------------------------------------------------------------
|
| 98 |
-
# RAM-ONLY DOCUMENT LOADER — OFF-LINE / PRE-CACHED
|
| 99 |
-
# ------------------------------------------------------------------
|
| 100 |
-
|
| 101 |
def load_texts() -> List[str]:
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
return [row["text"] for row in ds if row.get("text")]
|
| 105 |
|
|
|
|
|
|
|
|
|
|
| 106 |
@lru_cache(maxsize=1)
|
| 107 |
def get_vectorstore() -> FAISS:
|
| 108 |
texts = load_texts()
|
| 109 |
splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=50)
|
| 110 |
-
docs = splitter.create_documents(texts, metadatas=[{"source":
|
| 111 |
|
| 112 |
-
#
|
| 113 |
os.environ["HF_HOME"] = "/code/.cache"
|
| 114 |
embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
|
| 115 |
-
return FAISS.from_documents(docs, embeddings) #
|
| 116 |
|
| 117 |
# ------------------------------------------------------------------
|
| 118 |
# LLM
|
|
|
|
| 1 |
+
# rag.py — zero-disk, single-index, API-fetched text, offline runtime
|
| 2 |
from __future__ import annotations
|
| 3 |
+
import os, re, json
|
| 4 |
from functools import lru_cache
|
| 5 |
from typing import List, Tuple
|
| 6 |
|
|
|
|
| 7 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 8 |
from langchain_community.vectorstores import FAISS
|
| 9 |
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
|
|
|
|
| 14 |
# ------------------------------------------------------------------
|
| 15 |
# CONFIG
|
| 16 |
# ------------------------------------------------------------------
|
| 17 |
+
TEXT_FILE = "ld_events_text.json" # local file created at build time
|
| 18 |
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
|
| 19 |
LLM_MODEL = "microsoft/DialoGPT-medium"
|
| 20 |
SUPABASE_URL = os.getenv("SUPABASE_URL")
|
|
|
|
| 91 |
return FALLBACKS[company].get(intent, FALLBACKS[company]["default"])
|
| 92 |
|
| 93 |
# ------------------------------------------------------------------
|
| 94 |
+
# RAM-ONLY DOCUMENT LOADER — LOCAL JSON CREATED AT BUILD TIME
|
| 95 |
# ------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
def load_texts() -> List[str]:
    """Read the build-time JSON snapshot and return its non-empty text fields."""
    # The snapshot sits next to this module; resolve it relative to __file__.
    path = os.path.join(os.path.dirname(__file__), TEXT_FILE)
    with open(path, encoding="utf-8") as f:
        rows = json.load(f)

    texts: List[str] = []
    for row in rows:
        if row.get("text"):
            texts.append(row["text"])
    return texts
|
|
|
|
| 99 |
|
| 100 |
+
# ------------------------------------------------------------------
|
| 101 |
+
# SINGLE-BUILD VECTOR STORE (cached for life of worker)
|
| 102 |
+
# ------------------------------------------------------------------
|
| 103 |
@lru_cache(maxsize=1)
def get_vectorstore() -> FAISS:
    """Build the FAISS index over the local text snapshot (once per worker)."""
    # Point the HF cache at the read-only model dir baked into the image
    # before the embedding model is instantiated.
    os.environ["HF_HOME"] = "/code/.cache"

    raw_texts = load_texts()
    metadata = [{"source": "api"}] * len(raw_texts)
    chunks = RecursiveCharacterTextSplitter(
        chunk_size=600, chunk_overlap=50
    ).create_documents(raw_texts, metadatas=metadata)

    embedder = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
    # lru_cache(maxsize=1) above guarantees this index is built exactly once
    # for the life of the worker.
    return FAISS.from_documents(chunks, embedder)
|
| 113 |
|
| 114 |
# ------------------------------------------------------------------
|
| 115 |
# LLM
|