cmc
Browse files- install_cache.sh +15 -0
- rag.py +7 -15
install_cache.sh
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
# install_cache.sh -- runs ONCE during HF build (online) -> caches into ./.cache
|
| 3 |
+
set -e
|
| 4 |
+
python - <<'PY'
|
| 5 |
+
from datasets import load_dataset
|
| 6 |
+
from sentence_transformers import SentenceTransformer
|
| 7 |
+
|
| 8 |
+
# 1. download plain text dataset (online, build-time only)
|
| 9 |
+
ds = load_dataset("NimrodDev/LD_Events_TEXT", split="train")
|
| 10 |
+
print("β Dataset cached at build time")
|
| 11 |
+
|
| 12 |
+
# 2. download embedding model (online, build-time only)
|
| 13 |
+
SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
| 14 |
+
print("β Embedding model cached at build time")
|
| 15 |
+
PY
|
rag.py
CHANGED
|
@@ -15,7 +15,7 @@ from supabase import create_client
|
|
| 15 |
# ------------------------------------------------------------------
|
| 16 |
# CONFIG
|
| 17 |
# ------------------------------------------------------------------
|
| 18 |
-
HF_DS
|
| 19 |
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
|
| 20 |
LLM_MODEL = "microsoft/DialoGPT-medium"
|
| 21 |
SUPABASE_URL = os.getenv("SUPABASE_URL")
|
|
@@ -97,30 +97,22 @@ def _fallback_answer(company: str, intent: str) -> str:
|
|
| 97 |
# ------------------------------------------------------------------
|
| 98 |
# RAM-ONLY DOCUMENT LOADER - OFF-LINE / PRE-CACHED
|
| 99 |
# ------------------------------------------------------------------
|
|
|
|
| 100 |
def load_texts() -> List[str]:
|
| 101 |
-
# offline +
|
| 102 |
-
ds = load_dataset(
|
| 103 |
-
HF_DS,
|
| 104 |
-
revision="refs/convert/parquet",
|
| 105 |
-
split="train",
|
| 106 |
-
trust_remote_code=False,
|
| 107 |
-
keep_in_memory=True # force RAM, no disk touch
|
| 108 |
-
)
|
| 109 |
return [row["text"] for row in ds if row.get("text")]
|
| 110 |
|
| 111 |
-
# ------------------------------------------------------------------
|
| 112 |
-
# SINGLE-BUILD VECTOR STORE (cached for life of worker)
|
| 113 |
-
# ------------------------------------------------------------------
|
| 114 |
@lru_cache(maxsize=1)
|
| 115 |
def get_vectorstore() -> FAISS:
|
| 116 |
texts = load_texts()
|
| 117 |
splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=50)
|
| 118 |
docs = splitter.create_documents(texts, metadatas=[{"source": HF_DS}] * len(texts))
|
| 119 |
|
| 120 |
-
#
|
| 121 |
-
os.environ["HF_HOME"] = "/code/.cache"
|
| 122 |
embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
|
| 123 |
-
return FAISS.from_documents(docs, embeddings)
|
| 124 |
|
| 125 |
# ------------------------------------------------------------------
|
| 126 |
# LLM
|
|
|
|
| 15 |
# ------------------------------------------------------------------
|
| 16 |
# CONFIG
|
| 17 |
# ------------------------------------------------------------------
|
| 18 |
+
HF_DS = "NimrodDev/LD_Events_TEXT" # parquet branch auto-converted
|
| 19 |
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
|
| 20 |
LLM_MODEL = "microsoft/DialoGPT-medium"
|
| 21 |
SUPABASE_URL = os.getenv("SUPABASE_URL")
|
|
|
|
| 97 |
# ------------------------------------------------------------------
|
| 98 |
# RAM-ONLY DOCUMENT LOADER - OFF-LINE / PRE-CACHED
|
| 99 |
# ------------------------------------------------------------------
|
| 100 |
+
|
| 101 |
def load_texts() -> List[str]:
|
| 102 |
+
# offline + in-memory - no write, no download at run-time
|
| 103 |
+
ds = load_dataset(HF_DS, split="train", keep_in_memory=True, trust_remote_code=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
return [row["text"] for row in ds if row.get("text")]
|
| 105 |
|
|
|
|
|
|
|
|
|
|
| 106 |
@lru_cache(maxsize=1)
|
| 107 |
def get_vectorstore() -> FAISS:
|
| 108 |
texts = load_texts()
|
| 109 |
splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=50)
|
| 110 |
docs = splitter.create_documents(texts, metadatas=[{"source": HF_DS}] * len(texts))
|
| 111 |
|
| 112 |
+
# force embeddings to use the pre-cached model dir (read-only)
|
| 113 |
+
os.environ["HF_HOME"] = "/code/.cache"
|
| 114 |
embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
|
| 115 |
+
return FAISS.from_documents(docs, embeddings) # # built ONCE per worker
|
| 116 |
|
| 117 |
# ------------------------------------------------------------------
|
| 118 |
# LLM
|