Spaces:
Sleeping
Sleeping
Update rag_mini.py
Browse files · rag_mini.py (+21 −8)
rag_mini.py
CHANGED
|
@@ -3,14 +3,18 @@ import os, re, uuid, textwrap, hashlib, json, shutil
|
|
| 3 |
from pathlib import Path
|
| 4 |
from typing import Iterable, List, Tuple, Dict, Any
|
| 5 |
|
|
|
|
|
|
|
|
|
|
| 6 |
# --- Paths relative to repo ---
|
| 7 |
ROOT_DIR = Path(__file__).parent.resolve()
|
| 8 |
DATA_DIR = ROOT_DIR / "MaterialMind" / "sources"
|
| 9 |
INDEX_DIR = ROOT_DIR / "MaterialMind" / "index" / "chroma_v3"
|
| 10 |
MANIFEST = ROOT_DIR / "MaterialMind" / "index" / "manifest.json"
|
| 11 |
|
| 12 |
-
DEFAULT_TOPK
|
| 13 |
-
|
|
|
|
| 14 |
|
| 15 |
def ensure_dirs():
|
| 16 |
DATA_DIR.mkdir(parents=True, exist_ok=True)
|
|
@@ -41,7 +45,6 @@ def embed_texts(texts: List[str]) -> List[List[float]]:
|
|
| 41 |
|
| 42 |
# --- Loaders ---
|
| 43 |
def normalize_spaces(text: str) -> str:
|
| 44 |
-
import re
|
| 45 |
text = text.replace("\r", "\n")
|
| 46 |
text = re.sub(r"[ \t]+", " ", text)
|
| 47 |
text = re.sub(r"\n{3,}", "\n\n", text)
|
|
@@ -122,7 +125,7 @@ def get_collection(reset: bool = False):
|
|
| 122 |
if reset:
|
| 123 |
try: client.delete_collection("materialmind")
|
| 124 |
except Exception: pass
|
| 125 |
-
return client.get_or_create_collection(name="materialmind") # embeddings
|
| 126 |
|
| 127 |
def add_batch(col, ids, docs, metas):
|
| 128 |
embs = embed_texts(docs)
|
|
@@ -145,7 +148,6 @@ def build_index(batch_size=256) -> int:
|
|
| 145 |
|
| 146 |
# --- Incremental update with manifest ---
|
| 147 |
def file_sig(path: Path):
|
| 148 |
-
import hashlib
|
| 149 |
h = hashlib.sha1()
|
| 150 |
try:
|
| 151 |
with open(path, "rb") as f:
|
|
@@ -203,7 +205,6 @@ def update_index():
|
|
| 203 |
|
| 204 |
# --- Search ---
|
| 205 |
def search(query: str, k: int = DEFAULT_TOPK) -> List[Tuple[str, str]]:
|
| 206 |
-
import chromadb
|
| 207 |
col = get_collection(reset=False)
|
| 208 |
qvec = embed_texts([query])[0]
|
| 209 |
res = col.query(query_embeddings=[qvec], n_results=k, include=["documents", "metadatas"])
|
|
@@ -218,7 +219,7 @@ def search(query: str, k: int = DEFAULT_TOPK) -> List[Tuple[str, str]]:
|
|
| 218 |
# --- HF dataset bootstrap ---
|
| 219 |
def bootstrap_corpus_and_index():
|
| 220 |
"""
|
| 221 |
-
Download
|
| 222 |
then build or update the local vector index.
|
| 223 |
"""
|
| 224 |
ensure_dirs()
|
|
@@ -230,7 +231,6 @@ def bootstrap_corpus_and_index():
|
|
| 230 |
local_dir=DATA_DIR, local_dir_use_symlinks=False,
|
| 231 |
ignore_patterns=["*.ipynb", ".*", "__pycache__/*"]
|
| 232 |
)
|
| 233 |
-
# If index is empty, build fresh; otherwise update
|
| 234 |
if not any(INDEX_DIR.iterdir()):
|
| 235 |
n = build_index()
|
| 236 |
print(f"[BUILD] indexed {n} chunks")
|
|
@@ -238,3 +238,16 @@ def bootstrap_corpus_and_index():
|
|
| 238 |
update_index()
|
| 239 |
except Exception as e:
|
| 240 |
print(f"[WARN] dataset bootstrap failed: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
from pathlib import Path
|
| 4 |
from typing import Iterable, List, Tuple, Dict, Any
|
| 5 |
|
| 6 |
+
# Make Chroma quieter on Spaces
|
| 7 |
+
os.environ.setdefault("CHROMADB_DISABLE_TELEMETRY", "1")
|
| 8 |
+
|
| 9 |
# --- Paths relative to repo ---
|
| 10 |
ROOT_DIR = Path(__file__).parent.resolve()
|
| 11 |
DATA_DIR = ROOT_DIR / "MaterialMind" / "sources"
|
| 12 |
INDEX_DIR = ROOT_DIR / "MaterialMind" / "index" / "chroma_v3"
|
| 13 |
MANIFEST = ROOT_DIR / "MaterialMind" / "index" / "manifest.json"
|
| 14 |
|
| 15 |
+
DEFAULT_TOPK = int(os.getenv("DEFAULT_TOPK", "5"))
|
| 16 |
+
DEFAULT_MODEL = os.getenv("LLM_MODEL", "HuggingFaceH4/zephyr-7b-beta")
|
| 17 |
+
EMB_MODEL = os.getenv("EMB_MODEL", "BAAI/bge-small-en-v1.5")
|
| 18 |
|
| 19 |
def ensure_dirs():
|
| 20 |
DATA_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
| 45 |
|
| 46 |
# --- Loaders ---
|
| 47 |
def normalize_spaces(text: str) -> str:
|
|
|
|
| 48 |
text = text.replace("\r", "\n")
|
| 49 |
text = re.sub(r"[ \t]+", " ", text)
|
| 50 |
text = re.sub(r"\n{3,}", "\n\n", text)
|
|
|
|
| 125 |
if reset:
|
| 126 |
try: client.delete_collection("materialmind")
|
| 127 |
except Exception: pass
|
| 128 |
+
return client.get_or_create_collection(name="materialmind") # we provide embeddings manually
|
| 129 |
|
| 130 |
def add_batch(col, ids, docs, metas):
|
| 131 |
embs = embed_texts(docs)
|
|
|
|
| 148 |
|
| 149 |
# --- Incremental update with manifest ---
|
| 150 |
def file_sig(path: Path):
|
|
|
|
| 151 |
h = hashlib.sha1()
|
| 152 |
try:
|
| 153 |
with open(path, "rb") as f:
|
|
|
|
| 205 |
|
| 206 |
# --- Search ---
|
| 207 |
def search(query: str, k: int = DEFAULT_TOPK) -> List[Tuple[str, str]]:
|
|
|
|
| 208 |
col = get_collection(reset=False)
|
| 209 |
qvec = embed_texts([query])[0]
|
| 210 |
res = col.query(query_embeddings=[qvec], n_results=k, include=["documents", "metadatas"])
|
|
|
|
| 219 |
# --- HF dataset bootstrap ---
|
| 220 |
def bootstrap_corpus_and_index():
|
| 221 |
"""
|
| 222 |
+
Download dataset (default: Azizahalq/materialmind-corpus) into DATA_DIR,
|
| 223 |
then build or update the local vector index.
|
| 224 |
"""
|
| 225 |
ensure_dirs()
|
|
|
|
| 231 |
local_dir=DATA_DIR, local_dir_use_symlinks=False,
|
| 232 |
ignore_patterns=["*.ipynb", ".*", "__pycache__/*"]
|
| 233 |
)
|
|
|
|
| 234 |
if not any(INDEX_DIR.iterdir()):
|
| 235 |
n = build_index()
|
| 236 |
print(f"[BUILD] indexed {n} chunks")
|
|
|
|
| 238 |
update_index()
|
| 239 |
except Exception as e:
|
| 240 |
print(f"[WARN] dataset bootstrap failed: {e}")
|
| 241 |
+
|
| 242 |
+
def ensure_ready():
    """Make sure the corpus and vector index are usable before serving.

    Creates the expected directories, then applies two bootstrap rules:
    if the sources directory holds no files, pull the dataset and build
    the index from scratch; otherwise, if only the index directory is
    empty, rebuild the index from the sources already on disk.
    """
    ensure_dirs()
    have_sources = any(DATA_DIR.glob("*"))
    if not have_sources:
        print("[BOOTSTRAP] sources/ is empty → pulling dataset and indexing…")
        bootstrap_corpus_and_index()
        return
    if not any(INDEX_DIR.glob("*")):
        print("[BOOTSTRAP] index folder empty → building from existing sources/")
        build_index()
|