Spaces:
Sleeping
Sleeping
Update rag_mini.py
Browse files · rag_mini.py (+21 −8)
rag_mini.py
CHANGED
|
@@ -3,14 +3,18 @@ import os, re, uuid, textwrap, hashlib, json, shutil
|
|
| 3 |
from pathlib import Path
|
| 4 |
from typing import Iterable, List, Tuple, Dict, Any
|
| 5 |
|
|
|
|
|
|
|
|
|
|
| 6 |
# --- Paths relative to repo ---
|
| 7 |
ROOT_DIR = Path(__file__).parent.resolve()
|
| 8 |
DATA_DIR = ROOT_DIR / "MaterialMind" / "sources"
|
| 9 |
INDEX_DIR = ROOT_DIR / "MaterialMind" / "index" / "chroma_v3"
|
| 10 |
MANIFEST = ROOT_DIR / "MaterialMind" / "index" / "manifest.json"
|
| 11 |
|
| 12 |
-
DEFAULT_TOPK
|
| 13 |
-
|
|
|
|
| 14 |
|
| 15 |
def ensure_dirs():
|
| 16 |
DATA_DIR.mkdir(parents=True, exist_ok=True)
|
|
@@ -41,7 +45,6 @@ def embed_texts(texts: List[str]) -> List[List[float]]:
|
|
| 41 |
|
| 42 |
# --- Loaders ---
|
| 43 |
def normalize_spaces(text: str) -> str:
|
| 44 |
-
import re
|
| 45 |
text = text.replace("\r", "\n")
|
| 46 |
text = re.sub(r"[ \t]+", " ", text)
|
| 47 |
text = re.sub(r"\n{3,}", "\n\n", text)
|
|
@@ -122,7 +125,7 @@ def get_collection(reset: bool = False):
|
|
| 122 |
if reset:
|
| 123 |
try: client.delete_collection("materialmind")
|
| 124 |
except Exception: pass
|
| 125 |
-
return client.get_or_create_collection(name="materialmind") # embeddings
|
| 126 |
|
| 127 |
def add_batch(col, ids, docs, metas):
|
| 128 |
embs = embed_texts(docs)
|
|
@@ -145,7 +148,6 @@ def build_index(batch_size=256) -> int:
|
|
| 145 |
|
| 146 |
# --- Incremental update with manifest ---
|
| 147 |
def file_sig(path: Path):
|
| 148 |
-
import hashlib
|
| 149 |
h = hashlib.sha1()
|
| 150 |
try:
|
| 151 |
with open(path, "rb") as f:
|
|
@@ -203,7 +205,6 @@ def update_index():
|
|
| 203 |
|
| 204 |
# --- Search ---
|
| 205 |
def search(query: str, k: int = DEFAULT_TOPK) -> List[Tuple[str, str]]:
|
| 206 |
-
import chromadb
|
| 207 |
col = get_collection(reset=False)
|
| 208 |
qvec = embed_texts([query])[0]
|
| 209 |
res = col.query(query_embeddings=[qvec], n_results=k, include=["documents", "metadatas"])
|
|
@@ -218,7 +219,7 @@ def search(query: str, k: int = DEFAULT_TOPK) -> List[Tuple[str, str]]:
|
|
| 218 |
# --- HF dataset bootstrap ---
|
| 219 |
def bootstrap_corpus_and_index():
|
| 220 |
"""
|
| 221 |
-
Download
|
| 222 |
then build or update the local vector index.
|
| 223 |
"""
|
| 224 |
ensure_dirs()
|
|
@@ -230,7 +231,6 @@ def bootstrap_corpus_and_index():
|
|
| 230 |
local_dir=DATA_DIR, local_dir_use_symlinks=False,
|
| 231 |
ignore_patterns=["*.ipynb", ".*", "__pycache__/*"]
|
| 232 |
)
|
| 233 |
-
# If index is empty, build fresh; otherwise update
|
| 234 |
if not any(INDEX_DIR.iterdir()):
|
| 235 |
n = build_index()
|
| 236 |
print(f"[BUILD] indexed {n} chunks")
|
|
@@ -238,3 +238,16 @@ def bootstrap_corpus_and_index():
|
|
| 238 |
update_index()
|
| 239 |
except Exception as e:
|
| 240 |
print(f"[WARN] dataset bootstrap failed: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
from pathlib import Path
|
| 4 |
from typing import Iterable, List, Tuple, Dict, Any
|
| 5 |
|
| 6 |
+
# Make Chroma quieter on Spaces
|
| 7 |
+
os.environ.setdefault("CHROMADB_DISABLE_TELEMETRY", "1")
|
| 8 |
+
|
| 9 |
# --- Paths relative to repo ---
|
| 10 |
ROOT_DIR = Path(__file__).parent.resolve()
|
| 11 |
DATA_DIR = ROOT_DIR / "MaterialMind" / "sources"
|
| 12 |
INDEX_DIR = ROOT_DIR / "MaterialMind" / "index" / "chroma_v3"
|
| 13 |
MANIFEST = ROOT_DIR / "MaterialMind" / "index" / "manifest.json"
|
| 14 |
|
| 15 |
+
DEFAULT_TOPK = int(os.getenv("DEFAULT_TOPK", "5"))
|
| 16 |
+
DEFAULT_MODEL = os.getenv("LLM_MODEL", "HuggingFaceH4/zephyr-7b-beta")
|
| 17 |
+
EMB_MODEL = os.getenv("EMB_MODEL", "BAAI/bge-small-en-v1.5")
|
| 18 |
|
| 19 |
def ensure_dirs():
|
| 20 |
DATA_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
| 45 |
|
| 46 |
# --- Loaders ---
|
| 47 |
def normalize_spaces(text: str) -> str:
|
|
|
|
| 48 |
text = text.replace("\r", "\n")
|
| 49 |
text = re.sub(r"[ \t]+", " ", text)
|
| 50 |
text = re.sub(r"\n{3,}", "\n\n", text)
|
|
|
|
| 125 |
if reset:
|
| 126 |
try: client.delete_collection("materialmind")
|
| 127 |
except Exception: pass
|
| 128 |
+
return client.get_or_create_collection(name="materialmind") # we provide embeddings manually
|
| 129 |
|
| 130 |
def add_batch(col, ids, docs, metas):
|
| 131 |
embs = embed_texts(docs)
|
|
|
|
| 148 |
|
| 149 |
# --- Incremental update with manifest ---
|
| 150 |
def file_sig(path: Path):
|
|
|
|
| 151 |
h = hashlib.sha1()
|
| 152 |
try:
|
| 153 |
with open(path, "rb") as f:
|
|
|
|
| 205 |
|
| 206 |
# --- Search ---
|
| 207 |
def search(query: str, k: int = DEFAULT_TOPK) -> List[Tuple[str, str]]:
|
|
|
|
| 208 |
col = get_collection(reset=False)
|
| 209 |
qvec = embed_texts([query])[0]
|
| 210 |
res = col.query(query_embeddings=[qvec], n_results=k, include=["documents", "metadatas"])
|
|
|
|
| 219 |
# --- HF dataset bootstrap ---
|
| 220 |
def bootstrap_corpus_and_index():
|
| 221 |
"""
|
| 222 |
+
Download dataset (default: Azizahalq/materialmind-corpus) into DATA_DIR,
|
| 223 |
then build or update the local vector index.
|
| 224 |
"""
|
| 225 |
ensure_dirs()
|
|
|
|
| 231 |
local_dir=DATA_DIR, local_dir_use_symlinks=False,
|
| 232 |
ignore_patterns=["*.ipynb", ".*", "__pycache__/*"]
|
| 233 |
)
|
|
|
|
| 234 |
if not any(INDEX_DIR.iterdir()):
|
| 235 |
n = build_index()
|
| 236 |
print(f"[BUILD] indexed {n} chunks")
|
|
|
|
| 238 |
update_index()
|
| 239 |
except Exception as e:
|
| 240 |
print(f"[WARN] dataset bootstrap failed: {e}")
|
| 241 |
+
|
| 242 |
+
def ensure_ready():
    """Make sure the corpus and vector index are usable before serving.

    Creates the expected directories, then applies two bootstrap rules:
    if the sources directory holds no files, pull the dataset and build
    the index from scratch; otherwise, if only the index directory is
    empty, rebuild the index from the sources already on disk.
    """
    ensure_dirs()
    have_sources = any(DATA_DIR.glob("*"))
    if not have_sources:
        print("[BOOTSTRAP] sources/ is empty → pulling dataset and indexing…")
        bootstrap_corpus_and_index()
        return
    if not any(INDEX_DIR.glob("*")):
        print("[BOOTSTRAP] index folder empty → building from existing sources/")
        build_index()
|