Spaces:

Azizahalq
/

MaterialMind

Sleeping

App Files Community

Azizahalq committed on Sep 14, 2025

Commit

b384849

verified ·

1 Parent(s): f78275d

Update rag_mini.py

Browse files

Files changed (1) hide show

rag_mini.py +18 -11

rag_mini.py CHANGED Viewed

@@ -18,15 +18,17 @@ EMB_MODEL = "BAAI/bge-small-en-v1.5"
 def _init_embedder():
     global _EMB_FAST, _EMB_ST
-    if _EMB_FAST or _EMB_ST: return
     try:
         from fastembed import TextEmbedding
         _EMB_FAST = TextEmbedding(model_name=EMB_MODEL)
         print("[EMB] FastEmbed ready:", EMB_MODEL, flush=True)
     except Exception as e:
-        print("[EMB] FastEmbed unavailable -> ST:", e, flush=True)
         from sentence_transformers import SentenceTransformer
         _EMB_ST = SentenceTransformer(EMB_MODEL)
 def _embed(texts:List[str])->List[List[float]]:
     _init_embedder()
@@ -35,7 +37,8 @@ def _embed(texts:List[str])->List[List[float]]:
     return _EMB_ST.encode(texts, normalize_embeddings=True).tolist()
 def _has_catalog(dirpath:Path)->bool:
-    for f in ["chroma.sqlite3","chroma.sqlite","chroma-collections.parquet","index_metadata.pickle","data_level0.bin"]:
         if (dirpath/f).exists():
             return True
     return False
@@ -43,14 +46,16 @@ def _has_catalog(dirpath:Path)->bool:
 def _locate_local_index()->Path:
     # If user specified a precise directory, use it
     if INDEX_DIR_ENV:
-        return ROOT_DIR / INDEX_DIR_ENV
-    # default path where we’ll snapshot_download
-    base = MM_ROOT / "index" / "chroma_v3"
     # try direct
-    if _has_catalog(base): return base
     # try nested uuid
     hits = list(base.rglob("chroma.sqlite3"))
-    if hits: return hits[0].parent
     return base
 def ensure_ready():
@@ -67,7 +72,7 @@ def ensure_ready():
                               local_dir=str(MM_ROOT), local_dir_use_symlinks=False)
         except Exception as e:
             print("[RAG] dataset download failed:", e, flush=True)
-    # relocalize (after download)
     local = _locate_local_index()
     if not _has_catalog(local):
         print(f"[RAG] WARNING: No Chroma catalog found in: {local}", flush=True)
@@ -95,7 +100,8 @@ def search(query:str, k:int=DEFAULT_TOPK)->List[Tuple[str,str]]:
     try:
         col = _get_collection()
         qvec = _embed([query])[0]
-        res = col.query(query_embeddings=[qvec], n_results=int(k), include=["documents","metadatas"])
     except Exception as e:
         print("[RAG] query failed:", e, flush=True)
         return []
@@ -103,7 +109,8 @@ def search(query:str, k:int=DEFAULT_TOPK)->List[Tuple[str,str]]:
     metas = (res.get("metadatas") or [[]])[0]
     hits=[]
     for d, m in zip(docs, metas):
-        if not d: continue
         src = (m or {}).get("source") or (m or {}).get("path") or "unknown"
         page= (m or {}).get("page")
         cite = f"{src}" + (f":p.{page}" if page else "")

 def _init_embedder():
     global _EMB_FAST, _EMB_ST
+    if _EMB_FAST or _EMB_ST:
+        return
     try:
         from fastembed import TextEmbedding
         _EMB_FAST = TextEmbedding(model_name=EMB_MODEL)
         print("[EMB] FastEmbed ready:", EMB_MODEL, flush=True)
     except Exception as e:
+        print("[EMB] FastEmbed unavailable -> SentenceTransformers:", e, flush=True)
         from sentence_transformers import SentenceTransformer
         _EMB_ST = SentenceTransformer(EMB_MODEL)
+        print("[EMB] ST ready:", EMB_MODEL, flush=True)
 def _embed(texts:List[str])->List[List[float]]:
     _init_embedder()
     return _EMB_ST.encode(texts, normalize_embeddings=True).tolist()
 def _has_catalog(dirpath:Path)->bool:
+    for f in ["chroma.sqlite3","chroma.sqlite","chroma-collections.parquet",
+              "index_metadata.pickle","data_level0.bin"]:
         if (dirpath/f).exists():
             return True
     return False
 def _locate_local_index()->Path:
     # If user specified a precise directory, use it
     if INDEX_DIR_ENV:
+        return (ROOT_DIR / INDEX_DIR_ENV).resolve()
+    # default base path where we’ll snapshot_download
+    base = (MM_ROOT / "index" / "chroma_v3").resolve()
     # try direct
+    if _has_catalog(base):
+        return base
     # try nested uuid
     hits = list(base.rglob("chroma.sqlite3"))
+    if hits:
+        return hits[0].parent
     return base
 def ensure_ready():
                               local_dir=str(MM_ROOT), local_dir_use_symlinks=False)
         except Exception as e:
             print("[RAG] dataset download failed:", e, flush=True)
+    # re-locate (after download)
     local = _locate_local_index()
     if not _has_catalog(local):
         print(f"[RAG] WARNING: No Chroma catalog found in: {local}", flush=True)
     try:
         col = _get_collection()
         qvec = _embed([query])[0]
+        res = col.query(query_embeddings=[qvec], n_results=int(k),
+                        include=["documents","metadatas"])
     except Exception as e:
         print("[RAG] query failed:", e, flush=True)
         return []
     metas = (res.get("metadatas") or [[]])[0]
     hits=[]
     for d, m in zip(docs, metas):
+        if not d:
+            continue
         src = (m or {}).get("source") or (m or {}).get("path") or "unknown"
         page= (m or {}).get("page")
         cite = f"{src}" + (f":p.{page}" if page else "")