Update rag_mini.py
rag_mini.py  CHANGED  (+56 -268)
@@ -1,292 +1,80 @@
-# rag_mini.py
-import os, re, json, uuid, hashlib
-from pathlib import Path
-from typing import List, Tuple
-
-# ---------------- Paths ----------------
-ROOT_DIR = Path(__file__).parent.resolve()
-DATA_ROOT = ROOT_DIR / "MaterialMind"
-DATA_DIR = DATA_ROOT / "sources"
-INDEX_DIR = DATA_ROOT / "index" / "chroma_v3"
-MANIFEST = DATA_ROOT / "index" / "manifest.json"
-
-DEFAULT_TOPK = 5
-EMB_MODEL = "BAAI/bge-small-en-v1.5"
-
-INDEX_DIR.mkdir(parents=True, exist_ok=True)
-
-def ensure_dirs():
-    DATA_DIR.mkdir(parents=True, exist_ok=True)
-    INDEX_DIR.mkdir(parents=True, exist_ok=True)
-
-# ---------------- Embeddings ----------------
-_EMBED_FAST = None
-_EMBED_ST = None
-
-def init_embedder():
-    global _EMBED_FAST, _EMBED_ST
-    if _EMBED_FAST is not None or _EMBED_ST is not None:
-        return
-    try:
-        from fastembed import TextEmbedding
-        _EMBED_FAST = TextEmbedding(model_name=EMB_MODEL)
-        print(f"[EMB] FastEmbed ready: {EMB_MODEL}")
-    except Exception as e:
-        print(f"[EMB] FastEmbed not available ({e}). Falling back to SentenceTransformers.")
-        from sentence_transformers import SentenceTransformer
-        _EMBED_ST = SentenceTransformer(EMB_MODEL)
-
-def embed_texts(texts: List[str]) -> List[List[float]]:
-    init_embedder()
-    if _EMBED_FAST is not None:
-        return [v for v in _EMBED_FAST.embed(texts)]
-    return _EMBED_ST.encode(texts, normalize_embeddings=True).tolist()
-
-# ---------------- Loaders ----------------
-def normalize_spaces(text: str) -> str:
-    text = text.replace("\r", "\n")
-    text = re.sub(r"[ \t]+", " ", text)
-    text = re.sub(r"\n{3,}", "\n\n", text)
-    return text.strip()
-
-def load_text_from_pdf(path: Path):
-    # try pymupdf
-    try:
-        import fitz
-        doc = fitz.open(str(path))
-        any_text = False
-        for i, page in enumerate(doc):
-            t = page.get_text("text").strip()
-            if t:
-                any_text = True
-                yield normalize_spaces(t), i + 1
-        doc.close()
-        if not any_text:
-            print(f"[HINT] scanned? {path.name}")
-        return
-    except Exception:
-        pass
-    # pypdf fallback
-    try:
-        from pypdf import PdfReader
-        r = PdfReader(str(path))
-        any_text = False
-        for i, p in enumerate(r.pages):
-            try:
-                raw = p.extract_text() or ""
-            except Exception:
-                raw = ""
-            t = normalize_spaces(raw)
-            if t:
-                any_text = True
-                yield t, i + 1
-        if not any_text:
-            print(f"[HINT] no extractable text: {path.name}")
-    except Exception as e:
-        print(f"[WARN] PDF read fail {path.name}: {e}")
-
-def load_text_from_md_txt(path: Path) -> str:
-    try:
-        raw = path.read_text(errors="ignore")
-    except Exception:
-        raw = ""
-    return normalize_spaces(raw)
-
-def chunk(text: str, max_chars=1200, overlap=150):
-    n = len(text)
-    if n <= max_chars:
-        if n > 0:
-            yield text
-        return
-    i = 0
-    while i < n:
-        j = min(i + max_chars, n)
-        yield text[i:j]
-        i = j - overlap if j < n else j
-
-def iter_documents():
-    for f in DATA_DIR.rglob("*"):
-        if not f.is_file():
-            continue
-        ext = f.suffix.lower()
-        rel = f.relative_to(ROOT_DIR).as_posix()
-        if ext == ".pdf":
-            any_text = False
-            for page_text, page in load_text_from_pdf(f):
-                any_text = True
-                for c in chunk(page_text):
-                    yield {"id": str(uuid.uuid4()), "text": c, "meta": {"source": rel, "page": page}}
-            if not any_text:
-                yield {"id": str(uuid.uuid4()), "text": f"[NO-TEXT] {f.name}", "meta": {"source": rel, "page": None}}
-        elif ext in (".md", ".txt"):
-            text = load_text_from_md_txt(f)
-            for c in chunk(text):
-                yield {"id": str(uuid.uuid4()), "text": c, "meta": {"source": rel, "page": None}}
-
-# ---------------- Chroma ----------------
-def _client():
-    import chromadb
-    return chromadb.PersistentClient(path=str(INDEX_DIR))
-
-def get_collection(reset: bool = False):
-    client = _client()
-    if reset:
-        try:
-            client.delete_collection("materialmind")
-        except Exception:
-            pass
-    # Important: name must match what you used when you built the index locally.
-    return client.get_or_create_collection(name="materialmind")
-
-def add_batch(col, ids, docs, metas):
-    embs = embed_texts(docs)
-    col.add(ids=ids, documents=docs, metadatas=metas, embeddings=embs)
-
-def build_index(batch_size=256) -> int:
-    ensure_dirs()
-    col = get_collection(reset=True)
-    ids, docs, metas, total = [], [], [], 0
-    for doc in iter_documents():
-        if doc["text"].startswith("[NO-TEXT]"):
-            print(f"[INFO] skip unextractable: {doc['meta']['source']}")
-            continue
-        ids.append(doc["id"]); docs.append(doc["text"]); metas.append(doc["meta"])
-        if len(ids) >= batch_size:
-            add_batch(col, ids, docs, metas)
-            total += len(ids); ids, docs, metas = [], [], []
-    if ids:
-        add_batch(col, ids, docs, metas); total += len(ids)
-    print(f"[BUILD] complete: {total} chunks")
-    return total
-
-# ---- Manifested incremental update (optional) ----
-def file_sig(path: Path):
-    h = hashlib.sha1()
-    try:
-        with open(path, "rb") as f:
-            for chunk in iter(lambda: f.read(1 << 20), b""):
-                h.update(chunk)
-    except Exception:
-        return None
-    stat = path.stat()
-    return {"sha1": h.hexdigest(), "size": stat.st_size, "mtime": int(stat.st_mtime)}
-
-def load_manifest():
-    if MANIFEST.exists():
-        try:
-            return json.loads(MANIFEST.read_text())
-        except Exception:
-            return {}
-    return {}
-
-def save_manifest(m): MANIFEST.write_text(json.dumps(m, indent=2))
-
-def update_index():
-    ensure_dirs()
-    col = get_collection(reset=False)
-    manifest = load_manifest()
-    current = {f.relative_to(ROOT_DIR).as_posix(): f for f in DATA_DIR.rglob("*") if f.is_file()}
-
-    # remove deleted
-    for src in list(manifest.keys()):
-        if src not in current:
-            col.delete(where={"source": src})
-            manifest.pop(src, None)
-            print(f"[DEL] {src}")
-
-    # add/refresh changed
-    for src, path in current.items():
-        sig = file_sig(path)
-        if sig is None:
-            continue
-        if manifest.get(src) == sig:
-            continue
-        col.delete(where={"source": src})
-        added = 0
-        ext = path.suffix.lower()
-        if ext == ".pdf":
-            any_text = False
-            for page_text, page in load_text_from_pdf(path):
-                any_text = True
-                for c in chunk(page_text):
-                    add_batch(col, [str(uuid.uuid4())], [c], [{"source": src, "page": page}])
-                    added += 1
-            if not any_text:
-                print(f"[INFO] skip unextractable: {src}")
-        elif ext in (".md", ".txt"):
-            text = load_text_from_md_txt(path)
-            for c in chunk(text):
-                add_batch(col, [str(uuid.uuid4())], [c], [{"source": src, "page": None}])
-                added += 1
-        manifest[src] = sig
-        print(f"[UPD] {src} (+{added})")
-    save_manifest(manifest)
-    print("[UPDATE] done.")
-
-# ---------------- Search ----------------
-def search(query: str, k: int = DEFAULT_TOPK) -> List[Tuple[str, str]]:
-    try:
-        col = get_collection(reset=False)
-    except Exception as e:
-        print(f"[WARN] collection unavailable: {e}")
-        return []
-    try:
-        qvec = embed_texts([query])[0]
-        res = col.query(query_embeddings=[qvec], n_results=k, include=["documents", "metadatas"])
-    except Exception as e:
-        print(f"[WARN] query failed: {e}")
-        return []
-    hits = []
-    for doc, meta in zip(res["documents"][0], res["metadatas"][0]):
-        src = (meta or {}).get("source", "unknown")
-        page = (meta or {}).get("page")
-        cite = f"{src}" + (f":p.{page}" if page else "")
-        hits.append((doc, cite))
-    return hits
-
-def index_stats() -> dict:
-    try:
-        col = get_collection(reset=False)
-        return {"count": col.count()}
-    except Exception as e:
-        return {"count": 0, "err": str(e)}
-
-def ensure_ready():
-    """
-    Use the prebuilt Chroma index if it exists.
-    If no index is present but 'sources/' exists, build from sources.
-    If neither exists and CORPUS_DS is set, pull that dataset and build.
-    """
-    ensure_dirs()
-
-    # 1) If index already has data, just use it
-    has_any_file = any(INDEX_DIR.glob("**/*"))
-    if has_any_file:
-        st = index_stats()
-        print(f"[READY] Using existing index at {INDEX_DIR} — count={st.get('count')}")
-        return
-
-    # 2) If you prefer to build from sources (optional)
-    if any(DATA_DIR.glob("**/*")):
-        print("[READY] No index detected. Building from local 'sources/'.")
-        build_index()
-        return
-
-    # 3) Optional: pull a dataset then build
-    repo_id = os.getenv("CORPUS_DS", "").strip()
-    if repo_id:
-        try:
-            from huggingface_hub import snapshot_download
-            print(f"[READY] Pulling dataset {repo_id} into {DATA_DIR} …")
-            snapshot_download(
-                repo_id=repo_id, repo_type="dataset",
-                local_dir=DATA_DIR, local_dir_use_symlinks=False,
-                ignore_patterns=["*.ipynb", ".*", "__pycache__/*"]
-            )
-            build_index()
-            return
-        except Exception as e:
-            print(f"[WARN] dataset bootstrap failed: {e}")
-
-    print("[READY] No index found; no sources; no dataset configured. Retrieval will be empty.")
+# rag_mini.py
+import os, json, textwrap
+from pathlib import Path
+from typing import List, Tuple
+
+# ---------- Paths ----------
+ROOT_DIR = Path(__file__).parent.resolve()
+DATA_ROOT = ROOT_DIR / "MaterialMind"  # repo root for app data
+DEFAULT_TOPK = 5
+
+# Allow override from env, else use repo path
+_DEFAULT_INDEX_DIR = DATA_ROOT / "index" / "chroma_v3"
+INDEX_DIR = Path(os.getenv("INDEX_DIR", str(_DEFAULT_INDEX_DIR))).resolve()
+
+def _has_catalog(path: Path) -> bool:
+    if not path.exists():
+        return False
+    # sqlite catalog (most common)
+    if (path / "chroma.sqlite3").exists():
+        return True
+    # parquet/duckdb variants (older/newer chroma)
+    for n in ["chroma-collections.parquet", "chroma-embeddings.parquet",
+              "chroma.sqlite", "duckdb", "collections.parquet"]:
+        if (path / n).exists():
+            return True
+    return False
+
+def ensure_ready() -> None:
+    """Check the persistent index exists & print a small stat to logs."""
+    INDEX_DIR.mkdir(parents=True, exist_ok=True)
+    if not _has_catalog(INDEX_DIR):
+        print(f"[RAG] WARNING: No Chroma catalog found in {INDEX_DIR}")
+        print("      Upload your prebuilt DB (e.g., chroma.sqlite3) into that folder.")
+    else:
+        try:
+            stats = index_stats()
+            print(f"[RAG] Index ready at {INDEX_DIR} — count={stats.get('count')}")
+        except Exception as e:
+            print(f"[RAG] Could not read index stats: {e}")
+
+# ---------- Retrieval ----------
+def get_collection():
+    import chromadb
+    client = chromadb.PersistentClient(path=str(INDEX_DIR))
+    # NOTE: name must match the collection you used when building the index
+    return client.get_or_create_collection(name="materialmind")
+
+def search(query: str, k: int = DEFAULT_TOPK) -> List[Tuple[str, str]]:
+    """
+    Returns [(snippet_text, 'source_path[:p.PAGE]'), ...]
+    """
+    try:
+        col = get_collection()
+    except Exception as e:
+        print(f"[RAG] get_collection() failed: {e}")
+        return []
+
+    try:
+        res = col.query(query_texts=[query], n_results=k, include=["documents", "metadatas"])
+    except Exception as e:
+        print(f"[RAG] query failed: {e}")
+        return []
+
+    docs = (res.get("documents") or [[]])[0]
+    metas = (res.get("metadatas") or [[]])[0]
+    hits = []
+    for d, m in zip(docs, metas):
+        src = (m or {}).get("source") or (m or {}).get("path") or "unknown"
+        page = (m or {}).get("page")
+        cite = f"{src}" + (f":p.{page}" if page else "")
+        if d:
+            hits.append((d, cite))
+    return hits
+
+def index_stats():
+    try:
+        col = get_collection()
+        return {"count": col.count()}
+    except Exception as e:
+        return {"count": 0, "err": str(e)}
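
For reference, hypothetical caller wiring (e.g. in an app.py next to this module; the mount path and query string are invented for illustration). Because INDEX_DIR is resolved from the environment when the module loads, any override must be set before rag_mini is imported:

# Hypothetical caller, not part of this commit.
import os
os.environ["INDEX_DIR"] = "/data/chroma_v3"   # illustrative mount path

import rag_mini  # reads INDEX_DIR from the env at import time

rag_mini.ensure_ready()        # warns in the logs if no Chroma catalog is found
print(rag_mini.index_stats())  # {'count': N} or {'count': 0, 'err': '...'}

for snippet, cite in rag_mini.search("grain boundary diffusion in ceramics", k=3):
    print(f"[{cite}] {snippet[:100]}")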