Spaces:

Azizahalq
/

MaterialMind

Sleeping

App Files Files Community

Azizahalq committed on Sep 14, 2025

Commit

76eee36

verified ·

1 Parent(s): 9221408

Create build_index_from_hf.py

Browse files

Files changed (1) hide show

build_index_from_hf.py +142 -0

build_index_from_hf.py ADDED Viewed

	@@ -0,0 +1,142 @@

+#!/usr/bin/env python3
+"""
+Rebuild Chroma index from a Hugging Face dataset using BGE-small (384-d) embeddings.
+- Dataset: Azizahalq/materialmind-corpus  (override with --repo)
+- Output:  MaterialMind/index/chroma_v3/<uuid>  (override with --out_dir / --uuid)
+- Collection: materialmind (override with --collection)
+"""
+import os, argparse, uuid, math
+from pathlib import Path
+from typing import Dict, List, Any, Iterable
+from datasets import load_dataset, concatenate_datasets
+from tqdm import tqdm
+# Embedding model name passed to both the fastembed and sentence-transformers
+# loaders below; the module docstring describes it as BGE-small with 384-d vectors.
+EMB_MODEL = "BAAI/bge-small-en-v1.5"
+def pick_text(row: Dict[str, Any]) -> str:
+    candidates = ["text","content","chunk","page_text","passage","body","abstract"]
+    for k in candidates:
+        if k in row and isinstance(row[k], str) and row[k].strip():
+            return row[k]
+    return " ".join([str(v) for v in row.values() if isinstance(v, str)])
+def chunk_text(text: str, max_chars: int = 900, overlap: int = 120) -> List[str]:
+    text = " ".join(text.split())
+    if len(text) <= max_chars:
+        return [text] if text else []
+    chunks, i = [], 0
+    while i < len(text):
+        j = min(len(text), i + max_chars)
+        cut = text.rfind(". ", i, j)
+        if cut == -1 or cut <= i + 200:
+            cut = j
+        chunk = text[i:cut].strip()
+        if chunk:
+            chunks.append(chunk)
+        i = max(cut - overlap, i + 1)
+    return chunks
+def l2norm(vec: List[float]) -> List[float]:
+    s = math.sqrt(sum(x*x for x in vec)) or 1.0
+    return [x/s for x in vec]
+def embed_bge_small(texts: List[str]) -> List[List[float]]:
+    try:
+        from fastembed import TextEmbedding
+        emb = TextEmbedding(model_name=EMB_MODEL)
+        return [l2norm(v) for v in emb.embed(texts)]
+    except Exception:
+        from sentence_transformers import SentenceTransformer
+        model = SentenceTransformer(EMB_MODEL)
+        arr = model.encode(texts, normalize_embeddings=True)
+        return [l2norm(v.tolist()) for v in arr]
+def batched(iterable, batch_size: int):
+    buf = []
+    for x in iterable:
+        buf.append(x)
+        if len(buf) >= batch_size:
+            yield buf
+            buf = []
+    if buf:
+        yield buf
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--repo", default="Azizahalq/materialmind-corpus")
+    ap.add_argument("--split", default="train", help="train/test/all")
+    ap.add_argument("--out_dir", default="MaterialMind/index/chroma_v3")
+    ap.add_argument("--uuid", default=str(uuid.uuid4())[:8])
+    ap.add_argument("--collection", default="materialmind")
+    ap.add_argument("--batch", type=int, default=64)
+    args = ap.parse_args()
+    out_root = Path(args.out_dir).resolve()
+    index_dir = (out_root / args.uuid).resolve()
+    index_dir.mkdir(parents=True, exist_ok=True)
+    print(f"[BUILD] Index path: {index_dir}")
+    # Load dataset
+    try:
+        if args.split == "all":
+            ds_map = load_dataset(args.repo)
+            data = concatenate_datasets(list(ds_map.values()))
+        else:
+            data = load_dataset(args.repo, split=args.split)
+    except Exception as e:
+        raise SystemExit(f"[BUILD] Failed to load dataset {args.repo}: {e}")
+    # Chroma
+    import chromadb
+    client = chromadb.PersistentClient(path=str(index_dir))
+    col = client.get_or_create_collection(
+        name=args.collection,
+        metadata={"hnsw:space": "cosine"},
+    )
+    docs, metas, ids = [], [], []
+    total_rows = len(data)
+    print(f"[BUILD] Rows in split '{args.split}': {total_rows}")
+    for ridx in tqdm(range(total_rows), desc="Chunking"):
+        row = data[ridx]
+        text = pick_text(row)
+        if not text:
+            continue
+        meta = {}
+        for key in ("source","path","file","url","title","page"):
+            if key in row:
+                meta[key] = row[key]
+        parts = chunk_text(text, max_chars=900, overlap=120)
+        for pidx, chunk in enumerate(parts):
+            docs.append(chunk)
+            metas.append(meta.copy())
+            ids.append(f"r{ridx}-p{pidx}")
+    if not docs:
+        raise SystemExit("[BUILD] No text to index. Check your dataset fields.")
+    added = 0
+    for bi in tqdm(list(batched(list(zip(ids, docs, metas)), args.batch)), desc="Embedding+Add"):
+        b_ids  = [b[0] for b in bi]
+        b_docs = [b[1] for b in bi]
+        b_meta = [b[2] for b in bi]
+        vecs = embed_bge_small(b_docs)
+        col.add(ids=b_ids, documents=b_docs, metadatas=b_meta, embeddings=vecs)
+        added += len(b_ids)
+    try:
+        count = col.count()
+    except Exception:
+        count = added
+    print(f"[BUILD] Done. Added {added} chunks. Collection count = {count}")
+    print(f"[BUILD] Set env vars for the app:")
+    print(f"  EMB_PROVIDER=hf")
+    print(f"  EMB_MODEL={EMB_MODEL}")
+    print(f"  INDEX_DIR=MaterialMind/index/chroma_v3/{args.uuid}")
+    print(f"  INDEX_COLLECTION={args.collection}")
+# Run the index build only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()