# MaterialMind index builder — one-shot Hugging Face Space script
| import os, uuid, shutil | |
| from pathlib import Path | |
| from huggingface_hub import snapshot_download, upload_folder, HfApi | |
# ---------- Configuration (read from Space secrets / environment) ----------
HF_TOKEN = os.getenv("HF_TOKEN")  # required: write token for the index dataset
CORPUS_DS = os.getenv("CORPUS_DS", "Azizahalq/materialmind-corpus")  # source PDFs
INDEX_DS = os.getenv("INDEX_DS", "Azizahalq/materialmind-index")  # upload target

# Local working layout, rooted next to this script.
ROOT = Path(__file__).parent.resolve()
MM_ROOT = ROOT / "MaterialMind"
SRC_DIR = MM_ROOT / "sources"
INDEX_BASE = MM_ROOT / "index" / "chroma_v3"  # a fresh <uuid> subdir is created per build

EMB_MODEL = "BAAI/bge-small-en-v1.5"  # embedding model (FastEmbed, ST fallback)
def log(*a):
    """Print the given values immediately (flushed) so Space logs stream live."""
    print(*a, flush=True)
def ensure_dirs():
    """Create the local corpus and index directories (idempotent)."""
    for directory in (SRC_DIR, INDEX_BASE):
        directory.mkdir(parents=True, exist_ok=True)
def download_corpus():
    """Download the corpus dataset snapshot from the Hub into SRC_DIR.

    Files are materialized as real copies under ``local_dir`` (the current
    huggingface_hub default); the deprecated ``local_dir_use_symlinks``
    argument — ignored by recent releases — has been removed.
    """
    log("[Step] Downloading corpus dataset:", CORPUS_DS)
    snapshot_download(
        repo_id=CORPUS_DS,
        repo_type="dataset",
        local_dir=str(SRC_DIR),
    )
    log("[OK] Corpus ready at", SRC_DIR)
def build_index():
    """Embed every PDF under SRC_DIR into a fresh Chroma catalog.

    Returns:
        Path: the catalog directory, MaterialMind/index/chroma_v3/<uuid>.

    Fixes vs. the original: the pypdf page loop used a bare ``except:``
    (which also swallows KeyboardInterrupt/SystemExit); the PyMuPDF document
    handle leaked if extraction raised mid-iteration; ``chunk`` could loop
    forever when ``overlap >= max_chars``.
    """
    # ---- Lazy embedder: prefer FastEmbed, fall back to sentence-transformers.
    try:
        from fastembed import TextEmbedding
        embedder = TextEmbedding(model_name=EMB_MODEL)

        def embed(texts):
            return list(embedder.embed(texts))

        log("[EMB] FastEmbed:", EMB_MODEL)
    except Exception as e:
        from sentence_transformers import SentenceTransformer
        model = SentenceTransformer(EMB_MODEL)

        def embed(texts):
            return model.encode(texts, normalize_embeddings=True).tolist()

        log("[EMB] ST fallback:", EMB_MODEL, e)

    # ---- Text readers / normalization helpers.
    import re

    def norm(s):
        """Collapse whitespace: CRs -> LFs, runs of blanks, 3+ newlines -> 2."""
        s = s.replace("\r", "\n")
        s = re.sub(r"[ \t]+", " ", s)
        s = re.sub(r"\n{3,}", "\n\n", s)
        return s.strip()

    def from_pdf(path: Path):
        """Yield (normalized_text, 1-based page) for each non-empty PDF page.

        Tries PyMuPDF first; if that produced no text, falls back to pypdf.
        """
        any_text = False
        try:
            import fitz
            doc = fitz.open(str(path))
            try:
                for i, p in enumerate(doc):
                    t = p.get_text("text").strip()
                    if t:
                        any_text = True
                        yield norm(t), i + 1
            finally:
                # Close even if extraction raises mid-iteration (was leaked).
                doc.close()
        except Exception:
            pass  # fall through to the pypdf path below
        if not any_text:
            try:
                from pypdf import PdfReader
                r = PdfReader(str(path))
                for i, p in enumerate(r.pages):
                    try:
                        raw = p.extract_text() or ""
                    except Exception:  # was a bare except: keep it narrow
                        raw = ""
                    t = norm(raw)
                    if t:
                        any_text = True
                        yield t, i + 1
            except Exception as e:
                log("[WARN] pdf read fail:", path.name, e)
        if not any_text:
            log("[HINT] no extractable text:", path.name)

    def chunk(text, max_chars=1200, overlap=150):
        """Yield slices of *text* up to max_chars long, overlapping by *overlap*."""
        n = len(text)
        if n <= max_chars:
            if n > 0:
                yield text
            return
        # Guarantee forward progress even if overlap >= max_chars.
        step = max(1, max_chars - overlap)
        i = 0
        while i < n:
            j = min(i + max_chars, n)
            yield text[i:j]
            i = i + step if j < n else j

    # ---- Build Chroma catalog under a fresh UUID directory.
    cat_dir = INDEX_BASE / str(uuid.uuid4())
    cat_dir.mkdir(parents=True, exist_ok=True)
    log("[Step] Building Chroma catalog at:", cat_dir)
    import chromadb
    client = chromadb.PersistentClient(path=str(cat_dir))
    col = client.get_or_create_collection(name="materialmind")

    # Buffered add: embed in bounded batches to cap memory use.
    batch_ids, batch_docs, batch_meta = [], [], []

    def flush():
        """Embed the pending batch, add it to the collection, reset buffers."""
        if not batch_ids:
            return
        embs = embed(batch_docs)
        col.add(ids=batch_ids, documents=batch_docs, metadatas=batch_meta, embeddings=embs)
        batch_ids.clear()
        batch_docs.clear()
        batch_meta.clear()

    added = 0
    for f in SRC_DIR.rglob("*"):
        if not f.is_file():
            continue
        if f.suffix.lower() != ".pdf":
            continue
        rel = f.relative_to(MM_ROOT).as_posix()  # stored in metadata as "source"
        for page_text, page in from_pdf(f):
            for c in chunk(page_text):
                batch_ids.append(str(uuid.uuid4()))
                batch_docs.append(c)
                batch_meta.append({"source": rel, "page": page})
                if len(batch_ids) >= 256:
                    flush()
                added += 1
                if added % 200 == 0:
                    log(f" +{added} chunks...")
    flush()  # final partial batch
    log("[OK] Built. Total chunks ~", col.count())
    return cat_dir  # MaterialMind/index/chroma_v3/<uuid>
def upload_catalog(cat_dir: Path):
    """Upload the built catalog to dataset INDEX_DS at index/chroma_v3/<uuid>.

    The app later fetches it via ``snapshot_download(INDEX_DS)``.

    Fix vs. the original: an ``HfApi(token=HF_TOKEN)`` client was created but
    never used (the module-level ``upload_folder`` was called instead) — the
    authenticated client is now used directly, so the token is passed once.
    """
    target_path_in_repo = f"index/chroma_v3/{cat_dir.name}"
    log("[Step] Uploading catalog to dataset:", INDEX_DS, "at", target_path_in_repo)
    api = HfApi(token=HF_TOKEN)
    api.upload_folder(
        repo_id=INDEX_DS,
        repo_type="dataset",
        path_in_repo=target_path_in_repo,
        folder_path=str(cat_dir),
        ignore_patterns=["**/__pycache__/**"],  # never ship bytecode caches
    )
    log("[OK] Uploaded.")
    log("NOTE: set Space secret INDEX_DS =", INDEX_DS)
    log(" optional INDEX_DIR = MaterialMind/index/chroma_v3/" + cat_dir.name)
def run():
    """Orchestrate the one-shot build: validate config, fetch corpus, index, upload."""
    print("==== MaterialMind Index Builder ====")
    if not HF_TOKEN:
        raise RuntimeError("HF_TOKEN secret is required.")
    ensure_dirs()
    download_corpus()
    upload_catalog(build_index())
    print("==== Done. You can stop this Space. ====")


if __name__ == "__main__":
    run()