# materialmind-builder / builder.py
# Uploaded by Azizahalq ("Create builder.py", commit 64f1fa4).
# NOTE(review): the original first four lines were Hugging Face file-page
# chrome (not code) and would have been Python syntax errors; kept here as
# comments so no information is lost.
import os, uuid, shutil
from pathlib import Path
from huggingface_hub import snapshot_download, upload_folder, HfApi
# ---------- Config via secrets ----------
HF_TOKEN = os.getenv("HF_TOKEN") # MUST be set (Hub auth; run() refuses to start without it)
CORPUS_DS = os.getenv("CORPUS_DS", "Azizahalq/materialmind-corpus")  # dataset repo holding source PDFs
INDEX_DS = os.getenv("INDEX_DS", "Azizahalq/materialmind-index")  # dataset repo the built index is uploaded to
ROOT = Path(__file__).parent.resolve()  # directory containing this script
MM_ROOT = ROOT / "MaterialMind"  # local working tree for corpus + index
SRC_DIR = MM_ROOT / "sources"  # corpus snapshot is downloaded here
INDEX_BASE = MM_ROOT / "index" / "chroma_v3" # we'll create a <uuid> subdir here
EMB_MODEL = "BAAI/bge-small-en-v1.5"  # embedding model name used by both backends
def log(*a):
    """Print the given values to stdout, flushing immediately.

    Flushing matters in Space containers, where stdout is the only
    progress channel and default buffering would delay messages.
    """
    print(*a, flush=True)
def ensure_dirs():
    """Create the corpus and index directories (and parents) if missing."""
    for directory in (SRC_DIR, INDEX_BASE):
        directory.mkdir(parents=True, exist_ok=True)
def download_corpus():
    """Download the CORPUS_DS dataset snapshot from the Hub into SRC_DIR.

    Fix: pass HF_TOKEN explicitly so a *private* corpus dataset can be
    downloaded — the token is already required for the upload step, but
    the original call omitted it here.
    """
    log("[Step] Downloading corpus dataset:", CORPUS_DS)
    # NOTE(review): local_dir_use_symlinks is deprecated and ignored by
    # recent huggingface_hub (>=0.23, which always copies real files into
    # local_dir); kept for older versions where it still forces copies.
    snapshot_download(repo_id=CORPUS_DS, repo_type="dataset",
                      token=HF_TOKEN,
                      local_dir=str(SRC_DIR), local_dir_use_symlinks=False)
    log("[OK] Corpus ready at", SRC_DIR)
def build_index():
    """Build a persistent Chroma vector index from all PDFs under SRC_DIR.

    Embeddings come from FastEmbed when it imports cleanly, otherwise from a
    sentence-transformers fallback.  Each PDF page is normalized, split into
    overlapping character chunks, embedded in batches of 256, and added to a
    collection named "materialmind" persisted under a fresh UUID directory
    below INDEX_BASE.

    Returns:
        Path: the new catalog directory (MaterialMind/index/chroma_v3/<uuid>).
    """
    # ---- Lazy embedder (FastEmbed -> sentence-transformers fallback) ----
    try:
        from fastembed import TextEmbedding
        embedder = TextEmbedding(model_name=EMB_MODEL)

        def embed(texts):
            return [v for v in embedder.embed(texts)]
        log("[EMB] FastEmbed:", EMB_MODEL)
    except Exception as e:
        from sentence_transformers import SentenceTransformer
        model = SentenceTransformer(EMB_MODEL)

        def embed(texts):
            return model.encode(texts, normalize_embeddings=True).tolist()
        log("[EMB] ST fallback:", EMB_MODEL, e)

    # ---- Readers ----
    import re

    def norm(s):
        """Normalize whitespace: CR->LF, squeeze space runs and blank lines."""
        s = s.replace("\r", "\n")
        s = re.sub(r"[ \t]+", " ", s)
        s = re.sub(r"\n{3,}", "\n\n", s)
        return s.strip()

    def from_pdf(path: Path):
        """Yield (normalized_page_text, 1-based page number) for each page.

        Tries PyMuPDF (fitz) first; if it is unavailable or extracted no
        text at all, retries the whole document with pypdf.  Logs a hint
        when neither backend found extractable text (e.g. scanned PDFs).
        """
        any_text = False
        try:
            import fitz
            doc = fitz.open(str(path))
            for i, p in enumerate(doc):
                t = p.get_text("text").strip()
                if t:
                    any_text = True
                    yield norm(t), i + 1
            doc.close()
        except Exception:
            pass  # best effort; fall through to the pypdf retry below
        if not any_text:
            try:
                from pypdf import PdfReader
                r = PdfReader(str(path))
                for i, p in enumerate(r.pages):
                    try:
                        raw = p.extract_text() or ""
                    except Exception:  # fix: was a bare except (caught SystemExit etc.)
                        raw = ""
                    t = norm(raw)
                    if t:
                        any_text = True
                        yield t, i + 1
            except Exception as e:
                log("[WARN] pdf read fail:", path.name, e)
        if not any_text:
            log("[HINT] no extractable text:", path.name)

    def chunk(text, max_chars=1200, overlap=150):
        """Yield character chunks of at most max_chars, overlapping by overlap."""
        n = len(text)
        if n <= max_chars:
            if n > 0:
                yield text
            return
        i = 0
        while i < n:
            j = min(i + max_chars, n)
            yield text[i:j]
            i = j - overlap if j < n else j

    # ---- Build Chroma catalog under a fresh UUID directory ----
    cat_dir = INDEX_BASE / str(uuid.uuid4())
    cat_dir.mkdir(parents=True, exist_ok=True)
    log("[Step] Building Chroma catalog at:", cat_dir)
    import chromadb
    client = chromadb.PersistentClient(path=str(cat_dir))
    col = client.get_or_create_collection(name="materialmind")

    # Batched writes: buffers are closed over by flush() and cleared in place.
    batch_ids, batch_docs, batch_meta = [], [], []

    def flush():
        """Embed the pending batch, add it to the collection, and clear it."""
        if not batch_ids:
            return
        embs = embed(batch_docs)
        col.add(ids=batch_ids, documents=batch_docs, metadatas=batch_meta, embeddings=embs)
        batch_ids.clear()
        batch_docs.clear()
        batch_meta.clear()

    added = 0
    for f in SRC_DIR.rglob("*"):
        if not f.is_file():
            continue
        if f.suffix.lower() != ".pdf":
            continue
        rel = f.relative_to(MM_ROOT).as_posix()
        for page_text, page in from_pdf(f):
            for c in chunk(page_text):
                batch_ids.append(str(uuid.uuid4()))
                batch_docs.append(c)
                batch_meta.append({"source": rel, "page": page})
                if len(batch_ids) >= 256:
                    flush()
                added += 1
                if added % 200 == 0:
                    log(f" +{added} chunks...")
    flush()  # write whatever remains in the last partial batch
    log("[OK] Built. Total chunks ~", col.count())
    return cat_dir  # MaterialMind/index/chroma_v3/<uuid>
def upload_catalog(cat_dir: Path):
    """Upload the built Chroma catalog folder to the INDEX_DS dataset repo.

    The folder lands at index/chroma_v3/<uuid> inside the dataset; the
    consuming app snapshot_downloads INDEX_DS later.

    Args:
        cat_dir: local catalog directory returned by build_index().
    """
    target_path_in_repo = f"index/chroma_v3/{cat_dir.name}"
    log("[Step] Uploading catalog to dataset:", INDEX_DS, "at", target_path_in_repo)
    # Fix: the original built an HfApi(token=...) instance and never used it;
    # the module-level upload_folder with an explicit token does the work.
    # allow_patterns=None was also dropped — it is the default.
    upload_folder(
        repo_id=INDEX_DS,
        repo_type="dataset",
        path_in_repo=target_path_in_repo,
        folder_path=str(cat_dir),
        token=HF_TOKEN,
        ignore_patterns=["**/__pycache__/**"],
    )
    log("[OK] Uploaded.")
    log("NOTE: set Space secret INDEX_DS =", INDEX_DS)
    log(" optional INDEX_DIR = MaterialMind/index/chroma_v3/" + cat_dir.name)
def run():
    """End-to-end driver: download corpus, build the index, upload it.

    Raises:
        RuntimeError: if the HF_TOKEN secret is not set.
    """
    print("==== MaterialMind Index Builder ====")
    if not HF_TOKEN:
        raise RuntimeError("HF_TOKEN secret is required.")
    ensure_dirs()
    download_corpus()
    upload_catalog(build_index())
    print("==== Done. You can stop this Space. ====")


if __name__ == "__main__":
    run()