# MaterialMind index builder — Hugging Face Space script.
# Downloads a PDF corpus dataset, builds a Chroma vector index, uploads it back.
import os, uuid, shutil
from pathlib import Path
from huggingface_hub import snapshot_download, upload_folder, HfApi
# ---------- Config via secrets ----------
HF_TOKEN = os.getenv("HF_TOKEN") # MUST be set
CORPUS_DS = os.getenv("CORPUS_DS", "Azizahalq/materialmind-corpus")
INDEX_DS = os.getenv("INDEX_DS", "Azizahalq/materialmind-index")
ROOT = Path(__file__).parent.resolve()
MM_ROOT = ROOT / "MaterialMind"
SRC_DIR = MM_ROOT / "sources"
INDEX_BASE = MM_ROOT / "index" / "chroma_v3" # we’ll create a <uuid> subdir here
EMB_MODEL = "BAAI/bge-small-en-v1.5"
def log(*a): print(*a, flush=True)
def ensure_dirs():
SRC_DIR.mkdir(parents=True, exist_ok=True)
INDEX_BASE.mkdir(parents=True, exist_ok=True)
def download_corpus():
log("[Step] Downloading corpus dataset:", CORPUS_DS)
snapshot_download(repo_id=CORPUS_DS, repo_type="dataset",
local_dir=str(SRC_DIR), local_dir_use_symlinks=False)
log("[OK] Corpus ready at", SRC_DIR)
def build_index():
# Lazy embedder (FastEmbed -> ST)
try:
from fastembed import TextEmbedding
embedder = TextEmbedding(model_name=EMB_MODEL)
def embed(texts): return [v for v in embedder.embed(texts)]
log("[EMB] FastEmbed:", EMB_MODEL)
except Exception as e:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer(EMB_MODEL)
def embed(texts): return model.encode(texts, normalize_embeddings=True).tolist()
log("[EMB] ST fallback:", EMB_MODEL, e)
# Readers
import re
def norm(s):
s = s.replace("\r","\n")
s = re.sub(r"[ \t]+"," ",s)
s = re.sub(r"\n{3,}","\n\n",s)
return s.strip()
def from_pdf(path:Path):
any_text=False
try:
import fitz
doc=fitz.open(str(path))
for i,p in enumerate(doc):
t=p.get_text("text").strip()
if t:
any_text=True
yield norm(t), i+1
doc.close()
except Exception:
pass
if not any_text:
try:
from pypdf import PdfReader
r=PdfReader(str(path))
for i,p in enumerate(r.pages):
try: raw=p.extract_text() or ""
except: raw=""
t=norm(raw)
if t:
any_text=True
yield t, i+1
except Exception as e:
log("[WARN] pdf read fail:", path.name, e)
if not any_text:
log("[HINT] no extractable text:", path.name)
def chunk(text, max_chars=1200, overlap=150):
n=len(text);
if n<=max_chars:
if n>0: yield text
return
i=0
while i<n:
j=min(i+max_chars,n)
yield text[i:j]
i = j-overlap if j<n else j
# Build Chroma catalog under a fresh UUID directory
cat_dir = INDEX_BASE / str(uuid.uuid4())
cat_dir.mkdir(parents=True, exist_ok=True)
log("[Step] Building Chroma catalog at:", cat_dir)
import chromadb
client = chromadb.PersistentClient(path=str(cat_dir))
col = client.get_or_create_collection(name="materialmind")
# iterate files
batch_ids, batch_docs, batch_meta = [], [], []
def flush():
if not batch_ids: return
embs = embed(batch_docs)
col.add(ids=batch_ids, documents=batch_docs, metadatas=batch_meta, embeddings=embs)
batch_ids.clear(); batch_docs.clear(); batch_meta.clear()
added = 0
for f in SRC_DIR.rglob("*"):
if not f.is_file():
continue
if f.suffix.lower() != ".pdf":
continue
rel = f.relative_to(MM_ROOT).as_posix()
for page_text, page in from_pdf(f):
for c in chunk(page_text):
batch_ids.append(str(uuid.uuid4()))
batch_docs.append(c)
batch_meta.append({"source": rel, "page": page})
if len(batch_ids) >= 256:
flush()
added += 1
if added % 200 == 0:
log(f" +{added} chunks...")
flush()
log("[OK] Built. Total chunks ~", col.count())
return cat_dir # MaterialMind/index/chroma_v3/<uuid>
def upload_catalog(cat_dir:Path):
# Upload to dataset INDEX_DS under path: index/chroma_v3/<uuid>
# (the app will snapshot_download INDEX_DS later)
target_path_in_repo = f"index/chroma_v3/{cat_dir.name}"
log("[Step] Uploading catalog to dataset:", INDEX_DS, "at", target_path_in_repo)
api = HfApi(token=HF_TOKEN)
upload_folder(
repo_id=INDEX_DS,
repo_type="dataset",
path_in_repo=target_path_in_repo,
folder_path=str(cat_dir),
token=HF_TOKEN,
allow_patterns=None,
ignore_patterns=["**/__pycache__/**"],
)
log("[OK] Uploaded.")
log("NOTE: set Space secret INDEX_DS =", INDEX_DS)
log(" optional INDEX_DIR = MaterialMind/index/chroma_v3/" + cat_dir.name)
def run():
print("==== MaterialMind Index Builder ====")
if not HF_TOKEN:
raise RuntimeError("HF_TOKEN secret is required.")
ensure_dirs()
download_corpus()
cat_dir = build_index()
upload_catalog(cat_dir)
print("==== Done. You can stop this Space. ====")
if __name__ == "__main__":
run()