Azizahalq committed on
Commit
64f1fa4
·
verified ·
1 Parent(s): 02c4d46

Create builder.py

Browse files
Files changed (1) hide show
  1. builder.py +159 -0
builder.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, uuid, shutil
2
+ from pathlib import Path
3
+ from huggingface_hub import snapshot_download, upload_folder, HfApi
4
+
5
# ---------- Config via secrets ----------
# Hugging Face access token; run() refuses to start when it is unset or empty.
HF_TOKEN = os.getenv("HF_TOKEN")
# Dataset repo holding the source documents (overridable via CORPUS_DS).
CORPUS_DS = os.getenv("CORPUS_DS", "Azizahalq/materialmind-corpus")
# Dataset repo receiving the built index (overridable via INDEX_DS).
INDEX_DS = os.getenv("INDEX_DS", "Azizahalq/materialmind-index")

# Local working tree, anchored next to this file.
ROOT = Path(__file__).parent.resolve()
MM_ROOT = ROOT / "MaterialMind"
SRC_DIR = MM_ROOT / "sources"
# Each build creates a fresh <uuid> subdirectory under this base.
INDEX_BASE = MM_ROOT / "index" / "chroma_v3"

# Embedding model name handed to FastEmbed / SentenceTransformer.
EMB_MODEL = "BAAI/bge-small-en-v1.5"
16
+
17
def log(*a):
    """Print the given values to stdout and flush immediately.

    Flushing on every call keeps progress lines from sitting in the stdout
    buffer while a long step is running.
    """
    print(*a, flush=True)
18
+
19
def ensure_dirs():
    """Create the corpus and index base directories if they are missing."""
    for directory in (SRC_DIR, INDEX_BASE):
        directory.mkdir(parents=True, exist_ok=True)
22
+
23
def download_corpus():
    """Download a snapshot of the ``CORPUS_DS`` dataset repo into ``SRC_DIR``.

    The token is passed explicitly (as the upload step does) rather than
    relying on the library to discover it from the environment, so private
    corpus repos work the same way as the index upload.
    """
    log("[Step] Downloading corpus dataset:", CORPUS_DS)
    snapshot_download(
        repo_id=CORPUS_DS,
        repo_type="dataset",
        local_dir=str(SRC_DIR),
        # NOTE(review): this flag is reported as deprecated by recent
        # huggingface_hub releases - confirm against the pinned version.
        local_dir_use_symlinks=False,
        token=HF_TOKEN,
    )
    log("[OK] Corpus ready at", SRC_DIR)
28
+
29
def build_index():
    """Embed every PDF under ``SRC_DIR`` into a fresh Chroma catalog.

    Each PDF is read page by page (``fitz`` first, ``pypdf`` as fallback).
    The page text is whitespace-normalized and split into overlapping chunks,
    and the chunks are embedded and added to the ``materialmind`` collection
    in batches of 256, with ``source`` (path relative to ``MM_ROOT``) and
    ``page`` (1-based) as metadata.

    Returns:
        Path: the catalog directory, ``INDEX_BASE / <uuid>``.
    """
    # Lazy embedder (FastEmbed -> ST)
    try:
        from fastembed import TextEmbedding
        embedder = TextEmbedding(model_name=EMB_MODEL)

        def embed(texts):
            # Return plain lists, matching the SentenceTransformer branch.
            # (FastEmbed presumably yields numpy arrays - confirm.)
            return [
                v.tolist() if hasattr(v, "tolist") else list(v)
                for v in embedder.embed(texts)
            ]

        log("[EMB] FastEmbed:", EMB_MODEL)
    except Exception as e:
        from sentence_transformers import SentenceTransformer
        model = SentenceTransformer(EMB_MODEL)

        def embed(texts):
            return model.encode(texts, normalize_embeddings=True).tolist()

        log("[EMB] ST fallback:", EMB_MODEL, e)

    # Readers
    import re
    # Compiled once; norm() runs for every page.
    spaces_re = re.compile(r"[ \t]+")
    blank_lines_re = re.compile(r"\n{3,}")

    def norm(s):
        """Normalize line endings, collapse space runs and extra blank lines."""
        # Handle CRLF first so a Windows line ending is not doubled into a
        # paragraph break.
        s = s.replace("\r\n", "\n").replace("\r", "\n")
        s = spaces_re.sub(" ", s)
        s = blank_lines_re.sub("\n\n", s)
        return s.strip()

    def from_pdf(path: Path):
        """Yield ``(text, page_number)`` for every page with extractable text."""
        any_text = False
        try:
            import fitz
            doc = fitz.open(str(path))
            try:
                for i, p in enumerate(doc):
                    t = norm(p.get_text("text"))
                    if t:
                        any_text = True
                        yield t, i + 1
            finally:
                # Close even if extraction fails or the generator is
                # abandoned early.
                doc.close()
        except ImportError:
            pass  # fitz is optional; pypdf below is the fallback
        except Exception as e:
            log("[WARN] fitz read fail:", path.name, e)
        if not any_text:
            try:
                from pypdf import PdfReader
                reader = PdfReader(str(path))
                for i, p in enumerate(reader.pages):
                    try:
                        raw = p.extract_text() or ""
                    except Exception:
                        raw = ""  # best effort: skip pages pypdf cannot decode
                    t = norm(raw)
                    if t:
                        any_text = True
                        yield t, i + 1
            except Exception as e:
                log("[WARN] pdf read fail:", path.name, e)
        if not any_text:
            log("[HINT] no extractable text:", path.name)

    def chunk(text, max_chars=1200, overlap=150):
        """Yield windows of at most ``max_chars`` sharing ``overlap`` chars."""
        if max_chars <= 0 or not 0 <= overlap < max_chars:
            # overlap >= max_chars would never advance the window.
            raise ValueError("need max_chars > 0 and 0 <= overlap < max_chars")
        n = len(text)
        if n <= max_chars:
            if n > 0:
                yield text
            return
        i = 0
        while i < n:
            j = min(i + max_chars, n)
            yield text[i:j]
            # Step back by the overlap unless the end of the text was reached.
            i = j - overlap if j < n else j

    # Build Chroma catalog under a fresh UUID directory
    cat_dir = INDEX_BASE / str(uuid.uuid4())
    cat_dir.mkdir(parents=True, exist_ok=True)
    log("[Step] Building Chroma catalog at:", cat_dir)

    import chromadb
    client = chromadb.PersistentClient(path=str(cat_dir))
    col = client.get_or_create_collection(name="materialmind")

    # iterate files
    batch_ids, batch_docs, batch_meta = [], [], []

    def flush():
        """Embed and store the pending batch, then empty the buffers."""
        if not batch_ids:
            return
        embs = embed(batch_docs)
        col.add(ids=batch_ids, documents=batch_docs, metadatas=batch_meta, embeddings=embs)
        batch_ids.clear()
        batch_docs.clear()
        batch_meta.clear()

    added = 0
    for f in SRC_DIR.rglob("*"):
        if not f.is_file():
            continue
        if f.suffix.lower() != ".pdf":
            continue
        rel = f.relative_to(MM_ROOT).as_posix()
        for page_text, page in from_pdf(f):
            for c in chunk(page_text):
                batch_ids.append(str(uuid.uuid4()))
                batch_docs.append(c)
                batch_meta.append({"source": rel, "page": page})
                if len(batch_ids) >= 256:
                    flush()
                added += 1
                if added % 200 == 0:
                    log(f" +{added} chunks...")

    flush()  # store whatever is left in the final partial batch
    log("[OK] Built. Total chunks ~", col.count())
    return cat_dir  # MaterialMind/index/chroma_v3/<uuid>
128
+
129
def upload_catalog(cat_dir: Path):
    """Upload a built catalog directory to the ``INDEX_DS`` dataset repo.

    Args:
        cat_dir: local catalog directory; its name becomes the last path
            component inside the repo (``index/chroma_v3/<name>``).
    """
    # Upload to dataset INDEX_DS under path: index/chroma_v3/<uuid>
    # (the app will snapshot_download INDEX_DS later)
    target_path_in_repo = f"index/chroma_v3/{cat_dir.name}"
    log("[Step] Uploading catalog to dataset:", INDEX_DS, "at", target_path_in_repo)
    upload_folder(
        repo_id=INDEX_DS,
        repo_type="dataset",
        path_in_repo=target_path_in_repo,
        folder_path=str(cat_dir),
        token=HF_TOKEN,
        allow_patterns=None,
        ignore_patterns=["**/__pycache__/**"],
    )
    log("[OK] Uploaded.")
    # Tell the operator which settings the consuming Space needs.
    log("NOTE: set Space secret INDEX_DS =", INDEX_DS)
    log(" optional INDEX_DIR = MaterialMind/index/chroma_v3/" + cat_dir.name)
147
+
148
def run():
    """Run the whole pipeline: download the corpus, build the index, upload it.

    Banners go through ``log`` like every other message, so they are flushed
    immediately.

    Raises:
        RuntimeError: if ``HF_TOKEN`` is unset or empty.
    """
    log("==== MaterialMind Index Builder ====")
    if not HF_TOKEN:
        raise RuntimeError("HF_TOKEN secret is required.")
    ensure_dirs()
    download_corpus()
    cat_dir = build_index()
    upload_catalog(cat_dir)
    log("==== Done. You can stop this Space. ====")
157
+
158
# Script entry point: run the builder only when this file is executed directly.
if __name__ == "__main__":
    run()