File size: 5,425 Bytes
64f1fa4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import os, uuid, shutil
from pathlib import Path
from huggingface_hub import snapshot_download, upload_folder, HfApi

# ---------- Config via secrets ----------
HF_TOKEN  = os.getenv("HF_TOKEN")  # MUST be set
CORPUS_DS = os.getenv("CORPUS_DS", "Azizahalq/materialmind-corpus")
INDEX_DS  = os.getenv("INDEX_DS",  "Azizahalq/materialmind-index")

ROOT       = Path(__file__).parent.resolve()
MM_ROOT    = ROOT / "MaterialMind"
SRC_DIR    = MM_ROOT / "sources"
INDEX_BASE = MM_ROOT / "index" / "chroma_v3"  # we’ll create a <uuid> subdir here

EMB_MODEL  = "BAAI/bge-small-en-v1.5"

def log(*args):
    """Print *args* with an immediate flush so Space logs stream in real time."""
    print(*args, flush=True)

def ensure_dirs():
    """Create the corpus and index directories (and their parents) if absent."""
    for directory in (SRC_DIR, INDEX_BASE):
        directory.mkdir(parents=True, exist_ok=True)

def download_corpus():
    """Download a full snapshot of the corpus dataset into SRC_DIR.

    Fix: pass ``token=HF_TOKEN`` so private/gated corpus repos work —
    the script already requires HF_TOKEN, but the original call ran
    anonymously and would 401 on a non-public dataset.
    """
    log("[Step] Downloading corpus dataset:", CORPUS_DS)
    snapshot_download(
        repo_id=CORPUS_DS,
        repo_type="dataset",
        local_dir=str(SRC_DIR),
        local_dir_use_symlinks=False,  # real files, not symlinks into the HF cache
        token=HF_TOKEN,
    )
    log("[OK] Corpus ready at", SRC_DIR)

def build_index():
    """Extract text from every PDF under SRC_DIR, chunk it, embed it, and
    persist a Chroma collection in a fresh UUID-named catalog directory.

    Returns:
        Path: the catalog directory, i.e. MaterialMind/index/chroma_v3/<uuid>.
    """
    # Lazy embedder: prefer FastEmbed (lightweight ONNX runtime); fall back
    # to sentence-transformers if fastembed is missing or fails to load.
    try:
        from fastembed import TextEmbedding
        embedder = TextEmbedding(model_name=EMB_MODEL)
        def embed(texts): return [v for v in embedder.embed(texts)]
        log("[EMB] FastEmbed:", EMB_MODEL)
    except Exception as e:
        from sentence_transformers import SentenceTransformer
        model = SentenceTransformer(EMB_MODEL)
        def embed(texts): return model.encode(texts, normalize_embeddings=True).tolist()
        log("[EMB] ST fallback:", EMB_MODEL, e)

    # ---------- Readers ----------
    import re

    def norm(s):
        """Normalize extracted text: CR -> LF, squeeze spaces/tabs,
        collapse runs of 3+ newlines to a single blank line."""
        s = s.replace("\r", "\n")
        s = re.sub(r"[ \t]+", " ", s)
        s = re.sub(r"\n{3,}", "\n\n", s)
        return s.strip()

    def from_pdf(path: Path):
        """Yield (normalized_text, 1-based page number) for each non-empty
        page. Tries PyMuPDF first; uses pypdf only if PyMuPDF yielded no
        text at all (missing library or scanned/image-only output)."""
        any_text = False
        try:
            import fitz  # PyMuPDF
            doc = fitz.open(str(path))
            for i, p in enumerate(doc):
                t = p.get_text("text").strip()
                if t:
                    any_text = True
                    yield norm(t), i + 1
            doc.close()
        except Exception:
            pass  # deliberate best-effort: fall through to pypdf below
        if not any_text:
            try:
                from pypdf import PdfReader
                r = PdfReader(str(path))
                for i, p in enumerate(r.pages):
                    try:
                        raw = p.extract_text() or ""
                    except Exception:  # fix: was a bare `except:` (caught SystemExit/KeyboardInterrupt)
                        raw = ""
                    t = norm(raw)
                    if t:
                        any_text = True
                        yield t, i + 1
            except Exception as e:
                log("[WARN] pdf read fail:", path.name, e)
        if not any_text:
            log("[HINT] no extractable text:", path.name)

    def chunk(text, max_chars=1200, overlap=150):
        """Yield slices of at most *max_chars*, consecutive slices sharing
        *overlap* trailing/leading characters. Requires overlap < max_chars
        (otherwise the index would never advance)."""
        n = len(text)
        if n <= max_chars:
            if n > 0:
                yield text
            return
        i = 0
        while i < n:
            j = min(i + max_chars, n)
            yield text[i:j]
            i = j - overlap if j < n else j

    # Build the Chroma catalog under a fresh UUID directory so each build is
    # isolated and an upload never clobbers a previous catalog.
    cat_dir = INDEX_BASE / str(uuid.uuid4())
    cat_dir.mkdir(parents=True, exist_ok=True)
    log("[Step] Building Chroma catalog at:", cat_dir)

    import chromadb
    client = chromadb.PersistentClient(path=str(cat_dir))
    col = client.get_or_create_collection(name="materialmind")

    # Batched insert: embed and add up to 256 chunks per round trip.
    batch_ids, batch_docs, batch_meta = [], [], []

    def flush():
        if not batch_ids:
            return
        embs = embed(batch_docs)
        col.add(ids=batch_ids, documents=batch_docs, metadatas=batch_meta, embeddings=embs)
        batch_ids.clear(); batch_docs.clear(); batch_meta.clear()

    added = 0
    for f in SRC_DIR.rglob("*"):
        if not f.is_file():
            continue
        if f.suffix.lower() != ".pdf":
            continue
        # Path stored in metadata is relative to MM_ROOT, POSIX-style,
        # so it matches what the Space resolves after snapshot_download.
        rel = f.relative_to(MM_ROOT).as_posix()
        for page_text, page in from_pdf(f):
            for c in chunk(page_text):
                batch_ids.append(str(uuid.uuid4()))
                batch_docs.append(c)
                batch_meta.append({"source": rel, "page": page})
                added += 1  # fix: count chunks (log message says "chunks"); original counted pages
                if len(batch_ids) >= 256:
                    flush()
                if added % 200 == 0:
                    log(f"  +{added} chunks...")

    flush()
    log("[OK] Built. Total chunks ~", col.count())
    return cat_dir  # MaterialMind/index/chroma_v3/<uuid>

def upload_catalog(cat_dir: Path):
    """Upload the built Chroma catalog to the INDEX_DS dataset repo under
    ``index/chroma_v3/<uuid>`` (the app snapshot_downloads INDEX_DS later).

    Args:
        cat_dir: local catalog directory produced by build_index().
    """
    target_path_in_repo = f"index/chroma_v3/{cat_dir.name}"
    log("[Step] Uploading catalog to dataset:", INDEX_DS, "at", target_path_in_repo)
    # Fix: the original constructed `HfApi(token=HF_TOKEN)` and never used it;
    # the module-level upload_folder() already takes the token directly.
    upload_folder(
        repo_id=INDEX_DS,
        repo_type="dataset",
        path_in_repo=target_path_in_repo,
        folder_path=str(cat_dir),
        token=HF_TOKEN,
        allow_patterns=None,  # upload everything in the catalog dir
        ignore_patterns=["**/__pycache__/**"],
    )
    log("[OK] Uploaded.")
    log("NOTE: set Space secret INDEX_DS =", INDEX_DS)
    log("      optional INDEX_DIR = MaterialMind/index/chroma_v3/" + cat_dir.name)

def run():
    """Entry point: validate config, fetch the corpus, build and upload the index.

    Raises:
        RuntimeError: if the HF_TOKEN secret is not set.
    """
    # Consistency fix: use log() (flushed) like the rest of the file, so the
    # banner lines stream in order with the step logs in the Space console.
    log("==== MaterialMind Index Builder ====")
    if not HF_TOKEN:
        raise RuntimeError("HF_TOKEN secret is required.")
    ensure_dirs()
    download_corpus()
    cat_dir = build_index()
    upload_catalog(cat_dir)
    log("==== Done. You can stop this Space. ====")

if __name__ == "__main__":
    # Execute the full build pipeline when run as a script (e.g. as the
    # Space's startup command); importing this module does nothing.
    run()