NeoAivara committed on
Commit
b4374d8
·
verified ·
1 Parent(s): feca629

Create build.py

Browse files
Files changed (1) hide show
  1. build.py +63 -0
build.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # build.py — Build FAISS index (CPU)
3
+
4
+ import json
5
+ import pickle
6
+ from pathlib import Path
7
+
8
+ import faiss
9
+ import numpy as np
10
+ from sentence_transformers import SentenceTransformer
11
+
12
# ---------- CONFIG ----------
# Every path below is anchored at the directory that contains this script.
ROOT = Path(__file__).parent
# Input corpus: JSON Lines file, one document object per line.
FLATTENED_JSONL = ROOT.joinpath("data", "processed", "flattened_docs.jsonl")
# Output artifacts: the serialized FAISS index and the pickled document list.
INDEX_FILE = ROOT.joinpath("law_index.faiss")
META_FILE = ROOT.joinpath("law_meta.pkl")
# Model name handed to SentenceTransformer to compute the embeddings.
EMB_MODEL = "paraphrase-multilingual-MiniLM-L12-v2"
# ----------------------------
19
+
20
def load_docs():
    """Read the flattened JSONL corpus into a list of parsed records.

    Whitespace-only lines are skipped; every other line is parsed with
    ``json.loads``. The number of loaded records is printed before returning.

    Returns:
        list: One parsed JSON value per non-blank line, in file order.

    Raises:
        FileNotFoundError: If ``FLATTENED_JSONL`` does not exist.
    """
    source = FLATTENED_JSONL
    if not source.exists():
        raise FileNotFoundError(f"{source} not found")

    with source.open("r", encoding="utf-8") as handle:
        records = [json.loads(raw) for raw in handle if raw.strip()]

    print(f"[i] Loaded {len(records)} documents")
    return records
33
+
34
def build():
    """Embed every document and write the FAISS index and its metadata.

    Steps: load the corpus with ``load_docs()``, encode each document's
    ``"text"`` field with the ``EMB_MODEL`` sentence-transformer, L2-normalize
    the vectors, add them to a flat inner-product index, then write the index
    to ``INDEX_FILE`` and pickle the document list to ``META_FILE``.

    Row ``i`` of the index corresponds to entry ``i`` of the pickled list.

    Raises:
        FileNotFoundError: Propagated from ``load_docs()`` when the corpus
            file is missing.
        ValueError: If the corpus contains no documents.
    """
    docs = load_docs()
    if not docs:
        # Fail before the (slow) model load: with no texts the embedding
        # array has no second dimension, and the code below would die with
        # an opaque IndexError instead of a message naming the real cause.
        raise ValueError(f"{FLATTENED_JSONL} contains no documents; nothing to index")

    # A missing "text" key and an explicit JSON null are both embedded as the
    # empty string, so the encoder never receives None.
    texts = [d.get("text") or "" for d in docs]

    print("[i] Loading embedder...")
    embedder = SentenceTransformer(EMB_MODEL)

    print("[i] Computing embeddings...")
    embs = embedder.encode(
        texts,
        convert_to_numpy=True,
        show_progress_bar=True,
    ).astype(np.float32)  # FAISS operates on float32 vectors

    # Unit-length vectors + inner-product index => scores are cosine similarity.
    faiss.normalize_L2(embs)

    dim = embs.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(embs)

    faiss.write_index(index, str(INDEX_FILE))
    with open(META_FILE, "wb") as f:
        pickle.dump(docs, f)

    print("[i] Saved index →", INDEX_FILE)
    print("[i] Saved metadata →", META_FILE)
61
+
62
# Run the index build only when executed as a script, not when imported.
if __name__ == "__main__":
    build()