lastsummerkape committed on
Commit
d6f9cdc
·
verified ·
1 Parent(s): 65e5098

Create build_index_from_txt.py

Browse files
Files changed (1) hide show
  1. build_index_from_txt.py +36 -0
build_index_from_txt.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob, pickle, pathlib, re, faiss, tiktoken, numpy as np
2
+ from sentence_transformers import SentenceTransformer
3
+
4
# Build a FAISS inner-product index from the plain-text documents in TXT_DIR.
#
# Each *.txt file is split into sections at "Статья ..." heading lines (or at
# blank-line paragraph breaks when the file has no such headings), every
# section is cut into line-aligned chunks of roughly CHUNK tokens, and each
# chunk is embedded with a sentence-transformers model.
# Outputs: vectorstore.faiss (the vectors) and docs.pkl (chunk metadata, in the
# same order as the vectors).

TXT_DIR = pathlib.Path("docs_txt")  # directory scanned for *.txt inputs
CHUNK = 512  # token count (cl100k_base) above which the line buffer is flushed
MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

# A heading line starting with "Статья" (case-insensitive) delimits sections.
# The pattern has no capture group, so re.split discards the heading line.
_ARTICLE_RE = re.compile(r"\n\s*Статья.+?\n", flags=re.I)

enc = tiktoken.get_encoding("cl100k_base")
model = SentenceTransformer(MODEL)


def _split_sections(text):
    """Split *text* at article headings, or at blank lines if none are found."""
    parts = _ARTICLE_RE.split(text)
    # re.split always returns at least one element, so an `or` fallback on its
    # result can never trigger; a single element means no heading matched.
    if len(parts) == 1:
        parts = text.split("\n\n")
    return parts


def _iter_chunks(part):
    """Yield line-aligned chunks of *part*, flushing once CHUNK is exceeded."""
    buf = []
    for line in part.splitlines():
        buf.append(line)
        joined = " ".join(buf)
        # The check runs after appending, so an emitted chunk is slightly
        # larger than CHUNK tokens (by at most the last line added).
        if len(enc.encode(joined)) > CHUNK:
            buf.clear()
            yield joined
    if buf:
        yield " ".join(buf)


docs = []

# Sort the paths: Path.glob yields files in an unspecified order, and the
# position of each vector in the index must be reproducible across runs.
for p in sorted(TXT_DIR.glob("*.txt")):
    text = p.read_text(encoding="utf-8", errors="ignore")
    for part in _split_sections(text):
        if not part.strip():
            continue
        for chunk in _iter_chunks(part):
            # Skip whitespace-only leftovers (e.g. trailing blank lines).
            if chunk.strip():
                docs.append({"src": p.name, "text": chunk})

print("Фрагментов:", len(docs))
if not docs:
    # Without this guard the failure would surface as an opaque error while
    # building the embedding matrix.
    raise SystemExit(f"В каталоге {TXT_DIR} не найдено ни одного фрагмента — индекс не создан")

# Encode all chunks in one call so the model can batch them internally.
vecs = model.encode([d["text"] for d in docs], normalize_embeddings=True)

# Embeddings are L2-normalised, so inner product equals cosine similarity.
index = faiss.IndexFlatIP(model.get_sentence_embedding_dimension())
index.add(np.asarray(vecs, dtype="float32"))
faiss.write_index(index, "vectorstore.faiss")

# Use a context manager so the pickle file is flushed and closed.
with open("docs.pkl", "wb") as fh:
    pickle.dump(docs, fh)

print("✓ Индекс готов — файлы vectorstore.faiss и docs.pkl созданы")