Spaces:
Paused
Paused
Create build_index_from_txt.py
Browse files- build_index_from_txt.py +36 -0
build_index_from_txt.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import glob, pickle, pathlib, re, faiss, tiktoken, numpy as np
|
| 2 |
+
from sentence_transformers import SentenceTransformer
|
| 3 |
+
|
| 4 |
+
TXT_DIR = pathlib.Path("docs_txt")
CHUNK = 512
MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

# A line beginning with "Статья" (Russian for "Article") marks a section
# boundary.  The heading line itself is consumed by the split and therefore
# does not appear in any chunk.
ARTICLE_HEADING = re.compile(r"\n\s*Статья.+?\n", flags=re.I)


def split_sections(text):
    """Split *text* into non-blank sections.

    The text is cut at every "Статья ..." heading line.  When no heading is
    present, it is split on blank lines instead.  (``re.split`` returns the
    whole text as a single element in that case, so an ``or`` fallback on
    its result can never trigger -- the one-element case is checked here.)

    Args:
        text: Full contents of one source document.

    Returns:
        A list of section strings; whitespace-only sections are dropped.
    """
    parts = ARTICLE_HEADING.split(text)
    if len(parts) == 1:
        # No article headings found: fall back to paragraph boundaries.
        parts = text.split("\n\n")
    return [part for part in parts if part.strip()]


def chunk_lines(lines, count_tokens, limit=CHUNK):
    """Greedily group *lines* into space-joined chunks.

    Lines are accumulated until the joined buffer exceeds *limit* tokens;
    the buffer is then emitted and reset.  A chunk is therefore emitted
    *after* it crosses the limit, so it may be somewhat larger than *limit*
    (and a single very long line becomes one oversized chunk).

    Args:
        lines: Iterable of text lines belonging to one section.
        count_tokens: Callable mapping a string to its token count.
        limit: Token threshold that triggers emitting the current buffer.

    Yields:
        Chunk strings; whitespace-only chunks are skipped.
    """
    buf = []
    for line in lines:
        buf.append(line)
        joined = " ".join(buf)
        if count_tokens(joined) > limit:
            buf.clear()
            if joined.strip():
                yield joined
    if buf:
        tail = " ".join(buf)
        # A tail made only of blank lines carries no content worth indexing.
        if tail.strip():
            yield tail


def main():
    """Build ``vectorstore.faiss`` and ``docs.pkl`` from ``docs_txt/*.txt``.

    Each text file is split into sections, the sections into token-bounded
    chunks, and all chunks are embedded and stored in an inner-product FAISS
    index.  ``docs.pkl`` holds the chunk metadata in the same order as the
    index rows.

    Raises:
        SystemExit: If no chunks were produced (nothing to index).
    """
    enc = tiktoken.get_encoding("cl100k_base")
    model = SentenceTransformer(MODEL)
    # NOTE(review): chunks are sized with the cl100k_base tokenizer, not the
    # embedding model's own tokenizer; the model may truncate inputs longer
    # than its maximum sequence length -- confirm CHUNK is appropriate.

    def count_tokens(s):
        # disallowed_special=() keeps literal special-token strings in the
        # documents (e.g. "<|endoftext|>") from raising; they are counted as
        # ordinary text.
        return len(enc.encode(s, disallowed_special=()))

    docs = []
    # sorted() makes the chunk order (and thus the index) reproducible.
    for p in sorted(TXT_DIR.glob("*.txt")):
        text = p.read_text(encoding="utf-8", errors="ignore")
        for part in split_sections(text):
            for chunk in chunk_lines(part.splitlines(), count_tokens):
                docs.append({"src": p.name, "text": chunk})

    print("Фрагментов:", len(docs))
    if not docs:
        # Stacking zero vectors would fail with an opaque ValueError.
        raise SystemExit(
            f"В каталоге {TXT_DIR} не найдено ни одного текстового фрагмента"
        )

    # Encode all chunks in one batched call instead of one call per chunk.
    vecs = model.encode(
        [d["text"] for d in docs], normalize_embeddings=True
    )

    index = faiss.IndexFlatIP(model.get_sentence_embedding_dimension())
    index.add(np.ascontiguousarray(vecs, dtype="float32"))
    faiss.write_index(index, "vectorstore.faiss")
    # Context manager guarantees the pickle is flushed and the file closed.
    with open("docs.pkl", "wb") as fh:
        pickle.dump(docs, fh)
    print("✓ Индекс готов — файлы vectorstore.faiss и docs.pkl созданы")


if __name__ == "__main__":
    main()
|