|
|
|
|
|
import os
import pickle

import faiss
import numpy as np
from docx import Document
from sentence_transformers import SentenceTransformer
|
|
|
|
|
|
|
|
def read_docx(file_path):
    """Extract the text of every non-blank paragraph from a .docx file.

    Args:
        file_path: path to the Word document.

    Returns:
        list[str]: stripped paragraph texts, in document order, with
        empty/whitespace-only paragraphs dropped.
    """
    document = Document(file_path)
    stripped = (paragraph.text.strip() for paragraph in document.paragraphs)
    return [text for text in stripped if text]
|
|
|
|
|
|
|
|
def split_into_passages(texts, max_length=500):
    """Greedily pack whitespace-separated words into passages of at most
    ``max_length`` characters (words joined by single spaces).

    A passage is flushed when the next word would overflow the limit, and
    at the end of each input text, so no passage spans two source
    paragraphs.  A single word longer than ``max_length`` becomes its own
    over-long passage instead of producing an empty one (the original code
    could append ``""`` to the result in that case).

    Args:
        texts: iterable of strings (e.g. paragraphs from ``read_docx``).
        max_length: soft upper bound on passage length, in characters.

    Returns:
        list[str]: non-empty passage strings.
    """
    passages = []
    current = ""
    for text in texts:
        for word in text.split():
            # +1 accounts for the joining space.
            if len(current) + len(word) + 1 <= max_length:
                current += " " + word
            else:
                # Guard: never emit an empty passage (happens when the very
                # first word of a run already exceeds max_length).
                if current.strip():
                    passages.append(current.strip())
                current = word
        # Flush the remainder so passages do not cross paragraph borders.
        if current.strip():
            passages.append(current.strip())
        current = ""
    return passages
|
|
|
|
|
|
|
|
def embed_passages(passages, model_name='bert-base-multilingual-cased'):
    """Encode passages into dense vectors with a SentenceTransformer model.

    Backward-compatible generalization: the model name was hard-coded;
    it is now a keyword parameter with the original value as default.

    Args:
        passages: list of passage strings to embed.
        model_name: Hugging Face model identifier to load.

    Returns:
        tuple: (embeddings as a numpy array of shape (len(passages), dim),
        the loaded SentenceTransformer model).
    """
    # NOTE(review): loading the model on every call is expensive; callers
    # embedding repeatedly should reuse the returned model.
    model = SentenceTransformer(model_name)
    embeddings = model.encode(passages, convert_to_numpy=True)
    return embeddings, model
|
|
|
|
|
|
|
|
def build_and_save_index(embeddings, passages, output_dir="vectorstore"):
    """Build an exact-L2 FAISS index over *embeddings* and persist it,
    together with the raw passages, under *output_dir*.

    Fix: the original wrote to ``{output_dir}/...`` without creating the
    directory, crashing on a fresh checkout; it is now created up front.

    Args:
        embeddings: float32 numpy array of shape (n_passages, dim).
        passages: list of passage strings aligned with the embedding rows.
        output_dir: directory receiving ``index.faiss`` and ``passages.npy``.

    Side effects:
        Creates *output_dir* if missing, writes two files, prints a
        confirmation message.
    """
    os.makedirs(output_dir, exist_ok=True)

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    faiss.write_index(index, f"{output_dir}/index.faiss")
    # NOTE(review): saving a list of str yields an object array; reloading
    # requires np.load(..., allow_pickle=True) — confirm the loader does this.
    np.save(f"{output_dir}/passages.npy", passages)

    print("Индекс и фрагменты успешно сохранены.")
|
|
|
|
|
if __name__ == "__main__":
    # Pipeline: DOCX -> paragraphs -> fixed-size passages -> embeddings -> FAISS index.
    source_document = "дәрістер1-15.docx"
    paragraphs = read_docx(source_document)
    chunks = split_into_passages(paragraphs)
    vectors, _ = embed_passages(chunks)
    build_and_save_index(vectors, chunks)