# kazdev-ai / preprocess.py — uploaded by Nurisslam
# Update preprocess.py (commit 65db8bb, verified)
# preprocess.py
import os
import pickle

import faiss
import numpy as np
from docx import Document
from sentence_transformers import SentenceTransformer
# 1. Read text from the file
def read_docx(file_path):
    """Return the non-empty paragraph texts of a .docx file, in order.

    Each paragraph is stripped of surrounding whitespace; paragraphs that
    are empty after stripping are skipped.
    """
    document = Document(file_path)
    return [
        paragraph.text.strip()
        for paragraph in document.paragraphs
        if paragraph.text.strip()
    ]
# 2. Split into paragraphs/fragments
def split_into_passages(texts, max_length=500):
    """Greedily pack whitespace-separated words into passages.

    Words from all entries of *texts* are joined with single spaces into
    passages of at most *max_length* characters. A single word longer than
    *max_length* becomes its own (overlong) passage.

    Fixes vs. the original:
    - no empty-string passage is emitted when an overlong word arrives
      while the current passage is empty;
    - the first word of a passage is no longer charged for a phantom
      leading space (off-by-one against max_length).
    """
    passages = []
    current = ""
    for text in texts:
        for word in text.split():
            # Length the passage would have if we appended this word.
            candidate = f"{current} {word}" if current else word
            if len(candidate) <= max_length:
                current = candidate
            else:
                if current:  # guard: never append an empty passage
                    passages.append(current)
                current = word
    if current:
        passages.append(current)
    return passages
# 3. Vectorization
def embed_passages(passages):
    """Encode *passages* into dense vectors with a multilingual BERT model.

    Returns a tuple of (embeddings as a NumPy array, the loaded model) so
    the caller can reuse the model for query-time encoding.
    """
    encoder = SentenceTransformer('bert-base-multilingual-cased')
    vectors = encoder.encode(passages, convert_to_numpy=True)
    return vectors, encoder
# 4. Save vectors and the FAISS index
def build_and_save_index(embeddings, passages, output_dir="vectorstore"):
    """Build a flat L2 FAISS index over *embeddings* and persist it.

    Writes ``index.faiss`` (the FAISS index) and ``passages.npy`` (the
    passage texts) into *output_dir*.

    Fix: the directory is created if missing — the original crashed with
    FileNotFoundError on a fresh checkout where ``vectorstore/`` did not
    exist yet.
    """
    os.makedirs(output_dir, exist_ok=True)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    # Persist the index and the passages side by side.
    faiss.write_index(index, f"{output_dir}/index.faiss")
    # NOTE(review): np.save on a list of strings produces an array that
    # np.load may require allow_pickle=True to read back — verify the loader.
    np.save(f"{output_dir}/passages.npy", passages)
    print("Индекс и фрагменты успешно сохранены.")
if __name__ == "__main__":
    # Full pipeline: lecture .docx -> text -> passages -> embeddings -> FAISS index.
    source_path = "дәрістер1-15.docx"
    paragraphs = read_docx(source_path)
    chunks = split_into_passages(paragraphs)
    vectors, _ = embed_passages(chunks)
    build_and_save_index(vectors, chunks)