# preprocess.py
"""Build a FAISS similarity index from the text of a .docx lecture file.

Pipeline: read paragraphs -> greedily pack words into ~fixed-size passages
-> embed each passage with a SentenceTransformer model -> persist the FAISS
index and the passage texts to disk for later retrieval.
"""

import os
import pickle  # NOTE(review): unused here; likely used by the loader script — keep

from docx import Document
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss


# 1. Read the text from the file
def read_docx(file_path):
    """Return the non-empty, whitespace-stripped paragraph texts of *file_path*.

    Args:
        file_path: Path to a .docx document.

    Returns:
        list[str]: One entry per non-empty paragraph, in document order.
    """
    doc = Document(file_path)
    return [para.text.strip() for para in doc.paragraphs if para.text.strip()]


# 2. Split into paragraph-bounded passages
def split_into_passages(texts, max_length=500):
    """Greedily pack words into passages of at most *max_length* characters.

    Passages never span paragraph boundaries: the accumulator is flushed at
    the end of each input text. A single word longer than *max_length* still
    becomes its own passage rather than being dropped.

    Args:
        texts: Iterable of paragraph strings (e.g. output of read_docx).
        max_length: Soft character limit per passage (default 500).

    Returns:
        list[str]: The packed passages.
    """
    passages = []
    current_passage = ""
    for text in texts:
        for word in text.split():
            # +1 accounts for the joining space added below.
            if len(current_passage) + len(word) + 1 <= max_length:
                current_passage += " " + word
            else:
                passages.append(current_passage.strip())
                current_passage = word
        # Flush at paragraph end so passages do not cross paragraphs.
        if current_passage:
            passages.append(current_passage.strip())
            current_passage = ""
    return passages


# 3. Vectorization
def embed_passages(passages):
    """Encode *passages* into dense vectors.

    NOTE(review): 'bert-base-multilingual-cased' is a raw HF checkpoint, not
    a sentence-transformers model; it works via mean pooling but a dedicated
    model (e.g. 'paraphrase-multilingual-MiniLM-L12-v2') would likely embed
    better — confirm before changing, as it alters the stored vectors.

    Returns:
        tuple[np.ndarray, SentenceTransformer]: (num_passages x dim) float
        embedding matrix and the loaded model.
    """
    model = SentenceTransformer('bert-base-multilingual-cased')
    embeddings = model.encode(passages, convert_to_numpy=True)
    return embeddings, model


# 4. Save the vectors and the FAISS index
def build_and_save_index(embeddings, passages, output_dir="vectorstore"):
    """Build an exact L2 FAISS index over *embeddings* and persist it.

    Writes ``<output_dir>/index.faiss`` and ``<output_dir>/passages.npy``,
    creating *output_dir* if it does not exist (the original crashed on a
    fresh checkout because the directory was never created).

    Args:
        embeddings: (n, dim) array-like of passage vectors.
        passages: Sequence of n passage strings, aligned with embeddings.
        output_dir: Destination directory (default "vectorstore").
    """
    os.makedirs(output_dir, exist_ok=True)
    # FAISS requires a C-contiguous float32 matrix.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    # Persist the index and the passages side by side.
    faiss.write_index(index, f"{output_dir}/index.faiss")
    np.save(f"{output_dir}/passages.npy", passages)
    print("Индекс и фрагменты успешно сохранены.")


if __name__ == "__main__":
    file_path = "дәрістер1-15.docx"
    raw_text = read_docx(file_path)
    passages = split_into_passages(raw_text)
    embeddings, _ = embed_passages(passages)
    build_and_save_index(embeddings, passages)