File size: 1,943 Bytes
bbd5c83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65db8bb
bbd5c83
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# preprocess.py
import os
import pickle

import faiss
import numpy as np
from docx import Document
from sentence_transformers import SentenceTransformer

# 1. Read the text out of the source file
def read_docx(file_path):
    """Extract the non-empty paragraphs of a Word document.

    Args:
        file_path: Path to a .docx file.

    Returns:
        List of stripped paragraph strings; blank paragraphs are skipped.
    """
    document = Document(file_path)
    stripped = (para.text.strip() for para in document.paragraphs)
    return [text for text in stripped if text]

# 2. Split paragraphs into bounded-length passages
def split_into_passages(texts, max_length=500):
    """Split paragraph texts into whitespace-joined passages of at most
    *max_length* characters.

    A single word longer than the limit becomes its own passage, and
    passages never span paragraph boundaries: the current passage is
    flushed at the end of each input text.

    Args:
        texts: Iterable of paragraph strings.
        max_length: Soft upper bound on passage length in characters.

    Returns:
        List of non-empty passage strings.
    """
    passages = []
    current = ""
    for text in texts:
        for word in text.split():
            if not current:
                # Bug fix: the original prefixed a phantom leading space to
                # the first word, inflating the length check and emitting an
                # empty "" passage whenever the first word alone was within
                # one character of max_length.
                current = word
            elif len(current) + 1 + len(word) <= max_length:
                current += " " + word
            else:
                passages.append(current)
                current = word
        # Flush at the paragraph boundary so passages stay paragraph-local.
        if current:
            passages.append(current)
            current = ""
    return passages

# 3. Vectorisation
def embed_passages(passages, model_name='bert-base-multilingual-cased'):
    """Encode text passages into dense vectors.

    Generalized: the checkpoint is now a parameter (defaulting to the
    multilingual BERT model the pipeline originally hard-coded), so callers
    can swap in a different sentence-transformers model without editing
    this function.

    Args:
        passages: List of text fragments to embed.
        model_name: Name of the sentence-transformers model to load.

    Returns:
        Tuple of (embeddings as a numpy array, the loaded model).
    """
    model = SentenceTransformer(model_name)
    embeddings = model.encode(passages, convert_to_numpy=True)
    return embeddings, model

# 4. Build and persist the FAISS index and passages
def build_and_save_index(embeddings, passages, output_dir="vectorstore"):
    """Build a flat L2 FAISS index over *embeddings* and persist it
    alongside the passages.

    Args:
        embeddings: 2-D numpy array of shape (n_passages, dim); FAISS
            expects float32 — TODO confirm upstream encoder output dtype.
        passages: Sequence of passage strings aligned with embedding rows.
        output_dir: Directory for index.faiss / passages.npy; created on
            demand.
    """
    # Bug fix: the original crashed with FileNotFoundError when the
    # output directory did not already exist.
    os.makedirs(output_dir, exist_ok=True)

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    # Persist the index and the passages it indexes.
    faiss.write_index(index, f"{output_dir}/index.faiss")
    np.save(f"{output_dir}/passages.npy", passages)

    print("Индекс и фрагменты успешно сохранены.")

if __name__ == "__main__":
    # Offline preprocessing pipeline: read the lecture notes, chunk them
    # into passages, embed the passages, and persist a searchable index.
    source_doc = "дәрістер1-15.docx"
    paragraphs = read_docx(source_doc)
    chunks = split_into_passages(paragraphs)
    vectors, _model = embed_passages(chunks)
    build_and_save_index(vectors, chunks)