File size: 2,130 Bytes
6a02b16
 
 
 
87ce102
6a02b16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from pathlib import Path
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from data_io import read_jsonl

MODEL_PATH = Path("artifacts/models/finetuned_mpnet")
INDEX_DIR = Path("artifacts/indexes/finetuned")
TOP_K = 5

def detect_lang(s: str):
    s2 = s.lower()
    kz_chars = set("әғқңөұүһі")
    if any(ch in kz_chars for ch in s2):
        return "kz"
    return "ru"

def load_index(lang: str):
    index = faiss.read_index(str(INDEX_DIR / f"{lang}.faiss"))
    meta = read_jsonl(str(INDEX_DIR / f"{lang}_meta.jsonl"))
    meta_by_pos = {int(x["pos"]): x for x in meta}
    return index, meta_by_pos

def search(model, index, meta_by_pos, query):
    q = model.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
    scores, idxs = index.search(q, TOP_K)
    results = []
    for r in range(TOP_K):
        item = meta_by_pos.get(int(idxs[0, r]))
        if not item:
            continue
        results.append(item)
    return results

def print_results(query, results):
    print("\nЗапрос:")
    print(query)
    print("\nРелевантные нормы:\n")

    for i, item in enumerate(results, 1):
        print(f"{i}. {item.get('meta', '')}")
        print(item.get("text", ""))
        print("-" * 80)

def main():
    if not MODEL_PATH.exists():
        raise RuntimeError("finetuned model not found")

    model = SentenceTransformer(str(MODEL_PATH))

    ru_index, ru_meta = load_index("ru")
    kz_index, kz_meta = load_index("kz")

    print("LexIR demo (fine-tuned model)")
    print("Введите запрос (exit — выход)\n")

    while True:
        q = input(">>> ").strip()
        if not q:
            continue
        if q.lower() in {"exit", "quit"}:
            break

        lang = detect_lang(q)
        if lang == "kz":
            results = search(model, kz_index, kz_meta, q)
        else:
            results = search(model, ru_index, ru_meta, q)

        print_results(q, results)

if __name__ == "__main__":
    main()