# lexir/src/demo_cli.py
from pathlib import Path
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from data_io import read_jsonl
# Directory of the fine-tuned sentence-embedding model; its existence is
# checked before loading (a missing model aborts the demo).
MODEL_PATH = Path("artifacts/models/finetuned_mpnet")
# Directory holding per-language FAISS indexes ({lang}.faiss) and their
# metadata sidecars ({lang}_meta.jsonl).
INDEX_DIR = Path("artifacts/indexes/finetuned")
# Number of nearest neighbours retrieved per query.
TOP_K = 5
def detect_lang(s: str):
    """Guess the language of *s*: "kz" (Kazakh) or "ru" (Russian).

    Heuristic: any Kazakh-specific Cyrillic letter marks the text as
    Kazakh; everything else (including empty input) defaults to Russian.
    """
    kazakh_only_letters = "әғқңөұүһі"
    for ch in s.lower():
        if ch in kazakh_only_letters:
            return "kz"
    return "ru"
def load_index(lang: str):
    """Load the FAISS index and metadata lookup for one language.

    Returns a ``(index, meta_by_pos)`` pair where ``meta_by_pos`` maps
    each record's integer "pos" field to the full metadata record.
    """
    index_path = INDEX_DIR / f"{lang}.faiss"
    meta_path = INDEX_DIR / f"{lang}_meta.jsonl"
    index = faiss.read_index(str(index_path))
    positions = {}
    for record in read_jsonl(str(meta_path)):
        positions[int(record["pos"])] = record
    return index, positions
def search(model, index, meta_by_pos, query, top_k=None):
    """Embed *query* and return metadata records for its nearest norms.

    Parameters
    ----------
    model : object with an ``encode`` method (SentenceTransformer-like).
    index : FAISS-style index — ``search(vectors, k)`` -> (scores, ids).
    meta_by_pos : dict mapping index position -> metadata record.
    query : str, the user query.
    top_k : int or None, number of neighbours to retrieve. Defaults to
        the module-level TOP_K (resolved at call time, so the original
        call signature keeps working unchanged).

    Returns
    -------
    list of metadata records, best match first. Positions absent from
    ``meta_by_pos`` — including FAISS's -1 padding for "no result" —
    are skipped, so the list may be shorter than ``top_k``.
    """
    if top_k is None:
        top_k = TOP_K
    q = model.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
    _scores, idxs = index.search(q, top_k)  # scores unused by the demo
    results = []
    # Iterate over the ids the index actually returned instead of assuming
    # exactly top_k hits; FAISS pads missing neighbours with -1, which
    # meta_by_pos.get() filters out below.
    for pos in idxs[0]:
        item = meta_by_pos.get(int(pos))
        if not item:
            continue
        results.append(item)
    return results
def print_results(query, results):
    """Pretty-print *query* and its retrieved norm records to stdout."""
    separator = "-" * 80
    print("\nЗапрос:")
    print(query)
    print("\nРелевантные нормы:\n")
    rank = 0
    for item in results:
        rank += 1
        print(f"{rank}. {item.get('meta', '')}")
        print(item.get("text", ""))
        print(separator)
def main():
    """Run the interactive retrieval demo.

    Reads queries from stdin in a loop, routes each query to the FAISS
    index matching its detected language, and prints the top matches.
    Type "exit" or "quit" (any case) to leave.

    Raises
    ------
    RuntimeError
        If the fine-tuned model directory is missing.
    """
    if not MODEL_PATH.exists():
        raise RuntimeError("finetuned model not found")
    model = SentenceTransformer(str(MODEL_PATH))
    # Per-language (index, metadata) pairs, keyed by detect_lang() output.
    backends = {
        "ru": load_index("ru"),
        "kz": load_index("kz"),
    }
    print("LexIR demo (fine-tuned model)")
    print("Введите запрос (exit — выход)\n")
    while True:
        query = input(">>> ").strip()
        if not query:
            continue
        if query.lower() in {"exit", "quit"}:
            break
        index, meta = backends[detect_lang(query)]
        print_results(query, search(model, index, meta, query))


if __name__ == "__main__":
    main()