import json
from pathlib import Path

# Keys that carry clause text; every other key of a clause record is metadata.
_TEXT_KEYS = frozenset({"text", "text_ru", "text_kz", "ru", "kz"})
# Candidate keys, in priority order, for the clause identifier and its texts.
_ID_KEYS = ("id", "clause_id", "uid")
_RU_TEXT_KEYS = ("text_ru", "ru", "text")
_KZ_TEXT_KEYS = ("text_kz", "kz")
# Language tags accepted by load_pairs.
_PAIR_LANGS = frozenset({"ru", "kz"})


def read_jsonl(path: str) -> list:
    """Read a JSON Lines file and return the decoded values as a list.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped.

    Args:
        path: Location of the file to read.

    Returns:
        One decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # "utf-8-sig" decodes plain UTF-8 as well, but also drops a leading BOM,
    # which json.loads would otherwise reject on the first line.
    with Path(path).open("r", encoding="utf-8-sig") as f:
        return [json.loads(line) for line in map(str.strip, f) if line]


def write_jsonl(path: str, items) -> None:
    """Write *items* to *path* as JSON Lines, one JSON document per line.

    Missing parent directories are created and an existing file is
    overwritten. Non-ASCII characters are written as-is, not escaped.

    Args:
        path: Destination file.
        items: Iterable of JSON-serializable values.
    """
    p = Path(path)
    p.parent.mkdir(parents=True, exist_ok=True)
    with p.open("w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in items
        )


def _first_id(record: dict):
    """Return the first usable identifier among ``_ID_KEYS``, else ``None``."""
    for key in _ID_KEYS:
        value = record.get(key)
        if value is None:
            continue
        if isinstance(value, str) and not value.strip():
            continue
        # Falsy but meaningful ids such as 0 are deliberately accepted.
        return value
    return None


def _first_text(record: dict, keys):
    """Return the first non-blank string under *keys*, stripped, else ``None``."""
    for key in keys:
        value = record.get(key)
        if isinstance(value, str):
            stripped = value.strip()
            if stripped:
                return stripped
    return None


def load_clauses(path: str) -> tuple:
    """Load clause records from a JSONL file and split them by language tag.

    The identifier is the first usable value among ``id``, ``clause_id`` and
    ``uid``. The "ru" text is the first non-blank string among ``text_ru``,
    ``ru`` and ``text``; the "kz" text is the first among ``text_kz`` and
    ``kz``. Records that are not JSON objects, or that have no identifier,
    are skipped. A record contributes an entry only for the languages it
    has text for.

    Args:
        path: Location of the JSONL file.

    Returns:
        A ``(ru, kz)`` pair of lists. Each entry is a dict with keys ``id``
        (always a string), ``text`` (stripped), ``meta`` (all non-text keys
        of the record) and ``lang`` (``"ru"`` or ``"kz"``).
    """
    ru = []
    kz = []
    for record in read_jsonl(path):
        if not isinstance(record, dict):
            continue  # a bare list/number/string line cannot be a clause
        cid = _first_id(record)
        if cid is None:
            continue
        meta = {k: v for k, v in record.items() if k not in _TEXT_KEYS}
        for bucket, keys, lang in (
            (ru, _RU_TEXT_KEYS, "ru"),
            (kz, _KZ_TEXT_KEYS, "kz"),
        ):
            text = _first_text(record, keys)
            if text is None:
                continue
            # Each entry gets its own meta dict so that mutating one entry
            # does not silently change its counterpart in the other list.
            bucket.append(
                {"id": str(cid), "text": text, "meta": dict(meta), "lang": lang}
            )
    return ru, kz


def load_pairs(path: str) -> list:
    """Load query/positive training pairs from a JSONL file.

    A record is kept only if it is a JSON object whose ``query`` and
    ``positive`` are non-blank strings, whose ``positive_id`` is not null,
    and whose ``lang`` is ``"ru"`` or ``"kz"``. All other records are
    skipped.

    Args:
        path: Location of the JSONL file.

    Returns:
        A list of dicts with keys ``query`` and ``positive`` (stripped),
        ``positive_id`` (as a string) and ``lang``.
    """
    out = []
    for record in read_jsonl(path):
        if not isinstance(record, dict):
            continue
        query = record.get("query")
        positive = record.get("positive")
        pid = record.get("positive_id")
        lang = record.get("lang")
        if not (isinstance(query, str) and query.strip()):
            continue
        if not (isinstance(positive, str) and positive.strip()):
            continue
        if pid is None:
            continue
        # The isinstance check keeps an unhashable lang (e.g. a list) from
        # raising TypeError in the set membership test.
        if not (isinstance(lang, str) and lang in _PAIR_LANGS):
            continue
        out.append(
            {
                "query": query.strip(),
                "positive": positive.strip(),
                "positive_id": str(pid),
                "lang": lang,
            }
        )
    return out