| import json | |
| from pathlib import Path | |
def read_jsonl(path: str):
    """Read a JSON Lines file and return its records as a list.

    Every non-blank line is parsed with ``json.loads``; lines that are empty
    or contain only whitespace are skipped.

    Args:
        path: Location of the JSON Lines file.

    Returns:
        A list holding one decoded JSON value per non-blank line, in file
        order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a line is not valid JSON. The message is
            prefixed with the file path and the 1-based line number.
    """
    p = Path(path)
    items = []
    # "utf-8-sig" decodes plain UTF-8 unchanged but also drops a leading BOM,
    # which str.strip() does not remove and json.loads rejects outright.
    with p.open("r", encoding="utf-8-sig") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as exc:
                # Same exception type as before, but now it says which line
                # of which file is broken.
                raise json.JSONDecodeError(
                    f"{p}:{lineno}: {exc.msg}", exc.doc, exc.pos
                ) from exc
            items.append(record)
    return items
def write_jsonl(path: str, items):
    """Write *items* to *path* in JSON Lines format.

    Missing parent directories are created first. The file is opened in
    write mode as UTF-8, so any existing content is replaced. Each item is
    serialized with ``ensure_ascii=False`` and terminated by a newline.

    Args:
        path: Destination file path.
        items: Iterable of JSON-serializable values, one per output line.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w", encoding="utf-8") as handle:
        handle.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in items
        )
def load_clauses(path: str):
    """Load clause records from a JSON Lines file, split by language.

    For each record:

    * The id is the first of ``id``, ``clause_id``, ``uid`` whose value is
      neither ``None`` nor an empty string. Records without such an id are
      skipped.
    * The Russian text is the first truthy value among ``text_ru``, ``ru``,
      ``text``; the Kazakh text is the first truthy value among ``text_kz``,
      ``kz``.
    * A text is emitted only if it is a string that is non-empty after
      stripping whitespace.
    * ``meta`` holds every field of the record except the text fields.

    Args:
        path: Location of the JSON Lines file with clause records.

    Returns:
        A ``(ru, kz)`` tuple of lists. Each element is a dict with the keys
        ``id`` (as ``str``), ``text`` (stripped), ``meta`` and ``lang``.
    """
    text_keys = {"text", "text_ru", "text_kz", "ru", "kz"}
    ru = []
    kz = []
    for c in read_jsonl(path):
        # Explicit "missing" test instead of an `or` chain: with `or`, a
        # legitimate id of 0 is falsy and the whole record was dropped.
        cid = None
        for key in ("id", "clause_id", "uid"):
            candidate = c.get(key)
            if candidate is not None and candidate != "":
                cid = candidate
                break
        if cid is None:
            continue

        # NOTE: the same meta dict object is attached to both the ru and the
        # kz entry of a record.
        meta = {k: v for k, v in c.items() if k not in text_keys}
        t_ru = c.get("text_ru") or c.get("ru") or c.get("text")
        t_kz = c.get("text_kz") or c.get("kz")

        if isinstance(t_ru, str) and t_ru.strip():
            ru.append({"id": str(cid), "text": t_ru.strip(), "meta": meta, "lang": "ru"})
        if isinstance(t_kz, str) and t_kz.strip():
            kz.append({"id": str(cid), "text": t_kz.strip(), "meta": meta, "lang": "kz"})
    return ru, kz
def load_pairs(path: str):
    """Load query/positive training pairs from a JSON Lines file.

    A record is kept only when all of the following hold:

    * ``query`` and ``positive`` are strings that are non-empty after
      stripping whitespace,
    * ``positive_id`` is not ``None``,
    * ``lang`` is ``"ru"`` or ``"kz"``.

    Records failing any check are skipped.

    Args:
        path: Location of the JSON Lines file with pair records.

    Returns:
        A list of dicts with the keys ``query`` and ``positive`` (both
        stripped), ``positive_id`` (as ``str``) and ``lang``.
    """
    accepted = []
    for row in read_jsonl(path):
        query = row.get("query")
        positive = row.get("positive")
        positive_id = row.get("positive_id")
        language = row.get("lang")

        # Guard clauses, evaluated in the order query -> positive ->
        # positive_id -> lang.
        if not (isinstance(query, str) and query.strip()):
            continue
        if not (isinstance(positive, str) and positive.strip()):
            continue
        if positive_id is None:
            continue
        if language not in {"ru", "kz"}:
            continue

        accepted.append(
            {
                "query": query.strip(),
                "positive": positive.strip(),
                "positive_id": str(positive_id),
                "lang": language,
            }
        )
    return accepted