lexir / src /data_io.py
irinaqqq's picture
Upload folder using huggingface_hub
6a02b16 verified
import json
from pathlib import Path
def read_jsonl(path: str):
    """Parse a JSON Lines file into a list of decoded objects.

    The file is read as UTF-8. Every line is stripped of surrounding
    whitespace; lines that are empty after stripping are ignored, and each
    remaining line is decoded with ``json.loads``.

    Args:
        path: Location of the JSONL file.

    Returns:
        A list holding one decoded value per non-blank line, in file order.
    """
    with Path(path).open("r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(text) for text in stripped_lines if text]
def write_jsonl(path: str, items):
    """Serialize an iterable of objects to a JSON Lines file.

    Missing parent directories are created first. The file is opened for
    writing as UTF-8 (replacing any existing content) and receives one JSON
    document per line; non-ASCII characters are written as-is rather than
    as ``\\uXXXX`` escapes.

    Args:
        path: Destination file path.
        items: Iterable of JSON-serializable objects.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in items
        )
def load_clauses(path: str):
    """Load clause records from a JSONL file and split them by language.

    For every record the identifier is the first of ``id``, ``clause_id``
    and ``uid`` whose value is neither ``None`` nor a blank string. The
    value is tested explicitly instead of with ``or`` so that a falsy but
    valid identifier such as ``0`` is kept rather than being replaced by a
    later key or dropped. Records without a usable identifier are skipped.

    Russian text is taken from ``text_ru``, then ``ru``, then ``text``;
    Kazakh text from ``text_kz``, then ``kz``. A record contributes an entry
    to a language list only when its text for that language is a non-blank
    string.

    Args:
        path: Location of the JSONL file with clause records.

    Returns:
        A ``(ru, kz)`` tuple of lists. Each entry is a dict with the keys
        ``id`` (identifier converted to ``str``), ``text`` (stripped text),
        ``meta`` (all record fields except the text fields) and ``lang``
        (``"ru"`` or ``"kz"``). When a record yields both a Russian and a
        Kazakh entry, the two entries reference the same ``meta`` dict.
    """
    id_keys = ("id", "clause_id", "uid")
    text_keys = {"text", "text_ru", "text_kz", "ru", "kz"}
    ru = []
    kz = []
    for c in read_jsonl(path):
        cid = None
        for key in id_keys:
            candidate = c.get(key)
            if candidate is None:
                continue
            if isinstance(candidate, str) and not candidate.strip():
                continue
            cid = candidate
            break
        if cid is None:
            continue

        # Built only for records that are kept; skipped records need no meta.
        meta = {k: v for k, v in c.items() if k not in text_keys}
        # Empty strings fall through to the next candidate key on purpose.
        t_ru = c.get("text_ru") or c.get("ru") or c.get("text")
        t_kz = c.get("text_kz") or c.get("kz")
        if isinstance(t_ru, str) and t_ru.strip():
            ru.append({"id": str(cid), "text": t_ru.strip(), "meta": meta, "lang": "ru"})
        if isinstance(t_kz, str) and t_kz.strip():
            kz.append({"id": str(cid), "text": t_kz.strip(), "meta": meta, "lang": "kz"})
    return ru, kz
def load_pairs(path: str):
    """Load query/positive training pairs from a JSONL file.

    A record is kept only when ``query`` and ``positive`` are both non-blank
    strings, ``positive_id`` is not ``None`` and ``lang`` is ``"ru"`` or
    ``"kz"``. All other records are silently skipped.

    Args:
        path: Location of the JSONL file with pair records.

    Returns:
        A list of dicts with the keys ``query`` and ``positive`` (both
        stripped), ``positive_id`` (converted to ``str``) and ``lang``.
    """
    allowed_langs = {"ru", "kz"}
    cleaned = []
    for record in read_jsonl(path):
        query = record.get("query")
        positive = record.get("positive")
        positive_id = record.get("positive_id")
        language = record.get("lang")

        if not isinstance(query, str) or not query.strip():
            continue
        if not isinstance(positive, str) or not positive.strip():
            continue
        if positive_id is None or language not in allowed_langs:
            continue

        cleaned.append(
            {
                "query": query.strip(),
                "positive": positive.strip(),
                "positive_id": str(positive_id),
                "lang": language,
            }
        )
    return cleaned