File size: 1,788 Bytes
6a02b16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import json
from pathlib import Path

def read_jsonl(path: str) -> list:
    """Read a JSON Lines file and return its records as a list.

    Blank and whitespace-only lines are skipped. The file is decoded as
    UTF-8; a leading byte-order mark is tolerated instead of breaking the
    parse of the first record.

    Args:
        path: Location of the JSONL file.

    Returns:
        The decoded JSON value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message is prefixed with the file path and the 1-based line
            number of the offending line.
    """
    p = Path(path)
    items = []
    # "utf-8-sig" decodes plain UTF-8 identically but drops a leading BOM,
    # which str.strip() does not remove and json.loads rejects.
    with p.open("r", encoding="utf-8-sig") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as err:
                # Same exception type as before, so existing handlers keep
                # working; only the message gains the file/line location.
                raise json.JSONDecodeError(
                    "{}:{}: {}".format(p, lineno, err.msg), err.doc, err.pos
                ) from err
            items.append(record)
    return items

def write_jsonl(path: str, items):
    """Serialize *items* to *path* as JSON Lines, one record per line.

    Missing parent directories are created first and any existing file at
    *path* is overwritten. Output is UTF-8 with non-ASCII characters written
    verbatim rather than as ASCII escape sequences.

    Args:
        path: Destination file.
        items: Iterable of JSON-serializable values.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in items
        )

def _clause_id(record):
    """Return the first usable identifier of *record*, or ``None``.

    Keys are tried in the order ``id``, ``clause_id``, ``uid``. Empty values
    (``None``, ``""``, ``False``, empty containers) count as missing and the
    next key is tried, but a numeric zero is a legitimate identifier.
    """
    for key in ("id", "clause_id", "uid"):
        value = record.get(key)
        is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
        if value or is_number:
            return value
    return None


def load_clauses(path: str):
    """Load clauses from a JSONL file and split them into ``ru`` and ``kz`` lists.

    For every record the identifier is taken from ``id``, ``clause_id`` or
    ``uid``; the ``ru`` text from ``text_ru``, ``ru`` or ``text``; and the
    ``kz`` text from ``text_kz`` or ``kz`` (first non-empty value wins).
    Records that are not JSON objects or have no identifier are skipped, and
    a language entry is emitted only when its text is a non-blank string.

    Args:
        path: Location of the JSONL file with clause records.

    Returns:
        A ``(ru, kz)`` tuple of lists. Each element is a dict with keys
        ``id`` (always a string), ``text`` (stripped), ``meta`` (every field
        of the record except the text fields) and ``lang`` (``"ru"`` or
        ``"kz"``).
    """
    text_keys = {"text", "text_ru", "text_kz", "ru", "kz"}
    ru = []
    kz = []
    for c in read_jsonl(path):
        # A JSONL line may legally hold a list or scalar; treat it like any
        # other unusable record instead of failing on ``.get``.
        if not isinstance(c, dict):
            continue
        cid = _clause_id(c)
        if cid is None:
            continue
        cid = str(cid)
        meta = {k: v for k, v in c.items() if k not in text_keys}
        t_ru = c.get("text_ru") or c.get("ru") or c.get("text")
        t_kz = c.get("text_kz") or c.get("kz")
        # Each entry gets its own copy of the metadata so that mutating one
        # language's entry cannot silently change the other's.
        if isinstance(t_ru, str) and t_ru.strip():
            ru.append({"id": cid, "text": t_ru.strip(), "meta": dict(meta), "lang": "ru"})
        if isinstance(t_kz, str) and t_kz.strip():
            kz.append({"id": cid, "text": t_kz.strip(), "meta": dict(meta), "lang": "kz"})
    return ru, kz

def load_pairs(path: str):
    """Load query/positive pairs from a JSONL file, keeping only valid ones.

    A record is kept when ``query`` and ``positive`` are non-blank strings,
    ``positive_id`` is present (not ``None``) and ``lang`` is ``"ru"`` or
    ``"kz"``. Everything else is dropped without error.

    Args:
        path: Location of the JSONL file with pair records.

    Returns:
        A list of dicts with keys ``query`` and ``positive`` (both stripped),
        ``positive_id`` (converted to a string) and ``lang``.
    """
    accepted_langs = {"ru", "kz"}
    result = []
    for record in read_jsonl(path):
        query = record.get("query")
        positive = record.get("positive")
        positive_id = record.get("positive_id")
        lang = record.get("lang")

        # Guard clauses, checked in the same order as the fields above.
        if not isinstance(query, str) or not query.strip():
            continue
        if not isinstance(positive, str) or not positive.strip():
            continue
        if positive_id is None:
            continue
        if lang not in accepted_langs:
            continue

        result.append(
            {
                "query": query.strip(),
                "positive": positive.strip(),
                "positive_id": str(positive_id),
                "lang": lang,
            }
        )
    return result