File size: 2,319 Bytes
be5f706
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
"""Create a shuffled JSONL training mix from multiple anime parser datasets."""

import argparse
import json
import random
from pathlib import Path
from typing import Iterable


def iter_jsonl(path: Path, limit: int | None = None) -> Iterable[dict]:
    count = 0
    with path.open("r", encoding="utf-8") as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            item = json.loads(line)
            yield {"tokens": item["tokens"], "labels": item["labels"]}
            count += 1
            if limit is not None and count >= limit:
                break


def main() -> None:
    """Build a shuffled training mix and a manifest describing it.

    Reads the synthetic and DMHY JSONL inputs (each optionally capped by its
    ``--*-limit`` flag), concatenates them with the synthetic records first,
    shuffles the combined list after seeding the global ``random`` state
    with ``--seed``, and writes one JSON object per line to ``--output``.
    A manifest with the input paths, per-source counts, total count and seed
    is written beside the output (output path with its suffix replaced by
    ``.manifest.json``) and echoed to stdout.
    """
    cli = argparse.ArgumentParser(description="Mix synthetic and weakly-labeled DMHY datasets")
    cli.add_argument("--synthetic", default="data/synthetic.jsonl")
    cli.add_argument("--dmhy", default="data/dmhy/dmhy_weak.jsonl")
    cli.add_argument("--output", default="data/dmhy/mixed_train.jsonl")
    cli.add_argument("--synthetic-limit", type=int, default=None)
    cli.add_argument("--dmhy-limit", type=int, default=None)
    cli.add_argument("--seed", type=int, default=42)
    opts = cli.parse_args()

    random.seed(opts.seed)

    # Synthetic records go in first; the per-source counts fall out of the
    # list length before and after appending the DMHY records.
    mixed = list(iter_jsonl(Path(opts.synthetic), opts.synthetic_limit))
    n_synthetic = len(mixed)
    mixed.extend(iter_jsonl(Path(opts.dmhy), opts.dmhy_limit))
    n_dmhy = len(mixed) - n_synthetic

    random.shuffle(mixed)

    destination = Path(opts.output)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open("w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(row, ensure_ascii=False) + "\n" for row in mixed
        )

    summary = {
        "synthetic": opts.synthetic,
        "dmhy": opts.dmhy,
        "output": opts.output,
        "synthetic_count": n_synthetic,
        "dmhy_count": n_dmhy,
        "total_count": len(mixed),
        "seed": opts.seed,
    }
    # Serialize once; the same text goes to the manifest file and to stdout.
    summary_text = json.dumps(summary, ensure_ascii=False, indent=2)
    manifest_path = destination.with_suffix(".manifest.json")
    manifest_path.write_text(summary_text, encoding="utf-8")
    print(summary_text)


# Run the mixer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()