File size: 4,016 Bytes
e63569d
 
 
 
 
 
 
 
 
 
 
8c50d16
e63569d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8c50d16
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
"""Repair known weak-label mistakes in exported AnimeName JSONL datasets."""

from __future__ import annotations

import argparse
import json
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List

from anifilebert.label_repairs import LabelRepair, repair_jsonl_item


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the label-repair script.

    Returns:
        The namespace produced by ``argparse`` with the attributes ``input``,
        ``output``, ``manifest_output``, ``dry_run`` and ``example_limit``.
    """
    cli = argparse.ArgumentParser(description="Repair weak BIO labels in a JSONL dataset")

    # (flag, keyword arguments) pairs, registered in the order they appear in --help.
    option_table = (
        ("--input", {"required": True, "help": "Input JSONL"}),
        ("--output", {"required": True, "help": "Output repaired JSONL"}),
        ("--manifest-output", {"default": None, "help": "Optional repair manifest JSON"}),
        ("--dry-run", {"action": "store_true", "help": "Scan only; do not write output JSONL"}),
        ("--example-limit", {"type": int, "default": 40}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    return cli.parse_args()


def repair_key(repair: LabelRepair) -> str:
    """Return the ``"<kind>:<marker>"`` string identifying a repair.

    The result is used as the grouping key for per-marker counts and for the
    collected example lists.
    """
    kind, marker = repair.kind, repair.marker
    return "{}:{}".format(kind, marker)


def main() -> None:
    """Repair every row of the input JSONL and write the result plus a manifest.

    Each non-blank input line is parsed as JSON and passed to
    ``repair_jsonl_item``, which returns the repaired item and the list of
    repairs applied to it. Unless ``--dry-run`` is given, the repaired item is
    written as one compact JSON line to the output file. A manifest JSON with
    row counts, per-kind and per-marker repair counts, label counts and up to
    ``--example-limit`` examples per marker is always written, and a short
    summary is printed to stdout.

    Raises:
        SystemExit: if ``--output`` refers to the same existing file as
            ``--input`` (writing would truncate the input before it is read).
        json.JSONDecodeError: if an input line is not valid JSON; the message
            names the input file and the 1-based line number.
        OSError: if the input, output, or manifest cannot be opened or written.
    """
    # Function-scope import: only this entry point needs it.
    from contextlib import ExitStack

    args = parse_args()
    input_path = Path(args.input)
    output_path = Path(args.output)
    if args.manifest_output:
        manifest_path = Path(args.manifest_output)
    else:
        # Default: place the manifest next to the output file.
        manifest_path = output_path.with_suffix(".manifest.json")

    # Opening the output in "w" mode truncates it, so repairing a file onto
    # itself would erase the input before a single row is read.
    if (
        not args.dry_run
        and input_path.exists()
        and output_path.exists()
        and input_path.samefile(output_path)
    ):
        raise SystemExit(f"--output must differ from --input: {input_path}")

    counts: Counter[str] = Counter()  # repairs per repair kind
    marker_counts: Counter[str] = Counter()  # repairs per "kind:marker" key
    examples: Dict[str, List[dict]] = defaultdict(list)  # sample repairs per key
    label_counts: Counter[str] = Counter()  # labels of the repaired rows
    row_count = 0
    repaired_rows = 0

    with ExitStack() as stack:
        # Open the input first so a missing or unreadable input fails before
        # the output file is created or truncated.
        source = stack.enter_context(input_path.open("r", encoding="utf-8"))
        sink = None
        if not args.dry_run:
            output_path.parent.mkdir(parents=True, exist_ok=True)
            sink = stack.enter_context(output_path.open("w", encoding="utf-8"))

        for line_number, raw_line in enumerate(source, start=1):
            line = raw_line.strip()
            if not line:
                continue
            row_count += 1
            try:
                item = json.loads(line)
            except json.JSONDecodeError as exc:
                # Keep the exception type but say where the bad row is.
                raise json.JSONDecodeError(
                    f"{input_path}:{line_number}: {exc.msg}", exc.doc, exc.pos
                ) from exc

            repaired, repairs = repair_jsonl_item(item)
            if repairs:
                repaired_rows += 1
            for repair in repairs:
                key = repair_key(repair)
                counts[repair.kind] += 1
                marker_counts[key] += 1
                if len(examples[key]) < args.example_limit:
                    examples[key].append(
                        {
                            "file_id": item.get("file_id"),
                            "filename": item.get("filename"),
                            "marker": repair.marker,
                            "value": repair.value,
                            "span": [repair.start, repair.end],
                        }
                    )

            label_counts.update(repaired.get("labels", []))
            if sink is not None:
                sink.write(json.dumps(repaired, ensure_ascii=False, separators=(",", ":")) + "\n")

    reported_output = None if args.dry_run else str(output_path)
    manifest = {
        "created_at": datetime.now(timezone.utc).isoformat(),
        "input": str(input_path),
        "output": reported_output,
        "dry_run": args.dry_run,
        "row_count": row_count,
        "repaired_rows": repaired_rows,
        "repair_counts": dict(counts),
        "marker_counts": dict(marker_counts),
        "label_counts": dict(label_counts),
        "examples": dict(examples),
    }
    manifest_path.parent.mkdir(parents=True, exist_ok=True)
    manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8")

    summary = {
        "row_count": row_count,
        "repaired_rows": repaired_rows,
        "repair_counts": dict(counts),
        "manifest": str(manifest_path),
        "output": reported_output,
    }
    print(json.dumps(summary, ensure_ascii=False, indent=2))


# Run the repair only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()