"""Repair known weak-label mistakes in exported AnimeName JSONL datasets.""" from __future__ import annotations import argparse import json from collections import Counter, defaultdict from datetime import datetime, timezone from pathlib import Path from typing import Dict, List from anifilebert.label_repairs import LabelRepair, repair_jsonl_item def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Repair weak BIO labels in a JSONL dataset") parser.add_argument("--input", required=True, help="Input JSONL") parser.add_argument("--output", required=True, help="Output repaired JSONL") parser.add_argument("--manifest-output", default=None, help="Optional repair manifest JSON") parser.add_argument("--dry-run", action="store_true", help="Scan only; do not write output JSONL") parser.add_argument("--example-limit", type=int, default=40) return parser.parse_args() def repair_key(repair: LabelRepair) -> str: return f"{repair.kind}:{repair.marker}" def main() -> None: args = parse_args() input_path = Path(args.input) output_path = Path(args.output) manifest_path = Path(args.manifest_output) if args.manifest_output else output_path.with_suffix(".manifest.json") counts: Counter[str] = Counter() marker_counts: Counter[str] = Counter() examples: Dict[str, List[dict]] = defaultdict(list) label_counts: Counter[str] = Counter() row_count = 0 repaired_rows = 0 output_handle = None if not args.dry_run: output_path.parent.mkdir(parents=True, exist_ok=True) output_handle = output_path.open("w", encoding="utf-8") try: with input_path.open("r", encoding="utf-8") as handle: for line in handle: line = line.strip() if not line: continue row_count += 1 item = json.loads(line) repaired, repairs = repair_jsonl_item(item) if repairs: repaired_rows += 1 for repair in repairs: key = repair_key(repair) counts[repair.kind] += 1 marker_counts[key] += 1 if len(examples[key]) < args.example_limit: examples[key].append( { "file_id": item.get("file_id"), "filename": item.get("filename"), "marker": repair.marker, "value": repair.value, "span": [repair.start, repair.end], } ) label_counts.update(repaired.get("labels", [])) if output_handle is not None: output_handle.write(json.dumps(repaired, ensure_ascii=False, separators=(",", ":")) + "\n") finally: if output_handle is not None: output_handle.close() manifest = { "created_at": datetime.now(timezone.utc).isoformat(), "input": str(input_path), "output": None if args.dry_run else str(output_path), "dry_run": args.dry_run, "row_count": row_count, "repaired_rows": repaired_rows, "repair_counts": dict(counts), "marker_counts": dict(marker_counts), "label_counts": dict(label_counts), "examples": examples, } manifest_path.parent.mkdir(parents=True, exist_ok=True) manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8") print(json.dumps({ "row_count": row_count, "repaired_rows": repaired_rows, "repair_counts": dict(counts), "manifest": str(manifest_path), "output": None if args.dry_run else str(output_path), }, ensure_ascii=False, indent=2)) if __name__ == "__main__": main()