AniFileBERT / tools /repair_dataset_labels.py
ModerRAS's picture
Organize parser modules and tools
8c50d16
"""Repair known weak-label mistakes in exported AnimeName JSONL datasets."""
from __future__ import annotations
import argparse
import json
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List
from anifilebert.label_repairs import LabelRepair, repair_jsonl_item
def parse_args() -> argparse.Namespace:
    """Parse the command line of the dataset label repair tool.

    Recognised options:
        --input            path of the JSONL file to read (required)
        --output           path of the repaired JSONL file (required)
        --manifest-output  optional path of the repair manifest JSON
        --dry-run          flag; scan only, do not write the output JSONL
        --example-limit    integer, defaults to 40

    Returns:
        The populated ``argparse.Namespace`` built from ``sys.argv``.
    """
    cli = argparse.ArgumentParser(description="Repair weak BIO labels in a JSONL dataset")
    # Options are declared as data and registered in order, so the generated
    # --help output lists them in the same sequence as this table.
    option_table = (
        ("--input", {"required": True, "help": "Input JSONL"}),
        ("--output", {"required": True, "help": "Output repaired JSONL"}),
        ("--manifest-output", {"default": None, "help": "Optional repair manifest JSON"}),
        ("--dry-run", {"action": "store_true", "help": "Scan only; do not write output JSONL"}),
        ("--example-limit", {"type": int, "default": 40}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
def repair_key(repair: LabelRepair) -> str:
    """Return the ``"<kind>:<marker>"`` key used to group repairs of one sort.

    Args:
        repair: Object exposing ``kind`` and ``marker`` attributes.

    Returns:
        The two attribute values joined by a single colon.
    """
    kind, marker = repair.kind, repair.marker
    return "{}:{}".format(kind, marker)
def _is_same_file(first: Path, second: Path) -> bool:
    """Return True when both paths refer to the same file on disk."""
    try:
        return first.samefile(second)
    except OSError:
        # At least one of the paths does not exist yet, so there is nothing to
        # stat; fall back to comparing the fully resolved names.
        return first.resolve() == second.resolve()


def main() -> None:
    """Repair every row of the input JSONL and write the results.

    Each non-blank line of ``--input`` is parsed as JSON and handed to
    ``repair_jsonl_item``, which returns the repaired item together with the
    list of repairs applied to it.  Unless ``--dry-run`` is given, every
    repaired item is written as one compact JSON line to ``--output``.  A
    manifest JSON (counts per repair kind, per ``kind:marker`` key, per label,
    plus up to ``--example-limit`` examples per key) is always written, to
    ``--manifest-output`` or, by default, next to the output file with the
    suffix ``.manifest.json``.  A short JSON summary is printed to stdout.

    Raises:
        SystemExit: if the output or manifest path would overwrite the input,
            or the manifest would overwrite the output.
        json.JSONDecodeError: if a line is not valid JSON; the message names
            the input file and the 1-based line number.
        OSError: if a file cannot be opened or written.
    """
    args = parse_args()
    input_path = Path(args.input)
    output_path = Path(args.output)
    if args.manifest_output:
        manifest_path = Path(args.manifest_output)
    else:
        manifest_path = output_path.with_suffix(".manifest.json")

    # Opening the output with "w" truncates it immediately, so writing onto the
    # input would destroy the dataset before a single row has been read.
    if not args.dry_run and _is_same_file(input_path, output_path):
        raise SystemExit(f"--output must differ from --input ({input_path}); refusing to overwrite the input")
    if _is_same_file(input_path, manifest_path):
        raise SystemExit(f"manifest path must differ from --input ({input_path}); refusing to overwrite the input")
    if not args.dry_run and _is_same_file(output_path, manifest_path):
        raise SystemExit(f"manifest path must differ from --output ({output_path})")

    counts: Counter[str] = Counter()
    marker_counts: Counter[str] = Counter()
    examples: Dict[str, List[dict]] = defaultdict(list)
    label_counts: Counter[str] = Counter()
    row_count = 0
    repaired_rows = 0

    # Open the input first: a missing or unreadable input must fail before the
    # output file is created or truncated.
    with input_path.open("r", encoding="utf-8") as handle:
        output_handle = None
        if not args.dry_run:
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_handle = output_path.open("w", encoding="utf-8")
        try:
            for line_number, line in enumerate(handle, start=1):
                line = line.strip()
                if not line:
                    continue
                row_count += 1
                try:
                    item = json.loads(line)
                except json.JSONDecodeError as exc:
                    # Same exception type, but tell the user where the bad row is.
                    raise json.JSONDecodeError(
                        f"{exc.msg} [{input_path}, line {line_number}]", exc.doc, exc.pos
                    ) from exc
                repaired, repairs = repair_jsonl_item(item)
                if repairs:
                    repaired_rows += 1
                for repair in repairs:
                    key = repair_key(repair)
                    counts[repair.kind] += 1
                    marker_counts[key] += 1
                    # Keep only the first few examples per key so the manifest
                    # stays small even for very common repairs.
                    if len(examples[key]) < args.example_limit:
                        examples[key].append(
                            {
                                "file_id": item.get("file_id"),
                                "filename": item.get("filename"),
                                "marker": repair.marker,
                                "value": repair.value,
                                "span": [repair.start, repair.end],
                            }
                        )
                label_counts.update(repaired.get("labels", []))
                if output_handle is not None:
                    output_handle.write(
                        json.dumps(repaired, ensure_ascii=False, separators=(",", ":")) + "\n"
                    )
        finally:
            if output_handle is not None:
                output_handle.close()

    manifest = {
        "created_at": datetime.now(timezone.utc).isoformat(),
        "input": str(input_path),
        "output": None if args.dry_run else str(output_path),
        "dry_run": args.dry_run,
        "row_count": row_count,
        "repaired_rows": repaired_rows,
        "repair_counts": dict(counts),
        "marker_counts": dict(marker_counts),
        "label_counts": dict(label_counts),
        "examples": dict(examples),
    }
    manifest_path.parent.mkdir(parents=True, exist_ok=True)
    manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8")

    summary = {
        "row_count": row_count,
        "repaired_rows": repaired_rows,
        "repair_counts": dict(counts),
        "manifest": str(manifest_path),
        "output": None if args.dry_run else str(output_path),
    }
    print(json.dumps(summary, ensure_ascii=False, indent=2))
if __name__ == "__main__":
    # Run the repair only when executed as a script, never on import.
    main()