ModerRAS
/

AniFileBERT

Token Classification

filename-parsing

Eval Results (legacy)

Model card Files Files and versions

AniFileBERT / tools /repair_dataset_labels.py

ModerRAS's picture

Organize parser modules and tools

8c50d16 4 days ago

history blame contribute delete

4.02 kB

	"""Repair known weak-label mistakes in exported AnimeName JSONL datasets."""

	from __future__ import annotations

	import argparse
	import json
	from collections import Counter, defaultdict
	from datetime import datetime, timezone
	from pathlib import Path
	from typing import Dict, List

	from anifilebert.label_repairs import LabelRepair, repair_jsonl_item


	def parse_args() -> argparse.Namespace:
	    """Define the command-line options and parse the process arguments.

	    Returns:
	        Namespace carrying ``input``, ``output``, ``manifest_output``,
	        ``dry_run`` and ``example_limit``.
	    """
	    cli = argparse.ArgumentParser(description="Repair weak BIO labels in a JSONL dataset")
	    # One (flag, keyword-options) row per option; registration order is kept
	    # so the generated --help output lists them in the same sequence.
	    option_table = (
	        ("--input", {"required": True, "help": "Input JSONL"}),
	        ("--output", {"required": True, "help": "Output repaired JSONL"}),
	        ("--manifest-output", {"default": None, "help": "Optional repair manifest JSON"}),
	        ("--dry-run", {"action": "store_true", "help": "Scan only; do not write output JSONL"}),
	        ("--example-limit", {"type": int, "default": 40}),
	    )
	    for flag, options in option_table:
	        cli.add_argument(flag, **options)
	    return cli.parse_args()


	def repair_key(repair: LabelRepair) -> str:
	    """Return the ``kind:marker`` string used to group a repair in the tallies."""
	    return "{}:{}".format(repair.kind, repair.marker)


	def _load_json_line(text: str, source: Path, line_number: int) -> dict:
	    """Decode one JSONL record, prefixing decode errors with the file and line."""
	    try:
	        return json.loads(text)
	    except json.JSONDecodeError as exc:
	        # Re-raise the same exception type so existing handlers still match,
	        # but tell the user which line of which file is malformed.
	        raise json.JSONDecodeError(f"{source}:{line_number}: {exc.msg}", exc.doc, exc.pos) from exc


	def main() -> None:
	    """Repair every record of the input JSONL and write a repair manifest.

	    Each non-blank input line is decoded, passed through ``repair_jsonl_item``
	    and, unless ``--dry-run`` is set, written to the output JSONL. Repair
	    counts, per-marker counts, label counts and up to ``--example-limit``
	    examples per ``kind:marker`` key are collected into a manifest JSON, and a
	    short summary is printed to stdout.

	    The output is staged in a sibling ``<output>.tmp`` file and moved over the
	    real output only after the whole input has been processed. A missing
	    input or a malformed line therefore never truncates an existing output,
	    and passing the same path for ``--input`` and ``--output`` repairs the
	    file in place instead of wiping it before it is read.

	    Raises:
	        OSError: If the input cannot be read or the outputs cannot be written.
	        json.JSONDecodeError: If a non-blank input line is not valid JSON.
	    """
	    args = parse_args()
	    input_path = Path(args.input)
	    output_path = Path(args.output)
	    manifest_path = Path(args.manifest_output) if args.manifest_output else output_path.with_suffix(".manifest.json")

	    counts: Counter[str] = Counter()
	    marker_counts: Counter[str] = Counter()
	    examples: Dict[str, List[dict]] = defaultdict(list)
	    label_counts: Counter[str] = Counter()
	    row_count = 0
	    repaired_rows = 0

	    # In dry-run mode nothing is staged or written besides the manifest.
	    staging_path = None if args.dry_run else output_path.with_name(output_path.name + ".tmp")
	    output_handle = None
	    completed = False

	    try:
	        # Open the input first: if it is missing we fail before creating or
	        # touching anything on the output side.
	        with input_path.open("r", encoding="utf-8") as handle:
	            if staging_path is not None:
	                staging_path.parent.mkdir(parents=True, exist_ok=True)
	                output_handle = staging_path.open("w", encoding="utf-8")

	            for line_number, line in enumerate(handle, start=1):
	                line = line.strip()
	                if not line:
	                    continue
	                row_count += 1
	                item = _load_json_line(line, input_path, line_number)
	                repaired, repairs = repair_jsonl_item(item)
	                if repairs:
	                    repaired_rows += 1
	                for repair in repairs:
	                    key = repair_key(repair)
	                    counts[repair.kind] += 1
	                    marker_counts[key] += 1
	                    # Keep only a bounded sample of examples per key.
	                    if len(examples[key]) < args.example_limit:
	                        examples[key].append(
	                            {
	                                "file_id": item.get("file_id"),
	                                "filename": item.get("filename"),
	                                "marker": repair.marker,
	                                "value": repair.value,
	                                "span": [repair.start, repair.end],
	                            }
	                        )
	                label_counts.update(repaired.get("labels", []))
	                if output_handle is not None:
	                    output_handle.write(json.dumps(repaired, ensure_ascii=False, separators=(",", ":")) + "\n")
	        completed = True
	    finally:
	        if output_handle is not None:
	            output_handle.close()
	            if not completed:
	                # Discard the partial staging file we created; the previous
	                # output (if any) is left untouched.
	                try:
	                    staging_path.unlink()
	                except FileNotFoundError:
	                    pass

	    if staging_path is not None:
	        # Both files are closed here, so the swap also works when the output
	        # path is the input path.
	        staging_path.replace(output_path)

	    manifest = {
	        "created_at": datetime.now(timezone.utc).isoformat(),
	        "input": str(input_path),
	        "output": None if args.dry_run else str(output_path),
	        "dry_run": args.dry_run,
	        "row_count": row_count,
	        "repaired_rows": repaired_rows,
	        "repair_counts": dict(counts),
	        "marker_counts": dict(marker_counts),
	        "label_counts": dict(label_counts),
	        "examples": dict(examples),
	    }
	    manifest_path.parent.mkdir(parents=True, exist_ok=True)
	    manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8")
	    print(json.dumps({
	        "row_count": row_count,
	        "repaired_rows": repaired_rows,
	        "repair_counts": dict(counts),
	        "manifest": str(manifest_path),
	        "output": None if args.dry_run else str(output_path),
	    }, ensure_ascii=False, indent=2))


	# Script entry point: run the repair CLI only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	main()