Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")

# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
- Notebooks
- Google Colab
- Kaggle
| """Repair known weak-label mistakes in exported AnimeName JSONL datasets.""" | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| from collections import Counter, defaultdict | |
| from datetime import datetime, timezone | |
| from pathlib import Path | |
| from typing import Dict, List | |
| from anifilebert.label_repairs import LabelRepair, repair_jsonl_item | |
def parse_args(argv: List[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the label-repair script.

    Args:
        argv: Argument strings to parse. The default ``None`` makes argparse
            read ``sys.argv[1:]``, which is what a plain ``parse_args()``
            call has always done; passing a list allows programmatic use.

    Returns:
        Namespace with the attributes ``input``, ``output``,
        ``manifest_output``, ``dry_run`` and ``example_limit``.
    """
    parser = argparse.ArgumentParser(description="Repair weak BIO labels in a JSONL dataset")
    parser.add_argument("--input", required=True, help="Input JSONL")
    parser.add_argument("--output", required=True, help="Output repaired JSONL")
    parser.add_argument("--manifest-output", default=None, help="Optional repair manifest JSON")
    parser.add_argument("--dry-run", action="store_true", help="Scan only; do not write output JSONL")
    parser.add_argument("--example-limit", type=int, default=40)
    return parser.parse_args(argv)
def repair_key(repair: LabelRepair) -> str:
    """Return the ``"<kind>:<marker>"`` string built from a repair's fields."""
    kind = repair.kind
    marker = repair.marker
    return "{}:{}".format(kind, marker)
def main() -> None:
    """Repair every record of the input JSONL and write a repair manifest.

    The input is read line by line; blank lines are skipped. Each parsed
    record is passed to ``repair_jsonl_item`` and, unless ``--dry-run`` is
    set, the repaired record is written as one compact JSON line. A manifest
    (row counts, per-kind and per-marker repair counts, label counts and up
    to ``--example-limit`` examples per marker) is always written, and a
    short JSON summary is printed to stdout.

    Repaired rows are streamed to a sibling ``<output>.tmp`` file that is
    moved over the real output only after the whole input has been
    processed. This keeps the run safe when ``--input`` and ``--output`` name
    the same file, and a failed run never truncates or half-writes an
    existing output.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message names the input file and the 1-based line number.
        OSError: If the input cannot be read or an output cannot be written.
    """
    args = parse_args()
    input_path = Path(args.input)
    output_path = Path(args.output)
    if args.manifest_output:
        manifest_path = Path(args.manifest_output)
    else:
        manifest_path = output_path.with_suffix(".manifest.json")
    # Dry runs never touch the output location, so they need no temp file.
    temp_path = None if args.dry_run else output_path.with_name(output_path.name + ".tmp")

    counts: Counter[str] = Counter()
    marker_counts: Counter[str] = Counter()
    examples: Dict[str, List[dict]] = defaultdict(list)
    label_counts: Counter[str] = Counter()
    row_count = 0
    repaired_rows = 0

    output_handle = None
    finished = False
    try:
        # Open the input before creating anything, so a missing input fails
        # without leaving an empty output file behind.
        with input_path.open("r", encoding="utf-8") as handle:
            if temp_path is not None:
                temp_path.parent.mkdir(parents=True, exist_ok=True)
                output_handle = temp_path.open("w", encoding="utf-8")
            for line_number, line in enumerate(handle, start=1):
                line = line.strip()
                if not line:
                    continue
                row_count += 1
                try:
                    item = json.loads(line)
                except json.JSONDecodeError as exc:
                    # The decoder only knows the position inside this single
                    # record; add the file and line so the row can be found.
                    raise json.JSONDecodeError(
                        f"{input_path}, line {line_number}: {exc.msg}", exc.doc, exc.pos
                    ) from exc
                repaired, repairs = repair_jsonl_item(item)
                if repairs:
                    repaired_rows += 1
                for repair in repairs:
                    key = repair_key(repair)
                    counts[repair.kind] += 1
                    marker_counts[key] += 1
                    # Keep only the first `example_limit` examples per marker.
                    if len(examples[key]) < args.example_limit:
                        examples[key].append(
                            {
                                "file_id": item.get("file_id"),
                                "filename": item.get("filename"),
                                "marker": repair.marker,
                                "value": repair.value,
                                "span": [repair.start, repair.end],
                            }
                        )
                label_counts.update(repaired.get("labels", []))
                if output_handle is not None:
                    output_handle.write(json.dumps(repaired, ensure_ascii=False, separators=(",", ":")) + "\n")
        finished = True
    finally:
        if output_handle is not None:
            output_handle.close()
            if finished:
                # Publish the complete result in one step.
                temp_path.replace(output_path)
            else:
                # Drop the partial result; the previous output stays intact.
                try:
                    temp_path.unlink()
                except FileNotFoundError:
                    pass

    manifest = {
        "created_at": datetime.now(timezone.utc).isoformat(),
        "input": str(input_path),
        "output": None if args.dry_run else str(output_path),
        "dry_run": args.dry_run,
        "row_count": row_count,
        "repaired_rows": repaired_rows,
        "repair_counts": dict(counts),
        "marker_counts": dict(marker_counts),
        "label_counts": dict(label_counts),
        "examples": dict(examples),
    }
    manifest_path.parent.mkdir(parents=True, exist_ok=True)
    manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8")
    print(json.dumps({
        "row_count": row_count,
        "repaired_rows": repaired_rows,
        "repair_counts": dict(counts),
        "manifest": str(manifest_path),
        "output": None if args.dry_run else str(output_path),
    }, ensure_ascii=False, indent=2))
# Run the repair CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()