Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")

# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
```
- Notebooks
- Google Colab
- Kaggle
File size: 4,016 Bytes
e63569d 8c50d16 e63569d 8c50d16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
"""Repair known weak-label mistakes in exported AnimeName JSONL datasets."""
from __future__ import annotations
import argparse
import json
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List
from anifilebert.label_repairs import LabelRepair, repair_jsonl_item
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the label-repair script.

    Returns:
        The parsed namespace with ``input``, ``output``, ``manifest_output``,
        ``dry_run`` and ``example_limit`` attributes.
    """
    cli = argparse.ArgumentParser(description="Repair weak BIO labels in a JSONL dataset")

    # (flag, add_argument keyword arguments), registered in help-display order.
    option_specs = (
        ("--input", {"required": True, "help": "Input JSONL"}),
        ("--output", {"required": True, "help": "Output repaired JSONL"}),
        ("--manifest-output", {"default": None, "help": "Optional repair manifest JSON"}),
        ("--dry-run", {"action": "store_true", "help": "Scan only; do not write output JSONL"}),
        ("--example-limit", {"type": int, "default": 40}),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)

    return cli.parse_args()
def repair_key(repair: LabelRepair) -> str:
    """Return the ``"<kind>:<marker>"`` key used to group a repair in the manifest."""
    kind, marker = repair.kind, repair.marker
    return "{}:{}".format(kind, marker)
def main() -> None:
    """Repair every row of the input JSONL and write a repair manifest.

    The input file is read line by line; blank lines are skipped. Each
    remaining line is decoded as JSON and passed to ``repair_jsonl_item``,
    which returns the (possibly modified) row plus the list of repairs it
    applied. Unless ``--dry-run`` is given, every returned row is written to
    ``--output`` as one compact JSON line.

    A manifest (row count, repaired-row count, per-kind and per-marker repair
    counts, label counts, and up to ``--example-limit`` examples per repair
    key) is always written, to ``--manifest-output`` when given and otherwise
    next to the output file with a ``.manifest.json`` suffix. A short JSON
    summary is printed to stdout.

    Raises:
        SystemExit: if, outside dry-run mode, ``--input`` and ``--output``
            resolve to the same file (writing would truncate the input
            before it is read).
    """
    # Function-scope import keeps this block self-contained.
    from contextlib import ExitStack

    args = parse_args()
    input_path = Path(args.input)
    output_path = Path(args.output)
    if args.manifest_output:
        manifest_path = Path(args.manifest_output)
    else:
        manifest_path = output_path.with_suffix(".manifest.json")

    # Opening the output with "w" truncates it; if it is also the input, the
    # whole dataset would be lost before a single row is read.
    if not args.dry_run and input_path.resolve() == output_path.resolve():
        raise SystemExit(
            "--input and --output resolve to the same file; refusing to "
            "overwrite the input while reading it"
        )

    counts: Counter[str] = Counter()
    marker_counts: Counter[str] = Counter()
    examples: Dict[str, List[dict]] = defaultdict(list)
    label_counts: Counter[str] = Counter()
    row_count = 0
    repaired_rows = 0

    with ExitStack() as stack:
        # Open the input first so a missing/unreadable input fails before the
        # output file is created or truncated.
        handle = stack.enter_context(input_path.open("r", encoding="utf-8"))
        output_handle = None
        if not args.dry_run:
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_handle = stack.enter_context(output_path.open("w", encoding="utf-8"))

        for line in handle:
            line = line.strip()
            if not line:
                continue
            row_count += 1
            item = json.loads(line)
            repaired, repairs = repair_jsonl_item(item)
            if repairs:
                repaired_rows += 1
            for repair in repairs:
                key = repair_key(repair)
                counts[repair.kind] += 1
                marker_counts[key] += 1
                # Cap the stored examples per key so the manifest stays small.
                if len(examples[key]) < args.example_limit:
                    examples[key].append(
                        {
                            "file_id": item.get("file_id"),
                            "filename": item.get("filename"),
                            "marker": repair.marker,
                            "value": repair.value,
                            "span": [repair.start, repair.end],
                        }
                    )
            # Label statistics cover every row, repaired or not.
            label_counts.update(repaired.get("labels", []))
            if output_handle is not None:
                output_handle.write(
                    json.dumps(repaired, ensure_ascii=False, separators=(",", ":")) + "\n"
                )

    manifest = {
        "created_at": datetime.now(timezone.utc).isoformat(),
        "input": str(input_path),
        "output": None if args.dry_run else str(output_path),
        "dry_run": args.dry_run,
        "row_count": row_count,
        "repaired_rows": repaired_rows,
        "repair_counts": dict(counts),
        "marker_counts": dict(marker_counts),
        "label_counts": dict(label_counts),
        "examples": examples,
    }
    manifest_path.parent.mkdir(parents=True, exist_ok=True)
    manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8")

    print(json.dumps({
        "row_count": row_count,
        "repaired_rows": repaired_rows,
        "repair_counts": dict(counts),
        "manifest": str(manifest_path),
        "output": None if args.dry_run else str(output_path),
    }, ensure_ascii=False, indent=2))
# Run the repair only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|