Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")

# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
```
- Notebooks
- Google Colab
- Kaggle
| #!/usr/bin/env python3 | |
| """ | |
| Enforce a single contiguous TITLE span for every JSONL row. | |
| This script is deterministic and streaming-friendly for very large datasets. | |
| It is intended as a hard safety pass before/alongside LLM relabeling. | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| from pathlib import Path | |
| from typing import Dict, List, Sequence, Tuple | |
| from anifilebert.label_repairs import repair_jsonl_item | |
def parse_args() -> argparse.Namespace:
    """Read the command-line options for the contiguous-TITLE pass.

    Returns:
        Namespace with ``input`` and ``output`` (both required),
        ``manifest_output`` (empty string when not supplied) and
        ``progress`` (int, defaults to 50000).
    """
    cli = argparse.ArgumentParser(description="Force contiguous TITLE spans in JSONL labels")
    # (flag, add_argument keyword options), registered in --help order.
    option_table = (
        ("--input", {"required": True, "help": "Input JSONL"}),
        ("--output", {"required": True, "help": "Output JSONL"}),
        ("--manifest-output", {"default": "", "help": "Optional manifest JSON"}),
        ("--progress", {"type": int, "default": 50000, "help": "Progress print interval"}),
    )
    for flag, options in option_table:
        cli.add_argument(flag, **options)
    return cli.parse_args()
def normalize_iob2(labels: Sequence[str]) -> List[str]:
    """Rewrite a label sequence so its B-/I- prefixes follow IOB2.

    Only the entity name of each input label is trusted; the prefix is
    recomputed from position. A token gets ``I-<entity>`` when the token
    immediately before it carries the same entity, and ``B-<entity>``
    otherwise. Two adjacent ``B-X`` labels therefore merge into one span.

    Anything that is not a string, does not start with ``B-`` or ``I-``,
    or has an empty entity name (``"B-"`` / ``"I-"``) becomes ``"O"`` and
    closes the current span.

    Args:
        labels: Raw per-token labels; elements may be malformed.

    Returns:
        A new list of the same length containing only ``"O"``,
        ``"B-<entity>"`` and ``"I-<entity>"`` strings.
    """
    normalized: List[str] = []
    # None means "no open span". An empty string is not usable as the
    # sentinel: it would compare equal to the entity of a bare "B-" label
    # and make that label come out as "I-".
    open_entity = None
    for label in labels:
        entity = ""
        if isinstance(label, str) and label.startswith(("B-", "I-")):
            # The prefix is exactly two characters, so this is everything
            # after the first hyphen.
            entity = label[2:]
        if not entity:
            normalized.append("O")
            open_entity = None
            continue
        prefix = "I" if entity == open_entity else "B"
        normalized.append(f"{prefix}-{entity}")
        open_entity = entity
    return normalized
def is_discontinuous_title(labels: Sequence[str]) -> bool:
    """Tell whether the TITLE labels are split into more than one run.

    A label counts as TITLE when it is a string ending in ``"TITLE"``;
    non-string entries never count.

    Args:
        labels: Per-token labels for one row.

    Returns:
        True when at least one non-TITLE token sits between two TITLE
        tokens, False otherwise (including when there is no TITLE at all).
    """
    title_positions = [
        pos
        for pos, label in enumerate(labels)
        if isinstance(label, str) and label.endswith("TITLE")
    ]
    if not title_positions:
        return False
    # A single contiguous run covers exactly as many indices as it has members.
    covered = title_positions[-1] - title_positions[0] + 1
    return covered != len(title_positions)
def title_segments(labels: Sequence[str]) -> List[Tuple[int, int]]:
    """Locate every maximal run of consecutive TITLE labels.

    Each label is passed through ``str()`` and counts as TITLE when the
    result ends in ``"TITLE"``.

    Args:
        labels: Per-token labels for one row.

    Returns:
        ``(start, end)`` index pairs in left-to-right order, where ``start``
        is inclusive and ``end`` is exclusive.
    """
    spans: List[Tuple[int, int]] = []
    run_start = None  # index where the currently open TITLE run began
    for pos, label in enumerate(labels):
        if str(label).endswith("TITLE"):
            if run_start is None:
                run_start = pos
        elif run_start is not None:
            spans.append((run_start, pos))
            run_start = None
    # A run that reaches the final token is closed at the sequence length.
    if run_start is not None:
        spans.append((run_start, len(labels)))
    return spans
def first_episode_or_special_index(labels: Sequence[str]) -> int:
    """Find the position of the first EPISODE or SPECIAL label.

    Each label is passed through ``str()`` and matched by suffix.

    Args:
        labels: Per-token labels for one row.

    Returns:
        Index of the first label ending in ``"EPISODE"`` or ``"SPECIAL"``,
        or ``len(labels)`` when there is none.
    """
    boundary_suffixes = ("EPISODE", "SPECIAL")
    matches = (
        pos
        for pos, label in enumerate(labels)
        if str(label).endswith(boundary_suffixes)
    )
    return next(matches, len(labels))
def pick_primary_title_segment(labels: Sequence[str], segs: Sequence[Tuple[int, int]]) -> Tuple[int, int]:
    """Choose the TITLE span to keep: the one with the smallest start index.

    Earlier code first filtered ``segs`` down to the spans starting before
    the first EPISODE/SPECIAL label and only fell back to all spans when
    that filter was empty. The filter could never change the answer: if any
    span starts before the boundary, the span with the smallest start does
    too, so both paths returned the same span. The boundary scan was a
    redundant pass over ``labels`` and has been removed.

    Args:
        labels: Per-token labels for the row. Not consulted; kept so
            existing callers do not need to change.
        segs: Candidate ``(start, end)`` spans.

    Returns:
        The span with the smallest ``start`` (the first such span on ties),
        or ``(-1, -1)`` when ``segs`` is empty.
    """
    if not segs:
        return (-1, -1)
    return min(segs, key=lambda seg: seg[0])
def enforce_contiguous_title(labels: Sequence[str]) -> List[str]:
    """Return labels in which at most one contiguous TITLE span survives.

    The labels are normalized with ``normalize_iob2`` first. If that leaves
    zero or one TITLE span, the normalized labels are returned as they are.
    Otherwise the span chosen by ``pick_primary_title_segment`` is kept,
    every other TITLE token is replaced with ``"O"``, and the result is
    normalized once more.

    Args:
        labels: Per-token labels for one row.

    Returns:
        A new label list of the same length.
    """
    normalized = normalize_iob2(labels)
    spans = title_segments(normalized)
    if len(spans) < 2:
        return normalized

    first, last = pick_primary_title_segment(normalized, spans)
    if first < 0:
        return normalized

    kept = range(first, last)  # half-open index range of the surviving span
    pruned = [
        "O" if str(label).endswith("TITLE") and pos not in kept else label
        for pos, label in enumerate(normalized)
    ]
    return normalize_iob2(pruned)
def main() -> None:
    """Stream the input JSONL, force one contiguous TITLE span per row, and write results.

    For each non-blank input line the JSON record is parsed and handled as
    follows:

    * Rows that are not JSON objects, or whose ``tokens`` / ``labels`` are
      not lists of equal length, are counted as invalid and written through
      unchanged.
    * All other rows get their labels rewritten by
      ``enforce_contiguous_title`` and are then passed to
      ``repair_jsonl_item``. The first element of its result is used as the
      output record (assumed dict-like; confirm against
      ``anifilebert.label_repairs``).

    Output goes to a sibling ``<output>.tmp`` file that replaces the real
    output path only after the whole input has been processed. If
    processing fails part-way, the partial temp file is deleted and the
    error propagates. A manifest with the run counters is written as JSON
    and also printed.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the input, output or manifest files cannot be accessed.
    """
    args = parse_args()
    input_path = Path(args.input)
    output_path = Path(args.output)
    if args.manifest_output:
        manifest_path = Path(args.manifest_output)
    else:
        manifest_path = output_path.with_suffix(".contiguous_title.manifest.json")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    manifest_path.parent.mkdir(parents=True, exist_ok=True)

    rows = 0
    changed_rows = 0
    bad_before = 0
    bad_after = 0
    invalid_rows = 0

    tmp_path = output_path.with_suffix(output_path.suffix + ".tmp")
    try:
        with input_path.open("r", encoding="utf-8") as src, tmp_path.open("w", encoding="utf-8", newline="\n") as dst:
            for line in src:
                # strip() rather than rstrip("\n") so whitespace-only lines
                # are skipped like empty ones instead of failing to parse.
                line = line.strip()
                if not line:
                    continue
                rows += 1
                rec = json.loads(line)

                # Valid JSON that is not an object has no .get(); treat it as
                # an invalid row instead of crashing with AttributeError.
                is_object = isinstance(rec, dict)
                tokens = rec.get("tokens", []) if is_object else None
                labels = rec.get("labels", []) if is_object else None
                if not isinstance(tokens, list) or not isinstance(labels, list) or len(tokens) != len(labels):
                    invalid_rows += 1
                    dst.write(json.dumps(rec, ensure_ascii=False, separators=(",", ":")) + "\n")
                    continue

                if is_discontinuous_title(labels):
                    bad_before += 1

                new_labels = enforce_contiguous_title(labels)
                out_rec: Dict = dict(rec)
                out_rec["labels"] = new_labels
                repaired, _ = repair_jsonl_item(out_rec)
                out_labels = repaired.get("labels", new_labels)

                # Re-check after the repair step so the manifest reports any
                # rows that are still discontinuous.
                if is_discontinuous_title(out_labels):
                    bad_after += 1
                if out_labels != labels:
                    changed_rows += 1

                repaired["labels"] = out_labels
                dst.write(json.dumps(repaired, ensure_ascii=False, separators=(",", ":")) + "\n")

                if args.progress > 0 and rows % args.progress == 0:
                    print(
                        f"rows={rows} changed={changed_rows} "
                        f"bad_before={bad_before} bad_after={bad_after} invalid={invalid_rows}"
                    )
    except BaseException:
        # Do not leave a truncated temp file behind. Errors from the cleanup
        # itself are ignored so the original failure is what gets reported.
        try:
            tmp_path.unlink()
        except OSError:
            pass
        raise

    # Outside the try block: a failed rename must not delete a complete temp file.
    tmp_path.replace(output_path)

    manifest = {
        "input": str(input_path),
        "output": str(output_path),
        "rows": rows,
        "changed_rows": changed_rows,
        "discontinuous_before": bad_before,
        "discontinuous_after": bad_after,
        "invalid_rows": invalid_rows,
    }
    manifest_text = json.dumps(manifest, ensure_ascii=False, indent=2)
    manifest_path.write_text(manifest_text, encoding="utf-8")
    print(manifest_text)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()