Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")

# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
- Notebooks
- Google Colab
- Kaggle
File size: 5,852 Bytes
fed9d99 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 | #!/usr/bin/env python3
"""
Enforce a single contiguous TITLE span for every JSONL row.
This script is deterministic and streaming-friendly for very large datasets.
It is intended as a hard safety pass before/alongside LLM relabeling.
"""
from __future__ import annotations
import argparse
import json
from pathlib import Path
from typing import Dict, List, Sequence, Tuple
from anifilebert.label_repairs import repair_jsonl_item
def parse_args() -> argparse.Namespace:
    """Parse the command line for the contiguous-TITLE enforcement script.

    Returns:
        The parsed namespace with ``input``, ``output``, ``manifest_output``
        (empty string when not given) and ``progress`` (int, default 50000).
    """
    cli = argparse.ArgumentParser(description="Force contiguous TITLE spans in JSONL labels")

    # The two file paths are the only mandatory options.
    required_paths = (
        ("--input", "Input JSONL"),
        ("--output", "Output JSONL"),
    )
    for flag, help_text in required_paths:
        cli.add_argument(flag, required=True, help=help_text)

    cli.add_argument("--manifest-output", default="", help="Optional manifest JSON")
    cli.add_argument("--progress", type=int, default=50000, help="Progress print interval")

    namespace = cli.parse_args()
    return namespace
def normalize_iob2(labels: Sequence[str]) -> List[str]:
    """Rewrite the B-/I- prefixes of a label sequence so they follow IOB2.

    Any item that is not a string starting with ``"B-"`` or ``"I-"`` becomes
    ``"O"`` and closes the current entity run. Within a run of consecutive
    labels carrying the same entity name, the first is emitted as
    ``B-<entity>`` and the rest as ``I-<entity>``, regardless of the prefix
    the input used.

    Args:
        labels: Input labels; non-string items are tolerated and mapped to "O".

    Returns:
        A new list of the same length with normalized labels.
    """
    out: List[str] = []
    # None (rather than "") marks "no open entity": an empty string would
    # compare equal to the entity name of a malformed label such as "B-",
    # which would then be emitted as "I-" without any preceding "B-".
    prev = None
    for lb in labels:
        if not isinstance(lb, str) or not lb.startswith(("B-", "I-")):
            out.append("O")
            prev = None
            continue
        # Everything after the first "-" is the entity name (may contain "-").
        entity = lb.split("-", 1)[1]
        prefix = "I" if prev == entity else "B"
        out.append(f"{prefix}-{entity}")
        prev = entity
    return out
def is_discontinuous_title(labels: Sequence[str]) -> bool:
    """Report whether TITLE labels occur in more than one separate run.

    A label counts as a title label when it is a string ending in ``"TITLE"``.

    Args:
        labels: Label sequence to inspect; non-string items never count as
            title labels.

    Returns:
        True when at least one non-title item lies between two title labels,
        False otherwise (including when there are no title labels at all).
    """
    title_positions = [
        pos
        for pos, lb in enumerate(labels)
        if isinstance(lb, str) and lb.endswith("TITLE")
    ]
    if not title_positions:
        return False
    # A single run occupies exactly as many slots as it has members; any gap
    # makes the covered range wider than the number of title labels.
    covered = title_positions[-1] - title_positions[0] + 1
    return covered != len(title_positions)
def title_segments(labels: Sequence[str]) -> List[Tuple[int, int]]:
    """Locate every maximal run of consecutive TITLE labels.

    An item belongs to a run when its ``str()`` form ends in ``"TITLE"``.

    Args:
        labels: Label sequence to scan.

    Returns:
        ``(start, end)`` index pairs in left-to-right order, where ``start``
        is inclusive and ``end`` is exclusive.
    """
    spans: List[Tuple[int, int]] = []
    run_start = None  # index where the currently open run began, if any
    for pos, lb in enumerate(labels):
        if str(lb).endswith("TITLE"):
            if run_start is None:
                run_start = pos
        elif run_start is not None:
            spans.append((run_start, pos))
            run_start = None
    # A run that reaches the end of the sequence is closed at len(labels).
    if run_start is not None:
        spans.append((run_start, len(labels)))
    return spans
def first_episode_or_special_index(labels: Sequence[str]) -> int:
    """Return the index of the first EPISODE or SPECIAL label.

    An item matches when its ``str()`` form ends in ``"EPISODE"`` or
    ``"SPECIAL"``.

    Args:
        labels: Label sequence to scan.

    Returns:
        The position of the first match, or ``len(labels)`` when none exists.
    """
    markers = ("EPISODE", "SPECIAL")
    matches = (pos for pos, lb in enumerate(labels) if str(lb).endswith(markers))
    return next(matches, len(labels))
def pick_primary_title_segment(labels: Sequence[str], segs: Sequence[Tuple[int, int]]) -> Tuple[int, int]:
    """Choose the TITLE span that should be kept.

    Args:
        labels: Label sequence the spans refer to.
        segs: Candidate ``(start, end)`` spans.

    Returns:
        ``(-1, -1)`` when ``segs`` is empty. Otherwise the span with the
        smallest start among those starting before the first EPISODE/SPECIAL
        label, falling back to the span with the smallest start overall when
        no span starts before that boundary.
    """
    if not segs:
        return (-1, -1)

    boundary = first_episode_or_special_index(labels)
    leading = [span for span in segs if span[0] < boundary]
    # Fall back to every span when none starts ahead of the boundary.
    # NOTE(review): if any span starts before the boundary, the earliest span
    # overall does too, so both cases appear to select the same span.
    candidates = leading if leading else segs
    return min(candidates, key=lambda span: span[0])
def enforce_contiguous_title(labels: Sequence[str]) -> List[str]:
    """Return IOB2-normalized labels that contain at most one TITLE span.

    The labels are normalized first. When more than one TITLE span remains,
    the span chosen by ``pick_primary_title_segment`` is kept, every TITLE
    label outside it is replaced by ``"O"``, and the result is normalized
    once more.

    Args:
        labels: Input label sequence.

    Returns:
        A new list of labels with the same length as the input.
    """
    normalized = normalize_iob2(labels)
    spans = title_segments(normalized)
    if len(spans) > 1:
        start, stop = pick_primary_title_segment(normalized, spans)
        if start >= 0:
            # Blank out every TITLE label that falls outside the kept span.
            masked = [
                "O" if str(lb).endswith("TITLE") and not (start <= pos < stop) else lb
                for pos, lb in enumerate(normalized)
            ]
            return normalize_iob2(masked)
    return normalized
def main() -> None:
    """Stream the input JSONL, enforce one TITLE span per row, write results.

    Each non-blank input line is parsed as JSON. Rows that are not JSON
    objects, or whose ``tokens`` / ``labels`` are not lists of equal length,
    are counted as invalid and copied through unchanged. For every other row
    the labels are passed through ``enforce_contiguous_title`` and then
    ``repair_jsonl_item`` before being written.

    Output goes to a sibling ``.tmp`` file that replaces the real output path
    only after the whole input has been processed; the temporary file is
    removed if processing fails. A manifest with the counters is written as
    JSON and also printed.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the input, output or manifest file cannot be accessed.
    """
    args = parse_args()
    input_path = Path(args.input)
    output_path = Path(args.output)
    if args.manifest_output:
        manifest_path = Path(args.manifest_output)
    else:
        manifest_path = output_path.with_suffix(".contiguous_title.manifest.json")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    manifest_path.parent.mkdir(parents=True, exist_ok=True)

    rows = 0
    changed_rows = 0
    bad_before = 0
    bad_after = 0
    invalid_rows = 0

    tmp_path = output_path.with_suffix(output_path.suffix + ".tmp")
    try:
        with input_path.open("r", encoding="utf-8") as src, tmp_path.open(
            "w", encoding="utf-8", newline="\n"
        ) as dst:
            for line in src:
                # Whitespace-only lines are treated as blank instead of being
                # handed to json.loads, which would reject them.
                line = line.strip()
                if not line:
                    continue
                rows += 1
                rec = json.loads(line)

                # A row that is valid JSON but not an object has no .get();
                # treat it like any other structurally invalid row.
                is_object = isinstance(rec, dict)
                tokens = rec.get("tokens", []) if is_object else None
                labels = rec.get("labels", []) if is_object else None
                if not isinstance(tokens, list) or not isinstance(labels, list) or len(tokens) != len(labels):
                    invalid_rows += 1
                    dst.write(json.dumps(rec, ensure_ascii=False, separators=(",", ":")) + "\n")
                    continue

                if is_discontinuous_title(labels):
                    bad_before += 1

                new_labels = enforce_contiguous_title(labels)
                out_rec: Dict = dict(rec)
                out_rec["labels"] = new_labels
                repaired, _ = repair_jsonl_item(out_rec)
                out_labels = repaired.get("labels", new_labels)

                # Re-check after the repair step, since it may alter labels.
                if is_discontinuous_title(out_labels):
                    bad_after += 1
                if out_labels != labels:
                    changed_rows += 1

                repaired["labels"] = out_labels
                dst.write(json.dumps(repaired, ensure_ascii=False, separators=(",", ":")) + "\n")

                if args.progress > 0 and rows % args.progress == 0:
                    print(
                        f"rows={rows} changed={changed_rows} "
                        f"bad_before={bad_before} bad_after={bad_after} invalid={invalid_rows}"
                    )
        tmp_path.replace(output_path)
    except BaseException:
        # Do not leave a partial temporary file behind; cleanup is best-effort
        # and must never mask the original error, which is re-raised below.
        try:
            tmp_path.unlink()
        except OSError:
            pass
        raise

    manifest = {
        "input": str(input_path),
        "output": str(output_path),
        "rows": rows,
        "changed_rows": changed_rows,
        "discontinuous_before": bad_before,
        "discontinuous_after": bad_after,
        "invalid_rows": invalid_rows,
    }
    manifest_text = json.dumps(manifest, ensure_ascii=False, indent=2)
    manifest_path.write_text(manifest_text, encoding="utf-8")
    print(manifest_text)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|