Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
```
- Notebooks
- Google Colab
- Kaggle
| """Deterministic label repairs for known weak-label blind spots.""" | |
| from __future__ import annotations | |
| import re | |
| from dataclasses import dataclass | |
| from typing import Dict, Iterable, List, Optional, Sequence, Tuple | |
# Single-character tokens that count as separators when TITLE labels next to
# a season marker are cleared.  The full-width tilde (U+FF5E) is spelled as an
# escape so Unicode normalization cannot fold it into the ASCII "~" and
# silently drop it from the set.
SEPARATOR_CHARS = set(" \t-_.|~\uff5e")
# Season number for every supported Roman sequel marker, spelled either with
# ASCII letters or with the single-code-point Unicode Roman numerals.  Both
# series start at 2; there is no entry for "I".
ROMAN_NUMERAL_VALUES = {
    **{
        numeral: number
        for number, numeral in enumerate(
            ("II", "III", "IV", "V", "VI", "VII", "VIII", "IX"), start=2
        )
    },
    **{numeral: number for number, numeral in enumerate("ⅡⅢⅣⅤⅥⅦⅧⅨ", start=2)},
}
# Value of each single CJK numeral glyph, grouped by the number it stands for.
# Everyday, traditional/simplified and formal variants are listed side by side.
CN_NUMERAL_VALUES = {
    glyph: number
    for number, glyphs in (
        (1, "一"),
        (2, "二兩两貳贰弐弍"),
        (3, "三參叁参"),
        (4, "四肆"),
        (5, "五伍"),
        (6, "六陸陆"),
        (7, "七柒"),
        (8, "八捌"),
        (9, "九玖"),
        (10, "十"),
    )
    for glyph in glyphs
}
# Season number for each romanized sequel reading.  Keys are lower-case with
# single spaces, which is the form season_marker_number() normalizes to
# before looking a marker up.
READING_MARKER_VALUES = {
    reading: number
    for number, readings in (
        (
            2,
            (
                "ni no sara",
                "ni no shou",
                "ni no sho",
                "ni no syo",
                "ni no shō",
                "ni gakki",
                "sono ni",
            ),
        ),
        (3, ("san no sara", "san no shou", "san no sho", "san no syo")),
        (4, ("yon no sara", "shi no sara", "shin no sara")),
        (5, ("go no sara", "gou no sara")),
    )
    for reading in readings
}
# A lone "Ni" in a romanized title is usually the Japanese particle に, not
# the number two.  It is therefore only treated as a sequel marker when it
# directly follows one of these base titles, each of which has been verified
# to use "Ni" as a season marker in its release names.  The value is the
# season number to assign.
STANDALONE_NI_SEASON_BASES = {"Kakuriyo no Yadomeshi": 2}
# Anchored at the start of the text that FOLLOWS a candidate season marker.
# Accepts, after optional whitespace: a dash/underscore plus a number or an
# OP/ED/OVA-style code, a "#" number, or an opening bracket holding an
# optionally EP/#-prefixed number.  Case-insensitive.
EPISODE_CONTEXT_RE = re.compile(
    r"^\s*(?:"
    + "|".join(
        (
            r"[-_]\s*(?:\d{1,4}|NCOP|NCED|OP|ED|OVA|OAD|SP|END)\b",
            r"#\s*\d{1,4}",
            r"[\[\(【《]\s*(?:EP?|#)?\d{1,4}",
        )
    )
    + r")",
    re.IGNORECASE,
)
# Locates an episode span anywhere in a release name.  Alternatives, in order:
# SxxEyy, a dash/underscore-introduced number, a bracketed number, and an
# EP/第/#-prefixed number; each tolerates a trailing version suffix ("v2").
EPISODE_SPAN_RE = re.compile(
    r"(?:"
    + "|".join(
        (
            r"[Ss]\d{1,2}[Ee]\d{1,4}(?:v\d+)?",
            r"(?:^|[\s._])[-_]\s*\d{1,4}(?:v\d+)?(?=$|[\s._\-\]\)】》\[])",
            r"[\[\(【《](?:EP?|#)?\d{1,4}(?:v\d+)?[\]\)】》]",
            r"(?:^|[\s._\-\[\(【《#])(?:EP?|第|#)\d{1,4}(?:v\d+)?(?:[话話集])?(?=$|[\s._\-\]\)】》])",
        )
    )
    + r")",
    re.IGNORECASE,
)
# One bracketed group of any supported style: [..], (..), 【..】 or 《..》.
# Each style has its own capture group, so exactly one group is non-None
# per match.
BRACKET_RE = re.compile(
    "|".join(
        (
            r"\[([^\]]*)\]",
            r"\(([^)]*)\)",
            r"【([^】]*)】",
            r"《([^》]*)》",
        )
    )
)
# A resolution token such as 1080p, 4K or 1920x1080 that is not glued to
# other ASCII letters or digits on either side.
RESOLUTION_RE = re.compile(
    r"(?<![A-Za-z0-9])"
    r"(?:" + "|".join((r"\d{3,4}[pP]", r"\d[Kk]", r"\d{3,4}[xX×]\d{3,4}")) + r")"
    r"(?![A-Za-z0-9])"
)
# Alternation (without anchors or grouping) of every token recognised as
# SOURCE metadata, in four families: rip/media type, streaming service,
# codec, and subtitle/audio language tag.
SOURCE_TOKEN_PATTERN = "|".join(
    (
        r"WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV",
        r"Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X",
        r"x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|AAC|FLAC|MP3|DTS|Opus",
        r"CHS|CHT|GB|BIG5|JPN?|JPSC|JPTC|繁中|简中",
    )
)
# A single source token that is not glued to surrounding ASCII letters/digits.
SOURCE_RE = re.compile(
    r"(?<![A-Za-z0-9])(?:" + SOURCE_TOKEN_PATTERN + r")(?![A-Za-z0-9])",
    re.IGNORECASE,
)
# A whole string made only of source tokens joined by &, +, /, comma,
# underscore or hyphen (optionally padded with whitespace).
SOURCE_TAG_RE = re.compile(
    r"^(?:"
    + SOURCE_TOKEN_PATTERN
    + r")(?:\s*(?:[&+/,_-]|,\s*)\s*(?:"
    + SOURCE_TOKEN_PATTERN
    + r"))*$",
    re.IGNORECASE,
)
# Bracket content of the form "<keyword>: <anything>", where the keyword is a
# search/alias label in Chinese or English.  The separator may be an ASCII
# colon or the full-width colon (U+FF1A) that CJK release names normally use;
# the latter is written as an escape so Unicode normalization cannot fold it
# into a second, redundant ASCII colon.  Case-insensitive.
SPECIAL_TAG_RE = re.compile(
    r"^(?:檢索|检索|搜索|搜寻|搜尋|别名|別名|alias|search|keyword)\s*[:\uff1a].+",
    re.I,
)
# A whole string that is a special-content code with an optional number:
# NCOP/NCED/OP/ED/PV/CM, OVA/OAD/SP, or "IV" followed by at least one digit.
# (A bare "IV" does not match, so the Roman numeral four is not caught here.)
# Case-insensitive.
SPECIAL_CODE_RE = re.compile(
    "|".join(
        (
            r"^(?:NCOP|NCED|OP|ED|PV|CM)\d*$",
            r"^IV\d+$",
            r"^(?:OVA|OAD|SP)\d*$",
        )
    ),
    re.IGNORECASE,
)
# A romanized sequel reading ("Ni no Sara", "San no Shou", "Ni Gakki",
# "Sono Ni", ...) captured as group "marker".  It must not be glued to ASCII
# letters or digits on either side.  Matching is case-sensitive.
READING_MARKER_RE = re.compile(
    r"(?<![A-Za-z0-9])"
    r"(?P<marker>"
    + "|".join(
        (
            r"Ni\s+no\s+(?:Sara|Shou|Sho|Syo|Shō)",
            r"San\s+no\s+(?:Sara|Shou|Sho|Syo)",
            r"(?:Yon|Shi|Shin)\s+no\s+Sara",
            r"(?:Go|Gou)\s+no\s+Sara",
            r"Ni\s+Gakki",
            r"Sono\s+Ni",
        )
    )
    + r")"
    r"(?![A-Za-z0-9])"
)
# A Roman numeral from II to IX, in ASCII letters or as a single Unicode
# Roman-numeral code point, captured as group "marker".  It must not touch
# ASCII letters or digits on either side.  Matching is case-sensitive.
ROMAN_MARKER_RE = re.compile(
    r"(?<![A-Za-z0-9])"
    r"(?P<marker>"
    + "|".join(("II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "[ⅡⅢⅣⅤⅥⅦⅧⅨ]"))
    + r")"
    r"(?![A-Za-z0-9])"
)
# A CJK sequel marker captured as group "marker": either one numeral glyph
# with an optional "ノ/の/之 + 章/期/季/部" tail, or the explicit form
# "第<numerals or digits><季/期/部/章>".
CJK_MARKER_RE = re.compile(
    r"(?P<marker>"
    + "|".join(
        (
            r"[一二三四五六七八九十兩两貳贰弐弍參叁参肆伍陸陆柒捌玖](?:\s*(?:ノ|の|之)\s*(?:章|期|季|部))?",
            r"第[一二三四五六七八九十兩两貳贰弐弍參叁参肆伍陸陆柒捌玖\d]+[季期部章]",
        )
    )
    + r")"
)
@dataclass
class LabelRepair:
    """Record of one label repair applied to (or proposed for) a filename.

    The class must be a dataclass: callers in this module build it both
    positionally and by keyword, which a bare annotated class does not
    support.
    """

    # Repair category, e.g. "reading", "roman", "cjk", "verified_bare_ni",
    # "special", "source" or "resolution".
    kind: str
    # The matched text that triggered the repair.
    marker: str
    # Season number for sequel-marker repairs; 0 for structural meta repairs.
    value: int
    # Character offsets (start inclusive, end exclusive) of the repaired span.
    start: int
    end: int
def clean_marker_text(text: str) -> str:
    """Return *text* without surrounding whitespace and bracket characters.

    Strips whitespace, then any run of ASCII, CJK or full-width bracket
    characters at either end, then whitespace again.  The full-width
    parentheses (U+FF08 / U+FF09) are written as escapes so they cannot be
    folded into their ASCII twins by Unicode normalization.
    """
    brackets = "[]()【】《》\uff08\uff09"
    return text.strip().strip(brackets).strip()
def cn_number_to_int(text: str) -> Optional[int]:
    """Convert a decimal string or a CJK numeral from 1 to 99 to an int.

    Accepted forms: a string of decimal digits, a single glyph from
    CN_NUMERAL_VALUES, "十X" (10 + X), "X十" (X * 10) and "X十Y"
    (X * 10 + Y), where X and Y are glyphs worth 1-9.

    Returns None for anything else.  Malformed numerals such as "十十" or
    "二三十" are rejected rather than mapped to an arbitrary number.
    """
    text = text.strip()
    # isdecimal() rather than isdigit(): characters such as "²" are digits
    # but cannot be parsed by int().
    if text.isdecimal():
        return int(text)
    if text in CN_NUMERAL_VALUES:
        return CN_NUMERAL_VALUES[text]

    def unit(glyph: str) -> Optional[int]:
        # Value of a glyph usable in the tens or ones place (1-9), else None.
        value = CN_NUMERAL_VALUES.get(glyph)
        if value is None or not 1 <= value <= 9:
            return None
        return value

    if len(text) == 2 and text[0] == "十":
        ones = unit(text[1])
        return None if ones is None else 10 + ones
    if len(text) == 2 and text[1] == "十":
        tens = unit(text[0])
        return None if tens is None else tens * 10
    if len(text) == 3 and text[1] == "十":
        tens, ones = unit(text[0]), unit(text[2])
        if tens is None or ones is None:
            return None
        return tens * 10 + ones
    return None
def season_marker_number(text: str) -> Optional[int]:
    """Map a compact sequel marker (e.g. ``II`` or ``Ni no Sara``) to a season.

    The marker is cleaned of brackets and whitespace, then tried in turn as a
    Roman numeral, a romanized reading, the bare word "ni", an explicit
    "第N季/期/部/章" form and a single CJK numeral with optional suffix.
    Returns None when nothing matches or the marker is empty.
    """
    marker = clean_marker_text(text)
    if not marker:
        return None

    roman_value = ROMAN_NUMERAL_VALUES.get(marker)
    if roman_value is not None:
        return roman_value

    # Readings are compared lower-case with runs of whitespace collapsed.
    reading = re.sub(r"\s+", " ", marker.lower()).strip()
    if reading in READING_MARKER_VALUES:
        return READING_MARKER_VALUES[reading]
    if reading == "ni":
        return 2

    # The explicit "第..." form is checked before the single-glyph form; the
    # first pattern that matches decides the result, even when it is None.
    numeral_patterns = (
        r"第(.+)[季期部章]",
        r"([一二三四五六七八九十兩两貳贰弐弍參叁参肆伍陸陆柒捌玖])(?:\s*(?:ノ|の|之)\s*(?:章|期|季|部))?",
    )
    for numeral_pattern in numeral_patterns:
        found = re.fullmatch(numeral_pattern, marker)
        if found:
            return cn_number_to_int(found.group(1))
    return None
def token_offsets_in_text(text: str, tokens: Sequence[str]) -> Optional[List[Tuple[int, int]]]:
    """Locate each token in *text*, scanning left to right.

    Returns one ``(start, end)`` character span per token.  Each search starts
    where the previous token ended, so text between tokens may be skipped.
    An empty token gets a zero-width span at the current position.  Returns
    None as soon as a token cannot be found.
    """
    spans: List[Tuple[int, int]] = []
    position = 0
    for token in tokens:
        if token == "":
            spans.append((position, position))
        else:
            found_at = text.find(token, position)
            if found_at == -1:
                return None
            position = found_at + len(token)
            spans.append((found_at, position))
    return spans
def has_episode_context(text: str, marker_end: int) -> bool:
    """Tell whether episode-like text follows the marker ending at *marker_end*.

    The text after the marker is tested against EPISODE_CONTEXT_RE directly,
    and then once more after skipping one closing bracket and up to two
    bracketed menu/OP/ED-style tags.
    """
    remainder = text[marker_end:]
    if EPISODE_CONTEXT_RE.match(remainder):
        return True

    # Some releases close the title bracket right after the season marker and
    # put the episode in the next bracket: `[Title 貳之章][01]`.
    remainder = re.sub(r"^[\]\)】》]\s*", "", remainder.lstrip())
    remainder = re.sub(
        r"^(?:[\[\(【《]\s*(?:menu|menus|bdmenu|ncop|nced|op|ed|ova|oad|sp)\s*[\]\)】》]\s*){0,2}",
        "",
        remainder,
        flags=re.I,
    )
    return EPISODE_CONTEXT_RE.match(remainder) is not None
def find_sequel_season_markers(text: str) -> List[LabelRepair]:
    """Collect high-confidence sequel markers that should be labeled SEASON.

    A marker qualifies only when it maps to a season number and episode-like
    text follows it.  The result is sorted by position; where candidates
    overlap, only the longest is kept.
    """
    candidates: List[LabelRepair] = []

    generic_patterns = (
        ("reading", READING_MARKER_RE),
        ("roman", ROMAN_MARKER_RE),
        ("cjk", CJK_MARKER_RE),
    )
    for kind, marker_re in generic_patterns:
        for hit in marker_re.finditer(text):
            marker = hit.group("marker")
            season = season_marker_number(marker)
            if season is not None and has_episode_context(text, hit.end()):
                candidates.append(LabelRepair(kind, marker, season, hit.start(), hit.end()))

    # A bare "Ni" counts only directly after a verified base title.
    for base_title, season in STANDALONE_NI_SEASON_BASES.items():
        bare_ni = (
            r"(?<![A-Za-z0-9])"
            + re.escape(base_title)
            + r"\s+(?P<marker>Ni)(?![A-Za-z0-9])"
        )
        for hit in re.finditer(bare_ni, text):
            if has_episode_context(text, hit.end("marker")):
                candidates.append(
                    LabelRepair(
                        kind="verified_bare_ni",
                        marker=hit.group("marker"),
                        value=season,
                        start=hit.start("marker"),
                        end=hit.end("marker"),
                    )
                )

    candidates.sort(key=lambda candidate: (candidate.start, candidate.end))

    kept: List[LabelRepair] = []
    for candidate in candidates:
        if not kept or candidate.start >= kept[-1].end:
            kept.append(candidate)
            continue
        # Overlap with the last kept span: the strictly longer one wins.
        last = kept[-1]
        if candidate.end - candidate.start > last.end - last.start:
            kept[-1] = candidate
    return kept
def labels_have_season_before(labels: Sequence[str], offsets: Sequence[Tuple[int, int]], marker_start: int) -> bool:
    """Return True if a SEASON-labeled token ends at or before *marker_start*."""
    for label, (_token_start, token_end) in zip(labels, offsets):
        if label.endswith("SEASON") and token_end <= marker_start:
            return True
    return False
def token_indices_for_span(offsets: Sequence[Tuple[int, int]], start: int, end: int) -> List[int]:
    """Return the indices of tokens whose span overlaps ``[start, end)``."""
    overlapping: List[int] = []
    for index, (token_start, token_end) in enumerate(offsets):
        # Overlap means the token starts before the span ends and ends after
        # the span starts.
        if token_start < end and token_end > start:
            overlapping.append(index)
    return overlapping
def label_span(labels: List[str], indices: Sequence[int], entity: str) -> None:
    """Relabel ``labels[indices]`` in place as one IOB2 span of *entity*.

    The first token gets ``B-<entity>`` unless the token just before it
    already carries a label ending in *entity*, in which case the span
    continues with ``I-<entity>``.  All other tokens get ``I-<entity>``.
    """
    if not indices:
        return
    head = indices[0]
    continues_previous = head > 0 and labels[head - 1].endswith(entity)
    for position, index in enumerate(indices):
        opens_span = position == 0 and not continues_previous
        labels[index] = f"B-{entity}" if opens_span else f"I-{entity}"
def label_span_if_changed(labels: List[str], indices: Sequence[int], entity: str) -> bool:
    """Apply label_span() unless the span already has exactly those labels.

    Returns False when ``labels[indices]`` already equals the IOB2 labels
    that label_span() would write, otherwise relabels in place and returns
    True.  With empty *indices* nothing is written and True is returned.
    """
    continues_previous = (
        bool(indices) and indices[0] > 0 and labels[indices[0] - 1].endswith(entity)
    )
    # At least one expected label is always built, so an empty index list
    # never compares equal.
    wanted = [f"I-{entity}"] * max(1, len(indices))
    if not continues_previous:
        wanted[0] = f"B-{entity}"

    current = [labels[index] for index in indices]
    if current == wanted:
        return False
    label_span(labels, indices, entity)
    return True
def safe_to_overwrite_meta(labels: Sequence[str], indices: Sequence[int]) -> bool:
    """Return True when the indexed tokens may receive a meta label.

    False for an empty index list, or when any indexed label ends in GROUP,
    EPISODE or SEASON.
    """
    if not indices:
        return False
    protected = ("GROUP", "EPISODE", "SEASON")
    for index in indices:
        if labels[index].endswith(protected):
            return False
    return True
def mark_adjacent_title_separators_o(
    tokens: Sequence[str],
    labels: List[str],
    marker_indices: Sequence[int],
) -> None:
    """Reset TITLE-labeled separator tokens touching a marker span to ``O``.

    Walking left from the span, tokens that are empty or whitespace-only are
    cleared; walking right, tokens that are members of SEPARATOR_CHARS are
    cleared.  Each walk stops at the first token that is not such a separator
    or is not labeled TITLE.  *labels* is modified in place.
    """
    if not marker_indices:
        return

    for index in range(marker_indices[0] - 1, -1, -1):
        if "".join(tokens[index]).strip() != "" or not labels[index].endswith("TITLE"):
            break
        labels[index] = "O"

    for index in range(marker_indices[-1] + 1, len(tokens)):
        if tokens[index] not in SEPARATOR_CHARS or not labels[index].endswith("TITLE"):
            break
        labels[index] = "O"
def first_episode_end(labels: Sequence[str], offsets: Sequence[Tuple[int, int]], text: str) -> int:
    """Return the character offset where the first episode ends.

    Uses the smallest end offset among EPISODE-labeled tokens.  Without any
    such token, falls back to the end of the first EPISODE_SPAN_RE match in
    *text*, or 0 when there is none.
    """
    earliest = min(
        (
            token_end
            for label, (_token_start, token_end) in zip(labels, offsets)
            if label.endswith("EPISODE")
        ),
        default=None,
    )
    if earliest is not None:
        return earliest

    found = EPISODE_SPAN_RE.search(text)
    if found is None:
        return 0
    return found.end()
def bracket_content_spans(text: str) -> Iterable[Tuple[str, int, int, int, int]]:
    """Yield every bracketed group found in *text*.

    Each item is ``(content, inner_start, inner_end, bracket_start,
    bracket_end)``.  ``content`` is the bracket interior with surrounding
    whitespace stripped; ``inner_start``/``inner_end`` delimit the UNSTRIPPED
    interior, and ``bracket_start``/``bracket_end`` the whole group including
    its delimiters.
    """
    for found in BRACKET_RE.finditer(text):
        # Only the capture group of the bracket style that matched is set.
        interior = next((group for group in found.groups() if group is not None), None)
        if interior is None:
            continue
        # Every supported opening delimiter is a single code point.
        interior_start = found.start() + 1
        interior_end = interior_start + len(interior)
        yield interior.strip(), interior_start, interior_end, found.start(), found.end()
def repair_structural_meta_labels(
    text: str,
    tokens: Sequence[str],
    labels: List[str],
    offsets: Sequence[Tuple[int, int]],
) -> List[LabelRepair]:
    """Relabel SPECIAL / SOURCE / RESOLUTION metadata after the first episode.

    Bracketed groups that start at or after the end of the first episode are
    inspected first: a group that is wholly a special tag/code or a source tag
    is relabeled over its full interior, otherwise individual resolution and
    source tokens inside it are relabeled.  A second pass relabels resolution
    and source tokens found anywhere in *text* after the first episode.

    Tokens already labeled GROUP, EPISODE or SEASON are never overwritten.
    *labels* is modified in place; *tokens* is accepted for signature
    compatibility and is not used.

    Returns one LabelRepair (with value 0) per span whose labels changed.
    """
    repairs: List[LabelRepair] = []
    episode_end = first_episode_end(labels, offsets, text)

    def relabel(kind: str, entity: str, marker: str, start: int, end: int) -> None:
        # Apply one repair if it is allowed and actually changes something.
        indices = token_indices_for_span(offsets, start, end)
        if safe_to_overwrite_meta(labels, indices) and label_span_if_changed(labels, indices, entity):
            repairs.append(LabelRepair(kind, marker, 0, start, end))

    for clean, inner_start, inner_end, bracket_start, _bracket_end in bracket_content_spans(text):
        if bracket_start < episode_end or not clean:
            continue
        if SPECIAL_TAG_RE.fullmatch(clean) or SPECIAL_CODE_RE.fullmatch(clean):
            relabel("special", "SPECIAL", clean, inner_start, inner_end)
            continue
        if SOURCE_TAG_RE.fullmatch(clean):
            relabel("source", "SOURCE", clean, inner_start, inner_end)
            continue

        # `clean` is the stripped interior while inner_start points at the
        # unstripped one, so match offsets must be shifted by the amount of
        # leading whitespace; otherwise `[ 1080p ]` relabels the wrong chars.
        raw_inner = text[inner_start:inner_end]
        clean_start = inner_start + (len(raw_inner) - len(raw_inner.lstrip()))
        for pattern, kind, entity in (
            (RESOLUTION_RE, "resolution", "RESOLUTION"),
            (SOURCE_RE, "source", "SOURCE"),
        ):
            for match in pattern.finditer(clean):
                relabel(
                    kind,
                    entity,
                    match.group(0),
                    clean_start + match.start(),
                    clean_start + match.end(),
                )

    # Dot-separated WEB names often carry source/resolution after SxxEyy
    # without brackets.  Repair only after the episode span so titles are
    # never touched.
    for pattern, entity in ((RESOLUTION_RE, "RESOLUTION"), (SOURCE_RE, "SOURCE")):
        for match in pattern.finditer(text):
            if match.start() < episode_end:
                continue
            relabel(entity.lower(), entity, match.group(0), match.start(), match.end())
    return repairs
def repair_known_label_issues(
    item: Dict,
) -> Tuple[List[str], List[str], List[LabelRepair]]:
    """
    Repair known weak-label issues in one dataset row.

    Reads ``tokens``, ``labels`` and (optionally) ``filename`` from *item* and
    returns ``(tokens, repaired_labels, applied_repairs)``.  The input labels
    are returned untouched when tokens and labels differ in length or the
    tokens cannot be located in the text.

    The repair is intentionally conservative:
    - sequel markers must be immediately before an episode/special context;
    - sequel marker spans must currently be part of TITLE/O, not group/meta;
    - rows that already have a season before the marker are left alone;
    - structural meta repairs only touch spans after the first episode.
    """
    tokens = [str(token) for token in item.get("tokens", [])]
    labels = [str(label) for label in item.get("labels", [])]
    if len(tokens) != len(labels):
        return tokens, labels, []

    # Prefer the original filename; fall back to the concatenated tokens when
    # it is missing or the tokens cannot be aligned to it.
    joined_tokens = "".join(tokens)
    text = str(item.get("filename") or "") or joined_tokens
    offsets = token_offsets_in_text(text, tokens)
    if offsets is None:
        text = joined_tokens
        offsets = token_offsets_in_text(text, tokens)
        if offsets is None:
            return tokens, labels, []

    repaired = list(labels)
    applied: List[LabelRepair] = []

    # Cheap substring pre-filter so the regex search runs only on rows that
    # could contain a sequel marker.
    marker_hints = (
        " II", " III", " IV", " V", " VI", " VII", " VIII", " IX",
        "Ⅱ", "Ⅲ", "Ⅳ", "Ⅴ", "Ⅵ", "Ⅶ", "Ⅷ", "Ⅸ",
        "之章", "之期", "之季", "之部", "ノ章", "ノ期", "の章", "の期",
        "貳", "贰", "弐", "弍", "參", "叁", "参", "肆", "陸", "陆",
        "Ni ", " ni ", " no Sara", "Gakki",
    )
    lowered_text = text.lower()
    if any(hint in text or hint in lowered_text for hint in marker_hints):
        blocking_suffixes = ("GROUP", "EPISODE", "RESOLUTION", "SOURCE", "SPECIAL")
        for repair in find_sequel_season_markers(text):
            if labels_have_season_before(repaired, offsets, repair.start):
                continue
            span = token_indices_for_span(offsets, repair.start, repair.end)
            if not span:
                continue
            current = [repaired[index] for index in span]
            touches_meta = any(label.endswith(blocking_suffixes) for label in current)
            inside_title = any(label.endswith("TITLE") for label in current)
            if touches_meta or not inside_title:
                continue
            label_span(repaired, span, "SEASON")
            mark_adjacent_title_separators_o(tokens, repaired, span)
            applied.append(repair)

    applied.extend(repair_structural_meta_labels(text, tokens, repaired, offsets))
    return tokens, repaired, applied
def repair_sequel_season_labels(
    item: Dict,
) -> Tuple[List[str], List[str], List[LabelRepair]]:
    """Backward-compatible alias that delegates to repair_known_label_issues()."""
    result = repair_known_label_issues(item)
    return result
def repair_jsonl_item(item: Dict) -> Tuple[Dict, List[LabelRepair]]:
    """Repair one JSONL row and return ``(row, repairs)``.

    Labels are repaired and then normalized to IOB2.  The original *item*
    object is returned when nothing changed; otherwise a shallow copy with
    updated ``labels`` (and, when repairs were applied, ``tokens``) is
    returned.  *item* itself is never modified.
    """
    tokens, labels, repairs = repair_known_label_issues(item)
    labels = normalize_iob2(labels)

    if not repairs and labels == item.get("labels", []):
        return item, []

    updated = dict(item)
    if repairs:
        updated["tokens"] = tokens
    updated["labels"] = labels
    return updated, (repairs if repairs else [])
def normalize_iob2(labels: Sequence[str]) -> List[str]:
    """Return *labels* rewritten as a consistent IOB2 sequence.

    Anything not starting with ``B-`` or ``I-`` becomes ``O``.  For entity
    labels the prefix is recomputed from context: ``I-`` when the previous
    label carried the same entity, ``B-`` otherwise.
    """
    result: List[str] = []
    open_entity: Optional[str] = None
    for label in labels:
        prefix, dash, entity = label.partition("-")
        if dash and prefix in ("B", "I"):
            marker = "I" if open_entity == entity else "B"
            result.append(f"{marker}-{entity}")
            open_entity = entity
        else:
            result.append("O")
            open_entity = None
    return result