Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")

# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
- Notebooks
- Google Colab
- Kaggle
| #!/usr/bin/env python3 | |
| """ | |
| Semantic anime filename annotator. | |
| Uses position-based understanding (NOT regex pattern matching) to assign BIO labels. | |
| Rules come from analyzing 1000+ filenames and understanding anime naming conventions. | |
| """ | |
| import json, re, sqlite3, os, random | |
| from collections import Counter | |
# SQLite database that main() opens read-only and samples filenames from.
DB_PATH = "D:\\WorkSpace\\Python\\dmhy-parser\\dmhy_anime.db"

# JSONL file that main() overwrites with one annotation record per line.
OUTPUT = "D:\\WorkSpace\\Android\\MiruPlay\\tools\\anime_parser\\data\\dmhy\\dmhy_weak_llm.jsonl"

# Lower-case file extensions that main() accepts as video files.
VIDEO_EXTS = {
    ".mkv",
    ".mp4",
    ".avi",
    ".mov",
    ".wmv",
    ".flv",
    ".rmvb",
    ".ts",
    ".m2ts",
    ".webm",
    ".mpg",
    ".mpeg",
    ".m4v",
}

# Number of annotations main() collects; it queries three times as many rows
# because some are dropped by its filters.
BATCH_SIZE = 500
def is_cjk(ch):
    """Return True when the single character *ch* is in one of the CJK ranges.

    The ranges checked are CJK Unified Ideographs (U+4E00-U+9FFF), CJK
    Extension A (U+3400-U+4DBF), Hiragana (U+3040-U+309F), Katakana
    (U+30A0-U+30FF) and Halfwidth and Fullwidth Forms (U+FF00-U+FFEF).

    *ch* must be exactly one character because it is passed to ord().
    """
    code_point = ord(ch)
    inclusive_ranges = (
        (0x4E00, 0x9FFF),
        (0x3400, 0x4DBF),
        (0x3040, 0x309F),
        (0x30A0, 0x30FF),
        (0xFF00, 0xFFEF),
    )
    return any(low <= code_point <= high for low, high in inclusive_ranges)
# Release-group names (and name fragments) that analyze_filename() accepts as
# GROUP tokens inside the first bracket of a filename.
KNOWN_GROUPS = {
    "ANi",
    "Airota",
    "Anime",
    "Baha",
    "BeanSub",
    "DBD-Raws",
    "Erai-raws",
    "FZ",
    "Feibanyama",
    "FreesiaSub",
    "GM-Team",
    "Haruhana",
    "Kamigami",
    "KissSub",
    "Leopard-Raws",
    "Lilith-Raws",
    "LoliHouse",
    "Lv.1",
    "Moozzi2",
    "Nekomoe",
    "Ohys-Raws",
    "Raws",
    "ReinForce",
    "Skymoon",
    "Skymoon-Raws",
    "SubsPlease",
    "SweetSub",
    "TSDM",
    "Time",
    "VCB-Studio",
    "jibaketa",
    "kissaten",
    "origin",
    "orion",
}

# Explicit season markers: S1..S10, zero-padded S01..S09, and two "SnSeason"
# spellings. NOTE(review): not referenced by the code visible in this file.
EXPLICIT_SEASONS = (
    {f"S{number}" for number in range(1, 11)}
    | {f"S{number:02d}" for number in range(1, 10)}
    | {"S1Season", "S2Season"}
)
def _is_wide_char(ch):
    """Return True when *ch* is a CJK/fullwidth character according to is_cjk()."""
    # Every range is_cjk() tests lies far above ASCII, so the common ASCII
    # case is answered without the range lookup.
    return ord(ch) > 0x7F and is_cjk(ch)


def _resolution_token_end(filename, digits_end):
    """Return the end index of a resolution-style token, or *digits_end*.

    *digits_end* is the index just past a run of digits. The run is extended
    over a single trailing ``p``/``k`` ("1080p", "4K") or over ``x`` followed
    by more digits ("1920x1080"), but only when the suffix is not followed by
    further Latin letters or digits. That keeps forms such as "01v2" or
    "1080pX" tokenized as before.
    """
    n = len(filename)
    if digits_end >= n:
        return digits_end
    suffix = filename[digits_end]
    end = digits_end + 1
    if suffix in 'xX':
        while end < n and filename[end].isdigit() and not _is_wide_char(filename[end]):
            end += 1
        if end == digits_end + 1:
            # 'x' without a second number is not a resolution.
            return digits_end
    elif suffix not in 'pPkK':
        return digits_end
    if end < n and filename[end].isalnum() and not _is_wide_char(filename[end]):
        # The suffix is only the start of a longer word; leave the split alone.
        return digits_end
    return end


def tokenize_filename(filename):
    """Tokenize an anime filename into a list of string tokens.

    Token classes, in order of precedence:

    * each of ``[ ] ( )`` and each space is its own token;
    * ``...`` is one token;
    * each CJK/fullwidth character (per ``is_cjk``) is its own token;
    * each of ``, _ ~ | ! ? + : ; & ' " # =`` is its own token;
    * a run of digits is one token, extended to cover resolution spellings
      such as ``1080p``, ``4K`` and ``1920x1080``;
    * a run starting with a letter continues over letters and digits, and
      over ``-``, ``.`` or ``'`` when followed by a letter or digit; it stops
      at the first CJK/fullwidth character;
    * ``-`` is kept only when the previous token is not a space or bracket.

    Any other character is dropped.

    Args:
        filename: The filename (normally without extension) to split.

    Returns:
        The list of tokens in order of appearance.
    """
    tokens = []
    i = 0
    n = len(filename)
    while i < n:
        c = filename[i]
        if c in '[]()' or c == ' ':
            tokens.append(c)
            i += 1
        elif filename.startswith('...', i):
            tokens.append('...')
            i += 3
        elif _is_wide_char(c):
            tokens.append(c)
            i += 1
        elif c in ',_~|!?+:;&\'\"#=':
            tokens.append(c)
            i += 1
        elif c.isdigit():
            j = i + 1
            # Fullwidth digits are CJK tokens of their own, so they end the run.
            while j < n and filename[j].isdigit() and not _is_wide_char(filename[j]):
                j += 1
            # Keep "1080p" / "1920x1080" whole so the labeller can recognise
            # them instead of mistaking "1080" for an episode number.
            j = _resolution_token_end(filename, j)
            tokens.append(filename[i:j])
            i = j
        elif c.isalpha():
            j = i + 1
            while j < n:
                ch = filename[j]
                if _is_wide_char(ch):
                    # str.isalpha() is true for CJK too; those characters must
                    # stay single-character tokens.
                    break
                if ch.isalpha() or ch.isdigit():
                    j += 1
                    continue
                joins_word = (
                    ch in '-.\''
                    and j + 1 < n
                    and filename[j + 1].isalnum()
                    and not _is_wide_char(filename[j + 1])
                )
                if not joins_word:
                    break
                j += 1
            # A joiner is only consumed together with the character after it,
            # so the token can never end in '-', '.' or "'".
            tokens.append(filename[i:j])
            i = j
        else:
            # A dash directly after a word is kept as a token; every other
            # unrecognised character is dropped.
            if c == '-' and tokens and tokens[-1] not in (' ', '[', ']', '(', ')'):
                tokens.append(c)
            i += 1
    return tokens
# Bracket punctuation; never receives an entity label.
_BRACKETS = frozenset('[]()')
# Single-character separators that are skipped inside free-text blocks.
_SEPARATORS = frozenset(" -_~|.,!?+:;&'")
# Tokens that split two GROUP tokens into separate B-GROUP spans.
_GROUP_BREAKS = _BRACKETS | {' '}
# Lower-cased resolution spellings recognised inside brackets.
_RESOLUTIONS = frozenset({'1080p', '720p', '2160p', '4k', '1920x1080', '1280x720'})
# Lower-cased keywords that mark a token of a multi-token bracket as SOURCE
# before the GROUP / SEASON / EPISODE / TITLE rules are tried.
_BRACKET_SOURCES = frozenset({
    'cht', 'chs', 'big5', 'gb', 'jpn', 'jp',
    'web-dl', 'webrip', 'bdrip', 'aac', 'flac',
    'hevc', 'avc', 'x264', 'x265', 'h.264', 'opus',
    'srt', 'ass', 'assx2', 'aacx2', 'hevc-10bit',
    'baha', 'viutv', 'iqiyi', 'cr', 'netflix',
    'jptc', 'chs_jp', 'cht_jp', 'multiple', 'subtitle',
    'ani-one', 'srviutv', 'pgs',
})
_SEASON_RE = re.compile(r'^[Ss]\d+$')
# Roles that always map to a single B- label.
_FIXED_LABELS = {
    'SEASON': 'B-SEASON',
    'EPISODE': 'B-EPISODE',
    'RESOLUTION': 'B-RESOLUTION',
    'SOURCE': 'B-SOURCE',
}


def _find_blocks(tokens):
    """Split *tokens* into ('text', start, end) and ('bracket', open, close) blocks.

    Text blocks use an exclusive end; bracket blocks give the indices of the
    opening and closing bracket tokens. Unmatched brackets form no block.
    """
    pairs = []
    open_stack = []
    for i, tok in enumerate(tokens):
        if tok in ('[', '('):
            open_stack.append(i)
        elif tok in (']', ')') and open_stack:
            pairs.append((open_stack.pop(), i))

    blocks = []
    covered = -1
    for open_i, close_i in sorted(pairs):
        if open_i > covered + 1:
            blocks.append(('text', covered + 1, open_i))
        blocks.append(('bracket', open_i, close_i))
        # A nested pair closes before its parent does; never move backwards,
        # otherwise the following text block would overlap the parent.
        covered = max(covered, close_i)
    if covered < len(tokens) - 1:
        blocks.append(('text', covered + 1, len(tokens)))
    return blocks


def _is_episode_number(tok):
    """Return True when *tok* is a decimal number between 1 and 2000."""
    # isdecimal() rather than isdigit(): int() rejects digit-like characters
    # such as superscript two, which isdigit() accepts.
    return tok.isdecimal() and 1 <= int(tok) <= 2000


def _single_token_role(tok, is_first_bracket):
    """Return the role of the only content token of a bracket."""
    if is_first_bracket:
        return 'GROUP'
    if _is_episode_number(tok):
        return 'EPISODE'
    if tok.lower() in _RESOLUTIONS:
        return 'RESOLUTION'
    if _SEASON_RE.match(tok):
        return 'SEASON'
    # Anything else in a lone bracket is treated as source/metadata.
    return 'SOURCE'


def _multi_token_role(tok, is_first_bracket, content_count):
    """Return the role of one token of a bracket holding several tokens."""
    lowered = tok.lower()
    if lowered in _RESOLUTIONS:
        return 'RESOLUTION'
    if lowered in _BRACKET_SOURCES:
        return 'SOURCE'
    if is_first_bracket and (tok in KNOWN_GROUPS or content_count <= 3):
        return 'GROUP'
    if _SEASON_RE.match(tok):
        return 'SEASON'
    if _is_episode_number(tok):
        return 'EPISODE'
    if is_cjk(tok[0]):
        return 'TITLE'
    return 'SOURCE'


def _roles_to_labels(tokens, roles):
    """Convert the index -> role mapping into a BIO label list."""
    labels = ['O'] * len(tokens)
    title_started = False
    prev_group = None
    for idx in sorted(roles):
        role = roles[idx]
        if role == 'TITLE':
            # Only the first title token of the filename opens the span.
            labels[idx] = 'I-TITLE' if title_started else 'B-TITLE'
            title_started = True
        elif role == 'GROUP':
            continues = prev_group is not None and not any(
                tokens[j] in _GROUP_BREAKS for j in range(prev_group + 1, idx)
            )
            labels[idx] = 'I-GROUP' if continues else 'B-GROUP'
            prev_group = idx
        else:
            labels[idx] = _FIXED_LABELS[role]
    return labels


def analyze_filename(filename, tokens):
    """Assign one BIO label to every token of a tokenized filename.

    The tokens are split into bracketed blocks and the free text between
    them. Free-text tokens that are not separators become TITLE. Bracket
    contents are classified by position and content:

    * the first labelled block, if it is a bracket, is treated as the release
      GROUP;
    * later brackets yield EPISODE (a number from 1 to 2000), RESOLUTION,
      SEASON (``S<digits>``), SOURCE, or TITLE for CJK tokens.

    Bracket punctuation, spaces and separators always stay ``'O'``.

    Args:
        filename: The original filename. Unused; kept for interface
            compatibility.
        tokens: Tokens as produced by ``tokenize_filename``.

    Returns:
        A list of labels with the same length as *tokens*. Labels used are
        ``'O'``, ``'B-TITLE'``, ``'I-TITLE'``, ``'B-GROUP'``, ``'I-GROUP'``,
        ``'B-SEASON'``, ``'B-EPISODE'``, ``'B-RESOLUTION'`` and ``'B-SOURCE'``.
    """
    roles = {}  # token index -> role
    for kind, start, end in _find_blocks(tokens):
        if kind == 'text':
            if all(tokens[j] in _SEPARATORS for j in range(start, end)):
                continue
            for j in range(start, end):
                tok = tokens[j]
                # Unmatched brackets end up in text blocks; they are
                # punctuation, not title content.
                if tok not in _SEPARATORS and tok not in _BRACKETS:
                    roles[j] = 'TITLE'
            continue

        # Decided once per bracket, before any of its own tokens get a role.
        is_first_bracket = not roles
        content = [j for j in range(start + 1, end) if tokens[j] != ' ']
        content_count = len(content)
        for j in content:
            tok = tokens[j]
            if tok in _BRACKETS:
                # Punctuation of a nested bracket; its contents are handled
                # again by the nested block itself.
                continue
            if content_count == 1:
                # Label the token's real position: it may follow padding
                # spaces, so it is not necessarily at start + 1.
                roles[j] = _single_token_role(tok, is_first_bracket)
            else:
                roles[j] = _multi_token_role(tok, is_first_bracket, content_count)

    return _roles_to_labels(tokens, roles)
def _make_record(fid, raw, seen_stems):
    """Build the annotation record for one database row, or return None.

    A row is rejected when its extension is not in VIDEO_EXTS, when its stem
    was already seen, when it has fewer than three tokens, or when the labels
    lack B-EPISODE or lack both B-TITLE and B-GROUP. Accepted stems are added
    to *seen_stems*.
    """
    # Keep only the last path component; both slash styles are accepted.
    name = re.split(r"[\\/]", raw.strip())[-1].strip()
    stem, ext = os.path.splitext(name)
    if ext.lower() not in VIDEO_EXTS:
        return None
    if stem in seen_stems:
        return None
    seen_stems.add(stem)

    tokens = tokenize_filename(stem)
    if len(tokens) < 3:
        return None
    labels = analyze_filename(stem, tokens)
    if len(tokens) != len(labels):
        return None
    if 'B-EPISODE' not in labels:
        return None
    if 'B-TITLE' not in labels and 'B-GROUP' not in labels:
        return None
    return {
        "file_id": fid,
        "filename": stem,
        "tokens": tokens,
        "labels": labels
    }


def main():
    """Sample filenames from the database, label them and write a JSONL file.

    Reads up to ``BATCH_SIZE * 3`` random rows from the ``files`` table of
    ``DB_PATH`` (opened read-only), keeps at most ``BATCH_SIZE`` records that
    pass the filters, writes them to ``OUTPUT`` and prints summary counts.
    """
    conn = sqlite3.connect(f"file:{DB_PATH}?mode=ro", uri=True, timeout=30)
    try:
        conn.execute("PRAGMA query_only=ON")
        # Over-sample: many rows are dropped by the filters in _make_record.
        cursor = conn.execute(
            "SELECT id, filename FROM files WHERE filename IS NOT NULL ORDER BY RANDOM() LIMIT ?",
            (BATCH_SIZE * 3,)
        )
        results = []
        seen_stems = set()
        for fid, raw in cursor:
            record = _make_record(fid, raw, seen_stems)
            if record is None:
                continue
            results.append(record)
            if len(results) >= BATCH_SIZE:
                break
    finally:
        # Release the connection even when the query or the labelling raises.
        conn.close()

    # Create the output directory first so the run does not fail at the end.
    out_dir = os.path.dirname(OUTPUT)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(OUTPUT, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(r, ensure_ascii=False) + "\n" for r in results)

    # Stats
    b_season = sum(1 for r in results if "B-SEASON" in r["labels"])
    b_title = sum(1 for r in results if "B-TITLE" in r["labels"])
    print(f"Wrote {len(results)} annotations to {OUTPUT}")
    print(f" B-TITLE: {b_title}")
    print(f" B-SEASON: {b_season}")
    # Every kept record contains B-EPISODE, so the count equals len(results).
    print(f" B-EPISODE: {len(results)}")


if __name__ == "__main__":
    main()