#!/usr/bin/env python3
"""
Semantic anime filename annotator.

Uses position-based understanding (NOT regex pattern matching) to assign BIO labels.
Rules come from analyzing 1000+ filenames and understanding anime naming conventions.
"""
import json
import os
import random
import re
import sqlite3
from collections import Counter

DB_PATH = "D:\\WorkSpace\\Python\\dmhy-parser\\dmhy_anime.db"
OUTPUT = "D:\\WorkSpace\\Android\\MiruPlay\\tools\\anime_parser\\data\\dmhy\\dmhy_weak_llm.jsonl"
VIDEO_EXTS = {".mkv", ".mp4", ".avi", ".mov", ".wmv", ".flv", ".rmvb", ".ts",
              ".m2ts", ".webm", ".mpg", ".mpeg", ".m4v"}
BATCH_SIZE = 500


def is_cjk(ch):
    """Return True if *ch* lies in one of the CJK/kana/fullwidth ranges below."""
    cp = ord(ch)
    return (0x4E00 <= cp <= 0x9FFF or 0x3400 <= cp <= 0x4DBF or
            0x3040 <= cp <= 0x309F or 0x30A0 <= cp <= 0x30FF or
            0xFF00 <= cp <= 0xFFEF)


KNOWN_GROUPS = {"ANi", "Baha", "SubsPlease", "Erai-raws", "LoliHouse", "Airota",
                "KissSub", "Skymoon-Raws", "Feibanyama", "jibaketa", "Nekomoe",
                "kissaten", "SweetSub", "FreesiaSub", "TSDM", "VCB-Studio",
                "Lilith-Raws", "DBD-Raws", "Haruhana", "FZ", "BeanSub", "orion",
                "origin", "Skymoon", "Raws", "GM-Team", "Leopard-Raws", "Anime",
                "Time", "Kamigami", "ReinForce", "Moozzi2", "Ohys-Raws", "Lv.1"}
EXPLICIT_SEASONS = {"S1", "S2", "S3", "S4", "S5", "S6", "S7", "S8", "S9", "S10",
                    "S01", "S02", "S03", "S04", "S05", "S06", "S07", "S08", "S09",
                    "S1Season", "S2Season"}

# Characters the tokenizer emits as stand-alone structural tokens.
_BRACKET_TOKENS = frozenset("[]()")
_OPENERS = ("[", "(")
_CLOSERS = ("]", ")")
# Punctuation that is emitted as a single-character token.
_SINGLE_PUNCT = ",_~|!?+:;&'\"#="
# Characters allowed inside a word when directly followed by an alphanumeric.
_WORD_JOINERS = "-.'"
# Tokens that never carry a TITLE label inside a free-text block.
_TEXT_SEPARATORS = frozenset(" -_~|.,!?+:;&'")
# Tokens that break a run of GROUP tokens into separate B-GROUP spans.
_GROUP_BREAKS = frozenset(" []()")

# NOTE(review): tokenize_filename() ends a digit run at the first non-digit, so
# "1080p" is split into "1080" plus a suffix and none of these entries can match
# tokenizer output; "1080" then falls through to the EPISODE rule. Left as-is
# because fixing it changes either the tokenization or the label vocabulary.
_RESOLUTION_WORDS = frozenset({"1080p", "720p", "2160p", "4k", "1920x1080",
                               "1280x720"})
# Lower-cased vocabulary labelled SOURCE inside multi-token brackets.
_SOURCE_WORDS = frozenset({
    "cht", "chs", "big5", "gb", "jpn", "jp", "web-dl", "webrip", "bdrip", "aac",
    "flac", "hevc", "avc", "x264", "x265", "h.264", "opus", "srt", "ass",
    "assx2", "aacx2", "hevc-10bit", "baha", "viutv", "iqiyi", "cr", "netflix",
    "jptc", "chs_jp", "cht_jp", "multiple", "subtitle", "ani-one", "srviutv",
    "pgs"})
_SEASON_RE = re.compile(r'^[Ss]\d+$')
_MAX_EPISODE = 2000


def tokenize_filename(filename):
    """Tokenize an anime filename into tokens. Brackets/parens are separate tokens.

    Spaces, each CJK character, the punctuation in ``_SINGLE_PUNCT`` and a
    literal ``...`` become tokens of their own.  Digit runs form one token.
    A word starts at a letter and continues over letters, digits and joiners
    (``-``, ``.``, ``'``) that are directly followed by an alphanumeric.  A
    ``-`` outside a word is kept only when the previous token is not a space or
    bracket; every other character is dropped.
    """
    tokens = []
    i = 0
    n = len(filename)
    while i < n:
        c = filename[i]
        if c in _BRACKET_TOKENS or c == ' ':
            tokens.append(c)
            i += 1
        elif filename.startswith('...', i):
            tokens.append('...')
            i += 3
        elif is_cjk(c) or c in _SINGLE_PUNCT:
            tokens.append(c)
            i += 1
        elif c.isdigit():
            j = i
            while j < n and filename[j].isdigit():
                j += 1
            tokens.append(filename[i:j])
            i = j
        elif c.isalpha():
            j = i
            while j < n and (
                    filename[j].isalpha() or filename[j].isdigit()
                    or (filename[j] in _WORD_JOINERS and j + 1 < n
                        and filename[j + 1].isalnum())):
                j += 1
            token = filename[i:j]
            # A joiner can end the word when the character after it is
            # alphanumeric but neither a letter nor a digit (e.g. a fraction).
            while len(token) > 1 and token[-1] in _WORD_JOINERS:
                token = token[:-1]
            tokens.append(token)
            i = j
        else:
            if c == '-' and tokens and tokens[-1] not in _GROUP_BREAKS:
                tokens.append(c)
            i += 1
    return tokens


def _find_bracket_pairs(tokens):
    """Pair every closing bracket with the most recent unmatched opener."""
    pairs = []
    open_stack = []
    for idx, tok in enumerate(tokens):
        if tok in _OPENERS:
            open_stack.append(idx)
        elif tok in _CLOSERS and open_stack:
            pairs.append((open_stack.pop(), idx))
    return pairs


def _split_blocks(tokens, pairs):
    """Split the token list into ('text'|'bracket', start, end) blocks."""
    blocks = []
    prev_end = -1
    for open_i, close_i in sorted(pairs):
        if open_i > prev_end + 1:
            blocks.append(('text', prev_end + 1, open_i))
        blocks.append(('bracket', open_i, close_i))
        # max(): a nested pair closes before its parent and must not pull
        # prev_end backwards, or the parent's closer is re-read as text.
        prev_end = max(prev_end, close_i)
    if prev_end < len(tokens) - 1:
        blocks.append(('text', prev_end + 1, len(tokens)))
    return blocks


def _is_episode_number(tok):
    """True for a decimal token in 1.._MAX_EPISODE.

    isdecimal() rather than isdigit(): isdigit() also accepts characters such
    as superscript digits, which int() rejects with ValueError.
    """
    return tok.isdecimal() and 1 <= int(tok) <= _MAX_EPISODE


def _single_token_role(tok, is_first_bracket):
    """Role of the only content token of a bracket."""
    if is_first_bracket:
        return 'GROUP'
    if _is_episode_number(tok):
        return 'EPISODE'
    if tok.lower() in _RESOLUTION_WORDS:
        return 'RESOLUTION'
    if _SEASON_RE.match(tok):
        return 'SEASON'
    return 'SOURCE'


def _multi_token_role(tok, is_first_bracket, content_count):
    """Role of one token inside a bracket holding several content tokens."""
    lowered = tok.lower()
    if lowered in _RESOLUTION_WORDS:
        return 'RESOLUTION'
    if lowered in _SOURCE_WORDS:
        return 'SOURCE'
    if is_first_bracket and (tok in KNOWN_GROUPS or content_count <= 3):
        return 'GROUP'
    if _SEASON_RE.match(tok):
        return 'SEASON'
    if _is_episode_number(tok):
        return 'EPISODE'
    if is_cjk(tok[0]):
        return 'TITLE'
    return 'SOURCE'


def _roles_to_labels(tokens, roles):
    """Turn the index -> role map into one BIO label per token."""
    labels = ['O'] * len(tokens)
    title_started = False
    prev_group = None
    for idx in sorted(roles):
        role = roles[idx]
        if role == 'TITLE':
            # Only the first title token in the whole filename opens the span.
            labels[idx] = 'I-TITLE' if title_started else 'B-TITLE'
            title_started = True
        elif role == 'GROUP':
            # A group token continues the previous one only when nothing but
            # non-space, non-bracket tokens lies between them.
            if prev_group is not None and not any(
                    tokens[j] in _GROUP_BREAKS
                    for j in range(prev_group + 1, idx)):
                labels[idx] = 'I-GROUP'
            else:
                labels[idx] = 'B-GROUP'
            prev_group = idx
        else:
            # SEASON / EPISODE / RESOLUTION / SOURCE are always single B- tags.
            labels[idx] = 'B-' + role
    return labels


def analyze_filename(filename, tokens):
    """Assign BIO labels to tokens based on semantic understanding.

    Args:
        filename: The file stem the tokens came from (currently unused).
        tokens: Token list as produced by ``tokenize_filename``.

    Returns:
        A list of labels, one per token: ``O`` or one of ``B-TITLE``,
        ``I-TITLE``, ``B-GROUP``, ``I-GROUP``, ``B-SEASON``, ``B-EPISODE``,
        ``B-RESOLUTION``, ``B-SOURCE``.  Bracket and space tokens are always
        ``O``.
    """
    # Phase 1: identify structure (bracketed spans and the text between them).
    blocks = _split_blocks(tokens, _find_bracket_pairs(tokens))

    # Phase 2: assign a role to each content token.
    roles = {}  # token_index -> role
    for blk_type, start, end in blocks:
        if blk_type == 'text':
            for j in range(start, end):
                tok = tokens[j]
                if tok not in _TEXT_SEPARATORS and tok not in _BRACKET_TOKENS:
                    roles[j] = 'TITLE'
            continue

        content_idx = [j for j in range(start + 1, end) if tokens[j] != ' ']
        if not content_idx:
            continue
        # "First bracket" means nothing has been given a role yet; it is
        # evaluated once, before this bracket adds its own roles.
        is_first_bracket = not roles
        if len(content_idx) == 1:
            j = content_idx[0]
            if tokens[j] not in _BRACKET_TOKENS:
                roles[j] = _single_token_role(tokens[j], is_first_bracket)
        else:
            for j in content_idx:
                if tokens[j] in _BRACKET_TOKENS:
                    continue  # nested bracket characters stay unlabelled
                roles[j] = _multi_token_role(
                    tokens[j], is_first_bracket, len(content_idx))

    # Phase 3: apply the roles as BIO labels.
    return _roles_to_labels(tokens, roles)


def _collect_annotations(rows, limit):
    """Build up to *limit* annotation records from (file_id, raw_path) rows.

    A row is kept when its extension is in VIDEO_EXTS, its stem has not been
    seen before, it yields at least three tokens, and the labels contain a
    B-EPISODE plus a B-TITLE or B-GROUP.
    """
    results = []
    seen_stems = set()
    for fid, raw in rows:
        stem = re.split(r"[\\/]", raw.strip())[-1].strip()
        stem, ext = os.path.splitext(stem)
        if ext.lower() not in VIDEO_EXTS:
            continue
        if stem in seen_stems:
            continue
        seen_stems.add(stem)

        tokens = tokenize_filename(stem)
        if len(tokens) < 3:
            continue
        labels = analyze_filename(stem, tokens)
        if 'B-EPISODE' not in labels:
            continue
        if 'B-TITLE' not in labels and 'B-GROUP' not in labels:
            continue

        results.append({
            "file_id": fid,
            "filename": stem,
            "tokens": tokens,
            "labels": labels
        })
        if len(results) >= limit:
            break
    return results


def main(db_path=DB_PATH, output=OUTPUT, batch_size=BATCH_SIZE):
    """Sample filenames from the database and write weak labels as JSONL.

    Args:
        db_path: SQLite database with a ``files(id, filename)`` table; opened
            read-only.
        output: Path of the JSONL file to (over)write.
        batch_size: Maximum number of records to write; three times as many
            rows are sampled to leave room for filtering.
    """
    conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True, timeout=30)
    try:
        conn.execute("PRAGMA query_only=ON")
        cursor = conn.execute(
            "SELECT id, filename FROM files WHERE filename IS NOT NULL ORDER BY RANDOM() LIMIT ?",
            (batch_size * 3,)
        )
        results = _collect_annotations(cursor, batch_size)
    finally:
        # Close even when the query or the labelling raises.
        conn.close()

    # Write output
    with open(output, "w", encoding="utf-8") as f:
        for r in results:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

    # Stats
    b_season = sum(1 for r in results if "B-SEASON" in r["labels"])
    b_title = sum(1 for r in results if "B-TITLE" in r["labels"])
    print(f"Wrote {len(results)} annotations to {output}")
    print(f"  B-TITLE: {b_title}")
    print(f"  B-SEASON: {b_season}")
    # Every kept record is required to contain a B-EPISODE.
    print(f"  B-EPISODE: {len(results)}")


if __name__ == "__main__":
    main()