Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
```
- Notebooks
- Google Colab
- Kaggle
| use anyhow::{ensure, Context, Result}; | |
| use fancy_regex::Regex as FancyRegex; | |
| use regex::Regex; | |
| use serde_json::Value; | |
| use std::fs::File; | |
| use std::hint::black_box; | |
| use std::io::{BufRead, BufReader}; | |
| use std::path::PathBuf; | |
| use std::time::Instant; | |
// Alternation of release-source, codec, audio and subtitle-language tokens.
// It is interpolated (wrapped in a case-insensitive group) into the `source`
// and `source_tag` regexes of both pattern sets.
// NOTE(review): `JPN?` is listed before `JPSC|JPTC`; with leftmost-first
// alternation an unanchored search would presumably settle on `JP` inside
// `JPSC` -- confirm the ordering is intended.
| const SOURCE_TOKEN_PATTERN: &str = r"WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X|x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|AAC|FLAC|MP3|DTS|Opus|CHS|CHT|GB|BIG5|JPN?|JPSC|JPTC|繁中|简中"; | |
// Resolution forms: 3-4 digits followed by `p`/`P`, one digit followed by
// `K`/`k`, or `<width>x<height>` with 3-4 digits on each side. Interpolated
// into the `resolution` regex of both pattern sets.
| const RESOLUTION_BODY: &str = r"\d{3,4}[pP]|\d[Kk]|\d{3,4}[xX×]\d{3,4}"; | |
// Start-anchored "search / alias" keyword, optional whitespace, a colon, then
// at least one more character.
// NOTE(review): the class `[::]` lists the same colon twice; presumably one of
// them was meant to be the full-width colon -- confirm against the upstream file.
| const SPECIAL_TAG_PATTERN: &str = | |
| r"(?i)^(?:檢索|检索|搜索|搜寻|搜尋|别名|別名|alias|search|keyword)\s*[::].+"; | |
// Whole-string special codes: NCOP/NCED/OP/ED/PV/CM with optional digits,
// `IV` with at least one digit, or OVA/OAD/SP with optional digits.
| const SPECIAL_CODE_PATTERN: &str = | |
| r"(?i)^(?:NCOP|NCED|OP|ED|PV|CM)\d*$|^IV\d+$|^(?:OVA|OAD|SP)\d*$"; | |
// Start-anchored episode context: a `-`/`_` separator followed by a number or
// special code, `#` plus a number, or an opening bracket plus an optional
// `E`/`EP`/`#` prefix and a number.
| const EPISODE_CONTEXT_PATTERN: &str = r"(?i)^\s*(?:[-_]\s*(?:\d{1,4}|NCOP|NCED|OP|ED|OVA|OAD|SP|END)\b|#\s*\d{1,4}|[\[\(【《]\s*(?:EP?|#)?\d{1,4})"; | |
// Episode-number spans (SxxEyy, `- 03`, `[03]`, `EP03`, `第03话`, ...). Uses
// look-ahead (`(?=...)`), so both pattern sets compile it with `FancyRegex`.
| const EPISODE_SPAN_PATTERN: &str = r"(?i)(?:[Ss]\d{1,2}[Ee]\d{1,4}(?:v\d+)?|(?:^|[\s._])[-_]\s*\d{1,4}(?:v\d+)?(?=$|[\s._\-\]\)】》\[])|[\[\(【《](?:EP?|#)?\d{1,4}(?:v\d+)?[\]\)】》]|(?:^|[\s._\-\[\(【《#])(?:EP?|第|#)\d{1,4}(?:v\d+)?(?:[话話集])?(?=$|[\s._\-\]\)】》]))"; | |
// Romanised reading markers (e.g. `Ni no Sara`, `Ni Gakki`, `Sono Ni`) captured
// as `marker`. Guarded by look-behind/look-ahead so the marker is not embedded
// in an ASCII alphanumeric run; compiled with `FancyRegex` in both pattern sets.
| const READING_MARKER_PATTERN: &str = r"(?i)(?<![A-Za-z0-9])(?P<marker>Ni\s+no\s+(?:Sara|Shou|Sho|Syo|Shō)|San\s+no\s+(?:Sara|Shou|Sho|Syo)|(?:Yon|Shi|Shin)\s+no\s+Sara|(?:Go|Gou)\s+no\s+Sara|Ni\s+Gakki|Sono\s+Ni)(?![A-Za-z0-9])"; | |
// Roman numerals II-IX (ASCII letters or the single-codepoint forms) captured
// as `marker`, with the same look-around guards as the reading markers.
| const ROMAN_MARKER_PATTERN: &str = | |
| r"(?<![A-Za-z0-9])(?P<marker>II|III|IV|V|VI|VII|VIII|IX|[ⅡⅢⅣⅤⅥⅦⅧⅨ])(?![A-Za-z0-9])"; | |
// CJK numeral markers: a single numeral character with an optional
// `ノ/の/之` + `章/期/季/部` suffix, or `第` + numerals/digits + `季/期/部/章`.
| const CJK_MARKER_PATTERN: &str = r"(?:[一二三四五六七八九十兩两貳贰弐弍參叁参肆伍陸陆柒捌玖](?:\s*(?:ノ|の|之)\s*(?:章|期|季|部))?|第[一二三四五六七八九十兩两貳贰弐弍參叁参肆伍陸陆柒捌玖\d]+[季期部章])"; | |
// Zero to two bracketed menu/special tags at the start of the text. The group
// may repeat zero times and only the start is anchored, so the pattern also
// matches text that carries no such prefix.
| const SPECIAL_CONTEXT_PREFIX_PATTERN: &str = | |
| r"(?i)^(?:[\[\(【《]\s*(?:menu|menus|bdmenu|ncop|nced|op|ed|ova|oad|sp)\s*[\]\)】》]\s*){0,2}"; | |
| pub fn run(input: &PathBuf, limit_rows: usize, repeat: usize) -> Result<()> { | |
| ensure!(repeat > 0, "--regex-benchmark-repeat must be greater than 0"); | |
| let filenames = load_filenames(input, limit_rows)?; | |
| if filenames.is_empty() { | |
| anyhow::bail!("no filenames loaded from {}", input.display()); | |
| } | |
| let selective = SelectivePatterns::new()?; | |
| let fancy_all = FancyAllPatterns::new()?; | |
| let (selective_seconds, selective_count) = | |
| time_repeated(repeat, || run_selective(&filenames, &selective))?; | |
| let (fancy_seconds, fancy_count) = | |
| time_repeated(repeat, || run_fancy_all(&filenames, &fancy_all))?; | |
| ensure!( | |
| selective_count == fancy_count, | |
| "selective and fancy-all match counts differ: selective={}, fancy_all={}", | |
| selective_count, | |
| fancy_count | |
| ); | |
| let ratio = if selective_seconds > 0.0 { | |
| fancy_seconds / selective_seconds | |
| } else { | |
| 0.0 | |
| }; | |
| println!( | |
| "{}", | |
| serde_json::json!({ | |
| "rows": filenames.len(), | |
| "repeat": repeat, | |
| "selective_seconds": selective_seconds, | |
| "fancy_all_seconds": fancy_seconds, | |
| "ratio": ratio, | |
| "match_count": selective_count, | |
| }) | |
| ); | |
| Ok(()) | |
| } | |
| fn time_repeated<F>(repeat: usize, mut run_once: F) -> Result<(f64, usize)> | |
| where | |
| F: FnMut() -> Result<usize>, | |
| { | |
| let started = Instant::now(); | |
| let mut count = 0usize; | |
| for _ in 0..repeat { | |
| count = count.wrapping_add(black_box(run_once()?)); | |
| } | |
| Ok((started.elapsed().as_secs_f64(), count)) | |
| } | |
| struct SelectivePatterns { | |
| resolution: Regex, | |
| source: Regex, | |
| source_tag: Regex, | |
| special_tag: Regex, | |
| special_code: Regex, | |
| episode_context: Regex, | |
| episode_span: FancyRegex, | |
| reading_marker: FancyRegex, | |
| roman_marker: FancyRegex, | |
| cjk_marker: Regex, | |
| special_context_prefix: Regex, | |
| } | |
| impl SelectivePatterns { | |
| fn new() -> Result<Self> { | |
| Ok(Self { | |
| resolution: Regex::new(&format!(r"(?i)(?:{RESOLUTION_BODY})"))?, | |
| source: Regex::new(&format!(r"(?i)(?:{SOURCE_TOKEN_PATTERN})"))?, | |
| source_tag: Regex::new(&format!( | |
| r"(?i)^(?:{SOURCE_TOKEN_PATTERN})(?:\s*(?:[&+/,_-]|,\s*)\s*(?:{SOURCE_TOKEN_PATTERN}))*$" | |
| ))?, | |
| special_tag: Regex::new(SPECIAL_TAG_PATTERN)?, | |
| special_code: Regex::new(SPECIAL_CODE_PATTERN)?, | |
| episode_context: Regex::new(EPISODE_CONTEXT_PATTERN)?, | |
| episode_span: FancyRegex::new(EPISODE_SPAN_PATTERN)?, | |
| reading_marker: FancyRegex::new(READING_MARKER_PATTERN)?, | |
| roman_marker: FancyRegex::new(ROMAN_MARKER_PATTERN)?, | |
| cjk_marker: Regex::new(CJK_MARKER_PATTERN)?, | |
| special_context_prefix: Regex::new(SPECIAL_CONTEXT_PREFIX_PATTERN)?, | |
| }) | |
| } | |
| } | |
| struct FancyAllPatterns { | |
| resolution: FancyRegex, | |
| source: FancyRegex, | |
| source_tag: FancyRegex, | |
| special_tag: FancyRegex, | |
| special_code: FancyRegex, | |
| episode_context: FancyRegex, | |
| episode_span: FancyRegex, | |
| reading_marker: FancyRegex, | |
| roman_marker: FancyRegex, | |
| cjk_marker: FancyRegex, | |
| special_context_prefix: FancyRegex, | |
| } | |
| impl FancyAllPatterns { | |
| fn new() -> Result<Self> { | |
| Ok(Self { | |
| resolution: FancyRegex::new(&format!(r"(?i)(?:{RESOLUTION_BODY})"))?, | |
| source: FancyRegex::new(&format!(r"(?i)(?:{SOURCE_TOKEN_PATTERN})"))?, | |
| source_tag: FancyRegex::new(&format!( | |
| r"(?i)^(?:{SOURCE_TOKEN_PATTERN})(?:\s*(?:[&+/,_-]|,\s*)\s*(?:{SOURCE_TOKEN_PATTERN}))*$" | |
| ))?, | |
| special_tag: FancyRegex::new(SPECIAL_TAG_PATTERN)?, | |
| special_code: FancyRegex::new(SPECIAL_CODE_PATTERN)?, | |
| episode_context: FancyRegex::new(EPISODE_CONTEXT_PATTERN)?, | |
| episode_span: FancyRegex::new(EPISODE_SPAN_PATTERN)?, | |
| reading_marker: FancyRegex::new(READING_MARKER_PATTERN)?, | |
| roman_marker: FancyRegex::new(ROMAN_MARKER_PATTERN)?, | |
| cjk_marker: FancyRegex::new(CJK_MARKER_PATTERN)?, | |
| special_context_prefix: FancyRegex::new(SPECIAL_CONTEXT_PREFIX_PATTERN)?, | |
| }) | |
| } | |
| } | |
| fn run_selective(filenames: &[String], patterns: &SelectivePatterns) -> Result<usize> { | |
| let mut count = 0usize; | |
| for filename in filenames { | |
| count = count.wrapping_add( | |
| patterns | |
| .resolution | |
| .find_iter(filename) | |
| .filter(|mat| has_ascii_token_boundaries(filename, mat.start(), mat.end())) | |
| .count(), | |
| ); | |
| count = count.wrapping_add( | |
| patterns | |
| .source | |
| .find_iter(filename) | |
| .filter(|mat| has_ascii_token_boundaries(filename, mat.start(), mat.end())) | |
| .count(), | |
| ); | |
| count = count.wrapping_add(patterns.episode_context.is_match(filename) as usize); | |
| count = count.wrapping_add(patterns.cjk_marker.find_iter(filename).count()); | |
| count = count.wrapping_add(fancy_count(&patterns.episode_span, filename)?); | |
| count = count.wrapping_add(fancy_count(&patterns.reading_marker, filename)?); | |
| count = count.wrapping_add(fancy_count(&patterns.roman_marker, filename)?); | |
| for inner in bracket_inners(filename) { | |
| count = count.wrapping_add(patterns.source_tag.is_match(&inner) as usize); | |
| count = count.wrapping_add(patterns.special_tag.is_match(&inner) as usize); | |
| count = count.wrapping_add(patterns.special_code.is_match(&inner) as usize); | |
| count = count.wrapping_add(patterns.special_context_prefix.is_match(&inner) as usize); | |
| } | |
| } | |
| Ok(count) | |
| } | |
| fn run_fancy_all(filenames: &[String], patterns: &FancyAllPatterns) -> Result<usize> { | |
| let mut count = 0usize; | |
| for filename in filenames { | |
| count = count.wrapping_add(fancy_count_with_boundaries(&patterns.resolution, filename)?); | |
| count = count.wrapping_add(fancy_count_with_boundaries(&patterns.source, filename)?); | |
| count = count.wrapping_add(patterns.episode_context.is_match(filename)? as usize); | |
| count = count.wrapping_add(fancy_count(&patterns.cjk_marker, filename)?); | |
| count = count.wrapping_add(fancy_count(&patterns.episode_span, filename)?); | |
| count = count.wrapping_add(fancy_count(&patterns.reading_marker, filename)?); | |
| count = count.wrapping_add(fancy_count(&patterns.roman_marker, filename)?); | |
| for inner in bracket_inners(filename) { | |
| count = count.wrapping_add(patterns.source_tag.is_match(&inner)? as usize); | |
| count = count.wrapping_add(patterns.special_tag.is_match(&inner)? as usize); | |
| count = count.wrapping_add(patterns.special_code.is_match(&inner)? as usize); | |
| count = count.wrapping_add(patterns.special_context_prefix.is_match(&inner)? as usize); | |
| } | |
| } | |
| Ok(count) | |
| } | |
| fn fancy_count(regex: &FancyRegex, text: &str) -> Result<usize> { | |
| let mut count = 0usize; | |
| for item in regex.find_iter(text) { | |
| let _ = item?; | |
| count += 1; | |
| } | |
| Ok(count) | |
| } | |
| fn fancy_count_with_boundaries(regex: &FancyRegex, text: &str) -> Result<usize> { | |
| let mut count = 0usize; | |
| for item in regex.find_iter(text) { | |
| let mat = item?; | |
| if has_ascii_token_boundaries(text, mat.start(), mat.end()) { | |
| count += 1; | |
| } | |
| } | |
| Ok(count) | |
| } | |
| fn load_filenames(path: &PathBuf, limit_rows: usize) -> Result<Vec<String>> { | |
| let file = File::open(path).with_context(|| format!("failed to open {}", path.display()))?; | |
| let reader = BufReader::new(file); | |
| let mut filenames = Vec::new(); | |
| for (idx, line) in reader.lines().enumerate() { | |
| if limit_rows > 0 && filenames.len() >= limit_rows { | |
| break; | |
| } | |
| let raw = line.with_context(|| format!("failed reading line {}", idx + 1))?; | |
| if raw.trim().is_empty() { | |
| continue; | |
| } | |
| let value: Value = serde_json::from_str(&raw) | |
| .with_context(|| format!("invalid JSONL line {}", idx + 1))?; | |
| if let Some(filename) = value.get("filename").and_then(Value::as_str) { | |
| filenames.push(filename.to_string()); | |
| } | |
| } | |
| Ok(filenames) | |
| } | |
/// Returns the text enclosed by each bracket pair in `text`, in order of
/// appearance.
///
/// Recognised pairs are `[]`, `()`, `【】` and `《》`. An opening bracket is
/// paired with the first matching closing bracket that follows it, and
/// scanning resumes after that closing bracket, so segments never nest or
/// overlap. An opening bracket with no matching closer is skipped.
///
/// The scan works on borrowed slices of `text`, so the only allocations are
/// the returned strings.
fn bracket_inners(text: &str) -> Vec<String> {
    let mut inners = Vec::new();
    let mut rest = text;

    while let Some(open) = rest.chars().next() {
        let after_open = &rest[open.len_utf8()..];
        let close = match open {
            '[' => ']',
            '(' => ')',
            '【' => '】',
            '《' => '》',
            _ => {
                rest = after_open;
                continue;
            }
        };

        match after_open.split_once(close) {
            Some((inner, tail)) => {
                inners.push(inner.to_owned());
                rest = tail;
            }
            // Unclosed opener: step past it and keep scanning.
            None => rest = after_open,
        }
    }

    inners
}
/// Reports whether the byte range `start..end` of `text` is delimited on both
/// sides by something other than an ASCII letter or digit.
///
/// The start and end of `text` count as valid delimiters, as does any
/// non-ASCII character.
///
/// # Panics
///
/// Panics if `start` or `end` is out of range or not on a character boundary.
fn has_ascii_token_boundaries(text: &str, start: usize, end: usize) -> bool {
    let before = text[..start].chars().next_back();
    let after = text[end..].chars().next();
    let is_word = |neighbour: Option<char>| matches!(neighbour, Some(ch) if ch.is_ascii_alphanumeric());
    !(is_word(before) || is_word(after))
}
/// Unit tests for the benchmark helpers.
///
/// The module is gated on `cfg(test)` and every case carries `#[test]`, so the
/// cases are picked up by the test harness and are not compiled into normal
/// builds.
#[cfg(test)]
mod tests {
    use super::*;

    /// Both pattern sets must count the same matches on representative filenames.
    #[test]
    fn selective_and_fancy_all_count_the_same_matches() -> Result<()> {
        let filenames = vec![
            "[GM-Team][國漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4".to_string(),
            "[YYDM&VCB-Studio] Shinsekai Yori II [NCED02][Ma10p_1080p][x265_flac].mkv".to_string(),
            "Witch.Hat.Atelier.S01E07.1080p.NF.WEB-DL.JPN.AAC2.0.H.264.MSubs-ToonsHub".to_string(),
            "[Lilith-Raws] San no Sara - 03 [Baha][WEB-DL][1080p][AVC AAC][CHT].mp4".to_string(),
            "[Test] 搜索: 别名 [OVA01][BDRip][720p]".to_string(),
        ];
        let selective = SelectivePatterns::new()?;
        let fancy_all = FancyAllPatterns::new()?;
        assert_eq!(
            run_selective(&filenames, &selective)?,
            run_fancy_all(&filenames, &fancy_all)?
        );
        Ok(())
    }

    /// All four supported bracket pairs are extracted in order.
    #[test]
    fn bracket_inners_extract_supported_pairs() {
        assert_eq!(bracket_inners("[A](B)【C】《D》"), vec!["A", "B", "C", "D"]);
    }

    /// Matches glued to ASCII letters or digits are rejected; delimited ones pass.
    #[test]
    fn ascii_token_boundaries_reject_embedded_matches() {
        let text = "ABC1080p 1080p HEVC2 HEVC";
        assert!(!has_ascii_token_boundaries(text, 3, 8));
        assert!(has_ascii_token_boundaries(text, 9, 14));
        assert!(!has_ascii_token_boundaries(text, 15, 19));
        assert!(has_ascii_token_boundaries(text, 21, 25));
    }
}