use anyhow::{bail, Context, Result};
use chrono::Utc;
use clap::Parser;
use once_cell::sync::{Lazy, OnceCell};
use rayon::prelude::*;
use regex::Regex;
use serde::{Deserialize, Serialize};
use serde_json::{json, Value};
use std::collections::{HashMap, HashSet};
use std::fs::{self, File};
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::path::PathBuf;
use std::sync::atomic::{AtomicUsize, Ordering};

// NOTE(review): several type positions below read as a bare `Option`, `Vec`, `HashSet`,
// `HashMap` or `OnceCell`, sometimes followed by a stray `>`. This looks like generic
// arguments lost in an export/extraction step -- confirm against the canonical source
// before building. The tokens are left exactly as found.

// Command-line options shared by every mode of the tool.
// Plain `//` comments are used on purpose: clap's derive turns `///` doc comments into
// help text, which would change the `--help` output.
#[derive(Parser, Debug)]
#[command(about = "Apply DMHY template recipes with a multi-core Rust pipeline")]
struct Args {
    // Mode flag: `main` hands over to `run_cluster` when set.
    #[arg(long)]
    cluster: bool,
    // Mode flag: `main` hands over to `run_low_frequency_audit` when set.
    #[arg(long)]
    audit_low_frequency: bool,
    // Mode flag: `main` hands over to `run_verify_generated_output` when set.
    #[arg(long)]
    verify_generated_output: bool,
    // Mode flag: `main` hands over to `run_rich_annotations` when set.
    #[arg(long)]
    rich_annotations: bool,
    // JSONL input read by `load_input`; in verify mode it is opened as the generated JSONL.
    #[arg(long, default_value = "datasets/AnimeName/dmhy_list.jsonl")]
    input: PathBuf,
    // Recipe JSONL read by `load_recipes`.
    #[arg(long, default_value = "reports/dmhy_template_recipes.seed.jsonl")]
    recipes: PathBuf,
    // JSONL of written records produced by the default (apply) mode.
    #[arg(
        long,
        default_value = "reports/dmhy_weak.template_generated.rust.jsonl"
    )]
    output: PathBuf,
    // Pretty-printed manifest produced by the default (apply) mode.
    #[arg(
        long,
        default_value = "reports/dmhy_weak.template_generated.rust.manifest.json"
    )]
    manifest_output: PathBuf,
    // Cluster mode: pretty-printed summary JSON.
    #[arg(
        long,
        default_value = "reports/dmhy_template_clusters.rust.summary.json"
    )]
    summary_output: PathBuf,
    // Cluster mode: the first `top` cluster rows.
    #[arg(
        long,
        default_value = "reports/dmhy_template_clusters.rust.samples.jsonl"
    )]
    samples_output: PathBuf,
    // Cluster mode: every cluster row.
    #[arg(long, default_value = "reports/dmhy_template_clusters.rust.jsonl")]
    clusters_output: PathBuf,
    // Cluster mode: candidates accepted by `is_high_confidence_recipe`.
    #[arg(long, default_value = "reports/dmhy_template_recipes.rust.seed.jsonl")]
    recipes_output: PathBuf,
    // Cluster mode: candidates rejected by `is_high_confidence_recipe`.
    #[arg(long, default_value = "reports/dmhy_template_review.rust.jsonl")]
    review_output: PathBuf,
    // Audit mode: one row per audited low-frequency template.
    #[arg(long, default_value = "reports/dmhy_low_frequency_audit.rust.jsonl")]
    audit_output: PathBuf,
    // Rich-annotation mode: one row per retained input.
    #[arg(long, default_value = "reports/dmhy_rich_annotations.rust.jsonl")]
    rich_output: PathBuf,
    // Recipes whose `count` is at most this value are treated as low-frequency
    // by the audit and verify modes.
    #[arg(long, default_value_t = 50)]
    audit_max_count: u64,
    // Upper bound on the number of input values `load_input` collects.
    #[arg(long)]
    limit: Option,
    // `load_recipes` stops reading once this many recipes have been selected.
    #[arg(long)]
    limit_templates: Option,
    // `load_recipes` drops recipes whose count is below this; cluster mode also uses it
    // as the threshold for its coverage statistics.
    #[arg(long, default_value_t = 1)]
    min_count: u64,
    // Cluster mode: number of leading cluster rows copied to `samples_output`.
    #[arg(long, default_value_t = 200)]
    top: usize,
    // Cluster mode: number of leading cluster rows considered as recipe candidates.
    #[arg(long, default_value_t = 200)]
    recipe_top: usize,
    // Cluster mode: maximum number of rejected candidates written to `review_output`.
    #[arg(long, default_value_t = 1000)]
    review_top: usize,
    // Cluster mode: maximum number of example filenames kept per cluster.
    #[arg(long, default_value_t = 8)]
    examples: usize,
    // Cluster mode: minimum cluster count for a high-confidence recipe.
    #[arg(long, default_value_t = 25)]
    recipe_min_count: usize,
    // `load_recipes` keeps only recipes whose confidence equals this (no filter when empty).
    #[arg(long, default_value = "high")]
    confidence: String,
    // Must be "all" or "sample"; validated in `main` for the default mode.
    #[arg(long, default_value = "all")]
    expand: String,
    // Reported in the manifest when `expand` is "sample".
    // NOTE(review): presumably enforced per template by `process_filename` -- not visible here.
    #[arg(long, default_value_t = 100)]
    sample_per_template: usize,
    // Disables the noise filters applied in cluster, audit and rich-annotation modes.
    #[arg(long)]
    keep_encoding_noise: bool,
    // Cluster mode: cluster the original value instead of the result of `training_filename_for`.
    #[arg(long)]
    preserve_parent_paths: bool,
    // Title whitelist file read by `load_title_whitelist`.
    #[arg(long, default_value = "datasets/AnimeName/dmhy_title_whitelist.txt")]
    title_whitelist: PathBuf,
    // Group whitelist file read by `load_name_whitelist`.
    #[arg(long, default_value = "datasets/AnimeName/dmhy_group_whitelist.txt")]
    group_whitelist: PathBuf,
    // When given, `main` sizes rayon's global thread pool with it.
    #[arg(long)]
    threads: Option,
}

/// Whitelist data built by `load_whitelists`.
#[derive(Debug, Default)]
struct Whitelists {
    /// One entry per whitelist line, as returned by `phrase_parts_for_whitelist`.
    title_phrases: Vec>,
    /// Whitelist lines after `normalize_whitelist_name`.
    group_names: HashSet,
}

/// Process-wide whitelists; `main` stores the loaded value here once at startup.
static RUNTIME_WHITELISTS: OnceCell = OnceCell::new();

/// One line of the recipe JSONL, deserialized by `load_recipes`.
#[derive(Debug, Clone, Deserialize)]
struct Recipe {
    template_id: String,
    /// Template key; `load_recipes` uses it as the key of the returned map.
    template: String,
    /// Role list; the audit mode requires its length to equal the filename's group count.
    roles: Vec,
    /// Compared against `--confidence` in `load_recipes`.
    confidence: Option,
    /// Compared against `--min-count` / `--audit-max-count`; a missing value is treated as 0.
    count: Option,
}

/// One generated row: written as a JSON line by `main` and parsed back in verify mode.
/// The three optional fields are omitted from the JSON when they are `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct Record {
    filename: String,
    tokens: Vec,
    labels: Vec,
    template_id: String,
    template: String,
    #[serde(skip_serializing_if = "Option::is_none")]
    source_filename: Option,
    #[serde(skip_serializing_if = "Option::is_none")]
    path_trimmed: Option,
    #[serde(skip_serializing_if = "Option::is_none")]
    dropped_title_candidate_positions: Option>,
}

/// A token group as returned (in a list) by `template_key_for_filename`.
#[derive(Debug, Clone)]
struct Group {
    // NOTE(review): presumably positions into the token list -- confirm against `group_text`.
    indices: Vec,
    /// Class label; `add_cluster` special-cases "TEXT" and "BRACKET_TEXT".
    class_name: String,
}

/// Counters accumulated by `main` and embedded in the manifest under "stats".
#[derive(Debug, Default, Clone, Serialize)]
struct Stats {
    seen: usize,
    skipped_encoding_noise: usize,
    skipped_music_audio_collection: usize,
    trimmed_parent_path: usize,
    skipped_no_recipe: usize,
    skipped_sample_cap: usize,
    skipped_role_mismatch: usize,
    skipped_low_frequency_audit_warning: usize,
    written: usize,
}

/// Per-template aggregate filled in by `add_cluster`.
#[derive(Debug, Default)]
struct Cluster {
    /// Number of filenames that produced this template key.
    count: usize,
    /// Up to `--examples` filenames.
    examples: Vec,
    /// Occurrences of each cleaned TEXT / BRACKET_TEXT token.
    literal_counts: HashMap,
    /// Occurrences of each token class.
    class_counts: HashMap,
    /// For each group position, occurrences of the group's text (TEXT / BRACKET_TEXT only).
    position_literals: Vec>,
}

/// Result of processing one input filename, as consumed by the loop in `main`.
#[derive(Debug)]
enum Processed {
    // A record that `main` writes to the output JSONL.
    Written {
        record: Record,
trimmed_parent: bool, // `main` adds one to `Stats::trimmed_parent_path` when true
    },
    // A filename that was not written. `reason` selects the `Stats` counter bumped in
    // `main`; `example` and `warnings` feed the manifest's skipped-example sections.
    Skipped {
        reason: &'static str,
        trimmed_parent: bool,
        example: Option,
        warnings: Vec,
    },
}

// NOTE(review): every `Lazy` below appears without its generic argument (and
// `TOKEN_REGEXES` shows a stray `>`); this looks like extraction damage -- confirm
// against the canonical source. Comments describe only what each pattern matches;
// the call sites are outside this part of the file.

// Whole token of eight or more hexadecimal digits.
static HASH_RE: Lazy = Lazy::new(|| Regex::new(r"^[A-Fa-f0-9]{8,}$").unwrap());
// `720p`-style, `4K`-style or `1920x1080`-style token (case-insensitive).
static RESOLUTION_RE: Lazy =
    Lazy::new(|| Regex::new(r"(?i)^(?:\d{3,4}p|\dK|\d{3,4}[xX×]\d{3,4})$").unwrap());
// One of a fixed list of bare numbers (360 ... 2160).
static BARE_RESOLUTION_RE: Lazy =
    Lazy::new(|| Regex::new(r"^(?:360|480|540|576|720|1080|2160)$").unwrap());
// Number followed by a v/ver/version/rev marker and a second number, e.g. `12v2`.
static EPISODE_VERSION_RE: Lazy =
    Lazy::new(|| Regex::new(r"(?i)^(?:EP?)?\d{1,4}(?:v|ver|version|rev)\d{1,3}$").unwrap());
// Number followed by one of a fixed set of suffix words (Notice, Full, R18, ...).
static EPISODE_WITH_SUFFIX_RE: Lazy = Lazy::new(|| {
    Regex::new(r"(?i)^\d{1,4}[_ .-]?(?:Notice|Full|R18|R|Uncut|Director'?s?Cut)$").unwrap()
});
// Optional E/EP/# prefix, 1-4 digits, optional decimal part, optional END.
static EPISODE_RE: Lazy =
    Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}(?:\.\d{1,2})?(?:END)?$").unwrap());
// Decimal number such as `12.5`.
static DECIMAL_EPISODE_RE: Lazy = Lazy::new(|| Regex::new(r"^\d{1,3}\.\d{1,2}$").unwrap());
// Short number, optionally followed by `.`, `/` or `-` and another short number.
static NUMERIC_TITLE_PREFIX_RE: Lazy =
    Lazy::new(|| Regex::new(r"^\d{1,3}(?:[./-]\d{1,3})?$").unwrap());
// Whole token: optional 第, digits, then one of 话 話 回 集.
static EPISODE_CJK_RE: Lazy = Lazy::new(|| Regex::new(r"^第?\d{1,4}[话話回集]$").unwrap());
// Same shape as above but only anchored at the start.
static EPISODE_CJK_PREFIX_RE: Lazy =
    Lazy::new(|| Regex::new(r"^第?\d{1,4}[话話回集]").unwrap());
// Two numbers joined by `-` or `~`, optional trailing END.
static EPISODE_RANGE_RE: Lazy =
    Lazy::new(|| Regex::new(r"(?i)^\d{1,4}\s*[-~]\s*\d{1,4}(?:\s*END)?$").unwrap());
// Number range followed by batch markers/separators and up to 16 further characters.
static EPISODE_BATCH_RE: Lazy = Lazy::new(|| {
    Regex::new(r"(?i)^\d{1,4}\s*[-~]\s*\d{1,4}(?:\s*(?:TV|全集|全|END|Fin|Complete|SP|OVA|OAD|NCOP|NCED)|[+_./-])*.{0,16}$").unwrap()
});
// `S01E02`-style token with optional `vN` suffix.
static SXE_RE: Lazy = Lazy::new(|| Regex::new(r"(?i)^S\d{1,2}E\d{1,4}(?:v\d+)?$").unwrap());
// Same shape, capturing season, episode and optional version.
static SXE_VALUE_RE: Lazy =
    Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})E(\d{1,4})(?:v(\d+))?$").unwrap());
// Required EP/E/# prefix, capturing prefix, number and optional version.
static EPISODE_VALUE_RE: Lazy =
    Lazy::new(|| Regex::new(r"(?i)^(EP|E|#)(\d{1,4}(?:\.\d{1,2})?)(?:v(\d+))?$").unwrap());
// `S2`, `Season 2`, or 第…季/期/部.
static SEASON_RE: Lazy = Lazy::new(|| {
    Regex::new(r"(?i)^(?:S\d{1,2}|Season\s*\d{1,2}|第[一二三四五六七八九十\d]+[季期部])$").unwrap()
});
// Only the 第…季/期/部 form, as a whole token.
static CJK_SEASON_TOKEN_RE: Lazy =
    Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
// Leading text, a 第…季/期/部 marker, then up to 12 trailing characters (three captures).
static CJK_SEASON_EMBEDDED_RE: Lazy =
    Lazy::new(|| Regex::new(r"^(.+?)(第[一二三四五六七八九十\d]+[季期部])(.{0,12})$").unwrap());
// Leading text, a CJK episode marker, then up to 32 trailing characters (three captures).
static CJK_EPISODE_EMBEDDED_RE: Lazy =
    Lazy::new(|| Regex::new(r"^(.+?)(第?\d{1,4}[话話回集])(.{0,32})$").unwrap());
// Text ending in a Han/Hiragana/Katakana character directly followed by 2-3 digits.
static CJK_TITLE_TRAILING_EPISODE_RE: Lazy =
    Lazy::new(|| Regex::new(r"^(.+[\p{Han}\p{Hiragana}\p{Katakana}])(\d{2,3})$").unwrap());
// Text, separator(s), then `S<n>` at the end (two captures).
static ASCII_SEASON_SUFFIX_RE: Lazy =
    Lazy::new(|| Regex::new(r"(?i)^(.+?)[\s_.-]+(S\d{1,2})$").unwrap());
// `1st`, `2nd`, `3rd`, `4th`, ...
static ORDINAL_SEASON_TOKEN_RE: Lazy =
    Lazy::new(|| Regex::new(r"(?i)^\d{1,2}(?:st|nd|rd|th)$").unwrap());
// `First` ... `Tenth`.
static WORD_ORDINAL_SEASON_TOKEN_RE: Lazy = Lazy::new(|| {
    Regex::new(r"(?i)^(?:First|Second|Third|Fourth|Fifth|Sixth|Seventh|Eighth|Ninth|Tenth)$")
        .unwrap()
});
// The bare word `Season` or `Saison`.
static SEASON_WORD_RE: Lazy = Lazy::new(|| Regex::new(r"(?i)^(?:Season|Saison)$").unwrap());
// Text followed by a dub/bilingual marker and an optional trailing 第 (three captures).
static CJK_TITLE_LANG_PREFIX_RE: Lazy = Lazy::new(|| {
    Regex::new(r"^(.+?)(国日双语|國日雙語|日语版|日語版|国语版|國語版|双语|雙語)(第?)$").unwrap()
});
// `S<n>`, capturing the number.
static SEASON_VALUE_RE: Lazy = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
// Extras markers: NCOP/NCED/OP/ED/PV/CM, SP, OVA/OAD/IV, Menu, Spot, Intro/Preview/...,
// each with an optional short suffix.
// NOTE(review): the literal below contains a line break inside the `[-_. ...]` character
// class (after the space). It is kept byte-for-byte; confirm whether the newline is
// intended or an artifact of line wrapping.
static SPECIAL_RE: Lazy = Lazy::new(|| {
    Regex::new(r"(?i)^(?:(?:NCOP|NCED|OP|ED|PV|CM)(?:[\s_.-]?(?:\d{1,4}|v\d{1,3}|[A-Z]))?|SP(?:[\s_.-]?\d{0,4})?|(?:OVA|OAD|IV)(?:[\s_.-]?\d{0,4})?|(?:BD)?Menu(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?|(?:BD[-_. 
]?)?Spot(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?|(?:Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?)$").unwrap()
});
// Vol/Volume/Disc/CD/BD/DVD/D followed by a number.
static VOLUME_RE: Lazy =
    Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
// Year 19xx/20xx with up to two further `.`/`_`/`-` separated numeric parts.
static DATE_RE: Lazy =
    Lazy::new(|| Regex::new(r"^(?:19|20)\d{2}(?:[._-]\d{1,2}){0,2}$").unwrap());
// Two such dates joined by `-` or `~`.
static DATE_RANGE_MIXED_RE: Lazy = Lazy::new(|| {
    Regex::new(
        r"^(?:19|20)\d{2}(?:[._-]\d{1,2}){0,2}\s*[-~]\s*(?:19|20)\d{2}(?:[._-]\d{1,2}){0,2}$",
    )
    .unwrap()
});
// Date written with 年 / 月 / 日.
static CJK_DATE_RE: Lazy =
    Lazy::new(|| Regex::new(r"^(?:19|20)\d{2}年\d{1,2}月\d{1,2}日$").unwrap());
// Fixed list of subtitle/language tags.
static LANG_RE: Lazy = Lazy::new(|| {
    Regex::new(r"(?i)^(?:CHS|CHT|ZHS|ZHT|GB|BIG5|JPN?|JP|JA|JAP|JPTC|JPSC|ENG|EN|SC|TC|简[体體]?|繁[体體]?|简日|繁日|字幕|内封|外挂|Sub|Subs|MSubs?)$").unwrap()
});
// Fixed list of source / codec / audio / container tags.
// NOTE(review): as with `SPECIAL_RE`, this literal contains a line break inside the
// `8[-_. ...]?bit` character class; kept byte-for-byte, confirm whether it is intended.
static MEDIA_RE: Lazy = Lazy::new(|| {
    Regex::new(r"(?i)^(?:WEB|WEB[-_. ]?DL|WEB[-_. ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|HD|UHD|HDTV|TVRip|REMUX|xvid|x26[45]|h\.?26[45]|HEVC|AVC|AV1|YUV\d+P?\d*|AAC\s*\d*(?:\.\d+)?|DDP\s*\d*(?:\.\d+)?|FLAC|MP3|DTS|HDMA|DTS-HDMA|E?AC3x?\d*(?:\.\d+)?|Opus|WMV\d*|\d(?:\.\d)?ch|10[-_. ]?bit|8[-_. 
]?bit|Hi10p?|Ma10p|ASSx?\d*|SRTx?\d*|SUP|R\d[A-Z]*|NoSub|MKV|MP4|AVI|RAW|Raws?)$").unwrap()
});
// Unanchored phrases: "theater greeting event", "world premiere" (and the spelling
// "premeie"), "picture drama".
static SPECIAL_TITLE_PHRASE_RE: Lazy = Lazy::new(|| {
    Regex::new(r"(?i)\b(?:theater\s+greeting\s+event|world\s+prem(?:eie|iere)|picture\s+drama)\b")
        .unwrap()
});
// Two years joined by `-` or `~`, optionally wrapped in parentheses.
static YEAR_RANGE_RE: Lazy =
    Lazy::new(|| Regex::new(r"^\(?\s*(?:19|20)\d{2}\s*[-~]\s*(?:19|20)\d{2}\s*\)?$").unwrap());
// A year optionally followed by 版 / ver / version.
static VERSIONISH_TITLE_RE: Lazy =
    Lazy::new(|| Regex::new(r"(?i)^(?:19|20)\d{2}(?:版|ver\.?|version)?$").unwrap());
// `season N` or `sN` delimited by a separator or a string edge.
static PATH_SEGMENT_SEASON_RE: Lazy = Lazy::new(|| {
    Regex::new(r"(?i)(?:^|[\s_.\-/])(?:season\s*\d{1,2}|s\d{1,2})(?:$|[\s_.\-/])").unwrap()
});
// `season`/`saison` followed by a number, capturing it without one leading zero.
static SEASON_WORD_NUMBER_RE: Lazy =
    Lazy::new(|| Regex::new(r"(?i)(?:season|saison)\s*0?(\d{1,2})").unwrap());
// A whole string that is only `season N` / `saison N` / `sN`.
static PLAIN_SEASON_SEGMENT_RE: Lazy =
    Lazy::new(|| Regex::new(r"(?i)^(?:season|saison)\s*0?\d{1,2}$|^s0?\d{1,2}$").unwrap());
// `sN` not adjacent to any letter or digit, capturing N.
static S_NUMBER_SEGMENT_RE: Lazy =
    Lazy::new(|| Regex::new(r"(?i)(?:^|[^\p{L}\p{N}])s0?(\d{1,2})(?:$|[^\p{L}\p{N}])").unwrap());
// `sNeM` not adjacent to any letter or digit, capturing the season number.
static SXE_SEASON_RE: Lazy = Lazy::new(|| {
    Regex::new(r"(?i)(?:^|[^\p{L}\p{N}])s0?(\d{1,2})e\d{1,4}(?:$|[^\p{L}\p{N}])").unwrap()
});
// Start-anchored patterns compiled in the order listed.
// NOTE(review): presumably tried in this order by the tokenizer -- the consumer is not
// visible in this part of the file.
static TOKEN_REGEXES: Lazy> = Lazy::new(|| {
    [
        r"^\d{3,4}[xX×]\d{3,4}",
        r"(?i)^(?:AAC|AC3|EAC3|DTS|FLAC|DDP)\s*\d+(?:\.\d+)?",
        r"(?i)^h\.?26[45]",
        r"(?i)^x\.?26[45]",
        r"^[\\/]+",
        r"^[-_.::+&|]+",
        r"^\s+",
        r"(?i)^Season\s*\d{1,2}",
        r"^[A-Za-z]+(?:\d+[A-Za-z]*)*",
        r"^\d+[A-Za-z]+\d*",
        r"^\d{1,4}(?:[._-]\d{1,4})*",
        r"^[\p{Hiragana}\p{Katakana}\p{Han}]+",
    ]
    .into_iter()
    .map(|pattern| Regex::new(pattern).unwrap())
    .collect()
});
// Optional E/EP/# prefix followed by 1-4 digits and nothing else.
static SIMPLE_EPISODE_RE: Lazy = Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}$").unwrap());
// A run of whitespace, `_`, `.` or `-`.
static SPECIAL_SPACE_RE: Lazy = Lazy::new(|| Regex::new(r"[\s_.-]+").unwrap());
// Music / soundtrack keywords delimited by a string edge or a character outside A-Z/0-9
// (case-insensitive).
static MUSIC_COLLECTION_RE: Lazy = Lazy::new(|| {
    Regex::new(
        r"(?i)(?:^|[^A-Z0-9])(?:MUSIC\s*CLIP|MUSIC\s+COLLECTION|SOUNDTRACK|OST|CHARACTER\s+SONG|DRAMA\s+CD|CD\s+ALBUM|BONUS\s+CD)(?:$|[^A-Z0-9])",
    )
    .unwrap()
});

/// Entry point: parses the arguments, configures rayon, loads the whitelists, then runs
/// either one of the dedicated modes or the default recipe-application pipeline.
fn main() -> Result<()> {
    let
args = Args::parse();
    // (The statement above completes the `let` that ends the previous line of `main`.)

    // Size rayon's global pool before any parallel work starts.
    if let Some(threads) = args.threads {
        rayon::ThreadPoolBuilder::new()
            .num_threads(threads)
            .build_global()
            .context("failed to configure rayon thread pool")?;
    }
    // `set` can only fail when the cell is already populated; that outcome is ignored.
    let _ = RUNTIME_WHITELISTS.set(load_whitelists(&args)?);

    // Dedicated modes short-circuit the default pipeline; the first flag set wins.
    if args.cluster {
        return run_cluster(&args);
    }
    if args.audit_low_frequency {
        return run_low_frequency_audit(&args);
    }
    if args.verify_generated_output {
        return run_verify_generated_output(&args);
    }
    if args.rich_annotations {
        return run_rich_annotations(&args);
    }

    if args.expand != "all" && args.expand != "sample" {
        bail!("--expand must be all or sample");
    }
    let recipes = load_recipes(&args)?;
    if recipes.is_empty() {
        bail!("no recipes selected; adjust --recipes/--confidence/--min-count/--limit-templates");
    }
    let inputs = load_input(&args.input, args.limit)?;

    // One atomic counter per template id, shared with the parallel workers.
    // NOTE(review): generic arguments on the annotated locals below (`HashMap`, `Vec`)
    // appear to have been stripped from this copy of the file; tokens are kept as found.
    let sample_counters: HashMap = recipes
        .values()
        .map(|recipe| (recipe.template_id.clone(), AtomicUsize::new(0)))
        .collect();
    // Collecting an indexed parallel iterator keeps the results in input order.
    let processed: Vec = inputs
        .par_iter()
        .map(|filename| process_filename(filename, &args, &recipes, &sample_counters))
        .collect();

    if let Some(parent) = args.output.parent() {
        fs::create_dir_all(parent)?;
    }
    if let Some(parent) = args.manifest_output.parent() {
        fs::create_dir_all(parent)?;
    }

    let mut stats = Stats {
        seen: inputs.len(),
        ..Stats::default()
    };
    let mut label_counts: HashMap = HashMap::new();
    let mut template_counts: HashMap = HashMap::new();
    let mut examples = Vec::new();
    let mut skipped_music_audio_collection_examples = Vec::new();
    let mut skipped_low_frequency_audit_warning_counts: HashMap = HashMap::new();
    let mut skipped_low_frequency_audit_warning_examples: HashMap> = HashMap::new();
    let mut writer = BufWriter::new(File::create(&args.output)?);

    // Sequential pass: write accepted records and tally everything for the manifest.
    for item in processed {
        match item {
            Processed::Written {
                record,
                trimmed_parent,
            } => {
                if trimmed_parent {
                    stats.trimmed_parent_path += 1;
                }
                for label in &record.labels {
                    *label_counts.entry(label.clone()).or_default() += 1;
                }
                *template_counts
                    .entry(record.template_id.clone())
                    .or_default() += 1;
                // Keep the first 20 written records as manifest examples.
                if examples.len() < 20 {
                    examples.push(serde_json::to_value(&record)?);
                }
                serde_json::to_writer(&mut writer, &record)?;
                writer.write_all(b"\n")?;
                stats.written += 1;
            }
            Processed::Skipped {
                reason,
                trimmed_parent,
                example,
                warnings,
            } => {
                if trimmed_parent {
                    stats.trimmed_parent_path += 1;
                }
                match reason {
                    "encoding_noise" => stats.skipped_encoding_noise += 1,
                    "music_audio_collection" => {
                        stats.skipped_music_audio_collection += 1;
                        if let Some(example) = example {
                            if skipped_music_audio_collection_examples.len() < 20 {
                                skipped_music_audio_collection_examples.push(example);
                            }
                        }
                    }
                    "no_recipe" => stats.skipped_no_recipe += 1,
                    "sample_cap" => stats.skipped_sample_cap += 1,
                    "role_mismatch" => stats.skipped_role_mismatch += 1,
                    "low_frequency_audit_warning" => {
                        stats.skipped_low_frequency_audit_warning += 1;
                        for warning in warnings {
                            *skipped_low_frequency_audit_warning_counts
                                .entry(warning.clone())
                                .or_default() += 1;
                            // At most 10 examples are kept per warning.
                            if let Some(example) = example.as_ref() {
                                let bucket = skipped_low_frequency_audit_warning_examples
                                    .entry(warning)
                                    .or_default();
                                if bucket.len() < 10 {
                                    bucket.push(example.clone());
                                }
                            }
                        }
                    }
                    // Any other reason string is not counted.
                    _ => {}
                }
            }
        }
    }
    writer.flush()?;

    // Twenty most frequent templates: count descending, then template id ascending.
    let mut top_template_counts: Vec<_> = template_counts.into_iter().collect();
    top_template_counts.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
    top_template_counts.truncate(20);

    let manifest = json!({
        "generated_at": Utc::now().to_rfc3339(),
        "input": args.input.to_string_lossy(),
        "recipes": args.recipes.to_string_lossy(),
        "output": args.output.to_string_lossy(),
        "selected_templates": recipes.len(),
        "confidence": args.confidence,
        "min_count": args.min_count,
        "low_frequency_audit_max_count": args.audit_max_count,
        "low_frequency_blocking_warnings": [
            "ambiguous_no_episode_title",
            "encoding_noise_survived",
            "episode_version_missing_label",
            "episode_in_title",
            "generic_title_only",
            "hash_labeled",
            "multiple_title_spans",
            "no_title",
            "path_retained",
            "sxe_compact_unexpanded",
            "tech_in_title",
            "template_episode_missing_label",
            "template_sxe_missing_label"
        ],
        "expand": args.expand,
        "sample_per_template": if args.expand == "sample" { Some(args.sample_per_template) } else { None },
        "stats": stats,
        "label_counts": label_counts,
        "top_template_counts": top_template_counts,
        "examples": examples,
        "skipped_music_audio_collection_examples": skipped_music_audio_collection_examples,
        "skipped_low_frequency_audit_warning_counts": skipped_low_frequency_audit_warning_counts,
        "skipped_low_frequency_audit_warning_examples": skipped_low_frequency_audit_warning_examples,
        "implementation": "rust_dmhy_template_apply"
    });
    // Serialize once and reuse the text for both the manifest file and stdout.
    let manifest_text = serde_json::to_string_pretty(&manifest)?;
    fs::write(&args.manifest_output, &manifest_text)?;
    println!("{}", manifest_text);
    Ok(())
}

/// Loads both whitelist files named in `args` into a [`Whitelists`] value.
///
/// # Errors
/// Propagates any error from the two loaders.
fn load_whitelists(args: &Args) -> Result {
    Ok(Whitelists {
        title_phrases: load_title_whitelist(&args.title_whitelist)?,
        group_names: load_name_whitelist(&args.group_whitelist)?,
    })
}

/// Reads the title whitelist at `path` and converts every line with
/// `phrase_parts_for_whitelist`, dropping lines whose result is empty.
fn load_title_whitelist(path: &PathBuf) -> Result>> {
    let mut phrases = Vec::new();
    for line in load_whitelist_lines(path)? {
        let phrase = phrase_parts_for_whitelist(&line);
        if !phrase.is_empty() {
            phrases.push(phrase);
        }
    }
    Ok(phrases)
}

/// Reads the name whitelist at `path`, normalizes every line with
/// `normalize_whitelist_name` and drops the ones that end up empty.
fn load_name_whitelist(path: &PathBuf) -> Result> {
    Ok(load_whitelist_lines(path)?
.into_iter()
        // (continues the expression that ends the previous line of `load_name_whitelist`)
        .map(|line| normalize_whitelist_name(&line))
        .filter(|line| !line.is_empty())
        .collect())
}

/// Reads the usable entries of a whitelist file.
///
/// A missing file yields an empty list. Blank lines and lines starting with `#` are
/// skipped. When a line contains a tab, only the text after the first tab is kept;
/// the kept text is trimmed and dropped if empty.
///
/// # Errors
/// Fails when the file exists but cannot be opened or read.
// NOTE(review): the return type (and those of the two functions below) appears to have
// lost its generic arguments in this copy of the file; tokens are kept as found.
fn load_whitelist_lines(path: &PathBuf) -> Result> {
    if !path.exists() {
        return Ok(Vec::new());
    }
    let file =
        File::open(path).with_context(|| format!("failed to open whitelist {}", path.display()))?;
    let mut lines = Vec::new();
    for line in BufReader::new(file).lines() {
        let line = line?;
        let line = line.trim();
        if line.is_empty() || line.starts_with('#') {
            continue;
        }
        // Two-column lines: keep what follows the first tab; otherwise keep the whole line.
        let value = line
            .split_once('\t')
            .map(|(_, value)| value)
            .unwrap_or(line)
            .trim();
        if !value.is_empty() {
            lines.push(value.to_string());
        }
    }
    Ok(lines)
}

/// Loads the recipe JSONL named by `args.recipes` into a map keyed by template string.
///
/// Blank lines are skipped. A recipe is dropped when `--confidence` is non-empty and
/// differs from the recipe's confidence, or when its count (0 if absent) is below
/// `--min-count`. A later recipe with the same template replaces an earlier one.
/// Reading stops as soon as the map holds `--limit-templates` entries.
///
/// # Errors
/// Fails when the file cannot be opened or read, or when a line is not a valid recipe;
/// the error context carries the 1-based line number.
fn load_recipes(args: &Args) -> Result> {
    let file = File::open(&args.recipes)
        .with_context(|| format!("recipe JSONL not found: {}", args.recipes.display()))?;
    let mut recipes = HashMap::new();
    for (line_number, line) in BufReader::new(file).lines().enumerate() {
        let line = line?;
        if line.trim().is_empty() {
            continue;
        }
        let row: Recipe = serde_json::from_str(&line).with_context(|| {
            format!(
                "invalid recipe JSON at {}:{}",
                args.recipes.display(),
                line_number + 1
            )
        })?;
        if !args.confidence.is_empty()
            && row.confidence.as_deref() != Some(args.confidence.as_str())
        {
            continue;
        }
        if row.count.unwrap_or(0) < args.min_count {
            continue;
        }
        recipes.insert(row.template.clone(), row);
        // The limit is checked after the insert, so at least one recipe is kept
        // even when the limit is 0.
        if args
            .limit_templates
            .is_some_and(|limit| recipes.len() >= limit)
        {
            break;
        }
    }
    Ok(recipes)
}

/// Reads the `"value"` string of every JSON line in `path`.
///
/// Blank lines, rows without a string `"value"`, and values that are empty after
/// trimming are skipped. Reading stops once `limit` values have been collected.
///
/// # Errors
/// Fails when the file cannot be opened or read, or when a line is not valid JSON;
/// the error context carries the 1-based line number.
fn load_input(path: &PathBuf, limit: Option) -> Result> {
    let file =
        File::open(path).with_context(|| format!("input JSONL not found: {}", path.display()))?;
    let mut values = Vec::new();
    for (line_number, line) in BufReader::new(file).lines().enumerate() {
        // Checked before reading, so no line beyond the limit is parsed.
        if limit.is_some_and(|limit| values.len() >= limit) {
            break;
        }
        let line = line?;
        if line.trim().is_empty() {
            continue;
        }
        let row: Value = serde_json::from_str(&line)
            .with_context(|| format!("invalid JSON at {}:{}", path.display(), line_number + 1))?;
        if let Some(value) = row.get("value").and_then(Value::as_str) {
            let value =
value.trim();
            // (The expression above completes the `let value =` that ends the previous
            // line of `load_input`.)
            if !value.is_empty() {
                values.push(value.to_string());
            }
        }
    }
    Ok(values)
}

/// Cluster mode: groups the input filenames by template key and writes the cluster,
/// sample, recipe and review JSONL files plus a pretty-printed summary.
///
/// Unless `--keep-encoding-noise` is set, inputs flagged by any of the four noise
/// predicates are skipped (all of them are counted under `skipped_encoding_noise`).
/// Unless `--preserve-parent-paths` is set, each input is first passed through
/// `training_filename_for`.
///
/// # Errors
/// Propagates input-loading, serialization and file-system errors.
fn run_cluster(args: &Args) -> Result<()> {
    let inputs = load_input(&args.input, args.limit)?;
    let source_rows = inputs.len();
    // NOTE(review): generic arguments on the annotated locals in this function (`HashMap`,
    // `Vec`) appear to have been stripped from this copy of the file; kept as found.
    let mut clusters: HashMap = HashMap::new();
    let mut skipped_encoding_noise = 0usize;
    let mut trimmed_parent_path = 0usize;
    let mut total_rows = 0usize;
    for original in inputs {
        if !args.keep_encoding_noise
            && (has_encoding_noise(&original)
                || has_non_anime_noise(&original)
                || has_music_collection_noise(&original)
                || has_abstract_path_noise(&original))
        {
            skipped_encoding_noise += 1;
            continue;
        }
        let filename = if args.preserve_parent_paths {
            original
        } else {
            let (training_filename, was_trimmed) = training_filename_for(&original);
            if was_trimmed {
                trimmed_parent_path += 1;
            }
            training_filename
        };
        add_cluster(&mut clusters, &filename, args.examples);
        total_rows += 1;
    }

    // Rank clusters: count descending, then template key ascending.
    let mut sorted_clusters: Vec<_> = clusters.into_iter().collect();
    sorted_clusters.sort_by(|a, b| b.1.count.cmp(&a.1.count).then_with(|| a.0.cmp(&b.0)));
    let cluster_rows: Vec = sorted_clusters
        .iter()
        .enumerate()
        .map(|(index, (key, cluster))| cluster_row(index + 1, key, cluster, total_rows))
        .collect();
    let samples: Vec = cluster_rows.iter().take(args.top).cloned().collect();
    // Only the first `recipe_top` rows are candidates; they are split into accepted
    // recipes and (up to `review_top`) rows for manual review.
    let recipe_candidates: Vec = cluster_rows.iter().take(args.recipe_top).cloned().collect();
    let recipes: Vec = recipe_candidates
        .iter()
        .filter(|row| is_high_confidence_recipe(row, args.recipe_min_count))
        .map(|row| recipe_row(row, "high"))
        .collect();
    let review: Vec = recipe_candidates
        .iter()
        .filter(|row| !is_high_confidence_recipe(row, args.recipe_min_count))
        .take(args.review_top)
        .cloned()
        .collect();
    write_jsonl_values(&args.clusters_output, &cluster_rows)?;
    write_jsonl_values(&args.samples_output, &samples)?;
    write_jsonl_values(&args.recipes_output, &recipes)?;
    write_jsonl_values(&args.review_output, &review)?;

    // Histogram of cluster sizes: how many templates have each count; top 20 kept.
    let mut histogram: HashMap = HashMap::new();
    for (_, cluster) in &sorted_clusters {
        *histogram.entry(cluster.count).or_default() += 1;
    }
    let mut count_histogram_top: Vec<_> = histogram.into_iter().collect();
    count_histogram_top.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
    count_histogram_top.truncate(20);

    let rows_covered_by_repeated_templates: usize = sorted_clusters
        .iter()
        .map(|(_, cluster)| cluster)
        .filter(|cluster| cluster.count as u64 >= args.min_count)
        .map(|cluster| cluster.count)
        .sum();
    let templates_at_least_min_count = sorted_clusters
        .iter()
        .filter(|(_, cluster)| cluster.count as u64 >= args.min_count)
        .count();
    let top_templates: Vec = cluster_rows.iter().take(20).cloned().collect();

    let summary = json!({
        "input": args.input.to_string_lossy(),
        "source_rows": source_rows,
        "skipped_encoding_noise": skipped_encoding_noise,
        "trimmed_parent_path": trimmed_parent_path,
        "total_rows": total_rows,
        "unique_templates": sorted_clusters.len(),
        "min_count": args.min_count,
        "templates_at_least_min_count": templates_at_least_min_count,
        "rows_covered_by_repeated_templates": rows_covered_by_repeated_templates,
        "rows_covered_by_repeated_templates_ratio": if total_rows == 0 { 0.0 } else { rows_covered_by_repeated_templates as f64 / total_rows as f64 },
        "top_output_rows": samples.len(),
        "clusters_output": args.clusters_output.to_string_lossy(),
        "cluster_rows": cluster_rows.len(),
        "recipes_output": args.recipes_output.to_string_lossy(),
        "recipe_rows": recipes.len(),
        "review_output": args.review_output.to_string_lossy(),
        "review_rows": review.len(),
        "recipe_top": args.recipe_top,
        "recipe_min_count": args.recipe_min_count,
        "top_templates": top_templates,
        "count_histogram_top": count_histogram_top,
        "implementation": "rust_dmhy_template_cluster",
        "generated_at": Utc::now().to_rfc3339(),
    });
    if let Some(parent) = args.summary_output.parent() {
        fs::create_dir_all(parent)?;
    }
    // Serialize once and reuse the text for both the summary file and stdout.
    let summary_text = serde_json::to_string_pretty(&summary)?;
    fs::write(&args.summary_output, &summary_text)?;
    println!("{}", summary_text);
    Ok(())
}

/// Adds one filename to the cluster of its template key, creating the cluster on first
/// use, and updates the cluster's counters and example list.
fn add_cluster(clusters:
&mut HashMap, filename: &str, example_limit: usize) {
    // (The parameter list above completes the `fn add_cluster(clusters:` that ends the
    // previous line.)
    // NOTE(review): `HashMap` here and in `top_counts`, and the `collect::>()` turbofish in
    // `cluster_row`, appear to have lost their generic arguments in this copy of the file;
    // those tokens are kept as found.
    let (key, tokens, classes, groups) = template_key_for_filename(filename);
    let cluster = clusters.entry(key).or_default();
    cluster.count += 1;
    if cluster.examples.len() < example_limit {
        cluster.examples.push(filename.to_string());
    }
    // Per-token tallies: every class is counted; free-text tokens also count their literal.
    for (token, class_name) in tokens.iter().zip(classes.iter()) {
        *cluster.class_counts.entry(class_name.clone()).or_default() += 1;
        if matches!(class_name.as_str(), "TEXT" | "BRACKET_TEXT") {
            let cleaned = strip_wrapper(token);
            if !cleaned.is_empty() {
                *cluster.literal_counts.entry(cleaned).or_default() += 1;
            }
        }
    }
    // Grow the per-position tables so that indexing by group position below is in range.
    while cluster.position_literals.len() < groups.len() {
        cluster.position_literals.push(HashMap::new());
    }
    for (index, group) in groups.iter().enumerate() {
        if matches!(group.class_name.as_str(), "TEXT" | "BRACKET_TEXT") {
            let text = group_text(&tokens, group);
            if !text.is_empty() {
                *cluster.position_literals[index].entry(text).or_default() += 1;
            }
        }
    }
}

/// Builds the JSON row describing one cluster.
///
/// `rank` is the 1-based position used in the zero-padded `template_id`; `total` is the
/// number of clustered rows and the denominator of `coverage` (0.0 when `total` is 0).
fn cluster_row(rank: usize, key: &str, cluster: &Cluster, total: usize) -> Value {
    json!({
        "template_id": format!("tpl_{rank:06}"),
        "template": key,
        "count": cluster.count,
        "coverage": if total == 0 { 0.0 } else { cluster.count as f64 / total as f64 },
        "top_literals": top_counts(&cluster.literal_counts, 12),
        "suggested_roles": suggested_roles(key),
        "position_top_literals": cluster.position_literals.iter().map(|counts| top_counts(counts, 5)).collect::>(),
        "class_counts": top_counts(&cluster.class_counts, 20),
        "examples": cluster.examples,
    })
}

/// Returns up to `limit` `(key, count)` pairs ordered by count descending, then key
/// ascending.
fn top_counts(counts: &HashMap, limit: usize) -> Vec<(String, usize)> {
    let mut items: Vec<_> = counts
        .iter()
        .map(|(key, count)| (key.clone(), *count))
        .collect();
    items.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
    items.truncate(limit);
    items
}

/// Decides whether a cluster row can be emitted as a high-confidence recipe.
///
/// All of the following must hold:
/// * `count` is at least `min_count`;
/// * `suggested_roles` is an array and none of its string roles contains `_OR_`;
/// * the roles include `TITLE` and at least one of `EPISODE*`, `SPECIAL`, `SOURCE`,
///   `RESOLUTION`;
/// * a template containing `BRACKET_TEXT BRACKET_TEXT` also has a `GROUP` role.
fn is_high_confidence_recipe(row: &Value, min_count: usize) -> bool {
    if row.get("count").and_then(Value::as_u64).unwrap_or(0) < min_count as u64 {
        return false;
    }
    let Some(roles) = row.get("suggested_roles").and_then(Value::as_array) else {
        return false;
    };
    let role_strings: Vec<&str> = roles.iter().filter_map(Value::as_str).collect();
    // Any ambiguous role (`X_OR_Y`) disqualifies the template.
    if role_strings.iter().any(|role| role.contains("_OR_")) {
        return false;
    }
    if !role_strings.contains(&"TITLE")
        || !role_strings.iter().any(|role| {
            role.starts_with("EPISODE") || matches!(*role, "SPECIAL" | "SOURCE" | "RESOLUTION")
        })
    {
        return false;
    }
    let template = row.get("template").and_then(Value::as_str).unwrap_or("");
    if template.contains("BRACKET_TEXT BRACKET_TEXT") && !role_strings.contains(&"GROUP") {
        return false;
    }
    // A separate `TITLE_OR_TEXT` test used to sit here; it could never fail because that
    // role contains `_OR_` and has already been rejected above.
    true
}

/// Projects a cluster row onto the recipe schema, tagging it with `confidence`.
fn recipe_row(row: &Value, confidence: &str) -> Value {
    json!({
        "template_id": row["template_id"],
        "template": row["template"],
        "roles": row["suggested_roles"],
        "confidence": confidence,
        "count": row["count"],
        "examples": row["examples"],
    })
}

/// Writes `rows` to `path` as JSON Lines, creating the parent directory first.
///
/// # Errors
/// Propagates directory-creation, file-creation, serialization and write errors.
fn write_jsonl_values(path: &PathBuf, rows: &[Value]) -> Result<()> {
    if let Some(parent) = path.parent() {
        fs::create_dir_all(parent)?;
    }
    let mut writer = BufWriter::new(File::create(path)?);
    for row in rows {
        serde_json::to_writer(&mut writer, row)?;
        writer.write_all(b"\n")?;
    }
    // Flush explicitly so that a failed final write is reported instead of being dropped.
    writer.flush()?;
    Ok(())
}

/// Audit mode: for every selected recipe whose count is at most `--audit-max-count`,
/// takes the first matching input filename, builds its record and writes one audit row
/// with spans and warnings; then prints a manifest.
///
/// # Errors
/// Propagates recipe/input loading, serialization and file-system errors.
fn run_low_frequency_audit(args: &Args) -> Result<()> {
    let recipes = load_recipes(args)?;
    let inputs = load_input(&args.input, args.limit)?;
    let low_template_total = recipes
        .values()
        .filter(|recipe| recipe.count.unwrap_or(0) <= args.audit_max_count)
        .count();
    let mut seen_templates = HashSet::new();
    let mut rows = Vec::new();
    for original in inputs {
        if !args.keep_encoding_noise
            && (has_encoding_noise(&original)
                || has_non_anime_noise(&original)
                || has_music_collection_noise(&original)
                || has_abstract_path_noise(&original))
        {
            continue;
        }
        let (training_filename, trimmed_parent) = training_filename_for(&original);
        let (key, _tokens, _classes, groups) = template_key_for_filename(&training_filename);
        let Some(recipe) = recipes.get(&key) else {
            continue;
        };
        let count =
recipe.count.unwrap_or(0);
        // (The expression above completes the `let count =` that ends the previous line of
        // `run_low_frequency_audit`.)
        // Only low-frequency templates are audited, and each only once: `insert` returns
        // false for a template id that was already recorded.
        // NOTE(review): the id is recorded before the sample is validated below, so a
        // template whose first sample is rejected is never retried with a later filename
        // and still counts toward the early `break` -- confirm this is intended.
        if count > args.audit_max_count || !seen_templates.insert(recipe.template_id.clone()) {
            continue;
        }
        if recipe.roles.len() != groups.len() {
            continue;
        }
        let Some(mut record) =
            dmhy_record(&training_filename, &recipe.template_id, &recipe.roles)
        else {
            continue;
        };
        if trimmed_parent {
            record.source_filename = Some(original.clone());
            record.path_trimmed = Some(true);
        }
        rows.push(json!({
            "template_id": recipe.template_id,
            "count": count,
            "template": recipe.template,
            "filename": record.filename,
            "source_filename": record.source_filename,
            "path_trimmed": record.path_trimmed.unwrap_or(false),
            "spans": entity_spans(&record.tokens, &record.labels),
            "warnings": audit_warnings(&record),
            "tokens": record.tokens,
            "labels": record.labels,
        }));
        // Stop scanning once as many template ids were recorded as there are
        // low-frequency recipes.
        if seen_templates.len() >= low_template_total {
            break;
        }
    }
    // Order the audit rows by count ascending, then template id ascending.
    rows.sort_by(|a, b| {
        let count_a = a.get("count").and_then(Value::as_u64).unwrap_or(0);
        let count_b = b.get("count").and_then(Value::as_u64).unwrap_or(0);
        let id_a = a.get("template_id").and_then(Value::as_str).unwrap_or("");
        let id_b = b.get("template_id").and_then(Value::as_str).unwrap_or("");
        count_a.cmp(&count_b).then_with(|| id_a.cmp(id_b))
    });
    write_jsonl_values(&args.audit_output, &rows)?;
    let warning_counts = warning_counts(&rows);
    // The manifest is printed to stdout only; no manifest file is written in this mode.
    let manifest = json!({
        "generated_at": Utc::now().to_rfc3339(),
        "input": args.input.to_string_lossy(),
        "recipes": args.recipes.to_string_lossy(),
        "audit_output": args.audit_output.to_string_lossy(),
        "audit_max_count": args.audit_max_count,
        "low_template_total": low_template_total,
        "audited_templates": rows.len(),
        "warning_counts": warning_counts,
        "implementation": "rust_dmhy_low_frequency_audit"
    });
    println!("{}", serde_json::to_string_pretty(&manifest)?);
    Ok(())
}

/// Verify mode: re-reads a generated JSONL (passed as `--input`) and fails when any
/// record of a low-frequency template still carries a blocking audit warning.
///
/// # Errors
/// Fails when the input or recipes cannot be read, when a line is not a valid record,
/// or when at least one blocking warning was found.
fn run_verify_generated_output(args: &Args) -> Result<()> {
    let file = File::open(&args.input)
        .with_context(|| format!("generated JSONL not found: {}", args.input.display()))?;
    // Map from template id to recipe count (0 when the recipe has no count).
    // NOTE(review): the generic arguments of `HashMap` appear to have been stripped from
    // this copy of the file; tokens are kept as found.
    let recipes_by_id: HashMap = load_recipes(args)?
.into_values()
        // (continues the expression that ends the previous line of
        // `run_verify_generated_output`)
        .map(|recipe| (recipe.template_id, recipe.count.unwrap_or(0)))
        .collect();
    let mut rows = 0usize;
    let mut low_frequency_rows = 0usize;
    // NOTE(review): the generic arguments of the two `HashMap` annotations below appear to
    // have been stripped from this copy of the file; tokens are kept as found.
    let mut warning_counts: HashMap = HashMap::new();
    let mut examples: HashMap> = HashMap::new();
    for (line_number, line) in BufReader::new(file).lines().enumerate() {
        let line = line?;
        if line.trim().is_empty() {
            continue;
        }
        let record: Record = serde_json::from_str(&line).with_context(|| {
            format!(
                "invalid generated record at {}:{}",
                args.input.display(),
                line_number + 1
            )
        })?;
        rows += 1;
        // A template id missing from the selected recipes gets `u64::MAX`, so it is
        // never treated as low-frequency.
        let count = recipes_by_id
            .get(&record.template_id)
            .copied()
            .unwrap_or(u64::MAX);
        if count > args.audit_max_count {
            continue;
        }
        low_frequency_rows += 1;
        for warning in audit_warnings(&record) {
            // Only the warnings listed here are blocking; the same names are published by
            // `main` under "low_frequency_blocking_warnings".
            if !matches!(
                warning.as_str(),
                "ambiguous_no_episode_title"
                    | "encoding_noise_survived"
                    | "episode_version_missing_label"
                    | "episode_in_title"
                    | "generic_title_only"
                    | "hash_labeled"
                    | "multiple_title_spans"
                    | "no_title"
                    | "path_retained"
                    | "sxe_compact_unexpanded"
                    | "tech_in_title"
                    | "template_episode_missing_label"
                    | "template_sxe_missing_label"
            ) {
                continue;
            }
            *warning_counts.entry(warning.clone()).or_default() += 1;
            // Keep at most five examples per warning.
            let bucket = examples.entry(warning).or_default();
            if bucket.len() < 5 {
                bucket.push(json!({
                    "template_id": record.template_id,
                    "template_count": count,
                    "filename": record.filename,
                    "spans": entity_spans(&record.tokens, &record.labels),
                }));
            }
        }
    }
    let manifest = json!({
        "generated_at": Utc::now().to_rfc3339(),
        "input": args.input.to_string_lossy(),
        "recipes": args.recipes.to_string_lossy(),
        "audit_max_count": args.audit_max_count,
        "rows": rows,
        "low_frequency_rows": low_frequency_rows,
        "blocking_warning_counts": warning_counts,
        "examples": examples,
        "implementation": "rust_dmhy_generated_output_verify"
    });
    // The report is printed before the failure below, so it is available either way.
    println!("{}", serde_json::to_string_pretty(&manifest)?);
    if !warning_counts.is_empty() {
        bail!("generated output still has low-frequency blocking warnings");
    }
    Ok(())
}

/// Rich-annotation mode: builds one annotation row per retained input in parallel,
/// writes them as JSON Lines to `--rich-output` and prints a manifest.
///
/// # Errors
/// Propagates input-loading, serialization and file-system errors.
fn run_rich_annotations(args: &Args) -> Result<()> {
// Remainder of `run_rich_annotations`; its `fn` header sits on the previous line.
    let inputs = load_input(&args.input, args.limit)?;
    if let Some(parent) = args.rich_output.parent() {
        fs::create_dir_all(parent)?;
    }
    // Annotate every input in parallel. Unless `keep_encoding_noise` is set,
    // inputs flagged by any of the four noise detectors are dropped.
    let rows: Vec<Value> = inputs
        .par_iter()
        .filter_map(|original| {
            if !args.keep_encoding_noise
                && (has_encoding_noise(original)
                    || has_non_anime_noise(original)
                    || has_music_collection_noise(original)
                    || has_abstract_path_noise(original))
            {
                return None;
            }
            Some(rich_annotation_for(original))
        })
        .collect();
    // One JSON document per output line.
    let mut writer = BufWriter::new(File::create(&args.rich_output)?);
    for row in &rows {
        serde_json::to_writer(&mut writer, row)?;
        writer.write_all(b"\n")?;
    }
    // Flush explicitly so a write error is reported instead of being lost on drop.
    writer.flush()?;
    let manifest = json!({
        "generated_at": Utc::now().to_rfc3339(),
        "input": args.input.to_string_lossy(),
        "rich_output": args.rich_output.to_string_lossy(),
        "rows": rows.len(),
        "implementation": "rust_dmhy_rich_annotations",
        "notes": [
            "rich roles are metadata for review/projection, not final training BIO labels",
            "TITLE_* candidates may be collapsed or filtered before dmhy_weak generation"
        ]
    });
    println!("{}", serde_json::to_string_pretty(&manifest)?);
    Ok(())
}

/// Builds the rich-annotation JSON document for one input path.
///
/// The result holds the original path, the training filename chosen by
/// `training_filename_for` (plus whether the path was trimmed), one entry per
/// non-empty path segment, and a `projection_preview` produced by running
/// `dmhy_record` with the default `suggested_roles` for the training filename
/// (`null` when `dmhy_record` yields nothing).
fn rich_annotation_for(original: &str) -> Value {
    let (training_filename, path_trimmed) = training_filename_for(original);
    // Split on both separator styles; empty segments are ignored.
    let parts: Vec<&str> = original
        .split(|ch| ch == '/' || ch == '\\')
        .map(str::trim)
        .filter(|part| !part.is_empty())
        .collect();
    let leaf_index = parts.len().saturating_sub(1);
    let segments = parts
        .iter()
        .enumerate()
        .map(|(index, segment)| rich_segment(segment, index, index == leaf_index))
        .collect::<Vec<_>>();
    let projection = dmhy_record(
        &training_filename,
        "rich_projection",
        &suggested_roles(&template_key_for_filename(&training_filename).0),
    )
    .map(|record| {
        json!({
            "filename": record.filename,
            "spans": entity_spans(&record.tokens, &record.labels),
            "warnings": audit_warnings(&record),
        })
    });
    json!({
        "source_filename": original,
        "training_filename": training_filename,
        "path_trimmed": path_trimmed,
        "segments": segments,
        "projection_preview": projection,
    })
}

/// Describes one path segment: its text, kind, template key and role candidates.
fn rich_segment(segment: &str, index:
usize, is_leaf: bool) -> Value {
    // Tokenise the segment, derive default roles from its template key, then
    // run the contextual and semantic refinement passes over those roles.
    let (key, tokens, _classes, groups) = template_key_for_filename(segment);
    let suggested = suggested_roles(&key);
    let roles = adjust_contextual_roles(&tokens, &groups, &suggested);
    let roles = refine_semantic_roles(&tokens, &groups, &roles);
    let candidates = rich_candidates_for_segment(segment, &tokens, &groups, &roles, is_leaf);
    json!({
        "index": index,
        "text": segment,
        "kind": rich_segment_kind(segment, is_leaf),
        "template": key,
        "candidates": candidates,
    })
}

/// Classifies a path segment as `media_noise`, `season_dir`, `leaf` or `parent`.
///
/// The noise and plain-season checks take precedence over the leaf/parent split.
fn rich_segment_kind(segment: &str, is_leaf: bool) -> &'static str {
    if path_segment_is_media_noise(segment) {
        "media_noise"
    } else if path_segment_is_plain_season(segment) {
        "season_dir"
    } else if is_leaf {
        "leaf"
    } else {
        "parent"
    }
}

/// Lists the role candidates of one segment as JSON objects.
///
/// Title candidates (group ranges from `title_candidates`) come first, followed
/// by one entry per non-title group whose role is neither `O` nor `HASH`.
/// Candidates whose text is blank are skipped.
fn rich_candidates_for_segment(
    segment: &str,
    tokens: &[String],
    groups: &[Group],
    roles: &[String],
    is_leaf: bool,
) -> Vec<Value> {
    let mut output = Vec::new();
    let title_ranges = title_candidates(groups, roles);
    for (candidate_index, (start, end)) in title_ranges.iter().copied().enumerate() {
        let text = candidate_text(tokens, groups, start, end);
        if text.trim().is_empty() {
            continue;
        }
        // Prefer the fine role already carried by the roles; fall back to the
        // positional heuristic otherwise.
        let role = fine_title_role_for_candidate(roles, start, end).unwrap_or_else(|| {
            fine_title_role(segment, &text, is_leaf, candidate_index, title_ranges.len())
                .to_string()
        });
        output.push(json!({
            "role": role,
            "coarse_role": "TITLE",
            "text": text,
            "group_start": start,
            "group_end": end,
        }));
    }
    // `zip` stops at the shorter slice, so a roles slice longer than `groups`
    // can no longer index out of bounds.
    for (group_index, (role, group)) in roles.iter().zip(groups).enumerate() {
        if is_title_role(role) || role == "O" || role == "HASH" {
            continue;
        }
        let text = group_text(tokens, group);
        if text.trim().is_empty() {
            continue;
        }
        let coarse_role = role_label(role)
            .strip_prefix("B-")
            .map(str::to_string)
            .unwrap_or_else(|| "O".to_string());
        output.push(json!({
            "role": fine_non_title_role(role),
            "coarse_role": coarse_role,
            "text": text,
            "group_start": group_index,
            "group_end": group_index + 1,
        }));
    }
    output
}

/// Returns the fine-grained title role shared by `roles[start..end]`.
///
/// Plain `TITLE` roles are ignored. `None` means no fine title role is present,
/// a single distinct role is returned as-is, and several distinct roles collapse
/// to `TITLE_MIXED`.
///
/// # Panics
/// Panics if `start..end` is not a valid range of `roles`.
fn fine_title_role_for_candidate(roles: &[String], start: usize, end: usize) -> Option<String> {
    let mut entities: Vec<&str> = roles[start..end]
        .iter()
        .filter_map(|role| title_entity_from_role(role))
        .filter(|entity| *entity != "TITLE")
        .collect();
    entities.sort();
    entities.dedup();
    match entities.len() {
        0 => None,
        1 => Some(entities[0].to_string()),
        _ => Some("TITLE_MIXED".to_string()),
    }
}

/// Joins the tokens covered by groups `start..end` and strips one outer wrapper.
///
/// Returns an empty string when either boundary group is missing or has no
/// token indices.
fn candidate_text(tokens: &[String], groups: &[Group], start: usize, end: usize) -> String {
    let Some(first) = groups.get(start).and_then(|group| group.indices.first()) else {
        return String::new();
    };
    let Some(last) = groups
        .get(end.saturating_sub(1))
        .and_then(|group| group.indices.last())
    else {
        return String::new();
    };
    strip_wrapper(&tokens[*first..=*last].join(""))
}

/// Positional fallback for the fine role of a title candidate.
///
/// The checks run in order: version-like text, category words, a leaf that
/// starts with an episode, non-leaf segments, secondary candidates, and finally
/// `TITLE_MAIN`.
fn fine_title_role(
    segment: &str,
    text: &str,
    is_leaf: bool,
    candidate_index: usize,
    candidate_count: usize,
) -> &'static str {
    let cleaned = text.trim();
    if VERSIONISH_TITLE_RE.is_match(cleaned) {
        return "RELEASE_VERSION";
    }
    if matches!(
        cleaned.to_ascii_lowercase().as_str(),
        "国漫" | "國漫" | "anime" | "movie" | "movies"
    ) {
        return "TITLE_CATEGORY";
    }
    if is_leaf && path_segment_starts_with_episode(segment) {
        return "EPISODE_TITLE";
    }
    if !is_leaf {
        return "PATH_TITLE";
    }
    if candidate_count > 1 && candidate_index > 0 {
        return "TITLE_ALIAS";
    }
    "TITLE_MAIN"
}

/// Maps a non-title role to the name used in the rich annotation output.
/// Roles not listed here become `OTHER`.
fn fine_non_title_role(role: &str) -> &'static str {
    match role {
        "GROUP" => "RELEASE_GROUP",
        "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE" => "EPISODE",
        "SEASON" => "SEASON",
        "PATH_SEASON" => "PATH_SEASON",
        "TAG" => "TAG",
        "SPECIAL" | "VOLUME" => "SPECIAL",
        "RESOLUTION" => "RESOLUTION",
        "SOURCE" => "SOURCE",
        _ => "OTHER",
    }
}

/// Collapses token/label pairs into `{"label", "text"}` spans.
///
/// The `B-`/`I-` prefix is ignored, so consecutive tokens with the same entity
/// always merge into one span. Runs labelled `O` are not emitted.
fn entity_spans(tokens: &[String], labels: &[String]) -> Vec<Value> {
    let mut spans = Vec::new();
    let mut current_label: Option<String> = None;
    let mut current_text = String::new();
    for (token, label) in tokens.iter().zip(labels.iter()) {
        let entity = label
            .strip_prefix("B-")
            .or_else(|| label.strip_prefix("I-"))
            .unwrap_or("O");
        if current_label.as_deref() == Some(entity) {
            current_text.push_str(token);
            continue;
        }
        // The entity changed: close the span that was open, if any.
        if let Some(label) =
current_label.take() {
            if label != "O" {
                spans.push(json!({ "label": label, "text": current_text }));
            }
        }
        current_label = Some(entity.to_string());
        current_text = token.clone();
    }
    // Emit the span that was still open when the input ended.
    if let Some(label) = current_label {
        if label != "O" {
            spans.push(json!({ "label": label, "text": current_text }));
        }
    }
    spans
}

/// Computes the sorted, de-duplicated audit warnings for a generated record.
///
/// Warnings cover the title spans (missing, repeated, generic, technical,
/// episode-like), missing episode/season labels relative to the template,
/// retained path separators, surviving encoding noise, and per-token checks
/// for labelled hashes and unlabelled episode-version / SxE values.
fn audit_warnings(record: &Record) -> Vec<String> {
    let mut warnings = Vec::new();
    let title_texts = title_entity_texts(&record.tokens, &record.labels);
    let title_spans = title_texts.len();
    if title_spans == 0 {
        warnings.push("no_title".to_string());
    } else if repeated_title_entity_spans(&record.labels) {
        warnings.push("multiple_title_spans".to_string());
    }
    if !title_texts.is_empty() && title_texts.iter().all(|title| generic_title_text(title)) {
        warnings.push("generic_title_only".to_string());
    }
    if title_texts.iter().any(|title| technical_title_text(title)) {
        warnings.push("tech_in_title".to_string());
    }
    if title_texts.iter().any(|title| episodeish_title_text(title)) {
        warnings.push("episode_in_title".to_string());
    }
    // Suffix matches: `ends_with("SEASON")` is also true for `B-PATH_SEASON`.
    let has_episode = record.labels.iter().any(|label| label.ends_with("EPISODE"));
    let has_season = record.labels.iter().any(|label| label.ends_with("SEASON"));
    let has_special = record.labels.iter().any(|label| label.ends_with("SPECIAL"));
    if !has_episode {
        warnings.push("no_episode".to_string());
        if record.template.contains("EPISODE") && !has_special {
            warnings.push("template_episode_missing_label".to_string());
        }
        if record
            .dropped_title_candidate_positions
            .as_ref()
            .is_some_and(|dropped| !dropped.is_empty())
        {
            warnings.push("ambiguous_no_episode_title".to_string());
        }
    }
    if record.template.contains("SXE") && (!has_season || !has_episode) {
        warnings.push("template_sxe_missing_label".to_string());
    }
    if record.filename.contains('/') || record.filename.contains('\\') {
        warnings.push("path_retained".to_string());
    }
    if has_encoding_noise(&record.filename)
        || record
            .source_filename
            .as_ref()
            .is_some_and(|source| has_encoding_noise(source))
    {
warnings.push("encoding_noise_survived".to_string());
    }
    for (index, token) in record.tokens.iter().enumerate() {
        let entity = record
            .labels
            .get(index)
            .and_then(|label| label_entity(label));
        let cleaned = strip_wrapper(token);
        if HASH_RE.is_match(token) && record.labels.get(index).is_some_and(|label| label != "O") {
            warnings.push("hash_labeled".to_string());
            // NOTE(review): this `break` also skips the two checks below for all
            // later tokens; confirm that is intended, since `dedup` already
            // removes repeated warnings.
            break;
        }
        if EPISODE_VERSION_RE.is_match(&compact_for_classify(&cleaned))
            && entity != Some("EPISODE")
        {
            warnings.push("episode_version_missing_label".to_string());
        }
        if SXE_VALUE_RE.is_match(&cleaned)
            && entity != Some("EPISODE")
            && entity != Some("SEASON")
        {
            warnings.push("sxe_compact_unexpanded".to_string());
        }
    }
    warnings.sort();
    warnings.dedup();
    warnings
}

/// Strips the `B-` / `I-` prefix from a label; returns `None` for labels without one.
fn label_entity(label: &str) -> Option<&str> {
    label
        .strip_prefix("B-")
        .or_else(|| label.strip_prefix("I-"))
}

/// Collects the trimmed text of every maximal run of tokens that share the same
/// title entity. Runs whose text is blank after trimming are dropped.
fn title_entity_texts(tokens: &[String], labels: &[String]) -> Vec<String> {
    let mut spans = Vec::new();
    let mut current = String::new();
    let mut current_entity: Option<String> = None;
    for (token, label) in tokens.iter().zip(labels.iter()) {
        let entity = label_entity(label).filter(|entity| is_title_entity(entity));
        if entity.is_some() && current_entity.as_deref() == entity {
            current.push_str(token);
        } else {
            // Entity changed (or left title territory): close the open run.
            if !current.trim().is_empty() {
                spans.push(current.trim().to_string());
            }
            current.clear();
            current_entity = entity.map(str::to_string);
            if entity.is_some() {
                current.push_str(token);
            }
        }
    }
    if !current.trim().is_empty() {
        spans.push(current.trim().to_string());
    }
    spans
}

/// Returns `true` when the same title entity starts more than one separate run.
fn repeated_title_entity_spans(labels: &[String]) -> bool {
    let mut seen = HashSet::new();
    let mut previous: Option<String> = None;
    for label in labels {
        let entity = label_entity(label)
            .filter(|entity| is_title_entity(entity))
            .map(str::to_string);
        if let Some(name) = &entity {
            // A run starts wherever the entity differs from the previous label's.
            if entity != previous && !seen.insert(name.clone()) {
                return true;
            }
        }
        previous = entity;
    }
    false
}

/// Returns `true` for title text that is only a generic word (ASCII case-insensitive).
fn generic_title_text(text: &str) -> bool {
    matches!(
        text.trim().to_ascii_lowercase().as_str(),
        "tv" | "movie"
            | "mov"
            | "sample"
            | "commercial"
            | "commercials"
            | "cm"
            | "pv"
            | "op"
            | "ed"
            | "ncop"
            | "nced"
            | "menu"
            | "trailer"
            | "spot"
            | "bdmv"
            | "stream"
    )
}

/// Returns `true` when the title text contains a release/codec keyword.
fn technical_title_text(text: &str) -> bool {
    let normalized = text.to_ascii_lowercase();
    normalized.contains("bdrip")
        || normalized.contains("webrip")
        || normalized.contains("web-dl")
        || normalized.contains("hevc")
        || normalized.contains("x264")
        || normalized.contains("x265")
        || normalized.contains("aac")
        || normalized.contains("flac")
        || normalized.contains("sourceunknown")
}

/// Returns `true` when the title text matches an episode pattern or consists
/// only of ASCII digits.
fn episodeish_title_text(text: &str) -> bool {
    let trimmed = text.trim();
    EPISODE_VALUE_RE.is_match(trimmed)
        || EPISODE_CJK_RE.is_match(trimmed)
        || EPISODE_RANGE_RE.is_match(trimmed)
        || trimmed.chars().all(|ch| ch.is_ascii_digit())
}

/// Counts how often each string occurs in the `"warnings"` array of the rows.
// NOTE(review): the map's type arguments were missing in this copy of the file;
// `usize` is a reconstruction — confirm against the callers.
fn warning_counts(rows: &[Value]) -> HashMap<String, usize> {
    let mut counts = HashMap::new();
    for row in rows {
        if let Some(warnings) = row.get("warnings").and_then(Value::as_array) {
            for warning in warnings {
                if let Some(warning) = warning.as_str() {
                    *counts.entry(warning.to_string()).or_default() += 1;
                }
            }
        }
    }
    counts
}

/// Runs one input path through the recipe pipeline.
///
/// Returns `Processed::Written` with the generated record, or
/// `Processed::Skipped` with one of the reasons `music_audio_collection`,
/// `encoding_noise`, `no_recipe`, `sample_cap`, `role_mismatch` or
/// `low_frequency_audit_warning`.
///
/// # Panics
/// Panics when `args.expand == "sample"` and `sample_counters` has no entry for
/// the matched recipe's `template_id`.
// NOTE(review): the map type arguments were missing in this copy of the file;
// they are reconstructed from how the maps are used below.
fn process_filename(
    original: &str,
    args: &Args,
    recipes: &HashMap<String, Recipe>,
    sample_counters: &HashMap<String, AtomicUsize>,
) -> Processed {
    // Skip without an example or warnings; shared by the plain skip paths.
    let skipped = |reason, trimmed_parent| Processed::Skipped {
        reason,
        trimmed_parent,
        example: None,
        warnings: Vec::new(),
    };

    if !args.keep_encoding_noise && has_music_collection_noise(original) {
        return Processed::Skipped {
            reason: "music_audio_collection",
            trimmed_parent: false,
            example: Some(original.to_string()),
            warnings: Vec::new(),
        };
    }
    if !args.keep_encoding_noise
        && (has_encoding_noise(original)
            || has_non_anime_noise(original)
            || has_abstract_path_noise(original))
    {
        return skipped("encoding_noise", false);
    }

    let (training_filename, trimmed_parent) = training_filename_for(original);
    let (key, _tokens, _classes, groups) = template_key_for_filename(&training_filename);
    let Some(recipe) = recipes.get(&key) else {
        return skipped("no_recipe", trimmed_parent);
    };

    if args.expand == "sample" {
        let counter = sample_counters
            .get(&recipe.template_id)
            .expect("sample counter missing for recipe template_id");
        // `fetch_add` returns the previous value, so the first
        // `sample_per_template` callers per template pass.
        if counter.fetch_add(1, Ordering::Relaxed) >= args.sample_per_template {
            return skipped("sample_cap", trimmed_parent);
        }
    }

    if recipe.roles.len() != groups.len() {
        return skipped("role_mismatch", trimmed_parent);
    }
    let Some(mut record) = dmhy_record(&training_filename, &recipe.template_id, &recipe.roles)
    else {
        return skipped("role_mismatch", trimmed_parent);
    };

    let warnings = audit_warnings(&record);
    // `has_blocking_warnings` already covers `no_title`, so no separate check.
    if has_blocking_warnings(&warnings) {
        return Processed::Skipped {
            reason: "low_frequency_audit_warning",
            trimmed_parent,
            example: Some(record.filename.clone()),
            warnings,
        };
    }

    if trimmed_parent {
        record.source_filename = Some(original.to_string());
        record.path_trimmed = Some(true);
    }
    Processed::Written {
        record,
        trimmed_parent,
    }
}

/// Returns `true` when any warning is one of the blocking warning names.
fn has_blocking_warnings(warnings: &[String]) -> bool {
    warnings.iter().any(|warning| {
        matches!(
            warning.as_str(),
            "ambiguous_no_episode_title"
                | "encoding_noise_survived"
                | "episode_version_missing_label"
                | "episode_in_title"
                | "generic_title_only"
                | "hash_labeled"
                | "multiple_title_spans"
                | "no_title"
                | "path_retained"
                | "sxe_compact_unexpanded"
                | "tech_in_title"
                | "template_episode_missing_label"
                | "template_sxe_missing_label"
        )
    })
}

/// Splits `value` into tokens.
///
/// At each position `next_token` is tried first; when it finds nothing, a
/// single character becomes its own token. Concatenating the tokens gives back
/// `value`.
fn tokenize(value: &str) -> Vec<String> {
    let mut output = Vec::new();
    let mut index = 0;
    while index < value.len() {
        let rest = &value[index..];
        if let Some((token, len)) = next_token(rest) {
            output.push(token);
            index += len;
        } else {
            // `rest` is non-empty because `index < value.len()`.
            let ch = rest.chars().next().unwrap();
            output.push(ch.to_string());
            index += ch.len_utf8();
        }
    }
    output
}

/// Returns the token at the start of `rest` together with its length in bytes,
/// or `None` when neither a wrapper nor a token regex matches there.
fn next_token(rest: &str) ->
Option<(String, usize)> {
    let first = rest.chars().next()?;
    // (closing delimiter, max number of chars before it, opener included).
    // The limit is now counted in chars for all three wrappers; `[`/`(` used
    // the byte offset before, which cut off CJK content about three times
    // earlier. ASCII-only content behaves exactly as before.
    // NOTE(review): the `【` limit (120) is kept one lower than the other two
    // (121), as in the original — confirm which bound is intended.
    let wrapper = match first {
        '[' => Some((']', 121)),
        '(' => Some((')', 121)),
        '【' => Some(('】', 120)),
        _ => None,
    };
    if let Some((close, max_chars)) = wrapper {
        if let Some(end) = rest.find(close) {
            if rest[..end].chars().count() <= max_chars {
                let len = end + close.len_utf8();
                return Some((rest[..len].to_string(), len));
            }
        }
    }
    // No usable wrapper: the first regex with a non-empty match at offset 0 wins.
    for re in TOKEN_REGEXES.iter() {
        if let Some(mat) = re.find(rest) {
            if mat.start() == 0 && mat.end() > 0 {
                return Some((mat.as_str().to_string(), mat.end()));
            }
        }
    }
    None
}

/// Removes one pair of matching outer wrappers (`[]`, `()`, `【】`) and trims
/// the result; a token without such a pair is only trimmed.
fn strip_wrapper(token: &str) -> String {
    let chars: Vec<char> = token.chars().collect();
    if chars.len() >= 2 {
        let first = chars[0];
        let last = chars[chars.len() - 1];
        if (first == '[' && last == ']')
            || (first == '(' && last == ')')
            || (first == '【' && last == '】')
        {
            return chars[1..chars.len() - 1]
                .iter()
                .collect::<String>()
                .trim()
                .to_string();
        }
    }
    token.trim().to_string()
}

/// Splits wrapper content on whitespace and the delimiter characters listed in
/// the literal below; empty pieces are dropped.
fn split_inner(inner: &str) -> Vec<String> {
    let mut parts = Vec::new();
    let mut current = String::new();
    for ch in inner.chars() {
        // NOTE(review): `()` appears twice in this set — possibly a pair of
        // full-width parentheses lost to normalisation; confirm upstream.
        if ch.is_whitespace() || "_.,+/&|-()()".contains(ch) {
            if !current.is_empty() {
                parts.push(std::mem::take(&mut current));
            }
        } else {
            current.push(ch);
        }
    }
    if !current.is_empty() {
        parts.push(current);
    }
    parts
}

/// Removes whitespace and the characters `_`, `.`, `,`, `-` from `text`.
fn compact_for_classify(text: &str) -> String {
    text.chars()
        .filter(|ch| !ch.is_whitespace() && !matches!(ch, '_' | '.' | ',' | '-'))
        .collect()
}

/// Classifies one unwrapped text atom.
///
/// The checks run strictly in the order written and the first match wins, so
/// reordering them changes results. Some patterns are tested against the
/// compacted text, others against the unwrapped text.
fn classify_atom(text: &str) -> String {
    let cleaned = strip_wrapper(text);
    let compact = compact_for_classify(&cleaned);
    if cleaned.is_empty() {
        return "EMPTY".to_string();
    }
    if HASH_RE.is_match(&cleaned) {
        return "HASH".to_string();
    }
    if RESOLUTION_RE.is_match(&cleaned) {
        return "RESOLUTION".to_string();
    }
    if DATE_RE.is_match(&cleaned)
        || DATE_RANGE_MIXED_RE.is_match(&cleaned)
        || CJK_DATE_RE.is_match(&cleaned)
    {
        return "DATE".to_string();
    }
    if EPISODE_VERSION_RE.is_match(&compact) {
        return "EPISODE_VERSION".to_string();
    }
    if EPISODE_WITH_SUFFIX_RE.is_match(&cleaned) {
        return "EPISODE_VERSION".to_string();
    }
    if SXE_RE.is_match(&compact) {
        return "SXE".to_string();
    }
    if EPISODE_RE.is_match(&compact) {
        return "EPISODE".to_string();
    }
    if EPISODE_CJK_RE.is_match(&cleaned) {
        return "EPISODE".to_string();
    }
    if EPISODE_BATCH_RE.is_match(&cleaned) {
        return "EPISODE_RANGE".to_string();
    }
    if EPISODE_RANGE_RE.is_match(&cleaned) {
        return "EPISODE_RANGE".to_string();
    }
    if EPISODE_RE.is_match(&cleaned) {
        return "EPISODE".to_string();
    }
    if SEASON_RE.is_match(&cleaned) {
        return "SEASON".to_string();
    }
    if SPECIAL_RE.is_match(&cleaned) {
        return "SPECIAL".to_string();
    }
    if VOLUME_RE.is_match(&cleaned) {
        return "VOLUME".to_string();
    }
    if LANG_RE.is_match(&cleaned) || lang_block_matches(&cleaned) {
        return "LANG".to_string();
    }
    if MEDIA_RE.is_match(&cleaned) {
        return "MEDIA".to_string();
    }
    "TEXT".to_string()
}

/// Returns `true` when `text` contains a subtitle-language marker.
///
/// Matches the ASCII markers (case-insensitive), the substring `GB`, the listed
/// CJK markers, or `字幕` that is not directly followed by `组` / `組`.
fn lang_block_matches(text: &str) -> bool {
    let upper = text.to_ascii_uppercase();
    if ["CHS", "CHT", "ZHS", "ZHT", "BIG5"]
        .iter()
        .any(|marker| upper.contains(marker))
    {
        return true;
    }
    // NOTE(review): a bare substring test, so it also fires inside longer
    // words (for example `RGB`); confirm that breadth is wanted.
    if upper.contains("GB") {
        return true;
    }
    if [
        "简繁", "简日", "繁日", "简体", "繁体", "雙語", "双语", "内封", "外挂",
    ]
    .iter()
    .any(|marker| text.contains(marker))
    {
        return true;
    }
    let chars: Vec<char> = text.chars().collect();
    chars.windows(2).enumerate().any(|(index, pair)| {
        pair[0] == '字' && pair[1] == '幕' && !matches!(chars.get(index + 2), Some('组' | '組'))
    })
}

/// Classifies one token produced by `tokenize`; wrapped tokens get a
/// `BRACKET_` class derived from their content.
fn classify_token(token:
&str) -> String {
    if token.is_empty() {
        return "EMPTY".to_string();
    }
    if token.chars().all(char::is_whitespace) {
        return "SPACE".to_string();
    }
    if token.chars().all(|ch| ch == '/' || ch == '\\') {
        return "PATH".to_string();
    }
    // NOTE(review): `:` appears twice in this set — possibly a full-width colon
    // lost to normalisation; confirm upstream.
    if token.chars().all(|ch| "-_.::+&|".contains(ch)) {
        return "SEP".to_string();
    }
    if token.starts_with('[') || token.starts_with('(') || token.starts_with('【') {
        let inner = strip_wrapper(token);
        let parts = split_inner(&inner);
        let whole_class = classify_atom(&inner);
        let inner_class = if whole_class != "TEXT" {
            // The whole content already has a class. Only a multi-part LANG
            // result is re-examined part by part.
            if whole_class == "LANG" && parts.len() > 1 {
                let part_classes: Vec<String> =
                    parts.iter().map(|part| classify_atom(part)).collect();
                if part_classes.iter().all(|item| item == &part_classes[0]) {
                    part_classes[0].clone()
                } else if part_classes.iter().all(|item| is_media_block_class(item)) {
                    "MEDIA_BLOCK".to_string()
                } else {
                    whole_class
                }
            } else {
                whole_class
            }
        } else if parts.is_empty() {
            "EMPTY".to_string()
        } else {
            let part_classes: Vec<String> =
                parts.iter().map(|part| classify_atom(part)).collect();
            if part_classes.iter().all(|item| item == &part_classes[0]) {
                part_classes[0].clone()
            } else if part_classes.iter().all(|item| is_media_block_class(item)) {
                "MEDIA_BLOCK".to_string()
            } else if part_classes.iter().any(|item| is_media_block_class(item))
                && parts.iter().zip(part_classes.iter()).all(|(part, item)| {
                    is_media_block_class(item)
                        || matches!(part.to_ascii_lowercase().as_str(), "anime" | "アニメ")
                })
            {
                // Media parts mixed only with the word "anime" still count as media.
                "MEDIA_BLOCK".to_string()
            } else if part_classes.iter().any(|item| item == "TEXT") {
                "TEXT".to_string()
            } else {
                // Several structural classes: join the distinct ones, sorted.
                let mut set: Vec<String> = part_classes
                    .into_iter()
                    .collect::<HashSet<_>>()
                    .into_iter()
                    .collect();
                set.sort();
                set.join("_")
            }
        };
        return format!("BRACKET_{inner_class}");
    }
    classify_atom(token)
}

/// Returns `true` for atom classes that may appear inside a media block.
fn is_media_block_class(value: &str) -> bool {
    matches!(value, "MEDIA" | "RESOLUTION" | "LANG" | "HASH" | "DATE")
}

/// Groups token indices by class.
///
/// `SPACE` is treated as `SEP`. Consecutive `SEP` tokens, and consecutive
/// `TEXT` tokens, share one group; every other token gets its own group.
fn compact_token_groups(_tokens: &[String], classes: &[String]) -> Vec<Group> {
    let mut groups: Vec<Group> = Vec::new();
    let mut previous: Option<String> = None;
    for (index, token_class) in classes.iter().enumerate() {
        let current = if token_class == "SPACE" {
            "SEP"
        } else {
            token_class
        }
        .to_string();
        if previous.as_deref() == Some(current.as_str())
            && matches!(current.as_str(), "SEP" | "TEXT")
        {
            // `previous` is only `Some` after a group has been pushed.
            groups.last_mut().unwrap().indices.push(index);
        } else {
            groups.push(Group {
                indices: vec![index],
                class_name: current.clone(),
            });
        }
        previous = Some(current);
    }
    groups
}

/// Tokenises and classifies a filename.
///
/// Returns `(key, tokens, classes, groups)` where `key` is the space-joined
/// list of group class names.
fn template_key_for_filename(
    filename: &str,
) -> (String, Vec<String>, Vec<String>, Vec<Group>) {
    let tokens = tokenize(filename);
    let classes: Vec<String> = tokens.iter().map(|token| classify_token(token)).collect();
    let groups = compact_token_groups(&tokens, &classes);
    let key = groups
        .iter()
        .map(|group| group.class_name.as_str())
        .collect::<Vec<_>>()
        .join(" ")
    ;
    (key, tokens, classes, groups)
}

/// Derives one default role per whitespace-separated item of a template key.
///
/// Structural classes map directly to roles. `TEXT` / `BRACKET_TEXT` items are
/// then assigned `GROUP` / `TITLE` per path segment, looking only at the items
/// before the segment's first structural item.
fn suggested_roles(template: &str) -> Vec<String> {
    let items: Vec<&str> = template.split_whitespace().collect();
    let mut roles = vec!["O".to_string(); items.len()];
    // Index of the first item of every path segment.
    let mut segment_starts = vec![0usize];
    for (index, item) in items.iter().enumerate() {
        if *item == "PATH" {
            segment_starts.push(index + 1);
        }
    }
    for (index, item) in items.iter().enumerate() {
        // Substring tests: more specific names must be checked first.
        roles[index] = if item.contains("EPISODE_VERSION") {
            "EPISODE_VERSION"
        } else if item.contains("EPISODE_RANGE") {
            "EPISODE_RANGE"
        } else if item.contains("EPISODE") || item.contains("SXE") {
            "EPISODE"
        } else if item.contains("RESOLUTION") {
            "RESOLUTION"
        } else if item.contains("HASH") {
            "HASH"
        } else if item.contains("LANG") || item.contains("MEDIA") {
            "SOURCE"
        } else if item.contains("SPECIAL") {
            "SPECIAL"
        } else if item.contains("SEASON") {
            "SEASON"
        } else if item.contains("VOLUME") {
            "VOLUME"
        } else {
            "O"
        }
        .to_string();
    }
    for (offset, start) in segment_starts.iter().enumerate() {
        // A segment ends at the `PATH` item before the next segment start.
        let end = if offset + 1 < segment_starts.len() {
            segment_starts[offset + 1] - 1
        } else {
            items.len()
        };
        if *start >= end {
            continue;
        }
        let first_structural = (*start..end)
            .find(|&index| {
                items[index].contains("EPISODE")
                    || matches!(items[index], "SXE" | "SPECIAL" | "SEASON")
            })
.unwrap_or(end);
        // Unassigned text items ahead of the first structural item.
        let bracket_text: Vec<usize> = (*start..first_structural)
            .filter(|&index| items[index] == "BRACKET_TEXT" && roles[index] == "O")
            .collect();
        let text: Vec<usize> = (*start..first_structural)
            .filter(|&index| items[index] == "TEXT" && roles[index] == "O")
            .collect();
        if bracket_text.len() >= 2 {
            // First bracket is the group, the remaining ones are titles.
            roles[bracket_text[0]] = "GROUP".to_string();
            for index in bracket_text.iter().skip(1) {
                roles[*index] = "TITLE".to_string();
            }
        } else if bracket_text.len() == 1 {
            // A lone bracket is the group only when it opens the segment and
            // plain text is available to act as the title.
            roles[bracket_text[0]] = if text.is_empty() {
                "TITLE"
            } else if bracket_text[0] == *start {
                "GROUP"
            } else {
                "TITLE"
            }
            .to_string();
        }
        for index in text {
            roles[index] = "TITLE".to_string();
        }
        // Segment starts with an episode and has no title yet: take the first
        // run of TEXT items (separators allowed in between) if it has at least
        // two items.
        if !roles[*start..end].iter().any(|role| role == "TITLE")
            && !items[*start..end].is_empty()
            && items[*start].contains("EPISODE")
        {
            let mut run = Vec::new();
            for index in (*start + 1)..end {
                if items[index] == "TEXT" && roles[index] == "O" {
                    run.push(index);
                    continue;
                }
                if items[index] == "SEP" {
                    continue;
                }
                if !run.is_empty() {
                    break;
                }
            }
            if run.len() >= 2 {
                for index in run {
                    roles[index] = "TITLE".to_string();
                }
            }
        }
    }
    roles
}

/// Refines coarse roles using the text of each group.
///
/// Category-like text becomes `TAG`, `SEASON` becomes `PATH_SEASON` inside a
/// path segment that is followed by another `PATH` group, and `TITLE` is
/// replaced by the language-specific role from `title_role_for_text`.
///
/// # Panics
/// Panics if `roles` is shorter than `groups`.
fn refine_semantic_roles(tokens: &[String], groups: &[Group], roles: &[String]) -> Vec<String> {
    let mut output = roles.to_vec();
    // End (exclusive) of the current segment; a segment counts as a path
    // segment when a `PATH` group follows it.
    let mut segment_end = groups
        .iter()
        .position(|group| group.class_name == "PATH")
        .unwrap_or(groups.len());
    let mut is_path_segment = segment_end < groups.len();
    for index in 0..groups.len() {
        if groups[index].class_name == "PATH" {
            segment_end = groups[index + 1..]
                .iter()
                .position(|group| group.class_name == "PATH")
                .map(|offset| index + 1 + offset)
                .unwrap_or(groups.len());
            is_path_segment = segment_end < groups.len();
            continue;
        }
        let text = group_text(tokens, &groups[index]);
        let bracketed = is_bracket_group(&groups[index]);
        if is_category_tag_text(&text, bracketed, is_path_segment)
            && matches!(output[index].as_str(), "O" | "TITLE" | "GROUP" | "SPECIAL")
        {
            output[index] = "TAG".to_string();
            continue;
        }
        if output[index] == "SEASON" && is_path_segment {
            output[index] = "PATH_SEASON".to_string();
            continue;
        }
        if output[index] == "TITLE" {
            output[index] = title_role_for_text(&text, is_path_segment);
        }
    }
    output
}

/// Returns `true` when the default roles for `filename` contain a title role.
fn filename_has_title(filename: &str) -> bool {
    let (key, _, _, _) = template_key_for_filename(filename);
    suggested_roles(&key).iter().any(|role| is_title_role(role))
}

/// Chooses the string used for training from a possibly path-like input.
///
/// Returns the chosen filename and whether it differs from `original` because
/// parent path segments were dropped or merged in.
fn training_filename_for(original: &str) -> (String, bool) {
    let parts: Vec<&str> = original
        .split(|ch| ch == '/' || ch == '\\')
        .map(str::trim)
        .filter(|part| !part.is_empty())
        .collect();
    // Case 1: the leaf is (or starts with) an episode and carries no full title.
    // Borrow the title from the nearest parent that has one and is not media noise.
    if parts.len() >= 2
        && (path_segment_is_episodeish(parts[parts.len() - 1])
            || (!path_segment_is_plain_season(parts[parts.len() - 2])
                && path_segment_starts_with_episode(parts[parts.len() - 1])
                && !leaf_has_full_title_after_episode(parts[parts.len() - 1])))
    {
        if let Some(parent) = parts[..parts.len() - 1].iter().rev().find(|part| {
            let trimmed = trim_parent_title_segment(part);
            filename_has_title(&trimmed) && !path_segment_is_media_noise(&trimmed)
        }) {
            let parent = trim_parent_title_segment(parent.trim());
            return (
                format!("{} {}", parent, parts[parts.len() - 1].trim()),
                true,
            );
        }
    }
    // Case 2: the leaf has its own title. The parent is only merged in when it
    // is a plain season directory naming a season the leaf does not mention.
    if parts.len() >= 2 && filename_has_title(parts[parts.len() - 1]) {
        if path_segment_has_season(parts[parts.len() - 2]) {
            if !path_segment_is_plain_season(parts[parts.len() - 2]) {
                return (parts[parts.len() - 1].to_string(), true);
            }
            let parent_seasons = path_segment_seasons(parts[parts.len() - 2]);
            let leaf_seasons = path_segment_seasons(parts[parts.len() - 1]);
            if parent_seasons
                .iter()
.any(|season| leaf_seasons.contains(season))
            {
                (parts[parts.len() - 1].to_string(), true)
            } else {
                (
                    format!(
                        "{} {}",
                        parts[parts.len() - 2].trim(),
                        parts[parts.len() - 1].trim()
                    ),
                    true,
                )
            }
        } else {
            (parts[parts.len() - 1].to_string(), true)
        }
    } else {
        (original.to_string(), false)
    }
}

/// Returns `true` when the unwrapped segment matches `PLAIN_SEASON_SEGMENT_RE`.
fn path_segment_is_plain_season(segment: &str) -> bool {
    let cleaned = strip_wrapper(segment).trim().to_string();
    PLAIN_SEASON_SEGMENT_RE.is_match(&cleaned)
}

/// Removes a trailing `TV` / `tv` marker preceded by `_`, `.` or a space, plus
/// any of those separator characters left at the end afterwards.
fn trim_terminal_series_kind(segment: &str) -> String {
    let mut output = segment.trim().to_string();
    for suffix in ["_TV", ".TV", " TV", "_tv", ".tv", " tv"] {
        if output.ends_with(suffix) {
            output.truncate(output.len() - suffix.len());
            return output.trim_end_matches(['_', '.', ' ']).to_string();
        }
    }
    output
}

/// Strips trailing wrapper blocks that are media noise from a parent segment,
/// after first removing a trailing series-kind marker.
fn trim_parent_title_segment(segment: &str) -> String {
    let mut output = trim_terminal_series_kind(segment);
    loop {
        let trimmed = output.trim_end();
        let Some(last) = trimmed.chars().next_back() else {
            return output;
        };
        let open = match last {
            ')' => '(',
            ']' => '[',
            '】' => '【',
            _ => return output,
        };
        let Some(start) = trimmed.rfind(open) else {
            return output;
        };
        let suffix = &trimmed[start..];
        if path_segment_is_media_noise(suffix) {
            // `trimmed` is a prefix of `output`, so `start` is valid for both.
            output.truncate(start);
            output = output.trim_end_matches([' ', '_', '.', '-']).to_string();
            continue;
        }
        return output;
    }
}

/// Returns `true` when the segment matches `PATH_SEGMENT_SEASON_RE`.
fn path_segment_has_season(value: &str) -> bool {
    PATH_SEGMENT_SEASON_RE.is_match(value)
}

/// Collects every season number captured (group 1) by the three season patterns.
// NOTE(review): the integer type arguments were missing in this copy of the
// file; `u32` is a reconstruction — confirm against the original declaration.
fn path_segment_seasons(value: &str) -> HashSet<u32> {
    SEASON_WORD_NUMBER_RE
        .captures_iter(value)
        .chain(S_NUMBER_SEGMENT_RE.captures_iter(value))
        .chain(SXE_SEASON_RE.captures_iter(value))
        .filter_map(|captures| captures.get(1))
        .filter_map(|item| item.as_str().parse::<u32>().ok())
        .collect()
}

/// Heuristic mojibake detector.
///
/// Returns `true` when the text contains U+FFFD, at least two marker hits, one
/// marker hit together with a half-width character (U+FF61..=U+FF9F), or a
/// whitespace-separated word mixing ASCII letters with one of four listed
/// characters.
fn has_encoding_noise(value: &str) -> bool {
    if value.contains('\u{fffd}') {
        return true;
    }
    let markers = [
        "譁", "蜈", "螟", "蟄", "謇", "邱", "荳", "縺", "繧", "莨", "鬆", "髯", "瀛", "楀", "箷",
        "绲", "刔", "鏃", "湪", "鏍", "犲", "儚", "鐗", "吀", "铦", "躲", "伄", "椋", "伓", "姘",
        "帽", "娆", "洖", "浜", "堝", "澶", "湴", "鐒", "銇", "銈", "銉", "偅", "偗", "儱", "儫",
        "兗", "仧", "鏉变", "鍠靛", "銉熴", "銈︺", "瀵掕", "潐楦", "常涔", "涓歖", "缁堟", "湯鍒",
        "瀵诲", "線浣", "曟柟", "瓒呴", "绁炪", "偘銉", "兇銈", "銉砡", "銉砕", "杩风", "硦澶", "銇淬",
        "仧銉", "銉嗐", "偅銈", "銈躲",
    ];
    // Overlapping markers are counted independently.
    let marker_hits = markers
        .iter()
        .map(|marker| value.matches(marker).count())
        .sum::<usize>();
    let halfwidth_hits = value
        .chars()
        .filter(|ch| ('\u{ff61}'..='\u{ff9f}').contains(ch))
        .count();
    let latin_mojibake = value.split_whitespace().any(|part| {
        part.chars()
            .any(|ch| matches!(ch, '帽' | '茅' | '脳' | '锛'))
            && part.chars().any(|ch| ch.is_ascii_alphabetic())
    });
    marker_hits >= 2 || (marker_hits >= 1 && halfwidth_hits >= 1) || latin_mojibake
}

/// Returns `true` for inputs matching one of the listed non-anime markers
/// (an `mtv` path component, travel tags, and a few specific programme names).
fn has_non_anime_noise(value: &str) -> bool {
    let normalized = value.replace('\\', "/").trim().to_ascii_lowercase();
    normalized == "mtv"
        || normalized.starts_with("mtv/")
        || normalized.contains("/mtv/")
        // Also covers the longer tag "[旅游番".
        || value.contains("[旅游")
        || normalized.contains("tokyo deep")
        || value.contains("日本不思议铁路之旅")
        || value.contains("ニッポンぶらり鉄道旅")
}

/// Removes all whitespace and lowercases ASCII letters.
fn normalized_path_segment(value: &str) -> String {
    value
        .split_whitespace()
        .collect::<String>()
        .to_ascii_lowercase()
}

/// Replaces `_`, `.`, `-`, `・` with spaces, collapses whitespace runs to one
/// space and lowercases ASCII letters.
fn normalized_tag_text(value: &str) -> String {
    value
        .replace(['_', '.', '-', '・'], " ")
        .split_whitespace()
        .collect::<Vec<_>>()
        .join(" ")
        .trim()
        .to_ascii_lowercase()
}

/// Keeps only alphanumeric characters and lowercases ASCII letters.
fn compact_tag_text(value: &str) -> String {
    value
        .chars()
        .filter(|ch| ch.is_alphanumeric())
        .collect::<String>()
        .to_ascii_lowercase()
}

/// Returns `true` when the group's class name starts with `BRACKET_`.
fn is_bracket_group(group: &Group) -> bool {
    group.class_name.starts_with("BRACKET_")
}

/// Returns `true` when the text is a category tag rather than a title.
///
/// Every rule requires the text to be bracketed or to sit in a path segment.
fn is_category_tag_text(text: &str, bracketed: bool, path_segment: bool) -> bool {
    let cleaned = strip_wrapper(text);
    let trimmed = cleaned.trim();
    if trimmed.is_empty() {
        return false;
    }
    if (bracketed || path_segment)
        && (DATE_RE.is_match(trimmed) || YEAR_RANGE_RE.is_match(trimmed))
    {
        return true;
    }
    if (bracketed || path_segment)
        && matches!(
            trimmed,
            "国漫" | "國漫" | "日漫" | "剧场版" | "劇場版" | "新番"
        )
    {
        return true;
    }
    if (bracketed || path_segment)
        && (trimmed.ends_with("月新番") ||
trimmed.ends_with("月新番合集"))
    {
        return true;
    }
    let normalized = normalized_tag_text(trimmed);
    (bracketed || path_segment)
        && matches!(
            normalized.as_str(),
            "anime" | "gekijouban" | "movie" | "movies" | "the movie" | "tv" | "tv series"
        )
}

/// Returns `true` when the input matches `MUSIC_COLLECTION_RE` after separator
/// normalisation, or contains `musicclip` once reduced to alphanumerics.
fn has_music_collection_noise(value: &str) -> bool {
    let normalized = value
        .replace(['_', '.', '-', '・', '/', '\\'], " ")
        .split_whitespace()
        .collect::<Vec<_>>()
        .join(" ");
    let compact = compact_tag_text(value);
    MUSIC_COLLECTION_RE.is_match(&normalized) || compact.contains("musicclip")
}

/// Returns `true` for `TITLE` and any role starting with `TITLE_` or `PATH_TITLE_`.
fn is_title_role(role: &str) -> bool {
    role == "TITLE" || role.starts_with("TITLE_") || role.starts_with("PATH_TITLE_")
}

/// Returns `true` for roles starting with `PATH_TITLE_`.
fn is_path_title_role(role: &str) -> bool {
    role.starts_with("PATH_TITLE_")
}

/// Returns the title entity carried by a role, or `None` for non-title roles.
fn title_entity_from_role(role: &str) -> Option<&str> {
    if role == "TITLE" {
        Some("TITLE")
    } else if role.starts_with("TITLE_") || role.starts_with("PATH_TITLE_") {
        Some(role)
    } else {
        None
    }
}

/// Returns `true` for `TITLE` and the ten language-specific title entities.
fn is_title_entity(entity: &str) -> bool {
    entity == "TITLE"
        || matches!(
            entity,
            "TITLE_CHS"
                | "TITLE_CHT"
                | "TITLE_JPN"
                | "TITLE_LATIN"
                | "TITLE_MIXED"
                | "PATH_TITLE_CHS"
                | "PATH_TITLE_CHT"
                | "PATH_TITLE_JPN"
                | "PATH_TITLE_LATIN"
                | "PATH_TITLE_MIXED"
        )
}

/// Returns `true` when a `B-`/`I-` label carries a title entity.
fn is_title_label(label: &str) -> bool {
    label_entity(label).is_some_and(is_title_entity)
}

/// Guesses the language suffix of a title from the scripts it contains.
///
/// Any kana gives `JPN`; Latin plus Han gives `MIXED`; Han alone is delegated
/// to `cjk_title_language_suffix`; Latin alone gives `LATIN`; anything else
/// gives `MIXED`.
fn title_language_suffix(text: &str) -> &'static str {
    let mut has_latin = false;
    let mut has_han = false;
    let mut has_kana = false;
    for ch in text.chars() {
        if ch.is_ascii_alphabetic() {
            has_latin = true;
        } else if ('\u{3040}'..='\u{30ff}').contains(&ch)
            || ('\u{31f0}'..='\u{31ff}').contains(&ch)
        {
            has_kana = true;
        } else if ('\u{4e00}'..='\u{9fff}').contains(&ch) {
            has_han = true;
        }
    }
    if has_kana {
        return "JPN";
    }
    if has_latin && has_han {
        return "MIXED";
    }
    if has_han {
        return cjk_title_language_suffix(text);
    }
    if has_latin {
        return "LATIN";
    }
    "MIXED"
}

/// Distinguishes `JPN`, `CHS` and `CHT` for Han-only text using marker
/// characters. The lists are checked in that order; `CHS` is the default.
fn cjk_title_language_suffix(text: &str) -> &'static str {
    let japanese_markers = [
        '々', 'ヶ', '君', '戦', '気', '辺', '沢', '桜', '竜', '広', '処', '歩', '黒', '円',
    ];
    if text.chars().any(|ch| japanese_markers.contains(&ch)) {
        return "JPN";
    }
    let simplified_markers = [
        '国', '剧', '场', '农', '闲', '汉', '龙', '门', '击', '战', '体', '后', '爱', '边', '声',
        '岛', '学', '万',
    ];
    if text.chars().any(|ch| simplified_markers.contains(&ch)) {
        return "CHS";
    }
    let traditional_markers = [
        '國', '劇', '場', '農', '閒', '漢', '龍', '門', '擊', '戰', '體', '後', '愛', '邊', '聲',
        '島', '學', '萬', '縛', '異', '臺', '灣', '搖', '滾',
    ];
    if text.chars().any(|ch| traditional_markers.contains(&ch)) {
        return "CHT";
    }
    "CHS"
}

/// Builds `TITLE_<LANG>` or `PATH_TITLE_<LANG>` for the given title text.
fn title_role_for_text(text: &str, path_title: bool) -> String {
    let prefix = if path_title { "PATH_TITLE" } else { "TITLE" };
    format!("{prefix}_{}", title_language_suffix(text))
}

/// Returns `true` when every non-separator group of the segment is an episode,
/// special or volume class (and there is at least one such group).
fn path_segment_is_episodeish(value: &str) -> bool {
    let (_, _, _, groups) = template_key_for_filename(value);
    let structural: Vec<&String> = groups
        .iter()
        .map(|group| &group.class_name)
        .filter(|item| item.as_str() != "SEP")
        .collect();
    !structural.is_empty()
        && structural.iter().all(|item| {
            item.starts_with("EPISODE")
                || item.as_str() == "SPECIAL"
                || item.as_str() == "VOLUME"
                || item.as_str() == "BRACKET_VOLUME"
        })
}

/// Returns `true` when the segment matches `EPISODE_CJK_PREFIX_RE` or its first
/// non-separator group has an episode role.
fn path_segment_starts_with_episode(value: &str) -> bool {
    if EPISODE_CJK_PREFIX_RE.is_match(value.trim()) {
        return true;
    }
    let (key, _, _, groups) = template_key_for_filename(value);
    let roles = suggested_roles(&key);
    groups
        .iter()
        .zip(roles.iter())
        .find(|(group, _)| group.class_name != "SEP")
        .is_some_and(|(_, role)| role.starts_with("EPISODE"))
}

/// Returns `true` when a `TITLE` role follows the first episode role of the leaf.
fn leaf_has_full_title_after_episode(value: &str) -> bool {
    let (key, _, _, groups) = template_key_for_filename(value);
    let roles = suggested_roles(&key);
    let first_structural = roles.iter().position(|role| role.starts_with("EPISODE"));
    let Some(first_episode) = first_structural else {
        return false;
    };
    groups
        .iter()
        .zip(roles.iter())
        .skip(first_episode + 1)
        .filter(|(group, _)| group.class_name != "SEP")
        .any(|(_, role)| role == "TITLE")
}

/// Returns `true` when a path segment carries only media/technical information.
fn path_segment_is_media_noise(value: &str) -> bool {
    let normalized =
value.to_ascii_lowercase(); if normalized.contains("sourceunknown") || normalized.contains("sourceunknow") { return true; } if (normalized.contains("dvdrip") || normalized.contains("bdrip") || normalized.contains("webrip") || normalized.contains("web-dl") || normalized.contains("bluray")) && tokenize(value) .iter() .map(|token| classify_atom(token)) .any(|class_name| class_name == "RESOLUTION") { return true; } let (_, _, _, groups) = template_key_for_filename(value); let structural: Vec<&String> = groups .iter() .map(|group| &group.class_name) .filter(|item| item.as_str() != "SEP") .collect(); !structural.is_empty() && structural.iter().all(|item| { matches!( item.as_str(), "MEDIA" | "RESOLUTION" | "LANG" | "HASH" | "DATE" | "BRACKET_MEDIA" | "BRACKET_RESOLUTION" | "BRACKET_LANG" | "BRACKET_HASH" | "BRACKET_DATE" | "MEDIA_BLOCK" | "BRACKET_MEDIA_BLOCK" ) }) } fn has_abstract_path_noise(value: &str) -> bool { let parts: Vec<&str> = value .split(|ch| ch == '/' || ch == '\\') .map(str::trim) .filter(|part| !part.is_empty()) .collect(); if parts.len() < 3 { return false; } if normalized_path_segment(parts[0]) == normalized_path_segment(parts[parts.len() - 1]) { return true; } path_segment_is_episodeish(parts[0]) && path_segment_is_episodeish(parts[parts.len() - 1]) } fn role_label(role: &str) -> String { let entity = match role { "GROUP" => Some("GROUP"), role if is_title_role(role) => Some("TITLE"), "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE" => Some("EPISODE"), "SEASON" => Some("SEASON"), "PATH_SEASON" => Some("PATH_SEASON"), "SPECIAL" | "VOLUME" => Some("SPECIAL"), "RESOLUTION" => Some("RESOLUTION"), "SOURCE" => Some("SOURCE"), "TAG" => Some("TAG"), _ => None, }; entity.map_or_else(|| "O".to_string(), |entity| format!("B-{entity}")) } fn is_separator(piece: &str) -> bool { piece.is_empty() || piece .chars() .all(|ch| ch.is_whitespace() || !ch.is_alphanumeric()) } fn char_kind(ch: char) -> &'static str { if ch.is_whitespace() || !ch.is_alphanumeric() { "sep" } 
else if ch.is_ascii_digit() { "digit" } else if ch.is_ascii_alphabetic() { "alpha" } else { "text" } }

/// Splits one token into refined pieces for labelling.
///
/// Unwrapped tokens that are already a single atom (an alphanumeric-only
/// resolution/media/language/hash/SxE/episode-version token, or an episode
/// matching `SIMPLE_EPISODE_RE`) are returned whole. Otherwise the token is
/// cut at every separator character and at every change of character kind
/// (see `char_kind`), and neighbouring pieces are re-joined when the joined
/// text classifies as a mergeable atom.
fn split_refined_token(token: &str) -> Vec<String> {
    let whole_class = classify_atom(token);
    // Wrapped means: at least two chars, enclosed in [], () or 【】.
    let is_wrapped = {
        let mut chars = token.chars();
        matches!(
            (chars.next(), chars.next_back()),
            (Some('['), Some(']')) | (Some('('), Some(')')) | (Some('【'), Some('】'))
        )
    };
    if !is_wrapped {
        let plain_atom = matches!(
            whole_class.as_str(),
            "RESOLUTION" | "MEDIA" | "LANG" | "HASH" | "SXE" | "EPISODE_VERSION"
        ) && token.chars().all(char::is_alphanumeric);
        if plain_atom || (whole_class == "EPISODE" && SIMPLE_EPISODE_RE.is_match(token)) {
            return vec![token.to_string()];
        }
    }

    // Pass 1: cut into separator characters and same-kind runs.
    let mut pieces: Vec<String> = Vec::new();
    let mut current = String::new();
    let mut current_kind: Option<&str> = None;
    for ch in token.chars() {
        let kind = char_kind(ch);
        if kind == "sep" {
            if !current.is_empty() {
                pieces.push(std::mem::take(&mut current));
                current_kind = None;
            }
            pieces.push(ch.to_string());
            continue;
        }
        if !current.is_empty() && current_kind != Some(kind) {
            pieces.push(std::mem::take(&mut current));
        }
        current.push(ch);
        current_kind = Some(kind);
    }
    if !current.is_empty() {
        pieces.push(current);
    }

    // Pass 2: re-join pieces that only make sense together.
    let mut merged = Vec::with_capacity(pieces.len());
    let mut index = 0;
    while index < pieces.len() {
        let piece = &pieces[index];
        if is_separator(piece) {
            merged.push(piece.clone());
            index += 1;
            continue;
        }

        // (a) atom + joining separator + atom, e.g. a dotted or "×" value.
        if let [left, sep, right, ..] = &pieces[index..] {
            if is_separator(sep)
                && !is_separator(right)
                && matches!(sep.as_str(), "." | "x" | "X" | "×")
            {
                let combined = format!("{left}{sep}{right}");
                if matches!(
                    classify_atom(&combined).as_str(),
                    "RESOLUTION" | "MEDIA" | "LANG" | "HASH" | "SXE" | "EPISODE_VERSION"
                ) {
                    merged.push(combined);
                    index += 3;
                    continue;
                }
            }
        }

        // End (exclusive) of the run of adjacent non-separator pieces.
        let run_end = pieces[index..]
            .iter()
            .position(|candidate| is_separator(candidate))
            .map_or(pieces.len(), |offset| index + offset);
        if run_end > index + 1 {
            // (b) the whole run forms one mergeable atom.
            let run = pieces[index..run_end].concat();
            if is_mergeable_refined_class(&classify_atom(&run)) {
                merged.push(run);
                index = run_end;
                continue;
            }
            // (c) only the first two pieces do. For a two-piece run this is
            // the text already rejected in (b), so it is not re-classified.
            if run_end > index + 2 {
                let pair = format!("{}{}", pieces[index], pieces[index + 1]);
                if is_mergeable_refined_class(&classify_atom(&pair)) {
                    merged.push(pair);
                    index += 2;
                    continue;
                }
            }
        }

        merged.push(piece.clone());
        index += 1;
    }
    merged
}

/// Atom classes for which adjacent refined pieces may be merged back into
/// one piece.
fn is_mergeable_refined_class(value: &str) -> bool {
    matches!(
        value,
        "RESOLUTION" | "MEDIA" | "LANG" | "HASH" | "SXE" | "EPISODE_VERSION" | "SEASON"
    )
}

/// Chooses the label for one refined `piece` of a token whose group has the
/// given `role` and `token_class`.
///
/// Separators are always `"O"`. Episode, source/media-block and resolution
/// roles label each piece from its own atom class; every other role falls
/// back to `role_label`.
fn label_for_refined_piece(piece: &str, role: &str, token_class: &str) -> String {
    if is_separator(piece) {
        return "O".to_string();
    }
    let atom_class = classify_atom(piece);
    let upper = piece.to_ascii_uppercase();
    if matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE") {
        if atom_class == "SEASON" {
            return "B-SEASON".to_string();
        }
        if matches!(atom_class.as_str(), "EPISODE" | "EPISODE_VERSION" | "SXE")
            || piece.chars().all(|ch| ch.is_ascii_digit())
        {
            return "B-EPISODE".to_string();
        }
        if matches!(atom_class.as_str(), "SPECIAL" | "VOLUME")
            || matches!(
                upper.as_str(),
                "OVA" | "OAD" | "SP" | "PV" | "CM" | "OP" | "ED" | "NCOP" | "NCED"
            )
        {
            return "B-SPECIAL".to_string();
        }
        return "O".to_string();
    }
    if role == "SOURCE" || matches!(token_class, "BRACKET_MEDIA_BLOCK" | "MEDIA_BLOCK") {
        if atom_class == "EPISODE_VERSION" {
            return "B-EPISODE".to_string();
        }
        if atom_class == "RESOLUTION" {
            return "B-RESOLUTION".to_string();
        }
        if atom_class == "HASH" {
            return "O".to_string();
        }
        if
matches!(atom_class.as_str(), "MEDIA" | "LANG") {
            return "B-SOURCE".to_string();
        }
        if matches!(atom_class.as_str(), "SPECIAL" | "VOLUME") {
            return "B-SPECIAL".to_string();
        }
        return if matches!(
            upper.as_str(),
            "END" | "FIN" | "COMPLETE" | "TV" | "全集" | "全"
        ) {
            "O".to_string()
        } else {
            "B-SOURCE".to_string()
        };
    }
    if role == "RESOLUTION" {
        return if atom_class == "RESOLUTION" || piece.chars().all(|ch| ch.is_ascii_digit()) {
            "B-RESOLUTION".to_string()
        } else {
            "O".to_string()
        };
    }
    role_label(role)
}

/// Splits a token matched by `SXE_VALUE_RE` into `S`, season, `E`, episode
/// and, when capture 3 is present, `v` plus the version.
///
/// The marker pieces are emitted as the fixed strings `"S"`, `"E"` and
/// `"v"`. Only the season and episode values get entity labels; everything
/// else, including the version, is `"O"`. Returns `None` when the regex does
/// not match.
fn split_sxe_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
    let caps = SXE_VALUE_RE.captures(token)?;
    let mut parts: Vec<(&str, &str)> = vec![
        ("S", "O"),
        (&caps[1], "B-SEASON"),
        ("E", "O"),
        (&caps[2], "B-EPISODE"),
    ];
    if let Some(version) = caps.get(3) {
        parts.push(("v", "O"));
        parts.push((version.as_str(), "O"));
    }
    Some(
        parts
            .into_iter()
            .map(|(piece, label)| (piece.to_string(), label.to_string()))
            .unzip(),
    )
}

/// Replaces every token that `split_sxe_token` can split with its pieces
/// and labels; all other token/label pairs pass through unchanged.
fn repair_compact_sxe_tokens(
    tokens: Vec<String>,
    labels: Vec<String>,
) -> (Vec<String>, Vec<String>) {
    let mut output_tokens = Vec::with_capacity(tokens.len());
    let mut output_labels = Vec::with_capacity(labels.len());
    for (token, label) in tokens.into_iter().zip(labels) {
        match split_sxe_token(&token) {
            Some((pieces, piece_labels)) => {
                output_tokens.extend(pieces);
                output_labels.extend(piece_labels);
            }
            None => {
                output_tokens.push(token);
                output_labels.push(label);
            }
        }
    }
    (output_tokens, output_labels)
}

/// Splits an episode token into labelled pieces.
///
/// A token matching `DECIMAL_EPISODE_RE` is split by `split_generated_token`
/// and every resulting piece (separators included) is labelled `B-EPISODE`.
/// Otherwise `EPISODE_VALUE_RE` must match: capture 1 is emitted as `"O"`,
/// the pieces of capture 2 as `B-EPISODE`, and an optional capture 3 as
/// `"v"` plus the version, both `"O"`.
fn split_episode_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
    if DECIMAL_EPISODE_RE.is_match(token) {
        let pieces = split_generated_token(token);
        let labels = vec!["B-EPISODE".to_string(); pieces.len()];
        return Some((pieces, labels));
    }
    let caps = EPISODE_VALUE_RE.captures(token)?;
    let mut pieces = vec![caps[1].to_string()];
    let mut labels = vec!["O".to_string()];

    let number_pieces = split_generated_token(&caps[2]);
    labels.resize(labels.len() + number_pieces.len(), "B-EPISODE".to_string());
    pieces.extend(number_pieces);

    if let Some(version) = caps.get(3) {
        pieces.extend(["v".to_string(), version.as_str().to_string()]);
        labels.extend(["O".to_string(), "O".to_string()]);
    }
    Some((pieces, labels))
}

/// Splits a token matched by `SEASON_VALUE_RE` into the fixed marker `"S"`
/// (`"O"`) and the season value from capture 1 (`B-SEASON`).
fn split_season_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
    let caps = SEASON_VALUE_RE.captures(token)?;
    let pieces = vec!["S".to_string(), caps[1].to_string()];
    let labels = vec!["O".to_string(), "B-SEASON".to_string()];
    Some((pieces, labels))
}

/// Concatenates the tokens referenced by `group` and passes the result
/// through `strip_wrapper`.
fn group_text(tokens: &[String], group: &Group) -> String {
    let joined: String = group
        .indices
        .iter()
        .map(|&index| tokens[index].as_str())
        .collect();
    strip_wrapper(&joined)
}

/// Trims `value` and collapses every whitespace run to a single space.
fn normalize_whitelist_name(value: &str) -> String {
    value.split_whitespace().collect::<Vec<_>>().join(" ")
}

/// Breaks a whitelist phrase into the texts of its phrase-bearing groups
/// (see [`whitelist_phrase_group`]), dropping blank parts.
fn phrase_parts_for_whitelist(value: &str) -> Vec<String> {
    let tokens = tokenize(value);
    let classes: Vec<_> = tokens.iter().map(|token| classify_token(token)).collect();
    let groups = compact_token_groups(&tokens, &classes);

    let mut parts = Vec::new();
    for group in groups.iter() {
        if !whitelist_phrase_group(group) {
            continue;
        }
        let text = group_text(&tokens, group);
        if !text.trim().is_empty() {
            parts.push(text);
        }
    }
    parts
}

/// Group classes that may take part in whitelist title-phrase matching.
fn whitelist_phrase_group(group: &Group) -> bool {
    matches!(
        group.class_name.as_str(),
        "TEXT" | "EPISODE" | "SPECIAL" | "SEASON" | "BRACKET_TEXT"
    )
}

/// Returns `true` when `text` names bonus/special material rather than a
/// title.
///
/// The text is compared after `SPECIAL_SPACE_RE` matches are replaced with a
/// space, the result is trimmed and ASCII-uppercased. It qualifies when that
/// form equals a known phrase or contains a known fragment, when the raw
/// text contains "予告", or when `SPECIAL_TITLE_PHRASE_RE` matches the raw
/// text.
fn is_special_title_phrase(text: &str) -> bool {
    const EXACT_PHRASES: &[&str] = &[
        "CM",
        "EVENT",
        "EIZOU",
        "EXTRA",
        "EXTRAS",
        "LOGO",
        "MENU",
        "OMAKE",
        "PREVIEW",
        "PV",
        "THEATER GREETING EVENT",
        "TOKUTEN",
        "TRAILER",
        "TV SPOT",
        "SPOT",
        "WORLD PREMIERE",
        "予告",
        "番宣",
        "宣番",
        "映像特典",
        "特典",
    ];
    const FRAGMENTS: &[&str] = &["映像特典", "特典映像", "番宣", "宣番", "TV SPOT", "BD SPOT"];

    let normalized = SPECIAL_SPACE_RE
        .replace_all(text, " ")
        .trim()
        .to_ascii_uppercase();
    EXACT_PHRASES.contains(&normalized.as_str())
        || FRAGMENTS.iter().any(|fragment| normalized.contains(*fragment))
        || text.contains("予告")
        || SPECIAL_TITLE_PHRASE_RE.is_match(text)
}

/// Heuristic: does `text` look like a release/fansub group name?
fn looks_like_release_group(text: &str) -> bool { let normalized = text.to_ascii_lowercase(); normalized.contains("fansub") || normalized.ends_with("sub") || normalized.contains("sub&") ||
normalized.contains("&sub") || normalized.contains("字幕组") || normalized.contains("字幕組") }

/// Joins, with single spaces, the texts of all groups before `index` whose
/// role is exactly `"TITLE"`.
fn title_context_before(
    tokens: &[String],
    groups: &[Group],
    roles: &[String],
    index: usize,
) -> String {
    let titles: Vec<String> = (0..index)
        .filter(|&cursor| roles[cursor] == "TITLE")
        .map(|cursor| group_text(tokens, &groups[cursor]))
        .collect();
    titles.join(" ")
}

/// Returns `true` when `number`, following the title text `context`, is a
/// known case of a number that belongs to the title itself.
fn short_number_title_exception(context: &str, number: &str) -> bool {
    let normalized = normalized_tag_text(context);
    let compact = compact_tag_text(context);
    let number_specific = match number {
        "2" => matches!(
            normalized.as_str(),
            "kamisama hajimemashita" | "ghiblies episode"
        ),
        "15" => normalized == "r",
        "91" => normalized.contains("91 days"),
        "999" => context.contains("銀河鉄道"),
        "00" => context.contains("機動戦士ガンダム"),
        _ => false,
    };
    // This context matches regardless of which number follows.
    number_specific || compact.contains("highschooldd")
}

/// Returns `true` when the first non-whitespace token after the group at
/// `index` is an opening quote (`「`, `"` or `'`).
fn group_followed_by_quote(tokens: &[String], groups: &[Group], index: usize) -> bool {
    let Some(&last_token) = groups.get(index).and_then(|group| group.indices.last()) else {
        return false;
    };
    // NOTE(review): the original listed "「" twice; possibly a second quote
    // character (e.g. "『") was intended — confirm before extending.
    tokens[last_token + 1..]
        .iter()
        .find(|token| !token.chars().all(char::is_whitespace))
        .is_some_and(|token| matches!(token.as_str(), "「" | "\"" | "'"))
}

/// Built-in title phrases, each given as the sequence of group texts to
/// match. They are matched ASCII-case-insensitively and may override
/// structural roles (see `apply_known_title_phrases`).
const KNOWN_TITLE_PHRASES: &[&[&str]] = &[
    &["SPY", "x", "FAMILY"],
    &["Spy", "x", "Family"],
    &["Slime", "300"],
    &["Zom", "100"],
    &["Kamisama", "Hajimemashita", "2"],
    &["Phantasy", "Star", "Online", "2", "Episode", "Oracle"],
    &["Durarara", "2", "Ketsu"],
    &["Ghiblies", "Episode", "2"],
    &["Eien", "no", "831"],
    &["Lupin The Thrid Jigen Daisuke no Bohyou"],
    &["Lupin The Third Jigen Daisuke no Bohyou"],
];

/// Applies whitelist knowledge to `roles` in place.
///
/// 1. When runtime whitelists are loaded, a `BRACKET_TEXT` group whose
///    normalized text is a whitelisted group name becomes `GROUP`, provided
///    no structural role precedes it and its own role is not structural.
/// 2. Every built-in phrase in `KNOWN_TITLE_PHRASES` is applied with
///    structural override allowed.
/// 3. Runtime whitelist phrases of at least two parts are applied without
///    structural override.
fn apply_known_title_phrases(tokens: &[String], groups: &[Group], roles: &mut [String]) {
    let whitelists = RUNTIME_WHITELISTS.get();

    if let Some(whitelists) = whitelists {
        // Running flag instead of rescanning `roles[..index]` per group.
        // Earlier entries are final by the time later groups are examined,
        // because each iteration only ever rewrites its own index.
        let mut structural_seen = false;
        for (index, group) in groups.iter().enumerate() {
            let promote = group.class_name == "BRACKET_TEXT"
                && !structural_seen
                && !roles.get(index).is_some_and(|role| {
                    matches!(
                        role.as_str(),
                        "EPISODE"
                            | "EPISODE_VERSION"
                            | "EPISODE_RANGE"
                            | "SEASON"
                            | "SOURCE"
                            | "RESOLUTION"
                            | "SPECIAL"
                    )
                })
                && whitelists
                    .group_names
                    .contains(&normalize_whitelist_name(&group_text(tokens, group)));
            if promote {
                roles[index] = "GROUP".to_string();
            }
            structural_seen |= roles.get(index).is_some_and(|role| {
                role.starts_with("EPISODE")
                    || matches!(
                        role.as_str(),
                        "SEASON" | "SPECIAL" | "SOURCE" | "RESOLUTION"
                    )
            });
        }
    }

    let searchable: Vec<(usize, String)> = groups
        .iter()
        .enumerate()
        .filter(|(_, group)| whitelist_phrase_group(group))
        .map(|(index, group)| (index, group_text(tokens, group)))
        .collect();

    for phrase in KNOWN_TITLE_PHRASES {
        apply_title_phrase(&searchable, phrase, roles, true);
    }
    if let Some(whitelists) = whitelists {
        for phrase in &whitelists.title_phrases {
            if phrase.len() >= 2 {
                apply_title_phrase(&searchable, phrase, roles, false);
            }
        }
    }
}

/// Marks every occurrence of `phrase` within `searchable` as `TITLE`.
///
/// `searchable` holds `(group index, group text)` pairs; a window matches
/// when each text equals the corresponding phrase part ignoring ASCII case.
/// Within a match, a `GROUP` role is kept when the whitelist check below
/// succeeds, and structural roles are kept unless
/// `allow_structural_override` is set.
fn apply_title_phrase(
    searchable: &[(usize, String)],
    phrase: &[impl AsRef<str>],
    roles: &mut [String],
    allow_structural_override: bool,
) {
    if phrase.is_empty() || phrase.len() > searchable.len() {
        return;
    }
    for window in searchable.windows(phrase.len()) {
        let is_match = window
            .iter()
            .zip(phrase.iter())
            .all(|((_, text), expected)| text.eq_ignore_ascii_case(expected.as_ref()));
        if !is_match {
            continue;
        }
        for &(group_index, _) in window {
            let current = roles.get(group_index).map(String::as_str);
            if current == Some("GROUP") {
                // NOTE(review): this looks up the text of the FIRST window
                // entry, not of the group being relabelled — confirm whether
                // the current group's text was intended.
                let is_known_group = RUNTIME_WHITELISTS.get().is_some_and(|whitelists| {
                    whitelists
                        .group_names
                        .contains(&normalize_whitelist_name(&window[0].1))
                });
                if is_known_group {
                    continue;
                }
            }
            if !allow_structural_override
                && matches!(
                    current,
                    Some(
                        "EPISODE"
                            | "EPISODE_VERSION"
                            | "EPISODE_RANGE"
                            | "SEASON"
                            | "SOURCE"
                            | "RESOLUTION"
                    )
                )
            {
                continue;
            }
            roles[group_index] = "TITLE".to_string();
        }
    }
}

/// Returns a copy of `roles` adjusted by context.
///
/// Known title phrases are applied first; after that an ordered series of
/// heuristics rewrites individual roles based on neighbouring groups and
/// roles. The input slice is not modified. Later rules read roles written
/// by earlier ones, so the statement order is significant.
fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]) -> Vec<String> {
    let mut output = roles.to_vec();
    let ep_markers = ["EP", "E", "Episode", "ep", "episode"];
    let roman = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX"];
    apply_known_title_phrases(tokens, groups, &mut output);
    if output.first().is_some_and(|role| role == "GROUP") {
        let first_text = group_text(tokens, &groups[0]);
        let first_is_known_group = RUNTIME_WHITELISTS.get().is_some_and(|whitelists| {
            whitelists
                .group_names
                .contains(&normalize_whitelist_name(&first_text))
        });
        if !first_is_known_group {
            if let Some(groupish_index) = (1..groups.len()).find(|&index| {
                output[index] == "TITLE"
                    && looks_like_release_group(&group_text(tokens, &groups[index]))
            }) {
                output[0] = "TITLE".to_string();
                output[groupish_index] = "GROUP".to_string();
            }
        }
    }
    if roles
        .first()
        .is_some_and(|role| role.starts_with("EPISODE"))
        && YEAR_RANGE_RE.is_match(&group_text(tokens, &groups[0]))
    {
        let first_real_structural = (1..roles.len())
            .find(|&index| {
                roles[index].starts_with("EPISODE")
                    || matches!(roles[index].as_str(), "SEASON" | "SPECIAL")
            })
            .unwrap_or(roles.len());
        for index in 1..first_real_structural {
            if groups[index].class_name == "TEXT"
                && !matches!(
                    group_text(tokens, &groups[index])
                        .to_ascii_uppercase()
                        .as_str(),
                    "TV" | "OVA" | "OAD" | "SP"
                )
            {
output[index] = "TITLE".to_string(); } } } if !output.iter().any(|role| role == "TITLE") && roles .first() .is_some_and(|role| role.starts_with("EPISODE")) { let mut title_run = Vec::new(); for index in 1..roles.len() { if groups[index].class_name == "TEXT" && output[index] == "O" { title_run.push(index); continue; } if groups[index].class_name == "SEP" { continue; } if !title_run.is_empty() { break; } } if title_run.len() >= 2 { let last_title_index = *title_run.last().unwrap(); let later_structural = roles[last_title_index + 1..].iter().any(|role| { role.starts_with("EPISODE") || matches!(role.as_str(), "SEASON" | "SPECIAL") }); if group_text(tokens, &groups[0]) .chars() .all(|ch| ch.is_ascii_digit()) && later_structural { output[0] = "TITLE".to_string(); } for index in title_run { output[index] = "TITLE".to_string(); } } } if roles .first() .is_some_and(|role| role.starts_with("EPISODE")) && group_text(tokens, &groups[0]) .chars() .all(|ch| ch.is_ascii_digit()) { if let Some(first_title) = output.iter().position(|role| role == "TITLE") { let later_structural = roles[first_title + 1..].iter().any(|role| { role.starts_with("EPISODE") || matches!(role.as_str(), "SEASON" | "SPECIAL") }); if later_structural { output[0] = "TITLE".to_string(); } } } for index in 0..roles.len() { let text = group_text(tokens, &groups[index]); if output[index] == "O" && groups[index].class_name.contains("SXE") { output[index] = "EPISODE".to_string(); } if text.eq_ignore_ascii_case("TV") { let next_text = (index + 1..roles.len()) .find(|&cursor| groups[cursor].class_name != "SEP") .map(|cursor| (cursor, group_text(tokens, &groups[cursor]))); if let Some((spot_index, spot_text)) = next_text { if spot_text.eq_ignore_ascii_case("Spot") { output[index] = "SPECIAL".to_string(); output[spot_index] = "SPECIAL".to_string(); continue; } } } if roles[index].starts_with("EPISODE") && YEAR_RANGE_RE.is_match(&text) { output[index] = "O".to_string(); continue; } if roles[index].starts_with("EPISODE") 
&& index >= 2 && matches!( group_text(tokens, &groups[index - 1]).as_str(), "×" | "x" | "X" ) && output[index - 2] == "TITLE" && !roles[index + 1..] .iter() .any(|role| role.starts_with("EPISODE")) { output[index] = "TITLE".to_string(); if let Some(next_text_index) = (index + 1..roles.len()).find(|&cursor| { groups[cursor].class_name != "SEP" && groups[cursor].class_name == "TEXT" }) { output[next_text_index] = "TITLE".to_string(); } continue; } if roles[index].starts_with("EPISODE") && !output[..index] .iter() .any(|role| role.starts_with("EPISODE")) && group_text( tokens, &groups[(0..index) .rev() .find(|&cursor| groups[cursor].class_name != "SEP") .unwrap_or(index)], ) .eq_ignore_ascii_case("Movie") { output[index] = "TITLE".to_string(); continue; } if output[index] == "TITLE" && matches!(text.as_str(), "中日" | "日中" | "英日" | "日英") { let next_source_lang = (index + 1..roles.len()) .find(|&cursor| groups[cursor].class_name != "SEP") .is_some_and(|cursor| { output[cursor] == "SOURCE" && group_text(tokens, &groups[cursor]).contains('语') }); if next_source_lang { output[index] = "SOURCE".to_string(); continue; } } if roles[index].starts_with("EPISODE") && index >= 1 && output[..index].iter().any(|role| role == "TITLE") && text.chars().all(|ch| ch.is_ascii_digit()) && short_number_title_exception( &title_context_before(tokens, groups, &output, index), &text, ) { output[index] = "TITLE".to_string(); continue; } if roles[index].starts_with("EPISODE") && index >= 1 && output[index - 1] == "TITLE" && groups[index - 1].class_name != "SEP" && text.chars().all(|ch| ch.is_ascii_digit()) && text.len() <= 2 && roles[index + 1..] 
.iter() .any(|role| role.starts_with("EPISODE")) && !group_followed_by_quote(tokens, groups, index) { let context = title_context_before(tokens, groups, &output, index); output[index] = if short_number_title_exception(&context, &text) { "TITLE" } else { "SEASON" } .to_string(); continue; } if roles[index].starts_with("EPISODE") && (2..roles.len()).contains(&index) { let previous_text = group_text(tokens, &groups[index - 2]); let next_special = output[index + 1..roles.len().min(index + 4)] .iter() .any(|role| role == "SPECIAL"); let next_episode = roles[index + 1..] .iter() .any(|role| role.starts_with("EPISODE")); if groups[index - 1].class_name == "SEP" && matches!( previous_text.to_ascii_lowercase().as_str(), "vol" | "volume" ) { let next_text_before_episode = (index + 1..roles.len()) .find(|&cursor| groups[cursor].class_name != "SEP") .is_some_and(|cursor| { groups[cursor].class_name == "TEXT" && roles[cursor + 1..] .iter() .any(|role| role.starts_with("EPISODE")) }); if next_text_before_episode { output[index - 2] = "TITLE".to_string(); output[index] = "TITLE".to_string(); continue; } output[index - 2] = "SPECIAL".to_string(); output[index] = "SPECIAL".to_string(); continue; } if index >= 1 && output[index - 1] == "TITLE" && groups[index - 1].class_name != "SEP" && text.chars().all(|ch| ch.is_ascii_digit()) && text.len() <= 2 && roles[index + 1..] .iter() .any(|role| role.starts_with("EPISODE")) && !group_followed_by_quote(tokens, groups, index) { let context = title_context_before(tokens, groups, &output, index); output[index] = if short_number_title_exception(&context, &text) { "TITLE" } else { "SEASON" } .to_string(); continue; } if !output[..index].iter().any(|role| role == "TITLE") && NUMERIC_TITLE_PREFIX_RE.is_match(&text) && output[..index].iter().any(|role| role == "GROUP") && roles[index + 1..] 
.iter() .any(|role| role.starts_with("EPISODE")) { output[index] = "TITLE".to_string(); continue; } if !output[..index].iter().any(|role| role == "TITLE") && NUMERIC_TITLE_PREFIX_RE.is_match(&text) && index + 2 < roles.len() && groups[index + 1].class_name == "SEP" && groups[index + 2].class_name == "TEXT" && group_text(tokens, &groups[index + 2]) .chars() .any(|ch| ch.is_alphabetic()) && roles[index + 3..] .iter() .any(|role| role.starts_with("EPISODE")) { output[index] = "TITLE".to_string(); output[index + 2] = "TITLE".to_string(); continue; } if output[index - 2] == "TITLE" && groups[index - 1].class_name == "SEP" && previous_text.len() <= 48 && previous_text.chars().any(|ch| ch.is_alphabetic()) && text.chars().all(|ch| ch.is_ascii_digit()) && text.len() <= 2 && !(index + 2 < roles.len() && groups[index + 1].class_name == "SEP" && group_text(tokens, &groups[index + 2]).eq_ignore_ascii_case("episode")) && !(index + 1 < roles.len() && groups[index + 1].class_name == "SEP" && group_text(tokens, &groups[index + 1]) .chars() .any(|ch| matches!(ch, '「' | '「' | '"' | '\''))) && !group_followed_by_quote(tokens, groups, index) && (next_episode || (next_special && (text.parse::().is_ok_and(|value| value >= 100) || (previous_text.len() <= 4 && previous_text.is_ascii() && previous_text.chars().all(|ch| ch.is_ascii_alphabetic()))))) { output[index] = if next_episode && !short_number_title_exception( &title_context_before(tokens, groups, &output, index), &text, ) { "SEASON" } else { "TITLE" } .to_string(); continue; } } if roles[index].starts_with("EPISODE") && (text.chars().all(|ch| ch.is_ascii_digit()) || matches!(classify_atom(&text).as_str(), "EPISODE" | "EPISODE_VERSION")) && output[..index].iter().any(|role| role == "SPECIAL") && !output[..index] .iter() .any(|role| role.starts_with("EPISODE")) { let previous_structural = (0..index) .rev() .find(|&cursor| groups[cursor].class_name != "SEP") .and_then(|cursor| output.get(cursor)) .map(String::as_str); let next_real = 
(index + 1..roles.len()) .find(|&cursor| groups[cursor].class_name != "SEP") .and_then(|cursor| roles.get(cursor)) .map(String::as_str); if matches!(previous_structural, Some("SPECIAL")) && !matches!(next_real, Some("TITLE" | "SEASON")) { output[index] = "SPECIAL".to_string(); continue; } } if roles[index].starts_with("EPISODE") && BARE_RESOLUTION_RE.is_match(&text) && index >= 2 && groups[index - 1].class_name == "SEP" { let previous_text = group_text(tokens, &groups[index - 2]); let next_sourceish = (index + 1..roles.len()) .find(|&cursor| groups[cursor].class_name != "SEP") .is_some_and(|cursor| matches!(roles[cursor].as_str(), "SOURCE" | "RESOLUTION")); if previous_text .chars() .any(|ch| ch.is_ascii_digit() || matches!(ch, '.' | '-' | '_' | '.')) || next_sourceish { output[index] = "RESOLUTION".to_string(); continue; } } if roles[index].starts_with("EPISODE") && index >= 2 && output[..index].iter().any(|role| role == "TITLE") && group_text(tokens, &groups[index]) .chars() .all(|ch| ch.is_ascii_digit()) { let next_episode_word = index + 2 < roles.len() && groups[index + 1].class_name == "SEP" && group_text(tokens, &groups[index + 2]).eq_ignore_ascii_case("episode"); if next_episode_word { let mut run = Vec::new(); let mut cursor = index + 2; while cursor < roles.len() { if groups[cursor].class_name == "SEP" { cursor += 1; continue; } if groups[cursor].class_name == "TEXT" && !roles[cursor].starts_with("EPISODE") { run.push(cursor); cursor += 1; continue; } break; } let later_episode = roles[cursor..] 
.iter() .any(|role| role.starts_with("EPISODE")); if run.len() >= 2 && later_episode { output[index] = "TITLE".to_string(); for item in run { output[item] = "TITLE".to_string(); } continue; } } } if roles[index] == "TITLE" && is_special_title_phrase(&text) { output[index] = "SPECIAL".to_string(); continue; } if roles[index] == "TITLE" && matches!(text.to_ascii_uppercase().as_str(), "TV" | "TV版") && output .iter() .enumerate() .any(|(other, role)| other != index && role == "TITLE") { output[index] = "O".to_string(); continue; } if roles[index] == "TITLE" && matches!(text.as_str(), "TVアニメ" | "テレビアニメ") && output .iter() .enumerate() .any(|(other, role)| other != index && role == "TITLE") { output[index] = "O".to_string(); continue; } if output[index] == "TITLE" && text.eq_ignore_ascii_case("Creditless") { let later_special = output[index + 1..].iter().any(|role| role == "SPECIAL"); if later_special { output[index] = "SPECIAL".to_string(); continue; } } if roles[index] == "TITLE" && matches!(text.as_str(), "第" | "話" | "话" | "回" | "集") { output[index] = "O".to_string(); continue; } if output[index] == "O" && groups[index].class_name == "TEXT" && roles[index + 1..] 
.iter() .any(|role| role.starts_with("EPISODE")) && text.chars().any(|ch| ch.is_alphabetic()) && !ep_markers.contains(&text.as_str()) { if !output[..index].iter().any(|role| role == "TITLE") { let previous_structural = (0..index) .rev() .find(|&cursor| groups[cursor].class_name != "SEP") .and_then(|cursor| output.get(cursor)) .map(String::as_str); if matches!(previous_structural, Some("SPECIAL")) { output[index] = "TITLE".to_string(); continue; } } if let Some(last_title) = output[..index].iter().rposition(|role| role == "TITLE") { let episode_since_title = output[last_title + 1..index] .iter() .any(|role| role.starts_with("EPISODE")); if !episode_since_title { output[index] = "TITLE".to_string(); continue; } } } if roles[index] == "TITLE" && matches!(text.to_ascii_lowercase().as_str(), "season" | "saison") && index + 2 < roles.len() && groups[index + 1].class_name == "SEP" && roles[index + 2].starts_with("EPISODE") { if !output[..index].iter().any(|role| role == "TITLE") { output[index] = "O".to_string(); output[index + 2] = "SEASON".to_string(); } continue; } if roles[index] == "TITLE" && text == text.to_ascii_uppercase() && roman.contains(&text.as_str()) { let previous_title = output[..index].iter().any(|role| role == "TITLE"); let next_structural = roles[index + 1..] 
.iter() .any(|role| role.starts_with("EPISODE") || role == "SPECIAL"); if previous_title && next_structural { output[index] = "SEASON".to_string(); continue; } } if roles[index].starts_with("EPISODE") && index + 4 < roles.len() { if groups[index + 1].class_name == "SEP" && ep_markers.contains(&group_text(tokens, &groups[index + 2]).as_str()) && groups[index + 3].class_name == "SEP" && roles[index + 4].starts_with("EPISODE") { output[index] = "TITLE".to_string(); output[index + 2] = "O".to_string(); } } if roles[index].starts_with("EPISODE") && !output[index + 1..].iter().any(|role| role == "TITLE") { let mut run = Vec::new(); let mut cursor = index + 1; while cursor < roles.len() { if groups[cursor].class_name == "SEP" { cursor += 1; continue; } if groups[cursor].class_name == "TEXT" && !matches!( roles[cursor].as_str(), "SOURCE" | "RESOLUTION" | "SEASON" | "SPECIAL" ) { run.push(cursor); cursor += 1; continue; } if !run.is_empty() { break; } cursor += 1; } if run.len() >= 2 { for item in run { output[item] = "TITLE".to_string(); } } } if roles[index].starts_with("EPISODE") { let previous_text = if index >= 1 { group_text(tokens, &groups[index - 1]) } else { String::new() }; let next_text = if index + 1 < roles.len() { group_text(tokens, &groups[index + 1]) } else { String::new() }; let previous_real_text = (0..index) .rev() .find(|&cursor| groups[cursor].class_name != "SEP") .map(|cursor| group_text(tokens, &groups[cursor])) .unwrap_or_default(); if previous_text.ends_with('第') && next_text.starts_with('期') { output[index] = "SEASON".to_string(); continue; } if matches!( previous_real_text.to_ascii_lowercase().as_str(), "lesson" | "part" | "no" ) { output[index] = "O".to_string(); continue; } if previous_real_text.contains("予告") || previous_real_text.eq_ignore_ascii_case("Spot") { output[index] = "SPECIAL".to_string(); continue; } if output[..index].iter().any(|role| role == "TITLE") && (output[..index].iter().enumerate().any(|(cursor, role)| { role == "TITLE" && 
is_special_title_phrase(&group_text(tokens, &groups[cursor]))
                }))
                && !output[..index]
                    .iter()
                    .any(|role| role.starts_with("EPISODE"))
                && text.chars().all(|ch| ch.is_ascii_digit())
                && text.len() <= 3
            {
                output[index] = "SPECIAL".to_string();
                continue;
            }
            if previous_text.contains('点')
                || previous_text.contains('點')
                || previous_text.contains("晚上")
                || previous_text.contains("上午")
                || previous_text.contains("下午")
                || previous_text.contains('年')
                || previous_text.contains('月')
                || previous_text.contains('秒')
                || next_text.contains('点')
                || next_text.contains('點')
                || next_text.contains('半')
                || next_text.contains('月')
                || next_text.contains('日')
                || next_text.contains('秒')
            {
                output[index] = "O".to_string();
            }
        }
    }
    output
}

/// Finds the half-open `(start, end)` index ranges of title candidates.
///
/// A candidate starts at a title role and keeps growing over following
/// title roles, and over a single `"O"`-role `SEP` group that is directly
/// followed by a title role. Two adjacent `BRACKET_TEXT` title groups are
/// never joined; the second one starts a new candidate.
fn title_candidates(groups: &[Group], roles: &[String]) -> Vec<(usize, usize)> {
    let title_at = |index: usize| roles.get(index).is_some_and(|role| is_title_role(role));
    let bracket_pair = |index: usize| {
        groups[index - 1].class_name == "BRACKET_TEXT"
            && groups[index].class_name == "BRACKET_TEXT"
    };

    let mut candidates = Vec::new();
    let mut cursor = 0;
    while cursor < roles.len() {
        if !title_at(cursor) {
            cursor += 1;
            continue;
        }
        let start = cursor;
        cursor += 1;
        loop {
            if title_at(cursor) && !bracket_pair(cursor) {
                cursor += 1;
            } else if title_at(cursor + 1)
                && roles[cursor] == "O"
                && groups[cursor].class_name == "SEP"
            {
                // Bridge one separator between two title groups.
                cursor += 2;
            } else {
                break;
            }
        }
        candidates.push((start, cursor));
    }
    candidates
}

/// Keeps the best title candidate per title kind and demotes the rest.
///
/// Returns the adjusted roles plus the positions (as decimal strings) of
/// every title role that was reset to `"O"`. With zero or one candidate the
/// roles are returned unchanged.
///
/// Candidates that end at or before the first structural anchor (episode,
/// season, special, source or resolution role) are preferred; when there
/// are none, or they consist only of `PATH_TITLE_*` roles, all candidates
/// compete. Within a kind (see `title_candidate_key`) the winner has the
/// greatest `(score, end, length)`, and the earliest candidate wins ties.
fn enforce_single_title_candidate(
    tokens: &[String],
    groups: &[Group],
    roles: &[String],
) -> (Vec<String>, Vec<String>) {
    use std::collections::hash_map::Entry;

    let candidates = title_candidates(groups, roles);
    if candidates.len() <= 1 {
        return (roles.to_vec(), Vec::new());
    }

    let first_anchor = roles
        .iter()
        .position(|role| {
            role.starts_with("EPISODE")
                || matches!(
                    role.as_str(),
                    "SEASON" | "PATH_SEASON" | "SPECIAL" | "SOURCE" | "RESOLUTION"
                )
        })
        .unwrap_or(roles.len());
    let before_anchor: Vec<(usize, usize)> = candidates
        .iter()
        .copied()
        .filter(|&(_, end)| end <= first_anchor)
        .collect();
    let only_path_titles = |&(start, end): &(usize, usize)| {
        (start..end)
            .all(|index| !is_title_role(&roles[index]) || is_path_title_role(&roles[index]))
    };
    let pool = if before_anchor.is_empty() || before_anchor.iter().all(only_path_titles) {
        &candidates
    } else {
        &before_anchor
    };

    // Best (range, score) seen so far for each candidate kind.
    let mut best_by_kind: HashMap<String, ((usize, usize), (isize, usize, usize))> =
        HashMap::new();
    for &(start, end) in pool {
        let score = (
            title_candidate_score(tokens, groups, start, end),
            end,
            end - start,
        );
        let key = title_candidate_key(tokens, groups, roles, start, end);
        match best_by_kind.entry(key) {
            Entry::Occupied(mut slot) => {
                // Strictly better only: ties keep the earlier candidate.
                if slot.get().1 < score {
                    slot.insert(((start, end), score));
                }
            }
            Entry::Vacant(slot) => {
                slot.insert(((start, end), score));
            }
        }
    }
    let selected: HashSet<(usize, usize)> =
        best_by_kind.values().map(|(range, _)| *range).collect();

    let mut output = roles.to_vec();
    let mut dropped = Vec::new();
    for &(start, end) in &candidates {
        if selected.contains(&(start, end)) {
            continue;
        }
        for index in start..end {
            if is_title_role(&output[index]) {
                output[index] = "O".to_string();
                dropped.push(index.to_string());
            }
        }
    }
    (output, dropped)
}

/// Computes the kind key under which a title candidate competes.
///
/// The key is the sorted, de-duplicated set of language-specific title
/// entities in the range joined with `+`. When the range only holds plain
/// `TITLE` roles, the key is the `TITLE_<suffix>` role derived from the
/// candidate text.
fn title_candidate_key(
    tokens: &[String],
    groups: &[Group],
    roles: &[String],
    start: usize,
    end: usize,
) -> String {
    let mut entities: Vec<&str> = roles[start..end]
        .iter()
        .filter_map(|role| title_entity_from_role(role))
        .filter(|entity| *entity != "TITLE")
        .collect();
    entities.sort_unstable();
    entities.dedup();
    if entities.is_empty() {
        let text = candidate_text(tokens, groups, start, end);
        return title_role_for_text(&text, false);
    }
    entities.join("+")
}

/// Scores a title candidate: the number of alphanumeric characters in its
/// `TEXT`/`BRACKET_TEXT` groups, minus penalties for version-like text
/// (500), generic category words (500) and release noise (700). A
/// candidate with no text scores -1000.
fn title_candidate_score(tokens: &[String], groups: &[Group], start: usize, end: usize) -> isize {
    let text: String = (start..end)
        .filter(|&index| roles_candidate_text_group(&groups[index]))
        .map(|index| group_text(tokens, &groups[index]))
        .collect();
    let cleaned = text.trim();
    if cleaned.is_empty() {
        return -1000;
    }
    let is_generic_word = matches!(
        cleaned.to_ascii_lowercase().as_str(),
        "国漫" | "國漫" | "anime" | "movie" | "movies"
    );

    let mut score = cleaned.chars().filter(|ch| ch.is_alphanumeric()).count() as isize;
    if VERSIONISH_TITLE_RE.is_match(cleaned) {
        score -= 500;
    }
    if is_generic_word {
        score -= 500;
    }
    if title_noise_score_penalty(cleaned) {
        score -= 700;
    }
    score
}

/// Returns `true` when `text` contains a release-noise marker, after `_`,
/// `-` and `.` become spaces, whitespace is collapsed and the text is
/// ASCII-lowercased.
fn title_noise_score_penalty(text: &str) -> bool {
    const NOISE_MARKERS: &[&str] = &["bdrip", "webrip", "web dl", "bluray", "full hd", "hdtv"];

    let normalized = text
        .replace(['_', '-', '.'], " ")
        .split_whitespace()
        .collect::<Vec<_>>()
        .join(" ")
        .to_ascii_lowercase();
    NOISE_MARKERS.iter().any(|marker| normalized.contains(*marker))
}

/// Group classes whose text contributes to a title candidate's score.
fn roles_candidate_text_group(group: &Group) -> bool {
    matches!(group.class_name.as_str(), "TEXT" | "BRACKET_TEXT")
}

/// Splits every token with `split_generated_token`; each piece inherits the
/// token's label, except standalone separators, which are always `"O"`.
fn normalize_generated_tokens(tokens: &[String], labels: &[String]) -> (Vec<String>, Vec<String>) {
    let mut output_tokens = Vec::new();
    let mut output_labels = Vec::new();
    for (token, label) in tokens.iter().zip(labels) {
        for piece in split_generated_token(token) {
            let piece_label = if label == "O" || is_standalone_separator(&piece) {
                "O".to_string()
            } else {
                label.clone()
            };
            output_labels.push(piece_label);
            output_tokens.push(piece);
        }
    }
    (output_tokens, output_labels)
}

/// Splits a title token into pieces and labels, pulling out season and
/// episode markers embedded in the title text.
///
/// Each piece from `split_generated_token` is tested against the rules below
/// in order; the first match decides its labels, and a piece that matches
/// nothing is `B-TITLE`.
fn normalize_title_token(token: &str) -> (Vec<String>, Vec<String>) {
    // Appends one piece together with its label.
    fn emit(
        pieces: &mut Vec<String>,
        labels: &mut Vec<String>,
        piece: impl Into<String>,
        label: &str,
    ) {
        pieces.push(piece.into());
        labels.push(label.to_string());
    }

    let mut output_pieces = Vec::new();
    let mut labels = Vec::new();
    for piece in split_generated_token(token) {
        if is_standalone_separator(&piece) {
            emit(&mut output_pieces, &mut labels, piece, "O");
            continue;
        }
        if let Some((sxe_pieces, sxe_labels)) = split_sxe_token(&piece) {
            output_pieces.extend(sxe_pieces);
            labels.extend(sxe_labels);
            continue;
        }

        // Pieces that are a season/episode marker as a whole.
        let whole_label = if EPISODE_VERSION_RE.is_match(&compact_for_classify(&piece)) {
            Some("B-EPISODE")
        } else if CJK_SEASON_TOKEN_RE.is_match(&piece) || SEASON_RE.is_match(&piece) {
            Some("B-SEASON")
        } else if EPISODE_CJK_RE.is_match(&piece) {
            Some("B-EPISODE")
        } else {
            None
        };
        if let Some(label) = whole_label {
            emit(&mut output_pieces, &mut labels, piece, label);
            continue;
        }

        // Title text followed by an ASCII season suffix.
        if let Some(caps) = ASCII_SEASON_SUFFIX_RE.captures(&piece) {
            let before = caps.get(1).map_or("", |m| m.as_str());
            let season = caps.get(2).map_or("", |m| m.as_str());
            if !before.is_empty() {
                emit(&mut output_pieces, &mut labels, before, "B-TITLE");
            }
            emit(&mut output_pieces, &mut labels, season, "B-SEASON");
            continue;
        }
        // Season marker embedded in CJK text: title, season, remainder.
        if let Some(caps) = CJK_SEASON_EMBEDDED_RE.captures(&piece) {
            let before = caps.get(1).map_or("", |m| m.as_str());
            let season = caps.get(2).map_or("", |m| m.as_str());
            let after = caps.get(3).map_or("", |m| m.as_str());
            if !before.is_empty() {
                emit(&mut output_pieces, &mut labels, before, "B-TITLE");
            }
            emit(&mut output_pieces, &mut labels, season, "B-SEASON");
            if !after.is_empty() {
                emit(&mut output_pieces, &mut labels, after, "O");
            }
            continue;
        }
        // Episode marker embedded in CJK text: title, episode, remainder.
        if let Some(caps) = CJK_EPISODE_EMBEDDED_RE.captures(&piece) {
            let before = caps.get(1).map_or("", |m| m.as_str());
            let episode = caps.get(2).map_or("", |m| m.as_str());
            let after = caps.get(3).map_or("", |m| m.as_str());
            if !before.is_empty() {
                emit(&mut output_pieces, &mut labels, before, "B-TITLE");
            }
            emit(&mut output_pieces, &mut labels, episode, "B-EPISODE");
            if !after.is_empty() {
                emit(&mut output_pieces, &mut labels, after, "O");
            }
            continue;
        }
        // CJK title with a trailing number.
        if let Some(caps) = CJK_TITLE_TRAILING_EPISODE_RE.captures(&piece) {
            let before = caps.get(1).map_or("", |m| m.as_str());
            let episode = caps.get(2).map_or("", |m| m.as_str());
            // The "999" after 銀河鉄道 is part of the title, not an episode.
            let number_label = if before.contains("銀河鉄道") && episode == "999" {
                "B-TITLE"
            } else {
                "B-EPISODE"
            };
            if !before.is_empty() {
                emit(&mut output_pieces, &mut labels, before, "B-TITLE");
            }
            emit(&mut output_pieces, &mut labels, episode, number_label);
            continue;
        }

        emit(&mut output_pieces, &mut labels, piece, "B-TITLE");
    }
    (output_pieces, labels)
}

/// Splits `token` into runs of alphanumeric characters and single
/// non-alphanumeric characters, in order. No character is dropped.
fn split_generated_token(token: &str) -> Vec<String> {
    let mut pieces = Vec::new();
    let mut run = String::new();
    for ch in token.chars() {
        if ch.is_alphanumeric() {
            run.push(ch);
            continue;
        }
        if !run.is_empty() {
            pieces.push(std::mem::take(&mut run));
        }
        pieces.push(ch.to_string());
    }
    if !run.is_empty() {
        pieces.push(run);
    }
    pieces
}

/// Returns `true` when `token` is exactly one non-alphanumeric character.
fn is_standalone_separator(token: &str) -> bool {
    let mut chars = token.chars();
    matches!((chars.next(), chars.next()), (Some(ch), None) if !ch.is_alphanumeric())
}

/// Returns `true` when `token` is a single Unicode Roman numeral from Ⅰ
/// to Ⅻ.
fn is_unicode_roman_season(token: &str) -> bool {
    matches!(
        token,
        "Ⅰ" | "Ⅱ" | "Ⅲ" | "Ⅳ" | "Ⅴ" | "Ⅵ" | "Ⅶ" | "Ⅷ" | "Ⅸ" | "Ⅹ" | "Ⅺ" | "Ⅻ"
    )
}

/// Splits a token matched by `CJK_TITLE_LANG_PREFIX_RE` into title
/// (`B-TITLE`), language (`B-SOURCE`) and an optional trailing marker
/// (`"O"`).
///
/// Returns `None` when the regex does not match, when capture 1 or 2 is
/// absent, or when the title part is shorter than two characters.
fn split_cjk_title_lang_prefix(token: &str) -> Option<(Vec<String>, Vec<String>)> {
    let caps = CJK_TITLE_LANG_PREFIX_RE.captures(token)?;
    let title = caps.get(1)?.as_str();
    let lang = caps.get(2)?.as_str();
    let marker = caps.get(3).map_or("", |m| m.as_str());
    // Fewer than two characters in the title part: not a usable split.
    if title.chars().nth(1).is_none() {
        return None;
    }
    let mut pieces = vec![title.to_string(), lang.to_string()];
    let mut labels = vec!["B-TITLE".to_string(), "B-SOURCE".to_string()];
    if !marker.is_empty() {
        pieces.push(marker.to_string());
        labels.push("O".to_string());
    }
    Some((pieces, labels))
}

fn project_refined_tokens( tokens: &[String], groups: &[Group], roles: &[String], ) -> (Vec<String>, Vec<String>) { let mut output_tokens = Vec::new(); let mut output_labels = Vec::new(); for (group_index, group) in groups.iter().enumerate() { let mut role = roles.get(group_index).map(String::as_str).unwrap_or("O"); if matches!(group.class_name.as_str(), "SEP" | "PATH" | "EMPTY") { role = "O"; } for &index in &group.indices { let token = &tokens[index]; if matches!( role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE" | "SOURCE" | "RESOLUTION" | "SEASON" | "PATH_SEASON" ) { if matches!(role, "SEASON" | "PATH_SEASON") { if let
Some((pieces, labels)) = split_season_token(token) { output_tokens.extend(pieces); output_labels.extend(labels); continue; } } if role == "SOURCE" { if let Some((pieces, labels)) = split_cjk_title_lang_prefix(token) { output_tokens.extend(pieces); output_labels.extend(labels); continue; } } if matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE") { if let Some((pieces, labels)) = split_sxe_token(&strip_wrapper(token)) { output_tokens.extend(pieces); output_labels.extend(labels); continue; } if let Some((pieces, labels)) = split_episode_token(&strip_wrapper(token)) { output_tokens.extend(pieces); output_labels.extend(labels); continue; } } for piece in split_refined_token(token) { if matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE") { if let Some((pieces, labels)) = split_season_token(&piece) { output_tokens.extend(pieces); output_labels.extend(labels); continue; } if let Some((pieces, labels)) = split_episode_token(&piece) { output_tokens.extend(pieces); output_labels.extend(labels); continue; } } let label = label_for_refined_piece(&piece, role, &group.class_name); let (pieces, labels) = normalize_generated_tokens(&[piece], &[label]); output_tokens.extend(pieces); output_labels.extend(labels); } } else { if is_title_role(role) && matches!(token.as_str(), "第" | "話" | "话" | "回" | "集") { output_tokens.push(token.clone()); output_labels.push("O".to_string()); continue; } if is_title_role(role) && token.ends_with('第') && token.chars().count() > 1 { let trimmed = token.trim_end_matches('第').to_string(); let (pieces, labels) = normalize_generated_tokens( &[trimmed, "第".to_string()], &["B-TITLE".to_string(), "O".to_string()], ); output_tokens.extend(pieces); output_labels.extend(labels); continue; } if is_title_role(role) { let (pieces, labels) = normalize_title_token(token); output_tokens.extend(pieces); output_labels.extend(labels); continue; } let (pieces, labels) = normalize_generated_tokens(&[token.clone()], &[role_label(role)]); 
output_tokens.extend(pieces); output_labels.extend(labels); } } } (output_tokens, output_labels) } fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec { let joiners = [ " ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?", "?", ";", ";", ",", ",", "、", "。", "~", "~", "-", "+", "+", "(", ")", "(", ")", "[", "]", "【", "】", "<", ">", "<", ">", "「", "」", "「", "」", "《", "》", "☆", "♪", "`", "@", "‐", "‑", "–", "—", "−", "$", "$", "∽", "꞉", "♥", ]; let title_terminal_punctuation = ["!", "!", "?", "?"]; let entity_joiners = [ " ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?", "?", ";", ";", ",", ",", "、", "。", "~", "~", "-", "+", "+", "(", ")", "(", ")", "[", "]", "【", "】", "<", ">", "<", ">", "「", "」", "「", "」", "《", "》", "☆", "♪", "`", "@", "&", "&", "‐", "‑", "–", "—", "−", "$", "$", "∽", "꞉", "♥", ]; let mut output = labels.to_vec(); for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() { if label == "B-TITLE" && token == "TV" && index + 1 < tokens.len() && tokens[index + 1] == "アニメ" && output[index + 2..].iter().any(|label| label == "B-TITLE") { output[index] = "O".to_string(); output[index + 1] = "O".to_string(); continue; } if label == "B-TITLE" && token == "アニメ" && output[index + 1..].iter().any(|label| label == "B-TITLE") { output[index] = "O".to_string(); continue; } if label == "B-TITLE" && token.eq_ignore_ascii_case("part") { let next_number = (index + 1..tokens.len()).find(|&cursor| { !joiners.contains(&tokens[cursor].as_str()) && !tokens[cursor].chars().all(char::is_whitespace) }); let nearby_lupin = tokens[..index] .iter() .rev() .take(8) .any(|item| item.eq_ignore_ascii_case("lupin")) || tokens[index + 1..] 
.iter() .take(12) .any(|item| item.eq_ignore_ascii_case("lupin")); if nearby_lupin && next_number.is_some_and(|cursor| { tokens[cursor].chars().all(|ch| ch.is_ascii_digit()) && tokens[cursor].len() <= 2 }) { output[index] = "B-SEASON".to_string(); continue; } } if label == "B-TITLE" && token.chars().all(|ch| ch.is_ascii_digit()) && token.len() == 3 && index + 1 < tokens.len() && matches!(tokens[index + 1].as_str(), "「" | "「" | "\"" | "'") { output[index] = "B-EPISODE".to_string(); let mut cursor = index + 1; while cursor < tokens.len() { output[cursor] = "O".to_string(); if matches!(tokens[cursor].as_str(), "」" | "」" | "\"" | "'") && cursor > index + 1 { break; } cursor += 1; } continue; } if label == "B-TITLE" && matches!(token.as_str(), "中日" | "日中" | "英日" | "日英") { let next_word = (index + 1..tokens.len()) .find(|&cursor| tokens[cursor].chars().any(|ch| ch.is_alphanumeric())); if next_word .is_some_and(|cursor| labels[cursor] == "B-SOURCE" && tokens[cursor].contains('语')) { output[index] = "B-SOURCE".to_string(); continue; } } if label == "B-TITLE" && matches!(token.to_ascii_lowercase().as_str(), "ep" | "episode") { let next_episode = (index + 1..tokens.len()).find(|&cursor| { !joiners.contains(&tokens[cursor].as_str()) || labels[cursor] != "O" }); if next_episode.is_some_and(|cursor| labels[cursor] == "B-EPISODE") { output[index] = "O".to_string(); continue; } } if label == "B-TITLE" && is_unicode_roman_season(token) { let previous_title_word = (0..index).rev().find(|&cursor| { output[cursor] == "B-TITLE" && tokens[cursor] .chars() .any(|ch| ch.is_alphanumeric() || ('\u{4e00}'..='\u{9fff}').contains(&ch)) }); let later_episode = (index + 1..tokens.len()).any(|cursor| labels[cursor] == "B-EPISODE"); if previous_title_word.is_none() && later_episode { output[index] = "B-SEASON".to_string(); continue; } let previous_word = previous_title_word.map(|cursor| tokens[cursor].to_ascii_lowercase()); if previous_title_word.is_some() && !matches!(previous_word.as_deref(), 
Some("lupin")) { output[index] = "B-SEASON".to_string(); continue; } } if label == "B-TITLE" && (ORDINAL_SEASON_TOKEN_RE.is_match(token) || WORD_ORDINAL_SEASON_TOKEN_RE.is_match(token)) { let next_word = (index + 1..tokens.len()).find(|&cursor| { !joiners.contains(&tokens[cursor].as_str()) && tokens[cursor].chars().any(|ch| ch.is_alphabetic()) }); if next_word.is_some_and(|cursor| { labels[cursor] == "B-TITLE" && SEASON_WORD_RE.is_match(&tokens[cursor]) }) { output[index] = "B-SEASON".to_string(); if let Some(cursor) = next_word { for joiner_index in index + 1..cursor { if joiners.contains(&tokens[joiner_index].as_str()) { output[joiner_index] = "B-SEASON".to_string(); } } output[cursor] = "B-SEASON".to_string(); } continue; } } if label == "O" && (EPISODE_CJK_RE.is_match(token) || EPISODE_VALUE_RE.is_match(token) || EPISODE_RANGE_RE.is_match(token)) { output[index] = "B-EPISODE".to_string(); continue; } if label == "O" && token.chars().all(|ch| ch.is_ascii_digit()) { let previous_non_space = (0..index) .rev() .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace)); let next_non_space = (index + 1..tokens.len()) .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace)); if previous_non_space.is_some_and(|cursor| tokens[cursor] == "#") { output[index] = "B-EPISODE".to_string(); if next_non_space.is_some_and(|cursor| matches!(tokens[cursor].as_str(), "-" | "~")) { if let Some(separator) = next_non_space { output[separator] = "B-EPISODE".to_string(); if let Some(right) = (separator + 1..tokens.len()) .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace)) { if tokens[right].chars().all(|ch| ch.is_ascii_digit()) { output[right] = "B-EPISODE".to_string(); } } } } continue; } if previous_non_space.is_some_and(|cursor| tokens[cursor] == "第") && next_non_space.is_some_and(|cursor| { matches!(tokens[cursor].as_str(), "话" | "話" | "回" | "集") || tokens[cursor].starts_with('话') || tokens[cursor].starts_with('話') || tokens[cursor].starts_with('回') || 
tokens[cursor].starts_with('集') }) { if let Some(cursor) = previous_non_space { output[cursor] = "B-EPISODE".to_string(); } output[index] = "B-EPISODE".to_string(); if let Some(cursor) = next_non_space { if matches!(tokens[cursor].as_str(), "话" | "話" | "回" | "集") { output[cursor] = "B-EPISODE".to_string(); } } continue; } } if matches!(label.as_str(), "B-TITLE" | "O") && token.chars().all(|ch| ch.is_ascii_digit()) && token.len() <= 3 { let previous_word = (0..index) .rev() .find(|&cursor| { !joiners.contains(&tokens[cursor].as_str()) && tokens[cursor].chars().any(|ch| ch.is_alphabetic()) }) .map(|cursor| tokens[cursor].to_ascii_lowercase()); let next_structural = (index + 1..tokens.len()) .find(|&cursor| !joiners.contains(&tokens[cursor].as_str())) .map(|cursor| tokens[cursor].as_str()); let next_non_space = (index + 1..tokens.len()) .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace)) .map(|cursor| tokens[cursor].as_str()); let later_technical_block = output[index + 1..] .iter() .any(|label| matches!(label.as_str(), "B-SOURCE" | "B-RESOLUTION")); let nearby_lupin_part = previous_word.as_deref() == Some("part") && (tokens[..index] .iter() .rev() .take(8) .any(|item| item.eq_ignore_ascii_case("lupin")) || tokens[index + 1..] 
.iter() .take(12) .any(|item| item.eq_ignore_ascii_case("lupin"))); if nearby_lupin_part { output[index] = "B-SEASON".to_string(); continue; } let followed_by_title_word = (index + 1..tokens.len()) .find(|&cursor| { !joiners.contains(&tokens[cursor].as_str()) && !matches!( tokens[cursor].as_str(), "-" | "-" | "," | "," | ":" | ":" ) }) .is_some_and(|cursor| { !matches!( tokens[cursor].as_str(), "[" | "【" | "(" | "(" | "]" | "】" ) && output.get(cursor).is_some_and(|label| label == "B-TITLE") && tokens[cursor].chars().any(|ch| ch.is_alphabetic()) }); if followed_by_title_word && matches!(previous_word.as_deref(), Some("movie" | "part")) { output[index] = "B-TITLE".to_string(); continue; } if (later_technical_block || next_non_space.is_some_and(|token| matches!(token, "[" | "【" | "(" | "(")) || next_structural.is_some_and(|token| matches!(token, "[" | "【" | "(" | "("))) && matches!(previous_word.as_deref(), Some("movie" | "part")) { output[index] = "B-SPECIAL".to_string(); continue; } let eien_title_number = token == "831" && previous_word.as_deref() == Some("no") && (0..index).any(|cursor| { output[cursor] == "B-TITLE" && tokens[cursor].eq_ignore_ascii_case("Eien") }); if eien_title_number { for joiner_index in (0..index).rev() { if tokens[joiner_index].eq_ignore_ascii_case("no") { break; } if joiners.contains(&tokens[joiner_index].as_str()) { output[joiner_index] = "B-TITLE".to_string(); } } output[index] = "B-TITLE".to_string(); continue; } } if label == "O" && token.chars().all(|ch| ch.is_ascii_digit()) && token.len() <= 3 { let previous_non_space = (0..index) .rev() .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace)); let next_non_space = (index + 1..tokens.len()) .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace)); if previous_non_space .is_some_and(|cursor| matches!(tokens[cursor].as_str(), "[" | "【")) && next_non_space .is_some_and(|cursor| matches!(tokens[cursor].as_str(), "]" | "】")) && output[..index].iter().any(|label| label == 
"B-TITLE") && output[index + 1..] .iter() .any(|label| matches!(label.as_str(), "B-SOURCE" | "B-RESOLUTION")) { output[index] = "B-EPISODE".to_string(); continue; } if previous_non_space .is_some_and(|cursor| matches!(tokens[cursor].as_str(), "-" | "-")) && output[..index].iter().any(|label| label == "B-TITLE") && output[index + 1..] .iter() .any(|label| matches!(label.as_str(), "B-SOURCE" | "B-RESOLUTION")) { output[index] = "B-EPISODE".to_string(); continue; } if next_non_space.is_none() && previous_non_space.is_some_and(|cursor| { output[cursor] == "B-TITLE" && tokens[cursor].chars().any(|ch| { ('\u{4e00}'..='\u{9fff}').contains(&ch) || ('\u{3040}'..='\u{30ff}').contains(&ch) }) }) { output[index] = "B-EPISODE".to_string(); continue; } } if label == "B-EPISODE" && token.chars().all(|ch| ch.is_ascii_digit()) { let previous_non_space = (0..index) .rev() .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace)); let next_non_space = (index + 1..tokens.len()) .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace)); if previous_non_space.is_some_and(|cursor| tokens[cursor] == "第") && next_non_space.is_some_and(|cursor| { matches!(tokens[cursor].as_str(), "话" | "話" | "回" | "集") }) { if let Some(cursor) = previous_non_space { output[cursor] = "B-EPISODE".to_string(); } if let Some(cursor) = next_non_space { output[cursor] = "B-EPISODE".to_string(); } } if previous_non_space .is_some_and(|cursor| matches!(tokens[cursor].as_str(), "×" | "x" | "X")) { let left_title = (0..previous_non_space.unwrap()) .rev() .find(|&cursor| labels[cursor] != "O") .is_some_and(|cursor| labels[cursor] == "B-TITLE"); if left_title { output[index] = "B-TITLE".to_string(); if let Some(next_word) = (index + 1..tokens.len()).find(|&cursor| { labels[cursor] == "O" && tokens[cursor].chars().any(|ch| ch.is_alphabetic()) }) { output[next_word] = "B-TITLE".to_string(); } continue; } } let mut previous_word = None; for cursor in (0..index).rev() { if matches!(tokens[cursor].as_str(), 
"]" | "】" | ")" | ")") { break; } if joiners.contains(&tokens[cursor].as_str()) { continue; } if tokens[cursor].chars().any(|ch| ch.is_alphabetic()) { previous_word = Some(tokens[cursor].to_ascii_lowercase()); } break; } let previous_non_space = (0..index) .rev() .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace)) .map(|cursor| tokens[cursor].as_str()); if matches!(previous_word.as_deref(), Some("lesson" | "part")) || (previous_word.as_deref() == Some("no") && previous_non_space == Some(".")) { output[index] = "O".to_string(); continue; } } if label != "O" || !entity_joiners.contains(&token.as_str()) { continue; } let mut left = index as isize - 1; while left >= 0 && joiners.contains(&tokens[left as usize].as_str()) && labels[left as usize] == "O" { left -= 1; } let mut right = index + 1; while right < tokens.len() && joiners.contains(&tokens[right].as_str()) && labels[right] == "O" { right += 1; } if left >= 0 && right < tokens.len() { let left_label = output[left as usize].clone(); let right_label = labels[right].clone(); if left_label == right_label && matches!(left_label.as_str(), "B-TITLE" | "B-GROUP") { output[index] = left_label.clone(); } if token == "." 
&& left_label == "B-EPISODE" && right_label == "B-EPISODE" { output[index] = "B-EPISODE".to_string(); } } if title_terminal_punctuation.contains(&token.as_str()) && index > 0 { let left_label = &output[index - 1]; if left_label == "B-TITLE" { output[index] = "B-TITLE".to_string(); } } if matches!( token.as_str(), "]" | "】" | ")" | ")" | ">" | ">" | "」" | "」" ) && index > 0 && output[index - 1] == "B-TITLE" && title_span_has_labeled_opener(&tokens[..index], &output[..index], token) { output[index] = "B-TITLE".to_string(); } } output } fn title_span_has_labeled_opener(tokens: &[String], labels: &[String], closer: &str) -> bool { for (token, label) in tokens.iter().zip(labels.iter()).rev() { if label != "B-TITLE" { return false; } if closer_matches_opener(closer, token) { return true; } } false } fn closer_matches_opener(closer: &str, opener: &str) -> bool { matches!( (closer, opener), ("]", "[") | ("】", "【") | (")", "(") | (")", "(") | (">", "<") | (">", "<") | ("」", "「") | ("」", "「") ) } fn retag_semantic_labels(tokens: &[String], labels: &[String]) -> Vec { let last_path = tokens .iter() .rposition(|token| token == "/" || token == "\\"); let mut output = labels.to_vec(); for index in 0..labels.len() { let Some(entity) = label_entity(&labels[index]) else { continue; }; let prefix = if labels[index].starts_with("I-") { "I" } else { "B" }; if entity == "TITLE" { let path_title = last_path.is_some_and(|path_index| index < path_index); let suffix = title_suffix_for_label_index(tokens, labels, index); output[index] = format!( "{prefix}-{}_{}", if path_title { "PATH_TITLE" } else { "TITLE" }, suffix ); } else if entity == "SEASON" && last_path.is_some_and(|path_index| index < path_index) { output[index] = format!("{prefix}-PATH_SEASON"); } } output } fn title_suffix_for_label_index( tokens: &[String], labels: &[String], index: usize, ) -> &'static str { if let Some(suffix) = direct_title_suffix(&tokens[index]) { return suffix; } let left = nearest_title_suffix(tokens, 
labels, index, true); let right = nearest_title_suffix(tokens, labels, index, false); match (left, right) { (Some(left), Some(right)) if left == right => left, (Some(left), None) => left, (None, Some(right)) => right, _ => "MIXED", } } fn nearest_title_suffix( tokens: &[String], labels: &[String], index: usize, search_left: bool, ) -> Option<&'static str> { let mut cursor = index as isize; loop { cursor += if search_left { -1 } else { 1 }; if cursor < 0 || cursor as usize >= tokens.len() { return None; } let cursor = cursor as usize; if !is_title_label(&labels[cursor]) { if tokens[cursor] .chars() .all(|ch| ch.is_whitespace() || !ch.is_alphanumeric()) { continue; } return None; } if let Some(suffix) = direct_title_suffix(&tokens[cursor]) { return Some(suffix); } } } fn direct_title_suffix(token: &str) -> Option<&'static str> { if !token.chars().any(|ch| { ch.is_ascii_alphabetic() || ('\u{3040}'..='\u{30ff}').contains(&ch) || ('\u{31f0}'..='\u{31ff}').contains(&ch) || ('\u{4e00}'..='\u{9fff}').contains(&ch) }) { return None; } Some(title_language_suffix(token)) } fn dmhy_record(filename: &str, template_id: &str, roles: &[String]) -> Option { let (key, tokens, _classes, groups) = template_key_for_filename(filename); if groups.len() != roles.len() { return None; } let roles = adjust_contextual_roles(&tokens, &groups, roles); let roles = refine_semantic_roles(&tokens, &groups, &roles); let (roles, dropped) = enforce_single_title_candidate(&tokens, &groups, &roles); let (tokens, labels) = project_refined_tokens(&tokens, &groups, &roles); let (tokens, labels) = repair_compact_sxe_tokens(tokens, labels); let labels = smooth_title_spans(&tokens, &labels); let labels = retag_semantic_labels(&tokens, &labels); if tokens.len() != labels.len() { return None; } Some(Record { filename: filename.to_string(), tokens, labels, template_id: template_id.to_string(), template: key, source_filename: None, path_trimmed: None, dropped_title_candidate_positions: if dropped.is_empty() { 
None } else { Some(dropped) }, }) } #[cfg(test)] mod tests { use super::*; fn schema_labels_for(filename: &str) -> Vec<(String, String)> { let (key, _, _, _) = template_key_for_filename(filename); let roles = suggested_roles(&key); let record = dmhy_record(filename, "tpl_test", &roles).unwrap(); record.tokens.into_iter().zip(record.labels).collect() } fn labels_for(filename: &str) -> Vec<(String, String)> { schema_labels_for(filename) .into_iter() .map(|(token, label)| (token, legacy_label(&label))) .collect() } fn legacy_label(label: &str) -> String { let Some(entity) = label_entity(label) else { return label.to_string(); }; let prefix = if label.starts_with("I-") { "I" } else { "B" }; if is_title_entity(entity) { return format!("{prefix}-TITLE"); } if entity == "PATH_SEASON" { return format!("{prefix}-SEASON"); } if entity == "TAG" { return format!("{prefix}-SPECIAL"); } label.to_string() } #[test] fn rich_title_candidates_keep_readable_spacing() { let row = rich_annotation_for( "(1998) Initial D First Stage [1080p BDRip AVC AAC DTS-HD]/Initial D First Stage - 01 [1080p BDRip AVC AAC DTS-HD]", ); assert_eq!( row.pointer("/segments/1/candidates/0/text") .and_then(Value::as_str), Some("Initial D First Stage") ); } #[test] fn semantic_schema_roles_cover_multilingual_tags_paths_and_music_skips() { let gm = schema_labels_for( "[GM-Team][国漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4", ); assert!(gm.contains(&("GM".to_string(), "B-GROUP".to_string()))); assert!(gm.contains(&("国漫".to_string(), "B-TAG".to_string()))); assert!(gm.contains(&("神印王座".to_string(), "B-TITLE_CHS".to_string()))); assert!(gm.contains(&("Throne".to_string(), "B-TITLE_LATIN".to_string()))); assert!(gm.contains(&("Seal".to_string(), "B-TITLE_LATIN".to_string()))); assert!(gm.contains(&("2022".to_string(), "B-TAG".to_string()))); assert!(gm.contains(&("200".to_string(), "B-EPISODE".to_string()))); let sky = schema_labels_for("[Skytree][海贼王][One_Piece][918][GB_JP][1080P]"); 
assert!(sky.contains(&("Skytree".to_string(), "B-GROUP".to_string()))); assert!(sky.contains(&("海贼王".to_string(), "B-TITLE_CHS".to_string()))); assert!(sky.contains(&("One".to_string(), "B-TITLE_LATIN".to_string()))); assert!(sky.contains(&("Piece".to_string(), "B-TITLE_LATIN".to_string()))); assert!(sky.contains(&("918".to_string(), "B-EPISODE".to_string()))); let farming = schema_labels_for("異世界悠閒農家 2 - 06"); assert!(farming.contains(&("異世界悠閒農家".to_string(), "B-TITLE_CHT".to_string()))); assert!(farming.contains(&("2".to_string(), "B-SEASON".to_string()))); assert!(farming.contains(&("06".to_string(), "B-EPISODE".to_string()))); let hanako = schema_labels_for("地縛少年花子君 2 - 13"); assert!(hanako.contains(&("地縛少年花子君".to_string(), "B-TITLE_JPN".to_string()))); assert!(hanako.contains(&("2".to_string(), "B-SEASON".to_string()))); assert!(hanako.contains(&("13".to_string(), "B-EPISODE".to_string()))); let one_piece = schema_labels_for("One.Piece.1110"); assert!(one_piece.contains(&("One".to_string(), "B-TITLE_LATIN".to_string()))); assert!(one_piece.contains(&("Piece".to_string(), "B-TITLE_LATIN".to_string()))); assert!(one_piece.contains(&("1110".to_string(), "B-EPISODE".to_string()))); assert!(!one_piece.contains(&("1110".to_string(), "B-SEASON".to_string()))); let nekomoe_prefix = schema_labels_for("[喵萌奶茶屋][7月新番][Lycoris Recoil][01][1080P]"); assert!(nekomoe_prefix.contains(&("喵萌奶茶屋".to_string(), "B-GROUP".to_string()))); assert!(nekomoe_prefix.contains(&("7月新番".to_string(), "B-TAG".to_string()))); assert!(nekomoe_prefix.contains(&("Lycoris".to_string(), "B-TITLE_LATIN".to_string()))); let subtitle_group = schema_labels_for("[桜都字幕组][Title][01][1080P]"); assert!(subtitle_group.contains(&("桜都字幕组".to_string(), "B-GROUP".to_string()))); let path = schema_labels_for("海贼王/Season 2/One Piece - 01 [1080P]"); assert!(path.contains(&("海贼王".to_string(), "B-PATH_TITLE_CHS".to_string()))); assert!(path.contains(&("2".to_string(), "B-PATH_SEASON".to_string()))); 
assert!(path.contains(&("One".to_string(), "B-TITLE_LATIN".to_string()))); assert!(path.contains(&("01".to_string(), "B-EPISODE".to_string()))); let tags = schema_labels_for("[日漫][剧场版][Movie][TV][2024][Title][01][1080P]"); assert!(tags.contains(&("日漫".to_string(), "B-TAG".to_string()))); assert!(tags.contains(&("剧场版".to_string(), "B-TAG".to_string()))); assert!(tags.contains(&("Movie".to_string(), "B-TAG".to_string()))); assert!(tags.contains(&("TV".to_string(), "B-TAG".to_string()))); assert!(tags.contains(&("2024".to_string(), "B-TAG".to_string()))); assert!(tags.contains(&("Title".to_string(), "B-TITLE_LATIN".to_string()))); for skipped in [ "[Group] Title OST [FLAC]", "[Group] Title MUSICCLIP [BDRip]", "[Group] Title Music Collection [FLAC]", "[Group] Title Character Song [MP3]", "[Group] Title Drama CD [FLAC]", "[Group] Title CD Album [FLAC]", "[Group] Title Bonus CD [FLAC]", "[Group] Title Soundtrack [FLAC]", ] { assert!(has_music_collection_noise(skipped), "{skipped}"); } for preserved in [ "[Group] Title OP [FLAC]", "[Group] Title ED [FLAC]", "[Group] Title NCOP [FLAC]", "[Group] Title NCED [FLAC]", "[Group] Title PV [1080P]", "[Group] Title CM [1080P]", "[Group] Title Menu [1080P]", "[Group] Title Trailer [1080P]", ] { assert!(!has_music_collection_noise(preserved), "{preserved}"); } } #[test] fn required_regressions() { let title_91 = labels_for("Title 91 EP 01 [1080p]"); assert!(title_91.contains(&("91".to_string(), "B-SEASON".to_string()))); assert!(title_91.contains(&("EP".to_string(), "O".to_string()))); assert!(title_91.contains(&("01".to_string(), "B-EPISODE".to_string()))); let event = labels_for("[HYSUB]Dragon Ball Super Broly[Theater Greeting Event][1080P]"); assert!(event.contains(&("Theater".to_string(), "B-SPECIAL".to_string()))); assert!(!event.contains(&("Theater".to_string(), "B-TITLE".to_string()))); let roman = labels_for("Chibi Maruko-chan I 001"); assert!(roman.contains(&("I".to_string(), "B-SEASON".to_string()))); 
assert!(roman.contains(&("001".to_string(), "B-EPISODE".to_string()))); let dxd = labels_for("High School D×D"); assert!(dxd.contains(&("×".to_string(), "B-TITLE".to_string()))); let colon_title = labels_for("Megumi no Daigo:Kyuukoku no Orange 06"); assert!(colon_title.contains(&(":".to_string(), "B-TITLE".to_string()))); let sxe = labels_for("S01E02"); assert_eq!( sxe, vec![ ("S".to_string(), "O".to_string()), ("01".to_string(), "B-SEASON".to_string()), ("E".to_string(), "O".to_string()), ("02".to_string(), "B-EPISODE".to_string()) ] ); let ep_prefix = labels_for("Toradora! EP01 [BD 1080p]"); assert!(ep_prefix.contains(&("EP".to_string(), "O".to_string()))); assert!(ep_prefix.contains(&("01".to_string(), "B-EPISODE".to_string()))); let bracket_sxe = labels_for("[FLsnow.feat.PO][Himitsu_no_Aipri][1080P][S2E01]"); assert!(bracket_sxe.contains(&("2".to_string(), "B-SEASON".to_string()))); assert!(bracket_sxe.contains(&("01".to_string(), "B-EPISODE".to_string()))); let bocchi_sxe = labels_for("Bocchi the Rock! 孤獨搖滾!S01E12「早起的日頭光照佇你的身上」"); assert!(bocchi_sxe.contains(&("01".to_string(), "B-SEASON".to_string()))); assert!(bocchi_sxe.contains(&("12".to_string(), "B-EPISODE".to_string()))); assert!(!bocchi_sxe.contains(&("S01E12".to_string(), "O".to_string()))); let sxe_range = labels_for( "【CXRAW】【TMNT 2012 TV series】【S5E12-S5E14】【Wanted:Bebop & Rocksteady】【DVDrip】【480p】【AVC Hi10P AAC MP4】", ); assert!(sxe_range.contains(&("5".to_string(), "B-SEASON".to_string()))); assert!(sxe_range.contains(&("12".to_string(), "B-EPISODE".to_string()))); assert!(sxe_range.contains(&("14".to_string(), "B-EPISODE".to_string()))); let episode_version_title = labels_for("[DHR][Dumbbell[10v2][BIG5][720P][AVC_AAC]"); assert!(episode_version_title.contains(&("10v2".to_string(), "B-EPISODE".to_string()))); assert!(!episode_version_title.contains(&("10v2".to_string(), "B-TITLE".to_string()))); let episode_version_lang = labels_for("[GalaxyRailroad-888] Yu-Gi-Oh! GO RUSH !! 
[043v2_GB]"); assert!(episode_version_lang.contains(&("043v2".to_string(), "B-EPISODE".to_string()))); assert!(episode_version_lang.contains(&("GB".to_string(), "B-SOURCE".to_string()))); let cursed = labels_for("[Coalgirls]_C3-Cube_x_Cursed_x_Curious_01_[8E416230]"); assert!(cursed.contains(&("x".to_string(), "B-TITLE".to_string()))); assert!(!cursed.contains(&("x".to_string(), "B-SEASON".to_string()))); let beyblade = labels_for("[jibaketa]Beyblade X - 118 (WEB 1920x1080 AVC AAC)"); assert!(beyblade.contains(&("X".to_string(), "B-TITLE".to_string()))); assert!(!beyblade.contains(&("X".to_string(), "B-SEASON".to_string()))); let bang_title = labels_for("[Dymy][Gugure! Kokkuri-san][06][BIG5][1280X720]"); assert!(bang_title.contains(&("!".to_string(), "B-TITLE".to_string()))); let pso2 = labels_for("[Lilith-Raws] Phantasy Star Online 2 Episode Oracle - 01 [1080p]"); assert!(pso2.contains(&("2".to_string(), "B-TITLE".to_string()))); assert!(pso2.contains(&("Episode".to_string(), "B-TITLE".to_string()))); assert!(pso2.contains(&("Oracle".to_string(), "B-TITLE".to_string()))); assert!(pso2.contains(&("01".to_string(), "B-EPISODE".to_string()))); let aikatsu = labels_for("Aikatsu Friends! - S2E01 (BD 1920x1080 x264 FLAC)"); assert!(aikatsu.contains(&("!".to_string(), "B-TITLE".to_string()))); let intro = labels_for("[VCB-Studio] LoveLive! 
µ's Live Collection [01][intro][1080p]"); assert!(intro.contains(&("intro".to_string(), "B-SPECIAL".to_string()))); let hash = labels_for("[Group][Title][01][1080p][00270AC8]"); assert!(hash.contains(&("00270AC8".to_string(), "O".to_string()))); let yamato = labels_for("[1995.01] YAMATO2520 Vol.1 明日への希望-0001"); assert!(yamato.contains(&("YAMATO2520".to_string(), "B-TITLE".to_string()))); assert!(yamato.contains(&("明日への希望".to_string(), "B-TITLE".to_string()))); let ubw = labels_for("Fate/stay night [Unlimited Blade Works] #00 「プロローグ」"); assert!(ubw.contains(&("Unlimited".to_string(), "B-TITLE".to_string()))); assert!(!ubw.contains(&("Unlimited".to_string(), "B-GROUP".to_string()))); let alias_title = labels_for("[Koten_Gars] Tegami Bachi; Letter Bee - 01 [1080p]"); assert!(alias_title.contains(&(";".to_string(), "B-TITLE".to_string()))); let comma_title = labels_for("[Studio GreenTea] Kamiina Botan, Yoeru Sugata wa Yuri no Hana [01]"); assert!(comma_title.contains(&(",".to_string(), "B-TITLE".to_string()))); let backtick_title = labels_for("[Hayate no Gotoku! Can`t Take My Eyes Off You][01][BDrip X264 AAC 720P]"); assert!(backtick_title.contains(&("`".to_string(), "B-TITLE".to_string()))); assert!(backtick_title.contains(&("t".to_string(), "B-TITLE".to_string()))); let cjk_period_title = labels_for("[云光字幕组]剃须。然后捡到高中生 Hige o Soru. 
Soshite Joshikousei o Hirou-[ 01 ][简体双语][1080p]"); assert!(cjk_period_title.contains(&("。".to_string(), "B-TITLE".to_string()))); let music_title = labels_for("[アニメ BD] うたの☆プリンスさまっ♪ マジLOVE2000% 第01話「ポワゾンKISS」(1920x1080 x264 Hi10p AAC)"); assert!(music_title.contains(&("♪".to_string(), "B-TITLE".to_string()))); let cm_version = labels_for("[U2-Rip]Inari, Konkon, Koi Iroha[CMv2][Hi10p_1080p][x264_flac]"); assert!(cm_version.contains(&("CMv2".to_string(), "B-SPECIAL".to_string()))); assert!(!cm_version.contains(&("CMv2".to_string(), "B-TITLE".to_string()))); let hdma_block = labels_for( "[Niconeiko Works] Gekijouban Violet Evergarden [1080P_Ma10p_DTS-HDMA][CM01]", ); assert!(hdma_block.contains(&("Gekijouban".to_string(), "B-TITLE".to_string()))); assert!(hdma_block.contains(&("1080P".to_string(), "B-RESOLUTION".to_string()))); assert!(hdma_block.contains(&("HDMA".to_string(), "B-SOURCE".to_string()))); assert!(!hdma_block.contains(&("1080P".to_string(), "B-TITLE".to_string()))); let extra_menu = labels_for("Extra Menu OVA"); assert!(extra_menu.contains(&("Extra".to_string(), "B-SPECIAL".to_string()))); assert!(!extra_menu.contains(&("Extra".to_string(), "B-TITLE".to_string()))); let eizou_tokuten = labels_for("おジャ魔女どれみ♯ 映像特典「ともだちの唄」(DVD 640x480 )"); assert!(eizou_tokuten.contains(&("映像特典".to_string(), "B-SPECIAL".to_string()))); let happy_lesson = labels_for("【DVD】 HAPPY☆LESSON THE TV 第01話"); assert!(happy_lesson.contains(&("☆".to_string(), "B-TITLE".to_string()))); let idolmaster = labels_for("[CASO&SumiSora][THE_IDOLM@STER_CINDERELLA_GIRLS][07.5_SP]"); assert!(idolmaster.contains(&("@".to_string(), "B-TITLE".to_string()))); let soul_taker = labels_for("[AI-Raws] THE SOUL TAKER~魂狩~ #01 (HEVC 1312x720)"); assert!(soul_taker.contains(&("~".to_string(), "B-TITLE".to_string()))); let mayoi = labels_for("[Snow-Raws] 迷家[マヨイガ] 第01話"); assert!(mayoi.contains(&("迷家".to_string(), "B-TITLE".to_string()))); assert!(mayoi.contains(&("マヨイガ".to_string(), "B-TITLE".to_string()))); 
assert!(mayoi.contains(&("]".to_string(), "B-TITLE".to_string()))); let conan_time = labels_for("【蓝色狂想】名侦探柯南TV版第036集-周一晚上7点半杀人事件"); assert!(conan_time.contains(&("036".to_string(), "B-EPISODE".to_string()))); assert!(!conan_time.contains(&("7".to_string(), "B-EPISODE".to_string()))); let zom = labels_for("[Nekomoe kissaten&VCB-Studio] Zom 100 [Animatics02][Ma10p_1080p][x265]"); assert!(zom.contains(&("100".to_string(), "B-TITLE".to_string()))); assert!(!zom.contains(&("100".to_string(), "B-EPISODE".to_string()))); assert!(zom.contains(&("Animatics02".to_string(), "B-SPECIAL".to_string()))); let sky = schema_labels_for("[Skytree][海贼王][One_Piece][918][GB_JP][1080P]"); assert!(sky.contains(&("海贼王".to_string(), "B-TITLE_CHS".to_string()))); assert!(sky.contains(&("One".to_string(), "B-TITLE_LATIN".to_string()))); assert!(sky.contains(&("Piece".to_string(), "B-TITLE_LATIN".to_string()))); assert!(sky.contains(&("918".to_string(), "B-EPISODE".to_string()))); let happy = labels_for("My.Happy.Marriage.S01E01.The.Meeting.1080p.NF.WEB-DL.AAC2.0.H.264-VARYG"); assert!(happy.contains(&("01".to_string(), "B-SEASON".to_string()))); assert!(happy.contains(&("01".to_string(), "B-EPISODE".to_string()))); assert!(!happy.contains(&("0".to_string(), "B-EPISODE".to_string()))); let garo = labels_for("[牙狼<GARO>~炎の刻印~][01][1080p]"); assert!(garo.contains(&("牙狼".to_string(), "B-TITLE".to_string()))); assert!(garo.contains(&("<".to_string(), "B-TITLE".to_string()))); assert!(garo.contains(&(">".to_string(), "B-TITLE".to_string()))); assert!(garo.contains(&("炎の刻印".to_string(), "B-TITLE".to_string()))); let akira = labels_for("[QYQ][AKIRA][AVC_AC3x2][1080p]"); assert!(akira.contains(&("AKIRA".to_string(), "B-TITLE".to_string()))); assert!(!akira.contains(&("AVC".to_string(), "B-TITLE".to_string()))); assert!(akira.contains(&("AVC".to_string(), "B-SOURCE".to_string()))); let doraemon = labels_for( "[DORASUB][DORAEMON1979][1998.03.07][WEB][1998x1080][AVC][简日]哆啦A梦归来了", ); 
assert!(doraemon.contains(&("DORAEMON1979".to_string(), "B-TITLE".to_string()))); assert!(doraemon.contains(&("WEB".to_string(), "B-SOURCE".to_string()))); assert!(!doraemon.contains(&("WEB".to_string(), "B-TITLE".to_string()))); let devilman = labels_for("[DBD-Raws][恶魔人][1972版][01][1080P][BDRip][HEVC-10bit][FLAC]"); assert!(devilman.contains(&("恶魔人".to_string(), "B-TITLE".to_string()))); assert!(!devilman.contains(&("1972版".to_string(), "B-TITLE".to_string()))); let classroom = labels_for("[Dymy][Assassination Classroom (2016)][01][BIG5][1280X720]"); assert!(classroom.contains(&("(".to_string(), "B-TITLE".to_string()))); assert!(classroom.contains(&(")".to_string(), "B-TITLE".to_string()))); assert!(!classroom.contains(&("]".to_string(), "B-TITLE".to_string()))); let bang_season = labels_for("[LoliHouse] Bang Dream! 2nd Season - 01 [BDRip 1080p HEVC-10bit FLAC]"); assert!(bang_season.contains(&("Bang".to_string(), "B-TITLE".to_string()))); assert!(bang_season.contains(&("2nd".to_string(), "B-SEASON".to_string()))); assert!(bang_season.contains(&("Season".to_string(), "B-SEASON".to_string()))); assert!(bang_season.contains(&("01".to_string(), "B-EPISODE".to_string()))); assert!(!bang_season.contains(&("01".to_string(), "B-SEASON".to_string()))); let basket = labels_for( "[Nekomoe kissaten&VCB-Studio] Fruits Basket 1st Season [24][1080p][x264_aac][sc]", ); assert!(basket.contains(&("Fruits".to_string(), "B-TITLE".to_string()))); assert!(basket.contains(&("1st".to_string(), "B-SEASON".to_string()))); assert!(basket.contains(&("Season".to_string(), "B-SEASON".to_string()))); assert!(basket.contains(&("24".to_string(), "B-EPISODE".to_string()))); assert!(!basket.contains(&("24".to_string(), "B-SEASON".to_string()))); let notice = labels_for("[KTXP][Zankyou_no_Terror][08_Notice][GB_BIG5][X264_AAC][720p]"); assert!(notice.contains(&("Zankyou".to_string(), "B-TITLE".to_string()))); assert!(notice.contains(&("08".to_string(), "B-EPISODE".to_string()))); 
assert!(!notice.contains(&("08".to_string(), "B-TITLE".to_string()))); let full = labels_for("[POPGO][Soukyuu_no_Fafner_Exodus][01_Full][GB][720p]"); assert!(full.contains(&("01".to_string(), "B-EPISODE".to_string()))); assert!(!full.contains(&("01".to_string(), "B-TITLE".to_string()))); let r18 = labels_for("[HYSUB]Skirt no Naka wa Kedamono Deshita.[01_R18][BIG5_MP4][1280X720]"); assert!(r18.contains(&("01".to_string(), "B-EPISODE".to_string()))); assert!(!r18.contains(&("01".to_string(), "B-TITLE".to_string()))); let ddp = labels_for("Akuma.Kun.S01E02.1080p.NF.WEB-DL.DDP5.1.H.264"); assert!(ddp.contains(&("02".to_string(), "B-EPISODE".to_string()))); assert!(!ddp.contains(&("1".to_string(), "B-EPISODE".to_string()))); assert!(ddp .iter() .any(|(token, label)| token.starts_with("DDP") && label == "B-SOURCE")); let aac_space = labels_for("Bleach S01E02 AAC 2.0 H.264"); assert!(aac_space.contains(&("02".to_string(), "B-EPISODE".to_string()))); assert!(!aac_space.contains(&("2".to_string(), "B-EPISODE".to_string()))); assert!(aac_space .iter() .any(|(token, label)| token.starts_with("AAC") && label == "B-SOURCE")); let bare_resolution = labels_for("日本桥15.03.30 720"); assert!(bare_resolution.contains(&("日本桥".to_string(), "B-TITLE".to_string()))); assert!(bare_resolution.contains(&("720".to_string(), "B-RESOLUTION".to_string()))); assert!(!bare_resolution.contains(&("720".to_string(), "B-EPISODE".to_string()))); let air_episode = labels_for("Air 01"); assert!(air_episode.contains(&("Air".to_string(), "B-TITLE".to_string()))); assert!(air_episode.contains(&("01".to_string(), "B-EPISODE".to_string()))); let decimal_episode = labels_for("[HoneyGod] Usagi Drop [02.5][x264_10bit][粤日双语][BDrip_1080p]"); assert!(decimal_episode.contains(&("02".to_string(), "B-EPISODE".to_string()))); assert!(decimal_episode.contains(&(".".to_string(), "B-EPISODE".to_string()))); assert!(decimal_episode.contains(&("5".to_string(), "B-EPISODE".to_string()))); let _ = 
RUNTIME_WHITELISTS.set(Whitelists { title_phrases: Vec::new(), group_names: [ "LowPower-Raws".to_string(), "ANi".to_string(), "LoliHouse".to_string(), "QTS".to_string(), ] .into_iter() .collect(), }); let lowpower = labels_for("[LowPower-Raws] 91 Days - 01 (BD 720P x264 10bit AAC)"); assert!(lowpower.contains(&("LowPower".to_string(), "B-GROUP".to_string()))); assert!(lowpower.contains(&("91".to_string(), "B-TITLE".to_string()))); assert!(lowpower.contains(&("Days".to_string(), "B-TITLE".to_string()))); assert!(lowpower.contains(&("01".to_string(), "B-EPISODE".to_string()))); let ririsa = labels_for("[ANi] 2.5 次元的誘惑 - 01 [1080P][Baha][WEB-DL][AAC AVC][CHT]"); assert!(ririsa.contains(&("2".to_string(), "B-TITLE".to_string()))); assert!(ririsa.contains(&(".".to_string(), "B-TITLE".to_string()))); assert!(ririsa.contains(&("5".to_string(), "B-TITLE".to_string()))); assert!(ririsa.contains(&("次元的誘惑".to_string(), "B-TITLE".to_string()))); assert!(ririsa.contains(&("01".to_string(), "B-EPISODE".to_string()))); let nanabun = labels_for("[LoliHouse] 22-7 - 01 [WebRip 1080p HEVC-10bit AAC ASS]"); assert!(nanabun.contains(&("22".to_string(), "B-TITLE".to_string()))); assert!(nanabun.contains(&("-".to_string(), "B-TITLE".to_string()))); assert!(nanabun.contains(&("7".to_string(), "B-TITLE".to_string()))); assert!(nanabun.contains(&("01".to_string(), "B-EPISODE".to_string()))); let saint = labels_for("[QTS] OVA Saint Seiya The Lost Canvas Meiou Shinwa ep 01 (BD H264 1920x1080 24fps FLAC)"); assert!(saint.contains(&("OVA".to_string(), "B-SPECIAL".to_string()))); assert!(saint.contains(&("Saint".to_string(), "B-TITLE".to_string()))); assert!(saint.contains(&("Seiya".to_string(), "B-TITLE".to_string()))); assert!(saint.contains(&("01".to_string(), "B-EPISODE".to_string()))); let gundam = labels_for("機動戦士ガンダム00 セカンドシーズン/Ep.01 「# 天使再臨」"); assert!(gundam.contains(&("機動戦士ガンダム".to_string(), "B-TITLE".to_string()))); assert!(gundam.contains(&("00".to_string(), "B-TITLE".to_string()))); 
assert!(gundam.contains(&("01".to_string(), "B-EPISODE".to_string()))); let spy = labels_for("[Studio GreenTea] Spy x Family [38][WebRip][HEVC-10bit 1080p AAC ASSx2]"); assert!(spy.contains(&("Studio".to_string(), "B-GROUP".to_string()))); assert!(spy.contains(&("Spy".to_string(), "B-TITLE".to_string()))); assert!(spy.contains(&("x".to_string(), "B-TITLE".to_string()))); assert!(spy.contains(&("Family".to_string(), "B-TITLE".to_string()))); assert!(spy.contains(&("38".to_string(), "B-EPISODE".to_string()))); assert!(!spy.contains(&("Spy".to_string(), "B-SPECIAL".to_string()))); let spy_s3 = labels_for( "[Feibanyama] SPY x FAMILY S3 - 01 [IQIYI WebRip 2160p HEVC-10bit OPUS Multi-Subs]", ); assert!(spy_s3.contains(&("Feibanyama".to_string(), "B-GROUP".to_string()))); assert!(spy_s3.contains(&("SPY".to_string(), "B-TITLE".to_string()))); assert!(spy_s3.contains(&("FAMILY".to_string(), "B-TITLE".to_string()))); assert!(spy_s3.contains(&("3".to_string(), "B-SEASON".to_string()))); assert!(spy_s3.contains(&("01".to_string(), "B-EPISODE".to_string()))); let slime = labels_for("[Nekomoe kissaten&VCB-Studio] Slime 300 [Menu01][Ma10p_1080p][x265_flac]"); assert!(slime.contains(&("Slime".to_string(), "B-TITLE".to_string()))); assert!( slime.contains(&("300".to_string(), "B-TITLE".to_string())), "{slime:?}" ); assert!(!slime.contains(&("300".to_string(), "B-EPISODE".to_string()))); let kamisama = labels_for("[SFEO-Raws] Kamisama Hajimemashita 2 - 01 (BD 720P x264 10bit AAC)"); assert!(kamisama.contains(&("Kamisama".to_string(), "B-TITLE".to_string()))); assert!(kamisama.contains(&("2".to_string(), "B-TITLE".to_string()))); assert!(kamisama.contains(&("01".to_string(), "B-EPISODE".to_string()))); } #[test] fn updated_python_alignment_regressions() { let original = "The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p [Hurtom]/Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p"; let (trimmed, was_trimmed) = 
training_filename_for(original); assert!(was_trimmed); assert_eq!( trimmed, "Season 4 E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p" ); let pokemon = "Pokémon Season 2 - Orange League [Ep. 83-118]/Pokemon - 084 - OL002 - A Scare in the Air [DVD][PM-Dragon-x264-AC3][00270AC8]"; let (trimmed_pokemon, pokemon_was_trimmed) = training_filename_for(pokemon); assert!(pokemon_was_trimmed); assert_eq!( trimmed_pokemon, "Pokemon - 084 - OL002 - A Scare in the Air [DVD][PM-Dragon-x264-AC3][00270AC8]" ); let woody = labels_for(&trimmed); assert!(woody.contains(&("4".to_string(), "B-SEASON".to_string()))); assert!(woody.contains(&("E".to_string(), "O".to_string()))); assert!(woody.contains(&("07".to_string(), "B-EPISODE".to_string()))); assert!(woody.contains(&("The".to_string(), "B-TITLE".to_string()))); assert!(woody.contains(&("Show".to_string(), "B-TITLE".to_string()))); assert!(!woody.contains(&("1999".to_string(), "B-EPISODE".to_string()))); let group = labels_for("[DBD-Raws][Title][01][1080P]"); assert!(group.contains(&("-".to_string(), "B-GROUP".to_string()))); let amp_group = labels_for("[SumiSora&CASO][Title][01][1080P]"); assert!(amp_group.contains(&("&".to_string(), "B-GROUP".to_string()))); let cjk_season = labels_for("[DBD-Raws][魔道祖师 第一季][08][1080P][BDRip][HEVC-10bit][FLAC]"); assert!(cjk_season.contains(&("魔道祖师".to_string(), "B-TITLE".to_string()))); assert!(cjk_season.contains(&("第一季".to_string(), "B-SEASON".to_string()))); assert!(!cjk_season.contains(&("第一季".to_string(), "B-TITLE".to_string()))); let (trimmed, was_trimmed) = training_filename_for("12/小剧场/[LKSUB][KAGE-JITSU!][01][GB][720P]"); assert!(was_trimmed); assert_eq!(trimmed, "[LKSUB][KAGE-JITSU!][01][GB][720P]"); let (key, _, _, _) = template_key_for_filename(&trimmed); assert_eq!( key, "BRACKET_TEXT BRACKET_TEXT BRACKET_EPISODE BRACKET_LANG BRACKET_RESOLUTION" ); let short = labels_for("[Snow-Raws] R-15 CM&PV12 (BD 1920x1080 HEVC-YUV420P10 FLAC)"); 
assert!(short.contains(&("R".to_string(), "B-TITLE".to_string()))); assert!(short.contains(&("-".to_string(), "B-TITLE".to_string()))); assert!(short.contains(&("15".to_string(), "B-TITLE".to_string()))); assert!(!short.contains(&("15".to_string(), "B-EPISODE".to_string()))); let short_before_episode = labels_for("[Snow-Raws] R-15 第01話 (BD 1920x1080 HEVC-YUV420P10 FLAC)"); assert!(short_before_episode.contains(&("R".to_string(), "B-TITLE".to_string()))); assert!(short_before_episode.contains(&("-".to_string(), "B-TITLE".to_string()))); assert!(short_before_episode.contains(&("15".to_string(), "B-TITLE".to_string()))); assert!(short_before_episode.contains(&("01".to_string(), "B-EPISODE".to_string()))); assert!(!short_before_episode.contains(&("15".to_string(), "B-EPISODE".to_string()))); let avatar = "Avatar The Last Airbender S2/Avatar The Last Airbender S2 14 [1080p]"; let (trimmed, was_trimmed) = training_filename_for(avatar); assert!(was_trimmed); assert_eq!(trimmed, "Avatar The Last Airbender S2 14 [1080p]"); let plain_season_dir = "Season 1/[Kamigami] Junjou Romantica 1 - 01 [BD 1280x720 x264 AAC Sub(Chs,Jap)]"; let (trimmed, was_trimmed) = training_filename_for(plain_season_dir); assert!(was_trimmed); assert_eq!( trimmed, "Season 1 [Kamigami] Junjou Romantica 1 - 01 [BD 1280x720 x264 AAC Sub(Chs,Jap)]" ); let plain_season_labels = labels_for(&trimmed); assert!(plain_season_labels.contains(&("1".to_string(), "B-SEASON".to_string()))); assert!(plain_season_labels.contains(&("01".to_string(), "B-EPISODE".to_string()))); let menu_parent = "[Airota&ANK-Raws] 亜人ちゃんは語りたい (BDrip 1920x1080 HEVC-YUV420P10 FLAC SUP)/Menu (Vol.1)"; let (trimmed, was_trimmed) = training_filename_for(menu_parent); assert!(was_trimmed); assert_eq!( trimmed, "[Airota&ANK-Raws] 亜人ちゃんは語りたい Menu (Vol.1)" ); assert!(has_encoding_noise( "[4K_SDR][DBD-Raws&HKG瀛楀箷绲刔[鏃ュ湪鏍″湌][01][2160P]" )); assert!(has_encoding_noise( "ATRI -My Dear Moments-/娆″洖浜堝憡 EP01 Log01" )); assert!(has_encoding_noise( 
"[2002-2003] Mew Mew_鏉变含鍠靛柕(鏉变含銉熴儱銈︺儫銉ャ偊)_TV" )); assert!(has_encoding_noise("[DAY][Megami no Caf茅 Terrace][01]")); assert!(has_encoding_noise( "[4K_SDR][DBD-Raws][瀵掕潐楦f常涔嬫椂 涓歖[NCED1]" )); assert!(has_non_anime_noise( "13-[旅游番][花丸字幕组][日本不思议铁路之旅][15.03.19-16.02.03][720&1080][中日双语]/铁道旅 15.03.19 720" )); let tintin = "Adventures of Tintin (1991) Season 1-3 S01-S03 (1080p BluRay x265 HEVC 10bit EAC3 2.0 Garshasp)/Season 1/Adventures of Tintin (1991) - S01E06 - Cigars of the Pharaoh (Part 1) (1080p BluRay x265 Garshasp)"; let (trimmed, was_trimmed) = training_filename_for(tintin); assert!(was_trimmed); assert_eq!( trimmed, "Adventures of Tintin (1991) - S01E06 - Cigars of the Pharaoh (Part 1) (1080p BluRay x265 Garshasp)" ); let (key, _, _, _) = template_key_for_filename(&trimmed); assert_eq!( key, "TEXT SEP TEXT SEP TEXT SEP BRACKET_DATE SEP SXE SEP TEXT SEP TEXT SEP TEXT SEP TEXT SEP BRACKET_TEXT SEP BRACKET_TEXT" ); let bocchi = "Bocchi the Rock S01 孤獨搖滾!第一季 [Taiwanese Hokkien Dub][臺灣閩南語配音]/Bocchi the Rock S01 孤獨搖滾!第一季 [Taiwanese Hokkien Dub][Hàn-jī Hardsub][臺灣閩南語配音][漢字字幕]/Bocchi the Rock! 孤獨搖滾!S01E01「孤獨反輾轉」"; let (leaf_key, _, _, _) = template_key_for_filename("Bocchi the Rock! 孤獨搖滾!S01E01「孤獨反輾轉」"); assert_eq!(leaf_key, "TEXT SEP TEXT SEP TEXT SEP TEXT SXE TEXT"); assert!(filename_has_title( "Bocchi the Rock! 孤獨搖滾!S01E01「孤獨反輾轉」" )); let (trimmed, was_trimmed) = training_filename_for(bocchi); assert!(was_trimmed); assert_eq!(trimmed, "Bocchi the Rock! 
孤獨搖滾!S01E01「孤獨反輾轉」"); let (key, _, _, _) = template_key_for_filename(&trimmed); assert_eq!(key, "TEXT SEP TEXT SEP TEXT SEP TEXT SXE TEXT"); let usagi = "Gochuumon wa Usagi Desuka-60fps/Gochuumon wa Usagi Desuka S1/Usagi S1[01][60fps][8bit_1080p][x265_flac]"; let (trimmed, was_trimmed) = training_filename_for(usagi); assert!(was_trimmed); assert_eq!(trimmed, "Usagi S1[01][60fps][8bit_1080p][x265_flac]"); let (key, _, _, _) = template_key_for_filename(&trimmed); assert_eq!( key, "TEXT SEP SEASON BRACKET_EPISODE BRACKET_TEXT BRACKET_MEDIA_BLOCK BRACKET_MEDIA" ); let woody_parent = "Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p"; let (trimmed, was_trimmed) = training_filename_for(&format!("Batch/{woody_parent}")); assert!(was_trimmed); assert_eq!( trimmed, "Season 4 E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p" ); let najica = "[2001] Najica_七虹香電擊作戰(ナジカ電撃作戦)_TV/SourceUnknown.RMVB.640x480.twHard/01"; let (trimmed, was_trimmed) = training_filename_for(najica); assert!(was_trimmed); assert_eq!(trimmed, "[2001] Najica_七虹香電擊作戰(ナジカ電撃作戦) 01"); let najica_labels = labels_for(&trimmed); assert!(najica_labels.contains(&("Najica".to_string(), "B-TITLE".to_string()))); assert!(!najica_labels.contains(&("SourceUnknown".to_string(), "B-TITLE".to_string()))); assert!(najica_labels.contains(&("01".to_string(), "B-EPISODE".to_string()))); let galient = "[1984-1986] Galient_機甲界(機甲界ガリアン)_TV.OVA/[1984-1985] Galient_機甲界(機甲界ガリアン)_TV/DVDRip.MKV.720x480.ruSub.左右黑邊保留/01"; let (trimmed, was_trimmed) = training_filename_for(galient); assert!(was_trimmed); assert_eq!(trimmed, "[1984-1985] Galient_機甲界(機甲界ガリアン) 01"); let galient_labels = labels_for(&trimmed); assert!(galient_labels.contains(&("Galient".to_string(), "B-TITLE".to_string()))); assert!(!galient_labels.contains(&("TV".to_string(), "B-TITLE".to_string()))); assert!(galient_labels.contains(&("01".to_string(), "B-EPISODE".to_string()))); let nced = "[BDrip] Ao no Exorcist 
Yuki no Hate Hen S04 [343-Labs]/NCED"; let (trimmed, was_trimmed) = training_filename_for(nced); assert!(was_trimmed); assert_eq!( trimmed, "[BDrip] Ao no Exorcist Yuki no Hate Hen S04 [343-Labs] NCED" ); let sakura = "Card Captor Sakura Chinese/魔卡少女樱(台配国语)/第01集 小樱与不可思议的魔法书"; let (trimmed, was_trimmed) = training_filename_for(sakura); assert!(was_trimmed); assert_eq!( trimmed, "魔卡少女樱(台配国语) 第01集 小樱与不可思议的魔法书" ); let sakura_labels = labels_for(&trimmed); assert!(sakura_labels.contains(&("魔卡少女樱".to_string(), "B-TITLE".to_string()))); assert!(sakura_labels.contains(&("01".to_string(), "B-EPISODE".to_string()))); let volume = labels_for("[Snow-Raws] 生徒会役員共 Vol.01 MENU02 (BD 1920x1080 HEVC-YUV420P10 FLAC)"); assert!(volume.contains(&("生徒会役員共".to_string(), "B-TITLE".to_string()))); assert!(volume.contains(&("Vol".to_string(), "B-SPECIAL".to_string()))); assert!(volume.contains(&("01".to_string(), "B-SPECIAL".to_string()))); assert!(volume.contains(&("MENU02".to_string(), "B-SPECIAL".to_string()))); assert!(!volume.contains(&("01".to_string(), "B-EPISODE".to_string()))); let aria_notice = labels_for( "[KNA-Subs&ANK-Raws] 緋弾のアリアAA 番宣1 (BDrip 1920x1080 HEVC-YUV420P10 FLAC)", ); assert!(aria_notice.contains(&("緋弾のアリア".to_string(), "B-TITLE".to_string()))); assert!(aria_notice.contains(&("番宣".to_string(), "B-SPECIAL".to_string()))); assert!(aria_notice.contains(&("1".to_string(), "B-SPECIAL".to_string()))); assert!(!aria_notice.contains(&("1".to_string(), "B-EPISODE".to_string()))); let lost_song = labels_for("[Snow-Raws] LOST SONG CM&PV 01(BD 1920x1080 HEVC-YUV420P10 FLAC)"); assert!(lost_song.contains(&("LOST".to_string(), "B-TITLE".to_string()))); assert!(lost_song.contains(&("CM".to_string(), "B-SPECIAL".to_string()))); assert!(lost_song.contains(&("PV".to_string(), "B-SPECIAL".to_string()))); assert!(lost_song.contains(&("01".to_string(), "B-SPECIAL".to_string()))); assert!(!lost_song.contains(&("01".to_string(), "B-EPISODE".to_string()))); let numeric_title = 
labels_for("3000.Leagues.in.Search.of.Mother.S01E01.1080p.WEB-DL.H.264-D00oo00M"); assert!(numeric_title.contains(&("3000".to_string(), "B-TITLE".to_string()))); assert!(numeric_title.contains(&("01".to_string(), "B-SEASON".to_string()))); assert!(numeric_title.contains(&("01".to_string(), "B-EPISODE".to_string()))); assert!(numeric_title.contains(&("1080p".to_string(), "B-RESOLUTION".to_string()))); assert!(numeric_title.contains(&("H".to_string(), "B-SOURCE".to_string()))); assert!(numeric_title.contains(&("264".to_string(), "B-SOURCE".to_string()))); assert!(!numeric_title.contains(&("264".to_string(), "B-EPISODE".to_string()))); let media_block = labels_for("[Kamigami] Kantai Collection - 06v2 [1920×1080 x264 AAC Sub(Chs,Cht,Jap)]"); assert!(media_block.contains(&("1920".to_string(), "B-RESOLUTION".to_string()))); assert!(media_block.contains(&("1080".to_string(), "B-RESOLUTION".to_string()))); assert!(media_block.contains(&("x264".to_string(), "B-SOURCE".to_string()))); assert!(media_block.contains(&("Chs".to_string(), "B-SOURCE".to_string()))); let ge999 = labels_for("GE999 第024話 「次元航海惑星」1979年02月22日 (720x540 x264 AAC2)"); assert!(ge999.contains(&("GE999".to_string(), "B-TITLE".to_string()))); assert!(ge999.contains(&("024".to_string(), "B-EPISODE".to_string()))); assert!(!ge999.contains(&("22".to_string(), "B-EPISODE".to_string()))); let galaxy = labels_for("銀河鉄道999 第024話 「次元航海惑星」 (DVD 640x480 WMV9)"); assert!(galaxy.contains(&("銀河鉄道".to_string(), "B-TITLE".to_string()))); assert!(galaxy.contains(&("999".to_string(), "B-TITLE".to_string()))); assert!(galaxy.contains(&("024".to_string(), "B-EPISODE".to_string()))); let mahoro = labels_for("[POPGO][FREEWIND][Mahoro_Matic][Full_HD-BDRIP][01]"); assert!(mahoro.contains(&("Mahoro".to_string(), "B-TITLE".to_string()))); assert!(!mahoro.contains(&("Full".to_string(), "B-TITLE".to_string()))); assert!(mahoro.contains(&("01".to_string(), "B-EPISODE".to_string()))); let kitaro = labels_for( "[1985.10-1988.02] 
Kitaro_鬼太郎 第3期(ゲゲゲの鬼太郎)_TV 036 異次元妖怪かまなり", ); assert!(kitaro.contains(&("Kitaro".to_string(), "B-TITLE".to_string()))); assert!(kitaro.contains(&("3".to_string(), "B-SEASON".to_string()))); assert!(kitaro.contains(&("036".to_string(), "B-EPISODE".to_string()))); assert!(!kitaro.contains(&("1985".to_string(), "B-EPISODE".to_string()))); let urusei = labels_for("Urusei_Yatsura_DVD_Ep042.5_Simu"); assert!(urusei.contains(&("Urusei".to_string(), "B-TITLE".to_string()))); assert!(urusei.contains(&("042".to_string(), "B-EPISODE".to_string()))); assert!(urusei.contains(&(".".to_string(), "B-EPISODE".to_string()))); assert!(urusei.contains(&("5".to_string(), "B-EPISODE".to_string()))); let lupin = labels_for("[Lupin The Thrid Jigen Daisuke no Bohyou][Logo][BDRIP][1080P][H264_FLAC]"); assert!(lupin.contains(&("Lupin".to_string(), "B-TITLE".to_string()))); assert!(!lupin.contains(&("Lupin".to_string(), "B-GROUP".to_string()))); let mirumo = labels_for("【咪路fans】魔法咪路咪路第二季日语版 01[GB][MP4]"); assert!(mirumo.contains(&("魔法咪路咪路".to_string(), "B-TITLE".to_string()))); assert!(mirumo.contains(&("第二季".to_string(), "B-SEASON".to_string()))); assert!(mirumo.contains(&("01".to_string(), "B-EPISODE".to_string()))); let doremi_bonus = labels_for( "おジャ魔女どれみナ・イ・ショ 特典映像07「おジャ魔女どれみナ・イ・ショ エンドテロップ集」(DVD 640x480 )", ); assert!(doremi_bonus.contains(&("おジャ魔女どれみナ".to_string(), "B-TITLE".to_string()))); assert!(doremi_bonus.contains(&("07".to_string(), "B-SPECIAL".to_string()))); assert!(!doremi_bonus.contains(&("07".to_string(), "B-EPISODE".to_string()))); let bd_menu = labels_for("[HYSUB]Kuusen Madoushi Kouhosei no Kyoukan[BDMenu][01v1][MP4][1280X720]"); assert!(bd_menu.contains(&("BDMenu".to_string(), "B-SPECIAL".to_string()))); assert!(bd_menu.contains(&("01v1".to_string(), "B-SPECIAL".to_string()))); assert!(!bd_menu.contains(&("BDMenu".to_string(), "B-TITLE".to_string()))); let ura_on = labels_for("K-ON !! (TV S2 2010). URA-ON !! 
01; 1080_h264_flac"); assert!(ura_on.contains(&("K".to_string(), "B-TITLE".to_string()))); assert!(ura_on.contains(&("01".to_string(), "B-EPISODE".to_string()))); assert!(ura_on.contains(&("1080".to_string(), "B-RESOLUTION".to_string()))); assert!(!ura_on.contains(&("1080".to_string(), "B-EPISODE".to_string()))); let machikado = labels_for("[KTXP][Machikado_Mazoku_S2][Mini][01][GB][1080p][BDrip][HEVC]"); assert!(machikado.contains(&("Machikado".to_string(), "B-TITLE".to_string()))); assert!(machikado.contains(&("S2".to_string(), "B-SEASON".to_string()))); assert!(machikado.contains(&("01".to_string(), "B-EPISODE".to_string()))); let ronin = labels_for("【蓝色狂想】魔神坛斗士国日双语第01集"); assert!(ronin.contains(&("魔神坛斗士".to_string(), "B-TITLE".to_string()))); assert!(ronin.contains(&("国日双语".to_string(), "B-SOURCE".to_string()))); assert!(ronin.contains(&("01".to_string(), "B-EPISODE".to_string()))); let ghiblies = labels_for("Ghiblies - Episode 2 op"); assert!(ghiblies.contains(&("Ghiblies".to_string(), "B-TITLE".to_string()))); assert!(ghiblies.contains(&("2".to_string(), "B-TITLE".to_string()))); assert!(!ghiblies.contains(&("2".to_string(), "B-EPISODE".to_string()))); let tv_spot = labels_for("[RUELL-Next] Fruits Basket TV Spot 1 (DVD 768x576 x264 AAC) [49531416]"); assert!(tv_spot.contains(&("TV".to_string(), "B-SPECIAL".to_string()))); assert!(tv_spot.contains(&("1".to_string(), "B-SPECIAL".to_string()))); assert!(!tv_spot.contains(&("1".to_string(), "B-EPISODE".to_string()))); let preview_seconds = labels_for("[DVD] 鋼鉄天使くるみ 予告 第03話 30秒バージョン (640x480 WMV9)"); assert!(preview_seconds.contains(&("03".to_string(), "B-EPISODE".to_string()))); assert!(!preview_seconds.contains(&("30".to_string(), "B-EPISODE".to_string()))); let hi10_source = labels_for("[POPGO][Shigatsu wa Kimi no Uso] [01][Hi10][720P][GB][A964DA24]"); assert!(hi10_source.contains(&("Hi10".to_string(), "B-SOURCE".to_string()))); assert!(!hi10_source.contains(&("Hi10".to_string(), "B-GROUP".to_string()))); let 
souten = labels_for( "[苍天之拳].[Fosky_Fansub][Souten_No_Ken][DVDRIP][01][H.264_FLAC][848x480][CDD495FC]", ); assert!(souten.contains(&("Fosky".to_string(), "B-GROUP".to_string()))); assert!(!souten.contains(&("苍天之拳".to_string(), "B-GROUP".to_string()))); assert!(souten.contains(&("Souten".to_string(), "B-TITLE".to_string()))); let bonjour = labels_for( "(2014Q4) Bonjour♪恋味パティスリー 第01話 「Lesson 1」 (1280x720 x265 10bit AAC)", ); assert!(bonjour.contains(&("01".to_string(), "B-EPISODE".to_string()))); assert!(!bonjour.contains(&("1".to_string(), "B-EPISODE".to_string()))); let durarara = labels_for("[VCB-Studio] Durarara!!×2 Ketsu [Menu01][Ma10p_1080p][x265_flac]"); assert!(durarara.contains(&("Durarara".to_string(), "B-TITLE".to_string()))); assert!(durarara.contains(&("2".to_string(), "B-TITLE".to_string()))); assert!(!durarara.contains(&("2".to_string(), "B-EPISODE".to_string()))); let bd_spot = labels_for("[Moozzi2] Amanchu! [SP05] BD-Spot - 01 (BD 1920x1080 x.264 Flac)"); assert!(bd_spot.contains(&("Spot".to_string(), "B-SPECIAL".to_string()))); assert!(bd_spot.contains(&("01".to_string(), "B-SPECIAL".to_string()))); assert!(!bd_spot.contains(&("01".to_string(), "B-EPISODE".to_string()))); let preview_number = labels_for("[Snow-Raws] 刀使ノ巫女 第02話 予告01 (BD 1920x1080 HEVC-YUV420P10 FLAC)"); assert!(preview_number.contains(&("02".to_string(), "B-EPISODE".to_string()))); assert!(preview_number.contains(&("01".to_string(), "B-SPECIAL".to_string()))); let bleach_movie = labels_for("Bleach the Movie 3 - Fade to Black, I Call Your Name"); assert!(bleach_movie.contains(&("3".to_string(), "B-TITLE".to_string()))); assert!(!bleach_movie.contains(&("3".to_string(), "B-EPISODE".to_string()))); let conan_movie = labels_for( "[DBD-Raws][Detective Conan Movie 27 The Million-Dollar Pentagram][PV][01][1080P]", ); assert!(conan_movie.contains(&("27".to_string(), "B-TITLE".to_string()))); assert!(conan_movie.contains(&("PV".to_string(), "B-SPECIAL".to_string()))); let madoka_movie = 
labels_for( "[DBD-Raws][Puella Magi Madoka Magica the Movie 01 Beginnings][NCED][1080P]", ); assert!(madoka_movie.contains(&("01".to_string(), "B-TITLE".to_string()))); assert!(madoka_movie.contains(&("Beginnings".to_string(), "B-TITLE".to_string()))); let fate_first_order = labels_for("[DBD-Raws][Fate Grand Order ‐First Order‐][PV][01][1080P]"); assert!(fate_first_order.contains(&("Fate".to_string(), "B-TITLE".to_string()))); assert!(fate_first_order.contains(&("‐".to_string(), "B-TITLE".to_string()))); assert!(fate_first_order.contains(&("First".to_string(), "B-TITLE".to_string()))); let trillion_game = labels_for("[ANi] 一兆$遊戲 - 03 [1080P][Baha][WEB-DL][AAC AVC][CHT]"); assert!(trillion_game.contains(&("一兆".to_string(), "B-TITLE".to_string()))); assert!(trillion_game.contains(&("$".to_string(), "B-TITLE".to_string()))); assert!(trillion_game.contains(&("遊戲".to_string(), "B-TITLE".to_string()))); let lapis = labels_for("[Nekomoe kissaten&LoliHouse] Lapis Re꞉LiGHTs - PV01 [BDRip 1080p]"); assert!(lapis.contains(&("Re".to_string(), "B-TITLE".to_string()))); assert!(lapis.contains(&("꞉".to_string(), "B-TITLE".to_string()))); assert!(lapis.contains(&("LiGHTs".to_string(), "B-TITLE".to_string()))); let rezero = labels_for("TVアニメ『Re:ゼロから始める異世界生活』第10話「鬼がかったやり方」予告"); assert!(!rezero.contains(&("TV".to_string(), "B-TITLE".to_string()))); assert!(!rezero.contains(&("アニメ".to_string(), "B-TITLE".to_string()))); assert!(rezero.contains(&("Re".to_string(), "B-TITLE".to_string()))); assert!(rezero.contains(&("第".to_string(), "B-EPISODE".to_string()))); assert!(rezero.contains(&("話".to_string(), "B-EPISODE".to_string()))); let shark = labels_for("アニメ『おでかけ子ザメ』第10話「かじゅえん」"); assert!(!shark.contains(&("アニメ".to_string(), "B-TITLE".to_string()))); assert!(shark.contains(&("おでかけ子ザメ".to_string(), "B-TITLE".to_string()))); let creditless = labels_for("[ANK-Raws] デート・ア・ライブⅡ Creditless ED (Bdrip 1920x1080 HEVC FLAC)"); assert!(creditless.contains(&("Creditless".to_string(), 
"B-SPECIAL".to_string()))); assert!(creditless.contains(&("ED".to_string(), "B-SPECIAL".to_string()))); let no_number = labels_for("[甜甜圈字幕组] 小讨厌 081「爷爷的礼物 No.1」"); assert!(no_number.contains(&("081".to_string(), "B-EPISODE".to_string()))); assert!(!no_number.contains(&("1".to_string(), "B-EPISODE".to_string()))); let bilingual = labels_for( "辉夜大小姐想让我告白~天才们的恋爱头脑战~.S2-01.中日双语.云光字幕组.[1080p]", ); assert!(bilingual.contains(&("中日".to_string(), "B-SOURCE".to_string()))); assert!(!bilingual.contains(&("中日".to_string(), "B-TITLE".to_string()))); let nekomoe_lang = labels_for("[Nekomoe kissaten][UniteUp!][05][720p][JPTC]"); assert!(nekomoe_lang.contains(&("JPTC".to_string(), "B-SOURCE".to_string()))); assert!(!nekomoe_lang.contains(&("JPTC".to_string(), "B-TITLE".to_string()))); let hayate = labels_for("[漏勺rip][Hayate_the_combat_butler_2nd_Season][23][BDrip X264 AAC 720P]"); assert!(hayate.contains(&("Hayate".to_string(), "B-TITLE".to_string()))); assert!(hayate.contains(&("2nd".to_string(), "B-SEASON".to_string()))); assert!(hayate.contains(&("Season".to_string(), "B-SEASON".to_string()))); assert!(hayate.contains(&("23".to_string(), "B-EPISODE".to_string()))); let yama = labels_for("[A.I.R.nesSub][Yama_no_Susume_Second_Season][08][720p]"); assert!(yama.contains(&("Yama".to_string(), "B-TITLE".to_string()))); assert!(yama.contains(&("Second".to_string(), "B-SEASON".to_string()))); assert!(yama.contains(&("Season".to_string(), "B-SEASON".to_string()))); let one_room = labels_for("[DMG][One Room Second Season][00][1080P][BIG5]"); assert!(one_room.contains(&("One".to_string(), "B-TITLE".to_string()))); assert!(one_room.contains(&("Second".to_string(), "B-SEASON".to_string()))); assert!(one_room.contains(&("Season".to_string(), "B-SEASON".to_string()))); let jade = labels_for("[GM-Team][国漫][诛仙 第2季][Jade Dynasty Ⅱ][2024][12][AVC][GB][1080P]"); assert!(jade.contains(&("Jade".to_string(), "B-TITLE".to_string()))); assert!(jade.contains(&("Dynasty".to_string(), 
"B-TITLE".to_string()))); assert!(jade.contains(&("Ⅱ".to_string(), "B-SEASON".to_string()))); assert!(jade.contains(&("12".to_string(), "B-EPISODE".to_string()))); let yu_no = labels_for( "[JYFanSub][Kono_Yo_no_Hate_de_Koi_wo_Utau_Shoujo_YU-NO][23][BIG5][720P][AVC]", ); assert!(yu_no.contains(&("NO".to_string(), "B-TITLE".to_string()))); assert!(yu_no.contains(&("23".to_string(), "B-EPISODE".to_string()))); let yu_no_dash = labels_for("[LowPower-Raws] この世の果てで恋を唄う少女YU-NO - 01 (BD 1080P x264 FLAC)"); assert!(yu_no_dash.contains(&("NO".to_string(), "B-TITLE".to_string()))); assert!(yu_no_dash.contains(&("01".to_string(), "B-EPISODE".to_string()))); let fox = labels_for( "[GM-Team][国漫][狐妖小红娘 尾生篇][Fox Spirit Matchmaker Ⅷ][2019][05][AVC][GB][1080P]", ); assert!(fox.contains(&("Fox".to_string(), "B-TITLE".to_string()))); assert!(fox.contains(&("Ⅷ".to_string(), "B-SEASON".to_string()))); let kage = labels_for("[LKSUB][Kage no Jitsuryokusha ni Naritakute! 2nd Season][03][GB][720P]"); assert!(kage.contains(&("2nd".to_string(), "B-SEASON".to_string()))); assert!(kage.contains(&(" ".to_string(), "B-SEASON".to_string()))); assert!(kage.contains(&("Season".to_string(), "B-SEASON".to_string()))); let tiger = labels_for("[虎面人W][Tiger Mask W][01][简日][720p]"); assert!(tiger.contains(&("Tiger".to_string(), "B-TITLE".to_string()))); assert!(tiger.contains(&("W".to_string(), "B-TITLE".to_string()))); assert!(tiger.contains(&("01".to_string(), "B-EPISODE".to_string()))); let date_live_special = labels_for("[ANK-Raws] デート・ア・ライブⅡ CM01 (BDrip 1920x1080 HEVC-YUV420P10 FLAC)"); assert!(date_live_special.contains(&("Ⅱ".to_string(), "B-SEASON".to_string()))); assert!(date_live_special.contains(&("CM01".to_string(), "B-SPECIAL".to_string()))); let lupin_part = labels_for("[SnowDream][Part 5_Lupin Sansei Part 5][01][BIG5][720P]"); assert!(lupin_part.contains(&("Lupin".to_string(), "B-TITLE".to_string()))); assert!(lupin_part.contains(&("Sansei".to_string(), "B-TITLE".to_string()))); 
assert!(!lupin_part.contains(&("Part".to_string(), "B-TITLE".to_string()))); assert!(lupin_part.contains(&("5".to_string(), "B-SEASON".to_string()))); assert!(!lupin_part.contains(&("5".to_string(), "B-SPECIAL".to_string()))); let roman_leaf = dmhy_record( "Ⅰ 001 魯邦燃起了鬥志", "tpl_test", &suggested_roles("TEXT SEP EPISODE SEP TEXT"), ) .unwrap(); assert!(roman_leaf .tokens .iter() .zip(roman_leaf.labels.iter()) .any(|(token, label)| token == "Ⅰ" && label == "B-SEASON")); assert!(audit_warnings(&roman_leaf).contains(&"no_title".to_string())); let hallow = labels_for("[c.c动漫 ccwzz.cc][驱魔少年HALLOW][第09话][GB][720p]"); assert!(hallow.contains(&("驱魔少年HALLOW".to_string(), "B-TITLE".to_string()))); assert!(hallow.contains(&("第09话".to_string(), "B-EPISODE".to_string()))); let fairy = labels_for("[魔導少年 最終章][EP35][繁体][1080P]"); assert!(fairy.contains(&("魔導少年".to_string(), "B-TITLE".to_string()))); assert!(fairy.contains(&("EP35".to_string(), "B-EPISODE".to_string()))); let mebius = labels_for("【CXRAW】【ウルトラマンメビウス】【22】【日々の未来】【DVDrip】【x264 Hi10P AAC】【MP4】"); assert!(mebius.contains(&("ウルトラマンメビウス".to_string(), "B-TITLE".to_string()))); assert!(mebius.contains(&("22".to_string(), "B-EPISODE".to_string()))); let battle = labels_for("斗破苍穹三年之约第01话"); assert!(battle.contains(&("斗破苍穹三年之约".to_string(), "B-TITLE".to_string()))); assert!(battle.contains(&("第".to_string(), "B-EPISODE".to_string()))); assert!(battle.contains(&("01".to_string(), "B-EPISODE".to_string()))); assert!(battle.contains(&("话".to_string(), "B-EPISODE".to_string()))); let hakumei = labels_for("妖精森林的小不点01"); assert!(hakumei.contains(&("妖精森林的小不点".to_string(), "B-TITLE".to_string()))); assert!(hakumei.contains(&("01".to_string(), "B-EPISODE".to_string()))); let decimal_episode_title = labels_for("无限系统树:第1话可能性的起点"); assert!(decimal_episode_title.contains(&("无限系统树".to_string(), "B-TITLE".to_string()))); assert!(decimal_episode_title.contains(&("第".to_string(), "B-EPISODE".to_string()))); 
assert!(decimal_episode_title.contains(&("1".to_string(), "B-EPISODE".to_string()))); let hash_range = labels_for("花田少年史#1-3"); assert!(hash_range.contains(&("花田少年史".to_string(), "B-TITLE".to_string()))); assert!(hash_range.contains(&("1".to_string(), "B-EPISODE".to_string()))); assert!(hash_range.contains(&("-".to_string(), "B-EPISODE".to_string()))); assert!(hash_range.contains(&("3".to_string(), "B-EPISODE".to_string()))); let movie_number = labels_for("[Kamigami] Haikyuu!! Movie - 01 [BD 1080p x265 Ma10p AAC]"); assert!(movie_number.contains(&("Haikyuu".to_string(), "B-TITLE".to_string()))); assert!(movie_number.contains(&("01".to_string(), "B-SPECIAL".to_string()))); assert!(!movie_number.contains(&("01".to_string(), "B-EPISODE".to_string()))); let ajin_movie = labels_for("[Moozzi2] Ajin The Movie - 01 (BD 1920x1080 x.264 FLACx2)"); assert!(ajin_movie.contains(&("Ajin".to_string(), "B-TITLE".to_string()))); assert!(ajin_movie.contains(&("01".to_string(), "B-SPECIAL".to_string()))); let eien = labels_for( "[Nekomoe kissaten&LoliHouse] Eien no 831 [WebRip 1080p HEVC-10bit AAC ASSx2]", ); assert!(eien.contains(&("Eien".to_string(), "B-TITLE".to_string()))); assert!(eien.contains(&("831".to_string(), "B-TITLE".to_string()))); let ep_only = dmhy_record("Ep.25", "tpl_test", &suggested_roles("TEXT SEP EPISODE")).unwrap(); assert!(audit_warnings(&ep_only).contains(&"no_title".to_string())); } }