Improve DMHY template labeling pipeline

Browse files

Files changed (2) hide show

datasets/AnimeName +1 -1
tools/rust_dmhy_template_apply/src/main.rs +964 -28

datasets/AnimeName CHANGED Viewed

	@@ -1 +1 @@
1	- Subproject commit ~~ab3fbcad1a4bf889090d050248130c7d763c457e~~


1	+ Subproject commit 081fd450aafd59992f2df794c5b0110dc3cdd42b

tools/rust_dmhy_template_apply/src/main.rs CHANGED Viewed

@@ -1,7 +1,7 @@
 use anyhow::{bail, Context, Result};
 use chrono::Utc;
 use clap::Parser;
-use once_cell::sync::Lazy;
 use rayon::prelude::*;
 use regex::Regex;
 use serde::{Deserialize, Serialize};
@@ -21,6 +21,8 @@ struct Args {
     audit_low_frequency: bool,
     #[arg(long)]
     verify_generated_output: bool,
     #[arg(long, default_value = "datasets/AnimeName/dmhy_list.jsonl")]
     input: PathBuf,
     #[arg(long, default_value = "reports/dmhy_template_recipes.seed.jsonl")]
@@ -53,6 +55,8 @@ struct Args {
     review_output: PathBuf,
     #[arg(long, default_value = "reports/dmhy_low_frequency_audit.rust.jsonl")]
     audit_output: PathBuf,
     #[arg(long, default_value_t = 50)]
     audit_max_count: u64,
     #[arg(long)]
@@ -81,10 +85,22 @@ struct Args {
     keep_encoding_noise: bool,
     #[arg(long)]
     preserve_parent_paths: bool,
     #[arg(long)]
     threads: Option<usize>,
 }
 #[derive(Debug, Clone, Deserialize)]
 struct Recipe {
     template_id: String,
@@ -151,11 +167,20 @@ enum Processed {
 static HASH_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[A-Fa-f0-9]{8,}$").unwrap());
 static RESOLUTION_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"(?i)^(?:\d{3,4}p|\dK|\d{3,4}[xX×]\d{3,4})$").unwrap());
 static EPISODE_VERSION_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"(?i)^(?:EP?)?\d{1,4}(?:v|ver|version|rev)\d{1,3}$").unwrap());
 static EPISODE_RE: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}(?:END)?$").unwrap());
 static EPISODE_CJK_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^第?\d{1,4}[话話回集]$").unwrap());
 static EPISODE_RANGE_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"(?i)^\d{1,4}\s*[-~]\s*\d{1,4}(?:\s*END)?$").unwrap());
 static EPISODE_BATCH_RE: Lazy<Regex> = Lazy::new(|| {
@@ -173,7 +198,7 @@ static CJK_SEASON_TOKEN_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
 static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
 static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(r"(?i)^(?:NCOP|NCED|OP|ED|PV|CM|SP|OVA|OAD|IV|Menu|Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?$").unwrap()
 });
 static VOLUME_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
@@ -183,7 +208,7 @@ static LANG_RE: Lazy<Regex> = Lazy::new(|| {
     Regex::new(r"(?i)^(?:CHS|CHT|ZHS|ZHT|GB|BIG5|JPN?|JP|JA|JAP|ENG|EN|SC|TC|简[体體]?|繁[体體]?|简日|繁日|字幕|内封|外挂|Sub|Subs|MSubs?)$").unwrap()
 });
 static MEDIA_RE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(r"(?i)^(?:WEB[-_. ]?DL|WEB[-_. ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|HDTV|TVRip|REMUX|x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|FLAC|MP3|DTS|DTS-HDMA|AC3|Opus|10[-_. ]?bit|8[-_. ]?bit|Hi10p|Ma10p|ASSx?\d*|SRTx?\d*|R\d[A-Z]*|NoSub|MKV|MP4|AVI|RAW|Raws?)$").unwrap()
 });
 static SPECIAL_TITLE_PHRASE_RE: Lazy<Regex> = Lazy::new(|| {
     Regex::new(r"(?i)\b(?:theater\s+greeting\s+event|world\s+prem(?:eie|iere)|picture\s+drama)\b")
@@ -191,6 +216,8 @@ static SPECIAL_TITLE_PHRASE_RE: Lazy<Regex> = Lazy::new(|| {
 });
 static YEAR_RANGE_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"^\(?\s*(?:19|20)\d{2}\s*[-~]\s*(?:19|20)\d{2}\s*\)?$").unwrap());
 static PATH_SEGMENT_SEASON_RE: Lazy<Regex> = Lazy::new(|| {
     Regex::new(r"(?i)(?:^|[\s_.\-/])(?:season\s*\d{1,2}|s\d{1,2})(?:$|[\s_.\-/])").unwrap()
 });
@@ -206,6 +233,7 @@ static SXE_SEASON_RE: Lazy<Regex> = Lazy::new(|| {
 static TOKEN_REGEXES: Lazy<Vec<Regex>> = Lazy::new(|| {
     [
         r"^\d{3,4}[xX×]\d{3,4}",
         r"(?i)^h\.?26[45]",
         r"(?i)^x\.?26[45]",
         r"^[\\/]+",
@@ -233,6 +261,7 @@ fn main() -> Result<()> {
             .build_global()
             .context("failed to configure rayon thread pool")?;
     }
     if args.cluster {
         return run_cluster(&args);
     }
@@ -242,6 +271,9 @@ fn main() -> Result<()> {
     if args.verify_generated_output {
         return run_verify_generated_output(&args);
     }
     if args.expand != "all" && args.expand != "sample" {
         bail!("--expand must be all or sample");
     }
@@ -334,6 +366,7 @@ fn main() -> Result<()> {
         "min_count": args.min_count,
         "low_frequency_audit_max_count": args.audit_max_count,
         "low_frequency_blocking_warnings": [
             "hash_labeled",
             "multiple_title_spans",
             "no_title",
@@ -355,6 +388,57 @@ fn main() -> Result<()> {
     Ok(())
 }
 fn load_recipes(args: &Args) -> Result<HashMap<String, Recipe>> {
     let file = File::open(&args.recipes)
         .with_context(|| format!("recipe JSONL not found: {}", args.recipes.display()))?;
@@ -745,7 +829,11 @@ fn run_verify_generated_output(args: &Args) -> Result<()> {
         for warning in audit_warnings(&record) {
             if !matches!(
                 warning.as_str(),
-                "hash_labeled" | "multiple_title_spans" | "no_title" | "path_retained"
             ) {
                 continue;
             }
@@ -780,6 +868,204 @@ fn run_verify_generated_output(args: &Args) -> Result<()> {
     Ok(())
 }
 fn entity_spans(tokens: &[String], labels: &[String]) -> Vec<Value> {
     let mut spans = Vec::new();
     let mut current_label: Option<String> = None;
@@ -820,8 +1106,16 @@ fn audit_warnings(record: &Record) -> Vec<String> {
     } else if title_spans > 1 {
         warnings.push("multiple_title_spans".to_string());
     }
-    if !record.labels.iter().any(|label| label.ends_with("EPISODE")) {
         warnings.push("no_episode".to_string());
     }
     if record.filename.contains('/') || record.filename.contains('\\') {
         warnings.push("path_retained".to_string());
@@ -927,7 +1221,11 @@ fn has_blocking_low_frequency_warning(record: &Record) -> bool {
     audit_warnings(record).iter().any(|warning| {
         matches!(
             warning.as_str(),
-            "hash_labeled" | "multiple_title_spans" | "no_title" | "path_retained"
         )
     })
 }
@@ -1046,6 +1344,9 @@ fn classify_atom(text: &str) -> String {
     if EPISODE_VERSION_RE.is_match(&compact) {
         return "EPISODE_VERSION".to_string();
     }
     if SXE_RE.is_match(&compact) {
         return "SXE".to_string();
     }
@@ -1321,8 +1622,33 @@ fn training_filename_for(original: &str) -> (String, bool) {
         .map(str::trim)
         .filter(|part| !part.is_empty())
         .collect();
     if parts.len() >= 2 && filename_has_title(parts[parts.len() - 1]) {
-        if parts.len() >= 3 && path_segment_has_season(parts[parts.len() - 2]) {
             if !path_segment_is_plain_season(parts[parts.len() - 2]) {
                 return (parts[parts.len() - 1].to_string(), true);
             }
@@ -1334,7 +1660,14 @@ fn training_filename_for(original: &str) -> (String, bool) {
             {
                 (parts[parts.len() - 1].to_string(), true)
             } else {
-                (parts[parts.len() - 2..].join("/"), true)
             }
         } else {
             (parts[parts.len() - 1].to_string(), true)
@@ -1349,6 +1682,43 @@ fn path_segment_is_plain_season(segment: &str) -> bool {
     PLAIN_SEASON_SEGMENT_RE.is_match(&cleaned)
 }
 fn path_segment_has_season(value: &str) -> bool {
     PATH_SEGMENT_SEASON_RE.is_match(value)
 }
@@ -1368,7 +1738,9 @@ fn has_encoding_noise(value: &str) -> bool {
         return true;
     }
     let markers = [
-        "譁", "蜈", "螟", "蟄", "謇", "邱", "荳", "縺", "繧", "莨", "鬆", "髯",
     ];
     let marker_hits = markers
         .iter()
@@ -1403,7 +1775,83 @@ fn path_segment_is_episodeish(value: &str) -> bool {
     !structural.is_empty()
         && structural
             .iter()
-            .all(|item| item.starts_with("EPISODE") || item.as_str() == "SPECIAL")
 }
 fn has_abstract_path_noise(value: &str) -> bool {
@@ -1642,6 +2090,11 @@ fn split_sxe_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
 }
 fn split_episode_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
     let caps = EPISODE_VALUE_RE.captures(token)?;
     let mut pieces = vec![caps[1].to_string(), caps[2].to_string()];
     let mut labels = vec!["O".to_string(), "B-EPISODE".to_string()];
@@ -1672,6 +2125,29 @@ fn group_text(tokens: &[String], group: &Group) -> String {
     )
 }
 fn is_special_title_phrase(text: &str) -> bool {
     let normalized = SPECIAL_SPACE_RE
         .replace_all(text, " ")
@@ -1681,6 +2157,8 @@ fn is_special_title_phrase(text: &str) -> bool {
         normalized.as_str(),
         "CM" | "EVENT"
             | "EIZOU"
             | "LOGO"
             | "MENU"
             | "OMAKE"
@@ -1690,13 +2168,123 @@ fn is_special_title_phrase(text: &str) -> bool {
             | "TOKUTEN"
             | "TRAILER"
             | "WORLD PREMIERE"
-    ) || SPECIAL_TITLE_PHRASE_RE.is_match(text)
 }
 fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]) -> Vec<String> {
     let mut output = roles.to_vec();
     let ep_markers = ["EP", "E", "Episode", "ep", "episode"];
     let roman = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX"];
     if !output.iter().any(|role| role == "TITLE")
         && roles
             .first()
@@ -1790,17 +2378,40 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
             }
             if output[index - 2] == "TITLE"
                 && groups[index - 1].class_name == "SEP"
-                && previous_text.len() <= 4
-                && previous_text.is_ascii()
-                && previous_text.chars().all(|ch| ch.is_ascii_alphabetic())
                 && text.chars().all(|ch| ch.is_ascii_digit())
                 && text.len() <= 3
-                && (next_special || next_episode)
             {
                 output[index] = "TITLE".to_string();
                 continue;
             }
         }
         if roles[index].starts_with("EPISODE")
             && index >= 2
             && output[..index].iter().any(|role| role == "TITLE")
@@ -1843,6 +2454,15 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
             output[index] = "SPECIAL".to_string();
             continue;
         }
         if roles[index] == "TITLE" && matches!(text.as_str(), "第" | "話" | "话" | "回" | "集")
         {
             output[index] = "O".to_string();
@@ -1870,8 +2490,10 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
             && groups[index + 1].class_name == "SEP"
             && roles[index + 2].starts_with("EPISODE")
         {
-            output[index] = "O".to_string();
-            output[index + 2] = "SEASON".to_string();
             continue;
         }
         if roles[index] == "TITLE"
@@ -1897,6 +2519,37 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
                 output[index + 2] = "O".to_string();
             }
         }
         if roles[index].starts_with("EPISODE") {
             let previous_text = if index >= 1 {
                 group_text(tokens, &groups[index - 1])
@@ -1959,6 +2612,7 @@ fn title_candidates(groups: &[Group], roles: &[String]) -> Vec<(usize, usize)> {
 }
 fn enforce_single_title_candidate(
     groups: &[Group],
     roles: &[String],
 ) -> (Vec<String>, Vec<String>) {
@@ -1981,13 +2635,20 @@ fn enforce_single_title_candidate(
         .copied()
         .filter(|(_, end)| *end <= first_anchor)
         .collect();
-    let selected = (if before_anchor.is_empty() {
         &candidates
     } else {
         &before_anchor
-    })
     .iter()
-    .max_by_key(|(start, end)| (*end, end - start))
     .copied()
     .unwrap();
     let mut output = roles.to_vec();
@@ -2006,6 +2667,33 @@ fn enforce_single_title_candidate(
     (output, dropped)
 }
 fn normalize_generated_tokens(tokens: &[String], labels: &[String]) -> (Vec<String>, Vec<String>) {
     let mut output_tokens = Vec::new();
     let mut output_labels = Vec::new();
@@ -2162,14 +2850,16 @@ fn project_refined_tokens(
 fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
     let joiners = [
         " ", ".", "-", "_", "·", "・", "×", "／", "/", "'", "’", ":", "：", "!", "！", "?",
-        "？", ";", "；", ",", "，", "～", "~", "－", "(", ")", "（", "）", "[", "]", "【",
-        "】", "｢", "｣", "「", "」", "☆", "@",
     ];
     let title_terminal_punctuation = ["!", "！", "?", "？"];
     let entity_joiners = [
         " ", ".", "-", "_", "·", "・", "×", "／", "/", "'", "’", ":", "：", "!", "！", "?",
-        "？", ";", "；", ",", "，", "～", "~", "－", "(", ")", "（", "）", "[", "]", "【",
-        "】", "｢", "｣", "「", "」", "☆", "@", "&", "＆",
     ];
     let mut output = labels.to_vec();
     for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
@@ -2203,17 +2893,50 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
                 output[index] = "B-TITLE".to_string();
             }
         }
     }
     output
 }
 fn dmhy_record(filename: &str, template_id: &str, roles: &[String]) -> Option<Record> {
     let (key, tokens, _classes, groups) = template_key_for_filename(filename);
     if groups.len() != roles.len() {
         return None;
     }
     let roles = adjust_contextual_roles(&tokens, &groups, roles);
-    let (roles, dropped) = enforce_single_title_candidate(&groups, &roles);
     let (tokens, labels) = project_refined_tokens(&tokens, &groups, &roles);
     let labels = smooth_title_spans(&tokens, &labels);
     if tokens.len() != labels.len() {
@@ -2246,6 +2969,18 @@ mod tests {
         record.tokens.into_iter().zip(record.labels).collect()
     }
     #[test]
     fn required_regressions() {
         let title_91 = labels_for("Title 91 EP 01 [1080p]");
@@ -2313,6 +3048,30 @@ mod tests {
         let comma_title =
             labels_for("[Studio GreenTea] Kamiina Botan, Yoeru Sugata wa Yuri no Hana [01]");
         assert!(comma_title.contains(&(",".to_string(), "B-TITLE".to_string())));
         let happy_lesson = labels_for("【DVD】 HAPPY☆LESSON THE TV 第01話");
         assert!(happy_lesson.contains(&("☆".to_string(), "B-TITLE".to_string())));
         let idolmaster = labels_for("[CASO&SumiSora][THE_IDOLM@STER_CINDERELLA_GIRLS][07.5_SP]");
@@ -2322,6 +3081,7 @@ mod tests {
         let mayoi = labels_for("[Snow-Raws] 迷家[マヨイガ] 第01話");
         assert!(mayoi.contains(&("迷家".to_string(), "B-TITLE".to_string())));
         assert!(mayoi.contains(&("マヨイガ".to_string(), "B-TITLE".to_string())));
         let conan_time = labels_for("【蓝色狂想】名侦探柯南TV版第036集-周一晚上7点半杀人事件");
         assert!(conan_time.contains(&("036".to_string(), "B-EPISODE".to_string())));
@@ -2336,6 +3096,121 @@ mod tests {
         assert!(sky.contains(&("One".to_string(), "B-TITLE".to_string())));
         assert!(!sky.contains(&("海贼王".to_string(), "B-TITLE".to_string())));
         assert!(sky.contains(&("918".to_string(), "B-EPISODE".to_string())));
     }
     #[test]
@@ -2345,7 +3220,7 @@ mod tests {
         assert!(was_trimmed);
         assert_eq!(
             trimmed,
-            "Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p"
         );
         let pokemon = "Pokémon Season 2 - Orange League [Ep. 83-118]/Pokemon - 084 - OL002 - A Scare in the Air [DVD][PM-Dragon-x264-AC3][00270AC8]";
         let (trimmed_pokemon, pokemon_was_trimmed) = training_filename_for(pokemon);
@@ -2402,6 +3277,27 @@ mod tests {
         assert!(was_trimmed);
         assert_eq!(trimmed, "Avatar The Last Airbender S2 14 [1080p]");
         let tintin = "Adventures of Tintin (1991) Season 1-3 S01-S03 (1080p BluRay x265 HEVC 10bit EAC3 2.0 Garshasp)/Season 1/Adventures of Tintin (1991) - S01E06 - Cigars of the Pharaoh (Part 1) (1080p BluRay x265 Garshasp)";
         let (trimmed, was_trimmed) = training_filename_for(tintin);
         assert!(was_trimmed);
@@ -2442,7 +3338,47 @@ mod tests {
             "Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p";
         let (trimmed, was_trimmed) = training_filename_for(&format!("Batch/{woody_parent}"));
         assert!(was_trimmed);
-        assert_eq!(trimmed, woody_parent);
         let volume =
             labels_for("[Snow-Raws] 生徒会役員共 Vol.01 MENU02 (BD 1920x1080 HEVC-YUV420P10 FLAC)");

 use anyhow::{bail, Context, Result};
 use chrono::Utc;
 use clap::Parser;
+use once_cell::sync::{Lazy, OnceCell};
 use rayon::prelude::*;
 use regex::Regex;
 use serde::{Deserialize, Serialize};
     audit_low_frequency: bool,
     #[arg(long)]
     verify_generated_output: bool,
+    #[arg(long)]
+    rich_annotations: bool,
     #[arg(long, default_value = "datasets/AnimeName/dmhy_list.jsonl")]
     input: PathBuf,
     #[arg(long, default_value = "reports/dmhy_template_recipes.seed.jsonl")]
     review_output: PathBuf,
     #[arg(long, default_value = "reports/dmhy_low_frequency_audit.rust.jsonl")]
     audit_output: PathBuf,
+    #[arg(long, default_value = "reports/dmhy_rich_annotations.rust.jsonl")]
+    rich_output: PathBuf,
     #[arg(long, default_value_t = 50)]
     audit_max_count: u64,
     #[arg(long)]
     keep_encoding_noise: bool,
     #[arg(long)]
     preserve_parent_paths: bool,
+    #[arg(long, default_value = "datasets/AnimeName/dmhy_title_whitelist.txt")]
+    title_whitelist: PathBuf,
+    #[arg(long, default_value = "datasets/AnimeName/dmhy_group_whitelist.txt")]
+    group_whitelist: PathBuf,
     #[arg(long)]
     threads: Option<usize>,
 }
+#[derive(Debug, Default)]
+struct Whitelists {
+    title_phrases: Vec<Vec<String>>,
+    group_names: HashSet<String>,
+}
+static RUNTIME_WHITELISTS: OnceCell<Whitelists> = OnceCell::new();
 #[derive(Debug, Clone, Deserialize)]
 struct Recipe {
     template_id: String,
 static HASH_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[A-Fa-f0-9]{8,}$").unwrap());
 static RESOLUTION_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"(?i)^(?:\d{3,4}p|\dK|\d{3,4}[xX×]\d{3,4})$").unwrap());
+static BARE_RESOLUTION_RE: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"^(?:360|480|540|576|720|1080|2160)$").unwrap());
 static EPISODE_VERSION_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"(?i)^(?:EP?)?\d{1,4}(?:v|ver|version|rev)\d{1,3}$").unwrap());
+static EPISODE_WITH_SUFFIX_RE: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r"(?i)^\d{1,4}[_ .-]?(?:Notice|Full|R18|R|Uncut|Director'?s?Cut)$").unwrap()
+});
 static EPISODE_RE: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}(?:\.\d{1,2})?(?:END)?$").unwrap());
+static DECIMAL_EPISODE_RE: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"^\d{1,3}\.\d{1,2}$").unwrap());
 static EPISODE_CJK_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^第?\d{1,4}[话話回集]$").unwrap());
+static EPISODE_CJK_PREFIX_RE: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"^第?\d{1,4}[话話回集]").unwrap());
 static EPISODE_RANGE_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"(?i)^\d{1,4}\s*[-~]\s*\d{1,4}(?:\s*END)?$").unwrap());
 static EPISODE_BATCH_RE: Lazy<Regex> = Lazy::new(|| {
     Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
 static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
 static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r"(?i)^(?:(?:NCOP|NCED|OP|ED|PV|CM)(?:[\s_.-]?(?:\d{1,4}|v\d{1,3}|[A-Z]))?|SP(?:[\s_.-]?\d{0,4})?|(?:OVA|OAD|IV)(?:[\s_.-]?\d{0,4})?|(?:Menu|Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?)$").unwrap()
 });
 static VOLUME_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
     Regex::new(r"(?i)^(?:CHS|CHT|ZHS|ZHT|GB|BIG5|JPN?|JP|JA|JAP|ENG|EN|SC|TC|简[体體]?|繁[体體]?|简日|繁日|字幕|内封|外挂|Sub|Subs|MSubs?)$").unwrap()
 });
 static MEDIA_RE: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r"(?i)^(?:WEB|WEB[-_. ]?DL|WEB[-_. ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|HD|UHD|HDTV|TVRip|REMUX|xvid|x26[45]|h\.?26[45]|HEVC|AVC|AV1|YUV\d+P?\d*|AAC\s*\d*(?:\.\d+)?|DDP\s*\d*(?:\.\d+)?|FLAC|MP3|DTS|HDMA|DTS-HDMA|E?AC3x?\d*(?:\.\d+)?|Opus|WMV\d*|\d(?:\.\d)?ch|10[-_. ]?bit|8[-_. ]?bit|Hi10p|Ma10p|ASSx?\d*|SRTx?\d*|SUP|R\d[A-Z]*|NoSub|MKV|MP4|AVI|RAW|Raws?)$").unwrap()
 });
 static SPECIAL_TITLE_PHRASE_RE: Lazy<Regex> = Lazy::new(|| {
     Regex::new(r"(?i)\b(?:theater\s+greeting\s+event|world\s+prem(?:eie|iere)|picture\s+drama)\b")
 });
 static YEAR_RANGE_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"^\(?\s*(?:19|20)\d{2}\s*[-~]\s*(?:19|20)\d{2}\s*\)?$").unwrap());
+static VERSIONISH_TITLE_RE: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"(?i)^(?:19|20)\d{2}(?:版|ver\.?|version)?$").unwrap());
 static PATH_SEGMENT_SEASON_RE: Lazy<Regex> = Lazy::new(|| {
     Regex::new(r"(?i)(?:^|[\s_.\-/])(?:season\s*\d{1,2}|s\d{1,2})(?:$|[\s_.\-/])").unwrap()
 });
 static TOKEN_REGEXES: Lazy<Vec<Regex>> = Lazy::new(|| {
     [
         r"^\d{3,4}[xX×]\d{3,4}",
+        r"(?i)^(?:AAC|AC3|EAC3|DTS|FLAC|DDP)\s*\d+(?:\.\d+)?",
         r"(?i)^h\.?26[45]",
         r"(?i)^x\.?26[45]",
         r"^[\\/]+",
             .build_global()
             .context("failed to configure rayon thread pool")?;
     }
+    let _ = RUNTIME_WHITELISTS.set(load_whitelists(&args)?);
     if args.cluster {
         return run_cluster(&args);
     }
     if args.verify_generated_output {
         return run_verify_generated_output(&args);
     }
+    if args.rich_annotations {
+        return run_rich_annotations(&args);
+    }
     if args.expand != "all" && args.expand != "sample" {
         bail!("--expand must be all or sample");
     }
         "min_count": args.min_count,
         "low_frequency_audit_max_count": args.audit_max_count,
         "low_frequency_blocking_warnings": [
+            "ambiguous_no_episode_title",
             "hash_labeled",
             "multiple_title_spans",
             "no_title",
     Ok(())
 }
+fn load_whitelists(args: &Args) -> Result<Whitelists> {
+    Ok(Whitelists {
+        title_phrases: load_title_whitelist(&args.title_whitelist)?,
+        group_names: load_name_whitelist(&args.group_whitelist)?,
+    })
+}
+fn load_title_whitelist(path: &PathBuf) -> Result<Vec<Vec<String>>> {
+    let mut phrases = Vec::new();
+    for line in load_whitelist_lines(path)? {
+        let phrase = phrase_parts_for_whitelist(&line);
+        if !phrase.is_empty() {
+            phrases.push(phrase);
+        }
+    }
+    Ok(phrases)
+}
+fn load_name_whitelist(path: &PathBuf) -> Result<HashSet<String>> {
+    Ok(load_whitelist_lines(path)?
+        .into_iter()
+        .map(|line| normalize_whitelist_name(&line))
+        .filter(|line| !line.is_empty())
+        .collect())
+}
+fn load_whitelist_lines(path: &PathBuf) -> Result<Vec<String>> {
+    if !path.exists() {
+        return Ok(Vec::new());
+    }
+    let file = File::open(path)
+        .with_context(|| format!("failed to open whitelist {}", path.display()))?;
+    let mut lines = Vec::new();
+    for line in BufReader::new(file).lines() {
+        let line = line?;
+        let line = line.trim();
+        if line.is_empty() || line.starts_with('#') {
+            continue;
+        }
+        let value = line
+            .split_once('\t')
+            .map(|(_, value)| value)
+            .unwrap_or(line)
+            .trim();
+        if !value.is_empty() {
+            lines.push(value.to_string());
+        }
+    }
+    Ok(lines)
+}
 fn load_recipes(args: &Args) -> Result<HashMap<String, Recipe>> {
     let file = File::open(&args.recipes)
         .with_context(|| format!("recipe JSONL not found: {}", args.recipes.display()))?;
         for warning in audit_warnings(&record) {
             if !matches!(
                 warning.as_str(),
+                "ambiguous_no_episode_title"
+                    | "hash_labeled"
+                    | "multiple_title_spans"
+                    | "no_title"
+                    | "path_retained"
             ) {
                 continue;
             }
     Ok(())
 }
+fn run_rich_annotations(args: &Args) -> Result<()> {
+    let inputs = load_input(&args.input, args.limit)?;
+    if let Some(parent) = args.rich_output.parent() {
+        fs::create_dir_all(parent)?;
+    }
+    let rows: Vec<Value> = inputs
+        .par_iter()
+        .filter_map(|original| {
+            if !args.keep_encoding_noise
+                && (has_encoding_noise(original)
+                    || has_non_anime_noise(original)
+                    || has_abstract_path_noise(original))
+            {
+                return None;
+            }
+            Some(rich_annotation_for(original))
+        })
+        .collect();
+    let mut writer = BufWriter::new(File::create(&args.rich_output)?);
+    for row in &rows {
+        serde_json::to_writer(&mut writer, row)?;
+        writer.write_all(b"\n")?;
+    }
+    writer.flush()?;
+    let manifest = json!({
+        "generated_at": Utc::now().to_rfc3339(),
+        "input": args.input.to_string_lossy(),
+        "rich_output": args.rich_output.to_string_lossy(),
+        "rows": rows.len(),
+        "implementation": "rust_dmhy_rich_annotations",
+        "notes": [
+            "rich roles are metadata for review/projection, not final training BIO labels",
+            "TITLE_* candidates may be collapsed or filtered before dmhy_weak generation"
+        ]
+    });
+    println!("{}", serde_json::to_string_pretty(&manifest)?);
+    Ok(())
+}
+fn rich_annotation_for(original: &str) -> Value {
+    let (training_filename, path_trimmed) = training_filename_for(original);
+    let parts: Vec<&str> = original
+        .split(|ch| ch == '/' || ch == '\\')
+        .map(str::trim)
+        .filter(|part| !part.is_empty())
+        .collect();
+    let leaf_index = parts.len().saturating_sub(1);
+    let segments = parts
+        .iter()
+        .enumerate()
+        .map(|(index, segment)| rich_segment(segment, index, index == leaf_index))
+        .collect::<Vec<_>>();
+    let projection = dmhy_record(
+        &training_filename,
+        "rich_projection",
+        &suggested_roles(&template_key_for_filename(&training_filename).0),
+    )
+    .map(|record| {
+        json!({
+            "filename": record.filename,
+            "spans": entity_spans(&record.tokens, &record.labels),
+            "warnings": audit_warnings(&record),
+        })
+    });
+    json!({
+        "source_filename": original,
+        "training_filename": training_filename,
+        "path_trimmed": path_trimmed,
+        "segments": segments,
+        "projection_preview": projection,
+    })
+}
+fn rich_segment(segment: &str, index: usize, is_leaf: bool) -> Value {
+    let (key, tokens, _classes, groups) = template_key_for_filename(segment);
+    let suggested = suggested_roles(&key);
+    let roles = adjust_contextual_roles(&tokens, &groups, &suggested);
+    let candidates = rich_candidates_for_segment(segment, &tokens, &groups, &roles, is_leaf);
+    json!({
+        "index": index,
+        "text": segment,
+        "kind": rich_segment_kind(segment, is_leaf),
+        "template": key,
+        "candidates": candidates,
+    })
+}
+fn rich_segment_kind(segment: &str, is_leaf: bool) -> &'static str {
+    if path_segment_is_media_noise(segment) {
+        "media_noise"
+    } else if path_segment_is_plain_season(segment) {
+        "season_dir"
+    } else if is_leaf {
+        "leaf"
+    } else {
+        "parent"
+    }
+}
+fn rich_candidates_for_segment(
+    segment: &str,
+    tokens: &[String],
+    groups: &[Group],
+    roles: &[String],
+    is_leaf: bool,
+) -> Vec<Value> {
+    let mut output = Vec::new();
+    let title_ranges = title_candidates(groups, roles);
+    for (candidate_index, (start, end)) in title_ranges.iter().copied().enumerate() {
+        let text = candidate_text(tokens, groups, start, end);
+        if text.trim().is_empty() {
+            continue;
+        }
+        output.push(json!({
+            "role": fine_title_role(segment, &text, is_leaf, candidate_index, title_ranges.len()),
+            "coarse_role": "TITLE",
+            "text": text,
+            "group_start": start,
+            "group_end": end,
+        }));
+    }
+    for (group_index, role) in roles.iter().enumerate() {
+        if role == "TITLE" || role == "O" || role == "HASH" {
+            continue;
+        }
+        let text = group_text(tokens, &groups[group_index]);
+        if text.trim().is_empty() {
+            continue;
+        }
+        let coarse_role = role_label(role)
+            .strip_prefix("B-")
+            .map(str::to_string)
+            .unwrap_or_else(|| "O".to_string());
+        output.push(json!({
+            "role": fine_non_title_role(role),
+            "coarse_role": coarse_role,
+            "text": text,
+            "group_start": group_index,
+            "group_end": group_index + 1,
+        }));
+    }
+    output
+}
+fn candidate_text(tokens: &[String], groups: &[Group], start: usize, end: usize) -> String {
+    let Some(first) = groups.get(start).and_then(|group| group.indices.first()) else {
+        return String::new();
+    };
+    let Some(last) = groups
+        .get(end.saturating_sub(1))
+        .and_then(|group| group.indices.last())
+    else {
+        return String::new();
+    };
+    strip_wrapper(&tokens[*first..=*last].join(""))
+}
+fn fine_title_role(
+    segment: &str,
+    text: &str,
+    is_leaf: bool,
+    candidate_index: usize,
+    candidate_count: usize,
+) -> &'static str {
+    let cleaned = text.trim();
+    if VERSIONISH_TITLE_RE.is_match(cleaned) {
+        return "RELEASE_VERSION";
+    }
+    if matches!(
+        cleaned.to_ascii_lowercase().as_str(),
+        "国漫" | "國漫" | "anime" | "movie" | "movies"
+    ) {
+        return "TITLE_CATEGORY";
+    }
+    if is_leaf && path_segment_starts_with_episode(segment) {
+        return "EPISODE_TITLE";
+    }
+    if !is_leaf {
+        return "PATH_TITLE";
+    }
+    if candidate_count > 1 && candidate_index > 0 {
+        return "TITLE_ALIAS";
+    }
+    "TITLE_MAIN"
+}
/// Maps a coarse, non-title role name to the role reported in rich
/// annotations.
///
/// Role names are matched exactly (case-sensitive); any name not in the table
/// maps to `OTHER`.
fn fine_non_title_role(role: &str) -> &'static str {
    // (coarse role, fine role) pairs.
    const FINE_ROLES: [(&str, &str); 9] = [
        ("GROUP", "RELEASE_GROUP"),
        ("EPISODE", "EPISODE"),
        ("EPISODE_VERSION", "EPISODE"),
        ("EPISODE_RANGE", "EPISODE"),
        ("SEASON", "SEASON"),
        ("SPECIAL", "SPECIAL"),
        ("VOLUME", "SPECIAL"),
        ("RESOLUTION", "RESOLUTION"),
        ("SOURCE", "SOURCE"),
    ];
    FINE_ROLES
        .iter()
        .find(|(coarse, _)| *coarse == role)
        .map_or("OTHER", |(_, fine)| *fine)
}
 fn entity_spans(tokens: &[String], labels: &[String]) -> Vec<Value> {
     let mut spans = Vec::new();
     let mut current_label: Option<String> = None;
     } else if title_spans > 1 {
         warnings.push("multiple_title_spans".to_string());
     }
+    let has_episode = record.labels.iter().any(|label| label.ends_with("EPISODE"));
+    if !has_episode {
         warnings.push("no_episode".to_string());
+        if record
+            .dropped_title_candidate_positions
+            .as_ref()
+            .is_some_and(|dropped| !dropped.is_empty())
+        {
+            warnings.push("ambiguous_no_episode_title".to_string());
+        }
     }
     if record.filename.contains('/') || record.filename.contains('\\') {
         warnings.push("path_retained".to_string());
     audit_warnings(record).iter().any(|warning| {
         matches!(
             warning.as_str(),
+            "ambiguous_no_episode_title"
+                | "hash_labeled"
+                | "multiple_title_spans"
+                | "no_title"
+                | "path_retained"
         )
     })
 }
     if EPISODE_VERSION_RE.is_match(&compact) {
         return "EPISODE_VERSION".to_string();
     }
+    if EPISODE_WITH_SUFFIX_RE.is_match(&cleaned) {
+        return "EPISODE_VERSION".to_string();
+    }
     if SXE_RE.is_match(&compact) {
         return "SXE".to_string();
     }
         .map(str::trim)
         .filter(|part| !part.is_empty())
         .collect();
+    if parts.len() >= 2
+        && (path_segment_is_episodeish(parts[parts.len() - 1])
+            || (!path_segment_is_plain_season(parts[parts.len() - 2])
+                && path_segment_starts_with_episode(parts[parts.len() - 1])
+                && !leaf_has_full_title_after_episode(parts[parts.len() - 1])))
+    {
+        if let Some(parent) = parts[..parts.len() - 1]
+            .iter()
+            .rev()
+            .find(|part| {
+                let trimmed = trim_parent_title_segment(part);
+                filename_has_title(&trimmed) && !path_segment_is_media_noise(&trimmed)
+            })
+        {
+            let parent = trim_parent_title_segment(parent.trim());
+            return (
+                format!(
+                    "{} {}",
+                    parent,
+                    parts[parts.len() - 1].trim()
+                ),
+                true,
+            );
+        }
+    }
     if parts.len() >= 2 && filename_has_title(parts[parts.len() - 1]) {
+        if path_segment_has_season(parts[parts.len() - 2]) {
             if !path_segment_is_plain_season(parts[parts.len() - 2]) {
                 return (parts[parts.len() - 1].to_string(), true);
             }
             {
                 (parts[parts.len() - 1].to_string(), true)
             } else {
+                (
+                    format!(
+                        "{} {}",
+                        parts[parts.len() - 2].trim(),
+                        parts[parts.len() - 1].trim()
+                    ),
+                    true,
+                )
             }
         } else {
             (parts[parts.len() - 1].to_string(), true)
     PLAIN_SEASON_SEGMENT_RE.is_match(&cleaned)
 }
+fn trim_terminal_series_kind(segment: &str) -> String {
+    let mut output = segment.trim().to_string();
+    for suffix in ["_TV", ".TV", " TV", "_tv", ".tv", " tv"] {
+        if output.ends_with(suffix) {
+            output.truncate(output.len() - suffix.len());
+            return output.trim_end_matches(['_', '.', ' ']).to_string();
+        }
+    }
+    output
+}
+fn trim_parent_title_segment(segment: &str) -> String {
+    let mut output = trim_terminal_series_kind(segment);
+    loop {
+        let trimmed = output.trim_end();
+        let Some(last) = trimmed.chars().next_back() else {
+            return output;
+        };
+        let open = match last {
+            ')' => '(',
+            ']' => '[',
+            '】' => '【',
+            _ => return output,
+        };
+        let Some(start) = trimmed.rfind(open) else {
+            return output;
+        };
+        let suffix = &trimmed[start..];
+        if path_segment_is_media_noise(suffix) {
+            output.truncate(start);
+            output = output.trim_end_matches([' ', '_', '.', '-']).to_string();
+            continue;
+        }
+        return output;
+    }
+}
 /// Returns whether `PATH_SEGMENT_SEASON_RE` matches the given path segment.
 fn path_segment_has_season(value: &str) -> bool {
     PATH_SEGMENT_SEASON_RE.is_match(value)
 }
         return true;
     }
     let markers = [
+        "譁", "蜈", "螟", "蟄", "謇", "邱", "荳", "縺", "繧", "莨", "鬆", "髯", "瀛",
+        "楀", "箷", "绲", "刔", "鏃", "湪", "鏍", "犲", "儚", "鐗", "吀", "铦", "躲",
+        "伄", "椋", "伓", "姘",
     ];
     let marker_hits = markers
         .iter()
     !structural.is_empty()
         && structural
             .iter()
+            .all(|item| {
+                item.starts_with("EPISODE")
+                    || item.as_str() == "SPECIAL"
+                    || item.as_str() == "VOLUME"
+                    || item.as_str() == "BRACKET_VOLUME"
+            })
+}
+fn path_segment_starts_with_episode(value: &str) -> bool {
+    if EPISODE_CJK_PREFIX_RE.is_match(value.trim()) {
+        return true;
+    }
+    let (key, _, _, groups) = template_key_for_filename(value);
+    let roles = suggested_roles(&key);
+    groups
+        .iter()
+        .zip(roles.iter())
+        .find(|(group, _)| group.class_name != "SEP")
+        .is_some_and(|(_, role)| role.starts_with("EPISODE"))
+}
+fn leaf_has_full_title_after_episode(value: &str) -> bool {
+    let (key, _, _, groups) = template_key_for_filename(value);
+    let roles = suggested_roles(&key);
+    let first_structural = roles.iter().position(|role| role.starts_with("EPISODE"));
+    let Some(first_episode) = first_structural else {
+        return false;
+    };
+    groups
+        .iter()
+        .zip(roles.iter())
+        .skip(first_episode + 1)
+        .filter(|(group, _)| group.class_name != "SEP")
+        .any(|(_, role)| role == "TITLE")
+}
+fn path_segment_is_media_noise(value: &str) -> bool {
+    let normalized = value.to_ascii_lowercase();
+    if normalized.contains("sourceunknown") || normalized.contains("sourceunknow") {
+        return true;
+    }
+    if (normalized.contains("dvdrip")
+        || normalized.contains("bdrip")
+        || normalized.contains("webrip")
+        || normalized.contains("web-dl")
+        || normalized.contains("bluray"))
+        && tokenize(value)
+            .iter()
+            .map(|token| classify_atom(token))
+            .any(|class_name| class_name == "RESOLUTION")
+    {
+        return true;
+    }
+    let (_, _, _, groups) = template_key_for_filename(value);
+    let structural: Vec<&String> = groups
+        .iter()
+        .map(|group| &group.class_name)
+        .filter(|item| item.as_str() != "SEP")
+        .collect();
+    !structural.is_empty()
+        && structural.iter().all(|item| {
+            matches!(
+                item.as_str(),
+                "MEDIA"
+                    | "RESOLUTION"
+                    | "LANG"
+                    | "HASH"
+                    | "DATE"
+                    | "BRACKET_MEDIA"
+                    | "BRACKET_RESOLUTION"
+                    | "BRACKET_LANG"
+                    | "BRACKET_HASH"
+                    | "BRACKET_DATE"
+                    | "MEDIA_BLOCK"
+                    | "BRACKET_MEDIA_BLOCK"
+            )
+        })
 }
 fn has_abstract_path_noise(value: &str) -> bool {
 }
 fn split_episode_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
+    if DECIMAL_EPISODE_RE.is_match(token) {
+        let pieces = split_generated_token(token);
+        let labels = pieces.iter().map(|_| "B-EPISODE".to_string()).collect();
+        return Some((pieces, labels));
+    }
     let caps = EPISODE_VALUE_RE.captures(token)?;
     let mut pieces = vec![caps[1].to_string(), caps[2].to_string()];
     let mut labels = vec!["O".to_string(), "B-EPISODE".to_string()];
     )
 }
+fn normalize_whitelist_name(value: &str) -> String {
+    value.split_whitespace().collect::<Vec<_>>().join(" ")
+}
+fn phrase_parts_for_whitelist(value: &str) -> Vec<String> {
+    let tokens = tokenize(value);
+    let classes: Vec<String> = tokens.iter().map(|token| classify_token(token)).collect();
+    let groups = compact_token_groups(&tokens, &classes);
+    groups
+        .iter()
+        .filter(|group| whitelist_phrase_group(group))
+        .map(|group| group_text(&tokens, group))
+        .filter(|part| !part.trim().is_empty())
+        .collect()
+}
+fn whitelist_phrase_group(group: &Group) -> bool {
+    matches!(
+        group.class_name.as_str(),
+        "TEXT" | "EPISODE" | "SPECIAL" | "SEASON" | "BRACKET_TEXT"
+    )
+}
 fn is_special_title_phrase(text: &str) -> bool {
     let normalized = SPECIAL_SPACE_RE
         .replace_all(text, " ")
         normalized.as_str(),
         "CM" | "EVENT"
             | "EIZOU"
+            | "EXTRA"
+            | "EXTRAS"
             | "LOGO"
             | "MENU"
             | "OMAKE"
             | "TOKUTEN"
             | "TRAILER"
             | "WORLD PREMIERE"
+            | "映像特典"
+            | "特典"
+    ) || normalized.contains("映像特典")
+        || SPECIAL_TITLE_PHRASE_RE.is_match(text)
+}
+const KNOWN_TITLE_PHRASES: &[&[&str]] = &[
+    &["SPY", "x", "FAMILY"],
+    &["Spy", "x", "Family"],
+    &["Slime", "300"],
+    &["Zom", "100"],
+    &["Kamisama", "Hajimemashita", "2"],
+    &["Phantasy", "Star", "Online", "2", "Episode", "Oracle"],
+];
+fn apply_known_title_phrases(tokens: &[String], groups: &[Group], roles: &mut [String]) {
+    if let Some(whitelists) = RUNTIME_WHITELISTS.get() {
+        for (index, group) in groups.iter().enumerate() {
+            if group.class_name == "BRACKET_TEXT"
+                && roles.get(index).is_some_and(|role| role == "GROUP")
+                && whitelists
+                    .group_names
+                    .contains(&normalize_whitelist_name(&group_text(tokens, group)))
+            {
+                roles[index] = "GROUP".to_string();
+            }
+        }
+    }
+    let searchable: Vec<(usize, String)> = groups
+        .iter()
+        .enumerate()
+        .filter(|(_, group)| whitelist_phrase_group(group))
+        .map(|(index, group)| (index, group_text(tokens, group)))
+        .collect();
+    for phrase in KNOWN_TITLE_PHRASES {
+        apply_title_phrase(&searchable, phrase, roles, true);
+    }
+    if let Some(whitelists) = RUNTIME_WHITELISTS.get() {
+        for phrase in &whitelists.title_phrases {
+            if phrase.len() >= 2 {
+                apply_title_phrase(&searchable, phrase, roles, false);
+            }
+        }
+    }
+}
+fn apply_title_phrase(
+    searchable: &[(usize, String)],
+    phrase: &[impl AsRef<str>],
+    roles: &mut [String],
+    allow_structural_override: bool,
+) {
+    if phrase.is_empty() || phrase.len() > searchable.len() {
+        return;
+    }
+    for window in searchable.windows(phrase.len()) {
+        if window
+            .iter()
+            .zip(phrase.iter())
+            .all(|((_, text), expected)| text.eq_ignore_ascii_case(expected.as_ref()))
+        {
+            for (group_index, _) in window {
+                if roles.get(*group_index).is_some_and(|role| role == "GROUP") {
+                    continue;
+                }
+                if !allow_structural_override
+                    && roles.get(*group_index).is_some_and(|role| {
+                        matches!(
+                            role.as_str(),
+                            "EPISODE"
+                                | "EPISODE_VERSION"
+                                | "EPISODE_RANGE"
+                                | "SEASON"
+                                | "SOURCE"
+                                | "RESOLUTION"
+                        )
+                    })
+                {
+                    continue;
+                }
+                {
+                    roles[*group_index] = "TITLE".to_string();
+                }
+            }
+        }
+    }
 }
 fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]) -> Vec<String> {
     let mut output = roles.to_vec();
     let ep_markers = ["EP", "E", "Episode", "ep", "episode"];
     let roman = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX"];
+    apply_known_title_phrases(tokens, groups, &mut output);
+    if roles
+        .first()
+        .is_some_and(|role| role.starts_with("EPISODE"))
+        && YEAR_RANGE_RE.is_match(&group_text(tokens, &groups[0]))
+    {
+        let first_real_structural = (1..roles.len())
+            .find(|&index| {
+                roles[index].starts_with("EPISODE")
+                    || matches!(roles[index].as_str(), "SEASON" | "SPECIAL")
+            })
+            .unwrap_or(roles.len());
+        for index in 1..first_real_structural {
+            if groups[index].class_name == "TEXT"
+                && !matches!(
+                    group_text(tokens, &groups[index])
+                        .to_ascii_uppercase()
+                        .as_str(),
+                    "TV" | "OVA" | "OAD" | "SP"
+                )
+            {
+                output[index] = "TITLE".to_string();
+            }
+        }
+    }
     if !output.iter().any(|role| role == "TITLE")
         && roles
             .first()
             }
             if output[index - 2] == "TITLE"
                 && groups[index - 1].class_name == "SEP"
+                && previous_text.len() <= 48
+                && previous_text.chars().any(|ch| ch.is_alphabetic())
                 && text.chars().all(|ch| ch.is_ascii_digit())
                 && text.len() <= 3
+                && !(index + 2 < roles.len()
+                    && groups[index + 1].class_name == "SEP"
+                    && group_text(tokens, &groups[index + 2]).eq_ignore_ascii_case("episode"))
+                && (next_episode
+                    || (next_special
+                        && (text.parse::<u16>().is_ok_and(|value| value >= 100)
+                            || (previous_text.len() <= 4
+                                && previous_text.is_ascii()
+                                && previous_text
+                                    .chars()
+                                    .all(|ch| ch.is_ascii_alphabetic())))))
             {
                 output[index] = "TITLE".to_string();
                 continue;
             }
         }
+        if roles[index].starts_with("EPISODE")
+            && BARE_RESOLUTION_RE.is_match(&text)
+            && index >= 2
+            && groups[index - 1].class_name == "SEP"
+        {
+            let previous_text = group_text(tokens, &groups[index - 2]);
+            if previous_text
+                .chars()
+                .any(|ch| ch.is_ascii_digit() || matches!(ch, '.' | '-' | '_' | '．'))
+            {
+                output[index] = "RESOLUTION".to_string();
+                continue;
+            }
+        }
         if roles[index].starts_with("EPISODE")
             && index >= 2
             && output[..index].iter().any(|role| role == "TITLE")
             output[index] = "SPECIAL".to_string();
             continue;
         }
+        if roles[index] == "TITLE"
+            && matches!(text.to_ascii_uppercase().as_str(), "TV" | "TV版")
+            && output.iter().enumerate().any(|(other, role)| {
+                other != index && role == "TITLE"
+            })
+        {
+            output[index] = "O".to_string();
+            continue;
+        }
         if roles[index] == "TITLE" && matches!(text.as_str(), "第" | "話" | "话" | "回" | "集")
         {
             output[index] = "O".to_string();
             && groups[index + 1].class_name == "SEP"
             && roles[index + 2].starts_with("EPISODE")
         {
+            if !output[..index].iter().any(|role| role == "TITLE") {
+                output[index] = "O".to_string();
+                output[index + 2] = "SEASON".to_string();
+            }
             continue;
         }
         if roles[index] == "TITLE"
                 output[index + 2] = "O".to_string();
             }
         }
+        if roles[index].starts_with("EPISODE")
+            && !output[index + 1..].iter().any(|role| role == "TITLE")
+        {
+            let mut run = Vec::new();
+            let mut cursor = index + 1;
+            while cursor < roles.len() {
+                if groups[cursor].class_name == "SEP" {
+                    cursor += 1;
+                    continue;
+                }
+                if groups[cursor].class_name == "TEXT"
+                    && !matches!(
+                        roles[cursor].as_str(),
+                        "SOURCE" | "RESOLUTION" | "SEASON" | "SPECIAL"
+                    )
+                {
+                    run.push(cursor);
+                    cursor += 1;
+                    continue;
+                }
+                if !run.is_empty() {
+                    break;
+                }
+                cursor += 1;
+            }
+            if run.len() >= 2 {
+                for item in run {
+                    output[item] = "TITLE".to_string();
+                }
+            }
+        }
         if roles[index].starts_with("EPISODE") {
             let previous_text = if index >= 1 {
                 group_text(tokens, &groups[index - 1])
 }
 fn enforce_single_title_candidate(
+    tokens: &[String],
     groups: &[Group],
     roles: &[String],
 ) -> (Vec<String>, Vec<String>) {
         .copied()
         .filter(|(_, end)| *end <= first_anchor)
         .collect();
+    let selected_pool = if before_anchor.is_empty() {
         &candidates
     } else {
         &before_anchor
+    };
+    let selected = selected_pool
     .iter()
+    .max_by_key(|(start, end)| {
+        (
+            title_candidate_score(tokens, groups, *start, *end),
+            *end,
+            end - start,
+        )
+    })
     .copied()
     .unwrap();
     let mut output = roles.to_vec();
     (output, dropped)
 }
+fn title_candidate_score(tokens: &[String], groups: &[Group], start: usize, end: usize) -> isize {
+    let text = (start..end)
+        .filter(|&index| roles_candidate_text_group(&groups[index]))
+        .map(|index| group_text(tokens, &groups[index]))
+        .collect::<Vec<_>>()
+        .join("");
+    let cleaned = text.trim();
+    if cleaned.is_empty() {
+        return -1000;
+    }
+    let mut score = cleaned.chars().filter(|ch| ch.is_alphanumeric()).count() as isize;
+    if VERSIONISH_TITLE_RE.is_match(cleaned) {
+        score -= 500;
+    }
+    if matches!(
+        cleaned.to_ascii_lowercase().as_str(),
+        "国漫" | "國漫" | "anime" | "movie" | "movies"
+    ) {
+        score -= 500;
+    }
+    score
+}
+fn roles_candidate_text_group(group: &Group) -> bool {
+    matches!(group.class_name.as_str(), "TEXT" | "BRACKET_TEXT")
+}
 fn normalize_generated_tokens(tokens: &[String], labels: &[String]) -> (Vec<String>, Vec<String>) {
     let mut output_tokens = Vec::new();
     let mut output_labels = Vec::new();
 fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
     let joiners = [
         " ", ".", "-", "_", "·", "・", "×", "／", "/", "'", "’", ":", "：", "!", "！", "?",
+        "？", ";", "；", ",", "，", "、", "。", "～", "~", "－", "+", "＋", "(", ")",
+        "（", "）", "[", "]", "【", "】", "<", ">", "＜", "＞", "｢", "｣", "「", "」",
+        "☆", "♪", "`", "@",
     ];
     let title_terminal_punctuation = ["!", "！", "?", "？"];
     let entity_joiners = [
         " ", ".", "-", "_", "·", "・", "×", "／", "/", "'", "’", ":", "：", "!", "！", "?",
+        "？", ";", "；", ",", "，", "、", "。", "～", "~", "－", "+", "＋", "(", ")",
+        "（", "）", "[", "]", "【", "】", "<", ">", "＜", "＞", "｢", "｣", "「", "」",
+        "☆", "♪", "`", "@", "&", "＆",
     ];
     let mut output = labels.to_vec();
     for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
                 output[index] = "B-TITLE".to_string();
             }
         }
+        if matches!(token.as_str(), "]" | "】" | ")" | "）" | ">" | "＞" | "｣" | "」")
+            && index > 0
+            && output[index - 1] == "B-TITLE"
+            && title_span_has_labeled_opener(&tokens[..index], &output[..index], token)
+        {
+            output[index] = "B-TITLE".to_string();
+        }
     }
     output
 }
+fn title_span_has_labeled_opener(tokens: &[String], labels: &[String], closer: &str) -> bool {
+    for (token, label) in tokens.iter().zip(labels.iter()).rev() {
+        if label != "B-TITLE" {
+            return false;
+        }
+        if closer_matches_opener(closer, token) {
+            return true;
+        }
+    }
+    false
+}
+fn closer_matches_opener(closer: &str, opener: &str) -> bool {
+    matches!(
+        (closer, opener),
+        ("]", "[")
+            | ("】", "【")
+            | (")", "(")
+            | ("）", "（")
+            | (">", "<")
+            | ("＞", "＜")
+            | ("｣", "｢")
+            | ("」", "「")
+    )
+}
 fn dmhy_record(filename: &str, template_id: &str, roles: &[String]) -> Option<Record> {
     let (key, tokens, _classes, groups) = template_key_for_filename(filename);
     if groups.len() != roles.len() {
         return None;
     }
     let roles = adjust_contextual_roles(&tokens, &groups, roles);
+    let (roles, dropped) = enforce_single_title_candidate(&tokens, &groups, &roles);
     let (tokens, labels) = project_refined_tokens(&tokens, &groups, &roles);
     let labels = smooth_title_spans(&tokens, &labels);
     if tokens.len() != labels.len() {
         record.tokens.into_iter().zip(record.labels).collect()
     }
+    #[test]
+    fn rich_title_candidates_keep_readable_spacing() {
+        let row = rich_annotation_for(
+            "(1998) Initial D First Stage [1080p BDRip AVC AAC DTS-HD]/Initial D First Stage - 01 [1080p BDRip AVC AAC DTS-HD]",
+        );
+        assert_eq!(
+            row.pointer("/segments/1/candidates/0/text")
+                .and_then(Value::as_str),
+            Some("Initial D First Stage")
+        );
+    }
     #[test]
     fn required_regressions() {
         let title_91 = labels_for("Title 91 EP 01 [1080p]");
         let comma_title =
             labels_for("[Studio GreenTea] Kamiina Botan, Yoeru Sugata wa Yuri no Hana [01]");
         assert!(comma_title.contains(&(",".to_string(), "B-TITLE".to_string())));
+        let backtick_title =
+            labels_for("[Hayate no Gotoku! Can`t Take My Eyes Off You][01][BDrip X264 AAC 720P]");
+        assert!(backtick_title.contains(&("`".to_string(), "B-TITLE".to_string())));
+        assert!(backtick_title.contains(&("t".to_string(), "B-TITLE".to_string())));
+        let cjk_period_title =
+            labels_for("[云光字幕组]剃须。然后捡到高中生 Hige o Soru. Soshite Joshikousei o Hirou-[ 01 ][简体双语][1080p]");
+        assert!(cjk_period_title.contains(&("。".to_string(), "B-TITLE".to_string())));
+        let music_title =
+            labels_for("[アニメ BD] うたの☆プリンスさまっ♪ マジLOVE2000%  第01話「ポワゾンKISS」(1920x1080 x264 Hi10p AAC)");
+        assert!(music_title.contains(&("♪".to_string(), "B-TITLE".to_string())));
+        let cm_version = labels_for("[U2-Rip]Inari, Konkon, Koi Iroha[CMv2][Hi10p_1080p][x264_flac]");
+        assert!(cm_version.contains(&("CMv2".to_string(), "B-SPECIAL".to_string())));
+        assert!(!cm_version.contains(&("CMv2".to_string(), "B-TITLE".to_string())));
+        let hdma_block =
+            labels_for("[Niconeiko Works] Gekijouban Violet Evergarden [1080P_Ma10p_DTS-HDMA][CM01]");
+        assert!(hdma_block.contains(&("Gekijouban".to_string(), "B-TITLE".to_string())));
+        assert!(hdma_block.contains(&("1080P".to_string(), "B-RESOLUTION".to_string())));
+        assert!(hdma_block.contains(&("HDMA".to_string(), "B-SOURCE".to_string())));
+        assert!(!hdma_block.contains(&("1080P".to_string(), "B-TITLE".to_string())));
+        let extra_menu = labels_for("Extra Menu OVA");
+        assert!(extra_menu.contains(&("Extra".to_string(), "B-SPECIAL".to_string())));
+        assert!(!extra_menu.contains(&("Extra".to_string(), "B-TITLE".to_string())));
+        let eizou_tokuten = labels_for("おジャ魔女どれみ♯ 映像特典｢ともだちの唄｣(DVD 640x480 )");
+        assert!(eizou_tokuten.contains(&("映像特典".to_string(), "B-SPECIAL".to_string())));
         let happy_lesson = labels_for("【DVD】 HAPPY☆LESSON THE TV 第01話");
         assert!(happy_lesson.contains(&("☆".to_string(), "B-TITLE".to_string())));
         let idolmaster = labels_for("[CASO&SumiSora][THE_IDOLM@STER_CINDERELLA_GIRLS][07.5_SP]");
         let mayoi = labels_for("[Snow-Raws] 迷家[マヨイガ] 第01話");
         assert!(mayoi.contains(&("迷家".to_string(), "B-TITLE".to_string())));
         assert!(mayoi.contains(&("マヨイガ".to_string(), "B-TITLE".to_string())));
+        assert!(mayoi.contains(&("]".to_string(), "B-TITLE".to_string())));
         let conan_time = labels_for("【蓝色狂想】名侦探柯南TV版第036集-周一晚上7点半杀人事件");
         assert!(conan_time.contains(&("036".to_string(), "B-EPISODE".to_string())));
         assert!(sky.contains(&("One".to_string(), "B-TITLE".to_string())));
         assert!(!sky.contains(&("海贼王".to_string(), "B-TITLE".to_string())));
         assert!(sky.contains(&("918".to_string(), "B-EPISODE".to_string())));
+        let happy = labels_for(
+            "My.Happy.Marriage.S01E01.The.Meeting.1080p.NF.WEB-DL.AAC2.0.H.264-VARYG",
+        );
+        assert!(happy.contains(&("01".to_string(), "B-SEASON".to_string())));
+        assert!(happy.contains(&("01".to_string(), "B-EPISODE".to_string())));
+        assert!(!happy.contains(&("0".to_string(), "B-EPISODE".to_string())));
+        let garo = labels_for("[牙狼＜GARO＞～炎の刻印～][01][1080p]");
+        assert!(garo.contains(&("牙狼".to_string(), "B-TITLE".to_string())));
+        assert!(garo.contains(&("＜".to_string(), "B-TITLE".to_string())));
+        assert!(garo.contains(&("＞".to_string(), "B-TITLE".to_string())));
+        assert!(garo.contains(&("炎の刻印".to_string(), "B-TITLE".to_string())));
+        let akira = labels_for("[QYQ][AKIRA][AVC_AC3x2][1080p]");
+        assert!(akira.contains(&("AKIRA".to_string(), "B-TITLE".to_string())));
+        assert!(!akira.contains(&("AVC".to_string(), "B-TITLE".to_string())));
+        assert!(akira.contains(&("AVC".to_string(), "B-SOURCE".to_string())));
+        let doraemon =
+            labels_for("[DORASUB][DORAEMON1979][1998.03.07][WEB][1998x1080][AVC][简日]哆啦A梦归来了");
+        assert!(doraemon.contains(&("DORAEMON1979".to_string(), "B-TITLE".to_string())));
+        assert!(doraemon.contains(&("WEB".to_string(), "B-SOURCE".to_string())));
+        assert!(!doraemon.contains(&("WEB".to_string(), "B-TITLE".to_string())));
+        let devilman = labels_for("[DBD-Raws][恶魔人][1972版][01][1080P][BDRip][HEVC-10bit][FLAC]");
+        assert!(devilman.contains(&("恶魔人".to_string(), "B-TITLE".to_string())));
+        assert!(!devilman.contains(&("1972版".to_string(), "B-TITLE".to_string())));
+        let classroom = labels_for("[Dymy][Assassination Classroom (2016)][01][BIG5][1280X720]");
+        assert!(classroom.contains(&("(".to_string(), "B-TITLE".to_string())));
+        assert!(classroom.contains(&(")".to_string(), "B-TITLE".to_string())));
+        assert!(!classroom.contains(&("]".to_string(), "B-TITLE".to_string())));
+        let bang_season =
+            labels_for("[LoliHouse] Bang Dream! 2nd Season - 01 [BDRip 1080p HEVC-10bit FLAC]");
+        assert!(bang_season.contains(&("Bang".to_string(), "B-TITLE".to_string())));
+        assert!(bang_season.contains(&("Season".to_string(), "B-TITLE".to_string())));
+        assert!(bang_season.contains(&("01".to_string(), "B-EPISODE".to_string())));
+        assert!(!bang_season.contains(&("01".to_string(), "B-SEASON".to_string())));
+        let basket =
+            labels_for("[Nekomoe kissaten&VCB-Studio] Fruits Basket 1st Season [24][1080p][x264_aac][sc]");
+        assert!(basket.contains(&("Fruits".to_string(), "B-TITLE".to_string())));
+        assert!(basket.contains(&("Season".to_string(), "B-TITLE".to_string())));
+        assert!(basket.contains(&("24".to_string(), "B-EPISODE".to_string())));
+        assert!(!basket.contains(&("24".to_string(), "B-SEASON".to_string())));
+        let notice = labels_for("[KTXP][Zankyou_no_Terror][08_Notice][GB_BIG5][X264_AAC][720p]");
+        assert!(notice.contains(&("Zankyou".to_string(), "B-TITLE".to_string())));
+        assert!(notice.contains(&("08".to_string(), "B-EPISODE".to_string())));
+        assert!(!notice.contains(&("08".to_string(), "B-TITLE".to_string())));
+        let full = labels_for("[POPGO][Soukyuu_no_Fafner_Exodus][01_Full][GB][720p]");
+        assert!(full.contains(&("01".to_string(), "B-EPISODE".to_string())));
+        assert!(!full.contains(&("01".to_string(), "B-TITLE".to_string())));
+        let r18 = labels_for("[HYSUB]Skirt no Naka wa Kedamono Deshita.[01_R18][BIG5_MP4][1280X720]");
+        assert!(r18.contains(&("01".to_string(), "B-EPISODE".to_string())));
+        assert!(!r18.contains(&("01".to_string(), "B-TITLE".to_string())));
+        let ddp = labels_for("Akuma.Kun.S01E02.1080p.NF.WEB-DL.DDP5.1.H.264");
+        assert!(ddp.contains(&("02".to_string(), "B-EPISODE".to_string())));
+        assert!(!ddp.contains(&("1".to_string(), "B-EPISODE".to_string())));
+        assert!(ddp.iter().any(|(token, label)| token.starts_with("DDP") && label == "B-SOURCE"));
+        let aac_space = labels_for("Bleach S01E02 AAC 2.0 H.264");
+        assert!(aac_space.contains(&("02".to_string(), "B-EPISODE".to_string())));
+        assert!(!aac_space.contains(&("2".to_string(), "B-EPISODE".to_string())));
+        assert!(aac_space
+            .iter()
+            .any(|(token, label)| token.starts_with("AAC") && label == "B-SOURCE"));
+        let bare_resolution = labels_for("日本桥15.03.30 720");
+        assert!(bare_resolution.contains(&("日本桥".to_string(), "B-TITLE".to_string())));
+        assert!(bare_resolution.contains(&("720".to_string(), "B-RESOLUTION".to_string())));
+        assert!(!bare_resolution.contains(&("720".to_string(), "B-EPISODE".to_string())));
+        let air_episode = labels_for("Air 01");
+        assert!(air_episode.contains(&("Air".to_string(), "B-TITLE".to_string())));
+        assert!(air_episode.contains(&("01".to_string(), "B-EPISODE".to_string())));
+        let decimal_episode = labels_for("[HoneyGod] Usagi Drop [02.5][x264_10bit][粤日双语][BDrip_1080p]");
+        assert!(decimal_episode.contains(&("02".to_string(), "B-EPISODE".to_string())));
+        assert!(decimal_episode.contains(&(".".to_string(), "B-EPISODE".to_string())));
+        assert!(decimal_episode.contains(&("5".to_string(), "B-EPISODE".to_string())));
+        let spy = labels_for("[Studio GreenTea] Spy x Family [38][WebRip][HEVC-10bit 1080p AAC ASSx2]");
+        assert!(spy.contains(&("Studio".to_string(), "B-GROUP".to_string())));
+        assert!(spy.contains(&("Spy".to_string(), "B-TITLE".to_string())));
+        assert!(spy.contains(&("x".to_string(), "B-TITLE".to_string())));
+        assert!(spy.contains(&("Family".to_string(), "B-TITLE".to_string())));
+        assert!(spy.contains(&("38".to_string(), "B-EPISODE".to_string())));
+        assert!(!spy.contains(&("Spy".to_string(), "B-SPECIAL".to_string())));
+        let spy_s3 = labels_for("[Feibanyama] SPY x FAMILY S3 - 01 [IQIYI WebRip 2160p HEVC-10bit OPUS Multi-Subs]");
+        assert!(spy_s3.contains(&("Feibanyama".to_string(), "B-GROUP".to_string())));
+        assert!(spy_s3.contains(&("SPY".to_string(), "B-TITLE".to_string())));
+        assert!(spy_s3.contains(&("FAMILY".to_string(), "B-TITLE".to_string())));
+        assert!(spy_s3.contains(&("3".to_string(), "B-SEASON".to_string())));
+        assert!(spy_s3.contains(&("01".to_string(), "B-EPISODE".to_string())));
+        let slime = labels_for("[Nekomoe kissaten&VCB-Studio] Slime 300 [Menu01][Ma10p_1080p][x265_flac]");
+        assert!(slime.contains(&("Slime".to_string(), "B-TITLE".to_string())));
+        assert!(
+            slime.contains(&("300".to_string(), "B-TITLE".to_string())),
+            "{slime:?}"
+        );
+        assert!(!slime.contains(&("300".to_string(), "B-EPISODE".to_string())));
+        let kamisama =
+            labels_for("[SFEO-Raws] Kamisama Hajimemashita 2 - 01 (BD 720P x264 10bit AAC)");
+        assert!(kamisama.contains(&("Kamisama".to_string(), "B-TITLE".to_string())));
+        assert!(kamisama.contains(&("2".to_string(), "B-TITLE".to_string())));
+        assert!(kamisama.contains(&("01".to_string(), "B-EPISODE".to_string())));
     }
     #[test]
         assert!(was_trimmed);
         assert_eq!(
             trimmed,
+            "Season 4 E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p"
         );
         let pokemon = "Pokémon Season 2 - Orange League [Ep. 83-118]/Pokemon - 084 - OL002 - A Scare in the Air [DVD][PM-Dragon-x264-AC3][00270AC8]";
         let (trimmed_pokemon, pokemon_was_trimmed) = training_filename_for(pokemon);
         assert!(was_trimmed);
         assert_eq!(trimmed, "Avatar The Last Airbender S2 14 [1080p]");
+        let plain_season_dir = "Season 1/[Kamigami] Junjou Romantica 1 - 01 [BD 1280x720 x264 AAC Sub(Chs,Jap)]";
+        let (trimmed, was_trimmed) = training_filename_for(plain_season_dir);
+        assert!(was_trimmed);
+        assert_eq!(
+            trimmed,
+            "Season 1 [Kamigami] Junjou Romantica 1 - 01 [BD 1280x720 x264 AAC Sub(Chs,Jap)]"
+        );
+        let plain_season_labels = labels_for(&trimmed);
+        assert!(plain_season_labels.contains(&("1".to_string(), "B-SEASON".to_string())));
+        assert!(plain_season_labels.contains(&("01".to_string(), "B-EPISODE".to_string())));
+        let menu_parent =
+            "[Airota&ANK-Raws] 亜人ちゃんは語りたい (BDrip 1920x1080 HEVC-YUV420P10 FLAC SUP)/Menu (Vol.1)";
+        let (trimmed, was_trimmed) = training_filename_for(menu_parent);
+        assert!(was_trimmed);
+        assert_eq!(trimmed, "[Airota&ANK-Raws] 亜人ちゃんは語りたい Menu (Vol.1)");
+        assert!(has_encoding_noise(
+            "[4K_SDR][DBD-Raws&HKG瀛楀箷绲刔[鏃ュ湪鏍″湌][01][2160P]"
+        ));
         let tintin = "Adventures of Tintin (1991) Season 1-3 S01-S03 (1080p BluRay x265 HEVC 10bit EAC3 2.0 Garshasp)/Season 1/Adventures of Tintin (1991) - S01E06 - Cigars of the Pharaoh (Part 1) (1080p BluRay x265 Garshasp)";
         let (trimmed, was_trimmed) = training_filename_for(tintin);
         assert!(was_trimmed);
             "Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p";
         let (trimmed, was_trimmed) = training_filename_for(&format!("Batch/{woody_parent}"));
         assert!(was_trimmed);
+        assert_eq!(
+            trimmed,
+            "Season 4 E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p"
+        );
+        let najica = "[2001] Najica_七虹香電擊作戰(ナジカ電撃作戦)_TV/SourceUnknown.RMVB.640x480.twHard/01";
+        let (trimmed, was_trimmed) = training_filename_for(najica);
+        assert!(was_trimmed);
+        assert_eq!(trimmed, "[2001] Najica_七虹香電擊作戰(ナジカ電撃作戦) 01");
+        let najica_labels = labels_for(&trimmed);
+        assert!(najica_labels.contains(&("Najica".to_string(), "B-TITLE".to_string())));
+        assert!(!najica_labels.contains(&("SourceUnknown".to_string(), "B-TITLE".to_string())));
+        assert!(najica_labels.contains(&("01".to_string(), "B-EPISODE".to_string())));
+        let galient = "[1984-1986] Galient_機甲界(機甲界ガリアン)_TV.OVA/[1984-1985] Galient_機甲界(機甲界ガリアン)_TV/DVDRip.MKV.720x480.ruSub.左右黑邊保留/01";
+        let (trimmed, was_trimmed) = training_filename_for(galient);
+        assert!(was_trimmed);
+        assert_eq!(
+            trimmed,
+            "[1984-1985] Galient_機甲界(機甲界ガリアン) 01"
+        );
+        let galient_labels = labels_for(&trimmed);
+        assert!(galient_labels.contains(&("Galient".to_string(), "B-TITLE".to_string())));
+        assert!(!galient_labels.contains(&("TV".to_string(), "B-TITLE".to_string())));
+        assert!(galient_labels.contains(&("01".to_string(), "B-EPISODE".to_string())));
+        let nced = "[BDrip] Ao no Exorcist Yuki no Hate Hen S04 [343-Labs]/NCED";
+        let (trimmed, was_trimmed) = training_filename_for(nced);
+        assert!(was_trimmed);
+        assert_eq!(trimmed, "[BDrip] Ao no Exorcist Yuki no Hate Hen S04 [343-Labs] NCED");
+        let sakura = "Card Captor Sakura Chinese/魔卡少女樱(台配国语)/第01集 小樱与不可思议的魔法书";
+        let (trimmed, was_trimmed) = training_filename_for(sakura);
+        assert!(was_trimmed);
+        assert_eq!(
+            trimmed,
+            "魔卡少女樱(台配国语) 第01集 小樱与不可思议的魔法书"
+        );
+        let sakura_labels = labels_for(&trimmed);
+        assert!(sakura_labels.contains(&("魔卡少女樱".to_string(), "B-TITLE".to_string())));
+        assert!(sakura_labels.contains(&("01".to_string(), "B-EPISODE".to_string())));
         let volume =
             labels_for("[Snow-Raws] 生徒会役員共 Vol.01 MENU02 (BD 1920x1080 HEVC-YUV420P10 FLAC)");