Add low-frequency DMHY audit gate

Browse files

Files changed (2) hide show

tools/rust_dmhy_template_apply/README.md +19 -0
tools/rust_dmhy_template_apply/src/main.rs +308 -17

tools/rust_dmhy_template_apply/README.md CHANGED Viewed

@@ -31,6 +31,18 @@ cargo run --release --manifest-path tools\rust_dmhy_template_apply\Cargo.toml --
   --manifest-output reports\dmhy_weak.template_generated.top5000.rust.manifest.json
 ```
 Optional controls:
 ```powershell
@@ -47,3 +59,10 @@ The output is intended to match `tools/apply_dmhy_template_recipes.py` at the
 record schema level: `filename`, `tokens`, `labels`, `template_id`, `template`,
 plus optional `source_filename`, `path_trimmed`, and
 `dropped_title_candidate_positions`.

   --manifest-output reports\dmhy_weak.template_generated.top5000.rust.manifest.json
 ```
+Audit low-frequency recipe output from the repository root:
+```powershell
+cargo run --release --manifest-path tools\rust_dmhy_template_apply\Cargo.toml -- `
+  --audit-low-frequency `
+  --input datasets\AnimeName\dmhy_list.jsonl `
+  --recipes reports\dmhy_template_recipes.full_top5000.seed.jsonl `
+  --audit-output reports\dmhy_low_frequency_audit.rust.jsonl `
+  --audit-max-count 50 `
+  --threads 24
+```
 Optional controls:
 ```powershell
 record schema level: `filename`, `tokens`, `labels`, `template_id`, `template`,
 plus optional `source_filename`, `path_trimmed`, and
 `dropped_title_candidate_positions`.
+For low-frequency templates (`count <= --audit-max-count`, default `50`), the
+apply step uses a conservative gate: records that raise a `no_title`,
+`multiple_title_spans`, `path_retained`, or `hash_labeled` audit warning are
+excluded from the training JSONL and left in the audit/review files. This keeps
+common templates stable while preventing rare, ambiguous path/title cases from
+polluting the generated dataset.

tools/rust_dmhy_template_apply/src/main.rs CHANGED Viewed

@@ -17,6 +17,8 @@ use std::sync::atomic::{AtomicUsize, Ordering};
 struct Args {
     #[arg(long)]
     cluster: bool,
     #[arg(long, default_value = "datasets/AnimeName/dmhy_list.jsonl")]
     input: PathBuf,
     #[arg(long, default_value = "reports/dmhy_template_recipes.seed.jsonl")]
@@ -47,6 +49,10 @@ struct Args {
     recipes_output: PathBuf,
     #[arg(long, default_value = "reports/dmhy_template_review.rust.jsonl")]
     review_output: PathBuf,
     #[arg(long)]
     limit: Option<usize>,
     #[arg(long)]
@@ -115,6 +121,7 @@ struct Stats {
     skipped_no_recipe: usize,
     skipped_sample_cap: usize,
     skipped_role_mismatch: usize,
     written: usize,
 }
@@ -164,7 +171,7 @@ static CJK_SEASON_TOKEN_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
 static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
 static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(r"(?i)^(?:NCOP|NCED|OP|ED|PV|CM|SP|OVA|OAD|IV|Menu|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?$").unwrap()
 });
 static VOLUME_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
@@ -187,6 +194,8 @@ static PATH_SEGMENT_SEASON_RE: Lazy<Regex> = Lazy::new(|| {
 });
 static SEASON_WORD_NUMBER_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"(?i)(?:season|saison)\s*0?(\d{1,2})").unwrap());
 static S_NUMBER_SEGMENT_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"(?i)(?:^|[^\p{L}\p{N}])s0?(\d{1,2})(?:$|[^\p{L}\p{N}])").unwrap());
 static SXE_SEASON_RE: Lazy<Regex> = Lazy::new(|| {
@@ -225,6 +234,9 @@ fn main() -> Result<()> {
     if args.cluster {
         return run_cluster(&args);
     }
     if args.expand != "all" && args.expand != "sample" {
         bail!("--expand must be all or sample");
     }
@@ -293,6 +305,9 @@ fn main() -> Result<()> {
                     "no_recipe" => stats.skipped_no_recipe += 1,
                     "sample_cap" => stats.skipped_sample_cap += 1,
                     "role_mismatch" => stats.skipped_role_mismatch += 1,
                     _ => {}
                 }
             }
@@ -312,6 +327,13 @@ fn main() -> Result<()> {
         "selected_templates": recipes.len(),
         "confidence": args.confidence,
         "min_count": args.min_count,
         "expand": args.expand,
         "sample_per_template": if args.expand == "sample" { Some(args.sample_per_template) } else { None },
         "stats": stats,
@@ -603,6 +625,156 @@ fn write_jsonl_values(path: &PathBuf, rows: &[Value]) -> Result<()> {
     Ok(())
 }
 fn process_filename(
     original: &str,
     args: &Args,
@@ -654,6 +826,13 @@ fn process_filename(
             }
         }
     };
     if trimmed_parent {
         record.source_filename = Some(original.to_string());
         record.path_trimmed = Some(true);
@@ -668,6 +847,15 @@ fn process_filename(
     }
 }
 fn tokenize(value: &str) -> Vec<String> {
     let mut output = Vec::new();
     let mut index = 0;
@@ -1007,7 +1195,14 @@ fn suggested_roles(template: &str) -> Vec<String> {
                 roles[*index] = "TITLE".to_string();
             }
         } else if bracket_text.len() == 1 {
-            roles[bracket_text[0]] = if text.is_empty() { "TITLE" } else { "GROUP" }.to_string();
         }
         for index in text {
             roles[index] = "TITLE".to_string();
@@ -1052,6 +1247,9 @@ fn training_filename_for(original: &str) -> (String, bool) {
         .collect();
     if parts.len() >= 2 && filename_has_title(parts[parts.len() - 1]) {
         if parts.len() >= 3 && path_segment_has_season(parts[parts.len() - 2]) {
             let parent_seasons = path_segment_seasons(parts[parts.len() - 2]);
             let leaf_seasons = path_segment_seasons(parts[parts.len() - 1]);
             if parent_seasons
@@ -1070,6 +1268,11 @@ fn training_filename_for(original: &str) -> (String, bool) {
     }
 }
 /// Reports whether `PATH_SEGMENT_SEASON_RE` matches the path segment `value`.
 fn path_segment_has_season(value: &str) -> bool {
     PATH_SEGMENT_SEASON_RE.is_match(value)
 }
@@ -1150,7 +1353,7 @@ fn role_label(role: &str) -> String {
         "SEASON" => Some("SEASON"),
         "SPECIAL" | "VOLUME" => Some("SPECIAL"),
         "RESOLUTION" => Some("RESOLUTION"),
-        "SOURCE" | "HASH" => Some("SOURCE"),
         _ => None,
     };
     entity.map_or_else(|| "O".to_string(), |entity| format!("B-{entity}"))
@@ -1311,7 +1514,10 @@ fn label_for_refined_piece(piece: &str, role: &str, token_class: &str) -> String
         if atom_class == "RESOLUTION" {
             return "B-RESOLUTION".to_string();
         }
-        if matches!(atom_class.as_str(), "MEDIA" | "LANG" | "HASH") {
             return "B-SOURCE".to_string();
         }
         if matches!(atom_class.as_str(), "SPECIAL" | "VOLUME") {
@@ -1489,6 +1695,19 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
                     "vol" | "volume"
                 )
             {
                 output[index - 2] = "SPECIAL".to_string();
                 output[index] = "SPECIAL".to_string();
                 continue;
@@ -1548,6 +1767,27 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
             output[index] = "SPECIAL".to_string();
             continue;
         }
         if roles[index] == "TITLE"
             && matches!(text.to_ascii_lowercase().as_str(), "season" | "saison")
             && index + 2 < roles.len()
@@ -1616,19 +1856,26 @@ fn title_candidates(groups: &[Group], roles: &[String]) -> Vec<(usize, usize)> {
             index += 1;
             continue;
         }
-        if groups[index].class_name == "BRACKET_TEXT" {
-            candidates.push((index, index + 1));
-            index += 1;
-            continue;
-        }
         let start = index;
         index += 1;
-        while index + 1 < roles.len()
-            && roles[index] == "O"
-            && groups[index].class_name == "SEP"
-            && roles[index + 1] == "TITLE"
-        {
-            index += 2;
         }
         candidates.push((start, index));
     }
@@ -1838,10 +2085,15 @@ fn project_refined_tokens(
 fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
     let joiners = [
-        " ", ".", "-", "_", "·", "・", "×", "／", "/", "'", "’", ":", "：", "!", "！",
     ];
     let entity_joiners = [
-        " ", ".", "-", "_", "·", "・", "×", "／", "/", "'", "’", ":", "：", "!", "！", "&", "＆",
     ];
     let mut output = labels.to_vec();
     for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
@@ -1869,6 +2121,12 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
                 output[index] = left_label.clone();
             }
         }
     }
     output
 }
@@ -1962,6 +2220,32 @@ mod tests {
         assert!(pso2.contains(&("Episode".to_string(), "B-TITLE".to_string())));
         assert!(pso2.contains(&("Oracle".to_string(), "B-TITLE".to_string())));
         assert!(pso2.contains(&("01".to_string(), "B-EPISODE".to_string())));
         let conan_time = labels_for("【蓝色狂想】名侦探柯南TV版第036集-周一晚上7点半杀人事件");
         assert!(conan_time.contains(&("036".to_string(), "B-EPISODE".to_string())));
@@ -1987,6 +2271,13 @@ mod tests {
             trimmed,
             "Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p"
         );
         let woody = labels_for(&trimmed);
         assert!(woody.contains(&("4".to_string(), "B-SEASON".to_string())));
         assert!(woody.contains(&("E".to_string(), "O".to_string())));

 struct Args {
     #[arg(long)]
     cluster: bool,
+    #[arg(long)]
+    audit_low_frequency: bool,
     #[arg(long, default_value = "datasets/AnimeName/dmhy_list.jsonl")]
     input: PathBuf,
     #[arg(long, default_value = "reports/dmhy_template_recipes.seed.jsonl")]
     recipes_output: PathBuf,
     #[arg(long, default_value = "reports/dmhy_template_review.rust.jsonl")]
     review_output: PathBuf,
+    #[arg(long, default_value = "reports/dmhy_low_frequency_audit.rust.jsonl")]
+    audit_output: PathBuf,
+    #[arg(long, default_value_t = 50)]
+    audit_max_count: u64,
     #[arg(long)]
     limit: Option<usize>,
     #[arg(long)]
     skipped_no_recipe: usize,
     skipped_sample_cap: usize,
     skipped_role_mismatch: usize,
+    skipped_low_frequency_audit_warning: usize,
     written: usize,
 }
     Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
 static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
 static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r"(?i)^(?:NCOP|NCED|OP|ED|PV|CM|SP|OVA|OAD|IV|Menu|Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?$").unwrap()
 });
 static VOLUME_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
 });
 static SEASON_WORD_NUMBER_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"(?i)(?:season|saison)\s*0?(\d{1,2})").unwrap());
+/// Matches a string that consists of nothing but a season marker: either
+/// `season`/`saison`, optional whitespace, an optional single leading zero and
+/// one or two digits, or `s` followed by an optional leading zero and one or
+/// two digits. Both alternatives are anchored and case-insensitive.
+static PLAIN_SEASON_SEGMENT_RE: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"(?i)^(?:season|saison)\s*0?\d{1,2}$|^s0?\d{1,2}$").unwrap());
 static S_NUMBER_SEGMENT_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"(?i)(?:^|[^\p{L}\p{N}])s0?(\d{1,2})(?:$|[^\p{L}\p{N}])").unwrap());
 static SXE_SEASON_RE: Lazy<Regex> = Lazy::new(|| {
     if args.cluster {
         return run_cluster(&args);
     }
+    if args.audit_low_frequency {
+        return run_low_frequency_audit(&args);
+    }
     if args.expand != "all" && args.expand != "sample" {
         bail!("--expand must be all or sample");
     }
                     "no_recipe" => stats.skipped_no_recipe += 1,
                     "sample_cap" => stats.skipped_sample_cap += 1,
                     "role_mismatch" => stats.skipped_role_mismatch += 1,
+                    "low_frequency_audit_warning" => {
+                        stats.skipped_low_frequency_audit_warning += 1
+                    }
                     _ => {}
                 }
             }
         "selected_templates": recipes.len(),
         "confidence": args.confidence,
         "min_count": args.min_count,
+        "low_frequency_audit_max_count": args.audit_max_count,
+        "low_frequency_blocking_warnings": [
+            "hash_labeled",
+            "multiple_title_spans",
+            "no_title",
+            "path_retained"
+        ],
         "expand": args.expand,
         "sample_per_template": if args.expand == "sample" { Some(args.sample_per_template) } else { None },
         "stats": stats,
     Ok(())
 }
+/// Audits one sample filename per low-frequency template.
+///
+/// A template counts as low-frequency when its recipe `count` (a missing count
+/// is treated as `0`) is at most `args.audit_max_count`. For each such template
+/// the first input filename that yields a record is written as one JSONL row to
+/// `args.audit_output`, carrying the labelled tokens, the merged entity spans and
+/// the audit warnings. Rows are ordered by ascending `count`, then `template_id`.
+/// A JSON manifest summarising the run is printed to stdout.
+///
+/// # Errors
+///
+/// Propagates failures from loading the recipes or the input, from writing the
+/// audit JSONL, and from serialising the manifest.
+fn run_low_frequency_audit(args: &Args) -> Result<()> {
+    let recipes = load_recipes(args)?;
+    let inputs = load_input(&args.input, args.limit)?;
+    let low_template_total = recipes
+        .values()
+        .filter(|recipe| recipe.count.unwrap_or(0) <= args.audit_max_count)
+        .count();
+    let mut seen_templates = HashSet::new();
+    let mut rows = Vec::new();
+    for original in inputs {
+        if !args.keep_encoding_noise
+            && (has_encoding_noise(&original)
+                || has_non_anime_noise(&original)
+                || has_abstract_path_noise(&original))
+        {
+            continue;
+        }
+        let (training_filename, trimmed_parent) = training_filename_for(&original);
+        let (key, _tokens, _classes, groups) = template_key_for_filename(&training_filename);
+        let Some(recipe) = recipes.get(&key) else {
+            continue;
+        };
+        let count = recipe.count.unwrap_or(0);
+        if count > args.audit_max_count || seen_templates.contains(&recipe.template_id) {
+            continue;
+        }
+        if recipe.roles.len() != groups.len() {
+            continue;
+        }
+        let Some(mut record) = dmhy_record(&training_filename, &recipe.template_id, &recipe.roles)
+        else {
+            continue;
+        };
+        // Mark the template as audited only once a record exists. Marking it
+        // earlier would drop the template from the audit whenever its first
+        // matching filename fails to produce a record, even if a later one would.
+        seen_templates.insert(recipe.template_id.clone());
+        if trimmed_parent {
+            record.source_filename = Some(original.clone());
+            record.path_trimmed = Some(true);
+        }
+        rows.push(json!({
+            "template_id": recipe.template_id,
+            "count": count,
+            "template": recipe.template,
+            "filename": record.filename,
+            "source_filename": record.source_filename,
+            "path_trimmed": record.path_trimmed.unwrap_or(false),
+            "spans": entity_spans(&record.tokens, &record.labels),
+            "warnings": audit_warnings(&record),
+            "tokens": record.tokens,
+            "labels": record.labels,
+        }));
+        // Every low-frequency template has a row; the remaining input cannot add more.
+        if seen_templates.len() >= low_template_total {
+            break;
+        }
+    }
+    rows.sort_by(|a, b| {
+        let count_a = a.get("count").and_then(Value::as_u64).unwrap_or(0);
+        let count_b = b.get("count").and_then(Value::as_u64).unwrap_or(0);
+        let id_a = a.get("template_id").and_then(Value::as_str).unwrap_or("");
+        let id_b = b.get("template_id").and_then(Value::as_str).unwrap_or("");
+        count_a.cmp(&count_b).then_with(|| id_a.cmp(id_b))
+    });
+    write_jsonl_values(&args.audit_output, &rows)?;
+    let warning_counts = warning_counts(&rows);
+    let manifest = json!({
+        "generated_at": Utc::now().to_rfc3339(),
+        "input": args.input.to_string_lossy(),
+        "recipes": args.recipes.to_string_lossy(),
+        "audit_output": args.audit_output.to_string_lossy(),
+        "audit_max_count": args.audit_max_count,
+        "low_template_total": low_template_total,
+        "audited_templates": rows.len(),
+        "warning_counts": warning_counts,
+        "implementation": "rust_dmhy_low_frequency_audit"
+    });
+    println!("{}", serde_json::to_string_pretty(&manifest)?);
+    Ok(())
+}
+/// Merges consecutive tokens that share an entity into `{label, text}` spans.
+///
+/// The entity of a token is its label with a leading `B-` or `I-` removed; a
+/// label with neither prefix is treated as `O`. Adjacent tokens with the same
+/// entity are concatenated into one span, and runs of `O` are dropped from the
+/// result. Tokens and labels are paired positionally; extra items on the longer
+/// side are ignored.
+fn entity_spans(tokens: &[String], labels: &[String]) -> Vec<Value> {
+    let mut spans = Vec::new();
+    // Entity name and accumulated text of the run currently being built.
+    let mut open_run: Option<(&str, String)> = None;
+    for (token, label) in tokens.iter().zip(labels) {
+        let entity = match label.strip_prefix("B-").or_else(|| label.strip_prefix("I-")) {
+            Some(name) => name,
+            None => "O",
+        };
+        if let Some((name, text)) = open_run.as_mut() {
+            if *name == entity {
+                text.push_str(token);
+                continue;
+            }
+        }
+        // The entity changed: start a new run and emit the one just closed.
+        if let Some((name, text)) = open_run.replace((entity, token.clone())) {
+            if name != "O" {
+                spans.push(json!({ "label": name, "text": text }));
+            }
+        }
+    }
+    if let Some((name, text)) = open_run {
+        if name != "O" {
+            spans.push(json!({ "label": name, "text": text }));
+        }
+    }
+    spans
+}
+/// Collects the audit warning codes that apply to `record`.
+///
+/// Possible codes:
+/// - `no_title` / `multiple_title_spans`: zero, or more than one, `TITLE` span
+///   as computed by `entity_spans`;
+/// - `no_episode`: no label ends with `EPISODE`;
+/// - `path_retained`: the filename still contains `/` or `\`;
+/// - `hash_labeled`: a token matching `HASH_RE` carries a label other than `O`.
+///
+/// The returned codes are sorted and de-duplicated.
+fn audit_warnings(record: &Record) -> Vec<String> {
+    let title_span_total = entity_spans(&record.tokens, &record.labels)
+        .iter()
+        .filter(|span| span.get("label").and_then(Value::as_str) == Some("TITLE"))
+        .count();
+    let title_warning = match title_span_total {
+        0 => Some("no_title"),
+        1 => None,
+        _ => Some("multiple_title_spans"),
+    };
+    let lacks_episode = record
+        .labels
+        .iter()
+        .all(|label| !label.ends_with("EPISODE"));
+    let keeps_path = record
+        .filename
+        .contains(|ch: char| matches!(ch, '/' | '\\'));
+    // Tokens without a corresponding label are never considered labelled.
+    let hash_is_labeled = record
+        .tokens
+        .iter()
+        .zip(record.labels.iter())
+        .any(|(token, label)| HASH_RE.is_match(token) && label != "O");
+    let mut warnings: Vec<String> = title_warning
+        .into_iter()
+        .chain(lacks_episode.then_some("no_episode"))
+        .chain(keeps_path.then_some("path_retained"))
+        .chain(hash_is_labeled.then_some("hash_labeled"))
+        .map(str::to_string)
+        .collect();
+    warnings.sort();
+    warnings.dedup();
+    warnings
+}
+/// Tallies how many audit rows carry each warning code.
+///
+/// Reads the `warnings` array of every row; rows without such an array and
+/// array entries that are not strings are ignored.
+fn warning_counts(rows: &[Value]) -> HashMap<String, usize> {
+    let mut tally: HashMap<String, usize> = HashMap::new();
+    let codes = rows
+        .iter()
+        .filter_map(|row| row.get("warnings").and_then(Value::as_array))
+        .flatten()
+        .filter_map(Value::as_str);
+    for code in codes {
+        *tally.entry(code.to_string()).or_default() += 1;
+    }
+    tally
+}
 fn process_filename(
     original: &str,
     args: &Args,
             }
         }
     };
+    if recipe.count.unwrap_or(0) <= args.audit_max_count && has_blocking_low_frequency_warning(&record)
+    {
+        return Processed::Skipped {
+            reason: "low_frequency_audit_warning",
+            trimmed_parent,
+        };
+    }
     if trimmed_parent {
         record.source_filename = Some(original.to_string());
         record.path_trimmed = Some(true);
     }
 }
+/// Reports whether `record` raises any audit warning from the blocking set:
+/// `hash_labeled`, `multiple_title_spans`, `no_title` or `path_retained`.
+fn has_blocking_low_frequency_warning(record: &Record) -> bool {
+    const BLOCKING: [&str; 4] = [
+        "hash_labeled",
+        "multiple_title_spans",
+        "no_title",
+        "path_retained",
+    ];
+    audit_warnings(record)
+        .iter()
+        .any(|code| BLOCKING.contains(&code.as_str()))
+}
 fn tokenize(value: &str) -> Vec<String> {
     let mut output = Vec::new();
     let mut index = 0;
                 roles[*index] = "TITLE".to_string();
             }
         } else if bracket_text.len() == 1 {
+            roles[bracket_text[0]] = if text.is_empty() {
+                "TITLE"
+            } else if bracket_text[0] == *start {
+                "GROUP"
+            } else {
+                "TITLE"
+            }
+            .to_string();
         }
         for index in text {
             roles[index] = "TITLE".to_string();
         .collect();
     if parts.len() >= 2 && filename_has_title(parts[parts.len() - 1]) {
         if parts.len() >= 3 && path_segment_has_season(parts[parts.len() - 2]) {
+            if !path_segment_is_plain_season(parts[parts.len() - 2]) {
+                return (parts[parts.len() - 1].to_string(), true);
+            }
             let parent_seasons = path_segment_seasons(parts[parts.len() - 2]);
             let leaf_seasons = path_segment_seasons(parts[parts.len() - 1]);
             if parent_seasons
     }
 }
+/// Reports whether `segment`, after `strip_wrapper` and whitespace trimming,
+/// matches `PLAIN_SEASON_SEGMENT_RE`.
+fn path_segment_is_plain_season(segment: &str) -> bool {
+    let unwrapped = strip_wrapper(segment);
+    PLAIN_SEASON_SEGMENT_RE.is_match(unwrapped.trim())
+}
 /// Reports whether `PATH_SEGMENT_SEASON_RE` matches the path segment `value`.
 fn path_segment_has_season(value: &str) -> bool {
     PATH_SEGMENT_SEASON_RE.is_match(value)
 }
         "SEASON" => Some("SEASON"),
         "SPECIAL" | "VOLUME" => Some("SPECIAL"),
         "RESOLUTION" => Some("RESOLUTION"),
+        "SOURCE" => Some("SOURCE"),
         _ => None,
     };
     entity.map_or_else(|| "O".to_string(), |entity| format!("B-{entity}"))
         if atom_class == "RESOLUTION" {
             return "B-RESOLUTION".to_string();
         }
+        if atom_class == "HASH" {
+            return "O".to_string();
+        }
+        if matches!(atom_class.as_str(), "MEDIA" | "LANG") {
             return "B-SOURCE".to_string();
         }
         if matches!(atom_class.as_str(), "SPECIAL" | "VOLUME") {
                     "vol" | "volume"
                 )
             {
+                let next_text_before_episode = (index + 1..roles.len())
+                    .find(|&cursor| groups[cursor].class_name != "SEP")
+                    .is_some_and(|cursor| {
+                        groups[cursor].class_name == "TEXT"
+                            && roles[cursor + 1..]
+                                .iter()
+                                .any(|role| role.starts_with("EPISODE"))
+                    });
+                if next_text_before_episode {
+                    output[index - 2] = "TITLE".to_string();
+                    output[index] = "TITLE".to_string();
+                    continue;
+                }
                 output[index - 2] = "SPECIAL".to_string();
                 output[index] = "SPECIAL".to_string();
                 continue;
             output[index] = "SPECIAL".to_string();
             continue;
         }
+        if roles[index] == "TITLE" && matches!(text.as_str(), "第" | "話" | "话" | "回" | "集")
+        {
+            output[index] = "O".to_string();
+            continue;
+        }
+        if output[index] == "O"
+            && groups[index].class_name == "TEXT"
+            && roles[index + 1..].iter().any(|role| role.starts_with("EPISODE"))
+            && text.chars().any(|ch| ch.is_alphabetic())
+            && !ep_markers.contains(&text.as_str())
+        {
+            if let Some(last_title) = output[..index].iter().rposition(|role| role == "TITLE") {
+                let episode_since_title = output[last_title + 1..index]
+                    .iter()
+                    .any(|role| role.starts_with("EPISODE"));
+                if !episode_since_title {
+                    output[index] = "TITLE".to_string();
+                    continue;
+                }
+            }
+        }
         if roles[index] == "TITLE"
             && matches!(text.to_ascii_lowercase().as_str(), "season" | "saison")
             && index + 2 < roles.len()
             index += 1;
             continue;
         }
         let start = index;
         index += 1;
+        loop {
+            if index < roles.len()
+                && roles[index] == "TITLE"
+                && !(groups[index - 1].class_name == "BRACKET_TEXT"
+                    && groups[index].class_name == "BRACKET_TEXT")
+            {
+                index += 1;
+                continue;
+            }
+            if index + 1 < roles.len()
+                && roles[index] == "O"
+                && groups[index].class_name == "SEP"
+                && roles[index + 1] == "TITLE"
+            {
+                index += 2;
+                continue;
+            }
+            break;
         }
         candidates.push((start, index));
     }
 fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
     let joiners = [
+        " ", ".", "-", "_", "·", "・", "×", "／", "/", "'", "’", ":", "：", "!", "！", "?",
+        "？", ";", "；", ",", "，", "～", "~", "－", "(", ")", "（", "）", "[", "]", "【",
+        "】", "｢", "｣", "「", "」", "☆", "@",
     ];
+    let title_terminal_punctuation = ["!", "！", "?", "？"];
     let entity_joiners = [
+        " ", ".", "-", "_", "·", "・", "×", "／", "/", "'", "’", ":", "：", "!", "！", "?",
+        "？", ";", "；", ",", "，", "～", "~", "－", "(", ")", "（", "）", "[", "]", "【",
+        "】", "｢", "｣", "「", "」", "☆", "@", "&", "＆",
     ];
     let mut output = labels.to_vec();
     for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
                 output[index] = left_label.clone();
             }
         }
+        if title_terminal_punctuation.contains(&token.as_str()) && index > 0 {
+            let left_label = &output[index - 1];
+            if left_label == "B-TITLE" {
+                output[index] = "B-TITLE".to_string();
+            }
+        }
     }
     output
 }
         assert!(pso2.contains(&("Episode".to_string(), "B-TITLE".to_string())));
         assert!(pso2.contains(&("Oracle".to_string(), "B-TITLE".to_string())));
         assert!(pso2.contains(&("01".to_string(), "B-EPISODE".to_string())));
+        let aikatsu = labels_for("Aikatsu Friends! - S2E01 (BD 1920x1080 x264 FLAC)");
+        assert!(aikatsu.contains(&("!".to_string(), "B-TITLE".to_string())));
+        let intro = labels_for("[VCB-Studio] LoveLive! µ's Live Collection [01][intro][1080p]");
+        assert!(intro.contains(&("intro".to_string(), "B-SPECIAL".to_string())));
+        let hash = labels_for("[Group][Title][01][1080p][00270AC8]");
+        assert!(hash.contains(&("00270AC8".to_string(), "O".to_string())));
+        let yamato = labels_for("[1995.01] YAMATO2520 Vol.1 明日への希望-0001");
+        assert!(yamato.contains(&("YAMATO2520".to_string(), "B-TITLE".to_string())));
+        assert!(yamato.contains(&("明日への希望".to_string(), "B-TITLE".to_string())));
+        let ubw = labels_for("Fate／stay night [Unlimited Blade Works] #00 「プロローグ」");
+        assert!(ubw.contains(&("Unlimited".to_string(), "B-TITLE".to_string())));
+        assert!(!ubw.contains(&("Unlimited".to_string(), "B-GROUP".to_string())));
+        let alias_title = labels_for("[Koten_Gars] Tegami Bachi; Letter Bee - 01 [1080p]");
+        assert!(alias_title.contains(&(";".to_string(), "B-TITLE".to_string())));
+        let comma_title =
+            labels_for("[Studio GreenTea] Kamiina Botan, Yoeru Sugata wa Yuri no Hana [01]");
+        assert!(comma_title.contains(&(",".to_string(), "B-TITLE".to_string())));
+        let happy_lesson = labels_for("【DVD】 HAPPY☆LESSON THE TV 第01話");
+        assert!(happy_lesson.contains(&("☆".to_string(), "B-TITLE".to_string())));
+        let idolmaster = labels_for("[CASO&SumiSora][THE_IDOLM@STER_CINDERELLA_GIRLS][07.5_SP]");
+        assert!(idolmaster.contains(&("@".to_string(), "B-TITLE".to_string())));
+        let soul_taker = labels_for("[AI-Raws] THE SOUL TAKER～魂狩～ #01 (HEVC 1312x720)");
+        assert!(soul_taker.contains(&("～".to_string(), "B-TITLE".to_string())));
+        let mayoi = labels_for("[Snow-Raws] 迷家[マヨイガ] 第01話");
+        assert!(mayoi.contains(&("迷家".to_string(), "B-TITLE".to_string())));
+        assert!(mayoi.contains(&("マヨイガ".to_string(), "B-TITLE".to_string())));
         let conan_time = labels_for("【蓝色狂想】名侦探柯南TV版第036集-周一晚上7点半杀人事件");
         assert!(conan_time.contains(&("036".to_string(), "B-EPISODE".to_string())));
             trimmed,
             "Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p"
         );
+        let pokemon = "Pokémon Season 2 - Orange League [Ep. 83-118]/Pokemon - 084 - OL002 - A Scare in the Air [DVD][PM-Dragon-x264-AC3][00270AC8]";
+        let (trimmed_pokemon, pokemon_was_trimmed) = training_filename_for(pokemon);
+        assert!(pokemon_was_trimmed);
+        assert_eq!(
+            trimmed_pokemon,
+            "Pokemon - 084 - OL002 - A Scare in the Air [DVD][PM-Dragon-x264-AC3][00270AC8]"
+        );
         let woody = labels_for(&trimmed);
         assert!(woody.contains(&("4".to_string(), "B-SEASON".to_string())));
         assert!(woody.contains(&("E".to_string(), "O".to_string())));