Add Rust DMHY template clustering

Browse files

Files changed (2) hide show

tools/rust_dmhy_template_apply/README.md +20 -1
tools/rust_dmhy_template_apply/src/main.rs +556 -13

tools/rust_dmhy_template_apply/README.md CHANGED Viewed

@@ -2,7 +2,26 @@
 Multi-core Rust implementation of the DMHY template recipe apply stage.
-Run from the repository root:
 ```powershell
 cargo run --release --manifest-path tools\rust_dmhy_template_apply\Cargo.toml -- `

 Multi-core Rust implementation of the DMHY template recipe apply stage.
+Build template recipes from the repository root:
+```powershell
+cargo run --release --manifest-path tools\rust_dmhy_template_apply\Cargo.toml -- `
+  --cluster `
+  --input datasets\AnimeName\dmhy_list.jsonl `
+  --summary-output reports\dmhy_template_clusters.full_top5000.summary.json `
+  --samples-output reports\dmhy_template_clusters.full_top5000.samples.jsonl `
+  --clusters-output reports\dmhy_template_clusters.full_top5000.jsonl `
+  --recipes-output reports\dmhy_template_recipes.full_top5000.seed.jsonl `
+  --review-output reports\dmhy_template_review.full_top5000.jsonl `
+  --top 5000 `
+  --recipe-top 5000 `
+  --review-top 5000 `
+  --min-count 2 `
+  --recipe-min-count 10 `
+  --threads 24
+```
+Apply template recipes from the repository root:
 ```powershell
 cargo run --release --manifest-path tools\rust_dmhy_template_apply\Cargo.toml -- `

tools/rust_dmhy_template_apply/src/main.rs CHANGED Viewed

@@ -15,6 +15,8 @@ use std::sync::atomic::{AtomicUsize, Ordering};
 #[derive(Parser, Debug)]
 #[command(about = "Apply DMHY template recipes with a multi-core Rust pipeline")]
 struct Args {
     #[arg(long, default_value = "datasets/AnimeName/dmhy_list.jsonl")]
     input: PathBuf,
     #[arg(long, default_value = "reports/dmhy_template_recipes.seed.jsonl")]
@@ -29,12 +31,38 @@ struct Args {
         default_value = "reports/dmhy_weak.template_generated.rust.manifest.json"
     )]
     manifest_output: PathBuf,
     #[arg(long)]
     limit: Option<usize>,
     #[arg(long)]
     limit_templates: Option<usize>,
     #[arg(long, default_value_t = 1)]
     min_count: u64,
     #[arg(long, default_value = "high")]
     confidence: String,
     #[arg(long, default_value = "all")]
@@ -44,6 +72,8 @@ struct Args {
     #[arg(long)]
     keep_encoding_noise: bool,
     #[arg(long)]
     threads: Option<usize>,
 }
@@ -88,6 +118,15 @@ struct Stats {
     written: usize,
 }
 #[derive(Debug)]
 enum Processed {
     Written {
@@ -123,7 +162,7 @@ static CJK_SEASON_TOKEN_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
 static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
 static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(r"(?i)^(?:NCOP|NCED|OP|ED|PV|CM|SP|OVA|OAD|IV|Menu|Preview|Trailer|Teaser)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?$").unwrap()
 });
 static VOLUME_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
@@ -141,9 +180,21 @@ static SPECIAL_TITLE_PHRASE_RE: Lazy<Regex> = Lazy::new(|| {
 });
 static YEAR_RANGE_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"^\(?\s*(?:19|20)\d{2}\s*[-~]\s*(?:19|20)\d{2}\s*\)?$").unwrap());
 static TOKEN_REGEXES: Lazy<Vec<Regex>> = Lazy::new(|| {
     [
         r"^\d{3,4}[xX×]\d{3,4}",
         r"^[\\/]+",
         r"^[-_.:：+&|]+",
         r"^\s+",
@@ -169,6 +220,9 @@ fn main() -> Result<()> {
             .build_global()
             .context("failed to configure rayon thread pool")?;
     }
     if args.expand != "all" && args.expand != "sample" {
         bail!("--expand must be all or sample");
     }
@@ -331,6 +385,222 @@ fn load_input(path: &PathBuf, limit: Option<usize>) -> Result<Vec<String>> {
     Ok(values)
 }
 fn process_filename(
     original: &str,
     args: &Args,
@@ -472,7 +742,7 @@ fn split_inner(inner: &str) -> Vec<String> {
     let mut parts = Vec::new();
     let mut current = String::new();
     for ch in inner.chars() {
-        if ch.is_whitespace() || "_.,+/&|-".contains(ch) {
             if !current.is_empty() {
                 parts.push(std::mem::take(&mut current));
             }
@@ -586,10 +856,22 @@ fn classify_token(token: &str) -> String {
     }
     if token.starts_with('[') || token.starts_with('(') || token.starts_with('【') {
         let inner = strip_wrapper(token);
-        let whole_class = classify_atom(&inner);
         let parts = split_inner(&inner);
         let inner_class = if whole_class != "TEXT" {
-            whole_class
         } else if parts.is_empty() {
             "EMPTY".to_string()
         } else {
@@ -677,7 +959,7 @@ fn suggested_roles(template: &str) -> Vec<String> {
             "EPISODE_VERSION"
         } else if item.contains("EPISODE_RANGE") {
             "EPISODE_RANGE"
-        } else if item.contains("EPISODE") || *item == "SXE" {
             "EPISODE"
         } else if item.contains("RESOLUTION") {
             "RESOLUTION"
@@ -767,12 +1049,39 @@ fn training_filename_for(original: &str) -> (String, bool) {
         .filter(|part| !part.is_empty())
         .collect();
     if parts.len() >= 2 && filename_has_title(parts[parts.len() - 1]) {
-        (parts[1..].join("/"), true)
     } else {
         (original.to_string(), false)
     }
 }
 fn has_encoding_noise(value: &str) -> bool {
     if value.contains('\u{fffd}') {
         return true;
@@ -910,6 +1219,30 @@ fn split_refined_token(token: &str) -> Vec<String> {
     let mut merged = Vec::new();
     let mut index = 0;
     while index < pieces.len() {
         if !is_separator(&pieces[index]) {
             let mut end = index;
             let mut combined = String::new();
@@ -1066,7 +1399,7 @@ fn is_special_title_phrase(text: &str) -> bool {
 fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]) -> Vec<String> {
     let mut output = roles.to_vec();
     let ep_markers = ["EP", "E", "Episode", "ep", "episode"];
-    let roman = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X"];
     if !output.iter().any(|role| role == "TITLE")
         && roles
             .first()
@@ -1086,17 +1419,78 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
             }
         }
         if title_run.len() >= 2 {
             for index in title_run {
                 output[index] = "TITLE".to_string();
             }
         }
     }
     for index in 0..roles.len() {
         let text = group_text(tokens, &groups[index]);
         if roles[index].starts_with("EPISODE") && YEAR_RANGE_RE.is_match(&text) {
             output[index] = "O".to_string();
             continue;
         }
         if roles[index] == "TITLE" && is_special_title_phrase(&text) {
             output[index] = "SPECIAL".to_string();
             continue;
@@ -1111,7 +1505,10 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
             output[index + 2] = "SEASON".to_string();
             continue;
         }
-        if roles[index] == "TITLE" && roman.contains(&text.to_ascii_uppercase().as_str()) {
             let previous_title = output[..index].iter().any(|role| role == "TITLE");
             let next_structural = roles[index + 1..]
                 .iter()
@@ -1131,6 +1528,29 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
                 output[index + 2] = "O".to_string();
             }
         }
     }
     output
 }
@@ -1301,7 +1721,7 @@ fn project_refined_tokens(
                     }
                 }
                 if matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE") {
-                    if let Some((pieces, labels)) = split_sxe_token(token) {
                         output_tokens.extend(pieces);
                         output_labels.extend(labels);
                         continue;
@@ -1315,8 +1735,10 @@ fn project_refined_tokens(
                             continue;
                         }
                     }
-                    output_labels.push(label_for_refined_piece(&piece, role, &group.class_name));
-                    output_tokens.push(piece);
                 }
             } else {
                 if role == "TITLE" && matches!(token.as_str(), "第" | "話" | "话" | "回" | "集")
@@ -1352,9 +1774,11 @@ fn project_refined_tokens(
 }
 fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
-    let joiners = [" ", ".", "-", "_", "·", "・", "×", "／", "/", "'", "’", ":"];
     let entity_joiners = [
-        " ", ".", "-", "_", "·", "・", "×", "／", "/", "'", "’", ":", "&",
     ];
     let mut output = labels.to_vec();
     for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
@@ -1442,6 +1866,8 @@ mod tests {
         let dxd = labels_for("High School D×D");
         assert!(dxd.contains(&("×".to_string(), "B-TITLE".to_string())));
         let sxe = labels_for("S01E02");
         assert_eq!(
@@ -1453,6 +1879,27 @@ mod tests {
                 ("02".to_string(), "B-EPISODE".to_string())
             ]
         );
         let sky = labels_for("[Skytree][海贼王][One_Piece][918][GB_JP][1080P]");
         assert!(sky.contains(&("One".to_string(), "B-TITLE".to_string())));
@@ -1486,5 +1933,101 @@ mod tests {
         assert!(cjk_season.contains(&("魔道祖师".to_string(), "B-TITLE".to_string())));
         assert!(cjk_season.contains(&("第一季".to_string(), "B-SEASON".to_string())));
         assert!(!cjk_season.contains(&("第一季".to_string(), "B-TITLE".to_string())));
     }
 }

 #[derive(Parser, Debug)]
 #[command(about = "Apply DMHY template recipes with a multi-core Rust pipeline")]
 struct Args {
+    #[arg(long)]
+    cluster: bool,
     #[arg(long, default_value = "datasets/AnimeName/dmhy_list.jsonl")]
     input: PathBuf,
     #[arg(long, default_value = "reports/dmhy_template_recipes.seed.jsonl")]
         default_value = "reports/dmhy_weak.template_generated.rust.manifest.json"
     )]
     manifest_output: PathBuf,
+    #[arg(
+        long,
+        default_value = "reports/dmhy_template_clusters.rust.summary.json"
+    )]
+    summary_output: PathBuf,
+    #[arg(
+        long,
+        default_value = "reports/dmhy_template_clusters.rust.samples.jsonl"
+    )]
+    samples_output: PathBuf,
+    #[arg(long, default_value = "reports/dmhy_template_clusters.rust.jsonl")]
+    clusters_output: PathBuf,
+    #[arg(long, default_value = "reports/dmhy_template_recipes.rust.seed.jsonl")]
+    recipes_output: PathBuf,
+    #[arg(long, default_value = "reports/dmhy_template_review.rust.jsonl")]
+    review_output: PathBuf,
     #[arg(long)]
     limit: Option<usize>,
     #[arg(long)]
     limit_templates: Option<usize>,
     #[arg(long, default_value_t = 1)]
     min_count: u64,
+    #[arg(long, default_value_t = 200)]
+    top: usize,
+    #[arg(long, default_value_t = 200)]
+    recipe_top: usize,
+    #[arg(long, default_value_t = 1000)]
+    review_top: usize,
+    #[arg(long, default_value_t = 8)]
+    examples: usize,
+    #[arg(long, default_value_t = 10)]
+    recipe_min_count: usize,
     #[arg(long, default_value = "high")]
     confidence: String,
     #[arg(long, default_value = "all")]
     #[arg(long)]
     keep_encoding_noise: bool,
     #[arg(long)]
+    preserve_parent_paths: bool,
+    #[arg(long)]
     threads: Option<usize>,
 }
     written: usize,
 }
+/// Running aggregate for all filenames that share one template key.
+#[derive(Debug, Default)]
+struct Cluster {
+    /// Number of filenames assigned to this template so far.
+    count: usize,
+    /// The first filenames seen for this template, capped by the example limit passed to `add_cluster`.
+    examples: Vec<String>,
+    /// Occurrences of each unwrapped `TEXT` / `BRACKET_TEXT` token across the cluster.
+    literal_counts: HashMap<String, usize>,
+    /// Occurrences of each token class across the cluster.
+    class_counts: HashMap<String, usize>,
+    /// One map per group position: occurrences of each `TEXT` / `BRACKET_TEXT` group text.
+    position_literals: Vec<HashMap<String, usize>>,
+}
 #[derive(Debug)]
 enum Processed {
     Written {
     Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
 static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
 static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r"(?i)^(?:NCOP|NCED|OP|ED|PV|CM|SP|OVA|OAD|IV|Menu|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?$").unwrap()
 });
 static VOLUME_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
 });
 static YEAR_RANGE_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"^\(?\s*(?:19|20)\d{2}\s*[-~]\s*(?:19|20)\d{2}\s*\)?$").unwrap());
+// Case-insensitive test for a `season N` / `sN` marker (1-2 digits) that is delimited by the
+// start/end of the string or by one of: whitespace, `_`, `.`, `-`, `/`.
+static PATH_SEGMENT_SEASON_RE: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r"(?i)(?:^|[\s_.\-/])(?:season\s*\d{1,2}|s\d{1,2})(?:$|[\s_.\-/])").unwrap()
+});
+// Captures the number that follows the word `season` or `saison`; a single leading zero is
+// left out of the capture when another digit follows it.
+static SEASON_WORD_NUMBER_RE: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"(?i)(?:season|saison)\s*0?(\d{1,2})").unwrap());
+// Captures the number of a stand-alone `sN` token, i.e. one bounded on both sides by the
+// start/end of the string or a character that is neither a letter nor a digit.
+// The boundary characters are consumed by the match.
+static S_NUMBER_SEGMENT_RE: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"(?i)(?:^|[^\p{L}\p{N}])s0?(\d{1,2})(?:$|[^\p{L}\p{N}])").unwrap());
+// Captures the season number of a stand-alone `sNeM` token (episode part: 1-4 digits).
+static SXE_SEASON_RE: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r"(?i)(?:^|[^\p{L}\p{N}])s0?(\d{1,2})e\d{1,4}(?:$|[^\p{L}\p{N}])").unwrap()
+});
 static TOKEN_REGEXES: Lazy<Vec<Regex>> = Lazy::new(|| {
     [
         r"^\d{3,4}[xX×]\d{3,4}",
+        r"(?i)^h\.?26[45]",
+        r"(?i)^x\.?26[45]",
         r"^[\\/]+",
         r"^[-_.:：+&|]+",
         r"^\s+",
             .build_global()
             .context("failed to configure rayon thread pool")?;
     }
+    if args.cluster {
+        return run_cluster(&args);
+    }
     if args.expand != "all" && args.expand != "sample" {
         bail!("--expand must be all or sample");
     }
     Ok(values)
 }
+/// Clusters the input filenames by template key and writes the cluster, sample, recipe,
+/// review and summary reports named in `args`.
+///
+/// Rows flagged by the noise filters are skipped unless `--keep-encoding-noise` is set, and
+/// parent path segments are trimmed via `training_filename_for` unless
+/// `--preserve-parent-paths` is set. Clusters are ranked by descending count, ties broken by
+/// template key, so the output is deterministic.
+///
+/// Recipe and review rows are both drawn from the first `recipe_top` ranked clusters:
+/// high-confidence rows become recipes, the remaining rows (up to `review_top`) go to review.
+///
+/// # Errors
+/// Returns an error when the input cannot be loaded or any report cannot be written.
+fn run_cluster(args: &Args) -> Result<()> {
+    let inputs = load_input(&args.input, args.limit)?;
+    let source_rows = inputs.len();
+    let mut clusters: HashMap<String, Cluster> = HashMap::new();
+    let mut skipped_encoding_noise = 0usize;
+    let mut trimmed_parent_path = 0usize;
+    let mut total_rows = 0usize;
+    for original in inputs {
+        if !args.keep_encoding_noise
+            && (has_encoding_noise(&original)
+                || has_non_anime_noise(&original)
+                || has_abstract_path_noise(&original))
+        {
+            skipped_encoding_noise += 1;
+            continue;
+        }
+        let filename = if args.preserve_parent_paths {
+            original
+        } else {
+            let (training_filename, was_trimmed) = training_filename_for(&original);
+            if was_trimmed {
+                trimmed_parent_path += 1;
+            }
+            training_filename
+        };
+        add_cluster(&mut clusters, &filename, args.examples);
+        total_rows += 1;
+    }
+    let mut sorted_clusters: Vec<_> = clusters.into_iter().collect();
+    sorted_clusters.sort_by(|a, b| b.1.count.cmp(&a.1.count).then_with(|| a.0.cmp(&b.0)));
+    let cluster_rows: Vec<Value> = sorted_clusters
+        .iter()
+        .enumerate()
+        .map(|(index, (key, cluster))| cluster_row(index + 1, key, cluster, total_rows))
+        .collect();
+    let samples: Vec<Value> = cluster_rows.iter().take(args.top).cloned().collect();
+    // Classify each candidate exactly once and route it to recipes or review in one pass,
+    // instead of cloning the candidate window and filtering it twice.
+    let mut recipes: Vec<Value> = Vec::new();
+    let mut review: Vec<Value> = Vec::new();
+    for row in cluster_rows.iter().take(args.recipe_top) {
+        if is_high_confidence_recipe(row, args.recipe_min_count) {
+            recipes.push(recipe_row(row, "high"));
+        } else if review.len() < args.review_top {
+            review.push(row.clone());
+        }
+    }
+    write_jsonl_values(&args.clusters_output, &cluster_rows)?;
+    write_jsonl_values(&args.samples_output, &samples)?;
+    write_jsonl_values(&args.recipes_output, &recipes)?;
+    write_jsonl_values(&args.review_output, &review)?;
+    // Histogram of "cluster size -> number of clusters with that size", most common sizes first.
+    let mut histogram: HashMap<usize, usize> = HashMap::new();
+    for (_, cluster) in &sorted_clusters {
+        *histogram.entry(cluster.count).or_default() += 1;
+    }
+    let mut count_histogram_top: Vec<_> = histogram.into_iter().collect();
+    count_histogram_top.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
+    count_histogram_top.truncate(20);
+    let rows_covered_by_repeated_templates: usize = sorted_clusters
+        .iter()
+        .map(|(_, cluster)| cluster)
+        .filter(|cluster| cluster.count as u64 >= args.min_count)
+        .map(|cluster| cluster.count)
+        .sum();
+    let templates_at_least_min_count = sorted_clusters
+        .iter()
+        .filter(|(_, cluster)| cluster.count as u64 >= args.min_count)
+        .count();
+    let top_templates: Vec<Value> = cluster_rows.iter().take(20).cloned().collect();
+    let summary = json!({
+        "input": args.input.to_string_lossy(),
+        "source_rows": source_rows,
+        "skipped_encoding_noise": skipped_encoding_noise,
+        "trimmed_parent_path": trimmed_parent_path,
+        "total_rows": total_rows,
+        "unique_templates": sorted_clusters.len(),
+        "min_count": args.min_count,
+        "templates_at_least_min_count": templates_at_least_min_count,
+        "rows_covered_by_repeated_templates": rows_covered_by_repeated_templates,
+        "rows_covered_by_repeated_templates_ratio": if total_rows == 0 { 0.0 } else { rows_covered_by_repeated_templates as f64 / total_rows as f64 },
+        "top_output_rows": samples.len(),
+        "clusters_output": args.clusters_output.to_string_lossy(),
+        "cluster_rows": cluster_rows.len(),
+        "recipes_output": args.recipes_output.to_string_lossy(),
+        "recipe_rows": recipes.len(),
+        "review_output": args.review_output.to_string_lossy(),
+        "review_rows": review.len(),
+        "recipe_top": args.recipe_top,
+        "recipe_min_count": args.recipe_min_count,
+        "top_templates": top_templates,
+        "count_histogram_top": count_histogram_top,
+        "implementation": "rust_dmhy_template_cluster",
+        "generated_at": Utc::now().to_rfc3339(),
+    });
+    // Render once; the same text goes to the summary file and to stdout.
+    let rendered_summary = serde_json::to_string_pretty(&summary)?;
+    if let Some(parent) = args.summary_output.parent() {
+        fs::create_dir_all(parent)
+            .with_context(|| format!("failed to create directory {}", parent.display()))?;
+    }
+    fs::write(&args.summary_output, &rendered_summary).with_context(|| {
+        format!(
+            "failed to write summary {}",
+            args.summary_output.display()
+        )
+    })?;
+    println!("{rendered_summary}");
+    Ok(())
+}
+/// Assigns `filename` to the cluster of its template key and updates that cluster's statistics.
+///
+/// The filename is kept as an example while the cluster holds fewer than `example_limit`
+/// examples. Every token bumps its class counter; `TEXT` / `BRACKET_TEXT` tokens and groups
+/// additionally bump the literal counters (overall and per group position).
+fn add_cluster(clusters: &mut HashMap<String, Cluster>, filename: &str, example_limit: usize) {
+    let (key, tokens, classes, groups) = template_key_for_filename(filename);
+    let bucket = clusters.entry(key).or_default();
+    bucket.count += 1;
+    if bucket.examples.len() < example_limit {
+        bucket.examples.push(filename.to_string());
+    }
+    for (token, class_name) in tokens.iter().zip(classes.iter()) {
+        *bucket.class_counts.entry(class_name.clone()).or_default() += 1;
+        if !matches!(class_name.as_str(), "TEXT" | "BRACKET_TEXT") {
+            continue;
+        }
+        let cleaned = strip_wrapper(token);
+        if !cleaned.is_empty() {
+            *bucket.literal_counts.entry(cleaned).or_default() += 1;
+        }
+    }
+    // Grow (never shrink) the per-position table so every group index is addressable.
+    if bucket.position_literals.len() < groups.len() {
+        bucket
+            .position_literals
+            .resize_with(groups.len(), HashMap::new);
+    }
+    let literal_groups = groups
+        .iter()
+        .enumerate()
+        .filter(|(_, group)| matches!(group.class_name.as_str(), "TEXT" | "BRACKET_TEXT"));
+    for (position, group) in literal_groups {
+        let text = group_text(&tokens, group);
+        if text.is_empty() {
+            continue;
+        }
+        *bucket.position_literals[position].entry(text).or_default() += 1;
+    }
+}
+/// Renders one ranked cluster as a JSON report row.
+///
+/// `rank` is the 1-based position used for the `tpl_NNNNNN` id, `key` is the template string,
+/// and `total` is the number of clustered rows used as the coverage denominator
+/// (coverage is `0.0` when `total` is zero).
+fn cluster_row(rank: usize, key: &str, cluster: &Cluster, total: usize) -> Value {
+    let coverage = match total {
+        0 => 0.0,
+        rows => cluster.count as f64 / rows as f64,
+    };
+    let position_top_literals: Vec<Vec<(String, usize)>> = cluster
+        .position_literals
+        .iter()
+        .map(|counts| top_counts(counts, 5))
+        .collect();
+    json!({
+        "template_id": format!("tpl_{rank:06}"),
+        "template": key,
+        "count": cluster.count,
+        "coverage": coverage,
+        "top_literals": top_counts(&cluster.literal_counts, 12),
+        "suggested_roles": suggested_roles(key),
+        "position_top_literals": position_top_literals,
+        "class_counts": top_counts(&cluster.class_counts, 20),
+        "examples": cluster.examples,
+    })
+}
+/// Returns up to `limit` `(key, count)` pairs ordered by descending count, ties broken by
+/// ascending key.
+///
+/// Keys are sorted by reference and only the surviving entries are cloned, so large maps do
+/// not pay for cloning keys that are immediately discarded.
+fn top_counts(counts: &HashMap<String, usize>, limit: usize) -> Vec<(String, usize)> {
+    let mut ranked: Vec<(&String, usize)> =
+        counts.iter().map(|(key, count)| (key, *count)).collect();
+    // Map keys are unique, so the comparator is a total order and an unstable sort is safe.
+    ranked.sort_unstable_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(b.0)));
+    ranked
+        .into_iter()
+        .take(limit)
+        .map(|(key, count)| (key.clone(), count))
+        .collect()
+}
+/// Decides whether a cluster row is safe to emit as a high-confidence recipe.
+///
+/// A row qualifies only when all of the following hold:
+/// - its `count` is at least `min_count` (a missing or non-integer count is treated as 0);
+/// - `suggested_roles` is an array and none of its string roles is ambiguous (contains `_OR_`);
+/// - the roles include `TITLE` plus at least one of `EPISODE*`, `SPECIAL`, `SOURCE`,
+///   `RESOLUTION`;
+/// - a template with two adjacent `BRACKET_TEXT` groups also carries a `GROUP` role.
+fn is_high_confidence_recipe(row: &Value, min_count: usize) -> bool {
+    let count = row.get("count").and_then(Value::as_u64).unwrap_or(0);
+    if count < min_count as u64 {
+        return false;
+    }
+    let Some(roles) = row.get("suggested_roles").and_then(Value::as_array) else {
+        return false;
+    };
+    let role_strings: Vec<&str> = roles.iter().filter_map(Value::as_str).collect();
+    // Any `*_OR_*` role (including `TITLE_OR_TEXT`) is ambiguous and disqualifies the row,
+    // so no separate `TITLE_OR_TEXT` check is needed afterwards.
+    if role_strings.iter().any(|role| role.contains("_OR_")) {
+        return false;
+    }
+    let has_title = role_strings.contains(&"TITLE");
+    let has_anchor = role_strings.iter().any(|role| {
+        role.starts_with("EPISODE") || matches!(*role, "SPECIAL" | "SOURCE" | "RESOLUTION")
+    });
+    if !has_title || !has_anchor {
+        return false;
+    }
+    let template = row.get("template").and_then(Value::as_str).unwrap_or("");
+    let adjacent_bracket_text = template.contains("BRACKET_TEXT BRACKET_TEXT");
+    !adjacent_bracket_text || role_strings.contains(&"GROUP")
+}
+/// Projects a cluster row onto the recipe schema, tagging it with `confidence`.
+///
+/// Fields missing from `row` come through as JSON `null`.
+fn recipe_row(row: &Value, confidence: &str) -> Value {
+    let template_id = &row["template_id"];
+    let template = &row["template"];
+    let roles = &row["suggested_roles"];
+    let count = &row["count"];
+    let examples = &row["examples"];
+    json!({
+        "template_id": template_id,
+        "template": template,
+        "roles": roles,
+        "confidence": confidence,
+        "count": count,
+        "examples": examples,
+    })
+}
+/// Writes `rows` to `path` as JSON Lines (one compact JSON value per line), creating the
+/// parent directory first when needed.
+///
+/// # Errors
+/// Returns an error naming `path` (or its parent directory) when the directory cannot be
+/// created, the file cannot be created, or a row cannot be serialized or written.
+fn write_jsonl_values(path: &PathBuf, rows: &[Value]) -> Result<()> {
+    if let Some(parent) = path.parent() {
+        fs::create_dir_all(parent)
+            .with_context(|| format!("failed to create directory {}", parent.display()))?;
+    }
+    let file =
+        File::create(path).with_context(|| format!("failed to create {}", path.display()))?;
+    let mut writer = BufWriter::new(file);
+    for row in rows {
+        serde_json::to_writer(&mut writer, row)
+            .with_context(|| format!("failed to serialize row to {}", path.display()))?;
+        writer
+            .write_all(b"\n")
+            .with_context(|| format!("failed to write to {}", path.display()))?;
+    }
+    // Flush explicitly: dropping a BufWriter swallows write errors.
+    writer
+        .flush()
+        .with_context(|| format!("failed to flush {}", path.display()))?;
+    Ok(())
+}
 fn process_filename(
     original: &str,
     args: &Args,
     let mut parts = Vec::new();
     let mut current = String::new();
     for ch in inner.chars() {
+        if ch.is_whitespace() || "_.,+/&|-()（）".contains(ch) {
             if !current.is_empty() {
                 parts.push(std::mem::take(&mut current));
             }
     }
     if token.starts_with('[') || token.starts_with('(') || token.starts_with('【') {
         let inner = strip_wrapper(token);
         let parts = split_inner(&inner);
+        let whole_class = classify_atom(&inner);
         let inner_class = if whole_class != "TEXT" {
+            if whole_class == "LANG" && parts.len() > 1 {
+                let part_classes: Vec<String> =
+                    parts.iter().map(|part| classify_atom(part)).collect();
+                if part_classes.iter().all(|item| item == &part_classes[0]) {
+                    part_classes[0].clone()
+                } else if part_classes.iter().all(|item| is_media_block_class(item)) {
+                    "MEDIA_BLOCK".to_string()
+                } else {
+                    whole_class
+                }
+            } else {
+                whole_class
+            }
         } else if parts.is_empty() {
             "EMPTY".to_string()
         } else {
             "EPISODE_VERSION"
         } else if item.contains("EPISODE_RANGE") {
             "EPISODE_RANGE"
+        } else if item.contains("EPISODE") || item.contains("SXE") {
             "EPISODE"
         } else if item.contains("RESOLUTION") {
             "RESOLUTION"
         .filter(|part| !part.is_empty())
         .collect();
     if parts.len() >= 2 && filename_has_title(parts[parts.len() - 1]) {
+        if parts.len() >= 3 && path_segment_has_season(parts[parts.len() - 2]) {
+            let parent_seasons = path_segment_seasons(parts[parts.len() - 2]);
+            let leaf_seasons = path_segment_seasons(parts[parts.len() - 1]);
+            if parent_seasons
+                .iter()
+                .any(|season| leaf_seasons.contains(season))
+            {
+                (parts[parts.len() - 1].to_string(), true)
+            } else {
+                (parts[parts.len() - 2..].join("/"), true)
+            }
+        } else {
+            (parts[parts.len() - 1].to_string(), true)
+        }
     } else {
         (original.to_string(), false)
     }
 }
+/// Returns `true` when `value` matches `PATH_SEGMENT_SEASON_RE`, i.e. it contains a
+/// delimited `season N` / `sN` marker.
+fn path_segment_has_season(value: &str) -> bool {
+    PATH_SEGMENT_SEASON_RE.is_match(value)
+}
+/// Collects every season number found in `value` by the three season patterns
+/// (`season N` / `saison N`, stand-alone `sN`, and `sNeM`).
+///
+/// Captures that do not parse as `u8` are ignored; duplicates collapse in the returned set.
+fn path_segment_seasons(value: &str) -> HashSet<u8> {
+    let mut seasons = HashSet::new();
+    let patterns: [&Regex; 3] = [
+        &SEASON_WORD_NUMBER_RE,
+        &S_NUMBER_SEGMENT_RE,
+        &SXE_SEASON_RE,
+    ];
+    for pattern in patterns {
+        for found in pattern.captures_iter(value) {
+            let parsed = found
+                .get(1)
+                .and_then(|digits| digits.as_str().parse::<u8>().ok());
+            if let Some(number) = parsed {
+                seasons.insert(number);
+            }
+        }
+    }
+    seasons
+}
 fn has_encoding_noise(value: &str) -> bool {
     if value.contains('\u{fffd}') {
         return true;
     let mut merged = Vec::new();
     let mut index = 0;
     while index < pieces.len() {
+        if index + 2 < pieces.len()
+            && !is_separator(&pieces[index])
+            && is_separator(&pieces[index + 1])
+            && !is_separator(&pieces[index + 2])
+        {
+            let combined = format!(
+                "{}{}{}",
+                pieces[index],
+                pieces[index + 1],
+                pieces[index + 2]
+            );
+            let combined_class = classify_atom(&combined);
+            if !pieces[index + 1].chars().any(char::is_whitespace)
+                && matches!(pieces[index + 1].as_str(), "." | "x" | "X" | "×")
+                && matches!(
+                    combined_class.as_str(),
+                    "RESOLUTION" | "MEDIA" | "LANG" | "HASH" | "SXE" | "EPISODE_VERSION"
+                )
+            {
+                merged.push(combined);
+                index += 3;
+                continue;
+            }
+        }
         if !is_separator(&pieces[index]) {
             let mut end = index;
             let mut combined = String::new();
 fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]) -> Vec<String> {
     let mut output = roles.to_vec();
     let ep_markers = ["EP", "E", "Episode", "ep", "episode"];
+    let roman = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX"];
     if !output.iter().any(|role| role == "TITLE")
         && roles
             .first()
             }
         }
         if title_run.len() >= 2 {
+            let last_title_index = *title_run.last().unwrap();
+            let later_structural = roles[last_title_index + 1..].iter().any(|role| {
+                role.starts_with("EPISODE") || matches!(role.as_str(), "SEASON" | "SPECIAL")
+            });
+            if group_text(tokens, &groups[0])
+                .chars()
+                .all(|ch| ch.is_ascii_digit())
+                && later_structural
+            {
+                output[0] = "TITLE".to_string();
+            }
             for index in title_run {
                 output[index] = "TITLE".to_string();
             }
         }
     }
+    if roles
+        .first()
+        .is_some_and(|role| role.starts_with("EPISODE"))
+        && group_text(tokens, &groups[0])
+            .chars()
+            .all(|ch| ch.is_ascii_digit())
+    {
+        if let Some(first_title) = output.iter().position(|role| role == "TITLE") {
+            let later_structural = roles[first_title + 1..].iter().any(|role| {
+                role.starts_with("EPISODE") || matches!(role.as_str(), "SEASON" | "SPECIAL")
+            });
+            if later_structural {
+                output[0] = "TITLE".to_string();
+            }
+        }
+    }
     for index in 0..roles.len() {
         let text = group_text(tokens, &groups[index]);
+        if output[index] == "O" && groups[index].class_name.contains("SXE") {
+            output[index] = "EPISODE".to_string();
+        }
         if roles[index].starts_with("EPISODE") && YEAR_RANGE_RE.is_match(&text) {
             output[index] = "O".to_string();
             continue;
         }
+        if roles[index].starts_with("EPISODE") && (2..roles.len()).contains(&index) {
+            let previous_text = group_text(tokens, &groups[index - 2]);
+            let next_special = output[index + 1..roles.len().min(index + 4)]
+                .iter()
+                .any(|role| role == "SPECIAL");
+            let next_episode = roles[index + 1..]
+                .iter()
+                .any(|role| role.starts_with("EPISODE"));
+            if groups[index - 1].class_name == "SEP"
+                && matches!(
+                    previous_text.to_ascii_lowercase().as_str(),
+                    "vol" | "volume"
+                )
+            {
+                output[index - 2] = "SPECIAL".to_string();
+                output[index] = "SPECIAL".to_string();
+                continue;
+            }
+            if output[index - 2] == "TITLE"
+                && groups[index - 1].class_name == "SEP"
+                && previous_text.len() <= 4
+                && previous_text.is_ascii()
+                && previous_text.chars().all(|ch| ch.is_ascii_alphabetic())
+                && text.chars().all(|ch| ch.is_ascii_digit())
+                && text.len() <= 3
+                && (next_special || next_episode)
+            {
+                output[index] = "TITLE".to_string();
+                continue;
+            }
+        }
         if roles[index] == "TITLE" && is_special_title_phrase(&text) {
             output[index] = "SPECIAL".to_string();
             continue;
             output[index + 2] = "SEASON".to_string();
             continue;
         }
+        if roles[index] == "TITLE"
+            && text == text.to_ascii_uppercase()
+            && roman.contains(&text.as_str())
+        {
             let previous_title = output[..index].iter().any(|role| role == "TITLE");
             let next_structural = roles[index + 1..]
                 .iter()
                 output[index + 2] = "O".to_string();
             }
         }
+        if roles[index].starts_with("EPISODE") {
+            let previous_text = if index >= 1 {
+                group_text(tokens, &groups[index - 1])
+            } else {
+                String::new()
+            };
+            let next_text = if index + 1 < roles.len() {
+                group_text(tokens, &groups[index + 1])
+            } else {
+                String::new()
+            };
+            if previous_text.contains('点')
+                || previous_text.contains('點')
+                || previous_text.contains("晚上")
+                || previous_text.contains("上午")
+                || previous_text.contains("下午")
+                || next_text.contains('点')
+                || next_text.contains('點')
+                || next_text.contains('半')
+            {
+                output[index] = "O".to_string();
+            }
+        }
     }
     output
 }
                     }
                 }
                 if matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE") {
+                    if let Some((pieces, labels)) = split_sxe_token(&strip_wrapper(token)) {
                         output_tokens.extend(pieces);
                         output_labels.extend(labels);
                         continue;
                             continue;
                         }
                     }
+                    let label = label_for_refined_piece(&piece, role, &group.class_name);
+                    let (pieces, labels) = normalize_generated_tokens(&[piece], &[label]);
+                    output_tokens.extend(pieces);
+                    output_labels.extend(labels);
                 }
             } else {
                 if role == "TITLE" && matches!(token.as_str(), "第" | "話" | "话" | "回" | "集")
 }
 fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
+    let joiners = [
+        " ", ".", "-", "_", "·", "・", "×", "／", "/", "'", "’", ":", "：", "!", "！",
+    ];
     let entity_joiners = [
+        " ", ".", "-", "_", "·", "・", "×", "／", "/", "'", "’", ":", "：", "!", "！", "&", "＆",
     ];
     let mut output = labels.to_vec();
     for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
         let dxd = labels_for("High School D×D");
         assert!(dxd.contains(&("×".to_string(), "B-TITLE".to_string())));
+        let colon_title = labels_for("Megumi no Daigo：Kyuukoku no Orange 06");
+        assert!(colon_title.contains(&("：".to_string(), "B-TITLE".to_string())));
         let sxe = labels_for("S01E02");
         assert_eq!(
                 ("02".to_string(), "B-EPISODE".to_string())
             ]
         );
+        let bracket_sxe = labels_for("[FLsnow.feat.PO][Himitsu_no_Aipri][1080P][S2E01]");
+        assert!(bracket_sxe.contains(&("2".to_string(), "B-SEASON".to_string())));
+        assert!(bracket_sxe.contains(&("01".to_string(), "B-EPISODE".to_string())));
+        let cursed = labels_for("[Coalgirls]_C3-Cube_x_Cursed_x_Curious_01_[8E416230]");
+        assert!(cursed.contains(&("x".to_string(), "B-TITLE".to_string())));
+        assert!(!cursed.contains(&("x".to_string(), "B-SEASON".to_string())));
+        let beyblade = labels_for("[jibaketa]Beyblade X - 118 (WEB 1920x1080 AVC AAC)");
+        assert!(beyblade.contains(&("X".to_string(), "B-TITLE".to_string())));
+        assert!(!beyblade.contains(&("X".to_string(), "B-SEASON".to_string())));
+        let bang_title = labels_for("[Dymy][Gugure! Kokkuri-san][06][BIG5][1280X720]");
+        assert!(bang_title.contains(&("!".to_string(), "B-TITLE".to_string())));
+        let conan_time = labels_for("【蓝色狂想】名侦探柯南TV版第036集-周一晚上7点半杀人事件");
+        assert!(conan_time.contains(&("036".to_string(), "B-EPISODE".to_string())));
+        assert!(!conan_time.contains(&("7".to_string(), "B-EPISODE".to_string())));
+        let zom =
+            labels_for("[Nekomoe kissaten&VCB-Studio] Zom 100 [Animatics02][Ma10p_1080p][x265]");
+        assert!(zom.contains(&("100".to_string(), "B-TITLE".to_string())));
+        assert!(!zom.contains(&("100".to_string(), "B-EPISODE".to_string())));
+        assert!(zom.contains(&("Animatics02".to_string(), "B-SPECIAL".to_string())));
         let sky = labels_for("[Skytree][海贼王][One_Piece][918][GB_JP][1080P]");
         assert!(sky.contains(&("One".to_string(), "B-TITLE".to_string())));
         assert!(cjk_season.contains(&("魔道祖师".to_string(), "B-TITLE".to_string())));
         assert!(cjk_season.contains(&("第一季".to_string(), "B-SEASON".to_string())));
         assert!(!cjk_season.contains(&("第一季".to_string(), "B-TITLE".to_string())));
+        let (trimmed, was_trimmed) =
+            training_filename_for("12/小剧场/[LKSUB][KAGE-JITSU!][01][GB][720P]");
+        assert!(was_trimmed);
+        assert_eq!(trimmed, "[LKSUB][KAGE-JITSU!][01][GB][720P]");
+        let (key, _, _, _) = template_key_for_filename(&trimmed);
+        assert_eq!(
+            key,
+            "BRACKET_TEXT BRACKET_TEXT BRACKET_EPISODE BRACKET_LANG BRACKET_RESOLUTION"
+        );
+        let short = labels_for("[Snow-Raws] R-15 CM&PV12 (BD 1920x1080 HEVC-YUV420P10 FLAC)");
+        assert!(short.contains(&("R".to_string(), "B-TITLE".to_string())));
+        assert!(short.contains(&("-".to_string(), "B-TITLE".to_string())));
+        assert!(short.contains(&("15".to_string(), "B-TITLE".to_string())));
+        assert!(!short.contains(&("15".to_string(), "B-EPISODE".to_string())));
+        let short_before_episode =
+            labels_for("[Snow-Raws] R-15 第01話 (BD 1920x1080 HEVC-YUV420P10 FLAC)");
+        assert!(short_before_episode.contains(&("R".to_string(), "B-TITLE".to_string())));
+        assert!(short_before_episode.contains(&("-".to_string(), "B-TITLE".to_string())));
+        assert!(short_before_episode.contains(&("15".to_string(), "B-TITLE".to_string())));
+        assert!(short_before_episode.contains(&("01".to_string(), "B-EPISODE".to_string())));
+        assert!(!short_before_episode.contains(&("15".to_string(), "B-EPISODE".to_string())));
+        let avatar = "Avatar The Last Airbender S2/Avatar The Last Airbender S2 14 [1080p]";
+        let (trimmed, was_trimmed) = training_filename_for(avatar);
+        assert!(was_trimmed);
+        assert_eq!(trimmed, "Avatar The Last Airbender S2 14 [1080p]");
+        let tintin = "Adventures of Tintin (1991) Season 1-3 S01-S03 (1080p BluRay x265 HEVC 10bit EAC3 2.0 Garshasp)/Season 1/Adventures of Tintin (1991) - S01E06 - Cigars of the Pharaoh (Part 1) (1080p BluRay x265 Garshasp)";
+        let (trimmed, was_trimmed) = training_filename_for(tintin);
+        assert!(was_trimmed);
+        assert_eq!(
+            trimmed,
+            "Adventures of Tintin (1991) - S01E06 - Cigars of the Pharaoh (Part 1) (1080p BluRay x265 Garshasp)"
+        );
+        let (key, _, _, _) = template_key_for_filename(&trimmed);
+        assert_eq!(
+            key,
+            "TEXT SEP TEXT SEP TEXT SEP BRACKET_DATE SEP SXE SEP TEXT SEP TEXT SEP TEXT SEP TEXT SEP BRACKET_TEXT SEP BRACKET_TEXT"
+        );
+        let bocchi = "Bocchi the Rock S01 孤獨搖滾！第一季 [Taiwanese Hokkien Dub][臺灣閩南語配音]/Bocchi the Rock S01 孤獨搖滾！第一季 [Taiwanese Hokkien Dub][Hàn-jī Hardsub][臺灣閩南語配音][漢字字幕]/Bocchi the Rock! 孤獨搖滾！S01E01「孤獨反輾轉」";
+        let (leaf_key, _, _, _) =
+            template_key_for_filename("Bocchi the Rock! 孤獨搖滾！S01E01「孤獨反輾轉」");
+        assert_eq!(leaf_key, "TEXT SEP TEXT SEP TEXT SEP TEXT SXE TEXT");
+        assert!(filename_has_title(
+            "Bocchi the Rock! 孤獨搖滾！S01E01「孤獨反輾轉」"
+        ));
+        let (trimmed, was_trimmed) = training_filename_for(bocchi);
+        assert!(was_trimmed);
+        assert_eq!(trimmed, "Bocchi the Rock! 孤獨搖滾！S01E01「孤獨反輾轉」");
+        let (key, _, _, _) = template_key_for_filename(&trimmed);
+        assert_eq!(key, "TEXT SEP TEXT SEP TEXT SEP TEXT SXE TEXT");
+        let usagi = "Gochuumon wa Usagi Desuka-60fps/Gochuumon wa Usagi Desuka S1/Usagi S1[01][60fps][8bit_1080p][x265_flac]";
+        let (trimmed, was_trimmed) = training_filename_for(usagi);
+        assert!(was_trimmed);
+        assert_eq!(trimmed, "Usagi S1[01][60fps][8bit_1080p][x265_flac]");
+        let (key, _, _, _) = template_key_for_filename(&trimmed);
+        assert_eq!(
+            key,
+            "TEXT SEP SEASON BRACKET_EPISODE BRACKET_TEXT BRACKET_MEDIA_BLOCK BRACKET_MEDIA"
+        );
+        let woody_parent =
+            "Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p";
+        let (trimmed, was_trimmed) = training_filename_for(&format!("Batch/{woody_parent}"));
+        assert!(was_trimmed);
+        assert_eq!(trimmed, woody_parent);
+        let volume =
+            labels_for("[Snow-Raws] 生徒会役員共 Vol.01 MENU02 (BD 1920x1080 HEVC-YUV420P10 FLAC)");
+        assert!(volume.contains(&("生徒会役員共".to_string(), "B-TITLE".to_string())));
+        assert!(volume.contains(&("Vol".to_string(), "B-SPECIAL".to_string())));
+        assert!(volume.contains(&("01".to_string(), "B-SPECIAL".to_string())));
+        assert!(volume.contains(&("MENU02".to_string(), "B-SPECIAL".to_string())));
+        assert!(!volume.contains(&("01".to_string(), "B-EPISODE".to_string())));
+        let numeric_title =
+            labels_for("3000.Leagues.in.Search.of.Mother.S01E01.1080p.WEB-DL.H.264-D00oo00M");
+        assert!(numeric_title.contains(&("3000".to_string(), "B-TITLE".to_string())));
+        assert!(numeric_title.contains(&("01".to_string(), "B-SEASON".to_string())));
+        assert!(numeric_title.contains(&("01".to_string(), "B-EPISODE".to_string())));
+        assert!(numeric_title.contains(&("1080p".to_string(), "B-RESOLUTION".to_string())));
+        assert!(numeric_title.contains(&("H".to_string(), "B-SOURCE".to_string())));
+        assert!(numeric_title.contains(&("264".to_string(), "B-SOURCE".to_string())));
+        assert!(!numeric_title.contains(&("264".to_string(), "B-EPISODE".to_string())));
+        let media_block =
+            labels_for("[Kamigami] Kantai Collection - 06v2 [1920×1080 x264 AAC Sub(Chs,Cht,Jap)]");
+        assert!(media_block.contains(&("1920".to_string(), "B-RESOLUTION".to_string())));
+        assert!(media_block.contains(&("1080".to_string(), "B-RESOLUTION".to_string())));
+        assert!(media_block.contains(&("x264".to_string(), "B-SOURCE".to_string())));
+        assert!(media_block.contains(&("Chs".to_string(), "B-SOURCE".to_string())));
     }
 }