use anyhow::{bail, Context, Result}; use chrono::Utc; use clap::Parser; use once_cell::sync::Lazy; use rayon::prelude::*; use regex::Regex; use serde::{Deserialize, Serialize}; use serde_json::{json, Value}; use std::collections::{HashMap, HashSet}; use std::fs::{self, File}; use std::io::{BufRead, BufReader, BufWriter, Write}; use std::path::PathBuf; use std::sync::atomic::{AtomicUsize, Ordering}; #[derive(Parser, Debug)] #[command(about = "Apply DMHY template recipes with a multi-core Rust pipeline")] struct Args { #[arg(long)] cluster: bool, #[arg(long)] audit_low_frequency: bool, #[arg(long)] verify_generated_output: bool, #[arg(long, default_value = "datasets/AnimeName/dmhy_list.jsonl")] input: PathBuf, #[arg(long, default_value = "reports/dmhy_template_recipes.seed.jsonl")] recipes: PathBuf, #[arg( long, default_value = "reports/dmhy_weak.template_generated.rust.jsonl" )] output: PathBuf, #[arg( long, default_value = "reports/dmhy_weak.template_generated.rust.manifest.json" )] manifest_output: PathBuf, #[arg( long, default_value = "reports/dmhy_template_clusters.rust.summary.json" )] summary_output: PathBuf, #[arg( long, default_value = "reports/dmhy_template_clusters.rust.samples.jsonl" )] samples_output: PathBuf, #[arg(long, default_value = "reports/dmhy_template_clusters.rust.jsonl")] clusters_output: PathBuf, #[arg(long, default_value = "reports/dmhy_template_recipes.rust.seed.jsonl")] recipes_output: PathBuf, #[arg(long, default_value = "reports/dmhy_template_review.rust.jsonl")] review_output: PathBuf, #[arg(long, default_value = "reports/dmhy_low_frequency_audit.rust.jsonl")] audit_output: PathBuf, #[arg(long, default_value_t = 50)] audit_max_count: u64, #[arg(long)] limit: Option, #[arg(long)] limit_templates: Option, #[arg(long, default_value_t = 1)] min_count: u64, #[arg(long, default_value_t = 200)] top: usize, #[arg(long, default_value_t = 200)] recipe_top: usize, #[arg(long, default_value_t = 1000)] review_top: usize, #[arg(long, default_value_t 
= 8)] examples: usize, #[arg(long, default_value_t = 25)] recipe_min_count: usize, #[arg(long, default_value = "high")] confidence: String, #[arg(long, default_value = "all")] expand: String, #[arg(long, default_value_t = 100)] sample_per_template: usize, #[arg(long)] keep_encoding_noise: bool, #[arg(long)] preserve_parent_paths: bool, #[arg(long)] threads: Option, } #[derive(Debug, Clone, Deserialize)] struct Recipe { template_id: String, template: String, roles: Vec, confidence: Option, count: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] struct Record { filename: String, tokens: Vec, labels: Vec, template_id: String, template: String, #[serde(skip_serializing_if = "Option::is_none")] source_filename: Option, #[serde(skip_serializing_if = "Option::is_none")] path_trimmed: Option, #[serde(skip_serializing_if = "Option::is_none")] dropped_title_candidate_positions: Option>, } #[derive(Debug, Clone)] struct Group { indices: Vec, class_name: String, } #[derive(Debug, Default, Clone, Serialize)] struct Stats { seen: usize, skipped_encoding_noise: usize, trimmed_parent_path: usize, skipped_no_recipe: usize, skipped_sample_cap: usize, skipped_role_mismatch: usize, skipped_low_frequency_audit_warning: usize, written: usize, } #[derive(Debug, Default)] struct Cluster { count: usize, examples: Vec, literal_counts: HashMap, class_counts: HashMap, position_literals: Vec>, } #[derive(Debug)] enum Processed { Written { record: Record, trimmed_parent: bool, }, Skipped { reason: &'static str, trimmed_parent: bool, }, } static HASH_RE: Lazy = Lazy::new(|| Regex::new(r"^[A-Fa-f0-9]{8,}$").unwrap()); static RESOLUTION_RE: Lazy = Lazy::new(|| Regex::new(r"(?i)^(?:\d{3,4}p|\dK|\d{3,4}[xX×]\d{3,4})$").unwrap()); static EPISODE_VERSION_RE: Lazy = Lazy::new(|| Regex::new(r"(?i)^(?:EP?)?\d{1,4}(?:v|ver|version|rev)\d{1,3}$").unwrap()); static EPISODE_RE: Lazy = Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}(?:END)?$").unwrap()); static EPISODE_CJK_RE: Lazy = Lazy::new(|| 
Regex::new(r"^第?\d{1,4}[话話回集]$").unwrap()); static EPISODE_RANGE_RE: Lazy = Lazy::new(|| Regex::new(r"(?i)^\d{1,4}\s*[-~]\s*\d{1,4}(?:\s*END)?$").unwrap()); static EPISODE_BATCH_RE: Lazy = Lazy::new(|| { Regex::new(r"(?i)^\d{1,4}\s*[-~]\s*\d{1,4}(?:\s*(?:TV|全集|全|END|Fin|Complete|SP|OVA|OAD|NCOP|NCED)|[+_./-])*.{0,16}$").unwrap() }); static SXE_RE: Lazy = Lazy::new(|| Regex::new(r"(?i)^S\d{1,2}E\d{1,4}(?:v\d+)?$").unwrap()); static SXE_VALUE_RE: Lazy = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})E(\d{1,4})(?:v(\d+))?$").unwrap()); static EPISODE_VALUE_RE: Lazy = Lazy::new(|| Regex::new(r"(?i)^(EP|E|#)(\d{1,4})(?:v(\d+))?$").unwrap()); static SEASON_RE: Lazy = Lazy::new(|| { Regex::new(r"(?i)^(?:S\d{1,2}|Season\s*\d{1,2}|第[一二三四五六七八九十\d]+[季期部])$").unwrap() }); static CJK_SEASON_TOKEN_RE: Lazy = Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap()); static SEASON_VALUE_RE: Lazy = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap()); static SPECIAL_RE: Lazy = Lazy::new(|| { Regex::new(r"(?i)^(?:NCOP|NCED|OP|ED|PV|CM|SP|OVA|OAD|IV|Menu|Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?$").unwrap() }); static VOLUME_RE: Lazy = Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap()); static DATE_RE: Lazy = Lazy::new(|| Regex::new(r"^(?:19|20)\d{2}(?:[._-]\d{1,2}){0,2}$").unwrap()); static LANG_RE: Lazy = Lazy::new(|| { Regex::new(r"(?i)^(?:CHS|CHT|ZHS|ZHT|GB|BIG5|JPN?|JP|JA|JAP|ENG|EN|SC|TC|简[体體]?|繁[体體]?|简日|繁日|字幕|内封|外挂|Sub|Subs|MSubs?)$").unwrap() }); static MEDIA_RE: Lazy = Lazy::new(|| { Regex::new(r"(?i)^(?:WEB[-_. ]?DL|WEB[-_. ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|HDTV|TVRip|REMUX|x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|FLAC|MP3|DTS|DTS-HDMA|AC3|Opus|10[-_. ]?bit|8[-_. 
]?bit|Hi10p|Ma10p|ASSx?\d*|SRTx?\d*|R\d[A-Z]*|NoSub|MKV|MP4|AVI|RAW|Raws?)$").unwrap() }); static SPECIAL_TITLE_PHRASE_RE: Lazy = Lazy::new(|| { Regex::new(r"(?i)\b(?:theater\s+greeting\s+event|world\s+prem(?:eie|iere)|picture\s+drama)\b") .unwrap() }); static YEAR_RANGE_RE: Lazy = Lazy::new(|| Regex::new(r"^$?\s*(?:19|20)\d{2}\s*[-~]\s*(?:19|20)\d{2}\s*$?$").unwrap()); static PATH_SEGMENT_SEASON_RE: Lazy = Lazy::new(|| { Regex::new(r"(?i)(?:^|[\s_.\-/])(?:season\s*\d{1,2}|s\d{1,2})(?:$|[\s_.\-/])").unwrap() }); static SEASON_WORD_NUMBER_RE: Lazy = Lazy::new(|| Regex::new(r"(?i)(?:season|saison)\s*0?(\d{1,2})").unwrap()); static PLAIN_SEASON_SEGMENT_RE: Lazy = Lazy::new(|| Regex::new(r"(?i)^(?:season|saison)\s*0?\d{1,2}$|^s0?\d{1,2}$").unwrap()); static S_NUMBER_SEGMENT_RE: Lazy = Lazy::new(|| Regex::new(r"(?i)(?:^|[^\p{L}\p{N}])s0?(\d{1,2})(?:$|[^\p{L}\p{N}])").unwrap()); static SXE_SEASON_RE: Lazy = Lazy::new(|| { Regex::new(r"(?i)(?:^|[^\p{L}\p{N}])s0?(\d{1,2})e\d{1,4}(?:$|[^\p{L}\p{N}])").unwrap() }); static TOKEN_REGEXES: Lazy> = Lazy::new(|| { [ r"^\d{3,4}[xX×]\d{3,4}", r"(?i)^h\.?26[45]", r"(?i)^x\.?26[45]", r"^[\\/]+", r"^[-_.:：+&|]+", r"^\s+", r"(?i)^Season\s*\d{1,2}", r"^[A-Za-z]+(?:\d+[A-Za-z]*)*", r"^\d+[A-Za-z]+\d*", r"^\d{1,4}(?:[._-]\d{1,4})*", r"^[\p{Hiragana}\p{Katakana}\p{Han}]+", ] .into_iter() .map(|pattern| Regex::new(pattern).unwrap()) .collect() }); static SIMPLE_EPISODE_RE: Lazy = Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}$").unwrap()); static SPECIAL_SPACE_RE: Lazy = Lazy::new(|| Regex::new(r"[\s_.-]+").unwrap()); fn main() -> Result<()> { let args = Args::parse(); if let Some(threads) = args.threads { rayon::ThreadPoolBuilder::new() .num_threads(threads) .build_global() .context("failed to configure rayon thread pool")?; } if args.cluster { return run_cluster(&args); } if args.audit_low_frequency { return run_low_frequency_audit(&args); } if args.verify_generated_output { return run_verify_generated_output(&args); } if args.expand != 
"all" && args.expand != "sample" { bail!("--expand must be all or sample"); } let recipes = load_recipes(&args)?; if recipes.is_empty() { bail!("no recipes selected; adjust --recipes/--confidence/--min-count/--limit-templates"); } let inputs = load_input(&args.input, args.limit)?; let sample_counters: HashMap = recipes .values() .map(|recipe| (recipe.template_id.clone(), AtomicUsize::new(0))) .collect(); let processed: Vec = inputs .par_iter() .map(|filename| process_filename(filename, &args, &recipes, &sample_counters)) .collect(); if let Some(parent) = args.output.parent() { fs::create_dir_all(parent)?; } if let Some(parent) = args.manifest_output.parent() { fs::create_dir_all(parent)?; } let mut stats = Stats { seen: inputs.len(), ..Stats::default() }; let mut label_counts: HashMap = HashMap::new(); let mut template_counts: HashMap = HashMap::new(); let mut examples = Vec::new(); let mut writer = BufWriter::new(File::create(&args.output)?); for item in processed { match item { Processed::Written { record, trimmed_parent, } => { if trimmed_parent { stats.trimmed_parent_path += 1; } for label in &record.labels { *label_counts.entry(label.clone()).or_default() += 1; } *template_counts .entry(record.template_id.clone()) .or_default() += 1; if examples.len() < 20 { examples.push(serde_json::to_value(&record)?); } serde_json::to_writer(&mut writer, &record)?; writer.write_all(b"\n")?; stats.written += 1; } Processed::Skipped { reason, trimmed_parent, } => { if trimmed_parent { stats.trimmed_parent_path += 1; } match reason { "encoding_noise" => stats.skipped_encoding_noise += 1, "no_recipe" => stats.skipped_no_recipe += 1, "sample_cap" => stats.skipped_sample_cap += 1, "role_mismatch" => stats.skipped_role_mismatch += 1, "low_frequency_audit_warning" => { stats.skipped_low_frequency_audit_warning += 1 } _ => {} } } } } writer.flush()?; let mut top_template_counts: Vec<_> = template_counts.into_iter().collect(); top_template_counts.sort_by(|a, b| 
b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0))); top_template_counts.truncate(20); let manifest = json!({ "generated_at": Utc::now().to_rfc3339(), "input": args.input.to_string_lossy(), "recipes": args.recipes.to_string_lossy(), "output": args.output.to_string_lossy(), "selected_templates": recipes.len(), "confidence": args.confidence, "min_count": args.min_count, "low_frequency_audit_max_count": args.audit_max_count, "low_frequency_blocking_warnings": [ "hash_labeled", "multiple_title_spans", "no_title", "path_retained" ], "expand": args.expand, "sample_per_template": if args.expand == "sample" { Some(args.sample_per_template) } else { None }, "stats": stats, "label_counts": label_counts, "top_template_counts": top_template_counts, "examples": examples, "implementation": "rust_dmhy_template_apply" }); fs::write( &args.manifest_output, serde_json::to_string_pretty(&manifest)?, )?; println!("{}", serde_json::to_string_pretty(&manifest)?); Ok(()) } fn load_recipes(args: &Args) -> Result> { let file = File::open(&args.recipes) .with_context(|| format!("recipe JSONL not found: {}", args.recipes.display()))?; let mut recipes = HashMap::new(); for (line_number, line) in BufReader::new(file).lines().enumerate() { let line = line?; if line.trim().is_empty() { continue; } let row: Recipe = serde_json::from_str(&line).with_context(|| { format!( "invalid recipe JSON at {}:{}", args.recipes.display(), line_number + 1 ) })?; if !args.confidence.is_empty() && row.confidence.as_deref() != Some(args.confidence.as_str()) { continue; } if row.count.unwrap_or(0) < args.min_count { continue; } recipes.insert(row.template.clone(), row); if args .limit_templates .is_some_and(|limit| recipes.len() >= limit) { break; } } Ok(recipes) } fn load_input(path: &PathBuf, limit: Option) -> Result> { let file = File::open(path).with_context(|| format!("input JSONL not found: {}", path.display()))?; let mut values = Vec::new(); for (line_number, line) in BufReader::new(file).lines().enumerate() { if 
limit.is_some_and(|limit| values.len() >= limit) { break; } let line = line?; if line.trim().is_empty() { continue; } let row: Value = serde_json::from_str(&line) .with_context(|| format!("invalid JSON at {}:{}", path.display(), line_number + 1))?; if let Some(value) = row.get("value").and_then(Value::as_str) { let value = value.trim(); if !value.is_empty() { values.push(value.to_string()); } } } Ok(values) } fn run_cluster(args: &Args) -> Result<()> { let inputs = load_input(&args.input, args.limit)?; let source_rows = inputs.len(); let mut clusters: HashMap = HashMap::new(); let mut skipped_encoding_noise = 0usize; let mut trimmed_parent_path = 0usize; let mut total_rows = 0usize; for original in inputs { if !args.keep_encoding_noise && (has_encoding_noise(&original) || has_non_anime_noise(&original) || has_abstract_path_noise(&original)) { skipped_encoding_noise += 1; continue; } let filename = if args.preserve_parent_paths { original } else { let (training_filename, was_trimmed) = training_filename_for(&original); if was_trimmed { trimmed_parent_path += 1; } training_filename }; add_cluster(&mut clusters, &filename, args.examples); total_rows += 1; } let mut sorted_clusters: Vec<_> = clusters.into_iter().collect(); sorted_clusters.sort_by(|a, b| b.1.count.cmp(&a.1.count).then_with(|| a.0.cmp(&b.0))); let cluster_rows: Vec = sorted_clusters .iter() .enumerate() .map(|(index, (key, cluster))| cluster_row(index + 1, key, cluster, total_rows)) .collect(); let samples: Vec = cluster_rows.iter().take(args.top).cloned().collect(); let recipe_candidates: Vec = cluster_rows.iter().take(args.recipe_top).cloned().collect(); let recipes: Vec = recipe_candidates .iter() .filter(|row| is_high_confidence_recipe(row, args.recipe_min_count)) .map(|row| recipe_row(row, "high")) .collect(); let review: Vec = recipe_candidates .iter() .filter(|row| !is_high_confidence_recipe(row, args.recipe_min_count)) .take(args.review_top) .cloned() .collect(); 
write_jsonl_values(&args.clusters_output, &cluster_rows)?; write_jsonl_values(&args.samples_output, &samples)?; write_jsonl_values(&args.recipes_output, &recipes)?; write_jsonl_values(&args.review_output, &review)?; let mut histogram: HashMap = HashMap::new(); for (_, cluster) in &sorted_clusters { *histogram.entry(cluster.count).or_default() += 1; } let mut count_histogram_top: Vec<_> = histogram.into_iter().collect(); count_histogram_top.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0))); count_histogram_top.truncate(20); let rows_covered_by_repeated_templates: usize = sorted_clusters .iter() .map(|(_, cluster)| cluster) .filter(|cluster| cluster.count as u64 >= args.min_count) .map(|cluster| cluster.count) .sum(); let templates_at_least_min_count = sorted_clusters .iter() .filter(|(_, cluster)| cluster.count as u64 >= args.min_count) .count(); let top_templates: Vec = cluster_rows.iter().take(20).cloned().collect(); let summary = json!({ "input": args.input.to_string_lossy(), "source_rows": source_rows, "skipped_encoding_noise": skipped_encoding_noise, "trimmed_parent_path": trimmed_parent_path, "total_rows": total_rows, "unique_templates": sorted_clusters.len(), "min_count": args.min_count, "templates_at_least_min_count": templates_at_least_min_count, "rows_covered_by_repeated_templates": rows_covered_by_repeated_templates, "rows_covered_by_repeated_templates_ratio": if total_rows == 0 { 0.0 } else { rows_covered_by_repeated_templates as f64 / total_rows as f64 }, "top_output_rows": samples.len(), "clusters_output": args.clusters_output.to_string_lossy(), "cluster_rows": cluster_rows.len(), "recipes_output": args.recipes_output.to_string_lossy(), "recipe_rows": recipes.len(), "review_output": args.review_output.to_string_lossy(), "review_rows": review.len(), "recipe_top": args.recipe_top, "recipe_min_count": args.recipe_min_count, "top_templates": top_templates, "count_histogram_top": count_histogram_top, "implementation": "rust_dmhy_template_cluster", 
"generated_at": Utc::now().to_rfc3339(), }); if let Some(parent) = args.summary_output.parent() { fs::create_dir_all(parent)?; } fs::write( &args.summary_output, serde_json::to_string_pretty(&summary)?, )?; println!("{}", serde_json::to_string_pretty(&summary)?); Ok(()) } fn add_cluster(clusters: &mut HashMap, filename: &str, example_limit: usize) { let (key, tokens, classes, groups) = template_key_for_filename(filename); let cluster = clusters.entry(key).or_default(); cluster.count += 1; if cluster.examples.len() < example_limit { cluster.examples.push(filename.to_string()); } for (token, class_name) in tokens.iter().zip(classes.iter()) { *cluster.class_counts.entry(class_name.clone()).or_default() += 1; if matches!(class_name.as_str(), "TEXT" | "BRACKET_TEXT") { let cleaned = strip_wrapper(token); if !cleaned.is_empty() { *cluster.literal_counts.entry(cleaned).or_default() += 1; } } } while cluster.position_literals.len() < groups.len() { cluster.position_literals.push(HashMap::new()); } for (index, group) in groups.iter().enumerate() { if matches!(group.class_name.as_str(), "TEXT" | "BRACKET_TEXT") { let text = group_text(&tokens, group); if !text.is_empty() { *cluster.position_literals[index].entry(text).or_default() += 1; } } } } fn cluster_row(rank: usize, key: &str, cluster: &Cluster, total: usize) -> Value { json!({ "template_id": format!("tpl_{rank:06}"), "template": key, "count": cluster.count, "coverage": if total == 0 { 0.0 } else { cluster.count as f64 / total as f64 }, "top_literals": top_counts(&cluster.literal_counts, 12), "suggested_roles": suggested_roles(key), "position_top_literals": cluster.position_literals.iter().map(|counts| top_counts(counts, 5)).collect::>(), "class_counts": top_counts(&cluster.class_counts, 20), "examples": cluster.examples, }) } fn top_counts(counts: &HashMap, limit: usize) -> Vec<(String, usize)> { let mut items: Vec<_> = counts .iter() .map(|(key, count)| (key.clone(), *count)) .collect(); items.sort_by(|a, b| 
b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0))); items.truncate(limit); items } fn is_high_confidence_recipe(row: &Value, min_count: usize) -> bool { if row.get("count").and_then(Value::as_u64).unwrap_or(0) < min_count as u64 { return false; } let roles = match row.get("suggested_roles").and_then(Value::as_array) { Some(roles) => roles, None => return false, }; let role_strings: Vec<&str> = roles.iter().filter_map(Value::as_str).collect(); if role_strings.iter().any(|role| role.contains("_OR_")) { return false; } if !role_strings.contains(&"TITLE") || !role_strings.iter().any(|role| { role.starts_with("EPISODE") || matches!(*role, "SPECIAL" | "SOURCE" | "RESOLUTION") }) { return false; } let template = row.get("template").and_then(Value::as_str).unwrap_or(""); if template.contains("BRACKET_TEXT BRACKET_TEXT") && !role_strings.contains(&"GROUP") { return false; } !role_strings.contains(&"TITLE_OR_TEXT") } fn recipe_row(row: &Value, confidence: &str) -> Value { json!({ "template_id": row["template_id"], "template": row["template"], "roles": row["suggested_roles"], "confidence": confidence, "count": row["count"], "examples": row["examples"], }) } fn write_jsonl_values(path: &PathBuf, rows: &[Value]) -> Result<()> { if let Some(parent) = path.parent() { fs::create_dir_all(parent)?; } let mut writer = BufWriter::new(File::create(path)?); for row in rows { serde_json::to_writer(&mut writer, row)?; writer.write_all(b"\n")?; } writer.flush()?; Ok(()) } fn run_low_frequency_audit(args: &Args) -> Result<()> { let recipes = load_recipes(args)?; let inputs = load_input(&args.input, args.limit)?; let low_template_total = recipes .values() .filter(|recipe| recipe.count.unwrap_or(0) <= args.audit_max_count) .count(); let mut seen_templates = HashSet::new(); let mut rows = Vec::new(); for original in inputs { if !args.keep_encoding_noise && (has_encoding_noise(&original) || has_non_anime_noise(&original) || has_abstract_path_noise(&original)) { continue; } let (training_filename, 
trimmed_parent) = training_filename_for(&original); let (key, _tokens, _classes, groups) = template_key_for_filename(&training_filename); let Some(recipe) = recipes.get(&key) else { continue; }; let count = recipe.count.unwrap_or(0); if count > args.audit_max_count || !seen_templates.insert(recipe.template_id.clone()) { continue; } if recipe.roles.len() != groups.len() { continue; } let Some(mut record) = dmhy_record(&training_filename, &recipe.template_id, &recipe.roles) else { continue; }; if trimmed_parent { record.source_filename = Some(original.clone()); record.path_trimmed = Some(true); } rows.push(json!({ "template_id": recipe.template_id, "count": count, "template": recipe.template, "filename": record.filename, "source_filename": record.source_filename, "path_trimmed": record.path_trimmed.unwrap_or(false), "spans": entity_spans(&record.tokens, &record.labels), "warnings": audit_warnings(&record), "tokens": record.tokens, "labels": record.labels, })); if seen_templates.len() >= low_template_total { break; } } rows.sort_by(|a, b| { let count_a = a.get("count").and_then(Value::as_u64).unwrap_or(0); let count_b = b.get("count").and_then(Value::as_u64).unwrap_or(0); let id_a = a.get("template_id").and_then(Value::as_str).unwrap_or(""); let id_b = b.get("template_id").and_then(Value::as_str).unwrap_or(""); count_a.cmp(&count_b).then_with(|| id_a.cmp(id_b)) }); write_jsonl_values(&args.audit_output, &rows)?; let warning_counts = warning_counts(&rows); let manifest = json!({ "generated_at": Utc::now().to_rfc3339(), "input": args.input.to_string_lossy(), "recipes": args.recipes.to_string_lossy(), "audit_output": args.audit_output.to_string_lossy(), "audit_max_count": args.audit_max_count, "low_template_total": low_template_total, "audited_templates": rows.len(), "warning_counts": warning_counts, "implementation": "rust_dmhy_low_frequency_audit" }); println!("{}", serde_json::to_string_pretty(&manifest)?); Ok(()) } fn run_verify_generated_output(args: &Args) -> 
Result<()> { let file = File::open(&args.input) .with_context(|| format!("generated JSONL not found: {}", args.input.display()))?; let recipes_by_id: HashMap = load_recipes(args)? .into_values() .map(|recipe| (recipe.template_id, recipe.count.unwrap_or(0))) .collect(); let mut rows = 0usize; let mut low_frequency_rows = 0usize; let mut warning_counts: HashMap = HashMap::new(); let mut examples: HashMap> = HashMap::new(); for (line_number, line) in BufReader::new(file).lines().enumerate() { let line = line?; if line.trim().is_empty() { continue; } let record: Record = serde_json::from_str(&line).with_context(|| { format!( "invalid generated record at {}:{}", args.input.display(), line_number + 1 ) })?; rows += 1; let count = recipes_by_id .get(&record.template_id) .copied() .unwrap_or(u64::MAX); if count > args.audit_max_count { continue; } low_frequency_rows += 1; for warning in audit_warnings(&record) { if !matches!( warning.as_str(), "hash_labeled" | "multiple_title_spans" | "no_title" | "path_retained" ) { continue; } *warning_counts.entry(warning.clone()).or_default() += 1; let bucket = examples.entry(warning).or_default(); if bucket.len() < 5 { bucket.push(json!({ "template_id": record.template_id, "template_count": count, "filename": record.filename, "spans": entity_spans(&record.tokens, &record.labels), })); } } } let manifest = json!({ "generated_at": Utc::now().to_rfc3339(), "input": args.input.to_string_lossy(), "recipes": args.recipes.to_string_lossy(), "audit_max_count": args.audit_max_count, "rows": rows, "low_frequency_rows": low_frequency_rows, "blocking_warning_counts": warning_counts, "examples": examples, "implementation": "rust_dmhy_generated_output_verify" }); println!("{}", serde_json::to_string_pretty(&manifest)?); if !warning_counts.is_empty() { bail!("generated output still has low-frequency blocking warnings"); } Ok(()) } fn entity_spans(tokens: &[String], labels: &[String]) -> Vec { let mut spans = Vec::new(); let mut current_label: Option 
= None; let mut current_text = String::new(); for (token, label) in tokens.iter().zip(labels.iter()) { let entity = label .strip_prefix("B-") .or_else(|| label.strip_prefix("I-")) .unwrap_or("O"); if current_label.as_deref() == Some(entity) { current_text.push_str(token); continue; } if let Some(label) = current_label.take() { if label != "O" { spans.push(json!({ "label": label, "text": current_text })); } } current_label = Some(entity.to_string()); current_text = token.clone(); } if let Some(label) = current_label { if label != "O" { spans.push(json!({ "label": label, "text": current_text })); } } spans } fn audit_warnings(record: &Record) -> Vec { let mut warnings = Vec::new(); let title_spans = entity_spans(&record.tokens, &record.labels) .into_iter() .filter(|span| span.get("label").and_then(Value::as_str) == Some("TITLE")) .count(); if title_spans == 0 { warnings.push("no_title".to_string()); } else if title_spans > 1 { warnings.push("multiple_title_spans".to_string()); } if !record.labels.iter().any(|label| label.ends_with("EPISODE")) { warnings.push("no_episode".to_string()); } if record.filename.contains('/') || record.filename.contains('\\') { warnings.push("path_retained".to_string()); } for (index, token) in record.tokens.iter().enumerate() { if HASH_RE.is_match(token) && record.labels.get(index).is_some_and(|label| label != "O") { warnings.push("hash_labeled".to_string()); break; } } warnings.sort(); warnings.dedup(); warnings } fn warning_counts(rows: &[Value]) -> HashMap { let mut counts = HashMap::new(); for row in rows { if let Some(warnings) = row.get("warnings").and_then(Value::as_array) { for warning in warnings { if let Some(warning) = warning.as_str() { *counts.entry(warning.to_string()).or_default() += 1; } } } } counts } fn process_filename( original: &str, args: &Args, recipes: &HashMap, sample_counters: &HashMap, ) -> Processed { if !args.keep_encoding_noise && (has_encoding_noise(original) || has_non_anime_noise(original) || 
has_abstract_path_noise(original)) { return Processed::Skipped { reason: "encoding_noise", trimmed_parent: false, }; } let (training_filename, trimmed_parent) = training_filename_for(original); let (key, _tokens, _classes, groups) = template_key_for_filename(&training_filename); let recipe = match recipes.get(&key) { Some(recipe) => recipe, None => { return Processed::Skipped { reason: "no_recipe", trimmed_parent, } } }; if args.expand == "sample" { let counter = sample_counters.get(&recipe.template_id).unwrap(); if counter.fetch_add(1, Ordering::Relaxed) >= args.sample_per_template { return Processed::Skipped { reason: "sample_cap", trimmed_parent, }; } } if recipe.roles.len() != groups.len() { return Processed::Skipped { reason: "role_mismatch", trimmed_parent, }; } let mut record = match dmhy_record(&training_filename, &recipe.template_id, &recipe.roles) { Some(record) => record, None => { return Processed::Skipped { reason: "role_mismatch", trimmed_parent, } } }; if recipe.count.unwrap_or(0) <= args.audit_max_count && has_blocking_low_frequency_warning(&record) { return Processed::Skipped { reason: "low_frequency_audit_warning", trimmed_parent, }; } if trimmed_parent { record.source_filename = Some(original.to_string()); record.path_trimmed = Some(true); return Processed::Written { record, trimmed_parent: true, }; } Processed::Written { record, trimmed_parent: false, } } fn has_blocking_low_frequency_warning(record: &Record) -> bool { audit_warnings(record).iter().any(|warning| { matches!( warning.as_str(), "hash_labeled" | "multiple_title_spans" | "no_title" | "path_retained" ) }) } fn tokenize(value: &str) -> Vec { let mut output = Vec::new(); let mut index = 0; while index < value.len() { let rest = &value[index..]; if let Some((token, len)) = next_token(rest) { output.push(token); index += len; } else { let ch = rest.chars().next().unwrap(); output.push(ch.to_string()); index += ch.len_utf8(); } } output } fn next_token(rest: &str) -> Option<(String, 
usize)> { let first = rest.chars().next()?; if first == '[' { if let Some(end) = rest.find(']') { if end <= 121 { return Some((rest[..=end].to_string(), end + 1)); } } } if first == '(' { if let Some(end) = rest.find(')') { if end <= 121 { return Some((rest[..=end].to_string(), end + 1)); } } } if first == '【' { if let Some(end) = rest.find('】') { if rest[..end].chars().count() <= 120 { return Some(( rest[..end + '】'.len_utf8()].to_string(), end + '】'.len_utf8(), )); } } } for re in TOKEN_REGEXES.iter() { if let Some(mat) = re.find(rest) { if mat.start() == 0 && mat.end() > 0 { return Some((mat.as_str().to_string(), mat.end())); } } } None } fn strip_wrapper(token: &str) -> String { let chars: Vec = token.chars().collect(); if chars.len() >= 2 { let first = chars[0]; let last = chars[chars.len() - 1]; if (first == '[' && last == ']') || (first == '(' && last == ')') || (first == '【' && last == '】') { return chars[1..chars.len() - 1] .iter() .collect::() .trim() .to_string(); } } token.trim().to_string() } fn split_inner(inner: &str) -> Vec { let mut parts = Vec::new(); let mut current = String::new(); for ch in inner.chars() { if ch.is_whitespace() || "_.,+/&|-()（）".contains(ch) { if !current.is_empty() { parts.push(std::mem::take(&mut current)); } } else { current.push(ch); } } if !current.is_empty() { parts.push(current); } parts } fn compact_for_classify(text: &str) -> String { text.chars() .filter(|ch| !ch.is_whitespace() && !matches!(ch, '_' | '.' 
| ',' | '-')) .collect() } fn classify_atom(text: &str) -> String { let cleaned = strip_wrapper(text); let compact = compact_for_classify(&cleaned); if cleaned.is_empty() { return "EMPTY".to_string(); } if HASH_RE.is_match(&cleaned) { return "HASH".to_string(); } if RESOLUTION_RE.is_match(&cleaned) { return "RESOLUTION".to_string(); } if DATE_RE.is_match(&cleaned) { return "DATE".to_string(); } if EPISODE_VERSION_RE.is_match(&compact) { return "EPISODE_VERSION".to_string(); } if SXE_RE.is_match(&compact) { return "SXE".to_string(); } if EPISODE_RE.is_match(&compact) { return "EPISODE".to_string(); } if EPISODE_CJK_RE.is_match(&cleaned) { return "EPISODE".to_string(); } if EPISODE_BATCH_RE.is_match(&cleaned) { return "EPISODE_RANGE".to_string(); } if EPISODE_RANGE_RE.is_match(&cleaned) { return "EPISODE_RANGE".to_string(); } if EPISODE_RE.is_match(&cleaned) { return "EPISODE".to_string(); } if SEASON_RE.is_match(&cleaned) { return "SEASON".to_string(); } if SPECIAL_RE.is_match(&cleaned) { return "SPECIAL".to_string(); } if VOLUME_RE.is_match(&cleaned) { return "VOLUME".to_string(); } if LANG_RE.is_match(&cleaned) || lang_block_matches(&cleaned) { return "LANG".to_string(); } if MEDIA_RE.is_match(&cleaned) { return "MEDIA".to_string(); } "TEXT".to_string() } fn lang_block_matches(text: &str) -> bool { let upper = text.to_ascii_uppercase(); if ["CHS", "CHT", "ZHS", "ZHT", "BIG5"] .iter() .any(|marker| upper.contains(marker)) { return true; } if upper.contains("GB") { return true; } if [ "简繁", "简日", "繁日", "简体", "繁体", "雙語", "双语", "内封", "外挂", ] .iter() .any(|marker| text.contains(marker)) { return true; } let chars: Vec = text.chars().collect(); chars.windows(2).enumerate().any(|(index, pair)| { pair[0] == '字' && pair[1] == '幕' && !matches!(chars.get(index + 2), Some('组' | '組')) }) } fn classify_token(token: &str) -> String { if token.is_empty() { return "EMPTY".to_string(); } if token.chars().all(char::is_whitespace) { return "SPACE".to_string(); } if 
token.chars().all(|ch| ch == '/' || ch == '\\') { return "PATH".to_string(); } if token.chars().all(|ch| "-_.:：+&|".contains(ch)) { return "SEP".to_string(); } if token.starts_with('[') || token.starts_with('(') || token.starts_with('【') { let inner = strip_wrapper(token); let parts = split_inner(&inner); let whole_class = classify_atom(&inner); let inner_class = if whole_class != "TEXT" { if whole_class == "LANG" && parts.len() > 1 { let part_classes: Vec = parts.iter().map(|part| classify_atom(part)).collect(); if part_classes.iter().all(|item| item == &part_classes[0]) { part_classes[0].clone() } else if part_classes.iter().all(|item| is_media_block_class(item)) { "MEDIA_BLOCK".to_string() } else { whole_class } } else { whole_class } } else if parts.is_empty() { "EMPTY".to_string() } else { let part_classes: Vec = parts.iter().map(|part| classify_atom(part)).collect(); if part_classes.iter().all(|item| item == &part_classes[0]) { part_classes[0].clone() } else if part_classes.iter().all(|item| is_media_block_class(item)) { "MEDIA_BLOCK".to_string() } else if part_classes.iter().any(|item| is_media_block_class(item)) && parts.iter().zip(part_classes.iter()).all(|(part, item)| { is_media_block_class(item) || matches!(part.to_ascii_lowercase().as_str(), "anime" | "アニメ") }) { "MEDIA_BLOCK".to_string() } else if part_classes.iter().any(|item| item == "TEXT") { "TEXT".to_string() } else { let mut set: Vec = part_classes .into_iter() .collect::>() .into_iter() .collect(); set.sort(); set.join("_") } }; return format!("BRACKET_{inner_class}"); } classify_atom(token) } fn is_media_block_class(value: &str) -> bool { matches!(value, "MEDIA" | "RESOLUTION" | "LANG" | "HASH" | "DATE") } fn compact_token_groups(_tokens: &[String], classes: &[String]) -> Vec { let mut groups: Vec = Vec::new(); let mut previous: Option = None; for (index, token_class) in classes.iter().enumerate() { let current = if token_class == "SPACE" { "SEP" } else { token_class } .to_string(); if 
previous.as_deref() == Some(current.as_str())
            && matches!(current.as_str(), "SEP" | "TEXT")
        {
            // `previous` is `Some`, so at least one group has been pushed.
            groups.last_mut().unwrap().indices.push(index);
        } else {
            groups.push(Group {
                indices: vec![index],
                class_name: current.clone(),
            });
        }
        previous = Some(current);
    }
    groups
}

/// Tokenizes `filename` and derives its template key.
///
/// Returns `(key, tokens, classes, groups)`, where `key` is the group class
/// names joined by single spaces.
fn template_key_for_filename(filename: &str) -> (String, Vec<String>, Vec<String>, Vec<Group>) {
    let tokens = tokenize(filename);
    let classes: Vec<String> = tokens.iter().map(|token| classify_token(token)).collect();
    let groups = compact_token_groups(&tokens, &classes);
    let key = groups
        .iter()
        .map(|group| group.class_name.as_str())
        .collect::<Vec<_>>()
        .join(" ");
    (key, tokens, classes, groups)
}

/// Suggests one role per whitespace-separated item of a template key.
///
/// Structural roles are assigned from substrings of the class name first.
/// Then each `PATH`-delimited segment is scanned for `GROUP`/`TITLE`
/// candidates among the `BRACKET_TEXT` and `TEXT` items that precede the
/// segment's first structural item.
fn suggested_roles(template: &str) -> Vec<String> {
    let items: Vec<&str> = template.split_whitespace().collect();
    let mut roles = vec!["O".to_string(); items.len()];
    // Each segment starts at 0 or right after a PATH item.
    let mut segment_starts = vec![0usize];
    for (index, item) in items.iter().enumerate() {
        if *item == "PATH" {
            segment_starts.push(index + 1);
        }
    }
    for (index, item) in items.iter().enumerate() {
        // Order matters: the more specific EPISODE_* names are tested before
        // the plain EPISODE substring.
        roles[index] = if item.contains("EPISODE_VERSION") {
            "EPISODE_VERSION"
        } else if item.contains("EPISODE_RANGE") {
            "EPISODE_RANGE"
        } else if item.contains("EPISODE") || item.contains("SXE") {
            "EPISODE"
        } else if item.contains("RESOLUTION") {
            "RESOLUTION"
        } else if item.contains("HASH") {
            "HASH"
        } else if item.contains("LANG") || item.contains("MEDIA") {
            "SOURCE"
        } else if item.contains("SPECIAL") {
            "SPECIAL"
        } else if item.contains("SEASON") {
            "SEASON"
        } else if item.contains("VOLUME") {
            "VOLUME"
        } else {
            "O"
        }
        .to_string();
    }
    for (offset, start) in segment_starts.iter().enumerate() {
        // The segment ends just before the next PATH item, or at the end.
        let end = if offset + 1 < segment_starts.len() {
            segment_starts[offset + 1] - 1
        } else {
            items.len()
        };
        if *start >= end {
            continue;
        }
        let first_structural = (*start..end)
            .find(|&index| {
                items[index].contains("EPISODE")
                    || matches!(items[index], "SXE" | "SPECIAL" | "SEASON")
            })
            .unwrap_or(end);
        let bracket_text: Vec<usize> = (*start..first_structural)
            .filter(|&index| items[index] == "BRACKET_TEXT" && roles[index] == "O")
.collect();
        let text: Vec<usize> = (*start..first_structural)
            .filter(|&index| items[index] == "TEXT" && roles[index] == "O")
            .collect();
        if bracket_text.len() >= 2 {
            // First bracketed text is the group, the rest are titles.
            roles[bracket_text[0]] = "GROUP".to_string();
            for index in bracket_text.iter().skip(1) {
                roles[*index] = "TITLE".to_string();
            }
        } else if bracket_text.len() == 1 {
            // A lone bracketed text is the group only when it opens the
            // segment and plain text is available to serve as the title.
            roles[bracket_text[0]] = if text.is_empty() {
                "TITLE"
            } else if bracket_text[0] == *start {
                "GROUP"
            } else {
                "TITLE"
            }
            .to_string();
        }
        for index in text {
            roles[index] = "TITLE".to_string();
        }
        // Fallback for segments that start with an episode-like item and have
        // no title yet: take the first run of TEXT items (SEP items allowed in
        // between) if it has at least two members.
        if !roles[*start..end].iter().any(|role| role == "TITLE")
            && !items[*start..end].is_empty()
            && items[*start].contains("EPISODE")
        {
            let mut run = Vec::new();
            for index in (*start + 1)..end {
                if items[index] == "TEXT" && roles[index] == "O" {
                    run.push(index);
                    continue;
                }
                if items[index] == "SEP" {
                    continue;
                }
                if !run.is_empty() {
                    break;
                }
            }
            if run.len() >= 2 {
                for index in run {
                    roles[index] = "TITLE".to_string();
                }
            }
        }
    }
    roles
}

/// Returns `true` when the suggested roles for `filename` contain a `TITLE`.
fn filename_has_title(filename: &str) -> bool {
    let (key, _, _, _) = template_key_for_filename(filename);
    suggested_roles(&key).iter().any(|role| role == "TITLE")
}

/// Chooses the part of a path-like name to use for training.
///
/// The input is split on `/` and `\`, trimmed, and empty parts are dropped.
/// Returns `(name, trimmed)`:
/// - fewer than two parts, or a leaf with no title: the original, `false`;
/// - parent is a plain season segment whose season numbers do not overlap the
///   leaf's: `"parent/leaf"`, `true`;
/// - otherwise: the leaf alone, `true`.
fn training_filename_for(original: &str) -> (String, bool) {
    let parts: Vec<&str> = original
        .split(|ch| ch == '/' || ch == '\\')
        .map(str::trim)
        .filter(|part| !part.is_empty())
        .collect();
    if parts.len() >= 2 && filename_has_title(parts[parts.len() - 1]) {
        if parts.len() >= 3 && path_segment_has_season(parts[parts.len() - 2]) {
            if !path_segment_is_plain_season(parts[parts.len() - 2]) {
                return (parts[parts.len() - 1].to_string(), true);
            }
            let parent_seasons = path_segment_seasons(parts[parts.len() - 2]);
            let leaf_seasons = path_segment_seasons(parts[parts.len() - 1]);
            if parent_seasons
                .iter()
                .any(|season| leaf_seasons.contains(season))
            {
                // The leaf already names the season; the parent adds nothing.
                (parts[parts.len() - 1].to_string(), true)
            } else {
                (parts[parts.len() - 2..].join("/"), true)
            }
        } else {
            (parts[parts.len() - 1].to_string(), true)
        }
    } else {
        (original.to_string(), false)
    }
}

/// Returns `true` when the unwrapped, trimmed segment matches
/// `PLAIN_SEASON_SEGMENT_RE`.
fn path_segment_is_plain_season(segment:
&str) -> bool {
    let cleaned = strip_wrapper(segment).trim().to_string();
    PLAIN_SEASON_SEGMENT_RE.is_match(&cleaned)
}

/// Returns `true` when `value` matches `PATH_SEGMENT_SEASON_RE`.
fn path_segment_has_season(value: &str) -> bool {
    PATH_SEGMENT_SEASON_RE.is_match(value)
}

/// Collects every season number found in `value`.
///
/// Capture group 1 of three patterns is parsed as an integer; captures that
/// fail to parse are skipped.
// NOTE(review): the integer type was missing from the source text (the generic
// arguments were stripped); `u32` is assumed here — confirm against callers.
fn path_segment_seasons(value: &str) -> HashSet<u32> {
    SEASON_WORD_NUMBER_RE
        .captures_iter(value)
        .chain(S_NUMBER_SEGMENT_RE.captures_iter(value))
        .chain(SXE_SEASON_RE.captures_iter(value))
        .filter_map(|captures| captures.get(1))
        .filter_map(|item| item.as_str().parse::<u32>().ok())
        .collect()
}

/// Heuristically detects mis-decoded text.
///
/// Returns `true` if `value` contains U+FFFD, at least two occurrences of the
/// listed marker characters, or one marker together with at least one
/// character in U+FF61..=U+FF9F.
fn has_encoding_noise(value: &str) -> bool {
    if value.contains('\u{fffd}') {
        return true;
    }
    let markers = [
        "譁", "蜈", "螟", "蟄", "謇", "邱", "荳", "縺", "繧", "莨", "鬆", "髯",
    ];
    let marker_hits = markers
        .iter()
        .map(|marker| value.matches(marker).count())
        .sum::<usize>();
    let halfwidth_hits = value
        .chars()
        .filter(|ch| ('\u{ff61}'..='\u{ff9f}').contains(ch))
        .count();
    marker_hits >= 2 || (marker_hits >= 1 && halfwidth_hits >= 1)
}

/// Returns `true` when the path is `mtv`, starts with `mtv/`, or contains
/// `/mtv/` (after normalising `\` to `/`, trimming and ASCII-lowercasing).
// NOTE(review): a trailing "/mtv" segment is not matched — confirm intended.
fn has_non_anime_noise(value: &str) -> bool {
    let normalized = value.replace('\\', "/").trim().to_ascii_lowercase();
    normalized == "mtv" || normalized.starts_with("mtv/") || normalized.contains("/mtv/")
}

/// Removes all whitespace from `value` and ASCII-lowercases the result.
fn normalized_path_segment(value: &str) -> String {
    value
        .split_whitespace()
        .collect::<String>()
        .to_ascii_lowercase()
}

/// Returns `true` when every non-`SEP` group of `value` has a class that
/// starts with `EPISODE` or equals `SPECIAL`, and at least one such group
/// exists.
fn path_segment_is_episodeish(value: &str) -> bool {
    let (_, _, _, groups) = template_key_for_filename(value);
    let structural: Vec<&String> = groups
        .iter()
        .map(|group| &group.class_name)
        .filter(|item| item.as_str() != "SEP")
        .collect();
    !structural.is_empty()
        && structural
            .iter()
            .all(|item| item.starts_with("EPISODE") || item.as_str() == "SPECIAL")
}

/// Flags paths of three or more segments whose first and last segments are
/// either equal (ignoring whitespace and ASCII case) or both episode-like.
fn has_abstract_path_noise(value: &str) -> bool {
    let parts: Vec<&str> = value
        .split(|ch| ch == '/' || ch == '\\')
        .map(str::trim)
        .filter(|part| !part.is_empty())
        .collect();
    if parts.len() < 3 {
        return false;
    }
    if normalized_path_segment(parts[0]) == normalized_path_segment(parts[parts.len() - 1]) {
        return true;
    }
    path_segment_is_episodeish(parts[0]) &&
path_segment_is_episodeish(parts[parts.len() - 1])
}

/// Maps a role name to its output label.
///
/// Recognised roles become `B-<ENTITY>`: the three episode roles share
/// `B-EPISODE`, and `SPECIAL`/`VOLUME` share `B-SPECIAL`. Any other role
/// becomes `"O"`.
fn role_label(role: &str) -> String {
    let entity = match role {
        "GROUP" => Some("GROUP"),
        "TITLE" => Some("TITLE"),
        "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE" => Some("EPISODE"),
        "SEASON" => Some("SEASON"),
        "SPECIAL" | "VOLUME" => Some("SPECIAL"),
        "RESOLUTION" => Some("RESOLUTION"),
        "SOURCE" => Some("SOURCE"),
        _ => None,
    };
    entity.map_or_else(|| "O".to_string(), |entity| format!("B-{entity}"))
}

/// Returns `true` when `piece` is empty or contains no alphanumeric character.
fn is_separator(piece: &str) -> bool {
    piece.is_empty()
        || piece
            .chars()
            .all(|ch| ch.is_whitespace() || !ch.is_alphanumeric())
}

/// Buckets a character as `"sep"`, `"digit"` (ASCII digit), `"alpha"` (ASCII
/// letter) or `"text"` (any other alphanumeric character).
fn char_kind(ch: char) -> &'static str {
    if ch.is_whitespace() || !ch.is_alphanumeric() {
        "sep"
    } else if ch.is_ascii_digit() {
        "digit"
    } else if ch.is_ascii_alphabetic() {
        "alpha"
    } else {
        "text"
    }
}

/// Splits a token into finer pieces for labelling.
///
/// Unwrapped, fully alphanumeric tokens of a recognised atomic class, and
/// unwrapped simple episode tokens, are returned whole. Otherwise the token is
/// cut at every separator character and at every change of character kind,
/// then adjacent pieces are re-merged when the merged text classifies as a
/// recognised atomic class.
fn split_refined_token(token: &str) -> Vec<String> {
    let whole_class = classify_atom(token);
    // A token is "wrapped" when it starts and ends with a matching bracket pair.
    let is_wrapped = {
        let chars: Vec<char> = token.chars().collect();
        chars.len() >= 2
            && ((chars[0] == '[' && chars[chars.len() - 1] == ']')
                || (chars[0] == '(' && chars[chars.len() - 1] == ')')
                || (chars[0] == '【' && chars[chars.len() - 1] == '】'))
    };
    if !is_wrapped
        && matches!(
            whole_class.as_str(),
            "RESOLUTION" | "MEDIA" | "LANG" | "HASH" | "SXE" | "EPISODE_VERSION"
        )
        && token.chars().all(char::is_alphanumeric)
    {
        return vec![token.to_string()];
    }
    if !is_wrapped && whole_class == "EPISODE" && SIMPLE_EPISODE_RE.is_match(token) {
        return vec![token.to_string()];
    }
    // Pass 1: cut on separators (each separator char is its own piece) and on
    // digit/alpha/text transitions.
    let mut pieces = Vec::new();
    let mut current = String::new();
    let mut current_kind: Option<&str> = None;
    for ch in token.chars() {
        let kind = char_kind(ch);
        if kind == "sep" {
            if !current.is_empty() {
                pieces.push(std::mem::take(&mut current));
                current_kind = None;
            }
            pieces.push(ch.to_string());
            continue;
        }
        if !current.is_empty() && current_kind != Some(kind) {
            pieces.push(std::mem::take(&mut current));
        }
        current.push(ch);
        current_kind = Some(kind);
    }
    if !current.is_empty() {
        pieces.push(current);
    }
    // Pass 2: re-merge pieces that only make sense together.
    let mut merged = Vec::new();
    let mut index = 0;
    while index
< pieces.len() {
        // Case 1: word + single separator + word (e.g. "2.0") that classifies
        // as an atomic class when joined.
        // NOTE(review): the middle piece must satisfy `is_separator`, so the
        // "x" | "X" alternatives below look unreachable, and the whitespace
        // test is implied by the `matches!` — confirm before simplifying.
        if index + 2 < pieces.len()
            && !is_separator(&pieces[index])
            && is_separator(&pieces[index + 1])
            && !is_separator(&pieces[index + 2])
        {
            let combined = format!(
                "{}{}{}",
                pieces[index], pieces[index + 1], pieces[index + 2]
            );
            let combined_class = classify_atom(&combined);
            if !pieces[index + 1].chars().any(char::is_whitespace)
                && matches!(pieces[index + 1].as_str(), "." | "x" | "X" | "×")
                && matches!(
                    combined_class.as_str(),
                    "RESOLUTION" | "MEDIA" | "LANG" | "HASH" | "SXE" | "EPISODE_VERSION"
                )
            {
                merged.push(combined);
                index += 3;
                continue;
            }
        }
        // Case 2: the whole run of consecutive non-separator pieces.
        if !is_separator(&pieces[index]) {
            let mut end = index;
            let mut combined = String::new();
            while end < pieces.len() && !is_separator(&pieces[end]) {
                combined.push_str(&pieces[end]);
                end += 1;
            }
            if end > index + 1 && is_mergeable_refined_class(&classify_atom(&combined)) {
                merged.push(combined);
                index = end;
                continue;
            }
        }
        // Case 3: just the next two non-separator pieces.
        if index + 1 < pieces.len()
            && !is_separator(&pieces[index])
            && !is_separator(&pieces[index + 1])
        {
            let combined = format!("{}{}", pieces[index], pieces[index + 1]);
            if is_mergeable_refined_class(&classify_atom(&combined)) {
                merged.push(combined);
                index += 2;
                continue;
            }
        }
        // No merge applies: keep the piece as is.
        merged.push(pieces[index].clone());
        index += 1;
    }
    merged
}

/// Returns `true` for atom classes that adjacent pieces may be merged into.
fn is_mergeable_refined_class(value: &str) -> bool {
    matches!(
        value,
        "RESOLUTION" | "MEDIA" | "LANG" | "HASH" | "SXE" | "EPISODE_VERSION" | "SEASON"
    )
}

/// Chooses the label for one refined piece given its group's role and class.
///
/// Separators are always `"O"`. Episode roles, source-like groups (role
/// `SOURCE` or a media-block class) and the `RESOLUTION` role each have their
/// own rules; every other role falls through to `role_label`.
fn label_for_refined_piece(piece: &str, role: &str, token_class: &str) -> String {
    if is_separator(piece) {
        return "O".to_string();
    }
    let atom_class = classify_atom(piece);
    let upper = piece.to_ascii_uppercase();
    if matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE") {
        if atom_class == "SEASON" {
            return "B-SEASON".to_string();
        }
        if matches!(atom_class.as_str(), "EPISODE" | "EPISODE_VERSION" | "SXE")
            || piece.chars().all(|ch| ch.is_ascii_digit())
        {
            return "B-EPISODE".to_string();
        }
        if matches!(atom_class.as_str(), "SPECIAL" | "VOLUME")
            || matches!(
                upper.as_str(),
                "OVA" | "OAD" | "SP" | "PV" | "CM" | "OP"
| "ED" | "NCOP" | "NCED"
            )
        {
            return "B-SPECIAL".to_string();
        }
        // Anything else inside an episode group carries no label.
        return "O".to_string();
    }
    if role == "SOURCE" || matches!(token_class, "BRACKET_MEDIA_BLOCK" | "MEDIA_BLOCK") {
        if atom_class == "RESOLUTION" {
            return "B-RESOLUTION".to_string();
        }
        if atom_class == "HASH" {
            return "O".to_string();
        }
        if matches!(atom_class.as_str(), "MEDIA" | "LANG") {
            return "B-SOURCE".to_string();
        }
        if matches!(atom_class.as_str(), "SPECIAL" | "VOLUME") {
            return "B-SPECIAL".to_string();
        }
        // Completion markers are not source information.
        return if matches!(
            upper.as_str(),
            "END" | "FIN" | "COMPLETE" | "TV" | "全集" | "全"
        ) {
            "O".to_string()
        } else {
            "B-SOURCE".to_string()
        };
    }
    if role == "RESOLUTION" {
        return if atom_class == "RESOLUTION" || piece.chars().all(|ch| ch.is_ascii_digit()) {
            "B-RESOLUTION".to_string()
        } else {
            "O".to_string()
        };
    }
    role_label(role)
}

/// Splits a token matched by `SXE_VALUE_RE` into `S`, season, `E`, episode and
/// an optional `v` + version, with matching labels.
///
/// Returns `None` when the pattern does not match.
// NOTE(review): the "S", "E" and "v" pieces are emitted as fixed literals, so
// the original casing of the token is presumably not preserved — confirm.
fn split_sxe_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
    let caps = SXE_VALUE_RE.captures(token)?;
    let mut pieces = vec![
        "S".to_string(),
        caps[1].to_string(),
        "E".to_string(),
        caps[2].to_string(),
    ];
    let mut labels = vec![
        "O".to_string(),
        "B-SEASON".to_string(),
        "O".to_string(),
        "B-EPISODE".to_string(),
    ];
    if let Some(version) = caps.get(3) {
        pieces.push("v".to_string());
        pieces.push(version.as_str().to_string());
        labels.push("O".to_string());
        labels.push("O".to_string());
    }
    Some((pieces, labels))
}

/// Splits a token matched by `EPISODE_VALUE_RE` into capture 1 (labelled
/// `"O"`), capture 2 (`B-EPISODE`) and an optional `v` + version (both `"O"`).
///
/// Returns `None` when the pattern does not match.
fn split_episode_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
    let caps = EPISODE_VALUE_RE.captures(token)?;
    let mut pieces = vec![caps[1].to_string(), caps[2].to_string()];
    let mut labels = vec!["O".to_string(), "B-EPISODE".to_string()];
    if let Some(version) = caps.get(3) {
        pieces.push("v".to_string());
        pieces.push(version.as_str().to_string());
        labels.push("O".to_string());
        labels.push("O".to_string());
    }
    Some((pieces, labels))
}

/// Splits a token matched by `SEASON_VALUE_RE` into a literal `S` (`"O"`) and
/// capture 1 (`B-SEASON`). Returns `None` when the pattern does not match.
fn split_season_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
    let caps = SEASON_VALUE_RE.captures(token)?;
    Some((
        vec!["S".to_string(), caps[1].to_string()],
        vec!["O".to_string(), "B-SEASON".to_string()],
    ))
}

/// Concatenates the tokens of `group` and removes the outer wrapper with
/// `strip_wrapper`.
fn group_text(tokens: &[String], group: &Group) ->
String {
    strip_wrapper(
        &group
            .indices
            .iter()
            .map(|&index| tokens[index].as_str())
            .collect::<String>(),
    )
}

/// Returns `true` when `text` is one of the known "special content" phrases
/// (compared after `SPECIAL_SPACE_RE` matches are replaced by a space, trimming
/// and ASCII-uppercasing) or matches `SPECIAL_TITLE_PHRASE_RE`.
fn is_special_title_phrase(text: &str) -> bool {
    let normalized = SPECIAL_SPACE_RE
        .replace_all(text, " ")
        .trim()
        .to_ascii_uppercase();
    matches!(
        normalized.as_str(),
        "CM" | "EVENT"
            | "EIZOU"
            | "LOGO"
            | "MENU"
            | "OMAKE"
            | "PREVIEW"
            | "PV"
            | "THEATER GREETING EVENT"
            | "TOKUTEN"
            | "TRAILER"
            | "WORLD PREMIERE"
    ) || SPECIAL_TITLE_PHRASE_RE.is_match(text)
}

/// Refines template-level roles using the actual token text.
///
/// `roles` is the input assignment and is never mutated; all changes go to a
/// copy that is returned. The code indexes `groups` with indices of `roles`,
/// so both slices must have the same length.
fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]) -> Vec<String> {
    let mut output = roles.to_vec();
    let ep_markers = ["EP", "E", "Episode", "ep", "episode"];
    let roman = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX"];
    // No title yet and the name opens with an episode-like group: promote the
    // first run of at least two TEXT groups (SEP groups allowed in between).
    if !output.iter().any(|role| role == "TITLE")
        && roles
            .first()
            .is_some_and(|role| role.starts_with("EPISODE"))
    {
        let mut title_run = Vec::new();
        for index in 1..roles.len() {
            if groups[index].class_name == "TEXT" && output[index] == "O" {
                title_run.push(index);
                continue;
            }
            if groups[index].class_name == "SEP" {
                continue;
            }
            if !title_run.is_empty() {
                break;
            }
        }
        if title_run.len() >= 2 {
            let last_title_index = *title_run.last().unwrap();
            let later_structural = roles[last_title_index + 1..].iter().any(|role| {
                role.starts_with("EPISODE") || matches!(role.as_str(), "SEASON" | "SPECIAL")
            });
            // A leading all-digit group followed later by another structural
            // group is treated as part of the title.
            if group_text(tokens, &groups[0])
                .chars()
                .all(|ch| ch.is_ascii_digit())
                && later_structural
            {
                output[0] = "TITLE".to_string();
            }
            for index in title_run {
                output[index] = "TITLE".to_string();
            }
        }
    }
    // Same leading-number rule when a title already exists.
    if roles
        .first()
        .is_some_and(|role| role.starts_with("EPISODE"))
        && group_text(tokens, &groups[0])
            .chars()
            .all(|ch| ch.is_ascii_digit())
    {
        if let Some(first_title) = output.iter().position(|role| role == "TITLE") {
            let later_structural = roles[first_title + 1..].iter().any(|role| {
                role.starts_with("EPISODE") || matches!(role.as_str(), "SEASON" | "SPECIAL")
            });
            if later_structural {
                output[0] = "TITLE".to_string();
            }
        }
    }
    for index in 0..roles.len() {
        let text = group_text(tokens,
&groups[index]);
        // Any group whose class mentions SXE is an episode.
        if output[index] == "O" && groups[index].class_name.contains("SXE") {
            output[index] = "EPISODE".to_string();
        }
        // Text matching YEAR_RANGE_RE is not an episode.
        if roles[index].starts_with("EPISODE") && YEAR_RANGE_RE.is_match(&text) {
            output[index] = "O".to_string();
            continue;
        }
        // Rules that look two groups back (word, SEP, number).
        if roles[index].starts_with("EPISODE") && (2..roles.len()).contains(&index) {
            let previous_text = group_text(tokens, &groups[index - 2]);
            let next_special = output[index + 1..roles.len().min(index + 4)]
                .iter()
                .any(|role| role == "SPECIAL");
            let next_episode = roles[index + 1..]
                .iter()
                .any(|role| role.starts_with("EPISODE"));
            // "Vol <n>" / "Volume <n>".
            if groups[index - 1].class_name == "SEP"
                && matches!(
                    previous_text.to_ascii_lowercase().as_str(),
                    "vol" | "volume"
                )
            {
                // When the next non-SEP group is TEXT and an episode follows it,
                // the volume marker is kept as part of the title.
                let next_text_before_episode = (index + 1..roles.len())
                    .find(|&cursor| groups[cursor].class_name != "SEP")
                    .is_some_and(|cursor| {
                        groups[cursor].class_name == "TEXT"
                            && roles[cursor + 1..]
                                .iter()
                                .any(|role| role.starts_with("EPISODE"))
                    });
                if next_text_before_episode {
                    output[index - 2] = "TITLE".to_string();
                    output[index] = "TITLE".to_string();
                    continue;
                }
                output[index - 2] = "SPECIAL".to_string();
                output[index] = "SPECIAL".to_string();
                continue;
            }
            // Short ASCII title word + short number, with a special or another
            // episode still to come: the number belongs to the title.
            if output[index - 2] == "TITLE"
                && groups[index - 1].class_name == "SEP"
                && previous_text.len() <= 4
                && previous_text.is_ascii()
                && previous_text.chars().all(|ch| ch.is_ascii_alphabetic())
                && text.chars().all(|ch| ch.is_ascii_digit())
                && text.len() <= 3
                && (next_special || next_episode)
            {
                output[index] = "TITLE".to_string();
                continue;
            }
        }
        // "<title> <n> Episode <words...> <episode>": the number and the words
        // after it are title text when a real episode follows.
        if roles[index].starts_with("EPISODE")
            && index >= 2
            && output[..index].iter().any(|role| role == "TITLE")
            && group_text(tokens, &groups[index])
                .chars()
                .all(|ch| ch.is_ascii_digit())
        {
            let next_episode_word = index + 2 < roles.len()
                && groups[index + 1].class_name == "SEP"
                && group_text(tokens, &groups[index + 2]).eq_ignore_ascii_case("episode");
            if next_episode_word {
                // Collect the TEXT groups starting at the word "episode".
                let mut run = Vec::new();
                let mut cursor = index + 2;
                while cursor < roles.len() {
                    if groups[cursor].class_name == "SEP" {
                        cursor += 1;
                        continue;
                    }
                    if groups[cursor].class_name == "TEXT" && !roles[cursor].starts_with("EPISODE")
                    {
                        run.push(cursor);
                        cursor += 1;
                        continue;
                    }
                    break;
                }
                let later_episode = roles[cursor..]
                    .iter()
                    .any(|role| role.starts_with("EPISODE"));
                if run.len() >= 2 && later_episode {
                    output[index] = "TITLE".to_string();
                    for item in run {
                        output[item] = "TITLE".to_string();
                    }
                    continue;
                }
            }
        }
        // Known special-content phrases are not titles.
        if roles[index] == "TITLE" && is_special_title_phrase(&text) {
            output[index] = "SPECIAL".to_string();
            continue;
        }
        // Bare CJK episode counter characters are not titles.
        if roles[index] == "TITLE" && matches!(text.as_str(), "第" | "話" | "话" | "回" | "集") {
            output[index] = "O".to_string();
            continue;
        }
        // Unlabelled alphabetic TEXT before an episode extends the preceding
        // title, unless an episode already sits between them or the text is an
        // episode marker word.
        if output[index] == "O"
            && groups[index].class_name == "TEXT"
            && roles[index + 1..].iter().any(|role| role.starts_with("EPISODE"))
            && text.chars().any(|ch| ch.is_alphabetic())
            && !ep_markers.contains(&text.as_str())
        {
            if let Some(last_title) = output[..index].iter().rposition(|role| role == "TITLE") {
                let episode_since_title = output[last_title + 1..index]
                    .iter()
                    .any(|role| role.starts_with("EPISODE"));
                if !episode_since_title {
                    output[index] = "TITLE".to_string();
                    continue;
                }
            }
        }
        // "Season <n>" / "Saison <n>": the word is dropped and the number
        // becomes the season.
        if roles[index] == "TITLE"
            && matches!(text.to_ascii_lowercase().as_str(), "season" | "saison")
            && index + 2 < roles.len()
            && groups[index + 1].class_name == "SEP"
            && roles[index + 2].starts_with("EPISODE")
        {
            output[index] = "O".to_string();
            output[index + 2] = "SEASON".to_string();
            continue;
        }
        // Upper-case roman numeral between a title and a structural group is a
        // season marker.
        if roles[index] == "TITLE"
            && text == text.to_ascii_uppercase()
            && roman.contains(&text.as_str())
        {
            let previous_title = output[..index].iter().any(|role| role == "TITLE");
            let next_structural = roles[index + 1..]
.iter()
                .any(|role| role.starts_with("EPISODE") || role == "SPECIAL");
            if previous_title && next_structural {
                output[index] = "SEASON".to_string();
                continue;
            }
        }
        // "<n> EP <m>": the first number is title text and the marker word is
        // unlabelled.
        if roles[index].starts_with("EPISODE") && index + 4 < roles.len() {
            if groups[index + 1].class_name == "SEP"
                && ep_markers.contains(&group_text(tokens, &groups[index + 2]).as_str())
                && groups[index + 3].class_name == "SEP"
                && roles[index + 4].starts_with("EPISODE")
            {
                output[index] = "TITLE".to_string();
                output[index + 2] = "O".to_string();
            }
        }
        // Numbers adjacent to clock-time words (点/點/半, 上午/下午/晚上) are not
        // episodes.
        if roles[index].starts_with("EPISODE") {
            let previous_text = if index >= 1 {
                group_text(tokens, &groups[index - 1])
            } else {
                String::new()
            };
            let next_text = if index + 1 < roles.len() {
                group_text(tokens, &groups[index + 1])
            } else {
                String::new()
            };
            if previous_text.contains('点')
                || previous_text.contains('點')
                || previous_text.contains("晚上")
                || previous_text.contains("上午")
                || previous_text.contains("下午")
                || next_text.contains('点')
                || next_text.contains('點')
                || next_text.contains('半')
            {
                output[index] = "O".to_string();
            }
        }
    }
    output
}

/// Finds contiguous title spans as half-open `(start, end)` index ranges.
///
/// A span continues over consecutive `TITLE` roles, except that two adjacent
/// `BRACKET_TEXT` groups always start separate spans. A single unlabelled
/// `SEP` group between two `TITLE` roles is bridged.
fn title_candidates(groups: &[Group], roles: &[String]) -> Vec<(usize, usize)> {
    let mut candidates = Vec::new();
    let mut index = 0;
    while index < roles.len() {
        if roles[index] != "TITLE" {
            index += 1;
            continue;
        }
        let start = index;
        index += 1;
        loop {
            if index < roles.len()
                && roles[index] == "TITLE"
                && !(groups[index - 1].class_name == "BRACKET_TEXT"
                    && groups[index].class_name == "BRACKET_TEXT")
            {
                index += 1;
                continue;
            }
            if index + 1 < roles.len()
                && roles[index] == "O"
                && groups[index].class_name == "SEP"
                && roles[index + 1] == "TITLE"
            {
                index += 2;
                continue;
            }
            break;
        }
        candidates.push((start, index));
    }
    candidates
}

/// Keeps exactly one title span and demotes every other `TITLE` role to `"O"`.
///
/// Spans ending at or before the first structural anchor (episode, season,
/// special, source or resolution role) are preferred; among the eligible spans
/// the one with the greatest end, then the greatest length, is kept. Returns
/// the adjusted roles and the demoted positions as decimal strings.
fn enforce_single_title_candidate(
    groups: &[Group],
    roles: &[String],
) -> (Vec<String>, Vec<String>) {
    let candidates = title_candidates(groups, roles);
    if candidates.len() <= 1 {
        return (roles.to_vec(), Vec::new());
    }
    let first_anchor = roles
        .iter()
        .position(|role| {
            role.starts_with("EPISODE")
                || matches!(
                    role.as_str(),
                    "SEASON"
| "SPECIAL" | "SOURCE" | "RESOLUTION"
                )
        })
        .unwrap_or(roles.len());
    let before_anchor: Vec<(usize, usize)> = candidates
        .iter()
        .copied()
        .filter(|(_, end)| *end <= first_anchor)
        .collect();
    // `candidates` has at least two entries here, so the chosen list is never
    // empty and `unwrap` cannot fail.
    let selected = (if before_anchor.is_empty() {
        &candidates
    } else {
        &before_anchor
    })
    .iter()
    .max_by_key(|(start, end)| (*end, end - start))
    .copied()
    .unwrap();
    let mut output = roles.to_vec();
    let mut dropped = Vec::new();
    for (start, end) in candidates {
        if (start, end) == selected {
            continue;
        }
        for index in start..end {
            if output[index] == "TITLE" {
                output[index] = "O".to_string();
                dropped.push(index.to_string());
            }
        }
    }
    (output, dropped)
}

/// Splits each token with `split_generated_token` and spreads its label over
/// the resulting pieces; single-character separator pieces are always `"O"`.
fn normalize_generated_tokens(tokens: &[String], labels: &[String]) -> (Vec<String>, Vec<String>) {
    let mut output_tokens = Vec::new();
    let mut output_labels = Vec::new();
    for (token, label) in tokens.iter().zip(labels.iter()) {
        for piece in split_generated_token(token) {
            output_labels.push(if label == "O" || is_standalone_separator(&piece) {
                "O".to_string()
            } else {
                label.clone()
            });
            output_tokens.push(piece);
        }
    }
    (output_tokens, output_labels)
}

/// Splits a title token and labels its pieces: separators are `"O"`, pieces
/// matching `CJK_SEASON_TOKEN_RE` are `B-SEASON`, everything else `B-TITLE`.
fn normalize_title_token(token: &str) -> (Vec<String>, Vec<String>) {
    let pieces = split_generated_token(token);
    let labels = pieces
        .iter()
        .map(|piece| {
            if is_standalone_separator(piece) {
                "O".to_string()
            } else if CJK_SEASON_TOKEN_RE.is_match(piece) {
                "B-SEASON".to_string()
            } else {
                "B-TITLE".to_string()
            }
        })
        .collect();
    (pieces, labels)
}

/// Splits `token` into runs of alphanumeric characters, emitting every other
/// character as its own one-character piece.
fn split_generated_token(token: &str) -> Vec<String> {
    let mut pieces = Vec::new();
    let mut current = String::new();
    for ch in token.chars() {
        if ch.is_whitespace() || !ch.is_alphanumeric() {
            if !current.is_empty() {
                pieces.push(std::mem::take(&mut current));
            }
            pieces.push(ch.to_string());
        } else {
            current.push(ch);
        }
    }
    if !current.is_empty() {
        pieces.push(current);
    }
    pieces
}

/// Returns `true` when `token` is exactly one non-alphanumeric character.
fn is_standalone_separator(token: &str) -> bool {
    token.chars().count() == 1
        && token
            .chars()
            .next()
            .is_some_and(|ch| ch.is_whitespace() || !ch.is_alphanumeric())
}

/// Projects group-level roles onto refined tokens and returns the parallel
/// `(tokens, labels)` vectors.
///
/// Groups of class `SEP`, `PATH` or `EMPTY` are always labelled `"O"`; groups
/// with no entry in `roles` default to `"O"` as well.
fn project_refined_tokens(
    tokens: &[String],
    groups:
&[Group],
    roles: &[String],
) -> (Vec<String>, Vec<String>) {
    let mut output_tokens = Vec::new();
    let mut output_labels = Vec::new();
    for (group_index, group) in groups.iter().enumerate() {
        let mut role = roles.get(group_index).map(String::as_str).unwrap_or("O");
        if matches!(group.class_name.as_str(), "SEP" | "PATH" | "EMPTY") {
            role = "O";
        }
        for &index in &group.indices {
            let token = &tokens[index];
            if matches!(
                role,
                "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE" | "SOURCE" | "RESOLUTION" | "SEASON"
            ) {
                // Structural roles: try the dedicated whole-token splitters
                // first, then fall back to piece-wise refinement.
                if role == "SEASON" {
                    if let Some((pieces, labels)) = split_season_token(token) {
                        output_tokens.extend(pieces);
                        output_labels.extend(labels);
                        continue;
                    }
                }
                if matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE") {
                    // The wrapper is stripped before matching, so brackets are
                    // not emitted on this path.
                    if let Some((pieces, labels)) = split_sxe_token(&strip_wrapper(token)) {
                        output_tokens.extend(pieces);
                        output_labels.extend(labels);
                        continue;
                    }
                    if let Some((pieces, labels)) = split_episode_token(&strip_wrapper(token)) {
                        output_tokens.extend(pieces);
                        output_labels.extend(labels);
                        continue;
                    }
                }
                for piece in split_refined_token(token) {
                    if matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE") {
                        if let Some((pieces, labels)) = split_season_token(&piece) {
                            output_tokens.extend(pieces);
                            output_labels.extend(labels);
                            continue;
                        }
                        if let Some((pieces, labels)) = split_episode_token(&piece) {
                            output_tokens.extend(pieces);
                            output_labels.extend(labels);
                            continue;
                        }
                    }
                    let label = label_for_refined_piece(&piece, role, &group.class_name);
                    let (pieces, labels) = normalize_generated_tokens(&[piece], &[label]);
                    output_tokens.extend(pieces);
                    output_labels.extend(labels);
                }
            } else {
                // Bare CJK episode counter characters never carry a title label.
                if role == "TITLE" && matches!(token.as_str(), "第" | "話" | "话" | "回" | "集") {
                    output_tokens.push(token.clone());
                    output_labels.push("O".to_string());
                    continue;
                }
                // A title token ending in "第" is split so that the trailing
                // counter prefix is unlabelled.
                if role == "TITLE" && token.ends_with('第') && token.chars().count() > 1 {
                    let trimmed = token.trim_end_matches('第').to_string();
                    let (pieces, labels) = normalize_generated_tokens(
                        &[trimmed, "第".to_string()],
&["B-TITLE".to_string(), "O".to_string()],
                    );
                    output_tokens.extend(pieces);
                    output_labels.extend(labels);
                    continue;
                }
                if role == "TITLE" {
                    let (pieces, labels) = normalize_title_token(token);
                    output_tokens.extend(pieces);
                    output_labels.extend(labels);
                    continue;
                }
                // Every other role: split the token and apply the role's label.
                let (pieces, labels) =
                    normalize_generated_tokens(&[token.clone()], &[role_label(role)]);
                output_tokens.extend(pieces);
                output_labels.extend(labels);
            }
        }
    }
    (output_tokens, output_labels)
}

/// Extends `B-TITLE` / `B-GROUP` labels over joiner tokens inside a span.
///
/// An unlabelled joiner token takes its neighbours' label when the nearest
/// tokens on both sides (skipping other unlabelled joiners) carry the same
/// `B-TITLE` or `B-GROUP` label. `!`/`?` (ASCII or full-width) directly after
/// a `B-TITLE` token is also labelled `B-TITLE`. Returns the adjusted labels;
/// `labels` is not modified.
fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
    // Tokens that may be skipped while searching for the left/right neighbour.
    let joiners = [
        " ", ".", "-", "_", "·", "・", "×", "／", "/", "'", "’", ":", "：", "!", "！", "?", "？",
        ";", "；", ",", "，", "～", "~", "－", "(", ")", "（", "）", "[", "]", "【", "】", "｢",
        "｣", "「", "」", "☆", "@",
    ];
    let title_terminal_punctuation = ["!", "！", "?", "？"];
    // Tokens eligible for relabelling: the joiners plus the two ampersands.
    let entity_joiners = [
        " ", ".", "-", "_", "·", "・", "×", "／", "/", "'", "’", ":", "：", "!", "！", "?", "？",
        ";", "；", ",", "，", "～", "~", "－", "(", ")", "（", "）", "[", "]", "【", "】", "｢",
        "｣", "「", "」", "☆", "@", "&", "＆",
    ];
    let mut output = labels.to_vec();
    for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
        if label != "O" || !entity_joiners.contains(&token.as_str()) {
            continue;
        }
        // Signed so the scan can step past index 0.
        let mut left = index as isize - 1;
        while left >= 0
            && joiners.contains(&tokens[left as usize].as_str())
            && labels[left as usize] == "O"
        {
            left -= 1;
        }
        let mut right = index + 1;
        while right < tokens.len()
            && joiners.contains(&tokens[right].as_str())
            && labels[right] == "O"
        {
            right += 1;
        }
        if left >= 0 && right < tokens.len() {
            // The left side reads the already-smoothed labels, the right side
            // the original ones.
            let left_label = &output[left as usize];
            let right_label = &labels[right];
            if left_label == right_label && matches!(left_label.as_str(), "B-TITLE" | "B-GROUP") {
                output[index] = left_label.clone();
            }
        }
        if title_terminal_punctuation.contains(&token.as_str()) && index > 0 {
            let left_label = &output[index - 1];
            if left_label == "B-TITLE" {
                output[index] = "B-TITLE".to_string();
            }
        }
    }
    output
}

/// Builds the labelled `Record` for `filename` from template-level `roles`.
///
/// Returns `None` when the number of groups derived from `filename` differs
/// from `roles.len()`, or when the final token and label counts differ.
fn dmhy_record(filename: &str, template_id: &str, roles: &[String]) -> Option<Record>
{
    let (key, tokens, _classes, groups) = template_key_for_filename(filename);
    // The roles were produced for a template; bail out when this filename does
    // not yield the same number of groups.
    if groups.len() != roles.len() {
        return None;
    }
    // Pipeline: contextual fixes -> single title span -> refined tokens ->
    // label smoothing.
    let roles = adjust_contextual_roles(&tokens, &groups, roles);
    let (roles, dropped) = enforce_single_title_candidate(&groups, &roles);
    let (tokens, labels) = project_refined_tokens(&tokens, &groups, &roles);
    let labels = smooth_title_spans(&tokens, &labels);
    if tokens.len() != labels.len() {
        return None;
    }
    Some(Record {
        filename: filename.to_string(),
        tokens,
        labels,
        template_id: template_id.to_string(),
        template: key,
        source_filename: None,
        path_trimmed: None,
        // Only recorded when at least one title position was demoted.
        dropped_title_candidate_positions: if dropped.is_empty() {
            None
        } else {
            Some(dropped)
        },
    })
}

#[cfg(test)]
mod tests {
    use super::*;

    fn labels_for(filename: &str) -> Vec<(String, String)> {
        let (key, _, _, _) = template_key_for_filename(filename);
        let roles = suggested_roles(&key);
        let record = dmhy_record(filename, "tpl_test", &roles).unwrap();
        record.tokens.into_iter().zip(record.labels).collect()
    }

    #[test]
    fn required_regressions() {
        let title_91 = labels_for("Title 91 EP 01 [1080p]");
        assert!(title_91.contains(&("91".to_string(), "B-TITLE".to_string())));
        assert!(title_91.contains(&("EP".to_string(), "O".to_string())));
        assert!(title_91.contains(&("01".to_string(), "B-EPISODE".to_string())));
        let event = labels_for("[HYSUB]Dragon Ball Super Broly[Theater Greeting Event][1080P]");
        assert!(event.contains(&("Theater".to_string(), "B-SPECIAL".to_string())));
        assert!(!event.contains(&("Theater".to_string(), "B-TITLE".to_string())));
        let roman = labels_for("Chibi Maruko-chan I 001");
        assert!(roman.contains(&("I".to_string(), "B-SEASON".to_string())));
        assert!(roman.contains(&("001".to_string(), "B-EPISODE".to_string())));
        let dxd = labels_for("High School D×D");
        assert!(dxd.contains(&("×".to_string(), "B-TITLE".to_string())));
        let colon_title = labels_for("Megumi no Daigo：Kyuukoku no Orange 06");
        assert!(colon_title.contains(&("：".to_string(), "B-TITLE".to_string())));
        let sxe =
labels_for("S01E02"); assert_eq!( sxe, vec![ ("S".to_string(), "O".to_string()), ("01".to_string(), "B-SEASON".to_string()), ("E".to_string(), "O".to_string()), ("02".to_string(), "B-EPISODE".to_string()) ] ); let ep_prefix = labels_for("Toradora! EP01 [BD 1080p]"); assert!(ep_prefix.contains(&("EP".to_string(), "O".to_string()))); assert!(ep_prefix.contains(&("01".to_string(), "B-EPISODE".to_string()))); let bracket_sxe = labels_for("[FLsnow.feat.PO][Himitsu_no_Aipri][1080P][S2E01]"); assert!(bracket_sxe.contains(&("2".to_string(), "B-SEASON".to_string()))); assert!(bracket_sxe.contains(&("01".to_string(), "B-EPISODE".to_string()))); let cursed = labels_for("[Coalgirls]_C3-Cube_x_Cursed_x_Curious_01_[8E416230]"); assert!(cursed.contains(&("x".to_string(), "B-TITLE".to_string()))); assert!(!cursed.contains(&("x".to_string(), "B-SEASON".to_string()))); let beyblade = labels_for("[jibaketa]Beyblade X - 118 (WEB 1920x1080 AVC AAC)"); assert!(beyblade.contains(&("X".to_string(), "B-TITLE".to_string()))); assert!(!beyblade.contains(&("X".to_string(), "B-SEASON".to_string()))); let bang_title = labels_for("[Dymy][Gugure! Kokkuri-san][06][BIG5][1280X720]"); assert!(bang_title.contains(&("!".to_string(), "B-TITLE".to_string()))); let pso2 = labels_for("[Lilith-Raws] Phantasy Star Online 2 Episode Oracle - 01 [1080p]"); assert!(pso2.contains(&("2".to_string(), "B-TITLE".to_string()))); assert!(pso2.contains(&("Episode".to_string(), "B-TITLE".to_string()))); assert!(pso2.contains(&("Oracle".to_string(), "B-TITLE".to_string()))); assert!(pso2.contains(&("01".to_string(), "B-EPISODE".to_string()))); let aikatsu = labels_for("Aikatsu Friends! - S2E01 (BD 1920x1080 x264 FLAC)"); assert!(aikatsu.contains(&("!".to_string(), "B-TITLE".to_string()))); let intro = labels_for("[VCB-Studio] LoveLive! 
µ's Live Collection [01][intro][1080p]"); assert!(intro.contains(&("intro".to_string(), "B-SPECIAL".to_string()))); let hash = labels_for("[Group][Title][01][1080p][00270AC8]"); assert!(hash.contains(&("00270AC8".to_string(), "O".to_string()))); let yamato = labels_for("[1995.01] YAMATO2520 Vol.1 明日への希望-0001"); assert!(yamato.contains(&("YAMATO2520".to_string(), "B-TITLE".to_string()))); assert!(yamato.contains(&("明日への希望".to_string(), "B-TITLE".to_string()))); let ubw = labels_for("Fate／stay night [Unlimited Blade Works] #00 「プロローグ」"); assert!(ubw.contains(&("Unlimited".to_string(), "B-TITLE".to_string()))); assert!(!ubw.contains(&("Unlimited".to_string(), "B-GROUP".to_string()))); let alias_title = labels_for("[Koten_Gars] Tegami Bachi; Letter Bee - 01 [1080p]"); assert!(alias_title.contains(&(";".to_string(), "B-TITLE".to_string()))); let comma_title = labels_for("[Studio GreenTea] Kamiina Botan, Yoeru Sugata wa Yuri no Hana [01]"); assert!(comma_title.contains(&(",".to_string(), "B-TITLE".to_string()))); let happy_lesson = labels_for("【DVD】 HAPPY☆LESSON THE TV 第01話"); assert!(happy_lesson.contains(&("☆".to_string(), "B-TITLE".to_string()))); let idolmaster = labels_for("[CASO&SumiSora][THE_IDOLM@STER_CINDERELLA_GIRLS][07.5_SP]"); assert!(idolmaster.contains(&("@".to_string(), "B-TITLE".to_string()))); let soul_taker = labels_for("[AI-Raws] THE SOUL TAKER～魂狩～ #01 (HEVC 1312x720)"); assert!(soul_taker.contains(&("～".to_string(), "B-TITLE".to_string()))); let mayoi = labels_for("[Snow-Raws] 迷家[マヨイガ] 第01話"); assert!(mayoi.contains(&("迷家".to_string(), "B-TITLE".to_string()))); assert!(mayoi.contains(&("マヨイガ".to_string(), "B-TITLE".to_string()))); let conan_time = labels_for("【蓝色狂想】名侦探柯南TV版第036集-周一晚上7点半杀人事件"); assert!(conan_time.contains(&("036".to_string(), "B-EPISODE".to_string()))); assert!(!conan_time.contains(&("7".to_string(), "B-EPISODE".to_string()))); let zom = labels_for("[Nekomoe kissaten&VCB-Studio] Zom 100 [Animatics02][Ma10p_1080p][x265]"); 
assert!(zom.contains(&("100".to_string(), "B-TITLE".to_string()))); assert!(!zom.contains(&("100".to_string(), "B-EPISODE".to_string()))); assert!(zom.contains(&("Animatics02".to_string(), "B-SPECIAL".to_string()))); let sky = labels_for("[Skytree][海贼王][One_Piece][918][GB_JP][1080P]"); assert!(sky.contains(&("One".to_string(), "B-TITLE".to_string()))); assert!(!sky.contains(&("海贼王".to_string(), "B-TITLE".to_string()))); assert!(sky.contains(&("918".to_string(), "B-EPISODE".to_string()))); }

    /// Regression cases covering three things: trimming of leading path
    /// components (`training_filename_for`), template keys
    /// (`template_key_for_filename`) and per-token labels (`labels_for`).
    ///
    /// NOTE(review): the name suggests these pin behaviour shared with a Python
    /// implementation; that counterpart is not visible from this file section.
    #[test]
    fn updated_python_alignment_regressions() {
        // Expands to the exact membership check on the label collection, so it
        // works for whatever collection type `labels_for` returns.
        macro_rules! has_label {
            ($labels:expr, $token:expr, $tag:expr) => {
                $labels.contains(&($token.to_string(), $tag.to_string()))
            };
        }
        // Trims `$path`, requires that a trim was reported, and yields the
        // trimmed name for further checks.
        macro_rules! trim_checked {
            ($path:expr) => {{
                let (leaf, was_trimmed) = training_filename_for($path);
                assert!(was_trimmed);
                leaf
            }};
        }

        // --- Path trimming keeps the "Season N/..." component when the leaf lacks a season.
        let woody_parent =
            "Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p";
        let woody_path = "The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p [Hurtom]/Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p";
        let woody_leaf = trim_checked!(woody_path);
        assert_eq!(woody_leaf, woody_parent);

        let pokemon_path = "Pokémon Season 2 - Orange League [Ep. 83-118]/Pokemon - 084 - OL002 - A Scare in the Air [DVD][PM-Dragon-x264-AC3][00270AC8]";
        let pokemon_leaf = trim_checked!(pokemon_path);
        assert_eq!(
            pokemon_leaf,
            "Pokemon - 084 - OL002 - A Scare in the Air [DVD][PM-Dragon-x264-AC3][00270AC8]"
        );

        // --- Labels on the trimmed Woody name.
        let woody = labels_for(&woody_leaf);
        assert!(has_label!(woody, "4", "B-SEASON"));
        assert!(has_label!(woody, "E", "O"));
        assert!(has_label!(woody, "07", "B-EPISODE"));
        assert!(has_label!(woody, "The", "B-TITLE"));
        assert!(has_label!(woody, "Show", "B-TITLE"));
        assert!(!has_label!(woody, "1999", "B-EPISODE"));

        // --- Punctuation inside a group bracket is labelled as group.
        let dash_group = labels_for("[DBD-Raws][Title][01][1080P]");
        assert!(has_label!(dash_group, "-", "B-GROUP"));

        let amp_group = labels_for("[SumiSora&CASO][Title][01][1080P]");
        assert!(has_label!(amp_group, "&", "B-GROUP"));

        // --- CJK season suffix is split off the title.
        let cjk_season =
            labels_for("[DBD-Raws][魔道祖师第一季][08][1080P][BDRip][HEVC-10bit][FLAC]");
        assert!(has_label!(cjk_season, "魔道祖师", "B-TITLE"));
        assert!(has_label!(cjk_season, "第一季", "B-SEASON"));
        assert!(!has_label!(cjk_season, "第一季", "B-TITLE"));

        // --- Bracket-only leaf under numeric / CJK parent directories.
        let lksub_leaf = trim_checked!("12/小剧场/[LKSUB][KAGE-JITSU!][01][GB][720P]");
        assert_eq!(lksub_leaf, "[LKSUB][KAGE-JITSU!][01][GB][720P]");
        let lksub_key = template_key_for_filename(&lksub_leaf).0;
        assert_eq!(
            lksub_key,
            "BRACKET_TEXT BRACKET_TEXT BRACKET_EPISODE BRACKET_LANG BRACKET_RESOLUTION"
        );

        // --- Short title "R-15" must not be read as an episode number.
        let short = labels_for("[Snow-Raws] R-15 CM&PV12 (BD 1920x1080 HEVC-YUV420P10 FLAC)");
        assert!(has_label!(short, "R", "B-TITLE"));
        assert!(has_label!(short, "-", "B-TITLE"));
        assert!(has_label!(short, "15", "B-TITLE"));
        assert!(!has_label!(short, "15", "B-EPISODE"));

        let short_before_episode =
            labels_for("[Snow-Raws] R-15 第01話 (BD 1920x1080 HEVC-YUV420P10 FLAC)");
        assert!(has_label!(short_before_episode, "R", "B-TITLE"));
        assert!(has_label!(short_before_episode, "-", "B-TITLE"));
        assert!(has_label!(short_before_episode, "15", "B-TITLE"));
        assert!(has_label!(short_before_episode, "01", "B-EPISODE"));
        assert!(!has_label!(short_before_episode, "15", "B-EPISODE"));

        // --- Leaf that already carries its own title drops every parent directory.
        let avatar_path = "Avatar The Last Airbender S2/Avatar The Last Airbender S2 14 [1080p]";
        let avatar_leaf = trim_checked!(avatar_path);
        assert_eq!(avatar_leaf, "Avatar The Last Airbender S2 14 [1080p]");

        let tintin_path = "Adventures of Tintin (1991) Season 1-3 S01-S03 (1080p BluRay x265 HEVC 10bit EAC3 2.0 Garshasp)/Season 1/Adventures of Tintin (1991) - S01E06 - Cigars of the Pharaoh (Part 1) (1080p BluRay x265 Garshasp)";
        let tintin_leaf = trim_checked!(tintin_path);
        assert_eq!(
            tintin_leaf,
            "Adventures of Tintin (1991) - S01E06 - Cigars of the Pharaoh (Part 1) (1080p BluRay x265 Garshasp)"
        );
        let tintin_key = template_key_for_filename(&tintin_leaf).0;
        assert_eq!(
            tintin_key,
            "TEXT SEP TEXT SEP TEXT SEP BRACKET_DATE SEP SXE SEP TEXT SEP TEXT SEP TEXT SEP TEXT SEP BRACKET_TEXT SEP BRACKET_TEXT"
        );

        // --- Mixed Latin/CJK leaf: checked directly first, then via the full path.
        let bocchi_leaf_expected = "Bocchi the Rock! 孤獨搖滾！S01E01「孤獨反輾轉」";
        let bocchi_key_expected = "TEXT SEP TEXT SEP TEXT SEP TEXT SXE TEXT";
        let bocchi_path = "Bocchi the Rock S01 孤獨搖滾！第一季 [Taiwanese Hokkien Dub][臺灣閩南語配音]/Bocchi the Rock S01 孤獨搖滾！第一季 [Taiwanese Hokkien Dub][Hàn-jī Hardsub][臺灣閩南語配音][漢字字幕]/Bocchi the Rock! 孤獨搖滾！S01E01「孤獨反輾轉」";
        let leaf_key = template_key_for_filename(bocchi_leaf_expected).0;
        assert_eq!(leaf_key, bocchi_key_expected);
        assert!(filename_has_title(bocchi_leaf_expected));
        let bocchi_leaf = trim_checked!(bocchi_path);
        assert_eq!(bocchi_leaf, bocchi_leaf_expected);
        let bocchi_key = template_key_for_filename(&bocchi_leaf).0;
        assert_eq!(bocchi_key, bocchi_key_expected);

        let usagi_path = "Gochuumon wa Usagi Desuka-60fps/Gochuumon wa Usagi Desuka S1/Usagi S1[01][60fps][8bit_1080p][x265_flac]";
        let usagi_leaf = trim_checked!(usagi_path);
        assert_eq!(usagi_leaf, "Usagi S1[01][60fps][8bit_1080p][x265_flac]");
        let usagi_key = template_key_for_filename(&usagi_leaf).0;
        assert_eq!(
            usagi_key,
            "TEXT SEP SEASON BRACKET_EPISODE BRACKET_TEXT BRACKET_MEDIA_BLOCK BRACKET_MEDIA"
        );

        // --- An extra generic parent on top of the Woody "Season 4/..." name is removed.
        let batch_path = format!("Batch/{woody_parent}");
        let batch_leaf = trim_checked!(&batch_path);
        assert_eq!(batch_leaf, woody_parent);

        // --- Volume / menu markers are specials, not episodes.
        let volume =
            labels_for("[Snow-Raws] 生徒会役員共 Vol.01 MENU02 (BD 1920x1080 HEVC-YUV420P10 FLAC)");
        assert!(has_label!(volume, "生徒会役員共", "B-TITLE"));
        assert!(has_label!(volume, "Vol", "B-SPECIAL"));
        assert!(has_label!(volume, "01", "B-SPECIAL"));
        assert!(has_label!(volume, "MENU02", "B-SPECIAL"));
        assert!(!has_label!(volume, "01", "B-EPISODE"));

        // --- Leading number belongs to the title; codec digits are source, not episode.
        let numeric_title =
            labels_for("3000.Leagues.in.Search.of.Mother.S01E01.1080p.WEB-DL.H.264-D00oo00M");
        assert!(has_label!(numeric_title, "3000", "B-TITLE"));
        assert!(has_label!(numeric_title, "01", "B-SEASON"));
        assert!(has_label!(numeric_title, "01", "B-EPISODE"));
        assert!(has_label!(numeric_title, "1080p", "B-RESOLUTION"));
        assert!(has_label!(numeric_title, "H", "B-SOURCE"));
        assert!(has_label!(numeric_title, "264", "B-SOURCE"));
        assert!(!has_label!(numeric_title, "264", "B-EPISODE"));

        // --- Combined media bracket: resolution pair plus codec / subtitle tokens.
        let media_block = labels_for(
            "[Kamigami] Kantai Collection - 06v2 [1920×1080 x264 AAC Sub(Chs,Cht,Jap)]",
        );
        assert!(has_label!(media_block, "1920", "B-RESOLUTION"));
        assert!(has_label!(media_block, "1080", "B-RESOLUTION"));
        assert!(has_label!(media_block, "x264", "B-SOURCE"));
        assert!(has_label!(media_block, "Chs", "B-SOURCE"));
    }
}