Verify low-frequency DMHY generated output

93f322e 10 days ago

93.6 kB

	use anyhow::{bail, Context, Result};
	use chrono::Utc;
	use clap::Parser;
	use once_cell::sync::Lazy;
	use rayon::prelude::*;
	use regex::Regex;
	use serde::{Deserialize, Serialize};
	use serde_json::{json, Value};
	use std::collections::{HashMap, HashSet};
	use std::fs::{self, File};
	use std::io::{BufRead, BufReader, BufWriter, Write};
	use std::path::PathBuf;
	use std::sync::atomic::{AtomicUsize, Ordering};

	// Command-line options for the DMHY template tool.
	//
	// Plain `//` comments are used on purpose: clap's derive API turns `///` doc
	// comments into help text, so doc comments here would change `--help` output.
	#[derive(Parser, Debug)]
	#[command(about = "Apply DMHY template recipes with a multi-core Rust pipeline")]
	struct Args {
	// Mode switch: `main` runs `run_cluster` and returns.
	#[arg(long)]
	cluster: bool,
	// Mode switch: `main` runs `run_low_frequency_audit` and returns.
	#[arg(long)]
	audit_low_frequency: bool,
	// Mode switch: `main` runs `run_verify_generated_output` and returns.
	#[arg(long)]
	verify_generated_output: bool,
	// Input JSONL. `load_input` reads the string field "value" of each row; in
	// verify mode the same path is instead parsed as generated `Record` rows.
	#[arg(long, default_value = "datasets/AnimeName/dmhy_list.jsonl")]
	input: PathBuf,
	// Recipe JSONL read by `load_recipes`.
	#[arg(long, default_value = "reports/dmhy_template_recipes.seed.jsonl")]
	recipes: PathBuf,
	// Generated records (JSONL) written by the default apply mode.
	#[arg(
	long,
	default_value = "reports/dmhy_weak.template_generated.rust.jsonl"
	)]
	output: PathBuf,
	// Manifest JSON written by the default apply mode.
	#[arg(
	long,
	default_value = "reports/dmhy_weak.template_generated.rust.manifest.json"
	)]
	manifest_output: PathBuf,
	// Cluster mode: summary JSON.
	#[arg(
	long,
	default_value = "reports/dmhy_template_clusters.rust.summary.json"
	)]
	summary_output: PathBuf,
	// Cluster mode: the first `top` cluster rows.
	#[arg(
	long,
	default_value = "reports/dmhy_template_clusters.rust.samples.jsonl"
	)]
	samples_output: PathBuf,
	// Cluster mode: every cluster row.
	#[arg(long, default_value = "reports/dmhy_template_clusters.rust.jsonl")]
	clusters_output: PathBuf,
	// Cluster mode: candidates accepted by `is_high_confidence_recipe`.
	#[arg(long, default_value = "reports/dmhy_template_recipes.rust.seed.jsonl")]
	recipes_output: PathBuf,
	// Cluster mode: candidates rejected by `is_high_confidence_recipe`.
	#[arg(long, default_value = "reports/dmhy_template_review.rust.jsonl")]
	review_output: PathBuf,
	// Audit mode: one audited example row per low-frequency template.
	#[arg(long, default_value = "reports/dmhy_low_frequency_audit.rust.jsonl")]
	audit_output: PathBuf,
	// Recipes whose count is <= this value are treated as low-frequency.
	#[arg(long, default_value_t = 50)]
	audit_max_count: u64,
	// Maximum number of non-empty input values `load_input` collects.
	#[arg(long)]
	limit: Option<usize>,
	// `load_recipes` stops reading once this many recipes have been selected.
	#[arg(long)]
	limit_templates: Option<usize>,
	// Recipes with a smaller count are skipped by `load_recipes`; cluster mode
	// also uses it as the threshold for its coverage statistics.
	#[arg(long, default_value_t = 1)]
	min_count: u64,
	// Cluster mode: number of leading cluster rows copied to `samples_output`.
	#[arg(long, default_value_t = 200)]
	top: usize,
	// Cluster mode: number of leading cluster rows considered as recipe candidates.
	#[arg(long, default_value_t = 200)]
	recipe_top: usize,
	// Cluster mode: cap on the number of review rows.
	#[arg(long, default_value_t = 1000)]
	review_top: usize,
	// Cluster mode: example filenames kept per cluster.
	#[arg(long, default_value_t = 8)]
	examples: usize,
	// Cluster mode: minimum cluster count for a high-confidence recipe.
	#[arg(long, default_value_t = 25)]
	recipe_min_count: usize,
	// Only recipes with exactly this confidence are loaded; an empty string
	// disables the filter.
	#[arg(long, default_value = "high")]
	confidence: String,
	// Must be "all" or "sample" (validated in `main` for the apply mode only).
	#[arg(long, default_value = "all")]
	expand: String,
	// With `--expand sample`: per-template cap applied in `process_filename`.
	#[arg(long, default_value_t = 100)]
	sample_per_template: usize,
	// Disables the encoding / non-anime / abstract-path noise filters.
	#[arg(long)]
	keep_encoding_noise: bool,
	// Cluster mode: cluster the original value instead of the result of
	// `training_filename_for`.
	#[arg(long)]
	preserve_parent_paths: bool,
	// Size of the global rayon thread pool; rayon's default is used when absent.
	#[arg(long)]
	threads: Option<usize>,
	}

	/// One row of the recipe JSONL file, as deserialized by `load_recipes`.
	#[derive(Debug, Clone, Deserialize)]
	struct Recipe {
	/// Identifier copied into generated records; also keys the per-template
	/// sample counters built in `main`.
	template_id: String,
	/// Template key; `load_recipes` uses it as the map key that is looked up
	/// with the key returned by `template_key_for_filename`.
	template: String,
	/// Roles passed to `dmhy_record`; callers require `roles.len()` to equal
	/// the number of groups of the filename being processed.
	roles: Vec<String>,
	/// Confidence tag compared against `--confidence` while loading.
	confidence: Option<String>,
	/// Occurrence count; the readers visible in this file treat a missing
	/// value as 0.
	count: Option<u64>,
	}

	/// One generated training row: written as a JSONL line by `main` and read
	/// back by `run_verify_generated_output`.
	#[derive(Debug, Clone, Serialize, Deserialize)]
	struct Record {
	/// Filename the record describes; `audit_warnings` flags it when it still
	/// contains `/` or `\`.
	filename: String,
	/// Tokens, paired position-by-position with `labels` by `entity_spans`.
	tokens: Vec<String>,
	/// Per-token labels; `entity_spans` strips a leading `B-` / `I-` and treats
	/// anything else as outside (`O`).
	labels: Vec<String>,
	/// Id of the recipe that produced this record.
	template_id: String,
	/// Template string of the record (filled in by `dmhy_record`, which is not
	/// visible in this part of the file).
	template: String,
	/// Original input value; set only when the parent path was trimmed.
	#[serde(skip_serializing_if = "Option::is_none")]
	source_filename: Option<String>,
	/// `Some(true)` when the parent path was trimmed; otherwise omitted.
	#[serde(skip_serializing_if = "Option::is_none")]
	path_trimmed: Option<bool>,
	/// NOTE(review): never written in the visible code; presumably populated by
	/// `dmhy_record` — confirm there.
	#[serde(skip_serializing_if = "Option::is_none")]
	dropped_title_candidate_positions: Option<Vec<String>>,
	}

	/// A group returned by `template_key_for_filename`; recipes carry one role
	/// per group.
	#[derive(Debug, Clone)]
	struct Group {
	/// NOTE(review): presumably indices into the token list that belong to this
	/// group (it is consumed by `group_text`, not visible here) — confirm.
	indices: Vec<usize>,
	/// Class of the group; `add_cluster` records literals for "TEXT" and
	/// "BRACKET_TEXT" groups.
	class_name: String,
	}

	/// Counters reported under "stats" in the apply-mode manifest.
	#[derive(Debug, Default, Clone, Serialize)]
	struct Stats {
	/// Number of input values loaded.
	seen: usize,
	/// Inputs skipped with reason "encoding_noise".
	skipped_encoding_noise: usize,
	/// Inputs (written or skipped) whose parent path was trimmed.
	trimmed_parent_path: usize,
	/// Inputs skipped with reason "no_recipe".
	skipped_no_recipe: usize,
	/// Inputs skipped with reason "sample_cap".
	skipped_sample_cap: usize,
	/// Inputs skipped with reason "role_mismatch".
	skipped_role_mismatch: usize,
	/// Inputs skipped with reason "low_frequency_audit_warning".
	skipped_low_frequency_audit_warning: usize,
	/// Records written to the output JSONL.
	written: usize,
	}

	/// Aggregated statistics for all filenames sharing one template key
	/// (filled by `add_cluster`, rendered by `cluster_row`).
	#[derive(Debug, Default)]
	struct Cluster {
	/// Number of filenames added to this cluster.
	count: usize,
	/// The first filenames seen, capped at the caller's example limit.
	examples: Vec<String>,
	/// Occurrences of each wrapper-stripped TEXT / BRACKET_TEXT token.
	literal_counts: HashMap<String, usize>,
	/// Occurrences of each token class.
	class_counts: HashMap<String, usize>,
	/// For each group position, occurrences of the group's text
	/// (TEXT / BRACKET_TEXT groups only).
	position_literals: Vec<HashMap<String, usize>>,
	}

	/// Outcome of `process_filename` for a single input value.
	#[derive(Debug)]
	enum Processed {
	/// A record was produced and should be written.
	Written {
	record: Record,
	/// Whether the parent path was trimmed from the input.
	trimmed_parent: bool,
	},
	/// The input produced no record.
	Skipped {
	/// One of "encoding_noise", "no_recipe", "sample_cap", "role_mismatch" or
	/// "low_frequency_audit_warning"; `main` maps these onto `Stats` counters.
	reason: &'static str,
	/// Whether the parent path was trimmed before the input was skipped.
	trimmed_parent: bool,
	},
	}

	static HASH_RE: Lazy<Regex> = Lazy::new(\|\| Regex::new(r"^[A-Fa-f0-9]{8,}$").unwrap());
	static RESOLUTION_RE: Lazy<Regex> =
	Lazy::new(\|\| Regex::new(r"(?i)^(?:\d{3,4}p\|\dK\|\d{3,4}[xX×]\d{3,4})$").unwrap());
	static EPISODE_VERSION_RE: Lazy<Regex> =
	Lazy::new(\|\| Regex::new(r"(?i)^(?:EP?)?\d{1,4}(?:v\|ver\|version\|rev)\d{1,3}$").unwrap());
	static EPISODE_RE: Lazy<Regex> =
	Lazy::new(\|\| Regex::new(r"(?i)^(?:EP?\|#)?\d{1,4}(?:END)?$").unwrap());
	static EPISODE_CJK_RE: Lazy<Regex> = Lazy::new(\|\| Regex::new(r"^第?\d{1,4}[话話回集]$").unwrap());
	static EPISODE_RANGE_RE: Lazy<Regex> =
	Lazy::new(\|\| Regex::new(r"(?i)^\d{1,4}\s[-~]\s\d{1,4}(?:\s*END)?$").unwrap());
	static EPISODE_BATCH_RE: Lazy<Regex> = Lazy::new(\|\| {
	Regex::new(r"(?i)^\d{1,4}\s[-~]\s\d{1,4}(?:\s(?:TV\|全集\|全\|END\|Fin\|Complete\|SP\|OVA\|OAD\|NCOP\|NCED)\|[+_./-]).{0,16}$").unwrap()
	});
	static SXE_RE: Lazy<Regex> = Lazy::new(\|\| Regex::new(r"(?i)^S\d{1,2}E\d{1,4}(?:v\d+)?$").unwrap());
	static SXE_VALUE_RE: Lazy<Regex> =
	Lazy::new(\|\| Regex::new(r"(?i)^S(\d{1,2})E(\d{1,4})(?:v(\d+))?$").unwrap());
	static EPISODE_VALUE_RE: Lazy<Regex> =
	Lazy::new(\|\| Regex::new(r"(?i)^(EP\|E\|#)(\d{1,4})(?:v(\d+))?$").unwrap());
	static SEASON_RE: Lazy<Regex> = Lazy::new(\|\| {
	Regex::new(r"(?i)^(?:S\d{1,2}\|Season\s*\d{1,2}\|第[一二三四五六七八九十\d]+[季期部])$").unwrap()
	});
	static CJK_SEASON_TOKEN_RE: Lazy<Regex> =
	Lazy::new(\|\| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
	static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(\|\| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
	static SPECIAL_RE: Lazy<Regex> = Lazy::new(\|\| {
	Regex::new(r"(?i)^(?:NCOP\|NCED\|OP\|ED\|PV\|CM\|SP\|OVA\|OAD\|IV\|Menu\|Intro\|Preview\|Trailer\|Teaser\|Animatics?)(?:[\s_.-]?(?:\d{0,4}\|Ep\d{1,4}\|[A-Z]))?$").unwrap()
	});
	static VOLUME_RE: Lazy<Regex> =
	Lazy::new(\|\| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?\|Disc\|CD\|BD\|DVD\|D)\s*\d{1,3}$").unwrap());
	static DATE_RE: Lazy<Regex> =
	Lazy::new(\|\| Regex::new(r"^(?:19\|20)\d{2}(?:[._-]\d{1,2}){0,2}$").unwrap());
	static LANG_RE: Lazy<Regex> = Lazy::new(\|\| {
	Regex::new(r"(?i)^(?:CHS\|CHT\|ZHS\|ZHT\|GB\|BIG5\|JPN?\|JP\|JA\|JAP\|ENG\|EN\|SC\|TC\|简[体體]?\|繁[体體]?\|简日\|繁日\|字幕\|内封\|外挂\|Sub\|Subs\|MSubs?)$").unwrap()
	});
	static MEDIA_RE: Lazy<Regex> = Lazy::new(\|\| {
	Regex::new(r"(?i)^(?:WEB[-_. ]?DL\|WEB[-_. ]?Rip\|BDRip\|BluRay\|BDMV\|BD\|DVDRip\|DVD\|HDTV\|TVRip\|REMUX\|x26[45]\|h\.?26[45]\|HEVC\|AVC\|AV1\|AAC\d(?:\.\d+)?\|FLAC\|MP3\|DTS\|DTS-HDMA\|AC3\|Opus\|10[-_. ]?bit\|8[-_. ]?bit\|Hi10p\|Ma10p\|ASSx?\d\|SRTx?\d\|R\d[A-Z]\|NoSub\|MKV\|MP4\|AVI\|RAW\|Raws?)$").unwrap()
	});
	static SPECIAL_TITLE_PHRASE_RE: Lazy<Regex> = Lazy::new(\|\| {
	Regex::new(r"(?i)\b(?:theater\s+greeting\s+event\|world\s+prem(?:eie\|iere)\|picture\s+drama)\b")
	.unwrap()
	});
	static YEAR_RANGE_RE: Lazy<Regex> =
	Lazy::new(\|\| Regex::new(r"^$?\s(?:19\|20)\d{2}\s[-~]\s(?:19\|20)\d{2}\s$?$").unwrap());
	static PATH_SEGMENT_SEASON_RE: Lazy<Regex> = Lazy::new(\|\| {
	Regex::new(r"(?i)(?:^\|[\s_.\-/])(?:season\s*\d{1,2}\|s\d{1,2})(?:$\|[\s_.\-/])").unwrap()
	});
	static SEASON_WORD_NUMBER_RE: Lazy<Regex> =
	Lazy::new(\|\| Regex::new(r"(?i)(?:season\|saison)\s*0?(\d{1,2})").unwrap());
	static PLAIN_SEASON_SEGMENT_RE: Lazy<Regex> =
	Lazy::new(\|\| Regex::new(r"(?i)^(?:season\|saison)\s*0?\d{1,2}$\|^s0?\d{1,2}$").unwrap());
	static S_NUMBER_SEGMENT_RE: Lazy<Regex> =
	Lazy::new(\|\| Regex::new(r"(?i)(?:^\|[^\p{L}\p{N}])s0?(\d{1,2})(?:$\|[^\p{L}\p{N}])").unwrap());
	static SXE_SEASON_RE: Lazy<Regex> = Lazy::new(\|\| {
	Regex::new(r"(?i)(?:^\|[^\p{L}\p{N}])s0?(\d{1,2})e\d{1,4}(?:$\|[^\p{L}\p{N}])").unwrap()
	});
	static TOKEN_REGEXES: Lazy<Vec<Regex>> = Lazy::new(\|\| {
	[
	r"^\d{3,4}[xX×]\d{3,4}",
	r"(?i)^h\.?26[45]",
	r"(?i)^x\.?26[45]",
	r"^[\\/]+",
	r"^[-_.:：+&\|]+",
	r"^\s+",
	r"(?i)^Season\s*\d{1,2}",
	r"^[A-Za-z]+(?:\d+[A-Za-z])",
	r"^\d+[A-Za-z]+\d*",
	r"^\d{1,4}(?:[._-]\d{1,4})*",
	r"^[\p{Hiragana}\p{Katakana}\p{Han}]+",
	]
	.into_iter()
	.map(\|pattern\| Regex::new(pattern).unwrap())
	.collect()
	});
	static SIMPLE_EPISODE_RE: Lazy<Regex> =
	Lazy::new(\|\| Regex::new(r"(?i)^(?:EP?\|#)?\d{1,4}$").unwrap());
	static SPECIAL_SPACE_RE: Lazy<Regex> = Lazy::new(\|\| Regex::new(r"[\s_.-]+").unwrap());

	/// Entry point.
	///
	/// Parses the CLI, optionally sizes the global rayon pool, and dispatches to
	/// the cluster / audit / verify modes. Otherwise it runs the default apply
	/// mode: load recipes and inputs, process every input in parallel, write the
	/// generated records as JSONL, and write and print a manifest.
	///
	/// # Errors
	/// Fails on an invalid `--expand`, an empty recipe selection, or any I/O or
	/// JSON error.
	fn main() -> Result<()> {
	    let args = Args::parse();
	    if let Some(threads) = args.threads {
	        rayon::ThreadPoolBuilder::new()
	            .num_threads(threads)
	            .build_global()
	            .context("failed to configure rayon thread pool")?;
	    }
	    if args.cluster {
	        return run_cluster(&args);
	    }
	    if args.audit_low_frequency {
	        return run_low_frequency_audit(&args);
	    }
	    if args.verify_generated_output {
	        return run_verify_generated_output(&args);
	    }
	    if args.expand != "all" && args.expand != "sample" {
	        bail!("--expand must be all or sample");
	    }

	    let recipes = load_recipes(&args)?;
	    if recipes.is_empty() {
	        bail!("no recipes selected; adjust --recipes/--confidence/--min-count/--limit-templates");
	    }
	    let inputs = load_input(&args.input, args.limit)?;
	    // One shared counter per template so the parallel workers can enforce
	    // `--sample-per-template` without a lock.
	    let sample_counters: HashMap<String, AtomicUsize> = recipes
	        .values()
	        .map(|recipe| (recipe.template_id.clone(), AtomicUsize::new(0)))
	        .collect();

	    // `collect` on an indexed parallel iterator keeps the input order.
	    let processed: Vec<Processed> = inputs
	        .par_iter()
	        .map(|filename| process_filename(filename, &args, &recipes, &sample_counters))
	        .collect();

	    if let Some(parent) = args.output.parent() {
	        fs::create_dir_all(parent)?;
	    }
	    if let Some(parent) = args.manifest_output.parent() {
	        fs::create_dir_all(parent)?;
	    }

	    let mut stats = Stats {
	        seen: inputs.len(),
	        ..Stats::default()
	    };
	    let mut label_counts: HashMap<String, usize> = HashMap::new();
	    let mut template_counts: HashMap<String, usize> = HashMap::new();
	    let mut examples = Vec::new();
	    let mut writer = BufWriter::new(File::create(&args.output)?);
	    for item in processed {
	        match item {
	            Processed::Written {
	                record,
	                trimmed_parent,
	            } => {
	                if trimmed_parent {
	                    stats.trimmed_parent_path += 1;
	                }
	                for label in &record.labels {
	                    *label_counts.entry(label.clone()).or_default() += 1;
	                }
	                *template_counts
	                    .entry(record.template_id.clone())
	                    .or_default() += 1;
	                // Keep the first 20 records as manifest examples.
	                if examples.len() < 20 {
	                    examples.push(serde_json::to_value(&record)?);
	                }
	                serde_json::to_writer(&mut writer, &record)?;
	                writer.write_all(b"\n")?;
	                stats.written += 1;
	            }
	            Processed::Skipped {
	                reason,
	                trimmed_parent,
	            } => {
	                if trimmed_parent {
	                    stats.trimmed_parent_path += 1;
	                }
	                match reason {
	                    "encoding_noise" => stats.skipped_encoding_noise += 1,
	                    "no_recipe" => stats.skipped_no_recipe += 1,
	                    "sample_cap" => stats.skipped_sample_cap += 1,
	                    "role_mismatch" => stats.skipped_role_mismatch += 1,
	                    "low_frequency_audit_warning" => {
	                        stats.skipped_low_frequency_audit_warning += 1
	                    }
	                    // Unknown reasons are not counted anywhere.
	                    _ => {}
	                }
	            }
	        }
	    }
	    // Flush explicitly: dropping a BufWriter swallows write errors.
	    writer.flush()?;

	    // Most frequent templates first, ties broken by id for a stable manifest.
	    let mut top_template_counts: Vec<_> = template_counts.into_iter().collect();
	    top_template_counts.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
	    top_template_counts.truncate(20);

	    let manifest = json!({
	        "generated_at": Utc::now().to_rfc3339(),
	        "input": args.input.to_string_lossy(),
	        "recipes": args.recipes.to_string_lossy(),
	        "output": args.output.to_string_lossy(),
	        "selected_templates": recipes.len(),
	        "confidence": args.confidence,
	        "min_count": args.min_count,
	        "low_frequency_audit_max_count": args.audit_max_count,
	        "low_frequency_blocking_warnings": [
	            "hash_labeled",
	            "multiple_title_spans",
	            "no_title",
	            "path_retained"
	        ],
	        "expand": args.expand,
	        "sample_per_template": if args.expand == "sample" { Some(args.sample_per_template) } else { None },
	        "stats": stats,
	        "label_counts": label_counts,
	        "top_template_counts": top_template_counts,
	        "examples": examples,
	        "implementation": "rust_dmhy_template_apply"
	    });
	    // Serialize once; the same text goes to the manifest file and to stdout.
	    let rendered = serde_json::to_string_pretty(&manifest)?;
	    fs::write(&args.manifest_output, &rendered)?;
	    println!("{}", rendered);
	    Ok(())
	}

	/// Loads the recipe JSONL at `args.recipes`, keyed by template string.
	///
	/// Blank lines are skipped. A recipe is selected only when its confidence
	/// equals `args.confidence` (an empty `--confidence` disables this filter)
	/// and its count (missing = 0) is at least `args.min_count`. Reading stops
	/// once `args.limit_templates` recipes have been selected; the limit is
	/// checked before each row, so a limit of 0 yields no recipes (it used to
	/// load one, because the check ran after the insert).
	///
	/// # Errors
	/// Fails when the file cannot be opened or read, or a non-blank line is not
	/// a valid recipe.
	fn load_recipes(args: &Args) -> Result<HashMap<String, Recipe>> {
	    let file = File::open(&args.recipes)
	        .with_context(|| format!("recipe JSONL not found: {}", args.recipes.display()))?;
	    let mut recipes = HashMap::new();
	    for (line_number, line) in BufReader::new(file).lines().enumerate() {
	        if args
	            .limit_templates
	            .is_some_and(|limit| recipes.len() >= limit)
	        {
	            break;
	        }
	        let line = line?;
	        if line.trim().is_empty() {
	            continue;
	        }
	        let row: Recipe = serde_json::from_str(&line).with_context(|| {
	            format!(
	                "invalid recipe JSON at {}:{}",
	                args.recipes.display(),
	                line_number + 1
	            )
	        })?;
	        if !args.confidence.is_empty()
	            && row.confidence.as_deref() != Some(args.confidence.as_str())
	        {
	            continue;
	        }
	        if row.count.unwrap_or(0) < args.min_count {
	            continue;
	        }
	        // A later row with the same template replaces the earlier one.
	        recipes.insert(row.template.clone(), row);
	    }
	    Ok(recipes)
	}

	/// Reads the input JSONL and returns the trimmed, non-empty `"value"`
	/// strings in file order.
	///
	/// Blank lines, rows without a string `"value"`, and values that are empty
	/// after trimming are skipped. At most `limit` values are collected when a
	/// limit is given.
	///
	/// # Errors
	/// Fails when the file cannot be opened or read, or a non-blank line is not
	/// valid JSON.
	fn load_input(path: &PathBuf, limit: Option<usize>) -> Result<Vec<String>> {
	    let file =
	        File::open(path).with_context(|| format!("input JSONL not found: {}", path.display()))?;
	    let mut values = Vec::new();
	    for (line_number, line) in BufReader::new(file).lines().enumerate() {
	        if limit.is_some_and(|limit| values.len() >= limit) {
	            break;
	        }
	        let line = line?;
	        if line.trim().is_empty() {
	            continue;
	        }
	        let row: Value = serde_json::from_str(&line)
	            .with_context(|| format!("invalid JSON at {}:{}", path.display(), line_number + 1))?;
	        let value = row
	            .get("value")
	            .and_then(Value::as_str)
	            .map(str::trim)
	            .filter(|value| !value.is_empty());
	        if let Some(value) = value {
	            values.push(value.to_string());
	        }
	    }
	    Ok(values)
	}

	fn run_cluster(args: &Args) -> Result<()> {
	let inputs = load_input(&args.input, args.limit)?;
	let source_rows = inputs.len();
	let mut clusters: HashMap<String, Cluster> = HashMap::new();
	let mut skipped_encoding_noise = 0usize;
	let mut trimmed_parent_path = 0usize;
	let mut total_rows = 0usize;

	for original in inputs {
	if !args.keep_encoding_noise
	&& (has_encoding_noise(&original)
	\|\| has_non_anime_noise(&original)
	\|\| has_abstract_path_noise(&original))
	{
	skipped_encoding_noise += 1;
	continue;
	}
	let filename = if args.preserve_parent_paths {
	original
	} else {
	let (training_filename, was_trimmed) = training_filename_for(&original);
	if was_trimmed {
	trimmed_parent_path += 1;
	}
	training_filename
	};
	add_cluster(&mut clusters, &filename, args.examples);
	total_rows += 1;
	}

	let mut sorted_clusters: Vec<_> = clusters.into_iter().collect();
	sorted_clusters.sort_by(\|a, b\| b.1.count.cmp(&a.1.count).then_with(\|\| a.0.cmp(&b.0)));

	let cluster_rows: Vec<Value> = sorted_clusters
	.iter()
	.enumerate()
	.map(\|(index, (key, cluster))\| cluster_row(index + 1, key, cluster, total_rows))
	.collect();
	let samples: Vec<Value> = cluster_rows.iter().take(args.top).cloned().collect();
	let recipe_candidates: Vec<Value> =
	cluster_rows.iter().take(args.recipe_top).cloned().collect();
	let recipes: Vec<Value> = recipe_candidates
	.iter()
	.filter(\|row\| is_high_confidence_recipe(row, args.recipe_min_count))
	.map(\|row\| recipe_row(row, "high"))
	.collect();
	let review: Vec<Value> = recipe_candidates
	.iter()
	.filter(\|row\| !is_high_confidence_recipe(row, args.recipe_min_count))
	.take(args.review_top)
	.cloned()
	.collect();

	write_jsonl_values(&args.clusters_output, &cluster_rows)?;
	write_jsonl_values(&args.samples_output, &samples)?;
	write_jsonl_values(&args.recipes_output, &recipes)?;
	write_jsonl_values(&args.review_output, &review)?;

	let mut histogram: HashMap<usize, usize> = HashMap::new();
	for (_, cluster) in &sorted_clusters {
	*histogram.entry(cluster.count).or_default() += 1;
	}
	let mut count_histogram_top: Vec<_> = histogram.into_iter().collect();
	count_histogram_top.sort_by(\|a, b\| b.1.cmp(&a.1).then_with(\|\| a.0.cmp(&b.0)));
	count_histogram_top.truncate(20);

	let rows_covered_by_repeated_templates: usize = sorted_clusters
	.iter()
	.map(\|(_, cluster)\| cluster)
	.filter(\|cluster\| cluster.count as u64 >= args.min_count)
	.map(\|cluster\| cluster.count)
	.sum();
	let templates_at_least_min_count = sorted_clusters
	.iter()
	.filter(\|(_, cluster)\| cluster.count as u64 >= args.min_count)
	.count();
	let top_templates: Vec<Value> = cluster_rows.iter().take(20).cloned().collect();
	let summary = json!({
	"input": args.input.to_string_lossy(),
	"source_rows": source_rows,
	"skipped_encoding_noise": skipped_encoding_noise,
	"trimmed_parent_path": trimmed_parent_path,
	"total_rows": total_rows,
	"unique_templates": sorted_clusters.len(),
	"min_count": args.min_count,
	"templates_at_least_min_count": templates_at_least_min_count,
	"rows_covered_by_repeated_templates": rows_covered_by_repeated_templates,
	"rows_covered_by_repeated_templates_ratio": if total_rows == 0 { 0.0 } else { rows_covered_by_repeated_templates as f64 / total_rows as f64 },
	"top_output_rows": samples.len(),
	"clusters_output": args.clusters_output.to_string_lossy(),
	"cluster_rows": cluster_rows.len(),
	"recipes_output": args.recipes_output.to_string_lossy(),
	"recipe_rows": recipes.len(),
	"review_output": args.review_output.to_string_lossy(),
	"review_rows": review.len(),
	"recipe_top": args.recipe_top,
	"recipe_min_count": args.recipe_min_count,
	"top_templates": top_templates,
	"count_histogram_top": count_histogram_top,
	"implementation": "rust_dmhy_template_cluster",
	"generated_at": Utc::now().to_rfc3339(),
	});
	if let Some(parent) = args.summary_output.parent() {
	fs::create_dir_all(parent)?;
	}
	fs::write(
	&args.summary_output,
	serde_json::to_string_pretty(&summary)?,
	)?;
	println!("{}", serde_json::to_string_pretty(&summary)?);
	Ok(())
	}

	/// Adds one filename to the cluster identified by its template key.
	///
	/// Updates the cluster's count, keeps the filename as an example while
	/// fewer than `example_limit` are stored, counts every token class, counts
	/// the wrapper-stripped text of TEXT / BRACKET_TEXT tokens, and counts the
	/// text of TEXT / BRACKET_TEXT groups per group position.
	fn add_cluster(clusters: &mut HashMap<String, Cluster>, filename: &str, example_limit: usize) {
	    let (key, tokens, classes, groups) = template_key_for_filename(filename);
	    let cluster = clusters.entry(key).or_default();
	    cluster.count += 1;
	    if cluster.examples.len() < example_limit {
	        cluster.examples.push(filename.to_string());
	    }
	    for (token, class_name) in tokens.iter().zip(classes.iter()) {
	        *cluster.class_counts.entry(class_name.clone()).or_default() += 1;
	        if matches!(class_name.as_str(), "TEXT" | "BRACKET_TEXT") {
	            let cleaned = strip_wrapper(token);
	            if !cleaned.is_empty() {
	                *cluster.literal_counts.entry(cleaned).or_default() += 1;
	            }
	        }
	    }
	    // Grow only, never shrink: `resize_with` would truncate a longer list.
	    if cluster.position_literals.len() < groups.len() {
	        cluster
	            .position_literals
	            .resize_with(groups.len(), HashMap::new);
	    }
	    for (index, group) in groups.iter().enumerate() {
	        if matches!(group.class_name.as_str(), "TEXT" | "BRACKET_TEXT") {
	            let text = group_text(&tokens, group);
	            if !text.is_empty() {
	                *cluster.position_literals[index].entry(text).or_default() += 1;
	            }
	        }
	    }
	}

	fn cluster_row(rank: usize, key: &str, cluster: &Cluster, total: usize) -> Value {
	json!({
	"template_id": format!("tpl_{rank:06}"),
	"template": key,
	"count": cluster.count,
	"coverage": if total == 0 { 0.0 } else { cluster.count as f64 / total as f64 },
	"top_literals": top_counts(&cluster.literal_counts, 12),
	"suggested_roles": suggested_roles(key),
	"position_top_literals": cluster.position_literals.iter().map(\|counts\| top_counts(counts, 5)).collect::<Vec<_>>(),
	"class_counts": top_counts(&cluster.class_counts, 20),
	"examples": cluster.examples,
	})
	}

	/// Returns up to `limit` `(key, count)` pairs, highest count first, with
	/// ties broken by ascending key so the result is deterministic.
	fn top_counts(counts: &HashMap<String, usize>, limit: usize) -> Vec<(String, usize)> {
	    let mut items: Vec<_> = counts
	        .iter()
	        .map(|(key, count)| (key.clone(), *count))
	        .collect();
	    items.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
	    items.truncate(limit);
	    items
	}

	/// Decides whether a cluster row can be emitted as a high-confidence recipe.
	///
	/// The row qualifies when all of the following hold:
	/// - its `"count"` is at least `min_count`;
	/// - it has a `"suggested_roles"` array;
	/// - no role is ambiguous (contains `_OR_`);
	/// - the roles include `TITLE` and at least one anchor role (any
	///   `EPISODE*`, `SPECIAL`, `SOURCE` or `RESOLUTION`);
	/// - a template with two adjacent `BRACKET_TEXT` groups has a `GROUP` role.
	///
	/// The former trailing `TITLE_OR_TEXT` check was dropped: that role contains
	/// `_OR_` and is therefore already rejected by the ambiguity test.
	fn is_high_confidence_recipe(row: &Value, min_count: usize) -> bool {
	    let count = row.get("count").and_then(Value::as_u64).unwrap_or(0);
	    if count < min_count as u64 {
	        return false;
	    }
	    let Some(roles) = row.get("suggested_roles").and_then(Value::as_array) else {
	        return false;
	    };
	    let role_strings: Vec<&str> = roles.iter().filter_map(Value::as_str).collect();
	    if role_strings.iter().any(|role| role.contains("_OR_")) {
	        return false;
	    }
	    let has_title = role_strings.contains(&"TITLE");
	    let has_anchor = role_strings.iter().any(|role| {
	        role.starts_with("EPISODE") || matches!(*role, "SPECIAL" | "SOURCE" | "RESOLUTION")
	    });
	    if !has_title || !has_anchor {
	        return false;
	    }
	    let template = row.get("template").and_then(Value::as_str).unwrap_or("");
	    let needs_group = template.contains("BRACKET_TEXT BRACKET_TEXT");
	    !needs_group || role_strings.contains(&"GROUP")
	}

	/// Projects a cluster row onto the recipe schema and tags it with
	/// `confidence`. Fields missing from `row` come out as JSON null.
	fn recipe_row(row: &Value, confidence: &str) -> Value {
	    // Indexing a `Value` with an absent key yields `Value::Null`.
	    let field = |name: &str| row[name].clone();
	    json!({
	        "template_id": field("template_id"),
	        "template": field("template"),
	        "roles": field("suggested_roles"),
	        "confidence": confidence,
	        "count": field("count"),
	        "examples": field("examples"),
	    })
	}

	/// Writes `rows` to `path` as JSON Lines (one compact JSON value per line),
	/// creating the parent directory first and replacing any existing file.
	///
	/// # Errors
	/// Fails on directory creation, file creation, serialization or write errors.
	fn write_jsonl_values(path: &PathBuf, rows: &[Value]) -> Result<()> {
	    if let Some(dir) = path.parent() {
	        fs::create_dir_all(dir)?;
	    }
	    let mut out = BufWriter::new(File::create(path)?);
	    rows.iter().try_for_each(|value| -> Result<()> {
	        serde_json::to_writer(&mut out, value)?;
	        out.write_all(b"\n")?;
	        Ok(())
	    })?;
	    // Flush explicitly so a late write error is reported rather than dropped.
	    out.flush()?;
	    Ok(())
	}

	fn run_low_frequency_audit(args: &Args) -> Result<()> {
	let recipes = load_recipes(args)?;
	let inputs = load_input(&args.input, args.limit)?;
	let low_template_total = recipes
	.values()
	.filter(\|recipe\| recipe.count.unwrap_or(0) <= args.audit_max_count)
	.count();
	let mut seen_templates = HashSet::new();
	let mut rows = Vec::new();

	for original in inputs {
	if !args.keep_encoding_noise
	&& (has_encoding_noise(&original)
	\|\| has_non_anime_noise(&original)
	\|\| has_abstract_path_noise(&original))
	{
	continue;
	}
	let (training_filename, trimmed_parent) = training_filename_for(&original);
	let (key, _tokens, _classes, groups) = template_key_for_filename(&training_filename);
	let Some(recipe) = recipes.get(&key) else {
	continue;
	};
	let count = recipe.count.unwrap_or(0);
	if count > args.audit_max_count \|\| !seen_templates.insert(recipe.template_id.clone()) {
	continue;
	}
	if recipe.roles.len() != groups.len() {
	continue;
	}
	let Some(mut record) = dmhy_record(&training_filename, &recipe.template_id, &recipe.roles)
	else {
	continue;
	};
	if trimmed_parent {
	record.source_filename = Some(original.clone());
	record.path_trimmed = Some(true);
	}
	rows.push(json!({
	"template_id": recipe.template_id,
	"count": count,
	"template": recipe.template,
	"filename": record.filename,
	"source_filename": record.source_filename,
	"path_trimmed": record.path_trimmed.unwrap_or(false),
	"spans": entity_spans(&record.tokens, &record.labels),
	"warnings": audit_warnings(&record),
	"tokens": record.tokens,
	"labels": record.labels,
	}));
	if seen_templates.len() >= low_template_total {
	break;
	}
	}

	rows.sort_by(\|a, b\| {
	let count_a = a.get("count").and_then(Value::as_u64).unwrap_or(0);
	let count_b = b.get("count").and_then(Value::as_u64).unwrap_or(0);
	let id_a = a.get("template_id").and_then(Value::as_str).unwrap_or("");
	let id_b = b.get("template_id").and_then(Value::as_str).unwrap_or("");
	count_a.cmp(&count_b).then_with(\|\| id_a.cmp(id_b))
	});
	write_jsonl_values(&args.audit_output, &rows)?;
	let warning_counts = warning_counts(&rows);
	let manifest = json!({
	"generated_at": Utc::now().to_rfc3339(),
	"input": args.input.to_string_lossy(),
	"recipes": args.recipes.to_string_lossy(),
	"audit_output": args.audit_output.to_string_lossy(),
	"audit_max_count": args.audit_max_count,
	"low_template_total": low_template_total,
	"audited_templates": rows.len(),
	"warning_counts": warning_counts,
	"implementation": "rust_dmhy_low_frequency_audit"
	});
	println!("{}", serde_json::to_string_pretty(&manifest)?);
	Ok(())
	}

	/// Verify mode: re-checks a generated JSONL file (`args.input`) and fails
	/// when any low-frequency record still carries a blocking warning.
	///
	/// A record is low-frequency when its template's recipe count is
	/// <= `args.audit_max_count`; records whose template id is not among the
	/// loaded recipes are counted in `rows` but otherwise ignored. Blocking
	/// warnings are `hash_labeled`, `multiple_title_spans`, `no_title` and
	/// `path_retained`; up to five examples per warning go into the printed
	/// manifest.
	///
	/// # Errors
	/// Fails on I/O or JSON errors, and — after printing the manifest — when at
	/// least one blocking warning was found.
	fn run_verify_generated_output(args: &Args) -> Result<()> {
	    let file = File::open(&args.input)
	        .with_context(|| format!("generated JSONL not found: {}", args.input.display()))?;
	    let recipes_by_id: HashMap<String, u64> = load_recipes(args)?
	        .into_values()
	        .map(|recipe| (recipe.template_id, recipe.count.unwrap_or(0)))
	        .collect();
	    let mut rows = 0usize;
	    let mut low_frequency_rows = 0usize;
	    let mut warning_counts: HashMap<String, usize> = HashMap::new();
	    let mut examples: HashMap<String, Vec<Value>> = HashMap::new();

	    for (line_number, line) in BufReader::new(file).lines().enumerate() {
	        let line = line?;
	        if line.trim().is_empty() {
	            continue;
	        }
	        let record: Record = serde_json::from_str(&line).with_context(|| {
	            format!(
	                "invalid generated record at {}:{}",
	                args.input.display(),
	                line_number + 1
	            )
	        })?;
	        rows += 1;
	        // Unknown templates get u64::MAX so they never count as low-frequency.
	        let count = recipes_by_id
	            .get(&record.template_id)
	            .copied()
	            .unwrap_or(u64::MAX);
	        if count > args.audit_max_count {
	            continue;
	        }
	        low_frequency_rows += 1;
	        for warning in audit_warnings(&record) {
	            if !matches!(
	                warning.as_str(),
	                "hash_labeled" | "multiple_title_spans" | "no_title" | "path_retained"
	            ) {
	                continue;
	            }
	            *warning_counts.entry(warning.clone()).or_default() += 1;
	            let bucket = examples.entry(warning).or_default();
	            if bucket.len() < 5 {
	                bucket.push(json!({
	                    "template_id": record.template_id,
	                    "template_count": count,
	                    "filename": record.filename,
	                    "spans": entity_spans(&record.tokens, &record.labels),
	                }));
	            }
	        }
	    }

	    let manifest = json!({
	        "generated_at": Utc::now().to_rfc3339(),
	        "input": args.input.to_string_lossy(),
	        "recipes": args.recipes.to_string_lossy(),
	        "audit_max_count": args.audit_max_count,
	        "rows": rows,
	        "low_frequency_rows": low_frequency_rows,
	        "blocking_warning_counts": warning_counts,
	        "examples": examples,
	        "implementation": "rust_dmhy_generated_output_verify"
	    });
	    println!("{}", serde_json::to_string_pretty(&manifest)?);
	    if !warning_counts.is_empty() {
	        bail!("generated output still has low-frequency blocking warnings");
	    }
	    Ok(())
	}

	/// Collapses token/label pairs into entity spans.
	///
	/// A label's entity is the text after a leading `B-` or `I-`; any other
	/// label counts as outside (`O`). Consecutive tokens with the same entity
	/// are concatenated into one `{"label", "text"}` object, and outside runs
	/// are dropped. Pairing stops at the shorter of the two slices.
	///
	/// NOTE(review): a `B-X` label directly after another `X` token extends the
	/// current span instead of starting a new one, so two adjacent entities of
	/// the same type are reported as one. Confirm this is intended — it affects
	/// the `multiple_title_spans` warning.
	fn entity_spans(tokens: &[String], labels: &[String]) -> Vec<Value> {
	    let mut spans = Vec::new();
	    let mut current_label: Option<String> = None;
	    let mut current_text = String::new();
	    for (token, label) in tokens.iter().zip(labels.iter()) {
	        let entity = label
	            .strip_prefix("B-")
	            .or_else(|| label.strip_prefix("I-"))
	            .unwrap_or("O");
	        if current_label.as_deref() == Some(entity) {
	            current_text.push_str(token);
	            continue;
	        }
	        // Entity changed: emit the finished span unless it was an outside run.
	        if let Some(label) = current_label.take() {
	            if label != "O" {
	                spans.push(json!({ "label": label, "text": current_text }));
	            }
	        }
	        current_label = Some(entity.to_string());
	        current_text = token.clone();
	    }
	    if let Some(label) = current_label {
	        if label != "O" {
	            spans.push(json!({ "label": label, "text": current_text }));
	        }
	    }
	    spans
	}

	fn audit_warnings(record: &Record) -> Vec<String> {
	let mut warnings = Vec::new();
	let title_spans = entity_spans(&record.tokens, &record.labels)
	.into_iter()
	.filter(\|span\| span.get("label").and_then(Value::as_str) == Some("TITLE"))
	.count();
	if title_spans == 0 {
	warnings.push("no_title".to_string());
	} else if title_spans > 1 {
	warnings.push("multiple_title_spans".to_string());
	}
	if !record.labels.iter().any(\|label\| label.ends_with("EPISODE")) {
	warnings.push("no_episode".to_string());
	}
	if record.filename.contains('/') \|\| record.filename.contains('\\') {
	warnings.push("path_retained".to_string());
	}
	for (index, token) in record.tokens.iter().enumerate() {
	if HASH_RE.is_match(token) && record.labels.get(index).is_some_and(\|label\| label != "O") {
	warnings.push("hash_labeled".to_string());
	break;
	}
	}
	warnings.sort();
	warnings.dedup();
	warnings
	}

	/// Tallies how often each warning string occurs across the `"warnings"`
	/// arrays of `rows`. Rows without such an array and non-string entries are
	/// ignored.
	fn warning_counts(rows: &[Value]) -> HashMap<String, usize> {
	    let mut tally = HashMap::new();
	    let names = rows
	        .iter()
	        .filter_map(|row| row.get("warnings").and_then(Value::as_array))
	        .flatten()
	        .filter_map(Value::as_str);
	    for name in names {
	        *tally.entry(name.to_string()).or_default() += 1;
	    }
	    tally
	}

	fn process_filename(
	original: &str,
	args: &Args,
	recipes: &HashMap<String, Recipe>,
	sample_counters: &HashMap<String, AtomicUsize>,
	) -> Processed {
	if !args.keep_encoding_noise
	&& (has_encoding_noise(original)
	\|\| has_non_anime_noise(original)
	\|\| has_abstract_path_noise(original))
	{
	return Processed::Skipped {
	reason: "encoding_noise",
	trimmed_parent: false,
	};
	}
	let (training_filename, trimmed_parent) = training_filename_for(original);
	let (key, _tokens, _classes, groups) = template_key_for_filename(&training_filename);
	let recipe = match recipes.get(&key) {
	Some(recipe) => recipe,
	None => {
	return Processed::Skipped {
	reason: "no_recipe",
	trimmed_parent,
	}
	}
	};
	if args.expand == "sample" {
	let counter = sample_counters.get(&recipe.template_id).unwrap();
	if counter.fetch_add(1, Ordering::Relaxed) >= args.sample_per_template {
	return Processed::Skipped {
	reason: "sample_cap",
	trimmed_parent,
	};
	}
	}
	if recipe.roles.len() != groups.len() {
	return Processed::Skipped {
	reason: "role_mismatch",
	trimmed_parent,
	};
	}
	let mut record = match dmhy_record(&training_filename, &recipe.template_id, &recipe.roles) {
	Some(record) => record,
	None => {
	return Processed::Skipped {
	reason: "role_mismatch",
	trimmed_parent,
	}
	}
	};
	if recipe.count.unwrap_or(0) <= args.audit_max_count && has_blocking_low_frequency_warning(&record)
	{
	return Processed::Skipped {
	reason: "low_frequency_audit_warning",
	trimmed_parent,
	};
	}
	if trimmed_parent {
	record.source_filename = Some(original.to_string());
	record.path_trimmed = Some(true);
	return Processed::Written {
	record,
	trimmed_parent: true,
	};
	}
	Processed::Written {
	record,
	trimmed_parent: false,
	}
	}

	/// Returns true when the record has at least one audit warning that blocks
	/// low-frequency output: `hash_labeled`, `multiple_title_spans`, `no_title`
	/// or `path_retained`. (`no_episode` is not blocking.)
	fn has_blocking_low_frequency_warning(record: &Record) -> bool {
	    audit_warnings(record).iter().any(|warning| {
	        matches!(
	            warning.as_str(),
	            "hash_labeled" | "multiple_title_spans" | "no_title" | "path_retained"
	        )
	    })
	}

	/// Splits `value` into consecutive tokens that together cover the whole
	/// string. Each step takes the token proposed by `next_token`; when it
	/// proposes none, the next single character becomes a token of its own.
	fn tokenize(value: &str) -> Vec<String> {
	    let mut tokens = Vec::new();
	    let mut remaining = value;
	    while let Some(first_char) = remaining.chars().next() {
	        let (token, consumed) = next_token(remaining)
	            .unwrap_or_else(|| (first_char.to_string(), first_char.len_utf8()));
	        tokens.push(token);
	        // `consumed` is a byte length that ends on a character boundary.
	        remaining = &remaining[consumed..];
	    }
	    tokens
	}

	/// Returns the token at the start of `rest` and its length in bytes, or
	/// `None` when `rest` is empty or nothing matches.
	///
	/// A leading `[`, `(` or `【` yields the whole bracketed run up to the first
	/// matching closer, provided the run is short enough; otherwise the
	/// patterns in `TOKEN_REGEXES` are tried in order and the first non-empty
	/// match at position 0 wins.
	///
	/// NOTE(review): the length limit is measured in bytes for `[`/`(` (closer
	/// at byte offset <= 121) but in characters for `【` (at most 120 characters
	/// before the closer), so CJK text inside ASCII brackets hits the limit
	/// about three times sooner. Kept as found, since template keys depend on it.
	fn next_token(rest: &str) -> Option<(String, usize)> {
	    let first = rest.chars().next()?;
	    let bracketed_len = match first {
	        '[' => rest.find(']').filter(|&end| end <= 121).map(|end| end + 1),
	        '(' => rest.find(')').filter(|&end| end <= 121).map(|end| end + 1),
	        '【' => rest
	            .find('】')
	            .filter(|&end| rest[..end].chars().count() <= 120)
	            .map(|end| end + '】'.len_utf8()),
	        _ => None,
	    };
	    if let Some(len) = bracketed_len {
	        return Some((rest[..len].to_string(), len));
	    }
	    TOKEN_REGEXES
	        .iter()
	        .filter_map(|re| re.find(rest))
	        .find(|mat| mat.start() == 0 && mat.end() > 0)
	        .map(|mat| (mat.as_str().to_string(), mat.end()))
	}

	/// Removes one pair of enclosing brackets (`[]`, `()` or `【】`) from
	/// `token` and trims surrounding whitespace from the result.
	///
	/// The brackets must be the very first and last characters and must match
	/// each other; otherwise the token is returned trimmed but otherwise
	/// unchanged.
	fn strip_wrapper(token: &str) -> String {
	    const WRAPPERS: [(char, char); 3] = [('[', ']'), ('(', ')'), ('【', '】')];
	    let inner = WRAPPERS
	        .iter()
	        .find_map(|&(open, close)| token.strip_prefix(open)?.strip_suffix(close));
	    inner.unwrap_or(token).trim().to_string()
	}

	/// Splits bracket content into its non-empty parts.
	///
	/// Separators are any whitespace and the characters
	/// `_ . , + / & | - ( ) （ ）`; runs of separators yield no empty parts.
	fn split_inner(inner: &str) -> Vec<String> {
	    inner
	        .split(|ch: char| {
	            ch.is_whitespace()
	                || matches!(
	                    ch,
	                    '_' | '.' | ',' | '+' | '/' | '&' | '|' | '-' | '(' | ')' | '（' | '）'
	                )
	        })
	        .filter(|part| !part.is_empty())
	        .map(str::to_string)
	        .collect()
	}

	/// Returns `text` with all whitespace and the characters `_ . , -` removed,
	/// so that e.g. `EP 01_v2` can be matched as `EP01v2`.
	fn compact_for_classify(text: &str) -> String {
	    text.chars()
	        .filter(|ch| !ch.is_whitespace() && !matches!(ch, '_' | '.' | ',' | '-'))
	        .collect()
	}

	fn classify_atom(text: &str) -> String {
	let cleaned = strip_wrapper(text);
	let compact = compact_for_classify(&cleaned);
	if cleaned.is_empty() {
	return "EMPTY".to_string();
	}
	if HASH_RE.is_match(&cleaned) {
	return "HASH".to_string();
	}
	if RESOLUTION_RE.is_match(&cleaned) {
	return "RESOLUTION".to_string();
	}
	if DATE_RE.is_match(&cleaned) {
	return "DATE".to_string();
	}
	if EPISODE_VERSION_RE.is_match(&compact) {
	return "EPISODE_VERSION".to_string();
	}
	if SXE_RE.is_match(&compact) {
	return "SXE".to_string();
	}
	if EPISODE_RE.is_match(&compact) {
	return "EPISODE".to_string();
	}
	if EPISODE_CJK_RE.is_match(&cleaned) {
	return "EPISODE".to_string();
	}
	if EPISODE_BATCH_RE.is_match(&cleaned) {
	return "EPISODE_RANGE".to_string();
	}
	if EPISODE_RANGE_RE.is_match(&cleaned) {
	return "EPISODE_RANGE".to_string();
	}
	if EPISODE_RE.is_match(&cleaned) {
	return "EPISODE".to_string();
	}
	if SEASON_RE.is_match(&cleaned) {
	return "SEASON".to_string();
	}
	if SPECIAL_RE.is_match(&cleaned) {
	return "SPECIAL".to_string();
	}
	if VOLUME_RE.is_match(&cleaned) {
	return "VOLUME".to_string();
	}
	if LANG_RE.is_match(&cleaned) \|\| lang_block_matches(&cleaned) {
	return "LANG".to_string();
	}
	if MEDIA_RE.is_match(&cleaned) {
	return "MEDIA".to_string();
	}
	"TEXT".to_string()
	}

	/// Reports whether `text` contains a subtitle-language marker.
	///
	/// Three kinds of evidence count:
	/// * an ASCII marker (`CHS`, `CHT`, `ZHS`, `ZHT`, `BIG5`, `GB`), compared
	///   case-insensitively for ASCII letters;
	/// * one of the fixed CJK markers listed below;
	/// * the word `字幕` when it is not immediately followed by `组` or `組`.
	fn lang_block_matches(text: &str) -> bool {
	    const ASCII_MARKERS: [&str; 6] = ["CHS", "CHT", "ZHS", "ZHT", "BIG5", "GB"];
	    const CJK_MARKERS: [&str; 9] = [
	        "简繁", "简日", "繁日", "简体", "繁体", "雙語", "双语", "内封", "外挂",
	    ];
	    const SUBTITLE: &str = "字幕";

	    let upper = text.to_ascii_uppercase();
	    if ASCII_MARKERS.iter().any(|marker| upper.contains(*marker)) {
	        return true;
	    }
	    if CJK_MARKERS.iter().any(|marker| text.contains(*marker)) {
	        return true;
	    }
	    // Any occurrence of 字幕 counts unless 组/組 comes right after it.
	    text.match_indices(SUBTITLE).any(|(at, found)| {
	        let following = text[at + found.len()..].chars().next();
	        !matches!(following, Some('组' | '組'))
	    })
	}

	fn classify_token(token: &str) -> String {
	if token.is_empty() {
	return "EMPTY".to_string();
	}
	if token.chars().all(char::is_whitespace) {
	return "SPACE".to_string();
	}
	if token.chars().all(\|ch\| ch == '/' \|\| ch == '\\') {
	return "PATH".to_string();
	}
	if token.chars().all(\|ch\| "-_.:：+&\|".contains(ch)) {
	return "SEP".to_string();
	}
	if token.starts_with('[') \|\| token.starts_with('(') \|\| token.starts_with('【') {
	let inner = strip_wrapper(token);
	let parts = split_inner(&inner);
	let whole_class = classify_atom(&inner);
	let inner_class = if whole_class != "TEXT" {
	if whole_class == "LANG" && parts.len() > 1 {
	let part_classes: Vec<String> =
	parts.iter().map(\|part\| classify_atom(part)).collect();
	if part_classes.iter().all(\|item\| item == &part_classes[0]) {
	part_classes[0].clone()
	} else if part_classes.iter().all(\|item\| is_media_block_class(item)) {
	"MEDIA_BLOCK".to_string()
	} else {
	whole_class
	}
	} else {
	whole_class
	}
	} else if parts.is_empty() {
	"EMPTY".to_string()
	} else {
	let part_classes: Vec<String> = parts.iter().map(\|part\| classify_atom(part)).collect();
	if part_classes.iter().all(\|item\| item == &part_classes[0]) {
	part_classes[0].clone()
	} else if part_classes.iter().all(\|item\| is_media_block_class(item)) {
	"MEDIA_BLOCK".to_string()
	} else if part_classes.iter().any(\|item\| is_media_block_class(item))
	&& parts.iter().zip(part_classes.iter()).all(\|(part, item)\| {
	is_media_block_class(item)
	\|\| matches!(part.to_ascii_lowercase().as_str(), "anime" \| "アニメ")
	})
	{
	"MEDIA_BLOCK".to_string()
	} else if part_classes.iter().any(\|item\| item == "TEXT") {
	"TEXT".to_string()
	} else {
	let mut set: Vec<String> = part_classes
	.into_iter()
	.collect::<HashSet<_>>()
	.into_iter()
	.collect();
	set.sort();
	set.join("_")
	}
	};
	return format!("BRACKET_{inner_class}");
	}
	classify_atom(token)
	}

	/// Returns `true` for the classes that may appear inside a media block:
	/// `MEDIA`, `RESOLUTION`, `LANG`, `HASH` and `DATE`.
	fn is_media_block_class(value: &str) -> bool {
	    const MEDIA_BLOCK_CLASSES: [&str; 5] = ["MEDIA", "RESOLUTION", "LANG", "HASH", "DATE"];
	    MEDIA_BLOCK_CLASSES.contains(&value)
	}

	/// Groups token indices by class, merging neighbouring separators and
	/// neighbouring text.
	///
	/// `SPACE` is treated as `SEP`. A token joins the previous group only when
	/// both have the same class and that class is `SEP` or `TEXT`; every other
	/// token starts a group of its own. `_tokens` is unused.
	fn compact_token_groups(_tokens: &[String], classes: &[String]) -> Vec<Group> {
	    let mut groups: Vec<Group> = Vec::new();
	    for (index, token_class) in classes.iter().enumerate() {
	        let class_name = match token_class.as_str() {
	            "SPACE" => "SEP",
	            other => other,
	        };
	        let extends_last = matches!(class_name, "SEP" | "TEXT")
	            && groups
	                .last()
	                .is_some_and(|open| open.class_name == class_name);
	        if extends_last {
	            if let Some(open) = groups.last_mut() {
	                open.indices.push(index);
	            }
	        } else {
	            groups.push(Group {
	                indices: vec![index],
	                class_name: class_name.to_string(),
	            });
	        }
	    }
	    groups
	}

	/// Tokenizes `filename` and derives its template key.
	///
	/// Returns `(key, tokens, classes, groups)`, where `key` is the group class
	/// names joined by single spaces, `classes` holds one class per token and
	/// `groups` is the compacted grouping of those tokens.
	fn template_key_for_filename(filename: &str) -> (String, Vec<String>, Vec<String>, Vec<Group>) {
	    let tokens = tokenize(filename);
	    let classes = tokens
	        .iter()
	        .map(|token| classify_token(token))
	        .collect::<Vec<String>>();
	    let groups = compact_token_groups(&tokens, &classes);

	    let mut key = String::new();
	    for (position, group) in groups.iter().enumerate() {
	        if position > 0 {
	            key.push(' ');
	        }
	        key.push_str(&group.class_name);
	    }
	    (key, tokens, classes, groups)
	}

	/// Proposes one role per class of a whitespace-separated template key.
	///
	/// Each class first gets the role implied by its own name. The template is
	/// then cut into segments at `PATH` items, and inside each segment the
	/// still-unassigned `BRACKET_TEXT` / `TEXT` items that precede the first
	/// structural item (episode, `SXE`, `SPECIAL`, `SEASON`) are promoted to
	/// `GROUP` or `TITLE`. A segment that starts with an episode class and has no
	/// title falls back to the first run of at least two `TEXT` items after it.
	fn suggested_roles(template: &str) -> Vec<String> {
	    /// Role implied by a class name alone, before positional heuristics.
	    fn intrinsic_role(class: &str) -> &'static str {
	        // First matching substring wins, so the longer EPISODE_* names must
	        // come before plain EPISODE.
	        const BY_SUBSTRING: [(&str, &str); 11] = [
	            ("EPISODE_VERSION", "EPISODE_VERSION"),
	            ("EPISODE_RANGE", "EPISODE_RANGE"),
	            ("EPISODE", "EPISODE"),
	            ("SXE", "EPISODE"),
	            ("RESOLUTION", "RESOLUTION"),
	            ("HASH", "HASH"),
	            ("LANG", "SOURCE"),
	            ("MEDIA", "SOURCE"),
	            ("SPECIAL", "SPECIAL"),
	            ("SEASON", "SEASON"),
	            ("VOLUME", "VOLUME"),
	        ];
	        BY_SUBSTRING
	            .iter()
	            .find(|(needle, _)| class.contains(*needle))
	            .map_or("O", |&(_, role)| role)
	    }

	    let items: Vec<&str> = template.split_whitespace().collect();
	    let mut roles: Vec<String> = items
	        .iter()
	        .map(|item| intrinsic_role(item).to_string())
	        .collect();

	    // A segment runs from the item after one PATH up to (not including) the next.
	    let path_positions: Vec<usize> = items
	        .iter()
	        .enumerate()
	        .filter(|(_, item)| **item == "PATH")
	        .map(|(position, _)| position)
	        .collect();
	    let segment_starts = std::iter::once(0).chain(path_positions.iter().map(|at| at + 1));
	    let segment_ends = path_positions
	        .iter()
	        .copied()
	        .chain(std::iter::once(items.len()));

	    for (start, end) in segment_starts.zip(segment_ends) {
	        if start >= end {
	            continue;
	        }
	        let first_structural = (start..end)
	            .find(|&index| {
	                items[index].contains("EPISODE")
	                    || matches!(items[index], "SXE" | "SPECIAL" | "SEASON")
	            })
	            .unwrap_or(end);
	        let bracket_text: Vec<usize> = (start..first_structural)
	            .filter(|&index| items[index] == "BRACKET_TEXT" && roles[index] == "O")
	            .collect();
	        let text: Vec<usize> = (start..first_structural)
	            .filter(|&index| items[index] == "TEXT" && roles[index] == "O")
	            .collect();

	        match bracket_text.as_slice() {
	            [] => {}
	            [only] => {
	                // A lone bracket is the group only if it leads the segment and
	                // plain text is available to serve as the title.
	                let role = if !text.is_empty() && *only == start {
	                    "GROUP"
	                } else {
	                    "TITLE"
	                };
	                roles[*only] = role.to_string();
	            }
	            [group, titles @ ..] => {
	                roles[*group] = "GROUP".to_string();
	                for &index in titles {
	                    roles[index] = "TITLE".to_string();
	                }
	            }
	        }
	        for &index in &text {
	            roles[index] = "TITLE".to_string();
	        }

	        let has_title = roles[start..end].iter().any(|role| role == "TITLE");
	        if has_title || !items[start].contains("EPISODE") {
	            continue;
	        }
	        // Episode-first segment: take the first run of TEXT items, where SEP
	        // items do not interrupt the run.
	        let mut run = Vec::new();
	        for index in (start + 1)..end {
	            if items[index] == "TEXT" && roles[index] == "O" {
	                run.push(index);
	            } else if items[index] != "SEP" && !run.is_empty() {
	                break;
	            }
	        }
	        if run.len() >= 2 {
	            for index in run {
	                roles[index] = "TITLE".to_string();
	            }
	        }
	    }
	    roles
	}

	/// Returns `true` when the roles suggested for `filename`'s template key
	/// include at least one `TITLE`.
	fn filename_has_title(filename: &str) -> bool {
	    let (template, ..) = template_key_for_filename(filename);
	    suggested_roles(&template)
	        .into_iter()
	        .any(|role| role == "TITLE")
	}

	/// Chooses which part of a path is used as the training filename.
	///
	/// Returns `(name, trimmed)`. The path is left as is (`trimmed == false`)
	/// unless it has at least two non-empty segments and the last one has a
	/// title of its own. In that case only the last segment is kept, except
	/// when the path has three or more segments and the parent is a plain
	/// season folder whose season numbers do not overlap with those found in
	/// the last segment; then `parent/leaf` is kept.
	fn training_filename_for(original: &str) -> (String, bool) {
	    let segments: Vec<&str> = original
	        .split(|ch: char| matches!(ch, '/' | '\\'))
	        .map(str::trim)
	        .filter(|segment| !segment.is_empty())
	        .collect();
	    let (parent, leaf) = match segments.as_slice() {
	        [.., parent, leaf] if filename_has_title(leaf) => (*parent, *leaf),
	        _ => return (original.to_string(), false),
	    };

	    let keep_parent = segments.len() >= 3
	        && path_segment_has_season(parent)
	        && path_segment_is_plain_season(parent)
	        && path_segment_seasons(parent).is_disjoint(&path_segment_seasons(leaf));
	    let name = if keep_parent {
	        format!("{parent}/{leaf}")
	    } else {
	        leaf.to_string()
	    };
	    (name, true)
	}

	/// Tests the unwrapped, trimmed `segment` against `PLAIN_SEASON_SEGMENT_RE`.
	fn path_segment_is_plain_season(segment: &str) -> bool {
	    let unwrapped = strip_wrapper(segment);
	    PLAIN_SEASON_SEGMENT_RE.is_match(unwrapped.trim())
	}

	/// Returns `true` when `PATH_SEGMENT_SEASON_RE` matches anywhere in `value`.
	fn path_segment_has_season(value: &str) -> bool {
	    let mentions_season = PATH_SEGMENT_SEASON_RE.is_match(value);
	    mentions_season
	}

	/// Collects every season number found in `value`.
	///
	/// The first capture group of each match of the three season patterns is
	/// parsed as `u8`; matches without that group, or whose text does not parse,
	/// are ignored.
	fn path_segment_seasons(value: &str) -> HashSet<u8> {
	    let mut seasons = HashSet::new();
	    for pattern in [
	        &*SEASON_WORD_NUMBER_RE,
	        &*S_NUMBER_SEGMENT_RE,
	        &*SXE_SEASON_RE,
	    ] {
	        for captures in pattern.captures_iter(value) {
	            let Some(number) = captures.get(1) else {
	                continue;
	            };
	            if let Ok(season) = number.as_str().parse::<u8>() {
	                seasons.insert(season);
	            }
	        }
	    }
	    seasons
	}

	/// Heuristically flags text that looks like it was decoded with the wrong
	/// encoding.
	///
	/// A U+FFFD replacement character is always noise. Otherwise the text is
	/// noise when it contains at least two of the marker characters below, or
	/// exactly one marker together with a character in U+FF61..=U+FF9F.
	fn has_encoding_noise(value: &str) -> bool {
	    const NOISE_MARKERS: [&str; 12] = [
	        "譁", "蜈", "螟", "蟄", "謇", "邱", "荳", "縺", "繧", "莨", "鬆", "髯",
	    ];
	    if value.contains('\u{fffd}') {
	        return true;
	    }
	    let mut marker_hits = 0usize;
	    for marker in NOISE_MARKERS.iter() {
	        marker_hits += value.matches(*marker).count();
	    }
	    if marker_hits >= 2 {
	        return true;
	    }
	    marker_hits == 1
	        && value
	            .chars()
	            .any(|ch| matches!(ch, '\u{ff61}'..='\u{ff9f}'))
	}

	/// Detects paths that are, or pass through, an `mtv` directory.
	///
	/// Backslashes are treated as `/`, surrounding whitespace is ignored and
	/// the comparison ignores ASCII case. The path matches when it is exactly
	/// `mtv`, starts with `mtv/`, or contains `/mtv/`.
	fn has_non_anime_noise(value: &str) -> bool {
	    let slashed = value.replace('\\', "/");
	    let path = slashed.trim().to_ascii_lowercase();
	    match path.strip_prefix("mtv") {
	        Some(rest) if rest.is_empty() || rest.starts_with('/') => true,
	        _ => path.contains("/mtv/"),
	    }
	}

	/// Returns `value` with all whitespace removed and ASCII letters lowercased,
	/// for comparing path segments loosely.
	fn normalized_path_segment(value: &str) -> String {
	    let mut normalized = String::with_capacity(value.len());
	    for word in value.split_whitespace() {
	        normalized.push_str(word);
	    }
	    normalized.make_ascii_lowercase();
	    normalized
	}

	fn path_segment_is_episodeish(value: &str) -> bool {
	let (_, _, _, groups) = template_key_for_filename(value);
	let structural: Vec<&String> = groups
	.iter()
	.map(\|group\| &group.class_name)
	.filter(\|item\| item.as_str() != "SEP")
	.collect();
	!structural.is_empty()
	&& structural
	.iter()
	.all(\|item\| item.starts_with("EPISODE") \|\| item.as_str() == "SPECIAL")
	}

	/// Flags paths of three or more segments whose first and last segments are
	/// redundant with each other.
	///
	/// That is the case when both normalize to the same string, or when both
	/// consist only of episode/special markers.
	fn has_abstract_path_noise(value: &str) -> bool {
	    let segments: Vec<&str> = value
	        .split(|ch: char| matches!(ch, '/' | '\\'))
	        .map(str::trim)
	        .filter(|segment| !segment.is_empty())
	        .collect();
	    // The pattern requires at least three segments.
	    let [first, _, .., last] = segments.as_slice() else {
	        return false;
	    };
	    normalized_path_segment(first) == normalized_path_segment(last)
	        || (path_segment_is_episodeish(first) && path_segment_is_episodeish(last))
	}

	/// Maps a role name to its `B-<ENTITY>` label.
	///
	/// All episode roles share `B-EPISODE`, and `VOLUME` shares `B-SPECIAL`.
	/// Roles without an entity of their own map to `"O"`.
	fn role_label(role: &str) -> String {
	    let entity = match role {
	        "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE" => "EPISODE",
	        "SPECIAL" | "VOLUME" => "SPECIAL",
	        "GROUP" | "TITLE" | "SEASON" | "RESOLUTION" | "SOURCE" => role,
	        _ => return "O".to_string(),
	    };
	    format!("B-{entity}")
	}

	/// Returns `true` when `piece` contains no alphanumeric, non-whitespace
	/// character. The empty string counts as a separator.
	fn is_separator(piece: &str) -> bool {
	    !piece
	        .chars()
	        .any(|ch| !ch.is_whitespace() && ch.is_alphanumeric())
	}

	/// Sorts a character into one of four kinds: `"sep"` (whitespace or not
	/// alphanumeric), `"digit"` (ASCII digit), `"alpha"` (ASCII letter) or
	/// `"text"` (any other alphanumeric character).
	fn char_kind(ch: char) -> &'static str {
	    match ch {
	        c if c.is_whitespace() || !c.is_alphanumeric() => "sep",
	        '0'..='9' => "digit",
	        'a'..='z' | 'A'..='Z' => "alpha",
	        _ => "text",
	    }
	}

	/// Splits one token into finer pieces for labelling.
	///
	/// Tokens that already form a single meaningful unit are returned whole.
	/// Every other token is first cut by character kind (see `char_kind`),
	/// with each separator character becoming a piece of its own, and is then
	/// re-merged wherever the merged text classifies as one structural atom.
	fn split_refined_token(token: &str) -> Vec<String> {
	let whole_class = classify_atom(token);
	// Wrapped means: first and last character form a `[]`, `()` or `【】` pair.
	let is_wrapped = {
	let chars: Vec<char> = token.chars().collect();
	chars.len() >= 2
	&& ((chars[0] == '[' && chars[chars.len() - 1] == ']')
	\|\| (chars[0] == '(' && chars[chars.len() - 1] == ')')
	\|\| (chars[0] == '【' && chars[chars.len() - 1] == '】'))
	};
	// Fast path 1: a purely alphanumeric, unwrapped token of a structural
	// class is kept in one piece.
	if !is_wrapped
	&& matches!(
	whole_class.as_str(),
	"RESOLUTION" \| "MEDIA" \| "LANG" \| "HASH" \| "SXE" \| "EPISODE_VERSION"
	)
	&& token.chars().all(char::is_alphanumeric)
	{
	return vec![token.to_string()];
	}
	// Fast path 2: an unwrapped episode token matching SIMPLE_EPISODE_RE is
	// kept in one piece.
	if !is_wrapped && whole_class == "EPISODE" && SIMPLE_EPISODE_RE.is_match(token) {
	return vec![token.to_string()];
	}
	// Pass 1: cut at every change of character kind. Each "sep" character is
	// emitted as its own one-character piece.
	let mut pieces = Vec::new();
	let mut current = String::new();
	let mut current_kind: Option<&str> = None;
	for ch in token.chars() {
	let kind = char_kind(ch);
	if kind == "sep" {
	if !current.is_empty() {
	pieces.push(std::mem::take(&mut current));
	current_kind = None;
	}
	pieces.push(ch.to_string());
	continue;
	}
	if !current.is_empty() && current_kind != Some(kind) {
	pieces.push(std::mem::take(&mut current));
	}
	current.push(ch);
	current_kind = Some(kind);
	}
	if !current.is_empty() {
	pieces.push(current);
	}
	// Pass 2: walk the pieces and try three merges in order; the first that
	// applies consumes its pieces and the walk continues after them.
	let mut merged = Vec::new();
	let mut index = 0;
	while index < pieces.len() {
	// Merge A: piece + one separator + piece, when the separator is one of
	// the listed literals and the combination is a structural atom.
	// NOTE(review): `is_separator` is false for alphanumeric pieces, so the
	// "x" and "X" alternatives below cannot be reached through this branch;
	// such tokens are handled by merge B instead.
	if index + 2 < pieces.len()
	&& !is_separator(&pieces[index])
	&& is_separator(&pieces[index + 1])
	&& !is_separator(&pieces[index + 2])
	{
	let combined = format!(
	"{}{}{}",
	pieces[index],
	pieces[index + 1],
	pieces[index + 2]
	);
	let combined_class = classify_atom(&combined);
	if !pieces[index + 1].chars().any(char::is_whitespace)
	&& matches!(pieces[index + 1].as_str(), "." \| "x" \| "X" \| "×")
	&& matches!(
	combined_class.as_str(),
	"RESOLUTION" \| "MEDIA" \| "LANG" \| "HASH" \| "SXE" \| "EPISODE_VERSION"
	)
	{
	merged.push(combined);
	index += 3;
	continue;
	}
	}
	// Merge B: the whole run of adjacent non-separator pieces starting here
	// (at least two of them), when the run is a mergeable atom.
	if !is_separator(&pieces[index]) {
	let mut end = index;
	let mut combined = String::new();
	while end < pieces.len() && !is_separator(&pieces[end]) {
	combined.push_str(&pieces[end]);
	end += 1;
	}
	if end > index + 1 && is_mergeable_refined_class(&classify_atom(&combined)) {
	merged.push(combined);
	index = end;
	continue;
	}
	}
	// Merge C: just this piece and the next one, when the pair is a
	// mergeable atom even though the full run was not.
	if index + 1 < pieces.len()
	&& !is_separator(&pieces[index])
	&& !is_separator(&pieces[index + 1])
	{
	let combined = format!("{}{}", pieces[index], pieces[index + 1]);
	if is_mergeable_refined_class(&classify_atom(&combined)) {
	merged.push(combined);
	index += 2;
	continue;
	}
	}
	// No merge applied: keep the piece unchanged.
	merged.push(pieces[index].clone());
	index += 1;
	}
	merged
	}

	/// Returns `true` for atom classes whose pieces may be re-joined into one
	/// token after splitting.
	fn is_mergeable_refined_class(value: &str) -> bool {
	    const MERGEABLE: [&str; 7] = [
	        "RESOLUTION",
	        "MEDIA",
	        "LANG",
	        "HASH",
	        "SXE",
	        "EPISODE_VERSION",
	        "SEASON",
	    ];
	    MERGEABLE.contains(&value)
	}

	fn label_for_refined_piece(piece: &str, role: &str, token_class: &str) -> String {
	if is_separator(piece) {
	return "O".to_string();
	}
	let atom_class = classify_atom(piece);
	let upper = piece.to_ascii_uppercase();
	if matches!(role, "EPISODE" \| "EPISODE_VERSION" \| "EPISODE_RANGE") {
	if atom_class == "SEASON" {
	return "B-SEASON".to_string();
	}
	if matches!(atom_class.as_str(), "EPISODE" \| "EPISODE_VERSION" \| "SXE")
	\|\| piece.chars().all(\|ch\| ch.is_ascii_digit())
	{
	return "B-EPISODE".to_string();
	}
	if matches!(atom_class.as_str(), "SPECIAL" \| "VOLUME")
	\|\| matches!(
	upper.as_str(),
	"OVA" \| "OAD" \| "SP" \| "PV" \| "CM" \| "OP" \| "ED" \| "NCOP" \| "NCED"
	)
	{
	return "B-SPECIAL".to_string();
	}
	return "O".to_string();
	}
	if role == "SOURCE" \|\| matches!(token_class, "BRACKET_MEDIA_BLOCK" \| "MEDIA_BLOCK") {
	if atom_class == "RESOLUTION" {
	return "B-RESOLUTION".to_string();
	}
	if atom_class == "HASH" {
	return "O".to_string();
	}
	if matches!(atom_class.as_str(), "MEDIA" \| "LANG") {
	return "B-SOURCE".to_string();
	}
	if matches!(atom_class.as_str(), "SPECIAL" \| "VOLUME") {
	return "B-SPECIAL".to_string();
	}
	return if matches!(
	upper.as_str(),
	"END" \| "FIN" \| "COMPLETE" \| "TV" \| "全集" \| "全"
	) {
	"O".to_string()
	} else {
	"B-SOURCE".to_string()
	};
	}
	if role == "RESOLUTION" {
	return if atom_class == "RESOLUTION" \|\| piece.chars().all(\|ch\| ch.is_ascii_digit()) {
	"B-RESOLUTION".to_string()
	} else {
	"O".to_string()
	};
	}
	role_label(role)
	}

	/// Splits a token matched by `SXE_VALUE_RE` into labelled pieces.
	///
	/// Produces `S`, the season capture, `E`, the episode capture and, when the
	/// third capture group took part in the match, `v` plus that capture. Only
	/// the season and episode captures are labelled (`B-SEASON`, `B-EPISODE`);
	/// everything else is `"O"`. Returns `None` when the pattern does not match.
	fn split_sxe_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
	    let caps = SXE_VALUE_RE.captures(token)?;
	    let mut labelled = vec![
	        ("S".to_string(), "O"),
	        (caps[1].to_string(), "B-SEASON"),
	        ("E".to_string(), "O"),
	        (caps[2].to_string(), "B-EPISODE"),
	    ];
	    if let Some(version) = caps.get(3) {
	        labelled.push(("v".to_string(), "O"));
	        labelled.push((version.as_str().to_string(), "O"));
	    }
	    Some(
	        labelled
	            .into_iter()
	            .map(|(piece, label)| (piece, label.to_string()))
	            .unzip(),
	    )
	}

	/// Splits a token matched by `EPISODE_VALUE_RE` into labelled pieces.
	///
	/// The first capture is emitted as `"O"` and the second as `B-EPISODE`.
	/// When the third capture group took part in the match, `v` and that
	/// capture follow, both `"O"`. Returns `None` when the pattern does not
	/// match.
	fn split_episode_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
	    let caps = EPISODE_VALUE_RE.captures(token)?;
	    let mut labelled = vec![
	        (caps[1].to_string(), "O"),
	        (caps[2].to_string(), "B-EPISODE"),
	    ];
	    if let Some(version) = caps.get(3) {
	        labelled.push(("v".to_string(), "O"));
	        labelled.push((version.as_str().to_string(), "O"));
	    }
	    Some(
	        labelled
	            .into_iter()
	            .map(|(piece, label)| (piece, label.to_string()))
	            .unzip(),
	    )
	}

	/// Splits a token matched by `SEASON_VALUE_RE` into the literal piece `S`
	/// (label `"O"`) and the first capture (label `B-SEASON`). Returns `None`
	/// when the pattern does not match.
	fn split_season_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
	    let caps = SEASON_VALUE_RE.captures(token)?;
	    let pieces = vec!["S".to_string(), caps[1].to_string()];
	    let labels = vec!["O".to_string(), "B-SEASON".to_string()];
	    Some((pieces, labels))
	}

	/// Concatenates the tokens of `group` in index order and returns the
	/// result with one bracket wrapper removed and trimmed.
	fn group_text(tokens: &[String], group: &Group) -> String {
	    let mut joined = String::new();
	    for &index in &group.indices {
	        joined.push_str(&tokens[index]);
	    }
	    strip_wrapper(&joined)
	}

	fn is_special_title_phrase(text: &str) -> bool {
	let normalized = SPECIAL_SPACE_RE
	.replace_all(text, " ")
	.trim()
	.to_ascii_uppercase();
	matches!(
	normalized.as_str(),
	"CM" \| "EVENT"
	\| "EIZOU"
	\| "LOGO"
	\| "MENU"
	\| "OMAKE"
	\| "PREVIEW"
	\| "PV"
	\| "THEATER GREETING EVENT"
	\| "TOKUTEN"
	\| "TRAILER"
	\| "WORLD PREMIERE"
	) \|\| SPECIAL_TITLE_PHRASE_RE.is_match(text)
	}

	/// Refines template-level roles using the actual text of each group.
	///
	/// `roles` holds one role per entry of `groups`; `groups` is indexed with
	/// positions of `roles`, so it must be at least as long. The input slice is
	/// never modified: the adjusted roles are built up in `output` and returned.
	/// Conditions that read `roles` see the incoming roles, while conditions
	/// that read `output` also see adjustments made earlier in this call.
	fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]) -> Vec<String> {
	let mut output = roles.to_vec();
	let ep_markers = ["EP", "E", "Episode", "ep", "episode"];
	let roman = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX"];
	// Step 1: no title yet and the name starts with an episode-like group.
	// Collect the first run of unassigned TEXT groups after it (SEP groups do
	// not break the run) and, if it has at least two members, make it the
	// title.
	if !output.iter().any(\|role\| role == "TITLE")
	&& roles
	.first()
	.is_some_and(\|role\| role.starts_with("EPISODE"))
	{
	let mut title_run = Vec::new();
	for index in 1..roles.len() {
	if groups[index].class_name == "TEXT" && output[index] == "O" {
	title_run.push(index);
	continue;
	}
	if groups[index].class_name == "SEP" {
	continue;
	}
	if !title_run.is_empty() {
	break;
	}
	}
	if title_run.len() >= 2 {
	let last_title_index = *title_run.last().unwrap();
	let later_structural = roles[last_title_index + 1..].iter().any(\|role\| {
	role.starts_with("EPISODE") \|\| matches!(role.as_str(), "SEASON" \| "SPECIAL")
	});
	// An all-digit leading group is part of the title when an episode,
	// season or special role still follows the run.
	if group_text(tokens, &groups[0])
	.chars()
	.all(\|ch\| ch.is_ascii_digit())
	&& later_structural
	{
	output[0] = "TITLE".to_string();
	}
	for index in title_run {
	output[index] = "TITLE".to_string();
	}
	}
	}
	// Step 2: the same promotion of an all-digit leading group, applied
	// whenever a title exists and a structural role follows its first member.
	if roles
	.first()
	.is_some_and(\|role\| role.starts_with("EPISODE"))
	&& group_text(tokens, &groups[0])
	.chars()
	.all(\|ch\| ch.is_ascii_digit())
	{
	if let Some(first_title) = output.iter().position(\|role\| role == "TITLE") {
	let later_structural = roles[first_title + 1..].iter().any(\|role\| {
	role.starts_with("EPISODE") \|\| matches!(role.as_str(), "SEASON" \| "SPECIAL")
	});
	if later_structural {
	output[0] = "TITLE".to_string();
	}
	}
	}
	// Step 3: per-group rules. A rule that ends with `continue` is final for
	// its group; the others fall through to the rules below them.
	for index in 0..roles.len() {
	let text = group_text(tokens, &groups[index]);
	// An SXE-class group without a role is an episode.
	if output[index] == "O" && groups[index].class_name.contains("SXE") {
	output[index] = "EPISODE".to_string();
	}
	// Text matching YEAR_RANGE_RE is not an episode.
	if roles[index].starts_with("EPISODE") && YEAR_RANGE_RE.is_match(&text) {
	output[index] = "O".to_string();
	continue;
	}
	// Rules that look at the word two groups back (word, SEP, number).
	if roles[index].starts_with("EPISODE") && (2..roles.len()).contains(&index) {
	let previous_text = group_text(tokens, &groups[index - 2]);
	// `next_special` only inspects the next three roles in `output`;
	// `next_episode` inspects all remaining incoming roles.
	let next_special = output[index + 1..roles.len().min(index + 4)]
	.iter()
	.any(\|role\| role == "SPECIAL");
	let next_episode = roles[index + 1..]
	.iter()
	.any(\|role\| role.starts_with("EPISODE"));
	// "vol"/"volume" + SEP + number.
	if groups[index - 1].class_name == "SEP"
	&& matches!(
	previous_text.to_ascii_lowercase().as_str(),
	"vol" \| "volume"
	)
	{
	// If the next non-separator group is TEXT and an episode role
	// follows it, the volume marker is treated as part of the title.
	let next_text_before_episode = (index + 1..roles.len())
	.find(\|&cursor\| groups[cursor].class_name != "SEP")
	.is_some_and(\|cursor\| {
	groups[cursor].class_name == "TEXT"
	&& roles[cursor + 1..]
	.iter()
	.any(\|role\| role.starts_with("EPISODE"))
	});
	if next_text_before_episode {
	output[index - 2] = "TITLE".to_string();
	output[index] = "TITLE".to_string();
	continue;
	}
	// Otherwise both the word and the number are marked SPECIAL.
	output[index - 2] = "SPECIAL".to_string();
	output[index] = "SPECIAL".to_string();
	continue;
	}
	// A short ASCII-letter title word (at most 4 letters) followed by a
	// number of at most 3 digits keeps the number in the title, provided
	// a special or another episode role follows.
	if output[index - 2] == "TITLE"
	&& groups[index - 1].class_name == "SEP"
	&& previous_text.len() <= 4
	&& previous_text.is_ascii()
	&& previous_text.chars().all(\|ch\| ch.is_ascii_alphabetic())
	&& text.chars().all(\|ch\| ch.is_ascii_digit())
	&& text.len() <= 3
	&& (next_special \|\| next_episode)
	{
	output[index] = "TITLE".to_string();
	continue;
	}
	}
	// An all-digit group after a title, followed by SEP and the word
	// "episode" (ASCII case-insensitive): when a run of at least two TEXT
	// groups starts there and an episode role comes later, the number and
	// the run belong to the title.
	if roles[index].starts_with("EPISODE")
	&& index >= 2
	&& output[..index].iter().any(\|role\| role == "TITLE")
	&& group_text(tokens, &groups[index])
	.chars()
	.all(\|ch\| ch.is_ascii_digit())
	{
	let next_episode_word = index + 2 < roles.len()
	&& groups[index + 1].class_name == "SEP"
	&& group_text(tokens, &groups[index + 2]).eq_ignore_ascii_case("episode");
	if next_episode_word {
	let mut run = Vec::new();
	let mut cursor = index + 2;
	while cursor < roles.len() {
	if groups[cursor].class_name == "SEP" {
	cursor += 1;
	continue;
	}
	if groups[cursor].class_name == "TEXT" && !roles[cursor].starts_with("EPISODE")
	{
	run.push(cursor);
	cursor += 1;
	continue;
	}
	break;
	}
	let later_episode = roles[cursor..]
	.iter()
	.any(\|role\| role.starts_with("EPISODE"));
	if run.len() >= 2 && later_episode {
	output[index] = "TITLE".to_string();
	for item in run {
	output[item] = "TITLE".to_string();
	}
	continue;
	}
	}
	}
	// A title group whose text is a known special phrase becomes SPECIAL.
	if roles[index] == "TITLE" && is_special_title_phrase(&text) {
	output[index] = "SPECIAL".to_string();
	continue;
	}
	// A title group consisting of one of these single characters is
	// demoted to "O".
	if roles[index] == "TITLE" && matches!(text.as_str(), "第" \| "話" \| "话" \| "回" \| "集")
	{
	output[index] = "O".to_string();
	continue;
	}
	// An unassigned TEXT group with letters, ahead of an episode role, joins
	// the title if a title precedes it with no episode role in between.
	// Episode marker words listed in `ep_markers` are excluded.
	if output[index] == "O"
	&& groups[index].class_name == "TEXT"
	&& roles[index + 1..].iter().any(\|role\| role.starts_with("EPISODE"))
	&& text.chars().any(\|ch\| ch.is_alphabetic())
	&& !ep_markers.contains(&text.as_str())
	{
	if let Some(last_title) = output[..index].iter().rposition(\|role\| role == "TITLE") {
	let episode_since_title = output[last_title + 1..index]
	.iter()
	.any(\|role\| role.starts_with("EPISODE"));
	if !episode_since_title {
	output[index] = "TITLE".to_string();
	continue;
	}
	}
	}
	// "season"/"saison" + SEP + episode-role group: the word is dropped from
	// the title and the following group becomes the season.
	if roles[index] == "TITLE"
	&& matches!(text.to_ascii_lowercase().as_str(), "season" \| "saison")
	&& index + 2 < roles.len()
	&& groups[index + 1].class_name == "SEP"
	&& roles[index + 2].starts_with("EPISODE")
	{
	output[index] = "O".to_string();
	output[index + 2] = "SEASON".to_string();
	continue;
	}
	// An upper-case roman numeral (I..IX) between an earlier title and a
	// later episode/special role is a season.
	if roles[index] == "TITLE"
	&& text == text.to_ascii_uppercase()
	&& roman.contains(&text.as_str())
	{
	let previous_title = output[..index].iter().any(\|role\| role == "TITLE");
	let next_structural = roles[index + 1..]
	.iter()
	.any(\|role\| role.starts_with("EPISODE") \|\| role == "SPECIAL");
	if previous_title && next_structural {
	output[index] = "SEASON".to_string();
	continue;
	}
	}
	// number, SEP, episode marker word, SEP, episode: the first number is
	// part of the title and the marker word is "O". No `continue` here, so
	// the time-of-day rule below still applies to this group.
	if roles[index].starts_with("EPISODE") && index + 4 < roles.len() {
	if groups[index + 1].class_name == "SEP"
	&& ep_markers.contains(&group_text(tokens, &groups[index + 2]).as_str())
	&& groups[index + 3].class_name == "SEP"
	&& roles[index + 4].starts_with("EPISODE")
	{
	output[index] = "TITLE".to_string();
	output[index + 2] = "O".to_string();
	}
	}
	// An episode-role group whose neighbouring group contains one of the
	// listed clock/time-of-day words is reset to "O".
	if roles[index].starts_with("EPISODE") {
	let previous_text = if index >= 1 {
	group_text(tokens, &groups[index - 1])
	} else {
	String::new()
	};
	let next_text = if index + 1 < roles.len() {
	group_text(tokens, &groups[index + 1])
	} else {
	String::new()
	};
	if previous_text.contains('点')
	\|\| previous_text.contains('點')
	\|\| previous_text.contains("晚上")
	\|\| previous_text.contains("上午")
	\|\| previous_text.contains("下午")
	\|\| next_text.contains('点')
	\|\| next_text.contains('點')
	\|\| next_text.contains('半')
	{
	output[index] = "O".to_string();
	}
	}
	}
	output
	}

	/// Finds the contiguous title spans in `roles` as half-open `(start, end)`
	/// index ranges.
	///
	/// A span starts at a `TITLE` role and grows over directly following
	/// `TITLE` roles, and over a single `SEP` group with role `"O"` that is
	/// immediately followed by another `TITLE`. Two adjacent `BRACKET_TEXT`
	/// groups are never put in the same span by direct adjacency.
	fn title_candidates(groups: &[Group], roles: &[String]) -> Vec<(usize, usize)> {
	    let is_bracket_text = |index: usize| groups[index].class_name == "BRACKET_TEXT";
	    let mut spans = Vec::new();
	    let mut cursor = 0;
	    while cursor < roles.len() {
	        if roles[cursor] != "TITLE" {
	            cursor += 1;
	            continue;
	        }
	        let start = cursor;
	        cursor += 1;
	        loop {
	            let extends_directly = cursor < roles.len()
	                && roles[cursor] == "TITLE"
	                && !(is_bracket_text(cursor - 1) && is_bracket_text(cursor));
	            if extends_directly {
	                cursor += 1;
	            } else if cursor + 1 < roles.len()
	                && roles[cursor] == "O"
	                && groups[cursor].class_name == "SEP"
	                && roles[cursor + 1] == "TITLE"
	            {
	                // Step over the separator and the title behind it.
	                cursor += 2;
	            } else {
	                break;
	            }
	        }
	        spans.push((start, cursor));
	    }
	    spans
	}

	/// Keeps exactly one title span and demotes all others to `"O"`.
	///
	/// Returns the adjusted roles together with the demoted positions (as
	/// decimal strings). With zero or one span nothing changes. Otherwise the
	/// kept span is chosen among those that end at or before the first
	/// episode/season/special/source/resolution role, falling back to all spans
	/// when none qualifies; within that set the span with the greatest end
	/// wins, ties going to the longer span.
	fn enforce_single_title_candidate(
	    groups: &[Group],
	    roles: &[String],
	) -> (Vec<String>, Vec<String>) {
	    /// Ranking key: later end first, then length.
	    fn rank(span: &&(usize, usize)) -> (usize, usize) {
	        (span.1, span.1 - span.0)
	    }

	    let candidates = title_candidates(groups, roles);
	    if candidates.len() <= 1 {
	        return (roles.to_vec(), Vec::new());
	    }
	    let first_anchor = roles
	        .iter()
	        .position(|role| {
	            role.starts_with("EPISODE")
	                || matches!(
	                    role.as_str(),
	                    "SEASON" | "SPECIAL" | "SOURCE" | "RESOLUTION"
	                )
	        })
	        .unwrap_or(roles.len());
	    let selected = candidates
	        .iter()
	        .filter(|(_, end)| *end <= first_anchor)
	        .max_by_key(rank)
	        .or_else(|| candidates.iter().max_by_key(rank))
	        .copied()
	        .expect("at least two candidates exist at this point");

	    let mut output = roles.to_vec();
	    let mut dropped = Vec::new();
	    for span in candidates.iter().filter(|span| **span != selected) {
	        for index in span.0..span.1 {
	            if output[index] == "TITLE" {
	                output[index] = "O".to_string();
	                dropped.push(index.to_string());
	            }
	        }
	    }
	    (output, dropped)
	}

	fn normalize_generated_tokens(tokens: &[String], labels: &[String]) -> (Vec<String>, Vec<String>) {
	let mut output_tokens = Vec::new();
	let mut output_labels = Vec::new();
	for (token, label) in tokens.iter().zip(labels.iter()) {
	for piece in split_generated_token(token) {
	output_labels.push(if label == "O" \|\| is_standalone_separator(&piece) {
	"O".to_string()
	} else {
	label.clone()
	});
	output_tokens.push(piece);
	}
	}
	(output_tokens, output_labels)
	}

	/// Splits a title token into pieces and labels each one.
	///
	/// Standalone separators are `"O"`, pieces matching `CJK_SEASON_TOKEN_RE`
	/// are `B-SEASON`, and every other piece is `B-TITLE`.
	fn normalize_title_token(token: &str) -> (Vec<String>, Vec<String>) {
	    let pieces = split_generated_token(token);
	    let mut labels = Vec::with_capacity(pieces.len());
	    for piece in &pieces {
	        let label = if is_standalone_separator(piece) {
	            "O"
	        } else if CJK_SEASON_TOKEN_RE.is_match(piece) {
	            "B-SEASON"
	        } else {
	            "B-TITLE"
	        };
	        labels.push(label.to_string());
	    }
	    (pieces, labels)
	}

	/// Cuts `token` into runs of alphanumeric characters and single separator
	/// characters.
	///
	/// Every whitespace or non-alphanumeric character becomes a piece of its
	/// own; the characters between them are kept together. Concatenating the
	/// pieces gives back the original token.
	fn split_generated_token(token: &str) -> Vec<String> {
	    let mut pieces = Vec::new();
	    // Byte offset at which the current alphanumeric run began, if any.
	    let mut run_start: Option<usize> = None;
	    for (offset, ch) in token.char_indices() {
	        if ch.is_alphanumeric() && !ch.is_whitespace() {
	            run_start.get_or_insert(offset);
	            continue;
	        }
	        if let Some(start) = run_start.take() {
	            pieces.push(token[start..offset].to_string());
	        }
	        pieces.push(ch.to_string());
	    }
	    if let Some(start) = run_start {
	        pieces.push(token[start..].to_string());
	    }
	    pieces
	}

	/// Returns `true` when `token` is exactly one character and that character
	/// is whitespace or not alphanumeric.
	fn is_standalone_separator(token: &str) -> bool {
	    let mut chars = token.chars();
	    match (chars.next(), chars.next()) {
	        (Some(only), None) => only.is_whitespace() || !only.is_alphanumeric(),
	        _ => false,
	    }
	}

	/// Expands every group into output tokens with one label per token.
	///
	/// `roles` is read positionally per group; a missing entry counts as `"O"`,
	/// and `SEP`, `PATH` and `EMPTY` groups are always `"O"`. Each token of a
	/// group is split and labelled according to the group's role. Tokens and
	/// labels are always appended together, so the two returned vectors have
	/// the same length.
	fn project_refined_tokens(
	tokens: &[String],
	groups: &[Group],
	roles: &[String],
	) -> (Vec<String>, Vec<String>) {
	let mut output_tokens = Vec::new();
	let mut output_labels = Vec::new();
	for (group_index, group) in groups.iter().enumerate() {
	let mut role = roles.get(group_index).map(String::as_str).unwrap_or("O");
	if matches!(group.class_name.as_str(), "SEP" \| "PATH" \| "EMPTY") {
	role = "O";
	}
	for &index in &group.indices {
	let token = &tokens[index];
	// Structural roles: split the token into value pieces.
	if matches!(
	role,
	"EPISODE"
	\| "EPISODE_VERSION"
	\| "EPISODE_RANGE"
	\| "SOURCE"
	\| "RESOLUTION"
	\| "SEASON"
	) {
	// A season token that matches as a whole is emitted directly.
	if role == "SEASON" {
	if let Some((pieces, labels)) = split_season_token(token) {
	output_tokens.extend(pieces);
	output_labels.extend(labels);
	continue;
	}
	}
	// Episode roles: try the whole unwrapped token as SxxEyy first, then
	// as a plain episode value.
	if matches!(role, "EPISODE" \| "EPISODE_VERSION" \| "EPISODE_RANGE") {
	if let Some((pieces, labels)) = split_sxe_token(&strip_wrapper(token)) {
	output_tokens.extend(pieces);
	output_labels.extend(labels);
	continue;
	}
	if let Some((pieces, labels)) = split_episode_token(&strip_wrapper(token)) {
	output_tokens.extend(pieces);
	output_labels.extend(labels);
	continue;
	}
	}
	// Otherwise refine the token and label each piece on its own.
	for piece in split_refined_token(token) {
	// Inside episode roles a piece may itself be a season or episode
	// value.
	if matches!(role, "EPISODE" \| "EPISODE_VERSION" \| "EPISODE_RANGE") {
	if let Some((pieces, labels)) = split_season_token(&piece) {
	output_tokens.extend(pieces);
	output_labels.extend(labels);
	continue;
	}
	if let Some((pieces, labels)) = split_episode_token(&piece) {
	output_tokens.extend(pieces);
	output_labels.extend(labels);
	continue;
	}
	}
	let label = label_for_refined_piece(&piece, role, &group.class_name);
	let (pieces, labels) = normalize_generated_tokens(&[piece], &[label]);
	output_tokens.extend(pieces);
	output_labels.extend(labels);
	}
	} else {
	// Non-structural roles (TITLE, GROUP, "O", ...).
	// A title token that is only one of these characters is emitted
	// unlabelled.
	if role == "TITLE" && matches!(token.as_str(), "第" \| "話" \| "话" \| "回" \| "集")
	{
	output_tokens.push(token.clone());
	output_labels.push("O".to_string());
	continue;
	}
	// A title token ending in 第: all trailing 第 characters are removed
	// from the title part and a single unlabelled 第 is emitted after it.
	if role == "TITLE" && token.ends_with('第') && token.chars().count() > 1 {
	let trimmed = token.trim_end_matches('第').to_string();
	let (pieces, labels) = normalize_generated_tokens(
	&[trimmed, "第".to_string()],
	&["B-TITLE".to_string(), "O".to_string()],
	);
	output_tokens.extend(pieces);
	output_labels.extend(labels);
	continue;
	}
	if role == "TITLE" {
	let (pieces, labels) = normalize_title_token(token);
	output_tokens.extend(pieces);
	output_labels.extend(labels);
	continue;
	}
	// Every other role: split the token and apply the role's label.
	let (pieces, labels) =
	normalize_generated_tokens(&[token.clone()], &[role_label(role)]);
	output_tokens.extend(pieces);
	output_labels.extend(labels);
	}
	}
	}
	(output_tokens, output_labels)
	}

	/// Fills unlabelled joiner tokens that sit inside a title or group span.
	///
	/// For every `"O"` token that is a joiner (or `&` / `＆`), the nearest
	/// neighbours on both sides are located by skipping unlabelled joiners.
	/// When both neighbours carry the same `B-TITLE` or `B-GROUP` label, the
	/// token receives that label too. In addition, `!`, `！`, `?` and `？`
	/// directly after a `B-TITLE` token become `B-TITLE`.
	///
	/// The left neighbour is read from the labels as smoothed so far, the right
	/// neighbour from the incoming `labels`.
	fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
	    const JOINERS: [&str; 38] = [
	        " ", ".", "-", "_", "·", "・", "×", "／", "/", "'", "’", ":", "：", "!", "！", "?",
	        "？", ";", "；", ",", "，", "～", "~", "－", "(", ")", "（", "）", "[", "]", "【",
	        "】", "｢", "｣", "「", "」", "☆", "@",
	    ];
	    const EXTRA_ENTITY_JOINERS: [&str; 2] = ["&", "＆"];
	    const TITLE_TERMINAL_PUNCTUATION: [&str; 4] = ["!", "！", "?", "？"];

	    // An unlabelled joiner is transparent when searching for neighbours.
	    let is_transparent =
	        |position: usize| JOINERS.contains(&tokens[position].as_str()) && labels[position] == "O";

	    let mut output = labels.to_vec();
	    for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
	        let text = token.as_str();
	        let smoothable =
	            label == "O" && (JOINERS.contains(&text) || EXTRA_ENTITY_JOINERS.contains(&text));
	        if !smoothable {
	            continue;
	        }
	        let left = (0..index).rev().find(|&position| !is_transparent(position));
	        let right = (index + 1..tokens.len()).find(|&position| !is_transparent(position));
	        if let (Some(left), Some(right)) = (left, right) {
	            let left_label = output[left].clone();
	            if left_label == labels[right]
	                && matches!(left_label.as_str(), "B-TITLE" | "B-GROUP")
	            {
	                output[index] = left_label;
	            }
	        }
	        if TITLE_TERMINAL_PUNCTUATION.contains(&text)
	            && index > 0
	            && output[index - 1] == "B-TITLE"
	        {
	            output[index] = "B-TITLE".to_string();
	        }
	    }
	    output
	}

	/// Builds the labelled record for `filename` from template-level `roles`.
	///
	/// The filename is tokenized and grouped, the roles are adjusted to the
	/// actual text, reduced to a single title span, projected onto refined
	/// tokens and finally smoothed.
	///
	/// Returns `None` when `roles` does not have exactly one entry per group of
	/// the filename, or when projection yields differing numbers of tokens and
	/// labels. `source_filename` and `path_trimmed` are left unset here.
	fn dmhy_record(filename: &str, template_id: &str, roles: &[String]) -> Option<Record> {
	    let (key, tokens, _classes, groups) = template_key_for_filename(filename);
	    if groups.len() != roles.len() {
	        return None;
	    }
	    let roles = adjust_contextual_roles(&tokens, &groups, roles);
	    let (roles, dropped) = enforce_single_title_candidate(&groups, &roles);
	    let (tokens, labels) = project_refined_tokens(&tokens, &groups, &roles);
	    // Check the lengths before smoothing: `smooth_title_spans` indexes
	    // `labels` by token position and would panic on a mismatch instead of
	    // letting this function return `None`. Smoothing keeps the label count,
	    // so no second check is needed afterwards.
	    if tokens.len() != labels.len() {
	        return None;
	    }
	    let labels = smooth_title_spans(&tokens, &labels);
	    Some(Record {
	        filename: filename.to_string(),
	        tokens,
	        labels,
	        template_id: template_id.to_string(),
	        template: key,
	        source_filename: None,
	        path_trimmed: None,
	        dropped_title_candidate_positions: (!dropped.is_empty()).then_some(dropped),
	    })
	}

	#[cfg(test)]
	mod tests {
	    use super::*;

	    /// Labels `filename` using the roles suggested for its own template key
	    /// and returns the resulting `(token, label)` pairs in token order.
	    ///
	    /// Panics when `dmhy_record` yields `None` for the filename.
	    fn labels_for(filename: &str) -> Vec<(String, String)> {
	        let template = template_key_for_filename(filename).0;
	        let roles = suggested_roles(&template);
	        let Record { tokens, labels, .. } = dmhy_record(filename, "tpl_test", &roles).unwrap();
	        tokens.into_iter().zip(labels).collect()
	    }

	    /// True when `pairs` contains exactly the given `(token, label)` pair.
	    fn has_label(pairs: &[(String, String)], token: &str, label: &str) -> bool {
	        pairs
	            .iter()
	            .any(|(t, l)| t.as_str() == token && l.as_str() == label)
	    }

	    /// Pins the token/label pairs expected for a set of known filenames.
	    #[test]
	    fn required_regressions() {
	        let title_91 = labels_for("Title 91 EP 01 [1080p]");
	        assert!(has_label(&title_91, "91", "B-TITLE"));
	        assert!(has_label(&title_91, "EP", "O"));
	        assert!(has_label(&title_91, "01", "B-EPISODE"));

	        let event = labels_for("[HYSUB]Dragon Ball Super Broly[Theater Greeting Event][1080P]");
	        assert!(has_label(&event, "Theater", "B-SPECIAL"));
	        assert!(!has_label(&event, "Theater", "B-TITLE"));

	        let roman = labels_for("Chibi Maruko-chan I 001");
	        assert!(has_label(&roman, "I", "B-SEASON"));
	        assert!(has_label(&roman, "001", "B-EPISODE"));

	        let dxd = labels_for("High School D×D");
	        assert!(has_label(&dxd, "×", "B-TITLE"));
	        let colon_title = labels_for("Megumi no Daigo：Kyuukoku no Orange 06");
	        assert!(has_label(&colon_title, "：", "B-TITLE"));

	        // A bare SxE filename must label exactly these four tokens, in order.
	        let sxe = labels_for("S01E02");
	        let expected_sxe: Vec<(String, String)> = [
	            ("S", "O"),
	            ("01", "B-SEASON"),
	            ("E", "O"),
	            ("02", "B-EPISODE"),
	        ]
	        .iter()
	        .map(|&(token, label)| (token.to_string(), label.to_string()))
	        .collect();
	        assert_eq!(sxe, expected_sxe);

	        let ep_prefix = labels_for("Toradora! EP01 [BD 1080p]");
	        assert!(has_label(&ep_prefix, "EP", "O"));
	        assert!(has_label(&ep_prefix, "01", "B-EPISODE"));
	        let bracket_sxe = labels_for("[FLsnow.feat.PO][Himitsu_no_Aipri][1080P][S2E01]");
	        assert!(has_label(&bracket_sxe, "2", "B-SEASON"));
	        assert!(has_label(&bracket_sxe, "01", "B-EPISODE"));

	        let cursed = labels_for("[Coalgirls]_C3-Cube_x_Cursed_x_Curious_01_[8E416230]");
	        assert!(has_label(&cursed, "x", "B-TITLE"));
	        assert!(!has_label(&cursed, "x", "B-SEASON"));
	        let beyblade = labels_for("[jibaketa]Beyblade X - 118 (WEB 1920x1080 AVC AAC)");
	        assert!(has_label(&beyblade, "X", "B-TITLE"));
	        assert!(!has_label(&beyblade, "X", "B-SEASON"));
	        let bang_title = labels_for("[Dymy][Gugure! Kokkuri-san][06][BIG5][1280X720]");
	        assert!(has_label(&bang_title, "!", "B-TITLE"));
	        let pso2 = labels_for("[Lilith-Raws] Phantasy Star Online 2 Episode Oracle - 01 [1080p]");
	        assert!(has_label(&pso2, "2", "B-TITLE"));
	        assert!(has_label(&pso2, "Episode", "B-TITLE"));
	        assert!(has_label(&pso2, "Oracle", "B-TITLE"));
	        assert!(has_label(&pso2, "01", "B-EPISODE"));
	        let aikatsu = labels_for("Aikatsu Friends! - S2E01 (BD 1920x1080 x264 FLAC)");
	        assert!(has_label(&aikatsu, "!", "B-TITLE"));
	        let intro = labels_for("[VCB-Studio] LoveLive! µ's Live Collection [01][intro][1080p]");
	        assert!(has_label(&intro, "intro", "B-SPECIAL"));
	        let hash = labels_for("[Group][Title][01][1080p][00270AC8]");
	        assert!(has_label(&hash, "00270AC8", "O"));
	        let yamato = labels_for("[1995.01] YAMATO2520 Vol.1 明日への希望-0001");
	        assert!(has_label(&yamato, "YAMATO2520", "B-TITLE"));
	        assert!(has_label(&yamato, "明日への希望", "B-TITLE"));
	        let ubw = labels_for("Fate／stay night [Unlimited Blade Works] #00 「プロローグ」");
	        assert!(has_label(&ubw, "Unlimited", "B-TITLE"));
	        assert!(!has_label(&ubw, "Unlimited", "B-GROUP"));
	        let alias_title = labels_for("[Koten_Gars] Tegami Bachi; Letter Bee - 01 [1080p]");
	        assert!(has_label(&alias_title, ";", "B-TITLE"));
	        let comma_title =
	            labels_for("[Studio GreenTea] Kamiina Botan, Yoeru Sugata wa Yuri no Hana [01]");
	        assert!(has_label(&comma_title, ",", "B-TITLE"));
	        let happy_lesson = labels_for("【DVD】 HAPPY☆LESSON THE TV 第01話");
	        assert!(has_label(&happy_lesson, "☆", "B-TITLE"));
	        let idolmaster = labels_for("[CASO&SumiSora][THE_IDOLM@STER_CINDERELLA_GIRLS][07.5_SP]");
	        assert!(has_label(&idolmaster, "@", "B-TITLE"));
	        let soul_taker = labels_for("[AI-Raws] THE SOUL TAKER～魂狩～ #01 (HEVC 1312x720)");
	        assert!(has_label(&soul_taker, "～", "B-TITLE"));
	        let mayoi = labels_for("[Snow-Raws] 迷家[マヨイガ] 第01話");
	        assert!(has_label(&mayoi, "迷家", "B-TITLE"));
	        assert!(has_label(&mayoi, "マヨイガ", "B-TITLE"));

	        let conan_time = labels_for("【蓝色狂想】名侦探柯南TV版第036集-周一晚上7点半杀人事件");
	        assert!(has_label(&conan_time, "036", "B-EPISODE"));
	        assert!(!has_label(&conan_time, "7", "B-EPISODE"));
	        let zom =
	            labels_for("[Nekomoe kissaten&VCB-Studio] Zom 100 [Animatics02][Ma10p_1080p][x265]");
	        assert!(has_label(&zom, "100", "B-TITLE"));
	        assert!(!has_label(&zom, "100", "B-EPISODE"));
	        assert!(has_label(&zom, "Animatics02", "B-SPECIAL"));

	        let sky = labels_for("[Skytree][海贼王][One_Piece][918][GB_JP][1080P]");
	        assert!(has_label(&sky, "One", "B-TITLE"));
	        assert!(!has_label(&sky, "海贼王", "B-TITLE"));
	        assert!(has_label(&sky, "918", "B-EPISODE"));
	    }

	    /// Pins path trimming, template keys and labels for a second set of
	    /// known filenames and directory-prefixed paths.
	    #[test]
	    fn updated_python_alignment_regressions() {
	        let woody_path = "The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p [Hurtom]/Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p";
	        let (woody_leaf, woody_was_trimmed) = training_filename_for(woody_path);
	        assert!(woody_was_trimmed);
	        assert_eq!(
	            woody_leaf,
	            "Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p"
	        );
	        let pokemon_path = "Pokémon Season 2 - Orange League [Ep. 83-118]/Pokemon - 084 - OL002 - A Scare in the Air [DVD][PM-Dragon-x264-AC3][00270AC8]";
	        let (pokemon_leaf, pokemon_was_trimmed) = training_filename_for(pokemon_path);
	        assert!(pokemon_was_trimmed);
	        assert_eq!(
	            pokemon_leaf,
	            "Pokemon - 084 - OL002 - A Scare in the Air [DVD][PM-Dragon-x264-AC3][00270AC8]"
	        );
	        let woody = labels_for(&woody_leaf);
	        assert!(has_label(&woody, "4", "B-SEASON"));
	        assert!(has_label(&woody, "E", "O"));
	        assert!(has_label(&woody, "07", "B-EPISODE"));
	        assert!(has_label(&woody, "The", "B-TITLE"));
	        assert!(has_label(&woody, "Show", "B-TITLE"));
	        assert!(!has_label(&woody, "1999", "B-EPISODE"));

	        let group = labels_for("[DBD-Raws][Title][01][1080P]");
	        assert!(has_label(&group, "-", "B-GROUP"));
	        let amp_group = labels_for("[SumiSora&CASO][Title][01][1080P]");
	        assert!(has_label(&amp_group, "&", "B-GROUP"));

	        let cjk_season =
	            labels_for("[DBD-Raws][魔道祖师第一季][08][1080P][BDRip][HEVC-10bit][FLAC]");
	        assert!(has_label(&cjk_season, "魔道祖师", "B-TITLE"));
	        assert!(has_label(&cjk_season, "第一季", "B-SEASON"));
	        assert!(!has_label(&cjk_season, "第一季", "B-TITLE"));

	        let (kage_leaf, kage_was_trimmed) =
	            training_filename_for("12/小剧场/[LKSUB][KAGE-JITSU!][01][GB][720P]");
	        assert!(kage_was_trimmed);
	        assert_eq!(kage_leaf, "[LKSUB][KAGE-JITSU!][01][GB][720P]");
	        assert_eq!(
	            template_key_for_filename(&kage_leaf).0,
	            "BRACKET_TEXT BRACKET_TEXT BRACKET_EPISODE BRACKET_LANG BRACKET_RESOLUTION"
	        );

	        let short = labels_for("[Snow-Raws] R-15 CM&PV12 (BD 1920x1080 HEVC-YUV420P10 FLAC)");
	        assert!(has_label(&short, "R", "B-TITLE"));
	        assert!(has_label(&short, "-", "B-TITLE"));
	        assert!(has_label(&short, "15", "B-TITLE"));
	        assert!(!has_label(&short, "15", "B-EPISODE"));

	        let short_before_episode =
	            labels_for("[Snow-Raws] R-15 第01話 (BD 1920x1080 HEVC-YUV420P10 FLAC)");
	        assert!(has_label(&short_before_episode, "R", "B-TITLE"));
	        assert!(has_label(&short_before_episode, "-", "B-TITLE"));
	        assert!(has_label(&short_before_episode, "15", "B-TITLE"));
	        assert!(has_label(&short_before_episode, "01", "B-EPISODE"));
	        assert!(!has_label(&short_before_episode, "15", "B-EPISODE"));

	        let avatar_path = "Avatar The Last Airbender S2/Avatar The Last Airbender S2 14 [1080p]";
	        let (avatar_leaf, avatar_was_trimmed) = training_filename_for(avatar_path);
	        assert!(avatar_was_trimmed);
	        assert_eq!(avatar_leaf, "Avatar The Last Airbender S2 14 [1080p]");

	        let tintin_path = "Adventures of Tintin (1991) Season 1-3 S01-S03 (1080p BluRay x265 HEVC 10bit EAC3 2.0 Garshasp)/Season 1/Adventures of Tintin (1991) - S01E06 - Cigars of the Pharaoh (Part 1) (1080p BluRay x265 Garshasp)";
	        let (tintin_leaf, tintin_was_trimmed) = training_filename_for(tintin_path);
	        assert!(tintin_was_trimmed);
	        assert_eq!(
	            tintin_leaf,
	            "Adventures of Tintin (1991) - S01E06 - Cigars of the Pharaoh (Part 1) (1080p BluRay x265 Garshasp)"
	        );
	        assert_eq!(
	            template_key_for_filename(&tintin_leaf).0,
	            "TEXT SEP TEXT SEP TEXT SEP BRACKET_DATE SEP SXE SEP TEXT SEP TEXT SEP TEXT SEP TEXT SEP BRACKET_TEXT SEP BRACKET_TEXT"
	        );

	        // The leaf on its own and the leaf recovered from the full path must
	        // produce the same template key.
	        let bocchi_path = "Bocchi the Rock S01 孤獨搖滾！第一季 [Taiwanese Hokkien Dub][臺灣閩南語配音]/Bocchi the Rock S01 孤獨搖滾！第一季 [Taiwanese Hokkien Dub][Hàn-jī Hardsub][臺灣閩南語配音][漢字字幕]/Bocchi the Rock! 孤獨搖滾！S01E01「孤獨反輾轉」";
	        let bocchi_leaf = "Bocchi the Rock! 孤獨搖滾！S01E01「孤獨反輾轉」";
	        assert_eq!(
	            template_key_for_filename(bocchi_leaf).0,
	            "TEXT SEP TEXT SEP TEXT SEP TEXT SXE TEXT"
	        );
	        assert!(filename_has_title(bocchi_leaf));
	        let (bocchi_trimmed, bocchi_was_trimmed) = training_filename_for(bocchi_path);
	        assert!(bocchi_was_trimmed);
	        assert_eq!(bocchi_trimmed, bocchi_leaf);
	        assert_eq!(
	            template_key_for_filename(&bocchi_trimmed).0,
	            "TEXT SEP TEXT SEP TEXT SEP TEXT SXE TEXT"
	        );

	        let usagi_path = "Gochuumon wa Usagi Desuka-60fps/Gochuumon wa Usagi Desuka S1/Usagi S1[01][60fps][8bit_1080p][x265_flac]";
	        let (usagi_leaf, usagi_was_trimmed) = training_filename_for(usagi_path);
	        assert!(usagi_was_trimmed);
	        assert_eq!(usagi_leaf, "Usagi S1[01][60fps][8bit_1080p][x265_flac]");
	        assert_eq!(
	            template_key_for_filename(&usagi_leaf).0,
	            "TEXT SEP SEASON BRACKET_EPISODE BRACKET_TEXT BRACKET_MEDIA_BLOCK BRACKET_MEDIA"
	        );

	        // A "Season N/…" parent segment is expected to survive trimming.
	        let woody_parent =
	            "Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p";
	        let (batch_leaf, batch_was_trimmed) =
	            training_filename_for(&format!("Batch/{woody_parent}"));
	        assert!(batch_was_trimmed);
	        assert_eq!(batch_leaf, woody_parent);

	        let volume =
	            labels_for("[Snow-Raws] 生徒会役員共 Vol.01 MENU02 (BD 1920x1080 HEVC-YUV420P10 FLAC)");
	        assert!(has_label(&volume, "生徒会役員共", "B-TITLE"));
	        assert!(has_label(&volume, "Vol", "B-SPECIAL"));
	        assert!(has_label(&volume, "01", "B-SPECIAL"));
	        assert!(has_label(&volume, "MENU02", "B-SPECIAL"));
	        assert!(!has_label(&volume, "01", "B-EPISODE"));

	        let numeric_title =
	            labels_for("3000.Leagues.in.Search.of.Mother.S01E01.1080p.WEB-DL.H.264-D00oo00M");
	        assert!(has_label(&numeric_title, "3000", "B-TITLE"));
	        assert!(has_label(&numeric_title, "01", "B-SEASON"));
	        assert!(has_label(&numeric_title, "01", "B-EPISODE"));
	        assert!(has_label(&numeric_title, "1080p", "B-RESOLUTION"));
	        assert!(has_label(&numeric_title, "H", "B-SOURCE"));
	        assert!(has_label(&numeric_title, "264", "B-SOURCE"));
	        assert!(!has_label(&numeric_title, "264", "B-EPISODE"));

	        let media_block =
	            labels_for("[Kamigami] Kantai Collection - 06v2 [1920×1080 x264 AAC Sub(Chs,Cht,Jap)]");
	        assert!(has_label(&media_block, "1920", "B-RESOLUTION"));
	        assert!(has_label(&media_block, "1080", "B-RESOLUTION"));
	        assert!(has_label(&media_block, "x264", "B-SOURCE"));
	        assert!(has_label(&media_block, "Chs", "B-SOURCE"));
	    }
	}