// Page header captured together with the file (not part of the program):
// ModerRAS's picture
// Verify low-frequency DMHY generated output
// 93f322e
// raw
// history blame
// 93.6 kB
use anyhow::{bail, Context, Result};
use chrono::Utc;
use clap::Parser;
use once_cell::sync::Lazy;
use rayon::prelude::*;
use regex::Regex;
use serde::{Deserialize, Serialize};
use serde_json::{json, Value};
use std::collections::{HashMap, HashSet};
use std::fs::{self, File};
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::path::PathBuf;
use std::sync::atomic::{AtomicUsize, Ordering};
// Command-line options for the DMHY template tool.
//
// `main` checks the mode flags in this order: `cluster`, `audit_low_frequency`,
// `verify_generated_output`. When none is set, the recipes are applied to the
// input and the generated records plus a manifest are written.
//
// Plain `//` comments are used here on purpose: clap's derive macro turns `///`
// doc comments into help text, which would change the program's `--help` output.
#[derive(Parser, Debug)]
#[command(about = "Apply DMHY template recipes with a multi-core Rust pipeline")]
struct Args {
// Mode flag: build template clusters from the input (`run_cluster`).
#[arg(long)]
cluster: bool,
// Mode flag: audit one example per low-count template (`run_low_frequency_audit`).
#[arg(long)]
audit_low_frequency: bool,
// Mode flag: re-check an already generated JSONL (`run_verify_generated_output`).
#[arg(long)]
verify_generated_output: bool,
// Input JSONL. `load_input` reads the string field `value` of each row; in
// verify mode this path is instead read as generated `Record` rows.
#[arg(long, default_value = "datasets/AnimeName/dmhy_list.jsonl")]
input: PathBuf,
// Recipe JSONL read by `load_recipes`.
#[arg(long, default_value = "reports/dmhy_template_recipes.seed.jsonl")]
recipes: PathBuf,
// Generated records (apply mode).
#[arg(
long,
default_value = "reports/dmhy_weak.template_generated.rust.jsonl"
)]
output: PathBuf,
// Manifest JSON written next to the generated records (apply mode).
#[arg(
long,
default_value = "reports/dmhy_weak.template_generated.rust.manifest.json"
)]
manifest_output: PathBuf,
// Cluster-mode outputs: summary JSON, top samples, all clusters,
// high-confidence recipes, and the review list.
#[arg(
long,
default_value = "reports/dmhy_template_clusters.rust.summary.json"
)]
summary_output: PathBuf,
#[arg(
long,
default_value = "reports/dmhy_template_clusters.rust.samples.jsonl"
)]
samples_output: PathBuf,
#[arg(long, default_value = "reports/dmhy_template_clusters.rust.jsonl")]
clusters_output: PathBuf,
#[arg(long, default_value = "reports/dmhy_template_recipes.rust.seed.jsonl")]
recipes_output: PathBuf,
#[arg(long, default_value = "reports/dmhy_template_review.rust.jsonl")]
review_output: PathBuf,
// Audit-mode output rows.
#[arg(long, default_value = "reports/dmhy_low_frequency_audit.rust.jsonl")]
audit_output: PathBuf,
// Recipes whose `count` is at most this value are treated as low-frequency.
#[arg(long, default_value_t = 50)]
audit_max_count: u64,
// Maximum number of input values to load (no limit when absent).
#[arg(long)]
limit: Option<usize>,
// Maximum number of recipes to select (no limit when absent).
#[arg(long)]
limit_templates: Option<usize>,
// Minimum recipe `count` for selection; also the threshold used by the
// cluster summary's "at least min_count" figures.
#[arg(long, default_value_t = 1)]
min_count: u64,
// Cluster mode: number of top cluster rows copied to the samples output.
#[arg(long, default_value_t = 200)]
top: usize,
// Cluster mode: number of top cluster rows considered as recipe candidates.
#[arg(long, default_value_t = 200)]
recipe_top: usize,
// Cluster mode: maximum number of rejected candidates written for review.
#[arg(long, default_value_t = 1000)]
review_top: usize,
// Cluster mode: maximum number of example filenames kept per cluster.
#[arg(long, default_value_t = 8)]
examples: usize,
// Cluster mode: minimum cluster count for a high-confidence recipe.
#[arg(long, default_value_t = 25)]
recipe_min_count: usize,
// Recipe confidence to select; an empty string disables the filter.
#[arg(long, default_value = "high")]
confidence: String,
// Either "all" or "sample"; `main` rejects any other value.
#[arg(long, default_value = "all")]
expand: String,
// Per-template cap that applies when `expand` is "sample".
#[arg(long, default_value_t = 100)]
sample_per_template: usize,
// Keep inputs that the noise checks would otherwise skip.
#[arg(long)]
keep_encoding_noise: bool,
// Cluster mode: use the original value instead of the trimmed training filename.
#[arg(long)]
preserve_parent_paths: bool,
// Size of the global rayon thread pool (rayon's default when absent).
#[arg(long)]
threads: Option<usize>,
}
/// One row of the recipe JSONL: a template together with the role assigned to
/// each of its token groups.
#[derive(Debug, Clone, Deserialize)]
struct Recipe {
/// Identifier copied into generated records and used to key the sample counters.
template_id: String,
/// Template key; `load_recipes` uses it as the map key that filenames are matched against.
template: String,
/// Roles, compared by length against the filename's token groups before use.
roles: Vec<String>,
/// Confidence tag matched against `--confidence`.
confidence: Option<String>,
/// Occurrence count; a missing value is treated as 0 by the callers in this file.
count: Option<u64>,
}
/// One generated training record. It is written as a JSONL row in apply mode
/// and read back in verify mode. The optional fields are omitted from the JSON
/// when they are `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct Record {
/// Filename the tokens were produced from.
filename: String,
/// Tokens and their labels; the two vectors are walked pairwise by `entity_spans`.
tokens: Vec<String>,
labels: Vec<String>,
/// Identifier and template of the recipe that produced the labels.
template_id: String,
template: String,
/// Original input value; set only when a parent path was trimmed.
#[serde(skip_serializing_if = "Option::is_none")]
source_filename: Option<String>,
/// `Some(true)` when a parent path was trimmed from the input value.
#[serde(skip_serializing_if = "Option::is_none")]
path_trimmed: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
dropped_title_candidate_positions: Option<Vec<String>>,
}
/// A run of token positions that share one class within a template.
#[derive(Debug, Clone)]
struct Group {
/// Indices into the token/class vectors that belong to this group.
indices: Vec<usize>,
/// Class name of the group (for example "TEXT" or "SEP").
class_name: String,
}
/// Counters collected by `main` in apply mode and serialized into the manifest
/// under `"stats"`.
#[derive(Debug, Default, Clone, Serialize)]
struct Stats {
/// Number of input values loaded.
seen: usize,
/// One counter per skip reason reported by `process_filename`.
skipped_encoding_noise: usize,
/// Inputs whose parent path was trimmed (counted for written and skipped items alike).
trimmed_parent_path: usize,
skipped_no_recipe: usize,
skipped_sample_cap: usize,
skipped_role_mismatch: usize,
skipped_low_frequency_audit_warning: usize,
/// Number of records written to the output file.
written: usize,
}
/// Aggregate for all filenames that share one template key (cluster mode).
#[derive(Debug, Default)]
struct Cluster {
/// Number of filenames that mapped to this template.
count: usize,
/// First few filenames seen, capped by the caller's example limit.
examples: Vec<String>,
/// Occurrences of unwrapped TEXT / BRACKET_TEXT token literals.
literal_counts: HashMap<String, usize>,
/// Occurrences of each token class.
class_counts: HashMap<String, usize>,
/// Per group position: occurrences of the group's text (TEXT / BRACKET_TEXT groups only).
position_literals: Vec<HashMap<String, usize>>,
}
/// Outcome of processing a single input value in apply mode.
#[derive(Debug)]
enum Processed {
/// A record was produced and should be written.
Written {
record: Record,
/// Whether a parent path was trimmed from the input value.
trimmed_parent: bool,
},
/// The input was dropped.
Skipped {
/// Skip reason; `main` maps "encoding_noise", "no_recipe", "sample_cap",
/// "role_mismatch" and "low_frequency_audit_warning" onto `Stats` counters.
reason: &'static str,
trimmed_parent: bool,
},
}
// Lazily compiled patterns used for token classification and season/path
// detection. Patterns anchored with `^...$` must match the whole candidate string.

/// Eight or more hexadecimal digits (e.g. a CRC tag).
static HASH_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[A-Fa-f0-9]{8,}$").unwrap());
/// `720p`/`1080p`, `4K`, or `WIDTHxHEIGHT` (separator `x`, `X` or `×`).
static RESOLUTION_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?i)^(?:\d{3,4}p|\dK|\d{3,4}[xX×]\d{3,4})$").unwrap());
/// Episode number with a version suffix, e.g. `12v2`, `EP03rev1`.
static EPISODE_VERSION_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?i)^(?:EP?)?\d{1,4}(?:v|ver|version|rev)\d{1,3}$").unwrap());
/// Episode number with optional `E`/`EP`/`#` prefix and optional `END` suffix.
static EPISODE_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}(?:END)?$").unwrap());
/// CJK episode form: optional `第`, digits, then one of `话 話 回 集`.
static EPISODE_CJK_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^第?\d{1,4}[话話回集]$").unwrap());
/// Two numbers joined by `-` or `~`, optionally followed by `END`.
static EPISODE_RANGE_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?i)^\d{1,4}\s*[-~]\s*\d{1,4}(?:\s*END)?$").unwrap());
/// An episode range followed by batch markers/separators and up to 16 further characters.
static EPISODE_BATCH_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r"(?i)^\d{1,4}\s*[-~]\s*\d{1,4}(?:\s*(?:TV|全集|全|END|Fin|Complete|SP|OVA|OAD|NCOP|NCED)|[+_./-])*.{0,16}$").unwrap()
});
/// `S01E02` with an optional `vN` suffix.
static SXE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S\d{1,2}E\d{1,4}(?:v\d+)?$").unwrap());
/// Same shape as `SXE_RE`, capturing season, episode and optional version.
static SXE_VALUE_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})E(\d{1,4})(?:v(\d+))?$").unwrap());
/// Prefixed episode (`EP`, `E` or `#`), capturing prefix, number and optional version.
static EPISODE_VALUE_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?i)^(EP|E|#)(\d{1,4})(?:v(\d+))?$").unwrap());
/// Season token: `S1`, `Season 1`, or `第N季/期/部` (digits or CJK numerals).
static SEASON_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r"(?i)^(?:S\d{1,2}|Season\s*\d{1,2}|第[一二三四五六七八九十\d]+[季期部])$").unwrap()
});
/// Only the CJK season form accepted by `SEASON_RE`.
static CJK_SEASON_TOKEN_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
/// `S` followed by one or two digits, capturing the number.
static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
/// Special-content markers (NCOP, NCED, OP, ED, PV, OVA, ...) with an optional
/// number, `EpN`, or single-letter suffix.
static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r"(?i)^(?:NCOP|NCED|OP|ED|PV|CM|SP|OVA|OAD|IV|Menu|Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?$").unwrap()
});
/// Volume/disc markers followed by a number, e.g. `Vol.2`, `Disc 1`, `BD3`.
static VOLUME_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
/// A 19xx/20xx year, optionally followed by up to two 1-2 digit parts separated by `.`, `_` or `-`.
static DATE_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"^(?:19|20)\d{2}(?:[._-]\d{1,2}){0,2}$").unwrap());
/// Language / subtitle tags (CHS, CHT, BIG5, JP, ENG, 简体, 字幕, ...).
static LANG_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r"(?i)^(?:CHS|CHT|ZHS|ZHT|GB|BIG5|JPN?|JP|JA|JAP|ENG|EN|SC|TC|简[体體]?|繁[体體]?|简日|繁日|字幕|内封|外挂|Sub|Subs|MSubs?)$").unwrap()
});
/// Source, codec, audio, bit-depth, subtitle-format and container tags.
static MEDIA_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r"(?i)^(?:WEB[-_. ]?DL|WEB[-_. ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|HDTV|TVRip|REMUX|x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|FLAC|MP3|DTS|DTS-HDMA|AC3|Opus|10[-_. ]?bit|8[-_. ]?bit|Hi10p|Ma10p|ASSx?\d*|SRTx?\d*|R\d[A-Z]*|NoSub|MKV|MP4|AVI|RAW|Raws?)$").unwrap()
});
/// Unanchored phrases: "theater greeting event", "world premiere" (the
/// spelling "premeie" is accepted too) and "picture drama".
static SPECIAL_TITLE_PHRASE_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r"(?i)\b(?:theater\s+greeting\s+event|world\s+prem(?:eie|iere)|picture\s+drama)\b")
.unwrap()
});
/// A range of two 19xx/20xx years, optionally wrapped in parentheses.
static YEAR_RANGE_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"^\(?\s*(?:19|20)\d{2}\s*[-~]\s*(?:19|20)\d{2}\s*\)?$").unwrap());
/// `season N` or `sN` delimited by string edges, whitespace, `_`, `.`, `-` or `/`.
static PATH_SEGMENT_SEASON_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r"(?i)(?:^|[\s_.\-/])(?:season\s*\d{1,2}|s\d{1,2})(?:$|[\s_.\-/])").unwrap()
});
/// `season`/`saison` followed by a number, capturing it without one leading zero.
static SEASON_WORD_NUMBER_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?i)(?:season|saison)\s*0?(\d{1,2})").unwrap());
/// A whole segment that is only `season N`, `saison N` or `sN`.
static PLAIN_SEASON_SEGMENT_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?i)^(?:season|saison)\s*0?\d{1,2}$|^s0?\d{1,2}$").unwrap());
/// `sN` not adjacent to letters or digits, capturing the number.
static S_NUMBER_SEGMENT_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?i)(?:^|[^\p{L}\p{N}])s0?(\d{1,2})(?:$|[^\p{L}\p{N}])").unwrap());
/// `sNeM` not adjacent to letters or digits, capturing the season number.
static SXE_SEASON_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r"(?i)(?:^|[^\p{L}\p{N}])s0?(\d{1,2})e\d{1,4}(?:$|[^\p{L}\p{N}])").unwrap()
});
/// Prefix patterns tried in order by `next_token`; the first one that matches a
/// non-empty prefix of the remaining input yields the next token. Order matters:
/// the more specific shapes (resolution, codec names) come before the generic
/// word and number patterns.
// NOTE(review): the separator class lists `:` twice; one of them may have been a
// full-width colon before a text normalization — confirm against upstream.
static TOKEN_REGEXES: Lazy<Vec<Regex>> = Lazy::new(|| {
[
r"^\d{3,4}[xX×]\d{3,4}",
r"(?i)^h\.?26[45]",
r"(?i)^x\.?26[45]",
r"^[\\/]+",
r"^[-_.::+&|]+",
r"^\s+",
r"(?i)^Season\s*\d{1,2}",
r"^[A-Za-z]+(?:\d+[A-Za-z]*)*",
r"^\d+[A-Za-z]+\d*",
r"^\d{1,4}(?:[._-]\d{1,4})*",
r"^[\p{Hiragana}\p{Katakana}\p{Han}]+",
]
.into_iter()
.map(|pattern| Regex::new(pattern).unwrap())
.collect()
});
/// Episode number with optional `E`/`EP`/`#` prefix and no suffix.
static SIMPLE_EPISODE_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}$").unwrap());
/// One or more whitespace, `_`, `.` or `-` characters.
static SPECIAL_SPACE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[\s_.-]+").unwrap());
/// Entry point. Dispatches to cluster / audit / verify mode when the matching
/// flag is set; otherwise applies the selected recipes to every input value in
/// parallel, writes the generated records as JSONL, and writes and prints a manifest.
///
/// # Errors
/// Fails when the thread pool cannot be configured, `--expand` is neither "all"
/// nor "sample", no recipe is selected, or any file operation or JSON
/// (de)serialization fails.
fn main() -> Result<()> {
let args = Args::parse();
// Only touch rayon's global pool when a thread count was requested.
if let Some(threads) = args.threads {
rayon::ThreadPoolBuilder::new()
.num_threads(threads)
.build_global()
.context("failed to configure rayon thread pool")?;
}
// Mode flags are mutually exclusive by precedence: the first one set wins.
if args.cluster {
return run_cluster(&args);
}
if args.audit_low_frequency {
return run_low_frequency_audit(&args);
}
if args.verify_generated_output {
return run_verify_generated_output(&args);
}
if args.expand != "all" && args.expand != "sample" {
bail!("--expand must be all or sample");
}
let recipes = load_recipes(&args)?;
if recipes.is_empty() {
bail!("no recipes selected; adjust --recipes/--confidence/--min-count/--limit-templates");
}
let inputs = load_input(&args.input, args.limit)?;
// One shared counter per template id, used by `process_filename` in sample mode.
let sample_counters: HashMap<String, AtomicUsize> = recipes
.values()
.map(|recipe| (recipe.template_id.clone(), AtomicUsize::new(0)))
.collect();
// Process every input on the rayon pool; results keep the input order.
let processed: Vec<Processed> = inputs
.par_iter()
.map(|filename| process_filename(filename, &args, &recipes, &sample_counters))
.collect();
if let Some(parent) = args.output.parent() {
fs::create_dir_all(parent)?;
}
if let Some(parent) = args.manifest_output.parent() {
fs::create_dir_all(parent)?;
}
let mut stats = Stats {
seen: inputs.len(),
..Stats::default()
};
let mut label_counts: HashMap<String, usize> = HashMap::new();
let mut template_counts: HashMap<String, usize> = HashMap::new();
let mut examples = Vec::new();
let mut writer = BufWriter::new(File::create(&args.output)?);
// Single-threaded pass: write records and fold the statistics.
for item in processed {
match item {
Processed::Written {
record,
trimmed_parent,
} => {
if trimmed_parent {
stats.trimmed_parent_path += 1;
}
for label in &record.labels {
*label_counts.entry(label.clone()).or_default() += 1;
}
*template_counts
.entry(record.template_id.clone())
.or_default() += 1;
// Keep the first 20 written records as manifest examples.
if examples.len() < 20 {
examples.push(serde_json::to_value(&record)?);
}
serde_json::to_writer(&mut writer, &record)?;
writer.write_all(b"\n")?;
stats.written += 1;
}
Processed::Skipped {
reason,
trimmed_parent,
} => {
if trimmed_parent {
stats.trimmed_parent_path += 1;
}
// Unknown reasons are ignored rather than counted.
match reason {
"encoding_noise" => stats.skipped_encoding_noise += 1,
"no_recipe" => stats.skipped_no_recipe += 1,
"sample_cap" => stats.skipped_sample_cap += 1,
"role_mismatch" => stats.skipped_role_mismatch += 1,
"low_frequency_audit_warning" => {
stats.skipped_low_frequency_audit_warning += 1
}
_ => {}
}
}
}
}
// Flush explicitly so write errors surface here instead of being dropped.
writer.flush()?;
// Top 20 templates by written-record count, ties broken by template id.
let mut top_template_counts: Vec<_> = template_counts.into_iter().collect();
top_template_counts.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
top_template_counts.truncate(20);
let manifest = json!({
"generated_at": Utc::now().to_rfc3339(),
"input": args.input.to_string_lossy(),
"recipes": args.recipes.to_string_lossy(),
"output": args.output.to_string_lossy(),
"selected_templates": recipes.len(),
"confidence": args.confidence,
"min_count": args.min_count,
"low_frequency_audit_max_count": args.audit_max_count,
"low_frequency_blocking_warnings": [
"hash_labeled",
"multiple_title_spans",
"no_title",
"path_retained"
],
"expand": args.expand,
"sample_per_template": if args.expand == "sample" { Some(args.sample_per_template) } else { None },
"stats": stats,
"label_counts": label_counts,
"top_template_counts": top_template_counts,
"examples": examples,
"implementation": "rust_dmhy_template_apply"
});
fs::write(
&args.manifest_output,
serde_json::to_string_pretty(&manifest)?,
)?;
println!("{}", serde_json::to_string_pretty(&manifest)?);
Ok(())
}
/// Loads the recipe JSONL at `args.recipes` into a map keyed by template string.
///
/// Blank lines are skipped. A row is selected only when
/// * its `confidence` equals `args.confidence` (an empty `args.confidence`
///   disables this filter), and
/// * its `count` (a missing count is treated as 0) is at least `args.min_count`.
///
/// Rows that share a template string replace earlier ones. Reading stops as
/// soon as `args.limit_templates` recipes have been selected. The cap is
/// checked before each row is considered — the same way `load_input` handles
/// its limit — so a limit of 0 selects nothing instead of letting one recipe
/// slip through.
///
/// # Errors
/// Fails when the file cannot be opened or read, or when a non-blank line is
/// not a valid recipe JSON object (the error names the file and line number).
fn load_recipes(args: &Args) -> Result<HashMap<String, Recipe>> {
    let file = File::open(&args.recipes)
        .with_context(|| format!("recipe JSONL not found: {}", args.recipes.display()))?;
    let mut recipes: HashMap<String, Recipe> = HashMap::new();
    for (line_number, line) in BufReader::new(file).lines().enumerate() {
        // Enforce the cap up front so that `--limit-templates 0` is honoured.
        if args
            .limit_templates
            .is_some_and(|limit| recipes.len() >= limit)
        {
            break;
        }
        let line = line?;
        if line.trim().is_empty() {
            continue;
        }
        let row: Recipe = serde_json::from_str(&line).with_context(|| {
            format!(
                "invalid recipe JSON at {}:{}",
                args.recipes.display(),
                line_number + 1
            )
        })?;
        if !args.confidence.is_empty()
            && row.confidence.as_deref() != Some(args.confidence.as_str())
        {
            continue;
        }
        if row.count.unwrap_or(0) < args.min_count {
            continue;
        }
        recipes.insert(row.template.clone(), row);
    }
    Ok(recipes)
}
fn load_input(path: &PathBuf, limit: Option<usize>) -> Result<Vec<String>> {
let file =
File::open(path).with_context(|| format!("input JSONL not found: {}", path.display()))?;
let mut values = Vec::new();
for (line_number, line) in BufReader::new(file).lines().enumerate() {
if limit.is_some_and(|limit| values.len() >= limit) {
break;
}
let line = line?;
if line.trim().is_empty() {
continue;
}
let row: Value = serde_json::from_str(&line)
.with_context(|| format!("invalid JSON at {}:{}", path.display(), line_number + 1))?;
if let Some(value) = row.get("value").and_then(Value::as_str) {
let value = value.trim();
if !value.is_empty() {
values.push(value.to_string());
}
}
}
Ok(values)
}
/// Cluster mode: groups the input filenames by template key, ranks the
/// clusters by size, and writes the cluster, sample, recipe and review JSONL
/// files plus a summary JSON (which is also printed).
///
/// # Errors
/// Fails when the input cannot be loaded or any output cannot be written.
fn run_cluster(args: &Args) -> Result<()> {
let inputs = load_input(&args.input, args.limit)?;
let source_rows = inputs.len();
let mut clusters: HashMap<String, Cluster> = HashMap::new();
let mut skipped_encoding_noise = 0usize;
let mut trimmed_parent_path = 0usize;
let mut total_rows = 0usize;
for original in inputs {
// All three noise checks are reported under the one "encoding noise" counter.
if !args.keep_encoding_noise
&& (has_encoding_noise(&original)
|| has_non_anime_noise(&original)
|| has_abstract_path_noise(&original))
{
skipped_encoding_noise += 1;
continue;
}
let filename = if args.preserve_parent_paths {
original
} else {
let (training_filename, was_trimmed) = training_filename_for(&original);
if was_trimmed {
trimmed_parent_path += 1;
}
training_filename
};
add_cluster(&mut clusters, &filename, args.examples);
total_rows += 1;
}
// Largest clusters first; ties broken by template key so the ranking is stable.
let mut sorted_clusters: Vec<_> = clusters.into_iter().collect();
sorted_clusters.sort_by(|a, b| b.1.count.cmp(&a.1.count).then_with(|| a.0.cmp(&b.0)));
// Rank is 1-based and becomes part of the template id.
let cluster_rows: Vec<Value> = sorted_clusters
.iter()
.enumerate()
.map(|(index, (key, cluster))| cluster_row(index + 1, key, cluster, total_rows))
.collect();
let samples: Vec<Value> = cluster_rows.iter().take(args.top).cloned().collect();
let recipe_candidates: Vec<Value> =
cluster_rows.iter().take(args.recipe_top).cloned().collect();
let recipes: Vec<Value> = recipe_candidates
.iter()
.filter(|row| is_high_confidence_recipe(row, args.recipe_min_count))
.map(|row| recipe_row(row, "high"))
.collect();
// Review rows are the candidates that did not qualify as high confidence.
// NOTE(review): they are drawn from the first `recipe_top` clusters only, so
// with the defaults (`recipe_top` 200, `review_top` 1000) the `take` below
// never limits anything — confirm this is intended.
let review: Vec<Value> = recipe_candidates
.iter()
.filter(|row| !is_high_confidence_recipe(row, args.recipe_min_count))
.take(args.review_top)
.cloned()
.collect();
write_jsonl_values(&args.clusters_output, &cluster_rows)?;
write_jsonl_values(&args.samples_output, &samples)?;
write_jsonl_values(&args.recipes_output, &recipes)?;
write_jsonl_values(&args.review_output, &review)?;
// Histogram: cluster size -> number of clusters of that size.
let mut histogram: HashMap<usize, usize> = HashMap::new();
for (_, cluster) in &sorted_clusters {
*histogram.entry(cluster.count).or_default() += 1;
}
// Keep the 20 most common cluster sizes, ties broken by smaller size first.
let mut count_histogram_top: Vec<_> = histogram.into_iter().collect();
count_histogram_top.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
count_histogram_top.truncate(20);
// Rows that belong to clusters with at least `min_count` members.
let rows_covered_by_repeated_templates: usize = sorted_clusters
.iter()
.map(|(_, cluster)| cluster)
.filter(|cluster| cluster.count as u64 >= args.min_count)
.map(|cluster| cluster.count)
.sum();
let templates_at_least_min_count = sorted_clusters
.iter()
.filter(|(_, cluster)| cluster.count as u64 >= args.min_count)
.count();
let top_templates: Vec<Value> = cluster_rows.iter().take(20).cloned().collect();
let summary = json!({
"input": args.input.to_string_lossy(),
"source_rows": source_rows,
"skipped_encoding_noise": skipped_encoding_noise,
"trimmed_parent_path": trimmed_parent_path,
"total_rows": total_rows,
"unique_templates": sorted_clusters.len(),
"min_count": args.min_count,
"templates_at_least_min_count": templates_at_least_min_count,
"rows_covered_by_repeated_templates": rows_covered_by_repeated_templates,
"rows_covered_by_repeated_templates_ratio": if total_rows == 0 { 0.0 } else { rows_covered_by_repeated_templates as f64 / total_rows as f64 },
"top_output_rows": samples.len(),
"clusters_output": args.clusters_output.to_string_lossy(),
"cluster_rows": cluster_rows.len(),
"recipes_output": args.recipes_output.to_string_lossy(),
"recipe_rows": recipes.len(),
"review_output": args.review_output.to_string_lossy(),
"review_rows": review.len(),
"recipe_top": args.recipe_top,
"recipe_min_count": args.recipe_min_count,
"top_templates": top_templates,
"count_histogram_top": count_histogram_top,
"implementation": "rust_dmhy_template_cluster",
"generated_at": Utc::now().to_rfc3339(),
});
if let Some(parent) = args.summary_output.parent() {
fs::create_dir_all(parent)?;
}
fs::write(
&args.summary_output,
serde_json::to_string_pretty(&summary)?,
)?;
println!("{}", serde_json::to_string_pretty(&summary)?);
Ok(())
}
/// Folds one filename into the cluster that owns its template key.
///
/// Increments the cluster's count, keeps the filename as an example while
/// fewer than `example_limit` are stored, tallies every token class, tallies
/// the unwrapped literal of each TEXT / BRACKET_TEXT token, and tallies the
/// text of each TEXT / BRACKET_TEXT group under that group's position.
fn add_cluster(clusters: &mut HashMap<String, Cluster>, filename: &str, example_limit: usize) {
    let is_literal_class = |class_name: &str| matches!(class_name, "TEXT" | "BRACKET_TEXT");
    let (key, tokens, classes, groups) = template_key_for_filename(filename);

    let entry = clusters.entry(key).or_default();
    entry.count += 1;
    if entry.examples.len() < example_limit {
        entry.examples.push(filename.to_string());
    }

    // Per-token tallies: every class, plus the literal for text-like tokens.
    for (token, class_name) in tokens.iter().zip(classes.iter()) {
        *entry.class_counts.entry(class_name.clone()).or_default() += 1;
        if !is_literal_class(class_name.as_str()) {
            continue;
        }
        let literal = strip_wrapper(token);
        if literal.is_empty() {
            continue;
        }
        *entry.literal_counts.entry(literal).or_default() += 1;
    }

    // Grow (never shrink) the per-position tables to cover every group.
    if entry.position_literals.len() < groups.len() {
        entry.position_literals.resize_with(groups.len(), HashMap::new);
    }
    for (slot, group) in entry.position_literals.iter_mut().zip(groups.iter()) {
        if !is_literal_class(group.class_name.as_str()) {
            continue;
        }
        let text = group_text(&tokens, group);
        if text.is_empty() {
            continue;
        }
        *slot.entry(text).or_default() += 1;
    }
}
fn cluster_row(rank: usize, key: &str, cluster: &Cluster, total: usize) -> Value {
json!({
"template_id": format!("tpl_{rank:06}"),
"template": key,
"count": cluster.count,
"coverage": if total == 0 { 0.0 } else { cluster.count as f64 / total as f64 },
"top_literals": top_counts(&cluster.literal_counts, 12),
"suggested_roles": suggested_roles(key),
"position_top_literals": cluster.position_literals.iter().map(|counts| top_counts(counts, 5)).collect::<Vec<_>>(),
"class_counts": top_counts(&cluster.class_counts, 20),
"examples": cluster.examples,
})
}
/// Returns up to `limit` `(key, count)` pairs from `counts`, ordered by
/// descending count and then by ascending key.
fn top_counts(counts: &HashMap<String, usize>, limit: usize) -> Vec<(String, usize)> {
    let mut ranked: Vec<(String, usize)> = Vec::with_capacity(counts.len());
    for (name, &occurrences) in counts {
        ranked.push((name.clone(), occurrences));
    }
    ranked.sort_by(|left, right| match right.1.cmp(&left.1) {
        std::cmp::Ordering::Equal => left.0.cmp(&right.0),
        unequal => unequal,
    });
    ranked.truncate(limit);
    ranked
}
/// Decides whether a cluster row qualifies as a high-confidence recipe.
///
/// A row qualifies when all of the following hold:
/// * its `count` is at least `min_count`;
/// * it has a `suggested_roles` array with no role containing `_OR_`;
/// * the roles include `TITLE` and at least one role that starts with
///   `EPISODE` or equals `SPECIAL`, `SOURCE` or `RESOLUTION`;
/// * a template containing `BRACKET_TEXT BRACKET_TEXT` also has a `GROUP` role;
/// * no role equals `TITLE_OR_TEXT`.
fn is_high_confidence_recipe(row: &Value, min_count: usize) -> bool {
    let count = row.get("count").and_then(Value::as_u64).unwrap_or(0);
    if count < min_count as u64 {
        return false;
    }
    let Some(roles) = row.get("suggested_roles").and_then(Value::as_array) else {
        return false;
    };
    let role_names: Vec<&str> = roles.iter().filter_map(Value::as_str).collect();
    let has_role = |wanted: &str| role_names.iter().any(|role| *role == wanted);

    let ambiguous = role_names.iter().any(|role| role.contains("_OR_"));
    if ambiguous {
        return false;
    }

    let has_anchor = role_names.iter().any(|role| {
        role.starts_with("EPISODE") || matches!(*role, "SPECIAL" | "SOURCE" | "RESOLUTION")
    });
    if !(has_role("TITLE") && has_anchor) {
        return false;
    }

    let template = row.get("template").and_then(Value::as_str).unwrap_or("");
    let adjacent_bracket_text = template.contains("BRACKET_TEXT BRACKET_TEXT");
    if adjacent_bracket_text && !has_role("GROUP") {
        return false;
    }

    !has_role("TITLE_OR_TEXT")
}
/// Projects a cluster row onto the recipe schema, tagging it with `confidence`.
///
/// `suggested_roles` is renamed to `roles`; fields missing from `row` become
/// JSON `null`.
fn recipe_row(row: &Value, confidence: &str) -> Value {
    let field = |name: &str| row[name].clone();
    json!({
        "template_id": field("template_id"),
        "template": field("template"),
        "roles": field("suggested_roles"),
        "confidence": confidence,
        "count": field("count"),
        "examples": field("examples"),
    })
}
/// Writes `rows` to `path` as JSON Lines (one compact JSON value per line),
/// creating the parent directory first and replacing any existing file.
///
/// # Errors
/// Fails when the directory or file cannot be created, or when serializing,
/// writing or flushing fails.
fn write_jsonl_values(path: &PathBuf, rows: &[Value]) -> Result<()> {
    if let Some(dir) = path.parent() {
        fs::create_dir_all(dir)?;
    }
    let mut sink = BufWriter::new(File::create(path)?);
    rows.iter().try_for_each(|row| -> Result<()> {
        serde_json::to_writer(&mut sink, row)?;
        sink.write_all(b"\n")?;
        Ok(())
    })?;
    // Flush explicitly: dropping a BufWriter would swallow a write error.
    sink.flush()?;
    Ok(())
}
/// Audit mode: for each low-frequency template (recipe `count` at most
/// `--audit-max-count`), records the first input filename that matches it,
/// together with its labels, entity spans and audit warnings. The rows are
/// written to `--audit-output`; a manifest is printed to stdout only.
///
/// # Errors
/// Fails when the recipes or input cannot be loaded or the audit file cannot
/// be written.
fn run_low_frequency_audit(args: &Args) -> Result<()> {
let recipes = load_recipes(args)?;
let inputs = load_input(&args.input, args.limit)?;
// Number of selected recipes that count as low-frequency; used to stop early.
let low_template_total = recipes
.values()
.filter(|recipe| recipe.count.unwrap_or(0) <= args.audit_max_count)
.count();
let mut seen_templates = HashSet::new();
let mut rows = Vec::new();
for original in inputs {
if !args.keep_encoding_noise
&& (has_encoding_noise(&original)
|| has_non_anime_noise(&original)
|| has_abstract_path_noise(&original))
{
continue;
}
let (training_filename, trimmed_parent) = training_filename_for(&original);
let (key, _tokens, _classes, groups) = template_key_for_filename(&training_filename);
let Some(recipe) = recipes.get(&key) else {
continue;
};
let count = recipe.count.unwrap_or(0);
// Skip high-frequency templates and templates already visited. The template
// is marked as seen here, before the checks below, so a template whose first
// filename fails those checks is not retried with a later filename.
if count > args.audit_max_count || !seen_templates.insert(recipe.template_id.clone()) {
continue;
}
if recipe.roles.len() != groups.len() {
continue;
}
let Some(mut record) = dmhy_record(&training_filename, &recipe.template_id, &recipe.roles)
else {
continue;
};
if trimmed_parent {
record.source_filename = Some(original.clone());
record.path_trimmed = Some(true);
}
rows.push(json!({
"template_id": recipe.template_id,
"count": count,
"template": recipe.template,
"filename": record.filename,
"source_filename": record.source_filename,
"path_trimmed": record.path_trimmed.unwrap_or(false),
"spans": entity_spans(&record.tokens, &record.labels),
"warnings": audit_warnings(&record),
"tokens": record.tokens,
"labels": record.labels,
}));
// Every low-frequency template has been visited: nothing left to find.
if seen_templates.len() >= low_template_total {
break;
}
}
// Rarest templates first, ties broken by template id.
rows.sort_by(|a, b| {
let count_a = a.get("count").and_then(Value::as_u64).unwrap_or(0);
let count_b = b.get("count").and_then(Value::as_u64).unwrap_or(0);
let id_a = a.get("template_id").and_then(Value::as_str).unwrap_or("");
let id_b = b.get("template_id").and_then(Value::as_str).unwrap_or("");
count_a.cmp(&count_b).then_with(|| id_a.cmp(id_b))
});
write_jsonl_values(&args.audit_output, &rows)?;
let warning_counts = warning_counts(&rows);
let manifest = json!({
"generated_at": Utc::now().to_rfc3339(),
"input": args.input.to_string_lossy(),
"recipes": args.recipes.to_string_lossy(),
"audit_output": args.audit_output.to_string_lossy(),
"audit_max_count": args.audit_max_count,
"low_template_total": low_template_total,
"audited_templates": rows.len(),
"warning_counts": warning_counts,
"implementation": "rust_dmhy_low_frequency_audit"
});
println!("{}", serde_json::to_string_pretty(&manifest)?);
Ok(())
}
/// Verify mode: reads generated records from `--input`, re-runs the audit on
/// every record whose template is low-frequency, and prints a manifest with
/// the blocking warnings that were found.
///
/// # Errors
/// Fails when the input or recipes cannot be read, when a row is not a valid
/// record, or — after printing the manifest — when any blocking warning was found.
fn run_verify_generated_output(args: &Args) -> Result<()> {
let file = File::open(&args.input)
.with_context(|| format!("generated JSONL not found: {}", args.input.display()))?;
// template id -> recipe count (a missing count is treated as 0).
let recipes_by_id: HashMap<String, u64> = load_recipes(args)?
.into_values()
.map(|recipe| (recipe.template_id, recipe.count.unwrap_or(0)))
.collect();
let mut rows = 0usize;
let mut low_frequency_rows = 0usize;
let mut warning_counts: HashMap<String, usize> = HashMap::new();
let mut examples: HashMap<String, Vec<Value>> = HashMap::new();
for (line_number, line) in BufReader::new(file).lines().enumerate() {
let line = line?;
if line.trim().is_empty() {
continue;
}
let record: Record = serde_json::from_str(&line).with_context(|| {
format!(
"invalid generated record at {}:{}",
args.input.display(),
line_number + 1
)
})?;
rows += 1;
// A template id that is not among the selected recipes gets u64::MAX and is
// therefore never treated as low-frequency.
let count = recipes_by_id
.get(&record.template_id)
.copied()
.unwrap_or(u64::MAX);
if count > args.audit_max_count {
continue;
}
low_frequency_rows += 1;
for warning in audit_warnings(&record) {
// Only the blocking warnings are counted; others (e.g. "no_episode") are ignored.
if !matches!(
warning.as_str(),
"hash_labeled" | "multiple_title_spans" | "no_title" | "path_retained"
) {
continue;
}
*warning_counts.entry(warning.clone()).or_default() += 1;
// Keep at most five examples per warning.
let bucket = examples.entry(warning).or_default();
if bucket.len() < 5 {
bucket.push(json!({
"template_id": record.template_id,
"template_count": count,
"filename": record.filename,
"spans": entity_spans(&record.tokens, &record.labels),
}));
}
}
}
let manifest = json!({
"generated_at": Utc::now().to_rfc3339(),
"input": args.input.to_string_lossy(),
"recipes": args.recipes.to_string_lossy(),
"audit_max_count": args.audit_max_count,
"rows": rows,
"low_frequency_rows": low_frequency_rows,
"blocking_warning_counts": warning_counts,
"examples": examples,
"implementation": "rust_dmhy_generated_output_verify"
});
println!("{}", serde_json::to_string_pretty(&manifest)?);
// The manifest is printed first so the failure details are visible.
if !warning_counts.is_empty() {
bail!("generated output still has low-frequency blocking warnings");
}
Ok(())
}
/// Collapses token/label pairs into entity spans.
///
/// The entity of a label is whatever follows its `B-` or `I-` prefix; labels
/// without either prefix count as `O`. Consecutive tokens with the same entity
/// are concatenated into one span — a `B-` prefix on a later token does not
/// start a new span. Spans whose entity is `O` are dropped. Each span is
/// returned as `{"label": ..., "text": ...}`.
fn entity_spans(tokens: &[String], labels: &[String]) -> Vec<Value> {
    // Runs of (entity, concatenated text), including the `O` runs.
    let mut runs: Vec<(&str, String)> = Vec::new();
    for (token, label) in tokens.iter().zip(labels) {
        let entity = match label
            .strip_prefix("B-")
            .or_else(|| label.strip_prefix("I-"))
        {
            Some(name) => name,
            None => "O",
        };
        let extends_open_run = runs.last().is_some_and(|(open, _)| *open == entity);
        if extends_open_run {
            if let Some((_, text)) = runs.last_mut() {
                text.push_str(token);
            }
        } else {
            runs.push((entity, token.clone()));
        }
    }
    runs.into_iter()
        .filter(|(entity, _)| *entity != "O")
        .map(|(entity, text)| json!({ "label": entity, "text": text }))
        .collect()
}
/// Returns the sorted, de-duplicated audit warnings for `record`:
///
/// * `no_title` / `multiple_title_spans` — zero, or more than one, TITLE span;
/// * `no_episode` — no label ends with `EPISODE`;
/// * `path_retained` — the filename still contains `/` or `\`;
/// * `hash_labeled` — a token that looks like a hash carries a label other than `O`.
fn audit_warnings(record: &Record) -> Vec<String> {
    let title_span_count = entity_spans(&record.tokens, &record.labels)
        .iter()
        .filter(|span| span.get("label").and_then(Value::as_str) == Some("TITLE"))
        .count();

    let mut found: Vec<&'static str> = Vec::new();
    match title_span_count {
        0 => found.push("no_title"),
        1 => {}
        _ => found.push("multiple_title_spans"),
    }

    let has_episode = record.labels.iter().any(|label| label.ends_with("EPISODE"));
    if !has_episode {
        found.push("no_episode");
    }

    let has_path_separator = record.filename.chars().any(|ch| matches!(ch, '/' | '\\'));
    if has_path_separator {
        found.push("path_retained");
    }

    // Tokens without a matching label are ignored, as zip stops at the shorter side.
    let hash_labeled = record
        .tokens
        .iter()
        .zip(record.labels.iter())
        .any(|(token, label)| HASH_RE.is_match(token) && label.as_str() != "O");
    if hash_labeled {
        found.push("hash_labeled");
    }

    found.sort();
    found.dedup();
    found.into_iter().map(String::from).collect()
}
/// Tallies how often each warning string appears in the `warnings` arrays of
/// `rows`. Rows without a `warnings` array and non-string entries are ignored.
fn warning_counts(rows: &[Value]) -> HashMap<String, usize> {
    let names = rows
        .iter()
        .filter_map(|row| row.get("warnings").and_then(Value::as_array))
        .flatten()
        .filter_map(Value::as_str);
    let mut tally: HashMap<String, usize> = HashMap::new();
    for name in names {
        *tally.entry(name.to_string()).or_default() += 1;
    }
    tally
}
/// Applies the matching recipe to one input value and reports the outcome.
///
/// Steps, in order:
/// 1. unless `--keep-encoding-noise` is set, inputs flagged by any noise check
///    are skipped as `encoding_noise`;
/// 2. the training filename and its template key are derived; an unknown
///    template is skipped as `no_recipe`;
/// 3. a recipe whose role count differs from the group count, or for which no
///    record can be built, is skipped as `role_mismatch`;
/// 4. a low-frequency recipe (count at most `--audit-max-count`) whose record
///    has a blocking audit warning is skipped as `low_frequency_audit_warning`;
/// 5. in `--expand sample` mode, a slot of the template's quota is claimed; if
///    the quota is exhausted the record is skipped as `sample_cap`.
///
/// The quota slot is claimed only after the record has passed every check, so
/// rejected records no longer use up the per-template sample quota. A cheap
/// read of the counter before the expensive work still lets templates that
/// are already full exit early.
///
/// # Panics
/// Panics in sample mode if `sample_counters` has no entry for the recipe's
/// template id; `main` creates one counter per selected recipe.
fn process_filename(
    original: &str,
    args: &Args,
    recipes: &HashMap<String, Recipe>,
    sample_counters: &HashMap<String, AtomicUsize>,
) -> Processed {
    if !args.keep_encoding_noise
        && (has_encoding_noise(original)
            || has_non_anime_noise(original)
            || has_abstract_path_noise(original))
    {
        return Processed::Skipped {
            reason: "encoding_noise",
            trimmed_parent: false,
        };
    }

    let (training_filename, trimmed_parent) = training_filename_for(original);
    let (key, _tokens, _classes, groups) = template_key_for_filename(&training_filename);
    let skipped = |reason: &'static str| Processed::Skipped {
        reason,
        trimmed_parent,
    };

    let Some(recipe) = recipes.get(&key) else {
        return skipped("no_recipe");
    };

    // Only sample mode is capped; `None` means "no quota applies".
    let sample_counter: Option<&AtomicUsize> = if args.expand == "sample" {
        Some(
            sample_counters
                .get(&recipe.template_id)
                .expect("every selected recipe has a sample counter"),
        )
    } else {
        None
    };
    // Fast path: the quota is already used up, so skip the expensive work.
    if sample_counter
        .is_some_and(|counter| counter.load(Ordering::Relaxed) >= args.sample_per_template)
    {
        return skipped("sample_cap");
    }

    if recipe.roles.len() != groups.len() {
        return skipped("role_mismatch");
    }
    let Some(mut record) = dmhy_record(&training_filename, &recipe.template_id, &recipe.roles)
    else {
        return skipped("role_mismatch");
    };
    if recipe.count.unwrap_or(0) <= args.audit_max_count
        && has_blocking_low_frequency_warning(&record)
    {
        return skipped("low_frequency_audit_warning");
    }

    // Claim a quota slot only now that the record is known to be writable.
    if sample_counter
        .is_some_and(|counter| counter.fetch_add(1, Ordering::Relaxed) >= args.sample_per_template)
    {
        return skipped("sample_cap");
    }

    if trimmed_parent {
        record.source_filename = Some(original.to_string());
        record.path_trimmed = Some(true);
    }
    Processed::Written {
        record,
        trimmed_parent,
    }
}
/// Reports whether `record` has at least one audit warning that blocks
/// low-frequency templates: `hash_labeled`, `multiple_title_spans`,
/// `no_title` or `path_retained`.
fn has_blocking_low_frequency_warning(record: &Record) -> bool {
    const BLOCKING: [&str; 4] = [
        "hash_labeled",
        "multiple_title_spans",
        "no_title",
        "path_retained",
    ];
    audit_warnings(record)
        .iter()
        .any(|warning| BLOCKING.iter().any(|blocking| *blocking == warning.as_str()))
}
/// Splits `value` into tokens from left to right.
///
/// Each step takes the token proposed by `next_token`; when it proposes
/// nothing, the next single character becomes a token of its own. The
/// concatenation of the returned tokens equals `value`.
fn tokenize(value: &str) -> Vec<String> {
    let mut pieces = Vec::new();
    let mut remaining = value;
    while let Some(lead) = remaining.chars().next() {
        let (piece, consumed) =
            next_token(remaining).unwrap_or_else(|| (lead.to_string(), lead.len_utf8()));
        pieces.push(piece);
        remaining = &remaining[consumed..];
    }
    pieces
}
/// Proposes the token at the start of `rest`, returning it with its byte length.
///
/// A leading `[`, `(` or `【` yields the whole bracketed run up to the first
/// matching closer, provided the closer is near enough:
/// * for `[` and `(`, the closer's byte offset must be at most 121;
/// * for `【`, the text before the closer (opener included) must be at most
///   120 characters.
///
/// Otherwise the first pattern in `TOKEN_REGEXES` that matches a non-empty
/// prefix wins. Returns `None` when `rest` is empty or nothing applies.
// NOTE(review): the ASCII brackets are limited in bytes while `【` is limited in
// characters, so multi-byte content inside `[...]` hits the limit much sooner —
// confirm whether that difference is intended.
fn next_token(rest: &str) -> Option<(String, usize)> {
    let first = rest.chars().next()?;
    let bracket_len = match first {
        '[' => rest
            .find(']')
            .filter(|&close| close <= 121)
            .map(|close| close + 1),
        '(' => rest
            .find(')')
            .filter(|&close| close <= 121)
            .map(|close| close + 1),
        '【' => rest
            .find('】')
            .filter(|&close| rest[..close].chars().count() <= 120)
            .map(|close| close + '】'.len_utf8()),
        _ => None,
    };
    if let Some(len) = bracket_len {
        return Some((rest[..len].to_string(), len));
    }
    TOKEN_REGEXES
        .iter()
        .filter_map(|re| re.find(rest))
        .find(|mat| mat.start() == 0 && mat.end() > 0)
        .map(|mat| (mat.as_str().to_string(), mat.end()))
}
/// Removes one surrounding bracket pair (`[...]`, `(...)` or `【...】`) from
/// `token`, if present, and trims whitespace from the result. Tokens without
/// a matching pair are only trimmed.
fn strip_wrapper(token: &str) -> String {
    const PAIRS: [(char, char); 3] = [('[', ']'), ('(', ')'), ('【', '】')];
    let mut chars = token.chars();
    // Taking the first and last characters leaves the iterator holding the
    // interior; a one-character token has no last character left to take.
    let unwrapped = match (chars.next(), chars.next_back()) {
        (Some(open), Some(close)) if PAIRS.contains(&(open, close)) => chars.as_str(),
        _ => token,
    };
    unwrapped.trim().to_string()
}
/// Splits bracket content into its non-empty parts, treating whitespace and
/// any of `_ . , + / & | - ( )` as separators.
// NOTE(review): the separator literal lists `(` and `)` twice; the second pair
// may have been full-width parentheses before a text normalization — confirm
// against upstream.
fn split_inner(inner: &str) -> Vec<String> {
    inner
        .split(|ch: char| ch.is_whitespace() || "_.,+/&|-()()".contains(ch))
        .filter(|piece| !piece.is_empty())
        .map(str::to_string)
        .collect()
}
/// Returns `text` with all whitespace and every `_`, `.`, `,` and `-` removed.
fn compact_for_classify(text: &str) -> String {
    let mut compact = String::with_capacity(text.len());
    for ch in text.chars() {
        let dropped = ch.is_whitespace() || ch == '_' || ch == '.' || ch == ',' || ch == '-';
        if !dropped {
            compact.push(ch);
        }
    }
    compact
}
/// Classifies a single piece of text (wrapper brackets are removed first).
///
/// The checks run in a fixed order and the first hit wins: EMPTY, HASH,
/// RESOLUTION, DATE, EPISODE_VERSION, SXE, EPISODE, EPISODE_RANGE, SEASON,
/// SPECIAL, VOLUME, LANG, MEDIA, and finally TEXT. The EPISODE_VERSION, SXE
/// and first EPISODE checks look at the compacted text (whitespace and
/// `_ . , -` removed); every other check looks at the unwrapped text.
fn classify_atom(text: &str) -> String {
    let cleaned = strip_wrapper(text);
    let compact = compact_for_classify(&cleaned);
    let class_name = if cleaned.is_empty() {
        "EMPTY"
    } else if HASH_RE.is_match(&cleaned) {
        "HASH"
    } else if RESOLUTION_RE.is_match(&cleaned) {
        "RESOLUTION"
    } else if DATE_RE.is_match(&cleaned) {
        "DATE"
    } else if EPISODE_VERSION_RE.is_match(&compact) {
        "EPISODE_VERSION"
    } else if SXE_RE.is_match(&compact) {
        "SXE"
    } else if EPISODE_RE.is_match(&compact) || EPISODE_CJK_RE.is_match(&cleaned) {
        "EPISODE"
    } else if EPISODE_BATCH_RE.is_match(&cleaned) || EPISODE_RANGE_RE.is_match(&cleaned) {
        "EPISODE_RANGE"
    } else if EPISODE_RE.is_match(&cleaned) {
        "EPISODE"
    } else if SEASON_RE.is_match(&cleaned) {
        "SEASON"
    } else if SPECIAL_RE.is_match(&cleaned) {
        "SPECIAL"
    } else if VOLUME_RE.is_match(&cleaned) {
        "VOLUME"
    } else if LANG_RE.is_match(&cleaned) || lang_block_matches(&cleaned) {
        "LANG"
    } else if MEDIA_RE.is_match(&cleaned) {
        "MEDIA"
    } else {
        "TEXT"
    };
    class_name.to_string()
}
/// Reports whether `text` contains a language / subtitle marker anywhere.
///
/// It matches when the ASCII-uppercased text contains `CHS`, `CHT`, `ZHS`,
/// `ZHT`, `BIG5` or `GB`, when the text contains one of the CJK markers
/// (简繁, 简日, 繁日, 简体, 繁体, 雙語, 双语, 内封, 外挂), or when it contains
/// 字幕 that is not immediately followed by 组 / 組. These are substring
/// checks, so for instance any text containing "gb" matches.
fn lang_block_matches(text: &str) -> bool {
    const LATIN_MARKERS: [&str; 6] = ["CHS", "CHT", "ZHS", "ZHT", "BIG5", "GB"];
    const CJK_MARKERS: [&str; 9] = [
        "简繁", "简日", "繁日", "简体", "繁体", "雙語", "双语", "内封", "外挂",
    ];

    let upper = text.to_ascii_uppercase();
    if LATIN_MARKERS.iter().any(|marker| upper.contains(*marker)) {
        return true;
    }
    if CJK_MARKERS.iter().any(|marker| text.contains(*marker)) {
        return true;
    }
    // 字幕 counts unless it is part of 字幕组 / 字幕組.
    text.match_indices("字幕").any(|(at, hit)| {
        let following = text[at + hit.len()..].chars().next();
        !matches!(following, Some('组' | '組'))
    })
}
fn classify_token(token: &str) -> String {
if token.is_empty() {
return "EMPTY".to_string();
}
if token.chars().all(char::is_whitespace) {
return "SPACE".to_string();
}
if token.chars().all(|ch| ch == '/' || ch == '\\') {
return "PATH".to_string();
}
if token.chars().all(|ch| "-_.::+&|".contains(ch)) {
return "SEP".to_string();
}
if token.starts_with('[') || token.starts_with('(') || token.starts_with('【') {
let inner = strip_wrapper(token);
let parts = split_inner(&inner);
let whole_class = classify_atom(&inner);
let inner_class = if whole_class != "TEXT" {
if whole_class == "LANG" && parts.len() > 1 {
let part_classes: Vec<String> =
parts.iter().map(|part| classify_atom(part)).collect();
if part_classes.iter().all(|item| item == &part_classes[0]) {
part_classes[0].clone()
} else if part_classes.iter().all(|item| is_media_block_class(item)) {
"MEDIA_BLOCK".to_string()
} else {
whole_class
}
} else {
whole_class
}
} else if parts.is_empty() {
"EMPTY".to_string()
} else {
let part_classes: Vec<String> = parts.iter().map(|part| classify_atom(part)).collect();
if part_classes.iter().all(|item| item == &part_classes[0]) {
part_classes[0].clone()
} else if part_classes.iter().all(|item| is_media_block_class(item)) {
"MEDIA_BLOCK".to_string()
} else if part_classes.iter().any(|item| is_media_block_class(item))
&& parts.iter().zip(part_classes.iter()).all(|(part, item)| {
is_media_block_class(item)
|| matches!(part.to_ascii_lowercase().as_str(), "anime" | "アニメ")
})
{
"MEDIA_BLOCK".to_string()
} else if part_classes.iter().any(|item| item == "TEXT") {
"TEXT".to_string()
} else {
let mut set: Vec<String> = part_classes
.into_iter()
.collect::<HashSet<_>>()
.into_iter()
.collect();
set.sort();
set.join("_")
}
};
return format!("BRACKET_{inner_class}");
}
classify_atom(token)
}
/// Returns `true` for the atom classes that may appear together inside a
/// `MEDIA_BLOCK`: `MEDIA`, `RESOLUTION`, `LANG`, `HASH` and `DATE`.
fn is_media_block_class(value: &str) -> bool {
    const MEDIA_BLOCK_CLASSES: &[&str] = &["MEDIA", "RESOLUTION", "LANG", "HASH", "DATE"];
    MEDIA_BLOCK_CLASSES.iter().any(|known| *known == value)
}
/// Collapses per-token classes into groups.
///
/// `SPACE` is treated as `SEP`. Consecutive `SEP` classes share one group, and
/// so do consecutive `TEXT` classes; every other class gets its own group.
/// Each group records the indices (into `classes`) it covers. `_tokens` is
/// unused.
fn compact_token_groups(_tokens: &[String], classes: &[String]) -> Vec<Group> {
    let mut groups: Vec<Group> = Vec::new();
    for (index, token_class) in classes.iter().enumerate() {
        let class_name = match token_class.as_str() {
            "SPACE" => "SEP",
            other => other,
        };
        let mergeable = matches!(class_name, "SEP" | "TEXT");
        // The last group always carries the class of the previous token, so
        // comparing against it is enough to detect a run.
        match groups
            .last_mut()
            .filter(|open| mergeable && open.class_name == class_name)
        {
            Some(open) => open.indices.push(index),
            None => groups.push(Group {
                indices: vec![index],
                class_name: class_name.to_string(),
            }),
        }
    }
    groups
}
/// Tokenizes `filename` and derives its template key.
///
/// Returns `(key, tokens, classes, groups)`, where `key` is the group class
/// names joined by single spaces.
fn template_key_for_filename(filename: &str) -> (String, Vec<String>, Vec<String>, Vec<Group>) {
    let tokens = tokenize(filename);
    let mut classes: Vec<String> = Vec::with_capacity(tokens.len());
    for token in &tokens {
        classes.push(classify_token(token));
    }
    let groups = compact_token_groups(&tokens, &classes);

    let mut key = String::new();
    for (position, group) in groups.iter().enumerate() {
        if position > 0 {
            key.push(' ');
        }
        key.push_str(&group.class_name);
    }
    (key, tokens, classes, groups)
}
/// Proposes one role per whitespace-separated item of `template`.
///
/// Every item first gets a role from the keywords it contains (`O` if none).
/// The template is then cut into segments at `PATH` items. Within each
/// segment, `BRACKET_TEXT` / `TEXT` items before the first structural item
/// (anything containing `EPISODE`, or exactly `SXE` / `SPECIAL` / `SEASON`)
/// are promoted to `GROUP` / `TITLE`. A segment that starts with an episode
/// item and still has no title gets its first run of at least two `TEXT`
/// items (separators skipped) marked as `TITLE`.
fn suggested_roles(template: &str) -> Vec<String> {
    /// Keyword-based role of a single template item; the first hit wins.
    fn base_role(item: &str) -> &'static str {
        const KEYWORD_ROLES: &[(&str, &str)] = &[
            ("EPISODE_VERSION", "EPISODE_VERSION"),
            ("EPISODE_RANGE", "EPISODE_RANGE"),
            ("EPISODE", "EPISODE"),
            ("SXE", "EPISODE"),
            ("RESOLUTION", "RESOLUTION"),
            ("HASH", "HASH"),
            ("LANG", "SOURCE"),
            ("MEDIA", "SOURCE"),
            ("SPECIAL", "SPECIAL"),
            ("SEASON", "SEASON"),
            ("VOLUME", "VOLUME"),
        ];
        KEYWORD_ROLES
            .iter()
            .find(|(keyword, _)| item.contains(*keyword))
            .map_or("O", |(_, role)| *role)
    }

    let items: Vec<&str> = template.split_whitespace().collect();
    let mut roles: Vec<String> = items
        .iter()
        .map(|item| base_role(item).to_string())
        .collect();

    // Half-open [start, end) segments; a PATH item ends one segment and the
    // next one begins right after it.
    let mut segments: Vec<(usize, usize)> = Vec::new();
    let mut segment_start = 0usize;
    for (position, item) in items.iter().enumerate() {
        if *item == "PATH" {
            segments.push((segment_start, position));
            segment_start = position + 1;
        }
    }
    segments.push((segment_start, items.len()));

    let is_structural =
        |item: &str| item.contains("EPISODE") || matches!(item, "SXE" | "SPECIAL" | "SEASON");

    for (start, end) in segments {
        if start >= end {
            continue;
        }
        let title_limit = (start..end)
            .find(|&at| is_structural(items[at]))
            .unwrap_or(end);

        let mut bracket_text: Vec<usize> = Vec::new();
        let mut plain_text: Vec<usize> = Vec::new();
        for at in start..title_limit {
            if roles[at] != "O" {
                continue;
            }
            match items[at] {
                "BRACKET_TEXT" => bracket_text.push(at),
                "TEXT" => plain_text.push(at),
                _ => {}
            }
        }

        match bracket_text.as_slice() {
            [] => {}
            [only] => {
                // A lone bracket is the group only when it opens the segment
                // and plain text is left to carry the title.
                let role = if !plain_text.is_empty() && *only == start {
                    "GROUP"
                } else {
                    "TITLE"
                };
                roles[*only] = role.to_string();
            }
            [group, titles @ ..] => {
                roles[*group] = "GROUP".to_string();
                for at in titles {
                    roles[*at] = "TITLE".to_string();
                }
            }
        }
        for at in plain_text {
            roles[at] = "TITLE".to_string();
        }

        let has_title = roles[start..end].iter().any(|role| role == "TITLE");
        if has_title || !items[start].contains("EPISODE") {
            continue;
        }
        // Episode-first segment without a title: take the first run of TEXT
        // items, looking through separators.
        let mut run: Vec<usize> = Vec::new();
        for at in (start + 1)..end {
            if items[at] == "TEXT" && roles[at] == "O" {
                run.push(at);
            } else if items[at] != "SEP" && !run.is_empty() {
                break;
            }
        }
        if run.len() >= 2 {
            for at in run {
                roles[at] = "TITLE".to_string();
            }
        }
    }
    roles
}
/// Returns `true` when the roles suggested for `filename`'s template include
/// at least one `TITLE`.
fn filename_has_title(filename: &str) -> bool {
    let template = template_key_for_filename(filename).0;
    let roles = suggested_roles(&template);
    roles.iter().any(|role| role.as_str() == "TITLE")
}
/// Chooses the text to train on for a (possibly path-like) `original` name.
///
/// Returns `(name, trimmed)`. The path is only trimmed when it has at least
/// two non-empty segments and the last one yields a title on its own; in that
/// case the result is the last segment, or `parent/leaf` when the parent is a
/// plain season directory (with at least one more segment above it) whose
/// season numbers do not appear in the leaf. Otherwise `original` is returned
/// unchanged with `false`.
fn training_filename_for(original: &str) -> (String, bool) {
    let parts: Vec<&str> = original
        .split(|ch: char| matches!(ch, '/' | '\\'))
        .map(str::trim)
        .filter(|part| !part.is_empty())
        .collect();

    let (leaf, ancestors) = match parts.split_last() {
        Some((leaf, ancestors)) if !ancestors.is_empty() => (*leaf, ancestors),
        _ => return (original.to_string(), false),
    };
    if !filename_has_title(leaf) {
        return (original.to_string(), false);
    }

    // The parent directory is only consulted when something sits above it.
    let parent = match ancestors.split_last() {
        Some((parent, above)) if !above.is_empty() => *parent,
        _ => return (leaf.to_string(), true),
    };
    if !path_segment_has_season(parent) || !path_segment_is_plain_season(parent) {
        return (leaf.to_string(), true);
    }

    let parent_seasons = path_segment_seasons(parent);
    let leaf_seasons = path_segment_seasons(leaf);
    if parent_seasons.is_disjoint(&leaf_seasons) {
        // The leaf does not repeat the season, so keep the directory for context.
        (format!("{parent}/{leaf}"), true)
    } else {
        (leaf.to_string(), true)
    }
}
/// Tests `PLAIN_SEASON_SEGMENT_RE` against `segment` after removing its
/// wrapper (via `strip_wrapper`) and trimming surrounding whitespace.
fn path_segment_is_plain_season(segment: &str) -> bool {
    let unwrapped = strip_wrapper(segment);
    PLAIN_SEASON_SEGMENT_RE.is_match(unwrapped.trim())
}
/// Reports whether `PATH_SEGMENT_SEASON_RE` (defined elsewhere in this file)
/// matches `value`.
fn path_segment_has_season(value: &str) -> bool {
PATH_SEGMENT_SEASON_RE.is_match(value)
}
/// Collects the season numbers mentioned in `value`.
///
/// Capture group 1 of every match of `SEASON_WORD_NUMBER_RE`,
/// `S_NUMBER_SEGMENT_RE` and `SXE_SEASON_RE` is parsed as `u8`; groups that
/// are absent or do not parse are ignored.
fn path_segment_seasons(value: &str) -> HashSet<u8> {
    let season_words = SEASON_WORD_NUMBER_RE.captures_iter(value);
    let s_numbers = S_NUMBER_SEGMENT_RE.captures_iter(value);
    let sxe_seasons = SXE_SEASON_RE.captures_iter(value);

    season_words
        .chain(s_numbers)
        .chain(sxe_seasons)
        .filter_map(|captures| captures.get(1)?.as_str().parse::<u8>().ok())
        .collect()
}
/// Heuristic check for mis-decoded text.
///
/// Returns `true` when `value` contains U+FFFD, or at least two occurrences
/// of the marker characters below, or one marker occurrence together with at
/// least one character in the range U+FF61..=U+FF9F.
fn has_encoding_noise(value: &str) -> bool {
    const MARKERS: &[&str] = &[
        "譁", "蜈", "螟", "蟄", "謇", "邱", "荳", "縺", "繧", "莨", "鬆", "髯",
    ];

    if value.chars().any(|ch| ch == '\u{fffd}') {
        return true;
    }

    let mut marker_hits = 0usize;
    for marker in MARKERS.iter() {
        marker_hits += value.matches(*marker).count();
    }

    match marker_hits {
        0 => false,
        // A single marker only counts when a U+FF61..=U+FF9F character is present too.
        1 => value
            .chars()
            .any(|ch| ('\u{ff61}'..='\u{ff9f}').contains(&ch)),
        _ => true,
    }
}
/// Detects an `mtv` path component.
///
/// Backslashes are treated as `/`, surrounding whitespace is trimmed and ASCII
/// case is ignored. The value matches when it is exactly `mtv`, starts with
/// `mtv/`, or contains `/mtv/`.
fn has_non_anime_noise(value: &str) -> bool {
    let unified = value.replace('\\', "/");
    let normalized = unified.trim().to_ascii_lowercase();
    match normalized.strip_prefix("mtv") {
        Some(rest) if rest.is_empty() || rest.starts_with('/') => true,
        _ => normalized.contains("/mtv/"),
    }
}
/// Removes every whitespace character from `value` and lowercases its ASCII
/// letters, producing a key for comparing path segments.
fn normalized_path_segment(value: &str) -> String {
    value
        .chars()
        .filter(|ch| !ch.is_whitespace())
        .map(|ch| ch.to_ascii_lowercase())
        .collect()
}
/// Returns `true` when `value` consists only of episode-like groups.
///
/// `SEP` groups are ignored; at least one other group must exist, and each
/// must have a class that starts with `EPISODE` or equals `SPECIAL`.
fn path_segment_is_episodeish(value: &str) -> bool {
    let groups = template_key_for_filename(value).3;
    let mut saw_structural = false;
    for group in &groups {
        let class_name = group.class_name.as_str();
        if class_name == "SEP" {
            continue;
        }
        if !(class_name.starts_with("EPISODE") || class_name == "SPECIAL") {
            return false;
        }
        saw_structural = true;
    }
    saw_structural
}
/// Flags paths of three or more segments whose first and last segments either
/// normalize to the same string or are both episode-like.
///
/// Segments are split on `/` and `\`, trimmed, and empty ones are dropped.
fn has_abstract_path_noise(value: &str) -> bool {
    let parts: Vec<&str> = value
        .split(|ch: char| matches!(ch, '/' | '\\'))
        .map(str::trim)
        .filter(|part| !part.is_empty())
        .collect();

    // The pattern requires at least one segment between the first and the last.
    let (first, last) = match parts.as_slice() {
        [first, _, .., last] => (*first, *last),
        _ => return false,
    };

    if normalized_path_segment(first) == normalized_path_segment(last) {
        return true;
    }
    path_segment_is_episodeish(first) && path_segment_is_episodeish(last)
}
/// Converts a role name into its output label.
///
/// Known roles map to `B-<ENTITY>`: the three episode roles share
/// `B-EPISODE`, and `VOLUME` is folded into `B-SPECIAL`. Any other role
/// yields `O`.
fn role_label(role: &str) -> String {
    let label = match role {
        "GROUP" => "B-GROUP",
        "TITLE" => "B-TITLE",
        "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE" => "B-EPISODE",
        "SEASON" => "B-SEASON",
        "SPECIAL" | "VOLUME" => "B-SPECIAL",
        "RESOLUTION" => "B-RESOLUTION",
        "SOURCE" => "B-SOURCE",
        _ => "O",
    };
    label.to_string()
}
/// Returns `true` when `piece` is empty or contains no word character, i.e.
/// every character is whitespace or non-alphanumeric.
fn is_separator(piece: &str) -> bool {
    !piece
        .chars()
        .any(|ch| ch.is_alphanumeric() && !ch.is_whitespace())
}
/// Buckets a character as `"sep"` (whitespace or non-alphanumeric),
/// `"digit"` (ASCII digit), `"alpha"` (ASCII letter) or `"text"` (any other
/// alphanumeric character).
fn char_kind(ch: char) -> &'static str {
    match ch {
        c if c.is_whitespace() || !c.is_alphanumeric() => "sep",
        '0'..='9' => "digit",
        'a'..='z' | 'A'..='Z' => "alpha",
        _ => "text",
    }
}
/// Splits `token` into fine-grained pieces for labelling.
///
/// Unwrapped tokens that are purely alphanumeric and classify as one of
/// RESOLUTION / MEDIA / LANG / HASH / SXE / EPISODE_VERSION, or that classify
/// as EPISODE and match `SIMPLE_EPISODE_RE`, are returned as a single piece.
/// Everything else is first cut at separator characters and at changes of
/// `char_kind`, and then neighbouring pieces are glued back together when the
/// combined text classifies as a recognised atom.
fn split_refined_token(token: &str) -> Vec<String> {
let whole_class = classify_atom(token);
// Wrapped means the token both starts and ends with a matching bracket pair.
let is_wrapped = {
let chars: Vec<char> = token.chars().collect();
chars.len() >= 2
&& ((chars[0] == '[' && chars[chars.len() - 1] == ']')
|| (chars[0] == '(' && chars[chars.len() - 1] == ')')
|| (chars[0] == '【' && chars[chars.len() - 1] == '】'))
};
// Fast path: an unwrapped, purely alphanumeric atom stays in one piece.
if !is_wrapped
&& matches!(
whole_class.as_str(),
"RESOLUTION" | "MEDIA" | "LANG" | "HASH" | "SXE" | "EPISODE_VERSION"
)
&& token.chars().all(char::is_alphanumeric)
{
return vec![token.to_string()];
}
if !is_wrapped && whole_class == "EPISODE" && SIMPLE_EPISODE_RE.is_match(token) {
return vec![token.to_string()];
}
// Pass 1: every separator character becomes its own piece; runs of
// non-separator characters are additionally cut wherever `char_kind` changes.
let mut pieces = Vec::new();
let mut current = String::new();
let mut current_kind: Option<&str> = None;
for ch in token.chars() {
let kind = char_kind(ch);
if kind == "sep" {
if !current.is_empty() {
pieces.push(std::mem::take(&mut current));
current_kind = None;
}
pieces.push(ch.to_string());
continue;
}
if !current.is_empty() && current_kind != Some(kind) {
pieces.push(std::mem::take(&mut current));
}
current.push(ch);
current_kind = Some(kind);
}
if !current.is_empty() {
pieces.push(current);
}
// Pass 2: walk the pieces and merge neighbours. The three attempts below are
// tried in order at each position; the first that succeeds consumes its pieces.
let mut merged = Vec::new();
let mut index = 0;
while index < pieces.len() {
// Attempt 1: piece + separator + piece, where the middle piece is one of the
// listed joiner strings and the combination classifies as a recognised atom.
if index + 2 < pieces.len()
&& !is_separator(&pieces[index])
&& is_separator(&pieces[index + 1])
&& !is_separator(&pieces[index + 2])
{
let combined = format!(
"{}{}{}",
pieces[index],
pieces[index + 1],
pieces[index + 2]
);
let combined_class = classify_atom(&combined);
if !pieces[index + 1].chars().any(char::is_whitespace)
&& matches!(pieces[index + 1].as_str(), "." | "x" | "X" | "×")
&& matches!(
combined_class.as_str(),
"RESOLUTION" | "MEDIA" | "LANG" | "HASH" | "SXE" | "EPISODE_VERSION"
)
{
merged.push(combined);
index += 3;
continue;
}
}
// Attempt 2: the whole run of adjacent non-separator pieces starting here,
// accepted when it spans more than one piece and is a mergeable class.
if !is_separator(&pieces[index]) {
let mut end = index;
let mut combined = String::new();
while end < pieces.len() && !is_separator(&pieces[end]) {
combined.push_str(&pieces[end]);
end += 1;
}
if end > index + 1 && is_mergeable_refined_class(&classify_atom(&combined)) {
merged.push(combined);
index = end;
continue;
}
}
// Attempt 3: just this piece and the next non-separator piece.
if index + 1 < pieces.len()
&& !is_separator(&pieces[index])
&& !is_separator(&pieces[index + 1])
{
let combined = format!("{}{}", pieces[index], pieces[index + 1]);
if is_mergeable_refined_class(&classify_atom(&combined)) {
merged.push(combined);
index += 2;
continue;
}
}
// Nothing merged: emit the piece unchanged.
merged.push(pieces[index].clone());
index += 1;
}
merged
}
/// Returns `true` for atom classes that justify gluing adjacent refined
/// pieces back together: `RESOLUTION`, `MEDIA`, `LANG`, `HASH`, `SXE`,
/// `EPISODE_VERSION` and `SEASON`.
fn is_mergeable_refined_class(value: &str) -> bool {
    const MERGEABLE: &[&str] = &[
        "RESOLUTION",
        "MEDIA",
        "LANG",
        "HASH",
        "SXE",
        "EPISODE_VERSION",
        "SEASON",
    ];
    MERGEABLE.iter().any(|known| *known == value)
}
/// Picks the output label for one refined `piece`.
///
/// Separator pieces are always `O`. Otherwise the label depends on the role
/// of the enclosing group, on `token_class`, and on how the piece itself
/// classifies:
/// * under an episode role: `B-SEASON`, `B-EPISODE`, `B-SPECIAL` or `O`;
/// * under `SOURCE` or inside a media block: `B-RESOLUTION`, `B-SOURCE`,
///   `B-SPECIAL`, or `O` for hashes and a few filler words;
/// * under `RESOLUTION`: `B-RESOLUTION` for resolution atoms and pure ASCII
///   digit pieces, else `O`;
/// * otherwise whatever `role_label` returns for the role.
fn label_for_refined_piece(piece: &str, role: &str, token_class: &str) -> String {
    if is_separator(piece) {
        return "O".to_string();
    }

    let atom_class = classify_atom(piece);
    let atom = atom_class.as_str();
    let upper = piece.to_ascii_uppercase();
    let all_digits = piece.chars().all(|ch| ch.is_ascii_digit());

    let episode_role = matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE");
    let source_context =
        role == "SOURCE" || matches!(token_class, "BRACKET_MEDIA_BLOCK" | "MEDIA_BLOCK");

    // Arm order mirrors the precedence of the checks: earlier arms win.
    let label = if episode_role {
        match atom {
            "SEASON" => "B-SEASON",
            "EPISODE" | "EPISODE_VERSION" | "SXE" => "B-EPISODE",
            _ if all_digits => "B-EPISODE",
            "SPECIAL" | "VOLUME" => "B-SPECIAL",
            _ if matches!(
                upper.as_str(),
                "OVA" | "OAD" | "SP" | "PV" | "CM" | "OP" | "ED" | "NCOP" | "NCED"
            ) =>
            {
                "B-SPECIAL"
            }
            _ => "O",
        }
    } else if source_context {
        match atom {
            "RESOLUTION" => "B-RESOLUTION",
            "HASH" => "O",
            "MEDIA" | "LANG" => "B-SOURCE",
            "SPECIAL" | "VOLUME" => "B-SPECIAL",
            _ if matches!(
                upper.as_str(),
                "END" | "FIN" | "COMPLETE" | "TV" | "全集" | "全"
            ) =>
            {
                "O"
            }
            _ => "B-SOURCE",
        }
    } else if role == "RESOLUTION" {
        if atom == "RESOLUTION" || all_digits {
            "B-RESOLUTION"
        } else {
            "O"
        }
    } else {
        return role_label(role);
    };
    label.to_string()
}
/// Splits a token matched by `SXE_VALUE_RE` into pieces and labels.
///
/// Returns `None` when the regex does not match. On a match the pieces are
/// `S`, capture 1 (labelled `B-SEASON`), `E`, capture 2 (labelled
/// `B-EPISODE`), and, when capture 3 participated, `v` plus that capture,
/// both labelled `O`.
fn split_sxe_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
    let caps = SXE_VALUE_RE.captures(token)?;

    let mut pairs: Vec<(String, &str)> = vec![
        ("S".to_string(), "O"),
        (caps[1].to_string(), "B-SEASON"),
        ("E".to_string(), "O"),
        (caps[2].to_string(), "B-EPISODE"),
    ];
    if let Some(version) = caps.get(3) {
        pairs.push(("v".to_string(), "O"));
        pairs.push((version.as_str().to_string(), "O"));
    }

    let (pieces, labels): (Vec<String>, Vec<String>) = pairs
        .into_iter()
        .map(|(piece, label)| (piece, label.to_string()))
        .unzip();
    Some((pieces, labels))
}
/// Splits a token matched by `EPISODE_VALUE_RE` into pieces and labels.
///
/// Returns `None` when the regex does not match. On a match, capture 1 is
/// emitted with label `O` and capture 2 with `B-EPISODE`; when capture 3
/// participated, `v` and that capture follow, both labelled `O`.
fn split_episode_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
    let caps = EPISODE_VALUE_RE.captures(token)?;

    let mut pairs: Vec<(String, &str)> = vec![
        (caps[1].to_string(), "O"),
        (caps[2].to_string(), "B-EPISODE"),
    ];
    if let Some(version) = caps.get(3) {
        pairs.push(("v".to_string(), "O"));
        pairs.push((version.as_str().to_string(), "O"));
    }

    let (pieces, labels): (Vec<String>, Vec<String>) = pairs
        .into_iter()
        .map(|(piece, label)| (piece, label.to_string()))
        .unzip();
    Some((pieces, labels))
}
/// Splits a token matched by `SEASON_VALUE_RE` into the literal piece `S`
/// (label `O`) and capture 1 (label `B-SEASON`); `None` when there is no match.
fn split_season_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
    SEASON_VALUE_RE.captures(token).map(|caps| {
        let pieces = vec!["S".to_string(), caps[1].to_string()];
        let labels = vec!["O".to_string(), "B-SEASON".to_string()];
        (pieces, labels)
    })
}
/// Concatenates the tokens covered by `group` (in index order) and passes the
/// result through `strip_wrapper`.
fn group_text(tokens: &[String], group: &Group) -> String {
    let mut joined = String::new();
    for &index in &group.indices {
        joined.push_str(&tokens[index]);
    }
    strip_wrapper(&joined)
}
/// Decides whether `text` is a "special" phrase rather than a real title.
///
/// The text is normalized by replacing every `SPECIAL_SPACE_RE` match with a
/// single space, trimming and ASCII-uppercasing; it is special when that form
/// equals one of the fixed phrases below, or when `SPECIAL_TITLE_PHRASE_RE`
/// matches the original text.
fn is_special_title_phrase(text: &str) -> bool {
    const KNOWN_PHRASES: &[&str] = &[
        "CM",
        "EVENT",
        "EIZOU",
        "LOGO",
        "MENU",
        "OMAKE",
        "PREVIEW",
        "PV",
        "THEATER GREETING EVENT",
        "TOKUTEN",
        "TRAILER",
        "WORLD PREMIERE",
    ];

    let collapsed = SPECIAL_SPACE_RE.replace_all(text, " ");
    let normalized = collapsed.trim().to_ascii_uppercase();
    if KNOWN_PHRASES
        .iter()
        .any(|phrase| *phrase == normalized.as_str())
    {
        return true;
    }
    SPECIAL_TITLE_PHRASE_RE.is_match(text)
}
/// Refines the template-level `roles` using the actual text of each group and
/// the roles of its neighbours.
///
/// Returns a new role vector; `roles` itself is never modified. Most rules
/// test the original `roles` and write into `output`, while a few also read
/// `output`, so results of earlier rules can influence later ones.
///
/// `groups` is indexed with the same indices as `roles`, so both slices must
/// have the same length (`dmhy_record` checks this before calling).
fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]) -> Vec<String> {
let mut output = roles.to_vec();
let ep_markers = ["EP", "E", "Episode", "ep", "episode"];
let roman = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX"];
// Rule A: no TITLE anywhere and the first group is episode-like. Collect the
// first run of TEXT groups after it (SEP groups are skipped; any other group
// ends the run once it has started). A run of two or more becomes TITLE.
if !output.iter().any(|role| role == "TITLE")
&& roles
.first()
.is_some_and(|role| role.starts_with("EPISODE"))
{
let mut title_run = Vec::new();
for index in 1..roles.len() {
if groups[index].class_name == "TEXT" && output[index] == "O" {
title_run.push(index);
continue;
}
if groups[index].class_name == "SEP" {
continue;
}
if !title_run.is_empty() {
break;
}
}
if title_run.len() >= 2 {
let last_title_index = *title_run.last().unwrap();
let later_structural = roles[last_title_index + 1..].iter().any(|role| {
role.starts_with("EPISODE") || matches!(role.as_str(), "SEASON" | "SPECIAL")
});
// An all-digit leading group followed later by an episode/season/special
// role is treated as part of the title rather than as the episode.
if group_text(tokens, &groups[0])
.chars()
.all(|ch| ch.is_ascii_digit())
&& later_structural
{
output[0] = "TITLE".to_string();
}
for index in title_run {
output[index] = "TITLE".to_string();
}
}
}
// Rule B: same promotion of an all-digit leading episode group, but keyed on
// the first TITLE already present in `output`.
if roles
.first()
.is_some_and(|role| role.starts_with("EPISODE"))
&& group_text(tokens, &groups[0])
.chars()
.all(|ch| ch.is_ascii_digit())
{
if let Some(first_title) = output.iter().position(|role| role == "TITLE") {
let later_structural = roles[first_title + 1..].iter().any(|role| {
role.starts_with("EPISODE") || matches!(role.as_str(), "SEASON" | "SPECIAL")
});
if later_structural {
output[0] = "TITLE".to_string();
}
}
}
// Per-group rules, applied left to right. A `continue` means the rule has
// settled this group and the remaining rules are skipped for it.
for index in 0..roles.len() {
let text = group_text(tokens, &groups[index]);
// Groups whose class mentions SXE but that still have role "O" become EPISODE.
if output[index] == "O" && groups[index].class_name.contains("SXE") {
output[index] = "EPISODE".to_string();
}
// Episode-like group whose text matches YEAR_RANGE_RE is demoted to "O".
if roles[index].starts_with("EPISODE") && YEAR_RANGE_RE.is_match(&text) {
output[index] = "O".to_string();
continue;
}
// Rules that look two groups back (word, SEP, this episode-like group).
if roles[index].starts_with("EPISODE") && (2..roles.len()).contains(&index) {
let previous_text = group_text(tokens, &groups[index - 2]);
// SPECIAL within the next three groups (read from `output`).
let next_special = output[index + 1..roles.len().min(index + 4)]
.iter()
.any(|role| role == "SPECIAL");
let next_episode = roles[index + 1..]
.iter()
.any(|role| role.starts_with("EPISODE"));
// "vol"/"volume" + SEP + number.
if groups[index - 1].class_name == "SEP"
&& matches!(
previous_text.to_ascii_lowercase().as_str(),
"vol" | "volume"
)
{
// If the next non-SEP group is TEXT and an episode role follows it, the
// volume word and number are labelled TITLE; otherwise both are SPECIAL.
let next_text_before_episode = (index + 1..roles.len())
.find(|&cursor| groups[cursor].class_name != "SEP")
.is_some_and(|cursor| {
groups[cursor].class_name == "TEXT"
&& roles[cursor + 1..]
.iter()
.any(|role| role.starts_with("EPISODE"))
});
if next_text_before_episode {
output[index - 2] = "TITLE".to_string();
output[index] = "TITLE".to_string();
continue;
}
output[index - 2] = "SPECIAL".to_string();
output[index] = "SPECIAL".to_string();
continue;
}
// Short ASCII-letter title word + SEP + number of at most three digits, with
// a SPECIAL or another episode further on: the number joins the title.
if output[index - 2] == "TITLE"
&& groups[index - 1].class_name == "SEP"
&& previous_text.len() <= 4
&& previous_text.is_ascii()
&& previous_text.chars().all(|ch| ch.is_ascii_alphabetic())
&& text.chars().all(|ch| ch.is_ascii_digit())
&& text.len() <= 3
&& (next_special || next_episode)
{
output[index] = "TITLE".to_string();
continue;
}
}
// All-digit episode-like group after a TITLE, followed by SEP and the word
// "episode": when two or more TEXT groups follow and an episode role comes
// after them, the number and those TEXT groups are all labelled TITLE.
if roles[index].starts_with("EPISODE")
&& index >= 2
&& output[..index].iter().any(|role| role == "TITLE")
&& group_text(tokens, &groups[index])
.chars()
.all(|ch| ch.is_ascii_digit())
{
let next_episode_word = index + 2 < roles.len()
&& groups[index + 1].class_name == "SEP"
&& group_text(tokens, &groups[index + 2]).eq_ignore_ascii_case("episode");
if next_episode_word {
let mut run = Vec::new();
let mut cursor = index + 2;
while cursor < roles.len() {
if groups[cursor].class_name == "SEP" {
cursor += 1;
continue;
}
if groups[cursor].class_name == "TEXT" && !roles[cursor].starts_with("EPISODE")
{
run.push(cursor);
cursor += 1;
continue;
}
break;
}
let later_episode = roles[cursor..]
.iter()
.any(|role| role.starts_with("EPISODE"));
if run.len() >= 2 && later_episode {
output[index] = "TITLE".to_string();
for item in run {
output[item] = "TITLE".to_string();
}
continue;
}
}
}
// TITLE group whose text is a known special phrase becomes SPECIAL.
if roles[index] == "TITLE" && is_special_title_phrase(&text) {
output[index] = "SPECIAL".to_string();
continue;
}
// TITLE group that is only one of these single characters is not a title.
if roles[index] == "TITLE" && matches!(text.as_str(), "第" | "話" | "话" | "回" | "集")
{
output[index] = "O".to_string();
continue;
}
// Unlabelled TEXT group containing a letter, located before some episode
// role, that is not an episode marker word: it extends the nearest preceding
// TITLE as long as no episode-like role sits between the two.
if output[index] == "O"
&& groups[index].class_name == "TEXT"
&& roles[index + 1..].iter().any(|role| role.starts_with("EPISODE"))
&& text.chars().any(|ch| ch.is_alphabetic())
&& !ep_markers.contains(&text.as_str())
{
if let Some(last_title) = output[..index].iter().rposition(|role| role == "TITLE") {
let episode_since_title = output[last_title + 1..index]
.iter()
.any(|role| role.starts_with("EPISODE"));
if !episode_since_title {
output[index] = "TITLE".to_string();
continue;
}
}
}
// "season"/"saison" + SEP + episode-like group: the word becomes "O" and
// the following group is relabelled SEASON.
if roles[index] == "TITLE"
&& matches!(text.to_ascii_lowercase().as_str(), "season" | "saison")
&& index + 2 < roles.len()
&& groups[index + 1].class_name == "SEP"
&& roles[index + 2].starts_with("EPISODE")
{
output[index] = "O".to_string();
output[index + 2] = "SEASON".to_string();
continue;
}
// Upper-case roman numeral I..IX with a TITLE before it and an episode or
// SPECIAL role after it becomes SEASON.
if roles[index] == "TITLE"
&& text == text.to_ascii_uppercase()
&& roman.contains(&text.as_str())
{
let previous_title = output[..index].iter().any(|role| role == "TITLE");
let next_structural = roles[index + 1..]
.iter()
.any(|role| role.starts_with("EPISODE") || role == "SPECIAL");
if previous_title && next_structural {
output[index] = "SEASON".to_string();
continue;
}
}
// number, SEP, episode-marker word, SEP, episode: the first number is
// labelled TITLE and the marker word is labelled "O".
if roles[index].starts_with("EPISODE") && index + 4 < roles.len() {
if groups[index + 1].class_name == "SEP"
&& ep_markers.contains(&group_text(tokens, &groups[index + 2]).as_str())
&& groups[index + 3].class_name == "SEP"
&& roles[index + 4].starts_with("EPISODE")
{
output[index] = "TITLE".to_string();
output[index + 2] = "O".to_string();
}
}
// Episode-like group whose immediate neighbours contain time-of-day words
// (the literals listed below) is demoted to "O".
if roles[index].starts_with("EPISODE") {
let previous_text = if index >= 1 {
group_text(tokens, &groups[index - 1])
} else {
String::new()
};
let next_text = if index + 1 < roles.len() {
group_text(tokens, &groups[index + 1])
} else {
String::new()
};
if previous_text.contains('点')
|| previous_text.contains('點')
|| previous_text.contains("晚上")
|| previous_text.contains("上午")
|| previous_text.contains("下午")
|| next_text.contains('点')
|| next_text.contains('點')
|| next_text.contains('半')
{
output[index] = "O".to_string();
}
}
}
output
}
/// Finds contiguous title spans as half-open `(start, end)` index pairs.
///
/// A span starts at a `TITLE` role and keeps growing over directly following
/// `TITLE` roles (except across two adjacent `BRACKET_TEXT` groups, which
/// start a new span) and over a single `O`-labelled `SEP` group that is
/// immediately followed by another `TITLE`.
fn title_candidates(groups: &[Group], roles: &[String]) -> Vec<(usize, usize)> {
    let total = roles.len();

    // Next position continues the span directly.
    let continues_span = |at: usize| {
        at < total
            && roles[at] == "TITLE"
            && !(groups[at - 1].class_name == "BRACKET_TEXT"
                && groups[at].class_name == "BRACKET_TEXT")
    };
    // Next position is a separator bridging to another title.
    let bridges_separator = |at: usize| {
        at + 1 < total
            && roles[at] == "O"
            && groups[at].class_name == "SEP"
            && roles[at + 1] == "TITLE"
    };

    let mut candidates = Vec::new();
    let mut cursor = 0;
    while cursor < total {
        if roles[cursor] != "TITLE" {
            cursor += 1;
            continue;
        }
        let start = cursor;
        cursor += 1;
        loop {
            if continues_span(cursor) {
                cursor += 1;
            } else if bridges_separator(cursor) {
                cursor += 2;
            } else {
                break;
            }
        }
        candidates.push((start, cursor));
    }
    candidates
}
/// Keeps one title span and demotes all the others.
///
/// With zero or one candidate span the roles are returned unchanged. Otherwise
/// the spans that end at or before the first anchor role (an episode role,
/// `SEASON`, `SPECIAL`, `SOURCE` or `RESOLUTION`) are preferred, falling back
/// to all spans when none qualifies; among those, the span with the greatest
/// end (then greatest length) is selected. `TITLE` roles in every other span
/// become `O`, and their indices are returned as strings in the second value.
fn enforce_single_title_candidate(
    groups: &[Group],
    roles: &[String],
) -> (Vec<String>, Vec<String>) {
    let candidates = title_candidates(groups, roles);
    if candidates.len() < 2 {
        return (roles.to_vec(), Vec::new());
    }

    let first_anchor = roles
        .iter()
        .position(|role| {
            role.starts_with("EPISODE")
                || matches!(
                    role.as_str(),
                    "SEASON" | "SPECIAL" | "SOURCE" | "RESOLUTION"
                )
        })
        .unwrap_or(roles.len());

    let mut pool: Vec<(usize, usize)> = candidates
        .iter()
        .copied()
        .filter(|&(_, end)| end <= first_anchor)
        .collect();
    if pool.is_empty() {
        pool = candidates.clone();
    }
    let selected = pool
        .into_iter()
        .max_by_key(|&(start, end)| (end, end - start))
        .unwrap();

    let mut output = roles.to_vec();
    let mut dropped = Vec::new();
    for (start, end) in candidates.into_iter().filter(|span| *span != selected) {
        for index in start..end {
            if output[index] == "TITLE" {
                output[index] = "O".to_string();
                dropped.push(index.to_string());
            }
        }
    }
    (output, dropped)
}
/// Re-splits each token with `split_generated_token` and spreads its label
/// over the resulting pieces.
///
/// A piece keeps the token's label unless the label is `O` or the piece is a
/// standalone separator, in which case it is labelled `O`. Tokens and labels
/// are paired positionally; extra entries in the longer slice are ignored.
fn normalize_generated_tokens(tokens: &[String], labels: &[String]) -> (Vec<String>, Vec<String>) {
    let mut output_tokens: Vec<String> = Vec::new();
    let mut output_labels: Vec<String> = Vec::new();
    for (token, label) in tokens.iter().zip(labels) {
        let carries_entity = label != "O";
        for piece in split_generated_token(token) {
            let piece_label = if carries_entity && !is_standalone_separator(&piece) {
                label.clone()
            } else {
                "O".to_string()
            };
            output_labels.push(piece_label);
            output_tokens.push(piece);
        }
    }
    (output_tokens, output_labels)
}
/// Splits a title token into pieces and labels each one: `O` for standalone
/// separators, `B-SEASON` for pieces matching `CJK_SEASON_TOKEN_RE`, and
/// `B-TITLE` for everything else.
fn normalize_title_token(token: &str) -> (Vec<String>, Vec<String>) {
    let pieces = split_generated_token(token);
    let mut labels: Vec<String> = Vec::with_capacity(pieces.len());
    for piece in &pieces {
        let label = if is_standalone_separator(piece) {
            "O"
        } else if CJK_SEASON_TOKEN_RE.is_match(piece) {
            "B-SEASON"
        } else {
            "B-TITLE"
        };
        labels.push(label.to_string());
    }
    (pieces, labels)
}
/// Splits `token` so that every whitespace or non-alphanumeric character is a
/// piece of its own and each maximal run of other characters is one piece.
/// Pieces are returned in source order; concatenating them gives `token`.
fn split_generated_token(token: &str) -> Vec<String> {
    let mut pieces = Vec::new();
    // Byte offset where the current run of word characters began, if any.
    let mut run_start: Option<usize> = None;
    for (offset, ch) in token.char_indices() {
        let is_word_char = ch.is_alphanumeric() && !ch.is_whitespace();
        if is_word_char {
            if run_start.is_none() {
                run_start = Some(offset);
            }
            continue;
        }
        if let Some(start) = run_start.take() {
            pieces.push(token[start..offset].to_string());
        }
        pieces.push(ch.to_string());
    }
    if let Some(start) = run_start {
        pieces.push(token[start..].to_string());
    }
    pieces
}
/// Returns `true` when `token` is exactly one character and that character is
/// whitespace or non-alphanumeric.
fn is_standalone_separator(token: &str) -> bool {
    let mut chars = token.chars();
    match (chars.next(), chars.next()) {
        (Some(only), None) => only.is_whitespace() || !only.is_alphanumeric(),
        _ => false,
    }
}
/// Expands group-level roles into the final token/label sequences.
///
/// Every token of every group is broken into output pieces, each paired with
/// a label, and both sequences are returned in source order. How a token is
/// split depends on the role of its group:
/// * structural roles (episode roles, SOURCE, RESOLUTION, SEASON) first try
///   the dedicated season / SxE / episode splitters and otherwise fall back to
///   `split_refined_token` with a per-piece label;
/// * TITLE tokens go through the title-specific handling;
/// * all remaining roles use `role_label` for the whole token.
fn project_refined_tokens(
tokens: &[String],
groups: &[Group],
roles: &[String],
) -> (Vec<String>, Vec<String>) {
let mut output_tokens = Vec::new();
let mut output_labels = Vec::new();
for (group_index, group) in groups.iter().enumerate() {
// A missing role (roles shorter than groups) is treated as "O".
let mut role = roles.get(group_index).map(String::as_str).unwrap_or("O");
// Separator, path and empty groups never carry an entity.
if matches!(group.class_name.as_str(), "SEP" | "PATH" | "EMPTY") {
role = "O";
}
for &index in &group.indices {
let token = &tokens[index];
if matches!(
role,
"EPISODE"
| "EPISODE_VERSION"
| "EPISODE_RANGE"
| "SOURCE"
| "RESOLUTION"
| "SEASON"
) {
// Whole-token splitters: the first one that matches consumes the token.
if role == "SEASON" {
if let Some((pieces, labels)) = split_season_token(token) {
output_tokens.extend(pieces);
output_labels.extend(labels);
continue;
}
}
if matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE") {
// Episode tokens are matched with their bracket wrapper removed.
if let Some((pieces, labels)) = split_sxe_token(&strip_wrapper(token)) {
output_tokens.extend(pieces);
output_labels.extend(labels);
continue;
}
if let Some((pieces, labels)) = split_episode_token(&strip_wrapper(token)) {
output_tokens.extend(pieces);
output_labels.extend(labels);
continue;
}
}
// Fallback: refine the token and label each piece individually.
for piece in split_refined_token(token) {
if matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE") {
// Inside an episode group a piece may itself be a season or episode value.
if let Some((pieces, labels)) = split_season_token(&piece) {
output_tokens.extend(pieces);
output_labels.extend(labels);
continue;
}
if let Some((pieces, labels)) = split_episode_token(&piece) {
output_tokens.extend(pieces);
output_labels.extend(labels);
continue;
}
}
let label = label_for_refined_piece(&piece, role, &group.class_name);
let (pieces, labels) = normalize_generated_tokens(&[piece], &[label]);
output_tokens.extend(pieces);
output_labels.extend(labels);
}
} else {
// A title token that is only one of these single characters is emitted
// unchanged with label "O".
if role == "TITLE" && matches!(token.as_str(), "第" | "話" | "话" | "回" | "集")
{
output_tokens.push(token.clone());
output_labels.push("O".to_string());
continue;
}
// A title token ending in '第': all trailing '第' characters are removed from
// the title part, and a single "第" is emitted after it with label "O".
if role == "TITLE" && token.ends_with('第') && token.chars().count() > 1 {
let trimmed = token.trim_end_matches('第').to_string();
let (pieces, labels) = normalize_generated_tokens(
&[trimmed, "第".to_string()],
&["B-TITLE".to_string(), "O".to_string()],
);
output_tokens.extend(pieces);
output_labels.extend(labels);
continue;
}
if role == "TITLE" {
let (pieces, labels) = normalize_title_token(token);
output_tokens.extend(pieces);
output_labels.extend(labels);
continue;
}
// Every other role: one label for the whole token, re-split into pieces.
let (pieces, labels) =
normalize_generated_tokens(&[token.clone()], &[role_label(role)]);
output_tokens.extend(pieces);
output_labels.extend(labels);
}
}
}
(output_tokens, output_labels)
}
/// Fills punctuation gaps inside title and group spans.
///
/// For every `O`-labelled token that is an entity joiner, the nearest
/// neighbours on each side are located by skipping `O`-labelled joiner
/// tokens. When both neighbours carry the same `B-TITLE` or `B-GROUP` label,
/// the token takes that label. The left neighbour is read from the labels
/// updated so far, the right neighbour from the input labels. In addition, a
/// terminal `!` / `?` directly after a `B-TITLE` token becomes `B-TITLE`.
///
/// Returns the adjusted copy of `labels`.
fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
    let joiners = [
        " ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?",
        "?", ";", ";", ",", ",", "~", "~", "-", "(", ")", "(", ")", "[", "]", "【",
        "】", "「", "」", "「", "」", "☆", "@",
    ];
    let title_terminal_punctuation = ["!", "!", "?", "?"];
    let entity_joiners = [
        " ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?",
        "?", ";", ";", ",", ",", "~", "~", "-", "(", ")", "(", ">", "[", "]", "【",
        "】", "「", "」", "「", "」", "☆", "@", "&", "&",
    ];
    // The literal tables above are matched by value; see the explicit lookup
    // closures below for how each one is used.
    let entity_joiners = {
        // Rebuild from `joiners` so both tables stay in step: the entity table
        // is the joiner table plus "&".
        let mut table: Vec<&str> = joiners.to_vec();
        table.push("&");
        table.push("&");
        let _ = entity_joiners;
        table
    };

    // A position can be skipped while searching for a neighbour when it is an
    // unlabelled joiner in the input.
    let skippable = |at: usize| joiners.contains(&tokens[at].as_str()) && labels[at] == "O";

    let mut output = labels.to_vec();
    let paired = tokens.len().min(labels.len());
    for index in 0..paired {
        let token = tokens[index].as_str();
        if labels[index] != "O" || !entity_joiners.contains(&token) {
            continue;
        }

        let left = (0..index).rev().find(|&at| !skippable(at));
        let right = (index + 1..tokens.len()).find(|&at| !skippable(at));
        if let (Some(left), Some(right)) = (left, right) {
            let right_label = labels[right].as_str();
            let carried = match output[left].as_str() {
                entity @ ("B-TITLE" | "B-GROUP") if entity == right_label => {
                    Some(entity.to_string())
                }
                _ => None,
            };
            if let Some(label) = carried {
                output[index] = label;
            }
        }

        if index > 0
            && title_terminal_punctuation.contains(&token)
            && output[index - 1] == "B-TITLE"
        {
            output[index] = "B-TITLE".to_string();
        }
    }
    output
}
fn dmhy_record(filename: &str, template_id: &str, roles: &[String]) -> Option<Record> {
let (key, tokens, _classes, groups) = template_key_for_filename(filename);
if groups.len() != roles.len() {
return None;
}
let roles = adjust_contextual_roles(&tokens, &groups, roles);
let (roles, dropped) = enforce_single_title_candidate(&groups, &roles);
let (tokens, labels) = project_refined_tokens(&tokens, &groups, &roles);
let labels = smooth_title_spans(&tokens, &labels);
if tokens.len() != labels.len() {
return None;
}
Some(Record {
filename: filename.to_string(),
tokens,
labels,
template_id: template_id.to_string(),
template: key,
source_filename: None,
path_trimmed: None,
dropped_title_candidate_positions: if dropped.is_empty() {
None
} else {
Some(dropped)
},
})
}
#[cfg(test)]
mod tests {
use super::*;
fn labels_for(filename: &str) -> Vec<(String, String)> {
let (key, _, _, _) = template_key_for_filename(filename);
let roles = suggested_roles(&key);
let record = dmhy_record(filename, "tpl_test", &roles).unwrap();
record.tokens.into_iter().zip(record.labels).collect()
}
#[test]
fn required_regressions() {
let title_91 = labels_for("Title 91 EP 01 [1080p]");
assert!(title_91.contains(&("91".to_string(), "B-TITLE".to_string())));
assert!(title_91.contains(&("EP".to_string(), "O".to_string())));
assert!(title_91.contains(&("01".to_string(), "B-EPISODE".to_string())));
let event = labels_for("[HYSUB]Dragon Ball Super Broly[Theater Greeting Event][1080P]");
assert!(event.contains(&("Theater".to_string(), "B-SPECIAL".to_string())));
assert!(!event.contains(&("Theater".to_string(), "B-TITLE".to_string())));
let roman = labels_for("Chibi Maruko-chan I 001");
assert!(roman.contains(&("I".to_string(), "B-SEASON".to_string())));
assert!(roman.contains(&("001".to_string(), "B-EPISODE".to_string())));
let dxd = labels_for("High School D×D");
assert!(dxd.contains(&("×".to_string(), "B-TITLE".to_string())));
let colon_title = labels_for("Megumi no Daigo:Kyuukoku no Orange 06");
assert!(colon_title.contains(&(":".to_string(), "B-TITLE".to_string())));
let sxe = labels_for("S01E02");
assert_eq!(
sxe,
vec![
("S".to_string(), "O".to_string()),
("01".to_string(), "B-SEASON".to_string()),
("E".to_string(), "O".to_string()),
("02".to_string(), "B-EPISODE".to_string())
]
);
let ep_prefix = labels_for("Toradora! EP01 [BD 1080p]");
assert!(ep_prefix.contains(&("EP".to_string(), "O".to_string())));
assert!(ep_prefix.contains(&("01".to_string(), "B-EPISODE".to_string())));
let bracket_sxe = labels_for("[FLsnow.feat.PO][Himitsu_no_Aipri][1080P][S2E01]");
assert!(bracket_sxe.contains(&("2".to_string(), "B-SEASON".to_string())));
assert!(bracket_sxe.contains(&("01".to_string(), "B-EPISODE".to_string())));
let cursed = labels_for("[Coalgirls]_C3-Cube_x_Cursed_x_Curious_01_[8E416230]");
assert!(cursed.contains(&("x".to_string(), "B-TITLE".to_string())));
assert!(!cursed.contains(&("x".to_string(), "B-SEASON".to_string())));
let beyblade = labels_for("[jibaketa]Beyblade X - 118 (WEB 1920x1080 AVC AAC)");
assert!(beyblade.contains(&("X".to_string(), "B-TITLE".to_string())));
assert!(!beyblade.contains(&("X".to_string(), "B-SEASON".to_string())));
let bang_title = labels_for("[Dymy][Gugure! Kokkuri-san][06][BIG5][1280X720]");
assert!(bang_title.contains(&("!".to_string(), "B-TITLE".to_string())));
let pso2 = labels_for("[Lilith-Raws] Phantasy Star Online 2 Episode Oracle - 01 [1080p]");
assert!(pso2.contains(&("2".to_string(), "B-TITLE".to_string())));
assert!(pso2.contains(&("Episode".to_string(), "B-TITLE".to_string())));
assert!(pso2.contains(&("Oracle".to_string(), "B-TITLE".to_string())));
assert!(pso2.contains(&("01".to_string(), "B-EPISODE".to_string())));
let aikatsu = labels_for("Aikatsu Friends! - S2E01 (BD 1920x1080 x264 FLAC)");
assert!(aikatsu.contains(&("!".to_string(), "B-TITLE".to_string())));
let intro = labels_for("[VCB-Studio] LoveLive! µ's Live Collection [01][intro][1080p]");
assert!(intro.contains(&("intro".to_string(), "B-SPECIAL".to_string())));
let hash = labels_for("[Group][Title][01][1080p][00270AC8]");
assert!(hash.contains(&("00270AC8".to_string(), "O".to_string())));
let yamato = labels_for("[1995.01] YAMATO2520 Vol.1 明日への希望-0001");
assert!(yamato.contains(&("YAMATO2520".to_string(), "B-TITLE".to_string())));
assert!(yamato.contains(&("明日への希望".to_string(), "B-TITLE".to_string())));
let ubw = labels_for("Fate/stay night [Unlimited Blade Works] #00 「プロローグ」");
assert!(ubw.contains(&("Unlimited".to_string(), "B-TITLE".to_string())));
assert!(!ubw.contains(&("Unlimited".to_string(), "B-GROUP".to_string())));
let alias_title = labels_for("[Koten_Gars] Tegami Bachi; Letter Bee - 01 [1080p]");
assert!(alias_title.contains(&(";".to_string(), "B-TITLE".to_string())));
let comma_title =
labels_for("[Studio GreenTea] Kamiina Botan, Yoeru Sugata wa Yuri no Hana [01]");
assert!(comma_title.contains(&(",".to_string(), "B-TITLE".to_string())));
let happy_lesson = labels_for("【DVD】 HAPPY☆LESSON THE TV 第01話");
assert!(happy_lesson.contains(&("☆".to_string(), "B-TITLE".to_string())));
let idolmaster = labels_for("[CASO&SumiSora][THE_IDOLM@STER_CINDERELLA_GIRLS][07.5_SP]");
assert!(idolmaster.contains(&("@".to_string(), "B-TITLE".to_string())));
let soul_taker = labels_for("[AI-Raws] THE SOUL TAKER~魂狩~ #01 (HEVC 1312x720)");
assert!(soul_taker.contains(&("~".to_string(), "B-TITLE".to_string())));
let mayoi = labels_for("[Snow-Raws] 迷家[マヨイガ] 第01話");
assert!(mayoi.contains(&("迷家".to_string(), "B-TITLE".to_string())));
assert!(mayoi.contains(&("マヨイガ".to_string(), "B-TITLE".to_string())));
let conan_time = labels_for("【蓝色狂想】名侦探柯南TV版第036集-周一晚上7点半杀人事件");
assert!(conan_time.contains(&("036".to_string(), "B-EPISODE".to_string())));
assert!(!conan_time.contains(&("7".to_string(), "B-EPISODE".to_string())));
let zom =
labels_for("[Nekomoe kissaten&VCB-Studio] Zom 100 [Animatics02][Ma10p_1080p][x265]");
assert!(zom.contains(&("100".to_string(), "B-TITLE".to_string())));
assert!(!zom.contains(&("100".to_string(), "B-EPISODE".to_string())));
assert!(zom.contains(&("Animatics02".to_string(), "B-SPECIAL".to_string())));
let sky = labels_for("[Skytree][海贼王][One_Piece][918][GB_JP][1080P]");
assert!(sky.contains(&("One".to_string(), "B-TITLE".to_string())));
assert!(!sky.contains(&("海贼王".to_string(), "B-TITLE".to_string())));
assert!(sky.contains(&("918".to_string(), "B-EPISODE".to_string())));
}
#[test]
fn updated_python_alignment_regressions() {
    // Regression cases covering three helpers: `training_filename_for`
    // (path trimming), `template_key_for_filename` (template keys) and
    // `labels_for` (per-token labels).
    // NOTE(review): the test name suggests these mirror a Python
    // implementation — confirm against that reference.

    // Evaluates to `true` when `$labels` holds the exact `(token, tag)` pair.
    macro_rules! tagged {
        ($labels:expr, $token:expr, $tag:expr) => {
            $labels.contains(&($token.to_string(), $tag.to_string()))
        };
    }

    // --- Trimming: the batch folder is dropped, "Season 4/..." is kept. ---
    let woody_path = "The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p [Hurtom]/Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p";
    let (woody_leaf, woody_cut) = training_filename_for(woody_path);
    assert!(woody_cut);
    assert_eq!(
        woody_leaf,
        "Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p"
    );

    // --- Trimming: only the file name survives for the Pokemon batch. ---
    let pokemon_path = "Pokémon Season 2 - Orange League [Ep. 83-118]/Pokemon - 084 - OL002 - A Scare in the Air [DVD][PM-Dragon-x264-AC3][00270AC8]";
    let (pokemon_leaf, pokemon_cut) = training_filename_for(pokemon_path);
    assert!(pokemon_cut);
    assert_eq!(
        pokemon_leaf,
        "Pokemon - 084 - OL002 - A Scare in the Air [DVD][PM-Dragon-x264-AC3][00270AC8]"
    );

    // --- Labels of the trimmed Woody name: season/episode/title split. ---
    let woody = labels_for(&woody_leaf);
    assert!(tagged!(woody, "4", "B-SEASON"));
    assert!(tagged!(woody, "E", "O"));
    assert!(tagged!(woody, "07", "B-EPISODE"));
    assert!(tagged!(woody, "The", "B-TITLE"));
    assert!(tagged!(woody, "Show", "B-TITLE"));
    assert!(!tagged!(woody, "1999", "B-EPISODE"));

    // --- Punctuation inside the release-group bracket is tagged as group. ---
    let dash_group = labels_for("[DBD-Raws][Title][01][1080P]");
    assert!(tagged!(dash_group, "-", "B-GROUP"));
    let amp_group = labels_for("[SumiSora&CASO][Title][01][1080P]");
    assert!(tagged!(amp_group, "&", "B-GROUP"));

    // --- CJK season marker is split from the title it shares a bracket with. ---
    let cjk_season =
        labels_for("[DBD-Raws][魔道祖师 第一季][08][1080P][BDRip][HEVC-10bit][FLAC]");
    assert!(tagged!(cjk_season, "魔道祖师", "B-TITLE"));
    assert!(tagged!(cjk_season, "第一季", "B-SEASON"));
    assert!(!tagged!(cjk_season, "第一季", "B-TITLE"));

    // --- Trimming + template key for a fully bracketed leaf. ---
    let (lksub_leaf, lksub_cut) =
        training_filename_for("12/小剧场/[LKSUB][KAGE-JITSU!][01][GB][720P]");
    assert!(lksub_cut);
    assert_eq!(lksub_leaf, "[LKSUB][KAGE-JITSU!][01][GB][720P]");
    let (lksub_key, ..) = template_key_for_filename(&lksub_leaf);
    assert_eq!(
        lksub_key,
        "BRACKET_TEXT BRACKET_TEXT BRACKET_EPISODE BRACKET_LANG BRACKET_RESOLUTION"
    );

    // --- "R-15" is a title, never an episode number. ---
    let r15_extra = labels_for("[Snow-Raws] R-15 CM&PV12 (BD 1920x1080 HEVC-YUV420P10 FLAC)");
    assert!(tagged!(r15_extra, "R", "B-TITLE"));
    assert!(tagged!(r15_extra, "-", "B-TITLE"));
    assert!(tagged!(r15_extra, "15", "B-TITLE"));
    assert!(!tagged!(r15_extra, "15", "B-EPISODE"));

    let r15_episode =
        labels_for("[Snow-Raws] R-15 第01話 (BD 1920x1080 HEVC-YUV420P10 FLAC)");
    assert!(tagged!(r15_episode, "R", "B-TITLE"));
    assert!(tagged!(r15_episode, "-", "B-TITLE"));
    assert!(tagged!(r15_episode, "15", "B-TITLE"));
    assert!(tagged!(r15_episode, "01", "B-EPISODE"));
    assert!(!tagged!(r15_episode, "15", "B-EPISODE"));

    // --- Trimming: parent folder that repeats the series name is removed. ---
    let avatar_path = "Avatar The Last Airbender S2/Avatar The Last Airbender S2 14 [1080p]";
    let (avatar_leaf, avatar_cut) = training_filename_for(avatar_path);
    assert!(avatar_cut);
    assert_eq!(avatar_leaf, "Avatar The Last Airbender S2 14 [1080p]");

    // --- Trimming + template key: both the batch and "Season 1" folders go. ---
    let tintin_path = "Adventures of Tintin (1991) Season 1-3 S01-S03 (1080p BluRay x265 HEVC 10bit EAC3 2.0 Garshasp)/Season 1/Adventures of Tintin (1991) - S01E06 - Cigars of the Pharaoh (Part 1) (1080p BluRay x265 Garshasp)";
    let (tintin_leaf, tintin_cut) = training_filename_for(tintin_path);
    assert!(tintin_cut);
    assert_eq!(
        tintin_leaf,
        "Adventures of Tintin (1991) - S01E06 - Cigars of the Pharaoh (Part 1) (1080p BluRay x265 Garshasp)"
    );
    let (tintin_key, ..) = template_key_for_filename(&tintin_leaf);
    assert_eq!(
        tintin_key,
        "TEXT SEP TEXT SEP TEXT SEP BRACKET_DATE SEP SXE SEP TEXT SEP TEXT SEP TEXT SEP TEXT SEP BRACKET_TEXT SEP BRACKET_TEXT"
    );

    // --- Two nested parent folders; the leaf alone must key and trim the same. ---
    let bocchi_path = "Bocchi the Rock S01 孤獨搖滾!第一季 [Taiwanese Hokkien Dub][臺灣閩南語配音]/Bocchi the Rock S01 孤獨搖滾!第一季 [Taiwanese Hokkien Dub][Hàn-jī Hardsub][臺灣閩南語配音][漢字字幕]/Bocchi the Rock! 孤獨搖滾!S01E01「孤獨反輾轉」";
    let bocchi_expected_leaf = "Bocchi the Rock! 孤獨搖滾!S01E01「孤獨反輾轉」";
    let (bocchi_direct_key, ..) = template_key_for_filename(bocchi_expected_leaf);
    assert_eq!(bocchi_direct_key, "TEXT SEP TEXT SEP TEXT SEP TEXT SXE TEXT");
    assert!(filename_has_title(bocchi_expected_leaf));
    let (bocchi_leaf, bocchi_cut) = training_filename_for(bocchi_path);
    assert!(bocchi_cut);
    assert_eq!(bocchi_leaf, bocchi_expected_leaf);
    let (bocchi_key, ..) = template_key_for_filename(&bocchi_leaf);
    assert_eq!(bocchi_key, "TEXT SEP TEXT SEP TEXT SEP TEXT SXE TEXT");

    // --- Trimming + template key for a short "Usagi S1[01]..." leaf. ---
    let usagi_path = "Gochuumon wa Usagi Desuka-60fps/Gochuumon wa Usagi Desuka S1/Usagi S1[01][60fps][8bit_1080p][x265_flac]";
    let (usagi_leaf, usagi_cut) = training_filename_for(usagi_path);
    assert!(usagi_cut);
    assert_eq!(usagi_leaf, "Usagi S1[01][60fps][8bit_1080p][x265_flac]");
    let (usagi_key, ..) = template_key_for_filename(&usagi_leaf);
    assert_eq!(
        usagi_key,
        "TEXT SEP SEASON BRACKET_EPISODE BRACKET_TEXT BRACKET_MEDIA_BLOCK BRACKET_MEDIA"
    );

    // --- Trimming: a "Season 4/..." tail stays intact under a "Batch/" prefix. ---
    let woody_parent =
        "Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p";
    let (batch_leaf, batch_cut) = training_filename_for(&format!("Batch/{woody_parent}"));
    assert!(batch_cut);
    assert_eq!(batch_leaf, woody_parent);

    // --- "Vol.01 MENU02" is special content, not an episode. ---
    let volume =
        labels_for("[Snow-Raws] 生徒会役員共 Vol.01 MENU02 (BD 1920x1080 HEVC-YUV420P10 FLAC)");
    assert!(tagged!(volume, "生徒会役員共", "B-TITLE"));
    assert!(tagged!(volume, "Vol", "B-SPECIAL"));
    assert!(tagged!(volume, "01", "B-SPECIAL"));
    assert!(tagged!(volume, "MENU02", "B-SPECIAL"));
    assert!(!tagged!(volume, "01", "B-EPISODE"));

    // --- Dot-separated name: leading number is title, "H.264" is source. ---
    let dotted =
        labels_for("3000.Leagues.in.Search.of.Mother.S01E01.1080p.WEB-DL.H.264-D00oo00M");
    assert!(tagged!(dotted, "3000", "B-TITLE"));
    assert!(tagged!(dotted, "01", "B-SEASON"));
    assert!(tagged!(dotted, "01", "B-EPISODE"));
    assert!(tagged!(dotted, "1080p", "B-RESOLUTION"));
    assert!(tagged!(dotted, "H", "B-SOURCE"));
    assert!(tagged!(dotted, "264", "B-SOURCE"));
    assert!(!tagged!(dotted, "264", "B-EPISODE"));

    // --- Mixed media bracket: resolution halves and source tokens. ---
    let media_block =
        labels_for("[Kamigami] Kantai Collection - 06v2 [1920×1080 x264 AAC Sub(Chs,Cht,Jap)]");
    assert!(tagged!(media_block, "1920", "B-RESOLUTION"));
    assert!(tagged!(media_block, "1080", "B-RESOLUTION"));
    assert!(tagged!(media_block, "x264", "B-SOURCE"));
    assert!(tagged!(media_block, "Chs", "B-SOURCE"));
}
}