Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
```

- Notebooks
- Google Colab
- Kaggle
| use anyhow::{bail, Context, Result}; | |
| use chrono::Utc; | |
| use clap::Parser; | |
| use once_cell::sync::Lazy; | |
| use rayon::prelude::*; | |
| use regex::Regex; | |
| use serde::{Deserialize, Serialize}; | |
| use serde_json::{json, Value}; | |
| use std::collections::{HashMap, HashSet}; | |
| use std::fs::{self, File}; | |
| use std::io::{BufRead, BufReader, BufWriter, Write}; | |
| use std::path::PathBuf; | |
| use std::sync::atomic::{AtomicUsize, Ordering}; | |
/// Command-line options for the tool; `main` obtains them via `Args::parse()`.
/// NOTE(review): the derive/attribute lines (clap flags, defaults) are not
/// visible in this view, so flag names and default values are not documented here.
| struct Args { | |
/// When set, `main` runs `run_cluster` and returns.
| cluster: bool, | |
/// When set (and `cluster` is not), `main` runs `run_low_frequency_audit`.
| audit_low_frequency: bool, | |
/// When set (and neither mode above is), `main` runs `run_verify_generated_output`.
| verify_generated_output: bool, | |
/// JSONL input path. Read by `load_input` (rows with a string `value` field),
/// or parsed as generated `Record` rows in verify mode.
| input: PathBuf, | |
/// Recipe JSONL path read by `load_recipes`.
| recipes: PathBuf, | |
/// JSONL file that `main` writes accepted records to.
| output: PathBuf, | |
/// JSON manifest file written by `main`.
| manifest_output: PathBuf, | |
/// JSON summary file written by `run_cluster`.
| summary_output: PathBuf, | |
/// JSONL file receiving the first `top` cluster rows (`run_cluster`).
| samples_output: PathBuf, | |
/// JSONL file receiving every cluster row (`run_cluster`).
| clusters_output: PathBuf, | |
/// JSONL file receiving high-confidence recipe rows (`run_cluster`).
| recipes_output: PathBuf, | |
/// JSONL file receiving candidate rows that were not high confidence (`run_cluster`).
| review_output: PathBuf, | |
/// JSONL file receiving audit rows (`run_low_frequency_audit`).
| audit_output: PathBuf, | |
/// Recipes whose `count` is at most this value are treated as low frequency.
| audit_max_count: u64, | |
/// Optional cap on the number of input values loaded by `load_input`.
| limit: Option<usize>, | |
/// Optional cap on the number of recipes kept by `load_recipes`.
| limit_templates: Option<usize>, | |
/// Minimum recipe `count` in `load_recipes`; also the "repeated template"
/// threshold in the cluster summary.
| min_count: u64, | |
/// Number of leading cluster rows written to `samples_output`.
| top: usize, | |
/// Number of leading cluster rows considered as recipe candidates.
| recipe_top: usize, | |
/// Maximum number of rows written to `review_output`.
| review_top: usize, | |
/// Maximum number of example filenames stored per cluster.
| examples: usize, | |
/// Minimum cluster count for `is_high_confidence_recipe`.
| recipe_min_count: usize, | |
/// If non-empty, `load_recipes` keeps only recipes with exactly this confidence.
| confidence: String, | |
/// Expansion mode; `main` accepts only "all" or "sample".
| expand: String, | |
/// Per-template cap applied by `process_filename` when `expand` is "sample".
| sample_per_template: usize, | |
/// When false, inputs flagged by the noise checks are skipped.
| keep_encoding_noise: bool, | |
/// In cluster mode, when true the input value is used without parent-path trimming.
| preserve_parent_paths: bool, | |
/// Optional rayon global thread-pool size configured in `main`.
| threads: Option<usize>, | |
| } | |
/// One row of the recipe JSONL file, deserialized in `load_recipes`.
/// NOTE(review): derive/serde attributes are not visible in this view.
| struct Recipe { | |
/// Identifier copied into generated records and used to key sample counters.
| template_id: String, | |
/// Template key; `load_recipes` uses it as the map key that is looked up with
/// the key returned by `template_key_for_filename`.
| template: String, | |
/// Role list; callers require its length to equal the number of groups of the
/// filename before building a record.
| roles: Vec<String>, | |
/// Optional confidence tag compared against `Args::confidence`.
| confidence: Option<String>, | |
/// Optional occurrence count; a missing value is treated as 0 by callers.
| count: Option<u64>, | |
| } | |
/// A labeled filename: serialized as one JSONL row by `main` and parsed back
/// by `run_verify_generated_output`.
/// NOTE(review): derive/serde attributes are not visible in this view.
| struct Record { | |
/// The filename the tokens and labels describe.
| filename: String, | |
/// Tokens, paired position-by-position with `labels`.
| tokens: Vec<String>, | |
/// Per-token labels; `entity_spans` treats a `B-`/`I-` prefix as an entity
/// tag and anything else as "O".
| labels: Vec<String>, | |
/// `template_id` of the recipe that produced the record.
| template_id: String, | |
| template: String, | |
/// Set to the original input value when the parent path was trimmed.
| source_filename: Option<String>, | |
/// Set to `Some(true)` when the parent path was trimmed.
| path_trimmed: Option<bool>, | |
/// Not read or written by the code visible in this view.
| dropped_title_candidate_positions: Option<Vec<String>>, | |
| } | |
/// One group of a tokenized filename, as returned by `template_key_for_filename`.
| struct Group { | |
/// Presumably indices into the token list — TODO confirm against `group_text`.
| indices: Vec<usize>, | |
/// Class name of the group; "TEXT" and "BRACKET_TEXT" are treated as literal
/// text by `add_cluster`.
| class_name: String, | |
| } | |
/// Counters accumulated by `main` and embedded in the manifest under "stats".
/// NOTE(review): derives (`Default`, serialization) are not visible in this view.
| struct Stats { | |
/// Number of input values loaded.
| seen: usize, | |
/// Inputs skipped with reason "encoding_noise".
| skipped_encoding_noise: usize, | |
/// Inputs (written or skipped) whose parent path was trimmed.
| trimmed_parent_path: usize, | |
/// Inputs skipped with reason "no_recipe".
| skipped_no_recipe: usize, | |
/// Inputs skipped with reason "sample_cap".
| skipped_sample_cap: usize, | |
/// Inputs skipped with reason "role_mismatch".
| skipped_role_mismatch: usize, | |
/// Inputs skipped with reason "low_frequency_audit_warning".
| skipped_low_frequency_audit_warning: usize, | |
/// Records written to the output file.
| written: usize, | |
| } | |
/// Aggregate for all filenames sharing one template key; filled by `add_cluster`.
/// NOTE(review): the `Default` derive is not visible in this view.
| struct Cluster { | |
/// Number of filenames added to this cluster.
| count: usize, | |
/// Up to `example_limit` filenames, in insertion order.
| examples: Vec<String>, | |
/// Occurrences of each unwrapped TEXT / BRACKET_TEXT token.
| literal_counts: HashMap<String, usize>, | |
/// Occurrences of each token class name.
| class_counts: HashMap<String, usize>, | |
/// Per group position: occurrences of the group's text for TEXT / BRACKET_TEXT groups.
| position_literals: Vec<HashMap<String, usize>>, | |
| } | |
/// Outcome of `process_filename` for a single input value.
| enum Processed { | |
/// The input produced a record to be written.
| Written { | |
| record: Record, | |
/// Whether the parent path was trimmed from the input.
| trimmed_parent: bool, | |
| }, | |
/// The input was dropped.
| Skipped { | |
/// Reason tag; `main` matches it against a fixed set of strings to pick
/// the `Stats` counter, and ignores unknown tags.
| reason: &'static str, | |
/// Whether the parent path was trimmed before the skip decision.
| trimmed_parent: bool, | |
| }, | |
| } | |
// Lazily compiled regular expressions. Each is built on first use; `unwrap`
// panics if a pattern fails to compile. Most are consumed by code outside this view.

// Whole token is 8 or more hex digits; used by `audit_warnings` to spot hash tokens.
| static HASH_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[A-Fa-f0-9]{8,}$").unwrap()); | |
// Resolution tokens: 3-4 digits + "p", one digit + "K", or WIDTHxHEIGHT.
| static RESOLUTION_RE: Lazy<Regex> = | |
| Lazy::new(|| Regex::new(r"(?i)^(?:\d{3,4}p|\dK|\d{3,4}[xX×]\d{3,4})$").unwrap()); | |
// Episode number followed by a version marker (v/ver/version/rev + digits).
| static EPISODE_VERSION_RE: Lazy<Regex> = | |
| Lazy::new(|| Regex::new(r"(?i)^(?:EP?)?\d{1,4}(?:v|ver|version|rev)\d{1,3}$").unwrap()); | |
// Episode number with optional E/EP/# prefix and optional END suffix.
| static EPISODE_RE: Lazy<Regex> = | |
| Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}(?:END)?$").unwrap()); | |
// CJK episode form: optional 第, digits, then one of 话/話/回/集.
| static EPISODE_CJK_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^第?\d{1,4}[话話回集]$").unwrap()); | |
// Numeric range "N-M" or "N~M" with optional END suffix.
| static EPISODE_RANGE_RE: Lazy<Regex> = | |
| Lazy::new(|| Regex::new(r"(?i)^\d{1,4}\s*[-~]\s*\d{1,4}(?:\s*END)?$").unwrap()); | |
// Numeric range followed by batch keywords/separators and up to 16 trailing characters.
| static EPISODE_BATCH_RE: Lazy<Regex> = Lazy::new(|| { | |
| Regex::new(r"(?i)^\d{1,4}\s*[-~]\s*\d{1,4}(?:\s*(?:TV|全集|全|END|Fin|Complete|SP|OVA|OAD|NCOP|NCED)|[+_./-])*.{0,16}$").unwrap() | |
| }); | |
// SxxEyy with optional version suffix (match only).
| static SXE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S\d{1,2}E\d{1,4}(?:v\d+)?$").unwrap()); | |
// Same shape as SXE_RE but capturing season, episode and version digits.
| static SXE_VALUE_RE: Lazy<Regex> = | |
| Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})E(\d{1,4})(?:v(\d+))?$").unwrap()); | |
// Prefixed episode (EP/E/#) capturing prefix, number and optional version.
| static EPISODE_VALUE_RE: Lazy<Regex> = | |
| Lazy::new(|| Regex::new(r"(?i)^(EP|E|#)(\d{1,4})(?:v(\d+))?$").unwrap()); | |
// Season tokens: Sxx, "Season N", or 第…季/期/部.
| static SEASON_RE: Lazy<Regex> = Lazy::new(|| { | |
| Regex::new(r"(?i)^(?:S\d{1,2}|Season\s*\d{1,2}|第[一二三四五六七八九十\d]+[季期部])$").unwrap() | |
| }); | |
// Only the CJK season form 第…季/期/部.
| static CJK_SEASON_TOKEN_RE: Lazy<Regex> = | |
| Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap()); | |
// Sxx capturing the season digits.
| static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap()); | |
// Special-content keywords (NCOP, OVA, PV, ...) with an optional short suffix.
| static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| { | |
| Regex::new(r"(?i)^(?:NCOP|NCED|OP|ED|PV|CM|SP|OVA|OAD|IV|Menu|Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?$").unwrap() | |
| }); | |
// Volume/disc keyword followed by 1-3 digits.
| static VOLUME_RE: Lazy<Regex> = | |
| Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap()); | |
// Year starting with 19/20, optionally followed by up to two 1-2 digit parts.
| static DATE_RE: Lazy<Regex> = | |
| Lazy::new(|| Regex::new(r"^(?:19|20)\d{2}(?:[._-]\d{1,2}){0,2}$").unwrap()); | |
// Language / subtitle tags.
| static LANG_RE: Lazy<Regex> = Lazy::new(|| { | |
| Regex::new(r"(?i)^(?:CHS|CHT|ZHS|ZHT|GB|BIG5|JPN?|JP|JA|JAP|ENG|EN|SC|TC|简[体體]?|繁[体體]?|简日|繁日|字幕|内封|外挂|Sub|Subs|MSubs?)$").unwrap() | |
| }); | |
// Source, codec, audio, bit-depth and container tags.
| static MEDIA_RE: Lazy<Regex> = Lazy::new(|| { | |
| Regex::new(r"(?i)^(?:WEB[-_. ]?DL|WEB[-_. ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|HDTV|TVRip|REMUX|x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|FLAC|MP3|DTS|DTS-HDMA|AC3|Opus|10[-_. ]?bit|8[-_. ]?bit|Hi10p|Ma10p|ASSx?\d*|SRTx?\d*|R\d[A-Z]*|NoSub|MKV|MP4|AVI|RAW|Raws?)$").unwrap() | |
| }); | |
// Unanchored phrases; the "eie" alternative also accepts the spelling "premeie".
| static SPECIAL_TITLE_PHRASE_RE: Lazy<Regex> = Lazy::new(|| { | |
| Regex::new(r"(?i)\b(?:theater\s+greeting\s+event|world\s+prem(?:eie|iere)|picture\s+drama)\b") | |
| .unwrap() | |
| }); | |
// Year range such as "2001-2003", optionally parenthesized.
| static YEAR_RANGE_RE: Lazy<Regex> = | |
| Lazy::new(|| Regex::new(r"^\(?\s*(?:19|20)\d{2}\s*[-~]\s*(?:19|20)\d{2}\s*\)?$").unwrap()); | |
// Season marker delimited by start/end or a separator, anywhere in a string.
| static PATH_SEGMENT_SEASON_RE: Lazy<Regex> = Lazy::new(|| { | |
| Regex::new(r"(?i)(?:^|[\s_.\-/])(?:season\s*\d{1,2}|s\d{1,2})(?:$|[\s_.\-/])").unwrap() | |
| }); | |
// "season"/"saison" + number, capturing the number without one leading zero.
| static SEASON_WORD_NUMBER_RE: Lazy<Regex> = | |
| Lazy::new(|| Regex::new(r"(?i)(?:season|saison)\s*0?(\d{1,2})").unwrap()); | |
// Whole string is just a season marker.
| static PLAIN_SEASON_SEGMENT_RE: Lazy<Regex> = | |
| Lazy::new(|| Regex::new(r"(?i)^(?:season|saison)\s*0?\d{1,2}$|^s0?\d{1,2}$").unwrap()); | |
// "sNN" not adjacent to letters or digits, capturing the number.
| static S_NUMBER_SEGMENT_RE: Lazy<Regex> = | |
| Lazy::new(|| Regex::new(r"(?i)(?:^|[^\p{L}\p{N}])s0?(\d{1,2})(?:$|[^\p{L}\p{N}])").unwrap()); | |
// "sNNeMM" not adjacent to letters or digits, capturing the season number.
| static SXE_SEASON_RE: Lazy<Regex> = Lazy::new(|| { | |
| Regex::new(r"(?i)(?:^|[^\p{L}\p{N}])s0?(\d{1,2})e\d{1,4}(?:$|[^\p{L}\p{N}])").unwrap() | |
| }); | |
// Start-anchored tokenizer patterns. `next_token` tries them in this order and
// takes the first non-empty match, so the order is significant.
| static TOKEN_REGEXES: Lazy<Vec<Regex>> = Lazy::new(|| { | |
| [ | |
| r"^\d{3,4}[xX×]\d{3,4}", | |
| r"(?i)^h\.?26[45]", | |
| r"(?i)^x\.?26[45]", | |
| r"^[\\/]+", | |
| r"^[-_.::+&|]+", | |
| r"^\s+", | |
| r"(?i)^Season\s*\d{1,2}", | |
| r"^[A-Za-z]+(?:\d+[A-Za-z]*)*", | |
| r"^\d+[A-Za-z]+\d*", | |
| r"^\d{1,4}(?:[._-]\d{1,4})*", | |
| r"^[\p{Hiragana}\p{Katakana}\p{Han}]+", | |
| ] | |
| .into_iter() | |
| .map(|pattern| Regex::new(pattern).unwrap()) | |
| .collect() | |
| }); | |
// Episode number with optional E/EP/# prefix and no END suffix.
| static SIMPLE_EPISODE_RE: Lazy<Regex> = | |
| Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}$").unwrap()); | |
// One or more whitespace, underscore, dot or hyphen characters.
| static SPECIAL_SPACE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[\s_.-]+").unwrap()); | |
| fn main() -> Result<()> { | |
| let args = Args::parse(); | |
| if let Some(threads) = args.threads { | |
| rayon::ThreadPoolBuilder::new() | |
| .num_threads(threads) | |
| .build_global() | |
| .context("failed to configure rayon thread pool")?; | |
| } | |
| if args.cluster { | |
| return run_cluster(&args); | |
| } | |
| if args.audit_low_frequency { | |
| return run_low_frequency_audit(&args); | |
| } | |
| if args.verify_generated_output { | |
| return run_verify_generated_output(&args); | |
| } | |
| if args.expand != "all" && args.expand != "sample" { | |
| bail!("--expand must be all or sample"); | |
| } | |
| let recipes = load_recipes(&args)?; | |
| if recipes.is_empty() { | |
| bail!("no recipes selected; adjust --recipes/--confidence/--min-count/--limit-templates"); | |
| } | |
| let inputs = load_input(&args.input, args.limit)?; | |
| let sample_counters: HashMap<String, AtomicUsize> = recipes | |
| .values() | |
| .map(|recipe| (recipe.template_id.clone(), AtomicUsize::new(0))) | |
| .collect(); | |
| let processed: Vec<Processed> = inputs | |
| .par_iter() | |
| .map(|filename| process_filename(filename, &args, &recipes, &sample_counters)) | |
| .collect(); | |
| if let Some(parent) = args.output.parent() { | |
| fs::create_dir_all(parent)?; | |
| } | |
| if let Some(parent) = args.manifest_output.parent() { | |
| fs::create_dir_all(parent)?; | |
| } | |
| let mut stats = Stats { | |
| seen: inputs.len(), | |
| ..Stats::default() | |
| }; | |
| let mut label_counts: HashMap<String, usize> = HashMap::new(); | |
| let mut template_counts: HashMap<String, usize> = HashMap::new(); | |
| let mut examples = Vec::new(); | |
| let mut writer = BufWriter::new(File::create(&args.output)?); | |
| for item in processed { | |
| match item { | |
| Processed::Written { | |
| record, | |
| trimmed_parent, | |
| } => { | |
| if trimmed_parent { | |
| stats.trimmed_parent_path += 1; | |
| } | |
| for label in &record.labels { | |
| *label_counts.entry(label.clone()).or_default() += 1; | |
| } | |
| *template_counts | |
| .entry(record.template_id.clone()) | |
| .or_default() += 1; | |
| if examples.len() < 20 { | |
| examples.push(serde_json::to_value(&record)?); | |
| } | |
| serde_json::to_writer(&mut writer, &record)?; | |
| writer.write_all(b"\n")?; | |
| stats.written += 1; | |
| } | |
| Processed::Skipped { | |
| reason, | |
| trimmed_parent, | |
| } => { | |
| if trimmed_parent { | |
| stats.trimmed_parent_path += 1; | |
| } | |
| match reason { | |
| "encoding_noise" => stats.skipped_encoding_noise += 1, | |
| "no_recipe" => stats.skipped_no_recipe += 1, | |
| "sample_cap" => stats.skipped_sample_cap += 1, | |
| "role_mismatch" => stats.skipped_role_mismatch += 1, | |
| "low_frequency_audit_warning" => { | |
| stats.skipped_low_frequency_audit_warning += 1 | |
| } | |
| _ => {} | |
| } | |
| } | |
| } | |
| } | |
| writer.flush()?; | |
| let mut top_template_counts: Vec<_> = template_counts.into_iter().collect(); | |
| top_template_counts.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0))); | |
| top_template_counts.truncate(20); | |
| let manifest = json!({ | |
| "generated_at": Utc::now().to_rfc3339(), | |
| "input": args.input.to_string_lossy(), | |
| "recipes": args.recipes.to_string_lossy(), | |
| "output": args.output.to_string_lossy(), | |
| "selected_templates": recipes.len(), | |
| "confidence": args.confidence, | |
| "min_count": args.min_count, | |
| "low_frequency_audit_max_count": args.audit_max_count, | |
| "low_frequency_blocking_warnings": [ | |
| "hash_labeled", | |
| "multiple_title_spans", | |
| "no_title", | |
| "path_retained" | |
| ], | |
| "expand": args.expand, | |
| "sample_per_template": if args.expand == "sample" { Some(args.sample_per_template) } else { None }, | |
| "stats": stats, | |
| "label_counts": label_counts, | |
| "top_template_counts": top_template_counts, | |
| "examples": examples, | |
| "implementation": "rust_dmhy_template_apply" | |
| }); | |
| fs::write( | |
| &args.manifest_output, | |
| serde_json::to_string_pretty(&manifest)?, | |
| )?; | |
| println!("{}", serde_json::to_string_pretty(&manifest)?); | |
| Ok(()) | |
| } | |
| fn load_recipes(args: &Args) -> Result<HashMap<String, Recipe>> { | |
| let file = File::open(&args.recipes) | |
| .with_context(|| format!("recipe JSONL not found: {}", args.recipes.display()))?; | |
| let mut recipes = HashMap::new(); | |
| for (line_number, line) in BufReader::new(file).lines().enumerate() { | |
| let line = line?; | |
| if line.trim().is_empty() { | |
| continue; | |
| } | |
| let row: Recipe = serde_json::from_str(&line).with_context(|| { | |
| format!( | |
| "invalid recipe JSON at {}:{}", | |
| args.recipes.display(), | |
| line_number + 1 | |
| ) | |
| })?; | |
| if !args.confidence.is_empty() | |
| && row.confidence.as_deref() != Some(args.confidence.as_str()) | |
| { | |
| continue; | |
| } | |
| if row.count.unwrap_or(0) < args.min_count { | |
| continue; | |
| } | |
| recipes.insert(row.template.clone(), row); | |
| if args | |
| .limit_templates | |
| .is_some_and(|limit| recipes.len() >= limit) | |
| { | |
| break; | |
| } | |
| } | |
| Ok(recipes) | |
| } | |
| fn load_input(path: &PathBuf, limit: Option<usize>) -> Result<Vec<String>> { | |
| let file = | |
| File::open(path).with_context(|| format!("input JSONL not found: {}", path.display()))?; | |
| let mut values = Vec::new(); | |
| for (line_number, line) in BufReader::new(file).lines().enumerate() { | |
| if limit.is_some_and(|limit| values.len() >= limit) { | |
| break; | |
| } | |
| let line = line?; | |
| if line.trim().is_empty() { | |
| continue; | |
| } | |
| let row: Value = serde_json::from_str(&line) | |
| .with_context(|| format!("invalid JSON at {}:{}", path.display(), line_number + 1))?; | |
| if let Some(value) = row.get("value").and_then(Value::as_str) { | |
| let value = value.trim(); | |
| if !value.is_empty() { | |
| values.push(value.to_string()); | |
| } | |
| } | |
| } | |
| Ok(values) | |
| } | |
| fn run_cluster(args: &Args) -> Result<()> { | |
| let inputs = load_input(&args.input, args.limit)?; | |
| let source_rows = inputs.len(); | |
| let mut clusters: HashMap<String, Cluster> = HashMap::new(); | |
| let mut skipped_encoding_noise = 0usize; | |
| let mut trimmed_parent_path = 0usize; | |
| let mut total_rows = 0usize; | |
| for original in inputs { | |
| if !args.keep_encoding_noise | |
| && (has_encoding_noise(&original) | |
| || has_non_anime_noise(&original) | |
| || has_abstract_path_noise(&original)) | |
| { | |
| skipped_encoding_noise += 1; | |
| continue; | |
| } | |
| let filename = if args.preserve_parent_paths { | |
| original | |
| } else { | |
| let (training_filename, was_trimmed) = training_filename_for(&original); | |
| if was_trimmed { | |
| trimmed_parent_path += 1; | |
| } | |
| training_filename | |
| }; | |
| add_cluster(&mut clusters, &filename, args.examples); | |
| total_rows += 1; | |
| } | |
| let mut sorted_clusters: Vec<_> = clusters.into_iter().collect(); | |
| sorted_clusters.sort_by(|a, b| b.1.count.cmp(&a.1.count).then_with(|| a.0.cmp(&b.0))); | |
| let cluster_rows: Vec<Value> = sorted_clusters | |
| .iter() | |
| .enumerate() | |
| .map(|(index, (key, cluster))| cluster_row(index + 1, key, cluster, total_rows)) | |
| .collect(); | |
| let samples: Vec<Value> = cluster_rows.iter().take(args.top).cloned().collect(); | |
| let recipe_candidates: Vec<Value> = | |
| cluster_rows.iter().take(args.recipe_top).cloned().collect(); | |
| let recipes: Vec<Value> = recipe_candidates | |
| .iter() | |
| .filter(|row| is_high_confidence_recipe(row, args.recipe_min_count)) | |
| .map(|row| recipe_row(row, "high")) | |
| .collect(); | |
| let review: Vec<Value> = recipe_candidates | |
| .iter() | |
| .filter(|row| !is_high_confidence_recipe(row, args.recipe_min_count)) | |
| .take(args.review_top) | |
| .cloned() | |
| .collect(); | |
| write_jsonl_values(&args.clusters_output, &cluster_rows)?; | |
| write_jsonl_values(&args.samples_output, &samples)?; | |
| write_jsonl_values(&args.recipes_output, &recipes)?; | |
| write_jsonl_values(&args.review_output, &review)?; | |
| let mut histogram: HashMap<usize, usize> = HashMap::new(); | |
| for (_, cluster) in &sorted_clusters { | |
| *histogram.entry(cluster.count).or_default() += 1; | |
| } | |
| let mut count_histogram_top: Vec<_> = histogram.into_iter().collect(); | |
| count_histogram_top.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0))); | |
| count_histogram_top.truncate(20); | |
| let rows_covered_by_repeated_templates: usize = sorted_clusters | |
| .iter() | |
| .map(|(_, cluster)| cluster) | |
| .filter(|cluster| cluster.count as u64 >= args.min_count) | |
| .map(|cluster| cluster.count) | |
| .sum(); | |
| let templates_at_least_min_count = sorted_clusters | |
| .iter() | |
| .filter(|(_, cluster)| cluster.count as u64 >= args.min_count) | |
| .count(); | |
| let top_templates: Vec<Value> = cluster_rows.iter().take(20).cloned().collect(); | |
| let summary = json!({ | |
| "input": args.input.to_string_lossy(), | |
| "source_rows": source_rows, | |
| "skipped_encoding_noise": skipped_encoding_noise, | |
| "trimmed_parent_path": trimmed_parent_path, | |
| "total_rows": total_rows, | |
| "unique_templates": sorted_clusters.len(), | |
| "min_count": args.min_count, | |
| "templates_at_least_min_count": templates_at_least_min_count, | |
| "rows_covered_by_repeated_templates": rows_covered_by_repeated_templates, | |
| "rows_covered_by_repeated_templates_ratio": if total_rows == 0 { 0.0 } else { rows_covered_by_repeated_templates as f64 / total_rows as f64 }, | |
| "top_output_rows": samples.len(), | |
| "clusters_output": args.clusters_output.to_string_lossy(), | |
| "cluster_rows": cluster_rows.len(), | |
| "recipes_output": args.recipes_output.to_string_lossy(), | |
| "recipe_rows": recipes.len(), | |
| "review_output": args.review_output.to_string_lossy(), | |
| "review_rows": review.len(), | |
| "recipe_top": args.recipe_top, | |
| "recipe_min_count": args.recipe_min_count, | |
| "top_templates": top_templates, | |
| "count_histogram_top": count_histogram_top, | |
| "implementation": "rust_dmhy_template_cluster", | |
| "generated_at": Utc::now().to_rfc3339(), | |
| }); | |
| if let Some(parent) = args.summary_output.parent() { | |
| fs::create_dir_all(parent)?; | |
| } | |
| fs::write( | |
| &args.summary_output, | |
| serde_json::to_string_pretty(&summary)?, | |
| )?; | |
| println!("{}", serde_json::to_string_pretty(&summary)?); | |
| Ok(()) | |
| } | |
| fn add_cluster(clusters: &mut HashMap<String, Cluster>, filename: &str, example_limit: usize) { | |
| let (key, tokens, classes, groups) = template_key_for_filename(filename); | |
| let cluster = clusters.entry(key).or_default(); | |
| cluster.count += 1; | |
| if cluster.examples.len() < example_limit { | |
| cluster.examples.push(filename.to_string()); | |
| } | |
| for (token, class_name) in tokens.iter().zip(classes.iter()) { | |
| *cluster.class_counts.entry(class_name.clone()).or_default() += 1; | |
| if matches!(class_name.as_str(), "TEXT" | "BRACKET_TEXT") { | |
| let cleaned = strip_wrapper(token); | |
| if !cleaned.is_empty() { | |
| *cluster.literal_counts.entry(cleaned).or_default() += 1; | |
| } | |
| } | |
| } | |
| while cluster.position_literals.len() < groups.len() { | |
| cluster.position_literals.push(HashMap::new()); | |
| } | |
| for (index, group) in groups.iter().enumerate() { | |
| if matches!(group.class_name.as_str(), "TEXT" | "BRACKET_TEXT") { | |
| let text = group_text(&tokens, group); | |
| if !text.is_empty() { | |
| *cluster.position_literals[index].entry(text).or_default() += 1; | |
| } | |
| } | |
| } | |
| } | |
| fn cluster_row(rank: usize, key: &str, cluster: &Cluster, total: usize) -> Value { | |
| json!({ | |
| "template_id": format!("tpl_{rank:06}"), | |
| "template": key, | |
| "count": cluster.count, | |
| "coverage": if total == 0 { 0.0 } else { cluster.count as f64 / total as f64 }, | |
| "top_literals": top_counts(&cluster.literal_counts, 12), | |
| "suggested_roles": suggested_roles(key), | |
| "position_top_literals": cluster.position_literals.iter().map(|counts| top_counts(counts, 5)).collect::<Vec<_>>(), | |
| "class_counts": top_counts(&cluster.class_counts, 20), | |
| "examples": cluster.examples, | |
| }) | |
| } | |
/// Returns up to `limit` `(key, count)` pairs from `counts`, ordered by
/// descending count and then ascending key.
fn top_counts(counts: &HashMap<String, usize>, limit: usize) -> Vec<(String, usize)> {
    // Sort borrowed keys first so only the surviving entries are cloned.
    let mut ranked: Vec<(&String, usize)> = counts.iter().map(|(key, hits)| (key, *hits)).collect();
    ranked.sort_by(|(key_a, hits_a), (key_b, hits_b)| {
        hits_b.cmp(hits_a).then_with(|| key_a.cmp(key_b))
    });
    ranked
        .into_iter()
        .take(limit)
        .map(|(key, hits)| (key.clone(), hits))
        .collect()
}
| fn is_high_confidence_recipe(row: &Value, min_count: usize) -> bool { | |
| if row.get("count").and_then(Value::as_u64).unwrap_or(0) < min_count as u64 { | |
| return false; | |
| } | |
| let roles = match row.get("suggested_roles").and_then(Value::as_array) { | |
| Some(roles) => roles, | |
| None => return false, | |
| }; | |
| let role_strings: Vec<&str> = roles.iter().filter_map(Value::as_str).collect(); | |
| if role_strings.iter().any(|role| role.contains("_OR_")) { | |
| return false; | |
| } | |
| if !role_strings.contains(&"TITLE") | |
| || !role_strings.iter().any(|role| { | |
| role.starts_with("EPISODE") || matches!(*role, "SPECIAL" | "SOURCE" | "RESOLUTION") | |
| }) | |
| { | |
| return false; | |
| } | |
| let template = row.get("template").and_then(Value::as_str).unwrap_or(""); | |
| if template.contains("BRACKET_TEXT BRACKET_TEXT") && !role_strings.contains(&"GROUP") { | |
| return false; | |
| } | |
| !role_strings.contains(&"TITLE_OR_TEXT") | |
| } | |
| fn recipe_row(row: &Value, confidence: &str) -> Value { | |
| json!({ | |
| "template_id": row["template_id"], | |
| "template": row["template"], | |
| "roles": row["suggested_roles"], | |
| "confidence": confidence, | |
| "count": row["count"], | |
| "examples": row["examples"], | |
| }) | |
| } | |
| fn write_jsonl_values(path: &PathBuf, rows: &[Value]) -> Result<()> { | |
| if let Some(parent) = path.parent() { | |
| fs::create_dir_all(parent)?; | |
| } | |
| let mut writer = BufWriter::new(File::create(path)?); | |
| for row in rows { | |
| serde_json::to_writer(&mut writer, row)?; | |
| writer.write_all(b"\n")?; | |
| } | |
| writer.flush()?; | |
| Ok(()) | |
| } | |
| fn run_low_frequency_audit(args: &Args) -> Result<()> { | |
| let recipes = load_recipes(args)?; | |
| let inputs = load_input(&args.input, args.limit)?; | |
| let low_template_total = recipes | |
| .values() | |
| .filter(|recipe| recipe.count.unwrap_or(0) <= args.audit_max_count) | |
| .count(); | |
| let mut seen_templates = HashSet::new(); | |
| let mut rows = Vec::new(); | |
| for original in inputs { | |
| if !args.keep_encoding_noise | |
| && (has_encoding_noise(&original) | |
| || has_non_anime_noise(&original) | |
| || has_abstract_path_noise(&original)) | |
| { | |
| continue; | |
| } | |
| let (training_filename, trimmed_parent) = training_filename_for(&original); | |
| let (key, _tokens, _classes, groups) = template_key_for_filename(&training_filename); | |
| let Some(recipe) = recipes.get(&key) else { | |
| continue; | |
| }; | |
| let count = recipe.count.unwrap_or(0); | |
| if count > args.audit_max_count || !seen_templates.insert(recipe.template_id.clone()) { | |
| continue; | |
| } | |
| if recipe.roles.len() != groups.len() { | |
| continue; | |
| } | |
| let Some(mut record) = dmhy_record(&training_filename, &recipe.template_id, &recipe.roles) | |
| else { | |
| continue; | |
| }; | |
| if trimmed_parent { | |
| record.source_filename = Some(original.clone()); | |
| record.path_trimmed = Some(true); | |
| } | |
| rows.push(json!({ | |
| "template_id": recipe.template_id, | |
| "count": count, | |
| "template": recipe.template, | |
| "filename": record.filename, | |
| "source_filename": record.source_filename, | |
| "path_trimmed": record.path_trimmed.unwrap_or(false), | |
| "spans": entity_spans(&record.tokens, &record.labels), | |
| "warnings": audit_warnings(&record), | |
| "tokens": record.tokens, | |
| "labels": record.labels, | |
| })); | |
| if seen_templates.len() >= low_template_total { | |
| break; | |
| } | |
| } | |
| rows.sort_by(|a, b| { | |
| let count_a = a.get("count").and_then(Value::as_u64).unwrap_or(0); | |
| let count_b = b.get("count").and_then(Value::as_u64).unwrap_or(0); | |
| let id_a = a.get("template_id").and_then(Value::as_str).unwrap_or(""); | |
| let id_b = b.get("template_id").and_then(Value::as_str).unwrap_or(""); | |
| count_a.cmp(&count_b).then_with(|| id_a.cmp(id_b)) | |
| }); | |
| write_jsonl_values(&args.audit_output, &rows)?; | |
| let warning_counts = warning_counts(&rows); | |
| let manifest = json!({ | |
| "generated_at": Utc::now().to_rfc3339(), | |
| "input": args.input.to_string_lossy(), | |
| "recipes": args.recipes.to_string_lossy(), | |
| "audit_output": args.audit_output.to_string_lossy(), | |
| "audit_max_count": args.audit_max_count, | |
| "low_template_total": low_template_total, | |
| "audited_templates": rows.len(), | |
| "warning_counts": warning_counts, | |
| "implementation": "rust_dmhy_low_frequency_audit" | |
| }); | |
| println!("{}", serde_json::to_string_pretty(&manifest)?); | |
| Ok(()) | |
| } | |
| fn run_verify_generated_output(args: &Args) -> Result<()> { | |
| let file = File::open(&args.input) | |
| .with_context(|| format!("generated JSONL not found: {}", args.input.display()))?; | |
| let recipes_by_id: HashMap<String, u64> = load_recipes(args)? | |
| .into_values() | |
| .map(|recipe| (recipe.template_id, recipe.count.unwrap_or(0))) | |
| .collect(); | |
| let mut rows = 0usize; | |
| let mut low_frequency_rows = 0usize; | |
| let mut warning_counts: HashMap<String, usize> = HashMap::new(); | |
| let mut examples: HashMap<String, Vec<Value>> = HashMap::new(); | |
| for (line_number, line) in BufReader::new(file).lines().enumerate() { | |
| let line = line?; | |
| if line.trim().is_empty() { | |
| continue; | |
| } | |
| let record: Record = serde_json::from_str(&line).with_context(|| { | |
| format!( | |
| "invalid generated record at {}:{}", | |
| args.input.display(), | |
| line_number + 1 | |
| ) | |
| })?; | |
| rows += 1; | |
| let count = recipes_by_id | |
| .get(&record.template_id) | |
| .copied() | |
| .unwrap_or(u64::MAX); | |
| if count > args.audit_max_count { | |
| continue; | |
| } | |
| low_frequency_rows += 1; | |
| for warning in audit_warnings(&record) { | |
| if !matches!( | |
| warning.as_str(), | |
| "hash_labeled" | "multiple_title_spans" | "no_title" | "path_retained" | |
| ) { | |
| continue; | |
| } | |
| *warning_counts.entry(warning.clone()).or_default() += 1; | |
| let bucket = examples.entry(warning).or_default(); | |
| if bucket.len() < 5 { | |
| bucket.push(json!({ | |
| "template_id": record.template_id, | |
| "template_count": count, | |
| "filename": record.filename, | |
| "spans": entity_spans(&record.tokens, &record.labels), | |
| })); | |
| } | |
| } | |
| } | |
| let manifest = json!({ | |
| "generated_at": Utc::now().to_rfc3339(), | |
| "input": args.input.to_string_lossy(), | |
| "recipes": args.recipes.to_string_lossy(), | |
| "audit_max_count": args.audit_max_count, | |
| "rows": rows, | |
| "low_frequency_rows": low_frequency_rows, | |
| "blocking_warning_counts": warning_counts, | |
| "examples": examples, | |
| "implementation": "rust_dmhy_generated_output_verify" | |
| }); | |
| println!("{}", serde_json::to_string_pretty(&manifest)?); | |
| if !warning_counts.is_empty() { | |
| bail!("generated output still has low-frequency blocking warnings"); | |
| } | |
| Ok(()) | |
| } | |
| fn entity_spans(tokens: &[String], labels: &[String]) -> Vec<Value> { | |
| let mut spans = Vec::new(); | |
| let mut current_label: Option<String> = None; | |
| let mut current_text = String::new(); | |
| for (token, label) in tokens.iter().zip(labels.iter()) { | |
| let entity = label | |
| .strip_prefix("B-") | |
| .or_else(|| label.strip_prefix("I-")) | |
| .unwrap_or("O"); | |
| if current_label.as_deref() == Some(entity) { | |
| current_text.push_str(token); | |
| continue; | |
| } | |
| if let Some(label) = current_label.take() { | |
| if label != "O" { | |
| spans.push(json!({ "label": label, "text": current_text })); | |
| } | |
| } | |
| current_label = Some(entity.to_string()); | |
| current_text = token.clone(); | |
| } | |
| if let Some(label) = current_label { | |
| if label != "O" { | |
| spans.push(json!({ "label": label, "text": current_text })); | |
| } | |
| } | |
| spans | |
| } | |
| fn audit_warnings(record: &Record) -> Vec<String> { | |
| let mut warnings = Vec::new(); | |
| let title_spans = entity_spans(&record.tokens, &record.labels) | |
| .into_iter() | |
| .filter(|span| span.get("label").and_then(Value::as_str) == Some("TITLE")) | |
| .count(); | |
| if title_spans == 0 { | |
| warnings.push("no_title".to_string()); | |
| } else if title_spans > 1 { | |
| warnings.push("multiple_title_spans".to_string()); | |
| } | |
| if !record.labels.iter().any(|label| label.ends_with("EPISODE")) { | |
| warnings.push("no_episode".to_string()); | |
| } | |
| if record.filename.contains('/') || record.filename.contains('\\') { | |
| warnings.push("path_retained".to_string()); | |
| } | |
| for (index, token) in record.tokens.iter().enumerate() { | |
| if HASH_RE.is_match(token) && record.labels.get(index).is_some_and(|label| label != "O") { | |
| warnings.push("hash_labeled".to_string()); | |
| break; | |
| } | |
| } | |
| warnings.sort(); | |
| warnings.dedup(); | |
| warnings | |
| } | |
| fn warning_counts(rows: &[Value]) -> HashMap<String, usize> { | |
| let mut counts = HashMap::new(); | |
| for row in rows { | |
| if let Some(warnings) = row.get("warnings").and_then(Value::as_array) { | |
| for warning in warnings { | |
| if let Some(warning) = warning.as_str() { | |
| *counts.entry(warning.to_string()).or_default() += 1; | |
| } | |
| } | |
| } | |
| } | |
| counts | |
| } | |
| fn process_filename( | |
| original: &str, | |
| args: &Args, | |
| recipes: &HashMap<String, Recipe>, | |
| sample_counters: &HashMap<String, AtomicUsize>, | |
| ) -> Processed { | |
| if !args.keep_encoding_noise | |
| && (has_encoding_noise(original) | |
| || has_non_anime_noise(original) | |
| || has_abstract_path_noise(original)) | |
| { | |
| return Processed::Skipped { | |
| reason: "encoding_noise", | |
| trimmed_parent: false, | |
| }; | |
| } | |
| let (training_filename, trimmed_parent) = training_filename_for(original); | |
| let (key, _tokens, _classes, groups) = template_key_for_filename(&training_filename); | |
| let recipe = match recipes.get(&key) { | |
| Some(recipe) => recipe, | |
| None => { | |
| return Processed::Skipped { | |
| reason: "no_recipe", | |
| trimmed_parent, | |
| } | |
| } | |
| }; | |
| if args.expand == "sample" { | |
| let counter = sample_counters.get(&recipe.template_id).unwrap(); | |
| if counter.fetch_add(1, Ordering::Relaxed) >= args.sample_per_template { | |
| return Processed::Skipped { | |
| reason: "sample_cap", | |
| trimmed_parent, | |
| }; | |
| } | |
| } | |
| if recipe.roles.len() != groups.len() { | |
| return Processed::Skipped { | |
| reason: "role_mismatch", | |
| trimmed_parent, | |
| }; | |
| } | |
| let mut record = match dmhy_record(&training_filename, &recipe.template_id, &recipe.roles) { | |
| Some(record) => record, | |
| None => { | |
| return Processed::Skipped { | |
| reason: "role_mismatch", | |
| trimmed_parent, | |
| } | |
| } | |
| }; | |
| if recipe.count.unwrap_or(0) <= args.audit_max_count && has_blocking_low_frequency_warning(&record) | |
| { | |
| return Processed::Skipped { | |
| reason: "low_frequency_audit_warning", | |
| trimmed_parent, | |
| }; | |
| } | |
| if trimmed_parent { | |
| record.source_filename = Some(original.to_string()); | |
| record.path_trimmed = Some(true); | |
| return Processed::Written { | |
| record, | |
| trimmed_parent: true, | |
| }; | |
| } | |
| Processed::Written { | |
| record, | |
| trimmed_parent: false, | |
| } | |
| } | |
| fn has_blocking_low_frequency_warning(record: &Record) -> bool { | |
| audit_warnings(record).iter().any(|warning| { | |
| matches!( | |
| warning.as_str(), | |
| "hash_labeled" | "multiple_title_spans" | "no_title" | "path_retained" | |
| ) | |
| }) | |
| } | |
| fn tokenize(value: &str) -> Vec<String> { | |
| let mut output = Vec::new(); | |
| let mut index = 0; | |
| while index < value.len() { | |
| let rest = &value[index..]; | |
| if let Some((token, len)) = next_token(rest) { | |
| output.push(token); | |
| index += len; | |
| } else { | |
| let ch = rest.chars().next().unwrap(); | |
| output.push(ch.to_string()); | |
| index += ch.len_utf8(); | |
| } | |
| } | |
| output | |
| } | |
| fn next_token(rest: &str) -> Option<(String, usize)> { | |
| let first = rest.chars().next()?; | |
| if first == '[' { | |
| if let Some(end) = rest.find(']') { | |
| if end <= 121 { | |
| return Some((rest[..=end].to_string(), end + 1)); | |
| } | |
| } | |
| } | |
| if first == '(' { | |
| if let Some(end) = rest.find(')') { | |
| if end <= 121 { | |
| return Some((rest[..=end].to_string(), end + 1)); | |
| } | |
| } | |
| } | |
| if first == '【' { | |
| if let Some(end) = rest.find('】') { | |
| if rest[..end].chars().count() <= 120 { | |
| return Some(( | |
| rest[..end + '】'.len_utf8()].to_string(), | |
| end + '】'.len_utf8(), | |
| )); | |
| } | |
| } | |
| } | |
| for re in TOKEN_REGEXES.iter() { | |
| if let Some(mat) = re.find(rest) { | |
| if mat.start() == 0 && mat.end() > 0 { | |
| return Some((mat.as_str().to_string(), mat.end())); | |
| } | |
| } | |
| } | |
| None | |
| } | |
/// Removes one matching pair of outer brackets (`[]`, `()` or `【】`) from
/// `token` and trims surrounding whitespace from what is left.
///
/// A token that is not fully wrapped (mismatched or missing delimiters) is
/// returned trimmed but otherwise untouched.
fn strip_wrapper(token: &str) -> String {
    const PAIRS: [(char, char); 3] = [('[', ']'), ('(', ')'), ('【', '】')];
    let unwrapped = PAIRS.iter().find_map(|&(open, close)| {
        token
            .strip_prefix(open)
            .and_then(|tail| tail.strip_suffix(close))
    });
    unwrapped.unwrap_or(token).trim().to_string()
}
/// Breaks the inside of a bracketed block into its non-empty parts.
///
/// Whitespace and every character in the delimiter set act as separators;
/// consecutive separators never produce empty parts.
///
/// NOTE(review): the delimiter set lists `(` and `)` twice — possibly a
/// different pair of characters was intended for the second occurrence; confirm.
fn split_inner(inner: &str) -> Vec<String> {
    const DELIMITERS: &str = "_.,+/&|-()()";
    inner
        .split(|ch: char| ch.is_whitespace() || DELIMITERS.contains(ch))
        .filter(|part| !part.is_empty())
        .map(str::to_string)
        .collect()
}
/// Returns `text` with all whitespace and the characters `_`, `.`, `,`, `-`
/// removed, so classification regexes can match the remaining characters.
fn compact_for_classify(text: &str) -> String {
    let mut compact = String::with_capacity(text.len());
    for ch in text.chars() {
        let dropped = ch.is_whitespace() || ch == '_' || ch == '.' || ch == ',' || ch == '-';
        if !dropped {
            compact.push(ch);
        }
    }
    compact
}
| fn classify_atom(text: &str) -> String { | |
| let cleaned = strip_wrapper(text); | |
| let compact = compact_for_classify(&cleaned); | |
| if cleaned.is_empty() { | |
| return "EMPTY".to_string(); | |
| } | |
| if HASH_RE.is_match(&cleaned) { | |
| return "HASH".to_string(); | |
| } | |
| if RESOLUTION_RE.is_match(&cleaned) { | |
| return "RESOLUTION".to_string(); | |
| } | |
| if DATE_RE.is_match(&cleaned) { | |
| return "DATE".to_string(); | |
| } | |
| if EPISODE_VERSION_RE.is_match(&compact) { | |
| return "EPISODE_VERSION".to_string(); | |
| } | |
| if SXE_RE.is_match(&compact) { | |
| return "SXE".to_string(); | |
| } | |
| if EPISODE_RE.is_match(&compact) { | |
| return "EPISODE".to_string(); | |
| } | |
| if EPISODE_CJK_RE.is_match(&cleaned) { | |
| return "EPISODE".to_string(); | |
| } | |
| if EPISODE_BATCH_RE.is_match(&cleaned) { | |
| return "EPISODE_RANGE".to_string(); | |
| } | |
| if EPISODE_RANGE_RE.is_match(&cleaned) { | |
| return "EPISODE_RANGE".to_string(); | |
| } | |
| if EPISODE_RE.is_match(&cleaned) { | |
| return "EPISODE".to_string(); | |
| } | |
| if SEASON_RE.is_match(&cleaned) { | |
| return "SEASON".to_string(); | |
| } | |
| if SPECIAL_RE.is_match(&cleaned) { | |
| return "SPECIAL".to_string(); | |
| } | |
| if VOLUME_RE.is_match(&cleaned) { | |
| return "VOLUME".to_string(); | |
| } | |
| if LANG_RE.is_match(&cleaned) || lang_block_matches(&cleaned) { | |
| return "LANG".to_string(); | |
| } | |
| if MEDIA_RE.is_match(&cleaned) { | |
| return "MEDIA".to_string(); | |
| } | |
| "TEXT".to_string() | |
| } | |
/// Heuristic check for subtitle/language markers inside `text`.
///
/// Returns `true` when any of these holds:
/// * the ASCII-uppercased text contains `CHS`, `CHT`, `ZHS`, `ZHT`, `BIG5` or `GB`;
/// * the text contains one of the listed CJK language markers;
/// * the text contains `字幕` that is not immediately followed by `组`/`組`.
fn lang_block_matches(text: &str) -> bool {
    const ASCII_MARKERS: [&str; 6] = ["CHS", "CHT", "ZHS", "ZHT", "BIG5", "GB"];
    const CJK_MARKERS: [&str; 9] = [
        "简繁", "简日", "繁日", "简体", "繁体", "雙語", "双语", "内封", "外挂",
    ];
    const SUBTITLE: &str = "字幕";

    let upper = text.to_ascii_uppercase();
    if ASCII_MARKERS.iter().any(|marker| upper.contains(marker)) {
        return true;
    }
    if CJK_MARKERS.iter().any(|marker| text.contains(marker)) {
        return true;
    }
    // Any occurrence of the subtitle word counts unless the next character
    // is one of the two excluded followers.
    text.match_indices(SUBTITLE).any(|(at, hit)| {
        !matches!(text[at + hit.len()..].chars().next(), Some('组' | '組'))
    })
}
| fn classify_token(token: &str) -> String { | |
| if token.is_empty() { | |
| return "EMPTY".to_string(); | |
| } | |
| if token.chars().all(char::is_whitespace) { | |
| return "SPACE".to_string(); | |
| } | |
| if token.chars().all(|ch| ch == '/' || ch == '\\') { | |
| return "PATH".to_string(); | |
| } | |
| if token.chars().all(|ch| "-_.::+&|".contains(ch)) { | |
| return "SEP".to_string(); | |
| } | |
| if token.starts_with('[') || token.starts_with('(') || token.starts_with('【') { | |
| let inner = strip_wrapper(token); | |
| let parts = split_inner(&inner); | |
| let whole_class = classify_atom(&inner); | |
| let inner_class = if whole_class != "TEXT" { | |
| if whole_class == "LANG" && parts.len() > 1 { | |
| let part_classes: Vec<String> = | |
| parts.iter().map(|part| classify_atom(part)).collect(); | |
| if part_classes.iter().all(|item| item == &part_classes[0]) { | |
| part_classes[0].clone() | |
| } else if part_classes.iter().all(|item| is_media_block_class(item)) { | |
| "MEDIA_BLOCK".to_string() | |
| } else { | |
| whole_class | |
| } | |
| } else { | |
| whole_class | |
| } | |
| } else if parts.is_empty() { | |
| "EMPTY".to_string() | |
| } else { | |
| let part_classes: Vec<String> = parts.iter().map(|part| classify_atom(part)).collect(); | |
| if part_classes.iter().all(|item| item == &part_classes[0]) { | |
| part_classes[0].clone() | |
| } else if part_classes.iter().all(|item| is_media_block_class(item)) { | |
| "MEDIA_BLOCK".to_string() | |
| } else if part_classes.iter().any(|item| is_media_block_class(item)) | |
| && parts.iter().zip(part_classes.iter()).all(|(part, item)| { | |
| is_media_block_class(item) | |
| || matches!(part.to_ascii_lowercase().as_str(), "anime" | "アニメ") | |
| }) | |
| { | |
| "MEDIA_BLOCK".to_string() | |
| } else if part_classes.iter().any(|item| item == "TEXT") { | |
| "TEXT".to_string() | |
| } else { | |
| let mut set: Vec<String> = part_classes | |
| .into_iter() | |
| .collect::<HashSet<_>>() | |
| .into_iter() | |
| .collect(); | |
| set.sort(); | |
| set.join("_") | |
| } | |
| }; | |
| return format!("BRACKET_{inner_class}"); | |
| } | |
| classify_atom(token) | |
| } | |
/// True for the atom classes that may appear together inside a media block:
/// `MEDIA`, `RESOLUTION`, `LANG`, `HASH` and `DATE`.
fn is_media_block_class(value: &str) -> bool {
    ["MEDIA", "RESOLUTION", "LANG", "HASH", "DATE"].contains(&value)
}
| fn compact_token_groups(_tokens: &[String], classes: &[String]) -> Vec<Group> { | |
| let mut groups: Vec<Group> = Vec::new(); | |
| let mut previous: Option<String> = None; | |
| for (index, token_class) in classes.iter().enumerate() { | |
| let current = if token_class == "SPACE" { | |
| "SEP" | |
| } else { | |
| token_class | |
| } | |
| .to_string(); | |
| if previous.as_deref() == Some(current.as_str()) | |
| && matches!(current.as_str(), "SEP" | "TEXT") | |
| { | |
| groups.last_mut().unwrap().indices.push(index); | |
| } else { | |
| groups.push(Group { | |
| indices: vec![index], | |
| class_name: current.clone(), | |
| }); | |
| } | |
| previous = Some(current); | |
| } | |
| groups | |
| } | |
| fn template_key_for_filename(filename: &str) -> (String, Vec<String>, Vec<String>, Vec<Group>) { | |
| let tokens = tokenize(filename); | |
| let classes: Vec<String> = tokens.iter().map(|token| classify_token(token)).collect(); | |
| let groups = compact_token_groups(&tokens, &classes); | |
| let key = groups | |
| .iter() | |
| .map(|group| group.class_name.as_str()) | |
| .collect::<Vec<_>>() | |
| .join(" "); | |
| (key, tokens, classes, groups) | |
| } | |
/// Proposes one role per whitespace-separated class in `template`.
///
/// Step 1 maps each class to a base role by keyword (`EPISODE`, `RESOLUTION`,
/// `HASH`, …; unknown classes get `O`).
/// Step 2 works per path segment (segments are delimited by `PATH` items) and
/// assigns `GROUP`/`TITLE` to `BRACKET_TEXT` and `TEXT` items that occur before
/// the first structural item of the segment.
/// Step 3 is a fallback for segments that start with an episode-like item and
/// still have no title: a run of at least two `TEXT` items after it becomes
/// the title.
fn suggested_roles(template: &str) -> Vec<String> {
    /// Keyword-based role for a single class name; earlier keywords win.
    fn base_role(item: &str) -> &'static str {
        if item.contains("EPISODE_VERSION") {
            "EPISODE_VERSION"
        } else if item.contains("EPISODE_RANGE") {
            "EPISODE_RANGE"
        } else if item.contains("EPISODE") || item.contains("SXE") {
            "EPISODE"
        } else if item.contains("RESOLUTION") {
            "RESOLUTION"
        } else if item.contains("HASH") {
            "HASH"
        } else if item.contains("LANG") || item.contains("MEDIA") {
            "SOURCE"
        } else if item.contains("SPECIAL") {
            "SPECIAL"
        } else if item.contains("SEASON") {
            "SEASON"
        } else if item.contains("VOLUME") {
            "VOLUME"
        } else {
            "O"
        }
    }

    let items: Vec<&str> = template.split_whitespace().collect();
    let mut roles: Vec<String> = items
        .iter()
        .map(|item| base_role(item).to_string())
        .collect();

    // Half-open [start, end) ranges of the items between PATH separators.
    let mut segments: Vec<(usize, usize)> = Vec::new();
    let mut segment_start = 0usize;
    for (index, item) in items.iter().enumerate() {
        if *item == "PATH" {
            segments.push((segment_start, index));
            segment_start = index + 1;
        }
    }
    segments.push((segment_start, items.len()));

    for (start, end) in segments {
        if start >= end {
            continue;
        }
        let first_structural = items[start..end]
            .iter()
            .position(|item| {
                item.contains("EPISODE") || matches!(*item, "SXE" | "SPECIAL" | "SEASON")
            })
            .map_or(end, |offset| start + offset);

        // Still-unassigned text items ahead of the first structural item.
        let mut bracket_text = Vec::new();
        let mut text = Vec::new();
        for index in start..first_structural {
            if roles[index] != "O" {
                continue;
            }
            match items[index] {
                "BRACKET_TEXT" => bracket_text.push(index),
                "TEXT" => text.push(index),
                _ => {}
            }
        }

        match bracket_text.as_slice() {
            [] => {}
            [only] => {
                // A lone bracket is the group only when it opens the segment
                // and plain text is available to serve as the title.
                let role = if !text.is_empty() && *only == start {
                    "GROUP"
                } else {
                    "TITLE"
                };
                roles[*only] = role.to_string();
            }
            [group, titles @ ..] => {
                roles[*group] = "GROUP".to_string();
                for index in titles {
                    roles[*index] = "TITLE".to_string();
                }
            }
        }
        for index in text {
            roles[index] = "TITLE".to_string();
        }

        let has_title = roles[start..end].iter().any(|role| role == "TITLE");
        if has_title || !items[start].contains("EPISODE") {
            continue;
        }
        // Fallback: collect the first run of TEXT items (SEP items are
        // transparent) that follows the leading episode-like item.
        let mut run = Vec::new();
        for index in (start + 1)..end {
            match items[index] {
                "TEXT" if roles[index] == "O" => run.push(index),
                "SEP" => {}
                _ if !run.is_empty() => break,
                _ => {}
            }
        }
        if run.len() >= 2 {
            for index in run {
                roles[index] = "TITLE".to_string();
            }
        }
    }
    roles
}
| fn filename_has_title(filename: &str) -> bool { | |
| let (key, _, _, _) = template_key_for_filename(filename); | |
| suggested_roles(&key).iter().any(|role| role == "TITLE") | |
| } | |
| fn training_filename_for(original: &str) -> (String, bool) { | |
| let parts: Vec<&str> = original | |
| .split(|ch| ch == '/' || ch == '\\') | |
| .map(str::trim) | |
| .filter(|part| !part.is_empty()) | |
| .collect(); | |
| if parts.len() >= 2 && filename_has_title(parts[parts.len() - 1]) { | |
| if parts.len() >= 3 && path_segment_has_season(parts[parts.len() - 2]) { | |
| if !path_segment_is_plain_season(parts[parts.len() - 2]) { | |
| return (parts[parts.len() - 1].to_string(), true); | |
| } | |
| let parent_seasons = path_segment_seasons(parts[parts.len() - 2]); | |
| let leaf_seasons = path_segment_seasons(parts[parts.len() - 1]); | |
| if parent_seasons | |
| .iter() | |
| .any(|season| leaf_seasons.contains(season)) | |
| { | |
| (parts[parts.len() - 1].to_string(), true) | |
| } else { | |
| (parts[parts.len() - 2..].join("/"), true) | |
| } | |
| } else { | |
| (parts[parts.len() - 1].to_string(), true) | |
| } | |
| } else { | |
| (original.to_string(), false) | |
| } | |
| } | |
| fn path_segment_is_plain_season(segment: &str) -> bool { | |
| let cleaned = strip_wrapper(segment).trim().to_string(); | |
| PLAIN_SEASON_SEGMENT_RE.is_match(&cleaned) | |
| } | |
| fn path_segment_has_season(value: &str) -> bool { | |
| PATH_SEGMENT_SEASON_RE.is_match(value) | |
| } | |
| fn path_segment_seasons(value: &str) -> HashSet<u8> { | |
| SEASON_WORD_NUMBER_RE | |
| .captures_iter(value) | |
| .chain(S_NUMBER_SEGMENT_RE.captures_iter(value)) | |
| .chain(SXE_SEASON_RE.captures_iter(value)) | |
| .filter_map(|captures| captures.get(1)) | |
| .filter_map(|item| item.as_str().parse::<u8>().ok()) | |
| .collect() | |
| } | |
/// Heuristic detector for text that looks like a mis-decoded filename.
///
/// Returns `true` when `value` contains U+FFFD, or at least two occurrences
/// of the marker characters below, or one marker occurrence together with at
/// least one character in the range U+FF61..=U+FF9F.
fn has_encoding_noise(value: &str) -> bool {
    const MARKERS: [&str; 12] = [
        "譁", "蜈", "螟", "蟄", "謇", "邱", "荳", "縺", "繧", "莨", "鬆", "髯",
    ];
    if value.contains('\u{fffd}') {
        return true;
    }
    let marker_hits: usize = MARKERS
        .iter()
        .map(|marker| value.matches(marker).count())
        .sum();
    if marker_hits >= 2 {
        return true;
    }
    marker_hits == 1
        && value
            .chars()
            .any(|ch| matches!(ch, '\u{ff61}'..='\u{ff9f}'))
}
/// True when the path, after converting `\` to `/`, trimming and ASCII
/// lower-casing, is exactly `mtv`, starts with `mtv/`, or contains `/mtv/`.
fn has_non_anime_noise(value: &str) -> bool {
    let unified = value.replace('\\', "/");
    let path = unified.trim().to_ascii_lowercase();
    if path == "mtv" {
        return true;
    }
    path.starts_with("mtv/") || path.contains("/mtv/")
}
/// Returns `value` with every whitespace character removed and ASCII letters
/// lower-cased, for loose comparison of path segments.
fn normalized_path_segment(value: &str) -> String {
    let mut squeezed = String::with_capacity(value.len());
    for ch in value.chars() {
        if !ch.is_whitespace() {
            squeezed.push(ch.to_ascii_lowercase());
        }
    }
    squeezed
}
| fn path_segment_is_episodeish(value: &str) -> bool { | |
| let (_, _, _, groups) = template_key_for_filename(value); | |
| let structural: Vec<&String> = groups | |
| .iter() | |
| .map(|group| &group.class_name) | |
| .filter(|item| item.as_str() != "SEP") | |
| .collect(); | |
| !structural.is_empty() | |
| && structural | |
| .iter() | |
| .all(|item| item.starts_with("EPISODE") || item.as_str() == "SPECIAL") | |
| } | |
| fn has_abstract_path_noise(value: &str) -> bool { | |
| let parts: Vec<&str> = value | |
| .split(|ch| ch == '/' || ch == '\\') | |
| .map(str::trim) | |
| .filter(|part| !part.is_empty()) | |
| .collect(); | |
| if parts.len() < 3 { | |
| return false; | |
| } | |
| if normalized_path_segment(parts[0]) == normalized_path_segment(parts[parts.len() - 1]) { | |
| return true; | |
| } | |
| path_segment_is_episodeish(parts[0]) && path_segment_is_episodeish(parts[parts.len() - 1]) | |
| } | |
/// Maps a role name to its output label.
///
/// All episode roles share `B-EPISODE`, `VOLUME` is folded into `B-SPECIAL`,
/// and any role without an entity (including `HASH` and `O`) yields `O`.
fn role_label(role: &str) -> String {
    let label = match role {
        "GROUP" => "B-GROUP",
        "TITLE" => "B-TITLE",
        "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE" => "B-EPISODE",
        "SEASON" => "B-SEASON",
        "SPECIAL" | "VOLUME" => "B-SPECIAL",
        "RESOLUTION" => "B-RESOLUTION",
        "SOURCE" => "B-SOURCE",
        _ => "O",
    };
    label.to_string()
}
/// True when `piece` contains no alphanumeric character at all — which
/// includes the empty string and pure whitespace/punctuation.
fn is_separator(piece: &str) -> bool {
    !piece.chars().any(char::is_alphanumeric)
}
/// Buckets a character for piece splitting: `"digit"` (ASCII digit),
/// `"alpha"` (ASCII letter), `"text"` (any other alphanumeric character) or
/// `"sep"` (everything that is not alphanumeric, whitespace included).
fn char_kind(ch: char) -> &'static str {
    match ch {
        '0'..='9' => "digit",
        'a'..='z' | 'A'..='Z' => "alpha",
        other if other.is_alphanumeric() => "text",
        _ => "sep",
    }
}
| fn split_refined_token(token: &str) -> Vec<String> { | |
| let whole_class = classify_atom(token); | |
| let is_wrapped = { | |
| let chars: Vec<char> = token.chars().collect(); | |
| chars.len() >= 2 | |
| && ((chars[0] == '[' && chars[chars.len() - 1] == ']') | |
| || (chars[0] == '(' && chars[chars.len() - 1] == ')') | |
| || (chars[0] == '【' && chars[chars.len() - 1] == '】')) | |
| }; | |
| if !is_wrapped | |
| && matches!( | |
| whole_class.as_str(), | |
| "RESOLUTION" | "MEDIA" | "LANG" | "HASH" | "SXE" | "EPISODE_VERSION" | |
| ) | |
| && token.chars().all(char::is_alphanumeric) | |
| { | |
| return vec![token.to_string()]; | |
| } | |
| if !is_wrapped && whole_class == "EPISODE" && SIMPLE_EPISODE_RE.is_match(token) { | |
| return vec![token.to_string()]; | |
| } | |
| let mut pieces = Vec::new(); | |
| let mut current = String::new(); | |
| let mut current_kind: Option<&str> = None; | |
| for ch in token.chars() { | |
| let kind = char_kind(ch); | |
| if kind == "sep" { | |
| if !current.is_empty() { | |
| pieces.push(std::mem::take(&mut current)); | |
| current_kind = None; | |
| } | |
| pieces.push(ch.to_string()); | |
| continue; | |
| } | |
| if !current.is_empty() && current_kind != Some(kind) { | |
| pieces.push(std::mem::take(&mut current)); | |
| } | |
| current.push(ch); | |
| current_kind = Some(kind); | |
| } | |
| if !current.is_empty() { | |
| pieces.push(current); | |
| } | |
| let mut merged = Vec::new(); | |
| let mut index = 0; | |
| while index < pieces.len() { | |
| if index + 2 < pieces.len() | |
| && !is_separator(&pieces[index]) | |
| && is_separator(&pieces[index + 1]) | |
| && !is_separator(&pieces[index + 2]) | |
| { | |
| let combined = format!( | |
| "{}{}{}", | |
| pieces[index], | |
| pieces[index + 1], | |
| pieces[index + 2] | |
| ); | |
| let combined_class = classify_atom(&combined); | |
| if !pieces[index + 1].chars().any(char::is_whitespace) | |
| && matches!(pieces[index + 1].as_str(), "." | "x" | "X" | "×") | |
| && matches!( | |
| combined_class.as_str(), | |
| "RESOLUTION" | "MEDIA" | "LANG" | "HASH" | "SXE" | "EPISODE_VERSION" | |
| ) | |
| { | |
| merged.push(combined); | |
| index += 3; | |
| continue; | |
| } | |
| } | |
| if !is_separator(&pieces[index]) { | |
| let mut end = index; | |
| let mut combined = String::new(); | |
| while end < pieces.len() && !is_separator(&pieces[end]) { | |
| combined.push_str(&pieces[end]); | |
| end += 1; | |
| } | |
| if end > index + 1 && is_mergeable_refined_class(&classify_atom(&combined)) { | |
| merged.push(combined); | |
| index = end; | |
| continue; | |
| } | |
| } | |
| if index + 1 < pieces.len() | |
| && !is_separator(&pieces[index]) | |
| && !is_separator(&pieces[index + 1]) | |
| { | |
| let combined = format!("{}{}", pieces[index], pieces[index + 1]); | |
| if is_mergeable_refined_class(&classify_atom(&combined)) { | |
| merged.push(combined); | |
| index += 2; | |
| continue; | |
| } | |
| } | |
| merged.push(pieces[index].clone()); | |
| index += 1; | |
| } | |
| merged | |
| } | |
/// True for atom classes whose pieces may be glued back together after
/// fine-grained splitting.
fn is_mergeable_refined_class(value: &str) -> bool {
    const MERGEABLE: [&str; 7] = [
        "RESOLUTION",
        "MEDIA",
        "LANG",
        "HASH",
        "SXE",
        "EPISODE_VERSION",
        "SEASON",
    ];
    MERGEABLE.contains(&value)
}
| fn label_for_refined_piece(piece: &str, role: &str, token_class: &str) -> String { | |
| if is_separator(piece) { | |
| return "O".to_string(); | |
| } | |
| let atom_class = classify_atom(piece); | |
| let upper = piece.to_ascii_uppercase(); | |
| if matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE") { | |
| if atom_class == "SEASON" { | |
| return "B-SEASON".to_string(); | |
| } | |
| if matches!(atom_class.as_str(), "EPISODE" | "EPISODE_VERSION" | "SXE") | |
| || piece.chars().all(|ch| ch.is_ascii_digit()) | |
| { | |
| return "B-EPISODE".to_string(); | |
| } | |
| if matches!(atom_class.as_str(), "SPECIAL" | "VOLUME") | |
| || matches!( | |
| upper.as_str(), | |
| "OVA" | "OAD" | "SP" | "PV" | "CM" | "OP" | "ED" | "NCOP" | "NCED" | |
| ) | |
| { | |
| return "B-SPECIAL".to_string(); | |
| } | |
| return "O".to_string(); | |
| } | |
| if role == "SOURCE" || matches!(token_class, "BRACKET_MEDIA_BLOCK" | "MEDIA_BLOCK") { | |
| if atom_class == "RESOLUTION" { | |
| return "B-RESOLUTION".to_string(); | |
| } | |
| if atom_class == "HASH" { | |
| return "O".to_string(); | |
| } | |
| if matches!(atom_class.as_str(), "MEDIA" | "LANG") { | |
| return "B-SOURCE".to_string(); | |
| } | |
| if matches!(atom_class.as_str(), "SPECIAL" | "VOLUME") { | |
| return "B-SPECIAL".to_string(); | |
| } | |
| return if matches!( | |
| upper.as_str(), | |
| "END" | "FIN" | "COMPLETE" | "TV" | "全集" | "全" | |
| ) { | |
| "O".to_string() | |
| } else { | |
| "B-SOURCE".to_string() | |
| }; | |
| } | |
| if role == "RESOLUTION" { | |
| return if atom_class == "RESOLUTION" || piece.chars().all(|ch| ch.is_ascii_digit()) { | |
| "B-RESOLUTION".to_string() | |
| } else { | |
| "O".to_string() | |
| }; | |
| } | |
| role_label(role) | |
| } | |
| fn split_sxe_token(token: &str) -> Option<(Vec<String>, Vec<String>)> { | |
| let caps = SXE_VALUE_RE.captures(token)?; | |
| let mut pieces = vec![ | |
| "S".to_string(), | |
| caps[1].to_string(), | |
| "E".to_string(), | |
| caps[2].to_string(), | |
| ]; | |
| let mut labels = vec![ | |
| "O".to_string(), | |
| "B-SEASON".to_string(), | |
| "O".to_string(), | |
| "B-EPISODE".to_string(), | |
| ]; | |
| if let Some(version) = caps.get(3) { | |
| pieces.push("v".to_string()); | |
| pieces.push(version.as_str().to_string()); | |
| labels.push("O".to_string()); | |
| labels.push("O".to_string()); | |
| } | |
| Some((pieces, labels)) | |
| } | |
| fn split_episode_token(token: &str) -> Option<(Vec<String>, Vec<String>)> { | |
| let caps = EPISODE_VALUE_RE.captures(token)?; | |
| let mut pieces = vec![caps[1].to_string(), caps[2].to_string()]; | |
| let mut labels = vec!["O".to_string(), "B-EPISODE".to_string()]; | |
| if let Some(version) = caps.get(3) { | |
| pieces.push("v".to_string()); | |
| pieces.push(version.as_str().to_string()); | |
| labels.push("O".to_string()); | |
| labels.push("O".to_string()); | |
| } | |
| Some((pieces, labels)) | |
| } | |
| fn split_season_token(token: &str) -> Option<(Vec<String>, Vec<String>)> { | |
| let caps = SEASON_VALUE_RE.captures(token)?; | |
| Some(( | |
| vec!["S".to_string(), caps[1].to_string()], | |
| vec!["O".to_string(), "B-SEASON".to_string()], | |
| )) | |
| } | |
| fn group_text(tokens: &[String], group: &Group) -> String { | |
| strip_wrapper( | |
| &group | |
| .indices | |
| .iter() | |
| .map(|&index| tokens[index].as_str()) | |
| .collect::<String>(), | |
| ) | |
| } | |
| fn is_special_title_phrase(text: &str) -> bool { | |
| let normalized = SPECIAL_SPACE_RE | |
| .replace_all(text, " ") | |
| .trim() | |
| .to_ascii_uppercase(); | |
| matches!( | |
| normalized.as_str(), | |
| "CM" | "EVENT" | |
| | "EIZOU" | |
| | "LOGO" | |
| | "MENU" | |
| | "OMAKE" | |
| | "PREVIEW" | |
| | "PV" | |
| | "THEATER GREETING EVENT" | |
| | "TOKUTEN" | |
| | "TRAILER" | |
| | "WORLD PREMIERE" | |
| ) || SPECIAL_TITLE_PHRASE_RE.is_match(text) | |
| } | |
/// Refines the template-derived `roles` using the actual text of each group.
///
/// Returns a new role vector; `roles` itself is never modified. Conditions that
/// read `roles` therefore see the incoming assignment, while conditions that
/// read `output` also see adjustments made earlier in this call.
///
/// `groups` is indexed with role indices throughout.
/// NOTE(review): this presumably requires `groups.len() >= roles.len()` —
/// confirm that every caller guarantees it.
| fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]) -> Vec<String> { | |
| let mut output = roles.to_vec(); | |
| let ep_markers = ["EP", "E", "Episode", "ep", "episode"]; | |
| let roman = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX"]; | |
// Pre-pass 1: no TITLE yet and the first role is episode-like. Collect the
// first run of unassigned TEXT groups (SEP groups are skipped, any other
// group ends a non-empty run); a run of two or more becomes the title.
| if !output.iter().any(|role| role == "TITLE") | |
| && roles | |
| .first() | |
| .is_some_and(|role| role.starts_with("EPISODE")) | |
| { | |
| let mut title_run = Vec::new(); | |
| for index in 1..roles.len() { | |
| if groups[index].class_name == "TEXT" && output[index] == "O" { | |
| title_run.push(index); | |
| continue; | |
| } | |
| if groups[index].class_name == "SEP" { | |
| continue; | |
| } | |
| if !title_run.is_empty() { | |
| break; | |
| } | |
| } | |
| if title_run.len() >= 2 { | |
| let last_title_index = *title_run.last().unwrap(); | |
| let later_structural = roles[last_title_index + 1..].iter().any(|role| { | |
| role.starts_with("EPISODE") || matches!(role.as_str(), "SEASON" | "SPECIAL") | |
| }); | |
// An all-digit leading group joins the title when another structural
// role follows the run.
| if group_text(tokens, &groups[0]) | |
| .chars() | |
| .all(|ch| ch.is_ascii_digit()) | |
| && later_structural | |
| { | |
| output[0] = "TITLE".to_string(); | |
| } | |
| for index in title_run { | |
| output[index] = "TITLE".to_string(); | |
| } | |
| } | |
| } | |
// Pre-pass 2: an all-digit leading episode group becomes TITLE when a TITLE
// exists and an episode/season/special role follows the first TITLE.
| if roles | |
| .first() | |
| .is_some_and(|role| role.starts_with("EPISODE")) | |
| && group_text(tokens, &groups[0]) | |
| .chars() | |
| .all(|ch| ch.is_ascii_digit()) | |
| { | |
| if let Some(first_title) = output.iter().position(|role| role == "TITLE") { | |
| let later_structural = roles[first_title + 1..].iter().any(|role| { | |
| role.starts_with("EPISODE") || matches!(role.as_str(), "SEASON" | "SPECIAL") | |
| }); | |
| if later_structural { | |
| output[0] = "TITLE".to_string(); | |
| } | |
| } | |
| } | |
// Main pass: per-group rules, applied in the order written. A `continue`
// skips all remaining rules for that index.
| for index in 0..roles.len() { | |
| let text = group_text(tokens, &groups[index]); | |
// Unassigned groups whose class mentions SXE are episodes.
| if output[index] == "O" && groups[index].class_name.contains("SXE") { | |
| output[index] = "EPISODE".to_string(); | |
| } | |
// Episode-role text matching YEAR_RANGE_RE is cleared to O.
| if roles[index].starts_with("EPISODE") && YEAR_RANGE_RE.is_match(&text) { | |
| output[index] = "O".to_string(); | |
| continue; | |
| } | |
// Rules that look two groups back (word, separator, number).
| if roles[index].starts_with("EPISODE") && (2..roles.len()).contains(&index) { | |
| let previous_text = group_text(tokens, &groups[index - 2]); | |
| let next_special = output[index + 1..roles.len().min(index + 4)] | |
| .iter() | |
| .any(|role| role == "SPECIAL"); | |
| let next_episode = roles[index + 1..] | |
| .iter() | |
| .any(|role| role.starts_with("EPISODE")); | |
// "vol"/"volume" + SEP + number: both the word and the number become
// TITLE when the next non-SEP group is TEXT and an episode role comes
// after it, otherwise both become SPECIAL.
| if groups[index - 1].class_name == "SEP" | |
| && matches!( | |
| previous_text.to_ascii_lowercase().as_str(), | |
| "vol" | "volume" | |
| ) | |
| { | |
| let next_text_before_episode = (index + 1..roles.len()) | |
| .find(|&cursor| groups[cursor].class_name != "SEP") | |
| .is_some_and(|cursor| { | |
| groups[cursor].class_name == "TEXT" | |
| && roles[cursor + 1..] | |
| .iter() | |
| .any(|role| role.starts_with("EPISODE")) | |
| }); | |
| if next_text_before_episode { | |
| output[index - 2] = "TITLE".to_string(); | |
| output[index] = "TITLE".to_string(); | |
| continue; | |
| } | |
| output[index - 2] = "SPECIAL".to_string(); | |
| output[index] = "SPECIAL".to_string(); | |
| continue; | |
| } | |
// A short ASCII-letter title word (at most 4 bytes) + SEP + a number of
// at most 3 digits stays part of the title when a SPECIAL is among the
// next three roles or any later role is episode-like.
| if output[index - 2] == "TITLE" | |
| && groups[index - 1].class_name == "SEP" | |
| && previous_text.len() <= 4 | |
| && previous_text.is_ascii() | |
| && previous_text.chars().all(|ch| ch.is_ascii_alphabetic()) | |
| && text.chars().all(|ch| ch.is_ascii_digit()) | |
| && text.len() <= 3 | |
| && (next_special || next_episode) | |
| { | |
| output[index] = "TITLE".to_string(); | |
| continue; | |
| } | |
| } | |
// All-digit episode after a TITLE, followed by SEP and the word "episode":
// if the TEXT run starting at that word has two or more groups and an
// episode role comes later, the number and the run become TITLE.
| if roles[index].starts_with("EPISODE") | |
| && index >= 2 | |
| && output[..index].iter().any(|role| role == "TITLE") | |
| && group_text(tokens, &groups[index]) | |
| .chars() | |
| .all(|ch| ch.is_ascii_digit()) | |
| { | |
| let next_episode_word = index + 2 < roles.len() | |
| && groups[index + 1].class_name == "SEP" | |
| && group_text(tokens, &groups[index + 2]).eq_ignore_ascii_case("episode"); | |
| if next_episode_word { | |
| let mut run = Vec::new(); | |
| let mut cursor = index + 2; | |
| while cursor < roles.len() { | |
| if groups[cursor].class_name == "SEP" { | |
| cursor += 1; | |
| continue; | |
| } | |
| if groups[cursor].class_name == "TEXT" && !roles[cursor].starts_with("EPISODE") | |
| { | |
| run.push(cursor); | |
| cursor += 1; | |
| continue; | |
| } | |
| break; | |
| } | |
| let later_episode = roles[cursor..] | |
| .iter() | |
| .any(|role| role.starts_with("EPISODE")); | |
| if run.len() >= 2 && later_episode { | |
| output[index] = "TITLE".to_string(); | |
| for item in run { | |
| output[item] = "TITLE".to_string(); | |
| } | |
| continue; | |
| } | |
| } | |
| } | |
// Title text recognised by is_special_title_phrase becomes SPECIAL.
| if roles[index] == "TITLE" && is_special_title_phrase(&text) { | |
| output[index] = "SPECIAL".to_string(); | |
| continue; | |
| } | |
// A title consisting of exactly one of these single characters is cleared.
| if roles[index] == "TITLE" && matches!(text.as_str(), "第" | "話" | "话" | "回" | "集") | |
| { | |
| output[index] = "O".to_string(); | |
| continue; | |
| } | |
// Unassigned alphabetic TEXT before a later episode role extends the title,
// provided a TITLE precedes it with no episode role in between and the text
// is not one of `ep_markers`.
| if output[index] == "O" | |
| && groups[index].class_name == "TEXT" | |
| && roles[index + 1..].iter().any(|role| role.starts_with("EPISODE")) | |
| && text.chars().any(|ch| ch.is_alphabetic()) | |
| && !ep_markers.contains(&text.as_str()) | |
| { | |
| if let Some(last_title) = output[..index].iter().rposition(|role| role == "TITLE") { | |
| let episode_since_title = output[last_title + 1..index] | |
| .iter() | |
| .any(|role| role.starts_with("EPISODE")); | |
| if !episode_since_title { | |
| output[index] = "TITLE".to_string(); | |
| continue; | |
| } | |
| } | |
| } | |
// "season"/"saison" + SEP + episode-role group: the word is cleared and the
// following group is relabelled SEASON.
| if roles[index] == "TITLE" | |
| && matches!(text.to_ascii_lowercase().as_str(), "season" | "saison") | |
| && index + 2 < roles.len() | |
| && groups[index + 1].class_name == "SEP" | |
| && roles[index + 2].starts_with("EPISODE") | |
| { | |
| output[index] = "O".to_string(); | |
| output[index + 2] = "SEASON".to_string(); | |
| continue; | |
| } | |
// An upper-case roman numeral (I..IX) between an earlier TITLE and a later
// episode/special role is relabelled SEASON.
| if roles[index] == "TITLE" | |
| && text == text.to_ascii_uppercase() | |
| && roman.contains(&text.as_str()) | |
| { | |
| let previous_title = output[..index].iter().any(|role| role == "TITLE"); | |
| let next_structural = roles[index + 1..] | |
| .iter() | |
| .any(|role| role.starts_with("EPISODE") || role == "SPECIAL"); | |
| if previous_title && next_structural { | |
| output[index] = "SEASON".to_string(); | |
| continue; | |
| } | |
| } | |
// episode-role group, SEP, an `ep_markers` word, SEP, episode-role group:
// the first group becomes TITLE and the marker word is cleared.
| if roles[index].starts_with("EPISODE") && index + 4 < roles.len() { | |
| if groups[index + 1].class_name == "SEP" | |
| && ep_markers.contains(&group_text(tokens, &groups[index + 2]).as_str()) | |
| && groups[index + 3].class_name == "SEP" | |
| && roles[index + 4].starts_with("EPISODE") | |
| { | |
| output[index] = "TITLE".to_string(); | |
| output[index + 2] = "O".to_string(); | |
| } | |
| } | |
// Episode-role groups whose neighbouring group text contains one of the
// listed characters/words are cleared to O.
// NOTE(review): these look like time-of-day expressions — confirm intent.
| if roles[index].starts_with("EPISODE") { | |
| let previous_text = if index >= 1 { | |
| group_text(tokens, &groups[index - 1]) | |
| } else { | |
| String::new() | |
| }; | |
| let next_text = if index + 1 < roles.len() { | |
| group_text(tokens, &groups[index + 1]) | |
| } else { | |
| String::new() | |
| }; | |
| if previous_text.contains('点') | |
| || previous_text.contains('點') | |
| || previous_text.contains("晚上") | |
| || previous_text.contains("上午") | |
| || previous_text.contains("下午") | |
| || next_text.contains('点') | |
| || next_text.contains('點') | |
| || next_text.contains('半') | |
| { | |
| output[index] = "O".to_string(); | |
| } | |
| } | |
| } | |
| output | |
| } | |
| fn title_candidates(groups: &[Group], roles: &[String]) -> Vec<(usize, usize)> { | |
| let mut candidates = Vec::new(); | |
| let mut index = 0; | |
| while index < roles.len() { | |
| if roles[index] != "TITLE" { | |
| index += 1; | |
| continue; | |
| } | |
| let start = index; | |
| index += 1; | |
| loop { | |
| if index < roles.len() | |
| && roles[index] == "TITLE" | |
| && !(groups[index - 1].class_name == "BRACKET_TEXT" | |
| && groups[index].class_name == "BRACKET_TEXT") | |
| { | |
| index += 1; | |
| continue; | |
| } | |
| if index + 1 < roles.len() | |
| && roles[index] == "O" | |
| && groups[index].class_name == "SEP" | |
| && roles[index + 1] == "TITLE" | |
| { | |
| index += 2; | |
| continue; | |
| } | |
| break; | |
| } | |
| candidates.push((start, index)); | |
| } | |
| candidates | |
| } | |
| fn enforce_single_title_candidate( | |
| groups: &[Group], | |
| roles: &[String], | |
| ) -> (Vec<String>, Vec<String>) { | |
| let candidates = title_candidates(groups, roles); | |
| if candidates.len() <= 1 { | |
| return (roles.to_vec(), Vec::new()); | |
| } | |
| let first_anchor = roles | |
| .iter() | |
| .position(|role| { | |
| role.starts_with("EPISODE") | |
| || matches!( | |
| role.as_str(), | |
| "SEASON" | "SPECIAL" | "SOURCE" | "RESOLUTION" | |
| ) | |
| }) | |
| .unwrap_or(roles.len()); | |
| let before_anchor: Vec<(usize, usize)> = candidates | |
| .iter() | |
| .copied() | |
| .filter(|(_, end)| *end <= first_anchor) | |
| .collect(); | |
| let selected = (if before_anchor.is_empty() { | |
| &candidates | |
| } else { | |
| &before_anchor | |
| }) | |
| .iter() | |
| .max_by_key(|(start, end)| (*end, end - start)) | |
| .copied() | |
| .unwrap(); | |
| let mut output = roles.to_vec(); | |
| let mut dropped = Vec::new(); | |
| for (start, end) in candidates { | |
| if (start, end) == selected { | |
| continue; | |
| } | |
| for index in start..end { | |
| if output[index] == "TITLE" { | |
| output[index] = "O".to_string(); | |
| dropped.push(index.to_string()); | |
| } | |
| } | |
| } | |
| (output, dropped) | |
| } | |
| fn normalize_generated_tokens(tokens: &[String], labels: &[String]) -> (Vec<String>, Vec<String>) { | |
| let mut output_tokens = Vec::new(); | |
| let mut output_labels = Vec::new(); | |
| for (token, label) in tokens.iter().zip(labels.iter()) { | |
| for piece in split_generated_token(token) { | |
| output_labels.push(if label == "O" || is_standalone_separator(&piece) { | |
| "O".to_string() | |
| } else { | |
| label.clone() | |
| }); | |
| output_tokens.push(piece); | |
| } | |
| } | |
| (output_tokens, output_labels) | |
| } | |
| fn normalize_title_token(token: &str) -> (Vec<String>, Vec<String>) { | |
| let pieces = split_generated_token(token); | |
| let labels = pieces | |
| .iter() | |
| .map(|piece| { | |
| if is_standalone_separator(piece) { | |
| "O".to_string() | |
| } else if CJK_SEASON_TOKEN_RE.is_match(piece) { | |
| "B-SEASON".to_string() | |
| } else { | |
| "B-TITLE".to_string() | |
| } | |
| }) | |
| .collect(); | |
| (pieces, labels) | |
| } | |
/// Breaks `token` into maximal alphanumeric runs and single-character
/// separator pieces.
///
/// Every character that is whitespace or not alphanumeric becomes a piece of
/// its own; consecutive alphanumeric characters are kept together. The
/// pieces concatenate back to the input, and an empty input yields no pieces.
fn split_generated_token(token: &str) -> Vec<String> {
    let mut pieces: Vec<String> = Vec::new();
    // True while the last piece is an alphanumeric run that may still grow.
    let mut run_open = false;
    for ch in token.chars() {
        let in_word = ch.is_alphanumeric() && !ch.is_whitespace();
        match pieces.last_mut() {
            Some(run) if in_word && run_open => run.push(ch),
            _ => pieces.push(ch.to_string()),
        }
        run_open = in_word;
    }
    pieces
}
/// Returns `true` when `token` is exactly one character long and that
/// character is whitespace or not alphanumeric.
fn is_standalone_separator(token: &str) -> bool {
    let mut chars = token.chars();
    match (chars.next(), chars.next()) {
        (Some(only), None) => only.is_whitespace() || !only.is_alphanumeric(),
        _ => false,
    }
}
| fn project_refined_tokens( | |
| tokens: &[String], | |
| groups: &[Group], | |
| roles: &[String], | |
| ) -> (Vec<String>, Vec<String>) { | |
| let mut output_tokens = Vec::new(); | |
| let mut output_labels = Vec::new(); | |
| for (group_index, group) in groups.iter().enumerate() { | |
| let mut role = roles.get(group_index).map(String::as_str).unwrap_or("O"); | |
| if matches!(group.class_name.as_str(), "SEP" | "PATH" | "EMPTY") { | |
| role = "O"; | |
| } | |
| for &index in &group.indices { | |
| let token = &tokens[index]; | |
| if matches!( | |
| role, | |
| "EPISODE" | |
| | "EPISODE_VERSION" | |
| | "EPISODE_RANGE" | |
| | "SOURCE" | |
| | "RESOLUTION" | |
| | "SEASON" | |
| ) { | |
| if role == "SEASON" { | |
| if let Some((pieces, labels)) = split_season_token(token) { | |
| output_tokens.extend(pieces); | |
| output_labels.extend(labels); | |
| continue; | |
| } | |
| } | |
| if matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE") { | |
| if let Some((pieces, labels)) = split_sxe_token(&strip_wrapper(token)) { | |
| output_tokens.extend(pieces); | |
| output_labels.extend(labels); | |
| continue; | |
| } | |
| if let Some((pieces, labels)) = split_episode_token(&strip_wrapper(token)) { | |
| output_tokens.extend(pieces); | |
| output_labels.extend(labels); | |
| continue; | |
| } | |
| } | |
| for piece in split_refined_token(token) { | |
| if matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE") { | |
| if let Some((pieces, labels)) = split_season_token(&piece) { | |
| output_tokens.extend(pieces); | |
| output_labels.extend(labels); | |
| continue; | |
| } | |
| if let Some((pieces, labels)) = split_episode_token(&piece) { | |
| output_tokens.extend(pieces); | |
| output_labels.extend(labels); | |
| continue; | |
| } | |
| } | |
| let label = label_for_refined_piece(&piece, role, &group.class_name); | |
| let (pieces, labels) = normalize_generated_tokens(&[piece], &[label]); | |
| output_tokens.extend(pieces); | |
| output_labels.extend(labels); | |
| } | |
| } else { | |
| if role == "TITLE" && matches!(token.as_str(), "第" | "話" | "话" | "回" | "集") | |
| { | |
| output_tokens.push(token.clone()); | |
| output_labels.push("O".to_string()); | |
| continue; | |
| } | |
| if role == "TITLE" && token.ends_with('第') && token.chars().count() > 1 { | |
| let trimmed = token.trim_end_matches('第').to_string(); | |
| let (pieces, labels) = normalize_generated_tokens( | |
| &[trimmed, "第".to_string()], | |
| &["B-TITLE".to_string(), "O".to_string()], | |
| ); | |
| output_tokens.extend(pieces); | |
| output_labels.extend(labels); | |
| continue; | |
| } | |
| if role == "TITLE" { | |
| let (pieces, labels) = normalize_title_token(token); | |
| output_tokens.extend(pieces); | |
| output_labels.extend(labels); | |
| continue; | |
| } | |
| let (pieces, labels) = | |
| normalize_generated_tokens(&[token.clone()], &[role_label(role)]); | |
| output_tokens.extend(pieces); | |
| output_labels.extend(labels); | |
| } | |
| } | |
| } | |
| (output_tokens, output_labels) | |
| } | |
/// Re-labels `"O"` punctuation that sits inside a title or group span.
///
/// For each token labelled `"O"` that appears in `entity_joiners`:
/// * the nearest neighbour on each side is located by skipping tokens that
///   are in `joiners` and labelled `"O"`; if both neighbours exist and carry
///   the same label, and that label is `"B-TITLE"` or `"B-GROUP"`, the token
///   takes that label. The left neighbour is read from the already-smoothed
///   output, the right neighbour from the input labels;
/// * independently, `!`/`?` directly after a `"B-TITLE"` token becomes
///   `"B-TITLE"`.
///
/// Returns a new label vector of the same length as `labels`.
fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
    let joiners = [
        " ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?",
        "?", ";", ";", ",", ",", "~", "~", "-", "(", ")", "(", ")", "[", "]", "【",
        "】", "「", "」", "「", "」", "☆", "@",
    ];
    let title_terminal_punctuation = ["!", "!", "?", "?"];
    let entity_joiners = [
        " ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?",
        "?", ";", ";", ",", ",", "~", "~", "-", "(", ")", "(", ")", "[", "]", "【",
        "】", "「", "」", "「", "」", "☆", "@", "&", "&",
    ];

    // A position can be skipped over while looking for a labelled neighbour.
    let is_open_joiner =
        |at: usize| joiners.contains(&tokens[at].as_str()) && labels[at] == "O";

    let mut smoothed = labels.to_vec();
    for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
        let smoothable = label == "O" && entity_joiners.contains(&token.as_str());
        if !smoothable {
            continue;
        }

        let left_neighbour = (0..index).rev().find(|&at| !is_open_joiner(at));
        let right_neighbour = (index + 1..tokens.len()).find(|&at| !is_open_joiner(at));

        if let (Some(left), Some(right)) = (left_neighbour, right_neighbour) {
            let bridges_entity = smoothed[left] == labels[right]
                && matches!(smoothed[left].as_str(), "B-TITLE" | "B-GROUP");
            if bridges_entity {
                smoothed[index] = smoothed[left].clone();
            }
        }

        if index > 0
            && title_terminal_punctuation.contains(&token.as_str())
            && smoothed[index - 1] == "B-TITLE"
        {
            smoothed[index] = "B-TITLE".to_string();
        }
    }
    smoothed
}
| fn dmhy_record(filename: &str, template_id: &str, roles: &[String]) -> Option<Record> { | |
| let (key, tokens, _classes, groups) = template_key_for_filename(filename); | |
| if groups.len() != roles.len() { | |
| return None; | |
| } | |
| let roles = adjust_contextual_roles(&tokens, &groups, roles); | |
| let (roles, dropped) = enforce_single_title_candidate(&groups, &roles); | |
| let (tokens, labels) = project_refined_tokens(&tokens, &groups, &roles); | |
| let labels = smooth_title_spans(&tokens, &labels); | |
| if tokens.len() != labels.len() { | |
| return None; | |
| } | |
| Some(Record { | |
| filename: filename.to_string(), | |
| tokens, | |
| labels, | |
| template_id: template_id.to_string(), | |
| template: key, | |
| source_filename: None, | |
| path_trimmed: None, | |
| dropped_title_candidate_positions: if dropped.is_empty() { | |
| None | |
| } else { | |
| Some(dropped) | |
| }, | |
| }) | |
| } | |
// Regression tests for the labelling pipeline. `#[cfg(test)]` keeps the
// module out of normal builds, and each case carries `#[test]` so that the
// test harness actually discovers and runs it.
#[cfg(test)]
mod tests {
    use super::*;

    /// Labels `filename` end to end: derives its template key, asks
    /// `suggested_roles` for the roles, builds the record with `dmhy_record`
    /// and returns the `(token, label)` pairs. Panics if no record is
    /// produced.
    fn labels_for(filename: &str) -> Vec<(String, String)> {
        let (key, _, _, _) = template_key_for_filename(filename);
        let roles = suggested_roles(&key);
        let record = dmhy_record(filename, "tpl_test", &roles).unwrap();
        record.tokens.into_iter().zip(record.labels).collect()
    }

    /// Pins the labels of individual tokens for a set of known-tricky
    /// filenames (numeric titles, SxE forms, punctuation inside titles,
    /// specials, hashes, CJK episode markers).
    #[test]
    fn required_regressions() {
        let title_91 = labels_for("Title 91 EP 01 [1080p]");
        assert!(title_91.contains(&("91".to_string(), "B-TITLE".to_string())));
        assert!(title_91.contains(&("EP".to_string(), "O".to_string())));
        assert!(title_91.contains(&("01".to_string(), "B-EPISODE".to_string())));
        let event = labels_for("[HYSUB]Dragon Ball Super Broly[Theater Greeting Event][1080P]");
        assert!(event.contains(&("Theater".to_string(), "B-SPECIAL".to_string())));
        assert!(!event.contains(&("Theater".to_string(), "B-TITLE".to_string())));
        let roman = labels_for("Chibi Maruko-chan I 001");
        assert!(roman.contains(&("I".to_string(), "B-SEASON".to_string())));
        assert!(roman.contains(&("001".to_string(), "B-EPISODE".to_string())));
        let dxd = labels_for("High School D×D");
        assert!(dxd.contains(&("×".to_string(), "B-TITLE".to_string())));
        let colon_title = labels_for("Megumi no Daigo:Kyuukoku no Orange 06");
        assert!(colon_title.contains(&(":".to_string(), "B-TITLE".to_string())));
        let sxe = labels_for("S01E02");
        assert_eq!(
            sxe,
            vec![
                ("S".to_string(), "O".to_string()),
                ("01".to_string(), "B-SEASON".to_string()),
                ("E".to_string(), "O".to_string()),
                ("02".to_string(), "B-EPISODE".to_string())
            ]
        );
        let ep_prefix = labels_for("Toradora! EP01 [BD 1080p]");
        assert!(ep_prefix.contains(&("EP".to_string(), "O".to_string())));
        assert!(ep_prefix.contains(&("01".to_string(), "B-EPISODE".to_string())));
        let bracket_sxe = labels_for("[FLsnow.feat.PO][Himitsu_no_Aipri][1080P][S2E01]");
        assert!(bracket_sxe.contains(&("2".to_string(), "B-SEASON".to_string())));
        assert!(bracket_sxe.contains(&("01".to_string(), "B-EPISODE".to_string())));
        let cursed = labels_for("[Coalgirls]_C3-Cube_x_Cursed_x_Curious_01_[8E416230]");
        assert!(cursed.contains(&("x".to_string(), "B-TITLE".to_string())));
        assert!(!cursed.contains(&("x".to_string(), "B-SEASON".to_string())));
        let beyblade = labels_for("[jibaketa]Beyblade X - 118 (WEB 1920x1080 AVC AAC)");
        assert!(beyblade.contains(&("X".to_string(), "B-TITLE".to_string())));
        assert!(!beyblade.contains(&("X".to_string(), "B-SEASON".to_string())));
        let bang_title = labels_for("[Dymy][Gugure! Kokkuri-san][06][BIG5][1280X720]");
        assert!(bang_title.contains(&("!".to_string(), "B-TITLE".to_string())));
        let pso2 = labels_for("[Lilith-Raws] Phantasy Star Online 2 Episode Oracle - 01 [1080p]");
        assert!(pso2.contains(&("2".to_string(), "B-TITLE".to_string())));
        assert!(pso2.contains(&("Episode".to_string(), "B-TITLE".to_string())));
        assert!(pso2.contains(&("Oracle".to_string(), "B-TITLE".to_string())));
        assert!(pso2.contains(&("01".to_string(), "B-EPISODE".to_string())));
        let aikatsu = labels_for("Aikatsu Friends! - S2E01 (BD 1920x1080 x264 FLAC)");
        assert!(aikatsu.contains(&("!".to_string(), "B-TITLE".to_string())));
        let intro = labels_for("[VCB-Studio] LoveLive! µ's Live Collection [01][intro][1080p]");
        assert!(intro.contains(&("intro".to_string(), "B-SPECIAL".to_string())));
        let hash = labels_for("[Group][Title][01][1080p][00270AC8]");
        assert!(hash.contains(&("00270AC8".to_string(), "O".to_string())));
        let yamato = labels_for("[1995.01] YAMATO2520 Vol.1 明日への希望-0001");
        assert!(yamato.contains(&("YAMATO2520".to_string(), "B-TITLE".to_string())));
        assert!(yamato.contains(&("明日への希望".to_string(), "B-TITLE".to_string())));
        let ubw = labels_for("Fate/stay night [Unlimited Blade Works] #00 「プロローグ」");
        assert!(ubw.contains(&("Unlimited".to_string(), "B-TITLE".to_string())));
        assert!(!ubw.contains(&("Unlimited".to_string(), "B-GROUP".to_string())));
        let alias_title = labels_for("[Koten_Gars] Tegami Bachi; Letter Bee - 01 [1080p]");
        assert!(alias_title.contains(&(";".to_string(), "B-TITLE".to_string())));
        let comma_title =
            labels_for("[Studio GreenTea] Kamiina Botan, Yoeru Sugata wa Yuri no Hana [01]");
        assert!(comma_title.contains(&(",".to_string(), "B-TITLE".to_string())));
        let happy_lesson = labels_for("【DVD】 HAPPY☆LESSON THE TV 第01話");
        assert!(happy_lesson.contains(&("☆".to_string(), "B-TITLE".to_string())));
        let idolmaster = labels_for("[CASO&SumiSora][THE_IDOLM@STER_CINDERELLA_GIRLS][07.5_SP]");
        assert!(idolmaster.contains(&("@".to_string(), "B-TITLE".to_string())));
        let soul_taker = labels_for("[AI-Raws] THE SOUL TAKER~魂狩~ #01 (HEVC 1312x720)");
        assert!(soul_taker.contains(&("~".to_string(), "B-TITLE".to_string())));
        let mayoi = labels_for("[Snow-Raws] 迷家[マヨイガ] 第01話");
        assert!(mayoi.contains(&("迷家".to_string(), "B-TITLE".to_string())));
        assert!(mayoi.contains(&("マヨイガ".to_string(), "B-TITLE".to_string())));
        let conan_time = labels_for("【蓝色狂想】名侦探柯南TV版第036集-周一晚上7点半杀人事件");
        assert!(conan_time.contains(&("036".to_string(), "B-EPISODE".to_string())));
        assert!(!conan_time.contains(&("7".to_string(), "B-EPISODE".to_string())));
        let zom =
            labels_for("[Nekomoe kissaten&VCB-Studio] Zom 100 [Animatics02][Ma10p_1080p][x265]");
        assert!(zom.contains(&("100".to_string(), "B-TITLE".to_string())));
        assert!(!zom.contains(&("100".to_string(), "B-EPISODE".to_string())));
        assert!(zom.contains(&("Animatics02".to_string(), "B-SPECIAL".to_string())));
        let sky = labels_for("[Skytree][海贼王][One_Piece][918][GB_JP][1080P]");
        assert!(sky.contains(&("One".to_string(), "B-TITLE".to_string())));
        assert!(!sky.contains(&("海贼王".to_string(), "B-TITLE".to_string())));
        assert!(sky.contains(&("918".to_string(), "B-EPISODE".to_string())));
    }

    /// Pins path trimming (`training_filename_for`), template keys and
    /// token labels for filenames that include parent directories, season
    /// folders, volumes and media blocks.
    #[test]
    fn updated_python_alignment_regressions() {
        let original = "The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p [Hurtom]/Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p";
        let (trimmed, was_trimmed) = training_filename_for(original);
        assert!(was_trimmed);
        assert_eq!(
            trimmed,
            "Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p"
        );
        let pokemon = "Pokémon Season 2 - Orange League [Ep. 83-118]/Pokemon - 084 - OL002 - A Scare in the Air [DVD][PM-Dragon-x264-AC3][00270AC8]";
        let (trimmed_pokemon, pokemon_was_trimmed) = training_filename_for(pokemon);
        assert!(pokemon_was_trimmed);
        assert_eq!(
            trimmed_pokemon,
            "Pokemon - 084 - OL002 - A Scare in the Air [DVD][PM-Dragon-x264-AC3][00270AC8]"
        );
        let woody = labels_for(&trimmed);
        assert!(woody.contains(&("4".to_string(), "B-SEASON".to_string())));
        assert!(woody.contains(&("E".to_string(), "O".to_string())));
        assert!(woody.contains(&("07".to_string(), "B-EPISODE".to_string())));
        assert!(woody.contains(&("The".to_string(), "B-TITLE".to_string())));
        assert!(woody.contains(&("Show".to_string(), "B-TITLE".to_string())));
        assert!(!woody.contains(&("1999".to_string(), "B-EPISODE".to_string())));
        let group = labels_for("[DBD-Raws][Title][01][1080P]");
        assert!(group.contains(&("-".to_string(), "B-GROUP".to_string())));
        let amp_group = labels_for("[SumiSora&CASO][Title][01][1080P]");
        assert!(amp_group.contains(&("&".to_string(), "B-GROUP".to_string())));
        let cjk_season =
            labels_for("[DBD-Raws][魔道祖师 第一季][08][1080P][BDRip][HEVC-10bit][FLAC]");
        assert!(cjk_season.contains(&("魔道祖师".to_string(), "B-TITLE".to_string())));
        assert!(cjk_season.contains(&("第一季".to_string(), "B-SEASON".to_string())));
        assert!(!cjk_season.contains(&("第一季".to_string(), "B-TITLE".to_string())));
        let (trimmed, was_trimmed) =
            training_filename_for("12/小剧场/[LKSUB][KAGE-JITSU!][01][GB][720P]");
        assert!(was_trimmed);
        assert_eq!(trimmed, "[LKSUB][KAGE-JITSU!][01][GB][720P]");
        let (key, _, _, _) = template_key_for_filename(&trimmed);
        assert_eq!(
            key,
            "BRACKET_TEXT BRACKET_TEXT BRACKET_EPISODE BRACKET_LANG BRACKET_RESOLUTION"
        );
        let short = labels_for("[Snow-Raws] R-15 CM&PV12 (BD 1920x1080 HEVC-YUV420P10 FLAC)");
        assert!(short.contains(&("R".to_string(), "B-TITLE".to_string())));
        assert!(short.contains(&("-".to_string(), "B-TITLE".to_string())));
        assert!(short.contains(&("15".to_string(), "B-TITLE".to_string())));
        assert!(!short.contains(&("15".to_string(), "B-EPISODE".to_string())));
        let short_before_episode =
            labels_for("[Snow-Raws] R-15 第01話 (BD 1920x1080 HEVC-YUV420P10 FLAC)");
        assert!(short_before_episode.contains(&("R".to_string(), "B-TITLE".to_string())));
        assert!(short_before_episode.contains(&("-".to_string(), "B-TITLE".to_string())));
        assert!(short_before_episode.contains(&("15".to_string(), "B-TITLE".to_string())));
        assert!(short_before_episode.contains(&("01".to_string(), "B-EPISODE".to_string())));
        assert!(!short_before_episode.contains(&("15".to_string(), "B-EPISODE".to_string())));
        let avatar = "Avatar The Last Airbender S2/Avatar The Last Airbender S2 14 [1080p]";
        let (trimmed, was_trimmed) = training_filename_for(avatar);
        assert!(was_trimmed);
        assert_eq!(trimmed, "Avatar The Last Airbender S2 14 [1080p]");
        let tintin = "Adventures of Tintin (1991) Season 1-3 S01-S03 (1080p BluRay x265 HEVC 10bit EAC3 2.0 Garshasp)/Season 1/Adventures of Tintin (1991) - S01E06 - Cigars of the Pharaoh (Part 1) (1080p BluRay x265 Garshasp)";
        let (trimmed, was_trimmed) = training_filename_for(tintin);
        assert!(was_trimmed);
        assert_eq!(
            trimmed,
            "Adventures of Tintin (1991) - S01E06 - Cigars of the Pharaoh (Part 1) (1080p BluRay x265 Garshasp)"
        );
        let (key, _, _, _) = template_key_for_filename(&trimmed);
        assert_eq!(
            key,
            "TEXT SEP TEXT SEP TEXT SEP BRACKET_DATE SEP SXE SEP TEXT SEP TEXT SEP TEXT SEP TEXT SEP BRACKET_TEXT SEP BRACKET_TEXT"
        );
        let bocchi = "Bocchi the Rock S01 孤獨搖滾!第一季 [Taiwanese Hokkien Dub][臺灣閩南語配音]/Bocchi the Rock S01 孤獨搖滾!第一季 [Taiwanese Hokkien Dub][Hàn-jī Hardsub][臺灣閩南語配音][漢字字幕]/Bocchi the Rock! 孤獨搖滾!S01E01「孤獨反輾轉」";
        let (leaf_key, _, _, _) =
            template_key_for_filename("Bocchi the Rock! 孤獨搖滾!S01E01「孤獨反輾轉」");
        assert_eq!(leaf_key, "TEXT SEP TEXT SEP TEXT SEP TEXT SXE TEXT");
        assert!(filename_has_title(
            "Bocchi the Rock! 孤獨搖滾!S01E01「孤獨反輾轉」"
        ));
        let (trimmed, was_trimmed) = training_filename_for(bocchi);
        assert!(was_trimmed);
        assert_eq!(trimmed, "Bocchi the Rock! 孤獨搖滾!S01E01「孤獨反輾轉」");
        let (key, _, _, _) = template_key_for_filename(&trimmed);
        assert_eq!(key, "TEXT SEP TEXT SEP TEXT SEP TEXT SXE TEXT");
        let usagi = "Gochuumon wa Usagi Desuka-60fps/Gochuumon wa Usagi Desuka S1/Usagi S1[01][60fps][8bit_1080p][x265_flac]";
        let (trimmed, was_trimmed) = training_filename_for(usagi);
        assert!(was_trimmed);
        assert_eq!(trimmed, "Usagi S1[01][60fps][8bit_1080p][x265_flac]");
        let (key, _, _, _) = template_key_for_filename(&trimmed);
        assert_eq!(
            key,
            "TEXT SEP SEASON BRACKET_EPISODE BRACKET_TEXT BRACKET_MEDIA_BLOCK BRACKET_MEDIA"
        );
        let woody_parent =
            "Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p";
        let (trimmed, was_trimmed) = training_filename_for(&format!("Batch/{woody_parent}"));
        assert!(was_trimmed);
        assert_eq!(trimmed, woody_parent);
        let volume =
            labels_for("[Snow-Raws] 生徒会役員共 Vol.01 MENU02 (BD 1920x1080 HEVC-YUV420P10 FLAC)");
        assert!(volume.contains(&("生徒会役員共".to_string(), "B-TITLE".to_string())));
        assert!(volume.contains(&("Vol".to_string(), "B-SPECIAL".to_string())));
        assert!(volume.contains(&("01".to_string(), "B-SPECIAL".to_string())));
        assert!(volume.contains(&("MENU02".to_string(), "B-SPECIAL".to_string())));
        assert!(!volume.contains(&("01".to_string(), "B-EPISODE".to_string())));
        let numeric_title =
            labels_for("3000.Leagues.in.Search.of.Mother.S01E01.1080p.WEB-DL.H.264-D00oo00M");
        assert!(numeric_title.contains(&("3000".to_string(), "B-TITLE".to_string())));
        assert!(numeric_title.contains(&("01".to_string(), "B-SEASON".to_string())));
        assert!(numeric_title.contains(&("01".to_string(), "B-EPISODE".to_string())));
        assert!(numeric_title.contains(&("1080p".to_string(), "B-RESOLUTION".to_string())));
        assert!(numeric_title.contains(&("H".to_string(), "B-SOURCE".to_string())));
        assert!(numeric_title.contains(&("264".to_string(), "B-SOURCE".to_string())));
        assert!(!numeric_title.contains(&("264".to_string(), "B-EPISODE".to_string())));
        let media_block =
            labels_for("[Kamigami] Kantai Collection - 06v2 [1920×1080 x264 AAC Sub(Chs,Cht,Jap)]");
        assert!(media_block.contains(&("1920".to_string(), "B-RESOLUTION".to_string())));
        assert!(media_block.contains(&("1080".to_string(), "B-RESOLUTION".to_string())));
        assert!(media_block.contains(&("x264".to_string(), "B-SOURCE".to_string())));
        assert!(media_block.contains(&("Chs".to_string(), "B-SOURCE".to_string())));
    }
}