ModerRAS committed on
Commit
59042eb
·
1 Parent(s): 93f322e

Improve DMHY template labeling pipeline

Browse files
datasets/AnimeName CHANGED
@@ -1 +1 @@
1
- Subproject commit ab3fbcad1a4bf889090d050248130c7d763c457e
 
1
+ Subproject commit 081fd450aafd59992f2df794c5b0110dc3cdd42b
tools/rust_dmhy_template_apply/src/main.rs CHANGED
@@ -1,7 +1,7 @@
1
  use anyhow::{bail, Context, Result};
2
  use chrono::Utc;
3
  use clap::Parser;
4
- use once_cell::sync::Lazy;
5
  use rayon::prelude::*;
6
  use regex::Regex;
7
  use serde::{Deserialize, Serialize};
@@ -21,6 +21,8 @@ struct Args {
21
  audit_low_frequency: bool,
22
  #[arg(long)]
23
  verify_generated_output: bool,
 
 
24
  #[arg(long, default_value = "datasets/AnimeName/dmhy_list.jsonl")]
25
  input: PathBuf,
26
  #[arg(long, default_value = "reports/dmhy_template_recipes.seed.jsonl")]
@@ -53,6 +55,8 @@ struct Args {
53
  review_output: PathBuf,
54
  #[arg(long, default_value = "reports/dmhy_low_frequency_audit.rust.jsonl")]
55
  audit_output: PathBuf,
 
 
56
  #[arg(long, default_value_t = 50)]
57
  audit_max_count: u64,
58
  #[arg(long)]
@@ -81,10 +85,22 @@ struct Args {
81
  keep_encoding_noise: bool,
82
  #[arg(long)]
83
  preserve_parent_paths: bool,
 
 
 
 
84
  #[arg(long)]
85
  threads: Option<usize>,
86
  }
87
 
 
 
 
 
 
 
 
 
88
  #[derive(Debug, Clone, Deserialize)]
89
  struct Recipe {
90
  template_id: String,
@@ -151,11 +167,20 @@ enum Processed {
151
  static HASH_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[A-Fa-f0-9]{8,}$").unwrap());
152
  static RESOLUTION_RE: Lazy<Regex> =
153
  Lazy::new(|| Regex::new(r"(?i)^(?:\d{3,4}p|\dK|\d{3,4}[xX×]\d{3,4})$").unwrap());
 
 
154
  static EPISODE_VERSION_RE: Lazy<Regex> =
155
  Lazy::new(|| Regex::new(r"(?i)^(?:EP?)?\d{1,4}(?:v|ver|version|rev)\d{1,3}$").unwrap());
 
 
 
156
  static EPISODE_RE: Lazy<Regex> =
157
- Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}(?:END)?$").unwrap());
 
 
158
  static EPISODE_CJK_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^第?\d{1,4}[话話回集]$").unwrap());
 
 
159
  static EPISODE_RANGE_RE: Lazy<Regex> =
160
  Lazy::new(|| Regex::new(r"(?i)^\d{1,4}\s*[-~]\s*\d{1,4}(?:\s*END)?$").unwrap());
161
  static EPISODE_BATCH_RE: Lazy<Regex> = Lazy::new(|| {
@@ -173,7 +198,7 @@ static CJK_SEASON_TOKEN_RE: Lazy<Regex> =
173
  Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
174
  static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
175
  static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
176
- Regex::new(r"(?i)^(?:NCOP|NCED|OP|ED|PV|CM|SP|OVA|OAD|IV|Menu|Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?$").unwrap()
177
  });
178
  static VOLUME_RE: Lazy<Regex> =
179
  Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
@@ -183,7 +208,7 @@ static LANG_RE: Lazy<Regex> = Lazy::new(|| {
183
  Regex::new(r"(?i)^(?:CHS|CHT|ZHS|ZHT|GB|BIG5|JPN?|JP|JA|JAP|ENG|EN|SC|TC|简[体體]?|繁[体體]?|简日|繁日|字幕|内封|外挂|Sub|Subs|MSubs?)$").unwrap()
184
  });
185
  static MEDIA_RE: Lazy<Regex> = Lazy::new(|| {
186
- Regex::new(r"(?i)^(?:WEB[-_. ]?DL|WEB[-_. ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|HDTV|TVRip|REMUX|x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|FLAC|MP3|DTS|DTS-HDMA|AC3|Opus|10[-_. ]?bit|8[-_. ]?bit|Hi10p|Ma10p|ASSx?\d*|SRTx?\d*|R\d[A-Z]*|NoSub|MKV|MP4|AVI|RAW|Raws?)$").unwrap()
187
  });
188
  static SPECIAL_TITLE_PHRASE_RE: Lazy<Regex> = Lazy::new(|| {
189
  Regex::new(r"(?i)\b(?:theater\s+greeting\s+event|world\s+prem(?:eie|iere)|picture\s+drama)\b")
@@ -191,6 +216,8 @@ static SPECIAL_TITLE_PHRASE_RE: Lazy<Regex> = Lazy::new(|| {
191
  });
192
  static YEAR_RANGE_RE: Lazy<Regex> =
193
  Lazy::new(|| Regex::new(r"^\(?\s*(?:19|20)\d{2}\s*[-~]\s*(?:19|20)\d{2}\s*\)?$").unwrap());
 
 
194
  static PATH_SEGMENT_SEASON_RE: Lazy<Regex> = Lazy::new(|| {
195
  Regex::new(r"(?i)(?:^|[\s_.\-/])(?:season\s*\d{1,2}|s\d{1,2})(?:$|[\s_.\-/])").unwrap()
196
  });
@@ -206,6 +233,7 @@ static SXE_SEASON_RE: Lazy<Regex> = Lazy::new(|| {
206
  static TOKEN_REGEXES: Lazy<Vec<Regex>> = Lazy::new(|| {
207
  [
208
  r"^\d{3,4}[xX×]\d{3,4}",
 
209
  r"(?i)^h\.?26[45]",
210
  r"(?i)^x\.?26[45]",
211
  r"^[\\/]+",
@@ -233,6 +261,7 @@ fn main() -> Result<()> {
233
  .build_global()
234
  .context("failed to configure rayon thread pool")?;
235
  }
 
236
  if args.cluster {
237
  return run_cluster(&args);
238
  }
@@ -242,6 +271,9 @@ fn main() -> Result<()> {
242
  if args.verify_generated_output {
243
  return run_verify_generated_output(&args);
244
  }
 
 
 
245
  if args.expand != "all" && args.expand != "sample" {
246
  bail!("--expand must be all or sample");
247
  }
@@ -334,6 +366,7 @@ fn main() -> Result<()> {
334
  "min_count": args.min_count,
335
  "low_frequency_audit_max_count": args.audit_max_count,
336
  "low_frequency_blocking_warnings": [
 
337
  "hash_labeled",
338
  "multiple_title_spans",
339
  "no_title",
@@ -355,6 +388,57 @@ fn main() -> Result<()> {
355
  Ok(())
356
  }
357
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358
  fn load_recipes(args: &Args) -> Result<HashMap<String, Recipe>> {
359
  let file = File::open(&args.recipes)
360
  .with_context(|| format!("recipe JSONL not found: {}", args.recipes.display()))?;
@@ -745,7 +829,11 @@ fn run_verify_generated_output(args: &Args) -> Result<()> {
745
  for warning in audit_warnings(&record) {
746
  if !matches!(
747
  warning.as_str(),
748
- "hash_labeled" | "multiple_title_spans" | "no_title" | "path_retained"
 
 
 
 
749
  ) {
750
  continue;
751
  }
@@ -780,6 +868,204 @@ fn run_verify_generated_output(args: &Args) -> Result<()> {
780
  Ok(())
781
  }
782
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
783
  fn entity_spans(tokens: &[String], labels: &[String]) -> Vec<Value> {
784
  let mut spans = Vec::new();
785
  let mut current_label: Option<String> = None;
@@ -820,8 +1106,16 @@ fn audit_warnings(record: &Record) -> Vec<String> {
820
  } else if title_spans > 1 {
821
  warnings.push("multiple_title_spans".to_string());
822
  }
823
- if !record.labels.iter().any(|label| label.ends_with("EPISODE")) {
 
824
  warnings.push("no_episode".to_string());
 
 
 
 
 
 
 
825
  }
826
  if record.filename.contains('/') || record.filename.contains('\\') {
827
  warnings.push("path_retained".to_string());
@@ -927,7 +1221,11 @@ fn has_blocking_low_frequency_warning(record: &Record) -> bool {
927
  audit_warnings(record).iter().any(|warning| {
928
  matches!(
929
  warning.as_str(),
930
- "hash_labeled" | "multiple_title_spans" | "no_title" | "path_retained"
 
 
 
 
931
  )
932
  })
933
  }
@@ -1046,6 +1344,9 @@ fn classify_atom(text: &str) -> String {
1046
  if EPISODE_VERSION_RE.is_match(&compact) {
1047
  return "EPISODE_VERSION".to_string();
1048
  }
 
 
 
1049
  if SXE_RE.is_match(&compact) {
1050
  return "SXE".to_string();
1051
  }
@@ -1321,8 +1622,33 @@ fn training_filename_for(original: &str) -> (String, bool) {
1321
  .map(str::trim)
1322
  .filter(|part| !part.is_empty())
1323
  .collect();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1324
  if parts.len() >= 2 && filename_has_title(parts[parts.len() - 1]) {
1325
- if parts.len() >= 3 && path_segment_has_season(parts[parts.len() - 2]) {
1326
  if !path_segment_is_plain_season(parts[parts.len() - 2]) {
1327
  return (parts[parts.len() - 1].to_string(), true);
1328
  }
@@ -1334,7 +1660,14 @@ fn training_filename_for(original: &str) -> (String, bool) {
1334
  {
1335
  (parts[parts.len() - 1].to_string(), true)
1336
  } else {
1337
- (parts[parts.len() - 2..].join("/"), true)
 
 
 
 
 
 
 
1338
  }
1339
  } else {
1340
  (parts[parts.len() - 1].to_string(), true)
@@ -1349,6 +1682,43 @@ fn path_segment_is_plain_season(segment: &str) -> bool {
1349
  PLAIN_SEASON_SEGMENT_RE.is_match(&cleaned)
1350
  }
1351
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1352
  fn path_segment_has_season(value: &str) -> bool {
1353
  PATH_SEGMENT_SEASON_RE.is_match(value)
1354
  }
@@ -1368,7 +1738,9 @@ fn has_encoding_noise(value: &str) -> bool {
1368
  return true;
1369
  }
1370
  let markers = [
1371
- "譁", "蜈", "螟", "蟄", "謇", "邱", "荳", "縺", "繧", "莨", "鬆", "髯",
 
 
1372
  ];
1373
  let marker_hits = markers
1374
  .iter()
@@ -1403,7 +1775,83 @@ fn path_segment_is_episodeish(value: &str) -> bool {
1403
  !structural.is_empty()
1404
  && structural
1405
  .iter()
1406
- .all(|item| item.starts_with("EPISODE") || item.as_str() == "SPECIAL")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1407
  }
1408
 
1409
  fn has_abstract_path_noise(value: &str) -> bool {
@@ -1642,6 +2090,11 @@ fn split_sxe_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
1642
  }
1643
 
1644
  fn split_episode_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
 
 
 
 
 
1645
  let caps = EPISODE_VALUE_RE.captures(token)?;
1646
  let mut pieces = vec![caps[1].to_string(), caps[2].to_string()];
1647
  let mut labels = vec!["O".to_string(), "B-EPISODE".to_string()];
@@ -1672,6 +2125,29 @@ fn group_text(tokens: &[String], group: &Group) -> String {
1672
  )
1673
  }
1674
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1675
  fn is_special_title_phrase(text: &str) -> bool {
1676
  let normalized = SPECIAL_SPACE_RE
1677
  .replace_all(text, " ")
@@ -1681,6 +2157,8 @@ fn is_special_title_phrase(text: &str) -> bool {
1681
  normalized.as_str(),
1682
  "CM" | "EVENT"
1683
  | "EIZOU"
 
 
1684
  | "LOGO"
1685
  | "MENU"
1686
  | "OMAKE"
@@ -1690,13 +2168,123 @@ fn is_special_title_phrase(text: &str) -> bool {
1690
  | "TOKUTEN"
1691
  | "TRAILER"
1692
  | "WORLD PREMIERE"
1693
- ) || SPECIAL_TITLE_PHRASE_RE.is_match(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1694
  }
1695
 
1696
  fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]) -> Vec<String> {
1697
  let mut output = roles.to_vec();
1698
  let ep_markers = ["EP", "E", "Episode", "ep", "episode"];
1699
  let roman = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX"];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1700
  if !output.iter().any(|role| role == "TITLE")
1701
  && roles
1702
  .first()
@@ -1790,17 +2378,40 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
1790
  }
1791
  if output[index - 2] == "TITLE"
1792
  && groups[index - 1].class_name == "SEP"
1793
- && previous_text.len() <= 4
1794
- && previous_text.is_ascii()
1795
- && previous_text.chars().all(|ch| ch.is_ascii_alphabetic())
1796
  && text.chars().all(|ch| ch.is_ascii_digit())
1797
  && text.len() <= 3
1798
- && (next_special || next_episode)
 
 
 
 
 
 
 
 
 
 
1799
  {
1800
  output[index] = "TITLE".to_string();
1801
  continue;
1802
  }
1803
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1804
  if roles[index].starts_with("EPISODE")
1805
  && index >= 2
1806
  && output[..index].iter().any(|role| role == "TITLE")
@@ -1843,6 +2454,15 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
1843
  output[index] = "SPECIAL".to_string();
1844
  continue;
1845
  }
 
 
 
 
 
 
 
 
 
1846
  if roles[index] == "TITLE" && matches!(text.as_str(), "第" | "話" | "话" | "回" | "集")
1847
  {
1848
  output[index] = "O".to_string();
@@ -1870,8 +2490,10 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
1870
  && groups[index + 1].class_name == "SEP"
1871
  && roles[index + 2].starts_with("EPISODE")
1872
  {
1873
- output[index] = "O".to_string();
1874
- output[index + 2] = "SEASON".to_string();
 
 
1875
  continue;
1876
  }
1877
  if roles[index] == "TITLE"
@@ -1897,6 +2519,37 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
1897
  output[index + 2] = "O".to_string();
1898
  }
1899
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1900
  if roles[index].starts_with("EPISODE") {
1901
  let previous_text = if index >= 1 {
1902
  group_text(tokens, &groups[index - 1])
@@ -1959,6 +2612,7 @@ fn title_candidates(groups: &[Group], roles: &[String]) -> Vec<(usize, usize)> {
1959
  }
1960
 
1961
  fn enforce_single_title_candidate(
 
1962
  groups: &[Group],
1963
  roles: &[String],
1964
  ) -> (Vec<String>, Vec<String>) {
@@ -1981,13 +2635,20 @@ fn enforce_single_title_candidate(
1981
  .copied()
1982
  .filter(|(_, end)| *end <= first_anchor)
1983
  .collect();
1984
- let selected = (if before_anchor.is_empty() {
1985
  &candidates
1986
  } else {
1987
  &before_anchor
1988
- })
 
1989
  .iter()
1990
- .max_by_key(|(start, end)| (*end, end - start))
 
 
 
 
 
 
1991
  .copied()
1992
  .unwrap();
1993
  let mut output = roles.to_vec();
@@ -2006,6 +2667,33 @@ fn enforce_single_title_candidate(
2006
  (output, dropped)
2007
  }
2008
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2009
  fn normalize_generated_tokens(tokens: &[String], labels: &[String]) -> (Vec<String>, Vec<String>) {
2010
  let mut output_tokens = Vec::new();
2011
  let mut output_labels = Vec::new();
@@ -2162,14 +2850,16 @@ fn project_refined_tokens(
2162
  fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
2163
  let joiners = [
2164
  " ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?",
2165
- "?", ";", ";", ",", ",", "", "~", "", "(", ")", "", "", "[", "]", "【",
2166
- "】", "", "", "", "", "", "@",
 
2167
  ];
2168
  let title_terminal_punctuation = ["!", "!", "?", "?"];
2169
  let entity_joiners = [
2170
  " ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?",
2171
- "?", ";", ";", ",", ",", "", "~", "", "(", ")", "", "", "[", "]", "【",
2172
- "】", "", "", "", "", "", "@", "&", "",
 
2173
  ];
2174
  let mut output = labels.to_vec();
2175
  for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
@@ -2203,17 +2893,50 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
2203
  output[index] = "B-TITLE".to_string();
2204
  }
2205
  }
 
 
 
 
 
 
 
2206
  }
2207
  output
2208
  }
2209
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2210
  fn dmhy_record(filename: &str, template_id: &str, roles: &[String]) -> Option<Record> {
2211
  let (key, tokens, _classes, groups) = template_key_for_filename(filename);
2212
  if groups.len() != roles.len() {
2213
  return None;
2214
  }
2215
  let roles = adjust_contextual_roles(&tokens, &groups, roles);
2216
- let (roles, dropped) = enforce_single_title_candidate(&groups, &roles);
2217
  let (tokens, labels) = project_refined_tokens(&tokens, &groups, &roles);
2218
  let labels = smooth_title_spans(&tokens, &labels);
2219
  if tokens.len() != labels.len() {
@@ -2246,6 +2969,18 @@ mod tests {
2246
  record.tokens.into_iter().zip(record.labels).collect()
2247
  }
2248
 
 
 
 
 
 
 
 
 
 
 
 
 
2249
  #[test]
2250
  fn required_regressions() {
2251
  let title_91 = labels_for("Title 91 EP 01 [1080p]");
@@ -2313,6 +3048,30 @@ mod tests {
2313
  let comma_title =
2314
  labels_for("[Studio GreenTea] Kamiina Botan, Yoeru Sugata wa Yuri no Hana [01]");
2315
  assert!(comma_title.contains(&(",".to_string(), "B-TITLE".to_string())));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2316
  let happy_lesson = labels_for("【DVD】 HAPPY☆LESSON THE TV 第01話");
2317
  assert!(happy_lesson.contains(&("☆".to_string(), "B-TITLE".to_string())));
2318
  let idolmaster = labels_for("[CASO&SumiSora][THE_IDOLM@STER_CINDERELLA_GIRLS][07.5_SP]");
@@ -2322,6 +3081,7 @@ mod tests {
2322
  let mayoi = labels_for("[Snow-Raws] 迷家[マヨイガ] 第01話");
2323
  assert!(mayoi.contains(&("迷家".to_string(), "B-TITLE".to_string())));
2324
  assert!(mayoi.contains(&("マヨイガ".to_string(), "B-TITLE".to_string())));
 
2325
 
2326
  let conan_time = labels_for("【蓝色狂想】名侦探柯南TV版第036集-周一晚上7点半杀人事件");
2327
  assert!(conan_time.contains(&("036".to_string(), "B-EPISODE".to_string())));
@@ -2336,6 +3096,121 @@ mod tests {
2336
  assert!(sky.contains(&("One".to_string(), "B-TITLE".to_string())));
2337
  assert!(!sky.contains(&("海贼王".to_string(), "B-TITLE".to_string())));
2338
  assert!(sky.contains(&("918".to_string(), "B-EPISODE".to_string())));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2339
  }
2340
 
2341
  #[test]
@@ -2345,7 +3220,7 @@ mod tests {
2345
  assert!(was_trimmed);
2346
  assert_eq!(
2347
  trimmed,
2348
- "Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p"
2349
  );
2350
  let pokemon = "Pokémon Season 2 - Orange League [Ep. 83-118]/Pokemon - 084 - OL002 - A Scare in the Air [DVD][PM-Dragon-x264-AC3][00270AC8]";
2351
  let (trimmed_pokemon, pokemon_was_trimmed) = training_filename_for(pokemon);
@@ -2402,6 +3277,27 @@ mod tests {
2402
  assert!(was_trimmed);
2403
  assert_eq!(trimmed, "Avatar The Last Airbender S2 14 [1080p]");
2404
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2405
  let tintin = "Adventures of Tintin (1991) Season 1-3 S01-S03 (1080p BluRay x265 HEVC 10bit EAC3 2.0 Garshasp)/Season 1/Adventures of Tintin (1991) - S01E06 - Cigars of the Pharaoh (Part 1) (1080p BluRay x265 Garshasp)";
2406
  let (trimmed, was_trimmed) = training_filename_for(tintin);
2407
  assert!(was_trimmed);
@@ -2442,7 +3338,47 @@ mod tests {
2442
  "Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p";
2443
  let (trimmed, was_trimmed) = training_filename_for(&format!("Batch/{woody_parent}"));
2444
  assert!(was_trimmed);
2445
- assert_eq!(trimmed, woody_parent);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2446
 
2447
  let volume =
2448
  labels_for("[Snow-Raws] 生徒会役員共 Vol.01 MENU02 (BD 1920x1080 HEVC-YUV420P10 FLAC)");
 
1
  use anyhow::{bail, Context, Result};
2
  use chrono::Utc;
3
  use clap::Parser;
4
+ use once_cell::sync::{Lazy, OnceCell};
5
  use rayon::prelude::*;
6
  use regex::Regex;
7
  use serde::{Deserialize, Serialize};
 
21
  audit_low_frequency: bool,
22
  #[arg(long)]
23
  verify_generated_output: bool,
24
+ #[arg(long)]
25
+ rich_annotations: bool,
26
  #[arg(long, default_value = "datasets/AnimeName/dmhy_list.jsonl")]
27
  input: PathBuf,
28
  #[arg(long, default_value = "reports/dmhy_template_recipes.seed.jsonl")]
 
55
  review_output: PathBuf,
56
  #[arg(long, default_value = "reports/dmhy_low_frequency_audit.rust.jsonl")]
57
  audit_output: PathBuf,
58
+ #[arg(long, default_value = "reports/dmhy_rich_annotations.rust.jsonl")]
59
+ rich_output: PathBuf,
60
  #[arg(long, default_value_t = 50)]
61
  audit_max_count: u64,
62
  #[arg(long)]
 
85
  keep_encoding_noise: bool,
86
  #[arg(long)]
87
  preserve_parent_paths: bool,
88
+ #[arg(long, default_value = "datasets/AnimeName/dmhy_title_whitelist.txt")]
89
+ title_whitelist: PathBuf,
90
+ #[arg(long, default_value = "datasets/AnimeName/dmhy_group_whitelist.txt")]
91
+ group_whitelist: PathBuf,
92
  #[arg(long)]
93
  threads: Option<usize>,
94
  }
95
 
96
/// Whitelist data loaded from the `--title-whitelist` and `--group-whitelist` files.
/// `Default` yields empty collections.
+ #[derive(Debug, Default)]
97
+ struct Whitelists {
98
/// One entry per whitelist line, as produced by `phrase_parts_for_whitelist`; empty results are not stored.
+ title_phrases: Vec<Vec<String>>,
99
/// Whitelist lines after `normalize_whitelist_name`; empty results are not stored.
+ group_names: HashSet<String>,
100
+ }
101
+
102
/// Process-wide whitelist storage; `main` fills it once via `set` with the result of `load_whitelists`.
+ static RUNTIME_WHITELISTS: OnceCell<Whitelists> = OnceCell::new();
103
+
104
  #[derive(Debug, Clone, Deserialize)]
105
  struct Recipe {
106
  template_id: String,
 
167
  static HASH_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[A-Fa-f0-9]{8,}$").unwrap());
168
  static RESOLUTION_RE: Lazy<Regex> =
169
  Lazy::new(|| Regex::new(r"(?i)^(?:\d{3,4}p|\dK|\d{3,4}[xX×]\d{3,4})$").unwrap());
170
+ static BARE_RESOLUTION_RE: Lazy<Regex> =
171
+ Lazy::new(|| Regex::new(r"^(?:360|480|540|576|720|1080|2160)$").unwrap());
172
  static EPISODE_VERSION_RE: Lazy<Regex> =
173
  Lazy::new(|| Regex::new(r"(?i)^(?:EP?)?\d{1,4}(?:v|ver|version|rev)\d{1,3}$").unwrap());
174
+ static EPISODE_WITH_SUFFIX_RE: Lazy<Regex> = Lazy::new(|| {
175
+ Regex::new(r"(?i)^\d{1,4}[_ .-]?(?:Notice|Full|R18|R|Uncut|Director'?s?Cut)$").unwrap()
176
+ });
177
  static EPISODE_RE: Lazy<Regex> =
178
+ Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}(?:\.\d{1,2})?(?:END)?$").unwrap());
179
+ static DECIMAL_EPISODE_RE: Lazy<Regex> =
180
+ Lazy::new(|| Regex::new(r"^\d{1,3}\.\d{1,2}$").unwrap());
181
  static EPISODE_CJK_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^第?\d{1,4}[话話回集]$").unwrap());
182
+ static EPISODE_CJK_PREFIX_RE: Lazy<Regex> =
183
+ Lazy::new(|| Regex::new(r"^第?\d{1,4}[话話回集]").unwrap());
184
  static EPISODE_RANGE_RE: Lazy<Regex> =
185
  Lazy::new(|| Regex::new(r"(?i)^\d{1,4}\s*[-~]\s*\d{1,4}(?:\s*END)?$").unwrap());
186
  static EPISODE_BATCH_RE: Lazy<Regex> = Lazy::new(|| {
 
198
  Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
199
  static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
200
  static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
201
+ Regex::new(r"(?i)^(?:(?:NCOP|NCED|OP|ED|PV|CM)(?:[\s_.-]?(?:\d{1,4}|v\d{1,3}|[A-Z]))?|SP(?:[\s_.-]?\d{0,4})?|(?:OVA|OAD|IV)(?:[\s_.-]?\d{0,4})?|(?:Menu|Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?)$").unwrap()
202
  });
203
  static VOLUME_RE: Lazy<Regex> =
204
  Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
 
208
  Regex::new(r"(?i)^(?:CHS|CHT|ZHS|ZHT|GB|BIG5|JPN?|JP|JA|JAP|ENG|EN|SC|TC|简[体體]?|繁[体體]?|简日|繁日|字幕|内封|外挂|Sub|Subs|MSubs?)$").unwrap()
209
  });
210
  static MEDIA_RE: Lazy<Regex> = Lazy::new(|| {
211
+ Regex::new(r"(?i)^(?:WEB|WEB[-_. ]?DL|WEB[-_. ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|HD|UHD|HDTV|TVRip|REMUX|xvid|x26[45]|h\.?26[45]|HEVC|AVC|AV1|YUV\d+P?\d*|AAC\s*\d*(?:\.\d+)?|DDP\s*\d*(?:\.\d+)?|FLAC|MP3|DTS|HDMA|DTS-HDMA|E?AC3x?\d*(?:\.\d+)?|Opus|WMV\d*|\d(?:\.\d)?ch|10[-_. ]?bit|8[-_. ]?bit|Hi10p|Ma10p|ASSx?\d*|SRTx?\d*|SUP|R\d[A-Z]*|NoSub|MKV|MP4|AVI|RAW|Raws?)$").unwrap()
212
  });
213
  static SPECIAL_TITLE_PHRASE_RE: Lazy<Regex> = Lazy::new(|| {
214
  Regex::new(r"(?i)\b(?:theater\s+greeting\s+event|world\s+prem(?:eie|iere)|picture\s+drama)\b")
 
216
  });
217
  static YEAR_RANGE_RE: Lazy<Regex> =
218
  Lazy::new(|| Regex::new(r"^\(?\s*(?:19|20)\d{2}\s*[-~]\s*(?:19|20)\d{2}\s*\)?$").unwrap());
219
+ static VERSIONISH_TITLE_RE: Lazy<Regex> =
220
+ Lazy::new(|| Regex::new(r"(?i)^(?:19|20)\d{2}(?:版|ver\.?|version)?$").unwrap());
221
  static PATH_SEGMENT_SEASON_RE: Lazy<Regex> = Lazy::new(|| {
222
  Regex::new(r"(?i)(?:^|[\s_.\-/])(?:season\s*\d{1,2}|s\d{1,2})(?:$|[\s_.\-/])").unwrap()
223
  });
 
233
  static TOKEN_REGEXES: Lazy<Vec<Regex>> = Lazy::new(|| {
234
  [
235
  r"^\d{3,4}[xX×]\d{3,4}",
236
+ r"(?i)^(?:AAC|AC3|EAC3|DTS|FLAC|DDP)\s*\d+(?:\.\d+)?",
237
  r"(?i)^h\.?26[45]",
238
  r"(?i)^x\.?26[45]",
239
  r"^[\\/]+",
 
261
  .build_global()
262
  .context("failed to configure rayon thread pool")?;
263
  }
264
+ let _ = RUNTIME_WHITELISTS.set(load_whitelists(&args)?);
265
  if args.cluster {
266
  return run_cluster(&args);
267
  }
 
271
  if args.verify_generated_output {
272
  return run_verify_generated_output(&args);
273
  }
274
+ if args.rich_annotations {
275
+ return run_rich_annotations(&args);
276
+ }
277
  if args.expand != "all" && args.expand != "sample" {
278
  bail!("--expand must be all or sample");
279
  }
 
366
  "min_count": args.min_count,
367
  "low_frequency_audit_max_count": args.audit_max_count,
368
  "low_frequency_blocking_warnings": [
369
+ "ambiguous_no_episode_title",
370
  "hash_labeled",
371
  "multiple_title_spans",
372
  "no_title",
 
388
  Ok(())
389
  }
390
 
391
+ fn load_whitelists(args: &Args) -> Result<Whitelists> {
392
+ Ok(Whitelists {
393
+ title_phrases: load_title_whitelist(&args.title_whitelist)?,
394
+ group_names: load_name_whitelist(&args.group_whitelist)?,
395
+ })
396
+ }
397
+
398
+ fn load_title_whitelist(path: &PathBuf) -> Result<Vec<Vec<String>>> {
399
+ let mut phrases = Vec::new();
400
+ for line in load_whitelist_lines(path)? {
401
+ let phrase = phrase_parts_for_whitelist(&line);
402
+ if !phrase.is_empty() {
403
+ phrases.push(phrase);
404
+ }
405
+ }
406
+ Ok(phrases)
407
+ }
408
+
409
+ fn load_name_whitelist(path: &PathBuf) -> Result<HashSet<String>> {
410
+ Ok(load_whitelist_lines(path)?
411
+ .into_iter()
412
+ .map(|line| normalize_whitelist_name(&line))
413
+ .filter(|line| !line.is_empty())
414
+ .collect())
415
+ }
416
+
417
+ fn load_whitelist_lines(path: &PathBuf) -> Result<Vec<String>> {
418
+ if !path.exists() {
419
+ return Ok(Vec::new());
420
+ }
421
+ let file = File::open(path)
422
+ .with_context(|| format!("failed to open whitelist {}", path.display()))?;
423
+ let mut lines = Vec::new();
424
+ for line in BufReader::new(file).lines() {
425
+ let line = line?;
426
+ let line = line.trim();
427
+ if line.is_empty() || line.starts_with('#') {
428
+ continue;
429
+ }
430
+ let value = line
431
+ .split_once('\t')
432
+ .map(|(_, value)| value)
433
+ .unwrap_or(line)
434
+ .trim();
435
+ if !value.is_empty() {
436
+ lines.push(value.to_string());
437
+ }
438
+ }
439
+ Ok(lines)
440
+ }
441
+
442
  fn load_recipes(args: &Args) -> Result<HashMap<String, Recipe>> {
443
  let file = File::open(&args.recipes)
444
  .with_context(|| format!("recipe JSONL not found: {}", args.recipes.display()))?;
 
829
  for warning in audit_warnings(&record) {
830
  if !matches!(
831
  warning.as_str(),
832
+ "ambiguous_no_episode_title"
833
+ | "hash_labeled"
834
+ | "multiple_title_spans"
835
+ | "no_title"
836
+ | "path_retained"
837
  ) {
838
  continue;
839
  }
 
868
  Ok(())
869
  }
870
 
871
+ fn run_rich_annotations(args: &Args) -> Result<()> {
872
+ let inputs = load_input(&args.input, args.limit)?;
873
+ if let Some(parent) = args.rich_output.parent() {
874
+ fs::create_dir_all(parent)?;
875
+ }
876
+ let rows: Vec<Value> = inputs
877
+ .par_iter()
878
+ .filter_map(|original| {
879
+ if !args.keep_encoding_noise
880
+ && (has_encoding_noise(original)
881
+ || has_non_anime_noise(original)
882
+ || has_abstract_path_noise(original))
883
+ {
884
+ return None;
885
+ }
886
+ Some(rich_annotation_for(original))
887
+ })
888
+ .collect();
889
+ let mut writer = BufWriter::new(File::create(&args.rich_output)?);
890
+ for row in &rows {
891
+ serde_json::to_writer(&mut writer, row)?;
892
+ writer.write_all(b"\n")?;
893
+ }
894
+ writer.flush()?;
895
+ let manifest = json!({
896
+ "generated_at": Utc::now().to_rfc3339(),
897
+ "input": args.input.to_string_lossy(),
898
+ "rich_output": args.rich_output.to_string_lossy(),
899
+ "rows": rows.len(),
900
+ "implementation": "rust_dmhy_rich_annotations",
901
+ "notes": [
902
+ "rich roles are metadata for review/projection, not final training BIO labels",
903
+ "TITLE_* candidates may be collapsed or filtered before dmhy_weak generation"
904
+ ]
905
+ });
906
+ println!("{}", serde_json::to_string_pretty(&manifest)?);
907
+ Ok(())
908
+ }
909
+
910
+ fn rich_annotation_for(original: &str) -> Value {
911
+ let (training_filename, path_trimmed) = training_filename_for(original);
912
+ let parts: Vec<&str> = original
913
+ .split(|ch| ch == '/' || ch == '\\')
914
+ .map(str::trim)
915
+ .filter(|part| !part.is_empty())
916
+ .collect();
917
+ let leaf_index = parts.len().saturating_sub(1);
918
+ let segments = parts
919
+ .iter()
920
+ .enumerate()
921
+ .map(|(index, segment)| rich_segment(segment, index, index == leaf_index))
922
+ .collect::<Vec<_>>();
923
+ let projection = dmhy_record(
924
+ &training_filename,
925
+ "rich_projection",
926
+ &suggested_roles(&template_key_for_filename(&training_filename).0),
927
+ )
928
+ .map(|record| {
929
+ json!({
930
+ "filename": record.filename,
931
+ "spans": entity_spans(&record.tokens, &record.labels),
932
+ "warnings": audit_warnings(&record),
933
+ })
934
+ });
935
+ json!({
936
+ "source_filename": original,
937
+ "training_filename": training_filename,
938
+ "path_trimmed": path_trimmed,
939
+ "segments": segments,
940
+ "projection_preview": projection,
941
+ })
942
+ }
943
+
944
+ fn rich_segment(segment: &str, index: usize, is_leaf: bool) -> Value {
945
+ let (key, tokens, _classes, groups) = template_key_for_filename(segment);
946
+ let suggested = suggested_roles(&key);
947
+ let roles = adjust_contextual_roles(&tokens, &groups, &suggested);
948
+ let candidates = rich_candidates_for_segment(segment, &tokens, &groups, &roles, is_leaf);
949
+ json!({
950
+ "index": index,
951
+ "text": segment,
952
+ "kind": rich_segment_kind(segment, is_leaf),
953
+ "template": key,
954
+ "candidates": candidates,
955
+ })
956
+ }
957
+
958
+ fn rich_segment_kind(segment: &str, is_leaf: bool) -> &'static str {
959
+ if path_segment_is_media_noise(segment) {
960
+ "media_noise"
961
+ } else if path_segment_is_plain_season(segment) {
962
+ "season_dir"
963
+ } else if is_leaf {
964
+ "leaf"
965
+ } else {
966
+ "parent"
967
+ }
968
+ }
969
+
970
+ fn rich_candidates_for_segment(
971
+ segment: &str,
972
+ tokens: &[String],
973
+ groups: &[Group],
974
+ roles: &[String],
975
+ is_leaf: bool,
976
+ ) -> Vec<Value> {
977
+ let mut output = Vec::new();
978
+ let title_ranges = title_candidates(groups, roles);
979
+ for (candidate_index, (start, end)) in title_ranges.iter().copied().enumerate() {
980
+ let text = candidate_text(tokens, groups, start, end);
981
+ if text.trim().is_empty() {
982
+ continue;
983
+ }
984
+ output.push(json!({
985
+ "role": fine_title_role(segment, &text, is_leaf, candidate_index, title_ranges.len()),
986
+ "coarse_role": "TITLE",
987
+ "text": text,
988
+ "group_start": start,
989
+ "group_end": end,
990
+ }));
991
+ }
992
+ for (group_index, role) in roles.iter().enumerate() {
993
+ if role == "TITLE" || role == "O" || role == "HASH" {
994
+ continue;
995
+ }
996
+ let text = group_text(tokens, &groups[group_index]);
997
+ if text.trim().is_empty() {
998
+ continue;
999
+ }
1000
+ let coarse_role = role_label(role)
1001
+ .strip_prefix("B-")
1002
+ .map(str::to_string)
1003
+ .unwrap_or_else(|| "O".to_string());
1004
+ output.push(json!({
1005
+ "role": fine_non_title_role(role),
1006
+ "coarse_role": coarse_role,
1007
+ "text": text,
1008
+ "group_start": group_index,
1009
+ "group_end": group_index + 1,
1010
+ }));
1011
+ }
1012
+ output
1013
+ }
1014
+
1015
+ fn candidate_text(tokens: &[String], groups: &[Group], start: usize, end: usize) -> String {
1016
+ let Some(first) = groups.get(start).and_then(|group| group.indices.first()) else {
1017
+ return String::new();
1018
+ };
1019
+ let Some(last) = groups
1020
+ .get(end.saturating_sub(1))
1021
+ .and_then(|group| group.indices.last())
1022
+ else {
1023
+ return String::new();
1024
+ };
1025
+ strip_wrapper(&tokens[*first..=*last].join(""))
1026
+ }
1027
+
1028
+ fn fine_title_role(
1029
+ segment: &str,
1030
+ text: &str,
1031
+ is_leaf: bool,
1032
+ candidate_index: usize,
1033
+ candidate_count: usize,
1034
+ ) -> &'static str {
1035
+ let cleaned = text.trim();
1036
+ if VERSIONISH_TITLE_RE.is_match(cleaned) {
1037
+ return "RELEASE_VERSION";
1038
+ }
1039
+ if matches!(
1040
+ cleaned.to_ascii_lowercase().as_str(),
1041
+ "国漫" | "國漫" | "anime" | "movie" | "movies"
1042
+ ) {
1043
+ return "TITLE_CATEGORY";
1044
+ }
1045
+ if is_leaf && path_segment_starts_with_episode(segment) {
1046
+ return "EPISODE_TITLE";
1047
+ }
1048
+ if !is_leaf {
1049
+ return "PATH_TITLE";
1050
+ }
1051
+ if candidate_count > 1 && candidate_index > 0 {
1052
+ return "TITLE_ALIAS";
1053
+ }
1054
+ "TITLE_MAIN"
1055
+ }
1056
+
1057
/// Maps a coarse, non-title role name to the fine-grained role used in rich annotations.
///
/// Episode variants collapse to `"EPISODE"`, `"VOLUME"` is reported as `"SPECIAL"`,
/// and any role not in the table becomes `"OTHER"`. Matching is case-sensitive.
fn fine_non_title_role(role: &str) -> &'static str {
    const ROLE_TABLE: [(&str, &str); 9] = [
        ("GROUP", "RELEASE_GROUP"),
        ("EPISODE", "EPISODE"),
        ("EPISODE_VERSION", "EPISODE"),
        ("EPISODE_RANGE", "EPISODE"),
        ("SEASON", "SEASON"),
        ("SPECIAL", "SPECIAL"),
        ("VOLUME", "SPECIAL"),
        ("RESOLUTION", "RESOLUTION"),
        ("SOURCE", "SOURCE"),
    ];
    ROLE_TABLE
        .iter()
        .find(|&&(coarse, _)| coarse == role)
        .map_or("OTHER", |&(_, fine)| fine)
}
1068
+
1069
  fn entity_spans(tokens: &[String], labels: &[String]) -> Vec<Value> {
1070
  let mut spans = Vec::new();
1071
  let mut current_label: Option<String> = None;
 
1106
  } else if title_spans > 1 {
1107
  warnings.push("multiple_title_spans".to_string());
1108
  }
1109
+ let has_episode = record.labels.iter().any(|label| label.ends_with("EPISODE"));
1110
+ if !has_episode {
1111
  warnings.push("no_episode".to_string());
1112
+ if record
1113
+ .dropped_title_candidate_positions
1114
+ .as_ref()
1115
+ .is_some_and(|dropped| !dropped.is_empty())
1116
+ {
1117
+ warnings.push("ambiguous_no_episode_title".to_string());
1118
+ }
1119
  }
1120
  if record.filename.contains('/') || record.filename.contains('\\') {
1121
  warnings.push("path_retained".to_string());
 
1221
  audit_warnings(record).iter().any(|warning| {
1222
  matches!(
1223
  warning.as_str(),
1224
+ "ambiguous_no_episode_title"
1225
+ | "hash_labeled"
1226
+ | "multiple_title_spans"
1227
+ | "no_title"
1228
+ | "path_retained"
1229
  )
1230
  })
1231
  }
 
1344
  if EPISODE_VERSION_RE.is_match(&compact) {
1345
  return "EPISODE_VERSION".to_string();
1346
  }
1347
+ if EPISODE_WITH_SUFFIX_RE.is_match(&cleaned) {
1348
+ return "EPISODE_VERSION".to_string();
1349
+ }
1350
  if SXE_RE.is_match(&compact) {
1351
  return "SXE".to_string();
1352
  }
 
1622
  .map(str::trim)
1623
  .filter(|part| !part.is_empty())
1624
  .collect();
1625
+ if parts.len() >= 2
1626
+ && (path_segment_is_episodeish(parts[parts.len() - 1])
1627
+ || (!path_segment_is_plain_season(parts[parts.len() - 2])
1628
+ && path_segment_starts_with_episode(parts[parts.len() - 1])
1629
+ && !leaf_has_full_title_after_episode(parts[parts.len() - 1])))
1630
+ {
1631
+ if let Some(parent) = parts[..parts.len() - 1]
1632
+ .iter()
1633
+ .rev()
1634
+ .find(|part| {
1635
+ let trimmed = trim_parent_title_segment(part);
1636
+ filename_has_title(&trimmed) && !path_segment_is_media_noise(&trimmed)
1637
+ })
1638
+ {
1639
+ let parent = trim_parent_title_segment(parent.trim());
1640
+ return (
1641
+ format!(
1642
+ "{} {}",
1643
+ parent,
1644
+ parts[parts.len() - 1].trim()
1645
+ ),
1646
+ true,
1647
+ );
1648
+ }
1649
+ }
1650
  if parts.len() >= 2 && filename_has_title(parts[parts.len() - 1]) {
1651
+ if path_segment_has_season(parts[parts.len() - 2]) {
1652
  if !path_segment_is_plain_season(parts[parts.len() - 2]) {
1653
  return (parts[parts.len() - 1].to_string(), true);
1654
  }
 
1660
  {
1661
  (parts[parts.len() - 1].to_string(), true)
1662
  } else {
1663
+ (
1664
+ format!(
1665
+ "{} {}",
1666
+ parts[parts.len() - 2].trim(),
1667
+ parts[parts.len() - 1].trim()
1668
+ ),
1669
+ true,
1670
+ )
1671
  }
1672
  } else {
1673
  (parts[parts.len() - 1].to_string(), true)
 
1682
  PLAIN_SEASON_SEGMENT_RE.is_match(&cleaned)
1683
  }
1684
 
1685
/// Removes a trailing `TV` series-kind marker from a path segment.
///
/// The segment is whitespace-trimmed first. If it then ends with `TV` or `tv`
/// preceded by `_`, `.` or a space, that suffix is dropped along with any
/// separators left dangling at the end. Segments without such a suffix are
/// returned trimmed but otherwise unchanged.
fn trim_terminal_series_kind(segment: &str) -> String {
    const SERIES_KIND_SUFFIXES: [&str; 6] = ["_TV", ".TV", " TV", "_tv", ".tv", " tv"];

    let trimmed = segment.trim();
    SERIES_KIND_SUFFIXES
        .iter()
        .find_map(|suffix| trimmed.strip_suffix(*suffix))
        .map_or(trimmed, |stem| stem.trim_end_matches(['_', '.', ' ']))
        .to_string()
}
1695
+
1696
+ fn trim_parent_title_segment(segment: &str) -> String {
1697
+ let mut output = trim_terminal_series_kind(segment);
1698
+ loop {
1699
+ let trimmed = output.trim_end();
1700
+ let Some(last) = trimmed.chars().next_back() else {
1701
+ return output;
1702
+ };
1703
+ let open = match last {
1704
+ ')' => '(',
1705
+ ']' => '[',
1706
+ '】' => '【',
1707
+ _ => return output,
1708
+ };
1709
+ let Some(start) = trimmed.rfind(open) else {
1710
+ return output;
1711
+ };
1712
+ let suffix = &trimmed[start..];
1713
+ if path_segment_is_media_noise(suffix) {
1714
+ output.truncate(start);
1715
+ output = output.trim_end_matches([' ', '_', '.', '-']).to_string();
1716
+ continue;
1717
+ }
1718
+ return output;
1719
+ }
1720
+ }
1721
+
1722
/// Returns `true` when `value` matches `PATH_SEGMENT_SEASON_RE`.
fn path_segment_has_season(value: &str) -> bool {
    PATH_SEGMENT_SEASON_RE.is_match(value)
}
 
1738
  return true;
1739
  }
1740
  let markers = [
1741
+ "譁", "蜈", "螟", "蟄", "謇", "邱", "荳", "縺", "繧", "莨", "鬆", "髯", "瀛",
1742
+ "楀", "箷", "绲", "刔", "鏃", "湪", "鏍", "犲", "儚", "鐗", "吀", "铦", "躲",
1743
+ "伄", "椋", "伓", "姘",
1744
  ];
1745
  let marker_hits = markers
1746
  .iter()
 
1775
  !structural.is_empty()
1776
  && structural
1777
  .iter()
1778
+ .all(|item| {
1779
+ item.starts_with("EPISODE")
1780
+ || item.as_str() == "SPECIAL"
1781
+ || item.as_str() == "VOLUME"
1782
+ || item.as_str() == "BRACKET_VOLUME"
1783
+ })
1784
+ }
1785
+
1786
+ fn path_segment_starts_with_episode(value: &str) -> bool {
1787
+ if EPISODE_CJK_PREFIX_RE.is_match(value.trim()) {
1788
+ return true;
1789
+ }
1790
+ let (key, _, _, groups) = template_key_for_filename(value);
1791
+ let roles = suggested_roles(&key);
1792
+ groups
1793
+ .iter()
1794
+ .zip(roles.iter())
1795
+ .find(|(group, _)| group.class_name != "SEP")
1796
+ .is_some_and(|(_, role)| role.starts_with("EPISODE"))
1797
+ }
1798
+
1799
+ fn leaf_has_full_title_after_episode(value: &str) -> bool {
1800
+ let (key, _, _, groups) = template_key_for_filename(value);
1801
+ let roles = suggested_roles(&key);
1802
+ let first_structural = roles.iter().position(|role| role.starts_with("EPISODE"));
1803
+ let Some(first_episode) = first_structural else {
1804
+ return false;
1805
+ };
1806
+ groups
1807
+ .iter()
1808
+ .zip(roles.iter())
1809
+ .skip(first_episode + 1)
1810
+ .filter(|(group, _)| group.class_name != "SEP")
1811
+ .any(|(_, role)| role == "TITLE")
1812
+ }
1813
+
1814
+ fn path_segment_is_media_noise(value: &str) -> bool {
1815
+ let normalized = value.to_ascii_lowercase();
1816
+ if normalized.contains("sourceunknown") || normalized.contains("sourceunknow") {
1817
+ return true;
1818
+ }
1819
+ if (normalized.contains("dvdrip")
1820
+ || normalized.contains("bdrip")
1821
+ || normalized.contains("webrip")
1822
+ || normalized.contains("web-dl")
1823
+ || normalized.contains("bluray"))
1824
+ && tokenize(value)
1825
+ .iter()
1826
+ .map(|token| classify_atom(token))
1827
+ .any(|class_name| class_name == "RESOLUTION")
1828
+ {
1829
+ return true;
1830
+ }
1831
+ let (_, _, _, groups) = template_key_for_filename(value);
1832
+ let structural: Vec<&String> = groups
1833
+ .iter()
1834
+ .map(|group| &group.class_name)
1835
+ .filter(|item| item.as_str() != "SEP")
1836
+ .collect();
1837
+ !structural.is_empty()
1838
+ && structural.iter().all(|item| {
1839
+ matches!(
1840
+ item.as_str(),
1841
+ "MEDIA"
1842
+ | "RESOLUTION"
1843
+ | "LANG"
1844
+ | "HASH"
1845
+ | "DATE"
1846
+ | "BRACKET_MEDIA"
1847
+ | "BRACKET_RESOLUTION"
1848
+ | "BRACKET_LANG"
1849
+ | "BRACKET_HASH"
1850
+ | "BRACKET_DATE"
1851
+ | "MEDIA_BLOCK"
1852
+ | "BRACKET_MEDIA_BLOCK"
1853
+ )
1854
+ })
1855
  }
1856
 
1857
  fn has_abstract_path_noise(value: &str) -> bool {
 
2090
  }
2091
 
2092
  fn split_episode_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
2093
+ if DECIMAL_EPISODE_RE.is_match(token) {
2094
+ let pieces = split_generated_token(token);
2095
+ let labels = pieces.iter().map(|_| "B-EPISODE".to_string()).collect();
2096
+ return Some((pieces, labels));
2097
+ }
2098
  let caps = EPISODE_VALUE_RE.captures(token)?;
2099
  let mut pieces = vec![caps[1].to_string(), caps[2].to_string()];
2100
  let mut labels = vec!["O".to_string(), "B-EPISODE".to_string()];
 
2125
  )
2126
  }
2127
 
2128
/// Canonicalises a whitelist name: leading and trailing whitespace is dropped
/// and every internal whitespace run becomes a single ASCII space.
fn normalize_whitelist_name(value: &str) -> String {
    let mut normalized = String::with_capacity(value.len());
    for word in value.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
2131
+
2132
+ fn phrase_parts_for_whitelist(value: &str) -> Vec<String> {
2133
+ let tokens = tokenize(value);
2134
+ let classes: Vec<String> = tokens.iter().map(|token| classify_token(token)).collect();
2135
+ let groups = compact_token_groups(&tokens, &classes);
2136
+ groups
2137
+ .iter()
2138
+ .filter(|group| whitelist_phrase_group(group))
2139
+ .map(|group| group_text(&tokens, group))
2140
+ .filter(|part| !part.trim().is_empty())
2141
+ .collect()
2142
+ }
2143
+
2144
+ fn whitelist_phrase_group(group: &Group) -> bool {
2145
+ matches!(
2146
+ group.class_name.as_str(),
2147
+ "TEXT" | "EPISODE" | "SPECIAL" | "SEASON" | "BRACKET_TEXT"
2148
+ )
2149
+ }
2150
+
2151
  fn is_special_title_phrase(text: &str) -> bool {
2152
  let normalized = SPECIAL_SPACE_RE
2153
  .replace_all(text, " ")
 
2157
  normalized.as_str(),
2158
  "CM" | "EVENT"
2159
  | "EIZOU"
2160
+ | "EXTRA"
2161
+ | "EXTRAS"
2162
  | "LOGO"
2163
  | "MENU"
2164
  | "OMAKE"
 
2168
  | "TOKUTEN"
2169
  | "TRAILER"
2170
  | "WORLD PREMIERE"
2171
+ | "映像特典"
2172
+ | "特典"
2173
+ ) || normalized.contains("映像特典")
2174
+ || SPECIAL_TITLE_PHRASE_RE.is_match(text)
2175
+ }
2176
+
2177
/// Built-in title phrases, each given as the sequence of group texts that
/// make up the phrase.
///
/// `apply_known_title_phrases` passes every entry to `apply_title_phrase`
/// with structural override enabled, so numeric or episode-looking parts such
/// as `300`, `100`, `2` and `Episode` are relabelled `TITLE` when the whole
/// phrase matches.
///
/// NOTE(review): `apply_title_phrase` compares with `eq_ignore_ascii_case`,
/// so the two "Spy x Family" spellings look redundant for that caller.
/// Confirm no other consumer compares case-sensitively before removing one.
const KNOWN_TITLE_PHRASES: &[&[&str]] = &[
    &["SPY", "x", "FAMILY"],
    &["Spy", "x", "Family"],
    &["Slime", "300"],
    &["Zom", "100"],
    &["Kamisama", "Hajimemashita", "2"],
    &["Phantasy", "Star", "Online", "2", "Episode", "Oracle"],
];
2185
+
2186
+ fn apply_known_title_phrases(tokens: &[String], groups: &[Group], roles: &mut [String]) {
2187
+ if let Some(whitelists) = RUNTIME_WHITELISTS.get() {
2188
+ for (index, group) in groups.iter().enumerate() {
2189
+ if group.class_name == "BRACKET_TEXT"
2190
+ && roles.get(index).is_some_and(|role| role == "GROUP")
2191
+ && whitelists
2192
+ .group_names
2193
+ .contains(&normalize_whitelist_name(&group_text(tokens, group)))
2194
+ {
2195
+ roles[index] = "GROUP".to_string();
2196
+ }
2197
+ }
2198
+ }
2199
+ let searchable: Vec<(usize, String)> = groups
2200
+ .iter()
2201
+ .enumerate()
2202
+ .filter(|(_, group)| whitelist_phrase_group(group))
2203
+ .map(|(index, group)| (index, group_text(tokens, group)))
2204
+ .collect();
2205
+ for phrase in KNOWN_TITLE_PHRASES {
2206
+ apply_title_phrase(&searchable, phrase, roles, true);
2207
+ }
2208
+ if let Some(whitelists) = RUNTIME_WHITELISTS.get() {
2209
+ for phrase in &whitelists.title_phrases {
2210
+ if phrase.len() >= 2 {
2211
+ apply_title_phrase(&searchable, phrase, roles, false);
2212
+ }
2213
+ }
2214
+ }
2215
+ }
2216
+
2217
/// Labels every occurrence of `phrase` among the searchable groups as `TITLE`.
///
/// * `searchable` - `(group_index, text)` pairs in filename order.
/// * `phrase` - expected texts of consecutive searchable groups, compared
///   ASCII case-insensitively.
/// * `roles` - per-group roles, indexed by the `group_index` values.
/// * `allow_structural_override` - when `false`, groups whose role is
///   `EPISODE`, `EPISODE_VERSION`, `EPISODE_RANGE`, `SEASON`, `SOURCE` or
///   `RESOLUTION` keep that role.
///
/// Groups labelled `GROUP` are never changed. A group index with no entry in
/// `roles` is skipped.
fn apply_title_phrase(
    searchable: &[(usize, String)],
    phrase: &[impl AsRef<str>],
    roles: &mut [String],
    allow_structural_override: bool,
) {
    // `windows(0)` panics, so an empty phrase must be rejected up front.
    if phrase.is_empty() || phrase.len() > searchable.len() {
        return;
    }
    for window in searchable.windows(phrase.len()) {
        let matches_phrase = window
            .iter()
            .zip(phrase.iter())
            .all(|((_, text), expected)| text.eq_ignore_ascii_case(expected.as_ref()));
        if !matches_phrase {
            continue;
        }
        for (group_index, _) in window {
            // One checked lookup serves both the role test and the write.
            let Some(role) = roles.get_mut(*group_index) else {
                continue;
            };
            let keep_existing = match role.as_str() {
                "GROUP" => true,
                "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE" | "SEASON" | "SOURCE"
                | "RESOLUTION" => !allow_structural_override,
                _ => false,
            };
            if !keep_existing {
                *role = "TITLE".to_string();
            }
        }
    }
}
2258
 
2259
  fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]) -> Vec<String> {
2260
  let mut output = roles.to_vec();
2261
  let ep_markers = ["EP", "E", "Episode", "ep", "episode"];
2262
  let roman = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX"];
2263
+ apply_known_title_phrases(tokens, groups, &mut output);
2264
+ if roles
2265
+ .first()
2266
+ .is_some_and(|role| role.starts_with("EPISODE"))
2267
+ && YEAR_RANGE_RE.is_match(&group_text(tokens, &groups[0]))
2268
+ {
2269
+ let first_real_structural = (1..roles.len())
2270
+ .find(|&index| {
2271
+ roles[index].starts_with("EPISODE")
2272
+ || matches!(roles[index].as_str(), "SEASON" | "SPECIAL")
2273
+ })
2274
+ .unwrap_or(roles.len());
2275
+ for index in 1..first_real_structural {
2276
+ if groups[index].class_name == "TEXT"
2277
+ && !matches!(
2278
+ group_text(tokens, &groups[index])
2279
+ .to_ascii_uppercase()
2280
+ .as_str(),
2281
+ "TV" | "OVA" | "OAD" | "SP"
2282
+ )
2283
+ {
2284
+ output[index] = "TITLE".to_string();
2285
+ }
2286
+ }
2287
+ }
2288
  if !output.iter().any(|role| role == "TITLE")
2289
  && roles
2290
  .first()
 
2378
  }
2379
  if output[index - 2] == "TITLE"
2380
  && groups[index - 1].class_name == "SEP"
2381
+ && previous_text.len() <= 48
2382
+ && previous_text.chars().any(|ch| ch.is_alphabetic())
 
2383
  && text.chars().all(|ch| ch.is_ascii_digit())
2384
  && text.len() <= 3
2385
+ && !(index + 2 < roles.len()
2386
+ && groups[index + 1].class_name == "SEP"
2387
+ && group_text(tokens, &groups[index + 2]).eq_ignore_ascii_case("episode"))
2388
+ && (next_episode
2389
+ || (next_special
2390
+ && (text.parse::<u16>().is_ok_and(|value| value >= 100)
2391
+ || (previous_text.len() <= 4
2392
+ && previous_text.is_ascii()
2393
+ && previous_text
2394
+ .chars()
2395
+ .all(|ch| ch.is_ascii_alphabetic())))))
2396
  {
2397
  output[index] = "TITLE".to_string();
2398
  continue;
2399
  }
2400
  }
2401
+ if roles[index].starts_with("EPISODE")
2402
+ && BARE_RESOLUTION_RE.is_match(&text)
2403
+ && index >= 2
2404
+ && groups[index - 1].class_name == "SEP"
2405
+ {
2406
+ let previous_text = group_text(tokens, &groups[index - 2]);
2407
+ if previous_text
2408
+ .chars()
2409
+ .any(|ch| ch.is_ascii_digit() || matches!(ch, '.' | '-' | '_' | '.'))
2410
+ {
2411
+ output[index] = "RESOLUTION".to_string();
2412
+ continue;
2413
+ }
2414
+ }
2415
  if roles[index].starts_with("EPISODE")
2416
  && index >= 2
2417
  && output[..index].iter().any(|role| role == "TITLE")
 
2454
  output[index] = "SPECIAL".to_string();
2455
  continue;
2456
  }
2457
+ if roles[index] == "TITLE"
2458
+ && matches!(text.to_ascii_uppercase().as_str(), "TV" | "TV版")
2459
+ && output.iter().enumerate().any(|(other, role)| {
2460
+ other != index && role == "TITLE"
2461
+ })
2462
+ {
2463
+ output[index] = "O".to_string();
2464
+ continue;
2465
+ }
2466
  if roles[index] == "TITLE" && matches!(text.as_str(), "第" | "話" | "话" | "回" | "集")
2467
  {
2468
  output[index] = "O".to_string();
 
2490
  && groups[index + 1].class_name == "SEP"
2491
  && roles[index + 2].starts_with("EPISODE")
2492
  {
2493
+ if !output[..index].iter().any(|role| role == "TITLE") {
2494
+ output[index] = "O".to_string();
2495
+ output[index + 2] = "SEASON".to_string();
2496
+ }
2497
  continue;
2498
  }
2499
  if roles[index] == "TITLE"
 
2519
  output[index + 2] = "O".to_string();
2520
  }
2521
  }
2522
+ if roles[index].starts_with("EPISODE")
2523
+ && !output[index + 1..].iter().any(|role| role == "TITLE")
2524
+ {
2525
+ let mut run = Vec::new();
2526
+ let mut cursor = index + 1;
2527
+ while cursor < roles.len() {
2528
+ if groups[cursor].class_name == "SEP" {
2529
+ cursor += 1;
2530
+ continue;
2531
+ }
2532
+ if groups[cursor].class_name == "TEXT"
2533
+ && !matches!(
2534
+ roles[cursor].as_str(),
2535
+ "SOURCE" | "RESOLUTION" | "SEASON" | "SPECIAL"
2536
+ )
2537
+ {
2538
+ run.push(cursor);
2539
+ cursor += 1;
2540
+ continue;
2541
+ }
2542
+ if !run.is_empty() {
2543
+ break;
2544
+ }
2545
+ cursor += 1;
2546
+ }
2547
+ if run.len() >= 2 {
2548
+ for item in run {
2549
+ output[item] = "TITLE".to_string();
2550
+ }
2551
+ }
2552
+ }
2553
  if roles[index].starts_with("EPISODE") {
2554
  let previous_text = if index >= 1 {
2555
  group_text(tokens, &groups[index - 1])
 
2612
  }
2613
 
2614
  fn enforce_single_title_candidate(
2615
+ tokens: &[String],
2616
  groups: &[Group],
2617
  roles: &[String],
2618
  ) -> (Vec<String>, Vec<String>) {
 
2635
  .copied()
2636
  .filter(|(_, end)| *end <= first_anchor)
2637
  .collect();
2638
+ let selected_pool = if before_anchor.is_empty() {
2639
  &candidates
2640
  } else {
2641
  &before_anchor
2642
+ };
2643
+ let selected = selected_pool
2644
  .iter()
2645
+ .max_by_key(|(start, end)| {
2646
+ (
2647
+ title_candidate_score(tokens, groups, *start, *end),
2648
+ *end,
2649
+ end - start,
2650
+ )
2651
+ })
2652
  .copied()
2653
  .unwrap();
2654
  let mut output = roles.to_vec();
 
2667
  (output, dropped)
2668
  }
2669
 
2670
+ fn title_candidate_score(tokens: &[String], groups: &[Group], start: usize, end: usize) -> isize {
2671
+ let text = (start..end)
2672
+ .filter(|&index| roles_candidate_text_group(&groups[index]))
2673
+ .map(|index| group_text(tokens, &groups[index]))
2674
+ .collect::<Vec<_>>()
2675
+ .join("");
2676
+ let cleaned = text.trim();
2677
+ if cleaned.is_empty() {
2678
+ return -1000;
2679
+ }
2680
+ let mut score = cleaned.chars().filter(|ch| ch.is_alphanumeric()).count() as isize;
2681
+ if VERSIONISH_TITLE_RE.is_match(cleaned) {
2682
+ score -= 500;
2683
+ }
2684
+ if matches!(
2685
+ cleaned.to_ascii_lowercase().as_str(),
2686
+ "国漫" | "國漫" | "anime" | "movie" | "movies"
2687
+ ) {
2688
+ score -= 500;
2689
+ }
2690
+ score
2691
+ }
2692
+
2693
+ fn roles_candidate_text_group(group: &Group) -> bool {
2694
+ matches!(group.class_name.as_str(), "TEXT" | "BRACKET_TEXT")
2695
+ }
2696
+
2697
  fn normalize_generated_tokens(tokens: &[String], labels: &[String]) -> (Vec<String>, Vec<String>) {
2698
  let mut output_tokens = Vec::new();
2699
  let mut output_labels = Vec::new();
 
2850
  fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
2851
  let joiners = [
2852
  " ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?",
2853
+ "?", ";", ";", ",", ",", "", "", "", "~", "", "+", "", "(", ")",
2854
+ "(", ")", "[", "]", "【", "】", "<", ">", "", "", "", "", "「", "」",
2855
+ "☆", "♪", "`", "@",
2856
  ];
2857
  let title_terminal_punctuation = ["!", "!", "?", "?"];
2858
  let entity_joiners = [
2859
  " ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?",
2860
+ "?", ";", ";", ",", ",", "", "", "", "~", "", "+", "", "(", ")",
2861
+ "(", ")", "[", "]", "【", "】", "<", ">", "", "", "", "", "", "",
2862
+ "☆", "♪", "`", "@", "&", "&",
2863
  ];
2864
  let mut output = labels.to_vec();
2865
  for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
 
2893
  output[index] = "B-TITLE".to_string();
2894
  }
2895
  }
2896
+ if matches!(token.as_str(), "]" | "】" | ")" | ")" | ">" | ">" | "」" | "」")
2897
+ && index > 0
2898
+ && output[index - 1] == "B-TITLE"
2899
+ && title_span_has_labeled_opener(&tokens[..index], &output[..index], token)
2900
+ {
2901
+ output[index] = "B-TITLE".to_string();
2902
+ }
2903
  }
2904
  output
2905
  }
2906
 
2907
+ fn title_span_has_labeled_opener(tokens: &[String], labels: &[String], closer: &str) -> bool {
2908
+ for (token, label) in tokens.iter().zip(labels.iter()).rev() {
2909
+ if label != "B-TITLE" {
2910
+ return false;
2911
+ }
2912
+ if closer_matches_opener(closer, token) {
2913
+ return true;
2914
+ }
2915
+ }
2916
+ false
2917
+ }
2918
+
2919
/// Returns `true` when `closer` and `opener` form one of the bracket pairs
/// listed below.
///
/// NOTE(review): several pairs appear twice in this view. Presumably one of
/// each is a full-width or half-width variant whose characters were folded
/// when the file was rendered. Confirm against the real source encoding
/// before deduplicating.
fn closer_matches_opener(closer: &str, opener: &str) -> bool {
    matches!(
        (closer, opener),
        ("]", "[")
            | ("】", "【")
            | (")", "(")
            | (")", "(")
            | (">", "<")
            | (">", "<")
            | ("」", "「")
            | ("」", "「")
    )
}
2932
+
2933
  fn dmhy_record(filename: &str, template_id: &str, roles: &[String]) -> Option<Record> {
2934
  let (key, tokens, _classes, groups) = template_key_for_filename(filename);
2935
  if groups.len() != roles.len() {
2936
  return None;
2937
  }
2938
  let roles = adjust_contextual_roles(&tokens, &groups, roles);
2939
+ let (roles, dropped) = enforce_single_title_candidate(&tokens, &groups, &roles);
2940
  let (tokens, labels) = project_refined_tokens(&tokens, &groups, &roles);
2941
  let labels = smooth_title_spans(&tokens, &labels);
2942
  if tokens.len() != labels.len() {
 
2969
  record.tokens.into_iter().zip(record.labels).collect()
2970
  }
2971
 
2972
/// The first candidate text under `/segments/1` of the rich annotation must
/// keep its inter-word spaces.
#[test]
fn rich_title_candidates_keep_readable_spacing() {
    let filename = "(1998) Initial D First Stage [1080p BDRip AVC AAC DTS-HD]/Initial D First Stage - 01 [1080p BDRip AVC AAC DTS-HD]";
    let row = rich_annotation_for(filename);
    let candidate_text = row
        .pointer("/segments/1/candidates/0/text")
        .and_then(Value::as_str);
    assert_eq!(candidate_text, Some("Initial D First Stage"));
}
2983
+
2984
  #[test]
2985
  fn required_regressions() {
2986
  let title_91 = labels_for("Title 91 EP 01 [1080p]");
 
3048
  let comma_title =
3049
  labels_for("[Studio GreenTea] Kamiina Botan, Yoeru Sugata wa Yuri no Hana [01]");
3050
  assert!(comma_title.contains(&(",".to_string(), "B-TITLE".to_string())));
3051
+ let backtick_title =
3052
+ labels_for("[Hayate no Gotoku! Can`t Take My Eyes Off You][01][BDrip X264 AAC 720P]");
3053
+ assert!(backtick_title.contains(&("`".to_string(), "B-TITLE".to_string())));
3054
+ assert!(backtick_title.contains(&("t".to_string(), "B-TITLE".to_string())));
3055
+ let cjk_period_title =
3056
+ labels_for("[云光字幕组]剃须。然后捡到高中生 Hige o Soru. Soshite Joshikousei o Hirou-[ 01 ][简体双语][1080p]");
3057
+ assert!(cjk_period_title.contains(&("。".to_string(), "B-TITLE".to_string())));
3058
+ let music_title =
3059
+ labels_for("[アニメ BD] うたの☆プリンスさまっ♪ マジLOVE2000% 第01話「ポワゾンKISS」(1920x1080 x264 Hi10p AAC)");
3060
+ assert!(music_title.contains(&("♪".to_string(), "B-TITLE".to_string())));
3061
+ let cm_version = labels_for("[U2-Rip]Inari, Konkon, Koi Iroha[CMv2][Hi10p_1080p][x264_flac]");
3062
+ assert!(cm_version.contains(&("CMv2".to_string(), "B-SPECIAL".to_string())));
3063
+ assert!(!cm_version.contains(&("CMv2".to_string(), "B-TITLE".to_string())));
3064
+ let hdma_block =
3065
+ labels_for("[Niconeiko Works] Gekijouban Violet Evergarden [1080P_Ma10p_DTS-HDMA][CM01]");
3066
+ assert!(hdma_block.contains(&("Gekijouban".to_string(), "B-TITLE".to_string())));
3067
+ assert!(hdma_block.contains(&("1080P".to_string(), "B-RESOLUTION".to_string())));
3068
+ assert!(hdma_block.contains(&("HDMA".to_string(), "B-SOURCE".to_string())));
3069
+ assert!(!hdma_block.contains(&("1080P".to_string(), "B-TITLE".to_string())));
3070
+ let extra_menu = labels_for("Extra Menu OVA");
3071
+ assert!(extra_menu.contains(&("Extra".to_string(), "B-SPECIAL".to_string())));
3072
+ assert!(!extra_menu.contains(&("Extra".to_string(), "B-TITLE".to_string())));
3073
+ let eizou_tokuten = labels_for("おジャ魔女どれみ♯ 映像特典「ともだちの唄」(DVD 640x480 )");
3074
+ assert!(eizou_tokuten.contains(&("映像特典".to_string(), "B-SPECIAL".to_string())));
3075
  let happy_lesson = labels_for("【DVD】 HAPPY☆LESSON THE TV 第01話");
3076
  assert!(happy_lesson.contains(&("☆".to_string(), "B-TITLE".to_string())));
3077
  let idolmaster = labels_for("[CASO&SumiSora][THE_IDOLM@STER_CINDERELLA_GIRLS][07.5_SP]");
 
3081
  let mayoi = labels_for("[Snow-Raws] 迷家[マヨイガ] 第01話");
3082
  assert!(mayoi.contains(&("迷家".to_string(), "B-TITLE".to_string())));
3083
  assert!(mayoi.contains(&("マヨイガ".to_string(), "B-TITLE".to_string())));
3084
+ assert!(mayoi.contains(&("]".to_string(), "B-TITLE".to_string())));
3085
 
3086
  let conan_time = labels_for("【蓝色狂想】名侦探柯南TV版第036集-周一晚上7点半杀人事件");
3087
  assert!(conan_time.contains(&("036".to_string(), "B-EPISODE".to_string())));
 
3096
  assert!(sky.contains(&("One".to_string(), "B-TITLE".to_string())));
3097
  assert!(!sky.contains(&("海贼王".to_string(), "B-TITLE".to_string())));
3098
  assert!(sky.contains(&("918".to_string(), "B-EPISODE".to_string())));
3099
+
3100
+ let happy = labels_for(
3101
+ "My.Happy.Marriage.S01E01.The.Meeting.1080p.NF.WEB-DL.AAC2.0.H.264-VARYG",
3102
+ );
3103
+ assert!(happy.contains(&("01".to_string(), "B-SEASON".to_string())));
3104
+ assert!(happy.contains(&("01".to_string(), "B-EPISODE".to_string())));
3105
+ assert!(!happy.contains(&("0".to_string(), "B-EPISODE".to_string())));
3106
+
3107
+ let garo = labels_for("[牙狼<GARO>~炎の刻印~][01][1080p]");
3108
+ assert!(garo.contains(&("牙狼".to_string(), "B-TITLE".to_string())));
3109
+ assert!(garo.contains(&("<".to_string(), "B-TITLE".to_string())));
3110
+ assert!(garo.contains(&(">".to_string(), "B-TITLE".to_string())));
3111
+ assert!(garo.contains(&("炎の刻印".to_string(), "B-TITLE".to_string())));
3112
+
3113
+ let akira = labels_for("[QYQ][AKIRA][AVC_AC3x2][1080p]");
3114
+ assert!(akira.contains(&("AKIRA".to_string(), "B-TITLE".to_string())));
3115
+ assert!(!akira.contains(&("AVC".to_string(), "B-TITLE".to_string())));
3116
+ assert!(akira.contains(&("AVC".to_string(), "B-SOURCE".to_string())));
3117
+
3118
+ let doraemon =
3119
+ labels_for("[DORASUB][DORAEMON1979][1998.03.07][WEB][1998x1080][AVC][简日]哆啦A梦归来了");
3120
+ assert!(doraemon.contains(&("DORAEMON1979".to_string(), "B-TITLE".to_string())));
3121
+ assert!(doraemon.contains(&("WEB".to_string(), "B-SOURCE".to_string())));
3122
+ assert!(!doraemon.contains(&("WEB".to_string(), "B-TITLE".to_string())));
3123
+
3124
+ let devilman = labels_for("[DBD-Raws][恶魔人][1972版][01][1080P][BDRip][HEVC-10bit][FLAC]");
3125
+ assert!(devilman.contains(&("恶魔人".to_string(), "B-TITLE".to_string())));
3126
+ assert!(!devilman.contains(&("1972版".to_string(), "B-TITLE".to_string())));
3127
+
3128
+ let classroom = labels_for("[Dymy][Assassination Classroom (2016)][01][BIG5][1280X720]");
3129
+ assert!(classroom.contains(&("(".to_string(), "B-TITLE".to_string())));
3130
+ assert!(classroom.contains(&(")".to_string(), "B-TITLE".to_string())));
3131
+ assert!(!classroom.contains(&("]".to_string(), "B-TITLE".to_string())));
3132
+
3133
+ let bang_season =
3134
+ labels_for("[LoliHouse] Bang Dream! 2nd Season - 01 [BDRip 1080p HEVC-10bit FLAC]");
3135
+ assert!(bang_season.contains(&("Bang".to_string(), "B-TITLE".to_string())));
3136
+ assert!(bang_season.contains(&("Season".to_string(), "B-TITLE".to_string())));
3137
+ assert!(bang_season.contains(&("01".to_string(), "B-EPISODE".to_string())));
3138
+ assert!(!bang_season.contains(&("01".to_string(), "B-SEASON".to_string())));
3139
+
3140
+ let basket =
3141
+ labels_for("[Nekomoe kissaten&VCB-Studio] Fruits Basket 1st Season [24][1080p][x264_aac][sc]");
3142
+ assert!(basket.contains(&("Fruits".to_string(), "B-TITLE".to_string())));
3143
+ assert!(basket.contains(&("Season".to_string(), "B-TITLE".to_string())));
3144
+ assert!(basket.contains(&("24".to_string(), "B-EPISODE".to_string())));
3145
+ assert!(!basket.contains(&("24".to_string(), "B-SEASON".to_string())));
3146
+
3147
+ let notice = labels_for("[KTXP][Zankyou_no_Terror][08_Notice][GB_BIG5][X264_AAC][720p]");
3148
+ assert!(notice.contains(&("Zankyou".to_string(), "B-TITLE".to_string())));
3149
+ assert!(notice.contains(&("08".to_string(), "B-EPISODE".to_string())));
3150
+ assert!(!notice.contains(&("08".to_string(), "B-TITLE".to_string())));
3151
+
3152
+ let full = labels_for("[POPGO][Soukyuu_no_Fafner_Exodus][01_Full][GB][720p]");
3153
+ assert!(full.contains(&("01".to_string(), "B-EPISODE".to_string())));
3154
+ assert!(!full.contains(&("01".to_string(), "B-TITLE".to_string())));
3155
+
3156
+ let r18 = labels_for("[HYSUB]Skirt no Naka wa Kedamono Deshita.[01_R18][BIG5_MP4][1280X720]");
3157
+ assert!(r18.contains(&("01".to_string(), "B-EPISODE".to_string())));
3158
+ assert!(!r18.contains(&("01".to_string(), "B-TITLE".to_string())));
3159
+
3160
+ let ddp = labels_for("Akuma.Kun.S01E02.1080p.NF.WEB-DL.DDP5.1.H.264");
3161
+ assert!(ddp.contains(&("02".to_string(), "B-EPISODE".to_string())));
3162
+ assert!(!ddp.contains(&("1".to_string(), "B-EPISODE".to_string())));
3163
+ assert!(ddp.iter().any(|(token, label)| token.starts_with("DDP") && label == "B-SOURCE"));
3164
+
3165
+ let aac_space = labels_for("Bleach S01E02 AAC 2.0 H.264");
3166
+ assert!(aac_space.contains(&("02".to_string(), "B-EPISODE".to_string())));
3167
+ assert!(!aac_space.contains(&("2".to_string(), "B-EPISODE".to_string())));
3168
+ assert!(aac_space
3169
+ .iter()
3170
+ .any(|(token, label)| token.starts_with("AAC") && label == "B-SOURCE"));
3171
+
3172
+ let bare_resolution = labels_for("日本桥15.03.30 720");
3173
+ assert!(bare_resolution.contains(&("日本桥".to_string(), "B-TITLE".to_string())));
3174
+ assert!(bare_resolution.contains(&("720".to_string(), "B-RESOLUTION".to_string())));
3175
+ assert!(!bare_resolution.contains(&("720".to_string(), "B-EPISODE".to_string())));
3176
+
3177
+ let air_episode = labels_for("Air 01");
3178
+ assert!(air_episode.contains(&("Air".to_string(), "B-TITLE".to_string())));
3179
+ assert!(air_episode.contains(&("01".to_string(), "B-EPISODE".to_string())));
3180
+
3181
+ let decimal_episode = labels_for("[HoneyGod] Usagi Drop [02.5][x264_10bit][粤日双语][BDrip_1080p]");
3182
+ assert!(decimal_episode.contains(&("02".to_string(), "B-EPISODE".to_string())));
3183
+ assert!(decimal_episode.contains(&(".".to_string(), "B-EPISODE".to_string())));
3184
+ assert!(decimal_episode.contains(&("5".to_string(), "B-EPISODE".to_string())));
3185
+
3186
+ let spy = labels_for("[Studio GreenTea] Spy x Family [38][WebRip][HEVC-10bit 1080p AAC ASSx2]");
3187
+ assert!(spy.contains(&("Studio".to_string(), "B-GROUP".to_string())));
3188
+ assert!(spy.contains(&("Spy".to_string(), "B-TITLE".to_string())));
3189
+ assert!(spy.contains(&("x".to_string(), "B-TITLE".to_string())));
3190
+ assert!(spy.contains(&("Family".to_string(), "B-TITLE".to_string())));
3191
+ assert!(spy.contains(&("38".to_string(), "B-EPISODE".to_string())));
3192
+ assert!(!spy.contains(&("Spy".to_string(), "B-SPECIAL".to_string())));
3193
+
3194
+ let spy_s3 = labels_for("[Feibanyama] SPY x FAMILY S3 - 01 [IQIYI WebRip 2160p HEVC-10bit OPUS Multi-Subs]");
3195
+ assert!(spy_s3.contains(&("Feibanyama".to_string(), "B-GROUP".to_string())));
3196
+ assert!(spy_s3.contains(&("SPY".to_string(), "B-TITLE".to_string())));
3197
+ assert!(spy_s3.contains(&("FAMILY".to_string(), "B-TITLE".to_string())));
3198
+ assert!(spy_s3.contains(&("3".to_string(), "B-SEASON".to_string())));
3199
+ assert!(spy_s3.contains(&("01".to_string(), "B-EPISODE".to_string())));
3200
+
3201
+ let slime = labels_for("[Nekomoe kissaten&VCB-Studio] Slime 300 [Menu01][Ma10p_1080p][x265_flac]");
3202
+ assert!(slime.contains(&("Slime".to_string(), "B-TITLE".to_string())));
3203
+ assert!(
3204
+ slime.contains(&("300".to_string(), "B-TITLE".to_string())),
3205
+ "{slime:?}"
3206
+ );
3207
+ assert!(!slime.contains(&("300".to_string(), "B-EPISODE".to_string())));
3208
+
3209
+ let kamisama =
3210
+ labels_for("[SFEO-Raws] Kamisama Hajimemashita 2 - 01 (BD 720P x264 10bit AAC)");
3211
+ assert!(kamisama.contains(&("Kamisama".to_string(), "B-TITLE".to_string())));
3212
+ assert!(kamisama.contains(&("2".to_string(), "B-TITLE".to_string())));
3213
+ assert!(kamisama.contains(&("01".to_string(), "B-EPISODE".to_string())));
3214
  }
3215
 
3216
  #[test]
 
3220
  assert!(was_trimmed);
3221
  assert_eq!(
3222
  trimmed,
3223
+ "Season 4 E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p"
3224
  );
3225
  let pokemon = "Pokémon Season 2 - Orange League [Ep. 83-118]/Pokemon - 084 - OL002 - A Scare in the Air [DVD][PM-Dragon-x264-AC3][00270AC8]";
3226
  let (trimmed_pokemon, pokemon_was_trimmed) = training_filename_for(pokemon);
 
3277
  assert!(was_trimmed);
3278
  assert_eq!(trimmed, "Avatar The Last Airbender S2 14 [1080p]");
3279
 
3280
+ let plain_season_dir = "Season 1/[Kamigami] Junjou Romantica 1 - 01 [BD 1280x720 x264 AAC Sub(Chs,Jap)]";
3281
+ let (trimmed, was_trimmed) = training_filename_for(plain_season_dir);
3282
+ assert!(was_trimmed);
3283
+ assert_eq!(
3284
+ trimmed,
3285
+ "Season 1 [Kamigami] Junjou Romantica 1 - 01 [BD 1280x720 x264 AAC Sub(Chs,Jap)]"
3286
+ );
3287
+ let plain_season_labels = labels_for(&trimmed);
3288
+ assert!(plain_season_labels.contains(&("1".to_string(), "B-SEASON".to_string())));
3289
+ assert!(plain_season_labels.contains(&("01".to_string(), "B-EPISODE".to_string())));
3290
+
3291
+ let menu_parent =
3292
+ "[Airota&ANK-Raws] 亜人ちゃんは語りたい (BDrip 1920x1080 HEVC-YUV420P10 FLAC SUP)/Menu (Vol.1)";
3293
+ let (trimmed, was_trimmed) = training_filename_for(menu_parent);
3294
+ assert!(was_trimmed);
3295
+ assert_eq!(trimmed, "[Airota&ANK-Raws] 亜人ちゃんは語りたい Menu (Vol.1)");
3296
+
3297
+ assert!(has_encoding_noise(
3298
+ "[4K_SDR][DBD-Raws&HKG瀛楀箷绲刔[鏃ュ湪鏍″湌][01][2160P]"
3299
+ ));
3300
+
3301
  let tintin = "Adventures of Tintin (1991) Season 1-3 S01-S03 (1080p BluRay x265 HEVC 10bit EAC3 2.0 Garshasp)/Season 1/Adventures of Tintin (1991) - S01E06 - Cigars of the Pharaoh (Part 1) (1080p BluRay x265 Garshasp)";
3302
  let (trimmed, was_trimmed) = training_filename_for(tintin);
3303
  assert!(was_trimmed);
 
3338
  "Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p";
3339
  let (trimmed, was_trimmed) = training_filename_for(&format!("Batch/{woody_parent}"));
3340
  assert!(was_trimmed);
3341
+ assert_eq!(
3342
+ trimmed,
3343
+ "Season 4 E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p"
3344
+ );
3345
+
3346
+ let najica = "[2001] Najica_七虹香電擊作戰(ナジカ電撃作戦)_TV/SourceUnknown.RMVB.640x480.twHard/01";
3347
+ let (trimmed, was_trimmed) = training_filename_for(najica);
3348
+ assert!(was_trimmed);
3349
+ assert_eq!(trimmed, "[2001] Najica_七虹香電擊作戰(ナジカ電撃作戦) 01");
3350
+ let najica_labels = labels_for(&trimmed);
3351
+ assert!(najica_labels.contains(&("Najica".to_string(), "B-TITLE".to_string())));
3352
+ assert!(!najica_labels.contains(&("SourceUnknown".to_string(), "B-TITLE".to_string())));
3353
+ assert!(najica_labels.contains(&("01".to_string(), "B-EPISODE".to_string())));
3354
+
3355
+ let galient = "[1984-1986] Galient_機甲界(機甲界ガリアン)_TV.OVA/[1984-1985] Galient_機甲界(機甲界ガリアン)_TV/DVDRip.MKV.720x480.ruSub.左右黑邊保留/01";
3356
+ let (trimmed, was_trimmed) = training_filename_for(galient);
3357
+ assert!(was_trimmed);
3358
+ assert_eq!(
3359
+ trimmed,
3360
+ "[1984-1985] Galient_機甲界(機甲界ガリアン) 01"
3361
+ );
3362
+ let galient_labels = labels_for(&trimmed);
3363
+ assert!(galient_labels.contains(&("Galient".to_string(), "B-TITLE".to_string())));
3364
+ assert!(!galient_labels.contains(&("TV".to_string(), "B-TITLE".to_string())));
3365
+ assert!(galient_labels.contains(&("01".to_string(), "B-EPISODE".to_string())));
3366
+
3367
+ let nced = "[BDrip] Ao no Exorcist Yuki no Hate Hen S04 [343-Labs]/NCED";
3368
+ let (trimmed, was_trimmed) = training_filename_for(nced);
3369
+ assert!(was_trimmed);
3370
+ assert_eq!(trimmed, "[BDrip] Ao no Exorcist Yuki no Hate Hen S04 [343-Labs] NCED");
3371
+
3372
+ let sakura = "Card Captor Sakura Chinese/魔卡少女樱(台配国语)/第01集 小樱与不可思议的魔法书";
3373
+ let (trimmed, was_trimmed) = training_filename_for(sakura);
3374
+ assert!(was_trimmed);
3375
+ assert_eq!(
3376
+ trimmed,
3377
+ "魔卡少女樱(台配国语) 第01集 小樱与不可思议的魔法书"
3378
+ );
3379
+ let sakura_labels = labels_for(&trimmed);
3380
+ assert!(sakura_labels.contains(&("魔卡少女樱".to_string(), "B-TITLE".to_string())));
3381
+ assert!(sakura_labels.contains(&("01".to_string(), "B-EPISODE".to_string())));
3382
 
3383
  let volume =
3384
  labels_for("[Snow-Raws] 生徒会役員共 Vol.01 MENU02 (BD 1920x1080 HEVC-YUV420P10 FLAC)");