ModerRAS committed on
Commit
f484458
·
1 Parent(s): b7152ef

Tighten DMHY template label heuristics

Browse files
tools/rust_dmhy_template_apply/src/main.rs CHANGED
@@ -201,10 +201,18 @@ static CJK_SEASON_TOKEN_RE: Lazy<Regex> =
201
  static CJK_SEASON_EMBEDDED_RE: Lazy<Regex> = Lazy::new(|| {
202
  Regex::new(r"^(.+?)(第[一二三四五六七八九十\d]+[季期部])(.{0,12})$").unwrap()
203
  });
 
 
 
 
204
  static ASCII_SEASON_SUFFIX_RE: Lazy<Regex> =
205
  Lazy::new(|| Regex::new(r"(?i)^(.+?)[\s_.-]+(S\d{1,2})$").unwrap());
206
  static ORDINAL_SEASON_TOKEN_RE: Lazy<Regex> =
207
  Lazy::new(|| Regex::new(r"(?i)^\d{1,2}(?:st|nd|rd|th)$").unwrap());
 
 
 
 
208
  static SEASON_WORD_RE: Lazy<Regex> =
209
  Lazy::new(|| Regex::new(r"(?i)^(?:Season|Saison)$").unwrap());
210
  static CJK_TITLE_LANG_PREFIX_RE: Lazy<Regex> =
@@ -1764,7 +1772,7 @@ fn has_encoding_noise(value: &str) -> bool {
1764
  let markers = [
1765
  "譁", "蜈", "螟", "蟄", "謇", "邱", "荳", "縺", "繧", "莨", "鬆", "髯", "瀛",
1766
  "楀", "箷", "绲", "刔", "鏃", "湪", "鏍", "犲", "儚", "鐗", "吀", "铦", "躲",
1767
- "伄", "椋", "伓", "姘", "帽",
1768
  ];
1769
  let marker_hits = markers
1770
  .iter()
@@ -1783,6 +1791,11 @@ fn has_encoding_noise(value: &str) -> bool {
1783
  fn has_non_anime_noise(value: &str) -> bool {
1784
  let normalized = value.replace('\\', "/").trim().to_ascii_lowercase();
1785
  normalized == "mtv" || normalized.starts_with("mtv/") || normalized.contains("/mtv/")
 
 
 
 
 
1786
  }
1787
 
1788
  fn normalized_path_segment(value: &str) -> String {
@@ -2235,6 +2248,7 @@ const KNOWN_TITLE_PHRASES: &[&[&str]] = &[
2235
  &["Phantasy", "Star", "Online", "2", "Episode", "Oracle"],
2236
  &["Durarara", "2", "Ketsu"],
2237
  &["Ghiblies", "Episode", "2"],
 
2238
  &["Lupin The Thrid Jigen Daisuke no Bohyou"],
2239
  &["Lupin The Third Jigen Daisuke no Bohyou"],
2240
  ];
@@ -2705,6 +2719,25 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
2705
  output[index] = "O".to_string();
2706
  continue;
2707
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2708
  if roles[index] == "TITLE" && matches!(text.as_str(), "第" | "話" | "话" | "回" | "集")
2709
  {
2710
  output[index] = "O".to_string();
@@ -3039,6 +3072,11 @@ fn normalize_title_token(token: &str) -> (Vec<String>, Vec<String>) {
3039
  labels.push("B-SEASON".to_string());
3040
  continue;
3041
  }
 
 
 
 
 
3042
  if let Some(caps) = ASCII_SEASON_SUFFIX_RE.captures(&piece) {
3043
  let before = caps.get(1).map(|m| m.as_str()).unwrap_or_default();
3044
  let season = caps.get(2).map(|m| m.as_str()).unwrap_or_default();
@@ -3066,6 +3104,33 @@ fn normalize_title_token(token: &str) -> (Vec<String>, Vec<String>) {
3066
  }
3067
  continue;
3068
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3069
  output_pieces.push(piece);
3070
  labels.push("B-TITLE".to_string());
3071
  }
@@ -3227,18 +3292,58 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
3227
  let joiners = [
3228
  " ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?",
3229
  "?", ";", ";", ",", ",", "、", "。", "~", "~", "-", "+", "+", "(", ")",
3230
- "(", ")", "[", "]", "【", "】", "<", ">", "<", ">", "「", "」", "「", "」",
3231
- "☆", "♪", "`", "@",
3232
  ];
3233
  let title_terminal_punctuation = ["!", "!", "?", "?"];
3234
  let entity_joiners = [
3235
  " ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?",
3236
  "?", ";", ";", ",", ",", "、", "。", "~", "~", "-", "+", "+", "(", ")",
3237
- "(", ")", "[", "]", "【", "】", "<", ">", "<", ">", "「", "」", "「", "」",
3238
- "☆", "♪", "`", "@", "&", "&",
3239
  ];
3240
  let mut output = labels.to_vec();
3241
  for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3242
  if label == "B-TITLE"
3243
  && token.chars().all(|ch| ch.is_ascii_digit())
3244
  && token.len() == 3
@@ -3283,6 +3388,11 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
3283
  .chars()
3284
  .any(|ch| ch.is_alphanumeric() || ('\u{4e00}'..='\u{9fff}').contains(&ch))
3285
  });
 
 
 
 
 
3286
  let previous_word = previous_title_word.map(|cursor| tokens[cursor].to_ascii_lowercase());
3287
  if previous_title_word.is_some()
3288
  && !matches!(previous_word.as_deref(), Some("lupin"))
@@ -3291,7 +3401,10 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
3291
  continue;
3292
  }
3293
  }
3294
- if label == "B-TITLE" && ORDINAL_SEASON_TOKEN_RE.is_match(token) {
 
 
 
3295
  let next_word = (index + 1..tokens.len()).find(|&cursor| {
3296
  !joiners.contains(&tokens[cursor].as_str())
3297
  && tokens[cursor].chars().any(|ch| ch.is_alphabetic())
@@ -3311,6 +3424,136 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
3311
  continue;
3312
  }
3313
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3314
  if label == "O"
3315
  && token.chars().all(|ch| ch.is_ascii_digit())
3316
  && token.len() <= 3
@@ -3320,8 +3563,8 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
3320
  .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
3321
  let next_non_space = (index + 1..tokens.len())
3322
  .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
3323
- if previous_non_space.is_some_and(|cursor| tokens[cursor] == "[")
3324
- && next_non_space.is_some_and(|cursor| tokens[cursor] == "]")
3325
  && output[..index].iter().any(|label| label == "B-TITLE")
3326
  && output[index + 1..]
3327
  .iter()
@@ -3330,11 +3573,45 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
3330
  output[index] = "B-EPISODE".to_string();
3331
  continue;
3332
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3333
  }
3334
  if label == "B-EPISODE" && token.chars().all(|ch| ch.is_ascii_digit()) {
3335
  let previous_non_space = (0..index)
3336
  .rev()
3337
  .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
 
 
 
 
 
 
 
 
 
 
 
 
 
3338
  if previous_non_space
3339
  .is_some_and(|cursor| matches!(tokens[cursor].as_str(), "×" | "x" | "X"))
3340
  {
@@ -3366,7 +3643,13 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
3366
  }
3367
  break;
3368
  }
3369
- if matches!(previous_word.as_deref(), Some("lesson" | "part" | "no")) {
 
 
 
 
 
 
3370
  output[index] = "O".to_string();
3371
  continue;
3372
  }
@@ -3851,6 +4134,10 @@ mod tests {
3851
  assert!(has_encoding_noise(
3852
  "[4K_SDR][DBD-Raws&HKG瀛楀箷绲刔[鏃ュ湪鏍″湌][01][2160P]"
3853
  ));
 
 
 
 
3854
 
3855
  let tintin = "Adventures of Tintin (1991) Season 1-3 S01-S03 (1080p BluRay x265 HEVC 10bit EAC3 2.0 Garshasp)/Season 1/Adventures of Tintin (1991) - S01E06 - Cigars of the Pharaoh (Part 1) (1080p BluRay x265 Garshasp)";
3856
  let (trimmed, was_trimmed) = training_filename_for(tintin);
@@ -4091,6 +4378,49 @@ mod tests {
4091
  assert!(bleach_movie.contains(&("3".to_string(), "B-TITLE".to_string())));
4092
  assert!(!bleach_movie.contains(&("3".to_string(), "B-EPISODE".to_string())));
4093
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4094
  let no_number = labels_for("[甜甜圈字幕组] 小讨厌 081「爷爷的礼物 No.1」");
4095
  assert!(no_number.contains(&("081".to_string(), "B-EPISODE".to_string())));
4096
  assert!(!no_number.contains(&("1".to_string(), "B-EPISODE".to_string())));
@@ -4110,6 +4440,16 @@ mod tests {
4110
  assert!(hayate.contains(&("Season".to_string(), "B-SEASON".to_string())));
4111
  assert!(hayate.contains(&("23".to_string(), "B-EPISODE".to_string())));
4112
 
 
 
 
 
 
 
 
 
 
 
4113
  let jade = labels_for("[GM-Team][国漫][诛仙 第2季][Jade Dynasty Ⅱ][2024][12][AVC][GB][1080P]");
4114
  assert!(jade.contains(&("Jade".to_string(), "B-TITLE".to_string())));
4115
  assert!(jade.contains(&("Dynasty".to_string(), "B-TITLE".to_string())));
@@ -4122,6 +4462,11 @@ mod tests {
4122
  assert!(yu_no.contains(&("NO".to_string(), "B-TITLE".to_string())));
4123
  assert!(yu_no.contains(&("23".to_string(), "B-EPISODE".to_string())));
4124
 
 
 
 
 
 
4125
  let fox = labels_for(
4126
  "[GM-Team][国漫][狐妖小红娘 尾生篇][Fox Spirit Matchmaker Ⅷ][2019][05][AVC][GB][1080P]",
4127
  );
@@ -4143,6 +4488,68 @@ mod tests {
4143
  assert!(date_live_special.contains(&("Ⅱ".to_string(), "B-SEASON".to_string())));
4144
  assert!(date_live_special.contains(&("CM01".to_string(), "B-SPECIAL".to_string())));
4145
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4146
  let ep_only = dmhy_record("Ep.25", "tpl_test", &suggested_roles("TEXT SEP EPISODE")).unwrap();
4147
  assert!(audit_warnings(&ep_only).contains(&"no_title".to_string()));
4148
  }
 
201
  static CJK_SEASON_EMBEDDED_RE: Lazy<Regex> = Lazy::new(|| {
202
  Regex::new(r"^(.+?)(第[一二三四五六七八九十\d]+[季期部])(.{0,12})$").unwrap()
203
  });
204
+ static CJK_EPISODE_EMBEDDED_RE: Lazy<Regex> =
205
+ Lazy::new(|| Regex::new(r"^(.+?)(第?\d{1,4}[话話回集])(.{0,32})$").unwrap());
206
+ static CJK_TITLE_TRAILING_EPISODE_RE: Lazy<Regex> =
207
+ Lazy::new(|| Regex::new(r"^(.+[\p{Han}\p{Hiragana}\p{Katakana}])(\d{2,3})$").unwrap());
208
  static ASCII_SEASON_SUFFIX_RE: Lazy<Regex> =
209
  Lazy::new(|| Regex::new(r"(?i)^(.+?)[\s_.-]+(S\d{1,2})$").unwrap());
210
  static ORDINAL_SEASON_TOKEN_RE: Lazy<Regex> =
211
  Lazy::new(|| Regex::new(r"(?i)^\d{1,2}(?:st|nd|rd|th)$").unwrap());
212
+ static WORD_ORDINAL_SEASON_TOKEN_RE: Lazy<Regex> = Lazy::new(|| {
213
+ Regex::new(r"(?i)^(?:First|Second|Third|Fourth|Fifth|Sixth|Seventh|Eighth|Ninth|Tenth)$")
214
+ .unwrap()
215
+ });
216
  static SEASON_WORD_RE: Lazy<Regex> =
217
  Lazy::new(|| Regex::new(r"(?i)^(?:Season|Saison)$").unwrap());
218
  static CJK_TITLE_LANG_PREFIX_RE: Lazy<Regex> =
 
1772
  let markers = [
1773
  "譁", "蜈", "螟", "蟄", "謇", "邱", "荳", "縺", "繧", "莨", "鬆", "髯", "瀛",
1774
  "楀", "箷", "绲", "刔", "鏃", "湪", "鏍", "犲", "儚", "鐗", "吀", "铦", "躲",
1775
+ "伄", "椋", "伓", "姘", "帽", "娆", "洖", "浜", "堝", "澶", "湴", "鐒",
1776
  ];
1777
  let marker_hits = markers
1778
  .iter()
 
1791
  fn has_non_anime_noise(value: &str) -> bool {
1792
  let normalized = value.replace('\\', "/").trim().to_ascii_lowercase();
1793
  normalized == "mtv" || normalized.starts_with("mtv/") || normalized.contains("/mtv/")
1794
+ || value.contains("[旅游")
1795
+ || value.contains("[旅游番")
1796
+ || normalized.contains("tokyo deep")
1797
+ || value.contains("日本不思议铁路之旅")
1798
+ || value.contains("ニッポンぶらり鉄道旅")
1799
  }
1800
 
1801
  fn normalized_path_segment(value: &str) -> String {
 
2248
  &["Phantasy", "Star", "Online", "2", "Episode", "Oracle"],
2249
  &["Durarara", "2", "Ketsu"],
2250
  &["Ghiblies", "Episode", "2"],
2251
+ &["Eien", "no", "831"],
2252
  &["Lupin The Thrid Jigen Daisuke no Bohyou"],
2253
  &["Lupin The Third Jigen Daisuke no Bohyou"],
2254
  ];
 
2719
  output[index] = "O".to_string();
2720
  continue;
2721
  }
2722
+ if roles[index] == "TITLE"
2723
+ && matches!(text.as_str(), "TVアニメ" | "テレビアニメ")
2724
+ && output
2725
+ .iter()
2726
+ .enumerate()
2727
+ .any(|(other, role)| other != index && role == "TITLE")
2728
+ {
2729
+ output[index] = "O".to_string();
2730
+ continue;
2731
+ }
2732
+ if output[index] == "TITLE" && text.eq_ignore_ascii_case("Creditless") {
2733
+ let later_special = output[index + 1..]
2734
+ .iter()
2735
+ .any(|role| role == "SPECIAL");
2736
+ if later_special {
2737
+ output[index] = "SPECIAL".to_string();
2738
+ continue;
2739
+ }
2740
+ }
2741
  if roles[index] == "TITLE" && matches!(text.as_str(), "第" | "話" | "话" | "回" | "集")
2742
  {
2743
  output[index] = "O".to_string();
 
3072
  labels.push("B-SEASON".to_string());
3073
  continue;
3074
  }
3075
+ if EPISODE_CJK_RE.is_match(&piece) {
3076
+ output_pieces.push(piece);
3077
+ labels.push("B-EPISODE".to_string());
3078
+ continue;
3079
+ }
3080
  if let Some(caps) = ASCII_SEASON_SUFFIX_RE.captures(&piece) {
3081
  let before = caps.get(1).map(|m| m.as_str()).unwrap_or_default();
3082
  let season = caps.get(2).map(|m| m.as_str()).unwrap_or_default();
 
3104
  }
3105
  continue;
3106
  }
3107
+ if let Some(caps) = CJK_EPISODE_EMBEDDED_RE.captures(&piece) {
3108
+ let before = caps.get(1).map(|m| m.as_str()).unwrap_or_default();
3109
+ let episode = caps.get(2).map(|m| m.as_str()).unwrap_or_default();
3110
+ let after = caps.get(3).map(|m| m.as_str()).unwrap_or_default();
3111
+ if !before.is_empty() {
3112
+ output_pieces.push(before.to_string());
3113
+ labels.push("B-TITLE".to_string());
3114
+ }
3115
+ output_pieces.push(episode.to_string());
3116
+ labels.push("B-EPISODE".to_string());
3117
+ if !after.is_empty() {
3118
+ output_pieces.push(after.to_string());
3119
+ labels.push("O".to_string());
3120
+ }
3121
+ continue;
3122
+ }
3123
+ if let Some(caps) = CJK_TITLE_TRAILING_EPISODE_RE.captures(&piece) {
3124
+ let before = caps.get(1).map(|m| m.as_str()).unwrap_or_default();
3125
+ let episode = caps.get(2).map(|m| m.as_str()).unwrap_or_default();
3126
+ if !before.is_empty() {
3127
+ output_pieces.push(before.to_string());
3128
+ labels.push("B-TITLE".to_string());
3129
+ }
3130
+ output_pieces.push(episode.to_string());
3131
+ labels.push("B-EPISODE".to_string());
3132
+ continue;
3133
+ }
3134
  output_pieces.push(piece);
3135
  labels.push("B-TITLE".to_string());
3136
  }
 
3292
  let joiners = [
3293
  " ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?",
3294
  "?", ";", ";", ",", ",", "、", "。", "~", "~", "-", "+", "+", "(", ")",
3295
+ "(", ")", "[", "]", "【", "】", "<", ">", "<", ">", "「", "」", "「", "」", "《", "》",
3296
+ "☆", "♪", "`", "@", "‐", "‑", "–", "—", "−", "$", "$", "∽", "꞉", "♥",
3297
  ];
3298
  let title_terminal_punctuation = ["!", "!", "?", "?"];
3299
  let entity_joiners = [
3300
  " ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?",
3301
  "?", ";", ";", ",", ",", "、", "。", "~", "~", "-", "+", "+", "(", ")",
3302
+ "(", ")", "[", "]", "【", "】", "<", ">", "<", ">", "「", "」", "「", "」", "《", "》",
3303
+ "☆", "♪", "`", "@", "&", "&", "‐", "‑", "–", "—", "−", "$", "$", "∽", "꞉", "♥",
3304
  ];
3305
  let mut output = labels.to_vec();
3306
  for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
3307
+ if label == "B-TITLE"
3308
+ && token == "TV"
3309
+ && index + 1 < tokens.len()
3310
+ && tokens[index + 1] == "アニメ"
3311
+ && output[index + 2..].iter().any(|label| label == "B-TITLE")
3312
+ {
3313
+ output[index] = "O".to_string();
3314
+ output[index + 1] = "O".to_string();
3315
+ continue;
3316
+ }
3317
+ if label == "B-TITLE"
3318
+ && token == "アニメ"
3319
+ && output[index + 1..].iter().any(|label| label == "B-TITLE")
3320
+ {
3321
+ output[index] = "O".to_string();
3322
+ continue;
3323
+ }
3324
+ if label == "B-TITLE" && token.eq_ignore_ascii_case("part") {
3325
+ let next_number = (index + 1..tokens.len()).find(|&cursor| {
3326
+ !joiners.contains(&tokens[cursor].as_str())
3327
+ && !tokens[cursor].chars().all(char::is_whitespace)
3328
+ });
3329
+ let nearby_lupin = tokens[..index]
3330
+ .iter()
3331
+ .rev()
3332
+ .take(8)
3333
+ .any(|item| item.eq_ignore_ascii_case("lupin"))
3334
+ || tokens[index + 1..]
3335
+ .iter()
3336
+ .take(12)
3337
+ .any(|item| item.eq_ignore_ascii_case("lupin"));
3338
+ if nearby_lupin
3339
+ && next_number.is_some_and(|cursor| {
3340
+ tokens[cursor].chars().all(|ch| ch.is_ascii_digit()) && tokens[cursor].len() <= 2
3341
+ })
3342
+ {
3343
+ output[index] = "B-SEASON".to_string();
3344
+ continue;
3345
+ }
3346
+ }
3347
  if label == "B-TITLE"
3348
  && token.chars().all(|ch| ch.is_ascii_digit())
3349
  && token.len() == 3
 
3388
  .chars()
3389
  .any(|ch| ch.is_alphanumeric() || ('\u{4e00}'..='\u{9fff}').contains(&ch))
3390
  });
3391
+ let later_episode = (index + 1..tokens.len()).any(|cursor| labels[cursor] == "B-EPISODE");
3392
+ if previous_title_word.is_none() && later_episode {
3393
+ output[index] = "B-SEASON".to_string();
3394
+ continue;
3395
+ }
3396
  let previous_word = previous_title_word.map(|cursor| tokens[cursor].to_ascii_lowercase());
3397
  if previous_title_word.is_some()
3398
  && !matches!(previous_word.as_deref(), Some("lupin"))
 
3401
  continue;
3402
  }
3403
  }
3404
+ if label == "B-TITLE"
3405
+ && (ORDINAL_SEASON_TOKEN_RE.is_match(token)
3406
+ || WORD_ORDINAL_SEASON_TOKEN_RE.is_match(token))
3407
+ {
3408
  let next_word = (index + 1..tokens.len()).find(|&cursor| {
3409
  !joiners.contains(&tokens[cursor].as_str())
3410
  && tokens[cursor].chars().any(|ch| ch.is_alphabetic())
 
3424
  continue;
3425
  }
3426
  }
3427
+ if label == "O"
3428
+ && (EPISODE_CJK_RE.is_match(token)
3429
+ || EPISODE_VALUE_RE.is_match(token)
3430
+ || EPISODE_RANGE_RE.is_match(token))
3431
+ {
3432
+ output[index] = "B-EPISODE".to_string();
3433
+ continue;
3434
+ }
3435
+ if label == "O" && token.chars().all(|ch| ch.is_ascii_digit()) {
3436
+ let previous_non_space = (0..index)
3437
+ .rev()
3438
+ .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
3439
+ let next_non_space = (index + 1..tokens.len())
3440
+ .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
3441
+ if previous_non_space.is_some_and(|cursor| tokens[cursor] == "#") {
3442
+ output[index] = "B-EPISODE".to_string();
3443
+ if next_non_space.is_some_and(|cursor| matches!(tokens[cursor].as_str(), "-" | "~"))
3444
+ {
3445
+ if let Some(separator) = next_non_space {
3446
+ output[separator] = "B-EPISODE".to_string();
3447
+ if let Some(right) = (separator + 1..tokens.len())
3448
+ .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace))
3449
+ {
3450
+ if tokens[right].chars().all(|ch| ch.is_ascii_digit()) {
3451
+ output[right] = "B-EPISODE".to_string();
3452
+ }
3453
+ }
3454
+ }
3455
+ }
3456
+ continue;
3457
+ }
3458
+ if previous_non_space.is_some_and(|cursor| tokens[cursor] == "第")
3459
+ && next_non_space
3460
+ .is_some_and(|cursor| {
3461
+ matches!(tokens[cursor].as_str(), "话" | "話" | "回" | "集")
3462
+ || tokens[cursor].starts_with('话')
3463
+ || tokens[cursor].starts_with('話')
3464
+ || tokens[cursor].starts_with('回')
3465
+ || tokens[cursor].starts_with('集')
3466
+ })
3467
+ {
3468
+ if let Some(cursor) = previous_non_space {
3469
+ output[cursor] = "B-EPISODE".to_string();
3470
+ }
3471
+ output[index] = "B-EPISODE".to_string();
3472
+ if let Some(cursor) = next_non_space {
3473
+ if matches!(tokens[cursor].as_str(), "话" | "話" | "回" | "集") {
3474
+ output[cursor] = "B-EPISODE".to_string();
3475
+ }
3476
+ }
3477
+ continue;
3478
+ }
3479
+ }
3480
+ if matches!(label.as_str(), "B-TITLE" | "O")
3481
+ && token.chars().all(|ch| ch.is_ascii_digit())
3482
+ && token.len() <= 3
3483
+ {
3484
+ let previous_word = (0..index)
3485
+ .rev()
3486
+ .find(|&cursor| {
3487
+ !joiners.contains(&tokens[cursor].as_str())
3488
+ && tokens[cursor].chars().any(|ch| ch.is_alphabetic())
3489
+ })
3490
+ .map(|cursor| tokens[cursor].to_ascii_lowercase());
3491
+ let next_structural = (index + 1..tokens.len())
3492
+ .find(|&cursor| !joiners.contains(&tokens[cursor].as_str()))
3493
+ .map(|cursor| tokens[cursor].as_str());
3494
+ let next_non_space = (index + 1..tokens.len())
3495
+ .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace))
3496
+ .map(|cursor| tokens[cursor].as_str());
3497
+ let later_technical_block = output[index + 1..]
3498
+ .iter()
3499
+ .any(|label| matches!(label.as_str(), "B-SOURCE" | "B-RESOLUTION"));
3500
+ let nearby_lupin_part = previous_word.as_deref() == Some("part")
3501
+ && (tokens[..index]
3502
+ .iter()
3503
+ .rev()
3504
+ .take(8)
3505
+ .any(|item| item.eq_ignore_ascii_case("lupin"))
3506
+ || tokens[index + 1..]
3507
+ .iter()
3508
+ .take(12)
3509
+ .any(|item| item.eq_ignore_ascii_case("lupin")));
3510
+ if nearby_lupin_part {
3511
+ output[index] = "B-SEASON".to_string();
3512
+ continue;
3513
+ }
3514
+ let followed_by_title_word = (index + 1..tokens.len())
3515
+ .find(|&cursor| {
3516
+ !joiners.contains(&tokens[cursor].as_str())
3517
+ && !matches!(tokens[cursor].as_str(), "-" | "-" | "," | "," | ":" | ":")
3518
+ })
3519
+ .is_some_and(|cursor| {
3520
+ !matches!(tokens[cursor].as_str(), "[" | "【" | "(" | "(" | "]" | "】")
3521
+ && output
3522
+ .get(cursor)
3523
+ .is_some_and(|label| label == "B-TITLE")
3524
+ && tokens[cursor].chars().any(|ch| ch.is_alphabetic())
3525
+ });
3526
+ if followed_by_title_word && matches!(previous_word.as_deref(), Some("movie" | "part"))
3527
+ {
3528
+ output[index] = "B-TITLE".to_string();
3529
+ continue;
3530
+ }
3531
+ if (later_technical_block
3532
+ || next_non_space.is_some_and(|token| matches!(token, "[" | "【" | "(" | "("))
3533
+ || next_structural.is_some_and(|token| matches!(token, "[" | "【" | "(" | "(")))
3534
+ && matches!(previous_word.as_deref(), Some("movie" | "part"))
3535
+ {
3536
+ output[index] = "B-SPECIAL".to_string();
3537
+ continue;
3538
+ }
3539
+ let eien_title_number = token == "831"
3540
+ && previous_word.as_deref() == Some("no")
3541
+ && (0..index).any(|cursor| {
3542
+ output[cursor] == "B-TITLE" && tokens[cursor].eq_ignore_ascii_case("Eien")
3543
+ });
3544
+ if eien_title_number {
3545
+ for joiner_index in (0..index).rev() {
3546
+ if tokens[joiner_index].eq_ignore_ascii_case("no") {
3547
+ break;
3548
+ }
3549
+ if joiners.contains(&tokens[joiner_index].as_str()) {
3550
+ output[joiner_index] = "B-TITLE".to_string();
3551
+ }
3552
+ }
3553
+ output[index] = "B-TITLE".to_string();
3554
+ continue;
3555
+ }
3556
+ }
3557
  if label == "O"
3558
  && token.chars().all(|ch| ch.is_ascii_digit())
3559
  && token.len() <= 3
 
3563
  .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
3564
  let next_non_space = (index + 1..tokens.len())
3565
  .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
3566
+ if previous_non_space.is_some_and(|cursor| matches!(tokens[cursor].as_str(), "[" | "【"))
3567
+ && next_non_space.is_some_and(|cursor| matches!(tokens[cursor].as_str(), "]" | "】"))
3568
  && output[..index].iter().any(|label| label == "B-TITLE")
3569
  && output[index + 1..]
3570
  .iter()
 
3573
  output[index] = "B-EPISODE".to_string();
3574
  continue;
3575
  }
3576
+ if previous_non_space.is_some_and(|cursor| matches!(tokens[cursor].as_str(), "-" | "-"))
3577
+ && output[..index].iter().any(|label| label == "B-TITLE")
3578
+ && output[index + 1..]
3579
+ .iter()
3580
+ .any(|label| matches!(label.as_str(), "B-SOURCE" | "B-RESOLUTION"))
3581
+ {
3582
+ output[index] = "B-EPISODE".to_string();
3583
+ continue;
3584
+ }
3585
+ if next_non_space.is_none()
3586
+ && previous_non_space.is_some_and(|cursor| {
3587
+ output[cursor] == "B-TITLE"
3588
+ && tokens[cursor].chars().any(|ch| {
3589
+ ('\u{4e00}'..='\u{9fff}').contains(&ch)
3590
+ || ('\u{3040}'..='\u{30ff}').contains(&ch)
3591
+ })
3592
+ })
3593
+ {
3594
+ output[index] = "B-EPISODE".to_string();
3595
+ continue;
3596
+ }
3597
  }
3598
  if label == "B-EPISODE" && token.chars().all(|ch| ch.is_ascii_digit()) {
3599
  let previous_non_space = (0..index)
3600
  .rev()
3601
  .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
3602
+ let next_non_space = (index + 1..tokens.len())
3603
+ .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
3604
+ if previous_non_space.is_some_and(|cursor| tokens[cursor] == "第")
3605
+ && next_non_space
3606
+ .is_some_and(|cursor| matches!(tokens[cursor].as_str(), "话" | "話" | "回" | "集"))
3607
+ {
3608
+ if let Some(cursor) = previous_non_space {
3609
+ output[cursor] = "B-EPISODE".to_string();
3610
+ }
3611
+ if let Some(cursor) = next_non_space {
3612
+ output[cursor] = "B-EPISODE".to_string();
3613
+ }
3614
+ }
3615
  if previous_non_space
3616
  .is_some_and(|cursor| matches!(tokens[cursor].as_str(), "×" | "x" | "X"))
3617
  {
 
3643
  }
3644
  break;
3645
  }
3646
+ let previous_non_space = (0..index)
3647
+ .rev()
3648
+ .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace))
3649
+ .map(|cursor| tokens[cursor].as_str());
3650
+ if matches!(previous_word.as_deref(), Some("lesson" | "part"))
3651
+ || (previous_word.as_deref() == Some("no") && previous_non_space == Some("."))
3652
+ {
3653
  output[index] = "O".to_string();
3654
  continue;
3655
  }
 
4134
  assert!(has_encoding_noise(
4135
  "[4K_SDR][DBD-Raws&HKG瀛楀箷绲刔[鏃ュ湪鏍″湌][01][2160P]"
4136
  ));
4137
+ assert!(has_encoding_noise("ATRI -My Dear Moments-/娆″洖浜堝憡 EP01 Log01"));
4138
+ assert!(has_non_anime_noise(
4139
+ "13-[旅游番][花丸字幕组][日本不思议铁路之旅][15.03.19-16.02.03][720&1080][中日双语]/铁道旅 15.03.19 720"
4140
+ ));
4141
 
4142
  let tintin = "Adventures of Tintin (1991) Season 1-3 S01-S03 (1080p BluRay x265 HEVC 10bit EAC3 2.0 Garshasp)/Season 1/Adventures of Tintin (1991) - S01E06 - Cigars of the Pharaoh (Part 1) (1080p BluRay x265 Garshasp)";
4143
  let (trimmed, was_trimmed) = training_filename_for(tintin);
 
4378
  assert!(bleach_movie.contains(&("3".to_string(), "B-TITLE".to_string())));
4379
  assert!(!bleach_movie.contains(&("3".to_string(), "B-EPISODE".to_string())));
4380
 
4381
+ let conan_movie =
4382
+ labels_for("[DBD-Raws][Detective Conan Movie 27 The Million-Dollar Pentagram][PV][01][1080P]");
4383
+ assert!(conan_movie.contains(&("27".to_string(), "B-TITLE".to_string())));
4384
+ assert!(conan_movie.contains(&("PV".to_string(), "B-SPECIAL".to_string())));
4385
+
4386
+ let madoka_movie =
4387
+ labels_for("[DBD-Raws][Puella Magi Madoka Magica the Movie 01 Beginnings][NCED][1080P]");
4388
+ assert!(madoka_movie.contains(&("01".to_string(), "B-TITLE".to_string())));
4389
+ assert!(madoka_movie.contains(&("Beginnings".to_string(), "B-TITLE".to_string())));
4390
+
4391
+ let fate_first_order =
4392
+ labels_for("[DBD-Raws][Fate Grand Order ‐First Order‐][PV][01][1080P]");
4393
+ assert!(fate_first_order.contains(&("Fate".to_string(), "B-TITLE".to_string())));
4394
+ assert!(fate_first_order.contains(&("‐".to_string(), "B-TITLE".to_string())));
4395
+ assert!(fate_first_order.contains(&("First".to_string(), "B-TITLE".to_string())));
4396
+
4397
+ let trillion_game = labels_for("[ANi] 一兆$遊戲 - 03 [1080P][Baha][WEB-DL][AAC AVC][CHT]");
4398
+ assert!(trillion_game.contains(&("一兆".to_string(), "B-TITLE".to_string())));
4399
+ assert!(trillion_game.contains(&("$".to_string(), "B-TITLE".to_string())));
4400
+ assert!(trillion_game.contains(&("遊戲".to_string(), "B-TITLE".to_string())));
4401
+
4402
+ let lapis = labels_for("[Nekomoe kissaten&LoliHouse] Lapis Re꞉LiGHTs - PV01 [BDRip 1080p]");
4403
+ assert!(lapis.contains(&("Re".to_string(), "B-TITLE".to_string())));
4404
+ assert!(lapis.contains(&("꞉".to_string(), "B-TITLE".to_string())));
4405
+ assert!(lapis.contains(&("LiGHTs".to_string(), "B-TITLE".to_string())));
4406
+
4407
+ let rezero = labels_for("TVアニメ『Re:ゼロから始める異世界生活』第10話「鬼がかったやり方」予告");
4408
+ assert!(!rezero.contains(&("TV".to_string(), "B-TITLE".to_string())));
4409
+ assert!(!rezero.contains(&("アニメ".to_string(), "B-TITLE".to_string())));
4410
+ assert!(rezero.contains(&("Re".to_string(), "B-TITLE".to_string())));
4411
+ assert!(rezero.contains(&("第".to_string(), "B-EPISODE".to_string())));
4412
+ assert!(rezero.contains(&("話".to_string(), "B-EPISODE".to_string())));
4413
+
4414
+ let shark = labels_for("アニメ『おでかけ子ザメ』第10話「かじゅえん」");
4415
+ assert!(!shark.contains(&("アニメ".to_string(), "B-TITLE".to_string())));
4416
+ assert!(shark.contains(&("おでかけ子ザメ".to_string(), "B-TITLE".to_string())));
4417
+
4418
+ let creditless = labels_for(
4419
+ "[ANK-Raws] デート・ア・ライブⅡ Creditless ED (Bdrip 1920x1080 HEVC FLAC)",
4420
+ );
4421
+ assert!(creditless.contains(&("Creditless".to_string(), "B-SPECIAL".to_string())));
4422
+ assert!(creditless.contains(&("ED".to_string(), "B-SPECIAL".to_string())));
4423
+
4424
  let no_number = labels_for("[甜甜圈字幕组] 小讨厌 081「爷爷的礼物 No.1」");
4425
  assert!(no_number.contains(&("081".to_string(), "B-EPISODE".to_string())));
4426
  assert!(!no_number.contains(&("1".to_string(), "B-EPISODE".to_string())));
 
4440
  assert!(hayate.contains(&("Season".to_string(), "B-SEASON".to_string())));
4441
  assert!(hayate.contains(&("23".to_string(), "B-EPISODE".to_string())));
4442
 
4443
+ let yama = labels_for("[A.I.R.nesSub][Yama_no_Susume_Second_Season][08][720p]");
4444
+ assert!(yama.contains(&("Yama".to_string(), "B-TITLE".to_string())));
4445
+ assert!(yama.contains(&("Second".to_string(), "B-SEASON".to_string())));
4446
+ assert!(yama.contains(&("Season".to_string(), "B-SEASON".to_string())));
4447
+
4448
+ let one_room = labels_for("[DMG][One Room Second Season][00][1080P][BIG5]");
4449
+ assert!(one_room.contains(&("One".to_string(), "B-TITLE".to_string())));
4450
+ assert!(one_room.contains(&("Second".to_string(), "B-SEASON".to_string())));
4451
+ assert!(one_room.contains(&("Season".to_string(), "B-SEASON".to_string())));
4452
+
4453
  let jade = labels_for("[GM-Team][国漫][诛仙 第2季][Jade Dynasty Ⅱ][2024][12][AVC][GB][1080P]");
4454
  assert!(jade.contains(&("Jade".to_string(), "B-TITLE".to_string())));
4455
  assert!(jade.contains(&("Dynasty".to_string(), "B-TITLE".to_string())));
 
4462
  assert!(yu_no.contains(&("NO".to_string(), "B-TITLE".to_string())));
4463
  assert!(yu_no.contains(&("23".to_string(), "B-EPISODE".to_string())));
4464
 
4465
+ let yu_no_dash =
4466
+ labels_for("[LowPower-Raws] この世の果てで恋を唄う少女YU-NO - 01 (BD 1080P x264 FLAC)");
4467
+ assert!(yu_no_dash.contains(&("NO".to_string(), "B-TITLE".to_string())));
4468
+ assert!(yu_no_dash.contains(&("01".to_string(), "B-EPISODE".to_string())));
4469
+
4470
  let fox = labels_for(
4471
  "[GM-Team][国漫][狐妖小红娘 尾生篇][Fox Spirit Matchmaker Ⅷ][2019][05][AVC][GB][1080P]",
4472
  );
 
4488
  assert!(date_live_special.contains(&("Ⅱ".to_string(), "B-SEASON".to_string())));
4489
  assert!(date_live_special.contains(&("CM01".to_string(), "B-SPECIAL".to_string())));
4490
 
4491
+ let lupin_part =
4492
+ labels_for("[SnowDream][Part 5_Lupin Sansei Part 5][01][BIG5][720P]");
4493
+ assert!(lupin_part.contains(&("Lupin".to_string(), "B-TITLE".to_string())));
4494
+ assert!(lupin_part.contains(&("Sansei".to_string(), "B-TITLE".to_string())));
4495
+ assert!(!lupin_part.contains(&("Part".to_string(), "B-TITLE".to_string())));
4496
+ assert!(lupin_part.contains(&("5".to_string(), "B-SEASON".to_string())));
4497
+ assert!(!lupin_part.contains(&("5".to_string(), "B-SPECIAL".to_string())));
4498
+
4499
+ let roman_leaf = dmhy_record("Ⅰ 001 魯邦燃起了鬥志", "tpl_test", &suggested_roles("TEXT SEP EPISODE SEP TEXT")).unwrap();
4500
+ assert!(roman_leaf
4501
+ .tokens
4502
+ .iter()
4503
+ .zip(roman_leaf.labels.iter())
4504
+ .any(|(token, label)| token == "Ⅰ" && label == "B-SEASON"));
4505
+ assert!(audit_warnings(&roman_leaf).contains(&"no_title".to_string()));
4506
+
4507
+ let hallow = labels_for("[c.c动漫 ccwzz.cc][驱魔少年HALLOW][第09话][GB][720p]");
4508
+ assert!(hallow.contains(&("驱魔少年HALLOW".to_string(), "B-TITLE".to_string())));
4509
+ assert!(hallow.contains(&("第09话".to_string(), "B-EPISODE".to_string())));
4510
+
4511
+ let fairy = labels_for("[魔導少年 最終章][EP35][繁体][1080P]");
4512
+ assert!(fairy.contains(&("魔導少年".to_string(), "B-TITLE".to_string())));
4513
+ assert!(fairy.contains(&("EP35".to_string(), "B-EPISODE".to_string())));
4514
+
4515
+ let mebius = labels_for("【CXRAW】【ウルトラマンメビウス】【22】【日々の未来】【DVDrip】【x264 Hi10P AAC】【MP4】");
4516
+ assert!(mebius.contains(&("ウルトラマンメビウス".to_string(), "B-TITLE".to_string())));
4517
+ assert!(mebius.contains(&("22".to_string(), "B-EPISODE".to_string())));
4518
+
4519
+ let battle = labels_for("斗破苍穹三年之约第01话");
4520
+ assert!(battle.contains(&("斗破苍穹三年之约".to_string(), "B-TITLE".to_string())));
4521
+ assert!(battle.contains(&("第".to_string(), "B-EPISODE".to_string())));
4522
+ assert!(battle.contains(&("01".to_string(), "B-EPISODE".to_string())));
4523
+ assert!(battle.contains(&("话".to_string(), "B-EPISODE".to_string())));
4524
+
4525
+ let hakumei = labels_for("妖精森林的小不点01");
4526
+ assert!(hakumei.contains(&("妖精森林的小不点".to_string(), "B-TITLE".to_string())));
4527
+ assert!(hakumei.contains(&("01".to_string(), "B-EPISODE".to_string())));
4528
+
4529
+ let decimal_episode_title = labels_for("无限系统树:第1话可能性的起点");
4530
+ assert!(decimal_episode_title.contains(&("无限系统树".to_string(), "B-TITLE".to_string())));
4531
+ assert!(decimal_episode_title.contains(&("第".to_string(), "B-EPISODE".to_string())));
4532
+ assert!(decimal_episode_title.contains(&("1".to_string(), "B-EPISODE".to_string())));
4533
+
4534
+ let hash_range = labels_for("花田少年史#1-3");
4535
+ assert!(hash_range.contains(&("花田少年史".to_string(), "B-TITLE".to_string())));
4536
+ assert!(hash_range.contains(&("1".to_string(), "B-EPISODE".to_string())));
4537
+ assert!(hash_range.contains(&("-".to_string(), "B-EPISODE".to_string())));
4538
+ assert!(hash_range.contains(&("3".to_string(), "B-EPISODE".to_string())));
4539
+
4540
+ let movie_number = labels_for("[Kamigami] Haikyuu!! Movie - 01 [BD 1080p x265 Ma10p AAC]");
4541
+ assert!(movie_number.contains(&("Haikyuu".to_string(), "B-TITLE".to_string())));
4542
+ assert!(movie_number.contains(&("01".to_string(), "B-SPECIAL".to_string())));
4543
+ assert!(!movie_number.contains(&("01".to_string(), "B-EPISODE".to_string())));
4544
+
4545
+ let ajin_movie = labels_for("[Moozzi2] Ajin The Movie - 01 (BD 1920x1080 x.264 FLACx2)");
4546
+ assert!(ajin_movie.contains(&("Ajin".to_string(), "B-TITLE".to_string())));
4547
+ assert!(ajin_movie.contains(&("01".to_string(), "B-SPECIAL".to_string())));
4548
+
4549
+ let eien = labels_for("[Nekomoe kissaten&LoliHouse] Eien no 831 [WebRip 1080p HEVC-10bit AAC ASSx2]");
4550
+ assert!(eien.contains(&("Eien".to_string(), "B-TITLE".to_string())));
4551
+ assert!(eien.contains(&("831".to_string(), "B-TITLE".to_string())));
4552
+
4553
  let ep_only = dmhy_record("Ep.25", "tpl_test", &suggested_roles("TEXT SEP EPISODE")).unwrap();
4554
  assert!(audit_warnings(&ep_only).contains(&"no_title".to_string()));
4555
  }