ModerRAS committed on
Commit
aa41de9
·
1 Parent(s): 59042eb

Refine low-frequency DMHY labeling rules

Browse files
tools/rust_dmhy_template_apply/src/main.rs CHANGED
@@ -178,6 +178,8 @@ static EPISODE_RE: Lazy<Regex> =
178
  Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}(?:\.\d{1,2})?(?:END)?$").unwrap());
179
  static DECIMAL_EPISODE_RE: Lazy<Regex> =
180
  Lazy::new(|| Regex::new(r"^\d{1,3}\.\d{1,2}$").unwrap());
 
 
181
  static EPISODE_CJK_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^第?\d{1,4}[话話回集]$").unwrap());
182
  static EPISODE_CJK_PREFIX_RE: Lazy<Regex> =
183
  Lazy::new(|| Regex::new(r"^第?\d{1,4}[话話回集]").unwrap());
@@ -190,12 +192,15 @@ static SXE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S\d{1,2}E\d{1,4}(?:v
190
  static SXE_VALUE_RE: Lazy<Regex> =
191
  Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})E(\d{1,4})(?:v(\d+))?$").unwrap());
192
  static EPISODE_VALUE_RE: Lazy<Regex> =
193
- Lazy::new(|| Regex::new(r"(?i)^(EP|E|#)(\d{1,4})(?:v(\d+))?$").unwrap());
194
  static SEASON_RE: Lazy<Regex> = Lazy::new(|| {
195
  Regex::new(r"(?i)^(?:S\d{1,2}|Season\s*\d{1,2}|第[一二三四五六七八九十\d]+[季期部])$").unwrap()
196
  });
197
  static CJK_SEASON_TOKEN_RE: Lazy<Regex> =
198
  Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
 
 
 
199
  static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
200
  static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
201
  Regex::new(r"(?i)^(?:(?:NCOP|NCED|OP|ED|PV|CM)(?:[\s_.-]?(?:\d{1,4}|v\d{1,3}|[A-Z]))?|SP(?:[\s_.-]?\d{0,4})?|(?:OVA|OAD|IV)(?:[\s_.-]?\d{0,4})?|(?:Menu|Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?)$").unwrap()
@@ -204,6 +209,11 @@ static VOLUME_RE: Lazy<Regex> =
204
  Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
205
  static DATE_RE: Lazy<Regex> =
206
  Lazy::new(|| Regex::new(r"^(?:19|20)\d{2}(?:[._-]\d{1,2}){0,2}$").unwrap());
 
 
 
 
 
207
  static LANG_RE: Lazy<Regex> = Lazy::new(|| {
208
  Regex::new(r"(?i)^(?:CHS|CHT|ZHS|ZHT|GB|BIG5|JPN?|JP|JA|JAP|ENG|EN|SC|TC|简[体體]?|繁[体體]?|简日|繁日|字幕|内封|外挂|Sub|Subs|MSubs?)$").unwrap()
209
  });
@@ -1338,7 +1348,10 @@ fn classify_atom(text: &str) -> String {
1338
  if RESOLUTION_RE.is_match(&cleaned) {
1339
  return "RESOLUTION".to_string();
1340
  }
1341
- if DATE_RE.is_match(&cleaned) {
 
 
 
1342
  return "DATE".to_string();
1343
  }
1344
  if EPISODE_VERSION_RE.is_match(&compact) {
@@ -1740,7 +1753,7 @@ fn has_encoding_noise(value: &str) -> bool {
1740
  let markers = [
1741
  "譁", "蜈", "螟", "蟄", "謇", "邱", "荳", "縺", "繧", "莨", "鬆", "髯", "瀛",
1742
  "楀", "箷", "绲", "刔", "鏃", "湪", "鏍", "犲", "儚", "鐗", "吀", "铦", "躲",
1743
- "伄", "椋", "伓", "姘",
1744
  ];
1745
  let marker_hits = markers
1746
  .iter()
@@ -1750,7 +1763,10 @@ fn has_encoding_noise(value: &str) -> bool {
1750
  .chars()
1751
  .filter(|ch| ('\u{ff61}'..='\u{ff9f}').contains(ch))
1752
  .count();
1753
- marker_hits >= 2 || (marker_hits >= 1 && halfwidth_hits >= 1)
 
 
 
1754
  }
1755
 
1756
  fn has_non_anime_noise(value: &str) -> bool {
@@ -2096,8 +2112,12 @@ fn split_episode_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
2096
  return Some((pieces, labels));
2097
  }
2098
  let caps = EPISODE_VALUE_RE.captures(token)?;
2099
- let mut pieces = vec![caps[1].to_string(), caps[2].to_string()];
2100
- let mut labels = vec!["O".to_string(), "B-EPISODE".to_string()];
 
 
 
 
2101
  if let Some(version) = caps.get(3) {
2102
  pieces.push("v".to_string());
2103
  pieces.push(version.as_str().to_string());
@@ -2168,9 +2188,14 @@ fn is_special_title_phrase(text: &str) -> bool {
2168
  | "TOKUTEN"
2169
  | "TRAILER"
2170
  | "WORLD PREMIERE"
 
 
2171
  | "映像特典"
2172
  | "特典"
2173
  ) || normalized.contains("映像特典")
 
 
 
2174
  || SPECIAL_TITLE_PHRASE_RE.is_match(text)
2175
  }
2176
 
@@ -2181,16 +2206,29 @@ const KNOWN_TITLE_PHRASES: &[&[&str]] = &[
2181
  &["Zom", "100"],
2182
  &["Kamisama", "Hajimemashita", "2"],
2183
  &["Phantasy", "Star", "Online", "2", "Episode", "Oracle"],
 
 
2184
  ];
2185
 
2186
  fn apply_known_title_phrases(tokens: &[String], groups: &[Group], roles: &mut [String]) {
2187
  if let Some(whitelists) = RUNTIME_WHITELISTS.get() {
2188
  for (index, group) in groups.iter().enumerate() {
2189
  if group.class_name == "BRACKET_TEXT"
2190
- && roles.get(index).is_some_and(|role| role == "GROUP")
2191
  && whitelists
2192
  .group_names
2193
  .contains(&normalize_whitelist_name(&group_text(tokens, group)))
 
 
 
 
 
 
 
 
 
 
 
 
2194
  {
2195
  roles[index] = "GROUP".to_string();
2196
  }
@@ -2231,7 +2269,14 @@ fn apply_title_phrase(
2231
  {
2232
  for (group_index, _) in window {
2233
  if roles.get(*group_index).is_some_and(|role| role == "GROUP") {
2234
- continue;
 
 
 
 
 
 
 
2235
  }
2236
  if !allow_structural_override
2237
  && roles.get(*group_index).is_some_and(|role| {
@@ -2345,6 +2390,24 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
2345
  output[index] = "O".to_string();
2346
  continue;
2347
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2348
  if roles[index].starts_with("EPISODE") && (2..roles.len()).contains(&index) {
2349
  let previous_text = group_text(tokens, &groups[index - 2]);
2350
  let next_special = output[index + 1..roles.len().min(index + 4)]
@@ -2376,6 +2439,49 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
2376
  output[index] = "SPECIAL".to_string();
2377
  continue;
2378
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2379
  if output[index - 2] == "TITLE"
2380
  && groups[index - 1].class_name == "SEP"
2381
  && previous_text.len() <= 48
@@ -2398,6 +2504,27 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
2398
  continue;
2399
  }
2400
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2401
  if roles[index].starts_with("EPISODE")
2402
  && BARE_RESOLUTION_RE.is_match(&text)
2403
  && index >= 2
@@ -2474,6 +2601,17 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
2474
  && text.chars().any(|ch| ch.is_alphabetic())
2475
  && !ep_markers.contains(&text.as_str())
2476
  {
 
 
 
 
 
 
 
 
 
 
 
2477
  if let Some(last_title) = output[..index].iter().rposition(|role| role == "TITLE") {
2478
  let episode_since_title = output[last_title + 1..index]
2479
  .iter()
@@ -2561,14 +2699,36 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
2561
  } else {
2562
  String::new()
2563
  };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2564
  if previous_text.contains('点')
2565
  || previous_text.contains('點')
2566
  || previous_text.contains("晚上")
2567
  || previous_text.contains("上午")
2568
  || previous_text.contains("下午")
 
 
2569
  || next_text.contains('点')
2570
  || next_text.contains('點')
2571
  || next_text.contains('半')
 
 
2572
  {
2573
  output[index] = "O".to_string();
2574
  }
@@ -2687,9 +2847,27 @@ fn title_candidate_score(tokens: &[String], groups: &[Group], start: usize, end:
2687
  ) {
2688
  score -= 500;
2689
  }
 
 
 
2690
  score
2691
  }
2692
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2693
  fn roles_candidate_text_group(group: &Group) -> bool {
2694
  matches!(group.class_name.as_str(), "TEXT" | "BRACKET_TEXT")
2695
  }
@@ -2712,19 +2890,39 @@ fn normalize_generated_tokens(tokens: &[String], labels: &[String]) -> (Vec<Stri
2712
 
2713
  fn normalize_title_token(token: &str) -> (Vec<String>, Vec<String>) {
2714
  let pieces = split_generated_token(token);
2715
- let labels = pieces
2716
- .iter()
2717
- .map(|piece| {
2718
- if is_standalone_separator(piece) {
2719
- "O".to_string()
2720
- } else if CJK_SEASON_TOKEN_RE.is_match(piece) {
2721
- "B-SEASON".to_string()
2722
- } else {
2723
- "B-TITLE".to_string()
 
 
 
 
 
 
 
 
 
 
 
2724
  }
2725
- })
2726
- .collect();
2727
- (pieces, labels)
 
 
 
 
 
 
 
 
 
2728
  }
2729
 
2730
  fn split_generated_token(token: &str) -> Vec<String> {
@@ -2881,11 +3079,14 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
2881
  right += 1;
2882
  }
2883
  if left >= 0 && right < tokens.len() {
2884
- let left_label = &output[left as usize];
2885
- let right_label = &labels[right];
2886
  if left_label == right_label && matches!(left_label.as_str(), "B-TITLE" | "B-GROUP") {
2887
  output[index] = left_label.clone();
2888
  }
 
 
 
2889
  }
2890
  if title_terminal_punctuation.contains(&token.as_str()) && index > 0 {
2891
  let left_label = &output[index - 1];
@@ -3183,6 +3384,47 @@ mod tests {
3183
  assert!(decimal_episode.contains(&(".".to_string(), "B-EPISODE".to_string())));
3184
  assert!(decimal_episode.contains(&("5".to_string(), "B-EPISODE".to_string())));
3185
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3186
  let spy = labels_for("[Studio GreenTea] Spy x Family [38][WebRip][HEVC-10bit 1080p AAC ASSx2]");
3187
  assert!(spy.contains(&("Studio".to_string(), "B-GROUP".to_string())));
3188
  assert!(spy.contains(&("Spy".to_string(), "B-TITLE".to_string())));
@@ -3388,6 +3630,21 @@ mod tests {
3388
  assert!(volume.contains(&("MENU02".to_string(), "B-SPECIAL".to_string())));
3389
  assert!(!volume.contains(&("01".to_string(), "B-EPISODE".to_string())));
3390
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3391
  let numeric_title =
3392
  labels_for("3000.Leagues.in.Search.of.Mother.S01E01.1080p.WEB-DL.H.264-D00oo00M");
3393
  assert!(numeric_title.contains(&("3000".to_string(), "B-TITLE".to_string())));
@@ -3404,5 +3661,49 @@ mod tests {
3404
  assert!(media_block.contains(&("1080".to_string(), "B-RESOLUTION".to_string())));
3405
  assert!(media_block.contains(&("x264".to_string(), "B-SOURCE".to_string())));
3406
  assert!(media_block.contains(&("Chs".to_string(), "B-SOURCE".to_string())));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3407
  }
3408
  }
 
178
  Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}(?:\.\d{1,2})?(?:END)?$").unwrap());
179
  static DECIMAL_EPISODE_RE: Lazy<Regex> =
180
  Lazy::new(|| Regex::new(r"^\d{1,3}\.\d{1,2}$").unwrap());
181
+ static NUMERIC_TITLE_PREFIX_RE: Lazy<Regex> =
182
+ Lazy::new(|| Regex::new(r"^\d{1,3}(?:[./-]\d{1,3})?$").unwrap());
183
  static EPISODE_CJK_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^第?\d{1,4}[话話回集]$").unwrap());
184
  static EPISODE_CJK_PREFIX_RE: Lazy<Regex> =
185
  Lazy::new(|| Regex::new(r"^第?\d{1,4}[话話回集]").unwrap());
 
192
  static SXE_VALUE_RE: Lazy<Regex> =
193
  Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})E(\d{1,4})(?:v(\d+))?$").unwrap());
194
  static EPISODE_VALUE_RE: Lazy<Regex> =
195
+ Lazy::new(|| Regex::new(r"(?i)^(EP|E|#)(\d{1,4}(?:\.\d{1,2})?)(?:v(\d+))?$").unwrap());
196
  static SEASON_RE: Lazy<Regex> = Lazy::new(|| {
197
  Regex::new(r"(?i)^(?:S\d{1,2}|Season\s*\d{1,2}|第[一二三四五六七八九十\d]+[季期部])$").unwrap()
198
  });
199
  static CJK_SEASON_TOKEN_RE: Lazy<Regex> =
200
  Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
201
+ static CJK_SEASON_EMBEDDED_RE: Lazy<Regex> = Lazy::new(|| {
202
+ Regex::new(r"^(.+?)(第[一二三四五六七八九十\d]+[季期部])(.{0,12})$").unwrap()
203
+ });
204
  static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
205
  static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
206
  Regex::new(r"(?i)^(?:(?:NCOP|NCED|OP|ED|PV|CM)(?:[\s_.-]?(?:\d{1,4}|v\d{1,3}|[A-Z]))?|SP(?:[\s_.-]?\d{0,4})?|(?:OVA|OAD|IV)(?:[\s_.-]?\d{0,4})?|(?:Menu|Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?)$").unwrap()
 
209
  Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
210
  static DATE_RE: Lazy<Regex> =
211
  Lazy::new(|| Regex::new(r"^(?:19|20)\d{2}(?:[._-]\d{1,2}){0,2}$").unwrap());
212
+ static DATE_RANGE_MIXED_RE: Lazy<Regex> = Lazy::new(|| {
213
+ Regex::new(r"^(?:19|20)\d{2}(?:[._-]\d{1,2}){0,2}\s*[-~]\s*(?:19|20)\d{2}(?:[._-]\d{1,2}){0,2}$").unwrap()
214
+ });
215
+ static CJK_DATE_RE: Lazy<Regex> =
216
+ Lazy::new(|| Regex::new(r"^(?:19|20)\d{2}年\d{1,2}月\d{1,2}日$").unwrap());
217
  static LANG_RE: Lazy<Regex> = Lazy::new(|| {
218
  Regex::new(r"(?i)^(?:CHS|CHT|ZHS|ZHT|GB|BIG5|JPN?|JP|JA|JAP|ENG|EN|SC|TC|简[体體]?|繁[体體]?|简日|繁日|字幕|内封|外挂|Sub|Subs|MSubs?)$").unwrap()
219
  });
 
1348
  if RESOLUTION_RE.is_match(&cleaned) {
1349
  return "RESOLUTION".to_string();
1350
  }
1351
+ if DATE_RE.is_match(&cleaned)
1352
+ || DATE_RANGE_MIXED_RE.is_match(&cleaned)
1353
+ || CJK_DATE_RE.is_match(&cleaned)
1354
+ {
1355
  return "DATE".to_string();
1356
  }
1357
  if EPISODE_VERSION_RE.is_match(&compact) {
 
1753
  let markers = [
1754
  "譁", "蜈", "螟", "蟄", "謇", "邱", "荳", "縺", "繧", "莨", "鬆", "髯", "瀛",
1755
  "楀", "箷", "绲", "刔", "鏃", "湪", "鏍", "犲", "儚", "鐗", "吀", "铦", "躲",
1756
+ "伄", "椋", "伓", "姘", "帽",
1757
  ];
1758
  let marker_hits = markers
1759
  .iter()
 
1763
  .chars()
1764
  .filter(|ch| ('\u{ff61}'..='\u{ff9f}').contains(ch))
1765
  .count();
1766
+ let latin_mojibake = value.split_whitespace().any(|part| {
1767
+ part.contains('帽') && part.chars().any(|ch| ch.is_ascii_alphabetic())
1768
+ });
1769
+ marker_hits >= 2 || (marker_hits >= 1 && halfwidth_hits >= 1) || latin_mojibake
1770
  }
1771
 
1772
  fn has_non_anime_noise(value: &str) -> bool {
 
2112
  return Some((pieces, labels));
2113
  }
2114
  let caps = EPISODE_VALUE_RE.captures(token)?;
2115
+ let mut pieces = vec![caps[1].to_string()];
2116
+ let mut labels = vec!["O".to_string()];
2117
+ for piece in split_generated_token(&caps[2]) {
2118
+ pieces.push(piece);
2119
+ labels.push("B-EPISODE".to_string());
2120
+ }
2121
  if let Some(version) = caps.get(3) {
2122
  pieces.push("v".to_string());
2123
  pieces.push(version.as_str().to_string());
 
2188
  | "TOKUTEN"
2189
  | "TRAILER"
2190
  | "WORLD PREMIERE"
2191
+ | "番宣"
2192
+ | "宣番"
2193
  | "映像特典"
2194
  | "特典"
2195
  ) || normalized.contains("映像特典")
2196
+ || normalized.contains("特典映像")
2197
+ || normalized.contains("番宣")
2198
+ || normalized.contains("宣番")
2199
  || SPECIAL_TITLE_PHRASE_RE.is_match(text)
2200
  }
2201
 
 
2206
  &["Zom", "100"],
2207
  &["Kamisama", "Hajimemashita", "2"],
2208
  &["Phantasy", "Star", "Online", "2", "Episode", "Oracle"],
2209
+ &["Lupin The Thrid Jigen Daisuke no Bohyou"],
2210
+ &["Lupin The Third Jigen Daisuke no Bohyou"],
2211
  ];
2212
 
2213
  fn apply_known_title_phrases(tokens: &[String], groups: &[Group], roles: &mut [String]) {
2214
  if let Some(whitelists) = RUNTIME_WHITELISTS.get() {
2215
  for (index, group) in groups.iter().enumerate() {
2216
  if group.class_name == "BRACKET_TEXT"
 
2217
  && whitelists
2218
  .group_names
2219
  .contains(&normalize_whitelist_name(&group_text(tokens, group)))
2220
+ && !roles.get(index).is_some_and(|role| {
2221
+ matches!(
2222
+ role.as_str(),
2223
+ "EPISODE"
2224
+ | "EPISODE_VERSION"
2225
+ | "EPISODE_RANGE"
2226
+ | "SEASON"
2227
+ | "SOURCE"
2228
+ | "RESOLUTION"
2229
+ | "SPECIAL"
2230
+ )
2231
+ })
2232
  {
2233
  roles[index] = "GROUP".to_string();
2234
  }
 
2269
  {
2270
  for (group_index, _) in window {
2271
  if roles.get(*group_index).is_some_and(|role| role == "GROUP") {
2272
+ let is_known_group = RUNTIME_WHITELISTS.get().is_some_and(|whitelists| {
2273
+ whitelists
2274
+ .group_names
2275
+ .contains(&normalize_whitelist_name(&window[0].1))
2276
+ });
2277
+ if is_known_group {
2278
+ continue;
2279
+ }
2280
  }
2281
  if !allow_structural_override
2282
  && roles.get(*group_index).is_some_and(|role| {
 
2390
  output[index] = "O".to_string();
2391
  continue;
2392
  }
2393
+ if roles[index].starts_with("EPISODE")
2394
+ && index >= 1
2395
+ && output[index - 1] == "TITLE"
2396
+ && groups[index - 1].class_name != "SEP"
2397
+ && text.chars().all(|ch| ch.is_ascii_digit())
2398
+ && (text.len() <= 2
2399
+ || (text.len() <= 3
2400
+ && group_text(tokens, &groups[index - 1])
2401
+ .chars()
2402
+ .any(|ch| !ch.is_ascii())
2403
+ && !group_text(tokens, &groups[index - 1]).ends_with('第')))
2404
+ && roles[index + 1..]
2405
+ .iter()
2406
+ .any(|role| role.starts_with("EPISODE"))
2407
+ {
2408
+ output[index] = "TITLE".to_string();
2409
+ continue;
2410
+ }
2411
  if roles[index].starts_with("EPISODE") && (2..roles.len()).contains(&index) {
2412
  let previous_text = group_text(tokens, &groups[index - 2]);
2413
  let next_special = output[index + 1..roles.len().min(index + 4)]
 
2439
  output[index] = "SPECIAL".to_string();
2440
  continue;
2441
  }
2442
+ if index >= 1
2443
+ && output[index - 1] == "TITLE"
2444
+ && groups[index - 1].class_name != "SEP"
2445
+ && text.chars().all(|ch| ch.is_ascii_digit())
2446
+ && (text.len() <= 2
2447
+ || (text.len() <= 3
2448
+ && group_text(tokens, &groups[index - 1])
2449
+ .chars()
2450
+ .any(|ch| !ch.is_ascii())
2451
+ && !group_text(tokens, &groups[index - 1]).ends_with('第')))
2452
+ && roles[index + 1..]
2453
+ .iter()
2454
+ .any(|role| role.starts_with("EPISODE"))
2455
+ {
2456
+ output[index] = "TITLE".to_string();
2457
+ continue;
2458
+ }
2459
+ if !output[..index].iter().any(|role| role == "TITLE")
2460
+ && NUMERIC_TITLE_PREFIX_RE.is_match(&text)
2461
+ && output[..index].iter().any(|role| role == "GROUP")
2462
+ && roles[index + 1..]
2463
+ .iter()
2464
+ .any(|role| role.starts_with("EPISODE"))
2465
+ {
2466
+ output[index] = "TITLE".to_string();
2467
+ continue;
2468
+ }
2469
+ if !output[..index].iter().any(|role| role == "TITLE")
2470
+ && NUMERIC_TITLE_PREFIX_RE.is_match(&text)
2471
+ && index + 2 < roles.len()
2472
+ && groups[index + 1].class_name == "SEP"
2473
+ && groups[index + 2].class_name == "TEXT"
2474
+ && group_text(tokens, &groups[index + 2])
2475
+ .chars()
2476
+ .any(|ch| ch.is_alphabetic())
2477
+ && roles[index + 3..]
2478
+ .iter()
2479
+ .any(|role| role.starts_with("EPISODE"))
2480
+ {
2481
+ output[index] = "TITLE".to_string();
2482
+ output[index + 2] = "TITLE".to_string();
2483
+ continue;
2484
+ }
2485
  if output[index - 2] == "TITLE"
2486
  && groups[index - 1].class_name == "SEP"
2487
  && previous_text.len() <= 48
 
2504
  continue;
2505
  }
2506
  }
2507
+ if roles[index].starts_with("EPISODE")
2508
+ && text.chars().all(|ch| ch.is_ascii_digit())
2509
+ && output[..index].iter().any(|role| role == "SPECIAL")
2510
+ && !output[..index].iter().any(|role| role.starts_with("EPISODE"))
2511
+ {
2512
+ let previous_structural = (0..index)
2513
+ .rev()
2514
+ .find(|&cursor| groups[cursor].class_name != "SEP")
2515
+ .and_then(|cursor| output.get(cursor))
2516
+ .map(String::as_str);
2517
+ let next_real = (index + 1..roles.len())
2518
+ .find(|&cursor| groups[cursor].class_name != "SEP")
2519
+ .and_then(|cursor| roles.get(cursor))
2520
+ .map(String::as_str);
2521
+ if matches!(previous_structural, Some("SPECIAL"))
2522
+ && !matches!(next_real, Some("TITLE" | "SEASON"))
2523
+ {
2524
+ output[index] = "SPECIAL".to_string();
2525
+ continue;
2526
+ }
2527
+ }
2528
  if roles[index].starts_with("EPISODE")
2529
  && BARE_RESOLUTION_RE.is_match(&text)
2530
  && index >= 2
 
2601
  && text.chars().any(|ch| ch.is_alphabetic())
2602
  && !ep_markers.contains(&text.as_str())
2603
  {
2604
+ if !output[..index].iter().any(|role| role == "TITLE") {
2605
+ let previous_structural = (0..index)
2606
+ .rev()
2607
+ .find(|&cursor| groups[cursor].class_name != "SEP")
2608
+ .and_then(|cursor| output.get(cursor))
2609
+ .map(String::as_str);
2610
+ if matches!(previous_structural, Some("SPECIAL")) {
2611
+ output[index] = "TITLE".to_string();
2612
+ continue;
2613
+ }
2614
+ }
2615
  if let Some(last_title) = output[..index].iter().rposition(|role| role == "TITLE") {
2616
  let episode_since_title = output[last_title + 1..index]
2617
  .iter()
 
2699
  } else {
2700
  String::new()
2701
  };
2702
+ if previous_text.ends_with('第') && next_text.starts_with('期') {
2703
+ output[index] = "SEASON".to_string();
2704
+ continue;
2705
+ }
2706
+ if output[..index].iter().any(|role| role == "TITLE")
2707
+ && (output[..index]
2708
+ .iter()
2709
+ .enumerate()
2710
+ .any(|(cursor, role)| {
2711
+ role == "TITLE" && is_special_title_phrase(&group_text(tokens, &groups[cursor]))
2712
+ }))
2713
+ && !output[..index].iter().any(|role| role.starts_with("EPISODE"))
2714
+ && text.chars().all(|ch| ch.is_ascii_digit())
2715
+ && text.len() <= 3
2716
+ {
2717
+ output[index] = "SPECIAL".to_string();
2718
+ continue;
2719
+ }
2720
  if previous_text.contains('点')
2721
  || previous_text.contains('點')
2722
  || previous_text.contains("晚上")
2723
  || previous_text.contains("上午")
2724
  || previous_text.contains("下午")
2725
+ || previous_text.contains('年')
2726
+ || previous_text.contains('月')
2727
  || next_text.contains('点')
2728
  || next_text.contains('點')
2729
  || next_text.contains('半')
2730
+ || next_text.contains('月')
2731
+ || next_text.contains('日')
2732
  {
2733
  output[index] = "O".to_string();
2734
  }
 
2847
  ) {
2848
  score -= 500;
2849
  }
2850
+ if title_noise_score_penalty(cleaned) {
2851
+ score -= 700;
2852
+ }
2853
  score
2854
  }
2855
 
2856
+ fn title_noise_score_penalty(text: &str) -> bool {
2857
+ let normalized = text
2858
+ .replace(['_', '-', '.'], " ")
2859
+ .split_whitespace()
2860
+ .collect::<Vec<_>>()
2861
+ .join(" ")
2862
+ .to_ascii_lowercase();
2863
+ normalized.contains("bdrip")
2864
+ || normalized.contains("webrip")
2865
+ || normalized.contains("web dl")
2866
+ || normalized.contains("bluray")
2867
+ || normalized.contains("full hd")
2868
+ || normalized.contains("hdtv")
2869
+ }
2870
+
2871
  fn roles_candidate_text_group(group: &Group) -> bool {
2872
  matches!(group.class_name.as_str(), "TEXT" | "BRACKET_TEXT")
2873
  }
 
2890
 
2891
  fn normalize_title_token(token: &str) -> (Vec<String>, Vec<String>) {
2892
  let pieces = split_generated_token(token);
2893
+ let mut output_pieces = Vec::new();
2894
+ let mut labels = Vec::new();
2895
+ for piece in pieces {
2896
+ if is_standalone_separator(&piece) {
2897
+ output_pieces.push(piece);
2898
+ labels.push("O".to_string());
2899
+ continue;
2900
+ }
2901
+ if CJK_SEASON_TOKEN_RE.is_match(&piece) {
2902
+ output_pieces.push(piece);
2903
+ labels.push("B-SEASON".to_string());
2904
+ continue;
2905
+ }
2906
+ if let Some(caps) = CJK_SEASON_EMBEDDED_RE.captures(&piece) {
2907
+ let before = caps.get(1).map(|m| m.as_str()).unwrap_or_default();
2908
+ let season = caps.get(2).map(|m| m.as_str()).unwrap_or_default();
2909
+ let after = caps.get(3).map(|m| m.as_str()).unwrap_or_default();
2910
+ if !before.is_empty() {
2911
+ output_pieces.push(before.to_string());
2912
+ labels.push("B-TITLE".to_string());
2913
  }
2914
+ output_pieces.push(season.to_string());
2915
+ labels.push("B-SEASON".to_string());
2916
+ if !after.is_empty() {
2917
+ output_pieces.push(after.to_string());
2918
+ labels.push("O".to_string());
2919
+ }
2920
+ continue;
2921
+ }
2922
+ output_pieces.push(piece);
2923
+ labels.push("B-TITLE".to_string());
2924
+ }
2925
+ (output_pieces, labels)
2926
  }
2927
 
2928
  fn split_generated_token(token: &str) -> Vec<String> {
 
3079
  right += 1;
3080
  }
3081
  if left >= 0 && right < tokens.len() {
3082
+ let left_label = output[left as usize].clone();
3083
+ let right_label = labels[right].clone();
3084
  if left_label == right_label && matches!(left_label.as_str(), "B-TITLE" | "B-GROUP") {
3085
  output[index] = left_label.clone();
3086
  }
3087
+ if token == "." && left_label == "B-EPISODE" && right_label == "B-EPISODE" {
3088
+ output[index] = "B-EPISODE".to_string();
3089
+ }
3090
  }
3091
  if title_terminal_punctuation.contains(&token.as_str()) && index > 0 {
3092
  let left_label = &output[index - 1];
 
3384
  assert!(decimal_episode.contains(&(".".to_string(), "B-EPISODE".to_string())));
3385
  assert!(decimal_episode.contains(&("5".to_string(), "B-EPISODE".to_string())));
3386
 
3387
+ let _ = RUNTIME_WHITELISTS.set(Whitelists {
3388
+ title_phrases: Vec::new(),
3389
+ group_names: [
3390
+ "LowPower-Raws".to_string(),
3391
+ "ANi".to_string(),
3392
+ "LoliHouse".to_string(),
3393
+ "QTS".to_string(),
3394
+ ]
3395
+ .into_iter()
3396
+ .collect(),
3397
+ });
3398
+ let lowpower = labels_for("[LowPower-Raws] 91 Days - 01 (BD 720P x264 10bit AAC)");
3399
+ assert!(lowpower.contains(&("LowPower".to_string(), "B-GROUP".to_string())));
3400
+ assert!(lowpower.contains(&("91".to_string(), "B-TITLE".to_string())));
3401
+ assert!(lowpower.contains(&("Days".to_string(), "B-TITLE".to_string())));
3402
+ assert!(lowpower.contains(&("01".to_string(), "B-EPISODE".to_string())));
3403
+
3404
+ let ririsa = labels_for("[ANi] 2.5 次元的誘惑 - 01 [1080P][Baha][WEB-DL][AAC AVC][CHT]");
3405
+ assert!(ririsa.contains(&("2".to_string(), "B-TITLE".to_string())));
3406
+ assert!(ririsa.contains(&(".".to_string(), "B-TITLE".to_string())));
3407
+ assert!(ririsa.contains(&("5".to_string(), "B-TITLE".to_string())));
3408
+ assert!(ririsa.contains(&("次元的誘惑".to_string(), "B-TITLE".to_string())));
3409
+ assert!(ririsa.contains(&("01".to_string(), "B-EPISODE".to_string())));
3410
+
3411
+ let nanabun = labels_for("[LoliHouse] 22-7 - 01 [WebRip 1080p HEVC-10bit AAC ASS]");
3412
+ assert!(nanabun.contains(&("22".to_string(), "B-TITLE".to_string())));
3413
+ assert!(nanabun.contains(&("-".to_string(), "B-TITLE".to_string())));
3414
+ assert!(nanabun.contains(&("7".to_string(), "B-TITLE".to_string())));
3415
+ assert!(nanabun.contains(&("01".to_string(), "B-EPISODE".to_string())));
3416
+
3417
+ let saint = labels_for("[QTS] OVA Saint Seiya The Lost Canvas Meiou Shinwa ep 01 (BD H264 1920x1080 24fps FLAC)");
3418
+ assert!(saint.contains(&("OVA".to_string(), "B-SPECIAL".to_string())));
3419
+ assert!(saint.contains(&("Saint".to_string(), "B-TITLE".to_string())));
3420
+ assert!(saint.contains(&("Seiya".to_string(), "B-TITLE".to_string())));
3421
+ assert!(saint.contains(&("01".to_string(), "B-EPISODE".to_string())));
3422
+
3423
+ let gundam = labels_for("機動戦士ガンダム00 セカンドシーズン/Ep.01 「# 天使再臨」");
3424
+ assert!(gundam.contains(&("機動戦士ガンダム".to_string(), "B-TITLE".to_string())));
3425
+ assert!(gundam.contains(&("00".to_string(), "B-TITLE".to_string())));
3426
+ assert!(gundam.contains(&("01".to_string(), "B-EPISODE".to_string())));
3427
+
3428
  let spy = labels_for("[Studio GreenTea] Spy x Family [38][WebRip][HEVC-10bit 1080p AAC ASSx2]");
3429
  assert!(spy.contains(&("Studio".to_string(), "B-GROUP".to_string())));
3430
  assert!(spy.contains(&("Spy".to_string(), "B-TITLE".to_string())));
 
3630
  assert!(volume.contains(&("MENU02".to_string(), "B-SPECIAL".to_string())));
3631
  assert!(!volume.contains(&("01".to_string(), "B-EPISODE".to_string())));
3632
 
3633
+ let aria_notice =
3634
+ labels_for("[KNA-Subs&ANK-Raws] 緋弾のアリアAA 番宣1 (BDrip 1920x1080 HEVC-YUV420P10 FLAC)");
3635
+ assert!(aria_notice.contains(&("緋弾のアリア".to_string(), "B-TITLE".to_string())));
3636
+ assert!(aria_notice.contains(&("番宣".to_string(), "B-SPECIAL".to_string())));
3637
+ assert!(aria_notice.contains(&("1".to_string(), "B-SPECIAL".to_string())));
3638
+ assert!(!aria_notice.contains(&("1".to_string(), "B-EPISODE".to_string())));
3639
+
3640
+ let lost_song =
3641
+ labels_for("[Snow-Raws] LOST SONG CM&PV 01(BD 1920x1080 HEVC-YUV420P10 FLAC)");
3642
+ assert!(lost_song.contains(&("LOST".to_string(), "B-TITLE".to_string())));
3643
+ assert!(lost_song.contains(&("CM".to_string(), "B-SPECIAL".to_string())));
3644
+ assert!(lost_song.contains(&("PV".to_string(), "B-SPECIAL".to_string())));
3645
+ assert!(lost_song.contains(&("01".to_string(), "B-SPECIAL".to_string())));
3646
+ assert!(!lost_song.contains(&("01".to_string(), "B-EPISODE".to_string())));
3647
+
3648
  let numeric_title =
3649
  labels_for("3000.Leagues.in.Search.of.Mother.S01E01.1080p.WEB-DL.H.264-D00oo00M");
3650
  assert!(numeric_title.contains(&("3000".to_string(), "B-TITLE".to_string())));
 
3661
  assert!(media_block.contains(&("1080".to_string(), "B-RESOLUTION".to_string())));
3662
  assert!(media_block.contains(&("x264".to_string(), "B-SOURCE".to_string())));
3663
  assert!(media_block.contains(&("Chs".to_string(), "B-SOURCE".to_string())));
3664
+
3665
+ let ge999 = labels_for("GE999 第024話 「次元航海惑星」1979年02月22日 (720x540 x264 AAC2)");
3666
+ assert!(ge999.contains(&("GE999".to_string(), "B-TITLE".to_string())));
3667
+ assert!(ge999.contains(&("024".to_string(), "B-EPISODE".to_string())));
3668
+ assert!(!ge999.contains(&("22".to_string(), "B-EPISODE".to_string())));
3669
+
3670
+ let galaxy = labels_for("銀河鉄道999 第024話 「次元航海惑星」 (DVD 640x480 WMV9)");
3671
+ assert!(galaxy.contains(&("銀河鉄道".to_string(), "B-TITLE".to_string())));
3672
+ assert!(galaxy.contains(&("999".to_string(), "B-TITLE".to_string())));
3673
+ assert!(galaxy.contains(&("024".to_string(), "B-EPISODE".to_string())));
3674
+
3675
+ let mahoro = labels_for("[POPGO][FREEWIND][Mahoro_Matic][Full_HD-BDRIP][01]");
3676
+ assert!(mahoro.contains(&("Mahoro".to_string(), "B-TITLE".to_string())));
3677
+ assert!(!mahoro.contains(&("Full".to_string(), "B-TITLE".to_string())));
3678
+ assert!(mahoro.contains(&("01".to_string(), "B-EPISODE".to_string())));
3679
+
3680
+ let kitaro = labels_for("[1985.10-1988.02] Kitaro_鬼太郎 第3期(ゲゲゲの鬼太郎)_TV 036 異次元妖怪かまなり");
3681
+ assert!(kitaro.contains(&("Kitaro".to_string(), "B-TITLE".to_string())));
3682
+ assert!(kitaro.contains(&("3".to_string(), "B-SEASON".to_string())));
3683
+ assert!(kitaro.contains(&("036".to_string(), "B-EPISODE".to_string())));
3684
+ assert!(!kitaro.contains(&("1985".to_string(), "B-EPISODE".to_string())));
3685
+
3686
+ let urusei = labels_for("Urusei_Yatsura_DVD_Ep042.5_Simu");
3687
+ assert!(urusei.contains(&("Urusei".to_string(), "B-TITLE".to_string())));
3688
+ assert!(urusei.contains(&("042".to_string(), "B-EPISODE".to_string())));
3689
+ assert!(urusei.contains(&(".".to_string(), "B-EPISODE".to_string())));
3690
+ assert!(urusei.contains(&("5".to_string(), "B-EPISODE".to_string())));
3691
+
3692
+ let lupin =
3693
+ labels_for("[Lupin The Thrid Jigen Daisuke no Bohyou][Logo][BDRIP][1080P][H264_FLAC]");
3694
+ assert!(lupin.contains(&("Lupin".to_string(), "B-TITLE".to_string())));
3695
+ assert!(!lupin.contains(&("Lupin".to_string(), "B-GROUP".to_string())));
3696
+
3697
+ let mirumo = labels_for("【咪路fans】魔法咪路咪路第二季日语版 01[GB][MP4]");
3698
+ assert!(mirumo.contains(&("魔法咪路咪路".to_string(), "B-TITLE".to_string())));
3699
+ assert!(mirumo.contains(&("第二季".to_string(), "B-SEASON".to_string())));
3700
+ assert!(mirumo.contains(&("01".to_string(), "B-EPISODE".to_string())));
3701
+
3702
+ let doremi_bonus = labels_for(
3703
+ "おジャ魔女どれみナ・イ・ショ 特典映像07「おジャ魔女どれみナ・イ・ショ エンドテロップ集」(DVD 640x480 )",
3704
+ );
3705
+ assert!(doremi_bonus.contains(&("おジャ魔女どれみナ".to_string(), "B-TITLE".to_string())));
3706
+ assert!(doremi_bonus.contains(&("07".to_string(), "B-SPECIAL".to_string())));
3707
+ assert!(!doremi_bonus.contains(&("07".to_string(), "B-EPISODE".to_string())));
3708
  }
3709
  }