ModerRAS committed on
Commit
f9e69dd
·
1 Parent(s): aa41de9

Tighten DMHY low-frequency special cases

Browse files
tools/rust_dmhy_template_apply/src/main.rs CHANGED
@@ -201,9 +201,13 @@ static CJK_SEASON_TOKEN_RE: Lazy<Regex> =
201
  static CJK_SEASON_EMBEDDED_RE: Lazy<Regex> = Lazy::new(|| {
202
  Regex::new(r"^(.+?)(第[一二三四五六七八九十\d]+[季期部])(.{0,12})$").unwrap()
203
  });
 
 
 
 
204
  static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
205
  static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
206
- Regex::new(r"(?i)^(?:(?:NCOP|NCED|OP|ED|PV|CM)(?:[\s_.-]?(?:\d{1,4}|v\d{1,3}|[A-Z]))?|SP(?:[\s_.-]?\d{0,4})?|(?:OVA|OAD|IV)(?:[\s_.-]?\d{0,4})?|(?:Menu|Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?)$").unwrap()
207
  });
208
  static VOLUME_RE: Lazy<Regex> =
209
  Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
@@ -2187,7 +2191,9 @@ fn is_special_title_phrase(text: &str) -> bool {
2187
  | "THEATER GREETING EVENT"
2188
  | "TOKUTEN"
2189
  | "TRAILER"
 
2190
  | "WORLD PREMIERE"
 
2191
  | "番宣"
2192
  | "宣番"
2193
  | "映像特典"
@@ -2196,6 +2202,8 @@ fn is_special_title_phrase(text: &str) -> bool {
2196
  || normalized.contains("特典映像")
2197
  || normalized.contains("番宣")
2198
  || normalized.contains("宣番")
 
 
2199
  || SPECIAL_TITLE_PHRASE_RE.is_match(text)
2200
  }
2201
 
@@ -2206,6 +2214,7 @@ const KNOWN_TITLE_PHRASES: &[&[&str]] = &[
2206
  &["Zom", "100"],
2207
  &["Kamisama", "Hajimemashita", "2"],
2208
  &["Phantasy", "Star", "Online", "2", "Episode", "Oracle"],
 
2209
  &["Lupin The Thrid Jigen Daisuke no Bohyou"],
2210
  &["Lupin The Third Jigen Daisuke no Bohyou"],
2211
  ];
@@ -2386,6 +2395,18 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
2386
  if output[index] == "O" && groups[index].class_name.contains("SXE") {
2387
  output[index] = "EPISODE".to_string();
2388
  }
 
 
 
 
 
 
 
 
 
 
 
 
2389
  if roles[index].starts_with("EPISODE") && YEAR_RANGE_RE.is_match(&text) {
2390
  output[index] = "O".to_string();
2391
  continue;
@@ -2505,7 +2526,11 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
2505
  }
2506
  }
2507
  if roles[index].starts_with("EPISODE")
2508
- && text.chars().all(|ch| ch.is_ascii_digit())
 
 
 
 
2509
  && output[..index].iter().any(|role| role == "SPECIAL")
2510
  && !output[..index].iter().any(|role| role.starts_with("EPISODE"))
2511
  {
@@ -2531,9 +2556,13 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
2531
  && groups[index - 1].class_name == "SEP"
2532
  {
2533
  let previous_text = group_text(tokens, &groups[index - 2]);
 
 
 
2534
  if previous_text
2535
  .chars()
2536
  .any(|ch| ch.is_ascii_digit() || matches!(ch, '.' | '-' | '_' | '.'))
 
2537
  {
2538
  output[index] = "RESOLUTION".to_string();
2539
  continue;
@@ -2724,11 +2753,13 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
2724
  || previous_text.contains("下午")
2725
  || previous_text.contains('年')
2726
  || previous_text.contains('月')
 
2727
  || next_text.contains('点')
2728
  || next_text.contains('點')
2729
  || next_text.contains('半')
2730
  || next_text.contains('月')
2731
  || next_text.contains('日')
 
2732
  {
2733
  output[index] = "O".to_string();
2734
  }
@@ -2898,11 +2929,22 @@ fn normalize_title_token(token: &str) -> (Vec<String>, Vec<String>) {
2898
  labels.push("O".to_string());
2899
  continue;
2900
  }
2901
- if CJK_SEASON_TOKEN_RE.is_match(&piece) {
2902
  output_pieces.push(piece);
2903
  labels.push("B-SEASON".to_string());
2904
  continue;
2905
  }
 
 
 
 
 
 
 
 
 
 
 
2906
  if let Some(caps) = CJK_SEASON_EMBEDDED_RE.captures(&piece) {
2907
  let before = caps.get(1).map(|m| m.as_str()).unwrap_or_default();
2908
  let season = caps.get(2).map(|m| m.as_str()).unwrap_or_default();
@@ -2952,6 +2994,23 @@ fn is_standalone_separator(token: &str) -> bool {
2952
  .is_some_and(|ch| ch.is_whitespace() || !ch.is_alphanumeric())
2953
  }
2954
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2955
  fn project_refined_tokens(
2956
  tokens: &[String],
2957
  groups: &[Group],
@@ -2982,6 +3041,13 @@ fn project_refined_tokens(
2982
  continue;
2983
  }
2984
  }
 
 
 
 
 
 
 
2985
  if matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE") {
2986
  if let Some((pieces, labels)) = split_sxe_token(&strip_wrapper(token)) {
2987
  output_tokens.extend(pieces);
@@ -3705,5 +3771,42 @@ mod tests {
3705
  assert!(doremi_bonus.contains(&("おジャ魔女どれみナ".to_string(), "B-TITLE".to_string())));
3706
  assert!(doremi_bonus.contains(&("07".to_string(), "B-SPECIAL".to_string())));
3707
  assert!(!doremi_bonus.contains(&("07".to_string(), "B-EPISODE".to_string())));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3708
  }
3709
  }
 
201
  static CJK_SEASON_EMBEDDED_RE: Lazy<Regex> = Lazy::new(|| {
202
  Regex::new(r"^(.+?)(第[一二三四五六七八九十\d]+[季期部])(.{0,12})$").unwrap()
203
  });
204
+ static ASCII_SEASON_SUFFIX_RE: Lazy<Regex> =
205
+ Lazy::new(|| Regex::new(r"(?i)^(.+?)[\s_.-]+(S\d{1,2})$").unwrap());
206
+ static CJK_TITLE_LANG_PREFIX_RE: Lazy<Regex> =
207
+ Lazy::new(|| Regex::new(r"^(.+?)(国日双语|國日雙語|日语版|日語版|国语版|國語版|双语|雙語)(第?)$").unwrap());
208
  static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
209
  static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
210
+ Regex::new(r"(?i)^(?:(?:NCOP|NCED|OP|ED|PV|CM)(?:[\s_.-]?(?:\d{1,4}|v\d{1,3}|[A-Z]))?|SP(?:[\s_.-]?\d{0,4})?|(?:OVA|OAD|IV)(?:[\s_.-]?\d{0,4})?|(?:BD)?Menu(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?|(?:Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?)$").unwrap()
211
  });
212
  static VOLUME_RE: Lazy<Regex> =
213
  Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
 
2191
  | "THEATER GREETING EVENT"
2192
  | "TOKUTEN"
2193
  | "TRAILER"
2194
+ | "TV SPOT"
2195
  | "WORLD PREMIERE"
2196
+ | "予告"
2197
  | "番宣"
2198
  | "宣番"
2199
  | "映像特典"
 
2202
  || normalized.contains("特典映像")
2203
  || normalized.contains("番宣")
2204
  || normalized.contains("宣番")
2205
+ || normalized.contains("TV SPOT")
2206
+ || text.contains("予告")
2207
  || SPECIAL_TITLE_PHRASE_RE.is_match(text)
2208
  }
2209
 
 
2214
  &["Zom", "100"],
2215
  &["Kamisama", "Hajimemashita", "2"],
2216
  &["Phantasy", "Star", "Online", "2", "Episode", "Oracle"],
2217
+ &["Ghiblies", "Episode", "2"],
2218
  &["Lupin The Thrid Jigen Daisuke no Bohyou"],
2219
  &["Lupin The Third Jigen Daisuke no Bohyou"],
2220
  ];
 
2395
  if output[index] == "O" && groups[index].class_name.contains("SXE") {
2396
  output[index] = "EPISODE".to_string();
2397
  }
2398
+ if text.eq_ignore_ascii_case("TV") {
2399
+ let next_text = (index + 1..roles.len())
2400
+ .find(|&cursor| groups[cursor].class_name != "SEP")
2401
+ .map(|cursor| (cursor, group_text(tokens, &groups[cursor])));
2402
+ if let Some((spot_index, spot_text)) = next_text {
2403
+ if spot_text.eq_ignore_ascii_case("Spot") {
2404
+ output[index] = "SPECIAL".to_string();
2405
+ output[spot_index] = "SPECIAL".to_string();
2406
+ continue;
2407
+ }
2408
+ }
2409
+ }
2410
  if roles[index].starts_with("EPISODE") && YEAR_RANGE_RE.is_match(&text) {
2411
  output[index] = "O".to_string();
2412
  continue;
 
2526
  }
2527
  }
2528
  if roles[index].starts_with("EPISODE")
2529
+ && (text.chars().all(|ch| ch.is_ascii_digit())
2530
+ || matches!(
2531
+ classify_atom(&text).as_str(),
2532
+ "EPISODE" | "EPISODE_VERSION"
2533
+ ))
2534
  && output[..index].iter().any(|role| role == "SPECIAL")
2535
  && !output[..index].iter().any(|role| role.starts_with("EPISODE"))
2536
  {
 
2556
  && groups[index - 1].class_name == "SEP"
2557
  {
2558
  let previous_text = group_text(tokens, &groups[index - 2]);
2559
+ let next_sourceish = (index + 1..roles.len())
2560
+ .find(|&cursor| groups[cursor].class_name != "SEP")
2561
+ .is_some_and(|cursor| matches!(roles[cursor].as_str(), "SOURCE" | "RESOLUTION"));
2562
  if previous_text
2563
  .chars()
2564
  .any(|ch| ch.is_ascii_digit() || matches!(ch, '.' | '-' | '_' | '.'))
2565
+ || next_sourceish
2566
  {
2567
  output[index] = "RESOLUTION".to_string();
2568
  continue;
 
2753
  || previous_text.contains("下午")
2754
  || previous_text.contains('年')
2755
  || previous_text.contains('月')
2756
+ || previous_text.contains('秒')
2757
  || next_text.contains('点')
2758
  || next_text.contains('點')
2759
  || next_text.contains('半')
2760
  || next_text.contains('月')
2761
  || next_text.contains('日')
2762
+ || next_text.contains('秒')
2763
  {
2764
  output[index] = "O".to_string();
2765
  }
 
2929
  labels.push("O".to_string());
2930
  continue;
2931
  }
2932
+ if CJK_SEASON_TOKEN_RE.is_match(&piece) || SEASON_RE.is_match(&piece) {
2933
  output_pieces.push(piece);
2934
  labels.push("B-SEASON".to_string());
2935
  continue;
2936
  }
2937
+ if let Some(caps) = ASCII_SEASON_SUFFIX_RE.captures(&piece) {
2938
+ let before = caps.get(1).map(|m| m.as_str()).unwrap_or_default();
2939
+ let season = caps.get(2).map(|m| m.as_str()).unwrap_or_default();
2940
+ if !before.is_empty() {
2941
+ output_pieces.push(before.to_string());
2942
+ labels.push("B-TITLE".to_string());
2943
+ }
2944
+ output_pieces.push(season.to_string());
2945
+ labels.push("B-SEASON".to_string());
2946
+ continue;
2947
+ }
2948
  if let Some(caps) = CJK_SEASON_EMBEDDED_RE.captures(&piece) {
2949
  let before = caps.get(1).map(|m| m.as_str()).unwrap_or_default();
2950
  let season = caps.get(2).map(|m| m.as_str()).unwrap_or_default();
 
2994
  .is_some_and(|ch| ch.is_whitespace() || !ch.is_alphanumeric())
2995
  }
2996
 
2997
+ fn split_cjk_title_lang_prefix(token: &str) -> Option<(Vec<String>, Vec<String>)> {
2998
+ let caps = CJK_TITLE_LANG_PREFIX_RE.captures(token)?;
2999
+ let title = caps.get(1)?.as_str();
3000
+ let lang = caps.get(2)?.as_str();
3001
+ let marker = caps.get(3).map(|m| m.as_str()).unwrap_or_default();
3002
+ if title.chars().count() < 2 {
3003
+ return None;
3004
+ }
3005
+ let mut pieces = vec![title.to_string(), lang.to_string()];
3006
+ let mut labels = vec!["B-TITLE".to_string(), "B-SOURCE".to_string()];
3007
+ if !marker.is_empty() {
3008
+ pieces.push(marker.to_string());
3009
+ labels.push("O".to_string());
3010
+ }
3011
+ Some((pieces, labels))
3012
+ }
3013
+
3014
  fn project_refined_tokens(
3015
  tokens: &[String],
3016
  groups: &[Group],
 
3041
  continue;
3042
  }
3043
  }
3044
+ if role == "SOURCE" {
3045
+ if let Some((pieces, labels)) = split_cjk_title_lang_prefix(token) {
3046
+ output_tokens.extend(pieces);
3047
+ output_labels.extend(labels);
3048
+ continue;
3049
+ }
3050
+ }
3051
  if matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE") {
3052
  if let Some((pieces, labels)) = split_sxe_token(&strip_wrapper(token)) {
3053
  output_tokens.extend(pieces);
 
3771
  assert!(doremi_bonus.contains(&("おジャ魔女どれみナ".to_string(), "B-TITLE".to_string())));
3772
  assert!(doremi_bonus.contains(&("07".to_string(), "B-SPECIAL".to_string())));
3773
  assert!(!doremi_bonus.contains(&("07".to_string(), "B-EPISODE".to_string())));
3774
+
3775
+ let bd_menu =
3776
+ labels_for("[HYSUB]Kuusen Madoushi Kouhosei no Kyoukan[BDMenu][01v1][MP4][1280X720]");
3777
+ assert!(bd_menu.contains(&("BDMenu".to_string(), "B-SPECIAL".to_string())));
3778
+ assert!(bd_menu.contains(&("01v1".to_string(), "B-SPECIAL".to_string())));
3779
+ assert!(!bd_menu.contains(&("BDMenu".to_string(), "B-TITLE".to_string())));
3780
+
3781
+ let ura_on = labels_for("K-ON !! (TV S2 2010). URA-ON !! 01; 1080_h264_flac");
3782
+ assert!(ura_on.contains(&("K".to_string(), "B-TITLE".to_string())));
3783
+ assert!(ura_on.contains(&("01".to_string(), "B-EPISODE".to_string())));
3784
+ assert!(ura_on.contains(&("1080".to_string(), "B-RESOLUTION".to_string())));
3785
+ assert!(!ura_on.contains(&("1080".to_string(), "B-EPISODE".to_string())));
3786
+
3787
+ let machikado = labels_for("[KTXP][Machikado_Mazoku_S2][Mini][01][GB][1080p][BDrip][HEVC]");
3788
+ assert!(machikado.contains(&("Machikado".to_string(), "B-TITLE".to_string())));
3789
+ assert!(machikado.contains(&("S2".to_string(), "B-SEASON".to_string())));
3790
+ assert!(machikado.contains(&("01".to_string(), "B-EPISODE".to_string())));
3791
+
3792
+ let ronin = labels_for("【蓝色狂想】魔神坛斗士国日双语第01集");
3793
+ assert!(ronin.contains(&("魔神坛斗士".to_string(), "B-TITLE".to_string())));
3794
+ assert!(ronin.contains(&("国日双语".to_string(), "B-SOURCE".to_string())));
3795
+ assert!(ronin.contains(&("01".to_string(), "B-EPISODE".to_string())));
3796
+
3797
+ let ghiblies = labels_for("Ghiblies - Episode 2 op");
3798
+ assert!(ghiblies.contains(&("Ghiblies".to_string(), "B-TITLE".to_string())));
3799
+ assert!(ghiblies.contains(&("2".to_string(), "B-TITLE".to_string())));
3800
+ assert!(!ghiblies.contains(&("2".to_string(), "B-EPISODE".to_string())));
3801
+
3802
+ let tv_spot = labels_for("[RUELL-Next] Fruits Basket TV Spot 1 (DVD 768x576 x264 AAC) [49531416]");
3803
+ assert!(tv_spot.contains(&("TV".to_string(), "B-SPECIAL".to_string())));
3804
+ assert!(tv_spot.contains(&("1".to_string(), "B-SPECIAL".to_string())));
3805
+ assert!(!tv_spot.contains(&("1".to_string(), "B-EPISODE".to_string())));
3806
+
3807
+ let preview_seconds =
3808
+ labels_for("[DVD] 鋼鉄天使くるみ 予告 第03話 30秒バージョン (640x480 WMV9)");
3809
+ assert!(preview_seconds.contains(&("03".to_string(), "B-EPISODE".to_string())));
3810
+ assert!(!preview_seconds.contains(&("30".to_string(), "B-EPISODE".to_string())));
3811
  }
3812
  }