ModerRAS committed on
Commit
acafd67
·
1 Parent(s): 299b1af

Refine DMHY bonus labeling cases

Browse files
tools/rust_dmhy_template_apply/src/main.rs CHANGED
@@ -207,7 +207,7 @@ static CJK_TITLE_LANG_PREFIX_RE: Lazy<Regex> =
207
  Lazy::new(|| Regex::new(r"^(.+?)(国日双语|國日雙語|日语版|日語版|国语版|國語版|双语|雙語)(第?)$").unwrap());
208
  static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
209
  static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
210
- Regex::new(r"(?i)^(?:(?:NCOP|NCED|OP|ED|PV|CM)(?:[\s_.-]?(?:\d{1,4}|v\d{1,3}|[A-Z]))?|SP(?:[\s_.-]?\d{0,4})?|(?:OVA|OAD|IV)(?:[\s_.-]?\d{0,4})?|(?:BD)?Menu(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?|(?:Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?)$").unwrap()
211
  });
212
  static VOLUME_RE: Lazy<Regex> =
213
  Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
@@ -2192,6 +2192,7 @@ fn is_special_title_phrase(text: &str) -> bool {
2192
  | "TOKUTEN"
2193
  | "TRAILER"
2194
  | "TV SPOT"
 
2195
  | "WORLD PREMIERE"
2196
  | "予告"
2197
  | "番宣"
@@ -2203,6 +2204,7 @@ fn is_special_title_phrase(text: &str) -> bool {
2203
  || normalized.contains("番宣")
2204
  || normalized.contains("宣番")
2205
  || normalized.contains("TV SPOT")
 
2206
  || text.contains("予告")
2207
  || SPECIAL_TITLE_PHRASE_RE.is_match(text)
2208
  }
@@ -2224,6 +2226,7 @@ const KNOWN_TITLE_PHRASES: &[&[&str]] = &[
2224
  &["Zom", "100"],
2225
  &["Kamisama", "Hajimemashita", "2"],
2226
  &["Phantasy", "Star", "Online", "2", "Episode", "Oracle"],
 
2227
  &["Ghiblies", "Episode", "2"],
2228
  &["Lupin The Thrid Jigen Daisuke no Bohyou"],
2229
  &["Lupin The Third Jigen Daisuke no Bohyou"],
@@ -2445,6 +2448,48 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
2445
  output[index] = "O".to_string();
2446
  continue;
2447
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2448
  if roles[index].starts_with("EPISODE")
2449
  && index >= 1
2450
  && output[index - 1] == "TITLE"
@@ -2773,12 +2818,18 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
2773
  }
2774
  if matches!(
2775
  previous_real_text.to_ascii_lowercase().as_str(),
2776
- "lesson" | "part"
2777
  )
2778
  {
2779
  output[index] = "O".to_string();
2780
  continue;
2781
  }
 
 
 
 
 
 
2782
  if output[..index].iter().any(|role| role == "TITLE")
2783
  && (output[..index]
2784
  .iter()
@@ -3174,7 +3225,56 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
3174
  ];
3175
  let mut output = labels.to_vec();
3176
  for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3177
  if label == "B-EPISODE" && token.chars().all(|ch| ch.is_ascii_digit()) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3178
  let previous_word = (0..index)
3179
  .rev()
3180
  .find(|&cursor| {
@@ -3182,7 +3282,7 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
3182
  || tokens[cursor].chars().any(|ch| ch.is_alphabetic())
3183
  })
3184
  .map(|cursor| tokens[cursor].to_ascii_lowercase());
3185
- if matches!(previous_word.as_deref(), Some("lesson" | "part")) {
3186
  output[index] = "O".to_string();
3187
  continue;
3188
  }
@@ -3884,5 +3984,33 @@ mod tests {
3884
  labels_for("(2014Q4) Bonjour♪恋味パティスリー 第01話 「Lesson 1」 (1280x720 x265 10bit AAC)");
3885
  assert!(bonjour.contains(&("01".to_string(), "B-EPISODE".to_string())));
3886
  assert!(!bonjour.contains(&("1".to_string(), "B-EPISODE".to_string())));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3887
  }
3888
  }
 
207
  Lazy::new(|| Regex::new(r"^(.+?)(国日双语|國日雙語|日语版|日語版|国语版|國語版|双语|雙語)(第?)$").unwrap());
208
  static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
209
  static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
210
+ Regex::new(r"(?i)^(?:(?:NCOP|NCED|OP|ED|PV|CM)(?:[\s_.-]?(?:\d{1,4}|v\d{1,3}|[A-Z]))?|SP(?:[\s_.-]?\d{0,4})?|(?:OVA|OAD|IV)(?:[\s_.-]?\d{0,4})?|(?:BD)?Menu(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?|(?:BD[-_. ]?)?Spot(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?|(?:Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?)$").unwrap()
211
  });
212
  static VOLUME_RE: Lazy<Regex> =
213
  Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
 
2192
  | "TOKUTEN"
2193
  | "TRAILER"
2194
  | "TV SPOT"
2195
+ | "SPOT"
2196
  | "WORLD PREMIERE"
2197
  | "予告"
2198
  | "番宣"
 
2204
  || normalized.contains("番宣")
2205
  || normalized.contains("宣番")
2206
  || normalized.contains("TV SPOT")
2207
+ || normalized.contains("BD SPOT")
2208
  || text.contains("予告")
2209
  || SPECIAL_TITLE_PHRASE_RE.is_match(text)
2210
  }
 
2226
  &["Zom", "100"],
2227
  &["Kamisama", "Hajimemashita", "2"],
2228
  &["Phantasy", "Star", "Online", "2", "Episode", "Oracle"],
2229
+ &["Durarara", "2", "Ketsu"],
2230
  &["Ghiblies", "Episode", "2"],
2231
  &["Lupin The Thrid Jigen Daisuke no Bohyou"],
2232
  &["Lupin The Third Jigen Daisuke no Bohyou"],
 
2448
  output[index] = "O".to_string();
2449
  continue;
2450
  }
2451
+ if roles[index].starts_with("EPISODE")
2452
+ && index >= 2
2453
+ && matches!(group_text(tokens, &groups[index - 1]).as_str(), "×" | "x" | "X")
2454
+ && output[index - 2] == "TITLE"
2455
+ && !roles[index + 1..].iter().any(|role| role.starts_with("EPISODE"))
2456
+ {
2457
+ output[index] = "TITLE".to_string();
2458
+ if let Some(next_text_index) = (index + 1..roles.len()).find(|&cursor| {
2459
+ groups[cursor].class_name != "SEP" && groups[cursor].class_name == "TEXT"
2460
+ }) {
2461
+ output[next_text_index] = "TITLE".to_string();
2462
+ }
2463
+ continue;
2464
+ }
2465
+ if roles[index].starts_with("EPISODE")
2466
+ && !output[..index].iter().any(|role| role.starts_with("EPISODE"))
2467
+ && group_text(
2468
+ tokens,
2469
+ &groups[(0..index)
2470
+ .rev()
2471
+ .find(|&cursor| groups[cursor].class_name != "SEP")
2472
+ .unwrap_or(index)],
2473
+ )
2474
+ .eq_ignore_ascii_case("Movie")
2475
+ {
2476
+ output[index] = "TITLE".to_string();
2477
+ continue;
2478
+ }
2479
+ if output[index] == "TITLE"
2480
+ && matches!(text.as_str(), "中日" | "日中" | "英日" | "日英")
2481
+ {
2482
+ let next_source_lang = (index + 1..roles.len())
2483
+ .find(|&cursor| groups[cursor].class_name != "SEP")
2484
+ .is_some_and(|cursor| {
2485
+ output[cursor] == "SOURCE"
2486
+ && group_text(tokens, &groups[cursor]).contains('语')
2487
+ });
2488
+ if next_source_lang {
2489
+ output[index] = "SOURCE".to_string();
2490
+ continue;
2491
+ }
2492
+ }
2493
  if roles[index].starts_with("EPISODE")
2494
  && index >= 1
2495
  && output[index - 1] == "TITLE"
 
2818
  }
2819
  if matches!(
2820
  previous_real_text.to_ascii_lowercase().as_str(),
2821
+ "lesson" | "part" | "no"
2822
  )
2823
  {
2824
  output[index] = "O".to_string();
2825
  continue;
2826
  }
2827
+ if previous_real_text.contains("予告")
2828
+ || previous_real_text.eq_ignore_ascii_case("Spot")
2829
+ {
2830
+ output[index] = "SPECIAL".to_string();
2831
+ continue;
2832
+ }
2833
  if output[..index].iter().any(|role| role == "TITLE")
2834
  && (output[..index]
2835
  .iter()
 
3225
  ];
3226
  let mut output = labels.to_vec();
3227
  for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
3228
+ if label == "B-TITLE"
3229
+ && token.chars().all(|ch| ch.is_ascii_digit())
3230
+ && token.len() == 3
3231
+ && index + 1 < tokens.len()
3232
+ && matches!(tokens[index + 1].as_str(), "「" | "「" | "\"" | "'")
3233
+ {
3234
+ output[index] = "B-EPISODE".to_string();
3235
+ let mut cursor = index + 1;
3236
+ while cursor < tokens.len() {
3237
+ output[cursor] = "O".to_string();
3238
+ if matches!(tokens[cursor].as_str(), "」" | "」" | "\"" | "'") && cursor > index + 1 {
3239
+ break;
3240
+ }
3241
+ cursor += 1;
3242
+ }
3243
+ continue;
3244
+ }
3245
+ if label == "B-TITLE" && matches!(token.as_str(), "中日" | "日中" | "英日" | "日英") {
3246
+ let next_word = (index + 1..tokens.len()).find(|&cursor| {
3247
+ tokens[cursor].chars().any(|ch| ch.is_alphanumeric())
3248
+ });
3249
+ if next_word.is_some_and(|cursor| {
3250
+ labels[cursor] == "B-SOURCE" && tokens[cursor].contains('语')
3251
+ }) {
3252
+ output[index] = "B-SOURCE".to_string();
3253
+ continue;
3254
+ }
3255
+ }
3256
  if label == "B-EPISODE" && token.chars().all(|ch| ch.is_ascii_digit()) {
3257
+ let previous_non_space = (0..index)
3258
+ .rev()
3259
+ .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
3260
+ if previous_non_space
3261
+ .is_some_and(|cursor| matches!(tokens[cursor].as_str(), "×" | "x" | "X"))
3262
+ {
3263
+ let left_title = (0..previous_non_space.unwrap())
3264
+ .rev()
3265
+ .find(|&cursor| labels[cursor] != "O")
3266
+ .is_some_and(|cursor| labels[cursor] == "B-TITLE");
3267
+ if left_title {
3268
+ output[index] = "B-TITLE".to_string();
3269
+ if let Some(next_word) = (index + 1..tokens.len()).find(|&cursor| {
3270
+ labels[cursor] == "O"
3271
+ && tokens[cursor].chars().any(|ch| ch.is_alphabetic())
3272
+ }) {
3273
+ output[next_word] = "B-TITLE".to_string();
3274
+ }
3275
+ continue;
3276
+ }
3277
+ }
3278
  let previous_word = (0..index)
3279
  .rev()
3280
  .find(|&cursor| {
 
3282
  || tokens[cursor].chars().any(|ch| ch.is_alphabetic())
3283
  })
3284
  .map(|cursor| tokens[cursor].to_ascii_lowercase());
3285
+ if matches!(previous_word.as_deref(), Some("lesson" | "part" | "no")) {
3286
  output[index] = "O".to_string();
3287
  continue;
3288
  }
 
3984
  labels_for("(2014Q4) Bonjour♪恋味パティスリー 第01話 「Lesson 1」 (1280x720 x265 10bit AAC)");
3985
  assert!(bonjour.contains(&("01".to_string(), "B-EPISODE".to_string())));
3986
  assert!(!bonjour.contains(&("1".to_string(), "B-EPISODE".to_string())));
3987
+
3988
+ let durarara = labels_for("[VCB-Studio] Durarara!!×2 Ketsu [Menu01][Ma10p_1080p][x265_flac]");
3989
+ assert!(durarara.contains(&("Durarara".to_string(), "B-TITLE".to_string())));
3990
+ assert!(durarara.contains(&("2".to_string(), "B-TITLE".to_string())));
3991
+ assert!(!durarara.contains(&("2".to_string(), "B-EPISODE".to_string())));
3992
+
3993
+ let bd_spot =
3994
+ labels_for("[Moozzi2] Amanchu! [SP05] BD-Spot - 01 (BD 1920x1080 x.264 Flac)");
3995
+ assert!(bd_spot.contains(&("Spot".to_string(), "B-SPECIAL".to_string())));
3996
+ assert!(bd_spot.contains(&("01".to_string(), "B-SPECIAL".to_string())));
3997
+ assert!(!bd_spot.contains(&("01".to_string(), "B-EPISODE".to_string())));
3998
+
3999
+ let preview_number =
4000
+ labels_for("[Snow-Raws] 刀使ノ巫女 第02話 予告01 (BD 1920x1080 HEVC-YUV420P10 FLAC)");
4001
+ assert!(preview_number.contains(&("02".to_string(), "B-EPISODE".to_string())));
4002
+ assert!(preview_number.contains(&("01".to_string(), "B-SPECIAL".to_string())));
4003
+
4004
+ let bleach_movie = labels_for("Bleach the Movie 3 - Fade to Black, I Call Your Name");
4005
+ assert!(bleach_movie.contains(&("3".to_string(), "B-TITLE".to_string())));
4006
+ assert!(!bleach_movie.contains(&("3".to_string(), "B-EPISODE".to_string())));
4007
+
4008
+ let no_number = labels_for("[甜甜圈字幕组] 小讨厌 081「爷爷的礼物 No.1」");
4009
+ assert!(no_number.contains(&("081".to_string(), "B-EPISODE".to_string())));
4010
+ assert!(!no_number.contains(&("1".to_string(), "B-EPISODE".to_string())));
4011
+
4012
+ let bilingual = labels_for("辉夜大小姐想让我告白~天才们的恋爱头脑战~.S2-01.中日双语.云光字幕组.[1080p]");
4013
+ assert!(bilingual.contains(&("中日".to_string(), "B-SOURCE".to_string())));
4014
+ assert!(!bilingual.contains(&("中日".to_string(), "B-TITLE".to_string())));
4015
  }
4016
  }