ModerRAS committed on
Commit
299b1af
·
1 Parent(s): f9e69dd

Improve DMHY low-frequency metadata handling

Browse files
tools/rust_dmhy_template_apply/src/main.rs CHANGED
@@ -222,7 +222,7 @@ static LANG_RE: Lazy<Regex> = Lazy::new(|| {
222
  Regex::new(r"(?i)^(?:CHS|CHT|ZHS|ZHT|GB|BIG5|JPN?|JP|JA|JAP|ENG|EN|SC|TC|简[体體]?|繁[体體]?|简日|繁日|字幕|内封|外挂|Sub|Subs|MSubs?)$").unwrap()
223
  });
224
  static MEDIA_RE: Lazy<Regex> = Lazy::new(|| {
225
- Regex::new(r"(?i)^(?:WEB|WEB[-_. ]?DL|WEB[-_. ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|HD|UHD|HDTV|TVRip|REMUX|xvid|x26[45]|h\.?26[45]|HEVC|AVC|AV1|YUV\d+P?\d*|AAC\s*\d*(?:\.\d+)?|DDP\s*\d*(?:\.\d+)?|FLAC|MP3|DTS|HDMA|DTS-HDMA|E?AC3x?\d*(?:\.\d+)?|Opus|WMV\d*|\d(?:\.\d)?ch|10[-_. ]?bit|8[-_. ]?bit|Hi10p|Ma10p|ASSx?\d*|SRTx?\d*|SUP|R\d[A-Z]*|NoSub|MKV|MP4|AVI|RAW|Raws?)$").unwrap()
226
  });
227
  static SPECIAL_TITLE_PHRASE_RE: Lazy<Regex> = Lazy::new(|| {
228
  Regex::new(r"(?i)\b(?:theater\s+greeting\s+event|world\s+prem(?:eie|iere)|picture\s+drama)\b")
@@ -2207,6 +2207,16 @@ fn is_special_title_phrase(text: &str) -> bool {
2207
  || SPECIAL_TITLE_PHRASE_RE.is_match(text)
2208
  }
2209
 
 
 
 
 
 
 
 
 
 
 
2210
  const KNOWN_TITLE_PHRASES: &[&[&str]] = &[
2211
  &["SPY", "x", "FAMILY"],
2212
  &["Spy", "x", "Family"],
@@ -2222,7 +2232,15 @@ const KNOWN_TITLE_PHRASES: &[&[&str]] = &[
2222
  fn apply_known_title_phrases(tokens: &[String], groups: &[Group], roles: &mut [String]) {
2223
  if let Some(whitelists) = RUNTIME_WHITELISTS.get() {
2224
  for (index, group) in groups.iter().enumerate() {
 
 
 
 
 
 
 
2225
  if group.class_name == "BRACKET_TEXT"
 
2226
  && whitelists
2227
  .group_names
2228
  .contains(&normalize_whitelist_name(&group_text(tokens, group)))
@@ -2315,6 +2333,22 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
2315
  let ep_markers = ["EP", "E", "Episode", "ep", "episode"];
2316
  let roman = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX"];
2317
  apply_known_title_phrases(tokens, groups, &mut output);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2318
  if roles
2319
  .first()
2320
  .is_some_and(|role| role.starts_with("EPISODE"))
@@ -2728,10 +2762,23 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
2728
  } else {
2729
  String::new()
2730
  };
 
 
 
 
 
2731
  if previous_text.ends_with('第') && next_text.starts_with('期') {
2732
  output[index] = "SEASON".to_string();
2733
  continue;
2734
  }
 
 
 
 
 
 
 
 
2735
  if output[..index].iter().any(|role| role == "TITLE")
2736
  && (output[..index]
2737
  .iter()
@@ -3127,6 +3174,19 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
3127
  ];
3128
  let mut output = labels.to_vec();
3129
  for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
 
 
 
 
 
 
 
 
 
 
 
 
 
3130
  if label != "O" || !entity_joiners.contains(&token.as_str()) {
3131
  continue;
3132
  }
@@ -3808,5 +3868,21 @@ mod tests {
3808
  labels_for("[DVD] 鋼鉄天使くるみ 予告 第03話 30秒バージョン (640x480 WMV9)");
3809
  assert!(preview_seconds.contains(&("03".to_string(), "B-EPISODE".to_string())));
3810
  assert!(!preview_seconds.contains(&("30".to_string(), "B-EPISODE".to_string())));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3811
  }
3812
  }
 
222
  Regex::new(r"(?i)^(?:CHS|CHT|ZHS|ZHT|GB|BIG5|JPN?|JP|JA|JAP|ENG|EN|SC|TC|简[体體]?|繁[体體]?|简日|繁日|字幕|内封|外挂|Sub|Subs|MSubs?)$").unwrap()
223
  });
224
  static MEDIA_RE: Lazy<Regex> = Lazy::new(|| {
225
+ Regex::new(r"(?i)^(?:WEB|WEB[-_. ]?DL|WEB[-_. ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|HD|UHD|HDTV|TVRip|REMUX|xvid|x26[45]|h\.?26[45]|HEVC|AVC|AV1|YUV\d+P?\d*|AAC\s*\d*(?:\.\d+)?|DDP\s*\d*(?:\.\d+)?|FLAC|MP3|DTS|HDMA|DTS-HDMA|E?AC3x?\d*(?:\.\d+)?|Opus|WMV\d*|\d(?:\.\d)?ch|10[-_. ]?bit|8[-_. ]?bit|Hi10p?|Ma10p|ASSx?\d*|SRTx?\d*|SUP|R\d[A-Z]*|NoSub|MKV|MP4|AVI|RAW|Raws?)$").unwrap()
226
  });
227
  static SPECIAL_TITLE_PHRASE_RE: Lazy<Regex> = Lazy::new(|| {
228
  Regex::new(r"(?i)\b(?:theater\s+greeting\s+event|world\s+prem(?:eie|iere)|picture\s+drama)\b")
 
2207
  || SPECIAL_TITLE_PHRASE_RE.is_match(text)
2208
  }
2209
 
2210
+ fn looks_like_release_group(text: &str) -> bool {
2211
+ let normalized = text.to_ascii_lowercase();
2212
+ normalized.contains("fansub")
2213
+ || normalized.ends_with("sub")
2214
+ || normalized.contains("sub&")
2215
+ || normalized.contains("&sub")
2216
+ || normalized.contains("字幕组")
2217
+ || normalized.contains("字幕組")
2218
+ }
2219
+
2220
  const KNOWN_TITLE_PHRASES: &[&[&str]] = &[
2221
  &["SPY", "x", "FAMILY"],
2222
  &["Spy", "x", "Family"],
 
2232
  fn apply_known_title_phrases(tokens: &[String], groups: &[Group], roles: &mut [String]) {
2233
  if let Some(whitelists) = RUNTIME_WHITELISTS.get() {
2234
  for (index, group) in groups.iter().enumerate() {
2235
+ let previous_structural = roles[..index].iter().any(|role| {
2236
+ role.starts_with("EPISODE")
2237
+ || matches!(
2238
+ role.as_str(),
2239
+ "SEASON" | "SPECIAL" | "SOURCE" | "RESOLUTION"
2240
+ )
2241
+ });
2242
  if group.class_name == "BRACKET_TEXT"
2243
+ && !previous_structural
2244
  && whitelists
2245
  .group_names
2246
  .contains(&normalize_whitelist_name(&group_text(tokens, group)))
 
2333
  let ep_markers = ["EP", "E", "Episode", "ep", "episode"];
2334
  let roman = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX"];
2335
  apply_known_title_phrases(tokens, groups, &mut output);
2336
+ if output.first().is_some_and(|role| role == "GROUP") {
2337
+ let first_text = group_text(tokens, &groups[0]);
2338
+ let first_is_known_group = RUNTIME_WHITELISTS.get().is_some_and(|whitelists| {
2339
+ whitelists
2340
+ .group_names
2341
+ .contains(&normalize_whitelist_name(&first_text))
2342
+ });
2343
+ if !first_is_known_group {
2344
+ if let Some(groupish_index) = (1..groups.len()).find(|&index| {
2345
+ output[index] == "TITLE" && looks_like_release_group(&group_text(tokens, &groups[index]))
2346
+ }) {
2347
+ output[0] = "TITLE".to_string();
2348
+ output[groupish_index] = "GROUP".to_string();
2349
+ }
2350
+ }
2351
+ }
2352
  if roles
2353
  .first()
2354
  .is_some_and(|role| role.starts_with("EPISODE"))
 
2762
  } else {
2763
  String::new()
2764
  };
2765
+ let previous_real_text = (0..index)
2766
+ .rev()
2767
+ .find(|&cursor| groups[cursor].class_name != "SEP")
2768
+ .map(|cursor| group_text(tokens, &groups[cursor]))
2769
+ .unwrap_or_default();
2770
  if previous_text.ends_with('第') && next_text.starts_with('期') {
2771
  output[index] = "SEASON".to_string();
2772
  continue;
2773
  }
2774
+ if matches!(
2775
+ previous_real_text.to_ascii_lowercase().as_str(),
2776
+ "lesson" | "part"
2777
+ )
2778
+ {
2779
+ output[index] = "O".to_string();
2780
+ continue;
2781
+ }
2782
  if output[..index].iter().any(|role| role == "TITLE")
2783
  && (output[..index]
2784
  .iter()
 
3174
  ];
3175
  let mut output = labels.to_vec();
3176
  for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
3177
+ if label == "B-EPISODE" && token.chars().all(|ch| ch.is_ascii_digit()) {
3178
+ let previous_word = (0..index)
3179
+ .rev()
3180
+ .find(|&cursor| {
3181
+ !joiners.contains(&tokens[cursor].as_str()) && labels[cursor] != "O"
3182
+ || tokens[cursor].chars().any(|ch| ch.is_alphabetic())
3183
+ })
3184
+ .map(|cursor| tokens[cursor].to_ascii_lowercase());
3185
+ if matches!(previous_word.as_deref(), Some("lesson" | "part")) {
3186
+ output[index] = "O".to_string();
3187
+ continue;
3188
+ }
3189
+ }
3190
  if label != "O" || !entity_joiners.contains(&token.as_str()) {
3191
  continue;
3192
  }
 
3868
  labels_for("[DVD] 鋼鉄天使くるみ 予告 第03話 30秒バージョン (640x480 WMV9)");
3869
  assert!(preview_seconds.contains(&("03".to_string(), "B-EPISODE".to_string())));
3870
  assert!(!preview_seconds.contains(&("30".to_string(), "B-EPISODE".to_string())));
3871
+
3872
+ let hi10_source =
3873
+ labels_for("[POPGO][Shigatsu wa Kimi no Uso] [01][Hi10][720P][GB][A964DA24]");
3874
+ assert!(hi10_source.contains(&("Hi10".to_string(), "B-SOURCE".to_string())));
3875
+ assert!(!hi10_source.contains(&("Hi10".to_string(), "B-GROUP".to_string())));
3876
+
3877
+ let souten =
3878
+ labels_for("[苍天之拳].[Fosky_Fansub][Souten_No_Ken][DVDRIP][01][H.264_FLAC][848x480][CDD495FC]");
3879
+ assert!(souten.contains(&("Fosky".to_string(), "B-GROUP".to_string())));
3880
+ assert!(!souten.contains(&("苍天之拳".to_string(), "B-GROUP".to_string())));
3881
+ assert!(souten.contains(&("Souten".to_string(), "B-TITLE".to_string())));
3882
+
3883
+ let bonjour =
3884
+ labels_for("(2014Q4) Bonjour♪恋味パティスリー 第01話 「Lesson 1」 (1280x720 x265 10bit AAC)");
3885
+ assert!(bonjour.contains(&("01".to_string(), "B-EPISODE".to_string())));
3886
+ assert!(!bonjour.contains(&("1".to_string(), "B-EPISODE".to_string())));
3887
  }
3888
  }