ModerRAS committed on
Commit
b7152ef
·
1 Parent(s): acafd67

Refine high-frequency DMHY labels

Browse files
tools/rust_dmhy_template_apply/src/main.rs CHANGED
@@ -203,6 +203,10 @@ static CJK_SEASON_EMBEDDED_RE: Lazy<Regex> = Lazy::new(|| {
203
  });
204
  static ASCII_SEASON_SUFFIX_RE: Lazy<Regex> =
205
  Lazy::new(|| Regex::new(r"(?i)^(.+?)[\s_.-]+(S\d{1,2})$").unwrap());
 
 
 
 
206
  static CJK_TITLE_LANG_PREFIX_RE: Lazy<Regex> =
207
  Lazy::new(|| Regex::new(r"^(.+?)(国日双语|國日雙語|日语版|日語版|国语版|國語版|双语|雙語)(第?)$").unwrap());
208
  static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
@@ -219,7 +223,7 @@ static DATE_RANGE_MIXED_RE: Lazy<Regex> = Lazy::new(|| {
219
  static CJK_DATE_RE: Lazy<Regex> =
220
  Lazy::new(|| Regex::new(r"^(?:19|20)\d{2}年\d{1,2}月\d{1,2}日$").unwrap());
221
  static LANG_RE: Lazy<Regex> = Lazy::new(|| {
222
- Regex::new(r"(?i)^(?:CHS|CHT|ZHS|ZHT|GB|BIG5|JPN?|JP|JA|JAP|ENG|EN|SC|TC|简[体體]?|繁[体體]?|简日|繁日|字幕|内封|外挂|Sub|Subs|MSubs?)$").unwrap()
223
  });
224
  static MEDIA_RE: Lazy<Regex> = Lazy::new(|| {
225
  Regex::new(r"(?i)^(?:WEB|WEB[-_. ]?DL|WEB[-_. ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|HD|UHD|HDTV|TVRip|REMUX|xvid|x26[45]|h\.?26[45]|HEVC|AVC|AV1|YUV\d+P?\d*|AAC\s*\d*(?:\.\d+)?|DDP\s*\d*(?:\.\d+)?|FLAC|MP3|DTS|HDMA|DTS-HDMA|E?AC3x?\d*(?:\.\d+)?|Opus|WMV\d*|\d(?:\.\d)?ch|10[-_. ]?bit|8[-_. ]?bit|Hi10p?|Ma10p|ASSx?\d*|SRTx?\d*|SUP|R\d[A-Z]*|NoSub|MKV|MP4|AVI|RAW|Raws?)$").unwrap()
@@ -1210,7 +1214,10 @@ fn process_filename(
1210
  }
1211
  }
1212
  };
1213
- if recipe.count.unwrap_or(0) <= args.audit_max_count && has_blocking_low_frequency_warning(&record)
 
 
 
1214
  {
1215
  return Processed::Skipped {
1216
  reason: "low_frequency_audit_warning",
@@ -1231,8 +1238,8 @@ fn process_filename(
1231
  }
1232
  }
1233
 
1234
- fn has_blocking_low_frequency_warning(record: &Record) -> bool {
1235
- audit_warnings(record).iter().any(|warning| {
1236
  matches!(
1237
  warning.as_str(),
1238
  "ambiguous_no_episode_title"
@@ -3092,6 +3099,13 @@ fn is_standalone_separator(token: &str) -> bool {
3092
  .is_some_and(|ch| ch.is_whitespace() || !ch.is_alphanumeric())
3093
  }
3094
 
 
 
 
 
 
 
 
3095
  fn split_cjk_title_lang_prefix(token: &str) -> Option<(Vec<String>, Vec<String>)> {
3096
  let caps = CJK_TITLE_LANG_PREFIX_RE.captures(token)?;
3097
  let title = caps.get(1)?.as_str();
@@ -3253,6 +3267,70 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
3253
  continue;
3254
  }
3255
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3256
  if label == "B-EPISODE" && token.chars().all(|ch| ch.is_ascii_digit()) {
3257
  let previous_non_space = (0..index)
3258
  .rev()
@@ -3275,13 +3353,19 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
3275
  continue;
3276
  }
3277
  }
3278
- let previous_word = (0..index)
3279
- .rev()
3280
- .find(|&cursor| {
3281
- !joiners.contains(&tokens[cursor].as_str()) && labels[cursor] != "O"
3282
- || tokens[cursor].chars().any(|ch| ch.is_alphabetic())
3283
- })
3284
- .map(|cursor| tokens[cursor].to_ascii_lowercase());
 
 
 
 
 
 
3285
  if matches!(previous_word.as_deref(), Some("lesson" | "part" | "no")) {
3286
  output[index] = "O".to_string();
3287
  continue;
@@ -3560,14 +3644,16 @@ mod tests {
3560
  let bang_season =
3561
  labels_for("[LoliHouse] Bang Dream! 2nd Season - 01 [BDRip 1080p HEVC-10bit FLAC]");
3562
  assert!(bang_season.contains(&("Bang".to_string(), "B-TITLE".to_string())));
3563
- assert!(bang_season.contains(&("Season".to_string(), "B-TITLE".to_string())));
 
3564
  assert!(bang_season.contains(&("01".to_string(), "B-EPISODE".to_string())));
3565
  assert!(!bang_season.contains(&("01".to_string(), "B-SEASON".to_string())));
3566
 
3567
  let basket =
3568
  labels_for("[Nekomoe kissaten&VCB-Studio] Fruits Basket 1st Season [24][1080p][x264_aac][sc]");
3569
  assert!(basket.contains(&("Fruits".to_string(), "B-TITLE".to_string())));
3570
- assert!(basket.contains(&("Season".to_string(), "B-TITLE".to_string())));
 
3571
  assert!(basket.contains(&("24".to_string(), "B-EPISODE".to_string())));
3572
  assert!(!basket.contains(&("24".to_string(), "B-SEASON".to_string())));
3573
 
@@ -4012,5 +4098,52 @@ mod tests {
4012
  let bilingual = labels_for("辉夜大小姐想让我告白~天才们的恋爱头脑战~.S2-01.中日双语.云光字幕组.[1080p]");
4013
  assert!(bilingual.contains(&("中日".to_string(), "B-SOURCE".to_string())));
4014
  assert!(!bilingual.contains(&("中日".to_string(), "B-TITLE".to_string())));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4015
  }
4016
  }
 
203
  });
204
  static ASCII_SEASON_SUFFIX_RE: Lazy<Regex> =
205
  Lazy::new(|| Regex::new(r"(?i)^(.+?)[\s_.-]+(S\d{1,2})$").unwrap());
206
+ static ORDINAL_SEASON_TOKEN_RE: Lazy<Regex> =
207
+ Lazy::new(|| Regex::new(r"(?i)^\d{1,2}(?:st|nd|rd|th)$").unwrap());
208
+ static SEASON_WORD_RE: Lazy<Regex> =
209
+ Lazy::new(|| Regex::new(r"(?i)^(?:Season|Saison)$").unwrap());
210
  static CJK_TITLE_LANG_PREFIX_RE: Lazy<Regex> =
211
  Lazy::new(|| Regex::new(r"^(.+?)(国日双语|國日雙語|日语版|日語版|国语版|國語版|双语|雙語)(第?)$").unwrap());
212
  static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
 
223
  static CJK_DATE_RE: Lazy<Regex> =
224
  Lazy::new(|| Regex::new(r"^(?:19|20)\d{2}年\d{1,2}月\d{1,2}日$").unwrap());
225
  static LANG_RE: Lazy<Regex> = Lazy::new(|| {
226
+ Regex::new(r"(?i)^(?:CHS|CHT|ZHS|ZHT|GB|BIG5|JPN?|JP|JA|JAP|JPTC|JPSC|ENG|EN|SC|TC|简[体體]?|繁[体體]?|简日|繁日|字幕|内封|外挂|Sub|Subs|MSubs?)$").unwrap()
227
  });
228
  static MEDIA_RE: Lazy<Regex> = Lazy::new(|| {
229
  Regex::new(r"(?i)^(?:WEB|WEB[-_. ]?DL|WEB[-_. ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|HD|UHD|HDTV|TVRip|REMUX|xvid|x26[45]|h\.?26[45]|HEVC|AVC|AV1|YUV\d+P?\d*|AAC\s*\d*(?:\.\d+)?|DDP\s*\d*(?:\.\d+)?|FLAC|MP3|DTS|HDMA|DTS-HDMA|E?AC3x?\d*(?:\.\d+)?|Opus|WMV\d*|\d(?:\.\d)?ch|10[-_. ]?bit|8[-_. ]?bit|Hi10p?|Ma10p|ASSx?\d*|SRTx?\d*|SUP|R\d[A-Z]*|NoSub|MKV|MP4|AVI|RAW|Raws?)$").unwrap()
 
1214
  }
1215
  }
1216
  };
1217
+ let warnings = audit_warnings(&record);
1218
+ if warnings.iter().any(|warning| warning == "no_title")
1219
+ || (recipe.count.unwrap_or(0) <= args.audit_max_count
1220
+ && has_blocking_warnings(&warnings))
1221
  {
1222
  return Processed::Skipped {
1223
  reason: "low_frequency_audit_warning",
 
1238
  }
1239
  }
1240
 
1241
+ fn has_blocking_warnings(warnings: &[String]) -> bool {
1242
+ warnings.iter().any(|warning| {
1243
  matches!(
1244
  warning.as_str(),
1245
  "ambiguous_no_episode_title"
 
3099
  .is_some_and(|ch| ch.is_whitespace() || !ch.is_alphanumeric())
3100
  }
3101
 
3102
+ fn is_unicode_roman_season(token: &str) -> bool {
3103
+ matches!(
3104
+ token,
3105
+ "Ⅰ" | "Ⅱ" | "Ⅲ" | "Ⅳ" | "Ⅴ" | "Ⅵ" | "Ⅶ" | "Ⅷ" | "Ⅸ" | "Ⅹ" | "Ⅺ" | "Ⅻ"
3106
+ )
3107
+ }
3108
+
3109
  fn split_cjk_title_lang_prefix(token: &str) -> Option<(Vec<String>, Vec<String>)> {
3110
  let caps = CJK_TITLE_LANG_PREFIX_RE.captures(token)?;
3111
  let title = caps.get(1)?.as_str();
 
3267
  continue;
3268
  }
3269
  }
3270
+ if label == "B-TITLE" && matches!(token.to_ascii_lowercase().as_str(), "ep" | "episode") {
3271
+ let next_episode = (index + 1..tokens.len()).find(|&cursor| {
3272
+ !joiners.contains(&tokens[cursor].as_str()) || labels[cursor] != "O"
3273
+ });
3274
+ if next_episode.is_some_and(|cursor| labels[cursor] == "B-EPISODE") {
3275
+ output[index] = "O".to_string();
3276
+ continue;
3277
+ }
3278
+ }
3279
+ if label == "B-TITLE" && is_unicode_roman_season(token) {
3280
+ let previous_title_word = (0..index).rev().find(|&cursor| {
3281
+ output[cursor] == "B-TITLE"
3282
+ && tokens[cursor]
3283
+ .chars()
3284
+ .any(|ch| ch.is_alphanumeric() || ('\u{4e00}'..='\u{9fff}').contains(&ch))
3285
+ });
3286
+ let previous_word = previous_title_word.map(|cursor| tokens[cursor].to_ascii_lowercase());
3287
+ if previous_title_word.is_some()
3288
+ && !matches!(previous_word.as_deref(), Some("lupin"))
3289
+ {
3290
+ output[index] = "B-SEASON".to_string();
3291
+ continue;
3292
+ }
3293
+ }
3294
+ if label == "B-TITLE" && ORDINAL_SEASON_TOKEN_RE.is_match(token) {
3295
+ let next_word = (index + 1..tokens.len()).find(|&cursor| {
3296
+ !joiners.contains(&tokens[cursor].as_str())
3297
+ && tokens[cursor].chars().any(|ch| ch.is_alphabetic())
3298
+ });
3299
+ if next_word.is_some_and(|cursor| {
3300
+ labels[cursor] == "B-TITLE" && SEASON_WORD_RE.is_match(&tokens[cursor])
3301
+ }) {
3302
+ output[index] = "B-SEASON".to_string();
3303
+ if let Some(cursor) = next_word {
3304
+ for joiner_index in index + 1..cursor {
3305
+ if joiners.contains(&tokens[joiner_index].as_str()) {
3306
+ output[joiner_index] = "B-SEASON".to_string();
3307
+ }
3308
+ }
3309
+ output[cursor] = "B-SEASON".to_string();
3310
+ }
3311
+ continue;
3312
+ }
3313
+ }
3314
+ if label == "O"
3315
+ && token.chars().all(|ch| ch.is_ascii_digit())
3316
+ && token.len() <= 3
3317
+ {
3318
+ let previous_non_space = (0..index)
3319
+ .rev()
3320
+ .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
3321
+ let next_non_space = (index + 1..tokens.len())
3322
+ .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
3323
+ if previous_non_space.is_some_and(|cursor| tokens[cursor] == "[")
3324
+ && next_non_space.is_some_and(|cursor| tokens[cursor] == "]")
3325
+ && output[..index].iter().any(|label| label == "B-TITLE")
3326
+ && output[index + 1..]
3327
+ .iter()
3328
+ .any(|label| matches!(label.as_str(), "B-SOURCE" | "B-RESOLUTION"))
3329
+ {
3330
+ output[index] = "B-EPISODE".to_string();
3331
+ continue;
3332
+ }
3333
+ }
3334
  if label == "B-EPISODE" && token.chars().all(|ch| ch.is_ascii_digit()) {
3335
  let previous_non_space = (0..index)
3336
  .rev()
 
3353
  continue;
3354
  }
3355
  }
3356
+ let mut previous_word = None;
3357
+ for cursor in (0..index).rev() {
3358
+ if matches!(tokens[cursor].as_str(), "]" | "】" | ")" | ")") {
3359
+ break;
3360
+ }
3361
+ if joiners.contains(&tokens[cursor].as_str()) {
3362
+ continue;
3363
+ }
3364
+ if tokens[cursor].chars().any(|ch| ch.is_alphabetic()) {
3365
+ previous_word = Some(tokens[cursor].to_ascii_lowercase());
3366
+ }
3367
+ break;
3368
+ }
3369
  if matches!(previous_word.as_deref(), Some("lesson" | "part" | "no")) {
3370
  output[index] = "O".to_string();
3371
  continue;
 
3644
  let bang_season =
3645
  labels_for("[LoliHouse] Bang Dream! 2nd Season - 01 [BDRip 1080p HEVC-10bit FLAC]");
3646
  assert!(bang_season.contains(&("Bang".to_string(), "B-TITLE".to_string())));
3647
+ assert!(bang_season.contains(&("2nd".to_string(), "B-SEASON".to_string())));
3648
+ assert!(bang_season.contains(&("Season".to_string(), "B-SEASON".to_string())));
3649
  assert!(bang_season.contains(&("01".to_string(), "B-EPISODE".to_string())));
3650
  assert!(!bang_season.contains(&("01".to_string(), "B-SEASON".to_string())));
3651
 
3652
  let basket =
3653
  labels_for("[Nekomoe kissaten&VCB-Studio] Fruits Basket 1st Season [24][1080p][x264_aac][sc]");
3654
  assert!(basket.contains(&("Fruits".to_string(), "B-TITLE".to_string())));
3655
+ assert!(basket.contains(&("1st".to_string(), "B-SEASON".to_string())));
3656
+ assert!(basket.contains(&("Season".to_string(), "B-SEASON".to_string())));
3657
  assert!(basket.contains(&("24".to_string(), "B-EPISODE".to_string())));
3658
  assert!(!basket.contains(&("24".to_string(), "B-SEASON".to_string())));
3659
 
 
4098
  let bilingual = labels_for("辉夜大小姐想让我告白~天才们的恋爱头脑战~.S2-01.中日双语.云光字幕组.[1080p]");
4099
  assert!(bilingual.contains(&("中日".to_string(), "B-SOURCE".to_string())));
4100
  assert!(!bilingual.contains(&("中日".to_string(), "B-TITLE".to_string())));
4101
+
4102
+ let nekomoe_lang = labels_for("[Nekomoe kissaten][UniteUp!][05][720p][JPTC]");
4103
+ assert!(nekomoe_lang.contains(&("JPTC".to_string(), "B-SOURCE".to_string())));
4104
+ assert!(!nekomoe_lang.contains(&("JPTC".to_string(), "B-TITLE".to_string())));
4105
+
4106
+ let hayate =
4107
+ labels_for("[漏勺rip][Hayate_the_combat_butler_2nd_Season][23][BDrip X264 AAC 720P]");
4108
+ assert!(hayate.contains(&("Hayate".to_string(), "B-TITLE".to_string())));
4109
+ assert!(hayate.contains(&("2nd".to_string(), "B-SEASON".to_string())));
4110
+ assert!(hayate.contains(&("Season".to_string(), "B-SEASON".to_string())));
4111
+ assert!(hayate.contains(&("23".to_string(), "B-EPISODE".to_string())));
4112
+
4113
+ let jade = labels_for("[GM-Team][国漫][诛仙 第2季][Jade Dynasty Ⅱ][2024][12][AVC][GB][1080P]");
4114
+ assert!(jade.contains(&("Jade".to_string(), "B-TITLE".to_string())));
4115
+ assert!(jade.contains(&("Dynasty".to_string(), "B-TITLE".to_string())));
4116
+ assert!(jade.contains(&("Ⅱ".to_string(), "B-SEASON".to_string())));
4117
+ assert!(jade.contains(&("12".to_string(), "B-EPISODE".to_string())));
4118
+
4119
+ let yu_no = labels_for(
4120
+ "[JYFanSub][Kono_Yo_no_Hate_de_Koi_wo_Utau_Shoujo_YU-NO][23][BIG5][720P][AVC]",
4121
+ );
4122
+ assert!(yu_no.contains(&("NO".to_string(), "B-TITLE".to_string())));
4123
+ assert!(yu_no.contains(&("23".to_string(), "B-EPISODE".to_string())));
4124
+
4125
+ let fox = labels_for(
4126
+ "[GM-Team][国漫][狐妖小红娘 尾生篇][Fox Spirit Matchmaker Ⅷ][2019][05][AVC][GB][1080P]",
4127
+ );
4128
+ assert!(fox.contains(&("Fox".to_string(), "B-TITLE".to_string())));
4129
+ assert!(fox.contains(&("Ⅷ".to_string(), "B-SEASON".to_string())));
4130
+
4131
+ let kage = labels_for("[LKSUB][Kage no Jitsuryokusha ni Naritakute! 2nd Season][03][GB][720P]");
4132
+ assert!(kage.contains(&("2nd".to_string(), "B-SEASON".to_string())));
4133
+ assert!(kage.contains(&(" ".to_string(), "B-SEASON".to_string())));
4134
+ assert!(kage.contains(&("Season".to_string(), "B-SEASON".to_string())));
4135
+
4136
+ let tiger = labels_for("[虎面人W][Tiger Mask W][01][简日][720p]");
4137
+ assert!(tiger.contains(&("Tiger".to_string(), "B-TITLE".to_string())));
4138
+ assert!(tiger.contains(&("W".to_string(), "B-TITLE".to_string())));
4139
+ assert!(tiger.contains(&("01".to_string(), "B-EPISODE".to_string())));
4140
+
4141
+ let date_live_special =
4142
+ labels_for("[ANK-Raws] デート・ア・ライブⅡ CM01 (BDrip 1920x1080 HEVC-YUV420P10 FLAC)");
4143
+ assert!(date_live_special.contains(&("Ⅱ".to_string(), "B-SEASON".to_string())));
4144
+ assert!(date_live_special.contains(&("CM01".to_string(), "B-SPECIAL".to_string())));
4145
+
4146
+ let ep_only = dmhy_record("Ep.25", "tpl_test", &suggested_roles("TEXT SEP EPISODE")).unwrap();
4147
+ assert!(audit_warnings(&ep_only).contains(&"no_title".to_string()));
4148
  }
4149
  }