ModerRAS committed on
Commit
494b24c
·
1 Parent(s): 79b4d87

Add low-frequency DMHY audit gate

Browse files
tools/rust_dmhy_template_apply/README.md CHANGED
@@ -31,6 +31,18 @@ cargo run --release --manifest-path tools\rust_dmhy_template_apply\Cargo.toml --
31
  --manifest-output reports\dmhy_weak.template_generated.top5000.rust.manifest.json
32
  ```
33
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  Optional controls:
35
 
36
  ```powershell
@@ -47,3 +59,10 @@ The output is intended to match `tools/apply_dmhy_template_recipes.py` at the
47
  record schema level: `filename`, `tokens`, `labels`, `template_id`, `template`,
48
  plus optional `source_filename`, `path_trimmed`, and
49
  `dropped_title_candidate_positions`.
 
 
 
 
 
 
 
 
31
  --manifest-output reports\dmhy_weak.template_generated.top5000.rust.manifest.json
32
  ```
33
 
34
+ Audit low-frequency recipe output from the repository root:
35
+
36
+ ```powershell
37
+ cargo run --release --manifest-path tools\rust_dmhy_template_apply\Cargo.toml -- `
38
+ --audit-low-frequency `
39
+ --input datasets\AnimeName\dmhy_list.jsonl `
40
+ --recipes reports\dmhy_template_recipes.full_top5000.seed.jsonl `
41
+ --audit-output reports\dmhy_low_frequency_audit.rust.jsonl `
42
+ --audit-max-count 50 `
43
+ --threads 24
44
+ ```
45
+
46
  Optional controls:
47
 
48
  ```powershell
 
59
  record schema level: `filename`, `tokens`, `labels`, `template_id`, `template`,
60
  plus optional `source_filename`, `path_trimmed`, and
61
  `dropped_title_candidate_positions`.
62
+
63
+ For low-frequency templates (`count <= --audit-max-count`, default `50`), apply
64
+ uses a conservative gate: records with `no_title`, `multiple_title_spans`,
65
+ `path_retained`, or `hash_labeled` audit warnings are skipped from the training
66
+ JSONL and left in the audit/review files. This keeps common templates stable
67
+ while preventing rare ambiguous path/title cases from polluting the generated
68
+ dataset.
tools/rust_dmhy_template_apply/src/main.rs CHANGED
@@ -17,6 +17,8 @@ use std::sync::atomic::{AtomicUsize, Ordering};
17
  struct Args {
18
  #[arg(long)]
19
  cluster: bool,
 
 
20
  #[arg(long, default_value = "datasets/AnimeName/dmhy_list.jsonl")]
21
  input: PathBuf,
22
  #[arg(long, default_value = "reports/dmhy_template_recipes.seed.jsonl")]
@@ -47,6 +49,10 @@ struct Args {
47
  recipes_output: PathBuf,
48
  #[arg(long, default_value = "reports/dmhy_template_review.rust.jsonl")]
49
  review_output: PathBuf,
 
 
 
 
50
  #[arg(long)]
51
  limit: Option<usize>,
52
  #[arg(long)]
@@ -115,6 +121,7 @@ struct Stats {
115
  skipped_no_recipe: usize,
116
  skipped_sample_cap: usize,
117
  skipped_role_mismatch: usize,
 
118
  written: usize,
119
  }
120
 
@@ -164,7 +171,7 @@ static CJK_SEASON_TOKEN_RE: Lazy<Regex> =
164
  Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
165
  static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
166
  static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
167
- Regex::new(r"(?i)^(?:NCOP|NCED|OP|ED|PV|CM|SP|OVA|OAD|IV|Menu|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?$").unwrap()
168
  });
169
  static VOLUME_RE: Lazy<Regex> =
170
  Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
@@ -187,6 +194,8 @@ static PATH_SEGMENT_SEASON_RE: Lazy<Regex> = Lazy::new(|| {
187
  });
188
  static SEASON_WORD_NUMBER_RE: Lazy<Regex> =
189
  Lazy::new(|| Regex::new(r"(?i)(?:season|saison)\s*0?(\d{1,2})").unwrap());
 
 
190
  static S_NUMBER_SEGMENT_RE: Lazy<Regex> =
191
  Lazy::new(|| Regex::new(r"(?i)(?:^|[^\p{L}\p{N}])s0?(\d{1,2})(?:$|[^\p{L}\p{N}])").unwrap());
192
  static SXE_SEASON_RE: Lazy<Regex> = Lazy::new(|| {
@@ -225,6 +234,9 @@ fn main() -> Result<()> {
225
  if args.cluster {
226
  return run_cluster(&args);
227
  }
 
 
 
228
  if args.expand != "all" && args.expand != "sample" {
229
  bail!("--expand must be all or sample");
230
  }
@@ -293,6 +305,9 @@ fn main() -> Result<()> {
293
  "no_recipe" => stats.skipped_no_recipe += 1,
294
  "sample_cap" => stats.skipped_sample_cap += 1,
295
  "role_mismatch" => stats.skipped_role_mismatch += 1,
 
 
 
296
  _ => {}
297
  }
298
  }
@@ -312,6 +327,13 @@ fn main() -> Result<()> {
312
  "selected_templates": recipes.len(),
313
  "confidence": args.confidence,
314
  "min_count": args.min_count,
 
 
 
 
 
 
 
315
  "expand": args.expand,
316
  "sample_per_template": if args.expand == "sample" { Some(args.sample_per_template) } else { None },
317
  "stats": stats,
@@ -603,6 +625,156 @@ fn write_jsonl_values(path: &PathBuf, rows: &[Value]) -> Result<()> {
603
  Ok(())
604
  }
605
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
606
  fn process_filename(
607
  original: &str,
608
  args: &Args,
@@ -654,6 +826,13 @@ fn process_filename(
654
  }
655
  }
656
  };
 
 
 
 
 
 
 
657
  if trimmed_parent {
658
  record.source_filename = Some(original.to_string());
659
  record.path_trimmed = Some(true);
@@ -668,6 +847,15 @@ fn process_filename(
668
  }
669
  }
670
 
 
 
 
 
 
 
 
 
 
671
  fn tokenize(value: &str) -> Vec<String> {
672
  let mut output = Vec::new();
673
  let mut index = 0;
@@ -1007,7 +1195,14 @@ fn suggested_roles(template: &str) -> Vec<String> {
1007
  roles[*index] = "TITLE".to_string();
1008
  }
1009
  } else if bracket_text.len() == 1 {
1010
- roles[bracket_text[0]] = if text.is_empty() { "TITLE" } else { "GROUP" }.to_string();
 
 
 
 
 
 
 
1011
  }
1012
  for index in text {
1013
  roles[index] = "TITLE".to_string();
@@ -1052,6 +1247,9 @@ fn training_filename_for(original: &str) -> (String, bool) {
1052
  .collect();
1053
  if parts.len() >= 2 && filename_has_title(parts[parts.len() - 1]) {
1054
  if parts.len() >= 3 && path_segment_has_season(parts[parts.len() - 2]) {
 
 
 
1055
  let parent_seasons = path_segment_seasons(parts[parts.len() - 2]);
1056
  let leaf_seasons = path_segment_seasons(parts[parts.len() - 1]);
1057
  if parent_seasons
@@ -1070,6 +1268,11 @@ fn training_filename_for(original: &str) -> (String, bool) {
1070
  }
1071
  }
1072
 
 
 
 
 
 
1073
  fn path_segment_has_season(value: &str) -> bool {
1074
  PATH_SEGMENT_SEASON_RE.is_match(value)
1075
  }
@@ -1150,7 +1353,7 @@ fn role_label(role: &str) -> String {
1150
  "SEASON" => Some("SEASON"),
1151
  "SPECIAL" | "VOLUME" => Some("SPECIAL"),
1152
  "RESOLUTION" => Some("RESOLUTION"),
1153
- "SOURCE" | "HASH" => Some("SOURCE"),
1154
  _ => None,
1155
  };
1156
  entity.map_or_else(|| "O".to_string(), |entity| format!("B-{entity}"))
@@ -1311,7 +1514,10 @@ fn label_for_refined_piece(piece: &str, role: &str, token_class: &str) -> String
1311
  if atom_class == "RESOLUTION" {
1312
  return "B-RESOLUTION".to_string();
1313
  }
1314
- if matches!(atom_class.as_str(), "MEDIA" | "LANG" | "HASH") {
 
 
 
1315
  return "B-SOURCE".to_string();
1316
  }
1317
  if matches!(atom_class.as_str(), "SPECIAL" | "VOLUME") {
@@ -1489,6 +1695,19 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
1489
  "vol" | "volume"
1490
  )
1491
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
1492
  output[index - 2] = "SPECIAL".to_string();
1493
  output[index] = "SPECIAL".to_string();
1494
  continue;
@@ -1548,6 +1767,27 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
1548
  output[index] = "SPECIAL".to_string();
1549
  continue;
1550
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1551
  if roles[index] == "TITLE"
1552
  && matches!(text.to_ascii_lowercase().as_str(), "season" | "saison")
1553
  && index + 2 < roles.len()
@@ -1616,19 +1856,26 @@ fn title_candidates(groups: &[Group], roles: &[String]) -> Vec<(usize, usize)> {
1616
  index += 1;
1617
  continue;
1618
  }
1619
- if groups[index].class_name == "BRACKET_TEXT" {
1620
- candidates.push((index, index + 1));
1621
- index += 1;
1622
- continue;
1623
- }
1624
  let start = index;
1625
  index += 1;
1626
- while index + 1 < roles.len()
1627
- && roles[index] == "O"
1628
- && groups[index].class_name == "SEP"
1629
- && roles[index + 1] == "TITLE"
1630
- {
1631
- index += 2;
 
 
 
 
 
 
 
 
 
 
 
 
1632
  }
1633
  candidates.push((start, index));
1634
  }
@@ -1838,10 +2085,15 @@ fn project_refined_tokens(
1838
 
1839
  fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
1840
  let joiners = [
1841
- " ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!",
 
 
1842
  ];
 
1843
  let entity_joiners = [
1844
- " ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "&", "&",
 
 
1845
  ];
1846
  let mut output = labels.to_vec();
1847
  for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
@@ -1869,6 +2121,12 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
1869
  output[index] = left_label.clone();
1870
  }
1871
  }
 
 
 
 
 
 
1872
  }
1873
  output
1874
  }
@@ -1962,6 +2220,32 @@ mod tests {
1962
  assert!(pso2.contains(&("Episode".to_string(), "B-TITLE".to_string())));
1963
  assert!(pso2.contains(&("Oracle".to_string(), "B-TITLE".to_string())));
1964
  assert!(pso2.contains(&("01".to_string(), "B-EPISODE".to_string())));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1965
 
1966
  let conan_time = labels_for("【蓝色狂想】名侦探柯南TV版第036集-周一晚上7点半杀人事件");
1967
  assert!(conan_time.contains(&("036".to_string(), "B-EPISODE".to_string())));
@@ -1987,6 +2271,13 @@ mod tests {
1987
  trimmed,
1988
  "Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p"
1989
  );
 
 
 
 
 
 
 
1990
  let woody = labels_for(&trimmed);
1991
  assert!(woody.contains(&("4".to_string(), "B-SEASON".to_string())));
1992
  assert!(woody.contains(&("E".to_string(), "O".to_string())));
 
17
  struct Args {
18
  #[arg(long)]
19
  cluster: bool,
20
+ #[arg(long)]
21
+ audit_low_frequency: bool,
22
  #[arg(long, default_value = "datasets/AnimeName/dmhy_list.jsonl")]
23
  input: PathBuf,
24
  #[arg(long, default_value = "reports/dmhy_template_recipes.seed.jsonl")]
 
49
  recipes_output: PathBuf,
50
  #[arg(long, default_value = "reports/dmhy_template_review.rust.jsonl")]
51
  review_output: PathBuf,
52
+ #[arg(long, default_value = "reports/dmhy_low_frequency_audit.rust.jsonl")]
53
+ audit_output: PathBuf,
54
+ #[arg(long, default_value_t = 50)]
55
+ audit_max_count: u64,
56
  #[arg(long)]
57
  limit: Option<usize>,
58
  #[arg(long)]
 
121
  skipped_no_recipe: usize,
122
  skipped_sample_cap: usize,
123
  skipped_role_mismatch: usize,
124
+ skipped_low_frequency_audit_warning: usize,
125
  written: usize,
126
  }
127
 
 
171
  Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
172
  static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
173
  static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
174
+ Regex::new(r"(?i)^(?:NCOP|NCED|OP|ED|PV|CM|SP|OVA|OAD|IV|Menu|Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?$").unwrap()
175
  });
176
  static VOLUME_RE: Lazy<Regex> =
177
  Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
 
194
  });
195
  static SEASON_WORD_NUMBER_RE: Lazy<Regex> =
196
  Lazy::new(|| Regex::new(r"(?i)(?:season|saison)\s*0?(\d{1,2})").unwrap());
197
+ static PLAIN_SEASON_SEGMENT_RE: Lazy<Regex> =
198
+ Lazy::new(|| Regex::new(r"(?i)^(?:season|saison)\s*0?\d{1,2}$|^s0?\d{1,2}$").unwrap());
199
  static S_NUMBER_SEGMENT_RE: Lazy<Regex> =
200
  Lazy::new(|| Regex::new(r"(?i)(?:^|[^\p{L}\p{N}])s0?(\d{1,2})(?:$|[^\p{L}\p{N}])").unwrap());
201
  static SXE_SEASON_RE: Lazy<Regex> = Lazy::new(|| {
 
234
  if args.cluster {
235
  return run_cluster(&args);
236
  }
237
+ if args.audit_low_frequency {
238
+ return run_low_frequency_audit(&args);
239
+ }
240
  if args.expand != "all" && args.expand != "sample" {
241
  bail!("--expand must be all or sample");
242
  }
 
305
  "no_recipe" => stats.skipped_no_recipe += 1,
306
  "sample_cap" => stats.skipped_sample_cap += 1,
307
  "role_mismatch" => stats.skipped_role_mismatch += 1,
308
+ "low_frequency_audit_warning" => {
309
+ stats.skipped_low_frequency_audit_warning += 1
310
+ }
311
  _ => {}
312
  }
313
  }
 
327
  "selected_templates": recipes.len(),
328
  "confidence": args.confidence,
329
  "min_count": args.min_count,
330
+ "low_frequency_audit_max_count": args.audit_max_count,
331
+ "low_frequency_blocking_warnings": [
332
+ "hash_labeled",
333
+ "multiple_title_spans",
334
+ "no_title",
335
+ "path_retained"
336
+ ],
337
  "expand": args.expand,
338
  "sample_per_template": if args.expand == "sample" { Some(args.sample_per_template) } else { None },
339
  "stats": stats,
 
625
  Ok(())
626
  }
627
 
628
+ fn run_low_frequency_audit(args: &Args) -> Result<()> {
629
+ let recipes = load_recipes(args)?;
630
+ let inputs = load_input(&args.input, args.limit)?;
631
+ let low_template_total = recipes
632
+ .values()
633
+ .filter(|recipe| recipe.count.unwrap_or(0) <= args.audit_max_count)
634
+ .count();
635
+ let mut seen_templates = HashSet::new();
636
+ let mut rows = Vec::new();
637
+
638
+ for original in inputs {
639
+ if !args.keep_encoding_noise
640
+ && (has_encoding_noise(&original)
641
+ || has_non_anime_noise(&original)
642
+ || has_abstract_path_noise(&original))
643
+ {
644
+ continue;
645
+ }
646
+ let (training_filename, trimmed_parent) = training_filename_for(&original);
647
+ let (key, _tokens, _classes, groups) = template_key_for_filename(&training_filename);
648
+ let Some(recipe) = recipes.get(&key) else {
649
+ continue;
650
+ };
651
+ let count = recipe.count.unwrap_or(0);
652
+ if count > args.audit_max_count || !seen_templates.insert(recipe.template_id.clone()) {
653
+ continue;
654
+ }
655
+ if recipe.roles.len() != groups.len() {
656
+ continue;
657
+ }
658
+ let Some(mut record) = dmhy_record(&training_filename, &recipe.template_id, &recipe.roles)
659
+ else {
660
+ continue;
661
+ };
662
+ if trimmed_parent {
663
+ record.source_filename = Some(original.clone());
664
+ record.path_trimmed = Some(true);
665
+ }
666
+ rows.push(json!({
667
+ "template_id": recipe.template_id,
668
+ "count": count,
669
+ "template": recipe.template,
670
+ "filename": record.filename,
671
+ "source_filename": record.source_filename,
672
+ "path_trimmed": record.path_trimmed.unwrap_or(false),
673
+ "spans": entity_spans(&record.tokens, &record.labels),
674
+ "warnings": audit_warnings(&record),
675
+ "tokens": record.tokens,
676
+ "labels": record.labels,
677
+ }));
678
+ if seen_templates.len() >= low_template_total {
679
+ break;
680
+ }
681
+ }
682
+
683
+ rows.sort_by(|a, b| {
684
+ let count_a = a.get("count").and_then(Value::as_u64).unwrap_or(0);
685
+ let count_b = b.get("count").and_then(Value::as_u64).unwrap_or(0);
686
+ let id_a = a.get("template_id").and_then(Value::as_str).unwrap_or("");
687
+ let id_b = b.get("template_id").and_then(Value::as_str).unwrap_or("");
688
+ count_a.cmp(&count_b).then_with(|| id_a.cmp(id_b))
689
+ });
690
+ write_jsonl_values(&args.audit_output, &rows)?;
691
+ let warning_counts = warning_counts(&rows);
692
+ let manifest = json!({
693
+ "generated_at": Utc::now().to_rfc3339(),
694
+ "input": args.input.to_string_lossy(),
695
+ "recipes": args.recipes.to_string_lossy(),
696
+ "audit_output": args.audit_output.to_string_lossy(),
697
+ "audit_max_count": args.audit_max_count,
698
+ "low_template_total": low_template_total,
699
+ "audited_templates": rows.len(),
700
+ "warning_counts": warning_counts,
701
+ "implementation": "rust_dmhy_low_frequency_audit"
702
+ });
703
+ println!("{}", serde_json::to_string_pretty(&manifest)?);
704
+ Ok(())
705
+ }
706
+
707
+ fn entity_spans(tokens: &[String], labels: &[String]) -> Vec<Value> {
708
+ let mut spans = Vec::new();
709
+ let mut current_label: Option<String> = None;
710
+ let mut current_text = String::new();
711
+ for (token, label) in tokens.iter().zip(labels.iter()) {
712
+ let entity = label
713
+ .strip_prefix("B-")
714
+ .or_else(|| label.strip_prefix("I-"))
715
+ .unwrap_or("O");
716
+ if current_label.as_deref() == Some(entity) {
717
+ current_text.push_str(token);
718
+ continue;
719
+ }
720
+ if let Some(label) = current_label.take() {
721
+ if label != "O" {
722
+ spans.push(json!({ "label": label, "text": current_text }));
723
+ }
724
+ }
725
+ current_label = Some(entity.to_string());
726
+ current_text = token.clone();
727
+ }
728
+ if let Some(label) = current_label {
729
+ if label != "O" {
730
+ spans.push(json!({ "label": label, "text": current_text }));
731
+ }
732
+ }
733
+ spans
734
+ }
735
+
736
+ fn audit_warnings(record: &Record) -> Vec<String> {
737
+ let mut warnings = Vec::new();
738
+ let title_spans = entity_spans(&record.tokens, &record.labels)
739
+ .into_iter()
740
+ .filter(|span| span.get("label").and_then(Value::as_str) == Some("TITLE"))
741
+ .count();
742
+ if title_spans == 0 {
743
+ warnings.push("no_title".to_string());
744
+ } else if title_spans > 1 {
745
+ warnings.push("multiple_title_spans".to_string());
746
+ }
747
+ if !record.labels.iter().any(|label| label.ends_with("EPISODE")) {
748
+ warnings.push("no_episode".to_string());
749
+ }
750
+ if record.filename.contains('/') || record.filename.contains('\\') {
751
+ warnings.push("path_retained".to_string());
752
+ }
753
+ for (index, token) in record.tokens.iter().enumerate() {
754
+ if HASH_RE.is_match(token) && record.labels.get(index).is_some_and(|label| label != "O") {
755
+ warnings.push("hash_labeled".to_string());
756
+ break;
757
+ }
758
+ }
759
+ warnings.sort();
760
+ warnings.dedup();
761
+ warnings
762
+ }
763
+
764
+ fn warning_counts(rows: &[Value]) -> HashMap<String, usize> {
765
+ let mut counts = HashMap::new();
766
+ for row in rows {
767
+ if let Some(warnings) = row.get("warnings").and_then(Value::as_array) {
768
+ for warning in warnings {
769
+ if let Some(warning) = warning.as_str() {
770
+ *counts.entry(warning.to_string()).or_default() += 1;
771
+ }
772
+ }
773
+ }
774
+ }
775
+ counts
776
+ }
777
+
778
  fn process_filename(
779
  original: &str,
780
  args: &Args,
 
826
  }
827
  }
828
  };
829
+ if recipe.count.unwrap_or(0) <= args.audit_max_count && has_blocking_low_frequency_warning(&record)
830
+ {
831
+ return Processed::Skipped {
832
+ reason: "low_frequency_audit_warning",
833
+ trimmed_parent,
834
+ };
835
+ }
836
  if trimmed_parent {
837
  record.source_filename = Some(original.to_string());
838
  record.path_trimmed = Some(true);
 
847
  }
848
  }
849
 
850
+ fn has_blocking_low_frequency_warning(record: &Record) -> bool {
851
+ audit_warnings(record).iter().any(|warning| {
852
+ matches!(
853
+ warning.as_str(),
854
+ "hash_labeled" | "multiple_title_spans" | "no_title" | "path_retained"
855
+ )
856
+ })
857
+ }
858
+
859
  fn tokenize(value: &str) -> Vec<String> {
860
  let mut output = Vec::new();
861
  let mut index = 0;
 
1195
  roles[*index] = "TITLE".to_string();
1196
  }
1197
  } else if bracket_text.len() == 1 {
1198
+ roles[bracket_text[0]] = if text.is_empty() {
1199
+ "TITLE"
1200
+ } else if bracket_text[0] == *start {
1201
+ "GROUP"
1202
+ } else {
1203
+ "TITLE"
1204
+ }
1205
+ .to_string();
1206
  }
1207
  for index in text {
1208
  roles[index] = "TITLE".to_string();
 
1247
  .collect();
1248
  if parts.len() >= 2 && filename_has_title(parts[parts.len() - 1]) {
1249
  if parts.len() >= 3 && path_segment_has_season(parts[parts.len() - 2]) {
1250
+ if !path_segment_is_plain_season(parts[parts.len() - 2]) {
1251
+ return (parts[parts.len() - 1].to_string(), true);
1252
+ }
1253
  let parent_seasons = path_segment_seasons(parts[parts.len() - 2]);
1254
  let leaf_seasons = path_segment_seasons(parts[parts.len() - 1]);
1255
  if parent_seasons
 
1268
  }
1269
  }
1270
 
1271
+ fn path_segment_is_plain_season(segment: &str) -> bool {
1272
+ let cleaned = strip_wrapper(segment).trim().to_string();
1273
+ PLAIN_SEASON_SEGMENT_RE.is_match(&cleaned)
1274
+ }
1275
+
1276
  fn path_segment_has_season(value: &str) -> bool {
1277
  PATH_SEGMENT_SEASON_RE.is_match(value)
1278
  }
 
1353
  "SEASON" => Some("SEASON"),
1354
  "SPECIAL" | "VOLUME" => Some("SPECIAL"),
1355
  "RESOLUTION" => Some("RESOLUTION"),
1356
+ "SOURCE" => Some("SOURCE"),
1357
  _ => None,
1358
  };
1359
  entity.map_or_else(|| "O".to_string(), |entity| format!("B-{entity}"))
 
1514
  if atom_class == "RESOLUTION" {
1515
  return "B-RESOLUTION".to_string();
1516
  }
1517
+ if atom_class == "HASH" {
1518
+ return "O".to_string();
1519
+ }
1520
+ if matches!(atom_class.as_str(), "MEDIA" | "LANG") {
1521
  return "B-SOURCE".to_string();
1522
  }
1523
  if matches!(atom_class.as_str(), "SPECIAL" | "VOLUME") {
 
1695
  "vol" | "volume"
1696
  )
1697
  {
1698
+ let next_text_before_episode = (index + 1..roles.len())
1699
+ .find(|&cursor| groups[cursor].class_name != "SEP")
1700
+ .is_some_and(|cursor| {
1701
+ groups[cursor].class_name == "TEXT"
1702
+ && roles[cursor + 1..]
1703
+ .iter()
1704
+ .any(|role| role.starts_with("EPISODE"))
1705
+ });
1706
+ if next_text_before_episode {
1707
+ output[index - 2] = "TITLE".to_string();
1708
+ output[index] = "TITLE".to_string();
1709
+ continue;
1710
+ }
1711
  output[index - 2] = "SPECIAL".to_string();
1712
  output[index] = "SPECIAL".to_string();
1713
  continue;
 
1767
  output[index] = "SPECIAL".to_string();
1768
  continue;
1769
  }
1770
+ if roles[index] == "TITLE" && matches!(text.as_str(), "第" | "話" | "话" | "回" | "集")
1771
+ {
1772
+ output[index] = "O".to_string();
1773
+ continue;
1774
+ }
1775
+ if output[index] == "O"
1776
+ && groups[index].class_name == "TEXT"
1777
+ && roles[index + 1..].iter().any(|role| role.starts_with("EPISODE"))
1778
+ && text.chars().any(|ch| ch.is_alphabetic())
1779
+ && !ep_markers.contains(&text.as_str())
1780
+ {
1781
+ if let Some(last_title) = output[..index].iter().rposition(|role| role == "TITLE") {
1782
+ let episode_since_title = output[last_title + 1..index]
1783
+ .iter()
1784
+ .any(|role| role.starts_with("EPISODE"));
1785
+ if !episode_since_title {
1786
+ output[index] = "TITLE".to_string();
1787
+ continue;
1788
+ }
1789
+ }
1790
+ }
1791
  if roles[index] == "TITLE"
1792
  && matches!(text.to_ascii_lowercase().as_str(), "season" | "saison")
1793
  && index + 2 < roles.len()
 
1856
  index += 1;
1857
  continue;
1858
  }
 
 
 
 
 
1859
  let start = index;
1860
  index += 1;
1861
+ loop {
1862
+ if index < roles.len()
1863
+ && roles[index] == "TITLE"
1864
+ && !(groups[index - 1].class_name == "BRACKET_TEXT"
1865
+ && groups[index].class_name == "BRACKET_TEXT")
1866
+ {
1867
+ index += 1;
1868
+ continue;
1869
+ }
1870
+ if index + 1 < roles.len()
1871
+ && roles[index] == "O"
1872
+ && groups[index].class_name == "SEP"
1873
+ && roles[index + 1] == "TITLE"
1874
+ {
1875
+ index += 2;
1876
+ continue;
1877
+ }
1878
+ break;
1879
  }
1880
  candidates.push((start, index));
1881
  }
 
2085
 
2086
  fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
2087
  let joiners = [
2088
+ " ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?",
2089
+ "?", ";", ";", ",", ",", "~", "~", "-", "(", ")", "(", ")", "[", "]", "【",
2090
+ "】", "「", "」", "「", "」", "☆", "@",
2091
  ];
2092
+ let title_terminal_punctuation = ["!", "!", "?", "?"];
2093
  let entity_joiners = [
2094
+ " ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?",
2095
+ "?", ";", ";", ",", ",", "~", "~", "-", "(", ")", "(", ")", "[", "]", "【",
2096
+ "】", "「", "」", "「", "」", "☆", "@", "&", "&",
2097
  ];
2098
  let mut output = labels.to_vec();
2099
  for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
 
2121
  output[index] = left_label.clone();
2122
  }
2123
  }
2124
+ if title_terminal_punctuation.contains(&token.as_str()) && index > 0 {
2125
+ let left_label = &output[index - 1];
2126
+ if left_label == "B-TITLE" {
2127
+ output[index] = "B-TITLE".to_string();
2128
+ }
2129
+ }
2130
  }
2131
  output
2132
  }
 
2220
  assert!(pso2.contains(&("Episode".to_string(), "B-TITLE".to_string())));
2221
  assert!(pso2.contains(&("Oracle".to_string(), "B-TITLE".to_string())));
2222
  assert!(pso2.contains(&("01".to_string(), "B-EPISODE".to_string())));
2223
+ let aikatsu = labels_for("Aikatsu Friends! - S2E01 (BD 1920x1080 x264 FLAC)");
2224
+ assert!(aikatsu.contains(&("!".to_string(), "B-TITLE".to_string())));
2225
+ let intro = labels_for("[VCB-Studio] LoveLive! µ's Live Collection [01][intro][1080p]");
2226
+ assert!(intro.contains(&("intro".to_string(), "B-SPECIAL".to_string())));
2227
+ let hash = labels_for("[Group][Title][01][1080p][00270AC8]");
2228
+ assert!(hash.contains(&("00270AC8".to_string(), "O".to_string())));
2229
+ let yamato = labels_for("[1995.01] YAMATO2520 Vol.1 明日への希望-0001");
2230
+ assert!(yamato.contains(&("YAMATO2520".to_string(), "B-TITLE".to_string())));
2231
+ assert!(yamato.contains(&("明日への希望".to_string(), "B-TITLE".to_string())));
2232
+ let ubw = labels_for("Fate/stay night [Unlimited Blade Works] #00 「プロローグ」");
2233
+ assert!(ubw.contains(&("Unlimited".to_string(), "B-TITLE".to_string())));
2234
+ assert!(!ubw.contains(&("Unlimited".to_string(), "B-GROUP".to_string())));
2235
+ let alias_title = labels_for("[Koten_Gars] Tegami Bachi; Letter Bee - 01 [1080p]");
2236
+ assert!(alias_title.contains(&(";".to_string(), "B-TITLE".to_string())));
2237
+ let comma_title =
2238
+ labels_for("[Studio GreenTea] Kamiina Botan, Yoeru Sugata wa Yuri no Hana [01]");
2239
+ assert!(comma_title.contains(&(",".to_string(), "B-TITLE".to_string())));
2240
+ let happy_lesson = labels_for("【DVD】 HAPPY☆LESSON THE TV 第01話");
2241
+ assert!(happy_lesson.contains(&("☆".to_string(), "B-TITLE".to_string())));
2242
+ let idolmaster = labels_for("[CASO&SumiSora][THE_IDOLM@STER_CINDERELLA_GIRLS][07.5_SP]");
2243
+ assert!(idolmaster.contains(&("@".to_string(), "B-TITLE".to_string())));
2244
+ let soul_taker = labels_for("[AI-Raws] THE SOUL TAKER~魂狩~ #01 (HEVC 1312x720)");
2245
+ assert!(soul_taker.contains(&("~".to_string(), "B-TITLE".to_string())));
2246
+ let mayoi = labels_for("[Snow-Raws] 迷家[マヨイガ] 第01話");
2247
+ assert!(mayoi.contains(&("迷家".to_string(), "B-TITLE".to_string())));
2248
+ assert!(mayoi.contains(&("マヨイガ".to_string(), "B-TITLE".to_string())));
2249
 
2250
  let conan_time = labels_for("【蓝色狂想】名侦探柯南TV版第036集-周一晚上7点半杀人事件");
2251
  assert!(conan_time.contains(&("036".to_string(), "B-EPISODE".to_string())));
 
2271
  trimmed,
2272
  "Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p"
2273
  );
2274
+ let pokemon = "Pokémon Season 2 - Orange League [Ep. 83-118]/Pokemon - 084 - OL002 - A Scare in the Air [DVD][PM-Dragon-x264-AC3][00270AC8]";
2275
+ let (trimmed_pokemon, pokemon_was_trimmed) = training_filename_for(pokemon);
2276
+ assert!(pokemon_was_trimmed);
2277
+ assert_eq!(
2278
+ trimmed_pokemon,
2279
+ "Pokemon - 084 - OL002 - A Scare in the Air [DVD][PM-Dragon-x264-AC3][00270AC8]"
2280
+ );
2281
  let woody = labels_for(&trimmed);
2282
  assert!(woody.contains(&("4".to_string(), "B-SEASON".to_string())));
2283
  assert!(woody.contains(&("E".to_string(), "O".to_string())));