ModerRAS committed on
Commit
736c211
·
1 Parent(s): 7f87a1d

Add Rust DMHY template clustering

Browse files
tools/rust_dmhy_template_apply/README.md CHANGED
@@ -2,7 +2,26 @@
2
 
3
  Multi-core Rust implementation of the DMHY template recipe apply stage.
4
 
5
- Run from the repository root:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  ```powershell
8
  cargo run --release --manifest-path tools\rust_dmhy_template_apply\Cargo.toml -- `
 
2
 
3
  Multi-core Rust implementation of the DMHY template recipe apply stage.
4
 
5
+ Build template recipes from the repository root:
6
+
7
+ ```powershell
8
+ cargo run --release --manifest-path tools\rust_dmhy_template_apply\Cargo.toml -- `
9
+ --cluster `
10
+ --input datasets\AnimeName\dmhy_list.jsonl `
11
+ --summary-output reports\dmhy_template_clusters.full_top5000.summary.json `
12
+ --samples-output reports\dmhy_template_clusters.full_top5000.samples.jsonl `
13
+ --clusters-output reports\dmhy_template_clusters.full_top5000.jsonl `
14
+ --recipes-output reports\dmhy_template_recipes.full_top5000.seed.jsonl `
15
+ --review-output reports\dmhy_template_review.full_top5000.jsonl `
16
+ --top 5000 `
17
+ --recipe-top 5000 `
18
+ --review-top 5000 `
19
+ --min-count 2 `
20
+ --recipe-min-count 10 `
21
+ --threads 24
22
+ ```
23
+
24
+ Apply template recipes from the repository root:
25
 
26
  ```powershell
27
  cargo run --release --manifest-path tools\rust_dmhy_template_apply\Cargo.toml -- `
tools/rust_dmhy_template_apply/src/main.rs CHANGED
@@ -15,6 +15,8 @@ use std::sync::atomic::{AtomicUsize, Ordering};
15
  #[derive(Parser, Debug)]
16
  #[command(about = "Apply DMHY template recipes with a multi-core Rust pipeline")]
17
  struct Args {
 
 
18
  #[arg(long, default_value = "datasets/AnimeName/dmhy_list.jsonl")]
19
  input: PathBuf,
20
  #[arg(long, default_value = "reports/dmhy_template_recipes.seed.jsonl")]
@@ -29,12 +31,38 @@ struct Args {
29
  default_value = "reports/dmhy_weak.template_generated.rust.manifest.json"
30
  )]
31
  manifest_output: PathBuf,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  #[arg(long)]
33
  limit: Option<usize>,
34
  #[arg(long)]
35
  limit_templates: Option<usize>,
36
  #[arg(long, default_value_t = 1)]
37
  min_count: u64,
 
 
 
 
 
 
 
 
 
 
38
  #[arg(long, default_value = "high")]
39
  confidence: String,
40
  #[arg(long, default_value = "all")]
@@ -44,6 +72,8 @@ struct Args {
44
  #[arg(long)]
45
  keep_encoding_noise: bool,
46
  #[arg(long)]
 
 
47
  threads: Option<usize>,
48
  }
49
 
@@ -88,6 +118,15 @@ struct Stats {
88
  written: usize,
89
  }
90
 
 
 
 
 
 
 
 
 
 
91
  #[derive(Debug)]
92
  enum Processed {
93
  Written {
@@ -123,7 +162,7 @@ static CJK_SEASON_TOKEN_RE: Lazy<Regex> =
123
  Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
124
  static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
125
  static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
126
- Regex::new(r"(?i)^(?:NCOP|NCED|OP|ED|PV|CM|SP|OVA|OAD|IV|Menu|Preview|Trailer|Teaser)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?$").unwrap()
127
  });
128
  static VOLUME_RE: Lazy<Regex> =
129
  Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
@@ -141,9 +180,21 @@ static SPECIAL_TITLE_PHRASE_RE: Lazy<Regex> = Lazy::new(|| {
141
  });
142
  static YEAR_RANGE_RE: Lazy<Regex> =
143
  Lazy::new(|| Regex::new(r"^\(?\s*(?:19|20)\d{2}\s*[-~]\s*(?:19|20)\d{2}\s*\)?$").unwrap());
 
 
 
 
 
 
 
 
 
 
144
  static TOKEN_REGEXES: Lazy<Vec<Regex>> = Lazy::new(|| {
145
  [
146
  r"^\d{3,4}[xX×]\d{3,4}",
 
 
147
  r"^[\\/]+",
148
  r"^[-_.::+&|]+",
149
  r"^\s+",
@@ -169,6 +220,9 @@ fn main() -> Result<()> {
169
  .build_global()
170
  .context("failed to configure rayon thread pool")?;
171
  }
 
 
 
172
  if args.expand != "all" && args.expand != "sample" {
173
  bail!("--expand must be all or sample");
174
  }
@@ -331,6 +385,222 @@ fn load_input(path: &PathBuf, limit: Option<usize>) -> Result<Vec<String>> {
331
  Ok(values)
332
  }
333
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
  fn process_filename(
335
  original: &str,
336
  args: &Args,
@@ -472,7 +742,7 @@ fn split_inner(inner: &str) -> Vec<String> {
472
  let mut parts = Vec::new();
473
  let mut current = String::new();
474
  for ch in inner.chars() {
475
- if ch.is_whitespace() || "_.,+/&|-".contains(ch) {
476
  if !current.is_empty() {
477
  parts.push(std::mem::take(&mut current));
478
  }
@@ -586,10 +856,22 @@ fn classify_token(token: &str) -> String {
586
  }
587
  if token.starts_with('[') || token.starts_with('(') || token.starts_with('【') {
588
  let inner = strip_wrapper(token);
589
- let whole_class = classify_atom(&inner);
590
  let parts = split_inner(&inner);
 
591
  let inner_class = if whole_class != "TEXT" {
592
- whole_class
 
 
 
 
 
 
 
 
 
 
 
 
593
  } else if parts.is_empty() {
594
  "EMPTY".to_string()
595
  } else {
@@ -677,7 +959,7 @@ fn suggested_roles(template: &str) -> Vec<String> {
677
  "EPISODE_VERSION"
678
  } else if item.contains("EPISODE_RANGE") {
679
  "EPISODE_RANGE"
680
- } else if item.contains("EPISODE") || *item == "SXE" {
681
  "EPISODE"
682
  } else if item.contains("RESOLUTION") {
683
  "RESOLUTION"
@@ -767,12 +1049,39 @@ fn training_filename_for(original: &str) -> (String, bool) {
767
  .filter(|part| !part.is_empty())
768
  .collect();
769
  if parts.len() >= 2 && filename_has_title(parts[parts.len() - 1]) {
770
- (parts[1..].join("/"), true)
 
 
 
 
 
 
 
 
 
 
 
 
 
771
  } else {
772
  (original.to_string(), false)
773
  }
774
  }
775
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
776
  fn has_encoding_noise(value: &str) -> bool {
777
  if value.contains('\u{fffd}') {
778
  return true;
@@ -910,6 +1219,30 @@ fn split_refined_token(token: &str) -> Vec<String> {
910
  let mut merged = Vec::new();
911
  let mut index = 0;
912
  while index < pieces.len() {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
913
  if !is_separator(&pieces[index]) {
914
  let mut end = index;
915
  let mut combined = String::new();
@@ -1066,7 +1399,7 @@ fn is_special_title_phrase(text: &str) -> bool {
1066
  fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]) -> Vec<String> {
1067
  let mut output = roles.to_vec();
1068
  let ep_markers = ["EP", "E", "Episode", "ep", "episode"];
1069
- let roman = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X"];
1070
  if !output.iter().any(|role| role == "TITLE")
1071
  && roles
1072
  .first()
@@ -1086,17 +1419,78 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
1086
  }
1087
  }
1088
  if title_run.len() >= 2 {
 
 
 
 
 
 
 
 
 
 
 
1089
  for index in title_run {
1090
  output[index] = "TITLE".to_string();
1091
  }
1092
  }
1093
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1094
  for index in 0..roles.len() {
1095
  let text = group_text(tokens, &groups[index]);
 
 
 
1096
  if roles[index].starts_with("EPISODE") && YEAR_RANGE_RE.is_match(&text) {
1097
  output[index] = "O".to_string();
1098
  continue;
1099
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1100
  if roles[index] == "TITLE" && is_special_title_phrase(&text) {
1101
  output[index] = "SPECIAL".to_string();
1102
  continue;
@@ -1111,7 +1505,10 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
1111
  output[index + 2] = "SEASON".to_string();
1112
  continue;
1113
  }
1114
- if roles[index] == "TITLE" && roman.contains(&text.to_ascii_uppercase().as_str()) {
 
 
 
1115
  let previous_title = output[..index].iter().any(|role| role == "TITLE");
1116
  let next_structural = roles[index + 1..]
1117
  .iter()
@@ -1131,6 +1528,29 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
1131
  output[index + 2] = "O".to_string();
1132
  }
1133
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1134
  }
1135
  output
1136
  }
@@ -1301,7 +1721,7 @@ fn project_refined_tokens(
1301
  }
1302
  }
1303
  if matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE") {
1304
- if let Some((pieces, labels)) = split_sxe_token(token) {
1305
  output_tokens.extend(pieces);
1306
  output_labels.extend(labels);
1307
  continue;
@@ -1315,8 +1735,10 @@ fn project_refined_tokens(
1315
  continue;
1316
  }
1317
  }
1318
- output_labels.push(label_for_refined_piece(&piece, role, &group.class_name));
1319
- output_tokens.push(piece);
 
 
1320
  }
1321
  } else {
1322
  if role == "TITLE" && matches!(token.as_str(), "第" | "話" | "话" | "回" | "集")
@@ -1352,9 +1774,11 @@ fn project_refined_tokens(
1352
  }
1353
 
1354
  fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
1355
- let joiners = [" ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":"];
 
 
1356
  let entity_joiners = [
1357
- " ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", "&",
1358
  ];
1359
  let mut output = labels.to_vec();
1360
  for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
@@ -1442,6 +1866,8 @@ mod tests {
1442
 
1443
  let dxd = labels_for("High School D×D");
1444
  assert!(dxd.contains(&("×".to_string(), "B-TITLE".to_string())));
 
 
1445
 
1446
  let sxe = labels_for("S01E02");
1447
  assert_eq!(
@@ -1453,6 +1879,27 @@ mod tests {
1453
  ("02".to_string(), "B-EPISODE".to_string())
1454
  ]
1455
  );
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1456
 
1457
  let sky = labels_for("[Skytree][海贼王][One_Piece][918][GB_JP][1080P]");
1458
  assert!(sky.contains(&("One".to_string(), "B-TITLE".to_string())));
@@ -1486,5 +1933,101 @@ mod tests {
1486
  assert!(cjk_season.contains(&("魔道祖师".to_string(), "B-TITLE".to_string())));
1487
  assert!(cjk_season.contains(&("第一季".to_string(), "B-SEASON".to_string())));
1488
  assert!(!cjk_season.contains(&("第一季".to_string(), "B-TITLE".to_string())));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1489
  }
1490
  }
 
15
  #[derive(Parser, Debug)]
16
  #[command(about = "Apply DMHY template recipes with a multi-core Rust pipeline")]
17
  struct Args {
18
+ #[arg(long)]
19
+ cluster: bool,
20
  #[arg(long, default_value = "datasets/AnimeName/dmhy_list.jsonl")]
21
  input: PathBuf,
22
  #[arg(long, default_value = "reports/dmhy_template_recipes.seed.jsonl")]
 
31
  default_value = "reports/dmhy_weak.template_generated.rust.manifest.json"
32
  )]
33
  manifest_output: PathBuf,
34
+ #[arg(
35
+ long,
36
+ default_value = "reports/dmhy_template_clusters.rust.summary.json"
37
+ )]
38
+ summary_output: PathBuf,
39
+ #[arg(
40
+ long,
41
+ default_value = "reports/dmhy_template_clusters.rust.samples.jsonl"
42
+ )]
43
+ samples_output: PathBuf,
44
+ #[arg(long, default_value = "reports/dmhy_template_clusters.rust.jsonl")]
45
+ clusters_output: PathBuf,
46
+ #[arg(long, default_value = "reports/dmhy_template_recipes.rust.seed.jsonl")]
47
+ recipes_output: PathBuf,
48
+ #[arg(long, default_value = "reports/dmhy_template_review.rust.jsonl")]
49
+ review_output: PathBuf,
50
  #[arg(long)]
51
  limit: Option<usize>,
52
  #[arg(long)]
53
  limit_templates: Option<usize>,
54
  #[arg(long, default_value_t = 1)]
55
  min_count: u64,
56
+ #[arg(long, default_value_t = 200)]
57
+ top: usize,
58
+ #[arg(long, default_value_t = 200)]
59
+ recipe_top: usize,
60
+ #[arg(long, default_value_t = 1000)]
61
+ review_top: usize,
62
+ #[arg(long, default_value_t = 8)]
63
+ examples: usize,
64
+ #[arg(long, default_value_t = 10)]
65
+ recipe_min_count: usize,
66
  #[arg(long, default_value = "high")]
67
  confidence: String,
68
  #[arg(long, default_value = "all")]
 
72
  #[arg(long)]
73
  keep_encoding_noise: bool,
74
  #[arg(long)]
75
+ preserve_parent_paths: bool,
76
+ #[arg(long)]
77
  threads: Option<usize>,
78
  }
79
 
 
118
  written: usize,
119
  }
120
 
121
+ #[derive(Debug, Default)]
122
+ struct Cluster { // Running statistics for all filenames that share one template key.
123
+ count: usize, // Number of filenames added to this cluster.
124
+ examples: Vec<String>, // First few filenames seen, capped by the caller-supplied example limit.
125
+ literal_counts: HashMap<String, usize>, // Occurrences of unwrapped TEXT / BRACKET_TEXT tokens.
126
+ class_counts: HashMap<String, usize>, // Occurrences of each token class name.
127
+ position_literals: Vec<HashMap<String, usize>>, // Per group index: occurrences of that group's text (TEXT / BRACKET_TEXT groups only).
128
+ }
129
+
130
  #[derive(Debug)]
131
  enum Processed {
132
  Written {
 
162
  Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
163
  static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
164
  static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
165
+ Regex::new(r"(?i)^(?:NCOP|NCED|OP|ED|PV|CM|SP|OVA|OAD|IV|Menu|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?$").unwrap()
166
  });
167
  static VOLUME_RE: Lazy<Regex> =
168
  Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
 
180
  });
181
  static YEAR_RANGE_RE: Lazy<Regex> =
182
  Lazy::new(|| Regex::new(r"^\(?\s*(?:19|20)\d{2}\s*[-~]\s*(?:19|20)\d{2}\s*\)?$").unwrap());
183
+ static PATH_SEGMENT_SEASON_RE: Lazy<Regex> = Lazy::new(|| { // Case-insensitive: "season<N>" or "s<N>" (1-2 digits) delimited by start/end, whitespace, '_', '.', '-' or '/'.
184
+ Regex::new(r"(?i)(?:^|[\s_.\-/])(?:season\s*\d{1,2}|s\d{1,2})(?:$|[\s_.\-/])").unwrap()
185
+ });
186
+ static SEASON_WORD_NUMBER_RE: Lazy<Regex> = // Case-insensitive: "season"/"saison", optional whitespace, optional leading zero; group 1 is the 1-2 digit number.
187
+ Lazy::new(|| Regex::new(r"(?i)(?:season|saison)\s*0?(\d{1,2})").unwrap());
188
+ static S_NUMBER_SEGMENT_RE: Lazy<Regex> = // Case-insensitive: "s<N>" not adjacent to any letter or digit; group 1 is the number without one leading zero.
189
+ Lazy::new(|| Regex::new(r"(?i)(?:^|[^\p{L}\p{N}])s0?(\d{1,2})(?:$|[^\p{L}\p{N}])").unwrap());
190
+ static SXE_SEASON_RE: Lazy<Regex> = Lazy::new(|| { // Case-insensitive: "s<N>e<M>" not adjacent to any letter or digit; group 1 is the season number.
191
+ Regex::new(r"(?i)(?:^|[^\p{L}\p{N}])s0?(\d{1,2})e\d{1,4}(?:$|[^\p{L}\p{N}])").unwrap()
192
+ });
193
  static TOKEN_REGEXES: Lazy<Vec<Regex>> = Lazy::new(|| {
194
  [
195
  r"^\d{3,4}[xX×]\d{3,4}",
196
+ r"(?i)^h\.?26[45]",
197
+ r"(?i)^x\.?26[45]",
198
  r"^[\\/]+",
199
  r"^[-_.::+&|]+",
200
  r"^\s+",
 
220
  .build_global()
221
  .context("failed to configure rayon thread pool")?;
222
  }
223
+ if args.cluster {
224
+ return run_cluster(&args);
225
+ }
226
  if args.expand != "all" && args.expand != "sample" {
227
  bail!("--expand must be all or sample");
228
  }
 
385
  Ok(values)
386
  }
387
 
388
+ fn run_cluster(args: &Args) -> Result<()> {
389
+ let inputs = load_input(&args.input, args.limit)?;
390
+ let source_rows = inputs.len();
391
+ let mut clusters: HashMap<String, Cluster> = HashMap::new();
392
+ let mut skipped_encoding_noise = 0usize;
393
+ let mut trimmed_parent_path = 0usize;
394
+ let mut total_rows = 0usize;
395
+
396
+ for original in inputs {
397
+ if !args.keep_encoding_noise
398
+ && (has_encoding_noise(&original)
399
+ || has_non_anime_noise(&original)
400
+ || has_abstract_path_noise(&original))
401
+ {
402
+ skipped_encoding_noise += 1;
403
+ continue;
404
+ }
405
+ let filename = if args.preserve_parent_paths {
406
+ original
407
+ } else {
408
+ let (training_filename, was_trimmed) = training_filename_for(&original);
409
+ if was_trimmed {
410
+ trimmed_parent_path += 1;
411
+ }
412
+ training_filename
413
+ };
414
+ add_cluster(&mut clusters, &filename, args.examples);
415
+ total_rows += 1;
416
+ }
417
+
418
+ let mut sorted_clusters: Vec<_> = clusters.into_iter().collect();
419
+ sorted_clusters.sort_by(|a, b| b.1.count.cmp(&a.1.count).then_with(|| a.0.cmp(&b.0)));
420
+
421
+ let cluster_rows: Vec<Value> = sorted_clusters
422
+ .iter()
423
+ .enumerate()
424
+ .map(|(index, (key, cluster))| cluster_row(index + 1, key, cluster, total_rows))
425
+ .collect();
426
+ let samples: Vec<Value> = cluster_rows.iter().take(args.top).cloned().collect();
427
+ let recipe_candidates: Vec<Value> =
428
+ cluster_rows.iter().take(args.recipe_top).cloned().collect();
429
+ let recipes: Vec<Value> = recipe_candidates
430
+ .iter()
431
+ .filter(|row| is_high_confidence_recipe(row, args.recipe_min_count))
432
+ .map(|row| recipe_row(row, "high"))
433
+ .collect();
434
+ let review: Vec<Value> = recipe_candidates
435
+ .iter()
436
+ .filter(|row| !is_high_confidence_recipe(row, args.recipe_min_count))
437
+ .take(args.review_top)
438
+ .cloned()
439
+ .collect();
440
+
441
+ write_jsonl_values(&args.clusters_output, &cluster_rows)?;
442
+ write_jsonl_values(&args.samples_output, &samples)?;
443
+ write_jsonl_values(&args.recipes_output, &recipes)?;
444
+ write_jsonl_values(&args.review_output, &review)?;
445
+
446
+ let mut histogram: HashMap<usize, usize> = HashMap::new();
447
+ for (_, cluster) in &sorted_clusters {
448
+ *histogram.entry(cluster.count).or_default() += 1;
449
+ }
450
+ let mut count_histogram_top: Vec<_> = histogram.into_iter().collect();
451
+ count_histogram_top.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
452
+ count_histogram_top.truncate(20);
453
+
454
+ let rows_covered_by_repeated_templates: usize = sorted_clusters
455
+ .iter()
456
+ .map(|(_, cluster)| cluster)
457
+ .filter(|cluster| cluster.count as u64 >= args.min_count)
458
+ .map(|cluster| cluster.count)
459
+ .sum();
460
+ let templates_at_least_min_count = sorted_clusters
461
+ .iter()
462
+ .filter(|(_, cluster)| cluster.count as u64 >= args.min_count)
463
+ .count();
464
+ let top_templates: Vec<Value> = cluster_rows.iter().take(20).cloned().collect();
465
+ let summary = json!({
466
+ "input": args.input.to_string_lossy(),
467
+ "source_rows": source_rows,
468
+ "skipped_encoding_noise": skipped_encoding_noise,
469
+ "trimmed_parent_path": trimmed_parent_path,
470
+ "total_rows": total_rows,
471
+ "unique_templates": sorted_clusters.len(),
472
+ "min_count": args.min_count,
473
+ "templates_at_least_min_count": templates_at_least_min_count,
474
+ "rows_covered_by_repeated_templates": rows_covered_by_repeated_templates,
475
+ "rows_covered_by_repeated_templates_ratio": if total_rows == 0 { 0.0 } else { rows_covered_by_repeated_templates as f64 / total_rows as f64 },
476
+ "top_output_rows": samples.len(),
477
+ "clusters_output": args.clusters_output.to_string_lossy(),
478
+ "cluster_rows": cluster_rows.len(),
479
+ "recipes_output": args.recipes_output.to_string_lossy(),
480
+ "recipe_rows": recipes.len(),
481
+ "review_output": args.review_output.to_string_lossy(),
482
+ "review_rows": review.len(),
483
+ "recipe_top": args.recipe_top,
484
+ "recipe_min_count": args.recipe_min_count,
485
+ "top_templates": top_templates,
486
+ "count_histogram_top": count_histogram_top,
487
+ "implementation": "rust_dmhy_template_cluster",
488
+ "generated_at": Utc::now().to_rfc3339(),
489
+ });
490
+ if let Some(parent) = args.summary_output.parent() {
491
+ fs::create_dir_all(parent)?;
492
+ }
493
+ fs::write(
494
+ &args.summary_output,
495
+ serde_json::to_string_pretty(&summary)?,
496
+ )?;
497
+ println!("{}", serde_json::to_string_pretty(&summary)?);
498
+ Ok(())
499
+ }
500
+
501
+ fn add_cluster(clusters: &mut HashMap<String, Cluster>, filename: &str, example_limit: usize) {
502
+ let (key, tokens, classes, groups) = template_key_for_filename(filename);
503
+ let cluster = clusters.entry(key).or_default();
504
+ cluster.count += 1;
505
+ if cluster.examples.len() < example_limit {
506
+ cluster.examples.push(filename.to_string());
507
+ }
508
+ for (token, class_name) in tokens.iter().zip(classes.iter()) {
509
+ *cluster.class_counts.entry(class_name.clone()).or_default() += 1;
510
+ if matches!(class_name.as_str(), "TEXT" | "BRACKET_TEXT") {
511
+ let cleaned = strip_wrapper(token);
512
+ if !cleaned.is_empty() {
513
+ *cluster.literal_counts.entry(cleaned).or_default() += 1;
514
+ }
515
+ }
516
+ }
517
+ while cluster.position_literals.len() < groups.len() {
518
+ cluster.position_literals.push(HashMap::new());
519
+ }
520
+ for (index, group) in groups.iter().enumerate() {
521
+ if matches!(group.class_name.as_str(), "TEXT" | "BRACKET_TEXT") {
522
+ let text = group_text(&tokens, group);
523
+ if !text.is_empty() {
524
+ *cluster.position_literals[index].entry(text).or_default() += 1;
525
+ }
526
+ }
527
+ }
528
+ }
529
+
530
+ fn cluster_row(rank: usize, key: &str, cluster: &Cluster, total: usize) -> Value {
531
+ json!({
532
+ "template_id": format!("tpl_{rank:06}"),
533
+ "template": key,
534
+ "count": cluster.count,
535
+ "coverage": if total == 0 { 0.0 } else { cluster.count as f64 / total as f64 },
536
+ "top_literals": top_counts(&cluster.literal_counts, 12),
537
+ "suggested_roles": suggested_roles(key),
538
+ "position_top_literals": cluster.position_literals.iter().map(|counts| top_counts(counts, 5)).collect::<Vec<_>>(),
539
+ "class_counts": top_counts(&cluster.class_counts, 20),
540
+ "examples": cluster.examples,
541
+ })
542
+ }
543
+
544
/// Returns the `limit` most frequent entries of `counts`.
///
/// Entries are ordered by descending count, with ties broken by ascending key,
/// so the result is deterministic regardless of hash-map iteration order.
/// Keys are sorted by reference and only the surviving `limit` entries are
/// cloned, instead of cloning every key up front.
fn top_counts(counts: &HashMap<String, usize>, limit: usize) -> Vec<(String, usize)> {
    let mut items: Vec<(&String, usize)> = counts.iter().map(|(key, count)| (key, *count)).collect();
    // Map keys are unique, so the comparator is a total order and an unstable
    // sort yields the same result as a stable one.
    items.sort_unstable_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(b.0)));
    items
        .into_iter()
        .take(limit)
        .map(|(key, count)| (key.clone(), count))
        .collect()
}
553
+
554
+ fn is_high_confidence_recipe(row: &Value, min_count: usize) -> bool {
555
+ if row.get("count").and_then(Value::as_u64).unwrap_or(0) < min_count as u64 {
556
+ return false;
557
+ }
558
+ let roles = match row.get("suggested_roles").and_then(Value::as_array) {
559
+ Some(roles) => roles,
560
+ None => return false,
561
+ };
562
+ let role_strings: Vec<&str> = roles.iter().filter_map(Value::as_str).collect();
563
+ if role_strings.iter().any(|role| role.contains("_OR_")) {
564
+ return false;
565
+ }
566
+ if !role_strings.contains(&"TITLE")
567
+ || !role_strings.iter().any(|role| {
568
+ role.starts_with("EPISODE") || matches!(*role, "SPECIAL" | "SOURCE" | "RESOLUTION")
569
+ })
570
+ {
571
+ return false;
572
+ }
573
+ let template = row.get("template").and_then(Value::as_str).unwrap_or("");
574
+ if template.contains("BRACKET_TEXT BRACKET_TEXT") && !role_strings.contains(&"GROUP") {
575
+ return false;
576
+ }
577
+ !role_strings.contains(&"TITLE_OR_TEXT")
578
+ }
579
+
580
+ fn recipe_row(row: &Value, confidence: &str) -> Value {
581
+ json!({
582
+ "template_id": row["template_id"],
583
+ "template": row["template"],
584
+ "roles": row["suggested_roles"],
585
+ "confidence": confidence,
586
+ "count": row["count"],
587
+ "examples": row["examples"],
588
+ })
589
+ }
590
+
591
+ fn write_jsonl_values(path: &PathBuf, rows: &[Value]) -> Result<()> {
592
+ if let Some(parent) = path.parent() {
593
+ fs::create_dir_all(parent)?;
594
+ }
595
+ let mut writer = BufWriter::new(File::create(path)?);
596
+ for row in rows {
597
+ serde_json::to_writer(&mut writer, row)?;
598
+ writer.write_all(b"\n")?;
599
+ }
600
+ writer.flush()?;
601
+ Ok(())
602
+ }
603
+
604
  fn process_filename(
605
  original: &str,
606
  args: &Args,
 
742
  let mut parts = Vec::new();
743
  let mut current = String::new();
744
  for ch in inner.chars() {
745
+ if ch.is_whitespace() || "_.,+/&|-()()".contains(ch) {
746
  if !current.is_empty() {
747
  parts.push(std::mem::take(&mut current));
748
  }
 
856
  }
857
  if token.starts_with('[') || token.starts_with('(') || token.starts_with('【') {
858
  let inner = strip_wrapper(token);
 
859
  let parts = split_inner(&inner);
860
+ let whole_class = classify_atom(&inner);
861
  let inner_class = if whole_class != "TEXT" {
862
+ if whole_class == "LANG" && parts.len() > 1 {
863
+ let part_classes: Vec<String> =
864
+ parts.iter().map(|part| classify_atom(part)).collect();
865
+ if part_classes.iter().all(|item| item == &part_classes[0]) {
866
+ part_classes[0].clone()
867
+ } else if part_classes.iter().all(|item| is_media_block_class(item)) {
868
+ "MEDIA_BLOCK".to_string()
869
+ } else {
870
+ whole_class
871
+ }
872
+ } else {
873
+ whole_class
874
+ }
875
  } else if parts.is_empty() {
876
  "EMPTY".to_string()
877
  } else {
 
959
  "EPISODE_VERSION"
960
  } else if item.contains("EPISODE_RANGE") {
961
  "EPISODE_RANGE"
962
+ } else if item.contains("EPISODE") || item.contains("SXE") {
963
  "EPISODE"
964
  } else if item.contains("RESOLUTION") {
965
  "RESOLUTION"
 
1049
  .filter(|part| !part.is_empty())
1050
  .collect();
1051
  if parts.len() >= 2 && filename_has_title(parts[parts.len() - 1]) {
1052
+ if parts.len() >= 3 && path_segment_has_season(parts[parts.len() - 2]) {
1053
+ let parent_seasons = path_segment_seasons(parts[parts.len() - 2]);
1054
+ let leaf_seasons = path_segment_seasons(parts[parts.len() - 1]);
1055
+ if parent_seasons
1056
+ .iter()
1057
+ .any(|season| leaf_seasons.contains(season))
1058
+ {
1059
+ (parts[parts.len() - 1].to_string(), true)
1060
+ } else {
1061
+ (parts[parts.len() - 2..].join("/"), true)
1062
+ }
1063
+ } else {
1064
+ (parts[parts.len() - 1].to_string(), true)
1065
+ }
1066
  } else {
1067
  (original.to_string(), false)
1068
  }
1069
  }
1070
 
1071
+ fn path_segment_has_season(value: &str) -> bool { // True when `value` matches PATH_SEGMENT_SEASON_RE anywhere in the segment.
1072
+ PATH_SEGMENT_SEASON_RE.is_match(value)
1073
+ }
1074
+
1075
+ fn path_segment_seasons(value: &str) -> HashSet<u8> {
1076
+ SEASON_WORD_NUMBER_RE
1077
+ .captures_iter(value)
1078
+ .chain(S_NUMBER_SEGMENT_RE.captures_iter(value))
1079
+ .chain(SXE_SEASON_RE.captures_iter(value))
1080
+ .filter_map(|captures| captures.get(1))
1081
+ .filter_map(|item| item.as_str().parse::<u8>().ok())
1082
+ .collect()
1083
+ }
1084
+
1085
  fn has_encoding_noise(value: &str) -> bool {
1086
  if value.contains('\u{fffd}') {
1087
  return true;
 
1219
  let mut merged = Vec::new();
1220
  let mut index = 0;
1221
  while index < pieces.len() {
1222
+ if index + 2 < pieces.len()
1223
+ && !is_separator(&pieces[index])
1224
+ && is_separator(&pieces[index + 1])
1225
+ && !is_separator(&pieces[index + 2])
1226
+ {
1227
+ let combined = format!(
1228
+ "{}{}{}",
1229
+ pieces[index],
1230
+ pieces[index + 1],
1231
+ pieces[index + 2]
1232
+ );
1233
+ let combined_class = classify_atom(&combined);
1234
+ if !pieces[index + 1].chars().any(char::is_whitespace)
1235
+ && matches!(pieces[index + 1].as_str(), "." | "x" | "X" | "×")
1236
+ && matches!(
1237
+ combined_class.as_str(),
1238
+ "RESOLUTION" | "MEDIA" | "LANG" | "HASH" | "SXE" | "EPISODE_VERSION"
1239
+ )
1240
+ {
1241
+ merged.push(combined);
1242
+ index += 3;
1243
+ continue;
1244
+ }
1245
+ }
1246
  if !is_separator(&pieces[index]) {
1247
  let mut end = index;
1248
  let mut combined = String::new();
 
1399
  fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]) -> Vec<String> {
1400
  let mut output = roles.to_vec();
1401
  let ep_markers = ["EP", "E", "Episode", "ep", "episode"];
1402
+ let roman = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX"];
1403
  if !output.iter().any(|role| role == "TITLE")
1404
  && roles
1405
  .first()
 
1419
  }
1420
  }
1421
  if title_run.len() >= 2 {
1422
+ let last_title_index = *title_run.last().unwrap();
1423
+ let later_structural = roles[last_title_index + 1..].iter().any(|role| {
1424
+ role.starts_with("EPISODE") || matches!(role.as_str(), "SEASON" | "SPECIAL")
1425
+ });
1426
+ if group_text(tokens, &groups[0])
1427
+ .chars()
1428
+ .all(|ch| ch.is_ascii_digit())
1429
+ && later_structural
1430
+ {
1431
+ output[0] = "TITLE".to_string();
1432
+ }
1433
  for index in title_run {
1434
  output[index] = "TITLE".to_string();
1435
  }
1436
  }
1437
  }
1438
+ if roles
1439
+ .first()
1440
+ .is_some_and(|role| role.starts_with("EPISODE"))
1441
+ && group_text(tokens, &groups[0])
1442
+ .chars()
1443
+ .all(|ch| ch.is_ascii_digit())
1444
+ {
1445
+ if let Some(first_title) = output.iter().position(|role| role == "TITLE") {
1446
+ let later_structural = roles[first_title + 1..].iter().any(|role| {
1447
+ role.starts_with("EPISODE") || matches!(role.as_str(), "SEASON" | "SPECIAL")
1448
+ });
1449
+ if later_structural {
1450
+ output[0] = "TITLE".to_string();
1451
+ }
1452
+ }
1453
+ }
1454
  for index in 0..roles.len() {
1455
  let text = group_text(tokens, &groups[index]);
1456
+ if output[index] == "O" && groups[index].class_name.contains("SXE") {
1457
+ output[index] = "EPISODE".to_string();
1458
+ }
1459
  if roles[index].starts_with("EPISODE") && YEAR_RANGE_RE.is_match(&text) {
1460
  output[index] = "O".to_string();
1461
  continue;
1462
  }
1463
+ if roles[index].starts_with("EPISODE") && (2..roles.len()).contains(&index) {
1464
+ let previous_text = group_text(tokens, &groups[index - 2]);
1465
+ let next_special = output[index + 1..roles.len().min(index + 4)]
1466
+ .iter()
1467
+ .any(|role| role == "SPECIAL");
1468
+ let next_episode = roles[index + 1..]
1469
+ .iter()
1470
+ .any(|role| role.starts_with("EPISODE"));
1471
+ if groups[index - 1].class_name == "SEP"
1472
+ && matches!(
1473
+ previous_text.to_ascii_lowercase().as_str(),
1474
+ "vol" | "volume"
1475
+ )
1476
+ {
1477
+ output[index - 2] = "SPECIAL".to_string();
1478
+ output[index] = "SPECIAL".to_string();
1479
+ continue;
1480
+ }
1481
+ if output[index - 2] == "TITLE"
1482
+ && groups[index - 1].class_name == "SEP"
1483
+ && previous_text.len() <= 4
1484
+ && previous_text.is_ascii()
1485
+ && previous_text.chars().all(|ch| ch.is_ascii_alphabetic())
1486
+ && text.chars().all(|ch| ch.is_ascii_digit())
1487
+ && text.len() <= 3
1488
+ && (next_special || next_episode)
1489
+ {
1490
+ output[index] = "TITLE".to_string();
1491
+ continue;
1492
+ }
1493
+ }
1494
  if roles[index] == "TITLE" && is_special_title_phrase(&text) {
1495
  output[index] = "SPECIAL".to_string();
1496
  continue;
 
1505
  output[index + 2] = "SEASON".to_string();
1506
  continue;
1507
  }
1508
+ if roles[index] == "TITLE"
1509
+ && text == text.to_ascii_uppercase()
1510
+ && roman.contains(&text.as_str())
1511
+ {
1512
  let previous_title = output[..index].iter().any(|role| role == "TITLE");
1513
  let next_structural = roles[index + 1..]
1514
  .iter()
 
1528
  output[index + 2] = "O".to_string();
1529
  }
1530
  }
1531
+ if roles[index].starts_with("EPISODE") {
1532
+ let previous_text = if index >= 1 {
1533
+ group_text(tokens, &groups[index - 1])
1534
+ } else {
1535
+ String::new()
1536
+ };
1537
+ let next_text = if index + 1 < roles.len() {
1538
+ group_text(tokens, &groups[index + 1])
1539
+ } else {
1540
+ String::new()
1541
+ };
1542
+ if previous_text.contains('点')
1543
+ || previous_text.contains('點')
1544
+ || previous_text.contains("晚上")
1545
+ || previous_text.contains("上午")
1546
+ || previous_text.contains("下午")
1547
+ || next_text.contains('点')
1548
+ || next_text.contains('點')
1549
+ || next_text.contains('半')
1550
+ {
1551
+ output[index] = "O".to_string();
1552
+ }
1553
+ }
1554
  }
1555
  output
1556
  }
 
1721
  }
1722
  }
1723
  if matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE") {
1724
+ if let Some((pieces, labels)) = split_sxe_token(&strip_wrapper(token)) {
1725
  output_tokens.extend(pieces);
1726
  output_labels.extend(labels);
1727
  continue;
 
1735
  continue;
1736
  }
1737
  }
1738
+ let label = label_for_refined_piece(&piece, role, &group.class_name);
1739
+ let (pieces, labels) = normalize_generated_tokens(&[piece], &[label]);
1740
+ output_tokens.extend(pieces);
1741
+ output_labels.extend(labels);
1742
  }
1743
  } else {
1744
  if role == "TITLE" && matches!(token.as_str(), "第" | "話" | "话" | "回" | "集")
 
1774
  }
1775
 
1776
  fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
1777
+ let joiners = [
1778
+ " ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!",
1779
+ ];
1780
  let entity_joiners = [
1781
+ " ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "&", "&",
1782
  ];
1783
  let mut output = labels.to_vec();
1784
  for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
 
1866
 
1867
  let dxd = labels_for("High School D×D");
1868
  assert!(dxd.contains(&("×".to_string(), "B-TITLE".to_string())));
1869
+ let colon_title = labels_for("Megumi no Daigo:Kyuukoku no Orange 06");
1870
+ assert!(colon_title.contains(&(":".to_string(), "B-TITLE".to_string())));
1871
 
1872
  let sxe = labels_for("S01E02");
1873
  assert_eq!(
 
1879
  ("02".to_string(), "B-EPISODE".to_string())
1880
  ]
1881
  );
1882
+ let bracket_sxe = labels_for("[FLsnow.feat.PO][Himitsu_no_Aipri][1080P][S2E01]");
1883
+ assert!(bracket_sxe.contains(&("2".to_string(), "B-SEASON".to_string())));
1884
+ assert!(bracket_sxe.contains(&("01".to_string(), "B-EPISODE".to_string())));
1885
+
1886
+ let cursed = labels_for("[Coalgirls]_C3-Cube_x_Cursed_x_Curious_01_[8E416230]");
1887
+ assert!(cursed.contains(&("x".to_string(), "B-TITLE".to_string())));
1888
+ assert!(!cursed.contains(&("x".to_string(), "B-SEASON".to_string())));
1889
+ let beyblade = labels_for("[jibaketa]Beyblade X - 118 (WEB 1920x1080 AVC AAC)");
1890
+ assert!(beyblade.contains(&("X".to_string(), "B-TITLE".to_string())));
1891
+ assert!(!beyblade.contains(&("X".to_string(), "B-SEASON".to_string())));
1892
+ let bang_title = labels_for("[Dymy][Gugure! Kokkuri-san][06][BIG5][1280X720]");
1893
+ assert!(bang_title.contains(&("!".to_string(), "B-TITLE".to_string())));
1894
+
1895
+ let conan_time = labels_for("【蓝色狂想】名侦探柯南TV版第036集-周一晚上7点半杀人事件");
1896
+ assert!(conan_time.contains(&("036".to_string(), "B-EPISODE".to_string())));
1897
+ assert!(!conan_time.contains(&("7".to_string(), "B-EPISODE".to_string())));
1898
+ let zom =
1899
+ labels_for("[Nekomoe kissaten&VCB-Studio] Zom 100 [Animatics02][Ma10p_1080p][x265]");
1900
+ assert!(zom.contains(&("100".to_string(), "B-TITLE".to_string())));
1901
+ assert!(!zom.contains(&("100".to_string(), "B-EPISODE".to_string())));
1902
+ assert!(zom.contains(&("Animatics02".to_string(), "B-SPECIAL".to_string())));
1903
 
1904
  let sky = labels_for("[Skytree][海贼王][One_Piece][918][GB_JP][1080P]");
1905
  assert!(sky.contains(&("One".to_string(), "B-TITLE".to_string())));
 
1933
  assert!(cjk_season.contains(&("魔道祖师".to_string(), "B-TITLE".to_string())));
1934
  assert!(cjk_season.contains(&("第一季".to_string(), "B-SEASON".to_string())));
1935
  assert!(!cjk_season.contains(&("第一季".to_string(), "B-TITLE".to_string())));
1936
+
1937
+ let (trimmed, was_trimmed) =
1938
+ training_filename_for("12/小剧场/[LKSUB][KAGE-JITSU!][01][GB][720P]");
1939
+ assert!(was_trimmed);
1940
+ assert_eq!(trimmed, "[LKSUB][KAGE-JITSU!][01][GB][720P]");
1941
+ let (key, _, _, _) = template_key_for_filename(&trimmed);
1942
+ assert_eq!(
1943
+ key,
1944
+ "BRACKET_TEXT BRACKET_TEXT BRACKET_EPISODE BRACKET_LANG BRACKET_RESOLUTION"
1945
+ );
1946
+
1947
+ let short = labels_for("[Snow-Raws] R-15 CM&PV12 (BD 1920x1080 HEVC-YUV420P10 FLAC)");
1948
+ assert!(short.contains(&("R".to_string(), "B-TITLE".to_string())));
1949
+ assert!(short.contains(&("-".to_string(), "B-TITLE".to_string())));
1950
+ assert!(short.contains(&("15".to_string(), "B-TITLE".to_string())));
1951
+ assert!(!short.contains(&("15".to_string(), "B-EPISODE".to_string())));
1952
+
1953
+ let short_before_episode =
1954
+ labels_for("[Snow-Raws] R-15 第01話 (BD 1920x1080 HEVC-YUV420P10 FLAC)");
1955
+ assert!(short_before_episode.contains(&("R".to_string(), "B-TITLE".to_string())));
1956
+ assert!(short_before_episode.contains(&("-".to_string(), "B-TITLE".to_string())));
1957
+ assert!(short_before_episode.contains(&("15".to_string(), "B-TITLE".to_string())));
1958
+ assert!(short_before_episode.contains(&("01".to_string(), "B-EPISODE".to_string())));
1959
+ assert!(!short_before_episode.contains(&("15".to_string(), "B-EPISODE".to_string())));
1960
+
1961
+ let avatar = "Avatar The Last Airbender S2/Avatar The Last Airbender S2 14 [1080p]";
1962
+ let (trimmed, was_trimmed) = training_filename_for(avatar);
1963
+ assert!(was_trimmed);
1964
+ assert_eq!(trimmed, "Avatar The Last Airbender S2 14 [1080p]");
1965
+
1966
+ let tintin = "Adventures of Tintin (1991) Season 1-3 S01-S03 (1080p BluRay x265 HEVC 10bit EAC3 2.0 Garshasp)/Season 1/Adventures of Tintin (1991) - S01E06 - Cigars of the Pharaoh (Part 1) (1080p BluRay x265 Garshasp)";
1967
+ let (trimmed, was_trimmed) = training_filename_for(tintin);
1968
+ assert!(was_trimmed);
1969
+ assert_eq!(
1970
+ trimmed,
1971
+ "Adventures of Tintin (1991) - S01E06 - Cigars of the Pharaoh (Part 1) (1080p BluRay x265 Garshasp)"
1972
+ );
1973
+ let (key, _, _, _) = template_key_for_filename(&trimmed);
1974
+ assert_eq!(
1975
+ key,
1976
+ "TEXT SEP TEXT SEP TEXT SEP BRACKET_DATE SEP SXE SEP TEXT SEP TEXT SEP TEXT SEP TEXT SEP BRACKET_TEXT SEP BRACKET_TEXT"
1977
+ );
1978
+
1979
+ let bocchi = "Bocchi the Rock S01 孤獨搖滾!第一季 [Taiwanese Hokkien Dub][臺灣閩南語配音]/Bocchi the Rock S01 孤獨搖滾!第一季 [Taiwanese Hokkien Dub][Hàn-jī Hardsub][臺灣閩南語配音][漢字字幕]/Bocchi the Rock! 孤獨搖滾!S01E01「孤獨反輾轉」";
1980
+ let (leaf_key, _, _, _) =
1981
+ template_key_for_filename("Bocchi the Rock! 孤獨搖滾!S01E01「孤獨反輾轉」");
1982
+ assert_eq!(leaf_key, "TEXT SEP TEXT SEP TEXT SEP TEXT SXE TEXT");
1983
+ assert!(filename_has_title(
1984
+ "Bocchi the Rock! 孤獨搖滾!S01E01「孤獨反輾轉」"
1985
+ ));
1986
+ let (trimmed, was_trimmed) = training_filename_for(bocchi);
1987
+ assert!(was_trimmed);
1988
+ assert_eq!(trimmed, "Bocchi the Rock! 孤獨搖滾!S01E01「孤獨反輾轉」");
1989
+ let (key, _, _, _) = template_key_for_filename(&trimmed);
1990
+ assert_eq!(key, "TEXT SEP TEXT SEP TEXT SEP TEXT SXE TEXT");
1991
+
1992
+ let usagi = "Gochuumon wa Usagi Desuka-60fps/Gochuumon wa Usagi Desuka S1/Usagi S1[01][60fps][8bit_1080p][x265_flac]";
1993
+ let (trimmed, was_trimmed) = training_filename_for(usagi);
1994
+ assert!(was_trimmed);
1995
+ assert_eq!(trimmed, "Usagi S1[01][60fps][8bit_1080p][x265_flac]");
1996
+ let (key, _, _, _) = template_key_for_filename(&trimmed);
1997
+ assert_eq!(
1998
+ key,
1999
+ "TEXT SEP SEASON BRACKET_EPISODE BRACKET_TEXT BRACKET_MEDIA_BLOCK BRACKET_MEDIA"
2000
+ );
2001
+
2002
+ let woody_parent =
2003
+ "Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p";
2004
+ let (trimmed, was_trimmed) = training_filename_for(&format!("Batch/{woody_parent}"));
2005
+ assert!(was_trimmed);
2006
+ assert_eq!(trimmed, woody_parent);
2007
+
2008
+ let volume =
2009
+ labels_for("[Snow-Raws] 生徒会役員共 Vol.01 MENU02 (BD 1920x1080 HEVC-YUV420P10 FLAC)");
2010
+ assert!(volume.contains(&("生徒会役員共".to_string(), "B-TITLE".to_string())));
2011
+ assert!(volume.contains(&("Vol".to_string(), "B-SPECIAL".to_string())));
2012
+ assert!(volume.contains(&("01".to_string(), "B-SPECIAL".to_string())));
2013
+ assert!(volume.contains(&("MENU02".to_string(), "B-SPECIAL".to_string())));
2014
+ assert!(!volume.contains(&("01".to_string(), "B-EPISODE".to_string())));
2015
+
2016
+ let numeric_title =
2017
+ labels_for("3000.Leagues.in.Search.of.Mother.S01E01.1080p.WEB-DL.H.264-D00oo00M");
2018
+ assert!(numeric_title.contains(&("3000".to_string(), "B-TITLE".to_string())));
2019
+ assert!(numeric_title.contains(&("01".to_string(), "B-SEASON".to_string())));
2020
+ assert!(numeric_title.contains(&("01".to_string(), "B-EPISODE".to_string())));
2021
+ assert!(numeric_title.contains(&("1080p".to_string(), "B-RESOLUTION".to_string())));
2022
+ assert!(numeric_title.contains(&("H".to_string(), "B-SOURCE".to_string())));
2023
+ assert!(numeric_title.contains(&("264".to_string(), "B-SOURCE".to_string())));
2024
+ assert!(!numeric_title.contains(&("264".to_string(), "B-EPISODE".to_string())));
2025
+
2026
+ let media_block =
2027
+ labels_for("[Kamigami] Kantai Collection - 06v2 [1920×1080 x264 AAC Sub(Chs,Cht,Jap)]");
2028
+ assert!(media_block.contains(&("1920".to_string(), "B-RESOLUTION".to_string())));
2029
+ assert!(media_block.contains(&("1080".to_string(), "B-RESOLUTION".to_string())));
2030
+ assert!(media_block.contains(&("x264".to_string(), "B-SOURCE".to_string())));
2031
+ assert!(media_block.contains(&("Chs".to_string(), "B-SOURCE".to_string())));
2032
  }
2033
  }