ModerRAS committed on
Commit
79b4d87
·
1 Parent(s): 01e4424

Refine DMHY template labeling rules

Browse files
tools/rust_dmhy_template_apply/README.md CHANGED
@@ -17,7 +17,7 @@ cargo run --release --manifest-path tools\rust_dmhy_template_apply\Cargo.toml --
17
  --recipe-top 5000 `
18
  --review-top 5000 `
19
  --min-count 2 `
20
- --recipe-min-count 10 `
21
  --threads 24
22
  ```
23
 
 
17
  --recipe-top 5000 `
18
  --review-top 5000 `
19
  --min-count 2 `
20
+ --recipe-min-count 25 `
21
  --threads 24
22
  ```
23
 
tools/rust_dmhy_template_apply/src/main.rs CHANGED
@@ -61,7 +61,7 @@ struct Args {
61
  review_top: usize,
62
  #[arg(long, default_value_t = 8)]
63
  examples: usize,
64
- #[arg(long, default_value_t = 10)]
65
  recipe_min_count: usize,
66
  #[arg(long, default_value = "high")]
67
  confidence: String,
@@ -155,6 +155,8 @@ static EPISODE_BATCH_RE: Lazy<Regex> = Lazy::new(|| {
155
  static SXE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S\d{1,2}E\d{1,4}(?:v\d+)?$").unwrap());
156
  static SXE_VALUE_RE: Lazy<Regex> =
157
  Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})E(\d{1,4})(?:v(\d+))?$").unwrap());
 
 
158
  static SEASON_RE: Lazy<Regex> = Lazy::new(|| {
159
  Regex::new(r"(?i)^(?:S\d{1,2}|Season\s*\d{1,2}|第[一二三四五六七八九十\d]+[季期部])$").unwrap()
160
  });
@@ -1357,6 +1359,19 @@ fn split_sxe_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
1357
  Some((pieces, labels))
1358
  }
1359
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1360
  fn split_season_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
1361
  let caps = SEASON_VALUE_RE.captures(token)?;
1362
  Some((
@@ -1491,6 +1506,44 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
1491
  continue;
1492
  }
1493
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1494
  if roles[index] == "TITLE" && is_special_title_phrase(&text) {
1495
  output[index] = "SPECIAL".to_string();
1496
  continue;
@@ -1726,6 +1779,11 @@ fn project_refined_tokens(
1726
  output_labels.extend(labels);
1727
  continue;
1728
  }
 
 
 
 
 
1729
  }
1730
  for piece in split_refined_token(token) {
1731
  if matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE") {
@@ -1734,6 +1792,11 @@ fn project_refined_tokens(
1734
  output_labels.extend(labels);
1735
  continue;
1736
  }
 
 
 
 
 
1737
  }
1738
  let label = label_for_refined_piece(&piece, role, &group.class_name);
1739
  let (pieces, labels) = normalize_generated_tokens(&[piece], &[label]);
@@ -1879,6 +1942,9 @@ mod tests {
1879
  ("02".to_string(), "B-EPISODE".to_string())
1880
  ]
1881
  );
 
 
 
1882
  let bracket_sxe = labels_for("[FLsnow.feat.PO][Himitsu_no_Aipri][1080P][S2E01]");
1883
  assert!(bracket_sxe.contains(&("2".to_string(), "B-SEASON".to_string())));
1884
  assert!(bracket_sxe.contains(&("01".to_string(), "B-EPISODE".to_string())));
@@ -1891,6 +1957,11 @@ mod tests {
1891
  assert!(!beyblade.contains(&("X".to_string(), "B-SEASON".to_string())));
1892
  let bang_title = labels_for("[Dymy][Gugure! Kokkuri-san][06][BIG5][1280X720]");
1893
  assert!(bang_title.contains(&("!".to_string(), "B-TITLE".to_string())));
 
 
 
 
 
1894
 
1895
  let conan_time = labels_for("【蓝色狂想】名侦探柯南TV版第036集-周一晚上7点半杀人事件");
1896
  assert!(conan_time.contains(&("036".to_string(), "B-EPISODE".to_string())));
@@ -1918,7 +1989,8 @@ mod tests {
1918
  );
1919
  let woody = labels_for(&trimmed);
1920
  assert!(woody.contains(&("4".to_string(), "B-SEASON".to_string())));
1921
- assert!(woody.contains(&("E07".to_string(), "B-EPISODE".to_string())));
 
1922
  assert!(woody.contains(&("The".to_string(), "B-TITLE".to_string())));
1923
  assert!(woody.contains(&("Show".to_string(), "B-TITLE".to_string())));
1924
  assert!(!woody.contains(&("1999".to_string(), "B-EPISODE".to_string())));
 
61
  review_top: usize,
62
  #[arg(long, default_value_t = 8)]
63
  examples: usize,
64
+ #[arg(long, default_value_t = 25)]
65
  recipe_min_count: usize,
66
  #[arg(long, default_value = "high")]
67
  confidence: String,
 
155
  static SXE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S\d{1,2}E\d{1,4}(?:v\d+)?$").unwrap());
156
  static SXE_VALUE_RE: Lazy<Regex> =
157
  Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})E(\d{1,4})(?:v(\d+))?$").unwrap());
158
+ static EPISODE_VALUE_RE: Lazy<Regex> = // case-insensitive pattern anchored to the whole string (`(?i)`, `^…$`)
159
+ Lazy::new(|| Regex::new(r"(?i)^(EP|E|#)(\d{1,4})(?:v(\d+))?$").unwrap()); // captures: 1 = prefix `EP`, `E` or `#`; 2 = one to four digits; 3 = optional digits following a `v`
160
  static SEASON_RE: Lazy<Regex> = Lazy::new(|| {
161
  Regex::new(r"(?i)^(?:S\d{1,2}|Season\s*\d{1,2}|第[一二三四五六七八九十\d]+[季期部])$").unwrap()
162
  });
 
1359
  Some((pieces, labels))
1360
  }
1361
 
1362
+ fn split_episode_token(token: &str) -> Option<(Vec<String>, Vec<String>)> { // splits a token matching EPISODE_VALUE_RE into parallel (pieces, labels) vectors
1363
+ let caps = EPISODE_VALUE_RE.captures(token)?; // `?` returns None when the token does not match the pattern
1364
+ let mut pieces = vec![caps[1].to_string(), caps[2].to_string()]; // capture 1 and capture 2 become the first two pieces
1365
+ let mut labels = vec!["O".to_string(), "B-EPISODE".to_string()]; // first piece is labelled "O", second piece "B-EPISODE"
1366
+ if let Some(version) = caps.get(3) { // capture 3 is optional; this branch runs only when it matched
1367
+ pieces.push("v".to_string()); // NOTE(review): pushes a literal lowercase "v" rather than text from the token; the pattern is case-insensitive, so confirm an uppercase "V" is meant to be normalised
1368
+ pieces.push(version.as_str().to_string()); // the matched version digits
1369
+ labels.push("O".to_string()); // label for the "v" piece
1370
+ labels.push("O".to_string()); // label for the version digits
1371
+ }
1372
+ Some((pieces, labels)) // two entries per vector without a version, four with one
1373
+ }
1374
+
1375
  fn split_season_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
1376
  let caps = SEASON_VALUE_RE.captures(token)?;
1377
  Some((
 
1506
  continue;
1507
  }
1508
  }
1509
+ if roles[index].starts_with("EPISODE")
1510
+ && index >= 2
1511
+ && output[..index].iter().any(|role| role == "TITLE")
1512
+ && group_text(tokens, &groups[index])
1513
+ .chars()
1514
+ .all(|ch| ch.is_ascii_digit())
1515
+ {
1516
+ let next_episode_word = index + 2 < roles.len()
1517
+ && groups[index + 1].class_name == "SEP"
1518
+ && group_text(tokens, &groups[index + 2]).eq_ignore_ascii_case("episode");
1519
+ if next_episode_word {
1520
+ let mut run = Vec::new();
1521
+ let mut cursor = index + 2;
1522
+ while cursor < roles.len() {
1523
+ if groups[cursor].class_name == "SEP" {
1524
+ cursor += 1;
1525
+ continue;
1526
+ }
1527
+ if groups[cursor].class_name == "TEXT" && !roles[cursor].starts_with("EPISODE")
1528
+ {
1529
+ run.push(cursor);
1530
+ cursor += 1;
1531
+ continue;
1532
+ }
1533
+ break;
1534
+ }
1535
+ let later_episode = roles[cursor..]
1536
+ .iter()
1537
+ .any(|role| role.starts_with("EPISODE"));
1538
+ if run.len() >= 2 && later_episode {
1539
+ output[index] = "TITLE".to_string();
1540
+ for item in run {
1541
+ output[item] = "TITLE".to_string();
1542
+ }
1543
+ continue;
1544
+ }
1545
+ }
1546
+ }
1547
  if roles[index] == "TITLE" && is_special_title_phrase(&text) {
1548
  output[index] = "SPECIAL".to_string();
1549
  continue;
 
1779
  output_labels.extend(labels);
1780
  continue;
1781
  }
1782
+ if let Some((pieces, labels)) = split_episode_token(&strip_wrapper(token)) {
1783
+ output_tokens.extend(pieces);
1784
+ output_labels.extend(labels);
1785
+ continue;
1786
+ }
1787
  }
1788
  for piece in split_refined_token(token) {
1789
  if matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE") {
 
1792
  output_labels.extend(labels);
1793
  continue;
1794
  }
1795
+ if let Some((pieces, labels)) = split_episode_token(&piece) {
1796
+ output_tokens.extend(pieces);
1797
+ output_labels.extend(labels);
1798
+ continue;
1799
+ }
1800
  }
1801
  let label = label_for_refined_piece(&piece, role, &group.class_name);
1802
  let (pieces, labels) = normalize_generated_tokens(&[piece], &[label]);
 
1942
  ("02".to_string(), "B-EPISODE".to_string())
1943
  ]
1944
  );
1945
+ let ep_prefix = labels_for("Toradora! EP01 [BD 1080p]");
1946
+ assert!(ep_prefix.contains(&("EP".to_string(), "O".to_string())));
1947
+ assert!(ep_prefix.contains(&("01".to_string(), "B-EPISODE".to_string())));
1948
  let bracket_sxe = labels_for("[FLsnow.feat.PO][Himitsu_no_Aipri][1080P][S2E01]");
1949
  assert!(bracket_sxe.contains(&("2".to_string(), "B-SEASON".to_string())));
1950
  assert!(bracket_sxe.contains(&("01".to_string(), "B-EPISODE".to_string())));
 
1957
  assert!(!beyblade.contains(&("X".to_string(), "B-SEASON".to_string())));
1958
  let bang_title = labels_for("[Dymy][Gugure! Kokkuri-san][06][BIG5][1280X720]");
1959
  assert!(bang_title.contains(&("!".to_string(), "B-TITLE".to_string())));
1960
+ let pso2 = labels_for("[Lilith-Raws] Phantasy Star Online 2 Episode Oracle - 01 [1080p]");
1961
+ assert!(pso2.contains(&("2".to_string(), "B-TITLE".to_string())));
1962
+ assert!(pso2.contains(&("Episode".to_string(), "B-TITLE".to_string())));
1963
+ assert!(pso2.contains(&("Oracle".to_string(), "B-TITLE".to_string())));
1964
+ assert!(pso2.contains(&("01".to_string(), "B-EPISODE".to_string())));
1965
 
1966
  let conan_time = labels_for("【蓝色狂想】名侦探柯南TV版第036集-周一晚上7点半杀人事件");
1967
  assert!(conan_time.contains(&("036".to_string(), "B-EPISODE".to_string())));
 
1989
  );
1990
  let woody = labels_for(&trimmed);
1991
  assert!(woody.contains(&("4".to_string(), "B-SEASON".to_string())));
1992
+ assert!(woody.contains(&("E".to_string(), "O".to_string())));
1993
+ assert!(woody.contains(&("07".to_string(), "B-EPISODE".to_string())));
1994
  assert!(woody.contains(&("The".to_string(), "B-TITLE".to_string())));
1995
  assert!(woody.contains(&("Show".to_string(), "B-TITLE".to_string())));
1996
  assert!(!woody.contains(&("1999".to_string(), "B-EPISODE".to_string())));