ModerRAS committed on
Commit
8c1da73
·
1 Parent(s): 219e9dd

Add Rust template title span metadata

Browse files
tools/rust_dmhy_template_apply/src/main.rs CHANGED
@@ -715,14 +715,167 @@ fn add_cluster(clusters: &mut HashMap<String, Cluster>, filename: &str, example_
715
  }
716
  }
717
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
718
  fn cluster_row(rank: usize, key: &str, cluster: &Cluster, total: usize) -> Value {
 
 
 
719
  json!({
720
  "template_id": format!("tpl_{rank:06}"),
721
  "template": key,
722
  "count": cluster.count,
723
  "coverage": if total == 0 { 0.0 } else { cluster.count as f64 / total as f64 },
724
  "top_literals": top_counts(&cluster.literal_counts, 12),
725
- "suggested_roles": suggested_roles(key),
 
 
726
  "position_top_literals": cluster.position_literals.iter().map(|counts| top_counts(counts, 5)).collect::<Vec<_>>(),
727
  "class_counts": top_counts(&cluster.class_counts, 20),
728
  "examples": cluster.examples,
@@ -770,6 +923,8 @@ fn recipe_row(row: &Value, confidence: &str) -> Value {
770
  "template_id": row["template_id"],
771
  "template": row["template"],
772
  "roles": row["suggested_roles"],
 
 
773
  "confidence": confidence,
774
  "count": row["count"],
775
  "examples": row["examples"],
@@ -4550,6 +4705,40 @@ mod tests {
4550
  }
4551
  }
4552
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4553
  #[test]
4554
  fn required_regressions() {
4555
  let title_91 = labels_for("Title 91 EP 01 [1080p]");
 
715
  }
716
  }
717
 
718
/// Reduces `value` to its lowercase alphanumeric characters so that case,
/// spacing and punctuation do not affect similarity comparisons.
fn normalized_similarity_text(value: &str) -> String {
    let mut normalized = String::with_capacity(value.len());
    for original in value.chars() {
        // Lowercasing a single char may expand to several chars; each
        // resulting char is filtered on its own.
        for lowered in original.to_lowercase() {
            if lowered.is_alphanumeric() {
                normalized.push(lowered);
            }
        }
    }
    normalized
}
725
+
726
/// Returns the set of distinct two-character windows of `value`.
///
/// An empty input yields an empty set; a single-character input yields a set
/// holding that one character as a string.
fn char_bigrams(value: &str) -> HashSet<String> {
    let chars: Vec<char> = value.chars().collect();
    match chars.as_slice() {
        [] => HashSet::new(),
        [_] => HashSet::from([value.to_string()]),
        longer => longer
            .windows(2)
            .map(|pair| pair.iter().collect::<String>())
            .collect(),
    }
}
739
+
740
+ fn dice_similarity(left: &str, right: &str) -> f64 {
741
+ let left = normalized_similarity_text(left);
742
+ let right = normalized_similarity_text(right);
743
+ if left.is_empty() || right.is_empty() {
744
+ return 0.0;
745
+ }
746
+ if left == right {
747
+ return 1.0;
748
+ }
749
+ if left.contains(&right) || right.contains(&left) {
750
+ let shorter = left.chars().count().min(right.chars().count()) as f64;
751
+ let longer = left.chars().count().max(right.chars().count()) as f64;
752
+ return shorter / longer;
753
+ }
754
+ let left_bigrams = char_bigrams(&left);
755
+ let right_bigrams = char_bigrams(&right);
756
+ if left_bigrams.is_empty() || right_bigrams.is_empty() {
757
+ return 0.0;
758
+ }
759
+ let overlap = left_bigrams.intersection(&right_bigrams).count() as f64;
760
+ 2.0 * overlap / (left_bigrams.len() + right_bigrams.len()) as f64
761
+ }
762
+
763
+ fn position_literal_similarity(cluster: &Cluster, left_index: usize, right_index: usize) -> f64 {
764
+ let left = match cluster.position_literals.get(left_index) {
765
+ Some(value) => top_counts(value, 5),
766
+ None => return 0.0,
767
+ };
768
+ let right = match cluster.position_literals.get(right_index) {
769
+ Some(value) => top_counts(value, 5),
770
+ None => return 0.0,
771
+ };
772
+ let mut best = 0.0;
773
+ for (left_text, _) in &left {
774
+ for (right_text, _) in &right {
775
+ let similarity = dice_similarity(left_text, right_text);
776
+ if similarity > best {
777
+ best = similarity;
778
+ }
779
+ }
780
+ }
781
+ best
782
+ }
783
+
784
+ fn compute_title_span_metadata(
785
+ key: &str,
786
+ roles: &[String],
787
+ cluster: &Cluster,
788
+ ) -> (Vec<Value>, Vec<Value>) {
789
+ let items: Vec<&str> = key.split_whitespace().collect();
790
+ let title_indices: Vec<usize> = roles
791
+ .iter()
792
+ .enumerate()
793
+ .filter_map(|(index, role)| if role == "TITLE" { Some(index) } else { None })
794
+ .collect();
795
+ let mut spans: Vec<Vec<usize>> = Vec::new();
796
+ let mut decisions: Vec<Value> = Vec::new();
797
+ let mut current_span: Vec<usize> = Vec::new();
798
+ let similarity_keep_threshold = 0.72;
799
+
800
+ for title_index in title_indices {
801
+ if current_span.is_empty() {
802
+ current_span.push(title_index);
803
+ continue;
804
+ }
805
+ let previous = *current_span.last().unwrap();
806
+ let between = &items[(previous + 1)..title_index];
807
+ let roles_between = &roles[(previous + 1)..title_index];
808
+ let has_path_boundary = between.iter().any(|item| *item == "PATH");
809
+ let only_separators_between = !between.is_empty()
810
+ && between.iter().all(|item| *item == "SEP")
811
+ && roles_between.iter().all(|role| role == "O");
812
+ let similarity = position_literal_similarity(cluster, previous, title_index);
813
+
814
+ if has_path_boundary {
815
+ decisions.push(json!({
816
+ "left_role_index": previous,
817
+ "right_role_index": title_index,
818
+ "similarity": similarity,
819
+ "decision": "keep_path_boundary",
820
+ "source": "heuristic",
821
+ }));
822
+ spans.push(current_span);
823
+ current_span = vec![title_index];
824
+ } else if only_separators_between && similarity < similarity_keep_threshold {
825
+ decisions.push(json!({
826
+ "left_role_index": previous,
827
+ "right_role_index": title_index,
828
+ "similarity": similarity,
829
+ "decision": "merge_dissimilar_title_continuation",
830
+ "source": "heuristic",
831
+ }));
832
+ current_span.push(title_index);
833
+ } else {
834
+ let decision = if only_separators_between {
835
+ "keep_similar_duplicate_or_alias"
836
+ } else {
837
+ "keep_structural_boundary"
838
+ };
839
+ decisions.push(json!({
840
+ "left_role_index": previous,
841
+ "right_role_index": title_index,
842
+ "similarity": similarity,
843
+ "decision": decision,
844
+ "source": "heuristic",
845
+ }));
846
+ spans.push(current_span);
847
+ current_span = vec![title_index];
848
+ }
849
+ }
850
+ if !current_span.is_empty() {
851
+ spans.push(current_span);
852
+ }
853
+
854
+ let title_spans = spans
855
+ .into_iter()
856
+ .map(|role_indices| {
857
+ json!({
858
+ "role_indices": role_indices,
859
+ "source": "heuristic",
860
+ })
861
+ })
862
+ .collect();
863
+ (title_spans, decisions)
864
+ }
865
+
866
  fn cluster_row(rank: usize, key: &str, cluster: &Cluster, total: usize) -> Value {
867
+ let suggested_roles = suggested_roles(key);
868
+ let (title_spans, title_boundary_decisions) =
869
+ compute_title_span_metadata(key, &suggested_roles, cluster);
870
  json!({
871
  "template_id": format!("tpl_{rank:06}"),
872
  "template": key,
873
  "count": cluster.count,
874
  "coverage": if total == 0 { 0.0 } else { cluster.count as f64 / total as f64 },
875
  "top_literals": top_counts(&cluster.literal_counts, 12),
876
+ "suggested_roles": suggested_roles,
877
+ "title_spans": title_spans,
878
+ "title_boundary_decisions": title_boundary_decisions,
879
  "position_top_literals": cluster.position_literals.iter().map(|counts| top_counts(counts, 5)).collect::<Vec<_>>(),
880
  "class_counts": top_counts(&cluster.class_counts, 20),
881
  "examples": cluster.examples,
 
923
  "template_id": row["template_id"],
924
  "template": row["template"],
925
  "roles": row["suggested_roles"],
926
+ "title_spans": row.get("title_spans").cloned().unwrap_or_else(|| json!([])),
927
+ "title_boundary_decisions": row.get("title_boundary_decisions").cloned().unwrap_or_else(|| json!([])),
928
  "confidence": confidence,
929
  "count": row["count"],
930
  "examples": row["examples"],
 
4705
  }
4706
  }
4707
 
4708
/// Three separator-joined title positions with unrelated literals must
/// collapse into a single span of three role indices.
#[test]
fn title_span_metadata_merges_dissimilar_continuations() {
    let key = "BRACKET_TEXT SEP TEXT SEP TEXT SEP TEXT SEP EPISODE SEP BRACKET_MEDIA_BLOCK";
    let roles = suggested_roles(key);
    let slot_count = key.split_whitespace().count();
    let mut cluster = Cluster::default();
    cluster.position_literals = vec![HashMap::new(); slot_count];
    for (slot, literal) in [(2, "Tales"), (4, "of Phantasia"), (6, "The Animation")] {
        cluster.position_literals[slot].insert(literal.to_string(), 10);
    }

    let (spans, decisions) = compute_title_span_metadata(key, &roles, &cluster);

    assert_eq!(spans.len(), 1, "{spans:?} {decisions:?}");
    let merged_indices = spans[0]["role_indices"].as_array().unwrap();
    assert_eq!(merged_indices.len(), 3, "{spans:?}");
    for decision in &decisions {
        assert!(decision["decision"] == "merge_dissimilar_title_continuation");
    }
}
4728
+
4729
/// Two separator-joined title positions carrying the same literal must stay
/// in separate spans.
#[test]
fn title_span_metadata_keeps_similar_duplicates_separate() {
    let key = "TEXT SEP TEXT SEP EPISODE";
    let roles = suggested_roles(key);
    let slot_count = key.split_whitespace().count();
    let mut cluster = Cluster::default();
    cluster.position_literals = vec![HashMap::new(); slot_count];
    for slot in [0, 2] {
        cluster.position_literals[slot].insert("Frieren".to_string(), 10);
    }

    let (spans, decisions) = compute_title_span_metadata(key, &roles, &cluster);

    assert_eq!(spans.len(), 2, "{spans:?} {decisions:?}");
    assert_eq!(decisions[0]["decision"], "keep_similar_duplicate_or_alias");
}
4741
+
4742
  #[test]
4743
  fn required_regressions() {
4744
  let title_91 = labels_for("Title 91 EP 01 [1080p]");
tools/schema_v2_synthetic_augment/src/main.rs CHANGED
@@ -5,7 +5,7 @@ use regex::Regex;
5
  use serde::{Deserialize, Serialize};
6
  #[cfg(test)]
7
  use serde_json::Value;
8
- use std::collections::HashSet;
9
  use std::fs::{self, File};
10
  use std::io::{BufRead, BufReader, BufWriter, Write};
11
  use std::path::{Path, PathBuf};
@@ -57,11 +57,18 @@ struct LabelSchema {
57
  labels: Vec<String>,
58
  }
59
 
 
 
 
 
 
60
  #[derive(Debug, Clone, Deserialize)]
61
  struct Recipe {
62
  template_id: String,
63
  template: String,
64
  roles: Vec<String>,
 
 
65
  confidence: Option<String>,
66
  #[serde(rename = "count")]
67
  _count: Option<u64>,
@@ -692,23 +699,84 @@ fn validate_record_labels(record: &Record, label_set: &HashSet<String>) -> Resul
692
  Ok(())
693
  }
694
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
695
  fn build_numeric_record(recipe: &Recipe, title: &str, variant: usize) -> Option<Record> {
696
  let classes: Vec<&str> = recipe.template.split_whitespace().collect();
697
  if classes.len() != recipe.roles.len() {
698
  return None;
699
  }
700
- if recipe
701
- .roles
702
- .iter()
703
- .filter(|role| role.as_str() == "TITLE")
704
- .count()
705
- != 1
706
- {
707
  return None;
708
  }
 
 
 
 
 
 
 
709
  let mut builder = CharBuilder::default();
710
  let mut previous_role = "";
711
- for (class_name, role) in classes.iter().zip(recipe.roles.iter()) {
712
  let special_number = role == "EPISODE" && previous_role == "SPECIAL";
713
  let role_for_label = if special_number {
714
  "SPECIAL"
@@ -720,7 +788,7 @@ fn build_numeric_record(recipe: &Recipe, title: &str, variant: usize) -> Option<
720
  class_name,
721
  role,
722
  role_for_label,
723
- title,
724
  variant,
725
  special_number,
726
  )?;
@@ -1404,6 +1472,7 @@ mod tests {
1404
  "SPECIAL".to_string(),
1405
  "EPISODE".to_string(),
1406
  ],
 
1407
  confidence: Some("high".to_string()),
1408
  _count: Some(1),
1409
  };
@@ -1433,12 +1502,49 @@ mod tests {
1433
  "O".to_string(),
1434
  "EPISODE".to_string(),
1435
  ],
 
1436
  confidence: Some("high".to_string()),
1437
  _count: Some(1),
1438
  };
1439
  assert!(build_numeric_record(&recipe, "91 Days", 0).is_none());
1440
  }
1441
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1442
  #[test]
1443
  fn path_title_season_episode_labels_are_projected() {
1444
  let base = char_record_from_spans(
 
5
  use serde::{Deserialize, Serialize};
6
  #[cfg(test)]
7
  use serde_json::Value;
8
+ use std::collections::{HashMap, HashSet};
9
  use std::fs::{self, File};
10
  use std::io::{BufRead, BufReader, BufWriter, Write};
11
  use std::path::{Path, PathBuf};
 
57
  labels: Vec<String>,
58
  }
59
 
60
+ #[derive(Debug, Clone, Deserialize)]
61
+ struct TitleSpan {
62
+ role_indices: Vec<usize>,
63
+ }
64
+
65
  #[derive(Debug, Clone, Deserialize)]
66
  struct Recipe {
67
  template_id: String,
68
  template: String,
69
  roles: Vec<String>,
70
+ #[serde(default)]
71
+ title_spans: Vec<TitleSpan>,
72
  confidence: Option<String>,
73
  #[serde(rename = "count")]
74
  _count: Option<u64>,
 
699
  Ok(())
700
  }
701
 
702
+ fn logical_title_spans(recipe: &Recipe) -> Option<Vec<Vec<usize>>> {
703
+ if recipe.title_spans.is_empty() {
704
+ return Some(
705
+ recipe
706
+ .roles
707
+ .iter()
708
+ .enumerate()
709
+ .filter_map(|(index, role)| {
710
+ if role == "TITLE" {
711
+ Some(vec![index])
712
+ } else {
713
+ None
714
+ }
715
+ })
716
+ .collect(),
717
+ );
718
+ }
719
+ let mut spans = Vec::new();
720
+ for span in &recipe.title_spans {
721
+ if span.role_indices.is_empty() {
722
+ return None;
723
+ }
724
+ for &index in &span.role_indices {
725
+ if recipe.roles.get(index).map(String::as_str) != Some("TITLE") {
726
+ return None;
727
+ }
728
+ }
729
+ spans.push(span.role_indices.clone());
730
+ }
731
+ Some(spans)
732
+ }
733
+
734
/// Splits `title` into `slots` consecutive word groups of near-equal size.
///
/// * `slots == 0` returns `None`.
/// * `slots == 1` returns the title unchanged, whitespace included.
/// * Otherwise the title is split on whitespace; `None` is returned when
///   there are fewer words than slots. Earlier groups receive the extra
///   words when the count does not divide evenly, and each group is joined
///   with single spaces.
fn split_title_for_slots(title: &str, slots: usize) -> Option<Vec<String>> {
    match slots {
        0 => return None,
        1 => return Some(vec![title.to_string()]),
        _ => {}
    }
    let words: Vec<&str> = title.split_whitespace().collect();
    if words.len() < slots {
        return None;
    }
    // Every group gets `base` words; the first `extra` groups get one more.
    let base = words.len() / slots;
    let extra = words.len() % slots;
    let mut remaining = words.as_slice();
    let chunks = (0..slots)
        .map(|slot| {
            let take = if slot < extra { base + 1 } else { base };
            let (head, tail) = remaining.split_at(take);
            remaining = tail;
            head.join(" ")
        })
        .collect();
    Some(chunks)
}
760
+
761
  fn build_numeric_record(recipe: &Recipe, title: &str, variant: usize) -> Option<Record> {
762
  let classes: Vec<&str> = recipe.template.split_whitespace().collect();
763
  if classes.len() != recipe.roles.len() {
764
  return None;
765
  }
766
+ let title_spans = logical_title_spans(recipe)?;
767
+ if title_spans.len() != 1 {
 
 
 
 
 
768
  return None;
769
  }
770
+ let title_role_indices = &title_spans[0];
771
+ let title_chunks = split_title_for_slots(title, title_role_indices.len())?;
772
+ let title_chunk_by_index: HashMap<usize, &str> = title_role_indices
773
+ .iter()
774
+ .copied()
775
+ .zip(title_chunks.iter().map(String::as_str))
776
+ .collect();
777
  let mut builder = CharBuilder::default();
778
  let mut previous_role = "";
779
+ for (index, (class_name, role)) in classes.iter().zip(recipe.roles.iter()).enumerate() {
780
  let special_number = role == "EPISODE" && previous_role == "SPECIAL";
781
  let role_for_label = if special_number {
782
  "SPECIAL"
 
788
  class_name,
789
  role,
790
  role_for_label,
791
+ title_chunk_by_index.get(&index).copied().unwrap_or(title),
792
  variant,
793
  special_number,
794
  )?;
 
1472
  "SPECIAL".to_string(),
1473
  "EPISODE".to_string(),
1474
  ],
1475
+ title_spans: Vec::new(),
1476
  confidence: Some("high".to_string()),
1477
  _count: Some(1),
1478
  };
 
1502
  "O".to_string(),
1503
  "EPISODE".to_string(),
1504
  ],
1505
+ title_spans: Vec::new(),
1506
  confidence: Some("high".to_string()),
1507
  _count: Some(1),
1508
  };
1509
  assert!(build_numeric_record(&recipe, "91 Days", 0).is_none());
1510
  }
1511
 
1512
/// A recipe whose three `TITLE` roles form one logical span must spread the
/// title words across those positions instead of repeating the full title.
#[test]
fn numeric_generation_splits_merged_logical_title_span() {
    let roles = ["TITLE", "O", "TITLE", "O", "TITLE", "O", "EPISODE"]
        .iter()
        .map(|role| role.to_string())
        .collect();
    let recipe = Recipe {
        template_id: "tpl_merged_title".to_string(),
        template: "TEXT SEP TEXT SEP TEXT SEP EPISODE".to_string(),
        roles,
        title_spans: vec![TitleSpan {
            role_indices: vec![0, 2, 4],
        }],
        confidence: Some("high".to_string()),
        _count: Some(1),
    };
    let full_title = "500 Days Until the Dungeon Closes";
    let expected_chunks = ["500 Days", "Until the", "Dungeon Closes"];

    let record = build_numeric_record(&recipe, full_title, 0).unwrap();

    for chunk in expected_chunks {
        assert!(record.filename.contains(chunk));
    }
    assert_eq!(record.filename.matches(full_title).count(), 1);
    for chunk in expected_chunks {
        assert_all_entity(&record, chunk, "TITLE_LATIN");
    }
}
1547
+
1548
  #[test]
1549
  fn path_title_season_episode_labels_are_projected() {
1550
  let base = char_record_from_spans(