Add Rust template title span metadata

Browse files

Files changed (2) hide show

tools/rust_dmhy_template_apply/src/main.rs +190 -1
tools/schema_v2_synthetic_augment/src/main.rs +116 -10

tools/rust_dmhy_template_apply/src/main.rs CHANGED Viewed

@@ -715,14 +715,167 @@ fn add_cluster(clusters: &mut HashMap<String, Cluster>, filename: &str, example_
     }
 }
 fn cluster_row(rank: usize, key: &str, cluster: &Cluster, total: usize) -> Value {
     json!({
         "template_id": format!("tpl_{rank:06}"),
         "template": key,
         "count": cluster.count,
         "coverage": if total == 0 { 0.0 } else { cluster.count as f64 / total as f64 },
         "top_literals": top_counts(&cluster.literal_counts, 12),
-        "suggested_roles": suggested_roles(key),
         "position_top_literals": cluster.position_literals.iter().map(|counts| top_counts(counts, 5)).collect::<Vec<_>>(),
         "class_counts": top_counts(&cluster.class_counts, 20),
         "examples": cluster.examples,
@@ -770,6 +923,8 @@ fn recipe_row(row: &Value, confidence: &str) -> Value {
         "template_id": row["template_id"],
         "template": row["template"],
         "roles": row["suggested_roles"],
         "confidence": confidence,
         "count": row["count"],
         "examples": row["examples"],
@@ -4550,6 +4705,40 @@ mod tests {
         }
     }
     #[test]
     fn required_regressions() {
         let title_91 = labels_for("Title 91 EP 01 [1080p]");

     }
 }
/// Produces the comparison form of `value`: every character is lowercased and
/// only alphanumeric characters are kept, so case, whitespace and punctuation
/// are ignored by later comparisons.
fn normalized_similarity_text(value: &str) -> String {
    let mut normalized = String::with_capacity(value.len());
    for original in value.chars() {
        // Lowercasing can expand one character into several, hence the inner loop.
        for lowered in original.to_lowercase() {
            if lowered.is_alphanumeric() {
                normalized.push(lowered);
            }
        }
    }
    normalized
}
/// Returns the set of distinct two-character windows of `value`.
///
/// An empty input yields an empty set. A single-character input yields a set
/// containing that one character as a string, so very short strings can still
/// be compared.
fn char_bigrams(value: &str) -> HashSet<String> {
    let symbols: Vec<char> = value.chars().collect();
    match symbols.as_slice() {
        [] => HashSet::new(),
        [_] => std::iter::once(value.to_string()).collect(),
        longer => longer
            .windows(2)
            .map(|pair| pair.iter().collect::<String>())
            .collect(),
    }
}
+fn dice_similarity(left: &str, right: &str) -> f64 {
+    let left = normalized_similarity_text(left);
+    let right = normalized_similarity_text(right);
+    if left.is_empty() || right.is_empty() {
+        return 0.0;
+    }
+    if left == right {
+        return 1.0;
+    }
+    if left.contains(&right) || right.contains(&left) {
+        let shorter = left.chars().count().min(right.chars().count()) as f64;
+        let longer = left.chars().count().max(right.chars().count()) as f64;
+        return shorter / longer;
+    }
+    let left_bigrams = char_bigrams(&left);
+    let right_bigrams = char_bigrams(&right);
+    if left_bigrams.is_empty() || right_bigrams.is_empty() {
+        return 0.0;
+    }
+    let overlap = left_bigrams.intersection(&right_bigrams).count() as f64;
+    2.0 * overlap / (left_bigrams.len() + right_bigrams.len()) as f64
+}
+fn position_literal_similarity(cluster: &Cluster, left_index: usize, right_index: usize) -> f64 {
+    let left = match cluster.position_literals.get(left_index) {
+        Some(value) => top_counts(value, 5),
+        None => return 0.0,
+    };
+    let right = match cluster.position_literals.get(right_index) {
+        Some(value) => top_counts(value, 5),
+        None => return 0.0,
+    };
+    let mut best = 0.0;
+    for (left_text, _) in &left {
+        for (right_text, _) in &right {
+            let similarity = dice_similarity(left_text, right_text);
+            if similarity > best {
+                best = similarity;
+            }
+        }
+    }
+    best
+}
+fn compute_title_span_metadata(
+    key: &str,
+    roles: &[String],
+    cluster: &Cluster,
+) -> (Vec<Value>, Vec<Value>) {
+    let items: Vec<&str> = key.split_whitespace().collect();
+    let title_indices: Vec<usize> = roles
+        .iter()
+        .enumerate()
+        .filter_map(|(index, role)| if role == "TITLE" { Some(index) } else { None })
+        .collect();
+    let mut spans: Vec<Vec<usize>> = Vec::new();
+    let mut decisions: Vec<Value> = Vec::new();
+    let mut current_span: Vec<usize> = Vec::new();
+    let similarity_keep_threshold = 0.72;
+    for title_index in title_indices {
+        if current_span.is_empty() {
+            current_span.push(title_index);
+            continue;
+        }
+        let previous = *current_span.last().unwrap();
+        let between = &items[(previous + 1)..title_index];
+        let roles_between = &roles[(previous + 1)..title_index];
+        let has_path_boundary = between.iter().any(|item| *item == "PATH");
+        let only_separators_between = !between.is_empty()
+            && between.iter().all(|item| *item == "SEP")
+            && roles_between.iter().all(|role| role == "O");
+        let similarity = position_literal_similarity(cluster, previous, title_index);
+        if has_path_boundary {
+            decisions.push(json!({
+                "left_role_index": previous,
+                "right_role_index": title_index,
+                "similarity": similarity,
+                "decision": "keep_path_boundary",
+                "source": "heuristic",
+            }));
+            spans.push(current_span);
+            current_span = vec![title_index];
+        } else if only_separators_between && similarity < similarity_keep_threshold {
+            decisions.push(json!({
+                "left_role_index": previous,
+                "right_role_index": title_index,
+                "similarity": similarity,
+                "decision": "merge_dissimilar_title_continuation",
+                "source": "heuristic",
+            }));
+            current_span.push(title_index);
+        } else {
+            let decision = if only_separators_between {
+                "keep_similar_duplicate_or_alias"
+            } else {
+                "keep_structural_boundary"
+            };
+            decisions.push(json!({
+                "left_role_index": previous,
+                "right_role_index": title_index,
+                "similarity": similarity,
+                "decision": decision,
+                "source": "heuristic",
+            }));
+            spans.push(current_span);
+            current_span = vec![title_index];
+        }
+    }
+    if !current_span.is_empty() {
+        spans.push(current_span);
+    }
+    let title_spans = spans
+        .into_iter()
+        .map(|role_indices| {
+            json!({
+                "role_indices": role_indices,
+                "source": "heuristic",
+            })
+        })
+        .collect();
+    (title_spans, decisions)
+}
 fn cluster_row(rank: usize, key: &str, cluster: &Cluster, total: usize) -> Value {
+    let suggested_roles = suggested_roles(key);
+    let (title_spans, title_boundary_decisions) =
+        compute_title_span_metadata(key, &suggested_roles, cluster);
     json!({
         "template_id": format!("tpl_{rank:06}"),
         "template": key,
         "count": cluster.count,
         "coverage": if total == 0 { 0.0 } else { cluster.count as f64 / total as f64 },
         "top_literals": top_counts(&cluster.literal_counts, 12),
+        "suggested_roles": suggested_roles,
+        "title_spans": title_spans,
+        "title_boundary_decisions": title_boundary_decisions,
         "position_top_literals": cluster.position_literals.iter().map(|counts| top_counts(counts, 5)).collect::<Vec<_>>(),
         "class_counts": top_counts(&cluster.class_counts, 20),
         "examples": cluster.examples,
         "template_id": row["template_id"],
         "template": row["template"],
         "roles": row["suggested_roles"],
+        "title_spans": row.get("title_spans").cloned().unwrap_or_else(|| json!([])),
+        "title_boundary_decisions": row.get("title_boundary_decisions").cloned().unwrap_or_else(|| json!([])),
         "confidence": confidence,
         "count": row["count"],
         "examples": row["examples"],
         }
     }
// Three TITLE slots separated only by SEP tokens, each holding a different
// literal, must end up in a single logical title span.
#[test]
fn title_span_metadata_merges_dissimilar_continuations() {
    let key = "BRACKET_TEXT SEP TEXT SEP TEXT SEP TEXT SEP EPISODE SEP BRACKET_MEDIA_BLOCK";
    let roles = suggested_roles(key);
    let slot_count = key.split_whitespace().count();

    let mut cluster = Cluster::default();
    cluster.position_literals = vec![HashMap::new(); slot_count];
    for (position, literal) in [(2, "Tales"), (4, "of Phantasia"), (6, "The Animation")] {
        cluster.position_literals[position].insert(literal.to_string(), 10);
    }

    let (spans, decisions) = compute_title_span_metadata(key, &roles, &cluster);

    assert_eq!(spans.len(), 1, "{spans:?} {decisions:?}");
    let merged_indices = spans[0]["role_indices"].as_array().unwrap();
    assert_eq!(merged_indices.len(), 3, "{spans:?}");
    let every_decision_merges = decisions
        .iter()
        .all(|decision| decision["decision"] == "merge_dissimilar_title_continuation");
    assert!(every_decision_merges);
}
// Two TITLE slots holding the same literal are treated as a duplicate/alias
// and therefore stay in separate spans.
#[test]
fn title_span_metadata_keeps_similar_duplicates_separate() {
    let key = "TEXT SEP TEXT SEP EPISODE";
    let roles = suggested_roles(key);
    let slot_count = key.split_whitespace().count();

    let mut cluster = Cluster::default();
    cluster.position_literals = vec![HashMap::new(); slot_count];
    for position in [0, 2] {
        cluster.position_literals[position].insert("Frieren".to_string(), 10);
    }

    let (spans, decisions) = compute_title_span_metadata(key, &roles, &cluster);

    assert_eq!(spans.len(), 2, "{spans:?} {decisions:?}");
    assert_eq!(decisions[0]["decision"], "keep_similar_duplicate_or_alias");
}
     #[test]
     fn required_regressions() {
         let title_91 = labels_for("Title 91 EP 01 [1080p]");

tools/schema_v2_synthetic_augment/src/main.rs CHANGED Viewed

@@ -5,7 +5,7 @@ use regex::Regex;
 use serde::{Deserialize, Serialize};
 #[cfg(test)]
 use serde_json::Value;
-use std::collections::HashSet;
 use std::fs::{self, File};
 use std::io::{BufRead, BufReader, BufWriter, Write};
 use std::path::{Path, PathBuf};
@@ -57,11 +57,18 @@ struct LabelSchema {
     labels: Vec<String>,
 }
 #[derive(Debug, Clone, Deserialize)]
 struct Recipe {
     template_id: String,
     template: String,
     roles: Vec<String>,
     confidence: Option<String>,
     #[serde(rename = "count")]
     _count: Option<u64>,
@@ -692,23 +699,84 @@ fn validate_record_labels(record: &Record, label_set: &HashSet<String>) -> Resul
     Ok(())
 }
 fn build_numeric_record(recipe: &Recipe, title: &str, variant: usize) -> Option<Record> {
     let classes: Vec<&str> = recipe.template.split_whitespace().collect();
     if classes.len() != recipe.roles.len() {
         return None;
     }
-    if recipe
-        .roles
-        .iter()
-        .filter(|role| role.as_str() == "TITLE")
-        .count()
-        != 1
-    {
         return None;
     }
     let mut builder = CharBuilder::default();
     let mut previous_role = "";
-    for (class_name, role) in classes.iter().zip(recipe.roles.iter()) {
         let special_number = role == "EPISODE" && previous_role == "SPECIAL";
         let role_for_label = if special_number {
             "SPECIAL"
@@ -720,7 +788,7 @@ fn build_numeric_record(recipe: &Recipe, title: &str, variant: usize) -> Option<
             class_name,
             role,
             role_for_label,
-            title,
             variant,
             special_number,
         )?;
@@ -1404,6 +1472,7 @@ mod tests {
                 "SPECIAL".to_string(),
                 "EPISODE".to_string(),
             ],
             confidence: Some("high".to_string()),
             _count: Some(1),
         };
@@ -1433,12 +1502,49 @@ mod tests {
                 "O".to_string(),
                 "EPISODE".to_string(),
             ],
             confidence: Some("high".to_string()),
             _count: Some(1),
         };
         assert!(build_numeric_record(&recipe, "91 Days", 0).is_none());
     }
     #[test]
     fn path_title_season_episode_labels_are_projected() {
         let base = char_record_from_spans(

 use serde::{Deserialize, Serialize};
 #[cfg(test)]
 use serde_json::Value;
+use std::collections::{HashMap, HashSet};
 use std::fs::{self, File};
 use std::io::{BufRead, BufReader, BufWriter, Write};
 use std::path::{Path, PathBuf};
     labels: Vec<String>,
 }
+#[derive(Debug, Clone, Deserialize)]
+struct TitleSpan {
+    role_indices: Vec<usize>,
+}
 #[derive(Debug, Clone, Deserialize)]
 struct Recipe {
     template_id: String,
     template: String,
     roles: Vec<String>,
+    #[serde(default)]
+    title_spans: Vec<TitleSpan>,
     confidence: Option<String>,
     #[serde(rename = "count")]
     _count: Option<u64>,
     Ok(())
 }
+fn logical_title_spans(recipe: &Recipe) -> Option<Vec<Vec<usize>>> {
+    if recipe.title_spans.is_empty() {
+        return Some(
+            recipe
+                .roles
+                .iter()
+                .enumerate()
+                .filter_map(|(index, role)| {
+                    if role == "TITLE" {
+                        Some(vec![index])
+                    } else {
+                        None
+                    }
+                })
+                .collect(),
+        );
+    }
+    let mut spans = Vec::new();
+    for span in &recipe.title_spans {
+        if span.role_indices.is_empty() {
+            return None;
+        }
+        for &index in &span.role_indices {
+            if recipe.roles.get(index).map(String::as_str) != Some("TITLE") {
+                return None;
+            }
+        }
+        spans.push(span.role_indices.clone());
+    }
+    Some(spans)
+}
/// Splits `title` into `slots` pieces on word boundaries, as evenly as possible.
///
/// * `slots == 0` -> `None`.
/// * `slots == 1` -> the title is returned unchanged (whitespace included).
/// * otherwise the title is split on whitespace and the words are handed out
///   front to back, with earlier slots receiving the extra word when the count
///   does not divide evenly. Words inside a chunk are joined with one space.
///
/// Returns `None` when the title has fewer words than `slots`, because every
/// slot must receive at least one word.
fn split_title_for_slots(title: &str, slots: usize) -> Option<Vec<String>> {
    match slots {
        0 => return None,
        1 => return Some(vec![title.to_string()]),
        _ => {}
    }

    // `split_whitespace` never yields empty items, so no extra filtering is needed.
    let words: Vec<&str> = title.split_whitespace().collect();
    if words.len() < slots {
        return None;
    }

    let mut remaining = words.as_slice();
    let chunks = (0..slots)
        .map(|slot| {
            // Ceiling division gives the leading slots the surplus words.
            let take = remaining.len().div_ceil(slots - slot);
            let (head, tail) = remaining.split_at(take);
            remaining = tail;
            head.join(" ")
        })
        .collect();
    Some(chunks)
}
 fn build_numeric_record(recipe: &Recipe, title: &str, variant: usize) -> Option<Record> {
     let classes: Vec<&str> = recipe.template.split_whitespace().collect();
     if classes.len() != recipe.roles.len() {
         return None;
     }
+    let title_spans = logical_title_spans(recipe)?;
+    if title_spans.len() != 1 {
         return None;
     }
+    let title_role_indices = &title_spans[0];
+    let title_chunks = split_title_for_slots(title, title_role_indices.len())?;
+    let title_chunk_by_index: HashMap<usize, &str> = title_role_indices
+        .iter()
+        .copied()
+        .zip(title_chunks.iter().map(String::as_str))
+        .collect();
     let mut builder = CharBuilder::default();
     let mut previous_role = "";
+    for (index, (class_name, role)) in classes.iter().zip(recipe.roles.iter()).enumerate() {
         let special_number = role == "EPISODE" && previous_role == "SPECIAL";
         let role_for_label = if special_number {
             "SPECIAL"
             class_name,
             role,
             role_for_label,
+            title_chunk_by_index.get(&index).copied().unwrap_or(title),
             variant,
             special_number,
         )?;
                 "SPECIAL".to_string(),
                 "EPISODE".to_string(),
             ],
+            title_spans: Vec::new(),
             confidence: Some("high".to_string()),
             _count: Some(1),
         };
                 "O".to_string(),
                 "EPISODE".to_string(),
             ],
+            title_spans: Vec::new(),
             confidence: Some("high".to_string()),
             _count: Some(1),
         };
         assert!(build_numeric_record(&recipe, "91 Days", 0).is_none());
     }
// A recipe whose three TITLE slots form one logical span must spread the
// title's words over those slots and label every chunk as a Latin title.
#[test]
fn numeric_generation_splits_merged_logical_title_span() {
    let roles = ["TITLE", "O", "TITLE", "O", "TITLE", "O", "EPISODE"];
    let recipe = Recipe {
        template_id: "tpl_merged_title".to_string(),
        template: "TEXT SEP TEXT SEP TEXT SEP EPISODE".to_string(),
        roles: roles.iter().map(|role| role.to_string()).collect(),
        title_spans: vec![TitleSpan {
            role_indices: vec![0, 2, 4],
        }],
        confidence: Some("high".to_string()),
        _count: Some(1),
    };
    let full_title = "500 Days Until the Dungeon Closes";
    let expected_chunks = ["500 Days", "Until the", "Dungeon Closes"];

    let record = build_numeric_record(&recipe, full_title, 0).unwrap();

    for chunk in expected_chunks {
        assert!(record.filename.contains(chunk));
    }
    assert_eq!(record.filename.matches(full_title).count(), 1);
    for chunk in expected_chunks {
        assert_all_entity(&record, chunk, "TITLE_LATIN");
    }
}
     #[test]
     fn path_title_season_episode_labels_are_projected() {
         let base = char_record_from_spans(