Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
```
- Notebooks
- Google Colab
- Kaggle
Add Rust template title span metadata
Browse files
tools/rust_dmhy_template_apply/src/main.rs
CHANGED
|
@@ -715,14 +715,167 @@ fn add_cluster(clusters: &mut HashMap<String, Cluster>, filename: &str, example_
|
|
| 715 |
}
|
| 716 |
}
|
| 717 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 718 |
fn cluster_row(rank: usize, key: &str, cluster: &Cluster, total: usize) -> Value {
|
|
|
|
|
|
|
|
|
|
| 719 |
json!({
|
| 720 |
"template_id": format!("tpl_{rank:06}"),
|
| 721 |
"template": key,
|
| 722 |
"count": cluster.count,
|
| 723 |
"coverage": if total == 0 { 0.0 } else { cluster.count as f64 / total as f64 },
|
| 724 |
"top_literals": top_counts(&cluster.literal_counts, 12),
|
| 725 |
-
"suggested_roles": suggested_roles
|
|
|
|
|
|
|
| 726 |
"position_top_literals": cluster.position_literals.iter().map(|counts| top_counts(counts, 5)).collect::<Vec<_>>(),
|
| 727 |
"class_counts": top_counts(&cluster.class_counts, 20),
|
| 728 |
"examples": cluster.examples,
|
|
@@ -770,6 +923,8 @@ fn recipe_row(row: &Value, confidence: &str) -> Value {
|
|
| 770 |
"template_id": row["template_id"],
|
| 771 |
"template": row["template"],
|
| 772 |
"roles": row["suggested_roles"],
|
|
|
|
|
|
|
| 773 |
"confidence": confidence,
|
| 774 |
"count": row["count"],
|
| 775 |
"examples": row["examples"],
|
|
@@ -4550,6 +4705,40 @@ mod tests {
|
|
| 4550 |
}
|
| 4551 |
}
|
| 4552 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4553 |
#[test]
|
| 4554 |
fn required_regressions() {
|
| 4555 |
let title_91 = labels_for("Title 91 EP 01 [1080p]");
|
|
|
|
| 715 |
}
|
| 716 |
}
|
| 717 |
|
| 718 |
+
/// Normalizes `value` for fuzzy comparison.
///
/// Each character is lowercased first (one character may expand to several),
/// and every resulting character that is not alphanumeric — whitespace,
/// punctuation, brackets, underscores — is dropped.
fn normalized_similarity_text(value: &str) -> String {
    let mut normalized = String::with_capacity(value.len());
    for ch in value.chars() {
        // Filter after lowercasing so the filter sees the expanded characters.
        normalized.extend(ch.to_lowercase().filter(|lower| lower.is_alphanumeric()));
    }
    normalized
}
|
| 725 |
+
|
| 726 |
+
/// Returns the set of distinct two-character windows of `value`.
///
/// Windows are taken over `char`s, not bytes, so multi-byte text is handled
/// per character. An empty string yields an empty set; a single-character
/// string yields a set containing just that string, so it can still be
/// compared against other inputs.
fn char_bigrams(value: &str) -> HashSet<String> {
    let chars: Vec<char> = value.chars().collect();
    match chars.len() {
        0 => HashSet::new(),
        1 => HashSet::from([value.to_string()]),
        _ => chars
            .windows(2)
            .map(|pair| pair.iter().collect::<String>())
            .collect(),
    }
}
|
| 739 |
+
|
| 740 |
+
fn dice_similarity(left: &str, right: &str) -> f64 {
|
| 741 |
+
let left = normalized_similarity_text(left);
|
| 742 |
+
let right = normalized_similarity_text(right);
|
| 743 |
+
if left.is_empty() || right.is_empty() {
|
| 744 |
+
return 0.0;
|
| 745 |
+
}
|
| 746 |
+
if left == right {
|
| 747 |
+
return 1.0;
|
| 748 |
+
}
|
| 749 |
+
if left.contains(&right) || right.contains(&left) {
|
| 750 |
+
let shorter = left.chars().count().min(right.chars().count()) as f64;
|
| 751 |
+
let longer = left.chars().count().max(right.chars().count()) as f64;
|
| 752 |
+
return shorter / longer;
|
| 753 |
+
}
|
| 754 |
+
let left_bigrams = char_bigrams(&left);
|
| 755 |
+
let right_bigrams = char_bigrams(&right);
|
| 756 |
+
if left_bigrams.is_empty() || right_bigrams.is_empty() {
|
| 757 |
+
return 0.0;
|
| 758 |
+
}
|
| 759 |
+
let overlap = left_bigrams.intersection(&right_bigrams).count() as f64;
|
| 760 |
+
2.0 * overlap / (left_bigrams.len() + right_bigrams.len()) as f64
|
| 761 |
+
}
|
| 762 |
+
|
| 763 |
+
fn position_literal_similarity(cluster: &Cluster, left_index: usize, right_index: usize) -> f64 {
|
| 764 |
+
let left = match cluster.position_literals.get(left_index) {
|
| 765 |
+
Some(value) => top_counts(value, 5),
|
| 766 |
+
None => return 0.0,
|
| 767 |
+
};
|
| 768 |
+
let right = match cluster.position_literals.get(right_index) {
|
| 769 |
+
Some(value) => top_counts(value, 5),
|
| 770 |
+
None => return 0.0,
|
| 771 |
+
};
|
| 772 |
+
let mut best = 0.0;
|
| 773 |
+
for (left_text, _) in &left {
|
| 774 |
+
for (right_text, _) in &right {
|
| 775 |
+
let similarity = dice_similarity(left_text, right_text);
|
| 776 |
+
if similarity > best {
|
| 777 |
+
best = similarity;
|
| 778 |
+
}
|
| 779 |
+
}
|
| 780 |
+
}
|
| 781 |
+
best
|
| 782 |
+
}
|
| 783 |
+
|
| 784 |
+
fn compute_title_span_metadata(
|
| 785 |
+
key: &str,
|
| 786 |
+
roles: &[String],
|
| 787 |
+
cluster: &Cluster,
|
| 788 |
+
) -> (Vec<Value>, Vec<Value>) {
|
| 789 |
+
let items: Vec<&str> = key.split_whitespace().collect();
|
| 790 |
+
let title_indices: Vec<usize> = roles
|
| 791 |
+
.iter()
|
| 792 |
+
.enumerate()
|
| 793 |
+
.filter_map(|(index, role)| if role == "TITLE" { Some(index) } else { None })
|
| 794 |
+
.collect();
|
| 795 |
+
let mut spans: Vec<Vec<usize>> = Vec::new();
|
| 796 |
+
let mut decisions: Vec<Value> = Vec::new();
|
| 797 |
+
let mut current_span: Vec<usize> = Vec::new();
|
| 798 |
+
let similarity_keep_threshold = 0.72;
|
| 799 |
+
|
| 800 |
+
for title_index in title_indices {
|
| 801 |
+
if current_span.is_empty() {
|
| 802 |
+
current_span.push(title_index);
|
| 803 |
+
continue;
|
| 804 |
+
}
|
| 805 |
+
let previous = *current_span.last().unwrap();
|
| 806 |
+
let between = &items[(previous + 1)..title_index];
|
| 807 |
+
let roles_between = &roles[(previous + 1)..title_index];
|
| 808 |
+
let has_path_boundary = between.iter().any(|item| *item == "PATH");
|
| 809 |
+
let only_separators_between = !between.is_empty()
|
| 810 |
+
&& between.iter().all(|item| *item == "SEP")
|
| 811 |
+
&& roles_between.iter().all(|role| role == "O");
|
| 812 |
+
let similarity = position_literal_similarity(cluster, previous, title_index);
|
| 813 |
+
|
| 814 |
+
if has_path_boundary {
|
| 815 |
+
decisions.push(json!({
|
| 816 |
+
"left_role_index": previous,
|
| 817 |
+
"right_role_index": title_index,
|
| 818 |
+
"similarity": similarity,
|
| 819 |
+
"decision": "keep_path_boundary",
|
| 820 |
+
"source": "heuristic",
|
| 821 |
+
}));
|
| 822 |
+
spans.push(current_span);
|
| 823 |
+
current_span = vec![title_index];
|
| 824 |
+
} else if only_separators_between && similarity < similarity_keep_threshold {
|
| 825 |
+
decisions.push(json!({
|
| 826 |
+
"left_role_index": previous,
|
| 827 |
+
"right_role_index": title_index,
|
| 828 |
+
"similarity": similarity,
|
| 829 |
+
"decision": "merge_dissimilar_title_continuation",
|
| 830 |
+
"source": "heuristic",
|
| 831 |
+
}));
|
| 832 |
+
current_span.push(title_index);
|
| 833 |
+
} else {
|
| 834 |
+
let decision = if only_separators_between {
|
| 835 |
+
"keep_similar_duplicate_or_alias"
|
| 836 |
+
} else {
|
| 837 |
+
"keep_structural_boundary"
|
| 838 |
+
};
|
| 839 |
+
decisions.push(json!({
|
| 840 |
+
"left_role_index": previous,
|
| 841 |
+
"right_role_index": title_index,
|
| 842 |
+
"similarity": similarity,
|
| 843 |
+
"decision": decision,
|
| 844 |
+
"source": "heuristic",
|
| 845 |
+
}));
|
| 846 |
+
spans.push(current_span);
|
| 847 |
+
current_span = vec![title_index];
|
| 848 |
+
}
|
| 849 |
+
}
|
| 850 |
+
if !current_span.is_empty() {
|
| 851 |
+
spans.push(current_span);
|
| 852 |
+
}
|
| 853 |
+
|
| 854 |
+
let title_spans = spans
|
| 855 |
+
.into_iter()
|
| 856 |
+
.map(|role_indices| {
|
| 857 |
+
json!({
|
| 858 |
+
"role_indices": role_indices,
|
| 859 |
+
"source": "heuristic",
|
| 860 |
+
})
|
| 861 |
+
})
|
| 862 |
+
.collect();
|
| 863 |
+
(title_spans, decisions)
|
| 864 |
+
}
|
| 865 |
+
|
| 866 |
fn cluster_row(rank: usize, key: &str, cluster: &Cluster, total: usize) -> Value {
|
| 867 |
+
let suggested_roles = suggested_roles(key);
|
| 868 |
+
let (title_spans, title_boundary_decisions) =
|
| 869 |
+
compute_title_span_metadata(key, &suggested_roles, cluster);
|
| 870 |
json!({
|
| 871 |
"template_id": format!("tpl_{rank:06}"),
|
| 872 |
"template": key,
|
| 873 |
"count": cluster.count,
|
| 874 |
"coverage": if total == 0 { 0.0 } else { cluster.count as f64 / total as f64 },
|
| 875 |
"top_literals": top_counts(&cluster.literal_counts, 12),
|
| 876 |
+
"suggested_roles": suggested_roles,
|
| 877 |
+
"title_spans": title_spans,
|
| 878 |
+
"title_boundary_decisions": title_boundary_decisions,
|
| 879 |
"position_top_literals": cluster.position_literals.iter().map(|counts| top_counts(counts, 5)).collect::<Vec<_>>(),
|
| 880 |
"class_counts": top_counts(&cluster.class_counts, 20),
|
| 881 |
"examples": cluster.examples,
|
|
|
|
| 923 |
"template_id": row["template_id"],
|
| 924 |
"template": row["template"],
|
| 925 |
"roles": row["suggested_roles"],
|
| 926 |
+
"title_spans": row.get("title_spans").cloned().unwrap_or_else(|| json!([])),
|
| 927 |
+
"title_boundary_decisions": row.get("title_boundary_decisions").cloned().unwrap_or_else(|| json!([])),
|
| 928 |
"confidence": confidence,
|
| 929 |
"count": row["count"],
|
| 930 |
"examples": row["examples"],
|
|
|
|
| 4705 |
}
|
| 4706 |
}
|
| 4707 |
|
| 4708 |
+
#[test]
fn title_span_metadata_merges_dissimilar_continuations() {
    // Three TEXT slots separated only by SEP, each holding a different part of
    // one title, must collapse into a single logical title span.
    let key = "BRACKET_TEXT SEP TEXT SEP TEXT SEP TEXT SEP EPISODE SEP BRACKET_MEDIA_BLOCK";
    let roles = suggested_roles(key);
    let mut cluster = Cluster::default();
    cluster.position_literals = vec![HashMap::new(); key.split_whitespace().count()];
    for (position, literal) in [(2, "Tales"), (4, "of Phantasia"), (6, "The Animation")] {
        cluster.position_literals[position].insert(literal.to_string(), 10);
    }

    let (spans, decisions) = compute_title_span_metadata(key, &roles, &cluster);

    assert_eq!(spans.len(), 1, "{spans:?} {decisions:?}");
    assert_eq!(
        spans[0]["role_indices"].as_array().unwrap().len(),
        3,
        "{spans:?}"
    );
    // Three merged titles mean exactly two pairwise decisions; pinning the count
    // keeps the `all` below from passing vacuously on an empty list.
    assert_eq!(decisions.len(), 2, "{decisions:?}");
    assert!(decisions
        .iter()
        .all(|decision| decision["decision"] == "merge_dissimilar_title_continuation"));
}
|
| 4728 |
+
|
| 4729 |
+
#[test]
fn title_span_metadata_keeps_similar_duplicates_separate() {
    // The same literal in two TEXT slots is a repeated title, not a
    // continuation, so the slots must stay in separate spans.
    let key = "TEXT SEP TEXT SEP EPISODE";
    let roles = suggested_roles(key);
    let mut cluster = Cluster::default();
    cluster.position_literals = vec![HashMap::new(); key.split_whitespace().count()];
    for position in [0, 2] {
        cluster.position_literals[position].insert("Frieren".to_string(), 10);
    }

    let (spans, decisions) = compute_title_span_metadata(key, &roles, &cluster);

    assert_eq!(spans.len(), 2, "{spans:?} {decisions:?}");
    // Check the length first so a missing decision fails with context instead
    // of an index-out-of-bounds panic.
    assert_eq!(decisions.len(), 1, "{decisions:?}");
    assert_eq!(decisions[0]["decision"], "keep_similar_duplicate_or_alias");
}
|
| 4741 |
+
|
| 4742 |
#[test]
|
| 4743 |
fn required_regressions() {
|
| 4744 |
let title_91 = labels_for("Title 91 EP 01 [1080p]");
|
tools/schema_v2_synthetic_augment/src/main.rs
CHANGED
|
@@ -5,7 +5,7 @@ use regex::Regex;
|
|
| 5 |
use serde::{Deserialize, Serialize};
|
| 6 |
#[cfg(test)]
|
| 7 |
use serde_json::Value;
|
| 8 |
-
use std::collections::HashSet;
|
| 9 |
use std::fs::{self, File};
|
| 10 |
use std::io::{BufRead, BufReader, BufWriter, Write};
|
| 11 |
use std::path::{Path, PathBuf};
|
|
@@ -57,11 +57,18 @@ struct LabelSchema {
|
|
| 57 |
labels: Vec<String>,
|
| 58 |
}
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
#[derive(Debug, Clone, Deserialize)]
|
| 61 |
struct Recipe {
|
| 62 |
template_id: String,
|
| 63 |
template: String,
|
| 64 |
roles: Vec<String>,
|
|
|
|
|
|
|
| 65 |
confidence: Option<String>,
|
| 66 |
#[serde(rename = "count")]
|
| 67 |
_count: Option<u64>,
|
|
@@ -692,23 +699,84 @@ fn validate_record_labels(record: &Record, label_set: &HashSet<String>) -> Resul
|
|
| 692 |
Ok(())
|
| 693 |
}
|
| 694 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 695 |
fn build_numeric_record(recipe: &Recipe, title: &str, variant: usize) -> Option<Record> {
|
| 696 |
let classes: Vec<&str> = recipe.template.split_whitespace().collect();
|
| 697 |
if classes.len() != recipe.roles.len() {
|
| 698 |
return None;
|
| 699 |
}
|
| 700 |
-
|
| 701 |
-
|
| 702 |
-
.iter()
|
| 703 |
-
.filter(|role| role.as_str() == "TITLE")
|
| 704 |
-
.count()
|
| 705 |
-
!= 1
|
| 706 |
-
{
|
| 707 |
return None;
|
| 708 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 709 |
let mut builder = CharBuilder::default();
|
| 710 |
let mut previous_role = "";
|
| 711 |
-
for (class_name, role) in classes.iter().zip(recipe.roles.iter()) {
|
| 712 |
let special_number = role == "EPISODE" && previous_role == "SPECIAL";
|
| 713 |
let role_for_label = if special_number {
|
| 714 |
"SPECIAL"
|
|
@@ -720,7 +788,7 @@ fn build_numeric_record(recipe: &Recipe, title: &str, variant: usize) -> Option<
|
|
| 720 |
class_name,
|
| 721 |
role,
|
| 722 |
role_for_label,
|
| 723 |
-
title,
|
| 724 |
variant,
|
| 725 |
special_number,
|
| 726 |
)?;
|
|
@@ -1404,6 +1472,7 @@ mod tests {
|
|
| 1404 |
"SPECIAL".to_string(),
|
| 1405 |
"EPISODE".to_string(),
|
| 1406 |
],
|
|
|
|
| 1407 |
confidence: Some("high".to_string()),
|
| 1408 |
_count: Some(1),
|
| 1409 |
};
|
|
@@ -1433,12 +1502,49 @@ mod tests {
|
|
| 1433 |
"O".to_string(),
|
| 1434 |
"EPISODE".to_string(),
|
| 1435 |
],
|
|
|
|
| 1436 |
confidence: Some("high".to_string()),
|
| 1437 |
_count: Some(1),
|
| 1438 |
};
|
| 1439 |
assert!(build_numeric_record(&recipe, "91 Days", 0).is_none());
|
| 1440 |
}
|
| 1441 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1442 |
#[test]
|
| 1443 |
fn path_title_season_episode_labels_are_projected() {
|
| 1444 |
let base = char_record_from_spans(
|
|
|
|
| 5 |
use serde::{Deserialize, Serialize};
|
| 6 |
#[cfg(test)]
|
| 7 |
use serde_json::Value;
|
| 8 |
+
use std::collections::{HashMap, HashSet};
|
| 9 |
use std::fs::{self, File};
|
| 10 |
use std::io::{BufRead, BufReader, BufWriter, Write};
|
| 11 |
use std::path::{Path, PathBuf};
|
|
|
|
| 57 |
labels: Vec<String>,
|
| 58 |
}
|
| 59 |
|
| 60 |
+
#[derive(Debug, Clone, Deserialize)]
|
| 61 |
+
struct TitleSpan {
|
| 62 |
+
role_indices: Vec<usize>,
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
#[derive(Debug, Clone, Deserialize)]
|
| 66 |
struct Recipe {
|
| 67 |
template_id: String,
|
| 68 |
template: String,
|
| 69 |
roles: Vec<String>,
|
| 70 |
+
#[serde(default)]
|
| 71 |
+
title_spans: Vec<TitleSpan>,
|
| 72 |
confidence: Option<String>,
|
| 73 |
#[serde(rename = "count")]
|
| 74 |
_count: Option<u64>,
|
|
|
|
| 699 |
Ok(())
|
| 700 |
}
|
| 701 |
|
| 702 |
+
fn logical_title_spans(recipe: &Recipe) -> Option<Vec<Vec<usize>>> {
|
| 703 |
+
if recipe.title_spans.is_empty() {
|
| 704 |
+
return Some(
|
| 705 |
+
recipe
|
| 706 |
+
.roles
|
| 707 |
+
.iter()
|
| 708 |
+
.enumerate()
|
| 709 |
+
.filter_map(|(index, role)| {
|
| 710 |
+
if role == "TITLE" {
|
| 711 |
+
Some(vec![index])
|
| 712 |
+
} else {
|
| 713 |
+
None
|
| 714 |
+
}
|
| 715 |
+
})
|
| 716 |
+
.collect(),
|
| 717 |
+
);
|
| 718 |
+
}
|
| 719 |
+
let mut spans = Vec::new();
|
| 720 |
+
for span in &recipe.title_spans {
|
| 721 |
+
if span.role_indices.is_empty() {
|
| 722 |
+
return None;
|
| 723 |
+
}
|
| 724 |
+
for &index in &span.role_indices {
|
| 725 |
+
if recipe.roles.get(index).map(String::as_str) != Some("TITLE") {
|
| 726 |
+
return None;
|
| 727 |
+
}
|
| 728 |
+
}
|
| 729 |
+
spans.push(span.role_indices.clone());
|
| 730 |
+
}
|
| 731 |
+
Some(spans)
|
| 732 |
+
}
|
| 733 |
+
|
| 734 |
+
/// Splits `title` into `slots` consecutive chunks of whole words.
///
/// * `slots == 0` yields `None`.
/// * `slots == 1` yields the title unchanged, whitespace included.
/// * Otherwise the title is split on whitespace and the words are dealt out
///   in order, each slot taking the ceiling of "words left / slots left", so
///   earlier chunks are never shorter than later ones. Words inside a chunk
///   are joined with single spaces. Yields `None` when there are fewer words
///   than slots.
fn split_title_for_slots(title: &str, slots: usize) -> Option<Vec<String>> {
    match slots {
        0 => return None,
        1 => return Some(vec![title.to_string()]),
        _ => {}
    }

    // `split_whitespace` never yields empty items, so no extra filtering is needed.
    let words: Vec<&str> = title.split_whitespace().collect();
    if words.len() < slots {
        return None;
    }

    let mut chunks = Vec::with_capacity(slots);
    let mut rest = words.as_slice();
    for remaining_slots in (1..=slots).rev() {
        let take = rest.len().div_ceil(remaining_slots);
        let (head, tail) = rest.split_at(take);
        chunks.push(head.join(" "));
        rest = tail;
    }
    Some(chunks)
}
|
| 760 |
+
|
| 761 |
fn build_numeric_record(recipe: &Recipe, title: &str, variant: usize) -> Option<Record> {
|
| 762 |
let classes: Vec<&str> = recipe.template.split_whitespace().collect();
|
| 763 |
if classes.len() != recipe.roles.len() {
|
| 764 |
return None;
|
| 765 |
}
|
| 766 |
+
let title_spans = logical_title_spans(recipe)?;
|
| 767 |
+
if title_spans.len() != 1 {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 768 |
return None;
|
| 769 |
}
|
| 770 |
+
let title_role_indices = &title_spans[0];
|
| 771 |
+
let title_chunks = split_title_for_slots(title, title_role_indices.len())?;
|
| 772 |
+
let title_chunk_by_index: HashMap<usize, &str> = title_role_indices
|
| 773 |
+
.iter()
|
| 774 |
+
.copied()
|
| 775 |
+
.zip(title_chunks.iter().map(String::as_str))
|
| 776 |
+
.collect();
|
| 777 |
let mut builder = CharBuilder::default();
|
| 778 |
let mut previous_role = "";
|
| 779 |
+
for (index, (class_name, role)) in classes.iter().zip(recipe.roles.iter()).enumerate() {
|
| 780 |
let special_number = role == "EPISODE" && previous_role == "SPECIAL";
|
| 781 |
let role_for_label = if special_number {
|
| 782 |
"SPECIAL"
|
|
|
|
| 788 |
class_name,
|
| 789 |
role,
|
| 790 |
role_for_label,
|
| 791 |
+
title_chunk_by_index.get(&index).copied().unwrap_or(title),
|
| 792 |
variant,
|
| 793 |
special_number,
|
| 794 |
)?;
|
|
|
|
| 1472 |
"SPECIAL".to_string(),
|
| 1473 |
"EPISODE".to_string(),
|
| 1474 |
],
|
| 1475 |
+
title_spans: Vec::new(),
|
| 1476 |
confidence: Some("high".to_string()),
|
| 1477 |
_count: Some(1),
|
| 1478 |
};
|
|
|
|
| 1502 |
"O".to_string(),
|
| 1503 |
"EPISODE".to_string(),
|
| 1504 |
],
|
| 1505 |
+
title_spans: Vec::new(),
|
| 1506 |
confidence: Some("high".to_string()),
|
| 1507 |
_count: Some(1),
|
| 1508 |
};
|
| 1509 |
assert!(build_numeric_record(&recipe, "91 Days", 0).is_none());
|
| 1510 |
}
|
| 1511 |
|
| 1512 |
+
#[test]
fn numeric_generation_splits_merged_logical_title_span() {
    // A single logical title spread over three TITLE slots: the generator is
    // expected to hand each slot its own chunk of the title.
    let roles = ["TITLE", "O", "TITLE", "O", "TITLE", "O", "EPISODE"];
    let recipe = Recipe {
        template_id: "tpl_merged_title".to_string(),
        template: "TEXT SEP TEXT SEP TEXT SEP EPISODE".to_string(),
        roles: roles.iter().map(|role| role.to_string()).collect(),
        title_spans: vec![TitleSpan {
            role_indices: vec![0, 2, 4],
        }],
        confidence: Some("high".to_string()),
        _count: Some(1),
    };
    let title = "500 Days Until the Dungeon Closes";
    let expected_chunks = ["500 Days", "Until the", "Dungeon Closes"];

    let record = build_numeric_record(&recipe, title, 0).unwrap();

    for chunk in expected_chunks {
        assert!(record.filename.contains(chunk));
    }
    assert_eq!(record.filename.matches(title).count(), 1);
    for chunk in expected_chunks {
        assert_all_entity(&record, chunk, "TITLE_LATIN");
    }
}
|
| 1547 |
+
|
| 1548 |
#[test]
|
| 1549 |
fn path_title_season_episode_labels_are_projected() {
|
| 1550 |
let base = char_record_from_spans(
|