Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")

# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
- Notebooks
- Google Colab
- Kaggle
Add Rust DMHY template clustering
Browse files
tools/rust_dmhy_template_apply/README.md
CHANGED
|
@@ -2,7 +2,26 @@
|
|
| 2 |
|
| 3 |
Multi-core Rust implementation of the DMHY template recipe apply stage.
|
| 4 |
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
```powershell
|
| 8 |
cargo run --release --manifest-path tools\rust_dmhy_template_apply\Cargo.toml -- `
|
|
|
|
| 2 |
|
| 3 |
Multi-core Rust implementation of the DMHY template recipe apply stage.
|
| 4 |
|
| 5 |
+
Build template recipes from the repository root:
|
| 6 |
+
|
| 7 |
+
```powershell
|
| 8 |
+
cargo run --release --manifest-path tools\rust_dmhy_template_apply\Cargo.toml -- `
|
| 9 |
+
--cluster `
|
| 10 |
+
--input datasets\AnimeName\dmhy_list.jsonl `
|
| 11 |
+
--summary-output reports\dmhy_template_clusters.full_top5000.summary.json `
|
| 12 |
+
--samples-output reports\dmhy_template_clusters.full_top5000.samples.jsonl `
|
| 13 |
+
--clusters-output reports\dmhy_template_clusters.full_top5000.jsonl `
|
| 14 |
+
--recipes-output reports\dmhy_template_recipes.full_top5000.seed.jsonl `
|
| 15 |
+
--review-output reports\dmhy_template_review.full_top5000.jsonl `
|
| 16 |
+
--top 5000 `
|
| 17 |
+
--recipe-top 5000 `
|
| 18 |
+
--review-top 5000 `
|
| 19 |
+
--min-count 2 `
|
| 20 |
+
--recipe-min-count 10 `
|
| 21 |
+
--threads 24
|
| 22 |
+
```
|
| 23 |
+
|
| 24 |
+
Apply template recipes from the repository root:
|
| 25 |
|
| 26 |
```powershell
|
| 27 |
cargo run --release --manifest-path tools\rust_dmhy_template_apply\Cargo.toml -- `
|
tools/rust_dmhy_template_apply/src/main.rs
CHANGED
|
@@ -15,6 +15,8 @@ use std::sync::atomic::{AtomicUsize, Ordering};
|
|
| 15 |
#[derive(Parser, Debug)]
|
| 16 |
#[command(about = "Apply DMHY template recipes with a multi-core Rust pipeline")]
|
| 17 |
struct Args {
|
|
|
|
|
|
|
| 18 |
#[arg(long, default_value = "datasets/AnimeName/dmhy_list.jsonl")]
|
| 19 |
input: PathBuf,
|
| 20 |
#[arg(long, default_value = "reports/dmhy_template_recipes.seed.jsonl")]
|
|
@@ -29,12 +31,38 @@ struct Args {
|
|
| 29 |
default_value = "reports/dmhy_weak.template_generated.rust.manifest.json"
|
| 30 |
)]
|
| 31 |
manifest_output: PathBuf,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
#[arg(long)]
|
| 33 |
limit: Option<usize>,
|
| 34 |
#[arg(long)]
|
| 35 |
limit_templates: Option<usize>,
|
| 36 |
#[arg(long, default_value_t = 1)]
|
| 37 |
min_count: u64,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
#[arg(long, default_value = "high")]
|
| 39 |
confidence: String,
|
| 40 |
#[arg(long, default_value = "all")]
|
|
@@ -44,6 +72,8 @@ struct Args {
|
|
| 44 |
#[arg(long)]
|
| 45 |
keep_encoding_noise: bool,
|
| 46 |
#[arg(long)]
|
|
|
|
|
|
|
| 47 |
threads: Option<usize>,
|
| 48 |
}
|
| 49 |
|
|
@@ -88,6 +118,15 @@ struct Stats {
|
|
| 88 |
written: usize,
|
| 89 |
}
|
| 90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
#[derive(Debug)]
|
| 92 |
enum Processed {
|
| 93 |
Written {
|
|
@@ -123,7 +162,7 @@ static CJK_SEASON_TOKEN_RE: Lazy<Regex> =
|
|
| 123 |
Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
|
| 124 |
static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
|
| 125 |
static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
|
| 126 |
-
Regex::new(r"(?i)^(?:NCOP|NCED|OP|ED|PV|CM|SP|OVA|OAD|IV|Menu|Preview|Trailer|Teaser)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?$").unwrap()
|
| 127 |
});
|
| 128 |
static VOLUME_RE: Lazy<Regex> =
|
| 129 |
Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
|
|
@@ -141,9 +180,21 @@ static SPECIAL_TITLE_PHRASE_RE: Lazy<Regex> = Lazy::new(|| {
|
|
| 141 |
});
|
| 142 |
static YEAR_RANGE_RE: Lazy<Regex> =
|
| 143 |
Lazy::new(|| Regex::new(r"^\(?\s*(?:19|20)\d{2}\s*[-~]\s*(?:19|20)\d{2}\s*\)?$").unwrap());
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
static TOKEN_REGEXES: Lazy<Vec<Regex>> = Lazy::new(|| {
|
| 145 |
[
|
| 146 |
r"^\d{3,4}[xX×]\d{3,4}",
|
|
|
|
|
|
|
| 147 |
r"^[\\/]+",
|
| 148 |
r"^[-_.::+&|]+",
|
| 149 |
r"^\s+",
|
|
@@ -169,6 +220,9 @@ fn main() -> Result<()> {
|
|
| 169 |
.build_global()
|
| 170 |
.context("failed to configure rayon thread pool")?;
|
| 171 |
}
|
|
|
|
|
|
|
|
|
|
| 172 |
if args.expand != "all" && args.expand != "sample" {
|
| 173 |
bail!("--expand must be all or sample");
|
| 174 |
}
|
|
@@ -331,6 +385,222 @@ fn load_input(path: &PathBuf, limit: Option<usize>) -> Result<Vec<String>> {
|
|
| 331 |
Ok(values)
|
| 332 |
}
|
| 333 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 334 |
fn process_filename(
|
| 335 |
original: &str,
|
| 336 |
args: &Args,
|
|
@@ -472,7 +742,7 @@ fn split_inner(inner: &str) -> Vec<String> {
|
|
| 472 |
let mut parts = Vec::new();
|
| 473 |
let mut current = String::new();
|
| 474 |
for ch in inner.chars() {
|
| 475 |
-
if ch.is_whitespace() || "_.,+/&|-".contains(ch) {
|
| 476 |
if !current.is_empty() {
|
| 477 |
parts.push(std::mem::take(&mut current));
|
| 478 |
}
|
|
@@ -586,10 +856,22 @@ fn classify_token(token: &str) -> String {
|
|
| 586 |
}
|
| 587 |
if token.starts_with('[') || token.starts_with('(') || token.starts_with('【') {
|
| 588 |
let inner = strip_wrapper(token);
|
| 589 |
-
let whole_class = classify_atom(&inner);
|
| 590 |
let parts = split_inner(&inner);
|
|
|
|
| 591 |
let inner_class = if whole_class != "TEXT" {
|
| 592 |
-
whole_class
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 593 |
} else if parts.is_empty() {
|
| 594 |
"EMPTY".to_string()
|
| 595 |
} else {
|
|
@@ -677,7 +959,7 @@ fn suggested_roles(template: &str) -> Vec<String> {
|
|
| 677 |
"EPISODE_VERSION"
|
| 678 |
} else if item.contains("EPISODE_RANGE") {
|
| 679 |
"EPISODE_RANGE"
|
| 680 |
-
} else if item.contains("EPISODE") ||
|
| 681 |
"EPISODE"
|
| 682 |
} else if item.contains("RESOLUTION") {
|
| 683 |
"RESOLUTION"
|
|
@@ -767,12 +1049,39 @@ fn training_filename_for(original: &str) -> (String, bool) {
|
|
| 767 |
.filter(|part| !part.is_empty())
|
| 768 |
.collect();
|
| 769 |
if parts.len() >= 2 && filename_has_title(parts[parts.len() - 1]) {
|
| 770 |
-
(parts[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 771 |
} else {
|
| 772 |
(original.to_string(), false)
|
| 773 |
}
|
| 774 |
}
|
| 775 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 776 |
fn has_encoding_noise(value: &str) -> bool {
|
| 777 |
if value.contains('\u{fffd}') {
|
| 778 |
return true;
|
|
@@ -910,6 +1219,30 @@ fn split_refined_token(token: &str) -> Vec<String> {
|
|
| 910 |
let mut merged = Vec::new();
|
| 911 |
let mut index = 0;
|
| 912 |
while index < pieces.len() {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 913 |
if !is_separator(&pieces[index]) {
|
| 914 |
let mut end = index;
|
| 915 |
let mut combined = String::new();
|
|
@@ -1066,7 +1399,7 @@ fn is_special_title_phrase(text: &str) -> bool {
|
|
| 1066 |
fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]) -> Vec<String> {
|
| 1067 |
let mut output = roles.to_vec();
|
| 1068 |
let ep_markers = ["EP", "E", "Episode", "ep", "episode"];
|
| 1069 |
-
let roman = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX"
|
| 1070 |
if !output.iter().any(|role| role == "TITLE")
|
| 1071 |
&& roles
|
| 1072 |
.first()
|
|
@@ -1086,17 +1419,78 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 1086 |
}
|
| 1087 |
}
|
| 1088 |
if title_run.len() >= 2 {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1089 |
for index in title_run {
|
| 1090 |
output[index] = "TITLE".to_string();
|
| 1091 |
}
|
| 1092 |
}
|
| 1093 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1094 |
for index in 0..roles.len() {
|
| 1095 |
let text = group_text(tokens, &groups[index]);
|
|
|
|
|
|
|
|
|
|
| 1096 |
if roles[index].starts_with("EPISODE") && YEAR_RANGE_RE.is_match(&text) {
|
| 1097 |
output[index] = "O".to_string();
|
| 1098 |
continue;
|
| 1099 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1100 |
if roles[index] == "TITLE" && is_special_title_phrase(&text) {
|
| 1101 |
output[index] = "SPECIAL".to_string();
|
| 1102 |
continue;
|
|
@@ -1111,7 +1505,10 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 1111 |
output[index + 2] = "SEASON".to_string();
|
| 1112 |
continue;
|
| 1113 |
}
|
| 1114 |
-
if roles[index] == "TITLE"
|
|
|
|
|
|
|
|
|
|
| 1115 |
let previous_title = output[..index].iter().any(|role| role == "TITLE");
|
| 1116 |
let next_structural = roles[index + 1..]
|
| 1117 |
.iter()
|
|
@@ -1131,6 +1528,29 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 1131 |
output[index + 2] = "O".to_string();
|
| 1132 |
}
|
| 1133 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1134 |
}
|
| 1135 |
output
|
| 1136 |
}
|
|
@@ -1301,7 +1721,7 @@ fn project_refined_tokens(
|
|
| 1301 |
}
|
| 1302 |
}
|
| 1303 |
if matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE") {
|
| 1304 |
-
if let Some((pieces, labels)) = split_sxe_token(token) {
|
| 1305 |
output_tokens.extend(pieces);
|
| 1306 |
output_labels.extend(labels);
|
| 1307 |
continue;
|
|
@@ -1315,8 +1735,10 @@ fn project_refined_tokens(
|
|
| 1315 |
continue;
|
| 1316 |
}
|
| 1317 |
}
|
| 1318 |
-
|
| 1319 |
-
|
|
|
|
|
|
|
| 1320 |
}
|
| 1321 |
} else {
|
| 1322 |
if role == "TITLE" && matches!(token.as_str(), "第" | "話" | "话" | "回" | "集")
|
|
@@ -1352,9 +1774,11 @@ fn project_refined_tokens(
|
|
| 1352 |
}
|
| 1353 |
|
| 1354 |
fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
| 1355 |
-
let joiners = [
|
|
|
|
|
|
|
| 1356 |
let entity_joiners = [
|
| 1357 |
-
" ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", "&",
|
| 1358 |
];
|
| 1359 |
let mut output = labels.to_vec();
|
| 1360 |
for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
|
|
@@ -1442,6 +1866,8 @@ mod tests {
|
|
| 1442 |
|
| 1443 |
let dxd = labels_for("High School D×D");
|
| 1444 |
assert!(dxd.contains(&("×".to_string(), "B-TITLE".to_string())));
|
|
|
|
|
|
|
| 1445 |
|
| 1446 |
let sxe = labels_for("S01E02");
|
| 1447 |
assert_eq!(
|
|
@@ -1453,6 +1879,27 @@ mod tests {
|
|
| 1453 |
("02".to_string(), "B-EPISODE".to_string())
|
| 1454 |
]
|
| 1455 |
);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1456 |
|
| 1457 |
let sky = labels_for("[Skytree][海贼王][One_Piece][918][GB_JP][1080P]");
|
| 1458 |
assert!(sky.contains(&("One".to_string(), "B-TITLE".to_string())));
|
|
@@ -1486,5 +1933,101 @@ mod tests {
|
|
| 1486 |
assert!(cjk_season.contains(&("魔道祖师".to_string(), "B-TITLE".to_string())));
|
| 1487 |
assert!(cjk_season.contains(&("第一季".to_string(), "B-SEASON".to_string())));
|
| 1488 |
assert!(!cjk_season.contains(&("第一季".to_string(), "B-TITLE".to_string())));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1489 |
}
|
| 1490 |
}
|
|
|
|
| 15 |
#[derive(Parser, Debug)]
|
| 16 |
#[command(about = "Apply DMHY template recipes with a multi-core Rust pipeline")]
|
| 17 |
struct Args {
|
| 18 |
+
#[arg(long)]
|
| 19 |
+
cluster: bool,
|
| 20 |
#[arg(long, default_value = "datasets/AnimeName/dmhy_list.jsonl")]
|
| 21 |
input: PathBuf,
|
| 22 |
#[arg(long, default_value = "reports/dmhy_template_recipes.seed.jsonl")]
|
|
|
|
| 31 |
default_value = "reports/dmhy_weak.template_generated.rust.manifest.json"
|
| 32 |
)]
|
| 33 |
manifest_output: PathBuf,
|
| 34 |
+
#[arg(
|
| 35 |
+
long,
|
| 36 |
+
default_value = "reports/dmhy_template_clusters.rust.summary.json"
|
| 37 |
+
)]
|
| 38 |
+
summary_output: PathBuf,
|
| 39 |
+
#[arg(
|
| 40 |
+
long,
|
| 41 |
+
default_value = "reports/dmhy_template_clusters.rust.samples.jsonl"
|
| 42 |
+
)]
|
| 43 |
+
samples_output: PathBuf,
|
| 44 |
+
#[arg(long, default_value = "reports/dmhy_template_clusters.rust.jsonl")]
|
| 45 |
+
clusters_output: PathBuf,
|
| 46 |
+
#[arg(long, default_value = "reports/dmhy_template_recipes.rust.seed.jsonl")]
|
| 47 |
+
recipes_output: PathBuf,
|
| 48 |
+
#[arg(long, default_value = "reports/dmhy_template_review.rust.jsonl")]
|
| 49 |
+
review_output: PathBuf,
|
| 50 |
#[arg(long)]
|
| 51 |
limit: Option<usize>,
|
| 52 |
#[arg(long)]
|
| 53 |
limit_templates: Option<usize>,
|
| 54 |
#[arg(long, default_value_t = 1)]
|
| 55 |
min_count: u64,
|
| 56 |
+
#[arg(long, default_value_t = 200)]
|
| 57 |
+
top: usize,
|
| 58 |
+
#[arg(long, default_value_t = 200)]
|
| 59 |
+
recipe_top: usize,
|
| 60 |
+
#[arg(long, default_value_t = 1000)]
|
| 61 |
+
review_top: usize,
|
| 62 |
+
#[arg(long, default_value_t = 8)]
|
| 63 |
+
examples: usize,
|
| 64 |
+
#[arg(long, default_value_t = 10)]
|
| 65 |
+
recipe_min_count: usize,
|
| 66 |
#[arg(long, default_value = "high")]
|
| 67 |
confidence: String,
|
| 68 |
#[arg(long, default_value = "all")]
|
|
|
|
| 72 |
#[arg(long)]
|
| 73 |
keep_encoding_noise: bool,
|
| 74 |
#[arg(long)]
|
| 75 |
+
preserve_parent_paths: bool,
|
| 76 |
+
#[arg(long)]
|
| 77 |
threads: Option<usize>,
|
| 78 |
}
|
| 79 |
|
|
|
|
| 118 |
written: usize,
|
| 119 |
}
|
| 120 |
|
| 121 |
+
/// Running statistics for every filename that maps to the same template key.
#[derive(Debug, Default)]
struct Cluster {
    /// Number of filenames added to this cluster.
    count: usize,
    /// The first few member filenames, capped by the caller-supplied example limit.
    examples: Vec<String>,
    /// Occurrence counts of unwrapped `TEXT` / `BRACKET_TEXT` token literals.
    literal_counts: HashMap<String, usize>,
    /// Occurrence counts per token class name.
    class_counts: HashMap<String, usize>,
    /// One histogram per group position, counting the text seen at that position
    /// for `TEXT` / `BRACKET_TEXT` groups.
    position_literals: Vec<HashMap<String, usize>>,
}
|
| 129 |
+
|
| 130 |
#[derive(Debug)]
|
| 131 |
enum Processed {
|
| 132 |
Written {
|
|
|
|
| 162 |
Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
|
| 163 |
static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
|
| 164 |
static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
|
| 165 |
+
Regex::new(r"(?i)^(?:NCOP|NCED|OP|ED|PV|CM|SP|OVA|OAD|IV|Menu|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?$").unwrap()
|
| 166 |
});
|
| 167 |
static VOLUME_RE: Lazy<Regex> =
|
| 168 |
Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
|
|
|
|
| 180 |
});
|
| 181 |
static YEAR_RANGE_RE: Lazy<Regex> =
|
| 182 |
Lazy::new(|| Regex::new(r"^\(?\s*(?:19|20)\d{2}\s*[-~]\s*(?:19|20)\d{2}\s*\)?$").unwrap());
|
| 183 |
+
static PATH_SEGMENT_SEASON_RE: Lazy<Regex> = Lazy::new(|| {
|
| 184 |
+
Regex::new(r"(?i)(?:^|[\s_.\-/])(?:season\s*\d{1,2}|s\d{1,2})(?:$|[\s_.\-/])").unwrap()
|
| 185 |
+
});
|
| 186 |
+
static SEASON_WORD_NUMBER_RE: Lazy<Regex> =
|
| 187 |
+
Lazy::new(|| Regex::new(r"(?i)(?:season|saison)\s*0?(\d{1,2})").unwrap());
|
| 188 |
+
static S_NUMBER_SEGMENT_RE: Lazy<Regex> =
|
| 189 |
+
Lazy::new(|| Regex::new(r"(?i)(?:^|[^\p{L}\p{N}])s0?(\d{1,2})(?:$|[^\p{L}\p{N}])").unwrap());
|
| 190 |
+
static SXE_SEASON_RE: Lazy<Regex> = Lazy::new(|| {
|
| 191 |
+
Regex::new(r"(?i)(?:^|[^\p{L}\p{N}])s0?(\d{1,2})e\d{1,4}(?:$|[^\p{L}\p{N}])").unwrap()
|
| 192 |
+
});
|
| 193 |
static TOKEN_REGEXES: Lazy<Vec<Regex>> = Lazy::new(|| {
|
| 194 |
[
|
| 195 |
r"^\d{3,4}[xX×]\d{3,4}",
|
| 196 |
+
r"(?i)^h\.?26[45]",
|
| 197 |
+
r"(?i)^x\.?26[45]",
|
| 198 |
r"^[\\/]+",
|
| 199 |
r"^[-_.::+&|]+",
|
| 200 |
r"^\s+",
|
|
|
|
| 220 |
.build_global()
|
| 221 |
.context("failed to configure rayon thread pool")?;
|
| 222 |
}
|
| 223 |
+
if args.cluster {
|
| 224 |
+
return run_cluster(&args);
|
| 225 |
+
}
|
| 226 |
if args.expand != "all" && args.expand != "sample" {
|
| 227 |
bail!("--expand must be all or sample");
|
| 228 |
}
|
|
|
|
| 385 |
Ok(values)
|
| 386 |
}
|
| 387 |
|
| 388 |
+
fn run_cluster(args: &Args) -> Result<()> {
|
| 389 |
+
let inputs = load_input(&args.input, args.limit)?;
|
| 390 |
+
let source_rows = inputs.len();
|
| 391 |
+
let mut clusters: HashMap<String, Cluster> = HashMap::new();
|
| 392 |
+
let mut skipped_encoding_noise = 0usize;
|
| 393 |
+
let mut trimmed_parent_path = 0usize;
|
| 394 |
+
let mut total_rows = 0usize;
|
| 395 |
+
|
| 396 |
+
for original in inputs {
|
| 397 |
+
if !args.keep_encoding_noise
|
| 398 |
+
&& (has_encoding_noise(&original)
|
| 399 |
+
|| has_non_anime_noise(&original)
|
| 400 |
+
|| has_abstract_path_noise(&original))
|
| 401 |
+
{
|
| 402 |
+
skipped_encoding_noise += 1;
|
| 403 |
+
continue;
|
| 404 |
+
}
|
| 405 |
+
let filename = if args.preserve_parent_paths {
|
| 406 |
+
original
|
| 407 |
+
} else {
|
| 408 |
+
let (training_filename, was_trimmed) = training_filename_for(&original);
|
| 409 |
+
if was_trimmed {
|
| 410 |
+
trimmed_parent_path += 1;
|
| 411 |
+
}
|
| 412 |
+
training_filename
|
| 413 |
+
};
|
| 414 |
+
add_cluster(&mut clusters, &filename, args.examples);
|
| 415 |
+
total_rows += 1;
|
| 416 |
+
}
|
| 417 |
+
|
| 418 |
+
let mut sorted_clusters: Vec<_> = clusters.into_iter().collect();
|
| 419 |
+
sorted_clusters.sort_by(|a, b| b.1.count.cmp(&a.1.count).then_with(|| a.0.cmp(&b.0)));
|
| 420 |
+
|
| 421 |
+
let cluster_rows: Vec<Value> = sorted_clusters
|
| 422 |
+
.iter()
|
| 423 |
+
.enumerate()
|
| 424 |
+
.map(|(index, (key, cluster))| cluster_row(index + 1, key, cluster, total_rows))
|
| 425 |
+
.collect();
|
| 426 |
+
let samples: Vec<Value> = cluster_rows.iter().take(args.top).cloned().collect();
|
| 427 |
+
let recipe_candidates: Vec<Value> =
|
| 428 |
+
cluster_rows.iter().take(args.recipe_top).cloned().collect();
|
| 429 |
+
let recipes: Vec<Value> = recipe_candidates
|
| 430 |
+
.iter()
|
| 431 |
+
.filter(|row| is_high_confidence_recipe(row, args.recipe_min_count))
|
| 432 |
+
.map(|row| recipe_row(row, "high"))
|
| 433 |
+
.collect();
|
| 434 |
+
let review: Vec<Value> = recipe_candidates
|
| 435 |
+
.iter()
|
| 436 |
+
.filter(|row| !is_high_confidence_recipe(row, args.recipe_min_count))
|
| 437 |
+
.take(args.review_top)
|
| 438 |
+
.cloned()
|
| 439 |
+
.collect();
|
| 440 |
+
|
| 441 |
+
write_jsonl_values(&args.clusters_output, &cluster_rows)?;
|
| 442 |
+
write_jsonl_values(&args.samples_output, &samples)?;
|
| 443 |
+
write_jsonl_values(&args.recipes_output, &recipes)?;
|
| 444 |
+
write_jsonl_values(&args.review_output, &review)?;
|
| 445 |
+
|
| 446 |
+
let mut histogram: HashMap<usize, usize> = HashMap::new();
|
| 447 |
+
for (_, cluster) in &sorted_clusters {
|
| 448 |
+
*histogram.entry(cluster.count).or_default() += 1;
|
| 449 |
+
}
|
| 450 |
+
let mut count_histogram_top: Vec<_> = histogram.into_iter().collect();
|
| 451 |
+
count_histogram_top.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
|
| 452 |
+
count_histogram_top.truncate(20);
|
| 453 |
+
|
| 454 |
+
let rows_covered_by_repeated_templates: usize = sorted_clusters
|
| 455 |
+
.iter()
|
| 456 |
+
.map(|(_, cluster)| cluster)
|
| 457 |
+
.filter(|cluster| cluster.count as u64 >= args.min_count)
|
| 458 |
+
.map(|cluster| cluster.count)
|
| 459 |
+
.sum();
|
| 460 |
+
let templates_at_least_min_count = sorted_clusters
|
| 461 |
+
.iter()
|
| 462 |
+
.filter(|(_, cluster)| cluster.count as u64 >= args.min_count)
|
| 463 |
+
.count();
|
| 464 |
+
let top_templates: Vec<Value> = cluster_rows.iter().take(20).cloned().collect();
|
| 465 |
+
let summary = json!({
|
| 466 |
+
"input": args.input.to_string_lossy(),
|
| 467 |
+
"source_rows": source_rows,
|
| 468 |
+
"skipped_encoding_noise": skipped_encoding_noise,
|
| 469 |
+
"trimmed_parent_path": trimmed_parent_path,
|
| 470 |
+
"total_rows": total_rows,
|
| 471 |
+
"unique_templates": sorted_clusters.len(),
|
| 472 |
+
"min_count": args.min_count,
|
| 473 |
+
"templates_at_least_min_count": templates_at_least_min_count,
|
| 474 |
+
"rows_covered_by_repeated_templates": rows_covered_by_repeated_templates,
|
| 475 |
+
"rows_covered_by_repeated_templates_ratio": if total_rows == 0 { 0.0 } else { rows_covered_by_repeated_templates as f64 / total_rows as f64 },
|
| 476 |
+
"top_output_rows": samples.len(),
|
| 477 |
+
"clusters_output": args.clusters_output.to_string_lossy(),
|
| 478 |
+
"cluster_rows": cluster_rows.len(),
|
| 479 |
+
"recipes_output": args.recipes_output.to_string_lossy(),
|
| 480 |
+
"recipe_rows": recipes.len(),
|
| 481 |
+
"review_output": args.review_output.to_string_lossy(),
|
| 482 |
+
"review_rows": review.len(),
|
| 483 |
+
"recipe_top": args.recipe_top,
|
| 484 |
+
"recipe_min_count": args.recipe_min_count,
|
| 485 |
+
"top_templates": top_templates,
|
| 486 |
+
"count_histogram_top": count_histogram_top,
|
| 487 |
+
"implementation": "rust_dmhy_template_cluster",
|
| 488 |
+
"generated_at": Utc::now().to_rfc3339(),
|
| 489 |
+
});
|
| 490 |
+
if let Some(parent) = args.summary_output.parent() {
|
| 491 |
+
fs::create_dir_all(parent)?;
|
| 492 |
+
}
|
| 493 |
+
fs::write(
|
| 494 |
+
&args.summary_output,
|
| 495 |
+
serde_json::to_string_pretty(&summary)?,
|
| 496 |
+
)?;
|
| 497 |
+
println!("{}", serde_json::to_string_pretty(&summary)?);
|
| 498 |
+
Ok(())
|
| 499 |
+
}
|
| 500 |
+
|
| 501 |
+
fn add_cluster(clusters: &mut HashMap<String, Cluster>, filename: &str, example_limit: usize) {
|
| 502 |
+
let (key, tokens, classes, groups) = template_key_for_filename(filename);
|
| 503 |
+
let cluster = clusters.entry(key).or_default();
|
| 504 |
+
cluster.count += 1;
|
| 505 |
+
if cluster.examples.len() < example_limit {
|
| 506 |
+
cluster.examples.push(filename.to_string());
|
| 507 |
+
}
|
| 508 |
+
for (token, class_name) in tokens.iter().zip(classes.iter()) {
|
| 509 |
+
*cluster.class_counts.entry(class_name.clone()).or_default() += 1;
|
| 510 |
+
if matches!(class_name.as_str(), "TEXT" | "BRACKET_TEXT") {
|
| 511 |
+
let cleaned = strip_wrapper(token);
|
| 512 |
+
if !cleaned.is_empty() {
|
| 513 |
+
*cluster.literal_counts.entry(cleaned).or_default() += 1;
|
| 514 |
+
}
|
| 515 |
+
}
|
| 516 |
+
}
|
| 517 |
+
while cluster.position_literals.len() < groups.len() {
|
| 518 |
+
cluster.position_literals.push(HashMap::new());
|
| 519 |
+
}
|
| 520 |
+
for (index, group) in groups.iter().enumerate() {
|
| 521 |
+
if matches!(group.class_name.as_str(), "TEXT" | "BRACKET_TEXT") {
|
| 522 |
+
let text = group_text(&tokens, group);
|
| 523 |
+
if !text.is_empty() {
|
| 524 |
+
*cluster.position_literals[index].entry(text).or_default() += 1;
|
| 525 |
+
}
|
| 526 |
+
}
|
| 527 |
+
}
|
| 528 |
+
}
|
| 529 |
+
|
| 530 |
+
fn cluster_row(rank: usize, key: &str, cluster: &Cluster, total: usize) -> Value {
|
| 531 |
+
json!({
|
| 532 |
+
"template_id": format!("tpl_{rank:06}"),
|
| 533 |
+
"template": key,
|
| 534 |
+
"count": cluster.count,
|
| 535 |
+
"coverage": if total == 0 { 0.0 } else { cluster.count as f64 / total as f64 },
|
| 536 |
+
"top_literals": top_counts(&cluster.literal_counts, 12),
|
| 537 |
+
"suggested_roles": suggested_roles(key),
|
| 538 |
+
"position_top_literals": cluster.position_literals.iter().map(|counts| top_counts(counts, 5)).collect::<Vec<_>>(),
|
| 539 |
+
"class_counts": top_counts(&cluster.class_counts, 20),
|
| 540 |
+
"examples": cluster.examples,
|
| 541 |
+
})
|
| 542 |
+
}
|
| 543 |
+
|
| 544 |
+
/// Return up to `limit` `(key, count)` pairs, ordered by descending count and
/// then by ascending key.
///
/// The entries are ranked by reference and only the surviving keys are cloned,
/// so the cost of copying strings is bounded by `limit` rather than by the size
/// of `counts`.
fn top_counts(counts: &HashMap<String, usize>, limit: usize) -> Vec<(String, usize)> {
    if limit == 0 || counts.is_empty() {
        return Vec::new();
    }
    let mut ranked: Vec<(&String, usize)> =
        counts.iter().map(|(key, count)| (key, *count)).collect();
    // Map keys are unique, so the comparator never reports two entries as equal
    // and an unstable sort is fully deterministic here.
    ranked.sort_unstable_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(b.0)));
    ranked
        .into_iter()
        .take(limit)
        .map(|(key, count)| (key.clone(), count))
        .collect()
}
|
| 553 |
+
|
| 554 |
+
fn is_high_confidence_recipe(row: &Value, min_count: usize) -> bool {
|
| 555 |
+
if row.get("count").and_then(Value::as_u64).unwrap_or(0) < min_count as u64 {
|
| 556 |
+
return false;
|
| 557 |
+
}
|
| 558 |
+
let roles = match row.get("suggested_roles").and_then(Value::as_array) {
|
| 559 |
+
Some(roles) => roles,
|
| 560 |
+
None => return false,
|
| 561 |
+
};
|
| 562 |
+
let role_strings: Vec<&str> = roles.iter().filter_map(Value::as_str).collect();
|
| 563 |
+
if role_strings.iter().any(|role| role.contains("_OR_")) {
|
| 564 |
+
return false;
|
| 565 |
+
}
|
| 566 |
+
if !role_strings.contains(&"TITLE")
|
| 567 |
+
|| !role_strings.iter().any(|role| {
|
| 568 |
+
role.starts_with("EPISODE") || matches!(*role, "SPECIAL" | "SOURCE" | "RESOLUTION")
|
| 569 |
+
})
|
| 570 |
+
{
|
| 571 |
+
return false;
|
| 572 |
+
}
|
| 573 |
+
let template = row.get("template").and_then(Value::as_str).unwrap_or("");
|
| 574 |
+
if template.contains("BRACKET_TEXT BRACKET_TEXT") && !role_strings.contains(&"GROUP") {
|
| 575 |
+
return false;
|
| 576 |
+
}
|
| 577 |
+
!role_strings.contains(&"TITLE_OR_TEXT")
|
| 578 |
+
}
|
| 579 |
+
|
| 580 |
+
fn recipe_row(row: &Value, confidence: &str) -> Value {
|
| 581 |
+
json!({
|
| 582 |
+
"template_id": row["template_id"],
|
| 583 |
+
"template": row["template"],
|
| 584 |
+
"roles": row["suggested_roles"],
|
| 585 |
+
"confidence": confidence,
|
| 586 |
+
"count": row["count"],
|
| 587 |
+
"examples": row["examples"],
|
| 588 |
+
})
|
| 589 |
+
}
|
| 590 |
+
|
| 591 |
+
fn write_jsonl_values(path: &PathBuf, rows: &[Value]) -> Result<()> {
|
| 592 |
+
if let Some(parent) = path.parent() {
|
| 593 |
+
fs::create_dir_all(parent)?;
|
| 594 |
+
}
|
| 595 |
+
let mut writer = BufWriter::new(File::create(path)?);
|
| 596 |
+
for row in rows {
|
| 597 |
+
serde_json::to_writer(&mut writer, row)?;
|
| 598 |
+
writer.write_all(b"\n")?;
|
| 599 |
+
}
|
| 600 |
+
writer.flush()?;
|
| 601 |
+
Ok(())
|
| 602 |
+
}
|
| 603 |
+
|
| 604 |
fn process_filename(
|
| 605 |
original: &str,
|
| 606 |
args: &Args,
|
|
|
|
| 742 |
let mut parts = Vec::new();
|
| 743 |
let mut current = String::new();
|
| 744 |
for ch in inner.chars() {
|
| 745 |
+
if ch.is_whitespace() || "_.,+/&|-()()".contains(ch) {
|
| 746 |
if !current.is_empty() {
|
| 747 |
parts.push(std::mem::take(&mut current));
|
| 748 |
}
|
|
|
|
| 856 |
}
|
| 857 |
if token.starts_with('[') || token.starts_with('(') || token.starts_with('【') {
|
| 858 |
let inner = strip_wrapper(token);
|
|
|
|
| 859 |
let parts = split_inner(&inner);
|
| 860 |
+
let whole_class = classify_atom(&inner);
|
| 861 |
let inner_class = if whole_class != "TEXT" {
|
| 862 |
+
if whole_class == "LANG" && parts.len() > 1 {
|
| 863 |
+
let part_classes: Vec<String> =
|
| 864 |
+
parts.iter().map(|part| classify_atom(part)).collect();
|
| 865 |
+
if part_classes.iter().all(|item| item == &part_classes[0]) {
|
| 866 |
+
part_classes[0].clone()
|
| 867 |
+
} else if part_classes.iter().all(|item| is_media_block_class(item)) {
|
| 868 |
+
"MEDIA_BLOCK".to_string()
|
| 869 |
+
} else {
|
| 870 |
+
whole_class
|
| 871 |
+
}
|
| 872 |
+
} else {
|
| 873 |
+
whole_class
|
| 874 |
+
}
|
| 875 |
} else if parts.is_empty() {
|
| 876 |
"EMPTY".to_string()
|
| 877 |
} else {
|
|
|
|
| 959 |
"EPISODE_VERSION"
|
| 960 |
} else if item.contains("EPISODE_RANGE") {
|
| 961 |
"EPISODE_RANGE"
|
| 962 |
+
} else if item.contains("EPISODE") || item.contains("SXE") {
|
| 963 |
"EPISODE"
|
| 964 |
} else if item.contains("RESOLUTION") {
|
| 965 |
"RESOLUTION"
|
|
|
|
| 1049 |
.filter(|part| !part.is_empty())
|
| 1050 |
.collect();
|
| 1051 |
if parts.len() >= 2 && filename_has_title(parts[parts.len() - 1]) {
|
| 1052 |
+
if parts.len() >= 3 && path_segment_has_season(parts[parts.len() - 2]) {
|
| 1053 |
+
let parent_seasons = path_segment_seasons(parts[parts.len() - 2]);
|
| 1054 |
+
let leaf_seasons = path_segment_seasons(parts[parts.len() - 1]);
|
| 1055 |
+
if parent_seasons
|
| 1056 |
+
.iter()
|
| 1057 |
+
.any(|season| leaf_seasons.contains(season))
|
| 1058 |
+
{
|
| 1059 |
+
(parts[parts.len() - 1].to_string(), true)
|
| 1060 |
+
} else {
|
| 1061 |
+
(parts[parts.len() - 2..].join("/"), true)
|
| 1062 |
+
}
|
| 1063 |
+
} else {
|
| 1064 |
+
(parts[parts.len() - 1].to_string(), true)
|
| 1065 |
+
}
|
| 1066 |
} else {
|
| 1067 |
(original.to_string(), false)
|
| 1068 |
}
|
| 1069 |
}
|
| 1070 |
|
| 1071 |
+
fn path_segment_has_season(value: &str) -> bool {
|
| 1072 |
+
PATH_SEGMENT_SEASON_RE.is_match(value)
|
| 1073 |
+
}
|
| 1074 |
+
|
| 1075 |
+
fn path_segment_seasons(value: &str) -> HashSet<u8> {
|
| 1076 |
+
SEASON_WORD_NUMBER_RE
|
| 1077 |
+
.captures_iter(value)
|
| 1078 |
+
.chain(S_NUMBER_SEGMENT_RE.captures_iter(value))
|
| 1079 |
+
.chain(SXE_SEASON_RE.captures_iter(value))
|
| 1080 |
+
.filter_map(|captures| captures.get(1))
|
| 1081 |
+
.filter_map(|item| item.as_str().parse::<u8>().ok())
|
| 1082 |
+
.collect()
|
| 1083 |
+
}
|
| 1084 |
+
|
| 1085 |
fn has_encoding_noise(value: &str) -> bool {
|
| 1086 |
if value.contains('\u{fffd}') {
|
| 1087 |
return true;
|
|
|
|
| 1219 |
let mut merged = Vec::new();
|
| 1220 |
let mut index = 0;
|
| 1221 |
while index < pieces.len() {
|
| 1222 |
+
if index + 2 < pieces.len()
|
| 1223 |
+
&& !is_separator(&pieces[index])
|
| 1224 |
+
&& is_separator(&pieces[index + 1])
|
| 1225 |
+
&& !is_separator(&pieces[index + 2])
|
| 1226 |
+
{
|
| 1227 |
+
let combined = format!(
|
| 1228 |
+
"{}{}{}",
|
| 1229 |
+
pieces[index],
|
| 1230 |
+
pieces[index + 1],
|
| 1231 |
+
pieces[index + 2]
|
| 1232 |
+
);
|
| 1233 |
+
let combined_class = classify_atom(&combined);
|
| 1234 |
+
if !pieces[index + 1].chars().any(char::is_whitespace)
|
| 1235 |
+
&& matches!(pieces[index + 1].as_str(), "." | "x" | "X" | "×")
|
| 1236 |
+
&& matches!(
|
| 1237 |
+
combined_class.as_str(),
|
| 1238 |
+
"RESOLUTION" | "MEDIA" | "LANG" | "HASH" | "SXE" | "EPISODE_VERSION"
|
| 1239 |
+
)
|
| 1240 |
+
{
|
| 1241 |
+
merged.push(combined);
|
| 1242 |
+
index += 3;
|
| 1243 |
+
continue;
|
| 1244 |
+
}
|
| 1245 |
+
}
|
| 1246 |
if !is_separator(&pieces[index]) {
|
| 1247 |
let mut end = index;
|
| 1248 |
let mut combined = String::new();
|
|
|
|
| 1399 |
fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]) -> Vec<String> {
|
| 1400 |
let mut output = roles.to_vec();
|
| 1401 |
let ep_markers = ["EP", "E", "Episode", "ep", "episode"];
|
| 1402 |
+
let roman = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX"];
|
| 1403 |
if !output.iter().any(|role| role == "TITLE")
|
| 1404 |
&& roles
|
| 1405 |
.first()
|
|
|
|
| 1419 |
}
|
| 1420 |
}
|
| 1421 |
if title_run.len() >= 2 {
|
| 1422 |
+
let last_title_index = *title_run.last().unwrap();
|
| 1423 |
+
let later_structural = roles[last_title_index + 1..].iter().any(|role| {
|
| 1424 |
+
role.starts_with("EPISODE") || matches!(role.as_str(), "SEASON" | "SPECIAL")
|
| 1425 |
+
});
|
| 1426 |
+
if group_text(tokens, &groups[0])
|
| 1427 |
+
.chars()
|
| 1428 |
+
.all(|ch| ch.is_ascii_digit())
|
| 1429 |
+
&& later_structural
|
| 1430 |
+
{
|
| 1431 |
+
output[0] = "TITLE".to_string();
|
| 1432 |
+
}
|
| 1433 |
for index in title_run {
|
| 1434 |
output[index] = "TITLE".to_string();
|
| 1435 |
}
|
| 1436 |
}
|
| 1437 |
}
|
| 1438 |
+
if roles
|
| 1439 |
+
.first()
|
| 1440 |
+
.is_some_and(|role| role.starts_with("EPISODE"))
|
| 1441 |
+
&& group_text(tokens, &groups[0])
|
| 1442 |
+
.chars()
|
| 1443 |
+
.all(|ch| ch.is_ascii_digit())
|
| 1444 |
+
{
|
| 1445 |
+
if let Some(first_title) = output.iter().position(|role| role == "TITLE") {
|
| 1446 |
+
let later_structural = roles[first_title + 1..].iter().any(|role| {
|
| 1447 |
+
role.starts_with("EPISODE") || matches!(role.as_str(), "SEASON" | "SPECIAL")
|
| 1448 |
+
});
|
| 1449 |
+
if later_structural {
|
| 1450 |
+
output[0] = "TITLE".to_string();
|
| 1451 |
+
}
|
| 1452 |
+
}
|
| 1453 |
+
}
|
| 1454 |
for index in 0..roles.len() {
|
| 1455 |
let text = group_text(tokens, &groups[index]);
|
| 1456 |
+
if output[index] == "O" && groups[index].class_name.contains("SXE") {
|
| 1457 |
+
output[index] = "EPISODE".to_string();
|
| 1458 |
+
}
|
| 1459 |
if roles[index].starts_with("EPISODE") && YEAR_RANGE_RE.is_match(&text) {
|
| 1460 |
output[index] = "O".to_string();
|
| 1461 |
continue;
|
| 1462 |
}
|
| 1463 |
+
if roles[index].starts_with("EPISODE") && (2..roles.len()).contains(&index) {
|
| 1464 |
+
let previous_text = group_text(tokens, &groups[index - 2]);
|
| 1465 |
+
let next_special = output[index + 1..roles.len().min(index + 4)]
|
| 1466 |
+
.iter()
|
| 1467 |
+
.any(|role| role == "SPECIAL");
|
| 1468 |
+
let next_episode = roles[index + 1..]
|
| 1469 |
+
.iter()
|
| 1470 |
+
.any(|role| role.starts_with("EPISODE"));
|
| 1471 |
+
if groups[index - 1].class_name == "SEP"
|
| 1472 |
+
&& matches!(
|
| 1473 |
+
previous_text.to_ascii_lowercase().as_str(),
|
| 1474 |
+
"vol" | "volume"
|
| 1475 |
+
)
|
| 1476 |
+
{
|
| 1477 |
+
output[index - 2] = "SPECIAL".to_string();
|
| 1478 |
+
output[index] = "SPECIAL".to_string();
|
| 1479 |
+
continue;
|
| 1480 |
+
}
|
| 1481 |
+
if output[index - 2] == "TITLE"
|
| 1482 |
+
&& groups[index - 1].class_name == "SEP"
|
| 1483 |
+
&& previous_text.len() <= 4
|
| 1484 |
+
&& previous_text.is_ascii()
|
| 1485 |
+
&& previous_text.chars().all(|ch| ch.is_ascii_alphabetic())
|
| 1486 |
+
&& text.chars().all(|ch| ch.is_ascii_digit())
|
| 1487 |
+
&& text.len() <= 3
|
| 1488 |
+
&& (next_special || next_episode)
|
| 1489 |
+
{
|
| 1490 |
+
output[index] = "TITLE".to_string();
|
| 1491 |
+
continue;
|
| 1492 |
+
}
|
| 1493 |
+
}
|
| 1494 |
if roles[index] == "TITLE" && is_special_title_phrase(&text) {
|
| 1495 |
output[index] = "SPECIAL".to_string();
|
| 1496 |
continue;
|
|
|
|
| 1505 |
output[index + 2] = "SEASON".to_string();
|
| 1506 |
continue;
|
| 1507 |
}
|
| 1508 |
+
if roles[index] == "TITLE"
|
| 1509 |
+
&& text == text.to_ascii_uppercase()
|
| 1510 |
+
&& roman.contains(&text.as_str())
|
| 1511 |
+
{
|
| 1512 |
let previous_title = output[..index].iter().any(|role| role == "TITLE");
|
| 1513 |
let next_structural = roles[index + 1..]
|
| 1514 |
.iter()
|
|
|
|
| 1528 |
output[index + 2] = "O".to_string();
|
| 1529 |
}
|
| 1530 |
}
|
| 1531 |
+
if roles[index].starts_with("EPISODE") {
|
| 1532 |
+
let previous_text = if index >= 1 {
|
| 1533 |
+
group_text(tokens, &groups[index - 1])
|
| 1534 |
+
} else {
|
| 1535 |
+
String::new()
|
| 1536 |
+
};
|
| 1537 |
+
let next_text = if index + 1 < roles.len() {
|
| 1538 |
+
group_text(tokens, &groups[index + 1])
|
| 1539 |
+
} else {
|
| 1540 |
+
String::new()
|
| 1541 |
+
};
|
| 1542 |
+
if previous_text.contains('点')
|
| 1543 |
+
|| previous_text.contains('點')
|
| 1544 |
+
|| previous_text.contains("晚上")
|
| 1545 |
+
|| previous_text.contains("上午")
|
| 1546 |
+
|| previous_text.contains("下午")
|
| 1547 |
+
|| next_text.contains('点')
|
| 1548 |
+
|| next_text.contains('點')
|
| 1549 |
+
|| next_text.contains('半')
|
| 1550 |
+
{
|
| 1551 |
+
output[index] = "O".to_string();
|
| 1552 |
+
}
|
| 1553 |
+
}
|
| 1554 |
}
|
| 1555 |
output
|
| 1556 |
}
|
|
|
|
| 1721 |
}
|
| 1722 |
}
|
| 1723 |
if matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE") {
|
| 1724 |
+
if let Some((pieces, labels)) = split_sxe_token(&strip_wrapper(token)) {
|
| 1725 |
output_tokens.extend(pieces);
|
| 1726 |
output_labels.extend(labels);
|
| 1727 |
continue;
|
|
|
|
| 1735 |
continue;
|
| 1736 |
}
|
| 1737 |
}
|
| 1738 |
+
let label = label_for_refined_piece(&piece, role, &group.class_name);
|
| 1739 |
+
let (pieces, labels) = normalize_generated_tokens(&[piece], &[label]);
|
| 1740 |
+
output_tokens.extend(pieces);
|
| 1741 |
+
output_labels.extend(labels);
|
| 1742 |
}
|
| 1743 |
} else {
|
| 1744 |
if role == "TITLE" && matches!(token.as_str(), "第" | "話" | "话" | "回" | "集")
|
|
|
|
| 1774 |
}
|
| 1775 |
|
| 1776 |
fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
| 1777 |
+
let joiners = [
|
| 1778 |
+
" ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!",
|
| 1779 |
+
];
|
| 1780 |
let entity_joiners = [
|
| 1781 |
+
" ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "&", "&",
|
| 1782 |
];
|
| 1783 |
let mut output = labels.to_vec();
|
| 1784 |
for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
|
|
|
|
| 1866 |
|
| 1867 |
let dxd = labels_for("High School D×D");
|
| 1868 |
assert!(dxd.contains(&("×".to_string(), "B-TITLE".to_string())));
|
| 1869 |
+
let colon_title = labels_for("Megumi no Daigo:Kyuukoku no Orange 06");
|
| 1870 |
+
assert!(colon_title.contains(&(":".to_string(), "B-TITLE".to_string())));
|
| 1871 |
|
| 1872 |
let sxe = labels_for("S01E02");
|
| 1873 |
assert_eq!(
|
|
|
|
| 1879 |
("02".to_string(), "B-EPISODE".to_string())
|
| 1880 |
]
|
| 1881 |
);
|
| 1882 |
+
let bracket_sxe = labels_for("[FLsnow.feat.PO][Himitsu_no_Aipri][1080P][S2E01]");
|
| 1883 |
+
assert!(bracket_sxe.contains(&("2".to_string(), "B-SEASON".to_string())));
|
| 1884 |
+
assert!(bracket_sxe.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 1885 |
+
|
| 1886 |
+
let cursed = labels_for("[Coalgirls]_C3-Cube_x_Cursed_x_Curious_01_[8E416230]");
|
| 1887 |
+
assert!(cursed.contains(&("x".to_string(), "B-TITLE".to_string())));
|
| 1888 |
+
assert!(!cursed.contains(&("x".to_string(), "B-SEASON".to_string())));
|
| 1889 |
+
let beyblade = labels_for("[jibaketa]Beyblade X - 118 (WEB 1920x1080 AVC AAC)");
|
| 1890 |
+
assert!(beyblade.contains(&("X".to_string(), "B-TITLE".to_string())));
|
| 1891 |
+
assert!(!beyblade.contains(&("X".to_string(), "B-SEASON".to_string())));
|
| 1892 |
+
let bang_title = labels_for("[Dymy][Gugure! Kokkuri-san][06][BIG5][1280X720]");
|
| 1893 |
+
assert!(bang_title.contains(&("!".to_string(), "B-TITLE".to_string())));
|
| 1894 |
+
|
| 1895 |
+
let conan_time = labels_for("【蓝色狂想】名侦探柯南TV版第036集-周一晚上7点半杀人事件");
|
| 1896 |
+
assert!(conan_time.contains(&("036".to_string(), "B-EPISODE".to_string())));
|
| 1897 |
+
assert!(!conan_time.contains(&("7".to_string(), "B-EPISODE".to_string())));
|
| 1898 |
+
let zom =
|
| 1899 |
+
labels_for("[Nekomoe kissaten&VCB-Studio] Zom 100 [Animatics02][Ma10p_1080p][x265]");
|
| 1900 |
+
assert!(zom.contains(&("100".to_string(), "B-TITLE".to_string())));
|
| 1901 |
+
assert!(!zom.contains(&("100".to_string(), "B-EPISODE".to_string())));
|
| 1902 |
+
assert!(zom.contains(&("Animatics02".to_string(), "B-SPECIAL".to_string())));
|
| 1903 |
|
| 1904 |
let sky = labels_for("[Skytree][海贼王][One_Piece][918][GB_JP][1080P]");
|
| 1905 |
assert!(sky.contains(&("One".to_string(), "B-TITLE".to_string())));
|
|
|
|
| 1933 |
assert!(cjk_season.contains(&("魔道祖师".to_string(), "B-TITLE".to_string())));
|
| 1934 |
assert!(cjk_season.contains(&("第一季".to_string(), "B-SEASON".to_string())));
|
| 1935 |
assert!(!cjk_season.contains(&("第一季".to_string(), "B-TITLE".to_string())));
|
| 1936 |
+
|
| 1937 |
+
let (trimmed, was_trimmed) =
|
| 1938 |
+
training_filename_for("12/小剧场/[LKSUB][KAGE-JITSU!][01][GB][720P]");
|
| 1939 |
+
assert!(was_trimmed);
|
| 1940 |
+
assert_eq!(trimmed, "[LKSUB][KAGE-JITSU!][01][GB][720P]");
|
| 1941 |
+
let (key, _, _, _) = template_key_for_filename(&trimmed);
|
| 1942 |
+
assert_eq!(
|
| 1943 |
+
key,
|
| 1944 |
+
"BRACKET_TEXT BRACKET_TEXT BRACKET_EPISODE BRACKET_LANG BRACKET_RESOLUTION"
|
| 1945 |
+
);
|
| 1946 |
+
|
| 1947 |
+
let short = labels_for("[Snow-Raws] R-15 CM&PV12 (BD 1920x1080 HEVC-YUV420P10 FLAC)");
|
| 1948 |
+
assert!(short.contains(&("R".to_string(), "B-TITLE".to_string())));
|
| 1949 |
+
assert!(short.contains(&("-".to_string(), "B-TITLE".to_string())));
|
| 1950 |
+
assert!(short.contains(&("15".to_string(), "B-TITLE".to_string())));
|
| 1951 |
+
assert!(!short.contains(&("15".to_string(), "B-EPISODE".to_string())));
|
| 1952 |
+
|
| 1953 |
+
let short_before_episode =
|
| 1954 |
+
labels_for("[Snow-Raws] R-15 第01話 (BD 1920x1080 HEVC-YUV420P10 FLAC)");
|
| 1955 |
+
assert!(short_before_episode.contains(&("R".to_string(), "B-TITLE".to_string())));
|
| 1956 |
+
assert!(short_before_episode.contains(&("-".to_string(), "B-TITLE".to_string())));
|
| 1957 |
+
assert!(short_before_episode.contains(&("15".to_string(), "B-TITLE".to_string())));
|
| 1958 |
+
assert!(short_before_episode.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 1959 |
+
assert!(!short_before_episode.contains(&("15".to_string(), "B-EPISODE".to_string())));
|
| 1960 |
+
|
| 1961 |
+
let avatar = "Avatar The Last Airbender S2/Avatar The Last Airbender S2 14 [1080p]";
|
| 1962 |
+
let (trimmed, was_trimmed) = training_filename_for(avatar);
|
| 1963 |
+
assert!(was_trimmed);
|
| 1964 |
+
assert_eq!(trimmed, "Avatar The Last Airbender S2 14 [1080p]");
|
| 1965 |
+
|
| 1966 |
+
let tintin = "Adventures of Tintin (1991) Season 1-3 S01-S03 (1080p BluRay x265 HEVC 10bit EAC3 2.0 Garshasp)/Season 1/Adventures of Tintin (1991) - S01E06 - Cigars of the Pharaoh (Part 1) (1080p BluRay x265 Garshasp)";
|
| 1967 |
+
let (trimmed, was_trimmed) = training_filename_for(tintin);
|
| 1968 |
+
assert!(was_trimmed);
|
| 1969 |
+
assert_eq!(
|
| 1970 |
+
trimmed,
|
| 1971 |
+
"Adventures of Tintin (1991) - S01E06 - Cigars of the Pharaoh (Part 1) (1080p BluRay x265 Garshasp)"
|
| 1972 |
+
);
|
| 1973 |
+
let (key, _, _, _) = template_key_for_filename(&trimmed);
|
| 1974 |
+
assert_eq!(
|
| 1975 |
+
key,
|
| 1976 |
+
"TEXT SEP TEXT SEP TEXT SEP BRACKET_DATE SEP SXE SEP TEXT SEP TEXT SEP TEXT SEP TEXT SEP BRACKET_TEXT SEP BRACKET_TEXT"
|
| 1977 |
+
);
|
| 1978 |
+
|
| 1979 |
+
let bocchi = "Bocchi the Rock S01 孤獨搖滾!第一季 [Taiwanese Hokkien Dub][臺灣閩南語配音]/Bocchi the Rock S01 孤獨搖滾!第一季 [Taiwanese Hokkien Dub][Hàn-jī Hardsub][臺灣閩南語配音][漢字字幕]/Bocchi the Rock! 孤獨搖滾!S01E01「孤獨反輾轉」";
|
| 1980 |
+
let (leaf_key, _, _, _) =
|
| 1981 |
+
template_key_for_filename("Bocchi the Rock! 孤獨搖滾!S01E01「孤獨反輾轉」");
|
| 1982 |
+
assert_eq!(leaf_key, "TEXT SEP TEXT SEP TEXT SEP TEXT SXE TEXT");
|
| 1983 |
+
assert!(filename_has_title(
|
| 1984 |
+
"Bocchi the Rock! 孤獨搖滾!S01E01「孤獨反輾轉」"
|
| 1985 |
+
));
|
| 1986 |
+
let (trimmed, was_trimmed) = training_filename_for(bocchi);
|
| 1987 |
+
assert!(was_trimmed);
|
| 1988 |
+
assert_eq!(trimmed, "Bocchi the Rock! 孤獨搖滾!S01E01「孤獨反輾轉」");
|
| 1989 |
+
let (key, _, _, _) = template_key_for_filename(&trimmed);
|
| 1990 |
+
assert_eq!(key, "TEXT SEP TEXT SEP TEXT SEP TEXT SXE TEXT");
|
| 1991 |
+
|
| 1992 |
+
let usagi = "Gochuumon wa Usagi Desuka-60fps/Gochuumon wa Usagi Desuka S1/Usagi S1[01][60fps][8bit_1080p][x265_flac]";
|
| 1993 |
+
let (trimmed, was_trimmed) = training_filename_for(usagi);
|
| 1994 |
+
assert!(was_trimmed);
|
| 1995 |
+
assert_eq!(trimmed, "Usagi S1[01][60fps][8bit_1080p][x265_flac]");
|
| 1996 |
+
let (key, _, _, _) = template_key_for_filename(&trimmed);
|
| 1997 |
+
assert_eq!(
|
| 1998 |
+
key,
|
| 1999 |
+
"TEXT SEP SEASON BRACKET_EPISODE BRACKET_TEXT BRACKET_MEDIA_BLOCK BRACKET_MEDIA"
|
| 2000 |
+
);
|
| 2001 |
+
|
| 2002 |
+
let woody_parent =
|
| 2003 |
+
"Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p";
|
| 2004 |
+
let (trimmed, was_trimmed) = training_filename_for(&format!("Batch/{woody_parent}"));
|
| 2005 |
+
assert!(was_trimmed);
|
| 2006 |
+
assert_eq!(trimmed, woody_parent);
|
| 2007 |
+
|
| 2008 |
+
let volume =
|
| 2009 |
+
labels_for("[Snow-Raws] 生徒会役員共 Vol.01 MENU02 (BD 1920x1080 HEVC-YUV420P10 FLAC)");
|
| 2010 |
+
assert!(volume.contains(&("生徒会役員共".to_string(), "B-TITLE".to_string())));
|
| 2011 |
+
assert!(volume.contains(&("Vol".to_string(), "B-SPECIAL".to_string())));
|
| 2012 |
+
assert!(volume.contains(&("01".to_string(), "B-SPECIAL".to_string())));
|
| 2013 |
+
assert!(volume.contains(&("MENU02".to_string(), "B-SPECIAL".to_string())));
|
| 2014 |
+
assert!(!volume.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 2015 |
+
|
| 2016 |
+
let numeric_title =
|
| 2017 |
+
labels_for("3000.Leagues.in.Search.of.Mother.S01E01.1080p.WEB-DL.H.264-D00oo00M");
|
| 2018 |
+
assert!(numeric_title.contains(&("3000".to_string(), "B-TITLE".to_string())));
|
| 2019 |
+
assert!(numeric_title.contains(&("01".to_string(), "B-SEASON".to_string())));
|
| 2020 |
+
assert!(numeric_title.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 2021 |
+
assert!(numeric_title.contains(&("1080p".to_string(), "B-RESOLUTION".to_string())));
|
| 2022 |
+
assert!(numeric_title.contains(&("H".to_string(), "B-SOURCE".to_string())));
|
| 2023 |
+
assert!(numeric_title.contains(&("264".to_string(), "B-SOURCE".to_string())));
|
| 2024 |
+
assert!(!numeric_title.contains(&("264".to_string(), "B-EPISODE".to_string())));
|
| 2025 |
+
|
| 2026 |
+
let media_block =
|
| 2027 |
+
labels_for("[Kamigami] Kantai Collection - 06v2 [1920×1080 x264 AAC Sub(Chs,Cht,Jap)]");
|
| 2028 |
+
assert!(media_block.contains(&("1920".to_string(), "B-RESOLUTION".to_string())));
|
| 2029 |
+
assert!(media_block.contains(&("1080".to_string(), "B-RESOLUTION".to_string())));
|
| 2030 |
+
assert!(media_block.contains(&("x264".to_string(), "B-SOURCE".to_string())));
|
| 2031 |
+
assert!(media_block.contains(&("Chs".to_string(), "B-SOURCE".to_string())));
|
| 2032 |
}
|
| 2033 |
}
|