Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")

# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
- Notebooks
- Google Colab
- Kaggle
Improve DMHY template labeling pipeline
Browse files
- datasets/AnimeName +1 -1
- tools/rust_dmhy_template_apply/src/main.rs +964 -28
datasets/AnimeName
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
Subproject commit
|
|
|
|
| 1 |
+
Subproject commit 081fd450aafd59992f2df794c5b0110dc3cdd42b
|
tools/rust_dmhy_template_apply/src/main.rs
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
use anyhow::{bail, Context, Result};
|
| 2 |
use chrono::Utc;
|
| 3 |
use clap::Parser;
|
| 4 |
-
use once_cell::sync::Lazy;
|
| 5 |
use rayon::prelude::*;
|
| 6 |
use regex::Regex;
|
| 7 |
use serde::{Deserialize, Serialize};
|
|
@@ -21,6 +21,8 @@ struct Args {
|
|
| 21 |
audit_low_frequency: bool,
|
| 22 |
#[arg(long)]
|
| 23 |
verify_generated_output: bool,
|
|
|
|
|
|
|
| 24 |
#[arg(long, default_value = "datasets/AnimeName/dmhy_list.jsonl")]
|
| 25 |
input: PathBuf,
|
| 26 |
#[arg(long, default_value = "reports/dmhy_template_recipes.seed.jsonl")]
|
|
@@ -53,6 +55,8 @@ struct Args {
|
|
| 53 |
review_output: PathBuf,
|
| 54 |
#[arg(long, default_value = "reports/dmhy_low_frequency_audit.rust.jsonl")]
|
| 55 |
audit_output: PathBuf,
|
|
|
|
|
|
|
| 56 |
#[arg(long, default_value_t = 50)]
|
| 57 |
audit_max_count: u64,
|
| 58 |
#[arg(long)]
|
|
@@ -81,10 +85,22 @@ struct Args {
|
|
| 81 |
keep_encoding_noise: bool,
|
| 82 |
#[arg(long)]
|
| 83 |
preserve_parent_paths: bool,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
#[arg(long)]
|
| 85 |
threads: Option<usize>,
|
| 86 |
}
|
| 87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
#[derive(Debug, Clone, Deserialize)]
|
| 89 |
struct Recipe {
|
| 90 |
template_id: String,
|
|
@@ -151,11 +167,20 @@ enum Processed {
|
|
| 151 |
static HASH_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[A-Fa-f0-9]{8,}$").unwrap());
|
| 152 |
static RESOLUTION_RE: Lazy<Regex> =
|
| 153 |
Lazy::new(|| Regex::new(r"(?i)^(?:\d{3,4}p|\dK|\d{3,4}[xX×]\d{3,4})$").unwrap());
|
|
|
|
|
|
|
| 154 |
static EPISODE_VERSION_RE: Lazy<Regex> =
|
| 155 |
Lazy::new(|| Regex::new(r"(?i)^(?:EP?)?\d{1,4}(?:v|ver|version|rev)\d{1,3}$").unwrap());
|
|
|
|
|
|
|
|
|
|
| 156 |
static EPISODE_RE: Lazy<Regex> =
|
| 157 |
-
Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}(?:END)?$").unwrap());
|
|
|
|
|
|
|
| 158 |
static EPISODE_CJK_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^第?\d{1,4}[话話回集]$").unwrap());
|
|
|
|
|
|
|
| 159 |
static EPISODE_RANGE_RE: Lazy<Regex> =
|
| 160 |
Lazy::new(|| Regex::new(r"(?i)^\d{1,4}\s*[-~]\s*\d{1,4}(?:\s*END)?$").unwrap());
|
| 161 |
static EPISODE_BATCH_RE: Lazy<Regex> = Lazy::new(|| {
|
|
@@ -173,7 +198,7 @@ static CJK_SEASON_TOKEN_RE: Lazy<Regex> =
|
|
| 173 |
Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
|
| 174 |
static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
|
| 175 |
static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
|
| 176 |
-
Regex::new(r"(?i)^(?:NCOP|NCED|OP|ED|PV|CM|SP|OVA|OAD|IV|Menu|Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?$").unwrap()
|
| 177 |
});
|
| 178 |
static VOLUME_RE: Lazy<Regex> =
|
| 179 |
Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
|
|
@@ -183,7 +208,7 @@ static LANG_RE: Lazy<Regex> = Lazy::new(|| {
|
|
| 183 |
Regex::new(r"(?i)^(?:CHS|CHT|ZHS|ZHT|GB|BIG5|JPN?|JP|JA|JAP|ENG|EN|SC|TC|简[体體]?|繁[体體]?|简日|繁日|字幕|内封|外挂|Sub|Subs|MSubs?)$").unwrap()
|
| 184 |
});
|
| 185 |
static MEDIA_RE: Lazy<Regex> = Lazy::new(|| {
|
| 186 |
-
Regex::new(r"(?i)^(?:WEB[-_. ]?DL|WEB[-_. ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|HDTV|TVRip|REMUX|x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|FLAC|MP3|DTS|DTS-HDMA|
|
| 187 |
});
|
| 188 |
static SPECIAL_TITLE_PHRASE_RE: Lazy<Regex> = Lazy::new(|| {
|
| 189 |
Regex::new(r"(?i)\b(?:theater\s+greeting\s+event|world\s+prem(?:eie|iere)|picture\s+drama)\b")
|
|
@@ -191,6 +216,8 @@ static SPECIAL_TITLE_PHRASE_RE: Lazy<Regex> = Lazy::new(|| {
|
|
| 191 |
});
|
| 192 |
static YEAR_RANGE_RE: Lazy<Regex> =
|
| 193 |
Lazy::new(|| Regex::new(r"^\(?\s*(?:19|20)\d{2}\s*[-~]\s*(?:19|20)\d{2}\s*\)?$").unwrap());
|
|
|
|
|
|
|
| 194 |
static PATH_SEGMENT_SEASON_RE: Lazy<Regex> = Lazy::new(|| {
|
| 195 |
Regex::new(r"(?i)(?:^|[\s_.\-/])(?:season\s*\d{1,2}|s\d{1,2})(?:$|[\s_.\-/])").unwrap()
|
| 196 |
});
|
|
@@ -206,6 +233,7 @@ static SXE_SEASON_RE: Lazy<Regex> = Lazy::new(|| {
|
|
| 206 |
static TOKEN_REGEXES: Lazy<Vec<Regex>> = Lazy::new(|| {
|
| 207 |
[
|
| 208 |
r"^\d{3,4}[xX×]\d{3,4}",
|
|
|
|
| 209 |
r"(?i)^h\.?26[45]",
|
| 210 |
r"(?i)^x\.?26[45]",
|
| 211 |
r"^[\\/]+",
|
|
@@ -233,6 +261,7 @@ fn main() -> Result<()> {
|
|
| 233 |
.build_global()
|
| 234 |
.context("failed to configure rayon thread pool")?;
|
| 235 |
}
|
|
|
|
| 236 |
if args.cluster {
|
| 237 |
return run_cluster(&args);
|
| 238 |
}
|
|
@@ -242,6 +271,9 @@ fn main() -> Result<()> {
|
|
| 242 |
if args.verify_generated_output {
|
| 243 |
return run_verify_generated_output(&args);
|
| 244 |
}
|
|
|
|
|
|
|
|
|
|
| 245 |
if args.expand != "all" && args.expand != "sample" {
|
| 246 |
bail!("--expand must be all or sample");
|
| 247 |
}
|
|
@@ -334,6 +366,7 @@ fn main() -> Result<()> {
|
|
| 334 |
"min_count": args.min_count,
|
| 335 |
"low_frequency_audit_max_count": args.audit_max_count,
|
| 336 |
"low_frequency_blocking_warnings": [
|
|
|
|
| 337 |
"hash_labeled",
|
| 338 |
"multiple_title_spans",
|
| 339 |
"no_title",
|
|
@@ -355,6 +388,57 @@ fn main() -> Result<()> {
|
|
| 355 |
Ok(())
|
| 356 |
}
|
| 357 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 358 |
fn load_recipes(args: &Args) -> Result<HashMap<String, Recipe>> {
|
| 359 |
let file = File::open(&args.recipes)
|
| 360 |
.with_context(|| format!("recipe JSONL not found: {}", args.recipes.display()))?;
|
|
@@ -745,7 +829,11 @@ fn run_verify_generated_output(args: &Args) -> Result<()> {
|
|
| 745 |
for warning in audit_warnings(&record) {
|
| 746 |
if !matches!(
|
| 747 |
warning.as_str(),
|
| 748 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 749 |
) {
|
| 750 |
continue;
|
| 751 |
}
|
|
@@ -780,6 +868,204 @@ fn run_verify_generated_output(args: &Args) -> Result<()> {
|
|
| 780 |
Ok(())
|
| 781 |
}
|
| 782 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 783 |
fn entity_spans(tokens: &[String], labels: &[String]) -> Vec<Value> {
|
| 784 |
let mut spans = Vec::new();
|
| 785 |
let mut current_label: Option<String> = None;
|
|
@@ -820,8 +1106,16 @@ fn audit_warnings(record: &Record) -> Vec<String> {
|
|
| 820 |
} else if title_spans > 1 {
|
| 821 |
warnings.push("multiple_title_spans".to_string());
|
| 822 |
}
|
| 823 |
-
|
|
|
|
| 824 |
warnings.push("no_episode".to_string());
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 825 |
}
|
| 826 |
if record.filename.contains('/') || record.filename.contains('\\') {
|
| 827 |
warnings.push("path_retained".to_string());
|
|
@@ -927,7 +1221,11 @@ fn has_blocking_low_frequency_warning(record: &Record) -> bool {
|
|
| 927 |
audit_warnings(record).iter().any(|warning| {
|
| 928 |
matches!(
|
| 929 |
warning.as_str(),
|
| 930 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 931 |
)
|
| 932 |
})
|
| 933 |
}
|
|
@@ -1046,6 +1344,9 @@ fn classify_atom(text: &str) -> String {
|
|
| 1046 |
if EPISODE_VERSION_RE.is_match(&compact) {
|
| 1047 |
return "EPISODE_VERSION".to_string();
|
| 1048 |
}
|
|
|
|
|
|
|
|
|
|
| 1049 |
if SXE_RE.is_match(&compact) {
|
| 1050 |
return "SXE".to_string();
|
| 1051 |
}
|
|
@@ -1321,8 +1622,33 @@ fn training_filename_for(original: &str) -> (String, bool) {
|
|
| 1321 |
.map(str::trim)
|
| 1322 |
.filter(|part| !part.is_empty())
|
| 1323 |
.collect();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1324 |
if parts.len() >= 2 && filename_has_title(parts[parts.len() - 1]) {
|
| 1325 |
-
if
|
| 1326 |
if !path_segment_is_plain_season(parts[parts.len() - 2]) {
|
| 1327 |
return (parts[parts.len() - 1].to_string(), true);
|
| 1328 |
}
|
|
@@ -1334,7 +1660,14 @@ fn training_filename_for(original: &str) -> (String, bool) {
|
|
| 1334 |
{
|
| 1335 |
(parts[parts.len() - 1].to_string(), true)
|
| 1336 |
} else {
|
| 1337 |
-
(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1338 |
}
|
| 1339 |
} else {
|
| 1340 |
(parts[parts.len() - 1].to_string(), true)
|
|
@@ -1349,6 +1682,43 @@ fn path_segment_is_plain_season(segment: &str) -> bool {
|
|
| 1349 |
PLAIN_SEASON_SEGMENT_RE.is_match(&cleaned)
|
| 1350 |
}
|
| 1351 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1352 |
fn path_segment_has_season(value: &str) -> bool {
|
| 1353 |
PATH_SEGMENT_SEASON_RE.is_match(value)
|
| 1354 |
}
|
|
@@ -1368,7 +1738,9 @@ fn has_encoding_noise(value: &str) -> bool {
|
|
| 1368 |
return true;
|
| 1369 |
}
|
| 1370 |
let markers = [
|
| 1371 |
-
"譁", "蜈", "螟", "蟄", "謇", "邱", "荳", "縺", "繧", "莨", "鬆", "髯",
|
|
|
|
|
|
|
| 1372 |
];
|
| 1373 |
let marker_hits = markers
|
| 1374 |
.iter()
|
|
@@ -1403,7 +1775,83 @@ fn path_segment_is_episodeish(value: &str) -> bool {
|
|
| 1403 |
!structural.is_empty()
|
| 1404 |
&& structural
|
| 1405 |
.iter()
|
| 1406 |
-
.all(|item|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1407 |
}
|
| 1408 |
|
| 1409 |
fn has_abstract_path_noise(value: &str) -> bool {
|
|
@@ -1642,6 +2090,11 @@ fn split_sxe_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
|
|
| 1642 |
}
|
| 1643 |
|
| 1644 |
fn split_episode_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1645 |
let caps = EPISODE_VALUE_RE.captures(token)?;
|
| 1646 |
let mut pieces = vec![caps[1].to_string(), caps[2].to_string()];
|
| 1647 |
let mut labels = vec!["O".to_string(), "B-EPISODE".to_string()];
|
|
@@ -1672,6 +2125,29 @@ fn group_text(tokens: &[String], group: &Group) -> String {
|
|
| 1672 |
)
|
| 1673 |
}
|
| 1674 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1675 |
fn is_special_title_phrase(text: &str) -> bool {
|
| 1676 |
let normalized = SPECIAL_SPACE_RE
|
| 1677 |
.replace_all(text, " ")
|
|
@@ -1681,6 +2157,8 @@ fn is_special_title_phrase(text: &str) -> bool {
|
|
| 1681 |
normalized.as_str(),
|
| 1682 |
"CM" | "EVENT"
|
| 1683 |
| "EIZOU"
|
|
|
|
|
|
|
| 1684 |
| "LOGO"
|
| 1685 |
| "MENU"
|
| 1686 |
| "OMAKE"
|
|
@@ -1690,13 +2168,123 @@ fn is_special_title_phrase(text: &str) -> bool {
|
|
| 1690 |
| "TOKUTEN"
|
| 1691 |
| "TRAILER"
|
| 1692 |
| "WORLD PREMIERE"
|
| 1693 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1694 |
}
|
| 1695 |
|
| 1696 |
fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]) -> Vec<String> {
|
| 1697 |
let mut output = roles.to_vec();
|
| 1698 |
let ep_markers = ["EP", "E", "Episode", "ep", "episode"];
|
| 1699 |
let roman = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX"];
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1700 |
if !output.iter().any(|role| role == "TITLE")
|
| 1701 |
&& roles
|
| 1702 |
.first()
|
|
@@ -1790,17 +2378,40 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 1790 |
}
|
| 1791 |
if output[index - 2] == "TITLE"
|
| 1792 |
&& groups[index - 1].class_name == "SEP"
|
| 1793 |
-
&& previous_text.len() <=
|
| 1794 |
-
&& previous_text.
|
| 1795 |
-
&& previous_text.chars().all(|ch| ch.is_ascii_alphabetic())
|
| 1796 |
&& text.chars().all(|ch| ch.is_ascii_digit())
|
| 1797 |
&& text.len() <= 3
|
| 1798 |
-
&& (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1799 |
{
|
| 1800 |
output[index] = "TITLE".to_string();
|
| 1801 |
continue;
|
| 1802 |
}
|
| 1803 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1804 |
if roles[index].starts_with("EPISODE")
|
| 1805 |
&& index >= 2
|
| 1806 |
&& output[..index].iter().any(|role| role == "TITLE")
|
|
@@ -1843,6 +2454,15 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 1843 |
output[index] = "SPECIAL".to_string();
|
| 1844 |
continue;
|
| 1845 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1846 |
if roles[index] == "TITLE" && matches!(text.as_str(), "第" | "話" | "话" | "回" | "集")
|
| 1847 |
{
|
| 1848 |
output[index] = "O".to_string();
|
|
@@ -1870,8 +2490,10 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 1870 |
&& groups[index + 1].class_name == "SEP"
|
| 1871 |
&& roles[index + 2].starts_with("EPISODE")
|
| 1872 |
{
|
| 1873 |
-
output[index] = "
|
| 1874 |
-
|
|
|
|
|
|
|
| 1875 |
continue;
|
| 1876 |
}
|
| 1877 |
if roles[index] == "TITLE"
|
|
@@ -1897,6 +2519,37 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 1897 |
output[index + 2] = "O".to_string();
|
| 1898 |
}
|
| 1899 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1900 |
if roles[index].starts_with("EPISODE") {
|
| 1901 |
let previous_text = if index >= 1 {
|
| 1902 |
group_text(tokens, &groups[index - 1])
|
|
@@ -1959,6 +2612,7 @@ fn title_candidates(groups: &[Group], roles: &[String]) -> Vec<(usize, usize)> {
|
|
| 1959 |
}
|
| 1960 |
|
| 1961 |
fn enforce_single_title_candidate(
|
|
|
|
| 1962 |
groups: &[Group],
|
| 1963 |
roles: &[String],
|
| 1964 |
) -> (Vec<String>, Vec<String>) {
|
|
@@ -1981,13 +2635,20 @@ fn enforce_single_title_candidate(
|
|
| 1981 |
.copied()
|
| 1982 |
.filter(|(_, end)| *end <= first_anchor)
|
| 1983 |
.collect();
|
| 1984 |
-
let
|
| 1985 |
&candidates
|
| 1986 |
} else {
|
| 1987 |
&before_anchor
|
| 1988 |
-
}
|
|
|
|
| 1989 |
.iter()
|
| 1990 |
-
.max_by_key(|(start, end)|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1991 |
.copied()
|
| 1992 |
.unwrap();
|
| 1993 |
let mut output = roles.to_vec();
|
|
@@ -2006,6 +2667,33 @@ fn enforce_single_title_candidate(
|
|
| 2006 |
(output, dropped)
|
| 2007 |
}
|
| 2008 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2009 |
fn normalize_generated_tokens(tokens: &[String], labels: &[String]) -> (Vec<String>, Vec<String>) {
|
| 2010 |
let mut output_tokens = Vec::new();
|
| 2011 |
let mut output_labels = Vec::new();
|
|
@@ -2162,14 +2850,16 @@ fn project_refined_tokens(
|
|
| 2162 |
fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
| 2163 |
let joiners = [
|
| 2164 |
" ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?",
|
| 2165 |
-
"?", ";", ";", ",", ",", "
|
| 2166 |
-
"】", "
|
|
|
|
| 2167 |
];
|
| 2168 |
let title_terminal_punctuation = ["!", "!", "?", "?"];
|
| 2169 |
let entity_joiners = [
|
| 2170 |
" ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?",
|
| 2171 |
-
"?", ";", ";", ",", ",", "
|
| 2172 |
-
"】", "
|
|
|
|
| 2173 |
];
|
| 2174 |
let mut output = labels.to_vec();
|
| 2175 |
for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
|
|
@@ -2203,17 +2893,50 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
|
| 2203 |
output[index] = "B-TITLE".to_string();
|
| 2204 |
}
|
| 2205 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2206 |
}
|
| 2207 |
output
|
| 2208 |
}
|
| 2209 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2210 |
fn dmhy_record(filename: &str, template_id: &str, roles: &[String]) -> Option<Record> {
|
| 2211 |
let (key, tokens, _classes, groups) = template_key_for_filename(filename);
|
| 2212 |
if groups.len() != roles.len() {
|
| 2213 |
return None;
|
| 2214 |
}
|
| 2215 |
let roles = adjust_contextual_roles(&tokens, &groups, roles);
|
| 2216 |
-
let (roles, dropped) = enforce_single_title_candidate(&groups, &roles);
|
| 2217 |
let (tokens, labels) = project_refined_tokens(&tokens, &groups, &roles);
|
| 2218 |
let labels = smooth_title_spans(&tokens, &labels);
|
| 2219 |
if tokens.len() != labels.len() {
|
|
@@ -2246,6 +2969,18 @@ mod tests {
|
|
| 2246 |
record.tokens.into_iter().zip(record.labels).collect()
|
| 2247 |
}
|
| 2248 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2249 |
#[test]
|
| 2250 |
fn required_regressions() {
|
| 2251 |
let title_91 = labels_for("Title 91 EP 01 [1080p]");
|
|
@@ -2313,6 +3048,30 @@ mod tests {
|
|
| 2313 |
let comma_title =
|
| 2314 |
labels_for("[Studio GreenTea] Kamiina Botan, Yoeru Sugata wa Yuri no Hana [01]");
|
| 2315 |
assert!(comma_title.contains(&(",".to_string(), "B-TITLE".to_string())));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2316 |
let happy_lesson = labels_for("【DVD】 HAPPY☆LESSON THE TV 第01話");
|
| 2317 |
assert!(happy_lesson.contains(&("☆".to_string(), "B-TITLE".to_string())));
|
| 2318 |
let idolmaster = labels_for("[CASO&SumiSora][THE_IDOLM@STER_CINDERELLA_GIRLS][07.5_SP]");
|
|
@@ -2322,6 +3081,7 @@ mod tests {
|
|
| 2322 |
let mayoi = labels_for("[Snow-Raws] 迷家[マヨイガ] 第01話");
|
| 2323 |
assert!(mayoi.contains(&("迷家".to_string(), "B-TITLE".to_string())));
|
| 2324 |
assert!(mayoi.contains(&("マヨイガ".to_string(), "B-TITLE".to_string())));
|
|
|
|
| 2325 |
|
| 2326 |
let conan_time = labels_for("【蓝色狂想】名侦探柯南TV版第036集-周一晚上7点半杀人事件");
|
| 2327 |
assert!(conan_time.contains(&("036".to_string(), "B-EPISODE".to_string())));
|
|
@@ -2336,6 +3096,121 @@ mod tests {
|
|
| 2336 |
assert!(sky.contains(&("One".to_string(), "B-TITLE".to_string())));
|
| 2337 |
assert!(!sky.contains(&("海贼王".to_string(), "B-TITLE".to_string())));
|
| 2338 |
assert!(sky.contains(&("918".to_string(), "B-EPISODE".to_string())));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2339 |
}
|
| 2340 |
|
| 2341 |
#[test]
|
|
@@ -2345,7 +3220,7 @@ mod tests {
|
|
| 2345 |
assert!(was_trimmed);
|
| 2346 |
assert_eq!(
|
| 2347 |
trimmed,
|
| 2348 |
-
"Season 4
|
| 2349 |
);
|
| 2350 |
let pokemon = "Pokémon Season 2 - Orange League [Ep. 83-118]/Pokemon - 084 - OL002 - A Scare in the Air [DVD][PM-Dragon-x264-AC3][00270AC8]";
|
| 2351 |
let (trimmed_pokemon, pokemon_was_trimmed) = training_filename_for(pokemon);
|
|
@@ -2402,6 +3277,27 @@ mod tests {
|
|
| 2402 |
assert!(was_trimmed);
|
| 2403 |
assert_eq!(trimmed, "Avatar The Last Airbender S2 14 [1080p]");
|
| 2404 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2405 |
let tintin = "Adventures of Tintin (1991) Season 1-3 S01-S03 (1080p BluRay x265 HEVC 10bit EAC3 2.0 Garshasp)/Season 1/Adventures of Tintin (1991) - S01E06 - Cigars of the Pharaoh (Part 1) (1080p BluRay x265 Garshasp)";
|
| 2406 |
let (trimmed, was_trimmed) = training_filename_for(tintin);
|
| 2407 |
assert!(was_trimmed);
|
|
@@ -2442,7 +3338,47 @@ mod tests {
|
|
| 2442 |
"Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p";
|
| 2443 |
let (trimmed, was_trimmed) = training_filename_for(&format!("Batch/{woody_parent}"));
|
| 2444 |
assert!(was_trimmed);
|
| 2445 |
-
assert_eq!(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2446 |
|
| 2447 |
let volume =
|
| 2448 |
labels_for("[Snow-Raws] 生徒会役員共 Vol.01 MENU02 (BD 1920x1080 HEVC-YUV420P10 FLAC)");
|
|
|
|
| 1 |
use anyhow::{bail, Context, Result};
|
| 2 |
use chrono::Utc;
|
| 3 |
use clap::Parser;
|
| 4 |
+
use once_cell::sync::{Lazy, OnceCell};
|
| 5 |
use rayon::prelude::*;
|
| 6 |
use regex::Regex;
|
| 7 |
use serde::{Deserialize, Serialize};
|
|
|
|
| 21 |
audit_low_frequency: bool,
|
| 22 |
#[arg(long)]
|
| 23 |
verify_generated_output: bool,
|
| 24 |
+
#[arg(long)]
|
| 25 |
+
rich_annotations: bool,
|
| 26 |
#[arg(long, default_value = "datasets/AnimeName/dmhy_list.jsonl")]
|
| 27 |
input: PathBuf,
|
| 28 |
#[arg(long, default_value = "reports/dmhy_template_recipes.seed.jsonl")]
|
|
|
|
| 55 |
review_output: PathBuf,
|
| 56 |
#[arg(long, default_value = "reports/dmhy_low_frequency_audit.rust.jsonl")]
|
| 57 |
audit_output: PathBuf,
|
| 58 |
+
#[arg(long, default_value = "reports/dmhy_rich_annotations.rust.jsonl")]
|
| 59 |
+
rich_output: PathBuf,
|
| 60 |
#[arg(long, default_value_t = 50)]
|
| 61 |
audit_max_count: u64,
|
| 62 |
#[arg(long)]
|
|
|
|
| 85 |
keep_encoding_noise: bool,
|
| 86 |
#[arg(long)]
|
| 87 |
preserve_parent_paths: bool,
|
| 88 |
+
#[arg(long, default_value = "datasets/AnimeName/dmhy_title_whitelist.txt")]
|
| 89 |
+
title_whitelist: PathBuf,
|
| 90 |
+
#[arg(long, default_value = "datasets/AnimeName/dmhy_group_whitelist.txt")]
|
| 91 |
+
group_whitelist: PathBuf,
|
| 92 |
#[arg(long)]
|
| 93 |
threads: Option<usize>,
|
| 94 |
}
|
| 95 |
|
| 96 |
+
/// Whitelist data loaded once at startup from the `--title-whitelist` and
/// `--group-whitelist` files.
///
/// `Default` yields two empty collections, which is also what the loaders
/// produce when the corresponding whitelist file is absent.
#[derive(Debug, Default)]
struct Whitelists {
    /// One entry per usable title-whitelist line, each already split into
    /// parts by `phrase_parts_for_whitelist`; lines that split into nothing
    /// are not stored.
    title_phrases: Vec<Vec<String>>,
    /// Group names after `normalize_whitelist_name`; empty results are not
    /// stored.
    group_names: HashSet<String>,
}
|
| 101 |
+
|
| 102 |
+
static RUNTIME_WHITELISTS: OnceCell<Whitelists> = OnceCell::new();
|
| 103 |
+
|
| 104 |
#[derive(Debug, Clone, Deserialize)]
|
| 105 |
struct Recipe {
|
| 106 |
template_id: String,
|
|
|
|
| 167 |
static HASH_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[A-Fa-f0-9]{8,}$").unwrap());
|
| 168 |
static RESOLUTION_RE: Lazy<Regex> =
|
| 169 |
Lazy::new(|| Regex::new(r"(?i)^(?:\d{3,4}p|\dK|\d{3,4}[xX×]\d{3,4})$").unwrap());
|
| 170 |
+
static BARE_RESOLUTION_RE: Lazy<Regex> =
|
| 171 |
+
Lazy::new(|| Regex::new(r"^(?:360|480|540|576|720|1080|2160)$").unwrap());
|
| 172 |
static EPISODE_VERSION_RE: Lazy<Regex> =
|
| 173 |
Lazy::new(|| Regex::new(r"(?i)^(?:EP?)?\d{1,4}(?:v|ver|version|rev)\d{1,3}$").unwrap());
|
| 174 |
+
static EPISODE_WITH_SUFFIX_RE: Lazy<Regex> = Lazy::new(|| {
|
| 175 |
+
Regex::new(r"(?i)^\d{1,4}[_ .-]?(?:Notice|Full|R18|R|Uncut|Director'?s?Cut)$").unwrap()
|
| 176 |
+
});
|
| 177 |
static EPISODE_RE: Lazy<Regex> =
|
| 178 |
+
Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}(?:\.\d{1,2})?(?:END)?$").unwrap());
|
| 179 |
+
static DECIMAL_EPISODE_RE: Lazy<Regex> =
|
| 180 |
+
Lazy::new(|| Regex::new(r"^\d{1,3}\.\d{1,2}$").unwrap());
|
| 181 |
static EPISODE_CJK_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^第?\d{1,4}[话話回集]$").unwrap());
|
| 182 |
+
static EPISODE_CJK_PREFIX_RE: Lazy<Regex> =
|
| 183 |
+
Lazy::new(|| Regex::new(r"^第?\d{1,4}[话話回集]").unwrap());
|
| 184 |
static EPISODE_RANGE_RE: Lazy<Regex> =
|
| 185 |
Lazy::new(|| Regex::new(r"(?i)^\d{1,4}\s*[-~]\s*\d{1,4}(?:\s*END)?$").unwrap());
|
| 186 |
static EPISODE_BATCH_RE: Lazy<Regex> = Lazy::new(|| {
|
|
|
|
| 198 |
Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
|
| 199 |
static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
|
| 200 |
static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
|
| 201 |
+
Regex::new(r"(?i)^(?:(?:NCOP|NCED|OP|ED|PV|CM)(?:[\s_.-]?(?:\d{1,4}|v\d{1,3}|[A-Z]))?|SP(?:[\s_.-]?\d{0,4})?|(?:OVA|OAD|IV)(?:[\s_.-]?\d{0,4})?|(?:Menu|Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?)$").unwrap()
|
| 202 |
});
|
| 203 |
static VOLUME_RE: Lazy<Regex> =
|
| 204 |
Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
|
|
|
|
| 208 |
Regex::new(r"(?i)^(?:CHS|CHT|ZHS|ZHT|GB|BIG5|JPN?|JP|JA|JAP|ENG|EN|SC|TC|简[体體]?|繁[体體]?|简日|繁日|字幕|内封|外挂|Sub|Subs|MSubs?)$").unwrap()
|
| 209 |
});
|
| 210 |
static MEDIA_RE: Lazy<Regex> = Lazy::new(|| {
|
| 211 |
+
Regex::new(r"(?i)^(?:WEB|WEB[-_. ]?DL|WEB[-_. ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|HD|UHD|HDTV|TVRip|REMUX|xvid|x26[45]|h\.?26[45]|HEVC|AVC|AV1|YUV\d+P?\d*|AAC\s*\d*(?:\.\d+)?|DDP\s*\d*(?:\.\d+)?|FLAC|MP3|DTS|HDMA|DTS-HDMA|E?AC3x?\d*(?:\.\d+)?|Opus|WMV\d*|\d(?:\.\d)?ch|10[-_. ]?bit|8[-_. ]?bit|Hi10p|Ma10p|ASSx?\d*|SRTx?\d*|SUP|R\d[A-Z]*|NoSub|MKV|MP4|AVI|RAW|Raws?)$").unwrap()
|
| 212 |
});
|
| 213 |
static SPECIAL_TITLE_PHRASE_RE: Lazy<Regex> = Lazy::new(|| {
|
| 214 |
Regex::new(r"(?i)\b(?:theater\s+greeting\s+event|world\s+prem(?:eie|iere)|picture\s+drama)\b")
|
|
|
|
| 216 |
});
|
| 217 |
static YEAR_RANGE_RE: Lazy<Regex> =
|
| 218 |
Lazy::new(|| Regex::new(r"^\(?\s*(?:19|20)\d{2}\s*[-~]\s*(?:19|20)\d{2}\s*\)?$").unwrap());
|
| 219 |
+
static VERSIONISH_TITLE_RE: Lazy<Regex> =
|
| 220 |
+
Lazy::new(|| Regex::new(r"(?i)^(?:19|20)\d{2}(?:版|ver\.?|version)?$").unwrap());
|
| 221 |
static PATH_SEGMENT_SEASON_RE: Lazy<Regex> = Lazy::new(|| {
|
| 222 |
Regex::new(r"(?i)(?:^|[\s_.\-/])(?:season\s*\d{1,2}|s\d{1,2})(?:$|[\s_.\-/])").unwrap()
|
| 223 |
});
|
|
|
|
| 233 |
static TOKEN_REGEXES: Lazy<Vec<Regex>> = Lazy::new(|| {
|
| 234 |
[
|
| 235 |
r"^\d{3,4}[xX×]\d{3,4}",
|
| 236 |
+
r"(?i)^(?:AAC|AC3|EAC3|DTS|FLAC|DDP)\s*\d+(?:\.\d+)?",
|
| 237 |
r"(?i)^h\.?26[45]",
|
| 238 |
r"(?i)^x\.?26[45]",
|
| 239 |
r"^[\\/]+",
|
|
|
|
| 261 |
.build_global()
|
| 262 |
.context("failed to configure rayon thread pool")?;
|
| 263 |
}
|
| 264 |
+
let _ = RUNTIME_WHITELISTS.set(load_whitelists(&args)?);
|
| 265 |
if args.cluster {
|
| 266 |
return run_cluster(&args);
|
| 267 |
}
|
|
|
|
| 271 |
if args.verify_generated_output {
|
| 272 |
return run_verify_generated_output(&args);
|
| 273 |
}
|
| 274 |
+
if args.rich_annotations {
|
| 275 |
+
return run_rich_annotations(&args);
|
| 276 |
+
}
|
| 277 |
if args.expand != "all" && args.expand != "sample" {
|
| 278 |
bail!("--expand must be all or sample");
|
| 279 |
}
|
|
|
|
| 366 |
"min_count": args.min_count,
|
| 367 |
"low_frequency_audit_max_count": args.audit_max_count,
|
| 368 |
"low_frequency_blocking_warnings": [
|
| 369 |
+
"ambiguous_no_episode_title",
|
| 370 |
"hash_labeled",
|
| 371 |
"multiple_title_spans",
|
| 372 |
"no_title",
|
|
|
|
| 388 |
Ok(())
|
| 389 |
}
|
| 390 |
|
| 391 |
+
fn load_whitelists(args: &Args) -> Result<Whitelists> {
|
| 392 |
+
Ok(Whitelists {
|
| 393 |
+
title_phrases: load_title_whitelist(&args.title_whitelist)?,
|
| 394 |
+
group_names: load_name_whitelist(&args.group_whitelist)?,
|
| 395 |
+
})
|
| 396 |
+
}
|
| 397 |
+
|
| 398 |
+
fn load_title_whitelist(path: &PathBuf) -> Result<Vec<Vec<String>>> {
|
| 399 |
+
let mut phrases = Vec::new();
|
| 400 |
+
for line in load_whitelist_lines(path)? {
|
| 401 |
+
let phrase = phrase_parts_for_whitelist(&line);
|
| 402 |
+
if !phrase.is_empty() {
|
| 403 |
+
phrases.push(phrase);
|
| 404 |
+
}
|
| 405 |
+
}
|
| 406 |
+
Ok(phrases)
|
| 407 |
+
}
|
| 408 |
+
|
| 409 |
+
fn load_name_whitelist(path: &PathBuf) -> Result<HashSet<String>> {
|
| 410 |
+
Ok(load_whitelist_lines(path)?
|
| 411 |
+
.into_iter()
|
| 412 |
+
.map(|line| normalize_whitelist_name(&line))
|
| 413 |
+
.filter(|line| !line.is_empty())
|
| 414 |
+
.collect())
|
| 415 |
+
}
|
| 416 |
+
|
| 417 |
+
fn load_whitelist_lines(path: &PathBuf) -> Result<Vec<String>> {
|
| 418 |
+
if !path.exists() {
|
| 419 |
+
return Ok(Vec::new());
|
| 420 |
+
}
|
| 421 |
+
let file = File::open(path)
|
| 422 |
+
.with_context(|| format!("failed to open whitelist {}", path.display()))?;
|
| 423 |
+
let mut lines = Vec::new();
|
| 424 |
+
for line in BufReader::new(file).lines() {
|
| 425 |
+
let line = line?;
|
| 426 |
+
let line = line.trim();
|
| 427 |
+
if line.is_empty() || line.starts_with('#') {
|
| 428 |
+
continue;
|
| 429 |
+
}
|
| 430 |
+
let value = line
|
| 431 |
+
.split_once('\t')
|
| 432 |
+
.map(|(_, value)| value)
|
| 433 |
+
.unwrap_or(line)
|
| 434 |
+
.trim();
|
| 435 |
+
if !value.is_empty() {
|
| 436 |
+
lines.push(value.to_string());
|
| 437 |
+
}
|
| 438 |
+
}
|
| 439 |
+
Ok(lines)
|
| 440 |
+
}
|
| 441 |
+
|
| 442 |
fn load_recipes(args: &Args) -> Result<HashMap<String, Recipe>> {
|
| 443 |
let file = File::open(&args.recipes)
|
| 444 |
.with_context(|| format!("recipe JSONL not found: {}", args.recipes.display()))?;
|
|
|
|
| 829 |
for warning in audit_warnings(&record) {
|
| 830 |
if !matches!(
|
| 831 |
warning.as_str(),
|
| 832 |
+
"ambiguous_no_episode_title"
|
| 833 |
+
| "hash_labeled"
|
| 834 |
+
| "multiple_title_spans"
|
| 835 |
+
| "no_title"
|
| 836 |
+
| "path_retained"
|
| 837 |
) {
|
| 838 |
continue;
|
| 839 |
}
|
|
|
|
| 868 |
Ok(())
|
| 869 |
}
|
| 870 |
|
| 871 |
+
fn run_rich_annotations(args: &Args) -> Result<()> {
|
| 872 |
+
let inputs = load_input(&args.input, args.limit)?;
|
| 873 |
+
if let Some(parent) = args.rich_output.parent() {
|
| 874 |
+
fs::create_dir_all(parent)?;
|
| 875 |
+
}
|
| 876 |
+
let rows: Vec<Value> = inputs
|
| 877 |
+
.par_iter()
|
| 878 |
+
.filter_map(|original| {
|
| 879 |
+
if !args.keep_encoding_noise
|
| 880 |
+
&& (has_encoding_noise(original)
|
| 881 |
+
|| has_non_anime_noise(original)
|
| 882 |
+
|| has_abstract_path_noise(original))
|
| 883 |
+
{
|
| 884 |
+
return None;
|
| 885 |
+
}
|
| 886 |
+
Some(rich_annotation_for(original))
|
| 887 |
+
})
|
| 888 |
+
.collect();
|
| 889 |
+
let mut writer = BufWriter::new(File::create(&args.rich_output)?);
|
| 890 |
+
for row in &rows {
|
| 891 |
+
serde_json::to_writer(&mut writer, row)?;
|
| 892 |
+
writer.write_all(b"\n")?;
|
| 893 |
+
}
|
| 894 |
+
writer.flush()?;
|
| 895 |
+
let manifest = json!({
|
| 896 |
+
"generated_at": Utc::now().to_rfc3339(),
|
| 897 |
+
"input": args.input.to_string_lossy(),
|
| 898 |
+
"rich_output": args.rich_output.to_string_lossy(),
|
| 899 |
+
"rows": rows.len(),
|
| 900 |
+
"implementation": "rust_dmhy_rich_annotations",
|
| 901 |
+
"notes": [
|
| 902 |
+
"rich roles are metadata for review/projection, not final training BIO labels",
|
| 903 |
+
"TITLE_* candidates may be collapsed or filtered before dmhy_weak generation"
|
| 904 |
+
]
|
| 905 |
+
});
|
| 906 |
+
println!("{}", serde_json::to_string_pretty(&manifest)?);
|
| 907 |
+
Ok(())
|
| 908 |
+
}
|
| 909 |
+
|
| 910 |
+
fn rich_annotation_for(original: &str) -> Value {
|
| 911 |
+
let (training_filename, path_trimmed) = training_filename_for(original);
|
| 912 |
+
let parts: Vec<&str> = original
|
| 913 |
+
.split(|ch| ch == '/' || ch == '\\')
|
| 914 |
+
.map(str::trim)
|
| 915 |
+
.filter(|part| !part.is_empty())
|
| 916 |
+
.collect();
|
| 917 |
+
let leaf_index = parts.len().saturating_sub(1);
|
| 918 |
+
let segments = parts
|
| 919 |
+
.iter()
|
| 920 |
+
.enumerate()
|
| 921 |
+
.map(|(index, segment)| rich_segment(segment, index, index == leaf_index))
|
| 922 |
+
.collect::<Vec<_>>();
|
| 923 |
+
let projection = dmhy_record(
|
| 924 |
+
&training_filename,
|
| 925 |
+
"rich_projection",
|
| 926 |
+
&suggested_roles(&template_key_for_filename(&training_filename).0),
|
| 927 |
+
)
|
| 928 |
+
.map(|record| {
|
| 929 |
+
json!({
|
| 930 |
+
"filename": record.filename,
|
| 931 |
+
"spans": entity_spans(&record.tokens, &record.labels),
|
| 932 |
+
"warnings": audit_warnings(&record),
|
| 933 |
+
})
|
| 934 |
+
});
|
| 935 |
+
json!({
|
| 936 |
+
"source_filename": original,
|
| 937 |
+
"training_filename": training_filename,
|
| 938 |
+
"path_trimmed": path_trimmed,
|
| 939 |
+
"segments": segments,
|
| 940 |
+
"projection_preview": projection,
|
| 941 |
+
})
|
| 942 |
+
}
|
| 943 |
+
|
| 944 |
+
fn rich_segment(segment: &str, index: usize, is_leaf: bool) -> Value {
|
| 945 |
+
let (key, tokens, _classes, groups) = template_key_for_filename(segment);
|
| 946 |
+
let suggested = suggested_roles(&key);
|
| 947 |
+
let roles = adjust_contextual_roles(&tokens, &groups, &suggested);
|
| 948 |
+
let candidates = rich_candidates_for_segment(segment, &tokens, &groups, &roles, is_leaf);
|
| 949 |
+
json!({
|
| 950 |
+
"index": index,
|
| 951 |
+
"text": segment,
|
| 952 |
+
"kind": rich_segment_kind(segment, is_leaf),
|
| 953 |
+
"template": key,
|
| 954 |
+
"candidates": candidates,
|
| 955 |
+
})
|
| 956 |
+
}
|
| 957 |
+
|
| 958 |
+
fn rich_segment_kind(segment: &str, is_leaf: bool) -> &'static str {
|
| 959 |
+
if path_segment_is_media_noise(segment) {
|
| 960 |
+
"media_noise"
|
| 961 |
+
} else if path_segment_is_plain_season(segment) {
|
| 962 |
+
"season_dir"
|
| 963 |
+
} else if is_leaf {
|
| 964 |
+
"leaf"
|
| 965 |
+
} else {
|
| 966 |
+
"parent"
|
| 967 |
+
}
|
| 968 |
+
}
|
| 969 |
+
|
| 970 |
+
fn rich_candidates_for_segment(
|
| 971 |
+
segment: &str,
|
| 972 |
+
tokens: &[String],
|
| 973 |
+
groups: &[Group],
|
| 974 |
+
roles: &[String],
|
| 975 |
+
is_leaf: bool,
|
| 976 |
+
) -> Vec<Value> {
|
| 977 |
+
let mut output = Vec::new();
|
| 978 |
+
let title_ranges = title_candidates(groups, roles);
|
| 979 |
+
for (candidate_index, (start, end)) in title_ranges.iter().copied().enumerate() {
|
| 980 |
+
let text = candidate_text(tokens, groups, start, end);
|
| 981 |
+
if text.trim().is_empty() {
|
| 982 |
+
continue;
|
| 983 |
+
}
|
| 984 |
+
output.push(json!({
|
| 985 |
+
"role": fine_title_role(segment, &text, is_leaf, candidate_index, title_ranges.len()),
|
| 986 |
+
"coarse_role": "TITLE",
|
| 987 |
+
"text": text,
|
| 988 |
+
"group_start": start,
|
| 989 |
+
"group_end": end,
|
| 990 |
+
}));
|
| 991 |
+
}
|
| 992 |
+
for (group_index, role) in roles.iter().enumerate() {
|
| 993 |
+
if role == "TITLE" || role == "O" || role == "HASH" {
|
| 994 |
+
continue;
|
| 995 |
+
}
|
| 996 |
+
let text = group_text(tokens, &groups[group_index]);
|
| 997 |
+
if text.trim().is_empty() {
|
| 998 |
+
continue;
|
| 999 |
+
}
|
| 1000 |
+
let coarse_role = role_label(role)
|
| 1001 |
+
.strip_prefix("B-")
|
| 1002 |
+
.map(str::to_string)
|
| 1003 |
+
.unwrap_or_else(|| "O".to_string());
|
| 1004 |
+
output.push(json!({
|
| 1005 |
+
"role": fine_non_title_role(role),
|
| 1006 |
+
"coarse_role": coarse_role,
|
| 1007 |
+
"text": text,
|
| 1008 |
+
"group_start": group_index,
|
| 1009 |
+
"group_end": group_index + 1,
|
| 1010 |
+
}));
|
| 1011 |
+
}
|
| 1012 |
+
output
|
| 1013 |
+
}
|
| 1014 |
+
|
| 1015 |
+
fn candidate_text(tokens: &[String], groups: &[Group], start: usize, end: usize) -> String {
|
| 1016 |
+
let Some(first) = groups.get(start).and_then(|group| group.indices.first()) else {
|
| 1017 |
+
return String::new();
|
| 1018 |
+
};
|
| 1019 |
+
let Some(last) = groups
|
| 1020 |
+
.get(end.saturating_sub(1))
|
| 1021 |
+
.and_then(|group| group.indices.last())
|
| 1022 |
+
else {
|
| 1023 |
+
return String::new();
|
| 1024 |
+
};
|
| 1025 |
+
strip_wrapper(&tokens[*first..=*last].join(""))
|
| 1026 |
+
}
|
| 1027 |
+
|
| 1028 |
+
fn fine_title_role(
|
| 1029 |
+
segment: &str,
|
| 1030 |
+
text: &str,
|
| 1031 |
+
is_leaf: bool,
|
| 1032 |
+
candidate_index: usize,
|
| 1033 |
+
candidate_count: usize,
|
| 1034 |
+
) -> &'static str {
|
| 1035 |
+
let cleaned = text.trim();
|
| 1036 |
+
if VERSIONISH_TITLE_RE.is_match(cleaned) {
|
| 1037 |
+
return "RELEASE_VERSION";
|
| 1038 |
+
}
|
| 1039 |
+
if matches!(
|
| 1040 |
+
cleaned.to_ascii_lowercase().as_str(),
|
| 1041 |
+
"国漫" | "國漫" | "anime" | "movie" | "movies"
|
| 1042 |
+
) {
|
| 1043 |
+
return "TITLE_CATEGORY";
|
| 1044 |
+
}
|
| 1045 |
+
if is_leaf && path_segment_starts_with_episode(segment) {
|
| 1046 |
+
return "EPISODE_TITLE";
|
| 1047 |
+
}
|
| 1048 |
+
if !is_leaf {
|
| 1049 |
+
return "PATH_TITLE";
|
| 1050 |
+
}
|
| 1051 |
+
if candidate_count > 1 && candidate_index > 0 {
|
| 1052 |
+
return "TITLE_ALIAS";
|
| 1053 |
+
}
|
| 1054 |
+
"TITLE_MAIN"
|
| 1055 |
+
}
|
| 1056 |
+
|
| 1057 |
+
/// Maps a coarse non-title role to the fine-grained role used in the rich
/// output; roles not listed here become `"OTHER"`.
fn fine_non_title_role(role: &str) -> &'static str {
    static FINE_ROLES: [(&str, &str); 9] = [
        ("GROUP", "RELEASE_GROUP"),
        ("EPISODE", "EPISODE"),
        ("EPISODE_VERSION", "EPISODE"),
        ("EPISODE_RANGE", "EPISODE"),
        ("SEASON", "SEASON"),
        ("SPECIAL", "SPECIAL"),
        ("VOLUME", "SPECIAL"),
        ("RESOLUTION", "RESOLUTION"),
        ("SOURCE", "SOURCE"),
    ];
    for (coarse, fine) in FINE_ROLES.iter() {
        if *coarse == role {
            return fine;
        }
    }
    "OTHER"
}
|
| 1068 |
+
|
| 1069 |
fn entity_spans(tokens: &[String], labels: &[String]) -> Vec<Value> {
|
| 1070 |
let mut spans = Vec::new();
|
| 1071 |
let mut current_label: Option<String> = None;
|
|
|
|
| 1106 |
} else if title_spans > 1 {
|
| 1107 |
warnings.push("multiple_title_spans".to_string());
|
| 1108 |
}
|
| 1109 |
+
let has_episode = record.labels.iter().any(|label| label.ends_with("EPISODE"));
|
| 1110 |
+
if !has_episode {
|
| 1111 |
warnings.push("no_episode".to_string());
|
| 1112 |
+
if record
|
| 1113 |
+
.dropped_title_candidate_positions
|
| 1114 |
+
.as_ref()
|
| 1115 |
+
.is_some_and(|dropped| !dropped.is_empty())
|
| 1116 |
+
{
|
| 1117 |
+
warnings.push("ambiguous_no_episode_title".to_string());
|
| 1118 |
+
}
|
| 1119 |
}
|
| 1120 |
if record.filename.contains('/') || record.filename.contains('\\') {
|
| 1121 |
warnings.push("path_retained".to_string());
|
|
|
|
| 1221 |
audit_warnings(record).iter().any(|warning| {
|
| 1222 |
matches!(
|
| 1223 |
warning.as_str(),
|
| 1224 |
+
"ambiguous_no_episode_title"
|
| 1225 |
+
| "hash_labeled"
|
| 1226 |
+
| "multiple_title_spans"
|
| 1227 |
+
| "no_title"
|
| 1228 |
+
| "path_retained"
|
| 1229 |
)
|
| 1230 |
})
|
| 1231 |
}
|
|
|
|
| 1344 |
if EPISODE_VERSION_RE.is_match(&compact) {
|
| 1345 |
return "EPISODE_VERSION".to_string();
|
| 1346 |
}
|
| 1347 |
+
if EPISODE_WITH_SUFFIX_RE.is_match(&cleaned) {
|
| 1348 |
+
return "EPISODE_VERSION".to_string();
|
| 1349 |
+
}
|
| 1350 |
if SXE_RE.is_match(&compact) {
|
| 1351 |
return "SXE".to_string();
|
| 1352 |
}
|
|
|
|
| 1622 |
.map(str::trim)
|
| 1623 |
.filter(|part| !part.is_empty())
|
| 1624 |
.collect();
|
| 1625 |
+
if parts.len() >= 2
|
| 1626 |
+
&& (path_segment_is_episodeish(parts[parts.len() - 1])
|
| 1627 |
+
|| (!path_segment_is_plain_season(parts[parts.len() - 2])
|
| 1628 |
+
&& path_segment_starts_with_episode(parts[parts.len() - 1])
|
| 1629 |
+
&& !leaf_has_full_title_after_episode(parts[parts.len() - 1])))
|
| 1630 |
+
{
|
| 1631 |
+
if let Some(parent) = parts[..parts.len() - 1]
|
| 1632 |
+
.iter()
|
| 1633 |
+
.rev()
|
| 1634 |
+
.find(|part| {
|
| 1635 |
+
let trimmed = trim_parent_title_segment(part);
|
| 1636 |
+
filename_has_title(&trimmed) && !path_segment_is_media_noise(&trimmed)
|
| 1637 |
+
})
|
| 1638 |
+
{
|
| 1639 |
+
let parent = trim_parent_title_segment(parent.trim());
|
| 1640 |
+
return (
|
| 1641 |
+
format!(
|
| 1642 |
+
"{} {}",
|
| 1643 |
+
parent,
|
| 1644 |
+
parts[parts.len() - 1].trim()
|
| 1645 |
+
),
|
| 1646 |
+
true,
|
| 1647 |
+
);
|
| 1648 |
+
}
|
| 1649 |
+
}
|
| 1650 |
if parts.len() >= 2 && filename_has_title(parts[parts.len() - 1]) {
|
| 1651 |
+
if path_segment_has_season(parts[parts.len() - 2]) {
|
| 1652 |
if !path_segment_is_plain_season(parts[parts.len() - 2]) {
|
| 1653 |
return (parts[parts.len() - 1].to_string(), true);
|
| 1654 |
}
|
|
|
|
| 1660 |
{
|
| 1661 |
(parts[parts.len() - 1].to_string(), true)
|
| 1662 |
} else {
|
| 1663 |
+
(
|
| 1664 |
+
format!(
|
| 1665 |
+
"{} {}",
|
| 1666 |
+
parts[parts.len() - 2].trim(),
|
| 1667 |
+
parts[parts.len() - 1].trim()
|
| 1668 |
+
),
|
| 1669 |
+
true,
|
| 1670 |
+
)
|
| 1671 |
}
|
| 1672 |
} else {
|
| 1673 |
(parts[parts.len() - 1].to_string(), true)
|
|
|
|
| 1682 |
PLAIN_SEASON_SEGMENT_RE.is_match(&cleaned)
|
| 1683 |
}
|
| 1684 |
|
| 1685 |
+
/// Removes a trailing "TV" marker (`_TV`, `.TV`, ` TV` or the lowercase
/// forms) from a trimmed segment, together with any separators left
/// dangling in front of it. Segments without such a marker are returned
/// trimmed but otherwise unchanged.
fn trim_terminal_series_kind(segment: &str) -> String {
    let trimmed = segment.trim();
    let suffixes = ["_TV", ".TV", " TV", "_tv", ".tv", " tv"];
    let stem = suffixes
        .iter()
        .find_map(|suffix| trimmed.strip_suffix(*suffix));
    match stem {
        Some(stem) => stem.trim_end_matches(['_', '.', ' ']).to_string(),
        None => trimmed.to_string(),
    }
}
|
| 1695 |
+
|
| 1696 |
+
fn trim_parent_title_segment(segment: &str) -> String {
|
| 1697 |
+
let mut output = trim_terminal_series_kind(segment);
|
| 1698 |
+
loop {
|
| 1699 |
+
let trimmed = output.trim_end();
|
| 1700 |
+
let Some(last) = trimmed.chars().next_back() else {
|
| 1701 |
+
return output;
|
| 1702 |
+
};
|
| 1703 |
+
let open = match last {
|
| 1704 |
+
')' => '(',
|
| 1705 |
+
']' => '[',
|
| 1706 |
+
'】' => '【',
|
| 1707 |
+
_ => return output,
|
| 1708 |
+
};
|
| 1709 |
+
let Some(start) = trimmed.rfind(open) else {
|
| 1710 |
+
return output;
|
| 1711 |
+
};
|
| 1712 |
+
let suffix = &trimmed[start..];
|
| 1713 |
+
if path_segment_is_media_noise(suffix) {
|
| 1714 |
+
output.truncate(start);
|
| 1715 |
+
output = output.trim_end_matches([' ', '_', '.', '-']).to_string();
|
| 1716 |
+
continue;
|
| 1717 |
+
}
|
| 1718 |
+
return output;
|
| 1719 |
+
}
|
| 1720 |
+
}
|
| 1721 |
+
|
| 1722 |
fn path_segment_has_season(value: &str) -> bool {
|
| 1723 |
PATH_SEGMENT_SEASON_RE.is_match(value)
|
| 1724 |
}
|
|
|
|
| 1738 |
return true;
|
| 1739 |
}
|
| 1740 |
let markers = [
|
| 1741 |
+
"譁", "蜈", "螟", "蟄", "謇", "邱", "荳", "縺", "繧", "莨", "鬆", "髯", "瀛",
|
| 1742 |
+
"楀", "箷", "绲", "刔", "鏃", "湪", "鏍", "犲", "儚", "鐗", "吀", "铦", "躲",
|
| 1743 |
+
"伄", "椋", "伓", "姘",
|
| 1744 |
];
|
| 1745 |
let marker_hits = markers
|
| 1746 |
.iter()
|
|
|
|
| 1775 |
!structural.is_empty()
|
| 1776 |
&& structural
|
| 1777 |
.iter()
|
| 1778 |
+
.all(|item| {
|
| 1779 |
+
item.starts_with("EPISODE")
|
| 1780 |
+
|| item.as_str() == "SPECIAL"
|
| 1781 |
+
|| item.as_str() == "VOLUME"
|
| 1782 |
+
|| item.as_str() == "BRACKET_VOLUME"
|
| 1783 |
+
})
|
| 1784 |
+
}
|
| 1785 |
+
|
| 1786 |
+
fn path_segment_starts_with_episode(value: &str) -> bool {
|
| 1787 |
+
if EPISODE_CJK_PREFIX_RE.is_match(value.trim()) {
|
| 1788 |
+
return true;
|
| 1789 |
+
}
|
| 1790 |
+
let (key, _, _, groups) = template_key_for_filename(value);
|
| 1791 |
+
let roles = suggested_roles(&key);
|
| 1792 |
+
groups
|
| 1793 |
+
.iter()
|
| 1794 |
+
.zip(roles.iter())
|
| 1795 |
+
.find(|(group, _)| group.class_name != "SEP")
|
| 1796 |
+
.is_some_and(|(_, role)| role.starts_with("EPISODE"))
|
| 1797 |
+
}
|
| 1798 |
+
|
| 1799 |
+
fn leaf_has_full_title_after_episode(value: &str) -> bool {
|
| 1800 |
+
let (key, _, _, groups) = template_key_for_filename(value);
|
| 1801 |
+
let roles = suggested_roles(&key);
|
| 1802 |
+
let first_structural = roles.iter().position(|role| role.starts_with("EPISODE"));
|
| 1803 |
+
let Some(first_episode) = first_structural else {
|
| 1804 |
+
return false;
|
| 1805 |
+
};
|
| 1806 |
+
groups
|
| 1807 |
+
.iter()
|
| 1808 |
+
.zip(roles.iter())
|
| 1809 |
+
.skip(first_episode + 1)
|
| 1810 |
+
.filter(|(group, _)| group.class_name != "SEP")
|
| 1811 |
+
.any(|(_, role)| role == "TITLE")
|
| 1812 |
+
}
|
| 1813 |
+
|
| 1814 |
+
fn path_segment_is_media_noise(value: &str) -> bool {
|
| 1815 |
+
let normalized = value.to_ascii_lowercase();
|
| 1816 |
+
if normalized.contains("sourceunknown") || normalized.contains("sourceunknow") {
|
| 1817 |
+
return true;
|
| 1818 |
+
}
|
| 1819 |
+
if (normalized.contains("dvdrip")
|
| 1820 |
+
|| normalized.contains("bdrip")
|
| 1821 |
+
|| normalized.contains("webrip")
|
| 1822 |
+
|| normalized.contains("web-dl")
|
| 1823 |
+
|| normalized.contains("bluray"))
|
| 1824 |
+
&& tokenize(value)
|
| 1825 |
+
.iter()
|
| 1826 |
+
.map(|token| classify_atom(token))
|
| 1827 |
+
.any(|class_name| class_name == "RESOLUTION")
|
| 1828 |
+
{
|
| 1829 |
+
return true;
|
| 1830 |
+
}
|
| 1831 |
+
let (_, _, _, groups) = template_key_for_filename(value);
|
| 1832 |
+
let structural: Vec<&String> = groups
|
| 1833 |
+
.iter()
|
| 1834 |
+
.map(|group| &group.class_name)
|
| 1835 |
+
.filter(|item| item.as_str() != "SEP")
|
| 1836 |
+
.collect();
|
| 1837 |
+
!structural.is_empty()
|
| 1838 |
+
&& structural.iter().all(|item| {
|
| 1839 |
+
matches!(
|
| 1840 |
+
item.as_str(),
|
| 1841 |
+
"MEDIA"
|
| 1842 |
+
| "RESOLUTION"
|
| 1843 |
+
| "LANG"
|
| 1844 |
+
| "HASH"
|
| 1845 |
+
| "DATE"
|
| 1846 |
+
| "BRACKET_MEDIA"
|
| 1847 |
+
| "BRACKET_RESOLUTION"
|
| 1848 |
+
| "BRACKET_LANG"
|
| 1849 |
+
| "BRACKET_HASH"
|
| 1850 |
+
| "BRACKET_DATE"
|
| 1851 |
+
| "MEDIA_BLOCK"
|
| 1852 |
+
| "BRACKET_MEDIA_BLOCK"
|
| 1853 |
+
)
|
| 1854 |
+
})
|
| 1855 |
}
|
| 1856 |
|
| 1857 |
fn has_abstract_path_noise(value: &str) -> bool {
|
|
|
|
| 2090 |
}
|
| 2091 |
|
| 2092 |
fn split_episode_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
|
| 2093 |
+
if DECIMAL_EPISODE_RE.is_match(token) {
|
| 2094 |
+
let pieces = split_generated_token(token);
|
| 2095 |
+
let labels = pieces.iter().map(|_| "B-EPISODE".to_string()).collect();
|
| 2096 |
+
return Some((pieces, labels));
|
| 2097 |
+
}
|
| 2098 |
let caps = EPISODE_VALUE_RE.captures(token)?;
|
| 2099 |
let mut pieces = vec![caps[1].to_string(), caps[2].to_string()];
|
| 2100 |
let mut labels = vec!["O".to_string(), "B-EPISODE".to_string()];
|
|
|
|
| 2125 |
)
|
| 2126 |
}
|
| 2127 |
|
| 2128 |
+
/// Collapses every run of whitespace in `value` to a single space and drops
/// leading and trailing whitespace.
fn normalize_whitelist_name(value: &str) -> String {
    let mut normalized = String::with_capacity(value.len());
    for word in value.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
|
| 2131 |
+
|
| 2132 |
+
fn phrase_parts_for_whitelist(value: &str) -> Vec<String> {
|
| 2133 |
+
let tokens = tokenize(value);
|
| 2134 |
+
let classes: Vec<String> = tokens.iter().map(|token| classify_token(token)).collect();
|
| 2135 |
+
let groups = compact_token_groups(&tokens, &classes);
|
| 2136 |
+
groups
|
| 2137 |
+
.iter()
|
| 2138 |
+
.filter(|group| whitelist_phrase_group(group))
|
| 2139 |
+
.map(|group| group_text(&tokens, group))
|
| 2140 |
+
.filter(|part| !part.trim().is_empty())
|
| 2141 |
+
.collect()
|
| 2142 |
+
}
|
| 2143 |
+
|
| 2144 |
+
fn whitelist_phrase_group(group: &Group) -> bool {
|
| 2145 |
+
matches!(
|
| 2146 |
+
group.class_name.as_str(),
|
| 2147 |
+
"TEXT" | "EPISODE" | "SPECIAL" | "SEASON" | "BRACKET_TEXT"
|
| 2148 |
+
)
|
| 2149 |
+
}
|
| 2150 |
+
|
| 2151 |
fn is_special_title_phrase(text: &str) -> bool {
|
| 2152 |
let normalized = SPECIAL_SPACE_RE
|
| 2153 |
.replace_all(text, " ")
|
|
|
|
| 2157 |
normalized.as_str(),
|
| 2158 |
"CM" | "EVENT"
|
| 2159 |
| "EIZOU"
|
| 2160 |
+
| "EXTRA"
|
| 2161 |
+
| "EXTRAS"
|
| 2162 |
| "LOGO"
|
| 2163 |
| "MENU"
|
| 2164 |
| "OMAKE"
|
|
|
|
| 2168 |
| "TOKUTEN"
|
| 2169 |
| "TRAILER"
|
| 2170 |
| "WORLD PREMIERE"
|
| 2171 |
+
| "映像特典"
|
| 2172 |
+
| "特典"
|
| 2173 |
+
) || normalized.contains("映像特典")
|
| 2174 |
+
|| SPECIAL_TITLE_PHRASE_RE.is_match(text)
|
| 2175 |
+
}
|
| 2176 |
+
|
| 2177 |
+
/// Built-in multi-part titles, one slice of consecutive group texts per title.
///
/// Several entries contain a part (`100`, `300`, `2`, `Episode`) that on its
/// own looks like an episode number, a season number or an episode keyword.
const KNOWN_TITLE_PHRASES: &[&[&str]] = &[
    &["SPY", "x", "FAMILY"],
    &["Spy", "x", "Family"],
    &["Slime", "300"],
    &["Zom", "100"],
    &["Kamisama", "Hajimemashita", "2"],
    &["Phantasy", "Star", "Online", "2", "Episode", "Oracle"],
];
|
| 2185 |
+
|
| 2186 |
+
fn apply_known_title_phrases(tokens: &[String], groups: &[Group], roles: &mut [String]) {
|
| 2187 |
+
if let Some(whitelists) = RUNTIME_WHITELISTS.get() {
|
| 2188 |
+
for (index, group) in groups.iter().enumerate() {
|
| 2189 |
+
if group.class_name == "BRACKET_TEXT"
|
| 2190 |
+
&& roles.get(index).is_some_and(|role| role == "GROUP")
|
| 2191 |
+
&& whitelists
|
| 2192 |
+
.group_names
|
| 2193 |
+
.contains(&normalize_whitelist_name(&group_text(tokens, group)))
|
| 2194 |
+
{
|
| 2195 |
+
roles[index] = "GROUP".to_string();
|
| 2196 |
+
}
|
| 2197 |
+
}
|
| 2198 |
+
}
|
| 2199 |
+
let searchable: Vec<(usize, String)> = groups
|
| 2200 |
+
.iter()
|
| 2201 |
+
.enumerate()
|
| 2202 |
+
.filter(|(_, group)| whitelist_phrase_group(group))
|
| 2203 |
+
.map(|(index, group)| (index, group_text(tokens, group)))
|
| 2204 |
+
.collect();
|
| 2205 |
+
for phrase in KNOWN_TITLE_PHRASES {
|
| 2206 |
+
apply_title_phrase(&searchable, phrase, roles, true);
|
| 2207 |
+
}
|
| 2208 |
+
if let Some(whitelists) = RUNTIME_WHITELISTS.get() {
|
| 2209 |
+
for phrase in &whitelists.title_phrases {
|
| 2210 |
+
if phrase.len() >= 2 {
|
| 2211 |
+
apply_title_phrase(&searchable, phrase, roles, false);
|
| 2212 |
+
}
|
| 2213 |
+
}
|
| 2214 |
+
}
|
| 2215 |
+
}
|
| 2216 |
+
|
| 2217 |
+
/// Marks every occurrence of `phrase` in `searchable` as `TITLE`.
///
/// `searchable` pairs a group index with that group's text. A match is a run
/// of consecutive entries whose texts equal the phrase parts, ignoring ASCII
/// case. Within a match, groups labelled `GROUP` are always left alone;
/// structural roles (episode variants, season, source, resolution) are left
/// alone unless `allow_structural_override` is set.
fn apply_title_phrase(
    searchable: &[(usize, String)],
    phrase: &[impl AsRef<str>],
    roles: &mut [String],
    allow_structural_override: bool,
) {
    let width = phrase.len();
    if width == 0 || width > searchable.len() {
        return;
    }
    let matching_windows = searchable.windows(width).filter(|window| {
        window
            .iter()
            .zip(phrase.iter())
            .all(|((_, text), expected)| text.eq_ignore_ascii_case(expected.as_ref()))
    });
    for window in matching_windows {
        for group_index in window.iter().map(|(index, _)| *index) {
            let protected = match roles.get(group_index).map(String::as_str) {
                Some("GROUP") => true,
                Some(
                    "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE" | "SEASON" | "SOURCE"
                    | "RESOLUTION",
                ) => !allow_structural_override,
                _ => false,
            };
            if !protected {
                roles[group_index] = "TITLE".to_string();
            }
        }
    }
}
|
| 2258 |
|
| 2259 |
fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]) -> Vec<String> {
|
| 2260 |
let mut output = roles.to_vec();
|
| 2261 |
let ep_markers = ["EP", "E", "Episode", "ep", "episode"];
|
| 2262 |
let roman = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX"];
|
| 2263 |
+
apply_known_title_phrases(tokens, groups, &mut output);
|
| 2264 |
+
if roles
|
| 2265 |
+
.first()
|
| 2266 |
+
.is_some_and(|role| role.starts_with("EPISODE"))
|
| 2267 |
+
&& YEAR_RANGE_RE.is_match(&group_text(tokens, &groups[0]))
|
| 2268 |
+
{
|
| 2269 |
+
let first_real_structural = (1..roles.len())
|
| 2270 |
+
.find(|&index| {
|
| 2271 |
+
roles[index].starts_with("EPISODE")
|
| 2272 |
+
|| matches!(roles[index].as_str(), "SEASON" | "SPECIAL")
|
| 2273 |
+
})
|
| 2274 |
+
.unwrap_or(roles.len());
|
| 2275 |
+
for index in 1..first_real_structural {
|
| 2276 |
+
if groups[index].class_name == "TEXT"
|
| 2277 |
+
&& !matches!(
|
| 2278 |
+
group_text(tokens, &groups[index])
|
| 2279 |
+
.to_ascii_uppercase()
|
| 2280 |
+
.as_str(),
|
| 2281 |
+
"TV" | "OVA" | "OAD" | "SP"
|
| 2282 |
+
)
|
| 2283 |
+
{
|
| 2284 |
+
output[index] = "TITLE".to_string();
|
| 2285 |
+
}
|
| 2286 |
+
}
|
| 2287 |
+
}
|
| 2288 |
if !output.iter().any(|role| role == "TITLE")
|
| 2289 |
&& roles
|
| 2290 |
.first()
|
|
|
|
| 2378 |
}
|
| 2379 |
if output[index - 2] == "TITLE"
|
| 2380 |
&& groups[index - 1].class_name == "SEP"
|
| 2381 |
+
&& previous_text.len() <= 48
|
| 2382 |
+
&& previous_text.chars().any(|ch| ch.is_alphabetic())
|
|
|
|
| 2383 |
&& text.chars().all(|ch| ch.is_ascii_digit())
|
| 2384 |
&& text.len() <= 3
|
| 2385 |
+
&& !(index + 2 < roles.len()
|
| 2386 |
+
&& groups[index + 1].class_name == "SEP"
|
| 2387 |
+
&& group_text(tokens, &groups[index + 2]).eq_ignore_ascii_case("episode"))
|
| 2388 |
+
&& (next_episode
|
| 2389 |
+
|| (next_special
|
| 2390 |
+
&& (text.parse::<u16>().is_ok_and(|value| value >= 100)
|
| 2391 |
+
|| (previous_text.len() <= 4
|
| 2392 |
+
&& previous_text.is_ascii()
|
| 2393 |
+
&& previous_text
|
| 2394 |
+
.chars()
|
| 2395 |
+
.all(|ch| ch.is_ascii_alphabetic())))))
|
| 2396 |
{
|
| 2397 |
output[index] = "TITLE".to_string();
|
| 2398 |
continue;
|
| 2399 |
}
|
| 2400 |
}
|
| 2401 |
+
if roles[index].starts_with("EPISODE")
|
| 2402 |
+
&& BARE_RESOLUTION_RE.is_match(&text)
|
| 2403 |
+
&& index >= 2
|
| 2404 |
+
&& groups[index - 1].class_name == "SEP"
|
| 2405 |
+
{
|
| 2406 |
+
let previous_text = group_text(tokens, &groups[index - 2]);
|
| 2407 |
+
if previous_text
|
| 2408 |
+
.chars()
|
| 2409 |
+
.any(|ch| ch.is_ascii_digit() || matches!(ch, '.' | '-' | '_' | '.'))
|
| 2410 |
+
{
|
| 2411 |
+
output[index] = "RESOLUTION".to_string();
|
| 2412 |
+
continue;
|
| 2413 |
+
}
|
| 2414 |
+
}
|
| 2415 |
if roles[index].starts_with("EPISODE")
|
| 2416 |
&& index >= 2
|
| 2417 |
&& output[..index].iter().any(|role| role == "TITLE")
|
|
|
|
| 2454 |
output[index] = "SPECIAL".to_string();
|
| 2455 |
continue;
|
| 2456 |
}
|
| 2457 |
+
if roles[index] == "TITLE"
|
| 2458 |
+
&& matches!(text.to_ascii_uppercase().as_str(), "TV" | "TV版")
|
| 2459 |
+
&& output.iter().enumerate().any(|(other, role)| {
|
| 2460 |
+
other != index && role == "TITLE"
|
| 2461 |
+
})
|
| 2462 |
+
{
|
| 2463 |
+
output[index] = "O".to_string();
|
| 2464 |
+
continue;
|
| 2465 |
+
}
|
| 2466 |
if roles[index] == "TITLE" && matches!(text.as_str(), "第" | "話" | "话" | "回" | "集")
|
| 2467 |
{
|
| 2468 |
output[index] = "O".to_string();
|
|
|
|
| 2490 |
&& groups[index + 1].class_name == "SEP"
|
| 2491 |
&& roles[index + 2].starts_with("EPISODE")
|
| 2492 |
{
|
| 2493 |
+
if !output[..index].iter().any(|role| role == "TITLE") {
|
| 2494 |
+
output[index] = "O".to_string();
|
| 2495 |
+
output[index + 2] = "SEASON".to_string();
|
| 2496 |
+
}
|
| 2497 |
continue;
|
| 2498 |
}
|
| 2499 |
if roles[index] == "TITLE"
|
|
|
|
| 2519 |
output[index + 2] = "O".to_string();
|
| 2520 |
}
|
| 2521 |
}
|
| 2522 |
+
if roles[index].starts_with("EPISODE")
|
| 2523 |
+
&& !output[index + 1..].iter().any(|role| role == "TITLE")
|
| 2524 |
+
{
|
| 2525 |
+
let mut run = Vec::new();
|
| 2526 |
+
let mut cursor = index + 1;
|
| 2527 |
+
while cursor < roles.len() {
|
| 2528 |
+
if groups[cursor].class_name == "SEP" {
|
| 2529 |
+
cursor += 1;
|
| 2530 |
+
continue;
|
| 2531 |
+
}
|
| 2532 |
+
if groups[cursor].class_name == "TEXT"
|
| 2533 |
+
&& !matches!(
|
| 2534 |
+
roles[cursor].as_str(),
|
| 2535 |
+
"SOURCE" | "RESOLUTION" | "SEASON" | "SPECIAL"
|
| 2536 |
+
)
|
| 2537 |
+
{
|
| 2538 |
+
run.push(cursor);
|
| 2539 |
+
cursor += 1;
|
| 2540 |
+
continue;
|
| 2541 |
+
}
|
| 2542 |
+
if !run.is_empty() {
|
| 2543 |
+
break;
|
| 2544 |
+
}
|
| 2545 |
+
cursor += 1;
|
| 2546 |
+
}
|
| 2547 |
+
if run.len() >= 2 {
|
| 2548 |
+
for item in run {
|
| 2549 |
+
output[item] = "TITLE".to_string();
|
| 2550 |
+
}
|
| 2551 |
+
}
|
| 2552 |
+
}
|
| 2553 |
if roles[index].starts_with("EPISODE") {
|
| 2554 |
let previous_text = if index >= 1 {
|
| 2555 |
group_text(tokens, &groups[index - 1])
|
|
|
|
| 2612 |
}
|
| 2613 |
|
| 2614 |
fn enforce_single_title_candidate(
|
| 2615 |
+
tokens: &[String],
|
| 2616 |
groups: &[Group],
|
| 2617 |
roles: &[String],
|
| 2618 |
) -> (Vec<String>, Vec<String>) {
|
|
|
|
| 2635 |
.copied()
|
| 2636 |
.filter(|(_, end)| *end <= first_anchor)
|
| 2637 |
.collect();
|
| 2638 |
+
let selected_pool = if before_anchor.is_empty() {
|
| 2639 |
&candidates
|
| 2640 |
} else {
|
| 2641 |
&before_anchor
|
| 2642 |
+
};
|
| 2643 |
+
let selected = selected_pool
|
| 2644 |
.iter()
|
| 2645 |
+
.max_by_key(|(start, end)| {
|
| 2646 |
+
(
|
| 2647 |
+
title_candidate_score(tokens, groups, *start, *end),
|
| 2648 |
+
*end,
|
| 2649 |
+
end - start,
|
| 2650 |
+
)
|
| 2651 |
+
})
|
| 2652 |
.copied()
|
| 2653 |
.unwrap();
|
| 2654 |
let mut output = roles.to_vec();
|
|
|
|
| 2667 |
(output, dropped)
|
| 2668 |
}
|
| 2669 |
|
| 2670 |
+
fn title_candidate_score(tokens: &[String], groups: &[Group], start: usize, end: usize) -> isize {
|
| 2671 |
+
let text = (start..end)
|
| 2672 |
+
.filter(|&index| roles_candidate_text_group(&groups[index]))
|
| 2673 |
+
.map(|index| group_text(tokens, &groups[index]))
|
| 2674 |
+
.collect::<Vec<_>>()
|
| 2675 |
+
.join("");
|
| 2676 |
+
let cleaned = text.trim();
|
| 2677 |
+
if cleaned.is_empty() {
|
| 2678 |
+
return -1000;
|
| 2679 |
+
}
|
| 2680 |
+
let mut score = cleaned.chars().filter(|ch| ch.is_alphanumeric()).count() as isize;
|
| 2681 |
+
if VERSIONISH_TITLE_RE.is_match(cleaned) {
|
| 2682 |
+
score -= 500;
|
| 2683 |
+
}
|
| 2684 |
+
if matches!(
|
| 2685 |
+
cleaned.to_ascii_lowercase().as_str(),
|
| 2686 |
+
"国漫" | "國漫" | "anime" | "movie" | "movies"
|
| 2687 |
+
) {
|
| 2688 |
+
score -= 500;
|
| 2689 |
+
}
|
| 2690 |
+
score
|
| 2691 |
+
}
|
| 2692 |
+
|
| 2693 |
+
fn roles_candidate_text_group(group: &Group) -> bool {
|
| 2694 |
+
matches!(group.class_name.as_str(), "TEXT" | "BRACKET_TEXT")
|
| 2695 |
+
}
|
| 2696 |
+
|
| 2697 |
fn normalize_generated_tokens(tokens: &[String], labels: &[String]) -> (Vec<String>, Vec<String>) {
|
| 2698 |
let mut output_tokens = Vec::new();
|
| 2699 |
let mut output_labels = Vec::new();
|
|
|
|
| 2850 |
fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
| 2851 |
let joiners = [
|
| 2852 |
" ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?",
|
| 2853 |
+
"?", ";", ";", ",", ",", "、", "。", "~", "~", "-", "+", "+", "(", ")",
|
| 2854 |
+
"(", ")", "[", "]", "【", "】", "<", ">", "<", ">", "「", "」", "「", "」",
|
| 2855 |
+
"☆", "♪", "`", "@",
|
| 2856 |
];
|
| 2857 |
let title_terminal_punctuation = ["!", "!", "?", "?"];
|
| 2858 |
let entity_joiners = [
|
| 2859 |
" ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?",
|
| 2860 |
+
"?", ";", ";", ",", ",", "、", "。", "~", "~", "-", "+", "+", "(", ")",
|
| 2861 |
+
"(", ")", "[", "]", "【", "】", "<", ">", "<", ">", "「", "」", "「", "」",
|
| 2862 |
+
"☆", "♪", "`", "@", "&", "&",
|
| 2863 |
];
|
| 2864 |
let mut output = labels.to_vec();
|
| 2865 |
for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
|
|
|
|
| 2893 |
output[index] = "B-TITLE".to_string();
|
| 2894 |
}
|
| 2895 |
}
|
| 2896 |
+
if matches!(token.as_str(), "]" | "】" | ")" | ")" | ">" | ">" | "」" | "」")
|
| 2897 |
+
&& index > 0
|
| 2898 |
+
&& output[index - 1] == "B-TITLE"
|
| 2899 |
+
&& title_span_has_labeled_opener(&tokens[..index], &output[..index], token)
|
| 2900 |
+
{
|
| 2901 |
+
output[index] = "B-TITLE".to_string();
|
| 2902 |
+
}
|
| 2903 |
}
|
| 2904 |
output
|
| 2905 |
}
|
| 2906 |
|
| 2907 |
+
fn title_span_has_labeled_opener(tokens: &[String], labels: &[String], closer: &str) -> bool {
|
| 2908 |
+
for (token, label) in tokens.iter().zip(labels.iter()).rev() {
|
| 2909 |
+
if label != "B-TITLE" {
|
| 2910 |
+
return false;
|
| 2911 |
+
}
|
| 2912 |
+
if closer_matches_opener(closer, token) {
|
| 2913 |
+
return true;
|
| 2914 |
+
}
|
| 2915 |
+
}
|
| 2916 |
+
false
|
| 2917 |
+
}
|
| 2918 |
+
|
| 2919 |
+
/// Reports whether `opener` is the opening bracket that pairs with `closer`.
///
/// Supported pairs are square, lenticular, round, angle and corner brackets;
/// round and angle brackets are accepted in both ASCII and full-width form,
/// corner brackets in both full-width and half-width form. Mixed-width pairs
/// do not match.
///
/// NOTE(review): the reviewed text listed the round, angle and corner pairs
/// twice with identical characters, which makes the second copy of each an
/// unreachable pattern. The duplicates are presumably width variants that
/// were lost to character normalization — confirm against the repository.
fn closer_matches_opener(closer: &str, opener: &str) -> bool {
    matches!(
        (closer, opener),
        ("]", "[")
            | ("】", "【")
            | (")", "(")
            | ("\u{FF09}", "\u{FF08}") // full-width parentheses
            | (">", "<")
            | ("\u{FF1E}", "\u{FF1C}") // full-width angle brackets
            | ("」", "「")
            | ("\u{FF63}", "\u{FF62}") // half-width corner brackets
    )
}
|
| 2932 |
+
|
| 2933 |
fn dmhy_record(filename: &str, template_id: &str, roles: &[String]) -> Option<Record> {
|
| 2934 |
let (key, tokens, _classes, groups) = template_key_for_filename(filename);
|
| 2935 |
if groups.len() != roles.len() {
|
| 2936 |
return None;
|
| 2937 |
}
|
| 2938 |
let roles = adjust_contextual_roles(&tokens, &groups, roles);
|
| 2939 |
+
let (roles, dropped) = enforce_single_title_candidate(&tokens, &groups, &roles);
|
| 2940 |
let (tokens, labels) = project_refined_tokens(&tokens, &groups, &roles);
|
| 2941 |
let labels = smooth_title_spans(&tokens, &labels);
|
| 2942 |
if tokens.len() != labels.len() {
|
|
|
|
| 2969 |
record.tokens.into_iter().zip(record.labels).collect()
|
| 2970 |
}
|
| 2971 |
|
| 2972 |
+
/// The first title candidate of the leaf segment must keep the spaces
/// between its words.
#[test]
fn rich_title_candidates_keep_readable_spacing() {
    let source = "(1998) Initial D First Stage [1080p BDRip AVC AAC DTS-HD]/Initial D First Stage - 01 [1080p BDRip AVC AAC DTS-HD]";
    let row = rich_annotation_for(source);
    let leaf_title = row
        .pointer("/segments/1/candidates/0/text")
        .and_then(Value::as_str);
    assert_eq!(leaf_title, Some("Initial D First Stage"));
}
|
| 2983 |
+
|
| 2984 |
#[test]
|
| 2985 |
fn required_regressions() {
|
| 2986 |
let title_91 = labels_for("Title 91 EP 01 [1080p]");
|
|
|
|
| 3048 |
let comma_title =
|
| 3049 |
labels_for("[Studio GreenTea] Kamiina Botan, Yoeru Sugata wa Yuri no Hana [01]");
|
| 3050 |
assert!(comma_title.contains(&(",".to_string(), "B-TITLE".to_string())));
|
| 3051 |
+
let backtick_title =
|
| 3052 |
+
labels_for("[Hayate no Gotoku! Can`t Take My Eyes Off You][01][BDrip X264 AAC 720P]");
|
| 3053 |
+
assert!(backtick_title.contains(&("`".to_string(), "B-TITLE".to_string())));
|
| 3054 |
+
assert!(backtick_title.contains(&("t".to_string(), "B-TITLE".to_string())));
|
| 3055 |
+
let cjk_period_title =
|
| 3056 |
+
labels_for("[云光字幕组]剃须。然后捡到高中生 Hige o Soru. Soshite Joshikousei o Hirou-[ 01 ][简体双语][1080p]");
|
| 3057 |
+
assert!(cjk_period_title.contains(&("。".to_string(), "B-TITLE".to_string())));
|
| 3058 |
+
let music_title =
|
| 3059 |
+
labels_for("[アニメ BD] うたの☆プリンスさまっ♪ マジLOVE2000% 第01話「ポワゾンKISS」(1920x1080 x264 Hi10p AAC)");
|
| 3060 |
+
assert!(music_title.contains(&("♪".to_string(), "B-TITLE".to_string())));
|
| 3061 |
+
let cm_version = labels_for("[U2-Rip]Inari, Konkon, Koi Iroha[CMv2][Hi10p_1080p][x264_flac]");
|
| 3062 |
+
assert!(cm_version.contains(&("CMv2".to_string(), "B-SPECIAL".to_string())));
|
| 3063 |
+
assert!(!cm_version.contains(&("CMv2".to_string(), "B-TITLE".to_string())));
|
| 3064 |
+
let hdma_block =
|
| 3065 |
+
labels_for("[Niconeiko Works] Gekijouban Violet Evergarden [1080P_Ma10p_DTS-HDMA][CM01]");
|
| 3066 |
+
assert!(hdma_block.contains(&("Gekijouban".to_string(), "B-TITLE".to_string())));
|
| 3067 |
+
assert!(hdma_block.contains(&("1080P".to_string(), "B-RESOLUTION".to_string())));
|
| 3068 |
+
assert!(hdma_block.contains(&("HDMA".to_string(), "B-SOURCE".to_string())));
|
| 3069 |
+
assert!(!hdma_block.contains(&("1080P".to_string(), "B-TITLE".to_string())));
|
| 3070 |
+
let extra_menu = labels_for("Extra Menu OVA");
|
| 3071 |
+
assert!(extra_menu.contains(&("Extra".to_string(), "B-SPECIAL".to_string())));
|
| 3072 |
+
assert!(!extra_menu.contains(&("Extra".to_string(), "B-TITLE".to_string())));
|
| 3073 |
+
let eizou_tokuten = labels_for("おジャ魔女どれみ♯ 映像特典「ともだちの唄」(DVD 640x480 )");
|
| 3074 |
+
assert!(eizou_tokuten.contains(&("映像特典".to_string(), "B-SPECIAL".to_string())));
|
| 3075 |
let happy_lesson = labels_for("【DVD】 HAPPY☆LESSON THE TV 第01話");
|
| 3076 |
assert!(happy_lesson.contains(&("☆".to_string(), "B-TITLE".to_string())));
|
| 3077 |
let idolmaster = labels_for("[CASO&SumiSora][THE_IDOLM@STER_CINDERELLA_GIRLS][07.5_SP]");
|
|
|
|
| 3081 |
let mayoi = labels_for("[Snow-Raws] 迷家[マヨイガ] 第01話");
|
| 3082 |
assert!(mayoi.contains(&("迷家".to_string(), "B-TITLE".to_string())));
|
| 3083 |
assert!(mayoi.contains(&("マヨイガ".to_string(), "B-TITLE".to_string())));
|
| 3084 |
+
assert!(mayoi.contains(&("]".to_string(), "B-TITLE".to_string())));
|
| 3085 |
|
| 3086 |
let conan_time = labels_for("【蓝色狂想】名侦探柯南TV版第036集-周一晚上7点半杀人事件");
|
| 3087 |
assert!(conan_time.contains(&("036".to_string(), "B-EPISODE".to_string())));
|
|
|
|
| 3096 |
assert!(sky.contains(&("One".to_string(), "B-TITLE".to_string())));
|
| 3097 |
assert!(!sky.contains(&("海贼王".to_string(), "B-TITLE".to_string())));
|
| 3098 |
assert!(sky.contains(&("918".to_string(), "B-EPISODE".to_string())));
|
| 3099 |
+
|
| 3100 |
+
let happy = labels_for(
|
| 3101 |
+
"My.Happy.Marriage.S01E01.The.Meeting.1080p.NF.WEB-DL.AAC2.0.H.264-VARYG",
|
| 3102 |
+
);
|
| 3103 |
+
assert!(happy.contains(&("01".to_string(), "B-SEASON".to_string())));
|
| 3104 |
+
assert!(happy.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 3105 |
+
assert!(!happy.contains(&("0".to_string(), "B-EPISODE".to_string())));
|
| 3106 |
+
|
| 3107 |
+
let garo = labels_for("[牙狼<GARO>~炎の刻印~][01][1080p]");
|
| 3108 |
+
assert!(garo.contains(&("牙狼".to_string(), "B-TITLE".to_string())));
|
| 3109 |
+
assert!(garo.contains(&("<".to_string(), "B-TITLE".to_string())));
|
| 3110 |
+
assert!(garo.contains(&(">".to_string(), "B-TITLE".to_string())));
|
| 3111 |
+
assert!(garo.contains(&("炎の刻印".to_string(), "B-TITLE".to_string())));
|
| 3112 |
+
|
| 3113 |
+
let akira = labels_for("[QYQ][AKIRA][AVC_AC3x2][1080p]");
|
| 3114 |
+
assert!(akira.contains(&("AKIRA".to_string(), "B-TITLE".to_string())));
|
| 3115 |
+
assert!(!akira.contains(&("AVC".to_string(), "B-TITLE".to_string())));
|
| 3116 |
+
assert!(akira.contains(&("AVC".to_string(), "B-SOURCE".to_string())));
|
| 3117 |
+
|
| 3118 |
+
let doraemon =
|
| 3119 |
+
labels_for("[DORASUB][DORAEMON1979][1998.03.07][WEB][1998x1080][AVC][简日]哆啦A梦归来了");
|
| 3120 |
+
assert!(doraemon.contains(&("DORAEMON1979".to_string(), "B-TITLE".to_string())));
|
| 3121 |
+
assert!(doraemon.contains(&("WEB".to_string(), "B-SOURCE".to_string())));
|
| 3122 |
+
assert!(!doraemon.contains(&("WEB".to_string(), "B-TITLE".to_string())));
|
| 3123 |
+
|
| 3124 |
+
let devilman = labels_for("[DBD-Raws][恶魔人][1972版][01][1080P][BDRip][HEVC-10bit][FLAC]");
|
| 3125 |
+
assert!(devilman.contains(&("恶魔人".to_string(), "B-TITLE".to_string())));
|
| 3126 |
+
assert!(!devilman.contains(&("1972版".to_string(), "B-TITLE".to_string())));
|
| 3127 |
+
|
| 3128 |
+
let classroom = labels_for("[Dymy][Assassination Classroom (2016)][01][BIG5][1280X720]");
|
| 3129 |
+
assert!(classroom.contains(&("(".to_string(), "B-TITLE".to_string())));
|
| 3130 |
+
assert!(classroom.contains(&(")".to_string(), "B-TITLE".to_string())));
|
| 3131 |
+
assert!(!classroom.contains(&("]".to_string(), "B-TITLE".to_string())));
|
| 3132 |
+
|
| 3133 |
+
let bang_season =
|
| 3134 |
+
labels_for("[LoliHouse] Bang Dream! 2nd Season - 01 [BDRip 1080p HEVC-10bit FLAC]");
|
| 3135 |
+
assert!(bang_season.contains(&("Bang".to_string(), "B-TITLE".to_string())));
|
| 3136 |
+
assert!(bang_season.contains(&("Season".to_string(), "B-TITLE".to_string())));
|
| 3137 |
+
assert!(bang_season.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 3138 |
+
assert!(!bang_season.contains(&("01".to_string(), "B-SEASON".to_string())));
|
| 3139 |
+
|
| 3140 |
+
let basket =
|
| 3141 |
+
labels_for("[Nekomoe kissaten&VCB-Studio] Fruits Basket 1st Season [24][1080p][x264_aac][sc]");
|
| 3142 |
+
assert!(basket.contains(&("Fruits".to_string(), "B-TITLE".to_string())));
|
| 3143 |
+
assert!(basket.contains(&("Season".to_string(), "B-TITLE".to_string())));
|
| 3144 |
+
assert!(basket.contains(&("24".to_string(), "B-EPISODE".to_string())));
|
| 3145 |
+
assert!(!basket.contains(&("24".to_string(), "B-SEASON".to_string())));
|
| 3146 |
+
|
| 3147 |
+
let notice = labels_for("[KTXP][Zankyou_no_Terror][08_Notice][GB_BIG5][X264_AAC][720p]");
|
| 3148 |
+
assert!(notice.contains(&("Zankyou".to_string(), "B-TITLE".to_string())));
|
| 3149 |
+
assert!(notice.contains(&("08".to_string(), "B-EPISODE".to_string())));
|
| 3150 |
+
assert!(!notice.contains(&("08".to_string(), "B-TITLE".to_string())));
|
| 3151 |
+
|
| 3152 |
+
let full = labels_for("[POPGO][Soukyuu_no_Fafner_Exodus][01_Full][GB][720p]");
|
| 3153 |
+
assert!(full.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 3154 |
+
assert!(!full.contains(&("01".to_string(), "B-TITLE".to_string())));
|
| 3155 |
+
|
| 3156 |
+
let r18 = labels_for("[HYSUB]Skirt no Naka wa Kedamono Deshita.[01_R18][BIG5_MP4][1280X720]");
|
| 3157 |
+
assert!(r18.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 3158 |
+
assert!(!r18.contains(&("01".to_string(), "B-TITLE".to_string())));
|
| 3159 |
+
|
| 3160 |
+
let ddp = labels_for("Akuma.Kun.S01E02.1080p.NF.WEB-DL.DDP5.1.H.264");
|
| 3161 |
+
assert!(ddp.contains(&("02".to_string(), "B-EPISODE".to_string())));
|
| 3162 |
+
assert!(!ddp.contains(&("1".to_string(), "B-EPISODE".to_string())));
|
| 3163 |
+
assert!(ddp.iter().any(|(token, label)| token.starts_with("DDP") && label == "B-SOURCE"));
|
| 3164 |
+
|
| 3165 |
+
let aac_space = labels_for("Bleach S01E02 AAC 2.0 H.264");
|
| 3166 |
+
assert!(aac_space.contains(&("02".to_string(), "B-EPISODE".to_string())));
|
| 3167 |
+
assert!(!aac_space.contains(&("2".to_string(), "B-EPISODE".to_string())));
|
| 3168 |
+
assert!(aac_space
|
| 3169 |
+
.iter()
|
| 3170 |
+
.any(|(token, label)| token.starts_with("AAC") && label == "B-SOURCE"));
|
| 3171 |
+
|
| 3172 |
+
let bare_resolution = labels_for("日本桥15.03.30 720");
|
| 3173 |
+
assert!(bare_resolution.contains(&("日本桥".to_string(), "B-TITLE".to_string())));
|
| 3174 |
+
assert!(bare_resolution.contains(&("720".to_string(), "B-RESOLUTION".to_string())));
|
| 3175 |
+
assert!(!bare_resolution.contains(&("720".to_string(), "B-EPISODE".to_string())));
|
| 3176 |
+
|
| 3177 |
+
let air_episode = labels_for("Air 01");
|
| 3178 |
+
assert!(air_episode.contains(&("Air".to_string(), "B-TITLE".to_string())));
|
| 3179 |
+
assert!(air_episode.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 3180 |
+
|
| 3181 |
+
let decimal_episode = labels_for("[HoneyGod] Usagi Drop [02.5][x264_10bit][粤日双语][BDrip_1080p]");
|
| 3182 |
+
assert!(decimal_episode.contains(&("02".to_string(), "B-EPISODE".to_string())));
|
| 3183 |
+
assert!(decimal_episode.contains(&(".".to_string(), "B-EPISODE".to_string())));
|
| 3184 |
+
assert!(decimal_episode.contains(&("5".to_string(), "B-EPISODE".to_string())));
|
| 3185 |
+
|
| 3186 |
+
let spy = labels_for("[Studio GreenTea] Spy x Family [38][WebRip][HEVC-10bit 1080p AAC ASSx2]");
|
| 3187 |
+
assert!(spy.contains(&("Studio".to_string(), "B-GROUP".to_string())));
|
| 3188 |
+
assert!(spy.contains(&("Spy".to_string(), "B-TITLE".to_string())));
|
| 3189 |
+
assert!(spy.contains(&("x".to_string(), "B-TITLE".to_string())));
|
| 3190 |
+
assert!(spy.contains(&("Family".to_string(), "B-TITLE".to_string())));
|
| 3191 |
+
assert!(spy.contains(&("38".to_string(), "B-EPISODE".to_string())));
|
| 3192 |
+
assert!(!spy.contains(&("Spy".to_string(), "B-SPECIAL".to_string())));
|
| 3193 |
+
|
| 3194 |
+
let spy_s3 = labels_for("[Feibanyama] SPY x FAMILY S3 - 01 [IQIYI WebRip 2160p HEVC-10bit OPUS Multi-Subs]");
|
| 3195 |
+
assert!(spy_s3.contains(&("Feibanyama".to_string(), "B-GROUP".to_string())));
|
| 3196 |
+
assert!(spy_s3.contains(&("SPY".to_string(), "B-TITLE".to_string())));
|
| 3197 |
+
assert!(spy_s3.contains(&("FAMILY".to_string(), "B-TITLE".to_string())));
|
| 3198 |
+
assert!(spy_s3.contains(&("3".to_string(), "B-SEASON".to_string())));
|
| 3199 |
+
assert!(spy_s3.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 3200 |
+
|
| 3201 |
+
let slime = labels_for("[Nekomoe kissaten&VCB-Studio] Slime 300 [Menu01][Ma10p_1080p][x265_flac]");
|
| 3202 |
+
assert!(slime.contains(&("Slime".to_string(), "B-TITLE".to_string())));
|
| 3203 |
+
assert!(
|
| 3204 |
+
slime.contains(&("300".to_string(), "B-TITLE".to_string())),
|
| 3205 |
+
"{slime:?}"
|
| 3206 |
+
);
|
| 3207 |
+
assert!(!slime.contains(&("300".to_string(), "B-EPISODE".to_string())));
|
| 3208 |
+
|
| 3209 |
+
let kamisama =
|
| 3210 |
+
labels_for("[SFEO-Raws] Kamisama Hajimemashita 2 - 01 (BD 720P x264 10bit AAC)");
|
| 3211 |
+
assert!(kamisama.contains(&("Kamisama".to_string(), "B-TITLE".to_string())));
|
| 3212 |
+
assert!(kamisama.contains(&("2".to_string(), "B-TITLE".to_string())));
|
| 3213 |
+
assert!(kamisama.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 3214 |
}
|
| 3215 |
|
| 3216 |
#[test]
|
|
|
|
| 3220 |
assert!(was_trimmed);
|
| 3221 |
assert_eq!(
|
| 3222 |
trimmed,
|
| 3223 |
+
"Season 4 E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p"
|
| 3224 |
);
|
| 3225 |
let pokemon = "Pokémon Season 2 - Orange League [Ep. 83-118]/Pokemon - 084 - OL002 - A Scare in the Air [DVD][PM-Dragon-x264-AC3][00270AC8]";
|
| 3226 |
let (trimmed_pokemon, pokemon_was_trimmed) = training_filename_for(pokemon);
|
|
|
|
| 3277 |
assert!(was_trimmed);
|
| 3278 |
assert_eq!(trimmed, "Avatar The Last Airbender S2 14 [1080p]");
|
| 3279 |
|
| 3280 |
+
let plain_season_dir = "Season 1/[Kamigami] Junjou Romantica 1 - 01 [BD 1280x720 x264 AAC Sub(Chs,Jap)]";
|
| 3281 |
+
let (trimmed, was_trimmed) = training_filename_for(plain_season_dir);
|
| 3282 |
+
assert!(was_trimmed);
|
| 3283 |
+
assert_eq!(
|
| 3284 |
+
trimmed,
|
| 3285 |
+
"Season 1 [Kamigami] Junjou Romantica 1 - 01 [BD 1280x720 x264 AAC Sub(Chs,Jap)]"
|
| 3286 |
+
);
|
| 3287 |
+
let plain_season_labels = labels_for(&trimmed);
|
| 3288 |
+
assert!(plain_season_labels.contains(&("1".to_string(), "B-SEASON".to_string())));
|
| 3289 |
+
assert!(plain_season_labels.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 3290 |
+
|
| 3291 |
+
let menu_parent =
|
| 3292 |
+
"[Airota&ANK-Raws] 亜人ちゃんは語りたい (BDrip 1920x1080 HEVC-YUV420P10 FLAC SUP)/Menu (Vol.1)";
|
| 3293 |
+
let (trimmed, was_trimmed) = training_filename_for(menu_parent);
|
| 3294 |
+
assert!(was_trimmed);
|
| 3295 |
+
assert_eq!(trimmed, "[Airota&ANK-Raws] 亜人ちゃんは語りたい Menu (Vol.1)");
|
| 3296 |
+
|
| 3297 |
+
assert!(has_encoding_noise(
|
| 3298 |
+
"[4K_SDR][DBD-Raws&HKG瀛楀箷绲刔[鏃ュ湪鏍″湌][01][2160P]"
|
| 3299 |
+
));
|
| 3300 |
+
|
| 3301 |
let tintin = "Adventures of Tintin (1991) Season 1-3 S01-S03 (1080p BluRay x265 HEVC 10bit EAC3 2.0 Garshasp)/Season 1/Adventures of Tintin (1991) - S01E06 - Cigars of the Pharaoh (Part 1) (1080p BluRay x265 Garshasp)";
|
| 3302 |
let (trimmed, was_trimmed) = training_filename_for(tintin);
|
| 3303 |
assert!(was_trimmed);
|
|
|
|
| 3338 |
"Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p";
|
| 3339 |
let (trimmed, was_trimmed) = training_filename_for(&format!("Batch/{woody_parent}"));
|
| 3340 |
assert!(was_trimmed);
|
| 3341 |
+
assert_eq!(
|
| 3342 |
+
trimmed,
|
| 3343 |
+
"Season 4 E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p"
|
| 3344 |
+
);
|
| 3345 |
+
|
| 3346 |
+
let najica = "[2001] Najica_七虹香電擊作戰(ナジカ電撃作戦)_TV/SourceUnknown.RMVB.640x480.twHard/01";
|
| 3347 |
+
let (trimmed, was_trimmed) = training_filename_for(najica);
|
| 3348 |
+
assert!(was_trimmed);
|
| 3349 |
+
assert_eq!(trimmed, "[2001] Najica_七虹香電擊作戰(ナジカ電撃作戦) 01");
|
| 3350 |
+
let najica_labels = labels_for(&trimmed);
|
| 3351 |
+
assert!(najica_labels.contains(&("Najica".to_string(), "B-TITLE".to_string())));
|
| 3352 |
+
assert!(!najica_labels.contains(&("SourceUnknown".to_string(), "B-TITLE".to_string())));
|
| 3353 |
+
assert!(najica_labels.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 3354 |
+
|
| 3355 |
+
let galient = "[1984-1986] Galient_機甲界(機甲界ガリアン)_TV.OVA/[1984-1985] Galient_機甲界(機甲界ガリアン)_TV/DVDRip.MKV.720x480.ruSub.左右黑邊保留/01";
|
| 3356 |
+
let (trimmed, was_trimmed) = training_filename_for(galient);
|
| 3357 |
+
assert!(was_trimmed);
|
| 3358 |
+
assert_eq!(
|
| 3359 |
+
trimmed,
|
| 3360 |
+
"[1984-1985] Galient_機甲界(機甲界ガリアン) 01"
|
| 3361 |
+
);
|
| 3362 |
+
let galient_labels = labels_for(&trimmed);
|
| 3363 |
+
assert!(galient_labels.contains(&("Galient".to_string(), "B-TITLE".to_string())));
|
| 3364 |
+
assert!(!galient_labels.contains(&("TV".to_string(), "B-TITLE".to_string())));
|
| 3365 |
+
assert!(galient_labels.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 3366 |
+
|
| 3367 |
+
let nced = "[BDrip] Ao no Exorcist Yuki no Hate Hen S04 [343-Labs]/NCED";
|
| 3368 |
+
let (trimmed, was_trimmed) = training_filename_for(nced);
|
| 3369 |
+
assert!(was_trimmed);
|
| 3370 |
+
assert_eq!(trimmed, "[BDrip] Ao no Exorcist Yuki no Hate Hen S04 [343-Labs] NCED");
|
| 3371 |
+
|
| 3372 |
+
let sakura = "Card Captor Sakura Chinese/魔卡少女樱(台配国语)/第01集 小樱与不可思议的魔法书";
|
| 3373 |
+
let (trimmed, was_trimmed) = training_filename_for(sakura);
|
| 3374 |
+
assert!(was_trimmed);
|
| 3375 |
+
assert_eq!(
|
| 3376 |
+
trimmed,
|
| 3377 |
+
"魔卡少女樱(台配国语) 第01集 小樱与不可思议的魔法书"
|
| 3378 |
+
);
|
| 3379 |
+
let sakura_labels = labels_for(&trimmed);
|
| 3380 |
+
assert!(sakura_labels.contains(&("魔卡少女樱".to_string(), "B-TITLE".to_string())));
|
| 3381 |
+
assert!(sakura_labels.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 3382 |
|
| 3383 |
let volume =
|
| 3384 |
labels_for("[Snow-Raws] 生徒会役員共 Vol.01 MENU02 (BD 1920x1080 HEVC-YUV420P10 FLAC)");
|