Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")

# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
```
- Notebooks
- Google Colab
- Kaggle
Tighten DMHY template label heuristics
Browse files
tools/rust_dmhy_template_apply/src/main.rs
CHANGED
|
@@ -201,10 +201,18 @@ static CJK_SEASON_TOKEN_RE: Lazy<Regex> =
|
|
| 201 |
static CJK_SEASON_EMBEDDED_RE: Lazy<Regex> = Lazy::new(|| {
|
| 202 |
Regex::new(r"^(.+?)(第[一二三四五六七八九十\d]+[季期部])(.{0,12})$").unwrap()
|
| 203 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
static ASCII_SEASON_SUFFIX_RE: Lazy<Regex> =
|
| 205 |
Lazy::new(|| Regex::new(r"(?i)^(.+?)[\s_.-]+(S\d{1,2})$").unwrap());
|
| 206 |
static ORDINAL_SEASON_TOKEN_RE: Lazy<Regex> =
|
| 207 |
Lazy::new(|| Regex::new(r"(?i)^\d{1,2}(?:st|nd|rd|th)$").unwrap());
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
static SEASON_WORD_RE: Lazy<Regex> =
|
| 209 |
Lazy::new(|| Regex::new(r"(?i)^(?:Season|Saison)$").unwrap());
|
| 210 |
static CJK_TITLE_LANG_PREFIX_RE: Lazy<Regex> =
|
|
@@ -1764,7 +1772,7 @@ fn has_encoding_noise(value: &str) -> bool {
|
|
| 1764 |
let markers = [
|
| 1765 |
"譁", "蜈", "螟", "蟄", "謇", "邱", "荳", "縺", "繧", "莨", "鬆", "髯", "瀛",
|
| 1766 |
"楀", "箷", "绲", "刔", "鏃", "湪", "鏍", "犲", "儚", "鐗", "吀", "铦", "躲",
|
| 1767 |
-
"伄", "椋", "伓", "姘", "帽",
|
| 1768 |
];
|
| 1769 |
let marker_hits = markers
|
| 1770 |
.iter()
|
|
@@ -1783,6 +1791,11 @@ fn has_encoding_noise(value: &str) -> bool {
|
|
| 1783 |
fn has_non_anime_noise(value: &str) -> bool {
|
| 1784 |
let normalized = value.replace('\\', "/").trim().to_ascii_lowercase();
|
| 1785 |
normalized == "mtv" || normalized.starts_with("mtv/") || normalized.contains("/mtv/")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1786 |
}
|
| 1787 |
|
| 1788 |
fn normalized_path_segment(value: &str) -> String {
|
|
@@ -2235,6 +2248,7 @@ const KNOWN_TITLE_PHRASES: &[&[&str]] = &[
|
|
| 2235 |
&["Phantasy", "Star", "Online", "2", "Episode", "Oracle"],
|
| 2236 |
&["Durarara", "2", "Ketsu"],
|
| 2237 |
&["Ghiblies", "Episode", "2"],
|
|
|
|
| 2238 |
&["Lupin The Thrid Jigen Daisuke no Bohyou"],
|
| 2239 |
&["Lupin The Third Jigen Daisuke no Bohyou"],
|
| 2240 |
];
|
|
@@ -2705,6 +2719,25 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 2705 |
output[index] = "O".to_string();
|
| 2706 |
continue;
|
| 2707 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2708 |
if roles[index] == "TITLE" && matches!(text.as_str(), "第" | "話" | "话" | "回" | "集")
|
| 2709 |
{
|
| 2710 |
output[index] = "O".to_string();
|
|
@@ -3039,6 +3072,11 @@ fn normalize_title_token(token: &str) -> (Vec<String>, Vec<String>) {
|
|
| 3039 |
labels.push("B-SEASON".to_string());
|
| 3040 |
continue;
|
| 3041 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3042 |
if let Some(caps) = ASCII_SEASON_SUFFIX_RE.captures(&piece) {
|
| 3043 |
let before = caps.get(1).map(|m| m.as_str()).unwrap_or_default();
|
| 3044 |
let season = caps.get(2).map(|m| m.as_str()).unwrap_or_default();
|
|
@@ -3066,6 +3104,33 @@ fn normalize_title_token(token: &str) -> (Vec<String>, Vec<String>) {
|
|
| 3066 |
}
|
| 3067 |
continue;
|
| 3068 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3069 |
output_pieces.push(piece);
|
| 3070 |
labels.push("B-TITLE".to_string());
|
| 3071 |
}
|
|
@@ -3227,18 +3292,58 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
|
| 3227 |
let joiners = [
|
| 3228 |
" ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?",
|
| 3229 |
"?", ";", ";", ",", ",", "、", "。", "~", "~", "-", "+", "+", "(", ")",
|
| 3230 |
-
"(", ")", "[", "]", "【", "】", "<", ">", "<", ">", "「", "」", "「", "」",
|
| 3231 |
-
"☆", "♪", "`", "@",
|
| 3232 |
];
|
| 3233 |
let title_terminal_punctuation = ["!", "!", "?", "?"];
|
| 3234 |
let entity_joiners = [
|
| 3235 |
" ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?",
|
| 3236 |
"?", ";", ";", ",", ",", "、", "。", "~", "~", "-", "+", "+", "(", ")",
|
| 3237 |
-
"(", ")", "[", "]", "【", "】", "<", ">", "<", ">", "「", "」", "「", "」",
|
| 3238 |
-
"☆", "♪", "`", "@", "&", "&",
|
| 3239 |
];
|
| 3240 |
let mut output = labels.to_vec();
|
| 3241 |
for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3242 |
if label == "B-TITLE"
|
| 3243 |
&& token.chars().all(|ch| ch.is_ascii_digit())
|
| 3244 |
&& token.len() == 3
|
|
@@ -3283,6 +3388,11 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
|
| 3283 |
.chars()
|
| 3284 |
.any(|ch| ch.is_alphanumeric() || ('\u{4e00}'..='\u{9fff}').contains(&ch))
|
| 3285 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3286 |
let previous_word = previous_title_word.map(|cursor| tokens[cursor].to_ascii_lowercase());
|
| 3287 |
if previous_title_word.is_some()
|
| 3288 |
&& !matches!(previous_word.as_deref(), Some("lupin"))
|
|
@@ -3291,7 +3401,10 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
|
| 3291 |
continue;
|
| 3292 |
}
|
| 3293 |
}
|
| 3294 |
-
if label == "B-TITLE"
|
|
|
|
|
|
|
|
|
|
| 3295 |
let next_word = (index + 1..tokens.len()).find(|&cursor| {
|
| 3296 |
!joiners.contains(&tokens[cursor].as_str())
|
| 3297 |
&& tokens[cursor].chars().any(|ch| ch.is_alphabetic())
|
|
@@ -3311,6 +3424,136 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
|
| 3311 |
continue;
|
| 3312 |
}
|
| 3313 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3314 |
if label == "O"
|
| 3315 |
&& token.chars().all(|ch| ch.is_ascii_digit())
|
| 3316 |
&& token.len() <= 3
|
|
@@ -3320,8 +3563,8 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
|
| 3320 |
.find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
|
| 3321 |
let next_non_space = (index + 1..tokens.len())
|
| 3322 |
.find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
|
| 3323 |
-
if previous_non_space.is_some_and(|cursor| tokens[cursor]
|
| 3324 |
-
&& next_non_space.is_some_and(|cursor| tokens[cursor]
|
| 3325 |
&& output[..index].iter().any(|label| label == "B-TITLE")
|
| 3326 |
&& output[index + 1..]
|
| 3327 |
.iter()
|
|
@@ -3330,11 +3573,45 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
|
| 3330 |
output[index] = "B-EPISODE".to_string();
|
| 3331 |
continue;
|
| 3332 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3333 |
}
|
| 3334 |
if label == "B-EPISODE" && token.chars().all(|ch| ch.is_ascii_digit()) {
|
| 3335 |
let previous_non_space = (0..index)
|
| 3336 |
.rev()
|
| 3337 |
.find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3338 |
if previous_non_space
|
| 3339 |
.is_some_and(|cursor| matches!(tokens[cursor].as_str(), "×" | "x" | "X"))
|
| 3340 |
{
|
|
@@ -3366,7 +3643,13 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
|
| 3366 |
}
|
| 3367 |
break;
|
| 3368 |
}
|
| 3369 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3370 |
output[index] = "O".to_string();
|
| 3371 |
continue;
|
| 3372 |
}
|
|
@@ -3851,6 +4134,10 @@ mod tests {
|
|
| 3851 |
assert!(has_encoding_noise(
|
| 3852 |
"[4K_SDR][DBD-Raws&HKG瀛楀箷绲刔[鏃ュ湪鏍″湌][01][2160P]"
|
| 3853 |
));
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3854 |
|
| 3855 |
let tintin = "Adventures of Tintin (1991) Season 1-3 S01-S03 (1080p BluRay x265 HEVC 10bit EAC3 2.0 Garshasp)/Season 1/Adventures of Tintin (1991) - S01E06 - Cigars of the Pharaoh (Part 1) (1080p BluRay x265 Garshasp)";
|
| 3856 |
let (trimmed, was_trimmed) = training_filename_for(tintin);
|
|
@@ -4091,6 +4378,49 @@ mod tests {
|
|
| 4091 |
assert!(bleach_movie.contains(&("3".to_string(), "B-TITLE".to_string())));
|
| 4092 |
assert!(!bleach_movie.contains(&("3".to_string(), "B-EPISODE".to_string())));
|
| 4093 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4094 |
let no_number = labels_for("[甜甜圈字幕组] 小讨厌 081「爷爷的礼物 No.1」");
|
| 4095 |
assert!(no_number.contains(&("081".to_string(), "B-EPISODE".to_string())));
|
| 4096 |
assert!(!no_number.contains(&("1".to_string(), "B-EPISODE".to_string())));
|
|
@@ -4110,6 +4440,16 @@ mod tests {
|
|
| 4110 |
assert!(hayate.contains(&("Season".to_string(), "B-SEASON".to_string())));
|
| 4111 |
assert!(hayate.contains(&("23".to_string(), "B-EPISODE".to_string())));
|
| 4112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4113 |
let jade = labels_for("[GM-Team][国漫][诛仙 第2季][Jade Dynasty Ⅱ][2024][12][AVC][GB][1080P]");
|
| 4114 |
assert!(jade.contains(&("Jade".to_string(), "B-TITLE".to_string())));
|
| 4115 |
assert!(jade.contains(&("Dynasty".to_string(), "B-TITLE".to_string())));
|
|
@@ -4122,6 +4462,11 @@ mod tests {
|
|
| 4122 |
assert!(yu_no.contains(&("NO".to_string(), "B-TITLE".to_string())));
|
| 4123 |
assert!(yu_no.contains(&("23".to_string(), "B-EPISODE".to_string())));
|
| 4124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4125 |
let fox = labels_for(
|
| 4126 |
"[GM-Team][国漫][狐妖小红娘 尾生篇][Fox Spirit Matchmaker Ⅷ][2019][05][AVC][GB][1080P]",
|
| 4127 |
);
|
|
@@ -4143,6 +4488,68 @@ mod tests {
|
|
| 4143 |
assert!(date_live_special.contains(&("Ⅱ".to_string(), "B-SEASON".to_string())));
|
| 4144 |
assert!(date_live_special.contains(&("CM01".to_string(), "B-SPECIAL".to_string())));
|
| 4145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4146 |
let ep_only = dmhy_record("Ep.25", "tpl_test", &suggested_roles("TEXT SEP EPISODE")).unwrap();
|
| 4147 |
assert!(audit_warnings(&ep_only).contains(&"no_title".to_string()));
|
| 4148 |
}
|
|
|
|
| 201 |
/// Whole-string pattern for a CJK season marker embedded in a longer piece.
/// Group 1 is a non-empty, lazily matched prefix; group 2 is `第` followed by
/// one or more Chinese numerals (一..十) or digits and one of 季/期/部; group 3
/// is at most 12 trailing characters.
static CJK_SEASON_EMBEDDED_RE: Lazy<Regex> = Lazy::new(|| {
|
| 202 |
Regex::new(r"^(.+?)(第[一二三四五六七八九十\d]+[季期部])(.{0,12})$").unwrap()
|
| 203 |
});
|
| 204 |
+
/// Whole-string pattern for a CJK episode marker embedded in a longer piece.
/// Group 1 is a non-empty, lazily matched prefix; group 2 is an optional `第`,
/// one to four digits, and one of 话/話/回/集; group 3 is at most 32 trailing
/// characters.
static CJK_EPISODE_EMBEDDED_RE: Lazy<Regex> =
|
| 205 |
+
Lazy::new(|| Regex::new(r"^(.+?)(第?\d{1,4}[话話回集])(.{0,32})$").unwrap());
|
| 206 |
+
/// Whole-string pattern for text that ends in a Han, Hiragana or Katakana
/// character (group 1) immediately followed by a two- or three-digit number
/// (group 2).
static CJK_TITLE_TRAILING_EPISODE_RE: Lazy<Regex> =
|
| 207 |
+
Lazy::new(|| Regex::new(r"^(.+[\p{Han}\p{Hiragana}\p{Katakana}])(\d{2,3})$").unwrap());
|
| 208 |
/// Case-insensitive whole-string pattern for a trailing `S<1-2 digits>`
/// suffix. Group 1 is the lazily matched prefix, then one or more separators
/// (whitespace, `_`, `.` or `-`), then group 2 holding the `S..` token.
static ASCII_SEASON_SUFFIX_RE: Lazy<Regex> =
|
| 209 |
Lazy::new(|| Regex::new(r"(?i)^(.+?)[\s_.-]+(S\d{1,2})$").unwrap());
|
| 210 |
/// Case-insensitive pattern for a whole token made of one or two digits
/// followed by `st`, `nd`, `rd` or `th`.
static ORDINAL_SEASON_TOKEN_RE: Lazy<Regex> =
|
| 211 |
Lazy::new(|| Regex::new(r"(?i)^\d{1,2}(?:st|nd|rd|th)$").unwrap());
|
| 212 |
+
/// Case-insensitive pattern for a whole token that is one of the spelled-out
/// English ordinals `First` through `Tenth`.
static WORD_ORDINAL_SEASON_TOKEN_RE: Lazy<Regex> = Lazy::new(|| {
|
| 213 |
+
Regex::new(r"(?i)^(?:First|Second|Third|Fourth|Fifth|Sixth|Seventh|Eighth|Ninth|Tenth)$")
|
| 214 |
+
.unwrap()
|
| 215 |
+
});
|
| 216 |
/// Case-insensitive pattern for a whole token equal to `Season` or `Saison`.
static SEASON_WORD_RE: Lazy<Regex> =
|
| 217 |
Lazy::new(|| Regex::new(r"(?i)^(?:Season|Saison)$").unwrap());
|
| 218 |
static CJK_TITLE_LANG_PREFIX_RE: Lazy<Regex> =
|
|
|
|
| 1772 |
let markers = [
|
| 1773 |
"譁", "蜈", "螟", "蟄", "謇", "邱", "荳", "縺", "繧", "莨", "鬆", "髯", "瀛",
|
| 1774 |
"楀", "箷", "绲", "刔", "鏃", "湪", "鏍", "犲", "儚", "鐗", "吀", "铦", "躲",
|
| 1775 |
+
"伄", "椋", "伓", "姘", "帽", "娆", "洖", "浜", "堝", "澶", "湴", "鐒",
|
| 1776 |
];
|
| 1777 |
let marker_hits = markers
|
| 1778 |
.iter()
|
|
|
|
| 1791 |
/// Reports whether `value` carries one of the hard-coded markers this tool
/// treats as non-anime content.
///
/// Two kinds of checks are applied:
///
/// * On a normalized copy of `value` (backslashes replaced by `/`, surrounding
///   whitespace trimmed, ASCII letters lowercased): the path is exactly `mtv`,
///   starts with `mtv/`, contains `/mtv/`, or contains `tokyo deep`.
/// * On the raw `value`: it contains `[旅游`, `日本不思议铁路之旅`, or
///   `ニッポンぶらり鉄道旅`.
///
/// Returns `true` as soon as any marker matches, `false` otherwise (including
/// for the empty string).
fn has_non_anime_noise(value: &str) -> bool {
    // Markers matched against the untouched input. "[旅游" already covers the
    // longer "[旅游番" tag, so the longer form needs no separate entry.
    const RAW_MARKERS: [&str; 3] = ["[旅游", "日本不思议铁路之旅", "ニッポンぶらり鉄道旅"];

    let normalized = value.replace('\\', "/").trim().to_ascii_lowercase();
    normalized == "mtv"
        || normalized.starts_with("mtv/")
        || normalized.contains("/mtv/")
        || normalized.contains("tokyo deep")
        || RAW_MARKERS.iter().any(|marker| value.contains(*marker))
}
|
| 1800 |
|
| 1801 |
fn normalized_path_segment(value: &str) -> String {
|
|
|
|
| 2248 |
&["Phantasy", "Star", "Online", "2", "Episode", "Oracle"],
|
| 2249 |
&["Durarara", "2", "Ketsu"],
|
| 2250 |
&["Ghiblies", "Episode", "2"],
|
| 2251 |
+
&["Eien", "no", "831"],
|
| 2252 |
&["Lupin The Thrid Jigen Daisuke no Bohyou"],
|
| 2253 |
&["Lupin The Third Jigen Daisuke no Bohyou"],
|
| 2254 |
];
|
|
|
|
| 2719 |
output[index] = "O".to_string();
|
| 2720 |
continue;
|
| 2721 |
}
|
| 2722 |
+
if roles[index] == "TITLE"
|
| 2723 |
+
&& matches!(text.as_str(), "TVアニメ" | "テレビアニメ")
|
| 2724 |
+
&& output
|
| 2725 |
+
.iter()
|
| 2726 |
+
.enumerate()
|
| 2727 |
+
.any(|(other, role)| other != index && role == "TITLE")
|
| 2728 |
+
{
|
| 2729 |
+
output[index] = "O".to_string();
|
| 2730 |
+
continue;
|
| 2731 |
+
}
|
| 2732 |
+
if output[index] == "TITLE" && text.eq_ignore_ascii_case("Creditless") {
|
| 2733 |
+
let later_special = output[index + 1..]
|
| 2734 |
+
.iter()
|
| 2735 |
+
.any(|role| role == "SPECIAL");
|
| 2736 |
+
if later_special {
|
| 2737 |
+
output[index] = "SPECIAL".to_string();
|
| 2738 |
+
continue;
|
| 2739 |
+
}
|
| 2740 |
+
}
|
| 2741 |
if roles[index] == "TITLE" && matches!(text.as_str(), "第" | "話" | "话" | "回" | "集")
|
| 2742 |
{
|
| 2743 |
output[index] = "O".to_string();
|
|
|
|
| 3072 |
labels.push("B-SEASON".to_string());
|
| 3073 |
continue;
|
| 3074 |
}
|
| 3075 |
+
if EPISODE_CJK_RE.is_match(&piece) {
|
| 3076 |
+
output_pieces.push(piece);
|
| 3077 |
+
labels.push("B-EPISODE".to_string());
|
| 3078 |
+
continue;
|
| 3079 |
+
}
|
| 3080 |
if let Some(caps) = ASCII_SEASON_SUFFIX_RE.captures(&piece) {
|
| 3081 |
let before = caps.get(1).map(|m| m.as_str()).unwrap_or_default();
|
| 3082 |
let season = caps.get(2).map(|m| m.as_str()).unwrap_or_default();
|
|
|
|
| 3104 |
}
|
| 3105 |
continue;
|
| 3106 |
}
|
| 3107 |
+
if let Some(caps) = CJK_EPISODE_EMBEDDED_RE.captures(&piece) {
|
| 3108 |
+
let before = caps.get(1).map(|m| m.as_str()).unwrap_or_default();
|
| 3109 |
+
let episode = caps.get(2).map(|m| m.as_str()).unwrap_or_default();
|
| 3110 |
+
let after = caps.get(3).map(|m| m.as_str()).unwrap_or_default();
|
| 3111 |
+
if !before.is_empty() {
|
| 3112 |
+
output_pieces.push(before.to_string());
|
| 3113 |
+
labels.push("B-TITLE".to_string());
|
| 3114 |
+
}
|
| 3115 |
+
output_pieces.push(episode.to_string());
|
| 3116 |
+
labels.push("B-EPISODE".to_string());
|
| 3117 |
+
if !after.is_empty() {
|
| 3118 |
+
output_pieces.push(after.to_string());
|
| 3119 |
+
labels.push("O".to_string());
|
| 3120 |
+
}
|
| 3121 |
+
continue;
|
| 3122 |
+
}
|
| 3123 |
+
if let Some(caps) = CJK_TITLE_TRAILING_EPISODE_RE.captures(&piece) {
|
| 3124 |
+
let before = caps.get(1).map(|m| m.as_str()).unwrap_or_default();
|
| 3125 |
+
let episode = caps.get(2).map(|m| m.as_str()).unwrap_or_default();
|
| 3126 |
+
if !before.is_empty() {
|
| 3127 |
+
output_pieces.push(before.to_string());
|
| 3128 |
+
labels.push("B-TITLE".to_string());
|
| 3129 |
+
}
|
| 3130 |
+
output_pieces.push(episode.to_string());
|
| 3131 |
+
labels.push("B-EPISODE".to_string());
|
| 3132 |
+
continue;
|
| 3133 |
+
}
|
| 3134 |
output_pieces.push(piece);
|
| 3135 |
labels.push("B-TITLE".to_string());
|
| 3136 |
}
|
|
|
|
| 3292 |
let joiners = [
|
| 3293 |
" ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?",
|
| 3294 |
"?", ";", ";", ",", ",", "、", "。", "~", "~", "-", "+", "+", "(", ")",
|
| 3295 |
+
"(", ")", "[", "]", "【", "】", "<", ">", "<", ">", "「", "」", "「", "」", "《", "》",
|
| 3296 |
+
"☆", "♪", "`", "@", "‐", "‑", "–", "—", "−", "$", "$", "∽", "꞉", "♥",
|
| 3297 |
];
|
| 3298 |
let title_terminal_punctuation = ["!", "!", "?", "?"];
|
| 3299 |
let entity_joiners = [
|
| 3300 |
" ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?",
|
| 3301 |
"?", ";", ";", ",", ",", "、", "。", "~", "~", "-", "+", "+", "(", ")",
|
| 3302 |
+
"(", ")", "[", "]", "【", "】", "<", ">", "<", ">", "「", "」", "「", "」", "《", "》",
|
| 3303 |
+
"☆", "♪", "`", "@", "&", "&", "‐", "‑", "–", "—", "−", "$", "$", "∽", "꞉", "♥",
|
| 3304 |
];
|
| 3305 |
let mut output = labels.to_vec();
|
| 3306 |
for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
|
| 3307 |
+
if label == "B-TITLE"
|
| 3308 |
+
&& token == "TV"
|
| 3309 |
+
&& index + 1 < tokens.len()
|
| 3310 |
+
&& tokens[index + 1] == "アニメ"
|
| 3311 |
+
&& output[index + 2..].iter().any(|label| label == "B-TITLE")
|
| 3312 |
+
{
|
| 3313 |
+
output[index] = "O".to_string();
|
| 3314 |
+
output[index + 1] = "O".to_string();
|
| 3315 |
+
continue;
|
| 3316 |
+
}
|
| 3317 |
+
if label == "B-TITLE"
|
| 3318 |
+
&& token == "アニメ"
|
| 3319 |
+
&& output[index + 1..].iter().any(|label| label == "B-TITLE")
|
| 3320 |
+
{
|
| 3321 |
+
output[index] = "O".to_string();
|
| 3322 |
+
continue;
|
| 3323 |
+
}
|
| 3324 |
+
if label == "B-TITLE" && token.eq_ignore_ascii_case("part") {
|
| 3325 |
+
let next_number = (index + 1..tokens.len()).find(|&cursor| {
|
| 3326 |
+
!joiners.contains(&tokens[cursor].as_str())
|
| 3327 |
+
&& !tokens[cursor].chars().all(char::is_whitespace)
|
| 3328 |
+
});
|
| 3329 |
+
let nearby_lupin = tokens[..index]
|
| 3330 |
+
.iter()
|
| 3331 |
+
.rev()
|
| 3332 |
+
.take(8)
|
| 3333 |
+
.any(|item| item.eq_ignore_ascii_case("lupin"))
|
| 3334 |
+
|| tokens[index + 1..]
|
| 3335 |
+
.iter()
|
| 3336 |
+
.take(12)
|
| 3337 |
+
.any(|item| item.eq_ignore_ascii_case("lupin"));
|
| 3338 |
+
if nearby_lupin
|
| 3339 |
+
&& next_number.is_some_and(|cursor| {
|
| 3340 |
+
tokens[cursor].chars().all(|ch| ch.is_ascii_digit()) && tokens[cursor].len() <= 2
|
| 3341 |
+
})
|
| 3342 |
+
{
|
| 3343 |
+
output[index] = "B-SEASON".to_string();
|
| 3344 |
+
continue;
|
| 3345 |
+
}
|
| 3346 |
+
}
|
| 3347 |
if label == "B-TITLE"
|
| 3348 |
&& token.chars().all(|ch| ch.is_ascii_digit())
|
| 3349 |
&& token.len() == 3
|
|
|
|
| 3388 |
.chars()
|
| 3389 |
.any(|ch| ch.is_alphanumeric() || ('\u{4e00}'..='\u{9fff}').contains(&ch))
|
| 3390 |
});
|
| 3391 |
+
let later_episode = (index + 1..tokens.len()).any(|cursor| labels[cursor] == "B-EPISODE");
|
| 3392 |
+
if previous_title_word.is_none() && later_episode {
|
| 3393 |
+
output[index] = "B-SEASON".to_string();
|
| 3394 |
+
continue;
|
| 3395 |
+
}
|
| 3396 |
let previous_word = previous_title_word.map(|cursor| tokens[cursor].to_ascii_lowercase());
|
| 3397 |
if previous_title_word.is_some()
|
| 3398 |
&& !matches!(previous_word.as_deref(), Some("lupin"))
|
|
|
|
| 3401 |
continue;
|
| 3402 |
}
|
| 3403 |
}
|
| 3404 |
+
if label == "B-TITLE"
|
| 3405 |
+
&& (ORDINAL_SEASON_TOKEN_RE.is_match(token)
|
| 3406 |
+
|| WORD_ORDINAL_SEASON_TOKEN_RE.is_match(token))
|
| 3407 |
+
{
|
| 3408 |
let next_word = (index + 1..tokens.len()).find(|&cursor| {
|
| 3409 |
!joiners.contains(&tokens[cursor].as_str())
|
| 3410 |
&& tokens[cursor].chars().any(|ch| ch.is_alphabetic())
|
|
|
|
| 3424 |
continue;
|
| 3425 |
}
|
| 3426 |
}
|
| 3427 |
+
if label == "O"
|
| 3428 |
+
&& (EPISODE_CJK_RE.is_match(token)
|
| 3429 |
+
|| EPISODE_VALUE_RE.is_match(token)
|
| 3430 |
+
|| EPISODE_RANGE_RE.is_match(token))
|
| 3431 |
+
{
|
| 3432 |
+
output[index] = "B-EPISODE".to_string();
|
| 3433 |
+
continue;
|
| 3434 |
+
}
|
| 3435 |
+
if label == "O" && token.chars().all(|ch| ch.is_ascii_digit()) {
|
| 3436 |
+
let previous_non_space = (0..index)
|
| 3437 |
+
.rev()
|
| 3438 |
+
.find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
|
| 3439 |
+
let next_non_space = (index + 1..tokens.len())
|
| 3440 |
+
.find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
|
| 3441 |
+
if previous_non_space.is_some_and(|cursor| tokens[cursor] == "#") {
|
| 3442 |
+
output[index] = "B-EPISODE".to_string();
|
| 3443 |
+
if next_non_space.is_some_and(|cursor| matches!(tokens[cursor].as_str(), "-" | "~"))
|
| 3444 |
+
{
|
| 3445 |
+
if let Some(separator) = next_non_space {
|
| 3446 |
+
output[separator] = "B-EPISODE".to_string();
|
| 3447 |
+
if let Some(right) = (separator + 1..tokens.len())
|
| 3448 |
+
.find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace))
|
| 3449 |
+
{
|
| 3450 |
+
if tokens[right].chars().all(|ch| ch.is_ascii_digit()) {
|
| 3451 |
+
output[right] = "B-EPISODE".to_string();
|
| 3452 |
+
}
|
| 3453 |
+
}
|
| 3454 |
+
}
|
| 3455 |
+
}
|
| 3456 |
+
continue;
|
| 3457 |
+
}
|
| 3458 |
+
if previous_non_space.is_some_and(|cursor| tokens[cursor] == "第")
|
| 3459 |
+
&& next_non_space
|
| 3460 |
+
.is_some_and(|cursor| {
|
| 3461 |
+
matches!(tokens[cursor].as_str(), "话" | "話" | "回" | "集")
|
| 3462 |
+
|| tokens[cursor].starts_with('话')
|
| 3463 |
+
|| tokens[cursor].starts_with('話')
|
| 3464 |
+
|| tokens[cursor].starts_with('回')
|
| 3465 |
+
|| tokens[cursor].starts_with('集')
|
| 3466 |
+
})
|
| 3467 |
+
{
|
| 3468 |
+
if let Some(cursor) = previous_non_space {
|
| 3469 |
+
output[cursor] = "B-EPISODE".to_string();
|
| 3470 |
+
}
|
| 3471 |
+
output[index] = "B-EPISODE".to_string();
|
| 3472 |
+
if let Some(cursor) = next_non_space {
|
| 3473 |
+
if matches!(tokens[cursor].as_str(), "话" | "話" | "回" | "集") {
|
| 3474 |
+
output[cursor] = "B-EPISODE".to_string();
|
| 3475 |
+
}
|
| 3476 |
+
}
|
| 3477 |
+
continue;
|
| 3478 |
+
}
|
| 3479 |
+
}
|
| 3480 |
+
if matches!(label.as_str(), "B-TITLE" | "O")
|
| 3481 |
+
&& token.chars().all(|ch| ch.is_ascii_digit())
|
| 3482 |
+
&& token.len() <= 3
|
| 3483 |
+
{
|
| 3484 |
+
let previous_word = (0..index)
|
| 3485 |
+
.rev()
|
| 3486 |
+
.find(|&cursor| {
|
| 3487 |
+
!joiners.contains(&tokens[cursor].as_str())
|
| 3488 |
+
&& tokens[cursor].chars().any(|ch| ch.is_alphabetic())
|
| 3489 |
+
})
|
| 3490 |
+
.map(|cursor| tokens[cursor].to_ascii_lowercase());
|
| 3491 |
+
let next_structural = (index + 1..tokens.len())
|
| 3492 |
+
.find(|&cursor| !joiners.contains(&tokens[cursor].as_str()))
|
| 3493 |
+
.map(|cursor| tokens[cursor].as_str());
|
| 3494 |
+
let next_non_space = (index + 1..tokens.len())
|
| 3495 |
+
.find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace))
|
| 3496 |
+
.map(|cursor| tokens[cursor].as_str());
|
| 3497 |
+
let later_technical_block = output[index + 1..]
|
| 3498 |
+
.iter()
|
| 3499 |
+
.any(|label| matches!(label.as_str(), "B-SOURCE" | "B-RESOLUTION"));
|
| 3500 |
+
let nearby_lupin_part = previous_word.as_deref() == Some("part")
|
| 3501 |
+
&& (tokens[..index]
|
| 3502 |
+
.iter()
|
| 3503 |
+
.rev()
|
| 3504 |
+
.take(8)
|
| 3505 |
+
.any(|item| item.eq_ignore_ascii_case("lupin"))
|
| 3506 |
+
|| tokens[index + 1..]
|
| 3507 |
+
.iter()
|
| 3508 |
+
.take(12)
|
| 3509 |
+
.any(|item| item.eq_ignore_ascii_case("lupin")));
|
| 3510 |
+
if nearby_lupin_part {
|
| 3511 |
+
output[index] = "B-SEASON".to_string();
|
| 3512 |
+
continue;
|
| 3513 |
+
}
|
| 3514 |
+
let followed_by_title_word = (index + 1..tokens.len())
|
| 3515 |
+
.find(|&cursor| {
|
| 3516 |
+
!joiners.contains(&tokens[cursor].as_str())
|
| 3517 |
+
&& !matches!(tokens[cursor].as_str(), "-" | "-" | "," | "," | ":" | ":")
|
| 3518 |
+
})
|
| 3519 |
+
.is_some_and(|cursor| {
|
| 3520 |
+
!matches!(tokens[cursor].as_str(), "[" | "【" | "(" | "(" | "]" | "】")
|
| 3521 |
+
&& output
|
| 3522 |
+
.get(cursor)
|
| 3523 |
+
.is_some_and(|label| label == "B-TITLE")
|
| 3524 |
+
&& tokens[cursor].chars().any(|ch| ch.is_alphabetic())
|
| 3525 |
+
});
|
| 3526 |
+
if followed_by_title_word && matches!(previous_word.as_deref(), Some("movie" | "part"))
|
| 3527 |
+
{
|
| 3528 |
+
output[index] = "B-TITLE".to_string();
|
| 3529 |
+
continue;
|
| 3530 |
+
}
|
| 3531 |
+
if (later_technical_block
|
| 3532 |
+
|| next_non_space.is_some_and(|token| matches!(token, "[" | "【" | "(" | "("))
|
| 3533 |
+
|| next_structural.is_some_and(|token| matches!(token, "[" | "【" | "(" | "(")))
|
| 3534 |
+
&& matches!(previous_word.as_deref(), Some("movie" | "part"))
|
| 3535 |
+
{
|
| 3536 |
+
output[index] = "B-SPECIAL".to_string();
|
| 3537 |
+
continue;
|
| 3538 |
+
}
|
| 3539 |
+
let eien_title_number = token == "831"
|
| 3540 |
+
&& previous_word.as_deref() == Some("no")
|
| 3541 |
+
&& (0..index).any(|cursor| {
|
| 3542 |
+
output[cursor] == "B-TITLE" && tokens[cursor].eq_ignore_ascii_case("Eien")
|
| 3543 |
+
});
|
| 3544 |
+
if eien_title_number {
|
| 3545 |
+
for joiner_index in (0..index).rev() {
|
| 3546 |
+
if tokens[joiner_index].eq_ignore_ascii_case("no") {
|
| 3547 |
+
break;
|
| 3548 |
+
}
|
| 3549 |
+
if joiners.contains(&tokens[joiner_index].as_str()) {
|
| 3550 |
+
output[joiner_index] = "B-TITLE".to_string();
|
| 3551 |
+
}
|
| 3552 |
+
}
|
| 3553 |
+
output[index] = "B-TITLE".to_string();
|
| 3554 |
+
continue;
|
| 3555 |
+
}
|
| 3556 |
+
}
|
| 3557 |
if label == "O"
|
| 3558 |
&& token.chars().all(|ch| ch.is_ascii_digit())
|
| 3559 |
&& token.len() <= 3
|
|
|
|
| 3563 |
.find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
|
| 3564 |
let next_non_space = (index + 1..tokens.len())
|
| 3565 |
.find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
|
| 3566 |
+
if previous_non_space.is_some_and(|cursor| matches!(tokens[cursor].as_str(), "[" | "【"))
|
| 3567 |
+
&& next_non_space.is_some_and(|cursor| matches!(tokens[cursor].as_str(), "]" | "】"))
|
| 3568 |
&& output[..index].iter().any(|label| label == "B-TITLE")
|
| 3569 |
&& output[index + 1..]
|
| 3570 |
.iter()
|
|
|
|
| 3573 |
output[index] = "B-EPISODE".to_string();
|
| 3574 |
continue;
|
| 3575 |
}
|
| 3576 |
+
if previous_non_space.is_some_and(|cursor| matches!(tokens[cursor].as_str(), "-" | "-"))
|
| 3577 |
+
&& output[..index].iter().any(|label| label == "B-TITLE")
|
| 3578 |
+
&& output[index + 1..]
|
| 3579 |
+
.iter()
|
| 3580 |
+
.any(|label| matches!(label.as_str(), "B-SOURCE" | "B-RESOLUTION"))
|
| 3581 |
+
{
|
| 3582 |
+
output[index] = "B-EPISODE".to_string();
|
| 3583 |
+
continue;
|
| 3584 |
+
}
|
| 3585 |
+
if next_non_space.is_none()
|
| 3586 |
+
&& previous_non_space.is_some_and(|cursor| {
|
| 3587 |
+
output[cursor] == "B-TITLE"
|
| 3588 |
+
&& tokens[cursor].chars().any(|ch| {
|
| 3589 |
+
('\u{4e00}'..='\u{9fff}').contains(&ch)
|
| 3590 |
+
|| ('\u{3040}'..='\u{30ff}').contains(&ch)
|
| 3591 |
+
})
|
| 3592 |
+
})
|
| 3593 |
+
{
|
| 3594 |
+
output[index] = "B-EPISODE".to_string();
|
| 3595 |
+
continue;
|
| 3596 |
+
}
|
| 3597 |
}
|
| 3598 |
if label == "B-EPISODE" && token.chars().all(|ch| ch.is_ascii_digit()) {
|
| 3599 |
let previous_non_space = (0..index)
|
| 3600 |
.rev()
|
| 3601 |
.find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
|
| 3602 |
+
let next_non_space = (index + 1..tokens.len())
|
| 3603 |
+
.find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
|
| 3604 |
+
if previous_non_space.is_some_and(|cursor| tokens[cursor] == "第")
|
| 3605 |
+
&& next_non_space
|
| 3606 |
+
.is_some_and(|cursor| matches!(tokens[cursor].as_str(), "话" | "話" | "回" | "集"))
|
| 3607 |
+
{
|
| 3608 |
+
if let Some(cursor) = previous_non_space {
|
| 3609 |
+
output[cursor] = "B-EPISODE".to_string();
|
| 3610 |
+
}
|
| 3611 |
+
if let Some(cursor) = next_non_space {
|
| 3612 |
+
output[cursor] = "B-EPISODE".to_string();
|
| 3613 |
+
}
|
| 3614 |
+
}
|
| 3615 |
if previous_non_space
|
| 3616 |
.is_some_and(|cursor| matches!(tokens[cursor].as_str(), "×" | "x" | "X"))
|
| 3617 |
{
|
|
|
|
| 3643 |
}
|
| 3644 |
break;
|
| 3645 |
}
|
| 3646 |
+
let previous_non_space = (0..index)
|
| 3647 |
+
.rev()
|
| 3648 |
+
.find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace))
|
| 3649 |
+
.map(|cursor| tokens[cursor].as_str());
|
| 3650 |
+
if matches!(previous_word.as_deref(), Some("lesson" | "part"))
|
| 3651 |
+
|| (previous_word.as_deref() == Some("no") && previous_non_space == Some("."))
|
| 3652 |
+
{
|
| 3653 |
output[index] = "O".to_string();
|
| 3654 |
continue;
|
| 3655 |
}
|
|
|
|
| 4134 |
assert!(has_encoding_noise(
|
| 4135 |
"[4K_SDR][DBD-Raws&HKG瀛楀箷绲刔[鏃ュ湪鏍″湌][01][2160P]"
|
| 4136 |
));
|
| 4137 |
+
assert!(has_encoding_noise("ATRI -My Dear Moments-/娆″洖浜堝憡 EP01 Log01"));
|
| 4138 |
+
assert!(has_non_anime_noise(
|
| 4139 |
+
"13-[旅游番][花丸字幕组][日本不思议铁路之旅][15.03.19-16.02.03][720&1080][中日双语]/铁道旅 15.03.19 720"
|
| 4140 |
+
));
|
| 4141 |
|
| 4142 |
let tintin = "Adventures of Tintin (1991) Season 1-3 S01-S03 (1080p BluRay x265 HEVC 10bit EAC3 2.0 Garshasp)/Season 1/Adventures of Tintin (1991) - S01E06 - Cigars of the Pharaoh (Part 1) (1080p BluRay x265 Garshasp)";
|
| 4143 |
let (trimmed, was_trimmed) = training_filename_for(tintin);
|
|
|
|
| 4378 |
assert!(bleach_movie.contains(&("3".to_string(), "B-TITLE".to_string())));
|
| 4379 |
assert!(!bleach_movie.contains(&("3".to_string(), "B-EPISODE".to_string())));
|
| 4380 |
|
| 4381 |
+
let conan_movie =
|
| 4382 |
+
labels_for("[DBD-Raws][Detective Conan Movie 27 The Million-Dollar Pentagram][PV][01][1080P]");
|
| 4383 |
+
assert!(conan_movie.contains(&("27".to_string(), "B-TITLE".to_string())));
|
| 4384 |
+
assert!(conan_movie.contains(&("PV".to_string(), "B-SPECIAL".to_string())));
|
| 4385 |
+
|
| 4386 |
+
let madoka_movie =
|
| 4387 |
+
labels_for("[DBD-Raws][Puella Magi Madoka Magica the Movie 01 Beginnings][NCED][1080P]");
|
| 4388 |
+
assert!(madoka_movie.contains(&("01".to_string(), "B-TITLE".to_string())));
|
| 4389 |
+
assert!(madoka_movie.contains(&("Beginnings".to_string(), "B-TITLE".to_string())));
|
| 4390 |
+
|
| 4391 |
+
let fate_first_order =
|
| 4392 |
+
labels_for("[DBD-Raws][Fate Grand Order ‐First Order‐][PV][01][1080P]");
|
| 4393 |
+
assert!(fate_first_order.contains(&("Fate".to_string(), "B-TITLE".to_string())));
|
| 4394 |
+
assert!(fate_first_order.contains(&("‐".to_string(), "B-TITLE".to_string())));
|
| 4395 |
+
assert!(fate_first_order.contains(&("First".to_string(), "B-TITLE".to_string())));
|
| 4396 |
+
|
| 4397 |
+
let trillion_game = labels_for("[ANi] 一兆$遊戲 - 03 [1080P][Baha][WEB-DL][AAC AVC][CHT]");
|
| 4398 |
+
assert!(trillion_game.contains(&("一兆".to_string(), "B-TITLE".to_string())));
|
| 4399 |
+
assert!(trillion_game.contains(&("$".to_string(), "B-TITLE".to_string())));
|
| 4400 |
+
assert!(trillion_game.contains(&("遊戲".to_string(), "B-TITLE".to_string())));
|
| 4401 |
+
|
| 4402 |
+
let lapis = labels_for("[Nekomoe kissaten&LoliHouse] Lapis Re꞉LiGHTs - PV01 [BDRip 1080p]");
|
| 4403 |
+
assert!(lapis.contains(&("Re".to_string(), "B-TITLE".to_string())));
|
| 4404 |
+
assert!(lapis.contains(&("꞉".to_string(), "B-TITLE".to_string())));
|
| 4405 |
+
assert!(lapis.contains(&("LiGHTs".to_string(), "B-TITLE".to_string())));
|
| 4406 |
+
|
| 4407 |
+
let rezero = labels_for("TVアニメ『Re:ゼロから始める異世界生活』第10話「鬼がかったやり方」予告");
|
| 4408 |
+
assert!(!rezero.contains(&("TV".to_string(), "B-TITLE".to_string())));
|
| 4409 |
+
assert!(!rezero.contains(&("アニメ".to_string(), "B-TITLE".to_string())));
|
| 4410 |
+
assert!(rezero.contains(&("Re".to_string(), "B-TITLE".to_string())));
|
| 4411 |
+
assert!(rezero.contains(&("第".to_string(), "B-EPISODE".to_string())));
|
| 4412 |
+
assert!(rezero.contains(&("話".to_string(), "B-EPISODE".to_string())));
|
| 4413 |
+
|
| 4414 |
+
let shark = labels_for("アニメ『おでかけ子ザメ』第10話「かじゅえん」");
|
| 4415 |
+
assert!(!shark.contains(&("アニメ".to_string(), "B-TITLE".to_string())));
|
| 4416 |
+
assert!(shark.contains(&("おでかけ子ザメ".to_string(), "B-TITLE".to_string())));
|
| 4417 |
+
|
| 4418 |
+
let creditless = labels_for(
|
| 4419 |
+
"[ANK-Raws] デート・ア・ライブⅡ Creditless ED (Bdrip 1920x1080 HEVC FLAC)",
|
| 4420 |
+
);
|
| 4421 |
+
assert!(creditless.contains(&("Creditless".to_string(), "B-SPECIAL".to_string())));
|
| 4422 |
+
assert!(creditless.contains(&("ED".to_string(), "B-SPECIAL".to_string())));
|
| 4423 |
+
|
| 4424 |
let no_number = labels_for("[甜甜圈字幕组] 小讨厌 081「爷爷的礼物 No.1」");
|
| 4425 |
assert!(no_number.contains(&("081".to_string(), "B-EPISODE".to_string())));
|
| 4426 |
assert!(!no_number.contains(&("1".to_string(), "B-EPISODE".to_string())));
|
|
|
|
| 4440 |
assert!(hayate.contains(&("Season".to_string(), "B-SEASON".to_string())));
|
| 4441 |
assert!(hayate.contains(&("23".to_string(), "B-EPISODE".to_string())));
|
| 4442 |
|
| 4443 |
+
let yama = labels_for("[A.I.R.nesSub][Yama_no_Susume_Second_Season][08][720p]");
|
| 4444 |
+
assert!(yama.contains(&("Yama".to_string(), "B-TITLE".to_string())));
|
| 4445 |
+
assert!(yama.contains(&("Second".to_string(), "B-SEASON".to_string())));
|
| 4446 |
+
assert!(yama.contains(&("Season".to_string(), "B-SEASON".to_string())));
|
| 4447 |
+
|
| 4448 |
+
let one_room = labels_for("[DMG][One Room Second Season][00][1080P][BIG5]");
|
| 4449 |
+
assert!(one_room.contains(&("One".to_string(), "B-TITLE".to_string())));
|
| 4450 |
+
assert!(one_room.contains(&("Second".to_string(), "B-SEASON".to_string())));
|
| 4451 |
+
assert!(one_room.contains(&("Season".to_string(), "B-SEASON".to_string())));
|
| 4452 |
+
|
| 4453 |
let jade = labels_for("[GM-Team][国漫][诛仙 第2季][Jade Dynasty Ⅱ][2024][12][AVC][GB][1080P]");
|
| 4454 |
assert!(jade.contains(&("Jade".to_string(), "B-TITLE".to_string())));
|
| 4455 |
assert!(jade.contains(&("Dynasty".to_string(), "B-TITLE".to_string())));
|
|
|
|
| 4462 |
assert!(yu_no.contains(&("NO".to_string(), "B-TITLE".to_string())));
|
| 4463 |
assert!(yu_no.contains(&("23".to_string(), "B-EPISODE".to_string())));
|
| 4464 |
|
| 4465 |
+
let yu_no_dash =
|
| 4466 |
+
labels_for("[LowPower-Raws] この世の果てで恋を唄う少女YU-NO - 01 (BD 1080P x264 FLAC)");
|
| 4467 |
+
assert!(yu_no_dash.contains(&("NO".to_string(), "B-TITLE".to_string())));
|
| 4468 |
+
assert!(yu_no_dash.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 4469 |
+
|
| 4470 |
let fox = labels_for(
|
| 4471 |
"[GM-Team][国漫][狐妖小红娘 尾生篇][Fox Spirit Matchmaker Ⅷ][2019][05][AVC][GB][1080P]",
|
| 4472 |
);
|
|
|
|
| 4488 |
assert!(date_live_special.contains(&("Ⅱ".to_string(), "B-SEASON".to_string())));
|
| 4489 |
assert!(date_live_special.contains(&("CM01".to_string(), "B-SPECIAL".to_string())));
|
| 4490 |
|
| 4491 |
+
let lupin_part =
|
| 4492 |
+
labels_for("[SnowDream][Part 5_Lupin Sansei Part 5][01][BIG5][720P]");
|
| 4493 |
+
assert!(lupin_part.contains(&("Lupin".to_string(), "B-TITLE".to_string())));
|
| 4494 |
+
assert!(lupin_part.contains(&("Sansei".to_string(), "B-TITLE".to_string())));
|
| 4495 |
+
assert!(!lupin_part.contains(&("Part".to_string(), "B-TITLE".to_string())));
|
| 4496 |
+
assert!(lupin_part.contains(&("5".to_string(), "B-SEASON".to_string())));
|
| 4497 |
+
assert!(!lupin_part.contains(&("5".to_string(), "B-SPECIAL".to_string())));
|
| 4498 |
+
|
| 4499 |
+
let roman_leaf = dmhy_record("Ⅰ 001 魯邦燃起了鬥志", "tpl_test", &suggested_roles("TEXT SEP EPISODE SEP TEXT")).unwrap();
|
| 4500 |
+
assert!(roman_leaf
|
| 4501 |
+
.tokens
|
| 4502 |
+
.iter()
|
| 4503 |
+
.zip(roman_leaf.labels.iter())
|
| 4504 |
+
.any(|(token, label)| token == "Ⅰ" && label == "B-SEASON"));
|
| 4505 |
+
assert!(audit_warnings(&roman_leaf).contains(&"no_title".to_string()));
|
| 4506 |
+
|
| 4507 |
+
let hallow = labels_for("[c.c动漫 ccwzz.cc][驱魔少年HALLOW][第09话][GB][720p]");
|
| 4508 |
+
assert!(hallow.contains(&("驱魔少年HALLOW".to_string(), "B-TITLE".to_string())));
|
| 4509 |
+
assert!(hallow.contains(&("第09话".to_string(), "B-EPISODE".to_string())));
|
| 4510 |
+
|
| 4511 |
+
let fairy = labels_for("[魔導少年 最終章][EP35][繁体][1080P]");
|
| 4512 |
+
assert!(fairy.contains(&("魔導少年".to_string(), "B-TITLE".to_string())));
|
| 4513 |
+
assert!(fairy.contains(&("EP35".to_string(), "B-EPISODE".to_string())));
|
| 4514 |
+
|
| 4515 |
+
let mebius = labels_for("【CXRAW】【ウルトラマンメビウス】【22】【日々の未来】【DVDrip】【x264 Hi10P AAC】【MP4】");
|
| 4516 |
+
assert!(mebius.contains(&("ウルトラマンメビウス".to_string(), "B-TITLE".to_string())));
|
| 4517 |
+
assert!(mebius.contains(&("22".to_string(), "B-EPISODE".to_string())));
|
| 4518 |
+
|
| 4519 |
+
let battle = labels_for("斗破苍穹三年之约第01话");
|
| 4520 |
+
assert!(battle.contains(&("斗破苍穹三年之约".to_string(), "B-TITLE".to_string())));
|
| 4521 |
+
assert!(battle.contains(&("第".to_string(), "B-EPISODE".to_string())));
|
| 4522 |
+
assert!(battle.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 4523 |
+
assert!(battle.contains(&("话".to_string(), "B-EPISODE".to_string())));
|
| 4524 |
+
|
| 4525 |
+
let hakumei = labels_for("妖精森林的小不点01");
|
| 4526 |
+
assert!(hakumei.contains(&("妖精森林的小不点".to_string(), "B-TITLE".to_string())));
|
| 4527 |
+
assert!(hakumei.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 4528 |
+
|
| 4529 |
+
let decimal_episode_title = labels_for("无限系统树:第1话可能性的起点");
|
| 4530 |
+
assert!(decimal_episode_title.contains(&("无限系统树".to_string(), "B-TITLE".to_string())));
|
| 4531 |
+
assert!(decimal_episode_title.contains(&("第".to_string(), "B-EPISODE".to_string())));
|
| 4532 |
+
assert!(decimal_episode_title.contains(&("1".to_string(), "B-EPISODE".to_string())));
|
| 4533 |
+
|
| 4534 |
+
let hash_range = labels_for("花田少年史#1-3");
|
| 4535 |
+
assert!(hash_range.contains(&("花田少年史".to_string(), "B-TITLE".to_string())));
|
| 4536 |
+
assert!(hash_range.contains(&("1".to_string(), "B-EPISODE".to_string())));
|
| 4537 |
+
assert!(hash_range.contains(&("-".to_string(), "B-EPISODE".to_string())));
|
| 4538 |
+
assert!(hash_range.contains(&("3".to_string(), "B-EPISODE".to_string())));
|
| 4539 |
+
|
| 4540 |
+
let movie_number = labels_for("[Kamigami] Haikyuu!! Movie - 01 [BD 1080p x265 Ma10p AAC]");
|
| 4541 |
+
assert!(movie_number.contains(&("Haikyuu".to_string(), "B-TITLE".to_string())));
|
| 4542 |
+
assert!(movie_number.contains(&("01".to_string(), "B-SPECIAL".to_string())));
|
| 4543 |
+
assert!(!movie_number.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 4544 |
+
|
| 4545 |
+
let ajin_movie = labels_for("[Moozzi2] Ajin The Movie - 01 (BD 1920x1080 x.264 FLACx2)");
|
| 4546 |
+
assert!(ajin_movie.contains(&("Ajin".to_string(), "B-TITLE".to_string())));
|
| 4547 |
+
assert!(ajin_movie.contains(&("01".to_string(), "B-SPECIAL".to_string())));
|
| 4548 |
+
|
| 4549 |
+
let eien = labels_for("[Nekomoe kissaten&LoliHouse] Eien no 831 [WebRip 1080p HEVC-10bit AAC ASSx2]");
|
| 4550 |
+
assert!(eien.contains(&("Eien".to_string(), "B-TITLE".to_string())));
|
| 4551 |
+
assert!(eien.contains(&("831".to_string(), "B-TITLE".to_string())));
|
| 4552 |
+
|
| 4553 |
let ep_only = dmhy_record("Ep.25", "tpl_test", &suggested_roles("TEXT SEP EPISODE")).unwrap();
|
| 4554 |
assert!(audit_warnings(&ep_only).contains(&"no_title".to_string()));
|
| 4555 |
}
|