Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")

# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
- Notebooks
- Google Colab
- Kaggle
Refine DMHY bonus labeling cases
Browse files
tools/rust_dmhy_template_apply/src/main.rs
CHANGED
|
@@ -207,7 +207,7 @@ static CJK_TITLE_LANG_PREFIX_RE: Lazy<Regex> =
|
|
| 207 |
Lazy::new(|| Regex::new(r"^(.+?)(国日双语|國日雙語|日语版|日語版|国语版|國語版|双语|雙語)(第?)$").unwrap());
|
| 208 |
static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
|
| 209 |
static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
|
| 210 |
-
Regex::new(r"(?i)^(?:(?:NCOP|NCED|OP|ED|PV|CM)(?:[\s_.-]?(?:\d{1,4}|v\d{1,3}|[A-Z]))?|SP(?:[\s_.-]?\d{0,4})?|(?:OVA|OAD|IV)(?:[\s_.-]?\d{0,4})?|(?:BD)?Menu(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?|(?:Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?)$").unwrap()
|
| 211 |
});
|
| 212 |
static VOLUME_RE: Lazy<Regex> =
|
| 213 |
Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
|
|
@@ -2192,6 +2192,7 @@ fn is_special_title_phrase(text: &str) -> bool {
|
|
| 2192 |
| "TOKUTEN"
|
| 2193 |
| "TRAILER"
|
| 2194 |
| "TV SPOT"
|
|
|
|
| 2195 |
| "WORLD PREMIERE"
|
| 2196 |
| "予告"
|
| 2197 |
| "番宣"
|
|
@@ -2203,6 +2204,7 @@ fn is_special_title_phrase(text: &str) -> bool {
|
|
| 2203 |
|| normalized.contains("番宣")
|
| 2204 |
|| normalized.contains("宣番")
|
| 2205 |
|| normalized.contains("TV SPOT")
|
|
|
|
| 2206 |
|| text.contains("予告")
|
| 2207 |
|| SPECIAL_TITLE_PHRASE_RE.is_match(text)
|
| 2208 |
}
|
|
@@ -2224,6 +2226,7 @@ const KNOWN_TITLE_PHRASES: &[&[&str]] = &[
|
|
| 2224 |
&["Zom", "100"],
|
| 2225 |
&["Kamisama", "Hajimemashita", "2"],
|
| 2226 |
&["Phantasy", "Star", "Online", "2", "Episode", "Oracle"],
|
|
|
|
| 2227 |
&["Ghiblies", "Episode", "2"],
|
| 2228 |
&["Lupin The Thrid Jigen Daisuke no Bohyou"],
|
| 2229 |
&["Lupin The Third Jigen Daisuke no Bohyou"],
|
|
@@ -2445,6 +2448,48 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 2445 |
output[index] = "O".to_string();
|
| 2446 |
continue;
|
| 2447 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2448 |
if roles[index].starts_with("EPISODE")
|
| 2449 |
&& index >= 1
|
| 2450 |
&& output[index - 1] == "TITLE"
|
|
@@ -2773,12 +2818,18 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 2773 |
}
|
| 2774 |
if matches!(
|
| 2775 |
previous_real_text.to_ascii_lowercase().as_str(),
|
| 2776 |
-
"lesson" | "part"
|
| 2777 |
)
|
| 2778 |
{
|
| 2779 |
output[index] = "O".to_string();
|
| 2780 |
continue;
|
| 2781 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2782 |
if output[..index].iter().any(|role| role == "TITLE")
|
| 2783 |
&& (output[..index]
|
| 2784 |
.iter()
|
|
@@ -3174,7 +3225,56 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
|
| 3174 |
];
|
| 3175 |
let mut output = labels.to_vec();
|
| 3176 |
for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3177 |
if label == "B-EPISODE" && token.chars().all(|ch| ch.is_ascii_digit()) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3178 |
let previous_word = (0..index)
|
| 3179 |
.rev()
|
| 3180 |
.find(|&cursor| {
|
|
@@ -3182,7 +3282,7 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
|
| 3182 |
|| tokens[cursor].chars().any(|ch| ch.is_alphabetic())
|
| 3183 |
})
|
| 3184 |
.map(|cursor| tokens[cursor].to_ascii_lowercase());
|
| 3185 |
-
if matches!(previous_word.as_deref(), Some("lesson" | "part")) {
|
| 3186 |
output[index] = "O".to_string();
|
| 3187 |
continue;
|
| 3188 |
}
|
|
@@ -3884,5 +3984,33 @@ mod tests {
|
|
| 3884 |
labels_for("(2014Q4) Bonjour♪恋味パティスリー 第01話 「Lesson 1」 (1280x720 x265 10bit AAC)");
|
| 3885 |
assert!(bonjour.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 3886 |
assert!(!bonjour.contains(&("1".to_string(), "B-EPISODE".to_string())));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3887 |
}
|
| 3888 |
}
|
|
|
|
| 207 |
Lazy::new(|| Regex::new(r"^(.+?)(国日双语|國日雙語|日语版|日語版|国语版|國語版|双语|雙語)(第?)$").unwrap());
|
| 208 |
static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
|
| 209 |
static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
|
| 210 |
+
Regex::new(r"(?i)^(?:(?:NCOP|NCED|OP|ED|PV|CM)(?:[\s_.-]?(?:\d{1,4}|v\d{1,3}|[A-Z]))?|SP(?:[\s_.-]?\d{0,4})?|(?:OVA|OAD|IV)(?:[\s_.-]?\d{0,4})?|(?:BD)?Menu(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?|(?:BD[-_. ]?)?Spot(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?|(?:Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?)$").unwrap()
|
| 211 |
});
|
| 212 |
static VOLUME_RE: Lazy<Regex> =
|
| 213 |
Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
|
|
|
|
| 2192 |
| "TOKUTEN"
|
| 2193 |
| "TRAILER"
|
| 2194 |
| "TV SPOT"
|
| 2195 |
+
| "SPOT"
|
| 2196 |
| "WORLD PREMIERE"
|
| 2197 |
| "予告"
|
| 2198 |
| "番宣"
|
|
|
|
| 2204 |
|| normalized.contains("番宣")
|
| 2205 |
|| normalized.contains("宣番")
|
| 2206 |
|| normalized.contains("TV SPOT")
|
| 2207 |
+
|| normalized.contains("BD SPOT")
|
| 2208 |
|| text.contains("予告")
|
| 2209 |
|| SPECIAL_TITLE_PHRASE_RE.is_match(text)
|
| 2210 |
}
|
|
|
|
| 2226 |
&["Zom", "100"],
|
| 2227 |
&["Kamisama", "Hajimemashita", "2"],
|
| 2228 |
&["Phantasy", "Star", "Online", "2", "Episode", "Oracle"],
|
| 2229 |
+
&["Durarara", "2", "Ketsu"],
|
| 2230 |
&["Ghiblies", "Episode", "2"],
|
| 2231 |
&["Lupin The Thrid Jigen Daisuke no Bohyou"],
|
| 2232 |
&["Lupin The Third Jigen Daisuke no Bohyou"],
|
|
|
|
| 2448 |
output[index] = "O".to_string();
|
| 2449 |
continue;
|
| 2450 |
}
|
| 2451 |
+
if roles[index].starts_with("EPISODE")
|
| 2452 |
+
&& index >= 2
|
| 2453 |
+
&& matches!(group_text(tokens, &groups[index - 1]).as_str(), "×" | "x" | "X")
|
| 2454 |
+
&& output[index - 2] == "TITLE"
|
| 2455 |
+
&& !roles[index + 1..].iter().any(|role| role.starts_with("EPISODE"))
|
| 2456 |
+
{
|
| 2457 |
+
output[index] = "TITLE".to_string();
|
| 2458 |
+
if let Some(next_text_index) = (index + 1..roles.len()).find(|&cursor| {
|
| 2459 |
+
groups[cursor].class_name != "SEP" && groups[cursor].class_name == "TEXT"
|
| 2460 |
+
}) {
|
| 2461 |
+
output[next_text_index] = "TITLE".to_string();
|
| 2462 |
+
}
|
| 2463 |
+
continue;
|
| 2464 |
+
}
|
| 2465 |
+
if roles[index].starts_with("EPISODE")
|
| 2466 |
+
&& !output[..index].iter().any(|role| role.starts_with("EPISODE"))
|
| 2467 |
+
&& group_text(
|
| 2468 |
+
tokens,
|
| 2469 |
+
&groups[(0..index)
|
| 2470 |
+
.rev()
|
| 2471 |
+
.find(|&cursor| groups[cursor].class_name != "SEP")
|
| 2472 |
+
.unwrap_or(index)],
|
| 2473 |
+
)
|
| 2474 |
+
.eq_ignore_ascii_case("Movie")
|
| 2475 |
+
{
|
| 2476 |
+
output[index] = "TITLE".to_string();
|
| 2477 |
+
continue;
|
| 2478 |
+
}
|
| 2479 |
+
if output[index] == "TITLE"
|
| 2480 |
+
&& matches!(text.as_str(), "中日" | "日中" | "英日" | "日英")
|
| 2481 |
+
{
|
| 2482 |
+
let next_source_lang = (index + 1..roles.len())
|
| 2483 |
+
.find(|&cursor| groups[cursor].class_name != "SEP")
|
| 2484 |
+
.is_some_and(|cursor| {
|
| 2485 |
+
output[cursor] == "SOURCE"
|
| 2486 |
+
&& group_text(tokens, &groups[cursor]).contains('语')
|
| 2487 |
+
});
|
| 2488 |
+
if next_source_lang {
|
| 2489 |
+
output[index] = "SOURCE".to_string();
|
| 2490 |
+
continue;
|
| 2491 |
+
}
|
| 2492 |
+
}
|
| 2493 |
if roles[index].starts_with("EPISODE")
|
| 2494 |
&& index >= 1
|
| 2495 |
&& output[index - 1] == "TITLE"
|
|
|
|
| 2818 |
}
|
| 2819 |
if matches!(
|
| 2820 |
previous_real_text.to_ascii_lowercase().as_str(),
|
| 2821 |
+
"lesson" | "part" | "no"
|
| 2822 |
)
|
| 2823 |
{
|
| 2824 |
output[index] = "O".to_string();
|
| 2825 |
continue;
|
| 2826 |
}
|
| 2827 |
+
if previous_real_text.contains("予告")
|
| 2828 |
+
|| previous_real_text.eq_ignore_ascii_case("Spot")
|
| 2829 |
+
{
|
| 2830 |
+
output[index] = "SPECIAL".to_string();
|
| 2831 |
+
continue;
|
| 2832 |
+
}
|
| 2833 |
if output[..index].iter().any(|role| role == "TITLE")
|
| 2834 |
&& (output[..index]
|
| 2835 |
.iter()
|
|
|
|
| 3225 |
];
|
| 3226 |
let mut output = labels.to_vec();
|
| 3227 |
for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
|
| 3228 |
+
if label == "B-TITLE"
|
| 3229 |
+
&& token.chars().all(|ch| ch.is_ascii_digit())
|
| 3230 |
+
&& token.len() == 3
|
| 3231 |
+
&& index + 1 < tokens.len()
|
| 3232 |
+
&& matches!(tokens[index + 1].as_str(), "「" | "「" | "\"" | "'")
|
| 3233 |
+
{
|
| 3234 |
+
output[index] = "B-EPISODE".to_string();
|
| 3235 |
+
let mut cursor = index + 1;
|
| 3236 |
+
while cursor < tokens.len() {
|
| 3237 |
+
output[cursor] = "O".to_string();
|
| 3238 |
+
if matches!(tokens[cursor].as_str(), "」" | "」" | "\"" | "'") && cursor > index + 1 {
|
| 3239 |
+
break;
|
| 3240 |
+
}
|
| 3241 |
+
cursor += 1;
|
| 3242 |
+
}
|
| 3243 |
+
continue;
|
| 3244 |
+
}
|
| 3245 |
+
if label == "B-TITLE" && matches!(token.as_str(), "中日" | "日中" | "英日" | "日英") {
|
| 3246 |
+
let next_word = (index + 1..tokens.len()).find(|&cursor| {
|
| 3247 |
+
tokens[cursor].chars().any(|ch| ch.is_alphanumeric())
|
| 3248 |
+
});
|
| 3249 |
+
if next_word.is_some_and(|cursor| {
|
| 3250 |
+
labels[cursor] == "B-SOURCE" && tokens[cursor].contains('语')
|
| 3251 |
+
}) {
|
| 3252 |
+
output[index] = "B-SOURCE".to_string();
|
| 3253 |
+
continue;
|
| 3254 |
+
}
|
| 3255 |
+
}
|
| 3256 |
if label == "B-EPISODE" && token.chars().all(|ch| ch.is_ascii_digit()) {
|
| 3257 |
+
let previous_non_space = (0..index)
|
| 3258 |
+
.rev()
|
| 3259 |
+
.find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
|
| 3260 |
+
if previous_non_space
|
| 3261 |
+
.is_some_and(|cursor| matches!(tokens[cursor].as_str(), "×" | "x" | "X"))
|
| 3262 |
+
{
|
| 3263 |
+
let left_title = (0..previous_non_space.unwrap())
|
| 3264 |
+
.rev()
|
| 3265 |
+
.find(|&cursor| labels[cursor] != "O")
|
| 3266 |
+
.is_some_and(|cursor| labels[cursor] == "B-TITLE");
|
| 3267 |
+
if left_title {
|
| 3268 |
+
output[index] = "B-TITLE".to_string();
|
| 3269 |
+
if let Some(next_word) = (index + 1..tokens.len()).find(|&cursor| {
|
| 3270 |
+
labels[cursor] == "O"
|
| 3271 |
+
&& tokens[cursor].chars().any(|ch| ch.is_alphabetic())
|
| 3272 |
+
}) {
|
| 3273 |
+
output[next_word] = "B-TITLE".to_string();
|
| 3274 |
+
}
|
| 3275 |
+
continue;
|
| 3276 |
+
}
|
| 3277 |
+
}
|
| 3278 |
let previous_word = (0..index)
|
| 3279 |
.rev()
|
| 3280 |
.find(|&cursor| {
|
|
|
|
| 3282 |
|| tokens[cursor].chars().any(|ch| ch.is_alphabetic())
|
| 3283 |
})
|
| 3284 |
.map(|cursor| tokens[cursor].to_ascii_lowercase());
|
| 3285 |
+
if matches!(previous_word.as_deref(), Some("lesson" | "part" | "no")) {
|
| 3286 |
output[index] = "O".to_string();
|
| 3287 |
continue;
|
| 3288 |
}
|
|
|
|
| 3984 |
labels_for("(2014Q4) Bonjour♪恋味パティスリー 第01話 「Lesson 1」 (1280x720 x265 10bit AAC)");
|
| 3985 |
assert!(bonjour.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 3986 |
assert!(!bonjour.contains(&("1".to_string(), "B-EPISODE".to_string())));
|
| 3987 |
+
|
| 3988 |
+
let durarara = labels_for("[VCB-Studio] Durarara!!×2 Ketsu [Menu01][Ma10p_1080p][x265_flac]");
|
| 3989 |
+
assert!(durarara.contains(&("Durarara".to_string(), "B-TITLE".to_string())));
|
| 3990 |
+
assert!(durarara.contains(&("2".to_string(), "B-TITLE".to_string())));
|
| 3991 |
+
assert!(!durarara.contains(&("2".to_string(), "B-EPISODE".to_string())));
|
| 3992 |
+
|
| 3993 |
+
let bd_spot =
|
| 3994 |
+
labels_for("[Moozzi2] Amanchu! [SP05] BD-Spot - 01 (BD 1920x1080 x.264 Flac)");
|
| 3995 |
+
assert!(bd_spot.contains(&("Spot".to_string(), "B-SPECIAL".to_string())));
|
| 3996 |
+
assert!(bd_spot.contains(&("01".to_string(), "B-SPECIAL".to_string())));
|
| 3997 |
+
assert!(!bd_spot.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 3998 |
+
|
| 3999 |
+
let preview_number =
|
| 4000 |
+
labels_for("[Snow-Raws] 刀使ノ巫女 第02話 予告01 (BD 1920x1080 HEVC-YUV420P10 FLAC)");
|
| 4001 |
+
assert!(preview_number.contains(&("02".to_string(), "B-EPISODE".to_string())));
|
| 4002 |
+
assert!(preview_number.contains(&("01".to_string(), "B-SPECIAL".to_string())));
|
| 4003 |
+
|
| 4004 |
+
let bleach_movie = labels_for("Bleach the Movie 3 - Fade to Black, I Call Your Name");
|
| 4005 |
+
assert!(bleach_movie.contains(&("3".to_string(), "B-TITLE".to_string())));
|
| 4006 |
+
assert!(!bleach_movie.contains(&("3".to_string(), "B-EPISODE".to_string())));
|
| 4007 |
+
|
| 4008 |
+
let no_number = labels_for("[甜甜圈字幕组] 小讨厌 081「爷爷的礼物 No.1」");
|
| 4009 |
+
assert!(no_number.contains(&("081".to_string(), "B-EPISODE".to_string())));
|
| 4010 |
+
assert!(!no_number.contains(&("1".to_string(), "B-EPISODE".to_string())));
|
| 4011 |
+
|
| 4012 |
+
let bilingual = labels_for("辉夜大小姐想让我告白~天才们的恋爱头脑战~.S2-01.中日双语.云光字幕组.[1080p]");
|
| 4013 |
+
assert!(bilingual.contains(&("中日".to_string(), "B-SOURCE".to_string())));
|
| 4014 |
+
assert!(!bilingual.contains(&("中日".to_string(), "B-TITLE".to_string())));
|
| 4015 |
}
|
| 4016 |
}
|