Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")

# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
```
- Notebooks
- Google Colab
- Kaggle
Tighten DMHY low-frequency special cases
Browse files
tools/rust_dmhy_template_apply/src/main.rs
CHANGED
|
@@ -201,9 +201,13 @@ static CJK_SEASON_TOKEN_RE: Lazy<Regex> =
|
|
| 201 |
static CJK_SEASON_EMBEDDED_RE: Lazy<Regex> = Lazy::new(|| {
|
| 202 |
Regex::new(r"^(.+?)(第[一二三四五六七八九十\d]+[季期部])(.{0,12})$").unwrap()
|
| 203 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
|
| 205 |
static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
|
| 206 |
-
Regex::new(r"(?i)^(?:(?:NCOP|NCED|OP|ED|PV|CM)(?:[\s_.-]?(?:\d{1,4}|v\d{1,3}|[A-Z]))?|SP(?:[\s_.-]?\d{0,4})?|(?:OVA|OAD|IV)(?:[\s_.-]?\d{0,4})?|(?:Menu|Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?)$").unwrap()
|
| 207 |
});
|
| 208 |
static VOLUME_RE: Lazy<Regex> =
|
| 209 |
Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
|
|
@@ -2187,7 +2191,9 @@ fn is_special_title_phrase(text: &str) -> bool {
|
|
| 2187 |
| "THEATER GREETING EVENT"
|
| 2188 |
| "TOKUTEN"
|
| 2189 |
| "TRAILER"
|
|
|
|
| 2190 |
| "WORLD PREMIERE"
|
|
|
|
| 2191 |
| "番宣"
|
| 2192 |
| "宣番"
|
| 2193 |
| "映像特典"
|
|
@@ -2196,6 +2202,8 @@ fn is_special_title_phrase(text: &str) -> bool {
|
|
| 2196 |
|| normalized.contains("特典映像")
|
| 2197 |
|| normalized.contains("番宣")
|
| 2198 |
|| normalized.contains("宣番")
|
|
|
|
|
|
|
| 2199 |
|| SPECIAL_TITLE_PHRASE_RE.is_match(text)
|
| 2200 |
}
|
| 2201 |
|
|
@@ -2206,6 +2214,7 @@ const KNOWN_TITLE_PHRASES: &[&[&str]] = &[
|
|
| 2206 |
&["Zom", "100"],
|
| 2207 |
&["Kamisama", "Hajimemashita", "2"],
|
| 2208 |
&["Phantasy", "Star", "Online", "2", "Episode", "Oracle"],
|
|
|
|
| 2209 |
&["Lupin The Thrid Jigen Daisuke no Bohyou"],
|
| 2210 |
&["Lupin The Third Jigen Daisuke no Bohyou"],
|
| 2211 |
];
|
|
@@ -2386,6 +2395,18 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 2386 |
if output[index] == "O" && groups[index].class_name.contains("SXE") {
|
| 2387 |
output[index] = "EPISODE".to_string();
|
| 2388 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2389 |
if roles[index].starts_with("EPISODE") && YEAR_RANGE_RE.is_match(&text) {
|
| 2390 |
output[index] = "O".to_string();
|
| 2391 |
continue;
|
|
@@ -2505,7 +2526,11 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 2505 |
}
|
| 2506 |
}
|
| 2507 |
if roles[index].starts_with("EPISODE")
|
| 2508 |
-
&& text.chars().all(|ch| ch.is_ascii_digit())
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2509 |
&& output[..index].iter().any(|role| role == "SPECIAL")
|
| 2510 |
&& !output[..index].iter().any(|role| role.starts_with("EPISODE"))
|
| 2511 |
{
|
|
@@ -2531,9 +2556,13 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 2531 |
&& groups[index - 1].class_name == "SEP"
|
| 2532 |
{
|
| 2533 |
let previous_text = group_text(tokens, &groups[index - 2]);
|
|
|
|
|
|
|
|
|
|
| 2534 |
if previous_text
|
| 2535 |
.chars()
|
| 2536 |
.any(|ch| ch.is_ascii_digit() || matches!(ch, '.' | '-' | '_' | '.'))
|
|
|
|
| 2537 |
{
|
| 2538 |
output[index] = "RESOLUTION".to_string();
|
| 2539 |
continue;
|
|
@@ -2724,11 +2753,13 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 2724 |
|| previous_text.contains("下午")
|
| 2725 |
|| previous_text.contains('年')
|
| 2726 |
|| previous_text.contains('月')
|
|
|
|
| 2727 |
|| next_text.contains('点')
|
| 2728 |
|| next_text.contains('點')
|
| 2729 |
|| next_text.contains('半')
|
| 2730 |
|| next_text.contains('月')
|
| 2731 |
|| next_text.contains('日')
|
|
|
|
| 2732 |
{
|
| 2733 |
output[index] = "O".to_string();
|
| 2734 |
}
|
|
@@ -2898,11 +2929,22 @@ fn normalize_title_token(token: &str) -> (Vec<String>, Vec<String>) {
|
|
| 2898 |
labels.push("O".to_string());
|
| 2899 |
continue;
|
| 2900 |
}
|
| 2901 |
-
if CJK_SEASON_TOKEN_RE.is_match(&piece) {
|
| 2902 |
output_pieces.push(piece);
|
| 2903 |
labels.push("B-SEASON".to_string());
|
| 2904 |
continue;
|
| 2905 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2906 |
if let Some(caps) = CJK_SEASON_EMBEDDED_RE.captures(&piece) {
|
| 2907 |
let before = caps.get(1).map(|m| m.as_str()).unwrap_or_default();
|
| 2908 |
let season = caps.get(2).map(|m| m.as_str()).unwrap_or_default();
|
|
@@ -2952,6 +2994,23 @@ fn is_standalone_separator(token: &str) -> bool {
|
|
| 2952 |
.is_some_and(|ch| ch.is_whitespace() || !ch.is_alphanumeric())
|
| 2953 |
}
|
| 2954 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2955 |
fn project_refined_tokens(
|
| 2956 |
tokens: &[String],
|
| 2957 |
groups: &[Group],
|
|
@@ -2982,6 +3041,13 @@ fn project_refined_tokens(
|
|
| 2982 |
continue;
|
| 2983 |
}
|
| 2984 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2985 |
if matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE") {
|
| 2986 |
if let Some((pieces, labels)) = split_sxe_token(&strip_wrapper(token)) {
|
| 2987 |
output_tokens.extend(pieces);
|
|
@@ -3705,5 +3771,42 @@ mod tests {
|
|
| 3705 |
assert!(doremi_bonus.contains(&("おジャ魔女どれみナ".to_string(), "B-TITLE".to_string())));
|
| 3706 |
assert!(doremi_bonus.contains(&("07".to_string(), "B-SPECIAL".to_string())));
|
| 3707 |
assert!(!doremi_bonus.contains(&("07".to_string(), "B-EPISODE".to_string())));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3708 |
}
|
| 3709 |
}
|
|
|
|
| 201 |
static CJK_SEASON_EMBEDDED_RE: Lazy<Regex> = Lazy::new(|| {
|
| 202 |
Regex::new(r"^(.+?)(第[一二三四五六七八九十\d]+[季期部])(.{0,12})$").unwrap()
|
| 203 |
});
|
| 204 |
+
/// Case-insensitive, whole-token matcher for text that ends in an ASCII season
/// marker such as `S2`.
///
/// Capture 1 is the non-empty, lazily matched text before the marker; capture 2
/// is the marker itself (`S` followed by one or two digits). The two must be
/// separated by at least one whitespace, `_`, `.` or `-` character, which is
/// not part of either capture.
static ASCII_SEASON_SUFFIX_RE: Lazy<Regex> =
|
| 205 |
+
Lazy::new(|| Regex::new(r"(?i)^(.+?)[\s_.-]+(S\d{1,2})$").unwrap());
|
| 206 |
+
/// Whole-token matcher for text that ends in one of a fixed list of CJK
/// language markers (each listed in both simplified and traditional spelling),
/// optionally followed by a single trailing `第`.
///
/// Capture 1 is the non-empty, lazily matched text before the marker, capture 2
/// is the language marker, and capture 3 is the trailing `第` (an empty match
/// when it is absent).
static CJK_TITLE_LANG_PREFIX_RE: Lazy<Regex> =
|
| 207 |
+
Lazy::new(|| Regex::new(r"^(.+?)(国日双语|國日雙語|日语版|日語版|国语版|國語版|双语|雙語)(第?)$").unwrap());
|
| 208 |
static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
|
| 209 |
/// Case-insensitive, whole-token matcher for "special content" tags.
///
/// The alternation accepts, each with an optional single separator
/// (whitespace, `_`, `.` or `-`) before its suffix:
/// - `NCOP`/`NCED`/`OP`/`ED`/`PV`/`CM`, optionally followed by 1-4 digits, a
///   `v` + 1-3 digit version, or a single letter;
/// - `SP`, optionally followed by up to 4 digits;
/// - `OVA`/`OAD`/`IV`, optionally followed by up to 4 digits;
/// - `Menu`, optionally prefixed by `BD`, with an optional suffix of up to
///   4 digits, `Ep` + 1-4 digits, or a single letter;
/// - `Intro`/`Preview`/`Trailer`/`Teaser`/`Animatic(s)` with the same optional
///   suffix forms as `Menu`.
static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
|
| 210 |
+
Regex::new(r"(?i)^(?:(?:NCOP|NCED|OP|ED|PV|CM)(?:[\s_.-]?(?:\d{1,4}|v\d{1,3}|[A-Z]))?|SP(?:[\s_.-]?\d{0,4})?|(?:OVA|OAD|IV)(?:[\s_.-]?\d{0,4})?|(?:BD)?Menu(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?|(?:Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?)$").unwrap()
|
| 211 |
});
|
| 212 |
static VOLUME_RE: Lazy<Regex> =
|
| 213 |
Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
|
|
|
|
| 2191 |
| "THEATER GREETING EVENT"
|
| 2192 |
| "TOKUTEN"
|
| 2193 |
| "TRAILER"
|
| 2194 |
+
| "TV SPOT"
|
| 2195 |
| "WORLD PREMIERE"
|
| 2196 |
+
| "予告"
|
| 2197 |
| "番宣"
|
| 2198 |
| "宣番"
|
| 2199 |
| "映像特典"
|
|
|
|
| 2202 |
|| normalized.contains("特典映像")
|
| 2203 |
|| normalized.contains("番宣")
|
| 2204 |
|| normalized.contains("宣番")
|
| 2205 |
+
|| normalized.contains("TV SPOT")
|
| 2206 |
+
|| text.contains("予告")
|
| 2207 |
|| SPECIAL_TITLE_PHRASE_RE.is_match(text)
|
| 2208 |
}
|
| 2209 |
|
|
|
|
| 2214 |
&["Zom", "100"],
|
| 2215 |
&["Kamisama", "Hajimemashita", "2"],
|
| 2216 |
&["Phantasy", "Star", "Online", "2", "Episode", "Oracle"],
|
| 2217 |
+
&["Ghiblies", "Episode", "2"],
|
| 2218 |
&["Lupin The Thrid Jigen Daisuke no Bohyou"],
|
| 2219 |
&["Lupin The Third Jigen Daisuke no Bohyou"],
|
| 2220 |
];
|
|
|
|
| 2395 |
if output[index] == "O" && groups[index].class_name.contains("SXE") {
|
| 2396 |
output[index] = "EPISODE".to_string();
|
| 2397 |
}
|
| 2398 |
+
if text.eq_ignore_ascii_case("TV") {
|
| 2399 |
+
let next_text = (index + 1..roles.len())
|
| 2400 |
+
.find(|&cursor| groups[cursor].class_name != "SEP")
|
| 2401 |
+
.map(|cursor| (cursor, group_text(tokens, &groups[cursor])));
|
| 2402 |
+
if let Some((spot_index, spot_text)) = next_text {
|
| 2403 |
+
if spot_text.eq_ignore_ascii_case("Spot") {
|
| 2404 |
+
output[index] = "SPECIAL".to_string();
|
| 2405 |
+
output[spot_index] = "SPECIAL".to_string();
|
| 2406 |
+
continue;
|
| 2407 |
+
}
|
| 2408 |
+
}
|
| 2409 |
+
}
|
| 2410 |
if roles[index].starts_with("EPISODE") && YEAR_RANGE_RE.is_match(&text) {
|
| 2411 |
output[index] = "O".to_string();
|
| 2412 |
continue;
|
|
|
|
| 2526 |
}
|
| 2527 |
}
|
| 2528 |
if roles[index].starts_with("EPISODE")
|
| 2529 |
+
&& (text.chars().all(|ch| ch.is_ascii_digit())
|
| 2530 |
+
|| matches!(
|
| 2531 |
+
classify_atom(&text).as_str(),
|
| 2532 |
+
"EPISODE" | "EPISODE_VERSION"
|
| 2533 |
+
))
|
| 2534 |
&& output[..index].iter().any(|role| role == "SPECIAL")
|
| 2535 |
&& !output[..index].iter().any(|role| role.starts_with("EPISODE"))
|
| 2536 |
{
|
|
|
|
| 2556 |
&& groups[index - 1].class_name == "SEP"
|
| 2557 |
{
|
| 2558 |
let previous_text = group_text(tokens, &groups[index - 2]);
|
| 2559 |
+
let next_sourceish = (index + 1..roles.len())
|
| 2560 |
+
.find(|&cursor| groups[cursor].class_name != "SEP")
|
| 2561 |
+
.is_some_and(|cursor| matches!(roles[cursor].as_str(), "SOURCE" | "RESOLUTION"));
|
| 2562 |
if previous_text
|
| 2563 |
.chars()
|
| 2564 |
.any(|ch| ch.is_ascii_digit() || matches!(ch, '.' | '-' | '_' | '.'))
|
| 2565 |
+
|| next_sourceish
|
| 2566 |
{
|
| 2567 |
output[index] = "RESOLUTION".to_string();
|
| 2568 |
continue;
|
|
|
|
| 2753 |
|| previous_text.contains("下午")
|
| 2754 |
|| previous_text.contains('年')
|
| 2755 |
|| previous_text.contains('月')
|
| 2756 |
+
|| previous_text.contains('秒')
|
| 2757 |
|| next_text.contains('点')
|
| 2758 |
|| next_text.contains('點')
|
| 2759 |
|| next_text.contains('半')
|
| 2760 |
|| next_text.contains('月')
|
| 2761 |
|| next_text.contains('日')
|
| 2762 |
+
|| next_text.contains('秒')
|
| 2763 |
{
|
| 2764 |
output[index] = "O".to_string();
|
| 2765 |
}
|
|
|
|
| 2929 |
labels.push("O".to_string());
|
| 2930 |
continue;
|
| 2931 |
}
|
| 2932 |
+
if CJK_SEASON_TOKEN_RE.is_match(&piece) || SEASON_RE.is_match(&piece) {
|
| 2933 |
output_pieces.push(piece);
|
| 2934 |
labels.push("B-SEASON".to_string());
|
| 2935 |
continue;
|
| 2936 |
}
|
| 2937 |
+
if let Some(caps) = ASCII_SEASON_SUFFIX_RE.captures(&piece) {
|
| 2938 |
+
let before = caps.get(1).map(|m| m.as_str()).unwrap_or_default();
|
| 2939 |
+
let season = caps.get(2).map(|m| m.as_str()).unwrap_or_default();
|
| 2940 |
+
if !before.is_empty() {
|
| 2941 |
+
output_pieces.push(before.to_string());
|
| 2942 |
+
labels.push("B-TITLE".to_string());
|
| 2943 |
+
}
|
| 2944 |
+
output_pieces.push(season.to_string());
|
| 2945 |
+
labels.push("B-SEASON".to_string());
|
| 2946 |
+
continue;
|
| 2947 |
+
}
|
| 2948 |
if let Some(caps) = CJK_SEASON_EMBEDDED_RE.captures(&piece) {
|
| 2949 |
let before = caps.get(1).map(|m| m.as_str()).unwrap_or_default();
|
| 2950 |
let season = caps.get(2).map(|m| m.as_str()).unwrap_or_default();
|
|
|
|
| 2994 |
.is_some_and(|ch| ch.is_whitespace() || !ch.is_alphanumeric())
|
| 2995 |
}
|
| 2996 |
|
| 2997 |
+
/// Splits a token of the form `<title><language marker>[第]` into separately
/// labelled pieces, using `CJK_TITLE_LANG_PREFIX_RE`.
///
/// Returns `Some((pieces, labels))` where the two vectors are parallel:
/// the title piece is labelled `B-TITLE`, the language marker `B-SOURCE`, and
/// a trailing `第` (only when one was captured) `O`.
///
/// Returns `None` when the token does not match the regex, or when the title
/// part is shorter than two characters.
fn split_cjk_title_lang_prefix(token: &str) -> Option<(Vec<String>, Vec<String>)> {
|
| 2998 |
+
let caps = CJK_TITLE_LANG_PREFIX_RE.captures(token)?;
|
| 2999 |
+
let title = caps.get(1)?.as_str();
|
| 3000 |
+
let lang = caps.get(2)?.as_str();
|
| 3001 |
+
// Capture 3 is the optional trailing `第`; fall back to "" if the group is missing.
let marker = caps.get(3).map(|m| m.as_str()).unwrap_or_default();
|
| 3002 |
+
// Length is measured in `char`s, not bytes, so multi-byte CJK characters count once each.
// NOTE(review): presumably this guards against splitting off a one-character "title" — confirm.
if title.chars().count() < 2 {
|
| 3003 |
+
return None;
|
| 3004 |
+
}
|
| 3005 |
+
let mut pieces = vec![title.to_string(), lang.to_string()];
|
| 3006 |
+
let mut labels = vec!["B-TITLE".to_string(), "B-SOURCE".to_string()];
|
| 3007 |
+
// Keep the trailing `第` as its own piece so `pieces` and `labels` stay the same length.
if !marker.is_empty() {
|
| 3008 |
+
pieces.push(marker.to_string());
|
| 3009 |
+
labels.push("O".to_string());
|
| 3010 |
+
}
|
| 3011 |
+
Some((pieces, labels))
|
| 3012 |
+
}
|
| 3013 |
+
|
| 3014 |
fn project_refined_tokens(
|
| 3015 |
tokens: &[String],
|
| 3016 |
groups: &[Group],
|
|
|
|
| 3041 |
continue;
|
| 3042 |
}
|
| 3043 |
}
|
| 3044 |
+
if role == "SOURCE" {
|
| 3045 |
+
if let Some((pieces, labels)) = split_cjk_title_lang_prefix(token) {
|
| 3046 |
+
output_tokens.extend(pieces);
|
| 3047 |
+
output_labels.extend(labels);
|
| 3048 |
+
continue;
|
| 3049 |
+
}
|
| 3050 |
+
}
|
| 3051 |
if matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE") {
|
| 3052 |
if let Some((pieces, labels)) = split_sxe_token(&strip_wrapper(token)) {
|
| 3053 |
output_tokens.extend(pieces);
|
|
|
|
| 3771 |
assert!(doremi_bonus.contains(&("おジャ魔女どれみナ".to_string(), "B-TITLE".to_string())));
|
| 3772 |
assert!(doremi_bonus.contains(&("07".to_string(), "B-SPECIAL".to_string())));
|
| 3773 |
assert!(!doremi_bonus.contains(&("07".to_string(), "B-EPISODE".to_string())));
|
| 3774 |
+
|
| 3775 |
+
let bd_menu =
|
| 3776 |
+
labels_for("[HYSUB]Kuusen Madoushi Kouhosei no Kyoukan[BDMenu][01v1][MP4][1280X720]");
|
| 3777 |
+
assert!(bd_menu.contains(&("BDMenu".to_string(), "B-SPECIAL".to_string())));
|
| 3778 |
+
assert!(bd_menu.contains(&("01v1".to_string(), "B-SPECIAL".to_string())));
|
| 3779 |
+
assert!(!bd_menu.contains(&("BDMenu".to_string(), "B-TITLE".to_string())));
|
| 3780 |
+
|
| 3781 |
+
let ura_on = labels_for("K-ON !! (TV S2 2010). URA-ON !! 01; 1080_h264_flac");
|
| 3782 |
+
assert!(ura_on.contains(&("K".to_string(), "B-TITLE".to_string())));
|
| 3783 |
+
assert!(ura_on.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 3784 |
+
assert!(ura_on.contains(&("1080".to_string(), "B-RESOLUTION".to_string())));
|
| 3785 |
+
assert!(!ura_on.contains(&("1080".to_string(), "B-EPISODE".to_string())));
|
| 3786 |
+
|
| 3787 |
+
let machikado = labels_for("[KTXP][Machikado_Mazoku_S2][Mini][01][GB][1080p][BDrip][HEVC]");
|
| 3788 |
+
assert!(machikado.contains(&("Machikado".to_string(), "B-TITLE".to_string())));
|
| 3789 |
+
assert!(machikado.contains(&("S2".to_string(), "B-SEASON".to_string())));
|
| 3790 |
+
assert!(machikado.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 3791 |
+
|
| 3792 |
+
let ronin = labels_for("【蓝色狂想】魔神坛斗士国日双语第01集");
|
| 3793 |
+
assert!(ronin.contains(&("魔神坛斗士".to_string(), "B-TITLE".to_string())));
|
| 3794 |
+
assert!(ronin.contains(&("国日双语".to_string(), "B-SOURCE".to_string())));
|
| 3795 |
+
assert!(ronin.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 3796 |
+
|
| 3797 |
+
let ghiblies = labels_for("Ghiblies - Episode 2 op");
|
| 3798 |
+
assert!(ghiblies.contains(&("Ghiblies".to_string(), "B-TITLE".to_string())));
|
| 3799 |
+
assert!(ghiblies.contains(&("2".to_string(), "B-TITLE".to_string())));
|
| 3800 |
+
assert!(!ghiblies.contains(&("2".to_string(), "B-EPISODE".to_string())));
|
| 3801 |
+
|
| 3802 |
+
let tv_spot = labels_for("[RUELL-Next] Fruits Basket TV Spot 1 (DVD 768x576 x264 AAC) [49531416]");
|
| 3803 |
+
assert!(tv_spot.contains(&("TV".to_string(), "B-SPECIAL".to_string())));
|
| 3804 |
+
assert!(tv_spot.contains(&("1".to_string(), "B-SPECIAL".to_string())));
|
| 3805 |
+
assert!(!tv_spot.contains(&("1".to_string(), "B-EPISODE".to_string())));
|
| 3806 |
+
|
| 3807 |
+
let preview_seconds =
|
| 3808 |
+
labels_for("[DVD] 鋼鉄天使くるみ 予告 第03話 30秒バージョン (640x480 WMV9)");
|
| 3809 |
+
assert!(preview_seconds.contains(&("03".to_string(), "B-EPISODE".to_string())));
|
| 3810 |
+
assert!(!preview_seconds.contains(&("30".to_string(), "B-EPISODE".to_string())));
|
| 3811 |
}
|
| 3812 |
}
|