Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")

# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
```
- Notebooks
- Google Colab
- Kaggle
Improve DMHY low-frequency metadata handling
Browse files
tools/rust_dmhy_template_apply/src/main.rs
CHANGED
|
@@ -222,7 +222,7 @@ static LANG_RE: Lazy<Regex> = Lazy::new(|| {
|
|
| 222 |
Regex::new(r"(?i)^(?:CHS|CHT|ZHS|ZHT|GB|BIG5|JPN?|JP|JA|JAP|ENG|EN|SC|TC|简[体體]?|繁[体體]?|简日|繁日|字幕|内封|外挂|Sub|Subs|MSubs?)$").unwrap()
|
| 223 |
});
|
| 224 |
static MEDIA_RE: Lazy<Regex> = Lazy::new(|| {
|
| 225 |
-
Regex::new(r"(?i)^(?:WEB|WEB[-_. ]?DL|WEB[-_. ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|HD|UHD|HDTV|TVRip|REMUX|xvid|x26[45]|h\.?26[45]|HEVC|AVC|AV1|YUV\d+P?\d*|AAC\s*\d*(?:\.\d+)?|DDP\s*\d*(?:\.\d+)?|FLAC|MP3|DTS|HDMA|DTS-HDMA|E?AC3x?\d*(?:\.\d+)?|Opus|WMV\d*|\d(?:\.\d)?ch|10[-_. ]?bit|8[-_. ]?bit|Hi10p|Ma10p|ASSx?\d*|SRTx?\d*|SUP|R\d[A-Z]*|NoSub|MKV|MP4|AVI|RAW|Raws?)$").unwrap()
|
| 226 |
});
|
| 227 |
static SPECIAL_TITLE_PHRASE_RE: Lazy<Regex> = Lazy::new(|| {
|
| 228 |
Regex::new(r"(?i)\b(?:theater\s+greeting\s+event|world\s+prem(?:eie|iere)|picture\s+drama)\b")
|
|
@@ -2207,6 +2207,16 @@ fn is_special_title_phrase(text: &str) -> bool {
|
|
| 2207 |
|| SPECIAL_TITLE_PHRASE_RE.is_match(text)
|
| 2208 |
}
|
| 2209 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2210 |
const KNOWN_TITLE_PHRASES: &[&[&str]] = &[
|
| 2211 |
&["SPY", "x", "FAMILY"],
|
| 2212 |
&["Spy", "x", "Family"],
|
|
@@ -2222,7 +2232,15 @@ const KNOWN_TITLE_PHRASES: &[&[&str]] = &[
|
|
| 2222 |
fn apply_known_title_phrases(tokens: &[String], groups: &[Group], roles: &mut [String]) {
|
| 2223 |
if let Some(whitelists) = RUNTIME_WHITELISTS.get() {
|
| 2224 |
for (index, group) in groups.iter().enumerate() {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2225 |
if group.class_name == "BRACKET_TEXT"
|
|
|
|
| 2226 |
&& whitelists
|
| 2227 |
.group_names
|
| 2228 |
.contains(&normalize_whitelist_name(&group_text(tokens, group)))
|
|
@@ -2315,6 +2333,22 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 2315 |
let ep_markers = ["EP", "E", "Episode", "ep", "episode"];
|
| 2316 |
let roman = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX"];
|
| 2317 |
apply_known_title_phrases(tokens, groups, &mut output);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2318 |
if roles
|
| 2319 |
.first()
|
| 2320 |
.is_some_and(|role| role.starts_with("EPISODE"))
|
|
@@ -2728,10 +2762,23 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 2728 |
} else {
|
| 2729 |
String::new()
|
| 2730 |
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2731 |
if previous_text.ends_with('第') && next_text.starts_with('期') {
|
| 2732 |
output[index] = "SEASON".to_string();
|
| 2733 |
continue;
|
| 2734 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2735 |
if output[..index].iter().any(|role| role == "TITLE")
|
| 2736 |
&& (output[..index]
|
| 2737 |
.iter()
|
|
@@ -3127,6 +3174,19 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
|
| 3127 |
];
|
| 3128 |
let mut output = labels.to_vec();
|
| 3129 |
for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3130 |
if label != "O" || !entity_joiners.contains(&token.as_str()) {
|
| 3131 |
continue;
|
| 3132 |
}
|
|
@@ -3808,5 +3868,21 @@ mod tests {
|
|
| 3808 |
labels_for("[DVD] 鋼鉄天使くるみ 予告 第03話 30秒バージョン (640x480 WMV9)");
|
| 3809 |
assert!(preview_seconds.contains(&("03".to_string(), "B-EPISODE".to_string())));
|
| 3810 |
assert!(!preview_seconds.contains(&("30".to_string(), "B-EPISODE".to_string())));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3811 |
}
|
| 3812 |
}
|
|
|
|
| 222 |
Regex::new(r"(?i)^(?:CHS|CHT|ZHS|ZHT|GB|BIG5|JPN?|JP|JA|JAP|ENG|EN|SC|TC|简[体體]?|繁[体體]?|简日|繁日|字幕|内封|外挂|Sub|Subs|MSubs?)$").unwrap()
|
| 223 |
});
|
| 224 |
static MEDIA_RE: Lazy<Regex> = Lazy::new(|| {
|
| 225 |
+
Regex::new(r"(?i)^(?:WEB|WEB[-_. ]?DL|WEB[-_. ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|HD|UHD|HDTV|TVRip|REMUX|xvid|x26[45]|h\.?26[45]|HEVC|AVC|AV1|YUV\d+P?\d*|AAC\s*\d*(?:\.\d+)?|DDP\s*\d*(?:\.\d+)?|FLAC|MP3|DTS|HDMA|DTS-HDMA|E?AC3x?\d*(?:\.\d+)?|Opus|WMV\d*|\d(?:\.\d)?ch|10[-_. ]?bit|8[-_. ]?bit|Hi10p?|Ma10p|ASSx?\d*|SRTx?\d*|SUP|R\d[A-Z]*|NoSub|MKV|MP4|AVI|RAW|Raws?)$").unwrap()
|
| 226 |
});
|
| 227 |
static SPECIAL_TITLE_PHRASE_RE: Lazy<Regex> = Lazy::new(|| {
|
| 228 |
Regex::new(r"(?i)\b(?:theater\s+greeting\s+event|world\s+prem(?:eie|iere)|picture\s+drama)\b")
|
|
|
|
| 2207 |
|| SPECIAL_TITLE_PHRASE_RE.is_match(text)
|
| 2208 |
}
|
| 2209 |
|
| 2210 |
+
fn looks_like_release_group(text: &str) -> bool {
|
| 2211 |
+
let normalized = text.to_ascii_lowercase();
|
| 2212 |
+
normalized.contains("fansub")
|
| 2213 |
+
|| normalized.ends_with("sub")
|
| 2214 |
+
|| normalized.contains("sub&")
|
| 2215 |
+
|| normalized.contains("&sub")
|
| 2216 |
+
|| normalized.contains("字幕组")
|
| 2217 |
+
|| normalized.contains("字幕組")
|
| 2218 |
+
}
|
| 2219 |
+
|
| 2220 |
const KNOWN_TITLE_PHRASES: &[&[&str]] = &[
|
| 2221 |
&["SPY", "x", "FAMILY"],
|
| 2222 |
&["Spy", "x", "Family"],
|
|
|
|
| 2232 |
fn apply_known_title_phrases(tokens: &[String], groups: &[Group], roles: &mut [String]) {
|
| 2233 |
if let Some(whitelists) = RUNTIME_WHITELISTS.get() {
|
| 2234 |
for (index, group) in groups.iter().enumerate() {
|
| 2235 |
+
let previous_structural = roles[..index].iter().any(|role| {
|
| 2236 |
+
role.starts_with("EPISODE")
|
| 2237 |
+
|| matches!(
|
| 2238 |
+
role.as_str(),
|
| 2239 |
+
"SEASON" | "SPECIAL" | "SOURCE" | "RESOLUTION"
|
| 2240 |
+
)
|
| 2241 |
+
});
|
| 2242 |
if group.class_name == "BRACKET_TEXT"
|
| 2243 |
+
&& !previous_structural
|
| 2244 |
&& whitelists
|
| 2245 |
.group_names
|
| 2246 |
.contains(&normalize_whitelist_name(&group_text(tokens, group)))
|
|
|
|
| 2333 |
let ep_markers = ["EP", "E", "Episode", "ep", "episode"];
|
| 2334 |
let roman = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX"];
|
| 2335 |
apply_known_title_phrases(tokens, groups, &mut output);
|
| 2336 |
+
if output.first().is_some_and(|role| role == "GROUP") {
|
| 2337 |
+
let first_text = group_text(tokens, &groups[0]);
|
| 2338 |
+
let first_is_known_group = RUNTIME_WHITELISTS.get().is_some_and(|whitelists| {
|
| 2339 |
+
whitelists
|
| 2340 |
+
.group_names
|
| 2341 |
+
.contains(&normalize_whitelist_name(&first_text))
|
| 2342 |
+
});
|
| 2343 |
+
if !first_is_known_group {
|
| 2344 |
+
if let Some(groupish_index) = (1..groups.len()).find(|&index| {
|
| 2345 |
+
output[index] == "TITLE" && looks_like_release_group(&group_text(tokens, &groups[index]))
|
| 2346 |
+
}) {
|
| 2347 |
+
output[0] = "TITLE".to_string();
|
| 2348 |
+
output[groupish_index] = "GROUP".to_string();
|
| 2349 |
+
}
|
| 2350 |
+
}
|
| 2351 |
+
}
|
| 2352 |
if roles
|
| 2353 |
.first()
|
| 2354 |
.is_some_and(|role| role.starts_with("EPISODE"))
|
|
|
|
| 2762 |
} else {
|
| 2763 |
String::new()
|
| 2764 |
};
|
| 2765 |
+
let previous_real_text = (0..index)
|
| 2766 |
+
.rev()
|
| 2767 |
+
.find(|&cursor| groups[cursor].class_name != "SEP")
|
| 2768 |
+
.map(|cursor| group_text(tokens, &groups[cursor]))
|
| 2769 |
+
.unwrap_or_default();
|
| 2770 |
if previous_text.ends_with('第') && next_text.starts_with('期') {
|
| 2771 |
output[index] = "SEASON".to_string();
|
| 2772 |
continue;
|
| 2773 |
}
|
| 2774 |
+
if matches!(
|
| 2775 |
+
previous_real_text.to_ascii_lowercase().as_str(),
|
| 2776 |
+
"lesson" | "part"
|
| 2777 |
+
)
|
| 2778 |
+
{
|
| 2779 |
+
output[index] = "O".to_string();
|
| 2780 |
+
continue;
|
| 2781 |
+
}
|
| 2782 |
if output[..index].iter().any(|role| role == "TITLE")
|
| 2783 |
&& (output[..index]
|
| 2784 |
.iter()
|
|
|
|
| 3174 |
];
|
| 3175 |
let mut output = labels.to_vec();
|
| 3176 |
for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
|
| 3177 |
+
if label == "B-EPISODE" && token.chars().all(|ch| ch.is_ascii_digit()) {
|
| 3178 |
+
let previous_word = (0..index)
|
| 3179 |
+
.rev()
|
| 3180 |
+
.find(|&cursor| {
|
| 3181 |
+
!joiners.contains(&tokens[cursor].as_str()) && labels[cursor] != "O"
|
| 3182 |
+
|| tokens[cursor].chars().any(|ch| ch.is_alphabetic())
|
| 3183 |
+
})
|
| 3184 |
+
.map(|cursor| tokens[cursor].to_ascii_lowercase());
|
| 3185 |
+
if matches!(previous_word.as_deref(), Some("lesson" | "part")) {
|
| 3186 |
+
output[index] = "O".to_string();
|
| 3187 |
+
continue;
|
| 3188 |
+
}
|
| 3189 |
+
}
|
| 3190 |
if label != "O" || !entity_joiners.contains(&token.as_str()) {
|
| 3191 |
continue;
|
| 3192 |
}
|
|
|
|
| 3868 |
labels_for("[DVD] 鋼鉄天使くるみ 予告 第03話 30秒バージョン (640x480 WMV9)");
|
| 3869 |
assert!(preview_seconds.contains(&("03".to_string(), "B-EPISODE".to_string())));
|
| 3870 |
assert!(!preview_seconds.contains(&("30".to_string(), "B-EPISODE".to_string())));
|
| 3871 |
+
|
| 3872 |
+
let hi10_source =
|
| 3873 |
+
labels_for("[POPGO][Shigatsu wa Kimi no Uso] [01][Hi10][720P][GB][A964DA24]");
|
| 3874 |
+
assert!(hi10_source.contains(&("Hi10".to_string(), "B-SOURCE".to_string())));
|
| 3875 |
+
assert!(!hi10_source.contains(&("Hi10".to_string(), "B-GROUP".to_string())));
|
| 3876 |
+
|
| 3877 |
+
let souten =
|
| 3878 |
+
labels_for("[苍天之拳].[Fosky_Fansub][Souten_No_Ken][DVDRIP][01][H.264_FLAC][848x480][CDD495FC]");
|
| 3879 |
+
assert!(souten.contains(&("Fosky".to_string(), "B-GROUP".to_string())));
|
| 3880 |
+
assert!(!souten.contains(&("苍天之拳".to_string(), "B-GROUP".to_string())));
|
| 3881 |
+
assert!(souten.contains(&("Souten".to_string(), "B-TITLE".to_string())));
|
| 3882 |
+
|
| 3883 |
+
let bonjour =
|
| 3884 |
+
labels_for("(2014Q4) Bonjour♪恋味パティスリー 第01話 「Lesson 1」 (1280x720 x265 10bit AAC)");
|
| 3885 |
+
assert!(bonjour.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 3886 |
+
assert!(!bonjour.contains(&("1".to_string(), "B-EPISODE".to_string())));
|
| 3887 |
}
|
| 3888 |
}
|