Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")

# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
- Notebooks
- Google Colab
- Kaggle
Refine DMHY template labeling rules
Browse files
tools/rust_dmhy_template_apply/README.md
CHANGED
|
@@ -17,7 +17,7 @@ cargo run --release --manifest-path tools\rust_dmhy_template_apply\Cargo.toml --
|
|
| 17 |
--recipe-top 5000 `
|
| 18 |
--review-top 5000 `
|
| 19 |
--min-count 2 `
|
| 20 |
-
--recipe-min-count
|
| 21 |
--threads 24
|
| 22 |
```
|
| 23 |
|
|
|
|
| 17 |
--recipe-top 5000 `
|
| 18 |
--review-top 5000 `
|
| 19 |
--min-count 2 `
|
| 20 |
+
--recipe-min-count 25 `
|
| 21 |
--threads 24
|
| 22 |
```
|
| 23 |
|
tools/rust_dmhy_template_apply/src/main.rs
CHANGED
|
@@ -61,7 +61,7 @@ struct Args {
|
|
| 61 |
review_top: usize,
|
| 62 |
#[arg(long, default_value_t = 8)]
|
| 63 |
examples: usize,
|
| 64 |
-
#[arg(long, default_value_t =
|
| 65 |
recipe_min_count: usize,
|
| 66 |
#[arg(long, default_value = "high")]
|
| 67 |
confidence: String,
|
|
@@ -155,6 +155,8 @@ static EPISODE_BATCH_RE: Lazy<Regex> = Lazy::new(|| {
|
|
| 155 |
static SXE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S\d{1,2}E\d{1,4}(?:v\d+)?$").unwrap());
|
| 156 |
static SXE_VALUE_RE: Lazy<Regex> =
|
| 157 |
Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})E(\d{1,4})(?:v(\d+))?$").unwrap());
|
|
|
|
|
|
|
| 158 |
static SEASON_RE: Lazy<Regex> = Lazy::new(|| {
|
| 159 |
Regex::new(r"(?i)^(?:S\d{1,2}|Season\s*\d{1,2}|第[一二三四五六七八九十\d]+[季期部])$").unwrap()
|
| 160 |
});
|
|
@@ -1357,6 +1359,19 @@ fn split_sxe_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
|
|
| 1357 |
Some((pieces, labels))
|
| 1358 |
}
|
| 1359 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1360 |
fn split_season_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
|
| 1361 |
let caps = SEASON_VALUE_RE.captures(token)?;
|
| 1362 |
Some((
|
|
@@ -1491,6 +1506,44 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 1491 |
continue;
|
| 1492 |
}
|
| 1493 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1494 |
if roles[index] == "TITLE" && is_special_title_phrase(&text) {
|
| 1495 |
output[index] = "SPECIAL".to_string();
|
| 1496 |
continue;
|
|
@@ -1726,6 +1779,11 @@ fn project_refined_tokens(
|
|
| 1726 |
output_labels.extend(labels);
|
| 1727 |
continue;
|
| 1728 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1729 |
}
|
| 1730 |
for piece in split_refined_token(token) {
|
| 1731 |
if matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE") {
|
|
@@ -1734,6 +1792,11 @@ fn project_refined_tokens(
|
|
| 1734 |
output_labels.extend(labels);
|
| 1735 |
continue;
|
| 1736 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1737 |
}
|
| 1738 |
let label = label_for_refined_piece(&piece, role, &group.class_name);
|
| 1739 |
let (pieces, labels) = normalize_generated_tokens(&[piece], &[label]);
|
|
@@ -1879,6 +1942,9 @@ mod tests {
|
|
| 1879 |
("02".to_string(), "B-EPISODE".to_string())
|
| 1880 |
]
|
| 1881 |
);
|
|
|
|
|
|
|
|
|
|
| 1882 |
let bracket_sxe = labels_for("[FLsnow.feat.PO][Himitsu_no_Aipri][1080P][S2E01]");
|
| 1883 |
assert!(bracket_sxe.contains(&("2".to_string(), "B-SEASON".to_string())));
|
| 1884 |
assert!(bracket_sxe.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
|
@@ -1891,6 +1957,11 @@ mod tests {
|
|
| 1891 |
assert!(!beyblade.contains(&("X".to_string(), "B-SEASON".to_string())));
|
| 1892 |
let bang_title = labels_for("[Dymy][Gugure! Kokkuri-san][06][BIG5][1280X720]");
|
| 1893 |
assert!(bang_title.contains(&("!".to_string(), "B-TITLE".to_string())));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1894 |
|
| 1895 |
let conan_time = labels_for("【蓝色狂想】名侦探柯南TV版第036集-周一晚上7点半杀人事件");
|
| 1896 |
assert!(conan_time.contains(&("036".to_string(), "B-EPISODE".to_string())));
|
|
@@ -1918,7 +1989,8 @@ mod tests {
|
|
| 1918 |
);
|
| 1919 |
let woody = labels_for(&trimmed);
|
| 1920 |
assert!(woody.contains(&("4".to_string(), "B-SEASON".to_string())));
|
| 1921 |
-
assert!(woody.contains(&("
|
|
|
|
| 1922 |
assert!(woody.contains(&("The".to_string(), "B-TITLE".to_string())));
|
| 1923 |
assert!(woody.contains(&("Show".to_string(), "B-TITLE".to_string())));
|
| 1924 |
assert!(!woody.contains(&("1999".to_string(), "B-EPISODE".to_string())));
|
|
|
|
| 61 |
review_top: usize,
|
| 62 |
#[arg(long, default_value_t = 8)]
|
| 63 |
examples: usize,
|
| 64 |
+
#[arg(long, default_value_t = 25)]
|
| 65 |
recipe_min_count: usize,
|
| 66 |
#[arg(long, default_value = "high")]
|
| 67 |
confidence: String,
|
|
|
|
| 155 |
static SXE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S\d{1,2}E\d{1,4}(?:v\d+)?$").unwrap());
|
| 156 |
static SXE_VALUE_RE: Lazy<Regex> =
|
| 157 |
Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})E(\d{1,4})(?:v(\d+))?$").unwrap());
|
| 158 |
+
static EPISODE_VALUE_RE: Lazy<Regex> =
|
| 159 |
+
Lazy::new(|| Regex::new(r"(?i)^(EP|E|#)(\d{1,4})(?:v(\d+))?$").unwrap());
|
| 160 |
static SEASON_RE: Lazy<Regex> = Lazy::new(|| {
|
| 161 |
Regex::new(r"(?i)^(?:S\d{1,2}|Season\s*\d{1,2}|第[一二三四五六七八九十\d]+[季期部])$").unwrap()
|
| 162 |
});
|
|
|
|
| 1359 |
Some((pieces, labels))
|
| 1360 |
}
|
| 1361 |
|
| 1362 |
+
fn split_episode_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
|
| 1363 |
+
let caps = EPISODE_VALUE_RE.captures(token)?;
|
| 1364 |
+
let mut pieces = vec![caps[1].to_string(), caps[2].to_string()];
|
| 1365 |
+
let mut labels = vec!["O".to_string(), "B-EPISODE".to_string()];
|
| 1366 |
+
if let Some(version) = caps.get(3) {
|
| 1367 |
+
pieces.push("v".to_string());
|
| 1368 |
+
pieces.push(version.as_str().to_string());
|
| 1369 |
+
labels.push("O".to_string());
|
| 1370 |
+
labels.push("O".to_string());
|
| 1371 |
+
}
|
| 1372 |
+
Some((pieces, labels))
|
| 1373 |
+
}
|
| 1374 |
+
|
| 1375 |
fn split_season_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
|
| 1376 |
let caps = SEASON_VALUE_RE.captures(token)?;
|
| 1377 |
Some((
|
|
|
|
| 1506 |
continue;
|
| 1507 |
}
|
| 1508 |
}
|
| 1509 |
+
if roles[index].starts_with("EPISODE")
|
| 1510 |
+
&& index >= 2
|
| 1511 |
+
&& output[..index].iter().any(|role| role == "TITLE")
|
| 1512 |
+
&& group_text(tokens, &groups[index])
|
| 1513 |
+
.chars()
|
| 1514 |
+
.all(|ch| ch.is_ascii_digit())
|
| 1515 |
+
{
|
| 1516 |
+
let next_episode_word = index + 2 < roles.len()
|
| 1517 |
+
&& groups[index + 1].class_name == "SEP"
|
| 1518 |
+
&& group_text(tokens, &groups[index + 2]).eq_ignore_ascii_case("episode");
|
| 1519 |
+
if next_episode_word {
|
| 1520 |
+
let mut run = Vec::new();
|
| 1521 |
+
let mut cursor = index + 2;
|
| 1522 |
+
while cursor < roles.len() {
|
| 1523 |
+
if groups[cursor].class_name == "SEP" {
|
| 1524 |
+
cursor += 1;
|
| 1525 |
+
continue;
|
| 1526 |
+
}
|
| 1527 |
+
if groups[cursor].class_name == "TEXT" && !roles[cursor].starts_with("EPISODE")
|
| 1528 |
+
{
|
| 1529 |
+
run.push(cursor);
|
| 1530 |
+
cursor += 1;
|
| 1531 |
+
continue;
|
| 1532 |
+
}
|
| 1533 |
+
break;
|
| 1534 |
+
}
|
| 1535 |
+
let later_episode = roles[cursor..]
|
| 1536 |
+
.iter()
|
| 1537 |
+
.any(|role| role.starts_with("EPISODE"));
|
| 1538 |
+
if run.len() >= 2 && later_episode {
|
| 1539 |
+
output[index] = "TITLE".to_string();
|
| 1540 |
+
for item in run {
|
| 1541 |
+
output[item] = "TITLE".to_string();
|
| 1542 |
+
}
|
| 1543 |
+
continue;
|
| 1544 |
+
}
|
| 1545 |
+
}
|
| 1546 |
+
}
|
| 1547 |
if roles[index] == "TITLE" && is_special_title_phrase(&text) {
|
| 1548 |
output[index] = "SPECIAL".to_string();
|
| 1549 |
continue;
|
|
|
|
| 1779 |
output_labels.extend(labels);
|
| 1780 |
continue;
|
| 1781 |
}
|
| 1782 |
+
if let Some((pieces, labels)) = split_episode_token(&strip_wrapper(token)) {
|
| 1783 |
+
output_tokens.extend(pieces);
|
| 1784 |
+
output_labels.extend(labels);
|
| 1785 |
+
continue;
|
| 1786 |
+
}
|
| 1787 |
}
|
| 1788 |
for piece in split_refined_token(token) {
|
| 1789 |
if matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE") {
|
|
|
|
| 1792 |
output_labels.extend(labels);
|
| 1793 |
continue;
|
| 1794 |
}
|
| 1795 |
+
if let Some((pieces, labels)) = split_episode_token(&piece) {
|
| 1796 |
+
output_tokens.extend(pieces);
|
| 1797 |
+
output_labels.extend(labels);
|
| 1798 |
+
continue;
|
| 1799 |
+
}
|
| 1800 |
}
|
| 1801 |
let label = label_for_refined_piece(&piece, role, &group.class_name);
|
| 1802 |
let (pieces, labels) = normalize_generated_tokens(&[piece], &[label]);
|
|
|
|
| 1942 |
("02".to_string(), "B-EPISODE".to_string())
|
| 1943 |
]
|
| 1944 |
);
|
| 1945 |
+
let ep_prefix = labels_for("Toradora! EP01 [BD 1080p]");
|
| 1946 |
+
assert!(ep_prefix.contains(&("EP".to_string(), "O".to_string())));
|
| 1947 |
+
assert!(ep_prefix.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 1948 |
let bracket_sxe = labels_for("[FLsnow.feat.PO][Himitsu_no_Aipri][1080P][S2E01]");
|
| 1949 |
assert!(bracket_sxe.contains(&("2".to_string(), "B-SEASON".to_string())));
|
| 1950 |
assert!(bracket_sxe.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
|
|
|
| 1957 |
assert!(!beyblade.contains(&("X".to_string(), "B-SEASON".to_string())));
|
| 1958 |
let bang_title = labels_for("[Dymy][Gugure! Kokkuri-san][06][BIG5][1280X720]");
|
| 1959 |
assert!(bang_title.contains(&("!".to_string(), "B-TITLE".to_string())));
|
| 1960 |
+
let pso2 = labels_for("[Lilith-Raws] Phantasy Star Online 2 Episode Oracle - 01 [1080p]");
|
| 1961 |
+
assert!(pso2.contains(&("2".to_string(), "B-TITLE".to_string())));
|
| 1962 |
+
assert!(pso2.contains(&("Episode".to_string(), "B-TITLE".to_string())));
|
| 1963 |
+
assert!(pso2.contains(&("Oracle".to_string(), "B-TITLE".to_string())));
|
| 1964 |
+
assert!(pso2.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 1965 |
|
| 1966 |
let conan_time = labels_for("【蓝色狂想】名侦探柯南TV版第036集-周一晚上7点半杀人事件");
|
| 1967 |
assert!(conan_time.contains(&("036".to_string(), "B-EPISODE".to_string())));
|
|
|
|
| 1989 |
);
|
| 1990 |
let woody = labels_for(&trimmed);
|
| 1991 |
assert!(woody.contains(&("4".to_string(), "B-SEASON".to_string())));
|
| 1992 |
+
assert!(woody.contains(&("E".to_string(), "O".to_string())));
|
| 1993 |
+
assert!(woody.contains(&("07".to_string(), "B-EPISODE".to_string())));
|
| 1994 |
assert!(woody.contains(&("The".to_string(), "B-TITLE".to_string())));
|
| 1995 |
assert!(woody.contains(&("Show".to_string(), "B-TITLE".to_string())));
|
| 1996 |
assert!(!woody.contains(&("1999".to_string(), "B-EPISODE".to_string())));
|