Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")

# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
- Notebooks
- Google Colab
- Kaggle
Add low-frequency DMHY audit gate
Browse files
tools/rust_dmhy_template_apply/README.md
CHANGED
|
@@ -31,6 +31,18 @@ cargo run --release --manifest-path tools\rust_dmhy_template_apply\Cargo.toml --
|
|
| 31 |
--manifest-output reports\dmhy_weak.template_generated.top5000.rust.manifest.json
|
| 32 |
```
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
Optional controls:
|
| 35 |
|
| 36 |
```powershell
|
|
@@ -47,3 +59,10 @@ The output is intended to match `tools/apply_dmhy_template_recipes.py` at the
|
|
| 47 |
record schema level: `filename`, `tokens`, `labels`, `template_id`, `template`,
|
| 48 |
plus optional `source_filename`, `path_trimmed`, and
|
| 49 |
`dropped_title_candidate_positions`.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
--manifest-output reports\dmhy_weak.template_generated.top5000.rust.manifest.json
|
| 32 |
```
|
| 33 |
|
| 34 |
+
Audit low-frequency recipe output from the repository root:
|
| 35 |
+
|
| 36 |
+
```powershell
|
| 37 |
+
cargo run --release --manifest-path tools\rust_dmhy_template_apply\Cargo.toml -- `
|
| 38 |
+
--audit-low-frequency `
|
| 39 |
+
--input datasets\AnimeName\dmhy_list.jsonl `
|
| 40 |
+
--recipes reports\dmhy_template_recipes.full_top5000.seed.jsonl `
|
| 41 |
+
--audit-output reports\dmhy_low_frequency_audit.rust.jsonl `
|
| 42 |
+
--audit-max-count 50 `
|
| 43 |
+
--threads 24
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
Optional controls:
|
| 47 |
|
| 48 |
```powershell
|
|
|
|
| 59 |
record schema level: `filename`, `tokens`, `labels`, `template_id`, `template`,
|
| 60 |
plus optional `source_filename`, `path_trimmed`, and
|
| 61 |
`dropped_title_candidate_positions`.
|
| 62 |
+
|
| 63 |
+
For low-frequency templates (`count <= --audit-max-count`, default `50`), the apply step
|
| 64 |
+
uses a conservative gate: records with `no_title`, `multiple_title_spans`,
|
| 65 |
+
`path_retained`, or `hash_labeled` audit warnings are skipped from the training
|
| 66 |
+
JSONL and left in the audit/review files. This keeps common templates stable
|
| 67 |
+
while preventing rare ambiguous path/title cases from polluting the generated
|
| 68 |
+
dataset.
|
tools/rust_dmhy_template_apply/src/main.rs
CHANGED
|
@@ -17,6 +17,8 @@ use std::sync::atomic::{AtomicUsize, Ordering};
|
|
| 17 |
struct Args {
|
| 18 |
#[arg(long)]
|
| 19 |
cluster: bool,
|
|
|
|
|
|
|
| 20 |
#[arg(long, default_value = "datasets/AnimeName/dmhy_list.jsonl")]
|
| 21 |
input: PathBuf,
|
| 22 |
#[arg(long, default_value = "reports/dmhy_template_recipes.seed.jsonl")]
|
|
@@ -47,6 +49,10 @@ struct Args {
|
|
| 47 |
recipes_output: PathBuf,
|
| 48 |
#[arg(long, default_value = "reports/dmhy_template_review.rust.jsonl")]
|
| 49 |
review_output: PathBuf,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
#[arg(long)]
|
| 51 |
limit: Option<usize>,
|
| 52 |
#[arg(long)]
|
|
@@ -115,6 +121,7 @@ struct Stats {
|
|
| 115 |
skipped_no_recipe: usize,
|
| 116 |
skipped_sample_cap: usize,
|
| 117 |
skipped_role_mismatch: usize,
|
|
|
|
| 118 |
written: usize,
|
| 119 |
}
|
| 120 |
|
|
@@ -164,7 +171,7 @@ static CJK_SEASON_TOKEN_RE: Lazy<Regex> =
|
|
| 164 |
Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
|
| 165 |
static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
|
| 166 |
static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
|
| 167 |
-
Regex::new(r"(?i)^(?:NCOP|NCED|OP|ED|PV|CM|SP|OVA|OAD|IV|Menu|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?$").unwrap()
|
| 168 |
});
|
| 169 |
static VOLUME_RE: Lazy<Regex> =
|
| 170 |
Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
|
|
@@ -187,6 +194,8 @@ static PATH_SEGMENT_SEASON_RE: Lazy<Regex> = Lazy::new(|| {
|
|
| 187 |
});
|
| 188 |
static SEASON_WORD_NUMBER_RE: Lazy<Regex> =
|
| 189 |
Lazy::new(|| Regex::new(r"(?i)(?:season|saison)\s*0?(\d{1,2})").unwrap());
|
|
|
|
|
|
|
| 190 |
static S_NUMBER_SEGMENT_RE: Lazy<Regex> =
|
| 191 |
Lazy::new(|| Regex::new(r"(?i)(?:^|[^\p{L}\p{N}])s0?(\d{1,2})(?:$|[^\p{L}\p{N}])").unwrap());
|
| 192 |
static SXE_SEASON_RE: Lazy<Regex> = Lazy::new(|| {
|
|
@@ -225,6 +234,9 @@ fn main() -> Result<()> {
|
|
| 225 |
if args.cluster {
|
| 226 |
return run_cluster(&args);
|
| 227 |
}
|
|
|
|
|
|
|
|
|
|
| 228 |
if args.expand != "all" && args.expand != "sample" {
|
| 229 |
bail!("--expand must be all or sample");
|
| 230 |
}
|
|
@@ -293,6 +305,9 @@ fn main() -> Result<()> {
|
|
| 293 |
"no_recipe" => stats.skipped_no_recipe += 1,
|
| 294 |
"sample_cap" => stats.skipped_sample_cap += 1,
|
| 295 |
"role_mismatch" => stats.skipped_role_mismatch += 1,
|
|
|
|
|
|
|
|
|
|
| 296 |
_ => {}
|
| 297 |
}
|
| 298 |
}
|
|
@@ -312,6 +327,13 @@ fn main() -> Result<()> {
|
|
| 312 |
"selected_templates": recipes.len(),
|
| 313 |
"confidence": args.confidence,
|
| 314 |
"min_count": args.min_count,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
"expand": args.expand,
|
| 316 |
"sample_per_template": if args.expand == "sample" { Some(args.sample_per_template) } else { None },
|
| 317 |
"stats": stats,
|
|
@@ -603,6 +625,156 @@ fn write_jsonl_values(path: &PathBuf, rows: &[Value]) -> Result<()> {
|
|
| 603 |
Ok(())
|
| 604 |
}
|
| 605 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 606 |
fn process_filename(
|
| 607 |
original: &str,
|
| 608 |
args: &Args,
|
|
@@ -654,6 +826,13 @@ fn process_filename(
|
|
| 654 |
}
|
| 655 |
}
|
| 656 |
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 657 |
if trimmed_parent {
|
| 658 |
record.source_filename = Some(original.to_string());
|
| 659 |
record.path_trimmed = Some(true);
|
|
@@ -668,6 +847,15 @@ fn process_filename(
|
|
| 668 |
}
|
| 669 |
}
|
| 670 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 671 |
fn tokenize(value: &str) -> Vec<String> {
|
| 672 |
let mut output = Vec::new();
|
| 673 |
let mut index = 0;
|
|
@@ -1007,7 +1195,14 @@ fn suggested_roles(template: &str) -> Vec<String> {
|
|
| 1007 |
roles[*index] = "TITLE".to_string();
|
| 1008 |
}
|
| 1009 |
} else if bracket_text.len() == 1 {
|
| 1010 |
-
roles[bracket_text[0]] = if text.is_empty() {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1011 |
}
|
| 1012 |
for index in text {
|
| 1013 |
roles[index] = "TITLE".to_string();
|
|
@@ -1052,6 +1247,9 @@ fn training_filename_for(original: &str) -> (String, bool) {
|
|
| 1052 |
.collect();
|
| 1053 |
if parts.len() >= 2 && filename_has_title(parts[parts.len() - 1]) {
|
| 1054 |
if parts.len() >= 3 && path_segment_has_season(parts[parts.len() - 2]) {
|
|
|
|
|
|
|
|
|
|
| 1055 |
let parent_seasons = path_segment_seasons(parts[parts.len() - 2]);
|
| 1056 |
let leaf_seasons = path_segment_seasons(parts[parts.len() - 1]);
|
| 1057 |
if parent_seasons
|
|
@@ -1070,6 +1268,11 @@ fn training_filename_for(original: &str) -> (String, bool) {
|
|
| 1070 |
}
|
| 1071 |
}
|
| 1072 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1073 |
fn path_segment_has_season(value: &str) -> bool {
|
| 1074 |
PATH_SEGMENT_SEASON_RE.is_match(value)
|
| 1075 |
}
|
|
@@ -1150,7 +1353,7 @@ fn role_label(role: &str) -> String {
|
|
| 1150 |
"SEASON" => Some("SEASON"),
|
| 1151 |
"SPECIAL" | "VOLUME" => Some("SPECIAL"),
|
| 1152 |
"RESOLUTION" => Some("RESOLUTION"),
|
| 1153 |
-
"SOURCE"
|
| 1154 |
_ => None,
|
| 1155 |
};
|
| 1156 |
entity.map_or_else(|| "O".to_string(), |entity| format!("B-{entity}"))
|
|
@@ -1311,7 +1514,10 @@ fn label_for_refined_piece(piece: &str, role: &str, token_class: &str) -> String
|
|
| 1311 |
if atom_class == "RESOLUTION" {
|
| 1312 |
return "B-RESOLUTION".to_string();
|
| 1313 |
}
|
| 1314 |
-
if
|
|
|
|
|
|
|
|
|
|
| 1315 |
return "B-SOURCE".to_string();
|
| 1316 |
}
|
| 1317 |
if matches!(atom_class.as_str(), "SPECIAL" | "VOLUME") {
|
|
@@ -1489,6 +1695,19 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 1489 |
"vol" | "volume"
|
| 1490 |
)
|
| 1491 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1492 |
output[index - 2] = "SPECIAL".to_string();
|
| 1493 |
output[index] = "SPECIAL".to_string();
|
| 1494 |
continue;
|
|
@@ -1548,6 +1767,27 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 1548 |
output[index] = "SPECIAL".to_string();
|
| 1549 |
continue;
|
| 1550 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1551 |
if roles[index] == "TITLE"
|
| 1552 |
&& matches!(text.to_ascii_lowercase().as_str(), "season" | "saison")
|
| 1553 |
&& index + 2 < roles.len()
|
|
@@ -1616,19 +1856,26 @@ fn title_candidates(groups: &[Group], roles: &[String]) -> Vec<(usize, usize)> {
|
|
| 1616 |
index += 1;
|
| 1617 |
continue;
|
| 1618 |
}
|
| 1619 |
-
if groups[index].class_name == "BRACKET_TEXT" {
|
| 1620 |
-
candidates.push((index, index + 1));
|
| 1621 |
-
index += 1;
|
| 1622 |
-
continue;
|
| 1623 |
-
}
|
| 1624 |
let start = index;
|
| 1625 |
index += 1;
|
| 1626 |
-
|
| 1627 |
-
|
| 1628 |
-
|
| 1629 |
-
|
| 1630 |
-
|
| 1631 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1632 |
}
|
| 1633 |
candidates.push((start, index));
|
| 1634 |
}
|
|
@@ -1838,10 +2085,15 @@ fn project_refined_tokens(
|
|
| 1838 |
|
| 1839 |
fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
| 1840 |
let joiners = [
|
| 1841 |
-
" ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!",
|
|
|
|
|
|
|
| 1842 |
];
|
|
|
|
| 1843 |
let entity_joiners = [
|
| 1844 |
-
" ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "
|
|
|
|
|
|
|
| 1845 |
];
|
| 1846 |
let mut output = labels.to_vec();
|
| 1847 |
for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
|
|
@@ -1869,6 +2121,12 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
|
| 1869 |
output[index] = left_label.clone();
|
| 1870 |
}
|
| 1871 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1872 |
}
|
| 1873 |
output
|
| 1874 |
}
|
|
@@ -1962,6 +2220,32 @@ mod tests {
|
|
| 1962 |
assert!(pso2.contains(&("Episode".to_string(), "B-TITLE".to_string())));
|
| 1963 |
assert!(pso2.contains(&("Oracle".to_string(), "B-TITLE".to_string())));
|
| 1964 |
assert!(pso2.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1965 |
|
| 1966 |
let conan_time = labels_for("【蓝色狂想】名侦探柯南TV版第036集-周一晚上7点半杀人事件");
|
| 1967 |
assert!(conan_time.contains(&("036".to_string(), "B-EPISODE".to_string())));
|
|
@@ -1987,6 +2271,13 @@ mod tests {
|
|
| 1987 |
trimmed,
|
| 1988 |
"Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p"
|
| 1989 |
);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1990 |
let woody = labels_for(&trimmed);
|
| 1991 |
assert!(woody.contains(&("4".to_string(), "B-SEASON".to_string())));
|
| 1992 |
assert!(woody.contains(&("E".to_string(), "O".to_string())));
|
|
|
|
| 17 |
struct Args {
|
| 18 |
#[arg(long)]
|
| 19 |
cluster: bool,
|
| 20 |
+
#[arg(long)]
|
| 21 |
+
audit_low_frequency: bool,
|
| 22 |
#[arg(long, default_value = "datasets/AnimeName/dmhy_list.jsonl")]
|
| 23 |
input: PathBuf,
|
| 24 |
#[arg(long, default_value = "reports/dmhy_template_recipes.seed.jsonl")]
|
|
|
|
| 49 |
recipes_output: PathBuf,
|
| 50 |
#[arg(long, default_value = "reports/dmhy_template_review.rust.jsonl")]
|
| 51 |
review_output: PathBuf,
|
| 52 |
+
#[arg(long, default_value = "reports/dmhy_low_frequency_audit.rust.jsonl")]
|
| 53 |
+
audit_output: PathBuf,
|
| 54 |
+
#[arg(long, default_value_t = 50)]
|
| 55 |
+
audit_max_count: u64,
|
| 56 |
#[arg(long)]
|
| 57 |
limit: Option<usize>,
|
| 58 |
#[arg(long)]
|
|
|
|
| 121 |
skipped_no_recipe: usize,
|
| 122 |
skipped_sample_cap: usize,
|
| 123 |
skipped_role_mismatch: usize,
|
| 124 |
+
skipped_low_frequency_audit_warning: usize,
|
| 125 |
written: usize,
|
| 126 |
}
|
| 127 |
|
|
|
|
| 171 |
Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
|
| 172 |
static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
|
| 173 |
static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
|
| 174 |
+
Regex::new(r"(?i)^(?:NCOP|NCED|OP|ED|PV|CM|SP|OVA|OAD|IV|Menu|Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?$").unwrap()
|
| 175 |
});
|
| 176 |
static VOLUME_RE: Lazy<Regex> =
|
| 177 |
Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
|
|
|
|
| 194 |
});
|
| 195 |
static SEASON_WORD_NUMBER_RE: Lazy<Regex> =
|
| 196 |
Lazy::new(|| Regex::new(r"(?i)(?:season|saison)\s*0?(\d{1,2})").unwrap());
|
| 197 |
+
static PLAIN_SEASON_SEGMENT_RE: Lazy<Regex> =
|
| 198 |
+
Lazy::new(|| Regex::new(r"(?i)^(?:season|saison)\s*0?\d{1,2}$|^s0?\d{1,2}$").unwrap());
|
| 199 |
static S_NUMBER_SEGMENT_RE: Lazy<Regex> =
|
| 200 |
Lazy::new(|| Regex::new(r"(?i)(?:^|[^\p{L}\p{N}])s0?(\d{1,2})(?:$|[^\p{L}\p{N}])").unwrap());
|
| 201 |
static SXE_SEASON_RE: Lazy<Regex> = Lazy::new(|| {
|
|
|
|
| 234 |
if args.cluster {
|
| 235 |
return run_cluster(&args);
|
| 236 |
}
|
| 237 |
+
if args.audit_low_frequency {
|
| 238 |
+
return run_low_frequency_audit(&args);
|
| 239 |
+
}
|
| 240 |
if args.expand != "all" && args.expand != "sample" {
|
| 241 |
bail!("--expand must be all or sample");
|
| 242 |
}
|
|
|
|
| 305 |
"no_recipe" => stats.skipped_no_recipe += 1,
|
| 306 |
"sample_cap" => stats.skipped_sample_cap += 1,
|
| 307 |
"role_mismatch" => stats.skipped_role_mismatch += 1,
|
| 308 |
+
"low_frequency_audit_warning" => {
|
| 309 |
+
stats.skipped_low_frequency_audit_warning += 1
|
| 310 |
+
}
|
| 311 |
_ => {}
|
| 312 |
}
|
| 313 |
}
|
|
|
|
| 327 |
"selected_templates": recipes.len(),
|
| 328 |
"confidence": args.confidence,
|
| 329 |
"min_count": args.min_count,
|
| 330 |
+
"low_frequency_audit_max_count": args.audit_max_count,
|
| 331 |
+
"low_frequency_blocking_warnings": [
|
| 332 |
+
"hash_labeled",
|
| 333 |
+
"multiple_title_spans",
|
| 334 |
+
"no_title",
|
| 335 |
+
"path_retained"
|
| 336 |
+
],
|
| 337 |
"expand": args.expand,
|
| 338 |
"sample_per_template": if args.expand == "sample" { Some(args.sample_per_template) } else { None },
|
| 339 |
"stats": stats,
|
|
|
|
| 625 |
Ok(())
|
| 626 |
}
|
| 627 |
|
| 628 |
+
fn run_low_frequency_audit(args: &Args) -> Result<()> {
|
| 629 |
+
let recipes = load_recipes(args)?;
|
| 630 |
+
let inputs = load_input(&args.input, args.limit)?;
|
| 631 |
+
let low_template_total = recipes
|
| 632 |
+
.values()
|
| 633 |
+
.filter(|recipe| recipe.count.unwrap_or(0) <= args.audit_max_count)
|
| 634 |
+
.count();
|
| 635 |
+
let mut seen_templates = HashSet::new();
|
| 636 |
+
let mut rows = Vec::new();
|
| 637 |
+
|
| 638 |
+
for original in inputs {
|
| 639 |
+
if !args.keep_encoding_noise
|
| 640 |
+
&& (has_encoding_noise(&original)
|
| 641 |
+
|| has_non_anime_noise(&original)
|
| 642 |
+
|| has_abstract_path_noise(&original))
|
| 643 |
+
{
|
| 644 |
+
continue;
|
| 645 |
+
}
|
| 646 |
+
let (training_filename, trimmed_parent) = training_filename_for(&original);
|
| 647 |
+
let (key, _tokens, _classes, groups) = template_key_for_filename(&training_filename);
|
| 648 |
+
let Some(recipe) = recipes.get(&key) else {
|
| 649 |
+
continue;
|
| 650 |
+
};
|
| 651 |
+
let count = recipe.count.unwrap_or(0);
|
| 652 |
+
if count > args.audit_max_count || !seen_templates.insert(recipe.template_id.clone()) {
|
| 653 |
+
continue;
|
| 654 |
+
}
|
| 655 |
+
if recipe.roles.len() != groups.len() {
|
| 656 |
+
continue;
|
| 657 |
+
}
|
| 658 |
+
let Some(mut record) = dmhy_record(&training_filename, &recipe.template_id, &recipe.roles)
|
| 659 |
+
else {
|
| 660 |
+
continue;
|
| 661 |
+
};
|
| 662 |
+
if trimmed_parent {
|
| 663 |
+
record.source_filename = Some(original.clone());
|
| 664 |
+
record.path_trimmed = Some(true);
|
| 665 |
+
}
|
| 666 |
+
rows.push(json!({
|
| 667 |
+
"template_id": recipe.template_id,
|
| 668 |
+
"count": count,
|
| 669 |
+
"template": recipe.template,
|
| 670 |
+
"filename": record.filename,
|
| 671 |
+
"source_filename": record.source_filename,
|
| 672 |
+
"path_trimmed": record.path_trimmed.unwrap_or(false),
|
| 673 |
+
"spans": entity_spans(&record.tokens, &record.labels),
|
| 674 |
+
"warnings": audit_warnings(&record),
|
| 675 |
+
"tokens": record.tokens,
|
| 676 |
+
"labels": record.labels,
|
| 677 |
+
}));
|
| 678 |
+
if seen_templates.len() >= low_template_total {
|
| 679 |
+
break;
|
| 680 |
+
}
|
| 681 |
+
}
|
| 682 |
+
|
| 683 |
+
rows.sort_by(|a, b| {
|
| 684 |
+
let count_a = a.get("count").and_then(Value::as_u64).unwrap_or(0);
|
| 685 |
+
let count_b = b.get("count").and_then(Value::as_u64).unwrap_or(0);
|
| 686 |
+
let id_a = a.get("template_id").and_then(Value::as_str).unwrap_or("");
|
| 687 |
+
let id_b = b.get("template_id").and_then(Value::as_str).unwrap_or("");
|
| 688 |
+
count_a.cmp(&count_b).then_with(|| id_a.cmp(id_b))
|
| 689 |
+
});
|
| 690 |
+
write_jsonl_values(&args.audit_output, &rows)?;
|
| 691 |
+
let warning_counts = warning_counts(&rows);
|
| 692 |
+
let manifest = json!({
|
| 693 |
+
"generated_at": Utc::now().to_rfc3339(),
|
| 694 |
+
"input": args.input.to_string_lossy(),
|
| 695 |
+
"recipes": args.recipes.to_string_lossy(),
|
| 696 |
+
"audit_output": args.audit_output.to_string_lossy(),
|
| 697 |
+
"audit_max_count": args.audit_max_count,
|
| 698 |
+
"low_template_total": low_template_total,
|
| 699 |
+
"audited_templates": rows.len(),
|
| 700 |
+
"warning_counts": warning_counts,
|
| 701 |
+
"implementation": "rust_dmhy_low_frequency_audit"
|
| 702 |
+
});
|
| 703 |
+
println!("{}", serde_json::to_string_pretty(&manifest)?);
|
| 704 |
+
Ok(())
|
| 705 |
+
}
|
| 706 |
+
|
| 707 |
+
fn entity_spans(tokens: &[String], labels: &[String]) -> Vec<Value> {
|
| 708 |
+
let mut spans = Vec::new();
|
| 709 |
+
let mut current_label: Option<String> = None;
|
| 710 |
+
let mut current_text = String::new();
|
| 711 |
+
for (token, label) in tokens.iter().zip(labels.iter()) {
|
| 712 |
+
let entity = label
|
| 713 |
+
.strip_prefix("B-")
|
| 714 |
+
.or_else(|| label.strip_prefix("I-"))
|
| 715 |
+
.unwrap_or("O");
|
| 716 |
+
if current_label.as_deref() == Some(entity) {
|
| 717 |
+
current_text.push_str(token);
|
| 718 |
+
continue;
|
| 719 |
+
}
|
| 720 |
+
if let Some(label) = current_label.take() {
|
| 721 |
+
if label != "O" {
|
| 722 |
+
spans.push(json!({ "label": label, "text": current_text }));
|
| 723 |
+
}
|
| 724 |
+
}
|
| 725 |
+
current_label = Some(entity.to_string());
|
| 726 |
+
current_text = token.clone();
|
| 727 |
+
}
|
| 728 |
+
if let Some(label) = current_label {
|
| 729 |
+
if label != "O" {
|
| 730 |
+
spans.push(json!({ "label": label, "text": current_text }));
|
| 731 |
+
}
|
| 732 |
+
}
|
| 733 |
+
spans
|
| 734 |
+
}
|
| 735 |
+
|
| 736 |
+
fn audit_warnings(record: &Record) -> Vec<String> {
|
| 737 |
+
let mut warnings = Vec::new();
|
| 738 |
+
let title_spans = entity_spans(&record.tokens, &record.labels)
|
| 739 |
+
.into_iter()
|
| 740 |
+
.filter(|span| span.get("label").and_then(Value::as_str) == Some("TITLE"))
|
| 741 |
+
.count();
|
| 742 |
+
if title_spans == 0 {
|
| 743 |
+
warnings.push("no_title".to_string());
|
| 744 |
+
} else if title_spans > 1 {
|
| 745 |
+
warnings.push("multiple_title_spans".to_string());
|
| 746 |
+
}
|
| 747 |
+
if !record.labels.iter().any(|label| label.ends_with("EPISODE")) {
|
| 748 |
+
warnings.push("no_episode".to_string());
|
| 749 |
+
}
|
| 750 |
+
if record.filename.contains('/') || record.filename.contains('\\') {
|
| 751 |
+
warnings.push("path_retained".to_string());
|
| 752 |
+
}
|
| 753 |
+
for (index, token) in record.tokens.iter().enumerate() {
|
| 754 |
+
if HASH_RE.is_match(token) && record.labels.get(index).is_some_and(|label| label != "O") {
|
| 755 |
+
warnings.push("hash_labeled".to_string());
|
| 756 |
+
break;
|
| 757 |
+
}
|
| 758 |
+
}
|
| 759 |
+
warnings.sort();
|
| 760 |
+
warnings.dedup();
|
| 761 |
+
warnings
|
| 762 |
+
}
|
| 763 |
+
|
| 764 |
+
fn warning_counts(rows: &[Value]) -> HashMap<String, usize> {
|
| 765 |
+
let mut counts = HashMap::new();
|
| 766 |
+
for row in rows {
|
| 767 |
+
if let Some(warnings) = row.get("warnings").and_then(Value::as_array) {
|
| 768 |
+
for warning in warnings {
|
| 769 |
+
if let Some(warning) = warning.as_str() {
|
| 770 |
+
*counts.entry(warning.to_string()).or_default() += 1;
|
| 771 |
+
}
|
| 772 |
+
}
|
| 773 |
+
}
|
| 774 |
+
}
|
| 775 |
+
counts
|
| 776 |
+
}
|
| 777 |
+
|
| 778 |
fn process_filename(
|
| 779 |
original: &str,
|
| 780 |
args: &Args,
|
|
|
|
| 826 |
}
|
| 827 |
}
|
| 828 |
};
|
| 829 |
+
if recipe.count.unwrap_or(0) <= args.audit_max_count && has_blocking_low_frequency_warning(&record)
|
| 830 |
+
{
|
| 831 |
+
return Processed::Skipped {
|
| 832 |
+
reason: "low_frequency_audit_warning",
|
| 833 |
+
trimmed_parent,
|
| 834 |
+
};
|
| 835 |
+
}
|
| 836 |
if trimmed_parent {
|
| 837 |
record.source_filename = Some(original.to_string());
|
| 838 |
record.path_trimmed = Some(true);
|
|
|
|
| 847 |
}
|
| 848 |
}
|
| 849 |
|
| 850 |
+
fn has_blocking_low_frequency_warning(record: &Record) -> bool {
|
| 851 |
+
audit_warnings(record).iter().any(|warning| {
|
| 852 |
+
matches!(
|
| 853 |
+
warning.as_str(),
|
| 854 |
+
"hash_labeled" | "multiple_title_spans" | "no_title" | "path_retained"
|
| 855 |
+
)
|
| 856 |
+
})
|
| 857 |
+
}
|
| 858 |
+
|
| 859 |
fn tokenize(value: &str) -> Vec<String> {
|
| 860 |
let mut output = Vec::new();
|
| 861 |
let mut index = 0;
|
|
|
|
| 1195 |
roles[*index] = "TITLE".to_string();
|
| 1196 |
}
|
| 1197 |
} else if bracket_text.len() == 1 {
|
| 1198 |
+
roles[bracket_text[0]] = if text.is_empty() {
|
| 1199 |
+
"TITLE"
|
| 1200 |
+
} else if bracket_text[0] == *start {
|
| 1201 |
+
"GROUP"
|
| 1202 |
+
} else {
|
| 1203 |
+
"TITLE"
|
| 1204 |
+
}
|
| 1205 |
+
.to_string();
|
| 1206 |
}
|
| 1207 |
for index in text {
|
| 1208 |
roles[index] = "TITLE".to_string();
|
|
|
|
| 1247 |
.collect();
|
| 1248 |
if parts.len() >= 2 && filename_has_title(parts[parts.len() - 1]) {
|
| 1249 |
if parts.len() >= 3 && path_segment_has_season(parts[parts.len() - 2]) {
|
| 1250 |
+
if !path_segment_is_plain_season(parts[parts.len() - 2]) {
|
| 1251 |
+
return (parts[parts.len() - 1].to_string(), true);
|
| 1252 |
+
}
|
| 1253 |
let parent_seasons = path_segment_seasons(parts[parts.len() - 2]);
|
| 1254 |
let leaf_seasons = path_segment_seasons(parts[parts.len() - 1]);
|
| 1255 |
if parent_seasons
|
|
|
|
| 1268 |
}
|
| 1269 |
}
|
| 1270 |
|
| 1271 |
+
fn path_segment_is_plain_season(segment: &str) -> bool {
|
| 1272 |
+
let cleaned = strip_wrapper(segment).trim().to_string();
|
| 1273 |
+
PLAIN_SEASON_SEGMENT_RE.is_match(&cleaned)
|
| 1274 |
+
}
|
| 1275 |
+
|
| 1276 |
fn path_segment_has_season(value: &str) -> bool {
|
| 1277 |
PATH_SEGMENT_SEASON_RE.is_match(value)
|
| 1278 |
}
|
|
|
|
| 1353 |
"SEASON" => Some("SEASON"),
|
| 1354 |
"SPECIAL" | "VOLUME" => Some("SPECIAL"),
|
| 1355 |
"RESOLUTION" => Some("RESOLUTION"),
|
| 1356 |
+
"SOURCE" => Some("SOURCE"),
|
| 1357 |
_ => None,
|
| 1358 |
};
|
| 1359 |
entity.map_or_else(|| "O".to_string(), |entity| format!("B-{entity}"))
|
|
|
|
| 1514 |
if atom_class == "RESOLUTION" {
|
| 1515 |
return "B-RESOLUTION".to_string();
|
| 1516 |
}
|
| 1517 |
+
if atom_class == "HASH" {
|
| 1518 |
+
return "O".to_string();
|
| 1519 |
+
}
|
| 1520 |
+
if matches!(atom_class.as_str(), "MEDIA" | "LANG") {
|
| 1521 |
return "B-SOURCE".to_string();
|
| 1522 |
}
|
| 1523 |
if matches!(atom_class.as_str(), "SPECIAL" | "VOLUME") {
|
|
|
|
| 1695 |
"vol" | "volume"
|
| 1696 |
)
|
| 1697 |
{
|
| 1698 |
+
let next_text_before_episode = (index + 1..roles.len())
|
| 1699 |
+
.find(|&cursor| groups[cursor].class_name != "SEP")
|
| 1700 |
+
.is_some_and(|cursor| {
|
| 1701 |
+
groups[cursor].class_name == "TEXT"
|
| 1702 |
+
&& roles[cursor + 1..]
|
| 1703 |
+
.iter()
|
| 1704 |
+
.any(|role| role.starts_with("EPISODE"))
|
| 1705 |
+
});
|
| 1706 |
+
if next_text_before_episode {
|
| 1707 |
+
output[index - 2] = "TITLE".to_string();
|
| 1708 |
+
output[index] = "TITLE".to_string();
|
| 1709 |
+
continue;
|
| 1710 |
+
}
|
| 1711 |
output[index - 2] = "SPECIAL".to_string();
|
| 1712 |
output[index] = "SPECIAL".to_string();
|
| 1713 |
continue;
|
|
|
|
| 1767 |
output[index] = "SPECIAL".to_string();
|
| 1768 |
continue;
|
| 1769 |
}
|
| 1770 |
+
if roles[index] == "TITLE" && matches!(text.as_str(), "第" | "話" | "话" | "回" | "集")
|
| 1771 |
+
{
|
| 1772 |
+
output[index] = "O".to_string();
|
| 1773 |
+
continue;
|
| 1774 |
+
}
|
| 1775 |
+
if output[index] == "O"
|
| 1776 |
+
&& groups[index].class_name == "TEXT"
|
| 1777 |
+
&& roles[index + 1..].iter().any(|role| role.starts_with("EPISODE"))
|
| 1778 |
+
&& text.chars().any(|ch| ch.is_alphabetic())
|
| 1779 |
+
&& !ep_markers.contains(&text.as_str())
|
| 1780 |
+
{
|
| 1781 |
+
if let Some(last_title) = output[..index].iter().rposition(|role| role == "TITLE") {
|
| 1782 |
+
let episode_since_title = output[last_title + 1..index]
|
| 1783 |
+
.iter()
|
| 1784 |
+
.any(|role| role.starts_with("EPISODE"));
|
| 1785 |
+
if !episode_since_title {
|
| 1786 |
+
output[index] = "TITLE".to_string();
|
| 1787 |
+
continue;
|
| 1788 |
+
}
|
| 1789 |
+
}
|
| 1790 |
+
}
|
| 1791 |
if roles[index] == "TITLE"
|
| 1792 |
&& matches!(text.to_ascii_lowercase().as_str(), "season" | "saison")
|
| 1793 |
&& index + 2 < roles.len()
|
|
|
|
| 1856 |
index += 1;
|
| 1857 |
continue;
|
| 1858 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1859 |
let start = index;
|
| 1860 |
index += 1;
|
| 1861 |
+
loop {
|
| 1862 |
+
if index < roles.len()
|
| 1863 |
+
&& roles[index] == "TITLE"
|
| 1864 |
+
&& !(groups[index - 1].class_name == "BRACKET_TEXT"
|
| 1865 |
+
&& groups[index].class_name == "BRACKET_TEXT")
|
| 1866 |
+
{
|
| 1867 |
+
index += 1;
|
| 1868 |
+
continue;
|
| 1869 |
+
}
|
| 1870 |
+
if index + 1 < roles.len()
|
| 1871 |
+
&& roles[index] == "O"
|
| 1872 |
+
&& groups[index].class_name == "SEP"
|
| 1873 |
+
&& roles[index + 1] == "TITLE"
|
| 1874 |
+
{
|
| 1875 |
+
index += 2;
|
| 1876 |
+
continue;
|
| 1877 |
+
}
|
| 1878 |
+
break;
|
| 1879 |
}
|
| 1880 |
candidates.push((start, index));
|
| 1881 |
}
|
|
|
|
| 2085 |
|
| 2086 |
fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
| 2087 |
let joiners = [
|
| 2088 |
+
" ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?",
|
| 2089 |
+
"?", ";", ";", ",", ",", "~", "~", "-", "(", ")", "(", ")", "[", "]", "【",
|
| 2090 |
+
"】", "「", "」", "「", "」", "☆", "@",
|
| 2091 |
];
|
| 2092 |
+
let title_terminal_punctuation = ["!", "!", "?", "?"];
|
| 2093 |
let entity_joiners = [
|
| 2094 |
+
" ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?",
|
| 2095 |
+
"?", ";", ";", ",", ",", "~", "~", "-", "(", ")", "(", ")", "[", "]", "【",
|
| 2096 |
+
"】", "「", "」", "「", "」", "☆", "@", "&", "&",
|
| 2097 |
];
|
| 2098 |
let mut output = labels.to_vec();
|
| 2099 |
for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
|
|
|
|
| 2121 |
output[index] = left_label.clone();
|
| 2122 |
}
|
| 2123 |
}
|
| 2124 |
+
if title_terminal_punctuation.contains(&token.as_str()) && index > 0 {
|
| 2125 |
+
let left_label = &output[index - 1];
|
| 2126 |
+
if left_label == "B-TITLE" {
|
| 2127 |
+
output[index] = "B-TITLE".to_string();
|
| 2128 |
+
}
|
| 2129 |
+
}
|
| 2130 |
}
|
| 2131 |
output
|
| 2132 |
}
|
|
|
|
| 2220 |
assert!(pso2.contains(&("Episode".to_string(), "B-TITLE".to_string())));
|
| 2221 |
assert!(pso2.contains(&("Oracle".to_string(), "B-TITLE".to_string())));
|
| 2222 |
assert!(pso2.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 2223 |
+
let aikatsu = labels_for("Aikatsu Friends! - S2E01 (BD 1920x1080 x264 FLAC)");
|
| 2224 |
+
assert!(aikatsu.contains(&("!".to_string(), "B-TITLE".to_string())));
|
| 2225 |
+
let intro = labels_for("[VCB-Studio] LoveLive! µ's Live Collection [01][intro][1080p]");
|
| 2226 |
+
assert!(intro.contains(&("intro".to_string(), "B-SPECIAL".to_string())));
|
| 2227 |
+
let hash = labels_for("[Group][Title][01][1080p][00270AC8]");
|
| 2228 |
+
assert!(hash.contains(&("00270AC8".to_string(), "O".to_string())));
|
| 2229 |
+
let yamato = labels_for("[1995.01] YAMATO2520 Vol.1 明日への希望-0001");
|
| 2230 |
+
assert!(yamato.contains(&("YAMATO2520".to_string(), "B-TITLE".to_string())));
|
| 2231 |
+
assert!(yamato.contains(&("明日への希望".to_string(), "B-TITLE".to_string())));
|
| 2232 |
+
let ubw = labels_for("Fate/stay night [Unlimited Blade Works] #00 「プロローグ」");
|
| 2233 |
+
assert!(ubw.contains(&("Unlimited".to_string(), "B-TITLE".to_string())));
|
| 2234 |
+
assert!(!ubw.contains(&("Unlimited".to_string(), "B-GROUP".to_string())));
|
| 2235 |
+
let alias_title = labels_for("[Koten_Gars] Tegami Bachi; Letter Bee - 01 [1080p]");
|
| 2236 |
+
assert!(alias_title.contains(&(";".to_string(), "B-TITLE".to_string())));
|
| 2237 |
+
let comma_title =
|
| 2238 |
+
labels_for("[Studio GreenTea] Kamiina Botan, Yoeru Sugata wa Yuri no Hana [01]");
|
| 2239 |
+
assert!(comma_title.contains(&(",".to_string(), "B-TITLE".to_string())));
|
| 2240 |
+
let happy_lesson = labels_for("【DVD】 HAPPY☆LESSON THE TV 第01話");
|
| 2241 |
+
assert!(happy_lesson.contains(&("☆".to_string(), "B-TITLE".to_string())));
|
| 2242 |
+
let idolmaster = labels_for("[CASO&SumiSora][THE_IDOLM@STER_CINDERELLA_GIRLS][07.5_SP]");
|
| 2243 |
+
assert!(idolmaster.contains(&("@".to_string(), "B-TITLE".to_string())));
|
| 2244 |
+
let soul_taker = labels_for("[AI-Raws] THE SOUL TAKER~魂狩~ #01 (HEVC 1312x720)");
|
| 2245 |
+
assert!(soul_taker.contains(&("~".to_string(), "B-TITLE".to_string())));
|
| 2246 |
+
let mayoi = labels_for("[Snow-Raws] 迷家[マヨイガ] 第01話");
|
| 2247 |
+
assert!(mayoi.contains(&("迷家".to_string(), "B-TITLE".to_string())));
|
| 2248 |
+
assert!(mayoi.contains(&("マヨイガ".to_string(), "B-TITLE".to_string())));
|
| 2249 |
|
| 2250 |
let conan_time = labels_for("【蓝色狂想】名侦探柯南TV版第036集-周一晚上7点半杀人事件");
|
| 2251 |
assert!(conan_time.contains(&("036".to_string(), "B-EPISODE".to_string())));
|
|
|
|
| 2271 |
trimmed,
|
| 2272 |
"Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p"
|
| 2273 |
);
|
| 2274 |
+
let pokemon = "Pokémon Season 2 - Orange League [Ep. 83-118]/Pokemon - 084 - OL002 - A Scare in the Air [DVD][PM-Dragon-x264-AC3][00270AC8]";
|
| 2275 |
+
let (trimmed_pokemon, pokemon_was_trimmed) = training_filename_for(pokemon);
|
| 2276 |
+
assert!(pokemon_was_trimmed);
|
| 2277 |
+
assert_eq!(
|
| 2278 |
+
trimmed_pokemon,
|
| 2279 |
+
"Pokemon - 084 - OL002 - A Scare in the Air [DVD][PM-Dragon-x264-AC3][00270AC8]"
|
| 2280 |
+
);
|
| 2281 |
let woody = labels_for(&trimmed);
|
| 2282 |
assert!(woody.contains(&("4".to_string(), "B-SEASON".to_string())));
|
| 2283 |
assert!(woody.contains(&("E".to_string(), "O".to_string())));
|