Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")

# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
- Notebooks
- Google Colab
- Kaggle
Fix Rust encoded cache label repairs
Browse files
tools/encoded_dataset_cache/Cargo.lock
CHANGED
|
@@ -17,6 +17,7 @@ version = "0.1.0"
|
|
| 17 |
dependencies = [
|
| 18 |
"anyhow",
|
| 19 |
"clap",
|
|
|
|
| 20 |
"rand",
|
| 21 |
"rayon",
|
| 22 |
"regex",
|
|
@@ -80,6 +81,21 @@ version = "1.0.102"
|
|
| 80 |
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 81 |
checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
|
| 82 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
[[package]]
|
| 84 |
name = "cfg-if"
|
| 85 |
version = "1.0.4"
|
|
@@ -163,6 +179,17 @@ version = "1.16.0"
|
|
| 163 |
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 164 |
checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e"
|
| 165 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
[[package]]
|
| 167 |
name = "getrandom"
|
| 168 |
version = "0.2.17"
|
|
|
|
| 17 |
dependencies = [
|
| 18 |
"anyhow",
|
| 19 |
"clap",
|
| 20 |
+
"fancy-regex",
|
| 21 |
"rand",
|
| 22 |
"rayon",
|
| 23 |
"regex",
|
|
|
|
| 81 |
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 82 |
checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
|
| 83 |
|
| 84 |
+
[[package]]
|
| 85 |
+
name = "bit-set"
|
| 86 |
+
version = "0.8.0"
|
| 87 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 88 |
+
checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
|
| 89 |
+
dependencies = [
|
| 90 |
+
"bit-vec",
|
| 91 |
+
]
|
| 92 |
+
|
| 93 |
+
[[package]]
|
| 94 |
+
name = "bit-vec"
|
| 95 |
+
version = "0.8.0"
|
| 96 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 97 |
+
checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
|
| 98 |
+
|
| 99 |
[[package]]
|
| 100 |
name = "cfg-if"
|
| 101 |
version = "1.0.4"
|
|
|
|
| 179 |
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 180 |
checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e"
|
| 181 |
|
| 182 |
+
[[package]]
|
| 183 |
+
name = "fancy-regex"
|
| 184 |
+
version = "0.18.0"
|
| 185 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
| 186 |
+
checksum = "e1e1dacd0d2082dfcf1351c4bdd566bbe89a2b263235a2b50058f1e130a47277"
|
| 187 |
+
dependencies = [
|
| 188 |
+
"bit-set",
|
| 189 |
+
"regex-automata",
|
| 190 |
+
"regex-syntax",
|
| 191 |
+
]
|
| 192 |
+
|
| 193 |
[[package]]
|
| 194 |
name = "getrandom"
|
| 195 |
version = "0.2.17"
|
tools/encoded_dataset_cache/Cargo.toml
CHANGED
|
@@ -8,6 +8,7 @@ anyhow = "1.0"
|
|
| 8 |
clap = { version = "4.5", features = ["derive"] }
|
| 9 |
rand = "0.8"
|
| 10 |
rayon = "1.10"
|
|
|
|
| 11 |
regex = "1.11"
|
| 12 |
serde = { version = "1.0", features = ["derive"] }
|
| 13 |
serde_json = "1.0"
|
|
|
|
| 8 |
clap = { version = "4.5", features = ["derive"] }
|
| 9 |
rand = "0.8"
|
| 10 |
rayon = "1.10"
|
| 11 |
+
fancy-regex = "0.18"
|
| 12 |
regex = "1.11"
|
| 13 |
serde = { version = "1.0", features = ["derive"] }
|
| 14 |
serde_json = "1.0"
|
tools/encoded_dataset_cache/src/bin/regex_benchmark.rs
ADDED
|
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
use anyhow::{ensure, Context, Result};
|
| 2 |
+
use clap::Parser;
|
| 3 |
+
use fancy_regex::Regex as FancyRegex;
|
| 4 |
+
use regex::Regex;
|
| 5 |
+
use serde_json::Value;
|
| 6 |
+
use std::fs::File;
|
| 7 |
+
use std::hint::black_box;
|
| 8 |
+
use std::io::{BufRead, BufReader};
|
| 9 |
+
use std::path::PathBuf;
|
| 10 |
+
use std::time::Instant;
|
| 11 |
+
|
| 12 |
+
// Alternation of tokens treated as source metadata. It is spliced into larger patterns, so it
// carries no flags or anchors of its own.
//
// Ordering matters: the regex engines used here prefer the first alternative that matches, so a
// token must be listed before any shorter token that is its prefix (`JPSC`/`JPTC` before `JPN?`).
// Otherwise the unanchored search matches only the prefix (`JP`) and the ASCII boundary check
// that follows rejects it, so the longer token is never counted.
const SOURCE_TOKEN_PATTERN: &str = r"WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X|x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|AAC|FLAC|MP3|DTS|Opus|CHS|CHT|GB|BIG5|JPSC|JPTC|JPN?|繁中|简中";
|
| 13 |
+
const RESOLUTION_BODY: &str = r"\d{3,4}[pP]|\d[Kk]|\d{3,4}[xX×]\d{3,4}";
|
| 14 |
+
const SPECIAL_TAG_PATTERN: &str =
|
| 15 |
+
r"(?i)^(?:檢索|检索|搜索|搜寻|搜尋|别名|別名|alias|search|keyword)\s*[::].+";
|
| 16 |
+
const SPECIAL_CODE_PATTERN: &str =
|
| 17 |
+
r"(?i)^(?:NCOP|NCED|OP|ED|PV|CM)\d*$|^IV\d+$|^(?:OVA|OAD|SP)\d*$";
|
| 18 |
+
const EPISODE_CONTEXT_PATTERN: &str = r"(?i)^\s*(?:[-_]\s*(?:\d{1,4}|NCOP|NCED|OP|ED|OVA|OAD|SP|END)\b|#\s*\d{1,4}|[\[\(【《]\s*(?:EP?|#)?\d{1,4})";
|
| 19 |
+
const EPISODE_SPAN_PATTERN: &str = r"(?i)(?:[Ss]\d{1,2}[Ee]\d{1,4}(?:v\d+)?|(?:^|[\s._])[-_]\s*\d{1,4}(?:v\d+)?(?=$|[\s._\-\]\)】》\[])|[\[\(【《](?:EP?|#)?\d{1,4}(?:v\d+)?[\]\)】》]|(?:^|[\s._\-\[\(【《#])(?:EP?|第|#)\d{1,4}(?:v\d+)?(?:[话話集])?(?=$|[\s._\-\]\)】》]))";
|
| 20 |
+
const READING_MARKER_PATTERN: &str = r"(?i)(?<![A-Za-z0-9])(?P<marker>Ni\s+no\s+(?:Sara|Shou|Sho|Syo|Shō)|San\s+no\s+(?:Sara|Shou|Sho|Syo)|(?:Yon|Shi|Shin)\s+no\s+Sara|(?:Go|Gou)\s+no\s+Sara|Ni\s+Gakki|Sono\s+Ni)(?![A-Za-z0-9])";
|
| 21 |
+
const ROMAN_MARKER_PATTERN: &str =
|
| 22 |
+
r"(?<![A-Za-z0-9])(?P<marker>II|III|IV|V|VI|VII|VIII|IX|[ⅡⅢⅣⅤⅥⅦⅧⅨ])(?![A-Za-z0-9])";
|
| 23 |
+
const CJK_MARKER_PATTERN: &str = r"(?:[一二三四五六七八九十兩两貳贰弐弍參叁参肆伍陸陆柒捌玖](?:\s*(?:ノ|の|之)\s*(?:章|期|季|部))?|第[一二三四五六七八九十兩两貳贰弐弍參叁参肆伍陸陆柒捌玖\d]+[季期部章])";
|
| 24 |
+
const SPECIAL_CONTEXT_PREFIX_PATTERN: &str =
|
| 25 |
+
r"(?i)^(?:[\[\(【《]\s*(?:menu|menus|bdmenu|ncop|nced|op|ed|ova|oad|sp)\s*[\]\)】》]\s*){0,2}";
|
| 26 |
+
|
| 27 |
+
#[derive(Parser, Debug)]
|
| 28 |
+
#[command(
|
| 29 |
+
about = "Compare regex vs fancy-regex workload costs for AniFileBERT cache preprocessing"
|
| 30 |
+
)]
|
| 31 |
+
struct Args {
|
| 32 |
+
#[arg(long)]
|
| 33 |
+
input: PathBuf,
|
| 34 |
+
|
| 35 |
+
#[arg(long, default_value_t = 0)]
|
| 36 |
+
limit_rows: usize,
|
| 37 |
+
|
| 38 |
+
#[arg(long, default_value_t = 3)]
|
| 39 |
+
repeat: usize,
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
fn main() -> Result<()> {
|
| 43 |
+
let args = Args::parse();
|
| 44 |
+
ensure!(args.repeat > 0, "--repeat must be greater than 0");
|
| 45 |
+
|
| 46 |
+
let filenames = load_filenames(&args.input, args.limit_rows)?;
|
| 47 |
+
if filenames.is_empty() {
|
| 48 |
+
anyhow::bail!("no filenames loaded from {}", args.input.display());
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
let selective = SelectivePatterns::new()?;
|
| 52 |
+
let fancy_all = FancyAllPatterns::new()?;
|
| 53 |
+
|
| 54 |
+
let (selective_seconds, selective_count) =
|
| 55 |
+
time_repeated(args.repeat, || run_selective(&filenames, &selective))?;
|
| 56 |
+
let (fancy_seconds, fancy_count) =
|
| 57 |
+
time_repeated(args.repeat, || run_fancy_all(&filenames, &fancy_all))?;
|
| 58 |
+
ensure!(
|
| 59 |
+
selective_count == fancy_count,
|
| 60 |
+
"selective and fancy-all match counts differ: selective={}, fancy_all={}",
|
| 61 |
+
selective_count,
|
| 62 |
+
fancy_count
|
| 63 |
+
);
|
| 64 |
+
|
| 65 |
+
let ratio = if selective_seconds > 0.0 {
|
| 66 |
+
fancy_seconds / selective_seconds
|
| 67 |
+
} else {
|
| 68 |
+
0.0
|
| 69 |
+
};
|
| 70 |
+
println!(
|
| 71 |
+
"{}",
|
| 72 |
+
serde_json::json!({
|
| 73 |
+
"rows": filenames.len(),
|
| 74 |
+
"repeat": args.repeat,
|
| 75 |
+
"selective_seconds": selective_seconds,
|
| 76 |
+
"fancy_all_seconds": fancy_seconds,
|
| 77 |
+
"ratio": ratio,
|
| 78 |
+
"match_count": selective_count,
|
| 79 |
+
})
|
| 80 |
+
);
|
| 81 |
+
Ok(())
|
| 82 |
+
}
|
| 83 |
+
|
| 84 |
+
fn time_repeated<F>(repeat: usize, mut run_once: F) -> Result<(f64, usize)>
|
| 85 |
+
where
|
| 86 |
+
F: FnMut() -> Result<usize>,
|
| 87 |
+
{
|
| 88 |
+
let started = Instant::now();
|
| 89 |
+
let mut count = 0usize;
|
| 90 |
+
for _ in 0..repeat {
|
| 91 |
+
count = count.wrapping_add(black_box(run_once()?));
|
| 92 |
+
}
|
| 93 |
+
Ok((started.elapsed().as_secs_f64(), count))
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
struct SelectivePatterns {
|
| 97 |
+
resolution: Regex,
|
| 98 |
+
source: Regex,
|
| 99 |
+
source_tag: Regex,
|
| 100 |
+
special_tag: Regex,
|
| 101 |
+
special_code: Regex,
|
| 102 |
+
episode_context: Regex,
|
| 103 |
+
episode_span: FancyRegex,
|
| 104 |
+
reading_marker: FancyRegex,
|
| 105 |
+
roman_marker: FancyRegex,
|
| 106 |
+
cjk_marker: Regex,
|
| 107 |
+
special_context_prefix: Regex,
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
impl SelectivePatterns {
|
| 111 |
+
fn new() -> Result<Self> {
|
| 112 |
+
Ok(Self {
|
| 113 |
+
resolution: Regex::new(&format!(r"(?i)(?:{RESOLUTION_BODY})"))?,
|
| 114 |
+
source: Regex::new(&format!(r"(?i)(?:{SOURCE_TOKEN_PATTERN})"))?,
|
| 115 |
+
source_tag: Regex::new(&format!(
|
| 116 |
+
r"(?i)^(?:{SOURCE_TOKEN_PATTERN})(?:\s*(?:[&+/,_-]|,\s*)\s*(?:{SOURCE_TOKEN_PATTERN}))*$"
|
| 117 |
+
))?,
|
| 118 |
+
special_tag: Regex::new(SPECIAL_TAG_PATTERN)?,
|
| 119 |
+
special_code: Regex::new(SPECIAL_CODE_PATTERN)?,
|
| 120 |
+
episode_context: Regex::new(EPISODE_CONTEXT_PATTERN)?,
|
| 121 |
+
episode_span: FancyRegex::new(EPISODE_SPAN_PATTERN)?,
|
| 122 |
+
reading_marker: FancyRegex::new(READING_MARKER_PATTERN)?,
|
| 123 |
+
roman_marker: FancyRegex::new(ROMAN_MARKER_PATTERN)?,
|
| 124 |
+
cjk_marker: Regex::new(CJK_MARKER_PATTERN)?,
|
| 125 |
+
special_context_prefix: Regex::new(SPECIAL_CONTEXT_PREFIX_PATTERN)?,
|
| 126 |
+
})
|
| 127 |
+
}
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
struct FancyAllPatterns {
|
| 131 |
+
resolution: FancyRegex,
|
| 132 |
+
source: FancyRegex,
|
| 133 |
+
source_tag: FancyRegex,
|
| 134 |
+
special_tag: FancyRegex,
|
| 135 |
+
special_code: FancyRegex,
|
| 136 |
+
episode_context: FancyRegex,
|
| 137 |
+
episode_span: FancyRegex,
|
| 138 |
+
reading_marker: FancyRegex,
|
| 139 |
+
roman_marker: FancyRegex,
|
| 140 |
+
cjk_marker: FancyRegex,
|
| 141 |
+
special_context_prefix: FancyRegex,
|
| 142 |
+
}
|
| 143 |
+
|
| 144 |
+
impl FancyAllPatterns {
|
| 145 |
+
fn new() -> Result<Self> {
|
| 146 |
+
Ok(Self {
|
| 147 |
+
resolution: FancyRegex::new(&format!(r"(?i)(?:{RESOLUTION_BODY})"))?,
|
| 148 |
+
source: FancyRegex::new(&format!(r"(?i)(?:{SOURCE_TOKEN_PATTERN})"))?,
|
| 149 |
+
source_tag: FancyRegex::new(&format!(
|
| 150 |
+
r"(?i)^(?:{SOURCE_TOKEN_PATTERN})(?:\s*(?:[&+/,_-]|,\s*)\s*(?:{SOURCE_TOKEN_PATTERN}))*$"
|
| 151 |
+
))?,
|
| 152 |
+
special_tag: FancyRegex::new(SPECIAL_TAG_PATTERN)?,
|
| 153 |
+
special_code: FancyRegex::new(SPECIAL_CODE_PATTERN)?,
|
| 154 |
+
episode_context: FancyRegex::new(EPISODE_CONTEXT_PATTERN)?,
|
| 155 |
+
episode_span: FancyRegex::new(EPISODE_SPAN_PATTERN)?,
|
| 156 |
+
reading_marker: FancyRegex::new(READING_MARKER_PATTERN)?,
|
| 157 |
+
roman_marker: FancyRegex::new(ROMAN_MARKER_PATTERN)?,
|
| 158 |
+
cjk_marker: FancyRegex::new(CJK_MARKER_PATTERN)?,
|
| 159 |
+
special_context_prefix: FancyRegex::new(SPECIAL_CONTEXT_PREFIX_PATTERN)?,
|
| 160 |
+
})
|
| 161 |
+
}
|
| 162 |
+
}
|
| 163 |
+
|
| 164 |
+
fn run_selective(filenames: &[String], patterns: &SelectivePatterns) -> Result<usize> {
|
| 165 |
+
let mut count = 0usize;
|
| 166 |
+
for filename in filenames {
|
| 167 |
+
count = count.wrapping_add(
|
| 168 |
+
patterns
|
| 169 |
+
.resolution
|
| 170 |
+
.find_iter(filename)
|
| 171 |
+
.filter(|mat| has_ascii_token_boundaries(filename, mat.start(), mat.end()))
|
| 172 |
+
.count(),
|
| 173 |
+
);
|
| 174 |
+
count = count.wrapping_add(
|
| 175 |
+
patterns
|
| 176 |
+
.source
|
| 177 |
+
.find_iter(filename)
|
| 178 |
+
.filter(|mat| has_ascii_token_boundaries(filename, mat.start(), mat.end()))
|
| 179 |
+
.count(),
|
| 180 |
+
);
|
| 181 |
+
count = count.wrapping_add(patterns.episode_context.is_match(filename) as usize);
|
| 182 |
+
count = count.wrapping_add(patterns.cjk_marker.find_iter(filename).count());
|
| 183 |
+
count = count.wrapping_add(fancy_count(&patterns.episode_span, filename)?);
|
| 184 |
+
count = count.wrapping_add(fancy_count(&patterns.reading_marker, filename)?);
|
| 185 |
+
count = count.wrapping_add(fancy_count(&patterns.roman_marker, filename)?);
|
| 186 |
+
|
| 187 |
+
for inner in bracket_inners(filename) {
|
| 188 |
+
count = count.wrapping_add(patterns.source_tag.is_match(&inner) as usize);
|
| 189 |
+
count = count.wrapping_add(patterns.special_tag.is_match(&inner) as usize);
|
| 190 |
+
count = count.wrapping_add(patterns.special_code.is_match(&inner) as usize);
|
| 191 |
+
count = count.wrapping_add(patterns.special_context_prefix.is_match(&inner) as usize);
|
| 192 |
+
}
|
| 193 |
+
}
|
| 194 |
+
Ok(count)
|
| 195 |
+
}
|
| 196 |
+
|
| 197 |
+
fn run_fancy_all(filenames: &[String], patterns: &FancyAllPatterns) -> Result<usize> {
|
| 198 |
+
let mut count = 0usize;
|
| 199 |
+
for filename in filenames {
|
| 200 |
+
count = count.wrapping_add(fancy_count_with_boundaries(&patterns.resolution, filename)?);
|
| 201 |
+
count = count.wrapping_add(fancy_count_with_boundaries(&patterns.source, filename)?);
|
| 202 |
+
count = count.wrapping_add(patterns.episode_context.is_match(filename)? as usize);
|
| 203 |
+
count = count.wrapping_add(fancy_count(&patterns.cjk_marker, filename)?);
|
| 204 |
+
count = count.wrapping_add(fancy_count(&patterns.episode_span, filename)?);
|
| 205 |
+
count = count.wrapping_add(fancy_count(&patterns.reading_marker, filename)?);
|
| 206 |
+
count = count.wrapping_add(fancy_count(&patterns.roman_marker, filename)?);
|
| 207 |
+
|
| 208 |
+
for inner in bracket_inners(filename) {
|
| 209 |
+
count = count.wrapping_add(patterns.source_tag.is_match(&inner)? as usize);
|
| 210 |
+
count = count.wrapping_add(patterns.special_tag.is_match(&inner)? as usize);
|
| 211 |
+
count = count.wrapping_add(patterns.special_code.is_match(&inner)? as usize);
|
| 212 |
+
count = count.wrapping_add(patterns.special_context_prefix.is_match(&inner)? as usize);
|
| 213 |
+
}
|
| 214 |
+
}
|
| 215 |
+
Ok(count)
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
/// Counts the matches of `regex` in `text`.
///
/// # Errors
///
/// Returns the first error the `fancy-regex` match iterator yields.
fn fancy_count(regex: &FancyRegex, text: &str) -> Result<usize> {
    regex
        .find_iter(text)
        .try_fold(0usize, |seen, found| -> Result<usize> {
            found?;
            Ok(seen + 1)
        })
}
|
| 226 |
+
|
| 227 |
+
fn fancy_count_with_boundaries(regex: &FancyRegex, text: &str) -> Result<usize> {
|
| 228 |
+
let mut count = 0usize;
|
| 229 |
+
for item in regex.find_iter(text) {
|
| 230 |
+
let mat = item?;
|
| 231 |
+
if has_ascii_token_boundaries(text, mat.start(), mat.end()) {
|
| 232 |
+
count += 1;
|
| 233 |
+
}
|
| 234 |
+
}
|
| 235 |
+
Ok(count)
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
fn load_filenames(path: &PathBuf, limit_rows: usize) -> Result<Vec<String>> {
|
| 239 |
+
let file = File::open(path).with_context(|| format!("failed to open {}", path.display()))?;
|
| 240 |
+
let reader = BufReader::new(file);
|
| 241 |
+
let mut filenames = Vec::new();
|
| 242 |
+
for (idx, line) in reader.lines().enumerate() {
|
| 243 |
+
if limit_rows > 0 && filenames.len() >= limit_rows {
|
| 244 |
+
break;
|
| 245 |
+
}
|
| 246 |
+
let raw = line.with_context(|| format!("failed reading line {}", idx + 1))?;
|
| 247 |
+
if raw.trim().is_empty() {
|
| 248 |
+
continue;
|
| 249 |
+
}
|
| 250 |
+
let value: Value = serde_json::from_str(&raw)
|
| 251 |
+
.with_context(|| format!("invalid JSONL line {}", idx + 1))?;
|
| 252 |
+
if let Some(filename) = value.get("filename").and_then(Value::as_str) {
|
| 253 |
+
filenames.push(filename.to_string());
|
| 254 |
+
}
|
| 255 |
+
}
|
| 256 |
+
Ok(filenames)
|
| 257 |
+
}
|
| 258 |
+
|
| 259 |
+
/// Returns the text enclosed by each bracket pair in `text`, scanning left to right.
///
/// Supported pairs are `[]`, `()`, `【】` and `《》`. An opener is paired with the first matching
/// closer after it, and scanning resumes after that closer, so brackets inside an extracted
/// span are not reported separately. An opener with no closer is skipped and scanning continues
/// with the next character.
///
/// The string is sliced directly instead of being copied into a `Vec<char>` first.
fn bracket_inners(text: &str) -> Vec<String> {
    let mut spans = Vec::new();
    let mut rest = text;
    while let Some(ch) = rest.chars().next() {
        let after_open = &rest[ch.len_utf8()..];
        let close = match ch {
            '[' => ']',
            '(' => ')',
            '【' => '】',
            '《' => '》',
            _ => {
                rest = after_open;
                continue;
            }
        };
        match after_open.find(close) {
            Some(inner_len) => {
                spans.push(after_open[..inner_len].to_string());
                rest = &after_open[inner_len + close.len_utf8()..];
            }
            // Unclosed opener: step past it and keep scanning.
            None => rest = after_open,
        }
    }
    spans
}
|
| 284 |
+
|
| 285 |
+
/// Reports whether the byte range `start..end` of `text` is delimited on both sides by
/// something other than an ASCII letter or digit (the start and end of `text` count as
/// delimiters).
///
/// Only the single byte before `start` and the byte at `end` are inspected. Every byte of a
/// multi-byte UTF-8 character is non-ASCII, so this gives the same answer as decoding the
/// neighbouring characters. Unlike string slicing, it cannot panic when an offset is out of
/// range or falls inside a multi-byte character; an out-of-range offset is treated as the
/// edge of the text.
fn has_ascii_token_boundaries(text: &str, start: usize, end: usize) -> bool {
    let bytes = text.as_bytes();
    let previous_ok = start
        .checked_sub(1)
        .and_then(|idx| bytes.get(idx))
        .map_or(true, |byte| !byte.is_ascii_alphanumeric());
    let next_ok = bytes
        .get(end)
        .map_or(true, |byte| !byte.is_ascii_alphanumeric());
    previous_ok && next_ok
}
|
| 298 |
+
|
| 299 |
+
#[cfg(test)]
mod tests {
    use super::*;

    /// Both pattern sets must report the same total over a handful of release names.
    #[test]
    fn selective_and_fancy_all_count_the_same_matches() -> Result<()> {
        let filenames: Vec<String> = [
            "[GM-Team][國漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4",
            "[YYDM&VCB-Studio] Shinsekai Yori II [NCED02][Ma10p_1080p][x265_flac].mkv",
            "Witch.Hat.Atelier.S01E07.1080p.NF.WEB-DL.JPN.AAC2.0.H.264.MSubs-ToonsHub",
            "[Lilith-Raws] San no Sara - 03 [Baha][WEB-DL][1080p][AVC AAC][CHT].mp4",
            "[Test] 搜索: 别名 [OVA01][BDRip][720p]",
        ]
        .iter()
        .map(|name| name.to_string())
        .collect();
        let selective = SelectivePatterns::new()?;
        let fancy_all = FancyAllPatterns::new()?;

        let selective_total = run_selective(&filenames, &selective)?;
        let fancy_total = run_fancy_all(&filenames, &fancy_all)?;
        assert_eq!(selective_total, fancy_total);
        Ok(())
    }

    /// Every supported bracket pair yields its inner text, in order of appearance.
    #[test]
    fn bracket_inners_extract_supported_pairs() {
        let inners = bracket_inners("[A](B)【C】《D》");
        assert_eq!(inners, vec!["A", "B", "C", "D"]);
    }

    /// A span touching an ASCII letter or digit on either side is not a standalone token.
    #[test]
    fn ascii_token_boundaries_reject_embedded_matches() {
        let text = "ABC1080p 1080p HEVC2 HEVC";
        let cases = [
            (3usize, 8usize, false),
            (9, 14, true),
            (15, 19, false),
            (21, 25, true),
        ];
        for (start, end, expected) in cases.iter().copied() {
            assert_eq!(has_ascii_token_boundaries(text, start, end), expected);
        }
    }
}
|
tools/encoded_dataset_cache/src/main.rs
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
use anyhow::{bail, Context, Result};
|
| 2 |
use clap::Parser;
|
|
|
|
| 3 |
use rand::rngs::StdRng;
|
| 4 |
use rand::seq::SliceRandom;
|
| 5 |
use rand::SeedableRng;
|
|
@@ -56,11 +57,19 @@ const FALLBACK_LABELS: [&str; 37] = [
|
|
| 56 |
|
| 57 |
const SOURCE_TOKEN_PATTERN: &str = r"WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X|x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|AAC|FLAC|MP3|DTS|Opus|CHS|CHT|GB|BIG5|JPN?|JPSC|JPTC|繁中|简中";
|
| 58 |
|
| 59 |
-
static RESOLUTION_RE: OnceLock<
|
| 60 |
static SOURCE_RE: OnceLock<Regex> = OnceLock::new();
|
| 61 |
static SOURCE_TAG_RE: OnceLock<Regex> = OnceLock::new();
|
| 62 |
static SPECIAL_TAG_RE: OnceLock<Regex> = OnceLock::new();
|
| 63 |
static SPECIAL_CODE_RE: OnceLock<Regex> = OnceLock::new();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
#[derive(Parser, Debug)]
|
| 66 |
#[command(
|
|
@@ -457,28 +466,26 @@ fn encode_row(row: &SourceRow, context: &EncodeContext) -> Result<(Vec<u16>, Vec
|
|
| 457 |
}
|
| 458 |
|
| 459 |
fn labels_for_char_tokenizer(row: &SourceRow) -> (Vec<String>, Vec<String>) {
|
| 460 |
-
|
| 461 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 462 |
let filename_chars = chars_as_strings(filename);
|
| 463 |
if row.tokens == filename_chars {
|
| 464 |
-
return (row.tokens.clone(),
|
| 465 |
}
|
| 466 |
}
|
| 467 |
-
}
|
| 468 |
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
let (tokens,
|
| 472 |
-
repair_structural_meta_labels(filename, &mut labels);
|
| 473 |
return (tokens, labels);
|
| 474 |
}
|
| 475 |
}
|
| 476 |
|
| 477 |
-
|
| 478 |
-
if let Some(filename) = row.filename.as_deref() {
|
| 479 |
-
repair_structural_meta_labels(filename, &mut labels);
|
| 480 |
-
}
|
| 481 |
-
(tokens, labels)
|
| 482 |
}
|
| 483 |
|
| 484 |
fn project_labels_from_filename(
|
|
@@ -579,11 +586,13 @@ fn align_tokens_to_chars(tokens: &[String], labels: &[String]) -> (Vec<String>,
|
|
| 579 |
(char_tokens, char_labels)
|
| 580 |
}
|
| 581 |
|
| 582 |
-
fn repair_structural_meta_labels(
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
|
|
|
|
|
|
| 587 |
for (inner_start, inner_end) in bracket_inner_spans(text) {
|
| 588 |
let bracket_start = inner_start.saturating_sub(1);
|
| 589 |
if bracket_start < episode_end {
|
|
@@ -595,46 +604,46 @@ fn repair_structural_meta_labels(text: &str, labels: &mut [String]) {
|
|
| 595 |
continue;
|
| 596 |
}
|
| 597 |
let clean = chars_slice_to_string(&inner, trim_start, trim_end);
|
| 598 |
-
let clean_start = inner_start + trim_start;
|
| 599 |
-
let clean_end = inner_start + trim_end;
|
| 600 |
|
| 601 |
if special_tag_re().is_match(&clean) || special_code_re().is_match(&clean) {
|
| 602 |
-
|
|
|
|
| 603 |
continue;
|
| 604 |
}
|
| 605 |
if source_tag_re().is_match(&clean) {
|
| 606 |
-
|
|
|
|
| 607 |
continue;
|
| 608 |
}
|
| 609 |
|
| 610 |
-
for mat in resolution_re()
|
| 611 |
-
|
| 612 |
-
|
| 613 |
-
|
| 614 |
-
let start = inner_start + char_index_at_byte(&
|
| 615 |
-
let end = inner_start + char_index_at_byte(&
|
| 616 |
-
|
|
|
|
| 617 |
}
|
| 618 |
-
for mat in source_re().find_iter(&
|
| 619 |
-
if !has_ascii_token_boundaries(&
|
| 620 |
continue;
|
| 621 |
}
|
| 622 |
-
let start = inner_start + char_index_at_byte(&
|
| 623 |
-
let end = inner_start + char_index_at_byte(&
|
| 624 |
-
|
|
|
|
| 625 |
}
|
| 626 |
}
|
| 627 |
|
| 628 |
-
for mat in resolution_re().find_iter(text) {
|
| 629 |
-
if !has_ascii_token_boundaries(text, mat.start(), mat.end()) {
|
| 630 |
-
continue;
|
| 631 |
-
}
|
| 632 |
let start = char_index_at_byte(text, mat.start());
|
| 633 |
if start < episode_end {
|
| 634 |
continue;
|
| 635 |
}
|
| 636 |
let end = char_index_at_byte(text, mat.end());
|
| 637 |
-
|
|
|
|
| 638 |
}
|
| 639 |
for mat in source_re().find_iter(text) {
|
| 640 |
if !has_ascii_token_boundaries(text, mat.start(), mat.end()) {
|
|
@@ -645,23 +654,461 @@ fn repair_structural_meta_labels(text: &str, labels: &mut [String]) {
|
|
| 645 |
continue;
|
| 646 |
}
|
| 647 |
let end = char_index_at_byte(text, mat.end());
|
| 648 |
-
|
|
|
|
| 649 |
}
|
| 650 |
}
|
| 651 |
|
| 652 |
-
fn
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
|
| 658 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 659 |
}
|
| 660 |
-
return end;
|
| 661 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 662 |
idx += 1;
|
| 663 |
}
|
| 664 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 665 |
}
|
| 666 |
|
| 667 |
fn bracket_inner_spans(text: &str) -> Vec<(usize, usize)> {
|
|
@@ -717,28 +1164,19 @@ fn chars_slice_to_string(text: &str, start: usize, end: usize) -> String {
|
|
| 717 |
.collect()
|
| 718 |
}
|
| 719 |
|
| 720 |
-
fn label_span_if_safe(labels: &mut [String],
|
| 721 |
-
if
|
| 722 |
return;
|
| 723 |
}
|
| 724 |
-
if
|
| 725 |
matches!(
|
| 726 |
-
label_entity(
|
| 727 |
Some("GROUP" | "EPISODE" | "SEASON" | "PATH_SEASON")
|
| 728 |
)
|
| 729 |
}) {
|
| 730 |
return;
|
| 731 |
}
|
| 732 |
-
|
| 733 |
-
let mut first = !previous_same;
|
| 734 |
-
for label in labels.iter_mut().take(end).skip(start) {
|
| 735 |
-
*label = if first {
|
| 736 |
-
format!("B-{entity}")
|
| 737 |
-
} else {
|
| 738 |
-
format!("I-{entity}")
|
| 739 |
-
};
|
| 740 |
-
first = false;
|
| 741 |
-
}
|
| 742 |
}
|
| 743 |
|
| 744 |
fn has_ascii_token_boundaries(text: &str, start: usize, end: usize) -> bool {
|
|
@@ -764,9 +1202,13 @@ fn label_entity(label: &str) -> Option<&str> {
|
|
| 764 |
}
|
| 765 |
}
|
| 766 |
|
| 767 |
-
fn resolution_re() -> &'static
|
| 768 |
-
RESOLUTION_RE
|
| 769 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 770 |
}
|
| 771 |
|
| 772 |
fn source_re() -> &'static Regex {
|
|
@@ -795,6 +1237,60 @@ fn special_code_re() -> &'static Regex {
|
|
| 795 |
})
|
| 796 |
}
|
| 797 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 798 |
fn chars_as_strings(text: &str) -> Vec<String> {
|
| 799 |
text.chars().map(|ch| ch.to_string()).collect()
|
| 800 |
}
|
|
@@ -907,3 +1403,123 @@ fn write_npy_header<W: Write>(writer: &mut W, descr: &str, rows: usize, cols: us
|
|
| 907 |
writer.write_all(&header)?;
|
| 908 |
Ok(())
|
| 909 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
use anyhow::{bail, Context, Result};
|
| 2 |
use clap::Parser;
|
| 3 |
+
use fancy_regex::Regex as FancyRegex;
|
| 4 |
use rand::rngs::StdRng;
|
| 5 |
use rand::seq::SliceRandom;
|
| 6 |
use rand::SeedableRng;
|
|
|
|
| 57 |
|
| 58 |
// Alternation of tokens treated as source metadata. It is spliced into larger patterns, so it
// carries no flags or anchors of its own.
//
// Ordering matters: the regex engine prefers the first alternative that matches, so a token
// must be listed before any shorter token that is its prefix (`JPSC`/`JPTC` before `JPN?`).
// Otherwise the unanchored search matches only the prefix (`JP`) and the ASCII boundary check
// that follows rejects it, so the longer token is never labelled.
const SOURCE_TOKEN_PATTERN: &str = r"WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X|x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|AAC|FLAC|MP3|DTS|Opus|CHS|CHT|GB|BIG5|JPSC|JPTC|JPN?|繁中|简中";
|
| 59 |
|
| 60 |
+
static RESOLUTION_RE: OnceLock<FancyRegex> = OnceLock::new();
|
| 61 |
static SOURCE_RE: OnceLock<Regex> = OnceLock::new();
|
| 62 |
static SOURCE_TAG_RE: OnceLock<Regex> = OnceLock::new();
|
| 63 |
static SPECIAL_TAG_RE: OnceLock<Regex> = OnceLock::new();
|
| 64 |
static SPECIAL_CODE_RE: OnceLock<Regex> = OnceLock::new();
|
| 65 |
+
static EPISODE_CONTEXT_RE: OnceLock<Regex> = OnceLock::new();
|
| 66 |
+
static EPISODE_SPAN_RE: OnceLock<FancyRegex> = OnceLock::new();
|
| 67 |
+
static READING_MARKER_RE: OnceLock<FancyRegex> = OnceLock::new();
|
| 68 |
+
static ROMAN_MARKER_RE: OnceLock<FancyRegex> = OnceLock::new();
|
| 69 |
+
static CJK_MARKER_RE: OnceLock<Regex> = OnceLock::new();
|
| 70 |
+
static SPECIAL_CONTEXT_PREFIX_RE: OnceLock<Regex> = OnceLock::new();
|
| 71 |
+
|
| 72 |
+
const SEPARATOR_CHARS: &[char] = &[' ', '\t', '-', '_', '.', '|', '~', '~'];
|
| 73 |
|
| 74 |
#[derive(Parser, Debug)]
|
| 75 |
#[command(
|
|
|
|
| 466 |
}
|
| 467 |
|
| 468 |
fn labels_for_char_tokenizer(row: &SourceRow) -> (Vec<String>, Vec<String>) {
|
| 469 |
+
let mut source_labels = row.labels.clone();
|
| 470 |
+
|
| 471 |
+
if let Some(filename) = row.filename.as_deref() {
|
| 472 |
+
repair_known_label_issues(filename, &row.tokens, &mut source_labels);
|
| 473 |
+
|
| 474 |
+
if row.tokenizer_variant.as_deref() == Some("char") {
|
| 475 |
let filename_chars = chars_as_strings(filename);
|
| 476 |
if row.tokens == filename_chars {
|
| 477 |
+
return (row.tokens.clone(), source_labels);
|
| 478 |
}
|
| 479 |
}
|
|
|
|
| 480 |
|
| 481 |
+
if let Some(projected) = project_labels_from_filename(filename, &row.tokens, &source_labels)
|
| 482 |
+
{
|
| 483 |
+
let (tokens, labels) = projected;
|
|
|
|
| 484 |
return (tokens, labels);
|
| 485 |
}
|
| 486 |
}
|
| 487 |
|
| 488 |
+
align_tokens_to_chars(&row.tokens, &source_labels)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 489 |
}
|
| 490 |
|
| 491 |
fn project_labels_from_filename(
|
|
|
|
| 586 |
(char_tokens, char_labels)
|
| 587 |
}
|
| 588 |
|
| 589 |
+
fn repair_structural_meta_labels(
|
| 590 |
+
text: &str,
|
| 591 |
+
_tokens: &[String],
|
| 592 |
+
labels: &mut [String],
|
| 593 |
+
offsets: &[(usize, usize)],
|
| 594 |
+
) {
|
| 595 |
+
let episode_end = first_episode_span_end(labels, offsets, text);
|
| 596 |
for (inner_start, inner_end) in bracket_inner_spans(text) {
|
| 597 |
let bracket_start = inner_start.saturating_sub(1);
|
| 598 |
if bracket_start < episode_end {
|
|
|
|
| 604 |
continue;
|
| 605 |
}
|
| 606 |
let clean = chars_slice_to_string(&inner, trim_start, trim_end);
|
|
|
|
|
|
|
| 607 |
|
| 608 |
if special_tag_re().is_match(&clean) || special_code_re().is_match(&clean) {
|
| 609 |
+
let indices = token_indices_for_span(offsets, inner_start, inner_end);
|
| 610 |
+
label_span_if_safe(labels, &indices, "SPECIAL");
|
| 611 |
continue;
|
| 612 |
}
|
| 613 |
if source_tag_re().is_match(&clean) {
|
| 614 |
+
let indices = token_indices_for_span(offsets, inner_start, inner_end);
|
| 615 |
+
label_span_if_safe(labels, &indices, "SOURCE");
|
| 616 |
continue;
|
| 617 |
}
|
| 618 |
|
| 619 |
+
for mat in resolution_re()
|
| 620 |
+
.find_iter(&clean)
|
| 621 |
+
.filter_map(|item| item.ok())
|
| 622 |
+
{
|
| 623 |
+
let start = inner_start + char_index_at_byte(&clean, mat.start());
|
| 624 |
+
let end = inner_start + char_index_at_byte(&clean, mat.end());
|
| 625 |
+
let indices = token_indices_for_span(offsets, start, end);
|
| 626 |
+
label_span_if_safe(labels, &indices, "RESOLUTION");
|
| 627 |
}
|
| 628 |
+
for mat in source_re().find_iter(&clean) {
|
| 629 |
+
if !has_ascii_token_boundaries(&clean, mat.start(), mat.end()) {
|
| 630 |
continue;
|
| 631 |
}
|
| 632 |
+
let start = inner_start + char_index_at_byte(&clean, mat.start());
|
| 633 |
+
let end = inner_start + char_index_at_byte(&clean, mat.end());
|
| 634 |
+
let indices = token_indices_for_span(offsets, start, end);
|
| 635 |
+
label_span_if_safe(labels, &indices, "SOURCE");
|
| 636 |
}
|
| 637 |
}
|
| 638 |
|
| 639 |
+
for mat in resolution_re().find_iter(text).filter_map(|item| item.ok()) {
|
|
|
|
|
|
|
|
|
|
| 640 |
let start = char_index_at_byte(text, mat.start());
|
| 641 |
if start < episode_end {
|
| 642 |
continue;
|
| 643 |
}
|
| 644 |
let end = char_index_at_byte(text, mat.end());
|
| 645 |
+
let indices = token_indices_for_span(offsets, start, end);
|
| 646 |
+
label_span_if_safe(labels, &indices, "RESOLUTION");
|
| 647 |
}
|
| 648 |
for mat in source_re().find_iter(text) {
|
| 649 |
if !has_ascii_token_boundaries(text, mat.start(), mat.end()) {
|
|
|
|
| 654 |
continue;
|
| 655 |
}
|
| 656 |
let end = char_index_at_byte(text, mat.end());
|
| 657 |
+
let indices = token_indices_for_span(offsets, start, end);
|
| 658 |
+
label_span_if_safe(labels, &indices, "SOURCE");
|
| 659 |
}
|
| 660 |
}
|
| 661 |
|
| 662 |
+
fn repair_known_label_issues(text: &str, tokens: &[String], labels: &mut [String]) {
|
| 663 |
+
if tokens.len() != labels.len() {
|
| 664 |
+
return;
|
| 665 |
+
}
|
| 666 |
+
let Some(offsets) = token_offsets_in_text(text, tokens) else {
|
| 667 |
+
return;
|
| 668 |
+
};
|
| 669 |
+
|
| 670 |
+
let quick_text = text.to_lowercase();
|
| 671 |
+
let has_sequel_marker_hint = [
|
| 672 |
+
" II", " III", " IV", " V", " VI", " VII", " VIII", " IX", "Ⅱ", "Ⅲ", "Ⅳ", "Ⅴ", "Ⅵ", "Ⅶ",
|
| 673 |
+
"Ⅷ", "Ⅸ", "之章", "之期", "之季", "之部", "ノ章", "ノ期", "の章", "の期", "貳", "贰", "弐",
|
| 674 |
+
"弍", "參", "叁", "参", "肆", "陸", "陆", "Ni ", " ni ", " no Sara", "Gakki",
|
| 675 |
+
]
|
| 676 |
+
.iter()
|
| 677 |
+
.any(|needle| text.contains(needle) || quick_text.contains(&needle.to_lowercase()));
|
| 678 |
+
|
| 679 |
+
if has_sequel_marker_hint {
|
| 680 |
+
for (start, end) in find_sequel_season_markers(text) {
|
| 681 |
+
if labels_have_season_before(labels, &offsets, start) {
|
| 682 |
+
continue;
|
| 683 |
+
}
|
| 684 |
+
let indices = token_indices_for_span(&offsets, start, end);
|
| 685 |
+
if indices.is_empty() {
|
| 686 |
+
continue;
|
| 687 |
+
}
|
| 688 |
+
if indices.iter().any(|idx| {
|
| 689 |
+
matches!(
|
| 690 |
+
label_entity(&labels[*idx]),
|
| 691 |
+
Some(
|
| 692 |
+
"GROUP"
|
| 693 |
+
| "EPISODE"
|
| 694 |
+
| "RESOLUTION"
|
| 695 |
+
| "SOURCE"
|
| 696 |
+
| "SPECIAL"
|
| 697 |
+
| "TAG"
|
| 698 |
+
| "PATH_SEASON"
|
| 699 |
+
)
|
| 700 |
+
)
|
| 701 |
+
}) {
|
| 702 |
+
continue;
|
| 703 |
+
}
|
| 704 |
+
if !indices.iter().any(|idx| is_title_like_label(&labels[*idx])) {
|
| 705 |
+
continue;
|
| 706 |
+
}
|
| 707 |
+
|
| 708 |
+
label_span_indices(labels, &indices, "SEASON");
|
| 709 |
+
mark_adjacent_title_separators_o(tokens, labels, &indices);
|
| 710 |
+
}
|
| 711 |
+
}
|
| 712 |
+
|
| 713 |
+
repair_structural_meta_labels(text, tokens, labels, &offsets);
|
| 714 |
+
}
|
| 715 |
+
|
| 716 |
+
fn find_sequel_season_markers(text: &str) -> Vec<(usize, usize)> {
|
| 717 |
+
let mut repairs = Vec::new();
|
| 718 |
+
|
| 719 |
+
for mat in reading_marker_re()
|
| 720 |
+
.find_iter(text)
|
| 721 |
+
.filter_map(|item| item.ok())
|
| 722 |
+
{
|
| 723 |
+
let marker = mat.as_str();
|
| 724 |
+
if season_marker_number(marker).is_none() || !has_episode_context(text, mat.end()) {
|
| 725 |
+
continue;
|
| 726 |
+
}
|
| 727 |
+
repairs.push((
|
| 728 |
+
char_index_at_byte(text, mat.start()),
|
| 729 |
+
char_index_at_byte(text, mat.end()),
|
| 730 |
+
));
|
| 731 |
+
}
|
| 732 |
+
|
| 733 |
+
for mat in roman_marker_re()
|
| 734 |
+
.find_iter(text)
|
| 735 |
+
.filter_map(|item| item.ok())
|
| 736 |
+
{
|
| 737 |
+
let marker = mat.as_str();
|
| 738 |
+
if season_marker_number(marker).is_none() || !has_episode_context(text, mat.end()) {
|
| 739 |
+
continue;
|
| 740 |
+
}
|
| 741 |
+
repairs.push((
|
| 742 |
+
char_index_at_byte(text, mat.start()),
|
| 743 |
+
char_index_at_byte(text, mat.end()),
|
| 744 |
+
));
|
| 745 |
+
}
|
| 746 |
+
|
| 747 |
+
for mat in cjk_marker_re().find_iter(text) {
|
| 748 |
+
let marker = mat.as_str();
|
| 749 |
+
if season_marker_number(marker).is_none() || !has_episode_context(text, mat.end()) {
|
| 750 |
+
continue;
|
| 751 |
+
}
|
| 752 |
+
repairs.push((
|
| 753 |
+
char_index_at_byte(text, mat.start()),
|
| 754 |
+
char_index_at_byte(text, mat.end()),
|
| 755 |
+
));
|
| 756 |
+
}
|
| 757 |
+
|
| 758 |
+
for (base, value) in standalone_ni_season_bases() {
|
| 759 |
+
let mut search_start = 0usize;
|
| 760 |
+
while let Some(relative) = text[search_start..].find(base) {
|
| 761 |
+
let base_start = search_start + relative;
|
| 762 |
+
let base_end = base_start + base.len();
|
| 763 |
+
let Some((ni_start, ni_end)) = standalone_ni_after_base(text, base_end) else {
|
| 764 |
+
search_start = base_end;
|
| 765 |
+
continue;
|
| 766 |
+
};
|
| 767 |
+
if *value == 2
|
| 768 |
+
&& has_episode_context(text, ni_end)
|
| 769 |
+
&& has_ascii_token_boundaries(text, ni_start, ni_end)
|
| 770 |
+
{
|
| 771 |
+
repairs.push((
|
| 772 |
+
char_index_at_byte(text, ni_start),
|
| 773 |
+
char_index_at_byte(text, ni_end),
|
| 774 |
+
));
|
| 775 |
+
}
|
| 776 |
+
search_start = base_end;
|
| 777 |
+
}
|
| 778 |
+
}
|
| 779 |
+
|
| 780 |
+
repairs.sort_by_key(|(start, end)| (*start, *end));
|
| 781 |
+
let mut deduped: Vec<(usize, usize)> = Vec::new();
|
| 782 |
+
for repair in repairs {
|
| 783 |
+
if let Some(previous) = deduped.last_mut() {
|
| 784 |
+
if repair.0 < previous.1 {
|
| 785 |
+
if repair.1.saturating_sub(repair.0) > previous.1.saturating_sub(previous.0) {
|
| 786 |
+
*previous = repair;
|
| 787 |
+
}
|
| 788 |
+
continue;
|
| 789 |
+
}
|
| 790 |
+
}
|
| 791 |
+
deduped.push(repair);
|
| 792 |
+
}
|
| 793 |
+
deduped
|
| 794 |
+
}
|
| 795 |
+
|
| 796 |
+
fn season_marker_number(text: &str) -> Option<u8> {
|
| 797 |
+
let clean = clean_marker_text(text);
|
| 798 |
+
if clean.is_empty() {
|
| 799 |
+
return None;
|
| 800 |
+
}
|
| 801 |
+
if let Some(value) = roman_numeral_value(&clean) {
|
| 802 |
+
return Some(value);
|
| 803 |
+
}
|
| 804 |
+
|
| 805 |
+
let lowered = clean
|
| 806 |
+
.split_whitespace()
|
| 807 |
+
.collect::<Vec<_>>()
|
| 808 |
+
.join(" ")
|
| 809 |
+
.to_lowercase();
|
| 810 |
+
if let Some(value) = reading_marker_value(&lowered) {
|
| 811 |
+
return Some(value);
|
| 812 |
+
}
|
| 813 |
+
if lowered == "ni" {
|
| 814 |
+
return Some(2);
|
| 815 |
+
}
|
| 816 |
+
|
| 817 |
+
if clean.starts_with('第') {
|
| 818 |
+
if let Some(last) = clean.chars().last() {
|
| 819 |
+
if matches!(last, '季' | '期' | '部' | '章') {
|
| 820 |
+
let inner = clean
|
| 821 |
+
.chars()
|
| 822 |
+
.skip(1)
|
| 823 |
+
.take(clean.chars().count().saturating_sub(2))
|
| 824 |
+
.collect::<String>();
|
| 825 |
+
return cn_number_to_int(&inner);
|
| 826 |
+
}
|
| 827 |
+
}
|
| 828 |
+
}
|
| 829 |
+
|
| 830 |
+
let cjk_chars = clean.chars().collect::<Vec<_>>();
|
| 831 |
+
if let Some(first) = cjk_chars.first() {
|
| 832 |
+
if let Some(value) = cn_number_to_int(&first.to_string()) {
|
| 833 |
+
let rest = cjk_chars.iter().skip(1).collect::<String>();
|
| 834 |
+
if rest.trim().is_empty() || cjk_marker_suffix_remainder_ok(&rest) {
|
| 835 |
+
return Some(value);
|
| 836 |
}
|
|
|
|
| 837 |
}
|
| 838 |
+
}
|
| 839 |
+
|
| 840 |
+
None
|
| 841 |
+
}
|
| 842 |
+
|
| 843 |
+
/// Strips surrounding whitespace and bracket characters from a marker string.
///
/// Whitespace is trimmed, then any run of bracket characters on either end is
/// removed, then whitespace is trimmed again (so `"[ II ]"` becomes `"II"`).
/// Brackets handled: `[]`, `()`, `【】`, `《》` and the full-width parentheses
/// U+FF08 / U+FF09.
fn clean_marker_text(text: &str) -> String {
    // The full-width parentheses are written as escapes so that tooling which
    // folds full-width characters to ASCII cannot silently turn them into
    // duplicate (unreachable) `'('` / `')'` arms.
    let is_wrapper = |ch: char| {
        matches!(
            ch,
            '[' | ']' | '(' | ')' | '【' | '】' | '《' | '》' | '\u{FF08}' | '\u{FF09}'
        )
    };

    text.trim().trim_matches(is_wrapper).trim().to_string()
}
|
| 854 |
+
|
| 855 |
+
/// Parses a small number written with ASCII digits or Chinese/Japanese numerals.
///
/// Accepted forms (after trimming whitespace):
/// * anything `u8::from_str` accepts, e.g. `"12"`;
/// * a single numeral character known to `cn_digit_value` (1–10);
/// * `十X` (11–19), `X十` (10–90) and `X十Y` (11–99), where `X` and `Y` are
///   numeral characters with a value of 1–9.
///
/// Returns `None` for everything else. Malformed compounds such as `"a十"`,
/// `"十a"` or `"十十"` are rejected rather than being read as if the unknown
/// character were zero.
fn cn_number_to_int(text: &str) -> Option<u8> {
    let text = text.trim();
    if text.is_empty() {
        return None;
    }
    if let Ok(value) = text.parse::<u8>() {
        return Some(value);
    }
    if let Some(value) = cn_digit_value(text) {
        return Some(value);
    }

    // A compound position must hold a real 1–9 digit; `十` itself (10) is not one.
    let unit = |ch: char| cn_digit_value(&ch.to_string()).filter(|value| *value < 10);

    let chars = text.chars().collect::<Vec<_>>();
    match chars.as_slice() {
        ['十', ones] => unit(*ones).map(|ones| 10 + ones),
        [tens, '十'] => unit(*tens).map(|tens| tens * 10),
        [tens, '十', ones] => Some(unit(*tens)? * 10 + unit(*ones)?),
        _ => None,
    }
}

/// Maps a single numeral character (given as a one-character string) to its
/// value, covering the everyday forms and the formal/financial variants for
/// 1–9 plus `十` for 10. Any other string yields `None`.
fn cn_digit_value(text: &str) -> Option<u8> {
    match text {
        "一" => Some(1),
        "二" | "兩" | "两" | "貳" | "贰" | "弐" | "弍" => Some(2),
        "三" | "參" | "叁" | "参" => Some(3),
        "四" | "肆" => Some(4),
        "五" | "伍" => Some(5),
        "六" | "陸" | "陆" => Some(6),
        "七" | "柒" => Some(7),
        "八" | "捌" => Some(8),
        "九" | "玖" => Some(9),
        "十" => Some(10),
        _ => None,
    }
}
|
| 897 |
+
|
| 898 |
+
/// Returns the value (2–9) of an upper-case roman numeral written either with
/// ASCII letters or as a single Unicode numeral glyph. The comparison is exact:
/// lower-case input, `"I"` and anything above nine yield `None`.
fn roman_numeral_value(text: &str) -> Option<u8> {
    // (ASCII spelling, single-glyph spelling, value)
    const NUMERALS: [(&str, &str, u8); 8] = [
        ("II", "Ⅱ", 2),
        ("III", "Ⅲ", 3),
        ("IV", "Ⅳ", 4),
        ("V", "Ⅴ", 5),
        ("VI", "Ⅵ", 6),
        ("VII", "Ⅶ", 7),
        ("VIII", "Ⅷ", 8),
        ("IX", "Ⅸ", 9),
    ];

    NUMERALS
        .iter()
        .find(|(ascii, glyph, _)| text == *ascii || text == *glyph)
        .map(|&(_, _, value)| value)
}
|
| 911 |
+
|
| 912 |
+
/// Maps a romanised season reading to its number.
///
/// The lookup is an exact string comparison against lower-case spellings with
/// single spaces, so callers are expected to normalise the input first.
fn reading_marker_value(text: &str) -> Option<u8> {
    // (season number, accepted spellings)
    const READINGS: &[(u8, &[&str])] = &[
        (
            2,
            &[
                "ni no sara",
                "ni no shou",
                "ni no sho",
                "ni no syo",
                "ni no shō",
                "ni gakki",
                "sono ni",
            ],
        ),
        (3, &["san no sara", "san no shou", "san no sho", "san no syo"]),
        (4, &["yon no sara", "shi no sara", "shin no sara"]),
        (5, &["go no sara", "gou no sara"]),
    ];

    READINGS
        .iter()
        .find(|(_, spellings)| spellings.iter().any(|spelling| *spelling == text))
        .map(|(number, _)| *number)
}
|
| 922 |
+
|
| 923 |
+
/// Checks whether `rest`, ignoring all whitespace, is exactly a possessive
/// particle (`ノ`, `の` or `之`) followed by a unit character (`章`, `期`, `季`
/// or `部`).
fn cjk_marker_suffix_remainder_ok(rest: &str) -> bool {
    let mut glyphs = rest.chars().filter(|ch| !ch.is_whitespace());

    // Exactly two non-whitespace characters are required.
    match (glyphs.next(), glyphs.next(), glyphs.next()) {
        (Some(particle), Some(unit), None) => {
            matches!(particle, 'ノ' | 'の' | '之') && matches!(unit, '章' | '期' | '季' | '部')
        }
        _ => false,
    }
}
|
| 941 |
+
|
| 942 |
+
fn has_episode_context(text: &str, marker_end_byte: usize) -> bool {
|
| 943 |
+
let tail = &text[marker_end_byte..];
|
| 944 |
+
if episode_context_re().is_match(tail) {
|
| 945 |
+
return true;
|
| 946 |
+
}
|
| 947 |
+
|
| 948 |
+
let mut tail = tail.trim_start();
|
| 949 |
+
if let Some(ch) = tail.chars().next() {
|
| 950 |
+
if matches!(ch, ']' | ')' | '】' | '》') {
|
| 951 |
+
tail = &tail[ch.len_utf8()..];
|
| 952 |
+
tail = tail.trim_start();
|
| 953 |
+
}
|
| 954 |
+
}
|
| 955 |
+
if let Some(mat) = special_context_prefix_re().find(tail) {
|
| 956 |
+
tail = &tail[mat.end()..];
|
| 957 |
+
}
|
| 958 |
+
episode_context_re().is_match(tail)
|
| 959 |
+
}
|
| 960 |
+
|
| 961 |
+
fn first_episode_regex_end(text: &str) -> Option<usize> {
|
| 962 |
+
episode_span_re()
|
| 963 |
+
.find_iter(text)
|
| 964 |
+
.filter_map(|item| item.ok())
|
| 965 |
+
.map(|mat| char_index_at_byte(text, mat.end()))
|
| 966 |
+
.next()
|
| 967 |
+
}
|
| 968 |
+
|
| 969 |
+
fn labels_have_season_before(
|
| 970 |
+
labels: &[String],
|
| 971 |
+
offsets: &[(usize, usize)],
|
| 972 |
+
marker_start: usize,
|
| 973 |
+
) -> bool {
|
| 974 |
+
labels
|
| 975 |
+
.iter()
|
| 976 |
+
.zip(offsets.iter())
|
| 977 |
+
.any(|(label, (_start, end))| is_season_like_label(label) && *end <= marker_start)
|
| 978 |
+
}
|
| 979 |
+
|
| 980 |
+
/// Indices of all tokens whose half-open `(start, end)` offset range overlaps
/// the half-open range `start..end`, in ascending order. Tokens that merely
/// touch the range at an endpoint do not count as overlapping.
fn token_indices_for_span(offsets: &[(usize, usize)], start: usize, end: usize) -> Vec<usize> {
    let mut hits = Vec::new();
    for (idx, &(token_start, token_end)) in offsets.iter().enumerate() {
        let disjoint = token_start >= end || token_end <= start;
        if !disjoint {
            hits.push(idx);
        }
    }
    hits
}
|
| 993 |
+
|
| 994 |
+
#[cfg(test)]
|
| 995 |
+
fn label_span(labels: &mut [String], start: usize, end: usize, entity: &str) {
|
| 996 |
+
let previous_same = start > 0 && label_entity(&labels[start - 1]) == Some(entity);
|
| 997 |
+
let mut first = !previous_same;
|
| 998 |
+
for label in labels.iter_mut().take(end).skip(start) {
|
| 999 |
+
*label = if first {
|
| 1000 |
+
format!("B-{entity}")
|
| 1001 |
+
} else {
|
| 1002 |
+
format!("I-{entity}")
|
| 1003 |
+
};
|
| 1004 |
+
first = false;
|
| 1005 |
+
}
|
| 1006 |
+
}
|
| 1007 |
+
|
| 1008 |
+
fn label_span_indices(labels: &mut [String], indices: &[usize], entity: &str) {
|
| 1009 |
+
if indices.is_empty() {
|
| 1010 |
+
return;
|
| 1011 |
+
}
|
| 1012 |
+
let previous_same = indices[0] > 0 && label_entity(&labels[indices[0] - 1]) == Some(entity);
|
| 1013 |
+
let mut first = !previous_same;
|
| 1014 |
+
for idx in indices {
|
| 1015 |
+
labels[*idx] = if first {
|
| 1016 |
+
format!("B-{entity}")
|
| 1017 |
+
} else {
|
| 1018 |
+
format!("I-{entity}")
|
| 1019 |
+
};
|
| 1020 |
+
first = false;
|
| 1021 |
+
}
|
| 1022 |
+
}
|
| 1023 |
+
|
| 1024 |
+
fn mark_adjacent_title_separators_o(
|
| 1025 |
+
tokens: &[String],
|
| 1026 |
+
labels: &mut [String],
|
| 1027 |
+
marker_indices: &[usize],
|
| 1028 |
+
) {
|
| 1029 |
+
if marker_indices.is_empty() {
|
| 1030 |
+
return;
|
| 1031 |
+
}
|
| 1032 |
+
|
| 1033 |
+
let mut idx = marker_indices[0];
|
| 1034 |
+
while idx > 0 {
|
| 1035 |
+
let prev = idx - 1;
|
| 1036 |
+
if !tokens[prev].trim().is_empty() || !is_title_like_label(&labels[prev]) {
|
| 1037 |
+
break;
|
| 1038 |
+
}
|
| 1039 |
+
labels[prev] = "O".to_string();
|
| 1040 |
+
idx = prev;
|
| 1041 |
+
}
|
| 1042 |
+
|
| 1043 |
+
let mut idx = marker_indices[marker_indices.len() - 1] + 1;
|
| 1044 |
+
while idx < tokens.len()
|
| 1045 |
+
&& tokens[idx].chars().all(|ch| SEPARATOR_CHARS.contains(&ch))
|
| 1046 |
+
&& is_title_like_label(&labels[idx])
|
| 1047 |
+
{
|
| 1048 |
+
labels[idx] = "O".to_string();
|
| 1049 |
idx += 1;
|
| 1050 |
}
|
| 1051 |
+
}
|
| 1052 |
+
|
| 1053 |
+
/// Title bases after which a standalone "Ni" is considered as a season marker,
/// each paired with a numeric value that callers compare against.
fn standalone_ni_season_bases() -> &'static [(&'static str, u8)] {
    const BASES: &[(&str, u8)] = &[("Kakuriyo no Yadomeshi", 2)];
    BASES
}
|
| 1056 |
+
|
| 1057 |
+
/// Looks for the literal `"Ni"` right after byte offset `base_end`, allowing
/// any amount of whitespace in between, and returns its byte range.
///
/// Returns `None` when the first non-whitespace text is not exactly `"Ni"`
/// (the match is case-sensitive). Slices `text` at `base_end`, so the offset
/// must lie on a character boundary within `text`.
fn standalone_ni_after_base(text: &str, base_end: usize) -> Option<(usize, usize)> {
    let tail = &text[base_end..];
    let skipped_whitespace = tail.len() - tail.trim_start().len();
    let ni_start = base_end + skipped_whitespace;

    if text[ni_start..].starts_with("Ni") {
        Some((ni_start, ni_start + 2))
    } else {
        None
    }
}
|
| 1072 |
+
|
| 1073 |
+
fn is_title_like_label(label: &str) -> bool {
|
| 1074 |
+
matches!(
|
| 1075 |
+
label_entity(label),
|
| 1076 |
+
Some(
|
| 1077 |
+
"TITLE"
|
| 1078 |
+
| "TITLE_CHS"
|
| 1079 |
+
| "TITLE_CHT"
|
| 1080 |
+
| "TITLE_JPN"
|
| 1081 |
+
| "TITLE_LATIN"
|
| 1082 |
+
| "TITLE_MIXED"
|
| 1083 |
+
| "PATH_TITLE_CHS"
|
| 1084 |
+
| "PATH_TITLE_CHT"
|
| 1085 |
+
| "PATH_TITLE_JPN"
|
| 1086 |
+
| "PATH_TITLE_LATIN"
|
| 1087 |
+
| "PATH_TITLE_MIXED"
|
| 1088 |
+
)
|
| 1089 |
+
)
|
| 1090 |
+
}
|
| 1091 |
+
|
| 1092 |
+
fn is_season_like_label(label: &str) -> bool {
|
| 1093 |
+
matches!(label_entity(label), Some("SEASON" | "PATH_SEASON"))
|
| 1094 |
+
}
|
| 1095 |
+
|
| 1096 |
+
fn first_episode_span_end(labels: &[String], offsets: &[(usize, usize)], text: &str) -> usize {
|
| 1097 |
+
let ends = labels
|
| 1098 |
+
.iter()
|
| 1099 |
+
.zip(offsets.iter())
|
| 1100 |
+
.filter_map(|(label, (_start, end))| {
|
| 1101 |
+
if label_entity(label) == Some("EPISODE") {
|
| 1102 |
+
Some(*end)
|
| 1103 |
+
} else {
|
| 1104 |
+
None
|
| 1105 |
+
}
|
| 1106 |
+
})
|
| 1107 |
+
.collect::<Vec<_>>();
|
| 1108 |
+
if let Some(end) = ends.into_iter().min() {
|
| 1109 |
+
return end;
|
| 1110 |
+
}
|
| 1111 |
+
first_episode_regex_end(text).unwrap_or(0)
|
| 1112 |
}
|
| 1113 |
|
| 1114 |
fn bracket_inner_spans(text: &str) -> Vec<(usize, usize)> {
|
|
|
|
| 1164 |
.collect()
|
| 1165 |
}
|
| 1166 |
|
| 1167 |
+
fn label_span_if_safe(labels: &mut [String], indices: &[usize], entity: &str) {
|
| 1168 |
+
if indices.is_empty() {
|
| 1169 |
return;
|
| 1170 |
}
|
| 1171 |
+
if indices.iter().any(|idx| {
|
| 1172 |
matches!(
|
| 1173 |
+
label_entity(&labels[*idx]),
|
| 1174 |
Some("GROUP" | "EPISODE" | "SEASON" | "PATH_SEASON")
|
| 1175 |
)
|
| 1176 |
}) {
|
| 1177 |
return;
|
| 1178 |
}
|
| 1179 |
+
label_span_indices(labels, indices, entity);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1180 |
}
|
| 1181 |
|
| 1182 |
fn has_ascii_token_boundaries(text: &str, start: usize, end: usize) -> bool {
|
|
|
|
| 1202 |
}
|
| 1203 |
}
|
| 1204 |
|
| 1205 |
+
fn resolution_re() -> &'static FancyRegex {
|
| 1206 |
+
RESOLUTION_RE.get_or_init(|| {
|
| 1207 |
+
FancyRegex::new(
|
| 1208 |
+
r"(?i)(?<![A-Za-z0-9])(?:\d{3,4}[pP]|\d[Kk]|\d{3,4}[xX×]\d{3,4})(?![A-Za-z0-9])",
|
| 1209 |
+
)
|
| 1210 |
+
.unwrap()
|
| 1211 |
+
})
|
| 1212 |
}
|
| 1213 |
|
| 1214 |
fn source_re() -> &'static Regex {
|
|
|
|
| 1237 |
})
|
| 1238 |
}
|
| 1239 |
|
| 1240 |
+
fn episode_context_re() -> &'static Regex {
|
| 1241 |
+
EPISODE_CONTEXT_RE.get_or_init(|| {
|
| 1242 |
+
Regex::new(
|
| 1243 |
+
r"(?i)^\s*(?:[-_]\s*(?:\d{1,4}|NCOP|NCED|OP|ED|OVA|OAD|SP|END)\b|#\s*\d{1,4}|[\[\(【《]\s*(?:EP?|#)?\d{1,4})",
|
| 1244 |
+
)
|
| 1245 |
+
.unwrap()
|
| 1246 |
+
})
|
| 1247 |
+
}
|
| 1248 |
+
|
| 1249 |
+
fn episode_span_re() -> &'static FancyRegex {
|
| 1250 |
+
EPISODE_SPAN_RE.get_or_init(|| {
|
| 1251 |
+
FancyRegex::new(
|
| 1252 |
+
r"(?i)(?:[Ss]\d{1,2}[Ee]\d{1,4}(?:v\d+)?|(?:^|[\s._])[-_]\s*\d{1,4}(?:v\d+)?(?=$|[\s._\-\]\)】》\[])|[\[\(【《](?:EP?|#)?\d{1,4}(?:v\d+)?[\]\)】》]|(?:^|[\s._\-\[\(【《#])(?:EP?|第|#)\d{1,4}(?:v\d+)?(?:[话話集])?(?=$|[\s._\-\]\)】》]))",
|
| 1253 |
+
)
|
| 1254 |
+
.unwrap()
|
| 1255 |
+
})
|
| 1256 |
+
}
|
| 1257 |
+
|
| 1258 |
+
fn reading_marker_re() -> &'static FancyRegex {
|
| 1259 |
+
READING_MARKER_RE.get_or_init(|| {
|
| 1260 |
+
FancyRegex::new(
|
| 1261 |
+
r"(?i)(?<![A-Za-z0-9])(?P<marker>Ni\s+no\s+(?:Sara|Shou|Sho|Syo|Shō)|San\s+no\s+(?:Sara|Shou|Sho|Syo)|(?:Yon|Shi|Shin)\s+no\s+Sara|(?:Go|Gou)\s+no\s+Sara|Ni\s+Gakki|Sono\s+Ni)(?![A-Za-z0-9])",
|
| 1262 |
+
)
|
| 1263 |
+
.unwrap()
|
| 1264 |
+
})
|
| 1265 |
+
}
|
| 1266 |
+
|
| 1267 |
+
fn roman_marker_re() -> &'static FancyRegex {
|
| 1268 |
+
ROMAN_MARKER_RE.get_or_init(|| {
|
| 1269 |
+
FancyRegex::new(
|
| 1270 |
+
r"(?<![A-Za-z0-9])(?P<marker>II|III|IV|V|VI|VII|VIII|IX|[ⅡⅢⅣⅤⅥⅦⅧⅨ])(?![A-Za-z0-9])",
|
| 1271 |
+
)
|
| 1272 |
+
.unwrap()
|
| 1273 |
+
})
|
| 1274 |
+
}
|
| 1275 |
+
|
| 1276 |
+
fn cjk_marker_re() -> &'static Regex {
|
| 1277 |
+
CJK_MARKER_RE.get_or_init(|| {
|
| 1278 |
+
Regex::new(
|
| 1279 |
+
r"(?:[一二三四五六七八九十兩两貳贰弐弍參叁参肆伍陸陆柒捌玖](?:\s*(?:ノ|の|之)\s*(?:章|期|季|部))?|第[一二三四五六七八九十兩两貳贰弐弍參叁参肆伍陸陆柒捌玖\d]+[季期部章])",
|
| 1280 |
+
)
|
| 1281 |
+
.unwrap()
|
| 1282 |
+
})
|
| 1283 |
+
}
|
| 1284 |
+
|
| 1285 |
+
fn special_context_prefix_re() -> &'static Regex {
|
| 1286 |
+
SPECIAL_CONTEXT_PREFIX_RE.get_or_init(|| {
|
| 1287 |
+
Regex::new(
|
| 1288 |
+
r"(?i)^(?:[\[\(【《]\s*(?:menu|menus|bdmenu|ncop|nced|op|ed|ova|oad|sp)\s*[\]\)】》]\s*){0,2}",
|
| 1289 |
+
)
|
| 1290 |
+
.unwrap()
|
| 1291 |
+
})
|
| 1292 |
+
}
|
| 1293 |
+
|
| 1294 |
/// Splits `text` into its Unicode scalar values, each returned as its own
/// one-character `String`, in order.
fn chars_as_strings(text: &str) -> Vec<String> {
    let mut pieces = Vec::new();
    for ch in text.chars() {
        pieces.push(String::from(ch));
    }
    pieces
}
|
|
|
|
| 1403 |
writer.write_all(&header)?;
|
| 1404 |
Ok(())
|
| 1405 |
}
|
| 1406 |
+
|
| 1407 |
+
#[cfg(test)]
|
| 1408 |
+
mod tests {
|
| 1409 |
+
use super::*;
|
| 1410 |
+
|
| 1411 |
+
fn char_row(
|
| 1412 |
+
text: &str,
|
| 1413 |
+
title_spans: &[(usize, usize)],
|
| 1414 |
+
episode_spans: &[(usize, usize)],
|
| 1415 |
+
) -> SourceRow {
|
| 1416 |
+
let tokens = chars_as_strings(text);
|
| 1417 |
+
let mut labels = vec!["O".to_string(); tokens.len()];
|
| 1418 |
+
for (start, end) in title_spans {
|
| 1419 |
+
label_span(&mut labels, *start, *end, "TITLE_LATIN");
|
| 1420 |
+
}
|
| 1421 |
+
for (start, end) in episode_spans {
|
| 1422 |
+
label_span(&mut labels, *start, *end, "EPISODE");
|
| 1423 |
+
}
|
| 1424 |
+
SourceRow {
|
| 1425 |
+
row_index: 0,
|
| 1426 |
+
raw_line: String::new(),
|
| 1427 |
+
filename: Some(text.to_string()),
|
| 1428 |
+
tokens,
|
| 1429 |
+
labels,
|
| 1430 |
+
tokenizer_variant: Some("char".to_string()),
|
| 1431 |
+
}
|
| 1432 |
+
}
|
| 1433 |
+
|
| 1434 |
+
#[test]
|
| 1435 |
+
fn repairs_cjk_sequel_marker_in_char_fast_path() {
|
| 1436 |
+
let text = "妖怪旅館營業中 貳 - 11";
|
| 1437 |
+
let title_end = char_index_at_byte(text, text.find(" - ").unwrap());
|
| 1438 |
+
let episode_start = char_index_at_byte(text, text.find("11").unwrap());
|
| 1439 |
+
let row = char_row(
|
| 1440 |
+
text,
|
| 1441 |
+
&[(0, title_end)],
|
| 1442 |
+
&[(episode_start, episode_start + 2)],
|
| 1443 |
+
);
|
| 1444 |
+
|
| 1445 |
+
let (_tokens, labels) = labels_for_char_tokenizer(&row);
|
| 1446 |
+
let marker = char_index_at_byte(text, text.find('貳').unwrap());
|
| 1447 |
+
let before_marker = marker - 1;
|
| 1448 |
+
|
| 1449 |
+
assert_eq!(labels[before_marker], "O");
|
| 1450 |
+
assert_eq!(labels[marker], "B-SEASON");
|
| 1451 |
+
assert_eq!(labels[episode_start], "B-EPISODE");
|
| 1452 |
+
}
|
| 1453 |
+
|
| 1454 |
+
#[test]
|
| 1455 |
+
fn repairs_reading_sequel_marker() {
|
| 1456 |
+
let text = "Shokugeki no Souma Ni no Sara - 13";
|
| 1457 |
+
let title_end = text.find(" - ").unwrap();
|
| 1458 |
+
let episode_start = text.find("13").unwrap();
|
| 1459 |
+
let row = char_row(
|
| 1460 |
+
text,
|
| 1461 |
+
&[(0, title_end)],
|
| 1462 |
+
&[(episode_start, episode_start + 2)],
|
| 1463 |
+
);
|
| 1464 |
+
|
| 1465 |
+
let (_tokens, labels) = labels_for_char_tokenizer(&row);
|
| 1466 |
+
let marker_start = text.find("Ni").unwrap();
|
| 1467 |
+
let marker_end = text.find(" - ").unwrap();
|
| 1468 |
+
|
| 1469 |
+
assert_eq!(labels[marker_start - 1], "O");
|
| 1470 |
+
assert_eq!(labels[marker_start], "B-SEASON");
|
| 1471 |
+
assert!(labels[marker_start + 1..marker_end]
|
| 1472 |
+
.iter()
|
| 1473 |
+
.all(|label| label == "I-SEASON"));
|
| 1474 |
+
}
|
| 1475 |
+
|
| 1476 |
+
#[test]
|
| 1477 |
+
fn keeps_numeric_title_suffix_out_of_sequel_repair() {
|
| 1478 |
+
let text = "Kamisama Hajimemashita 2 - 01";
|
| 1479 |
+
let title_end = text.find(" - ").unwrap();
|
| 1480 |
+
let episode_start = text.find("01").unwrap();
|
| 1481 |
+
let row = char_row(
|
| 1482 |
+
text,
|
| 1483 |
+
&[(0, title_end)],
|
| 1484 |
+
&[(episode_start, episode_start + 2)],
|
| 1485 |
+
);
|
| 1486 |
+
|
| 1487 |
+
let (_tokens, labels) = labels_for_char_tokenizer(&row);
|
| 1488 |
+
let suffix = text.find('2').unwrap();
|
| 1489 |
+
|
| 1490 |
+
assert_eq!(labels[suffix], "I-TITLE_LATIN");
|
| 1491 |
+
assert!(!labels
|
| 1492 |
+
.iter()
|
| 1493 |
+
.any(|label| label_entity(label) == Some("SEASON")));
|
| 1494 |
+
}
|
| 1495 |
+
|
| 1496 |
+
#[test]
|
| 1497 |
+
fn skips_alias_marker_when_season_already_exists() {
|
| 1498 |
+
let text = "樱桃小丸子第二期(Chibi Maruko-chan II)[1439]";
|
| 1499 |
+
let tokens = chars_as_strings(text);
|
| 1500 |
+
let mut labels = vec!["O".to_string(); tokens.len()];
|
| 1501 |
+
let title_end = char_index_at_byte(text, text.find("第二期").unwrap());
|
| 1502 |
+
label_span(&mut labels, 0, title_end, "TITLE_CHS");
|
| 1503 |
+
let season_start = title_end;
|
| 1504 |
+
let season_end = season_start + "第二期".chars().count();
|
| 1505 |
+
label_span(&mut labels, season_start, season_end, "SEASON");
|
| 1506 |
+
let alias_start = char_index_at_byte(text, text.find("Chibi").unwrap());
|
| 1507 |
+
let alias_end = char_index_at_byte(text, text.find(")").unwrap());
|
| 1508 |
+
label_span(&mut labels, alias_start, alias_end, "TITLE_LATIN");
|
| 1509 |
+
let episode_start = char_index_at_byte(text, text.find("1439").unwrap());
|
| 1510 |
+
label_span(&mut labels, episode_start, episode_start + 4, "EPISODE");
|
| 1511 |
+
let row = SourceRow {
|
| 1512 |
+
row_index: 0,
|
| 1513 |
+
raw_line: String::new(),
|
| 1514 |
+
filename: Some(text.to_string()),
|
| 1515 |
+
tokens,
|
| 1516 |
+
labels,
|
| 1517 |
+
tokenizer_variant: Some("char".to_string()),
|
| 1518 |
+
};
|
| 1519 |
+
|
| 1520 |
+
let (_tokens, labels) = labels_for_char_tokenizer(&row);
|
| 1521 |
+
let roman = char_index_at_byte(text, text.find("II").unwrap());
|
| 1522 |
+
|
| 1523 |
+
assert_eq!(labels[roman], "I-TITLE_LATIN");
|
| 1524 |
+
}
|
| 1525 |
+
}
|