Fix Rust encoded cache label repairs

Browse files

Files changed (4) hide show

tools/encoded_dataset_cache/Cargo.lock +27 -0
tools/encoded_dataset_cache/Cargo.toml +1 -0
tools/encoded_dataset_cache/src/bin/regex_benchmark.rs +335 -0
tools/encoded_dataset_cache/src/main.rs +683 -67

tools/encoded_dataset_cache/Cargo.lock CHANGED Viewed

@@ -17,6 +17,7 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "clap",
  "rand",
  "rayon",
  "regex",
@@ -80,6 +81,21 @@ version = "1.0.102"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
 [[package]]
 name = "cfg-if"
 version = "1.0.4"
@@ -163,6 +179,17 @@ version = "1.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e"
 [[package]]
 name = "getrandom"
 version = "0.2.17"

 dependencies = [
  "anyhow",
  "clap",
+ "fancy-regex",
  "rand",
  "rayon",
  "regex",
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
+[[package]]
+name = "bit-set"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
+dependencies = [
+ "bit-vec",
+]
+[[package]]
+name = "bit-vec"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
 [[package]]
 name = "cfg-if"
 version = "1.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e"
+[[package]]
+name = "fancy-regex"
+version = "0.18.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e1e1dacd0d2082dfcf1351c4bdd566bbe89a2b263235a2b50058f1e130a47277"
+dependencies = [
+ "bit-set",
+ "regex-automata",
+ "regex-syntax",
+]
 [[package]]
 name = "getrandom"
 version = "0.2.17"

tools/encoded_dataset_cache/Cargo.toml CHANGED Viewed

@@ -8,6 +8,7 @@ anyhow = "1.0"
 clap = { version = "4.5", features = ["derive"] }
 rand = "0.8"
 rayon = "1.10"
 regex = "1.11"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1.0"

 clap = { version = "4.5", features = ["derive"] }
 rand = "0.8"
 rayon = "1.10"
+fancy-regex = "0.18"
 regex = "1.11"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1.0"

tools/encoded_dataset_cache/src/bin/regex_benchmark.rs ADDED Viewed

	@@ -0,0 +1,335 @@

+use anyhow::{ensure, Context, Result};
+use clap::Parser;
+use fancy_regex::Regex as FancyRegex;
+use regex::Regex;
+use serde_json::Value;
+use std::fs::File;
+use std::hint::black_box;
+use std::io::{BufRead, BufReader};
+use std::path::PathBuf;
+use std::time::Instant;
/// Bare alternation of release tokens (`WEB-DL`, `BDRip`, `HEVC`, `AAC`, `CHS`, ...).
/// It carries no flags or anchors; the `source` and `source_tag` regexes wrap it themselves.
const SOURCE_TOKEN_PATTERN: &str = r"WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X|x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|AAC|FLAC|MP3|DTS|Opus|CHS|CHT|GB|BIG5|JPN?|JPSC|JPTC|繁中|简中";
/// Bare alternation of resolution shapes: `1080p`-like, `4K`-like and `1920x1080`-like.
const RESOLUTION_BODY: &str = r"\d{3,4}[pP]|\d[Kk]|\d{3,4}[xX×]\d{3,4}";
/// Anchored at the start: a search/alias keyword (Chinese or English), a colon, then any text.
const SPECIAL_TAG_PATTERN: &str =
    r"(?i)^(?:檢索|检索|搜索|搜寻|搜尋|别名|別名|alias|search|keyword)\s*[:：].+";
/// Whole-string codes such as `NCOP`, `ED2`, `IV1`, `OVA01` (digits optional except after `IV`).
const SPECIAL_CODE_PATTERN: &str =
    r"(?i)^(?:NCOP|NCED|OP|ED|PV|CM)\d*$|^IV\d+$|^(?:OVA|OAD|SP)\d*$";
/// Anchored at the start: text that opens with a dash/underscore plus a number or code,
/// `#` plus a number, or an opening bracket plus an optional `E`/`EP`/`#` and a number.
const EPISODE_CONTEXT_PATTERN: &str = r"(?i)^\s*(?:[-_]\s*(?:\d{1,4}|NCOP|NCED|OP|ED|OVA|OAD|SP|END)\b|#\s*\d{1,4}|[\[\(【《]\s*(?:EP?|#)?\d{1,4})";
/// Episode number spans (`S01E07`, `- 03`, `[12]`, `EP12`, `第12话`, ...).
/// Contains look-ahead (`(?=`), so it is compiled with fancy-regex in both pattern sets.
const EPISODE_SPAN_PATTERN: &str = r"(?i)(?:[Ss]\d{1,2}[Ee]\d{1,4}(?:v\d+)?|(?:^|[\s._])[-_]\s*\d{1,4}(?:v\d+)?(?=$|[\s._\-\]\)】》\[])|[\[\(【《](?:EP?|#)?\d{1,4}(?:v\d+)?[\]\)】》]|(?:^|[\s._\-\[\(【《#])(?:EP?|第|#)\d{1,4}(?:v\d+)?(?:[话話集])?(?=$|[\s._\-\]\)】》]))";
/// Romanized markers such as `Ni no Sara` or `Ni Gakki`, captured as `marker`.
/// Uses look-behind/look-ahead against ASCII alphanumerics, hence fancy-regex.
const READING_MARKER_PATTERN: &str = r"(?i)(?<![A-Za-z0-9])(?P<marker>Ni\s+no\s+(?:Sara|Shou|Sho|Syo|Shō)|San\s+no\s+(?:Sara|Shou|Sho|Syo)|(?:Yon|Shi|Shin)\s+no\s+Sara|(?:Go|Gou)\s+no\s+Sara|Ni\s+Gakki|Sono\s+Ni)(?![A-Za-z0-9])";
/// Roman numerals II through IX (ASCII or single-codepoint forms), captured as `marker`.
/// Uses look-behind/look-ahead against ASCII alphanumerics, hence fancy-regex.
const ROMAN_MARKER_PATTERN: &str =
    r"(?<![A-Za-z0-9])(?P<marker>II|III|IV|V|VI|VII|VIII|IX|[ⅡⅢⅣⅤⅥⅦⅧⅨ])(?![A-Za-z0-9])";
/// CJK numerals, optionally followed by a particle and a unit character, or the `第…季/期/部/章` form.
const CJK_MARKER_PATTERN: &str = r"(?:[一二三四五六七八九十兩两貳贰弐弍參叁参肆伍陸陆柒捌玖](?:\s*(?:ノ|の|之)\s*(?:章|期|季|部))?|第[一二三四五六七八九十兩两貳贰弐弍參叁参肆伍陸陆柒捌玖\d]+[季期部章])";
/// Anchored at the start: zero to two bracketed tags out of `menu`, `ncop`, `op`, `ova`, ...
/// Because zero repetitions are allowed, this pattern matches any input.
const SPECIAL_CONTEXT_PREFIX_PATTERN: &str =
    r"(?i)^(?:[\[\(【《]\s*(?:menu|menus|bdmenu|ncop|nced|op|ed|ova|oad|sp)\s*[\]\)】》]\s*){0,2}";
+#[derive(Parser, Debug)]
+#[command(
+    about = "Compare regex vs fancy-regex workload costs for AniFileBERT cache preprocessing"
+)]
+struct Args {
+    #[arg(long)]
+    input: PathBuf,
+    #[arg(long, default_value_t = 0)]
+    limit_rows: usize,
+    #[arg(long, default_value_t = 3)]
+    repeat: usize,
+}
+fn main() -> Result<()> {
+    let args = Args::parse();
+    ensure!(args.repeat > 0, "--repeat must be greater than 0");
+    let filenames = load_filenames(&args.input, args.limit_rows)?;
+    if filenames.is_empty() {
+        anyhow::bail!("no filenames loaded from {}", args.input.display());
+    }
+    let selective = SelectivePatterns::new()?;
+    let fancy_all = FancyAllPatterns::new()?;
+    let (selective_seconds, selective_count) =
+        time_repeated(args.repeat, || run_selective(&filenames, &selective))?;
+    let (fancy_seconds, fancy_count) =
+        time_repeated(args.repeat, || run_fancy_all(&filenames, &fancy_all))?;
+    ensure!(
+        selective_count == fancy_count,
+        "selective and fancy-all match counts differ: selective={}, fancy_all={}",
+        selective_count,
+        fancy_count
+    );
+    let ratio = if selective_seconds > 0.0 {
+        fancy_seconds / selective_seconds
+    } else {
+        0.0
+    };
+    println!(
+        "{}",
+        serde_json::json!({
+            "rows": filenames.len(),
+            "repeat": args.repeat,
+            "selective_seconds": selective_seconds,
+            "fancy_all_seconds": fancy_seconds,
+            "ratio": ratio,
+            "match_count": selective_count,
+        })
+    );
+    Ok(())
+}
+fn time_repeated<F>(repeat: usize, mut run_once: F) -> Result<(f64, usize)>
+where
+    F: FnMut() -> Result<usize>,
+{
+    let started = Instant::now();
+    let mut count = 0usize;
+    for _ in 0..repeat {
+        count = count.wrapping_add(black_box(run_once()?));
+    }
+    Ok((started.elapsed().as_secs_f64(), count))
+}
+struct SelectivePatterns {
+    resolution: Regex,
+    source: Regex,
+    source_tag: Regex,
+    special_tag: Regex,
+    special_code: Regex,
+    episode_context: Regex,
+    episode_span: FancyRegex,
+    reading_marker: FancyRegex,
+    roman_marker: FancyRegex,
+    cjk_marker: Regex,
+    special_context_prefix: Regex,
+}
+impl SelectivePatterns {
+    fn new() -> Result<Self> {
+        Ok(Self {
+            resolution: Regex::new(&format!(r"(?i)(?:{RESOLUTION_BODY})"))?,
+            source: Regex::new(&format!(r"(?i)(?:{SOURCE_TOKEN_PATTERN})"))?,
+            source_tag: Regex::new(&format!(
+                r"(?i)^(?:{SOURCE_TOKEN_PATTERN})(?:\s*(?:[&+/,_-]|,\s*)\s*(?:{SOURCE_TOKEN_PATTERN}))*$"
+            ))?,
+            special_tag: Regex::new(SPECIAL_TAG_PATTERN)?,
+            special_code: Regex::new(SPECIAL_CODE_PATTERN)?,
+            episode_context: Regex::new(EPISODE_CONTEXT_PATTERN)?,
+            episode_span: FancyRegex::new(EPISODE_SPAN_PATTERN)?,
+            reading_marker: FancyRegex::new(READING_MARKER_PATTERN)?,
+            roman_marker: FancyRegex::new(ROMAN_MARKER_PATTERN)?,
+            cjk_marker: Regex::new(CJK_MARKER_PATTERN)?,
+            special_context_prefix: Regex::new(SPECIAL_CONTEXT_PREFIX_PATTERN)?,
+        })
+    }
+}
+struct FancyAllPatterns {
+    resolution: FancyRegex,
+    source: FancyRegex,
+    source_tag: FancyRegex,
+    special_tag: FancyRegex,
+    special_code: FancyRegex,
+    episode_context: FancyRegex,
+    episode_span: FancyRegex,
+    reading_marker: FancyRegex,
+    roman_marker: FancyRegex,
+    cjk_marker: FancyRegex,
+    special_context_prefix: FancyRegex,
+}
+impl FancyAllPatterns {
+    fn new() -> Result<Self> {
+        Ok(Self {
+            resolution: FancyRegex::new(&format!(r"(?i)(?:{RESOLUTION_BODY})"))?,
+            source: FancyRegex::new(&format!(r"(?i)(?:{SOURCE_TOKEN_PATTERN})"))?,
+            source_tag: FancyRegex::new(&format!(
+                r"(?i)^(?:{SOURCE_TOKEN_PATTERN})(?:\s*(?:[&+/,_-]|,\s*)\s*(?:{SOURCE_TOKEN_PATTERN}))*$"
+            ))?,
+            special_tag: FancyRegex::new(SPECIAL_TAG_PATTERN)?,
+            special_code: FancyRegex::new(SPECIAL_CODE_PATTERN)?,
+            episode_context: FancyRegex::new(EPISODE_CONTEXT_PATTERN)?,
+            episode_span: FancyRegex::new(EPISODE_SPAN_PATTERN)?,
+            reading_marker: FancyRegex::new(READING_MARKER_PATTERN)?,
+            roman_marker: FancyRegex::new(ROMAN_MARKER_PATTERN)?,
+            cjk_marker: FancyRegex::new(CJK_MARKER_PATTERN)?,
+            special_context_prefix: FancyRegex::new(SPECIAL_CONTEXT_PREFIX_PATTERN)?,
+        })
+    }
+}
+fn run_selective(filenames: &[String], patterns: &SelectivePatterns) -> Result<usize> {
+    let mut count = 0usize;
+    for filename in filenames {
+        count = count.wrapping_add(
+            patterns
+                .resolution
+                .find_iter(filename)
+                .filter(|mat| has_ascii_token_boundaries(filename, mat.start(), mat.end()))
+                .count(),
+        );
+        count = count.wrapping_add(
+            patterns
+                .source
+                .find_iter(filename)
+                .filter(|mat| has_ascii_token_boundaries(filename, mat.start(), mat.end()))
+                .count(),
+        );
+        count = count.wrapping_add(patterns.episode_context.is_match(filename) as usize);
+        count = count.wrapping_add(patterns.cjk_marker.find_iter(filename).count());
+        count = count.wrapping_add(fancy_count(&patterns.episode_span, filename)?);
+        count = count.wrapping_add(fancy_count(&patterns.reading_marker, filename)?);
+        count = count.wrapping_add(fancy_count(&patterns.roman_marker, filename)?);
+        for inner in bracket_inners(filename) {
+            count = count.wrapping_add(patterns.source_tag.is_match(&inner) as usize);
+            count = count.wrapping_add(patterns.special_tag.is_match(&inner) as usize);
+            count = count.wrapping_add(patterns.special_code.is_match(&inner) as usize);
+            count = count.wrapping_add(patterns.special_context_prefix.is_match(&inner) as usize);
+        }
+    }
+    Ok(count)
+}
+fn run_fancy_all(filenames: &[String], patterns: &FancyAllPatterns) -> Result<usize> {
+    let mut count = 0usize;
+    for filename in filenames {
+        count = count.wrapping_add(fancy_count_with_boundaries(&patterns.resolution, filename)?);
+        count = count.wrapping_add(fancy_count_with_boundaries(&patterns.source, filename)?);
+        count = count.wrapping_add(patterns.episode_context.is_match(filename)? as usize);
+        count = count.wrapping_add(fancy_count(&patterns.cjk_marker, filename)?);
+        count = count.wrapping_add(fancy_count(&patterns.episode_span, filename)?);
+        count = count.wrapping_add(fancy_count(&patterns.reading_marker, filename)?);
+        count = count.wrapping_add(fancy_count(&patterns.roman_marker, filename)?);
+        for inner in bracket_inners(filename) {
+            count = count.wrapping_add(patterns.source_tag.is_match(&inner)? as usize);
+            count = count.wrapping_add(patterns.special_tag.is_match(&inner)? as usize);
+            count = count.wrapping_add(patterns.special_code.is_match(&inner)? as usize);
+            count = count.wrapping_add(patterns.special_context_prefix.is_match(&inner)? as usize);
+        }
+    }
+    Ok(count)
+}
+fn fancy_count(regex: &FancyRegex, text: &str) -> Result<usize> {
+    let mut count = 0usize;
+    for item in regex.find_iter(text) {
+        let _ = item?;
+        count += 1;
+    }
+    Ok(count)
+}
+fn fancy_count_with_boundaries(regex: &FancyRegex, text: &str) -> Result<usize> {
+    let mut count = 0usize;
+    for item in regex.find_iter(text) {
+        let mat = item?;
+        if has_ascii_token_boundaries(text, mat.start(), mat.end()) {
+            count += 1;
+        }
+    }
+    Ok(count)
+}
+fn load_filenames(path: &PathBuf, limit_rows: usize) -> Result<Vec<String>> {
+    let file = File::open(path).with_context(|| format!("failed to open {}", path.display()))?;
+    let reader = BufReader::new(file);
+    let mut filenames = Vec::new();
+    for (idx, line) in reader.lines().enumerate() {
+        if limit_rows > 0 && filenames.len() >= limit_rows {
+            break;
+        }
+        let raw = line.with_context(|| format!("failed reading line {}", idx + 1))?;
+        if raw.trim().is_empty() {
+            continue;
+        }
+        let value: Value = serde_json::from_str(&raw)
+            .with_context(|| format!("invalid JSONL line {}", idx + 1))?;
+        if let Some(filename) = value.get("filename").and_then(Value::as_str) {
+            filenames.push(filename.to_string());
+        }
+    }
+    Ok(filenames)
+}
/// Returns the text found inside each bracket pair of `text`, in order of appearance.
///
/// Recognised openers are `[`, `(`, `【` and `《`. An opener is paired with the first matching
/// closer after it, and scanning resumes after that closer, so brackets nested inside a pair
/// are returned as part of the inner text. An opener without a closer is skipped.
fn bracket_inners(text: &str) -> Vec<String> {
    let mut inners = Vec::new();
    let mut rest = text;
    while let Some(opener) = rest.chars().next() {
        let after_opener = &rest[opener.len_utf8()..];
        let closer = match opener {
            '[' => ']',
            '(' => ')',
            '【' => '】',
            '《' => '》',
            _ => {
                rest = after_opener;
                continue;
            }
        };
        rest = match after_opener.find(closer) {
            Some(at) => {
                inners.push(after_opener[..at].to_string());
                &after_opener[at + closer.len_utf8()..]
            }
            // Unclosed opener: step over it and keep scanning.
            None => after_opener,
        };
    }
    inners
}
/// Reports whether the byte range `start..end` of `text` is free of ASCII letters and digits on
/// both sides.
///
/// The start or end of `text`, and any non-ASCII or non-alphanumeric neighbour, counts as a
/// boundary.
///
/// # Panics
/// Panics if `start` or `end` is out of range or not on a UTF-8 character boundary.
fn has_ascii_token_boundaries(text: &str, start: usize, end: usize) -> bool {
    let (before, after) = (&text[..start], &text[end..]);
    let glued_left = matches!(before.chars().next_back(), Some(ch) if ch.is_ascii_alphanumeric());
    let glued_right = matches!(after.chars().next(), Some(ch) if ch.is_ascii_alphanumeric());
    !(glued_left || glued_right)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Both runners must report the same total on a small mixed sample of filenames.
    #[test]
    fn selective_and_fancy_all_count_the_same_matches() -> Result<()> {
        let filenames: Vec<String> = [
            "[GM-Team][國漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4",
            "[YYDM&VCB-Studio] Shinsekai Yori II [NCED02][Ma10p_1080p][x265_flac].mkv",
            "Witch.Hat.Atelier.S01E07.1080p.NF.WEB-DL.JPN.AAC2.0.H.264.MSubs-ToonsHub",
            "[Lilith-Raws] San no Sara - 03 [Baha][WEB-DL][1080p][AVC AAC][CHT].mp4",
            "[Test] 搜索: 别名 [OVA01][BDRip][720p]",
        ]
        .iter()
        .map(|name| name.to_string())
        .collect();

        let selective_total = run_selective(&filenames, &SelectivePatterns::new()?)?;
        let fancy_total = run_fancy_all(&filenames, &FancyAllPatterns::new()?)?;
        assert_eq!(selective_total, fancy_total);
        Ok(())
    }

    /// Each of the four supported bracket pairs yields its inner text.
    #[test]
    fn bracket_inners_extract_supported_pairs() {
        let inners = bracket_inners("[A](B)【C】《D》");
        assert_eq!(inners, vec!["A", "B", "C", "D"]);
    }

    /// Matches glued to ASCII letters or digits are rejected; free-standing ones are accepted.
    #[test]
    fn ascii_token_boundaries_reject_embedded_matches() {
        let text = "ABC1080p 1080p HEVC2 HEVC";
        let cases = [(3, 8, false), (9, 14, true), (15, 19, false), (21, 25, true)];
        for (start, end, expected) in cases.iter() {
            assert_eq!(has_ascii_token_boundaries(text, *start, *end), *expected);
        }
    }
}

tools/encoded_dataset_cache/src/main.rs CHANGED Viewed

@@ -1,5 +1,6 @@
 use anyhow::{bail, Context, Result};
 use clap::Parser;
 use rand::rngs::StdRng;
 use rand::seq::SliceRandom;
 use rand::SeedableRng;
@@ -56,11 +57,19 @@ const FALLBACK_LABELS: [&str; 37] = [
 const SOURCE_TOKEN_PATTERN: &str = r"WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X|x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|AAC|FLAC|MP3|DTS|Opus|CHS|CHT|GB|BIG5|JPN?|JPSC|JPTC|繁中|简中";
-static RESOLUTION_RE: OnceLock<Regex> = OnceLock::new();
 static SOURCE_RE: OnceLock<Regex> = OnceLock::new();
 static SOURCE_TAG_RE: OnceLock<Regex> = OnceLock::new();
 static SPECIAL_TAG_RE: OnceLock<Regex> = OnceLock::new();
 static SPECIAL_CODE_RE: OnceLock<Regex> = OnceLock::new();
 #[derive(Parser, Debug)]
 #[command(
@@ -457,28 +466,26 @@ fn encode_row(row: &SourceRow, context: &EncodeContext) -> Result<(Vec<u16>, Vec
 }
 fn labels_for_char_tokenizer(row: &SourceRow) -> (Vec<String>, Vec<String>) {
-    if row.tokenizer_variant.as_deref() == Some("char") {
-        if let Some(filename) = row.filename.as_deref() {
             let filename_chars = chars_as_strings(filename);
             if row.tokens == filename_chars {
-                return (row.tokens.clone(), row.labels.clone());
             }
         }
-    }
-    if let Some(filename) = row.filename.as_deref() {
-        if let Some(projected) = project_labels_from_filename(filename, &row.tokens, &row.labels) {
-            let (tokens, mut labels) = projected;
-            repair_structural_meta_labels(filename, &mut labels);
             return (tokens, labels);
         }
     }
-    let (tokens, mut labels) = align_tokens_to_chars(&row.tokens, &row.labels);
-    if let Some(filename) = row.filename.as_deref() {
-        repair_structural_meta_labels(filename, &mut labels);
-    }
-    (tokens, labels)
 }
 fn project_labels_from_filename(
@@ -579,11 +586,13 @@ fn align_tokens_to_chars(tokens: &[String], labels: &[String]) -> (Vec<String>,
     (char_tokens, char_labels)
 }
-fn repair_structural_meta_labels(text: &str, labels: &mut [String]) {
-    if labels.len() != text.chars().count() {
-        return;
-    }
-    let episode_end = first_episode_span_end(labels);
     for (inner_start, inner_end) in bracket_inner_spans(text) {
         let bracket_start = inner_start.saturating_sub(1);
         if bracket_start < episode_end {
@@ -595,46 +604,46 @@ fn repair_structural_meta_labels(text: &str, labels: &mut [String]) {
             continue;
         }
         let clean = chars_slice_to_string(&inner, trim_start, trim_end);
-        let clean_start = inner_start + trim_start;
-        let clean_end = inner_start + trim_end;
         if special_tag_re().is_match(&clean) || special_code_re().is_match(&clean) {
-            label_span_if_safe(labels, clean_start, clean_end, "SPECIAL");
             continue;
         }
         if source_tag_re().is_match(&clean) {
-            label_span_if_safe(labels, clean_start, clean_end, "SOURCE");
             continue;
         }
-        for mat in resolution_re().find_iter(&inner) {
-            if !has_ascii_token_boundaries(&inner, mat.start(), mat.end()) {
-                continue;
-            }
-            let start = inner_start + char_index_at_byte(&inner, mat.start());
-            let end = inner_start + char_index_at_byte(&inner, mat.end());
-            label_span_if_safe(labels, start, end, "RESOLUTION");
         }
-        for mat in source_re().find_iter(&inner) {
-            if !has_ascii_token_boundaries(&inner, mat.start(), mat.end()) {
                 continue;
             }
-            let start = inner_start + char_index_at_byte(&inner, mat.start());
-            let end = inner_start + char_index_at_byte(&inner, mat.end());
-            label_span_if_safe(labels, start, end, "SOURCE");
         }
     }
-    for mat in resolution_re().find_iter(text) {
-        if !has_ascii_token_boundaries(text, mat.start(), mat.end()) {
-            continue;
-        }
         let start = char_index_at_byte(text, mat.start());
         if start < episode_end {
             continue;
         }
         let end = char_index_at_byte(text, mat.end());
-        label_span_if_safe(labels, start, end, "RESOLUTION");
     }
     for mat in source_re().find_iter(text) {
         if !has_ascii_token_boundaries(text, mat.start(), mat.end()) {
@@ -645,23 +654,461 @@ fn repair_structural_meta_labels(text: &str, labels: &mut [String]) {
             continue;
         }
         let end = char_index_at_byte(text, mat.end());
-        label_span_if_safe(labels, start, end, "SOURCE");
     }
 }
-fn first_episode_span_end(labels: &[String]) -> usize {
-    let mut idx = 0usize;
-    while idx < labels.len() {
-        if label_entity(&labels[idx]) == Some("EPISODE") {
-            let mut end = idx + 1;
-            while end < labels.len() && label_entity(&labels[end]) == Some("EPISODE") {
-                end += 1;
             }
-            return end;
         }
         idx += 1;
     }
-    0
 }
 fn bracket_inner_spans(text: &str) -> Vec<(usize, usize)> {
@@ -717,28 +1164,19 @@ fn chars_slice_to_string(text: &str, start: usize, end: usize) -> String {
         .collect()
 }
-fn label_span_if_safe(labels: &mut [String], start: usize, end: usize, entity: &str) {
-    if start >= end || end > labels.len() {
         return;
     }
-    if labels[start..end].iter().any(|label| {
         matches!(
-            label_entity(label),
             Some("GROUP" | "EPISODE" | "SEASON" | "PATH_SEASON")
         )
     }) {
         return;
     }
-    let previous_same = start > 0 && label_entity(&labels[start - 1]) == Some(entity);
-    let mut first = !previous_same;
-    for label in labels.iter_mut().take(end).skip(start) {
-        *label = if first {
-            format!("B-{entity}")
-        } else {
-            format!("I-{entity}")
-        };
-        first = false;
-    }
 }
 fn has_ascii_token_boundaries(text: &str, start: usize, end: usize) -> bool {
@@ -764,9 +1202,13 @@ fn label_entity(label: &str) -> Option<&str> {
     }
 }
-fn resolution_re() -> &'static Regex {
-    RESOLUTION_RE
-        .get_or_init(|| Regex::new(r"(?i)(?:\d{3,4}p|\d[kK]|\d{3,4}[xX×]\d{3,4})").unwrap())
 }
 fn source_re() -> &'static Regex {
@@ -795,6 +1237,60 @@ fn special_code_re() -> &'static Regex {
     })
 }
 fn chars_as_strings(text: &str) -> Vec<String> {
     text.chars().map(|ch| ch.to_string()).collect()
 }
@@ -907,3 +1403,123 @@ fn write_npy_header<W: Write>(writer: &mut W, descr: &str, rows: usize, cols: us
     writer.write_all(&header)?;
     Ok(())
 }

 use anyhow::{bail, Context, Result};
 use clap::Parser;
+use fancy_regex::Regex as FancyRegex;
 use rand::rngs::StdRng;
 use rand::seq::SliceRandom;
 use rand::SeedableRng;
 const SOURCE_TOKEN_PATTERN: &str = r"WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X|x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|AAC|FLAC|MP3|DTS|Opus|CHS|CHT|GB|BIG5|JPN?|JPSC|JPTC|繁中|简中";
+static RESOLUTION_RE: OnceLock<FancyRegex> = OnceLock::new();
 static SOURCE_RE: OnceLock<Regex> = OnceLock::new();
 static SOURCE_TAG_RE: OnceLock<Regex> = OnceLock::new();
 static SPECIAL_TAG_RE: OnceLock<Regex> = OnceLock::new();
 static SPECIAL_CODE_RE: OnceLock<Regex> = OnceLock::new();
+static EPISODE_CONTEXT_RE: OnceLock<Regex> = OnceLock::new();
+static EPISODE_SPAN_RE: OnceLock<FancyRegex> = OnceLock::new();
+static READING_MARKER_RE: OnceLock<FancyRegex> = OnceLock::new();
+static ROMAN_MARKER_RE: OnceLock<FancyRegex> = OnceLock::new();
+static CJK_MARKER_RE: OnceLock<Regex> = OnceLock::new();
+static SPECIAL_CONTEXT_PREFIX_RE: OnceLock<Regex> = OnceLock::new();
+const SEPARATOR_CHARS: &[char] = &[' ', '\t', '-', '_', '.', '|', '~', '～'];
 #[derive(Parser, Debug)]
 #[command(
 }
 fn labels_for_char_tokenizer(row: &SourceRow) -> (Vec<String>, Vec<String>) {
+    let mut source_labels = row.labels.clone();
+    if let Some(filename) = row.filename.as_deref() {
+        repair_known_label_issues(filename, &row.tokens, &mut source_labels);
+        if row.tokenizer_variant.as_deref() == Some("char") {
             let filename_chars = chars_as_strings(filename);
             if row.tokens == filename_chars {
+                return (row.tokens.clone(), source_labels);
             }
         }
+        if let Some(projected) = project_labels_from_filename(filename, &row.tokens, &source_labels)
+        {
+            let (tokens, labels) = projected;
             return (tokens, labels);
         }
     }
+    align_tokens_to_chars(&row.tokens, &source_labels)
 }
 fn project_labels_from_filename(
     (char_tokens, char_labels)
 }
+fn repair_structural_meta_labels(
+    text: &str,
+    _tokens: &[String],
+    labels: &mut [String],
+    offsets: &[(usize, usize)],
+) {
+    let episode_end = first_episode_span_end(labels, offsets, text);
     for (inner_start, inner_end) in bracket_inner_spans(text) {
         let bracket_start = inner_start.saturating_sub(1);
         if bracket_start < episode_end {
             continue;
         }
         let clean = chars_slice_to_string(&inner, trim_start, trim_end);
         if special_tag_re().is_match(&clean) || special_code_re().is_match(&clean) {
+            let indices = token_indices_for_span(offsets, inner_start, inner_end);
+            label_span_if_safe(labels, &indices, "SPECIAL");
             continue;
         }
         if source_tag_re().is_match(&clean) {
+            let indices = token_indices_for_span(offsets, inner_start, inner_end);
+            label_span_if_safe(labels, &indices, "SOURCE");
             continue;
         }
+        for mat in resolution_re()
+            .find_iter(&clean)
+            .filter_map(|item| item.ok())
+        {
+            let start = inner_start + char_index_at_byte(&clean, mat.start());
+            let end = inner_start + char_index_at_byte(&clean, mat.end());
+            let indices = token_indices_for_span(offsets, start, end);
+            label_span_if_safe(labels, &indices, "RESOLUTION");
         }
+        for mat in source_re().find_iter(&clean) {
+            if !has_ascii_token_boundaries(&clean, mat.start(), mat.end()) {
                 continue;
             }
+            let start = inner_start + char_index_at_byte(&clean, mat.start());
+            let end = inner_start + char_index_at_byte(&clean, mat.end());
+            let indices = token_indices_for_span(offsets, start, end);
+            label_span_if_safe(labels, &indices, "SOURCE");
         }
     }
+    for mat in resolution_re().find_iter(text).filter_map(|item| item.ok()) {
         let start = char_index_at_byte(text, mat.start());
         if start < episode_end {
             continue;
         }
         let end = char_index_at_byte(text, mat.end());
+        let indices = token_indices_for_span(offsets, start, end);
+        label_span_if_safe(labels, &indices, "RESOLUTION");
     }
     for mat in source_re().find_iter(text) {
         if !has_ascii_token_boundaries(text, mat.start(), mat.end()) {
             continue;
         }
         let end = char_index_at_byte(text, mat.end());
+        let indices = token_indices_for_span(offsets, start, end);
+        label_span_if_safe(labels, &indices, "SOURCE");
     }
 }
+fn repair_known_label_issues(text: &str, tokens: &[String], labels: &mut [String]) {
+    if tokens.len() != labels.len() {
+        return;
+    }
+    let Some(offsets) = token_offsets_in_text(text, tokens) else {
+        return;
+    };
+    let quick_text = text.to_lowercase();
+    let has_sequel_marker_hint = [
+        " II", " III", " IV", " V", " VI", " VII", " VIII", " IX", "Ⅱ", "Ⅲ", "Ⅳ", "Ⅴ", "Ⅵ", "Ⅶ",
+        "Ⅷ", "Ⅸ", "之章", "之期", "之季", "之部", "ノ章", "ノ期", "の章", "の期", "貳", "贰", "弐",
+        "弍", "參", "叁", "参", "肆", "陸", "陆", "Ni ", " ni ", " no Sara", "Gakki",
+    ]
+    .iter()
+    .any(|needle| text.contains(needle) || quick_text.contains(&needle.to_lowercase()));
+    if has_sequel_marker_hint {
+        for (start, end) in find_sequel_season_markers(text) {
+            if labels_have_season_before(labels, &offsets, start) {
+                continue;
+            }
+            let indices = token_indices_for_span(&offsets, start, end);
+            if indices.is_empty() {
+                continue;
+            }
+            if indices.iter().any(|idx| {
+                matches!(
+                    label_entity(&labels[*idx]),
+                    Some(
+                        "GROUP"
+                            | "EPISODE"
+                            | "RESOLUTION"
+                            | "SOURCE"
+                            | "SPECIAL"
+                            | "TAG"
+                            | "PATH_SEASON"
+                    )
+                )
+            }) {
+                continue;
+            }
+            if !indices.iter().any(|idx| is_title_like_label(&labels[*idx])) {
+                continue;
+            }
+            label_span_indices(labels, &indices, "SEASON");
+            mark_adjacent_title_separators_o(tokens, labels, &indices);
+        }
+    }
+    repair_structural_meta_labels(text, tokens, labels, &offsets);
+}
+fn find_sequel_season_markers(text: &str) -> Vec<(usize, usize)> {
+    let mut repairs = Vec::new();
+    for mat in reading_marker_re()
+        .find_iter(text)
+        .filter_map(|item| item.ok())
+    {
+        let marker = mat.as_str();
+        if season_marker_number(marker).is_none() || !has_episode_context(text, mat.end()) {
+            continue;
+        }
+        repairs.push((
+            char_index_at_byte(text, mat.start()),
+            char_index_at_byte(text, mat.end()),
+        ));
+    }
+    for mat in roman_marker_re()
+        .find_iter(text)
+        .filter_map(|item| item.ok())
+    {
+        let marker = mat.as_str();
+        if season_marker_number(marker).is_none() || !has_episode_context(text, mat.end()) {
+            continue;
+        }
+        repairs.push((
+            char_index_at_byte(text, mat.start()),
+            char_index_at_byte(text, mat.end()),
+        ));
+    }
+    for mat in cjk_marker_re().find_iter(text) {
+        let marker = mat.as_str();
+        if season_marker_number(marker).is_none() || !has_episode_context(text, mat.end()) {
+            continue;
+        }
+        repairs.push((
+            char_index_at_byte(text, mat.start()),
+            char_index_at_byte(text, mat.end()),
+        ));
+    }
+    for (base, value) in standalone_ni_season_bases() {
+        let mut search_start = 0usize;
+        while let Some(relative) = text[search_start..].find(base) {
+            let base_start = search_start + relative;
+            let base_end = base_start + base.len();
+            let Some((ni_start, ni_end)) = standalone_ni_after_base(text, base_end) else {
+                search_start = base_end;
+                continue;
+            };
+            if *value == 2
+                && has_episode_context(text, ni_end)
+                && has_ascii_token_boundaries(text, ni_start, ni_end)
+            {
+                repairs.push((
+                    char_index_at_byte(text, ni_start),
+                    char_index_at_byte(text, ni_end),
+                ));
+            }
+            search_start = base_end;
+        }
+    }
+    repairs.sort_by_key(|(start, end)| (*start, *end));
+    let mut deduped: Vec<(usize, usize)> = Vec::new();
+    for repair in repairs {
+        if let Some(previous) = deduped.last_mut() {
+            if repair.0 < previous.1 {
+                if repair.1.saturating_sub(repair.0) > previous.1.saturating_sub(previous.0) {
+                    *previous = repair;
+                }
+                continue;
+            }
+        }
+        deduped.push(repair);
+    }
+    deduped
+}
+/// Interprets a sequel/season marker and returns the season number it denotes.
+///
+/// The marker is first stripped of surrounding whitespace and brackets via
+/// `clean_marker_text`. The cleaned text is then tried, in order, as:
+/// 1. a Roman numeral (`roman_numeral_value`, exact match);
+/// 2. a romanized reading (`reading_marker_value`) after collapsing whitespace
+///    runs to single spaces and lowercasing, or the bare word `ni`;
+/// 3. a `第<number><季|期|部|章>` ordinal, whose inner part is parsed by
+///    `cn_number_to_int`;
+/// 4. a single leading numeral character, optionally followed by a connector
+///    suffix accepted by `cjk_marker_suffix_remainder_ok`.
+///
+/// Returns `None` when the text is empty after cleaning or matches no form.
+fn season_marker_number(text: &str) -> Option<u8> {
+    let marker = clean_marker_text(text);
+    if marker.is_empty() {
+        return None;
+    }
+    if let Some(value) = roman_numeral_value(&marker) {
+        return Some(value);
+    }
+    let normalized = marker
+        .split_whitespace()
+        .collect::<Vec<_>>()
+        .join(" ")
+        .to_lowercase();
+    if let Some(value) = reading_marker_value(&normalized) {
+        return Some(value);
+    }
+    if normalized == "ni" {
+        return Some(2);
+    }
+    let glyphs: Vec<char> = marker.chars().collect();
+    // Ordinal form: once the `第…<unit>` frame matches, the parse result is
+    // final, even when the inner number is not recognised.
+    if let ['第', inner @ .., '季' | '期' | '部' | '章'] = glyphs.as_slice() {
+        let inner: String = inner.iter().collect();
+        return cn_number_to_int(&inner);
+    }
+    // Bare numeral form: only the first character carries the number.
+    let (first, rest) = glyphs.split_first()?;
+    let value = cn_number_to_int(&first.to_string())?;
+    let rest: String = rest.iter().collect();
+    if rest.trim().is_empty() || cjk_marker_suffix_remainder_ok(&rest) {
+        Some(value)
+    } else {
+        None
+    }
+}
+/// Strips surrounding whitespace and bracket characters from a marker.
+///
+/// Order of operations: trim whitespace, trim any run of ASCII or full-width
+/// brackets from both ends, then trim whitespace again. Brackets in the
+/// interior of the text are left alone.
+fn clean_marker_text(text: &str) -> String {
+    const BRACKETS: [char; 10] = ['[', ']', '(', ')', '【', '】', '《', '》', '（', '）'];
+    let outer = text.trim();
+    let unwrapped = outer.trim_matches(|ch: char| BRACKETS.contains(&ch));
+    unwrapped.trim().to_owned()
+}
+/// Parses a small number written in ASCII digits or Chinese numerals.
+///
+/// Accepted forms, after trimming whitespace:
+/// * anything `u8::from_str` accepts (e.g. `"2"`, `"12"`);
+/// * a single numeral character known to `cn_digit_value` (including `十`);
+/// * `十X` (10 + X), `X十` (X * 10) and `X十Y` (X * 10 + Y), where `X` and `Y`
+///   must each be a numeral character worth 1–9.
+///
+/// Returns `None` for empty input and for anything else. In particular,
+/// malformed composites such as `"1十"`, `"十x"` or `"十十"` are rejected rather
+/// than silently evaluated with the unknown part treated as zero.
+fn cn_number_to_int(text: &str) -> Option<u8> {
+    let text = text.trim();
+    if text.is_empty() {
+        return None;
+    }
+    if let Ok(value) = text.parse::<u8>() {
+        return Some(value);
+    }
+    if let Some(value) = cn_digit_value(text) {
+        return Some(value);
+    }
+    // A tens/ones position must hold a real 1–9 digit; `十` itself (value 10)
+    // and unrecognised characters are not valid there.
+    let unit = |ch: char| cn_digit_value(&ch.to_string()).filter(|value| *value < 10);
+    let chars = text.chars().collect::<Vec<_>>();
+    match chars.as_slice() {
+        ['十', ones] => Some(10 + unit(*ones)?),
+        [tens, '十'] => Some(unit(*tens)? * 10),
+        [tens, '十', ones] => Some(unit(*tens)? * 10 + unit(*ones)?),
+        _ => None,
+    }
+}
+/// Maps a single Chinese/Japanese numeral character to its value (1–10).
+///
+/// Everyday, financial ("大写") and variant forms are recognised. The input
+/// must consist of exactly one character; anything else yields `None`.
+fn cn_digit_value(text: &str) -> Option<u8> {
+    let mut glyphs = text.chars();
+    let (Some(glyph), None) = (glyphs.next(), glyphs.next()) else {
+        return None;
+    };
+    let value = match glyph {
+        '一' => 1,
+        '二' | '兩' | '两' | '貳' | '贰' | '弐' | '弍' => 2,
+        '三' | '參' | '叁' | '参' => 3,
+        '四' | '肆' => 4,
+        '五' | '伍' => 5,
+        '六' | '陸' | '陆' => 6,
+        '七' | '柒' => 7,
+        '八' | '捌' => 8,
+        '九' | '玖' => 9,
+        '十' => 10,
+        _ => return None,
+    };
+    Some(value)
+}
+/// Returns the value (2–9) of a Roman numeral written either in ASCII capitals
+/// or as a single Unicode Roman-numeral character.
+///
+/// The match is exact and case-sensitive; `I`, `X` and lowercase spellings are
+/// not recognised.
+fn roman_numeral_value(text: &str) -> Option<u8> {
+    const NUMERALS: [(&str, &str, u8); 8] = [
+        ("II", "Ⅱ", 2),
+        ("III", "Ⅲ", 3),
+        ("IV", "Ⅳ", 4),
+        ("V", "Ⅴ", 5),
+        ("VI", "Ⅵ", 6),
+        ("VII", "Ⅶ", 7),
+        ("VIII", "Ⅷ", 8),
+        ("IX", "Ⅸ", 9),
+    ];
+    NUMERALS
+        .iter()
+        .find(|(ascii, glyph, _)| text == *ascii || text == *glyph)
+        .map(|&(_, _, value)| value)
+}
+/// Maps a romanized Japanese season reading to its season number (2–5).
+///
+/// The comparison is exact: the text must already be lowercase and use single
+/// spaces between words.
+fn reading_marker_value(text: &str) -> Option<u8> {
+    const READINGS: [(&[&str], u8); 4] = [
+        (
+            &[
+                "ni no sara",
+                "ni no shou",
+                "ni no sho",
+                "ni no syo",
+                "ni no shō",
+                "ni gakki",
+                "sono ni",
+            ],
+            2,
+        ),
+        (&["san no sara", "san no shou", "san no sho", "san no syo"], 3),
+        (&["yon no sara", "shi no sara", "shin no sara"], 4),
+        (&["go no sara", "gou no sara"], 5),
+    ];
+    READINGS
+        .iter()
+        .find(|(spellings, _)| spellings.contains(&text))
+        .map(|&(_, value)| value)
+}
+/// Checks whether the text following a CJK numeral is an accepted season
+/// suffix: a connector (`ノ`, `の` or `之`) followed by a unit (`章`, `期`, `季`
+/// or `部`), with any whitespace ignored and nothing else present.
+fn cjk_marker_suffix_remainder_ok(rest: &str) -> bool {
+    let mut glyphs = rest.chars().filter(|ch| !ch.is_whitespace());
+    matches!(
+        (glyphs.next(), glyphs.next(), glyphs.next()),
+        (
+            Some('ノ' | 'の' | '之'),
+            Some('章' | '期' | '季' | '部'),
+            None
+        )
+    )
+}
+/// Reports whether the text right after a marker looks like an episode
+/// reference.
+///
+/// `marker_end_byte` is a byte offset into `text` and must lie on a character
+/// boundary (the slice below panics otherwise). The tail is tested with
+/// `episode_context_re` as-is first; if that fails, leading whitespace, one
+/// optional closing bracket, and any prefix matched by
+/// `special_context_prefix_re` are skipped before testing again.
+fn has_episode_context(text: &str, marker_end_byte: usize) -> bool {
+    let after_marker = &text[marker_end_byte..];
+    let episode_re = episode_context_re();
+    if episode_re.is_match(after_marker) {
+        return true;
+    }
+    let trimmed = after_marker.trim_start();
+    // Step over a single closing bracket that may terminate the marker itself.
+    let unbracketed = match trimmed.strip_prefix(|ch: char| matches!(ch, ']' | ')' | '】' | '》'))
+    {
+        Some(inner) => inner.trim_start(),
+        None => trimmed,
+    };
+    let candidate = match special_context_prefix_re().find(unbracketed) {
+        Some(prefix) => &unbracketed[prefix.end()..],
+        None => unbracketed,
+    };
+    episode_re.is_match(candidate)
+}
+/// Returns the end of the first `episode_span_re` match in `text`, converted
+/// from a byte offset with `char_index_at_byte`.
+///
+/// Match attempts that report an error are skipped. `None` means no match was
+/// found.
+fn first_episode_regex_end(text: &str) -> Option<usize> {
+    let first_match = episode_span_re().find_iter(text).find_map(Result::ok)?;
+    Some(char_index_at_byte(text, first_match.end()))
+}
+/// Reports whether any token that ends at or before `marker_start` already
+/// carries a season-like label.
+///
+/// `labels` and `offsets` are walked pairwise; extra entries in the longer
+/// slice are ignored.
+fn labels_have_season_before(
+    labels: &[String],
+    offsets: &[(usize, usize)],
+    marker_start: usize,
+) -> bool {
+    for (label, &(_, token_end)) in labels.iter().zip(offsets) {
+        if token_end <= marker_start && is_season_like_label(label) {
+            return true;
+        }
+    }
+    false
+}
+/// Collects the indices of all tokens whose `(start, end)` offsets overlap the
+/// half-open span `start..end`.
+///
+/// Tokens that merely touch the span at a boundary do not count as
+/// overlapping, and an empty span overlaps nothing.
+fn token_indices_for_span(offsets: &[(usize, usize)], start: usize, end: usize) -> Vec<usize> {
+    let mut hits = Vec::new();
+    for (idx, &(token_start, token_end)) in offsets.iter().enumerate() {
+        let overlaps = token_start < end && start < token_end;
+        if overlaps {
+            hits.push(idx);
+        }
+    }
+    hits
+}
+/// Test helper: writes BIO labels for `entity` over `labels[start..end]`.
+///
+/// The first label is `B-<entity>` unless the label just before `start`
+/// already belongs to the same entity, in which case the span continues with
+/// `I-<entity>`. An `end` beyond the slice is clamped to its length.
+#[cfg(test)]
+fn label_span(labels: &mut [String], start: usize, end: usize, entity: &str) {
+    let continues_previous = start > 0 && label_entity(&labels[start - 1]) == Some(entity);
+    for (position, slot) in labels.iter_mut().take(end).skip(start).enumerate() {
+        let prefix = if position == 0 && !continues_previous {
+            "B"
+        } else {
+            "I"
+        };
+        *slot = format!("{prefix}-{entity}");
+    }
+}
+/// Writes BIO labels for `entity` onto the tokens listed in `indices`.
+///
+/// The first listed token gets `B-<entity>` unless the token immediately
+/// before it already belongs to the same entity; every other listed token gets
+/// `I-<entity>`. Does nothing for an empty index list. Panics if an index is
+/// out of bounds for `labels`.
+fn label_span_indices(labels: &mut [String], indices: &[usize], entity: &str) {
+    let Some(&head) = indices.first() else {
+        return;
+    };
+    // Decided once, before any label is overwritten.
+    let continues_previous = head > 0 && label_entity(&labels[head - 1]) == Some(entity);
+    for (position, &idx) in indices.iter().enumerate() {
+        let prefix = if position == 0 && !continues_previous {
+            "B"
+        } else {
+            "I"
+        };
+        labels[idx] = format!("{prefix}-{entity}");
+    }
+}
+/// Resets title labels on separator tokens that directly border a marker span.
+///
+/// `tokens` and `labels` are indexed in parallel; `marker_indices` lists the
+/// token indices of the marker (its first and last entries delimit the span).
+/// Nothing happens when `marker_indices` is empty.
+///
+/// Note the two directions use different separator tests: the backward scan
+/// only accepts whitespace-only tokens, while the forward scan accepts tokens
+/// made solely of `SEPARATOR_CHARS`.
+fn mark_adjacent_title_separators_o(
+    tokens: &[String],
+    labels: &mut [String],
+    marker_indices: &[usize],
+) {
+    if marker_indices.is_empty() {
+        return;
+    }
+    // Scan left from the marker: relabel consecutive whitespace-only tokens
+    // that still carry a title-like label, stopping at the first other token.
+    let mut idx = marker_indices[0];
+    while idx > 0 {
+        let prev = idx - 1;
+        if !tokens[prev].trim().is_empty() || !is_title_like_label(&labels[prev]) {
+            break;
+        }
+        labels[prev] = "O".to_string();
+        idx = prev;
+    }
+    // Scan right from just past the marker: relabel consecutive title-labelled
+    // tokens whose characters are all in `SEPARATOR_CHARS`.
+    let mut idx = marker_indices[marker_indices.len() - 1] + 1;
+    while idx < tokens.len()
+        && tokens[idx].chars().all(|ch| SEPARATOR_CHARS.contains(&ch))
+        && is_title_like_label(&labels[idx])
+    {
+        labels[idx] = "O".to_string();
         idx += 1;
     }
+}
+/// Lists the title bases after which a standalone `Ni` is treated as a season
+/// marker, each paired with the season number it stands for.
+fn standalone_ni_season_bases() -> &'static [(&'static str, u8)] {
+    const BASES: &[(&str, u8)] = &[("Kakuriyo no Yadomeshi", 2)];
+    BASES
+}
+/// Looks for the literal `Ni` directly after a title base, allowing only
+/// whitespace in between.
+///
+/// `base_end` is a byte offset into `text` and must lie on a character
+/// boundary within the string (the slice below panics otherwise). On success
+/// the byte range `(start, end)` of `Ni` is returned. The match is
+/// case-sensitive.
+fn standalone_ni_after_base(text: &str, base_end: usize) -> Option<(usize, usize)> {
+    let tail = &text[base_end..];
+    let skipped_whitespace = tail.len() - tail.trim_start().len();
+    let ni_start = base_end + skipped_whitespace;
+    let ni_end = ni_start.checked_add(2)?;
+    match text.get(ni_start..ni_end)? {
+        "Ni" => Some((ni_start, ni_end)),
+        _ => None,
+    }
+}
+/// Reports whether a label's entity (as extracted by `label_entity`) is one of
+/// the title entities, including their `PATH_` variants.
+fn is_title_like_label(label: &str) -> bool {
+    const TITLE_ENTITIES: [&str; 11] = [
+        "TITLE",
+        "TITLE_CHS",
+        "TITLE_CHT",
+        "TITLE_JPN",
+        "TITLE_LATIN",
+        "TITLE_MIXED",
+        "PATH_TITLE_CHS",
+        "PATH_TITLE_CHT",
+        "PATH_TITLE_JPN",
+        "PATH_TITLE_LATIN",
+        "PATH_TITLE_MIXED",
+    ];
+    match label_entity(label) {
+        Some(entity) => TITLE_ENTITIES.contains(&entity),
+        None => false,
+    }
+}
+/// Reports whether a label's entity (as extracted by `label_entity`) is
+/// `SEASON` or `PATH_SEASON`.
+fn is_season_like_label(label: &str) -> bool {
+    let entity = label_entity(label);
+    entity == Some("SEASON") || entity == Some("PATH_SEASON")
+}
+/// Returns the smallest end offset among tokens labelled `EPISODE`.
+///
+/// When no token carries an `EPISODE` label, falls back to
+/// `first_episode_regex_end(text)`, and to `0` if that finds nothing either.
+/// `labels` and `offsets` are walked pairwise.
+fn first_episode_span_end(labels: &[String], offsets: &[(usize, usize)], text: &str) -> usize {
+    let mut earliest: Option<usize> = None;
+    for (label, &(_, token_end)) in labels.iter().zip(offsets) {
+        if label_entity(label) == Some("EPISODE") {
+            earliest = Some(earliest.map_or(token_end, |current| current.min(token_end)));
+        }
+    }
+    earliest
+        .or_else(|| first_episode_regex_end(text))
+        .unwrap_or(0)
 }
 fn bracket_inner_spans(text: &str) -> Vec<(usize, usize)> {
         .collect()
 }
+/// Labels the tokens at `indices` as `entity` via `label_span_indices`, but
+/// only when none of them already belongs to a protected entity (`GROUP`,
+/// `EPISODE`, `SEASON` or `PATH_SEASON`). Does nothing for an empty list.
+fn label_span_if_safe(labels: &mut [String], indices: &[usize], entity: &str) {
+    const PROTECTED: [&str; 4] = ["GROUP", "EPISODE", "SEASON", "PATH_SEASON"];
+    let touches_protected = indices.iter().any(|&idx| match label_entity(&labels[idx]) {
+        Some(existing) => PROTECTED.contains(&existing),
+        None => false,
+    });
+    if indices.is_empty() || touches_protected {
+        return;
+    }
+    label_span_indices(labels, indices, entity);
 }
 fn has_ascii_token_boundaries(text: &str, start: usize, end: usize) -> bool {
     }
 }
+/// Returns the regex for video resolution tokens, compiled on first use and
+/// cached in `RESOLUTION_RE`.
+///
+/// It matches `720p`-style, `4K`-style and `1920x1080`-style tokens that are
+/// not attached to an ASCII letter or digit on either side.
+fn resolution_re() -> &'static FancyRegex {
+    RESOLUTION_RE.get_or_init(|| {
+        let pattern =
+            r"(?i)(?<![A-Za-z0-9])(?:\d{3,4}[pP]|\d[Kk]|\d{3,4}[xX×]\d{3,4})(?![A-Za-z0-9])";
+        FancyRegex::new(pattern).unwrap()
+    })
 }
 fn source_re() -> &'static Regex {
     })
 }
+/// Returns the regex that recognises an episode reference at the very start of
+/// a string, compiled on first use and cached in `EPISODE_CONTEXT_RE`.
+///
+/// After optional whitespace it accepts a `-`/`_` followed by a number or a
+/// special tag (`NCOP`, `OP`, `OVA`, …), a `#` followed by a number, or an
+/// opening bracket followed by an optionally `E`/`EP`/`#`-prefixed number.
+fn episode_context_re() -> &'static Regex {
+    EPISODE_CONTEXT_RE.get_or_init(|| {
+        let pattern = r"(?i)^\s*(?:[-_]\s*(?:\d{1,4}|NCOP|NCED|OP|ED|OVA|OAD|SP|END)\b|#\s*\d{1,4}|[\[\(【《]\s*(?:EP?|#)?\d{1,4})";
+        Regex::new(pattern).unwrap()
+    })
+}
+/// Returns the regex that locates episode spans anywhere in a string, compiled
+/// on first use and cached in `EPISODE_SPAN_RE`.
+///
+/// Alternatives, in order: `S01E02`-style, dash/underscore-separated numbers,
+/// bracketed numbers, and `EP`/`第`/`#`-prefixed numbers with an optional
+/// `话`/`話`/`集` suffix. Each number may carry a `v<N>` version suffix.
+fn episode_span_re() -> &'static FancyRegex {
+    EPISODE_SPAN_RE.get_or_init(|| {
+        let pattern = r"(?i)(?:[Ss]\d{1,2}[Ee]\d{1,4}(?:v\d+)?|(?:^|[\s._])[-_]\s*\d{1,4}(?:v\d+)?(?=$|[\s._\-\]\)】》\[])|[\[\(【《](?:EP?|#)?\d{1,4}(?:v\d+)?[\]\)】》]|(?:^|[\s._\-\[\(【《#])(?:EP?|第|#)\d{1,4}(?:v\d+)?(?:[话話集])?(?=$|[\s._\-\]\)】》]))";
+        FancyRegex::new(pattern).unwrap()
+    })
+}
+/// Returns the regex for romanized Japanese season readings such as
+/// `Ni no Sara` or `Sono Ni`, compiled on first use and cached in
+/// `READING_MARKER_RE`.
+///
+/// Matching is case-insensitive, and the reading (captured as `marker`) must
+/// not be attached to an ASCII letter or digit on either side.
+fn reading_marker_re() -> &'static FancyRegex {
+    READING_MARKER_RE.get_or_init(|| {
+        let pattern = r"(?i)(?<![A-Za-z0-9])(?P<marker>Ni\s+no\s+(?:Sara|Shou|Sho|Syo|Shō)|San\s+no\s+(?:Sara|Shou|Sho|Syo)|(?:Yon|Shi|Shin)\s+no\s+Sara|(?:Go|Gou)\s+no\s+Sara|Ni\s+Gakki|Sono\s+Ni)(?![A-Za-z0-9])";
+        FancyRegex::new(pattern).unwrap()
+    })
+}
+/// Returns the regex for Roman-numeral season markers (II–IX in ASCII capitals
+/// or as Unicode Roman-numeral characters), compiled on first use and cached
+/// in `ROMAN_MARKER_RE`.
+///
+/// Matching is case-sensitive, and the numeral (captured as `marker`) must not
+/// be attached to an ASCII letter or digit on either side.
+fn roman_marker_re() -> &'static FancyRegex {
+    ROMAN_MARKER_RE.get_or_init(|| {
+        let pattern = r"(?<![A-Za-z0-9])(?P<marker>II|III|IV|V|VI|VII|VIII|IX|[ⅡⅢⅣⅤⅥⅦⅧⅨ])(?![A-Za-z0-9])";
+        FancyRegex::new(pattern).unwrap()
+    })
+}
+/// Returns the regex for CJK season markers, compiled on first use and cached
+/// in `CJK_MARKER_RE`.
+///
+/// It matches either a single numeral character with an optional
+/// connector-plus-unit suffix (e.g. `貳`, `二ノ章`), or a `第…季/期/部/章`
+/// ordinal whose number is written with numeral characters and/or digits.
+fn cjk_marker_re() -> &'static Regex {
+    CJK_MARKER_RE.get_or_init(|| {
+        let pattern = r"(?:[一二三四五六七八九十兩两貳贰弐弍參叁参肆伍陸陆柒捌玖](?:\s*(?:ノ|の|之)\s*(?:章|期|季|部))?|第[一二三四五六七八九十兩两貳贰弐弍參叁参肆伍陸陆柒捌玖\d]+[季期部章])";
+        Regex::new(pattern).unwrap()
+    })
+}
+/// Returns the regex that consumes up to two leading bracketed "special" tags
+/// (`[Menu]`, `(NCOP)`, `【SP】`, …), compiled on first use and cached in
+/// `SPECIAL_CONTEXT_PREFIX_RE`.
+///
+/// Because the repetition allows zero occurrences, the regex also matches the
+/// empty prefix of any string.
+fn special_context_prefix_re() -> &'static Regex {
+    SPECIAL_CONTEXT_PREFIX_RE.get_or_init(|| {
+        let pattern = r"(?i)^(?:[\[\(【《]\s*(?:menu|menus|bdmenu|ncop|nced|op|ed|ova|oad|sp)\s*[\]\)】》]\s*){0,2}";
+        Regex::new(pattern).unwrap()
+    })
+}
 fn chars_as_strings(text: &str) -> Vec<String> {
     text.chars().map(|ch| ch.to_string()).collect()
 }
     writer.write_all(&header)?;
     Ok(())
 }
+// Regression tests for the sequel/season marker label repairs applied by
+// `labels_for_char_tokenizer`.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    /// Builds a char-tokenized `SourceRow` for `text`: every token starts as
+    /// `O`, then `title_spans` are labelled `TITLE_LATIN` and `episode_spans`
+    /// are labelled `EPISODE`. Spans are token (character) index ranges.
+    fn char_row(
+        text: &str,
+        title_spans: &[(usize, usize)],
+        episode_spans: &[(usize, usize)],
+    ) -> SourceRow {
+        let tokens = chars_as_strings(text);
+        let mut labels = vec!["O".to_string(); tokens.len()];
+        for (start, end) in title_spans {
+            label_span(&mut labels, *start, *end, "TITLE_LATIN");
+        }
+        for (start, end) in episode_spans {
+            label_span(&mut labels, *start, *end, "EPISODE");
+        }
+        SourceRow {
+            row_index: 0,
+            raw_line: String::new(),
+            filename: Some(text.to_string()),
+            tokens,
+            labels,
+            tokenizer_variant: Some("char".to_string()),
+        }
+    }
+    // A lone CJK numeral (`貳`) before ` - 11` must become a SEASON token, and
+    // the space before it must be released from the title.
+    #[test]
+    fn repairs_cjk_sequel_marker_in_char_fast_path() {
+        let text = "妖怪旅館營業中 貳 - 11";
+        // Byte offsets from `find` are converted to char indices because the
+        // text is not ASCII.
+        let title_end = char_index_at_byte(text, text.find(" - ").unwrap());
+        let episode_start = char_index_at_byte(text, text.find("11").unwrap());
+        let row = char_row(
+            text,
+            &[(0, title_end)],
+            &[(episode_start, episode_start + 2)],
+        );
+        let (_tokens, labels) = labels_for_char_tokenizer(&row);
+        let marker = char_index_at_byte(text, text.find('貳').unwrap());
+        let before_marker = marker - 1;
+        assert_eq!(labels[before_marker], "O");
+        assert_eq!(labels[marker], "B-SEASON");
+        assert_eq!(labels[episode_start], "B-EPISODE");
+    }
+    // A romanized reading (`Ni no Sara`) before ` - 13` must be relabelled as
+    // one contiguous SEASON span.
+    #[test]
+    fn repairs_reading_sequel_marker() {
+        // ASCII-only text, so byte offsets double as char indices here.
+        let text = "Shokugeki no Souma Ni no Sara - 13";
+        let title_end = text.find(" - ").unwrap();
+        let episode_start = text.find("13").unwrap();
+        let row = char_row(
+            text,
+            &[(0, title_end)],
+            &[(episode_start, episode_start + 2)],
+        );
+        let (_tokens, labels) = labels_for_char_tokenizer(&row);
+        let marker_start = text.find("Ni").unwrap();
+        let marker_end = text.find(" - ").unwrap();
+        assert_eq!(labels[marker_start - 1], "O");
+        assert_eq!(labels[marker_start], "B-SEASON");
+        assert!(labels[marker_start + 1..marker_end]
+            .iter()
+            .all(|label| label == "I-SEASON"));
+    }
+    // A plain ASCII digit at the end of a title must stay part of the title;
+    // no SEASON label may be introduced.
+    #[test]
+    fn keeps_numeric_title_suffix_out_of_sequel_repair() {
+        let text = "Kamisama Hajimemashita 2 - 01";
+        let title_end = text.find(" - ").unwrap();
+        let episode_start = text.find("01").unwrap();
+        let row = char_row(
+            text,
+            &[(0, title_end)],
+            &[(episode_start, episode_start + 2)],
+        );
+        let (_tokens, labels) = labels_for_char_tokenizer(&row);
+        let suffix = text.find('2').unwrap();
+        assert_eq!(labels[suffix], "I-TITLE_LATIN");
+        assert!(!labels
+            .iter()
+            .any(|label| label_entity(label) == Some("SEASON")));
+    }
+    // When a SEASON span (`第二期`) is already labelled earlier in the row, a
+    // Roman numeral inside the bracketed alias title must be left untouched.
+    #[test]
+    fn skips_alias_marker_when_season_already_exists() {
+        let text = "樱桃小丸子第二期(Chibi Maruko-chan II)[1439]";
+        let tokens = chars_as_strings(text);
+        let mut labels = vec!["O".to_string(); tokens.len()];
+        let title_end = char_index_at_byte(text, text.find("第二期").unwrap());
+        label_span(&mut labels, 0, title_end, "TITLE_CHS");
+        let season_start = title_end;
+        let season_end = season_start + "第二期".chars().count();
+        label_span(&mut labels, season_start, season_end, "SEASON");
+        let alias_start = char_index_at_byte(text, text.find("Chibi").unwrap());
+        let alias_end = char_index_at_byte(text, text.find(")").unwrap());
+        label_span(&mut labels, alias_start, alias_end, "TITLE_LATIN");
+        let episode_start = char_index_at_byte(text, text.find("1439").unwrap());
+        label_span(&mut labels, episode_start, episode_start + 4, "EPISODE");
+        let row = SourceRow {
+            row_index: 0,
+            raw_line: String::new(),
+            filename: Some(text.to_string()),
+            tokens,
+            labels,
+            tokenizer_variant: Some("char".to_string()),
+        };
+        let (_tokens, labels) = labels_for_char_tokenizer(&row);
+        let roman = char_index_at_byte(text, text.find("II").unwrap());
+        assert_eq!(labels[roman], "I-TITLE_LATIN");
+    }
+}