ModerRAS committed on
Commit
7934324
·
1 Parent(s): 6a5f135

Fix Rust encoded cache label repairs

Browse files
tools/encoded_dataset_cache/Cargo.lock CHANGED
@@ -17,6 +17,7 @@ version = "0.1.0"
17
  dependencies = [
18
  "anyhow",
19
  "clap",
 
20
  "rand",
21
  "rayon",
22
  "regex",
@@ -80,6 +81,21 @@ version = "1.0.102"
80
  source = "registry+https://github.com/rust-lang/crates.io-index"
81
  checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  [[package]]
84
  name = "cfg-if"
85
  version = "1.0.4"
@@ -163,6 +179,17 @@ version = "1.16.0"
163
  source = "registry+https://github.com/rust-lang/crates.io-index"
164
  checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e"
165
 
 
 
 
 
 
 
 
 
 
 
 
166
  [[package]]
167
  name = "getrandom"
168
  version = "0.2.17"
 
17
  dependencies = [
18
  "anyhow",
19
  "clap",
20
+ "fancy-regex",
21
  "rand",
22
  "rayon",
23
  "regex",
 
81
  source = "registry+https://github.com/rust-lang/crates.io-index"
82
  checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
83
 
84
+ [[package]]
85
+ name = "bit-set"
86
+ version = "0.8.0"
87
+ source = "registry+https://github.com/rust-lang/crates.io-index"
88
+ checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
89
+ dependencies = [
90
+ "bit-vec",
91
+ ]
92
+
93
+ [[package]]
94
+ name = "bit-vec"
95
+ version = "0.8.0"
96
+ source = "registry+https://github.com/rust-lang/crates.io-index"
97
+ checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
98
+
99
  [[package]]
100
  name = "cfg-if"
101
  version = "1.0.4"
 
179
  source = "registry+https://github.com/rust-lang/crates.io-index"
180
  checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e"
181
 
182
+ [[package]]
183
+ name = "fancy-regex"
184
+ version = "0.18.0"
185
+ source = "registry+https://github.com/rust-lang/crates.io-index"
186
+ checksum = "e1e1dacd0d2082dfcf1351c4bdd566bbe89a2b263235a2b50058f1e130a47277"
187
+ dependencies = [
188
+ "bit-set",
189
+ "regex-automata",
190
+ "regex-syntax",
191
+ ]
192
+
193
  [[package]]
194
  name = "getrandom"
195
  version = "0.2.17"
tools/encoded_dataset_cache/Cargo.toml CHANGED
@@ -8,6 +8,7 @@ anyhow = "1.0"
8
  clap = { version = "4.5", features = ["derive"] }
9
  rand = "0.8"
10
  rayon = "1.10"
 
11
  regex = "1.11"
12
  serde = { version = "1.0", features = ["derive"] }
13
  serde_json = "1.0"
 
8
  clap = { version = "4.5", features = ["derive"] }
9
  rand = "0.8"
10
  rayon = "1.10"
11
+ fancy-regex = "0.18"
12
  regex = "1.11"
13
  serde = { version = "1.0", features = ["derive"] }
14
  serde_json = "1.0"
tools/encoded_dataset_cache/src/bin/regex_benchmark.rs ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ use anyhow::{ensure, Context, Result};
2
+ use clap::Parser;
3
+ use fancy_regex::Regex as FancyRegex;
4
+ use regex::Regex;
5
+ use serde_json::Value;
6
+ use std::fs::File;
7
+ use std::hint::black_box;
8
+ use std::io::{BufRead, BufReader};
9
+ use std::path::PathBuf;
10
+ use std::time::Instant;
11
+
12
+ const SOURCE_TOKEN_PATTERN: &str = r"WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X|x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|AAC|FLAC|MP3|DTS|Opus|CHS|CHT|GB|BIG5|JPN?|JPSC|JPTC|繁中|简中";
13
+ const RESOLUTION_BODY: &str = r"\d{3,4}[pP]|\d[Kk]|\d{3,4}[xX×]\d{3,4}";
14
+ const SPECIAL_TAG_PATTERN: &str =
15
+ r"(?i)^(?:檢索|检索|搜索|搜寻|搜尋|别名|別名|alias|search|keyword)\s*[::].+";
16
+ const SPECIAL_CODE_PATTERN: &str =
17
+ r"(?i)^(?:NCOP|NCED|OP|ED|PV|CM)\d*$|^IV\d+$|^(?:OVA|OAD|SP)\d*$";
18
+ const EPISODE_CONTEXT_PATTERN: &str = r"(?i)^\s*(?:[-_]\s*(?:\d{1,4}|NCOP|NCED|OP|ED|OVA|OAD|SP|END)\b|#\s*\d{1,4}|[\[\(【《]\s*(?:EP?|#)?\d{1,4})";
19
+ const EPISODE_SPAN_PATTERN: &str = r"(?i)(?:[Ss]\d{1,2}[Ee]\d{1,4}(?:v\d+)?|(?:^|[\s._])[-_]\s*\d{1,4}(?:v\d+)?(?=$|[\s._\-\]\)】》\[])|[\[\(【《](?:EP?|#)?\d{1,4}(?:v\d+)?[\]\)】》]|(?:^|[\s._\-\[\(【《#])(?:EP?|第|#)\d{1,4}(?:v\d+)?(?:[话話集])?(?=$|[\s._\-\]\)】》]))";
20
+ const READING_MARKER_PATTERN: &str = r"(?i)(?<![A-Za-z0-9])(?P<marker>Ni\s+no\s+(?:Sara|Shou|Sho|Syo|Shō)|San\s+no\s+(?:Sara|Shou|Sho|Syo)|(?:Yon|Shi|Shin)\s+no\s+Sara|(?:Go|Gou)\s+no\s+Sara|Ni\s+Gakki|Sono\s+Ni)(?![A-Za-z0-9])";
21
+ const ROMAN_MARKER_PATTERN: &str =
22
+ r"(?<![A-Za-z0-9])(?P<marker>II|III|IV|V|VI|VII|VIII|IX|[ⅡⅢⅣⅤⅥⅦⅧⅨ])(?![A-Za-z0-9])";
23
+ const CJK_MARKER_PATTERN: &str = r"(?:[一二三四五六七八九十兩两貳贰弐弍參叁参肆伍陸陆柒捌玖](?:\s*(?:ノ|の|之)\s*(?:章|期|季|部))?|第[一二三四五六七八九十兩两貳贰弐弍參叁参肆伍陸陆柒捌玖\d]+[季期部章])";
24
+ const SPECIAL_CONTEXT_PREFIX_PATTERN: &str =
25
+ r"(?i)^(?:[\[\(【《]\s*(?:menu|menus|bdmenu|ncop|nced|op|ed|ova|oad|sp)\s*[\]\)】》]\s*){0,2}";
26
+
27
+ #[derive(Parser, Debug)]
28
+ #[command(
29
+ about = "Compare regex vs fancy-regex workload costs for AniFileBERT cache preprocessing"
30
+ )]
31
+ struct Args {
32
+ #[arg(long)]
33
+ input: PathBuf,
34
+
35
+ #[arg(long, default_value_t = 0)]
36
+ limit_rows: usize,
37
+
38
+ #[arg(long, default_value_t = 3)]
39
+ repeat: usize,
40
+ }
41
+
42
+ fn main() -> Result<()> {
43
+ let args = Args::parse();
44
+ ensure!(args.repeat > 0, "--repeat must be greater than 0");
45
+
46
+ let filenames = load_filenames(&args.input, args.limit_rows)?;
47
+ if filenames.is_empty() {
48
+ anyhow::bail!("no filenames loaded from {}", args.input.display());
49
+ }
50
+
51
+ let selective = SelectivePatterns::new()?;
52
+ let fancy_all = FancyAllPatterns::new()?;
53
+
54
+ let (selective_seconds, selective_count) =
55
+ time_repeated(args.repeat, || run_selective(&filenames, &selective))?;
56
+ let (fancy_seconds, fancy_count) =
57
+ time_repeated(args.repeat, || run_fancy_all(&filenames, &fancy_all))?;
58
+ ensure!(
59
+ selective_count == fancy_count,
60
+ "selective and fancy-all match counts differ: selective={}, fancy_all={}",
61
+ selective_count,
62
+ fancy_count
63
+ );
64
+
65
+ let ratio = if selective_seconds > 0.0 {
66
+ fancy_seconds / selective_seconds
67
+ } else {
68
+ 0.0
69
+ };
70
+ println!(
71
+ "{}",
72
+ serde_json::json!({
73
+ "rows": filenames.len(),
74
+ "repeat": args.repeat,
75
+ "selective_seconds": selective_seconds,
76
+ "fancy_all_seconds": fancy_seconds,
77
+ "ratio": ratio,
78
+ "match_count": selective_count,
79
+ })
80
+ );
81
+ Ok(())
82
+ }
83
+
84
+ fn time_repeated<F>(repeat: usize, mut run_once: F) -> Result<(f64, usize)>
85
+ where
86
+ F: FnMut() -> Result<usize>,
87
+ {
88
+ let started = Instant::now();
89
+ let mut count = 0usize;
90
+ for _ in 0..repeat {
91
+ count = count.wrapping_add(black_box(run_once()?));
92
+ }
93
+ Ok((started.elapsed().as_secs_f64(), count))
94
+ }
95
+
96
+ struct SelectivePatterns {
97
+ resolution: Regex,
98
+ source: Regex,
99
+ source_tag: Regex,
100
+ special_tag: Regex,
101
+ special_code: Regex,
102
+ episode_context: Regex,
103
+ episode_span: FancyRegex,
104
+ reading_marker: FancyRegex,
105
+ roman_marker: FancyRegex,
106
+ cjk_marker: Regex,
107
+ special_context_prefix: Regex,
108
+ }
109
+
110
+ impl SelectivePatterns {
111
+ fn new() -> Result<Self> {
112
+ Ok(Self {
113
+ resolution: Regex::new(&format!(r"(?i)(?:{RESOLUTION_BODY})"))?,
114
+ source: Regex::new(&format!(r"(?i)(?:{SOURCE_TOKEN_PATTERN})"))?,
115
+ source_tag: Regex::new(&format!(
116
+ r"(?i)^(?:{SOURCE_TOKEN_PATTERN})(?:\s*(?:[&+/,_-]|,\s*)\s*(?:{SOURCE_TOKEN_PATTERN}))*$"
117
+ ))?,
118
+ special_tag: Regex::new(SPECIAL_TAG_PATTERN)?,
119
+ special_code: Regex::new(SPECIAL_CODE_PATTERN)?,
120
+ episode_context: Regex::new(EPISODE_CONTEXT_PATTERN)?,
121
+ episode_span: FancyRegex::new(EPISODE_SPAN_PATTERN)?,
122
+ reading_marker: FancyRegex::new(READING_MARKER_PATTERN)?,
123
+ roman_marker: FancyRegex::new(ROMAN_MARKER_PATTERN)?,
124
+ cjk_marker: Regex::new(CJK_MARKER_PATTERN)?,
125
+ special_context_prefix: Regex::new(SPECIAL_CONTEXT_PREFIX_PATTERN)?,
126
+ })
127
+ }
128
+ }
129
+
130
+ struct FancyAllPatterns {
131
+ resolution: FancyRegex,
132
+ source: FancyRegex,
133
+ source_tag: FancyRegex,
134
+ special_tag: FancyRegex,
135
+ special_code: FancyRegex,
136
+ episode_context: FancyRegex,
137
+ episode_span: FancyRegex,
138
+ reading_marker: FancyRegex,
139
+ roman_marker: FancyRegex,
140
+ cjk_marker: FancyRegex,
141
+ special_context_prefix: FancyRegex,
142
+ }
143
+
144
+ impl FancyAllPatterns {
145
+ fn new() -> Result<Self> {
146
+ Ok(Self {
147
+ resolution: FancyRegex::new(&format!(r"(?i)(?:{RESOLUTION_BODY})"))?,
148
+ source: FancyRegex::new(&format!(r"(?i)(?:{SOURCE_TOKEN_PATTERN})"))?,
149
+ source_tag: FancyRegex::new(&format!(
150
+ r"(?i)^(?:{SOURCE_TOKEN_PATTERN})(?:\s*(?:[&+/,_-]|,\s*)\s*(?:{SOURCE_TOKEN_PATTERN}))*$"
151
+ ))?,
152
+ special_tag: FancyRegex::new(SPECIAL_TAG_PATTERN)?,
153
+ special_code: FancyRegex::new(SPECIAL_CODE_PATTERN)?,
154
+ episode_context: FancyRegex::new(EPISODE_CONTEXT_PATTERN)?,
155
+ episode_span: FancyRegex::new(EPISODE_SPAN_PATTERN)?,
156
+ reading_marker: FancyRegex::new(READING_MARKER_PATTERN)?,
157
+ roman_marker: FancyRegex::new(ROMAN_MARKER_PATTERN)?,
158
+ cjk_marker: FancyRegex::new(CJK_MARKER_PATTERN)?,
159
+ special_context_prefix: FancyRegex::new(SPECIAL_CONTEXT_PREFIX_PATTERN)?,
160
+ })
161
+ }
162
+ }
163
+
164
+ fn run_selective(filenames: &[String], patterns: &SelectivePatterns) -> Result<usize> {
165
+ let mut count = 0usize;
166
+ for filename in filenames {
167
+ count = count.wrapping_add(
168
+ patterns
169
+ .resolution
170
+ .find_iter(filename)
171
+ .filter(|mat| has_ascii_token_boundaries(filename, mat.start(), mat.end()))
172
+ .count(),
173
+ );
174
+ count = count.wrapping_add(
175
+ patterns
176
+ .source
177
+ .find_iter(filename)
178
+ .filter(|mat| has_ascii_token_boundaries(filename, mat.start(), mat.end()))
179
+ .count(),
180
+ );
181
+ count = count.wrapping_add(patterns.episode_context.is_match(filename) as usize);
182
+ count = count.wrapping_add(patterns.cjk_marker.find_iter(filename).count());
183
+ count = count.wrapping_add(fancy_count(&patterns.episode_span, filename)?);
184
+ count = count.wrapping_add(fancy_count(&patterns.reading_marker, filename)?);
185
+ count = count.wrapping_add(fancy_count(&patterns.roman_marker, filename)?);
186
+
187
+ for inner in bracket_inners(filename) {
188
+ count = count.wrapping_add(patterns.source_tag.is_match(&inner) as usize);
189
+ count = count.wrapping_add(patterns.special_tag.is_match(&inner) as usize);
190
+ count = count.wrapping_add(patterns.special_code.is_match(&inner) as usize);
191
+ count = count.wrapping_add(patterns.special_context_prefix.is_match(&inner) as usize);
192
+ }
193
+ }
194
+ Ok(count)
195
+ }
196
+
197
+ fn run_fancy_all(filenames: &[String], patterns: &FancyAllPatterns) -> Result<usize> {
198
+ let mut count = 0usize;
199
+ for filename in filenames {
200
+ count = count.wrapping_add(fancy_count_with_boundaries(&patterns.resolution, filename)?);
201
+ count = count.wrapping_add(fancy_count_with_boundaries(&patterns.source, filename)?);
202
+ count = count.wrapping_add(patterns.episode_context.is_match(filename)? as usize);
203
+ count = count.wrapping_add(fancy_count(&patterns.cjk_marker, filename)?);
204
+ count = count.wrapping_add(fancy_count(&patterns.episode_span, filename)?);
205
+ count = count.wrapping_add(fancy_count(&patterns.reading_marker, filename)?);
206
+ count = count.wrapping_add(fancy_count(&patterns.roman_marker, filename)?);
207
+
208
+ for inner in bracket_inners(filename) {
209
+ count = count.wrapping_add(patterns.source_tag.is_match(&inner)? as usize);
210
+ count = count.wrapping_add(patterns.special_tag.is_match(&inner)? as usize);
211
+ count = count.wrapping_add(patterns.special_code.is_match(&inner)? as usize);
212
+ count = count.wrapping_add(patterns.special_context_prefix.is_match(&inner)? as usize);
213
+ }
214
+ }
215
+ Ok(count)
216
+ }
217
+
218
+ fn fancy_count(regex: &FancyRegex, text: &str) -> Result<usize> {
219
+ let mut count = 0usize;
220
+ for item in regex.find_iter(text) {
221
+ let _ = item?;
222
+ count += 1;
223
+ }
224
+ Ok(count)
225
+ }
226
+
227
+ fn fancy_count_with_boundaries(regex: &FancyRegex, text: &str) -> Result<usize> {
228
+ let mut count = 0usize;
229
+ for item in regex.find_iter(text) {
230
+ let mat = item?;
231
+ if has_ascii_token_boundaries(text, mat.start(), mat.end()) {
232
+ count += 1;
233
+ }
234
+ }
235
+ Ok(count)
236
+ }
237
+
238
+ fn load_filenames(path: &PathBuf, limit_rows: usize) -> Result<Vec<String>> {
239
+ let file = File::open(path).with_context(|| format!("failed to open {}", path.display()))?;
240
+ let reader = BufReader::new(file);
241
+ let mut filenames = Vec::new();
242
+ for (idx, line) in reader.lines().enumerate() {
243
+ if limit_rows > 0 && filenames.len() >= limit_rows {
244
+ break;
245
+ }
246
+ let raw = line.with_context(|| format!("failed reading line {}", idx + 1))?;
247
+ if raw.trim().is_empty() {
248
+ continue;
249
+ }
250
+ let value: Value = serde_json::from_str(&raw)
251
+ .with_context(|| format!("invalid JSONL line {}", idx + 1))?;
252
+ if let Some(filename) = value.get("filename").and_then(Value::as_str) {
253
+ filenames.push(filename.to_string());
254
+ }
255
+ }
256
+ Ok(filenames)
257
+ }
258
+
259
/// Extracts the text enclosed by each bracket pair in `text`, scanning left to right.
///
/// Recognized pairs are `[]`, `()`, `【】` and `《》`. An opener is paired with the first matching
/// closer that follows it, and scanning resumes right after that closer, so nested openers inside
/// a captured span are not reported separately. An opener with no closer is skipped.
fn bracket_inners(text: &str) -> Vec<String> {
    let mut inners = Vec::new();
    let mut rest = text;
    while let Some(lead) = rest.chars().next() {
        let after_lead = &rest[lead.len_utf8()..];
        let closer = match lead {
            '[' => Some(']'),
            '(' => Some(')'),
            '【' => Some('】'),
            '《' => Some('》'),
            _ => None,
        };
        let closed_at = closer.and_then(|ch| after_lead.find(ch).map(|at| (at, ch)));
        rest = match closed_at {
            Some((at, ch)) => {
                inners.push(after_lead[..at].to_string());
                &after_lead[at + ch.len_utf8()..]
            }
            // Not an opener, or an opener without a closer: move on by one character.
            None => after_lead,
        };
    }
    inners
}
284
+
285
/// Reports whether the byte span `start..end` of `text` is free of ASCII alphanumerics on both
/// sides.
///
/// The character right before `start` and the character right after `end` must each be either
/// absent or not an ASCII letter/digit. `start` and `end` are byte offsets used to slice `text`,
/// so they must lie on character boundaries within the string.
fn has_ascii_token_boundaries(text: &str, start: usize, end: usize) -> bool {
    let (before, after) = (&text[..start], &text[end..]);
    let is_ascii_alnum = |neighbor: Option<char>| -> bool {
        matches!(neighbor, Some(ch) if ch.is_ascii_alphanumeric())
    };
    !is_ascii_alnum(before.chars().next_back()) && !is_ascii_alnum(after.chars().next())
}
298
+
299
#[cfg(test)]
mod tests {
    use super::*;

    /// Both pattern sets must produce the same tally on a handful of representative names.
    #[test]
    fn selective_and_fancy_all_count_the_same_matches() -> Result<()> {
        let filenames: Vec<String> = [
            "[GM-Team][國漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4",
            "[YYDM&VCB-Studio] Shinsekai Yori II [NCED02][Ma10p_1080p][x265_flac].mkv",
            "Witch.Hat.Atelier.S01E07.1080p.NF.WEB-DL.JPN.AAC2.0.H.264.MSubs-ToonsHub",
            "[Lilith-Raws] San no Sara - 03 [Baha][WEB-DL][1080p][AVC AAC][CHT].mp4",
            "[Test] 搜索: 别名 [OVA01][BDRip][720p]",
        ]
        .iter()
        .map(|name| name.to_string())
        .collect();
        let selective = SelectivePatterns::new()?;
        let fancy_all = FancyAllPatterns::new()?;

        let selective_total = run_selective(&filenames, &selective)?;
        let fancy_total = run_fancy_all(&filenames, &fancy_all)?;
        assert_eq!(selective_total, fancy_total);
        Ok(())
    }

    /// Each of the four supported bracket pairs yields its inner text, in order.
    #[test]
    fn bracket_inners_extract_supported_pairs() {
        let inners = bracket_inners("[A](B)【C】《D》");
        assert_eq!(inners, vec!["A", "B", "C", "D"]);
    }

    /// Spans glued to an ASCII letter or digit on either side are rejected.
    #[test]
    fn ascii_token_boundaries_reject_embedded_matches() {
        let text = "ABC1080p 1080p HEVC2 HEVC";
        let cases = [
            (3usize, 8usize, false),
            (9, 14, true),
            (15, 19, false),
            (21, 25, true),
        ];
        for (start, end, expected) in cases.iter().copied() {
            assert_eq!(has_ascii_token_boundaries(text, start, end), expected);
        }
    }
}
tools/encoded_dataset_cache/src/main.rs CHANGED
@@ -1,5 +1,6 @@
1
  use anyhow::{bail, Context, Result};
2
  use clap::Parser;
 
3
  use rand::rngs::StdRng;
4
  use rand::seq::SliceRandom;
5
  use rand::SeedableRng;
@@ -56,11 +57,19 @@ const FALLBACK_LABELS: [&str; 37] = [
56
 
57
  const SOURCE_TOKEN_PATTERN: &str = r"WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X|x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|AAC|FLAC|MP3|DTS|Opus|CHS|CHT|GB|BIG5|JPN?|JPSC|JPTC|繁中|简中";
58
 
59
- static RESOLUTION_RE: OnceLock<Regex> = OnceLock::new();
60
  static SOURCE_RE: OnceLock<Regex> = OnceLock::new();
61
  static SOURCE_TAG_RE: OnceLock<Regex> = OnceLock::new();
62
  static SPECIAL_TAG_RE: OnceLock<Regex> = OnceLock::new();
63
  static SPECIAL_CODE_RE: OnceLock<Regex> = OnceLock::new();
 
 
 
 
 
 
 
 
64
 
65
  #[derive(Parser, Debug)]
66
  #[command(
@@ -457,28 +466,26 @@ fn encode_row(row: &SourceRow, context: &EncodeContext) -> Result<(Vec<u16>, Vec
457
  }
458
 
459
  fn labels_for_char_tokenizer(row: &SourceRow) -> (Vec<String>, Vec<String>) {
460
- if row.tokenizer_variant.as_deref() == Some("char") {
461
- if let Some(filename) = row.filename.as_deref() {
 
 
 
 
462
  let filename_chars = chars_as_strings(filename);
463
  if row.tokens == filename_chars {
464
- return (row.tokens.clone(), row.labels.clone());
465
  }
466
  }
467
- }
468
 
469
- if let Some(filename) = row.filename.as_deref() {
470
- if let Some(projected) = project_labels_from_filename(filename, &row.tokens, &row.labels) {
471
- let (tokens, mut labels) = projected;
472
- repair_structural_meta_labels(filename, &mut labels);
473
  return (tokens, labels);
474
  }
475
  }
476
 
477
- let (tokens, mut labels) = align_tokens_to_chars(&row.tokens, &row.labels);
478
- if let Some(filename) = row.filename.as_deref() {
479
- repair_structural_meta_labels(filename, &mut labels);
480
- }
481
- (tokens, labels)
482
  }
483
 
484
  fn project_labels_from_filename(
@@ -579,11 +586,13 @@ fn align_tokens_to_chars(tokens: &[String], labels: &[String]) -> (Vec<String>,
579
  (char_tokens, char_labels)
580
  }
581
 
582
- fn repair_structural_meta_labels(text: &str, labels: &mut [String]) {
583
- if labels.len() != text.chars().count() {
584
- return;
585
- }
586
- let episode_end = first_episode_span_end(labels);
 
 
587
  for (inner_start, inner_end) in bracket_inner_spans(text) {
588
  let bracket_start = inner_start.saturating_sub(1);
589
  if bracket_start < episode_end {
@@ -595,46 +604,46 @@ fn repair_structural_meta_labels(text: &str, labels: &mut [String]) {
595
  continue;
596
  }
597
  let clean = chars_slice_to_string(&inner, trim_start, trim_end);
598
- let clean_start = inner_start + trim_start;
599
- let clean_end = inner_start + trim_end;
600
 
601
  if special_tag_re().is_match(&clean) || special_code_re().is_match(&clean) {
602
- label_span_if_safe(labels, clean_start, clean_end, "SPECIAL");
 
603
  continue;
604
  }
605
  if source_tag_re().is_match(&clean) {
606
- label_span_if_safe(labels, clean_start, clean_end, "SOURCE");
 
607
  continue;
608
  }
609
 
610
- for mat in resolution_re().find_iter(&inner) {
611
- if !has_ascii_token_boundaries(&inner, mat.start(), mat.end()) {
612
- continue;
613
- }
614
- let start = inner_start + char_index_at_byte(&inner, mat.start());
615
- let end = inner_start + char_index_at_byte(&inner, mat.end());
616
- label_span_if_safe(labels, start, end, "RESOLUTION");
 
617
  }
618
- for mat in source_re().find_iter(&inner) {
619
- if !has_ascii_token_boundaries(&inner, mat.start(), mat.end()) {
620
  continue;
621
  }
622
- let start = inner_start + char_index_at_byte(&inner, mat.start());
623
- let end = inner_start + char_index_at_byte(&inner, mat.end());
624
- label_span_if_safe(labels, start, end, "SOURCE");
 
625
  }
626
  }
627
 
628
- for mat in resolution_re().find_iter(text) {
629
- if !has_ascii_token_boundaries(text, mat.start(), mat.end()) {
630
- continue;
631
- }
632
  let start = char_index_at_byte(text, mat.start());
633
  if start < episode_end {
634
  continue;
635
  }
636
  let end = char_index_at_byte(text, mat.end());
637
- label_span_if_safe(labels, start, end, "RESOLUTION");
 
638
  }
639
  for mat in source_re().find_iter(text) {
640
  if !has_ascii_token_boundaries(text, mat.start(), mat.end()) {
@@ -645,23 +654,461 @@ fn repair_structural_meta_labels(text: &str, labels: &mut [String]) {
645
  continue;
646
  }
647
  let end = char_index_at_byte(text, mat.end());
648
- label_span_if_safe(labels, start, end, "SOURCE");
 
649
  }
650
  }
651
 
652
- fn first_episode_span_end(labels: &[String]) -> usize {
653
- let mut idx = 0usize;
654
- while idx < labels.len() {
655
- if label_entity(&labels[idx]) == Some("EPISODE") {
656
- let mut end = idx + 1;
657
- while end < labels.len() && label_entity(&labels[end]) == Some("EPISODE") {
658
- end += 1;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
659
  }
660
- return end;
661
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
662
  idx += 1;
663
  }
664
- 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
665
  }
666
 
667
  fn bracket_inner_spans(text: &str) -> Vec<(usize, usize)> {
@@ -717,28 +1164,19 @@ fn chars_slice_to_string(text: &str, start: usize, end: usize) -> String {
717
  .collect()
718
  }
719
 
720
- fn label_span_if_safe(labels: &mut [String], start: usize, end: usize, entity: &str) {
721
- if start >= end || end > labels.len() {
722
  return;
723
  }
724
- if labels[start..end].iter().any(|label| {
725
  matches!(
726
- label_entity(label),
727
  Some("GROUP" | "EPISODE" | "SEASON" | "PATH_SEASON")
728
  )
729
  }) {
730
  return;
731
  }
732
- let previous_same = start > 0 && label_entity(&labels[start - 1]) == Some(entity);
733
- let mut first = !previous_same;
734
- for label in labels.iter_mut().take(end).skip(start) {
735
- *label = if first {
736
- format!("B-{entity}")
737
- } else {
738
- format!("I-{entity}")
739
- };
740
- first = false;
741
- }
742
  }
743
 
744
  fn has_ascii_token_boundaries(text: &str, start: usize, end: usize) -> bool {
@@ -764,9 +1202,13 @@ fn label_entity(label: &str) -> Option<&str> {
764
  }
765
  }
766
 
767
- fn resolution_re() -> &'static Regex {
768
- RESOLUTION_RE
769
- .get_or_init(|| Regex::new(r"(?i)(?:\d{3,4}p|\d[kK]|\d{3,4}[xX×]\d{3,4})").unwrap())
 
 
 
 
770
  }
771
 
772
  fn source_re() -> &'static Regex {
@@ -795,6 +1237,60 @@ fn special_code_re() -> &'static Regex {
795
  })
796
  }
797
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
798
  fn chars_as_strings(text: &str) -> Vec<String> {
799
  text.chars().map(|ch| ch.to_string()).collect()
800
  }
@@ -907,3 +1403,123 @@ fn write_npy_header<W: Write>(writer: &mut W, descr: &str, rows: usize, cols: us
907
  writer.write_all(&header)?;
908
  Ok(())
909
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  use anyhow::{bail, Context, Result};
2
  use clap::Parser;
3
+ use fancy_regex::Regex as FancyRegex;
4
  use rand::rngs::StdRng;
5
  use rand::seq::SliceRandom;
6
  use rand::SeedableRng;
 
57
 
58
  const SOURCE_TOKEN_PATTERN: &str = r"WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X|x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|AAC|FLAC|MP3|DTS|Opus|CHS|CHT|GB|BIG5|JPN?|JPSC|JPTC|繁中|简中";
59
 
60
+ static RESOLUTION_RE: OnceLock<FancyRegex> = OnceLock::new();
61
  static SOURCE_RE: OnceLock<Regex> = OnceLock::new();
62
  static SOURCE_TAG_RE: OnceLock<Regex> = OnceLock::new();
63
  static SPECIAL_TAG_RE: OnceLock<Regex> = OnceLock::new();
64
  static SPECIAL_CODE_RE: OnceLock<Regex> = OnceLock::new();
65
+ static EPISODE_CONTEXT_RE: OnceLock<Regex> = OnceLock::new();
66
+ static EPISODE_SPAN_RE: OnceLock<FancyRegex> = OnceLock::new();
67
+ static READING_MARKER_RE: OnceLock<FancyRegex> = OnceLock::new();
68
+ static ROMAN_MARKER_RE: OnceLock<FancyRegex> = OnceLock::new();
69
+ static CJK_MARKER_RE: OnceLock<Regex> = OnceLock::new();
70
+ static SPECIAL_CONTEXT_PREFIX_RE: OnceLock<Regex> = OnceLock::new();
71
+
72
+ const SEPARATOR_CHARS: &[char] = &[' ', '\t', '-', '_', '.', '|', '~', '~'];
73
 
74
  #[derive(Parser, Debug)]
75
  #[command(
 
466
  }
467
 
468
  fn labels_for_char_tokenizer(row: &SourceRow) -> (Vec<String>, Vec<String>) {
469
+ let mut source_labels = row.labels.clone();
470
+
471
+ if let Some(filename) = row.filename.as_deref() {
472
+ repair_known_label_issues(filename, &row.tokens, &mut source_labels);
473
+
474
+ if row.tokenizer_variant.as_deref() == Some("char") {
475
  let filename_chars = chars_as_strings(filename);
476
  if row.tokens == filename_chars {
477
+ return (row.tokens.clone(), source_labels);
478
  }
479
  }
 
480
 
481
+ if let Some(projected) = project_labels_from_filename(filename, &row.tokens, &source_labels)
482
+ {
483
+ let (tokens, labels) = projected;
 
484
  return (tokens, labels);
485
  }
486
  }
487
 
488
+ align_tokens_to_chars(&row.tokens, &source_labels)
 
 
 
 
489
  }
490
 
491
  fn project_labels_from_filename(
 
586
  (char_tokens, char_labels)
587
  }
588
 
589
+ fn repair_structural_meta_labels(
590
+ text: &str,
591
+ _tokens: &[String],
592
+ labels: &mut [String],
593
+ offsets: &[(usize, usize)],
594
+ ) {
595
+ let episode_end = first_episode_span_end(labels, offsets, text);
596
  for (inner_start, inner_end) in bracket_inner_spans(text) {
597
  let bracket_start = inner_start.saturating_sub(1);
598
  if bracket_start < episode_end {
 
604
  continue;
605
  }
606
  let clean = chars_slice_to_string(&inner, trim_start, trim_end);
 
 
607
 
608
  if special_tag_re().is_match(&clean) || special_code_re().is_match(&clean) {
609
+ let indices = token_indices_for_span(offsets, inner_start, inner_end);
610
+ label_span_if_safe(labels, &indices, "SPECIAL");
611
  continue;
612
  }
613
  if source_tag_re().is_match(&clean) {
614
+ let indices = token_indices_for_span(offsets, inner_start, inner_end);
615
+ label_span_if_safe(labels, &indices, "SOURCE");
616
  continue;
617
  }
618
 
619
+ for mat in resolution_re()
620
+ .find_iter(&clean)
621
+ .filter_map(|item| item.ok())
622
+ {
623
+ let start = inner_start + char_index_at_byte(&clean, mat.start());
624
+ let end = inner_start + char_index_at_byte(&clean, mat.end());
625
+ let indices = token_indices_for_span(offsets, start, end);
626
+ label_span_if_safe(labels, &indices, "RESOLUTION");
627
  }
628
+ for mat in source_re().find_iter(&clean) {
629
+ if !has_ascii_token_boundaries(&clean, mat.start(), mat.end()) {
630
  continue;
631
  }
632
+ let start = inner_start + char_index_at_byte(&clean, mat.start());
633
+ let end = inner_start + char_index_at_byte(&clean, mat.end());
634
+ let indices = token_indices_for_span(offsets, start, end);
635
+ label_span_if_safe(labels, &indices, "SOURCE");
636
  }
637
  }
638
 
639
+ for mat in resolution_re().find_iter(text).filter_map(|item| item.ok()) {
 
 
 
640
  let start = char_index_at_byte(text, mat.start());
641
  if start < episode_end {
642
  continue;
643
  }
644
  let end = char_index_at_byte(text, mat.end());
645
+ let indices = token_indices_for_span(offsets, start, end);
646
+ label_span_if_safe(labels, &indices, "RESOLUTION");
647
  }
648
  for mat in source_re().find_iter(text) {
649
  if !has_ascii_token_boundaries(text, mat.start(), mat.end()) {
 
654
  continue;
655
  }
656
  let end = char_index_at_byte(text, mat.end());
657
+ let indices = token_indices_for_span(offsets, start, end);
658
+ label_span_if_safe(labels, &indices, "SOURCE");
659
  }
660
  }
661
 
662
+ fn repair_known_label_issues(text: &str, tokens: &[String], labels: &mut [String]) {
663
+ if tokens.len() != labels.len() {
664
+ return;
665
+ }
666
+ let Some(offsets) = token_offsets_in_text(text, tokens) else {
667
+ return;
668
+ };
669
+
670
+ let quick_text = text.to_lowercase();
671
+ let has_sequel_marker_hint = [
672
+ " II", " III", " IV", " V", " VI", " VII", " VIII", " IX", "Ⅱ", "Ⅲ", "Ⅳ", "Ⅴ", "Ⅵ", "Ⅶ",
673
+ "Ⅷ", "Ⅸ", "之章", "之期", "之季", "之部", "ノ章", "ノ期", "の章", "の期", "貳", "贰", "弐",
674
+ "弍", "參", "叁", "参", "肆", "陸", "陆", "Ni ", " ni ", " no Sara", "Gakki",
675
+ ]
676
+ .iter()
677
+ .any(|needle| text.contains(needle) || quick_text.contains(&needle.to_lowercase()));
678
+
679
+ if has_sequel_marker_hint {
680
+ for (start, end) in find_sequel_season_markers(text) {
681
+ if labels_have_season_before(labels, &offsets, start) {
682
+ continue;
683
+ }
684
+ let indices = token_indices_for_span(&offsets, start, end);
685
+ if indices.is_empty() {
686
+ continue;
687
+ }
688
+ if indices.iter().any(|idx| {
689
+ matches!(
690
+ label_entity(&labels[*idx]),
691
+ Some(
692
+ "GROUP"
693
+ | "EPISODE"
694
+ | "RESOLUTION"
695
+ | "SOURCE"
696
+ | "SPECIAL"
697
+ | "TAG"
698
+ | "PATH_SEASON"
699
+ )
700
+ )
701
+ }) {
702
+ continue;
703
+ }
704
+ if !indices.iter().any(|idx| is_title_like_label(&labels[*idx])) {
705
+ continue;
706
+ }
707
+
708
+ label_span_indices(labels, &indices, "SEASON");
709
+ mark_adjacent_title_separators_o(tokens, labels, &indices);
710
+ }
711
+ }
712
+
713
+ repair_structural_meta_labels(text, tokens, labels, &offsets);
714
+ }
715
+
716
+ fn find_sequel_season_markers(text: &str) -> Vec<(usize, usize)> {
717
+ let mut repairs = Vec::new();
718
+
719
+ for mat in reading_marker_re()
720
+ .find_iter(text)
721
+ .filter_map(|item| item.ok())
722
+ {
723
+ let marker = mat.as_str();
724
+ if season_marker_number(marker).is_none() || !has_episode_context(text, mat.end()) {
725
+ continue;
726
+ }
727
+ repairs.push((
728
+ char_index_at_byte(text, mat.start()),
729
+ char_index_at_byte(text, mat.end()),
730
+ ));
731
+ }
732
+
733
+ for mat in roman_marker_re()
734
+ .find_iter(text)
735
+ .filter_map(|item| item.ok())
736
+ {
737
+ let marker = mat.as_str();
738
+ if season_marker_number(marker).is_none() || !has_episode_context(text, mat.end()) {
739
+ continue;
740
+ }
741
+ repairs.push((
742
+ char_index_at_byte(text, mat.start()),
743
+ char_index_at_byte(text, mat.end()),
744
+ ));
745
+ }
746
+
747
+ for mat in cjk_marker_re().find_iter(text) {
748
+ let marker = mat.as_str();
749
+ if season_marker_number(marker).is_none() || !has_episode_context(text, mat.end()) {
750
+ continue;
751
+ }
752
+ repairs.push((
753
+ char_index_at_byte(text, mat.start()),
754
+ char_index_at_byte(text, mat.end()),
755
+ ));
756
+ }
757
+
758
+ for (base, value) in standalone_ni_season_bases() {
759
+ let mut search_start = 0usize;
760
+ while let Some(relative) = text[search_start..].find(base) {
761
+ let base_start = search_start + relative;
762
+ let base_end = base_start + base.len();
763
+ let Some((ni_start, ni_end)) = standalone_ni_after_base(text, base_end) else {
764
+ search_start = base_end;
765
+ continue;
766
+ };
767
+ if *value == 2
768
+ && has_episode_context(text, ni_end)
769
+ && has_ascii_token_boundaries(text, ni_start, ni_end)
770
+ {
771
+ repairs.push((
772
+ char_index_at_byte(text, ni_start),
773
+ char_index_at_byte(text, ni_end),
774
+ ));
775
+ }
776
+ search_start = base_end;
777
+ }
778
+ }
779
+
780
+ repairs.sort_by_key(|(start, end)| (*start, *end));
781
+ let mut deduped: Vec<(usize, usize)> = Vec::new();
782
+ for repair in repairs {
783
+ if let Some(previous) = deduped.last_mut() {
784
+ if repair.0 < previous.1 {
785
+ if repair.1.saturating_sub(repair.0) > previous.1.saturating_sub(previous.0) {
786
+ *previous = repair;
787
+ }
788
+ continue;
789
+ }
790
+ }
791
+ deduped.push(repair);
792
+ }
793
+ deduped
794
+ }
795
+
796
+ fn season_marker_number(text: &str) -> Option<u8> {
797
+ let clean = clean_marker_text(text);
798
+ if clean.is_empty() {
799
+ return None;
800
+ }
801
+ if let Some(value) = roman_numeral_value(&clean) {
802
+ return Some(value);
803
+ }
804
+
805
+ let lowered = clean
806
+ .split_whitespace()
807
+ .collect::<Vec<_>>()
808
+ .join(" ")
809
+ .to_lowercase();
810
+ if let Some(value) = reading_marker_value(&lowered) {
811
+ return Some(value);
812
+ }
813
+ if lowered == "ni" {
814
+ return Some(2);
815
+ }
816
+
817
+ if clean.starts_with('第') {
818
+ if let Some(last) = clean.chars().last() {
819
+ if matches!(last, '季' | '期' | '部' | '章') {
820
+ let inner = clean
821
+ .chars()
822
+ .skip(1)
823
+ .take(clean.chars().count().saturating_sub(2))
824
+ .collect::<String>();
825
+ return cn_number_to_int(&inner);
826
+ }
827
+ }
828
+ }
829
+
830
+ let cjk_chars = clean.chars().collect::<Vec<_>>();
831
+ if let Some(first) = cjk_chars.first() {
832
+ if let Some(value) = cn_number_to_int(&first.to_string()) {
833
+ let rest = cjk_chars.iter().skip(1).collect::<String>();
834
+ if rest.trim().is_empty() || cjk_marker_suffix_remainder_ok(&rest) {
835
+ return Some(value);
836
  }
 
837
  }
838
+ }
839
+
840
+ None
841
+ }
842
+
843
/// Strips surrounding whitespace and bracket characters from a marker.
///
/// The text is trimmed, then any run of ASCII, CJK or full-width brackets is
/// removed from both ends, then the result is trimmed again. Brackets that sit
/// behind inner whitespace (e.g. `"[ (x) ]"`) are not peeled a second time.
fn clean_marker_text(text: &str) -> String {
    text.trim()
        .trim_matches(|ch: char| {
            matches!(
                ch,
                '[' | ']'
                    | '('
                    | ')'
                    | '【'
                    | '】'
                    | '《'
                    | '》'
                    // Full-width parentheses, written as escapes so they cannot be
                    // confused with (or normalised into) the ASCII pair above.
                    | '\u{FF08}'
                    | '\u{FF09}'
            )
        })
        .trim()
        .to_string()
}
854
+
855
/// Converts a short numeral string to a number.
///
/// Accepts, after trimming: anything `u8::from_str` parses, a single CJK
/// numeral known to `cn_digit_value`, and the compound forms `十X` (10 + X),
/// `X十` (X * 10) and `X十Y` (X * 10 + Y) where X and Y are CJK digits 1-9.
///
/// Returns `None` for empty input and for any compound that contains a glyph
/// that is not a 1-9 digit, instead of silently treating that glyph as zero.
fn cn_number_to_int(text: &str) -> Option<u8> {
    let text = text.trim();
    if text.is_empty() {
        return None;
    }
    if let Ok(value) = text.parse::<u8>() {
        return Some(value);
    }
    if let Some(value) = cn_digit_value(text) {
        return Some(value);
    }

    // Only 1-9 may stand next to '十'; '十' itself is excluded so that "十十"
    // is rejected rather than read as 20.
    let unit = |ch: char| {
        let mut buf = [0u8; 4];
        cn_digit_value(ch.encode_utf8(&mut buf)).filter(|value| *value < 10)
    };

    let chars: Vec<char> = text.chars().collect();
    match chars.as_slice() {
        ['十', ones] => Some(10 + unit(*ones)?),
        [tens, '十'] => Some(unit(*tens)? * 10),
        // Largest possible result is 99, so the arithmetic cannot overflow u8.
        [tens, '十', ones] => Some(unit(*tens)? * 10 + unit(*ones)?),
        _ => None,
    }
}

/// Maps a single CJK numeral (common, traditional, simplified or financial
/// variants) to its value, 1 through 10. Returns `None` for anything else.
fn cn_digit_value(text: &str) -> Option<u8> {
    match text {
        "一" => Some(1),
        "二" | "兩" | "两" | "貳" | "贰" | "弐" | "弍" => Some(2),
        "三" | "參" | "叁" | "参" => Some(3),
        "四" | "肆" => Some(4),
        "五" | "伍" => Some(5),
        "六" | "陸" | "陆" => Some(6),
        "七" | "柒" => Some(7),
        "八" | "捌" => Some(8),
        "九" | "玖" => Some(9),
        "十" => Some(10),
        _ => None,
    }
}
897
+
898
/// Looks up an upper-case roman numeral (ASCII spelling or the single Unicode
/// numeral glyph) for the values 2 through 9. Anything else yields `None`.
fn roman_numeral_value(text: &str) -> Option<u8> {
    // (ASCII spelling, Unicode glyph, value)
    const NUMERALS: [(&str, &str, u8); 8] = [
        ("II", "Ⅱ", 2),
        ("III", "Ⅲ", 3),
        ("IV", "Ⅳ", 4),
        ("V", "Ⅴ", 5),
        ("VI", "Ⅵ", 6),
        ("VII", "Ⅶ", 7),
        ("VIII", "Ⅷ", 8),
        ("IX", "Ⅸ", 9),
    ];

    NUMERALS
        .iter()
        .find(|&&(ascii, glyph, _)| text == ascii || text == glyph)
        .map(|&(_, _, value)| value)
}
911
+
912
/// Maps a romanised Japanese sequel reading to its season number.
///
/// The comparison is exact, so the caller is expected to pass lowercased text
/// with single spaces between words. Unknown readings yield `None`.
fn reading_marker_value(text: &str) -> Option<u8> {
    let value = match text {
        "ni no sara" | "ni no shou" | "ni no sho" | "ni no syo" | "ni no shō" | "ni gakki"
        | "sono ni" => 2,
        "san no sara" | "san no shou" | "san no sho" | "san no syo" => 3,
        "yon no sara" | "shi no sara" | "shin no sara" => 4,
        "go no sara" | "gou no sara" => 5,
        _ => return None,
    };
    Some(value)
}
922
+
923
/// Checks whether the text following a CJK numeral is an accepted suffix.
///
/// Ignoring all whitespace, the remainder must be exactly two glyphs: a
/// connector (`ノ`, `の` or `之`) followed by a unit (`章`, `期`, `季` or `部`).
fn cjk_marker_suffix_remainder_ok(rest: &str) -> bool {
    let mut glyphs = rest.chars().filter(|ch| !ch.is_whitespace());
    let (Some(connector), Some(unit), None) = (glyphs.next(), glyphs.next(), glyphs.next())
    else {
        return false;
    };
    matches!(connector, 'ノ' | 'の' | '之') && matches!(unit, '章' | '期' | '季' | '部')
}
941
+
942
+ fn has_episode_context(text: &str, marker_end_byte: usize) -> bool {
943
+ let tail = &text[marker_end_byte..];
944
+ if episode_context_re().is_match(tail) {
945
+ return true;
946
+ }
947
+
948
+ let mut tail = tail.trim_start();
949
+ if let Some(ch) = tail.chars().next() {
950
+ if matches!(ch, ']' | ')' | '】' | '》') {
951
+ tail = &tail[ch.len_utf8()..];
952
+ tail = tail.trim_start();
953
+ }
954
+ }
955
+ if let Some(mat) = special_context_prefix_re().find(tail) {
956
+ tail = &tail[mat.end()..];
957
+ }
958
+ episode_context_re().is_match(tail)
959
+ }
960
+
961
+ fn first_episode_regex_end(text: &str) -> Option<usize> {
962
+ episode_span_re()
963
+ .find_iter(text)
964
+ .filter_map(|item| item.ok())
965
+ .map(|mat| char_index_at_byte(text, mat.end()))
966
+ .next()
967
+ }
968
+
969
+ fn labels_have_season_before(
970
+ labels: &[String],
971
+ offsets: &[(usize, usize)],
972
+ marker_start: usize,
973
+ ) -> bool {
974
+ labels
975
+ .iter()
976
+ .zip(offsets.iter())
977
+ .any(|(label, (_start, end))| is_season_like_label(label) && *end <= marker_start)
978
+ }
979
+
980
/// Lists the indices of all tokens whose `(start, end)` offsets overlap the
/// half-open span `start..end`. Tokens that merely touch the span do not count.
fn token_indices_for_span(offsets: &[(usize, usize)], start: usize, end: usize) -> Vec<usize> {
    offsets
        .iter()
        .enumerate()
        .filter(|&(_, &(token_start, token_end))| token_start < end && token_end > start)
        .map(|(idx, _)| idx)
        .collect()
}
993
+
994
/// Test helper: tags `labels[start..end]` with BIO labels for `entity`.
///
/// The first label is `B-<entity>` unless the label just before `start`
/// already carries the same entity, in which case the whole range is `I-`.
#[cfg(test)]
fn label_span(labels: &mut [String], start: usize, end: usize, entity: &str) {
    let continues_previous = start > 0 && label_entity(&labels[start - 1]) == Some(entity);
    for (offset, label) in labels.iter_mut().take(end).skip(start).enumerate() {
        let prefix = if offset == 0 && !continues_previous {
            "B"
        } else {
            "I"
        };
        *label = format!("{prefix}-{entity}");
    }
}
1007
+
1008
+ fn label_span_indices(labels: &mut [String], indices: &[usize], entity: &str) {
1009
+ if indices.is_empty() {
1010
+ return;
1011
+ }
1012
+ let previous_same = indices[0] > 0 && label_entity(&labels[indices[0] - 1]) == Some(entity);
1013
+ let mut first = !previous_same;
1014
+ for idx in indices {
1015
+ labels[*idx] = if first {
1016
+ format!("B-{entity}")
1017
+ } else {
1018
+ format!("I-{entity}")
1019
+ };
1020
+ first = false;
1021
+ }
1022
+ }
1023
+
1024
+ fn mark_adjacent_title_separators_o(
1025
+ tokens: &[String],
1026
+ labels: &mut [String],
1027
+ marker_indices: &[usize],
1028
+ ) {
1029
+ if marker_indices.is_empty() {
1030
+ return;
1031
+ }
1032
+
1033
+ let mut idx = marker_indices[0];
1034
+ while idx > 0 {
1035
+ let prev = idx - 1;
1036
+ if !tokens[prev].trim().is_empty() || !is_title_like_label(&labels[prev]) {
1037
+ break;
1038
+ }
1039
+ labels[prev] = "O".to_string();
1040
+ idx = prev;
1041
+ }
1042
+
1043
+ let mut idx = marker_indices[marker_indices.len() - 1] + 1;
1044
+ while idx < tokens.len()
1045
+ && tokens[idx].chars().all(|ch| SEPARATOR_CHARS.contains(&ch))
1046
+ && is_title_like_label(&labels[idx])
1047
+ {
1048
+ labels[idx] = "O".to_string();
1049
  idx += 1;
1050
  }
1051
+ }
1052
+
1053
/// Base titles after which a bare "Ni" may be read as a season marker, each
/// paired with the season number it stands for.
fn standalone_ni_season_bases() -> &'static [(&'static str, u8)] {
    static BASES: [(&str, u8); 1] = [("Kakuriyo no Yadomeshi", 2)];
    &BASES
}
1056
+
1057
/// Finds a case-sensitive "Ni" that follows `base_end`, skipping any
/// whitespace in between.
///
/// `base_end` is a byte offset into `text` on a character boundary. Returns
/// the byte range of "Ni", or `None` when the next non-whitespace text is
/// something else.
fn standalone_ni_after_base(text: &str, base_end: usize) -> Option<(usize, usize)> {
    let tail = &text[base_end..];
    let after_gap = tail.trim_start();
    if !after_gap.starts_with("Ni") {
        return None;
    }
    let ni_start = base_end + (tail.len() - after_gap.len());
    Some((ni_start, ni_start + "Ni".len()))
}
1072
+
1073
+ fn is_title_like_label(label: &str) -> bool {
1074
+ matches!(
1075
+ label_entity(label),
1076
+ Some(
1077
+ "TITLE"
1078
+ | "TITLE_CHS"
1079
+ | "TITLE_CHT"
1080
+ | "TITLE_JPN"
1081
+ | "TITLE_LATIN"
1082
+ | "TITLE_MIXED"
1083
+ | "PATH_TITLE_CHS"
1084
+ | "PATH_TITLE_CHT"
1085
+ | "PATH_TITLE_JPN"
1086
+ | "PATH_TITLE_LATIN"
1087
+ | "PATH_TITLE_MIXED"
1088
+ )
1089
+ )
1090
+ }
1091
+
1092
+ fn is_season_like_label(label: &str) -> bool {
1093
+ matches!(label_entity(label), Some("SEASON" | "PATH_SEASON"))
1094
+ }
1095
+
1096
+ fn first_episode_span_end(labels: &[String], offsets: &[(usize, usize)], text: &str) -> usize {
1097
+ let ends = labels
1098
+ .iter()
1099
+ .zip(offsets.iter())
1100
+ .filter_map(|(label, (_start, end))| {
1101
+ if label_entity(label) == Some("EPISODE") {
1102
+ Some(*end)
1103
+ } else {
1104
+ None
1105
+ }
1106
+ })
1107
+ .collect::<Vec<_>>();
1108
+ if let Some(end) = ends.into_iter().min() {
1109
+ return end;
1110
+ }
1111
+ first_episode_regex_end(text).unwrap_or(0)
1112
  }
1113
 
1114
  fn bracket_inner_spans(text: &str) -> Vec<(usize, usize)> {
 
1164
  .collect()
1165
  }
1166
 
1167
+ fn label_span_if_safe(labels: &mut [String], indices: &[usize], entity: &str) {
1168
+ if indices.is_empty() {
1169
  return;
1170
  }
1171
+ if indices.iter().any(|idx| {
1172
  matches!(
1173
+ label_entity(&labels[*idx]),
1174
  Some("GROUP" | "EPISODE" | "SEASON" | "PATH_SEASON")
1175
  )
1176
  }) {
1177
  return;
1178
  }
1179
+ label_span_indices(labels, indices, entity);
 
 
 
 
 
 
 
 
 
1180
  }
1181
 
1182
  fn has_ascii_token_boundaries(text: &str, start: usize, end: usize) -> bool {
 
1202
  }
1203
  }
1204
 
1205
+ fn resolution_re() -> &'static FancyRegex {
1206
+ RESOLUTION_RE.get_or_init(|| {
1207
+ FancyRegex::new(
1208
+ r"(?i)(?<![A-Za-z0-9])(?:\d{3,4}[pP]|\d[Kk]|\d{3,4}[xX×]\d{3,4})(?![A-Za-z0-9])",
1209
+ )
1210
+ .unwrap()
1211
+ })
1212
  }
1213
 
1214
  fn source_re() -> &'static Regex {
 
1237
  })
1238
  }
1239
 
1240
+ fn episode_context_re() -> &'static Regex {
1241
+ EPISODE_CONTEXT_RE.get_or_init(|| {
1242
+ Regex::new(
1243
+ r"(?i)^\s*(?:[-_]\s*(?:\d{1,4}|NCOP|NCED|OP|ED|OVA|OAD|SP|END)\b|#\s*\d{1,4}|[\[\(【《]\s*(?:EP?|#)?\d{1,4})",
1244
+ )
1245
+ .unwrap()
1246
+ })
1247
+ }
1248
+
1249
+ fn episode_span_re() -> &'static FancyRegex {
1250
+ EPISODE_SPAN_RE.get_or_init(|| {
1251
+ FancyRegex::new(
1252
+ r"(?i)(?:[Ss]\d{1,2}[Ee]\d{1,4}(?:v\d+)?|(?:^|[\s._])[-_]\s*\d{1,4}(?:v\d+)?(?=$|[\s._\-\]\)】》\[])|[\[\(【《](?:EP?|#)?\d{1,4}(?:v\d+)?[\]\)】》]|(?:^|[\s._\-\[\(【《#])(?:EP?|第|#)\d{1,4}(?:v\d+)?(?:[话話集])?(?=$|[\s._\-\]\)】》]))",
1253
+ )
1254
+ .unwrap()
1255
+ })
1256
+ }
1257
+
1258
+ fn reading_marker_re() -> &'static FancyRegex {
1259
+ READING_MARKER_RE.get_or_init(|| {
1260
+ FancyRegex::new(
1261
+ r"(?i)(?<![A-Za-z0-9])(?P<marker>Ni\s+no\s+(?:Sara|Shou|Sho|Syo|Shō)|San\s+no\s+(?:Sara|Shou|Sho|Syo)|(?:Yon|Shi|Shin)\s+no\s+Sara|(?:Go|Gou)\s+no\s+Sara|Ni\s+Gakki|Sono\s+Ni)(?![A-Za-z0-9])",
1262
+ )
1263
+ .unwrap()
1264
+ })
1265
+ }
1266
+
1267
+ fn roman_marker_re() -> &'static FancyRegex {
1268
+ ROMAN_MARKER_RE.get_or_init(|| {
1269
+ FancyRegex::new(
1270
+ r"(?<![A-Za-z0-9])(?P<marker>II|III|IV|V|VI|VII|VIII|IX|[ⅡⅢⅣⅤⅥⅦⅧⅨ])(?![A-Za-z0-9])",
1271
+ )
1272
+ .unwrap()
1273
+ })
1274
+ }
1275
+
1276
+ fn cjk_marker_re() -> &'static Regex {
1277
+ CJK_MARKER_RE.get_or_init(|| {
1278
+ Regex::new(
1279
+ r"(?:[一二三四五六七八九十兩两貳贰弐弍參叁参肆伍陸陆柒捌玖](?:\s*(?:ノ|の|之)\s*(?:章|期|季|部))?|第[一二三四五六七八九十兩两貳贰弐弍參叁参肆伍陸陆柒捌玖\d]+[季期部章])",
1280
+ )
1281
+ .unwrap()
1282
+ })
1283
+ }
1284
+
1285
+ fn special_context_prefix_re() -> &'static Regex {
1286
+ SPECIAL_CONTEXT_PREFIX_RE.get_or_init(|| {
1287
+ Regex::new(
1288
+ r"(?i)^(?:[\[\(【《]\s*(?:menu|menus|bdmenu|ncop|nced|op|ed|ova|oad|sp)\s*[\]\)】》]\s*){0,2}",
1289
+ )
1290
+ .unwrap()
1291
+ })
1292
+ }
1293
+
1294
/// Splits `text` into one owned `String` per Unicode scalar value.
fn chars_as_strings(text: &str) -> Vec<String> {
    text.chars().map(String::from).collect()
}
 
1403
  writer.write_all(&header)?;
1404
  Ok(())
1405
  }
1406
+
1407
#[cfg(test)]
mod tests {
    use super::*;

    /// Wraps per-character tokens and their labels in a char-tokenizer row.
    fn row_with_labels(text: &str, tokens: Vec<String>, labels: Vec<String>) -> SourceRow {
        SourceRow {
            row_index: 0,
            raw_line: String::new(),
            filename: Some(text.to_string()),
            tokens,
            labels,
            tokenizer_variant: Some("char".to_string()),
        }
    }

    /// Builds a row whose title spans are `TITLE_LATIN` and whose episode spans
    /// are `EPISODE`; everything else is `O`.
    fn char_row(
        text: &str,
        title_spans: &[(usize, usize)],
        episode_spans: &[(usize, usize)],
    ) -> SourceRow {
        let tokens = chars_as_strings(text);
        let mut labels = vec!["O".to_string(); tokens.len()];
        for &(start, end) in title_spans {
            label_span(&mut labels, start, end, "TITLE_LATIN");
        }
        for &(start, end) in episode_spans {
            label_span(&mut labels, start, end, "EPISODE");
        }
        row_with_labels(text, tokens, labels)
    }

    /// Byte offset of the first occurrence of `needle`.
    fn byte_pos(text: &str, needle: &str) -> usize {
        text.find(needle).unwrap()
    }

    /// Character index of the first occurrence of `needle`.
    fn char_pos(text: &str, needle: &str) -> usize {
        char_index_at_byte(text, byte_pos(text, needle))
    }

    #[test]
    fn repairs_cjk_sequel_marker_in_char_fast_path() {
        let text = "妖怪旅館營業中 貳 - 11";
        let title_end = char_pos(text, " - ");
        let episode_start = char_pos(text, "11");
        let row = char_row(
            text,
            &[(0, title_end)],
            &[(episode_start, episode_start + 2)],
        );

        let (_tokens, labels) = labels_for_char_tokenizer(&row);
        let marker = char_pos(text, "貳");

        assert_eq!(labels[marker - 1], "O");
        assert_eq!(labels[marker], "B-SEASON");
        assert_eq!(labels[episode_start], "B-EPISODE");
    }

    #[test]
    fn repairs_reading_sequel_marker() {
        // ASCII-only text, so byte offsets double as character indices.
        let text = "Shokugeki no Souma Ni no Sara - 13";
        let title_end = byte_pos(text, " - ");
        let episode_start = byte_pos(text, "13");
        let row = char_row(
            text,
            &[(0, title_end)],
            &[(episode_start, episode_start + 2)],
        );

        let (_tokens, labels) = labels_for_char_tokenizer(&row);
        let marker_start = byte_pos(text, "Ni");
        let marker_end = title_end;

        assert_eq!(labels[marker_start - 1], "O");
        assert_eq!(labels[marker_start], "B-SEASON");
        assert!(labels[marker_start + 1..marker_end]
            .iter()
            .all(|label| label == "I-SEASON"));
    }

    #[test]
    fn keeps_numeric_title_suffix_out_of_sequel_repair() {
        // ASCII-only text, so byte offsets double as character indices.
        let text = "Kamisama Hajimemashita 2 - 01";
        let title_end = byte_pos(text, " - ");
        let episode_start = byte_pos(text, "01");
        let row = char_row(
            text,
            &[(0, title_end)],
            &[(episode_start, episode_start + 2)],
        );

        let (_tokens, labels) = labels_for_char_tokenizer(&row);
        let suffix = byte_pos(text, "2");

        assert_eq!(labels[suffix], "I-TITLE_LATIN");
        assert!(!labels
            .iter()
            .any(|label| label_entity(label) == Some("SEASON")));
    }

    #[test]
    fn skips_alias_marker_when_season_already_exists() {
        let text = "樱桃小丸子第二期(Chibi Maruko-chan II)[1439]";
        let tokens = chars_as_strings(text);
        let mut labels = vec!["O".to_string(); tokens.len()];

        let season_start = char_pos(text, "第二期");
        let season_end = season_start + "第二期".chars().count();
        label_span(&mut labels, 0, season_start, "TITLE_CHS");
        label_span(&mut labels, season_start, season_end, "SEASON");

        let alias_start = char_pos(text, "Chibi");
        let alias_end = char_pos(text, ")");
        label_span(&mut labels, alias_start, alias_end, "TITLE_LATIN");

        let episode_start = char_pos(text, "1439");
        label_span(&mut labels, episode_start, episode_start + 4, "EPISODE");

        let row = row_with_labels(text, tokens, labels);

        let (_tokens, labels) = labels_for_char_tokenizer(&row);
        let roman = char_pos(text, "II");

        assert_eq!(labels[roman], "I-TITLE_LATIN");
    }
}