ModerRAS committed on
Commit
1bef818
·
1 Parent(s): 651ad49

Add tests for combo generator

Browse files
tools/virtual_dataset_generator/Cargo.lock CHANGED
@@ -2,6 +2,15 @@
2
  # It is not intended for manual editing.
3
  version = 4
4
 
 
 
 
 
 
 
 
 
 
5
  [[package]]
6
  name = "anifilebert-virtual-dataset-generator"
7
  version = "0.1.0"
@@ -10,6 +19,7 @@ dependencies = [
10
  "clap",
11
  "rand",
12
  "rayon",
 
13
  "serde",
14
  "serde_json",
15
  ]
@@ -277,6 +287,35 @@ dependencies = [
277
  "crossbeam-utils",
278
  ]
279
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
  [[package]]
281
  name = "serde"
282
  version = "1.0.228"
 
2
  # It is not intended for manual editing.
3
  version = 4
4
 
5
+ [[package]]
6
+ name = "aho-corasick"
7
+ version = "1.1.4"
8
+ source = "registry+https://github.com/rust-lang/crates.io-index"
9
+ checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
10
+ dependencies = [
11
+ "memchr",
12
+ ]
13
+
14
  [[package]]
15
  name = "anifilebert-virtual-dataset-generator"
16
  version = "0.1.0"
 
19
  "clap",
20
  "rand",
21
  "rayon",
22
+ "regex",
23
  "serde",
24
  "serde_json",
25
  ]
 
287
  "crossbeam-utils",
288
  ]
289
 
290
+ [[package]]
291
+ name = "regex"
292
+ version = "1.12.3"
293
+ source = "registry+https://github.com/rust-lang/crates.io-index"
294
+ checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
295
+ dependencies = [
296
+ "aho-corasick",
297
+ "memchr",
298
+ "regex-automata",
299
+ "regex-syntax",
300
+ ]
301
+
302
+ [[package]]
303
+ name = "regex-automata"
304
+ version = "0.4.14"
305
+ source = "registry+https://github.com/rust-lang/crates.io-index"
306
+ checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
307
+ dependencies = [
308
+ "aho-corasick",
309
+ "memchr",
310
+ "regex-syntax",
311
+ ]
312
+
313
+ [[package]]
314
+ name = "regex-syntax"
315
+ version = "0.8.10"
316
+ source = "registry+https://github.com/rust-lang/crates.io-index"
317
+ checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
318
+
319
  [[package]]
320
  name = "serde"
321
  version = "1.0.228"
tools/virtual_dataset_generator/Cargo.toml CHANGED
@@ -6,6 +6,7 @@ edition = "2021"
6
  [dependencies]
7
  anyhow = "1.0"
8
  clap = { version = "4.5", features = ["derive"] }
 
9
  rand = "0.8"
10
  rayon = "1.10"
11
  serde = { version = "1.0", features = ["derive"] }
 
6
  [dependencies]
7
  anyhow = "1.0"
8
  clap = { version = "4.5", features = ["derive"] }
9
+ regex = "1.11"
10
  rand = "0.8"
11
  rayon = "1.10"
12
  serde = { version = "1.0", features = ["derive"] }
tools/virtual_dataset_generator/src/bin/case_combo_generator.rs ADDED
@@ -0,0 +1,595 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ use anyhow::{Context, Result};
2
+ use clap::Parser;
3
+ use rand::rngs::StdRng;
4
+ use rand::seq::SliceRandom;
5
+ use rand::Rng;
6
+ use rand::SeedableRng;
7
+ use regex::Regex;
8
+ use serde::{Deserialize, Serialize};
9
+ use serde_json::Value;
10
+ use std::collections::{HashMap, HashSet};
11
+ use std::fs::File;
12
+ use std::io::{BufRead, BufReader, BufWriter, Write};
13
+ use std::path::PathBuf;
14
+
15
+ #[derive(Parser, Debug)]
16
+ #[command(
17
+ about = "Generate combo-heavy AniFileBERT char samples from real DMHY rows",
18
+ version
19
+ )]
20
+ struct Args {
21
+ #[arg(long)]
22
+ input: PathBuf,
23
+
24
+ #[arg(long)]
25
+ output: PathBuf,
26
+
27
+ #[arg(long, action = clap::ArgAction::Append)]
28
+ failure_report: Vec<PathBuf>,
29
+
30
+ #[arg(long, default_value_t = 12_000)]
31
+ selected_samples: usize,
32
+
33
+ #[arg(long, default_value_t = 60_000)]
34
+ context_samples: usize,
35
+
36
+ #[arg(long, default_value_t = 4)]
37
+ combos_per_row: usize,
38
+
39
+ #[arg(long, default_value_t = 42)]
40
+ seed: u64,
41
+ }
42
+
43
+ #[derive(Clone, Debug, Serialize, Deserialize)]
44
+ struct CharRow {
45
+ filename: String,
46
+ tokens: Vec<String>,
47
+ labels: Vec<String>,
48
+ #[serde(default)]
49
+ tokenizer_variant: Option<String>,
50
+ #[serde(default)]
51
+ source: Option<String>,
52
+ }
53
+
54
+ fn main() -> Result<()> {
55
+ let args = Args::parse();
56
+ let target_re = Regex::new(
57
+ r"(?ix)(?:\b(?:S\d{1,2}|Season\s*\d{1,2}|第\s*\d+\s*[季期部章]|[ⅡⅢⅣⅤ]|II|III|IV|V)\b|\b(?:NCOP|NCED|OP|ED|PV|CM|TVCM|OVA|OAD|SP)\d*\b|(?:劇場版|特别篇|特別篇|ステージ|イベント|Event|Movie|PV|CM)\b|\b(?:Blu[-_ ]?ray\s*&\s*DVD|BD[-_ ]?BOX|Disc\.?\s*\d+|Vol\.?\s*\d+)\b|\b(?:h\.?264|x\.?264|h\.?265|x\.?265|AVC[-_ ]?YUV|yuv\d+p?\d*|AAC\([^)]*\))\b)",
58
+ )?;
59
+
60
+ let targets = failure_filenames(&args.failure_report)?;
61
+ let input =
62
+ File::open(&args.input).with_context(|| format!("open {}", args.input.display()))?;
63
+ let reader = BufReader::new(input);
64
+ let mut rng = StdRng::seed_from_u64(args.seed);
65
+
66
+ let mut selected_rows: Vec<CharRow> = Vec::new();
67
+ let mut context_rows: Vec<CharRow> = Vec::new();
68
+ let mut selected_seen = 0usize;
69
+ let mut context_seen = 0usize;
70
+ let mut seen_filenames = HashSet::new();
71
+ let mut total_rows = 0usize;
72
+
73
+ for line in reader.lines() {
74
+ let line = line?;
75
+ let line = line.trim();
76
+ if line.is_empty() {
77
+ continue;
78
+ }
79
+ total_rows += 1;
80
+ let row: CharRow = serde_json::from_str(line)
81
+ .with_context(|| format!("parse JSONL line {}", total_rows))?;
82
+ if row.filename.is_empty() || !seen_filenames.insert(row.filename.clone()) {
83
+ continue;
84
+ }
85
+ if targets.contains(&row.filename) || target_re.is_match(&row.filename) {
86
+ selected_seen += 1;
87
+ reservoir_push(
88
+ &mut selected_rows,
89
+ row,
90
+ args.selected_samples,
91
+ selected_seen,
92
+ &mut rng,
93
+ );
94
+ } else {
95
+ context_seen += 1;
96
+ reservoir_push(
97
+ &mut context_rows,
98
+ row,
99
+ args.context_samples,
100
+ context_seen,
101
+ &mut rng,
102
+ );
103
+ }
104
+ }
105
+
106
+ let mut out_rows: Vec<CharRow> = Vec::new();
107
+ let mut source_counts: HashMap<String, usize> = HashMap::new();
108
+
109
+ for row in &selected_rows {
110
+ let mut base = row.clone();
111
+ base.source = Some("combo_selected_base".to_string());
112
+ push_row(&mut out_rows, base, &mut source_counts);
113
+
114
+ let mut variants = build_combo_variants(row, 128);
115
+ variants.shuffle(&mut rng);
116
+ for variant in variants.into_iter().take(args.combos_per_row) {
117
+ push_row(&mut out_rows, variant, &mut source_counts);
118
+ }
119
+ }
120
+
121
+ for row in &context_rows {
122
+ let mut ctx = row.clone();
123
+ ctx.source = Some("combo_context".to_string());
124
+ push_row(&mut out_rows, ctx, &mut source_counts);
125
+ }
126
+
127
+ out_rows.shuffle(&mut rng);
128
+ let output =
129
+ File::create(&args.output).with_context(|| format!("create {}", args.output.display()))?;
130
+ let mut writer = BufWriter::new(output);
131
+ for row in &out_rows {
132
+ serde_json::to_writer(&mut writer, row)?;
133
+ writer.write_all(b"\n")?;
134
+ }
135
+ writer.flush()?;
136
+
137
+ let summary = serde_json::json!({
138
+ "input": args.input,
139
+ "output": args.output,
140
+ "total_rows": total_rows,
141
+ "failure_targets": targets.len(),
142
+ "selected_rows": selected_rows.len(),
143
+ "context_rows": context_rows.len(),
144
+ "written_rows": out_rows.len(),
145
+ "source_counts": source_counts,
146
+ });
147
+ println!("{}", serde_json::to_string_pretty(&summary)?);
148
+ Ok(())
149
+ }
150
+
151
+ fn reservoir_push<T: Clone>(
152
+ rows: &mut Vec<T>,
153
+ item: T,
154
+ limit: usize,
155
+ seen_count: usize,
156
+ rng: &mut StdRng,
157
+ ) {
158
+ if limit == 0 {
159
+ return;
160
+ }
161
+ if rows.len() < limit {
162
+ rows.push(item);
163
+ return;
164
+ }
165
+ let index = rng.gen_range(0..seen_count);
166
+ if index < limit {
167
+ rows[index] = item;
168
+ }
169
+ }
170
+
171
+ fn push_row(row: &mut Vec<CharRow>, item: CharRow, counts: &mut HashMap<String, usize>) {
172
+ let source = item.source.clone().unwrap_or_else(|| "unknown".to_string());
173
+ *counts.entry(source).or_insert(0) += 1;
174
+ row.push(item);
175
+ }
176
+
177
+ fn failure_filenames(report_paths: &[PathBuf]) -> Result<HashSet<String>> {
178
+ let mut filenames = HashSet::new();
179
+ for path in report_paths {
180
+ if !path.exists() {
181
+ continue;
182
+ }
183
+ let text = std::fs::read_to_string(path)
184
+ .with_context(|| format!("read failure report {}", path.display()))?;
185
+ let report: Value = serde_json::from_str(&text)
186
+ .with_context(|| format!("parse failure report {}", path.display()))?;
187
+ if let Some(modes) = report.get("modes").and_then(|v| v.as_object()) {
188
+ for mode in modes.values() {
189
+ if let Some(mode_obj) = mode.as_object() {
190
+ if let Some(failures) = mode_obj.get("failures").and_then(|v| v.as_array()) {
191
+ for failure in failures {
192
+ if let Some(filename) = failure.get("filename").and_then(|v| v.as_str())
193
+ {
194
+ filenames.insert(filename.to_string());
195
+ }
196
+ }
197
+ }
198
+ if let Some(results) = mode_obj.get("results").and_then(|v| v.as_array()) {
199
+ for result in results {
200
+ if result.get("ok").and_then(|v| v.as_bool()) == Some(true) {
201
+ continue;
202
+ }
203
+ if let Some(filename) = result.get("filename").and_then(|v| v.as_str())
204
+ {
205
+ filenames.insert(filename.to_string());
206
+ }
207
+ }
208
+ }
209
+ }
210
+ }
211
+ }
212
+ }
213
+ Ok(filenames)
214
+ }
215
+
216
+ fn build_combo_variants(row: &CharRow, max_chars: usize) -> Vec<CharRow> {
217
+ let entities = extract_entities_from_labels(&row.tokens, &row.labels);
218
+ let title = first_value(&entities, "TITLE");
219
+ let season = first_value(&entities, "SEASON");
220
+ let episode = first_value(&entities, "EPISODE");
221
+ let special = first_value(&entities, "SPECIAL");
222
+ let resolution = first_value(&entities, "RESOLUTION");
223
+ let source = first_value(&entities, "SOURCE");
224
+
225
+ let mut specs: Vec<(String, Vec<(String, String)>, &'static str)> = Vec::new();
226
+ if let Some(title) = title.clone() {
227
+ specs.push((
228
+ title.clone(),
229
+ vec![(title.clone(), "TITLE".to_string())],
230
+ "combo_title",
231
+ ));
232
+ if let Some(season) = season.clone() {
233
+ specs.push((
234
+ format!("{title} {season}"),
235
+ vec![
236
+ (title.clone(), "TITLE".to_string()),
237
+ (season.clone(), "SEASON".to_string()),
238
+ ],
239
+ "combo_title_season",
240
+ ));
241
+ if let Some(episode) = episode.clone() {
242
+ specs.push((
243
+ format!("{title} {season} {episode}"),
244
+ vec![
245
+ (title.clone(), "TITLE".to_string()),
246
+ (season.clone(), "SEASON".to_string()),
247
+ (episode.clone(), "EPISODE".to_string()),
248
+ ],
249
+ "combo_title_season_episode",
250
+ ));
251
+ if let (Some(resolution), Some(source)) = (resolution.clone(), source.clone()) {
252
+ specs.push((
253
+ format!("{title} {season} {episode} [{resolution}][{source}]"),
254
+ vec![
255
+ (title.clone(), "TITLE".to_string()),
256
+ (season.clone(), "SEASON".to_string()),
257
+ (episode.clone(), "EPISODE".to_string()),
258
+ (resolution.clone(), "RESOLUTION".to_string()),
259
+ (source.clone(), "SOURCE".to_string()),
260
+ ],
261
+ "combo_title_season_episode_resolution_source",
262
+ ));
263
+ }
264
+ }
265
+ }
266
+ }
267
+ if let Some(episode) = episode.clone() {
268
+ if let Some(resolution) = resolution.clone() {
269
+ specs.push((
270
+ format!("{episode} [{resolution}]"),
271
+ vec![
272
+ (episode.clone(), "EPISODE".to_string()),
273
+ (resolution.clone(), "RESOLUTION".to_string()),
274
+ ],
275
+ "combo_episode_resolution",
276
+ ));
277
+ if let Some(source) = source.clone() {
278
+ specs.push((
279
+ format!("{episode} [{resolution}][{source}]"),
280
+ vec![
281
+ (episode.clone(), "EPISODE".to_string()),
282
+ (resolution.clone(), "RESOLUTION".to_string()),
283
+ (source, "SOURCE".to_string()),
284
+ ],
285
+ "combo_episode_resolution_source",
286
+ ));
287
+ }
288
+ }
289
+ }
290
+ if let Some(special) = special.clone() {
291
+ specs.push((
292
+ special.clone(),
293
+ vec![(special.clone(), "SPECIAL".to_string())],
294
+ "combo_special_only",
295
+ ));
296
+ }
297
+ if let (Some(title), Some(special)) = (title.clone(), special.clone()) {
298
+ specs.push((
299
+ format!("{title} - {special}"),
300
+ vec![
301
+ (title.clone(), "TITLE".to_string()),
302
+ (special.clone(), "SPECIAL".to_string()),
303
+ ],
304
+ "combo_title_special",
305
+ ));
306
+ if let Some(episode) = episode.clone() {
307
+ specs.push((
308
+ format!("{title} - {special} [{episode}]"),
309
+ vec![
310
+ (title.clone(), "TITLE".to_string()),
311
+ (special.clone(), "SPECIAL".to_string()),
312
+ (episode, "EPISODE".to_string()),
313
+ ],
314
+ "combo_title_special_episode",
315
+ ));
316
+ }
317
+ if let (Some(resolution), Some(source)) = (resolution.clone(), source.clone()) {
318
+ specs.push((
319
+ format!("{title} - {special} [{resolution}][{source}]"),
320
+ vec![
321
+ (title, "TITLE".to_string()),
322
+ (special, "SPECIAL".to_string()),
323
+ (resolution.clone(), "RESOLUTION".to_string()),
324
+ (source, "SOURCE".to_string()),
325
+ ],
326
+ "combo_title_special_resolution_source",
327
+ ));
328
+ }
329
+ }
330
+ if let (Some(title), Some(resolution), Some(source)) =
331
+ (title, resolution.clone(), source.clone())
332
+ {
333
+ specs.push((
334
+ format!("{title} [{resolution}][{source}]"),
335
+ vec![
336
+ (title.clone(), "TITLE".to_string()),
337
+ (resolution.clone(), "RESOLUTION".to_string()),
338
+ (source, "SOURCE".to_string()),
339
+ ],
340
+ "combo_title_resolution_source",
341
+ ));
342
+ }
343
+
344
+ let mut variants = Vec::new();
345
+ let mut seen_texts = HashSet::new();
346
+ for (text, spans, source_name) in specs {
347
+ if text.len() < 2 || text.len() > max_chars || !seen_texts.insert(text.clone()) {
348
+ continue;
349
+ }
350
+ if let Some(item) = char_item(&text, &spans, source_name) {
351
+ variants.push(item);
352
+ }
353
+ }
354
+ variants
355
+ }
356
+
357
/// Groups BIO-labelled tokens into entity strings.
///
/// `tokens` and `labels` are walked in lockstep (extra elements of the longer
/// slice are ignored). A `B-<ENTITY>` label always opens a new span; an
/// `I-<ENTITY>` label extends the open span when the entity matches and
/// otherwise opens a new one; any other label closes the open span.
///
/// Returns a map from entity name to the concatenated token text of each of
/// its spans, in order of appearance.
fn extract_entities_from_labels(
    tokens: &[String],
    labels: &[String],
) -> HashMap<String, Vec<String>> {
    let mut found: HashMap<String, Vec<String>> = HashMap::new();
    // The span currently being built: (entity name, text accumulated so far).
    let mut open: Option<(&str, String)> = None;

    for (piece, tag) in tokens.iter().zip(labels) {
        let begins = tag.strip_prefix("B-");
        let continues = if begins.is_none() {
            tag.strip_prefix("I-")
        } else {
            None
        };

        // An I- label matching the open entity simply extends it.
        if let (Some(name), Some((current, text))) = (continues, open.as_mut()) {
            if *current == name {
                text.push_str(piece);
                continue;
            }
        }

        // Every other case ends whatever span was open...
        if let Some((name, text)) = open.take() {
            found.entry(name.to_string()).or_default().push(text);
        }
        // ...and B- / mismatched I- labels start a fresh one at this token.
        if let Some(name) = begins.or(continues) {
            open = Some((name, piece.clone()));
        }
    }

    // Flush a span that runs to the end of the input.
    if let Some((name, text)) = open {
        found.entry(name.to_string()).or_default().push(text);
    }
    found
}
415
+
416
/// Returns the first value recorded for `name` that is not blank, with
/// surrounding whitespace trimmed; `None` when the entity is absent or all of
/// its values are blank.
fn first_value(entities: &HashMap<String, Vec<String>>, name: &str) -> Option<String> {
    let candidates = entities.get(name)?;
    candidates
        .iter()
        .map(|raw| raw.trim())
        .find(|trimmed| !trimmed.is_empty())
        .map(str::to_string)
}
422
+
423
+ fn char_item(filename: &str, spans: &[(String, String)], source: &str) -> Option<CharRow> {
424
+ let mut labels = vec!["O".to_string(); filename.chars().count()];
425
+ let tokens: Vec<String> = filename.chars().map(|c| c.to_string()).collect();
426
+ let mut cursor = 0usize;
427
+ for (text, entity) in spans {
428
+ if text.is_empty() {
429
+ continue;
430
+ }
431
+ if let Some(start) = find_substring(filename, text, cursor) {
432
+ let end = start + text.chars().count();
433
+ if start < labels.len() {
434
+ labels[start] = format!("B-{entity}");
435
+ for idx in (start + 1)..end.min(labels.len()) {
436
+ labels[idx] = format!("I-{entity}");
437
+ }
438
+ cursor = end;
439
+ continue;
440
+ }
441
+ }
442
+ if let Some(start) = find_substring(filename, text, 0) {
443
+ let end = start + text.chars().count();
444
+ if start < labels.len() {
445
+ labels[start] = format!("B-{entity}");
446
+ for idx in (start + 1)..end.min(labels.len()) {
447
+ labels[idx] = format!("I-{entity}");
448
+ }
449
+ cursor = end;
450
+ continue;
451
+ }
452
+ }
453
+ return None;
454
+ }
455
+
456
+ Some(CharRow {
457
+ filename: filename.to_string(),
458
+ tokens,
459
+ labels,
460
+ tokenizer_variant: Some("char".to_string()),
461
+ source: Some(source.to_string()),
462
+ })
463
+ }
464
+
465
/// Finds the first occurrence of `needle` in `haystack` at or after the
/// character position `start_char`.
///
/// Both the start position and the returned position are measured in
/// characters (Unicode scalar values), not bytes. A `start_char` at or past
/// the end of `haystack` searches the empty tail, so only an empty `needle`
/// can match there.
fn find_substring(haystack: &str, needle: &str, start_char: usize) -> Option<usize> {
    // Translate the character offset to a byte offset without materialising
    // every char index; past-the-end maps to the end of the string.
    let start_byte = haystack
        .char_indices()
        .nth(start_char)
        .map_or(haystack.len(), |(byte_pos, _)| byte_pos);
    let tail = &haystack[start_byte..];
    let byte_offset = tail.find(needle)?;
    // Convert the byte offset of the match back into a character offset.
    Some(start_char + tail[..byte_offset].chars().count())
}
480
+
481
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::ops::Range;
    use std::time::{SystemTime, UNIX_EPOCH};

    /// Returns a path in the system temp directory that embeds the current
    /// time in nanoseconds and the process id.
    fn unique_temp_path(prefix: &str) -> PathBuf {
        let stamp = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .expect("clock drift")
            .as_nanos();
        std::env::temp_dir().join(format!("{prefix}_{stamp}_{}.json", std::process::id()))
    }

    /// Builds a labelled row via `char_item`, panicking if a span cannot be placed.
    fn make_row(filename: &str, spans: &[(String, String)], source: &str) -> CharRow {
        char_item(filename, spans, source).expect("expected valid char row")
    }

    /// Asserts that `labels[range]` is exactly one `B-<entity>` followed by
    /// `I-<entity>` for the rest of the range.
    fn assert_span(labels: &[String], range: Range<usize>, entity: &str) {
        let mut expected = vec![format!("B-{entity}")];
        expected.resize(range.len(), format!("I-{entity}"));
        assert_eq!(&labels[range], expected.as_slice());
    }

    #[test]
    fn failure_filenames_collects_failed_results_and_deduplicates() {
        let path = unique_temp_path("case_combo_report");
        let report = serde_json::json!({
            "modes": {
                "model_only": {
                    "failures": [
                        {"filename": "a.mkv"},
                        {"filename": "a.mkv"}
                    ],
                    "results": [
                        {"filename": "b.mkv", "ok": false},
                        {"filename": "c.mkv", "ok": true}
                    ]
                },
                "normalized_only": {
                    "results": [
                        {"filename": "d.mkv", "ok": false}
                    ]
                }
            }
        });
        fs::write(&path, serde_json::to_string(&report).unwrap()).unwrap();

        let outcome = failure_filenames(&[path.clone()]);
        // Clean up before asserting so a failing assertion does not leak the file.
        let _ = fs::remove_file(&path);

        let set = outcome.unwrap();
        assert!(set.contains("a.mkv"));
        assert!(set.contains("b.mkv"));
        assert!(set.contains("d.mkv"));
        assert!(!set.contains("c.mkv"));
        assert_eq!(set.len(), 3);
    }

    #[test]
    fn build_combo_variants_includes_full_episode_source_combo() {
        let spans: Vec<(String, String)> = [
            ("One Piece", "TITLE"),
            ("Season 21", "SEASON"),
            ("1110", "EPISODE"),
            ("1080p", "RESOLUTION"),
            ("WEB-DL", "SOURCE"),
        ]
        .iter()
        .map(|&(text, entity)| (text.to_string(), entity.to_string()))
        .collect();
        let row = make_row(
            "One Piece Season 21 1110 [1080p][WEB-DL].mkv",
            &spans,
            "combo_seed",
        );

        let variants = build_combo_variants(&row, 128);
        let combo = variants
            .iter()
            .find(|item| {
                item.source.as_deref() == Some("combo_title_season_episode_resolution_source")
            })
            .expect("missing title+season+episode+resolution+source combo");
        assert_eq!(combo.filename, "One Piece Season 21 1110 [1080p][WEB-DL]");

        // Entity spans, as character ranges into the generated filename.
        assert_span(&combo.labels, 0..9, "TITLE");
        assert_span(&combo.labels, 10..19, "SEASON");
        assert_span(&combo.labels, 20..24, "EPISODE");
        assert_span(&combo.labels, 26..31, "RESOLUTION");
        assert_span(&combo.labels, 33..39, "SOURCE");

        // Separators and brackets between the spans stay unlabelled.
        for idx in [9, 19, 24, 25, 31, 32, 39] {
            assert_eq!(combo.labels[idx], "O", "label at index {idx}");
        }
    }
}