ModerRAS committed on
Commit
b57780c
·
1 Parent(s): e458112

Add parser diagnostics and inference debugging

Browse files

- add diagnose_pipeline.py for BIO validation, tokenizer alignment, entity stats, truncation, UNK, and confusion analysis

- add diagnostics reports for char and regex DMHY datasets

- add inference debug output, constrained BIO decoding, checkpoint max-length handling, and rule-assisted parsing

Files changed (4) hide show
  1. diagnose_pipeline.py +709 -0
  2. diagnostics_report.md +277 -0
  3. diagnostics_report_word.md +2678 -0
  4. inference.py +355 -35
diagnose_pipeline.py ADDED
@@ -0,0 +1,709 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Diagnostics for the anime filename NER pipeline.
2
+
3
+ The checks focus on structured filename parsing failure modes:
4
+
5
+ - train/inference tokenizer mismatch
6
+ - BIO legality and boundary drift
7
+ - tokenizer split and vocabulary coverage
8
+ - label/entity distribution
9
+ - optional model confusion on a sampled validation split
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import json
16
+ import math
17
+ import os
18
+ import random
19
+ import re
20
+ from collections import Counter, defaultdict
21
+ from pathlib import Path
22
+ from typing import Dict, Iterable, List, Optional, Tuple
23
+
24
+ import numpy as np
25
+ import torch
26
+ from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
27
+ from transformers import BertForTokenClassification
28
+
29
+ from config import Config
30
+ from dataset import align_tokens_for_tokenizer
31
+ from tokenizer import AnimeTokenizer, create_tokenizer, load_tokenizer
32
+
33
+
34
def iter_jsonl(path: Path, limit: Optional[int] = None) -> Iterable[dict]:
    """Yield the JSON objects stored one-per-line in ``path``.

    Args:
        path: JSON Lines file to read (decoded as UTF-8).
        limit: Maximum number of records to yield. ``None`` means no limit;
            zero or a negative value yields nothing.

    Yields:
        The decoded value of every non-blank line, in file order.

    Raises:
        ValueError: If a non-blank line is not valid JSON. The message carries
            the path and the 1-based physical line number.
    """
    if limit is not None and limit <= 0:
        return

    yielded = 0
    with path.open("r", encoding="utf-8") as handle:
        for line_no, line in enumerate(handle, 1):
            line = line.strip()
            if not line:
                # Blank lines are skipped and must not consume the row budget,
                # otherwise "--sample-limit N" returns fewer than N rows.
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as exc:
                raise ValueError(f"{path}:{line_no}: invalid JSON") from exc
            yield record
            yielded += 1
            if limit is not None and yielded >= limit:
                return
46
+
47
+
48
def detect_dataset_variant(samples: List[dict], vocab_file: Optional[str]) -> str:
    """Guess which tokenizer variant produced ``samples``.

    Evidence is consulted in this order:

    1. ``tokenizer_variant`` values stored on the samples: a single distinct
       value is returned as-is, several distinct values give ``"mixed"``.
    2. The vocab file name: ``".char"`` in its basename gives ``"char"``.
    3. Sample structure: ``"char"`` when at least 95% of the samples that carry
       a ``filename`` have ``tokens`` equal to the filename split into
       characters.

    Falls back to ``"regex"`` when nothing else matches.
    """
    declared = set()
    for sample in samples:
        variant = sample.get("tokenizer_variant")
        if variant:
            declared.add(variant)
    if declared:
        return declared.pop() if len(declared) == 1 else "mixed"

    if vocab_file and ".char" in os.path.basename(vocab_file).lower():
        return "char"

    named = [sample for sample in samples if sample.get("filename") is not None]
    if named:
        char_rows = sum(
            1 for sample in named if sample.get("tokens") == list(sample.get("filename"))
        )
        if char_rows / len(named) >= 0.95:
            return "char"
    return "regex"
68
+
69
+
70
def entity_type(label: str) -> Optional[str]:
    """Return the part of ``label`` after its first ``-``, or ``None`` if it has none.

    For BIO tags this strips the ``B-`` / ``I-`` prefix (``"B-TITLE"`` ->
    ``"TITLE"``); ``"O"`` and any other hyphen-free label give ``None``.
    """
    _prefix, separator, suffix = label.partition("-")
    return suffix if separator else None
74
+
75
+
76
def bio_violations(tokens: List[str], labels: List[str]) -> List[dict]:
    """Scan a label sequence for BIO irregularities.

    Three kinds of finding are reported:

    - ``B_DIRECT_TO_O``: an ``O`` that directly follows a ``B-X`` tag.
    - ``ORPHAN_I``: an ``I-X`` whose previous label has no ``-`` (e.g. ``O`` or
      the start of the sequence) or names a different entity.
    - ``UNKNOWN_LABEL``: anything that is not ``O``, ``B-*`` or ``I-*``.

    Args:
        tokens: Tokens aligned with ``labels``; may be shorter, in which case
            the reported token is ``None``.
        labels: BIO label strings.

    Returns:
        One dict per finding with keys ``type``, ``index``, ``prev_label``,
        ``label`` and ``token``, in sequence order.
    """
    violations: List[dict] = []
    # A virtual "O" precedes the sequence, so a leading I-X counts as an orphan.
    previous_label = "O"

    for idx, label in enumerate(labels):
        kind: Optional[str] = None
        if label == "O":
            if previous_label.startswith("B-"):
                kind = "B_DIRECT_TO_O"
        elif label.startswith("B-"):
            pass  # a B- tag may legally follow any label
        elif label.startswith("I-"):
            _, separator, previous_entity = previous_label.partition("-")
            # No separator means the previous label carries no entity at all.
            if not separator or previous_entity != label[2:]:
                kind = "ORPHAN_I"
        else:
            kind = "UNKNOWN_LABEL"

        if kind is not None:
            violations.append(
                {
                    "type": kind,
                    "index": idx,
                    "prev_label": previous_label,
                    "label": label,
                    "token": tokens[idx] if idx < len(tokens) else None,
                }
            )
        previous_label = label

    return violations
125
+
126
+
127
def spans_from_labels(tokens: List[str], labels: List[str]) -> List[dict]:
    """Group BIO-labelled tokens into entity spans.

    A ``B-X`` tag always opens a new span. An ``I-X`` tag extends the open span
    when it has the same entity type; otherwise it closes the open span (if
    any) and opens a new one, so orphan ``I-X`` runs still become spans. Any
    other label closes the open span.

    Args:
        tokens: Tokens aligned with ``labels``.
        labels: BIO label strings. Only the positions present in both lists
            are considered.

    Returns:
        Dicts with ``type``, ``start`` (inclusive index), ``end`` (exclusive
        index) and ``text`` (the span's tokens joined without a separator).
    """
    spans: List[dict] = []
    start: Optional[int] = None
    current_type: Optional[str] = None
    current_tokens: List[str] = []

    def close_span(end: int) -> None:
        # Emit the currently open span, if there is one.
        if current_type is not None and start is not None:
            spans.append(
                {
                    "type": current_type,
                    "start": start,
                    "end": end,
                    "text": "".join(current_tokens),
                }
            )

    consumed = 0
    for idx, (token, label) in enumerate(zip(tokens, labels)):
        consumed = idx + 1
        begins = label.startswith("B-")
        continues = label.startswith("I-")
        if continues and current_type == label[2:]:
            current_tokens.append(token)
            continue

        close_span(idx)
        if begins or continues:
            current_type, start, current_tokens = label[2:], idx, [token]
        else:
            current_type, start, current_tokens = None, None, []

    # Close at the number of pairs actually iterated; len(labels) overshoots
    # when tokens is the shorter list.
    close_span(consumed)
    return spans
186
+
187
+
188
def count_entities(samples: List[dict]) -> Counter:
    """Count entity spans per entity type across ``samples``.

    Rows without ``tokens`` or ``labels`` are treated as empty instead of
    raising ``KeyError``, matching how ``main`` reads the same rows with
    ``.get``.

    Returns:
        A ``Counter`` mapping entity type to the number of spans found by
        ``spans_from_labels``.
    """
    counts: Counter = Counter()
    for sample in samples:
        spans = spans_from_labels(sample.get("tokens", []), sample.get("labels", []))
        counts.update(span["type"] for span in spans)
    return counts
194
+
195
+
196
def percentile(values: List[int], pct: float) -> int:
    """Return the nearest-rank percentile of ``values``.

    Args:
        values: Observations; need not be sorted.
        pct: Percentile on a 0-100 scale. Values outside that range are
            clamped to the minimum / maximum element.

    Returns:
        The element at the rounded rank ``pct / 100 * (n - 1)`` of the sorted
        values, or ``0`` when ``values`` is empty.
    """
    if not values:
        return 0
    ordered = sorted(values)
    last = len(ordered) - 1
    rank = round((pct / 100) * last)
    # Clamp both ends: a negative rank would otherwise index from the end of
    # the list and silently return one of the largest values.
    return ordered[max(0, min(last, rank))]
202
+
203
+
204
def token_mismatch(sample: dict, tokenizer: AnimeTokenizer) -> Optional[dict]:
    """Compare a sample's stored tokens with a fresh tokenization of its filename.

    Args:
        sample: Dataset row; ``filename`` and ``tokens`` are read, ``file_id``
            is copied into the result.
        tokenizer: Object whose ``tokenize(filename)`` result is compared
            against ``sample["tokens"]``.

    Returns:
        ``None`` when the sample has no filename or both token lists are
        equal. Otherwise a dict with the length of the shared prefix, the
        first 40 tokens of each side and both full lengths.
    """
    filename = sample.get("filename")
    if filename is None:
        return None

    inferred = tokenizer.tokenize(filename)
    dataset_tokens = sample.get("tokens", [])
    if inferred == dataset_tokens:
        return None

    # Index of the first differing pair; if no pair differs, one list is a
    # prefix of the other and the shared prefix is the shorter length.
    shorter = min(len(inferred), len(dataset_tokens))
    common_prefix = next(
        (
            position
            for position, (left, right) in enumerate(zip(inferred, dataset_tokens))
            if left != right
        ),
        shorter,
    )
    return {
        "file_id": sample.get("file_id"),
        "filename": filename,
        "common_prefix": common_prefix,
        "dataset_tokens": dataset_tokens[:40],
        "tokenizer_tokens": inferred[:40],
        "dataset_len": len(dataset_tokens),
        "tokenizer_len": len(inferred),
    }
226
+
227
+
228
def format_counter(counter: Counter, total: Optional[int] = None, limit: Optional[int] = None) -> str:
    """Render a ``Counter`` as a markdown bullet list, most common first.

    Args:
        counter: Counts to render.
        total: Denominator for the percentage column; defaults to the sum of
            all counts. A zero denominator renders every share as 0.00%.
        limit: Maximum number of entries to show; ``None`` shows all.

    Returns:
        One bullet per entry joined by newlines, or ``"- none"`` when there is
        nothing to show.
    """
    denominator = sum(counter.values()) if total is None else total
    bullets = []
    for key, count in counter.most_common(limit):
        share = count / denominator * 100 if denominator else 0.0
        bullets.append(f"- `{key}`: {count:,} ({share:.2f}%)")
    if not bullets:
        return "- none"
    return "\n".join(bullets)
237
+
238
+
239
def token_id_stats(samples: List[dict], tokenizer: AnimeTokenizer) -> dict:
    """Measure how many aligned tokens map to the tokenizer's unknown id.

    Rows that lack ``tokens`` / ``labels`` are treated as empty, and rows whose
    token and label counts differ are skipped. ``main`` reports such rows as
    ``LENGTH_MISMATCH`` and excludes them from its other statistics, so they
    should not crash or skew this one.

    Args:
        samples: Dataset rows.
        tokenizer: Tokenizer used for alignment and id lookup; it must provide
            ``convert_tokens_to_ids`` and ``unk_token_id``.

    Returns:
        Dict with ``total`` (tokens seen), ``unk`` (tokens mapped to the
        unknown id), ``unk_rate`` (``unk / total``, 0.0 when empty) and
        ``top_unk`` (the 25 most frequent unknown tokens with counts).
    """
    total = 0
    unk_counter: Counter = Counter()
    for sample in samples:
        raw_tokens = sample.get("tokens", [])
        raw_labels = sample.get("labels", [])
        if len(raw_tokens) != len(raw_labels):
            continue
        tokens, _labels = align_tokens_for_tokenizer(raw_tokens, raw_labels, tokenizer)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        unk_id = tokenizer.unk_token_id
        for token, token_id in zip(tokens, ids):
            total += 1
            if token_id == unk_id:
                unk_counter[token] += 1

    unk = sum(unk_counter.values())
    return {
        "total": total,
        "unk": unk,
        "unk_rate": unk / total if total else 0.0,
        "top_unk": unk_counter.most_common(25),
    }
257
+
258
+
259
def prepare_inputs(
    tokens: List[str],
    labels: List[str],
    tokenizer: AnimeTokenizer,
    label2id: Dict[str, int],
    max_length: int,
) -> Tuple[List[int], List[int], List[int], List[str]]:
    """Build fixed-length model inputs for one labelled sample.

    The sample is aligned with ``align_tokens_for_tokenizer``, wrapped in the
    tokenizer's CLS/SEP ids, truncated to ``max_length`` (the final id is
    kept) and right-padded with the pad id.

    Args:
        tokens: Dataset tokens.
        labels: Labels aligned with ``tokens``.
        tokenizer: Supplies id conversion and the CLS/SEP/PAD ids.
        label2id: Label-to-id mapping; labels missing from it map to id 0.
        max_length: Target sequence length including the two special tokens.

    Returns:
        ``(input_ids, attention_mask, label_ids, aligned_tokens)``. Special
        and padding positions carry label id ``-100``. ``aligned_tokens`` is
        the aligned token list before truncation.
    """
    tokens, labels = align_tokens_for_tokenizer(tokens, labels, tokenizer)

    body_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = [tokenizer.cls_token_id] + body_ids + [tokenizer.sep_token_id]
    label_ids = [-100] + [label2id.get(label, 0) for label in labels] + [-100]

    if len(input_ids) > max_length:
        # Keep the first element, as much content as fits, and the last element.
        kept = slice(1, max_length - 1)
        input_ids = [input_ids[0], *input_ids[kept], input_ids[-1]]
        label_ids = [label_ids[0], *label_ids[kept], label_ids[-1]]

    attention_mask = [1] * len(input_ids)

    missing = max_length - len(input_ids)
    if missing > 0:
        input_ids.extend([tokenizer.pad_token_id] * missing)
        label_ids.extend([-100] * missing)
        attention_mask.extend([0] * missing)

    return input_ids, attention_mask, label_ids, tokens
284
+
285
+
286
def evaluate_model(
    samples: List[dict],
    model_dir: Path,
    tokenizer: AnimeTokenizer,
    max_length: int,
    limit: int,
    seed: int,
) -> dict:
    """Run a checkpoint over a seeded random subset and summarise its errors.

    Args:
        samples: Dataset rows with ``tokens`` and ``labels``.
        model_dir: Directory passed to ``BertForTokenClassification.from_pretrained``.
        tokenizer: Tokenizer used to build the model inputs.
        max_length: Sequence length passed to ``prepare_inputs``.
        limit: Number of shuffled rows to evaluate.
        seed: Seed for the shuffle, so repeated runs score the same subset.

    Returns:
        Dict with ``sample_count``, seqeval ``precision`` / ``recall`` /
        ``f1`` / ``classification_report``, the 30 most frequent token-label
        and entity-type confusions, and a ``boundary_errors`` counter.
    """
    cfg = Config()
    model = BertForTokenClassification.from_pretrained(str(model_dir))
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    eval_samples = list(samples)
    random.Random(seed).shuffle(eval_samples)
    eval_samples = eval_samples[:limit]

    # Prefer the label map stored with the checkpoint; fall back to the
    # project config when the checkpoint's map is missing or empty.
    stored_labels = getattr(model.config, "id2label", cfg.id2label)
    id2label = {int(key): name for key, name in stored_labels.items()}
    label2id = {name: int(key) for key, name in id2label.items()}
    if not label2id:
        label2id = cfg.label2id
        id2label = cfg.id2label

    true_sequences: List[List[str]] = []
    pred_sequences: List[List[str]] = []
    confusion: Counter = Counter()
    entity_confusion: Counter = Counter()
    boundary_errors: Counter = Counter()

    with torch.no_grad():
        for sample in eval_samples:
            input_ids, attention_mask, label_ids, _tokens = prepare_inputs(
                sample["tokens"],
                sample["labels"],
                tokenizer,
                label2id,
                max_length,
            )
            batch = {
                "input_ids": torch.tensor([input_ids], dtype=torch.long, device=device),
                "attention_mask": torch.tensor([attention_mask], dtype=torch.long, device=device),
            }
            logits = model(**batch).logits
            pred_ids = torch.argmax(logits, dim=-1)[0].detach().cpu().tolist()

            gold_sequence: List[str] = []
            guess_sequence: List[str] = []
            for pred_id, label_id in zip(pred_ids, label_ids):
                if label_id == -100:
                    # Special-token and padding positions are not scored.
                    continue
                gold = id2label.get(label_id, "O")
                guess = id2label.get(pred_id, "O")
                gold_sequence.append(gold)
                guess_sequence.append(guess)

                confusion[(gold, guess)] += 1
                gold_entity = entity_type(gold)
                guess_entity = entity_type(guess)
                entity_confusion[(gold_entity or "O", guess_entity or "O")] += 1

                if gold == guess:
                    continue
                if gold.startswith("B-") or guess.startswith("B-"):
                    boundary_errors["B-boundary"] += 1
                elif gold_entity != guess_entity:
                    boundary_errors["entity-type"] += 1
                else:
                    boundary_errors["BIO-prefix"] += 1

            true_sequences.append(gold_sequence)
            pred_sequences.append(guess_sequence)

    # Keep only off-diagonal cells, i.e. the actual mistakes.
    token_errors = Counter(
        {pair: count for pair, count in confusion.items() if pair[0] != pair[1]}
    )
    entity_errors = Counter(
        {pair: count for pair, count in entity_confusion.items() if pair[0] != pair[1]}
    )

    return {
        "sample_count": len(eval_samples),
        "precision": precision_score(true_sequences, pred_sequences),
        "recall": recall_score(true_sequences, pred_sequences),
        "f1": f1_score(true_sequences, pred_sequences),
        "classification_report": classification_report(true_sequences, pred_sequences, digits=4),
        "top_token_confusions": token_errors.most_common(30),
        "top_entity_confusions": entity_errors.most_common(30),
        "boundary_errors": boundary_errors,
    }
368
+
369
+
370
def tokenizer_split_examples(samples: List[dict], tokenizers: Dict[str, AnimeTokenizer], limit: int = 8) -> List[dict]:
    """Collect side-by-side tokenizations of the first samples that have a filename.

    Args:
        samples: Dataset rows; rows with a missing or empty ``filename`` are
            skipped.
        tokenizers: Mapping of display name to tokenizer. Each adds a
            ``"<name>_tokens"`` entry holding the first 80 tokens of
            ``tokenizer.tokenize(filename)``.
        limit: Maximum number of examples; zero or less returns no examples.

    Returns:
        Up to ``limit`` dicts with ``file_id``, ``filename``,
        ``dataset_tokens`` (first 80 stored tokens) and one entry per
        tokenizer.
    """
    examples: List[dict] = []
    # Check the limit up front: testing it only after appending would return
    # one example even for limit=0.
    if limit <= 0:
        return examples

    for sample in samples:
        filename = sample.get("filename")
        if not filename:
            continue
        row = {
            "file_id": sample.get("file_id"),
            "filename": filename,
            "dataset_tokens": sample.get("tokens", [])[:80],
        }
        for name, tokenizer in tokenizers.items():
            row[f"{name}_tokens"] = tokenizer.tokenize(filename)[:80]
        examples.append(row)
        if len(examples) >= limit:
            break
    return examples
387
+
388
+
389
def write_report(path: Path, title: str, sections: List[Tuple[str, str]]) -> None:
    """Write a markdown report made of a title and ``##`` sections.

    Args:
        path: Output file, written as UTF-8 (overwritten if it exists).
        title: Text of the top-level heading.
        sections: ``(heading, body)`` pairs in output order. Bodies are
            stripped; a blank body is replaced by ``_No data._``.
    """
    lines = [f"# {title}", ""]
    for heading, body in sections:
        content = body.strip()
        lines.extend([f"## {heading}", "", content or "_No data._", ""])
    path.write_text("\n".join(lines), encoding="utf-8")
397
+
398
+
399
def markdown_json(value) -> str:
    """Return ``value`` pretty-printed as JSON inside a fenced ``json`` code block.

    Non-ASCII characters are emitted verbatim rather than escaped.
    """
    payload = json.dumps(value, ensure_ascii=False, indent=2)
    return f"```json\n{payload}\n```"
401
+
402
+
403
def markdown_table(headers: List[str], rows: List[List[str]], limit: Optional[int] = None) -> str:
    """Render ``rows`` as a markdown pipe table.

    Args:
        headers: Column titles.
        rows: Table rows; each cell is converted with ``str`` and newlines in
            it are replaced by spaces so a cell stays on one line.
        limit: Maximum number of rows to render; ``None`` renders all.

    Returns:
        The header line, the ``---`` separator line and one line per row,
        joined by newlines.
    """

    def render(cells) -> str:
        return "| " + " | ".join(cells) + " |"

    visible = rows if limit is None else rows[:limit]
    lines = [render(headers), render("---" for _ in headers)]
    lines.extend(render(str(cell).replace("\n", " ") for cell in row) for row in visible)
    return "\n".join(lines)
410
+
411
+
412
def _length_summary(values: List[int]) -> Dict[str, int]:
    """Summarise sequence lengths; every field is 0 when ``values`` is empty."""
    return {
        "min": min(values, default=0),
        "p50": percentile(values, 50),
        "p90": percentile(values, 90),
        "p95": percentile(values, 95),
        "p99": percentile(values, 99),
        "max": max(values, default=0),
    }


def main() -> None:
    """Command-line entry point: inspect a dataset and write a markdown report.

    Loads up to ``--sample-limit`` rows, selects the tokenizer variant (from
    ``--tokenizer`` or the dataset), collects label, length, BIO, whitespace,
    boundary-drift, tokenizer-mismatch and vocabulary statistics, optionally
    evaluates the checkpoint in ``--model-dir``, and writes everything to
    ``--output``.

    Raises:
        ValueError: If no samples could be loaded from ``--data-file``.
    """
    parser = argparse.ArgumentParser(description="Diagnose anime filename NER data and model pipeline")
    parser.add_argument("--data-file", required=True, help="JSONL dataset with tokens and labels")
    parser.add_argument("--vocab-file", default=None, help="Tokenizer vocab JSON")
    parser.add_argument("--tokenizer", choices=["regex", "char"], default=None,
                        help="Tokenizer variant to diagnose. Defaults to dataset metadata")
    parser.add_argument("--model-dir", default=None, help="Optional model directory for confusion analysis")
    parser.add_argument("--max-length", type=int, default=None, help="Max sequence length for model eval/truncation stats")
    parser.add_argument("--sample-limit", type=int, default=20000, help="Rows to inspect for data diagnostics")
    parser.add_argument("--eval-limit", type=int, default=512, help="Rows to evaluate when --model-dir is provided")
    parser.add_argument("--output", default="diagnostics_report.md", help="Markdown report path")
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()

    data_path = Path(args.data_file)
    samples = list(iter_jsonl(data_path, args.sample_limit))
    if not samples:
        raise ValueError(f"No samples loaded from {data_path}")

    dataset_variant = detect_dataset_variant(samples, args.vocab_file)
    tokenizer_variant = args.tokenizer or (dataset_variant if dataset_variant != "mixed" else "regex")
    vocab_file = args.vocab_file
    if vocab_file is None:
        # Default to the vocab that sits next to the dataset file.
        vocab_file = str(data_path.with_name("vocab.char.json" if tokenizer_variant == "char" else "vocab.json"))
    tokenizer = create_tokenizer(tokenizer_variant, vocab_file=vocab_file)

    if args.model_dir:
        model_tokenizer = load_tokenizer(args.model_dir)
    else:
        model_tokenizer = tokenizer

    label_counter: Counter = Counter()
    length_values: List[int] = []
    aligned_length_values: List[int] = []
    violations: List[dict] = []
    mismatch_examples: List[dict] = []
    space_label_counter: Counter = Counter()
    boundary_drift_counter: Counter = Counter()
    truncation_count = 0

    max_length = args.max_length
    if max_length is None and args.model_dir:
        model_config = BertForTokenClassification.from_pretrained(args.model_dir).config
        max_length = int(getattr(model_config, "max_seq_length", 64))
    max_length = max_length or (128 if tokenizer_variant == "char" else 64)

    # Compiled once; it is applied to every TITLE span in the loop below.
    title_meta_pattern = re.compile(r"\b(?:WEB[-_ ]?DL|WebRip|\d{3,4}[pP]|HEVC|AVC|AAC)\b", re.I)

    for row_idx, sample in enumerate(samples, 1):
        tokens = sample.get("tokens", [])
        labels = sample.get("labels", [])
        if len(tokens) != len(labels):
            violations.append(
                {
                    "type": "LENGTH_MISMATCH",
                    "row": row_idx,
                    "file_id": sample.get("file_id"),
                    "token_count": len(tokens),
                    "label_count": len(labels),
                    "filename": sample.get("filename"),
                }
            )
            continue

        label_counter.update(labels)
        length_values.append(len(tokens))
        aligned_tokens, _aligned_labels = align_tokens_for_tokenizer(tokens, labels, tokenizer)
        aligned_length_values.append(len(aligned_tokens))
        # +2 accounts for the CLS and SEP ids added around the aligned tokens.
        if len(aligned_tokens) + 2 > max_length:
            truncation_count += 1

        for token, label in zip(tokens, labels):
            if token.isspace():
                space_label_counter[label] += 1

        for violation in bio_violations(tokens, labels):
            at = violation["index"]
            window = slice(max(0, at - 5), at + 6)
            violation.update(
                {
                    "row": row_idx,
                    "file_id": sample.get("file_id"),
                    "filename": sample.get("filename"),
                    "context_tokens": tokens[window],
                    "context_labels": labels[window],
                }
            )
            violations.append(violation)

        for span in spans_from_labels(tokens, labels):
            text = span["text"]
            if span["type"] == "TITLE":
                if text.startswith("[") or text.endswith("[") or "]" in text[:3]:
                    boundary_drift_counter["title_contains_bracket_edge"] += 1
                if title_meta_pattern.search(text):
                    boundary_drift_counter["title_contains_meta"] += 1
            if span["type"] == "GROUP" and ("[" in text or "]" in text):
                boundary_drift_counter["group_contains_bracket"] += 1

        if len(mismatch_examples) < 10:
            mismatch = token_mismatch(sample, tokenizer)
            if mismatch:
                mismatch_examples.append(mismatch)

    entity_counter = count_entities(samples)
    id_stats = token_id_stats(samples, tokenizer)
    split_examples = tokenizer_split_examples(
        samples,
        {
            "diagnosed": tokenizer,
            "regex": create_tokenizer("regex", vocab_file=str(data_path.with_name("vocab.json"))),
            "char": create_tokenizer("char", vocab_file=str(data_path.with_name("vocab.char.json"))),
        },
    )

    model_eval = None
    if args.model_dir:
        model_eval = evaluate_model(
            samples=samples,
            model_dir=Path(args.model_dir),
            tokenizer=model_tokenizer,
            max_length=max_length,
            limit=args.eval_limit,
            seed=args.seed,
        )

    total_labels = sum(label_counter.values())
    o_count = label_counter.get("O", 0)
    sections: List[Tuple[str, str]] = []

    sections.append(
        (
            "Executive Summary",
            "\n".join(
                [
                    f"- Dataset: `{data_path}`",
                    f"- Inspected rows: {len(samples):,}",
                    f"- Dataset tokenizer variant: `{dataset_variant}`",
                    f"- Diagnosed tokenizer variant: `{tokenizer_variant}`",
                    f"- Vocab: `{vocab_file}` ({tokenizer.vocab_size:,} tokens)",
                    f"- Max sequence length checked: {max_length}",
                    f"- O-label ratio: {o_count / total_labels * 100:.2f}%" if total_labels else "- O-label ratio: n/a",
                    f"- Truncation risk: {truncation_count:,}/{len(samples):,} rows ({truncation_count / len(samples) * 100:.2f}%)",
                    f"- UNK rate after selected tokenizer: {id_stats['unk_rate'] * 100:.4f}%",
                    f"- BIO warnings collected: {len(violations):,}",
                    "",
                    "Primary finding: this task is structural filename parsing. Tokenizer/preprocessing identity is more important than lowering token loss.",
                ]
            ),
        )
    )

    sections.append(
        (
            "Label And Entity Statistics",
            "\n".join(
                [
                    "### Label distribution",
                    format_counter(label_counter, total_labels),
                    "",
                    "### Entity count",
                    format_counter(entity_counter),
                    "",
                    "### Length distribution",
                    # The summary tolerates empty lists: when every inspected
                    # row fails the length check, bare min()/max() would raise.
                    markdown_json(
                        {
                            "raw_tokens": _length_summary(length_values),
                            "aligned_tokens": _length_summary(aligned_length_values),
                        }
                    ),
                    "",
                    "### Whitespace labels",
                    format_counter(space_label_counter),
                ]
            ),
        )
    )

    violation_counter = Counter(v["type"] for v in violations)
    sections.append(
        (
            "BIO Violations And Boundary Drift",
            "\n".join(
                [
                    "### Violation counts",
                    format_counter(violation_counter),
                    "",
                    "### Boundary drift heuristics",
                    format_counter(boundary_drift_counter),
                    "",
                    "### Sample violations",
                    markdown_json(violations[:30]),
                ]
            ),
        )
    )

    sections.append(
        (
            "Tokenizer Split And Alignment",
            "\n".join(
                [
                    "### Dataset tokens vs selected tokenizer mismatches",
                    markdown_json(mismatch_examples),
                    "",
                    "### Split examples",
                    markdown_json(split_examples),
                    "",
                    "### Vocabulary coverage",
                    markdown_json(id_stats),
                ]
            ),
        )
    )

    if args.model_dir:
        model_tokenizer_variant = getattr(model_tokenizer, "tokenizer_variant", "unknown")
        sections.append(
            (
                "Train Inference Tokenizer Comparison",
                "\n".join(
                    [
                        f"- Model dir: `{args.model_dir}`",
                        f"- Model tokenizer variant: `{model_tokenizer_variant}`",
                        f"- Dataset tokenizer variant: `{dataset_variant}`",
                        f"- Diagnostic tokenizer variant: `{tokenizer_variant}`",
                        f"- Model tokenizer vocab size: {model_tokenizer.vocab_size:,}",
                        f"- Diagnostic tokenizer vocab size: {tokenizer.vocab_size:,}",
                        "",
                        "If dataset and model tokenizer variants differ, validation loss can be low while real inference sees different token IDs and boundaries.",
                    ]
                ),
            )
        )

    if model_eval:
        token_rows = [
            [true, pred, f"{count:,}"]
            for (true, pred), count in model_eval["top_token_confusions"]
        ]
        entity_rows = [
            [true, pred, f"{count:,}"]
            for (true, pred), count in model_eval["top_entity_confusions"]
        ]
        sections.append(
            (
                "Model Confusion Analysis",
                "\n".join(
                    [
                        f"- Evaluated samples: {model_eval['sample_count']:,}",
                        f"- Entity precision: {model_eval['precision']:.4f}",
                        f"- Entity recall: {model_eval['recall']:.4f}",
                        f"- Entity F1: {model_eval['f1']:.4f}",
                        "",
                        "### Boundary error classes",
                        format_counter(model_eval["boundary_errors"]),
                        "",
                        "### Top token-label confusions",
                        markdown_table(["true", "pred", "count"], token_rows) if token_rows else "- none",
                        "",
                        "### Top entity-type confusions",
                        markdown_table(["true", "pred", "count"], entity_rows) if entity_rows else "- none",
                        "",
                        "### Seqeval report",
                        "```text\n" + model_eval["classification_report"] + "\n```",
                    ]
                ),
            )
        )

    sections.append(
        (
            "Recommended Pipeline",
            "\n".join(
                [
                    "1. Use one tokenizer variant end to end and save it in the checkpoint metadata.",
                    "2. Prefer char-level or a deterministic hybrid tokenizer for DMHY filenames; avoid generic subword tokenization for labels.",
                    "3. For char-level runs, use `--tokenizer char --max-seq-length 128` with `vocab.char.json`.",
                    "4. Add CRF decoding or constrained BIO decoding so illegal I-X transitions and impossible boundary jumps are blocked.",
                    "5. Keep rule-assisted post-processing for high-confidence structural anchors: leading group bracket, ` - 07`, `S01E07`, source, and resolution.",
                    "6. Track entity-level F1 and field exact-match on real filenames; do not accept low validation loss alone.",
                ]
            ),
        )
    )

    write_report(Path(args.output), "Anime Filename Parser Diagnostics Report", sections)
    print(f"Wrote diagnostics report: {args.output}")
706
+
707
+
708
+ if __name__ == "__main__":
709
+ main()
diagnostics_report.md ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Anime Filename Parser Diagnostics Report
2
+
3
+ ## 根因分析
4
+
5
+ 当前症状不是 learning rate 问题,而是训练、验证、推理没有在同一个结构化输入空间里工作。
6
+
7
+ 最高优先级根因是 tokenizer/data 配置错位:你给出的训练命令使用 `dmhy_weak_char.jsonl` 和 `vocab.char.json`,但没有传 `--tokenizer char`。旧版 `train.py` 默认 `regex`,因此 char 数据会被当作 regex 训练配置保存,checkpoint metadata 会写成 `tokenizer_variant=regex`。推理时 `load_tokenizer()` 按 checkpoint metadata 重新加载 regex tokenizer,于是 `[LoliHouse]` 这类结构 token 会作为一个整体进入模型,而 char 训练数据里它是 `[`, `L`, `o`, ..., `]`。这会直接导致 group/title 边界漂移。
8
+
9
+ 第二个根因是 word-level 数据和当前 `AnimeTokenizer` 也不完全一致。`dmhy_weak.jsonl` 里示例 token 是 `[`, `LoliHouse`, `]`,但当前 regex tokenizer 对原始文件名会输出 `[LoliHouse]`。这说明 word-level 数据名义上是 regex,但不是严格由当前 inference tokenizer 重放得到的 token 序列。
10
+
11
+ 第三个根因是 char 训练命令没有设置 `--max-seq-length 128`。在抽样 5,000 条 char 数据中,默认 64 长度会截断 2,058 条,占 41.16%。episode/source/resolution 往往在后半段,默认长度会让模型训练和推理都丢失结构锚点。
12
+
13
+ 第四个根因是评估指标误导。低 validation loss 和 token accuracy 会被大量 `O`、`I-TITLE` 稀释;真实任务需要 entity-level F1、字段 exact match,以及结构案例回归。
14
+
15
+ ## 问题优先级
16
+
17
+ P0: 训练命令必须显式或自动使用 char tokenizer。已修改 `train.py`,现在会从数据集 metadata 自动识别 `char`,并把 char 默认 max length 提升到 128。
18
+
19
+ P0: 不允许 tokenizer variant 与 dataset metadata 不一致。已修改 `train.py`,检测到 dataset `tokenizer_variant` 与选择的 tokenizer 不一致会报错。
20
+
21
+ P0: 推理必须使用 checkpoint 保存的 tokenizer 和 max length。已修改 `inference.py`,默认读取 `model.config.max_seq_length`,并新增 `--debug` 输出 token/label/score/UNK/截断信息。
22
+
23
+ P1: 从旧 checkpoint fine-tune 到不同 vocab 时,不能按 ID 盲目 `resize_token_embeddings()`。已修改为按 token 字符串重映射 embedding,未匹配 token 再随机初始化。
24
+
25
+ P1: 数据集存在 BIO/边界质量问题。char 抽样 5,000 条发现 468 个 `ORPHAN_I`,典型是标题被括号 `O` 打断后仍继续 `I-TITLE`。`B-X -> O` 本身是合法 BIO,但在 group/title/source 频繁出现时是边界告警。
26
+
27
+ P2: 当前 `BertForTokenClassification` 独立逐 token 解码,不能约束非法转移。建议后续加 CRF 或 constrained BIO decoder。
28
+
29
+ ## 自动诊断结果
30
+
31
+ 新增脚本:
32
+
33
+ ```bash
34
+ python diagnose_pipeline.py --data-file datasets/AnimeName/dmhy_weak_char.jsonl --vocab-file datasets/AnimeName/vocab.char.json --model-dir checkpoints/dmhy-finetune/final --sample-limit 5000 --eval-limit 128 --output diagnostics_report.md
35
+ ```
36
+
37
+ char 数据抽样结果:
38
+
39
+ - tokenizer variant: `char`
40
+ - vocab size: 6,199
41
+ - UNK rate: 0.0000%
42
+ - O-label ratio: 37.47%
43
+ - p95 length: 101, p99 length: 125
44
+ - default max length 64 truncation: 41.16%
45
+ - `ORPHAN_I`: 468
46
+ - regex checkpoint 直接评 char 数据时 entity F1: 0.0832
47
+
48
+ word 数据抽样结果保存在 `diagnostics_report_word.md`:
49
+
50
+ - tokenizer variant: `regex`
51
+ - vocab size: 8,000
52
+ - UNK rate: 6.9158%
53
+ - default max length 64 truncation: 0%
54
+ - 当前 regex checkpoint 在抽样 word 数据上 entity F1: 0.9549
55
+ - 但 model checkpoint vocab 是 3,000,诊断 vocab 是 8,000,继续 fine-tune 必须重映射 embedding
56
+
57
+ ## Tokenizer Split 示例
58
+
59
+ 输入:
60
+
61
+ ```text
62
+ [LoliHouse] Yomi no Tsugai - 07 [WebRip 1080p HEVC-10bit AAC ASSx2]
63
+ ```
64
+
65
+ char tokenizer:
66
+
67
+ ```text
68
+ [, L, o, l, i, H, o, u, s, e, ], , Y, o, m, i, , n, o, , T, s, u, g, a, i, , -, , 0, 7, ...
69
+ ```
70
+
71
+ 当前 regex tokenizer:
72
+
73
+ ```text
74
+ [LoliHouse], , Yomi, , no, , Tsugai, , -, , 07, , [WebRip 1080p HEVC-10bit AAC ASSx2]
75
+ ```
76
+
77
+ 这两个 token 序列不是同一个标注空间。char label 不能直接套到 regex token 上,regex 模型也不能在 char token 序列上解释 logits。
78
+
79
+ ## BIO 与边界问题
80
+
81
+ 真实非法 BIO:
82
+
83
+ ```text
84
+ ... ( O, K I-TITLE, a I-TITLE ...
85
+ ```
86
+
87
+ 示例:
88
+
89
+ ```text
90
+ [LoliHouse] Kanteishi (Kari) - 07 [WebRip 1080p HEVC-10bit AAC]
91
+ ```
92
+
93
+ `(` 被标为 `O`,后面的 `Kari` 继续 `I-TITLE`,形成 `O -> I-TITLE`。这会让模型学习到标题可以跨越被标为非实体的括号,边界自然会漂。
94
+
95
+ 结构边界告警:
96
+
97
+ ```text
98
+ [KissSub][Shunkashuutou Daikousha - Haru no Mai][06][1080P][GB][MP4]
99
+ ```
100
+
101
+ `KissSub` 是 `B-GROUP`,右括号是 `O`,这是合法 BIO;但如果 tokenizer 在推理时把 `[KissSub]` 合成一个 token,模型就无法只给内部文字打 `GROUP`,只能把整个 bracket token 判成一个类别。
102
+
103
+ ## Confusion 分析
104
+
105
+ 故意用 char 数据评估 regex checkpoint,entity F1 只有 0.0832。主要混淆:
106
+
107
+ - `O -> TITLE`: 930
108
+ - `SOURCE -> TITLE`: 236
109
+ - `EPISODE -> TITLE`: 228
110
+ - `GROUP -> TITLE`: 86
111
+
112
+ 这与实际症状一致:模型把结构锚点和 meta 区域吸进 title,group/title 边界混淆,episode 被 title 或 O 吞掉。
113
+
114
+ ## 已修改的代码
115
+
116
+ `train.py`
117
+
118
+ - `--tokenizer` 默认从数据集 metadata/vocab 名称/样本结构自动推断。
119
+ - char 数据默认 `max_seq_length >= 128`。
120
+ - dataset metadata 与 tokenizer 不一致会直接报错。
121
+ - fine-tune 到新 vocab 时按 token 字符串重映射 embedding,避免 token ID 语义错位。
122
+ - checkpoint 保存正确的 `tokenizer_variant` 和 `max_seq_length`。
123
+
124
+ `inference.py`
125
+
126
+ - 新增 `--debug`,输出 tokenizer variant、token IDs、labels、scores、UNK rate、truncation、entity spans。
127
+ - 默认使用 checkpoint `max_seq_length`。
128
+ - 修正推理截断逻辑,保留 `[SEP]`,与训练一致。
129
+ - 默认使用 constrained BIO Viterbi 解码,阻止 `O -> I-X` 这类非法转移;可用 `--no-constrained-bio` 查看原始 greedy 输出。
130
+ - 新增 rule-assisted parsing,兜底修复高置信结构锚点:leading group bracket、` - 07`、`S01E07`、resolution、source。
131
+ - 可用 `--no-rule-assist` 关闭规则兜底,只看模型原始输出。
132
+
133
+ `diagnose_pipeline.py`
134
+
135
+ - 自动检查 token/label 长度。
136
+ - 输出 BIO 违规样本与边界告警。
137
+ - 输出 tokenizer split 示例。
138
+ - 输出 train/inference tokenizer 对比。
139
+ - 输出实体、label、空格 label、UNK、截断统计。
140
+ - 可选加载 checkpoint 做 confusion 和 seqeval entity-level F1。
141
+
142
+ ## 修改后的 Pipeline
143
+
144
+ 推荐 char-level pipeline:
145
+
146
+ ```bat
147
+ python diagnose_pipeline.py ^
148
+ --data-file datasets/AnimeName/dmhy_weak_char.jsonl ^
149
+ --vocab-file datasets/AnimeName/vocab.char.json ^
150
+ --sample-limit 20000 ^
151
+ --output diagnostics_report.md
152
+
153
+ python train.py ^
154
+ --tokenizer char ^
155
+ --data-file datasets/AnimeName/dmhy_weak_char.jsonl ^
156
+ --vocab-file datasets/AnimeName/vocab.char.json ^
157
+ --save-dir checkpoints/dmhy-char ^
158
+ --epochs 10 ^
159
+ --batch-size 128 ^
160
+ --learning-rate 0.0003 ^
161
+ --warmup-steps 300 ^
162
+ --max-seq-length 128 ^
163
+ --seed 42
164
+
165
+ python inference.py ^
166
+ --model-dir checkpoints/dmhy-char/final ^
167
+ --debug ^
168
+ "[LoliHouse] Yomi no Tsugai - 07 [WebRip 1080p HEVC-10bit AAC ASSx2]"
169
+ ```
170
+
171
+ 如果继续使用 word/regex pipeline,必须先重新生成数据,使 `sample["tokens"] == AnimeTokenizer.tokenize(sample["filename"])` 对绝大多数样本成立;否则验证集仍然是训练 token 空间,真实 inference 是另一个 token 空间。
172
+
173
+ ## 最合理的 Tokenizer 方案
174
+
175
+ 当前任务更适合 char-level 或 deterministic hybrid tokenizer,不适合通用 subword tokenizer。
176
+
177
+ char-level 优点:
178
+
179
+ - train/inference 最容易完全一致。
180
+ - 不会把 `[LoliHouse]`、`[WebRip ...]` 这类结构块压成单 token。
181
+ - 对未知标题、组名、罗马音、中文、日文都没有 OOV。
182
+ - 更适合学习括号、空格、连字符、集数位置这些结构信号。
183
+
184
+ char-level 缺点:
185
+
186
+ - 序列更长,`max_seq_length` 至少需要 128。
187
+ - 逐 token softmax 容易出现 BIO 非法转移,建议加 CRF。
188
+
189
+ word-level/regex 优点:
190
+
191
+ - 序列短,训练快。
192
+ - 当前已有 checkpoint 在同 token 空间验证集上 F1 较高。
193
+
194
+ word-level/regex 缺点:
195
+
196
+ - 如果 bracket protection 把整段合并,内部 label 无法表达。
197
+ - 数据生成 tokenizer 和 inference tokenizer 稍有不一致就会严重错位。
198
+ - OOV 对新番标题和组名仍然明显。
199
+
200
+ 结论:短期用 char-level + rule-assisted parsing;中期改为 hybrid tokenizer:保留结构符号 `[ ] ( ) - _ . space` 为独立 token,英文数字连续串可作为片段但必须能映射回字符 offset,并在 label alignment 上以 offset 为准;长期加 BERT + CRF。
201
+
202
+ ## 建议训练配置
203
+
204
+ 首选:
205
+
206
+ ```bat
207
+ python train.py --tokenizer char ^
208
+ --data-file datasets/AnimeName/dmhy_weak_char.jsonl ^
209
+ --vocab-file datasets/AnimeName/vocab.char.json ^
210
+ --save-dir checkpoints/dmhy-char ^
211
+ --epochs 10 --batch-size 128 ^
212
+ --learning-rate 0.0003 --warmup-steps 300 ^
213
+ --max-seq-length 128 --seed 42
214
+ ```
215
+
216
+ 不要把 regex checkpoint 直接当作同构模型继续训练 char 模型;如果要迁移,当前代码会按 token 字符串 remap embedding,但 char 词表与 regex 词表之间可共享的 token 很少,最好从头训练 char 模型,或只迁移 encoder 中 embedding 以外的层。
217
+
218
+ 必须新增评估:
219
+
220
+ - entity-level F1 by field
221
+ - field exact match: `group/title/episode/resolution/source`
222
+ - full parse exact match
223
+ - episode recall
224
+ - boundary errors: group-title, title-episode, episode-meta
225
+ - inference debug sample set,固定 50-200 个真实文件名回归
226
+
227
+ ## 真实案例分析
228
+
229
+ 输入:
230
+
231
+ ```text
232
+ [LoliHouse] Yomi no Tsugai - 07 [WebRip 1080p HEVC-10bit AAC ASSx2]
233
+ ```
234
+
235
+ 旧 regex checkpoint 原始模型输出:
236
+
237
+ ```json
238
+ {
239
+ "entities": [
240
+ {"type": "TITLE", "text": "[LoliHouse] Yomi no Tsugai"},
241
+ {"type": "EPISODE", "text": "07"}
242
+ ]
243
+ }
244
+ ```
245
+
246
+ 问题点:
247
+
248
+ - `[LoliHouse]` 被 tokenizer 合成一个 token。
249
+ - 模型把该 token 判成 `B-TITLE`,无法只把内部 `LoliHouse` 判成 `GROUP`。
250
+ - `Yomi` 和 `Tsugai` 在 3,000 vocab checkpoint 中是 `[UNK]`,但模型仍高置信输出 `I-TITLE`,说明 loss/置信度不能代表字段正确性。
251
+
252
+ 修改后带规则辅助的最终输出:
253
+
254
+ ```json
255
+ {
256
+ "group": "LoliHouse",
257
+ "title": "Yomi no Tsugai",
258
+ "episode": 7,
259
+ "source": "WebRip",
260
+ "resolution": "1080p"
261
+ }
262
+ ```
263
+
264
+ 这只是上线兜底;真正修复仍应训练一个 train/inference token 完全一致的 char 或 hybrid 模型。
265
+
266
+ ## 架构建议
267
+
268
+ 最推荐的重构路线:
269
+
270
+ 1. `BERT encoder + CRF`:约束 `O -> I-X`、`B-X -> I-Y` 等非法/低质量转移。
271
+ 2. char-level NER:保证 token-label alignment 不受 subword split 影响。
272
+ 3. rule-assisted parser:先抽取高置信结构锚点,再让模型负责模糊 title/group 边界。
273
+ 4. offset-based dataset:每条数据保存 raw filename、entity spans、tokens、offset_mapping、labels,训练时由 tokenizer 统一生成 labels。
274
+
275
+ 当前代码已先实现“无训练 CRF”的 constrained BIO decoding,作为上线前的轻量保护。完整 BERT+CRF 仍建议作为下一阶段训练架构重构。
276
+
277
+ 不要只优化 loss。这个任务的目标函数应更接近真实解析准确率:字段级 exact match + episode recall + title boundary F1。
diagnostics_report_word.md ADDED
@@ -0,0 +1,2678 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Anime Filename Parser Diagnostics Report
2
+
3
+ ## Executive Summary
4
+
5
+ - Dataset: `datasets\AnimeName\dmhy_weak.jsonl`
6
+ - Inspected rows: 5,000
7
+ - Dataset tokenizer variant: `regex`
8
+ - Diagnosed tokenizer variant: `regex`
9
+ - Vocab: `datasets\AnimeName\vocab.json` (8,000 tokens)
10
+ - Max sequence length checked: 64
11
+ - O-label ratio: 38.12%
12
+ - Truncation risk: 0/5,000 rows (0.00%)
13
+ - UNK rate after selected tokenizer: 6.9158%
14
+ - BIO warnings collected: 9,711
15
+
16
+ Primary finding: this task is structural filename parsing. Keeping the tokenizer and preprocessing identical between training and inference matters more than lowering token-level loss.
17
+
18
+ ## Label And Entity Statistics
19
+
20
+ ### Label distribution
21
+ - `O`: 32,517 (38.12%)
22
+ - `I-TITLE`: 30,321 (35.54%)
23
+ - `B-TITLE`: 5,593 (6.56%)
24
+ - `B-EPISODE`: 5,000 (5.86%)
25
+ - `B-SOURCE`: 4,032 (4.73%)
26
+ - `I-GROUP`: 2,459 (2.88%)
27
+ - `B-GROUP`: 2,299 (2.69%)
28
+ - `B-RESOLUTION`: 1,765 (2.07%)
29
+ - `B-SEASON`: 1,269 (1.49%)
30
+ - `B-SPECIAL`: 57 (0.07%)
31
+
32
+ ### Entity count
33
+ - `TITLE`: 6,061 (29.59%)
34
+ - `EPISODE`: 5,000 (24.41%)
35
+ - `SOURCE`: 4,032 (19.68%)
36
+ - `GROUP`: 2,299 (11.22%)
37
+ - `RESOLUTION`: 1,765 (8.62%)
38
+ - `SEASON`: 1,269 (6.20%)
39
+ - `SPECIAL`: 57 (0.28%)
40
+
41
+ ### Length distribution
42
+ ```json
43
+ {
44
+ "raw_tokens": {
45
+ "min": 3,
46
+ "p50": 17,
47
+ "p90": 28,
48
+ "p95": 31,
49
+ "p99": 39,
50
+ "max": 54
51
+ },
52
+ "aligned_tokens": {
53
+ "min": 3,
54
+ "p50": 17,
55
+ "p90": 28,
56
+ "p95": 31,
57
+ "p99": 39,
58
+ "max": 54
59
+ }
60
+ }
61
+ ```
62
+
63
+ ### Whitespace labels
64
+ - `I-TITLE`: 10,539 (48.98%)
65
+ - `O`: 10,484 (48.72%)
66
+ - `I-GROUP`: 411 (1.91%)
67
+ - `B-TITLE`: 84 (0.39%)
68
+
69
+ ## BIO Violations And Boundary Drift
70
+
71
+ ### Violation counts
72
+ - `B_DIRECT_TO_O`: 9,243 (95.18%)
73
+ - `ORPHAN_I`: 468 (4.82%)
74
+
75
+ ### Boundary drift heuristics
76
+ - none
77
+
78
+ ### Sample violations
79
+ ```json
80
+ [
81
+ {
82
+ "type": "B_DIRECT_TO_O",
83
+ "index": 8,
84
+ "prev_label": "B-EPISODE",
85
+ "label": "O",
86
+ "token": ".",
87
+ "row": 1,
88
+ "file_id": 1,
89
+ "filename": "Witch.Hat.Atelier.S01E07.1080p.NF.WEB-DL.JPN.AAC2.0.H.264.MSubs-ToonsHub",
90
+ "context_tokens": [
91
+ ".",
92
+ "Atelier",
93
+ ".",
94
+ "S01",
95
+ "E07",
96
+ ".",
97
+ "1080p",
98
+ ".",
99
+ "NF",
100
+ ".",
101
+ "WEB-DL"
102
+ ],
103
+ "context_labels": [
104
+ "I-TITLE",
105
+ "I-TITLE",
106
+ "O",
107
+ "B-SEASON",
108
+ "B-EPISODE",
109
+ "O",
110
+ "B-RESOLUTION",
111
+ "O",
112
+ "B-SOURCE",
113
+ "O",
114
+ "B-SOURCE"
115
+ ]
116
+ },
117
+ {
118
+ "type": "B_DIRECT_TO_O",
119
+ "index": 10,
120
+ "prev_label": "B-RESOLUTION",
121
+ "label": "O",
122
+ "token": ".",
123
+ "row": 1,
124
+ "file_id": 1,
125
+ "filename": "Witch.Hat.Atelier.S01E07.1080p.NF.WEB-DL.JPN.AAC2.0.H.264.MSubs-ToonsHub",
126
+ "context_tokens": [
127
+ ".",
128
+ "S01",
129
+ "E07",
130
+ ".",
131
+ "1080p",
132
+ ".",
133
+ "NF",
134
+ ".",
135
+ "WEB-DL",
136
+ ".",
137
+ "JP"
138
+ ],
139
+ "context_labels": [
140
+ "O",
141
+ "B-SEASON",
142
+ "B-EPISODE",
143
+ "O",
144
+ "B-RESOLUTION",
145
+ "O",
146
+ "B-SOURCE",
147
+ "O",
148
+ "B-SOURCE",
149
+ "O",
150
+ "B-SOURCE"
151
+ ]
152
+ },
153
+ {
154
+ "type": "B_DIRECT_TO_O",
155
+ "index": 12,
156
+ "prev_label": "B-SOURCE",
157
+ "label": "O",
158
+ "token": ".",
159
+ "row": 1,
160
+ "file_id": 1,
161
+ "filename": "Witch.Hat.Atelier.S01E07.1080p.NF.WEB-DL.JPN.AAC2.0.H.264.MSubs-ToonsHub",
162
+ "context_tokens": [
163
+ "E07",
164
+ ".",
165
+ "1080p",
166
+ ".",
167
+ "NF",
168
+ ".",
169
+ "WEB-DL",
170
+ ".",
171
+ "JP",
172
+ "N",
173
+ "."
174
+ ],
175
+ "context_labels": [
176
+ "B-EPISODE",
177
+ "O",
178
+ "B-RESOLUTION",
179
+ "O",
180
+ "B-SOURCE",
181
+ "O",
182
+ "B-SOURCE",
183
+ "O",
184
+ "B-SOURCE",
185
+ "O",
186
+ "O"
187
+ ]
188
+ },
189
+ {
190
+ "type": "B_DIRECT_TO_O",
191
+ "index": 14,
192
+ "prev_label": "B-SOURCE",
193
+ "label": "O",
194
+ "token": ".",
195
+ "row": 1,
196
+ "file_id": 1,
197
+ "filename": "Witch.Hat.Atelier.S01E07.1080p.NF.WEB-DL.JPN.AAC2.0.H.264.MSubs-ToonsHub",
198
+ "context_tokens": [
199
+ "1080p",
200
+ ".",
201
+ "NF",
202
+ ".",
203
+ "WEB-DL",
204
+ ".",
205
+ "JP",
206
+ "N",
207
+ ".",
208
+ "AAC",
209
+ "2"
210
+ ],
211
+ "context_labels": [
212
+ "B-RESOLUTION",
213
+ "O",
214
+ "B-SOURCE",
215
+ "O",
216
+ "B-SOURCE",
217
+ "O",
218
+ "B-SOURCE",
219
+ "O",
220
+ "O",
221
+ "B-SOURCE",
222
+ "O"
223
+ ]
224
+ },
225
+ {
226
+ "type": "B_DIRECT_TO_O",
227
+ "index": 16,
228
+ "prev_label": "B-SOURCE",
229
+ "label": "O",
230
+ "token": "N",
231
+ "row": 1,
232
+ "file_id": 1,
233
+ "filename": "Witch.Hat.Atelier.S01E07.1080p.NF.WEB-DL.JPN.AAC2.0.H.264.MSubs-ToonsHub",
234
+ "context_tokens": [
235
+ "NF",
236
+ ".",
237
+ "WEB-DL",
238
+ ".",
239
+ "JP",
240
+ "N",
241
+ ".",
242
+ "AAC",
243
+ "2",
244
+ ".",
245
+ "0"
246
+ ],
247
+ "context_labels": [
248
+ "B-SOURCE",
249
+ "O",
250
+ "B-SOURCE",
251
+ "O",
252
+ "B-SOURCE",
253
+ "O",
254
+ "O",
255
+ "B-SOURCE",
256
+ "O",
257
+ "O",
258
+ "O"
259
+ ]
260
+ },
261
+ {
262
+ "type": "B_DIRECT_TO_O",
263
+ "index": 19,
264
+ "prev_label": "B-SOURCE",
265
+ "label": "O",
266
+ "token": "2",
267
+ "row": 1,
268
+ "file_id": 1,
269
+ "filename": "Witch.Hat.Atelier.S01E07.1080p.NF.WEB-DL.JPN.AAC2.0.H.264.MSubs-ToonsHub",
270
+ "context_tokens": [
271
+ ".",
272
+ "JP",
273
+ "N",
274
+ ".",
275
+ "AAC",
276
+ "2",
277
+ ".",
278
+ "0",
279
+ ".",
280
+ "H.264",
281
+ "."
282
+ ],
283
+ "context_labels": [
284
+ "O",
285
+ "B-SOURCE",
286
+ "O",
287
+ "O",
288
+ "B-SOURCE",
289
+ "O",
290
+ "O",
291
+ "O",
292
+ "O",
293
+ "B-SOURCE",
294
+ "O"
295
+ ]
296
+ },
297
+ {
298
+ "type": "B_DIRECT_TO_O",
299
+ "index": 24,
300
+ "prev_label": "B-SOURCE",
301
+ "label": "O",
302
+ "token": ".",
303
+ "row": 1,
304
+ "file_id": 1,
305
+ "filename": "Witch.Hat.Atelier.S01E07.1080p.NF.WEB-DL.JPN.AAC2.0.H.264.MSubs-ToonsHub",
306
+ "context_tokens": [
307
+ "2",
308
+ ".",
309
+ "0",
310
+ ".",
311
+ "H.264",
312
+ ".",
313
+ "MSubs",
314
+ "-",
315
+ "ToonsHub"
316
+ ],
317
+ "context_labels": [
318
+ "O",
319
+ "O",
320
+ "O",
321
+ "O",
322
+ "B-SOURCE",
323
+ "O",
324
+ "B-SOURCE",
325
+ "O",
326
+ "O"
327
+ ]
328
+ },
329
+ {
330
+ "type": "B_DIRECT_TO_O",
331
+ "index": 26,
332
+ "prev_label": "B-SOURCE",
333
+ "label": "O",
334
+ "token": "-",
335
+ "row": 1,
336
+ "file_id": 1,
337
+ "filename": "Witch.Hat.Atelier.S01E07.1080p.NF.WEB-DL.JPN.AAC2.0.H.264.MSubs-ToonsHub",
338
+ "context_tokens": [
339
+ "0",
340
+ ".",
341
+ "H.264",
342
+ ".",
343
+ "MSubs",
344
+ "-",
345
+ "ToonsHub"
346
+ ],
347
+ "context_labels": [
348
+ "O",
349
+ "O",
350
+ "B-SOURCE",
351
+ "O",
352
+ "B-SOURCE",
353
+ "O",
354
+ "O"
355
+ ]
356
+ },
357
+ {
358
+ "type": "B_DIRECT_TO_O",
359
+ "index": 2,
360
+ "prev_label": "B-GROUP",
361
+ "label": "O",
362
+ "token": "]",
363
+ "row": 2,
364
+ "file_id": 2,
365
+ "filename": "[LoliHouse] Maid-san wa Taberu Dake - 07 [WebRip 1080p HEVC-10bit AAC ASSx2]",
366
+ "context_tokens": [
367
+ "[",
368
+ "LoliHouse",
369
+ "]",
370
+ " ",
371
+ "Maid",
372
+ "-",
373
+ "san",
374
+ " "
375
+ ],
376
+ "context_labels": [
377
+ "O",
378
+ "B-GROUP",
379
+ "O",
380
+ "O",
381
+ "B-TITLE",
382
+ "I-TITLE",
383
+ "I-TITLE",
384
+ "I-TITLE"
385
+ ]
386
+ },
387
+ {
388
+ "type": "B_DIRECT_TO_O",
389
+ "index": 17,
390
+ "prev_label": "B-EPISODE",
391
+ "label": "O",
392
+ "token": " ",
393
+ "row": 2,
394
+ "file_id": 2,
395
+ "filename": "[LoliHouse] Maid-san wa Taberu Dake - 07 [WebRip 1080p HEVC-10bit AAC ASSx2]",
396
+ "context_tokens": [
397
+ "Dake",
398
+ " ",
399
+ "-",
400
+ " ",
401
+ "07",
402
+ " ",
403
+ "[WebRip 1080p HEVC-10bit AAC ASSx2]"
404
+ ],
405
+ "context_labels": [
406
+ "I-TITLE",
407
+ "O",
408
+ "O",
409
+ "O",
410
+ "B-EPISODE",
411
+ "O",
412
+ "O"
413
+ ]
414
+ },
415
+ {
416
+ "type": "B_DIRECT_TO_O",
417
+ "index": 2,
418
+ "prev_label": "B-GROUP",
419
+ "label": "O",
420
+ "token": "]",
421
+ "row": 3,
422
+ "file_id": 3,
423
+ "filename": "[ANi] 異世界悠閒農家 2 - 06 [1080P][Baha][WEB-DL][AAC AVC][CHT]",
424
+ "context_tokens": [
425
+ "[",
426
+ "ANi",
427
+ "]",
428
+ " ",
429
+ "異",
430
+ "世",
431
+ "界",
432
+ "悠"
433
+ ],
434
+ "context_labels": [
435
+ "O",
436
+ "B-GROUP",
437
+ "O",
438
+ "O",
439
+ "B-TITLE",
440
+ "I-TITLE",
441
+ "I-TITLE",
442
+ "I-TITLE"
443
+ ]
444
+ },
445
+ {
446
+ "type": "B_DIRECT_TO_O",
447
+ "index": 13,
448
+ "prev_label": "B-SEASON",
449
+ "label": "O",
450
+ "token": " ",
451
+ "row": 3,
452
+ "file_id": 3,
453
+ "filename": "[ANi] 異世界悠閒農家 2 - 06 [1080P][Baha][WEB-DL][AAC AVC][CHT]",
454
+ "context_tokens": [
455
+ "閒",
456
+ "農",
457
+ "家",
458
+ " ",
459
+ "2",
460
+ " ",
461
+ "-",
462
+ " ",
463
+ "06",
464
+ " ",
465
+ "[1080P]"
466
+ ],
467
+ "context_labels": [
468
+ "I-TITLE",
469
+ "I-TITLE",
470
+ "I-TITLE",
471
+ "O",
472
+ "B-SEASON",
473
+ "O",
474
+ "O",
475
+ "O",
476
+ "B-EPISODE",
477
+ "O",
478
+ "B-RESOLUTION"
479
+ ]
480
+ },
481
+ {
482
+ "type": "B_DIRECT_TO_O",
483
+ "index": 17,
484
+ "prev_label": "B-EPISODE",
485
+ "label": "O",
486
+ "token": " ",
487
+ "row": 3,
488
+ "file_id": 3,
489
+ "filename": "[ANi] 異世界悠閒農家 2 - 06 [1080P][Baha][WEB-DL][AAC AVC][CHT]",
490
+ "context_tokens": [
491
+ "2",
492
+ " ",
493
+ "-",
494
+ " ",
495
+ "06",
496
+ " ",
497
+ "[1080P]",
498
+ "[Baha]",
499
+ "[WEB-DL]",
500
+ "[AAC AVC]",
501
+ "[CHT]"
502
+ ],
503
+ "context_labels": [
504
+ "B-SEASON",
505
+ "O",
506
+ "O",
507
+ "O",
508
+ "B-EPISODE",
509
+ "O",
510
+ "B-RESOLUTION",
511
+ "B-SOURCE",
512
+ "B-SOURCE",
513
+ "O",
514
+ "B-SOURCE"
515
+ ]
516
+ },
517
+ {
518
+ "type": "B_DIRECT_TO_O",
519
+ "index": 21,
520
+ "prev_label": "B-SOURCE",
521
+ "label": "O",
522
+ "token": "[AAC AVC]",
523
+ "row": 3,
524
+ "file_id": 3,
525
+ "filename": "[ANi] 異世界悠閒農家 2 - 06 [1080P][Baha][WEB-DL][AAC AVC][CHT]",
526
+ "context_tokens": [
527
+ "06",
528
+ " ",
529
+ "[1080P]",
530
+ "[Baha]",
531
+ "[WEB-DL]",
532
+ "[AAC AVC]",
533
+ "[CHT]"
534
+ ],
535
+ "context_labels": [
536
+ "B-EPISODE",
537
+ "O",
538
+ "B-RESOLUTION",
539
+ "B-SOURCE",
540
+ "B-SOURCE",
541
+ "O",
542
+ "B-SOURCE"
543
+ ]
544
+ },
545
+ {
546
+ "type": "B_DIRECT_TO_O",
547
+ "index": 2,
548
+ "prev_label": "B-GROUP",
549
+ "label": "O",
550
+ "token": "]",
551
+ "row": 4,
552
+ "file_id": 4,
553
+ "filename": "[ANi] 木頭風紀委員和迷你裙 JK 的故事 - 06 [1080P][Baha][WEB-DL][AAC AVC][CHT]",
554
+ "context_tokens": [
555
+ "[",
556
+ "ANi",
557
+ "]",
558
+ " ",
559
+ "木",
560
+ "頭",
561
+ "風",
562
+ "紀"
563
+ ],
564
+ "context_labels": [
565
+ "O",
566
+ "B-GROUP",
567
+ "O",
568
+ "O",
569
+ "B-TITLE",
570
+ "I-TITLE",
571
+ "I-TITLE",
572
+ "I-TITLE"
573
+ ]
574
+ },
575
+ {
576
+ "type": "B_DIRECT_TO_O",
577
+ "index": 24,
578
+ "prev_label": "B-EPISODE",
579
+ "label": "O",
580
+ "token": " ",
581
+ "row": 4,
582
+ "file_id": 4,
583
+ "filename": "[ANi] 木頭風紀委員和迷你裙 JK 的故事 - 06 [1080P][Baha][WEB-DL][AAC AVC][CHT]",
584
+ "context_tokens": [
585
+ "事",
586
+ " ",
587
+ "-",
588
+ " ",
589
+ "06",
590
+ " ",
591
+ "[1080P]",
592
+ "[Baha]",
593
+ "[WEB-DL]",
594
+ "[AAC AVC]",
595
+ "[CHT]"
596
+ ],
597
+ "context_labels": [
598
+ "I-TITLE",
599
+ "O",
600
+ "O",
601
+ "O",
602
+ "B-EPISODE",
603
+ "O",
604
+ "B-RESOLUTION",
605
+ "B-SOURCE",
606
+ "B-SOURCE",
607
+ "O",
608
+ "B-SOURCE"
609
+ ]
610
+ },
611
+ {
612
+ "type": "B_DIRECT_TO_O",
613
+ "index": 28,
614
+ "prev_label": "B-SOURCE",
615
+ "label": "O",
616
+ "token": "[AAC AVC]",
617
+ "row": 4,
618
+ "file_id": 4,
619
+ "filename": "[ANi] 木頭風紀委員和迷你裙 JK 的故事 - 06 [1080P][Baha][WEB-DL][AAC AVC][CHT]",
620
+ "context_tokens": [
621
+ "06",
622
+ " ",
623
+ "[1080P]",
624
+ "[Baha]",
625
+ "[WEB-DL]",
626
+ "[AAC AVC]",
627
+ "[CHT]"
628
+ ],
629
+ "context_labels": [
630
+ "B-EPISODE",
631
+ "O",
632
+ "B-RESOLUTION",
633
+ "B-SOURCE",
634
+ "B-SOURCE",
635
+ "O",
636
+ "B-SOURCE"
637
+ ]
638
+ },
639
+ {
640
+ "type": "B_DIRECT_TO_O",
641
+ "index": 2,
642
+ "prev_label": "B-GROUP",
643
+ "label": "O",
644
+ "token": "]",
645
+ "row": 5,
646
+ "file_id": 5,
647
+ "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][05][1080P][GB][MP4]",
648
+ "context_tokens": [
649
+ "[",
650
+ "KissSub",
651
+ "]",
652
+ "[",
653
+ "Shunkashuutou",
654
+ " ",
655
+ "Daikousha",
656
+ " "
657
+ ],
658
+ "context_labels": [
659
+ "O",
660
+ "B-GROUP",
661
+ "O",
662
+ "O",
663
+ "B-TITLE",
664
+ "I-TITLE",
665
+ "I-TITLE",
666
+ "I-TITLE"
667
+ ]
668
+ },
669
+ {
670
+ "type": "B_DIRECT_TO_O",
671
+ "index": 19,
672
+ "prev_label": "B-SOURCE",
673
+ "label": "O",
674
+ "token": "[MP4]",
675
+ "row": 5,
676
+ "file_id": 5,
677
+ "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][05][1080P][GB][MP4]",
678
+ "context_tokens": [
679
+ "Mai",
680
+ "]",
681
+ "[05]",
682
+ "[1080P]",
683
+ "[GB]",
684
+ "[MP4]"
685
+ ],
686
+ "context_labels": [
687
+ "I-TITLE",
688
+ "O",
689
+ "B-EPISODE",
690
+ "B-RESOLUTION",
691
+ "B-SOURCE",
692
+ "O"
693
+ ]
694
+ },
695
+ {
696
+ "type": "B_DIRECT_TO_O",
697
+ "index": 2,
698
+ "prev_label": "B-GROUP",
699
+ "label": "O",
700
+ "token": "]",
701
+ "row": 6,
702
+ "file_id": 6,
703
+ "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][06][1080P][GB][MP4]",
704
+ "context_tokens": [
705
+ "[",
706
+ "KissSub",
707
+ "]",
708
+ "[",
709
+ "Shunkashuutou",
710
+ " ",
711
+ "Daikousha",
712
+ " "
713
+ ],
714
+ "context_labels": [
715
+ "O",
716
+ "B-GROUP",
717
+ "O",
718
+ "O",
719
+ "B-TITLE",
720
+ "I-TITLE",
721
+ "I-TITLE",
722
+ "I-TITLE"
723
+ ]
724
+ },
725
+ {
726
+ "type": "B_DIRECT_TO_O",
727
+ "index": 19,
728
+ "prev_label": "B-SOURCE",
729
+ "label": "O",
730
+ "token": "[MP4]",
731
+ "row": 6,
732
+ "file_id": 6,
733
+ "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][06][1080P][GB][MP4]",
734
+ "context_tokens": [
735
+ "Mai",
736
+ "]",
737
+ "[06]",
738
+ "[1080P]",
739
+ "[GB]",
740
+ "[MP4]"
741
+ ],
742
+ "context_labels": [
743
+ "I-TITLE",
744
+ "O",
745
+ "B-EPISODE",
746
+ "B-RESOLUTION",
747
+ "B-SOURCE",
748
+ "O"
749
+ ]
750
+ },
751
+ {
752
+ "type": "B_DIRECT_TO_O",
753
+ "index": 2,
754
+ "prev_label": "B-GROUP",
755
+ "label": "O",
756
+ "token": "]",
757
+ "row": 7,
758
+ "file_id": 7,
759
+ "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][06][1080P][BIG5][MP4]",
760
+ "context_tokens": [
761
+ "[",
762
+ "KissSub",
763
+ "]",
764
+ "[",
765
+ "Shunkashuutou",
766
+ " ",
767
+ "Daikousha",
768
+ " "
769
+ ],
770
+ "context_labels": [
771
+ "O",
772
+ "B-GROUP",
773
+ "O",
774
+ "O",
775
+ "B-TITLE",
776
+ "I-TITLE",
777
+ "I-TITLE",
778
+ "I-TITLE"
779
+ ]
780
+ },
781
+ {
782
+ "type": "B_DIRECT_TO_O",
783
+ "index": 19,
784
+ "prev_label": "B-SOURCE",
785
+ "label": "O",
786
+ "token": "[MP4]",
787
+ "row": 7,
788
+ "file_id": 7,
789
+ "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][06][1080P][BIG5][MP4]",
790
+ "context_tokens": [
791
+ "Mai",
792
+ "]",
793
+ "[06]",
794
+ "[1080P]",
795
+ "[BIG5]",
796
+ "[MP4]"
797
+ ],
798
+ "context_labels": [
799
+ "I-TITLE",
800
+ "O",
801
+ "B-EPISODE",
802
+ "B-RESOLUTION",
803
+ "B-SOURCE",
804
+ "O"
805
+ ]
806
+ },
807
+ {
808
+ "type": "B_DIRECT_TO_O",
809
+ "index": 2,
810
+ "prev_label": "B-GROUP",
811
+ "label": "O",
812
+ "token": "]",
813
+ "row": 8,
814
+ "file_id": 8,
815
+ "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][05][1080P][BIG5][MP4]",
816
+ "context_tokens": [
817
+ "[",
818
+ "KissSub",
819
+ "]",
820
+ "[",
821
+ "Shunkashuutou",
822
+ " ",
823
+ "Daikousha",
824
+ " "
825
+ ],
826
+ "context_labels": [
827
+ "O",
828
+ "B-GROUP",
829
+ "O",
830
+ "O",
831
+ "B-TITLE",
832
+ "I-TITLE",
833
+ "I-TITLE",
834
+ "I-TITLE"
835
+ ]
836
+ },
837
+ {
838
+ "type": "B_DIRECT_TO_O",
839
+ "index": 19,
840
+ "prev_label": "B-SOURCE",
841
+ "label": "O",
842
+ "token": "[MP4]",
843
+ "row": 8,
844
+ "file_id": 8,
845
+ "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][05][1080P][BIG5][MP4]",
846
+ "context_tokens": [
847
+ "Mai",
848
+ "]",
849
+ "[05]",
850
+ "[1080P]",
851
+ "[BIG5]",
852
+ "[MP4]"
853
+ ],
854
+ "context_labels": [
855
+ "I-TITLE",
856
+ "O",
857
+ "B-EPISODE",
858
+ "B-RESOLUTION",
859
+ "B-SOURCE",
860
+ "O"
861
+ ]
862
+ },
863
+ {
864
+ "type": "B_DIRECT_TO_O",
865
+ "index": 2,
866
+ "prev_label": "B-GROUP",
867
+ "label": "O",
868
+ "token": "]",
869
+ "row": 9,
870
+ "file_id": 9,
871
+ "filename": "[Airota][Sousou no Frieren][29][1080p AVC AAC][CHT]",
872
+ "context_tokens": [
873
+ "[",
874
+ "Airota",
875
+ "]",
876
+ "[",
877
+ "Sousou",
878
+ " ",
879
+ "no",
880
+ " "
881
+ ],
882
+ "context_labels": [
883
+ "O",
884
+ "B-GROUP",
885
+ "O",
886
+ "O",
887
+ "B-TITLE",
888
+ "I-TITLE",
889
+ "I-TITLE",
890
+ "I-TITLE"
891
+ ]
892
+ },
893
+ {
894
+ "type": "B_DIRECT_TO_O",
895
+ "index": 11,
896
+ "prev_label": "B-EPISODE",
897
+ "label": "O",
898
+ "token": "[1080p AVC AAC]",
899
+ "row": 9,
900
+ "file_id": 9,
901
+ "filename": "[Airota][Sousou no Frieren][29][1080p AVC AAC][CHT]",
902
+ "context_tokens": [
903
+ "no",
904
+ " ",
905
+ "Frieren",
906
+ "]",
907
+ "[29]",
908
+ "[1080p AVC AAC]",
909
+ "[CHT]"
910
+ ],
911
+ "context_labels": [
912
+ "I-TITLE",
913
+ "I-TITLE",
914
+ "I-TITLE",
915
+ "O",
916
+ "B-EPISODE",
917
+ "O",
918
+ "B-SOURCE"
919
+ ]
920
+ },
921
+ {
922
+ "type": "B_DIRECT_TO_O",
923
+ "index": 2,
924
+ "prev_label": "B-GROUP",
925
+ "label": "O",
926
+ "token": "]",
927
+ "row": 10,
928
+ "file_id": 10,
929
+ "filename": "[Airota][Sousou no Frieren][30][1080p AVC AAC][CHT]",
930
+ "context_tokens": [
931
+ "[",
932
+ "Airota",
933
+ "]",
934
+ "[",
935
+ "Sousou",
936
+ " ",
937
+ "no",
938
+ " "
939
+ ],
940
+ "context_labels": [
941
+ "O",
942
+ "B-GROUP",
943
+ "O",
944
+ "O",
945
+ "B-TITLE",
946
+ "I-TITLE",
947
+ "I-TITLE",
948
+ "I-TITLE"
949
+ ]
950
+ },
951
+ {
952
+ "type": "B_DIRECT_TO_O",
953
+ "index": 11,
954
+ "prev_label": "B-EPISODE",
955
+ "label": "O",
956
+ "token": "[1080p AVC AAC]",
957
+ "row": 10,
958
+ "file_id": 10,
959
+ "filename": "[Airota][Sousou no Frieren][30][1080p AVC AAC][CHT]",
960
+ "context_tokens": [
961
+ "no",
962
+ " ",
963
+ "Frieren",
964
+ "]",
965
+ "[30]",
966
+ "[1080p AVC AAC]",
967
+ "[CHT]"
968
+ ],
969
+ "context_labels": [
970
+ "I-TITLE",
971
+ "I-TITLE",
972
+ "I-TITLE",
973
+ "O",
974
+ "B-EPISODE",
975
+ "O",
976
+ "B-SOURCE"
977
+ ]
978
+ },
979
+ {
980
+ "type": "B_DIRECT_TO_O",
981
+ "index": 2,
982
+ "prev_label": "B-GROUP",
983
+ "label": "O",
984
+ "token": "]",
985
+ "row": 11,
986
+ "file_id": 11,
987
+ "filename": "[Airota][Sousou no Frieren][31][1080p AVC AAC][CHT]",
988
+ "context_tokens": [
989
+ "[",
990
+ "Airota",
991
+ "]",
992
+ "[",
993
+ "Sousou",
994
+ " ",
995
+ "no",
996
+ " "
997
+ ],
998
+ "context_labels": [
999
+ "O",
1000
+ "B-GROUP",
1001
+ "O",
1002
+ "O",
1003
+ "B-TITLE",
1004
+ "I-TITLE",
1005
+ "I-TITLE",
1006
+ "I-TITLE"
1007
+ ]
1008
+ }
1009
+ ]
1010
+ ```
1011
+
1012
+ ## Tokenizer Split And Alignment
1013
+
1014
+ ### Dataset tokens vs selected tokenizer mismatches
1015
+ ```json
1016
+ [
1017
+ {
1018
+ "file_id": 2,
1019
+ "filename": "[LoliHouse] Maid-san wa Taberu Dake - 07 [WebRip 1080p HEVC-10bit AAC ASSx2]",
1020
+ "common_prefix": 0,
1021
+ "dataset_tokens": [
1022
+ "[",
1023
+ "LoliHouse",
1024
+ "]",
1025
+ " ",
1026
+ "Maid",
1027
+ "-",
1028
+ "san",
1029
+ " ",
1030
+ "wa",
1031
+ " ",
1032
+ "Taberu",
1033
+ " ",
1034
+ "Dake",
1035
+ " ",
1036
+ "-",
1037
+ " ",
1038
+ "07",
1039
+ " ",
1040
+ "[WebRip 1080p HEVC-10bit AAC ASSx2]"
1041
+ ],
1042
+ "tokenizer_tokens": [
1043
+ "[LoliHouse]",
1044
+ " ",
1045
+ "Maid",
1046
+ "-",
1047
+ "san",
1048
+ " ",
1049
+ "wa",
1050
+ " ",
1051
+ "Taberu",
1052
+ " ",
1053
+ "Dake",
1054
+ " ",
1055
+ "-",
1056
+ " ",
1057
+ "07",
1058
+ " ",
1059
+ "[WebRip 1080p HEVC-10bit AAC ASSx2]"
1060
+ ],
1061
+ "dataset_len": 19,
1062
+ "tokenizer_len": 17
1063
+ },
1064
+ {
1065
+ "file_id": 3,
1066
+ "filename": "[ANi] 異世界悠閒農家 2 - 06 [1080P][Baha][WEB-DL][AAC AVC][CHT]",
1067
+ "common_prefix": 0,
1068
+ "dataset_tokens": [
1069
+ "[",
1070
+ "ANi",
1071
+ "]",
1072
+ " ",
1073
+ "異",
1074
+ "世",
1075
+ "界",
1076
+ "悠",
1077
+ "閒",
1078
+ "農",
1079
+ "家",
1080
+ " ",
1081
+ "2",
1082
+ " ",
1083
+ "-",
1084
+ " ",
1085
+ "06",
1086
+ " ",
1087
+ "[1080P]",
1088
+ "[Baha]",
1089
+ "[WEB-DL]",
1090
+ "[AAC AVC]",
1091
+ "[CHT]"
1092
+ ],
1093
+ "tokenizer_tokens": [
1094
+ "[ANi]",
1095
+ " ",
1096
+ "異",
1097
+ "世",
1098
+ "界",
1099
+ "悠",
1100
+ "閒",
1101
+ "農",
1102
+ "家",
1103
+ " ",
1104
+ "2",
1105
+ " ",
1106
+ "-",
1107
+ " ",
1108
+ "06",
1109
+ " ",
1110
+ "[1080P]",
1111
+ "[Baha]",
1112
+ "[WEB-DL]",
1113
+ "[AAC AVC]",
1114
+ "[CHT]"
1115
+ ],
1116
+ "dataset_len": 23,
1117
+ "tokenizer_len": 21
1118
+ },
1119
+ {
1120
+ "file_id": 4,
1121
+ "filename": "[ANi] 木頭風紀委員和迷你裙 JK 的故事 - 06 [1080P][Baha][WEB-DL][AAC AVC][CHT]",
1122
+ "common_prefix": 0,
1123
+ "dataset_tokens": [
1124
+ "[",
1125
+ "ANi",
1126
+ "]",
1127
+ " ",
1128
+ "木",
1129
+ "頭",
1130
+ "風",
1131
+ "紀",
1132
+ "委",
1133
+ "員",
1134
+ "和",
1135
+ "迷",
1136
+ "你",
1137
+ "裙",
1138
+ " ",
1139
+ "JK",
1140
+ " ",
1141
+ "的",
1142
+ "故",
1143
+ "事",
1144
+ " ",
1145
+ "-",
1146
+ " ",
1147
+ "06",
1148
+ " ",
1149
+ "[1080P]",
1150
+ "[Baha]",
1151
+ "[WEB-DL]",
1152
+ "[AAC AVC]",
1153
+ "[CHT]"
1154
+ ],
1155
+ "tokenizer_tokens": [
1156
+ "[ANi]",
1157
+ " ",
1158
+ "木",
1159
+ "頭",
1160
+ "風",
1161
+ "紀",
1162
+ "委",
1163
+ "員",
1164
+ "和",
1165
+ "迷",
1166
+ "你",
1167
+ "裙",
1168
+ " ",
1169
+ "JK",
1170
+ " ",
1171
+ "的",
1172
+ "故",
1173
+ "事",
1174
+ " ",
1175
+ "-",
1176
+ " ",
1177
+ "06",
1178
+ " ",
1179
+ "[1080P]",
1180
+ "[Baha]",
1181
+ "[WEB-DL]",
1182
+ "[AAC AVC]",
1183
+ "[CHT]"
1184
+ ],
1185
+ "dataset_len": 30,
1186
+ "tokenizer_len": 28
1187
+ },
1188
+ {
1189
+ "file_id": 5,
1190
+ "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][05][1080P][GB][MP4]",
1191
+ "common_prefix": 0,
1192
+ "dataset_tokens": [
1193
+ "[",
1194
+ "KissSub",
1195
+ "]",
1196
+ "[",
1197
+ "Shunkashuutou",
1198
+ " ",
1199
+ "Daikousha",
1200
+ " ",
1201
+ "-",
1202
+ " ",
1203
+ "Haru",
1204
+ " ",
1205
+ "no",
1206
+ " ",
1207
+ "Mai",
1208
+ "]",
1209
+ "[05]",
1210
+ "[1080P]",
1211
+ "[GB]",
1212
+ "[MP4]"
1213
+ ],
1214
+ "tokenizer_tokens": [
1215
+ "[KissSub]",
1216
+ "[Shunkashuutou Daikousha - Haru no Mai]",
1217
+ "[05]",
1218
+ "[1080P]",
1219
+ "[GB]",
1220
+ "[MP4]"
1221
+ ],
1222
+ "dataset_len": 20,
1223
+ "tokenizer_len": 6
1224
+ },
1225
+ {
1226
+ "file_id": 6,
1227
+ "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][06][1080P][GB][MP4]",
1228
+ "common_prefix": 0,
1229
+ "dataset_tokens": [
1230
+ "[",
1231
+ "KissSub",
1232
+ "]",
1233
+ "[",
1234
+ "Shunkashuutou",
1235
+ " ",
1236
+ "Daikousha",
1237
+ " ",
1238
+ "-",
1239
+ " ",
1240
+ "Haru",
1241
+ " ",
1242
+ "no",
1243
+ " ",
1244
+ "Mai",
1245
+ "]",
1246
+ "[06]",
1247
+ "[1080P]",
1248
+ "[GB]",
1249
+ "[MP4]"
1250
+ ],
1251
+ "tokenizer_tokens": [
1252
+ "[KissSub]",
1253
+ "[Shunkashuutou Daikousha - Haru no Mai]",
1254
+ "[06]",
1255
+ "[1080P]",
1256
+ "[GB]",
1257
+ "[MP4]"
1258
+ ],
1259
+ "dataset_len": 20,
1260
+ "tokenizer_len": 6
1261
+ },
1262
+ {
1263
+ "file_id": 7,
1264
+ "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][06][1080P][BIG5][MP4]",
1265
+ "common_prefix": 0,
1266
+ "dataset_tokens": [
1267
+ "[",
1268
+ "KissSub",
1269
+ "]",
1270
+ "[",
1271
+ "Shunkashuutou",
1272
+ " ",
1273
+ "Daikousha",
1274
+ " ",
1275
+ "-",
1276
+ " ",
1277
+ "Haru",
1278
+ " ",
1279
+ "no",
1280
+ " ",
1281
+ "Mai",
1282
+ "]",
1283
+ "[06]",
1284
+ "[1080P]",
1285
+ "[BIG5]",
1286
+ "[MP4]"
1287
+ ],
1288
+ "tokenizer_tokens": [
1289
+ "[KissSub]",
1290
+ "[Shunkashuutou Daikousha - Haru no Mai]",
1291
+ "[06]",
1292
+ "[1080P]",
1293
+ "[BIG5]",
1294
+ "[MP4]"
1295
+ ],
1296
+ "dataset_len": 20,
1297
+ "tokenizer_len": 6
1298
+ },
1299
+ {
1300
+ "file_id": 8,
1301
+ "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][05][1080P][BIG5][MP4]",
1302
+ "common_prefix": 0,
1303
+ "dataset_tokens": [
1304
+ "[",
1305
+ "KissSub",
1306
+ "]",
1307
+ "[",
1308
+ "Shunkashuutou",
1309
+ " ",
1310
+ "Daikousha",
1311
+ " ",
1312
+ "-",
1313
+ " ",
1314
+ "Haru",
1315
+ " ",
1316
+ "no",
1317
+ " ",
1318
+ "Mai",
1319
+ "]",
1320
+ "[05]",
1321
+ "[1080P]",
1322
+ "[BIG5]",
1323
+ "[MP4]"
1324
+ ],
1325
+ "tokenizer_tokens": [
1326
+ "[KissSub]",
1327
+ "[Shunkashuutou Daikousha - Haru no Mai]",
1328
+ "[05]",
1329
+ "[1080P]",
1330
+ "[BIG5]",
1331
+ "[MP4]"
1332
+ ],
1333
+ "dataset_len": 20,
1334
+ "tokenizer_len": 6
1335
+ },
1336
+ {
1337
+ "file_id": 9,
1338
+ "filename": "[Airota][Sousou no Frieren][29][1080p AVC AAC][CHT]",
1339
+ "common_prefix": 0,
1340
+ "dataset_tokens": [
1341
+ "[",
1342
+ "Airota",
1343
+ "]",
1344
+ "[",
1345
+ "Sousou",
1346
+ " ",
1347
+ "no",
1348
+ " ",
1349
+ "Frieren",
1350
+ "]",
1351
+ "[29]",
1352
+ "[1080p AVC AAC]",
1353
+ "[CHT]"
1354
+ ],
1355
+ "tokenizer_tokens": [
1356
+ "[Airota]",
1357
+ "[Sousou no Frieren]",
1358
+ "[29]",
1359
+ "[1080p AVC AAC]",
1360
+ "[CHT]"
1361
+ ],
1362
+ "dataset_len": 13,
1363
+ "tokenizer_len": 5
1364
+ },
1365
+ {
1366
+ "file_id": 10,
1367
+ "filename": "[Airota][Sousou no Frieren][30][1080p AVC AAC][CHT]",
1368
+ "common_prefix": 0,
1369
+ "dataset_tokens": [
1370
+ "[",
1371
+ "Airota",
1372
+ "]",
1373
+ "[",
1374
+ "Sousou",
1375
+ " ",
1376
+ "no",
1377
+ " ",
1378
+ "Frieren",
1379
+ "]",
1380
+ "[30]",
1381
+ "[1080p AVC AAC]",
1382
+ "[CHT]"
1383
+ ],
1384
+ "tokenizer_tokens": [
1385
+ "[Airota]",
1386
+ "[Sousou no Frieren]",
1387
+ "[30]",
1388
+ "[1080p AVC AAC]",
1389
+ "[CHT]"
1390
+ ],
1391
+ "dataset_len": 13,
1392
+ "tokenizer_len": 5
1393
+ },
1394
+ {
1395
+ "file_id": 11,
1396
+ "filename": "[Airota][Sousou no Frieren][31][1080p AVC AAC][CHT]",
1397
+ "common_prefix": 0,
1398
+ "dataset_tokens": [
1399
+ "[",
1400
+ "Airota",
1401
+ "]",
1402
+ "[",
1403
+ "Sousou",
1404
+ " ",
1405
+ "no",
1406
+ " ",
1407
+ "Frieren",
1408
+ "]",
1409
+ "[31]",
1410
+ "[1080p AVC AAC]",
1411
+ "[CHT]"
1412
+ ],
1413
+ "tokenizer_tokens": [
1414
+ "[Airota]",
1415
+ "[Sousou no Frieren]",
1416
+ "[31]",
1417
+ "[1080p AVC AAC]",
1418
+ "[CHT]"
1419
+ ],
1420
+ "dataset_len": 13,
1421
+ "tokenizer_len": 5
1422
+ }
1423
+ ]
1424
+ ```
1425
+
1426
+ ### Split examples
1427
+ ```json
1428
+ [
1429
+ {
1430
+ "file_id": 1,
1431
+ "filename": "Witch.Hat.Atelier.S01E07.1080p.NF.WEB-DL.JPN.AAC2.0.H.264.MSubs-ToonsHub",
1432
+ "dataset_tokens": [
1433
+ "Witch",
1434
+ ".",
1435
+ "Hat",
1436
+ ".",
1437
+ "Atelier",
1438
+ ".",
1439
+ "S01",
1440
+ "E07",
1441
+ ".",
1442
+ "1080p",
1443
+ ".",
1444
+ "NF",
1445
+ ".",
1446
+ "WEB-DL",
1447
+ ".",
1448
+ "JP",
1449
+ "N",
1450
+ ".",
1451
+ "AAC",
1452
+ "2",
1453
+ ".",
1454
+ "0",
1455
+ ".",
1456
+ "H.264",
1457
+ ".",
1458
+ "MSubs",
1459
+ "-",
1460
+ "ToonsHub"
1461
+ ],
1462
+ "diagnosed_tokens": [
1463
+ "Witch",
1464
+ ".",
1465
+ "Hat",
1466
+ ".",
1467
+ "Atelier",
1468
+ ".",
1469
+ "S01",
1470
+ "E07",
1471
+ ".",
1472
+ "1080p",
1473
+ ".",
1474
+ "NF",
1475
+ ".",
1476
+ "WEB-DL",
1477
+ ".",
1478
+ "JP",
1479
+ "N",
1480
+ ".",
1481
+ "AAC",
1482
+ "2",
1483
+ ".",
1484
+ "0",
1485
+ ".",
1486
+ "H.264",
1487
+ ".",
1488
+ "MSubs",
1489
+ "-",
1490
+ "ToonsHub"
1491
+ ],
1492
+ "regex_tokens": [
1493
+ "Witch",
1494
+ ".",
1495
+ "Hat",
1496
+ ".",
1497
+ "Atelier",
1498
+ ".",
1499
+ "S01",
1500
+ "E07",
1501
+ ".",
1502
+ "1080p",
1503
+ ".",
1504
+ "NF",
1505
+ ".",
1506
+ "WEB-DL",
1507
+ ".",
1508
+ "JP",
1509
+ "N",
1510
+ ".",
1511
+ "AAC",
1512
+ "2",
1513
+ ".",
1514
+ "0",
1515
+ ".",
1516
+ "H.264",
1517
+ ".",
1518
+ "MSubs",
1519
+ "-",
1520
+ "ToonsHub"
1521
+ ],
1522
+ "char_tokens": [
1523
+ "W",
1524
+ "i",
1525
+ "t",
1526
+ "c",
1527
+ "h",
1528
+ ".",
1529
+ "H",
1530
+ "a",
1531
+ "t",
1532
+ ".",
1533
+ "A",
1534
+ "t",
1535
+ "e",
1536
+ "l",
1537
+ "i",
1538
+ "e",
1539
+ "r",
1540
+ ".",
1541
+ "S",
1542
+ "0",
1543
+ "1",
1544
+ "E",
1545
+ "0",
1546
+ "7",
1547
+ ".",
1548
+ "1",
1549
+ "0",
1550
+ "8",
1551
+ "0",
1552
+ "p",
1553
+ ".",
1554
+ "N",
1555
+ "F",
1556
+ ".",
1557
+ "W",
1558
+ "E",
1559
+ "B",
1560
+ "-",
1561
+ "D",
1562
+ "L",
1563
+ ".",
1564
+ "J",
1565
+ "P",
1566
+ "N",
1567
+ ".",
1568
+ "A",
1569
+ "A",
1570
+ "C",
1571
+ "2",
1572
+ ".",
1573
+ "0",
1574
+ ".",
1575
+ "H",
1576
+ ".",
1577
+ "2",
1578
+ "6",
1579
+ "4",
1580
+ ".",
1581
+ "M",
1582
+ "S",
1583
+ "u",
1584
+ "b",
1585
+ "s",
1586
+ "-",
1587
+ "T",
1588
+ "o",
1589
+ "o",
1590
+ "n",
1591
+ "s",
1592
+ "H",
1593
+ "u",
1594
+ "b"
1595
+ ]
1596
+ },
1597
+ {
1598
+ "file_id": 2,
1599
+ "filename": "[LoliHouse] Maid-san wa Taberu Dake - 07 [WebRip 1080p HEVC-10bit AAC ASSx2]",
1600
+ "dataset_tokens": [
1601
+ "[",
1602
+ "LoliHouse",
1603
+ "]",
1604
+ " ",
1605
+ "Maid",
1606
+ "-",
1607
+ "san",
1608
+ " ",
1609
+ "wa",
1610
+ " ",
1611
+ "Taberu",
1612
+ " ",
1613
+ "Dake",
1614
+ " ",
1615
+ "-",
1616
+ " ",
1617
+ "07",
1618
+ " ",
1619
+ "[WebRip 1080p HEVC-10bit AAC ASSx2]"
1620
+ ],
1621
+ "diagnosed_tokens": [
1622
+ "[LoliHouse]",
1623
+ " ",
1624
+ "Maid",
1625
+ "-",
1626
+ "san",
1627
+ " ",
1628
+ "wa",
1629
+ " ",
1630
+ "Taberu",
1631
+ " ",
1632
+ "Dake",
1633
+ " ",
1634
+ "-",
1635
+ " ",
1636
+ "07",
1637
+ " ",
1638
+ "[WebRip 1080p HEVC-10bit AAC ASSx2]"
1639
+ ],
1640
+ "regex_tokens": [
1641
+ "[LoliHouse]",
1642
+ " ",
1643
+ "Maid",
1644
+ "-",
1645
+ "san",
1646
+ " ",
1647
+ "wa",
1648
+ " ",
1649
+ "Taberu",
1650
+ " ",
1651
+ "Dake",
1652
+ " ",
1653
+ "-",
1654
+ " ",
1655
+ "07",
1656
+ " ",
1657
+ "[WebRip 1080p HEVC-10bit AAC ASSx2]"
1658
+ ],
1659
+ "char_tokens": [
1660
+ "[",
1661
+ "L",
1662
+ "o",
1663
+ "l",
1664
+ "i",
1665
+ "H",
1666
+ "o",
1667
+ "u",
1668
+ "s",
1669
+ "e",
1670
+ "]",
1671
+ " ",
1672
+ "M",
1673
+ "a",
1674
+ "i",
1675
+ "d",
1676
+ "-",
1677
+ "s",
1678
+ "a",
1679
+ "n",
1680
+ " ",
1681
+ "w",
1682
+ "a",
1683
+ " ",
1684
+ "T",
1685
+ "a",
1686
+ "b",
1687
+ "e",
1688
+ "r",
1689
+ "u",
1690
+ " ",
1691
+ "D",
1692
+ "a",
1693
+ "k",
1694
+ "e",
1695
+ " ",
1696
+ "-",
1697
+ " ",
1698
+ "0",
1699
+ "7",
1700
+ " ",
1701
+ "[",
1702
+ "W",
1703
+ "e",
1704
+ "b",
1705
+ "R",
1706
+ "i",
1707
+ "p",
1708
+ " ",
1709
+ "1",
1710
+ "0",
1711
+ "8",
1712
+ "0",
1713
+ "p",
1714
+ " ",
1715
+ "H",
1716
+ "E",
1717
+ "V",
1718
+ "C",
1719
+ "-",
1720
+ "1",
1721
+ "0",
1722
+ "b",
1723
+ "i",
1724
+ "t",
1725
+ " ",
1726
+ "A",
1727
+ "A",
1728
+ "C",
1729
+ " ",
1730
+ "A",
1731
+ "S",
1732
+ "S",
1733
+ "x",
1734
+ "2",
1735
+ "]"
1736
+ ]
1737
+ },
1738
+ {
1739
+ "file_id": 3,
1740
+ "filename": "[ANi] 異世界悠閒農家 2 - 06 [1080P][Baha][WEB-DL][AAC AVC][CHT]",
1741
+ "dataset_tokens": [
1742
+ "[",
1743
+ "ANi",
1744
+ "]",
1745
+ " ",
1746
+ "異",
1747
+ "世",
1748
+ "界",
1749
+ "悠",
1750
+ "閒",
1751
+ "農",
1752
+ "家",
1753
+ " ",
1754
+ "2",
1755
+ " ",
1756
+ "-",
1757
+ " ",
1758
+ "06",
1759
+ " ",
1760
+ "[1080P]",
1761
+ "[Baha]",
1762
+ "[WEB-DL]",
1763
+ "[AAC AVC]",
1764
+ "[CHT]"
1765
+ ],
1766
+ "diagnosed_tokens": [
1767
+ "[ANi]",
1768
+ " ",
1769
+ "異",
1770
+ "世",
1771
+ "界",
1772
+ "悠",
1773
+ "閒",
1774
+ "農",
1775
+ "家",
1776
+ " ",
1777
+ "2",
1778
+ " ",
1779
+ "-",
1780
+ " ",
1781
+ "06",
1782
+ " ",
1783
+ "[1080P]",
1784
+ "[Baha]",
1785
+ "[WEB-DL]",
1786
+ "[AAC AVC]",
1787
+ "[CHT]"
1788
+ ],
1789
+ "regex_tokens": [
1790
+ "[ANi]",
1791
+ " ",
1792
+ "異",
1793
+ "世",
1794
+ "界",
1795
+ "悠",
1796
+ "閒",
1797
+ "農",
1798
+ "家",
1799
+ " ",
1800
+ "2",
1801
+ " ",
1802
+ "-",
1803
+ " ",
1804
+ "06",
1805
+ " ",
1806
+ "[1080P]",
1807
+ "[Baha]",
1808
+ "[WEB-DL]",
1809
+ "[AAC AVC]",
1810
+ "[CHT]"
1811
+ ],
1812
+ "char_tokens": [
1813
+ "[",
1814
+ "A",
1815
+ "N",
1816
+ "i",
1817
+ "]",
1818
+ " ",
1819
+ "異",
1820
+ "世",
1821
+ "界",
1822
+ "悠",
1823
+ "閒",
1824
+ "農",
1825
+ "家",
1826
+ " ",
1827
+ "2",
1828
+ " ",
1829
+ "-",
1830
+ " ",
1831
+ "0",
1832
+ "6",
1833
+ " ",
1834
+ "[",
1835
+ "1",
1836
+ "0",
1837
+ "8",
1838
+ "0",
1839
+ "P",
1840
+ "]",
1841
+ "[",
1842
+ "B",
1843
+ "a",
1844
+ "h",
1845
+ "a",
1846
+ "]",
1847
+ "[",
1848
+ "W",
1849
+ "E",
1850
+ "B",
1851
+ "-",
1852
+ "D",
1853
+ "L",
1854
+ "]",
1855
+ "[",
1856
+ "A",
1857
+ "A",
1858
+ "C",
1859
+ " ",
1860
+ "A",
1861
+ "V",
1862
+ "C",
1863
+ "]",
1864
+ "[",
1865
+ "C",
1866
+ "H",
1867
+ "T",
1868
+ "]"
1869
+ ]
1870
+ },
1871
+ {
1872
+ "file_id": 4,
1873
+ "filename": "[ANi] 木頭風紀委員和迷你裙 JK 的故事 - 06 [1080P][Baha][WEB-DL][AAC AVC][CHT]",
1874
+ "dataset_tokens": [
1875
+ "[",
1876
+ "ANi",
1877
+ "]",
1878
+ " ",
1879
+ "木",
1880
+ "頭",
1881
+ "風",
1882
+ "紀",
1883
+ "委",
1884
+ "員",
1885
+ "和",
1886
+ "迷",
1887
+ "你",
1888
+ "裙",
1889
+ " ",
1890
+ "JK",
1891
+ " ",
1892
+ "的",
1893
+ "故",
1894
+ "事",
1895
+ " ",
1896
+ "-",
1897
+ " ",
1898
+ "06",
1899
+ " ",
1900
+ "[1080P]",
1901
+ "[Baha]",
1902
+ "[WEB-DL]",
1903
+ "[AAC AVC]",
1904
+ "[CHT]"
1905
+ ],
1906
+ "diagnosed_tokens": [
1907
+ "[ANi]",
1908
+ " ",
1909
+ "木",
1910
+ "頭",
1911
+ "風",
1912
+ "紀",
1913
+ "委",
1914
+ "員",
1915
+ "和",
1916
+ "迷",
1917
+ "你",
1918
+ "裙",
1919
+ " ",
1920
+ "JK",
1921
+ " ",
1922
+ "的",
1923
+ "故",
1924
+ "事",
1925
+ " ",
1926
+ "-",
1927
+ " ",
1928
+ "06",
1929
+ " ",
1930
+ "[1080P]",
1931
+ "[Baha]",
1932
+ "[WEB-DL]",
1933
+ "[AAC AVC]",
1934
+ "[CHT]"
1935
+ ],
1936
+ "regex_tokens": [
1937
+ "[ANi]",
1938
+ " ",
1939
+ "木",
1940
+ "頭",
1941
+ "風",
1942
+ "紀",
1943
+ "委",
1944
+ "員",
1945
+ "和",
1946
+ "迷",
1947
+ "你",
1948
+ "裙",
1949
+ " ",
1950
+ "JK",
1951
+ " ",
1952
+ "的",
1953
+ "故",
1954
+ "事",
1955
+ " ",
1956
+ "-",
1957
+ " ",
1958
+ "06",
1959
+ " ",
1960
+ "[1080P]",
1961
+ "[Baha]",
1962
+ "[WEB-DL]",
1963
+ "[AAC AVC]",
1964
+ "[CHT]"
1965
+ ],
1966
+ "char_tokens": [
1967
+ "[",
1968
+ "A",
1969
+ "N",
1970
+ "i",
1971
+ "]",
1972
+ " ",
1973
+ "木",
1974
+ "頭",
1975
+ "風",
1976
+ "紀",
1977
+ "委",
1978
+ "員",
1979
+ "和",
1980
+ "迷",
1981
+ "你",
1982
+ "裙",
1983
+ " ",
1984
+ "J",
1985
+ "K",
1986
+ " ",
1987
+ "的",
1988
+ "故",
1989
+ "事",
1990
+ " ",
1991
+ "-",
1992
+ " ",
1993
+ "0",
1994
+ "6",
1995
+ " ",
1996
+ "[",
1997
+ "1",
1998
+ "0",
1999
+ "8",
2000
+ "0",
2001
+ "P",
2002
+ "]",
2003
+ "[",
2004
+ "B",
2005
+ "a",
2006
+ "h",
2007
+ "a",
2008
+ "]",
2009
+ "[",
2010
+ "W",
2011
+ "E",
2012
+ "B",
2013
+ "-",
2014
+ "D",
2015
+ "L",
2016
+ "]",
2017
+ "[",
2018
+ "A",
2019
+ "A",
2020
+ "C",
2021
+ " ",
2022
+ "A",
2023
+ "V",
2024
+ "C",
2025
+ "]",
2026
+ "[",
2027
+ "C",
2028
+ "H",
2029
+ "T",
2030
+ "]"
2031
+ ]
2032
+ },
2033
+ {
2034
+ "file_id": 5,
2035
+ "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][05][1080P][GB][MP4]",
2036
+ "dataset_tokens": [
2037
+ "[",
2038
+ "KissSub",
2039
+ "]",
2040
+ "[",
2041
+ "Shunkashuutou",
2042
+ " ",
2043
+ "Daikousha",
2044
+ " ",
2045
+ "-",
2046
+ " ",
2047
+ "Haru",
2048
+ " ",
2049
+ "no",
2050
+ " ",
2051
+ "Mai",
2052
+ "]",
2053
+ "[05]",
2054
+ "[1080P]",
2055
+ "[GB]",
2056
+ "[MP4]"
2057
+ ],
2058
+ "diagnosed_tokens": [
2059
+ "[KissSub]",
2060
+ "[Shunkashuutou Daikousha - Haru no Mai]",
2061
+ "[05]",
2062
+ "[1080P]",
2063
+ "[GB]",
2064
+ "[MP4]"
2065
+ ],
2066
+ "regex_tokens": [
2067
+ "[KissSub]",
2068
+ "[Shunkashuutou Daikousha - Haru no Mai]",
2069
+ "[05]",
2070
+ "[1080P]",
2071
+ "[GB]",
2072
+ "[MP4]"
2073
+ ],
2074
+ "char_tokens": [
2075
+ "[",
2076
+ "K",
2077
+ "i",
2078
+ "s",
2079
+ "s",
2080
+ "S",
2081
+ "u",
2082
+ "b",
2083
+ "]",
2084
+ "[",
2085
+ "S",
2086
+ "h",
2087
+ "u",
2088
+ "n",
2089
+ "k",
2090
+ "a",
2091
+ "s",
2092
+ "h",
2093
+ "u",
2094
+ "u",
2095
+ "t",
2096
+ "o",
2097
+ "u",
2098
+ " ",
2099
+ "D",
2100
+ "a",
2101
+ "i",
2102
+ "k",
2103
+ "o",
2104
+ "u",
2105
+ "s",
2106
+ "h",
2107
+ "a",
2108
+ " ",
2109
+ "-",
2110
+ " ",
2111
+ "H",
2112
+ "a",
2113
+ "r",
2114
+ "u",
2115
+ " ",
2116
+ "n",
2117
+ "o",
2118
+ " ",
2119
+ "M",
2120
+ "a",
2121
+ "i",
2122
+ "]",
2123
+ "[",
2124
+ "0",
2125
+ "5",
2126
+ "]",
2127
+ "[",
2128
+ "1",
2129
+ "0",
2130
+ "8",
2131
+ "0",
2132
+ "P",
2133
+ "]",
2134
+ "[",
2135
+ "G",
2136
+ "B",
2137
+ "]",
2138
+ "[",
2139
+ "M",
2140
+ "P",
2141
+ "4",
2142
+ "]"
2143
+ ]
2144
+ },
2145
+ {
2146
+ "file_id": 6,
2147
+ "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][06][1080P][GB][MP4]",
2148
+ "dataset_tokens": [
2149
+ "[",
2150
+ "KissSub",
2151
+ "]",
2152
+ "[",
2153
+ "Shunkashuutou",
2154
+ " ",
2155
+ "Daikousha",
2156
+ " ",
2157
+ "-",
2158
+ " ",
2159
+ "Haru",
2160
+ " ",
2161
+ "no",
2162
+ " ",
2163
+ "Mai",
2164
+ "]",
2165
+ "[06]",
2166
+ "[1080P]",
2167
+ "[GB]",
2168
+ "[MP4]"
2169
+ ],
2170
+ "diagnosed_tokens": [
2171
+ "[KissSub]",
2172
+ "[Shunkashuutou Daikousha - Haru no Mai]",
2173
+ "[06]",
2174
+ "[1080P]",
2175
+ "[GB]",
2176
+ "[MP4]"
2177
+ ],
2178
+ "regex_tokens": [
2179
+ "[KissSub]",
2180
+ "[Shunkashuutou Daikousha - Haru no Mai]",
2181
+ "[06]",
2182
+ "[1080P]",
2183
+ "[GB]",
2184
+ "[MP4]"
2185
+ ],
2186
+ "char_tokens": [
2187
+ "[",
2188
+ "K",
2189
+ "i",
2190
+ "s",
2191
+ "s",
2192
+ "S",
2193
+ "u",
2194
+ "b",
2195
+ "]",
2196
+ "[",
2197
+ "S",
2198
+ "h",
2199
+ "u",
2200
+ "n",
2201
+ "k",
2202
+ "a",
2203
+ "s",
2204
+ "h",
2205
+ "u",
2206
+ "u",
2207
+ "t",
2208
+ "o",
2209
+ "u",
2210
+ " ",
2211
+ "D",
2212
+ "a",
2213
+ "i",
2214
+ "k",
2215
+ "o",
2216
+ "u",
2217
+ "s",
2218
+ "h",
2219
+ "a",
2220
+ " ",
2221
+ "-",
2222
+ " ",
2223
+ "H",
2224
+ "a",
2225
+ "r",
2226
+ "u",
2227
+ " ",
2228
+ "n",
2229
+ "o",
2230
+ " ",
2231
+ "M",
2232
+ "a",
2233
+ "i",
2234
+ "]",
2235
+ "[",
2236
+ "0",
2237
+ "6",
2238
+ "]",
2239
+ "[",
2240
+ "1",
2241
+ "0",
2242
+ "8",
2243
+ "0",
2244
+ "P",
2245
+ "]",
2246
+ "[",
2247
+ "G",
2248
+ "B",
2249
+ "]",
2250
+ "[",
2251
+ "M",
2252
+ "P",
2253
+ "4",
2254
+ "]"
2255
+ ]
2256
+ },
2257
+ {
2258
+ "file_id": 7,
2259
+ "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][06][1080P][BIG5][MP4]",
2260
+ "dataset_tokens": [
2261
+ "[",
2262
+ "KissSub",
2263
+ "]",
2264
+ "[",
2265
+ "Shunkashuutou",
2266
+ " ",
2267
+ "Daikousha",
2268
+ " ",
2269
+ "-",
2270
+ " ",
2271
+ "Haru",
2272
+ " ",
2273
+ "no",
2274
+ " ",
2275
+ "Mai",
2276
+ "]",
2277
+ "[06]",
2278
+ "[1080P]",
2279
+ "[BIG5]",
2280
+ "[MP4]"
2281
+ ],
2282
+ "diagnosed_tokens": [
2283
+ "[KissSub]",
2284
+ "[Shunkashuutou Daikousha - Haru no Mai]",
2285
+ "[06]",
2286
+ "[1080P]",
2287
+ "[BIG5]",
2288
+ "[MP4]"
2289
+ ],
2290
+ "regex_tokens": [
2291
+ "[KissSub]",
2292
+ "[Shunkashuutou Daikousha - Haru no Mai]",
2293
+ "[06]",
2294
+ "[1080P]",
2295
+ "[BIG5]",
2296
+ "[MP4]"
2297
+ ],
2298
+ "char_tokens": [
2299
+ "[",
2300
+ "K",
2301
+ "i",
2302
+ "s",
2303
+ "s",
2304
+ "S",
2305
+ "u",
2306
+ "b",
2307
+ "]",
2308
+ "[",
2309
+ "S",
2310
+ "h",
2311
+ "u",
2312
+ "n",
2313
+ "k",
2314
+ "a",
2315
+ "s",
2316
+ "h",
2317
+ "u",
2318
+ "u",
2319
+ "t",
2320
+ "o",
2321
+ "u",
2322
+ " ",
2323
+ "D",
2324
+ "a",
2325
+ "i",
2326
+ "k",
2327
+ "o",
2328
+ "u",
2329
+ "s",
2330
+ "h",
2331
+ "a",
2332
+ " ",
2333
+ "-",
2334
+ " ",
2335
+ "H",
2336
+ "a",
2337
+ "r",
2338
+ "u",
2339
+ " ",
2340
+ "n",
2341
+ "o",
2342
+ " ",
2343
+ "M",
2344
+ "a",
2345
+ "i",
2346
+ "]",
2347
+ "[",
2348
+ "0",
2349
+ "6",
2350
+ "]",
2351
+ "[",
2352
+ "1",
2353
+ "0",
2354
+ "8",
2355
+ "0",
2356
+ "P",
2357
+ "]",
2358
+ "[",
2359
+ "B",
2360
+ "I",
2361
+ "G",
2362
+ "5",
2363
+ "]",
2364
+ "[",
2365
+ "M",
2366
+ "P",
2367
+ "4",
2368
+ "]"
2369
+ ]
2370
+ },
2371
+ {
2372
+ "file_id": 8,
2373
+ "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][05][1080P][BIG5][MP4]",
2374
+ "dataset_tokens": [
2375
+ "[",
2376
+ "KissSub",
2377
+ "]",
2378
+ "[",
2379
+ "Shunkashuutou",
2380
+ " ",
2381
+ "Daikousha",
2382
+ " ",
2383
+ "-",
2384
+ " ",
2385
+ "Haru",
2386
+ " ",
2387
+ "no",
2388
+ " ",
2389
+ "Mai",
2390
+ "]",
2391
+ "[05]",
2392
+ "[1080P]",
2393
+ "[BIG5]",
2394
+ "[MP4]"
2395
+ ],
2396
+ "diagnosed_tokens": [
2397
+ "[KissSub]",
2398
+ "[Shunkashuutou Daikousha - Haru no Mai]",
2399
+ "[05]",
2400
+ "[1080P]",
2401
+ "[BIG5]",
2402
+ "[MP4]"
2403
+ ],
2404
+ "regex_tokens": [
2405
+ "[KissSub]",
2406
+ "[Shunkashuutou Daikousha - Haru no Mai]",
2407
+ "[05]",
2408
+ "[1080P]",
2409
+ "[BIG5]",
2410
+ "[MP4]"
2411
+ ],
2412
+ "char_tokens": [
2413
+ "[",
2414
+ "K",
2415
+ "i",
2416
+ "s",
2417
+ "s",
2418
+ "S",
2419
+ "u",
2420
+ "b",
2421
+ "]",
2422
+ "[",
2423
+ "S",
2424
+ "h",
2425
+ "u",
2426
+ "n",
2427
+ "k",
2428
+ "a",
2429
+ "s",
2430
+ "h",
2431
+ "u",
2432
+ "u",
2433
+ "t",
2434
+ "o",
2435
+ "u",
2436
+ " ",
2437
+ "D",
2438
+ "a",
2439
+ "i",
2440
+ "k",
2441
+ "o",
2442
+ "u",
2443
+ "s",
2444
+ "h",
2445
+ "a",
2446
+ " ",
2447
+ "-",
2448
+ " ",
2449
+ "H",
2450
+ "a",
2451
+ "r",
2452
+ "u",
2453
+ " ",
2454
+ "n",
2455
+ "o",
2456
+ " ",
2457
+ "M",
2458
+ "a",
2459
+ "i",
2460
+ "]",
2461
+ "[",
2462
+ "0",
2463
+ "5",
2464
+ "]",
2465
+ "[",
2466
+ "1",
2467
+ "0",
2468
+ "8",
2469
+ "0",
2470
+ "P",
2471
+ "]",
2472
+ "[",
2473
+ "B",
2474
+ "I",
2475
+ "G",
2476
+ "5",
2477
+ "]",
2478
+ "[",
2479
+ "M",
2480
+ "P",
2481
+ "4",
2482
+ "]"
2483
+ ]
2484
+ }
2485
+ ]
2486
+ ```
2487
+
2488
+ ### Vocabulary coverage
2489
+ ```json
2490
+ {
2491
+ "total": 85312,
2492
+ "unk": 5900,
2493
+ "unk_rate": 0.06915791447861966,
2494
+ "top_unk": [
2495
+ [
2496
+ "(BDRip 720p x264)",
2497
+ 66
2498
+ ],
2499
+ [
2500
+ "Partie",
2501
+ 59
2502
+ ],
2503
+ [
2504
+ "incantevole",
2505
+ 54
2506
+ ],
2507
+ [
2508
+ "Muxed",
2509
+ 54
2510
+ ],
2511
+ [
2512
+ "nonscordarmi",
2513
+ 54
2514
+ ],
2515
+ [
2516
+ "NEET",
2517
+ 52
2518
+ ],
2519
+ [
2520
+ "Dousei",
2521
+ 52
2522
+ ],
2523
+ [
2524
+ "[krikoun68]",
2525
+ 52
2526
+ ],
2527
+ [
2528
+ "[Blu-Ray - MUX - 960p - x264 - AC3 ITA-JAP - SUB ITA]",
2529
+ 51
2530
+ ],
2531
+ [
2532
+ "CTR",
2533
+ 45
2534
+ ],
2535
+ [
2536
+ "joseol",
2537
+ 45
2538
+ ],
2539
+ [
2540
+ "e99",
2541
+ 45
2542
+ ],
2543
+ [
2544
+ "(1440x1080 h264 AC3 AAC)",
2545
+ 45
2546
+ ],
2547
+ [
2548
+ "VERS",
2549
+ 37
2550
+ ],
2551
+ [
2552
+ "脙",
2553
+ 37
2554
+ ],
2555
+ [
2556
+ "Shunkashuutou",
2557
+ 36
2558
+ ],
2559
+ [
2560
+ "Daikousha",
2561
+ 36
2562
+ ],
2563
+ [
2564
+ "houbatsu",
2565
+ 36
2566
+ ],
2567
+ [
2568
+ "DEFINITIVA",
2569
+ 36
2570
+ ],
2571
+ [
2572
+ "Crash",
2573
+ 35
2574
+ ],
2575
+ [
2576
+ "Realm",
2577
+ 31
2578
+ ],
2579
+ [
2580
+ "UHD",
2581
+ 31
2582
+ ],
2583
+ [
2584
+ "[BDrip 1080P HEVC-10bit AAC]",
2585
+ 29
2586
+ ],
2587
+ [
2588
+ "Choroi",
2589
+ 28
2590
+ ],
2591
+ [
2592
+ "완",
2593
+ 28
2594
+ ]
2595
+ ]
2596
+ }
2597
+ ```
2598
+
2599
+ ## Train Inference Tokenizer Comparison
2600
+
2601
+ - Model dir: `checkpoints\dmhy-finetune\final`
2602
+ - Model tokenizer variant: `regex`
2603
+ - Dataset tokenizer variant: `regex`
2604
+ - Diagnostic tokenizer variant: `regex`
2605
+ - Model tokenizer vocab size: 3,000
2606
+ - Diagnostic tokenizer vocab size: 8,000
2607
+
2608
+ If dataset and model tokenizer variants differ, validation loss can be low while real inference sees different token IDs and boundaries.
2609
+
2610
+ ## Model Confusion Analysis
2611
+
2612
+ - Evaluated samples: 128
2613
+ - Entity precision: 0.9568
2614
+ - Entity recall: 0.9530
2615
+ - Entity F1: 0.9549
2616
+
2617
+ ### Boundary error classes
2618
+ - `B-boundary`: 26 (56.52%)
2619
+ - `entity-type`: 20 (43.48%)
2620
+
2621
+ ### Top token-label confusions
2622
+ | true | pred | count |
2623
+ | --- | --- | --- |
2624
+ | O | I-TITLE | 17 |
2625
+ | O | B-EPISODE | 6 |
2626
+ | B-SOURCE | O | 4 |
2627
+ | I-TITLE | O | 3 |
2628
+ | B-EPISODE | O | 3 |
2629
+ | B-SEASON | O | 2 |
2630
+ | B-RESOLUTION | B-SOURCE | 2 |
2631
+ | B-EPISODE | I-TITLE | 2 |
2632
+ | O | B-TITLE | 2 |
2633
+ | B-TITLE | I-TITLE | 2 |
2634
+ | O | B-SOURCE | 1 |
2635
+ | B-SEASON | I-TITLE | 1 |
2636
+ | O | B-SEASON | 1 |
2637
+
2638
+ ### Top entity-type confusions
2639
+ | true | pred | count |
2640
+ | --- | --- | --- |
2641
+ | O | TITLE | 19 |
2642
+ | O | EPISODE | 6 |
2643
+ | SOURCE | O | 4 |
2644
+ | TITLE | O | 3 |
2645
+ | EPISODE | O | 3 |
2646
+ | SEASON | O | 2 |
2647
+ | RESOLUTION | SOURCE | 2 |
2648
+ | EPISODE | TITLE | 2 |
2649
+ | O | SOURCE | 1 |
2650
+ | SEASON | TITLE | 1 |
2651
+ | O | SEASON | 1 |
2652
+
2653
+ ### Seqeval report
2654
+ ```text
2655
+ precision recall f1-score support
2656
+
2657
+ EPISODE 0.9535 0.9609 0.9572 128
2658
+ GROUP 1.0000 1.0000 1.0000 53
2659
+ RESOLUTION 1.0000 0.9545 0.9767 44
2660
+ SEASON 0.9630 0.8966 0.9286 29
2661
+ SOURCE 0.9703 0.9608 0.9655 102
2662
+ SPECIAL 1.0000 1.0000 1.0000 5
2663
+ TITLE 0.9211 0.9333 0.9272 150
2664
+
2665
+ micro avg 0.9568 0.9530 0.9549 511
2666
+ macro avg 0.9725 0.9580 0.9650 511
2667
+ weighted avg 0.9571 0.9530 0.9550 511
2668
+
2669
+ ```
2670
+
2671
+ ## Recommended Pipeline
2672
+
2673
+ 1. Use one tokenizer variant end to end and save it in the checkpoint metadata.
2674
+ 2. Prefer char-level or a deterministic hybrid tokenizer for DMHY filenames; avoid generic subword tokenization for labels.
2675
+ 3. For char-level runs, use `--tokenizer char --max-seq-length 128` with `vocab.char.json`.
2676
+ 4. Add CRF decoding or constrained BIO decoding so illegal I-X transitions and impossible boundary jumps are blocked.
2677
+ 5. Keep rule-assisted post-processing for high-confidence structural anchors: leading group bracket, ` - 07`, `S01E07`, source, and resolution.
2678
+ 6. Track entity-level F1 and field exact-match on real filenames; do not accept low validation loss alone.
inference.py CHANGED
@@ -14,7 +14,7 @@ import json
14
  import os
15
  import re
16
  import sys
17
- from typing import Dict, List, Optional
18
 
19
  import torch
20
  from transformers import BertForTokenClassification
@@ -70,58 +70,149 @@ def extract_resolution(text: str) -> Optional[str]:
70
  return clean if clean else None
71
 
72
 
 
 
 
 
 
 
 
 
 
73
  def trim_decorations(text: str) -> str:
74
  """Trim outer release brackets from an extracted entity."""
75
  return text.strip().strip("[]()【】《》()").strip()
76
 
77
 
78
- def postprocess(tokens: List[str], labels: List[str]) -> Dict:
79
- """
80
- Convert BIO-labeled tokens into structured metadata.
 
 
 
 
 
81
 
82
- Merges consecutive B- / I- tokens of the same entity type,
83
- then extracts structured fields.
 
 
 
 
84
  """
85
- result: Dict = {
86
- "title": None,
87
- "season": None,
88
- "episode": None,
89
- "group": None,
90
- "resolution": None,
91
- "source": None,
92
- "special": None,
93
- }
94
 
95
- # Merge consecutive B- / I- tokens into entities
96
- entities: List[tuple] = []
 
 
97
  current_entity: Optional[str] = None
98
  current_tokens: List[str] = []
99
 
100
  for token, label in zip(tokens, labels):
101
  if label.startswith("B-"):
102
- # Finalize previous entity
103
  if current_entity:
104
- entities.append((current_entity, "".join(current_tokens)))
105
- current_entity = label[2:] # Remove "B-"
106
  current_tokens = [token]
107
  elif label.startswith("I-"):
108
  entity_type = label[2:]
109
  if current_entity == entity_type:
110
  current_tokens.append(token)
111
  else:
112
- # Orphaned I- — start new entity
113
  if current_entity:
114
- entities.append((current_entity, "".join(current_tokens)))
115
  current_entity = entity_type
116
  current_tokens = [token]
117
- else: # O
118
  if current_entity:
119
- entities.append((current_entity, "".join(current_tokens)))
120
  current_entity = None
121
  current_tokens = []
122
 
123
  if current_entity:
124
- entities.append((current_entity, "".join(current_tokens)))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
  # Fill result
127
  for entity_type, text in entities:
@@ -163,15 +254,177 @@ def postprocess(tokens: List[str], labels: List[str]) -> Dict:
163
  if (trimmed := trim_decorations(f))
164
  )
165
 
 
 
 
166
  return result
167
 
168
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  def parse_filename(
170
  filename: str,
171
  model: BertForTokenClassification,
172
  tokenizer: AnimeTokenizer,
173
  id2label: Dict[int, str],
174
  max_length: int = 64,
 
 
 
175
  ) -> Dict:
176
  """
177
  Parse an anime filename and extract structured metadata.
@@ -195,6 +448,8 @@ def parse_filename(
195
 
196
  # Convert to input IDs
197
  input_ids = tokenizer.convert_tokens_to_ids(tokens)
 
 
198
 
199
  # Add special tokens
200
  input_ids = [tokenizer.cls_token_id] + input_ids + [tokenizer.sep_token_id]
@@ -202,8 +457,8 @@ def parse_filename(
202
 
203
  # Truncate if needed
204
  if len(input_ids) > max_length:
205
- input_ids = input_ids[:max_length]
206
- attention_mask = attention_mask[:max_length]
207
 
208
  # Pad
209
  pad_len = max_length - len(input_ids)
@@ -216,10 +471,6 @@ def parse_filename(
216
  input_tensor = torch.tensor([input_ids], device=device)
217
  mask_tensor = torch.tensor([attention_mask], device=device)
218
 
219
- with torch.no_grad():
220
- logits = model(input_ids=input_tensor, attention_mask=mask_tensor).logits
221
- predictions = torch.argmax(logits, dim=-1)[0]
222
-
223
  # Remove special token predictions
224
  # Count real tokens used (minus CLS/SEP)
225
  real_token_count = len(tokens)
@@ -230,11 +481,62 @@ def parse_filename(
230
  "group": None, "resolution": None, "source": None,
231
  "special": None}
232
 
233
- pred_labels = predictions[1:1 + available].tolist()
 
 
 
 
 
 
 
 
 
 
 
 
 
234
  label_strings = [id2label.get(p, "O") for p in pred_labels]
235
 
236
  # Post-process
237
- return postprocess(tokens[:available], label_strings)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
 
239
 
240
  def main():
@@ -248,6 +550,12 @@ def main():
248
  help="Tokenizer variant override. Defaults to checkpoint metadata")
249
  parser.add_argument("--max-length", type=int, default=64,
250
  help="Maximum sequence length")
 
 
 
 
 
 
251
  args = parser.parse_args()
252
 
253
  # Load config
@@ -262,7 +570,10 @@ def main():
262
  model = BertForTokenClassification.from_pretrained(args.model_dir)
263
  model.eval()
264
 
265
- id2label = cfg.id2label
 
 
 
266
 
267
  # Process filenames
268
  filenames_to_parse: List[str] = []
@@ -283,7 +594,16 @@ def main():
283
  for fn in filenames_to_parse:
284
  if not fn.strip():
285
  continue
286
- result = parse_filename(fn, model, tokenizer, id2label, args.max_length)
 
 
 
 
 
 
 
 
 
287
  result["_input"] = fn
288
  results.append(result)
289
 
 
14
  import os
15
  import re
16
  import sys
17
+ from typing import Dict, List, Optional, Tuple
18
 
19
  import torch
20
  from transformers import BertForTokenClassification
 
70
  return clean if clean else None
71
 
72
 
73
def display_token(token: str) -> str:
    """Return a printable stand-in for whitespace tokens used in debug output.

    A token that is exactly one space becomes ``<SPACE>`` and a token that is
    exactly one tab becomes ``<TAB>``; every other token is returned unchanged.
    """
    visible_names = {" ": "<SPACE>", "\t": "<TAB>"}
    return visible_names.get(token, token)
80
+
81
+
82
def trim_decorations(text: str) -> str:
    """Strip outer whitespace and release brackets from an extracted entity.

    Whitespace is removed first, then any run of bracket characters at either
    end, then whitespace that was sitting just inside those brackets.
    """
    bracket_chars = "[]()【】《》()"
    without_padding = text.strip()
    without_brackets = without_padding.strip(bracket_chars)
    return without_brackets.strip()
85
 
86
 
87
def join_entity_tokens(tokens: List[str], tokenizer: Optional[AnimeTokenizer] = None) -> str:
    """Join the tokens of one entity span into its surface text.

    Args:
        tokens: Tokens belonging to a single entity, in order.
        tokenizer: Accepted for interface compatibility. Every tokenizer
            variant is joined the same way, so it is not consulted.

    Returns:
        The tokens concatenated with no separator.
    """
    # All tokenizer variants use plain concatenation; no separator is inserted.
    return "".join(tokens)
95
 
96
+
97
def labels_to_entities(
    tokens: List[str],
    labels: List[str],
    tokenizer: Optional[AnimeTokenizer] = None,
) -> List[Tuple[str, str]]:
    """
    Group BIO-labelled tokens into ``(entity_type, text)`` spans.

    A ``B-X`` label always opens a new span. An ``I-X`` label extends the
    open span when its type matches; otherwise (an orphan ``I-X``) it opens a
    new span so debug output shows what the model predicted instead of
    silently dropping tokens. Any other label closes the open span.
    """
    spans: List[Tuple[str, str]] = []
    open_type: Optional[str] = None
    open_tokens: List[str] = []

    def close_open_span() -> None:
        # Emit the span collected so far, if there is one.
        if open_type:
            spans.append((open_type, join_entity_tokens(open_tokens, tokenizer)))

    for token, label in zip(tokens, labels):
        begins = label.startswith("B-")
        inside = not begins and label.startswith("I-")

        if inside and label[2:] == open_type:
            # Continuation of the span that is already open.
            open_tokens.append(token)
            continue

        close_open_span()
        if begins or inside:
            open_type, open_tokens = label[2:], [token]
        else:
            open_type, open_tokens = None, []

    close_open_span()
    return spans
136
+
137
+
138
def is_allowed_bio_transition(previous_label: str, label: str) -> bool:
    """Return whether previous_label -> label is valid under IOB2.

    Only ``I-X`` is restricted: it must directly follow ``B-X`` or ``I-X`` of
    the same entity type. Every other label may follow any label.
    """
    if label.startswith("I-"):
        entity = label[2:]
        return previous_label in {f"B-{entity}", f"I-{entity}"}
    return True


def constrained_bio_decode(emissions: torch.Tensor, id2label: Dict[int, str]) -> List[int]:
    """
    Decode token logits with hard BIO transition constraints.

    This is a lightweight CRF-style Viterbi decoder without learned transition
    weights. It prevents impossible orphan I-X spans at inference time.

    Args:
        emissions: 2-D tensor of per-token label scores; the first dimension
            is unpacked as the token count and the second as the label count.
        id2label: Maps label ids to label strings. Ids missing from the
            mapping are treated as ``"O"``.

    Returns:
        One label id per token, or an empty list when ``emissions`` has no
        elements.
    """
    if emissions.numel() == 0:
        return []

    num_tokens, num_labels = emissions.shape
    scores = emissions.detach().cpu()

    # The label names and the set of legal predecessors per label do not
    # depend on the token position, so compute them once instead of inside
    # the token x label x label loop.
    label_names = [id2label.get(label_id, "O") for label_id in range(num_labels)]
    allowed_prev = [
        [
            prev_id
            for prev_id, prev_name in enumerate(label_names)
            if is_allowed_bio_transition(prev_name, name)
        ]
        for name in label_names
    ]

    backpointers = torch.zeros((num_tokens, num_labels), dtype=torch.long)
    dp = torch.full((num_labels,), float("-inf"))

    # A sequence may not start inside an entity, so I-X stays at -inf.
    for label_id, name in enumerate(label_names):
        if not name.startswith("I-"):
            dp[label_id] = scores[0, label_id]

    for idx in range(1, num_tokens):
        next_dp = torch.full((num_labels,), float("-inf"))
        for label_id in range(num_labels):
            best_score = float("-inf")
            best_prev = 0
            # Predecessors are visited in ascending id order and only a
            # strictly better score replaces the incumbent, so ties resolve
            # to the lowest predecessor id.
            for prev_id in allowed_prev[label_id]:
                candidate = dp[prev_id] + scores[idx, label_id]
                if candidate > best_score:
                    best_score = float(candidate)
                    best_prev = prev_id
            next_dp[label_id] = best_score
            backpointers[idx, label_id] = best_prev
        dp = next_dp

    # Walk the backpointers from the best final label to recover the path.
    best_last = int(torch.argmax(dp).item())
    decoded = [best_last]
    for idx in range(num_tokens - 1, 0, -1):
        decoded.append(int(backpointers[idx, decoded[-1]].item()))
    decoded.reverse()
    return decoded
190
+
191
+
192
+ def postprocess(
193
+ tokens: List[str],
194
+ labels: List[str],
195
+ tokenizer: Optional[AnimeTokenizer] = None,
196
+ filename: Optional[str] = None,
197
+ use_rules: bool = True,
198
+ ) -> Dict:
199
+ """
200
+ Convert BIO-labeled tokens into structured metadata.
201
+
202
+ Merges consecutive B- / I- tokens of the same entity type,
203
+ then extracts structured fields.
204
+ """
205
+ result: Dict = {
206
+ "title": None,
207
+ "season": None,
208
+ "episode": None,
209
+ "group": None,
210
+ "resolution": None,
211
+ "source": None,
212
+ "special": None,
213
+ }
214
+
215
+ entities = labels_to_entities(tokens, labels, tokenizer)
216
 
217
  # Fill result
218
  for entity_type, text in entities:
 
254
  if (trimmed := trim_decorations(f))
255
  )
256
 
257
+ if use_rules and filename:
258
+ result = apply_rule_assists(filename, result)
259
+
260
  return result
261
 
262
 
263
# One bracketed segment: [...], (...), 【...】 or 《...》. Each alternative has
# its own capture group, so exactly one group is non-None per match.
BRACKET_RE = re.compile(r"\[([^\]]+)\]|\(([^)]+)\)|【([^】]+)】|《([^》]+)》")
# Resolution tokens such as 1080p, 4K or 1920x1080, delimited by word boundaries.
RESOLUTION_RE = re.compile(r"\b(?:\d{3,4}[pP]|\d[Kk]|\d{3,4}[xX×]\d{3,4})\b")
# Release source / streaming-service tags, matched case-insensitively as whole words.
SOURCE_RE = re.compile(
    r"\b(?:WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|DVDRip|DVD|TVRip|HDTV|"
    r"Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X)\b",
    re.I,
)
# Episode-number patterns; both expose the digits through the named group "ep".
EPISODE_PATTERNS = [
    # A 1-4 digit number delimited by separators or brackets, optionally
    # prefixed with E/EP/第 and suffixed with a version (v2) or 话/話/集.
    re.compile(r"(?:^|[\s._\-\[\(【《#])(?:EP?|第)?(?P<ep>\d{1,4})(?:v\d+)?(?:[话話集])?(?=$|[\s._\-\]\)】》])", re.I),
    # SxxEyy notation.
    re.compile(r"[Ss]\d{1,2}[Ee](?P<ep>\d{1,4})(?:v\d+)?", re.I),
]
# Season markers: S2 (group s1), "Season 2" (group s2), or 第N季/期/部 with
# Chinese or decimal digits (group s3).
SEASON_RE = re.compile(r"(?:^|[\s._\-\[\(【《])(?:[Ss](?P<s1>\d{1,2})|Season\s*(?P<s2>\d{1,2})|第(?P<s3>[一二三四五六七八九十\d]+)[季期部])", re.I)
# Anchored (^...$) match for a string that consists solely of a technical
# metadata tag (resolution, source, codec, audio, subtitle language, container).
NOISE_META_RE = re.compile(
    r"^(?:\d{3,4}[pP]|\d[Kk]|WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|DVDRip|DVD|TVRip|"
    r"HDTV|Netflix|NF|AMZN|Baha|CR|HEVC|AVC|AV1|x26[45]|h\.?26[45]|AAC.*|FLAC|MP3|DTS|"
    r"Opus|ASS.*|CHS|CHT|BIG5|GB|JPN?|MP4|MKV|繁中|简中|内封|外挂)$",
    re.I,
)
281
+
282
+
283
def cn_number_to_int(text: str) -> Optional[int]:
    """Convert a decimal string or a simple Chinese numeral (1-99) to an int.

    Accepted Chinese forms are a single digit (``一``..``九``), ``十``,
    ``十X``, ``X十`` and ``X十Y``.

    Args:
        text: The numeral to convert.

    Returns:
        The integer value, or ``None`` when ``text`` is empty or is not one
        of the accepted forms. Malformed numerals such as ``十十`` or
        ``二三十`` yield ``None`` rather than a partially parsed value.
    """
    # isdecimal() (unlike isdigit()) only accepts characters int() can parse,
    # so inputs like "²" fall through to None instead of raising ValueError.
    if text.isdecimal():
        return int(text)

    units = {"一": 1, "二": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9}
    if text in units:
        return units[text]

    tens_text, ten_marker, ones_text = text.partition("十")
    if not ten_marker:
        return None
    # Each side of 十 must be empty or exactly one unit digit.
    if tens_text and tens_text not in units:
        return None
    if ones_text and ones_text not in units:
        return None

    tens = units[tens_text] if tens_text else 1
    ones = units[ones_text] if ones_text else 0
    return tens * 10 + ones
296
+
297
+
298
def bracket_parts(filename: str) -> List[Tuple[str, int, int]]:
    """List every bracketed segment that ``BRACKET_RE`` finds in ``filename``.

    Each entry is ``(inner_text, start, end)``: the captured text with
    surrounding whitespace stripped, followed by the start and end offsets of
    the whole match (brackets included).
    """
    return [
        (
            # Exactly one alternative of the pattern captured something.
            next(captured for captured in hit.groups() if captured is not None).strip(),
            hit.start(),
            hit.end(),
        )
        for hit in BRACKET_RE.finditer(filename)
    ]
304
+
305
+
306
def looks_like_group(text: str) -> bool:
    """Heuristically decide whether ``text`` names a release group.

    Returns ``False`` for empty text and for text that ``NOISE_META_RE``
    matches; otherwise returns whether the text contains one of the known
    group keywords (case-insensitive).
    """
    if not text:
        return False
    if NOISE_META_RE.search(text):
        return False

    group_keywords = (
        r"(?:字幕|字幕组|字幕組|sub|subs|raws?|fansub|studio|house|team|project|"
        r"loli|ani|vcb|airota|kiss|dmhy|erai|subsplease)"
    )
    return re.search(group_keywords, text, re.I) is not None
317
+
318
+
319
def apply_rule_assists(filename: str, result: Dict) -> Dict:
    """
    Fill high-confidence structural fields from filename conventions.

    The model remains the primary tagger; rules only fill missing obvious fields
    or repair common boundary drift around leading group brackets and episodes.

    Args:
        filename: The raw filename the regex rules are run against.
        result: Parsed fields; the keys read here are "group", "title",
            "resolution", "source", "season" and "episode".

    Returns:
        A shallow copy of ``result`` with repaired/filled fields. ``result``
        itself is not reassigned into.
    """
    repaired = dict(result)
    brackets = bracket_parts(filename)

    # Group: take the first bracket when the group is missing or is merely a
    # substring of the title, provided that bracket opens the filename and
    # looks like a group name.
    if (not repaired.get("group") or (repaired.get("title") and repaired["group"] in repaired["title"])) and brackets:
        first_text, first_start, _first_end = brackets[0]
        if first_start == 0 and looks_like_group(first_text):
            repaired["group"] = first_text

    # Resolution: first regex hit, only when the field is empty.
    if not repaired.get("resolution"):
        match = RESOLUTION_RE.search(filename)
        if match:
            repaired["resolution"] = match.group(0)

    # Source: first regex hit, with underscores normalised to hyphens.
    if not repaired.get("source"):
        match = SOURCE_RE.search(filename)
        if match:
            repaired["source"] = match.group(0).replace("_", "-")

    # Season: only when absent (None); the captured numeral may be Chinese.
    if repaired.get("season") is None:
        match = SEASON_RE.search(filename)
        if match:
            value = next(group for group in match.groups() if group)
            season = cn_number_to_int(value)
            if season is not None:
                repaired["season"] = season

    # Episode: collect every pattern hit, score it, and keep the best one.
    if repaired.get("episode") is None:
        candidates: List[Tuple[int, int, str]] = []
        for pattern in EPISODE_PATTERNS:
            for match in pattern.finditer(filename):
                ep_text = match.group("ep")
                ep = int(ep_text)
                # Discard 0 and anything above 2000.
                if ep == 0 or ep > 2000:
                    continue
                # Base score is the match offset, so later hits rank higher.
                score = match.start()
                # Numbers in 1..200 get the largest bonus.
                if 1 <= ep <= 200:
                    score += 10000
                # Bonus when a "-" appears within the 3 characters before the
                # match or at its first character.
                if "-" in filename[max(0, match.start() - 3):match.start() + 1]:
                    score += 1000
                # Bonus when the hit lies past the first third of the filename.
                if match.start() > len(filename) // 3:
                    score += 200
                candidates.append((score, ep, ep_text))
        if candidates:
            repaired["episode"] = max(candidates, key=lambda item: item[0])[1]

    # Title: drop a leading copy of the group name plus trailing closers and
    # separators; keep the old title if nothing would remain.
    title = repaired.get("title")
    group = repaired.get("group")
    if title and group and title.startswith(group):
        title = title[len(group):].lstrip("]】)>})》 \t-_.")
        repaired["title"] = title or repaired["title"]

    # Title fallback: when the title is missing or still starts with the
    # group, and a truthy episode is known, infer the title span from the
    # filename.
    if (not repaired.get("title") or (group and repaired["title"].startswith(group))) and repaired.get("episode"):
        repaired_title = infer_title_span(filename, group, repaired["episode"])
        if repaired_title:
            repaired["title"] = repaired_title

    return repaired
383
+
384
+
385
def infer_title_span(filename: str, group: Optional[str], episode: Optional[int]) -> Optional[str]:
    """Infer the title as the text between the leading group bracket and the episode.

    Args:
        filename: The raw filename.
        group: Known group name. When it appears inside the bracket that opens
            the filename, the title span starts after that bracket.
        episode: Known episode number. The span ends where the first matching
            episode marker begins.

    Returns:
        The span with surrounding separators and brackets stripped, or
        ``None`` when no end position is found, the span is empty, or nothing
        remains after stripping.
    """
    start = 0
    if group:
        first = BRACKET_RE.match(filename)
        if first and group in first.group(0):
            start = first.end()

    end = None
    if episode is not None:
        # Escape the interpolated value so it is always matched literally.
        ep = re.escape(str(episode))
        ep_patterns = [
            # " - 05" / " _05v2" followed by a separator, bracket or end.
            rf"\s[-_]\s*0*{ep}(?:v\d+)?(?=$|[\s\[\(【《._-])",
            # "[05]" / "(05v2)" style bracketed episode.
            rf"[\[\(【《]0*{ep}(?:v\d+)?[\]\)】》]",
            # "E05"; (?!\d) stops episode 5 from matching inside "E50".
            rf"[Ee]0*{ep}(?:v\d+)?(?!\d)",
        ]
        # Patterns are tried in priority order; the first one that matches wins.
        for pattern in ep_patterns:
            match = re.search(pattern, filename[start:], re.I)
            if match:
                end = start + match.start()
                break

    if end is None:
        # No episode marker: end at the first later bracket holding metadata.
        for text, bracket_start, _bracket_end in bracket_parts(filename):
            if bracket_start <= start:
                continue
            if NOISE_META_RE.search(text) or RESOLUTION_RE.search(text) or SOURCE_RE.search(text):
                end = bracket_start
                break

    if end is None or end <= start:
        return None
    title = filename[start:end].strip(" \t-_.[]()【】《》()")
    return title or None
417
+
418
+
419
  def parse_filename(
420
  filename: str,
421
  model: BertForTokenClassification,
422
  tokenizer: AnimeTokenizer,
423
  id2label: Dict[int, str],
424
  max_length: int = 64,
425
+ debug: bool = False,
426
+ use_rules: bool = True,
427
+ constrain_bio: bool = True,
428
  ) -> Dict:
429
  """
430
  Parse an anime filename and extract structured metadata.
 
448
 
449
  # Convert to input IDs
450
  input_ids = tokenizer.convert_tokens_to_ids(tokens)
451
+ unk_token_id = tokenizer.unk_token_id
452
+ unk_tokens = [token for token, token_id in zip(tokens, input_ids) if token_id == unk_token_id]
453
 
454
  # Add special tokens
455
  input_ids = [tokenizer.cls_token_id] + input_ids + [tokenizer.sep_token_id]
 
457
 
458
  # Truncate if needed
459
  if len(input_ids) > max_length:
460
+ input_ids = [input_ids[0]] + input_ids[1:max_length - 1] + [tokenizer.sep_token_id]
461
+ attention_mask = [1] * len(input_ids)
462
 
463
  # Pad
464
  pad_len = max_length - len(input_ids)
 
471
  input_tensor = torch.tensor([input_ids], device=device)
472
  mask_tensor = torch.tensor([attention_mask], device=device)
473
 
 
 
 
 
474
  # Remove special token predictions
475
  # Count real tokens used (minus CLS/SEP)
476
  real_token_count = len(tokens)
 
481
  "group": None, "resolution": None, "source": None,
482
  "special": None}
483
 
484
+ with torch.no_grad():
485
+ logits = model(input_ids=input_tensor, attention_mask=mask_tensor).logits
486
+ token_logits = logits[0, 1:1 + available, :]
487
+ probabilities = torch.softmax(token_logits, dim=-1)
488
+ scores, greedy_predictions = torch.max(probabilities, dim=-1)
489
+ if constrain_bio:
490
+ pred_labels = constrained_bio_decode(token_logits, id2label)
491
+ selected_scores = [
492
+ probabilities[idx, label_id].detach().cpu().item()
493
+ for idx, label_id in enumerate(pred_labels)
494
+ ]
495
+ else:
496
+ pred_labels = greedy_predictions.detach().cpu().tolist()
497
+ selected_scores = scores.detach().cpu().tolist()
498
  label_strings = [id2label.get(p, "O") for p in pred_labels]
499
 
500
  # Post-process
501
+ result = postprocess(
502
+ tokens[:available],
503
+ label_strings,
504
+ tokenizer=tokenizer,
505
+ filename=filename,
506
+ use_rules=use_rules,
507
+ )
508
+ if debug:
509
+ result["_debug"] = {
510
+ "tokenizer_variant": getattr(tokenizer, "tokenizer_variant", "regex"),
511
+ "decoder": "constrained_bio" if constrain_bio else "greedy",
512
+ "max_length": max_length,
513
+ "token_count": len(tokens),
514
+ "available_token_count": available,
515
+ "truncated": len(tokens) > available,
516
+ "unk_count": len(unk_tokens),
517
+ "unk_rate": len(unk_tokens) / len(tokens) if tokens else 0.0,
518
+ "unk_tokens": unk_tokens[:50],
519
+ "tokens": tokens[:available],
520
+ "labels": label_strings,
521
+ "scores": [round(float(score), 4) for score in selected_scores],
522
+ "token_table": [
523
+ {
524
+ "i": i,
525
+ "token": display_token(token),
526
+ "id": int(token_id),
527
+ "label": label,
528
+ "score": round(float(score), 4),
529
+ }
530
+ for i, (token, token_id, label, score) in enumerate(
531
+ zip(tokens[:available], input_ids[1:1 + available], label_strings, selected_scores)
532
+ )
533
+ ],
534
+ "entities": [
535
+ {"type": entity_type, "text": text}
536
+ for entity_type, text in labels_to_entities(tokens[:available], label_strings, tokenizer)
537
+ ],
538
+ }
539
+ return result
540
 
541
 
542
  def main():
 
550
  help="Tokenizer variant override. Defaults to checkpoint metadata")
551
  parser.add_argument("--max-length", type=int, default=64,
552
  help="Maximum sequence length")
553
+ parser.add_argument("--debug", action="store_true",
554
+ help="Include tokenizer, labels, scores, and entity spans in JSON output")
555
+ parser.add_argument("--no-rule-assist", action="store_true",
556
+ help="Disable high-confidence structural post-processing rules")
557
+ parser.add_argument("--no-constrained-bio", action="store_true",
558
+ help="Use greedy per-token decoding instead of constrained BIO Viterbi")
559
  args = parser.parse_args()
560
 
561
  # Load config
 
570
  model = BertForTokenClassification.from_pretrained(args.model_dir)
571
  model.eval()
572
 
573
+ id2label = {int(k): v for k, v in getattr(model.config, "id2label", cfg.id2label).items()}
574
+ max_length = args.max_length
575
+ if max_length == 64:
576
+ max_length = int(getattr(model.config, "max_seq_length", max_length))
577
 
578
  # Process filenames
579
  filenames_to_parse: List[str] = []
 
594
  for fn in filenames_to_parse:
595
  if not fn.strip():
596
  continue
597
+ result = parse_filename(
598
+ fn,
599
+ model,
600
+ tokenizer,
601
+ id2label,
602
+ max_length,
603
+ debug=args.debug,
604
+ use_rules=not args.no_rule_assist,
605
+ constrain_bio=not args.no_constrained_bio,
606
+ )
607
  result["_input"] = fn
608
  results.append(result)
609