Add parser diagnostics and inference debugging

- add diagnose_pipeline.py for BIO validation, tokenizer alignment, entity stats, truncation, UNK, and confusion analysis

- add diagnostics reports for char and regex DMHY datasets

- add inference debug output, constrained BIO decoding, checkpoint max-length handling, and rule-assisted parsing

Files changed (4) hide show

diagnose_pipeline.py +709 -0
diagnostics_report.md +277 -0
diagnostics_report_word.md +2678 -0
inference.py +355 -35

diagnose_pipeline.py ADDED Viewed

	@@ -0,0 +1,709 @@

+"""Diagnostics for the anime filename NER pipeline.
+The checks focus on structured filename parsing failure modes:
+- train/inference tokenizer mismatch
+- BIO legality and boundary drift
+- tokenizer split and vocabulary coverage
+- label/entity distribution
+- optional model confusion on a sampled validation split
+"""
+from __future__ import annotations
+import argparse
+import json
+import math
+import os
+import random
+import re
+from collections import Counter, defaultdict
+from pathlib import Path
+from typing import Dict, Iterable, List, Optional, Tuple
+import numpy as np
+import torch
+from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
+from transformers import BertForTokenClassification
+from config import Config
+from dataset import align_tokens_for_tokenizer
+from tokenizer import AnimeTokenizer, create_tokenizer, load_tokenizer
+def iter_jsonl(path: Path, limit: Optional[int] = None) -> Iterable[dict]:
+    """Yield parsed JSON documents from a JSONL file, skipping blank lines.
+
+    Args:
+        path: File opened as UTF-8 text, one JSON document per line.
+        limit: Maximum number of records to yield. ``None`` reads the whole
+            file. Blank lines do not count against the limit.
+
+    Raises:
+        ValueError: If a non-blank line is not valid JSON. The message carries
+            the path and the 1-based physical line number.
+    """
+    with path.open("r", encoding="utf-8") as handle:
+        if limit is not None and limit <= 0:
+            return
+        yielded = 0
+        for line_no, line in enumerate(handle, 1):
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                record = json.loads(line)
+            except json.JSONDecodeError as exc:
+                raise ValueError(f"{path}:{line_no}: invalid JSON") from exc
+            yield record
+            # Count delivered records, not physical lines, so blank lines
+            # cannot shrink the requested sample.
+            yielded += 1
+            if limit is not None and yielded >= limit:
+                break
+def detect_dataset_variant(samples: List[dict], vocab_file: Optional[str]) -> str:
+    """Guess which tokenizer variant produced ``samples``.
+
+    Checks, in order: the ``tokenizer_variant`` field of the samples (one
+    distinct value wins, several give ``"mixed"``), a ``.char`` marker in the
+    vocab file name, and finally whether at least 95% of the samples that have
+    a ``filename`` store exactly its characters as ``tokens``. Falls back to
+    ``"regex"``.
+    """
+    declared = set()
+    for sample in samples:
+        variant = sample.get("tokenizer_variant")
+        if variant:
+            declared.add(variant)
+    if declared:
+        return declared.pop() if len(declared) == 1 else "mixed"
+
+    if vocab_file and ".char" in os.path.basename(vocab_file).lower():
+        return "char"
+
+    named = [sample for sample in samples if sample.get("filename") is not None]
+    if named:
+        char_like = sum(
+            1 for sample in named if sample.get("tokens") == list(sample["filename"])
+        )
+        if char_like / len(named) >= 0.95:
+            return "char"
+    return "regex"
+def entity_type(label: str) -> Optional[str]:
+    """Return the text after the first ``-`` in ``label``, or ``None`` if there is no ``-``."""
+    _prefix, dash, suffix = label.partition("-")
+    return suffix if dash else None
+def bio_violations(tokens: List[str], labels: List[str]) -> List[dict]:
+    """Collect BIO tagging problems found in one label sequence.
+
+    Reported ``type`` values:
+
+    - ``B_DIRECT_TO_O``: an ``O`` directly after a ``B-X`` (a one-token entity).
+    - ``ORPHAN_I``: an ``I-X`` that does not continue a ``B-X``/``I-X`` of the
+      same entity type, including one at the start of the sequence or one
+      that follows an unrecognised label.
+    - ``UNKNOWN_LABEL``: a label that is neither ``O`` nor ``B-``/``I-`` prefixed.
+
+    Args:
+        tokens: Tokens aligned with ``labels``; only used to annotate findings.
+        labels: Label strings to validate.
+
+    Returns:
+        One dict per finding with keys ``type``, ``index``, ``prev_label``,
+        ``label`` and ``token`` (``None`` when ``tokens`` is shorter than
+        ``labels``).
+    """
+    violations: List[dict] = []
+    previous_label = "O"
+
+    def report(kind: str, idx: int, label: str) -> None:
+        violations.append(
+            {
+                "type": kind,
+                "index": idx,
+                "prev_label": previous_label,
+                "label": label,
+                "token": tokens[idx] if idx < len(tokens) else None,
+            }
+        )
+
+    for idx, label in enumerate(labels):
+        if label == "O":
+            if previous_label.startswith("B-"):
+                report("B_DIRECT_TO_O", idx, label)
+        elif label.startswith("I-"):
+            # Both labels carry a two-character prefix here, so comparing the
+            # remainder compares the entity types.
+            continues_entity = (
+                previous_label.startswith(("B-", "I-"))
+                and previous_label[2:] == label[2:]
+            )
+            if not continues_entity:
+                report("ORPHAN_I", idx, label)
+        elif not label.startswith("B-"):
+            report("UNKNOWN_LABEL", idx, label)
+        previous_label = label
+    return violations
+def spans_from_labels(tokens: List[str], labels: List[str]) -> List[dict]:
+    """Group BIO labels into entity spans.
+
+    A ``B-X`` always opens a new span. An ``I-X`` extends the open span when
+    its type matches and otherwise opens a new span of its own type. Any other
+    label closes the open span.
+
+    Args:
+        tokens: Token strings; their concatenation becomes the span ``text``.
+        labels: Labels paired with ``tokens`` position by position. When the
+            two lists differ in length only the common prefix is read.
+
+    Returns:
+        Dicts with ``type``, ``start`` (inclusive index), ``end`` (exclusive
+        index) and ``text``, in order of appearance.
+    """
+    spans: List[dict] = []
+    open_span: Optional[dict] = None
+
+    def close(end: int) -> None:
+        spans.append(
+            {
+                "type": open_span["type"],
+                "start": open_span["start"],
+                "end": end,
+                "text": "".join(open_span["tokens"]),
+            }
+        )
+
+    for idx, (token, label) in enumerate(zip(tokens, labels)):
+        begins = label.startswith("B-")
+        inside = label.startswith("I-")
+        if inside and open_span is not None and open_span["type"] == label[2:]:
+            open_span["tokens"].append(token)
+            continue
+        if open_span is not None:
+            close(idx)
+            open_span = None
+        if begins or inside:
+            # A stray I-X (no matching open span) starts a span of its own.
+            open_span = {"type": label[2:], "start": idx, "tokens": [token]}
+
+    if open_span is not None:
+        # Only the zipped prefix was read, so the span ends there rather than
+        # at len(labels).
+        close(min(len(tokens), len(labels)))
+    return spans
+def count_entities(samples: List[dict]) -> Counter:
+    """Count entity spans per entity type across ``samples``.
+
+    Rows without ``tokens`` or ``labels`` contribute nothing instead of
+    raising ``KeyError``.
+    """
+    counts: Counter = Counter()
+    for sample in samples:
+        spans = spans_from_labels(sample.get("tokens", []), sample.get("labels", []))
+        counts.update(span["type"] for span in spans)
+    return counts
+def percentile(values: List[int], pct: float) -> int:
+    """Return the value at the ``pct`` percentile using the nearest rank.
+
+    Args:
+        values: Observations in any order.
+        pct: Percentile on a 0-100 scale. Values outside that range are
+            clamped to the smallest or largest observation.
+
+    Returns:
+        The selected observation, or ``0`` when ``values`` is empty.
+    """
+    if not values:
+        return 0
+    ordered = sorted(values)
+    last = len(ordered) - 1
+    idx = round((pct / 100) * last)
+    # Clamp at both ends: a negative index would wrap to the top of the list.
+    return ordered[max(0, min(last, idx))]
+def token_mismatch(sample: dict, tokenizer: AnimeTokenizer) -> Optional[dict]:
+    """Describe how the stored tokens differ from a fresh tokenization.
+
+    Returns ``None`` when the sample has no ``filename`` or when
+    ``tokenizer.tokenize(filename)`` equals the stored ``tokens``. Otherwise
+    returns a dict with the length of the shared prefix, the first 40 tokens
+    of each sequence and both full lengths.
+    """
+    filename = sample.get("filename")
+    if filename is None:
+        return None
+
+    fresh = tokenizer.tokenize(filename)
+    stored = sample.get("tokens", [])
+    if fresh == stored:
+        return None
+
+    # Position of the first disagreement, or the shorter length when one
+    # sequence is a prefix of the other.
+    shared = next(
+        (pos for pos, (ours, theirs) in enumerate(zip(fresh, stored)) if ours != theirs),
+        min(len(fresh), len(stored)),
+    )
+    report = {"file_id": sample.get("file_id"), "filename": filename}
+    report["common_prefix"] = shared
+    report["dataset_tokens"] = stored[:40]
+    report["tokenizer_tokens"] = fresh[:40]
+    report["dataset_len"] = len(stored)
+    report["tokenizer_len"] = len(fresh)
+    return report
+def format_counter(counter: Counter, total: Optional[int] = None, limit: Optional[int] = None) -> str:
+    """Render a counter as a markdown bullet list with counts and percentages.
+
+    Args:
+        counter: Counts to render, most common first.
+        total: Denominator for the percentages; defaults to the counter's sum.
+        limit: Maximum number of entries to show; ``None`` shows all.
+
+    Returns:
+        One bullet per entry, or ``"- none"`` when there is nothing to show.
+    """
+    denominator = sum(counter.values()) if total is None else total
+    bullets = []
+    for key, count in counter.most_common(limit):
+        share = count / denominator * 100 if denominator else 0.0
+        bullets.append(f"- `{key}`: {count:,} ({share:.2f}%)")
+    if not bullets:
+        return "- none"
+    return "\n".join(bullets)
+def token_id_stats(samples: List[dict], tokenizer: AnimeTokenizer) -> dict:
+    """Measure how many aligned tokens map to the tokenizer's unknown id.
+
+    Each sample is passed through ``align_tokens_for_tokenizer`` and
+    ``tokenizer.convert_tokens_to_ids``; a token counts as unknown when its id
+    equals ``tokenizer.unk_token_id``.
+
+    Returns:
+        Dict with ``total`` tokens seen, ``unk`` count, ``unk_rate`` (``0.0``
+        when no tokens were seen) and ``top_unk``, the 25 most frequent
+        unknown tokens.
+    """
+    seen = 0
+    unknown_tokens: Counter = Counter()
+    for sample in samples:
+        aligned, _labels = align_tokens_for_tokenizer(sample["tokens"], sample["labels"], tokenizer)
+        token_ids = tokenizer.convert_tokens_to_ids(aligned)
+        for token, token_id in zip(aligned, token_ids):
+            seen += 1
+            if token_id == tokenizer.unk_token_id:
+                unknown_tokens[token] += 1
+    unknown = sum(unknown_tokens.values())
+    rate = unknown / seen if seen else 0.0
+    return {
+        "total": seen,
+        "unk": unknown,
+        "unk_rate": rate,
+        "top_unk": unknown_tokens.most_common(25),
+    }
+def prepare_inputs(
+    tokens: List[str],
+    labels: List[str],
+    tokenizer: AnimeTokenizer,
+    label2id: Dict[str, int],
+    max_length: int,
+) -> Tuple[List[int], List[int], List[int], List[str]]:
+    """Build fixed-length model inputs for one labelled sample.
+
+    The aligned tokens are converted to ids and wrapped in the tokenizer's CLS
+    and SEP ids. Label ids use ``-100`` at the CLS, SEP and padding positions,
+    and labels missing from ``label2id`` map to id ``0``. Sequences longer
+    than ``max_length`` keep their first and last element and drop the tail of
+    the middle; shorter ones are right-padded with the pad id.
+
+    Returns:
+        ``(input_ids, attention_mask, label_ids, aligned_tokens)``; the token
+        list is returned untruncated.
+    """
+    tokens, labels = align_tokens_for_tokenizer(tokens, labels, tokenizer)
+
+    body_ids = tokenizer.convert_tokens_to_ids(tokens)
+    input_ids = [tokenizer.cls_token_id] + body_ids + [tokenizer.sep_token_id]
+    label_ids = [-100] + [label2id.get(label, 0) for label in labels] + [-100]
+
+    if len(input_ids) > max_length:
+        middle = slice(1, max_length - 1)
+        input_ids = [input_ids[0]] + input_ids[middle] + [input_ids[-1]]
+        label_ids = [label_ids[0]] + label_ids[middle] + [label_ids[-1]]
+    attention_mask = [1] * len(input_ids)
+
+    padding = max_length - len(input_ids)
+    if padding > 0:
+        input_ids.extend([tokenizer.pad_token_id] * padding)
+        label_ids.extend([-100] * padding)
+        attention_mask.extend([0] * padding)
+    return input_ids, attention_mask, label_ids, tokens
+def evaluate_model(
+    samples: List[dict],
+    model_dir: Path,
+    tokenizer: AnimeTokenizer,
+    max_length: int,
+    limit: int,
+    seed: int,
+) -> dict:
+    """Run a checkpoint over a seeded random subset and summarise its errors.
+
+    Args:
+        samples: Labelled rows; each must provide ``tokens`` and ``labels``.
+        model_dir: Directory passed to ``BertForTokenClassification.from_pretrained``.
+        tokenizer: Tokenizer used to build the model inputs.
+        max_length: Fixed input length handed to ``prepare_inputs``.
+        limit: Number of shuffled samples to evaluate.
+        seed: Seed for the shuffle that selects the subset.
+
+    Returns:
+        Dict with ``sample_count``, seqeval ``precision``/``recall``/``f1``,
+        the seqeval ``classification_report`` text, the 30 most common
+        token-label and entity-type confusions, and a ``boundary_errors``
+        counter.
+    """
+    cfg = Config()
+    model = BertForTokenClassification.from_pretrained(str(model_dir))
+    model.eval()
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+    # Shuffle a copy so the caller's list order is left untouched.
+    rng = random.Random(seed)
+    eval_samples = list(samples)
+    rng.shuffle(eval_samples)
+    eval_samples = eval_samples[:limit]
+    # Prefer the label map stored in the checkpoint config; fall back to Config.
+    id2label = {int(k): v for k, v in getattr(model.config, "id2label", cfg.id2label).items()}
+    label2id = {v: int(k) for k, v in id2label.items()}
+    if not label2id:
+        label2id = cfg.label2id
+        id2label = cfg.id2label
+    true_sequences: List[List[str]] = []
+    pred_sequences: List[List[str]] = []
+    confusion: Counter = Counter()
+    entity_confusion: Counter = Counter()
+    boundary_errors: Counter = Counter()
+    with torch.no_grad():
+        for sample in eval_samples:
+            input_ids, attention_mask, label_ids, _tokens = prepare_inputs(
+                sample["tokens"],
+                sample["labels"],
+                tokenizer,
+                label2id,
+                max_length,
+            )
+            input_tensor = torch.tensor([input_ids], dtype=torch.long, device=device)
+            mask_tensor = torch.tensor([attention_mask], dtype=torch.long, device=device)
+            logits = model(input_ids=input_tensor, attention_mask=mask_tensor).logits
+            # Greedy per-position decoding: take the highest-scoring label id.
+            pred_ids = torch.argmax(logits, dim=-1)[0].detach().cpu().tolist()
+            true_labels: List[str] = []
+            pred_labels: List[str] = []
+            for pred_id, label_id in zip(pred_ids, label_ids):
+                # -100 marks CLS, SEP and padding positions set by prepare_inputs.
+                if label_id == -100:
+                    continue
+                true_label = id2label.get(label_id, "O")
+                pred_label = id2label.get(pred_id, "O")
+                true_labels.append(true_label)
+                pred_labels.append(pred_label)
+                confusion[(true_label, pred_label)] += 1
+                entity_confusion[(entity_type(true_label) or "O", entity_type(pred_label) or "O")] += 1
+                if true_label != pred_label:
+                    # Bucket each error: a B- tag on either side, then a
+                    # different entity type, otherwise only the prefix differs.
+                    if true_label.startswith("B-") or pred_label.startswith("B-"):
+                        boundary_errors["B-boundary"] += 1
+                    elif entity_type(true_label) != entity_type(pred_label):
+                        boundary_errors["entity-type"] += 1
+                    else:
+                        boundary_errors["BIO-prefix"] += 1
+            true_sequences.append(true_labels)
+            pred_sequences.append(pred_labels)
+    # Drop the (label, label) diagonal so only real confusions remain.
+    errors = confusion.copy()
+    for label in set(label for pair in confusion for label in pair):
+        errors.pop((label, label), None)
+    return {
+        "sample_count": len(eval_samples),
+        "precision": precision_score(true_sequences, pred_sequences),
+        "recall": recall_score(true_sequences, pred_sequences),
+        "f1": f1_score(true_sequences, pred_sequences),
+        "classification_report": classification_report(true_sequences, pred_sequences, digits=4),
+        "top_token_confusions": errors.most_common(30),
+        "top_entity_confusions": Counter(
+            {k: v for k, v in entity_confusion.items() if k[0] != k[1]}
+        ).most_common(30),
+        "boundary_errors": boundary_errors,
+    }
+def tokenizer_split_examples(samples: List[dict], tokenizers: Dict[str, AnimeTokenizer], limit: int = 8) -> List[dict]:
+    """Collect side-by-side tokenizations for samples that have a filename.
+
+    Each row holds ``file_id``, ``filename``, the first 80 stored tokens as
+    ``dataset_tokens`` and, for every entry in ``tokenizers``, the first 80
+    tokens it produces under ``"<name>_tokens"``. Collection stops once
+    ``limit`` rows exist; the check runs after appending, so at least one row
+    is returned whenever any sample has a non-empty filename.
+    """
+    rows: List[dict] = []
+    for sample in samples:
+        filename = sample.get("filename")
+        if filename:
+            row = {
+                "file_id": sample.get("file_id"),
+                "filename": filename,
+                "dataset_tokens": sample.get("tokens", [])[:80],
+            }
+            row.update(
+                {f"{name}_tokens": tok.tokenize(filename)[:80] for name, tok in tokenizers.items()}
+            )
+            rows.append(row)
+            if len(rows) >= limit:
+                break
+    return rows
+def write_report(path: Path, title: str, sections: List[Tuple[str, str]]) -> None:
+    """Write a markdown report with an H1 title and one H2 section per entry.
+
+    Section bodies are stripped of surrounding whitespace; a body that is
+    empty after stripping is replaced by ``_No data._``. The file is written
+    as UTF-8.
+    """
+    lines = [f"# {title}", ""]
+    for heading, body in sections:
+        content = body.strip()
+        lines.extend([f"## {heading}", "", content or "_No data._", ""])
+    path.write_text("\n".join(lines), encoding="utf-8")
+def markdown_json(value) -> str:
+    """Return ``value`` as indented JSON inside a ```` ```json ```` fence, keeping non-ASCII text as is."""
+    payload = json.dumps(value, ensure_ascii=False, indent=2)
+    return f"```json\n{payload}\n```"
+def markdown_table(headers: List[str], rows: List[List[str]], limit: Optional[int] = None) -> str:
+    """Render ``headers`` and ``rows`` as a pipe-delimited markdown table.
+
+    Args:
+        headers: Column titles.
+        rows: Row cells; each cell is converted with ``str``.
+        limit: Maximum number of rows to render; ``None`` renders all.
+
+    Returns:
+        The table text. Newlines inside a cell become spaces and ``|`` is
+        escaped as ``\\|`` so cell content cannot break the column layout.
+    """
+
+    def cell(value) -> str:
+        return str(value).replace("\n", " ").replace("|", "\\|")
+
+    def line(cells) -> str:
+        return "| " + " | ".join(cells) + " |"
+
+    visible = rows if limit is None else rows[:limit]
+    table = [line(cell(header) for header in headers), line("---" for _ in headers)]
+    table.extend(line(cell(value) for value in row) for row in visible)
+    return "\n".join(table)
+def main() -> None:
+    """Command-line entry point: inspect a JSONL dataset and write a markdown report.
+
+    Loads up to ``--sample-limit`` rows, checks token/label lengths, BIO
+    legality, whitespace labels, boundary-drift heuristics, truncation against
+    the chosen max length and tokenizer agreement. When ``--model-dir`` is
+    given it also evaluates the checkpoint on a sampled subset. The report is
+    written to ``--output``.
+
+    Rows whose token and label counts differ are reported as
+    ``LENGTH_MISMATCH`` and excluded from every later statistic.
+
+    Raises:
+        ValueError: If no rows could be loaded from ``--data-file``.
+    """
+    parser = argparse.ArgumentParser(description="Diagnose anime filename NER data and model pipeline")
+    parser.add_argument("--data-file", required=True, help="JSONL dataset with tokens and labels")
+    parser.add_argument("--vocab-file", default=None, help="Tokenizer vocab JSON")
+    parser.add_argument("--tokenizer", choices=["regex", "char"], default=None,
+                        help="Tokenizer variant to diagnose. Defaults to dataset metadata")
+    parser.add_argument("--model-dir", default=None, help="Optional model directory for confusion analysis")
+    parser.add_argument("--max-length", type=int, default=None, help="Max sequence length for model eval/truncation stats")
+    parser.add_argument("--sample-limit", type=int, default=20000, help="Rows to inspect for data diagnostics")
+    parser.add_argument("--eval-limit", type=int, default=512, help="Rows to evaluate when --model-dir is provided")
+    parser.add_argument("--output", default="diagnostics_report.md", help="Markdown report path")
+    parser.add_argument("--seed", type=int, default=42)
+    args = parser.parse_args()
+    data_path = Path(args.data_file)
+    samples = list(iter_jsonl(data_path, args.sample_limit))
+    if not samples:
+        raise ValueError(f"No samples loaded from {data_path}")
+    dataset_variant = detect_dataset_variant(samples, args.vocab_file)
+    tokenizer_variant = args.tokenizer or (dataset_variant if dataset_variant != "mixed" else "regex")
+    vocab_file = args.vocab_file
+    if vocab_file is None:
+        # Default to the vocab file that sits next to the dataset.
+        vocab_file = str(data_path.with_name("vocab.char.json" if tokenizer_variant == "char" else "vocab.json"))
+    tokenizer = create_tokenizer(tokenizer_variant, vocab_file=vocab_file)
+    if args.model_dir:
+        model_tokenizer = load_tokenizer(args.model_dir)
+    else:
+        model_tokenizer = tokenizer
+    label_counter: Counter = Counter()
+    length_values: List[int] = []
+    aligned_length_values: List[int] = []
+    violations: List[dict] = []
+    mismatch_examples: List[dict] = []
+    space_label_counter: Counter = Counter()
+    boundary_drift_counter: Counter = Counter()
+    # Rows that passed the token/label length check; only these feed the
+    # entity, vocabulary and model statistics below.
+    valid_samples: List[dict] = []
+    truncation_count = 0
+    max_length = args.max_length
+    if max_length is None and args.model_dir:
+        # NOTE(review): this loads the full checkpoint only to read its config,
+        # and evaluate_model loads it again later.
+        model_config = BertForTokenClassification.from_pretrained(args.model_dir).config
+        max_length = int(getattr(model_config, "max_seq_length", 64))
+    max_length = max_length or (128 if tokenizer_variant == "char" else 64)
+    for row_idx, sample in enumerate(samples, 1):
+        tokens = sample.get("tokens", [])
+        labels = sample.get("labels", [])
+        if len(tokens) != len(labels):
+            violations.append(
+                {
+                    "type": "LENGTH_MISMATCH",
+                    "row": row_idx,
+                    "file_id": sample.get("file_id"),
+                    "token_count": len(tokens),
+                    "label_count": len(labels),
+                    "filename": sample.get("filename"),
+                }
+            )
+            continue
+        valid_samples.append(sample)
+        label_counter.update(labels)
+        length_values.append(len(tokens))
+        aligned_tokens, aligned_labels = align_tokens_for_tokenizer(tokens, labels, tokenizer)
+        aligned_length_values.append(len(aligned_tokens))
+        # The +2 accounts for the CLS and SEP ids added around the tokens.
+        if len(aligned_tokens) + 2 > max_length:
+            truncation_count += 1
+        for token, label in zip(tokens, labels):
+            if token.isspace():
+                space_label_counter[label] += 1
+        for violation in bio_violations(tokens, labels):
+            violation.update(
+                {
+                    "row": row_idx,
+                    "file_id": sample.get("file_id"),
+                    "filename": sample.get("filename"),
+                    "context_tokens": tokens[max(0, violation["index"] - 5):violation["index"] + 6],
+                    "context_labels": labels[max(0, violation["index"] - 5):violation["index"] + 6],
+                }
+            )
+            violations.append(violation)
+        for span in spans_from_labels(tokens, labels):
+            text = span["text"]
+            if span["type"] == "TITLE":
+                if text.startswith("[") or text.endswith("[") or "]" in text[:3]:
+                    boundary_drift_counter["title_contains_bracket_edge"] += 1
+                if re.search(r"\b(?:WEB[-_ ]?DL|WebRip|\d{3,4}[pP]|HEVC|AVC|AAC)\b", text, re.I):
+                    boundary_drift_counter["title_contains_meta"] += 1
+            if span["type"] == "GROUP" and ("[" in text or "]" in text):
+                boundary_drift_counter["group_contains_bracket"] += 1
+        if len(mismatch_examples) < 10:
+            mismatch = token_mismatch(sample, tokenizer)
+            if mismatch:
+                mismatch_examples.append(mismatch)
+    entity_counter = count_entities(valid_samples)
+    id_stats = token_id_stats(valid_samples, tokenizer)
+    split_examples = tokenizer_split_examples(
+        samples,
+        {
+            "diagnosed": tokenizer,
+            "regex": create_tokenizer("regex", vocab_file=str(data_path.with_name("vocab.json"))),
+            "char": create_tokenizer("char", vocab_file=str(data_path.with_name("vocab.char.json"))),
+        },
+    )
+    model_eval = None
+    if args.model_dir and valid_samples:
+        model_eval = evaluate_model(
+            samples=valid_samples,
+            model_dir=Path(args.model_dir),
+            tokenizer=model_tokenizer,
+            max_length=max_length,
+            limit=args.eval_limit,
+            seed=args.seed,
+        )
+    total_labels = sum(label_counter.values())
+    o_count = label_counter.get("O", 0)
+    sections: List[Tuple[str, str]] = []
+    sections.append(
+        (
+            "Executive Summary",
+            "\n".join(
+                [
+                    f"- Dataset: `{data_path}`",
+                    f"- Inspected rows: {len(samples):,}",
+                    f"- Dataset tokenizer variant: `{dataset_variant}`",
+                    f"- Diagnosed tokenizer variant: `{tokenizer_variant}`",
+                    f"- Vocab: `{vocab_file}` ({tokenizer.vocab_size:,} tokens)",
+                    f"- Max sequence length checked: {max_length}",
+                    f"- O-label ratio: {o_count / total_labels * 100:.2f}%" if total_labels else "- O-label ratio: n/a",
+                    f"- Truncation risk: {truncation_count:,}/{len(samples):,} rows ({truncation_count / len(samples) * 100:.2f}%)",
+                    f"- UNK rate after selected tokenizer: {id_stats['unk_rate'] * 100:.4f}%",
+                    f"- BIO warnings collected: {len(violations):,}",
+                    "",
+                    "Primary finding: this task is structural filename parsing. Tokenizer/preprocessing identity is more important than lowering token loss.",
+                ]
+            ),
+        )
+    )
+    sections.append(
+        (
+            "Label And Entity Statistics",
+            "\n".join(
+                [
+                    "### Label distribution",
+                    format_counter(label_counter, total_labels),
+                    "",
+                    "### Entity count",
+                    format_counter(entity_counter),
+                    "",
+                    "### Length distribution",
+                    # default=0 keeps the report writable when every row
+                    # failed the length check and both lists are empty.
+                    markdown_json(
+                        {
+                            "raw_tokens": {
+                                "min": min(length_values, default=0),
+                                "p50": percentile(length_values, 50),
+                                "p90": percentile(length_values, 90),
+                                "p95": percentile(length_values, 95),
+                                "p99": percentile(length_values, 99),
+                                "max": max(length_values, default=0),
+                            },
+                            "aligned_tokens": {
+                                "min": min(aligned_length_values, default=0),
+                                "p50": percentile(aligned_length_values, 50),
+                                "p90": percentile(aligned_length_values, 90),
+                                "p95": percentile(aligned_length_values, 95),
+                                "p99": percentile(aligned_length_values, 99),
+                                "max": max(aligned_length_values, default=0),
+                            },
+                        }
+                    ),
+                    "",
+                    "### Whitespace labels",
+                    format_counter(space_label_counter),
+                ]
+            ),
+        )
+    )
+    violation_counter = Counter(v["type"] for v in violations)
+    sections.append(
+        (
+            "BIO Violations And Boundary Drift",
+            "\n".join(
+                [
+                    "### Violation counts",
+                    format_counter(violation_counter),
+                    "",
+                    "### Boundary drift heuristics",
+                    format_counter(boundary_drift_counter),
+                    "",
+                    "### Sample violations",
+                    markdown_json(violations[:30]),
+                ]
+            ),
+        )
+    )
+    sections.append(
+        (
+            "Tokenizer Split And Alignment",
+            "\n".join(
+                [
+                    "### Dataset tokens vs selected tokenizer mismatches",
+                    markdown_json(mismatch_examples),
+                    "",
+                    "### Split examples",
+                    markdown_json(split_examples),
+                    "",
+                    "### Vocabulary coverage",
+                    markdown_json(id_stats),
+                ]
+            ),
+        )
+    )
+    if args.model_dir:
+        model_tokenizer_variant = getattr(model_tokenizer, "tokenizer_variant", "unknown")
+        sections.append(
+            (
+                "Train Inference Tokenizer Comparison",
+                "\n".join(
+                    [
+                        f"- Model dir: `{args.model_dir}`",
+                        f"- Model tokenizer variant: `{model_tokenizer_variant}`",
+                        f"- Dataset tokenizer variant: `{dataset_variant}`",
+                        f"- Diagnostic tokenizer variant: `{tokenizer_variant}`",
+                        f"- Model tokenizer vocab size: {model_tokenizer.vocab_size:,}",
+                        f"- Diagnostic tokenizer vocab size: {tokenizer.vocab_size:,}",
+                        "",
+                        "If dataset and model tokenizer variants differ, validation loss can be low while real inference sees different token IDs and boundaries.",
+                    ]
+                ),
+            )
+        )
+    if model_eval:
+        token_rows = [
+            [true, pred, f"{count:,}"]
+            for (true, pred), count in model_eval["top_token_confusions"]
+        ]
+        entity_rows = [
+            [true, pred, f"{count:,}"]
+            for (true, pred), count in model_eval["top_entity_confusions"]
+        ]
+        sections.append(
+            (
+                "Model Confusion Analysis",
+                "\n".join(
+                    [
+                        f"- Evaluated samples: {model_eval['sample_count']:,}",
+                        f"- Entity precision: {model_eval['precision']:.4f}",
+                        f"- Entity recall: {model_eval['recall']:.4f}",
+                        f"- Entity F1: {model_eval['f1']:.4f}",
+                        "",
+                        "### Boundary error classes",
+                        format_counter(model_eval["boundary_errors"]),
+                        "",
+                        "### Top token-label confusions",
+                        markdown_table(["true", "pred", "count"], token_rows) if token_rows else "- none",
+                        "",
+                        "### Top entity-type confusions",
+                        markdown_table(["true", "pred", "count"], entity_rows) if entity_rows else "- none",
+                        "",
+                        "### Seqeval report",
+                        "```text\n" + model_eval["classification_report"] + "\n```",
+                    ]
+                ),
+            )
+        )
+    sections.append(
+        (
+            "Recommended Pipeline",
+            "\n".join(
+                [
+                    "1. Use one tokenizer variant end to end and save it in the checkpoint metadata.",
+                    "2. Prefer char-level or a deterministic hybrid tokenizer for DMHY filenames; avoid generic subword tokenization for labels.",
+                    "3. For char-level runs, use `--tokenizer char --max-seq-length 128` with `vocab.char.json`.",
+                    "4. Add CRF decoding or constrained BIO decoding so illegal I-X transitions and impossible boundary jumps are blocked.",
+                    "5. Keep rule-assisted post-processing for high-confidence structural anchors: leading group bracket, ` - 07`, `S01E07`, source, and resolution.",
+                    "6. Track entity-level F1 and field exact-match on real filenames; do not accept low validation loss alone.",
+                ]
+            ),
+        )
+    )
+    write_report(Path(args.output), "Anime Filename Parser Diagnostics Report", sections)
+    print(f"Wrote diagnostics report: {args.output}")
+if __name__ == "__main__":
+    main()

diagnostics_report.md ADDED Viewed

	@@ -0,0 +1,277 @@

+# Anime Filename Parser Diagnostics Report
+## 根因分析
+当前症状不是 learning rate 问题，而是训练、验证、推理没有在同一个结构化输入空间里工作。
+最高优先级根因是 tokenizer/data 配置错位：你给出的训练命令使用 `dmhy_weak_char.jsonl` 和 `vocab.char.json`，但没有传 `--tokenizer char`。旧版 `train.py` 默认 `regex`，因此 char 数据会被当作 regex 训练配置保存，checkpoint metadata 会写成 `tokenizer_variant=regex`。推理时 `load_tokenizer()` 按 checkpoint metadata 重新加载 regex tokenizer，于是 `[LoliHouse]` 这类结构 token 会作为一个整体进入模型，而 char 训练数据里它是 `[`, `L`, `o`, ..., `]`。这会直接导致 group/title 边界漂移。
+第二个根因是 word-level 数据和当前 `AnimeTokenizer` 也不完全一致。`dmhy_weak.jsonl` 里示例 token 是 `[`, `LoliHouse`, `]`，但当前 regex tokenizer 对原始文件名会输出 `[LoliHouse]`。这说明 word-level 数据名义上是 regex，但不是严格由当前 inference tokenizer 重放得到的 token 序列。
+第三个根因是 char 训练命令没有设置 `--max-seq-length 128`。在抽样 5,000 条 char 数据中，默认 64 长度会截断 2,058 条，占 41.16%。episode/source/resolution 往往在后半段，默认长度会让模型训练和推理都丢失结构锚点。
+第四个根因是评估指标误导。低 validation loss 和 token accuracy 会被大量 `O`、`I-TITLE` 稀释；真实任务需要 entity-level F1、字段 exact match，以及结构案例回归。
+## 问题优先级
+P0: 训练命令必须显式或自动使用 char tokenizer。已修改 `train.py`，现在会从数据集 metadata 自动识别 `char`，并把 char 默认 max length 提升到 128。
+P0: 不允许 tokenizer variant 与 dataset metadata 不一致。已修改 `train.py`，检测到 dataset `tokenizer_variant` 与选择的 tokenizer 不一致会报错。
+P0: 推理必须使用 checkpoint 保存的 tokenizer 和 max length。已修改 `inference.py`，默认读取 `model.config.max_seq_length`，并新增 `--debug` 输出 token/label/score/UNK/截断信息。
+P1: 从旧 checkpoint fine-tune 到不同 vocab 时，不能按 ID 盲目 `resize_token_embeddings()`。已修改为按 token 字符串重映射 embedding，未匹配 token 再随机初始化。
+P1: 数据集存在 BIO/边界质量问题。char 抽样 5,000 条发现 468 个 `ORPHAN_I`，典型是标题被括号 `O` 打断后仍继续 `I-TITLE`。`B-X -> O` 本身是合法 BIO，但在 group/title/source 频繁出现时是边界告警。
+P2: 当前 `BertForTokenClassification` 独立逐 token 解码，不能约束非法转移。建议后续加 CRF 或 constrained BIO decoder。
+## 自动诊断结果
+新增脚本：
+```bash
+python diagnose_pipeline.py --data-file datasets/AnimeName/dmhy_weak_char.jsonl --vocab-file datasets/AnimeName/vocab.char.json --model-dir checkpoints/dmhy-finetune/final --sample-limit 5000 --eval-limit 128 --output diagnostics_report.md
+```
+char 数据抽样结果：
+- tokenizer variant: `char`
+- vocab size: 6,199
+- UNK rate: 0.0000%
+- O-label ratio: 37.47%
+- p95 length: 101, p99 length: 125
+- default max length 64 truncation: 41.16%
+- `ORPHAN_I`: 468
+- regex checkpoint 直接评 char 数据时 entity F1: 0.0832
+word 数据抽样结果保存在 `diagnostics_report_word.md`：
+- tokenizer variant: `regex`
+- vocab size: 8,000
+- UNK rate: 6.9158%
+- default max length 64 truncation: 0%
+- 当前 regex checkpoint 在抽样 word 数据上 entity F1: 0.9549
+- 但 model checkpoint vocab 是 3,000，诊断 vocab 是 8,000，继续 fine-tune 必须重映射 embedding
+## Tokenizer Split 示例
+输入：
+```text
+[LoliHouse] Yomi no Tsugai - 07 [WebRip 1080p HEVC-10bit AAC ASSx2]
+```
+char tokenizer：
+```text
+[, L, o, l, i, H, o, u, s, e, ],  , Y, o, m, i,  , n, o,  , T, s, u, g, a, i,  , -,  , 0, 7, ...
+```
+当前 regex tokenizer：
+```text
+[LoliHouse],  , Yomi,  , no,  , Tsugai,  , -,  , 07,  , [WebRip 1080p HEVC-10bit AAC ASSx2]
+```
+这两个 token 序列不是同一个标注空间。char label 不能直接套到 regex token 上，regex 模型也不能在 char token 序列上解释 logits。
+## BIO 与边界问题
+真实非法 BIO：
+```text
+... ( O, K I-TITLE, a I-TITLE ...
+```
+示例：
+```text
+[LoliHouse] Kanteishi (Kari) - 07 [WebRip 1080p HEVC-10bit AAC]
+```
+`(` 被标为 `O`，后面的 `Kari` 继续 `I-TITLE`，形成 `O -> I-TITLE`。这会让模型学习到标题可以跨越被标为非实体的括号，边界自然会漂。
+结构边界告警：
+```text
+[KissSub][Shunkashuutou Daikousha - Haru no Mai][06][1080P][GB][MP4]
+```
+`KissSub` 是 `B-GROUP`，右括号是 `O`，这是合法 BIO；但如果 tokenizer 在推理时把 `[KissSub]` 合成一个 token，模型就无法只给内部文字打 `GROUP`，只能把整个 bracket token 判成一个类别。
+## Confusion 分析
+故意用 char 数据评估 regex checkpoint，entity F1 只有 0.0832。主要混淆：
+- `O -> TITLE`: 930
+- `SOURCE -> TITLE`: 236
+- `EPISODE -> TITLE`: 228
+- `GROUP -> TITLE`: 86
+这与实际症状一致：模型把结构锚点和 meta 区域吸进 title，group/title 边界混淆，episode 被 title 或 O 吞掉。
+## 已修改的代码
+`train.py`
+- `--tokenizer` 默认从数据集 metadata/vocab 名称/样本结构自动推断。
+- char 数据默认 `max_seq_length >= 128`。
+- dataset metadata 与 tokenizer 不一致会直接报错。
+- fine-tune 到新 vocab 时按 token 字符串重映射 embedding，避免 token ID 语义错位。
+- checkpoint 保存正确的 `tokenizer_variant` 和 `max_seq_length`。
+`inference.py`
+- 新增 `--debug`，输出 tokenizer variant、token IDs、labels、scores、UNK rate、truncation、entity spans。
+- 默认使用 checkpoint `max_seq_length`。
+- 修正推理截断逻辑，保留 `[SEP]`，与训练一致。
+- 默认使用 constrained BIO Viterbi 解码，阻止 `O -> I-X` 这类非法转移；可用 `--no-constrained-bio` 查看原始 greedy 输出。
+- 新增 rule-assisted parsing，兜底修复高置信结构锚点：leading group bracket、` - 07`、`S01E07`、resolution、source。
+- 可用 `--no-rule-assist` 关闭规则兜底，只看模型原始输出。
+`diagnose_pipeline.py`
+- 自动检查 token/label 长度。
+- 输出 BIO 违规样本与边界告警。
+- 输出 tokenizer split 示例。
+- 输出 train/inference tokenizer 对比。
+- 输出实体、label、空格 label、UNK、截断统计。
+- 可选加载 checkpoint 做 confusion 和 seqeval entity-level F1。
+## 修改后的 Pipeline
+推荐 char-level pipeline：
+```bat
+python diagnose_pipeline.py ^
+  --data-file datasets/AnimeName/dmhy_weak_char.jsonl ^
+  --vocab-file datasets/AnimeName/vocab.char.json ^
+  --sample-limit 20000 ^
+  --output diagnostics_report.md
+python train.py ^
+  --tokenizer char ^
+  --data-file datasets/AnimeName/dmhy_weak_char.jsonl ^
+  --vocab-file datasets/AnimeName/vocab.char.json ^
+  --save-dir checkpoints/dmhy-char ^
+  --epochs 10 ^
+  --batch-size 128 ^
+  --learning-rate 0.0003 ^
+  --warmup-steps 300 ^
+  --max-seq-length 128 ^
+  --seed 42
+python inference.py ^
+  --model-dir checkpoints/dmhy-char/final ^
+  --debug ^
+  "[LoliHouse] Yomi no Tsugai - 07 [WebRip 1080p HEVC-10bit AAC ASSx2]"
+```
+如果继续使用 word/regex pipeline，必须先重新生成数据，使 `sample["tokens"] == AnimeTokenizer.tokenize(sample["filename"])` 对绝大多数样本成立；否则验证集仍然是训练 token 空间，真实 inference 是另一个 token 空间。
+## 最合理的 Tokenizer 方案
+当前任务更适合 char-level 或 deterministic hybrid tokenizer，不适合通用 subword tokenizer。
+char-level 优点：
+- train/inference 最容易完全一致。
+- 不会把 `[LoliHouse]`、`[WebRip ...]` 这类结构块压成单 token。
+- 对未知标题、组名、罗马音、中文、日文都没有 OOV。
+- 更适合学习括号、空格、连字符、集数位置这些结构信号。
+char-level 缺点：
+- 序列更长，必须用 `max_seq_length=128`。
+- 逐 token softmax 容易出现 BIO 非法转移，建议加 CRF。
+word-level/regex 优点：
+- 序列短，训练快。
+- 当前已有 checkpoint 在同 token 空间验证集上 F1 较高。
+word-level/regex 缺点：
+- 如果 bracket protection 把整段合并，内部 label 无法表达。
+- 数据生成 tokenizer 和 inference tokenizer 稍有不一致就会严重错位。
+- OOV 对新番标题和组名仍然明显。
+结论：短期用 char-level + rule-assisted parsing；中期改为 hybrid tokenizer：保留结构符号 `[ ] ( ) - _ . space` 为独立 token，英文数字连续串可作为片段但必须能映射回字符 offset，并在 label alignment 上以 offset 为准；长期加 BERT + CRF。
+## 建议训练配置
+首选：
+```bat
+python train.py --tokenizer char ^
+  --data-file datasets/AnimeName/dmhy_weak_char.jsonl ^
+  --vocab-file datasets/AnimeName/vocab.char.json ^
+  --save-dir checkpoints/dmhy-char ^
+  --epochs 10 --batch-size 128 ^
+  --learning-rate 0.0003 --warmup-steps 300 ^
+  --max-seq-length 128 --seed 42
+```
+不要把 regex checkpoint 直接当作同构模型继续训练 char 模型；如果要迁移，当前代码会按 token 字符串 remap embedding，但 char vocab 与 regex vocab 中重合的 token 很有限，最好从头训练 char 模型或只迁移 encoder 非 embedding 层。
+必须新增评估：
+- entity-level F1 by field
+- field exact match: `group/title/episode/resolution/source`
+- full parse exact match
+- episode recall
+- boundary errors: group-title, title-episode, episode-meta
+- inference debug sample set，固定 50-200 个真实文件名回归
+## 真实案例分析
+输入：
+```text
+[LoliHouse] Yomi no Tsugai - 07 [WebRip 1080p HEVC-10bit AAC ASSx2]
+```
+旧 regex checkpoint 原始模型输出：
+```json
+{
+  "entities": [
+    {"type": "TITLE", "text": "[LoliHouse] Yomi no Tsugai"},
+    {"type": "EPISODE", "text": "07"}
+  ]
+}
+```
+问题点：
+- `[LoliHouse]` 被 tokenizer 合成一个 token。
+- 模型把该 token 判成 `B-TITLE`，无法只把内部 `LoliHouse` 判成 `GROUP`。
+- `Yomi` 和 `Tsugai` 在 3,000 vocab checkpoint 中是 `[UNK]`，但模型仍高置信输出 `I-TITLE`，说明 loss/置信度不能代表字段正确性。
+修改后带规则辅助的最终输出：
+```json
+{
+  "group": "LoliHouse",
+  "title": "Yomi no Tsugai",
+  "episode": 7,
+  "source": "WebRip",
+  "resolution": "1080p"
+}
+```
+这只是上线兜底；真正修复仍应训练一个 train/inference token 完全一致的 char 或 hybrid 模型。
+## 架构建议
+最推荐的重构路线：
+1. `BERT encoder + CRF`：约束 `O -> I-X`、`B-X -> I-Y` 等非法/低质量转移。
+2. char-level NER：保证 token-label alignment 不受 subword split 影响。
+3. rule-assisted parser：先抽取高置信结构锚点，再让模型负责模糊 title/group 边界。
+4. offset-based dataset：每条数据保存 raw filename、entity spans、tokens、offset_mapping、labels，训练时由 tokenizer 统一生成 labels。
+当前代码已先实现“无训练 CRF”的 constrained BIO decoding，作为上线前的轻量保护。完整 BERT+CRF 仍建议作为下一阶段训练架构重构。
+不要只优化 loss。这个任务的目标函数应更接近真实解析准确率：字段级 exact match + episode recall + title boundary F1。

diagnostics_report_word.md ADDED Viewed

	@@ -0,0 +1,2678 @@

+# Anime Filename Parser Diagnostics Report
+## Executive Summary
+- Dataset: `datasets\AnimeName\dmhy_weak.jsonl`
+- Inspected rows: 5,000
+- Dataset tokenizer variant: `regex`
+- Diagnosed tokenizer variant: `regex`
+- Vocab: `datasets\AnimeName\vocab.json` (8,000 tokens)
+- Max sequence length checked: 64
+- O-label ratio: 38.12%
+- Truncation risk: 0/5,000 rows (0.00%)
+- UNK rate after selected tokenizer: 6.9158%
+- BIO warnings collected: 9,711
+Primary finding: this task is structural filename parsing. Tokenizer/preprocessing identity is more important than lowering token loss.
+## Label And Entity Statistics
+### Label distribution
+- `O`: 32,517 (38.12%)
+- `I-TITLE`: 30,321 (35.54%)
+- `B-TITLE`: 5,593 (6.56%)
+- `B-EPISODE`: 5,000 (5.86%)
+- `B-SOURCE`: 4,032 (4.73%)
+- `I-GROUP`: 2,459 (2.88%)
+- `B-GROUP`: 2,299 (2.69%)
+- `B-RESOLUTION`: 1,765 (2.07%)
+- `B-SEASON`: 1,269 (1.49%)
+- `B-SPECIAL`: 57 (0.07%)
+### Entity count
+- `TITLE`: 6,061 (29.59%)
+- `EPISODE`: 5,000 (24.41%)
+- `SOURCE`: 4,032 (19.68%)
+- `GROUP`: 2,299 (11.22%)
+- `RESOLUTION`: 1,765 (8.62%)
+- `SEASON`: 1,269 (6.20%)
+- `SPECIAL`: 57 (0.28%)
+### Length distribution
+```json
+{
+  "raw_tokens": {
+    "min": 3,
+    "p50": 17,
+    "p90": 28,
+    "p95": 31,
+    "p99": 39,
+    "max": 54
+  },
+  "aligned_tokens": {
+    "min": 3,
+    "p50": 17,
+    "p90": 28,
+    "p95": 31,
+    "p99": 39,
+    "max": 54
+  }
+}
+```
+### Whitespace labels
+- `I-TITLE`: 10,539 (48.98%)
+- `O`: 10,484 (48.72%)
+- `I-GROUP`: 411 (1.91%)
+- `B-TITLE`: 84 (0.39%)
+## BIO Violations And Boundary Drift
+### Violation counts
+- `B_DIRECT_TO_O`: 9,243 (95.18%)
+- `ORPHAN_I`: 468 (4.82%)
+### Boundary drift heuristics
+- none
+### Sample violations
+```json
+[
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 8,
+    "prev_label": "B-EPISODE",
+    "label": "O",
+    "token": ".",
+    "row": 1,
+    "file_id": 1,
+    "filename": "Witch.Hat.Atelier.S01E07.1080p.NF.WEB-DL.JPN.AAC2.0.H.264.MSubs-ToonsHub",
+    "context_tokens": [
+      ".",
+      "Atelier",
+      ".",
+      "S01",
+      "E07",
+      ".",
+      "1080p",
+      ".",
+      "NF",
+      ".",
+      "WEB-DL"
+    ],
+    "context_labels": [
+      "I-TITLE",
+      "I-TITLE",
+      "O",
+      "B-SEASON",
+      "B-EPISODE",
+      "O",
+      "B-RESOLUTION",
+      "O",
+      "B-SOURCE",
+      "O",
+      "B-SOURCE"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 10,
+    "prev_label": "B-RESOLUTION",
+    "label": "O",
+    "token": ".",
+    "row": 1,
+    "file_id": 1,
+    "filename": "Witch.Hat.Atelier.S01E07.1080p.NF.WEB-DL.JPN.AAC2.0.H.264.MSubs-ToonsHub",
+    "context_tokens": [
+      ".",
+      "S01",
+      "E07",
+      ".",
+      "1080p",
+      ".",
+      "NF",
+      ".",
+      "WEB-DL",
+      ".",
+      "JP"
+    ],
+    "context_labels": [
+      "O",
+      "B-SEASON",
+      "B-EPISODE",
+      "O",
+      "B-RESOLUTION",
+      "O",
+      "B-SOURCE",
+      "O",
+      "B-SOURCE",
+      "O",
+      "B-SOURCE"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 12,
+    "prev_label": "B-SOURCE",
+    "label": "O",
+    "token": ".",
+    "row": 1,
+    "file_id": 1,
+    "filename": "Witch.Hat.Atelier.S01E07.1080p.NF.WEB-DL.JPN.AAC2.0.H.264.MSubs-ToonsHub",
+    "context_tokens": [
+      "E07",
+      ".",
+      "1080p",
+      ".",
+      "NF",
+      ".",
+      "WEB-DL",
+      ".",
+      "JP",
+      "N",
+      "."
+    ],
+    "context_labels": [
+      "B-EPISODE",
+      "O",
+      "B-RESOLUTION",
+      "O",
+      "B-SOURCE",
+      "O",
+      "B-SOURCE",
+      "O",
+      "B-SOURCE",
+      "O",
+      "O"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 14,
+    "prev_label": "B-SOURCE",
+    "label": "O",
+    "token": ".",
+    "row": 1,
+    "file_id": 1,
+    "filename": "Witch.Hat.Atelier.S01E07.1080p.NF.WEB-DL.JPN.AAC2.0.H.264.MSubs-ToonsHub",
+    "context_tokens": [
+      "1080p",
+      ".",
+      "NF",
+      ".",
+      "WEB-DL",
+      ".",
+      "JP",
+      "N",
+      ".",
+      "AAC",
+      "2"
+    ],
+    "context_labels": [
+      "B-RESOLUTION",
+      "O",
+      "B-SOURCE",
+      "O",
+      "B-SOURCE",
+      "O",
+      "B-SOURCE",
+      "O",
+      "O",
+      "B-SOURCE",
+      "O"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 16,
+    "prev_label": "B-SOURCE",
+    "label": "O",
+    "token": "N",
+    "row": 1,
+    "file_id": 1,
+    "filename": "Witch.Hat.Atelier.S01E07.1080p.NF.WEB-DL.JPN.AAC2.0.H.264.MSubs-ToonsHub",
+    "context_tokens": [
+      "NF",
+      ".",
+      "WEB-DL",
+      ".",
+      "JP",
+      "N",
+      ".",
+      "AAC",
+      "2",
+      ".",
+      "0"
+    ],
+    "context_labels": [
+      "B-SOURCE",
+      "O",
+      "B-SOURCE",
+      "O",
+      "B-SOURCE",
+      "O",
+      "O",
+      "B-SOURCE",
+      "O",
+      "O",
+      "O"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 19,
+    "prev_label": "B-SOURCE",
+    "label": "O",
+    "token": "2",
+    "row": 1,
+    "file_id": 1,
+    "filename": "Witch.Hat.Atelier.S01E07.1080p.NF.WEB-DL.JPN.AAC2.0.H.264.MSubs-ToonsHub",
+    "context_tokens": [
+      ".",
+      "JP",
+      "N",
+      ".",
+      "AAC",
+      "2",
+      ".",
+      "0",
+      ".",
+      "H.264",
+      "."
+    ],
+    "context_labels": [
+      "O",
+      "B-SOURCE",
+      "O",
+      "O",
+      "B-SOURCE",
+      "O",
+      "O",
+      "O",
+      "O",
+      "B-SOURCE",
+      "O"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 24,
+    "prev_label": "B-SOURCE",
+    "label": "O",
+    "token": ".",
+    "row": 1,
+    "file_id": 1,
+    "filename": "Witch.Hat.Atelier.S01E07.1080p.NF.WEB-DL.JPN.AAC2.0.H.264.MSubs-ToonsHub",
+    "context_tokens": [
+      "2",
+      ".",
+      "0",
+      ".",
+      "H.264",
+      ".",
+      "MSubs",
+      "-",
+      "ToonsHub"
+    ],
+    "context_labels": [
+      "O",
+      "O",
+      "O",
+      "O",
+      "B-SOURCE",
+      "O",
+      "B-SOURCE",
+      "O",
+      "O"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 26,
+    "prev_label": "B-SOURCE",
+    "label": "O",
+    "token": "-",
+    "row": 1,
+    "file_id": 1,
+    "filename": "Witch.Hat.Atelier.S01E07.1080p.NF.WEB-DL.JPN.AAC2.0.H.264.MSubs-ToonsHub",
+    "context_tokens": [
+      "0",
+      ".",
+      "H.264",
+      ".",
+      "MSubs",
+      "-",
+      "ToonsHub"
+    ],
+    "context_labels": [
+      "O",
+      "O",
+      "B-SOURCE",
+      "O",
+      "B-SOURCE",
+      "O",
+      "O"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 2,
+    "prev_label": "B-GROUP",
+    "label": "O",
+    "token": "]",
+    "row": 2,
+    "file_id": 2,
+    "filename": "[LoliHouse] Maid-san wa Taberu Dake - 07 [WebRip 1080p HEVC-10bit AAC ASSx2]",
+    "context_tokens": [
+      "[",
+      "LoliHouse",
+      "]",
+      " ",
+      "Maid",
+      "-",
+      "san",
+      " "
+    ],
+    "context_labels": [
+      "O",
+      "B-GROUP",
+      "O",
+      "O",
+      "B-TITLE",
+      "I-TITLE",
+      "I-TITLE",
+      "I-TITLE"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 17,
+    "prev_label": "B-EPISODE",
+    "label": "O",
+    "token": " ",
+    "row": 2,
+    "file_id": 2,
+    "filename": "[LoliHouse] Maid-san wa Taberu Dake - 07 [WebRip 1080p HEVC-10bit AAC ASSx2]",
+    "context_tokens": [
+      "Dake",
+      " ",
+      "-",
+      " ",
+      "07",
+      " ",
+      "[WebRip 1080p HEVC-10bit AAC ASSx2]"
+    ],
+    "context_labels": [
+      "I-TITLE",
+      "O",
+      "O",
+      "O",
+      "B-EPISODE",
+      "O",
+      "O"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 2,
+    "prev_label": "B-GROUP",
+    "label": "O",
+    "token": "]",
+    "row": 3,
+    "file_id": 3,
+    "filename": "[ANi] 異世界悠閒農家 2 - 06 [1080P][Baha][WEB-DL][AAC AVC][CHT]",
+    "context_tokens": [
+      "[",
+      "ANi",
+      "]",
+      " ",
+      "異",
+      "世",
+      "界",
+      "悠"
+    ],
+    "context_labels": [
+      "O",
+      "B-GROUP",
+      "O",
+      "O",
+      "B-TITLE",
+      "I-TITLE",
+      "I-TITLE",
+      "I-TITLE"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 13,
+    "prev_label": "B-SEASON",
+    "label": "O",
+    "token": " ",
+    "row": 3,
+    "file_id": 3,
+    "filename": "[ANi] 異世界悠閒農家 2 - 06 [1080P][Baha][WEB-DL][AAC AVC][CHT]",
+    "context_tokens": [
+      "閒",
+      "農",
+      "家",
+      " ",
+      "2",
+      " ",
+      "-",
+      " ",
+      "06",
+      " ",
+      "[1080P]"
+    ],
+    "context_labels": [
+      "I-TITLE",
+      "I-TITLE",
+      "I-TITLE",
+      "O",
+      "B-SEASON",
+      "O",
+      "O",
+      "O",
+      "B-EPISODE",
+      "O",
+      "B-RESOLUTION"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 17,
+    "prev_label": "B-EPISODE",
+    "label": "O",
+    "token": " ",
+    "row": 3,
+    "file_id": 3,
+    "filename": "[ANi] 異世界悠閒農家 2 - 06 [1080P][Baha][WEB-DL][AAC AVC][CHT]",
+    "context_tokens": [
+      "2",
+      " ",
+      "-",
+      " ",
+      "06",
+      " ",
+      "[1080P]",
+      "[Baha]",
+      "[WEB-DL]",
+      "[AAC AVC]",
+      "[CHT]"
+    ],
+    "context_labels": [
+      "B-SEASON",
+      "O",
+      "O",
+      "O",
+      "B-EPISODE",
+      "O",
+      "B-RESOLUTION",
+      "B-SOURCE",
+      "B-SOURCE",
+      "O",
+      "B-SOURCE"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 21,
+    "prev_label": "B-SOURCE",
+    "label": "O",
+    "token": "[AAC AVC]",
+    "row": 3,
+    "file_id": 3,
+    "filename": "[ANi] 異世界悠閒農家 2 - 06 [1080P][Baha][WEB-DL][AAC AVC][CHT]",
+    "context_tokens": [
+      "06",
+      " ",
+      "[1080P]",
+      "[Baha]",
+      "[WEB-DL]",
+      "[AAC AVC]",
+      "[CHT]"
+    ],
+    "context_labels": [
+      "B-EPISODE",
+      "O",
+      "B-RESOLUTION",
+      "B-SOURCE",
+      "B-SOURCE",
+      "O",
+      "B-SOURCE"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 2,
+    "prev_label": "B-GROUP",
+    "label": "O",
+    "token": "]",
+    "row": 4,
+    "file_id": 4,
+    "filename": "[ANi] 木頭風紀委員和迷你裙 JK 的故事 - 06 [1080P][Baha][WEB-DL][AAC AVC][CHT]",
+    "context_tokens": [
+      "[",
+      "ANi",
+      "]",
+      " ",
+      "木",
+      "頭",
+      "風",
+      "紀"
+    ],
+    "context_labels": [
+      "O",
+      "B-GROUP",
+      "O",
+      "O",
+      "B-TITLE",
+      "I-TITLE",
+      "I-TITLE",
+      "I-TITLE"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 24,
+    "prev_label": "B-EPISODE",
+    "label": "O",
+    "token": " ",
+    "row": 4,
+    "file_id": 4,
+    "filename": "[ANi] 木頭風紀委員和迷你裙 JK 的故事 - 06 [1080P][Baha][WEB-DL][AAC AVC][CHT]",
+    "context_tokens": [
+      "事",
+      " ",
+      "-",
+      " ",
+      "06",
+      " ",
+      "[1080P]",
+      "[Baha]",
+      "[WEB-DL]",
+      "[AAC AVC]",
+      "[CHT]"
+    ],
+    "context_labels": [
+      "I-TITLE",
+      "O",
+      "O",
+      "O",
+      "B-EPISODE",
+      "O",
+      "B-RESOLUTION",
+      "B-SOURCE",
+      "B-SOURCE",
+      "O",
+      "B-SOURCE"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 28,
+    "prev_label": "B-SOURCE",
+    "label": "O",
+    "token": "[AAC AVC]",
+    "row": 4,
+    "file_id": 4,
+    "filename": "[ANi] 木頭風紀委員和迷你裙 JK 的故事 - 06 [1080P][Baha][WEB-DL][AAC AVC][CHT]",
+    "context_tokens": [
+      "06",
+      " ",
+      "[1080P]",
+      "[Baha]",
+      "[WEB-DL]",
+      "[AAC AVC]",
+      "[CHT]"
+    ],
+    "context_labels": [
+      "B-EPISODE",
+      "O",
+      "B-RESOLUTION",
+      "B-SOURCE",
+      "B-SOURCE",
+      "O",
+      "B-SOURCE"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 2,
+    "prev_label": "B-GROUP",
+    "label": "O",
+    "token": "]",
+    "row": 5,
+    "file_id": 5,
+    "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][05][1080P][GB][MP4]",
+    "context_tokens": [
+      "[",
+      "KissSub",
+      "]",
+      "[",
+      "Shunkashuutou",
+      " ",
+      "Daikousha",
+      " "
+    ],
+    "context_labels": [
+      "O",
+      "B-GROUP",
+      "O",
+      "O",
+      "B-TITLE",
+      "I-TITLE",
+      "I-TITLE",
+      "I-TITLE"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 19,
+    "prev_label": "B-SOURCE",
+    "label": "O",
+    "token": "[MP4]",
+    "row": 5,
+    "file_id": 5,
+    "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][05][1080P][GB][MP4]",
+    "context_tokens": [
+      "Mai",
+      "]",
+      "[05]",
+      "[1080P]",
+      "[GB]",
+      "[MP4]"
+    ],
+    "context_labels": [
+      "I-TITLE",
+      "O",
+      "B-EPISODE",
+      "B-RESOLUTION",
+      "B-SOURCE",
+      "O"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 2,
+    "prev_label": "B-GROUP",
+    "label": "O",
+    "token": "]",
+    "row": 6,
+    "file_id": 6,
+    "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][06][1080P][GB][MP4]",
+    "context_tokens": [
+      "[",
+      "KissSub",
+      "]",
+      "[",
+      "Shunkashuutou",
+      " ",
+      "Daikousha",
+      " "
+    ],
+    "context_labels": [
+      "O",
+      "B-GROUP",
+      "O",
+      "O",
+      "B-TITLE",
+      "I-TITLE",
+      "I-TITLE",
+      "I-TITLE"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 19,
+    "prev_label": "B-SOURCE",
+    "label": "O",
+    "token": "[MP4]",
+    "row": 6,
+    "file_id": 6,
+    "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][06][1080P][GB][MP4]",
+    "context_tokens": [
+      "Mai",
+      "]",
+      "[06]",
+      "[1080P]",
+      "[GB]",
+      "[MP4]"
+    ],
+    "context_labels": [
+      "I-TITLE",
+      "O",
+      "B-EPISODE",
+      "B-RESOLUTION",
+      "B-SOURCE",
+      "O"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 2,
+    "prev_label": "B-GROUP",
+    "label": "O",
+    "token": "]",
+    "row": 7,
+    "file_id": 7,
+    "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][06][1080P][BIG5][MP4]",
+    "context_tokens": [
+      "[",
+      "KissSub",
+      "]",
+      "[",
+      "Shunkashuutou",
+      " ",
+      "Daikousha",
+      " "
+    ],
+    "context_labels": [
+      "O",
+      "B-GROUP",
+      "O",
+      "O",
+      "B-TITLE",
+      "I-TITLE",
+      "I-TITLE",
+      "I-TITLE"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 19,
+    "prev_label": "B-SOURCE",
+    "label": "O",
+    "token": "[MP4]",
+    "row": 7,
+    "file_id": 7,
+    "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][06][1080P][BIG5][MP4]",
+    "context_tokens": [
+      "Mai",
+      "]",
+      "[06]",
+      "[1080P]",
+      "[BIG5]",
+      "[MP4]"
+    ],
+    "context_labels": [
+      "I-TITLE",
+      "O",
+      "B-EPISODE",
+      "B-RESOLUTION",
+      "B-SOURCE",
+      "O"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 2,
+    "prev_label": "B-GROUP",
+    "label": "O",
+    "token": "]",
+    "row": 8,
+    "file_id": 8,
+    "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][05][1080P][BIG5][MP4]",
+    "context_tokens": [
+      "[",
+      "KissSub",
+      "]",
+      "[",
+      "Shunkashuutou",
+      " ",
+      "Daikousha",
+      " "
+    ],
+    "context_labels": [
+      "O",
+      "B-GROUP",
+      "O",
+      "O",
+      "B-TITLE",
+      "I-TITLE",
+      "I-TITLE",
+      "I-TITLE"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 19,
+    "prev_label": "B-SOURCE",
+    "label": "O",
+    "token": "[MP4]",
+    "row": 8,
+    "file_id": 8,
+    "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][05][1080P][BIG5][MP4]",
+    "context_tokens": [
+      "Mai",
+      "]",
+      "[05]",
+      "[1080P]",
+      "[BIG5]",
+      "[MP4]"
+    ],
+    "context_labels": [
+      "I-TITLE",
+      "O",
+      "B-EPISODE",
+      "B-RESOLUTION",
+      "B-SOURCE",
+      "O"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 2,
+    "prev_label": "B-GROUP",
+    "label": "O",
+    "token": "]",
+    "row": 9,
+    "file_id": 9,
+    "filename": "[Airota][Sousou no Frieren][29][1080p AVC AAC][CHT]",
+    "context_tokens": [
+      "[",
+      "Airota",
+      "]",
+      "[",
+      "Sousou",
+      " ",
+      "no",
+      " "
+    ],
+    "context_labels": [
+      "O",
+      "B-GROUP",
+      "O",
+      "O",
+      "B-TITLE",
+      "I-TITLE",
+      "I-TITLE",
+      "I-TITLE"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 11,
+    "prev_label": "B-EPISODE",
+    "label": "O",
+    "token": "[1080p AVC AAC]",
+    "row": 9,
+    "file_id": 9,
+    "filename": "[Airota][Sousou no Frieren][29][1080p AVC AAC][CHT]",
+    "context_tokens": [
+      "no",
+      " ",
+      "Frieren",
+      "]",
+      "[29]",
+      "[1080p AVC AAC]",
+      "[CHT]"
+    ],
+    "context_labels": [
+      "I-TITLE",
+      "I-TITLE",
+      "I-TITLE",
+      "O",
+      "B-EPISODE",
+      "O",
+      "B-SOURCE"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 2,
+    "prev_label": "B-GROUP",
+    "label": "O",
+    "token": "]",
+    "row": 10,
+    "file_id": 10,
+    "filename": "[Airota][Sousou no Frieren][30][1080p AVC AAC][CHT]",
+    "context_tokens": [
+      "[",
+      "Airota",
+      "]",
+      "[",
+      "Sousou",
+      " ",
+      "no",
+      " "
+    ],
+    "context_labels": [
+      "O",
+      "B-GROUP",
+      "O",
+      "O",
+      "B-TITLE",
+      "I-TITLE",
+      "I-TITLE",
+      "I-TITLE"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 11,
+    "prev_label": "B-EPISODE",
+    "label": "O",
+    "token": "[1080p AVC AAC]",
+    "row": 10,
+    "file_id": 10,
+    "filename": "[Airota][Sousou no Frieren][30][1080p AVC AAC][CHT]",
+    "context_tokens": [
+      "no",
+      " ",
+      "Frieren",
+      "]",
+      "[30]",
+      "[1080p AVC AAC]",
+      "[CHT]"
+    ],
+    "context_labels": [
+      "I-TITLE",
+      "I-TITLE",
+      "I-TITLE",
+      "O",
+      "B-EPISODE",
+      "O",
+      "B-SOURCE"
+    ]
+  },
+  {
+    "type": "B_DIRECT_TO_O",
+    "index": 2,
+    "prev_label": "B-GROUP",
+    "label": "O",
+    "token": "]",
+    "row": 11,
+    "file_id": 11,
+    "filename": "[Airota][Sousou no Frieren][31][1080p AVC AAC][CHT]",
+    "context_tokens": [
+      "[",
+      "Airota",
+      "]",
+      "[",
+      "Sousou",
+      " ",
+      "no",
+      " "
+    ],
+    "context_labels": [
+      "O",
+      "B-GROUP",
+      "O",
+      "O",
+      "B-TITLE",
+      "I-TITLE",
+      "I-TITLE",
+      "I-TITLE"
+    ]
+  }
+]
+```
+## Tokenizer Split And Alignment
+### Dataset tokens vs selected tokenizer mismatches
+```json
+[
+  {
+    "file_id": 2,
+    "filename": "[LoliHouse] Maid-san wa Taberu Dake - 07 [WebRip 1080p HEVC-10bit AAC ASSx2]",
+    "common_prefix": 0,
+    "dataset_tokens": [
+      "[",
+      "LoliHouse",
+      "]",
+      " ",
+      "Maid",
+      "-",
+      "san",
+      " ",
+      "wa",
+      " ",
+      "Taberu",
+      " ",
+      "Dake",
+      " ",
+      "-",
+      " ",
+      "07",
+      " ",
+      "[WebRip 1080p HEVC-10bit AAC ASSx2]"
+    ],
+    "tokenizer_tokens": [
+      "[LoliHouse]",
+      " ",
+      "Maid",
+      "-",
+      "san",
+      " ",
+      "wa",
+      " ",
+      "Taberu",
+      " ",
+      "Dake",
+      " ",
+      "-",
+      " ",
+      "07",
+      " ",
+      "[WebRip 1080p HEVC-10bit AAC ASSx2]"
+    ],
+    "dataset_len": 19,
+    "tokenizer_len": 17
+  },
+  {
+    "file_id": 3,
+    "filename": "[ANi] 異世界悠閒農家 2 - 06 [1080P][Baha][WEB-DL][AAC AVC][CHT]",
+    "common_prefix": 0,
+    "dataset_tokens": [
+      "[",
+      "ANi",
+      "]",
+      " ",
+      "異",
+      "世",
+      "界",
+      "悠",
+      "閒",
+      "農",
+      "家",
+      " ",
+      "2",
+      " ",
+      "-",
+      " ",
+      "06",
+      " ",
+      "[1080P]",
+      "[Baha]",
+      "[WEB-DL]",
+      "[AAC AVC]",
+      "[CHT]"
+    ],
+    "tokenizer_tokens": [
+      "[ANi]",
+      " ",
+      "異",
+      "世",
+      "界",
+      "悠",
+      "閒",
+      "農",
+      "家",
+      " ",
+      "2",
+      " ",
+      "-",
+      " ",
+      "06",
+      " ",
+      "[1080P]",
+      "[Baha]",
+      "[WEB-DL]",
+      "[AAC AVC]",
+      "[CHT]"
+    ],
+    "dataset_len": 23,
+    "tokenizer_len": 21
+  },
+  {
+    "file_id": 4,
+    "filename": "[ANi] 木頭風紀委員和迷你裙 JK 的故事 - 06 [1080P][Baha][WEB-DL][AAC AVC][CHT]",
+    "common_prefix": 0,
+    "dataset_tokens": [
+      "[",
+      "ANi",
+      "]",
+      " ",
+      "木",
+      "頭",
+      "風",
+      "紀",
+      "委",
+      "員",
+      "和",
+      "迷",
+      "你",
+      "裙",
+      " ",
+      "JK",
+      " ",
+      "的",
+      "故",
+      "事",
+      " ",
+      "-",
+      " ",
+      "06",
+      " ",
+      "[1080P]",
+      "[Baha]",
+      "[WEB-DL]",
+      "[AAC AVC]",
+      "[CHT]"
+    ],
+    "tokenizer_tokens": [
+      "[ANi]",
+      " ",
+      "木",
+      "頭",
+      "風",
+      "紀",
+      "委",
+      "員",
+      "和",
+      "迷",
+      "你",
+      "裙",
+      " ",
+      "JK",
+      " ",
+      "的",
+      "故",
+      "事",
+      " ",
+      "-",
+      " ",
+      "06",
+      " ",
+      "[1080P]",
+      "[Baha]",
+      "[WEB-DL]",
+      "[AAC AVC]",
+      "[CHT]"
+    ],
+    "dataset_len": 30,
+    "tokenizer_len": 28
+  },
+  {
+    "file_id": 5,
+    "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][05][1080P][GB][MP4]",
+    "common_prefix": 0,
+    "dataset_tokens": [
+      "[",
+      "KissSub",
+      "]",
+      "[",
+      "Shunkashuutou",
+      " ",
+      "Daikousha",
+      " ",
+      "-",
+      " ",
+      "Haru",
+      " ",
+      "no",
+      " ",
+      "Mai",
+      "]",
+      "[05]",
+      "[1080P]",
+      "[GB]",
+      "[MP4]"
+    ],
+    "tokenizer_tokens": [
+      "[KissSub]",
+      "[Shunkashuutou Daikousha - Haru no Mai]",
+      "[05]",
+      "[1080P]",
+      "[GB]",
+      "[MP4]"
+    ],
+    "dataset_len": 20,
+    "tokenizer_len": 6
+  },
+  {
+    "file_id": 6,
+    "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][06][1080P][GB][MP4]",
+    "common_prefix": 0,
+    "dataset_tokens": [
+      "[",
+      "KissSub",
+      "]",
+      "[",
+      "Shunkashuutou",
+      " ",
+      "Daikousha",
+      " ",
+      "-",
+      " ",
+      "Haru",
+      " ",
+      "no",
+      " ",
+      "Mai",
+      "]",
+      "[06]",
+      "[1080P]",
+      "[GB]",
+      "[MP4]"
+    ],
+    "tokenizer_tokens": [
+      "[KissSub]",
+      "[Shunkashuutou Daikousha - Haru no Mai]",
+      "[06]",
+      "[1080P]",
+      "[GB]",
+      "[MP4]"
+    ],
+    "dataset_len": 20,
+    "tokenizer_len": 6
+  },
+  {
+    "file_id": 7,
+    "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][06][1080P][BIG5][MP4]",
+    "common_prefix": 0,
+    "dataset_tokens": [
+      "[",
+      "KissSub",
+      "]",
+      "[",
+      "Shunkashuutou",
+      " ",
+      "Daikousha",
+      " ",
+      "-",
+      " ",
+      "Haru",
+      " ",
+      "no",
+      " ",
+      "Mai",
+      "]",
+      "[06]",
+      "[1080P]",
+      "[BIG5]",
+      "[MP4]"
+    ],
+    "tokenizer_tokens": [
+      "[KissSub]",
+      "[Shunkashuutou Daikousha - Haru no Mai]",
+      "[06]",
+      "[1080P]",
+      "[BIG5]",
+      "[MP4]"
+    ],
+    "dataset_len": 20,
+    "tokenizer_len": 6
+  },
+  {
+    "file_id": 8,
+    "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][05][1080P][BIG5][MP4]",
+    "common_prefix": 0,
+    "dataset_tokens": [
+      "[",
+      "KissSub",
+      "]",
+      "[",
+      "Shunkashuutou",
+      " ",
+      "Daikousha",
+      " ",
+      "-",
+      " ",
+      "Haru",
+      " ",
+      "no",
+      " ",
+      "Mai",
+      "]",
+      "[05]",
+      "[1080P]",
+      "[BIG5]",
+      "[MP4]"
+    ],
+    "tokenizer_tokens": [
+      "[KissSub]",
+      "[Shunkashuutou Daikousha - Haru no Mai]",
+      "[05]",
+      "[1080P]",
+      "[BIG5]",
+      "[MP4]"
+    ],
+    "dataset_len": 20,
+    "tokenizer_len": 6
+  },
+  {
+    "file_id": 9,
+    "filename": "[Airota][Sousou no Frieren][29][1080p AVC AAC][CHT]",
+    "common_prefix": 0,
+    "dataset_tokens": [
+      "[",
+      "Airota",
+      "]",
+      "[",
+      "Sousou",
+      " ",
+      "no",
+      " ",
+      "Frieren",
+      "]",
+      "[29]",
+      "[1080p AVC AAC]",
+      "[CHT]"
+    ],
+    "tokenizer_tokens": [
+      "[Airota]",
+      "[Sousou no Frieren]",
+      "[29]",
+      "[1080p AVC AAC]",
+      "[CHT]"
+    ],
+    "dataset_len": 13,
+    "tokenizer_len": 5
+  },
+  {
+    "file_id": 10,
+    "filename": "[Airota][Sousou no Frieren][30][1080p AVC AAC][CHT]",
+    "common_prefix": 0,
+    "dataset_tokens": [
+      "[",
+      "Airota",
+      "]",
+      "[",
+      "Sousou",
+      " ",
+      "no",
+      " ",
+      "Frieren",
+      "]",
+      "[30]",
+      "[1080p AVC AAC]",
+      "[CHT]"
+    ],
+    "tokenizer_tokens": [
+      "[Airota]",
+      "[Sousou no Frieren]",
+      "[30]",
+      "[1080p AVC AAC]",
+      "[CHT]"
+    ],
+    "dataset_len": 13,
+    "tokenizer_len": 5
+  },
+  {
+    "file_id": 11,
+    "filename": "[Airota][Sousou no Frieren][31][1080p AVC AAC][CHT]",
+    "common_prefix": 0,
+    "dataset_tokens": [
+      "[",
+      "Airota",
+      "]",
+      "[",
+      "Sousou",
+      " ",
+      "no",
+      " ",
+      "Frieren",
+      "]",
+      "[31]",
+      "[1080p AVC AAC]",
+      "[CHT]"
+    ],
+    "tokenizer_tokens": [
+      "[Airota]",
+      "[Sousou no Frieren]",
+      "[31]",
+      "[1080p AVC AAC]",
+      "[CHT]"
+    ],
+    "dataset_len": 13,
+    "tokenizer_len": 5
+  }
+]
+```
+### Split examples
+```json
+[
+  {
+    "file_id": 1,
+    "filename": "Witch.Hat.Atelier.S01E07.1080p.NF.WEB-DL.JPN.AAC2.0.H.264.MSubs-ToonsHub",
+    "dataset_tokens": [
+      "Witch",
+      ".",
+      "Hat",
+      ".",
+      "Atelier",
+      ".",
+      "S01",
+      "E07",
+      ".",
+      "1080p",
+      ".",
+      "NF",
+      ".",
+      "WEB-DL",
+      ".",
+      "JP",
+      "N",
+      ".",
+      "AAC",
+      "2",
+      ".",
+      "0",
+      ".",
+      "H.264",
+      ".",
+      "MSubs",
+      "-",
+      "ToonsHub"
+    ],
+    "diagnosed_tokens": [
+      "Witch",
+      ".",
+      "Hat",
+      ".",
+      "Atelier",
+      ".",
+      "S01",
+      "E07",
+      ".",
+      "1080p",
+      ".",
+      "NF",
+      ".",
+      "WEB-DL",
+      ".",
+      "JP",
+      "N",
+      ".",
+      "AAC",
+      "2",
+      ".",
+      "0",
+      ".",
+      "H.264",
+      ".",
+      "MSubs",
+      "-",
+      "ToonsHub"
+    ],
+    "regex_tokens": [
+      "Witch",
+      ".",
+      "Hat",
+      ".",
+      "Atelier",
+      ".",
+      "S01",
+      "E07",
+      ".",
+      "1080p",
+      ".",
+      "NF",
+      ".",
+      "WEB-DL",
+      ".",
+      "JP",
+      "N",
+      ".",
+      "AAC",
+      "2",
+      ".",
+      "0",
+      ".",
+      "H.264",
+      ".",
+      "MSubs",
+      "-",
+      "ToonsHub"
+    ],
+    "char_tokens": [
+      "W",
+      "i",
+      "t",
+      "c",
+      "h",
+      ".",
+      "H",
+      "a",
+      "t",
+      ".",
+      "A",
+      "t",
+      "e",
+      "l",
+      "i",
+      "e",
+      "r",
+      ".",
+      "S",
+      "0",
+      "1",
+      "E",
+      "0",
+      "7",
+      ".",
+      "1",
+      "0",
+      "8",
+      "0",
+      "p",
+      ".",
+      "N",
+      "F",
+      ".",
+      "W",
+      "E",
+      "B",
+      "-",
+      "D",
+      "L",
+      ".",
+      "J",
+      "P",
+      "N",
+      ".",
+      "A",
+      "A",
+      "C",
+      "2",
+      ".",
+      "0",
+      ".",
+      "H",
+      ".",
+      "2",
+      "6",
+      "4",
+      ".",
+      "M",
+      "S",
+      "u",
+      "b",
+      "s",
+      "-",
+      "T",
+      "o",
+      "o",
+      "n",
+      "s",
+      "H",
+      "u",
+      "b"
+    ]
+  },
+  {
+    "file_id": 2,
+    "filename": "[LoliHouse] Maid-san wa Taberu Dake - 07 [WebRip 1080p HEVC-10bit AAC ASSx2]",
+    "dataset_tokens": [
+      "[",
+      "LoliHouse",
+      "]",
+      " ",
+      "Maid",
+      "-",
+      "san",
+      " ",
+      "wa",
+      " ",
+      "Taberu",
+      " ",
+      "Dake",
+      " ",
+      "-",
+      " ",
+      "07",
+      " ",
+      "[WebRip 1080p HEVC-10bit AAC ASSx2]"
+    ],
+    "diagnosed_tokens": [
+      "[LoliHouse]",
+      " ",
+      "Maid",
+      "-",
+      "san",
+      " ",
+      "wa",
+      " ",
+      "Taberu",
+      " ",
+      "Dake",
+      " ",
+      "-",
+      " ",
+      "07",
+      " ",
+      "[WebRip 1080p HEVC-10bit AAC ASSx2]"
+    ],
+    "regex_tokens": [
+      "[LoliHouse]",
+      " ",
+      "Maid",
+      "-",
+      "san",
+      " ",
+      "wa",
+      " ",
+      "Taberu",
+      " ",
+      "Dake",
+      " ",
+      "-",
+      " ",
+      "07",
+      " ",
+      "[WebRip 1080p HEVC-10bit AAC ASSx2]"
+    ],
+    "char_tokens": [
+      "[",
+      "L",
+      "o",
+      "l",
+      "i",
+      "H",
+      "o",
+      "u",
+      "s",
+      "e",
+      "]",
+      " ",
+      "M",
+      "a",
+      "i",
+      "d",
+      "-",
+      "s",
+      "a",
+      "n",
+      " ",
+      "w",
+      "a",
+      " ",
+      "T",
+      "a",
+      "b",
+      "e",
+      "r",
+      "u",
+      " ",
+      "D",
+      "a",
+      "k",
+      "e",
+      " ",
+      "-",
+      " ",
+      "0",
+      "7",
+      " ",
+      "[",
+      "W",
+      "e",
+      "b",
+      "R",
+      "i",
+      "p",
+      " ",
+      "1",
+      "0",
+      "8",
+      "0",
+      "p",
+      " ",
+      "H",
+      "E",
+      "V",
+      "C",
+      "-",
+      "1",
+      "0",
+      "b",
+      "i",
+      "t",
+      " ",
+      "A",
+      "A",
+      "C",
+      " ",
+      "A",
+      "S",
+      "S",
+      "x",
+      "2",
+      "]"
+    ]
+  },
+  {
+    "file_id": 3,
+    "filename": "[ANi] 異世界悠閒農家 2 - 06 [1080P][Baha][WEB-DL][AAC AVC][CHT]",
+    "dataset_tokens": [
+      "[",
+      "ANi",
+      "]",
+      " ",
+      "異",
+      "世",
+      "界",
+      "悠",
+      "閒",
+      "農",
+      "家",
+      " ",
+      "2",
+      " ",
+      "-",
+      " ",
+      "06",
+      " ",
+      "[1080P]",
+      "[Baha]",
+      "[WEB-DL]",
+      "[AAC AVC]",
+      "[CHT]"
+    ],
+    "diagnosed_tokens": [
+      "[ANi]",
+      " ",
+      "異",
+      "世",
+      "界",
+      "悠",
+      "閒",
+      "農",
+      "家",
+      " ",
+      "2",
+      " ",
+      "-",
+      " ",
+      "06",
+      " ",
+      "[1080P]",
+      "[Baha]",
+      "[WEB-DL]",
+      "[AAC AVC]",
+      "[CHT]"
+    ],
+    "regex_tokens": [
+      "[ANi]",
+      " ",
+      "異",
+      "世",
+      "界",
+      "悠",
+      "閒",
+      "農",
+      "家",
+      " ",
+      "2",
+      " ",
+      "-",
+      " ",
+      "06",
+      " ",
+      "[1080P]",
+      "[Baha]",
+      "[WEB-DL]",
+      "[AAC AVC]",
+      "[CHT]"
+    ],
+    "char_tokens": [
+      "[",
+      "A",
+      "N",
+      "i",
+      "]",
+      " ",
+      "異",
+      "世",
+      "界",
+      "悠",
+      "閒",
+      "農",
+      "家",
+      " ",
+      "2",
+      " ",
+      "-",
+      " ",
+      "0",
+      "6",
+      " ",
+      "[",
+      "1",
+      "0",
+      "8",
+      "0",
+      "P",
+      "]",
+      "[",
+      "B",
+      "a",
+      "h",
+      "a",
+      "]",
+      "[",
+      "W",
+      "E",
+      "B",
+      "-",
+      "D",
+      "L",
+      "]",
+      "[",
+      "A",
+      "A",
+      "C",
+      " ",
+      "A",
+      "V",
+      "C",
+      "]",
+      "[",
+      "C",
+      "H",
+      "T",
+      "]"
+    ]
+  },
+  {
+    "file_id": 4,
+    "filename": "[ANi] 木頭風紀委員和迷你裙 JK 的故事 - 06 [1080P][Baha][WEB-DL][AAC AVC][CHT]",
+    "dataset_tokens": [
+      "[",
+      "ANi",
+      "]",
+      " ",
+      "木",
+      "頭",
+      "風",
+      "紀",
+      "委",
+      "員",
+      "和",
+      "迷",
+      "你",
+      "裙",
+      " ",
+      "JK",
+      " ",
+      "的",
+      "故",
+      "事",
+      " ",
+      "-",
+      " ",
+      "06",
+      " ",
+      "[1080P]",
+      "[Baha]",
+      "[WEB-DL]",
+      "[AAC AVC]",
+      "[CHT]"
+    ],
+    "diagnosed_tokens": [
+      "[ANi]",
+      " ",
+      "木",
+      "頭",
+      "風",
+      "紀",
+      "委",
+      "員",
+      "和",
+      "迷",
+      "你",
+      "裙",
+      " ",
+      "JK",
+      " ",
+      "的",
+      "故",
+      "事",
+      " ",
+      "-",
+      " ",
+      "06",
+      " ",
+      "[1080P]",
+      "[Baha]",
+      "[WEB-DL]",
+      "[AAC AVC]",
+      "[CHT]"
+    ],
+    "regex_tokens": [
+      "[ANi]",
+      " ",
+      "木",
+      "頭",
+      "風",
+      "紀",
+      "委",
+      "員",
+      "和",
+      "迷",
+      "你",
+      "裙",
+      " ",
+      "JK",
+      " ",
+      "的",
+      "故",
+      "事",
+      " ",
+      "-",
+      " ",
+      "06",
+      " ",
+      "[1080P]",
+      "[Baha]",
+      "[WEB-DL]",
+      "[AAC AVC]",
+      "[CHT]"
+    ],
+    "char_tokens": [
+      "[",
+      "A",
+      "N",
+      "i",
+      "]",
+      " ",
+      "木",
+      "頭",
+      "風",
+      "紀",
+      "委",
+      "員",
+      "和",
+      "迷",
+      "你",
+      "裙",
+      " ",
+      "J",
+      "K",
+      " ",
+      "的",
+      "故",
+      "事",
+      " ",
+      "-",
+      " ",
+      "0",
+      "6",
+      " ",
+      "[",
+      "1",
+      "0",
+      "8",
+      "0",
+      "P",
+      "]",
+      "[",
+      "B",
+      "a",
+      "h",
+      "a",
+      "]",
+      "[",
+      "W",
+      "E",
+      "B",
+      "-",
+      "D",
+      "L",
+      "]",
+      "[",
+      "A",
+      "A",
+      "C",
+      " ",
+      "A",
+      "V",
+      "C",
+      "]",
+      "[",
+      "C",
+      "H",
+      "T",
+      "]"
+    ]
+  },
+  {
+    "file_id": 5,
+    "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][05][1080P][GB][MP4]",
+    "dataset_tokens": [
+      "[",
+      "KissSub",
+      "]",
+      "[",
+      "Shunkashuutou",
+      " ",
+      "Daikousha",
+      " ",
+      "-",
+      " ",
+      "Haru",
+      " ",
+      "no",
+      " ",
+      "Mai",
+      "]",
+      "[05]",
+      "[1080P]",
+      "[GB]",
+      "[MP4]"
+    ],
+    "diagnosed_tokens": [
+      "[KissSub]",
+      "[Shunkashuutou Daikousha - Haru no Mai]",
+      "[05]",
+      "[1080P]",
+      "[GB]",
+      "[MP4]"
+    ],
+    "regex_tokens": [
+      "[KissSub]",
+      "[Shunkashuutou Daikousha - Haru no Mai]",
+      "[05]",
+      "[1080P]",
+      "[GB]",
+      "[MP4]"
+    ],
+    "char_tokens": [
+      "[",
+      "K",
+      "i",
+      "s",
+      "s",
+      "S",
+      "u",
+      "b",
+      "]",
+      "[",
+      "S",
+      "h",
+      "u",
+      "n",
+      "k",
+      "a",
+      "s",
+      "h",
+      "u",
+      "u",
+      "t",
+      "o",
+      "u",
+      " ",
+      "D",
+      "a",
+      "i",
+      "k",
+      "o",
+      "u",
+      "s",
+      "h",
+      "a",
+      " ",
+      "-",
+      " ",
+      "H",
+      "a",
+      "r",
+      "u",
+      " ",
+      "n",
+      "o",
+      " ",
+      "M",
+      "a",
+      "i",
+      "]",
+      "[",
+      "0",
+      "5",
+      "]",
+      "[",
+      "1",
+      "0",
+      "8",
+      "0",
+      "P",
+      "]",
+      "[",
+      "G",
+      "B",
+      "]",
+      "[",
+      "M",
+      "P",
+      "4",
+      "]"
+    ]
+  },
+  {
+    "file_id": 6,
+    "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][06][1080P][GB][MP4]",
+    "dataset_tokens": [
+      "[",
+      "KissSub",
+      "]",
+      "[",
+      "Shunkashuutou",
+      " ",
+      "Daikousha",
+      " ",
+      "-",
+      " ",
+      "Haru",
+      " ",
+      "no",
+      " ",
+      "Mai",
+      "]",
+      "[06]",
+      "[1080P]",
+      "[GB]",
+      "[MP4]"
+    ],
+    "diagnosed_tokens": [
+      "[KissSub]",
+      "[Shunkashuutou Daikousha - Haru no Mai]",
+      "[06]",
+      "[1080P]",
+      "[GB]",
+      "[MP4]"
+    ],
+    "regex_tokens": [
+      "[KissSub]",
+      "[Shunkashuutou Daikousha - Haru no Mai]",
+      "[06]",
+      "[1080P]",
+      "[GB]",
+      "[MP4]"
+    ],
+    "char_tokens": [
+      "[",
+      "K",
+      "i",
+      "s",
+      "s",
+      "S",
+      "u",
+      "b",
+      "]",
+      "[",
+      "S",
+      "h",
+      "u",
+      "n",
+      "k",
+      "a",
+      "s",
+      "h",
+      "u",
+      "u",
+      "t",
+      "o",
+      "u",
+      " ",
+      "D",
+      "a",
+      "i",
+      "k",
+      "o",
+      "u",
+      "s",
+      "h",
+      "a",
+      " ",
+      "-",
+      " ",
+      "H",
+      "a",
+      "r",
+      "u",
+      " ",
+      "n",
+      "o",
+      " ",
+      "M",
+      "a",
+      "i",
+      "]",
+      "[",
+      "0",
+      "6",
+      "]",
+      "[",
+      "1",
+      "0",
+      "8",
+      "0",
+      "P",
+      "]",
+      "[",
+      "G",
+      "B",
+      "]",
+      "[",
+      "M",
+      "P",
+      "4",
+      "]"
+    ]
+  },
+  {
+    "file_id": 7,
+    "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][06][1080P][BIG5][MP4]",
+    "dataset_tokens": [
+      "[",
+      "KissSub",
+      "]",
+      "[",
+      "Shunkashuutou",
+      " ",
+      "Daikousha",
+      " ",
+      "-",
+      " ",
+      "Haru",
+      " ",
+      "no",
+      " ",
+      "Mai",
+      "]",
+      "[06]",
+      "[1080P]",
+      "[BIG5]",
+      "[MP4]"
+    ],
+    "diagnosed_tokens": [
+      "[KissSub]",
+      "[Shunkashuutou Daikousha - Haru no Mai]",
+      "[06]",
+      "[1080P]",
+      "[BIG5]",
+      "[MP4]"
+    ],
+    "regex_tokens": [
+      "[KissSub]",
+      "[Shunkashuutou Daikousha - Haru no Mai]",
+      "[06]",
+      "[1080P]",
+      "[BIG5]",
+      "[MP4]"
+    ],
+    "char_tokens": [
+      "[",
+      "K",
+      "i",
+      "s",
+      "s",
+      "S",
+      "u",
+      "b",
+      "]",
+      "[",
+      "S",
+      "h",
+      "u",
+      "n",
+      "k",
+      "a",
+      "s",
+      "h",
+      "u",
+      "u",
+      "t",
+      "o",
+      "u",
+      " ",
+      "D",
+      "a",
+      "i",
+      "k",
+      "o",
+      "u",
+      "s",
+      "h",
+      "a",
+      " ",
+      "-",
+      " ",
+      "H",
+      "a",
+      "r",
+      "u",
+      " ",
+      "n",
+      "o",
+      " ",
+      "M",
+      "a",
+      "i",
+      "]",
+      "[",
+      "0",
+      "6",
+      "]",
+      "[",
+      "1",
+      "0",
+      "8",
+      "0",
+      "P",
+      "]",
+      "[",
+      "B",
+      "I",
+      "G",
+      "5",
+      "]",
+      "[",
+      "M",
+      "P",
+      "4",
+      "]"
+    ]
+  },
+  {
+    "file_id": 8,
+    "filename": "[KissSub][Shunkashuutou Daikousha - Haru no Mai][05][1080P][BIG5][MP4]",
+    "dataset_tokens": [
+      "[",
+      "KissSub",
+      "]",
+      "[",
+      "Shunkashuutou",
+      " ",
+      "Daikousha",
+      " ",
+      "-",
+      " ",
+      "Haru",
+      " ",
+      "no",
+      " ",
+      "Mai",
+      "]",
+      "[05]",
+      "[1080P]",
+      "[BIG5]",
+      "[MP4]"
+    ],
+    "diagnosed_tokens": [
+      "[KissSub]",
+      "[Shunkashuutou Daikousha - Haru no Mai]",
+      "[05]",
+      "[1080P]",
+      "[BIG5]",
+      "[MP4]"
+    ],
+    "regex_tokens": [
+      "[KissSub]",
+      "[Shunkashuutou Daikousha - Haru no Mai]",
+      "[05]",
+      "[1080P]",
+      "[BIG5]",
+      "[MP4]"
+    ],
+    "char_tokens": [
+      "[",
+      "K",
+      "i",
+      "s",
+      "s",
+      "S",
+      "u",
+      "b",
+      "]",
+      "[",
+      "S",
+      "h",
+      "u",
+      "n",
+      "k",
+      "a",
+      "s",
+      "h",
+      "u",
+      "u",
+      "t",
+      "o",
+      "u",
+      " ",
+      "D",
+      "a",
+      "i",
+      "k",
+      "o",
+      "u",
+      "s",
+      "h",
+      "a",
+      " ",
+      "-",
+      " ",
+      "H",
+      "a",
+      "r",
+      "u",
+      " ",
+      "n",
+      "o",
+      " ",
+      "M",
+      "a",
+      "i",
+      "]",
+      "[",
+      "0",
+      "5",
+      "]",
+      "[",
+      "1",
+      "0",
+      "8",
+      "0",
+      "P",
+      "]",
+      "[",
+      "B",
+      "I",
+      "G",
+      "5",
+      "]",
+      "[",
+      "M",
+      "P",
+      "4",
+      "]"
+    ]
+  }
+]
+```
+### Vocabulary coverage
+```json
+{
+  "total": 85312,
+  "unk": 5900,
+  "unk_rate": 0.06915791447861966,
+  "top_unk": [
+    [
+      "(BDRip 720p x264)",
+      66
+    ],
+    [
+      "Partie",
+      59
+    ],
+    [
+      "incantevole",
+      54
+    ],
+    [
+      "Muxed",
+      54
+    ],
+    [
+      "nonscordarmi",
+      54
+    ],
+    [
+      "NEET",
+      52
+    ],
+    [
+      "Dousei",
+      52
+    ],
+    [
+      "[krikoun68]",
+      52
+    ],
+    [
+      "[Blu-Ray - MUX - 960p - x264 - AC3 ITA-JAP - SUB ITA]",
+      51
+    ],
+    [
+      "CTR",
+      45
+    ],
+    [
+      "joseol",
+      45
+    ],
+    [
+      "e99",
+      45
+    ],
+    [
+      "(1440x1080 h264 AC3 AAC)",
+      45
+    ],
+    [
+      "VERS",
+      37
+    ],
+    [
+      "脙",
+      37
+    ],
+    [
+      "Shunkashuutou",
+      36
+    ],
+    [
+      "Daikousha",
+      36
+    ],
+    [
+      "houbatsu",
+      36
+    ],
+    [
+      "DEFINITIVA",
+      36
+    ],
+    [
+      "Crash",
+      35
+    ],
+    [
+      "Realm",
+      31
+    ],
+    [
+      "UHD",
+      31
+    ],
+    [
+      "[BDrip 1080P HEVC-10bit AAC]",
+      29
+    ],
+    [
+      "Choroi",
+      28
+    ],
+    [
+      "완",
+      28
+    ]
+  ]
+}
+```
+## Train Inference Tokenizer Comparison
+- Model dir: `checkpoints\dmhy-finetune\final`
+- Model tokenizer variant: `regex`
+- Dataset tokenizer variant: `regex`
+- Diagnostic tokenizer variant: `regex`
+- Model tokenizer vocab size: 3,000
+- Diagnostic tokenizer vocab size: 8,000
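+If the dataset and model tokenizer variants differ, validation loss can be low while real inference sees different token IDs and boundaries. The variants match in this run, but the vocabulary sizes do not (3,000 for the model tokenizer vs. 8,000 for the diagnostic tokenizer), so token IDs and the UNK statistics above may still differ from what the model actually sees.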
+## Model Confusion Analysis
+- Evaluated samples: 128
+- Entity precision: 0.9568
+- Entity recall: 0.9530
+- Entity F1: 0.9549
+### Boundary error classes
+- `B-boundary`: 26 (56.52%)
+- `entity-type`: 20 (43.48%)
+### Top token-label confusions
+| true | pred | count |
+| --- | --- | --- |
+| O | I-TITLE | 17 |
+| O | B-EPISODE | 6 |
+| B-SOURCE | O | 4 |
+| I-TITLE | O | 3 |
+| B-EPISODE | O | 3 |
+| B-SEASON | O | 2 |
+| B-RESOLUTION | B-SOURCE | 2 |
+| B-EPISODE | I-TITLE | 2 |
+| O | B-TITLE | 2 |
+| B-TITLE | I-TITLE | 2 |
+| O | B-SOURCE | 1 |
+| B-SEASON | I-TITLE | 1 |
+| O | B-SEASON | 1 |
+### Top entity-type confusions
+| true | pred | count |
+| --- | --- | --- |
+| O | TITLE | 19 |
+| O | EPISODE | 6 |
+| SOURCE | O | 4 |
+| TITLE | O | 3 |
+| EPISODE | O | 3 |
+| SEASON | O | 2 |
+| RESOLUTION | SOURCE | 2 |
+| EPISODE | TITLE | 2 |
+| O | SOURCE | 1 |
+| SEASON | TITLE | 1 |
+| O | SEASON | 1 |
+### Seqeval report
+```text
+              precision    recall  f1-score   support
+     EPISODE     0.9535    0.9609    0.9572       128
+       GROUP     1.0000    1.0000    1.0000        53
+  RESOLUTION     1.0000    0.9545    0.9767        44
+      SEASON     0.9630    0.8966    0.9286        29
+      SOURCE     0.9703    0.9608    0.9655       102
+     SPECIAL     1.0000    1.0000    1.0000         5
+       TITLE     0.9211    0.9333    0.9272       150
+   micro avg     0.9568    0.9530    0.9549       511
+   macro avg     0.9725    0.9580    0.9650       511
+weighted avg     0.9571    0.9530    0.9550       511
+```
+## Recommended Pipeline
+1. Use one tokenizer variant end to end and save it in the checkpoint metadata.
+2. Prefer char-level or a deterministic hybrid tokenizer for DMHY filenames; avoid generic subword tokenization, which can split a filename at points that do not line up with the BIO label boundaries.
+3. For char-level runs, use `--tokenizer char --max-seq-length 128` with `vocab.char.json`.
+4. Add CRF decoding or constrained BIO decoding so illegal I-X transitions and impossible boundary jumps are blocked.
+5. Keep rule-assisted post-processing for high-confidence structural anchors: the leading group bracket, season/episode markers such as ` - 07` and `S01E07`, source, and resolution.
+6. Track entity-level F1 and field exact-match on real filenames; do not accept low validation loss alone.

inference.py CHANGED Viewed

@@ -14,7 +14,7 @@ import json
 import os
 import re
 import sys
-from typing import Dict, List, Optional
 import torch
 from transformers import BertForTokenClassification
@@ -70,58 +70,149 @@ def extract_resolution(text: str) -> Optional[str]:
     return clean if clean else None
 def trim_decorations(text: str) -> str:
     """Strip outer whitespace and release-bracket characters from entity text."""
     # Order matters: whitespace first, then bracket characters at either end,
     # then any whitespace that was sitting just inside the brackets.
     bracket_chars = "[]()【】《》（）"
     without_outer_space = text.strip()
     without_brackets = without_outer_space.strip(bracket_chars)
     return without_brackets.strip()
-def postprocess(tokens: List[str], labels: List[str]) -> Dict:
-    """
-    Convert BIO-labeled tokens into structured metadata.
-    Merges consecutive B- / I- tokens of the same entity type,
-    then extracts structured fields.
     """
-    result: Dict = {
-        "title": None,
-        "season": None,
-        "episode": None,
-        "group": None,
-        "resolution": None,
-        "source": None,
-        "special": None,
-    }
-    # Merge consecutive B- / I- tokens into entities
-    entities: List[tuple] = []
     current_entity: Optional[str] = None
     current_tokens: List[str] = []
     for token, label in zip(tokens, labels):
         if label.startswith("B-"):
-            # Finalize previous entity
             if current_entity:
-                entities.append((current_entity, "".join(current_tokens)))
-            current_entity = label[2:]  # Remove "B-"
             current_tokens = [token]
         elif label.startswith("I-"):
             entity_type = label[2:]
             if current_entity == entity_type:
                 current_tokens.append(token)
             else:
-                # Orphaned I- — start new entity
                 if current_entity:
-                    entities.append((current_entity, "".join(current_tokens)))
                 current_entity = entity_type
                 current_tokens = [token]
-        else:  # O
             if current_entity:
-                entities.append((current_entity, "".join(current_tokens)))
                 current_entity = None
                 current_tokens = []
     if current_entity:
-        entities.append((current_entity, "".join(current_tokens)))
     # Fill result
     for entity_type, text in entities:
@@ -163,15 +254,177 @@ def postprocess(tokens: List[str], labels: List[str]) -> Dict:
             if (trimmed := trim_decorations(f))
         )
     return result
 def parse_filename(
     filename: str,
     model: BertForTokenClassification,
     tokenizer: AnimeTokenizer,
     id2label: Dict[int, str],
     max_length: int = 64,
 ) -> Dict:
     """
     Parse an anime filename and extract structured metadata.
@@ -195,6 +448,8 @@ def parse_filename(
     # Convert to input IDs
     input_ids = tokenizer.convert_tokens_to_ids(tokens)
     # Add special tokens
     input_ids = [tokenizer.cls_token_id] + input_ids + [tokenizer.sep_token_id]
@@ -202,8 +457,8 @@ def parse_filename(
     # Truncate if needed
     if len(input_ids) > max_length:
-        input_ids = input_ids[:max_length]
-        attention_mask = attention_mask[:max_length]
     # Pad
     pad_len = max_length - len(input_ids)
@@ -216,10 +471,6 @@ def parse_filename(
     input_tensor = torch.tensor([input_ids], device=device)
     mask_tensor = torch.tensor([attention_mask], device=device)
-    with torch.no_grad():
-        logits = model(input_ids=input_tensor, attention_mask=mask_tensor).logits
-    predictions = torch.argmax(logits, dim=-1)[0]
     # Remove special token predictions
     # Count real tokens used (minus CLS/SEP)
     real_token_count = len(tokens)
@@ -230,11 +481,62 @@ def parse_filename(
                 "group": None, "resolution": None, "source": None,
                 "special": None}
-    pred_labels = predictions[1:1 + available].tolist()
     label_strings = [id2label.get(p, "O") for p in pred_labels]
     # Post-process
-    return postprocess(tokens[:available], label_strings)
 def main():
@@ -248,6 +550,12 @@ def main():
                         help="Tokenizer variant override. Defaults to checkpoint metadata")
     parser.add_argument("--max-length", type=int, default=64,
                         help="Maximum sequence length")
     args = parser.parse_args()
     # Load config
@@ -262,7 +570,10 @@ def main():
     model = BertForTokenClassification.from_pretrained(args.model_dir)
     model.eval()
-    id2label = cfg.id2label
     # Process filenames
     filenames_to_parse: List[str] = []
@@ -283,7 +594,16 @@ def main():
     for fn in filenames_to_parse:
         if not fn.strip():
             continue
-        result = parse_filename(fn, model, tokenizer, id2label, args.max_length)
         result["_input"] = fn
         results.append(result)

 import os
 import re
 import sys
+from typing import Dict, List, Optional, Tuple
 import torch
 from transformers import BertForTokenClassification
     return clean if clean else None
def display_token(token: str) -> str:
    """Return *token* with bare whitespace swapped for a visible placeholder.

    A token that is exactly one space becomes ``<SPACE>`` and a token that is
    exactly one tab becomes ``<TAB>``; any other token is returned unchanged.
    Used when rendering the per-token debug table.
    """
    placeholders = {" ": "<SPACE>", "\t": "<TAB>"}
    return placeholders.get(token, token)
 def trim_decorations(text: str) -> str:
     """Strip surrounding whitespace and outer bracket characters from *text*.

     Whitespace is removed first, then any run of ASCII or CJK bracket
     characters at either end, then whitespace exposed by that removal.
     """
     bracket_chars = "[]()【】《》（）"
     without_padding = text.strip()
     without_brackets = without_padding.strip(bracket_chars)
     return without_brackets.strip()
def join_entity_tokens(tokens: List[str], tokenizer: Optional[AnimeTokenizer] = None) -> str:
    """Concatenate the tokens of one entity span into its surface text.

    Tokens are joined with no separator. Whitespace tokens are part of the
    token stream, so plain concatenation restores the original spacing.

    Args:
        tokens: Tokens belonging to a single entity, in order.
        tokenizer: Accepted for interface compatibility. The joined text is
            the same for every tokenizer variant, so it is not consulted.

    Returns:
        The concatenated entity text (``""`` for an empty token list).
    """
    # Every tokenizer variant joins the same way, so no branching is needed.
    return "".join(tokens)
def labels_to_entities(
    tokens: List[str],
    labels: List[str],
    tokenizer: Optional[AnimeTokenizer] = None,
) -> List[Tuple[str, str]]:
    """
    Group BIO-tagged tokens into ``(entity_type, text)`` spans.

    A ``B-X`` tag always opens a new span. An ``I-X`` tag extends the open
    span when its type matches; otherwise (an orphan or mismatched ``I-X``)
    it opens a new span of type ``X`` so the model's output stays visible
    instead of being dropped. Any other tag closes the open span.
    Tokens and labels are paired with ``zip``, so the shorter list wins.
    """
    spans: List[Tuple[str, str]] = []
    open_type: Optional[str] = None
    open_tokens: List[str] = []

    def close_span() -> None:
        # Emit the span that is currently open, if it has a non-empty type.
        if open_type:
            spans.append((open_type, join_entity_tokens(open_tokens, tokenizer)))

    for tok, tag in zip(tokens, labels):
        begins = tag.startswith("B-")
        inside = (not begins) and tag.startswith("I-")
        kind = tag[2:] if (begins or inside) else None

        if inside and open_type == kind:
            # Continuation of the span that is already open.
            open_tokens.append(tok)
            continue

        close_span()
        if begins or inside:
            open_type, open_tokens = kind, [tok]
        else:
            open_type, open_tokens = None, []

    close_span()
    return spans
def is_allowed_bio_transition(previous_label: str, label: str) -> bool:
    """Tell whether *label* may directly follow *previous_label* under IOB2.

    Only ``I-X`` is restricted: it must come right after ``B-X`` or ``I-X``
    of the same type. Every other label may follow anything.
    """
    if not label.startswith("I-"):
        return True
    span_type = label[2:]
    return previous_label in ("B-" + span_type, "I-" + span_type)
def constrained_bio_decode(emissions: torch.Tensor, id2label: Dict[int, str]) -> List[int]:
    """
    Decode token logits with hard BIO transition constraints.

    This is a Viterbi search with no learned transition weights: a
    transition is either allowed (cost 0) or forbidden (score ``-inf``), as
    decided by ``is_allowed_bio_transition``. The first token may not take an
    ``I-X`` label, so orphan ``I-X`` spans cannot appear in the output.

    Args:
        emissions: 2-D tensor of per-token label scores, unpacked as
            ``(num_tokens, num_labels)``.
        id2label: Maps label id to label string; ids missing from the map
            are treated as ``"O"``.

    Returns:
        One label id per token (an empty list when ``emissions`` is empty).

    Raises:
        ValueError: If ``emissions`` is non-empty but not 2-D (from the
            shape unpacking).
    """
    if emissions.numel() == 0:
        return []

    num_tokens, num_labels = emissions.shape
    scores = emissions.detach().to(device="cpu", dtype=torch.float32)
    neg_inf = float("-inf")

    # Label names and the transition mask depend only on the label set, so
    # build them once instead of once per (token, label, previous-label).
    label_names = [id2label.get(label_id, "O") for label_id in range(num_labels)]
    allowed = torch.tensor(
        [[is_allowed_bio_transition(prev, cur) for cur in label_names] for prev in label_names],
        dtype=torch.bool,
    )
    may_start = torch.tensor(
        [not name.startswith("I-") for name in label_names],
        dtype=torch.bool,
    )

    # dp[label] = best score of any legal path ending in `label`.
    dp = scores[0].masked_fill(~may_start, neg_inf)
    backpointers = torch.zeros((num_tokens, num_labels), dtype=torch.long)

    for idx in range(1, num_tokens):
        # candidates[prev, cur] = score of reaching `cur` at idx via `prev`.
        candidates = dp.unsqueeze(1) + scores[idx].unsqueeze(0)
        candidates = candidates.masked_fill(~allowed, neg_inf)
        # max over the previous-label axis; on ties (including an all -inf
        # column) the first index is reported, i.e. the lowest label id.
        dp, best_prev = candidates.max(dim=0)
        backpointers[idx] = best_prev

    best_last = int(torch.argmax(dp).item())
    decoded = [best_last]
    for idx in range(num_tokens - 1, 0, -1):
        decoded.append(int(backpointers[idx, decoded[-1]].item()))
    decoded.reverse()
    return decoded
+def postprocess(
+    tokens: List[str],
+    labels: List[str],
+    tokenizer: Optional[AnimeTokenizer] = None,
+    filename: Optional[str] = None,
+    use_rules: bool = True,
+) -> Dict:
+    """
+    Convert BIO-labeled tokens into structured metadata.
+    Merges consecutive B- / I- tokens of the same entity type,
+    then extracts structured fields.
+    """
+    result: Dict = {
+        "title": None,
+        "season": None,
+        "episode": None,
+        "group": None,
+        "resolution": None,
+        "source": None,
+        "special": None,
+    }
+    entities = labels_to_entities(tokens, labels, tokenizer)
     # Fill result
     for entity_type, text in entities:
             if (trimmed := trim_decorations(f))
         )
+    if use_rules and filename:
+        result = apply_rule_assists(filename, result)
     return result
# One bracketed segment: [..], (..), 【..】 or 《..》. Exactly one of the four
# capture groups is set per match and holds the text between the brackets.
BRACKET_RE = re.compile(r"\[([^\]]+)\]|\(([^)]+)\)|【([^】]+)】|《([^》]+)》")

# Video resolution: 720p / 1080P, 4K, or WIDTHxHEIGHT (x, X or ×).
RESOLUTION_RE = re.compile(r"\b(?:\d{3,4}[pP]|\d[Kk]|\d{3,4}[xX×]\d{3,4})\b")

# Release source / distribution platform keywords (case-insensitive).
SOURCE_RE = re.compile(
    r"\b(?:WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|DVDRip|DVD|TVRip|HDTV|"
    r"Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X)\b",
    re.IGNORECASE,
)

# Episode number candidates; the number is captured in the named group "ep".
EPISODE_PATTERNS = [
    # Delimited number with optional EP/E/第 prefix, vN suffix and 话/話/集.
    re.compile(
        r"(?:^|[\s._\-\[\(【《#])"
        r"(?:EP?|第)?(?P<ep>\d{1,4})(?:v\d+)?(?:[话話集])?"
        r"(?=$|[\s._\-\]\)】》])",
        re.IGNORECASE,
    ),
    # SxxEyy form.
    re.compile(r"[Ss]\d{1,2}[Ee](?P<ep>\d{1,4})(?:v\d+)?", re.IGNORECASE),
]

# Season marker: S2 (group s1), "Season 2" (s2) or 第二季/期/部 (s3).
SEASON_RE = re.compile(
    r"(?:^|[\s._\-\[\(【《])"
    r"(?:[Ss](?P<s1>\d{1,2})"
    r"|Season\s*(?P<s2>\d{1,2})"
    r"|第(?P<s3>[一二三四五六七八九十\d]+)[季期部])",
    re.IGNORECASE,
)

# A whole string that is only technical metadata (resolution, source, codec,
# audio, subtitle/language or container tag) rather than a title or group.
NOISE_META_RE = re.compile(
    r"^(?:\d{3,4}[pP]|\d[Kk]|WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|DVDRip|DVD|TVRip|"
    r"HDTV|Netflix|NF|AMZN|Baha|CR|HEVC|AVC|AV1|x26[45]|h\.?26[45]|AAC.*|FLAC|MP3|DTS|"
    r"Opus|ASS.*|CHS|CHT|BIG5|GB|JPN?|MP4|MKV|繁中|简中|内封|外挂)$",
    re.IGNORECASE,
)
def cn_number_to_int(text: str) -> Optional[int]:
    """Convert a decimal-digit string or a Chinese numeral (1-99) to an int.

    Accepted forms are decimal digits (``"12"``), a single numeral
    (``"三"``), ``"十"``, ``"十X"`` (11-19), ``"X十"`` (10-90) and ``"X十Y"``
    (11-99), where X and Y are one of 一..九.

    Returns:
        The integer value, or ``None`` when *text* is not one of the accepted
        forms. Malformed mixes such as ``"十十"`` or ``"十一二"`` give
        ``None`` rather than a wrong number.
    """
    # isdecimal() accepts exactly the characters int() can parse; isdigit()
    # also accepts e.g. superscript digits, on which int() raises ValueError.
    if text.isdecimal():
        return int(text)

    units = {"一": 1, "二": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9}
    if text == "十":
        return 10

    if len(text) == 2:
        first, second = text
        if first == "十" and second in units:  # 十X -> 10 + X
            return 10 + units[second]
        if second == "十" and first in units:  # X十 -> X * 10
            return units[first] * 10
        return None

    if len(text) == 3 and text[1] == "十":  # X十Y -> X * 10 + Y
        tens = units.get(text[0])
        ones = units.get(text[2])
        if tens is None or ones is None:
            return None
        return tens * 10 + ones

    return units.get(text)
def bracket_parts(filename: str) -> List[Tuple[str, int, int]]:
    """List every bracketed segment of *filename* in order of appearance.

    Each item is ``(inner_text, start, end)``: the whitespace-stripped text
    between the brackets, and the match offsets in *filename* (brackets
    included) as reported by ``BRACKET_RE``.
    """
    # Exactly one capture group participates in each BRACKET_RE match, so
    # lastindex identifies the group holding the inner text.
    return [
        (found.group(found.lastindex).strip(), found.start(), found.end())
        for found in BRACKET_RE.finditer(filename)
    ]
def looks_like_group(text: str) -> bool:
    """Heuristically decide whether *text* names a release/fansub group.

    Empty text and text that is purely technical metadata (per
    ``NOISE_META_RE``) are rejected; otherwise the text must contain one of
    a fixed list of group-name hints, matched case-insensitively.
    """
    if not text:
        return False
    if NOISE_META_RE.search(text):
        return False

    hint_pattern = (
        r"(?:字幕|字幕组|字幕組|sub|subs|raws?|fansub|studio|house|team|project|"
        r"loli|ani|vcb|airota|kiss|dmhy|erai|subsplease)"
    )
    return re.search(hint_pattern, text, re.I) is not None
def _pick_episode_from_filename(filename: str) -> Optional[int]:
    """Return the best-scoring episode number matched by EPISODE_PATTERNS, or None."""
    best_score: Optional[int] = None
    best_episode: Optional[int] = None
    for pattern in EPISODE_PATTERNS:
        for match in pattern.finditer(filename):
            episode = int(match.group("ep"))
            if episode == 0 or episode > 2000:
                continue
            # Base score is the match offset, so later matches win ties
            # between otherwise equal candidates.
            score = match.start()
            if 1 <= episode <= 200:
                score += 10000
            # A hyphen just before the number ("Title - 05") is a strong cue.
            if "-" in filename[max(0, match.start() - 3):match.start() + 1]:
                score += 1000
            if match.start() > len(filename) // 3:
                score += 200
            # Strict comparison keeps the first candidate among equal scores.
            if best_score is None or score > best_score:
                best_score, best_episode = score, episode
    return best_episode


def apply_rule_assists(filename: str, result: Dict) -> Dict:
    """
    Fill high-confidence structural fields from filename conventions.

    The model remains the primary tagger; rules only fill missing obvious
    fields or repair common boundary drift around leading group brackets and
    episodes. The steps, in order:

    1. group: taken from the first bracket when it starts the filename and
       looks like a group name, if the group is missing or is contained in
       the title.
    2. resolution / source: first regex match, only when missing.
    3. season: first ``SEASON_RE`` match, only when ``season`` is None.
    4. episode: best-scoring ``EPISODE_PATTERNS`` match, only when
       ``episode`` is None.
    5. title: a leading group prefix is stripped; when the title is still
       missing or still starts with the group and an episode is known, the
       title is re-inferred from the filename.

    Args:
        filename: The original filename.
        result: The parsed fields; not modified.

    Returns:
        A shallow copy of *result* with the repairs applied.
    """
    repaired = dict(result)
    brackets = bracket_parts(filename)

    if (not repaired.get("group") or (repaired.get("title") and repaired["group"] in repaired["title"])) and brackets:
        first_text, first_start, _first_end = brackets[0]
        if first_start == 0 and looks_like_group(first_text):
            repaired["group"] = first_text

    if not repaired.get("resolution"):
        match = RESOLUTION_RE.search(filename)
        if match:
            repaired["resolution"] = match.group(0)

    if not repaired.get("source"):
        match = SOURCE_RE.search(filename)
        if match:
            repaired["source"] = match.group(0).replace("_", "-")

    if repaired.get("season") is None:
        match = SEASON_RE.search(filename)
        if match:
            value = next(group for group in match.groups() if group)
            season = cn_number_to_int(value)
            if season is not None:
                repaired["season"] = season

    if repaired.get("episode") is None:
        episode = _pick_episode_from_filename(filename)
        if episode is not None:
            repaired["episode"] = episode

    title = repaired.get("title")
    group = repaired.get("group")
    if title and group and title.startswith(group):
        # Drop the group prefix plus any closing bracket / separator after it;
        # keep the old title if nothing would remain.
        title = title[len(group):].lstrip("]】)>}）》 \t-_.")
        repaired["title"] = title or repaired["title"]

    if (not repaired.get("title") or (group and repaired["title"].startswith(group))) and repaired.get("episode"):
        repaired_title = infer_title_span(filename, group, repaired["episode"])
        if repaired_title:
            repaired["title"] = repaired_title

    return repaired
def infer_title_span(filename: str, group: Optional[str], episode: Optional[int]) -> Optional[str]:
    """Guess the title as the text between the leading group and the episode.

    The span starts after the first bracket when that bracket contains
    *group*, otherwise at the start of the filename. It ends at the first
    episode marker for *episode* (``" - 05"``, ``"[05]"`` or ``"E05"``
    forms, tried in that order). If no marker is found, it ends at the first
    later bracket whose content is technical metadata.

    Args:
        filename: The original filename.
        group: Release group text, if known.
        episode: Episode number, if known. It is stringified and
            regex-escaped, so a non-integer value cannot break the pattern.

    Returns:
        The trimmed title text, or ``None`` when no non-empty span is found.
    """
    start = 0
    if group:
        first = BRACKET_RE.match(filename)
        if first and group in first.group(0):
            start = first.end()

    end = None
    if episode is not None:
        # Escape the interpolated value: unescaped metacharacters would either
        # raise re.error or match text other than the literal episode.
        ep = re.escape(str(episode))
        tail = filename[start:]
        ep_patterns = [
            rf"\s[-_]\s*0*{ep}(?:v\d+)?(?=$|[\s\[\(【《._-])",
            rf"[\[\(【《]0*{ep}(?:v\d+)?[\]\)】》]",
            rf"[Ee]0*{ep}(?:v\d+)?",
        ]
        for pattern in ep_patterns:
            match = re.search(pattern, tail, re.I)
            if match:
                end = start + match.start()
                break

    if end is None:
        # No episode marker: stop at the first metadata bracket after `start`.
        for text, bracket_start, _bracket_end in bracket_parts(filename):
            if bracket_start <= start:
                continue
            if NOISE_META_RE.search(text) or RESOLUTION_RE.search(text) or SOURCE_RE.search(text):
                end = bracket_start
                break

    if end is None or end <= start:
        return None

    title = filename[start:end].strip(" \t-_.[]()【】《》（）")
    return title or None
 def parse_filename(
     filename: str,
     model: BertForTokenClassification,
     tokenizer: AnimeTokenizer,
     id2label: Dict[int, str],
     max_length: int = 64,
+    debug: bool = False,
+    use_rules: bool = True,
+    constrain_bio: bool = True,
 ) -> Dict:
     """
     Parse an anime filename and extract structured metadata.
     # Convert to input IDs
     input_ids = tokenizer.convert_tokens_to_ids(tokens)
+    unk_token_id = tokenizer.unk_token_id
+    unk_tokens = [token for token, token_id in zip(tokens, input_ids) if token_id == unk_token_id]
     # Add special tokens
     input_ids = [tokenizer.cls_token_id] + input_ids + [tokenizer.sep_token_id]
     # Truncate if needed
     if len(input_ids) > max_length:
+        input_ids = [input_ids[0]] + input_ids[1:max_length - 1] + [tokenizer.sep_token_id]
+        attention_mask = [1] * len(input_ids)
     # Pad
     pad_len = max_length - len(input_ids)
     input_tensor = torch.tensor([input_ids], device=device)
     mask_tensor = torch.tensor([attention_mask], device=device)
     # Remove special token predictions
     # Count real tokens used (minus CLS/SEP)
     real_token_count = len(tokens)
                 "group": None, "resolution": None, "source": None,
                 "special": None}
+    with torch.no_grad():
+        logits = model(input_ids=input_tensor, attention_mask=mask_tensor).logits
+    token_logits = logits[0, 1:1 + available, :]
+    probabilities = torch.softmax(token_logits, dim=-1)
+    scores, greedy_predictions = torch.max(probabilities, dim=-1)
+    if constrain_bio:
+        pred_labels = constrained_bio_decode(token_logits, id2label)
+        selected_scores = [
+            probabilities[idx, label_id].detach().cpu().item()
+            for idx, label_id in enumerate(pred_labels)
+        ]
+    else:
+        pred_labels = greedy_predictions.detach().cpu().tolist()
+        selected_scores = scores.detach().cpu().tolist()
     label_strings = [id2label.get(p, "O") for p in pred_labels]
     # Post-process
+    result = postprocess(
+        tokens[:available],
+        label_strings,
+        tokenizer=tokenizer,
+        filename=filename,
+        use_rules=use_rules,
+    )
+    if debug:
+        result["_debug"] = {
+            "tokenizer_variant": getattr(tokenizer, "tokenizer_variant", "regex"),
+            "decoder": "constrained_bio" if constrain_bio else "greedy",
+            "max_length": max_length,
+            "token_count": len(tokens),
+            "available_token_count": available,
+            "truncated": len(tokens) > available,
+            "unk_count": len(unk_tokens),
+            "unk_rate": len(unk_tokens) / len(tokens) if tokens else 0.0,
+            "unk_tokens": unk_tokens[:50],
+            "tokens": tokens[:available],
+            "labels": label_strings,
+            "scores": [round(float(score), 4) for score in selected_scores],
+            "token_table": [
+                {
+                    "i": i,
+                    "token": display_token(token),
+                    "id": int(token_id),
+                    "label": label,
+                    "score": round(float(score), 4),
+                }
+                for i, (token, token_id, label, score) in enumerate(
+                    zip(tokens[:available], input_ids[1:1 + available], label_strings, selected_scores)
+                )
+            ],
+            "entities": [
+                {"type": entity_type, "text": text}
+                for entity_type, text in labels_to_entities(tokens[:available], label_strings, tokenizer)
+            ],
+        }
+    return result
 def main():
                         help="Tokenizer variant override. Defaults to checkpoint metadata")
     parser.add_argument("--max-length", type=int, default=64,
                         help="Maximum sequence length")
+    parser.add_argument("--debug", action="store_true",
+                        help="Include tokenizer, labels, scores, and entity spans in JSON output")
+    parser.add_argument("--no-rule-assist", action="store_true",
+                        help="Disable high-confidence structural post-processing rules")
+    parser.add_argument("--no-constrained-bio", action="store_true",
+                        help="Use greedy per-token decoding instead of constrained BIO Viterbi")
     args = parser.parse_args()
     # Load config
     model = BertForTokenClassification.from_pretrained(args.model_dir)
     model.eval()
+    id2label = {int(k): v for k, v in getattr(model.config, "id2label", cfg.id2label).items()}
+    max_length = args.max_length
+    if max_length == 64:
+        max_length = int(getattr(model.config, "max_seq_length", max_length))
     # Process filenames
     filenames_to_parse: List[str] = []
     for fn in filenames_to_parse:
         if not fn.strip():
             continue
+        result = parse_filename(
+            fn,
+            model,
+            tokenizer,
+            id2label,
+            max_length,
+            debug=args.debug,
+            use_rules=not args.no_rule_assist,
+            constrain_bio=not args.no_constrained_bio,
+        )
         result["_input"] = fn
         results.append(result)