File size: 33,567 Bytes

"""Diagnostics for the anime filename NER pipeline.

The checks focus on structured filename parsing failure modes:

- train/inference tokenizer mismatch
- BIO legality and boundary drift
- tokenizer split and vocabulary coverage
- label/entity distribution
- optional model confusion on a sampled validation split
"""

from __future__ import annotations

import argparse
import json
import math
import os
import random
import re
from collections import Counter, defaultdict
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple

import numpy as np
import torch
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
from transformers import BertForTokenClassification

from anifilebert.config import Config
from anifilebert.dataset import labels_for_tokenizer
from anifilebert.inference import constrained_bio_decode, postprocess
from anifilebert.tokenizer import AnimeTokenizer, create_tokenizer, load_tokenizer


def iter_jsonl(path: Path, limit: Optional[int] = None) -> Iterable[dict]:
    """Yield the JSON value of every non-blank line in a JSON Lines file.

    Args:
        path: File to read as UTF-8 text, one JSON document per line.
        limit: Maximum number of records to yield; ``None`` means no cap.
            Blank lines are skipped and do not count toward the limit, so
            ``limit=N`` yields N records whenever the file holds that many.

    Yields:
        The decoded value of each non-blank line, in file order.

    Raises:
        ValueError: If a non-blank line is not valid JSON. The message names
            the file and the 1-based line number; the decoder error is chained.
    """
    yielded = 0
    with path.open("r", encoding="utf-8") as handle:
        for line_no, line in enumerate(handle, 1):
            # Check the budget before parsing so a bad line beyond the limit
            # can never abort a capped read.
            if limit is not None and yielded >= limit:
                break
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as exc:
                raise ValueError(f"{path}:{line_no}: invalid JSON") from exc
            yielded += 1
            yield record


def detect_dataset_variant(samples: List[dict], vocab_file: Optional[str]) -> str:
    """Guess which tokenizer variant produced the dataset tokens.

    Resolution order:

    1. Truthy ``tokenizer_variant`` values declared on the samples: a single
       distinct value is returned as-is, several distinct values give
       ``"mixed"``.
    2. A vocab file whose base name contains ``.char`` (case-insensitive)
       gives ``"char"``.
    3. Among samples that carry a ``filename``, if at least 95% have
       ``tokens`` equal to the filename split into characters, ``"char"``.
    4. Otherwise ``"regex"``.
    """
    declared = set()
    for sample in samples:
        variant = sample.get("tokenizer_variant")
        if variant:
            declared.add(variant)
    if len(declared) > 1:
        return "mixed"
    if declared:
        return declared.pop()

    if vocab_file and ".char" in os.path.basename(vocab_file).lower():
        return "char"

    # Heuristic fallback: compare stored tokens with a per-character split.
    named = [sample for sample in samples if sample.get("filename") is not None]
    if not named:
        return "regex"
    char_split = sum(
        1 for sample in named if sample.get("tokens") == list(sample["filename"])
    )
    return "char" if char_split / len(named) >= 0.95 else "regex"


def entity_type(label: str) -> Optional[str]:
    """Return the text after the first hyphen of ``label``, or ``None`` if it has no hyphen."""
    _prefix, hyphen, suffix = label.partition("-")
    return suffix if hyphen else None


def bio_violations(tokens: List[str], labels: List[str]) -> List[dict]:
    """Report labels that break the BIO tagging scheme.

    Two kinds of violation are produced:

    - ``ORPHAN_I``: an ``I-X`` label that does not continue an open ``X``
      entity (sequence start, after ``O``, after another entity type, or
      after a label that is not valid BIO at all).
    - ``UNKNOWN_LABEL``: a label that is neither ``O`` nor ``B-``/``I-``
      prefixed.

    Args:
        tokens: Tokens aligned with ``labels``; may be shorter, in which case
            the reported token is ``None``.
        labels: Label sequence to validate.

    Returns:
        One dict per violation with keys ``type``, ``index``, ``prev_label``,
        ``label`` and ``token``, in sequence order.
    """
    violations: List[dict] = []
    previous_label = "O"
    # Entity type that an I- label may legally continue. An unknown label
    # closes it, so "S-X", "I-X" is flagged instead of silently accepted.
    open_entity: Optional[str] = None

    for idx, label in enumerate(labels):
        kind: Optional[str] = None
        if label == "O":
            open_entity = None
        elif label.startswith("B-"):
            open_entity = label[2:]
        elif label.startswith("I-"):
            continued = label[2:]
            if open_entity != continued:
                kind = "ORPHAN_I"
            # Treat the orphan as opening the entity so that a run of I-X
            # labels is reported once, at its first token.
            open_entity = continued
        else:
            kind = "UNKNOWN_LABEL"
            open_entity = None

        if kind is not None:
            violations.append(
                {
                    "type": kind,
                    "index": idx,
                    "prev_label": previous_label,
                    "label": label,
                    "token": tokens[idx] if idx < len(tokens) else None,
                }
            )
        previous_label = label

    return violations


def bio_boundary_warnings(tokens: List[str], labels: List[str]) -> List[dict]:
    """Flag BIO-legal but suspicious boundaries, kept apart from real BIO errors.

    The only pattern checked is a ``B-*`` label immediately followed by ``O``.
    Each warning is reported as ``SINGLE_TOKEN_ENTITY`` and is anchored at the
    position of the ``O`` label: ``index``/``token`` refer to that position,
    ``prev_label`` is the ``B-*`` label before it.
    """
    return [
        {
            "type": "SINGLE_TOKEN_ENTITY",
            "index": pos,
            "prev_label": before,
            "label": current,
            "token": tokens[pos] if pos < len(tokens) else None,
        }
        # Pair every label with its predecessor; positions start at 1.
        for pos, (before, current) in enumerate(zip(labels, labels[1:]), 1)
        if current == "O" and before.startswith("B-")
    ]


def spans_from_labels(tokens: List[str], labels: List[str]) -> List[dict]:
    """Group BIO labels into entity spans.

    A ``B-X`` label always opens a new span. An ``I-X`` label extends the open
    span when its type matches; otherwise it closes the open span (if any) and
    opens a new one, so orphan ``I-`` labels still produce spans. Any other
    label closes the open span.

    Args:
        tokens: Tokens aligned with ``labels``.
        labels: BIO labels. Only positions present in both sequences are
            considered.

    Returns:
        Dicts with ``type``, ``start`` (inclusive token index), ``end``
        (exclusive token index) and ``text`` (the span's tokens concatenated
        with no separator), in order of appearance.
    """
    spans: List[dict] = []
    start: Optional[int] = None
    current_type: Optional[str] = None
    current_tokens: List[str] = []

    def close_span(end: int) -> None:
        """Record the currently open span, if there is one, ending before ``end``."""
        if current_type is not None and start is not None:
            spans.append(
                {
                    "type": current_type,
                    "start": start,
                    "end": end,
                    "text": "".join(current_tokens),
                }
            )

    consumed = 0
    for idx, (token, label) in enumerate(zip(tokens, labels)):
        consumed = idx + 1
        # For "B-"/"I-" labels the entity type is everything after the prefix.
        prefix, entity = label[:2], label[2:]
        if prefix == "I-" and entity == current_type:
            current_tokens.append(token)
            continue
        close_span(idx)
        if prefix in ("B-", "I-"):
            current_type, start, current_tokens = entity, idx, [token]
        else:
            current_type, start, current_tokens = None, None, []

    # Close the trailing span at the last position that was actually paired,
    # not at len(labels): the two differ when tokens is the shorter sequence.
    close_span(consumed)
    return spans


def count_entities(samples: List[dict]) -> Counter:
    """Count entity spans per entity type over all samples.

    Every sample must provide ``tokens`` and ``labels``; spans are derived
    with ``spans_from_labels``.
    """
    return Counter(
        span["type"]
        for sample in samples
        for span in spans_from_labels(sample["tokens"], sample["labels"])
    )


def percentile(values: List[int], pct: float) -> int:
    """Return the element at the rounded ``pct`` rank of the sorted values.

    The rank is ``round(pct / 100 * (n - 1))`` using Python's built-in
    ``round`` and is clamped to the last index. An empty input yields ``0``.
    """
    if values:
        ranked = sorted(values)
        last = len(ranked) - 1
        position = round((pct / 100) * last)
        return ranked[last if position > last else position]
    return 0


def token_mismatch(sample: dict, tokenizer: AnimeTokenizer) -> Optional[dict]:
    """Describe how the tokenizer's split of a filename differs from the dataset's.

    Returns ``None`` when the sample has no ``filename`` or when
    ``tokenizer.tokenize(filename)`` equals the stored ``tokens``. Otherwise
    returns a dict holding the sample identity, the length of the shared
    leading run (``common_prefix``), the first 40 tokens of each side, and
    both full lengths.
    """
    filename = sample.get("filename")
    if filename is None:
        return None

    produced = tokenizer.tokenize(filename)
    stored = sample.get("tokens", [])
    if produced == stored:
        return None

    # Index of the first disagreeing position; if one side is a pure prefix
    # of the other, the shared run is the shorter length.
    shared = next(
        (
            pos
            for pos, (ours, theirs) in enumerate(zip(produced, stored))
            if ours != theirs
        ),
        min(len(produced), len(stored)),
    )
    return {
        "file_id": sample.get("file_id"),
        "filename": filename,
        "common_prefix": shared,
        "dataset_tokens": stored[:40],
        "tokenizer_tokens": produced[:40],
        "dataset_len": len(stored),
        "tokenizer_len": len(produced),
    }


def format_counter(counter: Counter, total: Optional[int] = None, limit: Optional[int] = None) -> str:
    """Render a counter as a Markdown bullet list with counts and percentages.

    Args:
        counter: Counts to render, most common first.
        total: Denominator for the percentages; defaults to the sum of all
            counts. A zero denominator renders every share as 0.00%.
        limit: Maximum number of entries to list; ``None`` lists all.

    Returns:
        One ``- `key`: count (pct%)`` line per entry, or ``"- none"`` when
        there is nothing to list.
    """
    denominator = sum(counter.values()) if total is None else total

    def share(count: int) -> float:
        return count / denominator * 100 if denominator else 0.0

    lines = [
        f"- `{key}`: {count:,} ({share(count):.2f}%)"
        for key, count in counter.most_common(limit)
    ]
    if not lines:
        return "- none"
    return "\n".join(lines)


def token_id_stats(samples: List[dict], tokenizer: AnimeTokenizer) -> dict:
    """Measure how many tokens map to the tokenizer's unknown-token id.

    Each sample is passed through ``labels_for_tokenizer`` and its tokens are
    converted to ids with the given tokenizer.

    Returns:
        A dict with ``total`` (tokens seen), ``unk`` (tokens whose id equals
        ``tokenizer.unk_token_id``), ``unk_rate`` (``unk / total``, 0.0 when
        no tokens were seen) and ``top_unk`` (the 25 most frequent unknown
        tokens with their counts).
    """
    seen = 0
    unknown: Counter = Counter()
    for sample in samples:
        tokens, _ = labels_for_tokenizer(sample, tokenizer)
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        for token, token_id in zip(tokens, token_ids):
            seen += 1
            if token_id == tokenizer.unk_token_id:
                unknown[token] += 1

    # Every unknown occurrence was tallied exactly once in the counter.
    unknown_total = sum(unknown.values())
    return {
        "total": seen,
        "unk": unknown_total,
        "unk_rate": unknown_total / seen if seen else 0.0,
        "top_unk": unknown.most_common(25),
    }


def prepare_inputs(
    sample: dict,
    tokenizer: AnimeTokenizer,
    label2id: Dict[str, int],
    max_length: int,
) -> Tuple[List[int], List[int], List[int], List[str]]:
    """Encode one sample into fixed-length model inputs.

    The sample is aligned with ``labels_for_tokenizer``, wrapped in the
    tokenizer's CLS/SEP ids, cut down when longer than ``max_length`` (the
    first and last positions are kept and the interior is shortened), and
    finally padded up to ``max_length``.

    Returns:
        ``(input_ids, attention_mask, label_ids, tokens)``. Special and padded
        positions carry label id ``-100``; labels missing from ``label2id``
        map to ``0``. ``tokens`` is the aligned token list and is NOT
        truncated.
    """
    ignore_index = -100
    tokens, labels = labels_for_tokenizer(sample, tokenizer)

    body_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = [tokenizer.cls_token_id] + body_ids + [tokenizer.sep_token_id]

    label_ids = [ignore_index]
    label_ids.extend(label2id.get(label, 0) for label in labels)
    label_ids.append(ignore_index)

    if len(input_ids) > max_length:
        # Keep the leading and trailing special positions, shorten the middle.
        interior = slice(1, max_length - 1)
        input_ids = [input_ids[0], *input_ids[interior], input_ids[-1]]
        label_ids = [label_ids[0], *label_ids[interior], label_ids[-1]]

    attention_mask = [1] * len(input_ids)

    shortfall = max_length - len(input_ids)
    if shortfall > 0:
        input_ids.extend([tokenizer.pad_token_id] * shortfall)
        label_ids.extend([ignore_index] * shortfall)
        attention_mask.extend([0] * shortfall)

    return input_ids, attention_mask, label_ids, tokens


def normalize_field_value(field: str, value) -> Optional[str]:
    """Normalize a parsed field value so gold and predicted parses compare fairly.

    Args:
        field: Field name; selects the normalization rule.
        value: Raw value from a parse result, or ``None`` when absent.

    Returns:
        ``None`` for a missing value, otherwise a canonical string:

        - ``episode`` / ``season``: the integer rendering when the value is
          integral (``"01"`` -> ``"1"``); otherwise the stripped, lower-cased
          text.
        - ``resolution`` / ``source``: stripped, lower-cased, with ``_``
          replaced by ``-``.
        - anything else: stripped, lower-cased, whitespace runs collapsed to a
          single space.
    """
    if value is None:
        return None
    if field in {"episode", "season"}:
        # int() would truncate a fractional float (12.5 -> 12) and make two
        # different episodes compare equal; it also raises OverflowError on
        # infinities. Keep such values as text instead.
        if isinstance(value, float) and not value.is_integer():
            return str(value).strip().lower()
        try:
            return str(int(value))
        except (TypeError, ValueError):
            return str(value).strip().lower()
    text = str(value).strip()
    if field in {"resolution", "source"}:
        return text.lower().replace("_", "-")
    return re.sub(r"\s+", " ", text).strip().lower()


def update_parse_metrics(counter: Counter, gold: dict, pred: dict) -> None:
    """Accumulate per-field and whole-parse exact-match counts into ``counter``.

    For every tracked field the normalized gold and predicted values are
    compared. ``"<field>_total"`` is always incremented; a match increments
    ``"<field>_correct"``, a mismatch increments the tuple key
    ``(field, gold_value, pred_value)``. ``"full_match_total"`` is always
    incremented and ``"full_match_correct"`` only when every field matched.
    """
    mismatches = 0
    for field in ("group", "title", "season", "episode", "resolution", "source", "special"):
        expected = normalize_field_value(field, gold.get(field))
        actual = normalize_field_value(field, pred.get(field))
        if expected != actual:
            mismatches += 1
            counter[(field, expected, actual)] += 1
        else:
            counter[f"{field}_correct"] += 1
        counter[f"{field}_total"] += 1

    if mismatches == 0:
        counter["full_match_correct"] += 1
    counter["full_match_total"] += 1


def collect_field_failures(gold: dict, pred: dict) -> Dict[str, Dict[str, Optional[str]]]:
    """Return the fields whose normalized gold and predicted values differ.

    The result maps each mismatching field name to
    ``{"gold": <normalized gold>, "pred": <normalized pred>}``; matching
    fields are omitted.
    """
    failures: Dict[str, Dict[str, Optional[str]]] = {}
    for field in ("group", "title", "season", "episode", "resolution", "source", "special"):
        expected = normalize_field_value(field, gold.get(field))
        actual = normalize_field_value(field, pred.get(field))
        if expected != actual:
            failures[field] = {"gold": expected, "pred": actual}
    return failures


def evaluate_model(
    samples: List[dict],
    model_dir: Path,
    tokenizer: AnimeTokenizer,
    max_length: int,
    limit: int,
    seed: int,
) -> dict:
    """Run a saved token-classification model on a random subset of samples.

    Args:
        samples: Dataset rows; a shuffled copy is used, the input list is not
            modified.
        model_dir: Directory passed to ``BertForTokenClassification.from_pretrained``.
        tokenizer: Tokenizer used to align and encode each sample.
        max_length: Fixed sequence length handed to ``prepare_inputs``.
        limit: Number of shuffled samples to evaluate.
        seed: Seed for the shuffle, so the evaluated subset is reproducible.

    Returns:
        A dict with ``sample_count``, seqeval ``precision``/``recall``/``f1``
        and ``classification_report``, the 30 most common off-diagonal
        token-label and entity-type confusions, ``boundary_errors`` and
        ``parse_metrics`` counters, and up to 30 ``field_failures`` examples.
    """
    cfg = Config()
    model = BertForTokenClassification.from_pretrained(str(model_dir))
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Seeded shuffle of a copy, then keep the first `limit` rows.
    rng = random.Random(seed)
    eval_samples = list(samples)
    rng.shuffle(eval_samples)
    eval_samples = eval_samples[:limit]

    # Prefer the label map stored on the model config; fall back to the
    # project Config when the attribute is missing or the map is empty.
    id2label = {int(k): v for k, v in getattr(model.config, "id2label", cfg.id2label).items()}
    label2id = {v: int(k) for k, v in id2label.items()}
    if not label2id:
        label2id = cfg.label2id
        id2label = cfg.id2label

    true_sequences: List[List[str]] = []
    pred_sequences: List[List[str]] = []
    confusion: Counter = Counter()
    entity_confusion: Counter = Counter()
    boundary_errors: Counter = Counter()
    parse_metrics: Counter = Counter()
    field_failures: List[dict] = []

    with torch.no_grad():
        # Samples are evaluated one at a time (batch dimension of 1).
        for sample in eval_samples:
            input_ids, attention_mask, label_ids, sample_tokens = prepare_inputs(
                sample,
                tokenizer,
                label2id,
                max_length,
            )
            input_tensor = torch.tensor([input_ids], dtype=torch.long, device=device)
            mask_tensor = torch.tensor([attention_mask], dtype=torch.long, device=device)
            logits = model(input_ids=input_tensor, attention_mask=mask_tensor).logits
            # Positions labelled -100 (special tokens and padding) are not scored.
            active_count = sum(1 for label_id in label_ids if label_id != -100)
            # Skip row 0 (the CLS slot added by prepare_inputs) and decode only
            # the scored positions.
            # NOTE(review): presumably constrained_bio_decode returns one
            # predicted label id per logits row — confirm in anifilebert.inference.
            pred_ids = constrained_bio_decode(logits[0, 1:1 + active_count, :], id2label)

            true_labels: List[str] = []
            pred_labels: List[str] = []
            pred_idx = 0
            for label_id in label_ids:
                if label_id == -100:
                    continue
                pred_id = pred_ids[pred_idx]
                pred_idx += 1
                # Ids missing from the label map are treated as "O".
                true_label = id2label.get(label_id, "O")
                pred_label = id2label.get(pred_id, "O")
                true_labels.append(true_label)
                pred_labels.append(pred_label)
                confusion[(true_label, pred_label)] += 1
                entity_confusion[(entity_type(true_label) or "O", entity_type(pred_label) or "O")] += 1
                if true_label != pred_label:
                    # Bucket each token error: a B- tag on either side counts
                    # as a boundary error, then a differing entity type, and
                    # whatever remains is a prefix-only disagreement.
                    if true_label.startswith("B-") or pred_label.startswith("B-"):
                        boundary_errors["B-boundary"] += 1
                    elif entity_type(true_label) != entity_type(pred_label):
                        boundary_errors["entity-type"] += 1
                    else:
                        boundary_errors["BIO-prefix"] += 1
            true_sequences.append(true_labels)
            pred_sequences.append(pred_labels)
            # prepare_inputs does not truncate the token list, so cut it to the
            # number of positions that were actually scored.
            active_tokens = sample_tokens[:len(true_labels)]
            # Run the same post-processing on gold and predicted labels so the
            # field comparison isolates the labelling differences.
            gold_parse = postprocess(
                active_tokens,
                true_labels,
                tokenizer=tokenizer,
            )
            pred_parse = postprocess(
                active_tokens,
                pred_labels,
                tokenizer=tokenizer,
            )
            update_parse_metrics(parse_metrics, gold_parse, pred_parse)
            failures = collect_field_failures(gold_parse, pred_parse)
            # Keep at most 30 failure examples for the report.
            if failures and len(field_failures) < 30:
                field_failures.append(
                    {
                        "filename": sample.get("filename"),
                        "errors": failures,
                        "gold": gold_parse,
                        "pred": pred_parse,
                    }
                )

    # Drop the diagonal (correct predictions) so only confusions remain.
    errors = confusion.copy()
    for label in set(label for pair in confusion for label in pair):
        errors.pop((label, label), None)

    return {
        "sample_count": len(eval_samples),
        "precision": precision_score(true_sequences, pred_sequences),
        "recall": recall_score(true_sequences, pred_sequences),
        "f1": f1_score(true_sequences, pred_sequences),
        "classification_report": classification_report(true_sequences, pred_sequences, digits=4),
        "top_token_confusions": errors.most_common(30),
        "top_entity_confusions": Counter(
            {k: v for k, v in entity_confusion.items() if k[0] != k[1]}
        ).most_common(30),
        "boundary_errors": boundary_errors,
        "parse_metrics": parse_metrics,
        "field_failures": field_failures,
    }


def tokenizer_split_examples(samples: List[dict], tokenizers: Dict[str, AnimeTokenizer], limit: int = 8) -> List[dict]:
    """Collect side-by-side tokenizations for the first samples that have a filename.

    Each returned row holds ``file_id``, ``filename``, the first 80 dataset
    tokens, and one ``"<name>_tokens"`` entry (first 80 tokens) per tokenizer
    in ``tokenizers``. Samples with a missing or empty filename are skipped.
    Collection stops once ``limit`` rows have been gathered; the check runs
    after a row is added.
    """
    collected: List[dict] = []
    for sample in samples:
        filename = sample.get("filename")
        if filename:
            entry = {
                "file_id": sample.get("file_id"),
                "filename": filename,
                "dataset_tokens": sample.get("tokens", [])[:80],
            }
            entry.update(
                {
                    f"{name}_tokens": candidate.tokenize(filename)[:80]
                    for name, candidate in tokenizers.items()
                }
            )
            collected.append(entry)
            if len(collected) >= limit:
                return collected
    return collected


def write_report(path: Path, title: str, sections: List[Tuple[str, str]]) -> None:
    """Write a Markdown report: a level-1 title followed by level-2 sections.

    Each ``(heading, body)`` pair becomes a ``## heading`` block. Bodies are
    stripped of surrounding whitespace; a body that is empty after stripping
    is replaced by ``_No data._``. The file is written as UTF-8.
    """
    lines = [f"# {title}", ""]
    for heading, body in sections:
        lines.extend([f"## {heading}", "", body.strip() or "_No data._", ""])
    path.write_text("\n".join(lines), encoding="utf-8")


def markdown_json(value) -> str:
    """Render ``value`` as pretty-printed JSON inside a fenced ``json`` code block.

    Non-ASCII characters are kept verbatim and the JSON is indented by two spaces.
    """
    rendered = json.dumps(value, ensure_ascii=False, indent=2)
    return f"```json\n{rendered}\n```"


def markdown_table(headers: List[str], rows: List[List[str]], limit: Optional[int] = None) -> str:
    """Render headers and rows as a Markdown pipe table.

    Args:
        headers: Column titles.
        rows: Table rows; cells are converted with ``str()``.
        limit: Maximum number of rows to render; ``None`` renders all.

    Returns:
        The table text: header line, ``---`` separator line, then one line per
        row. Newlines inside a cell become spaces and ``|`` is escaped as
        ``\\|`` so free-form values (titles, group names) cannot split a cell.
    """

    def cell_text(cell) -> str:
        """Make a value safe to place inside a single table cell."""
        return str(cell).replace("\n", " ").replace("|", "\\|")

    def table_line(cells) -> str:
        return "| " + " | ".join(cells) + " |"

    if limit is not None:
        rows = rows[:limit]
    table = [
        table_line(cell_text(header) for header in headers),
        table_line("---" for _ in headers),
    ]
    table.extend(table_line(cell_text(cell) for cell in row) for row in rows)
    return "\n".join(table)


def _length_summary(values: List[int]) -> Dict[str, int]:
    """Summarize sequence lengths; every statistic is 0 when ``values`` is empty."""
    return {
        "min": min(values, default=0),
        "p50": percentile(values, 50),
        "p90": percentile(values, 90),
        "p95": percentile(values, 95),
        "p99": percentile(values, 99),
        "max": max(values, default=0),
    }


def _parse_metric_tables(metrics: Counter) -> Tuple[List[List[str]], str, List[List[str]]]:
    """Turn the parse-metrics counter into report rows.

    Returns the per-field accuracy rows, the full-parse exact-match summary
    text, and the 30 most frequent ``(field, gold, pred)`` mismatch rows.
    """
    field_rows = []
    for field in ["group", "title", "season", "episode", "resolution", "source", "special"]:
        total = metrics.get(f"{field}_total", 0)
        correct = metrics.get(f"{field}_correct", 0)
        acc = correct / total if total else 0.0
        field_rows.append([field, f"{correct:,}/{total:,}", f"{acc:.4f}"])
    full_total = metrics.get("full_match_total", 0)
    full_correct = metrics.get("full_match_correct", 0)
    full_acc = full_correct / full_total if full_total else 0.0
    full_line = f"{full_correct:,}/{full_total:,} ({full_acc:.4f})"
    # Mismatches are stored under tuple keys; the string keys are the totals.
    mismatch_counts = Counter(
        {key: count for key, count in metrics.items() if isinstance(key, tuple)}
    )
    error_rows = [
        [field, str(gold), str(pred), f"{count:,}"]
        for (field, gold, pred), count in mismatch_counts.most_common(30)
    ]
    return field_rows, full_line, error_rows


def main() -> None:
    """Run the diagnostics CLI and write a Markdown report.

    Loads up to ``--sample-limit`` rows from ``--data-file``, picks the
    tokenizer variant (explicit flag, else dataset metadata / heuristics),
    collects label, length, BIO, boundary-drift and vocabulary statistics,
    optionally evaluates the model in ``--model-dir`` on a sampled subset, and
    writes the assembled sections to ``--output``.

    Raises:
        ValueError: If no samples could be loaded from the data file.
    """
    parser = argparse.ArgumentParser(description="Diagnose anime filename NER data and model pipeline")
    parser.add_argument("--data-file", required=True, help="JSONL dataset with tokens and labels")
    parser.add_argument("--vocab-file", default=None, help="Tokenizer vocab JSON")
    parser.add_argument("--tokenizer", choices=["regex", "char"], default=None,
                        help="Tokenizer variant to diagnose. Defaults to dataset metadata")
    parser.add_argument("--model-dir", default=None, help="Optional model directory for confusion analysis")
    parser.add_argument("--max-length", type=int, default=None, help="Max sequence length for model eval/truncation stats")
    parser.add_argument("--sample-limit", type=int, default=20000, help="Rows to inspect for data diagnostics")
    parser.add_argument("--eval-limit", type=int, default=512, help="Rows to evaluate when --model-dir is provided")
    parser.add_argument("--output", default="diagnostics_report.md", help="Markdown report path")
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()

    data_path = Path(args.data_file)
    samples = list(iter_jsonl(data_path, args.sample_limit))
    if not samples:
        raise ValueError(f"No samples loaded from {data_path}")

    # Tokenizer selection: an explicit --tokenizer wins; a "mixed" dataset
    # falls back to the regex tokenizer.
    dataset_variant = detect_dataset_variant(samples, args.vocab_file)
    tokenizer_variant = args.tokenizer or (dataset_variant if dataset_variant != "mixed" else "regex")
    vocab_file = args.vocab_file
    if vocab_file is None:
        # Default vocab files are expected next to the dataset.
        vocab_file = str(data_path.with_name("vocab.char.json" if tokenizer_variant == "char" else "vocab.json"))
    tokenizer = create_tokenizer(tokenizer_variant, vocab_file=vocab_file)

    # Model evaluation uses the tokenizer saved with the checkpoint, which may
    # differ from the one being diagnosed.
    if args.model_dir:
        model_tokenizer = load_tokenizer(args.model_dir)
    else:
        model_tokenizer = tokenizer

    label_counter: Counter = Counter()
    length_values: List[int] = []
    aligned_length_values: List[int] = []
    violations: List[dict] = []
    boundary_warnings: List[dict] = []
    mismatch_examples: List[dict] = []
    space_label_counter: Counter = Counter()
    boundary_drift_counter: Counter = Counter()
    truncation_count = 0

    # Max length: CLI flag, else the checkpoint config's max_seq_length
    # (default 64), else a per-variant default.
    max_length = args.max_length
    if max_length is None and args.model_dir:
        model_config = BertForTokenClassification.from_pretrained(args.model_dir).config
        max_length = int(getattr(model_config, "max_seq_length", 64))
    max_length = max_length or (128 if tokenizer_variant == "char" else 64)

    # Release/quality metadata that should never end up inside a TITLE span.
    meta_pattern = re.compile(r"\b(?:WEB[-_ ]?DL|WebRip|\d{3,4}[pP]|HEVC|AVC|AAC)\b", re.I)

    for row_idx, sample in enumerate(samples, 1):
        tokens = sample.get("tokens", [])
        labels = sample.get("labels", [])
        if len(tokens) != len(labels):
            # Misaligned rows are reported and excluded from all other stats.
            violations.append(
                {
                    "type": "LENGTH_MISMATCH",
                    "row": row_idx,
                    "file_id": sample.get("file_id"),
                    "token_count": len(tokens),
                    "label_count": len(labels),
                    "filename": sample.get("filename"),
                }
            )
            continue

        label_counter.update(labels)
        length_values.append(len(tokens))
        aligned_tokens, aligned_labels = labels_for_tokenizer(sample, tokenizer)
        aligned_length_values.append(len(aligned_tokens))
        # +2 accounts for the CLS and SEP positions added at encoding time.
        if len(aligned_tokens) + 2 > max_length:
            truncation_count += 1
        for token, label in zip(tokens, labels):
            if token.isspace():
                space_label_counter[label] += 1
        for violation in bio_violations(tokens, labels):
            violation.update(
                {
                    "row": row_idx,
                    "file_id": sample.get("file_id"),
                    "filename": sample.get("filename"),
                    "context_tokens": tokens[max(0, violation["index"] - 5):violation["index"] + 6],
                    "context_labels": labels[max(0, violation["index"] - 5):violation["index"] + 6],
                }
            )
            violations.append(violation)
        for warning in bio_boundary_warnings(tokens, labels):
            warning.update(
                {
                    "row": row_idx,
                    "file_id": sample.get("file_id"),
                    "filename": sample.get("filename"),
                    "context_tokens": tokens[max(0, warning["index"] - 5):warning["index"] + 6],
                    "context_labels": labels[max(0, warning["index"] - 5):warning["index"] + 6],
                }
            )
            boundary_warnings.append(warning)
        for span in spans_from_labels(tokens, labels):
            text = span["text"]
            if span["type"] == "TITLE":
                if text.startswith("[") or text.endswith("[") or "]" in text[:3]:
                    boundary_drift_counter["title_contains_bracket_edge"] += 1
                if meta_pattern.search(text):
                    boundary_drift_counter["title_contains_meta"] += 1
            if span["type"] == "GROUP" and ("[" in text or "]" in text):
                boundary_drift_counter["group_contains_bracket"] += 1

        # Keep at most 10 tokenizer-vs-dataset mismatch examples.
        if len(mismatch_examples) < 10:
            mismatch = token_mismatch(sample, tokenizer)
            if mismatch:
                mismatch_examples.append(mismatch)

    entity_counter = count_entities(samples)
    id_stats = token_id_stats(samples, tokenizer)
    split_examples = tokenizer_split_examples(
        samples,
        {
            "diagnosed": tokenizer,
            "regex": create_tokenizer("regex", vocab_file=str(data_path.with_name("vocab.json"))),
            "char": create_tokenizer("char", vocab_file=str(data_path.with_name("vocab.char.json"))),
        },
    )

    model_eval = None
    if args.model_dir:
        model_eval = evaluate_model(
            samples=samples,
            model_dir=Path(args.model_dir),
            tokenizer=model_tokenizer,
            max_length=max_length,
            limit=args.eval_limit,
            seed=args.seed,
        )

    total_labels = sum(label_counter.values())
    o_count = label_counter.get("O", 0)
    sections: List[Tuple[str, str]] = []

    sections.append(
        (
            "Executive Summary",
            "\n".join(
                [
                    f"- Dataset: `{data_path}`",
                    f"- Inspected rows: {len(samples):,}",
                    f"- Dataset tokenizer variant: `{dataset_variant}`",
                    f"- Diagnosed tokenizer variant: `{tokenizer_variant}`",
                    f"- Vocab: `{vocab_file}` ({tokenizer.vocab_size:,} tokens)",
                    f"- Max sequence length checked: {max_length}",
                    f"- O-label ratio: {o_count / total_labels * 100:.2f}%" if total_labels else "- O-label ratio: n/a",
                    f"- Truncation risk: {truncation_count:,}/{len(samples):,} rows ({truncation_count / len(samples) * 100:.2f}%)",
                    f"- UNK rate after selected tokenizer: {id_stats['unk_rate'] * 100:.4f}%",
                    # This count covers hard violations (including length
                    # mismatches); boundary warnings are reported separately.
                    f"- BIO violations collected: {len(violations):,}",
                    "",
                    "Primary finding: this task is structural filename parsing. Tokenizer/preprocessing identity is more important than lowering token loss.",
                ]
            ),
        )
    )

    sections.append(
        (
            "Label And Entity Statistics",
            "\n".join(
                [
                    "### Label distribution",
                    format_counter(label_counter, total_labels),
                    "",
                    "### Entity count",
                    format_counter(entity_counter),
                    "",
                    "### Length distribution",
                    # The summaries tolerate empty lists, which happens when
                    # every inspected row was a LENGTH_MISMATCH.
                    markdown_json(
                        {
                            "raw_tokens": _length_summary(length_values),
                            "aligned_tokens": _length_summary(aligned_length_values),
                        }
                    ),
                    "",
                    "### Whitespace labels",
                    format_counter(space_label_counter),
                ]
            ),
        )
    )

    violation_counter = Counter(v["type"] for v in violations)
    warning_counter = Counter(w["type"] for w in boundary_warnings)
    sections.append(
        (
            "BIO Violations And Boundary Drift",
            "\n".join(
                [
                    "### True BIO violation counts",
                    format_counter(violation_counter),
                    "",
                    "### Legal boundary warning counts",
                    format_counter(warning_counter),
                    "",
                    "### Boundary drift heuristics",
                    format_counter(boundary_drift_counter),
                    "",
                    "### Sample violations",
                    markdown_json(violations[:30]),
                    "",
                    "### Sample boundary warnings",
                    markdown_json(boundary_warnings[:30]),
                ]
            ),
        )
    )

    sections.append(
        (
            "Tokenizer Split And Alignment",
            "\n".join(
                [
                    "### Dataset tokens vs selected tokenizer mismatches",
                    markdown_json(mismatch_examples),
                    "",
                    "### Split examples",
                    markdown_json(split_examples),
                    "",
                    "### Vocabulary coverage",
                    markdown_json(id_stats),
                ]
            ),
        )
    )

    if args.model_dir:
        model_tokenizer_variant = getattr(model_tokenizer, "tokenizer_variant", "unknown")
        sections.append(
            (
                "Train Inference Tokenizer Comparison",
                "\n".join(
                    [
                        f"- Model dir: `{args.model_dir}`",
                        f"- Model tokenizer variant: `{model_tokenizer_variant}`",
                        f"- Dataset tokenizer variant: `{dataset_variant}`",
                        f"- Diagnostic tokenizer variant: `{tokenizer_variant}`",
                        f"- Model tokenizer vocab size: {model_tokenizer.vocab_size:,}",
                        f"- Diagnostic tokenizer vocab size: {tokenizer.vocab_size:,}",
                        "",
                        "If dataset and model tokenizer variants differ, validation loss can be low while real inference sees different token IDs and boundaries.",
                    ]
                ),
            )
        )

    if model_eval:
        token_rows = [
            [true, pred, f"{count:,}"]
            for (true, pred), count in model_eval["top_token_confusions"]
        ]
        entity_rows = [
            [true, pred, f"{count:,}"]
            for (true, pred), count in model_eval["top_entity_confusions"]
        ]
        parse_field_rows, parse_full_line, parse_error_rows = _parse_metric_tables(model_eval["parse_metrics"])
        sections.append(
            (
                "Model Confusion Analysis",
                "\n".join(
                    [
                        f"- Evaluated samples: {model_eval['sample_count']:,}",
                        f"- Entity precision: {model_eval['precision']:.4f}",
                        f"- Entity recall: {model_eval['recall']:.4f}",
                        f"- Entity F1: {model_eval['f1']:.4f}",
                        "",
                        "### Boundary error classes",
                        format_counter(model_eval["boundary_errors"]),
                        "",
                        "### Top token-label confusions",
                        markdown_table(["true", "pred", "count"], token_rows) if token_rows else "- none",
                        "",
                        "### Top entity-type confusions",
                        markdown_table(["true", "pred", "count"], entity_rows) if entity_rows else "- none",
                        "",
                        "### Field exact-match accuracy (thin runtime)",
                        markdown_table(["field", "correct/total", "accuracy"], parse_field_rows),
                        "",
                        f"Thin-runtime full parse exact match: {parse_full_line}",
                        "",
                        "### Top thin-runtime field parse errors",
                        markdown_table(["field", "gold", "pred", "count"], parse_error_rows) if parse_error_rows else "- none",
                        "",
                        "### Hardest sampled parse failures",
                        markdown_json(model_eval["field_failures"][:10]) if model_eval["field_failures"] else "- none",
                        "",
                        "### Seqeval report",
                        "```text\n" + model_eval["classification_report"] + "\n```",
                    ]
                ),
            )
        )

    sections.append(
        (
            "Recommended Pipeline",
            "\n".join(
                [
                    "1. Use one tokenizer variant end to end and save it in the checkpoint metadata.",
                    "2. Prefer char-level or a deterministic hybrid tokenizer for DMHY filenames; avoid generic subword tokenization for labels.",
                    "3. For char-level runs, use `--tokenizer char --max-seq-length 128` with `vocab.char.json`.",
                    "4. Add CRF decoding or constrained BIO decoding so illegal I-X transitions and impossible boundary jumps are blocked.",
                    "5. Keep runtime post-processing thin: BIO aggregation plus string/number normalization.",
                    "6. Track entity-level F1 and field exact-match on real filenames; do not accept low validation loss alone.",
                ]
            ),
        )
    )

    write_report(Path(args.output), "Anime Filename Parser Diagnostics Report", sections)
    print(f"Wrote diagnostics report: {args.output}")


if __name__ == "__main__":
    main()