AniFileBERT / tools /diagnose_pipeline.py

Organize parser modules and tools

8c50d16 4 days ago

33.6 kB

	"""Diagnostics for the anime filename NER pipeline.

	The checks focus on structured filename parsing failure modes:

	- train/inference tokenizer mismatch
	- BIO legality and boundary drift
	- tokenizer split and vocabulary coverage
	- label/entity distribution
	- optional model confusion on a sampled validation split
	"""

	from __future__ import annotations

	import argparse
	import json
	import math
	import os
	import random
	import re
	from collections import Counter, defaultdict
	from pathlib import Path
	from typing import Dict, Iterable, List, Optional, Tuple

	import numpy as np
	import torch
	from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
	from transformers import BertForTokenClassification

	from anifilebert.config import Config
	from anifilebert.dataset import labels_for_tokenizer
	from anifilebert.inference import constrained_bio_decode, postprocess
	from anifilebert.tokenizer import AnimeTokenizer, create_tokenizer, load_tokenizer


	def iter_jsonl(path: Path, limit: Optional[int] = None) -> Iterable[dict]:
	with path.open("r", encoding="utf-8") as handle:
	for line_no, line in enumerate(handle, 1):
	if limit is not None and line_no > limit:
	break
	line = line.strip()
	if not line:
	continue
	try:
	yield json.loads(line)
	except json.JSONDecodeError as exc:
	raise ValueError(f"{path}:{line_no}: invalid JSON") from exc


	def detect_dataset_variant(samples: List[dict], vocab_file: Optional[str]) -> str:
	variants = {sample.get("tokenizer_variant") for sample in samples if sample.get("tokenizer_variant")}
	if len(variants) == 1:
	return next(iter(variants))
	if len(variants) > 1:
	return "mixed"
	if vocab_file and ".char" in os.path.basename(vocab_file).lower():
	return "char"
	char_like = 0
	with_filename = 0
	for sample in samples:
	filename = sample.get("filename")
	if filename is None:
	continue
	with_filename += 1
	if sample.get("tokens") == list(filename):
	char_like += 1
	if with_filename and char_like / with_filename >= 0.95:
	return "char"
	return "regex"


	def entity_type(label: str) -> Optional[str]:
	if "-" not in label:
	return None
	return label.split("-", 1)[1]


	def bio_violations(tokens: List[str], labels: List[str]) -> List[dict]:
	violations: List[dict] = []
	previous_label = "O"
	current_entity: Optional[str] = None

	for idx, label in enumerate(labels):
	token = tokens[idx] if idx < len(tokens) else None
	if label == "O":
	current_entity = None
	elif label.startswith("B-"):
	current_entity = entity_type(label)
	elif label.startswith("I-"):
	label_entity = entity_type(label)
	previous_entity = entity_type(previous_label)
	if idx == 0 or previous_label == "O" or previous_entity != label_entity:
	violations.append(
	{
	"type": "ORPHAN_I",
	"index": idx,
	"prev_label": previous_label,
	"label": label,
	"token": token,
	}
	)
	current_entity = label_entity
	else:
	violations.append(
	{
	"type": "UNKNOWN_LABEL",
	"index": idx,
	"prev_label": previous_label,
	"label": label,
	"token": token,
	}
	)
	current_entity = None
	previous_label = label

	return violations


	def bio_boundary_warnings(tokens: List[str], labels: List[str]) -> List[dict]:
	"""Collect legal-but-suspicious boundary patterns separately from BIO errors."""
	warnings: List[dict] = []
	for idx, label in enumerate(labels[1:], 1):
	previous_label = labels[idx - 1]
	if label == "O" and previous_label.startswith("B-"):
	warnings.append(
	{
	"type": "SINGLE_TOKEN_ENTITY",
	"index": idx,
	"prev_label": previous_label,
	"label": label,
	"token": tokens[idx] if idx < len(tokens) else None,
	}
	)
	return warnings


	def spans_from_labels(tokens: List[str], labels: List[str]) -> List[dict]:
	spans: List[dict] = []
	start: Optional[int] = None
	current_type: Optional[str] = None
	current_tokens: List[str] = []

	for idx, (token, label) in enumerate(zip(tokens, labels)):
	if label.startswith("B-"):
	if current_type is not None and start is not None:
	spans.append(
	{
	"type": current_type,
	"start": start,
	"end": idx,
	"text": "".join(current_tokens),
	}
	)
	current_type = entity_type(label)
	start = idx
	current_tokens = [token]
	elif label.startswith("I-") and current_type == entity_type(label):
	current_tokens.append(token)
	elif label.startswith("I-"):
	if current_type is not None and start is not None:
	spans.append(
	{
	"type": current_type,
	"start": start,
	"end": idx,
	"text": "".join(current_tokens),
	}
	)
	current_type = entity_type(label)
	start = idx
	current_tokens = [token]
	else:
	if current_type is not None and start is not None:
	spans.append(
	{
	"type": current_type,
	"start": start,
	"end": idx,
	"text": "".join(current_tokens),
	}
	)
	current_type = None
	start = None
	current_tokens = []

	if current_type is not None and start is not None:
	spans.append(
	{
	"type": current_type,
	"start": start,
	"end": len(labels),
	"text": "".join(current_tokens),
	}
	)
	return spans


	def count_entities(samples: List[dict]) -> Counter:
	counts: Counter = Counter()
	for sample in samples:
	for span in spans_from_labels(sample["tokens"], sample["labels"]):
	counts[span["type"]] += 1
	return counts


	def percentile(values: List[int], pct: float) -> int:
	if not values:
	return 0
	ordered = sorted(values)
	idx = min(len(ordered) - 1, round((pct / 100) * (len(ordered) - 1)))
	return ordered[idx]


	def token_mismatch(sample: dict, tokenizer: AnimeTokenizer) -> Optional[dict]:
	filename = sample.get("filename")
	if filename is None:
	return None
	inferred = tokenizer.tokenize(filename)
	dataset_tokens = sample.get("tokens", [])
	if inferred == dataset_tokens:
	return None
	prefix = 0
	for left, right in zip(inferred, dataset_tokens):
	if left != right:
	break
	prefix += 1
	return {
	"file_id": sample.get("file_id"),
	"filename": filename,
	"common_prefix": prefix,
	"dataset_tokens": dataset_tokens[:40],
	"tokenizer_tokens": inferred[:40],
	"dataset_len": len(dataset_tokens),
	"tokenizer_len": len(inferred),
	}


	def format_counter(counter: Counter, total: Optional[int] = None, limit: Optional[int] = None) -> str:
	if total is None:
	total = sum(counter.values())
	rows = []
	items = counter.most_common(limit)
	for key, count in items:
	pct = count / total * 100 if total else 0.0
	rows.append(f"- `{key}`: {count:,} ({pct:.2f}%)")
	return "\n".join(rows) if rows else "- none"


	def token_id_stats(samples: List[dict], tokenizer: AnimeTokenizer) -> dict:
	total = 0
	unk = 0
	unk_counter: Counter = Counter()
	for sample in samples:
	tokens, _labels = labels_for_tokenizer(sample, tokenizer)
	ids = tokenizer.convert_tokens_to_ids(tokens)
	for token, token_id in zip(tokens, ids):
	total += 1
	if token_id == tokenizer.unk_token_id:
	unk += 1
	unk_counter[token] += 1
	return {
	"total": total,
	"unk": unk,
	"unk_rate": unk / total if total else 0.0,
	"top_unk": unk_counter.most_common(25),
	}


	def prepare_inputs(
	sample: dict,
	tokenizer: AnimeTokenizer,
	label2id: Dict[str, int],
	max_length: int,
	) -> Tuple[List[int], List[int], List[int], List[str]]:
	tokens, labels = labels_for_tokenizer(sample, tokenizer)
	input_ids = tokenizer.convert_tokens_to_ids(tokens)
	input_ids = [tokenizer.cls_token_id] + input_ids + [tokenizer.sep_token_id]
	label_ids = [-100] + [label2id.get(label, 0) for label in labels] + [-100]
	attention_mask = [1] * len(input_ids)

	if len(input_ids) > max_length:
	input_ids = [input_ids[0]] + input_ids[1:max_length - 1] + [input_ids[-1]]
	label_ids = [label_ids[0]] + label_ids[1:max_length - 1] + [label_ids[-1]]
	attention_mask = [1] * len(input_ids)

	pad_len = max_length - len(input_ids)
	if pad_len > 0:
	input_ids += [tokenizer.pad_token_id] * pad_len
	label_ids += [-100] * pad_len
	attention_mask += [0] * pad_len

	return input_ids, attention_mask, label_ids, tokens


	def normalize_field_value(field: str, value) -> Optional[str]:
	if value is None:
	return None
	if field in {"episode", "season"}:
	try:
	return str(int(value))
	except (TypeError, ValueError):
	return str(value).strip().lower()
	text = str(value).strip()
	if field in {"resolution", "source"}:
	return text.lower().replace("_", "-")
	return re.sub(r"\s+", " ", text).strip().lower()


	def update_parse_metrics(counter: Counter, gold: dict, pred: dict) -> None:
	fields = ["group", "title", "season", "episode", "resolution", "source", "special"]
	all_match = True
	for field in fields:
	gold_value = normalize_field_value(field, gold.get(field))
	pred_value = normalize_field_value(field, pred.get(field))
	if gold_value == pred_value:
	counter[f"{field}_correct"] += 1
	else:
	all_match = False
	counter[(field, gold_value, pred_value)] += 1
	counter[f"{field}_total"] += 1
	if all_match:
	counter["full_match_correct"] += 1
	counter["full_match_total"] += 1


	def collect_field_failures(gold: dict, pred: dict) -> Dict[str, Dict[str, Optional[str]]]:
	return {
	field: {
	"gold": normalize_field_value(field, gold.get(field)),
	"pred": normalize_field_value(field, pred.get(field)),
	}
	for field in ["group", "title", "season", "episode", "resolution", "source", "special"]
	if normalize_field_value(field, gold.get(field)) != normalize_field_value(field, pred.get(field))
	}


	def evaluate_model(
	samples: List[dict],
	model_dir: Path,
	tokenizer: AnimeTokenizer,
	max_length: int,
	limit: int,
	seed: int,
	) -> dict:
	cfg = Config()
	model = BertForTokenClassification.from_pretrained(str(model_dir))
	model.eval()
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	model.to(device)

	rng = random.Random(seed)
	eval_samples = list(samples)
	rng.shuffle(eval_samples)
	eval_samples = eval_samples[:limit]

	id2label = {int(k): v for k, v in getattr(model.config, "id2label", cfg.id2label).items()}
	label2id = {v: int(k) for k, v in id2label.items()}
	if not label2id:
	label2id = cfg.label2id
	id2label = cfg.id2label

	true_sequences: List[List[str]] = []
	pred_sequences: List[List[str]] = []
	confusion: Counter = Counter()
	entity_confusion: Counter = Counter()
	boundary_errors: Counter = Counter()
	parse_metrics: Counter = Counter()
	field_failures: List[dict] = []

	with torch.no_grad():
	for sample in eval_samples:
	input_ids, attention_mask, label_ids, sample_tokens = prepare_inputs(
	sample,
	tokenizer,
	label2id,
	max_length,
	)
	input_tensor = torch.tensor([input_ids], dtype=torch.long, device=device)
	mask_tensor = torch.tensor([attention_mask], dtype=torch.long, device=device)
	logits = model(input_ids=input_tensor, attention_mask=mask_tensor).logits
	active_count = sum(1 for label_id in label_ids if label_id != -100)
	pred_ids = constrained_bio_decode(logits[0, 1:1 + active_count, :], id2label)

	true_labels: List[str] = []
	pred_labels: List[str] = []
	pred_idx = 0
	for label_id in label_ids:
	if label_id == -100:
	continue
	pred_id = pred_ids[pred_idx]
	pred_idx += 1
	true_label = id2label.get(label_id, "O")
	pred_label = id2label.get(pred_id, "O")
	true_labels.append(true_label)
	pred_labels.append(pred_label)
	confusion[(true_label, pred_label)] += 1
	entity_confusion[(entity_type(true_label) or "O", entity_type(pred_label) or "O")] += 1
	if true_label != pred_label:
	if true_label.startswith("B-") or pred_label.startswith("B-"):
	boundary_errors["B-boundary"] += 1
	elif entity_type(true_label) != entity_type(pred_label):
	boundary_errors["entity-type"] += 1
	else:
	boundary_errors["BIO-prefix"] += 1
	true_sequences.append(true_labels)
	pred_sequences.append(pred_labels)
	active_tokens = sample_tokens[:len(true_labels)]
	gold_parse = postprocess(
	active_tokens,
	true_labels,
	tokenizer=tokenizer,
	)
	pred_parse = postprocess(
	active_tokens,
	pred_labels,
	tokenizer=tokenizer,
	)
	update_parse_metrics(parse_metrics, gold_parse, pred_parse)
	failures = collect_field_failures(gold_parse, pred_parse)
	if failures and len(field_failures) < 30:
	field_failures.append(
	{
	"filename": sample.get("filename"),
	"errors": failures,
	"gold": gold_parse,
	"pred": pred_parse,
	}
	)

	errors = confusion.copy()
	for label in set(label for pair in confusion for label in pair):
	errors.pop((label, label), None)

	return {
	"sample_count": len(eval_samples),
	"precision": precision_score(true_sequences, pred_sequences),
	"recall": recall_score(true_sequences, pred_sequences),
	"f1": f1_score(true_sequences, pred_sequences),
	"classification_report": classification_report(true_sequences, pred_sequences, digits=4),
	"top_token_confusions": errors.most_common(30),
	"top_entity_confusions": Counter(
	{k: v for k, v in entity_confusion.items() if k[0] != k[1]}
	).most_common(30),
	"boundary_errors": boundary_errors,
	"parse_metrics": parse_metrics,
	"field_failures": field_failures,
	}


	def tokenizer_split_examples(samples: List[dict], tokenizers: Dict[str, AnimeTokenizer], limit: int = 8) -> List[dict]:
	examples: List[dict] = []
	for sample in samples:
	filename = sample.get("filename")
	if not filename:
	continue
	row = {
	"file_id": sample.get("file_id"),
	"filename": filename,
	"dataset_tokens": sample.get("tokens", [])[:80],
	}
	for name, tokenizer in tokenizers.items():
	row[f"{name}_tokens"] = tokenizer.tokenize(filename)[:80]
	examples.append(row)
	if len(examples) >= limit:
	break
	return examples


	def write_report(path: Path, title: str, sections: List[Tuple[str, str]]) -> None:
	parts = [f"# {title}", ""]
	for heading, body in sections:
	parts.append(f"## {heading}")
	parts.append("")
	parts.append(body.strip() if body.strip() else "_No data._")
	parts.append("")
	path.write_text("\n".join(parts), encoding="utf-8")


	def markdown_json(value) -> str:
	return "```json\n" + json.dumps(value, ensure_ascii=False, indent=2) + "\n```"


	def markdown_table(headers: List[str], rows: List[List[str]], limit: Optional[int] = None) -> str:
	if limit is not None:
	rows = rows[:limit]
	table = ["\| " + " \| ".join(headers) + " \|", "\| " + " \| ".join("---" for _ in headers) + " \|"]
	for row in rows:
	table.append("\| " + " \| ".join(str(cell).replace("\n", " ") for cell in row) + " \|")
	return "\n".join(table)


	def main() -> None:
	parser = argparse.ArgumentParser(description="Diagnose anime filename NER data and model pipeline")
	parser.add_argument("--data-file", required=True, help="JSONL dataset with tokens and labels")
	parser.add_argument("--vocab-file", default=None, help="Tokenizer vocab JSON")
	parser.add_argument("--tokenizer", choices=["regex", "char"], default=None,
	help="Tokenizer variant to diagnose. Defaults to dataset metadata")
	parser.add_argument("--model-dir", default=None, help="Optional model directory for confusion analysis")
	parser.add_argument("--max-length", type=int, default=None, help="Max sequence length for model eval/truncation stats")
	parser.add_argument("--sample-limit", type=int, default=20000, help="Rows to inspect for data diagnostics")
	parser.add_argument("--eval-limit", type=int, default=512, help="Rows to evaluate when --model-dir is provided")
	parser.add_argument("--output", default="diagnostics_report.md", help="Markdown report path")
	parser.add_argument("--seed", type=int, default=42)
	args = parser.parse_args()

	data_path = Path(args.data_file)
	samples = list(iter_jsonl(data_path, args.sample_limit))
	if not samples:
	raise ValueError(f"No samples loaded from {data_path}")

	dataset_variant = detect_dataset_variant(samples, args.vocab_file)
	tokenizer_variant = args.tokenizer or (dataset_variant if dataset_variant != "mixed" else "regex")
	vocab_file = args.vocab_file
	if vocab_file is None:
	vocab_file = str(data_path.with_name("vocab.char.json" if tokenizer_variant == "char" else "vocab.json"))
	tokenizer = create_tokenizer(tokenizer_variant, vocab_file=vocab_file)

	if args.model_dir:
	model_tokenizer = load_tokenizer(args.model_dir)
	else:
	model_tokenizer = tokenizer

	label_counter: Counter = Counter()
	length_values: List[int] = []
	aligned_length_values: List[int] = []
	violations: List[dict] = []
	boundary_warnings: List[dict] = []
	mismatch_examples: List[dict] = []
	space_label_counter: Counter = Counter()
	boundary_drift_counter: Counter = Counter()
	truncation_count = 0
	max_length = args.max_length
	if max_length is None and args.model_dir:
	model_config = BertForTokenClassification.from_pretrained(args.model_dir).config
	max_length = int(getattr(model_config, "max_seq_length", 64))
	max_length = max_length or (128 if tokenizer_variant == "char" else 64)

	for row_idx, sample in enumerate(samples, 1):
	tokens = sample.get("tokens", [])
	labels = sample.get("labels", [])
	if len(tokens) != len(labels):
	violations.append(
	{
	"type": "LENGTH_MISMATCH",
	"row": row_idx,
	"file_id": sample.get("file_id"),
	"token_count": len(tokens),
	"label_count": len(labels),
	"filename": sample.get("filename"),
	}
	)
	continue

	label_counter.update(labels)
	length_values.append(len(tokens))
	aligned_tokens, aligned_labels = labels_for_tokenizer(sample, tokenizer)
	aligned_length_values.append(len(aligned_tokens))
	if len(aligned_tokens) + 2 > max_length:
	truncation_count += 1
	for token, label in zip(tokens, labels):
	if token.isspace():
	space_label_counter[label] += 1
	for violation in bio_violations(tokens, labels):
	violation.update(
	{
	"row": row_idx,
	"file_id": sample.get("file_id"),
	"filename": sample.get("filename"),
	"context_tokens": tokens[max(0, violation["index"] - 5):violation["index"] + 6],
	"context_labels": labels[max(0, violation["index"] - 5):violation["index"] + 6],
	}
	)
	violations.append(violation)
	for warning in bio_boundary_warnings(tokens, labels):
	warning.update(
	{
	"row": row_idx,
	"file_id": sample.get("file_id"),
	"filename": sample.get("filename"),
	"context_tokens": tokens[max(0, warning["index"] - 5):warning["index"] + 6],
	"context_labels": labels[max(0, warning["index"] - 5):warning["index"] + 6],
	}
	)
	boundary_warnings.append(warning)
	for span in spans_from_labels(tokens, labels):
	text = span["text"]
	if span["type"] == "TITLE":
	if text.startswith("[") or text.endswith("[") or "]" in text[:3]:
	boundary_drift_counter["title_contains_bracket_edge"] += 1
	if re.search(r"\b(?:WEB[-_ ]?DL\|WebRip\|\d{3,4}[pP]\|HEVC\|AVC\|AAC)\b", text, re.I):
	boundary_drift_counter["title_contains_meta"] += 1
	if span["type"] == "GROUP" and ("[" in text or "]" in text):
	boundary_drift_counter["group_contains_bracket"] += 1

	if len(mismatch_examples) < 10:
	mismatch = token_mismatch(sample, tokenizer)
	if mismatch:
	mismatch_examples.append(mismatch)

	entity_counter = count_entities(samples)
	id_stats = token_id_stats(samples, tokenizer)
	split_examples = tokenizer_split_examples(
	samples,
	{
	"diagnosed": tokenizer,
	"regex": create_tokenizer("regex", vocab_file=str(data_path.with_name("vocab.json"))),
	"char": create_tokenizer("char", vocab_file=str(data_path.with_name("vocab.char.json"))),
	},
	)

	model_eval = None
	if args.model_dir:
	model_eval = evaluate_model(
	samples=samples,
	model_dir=Path(args.model_dir),
	tokenizer=model_tokenizer,
	max_length=max_length,
	limit=args.eval_limit,
	seed=args.seed,
	)

	total_labels = sum(label_counter.values())
	o_count = label_counter.get("O", 0)
	sections: List[Tuple[str, str]] = []

	sections.append(
	(
	"Executive Summary",
	"\n".join(
	[
	f"- Dataset: `{data_path}`",
	f"- Inspected rows: {len(samples):,}",
	f"- Dataset tokenizer variant: `{dataset_variant}`",
	f"- Diagnosed tokenizer variant: `{tokenizer_variant}`",
	f"- Vocab: `{vocab_file}` ({tokenizer.vocab_size:,} tokens)",
	f"- Max sequence length checked: {max_length}",
	f"- O-label ratio: {o_count / total_labels * 100:.2f}%" if total_labels else "- O-label ratio: n/a",
	f"- Truncation risk: {truncation_count:,}/{len(samples):,} rows ({truncation_count / len(samples) * 100:.2f}%)",
	f"- UNK rate after selected tokenizer: {id_stats['unk_rate'] * 100:.4f}%",
	f"- BIO warnings collected: {len(violations):,}",
	"",
	"Primary finding: this task is structural filename parsing. Tokenizer/preprocessing identity is more important than lowering token loss.",
	]
	),
	)
	)

	sections.append(
	(
	"Label And Entity Statistics",
	"\n".join(
	[
	"### Label distribution",
	format_counter(label_counter, total_labels),
	"",
	"### Entity count",
	format_counter(entity_counter),
	"",
	"### Length distribution",
	markdown_json(
	{
	"raw_tokens": {
	"min": min(length_values),
	"p50": percentile(length_values, 50),
	"p90": percentile(length_values, 90),
	"p95": percentile(length_values, 95),
	"p99": percentile(length_values, 99),
	"max": max(length_values),
	},
	"aligned_tokens": {
	"min": min(aligned_length_values),
	"p50": percentile(aligned_length_values, 50),
	"p90": percentile(aligned_length_values, 90),
	"p95": percentile(aligned_length_values, 95),
	"p99": percentile(aligned_length_values, 99),
	"max": max(aligned_length_values),
	},
	}
	),
	"",
	"### Whitespace labels",
	format_counter(space_label_counter),
	]
	),
	)
	)

	violation_counter = Counter(v["type"] for v in violations)
	warning_counter = Counter(w["type"] for w in boundary_warnings)
	sections.append(
	(
	"BIO Violations And Boundary Drift",
	"\n".join(
	[
	"### True BIO violation counts",
	format_counter(violation_counter),
	"",
	"### Legal boundary warning counts",
	format_counter(warning_counter),
	"",
	"### Boundary drift heuristics",
	format_counter(boundary_drift_counter),
	"",
	"### Sample violations",
	markdown_json(violations[:30]),
	"",
	"### Sample boundary warnings",
	markdown_json(boundary_warnings[:30]),
	]
	),
	)
	)

	sections.append(
	(
	"Tokenizer Split And Alignment",
	"\n".join(
	[
	"### Dataset tokens vs selected tokenizer mismatches",
	markdown_json(mismatch_examples),
	"",
	"### Split examples",
	markdown_json(split_examples),
	"",
	"### Vocabulary coverage",
	markdown_json(id_stats),
	]
	),
	)
	)

	if args.model_dir:
	model_tokenizer_variant = getattr(model_tokenizer, "tokenizer_variant", "unknown")
	sections.append(
	(
	"Train Inference Tokenizer Comparison",
	"\n".join(
	[
	f"- Model dir: `{args.model_dir}`",
	f"- Model tokenizer variant: `{model_tokenizer_variant}`",
	f"- Dataset tokenizer variant: `{dataset_variant}`",
	f"- Diagnostic tokenizer variant: `{tokenizer_variant}`",
	f"- Model tokenizer vocab size: {model_tokenizer.vocab_size:,}",
	f"- Diagnostic tokenizer vocab size: {tokenizer.vocab_size:,}",
	"",
	"If dataset and model tokenizer variants differ, validation loss can be low while real inference sees different token IDs and boundaries.",
	]
	),
	)
	)

	if model_eval:
	token_rows = [
	[true, pred, f"{count:,}"]
	for (true, pred), count in model_eval["top_token_confusions"]
	]
	entity_rows = [
	[true, pred, f"{count:,}"]
	for (true, pred), count in model_eval["top_entity_confusions"]
	]
	def parse_metric_tables(metrics: Counter) -> Tuple[List[List[str]], str, List[List[str]]]:
	field_rows = []
	for field in ["group", "title", "season", "episode", "resolution", "source", "special"]:
	total = metrics.get(f"{field}_total", 0)
	correct = metrics.get(f"{field}_correct", 0)
	acc = correct / total if total else 0.0
	field_rows.append([field, f"{correct:,}/{total:,}", f"{acc:.4f}"])
	full_total = metrics.get("full_match_total", 0)
	full_correct = metrics.get("full_match_correct", 0)
	full_acc = full_correct / full_total if full_total else 0.0
	full_line = f"{full_correct:,}/{full_total:,} ({full_acc:.4f})"
	error_rows = [
	[field, str(gold), str(pred), f"{count:,}"]
	for key, count in Counter(
	{key: count for key, count in metrics.items() if isinstance(key, tuple)}
	).most_common(30)
	if isinstance(key, tuple)
	for field, gold, pred in [key]
	]
	return field_rows, full_line, error_rows

	parse_field_rows, parse_full_line, parse_error_rows = parse_metric_tables(model_eval["parse_metrics"])
	sections.append(
	(
	"Model Confusion Analysis",
	"\n".join(
	[
	f"- Evaluated samples: {model_eval['sample_count']:,}",
	f"- Entity precision: {model_eval['precision']:.4f}",
	f"- Entity recall: {model_eval['recall']:.4f}",
	f"- Entity F1: {model_eval['f1']:.4f}",
	"",
	"### Boundary error classes",
	format_counter(model_eval["boundary_errors"]),
	"",
	"### Top token-label confusions",
	markdown_table(["true", "pred", "count"], token_rows) if token_rows else "- none",
	"",
	"### Top entity-type confusions",
	markdown_table(["true", "pred", "count"], entity_rows) if entity_rows else "- none",
	"",
	"### Field exact-match accuracy (thin runtime)",
	markdown_table(["field", "correct/total", "accuracy"], parse_field_rows),
	"",
	f"Thin-runtime full parse exact match: {parse_full_line}",
	"",
	"### Top thin-runtime field parse errors",
	markdown_table(["field", "gold", "pred", "count"], parse_error_rows) if parse_error_rows else "- none",
	"",
	"### Hardest sampled parse failures",
	markdown_json(model_eval["field_failures"][:10]) if model_eval["field_failures"] else "- none",
	"",
	"### Seqeval report",
	"```text\n" + model_eval["classification_report"] + "\n```",
	]
	),
	)
	)

	sections.append(
	(
	"Recommended Pipeline",
	"\n".join(
	[
	"1. Use one tokenizer variant end to end and save it in the checkpoint metadata.",
	"2. Prefer char-level or a deterministic hybrid tokenizer for DMHY filenames; avoid generic subword tokenization for labels.",
	"3. For char-level runs, use `--tokenizer char --max-seq-length 128` with `vocab.char.json`.",
	"4. Add CRF decoding or constrained BIO decoding so illegal I-X transitions and impossible boundary jumps are blocked.",
	"5. Keep runtime post-processing thin: BIO aggregation plus string/number normalization.",
	"6. Track entity-level F1 and field exact-match on real filenames; do not accept low validation loss alone.",
	]
	),
	)
	)

	write_report(Path(args.output), "Anime Filename Parser Diagnostics Report", sections)
	print(f"Wrote diagnostics report: {args.output}")


	if __name__ == "__main__":
	main()