from __future__ import annotations import argparse import json import logging from pathlib import Path from .common import ARTIFACT_DIR, existing_default_checkpoint LOGGER = logging.getLogger(__name__) def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Generate an HTML evaluation report.") parser.add_argument( "--checkpoint-dir", default=existing_default_checkpoint(), help="Path to the trained model checkpoint directory containing metrics.", ) parser.add_argument( "--output-file", default=str(ARTIFACT_DIR / "eval_report.html"), help="Output HTML file path.", ) return parser.parse_args() def load_metrics(checkpoint_dir: Path) -> dict[str, dict[str, float]]: metrics = {} metrics_dir = checkpoint_dir / "metrics" if not metrics_dir.exists(): return metrics for split in ["train", "validation", "test"]: file_path = metrics_dir / f"{split}_metrics.json" if file_path.exists(): try: metrics[split] = json.loads(file_path.read_text(encoding="utf-8")) except Exception as e: LOGGER.warning(f"Failed to load {file_path}: {e}") return metrics def load_predictions(checkpoint_dir: Path) -> list[dict]: # We look for the predictions file in the artifact directory, # since eval.py writes it there by default. pred_file = ARTIFACT_DIR / "sample_predictions.jsonl" preds = [] if pred_file.exists(): try: for line in pred_file.read_text(encoding="utf-8").splitlines(): if line.strip(): preds.append(json.loads(line)) except Exception as e: LOGGER.warning(f"Failed to load predictions from {pred_file}: {e}") return preds def generate_html(checkpoint_name: str, metrics: dict, predictions: list) -> str: html = f"""
Checkpoint: {checkpoint_name}
| Split | Loss | ROUGE-1 | ROUGE-2 | ROUGE-L | BERTScore F1 | Avg Gen Length |
|---|---|---|---|---|---|---|
| {split.title()} | {fmt(loss)} | {fmt(r1)} | {fmt(r2)} | {fmt(rl)} | {fmt(bf1)} | {fmt(glen)} |
No predictions found.
" else: for i, p in enumerate(predictions): empty_tag = " (EMPTY PREDICTION)" if p.get("empty_prediction") else "" html += f"""