apoorvrajdev's picture
feat(evaluation): add beam search, metrics pipeline, and stabilized training workflow
91a1214
"""Evaluation — caption-quality metrics + per-sample diagnostics.

Available metrics (all corpus-level, 0-100 scale where applicable):

* BLEU-1..4 — :mod:`bleu`
* ROUGE-L — :mod:`rouge`
* METEOR — :mod:`meteor` (requires a JRE on PATH)
* CIDEr — :mod:`cider` (requires >= 2 examples)

:func:`compute_all_metrics` in :mod:`runner` is the single entry point used
by the CLI and by future Phase 3 benchmark comparisons; per-sample
diagnostics live in :mod:`inspection`.
"""
from captioning.evaluation.benchmark import RunMeta, write_run_artifacts
from captioning.evaluation.bleu import (
BleuBreakdown,
corpus_bleu_breakdown,
corpus_bleu_score,
)
from captioning.evaluation.cider import MIN_SAMPLES_FOR_CIDER, corpus_cider_score
from captioning.evaluation.inspection import (
SampleDiagnostics,
diagnose_many,
diagnose_sample,
format_diagnostic_row,
write_diagnostics_jsonl,
)
from captioning.evaluation.meteor import corpus_meteor_score
from captioning.evaluation.rouge import corpus_rouge_l_score
from captioning.evaluation.runner import MetricsReport, compute_all_metrics
# Public API of the package: every name imported from the submodules above is
# re-exported here, so ``from captioning.evaluation import *`` exposes exactly
# these names. Entries are ordered by naming style (UPPER_CASE, then CamelCase,
# then snake_case) and alphabetically within each style; keep new entries in
# that order and add the matching import above.
__all__ = [
"MIN_SAMPLES_FOR_CIDER",
"BleuBreakdown",
"MetricsReport",
"RunMeta",
"SampleDiagnostics",
"compute_all_metrics",
"corpus_bleu_breakdown",
"corpus_bleu_score",
"corpus_cider_score",
"corpus_meteor_score",
"corpus_rouge_l_score",
"diagnose_many",
"diagnose_sample",
"format_diagnostic_row",
"write_diagnostics_jsonl",
"write_run_artifacts",
]