Spaces:
Configuration error
Configuration error
File size: 7,974 Bytes
3a2e5f0 91a1214 3a2e5f0 91a1214 3a2e5f0 91a1214 3a2e5f0 91a1214 3a2e5f0 91a1214 3a2e5f0 91a1214 3a2e5f0 91a1214 3a2e5f0 91a1214 3a2e5f0 91a1214 3a2e5f0 91a1214 3a2e5f0 91a1214 3a2e5f0 91a1214 3a2e5f0 91a1214 3a2e5f0 91a1214 3a2e5f0 91a1214 3a2e5f0 91a1214 3a2e5f0 91a1214 3a2e5f0 91a1214 3a2e5f0 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 | """Evaluate a trained model on the COCO validation split.
Usage:
# Full benchmark-ready evaluation (recommended) — writes
# results/<run_id>/{metrics.json, predictions.jsonl, diagnostics.jsonl, ...}
python -m scripts.evaluate \\
--config configs/base.yaml \\
--weights models/v1.0.0/model.h5 \\
--tokenizer-dir models/v1.0.0 \\
--results-root results \\
--max-samples 500
# Optional: produce a single Markdown report at a chosen path
python -m scripts.evaluate ... --report docs/results/v1.0.0.md
What this script produces (per run):
metrics.json — corpus BLEU-1..4, ROUGE-L, METEOR, CIDEr
predictions.jsonl — image / prediction / references for downstream tools
diagnostics.jsonl — per-sample length / repetition / sentence BLEU flags
run_meta.json — model id, decode strategy, beam width, timestamp
report.md — human-readable summary
Phase 3 benchmark code joins multiple ``results/<run_id>/`` directories to
plot BLEU-4 / CIDEr / latency across models.
"""
from __future__ import annotations
from datetime import datetime, timezone
from pathlib import Path
from typing import cast
import click
from captioning.config import load_config
from captioning.data import load_coco_annotations, make_image_level_splits
from captioning.evaluation import (
RunMeta,
compute_all_metrics,
diagnose_many,
write_run_artifacts,
)
from captioning.inference import CaptionPredictor
from captioning.inference.predictor import DecodeStrategy
from captioning.preprocessing import preprocess_caption
from captioning.utils import configure_logging, get_logger, set_global_seed
log = get_logger(__name__)
@click.command()
@click.option(
    "--config", "config_path", required=True, type=click.Path(exists=True, path_type=Path)
)
@click.option("--weights", required=True, type=click.Path(exists=True, path_type=Path))
@click.option("--tokenizer-dir", required=True, type=click.Path(exists=True, path_type=Path))
@click.option(
    "--results-root",
    type=click.Path(path_type=Path),
    default=Path("results"),
    help="Parent directory for the per-run sub-folder (results/<run_id>/).",
)
@click.option(
    "--run-id",
    type=str,
    default=None,
    help="Sub-folder name under --results-root. Defaults to a UTC timestamp.",
)
@click.option(
    "--model-id",
    type=str,
    default="inceptionv3-transformer-v1",
    help="Identifier used by Phase 3 cross-model joining of metrics.",
)
@click.option(
    "--decode-strategy",
    type=click.Choice(["greedy", "beam"]),
    default=None,
    help="Override config.serve.decode_strategy for this run.",
)
@click.option("--beam-width", type=int, default=None, help="Beam width (only used with beam).")
@click.option("--length-penalty", type=float, default=None)
@click.option("--repetition-penalty", type=float, default=None)
@click.option(
    "--report",
    "report_path",
    default=None,
    type=click.Path(path_type=Path),
    help="Optional path to an additional human-readable Markdown report.",
)
@click.option(
    "--max-samples",
    default=500,
    # Reject non-positive caps at parse time: 0 would evaluate nothing, and a
    # negative value would make the ``[:max_samples]`` slice below silently
    # drop images from the END of the list instead of capping it.
    type=click.IntRange(min=1),
    help="Cap on validation examples (full val takes hours on CPU).",
)
@click.option(
    "--skip-meteor",
    is_flag=True,
    default=False,
    help="Skip METEOR (avoids needing Java).",
)
@click.option(
    "--skip-cider",
    is_flag=True,
    default=False,
    help="Skip CIDEr.",
)
def main(  # NOTE: the long parameter list mirrors the CLI options and is unavoidable
    config_path: Path,
    weights: Path,
    tokenizer_dir: Path,
    results_root: Path,
    run_id: str | None,
    model_id: str,
    decode_strategy: str | None,
    beam_width: int | None,
    length_penalty: float | None,
    repetition_penalty: float | None,
    report_path: Path | None,
    max_samples: int,
    skip_meteor: bool,
    skip_cider: bool,
) -> None:
    """Evaluate the model on the val split and write benchmark artefacts."""
    # The docstring above is deliberately left as a single line: click renders
    # it as the command's ``--help`` text. Each parameter is documented through
    # the ``help=`` string of its option.
    #
    # Flow: load config and annotations -> take the validation split -> caption
    # every selected image -> score the captions -> write the run directory ->
    # echo a summary (and optionally copy the Markdown report elsewhere).
    configure_logging()
    config = load_config(config_path)
    set_global_seed(config.train.seed)

    df = load_coco_annotations(
        base_path=config.data.base_path,
        annotations_filename=config.data.annotations_filename,
        images_subdir=config.data.images_subdir,
        sample_size=config.data.sample_size,
        seed=config.train.seed,
        caption_preprocessor=preprocess_caption,
    )
    # Only the validation half of the split is used here; the first two
    # returned values are discarded.
    _, _, val_imgs, val_caps = make_image_level_splits(
        df, train_fraction=config.data.train_val_split, seed=config.train.seed
    )

    # Group references by image so we get the COCO 5-references-per-image format.
    refs_by_image: dict[str, list[str]] = {}
    for img, cap in zip(val_imgs, val_caps, strict=True):
        refs_by_image.setdefault(img, []).append(cap)

    # dicts keep insertion order, so this keeps the first ``max_samples``
    # distinct images in the order the split produced them.
    image_paths = list(refs_by_image)[:max_samples]
    if not image_paths:
        # Fail before loading the model rather than scoring an empty corpus.
        raise click.ClickException(
            "No validation images were selected; check the dataset configuration."
        )

    # CLI overrides win; anything left unset falls back to config.serve.
    # ``is not None`` is used for the numeric options so an explicit 0 / 0.0
    # from the command line is still honoured.
    effective_strategy = decode_strategy or config.serve.decode_strategy
    effective_beam_width = beam_width if beam_width is not None else config.serve.beam_width
    effective_length_penalty = (
        length_penalty if length_penalty is not None else config.serve.length_penalty
    )
    effective_repetition_penalty = (
        repetition_penalty if repetition_penalty is not None else config.serve.repetition_penalty
    )

    predictor = CaptionPredictor.from_artifacts(
        weights_path=weights,
        tokenizer_dir=tokenizer_dir,
        config=config,
        decode_strategy=cast("DecodeStrategy", effective_strategy),
        beam_width=effective_beam_width,
        length_penalty=effective_length_penalty,
        repetition_penalty=effective_repetition_penalty,
    )
    predictor.warmup()

    # predictions[i] and references[i] both belong to image_paths[i].
    predictions: list[str] = [predictor.predict_path(path) for path in image_paths]
    references: list[list[str]] = [refs_by_image[path] for path in image_paths]

    metrics = compute_all_metrics(
        predictions,
        references,
        include_meteor=not skip_meteor,
        include_cider=not skip_cider,
    )
    diagnostics = diagnose_many(image_paths, predictions, references)

    run_id = run_id or datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    run_dir = Path(results_root) / run_id
    is_beam = effective_strategy == "beam"
    meta = RunMeta(
        model_id=model_id,
        decode_strategy=effective_strategy,
        weights_path=str(weights),
        tokenizer_dir=str(tokenizer_dir),
        n_samples=len(predictions),
        max_length=config.model.max_length,
        # Beam-only settings are recorded as None for non-beam runs.
        beam_width=effective_beam_width if is_beam else None,
        length_penalty=effective_length_penalty if is_beam else None,
        repetition_penalty=effective_repetition_penalty,
    )
    write_run_artifacts(
        run_dir,
        metrics=metrics,
        meta=meta,
        images=image_paths,
        predictions=predictions,
        references=references,
        diagnostics=diagnostics,
    )

    log.info(
        "evaluation_done",
        run_dir=str(run_dir),
        n=metrics.n_examples,
        bleu4=metrics.bleu4,
        rouge_l=metrics.rouge_l,
        meteor=metrics.meteor,
        cider=metrics.cider,
    )

    click.echo(f"Run directory: {run_dir}")
    _echo_metric("BLEU-1", metrics.bleu1)
    _echo_metric("BLEU-2", metrics.bleu2)
    _echo_metric("BLEU-3", metrics.bleu3)
    _echo_metric("BLEU-4", metrics.bleu4)
    _echo_metric("ROUGE-L", metrics.rouge_l)
    _echo_metric("METEOR", metrics.meteor)
    _echo_metric("CIDEr", metrics.cider)
    if metrics.errors:
        click.echo(f"Skipped/failed: {sorted(metrics.errors)}")

    if report_path is not None:
        # The extra report is a copy of the report.md read back from the run
        # directory, placed at the caller-chosen location.
        report_path.parent.mkdir(parents=True, exist_ok=True)
        report_path.write_text(
            (run_dir / "report.md").read_text(encoding="utf-8"), encoding="utf-8"
        )
def _echo_metric(name: str, value: float | None) -> None:
    """Print a single ``<name>: <value>`` line for one metric.

    A missing metric (``None``) is shown as ``n/a``; any other value is
    formatted with two decimal places.
    """
    rendered = "n/a" if value is None else f"{value:.2f}"
    click.echo(f"{name}: {rendered}")
# Script entry point: invoke the click command only when this module is
# executed directly (e.g. ``python -m scripts.evaluate``), not on import.
if __name__ == "__main__":
    main()
|