Spaces:
Configuration error
Configuration error
| """Tests for ROUGE-L, CIDEr, METEOR adapters and the unified runner. | |
| We don't validate the upstream implementations — they have their own test | |
| suites. We *do* validate our adapters: sentinel stripping, ragged references, | |
| the perfect-prediction bound, and that the unified ``compute_all_metrics`` | |
| correctly records partial failures in ``errors`` rather than crashing the | |
| whole pass. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import pytest | |
| from captioning.evaluation import ( | |
| MIN_SAMPLES_FOR_CIDER, | |
| BleuBreakdown, | |
| RunMeta, | |
| compute_all_metrics, | |
| corpus_bleu_breakdown, | |
| corpus_bleu_score, | |
| corpus_cider_score, | |
| corpus_rouge_l_score, | |
| diagnose_many, | |
| diagnose_sample, | |
| write_diagnostics_jsonl, | |
| write_run_artifacts, | |
| ) | |
| # ---- BLEU ------------------------------------------------------------------ | |
| def test_bleu_breakdown_returns_all_four_orders() -> None: | |
| refs = [["a man riding a bike"], ["a dog in the park"]] | |
| preds = ["a man riding a bike", "a dog in the park"] | |
| result = corpus_bleu_breakdown(preds, refs) | |
| assert isinstance(result, BleuBreakdown) | |
| assert result.bleu1 == pytest.approx(100.0) | |
| assert result.bleu2 == pytest.approx(100.0) | |
| assert result.bleu4 == pytest.approx(100.0) | |
| assert corpus_bleu_score(preds, refs) == pytest.approx(result.bleu4) | |
| def test_bleu_strips_sentinels_before_scoring() -> None: | |
| refs = [["[start] a man riding a bike [end]"]] | |
| preds = ["[start] a man riding a bike [end]"] | |
| assert corpus_bleu_score(preds, refs) == pytest.approx(100.0) | |
| # ---- ROUGE-L --------------------------------------------------------------- | |
| def test_rouge_l_perfect_matches_score_100() -> None: | |
| refs = [["a man riding a bike"], ["a dog in the park"]] | |
| preds = ["a man riding a bike", "a dog in the park"] | |
| score = corpus_rouge_l_score(preds, refs) | |
| assert score == pytest.approx(100.0) | |
| def test_rouge_l_partial_overlap_scores_in_range() -> None: | |
| refs = [["a man riding a bike on a road"]] | |
| preds = ["a man on a road"] | |
| score = corpus_rouge_l_score(preds, refs) | |
| # Reference has 7 tokens, prediction has 5 tokens, LCS=5 | |
| # P = 5/5 = 1.0, R = 5/7 ≈ 0.71, F ≈ 0.83 | |
| assert 70.0 < score < 90.0 | |
| def test_rouge_l_picks_best_reference() -> None: | |
| refs = [["xyz qrs nothing matches", "a man riding a bike"]] | |
| preds = ["a man riding a bike"] | |
| score = corpus_rouge_l_score(preds, refs) | |
| assert score == pytest.approx(100.0) | |
| def test_rouge_l_length_mismatch_raises() -> None: | |
| with pytest.raises(ValueError): | |
| corpus_rouge_l_score(["a"], [["a"], ["b"]]) | |
| # ---- CIDEr ----------------------------------------------------------------- | |
| def test_cider_requires_minimum_samples() -> None: | |
| with pytest.raises(ValueError, match="degenerate"): | |
| corpus_cider_score(["a man"], [["a man"]]) | |
| def test_cider_returns_positive_for_good_predictions() -> None: | |
| refs = [ | |
| ["a man riding a bike"], | |
| ["a dog in the park"], | |
| ["two children playing"], | |
| ] | |
| preds = ["a man riding a bike", "a dog in the park", "two children playing"] | |
| score = corpus_cider_score(preds, refs) | |
| assert score > 0.0 | |
| # ---- Runner ---------------------------------------------------------------- | |
| def test_compute_all_metrics_returns_every_field() -> None: | |
| refs = [["a man riding a bike"], ["a dog in the park"]] | |
| preds = ["a man riding a bike", "a dog in the park"] | |
| report = compute_all_metrics(preds, refs, include_meteor=False, include_cider=False) | |
| assert report.n_examples == 2 | |
| assert report.bleu1 is not None | |
| assert report.bleu4 is not None | |
| assert report.rouge_l is not None | |
| assert report.meteor is None # explicitly skipped | |
| assert report.cider is None # explicitly skipped | |
| def test_compute_all_metrics_skips_cider_on_tiny_corpus() -> None: | |
| refs = [["a man riding a bike"]] | |
| preds = ["a man riding a bike"] | |
| report = compute_all_metrics(preds, refs, include_meteor=False) | |
| assert report.cider is None | |
| assert "cider" in report.errors | |
| def test_compute_all_metrics_serialises_to_dict() -> None: | |
| refs = [["a man riding a bike"], ["a dog in the park"]] | |
| preds = ["a man riding a bike", "a dog in the park"] | |
| report = compute_all_metrics(preds, refs, include_meteor=False, include_cider=False) | |
| payload = report.to_dict() | |
| # JSON-roundtrip must not lose information. | |
| assert json.loads(json.dumps(payload)) == payload | |
| # ---- Inspection ----------------------------------------------------------- | |
| def test_diagnose_sample_flags_empty_prediction() -> None: | |
| d = diagnose_sample("img.jpg", "", ["a man riding a bike"]) | |
| assert "empty" in d.flags | |
| assert d.length_tokens == 0 | |
| def test_diagnose_sample_flags_repetitive_prediction() -> None: | |
| d = diagnose_sample("img.jpg", "a a a a man", ["a man riding a bike"]) | |
| assert "repetitive" in d.flags | |
| assert d.longest_repeat_run == 4 | |
| def test_diagnose_sample_flags_very_short_prediction() -> None: | |
| d = diagnose_sample("img.jpg", "a man", ["a man riding a bike"]) | |
| assert "very_short" in d.flags | |
| def test_diagnose_many_writes_jsonl(tmp_path) -> None: | |
| images = ["a.jpg", "b.jpg"] | |
| preds = ["a man riding a bike", ""] | |
| refs = [["a man on a bicycle"], ["a dog in the park"]] | |
| diags = diagnose_many(images, preds, refs) | |
| out = tmp_path / "diag.jsonl" | |
| write_diagnostics_jsonl(diags, out) | |
| lines = out.read_text(encoding="utf-8").splitlines() | |
| assert len(lines) == 2 | |
| parsed = [json.loads(line) for line in lines] | |
| assert parsed[0]["image"] == "a.jpg" | |
| # Empty prediction also flags as ``very_short`` because it has 0 tokens. | |
| assert "empty" in parsed[1]["flags"] | |
| # ---- Benchmark scaffolding ------------------------------------------------ | |
| def test_write_run_artifacts_emits_expected_files(tmp_path) -> None: | |
| images = ["a.jpg", "b.jpg"] | |
| preds = ["a man riding a bike", "a dog in the park"] | |
| refs = [["a man on a bicycle"], ["a dog in the park"]] | |
| diags = diagnose_many(images, preds, refs) | |
| report = compute_all_metrics(preds, refs, include_meteor=False, include_cider=False) | |
| meta = RunMeta( | |
| model_id="test-model", | |
| decode_strategy="greedy", | |
| weights_path="nowhere", | |
| tokenizer_dir="nowhere", | |
| n_samples=len(preds), | |
| max_length=40, | |
| ) | |
| out_dir = write_run_artifacts( | |
| tmp_path / "runX", | |
| metrics=report, | |
| meta=meta, | |
| images=images, | |
| predictions=preds, | |
| references=refs, | |
| diagnostics=diags, | |
| ) | |
| assert (out_dir / "metrics.json").is_file() | |
| assert (out_dir / "run_meta.json").is_file() | |
| assert (out_dir / "predictions.jsonl").is_file() | |
| assert (out_dir / "diagnostics.jsonl").is_file() | |
| assert (out_dir / "report.md").is_file() | |
| predictions_lines = (out_dir / "predictions.jsonl").read_text(encoding="utf-8").splitlines() | |
| assert len(predictions_lines) == 2 | |
| metadata = json.loads((out_dir / "run_meta.json").read_text(encoding="utf-8")) | |
| assert metadata["model_id"] == "test-model" | |
| assert metadata["n_samples"] == 2 | |
| _ = MIN_SAMPLES_FOR_CIDER # — exposed re-export, exercised by import | |