feat(eval): add METEOR + optional LLM-as-judge for VQA scoring

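METEOR (WordNet synonym/stem-aware) supplements BLEU/ROUGE to tolerate
radiology phrasing variance, and is computed for both report and VQA tasks.
VQA scoring additionally gains BERTScore. LLM-as-judge is opt-in via
--llm_judge (VQA only; defaults to gpt-4o-mini, with --llm_judge_base_url
for other OpenAI-compatible endpoints such as Gemini or Ollama; Anthropic
models are not supported through this path), with --llm_judge_max_samples
for cost control.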

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

Files changed (3) hide show

evaluation/evaluate.py +17 -0
evaluation/metrics.py +185 -1
requirements.txt +1 -0

evaluation/evaluate.py CHANGED Viewed

@@ -89,6 +89,18 @@ def parse_args():
                              "If unset, resolved from state file.")
     parser.add_argument("--no_hf_upload", action="store_true",
                         help="Disable HuggingFace Hub upload of predictions/metrics.")
     return parser.parse_args()
@@ -276,6 +288,11 @@ def main():
             task          = task,
             chexbert_path = args.chexbert_path,
             device        = args.device,
         )
         print_results(metrics, task)

                              "If unset, resolved from state file.")
     parser.add_argument("--no_hf_upload", action="store_true",
                         help="Disable HuggingFace Hub upload of predictions/metrics.")
+    # ── LLM-as-judge (VQA only) ─────────────────────────────────────────────
+    parser.add_argument("--llm_judge", action="store_true",
+                        help="Enable LLM-as-judge semantic scoring for VQA. "
+                             "Requires OPENAI_API_KEY (or compatible).")
+    parser.add_argument("--llm_judge_model", type=str, default="gpt-4o-mini",
+                        help="Judge model name. Default: gpt-4o-mini "
+                             "(~$0.30 / 2k VQA samples).")
+    parser.add_argument("--llm_judge_base_url", type=str, default=None,
+                        help="Override base URL for non-OpenAI providers "
+                             "(e.g. Gemini OpenAI-compat endpoint).")
+    parser.add_argument("--llm_judge_max_samples", type=int, default=None,
+                        help="Cap number of samples sent to the judge (cost control).")
     return parser.parse_args()
             task          = task,
             chexbert_path = args.chexbert_path,
             device        = args.device,
+            questions             = predictions.get("questions"),
+            llm_judge             = args.llm_judge and task == "vqa",
+            llm_judge_model       = args.llm_judge_model,
+            llm_judge_base_url    = args.llm_judge_base_url,
+            llm_judge_max_samples = args.llm_judge_max_samples,
         )
         print_results(metrics, task)

evaluation/metrics.py CHANGED Viewed

@@ -14,17 +14,39 @@ Evaluation metrics for 3 tasks:
     - Accuracy (exact match)
     - Token-level F1
     - BLEU-1 (for open-ended answers)
 """
 import re
 from typing import List, Dict, Optional, Tuple
 import torch
 import numpy as np
 from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction
 from rouge_score import rouge_scorer
 # ─── NLG Metrics ─────────────────────────────────────────────────────────────
 def compute_bleu(
@@ -82,6 +104,39 @@ def compute_rouge(
     }
 def compute_bertscore(
     hypotheses: List[str],
     references: List[str],
@@ -228,6 +283,114 @@ def _token_f1(prediction: str, ground_truth: str) -> float:
     return f1
 # ─── Master Evaluation Function ──────────────────────────────────────────────
 def evaluate_all(
@@ -236,6 +399,11 @@ def evaluate_all(
     task:          str,
     chexbert_path: Optional[str] = None,
     device:        str = "cpu",
 ) -> Dict[str, float]:
     """
     Compute all relevant metrics for a given task.
@@ -243,8 +411,11 @@ def evaluate_all(
     Args:
         hypotheses:    model-generated texts
         references:    ground truth texts
-        task:          "findings" | "impression" | "vqa"
         chexbert_path: for clinical F1 (optional)
     Returns:
         Dict of metric_name → score
@@ -256,14 +427,27 @@ def evaluate_all(
     if task in ("findings", "impression", "report"):
         results.update(compute_bleu(hypotheses, references))
         results.update(compute_rouge(hypotheses, references))
         results.update(compute_bertscore(hypotheses, references, device=device))
         results.update(compute_clinical_f1(
             hypotheses, references, chexbert_path, device
         ))
     elif task == "vqa":
         results.update(compute_vqa_accuracy(hypotheses, references))
         results.update(compute_bleu(hypotheses, references))
     return results

     - Accuracy (exact match)
     - Token-level F1
     - BLEU-1 (for open-ended answers)
+    - METEOR (synonym + stem aware)
+    - BERTScore (semantic similarity)
+    - LLM-as-Judge (optional, GPT/Claude/Gemini for clinical semantic eval)
 """
+import os
 import re
+import json
+import time
 from typing import List, Dict, Optional, Tuple
 import torch
 import numpy as np
 from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction
+from nltk.translate.meteor_score import meteor_score as nltk_meteor
 from rouge_score import rouge_scorer
+# Ensure the NLTK data packages used by the METEOR code path are installed
+# (wordnet, omw-1.4, punkt). Intended to be cheap to call repeatedly: a
+# download is only attempted for packages that cannot be located locally.
+def _ensure_nltk_data():
+    """
+    Download any NLTK resource from the list below that is not found locally.
+    Each resource is looked up both as an unpacked directory and as a .zip
+    archive. NOTE(review): a corpus that is stored only as a .zip under the
+    NLTK data directory may not resolve through its bare directory name, in
+    which case the directory-only lookup would raise LookupError and attempt
+    a download on every call — confirm against the installed NLTK version.
+    Returns:
+        None. Download failures are not raised here (quiet=True); a missing
+        resource surfaces later when METEOR tries to load it.
+    """
+    import nltk
+    for pkg, path in [
+        ("wordnet",   "corpora/wordnet"),
+        ("omw-1.4",   "corpora/omw-1.4"),
+        ("punkt",     "tokenizers/punkt"),
+    ]:
+        # Accept either on-disk form before falling back to a download.
+        for candidate in (path, path + ".zip"):
+            try:
+                nltk.data.find(candidate)
+            except LookupError:
+                continue
+            break
+        else:
+            # Neither form was found locally.
+            nltk.download(pkg, quiet=True)
 # ─── NLG Metrics ─────────────────────────────────────────────────────────────
 def compute_bleu(
     }
+def compute_meteor(
+    hypotheses: List[str],
+    references: List[str],
+) -> Dict[str, float]:
+    """
+    Compute the mean per-pair METEOR score over (hypothesis, reference) pairs.
+    Each pair is scored independently with NLTK's meteor_score and the
+    scores are averaged; this is a mean of segment-level scores, not a
+    corpus-level aggregate.
+    METEOR improves over BLEU by:
+      - Matching synonyms via WordNet ("big" ↔ "large")
+      - Matching stems ("enlarged" ↔ "enlarging")
+      - Balancing precision + recall (weighted F-mean)
+      - Penalizing fragmented matches (chunk penalty)
+    Especially useful for radiology where paraphrasing is common.
+    Args:
+        hypotheses: model-generated texts
+        references: ground truth texts, parallel to hypotheses. Pairs are
+                    formed with zip(), so extra items in the longer list
+                    are ignored.
+    Returns:
+        {"meteor": float} rounded to 4 decimals; 0.0 when there are no pairs.
+    """
+    _ensure_nltk_data()
+    pair_scores = []
+    for hyp, ref in zip(hypotheses, references):
+        # Tokenization is lowercase + whitespace split, as NLTK's METEOR
+        # expects pre-tokenized input.
+        ref_tokens = ref.lower().split()
+        hyp_tokens = hyp.lower().split()
+        if hyp_tokens and ref_tokens:
+            # nltk_meteor takes a list of references (here just one)
+            pair_scores.append(nltk_meteor([ref_tokens], hyp_tokens))
+        else:
+            # An empty hypothesis or reference scores 0 instead of being skipped.
+            pair_scores.append(0.0)
+    mean_score = float(np.mean(pair_scores)) if pair_scores else 0.0
+    return {"meteor": round(mean_score, 4)}
 def compute_bertscore(
     hypotheses: List[str],
     references: List[str],
     return f1
+# ─── LLM-as-Judge (semantic correctness via an OpenAI-compatible API) ────────
+# Prompt template for the judge model. compute_llm_judge() fills it with
+# str.format() using the fields {question}, {reference} and {hypothesis}.
+# The doubled braces around the JSON example are str.format() escapes and
+# render as single braces in the final prompt.
+_LLM_JUDGE_PROMPT = """You are a clinical evaluator for chest X-ray VQA.
+Judge whether the predicted answer is semantically equivalent to the ground
+truth in a medical context. Be tolerant of synonyms ("cardiomegaly" =
+"enlarged heart"), paraphrases, and extra/missing function words. Penalize
+contradictions (e.g. negating a positive finding) or clinically wrong
+content.
+Question:     {question}
+Ground truth: {reference}
+Prediction:   {hypothesis}
+Reply with ONLY a JSON object of the form: {{"score": <0-5 integer>, "reason": "<one short sentence>"}}
+Scoring rubric:
+  5 = clinically equivalent
+  4 = mostly correct, minor omission
+  3 = partially correct
+  2 = mostly incorrect
+  1 = wrong but on topic
+  0 = contradicts ground truth / unrelated"""
+def compute_llm_judge(
+    hypotheses: List[str],
+    references: List[str],
+    questions:  Optional[List[str]] = None,
+    model:      str = "gpt-4o-mini",
+    api_key:    Optional[str] = None,
+    base_url:   Optional[str] = None,
+    max_samples: Optional[int] = None,
+    sleep_s:    float = 0.0,
+) -> Dict[str, float]:
+    """
+    Score (hyp, ref) pairs with an LLM judge (OpenAI-compatible API).
+    Defaults to OpenAI's gpt-4o-mini (~$0.30 per 2k VQA samples).
+    For free alternatives, pass:
+      - Gemini   : base_url="https://generativelanguage.googleapis.com/v1beta/openai/", model="gemini-1.5-flash"
+      - Local    : base_url="http://localhost:11434/v1" (Ollama), model="llama3.1"
+      - Anthropic: needs separate SDK — not supported via this OpenAI-compatible path.
+    Each sample is sent as one chat-completion request asking for a JSON
+    object with an integer "score"; scores are clamped to 0..5. A sample
+    whose request fails, whose reply is not valid JSON, or whose reply has
+    no usable "score" is reported on stdout and excluded from the mean, so
+    a judge failure is never counted as a score of 0.
+    Args:
+        hypotheses, references, questions: parallel lists. Only indices
+                      present in both hypotheses and references are judged;
+                      a missing or empty question is sent as "(not provided)".
+        model:        judge model name
+        api_key:      defaults to env var OPENAI_API_KEY
+        base_url:     override for non-OpenAI providers
+        max_samples:  cap evaluation cost (e.g. 200) — useful for sanity checks.
+                      The first max_samples pairs are used; values below 0
+                      are treated as 0.
+        sleep_s:      delay between calls to dodge rate limits
+    Returns:
+        {"llm_judge_mean": float (0-5), "llm_judge_norm": float (0-1),
+         "llm_judge_n":    int}
+        llm_judge_n is the number of samples actually scored. All three
+        values are 0 when the judge is skipped (openai not installed, no
+        API key) or when no sample could be scored.
+    """
+    skipped = {"llm_judge_mean": 0.0, "llm_judge_norm": 0.0, "llm_judge_n": 0}
+    try:
+        from openai import OpenAI
+    except ImportError:
+        print("[WARNING] openai package not installed. Skipping LLM-judge.")
+        return skipped
+    api_key = api_key or os.environ.get("OPENAI_API_KEY")
+    if not api_key:
+        print("[WARNING] OPENAI_API_KEY not set. Skipping LLM-judge.")
+        return skipped
+    client = OpenAI(api_key=api_key, base_url=base_url) if base_url else OpenAI(api_key=api_key)
+    # Bound by the shorter list so a length mismatch cannot raise IndexError
+    # part-way through a run that has already made paid API calls.
+    n = min(len(hypotheses), len(references))
+    if max_samples is not None:
+        n = max(0, min(n, max_samples))
+    questions = questions or []
+    scores = []
+    for i in range(n):
+        # Tolerate a questions list shorter than the judged range.
+        question = questions[i] if i < len(questions) else ""
+        prompt = _LLM_JUDGE_PROMPT.format(
+            question   = question or "(not provided)",
+            reference  = references[i],
+            hypothesis = hypotheses[i],
+        )
+        try:
+            resp = client.chat.completions.create(
+                model       = model,
+                messages    = [{"role": "user", "content": prompt}],
+                temperature = 0.0,
+                max_tokens  = 80,
+                response_format = {"type": "json_object"},
+            )
+            raw = resp.choices[0].message.content.strip()
+            data = json.loads(raw)
+            # A reply without a score is a failed judgement, not a 0: on the
+            # rubric 0 means "contradicts ground truth / unrelated".
+            if not isinstance(data, dict) or "score" not in data:
+                raise ValueError('judge reply has no "score" field')
+            score = int(data["score"])
+            scores.append(max(0, min(5, score)))
+        except Exception as e:
+            # Best-effort per sample: report and continue with the next one.
+            print(f"[LLM-judge] sample {i} failed: {e}")
+        if sleep_s > 0:
+            time.sleep(sleep_s)
+    if len(scores) < n:
+        print(f"[LLM-judge] scored {len(scores)}/{n} samples; "
+              "failed samples are excluded from the mean.")
+    if not scores:
+        return skipped
+    mean = float(np.mean(scores))
+    return {
+        "llm_judge_mean": round(mean, 4),
+        "llm_judge_norm": round(mean / 5.0, 4),   # 0..1 for easy comparison
+        "llm_judge_n":    len(scores),
+    }
 # ─── Master Evaluation Function ──────────────────────────────────────────────
 def evaluate_all(
     task:          str,
     chexbert_path: Optional[str] = None,
     device:        str = "cpu",
+    questions:     Optional[List[str]] = None,
+    llm_judge:     bool = False,
+    llm_judge_model: str = "gpt-4o-mini",
+    llm_judge_base_url: Optional[str] = None,
+    llm_judge_max_samples: Optional[int] = None,
 ) -> Dict[str, float]:
     """
     Compute all relevant metrics for a given task.
     Args:
         hypotheses:    model-generated texts
         references:    ground truth texts
+        task:          "findings" | "impression" | "report" | "vqa"
         chexbert_path: for clinical F1 (optional)
+        questions:     VQA questions (passed to LLM judge for context)
+        llm_judge:     if True, also run GPT/Claude/Gemini as a semantic judge
+                       (requires OPENAI_API_KEY or compatible endpoint)
     Returns:
         Dict of metric_name → score
     if task in ("findings", "impression", "report"):
         results.update(compute_bleu(hypotheses, references))
         results.update(compute_rouge(hypotheses, references))
+        results.update(compute_meteor(hypotheses, references))
         results.update(compute_bertscore(hypotheses, references, device=device))
         results.update(compute_clinical_f1(
             hypotheses, references, chexbert_path, device
         ))
     elif task == "vqa":
+        # Lexical
         results.update(compute_vqa_accuracy(hypotheses, references))
         results.update(compute_bleu(hypotheses, references))
+        results.update(compute_meteor(hypotheses, references))
+        # Semantic
+        results.update(compute_bertscore(hypotheses, references, device=device))
+        if llm_judge:
+            results.update(compute_llm_judge(
+                hypotheses, references,
+                questions    = questions,
+                model        = llm_judge_model,
+                base_url     = llm_judge_base_url,
+                max_samples  = llm_judge_max_samples,
+            ))
     return results

requirements.txt CHANGED Viewed

@@ -12,6 +12,7 @@ wandb==0.16.0
 rouge-score==0.1.2
 nltk==3.8.1
 bert-score==0.3.13
 scikit-learn==1.3.2
 pandas==2.1.0
 numpy==1.24.0

 rouge-score==0.1.2
 nltk==3.8.1
 bert-score==0.3.13
+openai>=1.30.0  # optional: LLM-as-judge for VQA (also works with Gemini/Ollama via base_url)
 scikit-learn==1.3.2
 pandas==2.1.0
 numpy==1.24.0