MedGRPO Team committed on
Commit
4752404
·
1 Parent(s): 6edbd17
Files changed (3) hide show
  1. README.md +21 -1
  2. app.py +232 -0
  3. evaluation/eval_caption_llm_judge.py +65 -9
README.md CHANGED
@@ -150,10 +150,30 @@ The leaderboard supports **two formats** for submission:
150
 
151
  The system will:
152
  - Validate your file (format + sample count)
153
- - Run automatic evaluation (~5-10 minutes)
154
  - Extract metrics for all 8 tasks
155
  - Add your model to the leaderboard
156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  ## Evaluation Metrics
158
 
159
  ### Task-Specific Metrics
 
150
 
151
  The system will:
152
  - Validate your file (format + sample count)
153
+ - Run automatic evaluation (~2-5 minutes with `--skip-llm-judge`, ~10-20 minutes with LLM judge)
154
  - Extract metrics for all 8 tasks
155
  - Add your model to the leaderboard
156
 
157
+ **Note**: By default, DVC/VS/RC are evaluated with `--skip-llm-judge` for faster results (caption metrics will be 0.0). You can run LLM judge evaluation later using the button on the leaderboard page.
158
+
159
+ ### 4. Run LLM Judge Evaluation (Optional)
160
+
161
+ If your submission was evaluated with `--skip-llm-judge` (DVC_llm, VS_llm, RC_llm are all 0.0), you can compute these metrics later:
162
+
163
+ 1. Go to the **Leaderboard** tab
164
+ 2. Scroll to the **"Run LLM Judge Evaluation"** section
165
+ 3. Enter your model name (exact match)
166
+ 4. Click **"Run LLM Judge"**
167
+
168
+ The system will:
169
+ - Re-run evaluation for DVC/VS/RC tasks with LLM judge (GPT-4.1/Gemini)
170
+ - Update your leaderboard entry with caption metrics
171
+ - Preserve all other metrics (TAL, STG, NAP, SA, CVS)
172
+
173
+ **Time**: ~10-20 minutes depending on API rate limits
174
+
175
+ **Availability**: Only available when ALL three caption metrics are 0.0
176
+
177
  ## Evaluation Metrics
178
 
179
  ### Task-Specific Metrics
app.py CHANGED
@@ -1273,6 +1273,207 @@ def format_leaderboard_display(df: pd.DataFrame) -> pd.DataFrame:
1273
  return display_df
1274
 
1275
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1276
  # Create Gradio interface
1277
  with gr.Blocks(title="MedVidBench Leaderboard", theme=gr.themes.Soft()) as demo:
1278
 
@@ -1296,6 +1497,8 @@ with gr.Blocks(title="MedVidBench Leaderboard", theme=gr.themes.Soft()) as demo:
1296
  ### Current Rankings
1297
 
1298
  The leaderboard displays all submitted models ranked by their performance across 10 metrics on 8 medical video understanding tasks.
 
 
1299
  """)
1300
 
1301
  def load_and_format_leaderboard():
@@ -1336,6 +1539,35 @@ with gr.Blocks(title="MedVidBench Leaderboard", theme=gr.themes.Soft()) as demo:
1336
  outputs=[leaderboard_table, status_text]
1337
  )
1338
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1339
  # Tab 2: Submit
1340
  with gr.Tab("📤 Submit Results"):
1341
  gr.Markdown("""
 
1273
  return display_df
1274
 
1275
 
1276
def check_needs_llm_judge(model_name: str) -> Tuple[bool, str]:
    """
    Check whether a model is eligible for LLM judge re-evaluation.

    A model qualifies only when ALL three caption metrics (dvc_llm,
    vs_llm, rc_llm) are 0.0 — or missing/NaN, which indicates the model
    was originally evaluated with --skip-llm-judge.

    Args:
        model_name: Exact model name as it appears on the leaderboard.

    Returns:
        (needs_llm_judge, message): eligibility flag plus a human-readable
        explanation suitable for display in the UI.
    """
    df = load_leaderboard()

    if model_name not in df['model_name'].values:
        return False, f"Model '{model_name}' not found"

    model_row = df[df['model_name'] == model_name].iloc[0]

    # Missing columns or NaN cells are treated as 0.0 so entries saved
    # before these columns existed (or round-tripped through CSV with
    # blanks) remain eligible; NaN == 0.0 is False and would wrongly
    # report "already computed".
    def _metric(col: str) -> float:
        val = model_row.get(col, 0.0)
        return 0.0 if pd.isna(val) else float(val)

    dvc_llm = _metric('dvc_llm')
    vs_llm = _metric('vs_llm')
    rc_llm = _metric('rc_llm')

    if dvc_llm == 0.0 and vs_llm == 0.0 and rc_llm == 0.0:
        return True, "All caption metrics are 0.0, can run LLM judge"
    else:
        return False, "Caption metrics already computed"
1301
def run_llm_judge_evaluation(model_name: str, progress=gr.Progress()):
    """
    Run LLM judge evaluation for DVC/VS/RC tasks on a previously submitted model.

    This is a generator: it yields markdown status strings that Gradio
    streams into the output component (each yield REPLACES the displayed
    text, so related lines are combined into single yields).

    Steps:
        1. Verify the model exists and all caption metrics are 0.0.
        2. Re-run the evaluation wrapper WITHOUT --skip-llm-judge.
        3. Parse caption metrics from the output.
        4. Update the leaderboard entry (other metrics are untouched).

    Args:
        model_name: Name of the model to re-evaluate (exact leaderboard match).
        progress: Gradio progress tracker.

    Yields:
        Markdown status messages; the final yield is the result summary.
    """
    try:
        # Check eligibility first. Yield (not `return value`) the message:
        # a value returned from a generator is discarded by Gradio, so the
        # original `return f"❌ {msg}"` never reached the UI.
        needs_llm, msg = check_needs_llm_judge(model_name)
        if not needs_llm:
            yield f"❌ {msg}"
            return

        progress(0.1, desc="Loading predictions...")
        yield f"🔍 **Step 1/4**: Checking model predictions...\n\n"

        # Locate the predictions saved at submission time.
        model_dir = RESULTS_DIR / model_name.replace(" ", "_")
        input_file = model_dir / "input.json"

        if not input_file.exists():
            yield f"❌ Predictions file not found: {input_file}"
            return

        yield f"✓ Found predictions file\n\n"

        # Run evaluation WITH LLM judge.
        progress(0.2, desc="Running LLM judge evaluation...")
        yield (
            f"⚙️ **Step 2/4**: Running LLM judge evaluation (DVC/VS/RC)...\n\n"
            f"⏳ This may take 5-15 minutes depending on API rate limits...\n\n"
        )

        eval_wrapper = Path("evaluation/evaluate_predictions.py")

        cmd = [
            sys.executable,
            "-u",
            str(eval_wrapper),
            str(input_file),
            "--grouping", "overall",
            "--ground-truth", str(GROUND_TRUTH_FILE)
            # NOTE: No --skip-llm-judge flag, so LLM judge will run
        ]

        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,  # line-buffered so logs stream promptly
            env={**os.environ, "PYTHONUNBUFFERED": "1"}
        )

        # Stream subprocess logs into the UI roughly once per second.
        # NOTE(review): select.select on a pipe is POSIX-only; fine on a
        # Linux host (e.g. HF Spaces), would need a reader thread on Windows.
        import time
        import select

        log_buffer = []
        last_update = time.time()
        line_count = 0

        while True:
            if process.poll() is not None:
                # Process exited — drain any remaining buffered output.
                remaining = process.stdout.read()
                if remaining:
                    for line in remaining.split('\n'):
                        line = line.rstrip()
                        if line.strip() and 'WARNING: All log messages' not in line:
                            log_buffer.append(line)
                break

            ready, _, _ = select.select([process.stdout], [], [], 0.5)

            if ready:
                line = process.stdout.readline()
                if not line:
                    break

                line = line.rstrip()
                # Skip blank lines and noisy gRPC/absl log-spam headers.
                if not line.strip() or 'WARNING: All log messages' in line:
                    continue

                log_buffer.append(line)
                line_count += 1

                # Update UI at most once per second with the last 20 lines.
                if time.time() - last_update > 1.0:
                    if log_buffer:
                        recent = log_buffer[-20:]
                        log_text = f"⚙️ **Step 2/4**: Running LLM judge evaluation...\n\n```\n"
                        log_text += '\n'.join(recent)
                        log_text += "\n```"
                        yield log_text

                    last_update = time.time()
                    # Heuristic progress: creep from 0.2 to 0.8 as log lines arrive.
                    progress_val = min(0.8, 0.2 + (line_count / 200) * 0.60)
                    progress(progress_val, desc="Running LLM judge...")

        process.wait()

        if process.returncode != 0:
            yield f"\n❌ Evaluation failed (exit code {process.returncode})"
            return

        # Parse metrics from the captured output.
        progress(0.85, desc="Extracting metrics...")
        yield f"⚙️ **Step 3/4**: Extracting caption metrics...\n\n"

        full_output = '\n'.join(log_buffer)
        metrics = parse_evaluation_output(full_output)

        # Keep a copy of the raw evaluator output for debugging.
        with open(model_dir / "eval_output_llm_judge.txt", 'w') as f:
            f.write(full_output)

        dvc_llm = metrics.get('dvc_llm', 0.0)
        vs_llm = metrics.get('vs_llm', 0.0)
        rc_llm = metrics.get('rc_llm', 0.0)

        # All-zero after an LLM-judge run means parsing failed, not that
        # the model scored zero everywhere.
        if dvc_llm == 0.0 and vs_llm == 0.0 and rc_llm == 0.0:
            yield f"❌ Failed to extract caption metrics from evaluation output"
            return

        # Single combined yield — consecutive yields would overwrite each
        # other in the Markdown output, showing only the last fragment.
        yield (
            f"✓ Caption metrics extracted:\n"
            f"  - DVC_llm: {dvc_llm:.4f}\n"
            f"  - VS_llm: {vs_llm:.4f}\n"
            f"  - RC_llm: {rc_llm:.4f}\n\n"
        )

        # Update the leaderboard row in place; all other metrics preserved.
        progress(0.95, desc="Updating leaderboard...")
        yield f"⚙️ **Step 4/4**: Updating leaderboard...\n\n"

        df = load_leaderboard()

        mask = df['model_name'] == model_name
        df.loc[mask, 'dvc_llm'] = round(dvc_llm, 4)
        df.loc[mask, 'vs_llm'] = round(vs_llm, 4)
        df.loc[mask, 'rc_llm'] = round(rc_llm, 4)

        # Re-sort by first metric
        df = df.sort_values('cvs_acc', ascending=False).reset_index(drop=True)

        save_leaderboard(df)

        progress(1.0, desc="Complete!")

        success_msg = f"""
---

## ✅ LLM Judge Evaluation Complete!

**Model**: {model_name}

### 📈 Updated Caption Metrics
- **DVC_llm**: {dvc_llm:.4f}
- **VS_llm**: {vs_llm:.4f}
- **RC_llm**: {rc_llm:.4f}

✓ Leaderboard updated successfully!

Refresh the Leaderboard tab to see updated rankings.
"""
        yield success_msg

    except Exception as e:
        yield f"❌ Error running LLM judge evaluation: {str(e)}"
1477
  # Create Gradio interface
1478
  with gr.Blocks(title="MedVidBench Leaderboard", theme=gr.themes.Soft()) as demo:
1479
 
 
1497
  ### Current Rankings
1498
 
1499
  The leaderboard displays all submitted models ranked by their performance across 10 metrics on 8 medical video understanding tasks.
1500
+
1501
+ **Note**: Models with all caption metrics (DVC_llm, VS_llm, RC_llm) at 0.0 can be re-evaluated with LLM judge using the section below.
1502
  """)
1503
 
1504
  def load_and_format_leaderboard():
 
1539
  outputs=[leaderboard_table, status_text]
1540
  )
1541
 
1542
+ # LLM Judge Evaluation Section
1543
+ gr.Markdown("""
1544
+ ---
1545
+
1546
+ ### 🤖 Run LLM Judge Evaluation
1547
+
1548
+ If a model was submitted with `--skip-llm-judge` (caption metrics are 0.0), you can run LLM judge evaluation here.
1549
+ This will compute DVC_llm, VS_llm, and RC_llm scores using GPT-4.1/Gemini.
1550
+
1551
+ **Note**: This feature is only available when ALL three caption metrics (DVC_llm, VS_llm, RC_llm) are 0.0.
1552
+ """)
1553
+
1554
+ with gr.Row():
1555
+ llm_judge_model_input = gr.Textbox(
1556
+ label="Model Name",
1557
+ placeholder="Enter exact model name from leaderboard",
1558
+ scale=3
1559
+ )
1560
+ run_llm_judge_btn = gr.Button("🚀 Run LLM Judge", variant="primary", scale=1)
1561
+
1562
+ llm_judge_output = gr.Markdown(label="LLM Judge Status")
1563
+
1564
+ # Wire up LLM judge evaluation
1565
+ run_llm_judge_btn.click(
1566
+ fn=run_llm_judge_evaluation,
1567
+ inputs=[llm_judge_model_input],
1568
+ outputs=llm_judge_output
1569
+ )
1570
+
1571
  # Tab 2: Submit
1572
  with gr.Tab("📤 Submit Results"):
1573
  gr.Markdown("""
evaluation/eval_caption_llm_judge.py CHANGED
@@ -90,8 +90,20 @@ R4: [score]
90
  return prompt
91
 
92
 
93
- def call_llm_judge_api(prediction: str, ground_truth: str, task_type: str, api_key: str, max_retries=3) -> dict:
94
- """Call OpenAI API to evaluate a caption pair."""
 
 
 
 
 
 
 
 
 
 
 
 
95
  global completed_calls, total_calls
96
 
97
  if not OPENAI_AVAILABLE:
@@ -101,12 +113,15 @@ def call_llm_judge_api(prediction: str, ground_truth: str, task_type: str, api_k
101
  client = OpenAI(api_key=api_key)
102
  prompt = create_llm_judge_prompt(prediction, ground_truth, task_type)
103
 
 
 
104
  for attempt in range(max_retries):
105
  try:
106
  response = client.chat.completions.create(
107
  model="gpt-4o-2024-11-20", # Latest GPT-4 model
108
  messages=[{"role": "user", "content": prompt}],
109
  temperature=0.0,
 
110
  )
111
 
112
  raw_response = response.choices[0].message.content
@@ -130,23 +145,49 @@ def call_llm_judge_api(prediction: str, ground_truth: str, task_type: str, api_k
130
 
131
  return scores
132
  else:
 
 
133
  if attempt < max_retries - 1:
134
- time.sleep(1)
 
 
135
  continue
136
 
137
  except Exception as e:
 
 
 
 
 
 
 
 
138
  if attempt < max_retries - 1:
139
- time.sleep(2)
 
 
 
 
 
 
 
 
 
 
 
140
  continue
 
 
 
141
 
142
  except Exception as e:
143
- print(f" LLM Judge API error: {e}")
144
 
145
- # Failed
146
  with progress_lock:
147
  completed_calls += 1
148
 
149
- return {aspect: 0 for aspect in BEST5_ASPECTS} | {'api_success': False}
150
 
151
 
152
  def run_llm_judge_evaluation(results_data, task_type, api_key):
@@ -208,6 +249,10 @@ def run_llm_judge_evaluation(results_data, task_type, api_key):
208
 
209
  all_scores = defaultdict(list)
210
  api_successes = []
 
 
 
 
211
 
212
  with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
213
  futures = {executor.submit(call_llm_judge_api, pred, gt, task_type, api_key): i
@@ -221,8 +266,13 @@ def run_llm_judge_evaluation(results_data, task_type, api_key):
221
  api_successes.append(True)
222
  else:
223
  api_successes.append(False)
 
 
224
 
225
  if not all_scores:
 
 
 
226
  return None
227
 
228
  # Compute averages
@@ -230,8 +280,14 @@ def run_llm_judge_evaluation(results_data, task_type, api_key):
230
  overall_average = np.mean(list(aspect_averages.values()))
231
 
232
  success_rate = np.mean(api_successes) if api_successes else 0.0
233
-
234
- print(f"✓ LLM Judge completed: {sum(api_successes)}/{len(caption_pairs)} successful API calls")
 
 
 
 
 
 
235
 
236
  return {
237
  'average_score': overall_average,
 
90
  return prompt
91
 
92
 
93
+ def call_llm_judge_api(prediction: str, ground_truth: str, task_type: str, api_key: str, max_retries=5) -> dict:
94
+ """
95
+ Call OpenAI API to evaluate a caption pair with retry logic.
96
+
97
+ Args:
98
+ prediction: Model's prediction text
99
+ ground_truth: Ground truth text
100
+ task_type: Task type (dense_captioning, video_summary, region_caption)
101
+ api_key: OpenAI API key
102
+ max_retries: Maximum number of retry attempts (default: 5)
103
+
104
+ Returns:
105
+ dict: Scores for each aspect + api_success flag
106
+ """
107
  global completed_calls, total_calls
108
 
109
  if not OPENAI_AVAILABLE:
 
113
  client = OpenAI(api_key=api_key)
114
  prompt = create_llm_judge_prompt(prediction, ground_truth, task_type)
115
 
116
+ last_error = None
117
+
118
  for attempt in range(max_retries):
119
  try:
120
  response = client.chat.completions.create(
121
  model="gpt-4o-2024-11-20", # Latest GPT-4 model
122
  messages=[{"role": "user", "content": prompt}],
123
  temperature=0.0,
124
+ timeout=30.0 # 30 second timeout per request
125
  )
126
 
127
  raw_response = response.choices[0].message.content
 
145
 
146
  return scores
147
  else:
148
+ # Failed to parse all scores - retry
149
+ last_error = f"Incomplete parsing: got {len(scores)}/{len(BEST5_ASPECTS)} scores"
150
  if attempt < max_retries - 1:
151
+ wait_time = min(2 ** attempt, 16) # Exponential backoff: 1, 2, 4, 8, 16 seconds
152
+ print(f" ⚠ {last_error}, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
153
+ time.sleep(wait_time)
154
  continue
155
 
156
  except Exception as e:
157
+ last_error = str(e)
158
+ error_type = type(e).__name__
159
+
160
+ # Determine if error is retryable
161
+ is_rate_limit = 'rate_limit' in last_error.lower() or 'RateLimitError' in error_type
162
+ is_timeout = 'timeout' in last_error.lower() or 'TimeoutError' in error_type
163
+ is_network = 'connection' in last_error.lower() or 'ConnectionError' in error_type
164
+
165
  if attempt < max_retries - 1:
166
+ # Exponential backoff with longer waits for rate limits
167
+ if is_rate_limit:
168
+ wait_time = min(2 ** (attempt + 2), 60) # 4, 8, 16, 32, 60 seconds for rate limits
169
+ print(f" ⚠ Rate limit hit, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
170
+ elif is_timeout or is_network:
171
+ wait_time = min(2 ** attempt, 16)
172
+ print(f" ⚠ {error_type}, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
173
+ else:
174
+ wait_time = min(2 ** attempt, 16)
175
+ print(f" ⚠ API error: {error_type}, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
176
+
177
+ time.sleep(wait_time)
178
  continue
179
+ else:
180
+ # Last attempt failed
181
+ print(f" ❌ API call failed after {max_retries} attempts: {error_type}")
182
 
183
  except Exception as e:
184
+ print(f" LLM Judge API error (client setup): {e}")
185
 
186
+ # Failed after all retries
187
  with progress_lock:
188
  completed_calls += 1
189
 
190
+ return {aspect: 0 for aspect in BEST5_ASPECTS} | {'api_success': False, 'error': last_error}
191
 
192
 
193
  def run_llm_judge_evaluation(results_data, task_type, api_key):
 
249
 
250
  all_scores = defaultdict(list)
251
  api_successes = []
252
+ api_failures = []
253
+
254
+ print(f" Running {total_calls} API calls with {MAX_WORKERS} parallel workers...")
255
+ print(f" Max retries per call: 5 (with exponential backoff)")
256
 
257
  with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
258
  futures = {executor.submit(call_llm_judge_api, pred, gt, task_type, api_key): i
 
266
  api_successes.append(True)
267
  else:
268
  api_successes.append(False)
269
+ if 'error' in result:
270
+ api_failures.append(result['error'])
271
 
272
  if not all_scores:
273
+ print(f"❌ All API calls failed")
274
+ if api_failures:
275
+ print(f" Sample errors: {api_failures[:3]}")
276
  return None
277
 
278
  # Compute averages
 
280
  overall_average = np.mean(list(aspect_averages.values()))
281
 
282
  success_rate = np.mean(api_successes) if api_successes else 0.0
283
+ num_successes = sum(api_successes)
284
+ num_failures = len(api_successes) - num_successes
285
+
286
+ print(f"✓ LLM Judge completed: {num_successes}/{len(caption_pairs)} successful API calls")
287
+ if num_failures > 0:
288
+ print(f" ⚠ {num_failures} calls failed after all retries")
289
+ if api_failures:
290
+ print(f" Sample errors: {api_failures[:3]}")
291
 
292
  return {
293
  'average_score': overall_average,