MedGRPO Team committed on
Commit
4752404
·
1 Parent(s): 6edbd17
Files changed (3) hide show
  1. README.md +21 -1
  2. app.py +232 -0
  3. evaluation/eval_caption_llm_judge.py +65 -9
README.md CHANGED
@@ -150,10 +150,30 @@ The leaderboard supports **two formats** for submission:
150
 
151
  The system will:
152
  - Validate your file (format + sample count)
153
- - Run automatic evaluation (~5-10 minutes)
154
  - Extract metrics for all 8 tasks
155
  - Add your model to the leaderboard
156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  ## Evaluation Metrics
158
 
159
  ### Task-Specific Metrics
 
150
 
151
  The system will:
152
  - Validate your file (format + sample count)
153
+ - Run automatic evaluation (~2-5 minutes with `--skip-llm-judge`, ~10-20 minutes with LLM judge)
154
  - Extract metrics for all 8 tasks
155
  - Add your model to the leaderboard
156
 
157
+ **Note**: By default, DVC/VS/RC are evaluated with `--skip-llm-judge` for faster results (caption metrics will be 0.0). You can run LLM judge evaluation later using the button on the leaderboard page.
158
+
159
+ ### 4. Run LLM Judge Evaluation (Optional)
160
+
161
+ If your submission was evaluated with `--skip-llm-judge` (DVC_llm, VS_llm, RC_llm are all 0.0), you can compute these metrics later:
162
+
163
+ 1. Go to the **Leaderboard** tab
164
+ 2. Scroll to the **"Run LLM Judge Evaluation"** section
165
+ 3. Enter your model name (exact match)
166
+ 4. Click **"Run LLM Judge"**
167
+
168
+ The system will:
169
+ - Re-run evaluation for DVC/VS/RC tasks with LLM judge (GPT-4.1/Gemini)
170
+ - Update your leaderboard entry with caption metrics
171
+ - Preserve all other metrics (TAL, STG, NAP, SA, CVS)
172
+
173
+ **Time**: ~10-20 minutes depending on API rate limits
174
+
175
+ **Availability**: Only available when ALL three caption metrics are 0.0
176
+
177
  ## Evaluation Metrics
178
 
179
  ### Task-Specific Metrics
app.py CHANGED
@@ -1273,6 +1273,207 @@ def format_leaderboard_display(df: pd.DataFrame) -> pd.DataFrame:
1273
  return display_df
1274
 
1275
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1276
  # Create Gradio interface
1277
  with gr.Blocks(title="MedVidBench Leaderboard", theme=gr.themes.Soft()) as demo:
1278
 
@@ -1296,6 +1497,8 @@ with gr.Blocks(title="MedVidBench Leaderboard", theme=gr.themes.Soft()) as demo:
1296
  ### Current Rankings
1297
 
1298
  The leaderboard displays all submitted models ranked by their performance across 10 metrics on 8 medical video understanding tasks.
 
 
1299
  """)
1300
 
1301
  def load_and_format_leaderboard():
@@ -1336,6 +1539,35 @@ with gr.Blocks(title="MedVidBench Leaderboard", theme=gr.themes.Soft()) as demo:
1336
  outputs=[leaderboard_table, status_text]
1337
  )
1338
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1339
  # Tab 2: Submit
1340
  with gr.Tab("📤 Submit Results"):
1341
  gr.Markdown("""
 
1273
  return display_df
1274
 
1275
 
1276
def check_needs_llm_judge(model_name: str) -> Tuple[bool, str]:
    """
    Check whether a model is eligible for LLM judge re-evaluation.

    A model qualifies only when ALL three caption metrics (dvc_llm,
    vs_llm, rc_llm) are 0.0 — or missing/NaN, which indicates the model
    was originally evaluated with --skip-llm-judge.

    Args:
        model_name: Exact model name as it appears on the leaderboard.

    Returns:
        (needs_llm_judge, message): eligibility flag plus a human-readable
        explanation suitable for display in the UI.
    """
    df = load_leaderboard()

    if model_name not in df['model_name'].values:
        return False, f"Model '{model_name}' not found"

    model_row = df[df['model_name'] == model_name].iloc[0]

    # Missing columns or NaN cells are treated as 0.0 so entries saved
    # before these columns existed (or round-tripped through CSV with
    # blanks) remain eligible; NaN == 0.0 is False and would wrongly
    # report "already computed".
    def _metric(col: str) -> float:
        val = model_row.get(col, 0.0)
        return 0.0 if pd.isna(val) else float(val)

    dvc_llm = _metric('dvc_llm')
    vs_llm = _metric('vs_llm')
    rc_llm = _metric('rc_llm')

    if dvc_llm == 0.0 and vs_llm == 0.0 and rc_llm == 0.0:
        return True, "All caption metrics are 0.0, can run LLM judge"
    else:
        return False, "Caption metrics already computed"
1301
def run_llm_judge_evaluation(model_name: str, progress=gr.Progress()):
    """
    Run LLM judge evaluation for DVC/VS/RC tasks on a previously submitted model.

    This is a generator: it yields markdown status strings that Gradio
    streams into the output component (each yield REPLACES the displayed
    text, so related lines are combined into single yields).

    Steps:
        1. Verify the model exists and all caption metrics are 0.0.
        2. Re-run the evaluation wrapper WITHOUT --skip-llm-judge.
        3. Parse caption metrics from the output.
        4. Update the leaderboard entry (other metrics are untouched).

    Args:
        model_name: Name of the model to re-evaluate (exact leaderboard match).
        progress: Gradio progress tracker.

    Yields:
        Markdown status messages; the final yield is the result summary.
    """
    try:
        # Check eligibility first. Yield (not `return value`) the message:
        # a value returned from a generator is discarded by Gradio, so the
        # original `return f"❌ {msg}"` never reached the UI.
        needs_llm, msg = check_needs_llm_judge(model_name)
        if not needs_llm:
            yield f"❌ {msg}"
            return

        progress(0.1, desc="Loading predictions...")
        yield f"🔍 **Step 1/4**: Checking model predictions...\n\n"

        # Locate the predictions saved at submission time.
        model_dir = RESULTS_DIR / model_name.replace(" ", "_")
        input_file = model_dir / "input.json"

        if not input_file.exists():
            yield f"❌ Predictions file not found: {input_file}"
            return

        yield f"✓ Found predictions file\n\n"

        # Run evaluation WITH LLM judge.
        progress(0.2, desc="Running LLM judge evaluation...")
        yield (
            f"⚙️ **Step 2/4**: Running LLM judge evaluation (DVC/VS/RC)...\n\n"
            f"⏳ This may take 5-15 minutes depending on API rate limits...\n\n"
        )

        eval_wrapper = Path("evaluation/evaluate_predictions.py")

        cmd = [
            sys.executable,
            "-u",
            str(eval_wrapper),
            str(input_file),
            "--grouping", "overall",
            "--ground-truth", str(GROUND_TRUTH_FILE)
            # NOTE: No --skip-llm-judge flag, so LLM judge will run
        ]

        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,  # line-buffered so logs stream promptly
            env={**os.environ, "PYTHONUNBUFFERED": "1"}
        )

        # Stream subprocess logs into the UI roughly once per second.
        # NOTE(review): select.select on a pipe is POSIX-only; fine on a
        # Linux host (e.g. HF Spaces), would need a reader thread on Windows.
        import time
        import select

        log_buffer = []
        last_update = time.time()
        line_count = 0

        while True:
            if process.poll() is not None:
                # Process exited — drain any remaining buffered output.
                remaining = process.stdout.read()
                if remaining:
                    for line in remaining.split('\n'):
                        line = line.rstrip()
                        if line.strip() and 'WARNING: All log messages' not in line:
                            log_buffer.append(line)
                break

            ready, _, _ = select.select([process.stdout], [], [], 0.5)

            if ready:
                line = process.stdout.readline()
                if not line:
                    break

                line = line.rstrip()
                # Skip blank lines and noisy gRPC/absl log-spam headers.
                if not line.strip() or 'WARNING: All log messages' in line:
                    continue

                log_buffer.append(line)
                line_count += 1

                # Update UI at most once per second with the last 20 lines.
                if time.time() - last_update > 1.0:
                    if log_buffer:
                        recent = log_buffer[-20:]
                        log_text = f"⚙️ **Step 2/4**: Running LLM judge evaluation...\n\n```\n"
                        log_text += '\n'.join(recent)
                        log_text += "\n```"
                        yield log_text

                    last_update = time.time()
                    # Heuristic progress: creep from 0.2 to 0.8 as log lines arrive.
                    progress_val = min(0.8, 0.2 + (line_count / 200) * 0.60)
                    progress(progress_val, desc="Running LLM judge...")

        process.wait()

        if process.returncode != 0:
            yield f"\n❌ Evaluation failed (exit code {process.returncode})"
            return

        # Parse metrics from the captured output.
        progress(0.85, desc="Extracting metrics...")
        yield f"⚙️ **Step 3/4**: Extracting caption metrics...\n\n"

        full_output = '\n'.join(log_buffer)
        metrics = parse_evaluation_output(full_output)

        # Keep a copy of the raw evaluator output for debugging.
        with open(model_dir / "eval_output_llm_judge.txt", 'w') as f:
            f.write(full_output)

        dvc_llm = metrics.get('dvc_llm', 0.0)
        vs_llm = metrics.get('vs_llm', 0.0)
        rc_llm = metrics.get('rc_llm', 0.0)

        # All-zero after an LLM-judge run means parsing failed, not that
        # the model scored zero everywhere.
        if dvc_llm == 0.0 and vs_llm == 0.0 and rc_llm == 0.0:
            yield f"❌ Failed to extract caption metrics from evaluation output"
            return

        # Single combined yield — consecutive yields would overwrite each
        # other in the Markdown output, showing only the last fragment.
        yield (
            f"✓ Caption metrics extracted:\n"
            f"  - DVC_llm: {dvc_llm:.4f}\n"
            f"  - VS_llm: {vs_llm:.4f}\n"
            f"  - RC_llm: {rc_llm:.4f}\n\n"
        )

        # Update the leaderboard row in place; all other metrics preserved.
        progress(0.95, desc="Updating leaderboard...")
        yield f"⚙️ **Step 4/4**: Updating leaderboard...\n\n"

        df = load_leaderboard()

        mask = df['model_name'] == model_name
        df.loc[mask, 'dvc_llm'] = round(dvc_llm, 4)
        df.loc[mask, 'vs_llm'] = round(vs_llm, 4)
        df.loc[mask, 'rc_llm'] = round(rc_llm, 4)

        # Re-sort by first metric
        df = df.sort_values('cvs_acc', ascending=False).reset_index(drop=True)

        save_leaderboard(df)

        progress(1.0, desc="Complete!")

        success_msg = f"""
---

## ✅ LLM Judge Evaluation Complete!

**Model**: {model_name}

### 📈 Updated Caption Metrics
- **DVC_llm**: {dvc_llm:.4f}
- **VS_llm**: {vs_llm:.4f}
- **RC_llm**: {rc_llm:.4f}

✓ Leaderboard updated successfully!

Refresh the Leaderboard tab to see updated rankings.
"""
        yield success_msg

    except Exception as e:
        yield f"❌ Error running LLM judge evaluation: {str(e)}"
1477
  # Create Gradio interface
1478
  with gr.Blocks(title="MedVidBench Leaderboard", theme=gr.themes.Soft()) as demo:
1479
 
 
1497
  ### Current Rankings
1498
 
1499
  The leaderboard displays all submitted models ranked by their performance across 10 metrics on 8 medical video understanding tasks.
1500
+
1501
+ **Note**: Models with all caption metrics (DVC_llm, VS_llm, RC_llm) at 0.0 can be re-evaluated with LLM judge using the section below.
1502
  """)
1503
 
1504
  def load_and_format_leaderboard():
 
1539
  outputs=[leaderboard_table, status_text]
1540
  )
1541
 
1542
+ # LLM Judge Evaluation Section
1543
+ gr.Markdown("""
1544
+ ---
1545
+
1546
+ ### 🤖 Run LLM Judge Evaluation
1547
+
1548
+ If a model was submitted with `--skip-llm-judge` (caption metrics are 0.0), you can run LLM judge evaluation here.
1549
+ This will compute DVC_llm, VS_llm, and RC_llm scores using GPT-4.1/Gemini.
1550
+
1551
+ **Note**: This feature is only available when ALL three caption metrics (DVC_llm, VS_llm, RC_llm) are 0.0.
1552
+ """)
1553
+
1554
+ with gr.Row():
1555
+ llm_judge_model_input = gr.Textbox(
1556
+ label="Model Name",
1557
+ placeholder="Enter exact model name from leaderboard",
1558
+ scale=3
1559
+ )
1560
+ run_llm_judge_btn = gr.Button("🚀 Run LLM Judge", variant="primary", scale=1)
1561
+
1562
+ llm_judge_output = gr.Markdown(label="LLM Judge Status")
1563
+
1564
+ # Wire up LLM judge evaluation
1565
+ run_llm_judge_btn.click(
1566
+ fn=run_llm_judge_evaluation,
1567
+ inputs=[llm_judge_model_input],
1568
+ outputs=llm_judge_output
1569
+ )
1570
+
1571
  # Tab 2: Submit
1572
  with gr.Tab("📤 Submit Results"):
1573
  gr.Markdown("""
evaluation/eval_caption_llm_judge.py CHANGED
@@ -90,8 +90,20 @@ R4: [score]
90
  return prompt
91
 
92
 
93
- def call_llm_judge_api(prediction: str, ground_truth: str, task_type: str, api_key: str, max_retries=3) -> dict:
94
- """Call OpenAI API to evaluate a caption pair."""
 
 
 
 
 
 
 
 
 
 
 
 
95
  global completed_calls, total_calls
96
 
97
  if not OPENAI_AVAILABLE:
@@ -101,12 +113,15 @@ def call_llm_judge_api(prediction: str, ground_truth: str, task_type: str, api_k
101
  client = OpenAI(api_key=api_key)
102
  prompt = create_llm_judge_prompt(prediction, ground_truth, task_type)
103
 
 
 
104
  for attempt in range(max_retries):
105
  try:
106
  response = client.chat.completions.create(
107
  model="gpt-4o-2024-11-20", # Latest GPT-4 model
108
  messages=[{"role": "user", "content": prompt}],
109
  temperature=0.0,
 
110
  )
111
 
112
  raw_response = response.choices[0].message.content
@@ -130,23 +145,49 @@ def call_llm_judge_api(prediction: str, ground_truth: str, task_type: str, api_k
130
 
131
  return scores
132
  else:
 
 
133
  if attempt < max_retries - 1:
134
- time.sleep(1)
 
 
135
  continue
136
 
137
  except Exception as e:
 
 
 
 
 
 
 
 
138
  if attempt < max_retries - 1:
139
- time.sleep(2)
 
 
 
 
 
 
 
 
 
 
 
140
  continue
 
 
 
141
 
142
  except Exception as e:
143
- print(f" LLM Judge API error: {e}")
144
 
145
- # Failed
146
  with progress_lock:
147
  completed_calls += 1
148
 
149
- return {aspect: 0 for aspect in BEST5_ASPECTS} | {'api_success': False}
150
 
151
 
152
  def run_llm_judge_evaluation(results_data, task_type, api_key):
@@ -208,6 +249,10 @@ def run_llm_judge_evaluation(results_data, task_type, api_key):
208
 
209
  all_scores = defaultdict(list)
210
  api_successes = []
 
 
 
 
211
 
212
  with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
213
  futures = {executor.submit(call_llm_judge_api, pred, gt, task_type, api_key): i
@@ -221,8 +266,13 @@ def run_llm_judge_evaluation(results_data, task_type, api_key):
221
  api_successes.append(True)
222
  else:
223
  api_successes.append(False)
 
 
224
 
225
  if not all_scores:
 
 
 
226
  return None
227
 
228
  # Compute averages
@@ -230,8 +280,14 @@ def run_llm_judge_evaluation(results_data, task_type, api_key):
230
  overall_average = np.mean(list(aspect_averages.values()))
231
 
232
  success_rate = np.mean(api_successes) if api_successes else 0.0
233
-
234
- print(f"✓ LLM Judge completed: {sum(api_successes)}/{len(caption_pairs)} successful API calls")
 
 
 
 
 
 
235
 
236
  return {
237
  'average_score': overall_average,
 
90
  return prompt
91
 
92
 
93
+ def call_llm_judge_api(prediction: str, ground_truth: str, task_type: str, api_key: str, max_retries=5) -> dict:
94
+ """
95
+ Call OpenAI API to evaluate a caption pair with retry logic.
96
+
97
+ Args:
98
+ prediction: Model's prediction text
99
+ ground_truth: Ground truth text
100
+ task_type: Task type (dense_captioning, video_summary, region_caption)
101
+ api_key: OpenAI API key
102
+ max_retries: Maximum number of retry attempts (default: 5)
103
+
104
+ Returns:
105
+ dict: Scores for each aspect + api_success flag
106
+ """
107
  global completed_calls, total_calls
108
 
109
  if not OPENAI_AVAILABLE:
 
113
  client = OpenAI(api_key=api_key)
114
  prompt = create_llm_judge_prompt(prediction, ground_truth, task_type)
115
 
116
+ last_error = None
117
+
118
  for attempt in range(max_retries):
119
  try:
120
  response = client.chat.completions.create(
121
  model="gpt-4o-2024-11-20", # Latest GPT-4 model
122
  messages=[{"role": "user", "content": prompt}],
123
  temperature=0.0,
124
+ timeout=30.0 # 30 second timeout per request
125
  )
126
 
127
  raw_response = response.choices[0].message.content
 
145
 
146
  return scores
147
  else:
148
+ # Failed to parse all scores - retry
149
+ last_error = f"Incomplete parsing: got {len(scores)}/{len(BEST5_ASPECTS)} scores"
150
  if attempt < max_retries - 1:
151
+ wait_time = min(2 ** attempt, 16) # Exponential backoff: 1, 2, 4, 8, 16 seconds
152
+ print(f" ⚠ {last_error}, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
153
+ time.sleep(wait_time)
154
  continue
155
 
156
  except Exception as e:
157
+ last_error = str(e)
158
+ error_type = type(e).__name__
159
+
160
+ # Determine if error is retryable
161
+ is_rate_limit = 'rate_limit' in last_error.lower() or 'RateLimitError' in error_type
162
+ is_timeout = 'timeout' in last_error.lower() or 'TimeoutError' in error_type
163
+ is_network = 'connection' in last_error.lower() or 'ConnectionError' in error_type
164
+
165
  if attempt < max_retries - 1:
166
+ # Exponential backoff with longer waits for rate limits
167
+ if is_rate_limit:
168
+ wait_time = min(2 ** (attempt + 2), 60) # 4, 8, 16, 32, 60 seconds for rate limits
169
+ print(f" ⚠ Rate limit hit, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
170
+ elif is_timeout or is_network:
171
+ wait_time = min(2 ** attempt, 16)
172
+ print(f" ⚠ {error_type}, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
173
+ else:
174
+ wait_time = min(2 ** attempt, 16)
175
+ print(f" ⚠ API error: {error_type}, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
176
+
177
+ time.sleep(wait_time)
178
  continue
179
+ else:
180
+ # Last attempt failed
181
+ print(f" ❌ API call failed after {max_retries} attempts: {error_type}")
182
 
183
  except Exception as e:
184
+ print(f" LLM Judge API error (client setup): {e}")
185
 
186
+ # Failed after all retries
187
  with progress_lock:
188
  completed_calls += 1
189
 
190
+ return {aspect: 0 for aspect in BEST5_ASPECTS} | {'api_success': False, 'error': last_error}
191
 
192
 
193
  def run_llm_judge_evaluation(results_data, task_type, api_key):
 
249
 
250
  all_scores = defaultdict(list)
251
  api_successes = []
252
+ api_failures = []
253
+
254
+ print(f" Running {total_calls} API calls with {MAX_WORKERS} parallel workers...")
255
+ print(f" Max retries per call: 5 (with exponential backoff)")
256
 
257
  with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
258
  futures = {executor.submit(call_llm_judge_api, pred, gt, task_type, api_key): i
 
266
  api_successes.append(True)
267
  else:
268
  api_successes.append(False)
269
+ if 'error' in result:
270
+ api_failures.append(result['error'])
271
 
272
  if not all_scores:
273
+ print(f"❌ All API calls failed")
274
+ if api_failures:
275
+ print(f" Sample errors: {api_failures[:3]}")
276
  return None
277
 
278
  # Compute averages
 
280
  overall_average = np.mean(list(aspect_averages.values()))
281
 
282
  success_rate = np.mean(api_successes) if api_successes else 0.0
283
+ num_successes = sum(api_successes)
284
+ num_failures = len(api_successes) - num_successes
285
+
286
+ print(f"✓ LLM Judge completed: {num_successes}/{len(caption_pairs)} successful API calls")
287
+ if num_failures > 0:
288
+ print(f" ⚠ {num_failures} calls failed after all retries")
289
+ if api_failures:
290
+ print(f" Sample errors: {api_failures[:3]}")
291
 
292
  return {
293
  'average_score': overall_average,