Spaces:
Sleeping
Sleeping
MedGRPO Team
committed on
Commit
·
4752404
1
Parent(s):
6edbd17
update
Browse files- README.md +21 -1
- app.py +232 -0
- evaluation/eval_caption_llm_judge.py +65 -9
README.md
CHANGED
|
@@ -150,10 +150,30 @@ The leaderboard supports **two formats** for submission:
|
|
| 150 |
|
| 151 |
The system will:
|
| 152 |
- Validate your file (format + sample count)
|
| 153 |
-
- Run automatic evaluation (~5-10 minutes)
|
| 154 |
- Extract metrics for all 8 tasks
|
| 155 |
- Add your model to the leaderboard
|
| 156 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
## Evaluation Metrics
|
| 158 |
|
| 159 |
### Task-Specific Metrics
|
|
|
|
| 150 |
|
| 151 |
The system will:
|
| 152 |
- Validate your file (format + sample count)
|
| 153 |
+
- Run automatic evaluation (~2-5 minutes with `--skip-llm-judge`, ~10-20 minutes with LLM judge)
|
| 154 |
- Extract metrics for all 8 tasks
|
| 155 |
- Add your model to the leaderboard
|
| 156 |
|
| 157 |
+
**Note**: By default, DVC/VS/RC are evaluated with `--skip-llm-judge` for faster results (caption metrics will be 0.0). You can run LLM judge evaluation later using the button on the leaderboard page.
|
| 158 |
+
|
| 159 |
+
### 4. Run LLM Judge Evaluation (Optional)
|
| 160 |
+
|
| 161 |
+
If your submission was evaluated with `--skip-llm-judge` (DVC_llm, VS_llm, RC_llm are all 0.0), you can compute these metrics later:
|
| 162 |
+
|
| 163 |
+
1. Go to the **Leaderboard** tab
|
| 164 |
+
2. Scroll to the **"Run LLM Judge Evaluation"** section
|
| 165 |
+
3. Enter your model name (exact match)
|
| 166 |
+
4. Click **"Run LLM Judge"**
|
| 167 |
+
|
| 168 |
+
The system will:
|
| 169 |
+
- Re-run evaluation for DVC/VS/RC tasks with LLM judge (GPT-4.1/Gemini)
|
| 170 |
+
- Update your leaderboard entry with caption metrics
|
| 171 |
+
- Preserve all other metrics (TAL, STG, NAP, SA, CVS)
|
| 172 |
+
|
| 173 |
+
**Time**: ~10-20 minutes depending on API rate limits
|
| 174 |
+
|
| 175 |
+
**Availability**: Only available when ALL three caption metrics are 0.0
|
| 176 |
+
|
| 177 |
## Evaluation Metrics
|
| 178 |
|
| 179 |
### Task-Specific Metrics
|
app.py
CHANGED
|
@@ -1273,6 +1273,207 @@ def format_leaderboard_display(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 1273 |
return display_df
|
| 1274 |
|
| 1275 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1276 |
# Create Gradio interface
|
| 1277 |
with gr.Blocks(title="MedVidBench Leaderboard", theme=gr.themes.Soft()) as demo:
|
| 1278 |
|
|
@@ -1296,6 +1497,8 @@ with gr.Blocks(title="MedVidBench Leaderboard", theme=gr.themes.Soft()) as demo:
|
|
| 1296 |
### Current Rankings
|
| 1297 |
|
| 1298 |
The leaderboard displays all submitted models ranked by their performance across 10 metrics on 8 medical video understanding tasks.
|
|
|
|
|
|
|
| 1299 |
""")
|
| 1300 |
|
| 1301 |
def load_and_format_leaderboard():
|
|
@@ -1336,6 +1539,35 @@ with gr.Blocks(title="MedVidBench Leaderboard", theme=gr.themes.Soft()) as demo:
|
|
| 1336 |
outputs=[leaderboard_table, status_text]
|
| 1337 |
)
|
| 1338 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1339 |
# Tab 2: Submit
|
| 1340 |
with gr.Tab("📤 Submit Results"):
|
| 1341 |
gr.Markdown("""
|
|
|
|
| 1273 |
return display_df
|
| 1274 |
|
| 1275 |
|
| 1276 |
+
def check_needs_llm_judge(model_name: str) -> Tuple[bool, str]:
    """
    Decide whether a leaderboard entry is eligible for LLM judge evaluation.

    An entry is eligible only when its three caption metrics
    (dvc_llm, vs_llm, rc_llm) are all exactly 0.0. A metric that is absent
    from the row is read as 0.0.

    Args:
        model_name: Exact model name to look up in the leaderboard.

    Returns:
        (needs_llm_judge, message): the eligibility flag and a short
        human-readable explanation of the decision.
    """
    leaderboard = load_leaderboard()

    # Guard clause: unknown model names are never eligible.
    if model_name not in leaderboard['model_name'].values:
        return False, f"Model '{model_name}' not found"

    # If several rows share the name, only the first one is inspected.
    entry = leaderboard[leaderboard['model_name'] == model_name].iloc[0]

    caption_scores = [
        entry.get(column, 0.0)
        for column in ('dvc_llm', 'vs_llm', 'rc_llm')
    ]

    if all(score == 0.0 for score in caption_scores):
        return True, "All caption metrics are 0.0, can run LLM judge"
    return False, "Caption metrics already computed"
|
| 1299 |
+
|
| 1300 |
+
|
| 1301 |
+
def run_llm_judge_evaluation(model_name: str, progress=gr.Progress()) -> str:
    """
    Run LLM judge evaluation for DVC/VS/RC tasks on a previously submitted model.

    This is a generator: every yielded string is a complete markdown status
    message for the output component (each yield replaces the previous one),
    so related lines are always emitted together in a single yield.
    The `-> str` annotation is kept unchanged for interface compatibility;
    it describes the type of each yielded item.

    Steps:
    1. Locate the stored predictions (`input.json`) in the model's results directory
    2. Re-run the evaluation script WITHOUT `--skip-llm-judge`, streaming its log
    3. Parse the caption metrics from the captured output
    4. Write the new caption metrics into the leaderboard and save it

    Args:
        model_name: Name of the model to re-evaluate (exact leaderboard match)
        progress: Gradio progress tracker

    Yields:
        Status messages (markdown)
    """
    # Local imports are kept from the original block so it stays self-contained.
    import select
    import time

    process = None
    try:
        # Check if model exists and needs LLM judge.
        needs_llm, msg = check_needs_llm_judge(model_name)
        if not needs_llm:
            # Must be a yield: a `return <value>` inside a generator is never
            # delivered to the UI, so the user would see no message at all.
            yield f"❌ {msg}"
            return

        progress(0.1, desc="Loading predictions...")
        yield "🔍 **Step 1/4**: Checking model predictions...\n\n"

        # Find the predictions file
        model_dir = RESULTS_DIR / model_name.replace(" ", "_")
        input_file = model_dir / "input.json"

        if not input_file.exists():
            yield f"❌ Predictions file not found: {input_file}"
            return

        yield "✓ Found predictions file\n\n"

        # Run evaluation WITH LLM judge
        progress(0.2, desc="Running LLM judge evaluation...")
        # Emitted as one message; two separate yields would overwrite each other.
        yield (
            "⚙️ **Step 2/4**: Running LLM judge evaluation (DVC/VS/RC)...\n\n"
            "⏳ This may take 5-15 minutes depending on API rate limits...\n\n"
        )

        eval_wrapper = Path("evaluation/evaluate_predictions.py")

        cmd = [
            sys.executable,
            "-u",
            str(eval_wrapper),
            str(input_file),
            "--grouping", "overall",
            "--ground-truth", str(GROUND_TRUTH_FILE)
            # NOTE: No --skip-llm-judge flag, so LLM judge will run
        ]

        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,
            env={**os.environ, "PYTHONUNBUFFERED": "1"}
        )

        # Stream logs
        log_buffer = []
        last_update = time.time()
        line_count = 0

        while True:
            if process.poll() is not None:
                # Child has exited: drain whatever is still in the pipe.
                remaining = process.stdout.read()
                if remaining:
                    for line in remaining.split('\n'):
                        line = line.rstrip()
                        if line.strip() and 'WARNING: All log messages' not in line:
                            log_buffer.append(line)
                break

            # Wait at most 0.5s for output so the loop keeps polling the child.
            ready, _, _ = select.select([process.stdout], [], [], 0.5)

            if ready:
                line = process.stdout.readline()
                if not line:
                    # EOF on the pipe
                    break

                line = line.rstrip()
                if not line.strip() or 'WARNING: All log messages' in line:
                    continue

                log_buffer.append(line)
                line_count += 1

            # Update UI every 1 second
            if time.time() - last_update > 1.0:
                if log_buffer:
                    recent = log_buffer[-20:]
                    log_text = "⚙️ **Step 2/4**: Running LLM judge evaluation...\n\n```\n"
                    log_text += '\n'.join(recent)
                    log_text += "\n```"
                    yield log_text

                last_update = time.time()
                # Heuristic progress: 0.2 -> 0.8 over roughly the first 200 log lines.
                progress_val = min(0.8, 0.2 + (line_count / 200) * 0.60)
                progress(progress_val, desc="Running LLM judge...")

        process.wait()

        if process.returncode != 0:
            yield f"\n❌ Evaluation failed (exit code {process.returncode})"
            return

        # Parse metrics
        progress(0.85, desc="Extracting metrics...")
        yield "⚙️ **Step 3/4**: Extracting caption metrics...\n\n"

        full_output = '\n'.join(log_buffer)
        metrics = parse_evaluation_output(full_output)

        # Save updated output
        with open(model_dir / "eval_output_llm_judge.txt", 'w') as f:
            f.write(full_output)

        # Extract caption metrics
        dvc_llm = metrics.get('dvc_llm', 0.0)
        vs_llm = metrics.get('vs_llm', 0.0)
        rc_llm = metrics.get('rc_llm', 0.0)

        if dvc_llm == 0.0 and vs_llm == 0.0 and rc_llm == 0.0:
            yield "❌ Failed to extract caption metrics from evaluation output"
            return

        # One combined message so all three scores are visible together.
        yield (
            "✓ Caption metrics extracted:\n"
            f" - DVC_llm: {dvc_llm:.4f}\n"
            f" - VS_llm: {vs_llm:.4f}\n"
            f" - RC_llm: {rc_llm:.4f}\n\n"
        )

        # Update leaderboard
        progress(0.95, desc="Updating leaderboard...")
        yield "⚙️ **Step 4/4**: Updating leaderboard...\n\n"

        df = load_leaderboard()

        # Update caption metrics; all other columns of the row are left untouched.
        row_mask = df['model_name'] == model_name
        df.loc[row_mask, 'dvc_llm'] = round(dvc_llm, 4)
        df.loc[row_mask, 'vs_llm'] = round(vs_llm, 4)
        df.loc[row_mask, 'rc_llm'] = round(rc_llm, 4)

        # Re-sort by first metric
        df = df.sort_values('cvs_acc', ascending=False).reset_index(drop=True)

        save_leaderboard(df)

        progress(1.0, desc="Complete!")

        success_msg = (
            "\n---\n\n"
            "## ✅ LLM Judge Evaluation Complete!\n\n"
            f"**Model**: {model_name}\n\n"
            "### 📈 Updated Caption Metrics\n"
            f"- **DVC_llm**: {dvc_llm:.4f}\n"
            f"- **VS_llm**: {vs_llm:.4f}\n"
            f"- **RC_llm**: {rc_llm:.4f}\n\n"
            "✓ Leaderboard updated successfully!\n\n"
            "Refresh the Leaderboard tab to see updated rankings.\n"
        )
        yield success_msg

    except Exception as e:
        yield f"❌ Error running LLM judge evaluation: {str(e)}"
    finally:
        # Runs on normal completion, on errors, and when the generator is
        # closed early; avoids leaving an evaluation subprocess running.
        if process is not None:
            if process.poll() is None:
                process.kill()
                process.wait()
            if process.stdout is not None:
                process.stdout.close()
|
| 1475 |
+
|
| 1476 |
+
|
| 1477 |
# Create Gradio interface
|
| 1478 |
with gr.Blocks(title="MedVidBench Leaderboard", theme=gr.themes.Soft()) as demo:
|
| 1479 |
|
|
|
|
| 1497 |
### Current Rankings
|
| 1498 |
|
| 1499 |
The leaderboard displays all submitted models ranked by their performance across 10 metrics on 8 medical video understanding tasks.
|
| 1500 |
+
|
| 1501 |
+
**Note**: Models with all caption metrics (DVC_llm, VS_llm, RC_llm) at 0.0 can be re-evaluated with LLM judge using the section below.
|
| 1502 |
""")
|
| 1503 |
|
| 1504 |
def load_and_format_leaderboard():
|
|
|
|
| 1539 |
outputs=[leaderboard_table, status_text]
|
| 1540 |
)
|
| 1541 |
|
| 1542 |
+
# LLM Judge Evaluation Section
|
| 1543 |
+
gr.Markdown("""
|
| 1544 |
+
---
|
| 1545 |
+
|
| 1546 |
+
### 🤖 Run LLM Judge Evaluation
|
| 1547 |
+
|
| 1548 |
+
If a model was submitted with `--skip-llm-judge` (caption metrics are 0.0), you can run LLM judge evaluation here.
|
| 1549 |
+
This will compute DVC_llm, VS_llm, and RC_llm scores using GPT-4.1/Gemini.
|
| 1550 |
+
|
| 1551 |
+
**Note**: This feature is only available when ALL three caption metrics (DVC_llm, VS_llm, RC_llm) are 0.0.
|
| 1552 |
+
""")
|
| 1553 |
+
|
| 1554 |
+
with gr.Row():
|
| 1555 |
+
llm_judge_model_input = gr.Textbox(
|
| 1556 |
+
label="Model Name",
|
| 1557 |
+
placeholder="Enter exact model name from leaderboard",
|
| 1558 |
+
scale=3
|
| 1559 |
+
)
|
| 1560 |
+
run_llm_judge_btn = gr.Button("🚀 Run LLM Judge", variant="primary", scale=1)
|
| 1561 |
+
|
| 1562 |
+
llm_judge_output = gr.Markdown(label="LLM Judge Status")
|
| 1563 |
+
|
| 1564 |
+
# Wire up LLM judge evaluation
|
| 1565 |
+
run_llm_judge_btn.click(
|
| 1566 |
+
fn=run_llm_judge_evaluation,
|
| 1567 |
+
inputs=[llm_judge_model_input],
|
| 1568 |
+
outputs=llm_judge_output
|
| 1569 |
+
)
|
| 1570 |
+
|
| 1571 |
# Tab 2: Submit
|
| 1572 |
with gr.Tab("📤 Submit Results"):
|
| 1573 |
gr.Markdown("""
|
evaluation/eval_caption_llm_judge.py
CHANGED
|
@@ -90,8 +90,20 @@ R4: [score]
|
|
| 90 |
return prompt
|
| 91 |
|
| 92 |
|
| 93 |
-
def call_llm_judge_api(prediction: str, ground_truth: str, task_type: str, api_key: str, max_retries=
|
| 94 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
global completed_calls, total_calls
|
| 96 |
|
| 97 |
if not OPENAI_AVAILABLE:
|
|
@@ -101,12 +113,15 @@ def call_llm_judge_api(prediction: str, ground_truth: str, task_type: str, api_k
|
|
| 101 |
client = OpenAI(api_key=api_key)
|
| 102 |
prompt = create_llm_judge_prompt(prediction, ground_truth, task_type)
|
| 103 |
|
|
|
|
|
|
|
| 104 |
for attempt in range(max_retries):
|
| 105 |
try:
|
| 106 |
response = client.chat.completions.create(
|
| 107 |
model="gpt-4o-2024-11-20", # Latest GPT-4 model
|
| 108 |
messages=[{"role": "user", "content": prompt}],
|
| 109 |
temperature=0.0,
|
|
|
|
| 110 |
)
|
| 111 |
|
| 112 |
raw_response = response.choices[0].message.content
|
|
@@ -130,23 +145,49 @@ def call_llm_judge_api(prediction: str, ground_truth: str, task_type: str, api_k
|
|
| 130 |
|
| 131 |
return scores
|
| 132 |
else:
|
|
|
|
|
|
|
| 133 |
if attempt < max_retries - 1:
|
| 134 |
-
|
|
|
|
|
|
|
| 135 |
continue
|
| 136 |
|
| 137 |
except Exception as e:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
if attempt < max_retries - 1:
|
| 139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
continue
|
|
|
|
|
|
|
|
|
|
| 141 |
|
| 142 |
except Exception as e:
|
| 143 |
-
print(f"
|
| 144 |
|
| 145 |
-
# Failed
|
| 146 |
with progress_lock:
|
| 147 |
completed_calls += 1
|
| 148 |
|
| 149 |
-
return {aspect: 0 for aspect in BEST5_ASPECTS} | {'api_success': False}
|
| 150 |
|
| 151 |
|
| 152 |
def run_llm_judge_evaluation(results_data, task_type, api_key):
|
|
@@ -208,6 +249,10 @@ def run_llm_judge_evaluation(results_data, task_type, api_key):
|
|
| 208 |
|
| 209 |
all_scores = defaultdict(list)
|
| 210 |
api_successes = []
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
|
| 212 |
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
|
| 213 |
futures = {executor.submit(call_llm_judge_api, pred, gt, task_type, api_key): i
|
|
@@ -221,8 +266,13 @@ def run_llm_judge_evaluation(results_data, task_type, api_key):
|
|
| 221 |
api_successes.append(True)
|
| 222 |
else:
|
| 223 |
api_successes.append(False)
|
|
|
|
|
|
|
| 224 |
|
| 225 |
if not all_scores:
|
|
|
|
|
|
|
|
|
|
| 226 |
return None
|
| 227 |
|
| 228 |
# Compute averages
|
|
@@ -230,8 +280,14 @@ def run_llm_judge_evaluation(results_data, task_type, api_key):
|
|
| 230 |
overall_average = np.mean(list(aspect_averages.values()))
|
| 231 |
|
| 232 |
success_rate = np.mean(api_successes) if api_successes else 0.0
|
| 233 |
-
|
| 234 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
|
| 236 |
return {
|
| 237 |
'average_score': overall_average,
|
|
|
|
| 90 |
return prompt
|
| 91 |
|
| 92 |
|
| 93 |
+
def call_llm_judge_api(prediction: str, ground_truth: str, task_type: str, api_key: str, max_retries=5) -> dict:
|
| 94 |
+
"""
|
| 95 |
+
Call OpenAI API to evaluate a caption pair with retry logic.
|
| 96 |
+
|
| 97 |
+
Args:
|
| 98 |
+
prediction: Model's prediction text
|
| 99 |
+
ground_truth: Ground truth text
|
| 100 |
+
task_type: Task type (dense_captioning, video_summary, region_caption)
|
| 101 |
+
api_key: OpenAI API key
|
| 102 |
+
max_retries: Maximum number of retry attempts (default: 5)
|
| 103 |
+
|
| 104 |
+
Returns:
|
| 105 |
+
dict: Scores for each aspect + api_success flag
|
| 106 |
+
"""
|
| 107 |
global completed_calls, total_calls
|
| 108 |
|
| 109 |
if not OPENAI_AVAILABLE:
|
|
|
|
| 113 |
client = OpenAI(api_key=api_key)
|
| 114 |
prompt = create_llm_judge_prompt(prediction, ground_truth, task_type)
|
| 115 |
|
| 116 |
+
last_error = None
|
| 117 |
+
|
| 118 |
for attempt in range(max_retries):
|
| 119 |
try:
|
| 120 |
response = client.chat.completions.create(
|
| 121 |
model="gpt-4o-2024-11-20", # Latest GPT-4 model
|
| 122 |
messages=[{"role": "user", "content": prompt}],
|
| 123 |
temperature=0.0,
|
| 124 |
+
timeout=30.0 # 30 second timeout per request
|
| 125 |
)
|
| 126 |
|
| 127 |
raw_response = response.choices[0].message.content
|
|
|
|
| 145 |
|
| 146 |
return scores
|
| 147 |
else:
|
| 148 |
+
# Failed to parse all scores - retry
|
| 149 |
+
last_error = f"Incomplete parsing: got {len(scores)}/{len(BEST5_ASPECTS)} scores"
|
| 150 |
if attempt < max_retries - 1:
|
| 151 |
+
wait_time = min(2 ** attempt, 16) # Exponential backoff: 1, 2, 4, 8, 16 seconds
|
| 152 |
+
print(f" ⚠ {last_error}, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
|
| 153 |
+
time.sleep(wait_time)
|
| 154 |
continue
|
| 155 |
|
| 156 |
except Exception as e:
|
| 157 |
+
last_error = str(e)
|
| 158 |
+
error_type = type(e).__name__
|
| 159 |
+
|
| 160 |
+
# Determine if error is retryable
|
| 161 |
+
is_rate_limit = 'rate_limit' in last_error.lower() or 'RateLimitError' in error_type
|
| 162 |
+
is_timeout = 'timeout' in last_error.lower() or 'TimeoutError' in error_type
|
| 163 |
+
is_network = 'connection' in last_error.lower() or 'ConnectionError' in error_type
|
| 164 |
+
|
| 165 |
if attempt < max_retries - 1:
|
| 166 |
+
# Exponential backoff with longer waits for rate limits
|
| 167 |
+
if is_rate_limit:
|
| 168 |
+
wait_time = min(2 ** (attempt + 2), 60) # 4, 8, 16, 32, 60 seconds for rate limits
|
| 169 |
+
print(f" ⚠ Rate limit hit, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
|
| 170 |
+
elif is_timeout or is_network:
|
| 171 |
+
wait_time = min(2 ** attempt, 16)
|
| 172 |
+
print(f" ⚠ {error_type}, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
|
| 173 |
+
else:
|
| 174 |
+
wait_time = min(2 ** attempt, 16)
|
| 175 |
+
print(f" ⚠ API error: {error_type}, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
|
| 176 |
+
|
| 177 |
+
time.sleep(wait_time)
|
| 178 |
continue
|
| 179 |
+
else:
|
| 180 |
+
# Last attempt failed
|
| 181 |
+
print(f" ❌ API call failed after {max_retries} attempts: {error_type}")
|
| 182 |
|
| 183 |
except Exception as e:
|
| 184 |
+
print(f" ❌ LLM Judge API error (client setup): {e}")
|
| 185 |
|
| 186 |
+
# Failed after all retries
|
| 187 |
with progress_lock:
|
| 188 |
completed_calls += 1
|
| 189 |
|
| 190 |
+
return {aspect: 0 for aspect in BEST5_ASPECTS} | {'api_success': False, 'error': last_error}
|
| 191 |
|
| 192 |
|
| 193 |
def run_llm_judge_evaluation(results_data, task_type, api_key):
|
|
|
|
| 249 |
|
| 250 |
all_scores = defaultdict(list)
|
| 251 |
api_successes = []
|
| 252 |
+
api_failures = []
|
| 253 |
+
|
| 254 |
+
print(f" Running {total_calls} API calls with {MAX_WORKERS} parallel workers...")
|
| 255 |
+
print(f" Max retries per call: 5 (with exponential backoff)")
|
| 256 |
|
| 257 |
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
|
| 258 |
futures = {executor.submit(call_llm_judge_api, pred, gt, task_type, api_key): i
|
|
|
|
| 266 |
api_successes.append(True)
|
| 267 |
else:
|
| 268 |
api_successes.append(False)
|
| 269 |
+
if 'error' in result:
|
| 270 |
+
api_failures.append(result['error'])
|
| 271 |
|
| 272 |
if not all_scores:
|
| 273 |
+
print(f"❌ All API calls failed")
|
| 274 |
+
if api_failures:
|
| 275 |
+
print(f" Sample errors: {api_failures[:3]}")
|
| 276 |
return None
|
| 277 |
|
| 278 |
# Compute averages
|
|
|
|
| 280 |
overall_average = np.mean(list(aspect_averages.values()))
|
| 281 |
|
| 282 |
success_rate = np.mean(api_successes) if api_successes else 0.0
|
| 283 |
+
num_successes = sum(api_successes)
|
| 284 |
+
num_failures = len(api_successes) - num_successes
|
| 285 |
+
|
| 286 |
+
print(f"✓ LLM Judge completed: {num_successes}/{len(caption_pairs)} successful API calls")
|
| 287 |
+
if num_failures > 0:
|
| 288 |
+
print(f" ⚠ {num_failures} calls failed after all retries")
|
| 289 |
+
if api_failures:
|
| 290 |
+
print(f" Sample errors: {api_failures[:3]}")
|
| 291 |
|
| 292 |
return {
|
| 293 |
'average_score': overall_average,
|