MedGRPO Team committed · Commit 176a6d5 · 1 Parent(s): 4752404 · update
README.md CHANGED

@@ -163,17 +163,29 @@ If your submission was evaluated with `--skip-llm-judge` (DVC_llm, VS_llm, RC_llm
 1. Go to the **Leaderboard** tab
 2. Scroll to the **"Run LLM Judge Evaluation"** section
 3. Enter your model name (exact match)
-4. Click **"
+4. Click **"Start Evaluation"**
 
 The system will:
+- Start evaluation in the background (runs independently)
 - Re-run evaluation for DVC/VS/RC tasks with LLM judge (GPT-4.1/Gemini)
+- Automatically update your leaderboard entry when complete
 - Preserve all other metrics (TAL, STG, NAP, SA, CVS)
 
+**✅ Background Execution**:
+- You can **close the browser** after starting - evaluation continues running
+- Come back later and click **"Check Status"** to see progress
+- The leaderboard will be automatically updated when complete
+
 **Time**: ~10-20 minutes depending on API rate limits
 
 **Availability**: Only available when ALL three caption metrics are 0.0
 
+**How to Check Status**:
+1. Enter the same model name
+2. Click **"Check Status"** button
+3. View recent logs and progress
+4. Or simply refresh the leaderboard to see if metrics are updated
+
 ## Evaluation Metrics
 
 ### Task-Specific Metrics
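Under the hood, "Check Status" reads a per-model `llm_judge_status.json` that the app.py changes below write (`status`, `progress`, `timestamp`, and an optional `error` field). A minimal sketch of inspecting that file outside the UI, assuming a local checkout; the `results/` directory stands in for the Space's actual `RESULTS_DIR` and the model name is made up:

```python
# Minimal sketch, not part of the commit: read the status file that
# update_llm_judge_status() in app.py (below) writes for a given model.
import json
from pathlib import Path

RESULTS_DIR = Path("results")  # assumption: adjust to the Space's actual results dir

def read_llm_judge_status(model_name: str) -> dict:
    """Return the raw status dict, or a default if no evaluation was started."""
    status_file = RESULTS_DIR / model_name.replace(" ", "_") / "llm_judge_status.json"
    if not status_file.exists():
        return {"status": "not_started"}
    with open(status_file) as f:
        return json.load(f)

if __name__ == "__main__":
    info = read_llm_judge_status("MyModel")  # hypothetical model name
    print(info.get("status"), info.get("progress", ""))
```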
app.py CHANGED

@@ -1298,14 +1298,67 @@ def check_needs_llm_judge(model_name: str) -> Tuple[bool, str]:
     return False, "Caption metrics already computed"
 
 
+def check_llm_judge_status(model_name: str) -> Tuple[str, str]:
+    """
+    Check the status of an ongoing LLM judge evaluation.
+
+    Returns:
+        (status, message)
+        status: 'not_started', 'running', 'completed', 'failed'
+    """
+    model_dir = RESULTS_DIR / model_name.replace(" ", "_")
+    status_file = model_dir / "llm_judge_status.json"
+
+    if not status_file.exists():
+        return 'not_started', 'No LLM judge evaluation in progress'
+
+    try:
+        with open(status_file, 'r') as f:
+            status_data = json.load(f)
+
+        status = status_data.get('status', 'not_started')
+        progress = status_data.get('progress', '')
+        timestamp = status_data.get('timestamp', '')
+
+        if status == 'running':
+            return 'running', f"Evaluation in progress: {progress}\nStarted: {timestamp}"
+        elif status == 'completed':
+            return 'completed', f"Evaluation completed: {timestamp}"
+        elif status == 'failed':
+            error = status_data.get('error', 'Unknown error')
+            return 'failed', f"Evaluation failed: {error}"
+        else:
+            return 'not_started', 'No evaluation in progress'
+    except Exception as e:
+        return 'not_started', f"Error reading status: {e}"
+
+
+def update_llm_judge_status(model_name: str, status: str, progress: str = "", error: str = ""):
+    """Update the LLM judge evaluation status file."""
+    model_dir = RESULTS_DIR / model_name.replace(" ", "_")
+    status_file = model_dir / "llm_judge_status.json"
+
+    status_data = {
+        'status': status,
+        'progress': progress,
+        'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+    }
+
+    if error:
+        status_data['error'] = error
+
+    with open(status_file, 'w') as f:
+        json.dump(status_data, f, indent=2)
+
+
 def run_llm_judge_evaluation(model_name: str, progress=gr.Progress()) -> str:
     """
+    Start LLM judge evaluation in the background for DVC/VS/RC tasks.
 
     This function:
-    1.
-    2.
-    3.
+    1. Validates the model and checks if evaluation is needed
+    2. Starts background evaluation process (can close browser)
+    3. Returns immediately with status information
 
     Args:
         model_name: Name of the model to re-evaluate
 
@@ -1320,8 +1373,15 @@ def run_llm_judge_evaluation(model_name: str, progress=gr.Progress()) -> str:
     if not needs_llm:
         return f"❌ {msg}"
 
+    # Check if evaluation is already running
+    status, status_msg = check_llm_judge_status(model_name)
+    if status == 'running':
+        return f"⏳ **Evaluation Already Running**\n\n{status_msg}\n\nCheck status by refreshing or clicking 'Check Status' button."
+    elif status == 'completed':
+        return f"✅ **Already Completed**\n\n{status_msg}\n\nRefresh the leaderboard to see results."
+
+    progress(0.1, desc="Validating...")
+    yield f"🔍 **Validation**: Checking model predictions...\n\n"
 
     # Find the predictions file
     model_dir = RESULTS_DIR / model_name.replace(" ", "_")
 
@@ -1333,13 +1393,19 @@ def run_llm_judge_evaluation(model_name: str, progress=gr.Progress()) -> str:
 
     yield f"✅ Found predictions file\n\n"
 
-    #
+    # Update status to running
+    update_llm_judge_status(model_name, 'running', 'Starting evaluation...')
+
+    # Start background process
+    progress(0.2, desc="Starting background evaluation...")
+    yield f"🚀 **Starting Background Evaluation**\n\n"
+    yield f"⏳ This will take 10-20 minutes depending on API rate limits\n\n"
+    yield f"✅ **You can close this browser tab** - evaluation runs in background\n\n"
 
     eval_wrapper = Path("evaluation/evaluate_predictions.py")
+    log_file = model_dir / "eval_llm_judge_log.txt"
 
+    # Build command for background execution
     cmd = [
         sys.executable,
         "-u",
 
@@ -1350,128 +1416,197 @@ def run_llm_judge_evaluation(model_name: str, progress=gr.Progress()) -> str:
         # NOTE: No --skip-llm-judge flag, so LLM judge will run
     ]
 
+    # Start process in background (detached)
+    with open(log_file, 'w') as log_f:
+        log_f.write(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
+        log_f.write(f"Command: {' '.join(cmd)}\n")
+        log_f.write("="*60 + "\n\n")
+
+    # Launch background process that continues after app closes
     process = subprocess.Popen(
         cmd,
-        stdout=
+        stdout=open(log_file, 'a'),
         stderr=subprocess.STDOUT,
         text=True,
+        env={**os.environ, "PYTHONUNBUFFERED": "1"},
+        start_new_session=True  # Detach from parent process
    )
 
-    #
-    line_count = 0
-    import select
-    while True:
-        if process.poll() is not None:
-            remaining = process.stdout.read()
-            if remaining:
-                for line in remaining.split('\n'):
-                    line = line.rstrip()
-                    if line.strip() and 'WARNING: All log messages' not in line:
-                        log_buffer.append(line)
-            break
-        if not line:
-            break
-        if not line.strip() or 'WARNING: All log messages' in line:
-            continue
-        if time.time() - last_update > 1.0:
-            if log_buffer:
-                recent = log_buffer[-20:]
-                log_text = f"⚙️ **Step 2/4**: Running LLM judge evaluation...\n\n```\n"
-                log_text += '\n'.join(recent)
-                log_text += "\n```"
-                yield log_text
-            progress_val = min(0.8, 0.2 + (line_count / 200) * 0.60)
-            progress(progress_val, desc="Running LLM judge...")
-    progress(0.85, desc="Extracting metrics...")
-    yield f"⚙️ **Step 3/4**: Extracting caption metrics...\n\n"
-    with open(model_dir / "eval_output_llm_judge.txt", 'w') as f:
-        f.write(full_output)
-    rc_llm = metrics.get('rc_llm', 0.0)
-        yield f"❌ Failed to extract caption metrics from evaluation output"
-        return
-    yield f" - RC_llm: {rc_llm:.4f}\n\n"
-    progress(0.95, desc="Updating leaderboard...")
-    yield f"⚙️ **Step 4/4**: Updating leaderboard...\n\n"
-    success_msg = f"""
-    ---
+    # Save PID for tracking
+    pid_file = model_dir / "llm_judge_pid.txt"
+    with open(pid_file, 'w') as f:
+        f.write(str(process.pid))
+
+    progress(0.5, desc="Background process started...")
+
+    success_msg = f"""
+---
+
+## ✅ Background Evaluation Started!
+
+**Model**: {model_name}
+**Process ID**: {process.pid}
+**Started**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+
+### ⏳ Evaluation Progress
+
+The evaluation is now running in the background. This will take approximately 10-20 minutes.
+
+### 📋 What's Happening
+
+1. ⚙️ Running LLM judge on DVC/VS/RC tasks
+2. 🔄 Using GPT-4 with retry logic (up to 5 attempts per sample)
+3. 📊 Will automatically update leaderboard when complete
+
+### ✅ You Can Now:
+
+- ✅ **Close this browser tab** - evaluation continues running
+- ✅ Come back later and check status using "Check Status" button
+- ✅ Refresh the leaderboard in 10-20 minutes to see results
+
+### 🔍 Check Status Later
+
+1. Enter the same model name: `{model_name}`
+2. Click "Check Status" button
+3. Or refresh the leaderboard to see if metrics are updated
+
+### 📄 Logs
+
+Evaluation logs are being written to:
+`{log_file}`
+"""
+
+    yield success_msg
+
+    # Start background monitor thread to update status and leaderboard when complete
+    import threading
+
+    def monitor_and_update():
+        """Monitor background process and update leaderboard when complete."""
+        try:
+            # Wait for process to complete
+            process.wait()
+
+            # Read final output
+            with open(log_file, 'r') as f:
+                full_output = f.read()
+
+            if process.returncode == 0:
+                # Parse metrics
+                metrics = parse_evaluation_output(full_output)
+
+                dvc_llm = metrics.get('dvc_llm', 0.0)
+                vs_llm = metrics.get('vs_llm', 0.0)
+                rc_llm = metrics.get('rc_llm', 0.0)
+
+                if dvc_llm > 0.0 or vs_llm > 0.0 or rc_llm > 0.0:
+                    # Update leaderboard
+                    df = load_leaderboard()
+                    df.loc[df['model_name'] == model_name, 'dvc_llm'] = round(dvc_llm, 4)
+                    df.loc[df['model_name'] == model_name, 'vs_llm'] = round(vs_llm, 4)
+                    df.loc[df['model_name'] == model_name, 'rc_llm'] = round(rc_llm, 4)
+                    df = df.sort_values('cvs_acc', ascending=False).reset_index(drop=True)
+                    save_leaderboard(df)
+
+                    # Update status to completed
+                    update_llm_judge_status(
+                        model_name,
+                        'completed',
+                        f"DVC: {dvc_llm:.4f}, VS: {vs_llm:.4f}, RC: {rc_llm:.4f}"
+                    )
+                else:
+                    update_llm_judge_status(model_name, 'failed', 'Failed to extract metrics')
+            else:
+                update_llm_judge_status(model_name, 'failed', f'Exit code {process.returncode}')
+
+        except Exception as e:
+            update_llm_judge_status(model_name, 'failed', str(e))
+
+    # Start monitor thread (daemon so it doesn't block app shutdown)
+    monitor_thread = threading.Thread(target=monitor_and_update, daemon=True)
+    monitor_thread.start()
+
+    except Exception as e:
+        update_llm_judge_status(model_name, 'failed', str(e))
+        yield f"❌ Error starting LLM judge evaluation: {str(e)}"
+
+
+def check_llm_judge_evaluation_status(model_name: str) -> str:
+    """Check and display status of LLM judge evaluation."""
+    if not model_name or not model_name.strip():
+        return "❌ Please enter a model name"
+
+    status, msg = check_llm_judge_status(model_name.strip())
+
+    if status == 'not_started':
+        return f"ℹ️ **No Evaluation Running**\n\n{msg}"
+    elif status == 'running':
+        model_dir = RESULTS_DIR / model_name.replace(" ", "_")
+        log_file = model_dir / "eval_llm_judge_log.txt"
+
+        # Read last 30 lines of log
+        try:
+            with open(log_file, 'r') as f:
+                lines = f.readlines()
+                recent_lines = lines[-30:]
+
+            log_preview = ''.join(recent_lines)
+
+            return f"""
+## ⏳ Evaluation Running
+
+**Model**: {model_name}
+**Status**: {msg}
+
+### 📋 Recent Logs (last 30 lines)
+
+```
+{log_preview}
+```
+
+**Note**: Refresh this page or click "Check Status" again for updates.
+"""
+        except Exception as e:
+            return f"⏳ **Evaluation Running**\n\n{msg}\n\n⚠️ Unable to read logs: {e}"
+
+    elif status == 'completed':
+        # Check if leaderboard was updated
+        df = load_leaderboard()
+        if model_name in df['model_name'].values:
+            row = df[df['model_name'] == model_name].iloc[0]
+            dvc = row.get('dvc_llm', 0.0)
+            vs = row.get('vs_llm', 0.0)
+            rc = row.get('rc_llm', 0.0)
+
+            return f"""
+## ✅ Evaluation Complete!
 
 **Model**: {model_name}
-### 📊
-- **DVC_llm**: {
-- **VS_llm**: {
-- **RC_llm**: {
-✅ Leaderboard updated
+**Completed**: {msg}
 
+### 📊 Caption Metrics
+- **DVC_llm**: {dvc:.4f}
+- **VS_llm**: {vs:.4f}
+- **RC_llm**: {rc:.4f}
 
+✅ Leaderboard has been updated!
 
 Refresh the Leaderboard tab to see updated rankings.
 """
+        else:
+            return f"✅ **Evaluation Complete**\n\n{msg}\n\n⚠️ Model not found in leaderboard"
+
+    elif status == 'failed':
+        return f"""
+## ❌ Evaluation Failed
+
+**Model**: {model_name}
+**Error**: {msg}
+
+Please check the logs or try running the evaluation again.
+"""
+
+    return f"ℹ️ **Status**: {status}\n\n{msg}"
 
 
 # Create Gradio interface
 
@@ -1548,6 +1683,8 @@ with gr.Blocks(title="MedVidBench Leaderboard", theme=gr.themes.Soft()) as demo:
         If a model was submitted with `--skip-llm-judge` (caption metrics are 0.0), you can run LLM judge evaluation here.
         This will compute DVC_llm, VS_llm, and RC_llm scores using GPT-4.1/Gemini.
 
+        **✅ Background Execution**: The evaluation runs in the background - you can close the browser and come back later!
+
         **Note**: This feature is only available when ALL three caption metrics (DVC_llm, VS_llm, RC_llm) are 0.0.
         """)
 
@@ -1557,9 +1694,11 @@ with gr.Blocks(title="MedVidBench Leaderboard", theme=gr.themes.Soft()) as demo:
                 placeholder="Enter exact model name from leaderboard",
                 scale=3
             )
+            with gr.Column(scale=1):
+                run_llm_judge_btn = gr.Button("🚀 Start Evaluation", variant="primary")
+                check_status_btn = gr.Button("🔍 Check Status", variant="secondary")
 
-        llm_judge_output = gr.Markdown(label="
+        llm_judge_output = gr.Markdown(label="Evaluation Status")
 
         # Wire up LLM judge evaluation
         run_llm_judge_btn.click(
 
@@ -1568,6 +1707,13 @@ with gr.Blocks(title="MedVidBench Leaderboard", theme=gr.themes.Soft()) as demo:
             outputs=llm_judge_output
        )
 
+        # Wire up status check
+        check_status_btn.click(
+            fn=check_llm_judge_evaluation_status,
+            inputs=[llm_judge_model_input],
+            outputs=llm_judge_output
+        )
+
    # Tab 2: Submit
    with gr.Tab("📤 Submit Results"):
        gr.Markdown("""
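All of the additions above lean on one small pattern: launch the evaluator as a detached subprocess that logs to a file, record its state in a JSON status file, and let a daemon thread update that state when the process exits. A minimal, self-contained sketch of the pattern, with hypothetical file names and a placeholder command rather than the Space's actual evaluation call:

```python
# Sketch only: the detach-and-monitor pattern used by run_llm_judge_evaluation
# above, reduced to its essentials. All names here are illustrative.
import json
import os
import subprocess
import sys
import threading
from datetime import datetime
from pathlib import Path


def start_background_job(cmd, work_dir: Path) -> int:
    """Start cmd detached from this process and track it via a status file."""
    work_dir.mkdir(parents=True, exist_ok=True)
    log_file = work_dir / "job_log.txt"
    status_file = work_dir / "status.json"

    def write_status(state: str, detail: str = "") -> None:
        status_file.write_text(json.dumps({
            "status": state,
            "progress": detail,
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        }, indent=2))

    write_status("running", "starting")
    process = subprocess.Popen(
        cmd,
        stdout=open(log_file, "w"),
        stderr=subprocess.STDOUT,
        env={**os.environ, "PYTHONUNBUFFERED": "1"},
        start_new_session=True,  # detach (POSIX): closing the UI does not kill the child
    )

    def monitor():
        process.wait()
        write_status("completed" if process.returncode == 0 else "failed",
                     f"exit code {process.returncode}")

    # Daemon thread: it never blocks app shutdown, so the status file is the
    # durable record of what happened.
    threading.Thread(target=monitor, daemon=True).start()
    return process.pid


if __name__ == "__main__":
    pid = start_background_job([sys.executable, "-c", "print('hello')"], Path("demo_job"))
    print("started pid", pid)
```

The same trade-off appears to hold in app.py: if the web app itself restarts, the in-process monitor thread is lost, which is presumably why "Check Status" reads the status file on disk rather than any in-memory state.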