Commit 176a6d5 by MedGRPO Team · 1 parent: 4752404

Files changed (2):
  1. README.md +14 -2
  2. app.py +244 -98
README.md CHANGED
@@ -163,17 +163,29 @@ If your submission was evaluated with `--skip-llm-judge` (DVC_llm, VS_llm, RC_llm
 1. Go to the **Leaderboard** tab
 2. Scroll to the **"Run LLM Judge Evaluation"** section
 3. Enter your model name (exact match)
-4. Click **"Run LLM Judge"**
+4. Click **"Start Evaluation"**
 
 The system will:
+- Start evaluation in the background (runs independently)
 - Re-run evaluation for DVC/VS/RC tasks with LLM judge (GPT-4.1/Gemini)
-- Update your leaderboard entry with caption metrics
+- Automatically update your leaderboard entry when complete
 - Preserve all other metrics (TAL, STG, NAP, SA, CVS)
 
+**✅ Background Execution**:
+- You can **close the browser** after starting - evaluation continues running
+- Come back later and click **"Check Status"** to see progress
+- The leaderboard will be automatically updated when complete
+
 **Time**: ~10-20 minutes depending on API rate limits
 
 **Availability**: Only available when ALL three caption metrics are 0.0
 
+**How to Check Status**:
+1. Enter the same model name
+2. Click **"Check Status"** button
+3. View recent logs and progress
+4. Or simply refresh the leaderboard to see if metrics are updated
+
 ## Evaluation Metrics
 
 ### Task-Specific Metrics
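The "Check Status" flow above is backed by a small JSON status file that `app.py` writes next to the model's results (see the `update_llm_judge_status` helper in the app.py diff below). A minimal sketch of what polling it could look like, with `results/My_Model` as a stand-in for the app's actual `RESULTS_DIR` layout:

```python
import json
import time
from pathlib import Path

# Stand-in path: app.py stores status at RESULTS_DIR/<model name with
# spaces replaced by underscores>/llm_judge_status.json.
status_file = Path("results/My_Model/llm_judge_status.json")

while True:
    if not status_file.exists():
        print("not_started: no LLM judge evaluation in progress")
        break
    data = json.loads(status_file.read_text())
    state = data.get("status", "not_started")
    if state == "running":
        print("still running:", data.get("progress", ""))
        time.sleep(60)  # a full run takes ~10-20 minutes, so poll sparingly
        continue
    # 'completed' carries a metrics summary in 'progress'; 'failed' carries 'error'
    print(state, data.get("progress") or data.get("error", ""))
    break
```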
app.py CHANGED
@@ -1298,14 +1298,67 @@ def check_needs_llm_judge(model_name: str) -> Tuple[bool, str]:
     return False, "Caption metrics already computed"
 
 
+def check_llm_judge_status(model_name: str) -> Tuple[str, str]:
+    """
+    Check the status of an ongoing LLM judge evaluation.
+
+    Returns:
+        (status, message)
+        status: 'not_started', 'running', 'completed', 'failed'
+    """
+    model_dir = RESULTS_DIR / model_name.replace(" ", "_")
+    status_file = model_dir / "llm_judge_status.json"
+
+    if not status_file.exists():
+        return 'not_started', 'No LLM judge evaluation in progress'
+
+    try:
+        with open(status_file, 'r') as f:
+            status_data = json.load(f)
+
+        status = status_data.get('status', 'not_started')
+        progress = status_data.get('progress', '')
+        timestamp = status_data.get('timestamp', '')
+
+        if status == 'running':
+            return 'running', f"Evaluation in progress: {progress}\nStarted: {timestamp}"
+        elif status == 'completed':
+            return 'completed', f"Evaluation completed: {timestamp}"
+        elif status == 'failed':
+            error = status_data.get('error', 'Unknown error')
+            return 'failed', f"Evaluation failed: {error}"
+        else:
+            return 'not_started', 'No evaluation in progress'
+    except Exception as e:
+        return 'not_started', f"Error reading status: {e}"
+
+
+def update_llm_judge_status(model_name: str, status: str, progress: str = "", error: str = ""):
+    """Update the LLM judge evaluation status file."""
+    model_dir = RESULTS_DIR / model_name.replace(" ", "_")
+    status_file = model_dir / "llm_judge_status.json"
+
+    status_data = {
+        'status': status,
+        'progress': progress,
+        'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+    }
+
+    if error:
+        status_data['error'] = error
+
+    with open(status_file, 'w') as f:
+        json.dump(status_data, f, indent=2)
+
+
 def run_llm_judge_evaluation(model_name: str, progress=gr.Progress()) -> str:
     """
-    Run LLM judge evaluation for DVC/VS/RC tasks on a previously submitted model.
+    Start LLM judge evaluation in the background for DVC/VS/RC tasks.
 
     This function:
-    1. Loads the original predictions from results directory
-    2. Re-runs evaluation WITH LLM judge (no --skip-llm-judge flag)
-    3. Updates the leaderboard with new caption metrics
+    1. Validates the model and checks if evaluation is needed
+    2. Starts background evaluation process (can close browser)
+    3. Returns immediately with status information
 
     Args:
         model_name: Name of the model to re-evaluate
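Together, `check_llm_judge_status` and `update_llm_judge_status` form a small file-based state machine (`not_started` → `running` → `completed`/`failed`). A self-contained sketch of that round-trip, with a local `RESULTS_DIR` standing in for the app's constant:

```python
import json
from datetime import datetime
from pathlib import Path

RESULTS_DIR = Path("results")  # stand-in for app.py's RESULTS_DIR constant

def write_status(model_name: str, status: str, progress: str = "", error: str = "") -> None:
    """Mirror of update_llm_judge_status: persist state for later requests."""
    model_dir = RESULTS_DIR / model_name.replace(" ", "_")
    model_dir.mkdir(parents=True, exist_ok=True)
    data = {
        "status": status,
        "progress": progress,
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }
    if error:
        data["error"] = error
    (model_dir / "llm_judge_status.json").write_text(json.dumps(data, indent=2))

def read_status(model_name: str) -> str:
    """Mirror of check_llm_judge_status, reduced to the state string."""
    path = RESULTS_DIR / model_name.replace(" ", "_") / "llm_judge_status.json"
    if not path.exists():
        return "not_started"
    return json.loads(path.read_text()).get("status", "not_started")

# The transitions the app walks through:
write_status("demo model", "running", "Starting evaluation...")
assert read_status("demo model") == "running"
write_status("demo model", "completed", "DVC: 0.5, VS: 0.5, RC: 0.5")
assert read_status("demo model") == "completed"
```

The file also acts as a coarse lock: as the next hunk shows, `run_llm_judge_evaluation` refuses to start a second run while the recorded state is `'running'`.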
@@ -1320,8 +1373,15 @@ def run_llm_judge_evaluation(model_name: str, progress=gr.Progress()) -> str:
         if not needs_llm:
             return f"❌ {msg}"
 
-        progress(0.1, desc="Loading predictions...")
-        yield f"🔍 **Step 1/4**: Checking model predictions...\n\n"
+        # Check if evaluation is already running
+        status, status_msg = check_llm_judge_status(model_name)
+        if status == 'running':
+            return f"⏳ **Evaluation Already Running**\n\n{status_msg}\n\nCheck status by refreshing or clicking 'Check Status' button."
+        elif status == 'completed':
+            return f"✓ **Already Completed**\n\n{status_msg}\n\nRefresh the leaderboard to see results."
+
+        progress(0.1, desc="Validating...")
+        yield f"🔍 **Validation**: Checking model predictions...\n\n"
 
         # Find the predictions file
         model_dir = RESULTS_DIR / model_name.replace(" ", "_")
@@ -1333,13 +1393,19 @@ def run_llm_judge_evaluation(model_name: str, progress=gr.Progress()) -> str:
 
         yield f"✓ Found predictions file\n\n"
 
-        # Run evaluation WITH LLM judge
-        progress(0.2, desc="Running LLM judge evaluation...")
-        yield f"⚙️ **Step 2/4**: Running LLM judge evaluation (DVC/VS/RC)...\n\n"
-        yield f"⏳ This may take 5-15 minutes depending on API rate limits...\n\n"
+        # Update status to running
+        update_llm_judge_status(model_name, 'running', 'Starting evaluation...')
+
+        # Start background process
+        progress(0.2, desc="Starting background evaluation...")
+        yield f"🚀 **Starting Background Evaluation**\n\n"
+        yield f"⏳ This will take 10-20 minutes depending on API rate limits\n\n"
+        yield f"✅ **You can close this browser tab** - evaluation runs in background\n\n"
 
         eval_wrapper = Path("evaluation/evaluate_predictions.py")
+        log_file = model_dir / "eval_llm_judge_log.txt"
 
+        # Build command for background execution
         cmd = [
             sys.executable,
             "-u",
@@ -1350,128 +1416,197 @@ def run_llm_judge_evaluation(model_name: str, progress=gr.Progress()) -> str:
             # NOTE: No --skip-llm-judge flag, so LLM judge will run
         ]
 
+        # Start process in background (detached)
+        with open(log_file, 'w') as log_f:
+            log_f.write(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
+            log_f.write(f"Command: {' '.join(cmd)}\n")
+            log_f.write("="*60 + "\n\n")
+
+        # Launch background process that continues after app closes
         process = subprocess.Popen(
             cmd,
-            stdout=subprocess.PIPE,
+            stdout=open(log_file, 'a'),
             stderr=subprocess.STDOUT,
             text=True,
-            bufsize=1,
-            env={**os.environ, "PYTHONUNBUFFERED": "1"}
+            env={**os.environ, "PYTHONUNBUFFERED": "1"},
+            start_new_session=True  # Detach from parent process
         )
 
-        # Stream logs
-        import time
-        log_buffer = []
-        last_update = time.time()
-        line_count = 0
-        import select
-
-        while True:
-            if process.poll() is not None:
-                remaining = process.stdout.read()
-                if remaining:
-                    for line in remaining.split('\n'):
-                        line = line.rstrip()
-                        if line.strip() and 'WARNING: All log messages' not in line:
-                            log_buffer.append(line)
-                break
+        # Save PID for tracking
+        pid_file = model_dir / "llm_judge_pid.txt"
+        with open(pid_file, 'w') as f:
+            f.write(str(process.pid))
 
-            ready, _, _ = select.select([process.stdout], [], [], 0.5)
+        progress(0.5, desc="Background process started...")
 
-            if ready:
-                line = process.stdout.readline()
-                if not line:
-                    break
+        success_msg = f"""
+---
 
-                line = line.rstrip()
-                if not line.strip() or 'WARNING: All log messages' in line:
-                    continue
+## ✅ Background Evaluation Started!
 
-                log_buffer.append(line)
-                line_count += 1
+**Model**: {model_name}
+**Process ID**: {process.pid}
+**Started**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
 
-            # Update UI every 1 second
-            if time.time() - last_update > 1.0:
-                if log_buffer:
-                    recent = log_buffer[-20:]
-                    log_text = f"⚙️ **Step 2/4**: Running LLM judge evaluation...\n\n```\n"
-                    log_text += '\n'.join(recent)
-                    log_text += "\n```"
-                    yield log_text
+### ⏳ Evaluation Progress
 
-                last_update = time.time()
-                progress_val = min(0.8, 0.2 + (line_count / 200) * 0.60)
-                progress(progress_val, desc="Running LLM judge...")
+The evaluation is now running in the background. This will take approximately 10-20 minutes.
 
-        process.wait()
+### 📋 What's Happening
 
-        if process.returncode != 0:
-            yield f"\n❌ Evaluation failed (exit code {process.returncode})"
-            return
+1. ⚙️ Running LLM judge on DVC/VS/RC tasks
+2. 🔄 Using GPT-4 with retry logic (up to 5 attempts per sample)
+3. 📊 Will automatically update leaderboard when complete
 
-        # Parse metrics
-        progress(0.85, desc="Extracting metrics...")
-        yield f"⚙️ **Step 3/4**: Extracting caption metrics...\n\n"
+### ✅ You Can Now:
 
-        full_output = '\n'.join(log_buffer)
-        metrics = parse_evaluation_output(full_output)
+- ✓ **Close this browser tab** - evaluation continues running
+- ✓ Come back later and check status using "Check Status" button
+- ✓ Refresh the leaderboard in 10-20 minutes to see results
 
-        # Save updated output
-        with open(model_dir / "eval_output_llm_judge.txt", 'w') as f:
-            f.write(full_output)
+### 🔍 Check Status Later
 
-        # Extract caption metrics
-        dvc_llm = metrics.get('dvc_llm', 0.0)
-        vs_llm = metrics.get('vs_llm', 0.0)
-        rc_llm = metrics.get('rc_llm', 0.0)
+1. Enter the same model name: `{model_name}`
+2. Click "Check Status" button
+3. Or refresh the leaderboard to see if metrics are updated
 
-        if dvc_llm == 0.0 and vs_llm == 0.0 and rc_llm == 0.0:
-            yield f"❌ Failed to extract caption metrics from evaluation output"
-            return
+### 📝 Logs
 
-        yield f"✓ Caption metrics extracted:\n"
-        yield f"  - DVC_llm: {dvc_llm:.4f}\n"
-        yield f"  - VS_llm: {vs_llm:.4f}\n"
-        yield f"  - RC_llm: {rc_llm:.4f}\n\n"
+Evaluation logs are being written to:
+`{log_file}`
+"""
 
-        # Update leaderboard
-        progress(0.95, desc="Updating leaderboard...")
-        yield f"⚙️ **Step 4/4**: Updating leaderboard...\n\n"
+        yield success_msg
 
-        df = load_leaderboard()
+        # Start background monitor thread to update status and leaderboard when complete
+        import threading
 
-        # Update caption metrics
-        df.loc[df['model_name'] == model_name, 'dvc_llm'] = round(dvc_llm, 4)
-        df.loc[df['model_name'] == model_name, 'vs_llm'] = round(vs_llm, 4)
-        df.loc[df['model_name'] == model_name, 'rc_llm'] = round(rc_llm, 4)
+        def monitor_and_update():
+            """Monitor background process and update leaderboard when complete."""
+            try:
+                # Wait for process to complete
+                process.wait()
+
+                # Read final output
+                with open(log_file, 'r') as f:
+                    full_output = f.read()
+
+                if process.returncode == 0:
+                    # Parse metrics
+                    metrics = parse_evaluation_output(full_output)
+
+                    dvc_llm = metrics.get('dvc_llm', 0.0)
+                    vs_llm = metrics.get('vs_llm', 0.0)
+                    rc_llm = metrics.get('rc_llm', 0.0)
+
+                    if dvc_llm > 0.0 or vs_llm > 0.0 or rc_llm > 0.0:
+                        # Update leaderboard
+                        df = load_leaderboard()
+                        df.loc[df['model_name'] == model_name, 'dvc_llm'] = round(dvc_llm, 4)
+                        df.loc[df['model_name'] == model_name, 'vs_llm'] = round(vs_llm, 4)
+                        df.loc[df['model_name'] == model_name, 'rc_llm'] = round(rc_llm, 4)
+                        df = df.sort_values('cvs_acc', ascending=False).reset_index(drop=True)
+                        save_leaderboard(df)
+
+                        # Update status to completed
+                        update_llm_judge_status(
+                            model_name,
+                            'completed',
+                            f"DVC: {dvc_llm:.4f}, VS: {vs_llm:.4f}, RC: {rc_llm:.4f}"
+                        )
+                    else:
+                        update_llm_judge_status(model_name, 'failed', 'Failed to extract metrics')
+                else:
+                    update_llm_judge_status(model_name, 'failed', f'Exit code {process.returncode}')
 
-        # Re-sort by first metric
-        df = df.sort_values('cvs_acc', ascending=False).reset_index(drop=True)
+            except Exception as e:
+                update_llm_judge_status(model_name, 'failed', str(e))
 
-        save_leaderboard(df)
+        # Start monitor thread (daemon so it doesn't block app shutdown)
+        monitor_thread = threading.Thread(target=monitor_and_update, daemon=True)
+        monitor_thread.start()
 
-        progress(1.0, desc="Complete!")
+    except Exception as e:
+        update_llm_judge_status(model_name, 'failed', str(e))
+        yield f"❌ Error starting LLM judge evaluation: {str(e)}"
 
-        success_msg = f"""
----
 
-## ✅ LLM Judge Evaluation Complete!
+def check_llm_judge_evaluation_status(model_name: str) -> str:
+    """Check and display status of LLM judge evaluation."""
+    if not model_name or not model_name.strip():
+        return "❌ Please enter a model name"
+
+    status, msg = check_llm_judge_status(model_name.strip())
+
+    if status == 'not_started':
+        return f"ℹ️ **No Evaluation Running**\n\n{msg}"
+    elif status == 'running':
+        model_dir = RESULTS_DIR / model_name.replace(" ", "_")
+        log_file = model_dir / "eval_llm_judge_log.txt"
+
+        # Read last 30 lines of log
+        try:
+            with open(log_file, 'r') as f:
+                lines = f.readlines()
+                recent_lines = lines[-30:]
+
+            log_preview = ''.join(recent_lines)
+
+            return f"""
+## ⏳ Evaluation Running
+
+**Model**: {model_name}
+**Status**: {msg}
+
+### 📝 Recent Logs (last 30 lines)
+
+```
+{log_preview}
+```
+
+**Note**: Refresh this page or click "Check Status" again for updates.
+"""
+        except Exception as e:
+            return f"⏳ **Evaluation Running**\n\n{msg}\n\n⚠️ Unable to read logs: {e}"
+
+    elif status == 'completed':
+        # Check if leaderboard was updated
+        df = load_leaderboard()
+        if model_name in df['model_name'].values:
+            row = df[df['model_name'] == model_name].iloc[0]
+            dvc = row.get('dvc_llm', 0.0)
+            vs = row.get('vs_llm', 0.0)
+            rc = row.get('rc_llm', 0.0)
+
+            return f"""
+## ✅ Evaluation Complete!
 
 **Model**: {model_name}
+**Completed**: {msg}
 
-### 📈 Updated Caption Metrics
-- **DVC_llm**: {dvc_llm:.4f}
-- **VS_llm**: {vs_llm:.4f}
-- **RC_llm**: {rc_llm:.4f}
+### 📈 Caption Metrics
+- **DVC_llm**: {dvc:.4f}
+- **VS_llm**: {vs:.4f}
+- **RC_llm**: {rc:.4f}
 
-✓ Leaderboard updated successfully!
+✓ Leaderboard has been updated!
 
 Refresh the Leaderboard tab to see updated rankings.
 """
-        yield success_msg
+        else:
+            return f"✓ **Evaluation Complete**\n\n{msg}\n\n⚠️ Model not found in leaderboard"
 
-    except Exception as e:
-        yield f"❌ Error running LLM judge evaluation: {str(e)}"
+    elif status == 'failed':
+        return f"""
+## ❌ Evaluation Failed
+
+**Model**: {model_name}
+**Error**: {msg}
+
+Please check the logs or try running the evaluation again.
+"""
+
+    return f"ℹ️ **Status**: {status}\n\n{msg}"
 
 
 # Create Gradio interface
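The core pattern in this hunk is launch-and-monitor: write a log header, redirect the child's output to the log file, detach the child with `start_new_session=True` so it outlives the request, and let a daemon thread do the bookkeeping when the child exits. A stripped-down, runnable sketch of that pattern (command and paths are placeholders, not the app's real ones):

```python
import os
import subprocess
import sys
import threading
import time
from pathlib import Path

def launch_detached(cmd, log_file: Path, on_done):
    """Start cmd in its own session, logging to log_file; call on_done(rc) when it exits."""
    log_f = open(log_file, "w")  # handle is inherited by the child
    proc = subprocess.Popen(
        cmd,
        stdout=log_f,
        stderr=subprocess.STDOUT,
        env={**os.environ, "PYTHONUNBUFFERED": "1"},
        start_new_session=True,  # new session/process group: survives the parent request
    )

    def monitor():
        proc.wait()  # blocks only this daemon thread, never the UI
        log_f.close()
        on_done(proc.returncode)

    threading.Thread(target=monitor, daemon=True).start()
    return proc

# Usage: a placeholder child process standing in for evaluate_predictions.py.
proc = launch_detached(
    [sys.executable, "-u", "-c", "print('hello from the background')"],
    Path("background.log"),
    lambda rc: print(f"monitor: child exited with code {rc}"),
)
print("launched pid", proc.pid)
time.sleep(2)  # keep the demo alive long enough for the daemon thread to report
```

One trade-off of this design: if the app process itself restarts mid-run, the daemon monitor dies with it, so the status file can remain at `'running'` even though nothing will update the leaderboard for that run.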
@@ -1548,6 +1683,8 @@ with gr.Blocks(title="MedVidBench Leaderboard", theme=gr.themes.Soft()) as demo:
     If a model was submitted with `--skip-llm-judge` (caption metrics are 0.0), you can run LLM judge evaluation here.
     This will compute DVC_llm, VS_llm, and RC_llm scores using GPT-4.1/Gemini.
 
+    **✅ Background Execution**: The evaluation runs in the background - you can close the browser and come back later!
+
     **Note**: This feature is only available when ALL three caption metrics (DVC_llm, VS_llm, RC_llm) are 0.0.
     """)
 
@@ -1557,9 +1694,11 @@ with gr.Blocks(title="MedVidBench Leaderboard", theme=gr.themes.Soft()) as demo:
             placeholder="Enter exact model name from leaderboard",
             scale=3
         )
-        run_llm_judge_btn = gr.Button("🚀 Run LLM Judge", variant="primary", scale=1)
+        with gr.Column(scale=1):
+            run_llm_judge_btn = gr.Button("🚀 Start Evaluation", variant="primary")
+            check_status_btn = gr.Button("🔍 Check Status", variant="secondary")
 
-    llm_judge_output = gr.Markdown(label="LLM Judge Status")
+    llm_judge_output = gr.Markdown(label="Evaluation Status")
 
     # Wire up LLM judge evaluation
     run_llm_judge_btn.click(
@@ -1568,6 +1707,13 @@ with gr.Blocks(title="MedVidBench Leaderboard", theme=gr.themes.Soft()) as demo:
         outputs=llm_judge_output
     )
 
+    # Wire up status check
+    check_status_btn.click(
+        fn=check_llm_judge_evaluation_status,
+        inputs=[llm_judge_model_input],
+        outputs=llm_judge_output
+    )
+
     # Tab 2: Submit
     with gr.Tab("📤 Submit Results"):
         gr.Markdown("""
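For context on the UI wiring that closes the diff: two buttons are bound to the same Markdown output, one starting work and one polling it. A minimal self-contained Gradio sketch of that shape (the handlers are placeholders, not the app's logic):

```python
import gradio as gr

def start_evaluation(model_name: str) -> str:
    # Placeholder: app.py validates the model and launches the detached process here.
    return f"🚀 Started background evaluation for **{model_name}**"

def check_status(model_name: str) -> str:
    # Placeholder: app.py reads llm_judge_status.json and tails the log file here.
    return f"⏳ Evaluation for **{model_name}** is still running"

with gr.Blocks() as demo:
    model_input = gr.Textbox(
        label="Model Name",
        placeholder="Enter exact model name from leaderboard",
    )
    with gr.Column():
        start_btn = gr.Button("🚀 Start Evaluation", variant="primary")
        status_btn = gr.Button("🔍 Check Status", variant="secondary")
    output = gr.Markdown()

    # Both buttons write into the same Markdown component, mirroring
    # run_llm_judge_btn / check_status_btn in app.py.
    start_btn.click(fn=start_evaluation, inputs=model_input, outputs=output)
    status_btn.click(fn=check_status, inputs=model_input, outputs=output)

if __name__ == "__main__":
    demo.launch()
```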