MedGRPO Team committed · Commit 176a6d5 · 1 Parent(s): 4752404 · update
README.md CHANGED

@@ -163,17 +163,29 @@ If your submission was evaluated with `--skip-llm-judge` (DVC_llm, VS_llm, RC_llm
 1. Go to the **Leaderboard** tab
 2. Scroll to the **"Run LLM Judge Evaluation"** section
 3. Enter your model name (exact match)
-4. Click **"
+4. Click **"Start Evaluation"**
 
 The system will:
+- Start evaluation in the background (runs independently)
 - Re-run evaluation for DVC/VS/RC tasks with LLM judge (GPT-4.1/Gemini)
+- Automatically update your leaderboard entry when complete
 - Preserve all other metrics (TAL, STG, NAP, SA, CVS)
 
+**✅ Background Execution**:
+- You can **close the browser** after starting - evaluation continues running
+- Come back later and click **"Check Status"** to see progress
+- The leaderboard will be automatically updated when complete
+
 **Time**: ~10-20 minutes depending on API rate limits
 
 **Availability**: Only available when ALL three caption metrics are 0.0
 
+**How to Check Status**:
+1. Enter the same model name
+2. Click **"Check Status"** button
+3. View recent logs and progress
+4. Or simply refresh the leaderboard to see if metrics are updated
+
 ## Evaluation Metrics
 
 ### Task-Specific Metrics
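Under the hood, "Check Status" reads a per-model `llm_judge_status.json` that the app.py changes below write (`status`, `progress`, `timestamp`, and an optional `error` field). A minimal sketch of inspecting that file outside the UI, assuming a local checkout; the `results/` directory stands in for the Space's actual `RESULTS_DIR` and the model name is made up:

```python
# Minimal sketch, not part of the commit: read the status file that
# update_llm_judge_status() in app.py (below) writes for a given model.
import json
from pathlib import Path

RESULTS_DIR = Path("results")  # assumption: adjust to the Space's actual results dir

def read_llm_judge_status(model_name: str) -> dict:
    """Return the raw status dict, or a default if no evaluation was started."""
    status_file = RESULTS_DIR / model_name.replace(" ", "_") / "llm_judge_status.json"
    if not status_file.exists():
        return {"status": "not_started"}
    with open(status_file) as f:
        return json.load(f)

if __name__ == "__main__":
    info = read_llm_judge_status("MyModel")  # hypothetical model name
    print(info.get("status"), info.get("progress", ""))
```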
app.py CHANGED

@@ -1298,14 +1298,67 @@ def check_needs_llm_judge(model_name: str) -> Tuple[bool, str]:
     return False, "Caption metrics already computed"
 
 
+def check_llm_judge_status(model_name: str) -> Tuple[str, str]:
+    """
+    Check the status of an ongoing LLM judge evaluation.
+
+    Returns:
+        (status, message)
+        status: 'not_started', 'running', 'completed', 'failed'
+    """
+    model_dir = RESULTS_DIR / model_name.replace(" ", "_")
+    status_file = model_dir / "llm_judge_status.json"
+
+    if not status_file.exists():
+        return 'not_started', 'No LLM judge evaluation in progress'
+
+    try:
+        with open(status_file, 'r') as f:
+            status_data = json.load(f)
+
+        status = status_data.get('status', 'not_started')
+        progress = status_data.get('progress', '')
+        timestamp = status_data.get('timestamp', '')
+
+        if status == 'running':
+            return 'running', f"Evaluation in progress: {progress}\nStarted: {timestamp}"
+        elif status == 'completed':
+            return 'completed', f"Evaluation completed: {timestamp}"
+        elif status == 'failed':
+            error = status_data.get('error', 'Unknown error')
+            return 'failed', f"Evaluation failed: {error}"
+        else:
+            return 'not_started', 'No evaluation in progress'
+    except Exception as e:
+        return 'not_started', f"Error reading status: {e}"
+
+
+def update_llm_judge_status(model_name: str, status: str, progress: str = "", error: str = ""):
+    """Update the LLM judge evaluation status file."""
+    model_dir = RESULTS_DIR / model_name.replace(" ", "_")
+    status_file = model_dir / "llm_judge_status.json"
+
+    status_data = {
+        'status': status,
+        'progress': progress,
+        'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+    }
+
+    if error:
+        status_data['error'] = error
+
+    with open(status_file, 'w') as f:
+        json.dump(status_data, f, indent=2)
+
+
 def run_llm_judge_evaluation(model_name: str, progress=gr.Progress()) -> str:
     """
+    Start LLM judge evaluation in the background for DVC/VS/RC tasks.
 
     This function:
-    1.
-    2.
-    3.
+    1. Validates the model and checks if evaluation is needed
+    2. Starts background evaluation process (can close browser)
+    3. Returns immediately with status information
 
     Args:
         model_name: Name of the model to re-evaluate
 
@@ -1320,8 +1373,15 @@ def run_llm_judge_evaluation(model_name: str, progress=gr.Progress()) -> str:
     if not needs_llm:
         return f"❌ {msg}"
 
+    # Check if evaluation is already running
+    status, status_msg = check_llm_judge_status(model_name)
+    if status == 'running':
+        return f"⏳ **Evaluation Already Running**\n\n{status_msg}\n\nCheck status by refreshing or clicking 'Check Status' button."
+    elif status == 'completed':
+        return f"✅ **Already Completed**\n\n{status_msg}\n\nRefresh the leaderboard to see results."
+
+    progress(0.1, desc="Validating...")
+    yield f"🔍 **Validation**: Checking model predictions...\n\n"
 
     # Find the predictions file
     model_dir = RESULTS_DIR / model_name.replace(" ", "_")
 
@@ -1333,13 +1393,19 @@ def run_llm_judge_evaluation(model_name: str, progress=gr.Progress()) -> str:
 
     yield f"✅ Found predictions file\n\n"
 
-    #
+    # Update status to running
+    update_llm_judge_status(model_name, 'running', 'Starting evaluation...')
+
+    # Start background process
+    progress(0.2, desc="Starting background evaluation...")
+    yield f"🚀 **Starting Background Evaluation**\n\n"
+    yield f"⏳ This will take 10-20 minutes depending on API rate limits\n\n"
+    yield f"✅ **You can close this browser tab** - evaluation runs in background\n\n"
 
     eval_wrapper = Path("evaluation/evaluate_predictions.py")
+    log_file = model_dir / "eval_llm_judge_log.txt"
 
+    # Build command for background execution
     cmd = [
         sys.executable,
         "-u",
 
@@ -1350,128 +1416,197 @@ def run_llm_judge_evaluation(model_name: str, progress=gr.Progress()) -> str:
         # NOTE: No --skip-llm-judge flag, so LLM judge will run
     ]
 
+    # Start process in background (detached)
+    with open(log_file, 'w') as log_f:
+        log_f.write(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
+        log_f.write(f"Command: {' '.join(cmd)}\n")
+        log_f.write("="*60 + "\n\n")
+
+    # Launch background process that continues after app closes
     process = subprocess.Popen(
         cmd,
-        stdout=
+        stdout=open(log_file, 'a'),
         stderr=subprocess.STDOUT,
         text=True,
+        env={**os.environ, "PYTHONUNBUFFERED": "1"},
+        start_new_session=True  # Detach from parent process
    )
 
-    #
-    line_count = 0
-    import select
-    while True:
-        if process.poll() is not None:
-            remaining = process.stdout.read()
-            if remaining:
-                for line in remaining.split('\n'):
-                    line = line.rstrip()
-                    if line.strip() and 'WARNING: All log messages' not in line:
-                        log_buffer.append(line)
-            break
-        if not line:
-            break
-        if not line.strip() or 'WARNING: All log messages' in line:
-            continue
-        if time.time() - last_update > 1.0:
-            if log_buffer:
-                recent = log_buffer[-20:]
-                log_text = f"⚙️ **Step 2/4**: Running LLM judge evaluation...\n\n```\n"
-                log_text += '\n'.join(recent)
-                log_text += "\n```"
-                yield log_text
-            progress_val = min(0.8, 0.2 + (line_count / 200) * 0.60)
-            progress(progress_val, desc="Running LLM judge...")
-    progress(0.85, desc="Extracting metrics...")
-    yield f"⚙️ **Step 3/4**: Extracting caption metrics...\n\n"
-    with open(model_dir / "eval_output_llm_judge.txt", 'w') as f:
-        f.write(full_output)
-    rc_llm = metrics.get('rc_llm', 0.0)
-        yield f"❌ Failed to extract caption metrics from evaluation output"
-        return
-    yield f" - RC_llm: {rc_llm:.4f}\n\n"
-    progress(0.95, desc="Updating leaderboard...")
-    yield f"⚙️ **Step 4/4**: Updating leaderboard...\n\n"
-    success_msg = f"""
-    ---
+    # Save PID for tracking
+    pid_file = model_dir / "llm_judge_pid.txt"
+    with open(pid_file, 'w') as f:
+        f.write(str(process.pid))
+
+    progress(0.5, desc="Background process started...")
+
+    success_msg = f"""
+---
+
+## ✅ Background Evaluation Started!
+
+**Model**: {model_name}
+**Process ID**: {process.pid}
+**Started**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+
+### ⏳ Evaluation Progress
+
+The evaluation is now running in the background. This will take approximately 10-20 minutes.
+
+### 📋 What's Happening
+
+1. ⚙️ Running LLM judge on DVC/VS/RC tasks
+2. 🔄 Using GPT-4 with retry logic (up to 5 attempts per sample)
+3. 📊 Will automatically update leaderboard when complete
+
+### ✅ You Can Now:
+
+- ✅ **Close this browser tab** - evaluation continues running
+- ✅ Come back later and check status using "Check Status" button
+- ✅ Refresh the leaderboard in 10-20 minutes to see results
+
+### 🔍 Check Status Later
+
+1. Enter the same model name: `{model_name}`
+2. Click "Check Status" button
+3. Or refresh the leaderboard to see if metrics are updated
+
+### 📄 Logs
+
+Evaluation logs are being written to:
+`{log_file}`
+"""
+
+    yield success_msg
+
+    # Start background monitor thread to update status and leaderboard when complete
+    import threading
+
+    def monitor_and_update():
+        """Monitor background process and update leaderboard when complete."""
+        try:
+            # Wait for process to complete
+            process.wait()
+
+            # Read final output
+            with open(log_file, 'r') as f:
+                full_output = f.read()
+
+            if process.returncode == 0:
+                # Parse metrics
+                metrics = parse_evaluation_output(full_output)
+
+                dvc_llm = metrics.get('dvc_llm', 0.0)
+                vs_llm = metrics.get('vs_llm', 0.0)
+                rc_llm = metrics.get('rc_llm', 0.0)
+
+                if dvc_llm > 0.0 or vs_llm > 0.0 or rc_llm > 0.0:
+                    # Update leaderboard
+                    df = load_leaderboard()
+                    df.loc[df['model_name'] == model_name, 'dvc_llm'] = round(dvc_llm, 4)
+                    df.loc[df['model_name'] == model_name, 'vs_llm'] = round(vs_llm, 4)
+                    df.loc[df['model_name'] == model_name, 'rc_llm'] = round(rc_llm, 4)
+                    df = df.sort_values('cvs_acc', ascending=False).reset_index(drop=True)
+                    save_leaderboard(df)
+
+                    # Update status to completed
+                    update_llm_judge_status(
+                        model_name,
+                        'completed',
+                        f"DVC: {dvc_llm:.4f}, VS: {vs_llm:.4f}, RC: {rc_llm:.4f}"
+                    )
+                else:
+                    update_llm_judge_status(model_name, 'failed', 'Failed to extract metrics')
+            else:
+                update_llm_judge_status(model_name, 'failed', f'Exit code {process.returncode}')
+
+        except Exception as e:
+            update_llm_judge_status(model_name, 'failed', str(e))
+
+    # Start monitor thread (daemon so it doesn't block app shutdown)
+    monitor_thread = threading.Thread(target=monitor_and_update, daemon=True)
+    monitor_thread.start()
+
+    except Exception as e:
+        update_llm_judge_status(model_name, 'failed', str(e))
+        yield f"❌ Error starting LLM judge evaluation: {str(e)}"
+
+
+def check_llm_judge_evaluation_status(model_name: str) -> str:
+    """Check and display status of LLM judge evaluation."""
+    if not model_name or not model_name.strip():
+        return "❌ Please enter a model name"
+
+    status, msg = check_llm_judge_status(model_name.strip())
+
+    if status == 'not_started':
+        return f"ℹ️ **No Evaluation Running**\n\n{msg}"
+    elif status == 'running':
+        model_dir = RESULTS_DIR / model_name.replace(" ", "_")
+        log_file = model_dir / "eval_llm_judge_log.txt"
+
+        # Read last 30 lines of log
+        try:
+            with open(log_file, 'r') as f:
+                lines = f.readlines()
+                recent_lines = lines[-30:]
+
+            log_preview = ''.join(recent_lines)
+
+            return f"""
+## ⏳ Evaluation Running
+
+**Model**: {model_name}
+**Status**: {msg}
+
+### 📋 Recent Logs (last 30 lines)
+
+```
+{log_preview}
+```
+
+**Note**: Refresh this page or click "Check Status" again for updates.
+"""
+        except Exception as e:
+            return f"⏳ **Evaluation Running**\n\n{msg}\n\n⚠️ Unable to read logs: {e}"
+
+    elif status == 'completed':
+        # Check if leaderboard was updated
+        df = load_leaderboard()
+        if model_name in df['model_name'].values:
+            row = df[df['model_name'] == model_name].iloc[0]
+            dvc = row.get('dvc_llm', 0.0)
+            vs = row.get('vs_llm', 0.0)
+            rc = row.get('rc_llm', 0.0)
+
+            return f"""
+## ✅ Evaluation Complete!
 
 **Model**: {model_name}
-### 📊
-- **DVC_llm**: {
-- **VS_llm**: {
-- **RC_llm**: {
-✅ Leaderboard updated
+**Completed**: {msg}
 
+### 📊 Caption Metrics
+- **DVC_llm**: {dvc:.4f}
+- **VS_llm**: {vs:.4f}
+- **RC_llm**: {rc:.4f}
 
+✅ Leaderboard has been updated!
 
 Refresh the Leaderboard tab to see updated rankings.
 """
+        else:
+            return f"✅ **Evaluation Complete**\n\n{msg}\n\n⚠️ Model not found in leaderboard"
+
+    elif status == 'failed':
+        return f"""
+## ❌ Evaluation Failed
+
+**Model**: {model_name}
+**Error**: {msg}
+
+Please check the logs or try running the evaluation again.
+"""
+
+    return f"ℹ️ **Status**: {status}\n\n{msg}"
 
 
 # Create Gradio interface
 
@@ -1548,6 +1683,8 @@ with gr.Blocks(title="MedVidBench Leaderboard", theme=gr.themes.Soft()) as demo:
         If a model was submitted with `--skip-llm-judge` (caption metrics are 0.0), you can run LLM judge evaluation here.
         This will compute DVC_llm, VS_llm, and RC_llm scores using GPT-4.1/Gemini.
 
+        **✅ Background Execution**: The evaluation runs in the background - you can close the browser and come back later!
+
         **Note**: This feature is only available when ALL three caption metrics (DVC_llm, VS_llm, RC_llm) are 0.0.
         """)
 
@@ -1557,9 +1694,11 @@ with gr.Blocks(title="MedVidBench Leaderboard", theme=gr.themes.Soft()) as demo:
                 placeholder="Enter exact model name from leaderboard",
                 scale=3
             )
+            with gr.Column(scale=1):
+                run_llm_judge_btn = gr.Button("🚀 Start Evaluation", variant="primary")
+                check_status_btn = gr.Button("🔍 Check Status", variant="secondary")
 
-        llm_judge_output = gr.Markdown(label="
+        llm_judge_output = gr.Markdown(label="Evaluation Status")
 
         # Wire up LLM judge evaluation
         run_llm_judge_btn.click(
 
@@ -1568,6 +1707,13 @@ with gr.Blocks(title="MedVidBench Leaderboard", theme=gr.themes.Soft()) as demo:
             outputs=llm_judge_output
        )
 
+        # Wire up status check
+        check_status_btn.click(
+            fn=check_llm_judge_evaluation_status,
+            inputs=[llm_judge_model_input],
+            outputs=llm_judge_output
+        )
+
    # Tab 2: Submit
    with gr.Tab("📤 Submit Results"):
        gr.Markdown("""
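All of the additions above lean on one small pattern: launch the evaluator as a detached subprocess that logs to a file, record its state in a JSON status file, and let a daemon thread update that state when the process exits. A minimal, self-contained sketch of the pattern, with hypothetical file names and a placeholder command rather than the Space's actual evaluation call:

```python
# Sketch only: the detach-and-monitor pattern used by run_llm_judge_evaluation
# above, reduced to its essentials. All names here are illustrative.
import json
import os
import subprocess
import sys
import threading
from datetime import datetime
from pathlib import Path


def start_background_job(cmd, work_dir: Path) -> int:
    """Start cmd detached from this process and track it via a status file."""
    work_dir.mkdir(parents=True, exist_ok=True)
    log_file = work_dir / "job_log.txt"
    status_file = work_dir / "status.json"

    def write_status(state: str, detail: str = "") -> None:
        status_file.write_text(json.dumps({
            "status": state,
            "progress": detail,
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        }, indent=2))

    write_status("running", "starting")
    process = subprocess.Popen(
        cmd,
        stdout=open(log_file, "w"),
        stderr=subprocess.STDOUT,
        env={**os.environ, "PYTHONUNBUFFERED": "1"},
        start_new_session=True,  # detach (POSIX): closing the UI does not kill the child
    )

    def monitor():
        process.wait()
        write_status("completed" if process.returncode == 0 else "failed",
                     f"exit code {process.returncode}")

    # Daemon thread: it never blocks app shutdown, so the status file is the
    # durable record of what happened.
    threading.Thread(target=monitor, daemon=True).start()
    return process.pid


if __name__ == "__main__":
    pid = start_background_job([sys.executable, "-c", "print('hello')"], Path("demo_job"))
    print("started pid", pid)
```

The same trade-off appears to hold in app.py: if the web app itself restarts, the in-process monitor thread is lost, which is presumably why "Check Status" reads the status file on disk rather than any in-memory state.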