MedGRPO Team and Claude Sonnet 4.5 committed · a36b7fe
Parent(s): 331979f

Update evaluation metrics and leaderboard display

- Modified app.py to fix metric definitions and display format
- Updated eval_dvc.py and eval_tal.py for consistent metric computation
- Aligned with standard evaluation pipeline

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
- app.py +158 -115
- evaluation/eval_dvc.py +186 -26
- evaluation/eval_tal.py +31 -10
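For reviewers, a minimal sketch (not part of the commit) of the column layout the updated leaderboard implies; the metric names below are copied from the `METRICS` dict added in app.py, everything else is illustrative:

    # Sketch: display header implied by the new METRICS dict in app.py.
    # Column order: Rank / Model / Organization, then the 10 metric names, then Date.
    metric_names = [
        "CVS_acc", "NAP_acc", "SA_acc", "STG_mIoU", "TAG_mIoU@0.3",
        "TAG_mIoU@0.5", "DVC_llm", "DVC_F1", "VS_llm", "RC_llm",
    ]
    header = ["Rank", "Model", "Organization"] + metric_names + ["Date"]
    print(" | ".join(header))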
app.py
CHANGED
@@ -25,58 +25,83 @@ EVAL_SCRIPT = Path("evaluation/evaluate_all_pai.py")  # Local copy in repo
 SUBMISSIONS_DIR.mkdir(exist_ok=True)
 RESULTS_DIR.mkdir(exist_ok=True)
 
-# MedGRPO…
-…
+# MedGRPO Metrics Definitions (10 metrics from 8 tasks)
+# Note: TAL has 2 metrics, DVC has 2 metrics, others have 1 metric each
+METRICS = {
+    "cvs_acc": {
+        "name": "CVS_acc",
+        "full_name": "CVS Assessment Accuracy",
+        "higher_better": True,
+        "description": "Clinical variable scoring accuracy"
+    },
+    "nap_acc": {
+        "name": "NAP_acc",
+        "full_name": "Next Action Prediction Accuracy",
+        "higher_better": True,
+        "description": "Accuracy in predicting next surgical step"
+    },
+    "sa_acc": {
+        "name": "SA_acc",
+        "full_name": "Skill Assessment Accuracy",
         "higher_better": True,
+        "description": "Surgical skill level evaluation accuracy"
     },
+    "stg_miou": {
+        "name": "STG_mIoU",
+        "full_name": "Spatiotemporal Grounding mIoU",
         "higher_better": True,
+        "description": "Mean IoU for spatial+temporal localization"
     },
+    "tag_miou_03": {
+        "name": "TAG_mIoU@0.3",
+        "full_name": "Temporal Action Grounding mIoU@0.3",
         "higher_better": True,
+        "description": "Mean IoU at threshold 0.3 for temporal localization"
     },
+    "tag_miou_05": {
+        "name": "TAG_mIoU@0.5",
+        "full_name": "Temporal Action Grounding mIoU@0.5",
         "higher_better": True,
+        "description": "Mean IoU at threshold 0.5 for temporal localization"
     },
+    "dvc_llm": {
+        "name": "DVC_llm",
+        "full_name": "Dense Video Captioning LLM Score",
         "higher_better": True,
+        "description": "Caption quality score (LLM judge or semantic similarity)"
     },
+    "dvc_f1": {
+        "name": "DVC_F1",
+        "full_name": "Dense Video Captioning F1",
         "higher_better": True,
+        "description": "F1 score for temporal segment localization"
     },
+    "vs_llm": {
+        "name": "VS_llm",
+        "full_name": "Video Summary LLM Score",
         "higher_better": True,
+        "description": "Video summary quality score"
     },
+    "rc_llm": {
+        "name": "RC_llm",
+        "full_name": "Region Caption LLM Score",
         "higher_better": True,
+        "description": "Region caption quality score"
     },
 }
 
+# Keep TASKS for backward compatibility and task descriptions
+TASKS = {
+    "tal": "Temporal Action Localization",
+    "stg": "Spatiotemporal Grounding",
+    "next_action": "Next Action Prediction",
+    "dvc": "Dense Video Captioning",
+    "vs": "Video Summary",
+    "rc": "Region Caption",
+    "skill_assessment": "Skill Assessment",
+    "cvs_assessment": "CVS Assessment",
+}
+
 # Test set statistics
 TEST_SET_STATS = {
     "total_samples": 6245,

@@ -92,13 +117,13 @@ def load_leaderboard() -> pd.DataFrame:
         data = json.load(f)
         if data:
             df = pd.DataFrame(data)
-            # Sort by…
-            if '…
-                df = df.sort_values('…
+            # Sort by first metric (CVS_acc) descending - no overall average
+            if 'cvs_acc' in df.columns:
+                df = df.sort_values('cvs_acc', ascending=False).reset_index(drop=True)
             return df
 
-    # Return empty dataframe with correct structure
-    columns = ["rank", "model_name", "organization"…
+    # Return empty dataframe with correct structure (no average column)
+    columns = ["rank", "model_name", "organization"] + list(METRICS.keys()) + ["date", "contact"]
     return pd.DataFrame(columns=columns)

@@ -218,15 +243,11 @@ def run_evaluation(results_file: str, model_name: str) -> Tuple[bool, Dict, str]
 
 def parse_evaluation_output(output: str) -> Dict[str, float]:
     """
-    Parse evaluation output…
-…
-    ================================================================================
-    Total samples: 1234
-    mAP@0.5: 0.4567
-    ...
+    Parse evaluation output to extract 10 metrics.
+
+    Returns dict with keys:
+        cvs_acc, nap_acc, sa_acc, stg_miou,
+        tag_miou_03, tag_miou_05, dvc_llm, dvc_f1, vs_llm, rc_llm
     """
     metrics = {}

@@ -237,62 +258,100 @@ def parse_evaluation_output(output: str) -> Dict[str, float]:
         line = line.strip()
 
         # Detect task headers
-        if "TAL…
+        if "TAL" in line and "Overall" in line:
             current_task = "tal"
-        elif "STG…
+        elif "STG" in line and "Overall" in line:
             current_task = "stg"
-        elif "NEXT_ACTION…
+        elif "NEXT_ACTION" in line and "Overall" in line or "Next Action" in line:
             current_task = "next_action"
-        elif "DVC…
+        elif "DVC" in line and "Overall" in line or "Dense Video Captioning" in line:
             current_task = "dvc"
-        elif "RC…
+        elif "RC" in line and "Overall" in line or "Region Caption" in line:
             current_task = "rc"
-        elif "VS…
+        elif "VS" in line and "Overall" in line or "Video Summary" in line:
             current_task = "vs"
-        elif "…
+        elif "SKILL" in line and "Overall" in line or "Skill Assessment" in line:
             current_task = "skill_assessment"
-        elif "…
+        elif "CVS" in line and "Overall" in line or "CVS Assessment" in line:
             current_task = "cvs_assessment"
 
         # Extract metrics based on task
         if current_task:
-…
+            # TAL: Extract both mIoU@0.3 and mIoU@0.5
+            if current_task == "tal":
+                if "meanIoU@0.3" in line or "mIoU@0.3" in line:
+                    try:
+                        value = float(line.split(":")[-1].strip())
+                        metrics["tag_miou_03"] = value
+                    except:
+                        pass
+                if "meanIoU@0.5" in line or "mIoU@0.5" in line:
+                    try:
+                        value = float(line.split(":")[-1].strip())
+                        metrics["tag_miou_05"] = value
+                    except:
+                        pass
 
+            # STG: Extract mIoU
+            elif current_task == "stg" and ("mean_iou" in line.lower() or "miou" in line.lower()):
                 try:
-                    value = float(line.split("…
-                    metrics["…
+                    value = float(line.split(":")[-1].strip())
+                    metrics["stg_miou"] = value
                 except:
                     pass
 
+            # Next Action: Extract accuracy
+            elif current_task == "next_action" and "accuracy" in line.lower():
                 try:
                     value = float(line.split(":")[-1].strip())
-                    metrics["…
+                    metrics["nap_acc"] = value
                 except:
                     pass
 
+            # DVC: Extract both caption_score and temporal_f1
+            elif current_task == "dvc":
-                if "…
+                if "caption_score" in line.lower() or "caption score" in line.lower():
                     try:
-                        value = float(parts[-1].strip())
-                        if 0 <= value <= 5:  # LLM judge scores are 1-5
-                            metrics[current_task] = value
+                        value = float(line.split(":")[-1].strip())
+                        metrics["dvc_llm"] = value
                     except:
                         pass
+                if "temporal_f1" in line.lower() or "temporal f1" in line.lower():
+                    try:
+                        value = float(line.split(":")[-1].strip())
+                        metrics["dvc_f1"] = value
+                    except:
+                        pass
+
+            # VS: Extract LLM score
+            elif current_task == "vs" and ("score" in line.lower() or "average" in line.lower()):
+                try:
+                    value = float(line.split(":")[-1].strip())
+                    metrics["vs_llm"] = value
+                except:
+                    pass
 
+            # RC: Extract LLM score
+            elif current_task == "rc" and ("score" in line.lower() or "average" in line.lower()):
                 try:
                     value = float(line.split(":")[-1].strip())
-                    metrics[…
+                    metrics["rc_llm"] = value
+                except:
+                    pass
+
+            # Skill Assessment: Extract accuracy
+            elif current_task == "skill_assessment" and "accuracy" in line.lower():
+                try:
+                    value = float(line.split(":")[-1].strip())
+                    metrics["sa_acc"] = value
+                except:
+                    pass
+
+            # CVS Assessment: Extract accuracy
+            elif current_task == "cvs_assessment" and "accuracy" in line.lower():
+                try:
+                    value = float(line.split(":")[-1].strip())
+                    metrics["cvs_acc"] = value
                 except:
                     pass

@@ -328,38 +387,23 @@ def submit_model(file, model_name: str, organization: str, contact: str = "") ->
     if not success:
         return False, f"❌ Evaluation failed: {eval_msg}"
 
-    # Check if we got…
-    if len(…
-        return False, f"❌ Evaluation incomplete. Missing metrics…
+    # Check if we got all 10 metrics
+    missing_metrics = [m for m in METRICS.keys() if m not in metrics]
+    if len(missing_metrics) > 0:
+        return False, f"❌ Evaluation incomplete. Missing metrics: {missing_metrics}"
 
-    # …
-    # Normalize each task score to 0-1 range, then average
-    task_scores = []
-    for task in TASKS.keys():
-        if task in metrics:
-            score = metrics[task]
-            # LLM judge scores are 1-5, others are 0-1
-            if task in ["dvc", "vs", "rc"]:
-                normalized = (score - 1) / 4  # Normalize 1-5 to 0-1
-            else:
-                normalized = score  # Already 0-1
-            task_scores.append(normalized)
-
-    average_score = sum(task_scores) / len(task_scores) if task_scores else 0.0
-
-    # Add to leaderboard
+    # Add to leaderboard (no average calculation)
     new_entry = {
         "model_name": model_name,
         "organization": organization,
-…
-        **{task: round(metrics.get(task, 0.0), 4) for task in TASKS.keys()},
+        **{metric: round(metrics.get(metric, 0.0), 4) for metric in METRICS.keys()},
         "date": datetime.now().strftime("%Y-%m-%d"),
         "contact": contact
     }
 
     df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
+    # Sort by first metric (CVS_acc)
+    df = df.sort_values('cvs_acc', ascending=False).reset_index(drop=True)
 
     save_leaderboard(df)

@@ -368,13 +412,12 @@ def submit_model(file, model_name: str, organization: str, contact: str = "") ->
 
 **Model**: {model_name}
 **Organization**: {organization}
-**Average Score**: {average_score:.4f}
 
-**…
+**Metric Scores**:
 """
-    for …
-        score = metrics.get(…
-        success_msg += f"\n- **{…
+    for metric_key, metric_info in METRICS.items():
+        score = metrics.get(metric_key, 0.0)
+        success_msg += f"\n- **{metric_info['name']}**: {score:.4f}"
 
     success_msg += f"\n\n🏆 **Rank**: #{df[df['model_name'] == model_name].index[0] + 1} / {len(df)}"

@@ -382,24 +425,24 @@ def submit_model(file, model_name: str, organization: str, contact: str = "") ->
 
 def format_leaderboard_display(df: pd.DataFrame) -> pd.DataFrame:
-    """Format leaderboard dataframe for display."""
+    """Format leaderboard dataframe for display with 10 metrics (no average)."""
     if df.empty:
         return df
 
-    # Create display dataframe with selected columns
-    display_cols = ["rank", "model_name", "organization"…
+    # Create display dataframe with selected columns (no average)
+    display_cols = ["rank", "model_name", "organization"]
 
-    # Add…
-    for …
-        if …
-            display_cols.append(…
+    # Add metric columns in order
+    for metric_key in METRICS.keys():
+        if metric_key in df.columns:
+            display_cols.append(metric_key)
 
     display_cols.append("date")
 
     # Rename columns for display
     display_df = df[display_cols].copy()
-    display_df.columns = ["Rank", "Model", "Organization"…
-                         […
+    display_df.columns = ["Rank", "Model", "Organization"] + \
+                         [METRICS[m]["name"] for m in METRICS.keys() if m in df.columns] + \
                          ["Date"]
 
     return display_df
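A minimal sketch of the `name: value` line format that the new `parse_evaluation_output` assumes: each metric line is split on `:` and the trailing field is parsed as a float, so header lines and non-numeric tails are skipped. The sample log below is illustrative, not actual evaluator output:

    # Illustrative evaluator log; the header strings follow the "<TASK> ... Overall"
    # convention the parser matches on, and the values are made up.
    sample_output = """\
    CVS Overall Results
    accuracy: 0.8123
    TAL Overall Results
    meanIoU@0.3: 0.4210
    meanIoU@0.5: 0.3677
    """

    for line in sample_output.splitlines():
        if ":" in line:
            try:
                value = float(line.split(":")[-1].strip())
                print(f"parsed {line.split(':')[0].strip()} -> {value}")
            except ValueError:
                pass  # header or non-numeric line, same as the parser's except/pass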
evaluation/eval_dvc.py
CHANGED
@@ -1,18 +1,115 @@
-"""Dense Video Captioning evaluation using LLM judge."""
+"""Dense Video Captioning evaluation using LLM judge + temporal F1."""
 
 import json
 import sys
+import numpy as np
+from collections import defaultdict
 from eval_caption_llm_judge import evaluate_caption_task
 
 
+def compute_iou(pred_segment, gt_segment):
+    """Compute IoU between two segments [start, end]."""
+    pred_start, pred_end = pred_segment
+    gt_start, gt_end = gt_segment
+
+    # Compute intersection
+    inter_start = max(pred_start, gt_start)
+    inter_end = min(pred_end, gt_end)
+    intersection = max(0, inter_end - inter_start)
+
+    # Compute union
+    union = (pred_end - pred_start) + (gt_end - gt_start) - intersection
+
+    if union == 0:
+        return 0
+
+    return intersection / union
+
+
+def compute_temporal_f1(pred_segments, gt_segments, iou_threshold=0.5):
+    """
+    Compute F1 score for temporal segment matching.
+
+    Args:
+        pred_segments: List of predicted [start, end] segments
+        gt_segments: List of ground truth [start, end] segments
+        iou_threshold: IoU threshold for matching (default 0.5)
+
+    Returns:
+        Dict with precision, recall, and f1 scores
+    """
+    if not pred_segments or not gt_segments:
+        return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
+
+    # Match predicted segments to ground truth
+    matched_gt = set()
+    matched_pred = set()
+
+    for pred_idx, pred_seg in enumerate(pred_segments):
+        best_iou = 0
+        best_gt_idx = -1
+
+        for gt_idx, gt_seg in enumerate(gt_segments):
+            if gt_idx in matched_gt:
+                continue
+
+            iou = compute_iou(pred_seg, gt_seg)
+            if iou >= iou_threshold and iou > best_iou:
+                best_iou = iou
+                best_gt_idx = gt_idx
+
+        if best_gt_idx >= 0:
+            matched_pred.add(pred_idx)
+            matched_gt.add(best_gt_idx)
+
+    # Compute precision, recall, F1
+    precision = len(matched_pred) / len(pred_segments) if pred_segments else 0
+    recall = len(matched_gt) / len(gt_segments) if gt_segments else 0
+    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
+
+    return {
+        'precision': precision,
+        'recall': recall,
+        'f1': f1
+    }
+
+
+def parse_dvc_segments(text):
+    """
+    Parse DVC output to extract segments.
+    Supports multiple formats:
+    - [start-end] caption
+    - (start-end) caption
+    - start-end seconds: caption
+    """
+    import re
+    segments = []
+
+    # Pattern 1: [0.0-5.2] or (0.0-5.2)
+    pattern1 = r'[\[\(](\d+\.?\d*)\s*-\s*(\d+\.?\d*)[\]\)]'
+
+    # Pattern 2: 0.0-5.2 seconds:
+    pattern2 = r'(\d+\.?\d*)\s*-\s*(\d+\.?\d*)\s*seconds?:'
+
+    # Try both patterns
+    for pattern in [pattern1, pattern2]:
+        matches = re.finditer(pattern, text, re.IGNORECASE)
+        for match in matches:
+            start = float(match.group(1))
+            end = float(match.group(2))
+            segments.append([start, end])
+
+    return segments
+
+
 def group_records_by_dataset(data):
     """Group DVC records by dataset for per-dataset evaluation."""
-    from collections import defaultdict
     dataset_groups = defaultdict(list)
 
     for key, record in data.items():
         qa_type = record.get('qa_type', '')
-…
+        # Match any dense_captioning variant (dense_captioning, dense_captioning_gpt, dense_captioning_gemini, dc)
+        if not any(x in qa_type.lower() for x in ['dense_captioning', 'dense_caption', 'dc']):
             continue
 
         dataset = record.get('dataset', record.get('dataset_name', record.get('metadata', {}).get('dataset', 'Unknown')))

@@ -33,35 +130,84 @@ def group_records_by_dataset(data):
 
 
 def evaluate_dataset_dvc(dataset_name, records):
-    """Evaluate DVC for a specific dataset using caption…
+    """Evaluate DVC for a specific dataset using caption quality + temporal F1."""
     print(f"\nEvaluating {dataset_name} ({len(records)} records)...")
 
-    # …
-    # Create a temporary file with just these records
+    # Step 1: Evaluate caption quality using LLM judge
     import tempfile
     import os
 
     temp_data = {str(i): record for i, record in enumerate(records)}
 
    with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
         json.dump(temp_data, f)
         temp_file = f.name
 
     try:
-        # Use caption evaluator
-…
-        return {
-            'overall': {
-                'score': result['score'],
-                'method': result['method'],
-                'count': len(records)
-            }
-        }
+        # Use caption evaluator for caption quality
+        caption_result = evaluate_caption_task(temp_file, 'dense_captioning')
+        caption_score = caption_result['score']
+        caption_method = caption_result['method']
     finally:
         os.unlink(temp_file)
 
+    # Step 2: Compute temporal F1 for segment localization
+    all_f1_scores = []
+
+    for record in records:
+        # Get FPS for time-to-frame conversion
+        fps = record.get('fps', record.get('metadata', {}).get('fps', 1.0))
+        if isinstance(fps, str):
+            fps = float(fps)
+
+        # Parse predicted segments from answer
+        pred_text = record.get('answer', '')
+        pred_segments = parse_dvc_segments(pred_text)
+
+        # Get ground truth segments from struc_info
+        struc_info = record.get('struc_info', [])
+        gt_segments = []
+
+        if isinstance(struc_info, list):
+            for item in struc_info:
+                if isinstance(item, dict):
+                    # Handle different formats
+                    if 'dc_segments' in item:
+                        # NurViD format
+                        segments = item['dc_segments']
+                    elif 'start' in item and 'end' in item:
+                        # Direct segment format
+                        segments = [item]
+                    else:
+                        continue
+
+                    for seg in (segments if isinstance(segments, list) else [segments]):
+                        if 'start' in seg and 'end' in seg:
+                            # Convert to seconds (struc_info is in seconds)
+                            gt_segments.append([
+                                float(seg['start']),
+                                float(seg['end'])
+                            ])
+
+        # Compute F1 for this sample
+        if pred_segments and gt_segments:
+            f1_result = compute_temporal_f1(pred_segments, gt_segments, iou_threshold=0.5)
+            all_f1_scores.append(f1_result['f1'])
+
+    # Aggregate F1 scores
+    avg_f1 = np.mean(all_f1_scores) if all_f1_scores else 0.0
+
+    # Return both caption quality and temporal F1
+    return {
+        'overall': {
+            'caption_score': caption_score,
+            'caption_method': caption_method,
+            'temporal_f1': avg_f1,
+            'count': len(records),
+            'f1_samples': len(all_f1_scores)
+        }
+    }
+
 
 def main():
     """Main evaluation function for DVC."""

@@ -84,7 +230,7 @@ def main():
 
     if not any(dataset_records.values()):
         print("No DVC records found!")
-        return
+        return {}
 
     all_results = {}
     for dataset_name, records in dataset_records.items():

@@ -96,16 +242,30 @@ def main():
     print("DENSE VIDEO CAPTIONING EVALUATION SUMMARY")
     print(f"{'='*80}")
 
+    # Aggregate overall metrics
+    all_caption_scores = []
+    all_f1_scores = []
+
     for dataset_name, results in all_results.items():
         if results:
             print(f"\n{dataset_name}:")
             for key, metrics in results.items():
                 if isinstance(metrics, dict):
-…
+                    print(f"  Caption Score ({metrics.get('caption_method', 'unknown')}): {metrics.get('caption_score', 0):.4f}")
+                    print(f"  Temporal F1@0.5: {metrics.get('temporal_f1', 0):.4f}")
+                    print(f"  Total samples: {metrics.get('count', 0)}")
+                    print(f"  F1 computed on: {metrics.get('f1_samples', 0)} samples")
+
+                    # Collect for overall average
+                    all_caption_scores.append(metrics.get('caption_score', 0))
+                    all_f1_scores.append(metrics.get('temporal_f1', 0))
+
+    # Return overall aggregated results
+    return {
+        'caption_score': np.mean(all_caption_scores) if all_caption_scores else 0.0,
+        'temporal_f1': np.mean(all_f1_scores) if all_f1_scores else 0.0,
+        'method': all_results[list(all_results.keys())[0]]['overall'].get('caption_method', 'unknown') if all_results else 'unknown'
+    }
 
 
 if __name__ == "__main__":
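A quick usage sketch for the new temporal helpers. The prediction text and ground-truth segments are hypothetical, and the import assumes the script is run from the evaluation/ directory so eval_dvc.py is importable:

    # Hypothetical inputs; the segment values are chosen so each pair clears IoU 0.5.
    from eval_dvc import compute_iou, compute_temporal_f1, parse_dvc_segments

    pred = parse_dvc_segments("[0.0-5.2] grasp needle [10.0-14.0] tie knot")
    gt = [[0.5, 5.0], [9.0, 15.0]]

    print(compute_iou(pred[0], gt[0]))    # 4.5 / 5.2 ≈ 0.865
    print(compute_temporal_f1(pred, gt))  # both pairs match -> P = R = F1 = 1.0

Note the matcher is greedy and one-to-one: a ground-truth segment already in `matched_gt` is skipped, so it can absorb at most one prediction.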
evaluation/eval_tal.py
CHANGED
@@ -194,22 +194,27 @@ def evaluate_dataset_tal(dataset_name, records)
             'ground_truth': gt_spans
         }]
 
-        # Evaluate this record…
-…
+        # Evaluate this record at both IoU thresholds
+        result_03 = evaluate_tal_record(formatted_record, tiou_thresh=0.3)
+        result_05 = evaluate_tal_record(formatted_record, tiou_thresh=0.5)
+        results_by_fps[fps].append({'0.3': result_03, '0.5': result_05})
 
     # Aggregate results
     aggregated = {}
     for fps, results_list in results_by_fps.items():
-        # Extract metrics from results…
-…
+        # Extract metrics from results at both thresholds
+        all_recalls_03 = [r['0.3'].get(f'Recall@0.30', 0) for r in results_list if r]
+        all_mean_ious_03 = [r['0.3'].get(f'meanIoU@0.30', 0) for r in results_list if r]
+        all_recalls_05 = [r['0.5'].get(f'Recall@0.50', 0) for r in results_list if r]
+        all_mean_ious_05 = [r['0.5'].get(f'meanIoU@0.50', 0) for r in results_list if r]
 
-        if…
+        if all_recalls_03:
             aggregated[f'fps_{fps}'] = {
-                'recall@0.…
-                'meanIoU@0.…
-                '…
+                'recall@0.3': np.mean(all_recalls_03),
+                'meanIoU@0.3': np.mean(all_mean_ious_03),
+                'recall@0.5': np.mean(all_recalls_05),
+                'meanIoU@0.5': np.mean(all_mean_ious_05),
+                'count': len(all_recalls_03)
             }
 
     return aggregated

@@ -254,6 +259,10 @@ def main():
     print("TAL EVALUATION SUMMARY")
     print(f"{'='*80}")
 
+    # Aggregate metrics across all datasets
+    all_miou_03 = []
+    all_miou_05 = []
+
     for dataset_name, fps_results in all_results.items():
         if fps_results:
             print(f"\n{dataset_name}:")

@@ -265,6 +274,18 @@ def main():
             else:
                 print(f"  samples: {value}")
 
+            # Collect for overall average
+            if 'meanIoU@0.3' in metrics:
+                all_miou_03.append(metrics['meanIoU@0.3'])
+            if 'meanIoU@0.5' in metrics:
+                all_miou_05.append(metrics['meanIoU@0.5'])
+
+    # Return overall aggregated results
+    return {
+        'meanIoU@0.3': np.mean(all_miou_03) if all_miou_03 else 0.0,
+        'meanIoU@0.5': np.mean(all_miou_05) if all_miou_05 else 0.0
+    }
+
 
 if __name__ == "__main__":
     main()
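For reference, a sketch of the dataset-level averaging the new TAL summary performs; the dataset names and numbers below are illustrative:

    import numpy as np

    # Per-dataset meanIoU values, as produced by evaluate_dataset_tal (made up).
    per_dataset = {
        "dataset_a": {"meanIoU@0.3": 0.52, "meanIoU@0.5": 0.41},
        "dataset_b": {"meanIoU@0.3": 0.38, "meanIoU@0.5": 0.27},
    }

    overall = {
        "meanIoU@0.3": np.mean([m["meanIoU@0.3"] for m in per_dataset.values()]),
        "meanIoU@0.5": np.mean([m["meanIoU@0.5"] for m in per_dataset.values()]),
    }
    print(overall)  # {'meanIoU@0.3': 0.45, 'meanIoU@0.5': 0.34}

When these two overall values are printed under the TAL header, `parse_evaluation_output` in app.py picks them up as the leaderboard's tag_miou_03 and tag_miou_05 columns.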