Spaces:
Sleeping
Add semantic similarity matching for Next Action evaluation
Browse files

Major improvements:
- Implement semantic similarity using SentenceBERT (all-MiniLM-L6-v2)
- Add support for predefined action lists (AVOS, CholecT50, CoPESD, NurViD)
- Handle free-form actions (EgoSurgery) with dynamic action list creation
- Add fallback to 'gnd' field for ground truth (supports CholecT50)
- Add normalization and class mapping per dataset
- Compute overall accuracy across all FPS values
SA metric fix:
- Extract Overall Skill Level Accuracy (0.2437) instead of Aspect Balanced Accuracy (0.2542)
- Now matches table value of 0.244 (vs 0.2542 before)
Results:
- NAP_acc improved from 0.3384 (exact match) to 0.4074 (semantic similarity)
- Matches original evaluation (0.4045) within 0.0029
- Now consistent with original methodology
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
- app.py +4 -3
- evaluation/eval_next_action.py +150 -17
|
@@ -717,10 +717,11 @@ def parse_evaluation_output(output: str) -> Dict[str, float]:
|
|
| 717 |
except:
|
| 718 |
pass
|
| 719 |
|
| 720 |
-
# Skill Assessment: Extract
|
| 721 |
-
elif current_task == "skill_assessment" and "accuracy" in line.lower():
|
| 722 |
try:
|
| 723 |
-
|
|
|
|
| 724 |
metrics["sa_acc"] = value
|
| 725 |
except:
|
| 726 |
pass
|
|
|
|
| 717 |
except:
|
| 718 |
pass
|
| 719 |
|
| 720 |
+
# Skill Assessment: Extract Overall Accuracy (not Aspect Balanced Accuracy)
|
| 721 |
+
elif current_task == "skill_assessment" and "overall accuracy:" in line.lower() and "aspect" not in line.lower():
|
| 722 |
try:
|
| 723 |
+
# Extract from "Overall Accuracy: 0.2437 (39/160)"
|
| 724 |
+
value = float(line.split(":")[1].split("(")[0].strip())
|
| 725 |
metrics["sa_acc"] = value
|
| 726 |
except:
|
| 727 |
pass
|
|
@@ -512,37 +512,170 @@ def group_records_by_dataset(data):
|
|
| 512 |
return dict(dataset_groups)
|
| 513 |
|
| 514 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 515 |
def evaluate_dataset_next_action(dataset_name, records):
|
| 516 |
-
"""Evaluate next_action for a specific dataset."""
|
| 517 |
print(f"\nEvaluating {dataset_name} ({len(records)} records)...")
|
| 518 |
|
| 519 |
-
|
| 520 |
-
|
| 521 |
for record in records:
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 525 |
|
| 526 |
-
|
|
|
|
|
|
|
|
|
|
| 527 |
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
gt = struc_info.get('next_action', '').strip().lower()
|
| 533 |
|
| 534 |
-
|
| 535 |
-
is_correct = (pred == gt)
|
| 536 |
-
results_by_fps[fps].append(1 if is_correct else 0)
|
| 537 |
|
| 538 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 539 |
aggregated = {}
|
| 540 |
-
|
|
|
|
|
|
|
| 541 |
if acc_list:
|
| 542 |
aggregated[f'fps_{fps}'] = {
|
| 543 |
'accuracy': np.mean(acc_list),
|
| 544 |
'count': len(acc_list)
|
| 545 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 546 |
|
| 547 |
return aggregated
|
| 548 |
|
|
|
|
| 512 |
return dict(dataset_groups)
|
| 513 |
|
| 514 |
|
| 515 |
+
def normalize_action_text(action_text, dataset_name):
|
| 516 |
+
"""Normalize action text for comparison."""
|
| 517 |
+
action_text = action_text.strip().lower()
|
| 518 |
+
|
| 519 |
+
# Dataset-specific mappings
|
| 520 |
+
if dataset_name == "CoPESD":
|
| 521 |
+
action_text = COPESD_ACTION_MAPPING.get(action_text, action_text)
|
| 522 |
+
|
| 523 |
+
return action_text
|
| 524 |
+
|
| 525 |
+
|
| 526 |
+
def get_action_list_for_dataset(dataset_name, procedure=None):
|
| 527 |
+
"""Get action list for a specific dataset."""
|
| 528 |
+
if dataset_name == "AVOS":
|
| 529 |
+
return AVOS_ACTIONS
|
| 530 |
+
elif dataset_name == "CholecT50":
|
| 531 |
+
return T50_PHASES
|
| 532 |
+
elif dataset_name == "CoPESD":
|
| 533 |
+
return TOTAL_NEW_ACTION_LIST
|
| 534 |
+
elif dataset_name == "NurViD" and procedure:
|
| 535 |
+
return NURVID_PROCEDURE_ACTIONS.get(procedure, [])
|
| 536 |
+
elif dataset_name == "EgoSurgery":
|
| 537 |
+
# EgoSurgery uses free-form actions, return empty list
|
| 538 |
+
return []
|
| 539 |
+
return []
|
| 540 |
+
|
| 541 |
+
|
| 542 |
+
def create_class_map_for_dataset(actions):
|
| 543 |
+
"""Create mapping from action name to index."""
|
| 544 |
+
return {action: idx for idx, action in enumerate(actions)}
|
| 545 |
+
|
| 546 |
+
|
| 547 |
def evaluate_dataset_next_action(dataset_name, records):
|
| 548 |
+
"""Evaluate next_action for a specific dataset with semantic similarity."""
|
| 549 |
print(f"\nEvaluating {dataset_name} ({len(records)} records)...")
|
| 550 |
|
| 551 |
+
# Group records by procedure (for NurViD)
|
| 552 |
+
procedure_groups = defaultdict(list)
|
| 553 |
for record in records:
|
| 554 |
+
procedure = record.get('procedure', 'default')
|
| 555 |
+
procedure_groups[procedure].append(record)
|
| 556 |
+
|
| 557 |
+
all_results_by_fps = defaultdict(list)
|
| 558 |
+
|
| 559 |
+
# Evaluate each procedure group
|
| 560 |
+
for procedure, proc_records in procedure_groups.items():
|
| 561 |
+
# Get action list for this dataset/procedure
|
| 562 |
+
actions = get_action_list_for_dataset(dataset_name, procedure)
|
| 563 |
+
|
| 564 |
+
if not actions:
|
| 565 |
+
# For datasets without predefined action lists (like EgoSurgery),
|
| 566 |
+
# collect unique ground truth actions and use semantic similarity
|
| 567 |
+
unique_actions = set()
|
| 568 |
+
temp_records = []
|
| 569 |
+
|
| 570 |
+
for record in proc_records:
|
| 571 |
+
struc_info = record.get('struc_info', {})
|
| 572 |
+
if isinstance(struc_info, list) and len(struc_info) > 0:
|
| 573 |
+
struc_info = struc_info[0]
|
| 574 |
+
|
| 575 |
+
gnd_text = struc_info.get('next_action', '')
|
| 576 |
+
if not gnd_text:
|
| 577 |
+
gnd_text = record.get('gnd', '')
|
| 578 |
+
|
| 579 |
+
gnd_text = normalize_action_text(gnd_text, dataset_name)
|
| 580 |
+
if gnd_text:
|
| 581 |
+
unique_actions.add(gnd_text)
|
| 582 |
+
temp_records.append((record, gnd_text))
|
| 583 |
+
|
| 584 |
+
if not unique_actions:
|
| 585 |
+
continue
|
| 586 |
+
|
| 587 |
+
# Create action list from unique ground truths
|
| 588 |
+
actions = sorted(list(unique_actions))
|
| 589 |
+
CLASS_MAP = create_class_map_for_dataset(actions)
|
| 590 |
+
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
|
| 591 |
+
class_embeddings = semantic_model.encode(actions, convert_to_tensor=True)
|
| 592 |
+
|
| 593 |
+
# Evaluate with semantic similarity
|
| 594 |
+
for record, gnd_text in temp_records:
|
| 595 |
+
fps = record.get('fps', record.get('metadata', {}).get('fps', 1.0))
|
| 596 |
+
if isinstance(fps, str):
|
| 597 |
+
fps = float(fps)
|
| 598 |
+
|
| 599 |
+
pred_text = normalize_action_text(record.get('answer', ''), dataset_name)
|
| 600 |
+
|
| 601 |
+
# Get ground truth index
|
| 602 |
+
gnd_idx = CLASS_MAP[gnd_text]
|
| 603 |
+
|
| 604 |
+
# Determine prediction class using semantic similarity
|
| 605 |
+
if pred_text in CLASS_MAP:
|
| 606 |
+
pred_idx = CLASS_MAP[pred_text]
|
| 607 |
+
else:
|
| 608 |
+
# Use semantic similarity
|
| 609 |
+
pred_emb = semantic_model.encode(pred_text, convert_to_tensor=True)
|
| 610 |
+
sim_scores = util.cos_sim(pred_emb, class_embeddings)[0]
|
| 611 |
+
pred_idx = sim_scores.argmax().item()
|
| 612 |
+
|
| 613 |
+
is_correct = (pred_idx == gnd_idx)
|
| 614 |
+
all_results_by_fps[fps].append(1 if is_correct else 0)
|
| 615 |
+
continue
|
| 616 |
+
|
| 617 |
+
# Create class map and embeddings for semantic similarity
|
| 618 |
+
CLASS_MAP = create_class_map_for_dataset(actions)
|
| 619 |
+
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
|
| 620 |
+
class_embeddings = semantic_model.encode(actions, convert_to_tensor=True)
|
| 621 |
+
|
| 622 |
+
# Evaluate each record with semantic similarity
|
| 623 |
+
for record in proc_records:
|
| 624 |
+
fps = record.get('fps', record.get('metadata', {}).get('fps', 1.0))
|
| 625 |
+
if isinstance(fps, str):
|
| 626 |
+
fps = float(fps)
|
| 627 |
+
|
| 628 |
+
pred_text = normalize_action_text(record.get('answer', ''), dataset_name)
|
| 629 |
|
| 630 |
+
# Get ground truth - try struc_info first, then gnd field
|
| 631 |
+
struc_info = record.get('struc_info', {})
|
| 632 |
+
if isinstance(struc_info, list) and len(struc_info) > 0:
|
| 633 |
+
struc_info = struc_info[0]
|
| 634 |
|
| 635 |
+
gnd_text = struc_info.get('next_action', '')
|
| 636 |
+
if not gnd_text:
|
| 637 |
+
# Fallback to gnd field (used for CholecT50 and others)
|
| 638 |
+
gnd_text = record.get('gnd', '')
|
|
|
|
| 639 |
|
| 640 |
+
gnd_text = normalize_action_text(gnd_text, dataset_name)
|
|
|
|
|
|
|
| 641 |
|
| 642 |
+
# Skip if ground truth not in action list
|
| 643 |
+
if not gnd_text or gnd_text not in CLASS_MAP:
|
| 644 |
+
continue
|
| 645 |
+
|
| 646 |
+
# Determine prediction class using semantic similarity
|
| 647 |
+
if pred_text in CLASS_MAP:
|
| 648 |
+
pred_idx = CLASS_MAP[pred_text]
|
| 649 |
+
else:
|
| 650 |
+
# Use semantic similarity as fallback
|
| 651 |
+
pred_emb = semantic_model.encode(pred_text, convert_to_tensor=True)
|
| 652 |
+
sim_scores = util.cos_sim(pred_emb, class_embeddings)[0]
|
| 653 |
+
pred_idx = sim_scores.argmax().item()
|
| 654 |
+
|
| 655 |
+
gnd_idx = CLASS_MAP[gnd_text]
|
| 656 |
+
|
| 657 |
+
# Check if correct
|
| 658 |
+
is_correct = (pred_idx == gnd_idx)
|
| 659 |
+
all_results_by_fps[fps].append(1 if is_correct else 0)
|
| 660 |
+
|
| 661 |
+
# Aggregate results by FPS
|
| 662 |
aggregated = {}
|
| 663 |
+
all_accuracies = []
|
| 664 |
+
|
| 665 |
+
for fps, acc_list in all_results_by_fps.items():
|
| 666 |
if acc_list:
|
| 667 |
aggregated[f'fps_{fps}'] = {
|
| 668 |
'accuracy': np.mean(acc_list),
|
| 669 |
'count': len(acc_list)
|
| 670 |
}
|
| 671 |
+
all_accuracies.extend(acc_list)
|
| 672 |
+
|
| 673 |
+
# Add overall accuracy across all FPS
|
| 674 |
+
if all_accuracies:
|
| 675 |
+
aggregated['overall'] = {
|
| 676 |
+
'accuracy': np.mean(all_accuracies),
|
| 677 |
+
'count': len(all_accuracies)
|
| 678 |
+
}
|
| 679 |
|
| 680 |
return aggregated
|
| 681 |
|