Spaces:

UII-AI
/

MedVidBench-Leaderboard

Running

App Files Files Community

MedGRPO Team committed on Mar 13

Commit

aa5db53

1 Parent(s): 63df552

update

Browse files

Files changed (4) hide show

evaluation/eval_dvc.py +241 -125
evaluation/eval_next_action.py +29 -60
evaluation/evaluate_all_pai.py +22 -5
evaluation/evaluate_predictions.py +1 -1

evaluation/eval_dvc.py CHANGED Viewed

@@ -1,106 +1,197 @@
-"""Dense Video Captioning evaluation using LLM judge + temporal F1."""
 import json
 import sys
 import numpy as np
 from collections import defaultdict
 from eval_caption_llm_judge import evaluate_caption_task
-def compute_iou(pred_segment, gt_segment):
-    """Compute IoU between two segments [start, end]."""
-    pred_start, pred_end = pred_segment
-    gt_start, gt_end = gt_segment
-    # Compute intersection
-    inter_start = max(pred_start, gt_start)
-    inter_end = min(pred_end, gt_end)
-    intersection = max(0, inter_end - inter_start)
-    # Compute union
-    union = (pred_end - pred_start) + (gt_end - gt_start) - intersection
-    if union == 0:
-        return 0
-    return intersection / union
-def compute_temporal_f1(pred_segments, gt_segments, iou_threshold=0.5):
     """
-    Compute F1 score for temporal segment matching.
-    Args:
-        pred_segments: List of predicted [start, end] segments
-        gt_segments: List of ground truth [start, end] segments
-        iou_threshold: IoU threshold for matching (default 0.5)
-    Returns:
-        Dict with precision, recall, and f1 scores
     """
-    if not pred_segments or not gt_segments:
-        return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
-    # Match predicted segments to ground truth
-    matched_gt = set()
-    matched_pred = set()
-    for pred_idx, pred_seg in enumerate(pred_segments):
-        best_iou = 0
-        best_gt_idx = -1
-        for gt_idx, gt_seg in enumerate(gt_segments):
-            if gt_idx in matched_gt:
-                continue
-            iou = compute_iou(pred_seg, gt_seg)
-            if iou >= iou_threshold and iou > best_iou:
-                best_iou = iou
-                best_gt_idx = gt_idx
-        if best_gt_idx >= 0:
-            matched_pred.add(pred_idx)
-            matched_gt.add(best_gt_idx)
-    # Compute precision, recall, F1
-    precision = len(matched_pred) / len(pred_segments) if pred_segments else 0
-    recall = len(matched_gt) / len(gt_segments) if gt_segments else 0
-    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
     return {
-        'precision': precision,
-        'recall': recall,
-        'f1': f1
     }
-def parse_dvc_segments(text):
-    """
-    Parse DVC output to extract segments.
-    Supports multiple formats:
-    - [start-end] caption
-    - (start-end) caption
-    - start-end seconds: caption
-    """
-    import re
-    segments = []
-    # Pattern 1: [0.0-5.2] or (0.0-5.2)
-    pattern1 = r'[\[\(](\d+\.?\d*)\s*-\s*(\d+\.?\d*)[\]\)]'
-    # Pattern 2: 0.0-5.2 seconds:
-    pattern2 = r'(\d+\.?\d*)\s*-\s*(\d+\.?\d*)\s*seconds?:'
-    # Try both patterns
-    for pattern in [pattern1, pattern2]:
-        matches = re.finditer(pattern, text, re.IGNORECASE)
-        for match in matches:
-            start = float(match.group(1))
-            end = float(match.group(2))
-            segments.append([start, end])
-    return segments
 def group_records_by_dataset(data):
     """Group DVC records by dataset for per-dataset evaluation."""
@@ -130,6 +221,26 @@ def group_records_by_dataset(data):
     return dict(dataset_groups)
 def evaluate_dataset_dvc(dataset_name, records, skip_llm_judge=False):
     """Evaluate DVC for a specific dataset using caption quality + temporal F1."""
     print(f"\nEvaluating {dataset_name} ({len(records)} records)...")
@@ -150,65 +261,71 @@ def evaluate_dataset_dvc(dataset_name, records, skip_llm_judge=False):
             temp_file = f.name
         try:
-            # Use caption evaluator for caption quality
             caption_result = evaluate_caption_task(temp_file, 'dense_captioning')
             caption_score = caption_result['score']
             caption_method = caption_result['method']
         finally:
             os.unlink(temp_file)
-    # Step 2: Compute temporal F1 for segment localization
     all_f1_scores = []
     for record in records:
-        # Get FPS for time-to-frame conversion
         fps = record.get('fps', record.get('metadata', {}).get('fps', 1.0))
         if isinstance(fps, str):
             fps = float(fps)
-        # Parse predicted segments from answer
-        pred_text = record.get('answer', '')
-        pred_segments = parse_dvc_segments(pred_text)
-        # Get ground truth segments from struc_info
-        struc_info = record.get('struc_info', [])
-        gt_segments = []
-        if isinstance(struc_info, list):
-            for item in struc_info:
-                if isinstance(item, dict):
-                    # Handle different formats
-                    if 'dc_segments' in item:
-                        # NurViD format
-                        segments = item['dc_segments']
-                    elif 'start' in item and 'end' in item:
-                        # Direct segment format
-                        segments = [item]
-                    else:
-                        continue
-                    for seg in (segments if isinstance(segments, list) else [segments]):
-                        if 'start' in seg and 'end' in seg:
-                            # Convert to seconds (struc_info is in seconds)
-                            gt_segments.append([
-                                float(seg['start']),
-                                float(seg['end'])
-                            ])
-        # Compute F1 for this sample
         if pred_segments and gt_segments:
-            f1_result = compute_temporal_f1(pred_segments, gt_segments, iou_threshold=0.5)
-            all_f1_scores.append(f1_result['f1'])
-    # Aggregate F1 scores
     avg_f1 = np.mean(all_f1_scores) if all_f1_scores else 0.0
-    # Return both caption quality and temporal F1
     return {
         'overall': {
             'caption_score': caption_score,
             'caption_method': caption_method,
             'temporal_f1': avg_f1,
             'count': len(records),
             'f1_samples': len(all_f1_scores)
         }
@@ -228,7 +345,7 @@ def main():
     print(f"Loading results from: {output_file}")
     if skip_llm_judge:
-        print("⚠️  --skip-llm-judge flag detected: Skipping caption evaluation, computing temporal F1 only")
     with open(output_file, "r") as f:
         infer_output = json.load(f)
@@ -253,7 +370,6 @@ def main():
     print("DENSE VIDEO CAPTIONING EVALUATION SUMMARY")
     print(f"{'='*80}")
-    # Aggregate overall metrics
     all_caption_scores = []
     all_f1_scores = []
@@ -263,15 +379,15 @@ def main():
             for key, metrics in results.items():
                 if isinstance(metrics, dict):
                     print(f"  Caption Score ({metrics.get('caption_method', 'unknown')}): {metrics.get('caption_score', 0):.4f}")
-                    print(f"  Temporal F1@0.5: {metrics.get('temporal_f1', 0):.4f}")
                     print(f"  Total samples: {metrics.get('count', 0)}")
                     print(f"  F1 computed on: {metrics.get('f1_samples', 0)} samples")
-                    # Collect for overall average
                     all_caption_scores.append(metrics.get('caption_score', 0))
                     all_f1_scores.append(metrics.get('temporal_f1', 0))
-    # Return overall aggregated results
     return {
         'caption_score': np.mean(all_caption_scores) if all_caption_scores else 0.0,
         'temporal_f1': np.mean(all_f1_scores) if all_f1_scores else 0.0,

+"""Dense Video Captioning evaluation using LLM judge + temporal F1.
+Temporal F1 algorithm matches Qwen2.5-VL/my_eval/eval_dvc.py exactly:
+- process_raw_output() + flatten_overlapping_segments() for parsing
+- Frame-based coordinates (multiply by FPS)
+- Many-to-many threshold matching across IoU (0.3, 0.5, 0.7)
+- F1 = 2 * mean_precision * mean_recall / (mean_precision + mean_recall)
+"""
 import json
+import re
 import sys
 import numpy as np
 from collections import defaultdict
 from eval_caption_llm_judge import evaluate_caption_task
+# =============================================================================
+# Ported from Qwen2.5-VL/my_eval_old/eval_dvc.py - exact same algorithms
+# =============================================================================
def zs_parse_multi_segment_annotations(raw_text: str):
    """Extract timestamped captions from free-form, line-oriented text.

    The input is stripped, split on newlines, and every line is scanned
    independently for ``<start>-<end> second(s) ... - <caption>`` style
    entries (optionally introduced by labels such as ``Time Range:`` or
    ``**Start Time:**`` and ``**Description:**``).

    Args:
        raw_text: Raw multi-line text to scan.

    Returns:
        A list of ``{"start": float, "end": float, "caption": str}`` dicts in
        the order they were found. Captions are whitespace-stripped and have
        trailing periods removed.
    """
    segment_re = re.compile(
        r"(?:\*\*Start Time:\*\*|Start\s*\(?Time\)?|Time\s*Range:|Time\s*Interval:|^|\n)\s*(\d+\.?\d*)\s*[-–]\s*(\d+\.?\d*)\s*seconds?.*?(?:\*\*Description:\*\*|-)\s*(.+?)(?=\n\d|$)",
        flags=re.DOTALL,
    )
    parsed = []
    for text_line in raw_text.strip().split('\n'):
        parsed.extend(
            {
                "start": float(begin),
                "end": float(finish),
                "caption": body.strip().rstrip('.'),
            }
            for begin, finish, body in segment_re.findall(text_line)
        )
    return parsed
def process_raw_output(raw_descriptions: str):
    """Parse a model answer into structured, de-duplicated caption segments.

    The primary format is one entry per block of the form
    ``<start>-<end> second(s): <caption>``, where a caption may span several
    lines until the next such header or the end of the text. If nothing
    matches, parsing falls back to ``zs_parse_multi_segment_annotations``.

    Args:
        raw_descriptions: Raw answer text. Non-string values (e.g. ``None``
            from a missing/null answer) and blank strings yield ``[]``
            instead of raising inside ``re``.

    Returns:
        A list of ``{"start": float, "end": float, "caption": str}`` dicts.
        When several entries share the same ``(start, end)`` only the first
        is kept; original order is preserved.
    """
    # A null / non-text answer cannot contain segments; blank text parses to
    # nothing in both the primary and the fallback parser.
    if not isinstance(raw_descriptions, str) or not raw_descriptions.strip():
        return []

    pattern = r"(\d+(?:\.\d+)?)-(\d+(?:\.\d+)?)\s+seconds?:\s+(.*?)(?=\n\d+(?:\.\d+)?-\d+(?:\.\d+)?\s+seconds?:|\Z)"

    # Dicts keep insertion order, so this de-duplicates on (start, end) while
    # retaining the first caption seen for each interval.
    unique_segments = {}
    for start, end, desc in re.findall(pattern, raw_descriptions, re.DOTALL):
        key = (float(start), float(end))
        if key not in unique_segments:
            unique_segments[key] = {
                "start": key[0],
                "end": key[1],
                "caption": desc.strip().replace("\n", " "),
            }

    if unique_segments:
        return list(unique_segments.values())
    # Primary format not found: try the looser multi-format parser.
    return zs_parse_multi_segment_annotations(raw_descriptions)
def check_for_overlaps(segments):
    """Return neighbouring segment pairs that overlap in time.

    Segments are ordered by ``(start, end)`` and each one is compared with
    its immediate successor only; a pair is reported when the successor
    starts strictly before the predecessor ends.

    Args:
        segments: Iterable of dicts with ``start`` and ``end`` keys.

    Returns:
        A list of ``(earlier, later)`` tuples; empty when no adjacent pair
        overlaps.
    """
    ordered = sorted(segments, key=lambda seg: (seg['start'], seg['end']))
    return [
        (earlier, later)
        for earlier, later in zip(ordered, ordered[1:])
        if later["start"] < earlier["end"]
    ]
def flatten_overlapping_segments(segments, caption_strategy="longest"):
    """Cut a set of possibly overlapping segments into disjoint intervals.

    Every distinct start/end value becomes a boundary. For each pair of
    consecutive boundaries, the source segments covering that interval are
    collected and one of them donates its caption.

    Args:
        segments: List of dicts with ``start``, ``end`` and ``caption``.
        caption_strategy: ``"longest"`` picks the covering segment with the
            greatest duration (first one wins ties); ``"first"`` picks the
            first covering segment in input order.

    Returns:
        A list of ``{"start", "end", "caption"}`` dicts, one per covered
        interval, in increasing time order. Uncovered gaps are skipped.

    Raises:
        ValueError: If a covered interval is reached with an unknown
            ``caption_strategy``.
    """
    boundaries = sorted(
        {seg["start"] for seg in segments} | {seg["end"] for seg in segments}
    )
    flattened = []
    for left, right in zip(boundaries, boundaries[1:]):
        active = [
            seg for seg in segments
            if seg["start"] < right and seg["end"] > left
        ]
        if not active:
            # Gap between segments: nothing to emit for this interval.
            continue
        if caption_strategy == "longest":
            chosen = max(active, key=lambda seg: seg["end"] - seg["start"])
        elif caption_strategy == "first":
            chosen = active[0]
        else:
            raise ValueError("Unsupported strategy")
        flattened.append({
            "start": left,
            "end": right,
            "caption": chosen["caption"],
        })
    return flattened
def iou(interval_1, interval_2):
    """Return the intersection-over-union of two 1-D intervals.

    Endpoints may be given in either order. The union is taken as the
    smaller of the enclosing span and the sum of the two lengths, and a
    ``1e-8`` term is added to the denominator so degenerate intervals do not
    divide by zero.

    Args:
        interval_1: Two-element sequence of endpoints.
        interval_2: Two-element sequence of endpoints.

    Returns:
        The IoU as a float.
    """
    lo_a, hi_a = min(*interval_1), max(*interval_1)
    lo_b, hi_b = min(*interval_2), max(*interval_2)

    overlap = max(0, min(hi_a, hi_b) - max(lo_a, lo_b))
    enclosing_span = max(hi_a, hi_b) - min(lo_a, lo_b)
    summed_lengths = hi_a - lo_a + hi_b - lo_b
    denominator = min(enclosing_span, summed_lengths)

    return float(overlap) / (denominator + 1e-8)
def evaluate_detections(predicted_segments, gt_segments, splits,
                        iou_thresholds=(0.3, 0.5, 0.7, 0.9)):
    """Compute per-threshold precision and recall of predicted segments.

    Many-to-many matching: for each IoU threshold, a prediction counts as
    covered if it exceeds the threshold (strictly) with *any* ground-truth
    segment of the split, and a ground-truth segment counts as covered if
    *any* prediction exceeds the threshold with it.

    Args:
        predicted_segments: Array-like of ``[start, end]`` predictions.
        gt_segments: Array-like of ``[start, end]`` ground-truth segments.
        splits: Array-like with one split label per ground-truth segment.
            Plain lists are accepted; they are converted to an array so the
            element-wise ``==`` selection works.
        iou_thresholds: IoU thresholds to evaluate.

    Returns:
        ``(precision, recall)``: two lists with one value per threshold,
        each the element-wise maximum over all splits. Both lists are empty
        when ``splits`` is empty.
    """
    # Coerce to arrays: `list == scalar` is a single bool, which would select
    # no ground-truth rows and lead to a division by zero below.
    predicted_segments = np.asarray(predicted_segments)
    splits = np.asarray(splits)
    predicted_shape = predicted_segments.shape[0]

    best_recall = []
    best_precision = []
    for split in set(splits.tolist()):
        # Keys are str(threshold), as in the reference implementation.
        metrics = {
            str(threshold): {'gt_covered': set(), 'pred_covered': set()}
            for threshold in iou_thresholds
        }

        split_idx = np.where(splits == split)[0]
        split_gt_segments = np.array([gt_segments[idx] for idx in split_idx])
        gt_shape = split_gt_segments.shape[0]

        for idx_g, gt_segment in enumerate(split_gt_segments):
            for idx_p, segment in enumerate(predicted_segments):
                sample_iou = iou(segment, gt_segment)
                for threshold in iou_thresholds:
                    if sample_iou > threshold:
                        metrics[str(threshold)]['pred_covered'].add(idx_p)
                        metrics[str(threshold)]['gt_covered'].add(idx_g)

        # Denominators are floored at 1 so an empty side scores 0 rather
        # than raising ZeroDivisionError.
        precision = [
            float(len(m['pred_covered'])) / max(float(predicted_shape), 1.0)
            for m in metrics.values()
        ]
        recall = [
            float(len(m['gt_covered'])) / max(float(gt_shape), 1.0)
            for m in metrics.values()
        ]

        if best_precision:
            best_precision = [max(p, b) for p, b in zip(precision, best_precision)]
            best_recall = [max(r, b) for r, b in zip(recall, best_recall)]
        else:
            best_precision, best_recall = precision, recall
    return best_precision, best_recall
def compute_temporal_f1_single(predicted_segments, gt_segments, splits,
                               iou_thresholds=(0.3, 0.5, 0.7)):
    """Score one sample's temporal localisation as mean-P / mean-R / F1.

    Precision and recall are obtained per IoU threshold from
    ``evaluate_detections``, averaged over the thresholds, and combined into
    the harmonic mean.

    Args:
        predicted_segments: Array of predicted segments (``.shape`` is read).
        gt_segments: Array of ground-truth segments (``.shape`` is read).
        splits: Split label per ground-truth segment, forwarded unchanged.
        iou_thresholds: IoU thresholds to average over.

    Returns:
        Dict with float entries ``Precision_Mean``, ``Recall_Mean`` and
        ``F1_Score``; all zero when either segment array is empty.
    """
    if not predicted_segments.shape[0] or not gt_segments.shape[0]:
        return {'Precision_Mean': 0.0, 'Recall_Mean': 0.0, 'F1_Score': 0.0}

    per_threshold_precision, per_threshold_recall = evaluate_detections(
        predicted_segments, gt_segments, splits, iou_thresholds
    )
    mean_precision = sum(per_threshold_precision) / len(per_threshold_precision)
    mean_recall = sum(per_threshold_recall) / len(per_threshold_recall)

    if (mean_recall + mean_precision) > 0:
        f1 = 2 * mean_recall * mean_precision / (mean_recall + mean_precision)
    else:
        f1 = 0.0

    return {
        'Precision_Mean': float(mean_precision),
        'Recall_Mean': float(mean_recall),
        'F1_Score': float(f1),
    }
+# =============================================================================
+# Dataset grouping and evaluation
+# =============================================================================
 def group_records_by_dataset(data):
     """Group DVC records by dataset for per-dataset evaluation."""
     return dict(dataset_groups)
def _extract_gt_segments(record):
    """Pull the ground-truth segment container out of ``record['struc_info']``.

    Only the first element of a non-empty list is inspected to decide the
    layout:

    * first element is a list  -> that inner list is returned;
    * first element is a dict with ``dc_segments`` -> that value is returned;
    * anything else -> ``struc_info`` itself is returned.

    Args:
        record: Mapping that may contain a ``struc_info`` entry.

    Returns:
        The selected container, unvalidated. A missing key yields ``[]``;
        a non-list or empty ``struc_info`` is returned as-is.
    """
    info = record.get('struc_info', [])

    # Nothing to unwrap for non-list or empty values.
    if not isinstance(info, list) or len(info) == 0:
        return info

    head = info[0]
    if isinstance(head, list):
        # Nested layout: [[{segment}, ...]]
        return head
    if isinstance(head, dict) and 'dc_segments' in head:
        # Wrapped layout: [{'dc_segments': [...]}]
        return head['dc_segments']
    # Flat layout: [{segment}, ...]
    return info
 def evaluate_dataset_dvc(dataset_name, records, skip_llm_judge=False):
     """Evaluate DVC for a specific dataset using caption quality + temporal F1."""
     print(f"\nEvaluating {dataset_name} ({len(records)} records)...")
             temp_file = f.name
         try:
             caption_result = evaluate_caption_task(temp_file, 'dense_captioning')
             caption_score = caption_result['score']
             caption_method = caption_result['method']
         finally:
             os.unlink(temp_file)
+    # Step 2: Compute temporal F1 matching Qwen2.5-VL algorithm exactly
     all_f1_scores = []
+    all_precision_scores = []
+    all_recall_scores = []
     for record in records:
+        # Get FPS
         fps = record.get('fps', record.get('metadata', {}).get('fps', 1.0))
         if isinstance(fps, str):
             fps = float(fps)
+        # Parse predicted segments using process_raw_output (same as Qwen2.5-VL)
+        raw_answer = record.get('answer', '')
+        processed_answer = process_raw_output(raw_answer)
+        overlaps = check_for_overlaps(processed_answer)
+        if overlaps:
+            processed_answer = flatten_overlapping_segments(processed_answer, caption_strategy="longest")
+        # Get ground truth segments
+        gnd = _extract_gt_segments(record)
+        # Convert both to frame-based coordinates (multiply by fps, cast to int)
+        # IMPORTANT: require 'caption' field to match Qwen2.5-VL's prepare_eval_arrays
+        gt_segments = []
+        if isinstance(gnd, list):
+            for g in gnd:
+                if isinstance(g, dict) and 'start' in g and 'end' in g and 'caption' in g:
+                    gt_segments.append([int(float(g['start']) * fps), int(float(g['end']) * fps)])
+        pred_segments = []
+        if isinstance(processed_answer, list):
+            for p in processed_answer:
+                if isinstance(p, dict) and 'start' in p and 'end' in p and 'caption' in p:
+                    pred_segments.append([int(p['start'] * fps), int(p['end'] * fps)])
+        # Compute F1 using many-to-many matching across IoU thresholds (0.3, 0.5, 0.7)
         if pred_segments and gt_segments:
+            pred_np = np.array(pred_segments)
+            gt_np = np.array(gt_segments)
+            splits = np.ones(len(gt_segments), dtype=int)
+            result = compute_temporal_f1_single(pred_np, gt_np, splits,
+                                                iou_thresholds=(0.3, 0.5, 0.7))
+            all_f1_scores.append(result['F1_Score'])
+            all_precision_scores.append(result['Precision_Mean'])
+            all_recall_scores.append(result['Recall_Mean'])
+    # Aggregate scores
     avg_f1 = np.mean(all_f1_scores) if all_f1_scores else 0.0
+    avg_precision = np.mean(all_precision_scores) if all_precision_scores else 0.0
+    avg_recall = np.mean(all_recall_scores) if all_recall_scores else 0.0
     return {
         'overall': {
             'caption_score': caption_score,
             'caption_method': caption_method,
             'temporal_f1': avg_f1,
+            'temporal_precision': avg_precision,
+            'temporal_recall': avg_recall,
             'count': len(records),
             'f1_samples': len(all_f1_scores)
         }
     print(f"Loading results from: {output_file}")
     if skip_llm_judge:
+        print("  --skip-llm-judge flag detected: Skipping caption evaluation, computing temporal F1 only")
     with open(output_file, "r") as f:
         infer_output = json.load(f)
     print("DENSE VIDEO CAPTIONING EVALUATION SUMMARY")
     print(f"{'='*80}")
     all_caption_scores = []
     all_f1_scores = []
             for key, metrics in results.items():
                 if isinstance(metrics, dict):
                     print(f"  Caption Score ({metrics.get('caption_method', 'unknown')}): {metrics.get('caption_score', 0):.4f}")
+                    print(f"  Temporal F1: {metrics.get('temporal_f1', 0):.4f}")
+                    print(f"  Temporal Precision: {metrics.get('temporal_precision', 0):.4f}")
+                    print(f"  Temporal Recall: {metrics.get('temporal_recall', 0):.4f}")
                     print(f"  Total samples: {metrics.get('count', 0)}")
                     print(f"  F1 computed on: {metrics.get('f1_samples', 0)} samples")
                     all_caption_scores.append(metrics.get('caption_score', 0))
                     all_f1_scores.append(metrics.get('temporal_f1', 0))
     return {
         'caption_score': np.mean(all_caption_scores) if all_caption_scores else 0.0,
         'temporal_f1': np.mean(all_f1_scores) if all_f1_scores else 0.0,

evaluation/eval_next_action.py CHANGED Viewed

@@ -462,8 +462,11 @@ def get_action_list_for_dataset(dataset, procedure=None):
             for actions in NURVID_PROCEDURE_ACTIONS.values():
                 all_actions.update(actions)
             return sorted(list(all_actions))
     else:
-        raise ValueError(f"Unknown dataset: {dataset}")
 def normalize_action_text(text, dataset):
     """
@@ -487,6 +490,7 @@ def create_class_map_for_dataset(actions):
 def group_records_by_dataset(data):
     """Group next_action records by dataset for per-dataset evaluation."""
     dataset_groups = defaultdict(list)
     for key, record in data.items():
@@ -494,54 +498,32 @@ def group_records_by_dataset(data):
         if 'next_action' not in qa_type.lower():
             continue
-        # Check data_source first (leaderboard format), then fall back to dataset/dataset_name
-        dataset = record.get('data_source', record.get('dataset', record.get('dataset_name', record.get('metadata', {}).get('dataset', 'Unknown'))))
-        video_id = record.get('video_id', record.get('metadata', {}).get('video_id', ''))
-        if dataset == 'Unknown' and video_id:
-            video_id_lower = str(video_id).lower()
-            if len(video_id) == 11 and any(c.isalpha() for c in video_id):
-                dataset = "AVOS"
-            elif "_part" in video_id_lower:
-                dataset = "CoPESD"
-            elif "video" in video_id_lower:
-                dataset = "CholecT50"
-        dataset_groups[dataset].append(record)
     return dict(dataset_groups)
-def normalize_action_text(action_text, dataset_name):
-    """Normalize action text for comparison."""
-    action_text = action_text.strip().lower()
-    # Dataset-specific mappings
-    if dataset_name == "CoPESD":
-        action_text = COPESD_ACTION_MAPPING.get(action_text, action_text)
-    return action_text
-def get_action_list_for_dataset(dataset_name, procedure=None):
-    """Get action list for a specific dataset."""
-    if dataset_name == "AVOS":
-        return AVOS_ACTIONS
-    elif dataset_name == "CholecT50":
-        return T50_PHASES
-    elif dataset_name == "CoPESD":
-        return TOTAL_NEW_ACTION_LIST
-    elif dataset_name == "NurViD" and procedure:
-        return NURVID_PROCEDURE_ACTIONS.get(procedure, [])
-    elif dataset_name == "EgoSurgery":
-        # EgoSurgery uses free-form actions, return empty list
-        return []
-    return []
-def create_class_map_for_dataset(actions):
-    """Create mapping from action name to index."""
-    return {action: idx for idx, action in enumerate(actions)}
 def evaluate_dataset_next_action(dataset_name, records):
@@ -568,13 +550,7 @@ def evaluate_dataset_next_action(dataset_name, records):
             temp_records = []
             for record in proc_records:
-                struc_info = record.get('struc_info', {})
-                if isinstance(struc_info, list) and len(struc_info) > 0:
-                    struc_info = struc_info[0]
-                gnd_text = struc_info.get('next_action', '')
-                if not gnd_text:
-                    gnd_text = record.get('gnd', '')
                 gnd_text = normalize_action_text(gnd_text, dataset_name)
                 if gnd_text:
@@ -627,15 +603,8 @@ def evaluate_dataset_next_action(dataset_name, records):
             pred_text = normalize_action_text(record.get('answer', ''), dataset_name)
-            # Get ground truth - try struc_info first, then gnd field
-            struc_info = record.get('struc_info', {})
-            if isinstance(struc_info, list) and len(struc_info) > 0:
-                struc_info = struc_info[0]
-            gnd_text = struc_info.get('next_action', '')
-            if not gnd_text:
-                # Fallback to gnd field (used for CholecT50 and others)
-                gnd_text = record.get('gnd', '')
             gnd_text = normalize_action_text(gnd_text, dataset_name)

             for actions in NURVID_PROCEDURE_ACTIONS.values():
                 all_actions.update(actions)
             return sorted(list(all_actions))
+    elif dataset == "EgoSurgery":
+        # EgoSurgery uses free-form actions, return empty list
+        return []
     else:
+        return []
 def normalize_action_text(text, dataset):
     """
 def group_records_by_dataset(data):
     """Group next_action records by dataset for per-dataset evaluation."""
+    from dataset_utils import get_dataset_name
     dataset_groups = defaultdict(list)
     for key, record in data.items():
         if 'next_action' not in qa_type.lower():
             continue
+        # Detect dataset
+        dataset = get_dataset_name(record)
+        # Extract procedure for NurViD
+        procedure = None
+        if dataset == "NurViD":
+            question_lower = record.get("question", "").lower()
+            for proc_name in NURVID_PROCEDURE_ACTIONS.keys():
+                if proc_name.lower() in question_lower:
+                    procedure = proc_name
+                    break
+        # Restructure record to only include needed fields (consistent with Qwen2.5-VL)
+        record_data = {
+            "answer": record.get("answer", ""),
+            "gnd": record.get("gnd", ""),
+            "question": record.get("question", ""),
+            "video_id": record.get("metadata", {}).get("video_id", record.get("video_id", "")),
+            "procedure": procedure
+        }
+        dataset_groups[dataset].append(record_data)
     return dict(dataset_groups)
 def evaluate_dataset_next_action(dataset_name, records):
             temp_records = []
             for record in proc_records:
+                gnd_text = record.get('gnd', '')
                 gnd_text = normalize_action_text(gnd_text, dataset_name)
                 if gnd_text:
             pred_text = normalize_action_text(record.get('answer', ''), dataset_name)
+            # Get ground truth from gnd field only (consistent with Qwen2.5-VL)
+            gnd_text = record.get('gnd', '')
             gnd_text = normalize_action_text(gnd_text, dataset_name)

evaluation/evaluate_all_pai.py CHANGED Viewed

@@ -596,14 +596,31 @@ def print_overall_evaluation_results(output_file, tasks, all_task_results, skip_
                 for dataset_name, ds_records in dataset_records_dict.items():
                     if ds_records:
                         # Silently evaluate each dataset
-                        with contextlib.redirect_stdout(io.StringIO()):
-                            ds_results = module.evaluate_dataset_next_action(dataset_name, ds_records)
                         if "overall" in ds_results:
                             accuracy = ds_results["overall"].get("accuracy", 0.0)
                             all_accuracies.append(accuracy)
-                            # Track weighted metrics
-                            total_correct += int(accuracy * len(ds_records))
-                            total_samples += len(ds_records)
                 # Print only final aggregate metrics
                 if all_accuracies:

                 for dataset_name, ds_records in dataset_records_dict.items():
                     if ds_records:
                         # Silently evaluate each dataset
+                        # Suppress SentenceTransformer/safetensors warnings at fd level
+                        import logging, os
+                        logging.disable(logging.WARNING)
+                        old_fd_out = os.dup(1)
+                        old_fd_err = os.dup(2)
+                        devnull = os.open(os.devnull, os.O_WRONLY)
+                        os.dup2(devnull, 1)
+                        os.dup2(devnull, 2)
+                        try:
+                            with contextlib.redirect_stdout(io.StringIO()), contextlib.redirect_stderr(io.StringIO()):
+                                ds_results = module.evaluate_dataset_next_action(dataset_name, ds_records)
+                        finally:
+                            os.dup2(old_fd_out, 1)
+                            os.dup2(old_fd_err, 2)
+                            os.close(old_fd_out)
+                            os.close(old_fd_err)
+                            os.close(devnull)
+                            logging.disable(logging.NOTSET)
                         if "overall" in ds_results:
                             accuracy = ds_results["overall"].get("accuracy", 0.0)
+                            # Use actual evaluated count, not input count (some records may be skipped)
+                            evaluated_count = ds_results["overall"].get("count", len(ds_records))
                             all_accuracies.append(accuracy)
+                            total_correct += int(accuracy * evaluated_count)
+                            total_samples += evaluated_count
                 # Print only final aggregate metrics
                 if all_accuracies:

evaluation/evaluate_predictions.py CHANGED Viewed

@@ -306,7 +306,7 @@ def main():
                        help="Grouping strategy: 'per-dataset' or 'overall' (default: overall)")
     parser.add_argument("--analyze-only", action="store_true",
                        help="Only analyze the file structure without running evaluations")
-    parser.add_argument("--skip-llm-judge", action="store_true",
                        help="Skip LLM judge evaluation for caption tasks (use when LLM scores are pre-computed)")
     args = parser.parse_args()

                        help="Grouping strategy: 'per-dataset' or 'overall' (default: overall)")
     parser.add_argument("--analyze-only", action="store_true",
                        help="Only analyze the file structure without running evaluations")
+    parser.add_argument("--skip-llm-judge", default=True, action="store_true",
                        help="Skip LLM judge evaluation for caption tasks (use when LLM scores are pre-computed)")
     args = parser.parse_args()