MedGRPO Team
committed on
Commit
·
b28cd8f
1
Parent(s):
2362e57
upload prediction only
Browse files
evaluation/evaluate_predictions.py
CHANGED
|
@@ -80,10 +80,10 @@ def parse_id(id_str):
|
|
| 80 |
|
| 81 |
|
| 82 |
def merge_with_ground_truth(predictions_file, ground_truth_file):
|
| 83 |
-
"""Merge prediction-only file with ground-truth.
|
| 84 |
|
| 85 |
Args:
|
| 86 |
-
predictions_file: Path to predictions JSON (
|
| 87 |
ground_truth_file: Path to ground-truth JSON
|
| 88 |
|
| 89 |
Returns:
|
|
@@ -97,64 +97,63 @@ def merge_with_ground_truth(predictions_file, ground_truth_file):
|
|
| 97 |
with open(ground_truth_file, 'r') as f:
|
| 98 |
ground_truth = json.load(f)
|
| 99 |
|
| 100 |
-
|
| 101 |
-
print("[EvaluationWrapper]
|
| 102 |
-
gt_index = {}
|
| 103 |
-
for record in ground_truth:
|
| 104 |
-
metadata = record.get('metadata', {})
|
| 105 |
-
# Create key from metadata
|
| 106 |
-
key = f"{metadata.get('video_id')}&&{metadata.get('input_video_start_frame')}&&{metadata.get('input_video_end_frame')}&&{metadata.get('fps')}"
|
| 107 |
-
gt_index[key] = record
|
| 108 |
|
| 109 |
-
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
|
| 112 |
-
# Merge predictions with ground-truth
|
| 113 |
merged = {}
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
data_source = gt_record.get('data_source', 'Unknown')
|
| 132 |
-
# Fallback to dataset_name if data_source is missing
|
| 133 |
if data_source == 'Unknown' or not data_source:
|
| 134 |
data_source = gt_record.get('dataset_name', 'Unknown')
|
| 135 |
|
|
|
|
| 136 |
merged_record = {
|
| 137 |
'metadata': gt_record.get('metadata', {}),
|
| 138 |
-
'qa_type':
|
| 139 |
'struc_info': gt_record.get('struc_info', []),
|
| 140 |
-
'question':
|
| 141 |
-
'gnd':
|
| 142 |
'answer': pred.get('prediction', ''), # Model prediction
|
| 143 |
'data_source': data_source
|
| 144 |
}
|
| 145 |
|
| 146 |
# Use sequential keys like results.json
|
| 147 |
merged[str(i)] = merged_record
|
| 148 |
-
matched_count += 1
|
| 149 |
|
| 150 |
-
|
|
|
|
| 151 |
|
| 152 |
-
|
| 153 |
-
print(f"[EvaluationWrapper] ⚠️ WARNING: {len(unmatched_ids)} predictions not found in ground-truth")
|
| 154 |
-
if len(unmatched_ids) <= 5:
|
| 155 |
-
print(f"[EvaluationWrapper] Unmatched IDs: {unmatched_ids}")
|
| 156 |
-
else:
|
| 157 |
-
print(f"[EvaluationWrapper] First 5 unmatched IDs: {unmatched_ids[:5]}")
|
| 158 |
|
| 159 |
return merged
|
| 160 |
|
|
|
|
| 80 |
|
| 81 |
|
| 82 |
def merge_with_ground_truth(predictions_file, ground_truth_file):
|
| 83 |
+
"""Merge prediction-only file with ground-truth by array index.
|
| 84 |
|
| 85 |
Args:
|
| 86 |
+
predictions_file: Path to predictions JSON (array format, same order as ground truth)
|
| 87 |
ground_truth_file: Path to ground-truth JSON
|
| 88 |
|
| 89 |
Returns:
|
|
|
|
| 97 |
with open(ground_truth_file, 'r') as f:
|
| 98 |
ground_truth = json.load(f)
|
| 99 |
|
| 100 |
+
print(f"[EvaluationWrapper] Predictions: {len(predictions)} records")
|
| 101 |
+
print(f"[EvaluationWrapper] Ground-truth: {len(ground_truth)} records")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
|
| 103 |
+
# Check lengths match
|
| 104 |
+
if len(predictions) != len(ground_truth):
|
| 105 |
+
raise ValueError(
|
| 106 |
+
f"Length mismatch: predictions ({len(predictions)}) != ground truth ({len(ground_truth)}). "
|
| 107 |
+
f"Predictions must be in the same order as ground truth."
|
| 108 |
+
)
|
| 109 |
|
| 110 |
+
# Merge predictions with ground-truth by index
|
| 111 |
merged = {}
|
| 112 |
+
mismatched_qa_types = []
|
| 113 |
+
|
| 114 |
+
for i, (pred, gt_record) in enumerate(zip(predictions, ground_truth)):
|
| 115 |
+
# Validate prediction has 'prediction' field
|
| 116 |
+
if 'prediction' not in pred:
|
| 117 |
+
raise ValueError(f"Prediction at index {i} missing 'prediction' field")
|
| 118 |
+
|
| 119 |
+
# Optional: check qa_type matches
|
| 120 |
+
if 'qa_type' in pred and pred['qa_type'] != gt_record.get('qa_type'):
|
| 121 |
+
mismatched_qa_types.append(i)
|
| 122 |
+
|
| 123 |
+
# Extract question and ground truth from conversations
|
| 124 |
+
question = ''
|
| 125 |
+
gnd = ''
|
| 126 |
+
if 'conversations' in gt_record:
|
| 127 |
+
for msg in gt_record['conversations']:
|
| 128 |
+
if msg.get('from') in ['human', 'user']:
|
| 129 |
+
# Remove <video> token to match original format
|
| 130 |
+
question = msg.get('value', '').replace('<video>\n', '').replace('<video>', '')
|
| 131 |
+
elif msg.get('from') in ['gpt', 'assistant']:
|
| 132 |
+
gnd = msg.get('value', '')
|
| 133 |
+
|
| 134 |
+
# Get data_source
|
| 135 |
data_source = gt_record.get('data_source', 'Unknown')
|
|
|
|
| 136 |
if data_source == 'Unknown' or not data_source:
|
| 137 |
data_source = gt_record.get('dataset_name', 'Unknown')
|
| 138 |
|
| 139 |
+
# Create merged record
|
| 140 |
merged_record = {
|
| 141 |
'metadata': gt_record.get('metadata', {}),
|
| 142 |
+
'qa_type': gt_record.get('qa_type', ''),
|
| 143 |
'struc_info': gt_record.get('struc_info', []),
|
| 144 |
+
'question': question,
|
| 145 |
+
'gnd': gnd,
|
| 146 |
'answer': pred.get('prediction', ''), # Model prediction
|
| 147 |
'data_source': data_source
|
| 148 |
}
|
| 149 |
|
| 150 |
# Use sequential keys like results.json
|
| 151 |
merged[str(i)] = merged_record
|
|
|
|
| 152 |
|
| 153 |
+
if mismatched_qa_types:
|
| 154 |
+
print(f"[EvaluationWrapper] ⚠️ Warning: {len(mismatched_qa_types)} samples with mismatched qa_type")
|
| 155 |
|
| 156 |
+
print(f"[EvaluationWrapper] ✓ Successfully merged {len(merged)}/{len(predictions)} predictions")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
return merged
|
| 159 |
|
evaluation/extract_predictions.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Extract predictions from results.json for user submission format.
|
| 3 |
+
|
| 4 |
+
This script extracts only the prediction-related fields from results.json,
|
| 5 |
+
creating a format that users would submit (without ground truth data).
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import sys
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def extract_predictions(results_file: str, output_file: str) -> None:
    """Extract predictions from results.json into the user-submission format.

    Reads a results.json dict keyed by numeric strings ("0", "1", ...) and
    writes a JSON array of {'id', 'qa_type', 'prediction'} records in the
    same order (dicts preserve insertion order).

    Args:
        results_file: Path to results.json (dict format with numeric keys)
        output_file: Path to save predictions (list format)
    """
    print(f"Loading results from: {results_file}")
    with open(results_file) as f:
        results = json.load(f)

    print(f"Loaded {len(results)} results")

    # Extract predictions; the dict keys are just sequential indices, so
    # iterate values directly instead of items() with an unused key.
    predictions = []
    for idx, result in enumerate(results.values()):
        # Create ID from metadata
        metadata = result.get('metadata', {})
        video_id = metadata.get('video_id', '')

        # Try both naming conventions for frame numbers. Use explicit
        # None checks rather than `a or b`, so a legitimate frame number
        # of 0 (falsy) is not silently replaced by the fallback key.
        start_frame = metadata.get('input_video_start_frame')
        if start_frame is None:
            start_frame = metadata.get('start_frame', '')
        end_frame = metadata.get('input_video_end_frame')
        if end_frame is None:
            end_frame = metadata.get('end_frame', '')
        fps = metadata.get('fps', '')

        # ID format: video_id&&start_frame&&end_frame&&fps
        sample_id = f"{video_id}&&{start_frame}&&{end_frame}&&{fps}"

        predictions.append({
            'id': sample_id,
            'qa_type': result.get('qa_type', ''),
            'prediction': result.get('answer', '')
        })

        if (idx + 1) % 1000 == 0:
            print(f"Processed {idx + 1} predictions...")

    # Save predictions
    print(f"Saving {len(predictions)} predictions to: {output_file}")
    with open(output_file, 'w') as f:
        json.dump(predictions, f, indent=2)

    print(f"✓ Successfully extracted {len(predictions)} predictions")

    # Show sample
    if predictions:
        print("\nSample prediction (first entry):")
        print(json.dumps(predictions[0], indent=2))
| 69 |
+
def main():
    """Command-line entry point: validate argv, then run the extraction."""
    argv = sys.argv
    if len(argv) != 3:
        # Wrong arity: print usage text and exit with a failure status.
        for usage_line in (
            "Usage: python extract_predictions.py results.json predictions.json",
            "",
            "Arguments:",
            " results.json - Input results file (with ground truth)",
            " predictions.json - Output predictions file (user format)",
        ):
            print(usage_line)
        sys.exit(1)

    _, src_path, dst_path = argv
    extract_predictions(src_path, dst_path)


if __name__ == "__main__":
    main()
|
evaluation/merge_predictions_with_gt.py
CHANGED
|
@@ -16,10 +16,10 @@ def merge_predictions_with_ground_truth(
|
|
| 16 |
output_file: str
|
| 17 |
) -> Tuple[bool, str]:
|
| 18 |
"""
|
| 19 |
-
Merge user predictions with server-side ground truth.
|
| 20 |
|
| 21 |
Args:
|
| 22 |
-
predictions_file: Path to user's predictions JSON (
|
| 23 |
ground_truth_file: Path to ground truth JSON (struc_info, GPT responses)
|
| 24 |
output_file: Path to save merged JSON for evaluation
|
| 25 |
|
|
@@ -44,73 +44,71 @@ def merge_predictions_with_ground_truth(
|
|
| 44 |
if not isinstance(predictions, list):
|
| 45 |
return False, "Predictions must be a JSON array"
|
| 46 |
|
| 47 |
-
#
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
| 49 |
for i, pred in enumerate(predictions):
|
| 50 |
-
if 'id' not in pred:
|
| 51 |
-
return False, f"Prediction {i} missing 'id' field"
|
| 52 |
if 'prediction' not in pred:
|
| 53 |
-
return False, f"Prediction {
|
| 54 |
-
|
| 55 |
-
pred_lookup[pred['id']] = pred
|
| 56 |
|
| 57 |
-
# Merge predictions with ground truth
|
| 58 |
-
merged =
|
| 59 |
-
missing_predictions = []
|
| 60 |
mismatched_qa_types = []
|
| 61 |
|
| 62 |
-
for gt_sample in ground_truth:
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
# Skip if user didn't provide prediction for this sample
|
| 66 |
-
if sample_id not in pred_lookup:
|
| 67 |
-
missing_predictions.append(sample_id)
|
| 68 |
-
continue
|
| 69 |
-
|
| 70 |
-
pred = pred_lookup[sample_id]
|
| 71 |
|
| 72 |
# Verify qa_type matches (optional validation)
|
| 73 |
if 'qa_type' in pred and pred['qa_type'] != gt_sample.get('qa_type'):
|
| 74 |
mismatched_qa_types.append({
|
| 75 |
-
'
|
| 76 |
-
'predicted': pred
|
| 77 |
'actual': gt_sample.get('qa_type')
|
| 78 |
})
|
| 79 |
|
| 80 |
-
#
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
print(f"Saving merged data to: {output_file}")
|
| 95 |
with open(output_file, 'w') as f:
|
| 96 |
json.dump(merged, f, indent=2)
|
| 97 |
|
| 98 |
# Build result message
|
| 99 |
message_parts = [
|
| 100 |
-
f"Successfully merged {len(merged)}
|
| 101 |
]
|
| 102 |
|
| 103 |
-
if missing_predictions:
|
| 104 |
-
message_parts.append(
|
| 105 |
-
f"Warning: {len(missing_predictions)} samples without predictions"
|
| 106 |
-
)
|
| 107 |
-
|
| 108 |
if mismatched_qa_types:
|
| 109 |
message_parts.append(
|
| 110 |
f"Warning: {len(mismatched_qa_types)} samples with mismatched qa_type"
|
| 111 |
)
|
| 112 |
for mismatch in mismatched_qa_types[:5]: # Show first 5
|
| 113 |
-
print(f" Mismatch
|
| 114 |
|
| 115 |
message = ". ".join(message_parts)
|
| 116 |
print(message)
|
|
@@ -131,9 +129,11 @@ def main():
|
|
| 131 |
print("Usage: python merge_predictions_with_gt.py predictions.json ground_truth.json output.json")
|
| 132 |
print()
|
| 133 |
print("Arguments:")
|
| 134 |
-
print(" predictions.json - User's predictions (
|
| 135 |
print(" ground_truth.json - Server's ground truth (struc_info, GPT responses)")
|
| 136 |
print(" output.json - Merged output for evaluation")
|
|
|
|
|
|
|
| 137 |
sys.exit(1)
|
| 138 |
|
| 139 |
predictions_file = sys.argv[1]
|
|
|
|
| 16 |
output_file: str
|
| 17 |
) -> Tuple[bool, str]:
|
| 18 |
"""
|
| 19 |
+
Merge user predictions with server-side ground truth by array index.
|
| 20 |
|
| 21 |
Args:
|
| 22 |
+
predictions_file: Path to user's predictions JSON array (same order as ground truth)
|
| 23 |
ground_truth_file: Path to ground truth JSON (struc_info, GPT responses)
|
| 24 |
output_file: Path to save merged JSON for evaluation
|
| 25 |
|
|
|
|
| 44 |
if not isinstance(predictions, list):
|
| 45 |
return False, "Predictions must be a JSON array"
|
| 46 |
|
| 47 |
+
# Check lengths match for index-based merging
|
| 48 |
+
if len(predictions) != len(ground_truth):
|
| 49 |
+
return False, f"Predictions ({len(predictions)}) and ground truth ({len(ground_truth)}) must have the same length"
|
| 50 |
+
|
| 51 |
+
# Validate predictions have required fields
|
| 52 |
for i, pred in enumerate(predictions):
|
|
|
|
|
|
|
| 53 |
if 'prediction' not in pred:
|
| 54 |
+
return False, f"Prediction at index {i} missing 'prediction' field"
|
|
|
|
|
|
|
| 55 |
|
| 56 |
+
# Merge predictions with ground truth by index
|
| 57 |
+
merged = {}
|
|
|
|
| 58 |
mismatched_qa_types = []
|
| 59 |
|
| 60 |
+
for idx, gt_sample in enumerate(ground_truth):
|
| 61 |
+
pred = predictions[idx]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
# Verify qa_type matches (optional validation)
|
| 64 |
if 'qa_type' in pred and pred['qa_type'] != gt_sample.get('qa_type'):
|
| 65 |
mismatched_qa_types.append({
|
| 66 |
+
'index': idx,
|
| 67 |
+
'predicted': pred.get('qa_type'),
|
| 68 |
'actual': gt_sample.get('qa_type')
|
| 69 |
})
|
| 70 |
|
| 71 |
+
# Create minimal format matching original results.json
|
| 72 |
+
# Only include essential fields: metadata, qa_type, struc_info, question, gnd, answer, data_source
|
| 73 |
+
merged_sample = {
|
| 74 |
+
'metadata': gt_sample.get('metadata', {}),
|
| 75 |
+
'qa_type': gt_sample.get('qa_type', ''),
|
| 76 |
+
'struc_info': gt_sample.get('struc_info', []),
|
| 77 |
+
'question': '', # Extract from conversations if present
|
| 78 |
+
'gnd': '', # Extract from conversations if present
|
| 79 |
+
'answer': pred['prediction'],
|
| 80 |
+
'data_source': gt_sample.get('data_source', '')
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
# Extract question and ground truth answer from conversations
|
| 84 |
+
if 'conversations' in gt_sample:
|
| 85 |
+
for msg in gt_sample['conversations']:
|
| 86 |
+
if msg.get('from') in ['human', 'user']:
|
| 87 |
+
# Remove <video> token from question to match original format
|
| 88 |
+
question = msg.get('value', '')
|
| 89 |
+
merged_sample['question'] = question.replace('<video>\n', '').replace('<video>', '')
|
| 90 |
+
elif msg.get('from') in ['gpt', 'assistant']:
|
| 91 |
+
merged_sample['gnd'] = msg.get('value', '')
|
| 92 |
+
|
| 93 |
+
# Use numeric string key to match original format
|
| 94 |
+
merged[str(idx)] = merged_sample
|
| 95 |
+
|
| 96 |
+
# Save merged data as dict with numeric string keys
|
| 97 |
print(f"Saving merged data to: {output_file}")
|
| 98 |
with open(output_file, 'w') as f:
|
| 99 |
json.dump(merged, f, indent=2)
|
| 100 |
|
| 101 |
# Build result message
|
| 102 |
message_parts = [
|
| 103 |
+
f"Successfully merged {len(merged)} samples"
|
| 104 |
]
|
| 105 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
if mismatched_qa_types:
|
| 107 |
message_parts.append(
|
| 108 |
f"Warning: {len(mismatched_qa_types)} samples with mismatched qa_type"
|
| 109 |
)
|
| 110 |
for mismatch in mismatched_qa_types[:5]: # Show first 5
|
| 111 |
+
print(f" Mismatch at index {mismatch['index']}: predicted: {mismatch['predicted']}, actual: {mismatch['actual']}")
|
| 112 |
|
| 113 |
message = ". ".join(message_parts)
|
| 114 |
print(message)
|
|
|
|
| 129 |
print("Usage: python merge_predictions_with_gt.py predictions.json ground_truth.json output.json")
|
| 130 |
print()
|
| 131 |
print("Arguments:")
|
| 132 |
+
print(" predictions.json - User's predictions array (same length/order as ground truth)")
|
| 133 |
print(" ground_truth.json - Server's ground truth (struc_info, GPT responses)")
|
| 134 |
print(" output.json - Merged output for evaluation")
|
| 135 |
+
print()
|
| 136 |
+
print("Note: Predictions and ground truth are merged by array index (0-based).")
|
| 137 |
sys.exit(1)
|
| 138 |
|
| 139 |
predictions_file = sys.argv[1]
|