File size: 17,573 Bytes
1027cfb
 
 
f0339f3
1027cfb
 
f0339f3
 
1027cfb
 
 
f0339f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5998027
 
f0339f3
 
 
 
 
 
 
 
5998027
b5317d7
 
 
 
 
 
f0339f3
 
 
 
 
 
 
 
 
 
1027cfb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5e9c3b9
1027cfb
5e9c3b9
b4ac443
 
 
1027cfb
 
 
5e9c3b9
 
 
1027cfb
5e9c3b9
 
 
 
 
 
 
 
b4ac443
 
 
 
 
 
5778893
 
 
 
b4ac443
 
 
 
 
 
 
 
 
1027cfb
e003f7b
 
 
 
 
 
 
 
f0339f3
e003f7b
 
 
 
 
 
f0339f3
 
e003f7b
f0339f3
 
e003f7b
f0339f3
 
e003f7b
 
 
 
376500e
e003f7b
 
 
 
 
b1f3e49
 
e003f7b
b5317d7
e003f7b
 
 
f0339f3
 
 
 
 
 
 
 
e003f7b
 
 
 
 
1027cfb
855423e
e003f7b
1027cfb
e003f7b
855423e
 
 
 
e003f7b
855423e
e003f7b
aa07520
 
5998027
aa07520
 
5998027
 
 
 
 
aa07520
 
 
5998027
ca754bb
 
 
 
 
 
5998027
5778893
5998027
aa07520
 
5998027
0e14c25
ca754bb
aa07520
 
1739efc
 
aa07520
 
 
 
 
 
 
 
5e9c3b9
 
 
aa07520
 
 
5e9c3b9
aa07520
b1f3e49
aa07520
b1f3e49
5e9c3b9
4d0ae13
b5317d7
 
 
 
5e9c3b9
 
 
 
 
 
b1f3e49
5e9c3b9
b1f3e49
 
5998027
 
 
 
e734bf6
 
5998027
 
cdd40ba
 
b1f3e49
 
 
5998027
 
 
 
 
 
 
 
 
 
 
 
b1f3e49
 
aa07520
5998027
 
aa07520
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
044cdf4
 
 
 
 
 
 
aa07520
1027cfb
aa07520
 
1027cfb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
"""
Simple data loader for OpenHands Index leaderboard.
Loads JSONL files from local directory or GitHub repository.
Uses pydantic models from openhands-index-results for validation.
"""
import os
import sys
import logging
import pandas as pd
import json
from pathlib import Path
from typing import Optional

# Logger for this module, named after the module itself.
logger = logging.getLogger(__name__)

# The pydantic models are bound lazily by _ensure_schema_models(), because they
# only become importable after setup_data has added them to the import path.
# Until then both model names stay None and the loaded flag stays False.
Metadata = None
ScoreEntry = None
_schema_models_loaded = False


def _ensure_schema_models():
    """Import the pydantic schema models on first use.

    On success the module-level ``Metadata`` and ``ScoreEntry`` names are bound
    to the imported classes and the loaded flag is set. On ``ImportError`` two
    warnings are logged and the flag stays false, so a later call tries again.

    Returns:
        The module-level loaded flag: truthy once the models are available.
    """
    global _schema_models_loaded, Metadata, ScoreEntry

    if not _schema_models_loaded:
        try:
            # validate_schema is expected in the cloned repo's scripts directory
            from validate_schema import Metadata as imported_metadata, ScoreEntry as imported_score_entry
        except ImportError as import_error:
            logger.warning(f"Could not import pydantic schema models: {import_error}")
            logger.warning("Data will be loaded without schema validation")
            _schema_models_loaded = False
        else:
            Metadata, ScoreEntry = imported_metadata, imported_score_entry
            _schema_models_loaded = True
            logger.info("Successfully loaded pydantic schema models from openhands-index-results")

    return _schema_models_loaded


def load_and_validate_agent_data(agent_dir: Path) -> tuple[Optional[dict], Optional[list], list[str]]:
    """
    Load one agent directory and validate it with pydantic models if available.

    Reads ``metadata.json`` and ``scores.json`` from ``agent_dir``. Problems are
    reported through the returned error list rather than raised, so that one
    broken submission does not abort loading of the others.

    Args:
        agent_dir: Directory holding ``metadata.json`` and ``scores.json``.

    Returns:
        Tuple of (metadata_dict, scores_list, validation_errors).
        ``metadata_dict`` and ``scores_list`` are both None when a file is
        missing, cannot be read or parsed, or has the wrong top-level JSON type
        (metadata must be an object, scores a list). Score entries that are not
        JSON objects are dropped and reported. Entries that fail schema
        validation are reported but kept as raw data.
    """
    metadata_file = agent_dir / "metadata.json"
    scores_file = agent_dir / "scores.json"

    if not metadata_file.exists() or not scores_file.exists():
        return None, None, [f"Missing metadata.json or scores.json in {agent_dir}"]

    # Load raw JSON. ValueError covers json.JSONDecodeError and UnicodeDecodeError.
    try:
        with open(metadata_file, encoding="utf-8") as f:
            metadata_raw = json.load(f)
        with open(scores_file, encoding="utf-8") as f:
            scores_raw = json.load(f)
    except (OSError, ValueError) as e:
        return None, None, [f"Could not read JSON in {agent_dir.name}: {e}"]

    if not isinstance(metadata_raw, dict) or not isinstance(scores_raw, list):
        return None, None, [
            f"Unexpected JSON structure in {agent_dir.name}: "
            "metadata.json must be an object and scores.json must be a list"
        ]

    errors = []

    # Callers read score entries with .get(), so anything that is not an
    # object is dropped here. Original indices are kept for error messages.
    indexed_scores = []
    for i, score in enumerate(scores_raw):
        if isinstance(score, dict):
            indexed_scores.append((i, score))
        else:
            errors.append(f"Score entry {i} in {agent_dir.name} is not a JSON object; skipped")

    if not (_ensure_schema_models() and Metadata and ScoreEntry):
        # No validation available, use raw data
        return metadata_raw, [score for _, score in indexed_scores], errors

    try:
        # mode='json' serializes enums as strings
        metadata_dict = Metadata(**metadata_raw).model_dump(mode='json')
    except Exception as e:
        errors.append(f"Metadata validation error in {agent_dir.name}: {e}")
        metadata_dict = metadata_raw  # Fall back to raw data

    scores_list = []
    for i, score in indexed_scores:
        try:
            validated_dict = ScoreEntry(**score).model_dump(mode='json')
        except Exception as e:
            errors.append(f"Score entry {i} validation error in {agent_dir.name}: {e}")
            scores_list.append(score)  # Fall back to raw data
            continue
        # Preserve any extra fields from raw data (like full_archive)
        for key, value in score.items():
            validated_dict.setdefault(key, value)
        scores_list.append(validated_dict)

    return metadata_dict, scores_list, errors


class SimpleLeaderboardViewer:
    """Simple replacement for agent-eval's LeaderboardViewer.

    Reads per-agent result directories below ``<data_dir>/<config>/results`` and
    aggregates them into one leaderboard row per agent (SDK version + model).

    Attributes:
        data_dir: Root data directory as a ``Path``.
        config: Config name; also the sub-directory of ``data_dir`` that is read.
        split: Split name used to select tasks from the suite configuration.
        config_path: ``data_dir / config``.
        suite_config: The ``suite_config`` object from ``agenteval.json``, or a
            minimal default when that file does not exist.
        tag_map: Maps category name -> list of benchmark names.
        benchmark_to_categories: Maps benchmark name -> list of category names.
    """

    # Categories shown on the leaderboard. The average score always divides by
    # the number of entries here, so a category without a score counts as 0.
    ALL_CATEGORIES = (
        'Issue Resolution',
        'Frontend',
        'Greenfield',
        'Testing',
        'Information Gathering',
    )

    # Benchmark -> categories, used when the suite configuration yields no
    # category mapping for the requested split.
    FALLBACK_CATEGORY_MAPPINGS = {
        'swe-bench': ['Issue Resolution'],
        'swe-bench-multimodal': ['Frontend'],
        'commit0': ['Greenfield'],
        'swt-bench': ['Testing'],
        'gaia': ['Information Gathering'],
    }

    def __init__(self, data_dir: str, config: str, split: str):
        """
        Args:
            data_dir: Path to data directory
            config: Config name (e.g., "1.0.0-dev1")
            split: Split name (e.g., "validation" or "test")

        Raises:
            KeyError: If ``agenteval.json`` exists but has no ``suite_config``
                key, or a matching split/task entry has no ``name``.
        """
        self.data_dir = Path(data_dir)
        self.config = config
        self.split = split
        self.config_path = self.data_dir / config

        # Load suite configuration
        config_file = self.config_path / "agenteval.json"
        if config_file.exists():
            with open(config_file, encoding="utf-8") as f:
                self.suite_config = json.load(f)["suite_config"]
        else:
            self.suite_config = {
                "name": "openhands-index",
                "version": config,
                "splits": [],
            }

        # Build tag map from config - organize benchmarks by category
        self.tag_map = {}
        self.benchmark_to_categories = {}

        config_has_mappings = False
        for split_config in self.suite_config.get("splits", []):
            if split_config["name"] != split:
                continue
            for task in split_config.get("tasks", []):
                task_name = task["name"]
                categories = []
                self.benchmark_to_categories[task_name] = categories
                for tag in task.get("tags", []):
                    # "Overall" and the benchmark's own name are not categories
                    if tag in ("Overall", task_name):
                        continue
                    benchmarks = self.tag_map.setdefault(tag, [])
                    if task_name not in benchmarks:
                        benchmarks.append(task_name)
                    categories.append(tag)
                    config_has_mappings = True

        # FALLBACK: If no mappings loaded from config, use hard-coded category mappings
        if not config_has_mappings:
            print("[DATA_LOADER] No agenteval.json found, using fallback category mappings")
            for benchmark, categories in self.FALLBACK_CATEGORY_MAPPINGS.items():
                # Copy so instances never share (and mutate) the class-level lists
                self.benchmark_to_categories[benchmark] = list(categories)
                for category in categories:
                    benchmarks = self.tag_map.setdefault(category, [])
                    if benchmark not in benchmarks:
                        benchmarks.append(benchmark)

    @staticmethod
    def _is_missing(value) -> bool:
        """Return True for None and for float NaN.

        A JSON null read into a numeric DataFrame column comes back as NaN, so
        testing ``is None`` alone does not recognise a missing score or cost.
        """
        return value is None or (isinstance(value, float) and value != value)

    def _load_from_agent_dirs(self) -> Optional[pd.DataFrame]:
        """Load data from the agent-centric directory structure (results/YYYYMMDD_model/).

        Returns:
            A DataFrame with one row per (agent, benchmark) score entry, or
            None when the results directory is missing or yields no records.
        """
        results_dir = self.config_path / "results"
        if not results_dir.is_dir():
            return None

        all_records = []
        all_validation_errors = []

        # iterdir() yields entries in arbitrary order; sorting keeps the row
        # order (and which row is "first" for an agent) reproducible.
        for agent_dir in sorted(results_dir.iterdir()):
            if not agent_dir.is_dir():
                continue

            # Load and validate using pydantic models
            metadata, scores, errors = load_and_validate_agent_data(agent_dir)
            if errors:
                all_validation_errors.extend(errors)
            if metadata is None or scores is None:
                continue

            # Create one record per benchmark
            for index, score_entry in enumerate(scores):
                if not isinstance(score_entry, dict):
                    all_validation_errors.append(
                        f"Score entry {index} in {agent_dir.name} is not a JSON object; skipped"
                    )
                    continue
                all_records.append({
                    'agent_version': metadata.get('agent_version', 'Unknown'),
                    'llm_base': metadata.get('model', 'unknown'),
                    'openness': metadata.get('openness', 'unknown'),
                    'submission_time': metadata.get('submission_time', ''),
                    'score': score_entry.get('score'),
                    'metric': score_entry.get('metric', 'unknown'),
                    'cost_per_instance': score_entry.get('cost_per_instance'),
                    'average_runtime': score_entry.get('average_runtime'),
                    'tags': [score_entry.get('benchmark')],
                    'full_archive': score_entry.get('full_archive', ''),  # Download URL for trajectories
                })

        # Log validation errors if any (first 5 only, to keep the log readable)
        if all_validation_errors:
            logger.warning(f"Schema validation errors ({len(all_validation_errors)} total):")
            for error in all_validation_errors[:5]:
                logger.warning(f"  - {error}")
            if len(all_validation_errors) > 5:
                logger.warning(f"  ... and {len(all_validation_errors) - 5} more")

        if not all_records:
            return None

        return pd.DataFrame(all_records)

    def _build_agent_record(self, agent_id, agent_records, openness_mapping) -> dict:
        """Collapse all benchmark rows of one agent into a single leaderboard record.

        Args:
            agent_id: Unique agent identifier (version + model), stored as ``id``.
            agent_records: Non-empty DataFrame slice holding this agent's rows.
            openness_mapping: Mapping from raw openness values to display
                values; unmapped values are passed through unchanged.

        Returns:
            Dict with the core agent columns, ``<benchmark> score/cost/download``
            columns, ``<category> score/cost`` averages, ``average score``,
            ``average cost`` and ``categories_completed``.
        """
        first_record = agent_records.iloc[0]
        raw_openness = first_record['openness']

        record = {
            # Core agent info - use final display names
            'SDK version': first_record['agent_version'],
            'Language model': first_record['llm_base'],
            'openness': openness_mapping.get(raw_openness, raw_openness),
            'date': first_record['submission_time'],
            # Used as unique identifier for Pareto frontier calculation
            'id': agent_id,
            'source': first_record.get('source', ''),
            'logs': first_record.get('logs', ''),
        }

        # {category: {'scores': [...], 'costs': [...]}}
        category_data = {}

        for _, row in agent_records.iterrows():
            tags = row['tags'] if isinstance(row['tags'], list) else [row['tags']]
            score = row['score']
            cost = row['cost_per_instance']
            full_archive_url = row.get('full_archive', '')

            for tag in tags:
                # Per-benchmark columns keep the raw values
                record[f'{tag} score'] = score
                record[f'{tag} cost'] = cost

                # A missing URL may arrive as NaN, which is truthy; skip it explicitly
                if full_archive_url and not self._is_missing(full_archive_url):
                    record[f'{tag} download'] = full_archive_url

                for category in self.benchmark_to_categories.get(tag, []):
                    bucket = category_data.setdefault(category, {'scores': [], 'costs': []})
                    bucket['scores'].append(score)
                    bucket['costs'].append(cost)

        # Category-level averages; categories without a usable score stay
        # absent from the record and show as NA.
        all_costs = []
        categories_with_scores = 0
        for category in self.ALL_CATEGORIES:
            data = category_data.get(category)
            if data is None:
                continue
            valid_scores = [s for s in data['scores'] if not self._is_missing(s)]
            if not valid_scores:
                continue
            record[f'{category} score'] = sum(valid_scores) / len(valid_scores)
            categories_with_scores += 1

            valid_costs = [c for c in data['costs'] if not self._is_missing(c)]
            if valid_costs:
                record[f'{category} cost'] = sum(valid_costs) / len(valid_costs)
                all_costs.extend(valid_costs)

        # Average score: always divide by the number of categories (treating
        # missing categories as 0). This penalizes incomplete submissions.
        score_sum = 0
        for category in self.ALL_CATEGORIES:
            value = record.get(f'{category} score')
            if not self._is_missing(value):
                score_sum += value
        record['average score'] = score_sum / len(self.ALL_CATEGORIES)

        # Average cost per instance across all benchmarks with a known cost
        record['average cost'] = sum(all_costs) / len(all_costs) if all_costs else None

        # Track how many categories were completed
        record['categories_completed'] = categories_with_scores

        return record

    def _load(self):
        """Load data from agent-centric directories.

        Returns:
            Tuple ``(dataframe, tag_map)``. On success the DataFrame has one
            row per agent. When no data is found or processing fails, it is a
            single-column ``Message`` DataFrame and the tag map is ``{}``.
        """
        df = self._load_from_agent_dirs()

        if df is None:
            # Return a dataframe carrying the error message
            return pd.DataFrame({
                "Message": [f"No data found for split '{self.split}' in results directory"]
            }), {}

        try:
            # Imported inside the try (once, not per agent) so that an import
            # failure is reported through the error frame below.
            from aliases import OPENNESS_MAPPING

            # Unique identifier for each agent (version + model)
            df['agent_id'] = df['agent_version'] + '_' + df['llm_base']

            transformed_records = [
                self._build_agent_record(
                    agent_id, df[df['agent_id'] == agent_id], OPENNESS_MAPPING
                )
                for agent_id in df['agent_id'].unique()
            ]
            transformed_df = pd.DataFrame(transformed_records)

            # Build tag map if not already built: each tag maps to itself
            if not self.tag_map:
                all_tags = set()
                for tags in df['tags']:
                    all_tags.update(tags if isinstance(tags, list) else [tags])
                self.tag_map = {tag: [tag] for tag in sorted(all_tags)}

            # DEBUG: Print sample of loaded data
            print(f"[DATA_LOADER] Loaded {len(transformed_df)} agents")
            if len(transformed_df) > 0:
                sample_cols = ['id', 'average score', 'average cost']
                available_cols = [c for c in sample_cols if c in transformed_df.columns]
                print(f"[DATA_LOADER] Sample row: {transformed_df[available_cols].iloc[0].to_dict()}")

            return transformed_df, self.tag_map
        except Exception as e:
            # Top-level boundary: log the traceback and return an error frame
            logger.exception("Error loading leaderboard data")
            return pd.DataFrame({
                "Message": [f"Error loading data: {e}"]
            }), {}

    def get_dataframe(self):
        """Return the DataFrame from ``_load()``, discarding the tag map."""
        df, _ = self._load()
        return df


def load_mock_data_locally(data_dir: str = "mock_results"):
    """
    Load mock data from local directory for testing.

    Every sub-directory of ``data_dir`` is treated as a config, and every
    ``*.jsonl`` file inside it names a split (the file stem). The JSONL file
    itself is not read here; it only determines which viewers are created.

    Args:
        data_dir: Path to mock results directory

    Returns:
        Dictionary mapping split names to SimpleLeaderboardViewer instances.
        Empty when ``data_dir`` is not an existing directory. If several
        configs contain the same split name, the config that sorts last wins.
    """
    viewers = {}
    data_path = Path(data_dir)

    # is_dir() also rejects a regular file, which iterdir() would choke on
    if not data_path.is_dir():
        print(f"Warning: Mock data directory '{data_dir}' not found")
        return viewers

    # iterdir()/glob() yield in arbitrary order; sort so that the viewer kept
    # for a duplicated split name does not depend on the filesystem.
    for config_dir in sorted(data_path.iterdir()):
        if not config_dir.is_dir():
            continue
        for jsonl_file in sorted(config_dir.glob("*.jsonl")):
            split_name = jsonl_file.stem
            viewers[split_name] = SimpleLeaderboardViewer(
                data_dir=str(data_path),
                config=config_dir.name,
                split=split_name,
            )

    return viewers