"""Generate mock results data in JSONL format for OpenHands Index."""
import json
from pathlib import Path
from datetime import datetime

# Define the 6 benchmarks
BENCHMARKS = {
    "swe-bench": {
        "tags": ["swe-bench"],
        "metric": "resolve_rate",
        "metric_display": "Resolve Rate (%)"
    },
    "multi-swe-bench": {
        "tags": ["multi-swe-bench"],
        "metric": "resolve_rate",
        "metric_display": "Resolve Rate (%)"
    },
    "swe-bench-multimodal": {
        "tags": ["swe-bench-multimodal"],
        "metric": "resolve_rate",
        "metric_display": "Resolve Rate (%)"
    },
    "swt-bench": {
        "tags": ["swt-bench"],
        "metric": "success_rate",
        "metric_display": "Success Rate (%)"
    },
    "commit0": {
        "tags": ["commit0"],
        "metric": "test_pass_rate",
        "metric_display": "Test Pass Rate (%)"
    },
    "gaia": {
        "tags": ["gaia"],
        "metric": "accuracy",
        "metric_display": "Accuracy (%)"
    }
}

# Mock agents with realistic scores
MOCK_AGENTS = [
    {
        "agent_name": "1.0.2",
        "llm_base": "claude-3-5-sonnet-20241022",
        "openness": "closed",
        "scores": {
            "swe-bench": 48.3,
            "multi-swe-bench": 35.2,
            "swe-bench-multimodal": 42.1,
            "swt-bench": 65.4,
            "commit0": 71.2,
            "gaia": 58.7
        }
    },
    {
        "agent_name": "1.0.1",
        "llm_base": "gpt-4o-2024-11-20",
        "openness": "closed",
        "scores": {
            "swe-bench": 45.1,
            "multi-swe-bench": 32.8,
            "swe-bench-multimodal": 39.5,
            "swt-bench": 62.3,
            "commit0": 68.9,
            "gaia": 55.2
        }
    },
    {
        "agent_name": "1.0.0",
        "llm_base": "gpt-4-turbo-2024-04-09",
        "openness": "closed",
        "scores": {
            "swe-bench": 38.7,
            "multi-swe-bench": 28.4,
            "swe-bench-multimodal": 34.2,
            "swt-bench": 54.1,
            "commit0": 61.5,
            "gaia": 48.3
        }
    },
    {
        "agent_name": "0.9.5",
        "llm_base": "gpt-4o-mini-2024-07-18",
        "openness": "closed",
        "scores": {
            "swe-bench": 32.5,
            "multi-swe-bench": 24.1,
            "swe-bench-multimodal": 28.9,
            "swt-bench": 47.8,
            "commit0": 55.3,
            "gaia": 42.1
        }
    },
    {
        "agent_name": "0.9.0",
        "llm_base": "claude-3-opus-20240229",
        "openness": "closed",
        "scores": {
            "swe-bench": 29.8,
            "multi-swe-bench": 21.5,
            "swe-bench-multimodal": 25.7,
            "swt-bench": 44.2,
            "commit0": 52.1,
            "gaia": 39.4
        }
    },
]


def generate_mock_data():
    """Generate mock JSONL files for all benchmarks."""
    output_dir = Path("mock_results/1.0.0-dev1")
    output_dir.mkdir(parents=True, exist_ok=True)
    
    # Create agenteval.json config
    config = {
        "suite_config": {
            "name": "openhands-index",
            "version": "1.0.0-dev1",
            "splits": []
        }
    }
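    # For illustration, once the loop below has run, the config ends up shaped roughly like:
    #   {"suite_config": {"name": "openhands-index", "version": "1.0.0-dev1", "splits": [
    #       {"name": "swe-bench", "tasks": [{"name": "swe-bench", "tags": ["swe-bench"]}]},
    #       ...
    #   ]}}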
    
    # Generate data for each benchmark
    for benchmark_name, benchmark_info in BENCHMARKS.items():
        print(f"Generating mock data for {benchmark_name}...")
        
        # Add to config
        config["suite_config"]["splits"].append({
            "name": benchmark_name,
            "tasks": [{
                "name": benchmark_name,
                "tags": benchmark_info["tags"]
            }]
        })
        
        # Generate JSONL file
        jsonl_path = output_dir / f"{benchmark_name}.jsonl"
        with open(jsonl_path, 'w') as f:
            for agent in MOCK_AGENTS:
                record = {
                    "agent_name": agent["agent_name"],
                    "llm_base": agent["llm_base"],
                    "openness": agent["openness"],
                    "score": agent["scores"][benchmark_name],
                    "metric": benchmark_info["metric"],
                    "submission_time": datetime.now().isoformat(),
                    "tags": benchmark_info["tags"],
                    # Additional metadata
                    "total_cost": round(10.0 + agent["scores"][benchmark_name] * 0.5, 2),
                    "total_runtime": round(300 + agent["scores"][benchmark_name] * 5, 1),
                }
                f.write(json.dumps(record) + '\n')
        
        print(f"  Created {jsonl_path}")
    
    # Write config file
    config_path = output_dir / "agenteval.json"
    with open(config_path, 'w') as f:
        json.dump(config, f, indent=2)
    print(f"\nCreated config: {config_path}")
    
    print("\n✓ Mock data generation complete!")
    print(f"  Location: {output_dir}")
    print(f"  Benchmarks: {', '.join(BENCHMARKS.keys())}")
    print(f"  Agents: {len(MOCK_AGENTS)}")


if __name__ == "__main__":
    generate_mock_data()