"""Generate mock results data in JSONL format for OpenHands Index."""
import json
import os
from pathlib import Path
from datetime import datetime
# Define the 6 benchmarks
BENCHMARKS = {
    "swe-bench": {
        "tags": ["swe-bench"],
        "metric": "resolve_rate",
        "metric_display": "Resolve Rate (%)"
    },
    "multi-swe-bench": {
        "tags": ["multi-swe-bench"],
        "metric": "resolve_rate",
        "metric_display": "Resolve Rate (%)"
    },
    "swe-bench-multimodal": {
        "tags": ["swe-bench-multimodal"],
        "metric": "resolve_rate",
        "metric_display": "Resolve Rate (%)"
    },
    "swt-bench": {
        "tags": ["swt-bench"],
        "metric": "success_rate",
        "metric_display": "Success Rate (%)"
    },
    "commit0": {
        "tags": ["commit0"],
        "metric": "test_pass_rate",
        "metric_display": "Test Pass Rate (%)"
    },
    "gaia": {
        "tags": ["gaia"],
        "metric": "accuracy",
        "metric_display": "Accuracy (%)"
    }
}
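
# A minimal sketch of how the mapping above might be consumed when rendering
# scores for display; format_score is a hypothetical helper, not part of the
# generator itself.
def format_score(benchmark_name: str, score: float) -> str:
    """Format a raw score with its benchmark's display label, e.g. '48.3 Resolve Rate (%)'."""
    display = BENCHMARKS[benchmark_name]["metric_display"]
    return f"{score:.1f} {display}"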

# Mock agents with realistic scores
MOCK_AGENTS = [
    {
        "agent_name": "1.0.2",
        "llm_base": "claude-3-5-sonnet-20241022",
        "openness": "closed",
        "scores": {
            "swe-bench": 48.3,
            "multi-swe-bench": 35.2,
            "swe-bench-multimodal": 42.1,
            "swt-bench": 65.4,
            "commit0": 71.2,
            "gaia": 58.7
        }
    },
    {
        "agent_name": "1.0.1",
        "llm_base": "gpt-4o-2024-11-20",
        "openness": "closed",
        "scores": {
            "swe-bench": 45.1,
            "multi-swe-bench": 32.8,
            "swe-bench-multimodal": 39.5,
            "swt-bench": 62.3,
            "commit0": 68.9,
            "gaia": 55.2
        }
    },
    {
        "agent_name": "1.0.0",
        "llm_base": "gpt-4-turbo-2024-04-09",
        "openness": "closed",
        "scores": {
            "swe-bench": 38.7,
            "multi-swe-bench": 28.4,
            "swe-bench-multimodal": 34.2,
            "swt-bench": 54.1,
            "commit0": 61.5,
            "gaia": 48.3
        }
    },
    {
        "agent_name": "0.9.5",
        "llm_base": "gpt-4o-mini-2024-07-18",
        "openness": "closed",
        "scores": {
            "swe-bench": 32.5,
            "multi-swe-bench": 24.1,
            "swe-bench-multimodal": 28.9,
            "swt-bench": 47.8,
            "commit0": 55.3,
            "gaia": 42.1
        }
    },
    {
        "agent_name": "0.9.0",
        "llm_base": "claude-3-opus-20240229",
        "openness": "closed",
        "scores": {
            "swe-bench": 29.8,
            "multi-swe-bench": 21.5,
            "swe-bench-multimodal": 25.7,
            "swt-bench": 44.2,
            "commit0": 52.1,
            "gaia": 39.4
        }
    },
]
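
# A quick sanity check, as a sketch: it assumes every agent should report a
# score for every benchmark defined above. Not called by generate_mock_data().
def _check_scores_complete():
    """Assert that each mock agent has a score for all six benchmarks."""
    expected = set(BENCHMARKS)
    for agent in MOCK_AGENTS:
        missing = expected - set(agent["scores"])
        assert not missing, f"{agent['agent_name']} is missing scores: {missing}"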

def generate_mock_data():
    """Generate mock JSONL files for all benchmarks."""
    output_dir = Path("mock_results/1.0.0-dev1")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Build the agenteval.json config describing the suite and its splits
    config = {
        "suite_config": {
            "name": "openhands-index",
            "version": "1.0.0-dev1",
            "splits": []
        }
    }

    # Generate data for each benchmark
    for benchmark_name, benchmark_info in BENCHMARKS.items():
        print(f"Generating mock data for {benchmark_name}...")

        # Register the benchmark as a split in the suite config
        config["suite_config"]["splits"].append({
            "name": benchmark_name,
            "tasks": [{
                "name": benchmark_name,
                "tags": benchmark_info["tags"]
            }]
        })

        # Write one JSONL record per agent for this benchmark
        jsonl_path = output_dir / f"{benchmark_name}.jsonl"
        with open(jsonl_path, 'w') as f:
            for agent in MOCK_AGENTS:
                score = agent["scores"][benchmark_name]
                record = {
                    "agent_name": agent["agent_name"],
                    "llm_base": agent["llm_base"],
                    "openness": agent["openness"],
                    "score": score,
                    "metric": benchmark_info["metric"],
                    "submission_time": datetime.now().isoformat(),
                    "tags": benchmark_info["tags"],
                    # Synthetic metadata: cost and runtime scale loosely with score
                    "total_cost": round(10.0 + score * 0.5, 2),
                    "total_runtime": round(300 + score * 5, 1),
                }
                f.write(json.dumps(record) + '\n')
        print(f"  Created {jsonl_path}")

    # Write config file
    config_path = output_dir / "agenteval.json"
    with open(config_path, 'w') as f:
        json.dump(config, f, indent=2)
    print(f"\nCreated config: {config_path}")

    print("\n✓ Mock data generation complete!")
    print(f"  Location: {output_dir}")
    print(f"  Benchmarks: {', '.join(BENCHMARKS.keys())}")
    print(f"  Agents: {len(MOCK_AGENTS)}")

if __name__ == "__main__":
    generate_mock_data()