"""Generate mock results data in agenteval format for OpenHands Index.

Reads the suite configuration from ``data/1.0.0-dev1/agenteval.json``, builds
one mock result row per (split, agent) pair for the ``validation`` and
``test`` splits, and writes each split to its own parquet file under
``mock_results/1.0.0-dev1``.

The mock scores are derived from a CRC32 of the agent and task names, so the
generated files are identical from one run to the next.
"""
import json
import zlib
from pathlib import Path

import pandas as pd
# NOTE(review): ``pa`` and ``pq`` are not referenced by name in this script;
# the imports are kept as-is (presumably pandas uses pyarrow as the parquet
# engine and importing it here fails fast if it is missing -- confirm).
import pyarrow as pa
import pyarrow.parquet as pq

# Load the suite config (explicit encoding so the read does not depend on
# the platform's locale default).
with open("data/1.0.0-dev1/agenteval.json", encoding="utf-8") as f:
    suite_config_data = json.load(f)
suite_config = suite_config_data["suite_config"]

# Mock agents
agents = [
    {
        "name": "OpenHands CodeAct v2.1",
        "source_url": "https://github.com/OpenHands/OpenHands",
    },
    {
        "name": "Aider",
        "source_url": "https://github.com/paul-gauthier/aider",
    },
    {
        "name": "SWE-agent",
        "source_url": "https://github.com/princeton-nlp/SWE-agent",
    },
]


def _stable_bucket(text, modulus):
    """Map *text* to an int in ``range(modulus)`` that is stable across runs.

    The built-in ``hash()`` is salted per interpreter process for strings, so
    using it here would make the mock scores change on every invocation.
    CRC32 of the UTF-8 bytes gives the same value every time.
    """
    return zlib.crc32(text.encode("utf-8")) % modulus


def create_mock_results(split_name):
    """Create mock results for a split.

    Args:
        split_name: Name of a split listed under ``suite_config["splits"]``.

    Returns:
        A list with one dict per entry in ``agents``. Each dict has the keys
        ``suite_config``, ``split``, ``results`` and ``submission``;
        ``results`` holds one entry per task of the split, carrying a single
        metric (the task's primary metric) with a mock score in [0.3, 1.0].

    Raises:
        ValueError: If the suite config has no split named ``split_name``.
    """
    split_config = next(
        (s for s in suite_config["splits"] if s["name"] == split_name),
        None,
    )
    if split_config is None:
        # A bare ``next()`` would leak StopIteration, which is opaque to the
        # caller and is turned into RuntimeError inside generators.
        available = [s["name"] for s in suite_config["splits"]]
        raise ValueError(
            f"Unknown split {split_name!r}; available splits: {available}"
        )

    rows = []
    for agent in agents:
        # Per-agent offset in [0.30, 0.79]; it does not depend on the task,
        # so compute it once per agent.
        base_score = 0.3 + _stable_bucket(agent["name"], 50) / 100

        # Create results for each task
        results = []
        for task in split_config["tasks"]:
            task_name = task["name"]
            # Per-task offset in [0.00, 0.29], capped so the score never
            # exceeds 1.0.
            score = min(base_score + _stable_bucket(task_name, 30) / 100, 1.0)
            results.append(
                {
                    "task_name": task_name,
                    "eval_spec": {
                        "model": "gpt-4",
                        "solver": f"openhands/{task_name}",
                    },
                    "metrics": [
                        {
                            "name": task["primary_metric"],
                            "value": score,
                        }
                    ],
                    "model_usages": [],
                }
            )

        # Create row
        rows.append(
            {
                "suite_config": suite_config,
                "split": split_name,
                "results": results,
                "submission": {
                    "agent_name": agent["name"],
                    "source_url": agent["source_url"],
                    "openness": "open-source/open-weights",
                    "tool_usage": "standard",
                },
            }
        )
    return rows


# Create mock data for both splits
all_rows = []
for split in ["validation", "test"]:
    all_rows.extend(create_mock_results(split))

# Convert to DataFrame
df = pd.DataFrame(all_rows)

# Save as parquet
output_dir = Path("mock_results/1.0.0-dev1")
output_dir.mkdir(parents=True, exist_ok=True)

# Save validation split
validation_df = df[df["split"] == "validation"]
validation_df.to_parquet(output_dir / "validation.parquet", index=False)

# Save test split
test_df = df[df["split"] == "test"]
test_df.to_parquet(output_dir / "test.parquet", index=False)

print("Created mock data:")
print(f" - Validation: {len(validation_df)} rows")
print(f" - Test: {len(test_df)} rows")
print(f" - Output: {output_dir}")