Spaces:
Sleeping
Sleeping
| import json | |
| import os | |
# Headline metric key to read out of each task's eval log, keyed by task
# name; the key indexes into the log's "results" -> "scores" -> "metrics"
# mapping (see combine_eval_results).
METRIC_NAME = {
    # single-turn
    "arc_easy": "accuracy",
    "arc_challenge": "accuracy",
    "gpqa_diamond": "accuracy",
    "drop": "mean",
    "winogrande": "accuracy",
    "gsm8k": "accuracy",
    "hellaswag": "accuracy",
    "humaneval": "mean",
    "ifeval": "final_acc",
    "math": "accuracy",
    "mmlu": "accuracy",
    "mmlu_pro": "accuracy",
    "mmmu_multiple_choice": "accuracy",
    "mmmu_open": "accuracy",
    # agentic
    "gaia": "accuracy",
    "gdm_intercode_ctf": "accuracy",
    "gdm_in_house_ctf": "accuracy",
    "agentharm": "avg_score",
    "agentharm_benign": "avg_score",
    "swe_bench": "mean",
}
# Reference URL per model. Stored verbatim in the dummy "model_sha" field of
# both the results and request JSON files written by this script.
MODEL_SHA_MAP = {
    # open source models
    "c4ai-command-r-plus": "https://huggingface.co/CohereForAI/c4ai-command-r-plus",
    "Meta-Llama-3.1-70B-Instruct": "https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct",
    "Mistral-Large-Instruct-2407": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
    "Qwen2.5-72B-Instruct": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
    # closed source models
    "claude-3-5-sonnet-20241022": "https://www.anthropic.com/claude/sonnet",
    "gemini-1.5-flash": "https://deepmind.google/technologies/gemini/flash",  # TODO: points to 2.0, can't find page for 1.5
    "gemini-1.5-pro": "https://deepmind.google/technologies/gemini/pro",
    "gpt-4o": "https://openai.com/index/hello-gpt-4o",
    "gpt-4o-mini": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
    "o1": "https://openai.com/o1",
    "o3-mini": "https://openai.com/index/openai-o3-mini",
    "DeepSeek-R1": "https://api-docs.deepseek.com/news/news250120"
}
# Display/version label per model, written into the "model_version" field of
# the dummy request file (see main).
MODEL_VERSION_MAP = {
    # open source models
    "c4ai-command-r-plus": "c4ai-command-r-plus",
    "Meta-Llama-3.1-70B-Instruct": "Llama-3.1-70B-Instruct",
    "Mistral-Large-Instruct-2407": "Mistral-Large-Instruct-2407",
    "Qwen2.5-72B-Instruct": "Qwen2.5-72B-Instruct",
    # closed source models
    "claude-3-5-sonnet-20241022": "Claude-3.5-Sonnet-20241022",
    "gemini-1.5-flash": "Gemini-1.5-Flash",
    "gemini-1.5-pro": "Gemini-1.5-Pro-002",
    "gpt-4o": "GPT-4o-20240806",
    "gpt-4o-mini": "GPT-4o-mini-20240718",
    "o1": "o1-20241217",
    "o3-mini": "o3-mini-20250131",
    "DeepSeek-R1": "DeepSeek-R1",
}
# Maps a model's directory name (as used for the base-benchmark logs) to the
# directory name used in the agentic log tree; only models present here have
# agentic runs.
AGENTIC_LOG_MODEL_NAME_MAP = {
    "claude-3-5-sonnet-20241022": "claude-3-5-sonnet-20241022",
    "gemini-1.5-pro": "gemini-1.5-pro-002",
    "gpt-4o": "gpt-4o-2024-08-06",
    "o1": "o1-2024-12-17",
    "o3-mini": "o3-mini-2025-01-31",
}
# Per-task subdirectory names inside each model's agentic log directory.
AGENTIC_TASKS = ["gaia", "gdm-intercode-ctf", "gdm-in-house-ctf", "agentharm", "swe-bench"]
def combine_eval_results(results_path: str, model_name: str, type: str) -> dict:
    """Aggregate per-task Inspect eval log JSONs into one results dict.

    Args:
        results_path: Root directory of the logs. For ``type == "base"`` the
            logs live in ``results_path/<model_name>/*.json``; for
            ``type == "agentic"`` they live in
            ``results_path/<mapped model name>/<task>/*.json``.
        model_name: Model directory name; must be a key of MODEL_SHA_MAP
            (and of AGENTIC_LOG_MODEL_NAME_MAP when ``type == "agentic"``).
        type: ``"base"`` or ``"agentic"``. (Name shadows the builtin but is
            kept for backward compatibility with keyword callers.)

    Returns:
        A dict with a ``"config"`` section (model metadata, partly dummy
        values) and a ``"results"`` section mapping
        ``task_name -> {metric_name: metric_value}``.

    Malformed logs (missing keys) are reported to stdout and skipped rather
    than aborting the whole aggregation.
    """
    results = {
        "config": {
            "model_name": model_name,
            # dummy keys
            "model_sha": MODEL_SHA_MAP[model_name],
            "model_dtype": "torch.float16",
        },
        "results": {},
    }

    def _add_log(dir_path: str, file_name: str, agentic: bool) -> None:
        """Parse one eval log and merge its headline metric into `results`."""
        if not file_name.endswith(".json"):
            return
        with open(os.path.join(dir_path, file_name), "r") as f:
            try:
                result = json.load(f)
                task_name = result["eval"]["task"].split("/")[-1]
                if not agentic and task_name == "math":
                    # math logs multiple scorers; pick expression_equivalance.
                    metrics = [elm for elm in result["results"]["scores"] if elm["name"] == "expression_equivalance"][0]["metrics"]  # TODO: change scorer if required
                else:
                    metrics = result["results"]["scores"][0]["metrics"]
                metric = metrics[METRIC_NAME[task_name]]
                # Agentic metric names may be namespaced ("x/accuracy");
                # keep only the last path component in that case.
                metric_name = metric["name"].split("/")[-1] if agentic else metric["name"]
                results["results"].update({task_name: {metric_name: metric["value"]}})
            except KeyError as e:
                # NOTE: model_name is read at call time, so for agentic runs
                # this prints the remapped log-directory name (as before).
                print(f"KeyError: {e}")
                print(model_name)
                print(file_name)

    if type == "base":
        model_dir = os.path.join(results_path, model_name)
        for file in os.listdir(model_dir):
            _add_log(model_dir, file, agentic=False)
    elif type == "agentic":
        model_name = AGENTIC_LOG_MODEL_NAME_MAP[model_name]  # change name based on log file structure
        results_path = os.path.join(results_path, model_name)
        for task in AGENTIC_TASKS:
            task_dir = os.path.join(results_path, task)
            for file in os.listdir(task_dir):
                _add_log(task_dir, file, agentic=True)
    return results
def main():
    """Convert raw benchmarking logs into leaderboard-style JSON files.

    For every model directory under ``base_bm_input_path``:
      * aggregate its base (single-turn) eval logs,
      * pad any benchmark missing from the logs with a ``None`` metric value,
      * merge in agentic results when the model has an agentic log directory,
      * write ``<model>.json`` into the eval-results dir and a dummy
        "FINISHED" request file into the eval-queue dir, both rooted at
        ``$HF_HOME`` (falling back to the current directory).
    """
    cache_path = os.getenv("HF_HOME", ".")
    eval_results_path = os.path.join(cache_path, "eval-results")
    eval_requests_path = os.path.join(cache_path, "eval-queue")
    base_bm_input_path = "./base_benchmarking_logs"
    agentic_bm_input_path = "/fs01/projects/aieng/public/inspect_evals/agentic_benchmarking_runs"
    os.makedirs(eval_results_path, exist_ok=True)
    os.makedirs(eval_requests_path, exist_ok=True)
    for model_name in os.listdir(base_bm_input_path):
        if not os.path.isdir(os.path.join(base_bm_input_path, model_name)):
            continue
        results = combine_eval_results(base_bm_input_path, model_name, "base")
        # TMP: Add missing benchmarks to the results (metric value None so the
        # leaderboard still shows a column for them).
        for task_name, metric_name in METRIC_NAME.items():
            if task_name not in results["results"]:
                results["results"].update({task_name: {metric_name: None}})
        # Merge agentic results when this model has an agentic log directory.
        agentic_dir = AGENTIC_LOG_MODEL_NAME_MAP.get(model_name, "NA")
        if os.path.isdir(os.path.join(agentic_bm_input_path, agentic_dir)):
            agentic_bm_results = combine_eval_results(agentic_bm_input_path, model_name, "agentic")
            results["results"].update(agentic_bm_results["results"])
        with open(os.path.join(eval_results_path, f"{model_name}.json"), "w") as f:
            json.dump(results, f, indent=4)
        # Create dummy requests file (marks the model as already evaluated).
        requests = {
            "model": model_name,
            "model_sha": MODEL_SHA_MAP[model_name],
            "model_version": MODEL_VERSION_MAP[model_name],
            "base_model": "",
            "revision": "main",
            "private": False,
            "precision": "float16",
            "weight_type": "Original",
            "status": "FINISHED",
            "submitted_time": "",
            "model_type": "pretrained",
            "likes": 0,
            "params": 0,
            "license": "custom",
        }
        with open(os.path.join(eval_requests_path, f"{model_name}.json"), "w") as f:
            json.dump(requests, f, indent=4)


if __name__ == "__main__":
    main()