Spaces:

nvidia
/

ProfBench

Running

App Files Community

zhilinw committed on 10 days ago

Commit

6509395

verified ·

1 Parent(s): 9d220e1

Delete convert_wo_docs_into_json.py

Browse files

Files changed (1) hide show

convert_wo_docs_into_json.py +0 -134

convert_wo_docs_into_json.py DELETED Viewed

@@ -1,134 +0,0 @@
-import os
-import json
-filename_to_args = {
-    "gpt-5_reasoning_high_search_0.jsonl": ["OpenAI/GPT-5 (high)", "Closed-source Reasoning", 1.25, 10],
-    "gpt-5-mini_reasoning_high_search_0.jsonl": ["OpenAI/GPT-5-mini (high)","Closed-source Reasoning", 0.25, 2],
-    "gpt-5-nano_reasoning_high_search_0.jsonl": ["OpenAI/GPT-5-nano (high)","Closed-source Reasoning", 0.05, 0.4],
-    "o3_reasoning_medium_search_0.jsonl": ["OpenAI/o3","Closed-source Reasoning", 2, 8],
-    "o4-mini_reasoning_medium_search_0.jsonl": ["OpenAI/o4-mini","Closed-source Reasoning", 1.1, 4.4],
-    "gemini-2.5-pro_reasoning_high_search_0.jsonl": ["Google/Gemini-2.5-Pro","Closed-source Reasoning", 1.25, 10],
-    "gemini-2.5-flash_reasoning_high_search_0.jsonl": ["Google/Gemini-2.5-Flash (Thinking)","Closed-source Reasoning", 0.3, 2.5],
-    "gemini-2.5-flash-lite_reasoning_high_search_0.jsonl": ["Google/Gemini-2.5-Flash-Lite (Thinking)","Closed-source Reasoning", 0.1, 0.4],
-    "x-ai_grok-4_reasoning_high_search_0.jsonl": ["xAI/grok-4-0709","Closed-source Reasoning", 3, 15],
-    "anthropic_claude-sonnet-4_reasoning_high_search_0.jsonl": ["Anthropic/claude-sonnet-4 (Thinking)","Closed-source Reasoning", 3, 15],
-    "openai_gpt-oss-120b_reasoning_high_search_0.jsonl": ["OpenAI/gpt-oss-120b (high)", "Open-weight Reasoning", 0.04, 0.4],
-    "openai_gpt-oss-20b_reasoning_high_search_0.jsonl": ["OpenAI/gpt-oss-20b (high)", "Open-weight Reasoning", 0.03, 0.14],
-    "deepseek_deepseek-chat-v3.1_reasoning_high_search_0.jsonl": ["DeepSeek-AI/DeepSeek-V3.1 (Thinking)", "Open-weight Reasoning", 0.2, 0.8],
-    "qwen_qwen3-235b-a22b-thinking-2507_reasoning_high_search_0.jsonl": ["Qwen/Qwen3-235B-A22B-Thinking-2507", "Open-weight Reasoning", 0.11, 0.6],
-    "qwen_qwen3-30b-a3b-thinking-2507_reasoning_high_search_0.jsonl": ["Qwen/Qwen3-30B-A3B-Thinking-2507", "Open-weight Reasoning", 0.08, 0.29],
-    "gpt-4.1_reasoning_False_search_0.jsonl":["OpenAI/GPT-4.1", "Closed-source Instruct", 2, 8],
-    "gpt-4.1-mini_reasoning_False_search_0.jsonl":["OpenAI/GPT-4.1-mini", "Closed-source Instruct", 0.4, 1.6],
-    "gpt-4.1-nano_reasoning_False_search_0.jsonl":["OpenAI/GPT-4.1-nano", "Closed-source Instruct", 0.1, 0.4],
-    "gemini-2.5-flash_reasoning_False_search_0.jsonl": ["Google/Gemini-2.5-Flash","Closed-source Instruct", 0.3, 2.5],
-    "gemini-2.5-flash-lite_reasoning_False_search_0.jsonl": ["Google/Gemini-2.5-Flash-Lite","Closed-source Instruct", 0.1, 0.4],
-    "anthropic_claude-sonnet-4_reasoning_False_search_0.jsonl": ["Anthropic/claude-sonnet-4","Closed-source Instruct", 3, 15],
-    "anthropic_claude-3.5-haiku_reasoning_False_search_0.jsonl": ["Anthropic/claude-3.5-haiku", "Closed-source Instruct", 0.8, 4],
-    "qwen_qwen3-235b-a22b-2507_reasoning_False_search_0.jsonl": ["Qwen/Qwen3-235B-A22B-Instruct-2507", "Open-weight Instruct", 0.08, 0.55],
-    "qwen_qwen3-30b-a3b-instruct-2507_reasoning_False_search_0.jsonl": ["Qwen/Qwen3-30B-A3B-Instruct-2507", "Open-weight Instruct", 0.08, 0.33],
-    "deepseek_deepseek-chat-v3.1_reasoning_False_search_0.jsonl": ["DeepSeek-AI/DeepSeek-V3.1", "Open-weight Instruct", 0.2, 0.8],
-    "moonshotai_kimi-k2-0905_reasoning_high_search_0.jsonl": ["MoonshotAI/Kimi-K2-Instruct-0905", "Open-weight Instruct", 0.39, 1.9],
-    "meta-llama_llama-4-maverick_reasoning_False_search_0.jsonl": ["Meta/llama-4-maverick", "Open-weight Instruct", 0.15, 0.6],
-    "meta-llama_llama-4-scout_reasoning_False_search_0.jsonl": ["Meta/llama-4-scout", "Open-weight Instruct", 0.08, 0.3],
-    "x-ai_grok-4-fast_reasoning_high_search_0.jsonl":["xAI/grok-4-fast (Thinking)", "Closed-source Reasoning", 0.2, 0.5],
-    "x-ai_grok-4-fast_reasoning_False_search_0.jsonl":["xAI/grok-4-fast", "Closed-source Instruct", 0.2, 0.5],
-    "anthropic_claude-haiku-4.5_reasoning_high_search_0.jsonl": ["Anthropic/claude-haiku-4.5 (Thinking)","Closed-source Reasoning", 1, 5],
-    "anthropic_claude-haiku-4.5_reasoning_False_search_0.jsonl": ["Anthropic/claude-haiku-4.5","Closed-source Instruct", 1, 5],
-    "anthropic_claude-sonnet-4.5_reasoning_high_search_0.jsonl": ["Anthropic/claude-sonnet-4.5 (Thinking)","Closed-source Reasoning", 3, 15],
-    "anthropic_claude-sonnet-4.5_reasoning_False_search_0.jsonl": ["Anthropic/claude-sonnet-4.5","Closed-source Instruct", 3, 15],
-    "minimax_minimax-m2_reasoning_high_search_0.jsonl": ["MiniMax/M2 (Thinking)","Open-weight Reasoning", 0.15, 0.45],
-    "minimax_minimax-m2_reasoning_False_search_0.jsonl": ["MiniMax/M2","Open-weight Instruct", 0.15, 0.45],
-    "moonshotai_kimi-k2-thinking_reasoning_high_search_0.jsonl": ["MoonshotAI/Kimi-K2-Thinking","Open-weight Reasoning", 0.55, 2.25],
-    "deepseek_deepseek-v3.2-exp_reasoning_high_search_0.jsonl": ["DeepSeek-AI/DeepSeek-V3.2-Exp (Thinking)", "Open-weight Reasoning", 0.27, 0.4],
-    "deepseek_deepseek-v3.2-exp_reasoning_False_search_0.jsonl": ["DeepSeek-AI/DeepSeek-V3.2-Exp", "Open-weight Instruct", 0.27, 0.4],
-    "gemini-3-pro-preview_reasoning_high_search_0.jsonl": ["Google/Gemini-3-Pro-Preview","Closed-source Reasoning", 2, 12],
-    "x-ai_grok-4.1-fast_reasoning_high_search_0.jsonl":["xAI/grok-4.1-fast (Thinking)", "Closed-source Reasoning", 0.2, 0.5],
-    "x-ai_grok-4.1-fast_reasoning_False_search_0.jsonl":["xAI/grok-4.1-fast", "Closed-source Instruct", 0.2, 0.5],
-    "gpt-5.1_reasoning_high_search_0.jsonl": ["OpenAI/GPT-5.1 (high)", "Closed-source Reasoning", 1.25, 10],
-    "anthropic_claude-opus-4.5_reasoning_high_search_0.jsonl": ["Anthropic/claude-opus-4.5 (Thinking)","Closed-source Reasoning", 5, 25],
-    "deepseek_deepseek-v3.2_reasoning_high_search_0.jsonl": ["DeepSeek-AI/DeepSeek-V3.2 (Thinking)", "Open-weight Reasoning", 0.27, 0.4],
-    "deepseek_deepseek-v3.2_reasoning_False_search_0.jsonl": ["DeepSeek-AI/DeepSeek-V3.2", "Open-weight Instruct", 0.27, 0.4],
-    "gpt-5.2_reasoning_xhigh_search_0.jsonl": ["OpenAI/GPT-5.2 (xhigh)", "Closed-source Reasoning", 1.75, 14],
-    "gemini-3-flash-preview_reasoning_high_search_0.jsonl": ["Google/Gemini-3-Flash-Preview (Thinking)","Closed-source Reasoning", 0.5, 3],
-    "gemini-3-flash-preview_reasoning_False_search_0.jsonl": ["Google/Gemini-3-Flash-Preview","Closed-source Instruct", 0.5, 3],
-    "z-ai_glm-4.7_reasoning_high_search_0.jsonl":["Z-AI/GLM-4.7 (Thinking)", "Open-weight Reasoning", 0.4, 1.5],
-    "z-ai_glm-4.7_reasoning_False_search_0.jsonl":["Z-AI/GLM-4.7", "Open-weight Instruct", 0.4, 1.5],
-    "minimax_minimax-m2.1_reasoning_high_search_0.jsonl": ["MiniMax/MiniMax-M2.1 (Thinking)", "Open-weight Reasoning", 0.3, 1.2],
-    "minimax_minimax-m2.1_reasoning_False_search_0.jsonl": ["MiniMax/MiniMax-M2.1", "Open-weight Instruct", 0.3, 1.2],
-    "anthropic_claude-opus-4.6_reasoning_False_search_0.jsonl": ["Anthropic/claude-opus-4.6 (Thinking)","Closed-source Reasoning", 5, 25],
-    "moonshotai_kimi-k2.5_reasoning_high_search_0.jsonl": ["MoonshotAI/Kimi-K2.5 (Thinking)","Open-weight Reasoning", 0.45, 2.5],
-    "moonshotai_kimi-k2.5_reasoning_False_search_0.jsonl": ["MoonshotAI/Kimi-K2.5","Open-weight Instruct", 0.45, 2.5],
-    "z-ai_glm-5_reasoning_high_search_0.jsonl":["Z-AI/GLM-5 (Thinking)", "Open-weight Reasoning", 1.0, 3.2],
-    "z-ai_glm-5_reasoning_False_search_0.jsonl":["Z-AI/GLM-5", "Open-weight Instruct", 1.0, 3.2],
-    "qwen_qwen3.5-397b-a17b_reasoning_high_search_0.jsonl": ["Qwen/Qwen3.5-397B-A17B (Thinking)", "Open-weight Reasoning", 0.6, 3.6],
-    "qwen_qwen3.5-397b-a17b_reasoning_False_search_0.jsonl": ["Qwen/Qwen3.5-397B-A17B", "Open-weight Instruct", 0.6, 3.6],
-    "minimax_minimax-m2.5_reasoning_high_search_0.jsonl": ["MiniMax/MiniMax-M2.5 (Thinking)", "Open-weight Reasoning", 0.3, 1.2],
-    "minimax_minimax-m2.5_reasoning_False_search_0.jsonl": ["MiniMax/MiniMax-M2.5", "Open-weight Instruct", 0.3, 1.2],
-    "anthropic_claude-sonnet-4.6_reasoning_False_search_0.jsonl": ["Anthropic/claude-sonnet-4.6 (Thinking)","Closed-source Reasoning", 3, 15],
-    "gemini-3.1-pro-preview_reasoning_high_search_0.jsonl": ["Google/Gemini-3.1-Pro-Preview","Closed-source Reasoning", 2, 12],
-    "gpt-5.4_reasoning_xhigh_search_0.jsonl": ["OpenAI/GPT-5.4 (xhigh)", "Closed-source Reasoning", 2.5, 15],
-    "gpt-5.3-codex_reasoning_xhigh_search_0.jsonl":["OpenAI/GPT-5.3-Codex (xhigh)", "Closed-source Reasoning", 1.75, 14],
-    "gpt-5.3-chat-latest_reasoning_False_search_0.jsonl":["OpenAI/GPT-5.3-Chat", "Closed-source Instruct", 1.75, 14],
-    "x-ai_grok-4.20-beta_reasoning_high_search_0.jsonl":["xAI/grok-4.20 Beta (Thinking)", "Closed-source Reasoning", 2, 6],
-    "minimax_minimax-m2.7_reasoning_high_search_0.jsonl": ["MiniMax/MiniMax-M2.7 (Thinking)", "Open-weight Reasoning", 0.3, 1.2],
-    "minimax_minimax-m2.7_reasoning_False_search_0.jsonl": ["MiniMax/MiniMax-M2.7", "Open-weight Instruct", 0.3, 1.2],
-    "google_gemma-4-31b-it_reasoning_high_search_0.jsonl": ["Google/Gemma-4-31B-It (Thinking)", "Open-weight Reasoning", 0.14, 0.4],
-    "google_gemma-4-31b-it_reasoning_False_search_0.jsonl": ["Google/Gemma-4-31B-It", "Open-weight Instruct", 0.14, 0.4],
-    "anthropic_claude-opus-4.7_reasoning_False_search_0.jsonl": ["Anthropic/claude-opus-4.7 (Thinking)","Closed-source Reasoning", 5, 25],
-    "z-ai_glm-5.1_reasoning_high_search_0.jsonl":["Z-AI/GLM-5.1 (Thinking)", "Open-weight Reasoning", 0.95, 3.15],
-    "z-ai_glm-5.1_reasoning_False_search_0.jsonl":["Z-AI/GLM-5.1", "Open-weight Instruct", 0.95, 3.15],
-    "moonshotai_kimi-k2.6_reasoning_high_search_0.jsonl": ["MoonshotAI/Kimi-K2.6 (Thinking)","Open-weight Reasoning", 0.6, 2.8],
-    "moonshotai_kimi-k2.6_reasoning_False_search_0.jsonl": ["MoonshotAI/Kimi-K2.6","Open-weight Instruct", 0.6, 2.8],
-}
-names = "Model & Physics & Chemistry & Finance & Consulting & Overall & Extraction & Reasoning & Style & Response Characters & Input Tokens & Output Tokens & Cost "
-columns = [i.strip() for i in names.split("&")]
-output_filename = "report_generation.jsonl"
-folder = "../ProfBench/scores/"
-with open(output_filename, "w") as fw:
-    for filename in filename_to_args:
-        if not  os.path.exists(folder+filename):
-            raise ValueError(filename + " is not found")
-        # continue
-        with open(folder+filename, "r") as f:
-            one_row = json.load(f)
-        args = filename_to_args[filename]
-        new_dp = {}
-        print(args)
-        model =args[0]
-        category = args[1]
-        in_cost = args[2]
-        out_cost = args[3]
-        print(model)
-        # model, category, in_cost, out_cost = args[0]
-        new_dp["Model"] = model
-        new_dp["Category"] = category
-        new_dp["Overall"] = one_row["Overall"]
-        new_dp["Physics"] = one_row["Physics PhD"]
-        new_dp["Chemistry"] = one_row["Chemistry PhD"]
-        new_dp["Finance"] = one_row["Finance MBA"]
-        new_dp["Consulting"] = one_row["Consulting MBA"]
-        new_dp["Extraction"] = one_row["Extraction (recall)"]
-        new_dp["Reasoning"] = one_row["Reasoning"]
-        new_dp["Style"] = one_row["Style"]
-        new_dp["Response Characters"] = one_row["response_len_chars"]
-        new_dp["Input Tokens"] = one_row["prompt_tokens"]
-        new_dp["Output Tokens"] = one_row["completion_tokens"]
-        new_dp["Cost"] = round(160 / 1000000 * (in_cost  * one_row["prompt_tokens"] + out_cost * one_row["completion_tokens"]),2)
-        fw.write(json.dumps(new_dp)+'\n')