| {"submission_id": "1-1", "easy_accuracy": 4.17, "hard_accuracy": 1.85} | |
| {"submission_id": "1vvv1-1vvv1", "easy_accuracy": 23.61, "hard_accuracy": 0.53} | |
| {"submission_id": "1vvv2-1vvv2", "easy_accuracy": 25.0, "hard_accuracy": 0.26} | |
| {"submission_id": "Adyen-Claude 3.5 Haiku ReACT Baseline", "easy_accuracy": 77.78, "hard_accuracy": 5.03} | |
| {"submission_id": "Adyen-Claude 3.5 Sonnet ReACT Baseline", "easy_accuracy": 77.78, "hard_accuracy": 9.26} | |
| {"submission_id": "Adyen-Deepseek V3 ReACT Baseline", "easy_accuracy": 66.67, "hard_accuracy": 5.56} | |
| {"submission_id": "Adyen-GPT 4o ReACT Baseline", "easy_accuracy": 66.67, "hard_accuracy": 6.08} | |
| {"submission_id": "Adyen-GPT 4o-mini ReACT Baseline", "easy_accuracy": 69.44, "hard_accuracy": 3.44} | |
| {"submission_id": "Adyen-Llama 3.2 1B ReACT Baseline", "easy_accuracy": 1.39, "hard_accuracy": 0.0} | |
| {"submission_id": "Adyen-Llama 3.3 70B ReACT Baseline", "easy_accuracy": 68.06, "hard_accuracy": 3.7} | |
| {"submission_id": "Adyen-Qwen Coder ReAct Baseline", "easy_accuracy": 54.17, "hard_accuracy": 3.44} | |
| {"submission_id": "Adyen-o1 Reasoning Prompt Baseline", "easy_accuracy": 69.44, "hard_accuracy": 11.11} | |
| {"submission_id": "Adyen-o3-mini Reasoning Prompt Baseline", "easy_accuracy": 72.22, "hard_accuracy": 13.76} | |
| {"submission_id": "Agent Org 2-Agent Test 2", "easy_accuracy": 43.06, "hard_accuracy": 1.85} | |
| {"submission_id": "Agent Test Org 1-Agent No 2 Test", "easy_accuracy": 43.06, "hard_accuracy": 1.85} | |
| {"submission_id": "Agent Test Org 3-Agent No 3 Test", "easy_accuracy": 50.0, "hard_accuracy": 2.65} | |
| {"submission_id": "Agent Test Org-Agent No 1 Test", "easy_accuracy": 15.28, "hard_accuracy": 2.38} | |
| {"submission_id": "Alibaba Cloud-Data Agent for Analytics", "easy_accuracy": 95.83, "hard_accuracy": 53.44} | |
| {"submission_id": "Alibaba Cloud-Data Agent for Analytics_0917_1", "easy_accuracy": 95.83, "hard_accuracy": 62.96} | |
| {"submission_id": "Alibaba Cloud-Data Agent for Analytics_0918", "easy_accuracy": 95.83, "hard_accuracy": 54.76} | |
| {"submission_id": "Alibaba Cloud-Data Agent for Analytics_v0.1", "easy_accuracy": 95.83, "hard_accuracy": 62.96} | |
| {"submission_id": "Alibaba Cloud-Data Agent for Analytics_v0.1_1", "easy_accuracy": 95.83, "hard_accuracy": 64.55} | |
| {"submission_id": "Alibaba Cloud-Data Agent for Analytics_v0.6.1", "easy_accuracy": 95.83, "hard_accuracy": 65.34} | |
| {"submission_id": "Amity Solutions Thailand-Amity Agent v0.1", "easy_accuracy": 76.39, "hard_accuracy": 25.66} | |
| {"submission_id": "Amity Solutions Thailand-Amity DA Agent v0.1", "easy_accuracy": 80.56, "hard_accuracy": 41.01} | |
| {"submission_id": "Anon For Now-Naming WIP", "easy_accuracy": 76.39, "hard_accuracy": 39.95} | |
| {"submission_id": "Anon For Now-Naming WIP2", "easy_accuracy": 81.94, "hard_accuracy": 48.41} | |
| {"submission_id": "Anonymous - Rumil-Test Agent 1", "easy_accuracy": 81.94, "hard_accuracy": 30.16} | |
| {"submission_id": "Anonymous-0417-DA-Agent-Anonymous-0417", "easy_accuracy": 73.61, "hard_accuracy": 5.56} | |
| {"submission_id": "Anonymous-0418-DA-Agent-Anonymous-0418", "easy_accuracy": 73.61, "hard_accuracy": 5.56} | |
| {"submission_id": "Anonymous-0811_ablation_round_10", "easy_accuracy": 86.11, "hard_accuracy": 43.65} | |
| {"submission_id": "Anonymous-0811_ablation_round_15", "easy_accuracy": 87.5, "hard_accuracy": 43.92} | |
| {"submission_id": "Anonymous-0811_ablation_round_5", "easy_accuracy": 86.11, "hard_accuracy": 43.12} | |
| {"submission_id": "Anonymous-Ablation-DS-STAR-No-Verifier", "easy_accuracy": 83.33, "hard_accuracy": 34.66} | |
| {"submission_id": "Anonymous-Ablation-Flash", "easy_accuracy": 80.56, "hard_accuracy": 29.37} | |
| {"submission_id": "Anonymous-Anonymous-Test-DSA", "easy_accuracy": 69.44, "hard_accuracy": 20.37} | |
| {"submission_id": "Anonymous-Anonymous_GPT-5_Ablation", "easy_accuracy": 88.89, "hard_accuracy": 43.12} | |
| {"submission_id": "Anonymous-AutoGen-0805", "easy_accuracy": 59.72, "hard_accuracy": 10.32} | |
| {"submission_id": "Anonymous-DA-Agent-0603", "easy_accuracy": 73.61, "hard_accuracy": 20.37} | |
| {"submission_id": "Anonymous-DA-Agent-Anonymous-0419", "easy_accuracy": 69.44, "hard_accuracy": 18.25} | |
| {"submission_id": "Anonymous-DA-Agent-Anonymous-0419-2", "easy_accuracy": 70.83, "hard_accuracy": 16.93} | |
| {"submission_id": "Anonymous-DA-Agent-Anonymous-0424", "easy_accuracy": 76.39, "hard_accuracy": 13.23} | |
| {"submission_id": "Anonymous-DA-Agent-Anonymous-06-07", "easy_accuracy": 79.17, "hard_accuracy": 29.1} | |
| {"submission_id": "Anonymous-DA-Agent-Anonymous-06-10", "easy_accuracy": 79.17, "hard_accuracy": 32.8} | |
| {"submission_id": "Anonymous-DA-Agent-Anonymous-06-11", "easy_accuracy": 79.17, "hard_accuracy": 33.86} | |
| {"submission_id": "Anonymous-DA-Agent-DA-Code-0909", "easy_accuracy": 68.06, "hard_accuracy": 22.49} | |
| {"submission_id": "Anonymous-DS-STAR-Ablation-DeepSeek", "easy_accuracy": 79.17, "hard_accuracy": 28.57} | |
| {"submission_id": "Anonymous-DS-STAR-Ablation-Llama3", "easy_accuracy": 83.33, "hard_accuracy": 42.59} | |
| {"submission_id": "Anonymous-DS-STAR-Ablation-Prompting", "easy_accuracy": 83.33, "hard_accuracy": 43.92} | |
| {"submission_id": "Anonymous-DataInterpreter-0805", "easy_accuracy": 72.22, "hard_accuracy": 3.44} | |
| {"submission_id": "Anonymous-ablation_no_desc", "easy_accuracy": 75.0, "hard_accuracy": 26.98} | |
| {"submission_id": "Anonymous-ablation_no_router_0716", "easy_accuracy": 79.17, "hard_accuracy": 39.95} | |
| {"submission_id": "Anonymous-baseline_react_0716", "easy_accuracy": 69.44, "hard_accuracy": 10.05} | |
| {"submission_id": "Anonymous-test-agent", "easy_accuracy": 76.39, "hard_accuracy": 11.38} | |
| {"submission_id": "Anonymous-test-agent2", "easy_accuracy": 68.06, "hard_accuracy": 14.81} | |
| {"submission_id": "Ant-AntAgent", "easy_accuracy": 62.5, "hard_accuracy": 46.56} | |
| {"submission_id": "Apiphany-DS-STAR-Implementation", "easy_accuracy": 79.17, "hard_accuracy": 36.51} | |
| {"submission_id": "BD_DA-BDDA_0813", "easy_accuracy": 72.22, "hard_accuracy": 11.11} | |
| {"submission_id": "BOSCOTEST-bosco_test_0625", "easy_accuracy": 97.22, "hard_accuracy": 56.08} | |
| {"submission_id": "BOSCOTEST-bosco_test_0704_01", "easy_accuracy": 97.22, "hard_accuracy": 56.08} | |
| {"submission_id": "BOSCOTEST-bosco_test_0704_02", "easy_accuracy": 97.22, "hard_accuracy": 56.08} | |
| {"submission_id": "BOSCOTEST-boscotest0702", "easy_accuracy": 97.22, "hard_accuracy": 56.35} | |
| {"submission_id": "Bespoke Labs-DA-Test-Agent", "easy_accuracy": 80.56, "hard_accuracy": 49.47} | |
| {"submission_id": "BigFrames-nl2bigframes_agent_dabsteps_QP_DC_RAG_True_judge_False_picker_llm_picker", "easy_accuracy": 72.22, "hard_accuracy": 8.47} | |
| {"submission_id": "BigFrames-nl2bigframes_agent_dabsteps_QP_DC_RAG_True_judge_False_picker_self_consistency", "easy_accuracy": 81.94, "hard_accuracy": 8.73} | |
| {"submission_id": "BigOmen-BigOmen", "easy_accuracy": 86.11, "hard_accuracy": 53.17} | |
| {"submission_id": "ByteDance DataPlatform-LLM-data_agent", "easy_accuracy": 69.44, "hard_accuracy": 15.34} | |
| {"submission_id": "CambioML-CambioML Data Scientist Agent", "easy_accuracy": 79.17, "hard_accuracy": 23.28} | |
| {"submission_id": "CambioML-CambioML energent.ai DS Agent", "easy_accuracy": 94.44, "hard_accuracy": 57.67} | |
| {"submission_id": "DataCloud-Bloom Agents Team-0918", "easy_accuracy": 95.83, "hard_accuracy": 56.35} | |
| {"submission_id": "DataCloud-Bloom Agents Team-0918", "easy_accuracy": 95.83, "hard_accuracy": 56.35} | |
| {"submission_id": "DataCloud-Bloom v0.1.0", "easy_accuracy": 72.22, "hard_accuracy": 37.3} | |
| {"submission_id": "DataCloud-Powerdrill Agents Team", "easy_accuracy": 95.83, "hard_accuracy": 67.99} | |
| {"submission_id": "DataCloud-Powerdrill Agents Team-0922", "easy_accuracy": 95.83, "hard_accuracy": 64.29} | |
| {"submission_id": "DataCloud-Powerdrill Data Agents", "easy_accuracy": 83.33, "hard_accuracy": 48.41} | |
| {"submission_id": "DataCloud-Powerdrill Data Agents-0801", "easy_accuracy": 90.28, "hard_accuracy": 48.41} | |
| {"submission_id": "DataCloud-Powerdrill Data Agents-0815", "easy_accuracy": 90.28, "hard_accuracy": 52.65} | |
| {"submission_id": "DataCloud-Powerdrill Data Agents-0816", "easy_accuracy": 95.83, "hard_accuracy": 52.65} | |
| {"submission_id": "DataCloud-Powerdrill Data Agents-0818", "easy_accuracy": 95.83, "hard_accuracy": 53.7} | |
| {"submission_id": "DataCloud-Powerdrill Data Agents-0819", "easy_accuracy": 95.83, "hard_accuracy": 56.35} | |
| {"submission_id": "Do not disclose-DSA-test-250930", "easy_accuracy": 68.06, "hard_accuracy": 6.08} | |
| {"submission_id": "Do not disclose-DSA-test-251003", "easy_accuracy": 61.11, "hard_accuracy": 6.61} | |
| {"submission_id": "Do not disclose-DSA-test-251003-flash-easy", "easy_accuracy": 66.67, "hard_accuracy": 0.0} | |
| {"submission_id": "Do not disclose-DSA-test-251003-flash-easy-default", "easy_accuracy": 72.22, "hard_accuracy": 0.0} | |
| {"submission_id": "Do not disclose-DSA-test-251003-flash-easy-txt", "easy_accuracy": 65.28, "hard_accuracy": 0.0} | |
| {"submission_id": "Do not disclose-DSA-test-251004-flash-base", "easy_accuracy": 68.06, "hard_accuracy": 7.67} | |
| {"submission_id": "Do not disclose-DSA-test-251004-flash-next", "easy_accuracy": 70.83, "hard_accuracy": 10.32} | |
| {"submission_id": "Do not disclose-DSA-test-251004-pro-base", "easy_accuracy": 62.5, "hard_accuracy": 12.7} | |
| {"submission_id": "Doe Labs-Claude 4.5 sonnet ReAct Baseline", "easy_accuracy": 75.0, "hard_accuracy": 10.85} | |
| {"submission_id": "Genesis Computing-Eve", "easy_accuracy": 66.67, "hard_accuracy": 11.9} | |
| {"submission_id": "Genesis Computing-Genesis Data Agent", "easy_accuracy": 100.0, "hard_accuracy": 100.0} | |
| {"submission_id": "Google Cloud AI Research-DS-PlaVer", "easy_accuracy": 87.5, "hard_accuracy": 45.24} | |
| {"submission_id": "Google-Data Science Agent", "easy_accuracy": 61.11, "hard_accuracy": 9.79} | |
| {"submission_id": "Google-Mo test agent", "easy_accuracy": 52.78, "hard_accuracy": 8.2} | |
| {"submission_id": "HKU-devagent", "easy_accuracy": 23.61, "hard_accuracy": 0.79} | |
| {"submission_id": "Hugging Face-Claude 3.7 Sonnet ReACT Baseline", "easy_accuracy": 75.0, "hard_accuracy": 13.76} | |
| {"submission_id": "Hugging Face-Claude 4 Opus ReACT Baseline", "easy_accuracy": 69.44, "hard_accuracy": 16.4} | |
| {"submission_id": "Hugging Face-Claude 4 Sonnet ReACT Baseline", "easy_accuracy": 81.94, "hard_accuracy": 19.84} | |
| {"submission_id": "Hugging Face-GPT 4.1 ReACT Baseline", "easy_accuracy": 80.56, "hard_accuracy": 12.43} | |
| {"submission_id": "Hugging Face-GPT 4.1-mini ReACT baseline", "easy_accuracy": 77.78, "hard_accuracy": 8.99} | |
| {"submission_id": "Hugging Face-Gemini 2.5 Pro Reasoning Prompt Baseline", "easy_accuracy": 66.67, "hard_accuracy": 12.7} | |
| {"submission_id": "Hugging Face-Llama 4 Maverick ReACT Baseline", "easy_accuracy": 75.0, "hard_accuracy": 8.73} | |
| {"submission_id": "Hugging Face-Llama 4 Scout ReACT Baseline", "easy_accuracy": 52.78, "hard_accuracy": 1.85} | |
| {"submission_id": "Hugging Face-o3 Reasoning Prompt Baseline", "easy_accuracy": 12.5, "hard_accuracy": 0.26} | |
| {"submission_id": "Hugging Face-o4-mini Reasoning Prompt Baseline", "easy_accuracy": 76.39, "hard_accuracy": 14.55} | |
| {"submission_id": "Individual-DevCore-Tester-Edge-Tester-1", "easy_accuracy": 90.28, "hard_accuracy": 51.85} | |
| {"submission_id": "Individual-DevCore-Tester-Edge-Tester-2", "easy_accuracy": 94.44, "hard_accuracy": 54.76} | |
| {"submission_id": "Individual-My Agent V1 S2", "easy_accuracy": 69.44, "hard_accuracy": 6.35} | |
| {"submission_id": "Individual-My Agent V2 S1", "easy_accuracy": 62.5, "hard_accuracy": 4.5} | |
| {"submission_id": "Individual-My Agent V2 S2", "easy_accuracy": 70.83, "hard_accuracy": 5.82} | |
| {"submission_id": "Individual-Test-Qwen-Coder", "easy_accuracy": 44.44, "hard_accuracy": 1.59} | |
| {"submission_id": "Individual-Test1", "easy_accuracy": 75.0, "hard_accuracy": 27.25} | |
| {"submission_id": "Institute of Information Engineering-Backward_Reasoning", "easy_accuracy": 79.17, "hard_accuracy": 35.45} | |
| {"submission_id": "Institute of Information Engineering-small_llms_try", "easy_accuracy": 66.67, "hard_accuracy": 18.25} | |
| {"submission_id": "Jaytest-JayTest", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "Jaytest-Jaytest2c", "easy_accuracy": 1.39, "hard_accuracy": 0.0} | |
| {"submission_id": "Jaytest-Jaytest3", "easy_accuracy": 1.39, "hard_accuracy": 0.0} | |
| {"submission_id": "Jaytest-jaytest1", "easy_accuracy": 13.89, "hard_accuracy": 0.0} | |
| {"submission_id": "Jaytest-jaytest10", "easy_accuracy": 1.39, "hard_accuracy": 0.0} | |
| {"submission_id": "MagicAgent-Magic_Agent_0910", "easy_accuracy": 95.83, "hard_accuracy": 61.11} | |
| {"submission_id": "MagicAgent-Magic_Agent_0918", "easy_accuracy": 95.83, "hard_accuracy": 61.11} | |
| {"submission_id": "Microsoft-DICE", "easy_accuracy": 75.0, "hard_accuracy": 29.1} | |
| {"submission_id": "Mphasis Limited-Mphasis-I2I-Agents", "easy_accuracy": 80.56, "hard_accuracy": 28.04} | |
| {"submission_id": "Mphasis-Mphasis-I2I-Agents", "easy_accuracy": 86.11, "hard_accuracy": 24.34} | |
| {"submission_id": "NA-Anurag_Data_AI", "easy_accuracy": 83.33, "hard_accuracy": 48.15} | |
| {"submission_id": "NA-Anurag_Data_AI_2", "easy_accuracy": 83.33, "hard_accuracy": 41.01} | |
| {"submission_id": "NA-DSA-v3-test-251019-v1", "easy_accuracy": 72.22, "hard_accuracy": 16.93} | |
| {"submission_id": "NVIDIA-exp001_baseline.py", "easy_accuracy": 73.61, "hard_accuracy": 18.25} | |
| {"submission_id": "Not applicable-dsa-test-251006-flash-all-all", "easy_accuracy": 73.61, "hard_accuracy": 6.88} | |
| {"submission_id": "Not applicable-dsa-test-251006-flash-all-none", "easy_accuracy": 79.17, "hard_accuracy": 10.58} | |
| {"submission_id": "Not applicable-dsa-test-251006-flash-all-sc", "easy_accuracy": 75.0, "hard_accuracy": 7.94} | |
| {"submission_id": "Not applicable-dsa-test-251006-flash-all-si", "easy_accuracy": 79.17, "hard_accuracy": 10.58} | |
| {"submission_id": "Not applicable-dsa-test-251006-flash-none-all2", "easy_accuracy": 75.0, "hard_accuracy": 8.2} | |
| {"submission_id": "Not applicable-dsa-test-251006-flash-none-none", "easy_accuracy": 73.61, "hard_accuracy": 8.47} | |
| {"submission_id": "Not applicable-dsa-test-251006-flash-none-sc", "easy_accuracy": 70.83, "hard_accuracy": 7.14} | |
| {"submission_id": "Not applicable-dsa-test-251006-flash-none-sionly", "easy_accuracy": 77.78, "hard_accuracy": 9.79} | |
| {"submission_id": "Not applicable-dsa-test-251006-pro-all-all2", "easy_accuracy": 79.17, "hard_accuracy": 10.58} | |
| {"submission_id": "Not applicable-dsa-test-251006-pro-all-none", "easy_accuracy": 79.17, "hard_accuracy": 9.26} | |
| {"submission_id": "Not applicable-dsa-test-251006-pro-all-sc", "easy_accuracy": 73.61, "hard_accuracy": 2.38} | |
| {"submission_id": "Not applicable-dsa-test-251006-pro-all-sionly", "easy_accuracy": 80.56, "hard_accuracy": 11.9} | |
| {"submission_id": "Not applicable-dsa-test-251006-pro-none-all", "easy_accuracy": 79.17, "hard_accuracy": 12.43} | |
| {"submission_id": "Not applicable-dsa-test-251006-pro-none-none", "easy_accuracy": 72.22, "hard_accuracy": 12.43} | |
| {"submission_id": "Not applicable-dsa-test-251006-pro-none-sc", "easy_accuracy": 76.39, "hard_accuracy": 3.17} | |
| {"submission_id": "Not applicable-dsa-test-251006-pro-none-sionly", "easy_accuracy": 73.61, "hard_accuracy": 13.49} | |
| {"submission_id": "Not applicable-dsa-test-251009-pro-adk", "easy_accuracy": 84.72, "hard_accuracy": 38.62} | |
| {"submission_id": "Not applicable-dsa-test-251010-flash-adk", "easy_accuracy": 81.94, "hard_accuracy": 27.25} | |
| {"submission_id": "Not applicable-dsa-test-251011-flash-dsa-v3-g3", "easy_accuracy": 79.17, "hard_accuracy": 10.32} | |
| {"submission_id": "Not applicable-dsa-test-251011-pro-dsa-v3-g3", "easy_accuracy": 72.22, "hard_accuracy": 16.67} | |
| {"submission_id": "O-DV1-2", "easy_accuracy": 79.17, "hard_accuracy": 22.22} | |
| {"submission_id": "O-DV1-2.r2", "easy_accuracy": 84.72, "hard_accuracy": 20.9} | |
| {"submission_id": "O-DV1.0", "easy_accuracy": 68.06, "hard_accuracy": 18.52} | |
| {"submission_id": "O-OAgent", "easy_accuracy": 84.72, "hard_accuracy": 32.54} | |
| {"submission_id": "O-Oagent V0", "easy_accuracy": 50.0, "hard_accuracy": 3.17} | |
| {"submission_id": "ONE LAB-Qwen-14B", "easy_accuracy": 54.17, "hard_accuracy": 1.32} | |
| {"submission_id": "ONE LAB-Qwen-14B-coder", "easy_accuracy": 44.44, "hard_accuracy": 1.32} | |
| {"submission_id": "ONE LAB-Qwen-7B", "easy_accuracy": 50.0, "hard_accuracy": 0.0} | |
| {"submission_id": "ONE LAB-gpt-4.1", "easy_accuracy": 70.83, "hard_accuracy": 8.99} | |
| {"submission_id": "ONE LAB-gpt-4.1-mini", "easy_accuracy": 75.0, "hard_accuracy": 6.08} | |
| {"submission_id": "ONE LAB-gpt-4.1-nano", "easy_accuracy": 55.56, "hard_accuracy": 0.53} | |
| {"submission_id": "ONE_LAB-4o-mini", "easy_accuracy": 65.28, "hard_accuracy": 2.91} | |
| {"submission_id": "OceanBase-DUDU", "easy_accuracy": 61.11, "hard_accuracy": 0.79} | |
| {"submission_id": "Oceanbase-DUDU", "easy_accuracy": 52.78, "hard_accuracy": 0.79} | |
| {"submission_id": "Oceanbase-DUDU Agent", "easy_accuracy": 0.0, "hard_accuracy": 0.0} | |
| {"submission_id": "Org 50 2 1-Agent 50 2 1", "easy_accuracy": 43.06, "hard_accuracy": 1.85} | |
| {"submission_id": "Org 50 2 2-Agent 50 2 2", "easy_accuracy": 50.0, "hard_accuracy": 2.65} | |
| {"submission_id": "Org 50 2 3-Agent 50 2 3", "easy_accuracy": 15.28, "hard_accuracy": 2.38} | |
| {"submission_id": "Org 50 2 4-Agent 50 2 4", "easy_accuracy": 50.0, "hard_accuracy": 2.65} | |
| {"submission_id": "Org 50 2-Agent 50 2", "easy_accuracy": 15.28, "hard_accuracy": 2.38} | |
| {"submission_id": "Org Dev 1-Agent Dev 1", "easy_accuracy": 48.61, "hard_accuracy": 1.59} | |
| {"submission_id": "Org V1 P1-Agent V1 P1", "easy_accuracy": 68.06, "hard_accuracy": 3.44} | |
| {"submission_id": "Org V1 P2-Agent V1 P2", "easy_accuracy": 69.44, "hard_accuracy": 6.35} | |
| {"submission_id": "Org V2 S1-Agent V2 S1", "easy_accuracy": 62.5, "hard_accuracy": 4.5} | |
| {"submission_id": "Org Variant 1 A 2-Agent Variant 1 A 2", "easy_accuracy": 38.89, "hard_accuracy": 1.59} | |
| {"submission_id": "Org Variant 1 S-Agent Variant 1 S", "easy_accuracy": 65.28, "hard_accuracy": 3.17} | |
| {"submission_id": "Org Variant 3-Agent Variant 3", "easy_accuracy": 33.33, "hard_accuracy": 1.32} | |
| {"submission_id": "Organisation-1-gpt-test-agent-27-hard", "easy_accuracy": 4.17, "hard_accuracy": 1.06} | |
| {"submission_id": "Organisation-1-gpt-test-agent-30-hard", "easy_accuracy": 9.72, "hard_accuracy": 0.79} | |
| {"submission_id": "Organization 1-My Agent", "easy_accuracy": 94.44, "hard_accuracy": 54.76} | |
| {"submission_id": "Organization 1-My Agent 1", "easy_accuracy": 97.22, "hard_accuracy": 56.35} | |
| {"submission_id": "Organization 2-My Agent 2", "easy_accuracy": 40.28, "hard_accuracy": 0.79} | |
| {"submission_id": "Personal-test-36a", "easy_accuracy": 0.0, "hard_accuracy": 1.06} | |
| {"submission_id": "Personal-test-e29", "easy_accuracy": 95.83, "hard_accuracy": 47.88} | |
| {"submission_id": "Personal-test-e30", "easy_accuracy": 4.17, "hard_accuracy": 4.5} | |
| {"submission_id": "Personal-test-e31", "easy_accuracy": 0.0, "hard_accuracy": 0.53} | |
| {"submission_id": "Personal-test-e32", "easy_accuracy": 0.0, "hard_accuracy": 0.26} | |
| {"submission_id": "Personal-test-e33", "easy_accuracy": 0.0, "hard_accuracy": 0.0} | |
| {"submission_id": "Personal-test-e34", "easy_accuracy": 0.0, "hard_accuracy": 0.0} | |
| {"submission_id": "Personal-test-e35", "easy_accuracy": 0.0, "hard_accuracy": 1.06} | |
| {"submission_id": "Personal-test-e35a", "easy_accuracy": 0.0, "hard_accuracy": 0.0} | |
| {"submission_id": "Personal-test-e37", "easy_accuracy": 0.0, "hard_accuracy": 0.0} | |
| {"submission_id": "Personal-test-e38", "easy_accuracy": 0.0, "hard_accuracy": 2.65} | |
| {"submission_id": "Personal-test-e40", "easy_accuracy": 0.0, "hard_accuracy": 4.23} | |
| {"submission_id": "Personal-test-e42", "easy_accuracy": 0.0, "hard_accuracy": 1.06} | |
| {"submission_id": "Personal-test-e43", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "Personal-test-e44", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "Personal-test-e45", "easy_accuracy": 0.0, "hard_accuracy": 0.26} | |
| {"submission_id": "Personal-test-e45a", "easy_accuracy": 0.0, "hard_accuracy": 0.26} | |
| {"submission_id": "Personal-test-e46", "easy_accuracy": 0.0, "hard_accuracy": 0.26} | |
| {"submission_id": "Personal-test-e52b", "easy_accuracy": 0.0, "hard_accuracy": 0.26} | |
| {"submission_id": "Personal-test-e55c", "easy_accuracy": 0.0, "hard_accuracy": 0.53} | |
| {"submission_id": "Personal-test-e56", "easy_accuracy": 0.0, "hard_accuracy": 4.5} | |
| {"submission_id": "Personal-test-e57", "easy_accuracy": 0.0, "hard_accuracy": 0.26} | |
| {"submission_id": "Personal-test-e58", "easy_accuracy": 0.0, "hard_accuracy": 0.26} | |
| {"submission_id": "Personal-test-e59", "easy_accuracy": 0.0, "hard_accuracy": 0.0} | |
| {"submission_id": "Personal-test-e63", "easy_accuracy": 0.0, "hard_accuracy": 0.0} | |
| {"submission_id": "Personal-test-e64", "easy_accuracy": 0.0, "hard_accuracy": 3.17} | |
| {"submission_id": "Personal-test-e65", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "Personal-test-e66", "easy_accuracy": 95.83, "hard_accuracy": 45.77} | |
| {"submission_id": "Personal-test-e66a", "easy_accuracy": 95.83, "hard_accuracy": 47.88} | |
| {"submission_id": "Personal-test-e67", "easy_accuracy": 95.83, "hard_accuracy": 54.23} | |
| {"submission_id": "Personal-test-e68aa", "easy_accuracy": 95.83, "hard_accuracy": 30.69} | |
| {"submission_id": "Personal-tester-agent-noop", "easy_accuracy": 70.83, "hard_accuracy": 0.79} | |
| {"submission_id": "Personal-tlv-dab-5", "easy_accuracy": 72.22, "hard_accuracy": 0.79} | |
| {"submission_id": "QQ TEST 3-QQ TEST 3", "easy_accuracy": 26.39, "hard_accuracy": 2.12} | |
| {"submission_id": "QQ TEST DS-QQ TEST DS", "easy_accuracy": 19.44, "hard_accuracy": 0.26} | |
| {"submission_id": "Red Hat and Universidad Complutense de Madrid-SuperInference 0.41.3", "easy_accuracy": 79.17, "hard_accuracy": 41.27} | |
| {"submission_id": "ST-baseline-Gemini-2.5-pro-preview-0506-May-31", "easy_accuracy": 66.67, "hard_accuracy": 7.41} | |
| {"submission_id": "ST-baseline-gemini-2.0-flash-may-31", "easy_accuracy": 31.94, "hard_accuracy": 2.91} | |
| {"submission_id": "Self-QQ Test", "easy_accuracy": 23.61, "hard_accuracy": 0.79} | |
| {"submission_id": "Self-QQ Test DB", "easy_accuracy": 50.0, "hard_accuracy": 5.03} | |
| {"submission_id": "Self-Spartacus", "easy_accuracy": 0.0, "hard_accuracy": 34.13} | |
| {"submission_id": "Self-agent_pogo1", "easy_accuracy": 50.0, "hard_accuracy": 17.2} | |
| {"submission_id": "Self-agent_pogo_baseline", "easy_accuracy": 72.22, "hard_accuracy": 18.52} | |
| {"submission_id": "Self-agent_pogo_baseline1", "easy_accuracy": 63.89, "hard_accuracy": 19.31} | |
| {"submission_id": "Sphinx-sphinx-0.8", "easy_accuracy": 87.5, "hard_accuracy": 61.38} | |
| {"submission_id": "Stanford-test_1", "easy_accuracy": 52.78, "hard_accuracy": 8.2} | |
| {"submission_id": "TEMP-SYNCER-AGENT", "easy_accuracy": 76.39, "hard_accuracy": 31.75} | |
| {"submission_id": "TEMP-g3-ds-s", "easy_accuracy": 87.5, "hard_accuracy": 37.04} | |
| {"submission_id": "Test Agent Org-Test Agent", "easy_accuracy": 11.11, "hard_accuracy": 0.0} | |
| {"submission_id": "Test Agent Org-Test Agent2", "easy_accuracy": 4.17, "hard_accuracy": 0.53} | |
| {"submission_id": "Test Agent Org-Test Agent3", "easy_accuracy": 11.11, "hard_accuracy": 0.0} | |
| {"submission_id": "Test Agent Org-Test Agent4", "easy_accuracy": 11.11, "hard_accuracy": 0.0} | |
| {"submission_id": "Test Agent-Test Agent", "easy_accuracy": 97.22, "hard_accuracy": 56.61} | |
| {"submission_id": "Test Organization 1-Test Agent 1", "easy_accuracy": 88.89, "hard_accuracy": 56.08} | |
| {"submission_id": "Test-AnonymousAgent", "easy_accuracy": 86.11, "hard_accuracy": 87.57} | |
| {"submission_id": "Test-DV1.1.run1", "easy_accuracy": 87.5, "hard_accuracy": 37.83} | |
| {"submission_id": "Think Evolve Labs LLC-ThinkEvolve Spoofer", "easy_accuracy": 100.0, "hard_accuracy": 99.21} | |
| {"submission_id": "TogetherAI-Open Data Scientist", "easy_accuracy": 84.72, "hard_accuracy": 16.4} | |
| {"submission_id": "Tsinghua University-AeYa", "easy_accuracy": 88.89, "hard_accuracy": 24.87} | |
| {"submission_id": "Tsinghua University-AeYa-v0.2", "easy_accuracy": 88.89, "hard_accuracy": 28.57} | |
| {"submission_id": "Tsinghua University-AeYa-v0.3", "easy_accuracy": 88.89, "hard_accuracy": 40.74} | |
| {"submission_id": "Tsinghua University-AgenticData-update-Aug-20", "easy_accuracy": 94.44, "hard_accuracy": 46.83} | |
| {"submission_id": "UCAS-data_analysis_agent_S220", "easy_accuracy": 73.61, "hard_accuracy": 25.4} | |
| {"submission_id": "Zscaler-OpenManus", "easy_accuracy": 75.0, "hard_accuracy": 12.7} | |
| {"submission_id": "a-another-one", "easy_accuracy": 77.78, "hard_accuracy": 44.97} | |
| {"submission_id": "aaa-aaa780", "easy_accuracy": 73.61, "hard_accuracy": 30.42} | |
| {"submission_id": "agent_for_real-agent_for_real", "easy_accuracy": 87.5, "hard_accuracy": 58.47} | |
| {"submission_id": "agent_test-agent_test", "easy_accuracy": 100.0, "hard_accuracy": 70.37} | |
| {"submission_id": "agenttt-agenttt", "easy_accuracy": 68.06, "hard_accuracy": 7.14} | |
| {"submission_id": "agentttt-agentttt", "easy_accuracy": 66.67, "hard_accuracy": 5.82} | |
| {"submission_id": "ai innovation-zagent_test", "easy_accuracy": 41.67, "hard_accuracy": 1.85} | |
| {"submission_id": "ai innovation-zagent_test_2", "easy_accuracy": 52.78, "hard_accuracy": 2.38} | |
| {"submission_id": "aktest-aktest", "easy_accuracy": 76.39, "hard_accuracy": 21.16} | |
| {"submission_id": "anon-anon", "easy_accuracy": 11.11, "hard_accuracy": 0.0} | |
| {"submission_id": "anon-anon-12", "easy_accuracy": 11.11, "hard_accuracy": 0.0} | |
| {"submission_id": "anon-anon-18", "easy_accuracy": 11.11, "hard_accuracy": 0.0} | |
| {"submission_id": "anon-anon-2", "easy_accuracy": 11.11, "hard_accuracy": 0.0} | |
| {"submission_id": "anon-anon-3", "easy_accuracy": 11.11, "hard_accuracy": 0.0} | |
| {"submission_id": "anon-anon-76", "easy_accuracy": 11.11, "hard_accuracy": 0.0} | |
| {"submission_id": "anon-anon-82", "easy_accuracy": 11.11, "hard_accuracy": 0.0} | |
| {"submission_id": "anon-anon-95", "easy_accuracy": 11.11, "hard_accuracy": 0.0} | |
| {"submission_id": "anon-im-a-good-agent-2", "easy_accuracy": 58.33, "hard_accuracy": 6.08} | |
| {"submission_id": "anon-im-a-good-agent-3", "easy_accuracy": 58.33, "hard_accuracy": 6.08} | |
| {"submission_id": "anon-im-a-good-agent-4", "easy_accuracy": 73.61, "hard_accuracy": 6.08} | |
| {"submission_id": "anon-im-a-good-agent-5", "easy_accuracy": 62.5, "hard_accuracy": 2.65} | |
| {"submission_id": "anon-im-a-good-agent-datamix-350-100-steps", "easy_accuracy": 61.11, "hard_accuracy": 3.7} | |
| {"submission_id": "anon-im-a-good-da-agent-4", "easy_accuracy": 66.67, "hard_accuracy": 5.29} | |
| {"submission_id": "anon-qwen-good-agent", "easy_accuracy": 48.61, "hard_accuracy": 1.06} | |
| {"submission_id": "anonymous-aaaaagent", "easy_accuracy": 70.83, "hard_accuracy": 9.26} | |
| {"submission_id": "anonymous-test_anonymous1", "easy_accuracy": 70.83, "hard_accuracy": 9.26} | |
| {"submission_id": "ant-super-agent", "easy_accuracy": 69.44, "hard_accuracy": 7.41} | |
| {"submission_id": "ant_test_v1-ant_test_v1", "easy_accuracy": 73.61, "hard_accuracy": 34.39} | |
| {"submission_id": "anuragpunia-A-Star", "easy_accuracy": 80.56, "hard_accuracy": 34.92} | |
| {"submission_id": "anuragpunia-A-Star-v3", "easy_accuracy": 73.61, "hard_accuracy": 35.19} | |
| {"submission_id": "anuragpunia.com-A-Star", "easy_accuracy": 81.94, "hard_accuracy": 36.51} | |
| {"submission_id": "baptiste-Test o3 debug ", "easy_accuracy": 19.44, "hard_accuracy": 0.79} | |
| {"submission_id": "baptiste-baptiste", "easy_accuracy": 97.22, "hard_accuracy": 56.61} | |
| {"submission_id": "beacon-beacon-agent-1", "easy_accuracy": 87.5, "hard_accuracy": 30.16} | |
| {"submission_id": "beacon-beacon-agent-1.2", "easy_accuracy": 87.5, "hard_accuracy": 30.16} | |
| {"submission_id": "beacon-beacon-agent-1.3", "easy_accuracy": 70.83, "hard_accuracy": 37.3} | |
| {"submission_id": "beacon-beacon-agent-1.3.1", "easy_accuracy": 70.83, "hard_accuracy": 37.3} | |
| {"submission_id": "beacon-beacon-agent-1.4", "easy_accuracy": 77.78, "hard_accuracy": 38.89} | |
| {"submission_id": "beacon-beacon-agent-1.4.1", "easy_accuracy": 77.78, "hard_accuracy": 38.89} | |
| {"submission_id": "beacon-beacon-agent-1.5", "easy_accuracy": 79.17, "hard_accuracy": 43.39} | |
| {"submission_id": "beacon-beacon-agent-1.6", "easy_accuracy": 69.44, "hard_accuracy": 45.5} | |
| {"submission_id": "beacon-beacon-agent-test", "easy_accuracy": 75.0, "hard_accuracy": 43.39} | |
| {"submission_id": "beacon-beacon-agent-test-1.0", "easy_accuracy": 79.17, "hard_accuracy": 43.65} | |
| {"submission_id": "beacon-beacon-agent-test-1.3", "easy_accuracy": 75.0, "hard_accuracy": 44.97} | |
| {"submission_id": "beacon-beacon-alpha-1.0", "easy_accuracy": 72.22, "hard_accuracy": 37.3} | |
| {"submission_id": "beacon-beacon-alpha-1.1", "easy_accuracy": 72.22, "hard_accuracy": 42.86} | |
| {"submission_id": "beacon-beacon-alpha-1.2", "easy_accuracy": 70.83, "hard_accuracy": 45.24} | |
| {"submission_id": "beacon-beacon-beta-1.0", "easy_accuracy": 77.78, "hard_accuracy": 49.74} | |
| {"submission_id": "beacon-beacon-beta-1.1", "easy_accuracy": 84.72, "hard_accuracy": 47.35} | |
| {"submission_id": "beacon-beacon-beta-1.1.1", "easy_accuracy": 86.11, "hard_accuracy": 47.88} | |
| {"submission_id": "beep-beep", "easy_accuracy": 77.78, "hard_accuracy": 39.15} | |
| {"submission_id": "beep2-beep2", "easy_accuracy": 77.78, "hard_accuracy": 44.44} | |
| {"submission_id": "beep3-beep3", "easy_accuracy": 77.78, "hard_accuracy": 44.44} | |
| {"submission_id": "bespokelabs-im-a-good-agent", "easy_accuracy": 58.33, "hard_accuracy": 6.08} | |
| {"submission_id": "bob-better-bot", "easy_accuracy": 80.56, "hard_accuracy": 44.18} | |
| {"submission_id": "cam-basic", "easy_accuracy": 65.28, "hard_accuracy": 3.17} | |
| {"submission_id": "cm-cm", "easy_accuracy": 77.78, "hard_accuracy": 26.72} | |
| {"submission_id": "cm-cm-hf-1", "easy_accuracy": 80.56, "hard_accuracy": 23.81} | |
| {"submission_id": "doanhdoanh-adav01", "easy_accuracy": 34.72, "hard_accuracy": 7.94} | |
| {"submission_id": "ds-ds", "easy_accuracy": 77.78, "hard_accuracy": 28.84} | |
| {"submission_id": "ds-ds--hard-70", "easy_accuracy": 0.0, "hard_accuracy": 1.32} | |
| {"submission_id": "ds-ds--hard-70-3", "easy_accuracy": 80.56, "hard_accuracy": 23.28} | |
| {"submission_id": "ds-ds--v31", "easy_accuracy": 76.39, "hard_accuracy": 21.96} | |
| {"submission_id": "ds-ds-160", "easy_accuracy": 70.83, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-ds-160-hard1", "easy_accuracy": 0.0, "hard_accuracy": 2.12} | |
| {"submission_id": "ds-ds-160-hard2", "easy_accuracy": 0.0, "hard_accuracy": 2.12} | |
| {"submission_id": "ds-ds-160-hard3", "easy_accuracy": 0.0, "hard_accuracy": 2.12} | |
| {"submission_id": "ds-ds-160-hard4", "easy_accuracy": 0.0, "hard_accuracy": 8.73} | |
| {"submission_id": "ds-ds-235", "easy_accuracy": 73.61, "hard_accuracy": 17.46} | |
| {"submission_id": "ds-ds-96", "easy_accuracy": 77.78, "hard_accuracy": 33.07} | |
| {"submission_id": "ds-ds-96-1", "easy_accuracy": 12.5, "hard_accuracy": 5.29} | |
| {"submission_id": "ds-ds-96-2", "easy_accuracy": 23.61, "hard_accuracy": 11.38} | |
| {"submission_id": "ds-ds-96-3", "easy_accuracy": 37.5, "hard_accuracy": 15.61} | |
| {"submission_id": "ds-ds-96-4", "easy_accuracy": 76.39, "hard_accuracy": 33.07} | |
| {"submission_id": "ds-ds-all", "easy_accuracy": 75.0, "hard_accuracy": 28.31} | |
| {"submission_id": "ds-ds-easy", "easy_accuracy": 0.0, "hard_accuracy": 2.38} | |
| {"submission_id": "ds-ds-easy2", "easy_accuracy": 68.06, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-ds-hard", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-ds-kimi", "easy_accuracy": 41.67, "hard_accuracy": 15.34} | |
| {"submission_id": "ds-ds-kimi-2", "easy_accuracy": 77.78, "hard_accuracy": 29.63} | |
| {"submission_id": "ds-ds-kimi-3", "easy_accuracy": 77.78, "hard_accuracy": 28.84} | |
| {"submission_id": "ds-ds-oss", "easy_accuracy": 70.83, "hard_accuracy": 7.94} | |
| {"submission_id": "ds-manual", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "ds-org-ds-rookie-v0", "easy_accuracy": 72.22, "hard_accuracy": 7.14} | |
| {"submission_id": "ds-rookie-ds-rookie-v1", "easy_accuracy": 62.5, "hard_accuracy": 0.0} | |
| {"submission_id": "ds-test-aci", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "ds-test-aci-2", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "ds-test-average", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "ds-test-base", "easy_accuracy": 58.33, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-base-2", "easy_accuracy": 47.22, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-base-qwen3", "easy_accuracy": 79.17, "hard_accuracy": 30.69} | |
| {"submission_id": "ds-test-combine1-12", "easy_accuracy": 68.06, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-combine1-14", "easy_accuracy": 8.33, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-combine1-14-true", "easy_accuracy": 63.89, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-combine1-16", "easy_accuracy": 72.22, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-combine1-new", "easy_accuracy": 61.11, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-combine1-old", "easy_accuracy": 59.72, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-combine2-new", "easy_accuracy": 65.28, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-combine2-old", "easy_accuracy": 59.72, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-hard-easy", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "ds-test-hard-hard", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "ds-test-max", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "ds-test-mcc", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "ds-test-mcc-2", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "ds-test-note", "easy_accuracy": 56.94, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-note-180", "easy_accuracy": 54.17, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-note-180-old", "easy_accuracy": 54.17, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-note-2", "easy_accuracy": 51.39, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-note-3", "easy_accuracy": 52.78, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-note-4", "easy_accuracy": 59.72, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-notebook-2x", "easy_accuracy": 61.11, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-notebook-2x-old", "easy_accuracy": 58.33, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-qr", "easy_accuracy": 63.89, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-qr-new", "easy_accuracy": 63.89, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-qr-old", "easy_accuracy": 65.28, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-qr_nef10_acc16_16", "easy_accuracy": 66.67, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-qr_nef10_acc16_16_old", "easy_accuracy": 61.11, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-qr_nef10_acc16_18", "easy_accuracy": 66.67, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-qr_nef10_acc16_18_old", "easy_accuracy": 58.33, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-raw-test", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-raw-test-2", "easy_accuracy": 72.22, "hard_accuracy": 12.7} | |
| {"submission_id": "ds-test-raw-test-claude", "easy_accuracy": 86.11, "hard_accuracy": 29.37} | |
| {"submission_id": "ds-test-raw-test-claude-2", "easy_accuracy": 81.94, "hard_accuracy": 31.75} | |
| {"submission_id": "ds-test-raw-test-gpt", "easy_accuracy": 73.61, "hard_accuracy": 14.02} | |
| {"submission_id": "ds-test-synth-dab-hard-8", "easy_accuracy": 79.17, "hard_accuracy": 30.69} | |
| {"submission_id": "ds-test-synth-hard-18", "easy_accuracy": 80.56, "hard_accuracy": 32.28} | |
| {"submission_id": "ds-test-synth1-17", "easy_accuracy": 72.22, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-synth1-17-test-incomplete", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-synth1-17-test-incomplete-2", "easy_accuracy": 0.0, "hard_accuracy": 2.91} | |
| {"submission_id": "ds-test-synth1-17-test-incomplete-3", "easy_accuracy": 0.0, "hard_accuracy": 5.82} | |
| {"submission_id": "ds-test-synth1-17-test-incomplete-4", "easy_accuracy": 0.0, "hard_accuracy": 8.73} | |
| {"submission_id": "ds-test-synth1-17-test-incomplete-5", "easy_accuracy": 0.0, "hard_accuracy": 12.7} | |
| {"submission_id": "ds-test-synth1-17-test-incomplete-6", "easy_accuracy": 0.0, "hard_accuracy": 12.96} | |
| {"submission_id": "ds-test-synth1-18", "easy_accuracy": 76.39, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-synth1-18-qwen2.5", "easy_accuracy": 79.17, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-synth1-18-qwen3", "easy_accuracy": 76.39, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test-synth1-19-qwen3", "easy_accuracy": 79.17, "hard_accuracy": 0.79} | |
| {"submission_id": "ds-test1", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "ds1-base", "easy_accuracy": 41.67, "hard_accuracy": 0.79} | |
| {"submission_id": "ds1-base2", "easy_accuracy": 43.06, "hard_accuracy": 0.79} | |
| {"submission_id": "ds1-base3", "easy_accuracy": 58.33, "hard_accuracy": 0.79} | |
| {"submission_id": "ds1-best", "easy_accuracy": 59.72, "hard_accuracy": 0.79} | |
| {"submission_id": "ds1-best2", "easy_accuracy": 59.72, "hard_accuracy": 0.79} | |
| {"submission_id": "ds1-best3", "easy_accuracy": 62.5, "hard_accuracy": 0.79} | |
| {"submission_id": "ds1-deepseek2", "easy_accuracy": 80.56, "hard_accuracy": 0.0} | |
| {"submission_id": "ds1-ds-deepseek", "easy_accuracy": 72.22, "hard_accuracy": 0.0} | |
| {"submission_id": "ds1-ds-deepseek2", "easy_accuracy": 80.56, "hard_accuracy": 0.0} | |
| {"submission_id": "ds1-ds-qwen", "easy_accuracy": 44.44, "hard_accuracy": 0.0} | |
| {"submission_id": "ds1-incomplete", "easy_accuracy": 37.5, "hard_accuracy": 1.85} | |
| {"submission_id": "ds1-test", "easy_accuracy": 54.17, "hard_accuracy": 0.79} | |
| {"submission_id": "ds1-test-hard", "easy_accuracy": 0.0, "hard_accuracy": 2.38} | |
| {"submission_id": "ds1-test_base", "easy_accuracy": 50.0, "hard_accuracy": 0.79} | |
| {"submission_id": "ds1-testdeepseek", "easy_accuracy": 65.28, "hard_accuracy": 0.79} | |
| {"submission_id": "ds1-testdeepseekv3", "easy_accuracy": 51.39, "hard_accuracy": 0.79} | |
| {"submission_id": "ds1-testg", "easy_accuracy": 81.94, "hard_accuracy": 0.79} | |
| {"submission_id": "ds1-testq", "easy_accuracy": 73.61, "hard_accuracy": 1.85} | |
| {"submission_id": "ds1-tune", "easy_accuracy": 45.83, "hard_accuracy": 0.79} | |
| {"submission_id": "ds1-tune150", "easy_accuracy": 26.39, "hard_accuracy": 0.79} | |
| {"submission_id": "ds1-tune2", "easy_accuracy": 38.89, "hard_accuracy": 0.79} | |
| {"submission_id": "ds1-tune25", "easy_accuracy": 59.72, "hard_accuracy": 0.79} | |
| {"submission_id": "dsgym-combine1-12-1e-5", "easy_accuracy": 73.61, "hard_accuracy": 0.79} | |
| {"submission_id": "dsgym-ds-4o", "easy_accuracy": 73.61, "hard_accuracy": 7.41} | |
| {"submission_id": "dsgym-ds-coder", "easy_accuracy": 75.0, "hard_accuracy": 14.29} | |
| {"submission_id": "dsvx-test_agent", "easy_accuracy": 86.11, "hard_accuracy": 67.99} | |
| {"submission_id": "federicotogehter-t-1", "easy_accuracy": 83.33, "hard_accuracy": 16.93} | |
| {"submission_id": "federicotogether-AgentTest", "easy_accuracy": 79.17, "hard_accuracy": 12.17} | |
| {"submission_id": "federicotogether-MiniAgent", "easy_accuracy": 72.22, "hard_accuracy": 0.0} | |
| {"submission_id": "federicotogether-MiniDS-DS", "easy_accuracy": 55.56, "hard_accuracy": 0.0} | |
| {"submission_id": "federicotogether-MiniDS-LLAMA", "easy_accuracy": 55.56, "hard_accuracy": 0.0} | |
| {"submission_id": "federicotogether-MiniDS-LLAMA2", "easy_accuracy": 68.06, "hard_accuracy": 0.0} | |
| {"submission_id": "federicotogether-MiniRS-OAI", "easy_accuracy": 79.17, "hard_accuracy": 0.0} | |
| {"submission_id": "federicotogether-MiniRS-llama 2.3", "easy_accuracy": 68.06, "hard_accuracy": 0.0} | |
| {"submission_id": "federicotogether-MiniRS-llama-2", "easy_accuracy": 69.44, "hard_accuracy": 0.0} | |
| {"submission_id": "federicotogether-MiniRS-llama-2.1", "easy_accuracy": 65.28, "hard_accuracy": 0.0} | |
| {"submission_id": "federicotogether-MiniRS-llama-2.2", "easy_accuracy": 63.89, "hard_accuracy": 0.0} | |
| {"submission_id": "federicotogether-MiniReact", "easy_accuracy": 55.56, "hard_accuracy": 0.0} | |
| {"submission_id": "federicotogether-MiniReactDS-Full", "easy_accuracy": 83.33, "hard_accuracy": 14.02} | |
| {"submission_id": "federicotogether-TestDoubleCheck", "easy_accuracy": 79.17, "hard_accuracy": 0.0} | |
| {"submission_id": "federicotogether-newagent", "easy_accuracy": 79.17, "hard_accuracy": 14.55} | |
| {"submission_id": "federicotogether-test-", "easy_accuracy": 70.83, "hard_accuracy": 0.0} | |
| {"submission_id": "federicotogether-test-ds", "easy_accuracy": 81.94, "hard_accuracy": 0.0} | |
| {"submission_id": "federicotogether-test12123", "easy_accuracy": 79.17, "hard_accuracy": 12.17} | |
| {"submission_id": "federicotogether-test12123-next", "easy_accuracy": 80.56, "hard_accuracy": 12.43} | |
| {"submission_id": "federicotogether-test12123-next-2", "easy_accuracy": 79.17, "hard_accuracy": 15.87} | |
| {"submission_id": "federicotogether-zzzz", "easy_accuracy": 77.78, "hard_accuracy": 0.0} | |
| {"submission_id": "federicotogether-zzzz2", "easy_accuracy": 83.33, "hard_accuracy": 16.93} | |
| {"submission_id": "g-g3-sync-think", "easy_accuracy": 84.72, "hard_accuracy": 36.77} | |
| {"submission_id": "gg-gg-agent", "easy_accuracy": 47.22, "hard_accuracy": 3.44} | |
| {"submission_id": "gg-gg-agent-0423-concise-new-s12", "easy_accuracy": 52.78, "hard_accuracy": 2.65} | |
| {"submission_id": "gg-gg-agent-0423-concise-s12", "easy_accuracy": 54.17, "hard_accuracy": 3.7} | |
| {"submission_id": "gg-gg-agent-0501-s12", "easy_accuracy": 61.11, "hard_accuracy": 3.17} | |
| {"submission_id": "gg-gg-agent-0501-step120-s12", "easy_accuracy": 34.72, "hard_accuracy": 3.44} | |
| {"submission_id": "gg-gg-agent-0501-step120-s12-s1", "easy_accuracy": 66.67, "hard_accuracy": 5.29} | |
| {"submission_id": "gg-gg-agent-0509-rl-step40-s12", "easy_accuracy": 56.94, "hard_accuracy": 3.17} | |
| {"submission_id": "gg-gg-agent-0512-qwen32b-step100-s12", "easy_accuracy": 45.83, "hard_accuracy": 2.38} | |
| {"submission_id": "gg-gg-agent-doubao", "easy_accuracy": 68.06, "hard_accuracy": 9.26} | |
| {"submission_id": "gg-gg-agent-doubao-id23", "easy_accuracy": 68.06, "hard_accuracy": 15.61} | |
| {"submission_id": "gg-gg-agent-qwen2.5-coder-32b-instruct", "easy_accuracy": 58.33, "hard_accuracy": 2.91} | |
| {"submission_id": "gg-gg-agent-qwen2.5_32b_1", "easy_accuracy": 40.28, "hard_accuracy": 2.91} | |
| {"submission_id": "gg-gg-agent-qwen235b-s12", "easy_accuracy": 66.67, "hard_accuracy": 3.7} | |
| {"submission_id": "gg-gg-agent-qwen3-235b-0523", "easy_accuracy": 29.17, "hard_accuracy": 1.59} | |
| {"submission_id": "gg-gg-agent-qwen3-235b-0523-rerun", "easy_accuracy": 43.06, "hard_accuracy": 2.12} | |
| {"submission_id": "gg-gg-agent-qwen3-235b-base", "easy_accuracy": 58.33, "hard_accuracy": 3.17} | |
| {"submission_id": "gg-gg-agent-qwen3-30b-0522", "easy_accuracy": 50.0, "hard_accuracy": 2.65} | |
| {"submission_id": "gg-gg-agent-qwen3-30b-0522-train-prompt", "easy_accuracy": 23.61, "hard_accuracy": 2.12} | |
| {"submission_id": "gg-gg-agent-qwen3-30b-a3b-0524", "easy_accuracy": 13.89, "hard_accuracy": 1.85} | |
| {"submission_id": "gg-gg-agent-qwen3-30b-a3b-base", "easy_accuracy": 11.11, "hard_accuracy": 0.79} | |
| {"submission_id": "gg-gg-agent-qwen3-32b-0522", "easy_accuracy": 66.67, "hard_accuracy": 4.5} | |
| {"submission_id": "gg-gg-agent-qwen3-32b-0522-reason1", "easy_accuracy": 56.94, "hard_accuracy": 1.32} | |
| {"submission_id": "gg-gg-agent-qwen3-32b-0522-reason2", "easy_accuracy": 58.33, "hard_accuracy": 3.7} | |
| {"submission_id": "gg-gg-agent-qwen3-32b-0522-rerun1", "easy_accuracy": 15.28, "hard_accuracy": 2.12} | |
| {"submission_id": "gg-gg-agent-qwen3-32b-0522-rerun2", "easy_accuracy": 16.67, "hard_accuracy": 0.79} | |
| {"submission_id": "gg-gg-agent-qwen3-32b-0522-train-prompt", "easy_accuracy": 51.39, "hard_accuracy": 2.91} | |
| {"submission_id": "gg-gg-agent-qwen3-32b-0523", "easy_accuracy": 36.11, "hard_accuracy": 1.85} | |
| {"submission_id": "gg-gg-agent-qwen3-32b-0523-reason", "easy_accuracy": 50.0, "hard_accuracy": 2.65} | |
| {"submission_id": "gg-gg-agent-qwen3-32b-0524", "easy_accuracy": 41.67, "hard_accuracy": 2.65} | |
| {"submission_id": "gg-gg-agent-qwen3-32b-0524-new", "easy_accuracy": 31.94, "hard_accuracy": 4.5} | |
| {"submission_id": "gg-gg-agent-qwen3-32b-0524-new2", "easy_accuracy": 23.61, "hard_accuracy": 1.32} | |
| {"submission_id": "gg-gg-agent-qwen3-32b-0524-new2-copy", "easy_accuracy": 31.94, "hard_accuracy": 2.91} | |
| {"submission_id": "gg-gg-agent-qwen3-32b-0524-new2-reason1", "easy_accuracy": 56.94, "hard_accuracy": 2.12} | |
| {"submission_id": "gg-gg-agent-qwen3-32b-0524-new2-reason2", "easy_accuracy": 47.22, "hard_accuracy": 2.65} | |
| {"submission_id": "gg-gg-agent-qwen3-32b-0524-reason", "easy_accuracy": 65.28, "hard_accuracy": 4.5} | |
| {"submission_id": "gg-gg-agent-qwen3-32b-0526-id11", "easy_accuracy": 37.5, "hard_accuracy": 1.59} | |
| {"submission_id": "gg-gg-agent-qwen3-32b-0526-id12", "easy_accuracy": 61.11, "hard_accuracy": 2.91} | |
| {"submission_id": "gg-gg-agent-qwen3-32b-0526-id13", "easy_accuracy": 51.39, "hard_accuracy": 3.44} | |
| {"submission_id": "gg-gg-agent-qwen3-32b-0529-id14", "easy_accuracy": 63.89, "hard_accuracy": 2.38} | |
| {"submission_id": "gg-gg-agent-qwen3-32b-0529-id14-deploy1", "easy_accuracy": 65.28, "hard_accuracy": 4.5} | |
| {"submission_id": "gg-gg-agent-qwen3-32b-0529-id15", "easy_accuracy": 61.11, "hard_accuracy": 4.23} | |
| {"submission_id": "gg-gg-agent-qwen3-32b-0529-id15-deploy1", "easy_accuracy": 66.67, "hard_accuracy": 5.82} | |
| {"submission_id": "gg-gg-agent-qwen3-32b-0602-id16", "easy_accuracy": 58.33, "hard_accuracy": 3.44} | |
| {"submission_id": "gg-gg-agent-qwen3-32b-0602-id17", "easy_accuracy": 63.89, "hard_accuracy": 4.76} | |
| {"submission_id": "gg-gg-agent-qwen3-32b-base", "easy_accuracy": 20.83, "hard_accuracy": 2.38} | |
| {"submission_id": "gg-gg-agent-qwen3-32b-base-reason", "easy_accuracy": 33.33, "hard_accuracy": 3.44} | |
| {"submission_id": "gg-gg-agent-qwen3-32b-base1", "easy_accuracy": 18.06, "hard_accuracy": 1.85} | |
| {"submission_id": "gg-gg-agent-qwq-32b-0423", "easy_accuracy": 59.72, "hard_accuracy": 5.56} | |
| {"submission_id": "gg-gg-agent-qwq-32b-0526-id10", "easy_accuracy": 50.0, "hard_accuracy": 3.17} | |
| {"submission_id": "gg-gg-agent-qwq-32b-s12", "easy_accuracy": 19.44, "hard_accuracy": 1.59} | |
| {"submission_id": "gg-gg-agent-rl-s12", "easy_accuracy": 27.78, "hard_accuracy": 0.79} | |
| {"submission_id": "gg-gg-agent-s11", "easy_accuracy": 45.83, "hard_accuracy": 1.85} | |
| {"submission_id": "gg-gg-agent-s12", "easy_accuracy": 68.06, "hard_accuracy": 4.23} | |
| {"submission_id": "gg-org-gg-agent", "easy_accuracy": 79.17, "hard_accuracy": 15.08} | |
| {"submission_id": "gg-org-gg-agent-cl", "easy_accuracy": 77.78, "hard_accuracy": 30.42} | |
| {"submission_id": "gg-org-gg-agent-cl-0922", "easy_accuracy": 77.78, "hard_accuracy": 32.01} | |
| {"submission_id": "gg-org-gg-agent-cl-0924", "easy_accuracy": 75.0, "hard_accuracy": 35.71} | |
| {"submission_id": "gg-org-gg-agent-cl-0924-1", "easy_accuracy": 75.0, "hard_accuracy": 30.42} | |
| {"submission_id": "gg-org-gg-agent-cl-0924-tw", "easy_accuracy": 77.78, "hard_accuracy": 34.13} | |
| {"submission_id": "gg-org-gg-agent-cl-0925", "easy_accuracy": 76.39, "hard_accuracy": 41.01} | |
| {"submission_id": "gg-org-gg-agent-cl-0925-verifier", "easy_accuracy": 65.28, "hard_accuracy": 25.66} | |
| {"submission_id": "gg-org-gg-agent-cl-0926", "easy_accuracy": 77.78, "hard_accuracy": 53.44} | |
| {"submission_id": "gg-org-gg-agent-cl-1001-workflow", "easy_accuracy": 72.22, "hard_accuracy": 47.09} | |
| {"submission_id": "gg-org-gg-agent-cl-1013-explore", "easy_accuracy": 75.0, "hard_accuracy": 54.5} | |
| {"submission_id": "gg-org-gg-agent-cl-1014", "easy_accuracy": 76.39, "hard_accuracy": 53.7} | |
| {"submission_id": "gg-org-gg-agent-cl-1017", "easy_accuracy": 88.89, "hard_accuracy": 54.76} | |
| {"submission_id": "gg-org-gg-agent-cl-1017-gpt5", "easy_accuracy": 88.89, "hard_accuracy": 52.91} | |
| {"submission_id": "gg-org-gg-agent-cl-1017-rerun", "easy_accuracy": 76.39, "hard_accuracy": 57.41} | |
| {"submission_id": "gg-org-gg-agent-cl-1017-rerun1", "easy_accuracy": 88.89, "hard_accuracy": 57.41} | |
| {"submission_id": "gg-org-gg-agent-cl-1017-rerun1-temp6", "easy_accuracy": 88.89, "hard_accuracy": 58.73} | |
| {"submission_id": "gg-org-gg-agent-cl1", "easy_accuracy": 75.0, "hard_accuracy": 36.24} | |
| {"submission_id": "gg-org-gg-agent-db-1.8-1124", "easy_accuracy": 83.33, "hard_accuracy": 49.47} | |
| {"submission_id": "gg-org-gg-agent-db-1.8-1125", "easy_accuracy": 81.94, "hard_accuracy": 32.54} | |
| {"submission_id": "gg-org-gg-agent-db-code-1118-verifier", "easy_accuracy": 79.17, "hard_accuracy": 40.74} | |
| {"submission_id": "gg-org-gg-agent-doubao-1.5-1", "easy_accuracy": 54.17, "hard_accuracy": 8.99} | |
| {"submission_id": "gg-org-gg-agent-doubao-1.5-2", "easy_accuracy": 68.06, "hard_accuracy": 9.26} | |
| {"submission_id": "gg-org-gg-agent-doubao-id52", "easy_accuracy": 68.06, "hard_accuracy": 18.78} | |
| {"submission_id": "gg-org-gg-agent-doubao-id53", "easy_accuracy": 75.0, "hard_accuracy": 17.46} | |
| {"submission_id": "gg-org-gg-agent-doubao-id54", "easy_accuracy": 62.5, "hard_accuracy": 15.08} | |
| {"submission_id": "gg-org-gg-agent-doubao-id56", "easy_accuracy": 72.22, "hard_accuracy": 15.08} | |
| {"submission_id": "gg-org-gg-agent-doubao-id65", "easy_accuracy": 72.22, "hard_accuracy": 13.23} | |
| {"submission_id": "gg-org-gg-agent-dp3-1", "easy_accuracy": 77.78, "hard_accuracy": 20.63} | |
| {"submission_id": "gg-org-gg-agent-gpt5", "easy_accuracy": 80.56, "hard_accuracy": 14.02} | |
| {"submission_id": "gg-org-gg-agent-gpt5-1027", "easy_accuracy": 88.89, "hard_accuracy": 54.76} | |
| {"submission_id": "gg-org-gg-agent-gpt5-1029", "easy_accuracy": 88.89, "hard_accuracy": 32.28} | |
| {"submission_id": "gg-org-gg-agent-gpt5-1030-without-bank", "easy_accuracy": 88.89, "hard_accuracy": 55.03} | |
| {"submission_id": "gg-org-gg-agent-gpt5-1104", "easy_accuracy": 88.89, "hard_accuracy": 60.32} | |
| {"submission_id": "gg-org-gg-agent-gpt5-1104-1", "easy_accuracy": 88.89, "hard_accuracy": 62.96} | |
| {"submission_id": "gg-org-gg-agent-gpt5-explore-1128", "easy_accuracy": 88.89, "hard_accuracy": 48.68} | |
| {"submission_id": "gg-org-gg-agent-gpt5-explore-rephrase-1129", "easy_accuracy": 80.56, "hard_accuracy": 35.19} | |
| {"submission_id": "gg-org-gg-agent-gpt5-explore-string-1202", "easy_accuracy": 90.28, "hard_accuracy": 51.85} | |
| {"submission_id": "gg-org-gg-agent-gpt5-no-workspace-1206", "easy_accuracy": 87.5, "hard_accuracy": 61.9} | |
| {"submission_id": "gg-org-gg-agent-gpt5-workspace-1208", "easy_accuracy": 90.28, "hard_accuracy": 50.53} | |
| {"submission_id": "gg-org-gg-agent-gpt5-workspace-1210-thresh10", "easy_accuracy": 86.11, "hard_accuracy": 51.06} | |
| {"submission_id": "gg-org-gg-agent-gpt5-workspace-1210-thresh7", "easy_accuracy": 90.28, "hard_accuracy": 69.31} | |
| {"submission_id": "gg-org-gg-agent-gpt5-workspace-1210-thresh8", "easy_accuracy": 90.28, "hard_accuracy": 69.58} | |
| {"submission_id": "gg-org-gg-agent-gpt5-workspace-1210-thresh9", "easy_accuracy": 84.72, "hard_accuracy": 65.34} | |
| {"submission_id": "gg-org-gg-agent-id37-rerun", "easy_accuracy": 70.83, "hard_accuracy": 16.93} | |
| {"submission_id": "gg-org-gg-agent-kimi-1127", "easy_accuracy": 81.94, "hard_accuracy": 38.1} | |
| {"submission_id": "gg-org-gg-agent-kimi-explore-1128", "easy_accuracy": 72.22, "hard_accuracy": 29.63} | |
| {"submission_id": "gg-org-gg-agent-kimi-explore-rephrase-1130", "easy_accuracy": 76.39, "hard_accuracy": 27.78} | |
| {"submission_id": "gg-org-gg-agent-kimi-k2-no-workspace-1206", "easy_accuracy": 79.17, "hard_accuracy": 32.28} | |
| {"submission_id": "gg-org-gg-agent-kimi-k2-workspace-1208", "easy_accuracy": 81.94, "hard_accuracy": 38.1} | |
| {"submission_id": "gg-org-gg-agent-kimi-k2-workspace-1209", "easy_accuracy": 91.67, "hard_accuracy": 56.35} | |
| {"submission_id": "gg-org-gg-agent-kimi-verifier-1118", "easy_accuracy": 84.72, "hard_accuracy": 40.21} | |
| {"submission_id": "gg-sync-flash", "easy_accuracy": 79.17, "hard_accuracy": 32.8} | |
| {"submission_id": "giga-greg-giga-greg", "easy_accuracy": 76.39, "hard_accuracy": 20.9} | |
| {"submission_id": "gmail-tlv dab -2", "easy_accuracy": 65.28, "hard_accuracy": 0.79} | |
| {"submission_id": "gmail-tlv dub", "easy_accuracy": 56.94, "hard_accuracy": 0.79} | |
| {"submission_id": "gmail-tlv-dab-4", "easy_accuracy": 61.11, "hard_accuracy": 0.79} | |
| {"submission_id": "gmail-tlv_dab_3", "easy_accuracy": 62.5, "hard_accuracy": 0.79} | |
| {"submission_id": "h-agent-test-h-agent37", "easy_accuracy": 73.61, "hard_accuracy": 16.67} | |
| {"submission_id": "h45-h45", "easy_accuracy": 86.11, "hard_accuracy": 48.41} | |
| {"submission_id": "hf-hf_agent_test", "easy_accuracy": 58.33, "hard_accuracy": 3.17} | |
| {"submission_id": "hfas-hfas", "easy_accuracy": 79.17, "hard_accuracy": 35.71} | |
| {"submission_id": "hoodini-org-hoodini", "easy_accuracy": 72.22, "hard_accuracy": 47.35} | |
| {"submission_id": "hs-dudu", "easy_accuracy": 73.61, "hard_accuracy": 0.79} | |
| {"submission_id": "hs-dudu-10", "easy_accuracy": 77.78, "hard_accuracy": 0.79} | |
| {"submission_id": "hs-dudu-11", "easy_accuracy": 79.17, "hard_accuracy": 0.79} | |
| {"submission_id": "hs-dudu-112617", "easy_accuracy": 77.78, "hard_accuracy": 0.79} | |
| {"submission_id": "hs-dudu-112720", "easy_accuracy": 76.39, "hard_accuracy": 0.79} | |
| {"submission_id": "hs-dudu-112809", "easy_accuracy": 77.78, "hard_accuracy": 0.79} | |
| {"submission_id": "hs-dudu-112816", "easy_accuracy": 79.17, "hard_accuracy": 0.79} | |
| {"submission_id": "hs-dudu-11281726", "easy_accuracy": 75.0, "hard_accuracy": 0.79} | |
| {"submission_id": "hs-dudu-6", "easy_accuracy": 70.83, "hard_accuracy": 0.79} | |
| {"submission_id": "hs-dudu-8", "easy_accuracy": 76.39, "hard_accuracy": 0.79} | |
| {"submission_id": "hs-dudu-9", "easy_accuracy": 79.17, "hard_accuracy": 0.79} | |
| {"submission_id": "hs-single", "easy_accuracy": 70.83, "hard_accuracy": 0.79} | |
| {"submission_id": "hs-single-k", "easy_accuracy": 80.56, "hard_accuracy": 0.79} | |
| {"submission_id": "hs-single1", "easy_accuracy": 65.28, "hard_accuracy": 0.79} | |
| {"submission_id": "hs-single120216", "easy_accuracy": 83.33, "hard_accuracy": 0.79} | |
| {"submission_id": "hs-single120217", "easy_accuracy": 70.83, "hard_accuracy": 0.79} | |
| {"submission_id": "hstest-hstest", "easy_accuracy": 76.39, "hard_accuracy": 0.0} | |
| {"submission_id": "iie-deepseek_based_backward", "easy_accuracy": 73.61, "hard_accuracy": 15.87} | |
| {"submission_id": "initial-test-initial-test", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "jaytest-jaytest8", "easy_accuracy": 6.94, "hard_accuracy": 0.0} | |
| {"submission_id": "jaytest-jaytest9", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "jaytest-t6", "easy_accuracy": 6.94, "hard_accuracy": 0.0} | |
| {"submission_id": "just_a_test-just_a_test", "easy_accuracy": 56.94, "hard_accuracy": 1.06} | |
| {"submission_id": "just_a_test_2-just_a_test_2", "easy_accuracy": 52.78, "hard_accuracy": 10.05} | |
| {"submission_id": "kuriboh-org-kuriboh", "easy_accuracy": 77.78, "hard_accuracy": 47.09} | |
| {"submission_id": "llmtech-ds-agent-test", "easy_accuracy": 80.56, "hard_accuracy": 41.01} | |
| {"submission_id": "llmtech-ds-agent-test-123", "easy_accuracy": 97.22, "hard_accuracy": 66.93} | |
| {"submission_id": "llmtech-submission_sonnet4_5_read_manual_default", "easy_accuracy": 81.94, "hard_accuracy": 37.83} | |
| {"submission_id": "lol-lol", "easy_accuracy": 77.78, "hard_accuracy": 57.14} | |
| {"submission_id": "lol2-lol2", "easy_accuracy": 77.78, "hard_accuracy": 57.14} | |
| {"submission_id": "marvell-doanh", "easy_accuracy": 36.11, "hard_accuracy": 3.17} | |
| {"submission_id": "mass-a-agent-sync-pro-v1", "easy_accuracy": 76.39, "hard_accuracy": 31.75} | |
| {"submission_id": "microsoft-chatgpt 4.1", "easy_accuracy": 4.17, "hard_accuracy": 0.79} | |
| {"submission_id": "mimi-test-mimi-c", "easy_accuracy": 70.83, "hard_accuracy": 19.58} | |
| {"submission_id": "msr-basic", "easy_accuracy": 69.44, "hard_accuracy": 5.03} | |
| {"submission_id": "msr-basic gpt-5-mini", "easy_accuracy": 69.44, "hard_accuracy": 14.81} | |
| {"submission_id": "msr-basic-gpt5-nano", "easy_accuracy": 45.83, "hard_accuracy": 2.38} | |
| {"submission_id": "msr-basic5", "easy_accuracy": 75.0, "hard_accuracy": 15.61} | |
| {"submission_id": "na-dsa-test-flash-all-251028", "easy_accuracy": 27.78, "hard_accuracy": 5.29} | |
| {"submission_id": "na-dsa-test-pro-all-251028", "easy_accuracy": 51.39, "hard_accuracy": 5.56} | |
| {"submission_id": "na-dsa-test-pro-all-g3-251030", "easy_accuracy": 55.56, "hard_accuracy": 14.02} | |
| {"submission_id": "naruto-org-naruto", "easy_accuracy": 83.33, "hard_accuracy": 48.68} | |
| {"submission_id": "nobody-test", "easy_accuracy": 47.22, "hard_accuracy": 3.17} | |
| {"submission_id": "nul-null-o3--2", "easy_accuracy": 4.17, "hard_accuracy": 5.56} | |
| {"submission_id": "nul-null-o3-43_47", "easy_accuracy": 2.78, "hard_accuracy": 0.79} | |
| {"submission_id": "nul-null-o3-60_69", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-TEST-mid-1", "easy_accuracy": 4.17, "hard_accuracy": 1.32} | |
| {"submission_id": "null-ablation-vld-easy", "easy_accuracy": 63.89, "hard_accuracy": 0.0} | |
| {"submission_id": "null-nua", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-nuaa", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-nuaaa", "easy_accuracy": 2.78, "hard_accuracy": 0.79} | |
| {"submission_id": "null-nuaaaa", "easy_accuracy": 6.94, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "null-null-13", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-16", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-16-17", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-16-17--2", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-16-19--4", "easy_accuracy": 4.17, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-17", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-17--2", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-17--4", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-17-19", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-18", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-1817-1822", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-null-1817-1822--2", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-null-1823-1868", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-null-1861-1868", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-null-19", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-1t", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-22", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-22-23", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-25", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-27-29", "easy_accuracy": 4.17, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-3", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-32", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-33_35_39", "easy_accuracy": 4.17, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-38", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-38--2", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-40", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-42_44", "easy_accuracy": 2.78, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-43", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-43---3", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-43---4", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-43--2", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-43--3", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-43_47---3", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-43_47--3", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-45", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-45-72", "easy_accuracy": 18.06, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-46", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-46--2", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-47--2", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-47--3", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-47--4", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-47_71", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-48-52", "easy_accuracy": 5.56, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-48-72", "easy_accuracy": 15.28, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-49", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-49--2", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-49-52", "easy_accuracy": 4.17, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-4t", "easy_accuracy": 5.56, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-51-52", "easy_accuracy": 2.78, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-53", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-53--2", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-53-55", "easy_accuracy": 2.78, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-56-58", "easy_accuracy": 4.17, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-59", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-5t", "easy_accuracy": 4.17, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-5t1", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-5t2", "easy_accuracy": 2.78, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-6-7", "easy_accuracy": 2.78, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-60---3", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-61", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-61--2", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-62", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-62--2", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-63", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-65", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-65-66", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-65-68", "easy_accuracy": 2.78, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-65-68--2", "easy_accuracy": 5.56, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-68-69", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-68-72", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-69", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-69--1", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-70--1", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-70--2", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-71", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-72", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-abc", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-null-all-not", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-null-i", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-no", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null-o3", "easy_accuracy": 4.17, "hard_accuracy": 1.59} | |
| {"submission_id": "null-null-o3-60", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null0", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "null-null1", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "null-null11", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "null-null111", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-null1111", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "null-null11111", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "null-null111111", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-null17-19", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null2", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-null3", "easy_accuracy": 4.17, "hard_accuracy": 0.79} | |
| {"submission_id": "null-null4", "easy_accuracy": 4.17, "hard_accuracy": 1.59} | |
| {"submission_id": "null-null5", "easy_accuracy": 4.17, "hard_accuracy": 1.32} | |
| {"submission_id": "null-nulla", "easy_accuracy": 2.78, "hard_accuracy": 0.79} | |
| {"submission_id": "null-nulla1", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-nulla15", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-nulla1819", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-nulla2", "easy_accuracy": 1.39, "hard_accuracy": 0.79} | |
| {"submission_id": "null-nulla5", "easy_accuracy": 5.56, "hard_accuracy": 0.79} | |
| {"submission_id": "null-nulla8", "easy_accuracy": 5.56, "hard_accuracy": 0.79} | |
| {"submission_id": "null-nullaaaaa", "easy_accuracy": 2.78, "hard_accuracy": 0.79} | |
| {"submission_id": "null-nullb", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-nulll", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-nullll", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-nulllll", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "null-test-08186-s", "easy_accuracy": 4.17, "hard_accuracy": 22.22} | |
| {"submission_id": "null-test-0819-lst-no", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-0819-lst1", "easy_accuracy": 4.17, "hard_accuracy": 4.5} | |
| {"submission_id": "null-test-0819-lst1-2", "easy_accuracy": 4.17, "hard_accuracy": 5.29} | |
| {"submission_id": "null-test-0819-num-1", "easy_accuracy": 4.17, "hard_accuracy": 5.29} | |
| {"submission_id": "null-test-0819-round", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-0819-round-2", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-0819-yr-1", "easy_accuracy": 4.17, "hard_accuracy": 1.06} | |
| {"submission_id": "null-test-0819-yr-1-abl-1", "easy_accuracy": 4.17, "hard_accuracy": 1.06} | |
| {"submission_id": "null-test-093821-s", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-333", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-8019-cs", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "null-test-8019-cs-2", "easy_accuracy": 4.17, "hard_accuracy": 1.06} | |
| {"submission_id": "null-test-8019-cs-3", "easy_accuracy": 4.17, "hard_accuracy": 1.59} | |
| {"submission_id": "null-test-8019-cs-4", "easy_accuracy": 4.17, "hard_accuracy": 1.06} | |
| {"submission_id": "null-test-8019-cs-5", "easy_accuracy": 4.17, "hard_accuracy": 4.5} | |
| {"submission_id": "null-test-8019-dlt-1", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "null-test-8019-dlt-1-abl", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "null-test-8019-dlt-1-abl2", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-8019-dlt-1-abl3", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-8019-dlt-1-abl4", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-8019-dlt-1-abl4-1", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-8019-dlt-1-abl4-2", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-8019-dlt-1-abl4-3", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-8019-dlt-1-abl4-4", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-8019-dlt-1-abl4-5", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-8019-total-1", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-8020-rf", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-8020-rf-1", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "null-test-8020-rf-1-all", "easy_accuracy": 4.17, "hard_accuracy": 0.79} | |
| {"submission_id": "null-test-8020-rf-1-all-abs", "easy_accuracy": 4.17, "hard_accuracy": 0.79} | |
| {"submission_id": "null-test-8020-rf-1-all-abs-abl", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "null-test-85-2", "easy_accuracy": 4.17, "hard_accuracy": 1.32} | |
| {"submission_id": "null-test-9-1", "easy_accuracy": 4.17, "hard_accuracy": 1.59} | |
| {"submission_id": "null-test-9-2", "easy_accuracy": 4.17, "hard_accuracy": 0.79} | |
| {"submission_id": "null-test-9-3", "easy_accuracy": 4.17, "hard_accuracy": 2.12} | |
| {"submission_id": "null-test-9-4", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "null-test-AgenticData", "easy_accuracy": 94.44, "hard_accuracy": 70.63} | |
| {"submission_id": "null-test-AgenticData-0916", "easy_accuracy": 94.44, "hard_accuracy": 52.91} | |
| {"submission_id": "null-test-AgenticData-0918", "easy_accuracy": 94.44, "hard_accuracy": 60.05} | |
| {"submission_id": "null-test-a-1", "easy_accuracy": 4.17, "hard_accuracy": 0.53} | |
| {"submission_id": "null-test-a-2", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-abl-dp", "easy_accuracy": 61.11, "hard_accuracy": 18.25} | |
| {"submission_id": "null-test-abl-mem", "easy_accuracy": 62.5, "hard_accuracy": 29.1} | |
| {"submission_id": "null-test-abl-vld", "easy_accuracy": 4.17, "hard_accuracy": 26.98} | |
| {"submission_id": "null-test-ac-1", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-ac-2", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-ac-a", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-ac-b", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-ac-c", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-ac-d", "easy_accuracy": 4.17, "hard_accuracy": 2.65} | |
| {"submission_id": "null-test-ac-e", "easy_accuracy": 4.17, "hard_accuracy": 2.65} | |
| {"submission_id": "null-test-ac-f", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-aci-3", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-aci-4", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "null-test-agent-ds", "easy_accuracy": 94.44, "hard_accuracy": 40.48} | |
| {"submission_id": "null-test-backup", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "null-test-baseline-42", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-baseline-62", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-baseline-72", "easy_accuracy": 4.17, "hard_accuracy": 0.53} | |
| {"submission_id": "null-test-baseline-92", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-cc-1", "easy_accuracy": 4.17, "hard_accuracy": 1.59} | |
| {"submission_id": "null-test-cc-12", "easy_accuracy": 4.17, "hard_accuracy": 4.5} | |
| {"submission_id": "null-test-cs-1", "easy_accuracy": 4.17, "hard_accuracy": 2.91} | |
| {"submission_id": "null-test-dummy-123", "easy_accuracy": 94.44, "hard_accuracy": 40.48} | |
| {"submission_id": "null-test-dummy-123-01", "easy_accuracy": 4.17, "hard_accuracy": 10.58} | |
| {"submission_id": "null-test-dummy-123-02", "easy_accuracy": 4.17, "hard_accuracy": 7.14} | |
| {"submission_id": "null-test-dummy-123-03", "easy_accuracy": 4.17, "hard_accuracy": 14.81} | |
| {"submission_id": "null-test-dummy-123-04", "easy_accuracy": 4.17, "hard_accuracy": 6.08} | |
| {"submission_id": "null-test-dummy-234-001", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-dummy-234-002", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "null-test-dummy-234-01", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-dummy-234-02", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-fee-c", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-fee-d", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-fee-debug", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-fee-debug-ablation", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-fee-debug-cards", "easy_accuracy": 4.17, "hard_accuracy": 4.23} | |
| {"submission_id": "null-test-fee-debug-count", "easy_accuracy": 4.17, "hard_accuracy": 1.59} | |
| {"submission_id": "null-test-fee-debug-h", "easy_accuracy": 4.17, "hard_accuracy": 3.44} | |
| {"submission_id": "null-test-fee-debug-months", "easy_accuracy": 4.17, "hard_accuracy": 1.32} | |
| {"submission_id": "null-test-fee-debug-total", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-fee-e1", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-fee-e1-ablation", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-fee-e1-ablation-2", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "null-test-fee-e1-ablation-3", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-fee-list", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-feeid", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "null-test-feeid2", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-fees-1", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-fees-2", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-ff", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "null-test-fmt-28", "easy_accuracy": 4.17, "hard_accuracy": 0.79} | |
| {"submission_id": "null-test-format", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-format-2", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-format-3", "easy_accuracy": 4.17, "hard_accuracy": 0.79} | |
| {"submission_id": "null-test-general", "easy_accuracy": 4.17, "hard_accuracy": 1.06} | |
| {"submission_id": "null-test-h1", "easy_accuracy": 4.17, "hard_accuracy": 4.23} | |
| {"submission_id": "null-test-in-general", "easy_accuracy": 4.17, "hard_accuracy": 1.06} | |
| {"submission_id": "null-test-init", "easy_accuracy": 0.0, "hard_accuracy": 6.08} | |
| {"submission_id": "null-test-list-1", "easy_accuracy": 4.17, "hard_accuracy": 4.5} | |
| {"submission_id": "null-test-max-a", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "null-test-max-b", "easy_accuracy": 4.17, "hard_accuracy": 0.53} | |
| {"submission_id": "null-test-max-c", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-max-d", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-max-db", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-max-dc", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "null-test-max-dd", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-mc-1-ne", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-merge0", "easy_accuracy": 0.0, "hard_accuracy": 15.87} | |
| {"submission_id": "null-test-mf-1", "easy_accuracy": 4.17, "hard_accuracy": 1.85} | |
| {"submission_id": "null-test-mid-2", "easy_accuracy": 4.17, "hard_accuracy": 3.17} | |
| {"submission_id": "null-test-month-ablation", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-month-ablation-2", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "null-test-month-all", "easy_accuracy": 4.17, "hard_accuracy": 4.76} | |
| {"submission_id": "null-test-month-all-ablation-1", "easy_accuracy": 4.17, "hard_accuracy": 1.85} | |
| {"submission_id": "null-test-no1", "easy_accuracy": 4.17, "hard_accuracy": 5.29} | |
| {"submission_id": "null-test-no2", "easy_accuracy": 4.17, "hard_accuracy": 3.17} | |
| {"submission_id": "null-test-no5", "easy_accuracy": 4.17, "hard_accuracy": 1.32} | |
| {"submission_id": "null-test-no7", "easy_accuracy": 4.17, "hard_accuracy": 4.23} | |
| {"submission_id": "null-test-no8", "easy_accuracy": 4.17, "hard_accuracy": 5.29} | |
| {"submission_id": "null-test-oop", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-re-1", "easy_accuracy": 4.17, "hard_accuracy": 0.26} | |
| {"submission_id": "null-test-re-2", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-re-2-p", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-remain", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-remain-2", "easy_accuracy": 4.17, "hard_accuracy": 1.06} | |
| {"submission_id": "null-test-ro-1", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "null-test-special", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "null-test-trick", "easy_accuracy": 4.17, "hard_accuracy": 0.53} | |
| {"submission_id": "o41-o41", "easy_accuracy": 86.11, "hard_accuracy": 48.94} | |
| {"submission_id": "ob-ob-agent", "easy_accuracy": 83.33, "hard_accuracy": 48.15} | |
| {"submission_id": "oceanbase-DUDU", "easy_accuracy": 75.0, "hard_accuracy": 0.79} | |
| {"submission_id": "oceanbase-dudu", "easy_accuracy": 84.72, "hard_accuracy": 0.79} | |
| {"submission_id": "oceanbase-dudu-12", "easy_accuracy": 76.39, "hard_accuracy": 0.79} | |
| {"submission_id": "oceanbase-dudu-12-1", "easy_accuracy": 76.39, "hard_accuracy": 0.79} | |
| {"submission_id": "oceanbase-dudu-1203", "easy_accuracy": 84.72, "hard_accuracy": 0.79} | |
| {"submission_id": "oceanbase-dudu-120317", "easy_accuracy": 88.89, "hard_accuracy": 0.79} | |
| {"submission_id": "oceanbase-dudu-1205", "easy_accuracy": 93.06, "hard_accuracy": 0.79} | |
| {"submission_id": "old-old-one", "easy_accuracy": 77.78, "hard_accuracy": 35.98} | |
| {"submission_id": "oppa-oppa", "easy_accuracy": 70.83, "hard_accuracy": 43.92} | |
| {"submission_id": "org-1-test-agent-claude-0731", "easy_accuracy": 4.17, "hard_accuracy": 7.41} | |
| {"submission_id": "org-1-test-agent-claude-0827", "easy_accuracy": 4.17, "hard_accuracy": 11.64} | |
| {"submission_id": "org1-GPT4.1-5-25", "easy_accuracy": 4.17, "hard_accuracy": 2.12} | |
| {"submission_id": "org1-GPT4.1-5-25-2", "easy_accuracy": 4.17, "hard_accuracy": 2.12} | |
| {"submission_id": "org1-GPT4.1-TestAgent-0530", "easy_accuracy": 73.61, "hard_accuracy": 25.66} | |
| {"submission_id": "org1-GPT4.1-v2-50", "easy_accuracy": 4.17, "hard_accuracy": 1.32} | |
| {"submission_id": "org1-GPT4.1-v2-50-2", "easy_accuracy": 4.17, "hard_accuracy": 2.65} | |
| {"submission_id": "org1-GPT4.1-v2-50-2-double", "easy_accuracy": 4.17, "hard_accuracy": 2.65} | |
| {"submission_id": "org1-claude3.7-50", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "org1-claude3.7-50-2", "easy_accuracy": 4.17, "hard_accuracy": 1.32} | |
| {"submission_id": "org1-claude3.7-test", "easy_accuracy": 4.17, "hard_accuracy": 2.65} | |
| {"submission_id": "org1-claude3.7-test-50", "easy_accuracy": 4.17, "hard_accuracy": 2.38} | |
| {"submission_id": "org1-deepseek-0620", "easy_accuracy": 4.17, "hard_accuracy": 3.17} | |
| {"submission_id": "org1-deepseek-test-agent-06118", "easy_accuracy": 4.17, "hard_accuracy": 2.12} | |
| {"submission_id": "org1-deepseek-testagent-0626", "easy_accuracy": 4.17, "hard_accuracy": 7.67} | |
| {"submission_id": "org1-deepseek-testagent-0703", "easy_accuracy": 4.17, "hard_accuracy": 2.38} | |
| {"submission_id": "org1-deepseekr1- 0616-50", "easy_accuracy": 4.17, "hard_accuracy": 1.85} | |
| {"submission_id": "org1-gpt4.1-100", "easy_accuracy": 4.17, "hard_accuracy": 6.08} | |
| {"submission_id": "org1-gpt4.1-100-0613", "easy_accuracy": 4.17, "hard_accuracy": 6.08} | |
| {"submission_id": "org1-gpt4.1-200", "easy_accuracy": 4.17, "hard_accuracy": 12.96} | |
| {"submission_id": "org1-gpt4.1-50", "easy_accuracy": 4.17, "hard_accuracy": 2.65} | |
| {"submission_id": "org1-gpt4.1-50-0604", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "org1-gpt4.1-50-0604-2", "easy_accuracy": 4.17, "hard_accuracy": 1.06} | |
| {"submission_id": "org1-gpt4.1-50-0606", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "org1-gpt4.1-50-0606-2", "easy_accuracy": 4.17, "hard_accuracy": 1.85} | |
| {"submission_id": "org1-gpt4.1-ALL", "easy_accuracy": 4.17, "hard_accuracy": 25.66} | |
| {"submission_id": "org1-gpt4.1-double-50", "easy_accuracy": 4.17, "hard_accuracy": 2.65} | |
| {"submission_id": "org1-gpt4.1-testAgent-0609-1", "easy_accuracy": 4.17, "hard_accuracy": 1.06} | |
| {"submission_id": "org1-gpt4.1-testAgent-0614-200", "easy_accuracy": 4.17, "hard_accuracy": 9.52} | |
| {"submission_id": "org1-gpt4.1-testAgent-0615-200", "easy_accuracy": 4.17, "hard_accuracy": 12.7} | |
| {"submission_id": "org1-test-agent-0731", "easy_accuracy": 4.17, "hard_accuracy": 7.41} | |
| {"submission_id": "org1-test-agent-0914", "easy_accuracy": 4.17, "hard_accuracy": 5.29} | |
| {"submission_id": "org1-test-agent-claude-0715", "easy_accuracy": 4.17, "hard_accuracy": 5.03} | |
| {"submission_id": "org1-test-agent-claude-0716", "easy_accuracy": 4.17, "hard_accuracy": 8.99} | |
| {"submission_id": "org1-test-agent-claude-0717", "easy_accuracy": 4.17, "hard_accuracy": 10.58} | |
| {"submission_id": "org1-test-agent-claude-0821", "easy_accuracy": 4.17, "hard_accuracy": 17.99} | |
| {"submission_id": "org1-test-agent-claude-0821-2", "easy_accuracy": 4.17, "hard_accuracy": 21.96} | |
| {"submission_id": "org1-test-agent-claude-0904", "easy_accuracy": 4.17, "hard_accuracy": 12.96} | |
| {"submission_id": "org1-test-agent-claude-0920", "easy_accuracy": 4.17, "hard_accuracy": 7.67} | |
| {"submission_id": "org1-test-claude-0723", "easy_accuracy": 75.0, "hard_accuracy": 31.22} | |
| {"submission_id": "org1-test-claude-0724", "easy_accuracy": 75.0, "hard_accuracy": 41.53} | |
| {"submission_id": "pernsonal-q_test_da_prm7B_hard_label", "easy_accuracy": 75.0, "hard_accuracy": 28.57} | |
| {"submission_id": "personal test for react tra-cbbb react test_tra", "easy_accuracy": 58.33, "hard_accuracy": 20.37} | |
| {"submission_id": "personal test-cbbb free", "easy_accuracy": 76.39, "hard_accuracy": 8.2} | |
| {"submission_id": "personal test-cbbb free reason", "easy_accuracy": 79.17, "hard_accuracy": 8.99} | |
| {"submission_id": "personal test-cbbb o3-mini reasoning react", "easy_accuracy": 77.78, "hard_accuracy": 8.99} | |
| {"submission_id": "personal test-cbbb o3-mini reasoning react_tra", "easy_accuracy": 34.72, "hard_accuracy": 21.43} | |
| {"submission_id": "personal test-cbbb o3-mini reasoning react_tra111", "easy_accuracy": 73.61, "hard_accuracy": 21.16} | |
| {"submission_id": "personal test-cbbb react 2s o3-mini reason", "easy_accuracy": 73.61, "hard_accuracy": 12.43} | |
| {"submission_id": "personal test-cbbb react test", "easy_accuracy": 26.39, "hard_accuracy": 2.91} | |
| {"submission_id": "personal test-cbbb react test_tra", "easy_accuracy": 34.72, "hard_accuracy": 21.43} | |
| {"submission_id": "personal use-cbbb react test", "easy_accuracy": 69.44, "hard_accuracy": 5.03} | |
| {"submission_id": "personal use-cbbb test 2.0", "easy_accuracy": 52.78, "hard_accuracy": 4.5} | |
| {"submission_id": "personal user-cbbb agent", "easy_accuracy": 23.61, "hard_accuracy": 2.65} | |
| {"submission_id": "personal-0815 cbbb runs", "easy_accuracy": 61.11, "hard_accuracy": 6.61} | |
| {"submission_id": "personal-0815 cbbb runs reason", "easy_accuracy": 63.89, "hard_accuracy": 6.88} | |
| {"submission_id": "personal-0815 cbbb runs2", "easy_accuracy": 62.5, "hard_accuracy": 23.28} | |
| {"submission_id": "personal-0815 cbbb runs2 reason", "easy_accuracy": 73.61, "hard_accuracy": 24.34} | |
| {"submission_id": "personal-0815 cbbb runs3 reason", "easy_accuracy": 66.67, "hard_accuracy": 11.64} | |
| {"submission_id": "personal-251110_1", "easy_accuracy": 59.72, "hard_accuracy": 5.29} | |
| {"submission_id": "personal-BiPaS", "easy_accuracy": 66.67, "hard_accuracy": 26.72} | |
| {"submission_id": "personal-BiPaS-2", "easy_accuracy": 58.33, "hard_accuracy": 24.6} | |
| {"submission_id": "personal-BiPaS-baseline1", "easy_accuracy": 68.06, "hard_accuracy": 13.49} | |
| {"submission_id": "personal-BiPaS-baseline2", "easy_accuracy": 66.67, "hard_accuracy": 9.79} | |
| {"submission_id": "personal-BiPaS-baseline3", "easy_accuracy": 76.39, "hard_accuracy": 13.49} | |
| {"submission_id": "personal-BiPas_noStructural", "easy_accuracy": 73.61, "hard_accuracy": 31.22} | |
| {"submission_id": "personal-Clalude code and gemini", "easy_accuracy": 100.0, "hard_accuracy": 98.94} | |
| {"submission_id": "personal-Claude code and Gemini", "easy_accuracy": 100.0, "hard_accuracy": 98.94} | |
| {"submission_id": "personal-ConfuseAgent", "easy_accuracy": 100.0, "hard_accuracy": 98.68} | |
| {"submission_id": "personal-Easy Agent Test 1", "easy_accuracy": 16.67, "hard_accuracy": 0.0} | |
| {"submission_id": "personal-Easy Agent Test 2", "easy_accuracy": 16.67, "hard_accuracy": 0.0} | |
| {"submission_id": "personal-Easy Agent Test 3", "easy_accuracy": 77.78, "hard_accuracy": 0.0} | |
| {"submission_id": "personal-Easy Agent Test 5", "easy_accuracy": 73.61, "hard_accuracy": 0.0} | |
| {"submission_id": "personal-Hard Agent Test 1", "easy_accuracy": 77.78, "hard_accuracy": 0.26} | |
| {"submission_id": "personal-Sup 4.3", "easy_accuracy": 75.0, "hard_accuracy": 23.02} | |
| {"submission_id": "personal-Sup 4.35", "easy_accuracy": 65.28, "hard_accuracy": 29.63} | |
| {"submission_id": "personal-Sup 4.36", "easy_accuracy": 66.67, "hard_accuracy": 25.66} | |
| {"submission_id": "personal-Sup 4.4", "easy_accuracy": 68.06, "hard_accuracy": 27.25} | |
| {"submission_id": "personal-Sup 4.41a", "easy_accuracy": 79.17, "hard_accuracy": 41.27} | |
| {"submission_id": "personal-Sup_rev_8", "easy_accuracy": 90.28, "hard_accuracy": 61.11} | |
| {"submission_id": "personal-abc test", "easy_accuracy": 77.78, "hard_accuracy": 8.47} | |
| {"submission_id": "personal-agent_prompt_test_1", "easy_accuracy": 70.83, "hard_accuracy": 12.17} | |
| {"submission_id": "personal-bar", "easy_accuracy": 66.67, "hard_accuracy": 12.43} | |
| {"submission_id": "personal-bb", "easy_accuracy": 45.83, "hard_accuracy": 7.14} | |
| {"submission_id": "personal-bb2", "easy_accuracy": 61.11, "hard_accuracy": 8.47} | |
| {"submission_id": "personal-bb3", "easy_accuracy": 56.94, "hard_accuracy": 9.79} | |
| {"submission_id": "personal-bb4", "easy_accuracy": 38.89, "hard_accuracy": 10.32} | |
| {"submission_id": "personal-bye", "easy_accuracy": 62.5, "hard_accuracy": 16.93} | |
| {"submission_id": "personal-cbbb agent test", "easy_accuracy": 47.22, "hard_accuracy": 3.97} | |
| {"submission_id": "personal-cbbb agent test2", "easy_accuracy": 54.17, "hard_accuracy": 4.23} | |
| {"submission_id": "personal-cbbb agent test3", "easy_accuracy": 51.39, "hard_accuracy": 3.97} | |
| {"submission_id": "personal-cbbb md runs", "easy_accuracy": 66.67, "hard_accuracy": 7.41} | |
| {"submission_id": "personal-cbbb md runs reason", "easy_accuracy": 73.61, "hard_accuracy": 6.88} | |
| {"submission_id": "personal-cbbb md runs2", "easy_accuracy": 62.5, "hard_accuracy": 19.84} | |
| {"submission_id": "personal-cbbb md runs2 gpt-4o", "easy_accuracy": 62.5, "hard_accuracy": 19.84} | |
| {"submission_id": "personal-cbbb md runs2 reason", "easy_accuracy": 76.39, "hard_accuracy": 21.96} | |
| {"submission_id": "personal-cbbb md runs3 reason", "easy_accuracy": 76.39, "hard_accuracy": 11.64} | |
| {"submission_id": "personal-cbbb run2 add", "easy_accuracy": 68.06, "hard_accuracy": 17.99} | |
| {"submission_id": "personal-cbbb run2 add o3-mini reason", "easy_accuracy": 77.78, "hard_accuracy": 21.43} | |
| {"submission_id": "personal-cbbb run3 add", "easy_accuracy": 73.61, "hard_accuracy": 10.85} | |
| {"submission_id": "personal-cbbb runs add gpt-4o", "easy_accuracy": 72.22, "hard_accuracy": 7.14} | |
| {"submission_id": "personal-cbbb runs add o3-mini reason", "easy_accuracy": 77.78, "hard_accuracy": 7.14} | |
| {"submission_id": "personal-claude sonnet 4.5", "easy_accuracy": 62.5, "hard_accuracy": 5.56} | |
| {"submission_id": "personal-code_agent_0708_2", "easy_accuracy": 52.78, "hard_accuracy": 8.47} | |
| {"submission_id": "personal-code_agent_0724_1", "easy_accuracy": 72.22, "hard_accuracy": 9.26} | |
| {"submission_id": "personal-code_agent_0724_2", "easy_accuracy": 66.67, "hard_accuracy": 15.08} | |
| {"submission_id": "personal-code_agent_0724_3", "easy_accuracy": 31.94, "hard_accuracy": 5.29} | |
| {"submission_id": "personal-code_agent_0724_4", "easy_accuracy": 73.61, "hard_accuracy": 13.76} | |
| {"submission_id": "personal-code_agent_0724_5", "easy_accuracy": 72.22, "hard_accuracy": 9.79} | |
| {"submission_id": "personal-code_agent_0725_1", "easy_accuracy": 61.11, "hard_accuracy": 9.26} | |
| {"submission_id": "personal-code_agent_0725_2", "easy_accuracy": 77.78, "hard_accuracy": 12.43} | |
| {"submission_id": "personal-code_agent_0728_1", "easy_accuracy": 45.83, "hard_accuracy": 4.5} | |
| {"submission_id": "personal-code_agent_0728_2", "easy_accuracy": 72.22, "hard_accuracy": 8.47} | |
| {"submission_id": "personal-code_agent_0728_3", "easy_accuracy": 66.67, "hard_accuracy": 10.05} | |
| {"submission_id": "personal-code_agent_0728_4", "easy_accuracy": 68.06, "hard_accuracy": 6.08} | |
| {"submission_id": "personal-code_agent_0729_1", "easy_accuracy": 63.89, "hard_accuracy": 15.61} | |
| {"submission_id": "personal-code_agent_0730_1", "easy_accuracy": 26.39, "hard_accuracy": 0.26} | |
| {"submission_id": "personal-code_agent_0730_2", "easy_accuracy": 69.44, "hard_accuracy": 11.11} | |
| {"submission_id": "personal-code_agent_0730_3", "easy_accuracy": 66.67, "hard_accuracy": 11.9} | |
| {"submission_id": "personal-code_agent_0801_1", "easy_accuracy": 61.11, "hard_accuracy": 7.41} | |
| {"submission_id": "personal-code_agent_0801_2", "easy_accuracy": 65.28, "hard_accuracy": 14.02} | |
| {"submission_id": "personal-code_agent_0802_1", "easy_accuracy": 63.89, "hard_accuracy": 6.88} | |
| {"submission_id": "personal-code_agent_0802_2", "easy_accuracy": 66.67, "hard_accuracy": 7.41} | |
| {"submission_id": "personal-code_agent_0802_3", "easy_accuracy": 72.22, "hard_accuracy": 16.14} | |
| {"submission_id": "personal-code_agent_0802_4", "easy_accuracy": 65.28, "hard_accuracy": 8.73} | |
| {"submission_id": "personal-code_agent_0802_5", "easy_accuracy": 62.5, "hard_accuracy": 6.35} | |
| {"submission_id": "personal-code_agent_0804_1", "easy_accuracy": 70.83, "hard_accuracy": 6.61} | |
| {"submission_id": "personal-code_agent_0804_2", "easy_accuracy": 72.22, "hard_accuracy": 13.23} | |
| {"submission_id": "personal-code_agent_0804_3", "easy_accuracy": 52.78, "hard_accuracy": 14.81} | |
| {"submission_id": "personal-code_agent_0805", "easy_accuracy": 50.0, "hard_accuracy": 14.55} | |
| {"submission_id": "personal-code_agent_0805_2", "easy_accuracy": 50.0, "hard_accuracy": 14.55} | |
| {"submission_id": "personal-code_agent_0805_3", "easy_accuracy": 70.83, "hard_accuracy": 14.55} | |
| {"submission_id": "personal-code_agent_0806_1", "easy_accuracy": 58.33, "hard_accuracy": 14.81} | |
| {"submission_id": "personal-code_agent_0806_2", "easy_accuracy": 65.28, "hard_accuracy": 8.2} | |
| {"submission_id": "personal-code_agent_0806_3", "easy_accuracy": 56.94, "hard_accuracy": 6.61} | |
| {"submission_id": "personal-code_agent_0806_4", "easy_accuracy": 56.94, "hard_accuracy": 10.58} | |
| {"submission_id": "personal-code_agent_0807_1", "easy_accuracy": 41.67, "hard_accuracy": 7.67} | |
| {"submission_id": "personal-code_agent_0807_3", "easy_accuracy": 55.56, "hard_accuracy": 10.05} | |
| {"submission_id": "personal-code_agent_0807_4", "easy_accuracy": 72.22, "hard_accuracy": 16.14} | |
| {"submission_id": "personal-code_agent_0808_1", "easy_accuracy": 72.22, "hard_accuracy": 13.76} | |
| {"submission_id": "personal-code_agent_0808_2", "easy_accuracy": 72.22, "hard_accuracy": 10.58} | |
| {"submission_id": "personal-code_agent_0808_4", "easy_accuracy": 73.61, "hard_accuracy": 12.96} | |
| {"submission_id": "personal-code_agent_0808_5", "easy_accuracy": 69.44, "hard_accuracy": 12.43} | |
| {"submission_id": "personal-code_agent_0811_1", "easy_accuracy": 66.67, "hard_accuracy": 13.76} | |
| {"submission_id": "personal-code_agent_0811_10", "easy_accuracy": 70.83, "hard_accuracy": 13.23} | |
| {"submission_id": "personal-code_agent_0811_11", "easy_accuracy": 72.22, "hard_accuracy": 13.23} | |
| {"submission_id": "personal-code_agent_0811_2", "easy_accuracy": 72.22, "hard_accuracy": 14.81} | |
| {"submission_id": "personal-code_agent_0811_3", "easy_accuracy": 81.94, "hard_accuracy": 13.49} | |
| {"submission_id": "personal-code_agent_0811_4", "easy_accuracy": 75.0, "hard_accuracy": 13.23} | |
| {"submission_id": "personal-code_agent_0811_5", "easy_accuracy": 76.39, "hard_accuracy": 10.32} | |
| {"submission_id": "personal-code_agent_0811_6", "easy_accuracy": 70.83, "hard_accuracy": 11.9} | |
| {"submission_id": "personal-code_agent_0811_7", "easy_accuracy": 77.78, "hard_accuracy": 10.85} | |
| {"submission_id": "personal-code_agent_0811_8", "easy_accuracy": 72.22, "hard_accuracy": 10.58} | |
| {"submission_id": "personal-code_agent_0811_9", "easy_accuracy": 69.44, "hard_accuracy": 13.49} | |
| {"submission_id": "personal-code_agent_0812_1", "easy_accuracy": 70.83, "hard_accuracy": 13.49} | |
| {"submission_id": "personal-code_agent_0812_2", "easy_accuracy": 77.78, "hard_accuracy": 13.49} | |
| {"submission_id": "personal-code_agent_0812_3", "easy_accuracy": 65.28, "hard_accuracy": 15.08} | |
| {"submission_id": "personal-code_agent_0812_4", "easy_accuracy": 73.61, "hard_accuracy": 14.29} | |
| {"submission_id": "personal-code_agent_0812_5", "easy_accuracy": 70.83, "hard_accuracy": 13.76} | |
| {"submission_id": "personal-code_agent_0812_6", "easy_accuracy": 70.83, "hard_accuracy": 11.9} | |
| {"submission_id": "personal-code_agent_0812_7", "easy_accuracy": 69.44, "hard_accuracy": 15.34} | |
| {"submission_id": "personal-code_agent_0812_8", "easy_accuracy": 73.61, "hard_accuracy": 11.64} | |
| {"submission_id": "personal-code_agent_0813_1", "easy_accuracy": 70.83, "hard_accuracy": 13.76} | |
| {"submission_id": "personal-code_agent_0813_2", "easy_accuracy": 69.44, "hard_accuracy": 12.17} | |
| {"submission_id": "personal-code_agent_0813_3", "easy_accuracy": 72.22, "hard_accuracy": 13.49} | |
| {"submission_id": "personal-code_agent_0813_4", "easy_accuracy": 70.83, "hard_accuracy": 12.43} | |
| {"submission_id": "personal-code_agent_0813_5", "easy_accuracy": 72.22, "hard_accuracy": 12.17} | |
| {"submission_id": "personal-code_agent_0813_6", "easy_accuracy": 69.44, "hard_accuracy": 12.43} | |
| {"submission_id": "personal-code_agent_0813_7", "easy_accuracy": 70.83, "hard_accuracy": 10.85} | |
| {"submission_id": "personal-code_agent_0814_1", "easy_accuracy": 68.06, "hard_accuracy": 6.61} | |
| {"submission_id": "personal-code_agent_0814_2", "easy_accuracy": 69.44, "hard_accuracy": 8.2} | |
| {"submission_id": "personal-code_agent_0814_3", "easy_accuracy": 75.0, "hard_accuracy": 7.67} | |
| {"submission_id": "personal-code_agent_0814_4", "easy_accuracy": 69.44, "hard_accuracy": 7.14} | |
| {"submission_id": "personal-code_agent_0814_5", "easy_accuracy": 72.22, "hard_accuracy": 5.82} | |
| {"submission_id": "personal-code_agent_0814_6", "easy_accuracy": 73.61, "hard_accuracy": 11.9} | |
| {"submission_id": "personal-code_agent_0814_7", "easy_accuracy": 72.22, "hard_accuracy": 7.41} | |
| {"submission_id": "personal-code_agent_0814_8", "easy_accuracy": 70.83, "hard_accuracy": 12.7} | |
| {"submission_id": "personal-code_agent_0815_1", "easy_accuracy": 70.83, "hard_accuracy": 14.29} | |
| {"submission_id": "personal-code_agent_0815_2", "easy_accuracy": 65.28, "hard_accuracy": 10.58} | |
| {"submission_id": "personal-code_agent_0815_3", "easy_accuracy": 75.0, "hard_accuracy": 14.29} | |
| {"submission_id": "personal-code_agent_0815_4", "easy_accuracy": 65.28, "hard_accuracy": 13.49} | |
| {"submission_id": "personal-code_agent_0815_5", "easy_accuracy": 73.61, "hard_accuracy": 12.17} | |
| {"submission_id": "personal-code_agent_0815_7", "easy_accuracy": 73.61, "hard_accuracy": 8.73} | |
| {"submission_id": "personal-code_agent_0815_8", "easy_accuracy": 69.44, "hard_accuracy": 8.47} | |
| {"submission_id": "personal-code_agent_0815_9", "easy_accuracy": 65.28, "hard_accuracy": 10.32} | |
| {"submission_id": "personal-code_agent_0822_1", "easy_accuracy": 37.5, "hard_accuracy": 4.76} | |
| {"submission_id": "personal-code_agent_0825_1", "easy_accuracy": 47.22, "hard_accuracy": 5.29} | |
| {"submission_id": "personal-code_agent_0827_1", "easy_accuracy": 69.44, "hard_accuracy": 15.34} | |
| {"submission_id": "personal-code_agent_0827_2", "easy_accuracy": 19.44, "hard_accuracy": 1.32} | |
| {"submission_id": "personal-code_agent_0827_3", "easy_accuracy": 23.61, "hard_accuracy": 1.85} | |
| {"submission_id": "personal-code_agent_0829_1", "easy_accuracy": 37.5, "hard_accuracy": 1.06} | |
| {"submission_id": "personal-code_agent_0902_1", "easy_accuracy": 37.5, "hard_accuracy": 0.53} | |
| {"submission_id": "personal-code_agent_0903_1", "easy_accuracy": 61.11, "hard_accuracy": 4.5} | |
| {"submission_id": "personal-code_agent_0905_1", "easy_accuracy": 66.67, "hard_accuracy": 10.58} | |
| {"submission_id": "personal-code_agent_0905_2", "easy_accuracy": 68.06, "hard_accuracy": 9.79} | |
| {"submission_id": "personal-code_agent_0905_3", "easy_accuracy": 68.06, "hard_accuracy": 10.32} | |
| {"submission_id": "personal-code_agent_0905_4", "easy_accuracy": 68.06, "hard_accuracy": 10.58} | |
| {"submission_id": "personal-code_agent_0909", "easy_accuracy": 69.44, "hard_accuracy": 10.58} | |
| {"submission_id": "personal-code_agent_0911_1", "easy_accuracy": 48.61, "hard_accuracy": 6.61} | |
| {"submission_id": "personal-code_agent_0912", "easy_accuracy": 54.17, "hard_accuracy": 5.56} | |
| {"submission_id": "personal-code_agent_0913", "easy_accuracy": 69.44, "hard_accuracy": 13.23} | |
| {"submission_id": "personal-code_agent_0925", "easy_accuracy": 75.0, "hard_accuracy": 36.51} | |
| {"submission_id": "personal-code_agent_1753248736", "easy_accuracy": 80.56, "hard_accuracy": 9.79} | |
| {"submission_id": "personal-code_agent_4", "easy_accuracy": 65.28, "hard_accuracy": 8.73} | |
| {"submission_id": "personal-code_agent_5", "easy_accuracy": 62.5, "hard_accuracy": 6.35} | |
| {"submission_id": "personal-code_agent_Qwen3-235B-A22B-2507", "easy_accuracy": 80.56, "hard_accuracy": 9.79} | |
| {"submission_id": "personal-code_agent_dp_1", "easy_accuracy": 48.61, "hard_accuracy": 4.76} | |
| {"submission_id": "personal-code_agent_hf_version", "easy_accuracy": 72.22, "hard_accuracy": 15.61} | |
| {"submission_id": "personal-code_agent_hf_version_2", "easy_accuracy": 79.17, "hard_accuracy": 13.49} | |
| {"submission_id": "personal-code_agent_test", "easy_accuracy": 38.89, "hard_accuracy": 2.65} | |
| {"submission_id": "personal-code_anget_0808_3", "easy_accuracy": 69.44, "hard_accuracy": 13.49} | |
| {"submission_id": "personal-devagent", "easy_accuracy": 69.44, "hard_accuracy": 10.58} | |
| {"submission_id": "personal-devagent-baseline", "easy_accuracy": 76.39, "hard_accuracy": 9.26} | |
| {"submission_id": "personal-exp003", "easy_accuracy": 29.17, "hard_accuracy": 2.38} | |
| {"submission_id": "personal-foo", "easy_accuracy": 70.83, "hard_accuracy": 12.96} | |
| {"submission_id": "personal-foo2", "easy_accuracy": 70.83, "hard_accuracy": 12.7} | |
| {"submission_id": "personal-foo3", "easy_accuracy": 75.0, "hard_accuracy": 24.6} | |
| {"submission_id": "personal-foo_think", "easy_accuracy": 81.94, "hard_accuracy": 18.52} | |
| {"submission_id": "personal-jaytest-11", "easy_accuracy": 77.78, "hard_accuracy": 5.03} | |
| {"submission_id": "personal-multi_agent_0917", "easy_accuracy": 75.0, "hard_accuracy": 35.19} | |
| {"submission_id": "personal-q_da_test_majority_voting_1121", "easy_accuracy": 75.0, "hard_accuracy": 39.95} | |
| {"submission_id": "personal-q_test_da-bs16-maxlen15", "easy_accuracy": 72.22, "hard_accuracy": 31.48} | |
| {"submission_id": "personal-q_test_da_bs32_maxlen_10", "easy_accuracy": 73.61, "hard_accuracy": 33.6} | |
| {"submission_id": "personal-q_test_da_bs_24_maxlen_10", "easy_accuracy": 72.22, "hard_accuracy": 31.48} | |
| {"submission_id": "personal-q_test_da_cot_level_qwen2.5_prm_72b", "easy_accuracy": 75.0, "hard_accuracy": 31.75} | |
| {"submission_id": "personal-q_test_da_cot_level_qwen2.5_prm_7b", "easy_accuracy": 70.83, "hard_accuracy": 32.28} | |
| {"submission_id": "personal-q_test_da_dataprm_4b", "easy_accuracy": 73.61, "hard_accuracy": 29.63} | |
| {"submission_id": "personal-q_test_da_majority_voting_bs_32_maxlen_10", "easy_accuracy": 75.0, "hard_accuracy": 39.95} | |
| {"submission_id": "personal-q_test_da_pass8_simple", "easy_accuracy": 100.0, "hard_accuracy": 99.21} | |
| {"submission_id": "personal-q_test_da_pass8_simple_true", "easy_accuracy": 86.11, "hard_accuracy": 45.24} | |
| {"submission_id": "personal-q_test_da_prm_qwen2.5", "easy_accuracy": 75.0, "hard_accuracy": 31.48} | |
| {"submission_id": "personal-q_test_da_prm_v2_4B", "easy_accuracy": 72.22, "hard_accuracy": 30.95} | |
| {"submission_id": "personal-q_test_da_qwen2.5_prm_72b", "easy_accuracy": 75.0, "hard_accuracy": 32.28} | |
| {"submission_id": "personal-q_test_da_qwen2.5_prm_7b", "easy_accuracy": 73.61, "hard_accuracy": 32.28} | |
| {"submission_id": "personal-q_test_dabstep_qwen2.5_code_7b_dagenprm_results", "easy_accuracy": 73.61, "hard_accuracy": 34.39} | |
| {"submission_id": "personal-q_test_dabstep_qwen2.5_code_7b_dagenprm_results_new", "easy_accuracy": 77.78, "hard_accuracy": 39.15} | |
| {"submission_id": "personal-q_test_dabstep_qwen2.5_code_7b_dagenprm_results_v2", "easy_accuracy": 73.61, "hard_accuracy": 34.39} | |
| {"submission_id": "personal-q_test_genprm_test_8", "easy_accuracy": 75.0, "hard_accuracy": 35.45} | |
| {"submission_id": "personal-structgpt", "easy_accuracy": 4.17, "hard_accuracy": 0.79} | |
| {"submission_id": "personal-sup", "easy_accuracy": 70.83, "hard_accuracy": 10.32} | |
| {"submission_id": "personal-sup 2.91", "easy_accuracy": 72.22, "hard_accuracy": 17.99} | |
| {"submission_id": "personal-sup 3", "easy_accuracy": 66.67, "hard_accuracy": 20.63} | |
| {"submission_id": "personal-sup 3.1", "easy_accuracy": 65.28, "hard_accuracy": 15.61} | |
| {"submission_id": "personal-sup 3.11", "easy_accuracy": 72.22, "hard_accuracy": 25.13} | |
| {"submission_id": "personal-sup 3.2", "easy_accuracy": 73.61, "hard_accuracy": 28.31} | |
| {"submission_id": "personal-sup 3.3", "easy_accuracy": 72.22, "hard_accuracy": 25.93} | |
| {"submission_id": "personal-sup 3.5", "easy_accuracy": 66.67, "hard_accuracy": 18.78} | |
| {"submission_id": "personal-sup 4", "easy_accuracy": 73.61, "hard_accuracy": 21.43} | |
| {"submission_id": "personal-sup 4.0", "easy_accuracy": 70.83, "hard_accuracy": 25.66} | |
| {"submission_id": "personal-sup 4.1", "easy_accuracy": 68.06, "hard_accuracy": 25.66} | |
| {"submission_id": "personal-sup 4.2", "easy_accuracy": 68.06, "hard_accuracy": 23.28} | |
| {"submission_id": "personal-sup 4.38", "easy_accuracy": 72.22, "hard_accuracy": 35.45} | |
| {"submission_id": "personal-sup 4.39", "easy_accuracy": 75.0, "hard_accuracy": 36.51} | |
| {"submission_id": "personal-sup 4.41", "easy_accuracy": 80.56, "hard_accuracy": 39.95} | |
| {"submission_id": "personal-sup 4.43", "easy_accuracy": 73.61, "hard_accuracy": 36.51} | |
| {"submission_id": "personal-sup 9.2", "easy_accuracy": 66.67, "hard_accuracy": 17.46} | |
| {"submission_id": "personal-sup 9.6", "easy_accuracy": 72.22, "hard_accuracy": 17.99} | |
| {"submission_id": "personal-sup v4", "easy_accuracy": 77.78, "hard_accuracy": 17.2} | |
| {"submission_id": "personal-sup v8", "easy_accuracy": 75.0, "hard_accuracy": 7.41} | |
| {"submission_id": "personal-sup v9", "easy_accuracy": 75.0, "hard_accuracy": 7.14} | |
| {"submission_id": "personal-sup v9 merged", "easy_accuracy": 72.22, "hard_accuracy": 6.08} | |
| {"submission_id": "personal-sup v9.1", "easy_accuracy": 75.0, "hard_accuracy": 7.14} | |
| {"submission_id": "personal-sup v9.4", "easy_accuracy": 70.83, "hard_accuracy": 14.02} | |
| {"submission_id": "personal-sup_a", "easy_accuracy": 72.22, "hard_accuracy": 11.9} | |
| {"submission_id": "personal-sup_v3", "easy_accuracy": 75.0, "hard_accuracy": 15.87} | |
| {"submission_id": "personal-sup_v4", "easy_accuracy": 76.39, "hard_accuracy": 16.93} | |
| {"submission_id": "personal-test", "easy_accuracy": 47.22, "hard_accuracy": 3.17} | |
| {"submission_id": "personal-test-10p", "easy_accuracy": 12.5, "hard_accuracy": 0.0} | |
| {"submission_id": "personal-test-11", "easy_accuracy": 2.78, "hard_accuracy": 0.0} | |
| {"submission_id": "personal-test-1sn", "easy_accuracy": 15.28, "hard_accuracy": 0.0} | |
| {"submission_id": "personal-test-40", "easy_accuracy": 79.17, "hard_accuracy": 10.58} | |
| {"submission_id": "personal-test-6p", "easy_accuracy": 2.78, "hard_accuracy": 0.0} | |
| {"submission_id": "personal-test-7p", "easy_accuracy": 2.78, "hard_accuracy": 0.0} | |
| {"submission_id": "personal-test-da-1111", "easy_accuracy": 72.22, "hard_accuracy": 31.22} | |
| {"submission_id": "personal-test-e", "easy_accuracy": 93.06, "hard_accuracy": 0.0} | |
| {"submission_id": "personal-test-e1", "easy_accuracy": 94.44, "hard_accuracy": 5.29} | |
| {"submission_id": "personal-test-e10", "easy_accuracy": 93.06, "hard_accuracy": 0.0} | |
| {"submission_id": "personal-test-e10a", "easy_accuracy": 93.06, "hard_accuracy": 2.65} | |
| {"submission_id": "personal-test-e12", "easy_accuracy": 93.06, "hard_accuracy": 21.69} | |
| {"submission_id": "personal-test-e14", "easy_accuracy": 95.83, "hard_accuracy": 41.8} | |
| {"submission_id": "personal-test-e15", "easy_accuracy": 95.83, "hard_accuracy": 43.65} | |
| {"submission_id": "personal-test-e16", "easy_accuracy": 0.0, "hard_accuracy": 6.61} | |
| {"submission_id": "personal-test-e17", "easy_accuracy": 0.0, "hard_accuracy": 1.85} | |
| {"submission_id": "personal-test-e2", "easy_accuracy": 94.44, "hard_accuracy": 10.58} | |
| {"submission_id": "personal-test-e20", "easy_accuracy": 0.0, "hard_accuracy": 0.26} | |
| {"submission_id": "personal-test-e21", "easy_accuracy": 95.83, "hard_accuracy": 43.65} | |
| {"submission_id": "personal-test-e23", "easy_accuracy": 0.0, "hard_accuracy": 1.06} | |
| {"submission_id": "personal-test-e24", "easy_accuracy": 0.0, "hard_accuracy": 0.79} | |
| {"submission_id": "personal-test-e25", "easy_accuracy": 0.0, "hard_accuracy": 0.53} | |
| {"submission_id": "personal-test-e26", "easy_accuracy": 0.0, "hard_accuracy": 0.0} | |
| {"submission_id": "personal-test-e27", "easy_accuracy": 0.0, "hard_accuracy": 6.35} | |
| {"submission_id": "personal-test-e28", "easy_accuracy": 0.0, "hard_accuracy": 4.5} | |
| {"submission_id": "personal-test-e4", "easy_accuracy": 94.44, "hard_accuracy": 10.58} | |
| {"submission_id": "personal-test-e49p", "easy_accuracy": 0.0, "hard_accuracy": 0.0} | |
| {"submission_id": "personal-test-e5", "easy_accuracy": 95.83, "hard_accuracy": 0.0} | |
| {"submission_id": "personal-test-e6", "easy_accuracy": 95.83, "hard_accuracy": 0.0} | |
| {"submission_id": "personal-test-e62a", "easy_accuracy": 0.0, "hard_accuracy": 14.29} | |
| {"submission_id": "personal-test-e63", "easy_accuracy": 0.0, "hard_accuracy": 15.34} | |
| {"submission_id": "personal-test-e64", "easy_accuracy": 0.0, "hard_accuracy": 15.87} | |
| {"submission_id": "personal-test-e7", "easy_accuracy": 95.83, "hard_accuracy": 31.22} | |
| {"submission_id": "personal-test-e8", "easy_accuracy": 95.83, "hard_accuracy": 37.04} | |
| {"submission_id": "personal-test-e9", "easy_accuracy": 95.83, "hard_accuracy": 37.04} | |
| {"submission_id": "personal-test-f1", "easy_accuracy": 15.28, "hard_accuracy": 0.0} | |
| {"submission_id": "personal-test-l8", "easy_accuracy": 11.11, "hard_accuracy": 0.0} | |
| {"submission_id": "personal-test-m2", "easy_accuracy": 2.78, "hard_accuracy": 0.0} | |
| {"submission_id": "personal-test-n", "easy_accuracy": 1.39, "hard_accuracy": 0.0} | |
| {"submission_id": "personal-test-s1", "easy_accuracy": 11.11, "hard_accuracy": 0.0} | |
| {"submission_id": "personal-test_0624_01", "easy_accuracy": 97.22, "hard_accuracy": 56.35} | |
| {"submission_id": "personal-test_0624_02", "easy_accuracy": 72.22, "hard_accuracy": 8.2} | |
| {"submission_id": "personal-test_0625_01", "easy_accuracy": 97.22, "hard_accuracy": 56.08} | |
| {"submission_id": "personal-test_1205", "easy_accuracy": 72.22, "hard_accuracy": 11.9} | |
| {"submission_id": "personal-test_1205_003", "easy_accuracy": 75.0, "hard_accuracy": 5.56} | |
| {"submission_id": "personal-test_1205_004", "easy_accuracy": 75.0, "hard_accuracy": 6.88} | |
| {"submission_id": "personal-test_1205_005", "easy_accuracy": 77.78, "hard_accuracy": 9.26} | |
| {"submission_id": "personal-test_1205_006", "easy_accuracy": 76.39, "hard_accuracy": 4.23} | |
| {"submission_id": "personal-test_1205_007", "easy_accuracy": 73.61, "hard_accuracy": 13.76} | |
| {"submission_id": "personal-test_1205_8", "easy_accuracy": 73.61, "hard_accuracy": 4.76} | |
| {"submission_id": "personal-test_1205_9", "easy_accuracy": 72.22, "hard_accuracy": 9.52} | |
| {"submission_id": "personal-test_1217", "easy_accuracy": 2.78, "hard_accuracy": 1.06} | |
| {"submission_id": "personal-test_1217_v1", "easy_accuracy": 77.78, "hard_accuracy": 23.02} | |
| {"submission_id": "personal-test_1217_v2", "easy_accuracy": 72.22, "hard_accuracy": 12.96} | |
| {"submission_id": "personal-test_1217_v3", "easy_accuracy": 73.61, "hard_accuracy": 9.79} | |
| {"submission_id": "personal-test_agent_1", "easy_accuracy": 70.83, "hard_accuracy": 12.17} | |
| {"submission_id": "personal-test_bar", "easy_accuracy": 76.39, "hard_accuracy": 12.7} | |
| {"submission_id": "personal-test_foo", "easy_accuracy": 68.06, "hard_accuracy": 14.02} | |
| {"submission_id": "personal-test_foo2", "easy_accuracy": 68.06, "hard_accuracy": 14.02} | |
| {"submission_id": "personal-wtest", "easy_accuracy": 68.06, "hard_accuracy": 7.67} | |
| {"submission_id": "personal-xyx-0623-01", "easy_accuracy": 97.22, "hard_accuracy": 56.35} | |
| {"submission_id": "personal-xyx-0623-02", "easy_accuracy": 97.22, "hard_accuracy": 56.08} | |
| {"submission_id": "personal-xyx-0623-03", "easy_accuracy": 97.22, "hard_accuracy": 56.35} | |
| {"submission_id": "personal-xyx-0624-01", "easy_accuracy": 69.44, "hard_accuracy": 6.35} | |
| {"submission_id": "personal-xyx-0624-03", "easy_accuracy": 72.22, "hard_accuracy": 8.2} | |
| {"submission_id": "personal-xyx_0425", "easy_accuracy": 73.61, "hard_accuracy": 6.88} | |
| {"submission_id": "personal-xyx_0626_01", "easy_accuracy": 97.22, "hard_accuracy": 56.08} | |
| {"submission_id": "personal-xyx_0626_02", "easy_accuracy": 97.22, "hard_accuracy": 56.08} | |
| {"submission_id": "personal-xyx_expr", "easy_accuracy": 72.22, "hard_accuracy": 7.67} | |
| {"submission_id": "personal-xyx_test", "easy_accuracy": 16.67, "hard_accuracy": 1.32} | |
| {"submission_id": "personal-xyx_test2", "easy_accuracy": 18.06, "hard_accuracy": 1.59} | |
| {"submission_id": "personal-xyx_test3", "easy_accuracy": 22.22, "hard_accuracy": 0.26} | |
| {"submission_id": "personal-ych_agent", "easy_accuracy": 18.06, "hard_accuracy": 1.59} | |
| {"submission_id": "personal-ych_agent2", "easy_accuracy": 16.67, "hard_accuracy": 0.79} | |
| {"submission_id": "personal-ych_agent_1", "easy_accuracy": 63.89, "hard_accuracy": 4.76} | |
| {"submission_id": "personal-ych_agent_Pro15", "easy_accuracy": 63.89, "hard_accuracy": 4.76} | |
| {"submission_id": "personal-ych_agent_ds", "easy_accuracy": 16.67, "hard_accuracy": 0.79} | |
| {"submission_id": "personal-ych_agent_v3", "easy_accuracy": 65.28, "hard_accuracy": 5.29} | |
| {"submission_id": "personal-ych_codeagent", "easy_accuracy": 65.28, "hard_accuracy": 5.29} | |
| {"submission_id": "pqpoqe-agent_testv2", "easy_accuracy": 72.22, "hard_accuracy": 10.05} | |
| {"submission_id": "pqpoqe-agent_testv2qwen", "easy_accuracy": 72.22, "hard_accuracy": 10.05} | |
| {"submission_id": "pqpoqe-baseline", "easy_accuracy": 37.5, "hard_accuracy": 4.76} | |
| {"submission_id": "pqpoqe-baseline_myagent", "easy_accuracy": 62.5, "hard_accuracy": 8.73} | |
| {"submission_id": "pqpoqe-mytest_claude", "easy_accuracy": 12.5, "hard_accuracy": 0.79} | |
| {"submission_id": "pqpoqe-test myagent", "easy_accuracy": 62.5, "hard_accuracy": 8.73} | |
| {"submission_id": "pqpoqe-test myagent qwen", "easy_accuracy": 62.5, "hard_accuracy": 8.73} | |
| {"submission_id": "pqpoqe-test myagent4o", "easy_accuracy": 37.5, "hard_accuracy": 4.76} | |
| {"submission_id": "pqpoqe2-agent_testv2claude", "easy_accuracy": 12.5, "hard_accuracy": 0.79} | |
| {"submission_id": "pqpoqe2-agent_testv2q", "easy_accuracy": 72.22, "hard_accuracy": 10.05} | |
| {"submission_id": "prism-prism", "easy_accuracy": 76.39, "hard_accuracy": 0.79} | |
| {"submission_id": "private-Ready_player_one", "easy_accuracy": 77.78, "hard_accuracy": 46.3} | |
| {"submission_id": "private-coder", "easy_accuracy": 88.89, "hard_accuracy": 43.12} | |
| {"submission_id": "private-one", "easy_accuracy": 75.0, "hard_accuracy": 43.65} | |
| {"submission_id": "private-one1", "easy_accuracy": 75.0, "hard_accuracy": 43.65} | |
| {"submission_id": "private-one_1", "easy_accuracy": 77.78, "hard_accuracy": 44.71} | |
| {"submission_id": "qiyu-qiyu_01", "easy_accuracy": 86.11, "hard_accuracy": 41.8} | |
| {"submission_id": "qq da test ci agent cl-qq da test ci agent cl", "easy_accuracy": 79.17, "hard_accuracy": 20.63} | |
| {"submission_id": "qq da test ci agent db new prompt-qq da test ci agent db new prompt", "easy_accuracy": 30.56, "hard_accuracy": 9.79} | |
| {"submission_id": "qq da test ci agent ds new prompt-qq da test ci agent ds new prompt", "easy_accuracy": 70.83, "hard_accuracy": 11.64} | |
| {"submission_id": "qq da test ci agent ds-qq da test ci agent ds", "easy_accuracy": 72.22, "hard_accuracy": 7.94} | |
| {"submission_id": "qq da test ci agent qwen3 sft prompt-qq da test ci agent qwen3 sft prompt", "easy_accuracy": 51.39, "hard_accuracy": 3.44} | |
| {"submission_id": "qq da test ci agent sft new prompt-qq da test ci agent sft new prompt", "easy_accuracy": 20.83, "hard_accuracy": 6.08} | |
| {"submission_id": "qq da test ci agent sft sft prompt-qq da test ci agent sft sft prompt", "easy_accuracy": 59.72, "hard_accuracy": 3.97} | |
| {"submission_id": "qq da test ci agent sft0601 new pipeline-qq da test ci agent sft0601 new pipeline", "easy_accuracy": 44.44, "hard_accuracy": 4.23} | |
| {"submission_id": "qq da test ci agent sft0601 new pipeline-qq da test ci agent sft0601 new pipeline 2", "easy_accuracy": 45.83, "hard_accuracy": 5.29} | |
| {"submission_id": "qq da test ci agent sft0601-14 new pipeline-qq da test ci agent sft0601-14 new pipeline", "easy_accuracy": 44.44, "hard_accuracy": 5.56} | |
| {"submission_id": "qq da test ci agent sft0601-16 new pipeline-qq da test ci agent sft0601-16 new pipeline", "easy_accuracy": 62.5, "hard_accuracy": 5.56} | |
| {"submission_id": "qq da test ci agent sft0601-17 new pipeline-qq da test ci agent sft0601-17 new pipeline", "easy_accuracy": 45.83, "hard_accuracy": 2.65} | |
| {"submission_id": "qq da test ci agent-qq da test ci agent db", "easy_accuracy": 58.33, "hard_accuracy": 10.58} | |
| {"submission_id": "qq da test ci agent-qq da test ci agent sft", "easy_accuracy": 41.67, "hard_accuracy": 3.97} | |
| {"submission_id": "qq da test ci workflow-qq da test ci workflow ds", "easy_accuracy": 29.17, "hard_accuracy": 3.44} | |
| {"submission_id": "qwen-qiyu_002", "easy_accuracy": 83.33, "hard_accuracy": 54.5} | |
| {"submission_id": "rising-start-10-rising-start-10", "easy_accuracy": 81.94, "hard_accuracy": 56.08} | |
| {"submission_id": "rising-start-11-rising-start-11", "easy_accuracy": 0.0, "hard_accuracy": 53.17} | |
| {"submission_id": "rising-start-12-rising-start-12", "easy_accuracy": 0.0, "hard_accuracy": 33.6} | |
| {"submission_id": "rising-start-13-rising-start-13", "easy_accuracy": 0.0, "hard_accuracy": 53.17} | |
| {"submission_id": "rising-start-14-rising-start-14", "easy_accuracy": 0.0, "hard_accuracy": 44.44} | |
| {"submission_id": "rising-start-15-rising-start-15", "easy_accuracy": 0.0, "hard_accuracy": 50.26} | |
| {"submission_id": "rising-start-16-rising-start-16", "easy_accuracy": 0.0, "hard_accuracy": 43.65} | |
| {"submission_id": "rising-start-17-rising-start-17", "easy_accuracy": 0.0, "hard_accuracy": 43.65} | |
| {"submission_id": "rising-start-19-rising-start-19", "easy_accuracy": 0.0, "hard_accuracy": 51.32} | |
| {"submission_id": "rising-start-2-rising-start-2", "easy_accuracy": 69.44, "hard_accuracy": 33.86} | |
| {"submission_id": "rising-start-20-rising-start-20", "easy_accuracy": 0.0, "hard_accuracy": 53.7} | |
| {"submission_id": "rising-start-21-rising-start-21", "easy_accuracy": 0.0, "hard_accuracy": 58.73} | |
| {"submission_id": "rising-start-22-rising-start-22", "easy_accuracy": 0.0, "hard_accuracy": 61.38} | |
| {"submission_id": "rising-start-23-rising-start-23", "easy_accuracy": 0.0, "hard_accuracy": 51.59} | |
| {"submission_id": "rising-start-24-rising-start-24", "easy_accuracy": 0.0, "hard_accuracy": 56.35} | |
| {"submission_id": "rising-start-25-rising-start-25", "easy_accuracy": 0.0, "hard_accuracy": 57.41} | |
| {"submission_id": "rising-start-3-rising-start-3", "easy_accuracy": 0.0, "hard_accuracy": 50.79} | |
| {"submission_id": "rising-start-4-rising-start-4", "easy_accuracy": 76.39, "hard_accuracy": 55.03} | |
| {"submission_id": "rising-start-5-rising-start-5", "easy_accuracy": 76.39, "hard_accuracy": 53.97} | |
| {"submission_id": "rising-start-6-rising-start-6", "easy_accuracy": 72.22, "hard_accuracy": 51.85} | |
| {"submission_id": "rising-start-7-rising-start-7", "easy_accuracy": 0.0, "hard_accuracy": 52.91} | |
| {"submission_id": "rising-start-8-rising-start-8", "easy_accuracy": 81.94, "hard_accuracy": 56.08} | |
| {"submission_id": "rising-start-9-rising-start-9", "easy_accuracy": 81.94, "hard_accuracy": 56.08} | |
| {"submission_id": "s45-s45", "easy_accuracy": 87.5, "hard_accuracy": 49.47} | |
| {"submission_id": "sanity-check-1-sanity-check-1", "easy_accuracy": 76.39, "hard_accuracy": 55.03} | |
| {"submission_id": "sensetime-xhx", "easy_accuracy": 13.89, "hard_accuracy": 1.32} | |
| {"submission_id": "stce-test-stce-test", "easy_accuracy": 73.61, "hard_accuracy": 34.13} | |
| {"submission_id": "stce_test-stce_test", "easy_accuracy": 62.5, "hard_accuracy": 46.83} | |
| {"submission_id": "stce_test03-stce_test03", "easy_accuracy": 86.11, "hard_accuracy": 38.89} | |
| {"submission_id": "stce_test_v0.3-stce_test_v0.3", "easy_accuracy": 73.61, "hard_accuracy": 34.13} | |
| {"submission_id": "temp-syncer-agent-v2", "easy_accuracy": 79.17, "hard_accuracy": 34.92} | |
| {"submission_id": "temp-syncer-pro-aa-agent", "easy_accuracy": 79.17, "hard_accuracy": 34.92} | |
| {"submission_id": "test-1-test-1", "easy_accuracy": 65.28, "hard_accuracy": 8.73} | |
| {"submission_id": "test-2-test-2", "easy_accuracy": 72.22, "hard_accuracy": 11.64} | |
| {"submission_id": "test-3-test-3", "easy_accuracy": 75.0, "hard_accuracy": 29.63} | |
| {"submission_id": "test-4-test-4", "easy_accuracy": 77.78, "hard_accuracy": 36.24} | |
| {"submission_id": "test-5-test-5", "easy_accuracy": 77.78, "hard_accuracy": 39.42} | |
| {"submission_id": "test-Bob-the-builder", "easy_accuracy": 75.0, "hard_accuracy": 38.89} | |
| {"submission_id": "test-Sphinx-0.0.1", "easy_accuracy": 77.78, "hard_accuracy": 40.21} | |
| {"submission_id": "test-Sphinx-0.0.2", "easy_accuracy": 76.39, "hard_accuracy": 36.51} | |
| {"submission_id": "test-Test-qwq", "easy_accuracy": 59.72, "hard_accuracy": 4.5} | |
| {"submission_id": "test-V1.7-fixed", "easy_accuracy": 84.72, "hard_accuracy": 74.6} | |
| {"submission_id": "test-adav01", "easy_accuracy": 56.94, "hard_accuracy": 12.7} | |
| {"submission_id": "test-doanh", "easy_accuracy": 36.11, "hard_accuracy": 3.17} | |
| {"submission_id": "test-dv1.1", "easy_accuracy": 81.94, "hard_accuracy": 42.86} | |
| {"submission_id": "test-mle-test", "easy_accuracy": 87.5, "hard_accuracy": 32.8} | |
| {"submission_id": "test-mle-test-0", "easy_accuracy": 80.56, "hard_accuracy": 29.37} | |
| {"submission_id": "test-mle-test-1", "easy_accuracy": 80.56, "hard_accuracy": 27.25} | |
| {"submission_id": "test-mle-test-2", "easy_accuracy": 77.78, "hard_accuracy": 28.04} | |
| {"submission_id": "test-mle-test-max", "easy_accuracy": 88.89, "hard_accuracy": 44.18} | |
| {"submission_id": "test-mle-test-max-0", "easy_accuracy": 84.72, "hard_accuracy": 38.89} | |
| {"submission_id": "test-mle-test-max-0-1", "easy_accuracy": 90.28, "hard_accuracy": 41.53} | |
| {"submission_id": "test-mle-test-max-1", "easy_accuracy": 84.72, "hard_accuracy": 38.89} | |
| {"submission_id": "test-mle-test-max-1-1", "easy_accuracy": 84.72, "hard_accuracy": 38.89} | |
| {"submission_id": "test-mle-test-max-1-2", "easy_accuracy": 81.94, "hard_accuracy": 40.74} | |
| {"submission_id": "test-mle-test-max-2", "easy_accuracy": 84.72, "hard_accuracy": 38.89} | |
| {"submission_id": "test-mle-test-max-2-1", "easy_accuracy": 84.72, "hard_accuracy": 38.89} | |
| {"submission_id": "test-mle-test-max-2-2", "easy_accuracy": 84.72, "hard_accuracy": 38.89} | |
| {"submission_id": "test-mle-test-max-v2", "easy_accuracy": 88.89, "hard_accuracy": 49.47} | |
| {"submission_id": "test-org-Test Agent", "easy_accuracy": 100.0, "hard_accuracy": 64.81} | |
| {"submission_id": "test-org-Test Agent2", "easy_accuracy": 100.0, "hard_accuracy": 64.81} | |
| {"submission_id": "test-pangur_sys_v1", "easy_accuracy": 25.0, "hard_accuracy": 0.53} | |
| {"submission_id": "test-qwen3_coder_baseline", "easy_accuracy": 77.78, "hard_accuracy": 10.85} | |
| {"submission_id": "test-qwq_sys_v1", "easy_accuracy": 25.0, "hard_accuracy": 0.79} | |
| {"submission_id": "test-rk-test", "easy_accuracy": 77.78, "hard_accuracy": 40.21} | |
| {"submission_id": "test-singularity", "easy_accuracy": 76.39, "hard_accuracy": 42.59} | |
| {"submission_id": "test-stce-2-test-stce-2", "easy_accuracy": 81.94, "hard_accuracy": 46.56} | |
| {"submission_id": "test-test", "easy_accuracy": 73.61, "hard_accuracy": 0.0} | |
| {"submission_id": "test-test-0710", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "test-test-123-new-deepseek", "easy_accuracy": 75.0, "hard_accuracy": 11.11} | |
| {"submission_id": "test-test-12312312321123", "easy_accuracy": 75.0, "hard_accuracy": 38.62} | |
| {"submission_id": "test-test-9999", "easy_accuracy": 23.61, "hard_accuracy": 0.53} | |
| {"submission_id": "test-test-alpha-bravo-tango", "easy_accuracy": 72.22, "hard_accuracy": 38.62} | |
| {"submission_id": "test-test-bravo", "easy_accuracy": 75.0, "hard_accuracy": 38.62} | |
| {"submission_id": "test-test-hammer-123123123", "easy_accuracy": 72.22, "hard_accuracy": 33.6} | |
| {"submission_id": "test-test-humminbird-1", "easy_accuracy": 72.22, "hard_accuracy": 43.92} | |
| {"submission_id": "test-test-init", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "test-test-new-111112", "easy_accuracy": 23.61, "hard_accuracy": 0.53} | |
| {"submission_id": "test-test-new-111112-oai", "easy_accuracy": 27.78, "hard_accuracy": 0.79} | |
| {"submission_id": "test-test-p-r", "easy_accuracy": 16.67, "hard_accuracy": 0.26} | |
| {"submission_id": "test-test-qwq", "easy_accuracy": 38.89, "hard_accuracy": 2.65} | |
| {"submission_id": "test-test-sampled", "easy_accuracy": 25.0, "hard_accuracy": 0.53} | |
| {"submission_id": "test-test-stce", "easy_accuracy": 72.22, "hard_accuracy": 42.06} | |
| {"submission_id": "test-test-test-test-work", "easy_accuracy": 75.0, "hard_accuracy": 38.89} | |
| {"submission_id": "test-test1208", "easy_accuracy": 2.78, "hard_accuracy": 0.79} | |
| {"submission_id": "test-test1208-2", "easy_accuracy": 2.78, "hard_accuracy": 0.79} | |
| {"submission_id": "test-test1212", "easy_accuracy": 22.22, "hard_accuracy": 1.59} | |
| {"submission_id": "test-test1212-2", "easy_accuracy": 20.83, "hard_accuracy": 1.59} | |
| {"submission_id": "test-test1212-3", "easy_accuracy": 22.22, "hard_accuracy": 1.59} | |
| {"submission_id": "test-test1213", "easy_accuracy": 22.22, "hard_accuracy": 1.85} | |
| {"submission_id": "test-test1213-2", "easy_accuracy": 22.22, "hard_accuracy": 1.85} | |
| {"submission_id": "test-test1213-3", "easy_accuracy": 23.61, "hard_accuracy": 1.59} | |
| {"submission_id": "test-test1213-4", "easy_accuracy": 22.22, "hard_accuracy": 1.59} | |
| {"submission_id": "test-test1213-5", "easy_accuracy": 23.61, "hard_accuracy": 1.85} | |
| {"submission_id": "test-test1213-6", "easy_accuracy": 25.0, "hard_accuracy": 2.12} | |
| {"submission_id": "test-test1213-7", "easy_accuracy": 23.61, "hard_accuracy": 2.38} | |
| {"submission_id": "test-test_251022", "easy_accuracy": 75.0, "hard_accuracy": 7.67} | |
| {"submission_id": "test-testagent", "easy_accuracy": 72.22, "hard_accuracy": 11.11} | |
| {"submission_id": "test-tttest-agent", "easy_accuracy": 75.0, "hard_accuracy": 36.24} | |
| {"submission_id": "test-user-org-Test-user-1", "easy_accuracy": 81.94, "hard_accuracy": 25.13} | |
| {"submission_id": "test-user-org-test-user-submission-1", "easy_accuracy": 81.94, "hard_accuracy": 25.13} | |
| {"submission_id": "test-v.1.4", "easy_accuracy": 83.33, "hard_accuracy": 43.65} | |
| {"submission_id": "test-v1.1.2", "easy_accuracy": 79.17, "hard_accuracy": 39.68} | |
| {"submission_id": "test-v1.1.3", "easy_accuracy": 86.11, "hard_accuracy": 39.95} | |
| {"submission_id": "test-v1.1.3h", "easy_accuracy": 80.56, "hard_accuracy": 44.18} | |
| {"submission_id": "test-v1.3.fixed", "easy_accuracy": 86.11, "hard_accuracy": 44.18} | |
| {"submission_id": "test-v1.4.m13", "easy_accuracy": 88.89, "hard_accuracy": 49.74} | |
| {"submission_id": "test-v1.4.s", "easy_accuracy": 73.61, "hard_accuracy": 43.39} | |
| {"submission_id": "test-v1.4m", "easy_accuracy": 81.94, "hard_accuracy": 48.41} | |
| {"submission_id": "test-v1.5.meta", "easy_accuracy": 81.94, "hard_accuracy": 56.35} | |
| {"submission_id": "test-v1.6", "easy_accuracy": 86.11, "hard_accuracy": 78.57} | |
| {"submission_id": "test-v1.7", "easy_accuracy": 86.11, "hard_accuracy": 71.16} | |
| {"submission_id": "test-with-update-test-with-update", "easy_accuracy": 22.22, "hard_accuracy": 0.53} | |
| {"submission_id": "test-with-update-test-with-update2", "easy_accuracy": 25.0, "hard_accuracy": 0.26} | |
| {"submission_id": "test-with-update-test-with-update2-fr", "easy_accuracy": 25.0, "hard_accuracy": 0.79} | |
| {"submission_id": "test-zsl2", "easy_accuracy": 90.28, "hard_accuracy": 48.15} | |
| {"submission_id": "test-zzzz-1", "easy_accuracy": 23.61, "hard_accuracy": 0.79} | |
| {"submission_id": "test1-test1", "easy_accuracy": 73.61, "hard_accuracy": 0.0} | |
| {"submission_id": "test2-test2", "easy_accuracy": 73.61, "hard_accuracy": 0.0} | |
| {"submission_id": "test2_1-test2_1", "easy_accuracy": 73.61, "hard_accuracy": 0.0} | |
| {"submission_id": "test3-test3", "easy_accuracy": 76.39, "hard_accuracy": 0.0} | |
| {"submission_id": "test3_1-test3_1", "easy_accuracy": 73.61, "hard_accuracy": 0.0} | |
| {"submission_id": "test_agent_0818-test_agent_0818", "easy_accuracy": 0.0, "hard_accuracy": 1.59} | |
| {"submission_id": "test_org-test_agent_0920", "easy_accuracy": 0.0, "hard_accuracy": 11.11} | |
| {"submission_id": "test_org-test_agent_0921", "easy_accuracy": 0.0, "hard_accuracy": 11.38} | |
| {"submission_id": "test_org_01-test_agent_0801", "easy_accuracy": 2.78, "hard_accuracy": 2.91} | |
| {"submission_id": "test_org_0801-test_agent_0801_v2", "easy_accuracy": 9.72, "hard_accuracy": 2.91} | |
| {"submission_id": "test_org_0813-test_code_agent_0813", "easy_accuracy": 83.33, "hard_accuracy": 2.38} | |
| {"submission_id": "test_org_0815-test_agent_0815", "easy_accuracy": 0.0, "hard_accuracy": 1.06} | |
| {"submission_id": "test_org_0815_2-test_agent_0815_2", "easy_accuracy": 0.0, "hard_accuracy": 2.91} | |
| {"submission_id": "test_org_0815_3-test_agent_0815_3", "easy_accuracy": 0.0, "hard_accuracy": 2.65} | |
| {"submission_id": "test_org_0815_4-test_agent_0815_4", "easy_accuracy": 0.0, "hard_accuracy": 2.65} | |
| {"submission_id": "test_org_0815_5-test_agent_0815_5", "easy_accuracy": 0.0, "hard_accuracy": 2.65} | |
| {"submission_id": "test_org_0815_6-test_agent_0815_6", "easy_accuracy": 0.0, "hard_accuracy": 4.76} | |
| {"submission_id": "test_org_0817-test_agent_0817", "easy_accuracy": 0.0, "hard_accuracy": 0.53} | |
| {"submission_id": "test_org_0817_2-test_agent_0817_2", "easy_accuracy": 0.0, "hard_accuracy": 2.38} | |
| {"submission_id": "test_org_0817_3-test_agent_0817_3", "easy_accuracy": 0.0, "hard_accuracy": 1.32} | |
| {"submission_id": "test_org_0818_18-test_agent_0818_18", "easy_accuracy": 0.0, "hard_accuracy": 4.23} | |
| {"submission_id": "test_org_0818_2-test_agent_0818_2", "easy_accuracy": 0.0, "hard_accuracy": 1.59} | |
| {"submission_id": "test_org_0821-test_agent_0821", "easy_accuracy": 0.0, "hard_accuracy": 4.23} | |
| {"submission_id": "test_org_0821_2-test_agent_0821_2", "easy_accuracy": 0.0, "hard_accuracy": 4.76} | |
| {"submission_id": "test_org_2-test_agent_0921_2", "easy_accuracy": 0.0, "hard_accuracy": 11.64} | |
| {"submission_id": "testagent001-testagent001", "easy_accuracy": 81.94, "hard_accuracy": 19.31} | |
| {"submission_id": "testagent002-testagent002", "easy_accuracy": 84.72, "hard_accuracy": 19.31} | |
| {"submission_id": "tiny-g-tiny-g-agent", "easy_accuracy": 68.06, "hard_accuracy": 17.2} | |
| {"submission_id": "tmp-test-hard", "easy_accuracy": 4.17, "hard_accuracy": 0.0} | |
| {"submission_id": "tw_org-tw_agent", "easy_accuracy": 38.89, "hard_accuracy": 2.65} | |
| {"submission_id": "ucas-test-1215-2", "easy_accuracy": 69.44, "hard_accuracy": 17.46} | |
| {"submission_id": "ucas-test1211", "easy_accuracy": 73.61, "hard_accuracy": 25.4} | |
| {"submission_id": "ucas-test1215-1", "easy_accuracy": 69.44, "hard_accuracy": 17.46} | |
| {"submission_id": "ucla-visual_test", "easy_accuracy": 65.28, "hard_accuracy": 3.97} | |
| {"submission_id": "uh-uh", "easy_accuracy": 87.5, "hard_accuracy": 61.38} | |
| {"submission_id": "uh-uh-uh-uh", "easy_accuracy": 72.22, "hard_accuracy": 50.0} | |
| {"submission_id": "unknown-beta-pa-v1", "easy_accuracy": 84.72, "hard_accuracy": 35.98} | |
| {"submission_id": "whooooo-rising-start", "easy_accuracy": 68.06, "hard_accuracy": 37.83} | |
| {"submission_id": "yxy-yxy", "easy_accuracy": 15.28, "hard_accuracy": 1.32} | |
| {"submission_id": "zjx-zjx", "easy_accuracy": 4.17, "hard_accuracy": 1.85} | |
| {"submission_id": "zsl-zsl", "easy_accuracy": 70.83, "hard_accuracy": 9.26} | |
| {"submission_id": "zsl-zsl10", "easy_accuracy": 73.61, "hard_accuracy": 30.95} | |
| {"submission_id": "zsl-zsl11", "easy_accuracy": 73.61, "hard_accuracy": 30.42} | |
| {"submission_id": "zsl-zsl12", "easy_accuracy": 73.61, "hard_accuracy": 33.6} | |
| {"submission_id": "zsl-zsl13", "easy_accuracy": 70.83, "hard_accuracy": 33.86} | |
| {"submission_id": "zsl-zsl14", "easy_accuracy": 69.44, "hard_accuracy": 32.01} | |
| {"submission_id": "zsl-zsl18", "easy_accuracy": 66.67, "hard_accuracy": 33.07} | |
| {"submission_id": "zsl-zsl19", "easy_accuracy": 70.83, "hard_accuracy": 30.95} | |
| {"submission_id": "zsl-zsl20", "easy_accuracy": 63.89, "hard_accuracy": 33.07} | |
| {"submission_id": "zsl-zsl3", "easy_accuracy": 68.06, "hard_accuracy": 24.87} | |
| {"submission_id": "zsl-zsl31", "easy_accuracy": 90.28, "hard_accuracy": 48.41} | |
| {"submission_id": "zsl-zsl4", "easy_accuracy": 62.5, "hard_accuracy": 24.6} | |
| {"submission_id": "zsl-zsl5", "easy_accuracy": 75.0, "hard_accuracy": 29.37} | |
| {"submission_id": "zsl-zsl51", "easy_accuracy": 69.44, "hard_accuracy": 30.69} | |
| {"submission_id": "zsl-zsl52", "easy_accuracy": 73.61, "hard_accuracy": 24.87} | |
| {"submission_id": "zsl-zsl6", "easy_accuracy": 65.28, "hard_accuracy": 29.89} | |
| {"submission_id": "zsl-zsl61", "easy_accuracy": 70.83, "hard_accuracy": 32.8} | |
| {"submission_id": "zsl-zsl62", "easy_accuracy": 68.06, "hard_accuracy": 30.69} | |
| {"submission_id": "zsl-zsl7", "easy_accuracy": 65.28, "hard_accuracy": 29.89} | |
| {"submission_id": "zsl-zsl8", "easy_accuracy": 65.28, "hard_accuracy": 29.89} | |
| {"submission_id": "zsl-zsl9", "easy_accuracy": 70.83, "hard_accuracy": 29.37} | |
| {"submission_id": "zsl-zsl921-15", "easy_accuracy": 69.44, "hard_accuracy": 5.03} | |
| {"submission_id": "zzz-zzz9", "easy_accuracy": 65.28, "hard_accuracy": 28.04} | |
| {"submission_id": "zzz-zzzz-3", "easy_accuracy": 84.72, "hard_accuracy": 16.4} | |
| {"submission_id": "testorg-test-lb1", "easy_accuracy": 0.0, "hard_accuracy": 15.87} | |
| {"submission_id": "test-test-lb2", "easy_accuracy": 0.0, "hard_accuracy": 15.87} | |