{
  "perception_temporal_action_loc": {
    "CoI-Agent (o1) + MLAB (gpt-4o)": 1.0,
    "Top Human in Competition": 284.6,
    "MLAB (claude-3-5-sonnet-v2)": 2.2,
    "MLAB (gemini-exp-1206)": -1.3,
    "MLAB (o3-mini)": 0.9,
    "MLAB (gpt-4o)": 0.9,
    "MLAB (llama3-1-405b-instruct)": 1.5
  },
  "llm-merging": {
    "CoI-Agent (o1) + MLAB (gpt-4o)": -0.7,
    "Top Human in Competition": 68.2,
    "MLAB (claude-3-5-sonnet-v2)": 3.4,
    "MLAB (gemini-exp-1206)": 3.4,
    "MLAB (o3-mini)": -0.7,
    "MLAB (gpt-4o)": 1.4,
    "MLAB (llama3-1-405b-instruct)": -0.7
  },
  "meta-learning": {
    "CoI-Agent (o1) + MLAB (gpt-4o)": 5.4,
    "Top Human in Competition": 304.5,
    "MLAB (claude-3-5-sonnet-v2)": 5.4,
    "MLAB (gemini-exp-1206)": 5.4,
    "MLAB (o3-mini)": -14.9,
    "MLAB (gpt-4o)": 5.4,
    "MLAB (llama3-1-405b-instruct)": 5.4
  },
  "product-recommendation": {
    "CoI-Agent (o1) + MLAB (gpt-4o)": 2.3,
    "Top Human in Competition": 412.6,
    "MLAB (claude-3-5-sonnet-v2)": 12.3,
    "MLAB (gemini-exp-1206)": 0.6,
    "MLAB (o3-mini)": 0.6,
    "MLAB (gpt-4o)": 2.6,
    "MLAB (llama3-1-405b-instruct)": 0.0
  },
  "weather_forcast": {
    "CoI-Agent (o1) + MLAB (gpt-4o)": 83.6,
    "Top Human in Competition": 399.4,
    "MLAB (claude-3-5-sonnet-v2)": 31.0,
    "MLAB (gemini-exp-1206)": 91.4,
    "MLAB (o3-mini)": 53.3,
    "MLAB (gpt-4o)": 100.8,
    "MLAB (llama3-1-405b-instruct)": 66.7
  },
  "machine_unlearning": {
    "CoI-Agent (o1) + MLAB (gpt-4o)": 8.8,
    "Top Human in Competition": 61.9,
    "MLAB (claude-3-5-sonnet-v2)": -58.6,
    "MLAB (gemini-exp-1206)": 3.5,
    "MLAB (o3-mini)": 2.2,
    "MLAB (gpt-4o)": -11.1,
    "MLAB (llama3-1-405b-instruct)": 3.8
  },
  "erasing_invisible_watermarks": {
    "CoI-Agent (o1) + MLAB (gpt-4o)": 80.3,
    "Top Human in Competition": 95.6,
    "MLAB (claude-3-5-sonnet-v2)": 83.7,
    "MLAB (gemini-exp-1206)": 93.3,
    "MLAB (o3-mini)": 79.8,
    "MLAB (gpt-4o)": 79.8,
    "MLAB (llama3-1-405b-instruct)": 79.8
  },
  "backdoor-trigger-recovery": {
    "CoI-Agent (o1) + MLAB (gpt-4o)": 85.0,
    "Top Human in Competition": 621.3,
    "MLAB (claude-3-5-sonnet-v2)": 247.9,
    "MLAB (gemini-exp-1206)": 80.4,
    "MLAB (o3-mini)": 38.8,
    "MLAB (gpt-4o)": 64.5,
    "MLAB (llama3-1-405b-instruct)": 71.7
  }
}