{
"perception_temporal_action_loc": {
"MLAB (claude-3-5-sonnet-v2)": 2.222443094482299,
"Top Human in Competition": 284.55703321316366,
"MLAB (gemini-exp-1206)": -1.34633272895098,
"MLAB (o3-mini)": 0.8724822663469414,
"MLAB (gpt-4o)": 0.9384906166574135,
"MLAB (llama3-1-405b-instruct)": 1.474927454740455,
"CoI-Agent (o1) + MLAB (gpt-4o)": 0.9888962417416385
},
"llm-merging": {
"CoI-Agent (o1) + MLAB (gpt-4o)": -0.6756756689645764,
"Top Human in Competition": 68.24324325461103,
"MLAB (claude-3-5-sonnet-v2)": 3.3783783853634035,
"MLAB (gemini-exp-1206)": 3.3783783853634035,
"MLAB (o3-mini)": -0.6756756689645764,
"MLAB (gpt-4o)": 1.3513513581994137,
"MLAB (llama3-1-405b-instruct)": -0.6756756689645764
},
"meta-learning": {
"CoI-Agent (o1) + MLAB (gpt-4o)": 5.424978139166417,
"Top Human in Competition": 304.53435579895256,
"MLAB (claude-3-5-sonnet-v2)": 5.424978139166417,
"MLAB (gemini-exp-1206)": 5.424978139166417,
"MLAB (o3-mini)": -14.923192223926499,
"MLAB (gpt-4o)": 5.424978139166417,
"MLAB (llama3-1-405b-instruct)": 5.424978139166417
},
"product-recommendation": {
"CoI-Agent (o1) + MLAB (gpt-4o)": 0.6021227441680528,
"Top Human in Competition": 412.59793394031675,
"MLAB (claude-3-5-sonnet-v2)": 12.283606772997718,
"MLAB (gemini-exp-1206)": 0.6021227441680528,
"MLAB (o3-mini)": 0.6035316323448103,
"MLAB (gpt-4o)": 2.6400767209619422,
"MLAB (llama3-1-405b-instruct)": -2.9066701147102995e-09
},
"machine_unlearning": {
"CoI-Agent (o1) + MLAB (gpt-4o)": 7.318484292638537,
"Top Human in Competition": 61.85258904854873,
"MLAB (claude-3-5-sonnet-v2)": -58.58540153334969,
"MLAB (gemini-exp-1206)": 3.4837676447981045,
"MLAB (o3-mini)": 2.2414490971518704,
"MLAB (gpt-4o)": -11.131587250139926,
"MLAB (llama3-1-405b-instruct)": 3.8409541040677597
},
"backdoor-trigger-recovery": {
"CoI-Agent (o1) + MLAB (gpt-4o)": 38.252918051116,
"Top Human in Competition": 621.2635313337943,
"MLAB (claude-3-5-sonnet-v2)": 247.90785034564928,
"MLAB (gemini-exp-1206)": 80.40937239150493,
"MLAB (o3-mini)": 38.75953643366491,
"MLAB (gpt-4o)": 64.52832837042699,
"MLAB (llama3-1-405b-instruct)": 71.70765816958271
}
}