M3-Bench / data /leaderboard.csv
FredericFan's picture
v1
bf70dba
model,category,view,PD,SH,UG,RPD,GE,AOB,PGG,VD,CPR,AC,WW,KP,L1,L2,L3,L4
GPT-5.1,Closed-Source,BTA,97,94,95,96,82,93,93,88,94,91,80,90,93.0,89.0,90.3,86.2
GPT-5.1,Closed-Source,RPA,95,92,93,93,84,90,90,86,91,88,84,87,91.7,88.0,88.5,85.8
GPT-5.1,Closed-Source,CCA,90,88,96,88,91,93,88,84,85,80,90,76,89.3,86.8,87.2,83.7
Claude Opus 4.5,Closed-Source,BTA,95,92,91,93,85,95,91,86,90,87,83,85,91.0,88.8,89.3,85.5
Claude Opus 4.5,Closed-Source,RPA,93,90,94,91,82,93,88,84,88,85,87,83,89.7,86.5,87.5,84.7
Claude Opus 4.5,Closed-Source,CCA,92,94,97,90,95,94,90,87,83,82,93,78,90.3,89.0,88.0,86.8
Gemini-3 Pro,Closed-Source,BTA,93,95,89,90,86,88,88,90,91,86,78,84,89.7,86.8,87.3,83.2
Gemini-3 Pro,Closed-Source,RPA,91,90,91,88,83,85,86,87,89,84,82,85,89.3,85.7,85.8,83.5
Gemini-3 Pro,Closed-Source,CCA,87,85,93,84,88,90,84,85,82,78,86,74,86.8,83.5,84.3,81.0
Grok-4.1,Closed-Source,BTA,90,88,83,87,81,93,85,83,89,82,72,86,88.7,86.3,84.5,79.0
Grok-4.1,Closed-Source,RPA,87,85,86,84,79,90,83,81,86,79,76,83,86.7,84.2,82.8,78.7
Grok-4.1,Closed-Source,CCA,84,82,90,80,85,92,81,80,80,74,82,71,86.2,82.7,81.8,77.0
GPT-oss 120b,Open-Weight,BTA,89,91,86,88,90,84,84,87,85,79,75,78,87.0,85.3,84.0,77.5
GPT-oss 120b,Open-Weight,RPA,86,88,89,85,86,82,82,84,83,77,79,76,86.3,83.2,82.3,77.5
GPT-oss 120b,Open-Weight,CCA,83,86,91,82,89,86,80,83,79,75,84,70,85.0,81.8,81.8,77.7
DeepSeek V3.2,Open-Weight,BTA,88,90,85,85,88,83,87,81,92,84,68,80,86.7,84.2,83.8,76.2
DeepSeek V3.2,Open-Weight,RPA,85,87,88,83,85,81,84,79,89,81,73,77,85.5,82.5,82.2,76.2
DeepSeek V3.2,Open-Weight,CCA,82,84,90,79,87,85,82,78,84,72,79,69,84.0,80.5,81.0,74.8
Kimi-K2 Thinking,Open-Weight,BTA,92,86,90,84,79,86,86,80,83,73,82,68,87.0,81.2,82.0,74.3
Kimi-K2 Thinking,Open-Weight,RPA,89,84,92,81,77,83,83,78,81,80,86,76,86.2,79.3,80.3,81.0
Kimi-K2 Thinking,Open-Weight,CCA,85,81,94,77,83,87,79,76,77,60,72,53,84.5,78.5,79.2,63.2
Mistral Large 3,Open-Weight,BTA,86,84,82,82,76,80,79,76,82,74,70,71,81.8,77.8,77.7,70.5
Mistral Large 3,Open-Weight,RPA,83,81,85,79,74,78,77,74,79,71,72,69,81.0,76.2,76.2,69.8
Mistral Large 3,Open-Weight,CCA,79,77,88,75,80,82,74,72,75,62,74,58,79.3,75.3,74.7,66.0
GPT-5 pro,Reasoning,BTA,96,93,94,94,85,91,91,86,92,86,76,88,92.7,89.0,88.2,82.2
GPT-5 pro,Reasoning,RPA,97,96,93,96,93,95,93,92,94,93,91,92,95.2,94.2,92.2,92.0
GPT-5 pro,Reasoning,CCA,78,76,88,79,72,84,77,74,80,65,70,60,78.5,75.5,77.2,66.8
Gemini-3 Deep Think,Reasoning,BTA,91,89,88,89,83,87,87,85,90,83,84,80,89.7,85.7,86.8,81.5
Gemini-3 Deep Think,Reasoning,RPA,94,93,95,93,91,93,88,90,91,91,88,90,93.0,92.2,90.2,90.0
Gemini-3 Deep Think,Reasoning,CCA,76,80,85,76,80,82,78,76,77,63,78,58,78.8,75.5,77.8,67.8
DeepSeek-V3.2 Speciale,Reasoning,BTA,94,91,92,92,80,89,89,82,91,84,70,85,91.3,86.7,85.0,78.2
DeepSeek-V3.2 Speciale,Reasoning,RPA,97,96,94,97,94,96,90,92,93,95,92,94,95.2,95.2,92.5,94.0
DeepSeek-V3.2 Speciale,Reasoning,CCA,74,72,83,72,68,78,71,68,73,55,62,50,74.8,69.7,71.5,58.0
Human,Human,BTA,91,88,93,86,78,84,83,80,79,78,87,72,86.3,82.2,82.8,80.7
Human,Human,RPA,86,84,90,82,76,81,80,78,77,75,83,70,84.2,79.5,80.2,77.5
Human,Human,CCA,93,90,94,89,92,91,90,85,76,84,92,68,89.0,85.0,85.3,84.7