Spaces:

SamsungResearch
/

TRUEBench

Running

Jongyoon Song

Update evaluation results (251224) & Remove time and speed-related results

ef2b66d 2 days ago

13.9 kB

	"Model Name" "Link" "Comment" "Group" "Med. Len." "Med. Resp. Len." "Parameter Size (B)" "Type" "Model Type" "Think" "Overall" "Content Generation" "Editing" "Data Analysis" "Reasoning" "Hallucination" "Safety" "Repetition" "Summarization" "Translation" "Multi-Turn"
	"GPT-5 (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "70.73" "71.0" "74.38" "76.49" "79.75" "64.94" "56.2" "82.86" "80.16" "69.38" "54.36"
	"o3-pro (Reasoning: medium)" "https://platform.openai.com/docs/models/o3-pro" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "66.47" "72.5" "70.31" "75.7" "83.88" "64.37" "33.88" "74.29" "65.48" "64.33" "48.32"
	"GPT-5.2 (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5.2" "Reasoning: medium" "GPT" "347.0" "264.0" "" "Proprietary" "Hybrid" "On" "66.18" "69.25" "65.62" "71.31" "78.51" "70.69" "52.07" "51.43" "80.56" "55.9" "55.03"
	"GPT-5.1 (Reasoning: medium, verbosity: medium)" "https://platform.openai.com/docs/models/gpt-5.1" "Reasoning: medium, verbosity: medium" "GPT" "" "" "" "Proprietary" "Hybrid" "On" "64.57" "67.0" "70.0" "72.51" "82.64" "65.52" "52.07" "51.43" "67.06" "59.55" "45.64"
	"Claude 4.5 Opus (think, budget: 16K)" "https://www.anthropic.com/claude/opus" "thinking budget: 16K" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "63.41" "63.5" "62.5" "73.71" "77.69" "82.76" "52.89" "58.57" "63.49" "56.74" "45.97"
	"Claude 4 Opus (20250514) (think)" "https://www.anthropic.com/claude/opus" "version: 20250514" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "63.29" "60.75" "59.69" "73.31" "69.83" "78.74" "53.72" "55.71" "65.48" "65.45" "48.99"
	"Claude 4.1 Opus (20250805) (think)" "https://www.anthropic.com/claude/opus" "version: 20250805" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "63.24" "61.25" "60.0" "78.49" "72.73" "77.01" "56.2" "57.14" "61.9" "62.64" "46.98"
	"GPT-5 mini (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-mini" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "62.56" "68.0" "62.5" "74.9" "76.86" "55.17" "47.93" "44.29" "74.6" "56.18" "45.3"
	"Gemini 3 Pro Preview (Thinking Level: High)" "https://deepmind.google/models/gemini/pro/" "Thinking Level: High" "Gemini" "1930.5" "378.0" "" "Proprietary" "Think" "On" "62.48" "59.5" "64.38" "76.49" "78.93" "70.69" "39.67" "65.71" "61.51" "58.15" "48.99"
	"Claude 4 Sonnet (20250514) (think)" "https://www.anthropic.com/claude/sonnet" "version: 20250514" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "61.8" "58.0" "58.44" "76.49" "67.77" "79.31" "57.02" "44.29" "65.08" "62.92" "44.97"
	"o3" "https://platform.openai.com/docs/models/o3" "" "GPT" "" "" "" "Proprietary" "Think" "On" "60.91" "68.75" "60.0" "73.31" "79.34" "54.02" "34.71" "64.29" "60.71" "55.06" "46.98"
	"Gemini 2.5 Pro" "https://deepmind.google/models/gemini/pro/" "" "Gemini" "" "" "" "Proprietary" "Think" "On" "59.34" "54.0" "60.94" "78.88" "73.14" "63.22" "17.36" "52.86" "67.86" "53.93" "52.68"
	"Gemini 3 Flash Preview (Thinking Level: High)" "https://deepmind.google/models/gemini/flash/" "Thinking Level: High" "Gemini" "1296.5" "424.5" "" "Proprietary" "Think" "On" "59.26" "59.5" "59.69" "75.3" "79.34" "63.22" "34.71" "57.14" "59.92" "50.84" "46.31"
	"GLM-4.7 FP8" "https://huggingface.co/zai-org/GLM-4.7-FP8" "temperature: 1.0
	top-p: 0.95" "GLM" "2252.5" "328.0" "358.0" "Open" "Hybrid" "On" "59.22" "62.75" "60.0" "75.3" "75.21" "58.05" "29.75" "35.71" "66.67" "53.93" "45.3"
	"DeepSeek V3.2 Speciale" "https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Speciale" "temperature: 1.0
	top-p: 0.95" "DeepSeek" "3226.5" "249.5" "671.0" "Open" "Think" "On" "59.14" "64.0" "67.19" "74.5" "78.1" "48.28" "20.66" "58.57" "66.27" "53.09" "38.93"
	"Grok-4" "https://x.ai/news/grok-4" "temperature: 0.6
	top-p: 0.95" "Grok" "" "" "" "Proprietary" "Think" "On" "58.74" "61.0" "66.25" "72.51" "63.22" "66.09" "16.53" "58.57" "66.27" "54.21" "44.3"
	"Gemini 2.5 Flash" "https://deepmind.google/models/gemini/flash/" "" "Gemini" "" "" "" "Proprietary" "Hybrid" "On" "58.62" "57.25" "62.19" "70.52" "72.31" "56.9" "28.93" "47.14" "68.65" "55.06" "46.98"
	"o4-mini" "https://platform.openai.com/docs/models/o4-mini" "" "GPT" "" "" "" "Proprietary" "Think" "On" "57.57" "67.25" "61.25" "71.71" "75.62" "45.4" "39.67" "44.29" "59.92" "47.19" "41.95"
	"Kimi K2 Thinking" "https://huggingface.co/moonshotai/Kimi-K2-Thinking" "temperature:1.0
	top-p: 0.95" "moonshot" "1692.0" "330.0" "1000.0" "Open" "Think" "On" "56.84" "58.25" "50.31" "69.72" "77.27" "60.92" "44.63" "38.57" "59.92" "52.25" "44.3"
	"Qwen3 235B A22B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507" "temperature: 0.6
	top-p: 0.95" "Qwen" "2404.5" "423.0" "235.0" "Open" "Think" "On" "55.48" "57.5" "53.12" "73.31" "75.21" "55.17" "25.62" "35.71" "55.56" "56.18" "40.27"
	"GPT-5 nano (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-nano" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "55.39" "63.5" "47.19" "68.92" "75.21" "55.17" "52.07" "34.29" "63.49" "40.73" "42.95"
	"GLM-4.5 FP8" "https://huggingface.co/zai-org/GLM-4.5-FP8" "temperature: 0.6
	top-p: 0.95" "GLM" "1442.0" "604.0" "358.0" "Open" "Hybrid" "On" "54.03" "60.75" "53.75" "68.92" "74.38" "47.13" "33.06" "41.43" "60.32" "46.07" "35.91"
	"GLM-4.6 FP8" "https://huggingface.co/zai-org/GLM-4.6-FP8" "temperature: 1.0
	top-p: 0.95" "GLM" "2645.5" "522.0" "358.0" "Open" "Hybrid" "On" "53.3" "57.5" "51.25" "71.31" "71.9" "53.45" "24.79" "28.57" "58.33" "44.38" "43.29"
	"Gemini 2.5 Flash-lite Preview (09-2025)" "https://deepmind.google/models/gemini/" "version: 09-2025" "Gemini" "" "" "" "Proprietary" "Think" "On" "53.06" "55.0" "55.94" "68.13" "70.25" "47.7" "23.97" "30.0" "60.71" "46.63" "42.28"
	"Qwen3 235B A22B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507" "temperature: 0.7
	top-p: 0.8" "Qwen" "433.0" "433.0" "235.0" "Open" "Instruct" "Off" "52.94" "58.0" "49.69" "68.13" "73.97" "55.17" "45.45" "30.0" "55.95" "38.48" "41.61"
	"DeepSeek V3.2" "https://huggingface.co/deepseek-ai/DeepSeek-V3.2" "temperature: 1.0
	top-p: 0.95" "DeepSeek" "762.5" "312.0" "671.0" "Open" "Think" "On" "52.17" "51.25" "51.56" "70.92" "72.31" "51.15" "36.36" "37.14" "60.32" "40.17" "39.93"
	"DeepSeek V3.1 (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1" "temperature: 0.6
	top-p: 0.95" "DeepSeek" "710.5" "356.0" "671.0" "Open" "Hybrid" "On" "51.45" "52.0" "50.0" "67.33" "69.83" "50.0" "33.88" "35.71" "59.52" "41.85" "40.27"
	"DeepSeek V3.1 Terminus (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1-Terminus" "temperature: 0.6
	top-p: 0.95" "DeepSeek" "831.5" "377.0" "671.0" "Open" "Hybrid" "On" "51.37" "51.5" "52.19" "69.32" "73.14" "51.72" "25.62" "38.57" "57.14" "38.76" "40.94"
	"Qwen3 30B A3B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507" "temperature: 0.6
	top-p: 0.95" "Qwen" "2830.0" "351.0" "30.0" "Open" "Think" "On" "50.44" "56.25" "45.0" "69.32" "69.01" "50.0" "29.75" "30.0" "48.02" "47.47" "36.58"
	"MiMo V2 Flash" "https://huggingface.co/XiaomiMiMo/MiMo-V2-Flash" "temperature: 0.8
	top-p: 0.95" "XiaomiMiMo" "1477.5" "373.0" "309.0" "Open" "Think" "On" "50.32" "54.0" "48.12" "67.73" "68.18" "44.83" "48.76" "28.57" "53.97" "40.73" "35.91"
	"gpt-oss-120B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-120b" "Reasoning: medium
	temperature: 1.0
	top-p: 1.0" "GPT" "759.5" "370.5" "117.0" "Open" "Think" "On" "49.11" "58.5" "48.44" "68.92" "69.83" "41.38" "39.67" "25.71" "50.79" "35.67" "32.21"
	"DeepSeek R1 (0528) (top_p: 0.95, temp:0.6)" "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528" "version: 0528
	temperature: 0.6
	top-p: 0.95" "DeepSeek" "1177.5" "554.0" "671.0" "Open" "Think" "On" "48.79" "49.75" "50.0" "65.34" "59.09" "48.85" "38.02" "32.86" "57.94" "36.52" "38.93"
	"Gauss2.3 Hybrid" "" "" "Gauss" "546.0" "308.0" "" "Proprietary" "Hybrid" "On" "46.58" "52.0" "46.25" "59.76" "66.94" "41.95" "34.71" "25.71" "53.17" "34.55" "33.22"
	"Mistral Large 3 675B Instruct 2512" "https://huggingface.co/mistralai/Mistral-Large-3-675B-Instruct-2512" "temperature: 0.15" "mistralai" "448.0" "448.0" "675.0" "Open" "Instruct" "Off" "45.21" "44.0" "50.62" "65.34" "60.33" "33.33" "14.88" "37.14" "53.97" "36.52" "35.91"
	"DeepSeek V3 (0324) (top_p: 0.95, temp:1.3)" "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324" "version: 0324
	temperature: 1.3
	top-p: 0.95" "DeepSeek" "408.0" "408.0" "671.0" "Open" "Instruct" "Off" "45.09" "46.25" "45.0" "58.96" "60.33" "41.95" "21.49" "30.0" "55.95" "38.48" "33.22"
	"Qwen3 32B (think)" "https://huggingface.co/Qwen/Qwen3-32B" "temperature: 0.6
	top-p: 0.95" "Qwen" "1113.0" "390.0" "32.8" "Open" "Hybrid" "On" "44.44" "52.25" "41.56" "68.92" "66.53" "35.06" "19.83" "25.71" "46.43" "30.9" "32.89"
	"Qwen3 30B A3B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507" "temperature: 0.7
	top-p: 0.8" "Qwen" "441.5" "441.5" "30.0" "Open" "Instruct" "Off" "42.79" "45.0" "35.0" "56.18" "66.12" "51.15" "33.06" "24.29" "46.83" "28.09" "35.57"
	"MiniMax-M2 (230B A10B)" "https://huggingface.co/MiniMaxAI/MiniMax-M2" "temperature:1.0
	top-p: 0.95" "MiniMaxAI" "1142.0" "325.0" "230.0" "Open" "Think" "On" "42.43" "48.75" "35.62" "53.39" "57.02" "43.1" "44.63" "28.57" "49.21" "30.06" "31.21"
	"A.X 4.0" "https://huggingface.co/skt/A.X-4.0" "" "SKT" "412.5" "412.5" "71.9" "Open" "Instruct" "Off" "41.59" "56.0" "43.75" "43.43" "42.56" "40.23" "15.7" "24.29" "53.97" "33.43" "32.21"
	"gpt-oss-20B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-20b" "Reasoning: medium
	temperature: 1.0
	top-p: 1.0" "GPT" "953.5" "326.0" "21.0" "Open" "Think" "On" "41.18" "52.0" "40.0" "61.35" "65.7" "43.1" "41.32" "22.86" "36.51" "20.51" "22.82"
	"Gemma 3 27B it" "https://huggingface.co/google/gemma-3-27b-it" "temperature: 1.0
	top-p: 0.95" "Gemma" "380.0" "380.0" "27.0" "Open" "Instruct" "Off" "40.86" "44.25" "45.0" "45.82" "36.78" "31.61" "32.23" "22.86" "57.14" "32.87" "39.93"
	"Tongyi DeepResearch 30B A3B" "https://huggingface.co/Alibaba-NLP/Tongyi-DeepResearch-30B-A3B" "temperature: 0.6
	top-p: 0.95" "Alibaba" "1147.0" "408.0" "30.0" "Open" "Think" "On" "40.1" "41.25" "33.12" "62.15" "68.18" "44.25" "23.97" "18.57" "41.67" "26.12" "29.19"
	"Mistral Small 3.2 24B Instruct 2506" "https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506" "temperature: 0.15
	top-p: 0.95" "mistralai" "369.0" "369.0" "24.0" "Open" "Instruct" "Off" "39.09" "43.0" "44.69" "43.43" "51.65" "25.86" "22.31" "25.71" "51.98" "31.18" "30.2"
	"K2-Think" "https://huggingface.co/LLM360/K2-Think" "temperature: 1.0
	top-p: 0.95" "LLM360" "1835.0" "486.0" "32.8" "Open" "Think" "On" "35.06" "35.5" "36.56" "56.18" "47.11" "35.06" "14.05" "12.86" "49.21" "21.63" "23.15"
	"Kanana 2 30B A3B Thinking" "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-thinking" "temperature: 0.6
	top-p: 0.95
	top-k: 20" "Kakao" "4263.0" "854.5" "31.0" "Open" "Think" "On" "34.5" "37.5" "25.0" "57.77" "54.55" "39.66" "20.66" "15.71" "38.1" "24.72" "20.47"
	"KAT Dev 72B Exp" "https://huggingface.co/Kwaipilot/KAT-Dev-72B-Exp" "temperature:0.6
	top-p: 0.95" "KAT" "397.0" "397.0" "72.0" "Open" "Instruct" "Off" "33.94" "29.25" "44.06" "46.22" "46.69" "25.86" "18.18" "20.0" "42.86" "25.56" "25.5"
	"Olmo 3 32B Think" "https://huggingface.co/allenai/Olmo-3-32B-Think" "temperature: 1
	top-p: 0.95
	top-k: 50" "allenai" "3360.5" "473.0" "32.0" "Open" "Think" "On" "33.94" "35.25" "30.94" "57.37" "66.53" "33.33" "28.93" "24.29" "34.52" "11.8" "19.8"
	"EXAONE 4.0 32B (think)" "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B" "temperature: 0.6
	top-p: 0.95" "Exaone" "1274.5" "503.0" "32.0" "Open" "Hybrid" "On" "33.82" "34.25" "29.38" "56.97" "57.44" "24.71" "27.27" "17.14" "38.49" "18.54" "25.5"
	"Apriel 1.5 15B Thinker" "https://huggingface.co/ServiceNow-AI/Apriel-1.5-15b-Thinker" "temperature: 0.6
	top-p: 0.95" "Apriel" "2238.0" "375.0" "15.0" "Open" "Think" "On" "31.92" "44.25" "26.56" "47.41" "59.09" "22.99" "37.19" "20.0" "26.98" "20.22" "10.07"
	"HyperCLOVAX SEED Think 14B (think)" "https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" "temperature: 0.5
	top-p: 0.6" "HCX" "1444.0" "382.5" "14.7" "Open" "Hybrid" "On" "31.84" "35.0" "26.56" "53.78" "58.68" "27.59" "26.45" "17.14" "29.76" "17.13" "20.47"
	"Kanana 2 30B A3B Instruct" "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct" "temperature: 0" "Kakao" "1195.0" "1195.0" "31.0" "Open" "Instruct" "Off" "30.84" "38.0" "25.62" "35.86" "47.11" "37.93" "23.97" "18.57" "35.32" "20.51" "19.46"
	"Dhanishtha-2.0 Preview" "https://huggingface.co/HelpingAI/Dhanishtha-2.0-preview" "temperature: 0.7
	top-p: 0.9" "HelpingAI" "520.0" "356.0" "14.8" "Open" "Think" "On" "25.81" "28.25" "19.38" "30.28" "33.47" "43.1" "47.93" "20.0" "31.75" "12.08" "13.09"
	"ERNIE 4.5 21B A3B Thinking" "https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking" "temperature: 0.6
	top-p: 0.95" "ERNIE" "1637.0" "541.0" "21.0" "Open" "Think" "On" "25.32" "27.25" "20.31" "42.23" "49.59" "23.56" "31.4" "17.14" "28.17" "7.3" "13.76"
	"Solar Pro Preview (top_p:0.95, temp: 0.7)" "https://huggingface.co/upstage/solar-pro-preview-instruct" "temperature: 0.7
	top-p: 0.95" "Solar" "260.0" "260.0" "22.0" "Open" "Instruct" "Off" "20.73" "28.0" "24.69" "16.73" "19.42" "17.24" "28.1" "11.43" "31.35" "13.76" "11.74"
	"Mi:dm 2.0 Base Instruct" "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct" "temperature: 0.8
	top-p: 0.7" "KT" "316.0" "316.0" "11.5" "Open" "Instruct" "Off" "20.25" "21.75" "17.5" "16.73" "18.6" "27.59" "59.5" "14.29" "25.4" "12.64" "11.41"
	"Kanana 1.5 15.7B A3B Instruct" "https://huggingface.co/kakaocorp/kanana-1.5-15.7b-a3b-instruct" "temperature: 1.0
	top-p: 0.95" "Kakao" "414.0" "414.0" "15.7" "Open" "Instruct" "Off" "11.71" "14.25" "10.62" "13.55" "11.16" "22.41" "22.31" "4.29" "11.9" "6.74" "5.37"