DataSnake's picture
Update benchmarks.json
d9f0569 verified
{
"Mistral-Nemo-Instruct-2407-NVFP4": {
"coqa": {
"alias": "coqa",
"em,none": 0.5391666666666667,
"em_stderr,none": 0.019942406218865946,
"f1,none": 0.7182411035342305,
"f1_stderr,none": 0.015116252267763605
},
"hellaswag": {
"alias": "hellaswag",
"acc,none": 0.6186018721370244,
"acc_stderr,none": 0.004847372670134442,
"acc_norm,none": 0.8084047002589125,
"acc_norm_stderr,none": 0.003927519561403822
},
"ifeval": {
"alias": "ifeval",
"prompt_level_strict_acc,none": 0.3807763401109057,
"prompt_level_strict_acc_stderr,none": 0.020895937888190826,
"inst_level_strict_acc,none": 0.4712230215827338,
"inst_level_strict_acc_stderr,none": "N/A",
"prompt_level_loose_acc,none": 0.4602587800369686,
"prompt_level_loose_acc_stderr,none": 0.021448501434135032,
"inst_level_loose_acc,none": 0.5455635491606715,
"inst_level_loose_acc_stderr,none": "N/A"
},
"lambada_openai": {
"alias": "lambada_openai",
"perplexity,none": 3.0228658176330048,
"perplexity_stderr,none": 0.05627099222260382,
"acc,none": 0.7583931690277508,
"acc_stderr,none": 0.0059636738430670555
},
"lambada_openai_cloze_yaml": {
"alias": "lambada_openai_cloze_yaml",
"perplexity,none": 29.84274462902617,
"perplexity_stderr,none": 0.7625202924355495,
"acc,none": 0.3122452940034931,
"acc_stderr,none": 0.0064561975253284295
},
"lambada_standard": {
"alias": "lambada_standard",
"perplexity,none": 3.640092189793144,
"perplexity_stderr,none": 0.07656519320424794,
"acc,none": 0.688530952843004,
"acc_stderr,none": 0.006451805320261074
},
"lambada_standard_cloze_yaml": {
"alias": "lambada_standard_cloze_yaml",
"perplexity,none": 44.843956670053444,
"perplexity_stderr,none": 1.1468558717049042,
"acc,none": 0.22588783233068116,
"acc_stderr,none": 0.005825865294666935
},
"commonsense_qa": {
"alias": "commonsense_qa",
"acc,none": 0.5773955773955773,
"acc_stderr,none": 0.014142423233580207
},
"mmlu": {
"acc,none": 0.6324597635664435,
"acc_stderr,none": 0.0037954721741336904,
"alias": "mmlu"
},
"openbookqa": {
"alias": "openbookqa",
"acc,none": 0.368,
"acc_stderr,none": 0.021588982568353548,
"acc_norm,none": 0.47,
"acc_norm_stderr,none": 0.0223427481925028
},
"winogrande": {
"alias": "winogrande",
"acc,none": 0.7671665351223362,
"acc_stderr,none": 0.011878201073856598
},
"triviaqa": {
"alias": "triviaqa",
"exact_match,remove_whitespace": 0.595296477931342,
"exact_match_stderr,remove_whitespace": 0.003664271290957409
},
"truthfulqa_mc1": {
"alias": "truthfulqa_mc1",
"acc,none": 0.37821297429620565,
"acc_stderr,none": 0.016976335907546772
},
"truthfulqa_mc2": {
"alias": "truthfulqa_mc2",
"acc,none": 0.5284457708979753,
"acc_stderr,none": 0.015035693028473887
}
},
"Mistral-Nemo-Instruct-2407-NVFP4-4over6": {
"coqa": {
"alias": "coqa",
"em,none": 0.5498333333333334,
"em_stderr,none": 0.020187689512123422,
"f1,none": 0.7211863549140475,
"f1_stderr,none": 0.01540089217977273
},
"hellaswag": {
"alias": "hellaswag",
"acc,none": 0.619398526190002,
"acc_stderr,none": 0.004845424524763779,
"acc_norm,none": 0.8131846245767775,
"acc_norm_stderr,none": 0.003889666837869389
},
"ifeval": {
"alias": "ifeval",
"prompt_level_strict_acc,none": 0.4121996303142329,
"prompt_level_strict_acc_stderr,none": 0.021182238151733295,
"inst_level_strict_acc,none": 0.5095923261390888,
"inst_level_strict_acc_stderr,none": "N/A",
"prompt_level_loose_acc,none": 0.4824399260628466,
"prompt_level_loose_acc_stderr,none": 0.02150330051338897,
"inst_level_loose_acc,none": 0.5683453237410072,
"inst_level_loose_acc_stderr,none": "N/A"
},
"lambada_openai": {
"alias": "lambada_openai",
"perplexity,none": 2.954572513479153,
"perplexity_stderr,none": 0.054124688702298446,
"acc,none": 0.7686784397438385,
"acc_stderr,none": 0.0058747917899013785
},
"lambada_openai_cloze_yaml": {
"alias": "lambada_openai_cloze_yaml",
"perplexity,none": 30.035509240670926,
"perplexity_stderr,none": 0.7780223123588234,
"acc,none": 0.29827285076654375,
"acc_stderr,none": 0.006373868144287074
},
"lambada_standard": {
"alias": "lambada_standard",
"perplexity,none": 3.660039154584711,
"perplexity_stderr,none": 0.07563902292271452,
"acc,none": 0.6906656316708714,
"acc_stderr,none": 0.006439617662597691
},
"lambada_standard_cloze_yaml": {
"alias": "lambada_standard_cloze_yaml",
"perplexity,none": 40.99251979932923,
"perplexity_stderr,none": 1.0271212425445286,
"acc,none": 0.24665243547448087,
"acc_stderr,none": 0.006005545631215194
},
"commonsense_qa": {
"alias": "commonsense_qa",
"acc,none": 0.5921375921375921,
"acc_stderr,none": 0.014069810259917194
},
"mmlu": {
"acc,none": 0.6364477994587665,
"acc_stderr,none": 0.003806377839571922,
"alias": "mmlu"
},
"openbookqa": {
"alias": "openbookqa",
"acc,none": 0.392,
"acc_stderr,none": 0.02185468495561119,
"acc_norm,none": 0.472,
"acc_norm_stderr,none": 0.022347949832668024
},
"winogrande": {
"alias": "winogrande",
"acc,none": 0.755327545382794,
"acc_stderr,none": 0.012082125654159727
},
"triviaqa": {
"alias": "triviaqa",
"exact_match,remove_whitespace": 0.6011480160499332,
"exact_match_stderr,remove_whitespace": 0.003655519111850352
},
"truthfulqa_mc1": {
"alias": "truthfulqa_mc1",
"acc,none": 0.38310893512851896,
"acc_stderr,none": 0.017018461679389734
},
"truthfulqa_mc2": {
"alias": "truthfulqa_mc2",
"acc,none": 0.5367421440427503,
"acc_stderr,none": 0.01489354828588867
}
},
"Mistral-Nemo-Instruct-2407-NVFP4-FP8-RTN": {
"coqa": {
"alias": "coqa",
"em,none": 0.5683333333333334,
"em_stderr,none": 0.019596946262820592,
"f1,none": 0.7401341567024432,
"f1_stderr,none": 0.014222135053403443
},
"hellaswag": {
"alias": "hellaswag",
"acc,none": 0.6237801234813782,
"acc_stderr,none": 0.004834461997944986,
"acc_norm,none": 0.813981278629755,
"acc_norm_stderr,none": 0.003883265210791469
},
"ifeval": {
"alias": "ifeval",
"prompt_level_strict_acc,none": 0.39926062846580407,
"prompt_level_strict_acc_stderr,none": 0.021075331332701258,
"inst_level_strict_acc,none": 0.5011990407673861,
"inst_level_strict_acc_stderr,none": "N/A",
"prompt_level_loose_acc,none": 0.46210720887245843,
"prompt_level_loose_acc_stderr,none": 0.021454695436204742,
"inst_level_loose_acc,none": 0.5563549160671463,
"inst_level_loose_acc_stderr,none": "N/A"
},
"lambada_openai": {
"alias": "lambada_openai",
"perplexity,none": 2.959118986596569,
"perplexity_stderr,none": 0.05562968630849542,
"acc,none": 0.7618862798369882,
"acc_stderr,none": 0.005934024831865026
},
"lambada_openai_cloze_yaml": {
"alias": "lambada_openai_cloze_yaml",
"perplexity,none": 26.696978874621955,
"perplexity_stderr,none": 0.683775381967173,
"acc,none": 0.33145740345429847,
"acc_stderr,none": 0.006558287884402134
},
"lambada_standard": {
"alias": "lambada_standard",
"perplexity,none": 3.492972951792885,
"perplexity_stderr,none": 0.0721179626815854,
"acc,none": 0.6970696681544731,
"acc_stderr,none": 0.006402086620816973
},
"lambada_standard_cloze_yaml": {
"alias": "lambada_standard_cloze_yaml",
"perplexity,none": 37.411029419339414,
"perplexity_stderr,none": 0.9371152123007664,
"acc,none": 0.2582961381719387,
"acc_stderr,none": 0.0060979842659205745
},
"commonsense_qa": {
"alias": "commonsense_qa",
"acc,none": 0.6060606060606061,
"acc_stderr,none": 0.013989198052984327
},
"mmlu": {
"acc,none": 0.6434268622703319,
"acc_stderr,none": 0.0037824170513249015,
"alias": "mmlu"
},
"openbookqa": {
"alias": "openbookqa",
"acc,none": 0.404,
"acc_stderr,none": 0.021966635293832883,
"acc_norm,none": 0.478,
"acc_norm_stderr,none": 0.02236139673920787
},
"winogrande": {
"alias": "winogrande",
"acc,none": 0.7513812154696132,
"acc_stderr,none": 0.012147314713403173
},
"triviaqa": {
"alias": "triviaqa",
"exact_match,remove_whitespace": 0.6104547481052163,
"exact_match_stderr,remove_whitespace": 0.0036404759558834486
},
"truthfulqa_mc1": {
"alias": "truthfulqa_mc1",
"acc,none": 0.38922888616891066,
"acc_stderr,none": 0.01706855268069044
},
"truthfulqa_mc2": {
"alias": "truthfulqa_mc2",
"acc,none": 0.5389966378633866,
"acc_stderr,none": 0.015062753562771725
}
},
"Mistral-Nemo-Instruct-2407-NVFP4-FP8": {
"coqa": {
"alias": "coqa",
"em,none": 0.5733333333333334,
"em_stderr,none": 0.019505122108457063,
"f1,none": 0.7346963191068078,
"f1_stderr,none": 0.015019925951016882
},
"hellaswag": {
"alias": "hellaswag",
"acc,none": 0.6239792869946226,
"acc_stderr,none": 0.004833953712521647,
"acc_norm,none": 0.8124875522804222,
"acc_norm_stderr,none": 0.0038952463204528333
},
"ifeval": {
"alias": "ifeval",
"prompt_level_strict_acc,none": 0.4195933456561922,
"prompt_level_strict_acc_stderr,none": 0.021236532548855144,
"inst_level_strict_acc,none": 0.5107913669064749,
"inst_level_strict_acc_stderr,none": "N/A",
"prompt_level_loose_acc,none": 0.49168207024029575,
"prompt_level_loose_acc_stderr,none": 0.02151359656402127,
"inst_level_loose_acc,none": 0.5767386091127098,
"inst_level_loose_acc_stderr,none": "N/A"
},
"lambada_openai": {
"alias": "lambada_openai",
"perplexity,none": 2.923326330881571,
"perplexity_stderr,none": 0.05419037402311029,
"acc,none": 0.7725596739763245,
"acc_stderr,none": 0.005839986255519642
},
"lambada_openai_cloze_yaml": {
"alias": "lambada_openai_cloze_yaml",
"perplexity,none": 26.694758918393248,
"perplexity_stderr,none": 0.6858369120887257,
"acc,none": 0.33165146516592275,
"acc_stderr,none": 0.0065592552732574244
},
"lambada_standard": {
"alias": "lambada_standard",
"perplexity,none": 3.551356285449324,
"perplexity_stderr,none": 0.07342709527808654,
"acc,none": 0.6926062487871143,
"acc_stderr,none": 0.006428398527904964
},
"lambada_standard_cloze_yaml": {
"alias": "lambada_standard_cloze_yaml",
"perplexity,none": 35.56146784383732,
"perplexity_stderr,none": 0.8740618781960072,
"acc,none": 0.2837182223947215,
"acc_stderr,none": 0.0062805494483705775
},
"commonsense_qa": {
"alias": "commonsense_qa",
"acc,none": 0.6208026208026208,
"acc_stderr,none": 0.013890872306969766
},
"mmlu": {
"acc,none": 0.6454208802164934,
"acc_stderr,none": 0.0037774970297410814,
"alias": "mmlu"
},
"openbookqa": {
"alias": "openbookqa",
"acc,none": 0.404,
"acc_stderr,none": 0.021966635293832883,
"acc_norm,none": 0.488,
"acc_norm_stderr,none": 0.022376626797927058
},
"winogrande": {
"alias": "winogrande",
"acc,none": 0.7545382794001578,
"acc_stderr,none": 0.01209527293718361
},
"triviaqa": {
"alias": "triviaqa",
"exact_match,remove_whitespace": 0.6184239857333927,
"exact_match_stderr,remove_whitespace": 0.003626487357735664
},
"truthfulqa_mc1": {
"alias": "truthfulqa_mc1",
"acc,none": 0.39167686658506734,
"acc_stderr,none": 0.017087795881769625
},
"truthfulqa_mc2": {
"alias": "truthfulqa_mc2",
"acc,none": 0.5475484445657804,
"acc_stderr,none": 0.015041357055984873
}
}
}