craffel HF Staff commited on
Commit
ec93cd0
·
verified ·
1 Parent(s): b75bb13

Upload nemotron_synthetic_1T_exp_300M/metrics.eval.jsonl with huggingface_hub

Browse files
nemotron_synthetic_1T_exp_300M/metrics.eval.jsonl CHANGED
@@ -1 +1,5 @@
1
- {"created_at": "2025-12-25T16:41:24.357637", "global_step": 150000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.1941031941031941, "acc_stderr,none": 0.011323381588920432}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4160525791674965, "acc_stderr,none": 0.004918951019183884, "acc_norm,none": 0.5433180641306513, "acc_norm_stderr,none": 0.004971019942726594}, "mmlu": {"acc,none": 0.24569149693775816, "acc_stderr,none": 0.0036257293561977725, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2507970244420829, "acc_stderr,none": 0.006322706684367274, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.03893259610604674}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2606060606060606, "acc_stderr,none": 0.03427743175816524}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.028867431449849313}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.25738396624472576, "acc_stderr,none": 0.028458820991460285}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2644628099173554, "acc_stderr,none": 0.04026187527591205}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.042365112580946336}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.294478527607362, "acc_stderr,none": 0.03581165790474082}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.26011560693641617, "acc_stderr,none": 0.02361867831006937}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2536312849162011, "acc_stderr,none": 0.01455155365936992}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2090032154340836, "acc_stderr,none": 0.023093140398374224}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.023468429832451152}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2516297262059974, "acc_stderr,none": 0.011083276280441898}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.28654970760233917, "acc_stderr,none": 0.034678266857038266}, "mmlu_other": {"acc,none": 0.2491149018345671, "acc_stderr,none": 0.007735422254021528, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2037735849056604, "acc_stderr,none": 0.024790784501775402}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.21965317919075145, "acc_stderr,none": 0.03156809362703175}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.29596412556053814, "acc_stderr,none": 0.030636591348699803}, "mmlu_management": {"alias": " - management", "acc,none": 0.2815533980582524, "acc_stderr,none": 0.04453254836326468}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2863247863247863, "acc_stderr,none": 0.02961432369045665}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23627075351213284, "acc_stderr,none": 0.015190473717037497}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.025261691219729477}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2624113475177305, "acc_stderr,none": 0.026244920349843007}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.15073529411764705, "acc_stderr,none": 0.021734235515652848}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3313253012048193, "acc_stderr,none": 0.036643147772880864}, "mmlu_social_sciences": {"acc,none": 0.2300942476438089, "acc_stderr,none": 0.007566451791980697, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.040493392977481425}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.20707070707070707, "acc_stderr,none": 0.028869778460267066}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.17098445595854922, "acc_stderr,none": 0.02717121368316455}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.19487179487179487, "acc_stderr,none": 0.020083167595181393}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.226890756302521, "acc_stderr,none": 0.027205371538279486}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22201834862385322, "acc_stderr,none": 0.01781884956479663}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25326797385620914, "acc_stderr,none": 0.017593486895366835}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.37272727272727274, "acc_stderr,none": 0.046313813194254635}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.17959183673469387, "acc_stderr,none": 0.024573293589585637}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.25870646766169153, "acc_stderr,none": 0.030965903123573026}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_stem": {"acc,none": 0.2499207104345068, "acc_stderr,none": 0.0076869570746087405, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.038201699145179055}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.034597776068105345}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.035868792800803406}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421296}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.04280105837364395}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2680851063829787, "acc_stderr,none": 0.028957342788342347}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2566137566137566, "acc_stderr,none": 0.022494510767503154}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25806451612903225, "acc_stderr,none": 0.02489246917246284}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.17733990147783252, "acc_stderr,none": 0.02687433727680835}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.41, "acc_stderr,none": 0.049431107042371025}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25555555555555554, "acc_stderr,none": 0.026593939101844086}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23178807947019867, "acc_stderr,none": 0.03445406271987054}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1712962962962963, "acc_stderr,none": 0.025695341643824678}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.35714285714285715, "acc_stderr,none": 0.04547960999764376}, "sciq": {"alias": "sciq", "acc,none": 0.899, "acc_stderr,none": 0.009533618929340983, "acc_norm,none": 0.856, "acc_norm_stderr,none": 0.01110798754893915}}
 
 
 
 
 
1
+ {"created_at": "2026-01-23T08:50:47.895782", "global_step": 30000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20802620802620803, "acc_stderr,none": 0.011620759575652383}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.38498307110137425, "acc_stderr,none": 0.004855968578998731, "acc_norm,none": 0.48894642501493724, "acc_norm_stderr,none": 0.004988561944277387}, "mmlu": {"acc,none": 0.25487822247543085, "acc_stderr,none": 0.0036724987902565, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2639744952178533, "acc_stderr,none": 0.0064216282402404885, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.15873015873015872, "acc_stderr,none": 0.03268454013011745}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.03477691162163659}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.029331162294251735}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2616033755274262, "acc_stderr,none": 0.028609516716994934}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.3305785123966942, "acc_stderr,none": 0.04294340845212094}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.043733130409147614}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.31901840490797545, "acc_stderr,none": 0.03661997551073836}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2832369942196532, "acc_stderr,none": 0.024257901705323378}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24916201117318434, "acc_stderr,none": 0.014465893829859935}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3086816720257235, "acc_stderr,none": 0.02623696588115327}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25308641975308643, "acc_stderr,none": 0.024191808600713002}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2627118644067797, "acc_stderr,none": 0.01124054551499567}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2573099415204678, "acc_stderr,none": 0.03352799844161865}, "mmlu_other": {"acc,none": 0.2507241712262633, "acc_stderr,none": 0.0077567891658918324, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.23773584905660378, "acc_stderr,none": 0.02619980880756193}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2254335260115607, "acc_stderr,none": 0.03186209851641143}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252605}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.29596412556053814, "acc_stderr,none": 0.0306365913486998}, "mmlu_management": {"alias": " - management", "acc,none": 0.23300970873786409, "acc_stderr,none": 0.04185832598928315}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.1794871794871795, "acc_stderr,none": 0.025140935950335452}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2796934865900383, "acc_stderr,none": 0.016050792148036543}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.02428861946604611}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2624113475177305, "acc_stderr,none": 0.026244920349843007}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.17647058823529413, "acc_stderr,none": 0.023157468308559366}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.27710843373493976, "acc_stderr,none": 0.034843315926805875}, "mmlu_social_sciences": {"acc,none": 0.23724406889827754, "acc_stderr,none": 0.007660893456449548, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.041424397194893624}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.20202020202020202, "acc_stderr,none": 0.028606204289229872}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.18134715025906736, "acc_stderr,none": 0.02780703236068609}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.21794871794871795, "acc_stderr,none": 0.020932445774463185}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.02665353159671548}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.24036697247706423, "acc_stderr,none": 0.01832060732096407}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22137404580152673, "acc_stderr,none": 0.03641297081313728}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.26633986928104575, "acc_stderr,none": 0.01788318813466721}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.35454545454545455, "acc_stderr,none": 0.045820048415054174}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.24081632653061225, "acc_stderr,none": 0.027372942201788163}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22885572139303484, "acc_stderr,none": 0.029705284056772436}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_stem": {"acc,none": 0.2626070409134158, "acc_stderr,none": 0.007833371075571666, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.32592592592592595, "acc_stderr,none": 0.040491220417025055}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.26973684210526316, "acc_stderr,none": 0.03611780560284898}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.035868792800803406}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.04158307533083286}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.23404255319148937, "acc_stderr,none": 0.02767845257821239}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.038061426873099935}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2724867724867725, "acc_stderr,none": 0.022930973071633356}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.267741935483871, "acc_stderr,none": 0.025189006660212385}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.0317852971064275}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.02646611753895991}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.26490066225165565, "acc_stderr,none": 0.03603038545360384}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18055555555555555, "acc_stderr,none": 0.026232878971491652}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.29464285714285715, "acc_stderr,none": 0.04327040932578728}, "sciq": {"alias": "sciq", "acc,none": 0.879, "acc_stderr,none": 0.010318210380946097, "acc_norm,none": 0.794, "acc_norm_stderr,none": 0.01279561361278656}}
2
+ {"created_at": "2026-01-23T12:59:45.130325", "global_step": 60000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.18263718263718265, "acc_stderr,none": 0.0110617062892271}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.398725353515236, "acc_stderr,none": 0.004886353563571858, "acc_norm,none": 0.5151364270065724, "acc_norm_stderr,none": 0.004987494455523723}, "mmlu": {"acc,none": 0.26378008830650906, "acc_stderr,none": 0.0037155827964648032, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.273538788522848, "acc_stderr,none": 0.006497659059701684, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.03718489006818115}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.03546563019624335}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.030964517926923403}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.270042194092827, "acc_stderr,none": 0.028900721906293426}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.3305785123966942, "acc_stderr,none": 0.04294340845212095}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.04077494709252628}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2883435582822086, "acc_stderr,none": 0.035590395316173425}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2947976878612717, "acc_stderr,none": 0.024547617794803838}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217889}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3054662379421222, "acc_stderr,none": 0.02616058445014048}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3117283950617284, "acc_stderr,none": 0.02577311116963044}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2757496740547588, "acc_stderr,none": 0.011413813609160986}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.25146198830409355, "acc_stderr,none": 0.033275044238468436}, "mmlu_other": {"acc,none": 0.25523012552301255, "acc_stderr,none": 0.00780996627596293, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.23018867924528302, "acc_stderr,none": 0.025907897122408173}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2832369942196532, "acc_stderr,none": 0.034355680560478746}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.35, "acc_stderr,none": 0.047937248544110196}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.21524663677130046, "acc_stderr,none": 0.027584066602208263}, "mmlu_management": {"alias": " - management", "acc,none": 0.1941747572815534, "acc_stderr,none": 0.03916667762822585}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2606837606837607, "acc_stderr,none": 0.028760348956523414}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.280970625798212, "acc_stderr,none": 0.01607312785122125}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.27124183006535946, "acc_stderr,none": 0.025457756696667867}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.25177304964539005, "acc_stderr,none": 0.0258921511567094}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1875, "acc_stderr,none": 0.023709788253811766}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.29518072289156627, "acc_stderr,none": 0.03550920185689629}, "mmlu_social_sciences": {"acc,none": 0.2541436464088398, "acc_stderr,none": 0.007846930871904547, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.04303684033537315}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25252525252525254, "acc_stderr,none": 0.030954055470365907}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.24352331606217617, "acc_stderr,none": 0.030975436386845436}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.21794871794871795, "acc_stderr,none": 0.020932445774463185}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2184873949579832, "acc_stderr,none": 0.026841514322958924}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.24770642201834864, "acc_stderr,none": 0.01850814360254782}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.25190839694656486, "acc_stderr,none": 0.03807387116306086}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2679738562091503, "acc_stderr,none": 0.017917974069594722}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2, "acc_stderr,none": 0.03831305140884601}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3142857142857143, "acc_stderr,none": 0.029719329422417468}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.27860696517412936, "acc_stderr,none": 0.031700561834973086}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_stem": {"acc,none": 0.26704725658103395, "acc_stderr,none": 0.00786983252737909, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3111111111111111, "acc_stderr,none": 0.03999262876617723}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.32894736842105265, "acc_stderr,none": 0.03823428969926605}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.18627450980392157, "acc_stderr,none": 0.03873958714149352}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.20425531914893616, "acc_stderr,none": 0.02635515841334942}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.3448275862068966, "acc_stderr,none": 0.03960933549451208}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.26455026455026454, "acc_stderr,none": 0.022717467897708624}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.24838709677419354, "acc_stderr,none": 0.024580028921481003}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.30049261083743845, "acc_stderr,none": 0.03225799476233484}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24814814814814815, "acc_stderr,none": 0.0263357394040558}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.31125827814569534, "acc_stderr,none": 0.03780445850526732}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.22685185185185186, "acc_stderr,none": 0.028561650102422266}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.24107142857142858, "acc_stderr,none": 0.04059867246952687}, "sciq": {"alias": "sciq", "acc,none": 0.879, "acc_stderr,none": 0.010318210380946094, "acc_norm,none": 0.815, "acc_norm_stderr,none": 0.012285191326386717}}
3
+ {"created_at": "2026-01-23T17:03:16.761716", "global_step": 90000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19164619164619165, "acc_stderr,none": 0.011268624978801626}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.41206930890260907, "acc_stderr,none": 0.004912015369160081, "acc_norm,none": 0.5361481776538538, "acc_norm_stderr,none": 0.0049767241248505605}, "mmlu": {"acc,none": 0.25380999857570147, "acc_stderr,none": 0.0036694208285347526, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.25866099893730077, "acc_stderr,none": 0.006379924021700904, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.03670066451047181}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.0340150671524904}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.02998373305591362}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2320675105485232, "acc_stderr,none": 0.027479744550808503}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2892561983471074, "acc_stderr,none": 0.041391127276354626}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.294478527607362, "acc_stderr,none": 0.03581165790474082}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2658959537572254, "acc_stderr,none": 0.023786203255508283}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.01433352205921789}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3279742765273312, "acc_stderr,none": 0.026664410886937606}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2932098765432099, "acc_stderr,none": 0.025329888171900922}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2438070404172099, "acc_stderr,none": 0.010966507972178475}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.30409356725146197, "acc_stderr,none": 0.035282112582452306}, "mmlu_other": {"acc,none": 0.2536208561313164, "acc_stderr,none": 0.007800796534329037, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.24150943396226415, "acc_stderr,none": 0.02634148037111836}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.23121387283236994, "acc_stderr,none": 0.0321473730202947}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.24663677130044842, "acc_stderr,none": 0.028930413120910894}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.03760178006026621}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.02723601394619668}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2886334610472541, "acc_stderr,none": 0.016203792703197797}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.025553169991826517}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2730496453900709, "acc_stderr,none": 0.026577860943307847}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.024562204314142317}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.25301204819277107, "acc_stderr,none": 0.03384429155233134}, "mmlu_social_sciences": {"acc,none": 0.24439389015274618, "acc_stderr,none": 0.007736661953217812, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2719298245614035, "acc_stderr,none": 0.04185774424022056}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.02962022787479045}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.18134715025906736, "acc_stderr,none": 0.02780703236068609}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2153846153846154, "acc_stderr,none": 0.020843034557462878}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.226890756302521, "acc_stderr,none": 0.02720537153827947}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23486238532110093, "acc_stderr,none": 0.01817511051034357}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.037683359597287434}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.018120224251484577}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.04350271442923243}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.31020408163265306, "acc_stderr,none": 0.029613459872484378}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.20398009950248755, "acc_stderr,none": 0.02849317624532607}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_stem": {"acc,none": 0.2559467174119886, "acc_stderr,none": 0.007766969144686967, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542129}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3111111111111111, "acc_stderr,none": 0.03999262876617722}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2565789473684211, "acc_stderr,none": 0.035541803680256896}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653697}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.04336432707993178}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.23404255319148937, "acc_stderr,none": 0.02767845257821238}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.036646663372252565}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.022182037202948365}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25161290322580643, "acc_stderr,none": 0.024685979286239956}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.27586206896551724, "acc_stderr,none": 0.03144712581678242}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.026842057873833713}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.26490066225165565, "acc_stderr,none": 0.03603038545360384}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.02541642838876748}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2767857142857143, "acc_stderr,none": 0.042466243366976256}, "sciq": {"alias": "sciq", "acc,none": 0.892, "acc_stderr,none": 0.009820001651345691, "acc_norm,none": 0.847, "acc_norm_stderr,none": 0.011389500459665544}}
4
+ {"created_at": "2026-01-23T22:56:25.212831", "global_step": 120000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.17854217854217855, "acc_stderr,none": 0.01096435614876877}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4267078271260705, "acc_stderr,none": 0.004935882666250474, "acc_norm,none": 0.5517825134435371, "acc_norm_stderr,none": 0.0049629497842360575}, "mmlu": {"acc,none": 0.26613018088591367, "acc_stderr,none": 0.0037234725698044036, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.261211477151966, "acc_stderr,none": 0.00640057115707359, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.040061680838488774}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.3090909090909091, "acc_stderr,none": 0.036085410115739666}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.030587591351604257}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.22784810126582278, "acc_stderr,none": 0.027303484599069422}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2644628099173554, "acc_stderr,none": 0.040261875275912073}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04330043749650742}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.26993865030674846, "acc_stderr,none": 0.0348782516849789}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.30057803468208094, "acc_stderr,none": 0.02468531686725781}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2346368715083799, "acc_stderr,none": 0.014173044098303675}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.28938906752411575, "acc_stderr,none": 0.025755865922632935}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.31790123456790126, "acc_stderr,none": 0.025910063528240868}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2457627118644068, "acc_stderr,none": 0.010996156635142692}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.27485380116959063, "acc_stderr,none": 0.03424042924691583}, "mmlu_other": {"acc,none": 0.2813002896684905, "acc_stderr,none": 0.008037463400748708, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.25660377358490566, "acc_stderr,none": 0.02688064788905197}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2138728323699422, "acc_stderr,none": 0.03126511206173044}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621505}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3542600896860987, "acc_stderr,none": 0.03210062154134986}, "mmlu_management": {"alias": " - management", "acc,none": 0.22330097087378642, "acc_stderr,none": 0.04123553189891431}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.28205128205128205, "acc_stderr,none": 0.02948036054954119}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2796934865900383, "acc_stderr,none": 0.016050792148036543}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.30392156862745096, "acc_stderr,none": 0.026336613469046654}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.3475177304964539, "acc_stderr,none": 0.028406627809590954}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.18382352941176472, "acc_stderr,none": 0.02352924218519311}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3313253012048193, "acc_stderr,none": 0.03664314777288085}, "mmlu_social_sciences": {"acc,none": 0.25836854078648036, "acc_stderr,none": 0.007869469413868757, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281335}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.20202020202020202, "acc_stderr,none": 0.02860620428922989}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19689119170984457, "acc_stderr,none": 0.028697873971860677}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.24358974358974358, "acc_stderr,none": 0.021763733684173916}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.24789915966386555, "acc_stderr,none": 0.028047967224176892}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23302752293577983, "acc_stderr,none": 0.018125669180861493}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.31297709923664124, "acc_stderr,none": 0.04066962905677697}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2565359477124183, "acc_stderr,none": 0.017667841612379}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.04265792110940589}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.35918367346938773, "acc_stderr,none": 0.030713560455108493}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.30845771144278605, "acc_stderr,none": 0.03265819588512699}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.31, "acc_stderr,none": 0.046482319871173156}, "mmlu_stem": {"acc,none": 0.2660957817951158, "acc_stderr,none": 0.007877367665705409, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720683}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2814814814814815, "acc_stderr,none": 0.038850042458002526}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.034597776068105365}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2013888888888889, "acc_stderr,none": 0.03353647469713839}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.04440521906179325}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.28936170212765955, "acc_stderr,none": 0.029644006577009618}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.30344827586206896, "acc_stderr,none": 0.038312260488503336}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25132275132275134, "acc_stderr,none": 0.022340482339643895}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25483870967741934, "acc_stderr,none": 0.024790118459332208}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2660098522167488, "acc_stderr,none": 0.031089826002937523}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322716}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2814814814814815, "acc_stderr,none": 0.027420019350945273}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.271523178807947, "acc_stderr,none": 0.03631329803969654}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.27314814814814814, "acc_stderr,none": 0.030388051301678116}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.29464285714285715, "acc_stderr,none": 0.043270409325787296}, "sciq": {"alias": "sciq", "acc,none": 0.895, "acc_stderr,none": 0.009698921026024964, "acc_norm,none": 0.851, "acc_norm_stderr,none": 0.01126614068463216}}
5
+ {"created_at": "2026-01-24T01:14:31.362967", "global_step": 150000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19082719082719082, "acc_stderr,none": 0.011250215810979035}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4351722764389564, "acc_stderr,none": 0.004947663206388457, "acc_norm,none": 0.5673172674765983, "acc_norm_stderr,none": 0.004944351065545866}, "mmlu": {"acc,none": 0.2630679390400228, "acc_stderr,none": 0.0037120517177874498, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2629117959617428, "acc_stderr,none": 0.006418849163687381, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.03893259610604671}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.03317505930009179}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.031321798030832904}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.25738396624472576, "acc_stderr,none": 0.028458820991460302}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.256198347107438, "acc_stderr,none": 0.03984979653302872}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094631}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.294478527607362, "acc_stderr,none": 0.03581165790474082}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.30057803468208094, "acc_stderr,none": 0.0246853168672578}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23910614525139665, "acc_stderr,none": 0.014265554192331152}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3054662379421222, "acc_stderr,none": 0.02616058445014048}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.025171041915309684}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25749674054758803, "acc_stderr,none": 0.011167706014904138}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.23976608187134502, "acc_stderr,none": 0.03274485211946956}, "mmlu_other": {"acc,none": 0.2729320888316704, "acc_stderr,none": 0.00797993682271875, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.26037735849056604, "acc_stderr,none": 0.027008766090708094}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.23699421965317918, "acc_stderr,none": 0.03242414757483098}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.34080717488789236, "acc_stderr,none": 0.03181149747055358}, "mmlu_management": {"alias": " - management", "acc,none": 0.23300970873786409, "acc_stderr,none": 0.041858325989283136}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.26495726495726496, "acc_stderr,none": 0.02891120880274946}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.27458492975734355, "acc_stderr,none": 0.015959829933084046}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.26143790849673204, "acc_stderr,none": 0.025160998214292456}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2978723404255319, "acc_stderr,none": 0.027281608344469414}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20955882352941177, "acc_stderr,none": 0.02472311040767706}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3674698795180723, "acc_stderr,none": 0.03753267402120575}, "mmlu_social_sciences": {"acc,none": 0.25901852453688656, "acc_stderr,none": 0.007885049616464856, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.30701754385964913, "acc_stderr,none": 0.043391383225798594}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.20707070707070707, "acc_stderr,none": 0.028869778460267066}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20207253886010362, "acc_stderr,none": 0.02897908979429673}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.24102564102564103, "acc_stderr,none": 0.021685546665333184}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2184873949579832, "acc_stderr,none": 0.02684151432295894}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.25137614678899084, "acc_stderr,none": 0.018599206360287415}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3053435114503817, "acc_stderr,none": 0.040393149787245626}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2581699346405229, "acc_stderr,none": 0.01770453165325007}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.04265792110940588}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3020408163265306, "acc_stderr,none": 0.029393609319879815}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.34328358208955223, "acc_stderr,none": 0.03357379665433431}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_stem": {"acc,none": 0.2575325087218522, "acc_stderr,none": 0.007778428011891612, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.32592592592592595, "acc_stderr,none": 0.040491220417025055}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19736842105263158, "acc_stderr,none": 0.03238981601699397}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.03551446610810826}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909284}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.04220773659171453}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2127659574468085, "acc_stderr,none": 0.026754391348039766}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.32413793103448274, "acc_stderr,none": 0.03900432069185554}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2566137566137566, "acc_stderr,none": 0.022494510767503154}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25483870967741934, "acc_stderr,none": 0.024790118459332208}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.3054187192118227, "acc_stderr,none": 0.03240661565868408}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2851851851851852, "acc_stderr,none": 0.027528599210340496}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.25165562913907286, "acc_stderr,none": 0.035433042343899844}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.026491914727355157}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.32142857142857145, "acc_stderr,none": 0.044328040552915185}, "sciq": {"alias": "sciq", "acc,none": 0.903, "acc_stderr,none": 0.009363689373248113, "acc_norm,none": 0.859, "acc_norm_stderr,none": 0.011010914595992441}}