| {"created_at": "2025-08-19T22:57:49.130559", "global_step": 2000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19574119574119575, "acc_stderr,none": 0.011359497363584395}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.2840071698864768, "acc_stderr,none": 0.0045001864244438, "acc_norm,none": 0.31159131647082255, "acc_norm_stderr,none": 0.0046219725241529635}, "mmlu": {"acc,none": 0.23030907278165502, "acc_stderr,none": 0.0035471766200796183, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24293304994686504, "acc_stderr,none": 0.006248825663670902, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04006168083848878}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.22424242424242424, "acc_stderr,none": 0.03256866661681102}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.03058759135160425}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.270042194092827, "acc_stderr,none": 0.028900721906293426}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.043733130409147614}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1832797427652733, "acc_stderr,none": 0.021974198848265816}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.21604938271604937, "acc_stderr,none": 0.02289916291844581}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2457627118644068, "acc_stderr,none": 0.01099615663514269}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.32748538011695905, "acc_stderr,none": 0.035993357714560276}, "mmlu_other": {"acc,none": 0.2404248471194078, "acc_stderr,none": 0.007647295985585333, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2188679245283019, "acc_stderr,none": 0.02544786382510861}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2138728323699422, "acc_stderr,none": 0.03126511206173043}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3183856502242152, "acc_stderr,none": 0.03126580522513713}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.03760178006026621}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674054}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24010217113665389, "acc_stderr,none": 0.015274685213734195}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.023929155517351294}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.23049645390070922, "acc_stderr,none": 0.025123739226872405}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.17647058823529413, "acc_stderr,none": 0.023157468308559328}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.21741956451088723, "acc_stderr,none": 0.007433171692889537, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748141}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.17676767676767677, "acc_stderr,none": 0.027178752639044915}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19689119170984457, "acc_stderr,none": 0.028697873971860677}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20512820512820512, "acc_stderr,none": 0.020473233173551975}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1908256880733945, "acc_stderr,none": 0.016847676400091105}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24673202614379086, "acc_stderr,none": 0.017440820367402507}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03955932861795833}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.19183673469387755, "acc_stderr,none": 0.025206963154225423}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24875621890547264, "acc_stderr,none": 0.030567675938916707}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_stem": {"acc,none": 0.21408182683158897, "acc_stderr,none": 0.007293387611854294, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2, "acc_stderr,none": 0.03455473702325435}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.17105263157894737, "acc_stderr,none": 0.030643607071677098}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.03716177437566017}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.26382978723404255, "acc_stderr,none": 0.02880998985410297}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.20634920634920634, "acc_stderr,none": 0.02084229093011467}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.18387096774193548, "acc_stderr,none": 0.02203721734026784}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.16748768472906403, "acc_stderr,none": 0.02627308604753542}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2111111111111111, "acc_stderr,none": 0.024882116857655113}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.19205298013245034, "acc_stderr,none": 0.032162984205936156}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1527777777777778, "acc_stderr,none": 0.02453632602613422}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.04364226155841044}, "sciq": {"alias": "sciq", "acc,none": 0.734, "acc_stderr,none": 0.013979965645145155, "acc_norm,none": 0.657, "acc_norm_stderr,none": 0.015019206922356951}} |
| {"created_at": "2025-08-20T00:51:09.304485", "global_step": 4000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20065520065520065, "acc_stderr,none": 0.011466011466011549}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.31995618402708625, "acc_stderr,none": 0.004655059308602626, "acc_norm,none": 0.369946225851424, "acc_norm_stderr,none": 0.004818031396138936}, "mmlu": {"acc,none": 0.2429853297251104, "acc_stderr,none": 0.0036165829697713246, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2412327311370882, "acc_stderr,none": 0.006235846646641958, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.30952380952380953, "acc_stderr,none": 0.04134913018303316}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.03477691162163659}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.30392156862745096, "acc_stderr,none": 0.032282103870378935}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2320675105485232, "acc_stderr,none": 0.027479744550808514}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2066115702479339, "acc_stderr,none": 0.036959801280988254}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25, "acc_stderr,none": 0.04186091791394607}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2085889570552147, "acc_stderr,none": 0.03192193448934722}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.023083658586984204}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808855}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.21221864951768488, "acc_stderr,none": 0.023222756797435132}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2037037037037037, "acc_stderr,none": 0.02240967454730417}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2438070404172099, "acc_stderr,none": 0.010966507972178475}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.21637426900584794, "acc_stderr,none": 0.03158149539338733}, "mmlu_other": {"acc,none": 0.24879304795622786, "acc_stderr,none": 0.007742769974070838, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.15, "acc_stderr,none": 0.0358870281282637}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2943396226415094, "acc_stderr,none": 0.028049186315695248}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3352601156069364, "acc_stderr,none": 0.03599586301247078}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2600896860986547, "acc_stderr,none": 0.029442495585857476}, "mmlu_management": {"alias": " - management", "acc,none": 0.2524271844660194, "acc_stderr,none": 0.04301250399690878}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.27350427350427353, "acc_stderr,none": 0.029202540153431177}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23116219667943805, "acc_stderr,none": 0.015075523238101074}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.20261437908496732, "acc_stderr,none": 0.02301544687798567}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24468085106382978, "acc_stderr,none": 0.02564555362226673}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.2536764705882353, "acc_stderr,none": 0.026431329870789534}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.21686746987951808, "acc_stderr,none": 0.03208284450356365}, "mmlu_social_sciences": {"acc,none": 0.25089372765680856, "acc_stderr,none": 0.007815879841050937, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2719298245614035, "acc_stderr,none": 0.04185774424022056}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.23737373737373738, "acc_stderr,none": 0.030313710538198896}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.27461139896373055, "acc_stderr,none": 0.032210245080411544}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2692307692307692, "acc_stderr,none": 0.022489389793654824}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3025210084033613, "acc_stderr,none": 0.02983796238829193}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23119266055045873, "acc_stderr,none": 0.018075750241633153}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.20610687022900764, "acc_stderr,none": 0.03547771004159463}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.26633986928104575, "acc_stderr,none": 0.017883188134667192}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.04013964554072775}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.24489795918367346, "acc_stderr,none": 0.027529637440174923}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.20398009950248755, "acc_stderr,none": 0.02849317624532608}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_stem": {"acc,none": 0.23215984776403426, "acc_stderr,none": 0.007512481194785094, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.03749850709174021}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.21710526315789475, "acc_stderr,none": 0.033550453048829226}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2916666666666667, "acc_stderr,none": 0.03800968060554858}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.17647058823529413, "acc_stderr,none": 0.03793281185307809}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.28936170212765955, "acc_stderr,none": 0.02964400657700962}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.22758620689655173, "acc_stderr,none": 0.03493950380131184}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21164021164021163, "acc_stderr,none": 0.021037331505262883}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.1967741935483871, "acc_stderr,none": 0.022616409420742025}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2512315270935961, "acc_stderr,none": 0.030516530732694433}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.22962962962962963, "acc_stderr,none": 0.025644108639267606}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2251655629139073, "acc_stderr,none": 0.03410435282008936}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.025416428388767478}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.29464285714285715, "acc_stderr,none": 0.0432704093257873}, "sciq": {"alias": "sciq", "acc,none": 0.796, "acc_stderr,none": 0.012749374359024379, "acc_norm,none": 0.734, "acc_norm_stderr,none": 0.013979965645145151}} |
| {"created_at": "2025-08-20T02:40:25.704230", "global_step": 6000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20556920556920558, "acc_stderr,none": 0.011569834551534304}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.33808006373232424, "acc_stderr,none": 0.0047208915971747294, "acc_norm,none": 0.4021111332403904, "acc_norm_stderr,none": 0.004893220635011784}, "mmlu": {"acc,none": 0.25786924939467315, "acc_stderr,none": 0.003669681006818659, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.23251859723698193, "acc_stderr,none": 0.006149107222316063, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.042163702135578345}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24848484848484848, "acc_stderr,none": 0.033744026441394036}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.030964517926923393}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.18143459915611815, "acc_stderr,none": 0.02508596114457965}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.1487603305785124, "acc_stderr,none": 0.032484700838071943}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.0395783547198098}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.18404907975460122, "acc_stderr,none": 0.030446777687971726}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2023121387283237, "acc_stderr,none": 0.021628077380196137}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24916201117318434, "acc_stderr,none": 0.01446589382985993}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.21221864951768488, "acc_stderr,none": 0.02322275679743512}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.19753086419753085, "acc_stderr,none": 0.02215288992789897}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24837027379400262, "acc_stderr,none": 0.011035212598034486}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.032180937956023566}, "mmlu_other": {"acc,none": 0.24460894753781784, "acc_stderr,none": 0.0076440755164602445, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.30566037735849055, "acc_stderr,none": 0.028353298073322666}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3179190751445087, "acc_stderr,none": 0.035506839891655796}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653697}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.13901345291479822, "acc_stderr,none": 0.023219352834474485}, "mmlu_management": {"alias": " - management", "acc,none": 0.3883495145631068, "acc_stderr,none": 0.0482572933735639}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.26495726495726496, "acc_stderr,none": 0.028911208802749472}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.18, "acc_stderr,none": 0.038612291966536955}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.1979565772669221, "acc_stderr,none": 0.014248873549217582}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.0255531699918265}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24113475177304963, "acc_stderr,none": 0.025518731049537773}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3382352941176471, "acc_stderr,none": 0.028739328513983576}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.18674698795180722, "acc_stderr,none": 0.030338749144500597}, "mmlu_social_sciences": {"acc,none": 0.292817679558011, "acc_stderr,none": 0.008154899343580017, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748141}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3434343434343434, "acc_stderr,none": 0.03383201223244442}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.32642487046632124, "acc_stderr,none": 0.033840286211432945}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3435897435897436, "acc_stderr,none": 0.024078696580635484}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3487394957983193, "acc_stderr,none": 0.030956636328566545}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.30642201834862387, "acc_stderr,none": 0.019765517220458523}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2900763358778626, "acc_stderr,none": 0.03980066246467766}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2107843137254902, "acc_stderr,none": 0.01650047297902478}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2, "acc_stderr,none": 0.03831305140884603}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.37551020408163266, "acc_stderr,none": 0.03100120903989484}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2537313432835821, "acc_stderr,none": 0.03076944496729601}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_stem": {"acc,none": 0.2746590548683793, "acc_stderr,none": 0.007926903683343174, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.03712537833614866}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.03782728980865469}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.037455547914624576}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.38, "acc_stderr,none": 0.048783173121456316}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.3627450980392157, "acc_stderr,none": 0.047840607041056527}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.22127659574468084, "acc_stderr,none": 0.027136349602424052}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2, "acc_stderr,none": 0.03333333333333329}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2724867724867725, "acc_stderr,none": 0.022930973071633356}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3, "acc_stderr,none": 0.026069362295335134}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2561576354679803, "acc_stderr,none": 0.030712730070982592}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.21, "acc_stderr,none": 0.04093601807403326}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.026842057873833706}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2913907284768212, "acc_stderr,none": 0.03710185726119994}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.36574074074074076, "acc_stderr,none": 0.03284738857647207}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.04287858751340456}, "sciq": {"alias": "sciq", "acc,none": 0.824, "acc_stderr,none": 0.01204861689859749, "acc_norm,none": 0.745, "acc_norm_stderr,none": 0.013790038620872825}} |
| {"created_at": "2025-08-20T04:19:50.706995", "global_step": 8000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19983619983619982, "acc_stderr,none": 0.011448447996728391}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3490340569607648, "acc_stderr,none": 0.004756905819649975, "acc_norm,none": 0.4250149372634933, "acc_norm_stderr,none": 0.004933349621589329}, "mmlu": {"acc,none": 0.23878364905284147, "acc_stderr,none": 0.00359393133606175, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.23910733262486716, "acc_stderr,none": 0.006215701607680643, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.03670066451047182}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.3090909090909091, "acc_stderr,none": 0.03608541011573967}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.029771775228145652}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.26582278481012656, "acc_stderr,none": 0.02875679962965834}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.1652892561983471, "acc_stderr,none": 0.03390780612972776}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.04373313040914761}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.19631901840490798, "acc_stderr,none": 0.031207970394709218}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.22832369942196531, "acc_stderr,none": 0.022598703804321614}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.22508038585209003, "acc_stderr,none": 0.023720088516179034}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2191358024691358, "acc_stderr,none": 0.023016705640262192}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2379400260756193, "acc_stderr,none": 0.010875700787694233}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.03508771929824565}, "mmlu_other": {"acc,none": 0.26005793369810104, "acc_stderr,none": 0.007857630162443367, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.27547169811320754, "acc_stderr,none": 0.027495663683724067}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.03095289021774988}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3273542600896861, "acc_stderr,none": 0.03149384670994131}, "mmlu_management": {"alias": " - management", "acc,none": 0.1650485436893204, "acc_stderr,none": 0.03675668832233188}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.27350427350427353, "acc_stderr,none": 0.029202540153431187}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.25798212005108556, "acc_stderr,none": 0.01564583018834895}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.238562091503268, "acc_stderr,none": 0.024404394928087866}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2801418439716312, "acc_stderr,none": 0.026789172351140235}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20220588235294118, "acc_stderr,none": 0.024398192986654924}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2891566265060241, "acc_stderr,none": 0.03529486801511115}, "mmlu_social_sciences": {"acc,none": 0.22391940201494961, "acc_stderr,none": 0.007521023662100252, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2719298245614035, "acc_stderr,none": 0.04185774424022057}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.1919191919191919, "acc_stderr,none": 0.028057791672989017}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.21761658031088082, "acc_stderr,none": 0.02977866303775295}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.22564102564102564, "acc_stderr,none": 0.021193632525148543}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23109243697478993, "acc_stderr,none": 0.02738140692786898}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.21284403669724772, "acc_stderr,none": 0.017549376389313694}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22900763358778625, "acc_stderr,none": 0.036853466317118506}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2173202614379085, "acc_stderr,none": 0.016684820929148598}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2636363636363636, "acc_stderr,none": 0.04220224692971987}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.20408163265306123, "acc_stderr,none": 0.0258012834750905}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.030360490154014666}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_stem": {"acc,none": 0.23184268950206152, "acc_stderr,none": 0.007504977893053721, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2, "acc_stderr,none": 0.034554737023254366}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.16447368421052633, "acc_stderr,none": 0.030167533468632695}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.17, "acc_stderr,none": 0.03775251680686371}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.21, "acc_stderr,none": 0.04093601807403326}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165065}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.32340425531914896, "acc_stderr,none": 0.03057944277361033}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2206896551724138, "acc_stderr,none": 0.03455930201924811}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.02218203720294837}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.21935483870967742, "acc_stderr,none": 0.023540799358723302}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.18226600985221675, "acc_stderr,none": 0.02716334085964515}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.02592887613276611}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2052980132450331, "acc_stderr,none": 0.03297986648473836}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.028353212866863448}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "sciq": {"alias": "sciq", "acc,none": 0.861, "acc_stderr,none": 0.01094526376104297, "acc_norm,none": 0.803, "acc_norm_stderr,none": 0.012583693787968142}} |
| {"created_at": "2025-08-20T05:53:56.596400", "global_step": 10000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20638820638820637, "acc_stderr,none": 0.011586881879177821}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3586934873531169, "acc_stderr,none": 0.004786368011500458, "acc_norm,none": 0.43766182035451107, "acc_norm_stderr,none": 0.004950848456984543}, "mmlu": {"acc,none": 0.2533114940891611, "acc_stderr,none": 0.0036622954087073335, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24803400637619555, "acc_stderr,none": 0.006286142202459778, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.3253968253968254, "acc_stderr,none": 0.04190596438871136}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.22424242424242424, "acc_stderr,none": 0.03256866661681102}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.03114557065948678}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.22362869198312235, "acc_stderr,none": 0.02712329820522997}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.1487603305785124, "acc_stderr,none": 0.03248470083807195}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.04077494709252627}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2392638036809816, "acc_stderr,none": 0.03351953879521269}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100178}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.27262569832402234, "acc_stderr,none": 0.01489339173524962}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.19614147909967847, "acc_stderr,none": 0.022552447780478036}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.1882716049382716, "acc_stderr,none": 0.021751866060815857}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.26140808344198174, "acc_stderr,none": 0.011222528169771316}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.26900584795321636, "acc_stderr,none": 0.034010526201040885}, "mmlu_other": {"acc,none": 0.2510460251046025, "acc_stderr,none": 0.0077410127936744086, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768077}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.27547169811320754, "acc_stderr,none": 0.027495663683724057}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.30057803468208094, "acc_stderr,none": 0.0349610148119118}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.19282511210762332, "acc_stderr,none": 0.026478240960489365}, "mmlu_management": {"alias": " - management", "acc,none": 0.3106796116504854, "acc_stderr,none": 0.04582124160161551}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.029343114798094462}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.20434227330779056, "acc_stderr,none": 0.014419123980931904}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.26143790849673204, "acc_stderr,none": 0.025160998214292456}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2375886524822695, "acc_stderr,none": 0.025389512552729903}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3713235294117647, "acc_stderr,none": 0.02934980313976587}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.20481927710843373, "acc_stderr,none": 0.03141784291663926}, "mmlu_social_sciences": {"acc,none": 0.26616834579135523, "acc_stderr,none": 0.00795497313138134, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.04303684033537316}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.29292929292929293, "acc_stderr,none": 0.03242497958178816}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.27461139896373055, "acc_stderr,none": 0.03221024508041154}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.34102564102564104, "acc_stderr,none": 0.024035489676335075}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.27310924369747897, "acc_stderr,none": 0.028942004040998174}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22385321100917432, "acc_stderr,none": 0.017871217767790226}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.037276735755969174}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24673202614379086, "acc_stderr,none": 0.0174408203674025}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2636363636363636, "acc_stderr,none": 0.04220224692971987}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.27346938775510204, "acc_stderr,none": 0.028535560337128448}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22885572139303484, "acc_stderr,none": 0.029705284056772432}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_stem": {"acc,none": 0.250872185220425, "acc_stderr,none": 0.007718532455966302, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.03673731683969506}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2565789473684211, "acc_stderr,none": 0.0355418036802569}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2916666666666667, "acc_stderr,none": 0.038009680605548574}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621503}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.04533838195929776}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.225531914893617, "acc_stderr,none": 0.027321078417387533}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2206896551724138, "acc_stderr,none": 0.03455930201924814}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.022182037202948368}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2, "acc_stderr,none": 0.02275520495954294}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2660098522167488, "acc_stderr,none": 0.031089826002937523}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.026067159222275788}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.31788079470198677, "acc_stderr,none": 0.038020397601079024}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2824074074074074, "acc_stderr,none": 0.03070137211151092}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.038946411200447915}, "sciq": {"alias": "sciq", "acc,none": 0.857, "acc_stderr,none": 0.011075814808567038, "acc_norm,none": 0.801, "acc_norm_stderr,none": 0.012631649083099184}} |
| {"created_at": "2025-08-20T07:21:28.109736", "global_step": 12000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20065520065520065, "acc_stderr,none": 0.011466011466011545}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.36516630153355906, "acc_stderr,none": 0.004804927608773125, "acc_norm,none": 0.45359490141406095, "acc_norm_stderr,none": 0.004968244611429385}, "mmlu": {"acc,none": 0.2357926221335992, "acc_stderr,none": 0.003578587416521992, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24675876726886292, "acc_stderr,none": 0.006281100063435267, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.0404061017820884}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24848484848484848, "acc_stderr,none": 0.03374402644139405}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.031145570659486782}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.26582278481012656, "acc_stderr,none": 0.028756799629658335}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094632}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2331288343558282, "acc_stderr,none": 0.03322015795776741}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2659217877094972, "acc_stderr,none": 0.01477676506643889}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1864951768488746, "acc_stderr,none": 0.022122439772480764}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.1882716049382716, "acc_stderr,none": 0.021751866060815844}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24511082138200782, "acc_stderr,none": 0.010986307870045512}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.30409356725146197, "acc_stderr,none": 0.03528211258245232}, "mmlu_other": {"acc,none": 0.24782748632121018, "acc_stderr,none": 0.007739320157606666, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21132075471698114, "acc_stderr,none": 0.025125766484827845}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2138728323699422, "acc_stderr,none": 0.03126511206173043}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909281}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.32286995515695066, "acc_stderr,none": 0.03138147637575498}, "mmlu_management": {"alias": " - management", "acc,none": 0.23300970873786409, "acc_stderr,none": 0.04185832598928315}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.27350427350427353, "acc_stderr,none": 0.029202540153431177}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24521072796934865, "acc_stderr,none": 0.015384352284543941}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.0239291555173513}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24468085106382978, "acc_stderr,none": 0.025645553622266733}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.024562204314142317}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.29518072289156627, "acc_stderr,none": 0.035509201856896294}, "mmlu_social_sciences": {"acc,none": 0.22261943451413715, "acc_stderr,none": 0.0075016088561265975, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748141}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.029857515673386407}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19689119170984457, "acc_stderr,none": 0.028697873971860677}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.21025641025641026, "acc_stderr,none": 0.020660597485026935}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.026265024608275886}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.2, "acc_stderr,none": 0.017149858514250937}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24836601307189543, "acc_stderr,none": 0.017479487001364764}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.04172343038705383}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.18775510204081633, "acc_stderr,none": 0.025000256039546212}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23880597014925373, "acc_stderr,none": 0.03014777593540922}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_stem": {"acc,none": 0.22042499207104346, "acc_stderr,none": 0.007374634319669131, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.17, "acc_stderr,none": 0.0377525168068637}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.1925925925925926, "acc_stderr,none": 0.03406542058502652}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.034597776068105365}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.19, "acc_stderr,none": 0.039427724440366255}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2553191489361702, "acc_stderr,none": 0.028504856470514192}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.22758620689655173, "acc_stderr,none": 0.03493950380131184}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.021132859182754458}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.18387096774193548, "acc_stderr,none": 0.022037217340267833}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.19704433497536947, "acc_stderr,none": 0.02798672466673622}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.02684205787383371}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.19205298013245034, "acc_stderr,none": 0.03216298420593614}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.025416428388767478}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "sciq": {"alias": "sciq", "acc,none": 0.868, "acc_stderr,none": 0.010709373963528033, "acc_norm,none": 0.801, "acc_norm_stderr,none": 0.012631649083099177}} |
| {"created_at": "2025-08-20T09:11:53.637514", "global_step": 14000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20966420966420968, "acc_stderr,none": 0.011654350093704639}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3724357697669787, "acc_stderr,none": 0.004824655406075561, "acc_norm,none": 0.4652459669388568, "acc_norm_stderr,none": 0.004977713073899324}, "mmlu": {"acc,none": 0.2452642073778664, "acc_stderr,none": 0.0036283997742277757, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24888416578108397, "acc_stderr,none": 0.006306710822310972, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.30158730158730157, "acc_stderr,none": 0.04104947269903394}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.033175059300091805}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.030964517926923413}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.28270042194092826, "acc_stderr,none": 0.029312814153955924}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.23140495867768596, "acc_stderr,none": 0.038498560987940904}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.043300437496507437}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.03291099578615769}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.26011560693641617, "acc_stderr,none": 0.02361867831006937}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24581005586592178, "acc_stderr,none": 0.014400296429225598}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2090032154340836, "acc_stderr,none": 0.023093140398374224}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24382716049382716, "acc_stderr,none": 0.023891879541959617}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2470664928292047, "acc_stderr,none": 0.011015752255279329}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.03377310252209195}, "mmlu_other": {"acc,none": 0.2417122626327647, "acc_stderr,none": 0.0076722980647266886, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2339622641509434, "acc_stderr,none": 0.026055296901152915}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.0326926380614177}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2062780269058296, "acc_stderr,none": 0.02715715047956382}, "mmlu_management": {"alias": " - management", "acc,none": 0.21359223300970873, "acc_stderr,none": 0.04058042015646035}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2863247863247863, "acc_stderr,none": 0.02961432369045665}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24265644955300128, "acc_stderr,none": 0.01532988894089987}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.26143790849673204, "acc_stderr,none": 0.025160998214292456}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.026684564340460994}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.18382352941176472, "acc_stderr,none": 0.023529242185193106}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.1746987951807229, "acc_stderr,none": 0.029560326211256854}, "mmlu_social_sciences": {"acc,none": 0.2346441338966526, "acc_stderr,none": 0.007639381891663755, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.04303684033537315}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.029857515673386414}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19689119170984457, "acc_stderr,none": 0.028697873971860677}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2230769230769231, "acc_stderr,none": 0.021107730127243995}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.226890756302521, "acc_stderr,none": 0.027205371538279483}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.21467889908256882, "acc_stderr,none": 0.017604304149256487}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2748091603053435, "acc_stderr,none": 0.039153454088478354}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.01774089950917779}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2818181818181818, "acc_stderr,none": 0.04309118709946459}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2, "acc_stderr,none": 0.025607375986579157}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23383084577114427, "acc_stderr,none": 0.02992941540834839}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_stem": {"acc,none": 0.2537266095781795, "acc_stderr,none": 0.007733910917181801, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.1925925925925926, "acc_stderr,none": 0.03406542058502651}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3092105263157895, "acc_stderr,none": 0.037610708698674805}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3125, "acc_stderr,none": 0.038760854559127644}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.17, "acc_stderr,none": 0.03775251680686371}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237654}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2170212765957447, "acc_stderr,none": 0.02694748312149622}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.3103448275862069, "acc_stderr,none": 0.03855289616378948}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.02306818884826112}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.23225806451612904, "acc_stderr,none": 0.02402225613030824}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2315270935960591, "acc_stderr,none": 0.029678333141444444}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.29259259259259257, "acc_stderr,none": 0.027738969632176088}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2847682119205298, "acc_stderr,none": 0.03684881521389023}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.19907407407407407, "acc_stderr,none": 0.02723229846269023}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.17857142857142858, "acc_stderr,none": 0.03635209121577806}, "sciq": {"alias": "sciq", "acc,none": 0.885, "acc_stderr,none": 0.010093407594904614, "acc_norm,none": 0.858, "acc_norm_stderr,none": 0.011043457699378229}} |
| {"created_at": "2025-08-20T10:46:05.470494", "global_step": 16000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19492219492219492, "acc_stderr,none": 0.011341478090883528}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3759211312487552, "acc_stderr,none": 0.004833699243292356, "acc_norm,none": 0.4759012148974308, "acc_norm_stderr,none": 0.004983982396187362}, "mmlu": {"acc,none": 0.24319897450505626, "acc_stderr,none": 0.0036187265971822816, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.23804463336875664, "acc_stderr,none": 0.006202914729205616, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.30158730158730157, "acc_stderr,none": 0.041049472699033945}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.19393939393939394, "acc_stderr,none": 0.03087414513656209}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.02998373305591361}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.24050632911392406, "acc_stderr,none": 0.027820781981149678}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.17355371900826447, "acc_stderr,none": 0.03457272836917671}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.03291099578615768}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2398843930635838, "acc_stderr,none": 0.02298959254312356}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.25251396648044694, "acc_stderr,none": 0.014530330201468645}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.18971061093247588, "acc_stderr,none": 0.022268196258783218}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.1882716049382716, "acc_stderr,none": 0.021751866060815868}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24511082138200782, "acc_stderr,none": 0.010986307870045517}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.30409356725146197, "acc_stderr,none": 0.035282112582452306}, "mmlu_other": {"acc,none": 0.24557450917283552, "acc_stderr,none": 0.0077112358433221884, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.17, "acc_stderr,none": 0.03775251680686371}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.23018867924528302, "acc_stderr,none": 0.025907897122408173}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.0326926380614177}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2645739910313901, "acc_stderr,none": 0.02960510321703832}, "mmlu_management": {"alias": " - management", "acc,none": 0.23300970873786409, "acc_stderr,none": 0.041858325989283164}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.28205128205128205, "acc_stderr,none": 0.029480360549541194}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.37, "acc_stderr,none": 0.048523658709391}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2567049808429119, "acc_stderr,none": 0.015620480263064519}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.21895424836601307, "acc_stderr,none": 0.02367908986180772}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.25177304964539005, "acc_stderr,none": 0.025892151156709405}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.19117647058823528, "acc_stderr,none": 0.02388688192244033}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.22289156626506024, "acc_stderr,none": 0.03240004825594687}, "mmlu_social_sciences": {"acc,none": 0.24341891452713682, "acc_stderr,none": 0.007737386997836016, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2719298245614035, "acc_stderr,none": 0.041857744240220575}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25252525252525254, "acc_stderr,none": 0.03095405547036592}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.3005181347150259, "acc_stderr,none": 0.033088185944157494}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2564102564102564, "acc_stderr,none": 0.02213908110397153}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.226890756302521, "acc_stderr,none": 0.02720537153827947}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.20550458715596331, "acc_stderr,none": 0.017324352325016015}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.20610687022900764, "acc_stderr,none": 0.035477710041594626}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2565359477124183, "acc_stderr,none": 0.017667841612378988}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.04265792110940589}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.23265306122448978, "acc_stderr,none": 0.027049257915896175}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23880597014925373, "acc_stderr,none": 0.030147775935409217}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_stem": {"acc,none": 0.2483349191246432, "acc_stderr,none": 0.0077006492678361295, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.22962962962962963, "acc_stderr,none": 0.03633384414073462}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2565789473684211, "acc_stderr,none": 0.0355418036802569}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.03716177437566017}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.28431372549019607, "acc_stderr,none": 0.04488482852329017}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2425531914893617, "acc_stderr,none": 0.028020226271200214}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2689655172413793, "acc_stderr,none": 0.036951833116502325}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2328042328042328, "acc_stderr,none": 0.021765961672154527}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.22258064516129034, "acc_stderr,none": 0.023664216671642525}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.20689655172413793, "acc_stderr,none": 0.02850137816789395}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.025928876132766118}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2913907284768212, "acc_stderr,none": 0.03710185726119995}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.028765111718046948}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.25892857142857145, "acc_stderr,none": 0.041577515398656284}, "sciq": {"alias": "sciq", "acc,none": 0.867, "acc_stderr,none": 0.01074366913239736, "acc_norm,none": 0.818, "acc_norm_stderr,none": 0.012207580637662179}} |
| {"created_at": "2025-08-20T12:45:42.457086", "global_step": 18000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.1850941850941851, "acc_stderr,none": 0.01111911394255986}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.377414857598088, "acc_stderr,none": 0.004837493439874295, "acc_norm,none": 0.4823740290778729, "acc_norm_stderr,none": 0.004986680048438315}, "mmlu": {"acc,none": 0.24462327303802878, "acc_stderr,none": 0.0036280566136677743, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24824654622741765, "acc_stderr,none": 0.006299219759196196, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.1984126984126984, "acc_stderr,none": 0.03567016675276863}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23030303030303031, "acc_stderr,none": 0.0328766675860349}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.030587591351604257}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.24050632911392406, "acc_stderr,none": 0.02782078198114968}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.256198347107438, "acc_stderr,none": 0.03984979653302872}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2037037037037037, "acc_stderr,none": 0.03893542518824849}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.03291099578615768}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2774566473988439, "acc_stderr,none": 0.024105712607754307}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.21864951768488747, "acc_stderr,none": 0.02347558141786111}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24382716049382716, "acc_stderr,none": 0.02389187954195961}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2529335071707953, "acc_stderr,none": 0.011102268713839987}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.035650796707083106}, "mmlu_other": {"acc,none": 0.25329900225297713, "acc_stderr,none": 0.00780101685314665, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22264150943396227, "acc_stderr,none": 0.025604233470899098}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.23121387283236994, "acc_stderr,none": 0.032147373020294696}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.23766816143497757, "acc_stderr,none": 0.028568079464714267}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.03760178006026621}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.26495726495726496, "acc_stderr,none": 0.02891120880274948}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2720306513409962, "acc_stderr,none": 0.01591336744750052}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.24183006535947713, "acc_stderr,none": 0.024518195641879334}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2624113475177305, "acc_stderr,none": 0.02624492034984302}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.27205882352941174, "acc_stderr,none": 0.02703304115168146}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.25301204819277107, "acc_stderr,none": 0.03384429155233134}, "mmlu_social_sciences": {"acc,none": 0.22749431264218395, "acc_stderr,none": 0.007566203628288039, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.040493392977481425}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.029857515673386407}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.23316062176165803, "acc_stderr,none": 0.030516111371476008}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2128205128205128, "acc_stderr,none": 0.020752423722128013}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.226890756302521, "acc_stderr,none": 0.027205371538279483}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22935779816513763, "acc_stderr,none": 0.018025349724618684}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.21374045801526717, "acc_stderr,none": 0.0359546161177469}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25, "acc_stderr,none": 0.01751781884501444}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.20909090909090908, "acc_stderr,none": 0.03895091015724137}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2, "acc_stderr,none": 0.025607375986579157}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23880597014925373, "acc_stderr,none": 0.030147775935409217}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.19, "acc_stderr,none": 0.039427724440366255}, "mmlu_stem": {"acc,none": 0.24738344433872503, "acc_stderr,none": 0.007687603527242145, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.035914440841969694}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3026315789473684, "acc_stderr,none": 0.037385206761196686}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.21, "acc_stderr,none": 0.04093601807403326}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.19607843137254902, "acc_stderr,none": 0.03950581861179962}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.22127659574468084, "acc_stderr,none": 0.027136349602424066}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2827586206896552, "acc_stderr,none": 0.03752833958003336}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.022182037202948368}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.22903225806451613, "acc_stderr,none": 0.023904914311782644}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.22167487684729065, "acc_stderr,none": 0.029225575892489614}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.026466117538959912}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2582781456953642, "acc_stderr,none": 0.035737053147634576}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.02876511171804696}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.19642857142857142, "acc_stderr,none": 0.03770970049347019}, "sciq": {"alias": "sciq", "acc,none": 0.884, "acc_stderr,none": 0.010131468138756988, "acc_norm,none": 0.843, "acc_norm_stderr,none": 0.011510146979230185}} |
| {"created_at": "2025-08-20T14:44:38.842152", "global_step": 20000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19000819000819, "acc_stderr,none": 0.011231727519127854}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3846843258315077, "acc_stderr,none": 0.004855262903270809, "acc_norm,none": 0.4939255128460466, "acc_norm_stderr,none": 0.004989413158034798}, "mmlu": {"acc,none": 0.25544794188861986, "acc_stderr,none": 0.0036779528707566542, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24888416578108397, "acc_stderr,none": 0.006304207338634915, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.03932537680392871}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2, "acc_stderr,none": 0.031234752377721175}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.28921568627450983, "acc_stderr,none": 0.031822318676475544}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.25738396624472576, "acc_stderr,none": 0.028458820991460312}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2231404958677686, "acc_stderr,none": 0.03800754475228733}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25, "acc_stderr,none": 0.04186091791394607}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2392638036809816, "acc_stderr,none": 0.03351953879521269}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.25722543352601157, "acc_stderr,none": 0.023532925431044287}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2670391061452514, "acc_stderr,none": 0.014796502622562557}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2379421221864952, "acc_stderr,none": 0.024185150647818707}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2623456790123457, "acc_stderr,none": 0.024477222856135114}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.23076923076923078, "acc_stderr,none": 0.01076084058447169}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.03508771929824562}, "mmlu_other": {"acc,none": 0.26359832635983266, "acc_stderr,none": 0.0079028935135686, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165065}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.24150943396226415, "acc_stderr,none": 0.026341480371118352}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2543352601156069, "acc_stderr,none": 0.0332055644308557}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.21973094170403587, "acc_stderr,none": 0.027790177064383605}, "mmlu_management": {"alias": " - management", "acc,none": 0.1941747572815534, "acc_stderr,none": 0.03916667762822585}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3034188034188034, "acc_stderr,none": 0.030118210106942635}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2784163473818646, "acc_stderr,none": 0.016028295188992462}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.025646863097137908}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.26595744680851063, "acc_stderr,none": 0.0263580656988806}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.027678468642144696}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.22289156626506024, "acc_stderr,none": 0.03240004825594688}, "mmlu_social_sciences": {"acc,none": 0.25024374390640236, "acc_stderr,none": 0.0077936080442831375, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.0383515395439942}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.029857515673386414}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.23316062176165803, "acc_stderr,none": 0.030516111371476008}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2153846153846154, "acc_stderr,none": 0.020843034557462878}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23949579831932774, "acc_stderr,none": 0.027722065493361276}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23486238532110093, "acc_stderr,none": 0.01817511051034359}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22137404580152673, "acc_stderr,none": 0.03641297081313729}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.0180540274588152}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.20909090909090908, "acc_stderr,none": 0.03895091015724137}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3346938775510204, "acc_stderr,none": 0.030209235226242307}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.31840796019900497, "acc_stderr,none": 0.03294118479054095}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_stem": {"acc,none": 0.2622898826514431, "acc_stderr,none": 0.007830287572920147, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.03944624162501116}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3223684210526316, "acc_stderr,none": 0.03803510248351586}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2638888888888889, "acc_stderr,none": 0.03685651095897532}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768077}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.1829787234042553, "acc_stderr,none": 0.02527604100044998}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.30344827586206896, "acc_stderr,none": 0.038312260488503336}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2724867724867725, "acc_stderr,none": 0.022930973071633356}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.23548387096774193, "acc_stderr,none": 0.02413763242933771}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.270935960591133, "acc_stderr,none": 0.03127090713297698}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.35, "acc_stderr,none": 0.04793724854411019}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.026719240783712163}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23841059602649006, "acc_stderr,none": 0.03479185572599661}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2638888888888889, "acc_stderr,none": 0.030058202704309846}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.22321428571428573, "acc_stderr,none": 0.039523019677025116}, "sciq": {"alias": "sciq", "acc,none": 0.886, "acc_stderr,none": 0.01005510343582333, "acc_norm,none": 0.848, "acc_norm_stderr,none": 0.011358918303475284}} |
| {"created_at": "2025-08-20T15:56:06.136450", "global_step": 22000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21048321048321048, "acc_stderr,none": 0.01167103843652291}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.38478390758812986, "acc_stderr,none": 0.004855498343308383, "acc_norm,none": 0.4949213304122685, "acc_norm_stderr,none": 0.004989524003092438}, "mmlu": {"acc,none": 0.23301523999430282, "acc_stderr,none": 0.0035621640504661417, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24399574920297556, "acc_stderr,none": 0.006255401528069935, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.30952380952380953, "acc_stderr,none": 0.04134913018303316}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23030303030303031, "acc_stderr,none": 0.032876667586034886}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.030778554678693268}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2742616033755274, "acc_stderr,none": 0.029041333510598035}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094632}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2147239263803681, "acc_stderr,none": 0.03226219377286774}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1832797427652733, "acc_stderr,none": 0.021974198848265823}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.022779719088733393}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24641460234680573, "acc_stderr,none": 0.011005971399927235}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3391812865497076, "acc_stderr,none": 0.036310534964889056}, "mmlu_other": {"acc,none": 0.24460894753781784, "acc_stderr,none": 0.007696401254010827, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22264150943396227, "acc_stderr,none": 0.025604233470899098}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2138728323699422, "acc_stderr,none": 0.03126511206173043}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3183856502242152, "acc_stderr,none": 0.03126580522513713}, "mmlu_management": {"alias": " - management", "acc,none": 0.18446601941747573, "acc_stderr,none": 0.03840423627288276}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674054}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2503192848020434, "acc_stderr,none": 0.015491088951494583}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.023805186524888135}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.23049645390070922, "acc_stderr,none": 0.025123739226872402}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.18382352941176472, "acc_stderr,none": 0.023529242185193106}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.21741956451088723, "acc_stderr,none": 0.007433012174251382, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281335}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.17676767676767677, "acc_stderr,none": 0.027178752639044915}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19689119170984457, "acc_stderr,none": 0.028697873971860677}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20512820512820512, "acc_stderr,none": 0.02047323317355198}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1926605504587156, "acc_stderr,none": 0.016909276884936094}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25, "acc_stderr,none": 0.01751781884501444}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03955932861795833}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.18775510204081633, "acc_stderr,none": 0.025000256039546212}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.03036049015401466}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_stem": {"acc,none": 0.22042499207104346, "acc_stderr,none": 0.007375404265362751, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.0335567721631314}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19078947368421054, "acc_stderr,none": 0.031975658210325}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617746}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2680851063829787, "acc_stderr,none": 0.028957342788342347}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.23448275862068965, "acc_stderr,none": 0.035306258743465914}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.021411684393694203}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.18064516129032257, "acc_stderr,none": 0.021886178567172548}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.19704433497536947, "acc_stderr,none": 0.02798672466673622}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.02646611753895991}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2119205298013245, "acc_stderr,none": 0.033367670865679766}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1574074074074074, "acc_stderr,none": 0.02483717351824239}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.04364226155841044}, "sciq": {"alias": "sciq", "acc,none": 0.884, "acc_stderr,none": 0.010131468138756995, "acc_norm,none": 0.841, "acc_norm_stderr,none": 0.011569479368271303}} |
| {"created_at": "2025-08-20T17:22:07.252778", "global_step": 24000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20393120393120392, "acc_stderr,none": 0.011535521334313655}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.39533957379008167, "acc_stderr,none": 0.004879242848473459, "acc_norm,none": 0.5024895439155547, "acc_norm_stderr,none": 0.0049897195594399}, "mmlu": {"acc,none": 0.2630679390400228, "acc_stderr,none": 0.003686577084588165, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2363443145589798, "acc_stderr,none": 0.006180819077035378, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.36507936507936506, "acc_stderr,none": 0.04306241259127153}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.03401506715249039}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25, "acc_stderr,none": 0.03039153369274154}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.1940928270042194, "acc_stderr,none": 0.025744902532290934}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.1322314049586777, "acc_stderr,none": 0.030922788320445812}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.19444444444444445, "acc_stderr,none": 0.038260763248848646}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2147239263803681, "acc_stderr,none": 0.032262193772867744}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.20520231213872833, "acc_stderr,none": 0.021742519835276287}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2536312849162011, "acc_stderr,none": 0.014551553659369918}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2508038585209003, "acc_stderr,none": 0.02461977195669716}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.023132376234543346}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24641460234680573, "acc_stderr,none": 0.011005971399927239}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.1695906432748538, "acc_stderr,none": 0.028782108105401712}, "mmlu_other": {"acc,none": 0.25329900225297713, "acc_stderr,none": 0.007695013893439755, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3018867924528302, "acc_stderr,none": 0.02825420034443866}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3468208092485549, "acc_stderr,none": 0.036291466701596636}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.12556053811659193, "acc_stderr,none": 0.022238985469323767}, "mmlu_management": {"alias": " - management", "acc,none": 0.36893203883495146, "acc_stderr,none": 0.04777615181156739}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2264957264957265, "acc_stderr,none": 0.027421007295392912}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.210727969348659, "acc_stderr,none": 0.014583812465862551}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.025646863097137918}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.25177304964539005, "acc_stderr,none": 0.0258921511567094}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.4227941176470588, "acc_stderr,none": 0.030008562845003476}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.18072289156626506, "acc_stderr,none": 0.029955737855810138}, "mmlu_social_sciences": {"acc,none": 0.29249268768280795, "acc_stderr,none": 0.008163553525441977, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.039994238792813365}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3282828282828283, "acc_stderr,none": 0.033456784227567746}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.3316062176165803, "acc_stderr,none": 0.03397636541089116}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.023901157979402534}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3067226890756303, "acc_stderr,none": 0.02995382389188704}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3119266055045872, "acc_stderr,none": 0.019862967976707245}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3053435114503817, "acc_stderr,none": 0.04039314978724561}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.20915032679738563, "acc_stderr,none": 0.01645339933227933}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.041723430387053825}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3795918367346939, "acc_stderr,none": 0.03106721126287248}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.27860696517412936, "acc_stderr,none": 0.031700561834973086}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_stem": {"acc,none": 0.28385664446558834, "acc_stderr,none": 0.007971038162688114, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816507}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.21481481481481482, "acc_stderr,none": 0.03547854198560827}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3355263157894737, "acc_stderr,none": 0.03842498559395268}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.03551446610810826}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.4, "acc_stderr,none": 0.049236596391733084}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.36, "acc_stderr,none": 0.048241815132442176}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621503}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.39215686274509803, "acc_stderr,none": 0.04858083574266347}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.19574468085106383, "acc_stderr,none": 0.025937853139977148}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.02286083830923207}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3064516129032258, "acc_stderr,none": 0.026226485652553873}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.270935960591133, "acc_stderr,none": 0.031270907132976984}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2814814814814815, "acc_stderr,none": 0.027420019350945273}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.32450331125827814, "acc_stderr,none": 0.038227469376587525}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.4074074074074074, "acc_stderr,none": 0.03350991604696043}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.16964285714285715, "acc_stderr,none": 0.0356236785009539}, "sciq": {"alias": "sciq", "acc,none": 0.891, "acc_stderr,none": 0.009859828407037181, "acc_norm,none": 0.841, "acc_norm_stderr,none": 0.011569479368271294}} |
| {"created_at": "2025-08-20T19:10:04.186974", "global_step": 26000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2031122031122031, "acc_stderr,none": 0.011518254793634117}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.391256721768572, "acc_stderr,none": 0.004870342592915051, "acc_norm,none": 0.5013941445927106, "acc_norm_stderr,none": 0.004989762014739187}, "mmlu": {"acc,none": 0.25373878364905283, "acc_stderr,none": 0.003655245503973541, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2405951115834219, "acc_stderr,none": 0.0062203195697189476, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.373015873015873, "acc_stderr,none": 0.04325506042017086}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.0340150671524904}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.029771775228145638}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2109704641350211, "acc_stderr,none": 0.02655837250266192}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.1652892561983471, "acc_stderr,none": 0.03390780612972776}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2037037037037037, "acc_stderr,none": 0.03893542518824847}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2085889570552147, "acc_stderr,none": 0.031921934489347235}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.1936416184971098, "acc_stderr,none": 0.021274230317515557}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2659217877094972, "acc_stderr,none": 0.014776765066438888}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.24437299035369775, "acc_stderr,none": 0.0244061620946689}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2191358024691358, "acc_stderr,none": 0.02301670564026219}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2503259452411995, "acc_stderr,none": 0.01106415102716544}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.19298245614035087, "acc_stderr,none": 0.030267457554898465}, "mmlu_other": {"acc,none": 0.251689732861281, "acc_stderr,none": 0.007698720006718212, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3018867924528302, "acc_stderr,none": 0.02825420034443866}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.31213872832369943, "acc_stderr,none": 0.035331333893236574}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.14798206278026907, "acc_stderr,none": 0.023831557157613526}, "mmlu_management": {"alias": " - management", "acc,none": 0.30097087378640774, "acc_stderr,none": 0.04541609446503947}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2264957264957265, "acc_stderr,none": 0.02742100729539291}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384739}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.1966794380587484, "acc_stderr,none": 0.01421413855691391}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.26143790849673204, "acc_stderr,none": 0.025160998214292456}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2695035460992908, "acc_stderr,none": 0.02646903681859063}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.41544117647058826, "acc_stderr,none": 0.02993534270787775}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.1746987951807229, "acc_stderr,none": 0.02956032621125684}, "mmlu_social_sciences": {"acc,none": 0.2632434189145271, "acc_stderr,none": 0.00791601743595967, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.30701754385964913, "acc_stderr,none": 0.0433913832257986}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3282828282828283, "acc_stderr,none": 0.03345678422756775}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.30569948186528495, "acc_stderr,none": 0.033248379397581594}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2846153846153846, "acc_stderr,none": 0.022878322799706287}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.02755361446786381}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.28990825688073396, "acc_stderr,none": 0.019453066609201597}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.20610687022900764, "acc_stderr,none": 0.03547771004159464}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.20261437908496732, "acc_stderr,none": 0.01626105528374611}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.34545454545454546, "acc_stderr,none": 0.04554619617541054}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.23673469387755103, "acc_stderr,none": 0.02721283588407315}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2537313432835821, "acc_stderr,none": 0.030769444967296014}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_stem": {"acc,none": 0.2660957817951158, "acc_stderr,none": 0.007849518993320514, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653695}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.21481481481481482, "acc_stderr,none": 0.03547854198560827}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.0378272898086547}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2638888888888889, "acc_stderr,none": 0.03685651095897532}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.04690650298201942}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2170212765957447, "acc_stderr,none": 0.026947483121496228}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.1724137931034483, "acc_stderr,none": 0.03147830790259575}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02256989707491841}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.267741935483871, "acc_stderr,none": 0.025189006660212385}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2561576354679803, "acc_stderr,none": 0.030712730070982592}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.02684205787383371}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2781456953642384, "acc_stderr,none": 0.03658603262763743}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.3472222222222222, "acc_stderr,none": 0.032468872436376486}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2767857142857143, "acc_stderr,none": 0.042466243366976256}, "sciq": {"alias": "sciq", "acc,none": 0.882, "acc_stderr,none": 0.010206869264381791, "acc_norm,none": 0.819, "acc_norm_stderr,none": 0.01218143617917791}} |
| {"created_at": "2025-08-20T20:40:32.650983", "global_step": 28000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19737919737919737, "acc_stderr,none": 0.011395305685091192}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.39762995419239194, "acc_stderr,none": 0.004884079750433897, "acc_norm,none": 0.5137422824138618, "acc_norm_stderr,none": 0.0049878964117036715}, "mmlu": {"acc,none": 0.2357926221335992, "acc_stderr,none": 0.003578374078281677, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24378320935175346, "acc_stderr,none": 0.0062619055735923515, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.04040610178208841}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23030303030303031, "acc_stderr,none": 0.0328766675860349}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25, "acc_stderr,none": 0.03039153369274154}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.270042194092827, "acc_stderr,none": 0.028900721906293426}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.23140495867768596, "acc_stderr,none": 0.038498560987940904}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04330043749650742}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.0230836585869842}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24581005586592178, "acc_stderr,none": 0.014400296429225598}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.19614147909967847, "acc_stderr,none": 0.022552447780478036}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.22839506172839505, "acc_stderr,none": 0.023358211840626267}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24511082138200782, "acc_stderr,none": 0.010986307870045512}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.28654970760233917, "acc_stderr,none": 0.03467826685703826}, "mmlu_other": {"acc,none": 0.251689732861281, "acc_stderr,none": 0.0077737102867124945, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.24528301886792453, "acc_stderr,none": 0.026480357179895678}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.030952890217749884}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.33183856502242154, "acc_stderr,none": 0.03160295143776679}, "mmlu_management": {"alias": " - management", "acc,none": 0.1941747572815534, "acc_stderr,none": 0.03916667762822583}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2948717948717949, "acc_stderr,none": 0.029872577708891165}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24776500638569604, "acc_stderr,none": 0.015438083080568958}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.02463004897982479}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24113475177304963, "acc_stderr,none": 0.025518731049537762}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1875, "acc_stderr,none": 0.023709788253811766}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.25903614457831325, "acc_stderr,none": 0.034106466140718564}, "mmlu_social_sciences": {"acc,none": 0.2216444588885278, "acc_stderr,none": 0.007486790563455527, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.19696969696969696, "acc_stderr,none": 0.028335609732463355}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19170984455958548, "acc_stderr,none": 0.028408953626245282}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20512820512820512, "acc_stderr,none": 0.020473233173551975}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.21100917431192662, "acc_stderr,none": 0.017493922404112648}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2900763358778626, "acc_stderr,none": 0.03980066246467765}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24183006535947713, "acc_stderr,none": 0.017322789207784326}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.04013964554072775}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.18775510204081633, "acc_stderr,none": 0.025000256039546212}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.03036049015401466}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_stem": {"acc,none": 0.22201078338090707, "acc_stderr,none": 0.007388215423070526, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.1925925925925926, "acc_stderr,none": 0.03406542058502653}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19078947368421054, "acc_stderr,none": 0.03197565821032499}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.03551446610810826}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768081}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165065}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.042207736591714534}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.28085106382978725, "acc_stderr,none": 0.029379170464124818}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.23448275862068965, "acc_stderr,none": 0.035306258743465914}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21693121693121692, "acc_stderr,none": 0.02122708244944506}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.17419354838709677, "acc_stderr,none": 0.02157624818451457}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.18719211822660098, "acc_stderr,none": 0.027444924966882618}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.027309140588230172}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.1986754966887417, "acc_stderr,none": 0.03257847384436776}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1388888888888889, "acc_stderr,none": 0.023585447368900128}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.25892857142857145, "acc_stderr,none": 0.041577515398656284}, "sciq": {"alias": "sciq", "acc,none": 0.887, "acc_stderr,none": 0.010016552866696834, "acc_norm,none": 0.829, "acc_norm_stderr,none": 0.011912216456264583}} |
| {"created_at": "2025-08-20T22:50:38.631429", "global_step": 30000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19737919737919737, "acc_stderr,none": 0.011395305685091195}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3992232622983469, "acc_stderr,none": 0.004887378682406523, "acc_norm,none": 0.5155347540330611, "acc_norm_stderr,none": 0.004987372476207026}, "mmlu": {"acc,none": 0.2457627118644068, "acc_stderr,none": 0.003633610803859395, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.23910733262486716, "acc_stderr,none": 0.006221514432550623, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.03970158273235173}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24848484848484848, "acc_stderr,none": 0.033744026441394036}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.030964517926923393}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.21518987341772153, "acc_stderr,none": 0.026750826994676166}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2809917355371901, "acc_stderr,none": 0.04103203830514512}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.040191074725573483}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.03291099578615771}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.1994219653179191, "acc_stderr,none": 0.02151190065425255}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23016759776536314, "acc_stderr,none": 0.014078339253425812}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.26366559485530544, "acc_stderr,none": 0.02502553850053234}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.023788583551658537}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2438070404172099, "acc_stderr,none": 0.010966507972178475}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.23976608187134502, "acc_stderr,none": 0.032744852119469564}, "mmlu_other": {"acc,none": 0.24847119407788865, "acc_stderr,none": 0.007740202399339346, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.29056603773584905, "acc_stderr,none": 0.027943219989337145}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3179190751445087, "acc_stderr,none": 0.0355068398916558}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.29596412556053814, "acc_stderr,none": 0.030636591348699803}, "mmlu_management": {"alias": " - management", "acc,none": 0.20388349514563106, "acc_stderr,none": 0.03989139859531769}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.027236013946196697}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2388250319284802, "acc_stderr,none": 0.015246803197398686}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.023550831351995087}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.29432624113475175, "acc_stderr,none": 0.0271871270115038}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20220588235294118, "acc_stderr,none": 0.024398192986654924}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2289156626506024, "acc_stderr,none": 0.03270745277352477}, "mmlu_social_sciences": {"acc,none": 0.2551186220344491, "acc_stderr,none": 0.007863397314175954, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21929824561403508, "acc_stderr,none": 0.03892431106518753}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.23737373737373738, "acc_stderr,none": 0.0303137105381989}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.26424870466321243, "acc_stderr,none": 0.031821550509166484}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.25384615384615383, "acc_stderr,none": 0.022066054378726253}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3067226890756303, "acc_stderr,none": 0.02995382389188704}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.25688073394495414, "acc_stderr,none": 0.018732492928342472}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.25190839694656486, "acc_stderr,none": 0.03807387116306085}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2565359477124183, "acc_stderr,none": 0.017667841612379002}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2818181818181818, "acc_stderr,none": 0.04309118709946458}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.21224489795918366, "acc_stderr,none": 0.026176967197866767}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2736318407960199, "acc_stderr,none": 0.031524391865554016}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_stem": {"acc,none": 0.24389470345702505, "acc_stderr,none": 0.007656193450754733, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.19, "acc_stderr,none": 0.039427724440366234}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.03749850709174022}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03459777606810534}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.22916666666666666, "acc_stderr,none": 0.035146974678623884}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.04389869956808778}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2936170212765957, "acc_stderr,none": 0.02977164271249123}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.022182037202948368}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25161290322580643, "acc_stderr,none": 0.024685979286239956}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.24630541871921183, "acc_stderr,none": 0.030315099285617736}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.02646611753895991}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.26490066225165565, "acc_stderr,none": 0.036030385453603826}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.19444444444444445, "acc_stderr,none": 0.026991454502036726}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.26785714285714285, "acc_stderr,none": 0.04203277291467763}, "sciq": {"alias": "sciq", "acc,none": 0.907, "acc_stderr,none": 0.009188875634996659, "acc_norm,none": 0.867, "acc_norm_stderr,none": 0.010743669132397339}} |
| {"created_at": "2025-08-21T00:02:12.884058", "global_step": 32000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20393120393120392, "acc_stderr,none": 0.011535521334313655}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.40290778729336785, "acc_stderr,none": 0.0048948011198986134, "acc_norm,none": 0.5158334993029277, "acc_norm_stderr,none": 0.004987278910505111}, "mmlu": {"acc,none": 0.2575843896880786, "acc_stderr,none": 0.0036739080776908017, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.23953241232731137, "acc_stderr,none": 0.006217280061890756, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.36507936507936506, "acc_stderr,none": 0.04306241259127153}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2606060606060606, "acc_stderr,none": 0.034277431758165236}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.030778554678693237}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.21940928270042195, "acc_stderr,none": 0.026939106581553945}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2231404958677686, "acc_stderr,none": 0.03800754475228733}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.040191074725573483}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2085889570552147, "acc_stderr,none": 0.03192193448934724}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2023121387283237, "acc_stderr,none": 0.021628077380196144}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.25921787709497207, "acc_stderr,none": 0.014655780837497731}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.22186495176848875, "acc_stderr,none": 0.02359885829286305}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.21604938271604937, "acc_stderr,none": 0.022899162918445803}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24185136897001303, "acc_stderr,none": 0.010936550813827065}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.0312678171466318}, "mmlu_other": {"acc,none": 0.25587383327969104, "acc_stderr,none": 0.007751685911593961, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.26037735849056604, "acc_stderr,none": 0.027008766090708094}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3236994219653179, "acc_stderr,none": 0.03567603799639172}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036845}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.1210762331838565, "acc_stderr,none": 0.021894174113185758}, "mmlu_management": {"alias": " - management", "acc,none": 0.3106796116504854, "acc_stderr,none": 0.04582124160161551}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2606837606837607, "acc_stderr,none": 0.028760348956523414}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.21328224776500637, "acc_stderr,none": 0.01464817274959351}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.026090162504279053}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2801418439716312, "acc_stderr,none": 0.026789172351140245}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.4007352941176471, "acc_stderr,none": 0.029768263528933105}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.1927710843373494, "acc_stderr,none": 0.030709824050565274}, "mmlu_social_sciences": {"acc,none": 0.271043223919402, "acc_stderr,none": 0.007992616779626665, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.30808080808080807, "acc_stderr,none": 0.032894773300986155}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.3316062176165803, "acc_stderr,none": 0.03397636541089117}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.29743589743589743, "acc_stderr,none": 0.023177408131465942}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3319327731092437, "acc_stderr,none": 0.030588697013783663}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.29541284403669726, "acc_stderr,none": 0.019560619182976}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.1984732824427481, "acc_stderr,none": 0.034981493854624714}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.22875816993464052, "acc_stderr,none": 0.01699272346546625}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.04265792110940588}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.21224489795918366, "acc_stderr,none": 0.026176967197866767}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.27860696517412936, "acc_stderr,none": 0.031700561834973086}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_stem": {"acc,none": 0.2730732635585157, "acc_stderr,none": 0.007903071188035326, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.17777777777777778, "acc_stderr,none": 0.0330278985990172}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3223684210526316, "acc_stderr,none": 0.03803510248351585}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.37254901960784315, "acc_stderr,none": 0.04810840148082637}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.19148936170212766, "acc_stderr,none": 0.025722149992637798}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.23448275862068965, "acc_stderr,none": 0.035306258743465914}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.022644212615525214}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3032258064516129, "acc_stderr,none": 0.026148685930671746}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.27586206896551724, "acc_stderr,none": 0.03144712581678242}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.026719240783712163}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2847682119205298, "acc_stderr,none": 0.03684881521389024}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.35185185185185186, "acc_stderr,none": 0.03256850570293648}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.042878587513404544}, "sciq": {"alias": "sciq", "acc,none": 0.9, "acc_stderr,none": 0.009491579957525038, "acc_norm,none": 0.867, "acc_norm_stderr,none": 0.010743669132397344}} |
| {"created_at": "2025-08-21T01:42:36.495971", "global_step": 34000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20638820638820637, "acc_stderr,none": 0.01158688187917783}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4069906393148775, "acc_stderr,none": 0.00490269076506642, "acc_norm,none": 0.5262895837482573, "acc_norm_stderr,none": 0.004982879340691411}, "mmlu": {"acc,none": 0.25345392394245836, "acc_stderr,none": 0.0036695223361350618, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2456960680127524, "acc_stderr,none": 0.006274560482384755, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.31746031746031744, "acc_stderr,none": 0.04163453031302859}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2787878787878788, "acc_stderr,none": 0.03501438706296781}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.02933116229425174}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2489451476793249, "acc_stderr,none": 0.028146970599422644}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.23140495867768596, "acc_stderr,none": 0.038498560987940904}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3148148148148148, "acc_stderr,none": 0.04489931073591311}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2331288343558282, "acc_stderr,none": 0.033220157957767414}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.29190751445086704, "acc_stderr,none": 0.02447699407624732}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2057877813504823, "acc_stderr,none": 0.02296133990676424}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2345679012345679, "acc_stderr,none": 0.023576881744005716}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24119947848761408, "acc_stderr,none": 0.010926496102034956}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.23976608187134502, "acc_stderr,none": 0.03274485211946957}, "mmlu_other": {"acc,none": 0.26166720308979724, "acc_stderr,none": 0.007873948058477003, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2830188679245283, "acc_stderr,none": 0.0277242364927009}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3352601156069364, "acc_stderr,none": 0.03599586301247077}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.22869955156950672, "acc_stderr,none": 0.028188240046929193}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.03760178006026621}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2692307692307692, "acc_stderr,none": 0.02905858830374884}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2388250319284802, "acc_stderr,none": 0.015246803197398684}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.25163398692810457, "acc_stderr,none": 0.024848018263875206}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.29432624113475175, "acc_stderr,none": 0.027187127011503796}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.25735294117647056, "acc_stderr,none": 0.026556519470041503}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.21686746987951808, "acc_stderr,none": 0.03208284450356365}, "mmlu_social_sciences": {"acc,none": 0.2515437114072148, "acc_stderr,none": 0.007823894961192457, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.04266339443159394}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.03191178226713547}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.29015544041450775, "acc_stderr,none": 0.03275264467791515}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.258974358974359, "acc_stderr,none": 0.022211106810061675}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.029597329730978093}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22201834862385322, "acc_stderr,none": 0.017818849564796627}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22137404580152673, "acc_stderr,none": 0.036412970813137296}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25163398692810457, "acc_stderr,none": 0.01755581809132227}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2636363636363636, "acc_stderr,none": 0.04220224692971987}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.22857142857142856, "acc_stderr,none": 0.026882144922307744}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.21890547263681592, "acc_stderr,none": 0.029239174636647}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_stem": {"acc,none": 0.2588011417697431, "acc_stderr,none": 0.007804379581037485, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.03749850709174022}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3092105263157895, "acc_stderr,none": 0.037610708698674805}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.03551446610810826}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.043364327079931785}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.20851063829787234, "acc_stderr,none": 0.02655698211783872}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.03806142687309993}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.02241804289111394}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.24516129032258063, "acc_stderr,none": 0.024472243840895514}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2561576354679803, "acc_stderr,none": 0.0307127300709826}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.02646611753895991}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.31788079470198677, "acc_stderr,none": 0.038020397601079024}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.0305467452649532}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.25, "acc_stderr,none": 0.04109974682633932}, "sciq": {"alias": "sciq", "acc,none": 0.889, "acc_stderr,none": 0.009938701010583726, "acc_norm,none": 0.862, "acc_norm_stderr,none": 0.010912152632504406}} |
| {"created_at": "2025-08-21T03:26:22.034469", "global_step": 36000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.011704202814200254}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.410973909579765, "acc_stderr,none": 0.004910049928688082, "acc_norm,none": 0.5272854013144792, "acc_norm_stderr,none": 0.004982346155911129}, "mmlu": {"acc,none": 0.23657598632673407, "acc_stderr,none": 0.003581717459817491, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2454835281615303, "acc_stderr,none": 0.006267161011262714, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.30952380952380953, "acc_stderr,none": 0.04134913018303316}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03225078108306289}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.27941176470588236, "acc_stderr,none": 0.031493281045079556}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.29535864978902954, "acc_stderr,none": 0.029696338713422882}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.043300437496507437}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.19631901840490798, "acc_stderr,none": 0.031207970394709215}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23910614525139665, "acc_stderr,none": 0.01426555419233115}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1832797427652733, "acc_stderr,none": 0.021974198848265823}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.023132376234543332}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24641460234680573, "acc_stderr,none": 0.011005971399927235}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3216374269005848, "acc_stderr,none": 0.03582529442573122}, "mmlu_other": {"acc,none": 0.24493080141615706, "acc_stderr,none": 0.0077040451816534, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22264150943396227, "acc_stderr,none": 0.025604233470899095}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2138728323699422, "acc_stderr,none": 0.03126511206173044}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.31390134529147984, "acc_stderr,none": 0.031146796482972465}, "mmlu_management": {"alias": " - management", "acc,none": 0.1650485436893204, "acc_stderr,none": 0.036756688322331886}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2948717948717949, "acc_stderr,none": 0.02987257770889117}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24776500638569604, "acc_stderr,none": 0.015438083080568973}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.23202614379084968, "acc_stderr,none": 0.024170840879341012}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2375886524822695, "acc_stderr,none": 0.025389512552729903}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1948529411764706, "acc_stderr,none": 0.024060599423487428}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.26506024096385544, "acc_stderr,none": 0.03436024037944966}, "mmlu_social_sciences": {"acc,none": 0.21871953201169972, "acc_stderr,none": 0.007451677647541652, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21929824561403508, "acc_stderr,none": 0.03892431106518753}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.20202020202020202, "acc_stderr,none": 0.028606204289229876}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20207253886010362, "acc_stderr,none": 0.02897908979429673}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2, "acc_stderr,none": 0.020280805062535726}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.026265024608275886}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1963302752293578, "acc_stderr,none": 0.017030719339154364}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.25190839694656486, "acc_stderr,none": 0.03807387116306085}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25326797385620914, "acc_stderr,none": 0.017593486895366835}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.20909090909090908, "acc_stderr,none": 0.03895091015724137}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.19183673469387755, "acc_stderr,none": 0.025206963154225423}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.030360490154014652}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_stem": {"acc,none": 0.232477006026007, "acc_stderr,none": 0.007514250088696691, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.03673731683969506}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.25, "acc_stderr,none": 0.03523807393012047}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.037455547914624576}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036844}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165044}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.16, "acc_stderr,none": 0.03684529491774709}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.04336432707993178}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2680851063829787, "acc_stderr,none": 0.028957342788342347}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.25517241379310346, "acc_stderr,none": 0.03632984052707842}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.23015873015873015, "acc_stderr,none": 0.02167921966369314}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.1774193548387097, "acc_stderr,none": 0.021732540689329265}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.270935960591133, "acc_stderr,none": 0.03127090713297697}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.21851851851851853, "acc_stderr,none": 0.025195752251823796}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.18543046357615894, "acc_stderr,none": 0.03173284384294286}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.026491914727355154}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.32142857142857145, "acc_stderr,none": 0.04432804055291518}, "sciq": {"alias": "sciq", "acc,none": 0.904, "acc_stderr,none": 0.00932045443478324, "acc_norm,none": 0.876, "acc_norm_stderr,none": 0.010427498872343965}} |
| {"created_at": "2025-08-21T05:06:21.985243", "global_step": 38000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.011704202814200248}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.41416052579167495, "acc_stderr,none": 0.004915697886906121, "acc_norm,none": 0.5281816371240788, "acc_norm_stderr,none": 0.004981849291299649}, "mmlu": {"acc,none": 0.23308645492095142, "acc_stderr,none": 0.0035603063507730803, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24675876726886292, "acc_stderr,none": 0.006278403385253687, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.04216370213557836}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.20606060606060606, "acc_stderr,none": 0.03158415324047709}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.030964517926923413}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2742616033755274, "acc_stderr,none": 0.02904133351059804}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094632}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100178}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24804469273743016, "acc_stderr,none": 0.014444157808261462}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.18971061093247588, "acc_stderr,none": 0.022268196258783218}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2191358024691358, "acc_stderr,none": 0.023016705640262196}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2457627118644068, "acc_stderr,none": 0.01099615663514269}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3391812865497076, "acc_stderr,none": 0.036310534964889056}, "mmlu_other": {"acc,none": 0.2375281622143547, "acc_stderr,none": 0.0076062893287456735, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.23018867924528302, "acc_stderr,none": 0.02590789712240817}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.1907514450867052, "acc_stderr,none": 0.029957851329869334}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.17, "acc_stderr,none": 0.0377525168068637}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.336322869955157, "acc_stderr,none": 0.031708824268455}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.0376017800602662}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2863247863247863, "acc_stderr,none": 0.029614323690456655}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.014866821664709593}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.21895424836601307, "acc_stderr,none": 0.02367908986180772}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24113475177304963, "acc_stderr,none": 0.025518731049537766}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1801470588235294, "acc_stderr,none": 0.02334516361654484}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2891566265060241, "acc_stderr,none": 0.03529486801511115}, "mmlu_social_sciences": {"acc,none": 0.21579460513487164, "acc_stderr,none": 0.007411070230756165, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21929824561403508, "acc_stderr,none": 0.03892431106518753}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.02655220782821529}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19689119170984457, "acc_stderr,none": 0.028697873971860677}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.19743589743589743, "acc_stderr,none": 0.020182646968674837}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1944954128440367, "acc_stderr,none": 0.01697028909045805}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.03768335959728743}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25326797385620914, "acc_stderr,none": 0.017593486895366835}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03955932861795833}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.18775510204081633, "acc_stderr,none": 0.025000256039546212}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24875621890547264, "acc_stderr,none": 0.030567675938916714}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_stem": {"acc,none": 0.22518236600063432, "acc_stderr,none": 0.007425759824296921, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.0335567721631314}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.20394736842105263, "acc_stderr,none": 0.0327900040631005}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.03586879280080341}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036845}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617746}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.28936170212765955, "acc_stderr,none": 0.02964400657700962}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.03664666337225256}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2275132275132275, "acc_stderr,none": 0.021591269407823778}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.18387096774193548, "acc_stderr,none": 0.022037217340267846}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.18719211822660098, "acc_stderr,none": 0.027444924966882618}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.22592592592592592, "acc_stderr,none": 0.02549753263960955}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23178807947019867, "acc_stderr,none": 0.03445406271987054}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1574074074074074, "acc_stderr,none": 0.02483717351824239}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.043642261558410445}, "sciq": {"alias": "sciq", "acc,none": 0.906, "acc_stderr,none": 0.009233052000787735, "acc_norm,none": 0.864, "acc_norm_stderr,none": 0.010845350230472988}} |
| {"created_at": "2025-08-21T06:43:57.982070", "global_step": 40000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.22358722358722358, "acc_stderr,none": 0.011928612008761169}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4153555068711412, "acc_stderr,none": 0.004917761181740169, "acc_norm,none": 0.537841067516431, "acc_norm_stderr,none": 0.004975470690867143}, "mmlu": {"acc,none": 0.24868252385700043, "acc_stderr,none": 0.003644582406913271, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24803400637619555, "acc_stderr,none": 0.006299353828785412, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.03670066451047181}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2606060606060606, "acc_stderr,none": 0.03427743175816524}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.20098039215686275, "acc_stderr,none": 0.028125972265654362}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.28270042194092826, "acc_stderr,none": 0.029312814153955924}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2975206611570248, "acc_stderr,none": 0.04173349148083499}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04330043749650743}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.25153374233128833, "acc_stderr,none": 0.034089978868575295}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24566473988439305, "acc_stderr,none": 0.02317629820399201}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2572347266881029, "acc_stderr,none": 0.024826171289250888}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2623456790123457, "acc_stderr,none": 0.024477222856135114}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24119947848761408, "acc_stderr,none": 0.010926496102034954}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.03377310252209194}, "mmlu_other": {"acc,none": 0.26810428065658193, "acc_stderr,none": 0.007935287959030706, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.25660377358490566, "acc_stderr,none": 0.02688064788905199}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.0309528902177499}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.34977578475336324, "acc_stderr,none": 0.03200736719484504}, "mmlu_management": {"alias": " - management", "acc,none": 0.27184466019417475, "acc_stderr,none": 0.044052680241409216}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.27350427350427353, "acc_stderr,none": 0.029202540153431183}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2784163473818646, "acc_stderr,none": 0.01602829518899247}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.21895424836601307, "acc_stderr,none": 0.02367908986180772}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.026684564340461004}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.21323529411764705, "acc_stderr,none": 0.024880971512294268}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3192771084337349, "acc_stderr,none": 0.036293353299478595}, "mmlu_social_sciences": {"acc,none": 0.23756906077348067, "acc_stderr,none": 0.007672525151729121, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2807017543859649, "acc_stderr,none": 0.04227054451232199}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.029857515673386396}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20725388601036268, "acc_stderr,none": 0.029252823291803627}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.22564102564102564, "acc_stderr,none": 0.021193632525148536}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.226890756302521, "acc_stderr,none": 0.027205371538279472}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23853211009174313, "acc_stderr,none": 0.018272575810231867}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.037683359597287434}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.017740899509177795}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3181818181818182, "acc_stderr,none": 0.04461272175910508}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.18775510204081633, "acc_stderr,none": 0.025000256039546205}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23880597014925373, "acc_stderr,none": 0.030147775935409214}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_stem": {"acc,none": 0.24135743736124327, "acc_stderr,none": 0.007608028050947345, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.03785714465066653}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19736842105263158, "acc_stderr,none": 0.03238981601699397}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.03476590104304134}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.17, "acc_stderr,none": 0.03775251680686371}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617746}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3276595744680851, "acc_stderr,none": 0.030683020843231004}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.23448275862068965, "acc_stderr,none": 0.035306258743465914}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.022569897074918428}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25483870967741934, "acc_stderr,none": 0.024790118459332208}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2660098522167488, "acc_stderr,none": 0.03108982600293753}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165065}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.026719240783712163}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.19205298013245034, "acc_stderr,none": 0.032162984205936156}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.16203703703703703, "acc_stderr,none": 0.025130453652268455}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.04287858751340456}, "sciq": {"alias": "sciq", "acc,none": 0.903, "acc_stderr,none": 0.009363689373248116, "acc_norm,none": 0.88, "acc_norm_stderr,none": 0.010281328012747417}} |
| {"created_at": "2025-08-21T08:27:29.177648", "global_step": 42000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20556920556920558, "acc_stderr,none": 0.011569834551534285}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.41196972714598684, "acc_stderr,none": 0.004911837730582201, "acc_norm,none": 0.5370444134634534, "acc_norm_stderr,none": 0.004976067726432566}, "mmlu": {"acc,none": 0.26278307933342826, "acc_stderr,none": 0.003691384394567519, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24017003188097769, "acc_stderr,none": 0.0062126533551485775, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.373015873015873, "acc_stderr,none": 0.04325506042017086}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.03317505930009181}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.030778554678693247}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.19831223628691982, "acc_stderr,none": 0.02595502084162112}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.14049586776859505, "acc_stderr,none": 0.0317223342600216}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.1574074074074074, "acc_stderr,none": 0.035207039905179635}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.021855255263421802}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.27262569832402234, "acc_stderr,none": 0.01489339173524962}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.24437299035369775, "acc_stderr,none": 0.024406162094668893}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.20679012345679013, "acc_stderr,none": 0.022535006705942818}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24641460234680573, "acc_stderr,none": 0.011005971399927244}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.21637426900584794, "acc_stderr,none": 0.03158149539338733}, "mmlu_other": {"acc,none": 0.25619568715803026, "acc_stderr,none": 0.007754372342609918, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2943396226415094, "acc_stderr,none": 0.028049186315695245}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.31213872832369943, "acc_stderr,none": 0.035331333893236574}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816508}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.13452914798206278, "acc_stderr,none": 0.022901183761575582}, "mmlu_management": {"alias": " - management", "acc,none": 0.33980582524271846, "acc_stderr,none": 0.046897659372781335}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2094017094017094, "acc_stderr,none": 0.026655699653922737}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.22094508301404853, "acc_stderr,none": 0.014836205167333567}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3006535947712418, "acc_stderr,none": 0.026256053835718964}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2730496453900709, "acc_stderr,none": 0.02657786094330785}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.4007352941176471, "acc_stderr,none": 0.029768263528933105}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.18072289156626506, "acc_stderr,none": 0.029955737855810138}, "mmlu_social_sciences": {"acc,none": 0.2921676958076048, "acc_stderr,none": 0.008156457530763817, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.040493392977481404}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.35858585858585856, "acc_stderr,none": 0.03416903640391521}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.3626943005181347, "acc_stderr,none": 0.034697137917043715}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3487179487179487, "acc_stderr,none": 0.02416278028401772}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.35714285714285715, "acc_stderr,none": 0.031124619309328177}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.29357798165137616, "acc_stderr,none": 0.01952515112263966}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.25190839694656486, "acc_stderr,none": 0.03807387116306086}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2173202614379085, "acc_stderr,none": 0.01668482092914862}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03955932861795833}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3183673469387755, "acc_stderr,none": 0.029822533793982062}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2736318407960199, "acc_stderr,none": 0.03152439186555402}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_stem": {"acc,none": 0.2743418966064066, "acc_stderr,none": 0.00791070698014802, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.22962962962962963, "acc_stderr,none": 0.03633384414073462}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.03583496176361063}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.043898699568087764}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2170212765957447, "acc_stderr,none": 0.026947483121496217}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.33793103448275863, "acc_stderr,none": 0.03941707632064889}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02256989707491841}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.29354838709677417, "acc_stderr,none": 0.0259060870213193}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.270935960591133, "acc_stderr,none": 0.031270907132976984}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653697}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2740740740740741, "acc_stderr,none": 0.027195934804085626}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.33112582781456956, "acc_stderr,none": 0.038425817186598696}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.41203703703703703, "acc_stderr,none": 0.03356787758160835}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.16964285714285715, "acc_stderr,none": 0.0356236785009539}, "sciq": {"alias": "sciq", "acc,none": 0.901, "acc_stderr,none": 0.009449248027662751, "acc_norm,none": 0.871, "acc_norm_stderr,none": 0.010605256784796577}} |
| {"created_at": "2025-08-21T10:18:06.771586", "global_step": 44000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19901719901719903, "acc_stderr,none": 0.011430809442838382}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4193387771360287, "acc_stderr,none": 0.004924424018073668, "acc_norm,none": 0.5481975702051384, "acc_norm_stderr,none": 0.004966544724452229}, "mmlu": {"acc,none": 0.2610739210938613, "acc_stderr,none": 0.0037020510609931845, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2507970244420829, "acc_stderr,none": 0.006321586527631169, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.23015873015873015, "acc_stderr,none": 0.03764950879790606}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2787878787878788, "acc_stderr,none": 0.035014387062967806}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.031321798030832904}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2109704641350211, "acc_stderr,none": 0.02655837250266192}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.21487603305785125, "acc_stderr,none": 0.03749492448709698}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.19444444444444445, "acc_stderr,none": 0.03826076324884865}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.26993865030674846, "acc_stderr,none": 0.034878251684978906}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2398843930635838, "acc_stderr,none": 0.022989592543123563}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23910614525139665, "acc_stderr,none": 0.014265554192331158}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2604501607717042, "acc_stderr,none": 0.024926723224845543}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25, "acc_stderr,none": 0.02409347123262133}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.26010430247718386, "acc_stderr,none": 0.011204382887823834}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.29239766081871343, "acc_stderr,none": 0.0348864771345792}, "mmlu_other": {"acc,none": 0.25040231734792406, "acc_stderr,none": 0.007749189486863715, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.16, "acc_stderr,none": 0.0368452949177471}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.26037735849056604, "acc_stderr,none": 0.027008766090708097}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2254335260115607, "acc_stderr,none": 0.03186209851641144}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.38, "acc_stderr,none": 0.048783173121456316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.336322869955157, "acc_stderr,none": 0.031708824268455}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.0376017800602662}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2948717948717949, "acc_stderr,none": 0.02987257770889117}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2388250319284802, "acc_stderr,none": 0.015246803197398677}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.25163398692810457, "acc_stderr,none": 0.024848018263875195}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.23049645390070922, "acc_stderr,none": 0.025123739226872395}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.21323529411764705, "acc_stderr,none": 0.024880971512294275}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.25301204819277107, "acc_stderr,none": 0.03384429155233135}, "mmlu_social_sciences": {"acc,none": 0.26974325641858954, "acc_stderr,none": 0.00798126099102955, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21929824561403508, "acc_stderr,none": 0.03892431106518754}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.24242424242424243, "acc_stderr,none": 0.03053289223393203}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.33678756476683935, "acc_stderr,none": 0.03410780251836184}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3, "acc_stderr,none": 0.023234581088428487}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.36134453781512604, "acc_stderr,none": 0.031204691225150006}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22935779816513763, "acc_stderr,none": 0.018025349724618684}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22900763358778625, "acc_stderr,none": 0.036853466317118506}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.26143790849673204, "acc_stderr,none": 0.017776947157528044}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.32727272727272727, "acc_stderr,none": 0.04494290866252088}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2530612244897959, "acc_stderr,none": 0.02783302387139968}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.25870646766169153, "acc_stderr,none": 0.030965903123573044}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_stem": {"acc,none": 0.278464954012052, "acc_stderr,none": 0.007992041584684032, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.035914440841969694}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3092105263157895, "acc_stderr,none": 0.037610708698674805}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.03586879280080341}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.34, "acc_stderr,none": 0.047609522856952344}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.040233822736177476}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542126}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3021276595744681, "acc_stderr,none": 0.030017554471880557}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2896551724137931, "acc_stderr,none": 0.037800192304380135}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2751322751322751, "acc_stderr,none": 0.023000086859068642}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2709677419354839, "acc_stderr,none": 0.025284416114900152}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.30049261083743845, "acc_stderr,none": 0.03225799476233483}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02671924078371216}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2781456953642384, "acc_stderr,none": 0.03658603262763744}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.03141554629402544}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.04287858751340455}, "sciq": {"alias": "sciq", "acc,none": 0.9, "acc_stderr,none": 0.009491579957525044, "acc_norm,none": 0.871, "acc_norm_stderr,none": 0.010605256784796568}} |
| {"created_at": "2025-08-21T11:48:18.581627", "global_step": 46000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.1981981981981982, "acc_stderr,none": 0.011413095456219316}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4193387771360287, "acc_stderr,none": 0.004924424018073665, "acc_norm,none": 0.5512846046604262, "acc_norm_stderr,none": 0.004963464657747239}, "mmlu": {"acc,none": 0.23137729668138443, "acc_stderr,none": 0.003552931767806941, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24250797024442083, "acc_stderr,none": 0.006243050378924168, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.30158730158730157, "acc_stderr,none": 0.04104947269903394}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.20606060606060606, "acc_stderr,none": 0.0315841532404771}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.030587591351604243}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2911392405063291, "acc_stderr,none": 0.029571601065753374}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2231404958677686, "acc_stderr,none": 0.03800754475228733}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094632}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2147239263803681, "acc_stderr,none": 0.03226219377286774}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2398843930635838, "acc_stderr,none": 0.022989592543123563}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1864951768488746, "acc_stderr,none": 0.022122439772480764}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2191358024691358, "acc_stderr,none": 0.023016705640262196}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2457627118644068, "acc_stderr,none": 0.01099615663514269}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.32748538011695905, "acc_stderr,none": 0.035993357714560276}, "mmlu_other": {"acc,none": 0.2417122626327647, "acc_stderr,none": 0.007665116369717572, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22641509433962265, "acc_stderr,none": 0.025757559893106748}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2023121387283237, "acc_stderr,none": 0.03063114553919882}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.21, "acc_stderr,none": 0.04093601807403325}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.31390134529147984, "acc_stderr,none": 0.031146796482972465}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.03760178006026621}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674054}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24265644955300128, "acc_stderr,none": 0.015329888940899865}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.02380518652488814}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.22695035460992907, "acc_stderr,none": 0.024987106365642973}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.18382352941176472, "acc_stderr,none": 0.023529242185193106}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.2183945401364966, "acc_stderr,none": 0.007446351633183206, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748142}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.18686868686868688, "acc_stderr,none": 0.02777253333421898}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19170984455958548, "acc_stderr,none": 0.028408953626245282}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.21025641025641026, "acc_stderr,none": 0.020660597485026924}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1926605504587156, "acc_stderr,none": 0.016909276884936094}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.25190839694656486, "acc_stderr,none": 0.03807387116306086}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25, "acc_stderr,none": 0.01751781884501444}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.04069306319721376}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.1836734693877551, "acc_stderr,none": 0.024789071332007674}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.03036049015401466}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_stem": {"acc,none": 0.21725340945131622, "acc_stderr,none": 0.007328991054605092, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.0335567721631314}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.18421052631578946, "acc_stderr,none": 0.0315469804508223}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2638888888888889, "acc_stderr,none": 0.03685651095897532}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2680851063829787, "acc_stderr,none": 0.028957342788342347}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.23809523809523808, "acc_stderr,none": 0.021935878081184756}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.18064516129032257, "acc_stderr,none": 0.021886178567172548}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.14285714285714285, "acc_stderr,none": 0.024620785269489676}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.21481481481481482, "acc_stderr,none": 0.02504044387700069}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.1986754966887417, "acc_stderr,none": 0.032578473844367774}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.16203703703703703, "acc_stderr,none": 0.02513045365226846}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "sciq": {"alias": "sciq", "acc,none": 0.929, "acc_stderr,none": 0.008125578442487921, "acc_norm,none": 0.928, "acc_norm_stderr,none": 0.008178195576218681}} |
| {"created_at": "2025-08-21T13:57:56.172341", "global_step": 48000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20475020475020475, "acc_stderr,none": 0.011552714477876674}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4251145190201155, "acc_stderr,none": 0.004933500261683593, "acc_norm,none": 0.5571599283011353, "acc_norm_stderr,none": 0.004957068377516509}, "mmlu": {"acc,none": 0.24540663723116365, "acc_stderr,none": 0.0036272760473318707, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.23868225292242295, "acc_stderr,none": 0.0062148520199588, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.20634920634920634, "acc_stderr,none": 0.036196045241242515}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2606060606060606, "acc_stderr,none": 0.03427743175816524}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.19607843137254902, "acc_stderr,none": 0.027865942286639318}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2742616033755274, "acc_stderr,none": 0.029041333510598025}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.21487603305785125, "acc_stderr,none": 0.037494924487096994}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.03755265865037182}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.032910995786157686}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.23121387283236994, "acc_stderr,none": 0.022698657167855713}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23687150837988827, "acc_stderr,none": 0.014219570788103987}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.20257234726688103, "acc_stderr,none": 0.022827317491059675}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25308641975308643, "acc_stderr,none": 0.024191808600712992}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24771838331160365, "acc_stderr,none": 0.011025499291443737}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.28654970760233917, "acc_stderr,none": 0.03467826685703826}, "mmlu_other": {"acc,none": 0.2587705181847441, "acc_stderr,none": 0.007838200359711393, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621503}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.26037735849056604, "acc_stderr,none": 0.02700876609070808}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.030952890217749884}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.36771300448430494, "acc_stderr,none": 0.03236198350928276}, "mmlu_management": {"alias": " - management", "acc,none": 0.23300970873786409, "acc_stderr,none": 0.04185832598928315}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.25213675213675213, "acc_stderr,none": 0.02844796547623102}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542129}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2567049808429119, "acc_stderr,none": 0.015620480263064524}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.023805186524888146}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24468085106382978, "acc_stderr,none": 0.02564555362226674}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.024562204314142317}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2891566265060241, "acc_stderr,none": 0.03529486801511115}, "mmlu_social_sciences": {"acc,none": 0.23691907702307444, "acc_stderr,none": 0.007654705344668184, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.20175438596491227, "acc_stderr,none": 0.03775205013583637}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.19696969696969696, "acc_stderr,none": 0.02833560973246335}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19689119170984457, "acc_stderr,none": 0.028697873971860677}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.24102564102564103, "acc_stderr,none": 0.021685546665333184}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23109243697478993, "acc_stderr,none": 0.02738140692786897}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23853211009174313, "acc_stderr,none": 0.018272575810231863}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.183206106870229, "acc_stderr,none": 0.03392770926494733}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2581699346405229, "acc_stderr,none": 0.017704531653250078}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.36363636363636365, "acc_stderr,none": 0.04607582090719976}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.20408163265306123, "acc_stderr,none": 0.02580128347509051}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24875621890547264, "acc_stderr,none": 0.030567675938916707}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_stem": {"acc,none": 0.2505550269584523, "acc_stderr,none": 0.007713235231903223, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.037498507091740206}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.033176727875331574}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.22916666666666666, "acc_stderr,none": 0.035146974678623884}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.18627450980392157, "acc_stderr,none": 0.03873958714149354}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.30638297872340425, "acc_stderr,none": 0.030135906478517563}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.30344827586206896, "acc_stderr,none": 0.038312260488503336}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.022644212615525218}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.23548387096774193, "acc_stderr,none": 0.02413763242933771}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2561576354679803, "acc_stderr,none": 0.030712730070982592}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847415}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.02684205787383371}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2781456953642384, "acc_stderr,none": 0.03658603262763744}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.17592592592592593, "acc_stderr,none": 0.025967420958258533}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2767857142857143, "acc_stderr,none": 0.04246624336697625}, "sciq": {"alias": "sciq", "acc,none": 0.908, "acc_stderr,none": 0.009144376393151103, "acc_norm,none": 0.883, "acc_norm_stderr,none": 0.010169287802713329}} |
| {"created_at": "2025-08-21T16:10:43.788631", "global_step": 50000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2194922194922195, "acc_stderr,none": 0.011849997754533976}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.42471619199362676, "acc_stderr,none": 0.0049328964724605654, "acc_norm,none": 0.5557657837084247, "acc_norm_stderr,none": 0.004958649623815342}, "mmlu": {"acc,none": 0.23935336846603048, "acc_stderr,none": 0.0035954103860842217, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.23698193411264612, "acc_stderr,none": 0.006197623717545236, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.038522733649243156}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.22424242424242424, "acc_stderr,none": 0.03256866661681102}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.23039215686274508, "acc_stderr,none": 0.029554292605695053}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.27848101265822783, "acc_stderr,none": 0.029178682304842548}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.23140495867768596, "acc_stderr,none": 0.038498560987940904}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.03755265865037183}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2085889570552147, "acc_stderr,none": 0.031921934489347215}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.23410404624277456, "acc_stderr,none": 0.02279711027807112}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2324022346368715, "acc_stderr,none": 0.014125968754673384}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.20257234726688103, "acc_stderr,none": 0.022827317491059675}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.27469135802469136, "acc_stderr,none": 0.024836057868294674}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.23272490221642764, "acc_stderr,none": 0.01079259555388848}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.03565079670708311}, "mmlu_other": {"acc,none": 0.2584486643064049, "acc_stderr,none": 0.007842723382024432, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.23773584905660378, "acc_stderr,none": 0.02619980880756192}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2254335260115607, "acc_stderr,none": 0.03186209851641143}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252605}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3632286995515695, "acc_stderr,none": 0.03227790442850499}, "mmlu_management": {"alias": " - management", "acc,none": 0.18446601941747573, "acc_stderr,none": 0.03840423627288276}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2606837606837607, "acc_stderr,none": 0.028760348956523414}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2554278416347382, "acc_stderr,none": 0.015594955384455777}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.25163398692810457, "acc_stderr,none": 0.0248480182638752}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2553191489361702, "acc_stderr,none": 0.026011992930902006}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.02576725201085595}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.25301204819277107, "acc_stderr,none": 0.03384429155233136}, "mmlu_social_sciences": {"acc,none": 0.23366915827104323, "acc_stderr,none": 0.0076298343249588, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281335}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.17676767676767677, "acc_stderr,none": 0.027178752639044915}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.21761658031088082, "acc_stderr,none": 0.02977866303775296}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.24102564102564103, "acc_stderr,none": 0.02168554666533319}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.226890756302521, "acc_stderr,none": 0.02720537153827949}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.24770642201834864, "acc_stderr,none": 0.018508143602547815}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22900763358778625, "acc_stderr,none": 0.036853466317118506}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.017282760695167418}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3090909090909091, "acc_stderr,none": 0.044262946482000985}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.19591836734693877, "acc_stderr,none": 0.025409301953225678}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22885572139303484, "acc_stderr,none": 0.02970528405677244}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_stem": {"acc,none": 0.22962258166825245, "acc_stderr,none": 0.007462681703621236, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2074074074074074, "acc_stderr,none": 0.03502553170678317}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.17105263157894737, "acc_stderr,none": 0.030643607071677098}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2847222222222222, "acc_stderr,none": 0.03773809990686935}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.038612291966536934}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165044}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.18627450980392157, "acc_stderr,none": 0.038739587141493524}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.34893617021276596, "acc_stderr,none": 0.031158522131357773}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2689655172413793, "acc_stderr,none": 0.03695183311650232}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24867724867724866, "acc_stderr,none": 0.022261817692400168}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.20967741935483872, "acc_stderr,none": 0.023157879349083525}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.18226600985221675, "acc_stderr,none": 0.02716334085964515}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.22592592592592592, "acc_stderr,none": 0.02549753263960955}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.1986754966887417, "acc_stderr,none": 0.03257847384436777}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.02541642838876747}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.04287858751340455}, "sciq": {"alias": "sciq", "acc,none": 0.916, "acc_stderr,none": 0.008776162089491087, "acc_norm,none": 0.893, "acc_norm_stderr,none": 0.009779910359847169}} |
| {"created_at": "2025-08-21T17:13:18.618332", "global_step": 52000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19983619983619982, "acc_stderr,none": 0.011448447996728391}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4326827325234017, "acc_stderr,none": 0.004944351065545852, "acc_norm,none": 0.5656243776140211, "acc_norm_stderr,none": 0.004946617138983516}, "mmlu": {"acc,none": 0.26349522859991453, "acc_stderr,none": 0.0037156094937000083, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24760892667375134, "acc_stderr,none": 0.006294173089143299, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.03852273364924319}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.20606060606060606, "acc_stderr,none": 0.0315841532404771}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.030587591351604243}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.22362869198312235, "acc_stderr,none": 0.02712329820522997}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2066115702479339, "acc_stderr,none": 0.03695980128098826}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3148148148148148, "acc_stderr,none": 0.04489931073591311}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.25766871165644173, "acc_stderr,none": 0.03436150827846917}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2832369942196532, "acc_stderr,none": 0.024257901705323378}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2346368715083799, "acc_stderr,none": 0.014173044098303679}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2604501607717042, "acc_stderr,none": 0.02492672322484554}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.02346842983245115}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2542372881355932, "acc_stderr,none": 0.01112112900784068}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.23391812865497075, "acc_stderr,none": 0.03246721765117825}, "mmlu_other": {"acc,none": 0.2652075957515288, "acc_stderr,none": 0.007913318699139188, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3018867924528302, "acc_stderr,none": 0.028254200344438655}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.03295304696818318}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.21, "acc_stderr,none": 0.04093601807403326}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2062780269058296, "acc_stderr,none": 0.027157150479563824}, "mmlu_management": {"alias": " - management", "acc,none": 0.3592233009708738, "acc_stderr,none": 0.047504583990416925}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.02934311479809447}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2681992337164751, "acc_stderr,none": 0.015842430835269435}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.28104575163398693, "acc_stderr,none": 0.025738854797818716}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24113475177304963, "acc_stderr,none": 0.02551873104953776}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.23161764705882354, "acc_stderr,none": 0.025626533803777562}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3132530120481928, "acc_stderr,none": 0.036108050180310235}, "mmlu_social_sciences": {"acc,none": 0.27949301267468313, "acc_stderr,none": 0.008076756931941603, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281337}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.03191178226713546}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.2694300518134715, "acc_stderr,none": 0.032018671228777947}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3435897435897436, "acc_stderr,none": 0.024078696580635477}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.31092436974789917, "acc_stderr,none": 0.03006676158297793}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.25688073394495414, "acc_stderr,none": 0.01873249292834246}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2824427480916031, "acc_stderr,none": 0.03948406125768361}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25326797385620914, "acc_stderr,none": 0.017593486895366835}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.39090909090909093, "acc_stderr,none": 0.04673752333670238}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2612244897959184, "acc_stderr,none": 0.028123429335142783}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24875621890547264, "acc_stderr,none": 0.030567675938916714}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_stem": {"acc,none": 0.26990168093878847, "acc_stderr,none": 0.007917047587163039, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.22962962962962963, "acc_stderr,none": 0.03633384414073465}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.24342105263157895, "acc_stderr,none": 0.034923496688842384}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.03716177437566018}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.04220773659171452}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2978723404255319, "acc_stderr,none": 0.02989614568209546}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.22758620689655173, "acc_stderr,none": 0.03493950380131184}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.023517294335963283}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2903225806451613, "acc_stderr,none": 0.02582210611941589}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.03178529710642749}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720683}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.026067159222275788}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.25165562913907286, "acc_stderr,none": 0.03543304234389985}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2916666666666667, "acc_stderr,none": 0.030998666304560538}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.29464285714285715, "acc_stderr,none": 0.043270409325787296}, "sciq": {"alias": "sciq", "acc,none": 0.916, "acc_stderr,none": 0.008776162089491094, "acc_norm,none": 0.872, "acc_norm_stderr,none": 0.010570133761108658}} |
| {"created_at": "2025-08-21T18:36:38.301951", "global_step": 54000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20802620802620803, "acc_stderr,none": 0.011620759575652366}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.43198566022704643, "acc_stderr,none": 0.004943400892881062, "acc_norm,none": 0.5651264688309102, "acc_norm_stderr,none": 0.004947272454226213}, "mmlu": {"acc,none": 0.2605042016806723, "acc_stderr,none": 0.0036961862246544007, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2405951115834219, "acc_stderr,none": 0.006233893297879633, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.038932596106046706}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.03477691162163659}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.030190282453501954}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2489451476793249, "acc_stderr,none": 0.028146970599422644}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.17355371900826447, "acc_stderr,none": 0.0345727283691767}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052192}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2392638036809816, "acc_stderr,none": 0.033519538795212696}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2514450867052023, "acc_stderr,none": 0.023357365785874037}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.22346368715083798, "acc_stderr,none": 0.013932068638579754}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.21864951768488747, "acc_stderr,none": 0.023475581417861113}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2345679012345679, "acc_stderr,none": 0.023576881744005723}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2542372881355932, "acc_stderr,none": 0.011121129007840682}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.03126781714663179}, "mmlu_other": {"acc,none": 0.2716446733183135, "acc_stderr,none": 0.007944945863859913, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.27547169811320754, "acc_stderr,none": 0.02749566368372406}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2774566473988439, "acc_stderr,none": 0.034140140070440354}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.19, "acc_stderr,none": 0.039427724440366234}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.21524663677130046, "acc_stderr,none": 0.02758406660220826}, "mmlu_management": {"alias": " - management", "acc,none": 0.4077669902912621, "acc_stderr,none": 0.04865777570410769}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.26495726495726496, "acc_stderr,none": 0.02891120880274948}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2388250319284802, "acc_stderr,none": 0.015246803197398696}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3235294117647059, "acc_stderr,none": 0.02678745311190654}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.22340425531914893, "acc_stderr,none": 0.02484792135806396}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3602941176470588, "acc_stderr,none": 0.029163128570670736}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370518}, "mmlu_social_sciences": {"acc,none": 0.27949301267468313, "acc_stderr,none": 0.008074332142559981, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.20175438596491227, "acc_stderr,none": 0.037752050135836386}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2676767676767677, "acc_stderr,none": 0.03154449888270286}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.33678756476683935, "acc_stderr,none": 0.03410780251836184}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3282051282051282, "acc_stderr,none": 0.023807633198657266}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.33613445378151263, "acc_stderr,none": 0.030684737115135356}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.28623853211009176, "acc_stderr,none": 0.019379436628919965}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22137404580152673, "acc_stderr,none": 0.0364129708131373}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.27941176470588236, "acc_stderr,none": 0.01815287105153881}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.041723430387053825}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.23673469387755103, "acc_stderr,none": 0.02721283588407316}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22885572139303484, "acc_stderr,none": 0.02970528405677244}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_stem": {"acc,none": 0.26070409134157946, "acc_stderr,none": 0.00781378619091335, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.1925925925925926, "acc_stderr,none": 0.03406542058502652}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3092105263157895, "acc_stderr,none": 0.037610708698674805}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.03586879280080341}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.04440521906179327}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2978723404255319, "acc_stderr,none": 0.02989614568209546}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.19310344827586207, "acc_stderr,none": 0.032894455221274}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25132275132275134, "acc_stderr,none": 0.022340482339643895}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.26129032258064516, "acc_stderr,none": 0.024993053397764815}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.20689655172413793, "acc_stderr,none": 0.028501378167893953}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.026962424325073824}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.33774834437086093, "acc_stderr,none": 0.038615575462551684}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.30092592592592593, "acc_stderr,none": 0.03128039084329882}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.23214285714285715, "acc_stderr,none": 0.04007341809755805}, "sciq": {"alias": "sciq", "acc,none": 0.907, "acc_stderr,none": 0.009188875634996655, "acc_norm,none": 0.873, "acc_norm_stderr,none": 0.010534798620855752}} |
| {"created_at": "2025-08-21T20:36:25.066311", "global_step": 56000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20147420147420148, "acc_stderr,none": 0.011483500195202903}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4366660027882892, "acc_stderr,none": 0.004949589567678893, "acc_norm,none": 0.5701055566620196, "acc_norm_stderr,none": 0.004940490508240659}, "mmlu": {"acc,none": 0.2461187864976499, "acc_stderr,none": 0.0036313727844116952, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.23974495217853348, "acc_stderr,none": 0.0062246569234754925, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04006168083848877}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2, "acc_stderr,none": 0.031234752377721164}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25, "acc_stderr,none": 0.03039153369274154}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.24472573839662448, "acc_stderr,none": 0.027985699387036413}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.23140495867768596, "acc_stderr,none": 0.038498560987940904}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.03291099578615768}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2398843930635838, "acc_stderr,none": 0.022989592543123567}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1832797427652733, "acc_stderr,none": 0.021974198848265823}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24691358024691357, "acc_stderr,none": 0.023993501709042114}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24315514993481094, "acc_stderr,none": 0.010956556654417351}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.03508771929824565}, "mmlu_other": {"acc,none": 0.26746057289990344, "acc_stderr,none": 0.007945005218664113, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.24150943396226415, "acc_stderr,none": 0.026341480371118362}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2138728323699422, "acc_stderr,none": 0.03126511206173043}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3094170403587444, "acc_stderr,none": 0.031024411740572203}, "mmlu_management": {"alias": " - management", "acc,none": 0.1941747572815534, "acc_stderr,none": 0.03916667762822584}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.28205128205128205, "acc_stderr,none": 0.02948036054954119}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2707535121328225, "acc_stderr,none": 0.015889888362560486}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.28431372549019607, "acc_stderr,none": 0.025829163272757482}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.26595744680851063, "acc_stderr,none": 0.026358065698880592}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.25735294117647056, "acc_stderr,none": 0.026556519470041506}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2710843373493976, "acc_stderr,none": 0.03460579907553027}, "mmlu_social_sciences": {"acc,none": 0.22911927201819954, "acc_stderr,none": 0.00756809068068026, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2719298245614035, "acc_stderr,none": 0.04185774424022057}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.17676767676767677, "acc_stderr,none": 0.027178752639044915}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.17616580310880828, "acc_stderr,none": 0.027493504244548047}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.23846153846153847, "acc_stderr,none": 0.02160629449464773}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.22268907563025211, "acc_stderr,none": 0.027025433498882378}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.20550458715596331, "acc_stderr,none": 0.017324352325016012}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2824427480916031, "acc_stderr,none": 0.03948406125768361}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.26143790849673204, "acc_stderr,none": 0.017776947157528044}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.040693063197213754}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.19183673469387755, "acc_stderr,none": 0.025206963154225423}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.03036049015401466}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_stem": {"acc,none": 0.25118934348239774, "acc_stderr,none": 0.0077091476504579535, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.21481481481481482, "acc_stderr,none": 0.03547854198560826}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.034597776068105345}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3125, "acc_stderr,none": 0.038760854559127644}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.17, "acc_stderr,none": 0.0377525168068637}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165044}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3148936170212766, "acc_stderr,none": 0.030363582197238174}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2482758620689655, "acc_stderr,none": 0.03600105692727773}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.022644212615525218}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2129032258064516, "acc_stderr,none": 0.02328766512726854}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.22167487684729065, "acc_stderr,none": 0.0292255758924896}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.02696242432507383}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2185430463576159, "acc_stderr,none": 0.03374235550425694}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.026491914727355178}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.33035714285714285, "acc_stderr,none": 0.04464285714285712}, "sciq": {"alias": "sciq", "acc,none": 0.908, "acc_stderr,none": 0.009144376393151079, "acc_norm,none": 0.878, "acc_norm_stderr,none": 0.010354864712936692}} |
| {"created_at": "2025-08-21T23:18:56.062673", "global_step": 58000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21867321867321868, "acc_stderr,none": 0.011834072858346449}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.433877713602868, "acc_stderr,none": 0.004945956744943812, "acc_norm,none": 0.5692093208524198, "acc_norm_stderr,none": 0.004941748817682296}, "mmlu": {"acc,none": 0.2446944879646774, "acc_stderr,none": 0.0036255673640865718, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24442082890541977, "acc_stderr,none": 0.006265919720102512, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.03893259610604672}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2, "acc_stderr,none": 0.03123475237772118}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.031145570659486782}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.27848101265822783, "acc_stderr,none": 0.02917868230484255}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2231404958677686, "acc_stderr,none": 0.03800754475228733}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.04373313040914761}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.24539877300613497, "acc_stderr,none": 0.03380939813943354}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2658959537572254, "acc_stderr,none": 0.02378620325550828}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574877}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.21543408360128619, "acc_stderr,none": 0.02335022547547142}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.02492200116888633}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.23272490221642764, "acc_stderr,none": 0.010792595553888482}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.27485380116959063, "acc_stderr,none": 0.03424042924691584}, "mmlu_other": {"acc,none": 0.2565175410363695, "acc_stderr,none": 0.007824468584516672, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2188679245283019, "acc_stderr,none": 0.025447863825108614}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.0326926380614177}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768077}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3452914798206278, "acc_stderr,none": 0.03191100192835794}, "mmlu_management": {"alias": " - management", "acc,none": 0.21359223300970873, "acc_stderr,none": 0.040580420156460344}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674057}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.25798212005108556, "acc_stderr,none": 0.01564583018834895}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.24183006535947713, "acc_stderr,none": 0.024518195641879334}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24468085106382978, "acc_stderr,none": 0.02564555362226673}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.19852941176470587, "acc_stderr,none": 0.024231013370541104}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.27710843373493976, "acc_stderr,none": 0.03484331592680588}, "mmlu_social_sciences": {"acc,none": 0.2317192070198245, "acc_stderr,none": 0.007601210511645517, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2543859649122807, "acc_stderr,none": 0.040969851398436716}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.02962022787479049}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.18134715025906736, "acc_stderr,none": 0.02780703236068609}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2230769230769231, "acc_stderr,none": 0.02110773012724399}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.24403669724770644, "acc_stderr,none": 0.01841528635141641}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.1984732824427481, "acc_stderr,none": 0.03498149385462472}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2434640522875817, "acc_stderr,none": 0.017362473762146634}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.32727272727272727, "acc_stderr,none": 0.04494290866252089}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.20816326530612245, "acc_stderr,none": 0.025991117672813296}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.208955223880597, "acc_stderr,none": 0.028748298931728655}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_stem": {"acc,none": 0.24611481129083412, "acc_stderr,none": 0.007668894479040262, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.22962962962962963, "acc_stderr,none": 0.03633384414073465}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.23026315789473684, "acc_stderr,none": 0.03426059424403165}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.0358687928008034}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653697}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.1568627450980392, "acc_stderr,none": 0.036186648199362466}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.28085106382978725, "acc_stderr,none": 0.02937917046412482}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.25517241379310346, "acc_stderr,none": 0.03632984052707842}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.23544973544973544, "acc_stderr,none": 0.021851509822031722}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.26129032258064516, "acc_stderr,none": 0.024993053397764815}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2512315270935961, "acc_stderr,none": 0.030516530732694436}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2851851851851852, "acc_stderr,none": 0.027528599210340492}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.25165562913907286, "acc_stderr,none": 0.035433042343899844}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.026491914727355147}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.23214285714285715, "acc_stderr,none": 0.04007341809755806}, "sciq": {"alias": "sciq", "acc,none": 0.913, "acc_stderr,none": 0.008916866630745889, "acc_norm,none": 0.887, "acc_norm_stderr,none": 0.010016552866696837}} |
| {"created_at": "2025-08-21T23:31:47.645225", "global_step": 60000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20393120393120392, "acc_stderr,none": 0.011535521334313655}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.44015136427006574, "acc_stderr,none": 0.0049539070620965965, "acc_norm,none": 0.5773750248954391, "acc_norm_stderr,none": 0.00492967277718432}, "mmlu": {"acc,none": 0.2497507477567298, "acc_stderr,none": 0.003652420943946812, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24463336875664188, "acc_stderr,none": 0.006266013213229097, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.3492063492063492, "acc_stderr,none": 0.04263906892795131}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23030303030303031, "acc_stderr,none": 0.0328766675860349}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.030587591351604246}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2109704641350211, "acc_stderr,none": 0.02655837250266192}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.23140495867768596, "acc_stderr,none": 0.0384985609879409}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.041331194402438376}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.20245398773006135, "acc_stderr,none": 0.031570650789119005}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2514450867052023, "acc_stderr,none": 0.023357365785874037}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23910614525139665, "acc_stderr,none": 0.014265554192331158}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.22508038585209003, "acc_stderr,none": 0.02372008851617903}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25, "acc_stderr,none": 0.02409347123262133}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2561929595827901, "acc_stderr,none": 0.011149173153110582}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2046783625730994, "acc_stderr,none": 0.03094445977853321}, "mmlu_other": {"acc,none": 0.24750563244287094, "acc_stderr,none": 0.007741657356073549, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2339622641509434, "acc_stderr,none": 0.02605529690115292}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2543352601156069, "acc_stderr,none": 0.0332055644308557}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.24663677130044842, "acc_stderr,none": 0.028930413120910888}, "mmlu_management": {"alias": " - management", "acc,none": 0.3300970873786408, "acc_stderr,none": 0.046561471100123514}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.23931623931623933, "acc_stderr,none": 0.027951826808924333}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23371647509578544, "acc_stderr,none": 0.015133383278988836}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.26143790849673204, "acc_stderr,none": 0.025160998214292456}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.22695035460992907, "acc_stderr,none": 0.024987106365642976}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.25735294117647056, "acc_stderr,none": 0.02655651947004152}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.22289156626506024, "acc_stderr,none": 0.03240004825594689}, "mmlu_social_sciences": {"acc,none": 0.24894377640558987, "acc_stderr,none": 0.007790850637790509, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2543859649122807, "acc_stderr,none": 0.040969851398436716}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25252525252525254, "acc_stderr,none": 0.03095405547036592}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.2538860103626943, "acc_stderr,none": 0.03141024780565317}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.28974358974358977, "acc_stderr,none": 0.023000628243687968}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.24789915966386555, "acc_stderr,none": 0.028047967224176892}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.25137614678899084, "acc_stderr,none": 0.018599206360287415}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.1984732824427481, "acc_stderr,none": 0.03498149385462469}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25, "acc_stderr,none": 0.01751781884501444}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3090909090909091, "acc_stderr,none": 0.044262946482000985}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2, "acc_stderr,none": 0.025607375986579153}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.1890547263681592, "acc_stderr,none": 0.02768691358801301}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_stem": {"acc,none": 0.2603869330796067, "acc_stderr,none": 0.007820582057668956, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.03673731683969506}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.25, "acc_stderr,none": 0.03523807393012047}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.22916666666666666, "acc_stderr,none": 0.035146974678623884}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617747}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036845}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.28085106382978725, "acc_stderr,none": 0.029379170464124818}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.25517241379310346, "acc_stderr,none": 0.03632984052707842}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.022644212615525214}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25806451612903225, "acc_stderr,none": 0.024892469172462833}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.28078817733990147, "acc_stderr,none": 0.03161856335358611}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.02646611753895991}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2781456953642384, "acc_stderr,none": 0.03658603262763744}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.33796296296296297, "acc_stderr,none": 0.03225941352631295}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.23214285714285715, "acc_stderr,none": 0.04007341809755806}, "sciq": {"alias": "sciq", "acc,none": 0.917, "acc_stderr,none": 0.008728527206074787, "acc_norm,none": 0.877, "acc_norm_stderr,none": 0.01039129342184988}} |
| {"created_at": "2025-08-22T02:48:35.928754", "global_step": 62000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20147420147420148, "acc_stderr,none": 0.011483500195202905}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4425413264289982, "acc_stderr,none": 0.0049567243926465394, "acc_norm,none": 0.5825532762397929, "acc_norm_stderr,none": 0.004921300331285561}, "mmlu": {"acc,none": 0.24854009400370317, "acc_stderr,none": 0.0036467746547619146, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2461211477151966, "acc_stderr,none": 0.006281600758809805, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.039701582732351734}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.03453131801885417}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.030778554678693275}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.24050632911392406, "acc_stderr,none": 0.027820781981149678}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2644628099173554, "acc_stderr,none": 0.040261875275912046}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2392638036809816, "acc_stderr,none": 0.03351953879521269}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.26011560693641617, "acc_stderr,none": 0.023618678310069367}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24022346368715083, "acc_stderr,none": 0.01428834380392531}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1832797427652733, "acc_stderr,none": 0.021974198848265823}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25308641975308643, "acc_stderr,none": 0.024191808600713}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2457627118644068, "acc_stderr,none": 0.01099615663514269}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.28654970760233917, "acc_stderr,none": 0.03467826685703826}, "mmlu_other": {"acc,none": 0.2581268104280657, "acc_stderr,none": 0.007841095535975376, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2188679245283019, "acc_stderr,none": 0.0254478638251086}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.1791907514450867, "acc_stderr,none": 0.029242513059063287}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.36, "acc_stderr,none": 0.048241815132442176}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.273542600896861, "acc_stderr,none": 0.02991858670779882}, "mmlu_management": {"alias": " - management", "acc,none": 0.2912621359223301, "acc_stderr,none": 0.044986763205729224}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.26495726495726496, "acc_stderr,none": 0.02891120880274948}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2681992337164751, "acc_stderr,none": 0.015842430835269476}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.02564686309713789}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.23404255319148937, "acc_stderr,none": 0.025257861359432428}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.26838235294117646, "acc_stderr,none": 0.02691748122437723}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.19879518072289157, "acc_stderr,none": 0.031069390260789413}, "mmlu_social_sciences": {"acc,none": 0.23431914202144946, "acc_stderr,none": 0.007642476459283292, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.029620227874790458}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.23316062176165803, "acc_stderr,none": 0.030516111371476008}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2076923076923077, "acc_stderr,none": 0.020567539567246797}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.026653531596715477}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.24954128440366974, "acc_stderr,none": 0.018553897629501617}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25326797385620914, "acc_stderr,none": 0.017593486895366835}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.20909090909090908, "acc_stderr,none": 0.03895091015724137}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2163265306122449, "acc_stderr,none": 0.026358916334904062}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22388059701492538, "acc_stderr,none": 0.02947525023601718}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_stem": {"acc,none": 0.256581033935934, "acc_stderr,none": 0.007783546350909512, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2, "acc_stderr,none": 0.034554737023254366}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.27631578947368424, "acc_stderr,none": 0.03639057569952925}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2986111111111111, "acc_stderr,none": 0.03827052357950756}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.19, "acc_stderr,none": 0.039427724440366234}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.04533838195929775}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2553191489361702, "acc_stderr,none": 0.028504856470514196}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2482758620689655, "acc_stderr,none": 0.036001056927277716}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.23544973544973544, "acc_stderr,none": 0.021851509822031715}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.22903225806451613, "acc_stderr,none": 0.02390491431178265}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2512315270935961, "acc_stderr,none": 0.030516530732694436}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2740740740740741, "acc_stderr,none": 0.027195934804085626}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.304635761589404, "acc_stderr,none": 0.03757949922943342}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2638888888888889, "acc_stderr,none": 0.030058202704309846}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.04364226155841044}, "sciq": {"alias": "sciq", "acc,none": 0.919, "acc_stderr,none": 0.008632121032139957, "acc_norm,none": 0.884, "acc_norm_stderr,none": 0.010131468138756986}} |
| {"created_at": "2025-08-22T03:16:50.347234", "global_step": 64000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20884520884520885, "acc_stderr,none": 0.011637590576063063}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4478191595299741, "acc_stderr,none": 0.0049625342647519255, "acc_norm,none": 0.585839474208325, "acc_norm_stderr,none": 0.004915697886906118}, "mmlu": {"acc,none": 0.24633243127759577, "acc_stderr,none": 0.0036325203859514307, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24484590860786398, "acc_stderr,none": 0.006273264127727802, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.23015873015873015, "acc_stderr,none": 0.03764950879790608}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.0340150671524904}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.02998373305591361}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2616033755274262, "acc_stderr,none": 0.028609516716994934}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2809917355371901, "acc_stderr,none": 0.04103203830514512}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.042365112580946315}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.1901840490797546, "acc_stderr,none": 0.030833491146281252}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2543352601156069, "acc_stderr,none": 0.02344582627654554}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.22508038585209003, "acc_stderr,none": 0.02372008851617903}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2654320987654321, "acc_stderr,none": 0.024569223600460842}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24641460234680573, "acc_stderr,none": 0.011005971399927227}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.03188578017686398}, "mmlu_other": {"acc,none": 0.26198905696813646, "acc_stderr,none": 0.007867187829785098, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2641509433962264, "acc_stderr,none": 0.0271342916287417}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.0309528902177499}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.37668161434977576, "acc_stderr,none": 0.03252113489929187}, "mmlu_management": {"alias": " - management", "acc,none": 0.2524271844660194, "acc_stderr,none": 0.04301250399690878}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2564102564102564, "acc_stderr,none": 0.028605953702004243}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2656449553001277, "acc_stderr,none": 0.01579430248788871}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.024288619466046105}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.25177304964539005, "acc_stderr,none": 0.025892151156709405}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.19117647058823528, "acc_stderr,none": 0.023886881922440355}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3132530120481928, "acc_stderr,none": 0.036108050180310235}, "mmlu_social_sciences": {"acc,none": 0.2382190445238869, "acc_stderr,none": 0.00768190266471685, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.04266339443159394}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.02962022787479049}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22279792746113988, "acc_stderr,none": 0.030031147977641545}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.21794871794871795, "acc_stderr,none": 0.020932445774463185}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23949579831932774, "acc_stderr,none": 0.027722065493361266}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.24403669724770644, "acc_stderr,none": 0.0184152863514164}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.20610687022900764, "acc_stderr,none": 0.035477710041594626}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2434640522875817, "acc_stderr,none": 0.01736247376214662}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.33636363636363636, "acc_stderr,none": 0.04525393596302505}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.22857142857142856, "acc_stderr,none": 0.026882144922307748}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23383084577114427, "acc_stderr,none": 0.029929415408348373}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_stem": {"acc,none": 0.24104027909927053, "acc_stderr,none": 0.007601314835988492, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.03712537833614866}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.18421052631578946, "acc_stderr,none": 0.031546980450822305}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.22916666666666666, "acc_stderr,none": 0.035146974678623884}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.15, "acc_stderr,none": 0.03588702812826371}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.19607843137254902, "acc_stderr,none": 0.03950581861179964}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.32340425531914896, "acc_stderr,none": 0.030579442773610334}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.22758620689655173, "acc_stderr,none": 0.03493950380131184}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2804232804232804, "acc_stderr,none": 0.023135287974325618}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.24838709677419354, "acc_stderr,none": 0.024580028921481006}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.270935960591133, "acc_stderr,none": 0.031270907132976984}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.026466117538959916}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.1986754966887417, "acc_stderr,none": 0.032578473844367774}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.17592592592592593, "acc_stderr,none": 0.02596742095825853}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.26785714285714285, "acc_stderr,none": 0.04203277291467763}, "sciq": {"alias": "sciq", "acc,none": 0.916, "acc_stderr,none": 0.008776162089491087, "acc_norm,none": 0.878, "acc_norm_stderr,none": 0.010354864712936694}} |
| {"created_at": "2025-08-22T05:16:27.324402", "global_step": 66000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20802620802620803, "acc_stderr,none": 0.011620759575652367}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4495120493925513, "acc_stderr,none": 0.004964277999318814, "acc_norm,none": 0.5904202350129456, "acc_norm_stderr,none": 0.0049075121031283446}, "mmlu": {"acc,none": 0.25487822247543085, "acc_stderr,none": 0.0036690770297240095, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.23336875664187035, "acc_stderr,none": 0.0061630837220214485, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.3412698412698413, "acc_stderr,none": 0.04240799327574925}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2606060606060606, "acc_stderr,none": 0.034277431758165236}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.02998373305591361}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.22362869198312235, "acc_stderr,none": 0.02712329820522997}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.23140495867768596, "acc_stderr,none": 0.03849856098794091}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094632}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.03259177392742178}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.1994219653179191, "acc_stderr,none": 0.021511900654252545}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23575418994413408, "acc_stderr,none": 0.014196375686290804}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.19292604501607716, "acc_stderr,none": 0.022411516780911363}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.19753086419753085, "acc_stderr,none": 0.022152889927898947}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24119947848761408, "acc_stderr,none": 0.01092649610203496}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2573099415204678, "acc_stderr,none": 0.03352799844161865}, "mmlu_other": {"acc,none": 0.26070164145477953, "acc_stderr,none": 0.007850822543945873, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2981132075471698, "acc_stderr,none": 0.028152837942493857}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.30057803468208094, "acc_stderr,none": 0.03496101481191179}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.19730941704035873, "acc_stderr,none": 0.02670985334496796}, "mmlu_management": {"alias": " - management", "acc,none": 0.33980582524271846, "acc_stderr,none": 0.04689765937278134}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.26495726495726496, "acc_stderr,none": 0.028911208802749486}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23371647509578544, "acc_stderr,none": 0.015133383278988829}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.27124183006535946, "acc_stderr,none": 0.02545775669666788}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2375886524822695, "acc_stderr,none": 0.025389512552729906}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3492647058823529, "acc_stderr,none": 0.02895975519682485}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.19879518072289157, "acc_stderr,none": 0.031069390260789424}, "mmlu_social_sciences": {"acc,none": 0.27786805329866754, "acc_stderr,none": 0.008051300341237933, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2543859649122807, "acc_stderr,none": 0.0409698513984367}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.031911782267135466}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.27979274611398963, "acc_stderr,none": 0.03239637046735703}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3564102564102564, "acc_stderr,none": 0.0242831405294673}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3403361344537815, "acc_stderr,none": 0.030778057422931673}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.27889908256880735, "acc_stderr,none": 0.01922746887646351}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.29770992366412213, "acc_stderr,none": 0.04010358942462203}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.23366013071895425, "acc_stderr,none": 0.017119158496044503}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.24545454545454545, "acc_stderr,none": 0.04122066502878285}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.20816326530612245, "acc_stderr,none": 0.025991117672813296}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.26865671641791045, "acc_stderr,none": 0.03134328358208954}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_stem": {"acc,none": 0.2588011417697431, "acc_stderr,none": 0.007800588574022826, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.19, "acc_stderr,none": 0.039427724440366234}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.037125378336148665}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.034597776068105365}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.03716177437566016}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768077}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.3137254901960784, "acc_stderr,none": 0.04617034827006717}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.20851063829787234, "acc_stderr,none": 0.026556982117838725}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.19310344827586207, "acc_stderr,none": 0.03289445522127401}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24867724867724866, "acc_stderr,none": 0.02226181769240015}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.27741935483870966, "acc_stderr,none": 0.025470196835900055}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.27586206896551724, "acc_stderr,none": 0.03144712581678242}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25555555555555554, "acc_stderr,none": 0.02659393910184406}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.31125827814569534, "acc_stderr,none": 0.03780445850526732}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.03141554629402544}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.25892857142857145, "acc_stderr,none": 0.041577515398656284}, "sciq": {"alias": "sciq", "acc,none": 0.918, "acc_stderr,none": 0.008680515615523693, "acc_norm,none": 0.878, "acc_norm_stderr,none": 0.01035486471293671}} |
| {"created_at": "2025-08-22T07:10:51.536912", "global_step": 68000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20884520884520885, "acc_stderr,none": 0.011637590576063056}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4510057757418841, "acc_stderr,none": 0.004965768348628047, "acc_norm,none": 0.5930093606851224, "acc_norm_stderr,none": 0.004902690765066423}, "mmlu": {"acc,none": 0.25630252100840334, "acc_stderr,none": 0.003682888407422673, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2499468650371945, "acc_stderr,none": 0.0063106301012892324, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.1984126984126984, "acc_stderr,none": 0.03567016675276862}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2787878787878788, "acc_stderr,none": 0.03501438706296781}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.030190282453501964}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.22362869198312235, "acc_stderr,none": 0.027123298205229972}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.33884297520661155, "acc_stderr,none": 0.0432076780753667}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04330043749650742}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.26993865030674846, "acc_stderr,none": 0.03487825168497892}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2398843930635838, "acc_stderr,none": 0.022989592543123567}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23687150837988827, "acc_stderr,none": 0.014219570788103986}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2090032154340836, "acc_stderr,none": 0.02309314039837422}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2839506172839506, "acc_stderr,none": 0.025089478523765137}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25358539765319427, "acc_stderr,none": 0.011111715336101136}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.26900584795321636, "acc_stderr,none": 0.03401052620104091}, "mmlu_other": {"acc,none": 0.26746057289990344, "acc_stderr,none": 0.007932725593530766, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2830188679245283, "acc_stderr,none": 0.027724236492700904}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.26011560693641617, "acc_stderr,none": 0.03345036916788991}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3004484304932735, "acc_stderr,none": 0.030769352008229143}, "mmlu_management": {"alias": " - management", "acc,none": 0.1941747572815534, "acc_stderr,none": 0.03916667762822584}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2863247863247863, "acc_stderr,none": 0.02961432369045665}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2886334610472541, "acc_stderr,none": 0.016203792703197797}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.023929155517351284}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2624113475177305, "acc_stderr,none": 0.026244920349843007}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20220588235294118, "acc_stderr,none": 0.024398192986654924}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2891566265060241, "acc_stderr,none": 0.035294868015111155}, "mmlu_social_sciences": {"acc,none": 0.2567435814104647, "acc_stderr,none": 0.007879313131592295, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2543859649122807, "acc_stderr,none": 0.04096985139843671}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.1919191919191919, "acc_stderr,none": 0.028057791672989017}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22797927461139897, "acc_stderr,none": 0.030276909945178263}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.25384615384615383, "acc_stderr,none": 0.022066054378726257}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.028657491285071952}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.25504587155963304, "acc_stderr,none": 0.01868850085653584}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.03727673575596918}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.01812022425148458}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.04265792110940589}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.24897959183673468, "acc_stderr,none": 0.02768297952296023}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.27860696517412936, "acc_stderr,none": 0.031700561834973086}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_stem": {"acc,none": 0.25436092610212496, "acc_stderr,none": 0.007753244325659363, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2074074074074074, "acc_stderr,none": 0.035025531706783165}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2565789473684211, "acc_stderr,none": 0.035541803680256896}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2013888888888889, "acc_stderr,none": 0.03353647469713839}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036622}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.3, "acc_stderr,none": 0.04605661864718381}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.042207736591714534}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3148936170212766, "acc_stderr,none": 0.030363582197238174}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135303}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.022860838309232072}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25483870967741934, "acc_stderr,none": 0.024790118459332204}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.20689655172413793, "acc_stderr,none": 0.02850137816789395}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.02620276653465215}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.25165562913907286, "acc_stderr,none": 0.035433042343899844}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.028765111718046937}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "sciq": {"alias": "sciq", "acc,none": 0.916, "acc_stderr,none": 0.00877616208949109, "acc_norm,none": 0.887, "acc_norm_stderr,none": 0.01001655286669684}} |
| {"created_at": "2025-08-22T10:58:58.462725", "global_step": 70000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20393120393120392, "acc_stderr,none": 0.011535521334313655}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4517028480382394, "acc_stderr,none": 0.004966448380104208, "acc_norm,none": 0.599681338378809, "acc_norm_stderr,none": 0.004889615413144193}, "mmlu": {"acc,none": 0.2594359777809429, "acc_stderr,none": 0.003684234168382486, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2359192348565356, "acc_stderr,none": 0.0061912243597933405, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.3253968253968254, "acc_stderr,none": 0.04190596438871136}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.03401506715249039}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.030778554678693247}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.19831223628691982, "acc_stderr,none": 0.025955020841621112}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.23140495867768596, "acc_stderr,none": 0.038498560987940904}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094632}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.24539877300613497, "acc_stderr,none": 0.03380939813943354}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.21965317919075145, "acc_stderr,none": 0.0222896388526179}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.22681564245810057, "acc_stderr,none": 0.014005843570897897}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.21543408360128619, "acc_stderr,none": 0.02335022547547143}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.23765432098765432, "acc_stderr,none": 0.02368359183700856}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2405475880052151, "acc_stderr,none": 0.010916406735478947}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.032180937956023566}, "mmlu_other": {"acc,none": 0.2645638879948503, "acc_stderr,none": 0.007851648167149233, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2981132075471698, "acc_stderr,none": 0.028152837942493864}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.28901734104046245, "acc_stderr,none": 0.034564257450869995}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.13452914798206278, "acc_stderr,none": 0.022901183761575575}, "mmlu_management": {"alias": " - management", "acc,none": 0.3786407766990291, "acc_stderr,none": 0.048026946982589726}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674043}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2388250319284802, "acc_stderr,none": 0.015246803197398684}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.27124183006535946, "acc_stderr,none": 0.02545775669666788}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.22340425531914893, "acc_stderr,none": 0.02484792135806396}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3897058823529412, "acc_stderr,none": 0.029624663581159696}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.24096385542168675, "acc_stderr,none": 0.033293941190735275}, "mmlu_social_sciences": {"acc,none": 0.28696782580435487, "acc_stderr,none": 0.008114158603925503, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.040493392977481404}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3181818181818182, "acc_stderr,none": 0.03318477333845331}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.32642487046632124, "acc_stderr,none": 0.033840286211432945}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3717948717948718, "acc_stderr,none": 0.024503472557110936}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.33613445378151263, "acc_stderr,none": 0.030684737115135377}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3100917431192661, "acc_stderr,none": 0.019830849684439752}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2824427480916031, "acc_stderr,none": 0.03948406125768361}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.016639319350313264}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2, "acc_stderr,none": 0.03831305140884601}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2530612244897959, "acc_stderr,none": 0.027833023871399677}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.27860696517412936, "acc_stderr,none": 0.031700561834973086}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_stem": {"acc,none": 0.2626070409134158, "acc_stderr,none": 0.007829754019902713, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.17037037037037037, "acc_stderr,none": 0.032477811859955935}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.27631578947368424, "acc_stderr,none": 0.03639057569952924}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.03586879280080342}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.3235294117647059, "acc_stderr,none": 0.04655010411319616}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165065}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.26382978723404255, "acc_stderr,none": 0.028809989854102956}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.23448275862068965, "acc_stderr,none": 0.035306258743465914}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24867724867724866, "acc_stderr,none": 0.022261817692400158}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.267741935483871, "acc_stderr,none": 0.02518900666021238}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.23645320197044334, "acc_stderr,none": 0.029896114291733552}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.21, "acc_stderr,none": 0.04093601807403326}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24814814814814815, "acc_stderr,none": 0.0263357394040558}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.31788079470198677, "acc_stderr,none": 0.038020397601079024}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.35648148148148145, "acc_stderr,none": 0.032664783315272714}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.26785714285714285, "acc_stderr,none": 0.04203277291467763}, "sciq": {"alias": "sciq", "acc,none": 0.92, "acc_stderr,none": 0.008583336977753651, "acc_norm,none": 0.903, "acc_norm_stderr,none": 0.009363689373248088}} |
| {"created_at": "2025-08-22T11:02:29.617264", "global_step": 72000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20884520884520885, "acc_stderr,none": 0.011637590576063046}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.45668193586934874, "acc_stderr,none": 0.004971019942726573, "acc_norm,none": 0.6018721370244972, "acc_norm_stderr,none": 0.004885116465550284}, "mmlu": {"acc,none": 0.2650619569861843, "acc_stderr,none": 0.003702907360701558, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.23379383634431455, "acc_stderr,none": 0.006172091746830425, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.30158730158730157, "acc_stderr,none": 0.04104947269903394}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24848484848484848, "acc_stderr,none": 0.03374402644139404}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.030778554678693247}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.20675105485232068, "acc_stderr,none": 0.026361651668389094}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.1652892561983471, "acc_stderr,none": 0.03390780612972776}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2037037037037037, "acc_stderr,none": 0.03893542518824847}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2392638036809816, "acc_stderr,none": 0.033519538795212696}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.23410404624277456, "acc_stderr,none": 0.022797110278071145}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23687150837988827, "acc_stderr,none": 0.014219570788103982}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.21221864951768488, "acc_stderr,none": 0.0232227567974351}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.22530864197530864, "acc_stderr,none": 0.02324620264781975}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.23989569752281617, "acc_stderr,none": 0.010906282617981652}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.03188578017686398}, "mmlu_other": {"acc,none": 0.2645638879948503, "acc_stderr,none": 0.007848041792299197, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2830188679245283, "acc_stderr,none": 0.027724236492700907}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.28901734104046245, "acc_stderr,none": 0.03456425745086999}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.15695067264573992, "acc_stderr,none": 0.024413587174907433}, "mmlu_management": {"alias": " - management", "acc,none": 0.3883495145631068, "acc_stderr,none": 0.0482572933735639}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2264957264957265, "acc_stderr,none": 0.02742100729539291}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768077}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23627075351213284, "acc_stderr,none": 0.0151904737170375}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3104575163398693, "acc_stderr,none": 0.026493033225145894}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.25177304964539005, "acc_stderr,none": 0.0258921511567094}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.40808823529411764, "acc_stderr,none": 0.029855261393483927}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2289156626506024, "acc_stderr,none": 0.03270745277352477}, "mmlu_social_sciences": {"acc,none": 0.300942476438089, "acc_stderr,none": 0.008232552352326025, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281337}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.31313131313131315, "acc_stderr,none": 0.033042050878136525}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.35233160621761656, "acc_stderr,none": 0.03447478286414357}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3717948717948718, "acc_stderr,none": 0.024503472557110936}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.33613445378151263, "acc_stderr,none": 0.03068473711513537}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.344954128440367, "acc_stderr,none": 0.020380605405066966}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3053435114503817, "acc_stderr,none": 0.04039314978724561}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2369281045751634, "acc_stderr,none": 0.017201662169789796}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03955932861795833}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.27346938775510204, "acc_stderr,none": 0.028535560337128445}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.26865671641791045, "acc_stderr,none": 0.03134328358208954}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_stem": {"acc,none": 0.27719632096416114, "acc_stderr,none": 0.007923560213269025, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.16296296296296298, "acc_stderr,none": 0.031905414744828414}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3026315789473684, "acc_stderr,none": 0.03738520676119667}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2986111111111111, "acc_stderr,none": 0.038270523579507554}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.3627450980392157, "acc_stderr,none": 0.047840607041056527}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.20425531914893616, "acc_stderr,none": 0.026355158413349417}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2206896551724138, "acc_stderr,none": 0.03455930201924814}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2566137566137566, "acc_stderr,none": 0.022494510767503154}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3032258064516129, "acc_stderr,none": 0.026148685930671746}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2561576354679803, "acc_stderr,none": 0.030712730070982592}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.27037037037037037, "acc_stderr,none": 0.027080372815145658}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.31788079470198677, "acc_stderr,none": 0.038020397601079024}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.4212962962962963, "acc_stderr,none": 0.03367462138896079}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.24107142857142858, "acc_stderr,none": 0.04059867246952685}, "sciq": {"alias": "sciq", "acc,none": 0.926, "acc_stderr,none": 0.008282064512704159, "acc_norm,none": 0.904, "acc_norm_stderr,none": 0.009320454434783267}} |
| {"created_at": "2025-08-22T13:09:30.715118", "global_step": 74000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19574119574119575, "acc_stderr,none": 0.011359497363584391}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.45578570005974905, "acc_stderr,none": 0.004970234032728297, "acc_norm,none": 0.6024696275642303, "acc_norm_stderr,none": 0.004883871774350598}, "mmlu": {"acc,none": 0.2594359777809429, "acc_stderr,none": 0.0036972931377086595, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2595111583421892, "acc_stderr,none": 0.00638457146241928, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.03670066451047181}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.03317505930009181}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.030778554678693257}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2109704641350211, "acc_stderr,none": 0.02655837250266192}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.33884297520661155, "acc_stderr,none": 0.0432076780753667}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094633}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2883435582822086, "acc_stderr,none": 0.03559039531617342}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.28901734104046245, "acc_stderr,none": 0.02440517393578323}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.01424263007057488}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2057877813504823, "acc_stderr,none": 0.022961339906764248}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.30246913580246915, "acc_stderr,none": 0.025557653981868052}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2685788787483703, "acc_stderr,none": 0.01132005662912173}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.28654970760233917, "acc_stderr,none": 0.034678266857038266}, "mmlu_other": {"acc,none": 0.2645638879948503, "acc_stderr,none": 0.007912922438718935, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2641509433962264, "acc_stderr,none": 0.027134291628741695}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.23121387283236994, "acc_stderr,none": 0.03214737302029471}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.36, "acc_stderr,none": 0.048241815132442176}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.22869955156950672, "acc_stderr,none": 0.028188240046929203}, "mmlu_management": {"alias": " - management", "acc,none": 0.2912621359223301, "acc_stderr,none": 0.044986763205729224}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.29914529914529914, "acc_stderr,none": 0.02999695185834949}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2784163473818646, "acc_stderr,none": 0.016028295188992462}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.24183006535947713, "acc_stderr,none": 0.024518195641879334}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2730496453900709, "acc_stderr,none": 0.026577860943307847}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.22794117647058823, "acc_stderr,none": 0.0254830814680298}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2469879518072289, "acc_stderr,none": 0.03357351982064537}, "mmlu_social_sciences": {"acc,none": 0.25381865453363667, "acc_stderr,none": 0.007856620977122504, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281336}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.26262626262626265, "acc_stderr,none": 0.03135305009533087}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.24352331606217617, "acc_stderr,none": 0.030975436386845426}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.26153846153846155, "acc_stderr,none": 0.022282141204204426}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.24369747899159663, "acc_stderr,none": 0.027886828078380575}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.25688073394495414, "acc_stderr,none": 0.01873249292834247}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2630718954248366, "acc_stderr,none": 0.017812676542320657}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.20909090909090908, "acc_stderr,none": 0.038950910157241364}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.22857142857142856, "acc_stderr,none": 0.026882144922307748}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.26865671641791045, "acc_stderr,none": 0.03134328358208954}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_stem": {"acc,none": 0.2597526165556613, "acc_stderr,none": 0.007795798289904596, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.4, "acc_stderr,none": 0.04923659639173309}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2, "acc_stderr,none": 0.034554737023254366}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2565789473684211, "acc_stderr,none": 0.0355418036802569}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2152777777777778, "acc_stderr,none": 0.03437079344106134}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.26382978723404255, "acc_stderr,none": 0.028809989854102967}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.3103448275862069, "acc_stderr,none": 0.03855289616378948}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.022182037202948365}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.23870967741935484, "acc_stderr,none": 0.024251071262208837}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.28078817733990147, "acc_stderr,none": 0.03161856335358611}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.36, "acc_stderr,none": 0.048241815132442176}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.22962962962962963, "acc_stderr,none": 0.025644108639267645}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.25165562913907286, "acc_stderr,none": 0.035433042343899844}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.27314814814814814, "acc_stderr,none": 0.030388051301678116}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.32142857142857145, "acc_stderr,none": 0.04432804055291519}, "sciq": {"alias": "sciq", "acc,none": 0.928, "acc_stderr,none": 0.008178195576218681, "acc_norm,none": 0.905, "acc_norm_stderr,none": 0.009276910103103338}} |
| {"created_at": "2025-08-22T13:10:31.900554", "global_step": 76000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19328419328419327, "acc_stderr,none": 0.01130520748682771}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.46046604262099183, "acc_stderr,none": 0.004974159561342697, "acc_norm,none": 0.6091416052579167, "acc_norm_stderr,none": 0.004869455150933821}, "mmlu": {"acc,none": 0.24505056259792052, "acc_stderr,none": 0.0036247883712279714, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24420828905419767, "acc_stderr,none": 0.006261923055275968, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.039325376803928704}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03225078108306289}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.03132179803083292}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2616033755274262, "acc_stderr,none": 0.028609516716994934}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2644628099173554, "acc_stderr,none": 0.040261875275912046}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094632}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.23121387283236994, "acc_stderr,none": 0.022698657167855713}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.18006430868167203, "acc_stderr,none": 0.02182342285774494}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.023788583551658537}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2516297262059974, "acc_stderr,none": 0.011083276280441905}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.30994152046783624, "acc_stderr,none": 0.035469769593931624}, "mmlu_other": {"acc,none": 0.25555197940135177, "acc_stderr,none": 0.007822592172640483, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22641509433962265, "acc_stderr,none": 0.02575755989310674}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.21965317919075145, "acc_stderr,none": 0.031568093627031744}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3094170403587444, "acc_stderr,none": 0.03102441174057221}, "mmlu_management": {"alias": " - management", "acc,none": 0.23300970873786409, "acc_stderr,none": 0.04185832598928315}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.32051282051282054, "acc_stderr,none": 0.030572811310299607}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24648786717752236, "acc_stderr,none": 0.015411308769686936}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.24183006535947713, "acc_stderr,none": 0.024518195641879334}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.23049645390070922, "acc_stderr,none": 0.025123739226872405}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.28308823529411764, "acc_stderr,none": 0.02736586113151381}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2289156626506024, "acc_stderr,none": 0.03270745277352477}, "mmlu_social_sciences": {"acc,none": 0.24081897952551187, "acc_stderr,none": 0.007684686057256152, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.20175438596491227, "acc_stderr,none": 0.037752050135836386}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.18181818181818182, "acc_stderr,none": 0.027479603010538787}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.17098445595854922, "acc_stderr,none": 0.027171213683164528}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3, "acc_stderr,none": 0.023234581088428494}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2184873949579832, "acc_stderr,none": 0.02684151432295894}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23119266055045873, "acc_stderr,none": 0.018075750241633163}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.33587786259541985, "acc_stderr,none": 0.041423137719966634}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.01774089950917779}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2, "acc_stderr,none": 0.03831305140884603}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2, "acc_stderr,none": 0.025607375986579157}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2537313432835821, "acc_stderr,none": 0.030769444967296014}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_stem": {"acc,none": 0.24008880431335236, "acc_stderr,none": 0.007590935007300585, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2074074074074074, "acc_stderr,none": 0.03502553170678316}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19736842105263158, "acc_stderr,none": 0.03238981601699397}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368466}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.18627450980392157, "acc_stderr,none": 0.038739587141493524}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3276595744680851, "acc_stderr,none": 0.030683020843231004}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.22758620689655173, "acc_stderr,none": 0.03493950380131184}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.22486772486772486, "acc_stderr,none": 0.02150209607822914}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.22258064516129034, "acc_stderr,none": 0.023664216671642518}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.16748768472906403, "acc_stderr,none": 0.02627308604753542}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.02620276653465215}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.26490066225165565, "acc_stderr,none": 0.036030385453603826}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.30092592592592593, "acc_stderr,none": 0.03128039084329883}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.32142857142857145, "acc_stderr,none": 0.04432804055291518}, "sciq": {"alias": "sciq", "acc,none": 0.93, "acc_stderr,none": 0.008072494358323502, "acc_norm,none": 0.909, "acc_norm_stderr,none": 0.009099549538400236}} |
| {"created_at": "2025-08-22T15:26:27.843668", "global_step": 78000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.1941031941031941, "acc_stderr,none": 0.011323381588920437}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.46355307707627963, "acc_stderr,none": 0.004976507121076262, "acc_norm,none": 0.611431985660227, "acc_norm_stderr,none": 0.00486428617673184}, "mmlu": {"acc,none": 0.24910981341689217, "acc_stderr,none": 0.0036447546560958066, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2414452709883103, "acc_stderr,none": 0.0062354616606938125, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.23809523809523808, "acc_stderr,none": 0.03809523809523811}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.0331750593000918}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25, "acc_stderr,none": 0.03039153369274154}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.20675105485232068, "acc_stderr,none": 0.026361651668389094}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2231404958677686, "acc_stderr,none": 0.03800754475228733}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04330043749650742}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2398843930635838, "acc_stderr,none": 0.022989592543123567}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.18006430868167203, "acc_stderr,none": 0.02182342285774495}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2623456790123457, "acc_stderr,none": 0.024477222856135118}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24771838331160365, "acc_stderr,none": 0.011025499291443735}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.03615507630310935}, "mmlu_other": {"acc,none": 0.26649501126488573, "acc_stderr,none": 0.007933807338506562, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.23018867924528302, "acc_stderr,none": 0.025907897122408173}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2832369942196532, "acc_stderr,none": 0.034355680560478746}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.30493273542600896, "acc_stderr,none": 0.030898610882477515}, "mmlu_management": {"alias": " - management", "acc,none": 0.2815533980582524, "acc_stderr,none": 0.044532548363264673}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2948717948717949, "acc_stderr,none": 0.029872577708891162}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.26947637292464877, "acc_stderr,none": 0.015866243073215047}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.25163398692810457, "acc_stderr,none": 0.0248480182638752}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.22340425531914893, "acc_stderr,none": 0.02484792135806396}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.0276784686421447}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.21686746987951808, "acc_stderr,none": 0.03208284450356365}, "mmlu_social_sciences": {"acc,none": 0.25089372765680856, "acc_stderr,none": 0.007803358829579289, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2719298245614035, "acc_stderr,none": 0.04185774424022056}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.17676767676767677, "acc_stderr,none": 0.027178752639044915}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.21243523316062177, "acc_stderr,none": 0.029519282616817244}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.30256410256410254, "acc_stderr,none": 0.023290888053772725}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.25210084033613445, "acc_stderr,none": 0.028205545033277733}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23853211009174313, "acc_stderr,none": 0.01827257581023186}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3053435114503817, "acc_stderr,none": 0.04039314978724561}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2630718954248366, "acc_stderr,none": 0.017812676542320657}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03955932861795833}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.20408163265306123, "acc_stderr,none": 0.025801283475090503}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2537313432835821, "acc_stderr,none": 0.030769444967296014}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_stem": {"acc,none": 0.241674595623216, "acc_stderr,none": 0.007601753010357441, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.03355677216313142}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.20394736842105263, "acc_stderr,none": 0.03279000406310049}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.03551446610810826}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.16, "acc_stderr,none": 0.03684529491774708}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.17647058823529413, "acc_stderr,none": 0.03793281185307809}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.33191489361702126, "acc_stderr,none": 0.030783736757745643}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.20689655172413793, "acc_stderr,none": 0.03375672449560554}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24338624338624337, "acc_stderr,none": 0.022101128787415426}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.22580645161290322, "acc_stderr,none": 0.023785577884181015}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.1625615763546798, "acc_stderr,none": 0.02596030006460558}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421296}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.02620276653465215}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2847682119205298, "acc_stderr,none": 0.03684881521389023}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.03085199299325701}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.32142857142857145, "acc_stderr,none": 0.04432804055291518}, "sciq": {"alias": "sciq", "acc,none": 0.929, "acc_stderr,none": 0.008125578442487924, "acc_norm,none": 0.904, "acc_norm_stderr,none": 0.009320454434783264}} |
| {"created_at": "2025-08-22T17:06:45.273855", "global_step": 80000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.1981981981981982, "acc_stderr,none": 0.011413095456219316}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.46415056761601275, "acc_stderr,none": 0.004976939333240074, "acc_norm,none": 0.6128261302529376, "acc_norm_stderr,none": 0.004861084534087014}, "mmlu": {"acc,none": 0.2525993448226748, "acc_stderr,none": 0.00366244095000134, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24017003188097769, "acc_stderr,none": 0.006228694121539488, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04006168083848876}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24848484848484848, "acc_stderr,none": 0.03374402644139405}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.03058759135160424}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.21518987341772153, "acc_stderr,none": 0.026750826994676177}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04330043749650742}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.03291099578615768}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.23121387283236994, "acc_stderr,none": 0.022698657167855713}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.19292604501607716, "acc_stderr,none": 0.022411516780911363}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.024659685185967277}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.23728813559322035, "acc_stderr,none": 0.010865436690780264}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.03508771929824563}, "mmlu_other": {"acc,none": 0.2648857418731896, "acc_stderr,none": 0.007923968124743563, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2490566037735849, "acc_stderr,none": 0.02661648298050171}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.27167630057803466, "acc_stderr,none": 0.0339175032232166}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2914798206278027, "acc_stderr,none": 0.030500283176545913}, "mmlu_management": {"alias": " - management", "acc,none": 0.3106796116504854, "acc_stderr,none": 0.0458212416016155}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2692307692307692, "acc_stderr,none": 0.029058588303748845}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2554278416347382, "acc_stderr,none": 0.015594955384455768}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.02495418432487991}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24468085106382978, "acc_stderr,none": 0.025645553622266736}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.2867647058823529, "acc_stderr,none": 0.02747227447323382}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.24096385542168675, "acc_stderr,none": 0.0332939411907353}, "mmlu_social_sciences": {"acc,none": 0.25999350016249595, "acc_stderr,none": 0.007902474245423804, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.039994238792813365}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.23232323232323232, "acc_stderr,none": 0.03008862949021749}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.25906735751295334, "acc_stderr,none": 0.03161877917935411}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3153846153846154, "acc_stderr,none": 0.02355964698318994}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2605042016806723, "acc_stderr,none": 0.028510251512341933}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22935779816513763, "acc_stderr,none": 0.018025349724618684}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.29770992366412213, "acc_stderr,none": 0.04010358942462203}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.27941176470588236, "acc_stderr,none": 0.018152871051538816}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.20909090909090908, "acc_stderr,none": 0.03895091015724138}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.22040816326530613, "acc_stderr,none": 0.026537045312145312}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.25870646766169153, "acc_stderr,none": 0.030965903123573037}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_stem": {"acc,none": 0.25182366000634315, "acc_stderr,none": 0.007694772568281761, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.36, "acc_stderr,none": 0.048241815132442176}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2, "acc_stderr,none": 0.034554737023254366}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19078947368421054, "acc_stderr,none": 0.031975658210325}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.21, "acc_stderr,none": 0.04093601807403326}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.04280105837364395}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3617021276595745, "acc_stderr,none": 0.03141082197596239}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.20689655172413793, "acc_stderr,none": 0.03375672449560554}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.021132859182754454}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.22258064516129034, "acc_stderr,none": 0.023664216671642518}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.1921182266009852, "acc_stderr,none": 0.02771931570961478}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.026719240783712152}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.271523178807947, "acc_stderr,none": 0.03631329803969653}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.3148148148148148, "acc_stderr,none": 0.03167468706828979}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.33035714285714285, "acc_stderr,none": 0.04464285714285713}, "sciq": {"alias": "sciq", "acc,none": 0.927, "acc_stderr,none": 0.008230354715244071, "acc_norm,none": 0.901, "acc_norm_stderr,none": 0.009449248027662775}} |
| {"created_at": "2025-08-22T18:19:56.046140", "global_step": 82000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20147420147420148, "acc_stderr,none": 0.011483500195202905}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4642501493726349, "acc_stderr,none": 0.004977010670436551, "acc_norm,none": 0.6129257120095598, "acc_norm_stderr,none": 0.004860854240821974}, "mmlu": {"acc,none": 0.2564449508617006, "acc_stderr,none": 0.0036807889363621657, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.23804463336875664, "acc_stderr,none": 0.00621082865165123, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.03932537680392872}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2606060606060606, "acc_stderr,none": 0.034277431758165236}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.030778554678693244}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.22362869198312235, "acc_stderr,none": 0.027123298205229972}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2066115702479339, "acc_stderr,none": 0.03695980128098824}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094632}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2392638036809816, "acc_stderr,none": 0.033519538795212696}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.21098265895953758, "acc_stderr,none": 0.021966309947043117}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.18971061093247588, "acc_stderr,none": 0.022268196258783242}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2654320987654321, "acc_stderr,none": 0.024569223600460842}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2457627118644068, "acc_stderr,none": 0.010996156635142692}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.03188578017686399}, "mmlu_other": {"acc,none": 0.271000965561635, "acc_stderr,none": 0.007977650861577479, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2528301886792453, "acc_stderr,none": 0.02674989977124124}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.03295304696818318}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2556053811659193, "acc_stderr,none": 0.029275891003969927}, "mmlu_management": {"alias": " - management", "acc,none": 0.3300970873786408, "acc_stderr,none": 0.0465614711001235}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3076923076923077, "acc_stderr,none": 0.030236389942173092}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.25287356321839083, "acc_stderr,none": 0.01554337731371968}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2875816993464052, "acc_stderr,none": 0.02591780611714716}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24468085106382978, "acc_stderr,none": 0.025645553622266736}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.30514705882352944, "acc_stderr,none": 0.027971541370170598}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.25301204819277107, "acc_stderr,none": 0.033844291552331346}, "mmlu_social_sciences": {"acc,none": 0.27169320766980826, "acc_stderr,none": 0.008011295823538345, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748141}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.23737373737373738, "acc_stderr,none": 0.03031371053819889}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.23316062176165803, "acc_stderr,none": 0.030516111371476005}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.31794871794871793, "acc_stderr,none": 0.023610884308927865}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.029344572500634342}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.26605504587155965, "acc_stderr,none": 0.018946022322225604}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3282442748091603, "acc_stderr,none": 0.04118438565806299}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.28921568627450983, "acc_stderr,none": 0.018342529845275908}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.04013964554072773}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.22448979591836735, "acc_stderr,none": 0.02671143055553843}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22885572139303484, "acc_stderr,none": 0.02970528405677244}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_stem": {"acc,none": 0.25467808436409767, "acc_stderr,none": 0.007739531273203053, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.35, "acc_stderr,none": 0.04793724854411019}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.17777777777777778, "acc_stderr,none": 0.03302789859901717}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2236842105263158, "acc_stderr,none": 0.03391160934343604}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.03551446610810826}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036845}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.04280105837364396}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.33191489361702126, "acc_stderr,none": 0.030783736757745633}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2206896551724138, "acc_stderr,none": 0.03455930201924812}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.02113285918275445}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.24516129032258063, "acc_stderr,none": 0.024472243840895518}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.22660098522167488, "acc_stderr,none": 0.02945486383529296}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.026719240783712146}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2781456953642384, "acc_stderr,none": 0.03658603262763744}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.33796296296296297, "acc_stderr,none": 0.03225941352631295}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.04364226155841044}, "sciq": {"alias": "sciq", "acc,none": 0.932, "acc_stderr,none": 0.007964887911291605, "acc_norm,none": 0.905, "acc_norm_stderr,none": 0.009276910103103338}} |
| {"created_at": "2025-08-22T23:07:01.518645", "global_step": 86000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2022932022932023, "acc_stderr,none": 0.01150091452526044}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4654451304521012, "acc_stderr,none": 0.004977851161904398, "acc_norm,none": 0.6154152559251145, "acc_norm_stderr,none": 0.004855027248398155}, "mmlu": {"acc,none": 0.25630252100840334, "acc_stderr,none": 0.003680451098433255, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2410201912858661, "acc_stderr,none": 0.006237959798025267, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.03970158273235173}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2606060606060606, "acc_stderr,none": 0.034277431758165236}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.030587591351604243}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.21940928270042195, "acc_stderr,none": 0.026939106581553945}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.23140495867768596, "acc_stderr,none": 0.03849856098794089}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094632}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.03259177392742178}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.22832369942196531, "acc_stderr,none": 0.022598703804321617}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1864951768488746, "acc_stderr,none": 0.022122439772480774}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25, "acc_stderr,none": 0.02409347123262133}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2516297262059974, "acc_stderr,none": 0.011083276280441902}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2573099415204678, "acc_stderr,none": 0.03352799844161865}, "mmlu_other": {"acc,none": 0.271000965561635, "acc_stderr,none": 0.00798091817504173, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2641509433962264, "acc_stderr,none": 0.02713429162874171}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.0326926380614177}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2600896860986547, "acc_stderr,none": 0.029442495585857473}, "mmlu_management": {"alias": " - management", "acc,none": 0.3106796116504854, "acc_stderr,none": 0.045821241601615506}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.27350427350427353, "acc_stderr,none": 0.029202540153431183}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.34, "acc_stderr,none": 0.047609522856952344}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2720306513409962, "acc_stderr,none": 0.015913367447500527}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.26143790849673204, "acc_stderr,none": 0.025160998214292452}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.25177304964539005, "acc_stderr,none": 0.0258921511567094}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.2977941176470588, "acc_stderr,none": 0.027778298701545443}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.23493975903614459, "acc_stderr,none": 0.03300533186128922}, "mmlu_social_sciences": {"acc,none": 0.27071823204419887, "acc_stderr,none": 0.008006839622990359, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2719298245614035, "acc_stderr,none": 0.04185774424022056}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.24242424242424243, "acc_stderr,none": 0.030532892233932022}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.24352331606217617, "acc_stderr,none": 0.030975436386845426}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.31794871794871793, "acc_stderr,none": 0.023610884308927865}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23949579831932774, "acc_stderr,none": 0.027722065493361276}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.27889908256880735, "acc_stderr,none": 0.019227468876463514}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3282442748091603, "acc_stderr,none": 0.04118438565806299}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.017952449196987866}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.041723430387053825}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.22448979591836735, "acc_stderr,none": 0.02671143055553843}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24875621890547264, "acc_stderr,none": 0.030567675938916714}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_stem": {"acc,none": 0.2505550269584523, "acc_stderr,none": 0.0076885479040717155, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.36, "acc_stderr,none": 0.048241815132442176}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2, "acc_stderr,none": 0.034554737023254366}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.033176727875331574}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237654}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.34893617021276596, "acc_stderr,none": 0.031158522131357773}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.20689655172413793, "acc_stderr,none": 0.03375672449560554}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2328042328042328, "acc_stderr,none": 0.021765961672154527}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.22903225806451613, "acc_stderr,none": 0.023904914311782658}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.18226600985221675, "acc_stderr,none": 0.027163340859645155}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.026067159222275784}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2781456953642384, "acc_stderr,none": 0.03658603262763744}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.30092592592592593, "acc_stderr,none": 0.03128039084329883}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.33035714285714285, "acc_stderr,none": 0.04464285714285712}, "sciq": {"alias": "sciq", "acc,none": 0.928, "acc_stderr,none": 0.008178195576218681, "acc_norm,none": 0.905, "acc_norm_stderr,none": 0.009276910103103334}} |
| {"created_at": "2025-08-22T23:07:12.680803", "global_step": 84000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20638820638820637, "acc_stderr,none": 0.011586881879177835}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.46554471220872334, "acc_stderr,none": 0.004977919906875361, "acc_norm,none": 0.6159131647082254, "acc_norm_stderr,none": 0.004853845750392139}, "mmlu": {"acc,none": 0.25267055974932345, "acc_stderr,none": 0.0036636393628437297, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2405951115834219, "acc_stderr,none": 0.006233795558388824, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.0404061017820884}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.03453131801885415}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.03019028245350195}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.22784810126582278, "acc_stderr,none": 0.027303484599069443}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2231404958677686, "acc_stderr,none": 0.03800754475228733}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.04414343666854932}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.03291099578615768}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.23121387283236994, "acc_stderr,none": 0.022698657167855713}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.19292604501607716, "acc_stderr,none": 0.022411516780911363}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25617283950617287, "acc_stderr,none": 0.024288533637726095}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24119947848761408, "acc_stderr,none": 0.010926496102034954}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.26900584795321636, "acc_stderr,none": 0.03401052620104089}, "mmlu_other": {"acc,none": 0.26263276472481495, "acc_stderr,none": 0.007899859967723626, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.26037735849056604, "acc_stderr,none": 0.02700876609070809}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2658959537572254, "acc_stderr,none": 0.03368762932259431}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2825112107623318, "acc_stderr,none": 0.030216831011508762}, "mmlu_management": {"alias": " - management", "acc,none": 0.3106796116504854, "acc_stderr,none": 0.0458212416016155}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2948717948717949, "acc_stderr,none": 0.02987257770889117}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.015302380123542082}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.26143790849673204, "acc_stderr,none": 0.025160998214292456}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2375886524822695, "acc_stderr,none": 0.0253895125527299}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.2757352941176471, "acc_stderr,none": 0.027146271936625162}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.23493975903614459, "acc_stderr,none": 0.03300533186128922}, "mmlu_social_sciences": {"acc,none": 0.26259343516412087, "acc_stderr,none": 0.007927988949811178, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.04303684033537317}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.0298575156733864}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.23316062176165803, "acc_stderr,none": 0.030516111371476008}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.30512820512820515, "acc_stderr,none": 0.023346335293325887}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.027553614467863804}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23853211009174313, "acc_stderr,none": 0.018272575810231867}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.32061068702290074, "acc_stderr,none": 0.04093329229834277}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.018120224251484587}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.24545454545454545, "acc_stderr,none": 0.04122066502878285}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.22448979591836735, "acc_stderr,none": 0.02671143055553843}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.263681592039801, "acc_stderr,none": 0.03115715086935557}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_stem": {"acc,none": 0.25118934348239774, "acc_stderr,none": 0.007695946713947398, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.17037037037037037, "acc_stderr,none": 0.03247781185995593}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.20394736842105263, "acc_stderr,none": 0.03279000406310049}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.03745554791462456}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.04220773659171453}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.35319148936170214, "acc_stderr,none": 0.031245325202761926}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.20689655172413793, "acc_stderr,none": 0.03375672449560554}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.23544973544973544, "acc_stderr,none": 0.021851509822031715}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.23225806451612904, "acc_stderr,none": 0.024022256130308235}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.18719211822660098, "acc_stderr,none": 0.027444924966882618}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25555555555555554, "acc_stderr,none": 0.026593939101844058}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.25165562913907286, "acc_stderr,none": 0.035433042343899844}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.031141447823536048}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.04364226155841044}, "sciq": {"alias": "sciq", "acc,none": 0.926, "acc_stderr,none": 0.008282064512704159, "acc_norm,none": 0.899, "acc_norm_stderr,none": 0.009533618929341027}} |
| {"created_at": "2025-08-22T23:09:07.923310", "global_step": 88000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2031122031122031, "acc_stderr,none": 0.011518254793634105}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4662417845050787, "acc_stderr,none": 0.004978395540514373, "acc_norm,none": 0.6156144194383589, "acc_norm_stderr,none": 0.0048545552940175455}, "mmlu": {"acc,none": 0.25359635379575557, "acc_stderr,none": 0.0036679829940411924, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2410201912858661, "acc_stderr,none": 0.006234097702727437, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.03893259610604673}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.03317505930009179}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.02998373305591361}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.21518987341772153, "acc_stderr,none": 0.026750826994676187}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2231404958677686, "acc_stderr,none": 0.03800754475228733}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2085889570552147, "acc_stderr,none": 0.031921934489347215}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2138728323699422, "acc_stderr,none": 0.02207570925175718}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1832797427652733, "acc_stderr,none": 0.021974198848265812}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.024659685185967277}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2561929595827901, "acc_stderr,none": 0.01114917315311058}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.28654970760233917, "acc_stderr,none": 0.03467826685703826}, "mmlu_other": {"acc,none": 0.26649501126488573, "acc_stderr,none": 0.007941517127542026, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.24528301886792453, "acc_stderr,none": 0.0264803571798957}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.03295304696818318}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2645739910313901, "acc_stderr,none": 0.02960510321703832}, "mmlu_management": {"alias": " - management", "acc,none": 0.30097087378640774, "acc_stderr,none": 0.045416094465039476}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2948717948717949, "acc_stderr,none": 0.02987257770889117}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2656449553001277, "acc_stderr,none": 0.015794302487888722}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2679738562091503, "acc_stderr,none": 0.025360603796242557}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24822695035460993, "acc_stderr,none": 0.025770015644290382}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.026799562024887678}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2469879518072289, "acc_stderr,none": 0.03357351982064536}, "mmlu_social_sciences": {"acc,none": 0.2596685082872928, "acc_stderr,none": 0.007897306651621446, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2807017543859649, "acc_stderr,none": 0.042270544512322}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.21717171717171718, "acc_stderr,none": 0.02937661648494564}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.24870466321243523, "acc_stderr,none": 0.031195840877700307}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3153846153846154, "acc_stderr,none": 0.023559646983189957}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23949579831932774, "acc_stderr,none": 0.027722065493361266}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.25137614678899084, "acc_stderr,none": 0.018599206360287415}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3053435114503817, "acc_stderr,none": 0.04039314978724561}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.017848089574913222}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.04069306319721377}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2163265306122449, "acc_stderr,none": 0.02635891633490405}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22388059701492538, "acc_stderr,none": 0.029475250236017193}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_stem": {"acc,none": 0.2537266095781795, "acc_stderr,none": 0.007724862490803291, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2074074074074074, "acc_stderr,none": 0.03502553170678318}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.20394736842105263, "acc_stderr,none": 0.03279000406310049}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2152777777777778, "acc_stderr,none": 0.03437079344106135}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.04158307533083286}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3659574468085106, "acc_stderr,none": 0.0314895582974553}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.20689655172413793, "acc_stderr,none": 0.03375672449560554}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.021411684393694203}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25483870967741934, "acc_stderr,none": 0.024790118459332208}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2019704433497537, "acc_stderr,none": 0.02824735012218027}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.04605661864718381}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24814814814814815, "acc_stderr,none": 0.0263357394040558}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.26490066225165565, "acc_stderr,none": 0.036030385453603826}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.3101851851851852, "acc_stderr,none": 0.03154696285656627}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.33035714285714285, "acc_stderr,none": 0.04464285714285712}, "sciq": {"alias": "sciq", "acc,none": 0.929, "acc_stderr,none": 0.008125578442487924, "acc_norm,none": 0.902, "acc_norm_stderr,none": 0.009406619184621264}} |
|
|