| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 4.99749049974905, | |
| "eval_steps": 50, | |
| "global_step": 8715, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.005736000573600057, | |
| "grad_norm": 2.3000271320343018, | |
| "learning_rate": 1.1467889908256882e-07, | |
| "loss": 0.3103, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.011472001147200114, | |
| "grad_norm": 0.23891927301883698, | |
| "learning_rate": 2.2935779816513764e-07, | |
| "loss": 0.2101, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.017208001720800174, | |
| "grad_norm": 3.7850193977355957, | |
| "learning_rate": 3.4403669724770646e-07, | |
| "loss": 0.3158, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.022944002294400228, | |
| "grad_norm": 10.902544021606445, | |
| "learning_rate": 4.587155963302753e-07, | |
| "loss": 0.3107, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.028680002868000286, | |
| "grad_norm": 12.402349472045898, | |
| "learning_rate": 5.733944954128441e-07, | |
| "loss": 0.3895, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.028680002868000286, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 1.0573054552078247, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 17.8757, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 36.53, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 18.293, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.03441600344160035, | |
| "grad_norm": 1.5937564373016357, | |
| "learning_rate": 6.880733944954129e-07, | |
| "loss": 0.2534, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.0401520040152004, | |
| "grad_norm": 3.687685012817383, | |
| "learning_rate": 8.027522935779817e-07, | |
| "loss": 0.3795, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.045888004588800456, | |
| "grad_norm": 0.03578287363052368, | |
| "learning_rate": 9.174311926605506e-07, | |
| "loss": 0.4276, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.05162400516240052, | |
| "grad_norm": 0.022680282592773438, | |
| "learning_rate": 1.0321100917431195e-06, | |
| "loss": 0.277, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.05736000573600057, | |
| "grad_norm": 1.7502835988998413, | |
| "learning_rate": 1.1467889908256882e-06, | |
| "loss": 0.2279, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.05736000573600057, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.9872124791145325, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.223, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.768, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.408, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.06309600630960063, | |
| "grad_norm": 4.763681411743164, | |
| "learning_rate": 1.261467889908257e-06, | |
| "loss": 0.3441, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.0688320068832007, | |
| "grad_norm": 7.319093704223633, | |
| "learning_rate": 1.3761467889908258e-06, | |
| "loss": 0.4273, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.07456800745680074, | |
| "grad_norm": 9.32235336303711, | |
| "learning_rate": 1.4908256880733945e-06, | |
| "loss": 0.3338, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.0803040080304008, | |
| "grad_norm": 2.0044586658477783, | |
| "learning_rate": 1.6055045871559635e-06, | |
| "loss": 0.1364, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.08604000860400086, | |
| "grad_norm": 10.962570190429688, | |
| "learning_rate": 1.7201834862385322e-06, | |
| "loss": 0.2404, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.08604000860400086, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.7815765738487244, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2921, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.669, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.358, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.09177600917760091, | |
| "grad_norm": 4.845127105712891, | |
| "learning_rate": 1.8348623853211011e-06, | |
| "loss": 0.3601, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.09751200975120097, | |
| "grad_norm": 3.165642738342285, | |
| "learning_rate": 1.94954128440367e-06, | |
| "loss": 0.1938, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.10324801032480103, | |
| "grad_norm": 3.4738926887512207, | |
| "learning_rate": 2.064220183486239e-06, | |
| "loss": 0.1979, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.1089840108984011, | |
| "grad_norm": 0.34281352162361145, | |
| "learning_rate": 2.1788990825688075e-06, | |
| "loss": 0.2278, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.11472001147200114, | |
| "grad_norm": 9.631810188293457, | |
| "learning_rate": 2.2935779816513764e-06, | |
| "loss": 0.1642, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.11472001147200114, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.32764628529548645, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1681, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.848, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.448, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.1204560120456012, | |
| "grad_norm": 4.584328651428223, | |
| "learning_rate": 2.4082568807339453e-06, | |
| "loss": 0.1721, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.12619201261920127, | |
| "grad_norm": 7.435701370239258, | |
| "learning_rate": 2.522935779816514e-06, | |
| "loss": 0.1632, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.1319280131928013, | |
| "grad_norm": 2.957846164703369, | |
| "learning_rate": 2.6376146788990823e-06, | |
| "loss": 0.1175, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.1376640137664014, | |
| "grad_norm": 4.342548847198486, | |
| "learning_rate": 2.7522935779816517e-06, | |
| "loss": 0.0877, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.14340001434000144, | |
| "grad_norm": 2.2145071029663086, | |
| "learning_rate": 2.8669724770642206e-06, | |
| "loss": 0.0692, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.14340001434000144, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.14406049251556396, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2632, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.71, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.379, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.14913601491360148, | |
| "grad_norm": 0.3610692620277405, | |
| "learning_rate": 2.981651376146789e-06, | |
| "loss": 0.1079, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.15487201548720156, | |
| "grad_norm": 6.997246265411377, | |
| "learning_rate": 3.0963302752293576e-06, | |
| "loss": 0.0748, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.1606080160608016, | |
| "grad_norm": 2.0748515129089355, | |
| "learning_rate": 3.211009174311927e-06, | |
| "loss": 0.0749, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.16634401663440165, | |
| "grad_norm": 3.6645171642303467, | |
| "learning_rate": 3.325688073394496e-06, | |
| "loss": 0.097, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.17208001720800173, | |
| "grad_norm": 8.085243225097656, | |
| "learning_rate": 3.4403669724770644e-06, | |
| "loss": 0.0722, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.17208001720800173, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.11341650038957596, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2959, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.663, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.355, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.17781601778160178, | |
| "grad_norm": 2.7256767749786377, | |
| "learning_rate": 3.5550458715596333e-06, | |
| "loss": 0.0707, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.18355201835520182, | |
| "grad_norm": 2.6183135509490967, | |
| "learning_rate": 3.6697247706422022e-06, | |
| "loss": 0.0418, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.1892880189288019, | |
| "grad_norm": 1.0414828062057495, | |
| "learning_rate": 3.784403669724771e-06, | |
| "loss": 0.041, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.19502401950240195, | |
| "grad_norm": 1.9375965595245361, | |
| "learning_rate": 3.89908256880734e-06, | |
| "loss": 0.0742, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.200760020076002, | |
| "grad_norm": 1.6328781843185425, | |
| "learning_rate": 4.013761467889909e-06, | |
| "loss": 0.0426, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.200760020076002, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.11491454392671585, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2047, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.795, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.421, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.20649602064960207, | |
| "grad_norm": 0.45767122507095337, | |
| "learning_rate": 4.128440366972478e-06, | |
| "loss": 0.0771, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.21223202122320212, | |
| "grad_norm": 2.768129825592041, | |
| "learning_rate": 4.2431192660550464e-06, | |
| "loss": 0.0869, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.2179680217968022, | |
| "grad_norm": 1.4702483415603638, | |
| "learning_rate": 4.357798165137615e-06, | |
| "loss": 0.0536, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.22370402237040224, | |
| "grad_norm": 1.277462124824524, | |
| "learning_rate": 4.4724770642201834e-06, | |
| "loss": 0.0329, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.2294400229440023, | |
| "grad_norm": 1.8748779296875, | |
| "learning_rate": 4.587155963302753e-06, | |
| "loss": 0.045, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.2294400229440023, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.12494589388370514, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2877, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.675, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.361, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.23517602351760236, | |
| "grad_norm": 2.339601516723633, | |
| "learning_rate": 4.701834862385321e-06, | |
| "loss": 0.0546, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.2409120240912024, | |
| "grad_norm": 0.9202137589454651, | |
| "learning_rate": 4.816513761467891e-06, | |
| "loss": 0.1038, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.24664802466480246, | |
| "grad_norm": 2.0663821697235107, | |
| "learning_rate": 4.931192660550459e-06, | |
| "loss": 0.0418, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.25238402523840253, | |
| "grad_norm": 2.6615207195281982, | |
| "learning_rate": 5.045871559633028e-06, | |
| "loss": 0.0414, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.2581200258120026, | |
| "grad_norm": 2.298173427581787, | |
| "learning_rate": 5.160550458715596e-06, | |
| "loss": 0.063, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.2581200258120026, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.1194896399974823, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.3177, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.632, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.339, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.2638560263856026, | |
| "grad_norm": 0.9054094552993774, | |
| "learning_rate": 5.275229357798165e-06, | |
| "loss": 0.0572, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.2695920269592027, | |
| "grad_norm": 0.5312202572822571, | |
| "learning_rate": 5.389908256880735e-06, | |
| "loss": 0.0697, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.2753280275328028, | |
| "grad_norm": 1.5197083950042725, | |
| "learning_rate": 5.504587155963303e-06, | |
| "loss": 0.047, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.2810640281064028, | |
| "grad_norm": 1.341577410697937, | |
| "learning_rate": 5.619266055045872e-06, | |
| "loss": 0.0285, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.28680002868000287, | |
| "grad_norm": 1.6182245016098022, | |
| "learning_rate": 5.733944954128441e-06, | |
| "loss": 0.0402, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.28680002868000287, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.12436391413211823, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.3365, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.605, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.326, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.29253602925360295, | |
| "grad_norm": 2.623664140701294, | |
| "learning_rate": 5.84862385321101e-06, | |
| "loss": 0.0552, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.29827202982720297, | |
| "grad_norm": 1.9711512327194214, | |
| "learning_rate": 5.963302752293578e-06, | |
| "loss": 0.0534, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.30400803040080304, | |
| "grad_norm": 1.4279474020004272, | |
| "learning_rate": 6.077981651376147e-06, | |
| "loss": 0.0356, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.3097440309744031, | |
| "grad_norm": 1.5085383653640747, | |
| "learning_rate": 6.192660550458715e-06, | |
| "loss": 0.0389, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.31548003154800314, | |
| "grad_norm": 1.8550148010253906, | |
| "learning_rate": 6.307339449541285e-06, | |
| "loss": 0.0342, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.31548003154800314, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.16011035442352295, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2066, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.792, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.42, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.3212160321216032, | |
| "grad_norm": 0.8617078065872192, | |
| "learning_rate": 6.422018348623854e-06, | |
| "loss": 0.0351, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.3269520326952033, | |
| "grad_norm": 1.7970987558364868, | |
| "learning_rate": 6.536697247706422e-06, | |
| "loss": 0.0328, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.3326880332688033, | |
| "grad_norm": 1.319393515586853, | |
| "learning_rate": 6.651376146788992e-06, | |
| "loss": 0.0348, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.3384240338424034, | |
| "grad_norm": 3.622856855392456, | |
| "learning_rate": 6.76605504587156e-06, | |
| "loss": 0.035, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.34416003441600346, | |
| "grad_norm": 2.2089531421661377, | |
| "learning_rate": 6.880733944954129e-06, | |
| "loss": 0.0485, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.34416003441600346, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.12377385795116425, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2989, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.659, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.353, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.3498960349896035, | |
| "grad_norm": 1.3465012311935425, | |
| "learning_rate": 6.995412844036697e-06, | |
| "loss": 0.0448, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.35563203556320355, | |
| "grad_norm": 1.0101594924926758, | |
| "learning_rate": 7.110091743119267e-06, | |
| "loss": 0.0298, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.3613680361368036, | |
| "grad_norm": 3.003561019897461, | |
| "learning_rate": 7.224770642201836e-06, | |
| "loss": 0.052, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.36710403671040365, | |
| "grad_norm": 2.2370080947875977, | |
| "learning_rate": 7.3394495412844045e-06, | |
| "loss": 0.0241, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.3728400372840037, | |
| "grad_norm": 3.373074531555176, | |
| "learning_rate": 7.454128440366973e-06, | |
| "loss": 0.0446, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.3728400372840037, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.11963505297899246, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1731, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.841, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.444, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.3785760378576038, | |
| "grad_norm": 0.6863628029823303, | |
| "learning_rate": 7.568807339449542e-06, | |
| "loss": 0.0116, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.3843120384312038, | |
| "grad_norm": 2.9094369411468506, | |
| "learning_rate": 7.68348623853211e-06, | |
| "loss": 0.0657, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.3900480390048039, | |
| "grad_norm": 0.2047434151172638, | |
| "learning_rate": 7.79816513761468e-06, | |
| "loss": 0.0319, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.39578403957840397, | |
| "grad_norm": 1.523390531539917, | |
| "learning_rate": 7.912844036697249e-06, | |
| "loss": 0.0492, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.401520040152004, | |
| "grad_norm": 0.10014928132295609, | |
| "learning_rate": 8.027522935779817e-06, | |
| "loss": 0.0333, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.401520040152004, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.10763221979141235, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.342, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.597, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.322, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.40725604072560406, | |
| "grad_norm": 0.24312058091163635, | |
| "learning_rate": 8.142201834862386e-06, | |
| "loss": 0.0248, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.41299204129920414, | |
| "grad_norm": 0.08946532011032104, | |
| "learning_rate": 8.256880733944956e-06, | |
| "loss": 0.0133, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.4187280418728042, | |
| "grad_norm": 0.13466329872608185, | |
| "learning_rate": 8.371559633027524e-06, | |
| "loss": 0.0226, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.42446404244640423, | |
| "grad_norm": 0.4071923792362213, | |
| "learning_rate": 8.486238532110093e-06, | |
| "loss": 0.0353, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.4302000430200043, | |
| "grad_norm": 1.2394853830337524, | |
| "learning_rate": 8.600917431192661e-06, | |
| "loss": 0.0199, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.4302000430200043, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09811359643936157, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.3072, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.647, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.347, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.4359360435936044, | |
| "grad_norm": 2.9703874588012695, | |
| "learning_rate": 8.71559633027523e-06, | |
| "loss": 0.0259, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.4416720441672044, | |
| "grad_norm": 1.3356784582138062, | |
| "learning_rate": 8.830275229357798e-06, | |
| "loss": 0.0496, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.4474080447408045, | |
| "grad_norm": 0.7478968501091003, | |
| "learning_rate": 8.944954128440367e-06, | |
| "loss": 0.0383, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.45314404531440455, | |
| "grad_norm": 0.418811172246933, | |
| "learning_rate": 9.059633027522935e-06, | |
| "loss": 0.0173, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.4588800458880046, | |
| "grad_norm": 0.6638255715370178, | |
| "learning_rate": 9.174311926605506e-06, | |
| "loss": 0.0271, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.4588800458880046, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.11180277168750763, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2311, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.757, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.402, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.46461604646160465, | |
| "grad_norm": 0.21615058183670044, | |
| "learning_rate": 9.288990825688074e-06, | |
| "loss": 0.0175, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.4703520470352047, | |
| "grad_norm": 1.2093310356140137, | |
| "learning_rate": 9.403669724770643e-06, | |
| "loss": 0.0215, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.47608804760880474, | |
| "grad_norm": 0.2247300148010254, | |
| "learning_rate": 9.518348623853211e-06, | |
| "loss": 0.0486, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.4818240481824048, | |
| "grad_norm": 0.6988762617111206, | |
| "learning_rate": 9.633027522935781e-06, | |
| "loss": 0.0161, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.4875600487560049, | |
| "grad_norm": 1.2162249088287354, | |
| "learning_rate": 9.74770642201835e-06, | |
| "loss": 0.0277, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.4875600487560049, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08368796855211258, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2199, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.773, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.41, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.4932960493296049, | |
| "grad_norm": 1.0620274543762207, | |
| "learning_rate": 9.862385321100918e-06, | |
| "loss": 0.0566, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.499032049903205, | |
| "grad_norm": 0.23082388937473297, | |
| "learning_rate": 9.977064220183487e-06, | |
| "loss": 0.0247, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.5047680504768051, | |
| "grad_norm": 0.9556642770767212, | |
| "learning_rate": 9.999974328282418e-06, | |
| "loss": 0.0242, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.5105040510504051, | |
| "grad_norm": 0.6304249167442322, | |
| "learning_rate": 9.999870037381541e-06, | |
| "loss": 0.0268, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.5162400516240052, | |
| "grad_norm": 0.07784967124462128, | |
| "learning_rate": 9.99968552448706e-06, | |
| "loss": 0.033, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.5162400516240052, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.06569766253232956, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2473, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.733, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.39, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.5219760521976052, | |
| "grad_norm": 1.320080280303955, | |
| "learning_rate": 9.999420792559453e-06, | |
| "loss": 0.0623, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.5277120527712053, | |
| "grad_norm": 0.24600175023078918, | |
| "learning_rate": 9.999075845846292e-06, | |
| "loss": 0.022, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.5334480533448054, | |
| "grad_norm": 1.860329031944275, | |
| "learning_rate": 9.998650689882184e-06, | |
| "loss": 0.0186, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.5391840539184054, | |
| "grad_norm": 1.9470009803771973, | |
| "learning_rate": 9.99814533148868e-06, | |
| "loss": 0.023, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.5449200544920054, | |
| "grad_norm": 0.3151063919067383, | |
| "learning_rate": 9.997559778774162e-06, | |
| "loss": 0.0111, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.5449200544920054, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07853744179010391, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2659, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.706, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.377, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.5506560550656056, | |
| "grad_norm": 1.4308031797409058, | |
| "learning_rate": 9.996894041133715e-06, | |
| "loss": 0.0239, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.5563920556392056, | |
| "grad_norm": 0.32267820835113525, | |
| "learning_rate": 9.996148129248975e-06, | |
| "loss": 0.0365, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.5621280562128056, | |
| "grad_norm": 0.026730485260486603, | |
| "learning_rate": 9.995322055087963e-06, | |
| "loss": 0.0255, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.5678640567864057, | |
| "grad_norm": 4.146325588226318, | |
| "learning_rate": 9.994415831904886e-06, | |
| "loss": 0.06, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.5736000573600057, | |
| "grad_norm": 0.9087633490562439, | |
| "learning_rate": 9.993429474239928e-06, | |
| "loss": 0.0371, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.5736000573600057, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08875266462564468, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2636, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.71, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.378, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.5793360579336058, | |
| "grad_norm": 1.1552188396453857, | |
| "learning_rate": 9.992362997919016e-06, | |
| "loss": 0.0419, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.5850720585072059, | |
| "grad_norm": 0.8535066246986389, | |
| "learning_rate": 9.991216420053565e-06, | |
| "loss": 0.0162, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.5908080590808059, | |
| "grad_norm": 1.3267903327941895, | |
| "learning_rate": 9.989989759040208e-06, | |
| "loss": 0.0375, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.5965440596544059, | |
| "grad_norm": 2.629297971725464, | |
| "learning_rate": 9.988683034560494e-06, | |
| "loss": 0.0097, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.6022800602280061, | |
| "grad_norm": 2.267737627029419, | |
| "learning_rate": 9.987296267580575e-06, | |
| "loss": 0.0267, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.6022800602280061, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.10149678587913513, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2648, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.708, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.377, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.6080160608016061, | |
| "grad_norm": 0.04809356853365898, | |
| "learning_rate": 9.985829480350874e-06, | |
| "loss": 0.0104, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.6137520613752061, | |
| "grad_norm": 2.897101640701294, | |
| "learning_rate": 9.98428269640572e-06, | |
| "loss": 0.0313, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.6194880619488062, | |
| "grad_norm": 0.2844018340110779, | |
| "learning_rate": 9.982655940562978e-06, | |
| "loss": 0.0202, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.6252240625224063, | |
| "grad_norm": 1.7705107927322388, | |
| "learning_rate": 9.980949238923646e-06, | |
| "loss": 0.0151, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.6309600630960063, | |
| "grad_norm": 2.162916421890259, | |
| "learning_rate": 9.979162618871435e-06, | |
| "loss": 0.021, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.6309600630960063, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.087111696600914, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.3134, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.638, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.342, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.6366960636696064, | |
| "grad_norm": 3.3192200660705566, | |
| "learning_rate": 9.977296109072337e-06, | |
| "loss": 0.0182, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.6424320642432064, | |
| "grad_norm": 0.3482876718044281, | |
| "learning_rate": 9.975349739474156e-06, | |
| "loss": 0.0309, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.6481680648168064, | |
| "grad_norm": 2.682620048522949, | |
| "learning_rate": 9.973323541306032e-06, | |
| "loss": 0.0299, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.6539040653904066, | |
| "grad_norm": 2.5474138259887695, | |
| "learning_rate": 9.971217547077947e-06, | |
| "loss": 0.0337, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.6596400659640066, | |
| "grad_norm": 0.5940343141555786, | |
| "learning_rate": 9.969031790580185e-06, | |
| "loss": 0.0294, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.6596400659640066, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07855822145938873, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1855, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.823, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.435, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.6653760665376066, | |
| "grad_norm": 1.4965182542800903, | |
| "learning_rate": 9.966766306882811e-06, | |
| "loss": 0.0257, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.6711120671112067, | |
| "grad_norm": 0.10210319608449936, | |
| "learning_rate": 9.964421132335091e-06, | |
| "loss": 0.0214, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.6768480676848068, | |
| "grad_norm": 1.594895362854004, | |
| "learning_rate": 9.961996304564916e-06, | |
| "loss": 0.0182, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.6825840682584068, | |
| "grad_norm": 1.8244154453277588, | |
| "learning_rate": 9.959491862478206e-06, | |
| "loss": 0.0195, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.6883200688320069, | |
| "grad_norm": 3.9240288734436035, | |
| "learning_rate": 9.956907846258268e-06, | |
| "loss": 0.0426, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.6883200688320069, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08573708683252335, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1456, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.881, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.464, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.6940560694056069, | |
| "grad_norm": 0.06307424604892731, | |
| "learning_rate": 9.954244297365169e-06, | |
| "loss": 0.0592, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.699792069979207, | |
| "grad_norm": 0.44088277220726013, | |
| "learning_rate": 9.951501258535061e-06, | |
| "loss": 0.0267, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.7055280705528071, | |
| "grad_norm": 2.217980146408081, | |
| "learning_rate": 9.948678773779495e-06, | |
| "loss": 0.0171, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.7112640711264071, | |
| "grad_norm": 1.3680331707000732, | |
| "learning_rate": 9.94577688838472e-06, | |
| "loss": 0.0369, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.7170000717000071, | |
| "grad_norm": 0.043984800577163696, | |
| "learning_rate": 9.94279564891095e-06, | |
| "loss": 0.0173, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.7170000717000071, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.06791304796934128, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1924, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.813, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.43, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.7227360722736073, | |
| "grad_norm": 0.05398892983794212, | |
| "learning_rate": 9.939735103191625e-06, | |
| "loss": 0.0151, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.7284720728472073, | |
| "grad_norm": 0.26481369137763977, | |
| "learning_rate": 9.936595300332638e-06, | |
| "loss": 0.0242, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.7342080734208073, | |
| "grad_norm": 0.0726240873336792, | |
| "learning_rate": 9.933376290711546e-06, | |
| "loss": 0.0128, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.7399440739944074, | |
| "grad_norm": 0.21732382476329803, | |
| "learning_rate": 9.930078125976767e-06, | |
| "loss": 0.0268, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.7456800745680074, | |
| "grad_norm": 1.968591332435608, | |
| "learning_rate": 9.926700859046745e-06, | |
| "loss": 0.0264, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.7456800745680074, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08293525129556656, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2805, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.685, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.366, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.7514160751416075, | |
| "grad_norm": 0.12683559954166412, | |
| "learning_rate": 9.923244544109107e-06, | |
| "loss": 0.0107, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.7571520757152076, | |
| "grad_norm": 0.07375319302082062, | |
| "learning_rate": 9.919709236619786e-06, | |
| "loss": 0.0237, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.7628880762888076, | |
| "grad_norm": 0.09353283792734146, | |
| "learning_rate": 9.916094993302144e-06, | |
| "loss": 0.0247, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.7686240768624076, | |
| "grad_norm": 2.4819159507751465, | |
| "learning_rate": 9.912401872146047e-06, | |
| "loss": 0.0361, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.7743600774360078, | |
| "grad_norm": 1.608819842338562, | |
| "learning_rate": 9.908629932406943e-06, | |
| "loss": 0.0123, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.7743600774360078, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.11693766713142395, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2248, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.766, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.407, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.7800960780096078, | |
| "grad_norm": 0.044521648436784744, | |
| "learning_rate": 9.90477923460491e-06, | |
| "loss": 0.0559, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.7858320785832078, | |
| "grad_norm": 1.6780263185501099, | |
| "learning_rate": 9.900849840523686e-06, | |
| "loss": 0.016, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.7915680791568079, | |
| "grad_norm": 0.20249976217746735, | |
| "learning_rate": 9.896841813209675e-06, | |
| "loss": 0.0103, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.797304079730408, | |
| "grad_norm": 0.5444568991661072, | |
| "learning_rate": 9.892755216970937e-06, | |
| "loss": 0.0265, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.803040080304008, | |
| "grad_norm": 0.39745691418647766, | |
| "learning_rate": 9.888590117376154e-06, | |
| "loss": 0.0305, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.803040080304008, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07220673561096191, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2419, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.741, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.394, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.8087760808776081, | |
| "grad_norm": 3.1255667209625244, | |
| "learning_rate": 9.884346581253584e-06, | |
| "loss": 0.0155, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.8145120814512081, | |
| "grad_norm": 0.28445881605148315, | |
| "learning_rate": 9.88002467668998e-06, | |
| "loss": 0.0543, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.8202480820248083, | |
| "grad_norm": 2.316286087036133, | |
| "learning_rate": 9.875624473029508e-06, | |
| "loss": 0.0208, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.8259840825984083, | |
| "grad_norm": 0.3444057106971741, | |
| "learning_rate": 9.871146040872621e-06, | |
| "loss": 0.0375, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.8317200831720083, | |
| "grad_norm": 0.36751267313957214, | |
| "learning_rate": 9.86658945207494e-06, | |
| "loss": 0.0273, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.8317200831720083, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.06544554233551025, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1664, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.851, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.449, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.8374560837456084, | |
| "grad_norm": 0.6214566230773926, | |
| "learning_rate": 9.861954779746092e-06, | |
| "loss": 0.0095, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.8431920843192084, | |
| "grad_norm": 0.6569352149963379, | |
| "learning_rate": 9.857242098248543e-06, | |
| "loss": 0.024, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.8489280848928085, | |
| "grad_norm": 1.6928707361221313, | |
| "learning_rate": 9.852451483196394e-06, | |
| "loss": 0.0158, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.8546640854664086, | |
| "grad_norm": 1.0950285196304321, | |
| "learning_rate": 9.847583011454187e-06, | |
| "loss": 0.0099, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.8604000860400086, | |
| "grad_norm": 1.4251108169555664, | |
| "learning_rate": 9.842636761135651e-06, | |
| "loss": 0.0677, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.8604000860400086, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.11534915119409561, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2226, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.769, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.408, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.8661360866136086, | |
| "grad_norm": 0.9497309923171997, | |
| "learning_rate": 9.837612811602462e-06, | |
| "loss": 0.0282, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.8718720871872088, | |
| "grad_norm": 0.9512405395507812, | |
| "learning_rate": 9.832511243462962e-06, | |
| "loss": 0.0079, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.8776080877608088, | |
| "grad_norm": 0.04131248593330383, | |
| "learning_rate": 9.827332138570878e-06, | |
| "loss": 0.0051, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.8833440883344088, | |
| "grad_norm": 1.3209096193313599, | |
| "learning_rate": 9.822075580023987e-06, | |
| "loss": 0.0132, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.8890800889080089, | |
| "grad_norm": 0.05282355472445488, | |
| "learning_rate": 9.816741652162807e-06, | |
| "loss": 0.0592, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.8890800889080089, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08604957163333893, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.3059, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.649, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.348, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.894816089481609, | |
| "grad_norm": 0.5249603390693665, | |
| "learning_rate": 9.811330440569226e-06, | |
| "loss": 0.0169, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.900552090055209, | |
| "grad_norm": 1.4722495079040527, | |
| "learning_rate": 9.805842032065135e-06, | |
| "loss": 0.0257, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.9062880906288091, | |
| "grad_norm": 1.664174199104309, | |
| "learning_rate": 9.800276514711044e-06, | |
| "loss": 0.0318, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.9120240912024091, | |
| "grad_norm": 0.017392676323652267, | |
| "learning_rate": 9.794633977804646e-06, | |
| "loss": 0.0165, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.9177600917760091, | |
| "grad_norm": 0.18038752675056458, | |
| "learning_rate": 9.788914511879412e-06, | |
| "loss": 0.0488, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.9177600917760091, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08364205807447433, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2219, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.77, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.409, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.9234960923496093, | |
| "grad_norm": 1.785083293914795, | |
| "learning_rate": 9.783118208703115e-06, | |
| "loss": 0.0212, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.9292320929232093, | |
| "grad_norm": 0.36738669872283936, | |
| "learning_rate": 9.777245161276372e-06, | |
| "loss": 0.0221, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.9349680934968093, | |
| "grad_norm": 0.9083078503608704, | |
| "learning_rate": 9.77129546383115e-06, | |
| "loss": 0.0261, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.9407040940704094, | |
| "grad_norm": 0.5853396058082581, | |
| "learning_rate": 9.765269211829245e-06, | |
| "loss": 0.011, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.9464400946440095, | |
| "grad_norm": 0.027249574661254883, | |
| "learning_rate": 9.759166501960762e-06, | |
| "loss": 0.0156, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.9464400946440095, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07951950281858444, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2513, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.727, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.387, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.9521760952176095, | |
| "grad_norm": 0.5655274987220764, | |
| "learning_rate": 9.752987432142556e-06, | |
| "loss": 0.0141, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.9579120957912096, | |
| "grad_norm": 0.20478393137454987, | |
| "learning_rate": 9.746732101516665e-06, | |
| "loss": 0.0225, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.9636480963648096, | |
| "grad_norm": 0.38293856382369995, | |
| "learning_rate": 9.740400610448714e-06, | |
| "loss": 0.0228, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.9693840969384097, | |
| "grad_norm": 0.4184674918651581, | |
| "learning_rate": 9.733993060526313e-06, | |
| "loss": 0.0278, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.9751200975120098, | |
| "grad_norm": 0.6761838793754578, | |
| "learning_rate": 9.727509554557416e-06, | |
| "loss": 0.0148, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.9751200975120098, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.0835602656006813, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2977, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.661, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.354, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.9808560980856098, | |
| "grad_norm": 0.10887222737073898, | |
| "learning_rate": 9.720950196568689e-06, | |
| "loss": 0.0065, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.9865920986592098, | |
| "grad_norm": 0.13018013536930084, | |
| "learning_rate": 9.714315091803815e-06, | |
| "loss": 0.0083, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.99232809923281, | |
| "grad_norm": 0.1999017894268036, | |
| "learning_rate": 9.707604346721833e-06, | |
| "loss": 0.0097, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.99806409980641, | |
| "grad_norm": 0.06055552884936333, | |
| "learning_rate": 9.700818068995407e-06, | |
| "loss": 0.0255, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.00344160034416, | |
| "grad_norm": 2.305107831954956, | |
| "learning_rate": 9.693956367509117e-06, | |
| "loss": 0.021, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.00344160034416, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.11077206581830978, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.3027, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.653, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.35, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.00917760091776, | |
| "grad_norm": 2.2754151821136475, | |
| "learning_rate": 9.687019352357699e-06, | |
| "loss": 0.0398, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.0149136014913602, | |
| "grad_norm": 0.03042173944413662, | |
| "learning_rate": 9.680007134844279e-06, | |
| "loss": 0.0189, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 1.0206496020649602, | |
| "grad_norm": 1.6503928899765015, | |
| "learning_rate": 9.672919827478598e-06, | |
| "loss": 0.0111, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.0263856026385603, | |
| "grad_norm": 1.7391407489776611, | |
| "learning_rate": 9.665757543975196e-06, | |
| "loss": 0.0204, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 1.0321216032121603, | |
| "grad_norm": 1.6538548469543457, | |
| "learning_rate": 9.658520399251592e-06, | |
| "loss": 0.0165, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.0321216032121603, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07668553292751312, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1222, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.915, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.481, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.0378576037857603, | |
| "grad_norm": 0.022262316197156906, | |
| "learning_rate": 9.651208509426442e-06, | |
| "loss": 0.0053, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 1.0435936043593603, | |
| "grad_norm": 1.768957495689392, | |
| "learning_rate": 9.64382199181767e-06, | |
| "loss": 0.0164, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.0493296049329606, | |
| "grad_norm": 0.11224523931741714, | |
| "learning_rate": 9.63636096494059e-06, | |
| "loss": 0.0216, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.0550656055065606, | |
| "grad_norm": 0.022123126313090324, | |
| "learning_rate": 9.628825548506002e-06, | |
| "loss": 0.0069, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.0608016060801606, | |
| "grad_norm": 4.303464412689209, | |
| "learning_rate": 9.621215863418276e-06, | |
| "loss": 0.0253, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.0608016060801606, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.0671575739979744, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2804, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.686, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.366, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.0665376066537606, | |
| "grad_norm": 0.08216200768947601, | |
| "learning_rate": 9.61353203177341e-06, | |
| "loss": 0.0052, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.0722736072273606, | |
| "grad_norm": 1.3728184700012207, | |
| "learning_rate": 9.605774176857062e-06, | |
| "loss": 0.0176, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 1.078009607800961, | |
| "grad_norm": 0.0963163748383522, | |
| "learning_rate": 9.597942423142586e-06, | |
| "loss": 0.0165, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.083745608374561, | |
| "grad_norm": 0.09453441202640533, | |
| "learning_rate": 9.59003689628903e-06, | |
| "loss": 0.0212, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.089481608948161, | |
| "grad_norm": 0.0946192815899849, | |
| "learning_rate": 9.582057723139115e-06, | |
| "loss": 0.0258, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.089481608948161, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07539978623390198, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2173, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.777, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.412, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.095217609521761, | |
| "grad_norm": 0.011775615625083447, | |
| "learning_rate": 9.574005031717203e-06, | |
| "loss": 0.0093, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 1.100953610095361, | |
| "grad_norm": 0.06459964066743851, | |
| "learning_rate": 9.565878951227247e-06, | |
| "loss": 0.0082, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 1.106689610668961, | |
| "grad_norm": 0.020801063627004623, | |
| "learning_rate": 9.557679612050708e-06, | |
| "loss": 0.0067, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 1.112425611242561, | |
| "grad_norm": 0.048270970582962036, | |
| "learning_rate": 9.549407145744473e-06, | |
| "loss": 0.0442, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 1.1181616118161612, | |
| "grad_norm": 1.6280121803283691, | |
| "learning_rate": 9.541061685038742e-06, | |
| "loss": 0.0258, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.1181616118161612, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.0749003142118454, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1385, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.891, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.469, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.1238976123897613, | |
| "grad_norm": 0.4389003813266754, | |
| "learning_rate": 9.532643363834891e-06, | |
| "loss": 0.0165, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 1.1296336129633613, | |
| "grad_norm": 0.07638214528560638, | |
| "learning_rate": 9.524152317203337e-06, | |
| "loss": 0.0339, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 1.1353696135369613, | |
| "grad_norm": 0.26499396562576294, | |
| "learning_rate": 9.515588681381356e-06, | |
| "loss": 0.0229, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 1.1411056141105613, | |
| "grad_norm": 0.7412366271018982, | |
| "learning_rate": 9.506952593770908e-06, | |
| "loss": 0.0157, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 1.1468416146841616, | |
| "grad_norm": 0.16746670007705688, | |
| "learning_rate": 9.498244192936428e-06, | |
| "loss": 0.0102, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.1468416146841616, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09212490171194077, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1271, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.908, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.478, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.1525776152577616, | |
| "grad_norm": 0.182969868183136, | |
| "learning_rate": 9.489463618602602e-06, | |
| "loss": 0.0111, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 1.1583136158313616, | |
| "grad_norm": 1.3869882822036743, | |
| "learning_rate": 9.480611011652128e-06, | |
| "loss": 0.0146, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 1.1640496164049616, | |
| "grad_norm": 0.0466344878077507, | |
| "learning_rate": 9.471686514123455e-06, | |
| "loss": 0.009, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 1.1697856169785616, | |
| "grad_norm": 0.07296488434076309, | |
| "learning_rate": 9.462690269208498e-06, | |
| "loss": 0.0354, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 1.1755216175521617, | |
| "grad_norm": 1.9699822664260864, | |
| "learning_rate": 9.453622421250353e-06, | |
| "loss": 0.0114, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.1755216175521617, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09612752497196198, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1568, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.865, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.456, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.1812576181257617, | |
| "grad_norm": 2.4988596439361572, | |
| "learning_rate": 9.444483115740968e-06, | |
| "loss": 0.0235, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 1.186993618699362, | |
| "grad_norm": 0.0865960419178009, | |
| "learning_rate": 9.435272499318815e-06, | |
| "loss": 0.0197, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 1.192729619272962, | |
| "grad_norm": 0.7196880578994751, | |
| "learning_rate": 9.425990719766542e-06, | |
| "loss": 0.0085, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 1.198465619846562, | |
| "grad_norm": 0.4979308843612671, | |
| "learning_rate": 9.416637926008587e-06, | |
| "loss": 0.0177, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 1.204201620420162, | |
| "grad_norm": 1.3404853343963623, | |
| "learning_rate": 9.407214268108805e-06, | |
| "loss": 0.0079, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.204201620420162, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07606372237205505, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2379, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.747, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.397, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.209937620993762, | |
| "grad_norm": 0.007559036836028099, | |
| "learning_rate": 9.397719897268049e-06, | |
| "loss": 0.0122, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 1.2156736215673623, | |
| "grad_norm": 0.036009229719638824, | |
| "learning_rate": 9.388154965821754e-06, | |
| "loss": 0.0195, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 1.2214096221409623, | |
| "grad_norm": 0.23815706372261047, | |
| "learning_rate": 9.37851962723748e-06, | |
| "loss": 0.0311, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 1.2271456227145623, | |
| "grad_norm": 0.32620975375175476, | |
| "learning_rate": 9.36881403611246e-06, | |
| "loss": 0.0235, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 1.2328816232881623, | |
| "grad_norm": 0.6975133419036865, | |
| "learning_rate": 9.359038348171113e-06, | |
| "loss": 0.0398, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.2328816232881623, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07932226359844208, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1766, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.836, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.442, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.2386176238617623, | |
| "grad_norm": 0.055091869086027145, | |
| "learning_rate": 9.349192720262556e-06, | |
| "loss": 0.031, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 1.2443536244353623, | |
| "grad_norm": 0.07653547823429108, | |
| "learning_rate": 9.33927731035807e-06, | |
| "loss": 0.0188, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 1.2500896250089624, | |
| "grad_norm": 1.0113948583602905, | |
| "learning_rate": 9.329292277548584e-06, | |
| "loss": 0.023, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 1.2558256255825626, | |
| "grad_norm": 0.38709500432014465, | |
| "learning_rate": 9.319237782042108e-06, | |
| "loss": 0.0134, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 1.2615616261561626, | |
| "grad_norm": 1.0346875190734863, | |
| "learning_rate": 9.309113985161169e-06, | |
| "loss": 0.0272, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.2615616261561626, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.06406532227993011, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1643, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.854, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.451, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.2672976267297626, | |
| "grad_norm": 3.084003448486328, | |
| "learning_rate": 9.298921049340226e-06, | |
| "loss": 0.0134, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 1.2730336273033627, | |
| "grad_norm": 0.3186517059803009, | |
| "learning_rate": 9.288659138123053e-06, | |
| "loss": 0.0142, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 1.278769627876963, | |
| "grad_norm": 0.02311491221189499, | |
| "learning_rate": 9.278328416160125e-06, | |
| "loss": 0.0116, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 1.284505628450563, | |
| "grad_norm": 0.021789675578475, | |
| "learning_rate": 9.267929049205976e-06, | |
| "loss": 0.003, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 1.290241629024163, | |
| "grad_norm": 0.004828541073948145, | |
| "learning_rate": 9.25746120411653e-06, | |
| "loss": 0.0074, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.290241629024163, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09356704354286194, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1187, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.92, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.484, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.295977629597763, | |
| "grad_norm": 4.326236248016357, | |
| "learning_rate": 9.246925048846433e-06, | |
| "loss": 0.0318, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 1.301713630171363, | |
| "grad_norm": 0.10032517462968826, | |
| "learning_rate": 9.236320752446357e-06, | |
| "loss": 0.0248, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 1.307449630744963, | |
| "grad_norm": 1.968422293663025, | |
| "learning_rate": 9.225648485060283e-06, | |
| "loss": 0.0245, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 1.313185631318563, | |
| "grad_norm": 0.9600183367729187, | |
| "learning_rate": 9.214908417922774e-06, | |
| "loss": 0.0169, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 1.3189216318921633, | |
| "grad_norm": 0.41536957025527954, | |
| "learning_rate": 9.20410072335623e-06, | |
| "loss": 0.0103, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.3189216318921633, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.06794232130050659, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1585, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.862, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.455, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.3246576324657633, | |
| "grad_norm": 0.7403652667999268, | |
| "learning_rate": 9.19322557476812e-06, | |
| "loss": 0.0141, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 1.3303936330393633, | |
| "grad_norm": 0.03548659384250641, | |
| "learning_rate": 9.182283146648197e-06, | |
| "loss": 0.0214, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 1.3361296336129633, | |
| "grad_norm": 2.034658193588257, | |
| "learning_rate": 9.171273614565704e-06, | |
| "loss": 0.008, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 1.3418656341865633, | |
| "grad_norm": 1.0196254253387451, | |
| "learning_rate": 9.160197155166559e-06, | |
| "loss": 0.0459, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 1.3476016347601636, | |
| "grad_norm": 0.014422023668885231, | |
| "learning_rate": 9.149053946170507e-06, | |
| "loss": 0.0198, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 1.3476016347601636, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.058592408895492554, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2196, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.773, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.41, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 1.3533376353337636, | |
| "grad_norm": 0.019738655537366867, | |
| "learning_rate": 9.137844166368289e-06, | |
| "loss": 0.0233, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 1.3590736359073636, | |
| "grad_norm": 0.03966844826936722, | |
| "learning_rate": 9.126567995618752e-06, | |
| "loss": 0.0082, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 1.3648096364809637, | |
| "grad_norm": 0.10851484537124634, | |
| "learning_rate": 9.115225614845979e-06, | |
| "loss": 0.0132, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 1.3705456370545637, | |
| "grad_norm": 2.611233711242676, | |
| "learning_rate": 9.103817206036383e-06, | |
| "loss": 0.0062, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 1.3762816376281637, | |
| "grad_norm": 2.288485050201416, | |
| "learning_rate": 9.092342952235777e-06, | |
| "loss": 0.0476, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.3762816376281637, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07158029079437256, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1533, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.87, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.459, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.3820176382017637, | |
| "grad_norm": 0.04291702061891556, | |
| "learning_rate": 9.080803037546454e-06, | |
| "loss": 0.0031, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 1.387753638775364, | |
| "grad_norm": 0.008397878147661686, | |
| "learning_rate": 9.069197647124216e-06, | |
| "loss": 0.0121, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 1.393489639348964, | |
| "grad_norm": 0.015200245194137096, | |
| "learning_rate": 9.057526967175415e-06, | |
| "loss": 0.0047, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 1.399225639922564, | |
| "grad_norm": 3.2773385047912598, | |
| "learning_rate": 9.04579118495396e-06, | |
| "loss": 0.005, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 1.404961640496164, | |
| "grad_norm": 0.5176500678062439, | |
| "learning_rate": 9.033990488758317e-06, | |
| "loss": 0.0329, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 1.404961640496164, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.1131686270236969, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1939, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.811, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.429, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 1.410697641069764, | |
| "grad_norm": 0.002755386522039771, | |
| "learning_rate": 9.02212506792848e-06, | |
| "loss": 0.0164, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 1.4164336416433643, | |
| "grad_norm": 0.0466199554502964, | |
| "learning_rate": 9.01019511284294e-06, | |
| "loss": 0.0233, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 1.4221696422169643, | |
| "grad_norm": 0.8431953191757202, | |
| "learning_rate": 8.99820081491563e-06, | |
| "loss": 0.0051, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 1.4279056427905643, | |
| "grad_norm": 0.035143181681632996, | |
| "learning_rate": 8.98614236659285e-06, | |
| "loss": 0.0283, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 1.4336416433641643, | |
| "grad_norm": 0.05582081899046898, | |
| "learning_rate": 8.97401996135018e-06, | |
| "loss": 0.0073, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.4336416433641643, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09689504653215408, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.294, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.666, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.356, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.4393776439377644, | |
| "grad_norm": 2.2861459255218506, | |
| "learning_rate": 8.961833793689384e-06, | |
| "loss": 0.0107, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 1.4451136445113644, | |
| "grad_norm": 2.3350398540496826, | |
| "learning_rate": 8.94958405913527e-06, | |
| "loss": 0.0128, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 1.4508496450849644, | |
| "grad_norm": 0.019439250230789185, | |
| "learning_rate": 8.937270954232576e-06, | |
| "loss": 0.0061, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 1.4565856456585646, | |
| "grad_norm": 0.002678696997463703, | |
| "learning_rate": 8.924894676542801e-06, | |
| "loss": 0.0085, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 1.4623216462321647, | |
| "grad_norm": 0.03475391864776611, | |
| "learning_rate": 8.91245542464104e-06, | |
| "loss": 0.0234, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 1.4623216462321647, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.10303693264722824, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1871, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.821, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.434, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 1.4680576468057647, | |
| "grad_norm": 0.09974029660224915, | |
| "learning_rate": 8.8999533981128e-06, | |
| "loss": 0.013, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 1.4737936473793647, | |
| "grad_norm": 0.19927239418029785, | |
| "learning_rate": 8.887388797550791e-06, | |
| "loss": 0.0033, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 1.479529647952965, | |
| "grad_norm": 0.03026706352829933, | |
| "learning_rate": 8.874761824551717e-06, | |
| "loss": 0.0219, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 1.485265648526565, | |
| "grad_norm": 0.8363674879074097, | |
| "learning_rate": 8.862072681713027e-06, | |
| "loss": 0.0167, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 1.491001649100165, | |
| "grad_norm": 2.117938280105591, | |
| "learning_rate": 8.84932157262969e-06, | |
| "loss": 0.0365, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.491001649100165, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.11520245671272278, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2361, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.75, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.398, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.496737649673765, | |
| "grad_norm": 0.14108580350875854, | |
| "learning_rate": 8.836508701890892e-06, | |
| "loss": 0.0196, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 1.502473650247365, | |
| "grad_norm": 2.1537435054779053, | |
| "learning_rate": 8.823634275076792e-06, | |
| "loss": 0.0207, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 1.508209650820965, | |
| "grad_norm": 0.93586665391922, | |
| "learning_rate": 8.81069849875519e-06, | |
| "loss": 0.0125, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 1.513945651394565, | |
| "grad_norm": 3.1711156368255615, | |
| "learning_rate": 8.797701580478235e-06, | |
| "loss": 0.0094, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 1.519681651968165, | |
| "grad_norm": 0.0233930516988039, | |
| "learning_rate": 8.784643728779089e-06, | |
| "loss": 0.0152, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 1.519681651968165, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07520527392625809, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2328, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.754, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.401, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 1.525417652541765, | |
| "grad_norm": 0.14535315334796906, | |
| "learning_rate": 8.771525153168576e-06, | |
| "loss": 0.0035, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 1.5311536531153653, | |
| "grad_norm": 0.3828493654727936, | |
| "learning_rate": 8.758346064131824e-06, | |
| "loss": 0.0026, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 1.5368896536889654, | |
| "grad_norm": 2.7801029682159424, | |
| "learning_rate": 8.745106673124888e-06, | |
| "loss": 0.0215, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 1.5426256542625654, | |
| "grad_norm": 1.6995360851287842, | |
| "learning_rate": 8.731807192571359e-06, | |
| "loss": 0.022, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 1.5483616548361656, | |
| "grad_norm": 0.24279280006885529, | |
| "learning_rate": 8.718447835858951e-06, | |
| "loss": 0.0109, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.5483616548361656, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08999116718769073, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1893, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.817, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.432, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.5540976554097656, | |
| "grad_norm": 0.015813475474715233, | |
| "learning_rate": 8.705028817336083e-06, | |
| "loss": 0.0019, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 1.5598336559833657, | |
| "grad_norm": 0.017328623682260513, | |
| "learning_rate": 8.691550352308431e-06, | |
| "loss": 0.0044, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 1.5655696565569657, | |
| "grad_norm": 1.36957585811615, | |
| "learning_rate": 8.678012657035487e-06, | |
| "loss": 0.0033, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 1.5713056571305657, | |
| "grad_norm": 0.8026090264320374, | |
| "learning_rate": 8.664415948727076e-06, | |
| "loss": 0.0063, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 1.5770416577041657, | |
| "grad_norm": 0.038070451468229294, | |
| "learning_rate": 8.65076044553988e-06, | |
| "loss": 0.02, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 1.5770416577041657, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09743982553482056, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1673, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.849, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.448, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 1.5827776582777657, | |
| "grad_norm": 2.638280153274536, | |
| "learning_rate": 8.63704636657393e-06, | |
| "loss": 0.0205, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 1.5885136588513658, | |
| "grad_norm": 0.08616320043802261, | |
| "learning_rate": 8.623273931869094e-06, | |
| "loss": 0.0151, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 1.594249659424966, | |
| "grad_norm": 0.0016790858935564756, | |
| "learning_rate": 8.609443362401553e-06, | |
| "loss": 0.0077, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 1.599985659998566, | |
| "grad_norm": 0.0031273786444216967, | |
| "learning_rate": 8.595554880080246e-06, | |
| "loss": 0.0014, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 1.605721660572166, | |
| "grad_norm": 2.3747873306274414, | |
| "learning_rate": 8.581608707743312e-06, | |
| "loss": 0.0387, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.605721660572166, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.10180775076150894, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1502, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.874, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.461, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.6114576611457663, | |
| "grad_norm": 0.03558593988418579, | |
| "learning_rate": 8.567605069154517e-06, | |
| "loss": 0.0009, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 1.6171936617193663, | |
| "grad_norm": 0.005191161762923002, | |
| "learning_rate": 8.55354418899966e-06, | |
| "loss": 0.0349, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 1.6229296622929663, | |
| "grad_norm": 1.5500961542129517, | |
| "learning_rate": 8.539426292882976e-06, | |
| "loss": 0.0296, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 1.6286656628665663, | |
| "grad_norm": 0.12692737579345703, | |
| "learning_rate": 8.525251607323506e-06, | |
| "loss": 0.0076, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 1.6344016634401664, | |
| "grad_norm": 0.034373391419649124, | |
| "learning_rate": 8.511020359751467e-06, | |
| "loss": 0.0125, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 1.6344016634401664, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07807676494121552, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1709, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.844, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.446, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 1.6401376640137664, | |
| "grad_norm": 0.3334125876426697, | |
| "learning_rate": 8.496732778504608e-06, | |
| "loss": 0.007, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 1.6458736645873664, | |
| "grad_norm": 1.8848798274993896, | |
| "learning_rate": 8.482389092824535e-06, | |
| "loss": 0.0279, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 1.6516096651609664, | |
| "grad_norm": 0.0071907006204128265, | |
| "learning_rate": 8.46798953285304e-06, | |
| "loss": 0.0058, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 1.6573456657345664, | |
| "grad_norm": 0.01580858789384365, | |
| "learning_rate": 8.453534329628414e-06, | |
| "loss": 0.0111, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 1.6630816663081667, | |
| "grad_norm": 0.5010847449302673, | |
| "learning_rate": 8.439023715081729e-06, | |
| "loss": 0.0205, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.6630816663081667, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.0824664756655693, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2284, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.761, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.404, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.6688176668817667, | |
| "grad_norm": 0.16337744891643524, | |
| "learning_rate": 8.42445792203312e-06, | |
| "loss": 0.0175, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 1.6745536674553667, | |
| "grad_norm": 0.054014191031455994, | |
| "learning_rate": 8.409837184188056e-06, | |
| "loss": 0.0073, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 1.680289668028967, | |
| "grad_norm": 3.6512198448181152, | |
| "learning_rate": 8.395161736133579e-06, | |
| "loss": 0.0226, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 1.686025668602567, | |
| "grad_norm": 0.6102538704872131, | |
| "learning_rate": 8.380431813334548e-06, | |
| "loss": 0.0033, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 1.691761669176167, | |
| "grad_norm": 2.3051533699035645, | |
| "learning_rate": 8.365647652129865e-06, | |
| "loss": 0.0327, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 1.691761669176167, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.0759689062833786, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.5562, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.293, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.17, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 1.697497669749767, | |
| "grad_norm": 0.008392853662371635, | |
| "learning_rate": 8.350809489728673e-06, | |
| "loss": 0.0279, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 1.703233670323367, | |
| "grad_norm": 1.614988088607788, | |
| "learning_rate": 8.33591756420655e-06, | |
| "loss": 0.0102, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 1.708969670896967, | |
| "grad_norm": 0.04466373473405838, | |
| "learning_rate": 8.320972114501698e-06, | |
| "loss": 0.0158, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 1.714705671470567, | |
| "grad_norm": 1.4472012519836426, | |
| "learning_rate": 8.305973380411107e-06, | |
| "loss": 0.0437, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 1.720441672044167, | |
| "grad_norm": 0.0326494500041008, | |
| "learning_rate": 8.290921602586699e-06, | |
| "loss": 0.0188, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.720441672044167, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.06277668476104736, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2361, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.75, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.398, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.7261776726177671, | |
| "grad_norm": 2.090261697769165, | |
| "learning_rate": 8.275817022531479e-06, | |
| "loss": 0.0229, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 1.7319136731913674, | |
| "grad_norm": 1.1351670026779175, | |
| "learning_rate": 8.260659882595647e-06, | |
| "loss": 0.005, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 1.7376496737649674, | |
| "grad_norm": 0.0521712526679039, | |
| "learning_rate": 8.245450425972728e-06, | |
| "loss": 0.0126, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 1.7433856743385674, | |
| "grad_norm": 0.04499056562781334, | |
| "learning_rate": 8.230188896695643e-06, | |
| "loss": 0.0133, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 1.7491216749121676, | |
| "grad_norm": 0.007846461609005928, | |
| "learning_rate": 8.214875539632825e-06, | |
| "loss": 0.0191, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 1.7491216749121676, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07422397285699844, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1752, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.838, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.443, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 1.7548576754857677, | |
| "grad_norm": 0.055605433881282806, | |
| "learning_rate": 8.199510600484261e-06, | |
| "loss": 0.0148, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 1.7605936760593677, | |
| "grad_norm": 0.27163052558898926, | |
| "learning_rate": 8.184094325777573e-06, | |
| "loss": 0.0075, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 1.7663296766329677, | |
| "grad_norm": 0.11399796605110168, | |
| "learning_rate": 8.168626962864045e-06, | |
| "loss": 0.0016, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 1.7720656772065677, | |
| "grad_norm": 0.07731972634792328, | |
| "learning_rate": 8.153108759914669e-06, | |
| "loss": 0.0141, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 1.7778016777801677, | |
| "grad_norm": 0.2923997938632965, | |
| "learning_rate": 8.13753996591615e-06, | |
| "loss": 0.0047, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.7778016777801677, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.06707713752985, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.4009, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.513, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.28, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.7835376783537678, | |
| "grad_norm": 0.26035454869270325, | |
| "learning_rate": 8.121920830666918e-06, | |
| "loss": 0.0092, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 1.7892736789273678, | |
| "grad_norm": 3.143533706665039, | |
| "learning_rate": 8.106251604773125e-06, | |
| "loss": 0.0141, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 1.7950096795009678, | |
| "grad_norm": 1.8246173858642578, | |
| "learning_rate": 8.090532539644608e-06, | |
| "loss": 0.0249, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 1.800745680074568, | |
| "grad_norm": 0.01822010800242424, | |
| "learning_rate": 8.074763887490878e-06, | |
| "loss": 0.0182, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 1.806481680648168, | |
| "grad_norm": 1.8796803951263428, | |
| "learning_rate": 8.058945901317047e-06, | |
| "loss": 0.0276, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 1.806481680648168, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.06196051836013794, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1882, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.819, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.433, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 1.812217681221768, | |
| "grad_norm": 0.3315775692462921, | |
| "learning_rate": 8.043078834919792e-06, | |
| "loss": 0.0053, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 1.8179536817953683, | |
| "grad_norm": 0.6810622215270996, | |
| "learning_rate": 8.027162942883271e-06, | |
| "loss": 0.0027, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 1.8236896823689683, | |
| "grad_norm": 0.04800894856452942, | |
| "learning_rate": 8.011198480575037e-06, | |
| "loss": 0.0104, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 1.8294256829425684, | |
| "grad_norm": 0.21183735132217407, | |
| "learning_rate": 7.995185704141948e-06, | |
| "loss": 0.0139, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 1.8351616835161684, | |
| "grad_norm": 0.05010043457150459, | |
| "learning_rate": 7.979124870506052e-06, | |
| "loss": 0.013, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.8351616835161684, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08539190888404846, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2295, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.759, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.403, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.8408976840897684, | |
| "grad_norm": 0.05234100669622421, | |
| "learning_rate": 7.963016237360465e-06, | |
| "loss": 0.0113, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 1.8466336846633684, | |
| "grad_norm": 0.7740063071250916, | |
| "learning_rate": 7.946860063165238e-06, | |
| "loss": 0.0115, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 1.8523696852369684, | |
| "grad_norm": 0.007312777452170849, | |
| "learning_rate": 7.93065660714321e-06, | |
| "loss": 0.0378, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 1.8581056858105685, | |
| "grad_norm": 0.014250587671995163, | |
| "learning_rate": 7.914406129275847e-06, | |
| "loss": 0.0188, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 1.8638416863841685, | |
| "grad_norm": 0.0380655862390995, | |
| "learning_rate": 7.898108890299072e-06, | |
| "loss": 0.013, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 1.8638416863841685, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07671073824167252, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.182, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.828, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.438, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 1.8695776869577687, | |
| "grad_norm": 0.034287262707948685, | |
| "learning_rate": 7.881765151699085e-06, | |
| "loss": 0.0237, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 1.8753136875313687, | |
| "grad_norm": 0.014141724444925785, | |
| "learning_rate": 7.865375175708158e-06, | |
| "loss": 0.0294, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 1.881049688104969, | |
| "grad_norm": 2.08565616607666, | |
| "learning_rate": 7.848939225300436e-06, | |
| "loss": 0.0229, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 1.886785688678569, | |
| "grad_norm": 1.476799726486206, | |
| "learning_rate": 7.832457564187715e-06, | |
| "loss": 0.0257, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 1.892521689252169, | |
| "grad_norm": 1.641891598701477, | |
| "learning_rate": 7.815930456815212e-06, | |
| "loss": 0.0171, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.892521689252169, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.06915121525526047, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.0913, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.961, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.504, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.898257689825769, | |
| "grad_norm": 0.03908121585845947, | |
| "learning_rate": 7.799358168357323e-06, | |
| "loss": 0.0208, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 1.903993690399369, | |
| "grad_norm": 0.27976372838020325, | |
| "learning_rate": 7.782740964713358e-06, | |
| "loss": 0.0079, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 1.909729690972969, | |
| "grad_norm": 0.1647738814353943, | |
| "learning_rate": 7.76607911250329e-06, | |
| "loss": 0.0284, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 1.915465691546569, | |
| "grad_norm": 0.009193528443574905, | |
| "learning_rate": 7.749372879063468e-06, | |
| "loss": 0.0219, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 1.9212016921201691, | |
| "grad_norm": 0.08929289132356644, | |
| "learning_rate": 7.73262253244233e-06, | |
| "loss": 0.0136, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 1.9212016921201691, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.06728994846343994, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1919, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.814, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.43, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 1.9269376926937691, | |
| "grad_norm": 1.6335256099700928, | |
| "learning_rate": 7.715828341396102e-06, | |
| "loss": 0.0322, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 1.9326736932673694, | |
| "grad_norm": 0.2756313979625702, | |
| "learning_rate": 7.698990575384483e-06, | |
| "loss": 0.0282, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 1.9384096938409694, | |
| "grad_norm": 0.5282019376754761, | |
| "learning_rate": 7.68210950456633e-06, | |
| "loss": 0.0059, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 1.9441456944145694, | |
| "grad_norm": 0.013134743086993694, | |
| "learning_rate": 7.66518539979531e-06, | |
| "loss": 0.0088, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 1.9498816949881697, | |
| "grad_norm": 0.020495153963565826, | |
| "learning_rate": 7.648218532615572e-06, | |
| "loss": 0.0275, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.9498816949881697, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.06392066925764084, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1842, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.825, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.436, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.9556176955617697, | |
| "grad_norm": 0.030520539730787277, | |
| "learning_rate": 7.631209175257368e-06, | |
| "loss": 0.0176, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 1.9613536961353697, | |
| "grad_norm": 0.356724351644516, | |
| "learning_rate": 7.614157600632706e-06, | |
| "loss": 0.0306, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 1.9670896967089697, | |
| "grad_norm": 0.5876423716545105, | |
| "learning_rate": 7.597064082330961e-06, | |
| "loss": 0.0178, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 1.9728256972825697, | |
| "grad_norm": 0.04390386864542961, | |
| "learning_rate": 7.579928894614479e-06, | |
| "loss": 0.0099, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 1.9785616978561698, | |
| "grad_norm": 0.026595573872327805, | |
| "learning_rate": 7.562752312414196e-06, | |
| "loss": 0.0097, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.9785616978561698, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08633749186992645, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1407, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.888, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.468, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.9842976984297698, | |
| "grad_norm": 0.4078025221824646, | |
| "learning_rate": 7.545534611325207e-06, | |
| "loss": 0.0189, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 1.9900336990033698, | |
| "grad_norm": 0.4737612307071686, | |
| "learning_rate": 7.528276067602352e-06, | |
| "loss": 0.0019, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 1.9957696995769698, | |
| "grad_norm": 0.0031162798404693604, | |
| "learning_rate": 7.510976958155789e-06, | |
| "loss": 0.0037, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 2.00114720011472, | |
| "grad_norm": 1.097615361213684, | |
| "learning_rate": 7.493637560546539e-06, | |
| "loss": 0.0127, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 2.00688320068832, | |
| "grad_norm": 0.005019227508455515, | |
| "learning_rate": 7.476258152982043e-06, | |
| "loss": 0.0019, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 2.00688320068832, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07848720997571945, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.177, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.835, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.441, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 2.01261920126192, | |
| "grad_norm": 0.016102029010653496, | |
| "learning_rate": 7.458839014311696e-06, | |
| "loss": 0.0057, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 2.01835520183552, | |
| "grad_norm": 0.009885331615805626, | |
| "learning_rate": 7.441380424022364e-06, | |
| "loss": 0.0089, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 2.0240912024091204, | |
| "grad_norm": 2.9838740825653076, | |
| "learning_rate": 7.423882662233915e-06, | |
| "loss": 0.0142, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 2.0298272029827205, | |
| "grad_norm": 0.29027923941612244, | |
| "learning_rate": 7.406346009694713e-06, | |
| "loss": 0.0033, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 2.0355632035563205, | |
| "grad_norm": 0.004921222571283579, | |
| "learning_rate": 7.388770747777116e-06, | |
| "loss": 0.0108, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 2.0355632035563205, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.0910039097070694, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.144, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.883, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.465, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 2.0412992041299205, | |
| "grad_norm": 0.012345495633780956, | |
| "learning_rate": 7.371157158472965e-06, | |
| "loss": 0.0186, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 2.0470352047035205, | |
| "grad_norm": 0.013729539699852467, | |
| "learning_rate": 7.353505524389052e-06, | |
| "loss": 0.0038, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 2.0527712052771205, | |
| "grad_norm": 3.2665841579437256, | |
| "learning_rate": 7.335816128742599e-06, | |
| "loss": 0.0203, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 2.0585072058507206, | |
| "grad_norm": 0.0018614591099321842, | |
| "learning_rate": 7.318089255356695e-06, | |
| "loss": 0.0093, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 2.0642432064243206, | |
| "grad_norm": 0.10742621123790741, | |
| "learning_rate": 7.300325188655762e-06, | |
| "loss": 0.0093, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 2.0642432064243206, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07848691940307617, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1618, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.858, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.452, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 2.0699792069979206, | |
| "grad_norm": 0.6563161611557007, | |
| "learning_rate": 7.282524213660974e-06, | |
| "loss": 0.001, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 2.0757152075715206, | |
| "grad_norm": 0.12672875821590424, | |
| "learning_rate": 7.264686615985697e-06, | |
| "loss": 0.0008, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 2.0814512081451206, | |
| "grad_norm": 0.004448240157216787, | |
| "learning_rate": 7.246812681830899e-06, | |
| "loss": 0.0139, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 2.0871872087187207, | |
| "grad_norm": 0.007188173942267895, | |
| "learning_rate": 7.228902697980562e-06, | |
| "loss": 0.0106, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 2.092923209292321, | |
| "grad_norm": 0.006886759772896767, | |
| "learning_rate": 7.210956951797074e-06, | |
| "loss": 0.0076, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 2.092923209292321, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08482418954372406, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1827, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.827, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.437, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 2.098659209865921, | |
| "grad_norm": 3.8101847171783447, | |
| "learning_rate": 7.19297573121663e-06, | |
| "loss": 0.0089, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 2.104395210439521, | |
| "grad_norm": 0.002013096585869789, | |
| "learning_rate": 7.174959324744599e-06, | |
| "loss": 0.0123, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 2.110131211013121, | |
| "grad_norm": 3.1393046379089355, | |
| "learning_rate": 7.156908021450904e-06, | |
| "loss": 0.01, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 2.115867211586721, | |
| "grad_norm": 0.07943509519100189, | |
| "learning_rate": 7.138822110965381e-06, | |
| "loss": 0.0016, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 2.121603212160321, | |
| "grad_norm": 2.5405237674713135, | |
| "learning_rate": 7.120701883473131e-06, | |
| "loss": 0.0148, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 2.121603212160321, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.10251594334840775, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2361, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.75, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.398, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 2.1273392127339212, | |
| "grad_norm": 0.006140326615422964, | |
| "learning_rate": 7.102547629709867e-06, | |
| "loss": 0.0163, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 2.1330752133075213, | |
| "grad_norm": 0.04222610965371132, | |
| "learning_rate": 7.084359640957246e-06, | |
| "loss": 0.017, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 2.1388112138811213, | |
| "grad_norm": 0.03461828827857971, | |
| "learning_rate": 7.066138209038194e-06, | |
| "loss": 0.0162, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 2.1445472144547213, | |
| "grad_norm": 0.9535601139068604, | |
| "learning_rate": 7.047883626312233e-06, | |
| "loss": 0.0266, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 2.1502832150283213, | |
| "grad_norm": 0.8345515131950378, | |
| "learning_rate": 7.029596185670778e-06, | |
| "loss": 0.0446, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 2.1502832150283213, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07697154581546783, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1863, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.822, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.435, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 2.156019215601922, | |
| "grad_norm": 0.14290325343608856, | |
| "learning_rate": 7.011276180532445e-06, | |
| "loss": 0.0218, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 2.161755216175522, | |
| "grad_norm": 0.05109575018286705, | |
| "learning_rate": 6.992923904838341e-06, | |
| "loss": 0.0032, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 2.167491216749122, | |
| "grad_norm": 3.4320428371429443, | |
| "learning_rate": 6.974539653047346e-06, | |
| "loss": 0.0107, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 2.173227217322722, | |
| "grad_norm": 0.9100053310394287, | |
| "learning_rate": 6.956123720131398e-06, | |
| "loss": 0.0162, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 2.178963217896322, | |
| "grad_norm": 0.09293865412473679, | |
| "learning_rate": 6.937676401570744e-06, | |
| "loss": 0.0175, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 2.178963217896322, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07296475023031235, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.156, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.866, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.457, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 2.184699218469922, | |
| "grad_norm": 0.25332963466644287, | |
| "learning_rate": 6.9191979933492135e-06, | |
| "loss": 0.0107, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 2.190435219043522, | |
| "grad_norm": 0.47701334953308105, | |
| "learning_rate": 6.900688791949463e-06, | |
| "loss": 0.0087, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 2.196171219617122, | |
| "grad_norm": 0.019721075892448425, | |
| "learning_rate": 6.882149094348215e-06, | |
| "loss": 0.0178, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 2.201907220190722, | |
| "grad_norm": 0.057930897921323776, | |
| "learning_rate": 6.863579198011506e-06, | |
| "loss": 0.0291, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 2.207643220764322, | |
| "grad_norm": 0.03594127669930458, | |
| "learning_rate": 6.8449794008899e-06, | |
| "loss": 0.005, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 2.207643220764322, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07210449874401093, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1052, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.94, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.494, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 2.213379221337922, | |
| "grad_norm": 2.532087802886963, | |
| "learning_rate": 6.826350001413713e-06, | |
| "loss": 0.0085, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 2.219115221911522, | |
| "grad_norm": 0.3755287230014801, | |
| "learning_rate": 6.807691298488231e-06, | |
| "loss": 0.0014, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 2.224851222485122, | |
| "grad_norm": 0.09285666048526764, | |
| "learning_rate": 6.789003591488902e-06, | |
| "loss": 0.0013, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 2.2305872230587225, | |
| "grad_norm": 3.115718126296997, | |
| "learning_rate": 6.770287180256545e-06, | |
| "loss": 0.0045, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 2.2363232236323225, | |
| "grad_norm": 1.7172398567199707, | |
| "learning_rate": 6.751542365092527e-06, | |
| "loss": 0.0085, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 2.2363232236323225, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08293686807155609, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1262, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.909, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.478, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 2.2420592242059225, | |
| "grad_norm": 0.004544651135802269, | |
| "learning_rate": 6.732769446753954e-06, | |
| "loss": 0.021, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 2.2477952247795225, | |
| "grad_norm": 3.599937677383423, | |
| "learning_rate": 6.713968726448844e-06, | |
| "loss": 0.0067, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 2.2535312253531226, | |
| "grad_norm": 0.9228914976119995, | |
| "learning_rate": 6.69514050583129e-06, | |
| "loss": 0.0097, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 2.2592672259267226, | |
| "grad_norm": 0.005351903382688761, | |
| "learning_rate": 6.676285086996623e-06, | |
| "loss": 0.0353, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 2.2650032265003226, | |
| "grad_norm": 0.07884957641363144, | |
| "learning_rate": 6.657402772476563e-06, | |
| "loss": 0.0092, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 2.2650032265003226, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09563428908586502, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.7675, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 29.999, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.022, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 2.2707392270739226, | |
| "grad_norm": 1.1882566213607788, | |
| "learning_rate": 6.638493865234369e-06, | |
| "loss": 0.0132, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 2.2764752276475226, | |
| "grad_norm": 0.13506627082824707, | |
| "learning_rate": 6.619558668659975e-06, | |
| "loss": 0.0219, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 2.2822112282211227, | |
| "grad_norm": 0.007426030468195677, | |
| "learning_rate": 6.600597486565119e-06, | |
| "loss": 0.0216, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 2.2879472287947227, | |
| "grad_norm": 0.27120983600616455, | |
| "learning_rate": 6.581610623178476e-06, | |
| "loss": 0.0153, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 2.293683229368323, | |
| "grad_norm": 1.4845871925354004, | |
| "learning_rate": 6.562598383140773e-06, | |
| "loss": 0.0059, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 2.293683229368323, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07171830534934998, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2153, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.78, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.413, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 2.299419229941923, | |
| "grad_norm": 1.7247122526168823, | |
| "learning_rate": 6.543561071499893e-06, | |
| "loss": 0.0216, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 2.305155230515523, | |
| "grad_norm": 0.02364450879395008, | |
| "learning_rate": 6.524498993706e-06, | |
| "loss": 0.0244, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 2.310891231089123, | |
| "grad_norm": 0.048632461577653885, | |
| "learning_rate": 6.505412455606617e-06, | |
| "loss": 0.0029, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 2.316627231662723, | |
| "grad_norm": 0.12222401797771454, | |
| "learning_rate": 6.486301763441732e-06, | |
| "loss": 0.0027, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 2.3223632322363232, | |
| "grad_norm": 0.07429647445678711, | |
| "learning_rate": 6.467167223838879e-06, | |
| "loss": 0.0119, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 2.3223632322363232, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09189260751008987, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1514, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.873, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.46, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 2.3280992328099233, | |
| "grad_norm": 0.10564873367547989, | |
| "learning_rate": 6.4480091438082225e-06, | |
| "loss": 0.0328, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 2.3338352333835233, | |
| "grad_norm": 0.01708606630563736, | |
| "learning_rate": 6.4288278307376265e-06, | |
| "loss": 0.015, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 2.3395712339571233, | |
| "grad_norm": 0.25011512637138367, | |
| "learning_rate": 6.4096235923877256e-06, | |
| "loss": 0.0061, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 2.3453072345307233, | |
| "grad_norm": 0.055370282381772995, | |
| "learning_rate": 6.390396736886986e-06, | |
| "loss": 0.007, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 2.3510432351043233, | |
| "grad_norm": 0.030733373016119003, | |
| "learning_rate": 6.371147572726761e-06, | |
| "loss": 0.0017, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 2.3510432351043233, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08482550084590912, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1309, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.903, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.475, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 2.356779235677924, | |
| "grad_norm": 0.17796050012111664, | |
| "learning_rate": 6.351876408756344e-06, | |
| "loss": 0.0053, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 2.3625152362515234, | |
| "grad_norm": 0.0136833805590868, | |
| "learning_rate": 6.332583554178009e-06, | |
| "loss": 0.0044, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 2.368251236825124, | |
| "grad_norm": 4.209476470947266, | |
| "learning_rate": 6.313269318542057e-06, | |
| "loss": 0.0259, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 2.373987237398724, | |
| "grad_norm": 3.9427716732025146, | |
| "learning_rate": 6.2939340117418355e-06, | |
| "loss": 0.0237, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 2.379723237972324, | |
| "grad_norm": 0.008540840819478035, | |
| "learning_rate": 6.274577944008785e-06, | |
| "loss": 0.0044, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 2.379723237972324, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07825496047735214, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1875, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.82, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.434, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 2.385459238545924, | |
| "grad_norm": 0.01801919937133789, | |
| "learning_rate": 6.255201425907442e-06, | |
| "loss": 0.0002, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 2.391195239119524, | |
| "grad_norm": 0.7031999230384827, | |
| "learning_rate": 6.235804768330472e-06, | |
| "loss": 0.0198, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 2.396931239693124, | |
| "grad_norm": 0.03624948486685753, | |
| "learning_rate": 6.21638828249367e-06, | |
| "loss": 0.0096, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 2.402667240266724, | |
| "grad_norm": 0.010821559466421604, | |
| "learning_rate": 6.196952279930977e-06, | |
| "loss": 0.0089, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 2.408403240840324, | |
| "grad_norm": 0.01340206153690815, | |
| "learning_rate": 6.177497072489473e-06, | |
| "loss": 0.0227, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 2.408403240840324, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07348810136318207, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1832, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.826, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.437, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 2.414139241413924, | |
| "grad_norm": 0.006387198343873024, | |
| "learning_rate": 6.158022972324375e-06, | |
| "loss": 0.0112, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 2.419875241987524, | |
| "grad_norm": 0.1088046282529831, | |
| "learning_rate": 6.138530291894033e-06, | |
| "loss": 0.015, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 2.425611242561124, | |
| "grad_norm": 0.010532280430197716, | |
| "learning_rate": 6.119019343954914e-06, | |
| "loss": 0.0012, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 2.4313472431347245, | |
| "grad_norm": 2.5161547660827637, | |
| "learning_rate": 6.099490441556582e-06, | |
| "loss": 0.0054, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 2.4370832437083245, | |
| "grad_norm": 0.41214823722839355, | |
| "learning_rate": 6.07994389803668e-06, | |
| "loss": 0.0067, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 2.4370832437083245, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08171182125806808, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1867, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.821, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.434, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 2.4428192442819245, | |
| "grad_norm": 3.9505116939544678, | |
| "learning_rate": 6.060380027015897e-06, | |
| "loss": 0.0205, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 2.4485552448555246, | |
| "grad_norm": 2.172741651535034, | |
| "learning_rate": 6.040799142392937e-06, | |
| "loss": 0.015, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 2.4542912454291246, | |
| "grad_norm": 0.003915698733180761, | |
| "learning_rate": 6.02120155833949e-06, | |
| "loss": 0.0047, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 2.4600272460027246, | |
| "grad_norm": 0.027446260675787926, | |
| "learning_rate": 6.001587589295179e-06, | |
| "loss": 0.0172, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 2.4657632465763246, | |
| "grad_norm": 0.9420067667961121, | |
| "learning_rate": 5.981957549962524e-06, | |
| "loss": 0.0132, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 2.4657632465763246, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.06599417328834534, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2262, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.764, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.405, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 2.4714992471499246, | |
| "grad_norm": 2.6476902961730957, | |
| "learning_rate": 5.96231175530189e-06, | |
| "loss": 0.01, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 2.4772352477235247, | |
| "grad_norm": 0.03741572052240372, | |
| "learning_rate": 5.942650520526432e-06, | |
| "loss": 0.0102, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 2.4829712482971247, | |
| "grad_norm": 0.012345872819423676, | |
| "learning_rate": 5.9229741610970425e-06, | |
| "loss": 0.0154, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 2.4887072488707247, | |
| "grad_norm": 0.006493984255939722, | |
| "learning_rate": 5.903282992717281e-06, | |
| "loss": 0.0063, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 2.494443249444325, | |
| "grad_norm": 0.011486309580504894, | |
| "learning_rate": 5.883577331328316e-06, | |
| "loss": 0.0005, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 2.494443249444325, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.0845269188284874, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1186, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.921, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.484, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 2.5001792500179247, | |
| "grad_norm": 0.05512839928269386, | |
| "learning_rate": 5.863857493103855e-06, | |
| "loss": 0.0249, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 2.505915250591525, | |
| "grad_norm": 0.010474382899701595, | |
| "learning_rate": 5.84412379444507e-06, | |
| "loss": 0.0007, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 2.511651251165125, | |
| "grad_norm": 0.8734518885612488, | |
| "learning_rate": 5.824376551975519e-06, | |
| "loss": 0.0081, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 2.5173872517387252, | |
| "grad_norm": 0.11771434545516968, | |
| "learning_rate": 5.804616082536071e-06, | |
| "loss": 0.0074, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 2.5231232523123253, | |
| "grad_norm": 1.922683835029602, | |
| "learning_rate": 5.784842703179814e-06, | |
| "loss": 0.0328, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 2.5231232523123253, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07923904061317444, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.0857, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.969, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.508, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 2.5288592528859253, | |
| "grad_norm": 4.321990013122559, | |
| "learning_rate": 5.765056731166982e-06, | |
| "loss": 0.0237, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 2.5345952534595253, | |
| "grad_norm": 0.01736987754702568, | |
| "learning_rate": 5.745258483959847e-06, | |
| "loss": 0.0064, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 2.5403312540331253, | |
| "grad_norm": 0.07172615826129913, | |
| "learning_rate": 5.725448279217638e-06, | |
| "loss": 0.0118, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 2.5460672546067253, | |
| "grad_norm": 0.1578182578086853, | |
| "learning_rate": 5.705626434791444e-06, | |
| "loss": 0.0016, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 2.5518032551803254, | |
| "grad_norm": 0.005266325548291206, | |
| "learning_rate": 5.6857932687191e-06, | |
| "loss": 0.0023, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 2.5518032551803254, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07244950532913208, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2865, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.677, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.362, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 2.557539255753926, | |
| "grad_norm": 0.21443744003772736, | |
| "learning_rate": 5.66594909922011e-06, | |
| "loss": 0.0114, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 2.5632752563275254, | |
| "grad_norm": 0.08916337788105011, | |
| "learning_rate": 5.646094244690511e-06, | |
| "loss": 0.0064, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 2.569011256901126, | |
| "grad_norm": 2.983360528945923, | |
| "learning_rate": 5.626229023697789e-06, | |
| "loss": 0.0159, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 2.574747257474726, | |
| "grad_norm": 0.06603588908910751, | |
| "learning_rate": 5.606353754975755e-06, | |
| "loss": 0.0104, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 2.580483258048326, | |
| "grad_norm": 0.9153593182563782, | |
| "learning_rate": 5.586468757419433e-06, | |
| "loss": 0.0012, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 2.580483258048326, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07681399583816528, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.3831, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.538, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.292, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 2.586219258621926, | |
| "grad_norm": 0.030913453549146652, | |
| "learning_rate": 5.566574350079946e-06, | |
| "loss": 0.0139, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 2.591955259195526, | |
| "grad_norm": 3.312136650085449, | |
| "learning_rate": 5.546670852159396e-06, | |
| "loss": 0.0074, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 2.597691259769126, | |
| "grad_norm": 0.008490420877933502, | |
| "learning_rate": 5.526758583005736e-06, | |
| "loss": 0.008, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 2.603427260342726, | |
| "grad_norm": 0.010648602619767189, | |
| "learning_rate": 5.50683786210766e-06, | |
| "loss": 0.0051, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 2.609163260916326, | |
| "grad_norm": 2.533731460571289, | |
| "learning_rate": 5.486909009089463e-06, | |
| "loss": 0.0185, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 2.609163260916326, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07284829020500183, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1938, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.811, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.429, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 2.614899261489926, | |
| "grad_norm": 0.0175775233656168, | |
| "learning_rate": 5.4669723437059194e-06, | |
| "loss": 0.0015, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 2.6206352620635265, | |
| "grad_norm": 0.00764029985293746, | |
| "learning_rate": 5.4470281858371555e-06, | |
| "loss": 0.0014, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 2.626371262637126, | |
| "grad_norm": 0.4708549380302429, | |
| "learning_rate": 5.4270768554835056e-06, | |
| "loss": 0.0023, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 2.6321072632107265, | |
| "grad_norm": 0.06411808729171753, | |
| "learning_rate": 5.407118672760393e-06, | |
| "loss": 0.0037, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 2.6378432637843265, | |
| "grad_norm": 0.009167805314064026, | |
| "learning_rate": 5.387153957893181e-06, | |
| "loss": 0.0016, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 2.6378432637843265, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09603765606880188, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2377, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.747, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.397, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 2.6435792643579266, | |
| "grad_norm": 0.0018381074769422412, | |
| "learning_rate": 5.367183031212041e-06, | |
| "loss": 0.007, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 2.6493152649315266, | |
| "grad_norm": 0.0011304231593385339, | |
| "learning_rate": 5.347206213146813e-06, | |
| "loss": 0.0277, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 2.6550512655051266, | |
| "grad_norm": 4.376464366912842, | |
| "learning_rate": 5.327223824221862e-06, | |
| "loss": 0.0093, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 2.6607872660787266, | |
| "grad_norm": 0.031111005693674088, | |
| "learning_rate": 5.307236185050937e-06, | |
| "loss": 0.0029, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 2.6665232666523266, | |
| "grad_norm": 2.6803221702575684, | |
| "learning_rate": 5.287243616332027e-06, | |
| "loss": 0.0145, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 2.6665232666523266, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.10255688428878784, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2291, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.76, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.403, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 2.6722592672259267, | |
| "grad_norm": 0.03457150608301163, | |
| "learning_rate": 5.267246438842213e-06, | |
| "loss": 0.005, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 2.6779952677995267, | |
| "grad_norm": 4.120417594909668, | |
| "learning_rate": 5.247244973432524e-06, | |
| "loss": 0.0141, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 2.6837312683731267, | |
| "grad_norm": 0.005652038846164942, | |
| "learning_rate": 5.227239541022786e-06, | |
| "loss": 0.0034, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 2.6894672689467267, | |
| "grad_norm": 2.9149153232574463, | |
| "learning_rate": 5.2072304625964785e-06, | |
| "loss": 0.0061, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 2.695203269520327, | |
| "grad_norm": 0.0008329672855325043, | |
| "learning_rate": 5.187218059195578e-06, | |
| "loss": 0.0047, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 2.695203269520327, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.10345083475112915, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2387, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.746, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.396, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 2.7009392700939268, | |
| "grad_norm": 1.024248480796814, | |
| "learning_rate": 5.167202651915409e-06, | |
| "loss": 0.0071, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 2.7066752706675272, | |
| "grad_norm": 0.014447176828980446, | |
| "learning_rate": 5.147184561899495e-06, | |
| "loss": 0.0076, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 2.7124112712411272, | |
| "grad_norm": 5.958902835845947, | |
| "learning_rate": 5.1271641103344e-06, | |
| "loss": 0.0116, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 2.7181472718147273, | |
| "grad_norm": 0.016329055652022362, | |
| "learning_rate": 5.1071416184445845e-06, | |
| "loss": 0.0105, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 2.7238832723883273, | |
| "grad_norm": 0.4175659716129303, | |
| "learning_rate": 5.087117407487239e-06, | |
| "loss": 0.0166, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 2.7238832723883273, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08455558121204376, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2591, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.716, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.382, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 2.7296192729619273, | |
| "grad_norm": 0.022596385329961777, | |
| "learning_rate": 5.06709179874714e-06, | |
| "loss": 0.0009, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 2.7353552735355273, | |
| "grad_norm": 0.0726088285446167, | |
| "learning_rate": 5.047065113531493e-06, | |
| "loss": 0.0254, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 2.7410912741091273, | |
| "grad_norm": 0.035875000059604645, | |
| "learning_rate": 5.02703767316477e-06, | |
| "loss": 0.0244, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 2.7468272746827274, | |
| "grad_norm": 3.32077956199646, | |
| "learning_rate": 5.00700979898356e-06, | |
| "loss": 0.006, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 2.7525632752563274, | |
| "grad_norm": 1.0301790237426758, | |
| "learning_rate": 4.98698181233142e-06, | |
| "loss": 0.0059, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 2.7525632752563274, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07655075937509537, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2238, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.767, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.407, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 2.758299275829928, | |
| "grad_norm": 0.446961373090744, | |
| "learning_rate": 4.966954034553699e-06, | |
| "loss": 0.0087, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 2.7640352764035274, | |
| "grad_norm": 0.33151447772979736, | |
| "learning_rate": 4.946926786992407e-06, | |
| "loss": 0.0009, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 2.769771276977128, | |
| "grad_norm": 0.12154054641723633, | |
| "learning_rate": 4.9269003909810405e-06, | |
| "loss": 0.0055, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 2.775507277550728, | |
| "grad_norm": 0.016543418169021606, | |
| "learning_rate": 4.906875167839433e-06, | |
| "loss": 0.0373, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 2.781243278124328, | |
| "grad_norm": 0.018002478405833244, | |
| "learning_rate": 4.886851438868599e-06, | |
| "loss": 0.0133, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 2.781243278124328, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07717321813106537, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.202, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.799, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.423, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 2.786979278697928, | |
| "grad_norm": 0.01707332953810692, | |
| "learning_rate": 4.866829525345585e-06, | |
| "loss": 0.0166, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 2.792715279271528, | |
| "grad_norm": 0.00804219115525484, | |
| "learning_rate": 4.846809748518302e-06, | |
| "loss": 0.0015, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 2.798451279845128, | |
| "grad_norm": 0.02389547787606716, | |
| "learning_rate": 4.826792429600381e-06, | |
| "loss": 0.0021, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 2.804187280418728, | |
| "grad_norm": 0.005223631393164396, | |
| "learning_rate": 4.806777889766016e-06, | |
| "loss": 0.0136, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 2.809923280992328, | |
| "grad_norm": 0.20156431198120117, | |
| "learning_rate": 4.78676645014481e-06, | |
| "loss": 0.0093, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 2.809923280992328, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.0736735612154007, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.151, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.873, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.46, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 2.815659281565928, | |
| "grad_norm": 0.0024203194770962, | |
| "learning_rate": 4.766758431816629e-06, | |
| "loss": 0.0052, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 2.821395282139528, | |
| "grad_norm": 0.031604766845703125, | |
| "learning_rate": 4.746754155806437e-06, | |
| "loss": 0.0095, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 2.827131282713128, | |
| "grad_norm": 3.787029266357422, | |
| "learning_rate": 4.72675394307916e-06, | |
| "loss": 0.0139, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 2.8328672832867285, | |
| "grad_norm": 0.07122969627380371, | |
| "learning_rate": 4.7067581145345226e-06, | |
| "loss": 0.0139, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 2.838603283860328, | |
| "grad_norm": 3.0758485794067383, | |
| "learning_rate": 4.686766991001913e-06, | |
| "loss": 0.0161, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 2.838603283860328, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08140452206134796, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.151, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.873, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.46, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 2.8443392844339286, | |
| "grad_norm": 0.07921534031629562, | |
| "learning_rate": 4.666780893235227e-06, | |
| "loss": 0.0006, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 2.8500752850075286, | |
| "grad_norm": 0.12975841760635376, | |
| "learning_rate": 4.646800141907717e-06, | |
| "loss": 0.0154, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 2.8558112855811286, | |
| "grad_norm": 0.0262366384267807, | |
| "learning_rate": 4.626825057606859e-06, | |
| "loss": 0.0093, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 2.8615472861547286, | |
| "grad_norm": 0.00262279505841434, | |
| "learning_rate": 4.606855960829199e-06, | |
| "loss": 0.0202, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 2.8672832867283287, | |
| "grad_norm": 0.01955203339457512, | |
| "learning_rate": 4.586893171975218e-06, | |
| "loss": 0.0266, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 2.8672832867283287, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07921645790338516, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.3258, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.62, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.334, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 2.8730192873019287, | |
| "grad_norm": 0.10377766937017441, | |
| "learning_rate": 4.566937011344184e-06, | |
| "loss": 0.0017, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 2.8787552878755287, | |
| "grad_norm": 0.784302294254303, | |
| "learning_rate": 4.546987799129018e-06, | |
| "loss": 0.0046, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 2.8844912884491287, | |
| "grad_norm": 2.035209894180298, | |
| "learning_rate": 4.527045855411153e-06, | |
| "loss": 0.0119, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 2.8902272890227287, | |
| "grad_norm": 3.6558942794799805, | |
| "learning_rate": 4.507111500155407e-06, | |
| "loss": 0.0123, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 2.895963289596329, | |
| "grad_norm": 3.73490309715271, | |
| "learning_rate": 4.487185053204832e-06, | |
| "loss": 0.0073, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 2.895963289596329, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07410901039838791, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.3293, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.615, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.331, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 2.901699290169929, | |
| "grad_norm": 0.008407690562307835, | |
| "learning_rate": 4.467266834275601e-06, | |
| "loss": 0.0191, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 2.9074352907435292, | |
| "grad_norm": 0.03255997970700264, | |
| "learning_rate": 4.447357162951865e-06, | |
| "loss": 0.0458, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 2.9131712913171293, | |
| "grad_norm": 1.6957069635391235, | |
| "learning_rate": 4.427456358680635e-06, | |
| "loss": 0.0236, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 2.9189072918907293, | |
| "grad_norm": 0.1675146073102951, | |
| "learning_rate": 4.407564740766648e-06, | |
| "loss": 0.0177, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 2.9246432924643293, | |
| "grad_norm": 0.04229651764035225, | |
| "learning_rate": 4.3876826283672485e-06, | |
| "loss": 0.0116, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 2.9246432924643293, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.06259483098983765, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.178, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.834, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.441, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 2.9303792930379293, | |
| "grad_norm": 0.04005512595176697, | |
| "learning_rate": 4.367810340487267e-06, | |
| "loss": 0.0099, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 2.9361152936115293, | |
| "grad_norm": 0.1425451785326004, | |
| "learning_rate": 4.347948195973901e-06, | |
| "loss": 0.0033, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 2.9418512941851294, | |
| "grad_norm": 0.10147175192832947, | |
| "learning_rate": 4.328096513511601e-06, | |
| "loss": 0.0076, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 2.9475872947587294, | |
| "grad_norm": 0.13407284021377563, | |
| "learning_rate": 4.308255611616954e-06, | |
| "loss": 0.0034, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 2.9533232953323294, | |
| "grad_norm": 0.12246271222829819, | |
| "learning_rate": 4.2884258086335755e-06, | |
| "loss": 0.0077, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 2.9533232953323294, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07015232741832733, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2126, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.784, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.415, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 2.95905929590593, | |
| "grad_norm": 0.06634827703237534, | |
| "learning_rate": 4.268607422727e-06, | |
| "loss": 0.0131, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 2.9647952964795294, | |
| "grad_norm": 2.3556761741638184, | |
| "learning_rate": 4.248800771879581e-06, | |
| "loss": 0.029, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 2.97053129705313, | |
| "grad_norm": 0.03471173346042633, | |
| "learning_rate": 4.229006173885381e-06, | |
| "loss": 0.0021, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 2.97626729762673, | |
| "grad_norm": 0.006328817922621965, | |
| "learning_rate": 4.2092239463450775e-06, | |
| "loss": 0.0212, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 2.98200329820033, | |
| "grad_norm": 0.02347049117088318, | |
| "learning_rate": 4.189454406660865e-06, | |
| "loss": 0.0056, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 2.98200329820033, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.06551730632781982, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.5411, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.314, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.18, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 2.98773929877393, | |
| "grad_norm": 0.012669041752815247, | |
| "learning_rate": 4.169697872031368e-06, | |
| "loss": 0.012, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 2.99347529934753, | |
| "grad_norm": 0.07477905601263046, | |
| "learning_rate": 4.1499546594465465e-06, | |
| "loss": 0.0215, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 2.99921129992113, | |
| "grad_norm": 1.7680764198303223, | |
| "learning_rate": 4.1302250856826045e-06, | |
| "loss": 0.0107, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 3.00458880045888, | |
| "grad_norm": 0.19178365170955658, | |
| "learning_rate": 4.110509467296923e-06, | |
| "loss": 0.006, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 3.01032480103248, | |
| "grad_norm": 0.025907142087817192, | |
| "learning_rate": 4.090808120622961e-06, | |
| "loss": 0.0139, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 3.01032480103248, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07205013185739517, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2727, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.697, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.372, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 3.01606080160608, | |
| "grad_norm": 0.018714534118771553, | |
| "learning_rate": 4.071121361765201e-06, | |
| "loss": 0.0033, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 3.02179680217968, | |
| "grad_norm": 2.1166880130767822, | |
| "learning_rate": 4.05144950659406e-06, | |
| "loss": 0.0081, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 3.02753280275328, | |
| "grad_norm": 0.07031437754631042, | |
| "learning_rate": 4.031792870740831e-06, | |
| "loss": 0.0086, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 3.03326880332688, | |
| "grad_norm": 0.24249830842018127, | |
| "learning_rate": 4.012151769592612e-06, | |
| "loss": 0.0132, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 3.0390048039004802, | |
| "grad_norm": 0.009602251462638378, | |
| "learning_rate": 3.992526518287258e-06, | |
| "loss": 0.0075, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 3.0390048039004802, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07945062220096588, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1773, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.835, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.441, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 3.0447408044740802, | |
| "grad_norm": 0.1440117210149765, | |
| "learning_rate": 3.972917431708305e-06, | |
| "loss": 0.0013, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 3.0504768050476807, | |
| "grad_norm": 0.013155910186469555, | |
| "learning_rate": 3.9533248244799375e-06, | |
| "loss": 0.0049, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 3.0562128056212807, | |
| "grad_norm": 0.1568404585123062, | |
| "learning_rate": 3.933749010961927e-06, | |
| "loss": 0.0024, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 3.0619488061948807, | |
| "grad_norm": 0.00445369491353631, | |
| "learning_rate": 3.914190305244595e-06, | |
| "loss": 0.0024, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 3.0676848067684808, | |
| "grad_norm": 0.15039828419685364, | |
| "learning_rate": 3.8946490211437735e-06, | |
| "loss": 0.02, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 3.0676848067684808, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08091636747121811, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2042, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.796, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.421, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 3.073420807342081, | |
| "grad_norm": 0.13300937414169312, | |
| "learning_rate": 3.875125472195764e-06, | |
| "loss": 0.0012, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 3.079156807915681, | |
| "grad_norm": 2.184394121170044, | |
| "learning_rate": 3.855619971652314e-06, | |
| "loss": 0.0117, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 3.084892808489281, | |
| "grad_norm": 0.02020282857120037, | |
| "learning_rate": 3.836132832475583e-06, | |
| "loss": 0.0014, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 3.090628809062881, | |
| "grad_norm": 0.008466456085443497, | |
| "learning_rate": 3.816664367333131e-06, | |
| "loss": 0.009, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 3.096364809636481, | |
| "grad_norm": 5.091494560241699, | |
| "learning_rate": 3.797214888592896e-06, | |
| "loss": 0.0128, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 3.096364809636481, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07716992497444153, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2065, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.792, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.42, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 3.102100810210081, | |
| "grad_norm": 0.20820866525173187, | |
| "learning_rate": 3.777784708318178e-06, | |
| "loss": 0.0184, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 3.107836810783681, | |
| "grad_norm": 1.1382921934127808, | |
| "learning_rate": 3.7583741382626402e-06, | |
| "loss": 0.0093, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 3.113572811357281, | |
| "grad_norm": 0.38132885098457336, | |
| "learning_rate": 3.7389834898653067e-06, | |
| "loss": 0.0117, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 3.1193088119308814, | |
| "grad_norm": 0.009956207126379013, | |
| "learning_rate": 3.719613074245555e-06, | |
| "loss": 0.0017, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 3.1250448125044814, | |
| "grad_norm": 5.792750358581543, | |
| "learning_rate": 3.7002632021981368e-06, | |
| "loss": 0.0146, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 3.1250448125044814, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.0894627794623375, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2897, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.672, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.36, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 3.1307808130780814, | |
| "grad_norm": 1.089165210723877, | |
| "learning_rate": 3.680934184188182e-06, | |
| "loss": 0.0032, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 3.1365168136516814, | |
| "grad_norm": 0.020885087549686432, | |
| "learning_rate": 3.661626330346224e-06, | |
| "loss": 0.0037, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 3.1422528142252815, | |
| "grad_norm": 0.09390339255332947, | |
| "learning_rate": 3.642339950463224e-06, | |
| "loss": 0.0083, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 3.1479888147988815, | |
| "grad_norm": 0.0025120277423411608, | |
| "learning_rate": 3.62307535398559e-06, | |
| "loss": 0.0108, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 3.1537248153724815, | |
| "grad_norm": 0.06790509074926376, | |
| "learning_rate": 3.603832850010226e-06, | |
| "loss": 0.0012, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 3.1537248153724815, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08822309225797653, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.3502, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.585, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.316, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 3.1594608159460815, | |
| "grad_norm": 0.07959645241498947, | |
| "learning_rate": 3.58461274727956e-06, | |
| "loss": 0.0075, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 3.1651968165196815, | |
| "grad_norm": 3.0567445755004883, | |
| "learning_rate": 3.5654153541766023e-06, | |
| "loss": 0.008, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 3.1709328170932816, | |
| "grad_norm": 0.20890028774738312, | |
| "learning_rate": 3.546240978719988e-06, | |
| "loss": 0.0006, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 3.1766688176668816, | |
| "grad_norm": 2.681074619293213, | |
| "learning_rate": 3.5270899285590375e-06, | |
| "loss": 0.0027, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 3.1824048182404816, | |
| "grad_norm": 0.04251394048333168, | |
| "learning_rate": 3.5079625109688243e-06, | |
| "loss": 0.0005, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 3.1824048182404816, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09866516292095184, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2324, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.755, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.401, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 3.188140818814082, | |
| "grad_norm": 4.062971591949463, | |
| "learning_rate": 3.4888590328452353e-06, | |
| "loss": 0.0178, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 3.193876819387682, | |
| "grad_norm": 0.0014892960898578167, | |
| "learning_rate": 3.4697798007000624e-06, | |
| "loss": 0.0126, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 3.199612819961282, | |
| "grad_norm": 0.005618450231850147, | |
| "learning_rate": 3.450725120656069e-06, | |
| "loss": 0.0261, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 3.205348820534882, | |
| "grad_norm": 1.2523528337478638, | |
| "learning_rate": 3.431695298442084e-06, | |
| "loss": 0.0061, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 3.211084821108482, | |
| "grad_norm": 3.6290197372436523, | |
| "learning_rate": 3.4126906393881e-06, | |
| "loss": 0.0177, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 3.211084821108482, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07931295037269592, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2163, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.778, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.413, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 3.216820821682082, | |
| "grad_norm": 0.015163871459662914, | |
| "learning_rate": 3.393711448420372e-06, | |
| "loss": 0.0057, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 3.222556822255682, | |
| "grad_norm": 0.059904176741838455, | |
| "learning_rate": 3.37475803005652e-06, | |
| "loss": 0.0017, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 3.228292822829282, | |
| "grad_norm": 0.3671509623527527, | |
| "learning_rate": 3.355830688400652e-06, | |
| "loss": 0.0211, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 3.2340288234028822, | |
| "grad_norm": 2.3451545238494873, | |
| "learning_rate": 3.336929727138474e-06, | |
| "loss": 0.0221, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 3.2397648239764822, | |
| "grad_norm": 0.00577126070857048, | |
| "learning_rate": 3.31805544953243e-06, | |
| "loss": 0.0082, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 3.2397648239764822, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07332266122102737, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1867, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.821, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.434, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 3.2455008245500823, | |
| "grad_norm": 1.244507074356079, | |
| "learning_rate": 3.299208158416829e-06, | |
| "loss": 0.0177, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 3.2512368251236827, | |
| "grad_norm": 1.7493702173233032, | |
| "learning_rate": 3.2803881561929806e-06, | |
| "loss": 0.0182, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 3.2569728256972827, | |
| "grad_norm": 0.03632393479347229, | |
| "learning_rate": 3.2615957448243562e-06, | |
| "loss": 0.0075, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 3.2627088262708828, | |
| "grad_norm": 0.021715901792049408, | |
| "learning_rate": 3.2428312258317306e-06, | |
| "loss": 0.0044, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 3.268444826844483, | |
| "grad_norm": 0.7747498154640198, | |
| "learning_rate": 3.224094900288357e-06, | |
| "loss": 0.0071, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 3.268444826844483, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08630760759115219, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1329, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.9, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.474, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 3.274180827418083, | |
| "grad_norm": 0.003438598709180951, | |
| "learning_rate": 3.205387068815127e-06, | |
| "loss": 0.0025, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 3.279916827991683, | |
| "grad_norm": 0.02451498433947563, | |
| "learning_rate": 3.1867080315757477e-06, | |
| "loss": 0.0004, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 3.285652828565283, | |
| "grad_norm": 0.4422462582588196, | |
| "learning_rate": 3.1680580882719304e-06, | |
| "loss": 0.0206, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 3.291388829138883, | |
| "grad_norm": 3.2265453338623047, | |
| "learning_rate": 3.149437538138583e-06, | |
| "loss": 0.0142, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 3.297124829712483, | |
| "grad_norm": 0.015962867066264153, | |
| "learning_rate": 3.130846679938998e-06, | |
| "loss": 0.0099, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 3.297124829712483, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07895597815513611, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1289, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.906, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.476, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 3.302860830286083, | |
| "grad_norm": 0.470051109790802, | |
| "learning_rate": 3.1122858119600726e-06, | |
| "loss": 0.0031, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 3.308596830859683, | |
| "grad_norm": 0.002947809174656868, | |
| "learning_rate": 3.0937552320075116e-06, | |
| "loss": 0.0005, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 3.3143328314332834, | |
| "grad_norm": 4.272603511810303, | |
| "learning_rate": 3.0752552374010567e-06, | |
| "loss": 0.0185, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 3.320068832006883, | |
| "grad_norm": 0.0015301775420084596, | |
| "learning_rate": 3.056786124969716e-06, | |
| "loss": 0.0012, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 3.3258048325804834, | |
| "grad_norm": 0.002659064019098878, | |
| "learning_rate": 3.0383481910469936e-06, | |
| "loss": 0.004, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 3.3258048325804834, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08382538706064224, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1553, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.867, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.457, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 3.3315408331540834, | |
| "grad_norm": 0.25469204783439636, | |
| "learning_rate": 3.019941731466143e-06, | |
| "loss": 0.0005, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 3.3372768337276835, | |
| "grad_norm": 0.5178417563438416, | |
| "learning_rate": 3.0015670415554143e-06, | |
| "loss": 0.016, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 3.3430128343012835, | |
| "grad_norm": 0.07551216334104538, | |
| "learning_rate": 2.9832244161333257e-06, | |
| "loss": 0.0202, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 3.3487488348748835, | |
| "grad_norm": 0.008605745621025562, | |
| "learning_rate": 2.9649141495039225e-06, | |
| "loss": 0.0009, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 3.3544848354484835, | |
| "grad_norm": 0.0697212815284729, | |
| "learning_rate": 2.9466365354520564e-06, | |
| "loss": 0.0204, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 3.3544848354484835, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07445970177650452, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2129, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.783, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.415, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 3.3602208360220835, | |
| "grad_norm": 0.017461596056818962, | |
| "learning_rate": 2.928391867238679e-06, | |
| "loss": 0.0012, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 3.3659568365956836, | |
| "grad_norm": 0.0013357176212593913, | |
| "learning_rate": 2.910180437596132e-06, | |
| "loss": 0.0036, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 3.3716928371692836, | |
| "grad_norm": 0.00179756130091846, | |
| "learning_rate": 2.8920025387234484e-06, | |
| "loss": 0.0131, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 3.3774288377428836, | |
| "grad_norm": 0.005055127199739218, | |
| "learning_rate": 2.8738584622816656e-06, | |
| "loss": 0.0133, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 3.3831648383164836, | |
| "grad_norm": 0.778300940990448, | |
| "learning_rate": 2.8557484993891484e-06, | |
| "loss": 0.007, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 3.3831648383164836, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08228859305381775, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2282, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.761, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.404, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 3.388900838890084, | |
| "grad_norm": 0.05169231817126274, | |
| "learning_rate": 2.837672940616911e-06, | |
| "loss": 0.0088, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 3.394636839463684, | |
| "grad_norm": 0.0016457217279821634, | |
| "learning_rate": 2.8196320759839677e-06, | |
| "loss": 0.0173, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 3.400372840037284, | |
| "grad_norm": 0.11143210530281067, | |
| "learning_rate": 2.801626194952669e-06, | |
| "loss": 0.0015, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 3.406108840610884, | |
| "grad_norm": 0.03351111710071564, | |
| "learning_rate": 2.7836555864240566e-06, | |
| "loss": 0.0079, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 3.411844841184484, | |
| "grad_norm": 0.22604842483997345, | |
| "learning_rate": 2.7657205387332313e-06, | |
| "loss": 0.0124, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 3.411844841184484, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08073882758617401, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2042, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.796, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.421, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 3.417580841758084, | |
| "grad_norm": 0.1360362470149994, | |
| "learning_rate": 2.747821339644734e-06, | |
| "loss": 0.028, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 3.423316842331684, | |
| "grad_norm": 0.015132960863411427, | |
| "learning_rate": 2.7299582763479115e-06, | |
| "loss": 0.0041, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 3.4290528429052842, | |
| "grad_norm": 0.06472807377576828, | |
| "learning_rate": 2.7121316354523286e-06, | |
| "loss": 0.0017, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 3.4347888434788842, | |
| "grad_norm": 3.1584267616271973, | |
| "learning_rate": 2.6943417029831504e-06, | |
| "loss": 0.0232, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 3.4405248440524843, | |
| "grad_norm": 0.004198137205094099, | |
| "learning_rate": 2.6765887643765653e-06, | |
| "loss": 0.0052, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 3.4405248440524843, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08295302093029022, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2169, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.777, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.412, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 3.4462608446260843, | |
| "grad_norm": 0.018915435299277306, | |
| "learning_rate": 2.6588731044752057e-06, | |
| "loss": 0.0153, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 3.4519968451996847, | |
| "grad_norm": 3.4281928539276123, | |
| "learning_rate": 2.641195007523568e-06, | |
| "loss": 0.0099, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 3.4577328457732848, | |
| "grad_norm": 0.0030478844419121742, | |
| "learning_rate": 2.623554757163464e-06, | |
| "loss": 0.0004, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 3.463468846346885, | |
| "grad_norm": 0.022162869572639465, | |
| "learning_rate": 2.605952636429456e-06, | |
| "loss": 0.0114, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 3.469204846920485, | |
| "grad_norm": 0.009365621022880077, | |
| "learning_rate": 2.5883889277443347e-06, | |
| "loss": 0.0009, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 3.469204846920485, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07579995691776276, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2596, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.716, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.381, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 3.474940847494085, | |
| "grad_norm": 0.29800668358802795, | |
| "learning_rate": 2.570863912914566e-06, | |
| "loss": 0.018, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 3.480676848067685, | |
| "grad_norm": 3.4592084884643555, | |
| "learning_rate": 2.5533778731257824e-06, | |
| "loss": 0.0041, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 3.486412848641285, | |
| "grad_norm": 0.03675074130296707, | |
| "learning_rate": 2.535931088938274e-06, | |
| "loss": 0.0129, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 3.492148849214885, | |
| "grad_norm": 0.5131600499153137, | |
| "learning_rate": 2.518523840282479e-06, | |
| "loss": 0.0069, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 3.497884849788485, | |
| "grad_norm": 0.19048012793064117, | |
| "learning_rate": 2.5011564064544945e-06, | |
| "loss": 0.0061, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 3.497884849788485, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07761486619710922, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2658, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.707, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.377, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 3.503620850362085, | |
| "grad_norm": 0.37152099609375, | |
| "learning_rate": 2.483829066111596e-06, | |
| "loss": 0.0108, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 3.509356850935685, | |
| "grad_norm": 4.403800964355469, | |
| "learning_rate": 2.466542097267765e-06, | |
| "loss": 0.0119, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 3.5150928515092854, | |
| "grad_norm": 0.008035624399781227, | |
| "learning_rate": 2.4492957772892345e-06, | |
| "loss": 0.0192, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 3.520828852082885, | |
| "grad_norm": 1.8271785974502563, | |
| "learning_rate": 2.432090382890032e-06, | |
| "loss": 0.0159, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 3.5265648526564854, | |
| "grad_norm": 0.00973561592400074, | |
| "learning_rate": 2.4149261901275393e-06, | |
| "loss": 0.0184, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 3.5265648526564854, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07000607997179031, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.3198, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.629, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.338, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 3.5323008532300855, | |
| "grad_norm": 0.3272586464881897, | |
| "learning_rate": 2.3978034743980676e-06, | |
| "loss": 0.0014, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 3.5380368538036855, | |
| "grad_norm": 0.015382179990410805, | |
| "learning_rate": 2.3807225104324337e-06, | |
| "loss": 0.0099, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 3.5437728543772855, | |
| "grad_norm": 0.016446420922875404, | |
| "learning_rate": 2.36368357229156e-06, | |
| "loss": 0.0039, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 3.5495088549508855, | |
| "grad_norm": 0.006563331466168165, | |
| "learning_rate": 2.34668693336207e-06, | |
| "loss": 0.0035, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 3.5552448555244855, | |
| "grad_norm": 0.024284416809678078, | |
| "learning_rate": 2.3297328663519044e-06, | |
| "loss": 0.0117, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 3.5552448555244855, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07584439218044281, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.3203, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.628, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.337, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 3.5609808560980856, | |
| "grad_norm": 0.011505261063575745, | |
| "learning_rate": 2.3128216432859435e-06, | |
| "loss": 0.0192, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 3.5667168566716856, | |
| "grad_norm": 0.01848476566374302, | |
| "learning_rate": 2.2959535355016436e-06, | |
| "loss": 0.0003, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 3.5724528572452856, | |
| "grad_norm": 0.007101431954652071, | |
| "learning_rate": 2.27912881364469e-06, | |
| "loss": 0.0098, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 3.5781888578188856, | |
| "grad_norm": 0.9875366687774658, | |
| "learning_rate": 2.2623477476646447e-06, | |
| "loss": 0.0014, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 3.5839248583924856, | |
| "grad_norm": 0.07821710407733917, | |
| "learning_rate": 2.2456106068106206e-06, | |
| "loss": 0.0007, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 3.5839248583924856, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08138631284236908, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2288, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.76, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.404, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 3.589660858966086, | |
| "grad_norm": 0.8070231676101685, | |
| "learning_rate": 2.2289176596269545e-06, | |
| "loss": 0.0015, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 3.5953968595396857, | |
| "grad_norm": 0.0035041391383856535, | |
| "learning_rate": 2.2122691739489143e-06, | |
| "loss": 0.0021, | |
| "step": 6270 | |
| }, | |
| { | |
| "epoch": 3.601132860113286, | |
| "grad_norm": 0.013247767463326454, | |
| "learning_rate": 2.195665416898381e-06, | |
| "loss": 0.0026, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 3.606868860686886, | |
| "grad_norm": 3.3201675415039062, | |
| "learning_rate": 2.179106654879581e-06, | |
| "loss": 0.0236, | |
| "step": 6290 | |
| }, | |
| { | |
| "epoch": 3.612604861260486, | |
| "grad_norm": 0.00799557100981474, | |
| "learning_rate": 2.1625931535747964e-06, | |
| "loss": 0.0078, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 3.612604861260486, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07524015754461288, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.3312, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.613, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.33, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 3.618340861834086, | |
| "grad_norm": 0.796405553817749, | |
| "learning_rate": 2.1461251779401175e-06, | |
| "loss": 0.0028, | |
| "step": 6310 | |
| }, | |
| { | |
| "epoch": 3.6240768624076862, | |
| "grad_norm": 0.0037477288860827684, | |
| "learning_rate": 2.1297029922011775e-06, | |
| "loss": 0.0093, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 3.6298128629812862, | |
| "grad_norm": 0.07188570499420166, | |
| "learning_rate": 2.113326859848919e-06, | |
| "loss": 0.0031, | |
| "step": 6330 | |
| }, | |
| { | |
| "epoch": 3.6355488635548863, | |
| "grad_norm": 2.5588748455047607, | |
| "learning_rate": 2.0969970436353725e-06, | |
| "loss": 0.0074, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 3.6412848641284863, | |
| "grad_norm": 0.008328588679432869, | |
| "learning_rate": 2.080713805569427e-06, | |
| "loss": 0.0025, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 3.6412848641284863, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.07739181071519852, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2601, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.715, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.381, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 3.6470208647020863, | |
| "grad_norm": 0.0075204516761004925, | |
| "learning_rate": 2.0644774069126406e-06, | |
| "loss": 0.0109, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 3.6527568652756868, | |
| "grad_norm": 0.003392142942175269, | |
| "learning_rate": 2.048288108175036e-06, | |
| "loss": 0.0091, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 3.6584928658492863, | |
| "grad_norm": 0.00919413473457098, | |
| "learning_rate": 2.0321461691109288e-06, | |
| "loss": 0.007, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 3.664228866422887, | |
| "grad_norm": 0.020889578387141228, | |
| "learning_rate": 2.016051848714758e-06, | |
| "loss": 0.0012, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 3.669964866996487, | |
| "grad_norm": 0.09611943364143372, | |
| "learning_rate": 2.000005405216931e-06, | |
| "loss": 0.0011, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 3.669964866996487, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08481152355670929, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2176, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.776, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.412, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 3.675700867570087, | |
| "grad_norm": 0.004153054673224688, | |
| "learning_rate": 1.984007096079676e-06, | |
| "loss": 0.0175, | |
| "step": 6410 | |
| }, | |
| { | |
| "epoch": 3.681436868143687, | |
| "grad_norm": 0.0063193319365382195, | |
| "learning_rate": 1.968057177992915e-06, | |
| "loss": 0.0018, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 3.687172868717287, | |
| "grad_norm": 0.05629734694957733, | |
| "learning_rate": 1.9521559068701447e-06, | |
| "loss": 0.0005, | |
| "step": 6430 | |
| }, | |
| { | |
| "epoch": 3.692908869290887, | |
| "grad_norm": 0.042965829372406006, | |
| "learning_rate": 1.9363035378443313e-06, | |
| "loss": 0.0017, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 3.698644869864487, | |
| "grad_norm": 0.029728276655077934, | |
| "learning_rate": 1.9205003252638176e-06, | |
| "loss": 0.0041, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 3.698644869864487, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08883155882358551, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1133, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.928, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.488, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 3.704380870438087, | |
| "grad_norm": 0.014888194389641285, | |
| "learning_rate": 1.904746522688236e-06, | |
| "loss": 0.0018, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 3.710116871011687, | |
| "grad_norm": 4.221490859985352, | |
| "learning_rate": 1.8890423828844462e-06, | |
| "loss": 0.0047, | |
| "step": 6470 | |
| }, | |
| { | |
| "epoch": 3.7158528715852874, | |
| "grad_norm": 2.9691474437713623, | |
| "learning_rate": 1.873388157822477e-06, | |
| "loss": 0.0087, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 3.721588872158887, | |
| "grad_norm": 0.1171032041311264, | |
| "learning_rate": 1.8577840986714868e-06, | |
| "loss": 0.0019, | |
| "step": 6490 | |
| }, | |
| { | |
| "epoch": 3.7273248727324875, | |
| "grad_norm": 0.006826687604188919, | |
| "learning_rate": 1.8422304557957315e-06, | |
| "loss": 0.0123, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 3.7273248727324875, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08897148817777634, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2955, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.664, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.355, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 3.7330608733060875, | |
| "grad_norm": 0.022587748244404793, | |
| "learning_rate": 1.8267274787505446e-06, | |
| "loss": 0.0087, | |
| "step": 6510 | |
| }, | |
| { | |
| "epoch": 3.7387968738796875, | |
| "grad_norm": 0.5572834014892578, | |
| "learning_rate": 1.8112754162783336e-06, | |
| "loss": 0.0084, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 3.7445328744532875, | |
| "grad_norm": 0.015338007360696793, | |
| "learning_rate": 1.7958745163045987e-06, | |
| "loss": 0.0085, | |
| "step": 6530 | |
| }, | |
| { | |
| "epoch": 3.7502688750268875, | |
| "grad_norm": 0.004370348062366247, | |
| "learning_rate": 1.7805250259339379e-06, | |
| "loss": 0.0075, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 3.7560048756004876, | |
| "grad_norm": 0.006390043999999762, | |
| "learning_rate": 1.765227191446101e-06, | |
| "loss": 0.0074, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 3.7560048756004876, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08355527371168137, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1302, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.904, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.475, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 3.7617408761740876, | |
| "grad_norm": 0.09934690594673157, | |
| "learning_rate": 1.7499812582920222e-06, | |
| "loss": 0.0038, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 3.7674768767476876, | |
| "grad_norm": 0.0005312475841492414, | |
| "learning_rate": 1.734787471089887e-06, | |
| "loss": 0.005, | |
| "step": 6570 | |
| }, | |
| { | |
| "epoch": 3.7732128773212876, | |
| "grad_norm": 0.5943177938461304, | |
| "learning_rate": 1.7196460736212167e-06, | |
| "loss": 0.0111, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 3.7789488778948876, | |
| "grad_norm": 0.03382878378033638, | |
| "learning_rate": 1.7045573088269408e-06, | |
| "loss": 0.0022, | |
| "step": 6590 | |
| }, | |
| { | |
| "epoch": 3.7846848784684877, | |
| "grad_norm": 0.018695516511797905, | |
| "learning_rate": 1.6895214188035125e-06, | |
| "loss": 0.0038, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 3.7846848784684877, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.0897902175784111, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2015, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.8, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.423, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 3.790420879042088, | |
| "grad_norm": 0.03567569702863693, | |
| "learning_rate": 1.6745386447990153e-06, | |
| "loss": 0.0149, | |
| "step": 6610 | |
| }, | |
| { | |
| "epoch": 3.7961568796156877, | |
| "grad_norm": 0.005937593523412943, | |
| "learning_rate": 1.6596092272093e-06, | |
| "loss": 0.0057, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 3.801892880189288, | |
| "grad_norm": 0.001366928219795227, | |
| "learning_rate": 1.6447334055741198e-06, | |
| "loss": 0.0019, | |
| "step": 6630 | |
| }, | |
| { | |
| "epoch": 3.807628880762888, | |
| "grad_norm": 0.017239965498447418, | |
| "learning_rate": 1.6299114185732918e-06, | |
| "loss": 0.0068, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 3.813364881336488, | |
| "grad_norm": 0.09475360810756683, | |
| "learning_rate": 1.6151435040228663e-06, | |
| "loss": 0.0084, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 3.813364881336488, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09376490861177444, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2936, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.667, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.357, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 3.8191008819100882, | |
| "grad_norm": 0.004428621847182512, | |
| "learning_rate": 1.6004298988713153e-06, | |
| "loss": 0.0003, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 3.8248368824836882, | |
| "grad_norm": 0.0019930372945964336, | |
| "learning_rate": 1.5857708391957194e-06, | |
| "loss": 0.0012, | |
| "step": 6670 | |
| }, | |
| { | |
| "epoch": 3.8305728830572883, | |
| "grad_norm": 0.24852816760540009, | |
| "learning_rate": 1.571166560197991e-06, | |
| "loss": 0.0018, | |
| "step": 6680 | |
| }, | |
| { | |
| "epoch": 3.8363088836308883, | |
| "grad_norm": 0.05106634274125099, | |
| "learning_rate": 1.5566172962010944e-06, | |
| "loss": 0.0088, | |
| "step": 6690 | |
| }, | |
| { | |
| "epoch": 3.8420448842044883, | |
| "grad_norm": 0.957535982131958, | |
| "learning_rate": 1.542123280645292e-06, | |
| "loss": 0.0013, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 3.8420448842044883, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09245079010725021, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1154, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.925, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.486, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 3.8477808847780883, | |
| "grad_norm": 0.022298647090792656, | |
| "learning_rate": 1.527684746084394e-06, | |
| "loss": 0.0103, | |
| "step": 6710 | |
| }, | |
| { | |
| "epoch": 3.853516885351689, | |
| "grad_norm": 0.21422079205513, | |
| "learning_rate": 1.5133019241820257e-06, | |
| "loss": 0.0341, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 3.8592528859252884, | |
| "grad_norm": 0.08396855741739273, | |
| "learning_rate": 1.4989750457079156e-06, | |
| "loss": 0.0033, | |
| "step": 6730 | |
| }, | |
| { | |
| "epoch": 3.864988886498889, | |
| "grad_norm": 0.12731043994426727, | |
| "learning_rate": 1.484704340534187e-06, | |
| "loss": 0.0028, | |
| "step": 6740 | |
| }, | |
| { | |
| "epoch": 3.870724887072489, | |
| "grad_norm": 0.4012109637260437, | |
| "learning_rate": 1.4704900376316773e-06, | |
| "loss": 0.0008, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 3.870724887072489, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09105575829744339, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2863, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.677, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.362, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 3.876460887646089, | |
| "grad_norm": 0.4341152012348175, | |
| "learning_rate": 1.4563323650662586e-06, | |
| "loss": 0.0013, | |
| "step": 6760 | |
| }, | |
| { | |
| "epoch": 3.882196888219689, | |
| "grad_norm": 0.14372624456882477, | |
| "learning_rate": 1.4422315499951783e-06, | |
| "loss": 0.0059, | |
| "step": 6770 | |
| }, | |
| { | |
| "epoch": 3.887932888793289, | |
| "grad_norm": 0.010315956547856331, | |
| "learning_rate": 1.4281878186634157e-06, | |
| "loss": 0.01, | |
| "step": 6780 | |
| }, | |
| { | |
| "epoch": 3.893668889366889, | |
| "grad_norm": 0.1287292093038559, | |
| "learning_rate": 1.4142013964000513e-06, | |
| "loss": 0.01, | |
| "step": 6790 | |
| }, | |
| { | |
| "epoch": 3.899404889940489, | |
| "grad_norm": 0.4034357964992523, | |
| "learning_rate": 1.400272507614655e-06, | |
| "loss": 0.0012, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 3.899404889940489, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09034840762615204, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1486, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.877, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.462, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 3.905140890514089, | |
| "grad_norm": 0.15965530276298523, | |
| "learning_rate": 1.3864013757936828e-06, | |
| "loss": 0.0009, | |
| "step": 6810 | |
| }, | |
| { | |
| "epoch": 3.910876891087689, | |
| "grad_norm": 0.07124567031860352, | |
| "learning_rate": 1.3725882234968879e-06, | |
| "loss": 0.0118, | |
| "step": 6820 | |
| }, | |
| { | |
| "epoch": 3.916612891661289, | |
| "grad_norm": 0.022812863811850548, | |
| "learning_rate": 1.3588332723537523e-06, | |
| "loss": 0.0133, | |
| "step": 6830 | |
| }, | |
| { | |
| "epoch": 3.922348892234889, | |
| "grad_norm": 0.0023223236203193665, | |
| "learning_rate": 1.345136743059936e-06, | |
| "loss": 0.013, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 3.9280848928084895, | |
| "grad_norm": 0.08886972069740295, | |
| "learning_rate": 1.3314988553737256e-06, | |
| "loss": 0.0028, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 3.9280848928084895, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.0912175104022026, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2896, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.672, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.36, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 3.933820893382089, | |
| "grad_norm": 0.022729717195034027, | |
| "learning_rate": 1.3179198281125188e-06, | |
| "loss": 0.0069, | |
| "step": 6860 | |
| }, | |
| { | |
| "epoch": 3.9395568939556895, | |
| "grad_norm": 2.09700083732605, | |
| "learning_rate": 1.3043998791493034e-06, | |
| "loss": 0.006, | |
| "step": 6870 | |
| }, | |
| { | |
| "epoch": 3.9452928945292896, | |
| "grad_norm": 0.026967119425535202, | |
| "learning_rate": 1.290939225409173e-06, | |
| "loss": 0.0018, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 3.9510288951028896, | |
| "grad_norm": 0.010416003875434399, | |
| "learning_rate": 1.277538082865835e-06, | |
| "loss": 0.012, | |
| "step": 6890 | |
| }, | |
| { | |
| "epoch": 3.9567648956764896, | |
| "grad_norm": 0.04778837040066719, | |
| "learning_rate": 1.2641966665381517e-06, | |
| "loss": 0.0117, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 3.9567648956764896, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09045156091451645, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1714, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.844, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.445, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 3.9625008962500896, | |
| "grad_norm": 0.0013466872042044997, | |
| "learning_rate": 1.2509151904866922e-06, | |
| "loss": 0.0012, | |
| "step": 6910 | |
| }, | |
| { | |
| "epoch": 3.9682368968236896, | |
| "grad_norm": 0.057207245379686356, | |
| "learning_rate": 1.2376938678102913e-06, | |
| "loss": 0.0172, | |
| "step": 6920 | |
| }, | |
| { | |
| "epoch": 3.9739728973972896, | |
| "grad_norm": 2.2670977115631104, | |
| "learning_rate": 1.2245329106426367e-06, | |
| "loss": 0.013, | |
| "step": 6930 | |
| }, | |
| { | |
| "epoch": 3.9797088979708897, | |
| "grad_norm": 0.0011907550506293774, | |
| "learning_rate": 1.2114325301488616e-06, | |
| "loss": 0.0117, | |
| "step": 6940 | |
| }, | |
| { | |
| "epoch": 3.9854448985444897, | |
| "grad_norm": 0.023963019251823425, | |
| "learning_rate": 1.1983929365221548e-06, | |
| "loss": 0.0096, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 3.9854448985444897, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08352689445018768, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2306, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.757, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.402, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 3.99118089911809, | |
| "grad_norm": 0.02613544650375843, | |
| "learning_rate": 1.1854143389803962e-06, | |
| "loss": 0.0104, | |
| "step": 6960 | |
| }, | |
| { | |
| "epoch": 3.9969168996916897, | |
| "grad_norm": 0.004353563766926527, | |
| "learning_rate": 1.1724969457627928e-06, | |
| "loss": 0.0004, | |
| "step": 6970 | |
| }, | |
| { | |
| "epoch": 4.00229440022944, | |
| "grad_norm": 0.013296003453433514, | |
| "learning_rate": 1.1596409641265376e-06, | |
| "loss": 0.0053, | |
| "step": 6980 | |
| }, | |
| { | |
| "epoch": 4.00803040080304, | |
| "grad_norm": 0.01596342958509922, | |
| "learning_rate": 1.146846600343488e-06, | |
| "loss": 0.0013, | |
| "step": 6990 | |
| }, | |
| { | |
| "epoch": 4.01376640137664, | |
| "grad_norm": 0.003088761121034622, | |
| "learning_rate": 1.1341140596968525e-06, | |
| "loss": 0.0005, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 4.01376640137664, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08817622810602188, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2983, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.66, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.353, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 4.01950240195024, | |
| "grad_norm": 0.0804172232747078, | |
| "learning_rate": 1.1214435464779006e-06, | |
| "loss": 0.0003, | |
| "step": 7010 | |
| }, | |
| { | |
| "epoch": 4.02523840252384, | |
| "grad_norm": 0.13886569440364838, | |
| "learning_rate": 1.1088352639826844e-06, | |
| "loss": 0.0006, | |
| "step": 7020 | |
| }, | |
| { | |
| "epoch": 4.03097440309744, | |
| "grad_norm": 0.029498988762497902, | |
| "learning_rate": 1.0962894145087715e-06, | |
| "loss": 0.0058, | |
| "step": 7030 | |
| }, | |
| { | |
| "epoch": 4.03671040367104, | |
| "grad_norm": 3.7734415531158447, | |
| "learning_rate": 1.0838061993520072e-06, | |
| "loss": 0.0125, | |
| "step": 7040 | |
| }, | |
| { | |
| "epoch": 4.04244640424464, | |
| "grad_norm": 0.12142331153154373, | |
| "learning_rate": 1.0713858188032755e-06, | |
| "loss": 0.0161, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 4.04244640424464, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09039638936519623, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2759, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.692, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.369, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 4.048182404818241, | |
| "grad_norm": 0.003838218515738845, | |
| "learning_rate": 1.0590284721452965e-06, | |
| "loss": 0.0076, | |
| "step": 7060 | |
| }, | |
| { | |
| "epoch": 4.0539184053918405, | |
| "grad_norm": 0.0031382606830447912, | |
| "learning_rate": 1.0467343576494215e-06, | |
| "loss": 0.0007, | |
| "step": 7070 | |
| }, | |
| { | |
| "epoch": 4.059654405965441, | |
| "grad_norm": 0.0018334095366299152, | |
| "learning_rate": 1.0345036725724517e-06, | |
| "loss": 0.0206, | |
| "step": 7080 | |
| }, | |
| { | |
| "epoch": 4.0653904065390405, | |
| "grad_norm": 0.06359425187110901, | |
| "learning_rate": 1.0223366131534746e-06, | |
| "loss": 0.0004, | |
| "step": 7090 | |
| }, | |
| { | |
| "epoch": 4.071126407112641, | |
| "grad_norm": 0.08629762381315231, | |
| "learning_rate": 1.010233374610719e-06, | |
| "loss": 0.0003, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 4.071126407112641, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09012720733880997, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2717, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.698, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.373, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 4.0768624076862405, | |
| "grad_norm": 0.16138513386249542, | |
| "learning_rate": 9.981941511384152e-07, | |
| "loss": 0.0042, | |
| "step": 7110 | |
| }, | |
| { | |
| "epoch": 4.082598408259841, | |
| "grad_norm": 2.28838849067688, | |
| "learning_rate": 9.862191359036883e-07, | |
| "loss": 0.0019, | |
| "step": 7120 | |
| }, | |
| { | |
| "epoch": 4.088334408833441, | |
| "grad_norm": 0.003568111453205347, | |
| "learning_rate": 9.743085210434505e-07, | |
| "loss": 0.0136, | |
| "step": 7130 | |
| }, | |
| { | |
| "epoch": 4.094070409407041, | |
| "grad_norm": 2.805901050567627, | |
| "learning_rate": 9.624624976613217e-07, | |
| "loss": 0.0083, | |
| "step": 7140 | |
| }, | |
| { | |
| "epoch": 4.099806409980641, | |
| "grad_norm": 0.009448690339922905, | |
| "learning_rate": 9.506812558245665e-07, | |
| "loss": 0.0004, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 4.099806409980641, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09947969764471054, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2914, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.67, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.358, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 4.105542410554241, | |
| "grad_norm": 0.007924864068627357, | |
| "learning_rate": 9.389649845610376e-07, | |
| "loss": 0.0083, | |
| "step": 7160 | |
| }, | |
| { | |
| "epoch": 4.1112784111278415, | |
| "grad_norm": 0.03764890506863594, | |
| "learning_rate": 9.273138718561519e-07, | |
| "loss": 0.0003, | |
| "step": 7170 | |
| }, | |
| { | |
| "epoch": 4.117014411701441, | |
| "grad_norm": 0.015703987330198288, | |
| "learning_rate": 9.157281046498628e-07, | |
| "loss": 0.0141, | |
| "step": 7180 | |
| }, | |
| { | |
| "epoch": 4.122750412275042, | |
| "grad_norm": 0.04593523591756821, | |
| "learning_rate": 9.042078688336737e-07, | |
| "loss": 0.0005, | |
| "step": 7190 | |
| }, | |
| { | |
| "epoch": 4.128486412848641, | |
| "grad_norm": 0.08672873675823212, | |
| "learning_rate": 8.927533492476437e-07, | |
| "loss": 0.0088, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 4.128486412848641, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09371308237314224, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2865, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.677, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.362, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 4.134222413422242, | |
| "grad_norm": 0.0006487391074188054, | |
| "learning_rate": 8.813647296774291e-07, | |
| "loss": 0.0033, | |
| "step": 7210 | |
| }, | |
| { | |
| "epoch": 4.139958413995841, | |
| "grad_norm": 2.7435028553009033, | |
| "learning_rate": 8.700421928513353e-07, | |
| "loss": 0.0104, | |
| "step": 7220 | |
| }, | |
| { | |
| "epoch": 4.145694414569442, | |
| "grad_norm": 0.12825919687747955, | |
| "learning_rate": 8.58785920437376e-07, | |
| "loss": 0.0131, | |
| "step": 7230 | |
| }, | |
| { | |
| "epoch": 4.151430415143041, | |
| "grad_norm": 0.019473344087600708, | |
| "learning_rate": 8.475960930403721e-07, | |
| "loss": 0.0123, | |
| "step": 7240 | |
| }, | |
| { | |
| "epoch": 4.157166415716642, | |
| "grad_norm": 0.019103556871414185, | |
| "learning_rate": 8.364728901990404e-07, | |
| "loss": 0.0161, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 4.157166415716642, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08745884150266647, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.3788, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.544, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.296, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 4.162902416290241, | |
| "grad_norm": 0.001977517269551754, | |
| "learning_rate": 8.254164903831203e-07, | |
| "loss": 0.009, | |
| "step": 7260 | |
| }, | |
| { | |
| "epoch": 4.168638416863842, | |
| "grad_norm": 0.00919981300830841, | |
| "learning_rate": 8.144270709905117e-07, | |
| "loss": 0.0185, | |
| "step": 7270 | |
| }, | |
| { | |
| "epoch": 4.174374417437441, | |
| "grad_norm": 0.007627399172633886, | |
| "learning_rate": 8.035048083444241e-07, | |
| "loss": 0.0013, | |
| "step": 7280 | |
| }, | |
| { | |
| "epoch": 4.180110418011042, | |
| "grad_norm": 0.006586376577615738, | |
| "learning_rate": 7.926498776905495e-07, | |
| "loss": 0.0013, | |
| "step": 7290 | |
| }, | |
| { | |
| "epoch": 4.185846418584642, | |
| "grad_norm": 0.0025622497778385878, | |
| "learning_rate": 7.818624531942493e-07, | |
| "loss": 0.0003, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 4.185846418584642, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08901920914649963, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2083, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.79, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.419, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 4.191582419158242, | |
| "grad_norm": 0.07510726898908615, | |
| "learning_rate": 7.711427079377614e-07, | |
| "loss": 0.0055, | |
| "step": 7310 | |
| }, | |
| { | |
| "epoch": 4.197318419731842, | |
| "grad_norm": 0.002239730441942811, | |
| "learning_rate": 7.604908139174255e-07, | |
| "loss": 0.0007, | |
| "step": 7320 | |
| }, | |
| { | |
| "epoch": 4.203054420305442, | |
| "grad_norm": 0.0018331691389903426, | |
| "learning_rate": 7.499069420409183e-07, | |
| "loss": 0.0104, | |
| "step": 7330 | |
| }, | |
| { | |
| "epoch": 4.208790420879042, | |
| "grad_norm": 0.06228185445070267, | |
| "learning_rate": 7.393912621245142e-07, | |
| "loss": 0.015, | |
| "step": 7340 | |
| }, | |
| { | |
| "epoch": 4.214526421452642, | |
| "grad_norm": 0.0020015144255012274, | |
| "learning_rate": 7.289439428903599e-07, | |
| "loss": 0.0015, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 4.214526421452642, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08976524323225021, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.3011, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.656, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.351, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 4.220262422026242, | |
| "grad_norm": 3.2421772480010986, | |
| "learning_rate": 7.185651519637659e-07, | |
| "loss": 0.0083, | |
| "step": 7360 | |
| }, | |
| { | |
| "epoch": 4.225998422599842, | |
| "grad_norm": 0.0900789201259613, | |
| "learning_rate": 7.082550558705225e-07, | |
| "loss": 0.0084, | |
| "step": 7370 | |
| }, | |
| { | |
| "epoch": 4.231734423173442, | |
| "grad_norm": 0.0065153795294463634, | |
| "learning_rate": 6.980138200342229e-07, | |
| "loss": 0.0003, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 4.237470423747042, | |
| "grad_norm": 0.041670165956020355, | |
| "learning_rate": 6.878416087736079e-07, | |
| "loss": 0.0006, | |
| "step": 7390 | |
| }, | |
| { | |
| "epoch": 4.243206424320642, | |
| "grad_norm": 0.005517491605132818, | |
| "learning_rate": 6.777385852999319e-07, | |
| "loss": 0.0005, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 4.243206424320642, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08851981908082962, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2967, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.662, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.355, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 4.248942424894243, | |
| "grad_norm": 0.007675142493098974, | |
| "learning_rate": 6.67704911714348e-07, | |
| "loss": 0.0127, | |
| "step": 7410 | |
| }, | |
| { | |
| "epoch": 4.2546784254678425, | |
| "grad_norm": 2.6328227519989014, | |
| "learning_rate": 6.577407490052978e-07, | |
| "loss": 0.0152, | |
| "step": 7420 | |
| }, | |
| { | |
| "epoch": 4.260414426041443, | |
| "grad_norm": 0.14412537217140198, | |
| "learning_rate": 6.478462570459388e-07, | |
| "loss": 0.0012, | |
| "step": 7430 | |
| }, | |
| { | |
| "epoch": 4.2661504266150425, | |
| "grad_norm": 3.322977066040039, | |
| "learning_rate": 6.380215945915702e-07, | |
| "loss": 0.0065, | |
| "step": 7440 | |
| }, | |
| { | |
| "epoch": 4.271886427188643, | |
| "grad_norm": 0.0006975011201575398, | |
| "learning_rate": 6.282669192770896e-07, | |
| "loss": 0.0008, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 4.271886427188643, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09144297242164612, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2868, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.676, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.362, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 4.2776224277622426, | |
| "grad_norm": 0.04776507988572121, | |
| "learning_rate": 6.185823876144675e-07, | |
| "loss": 0.0052, | |
| "step": 7460 | |
| }, | |
| { | |
| "epoch": 4.283358428335843, | |
| "grad_norm": 4.260890483856201, | |
| "learning_rate": 6.089681549902287e-07, | |
| "loss": 0.0172, | |
| "step": 7470 | |
| }, | |
| { | |
| "epoch": 4.289094428909443, | |
| "grad_norm": 0.0017789709381759167, | |
| "learning_rate": 5.994243756629664e-07, | |
| "loss": 0.0045, | |
| "step": 7480 | |
| }, | |
| { | |
| "epoch": 4.294830429483043, | |
| "grad_norm": 0.020361248403787613, | |
| "learning_rate": 5.89951202760859e-07, | |
| "loss": 0.0013, | |
| "step": 7490 | |
| }, | |
| { | |
| "epoch": 4.300566430056643, | |
| "grad_norm": 0.01590687222778797, | |
| "learning_rate": 5.805487882792227e-07, | |
| "loss": 0.0079, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 4.300566430056643, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08803683519363403, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2002, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.802, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.424, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 4.306302430630243, | |
| "grad_norm": 0.016193749383091927, | |
| "learning_rate": 5.712172830780649e-07, | |
| "loss": 0.0032, | |
| "step": 7510 | |
| }, | |
| { | |
| "epoch": 4.312038431203844, | |
| "grad_norm": 0.2794877588748932, | |
| "learning_rate": 5.619568368796657e-07, | |
| "loss": 0.0046, | |
| "step": 7520 | |
| }, | |
| { | |
| "epoch": 4.317774431777443, | |
| "grad_norm": 0.3345611095428467, | |
| "learning_rate": 5.527675982661801e-07, | |
| "loss": 0.0028, | |
| "step": 7530 | |
| }, | |
| { | |
| "epoch": 4.323510432351044, | |
| "grad_norm": 0.10426267981529236, | |
| "learning_rate": 5.43649714677249e-07, | |
| "loss": 0.0013, | |
| "step": 7540 | |
| }, | |
| { | |
| "epoch": 4.329246432924643, | |
| "grad_norm": 0.0005553574301302433, | |
| "learning_rate": 5.346033324076338e-07, | |
| "loss": 0.006, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 4.329246432924643, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.087630994617939, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2554, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.722, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.384, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 4.334982433498244, | |
| "grad_norm": 0.08077429980039597, | |
| "learning_rate": 5.256285966048719e-07, | |
| "loss": 0.0067, | |
| "step": 7560 | |
| }, | |
| { | |
| "epoch": 4.340718434071843, | |
| "grad_norm": 0.004632384981960058, | |
| "learning_rate": 5.167256512669444e-07, | |
| "loss": 0.0084, | |
| "step": 7570 | |
| }, | |
| { | |
| "epoch": 4.346454434645444, | |
| "grad_norm": 0.0067880041897296906, | |
| "learning_rate": 5.078946392399703e-07, | |
| "loss": 0.0021, | |
| "step": 7580 | |
| }, | |
| { | |
| "epoch": 4.352190435219043, | |
| "grad_norm": 0.001848329440690577, | |
| "learning_rate": 4.991357022159111e-07, | |
| "loss": 0.0007, | |
| "step": 7590 | |
| }, | |
| { | |
| "epoch": 4.357926435792644, | |
| "grad_norm": 0.09323499351739883, | |
| "learning_rate": 4.90448980730297e-07, | |
| "loss": 0.0024, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 4.357926435792644, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08750669658184052, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2509, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.728, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.388, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 4.363662436366243, | |
| "grad_norm": 0.0019721076823771, | |
| "learning_rate": 4.818346141599739e-07, | |
| "loss": 0.0037, | |
| "step": 7610 | |
| }, | |
| { | |
| "epoch": 4.369398436939844, | |
| "grad_norm": 0.26007330417633057, | |
| "learning_rate": 4.732927407208654e-07, | |
| "loss": 0.0011, | |
| "step": 7620 | |
| }, | |
| { | |
| "epoch": 4.375134437513443, | |
| "grad_norm": 3.1276211738586426, | |
| "learning_rate": 4.6482349746575783e-07, | |
| "loss": 0.0274, | |
| "step": 7630 | |
| }, | |
| { | |
| "epoch": 4.380870438087044, | |
| "grad_norm": 0.004567150957882404, | |
| "learning_rate": 4.564270202820997e-07, | |
| "loss": 0.0004, | |
| "step": 7640 | |
| }, | |
| { | |
| "epoch": 4.386606438660644, | |
| "grad_norm": 0.033358391374349594, | |
| "learning_rate": 4.4810344388982017e-07, | |
| "loss": 0.0078, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 4.386606438660644, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08857561647891998, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2607, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.714, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.381, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 4.392342439234244, | |
| "grad_norm": 0.3984158933162689, | |
| "learning_rate": 4.3985290183916893e-07, | |
| "loss": 0.003, | |
| "step": 7660 | |
| }, | |
| { | |
| "epoch": 4.398078439807844, | |
| "grad_norm": 0.09813853353261948, | |
| "learning_rate": 4.316755265085715e-07, | |
| "loss": 0.002, | |
| "step": 7670 | |
| }, | |
| { | |
| "epoch": 4.403814440381444, | |
| "grad_norm": 0.013608184643089771, | |
| "learning_rate": 4.2357144910251003e-07, | |
| "loss": 0.0007, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 4.409550440955044, | |
| "grad_norm": 0.012810665182769299, | |
| "learning_rate": 4.155407996494143e-07, | |
| "loss": 0.0003, | |
| "step": 7690 | |
| }, | |
| { | |
| "epoch": 4.415286441528644, | |
| "grad_norm": 0.5154911279678345, | |
| "learning_rate": 4.0758370699957416e-07, | |
| "loss": 0.0135, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 4.415286441528644, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08959878236055374, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2846, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.679, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.363, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 4.421022442102244, | |
| "grad_norm": 0.0007974316249601543, | |
| "learning_rate": 3.997002988230747e-07, | |
| "loss": 0.0026, | |
| "step": 7710 | |
| }, | |
| { | |
| "epoch": 4.426758442675844, | |
| "grad_norm": 0.032501526176929474, | |
| "learning_rate": 3.918907016077489e-07, | |
| "loss": 0.0051, | |
| "step": 7720 | |
| }, | |
| { | |
| "epoch": 4.432494443249444, | |
| "grad_norm": 0.2708737552165985, | |
| "learning_rate": 3.841550406571443e-07, | |
| "loss": 0.0009, | |
| "step": 7730 | |
| }, | |
| { | |
| "epoch": 4.438230443823044, | |
| "grad_norm": 0.6233472228050232, | |
| "learning_rate": 3.764934400885162e-07, | |
| "loss": 0.0012, | |
| "step": 7740 | |
| }, | |
| { | |
| "epoch": 4.4439664443966445, | |
| "grad_norm": 0.00045903949649073184, | |
| "learning_rate": 3.689060228308339e-07, | |
| "loss": 0.0016, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 4.4439664443966445, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08894706517457962, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.3241, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.623, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.335, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 4.449702444970244, | |
| "grad_norm": 0.02216159924864769, | |
| "learning_rate": 3.6139291062281036e-07, | |
| "loss": 0.0112, | |
| "step": 7760 | |
| }, | |
| { | |
| "epoch": 4.4554384455438445, | |
| "grad_norm": 0.028067264705896378, | |
| "learning_rate": 3.539542240109456e-07, | |
| "loss": 0.0044, | |
| "step": 7770 | |
| }, | |
| { | |
| "epoch": 4.461174446117445, | |
| "grad_norm": 3.8059170246124268, | |
| "learning_rate": 3.4659008234759597e-07, | |
| "loss": 0.0063, | |
| "step": 7780 | |
| }, | |
| { | |
| "epoch": 4.4669104466910445, | |
| "grad_norm": 0.0011888346634805202, | |
| "learning_rate": 3.3930060378905893e-07, | |
| "loss": 0.0003, | |
| "step": 7790 | |
| }, | |
| { | |
| "epoch": 4.472646447264645, | |
| "grad_norm": 0.0015550279058516026, | |
| "learning_rate": 3.320859052936731e-07, | |
| "loss": 0.0009, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 4.472646447264645, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09025800973176956, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1333, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.899, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.473, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 4.478382447838245, | |
| "grad_norm": 0.039686888456344604, | |
| "learning_rate": 3.249461026199485e-07, | |
| "loss": 0.0003, | |
| "step": 7810 | |
| }, | |
| { | |
| "epoch": 4.484118448411845, | |
| "grad_norm": 3.7747395038604736, | |
| "learning_rate": 3.1788131032470306e-07, | |
| "loss": 0.0124, | |
| "step": 7820 | |
| }, | |
| { | |
| "epoch": 4.489854448985445, | |
| "grad_norm": 0.000805154733825475, | |
| "learning_rate": 3.108916417612262e-07, | |
| "loss": 0.0031, | |
| "step": 7830 | |
| }, | |
| { | |
| "epoch": 4.495590449559045, | |
| "grad_norm": 0.0011434925254434347, | |
| "learning_rate": 3.039772090774634e-07, | |
| "loss": 0.0009, | |
| "step": 7840 | |
| }, | |
| { | |
| "epoch": 4.501326450132645, | |
| "grad_norm": 0.019072147086262703, | |
| "learning_rate": 2.97138123214214e-07, | |
| "loss": 0.0003, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 4.501326450132645, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09056304395198822, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.169, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.847, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.447, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 4.507062450706245, | |
| "grad_norm": 0.005647636018693447, | |
| "learning_rate": 2.903744939033504e-07, | |
| "loss": 0.0001, | |
| "step": 7860 | |
| }, | |
| { | |
| "epoch": 4.512798451279846, | |
| "grad_norm": 2.6264231204986572, | |
| "learning_rate": 2.836864296660574e-07, | |
| "loss": 0.0132, | |
| "step": 7870 | |
| }, | |
| { | |
| "epoch": 4.518534451853445, | |
| "grad_norm": 0.010306664742529392, | |
| "learning_rate": 2.770740378110942e-07, | |
| "loss": 0.0145, | |
| "step": 7880 | |
| }, | |
| { | |
| "epoch": 4.524270452427046, | |
| "grad_norm": 0.18294845521450043, | |
| "learning_rate": 2.7053742443307054e-07, | |
| "loss": 0.0041, | |
| "step": 7890 | |
| }, | |
| { | |
| "epoch": 4.530006453000645, | |
| "grad_norm": 0.017165735363960266, | |
| "learning_rate": 2.640766944107431e-07, | |
| "loss": 0.0007, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 4.530006453000645, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08944171667098999, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2567, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.72, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.383, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 4.535742453574246, | |
| "grad_norm": 0.04896865785121918, | |
| "learning_rate": 2.5769195140533556e-07, | |
| "loss": 0.0125, | |
| "step": 7910 | |
| }, | |
| { | |
| "epoch": 4.541478454147845, | |
| "grad_norm": 0.022854099050164223, | |
| "learning_rate": 2.5138329785887263e-07, | |
| "loss": 0.0005, | |
| "step": 7920 | |
| }, | |
| { | |
| "epoch": 4.547214454721446, | |
| "grad_norm": 0.0018585673533380032, | |
| "learning_rate": 2.4515083499253743e-07, | |
| "loss": 0.0016, | |
| "step": 7930 | |
| }, | |
| { | |
| "epoch": 4.552950455295045, | |
| "grad_norm": 0.003768256865441799, | |
| "learning_rate": 2.3899466280504936e-07, | |
| "loss": 0.0217, | |
| "step": 7940 | |
| }, | |
| { | |
| "epoch": 4.558686455868646, | |
| "grad_norm": 0.05598225072026253, | |
| "learning_rate": 2.3291488007105734e-07, | |
| "loss": 0.009, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 4.558686455868646, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09063292294740677, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1171, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.923, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.485, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 4.564422456442245, | |
| "grad_norm": 3.3265416622161865, | |
| "learning_rate": 2.269115843395553e-07, | |
| "loss": 0.0151, | |
| "step": 7960 | |
| }, | |
| { | |
| "epoch": 4.570158457015846, | |
| "grad_norm": 0.0049064746126532555, | |
| "learning_rate": 2.2098487193231543e-07, | |
| "loss": 0.0208, | |
| "step": 7970 | |
| }, | |
| { | |
| "epoch": 4.575894457589445, | |
| "grad_norm": 0.0036308506969362497, | |
| "learning_rate": 2.1513483794234847e-07, | |
| "loss": 0.0068, | |
| "step": 7980 | |
| }, | |
| { | |
| "epoch": 4.581630458163046, | |
| "grad_norm": 0.0013542186934500933, | |
| "learning_rate": 2.093615762323703e-07, | |
| "loss": 0.0022, | |
| "step": 7990 | |
| }, | |
| { | |
| "epoch": 4.587366458736646, | |
| "grad_norm": 2.7696778774261475, | |
| "learning_rate": 2.0366517943330278e-07, | |
| "loss": 0.0014, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 4.587366458736646, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.08961217105388641, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.3584, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.574, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.31, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 4.593102459310246, | |
| "grad_norm": 0.0006430260837078094, | |
| "learning_rate": 1.9804573894278311e-07, | |
| "loss": 0.0137, | |
| "step": 8010 | |
| }, | |
| { | |
| "epoch": 4.598838459883846, | |
| "grad_norm": 0.49191415309906006, | |
| "learning_rate": 1.925033449236974e-07, | |
| "loss": 0.0127, | |
| "step": 8020 | |
| }, | |
| { | |
| "epoch": 4.604574460457446, | |
| "grad_norm": 0.4231385886669159, | |
| "learning_rate": 1.8703808630273768e-07, | |
| "loss": 0.0108, | |
| "step": 8030 | |
| }, | |
| { | |
| "epoch": 4.610310461031046, | |
| "grad_norm": 5.982341289520264, | |
| "learning_rate": 1.8165005076897114e-07, | |
| "loss": 0.0034, | |
| "step": 8040 | |
| }, | |
| { | |
| "epoch": 4.616046461604646, | |
| "grad_norm": 0.01885843276977539, | |
| "learning_rate": 1.7633932477243599e-07, | |
| "loss": 0.0227, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 4.616046461604646, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09145788848400116, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2115, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.785, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.416, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 4.621782462178246, | |
| "grad_norm": 0.004457383882254362, | |
| "learning_rate": 1.7110599352275104e-07, | |
| "loss": 0.0066, | |
| "step": 8060 | |
| }, | |
| { | |
| "epoch": 4.627518462751846, | |
| "grad_norm": 0.005887819454073906, | |
| "learning_rate": 1.6595014098775342e-07, | |
| "loss": 0.004, | |
| "step": 8070 | |
| }, | |
| { | |
| "epoch": 4.633254463325446, | |
| "grad_norm": 0.023113150149583817, | |
| "learning_rate": 1.6087184989214687e-07, | |
| "loss": 0.0258, | |
| "step": 8080 | |
| }, | |
| { | |
| "epoch": 4.638990463899046, | |
| "grad_norm": 0.020028818398714066, | |
| "learning_rate": 1.5587120171617509e-07, | |
| "loss": 0.0005, | |
| "step": 8090 | |
| }, | |
| { | |
| "epoch": 4.6447264644726465, | |
| "grad_norm": 0.01912406086921692, | |
| "learning_rate": 1.509482766943182e-07, | |
| "loss": 0.0006, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 4.6447264644726465, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09051110595464706, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.4316, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.469, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.258, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 4.650462465046246, | |
| "grad_norm": 3.4732401371002197, | |
| "learning_rate": 1.4610315381400175e-07, | |
| "loss": 0.0149, | |
| "step": 8110 | |
| }, | |
| { | |
| "epoch": 4.6561984656198465, | |
| "grad_norm": 0.0006817178218625486, | |
| "learning_rate": 1.4133591081432806e-07, | |
| "loss": 0.0008, | |
| "step": 8120 | |
| }, | |
| { | |
| "epoch": 4.661934466193447, | |
| "grad_norm": 0.0062561482191085815, | |
| "learning_rate": 1.366466241848341e-07, | |
| "loss": 0.0004, | |
| "step": 8130 | |
| }, | |
| { | |
| "epoch": 4.6676704667670466, | |
| "grad_norm": 0.022234557196497917, | |
| "learning_rate": 1.3203536916425842e-07, | |
| "loss": 0.001, | |
| "step": 8140 | |
| }, | |
| { | |
| "epoch": 4.673406467340647, | |
| "grad_norm": 4.0328369140625, | |
| "learning_rate": 1.275022197393394e-07, | |
| "loss": 0.0047, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 4.673406467340647, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09013617783784866, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1698, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.846, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.447, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 4.679142467914247, | |
| "grad_norm": 0.15110693871974945, | |
| "learning_rate": 1.230472486436246e-07, | |
| "loss": 0.0004, | |
| "step": 8160 | |
| }, | |
| { | |
| "epoch": 4.684878468487847, | |
| "grad_norm": 0.009227008558809757, | |
| "learning_rate": 1.1867052735630435e-07, | |
| "loss": 0.0014, | |
| "step": 8170 | |
| }, | |
| { | |
| "epoch": 4.690614469061447, | |
| "grad_norm": 0.2895458936691284, | |
| "learning_rate": 1.1437212610106496e-07, | |
| "loss": 0.0013, | |
| "step": 8180 | |
| }, | |
| { | |
| "epoch": 4.696350469635047, | |
| "grad_norm": 0.541077733039856, | |
| "learning_rate": 1.1015211384496238e-07, | |
| "loss": 0.0102, | |
| "step": 8190 | |
| }, | |
| { | |
| "epoch": 4.702086470208647, | |
| "grad_norm": 2.9748380184173584, | |
| "learning_rate": 1.0601055829731588e-07, | |
| "loss": 0.023, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 4.702086470208647, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09088833630084991, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2127, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.783, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.415, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 4.707822470782247, | |
| "grad_norm": 5.7777838706970215, | |
| "learning_rate": 1.0194752590862223e-07, | |
| "loss": 0.0119, | |
| "step": 8210 | |
| }, | |
| { | |
| "epoch": 4.713558471355848, | |
| "grad_norm": 0.06003111973404884, | |
| "learning_rate": 9.796308186948711e-08, | |
| "loss": 0.0007, | |
| "step": 8220 | |
| }, | |
| { | |
| "epoch": 4.719294471929447, | |
| "grad_norm": 0.005556735210120678, | |
| "learning_rate": 9.405729010958043e-08, | |
| "loss": 0.001, | |
| "step": 8230 | |
| }, | |
| { | |
| "epoch": 4.725030472503047, | |
| "grad_norm": 2.6498606204986572, | |
| "learning_rate": 9.023021329661152e-08, | |
| "loss": 0.0095, | |
| "step": 8240 | |
| }, | |
| { | |
| "epoch": 4.730766473076647, | |
| "grad_norm": 0.03512834012508392, | |
| "learning_rate": 8.648191283532337e-08, | |
| "loss": 0.0081, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 4.730766473076647, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.0907173827290535, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.316, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.634, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.341, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 4.736502473650248, | |
| "grad_norm": 0.009284062311053276, | |
| "learning_rate": 8.281244886650607e-08, | |
| "loss": 0.0051, | |
| "step": 8260 | |
| }, | |
| { | |
| "epoch": 4.742238474223847, | |
| "grad_norm": 0.02183857187628746, | |
| "learning_rate": 7.922188026603273e-08, | |
| "loss": 0.0022, | |
| "step": 8270 | |
| }, | |
| { | |
| "epoch": 4.747974474797448, | |
| "grad_norm": 0.0035564859863370657, | |
| "learning_rate": 7.571026464391451e-08, | |
| "loss": 0.0025, | |
| "step": 8280 | |
| }, | |
| { | |
| "epoch": 4.753710475371047, | |
| "grad_norm": 0.04188934341073036, | |
| "learning_rate": 7.227765834337874e-08, | |
| "loss": 0.0004, | |
| "step": 8290 | |
| }, | |
| { | |
| "epoch": 4.759446475944648, | |
| "grad_norm": 0.6253750324249268, | |
| "learning_rate": 6.892411643995955e-08, | |
| "loss": 0.0006, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 4.759446475944648, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.090791717171669, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.3622, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.568, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.307, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 4.765182476518247, | |
| "grad_norm": 0.3705517053604126, | |
| "learning_rate": 6.564969274061972e-08, | |
| "loss": 0.0042, | |
| "step": 8310 | |
| }, | |
| { | |
| "epoch": 4.770918477091848, | |
| "grad_norm": 0.0012910285731777549, | |
| "learning_rate": 6.245443978288413e-08, | |
| "loss": 0.0023, | |
| "step": 8320 | |
| }, | |
| { | |
| "epoch": 4.776654477665447, | |
| "grad_norm": 0.5175131559371948, | |
| "learning_rate": 5.933840883399766e-08, | |
| "loss": 0.0011, | |
| "step": 8330 | |
| }, | |
| { | |
| "epoch": 4.782390478239048, | |
| "grad_norm": 3.527125358581543, | |
| "learning_rate": 5.630164989010312e-08, | |
| "loss": 0.0023, | |
| "step": 8340 | |
| }, | |
| { | |
| "epoch": 4.788126478812648, | |
| "grad_norm": 0.3973446190357208, | |
| "learning_rate": 5.334421167543735e-08, | |
| "loss": 0.0026, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 4.788126478812648, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09068261086940765, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1697, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.846, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.447, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 4.793862479386248, | |
| "grad_norm": 0.003769023111090064, | |
| "learning_rate": 5.0466141641553056e-08, | |
| "loss": 0.0001, | |
| "step": 8360 | |
| }, | |
| { | |
| "epoch": 4.799598479959848, | |
| "grad_norm": 0.08687523007392883, | |
| "learning_rate": 4.766748596655268e-08, | |
| "loss": 0.0061, | |
| "step": 8370 | |
| }, | |
| { | |
| "epoch": 4.805334480533448, | |
| "grad_norm": 5.824708461761475, | |
| "learning_rate": 4.494828955435126e-08, | |
| "loss": 0.0047, | |
| "step": 8380 | |
| }, | |
| { | |
| "epoch": 4.811070481107048, | |
| "grad_norm": 2.9158382415771484, | |
| "learning_rate": 4.230859603395421e-08, | |
| "loss": 0.011, | |
| "step": 8390 | |
| }, | |
| { | |
| "epoch": 4.816806481680648, | |
| "grad_norm": 0.04298550263047218, | |
| "learning_rate": 3.974844775875786e-08, | |
| "loss": 0.0016, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 4.816806481680648, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09151580929756165, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2757, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.692, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.37, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 4.822542482254248, | |
| "grad_norm": 0.005364727228879929, | |
| "learning_rate": 3.726788580586893e-08, | |
| "loss": 0.0099, | |
| "step": 8410 | |
| }, | |
| { | |
| "epoch": 4.828278482827848, | |
| "grad_norm": 0.0018757123034447432, | |
| "learning_rate": 3.4866949975448374e-08, | |
| "loss": 0.0009, | |
| "step": 8420 | |
| }, | |
| { | |
| "epoch": 4.8340144834014485, | |
| "grad_norm": 0.008776834234595299, | |
| "learning_rate": 3.254567879006798e-08, | |
| "loss": 0.0068, | |
| "step": 8430 | |
| }, | |
| { | |
| "epoch": 4.839750483975048, | |
| "grad_norm": 0.009764501824975014, | |
| "learning_rate": 3.030410949409701e-08, | |
| "loss": 0.0088, | |
| "step": 8440 | |
| }, | |
| { | |
| "epoch": 4.8454864845486485, | |
| "grad_norm": 4.281154632568359, | |
| "learning_rate": 2.8142278053101545e-08, | |
| "loss": 0.0135, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 4.8454864845486485, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.0919208973646164, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1321, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.901, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.474, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 4.851222485122248, | |
| "grad_norm": 0.09912008792161942, | |
| "learning_rate": 2.6060219153268286e-08, | |
| "loss": 0.0005, | |
| "step": 8460 | |
| }, | |
| { | |
| "epoch": 4.8569584856958485, | |
| "grad_norm": 0.019211160019040108, | |
| "learning_rate": 2.4057966200849437e-08, | |
| "loss": 0.0005, | |
| "step": 8470 | |
| }, | |
| { | |
| "epoch": 4.862694486269449, | |
| "grad_norm": 0.05037353187799454, | |
| "learning_rate": 2.213555132162426e-08, | |
| "loss": 0.0012, | |
| "step": 8480 | |
| }, | |
| { | |
| "epoch": 4.868430486843049, | |
| "grad_norm": 0.00411713682115078, | |
| "learning_rate": 2.0293005360386142e-08, | |
| "loss": 0.0003, | |
| "step": 8490 | |
| }, | |
| { | |
| "epoch": 4.874166487416649, | |
| "grad_norm": 0.0006664736429229379, | |
| "learning_rate": 1.8530357880444105e-08, | |
| "loss": 0.0202, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 4.874166487416649, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09091775864362717, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2769, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.69, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.369, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 4.879902487990249, | |
| "grad_norm": 1.6142276525497437, | |
| "learning_rate": 1.684763716315374e-08, | |
| "loss": 0.0011, | |
| "step": 8510 | |
| }, | |
| { | |
| "epoch": 4.885638488563849, | |
| "grad_norm": 0.000510135549120605, | |
| "learning_rate": 1.524487020745813e-08, | |
| "loss": 0.0011, | |
| "step": 8520 | |
| }, | |
| { | |
| "epoch": 4.891374489137449, | |
| "grad_norm": 0.001830592635087669, | |
| "learning_rate": 1.372208272945763e-08, | |
| "loss": 0.0187, | |
| "step": 8530 | |
| }, | |
| { | |
| "epoch": 4.897110489711049, | |
| "grad_norm": 0.2799864709377289, | |
| "learning_rate": 1.2279299161997438e-08, | |
| "loss": 0.0157, | |
| "step": 8540 | |
| }, | |
| { | |
| "epoch": 4.902846490284649, | |
| "grad_norm": 0.009001590311527252, | |
| "learning_rate": 1.0916542654273443e-08, | |
| "loss": 0.0272, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 4.902846490284649, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09087590873241425, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.3382, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.602, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.325, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 4.908582490858249, | |
| "grad_norm": 0.17294363677501678, | |
| "learning_rate": 9.633835071463094e-09, | |
| "loss": 0.0052, | |
| "step": 8560 | |
| }, | |
| { | |
| "epoch": 4.91431849143185, | |
| "grad_norm": 0.008691389113664627, | |
| "learning_rate": 8.431196994373447e-09, | |
| "loss": 0.0065, | |
| "step": 8570 | |
| }, | |
| { | |
| "epoch": 4.920054492005449, | |
| "grad_norm": 0.010057262144982815, | |
| "learning_rate": 7.3086477191103285e-09, | |
| "loss": 0.02, | |
| "step": 8580 | |
| }, | |
| { | |
| "epoch": 4.925790492579049, | |
| "grad_norm": 0.19418714940547943, | |
| "learning_rate": 6.2662052567702414e-09, | |
| "loss": 0.0003, | |
| "step": 8590 | |
| }, | |
| { | |
| "epoch": 4.931526493152649, | |
| "grad_norm": 0.033849816769361496, | |
| "learning_rate": 5.303886333151154e-09, | |
| "loss": 0.0126, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 4.931526493152649, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09129200130701065, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2495, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.73, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.389, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 4.93726249372625, | |
| "grad_norm": 0.024734610691666603, | |
| "learning_rate": 4.421706388481606e-09, | |
| "loss": 0.0171, | |
| "step": 8610 | |
| }, | |
| { | |
| "epoch": 4.942998494299849, | |
| "grad_norm": 0.0016847607912495732, | |
| "learning_rate": 3.6196795771770156e-09, | |
| "loss": 0.0126, | |
| "step": 8620 | |
| }, | |
| { | |
| "epoch": 4.94873449487345, | |
| "grad_norm": 0.004256678279489279, | |
| "learning_rate": 2.897818767609861e-09, | |
| "loss": 0.0161, | |
| "step": 8630 | |
| }, | |
| { | |
| "epoch": 4.954470495447049, | |
| "grad_norm": 4.404531002044678, | |
| "learning_rate": 2.2561355419037368e-09, | |
| "loss": 0.0087, | |
| "step": 8640 | |
| }, | |
| { | |
| "epoch": 4.96020649602065, | |
| "grad_norm": 3.0091116428375244, | |
| "learning_rate": 1.6946401957479431e-09, | |
| "loss": 0.0145, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 4.96020649602065, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09181646257638931, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.2421, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.741, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.394, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 4.965942496594249, | |
| "grad_norm": 0.04898509010672569, | |
| "learning_rate": 1.2133417382320656e-09, | |
| "loss": 0.0032, | |
| "step": 8660 | |
| }, | |
| { | |
| "epoch": 4.97167849716785, | |
| "grad_norm": 0.002836048137396574, | |
| "learning_rate": 8.122478916999799e-10, | |
| "loss": 0.0074, | |
| "step": 8670 | |
| }, | |
| { | |
| "epoch": 4.977414497741449, | |
| "grad_norm": 0.02022160217165947, | |
| "learning_rate": 4.913650916299473e-10, | |
| "loss": 0.0098, | |
| "step": 8680 | |
| }, | |
| { | |
| "epoch": 4.98315049831505, | |
| "grad_norm": 0.007077803369611502, | |
| "learning_rate": 2.506984865263684e-10, | |
| "loss": 0.0016, | |
| "step": 8690 | |
| }, | |
| { | |
| "epoch": 4.98888649888865, | |
| "grad_norm": 0.1649414747953415, | |
| "learning_rate": 9.025193784151232e-11, | |
| "loss": 0.0041, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 4.98888649888865, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_loss": 0.09170719236135483, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_runtime": 21.1606, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_samples_per_second": 30.859, | |
| "eval_thought_eval_raw_data_english_72b_clean_1117_steps_per_second": 15.453, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 4.99462249946225, | |
| "grad_norm": 0.00366253312677145, | |
| "learning_rate": 1.0028019910013342e-11, | |
| "loss": 0.0032, | |
| "step": 8710 | |
| }, | |
| { | |
| "epoch": 4.99749049974905, | |
| "step": 8715, | |
| "total_flos": 6.758260289254195e+17, | |
| "train_loss": 0.02162364056710329, | |
| "train_runtime": 21813.1277, | |
| "train_samples_per_second": 6.394, | |
| "train_steps_per_second": 0.4 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 8715, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 250, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 6.758260289254195e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |