{ "best_metric": 0.036843445152044296, "best_model_checkpoint": "saves/psy-course/Llama-3.1-8B-Instruct/train/fold6/checkpoint-1900", "epoch": 4.995305164319249, "eval_steps": 50, "global_step": 3325, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015023474178403756, "grad_norm": 4.304145336151123, "learning_rate": 3.003003003003003e-06, "loss": 1.5464, "step": 10 }, { "epoch": 0.03004694835680751, "grad_norm": 5.03688907623291, "learning_rate": 6.006006006006006e-06, "loss": 1.597, "step": 20 }, { "epoch": 0.04507042253521127, "grad_norm": 5.86588191986084, "learning_rate": 9.00900900900901e-06, "loss": 1.4497, "step": 30 }, { "epoch": 0.06009389671361502, "grad_norm": 2.099303722381592, "learning_rate": 1.2012012012012012e-05, "loss": 0.9009, "step": 40 }, { "epoch": 0.07511737089201878, "grad_norm": 1.6121171712875366, "learning_rate": 1.5015015015015016e-05, "loss": 0.6045, "step": 50 }, { "epoch": 0.07511737089201878, "eval_loss": 0.41910141706466675, "eval_runtime": 156.3169, "eval_samples_per_second": 7.574, "eval_steps_per_second": 7.574, "step": 50 }, { "epoch": 0.09014084507042254, "grad_norm": 1.1837172508239746, "learning_rate": 1.801801801801802e-05, "loss": 0.402, "step": 60 }, { "epoch": 0.10516431924882629, "grad_norm": 0.9849412441253662, "learning_rate": 2.102102102102102e-05, "loss": 0.2368, "step": 70 }, { "epoch": 0.12018779342723004, "grad_norm": 0.8895949721336365, "learning_rate": 2.4024024024024024e-05, "loss": 0.1741, "step": 80 }, { "epoch": 0.1352112676056338, "grad_norm": 0.9555579423904419, "learning_rate": 2.702702702702703e-05, "loss": 0.1089, "step": 90 }, { "epoch": 0.15023474178403756, "grad_norm": 0.7877600193023682, "learning_rate": 3.0030030030030033e-05, "loss": 0.118, "step": 100 }, { "epoch": 0.15023474178403756, "eval_loss": 0.09762203693389893, "eval_runtime": 156.4651, "eval_samples_per_second": 7.567, "eval_steps_per_second": 7.567, "step": 100 }, { "epoch": 0.1652582159624413, "grad_norm": 1.2528424263000488, "learning_rate": 3.3033033033033035e-05, "loss": 0.1111, "step": 110 }, { "epoch": 0.18028169014084508, "grad_norm": 0.836103618144989, "learning_rate": 3.603603603603604e-05, "loss": 0.0988, "step": 120 }, { "epoch": 0.19530516431924883, "grad_norm": 1.0173139572143555, "learning_rate": 3.903903903903904e-05, "loss": 0.0874, "step": 130 }, { "epoch": 0.21032863849765257, "grad_norm": 0.4798429012298584, "learning_rate": 4.204204204204204e-05, "loss": 0.0904, "step": 140 }, { "epoch": 0.22535211267605634, "grad_norm": 0.6175504922866821, "learning_rate": 4.5045045045045046e-05, "loss": 0.075, "step": 150 }, { "epoch": 0.22535211267605634, "eval_loss": 0.09080640226602554, "eval_runtime": 156.3504, "eval_samples_per_second": 7.573, "eval_steps_per_second": 7.573, "step": 150 }, { "epoch": 0.2403755868544601, "grad_norm": 0.8490262031555176, "learning_rate": 4.804804804804805e-05, "loss": 0.0712, "step": 160 }, { "epoch": 0.25539906103286386, "grad_norm": 1.1651703119277954, "learning_rate": 5.105105105105106e-05, "loss": 0.0496, "step": 170 }, { "epoch": 0.2704225352112676, "grad_norm": 0.9137387275695801, "learning_rate": 5.405405405405406e-05, "loss": 0.0628, "step": 180 }, { "epoch": 0.28544600938967135, "grad_norm": 0.6661922931671143, "learning_rate": 5.705705705705706e-05, "loss": 0.0582, "step": 190 }, { "epoch": 0.3004694835680751, "grad_norm": 0.47193077206611633, "learning_rate": 6.0060060060060066e-05, "loss": 0.0475, "step": 200 }, { "epoch": 0.3004694835680751, "eval_loss": 0.05551699176430702, "eval_runtime": 156.3208, "eval_samples_per_second": 7.574, "eval_steps_per_second": 7.574, "step": 200 }, { "epoch": 0.3154929577464789, "grad_norm": 0.5054419636726379, "learning_rate": 6.306306306306306e-05, "loss": 0.0529, "step": 210 }, { "epoch": 0.3305164319248826, "grad_norm": 0.4073943793773651, "learning_rate": 6.606606606606607e-05, "loss": 0.0564, "step": 220 }, { "epoch": 0.3455399061032864, "grad_norm": 0.5814553499221802, "learning_rate": 6.906906906906907e-05, "loss": 0.0473, "step": 230 }, { "epoch": 0.36056338028169016, "grad_norm": 0.730743944644928, "learning_rate": 7.207207207207208e-05, "loss": 0.0651, "step": 240 }, { "epoch": 0.3755868544600939, "grad_norm": 0.6487244963645935, "learning_rate": 7.507507507507507e-05, "loss": 0.0612, "step": 250 }, { "epoch": 0.3755868544600939, "eval_loss": 0.05300073325634003, "eval_runtime": 156.331, "eval_samples_per_second": 7.574, "eval_steps_per_second": 7.574, "step": 250 }, { "epoch": 0.39061032863849765, "grad_norm": 0.3846875727176666, "learning_rate": 7.807807807807808e-05, "loss": 0.0529, "step": 260 }, { "epoch": 0.4056338028169014, "grad_norm": 0.8112801909446716, "learning_rate": 8.108108108108109e-05, "loss": 0.0609, "step": 270 }, { "epoch": 0.42065727699530514, "grad_norm": 0.22141359746456146, "learning_rate": 8.408408408408409e-05, "loss": 0.0577, "step": 280 }, { "epoch": 0.4356807511737089, "grad_norm": 1.3037300109863281, "learning_rate": 8.70870870870871e-05, "loss": 0.0572, "step": 290 }, { "epoch": 0.4507042253521127, "grad_norm": 1.0784958600997925, "learning_rate": 9.009009009009009e-05, "loss": 0.043, "step": 300 }, { "epoch": 0.4507042253521127, "eval_loss": 0.05256583169102669, "eval_runtime": 156.2659, "eval_samples_per_second": 7.577, "eval_steps_per_second": 7.577, "step": 300 }, { "epoch": 0.46572769953051646, "grad_norm": 0.4775850176811218, "learning_rate": 9.30930930930931e-05, "loss": 0.0439, "step": 310 }, { "epoch": 0.4807511737089202, "grad_norm": 0.4602857232093811, "learning_rate": 9.60960960960961e-05, "loss": 0.0597, "step": 320 }, { "epoch": 0.49577464788732395, "grad_norm": 0.6467078924179077, "learning_rate": 9.90990990990991e-05, "loss": 0.0553, "step": 330 }, { "epoch": 0.5107981220657277, "grad_norm": 0.5899861454963684, "learning_rate": 9.999864944989638e-05, "loss": 0.0614, "step": 340 }, { "epoch": 0.5258215962441315, "grad_norm": 0.3560892939567566, "learning_rate": 9.999203468625017e-05, "loss": 0.0497, "step": 350 }, { "epoch": 0.5258215962441315, "eval_loss": 0.05027875676751137, "eval_runtime": 156.4383, "eval_samples_per_second": 7.568, "eval_steps_per_second": 7.568, "step": 350 }, { "epoch": 0.5408450704225352, "grad_norm": 1.0397191047668457, "learning_rate": 9.997990837719421e-05, "loss": 0.0614, "step": 360 }, { "epoch": 0.5558685446009389, "grad_norm": 0.3409489095211029, "learning_rate": 9.996227185963554e-05, "loss": 0.0446, "step": 370 }, { "epoch": 0.5708920187793427, "grad_norm": 0.6183903217315674, "learning_rate": 9.993912707797329e-05, "loss": 0.0513, "step": 380 }, { "epoch": 0.5859154929577465, "grad_norm": 0.36344006657600403, "learning_rate": 9.99104765838842e-05, "loss": 0.0433, "step": 390 }, { "epoch": 0.6009389671361502, "grad_norm": 0.2808699309825897, "learning_rate": 9.987632353604151e-05, "loss": 0.049, "step": 400 }, { "epoch": 0.6009389671361502, "eval_loss": 0.04805321246385574, "eval_runtime": 156.401, "eval_samples_per_second": 7.57, "eval_steps_per_second": 7.57, "step": 400 }, { "epoch": 0.615962441314554, "grad_norm": 0.41244831681251526, "learning_rate": 9.98366716997665e-05, "loss": 0.0469, "step": 410 }, { "epoch": 0.6309859154929578, "grad_norm": 0.4227665960788727, "learning_rate": 9.979152544661354e-05, "loss": 0.0437, "step": 420 }, { "epoch": 0.6460093896713615, "grad_norm": 0.5011832118034363, "learning_rate": 9.974088975388802e-05, "loss": 0.0479, "step": 430 }, { "epoch": 0.6610328638497652, "grad_norm": 0.47126173973083496, "learning_rate": 9.968477020409766e-05, "loss": 0.0452, "step": 440 }, { "epoch": 0.676056338028169, "grad_norm": 0.4116508364677429, "learning_rate": 9.962317298433705e-05, "loss": 0.0541, "step": 450 }, { "epoch": 0.676056338028169, "eval_loss": 0.047716788947582245, "eval_runtime": 156.3995, "eval_samples_per_second": 7.57, "eval_steps_per_second": 7.57, "step": 450 }, { "epoch": 0.6910798122065728, "grad_norm": 0.2776816785335541, "learning_rate": 9.955610488560551e-05, "loss": 0.0589, "step": 460 }, { "epoch": 0.7061032863849765, "grad_norm": 0.5058294534683228, "learning_rate": 9.948357330205842e-05, "loss": 0.0583, "step": 470 }, { "epoch": 0.7211267605633803, "grad_norm": 0.22229774296283722, "learning_rate": 9.940558623019201e-05, "loss": 0.0329, "step": 480 }, { "epoch": 0.7361502347417841, "grad_norm": 0.33905524015426636, "learning_rate": 9.932215226796172e-05, "loss": 0.0501, "step": 490 }, { "epoch": 0.7511737089201878, "grad_norm": 0.2562945783138275, "learning_rate": 9.923328061383435e-05, "loss": 0.0494, "step": 500 }, { "epoch": 0.7511737089201878, "eval_loss": 0.043219637125730515, "eval_runtime": 156.322, "eval_samples_per_second": 7.574, "eval_steps_per_second": 7.574, "step": 500 }, { "epoch": 0.7661971830985915, "grad_norm": 0.18596145510673523, "learning_rate": 9.913898106577393e-05, "loss": 0.0445, "step": 510 }, { "epoch": 0.7812206572769953, "grad_norm": 0.30711135268211365, "learning_rate": 9.903926402016153e-05, "loss": 0.0389, "step": 520 }, { "epoch": 0.7962441314553991, "grad_norm": 0.21911293268203735, "learning_rate": 9.893414047064897e-05, "loss": 0.0436, "step": 530 }, { "epoch": 0.8112676056338028, "grad_norm": 0.49014413356781006, "learning_rate": 9.88236220069469e-05, "loss": 0.043, "step": 540 }, { "epoch": 0.8262910798122066, "grad_norm": 0.33901888132095337, "learning_rate": 9.870772081354705e-05, "loss": 0.0556, "step": 550 }, { "epoch": 0.8262910798122066, "eval_loss": 0.04144579544663429, "eval_runtime": 156.3463, "eval_samples_per_second": 7.573, "eval_steps_per_second": 7.573, "step": 550 }, { "epoch": 0.8413145539906103, "grad_norm": 0.3239082098007202, "learning_rate": 9.858644966837878e-05, "loss": 0.0463, "step": 560 }, { "epoch": 0.856338028169014, "grad_norm": 0.16826266050338745, "learning_rate": 9.845982194140051e-05, "loss": 0.0456, "step": 570 }, { "epoch": 0.8713615023474178, "grad_norm": 0.28091156482696533, "learning_rate": 9.832785159312559e-05, "loss": 0.042, "step": 580 }, { "epoch": 0.8863849765258216, "grad_norm": 0.29264500737190247, "learning_rate": 9.819055317308317e-05, "loss": 0.04, "step": 590 }, { "epoch": 0.9014084507042254, "grad_norm": 0.30719348788261414, "learning_rate": 9.804794181821422e-05, "loss": 0.0398, "step": 600 }, { "epoch": 0.9014084507042254, "eval_loss": 0.04154442995786667, "eval_runtime": 156.3701, "eval_samples_per_second": 7.572, "eval_steps_per_second": 7.572, "step": 600 }, { "epoch": 0.9164319248826291, "grad_norm": 0.18495255708694458, "learning_rate": 9.790003325120261e-05, "loss": 0.0403, "step": 610 }, { "epoch": 0.9314553990610329, "grad_norm": 0.514930009841919, "learning_rate": 9.774684377874178e-05, "loss": 0.0535, "step": 620 }, { "epoch": 0.9464788732394366, "grad_norm": 0.2771974503993988, "learning_rate": 9.758839028973692e-05, "loss": 0.0358, "step": 630 }, { "epoch": 0.9615023474178404, "grad_norm": 0.2064761519432068, "learning_rate": 9.742469025344298e-05, "loss": 0.043, "step": 640 }, { "epoch": 0.9765258215962441, "grad_norm": 0.22615325450897217, "learning_rate": 9.725576171753874e-05, "loss": 0.0535, "step": 650 }, { "epoch": 0.9765258215962441, "eval_loss": 0.04291123151779175, "eval_runtime": 156.3355, "eval_samples_per_second": 7.573, "eval_steps_per_second": 7.573, "step": 650 }, { "epoch": 0.9915492957746479, "grad_norm": 0.9398748874664307, "learning_rate": 9.708162330613708e-05, "loss": 0.0441, "step": 660 }, { "epoch": 1.0065727699530516, "grad_norm": 0.2263706773519516, "learning_rate": 9.690229421773167e-05, "loss": 0.0439, "step": 670 }, { "epoch": 1.0215962441314554, "grad_norm": 0.3939422070980072, "learning_rate": 9.67177942230804e-05, "loss": 0.0402, "step": 680 }, { "epoch": 1.036619718309859, "grad_norm": 0.4399869740009308, "learning_rate": 9.652814366302568e-05, "loss": 0.0412, "step": 690 }, { "epoch": 1.051643192488263, "grad_norm": 0.29108133912086487, "learning_rate": 9.633336344625185e-05, "loss": 0.0383, "step": 700 }, { "epoch": 1.051643192488263, "eval_loss": 0.04375332221388817, "eval_runtime": 156.304, "eval_samples_per_second": 7.575, "eval_steps_per_second": 7.575, "step": 700 }, { "epoch": 1.0666666666666667, "grad_norm": 0.41116732358932495, "learning_rate": 9.61334750469801e-05, "loss": 0.0297, "step": 710 }, { "epoch": 1.0816901408450703, "grad_norm": 0.49311184883117676, "learning_rate": 9.592850050260089e-05, "loss": 0.0377, "step": 720 }, { "epoch": 1.0967136150234742, "grad_norm": 0.20614486932754517, "learning_rate": 9.571846241124446e-05, "loss": 0.0329, "step": 730 }, { "epoch": 1.1117370892018779, "grad_norm": 0.1830943077802658, "learning_rate": 9.55033839292893e-05, "loss": 0.0273, "step": 740 }, { "epoch": 1.1267605633802817, "grad_norm": 0.49418705701828003, "learning_rate": 9.52832887688093e-05, "loss": 0.0317, "step": 750 }, { "epoch": 1.1267605633802817, "eval_loss": 0.04088291898369789, "eval_runtime": 156.2721, "eval_samples_per_second": 7.577, "eval_steps_per_second": 7.577, "step": 750 }, { "epoch": 1.1417840375586854, "grad_norm": 0.43101999163627625, "learning_rate": 9.50582011949595e-05, "loss": 0.035, "step": 760 }, { "epoch": 1.1568075117370893, "grad_norm": 0.19687387347221375, "learning_rate": 9.482814602330084e-05, "loss": 0.038, "step": 770 }, { "epoch": 1.171830985915493, "grad_norm": 0.31862860918045044, "learning_rate": 9.459314861706435e-05, "loss": 0.0402, "step": 780 }, { "epoch": 1.1868544600938966, "grad_norm": 0.18375325202941895, "learning_rate": 9.435323488435488e-05, "loss": 0.0313, "step": 790 }, { "epoch": 1.2018779342723005, "grad_norm": 0.4344165027141571, "learning_rate": 9.410843127529473e-05, "loss": 0.0311, "step": 800 }, { "epoch": 1.2018779342723005, "eval_loss": 0.04096408188343048, "eval_runtime": 156.2708, "eval_samples_per_second": 7.577, "eval_steps_per_second": 7.577, "step": 800 }, { "epoch": 1.2169014084507042, "grad_norm": 0.48818346858024597, "learning_rate": 9.385876477910765e-05, "loss": 0.0344, "step": 810 }, { "epoch": 1.231924882629108, "grad_norm": 0.2184937298297882, "learning_rate": 9.360426292114314e-05, "loss": 0.0343, "step": 820 }, { "epoch": 1.2469483568075117, "grad_norm": 0.3244076371192932, "learning_rate": 9.334495375984212e-05, "loss": 0.0294, "step": 830 }, { "epoch": 1.2619718309859156, "grad_norm": 0.3577475845813751, "learning_rate": 9.30808658836432e-05, "loss": 0.0337, "step": 840 }, { "epoch": 1.2769953051643192, "grad_norm": 0.5191630721092224, "learning_rate": 9.281202840783108e-05, "loss": 0.0351, "step": 850 }, { "epoch": 1.2769953051643192, "eval_loss": 0.03982950374484062, "eval_runtime": 156.3321, "eval_samples_per_second": 7.574, "eval_steps_per_second": 7.574, "step": 850 }, { "epoch": 1.292018779342723, "grad_norm": 0.21748529374599457, "learning_rate": 9.253847097132655e-05, "loss": 0.0313, "step": 860 }, { "epoch": 1.3070422535211268, "grad_norm": 0.18870031833648682, "learning_rate": 9.226022373341882e-05, "loss": 0.0323, "step": 870 }, { "epoch": 1.3220657276995305, "grad_norm": 0.24485838413238525, "learning_rate": 9.19773173704406e-05, "loss": 0.0299, "step": 880 }, { "epoch": 1.3370892018779343, "grad_norm": 0.6192132830619812, "learning_rate": 9.168978307238594e-05, "loss": 0.037, "step": 890 }, { "epoch": 1.352112676056338, "grad_norm": 0.1277685910463333, "learning_rate": 9.13976525394717e-05, "loss": 0.0309, "step": 900 }, { "epoch": 1.352112676056338, "eval_loss": 0.037233345210552216, "eval_runtime": 156.3404, "eval_samples_per_second": 7.573, "eval_steps_per_second": 7.573, "step": 900 }, { "epoch": 1.3671361502347419, "grad_norm": 0.17440356314182281, "learning_rate": 9.110095797864263e-05, "loss": 0.0264, "step": 910 }, { "epoch": 1.3821596244131455, "grad_norm": 0.15580697357654572, "learning_rate": 9.079973210002051e-05, "loss": 0.0371, "step": 920 }, { "epoch": 1.3971830985915492, "grad_norm": 0.2136790156364441, "learning_rate": 9.049400811329807e-05, "loss": 0.0389, "step": 930 }, { "epoch": 1.412206572769953, "grad_norm": 0.32828402519226074, "learning_rate": 9.01838197240775e-05, "loss": 0.0296, "step": 940 }, { "epoch": 1.4272300469483568, "grad_norm": 0.44386857748031616, "learning_rate": 8.986920113015461e-05, "loss": 0.0373, "step": 950 }, { "epoch": 1.4272300469483568, "eval_loss": 0.03834960237145424, "eval_runtime": 156.2587, "eval_samples_per_second": 7.577, "eval_steps_per_second": 7.577, "step": 950 }, { "epoch": 1.4422535211267606, "grad_norm": 0.20402352511882782, "learning_rate": 8.955018701774846e-05, "loss": 0.0244, "step": 960 }, { "epoch": 1.4572769953051643, "grad_norm": 0.3417670428752899, "learning_rate": 8.922681255767731e-05, "loss": 0.0413, "step": 970 }, { "epoch": 1.4723004694835682, "grad_norm": 0.21933668851852417, "learning_rate": 8.889911340148112e-05, "loss": 0.0359, "step": 980 }, { "epoch": 1.4873239436619718, "grad_norm": 0.1883007138967514, "learning_rate": 8.856712567749095e-05, "loss": 0.0336, "step": 990 }, { "epoch": 1.5023474178403755, "grad_norm": 0.31359952688217163, "learning_rate": 8.82308859868459e-05, "loss": 0.0295, "step": 1000 }, { "epoch": 1.5023474178403755, "eval_loss": 0.039744190871715546, "eval_runtime": 156.3721, "eval_samples_per_second": 7.572, "eval_steps_per_second": 7.572, "step": 1000 }, { "epoch": 1.5173708920187794, "grad_norm": 0.3432142734527588, "learning_rate": 8.789043139945795e-05, "loss": 0.0335, "step": 1010 }, { "epoch": 1.532394366197183, "grad_norm": 0.16293199360370636, "learning_rate": 8.754579944992491e-05, "loss": 0.0391, "step": 1020 }, { "epoch": 1.5474178403755867, "grad_norm": 0.20295502245426178, "learning_rate": 8.719702813339248e-05, "loss": 0.035, "step": 1030 }, { "epoch": 1.5624413145539906, "grad_norm": 0.22714973986148834, "learning_rate": 8.684415590136518e-05, "loss": 0.0322, "step": 1040 }, { "epoch": 1.5774647887323945, "grad_norm": 0.15775541961193085, "learning_rate": 8.648722165746722e-05, "loss": 0.0349, "step": 1050 }, { "epoch": 1.5774647887323945, "eval_loss": 0.04082873836159706, "eval_runtime": 156.2306, "eval_samples_per_second": 7.579, "eval_steps_per_second": 7.579, "step": 1050 }, { "epoch": 1.5924882629107981, "grad_norm": 0.18249914050102234, "learning_rate": 8.61262647531534e-05, "loss": 0.0299, "step": 1060 }, { "epoch": 1.6075117370892018, "grad_norm": 0.2968941330909729, "learning_rate": 8.576132498337068e-05, "loss": 0.0372, "step": 1070 }, { "epoch": 1.6225352112676057, "grad_norm": 0.416565477848053, "learning_rate": 8.539244258217088e-05, "loss": 0.0332, "step": 1080 }, { "epoch": 1.6375586854460094, "grad_norm": 0.23764647543430328, "learning_rate": 8.501965821827485e-05, "loss": 0.0365, "step": 1090 }, { "epoch": 1.652582159624413, "grad_norm": 0.20299109816551208, "learning_rate": 8.464301299058892e-05, "loss": 0.026, "step": 1100 }, { "epoch": 1.652582159624413, "eval_loss": 0.04096033051609993, "eval_runtime": 156.3005, "eval_samples_per_second": 7.575, "eval_steps_per_second": 7.575, "step": 1100 }, { "epoch": 1.667605633802817, "grad_norm": 0.2365420162677765, "learning_rate": 8.426254842367374e-05, "loss": 0.0279, "step": 1110 }, { "epoch": 1.6826291079812208, "grad_norm": 0.26829490065574646, "learning_rate": 8.387830646316623e-05, "loss": 0.0388, "step": 1120 }, { "epoch": 1.6976525821596244, "grad_norm": 0.39060696959495544, "learning_rate": 8.349032947115525e-05, "loss": 0.028, "step": 1130 }, { "epoch": 1.712676056338028, "grad_norm": 0.3235212564468384, "learning_rate": 8.309866022151107e-05, "loss": 0.0468, "step": 1140 }, { "epoch": 1.727699530516432, "grad_norm": 0.32556065917015076, "learning_rate": 8.270334189516983e-05, "loss": 0.0305, "step": 1150 }, { "epoch": 1.727699530516432, "eval_loss": 0.039029572159051895, "eval_runtime": 156.2235, "eval_samples_per_second": 7.579, "eval_steps_per_second": 7.579, "step": 1150 }, { "epoch": 1.7427230046948357, "grad_norm": 0.15808342397212982, "learning_rate": 8.230441807537277e-05, "loss": 0.0466, "step": 1160 }, { "epoch": 1.7577464788732393, "grad_norm": 0.15356343984603882, "learning_rate": 8.190193274286122e-05, "loss": 0.0284, "step": 1170 }, { "epoch": 1.7727699530516432, "grad_norm": 0.26693618297576904, "learning_rate": 8.149593027102789e-05, "loss": 0.0347, "step": 1180 }, { "epoch": 1.787793427230047, "grad_norm": 0.3144373595714569, "learning_rate": 8.108645542102469e-05, "loss": 0.0277, "step": 1190 }, { "epoch": 1.8028169014084507, "grad_norm": 0.26021772623062134, "learning_rate": 8.067355333682798e-05, "loss": 0.0332, "step": 1200 }, { "epoch": 1.8028169014084507, "eval_loss": 0.03854435682296753, "eval_runtime": 156.2279, "eval_samples_per_second": 7.579, "eval_steps_per_second": 7.579, "step": 1200 }, { "epoch": 1.8178403755868544, "grad_norm": 0.4863680303096771, "learning_rate": 8.025726954026138e-05, "loss": 0.0242, "step": 1210 }, { "epoch": 1.8328638497652583, "grad_norm": 0.5616618394851685, "learning_rate": 7.983764992597716e-05, "loss": 0.034, "step": 1220 }, { "epoch": 1.847887323943662, "grad_norm": 0.4086838662624359, "learning_rate": 7.94147407563964e-05, "loss": 0.0314, "step": 1230 }, { "epoch": 1.8629107981220656, "grad_norm": 0.2752532362937927, "learning_rate": 7.89885886566086e-05, "loss": 0.0348, "step": 1240 }, { "epoch": 1.8779342723004695, "grad_norm": 0.508071780204773, "learning_rate": 7.855924060923141e-05, "loss": 0.0304, "step": 1250 }, { "epoch": 1.8779342723004695, "eval_loss": 0.03792616352438927, "eval_runtime": 156.2438, "eval_samples_per_second": 7.578, "eval_steps_per_second": 7.578, "step": 1250 }, { "epoch": 1.8929577464788734, "grad_norm": 0.2151941955089569, "learning_rate": 7.812674394923077e-05, "loss": 0.0364, "step": 1260 }, { "epoch": 1.907981220657277, "grad_norm": 0.2420804798603058, "learning_rate": 7.769114635870231e-05, "loss": 0.0357, "step": 1270 }, { "epoch": 1.9230046948356807, "grad_norm": 0.37158623337745667, "learning_rate": 7.725249586161463e-05, "loss": 0.0317, "step": 1280 }, { "epoch": 1.9380281690140846, "grad_norm": 0.19411316514015198, "learning_rate": 7.68108408185145e-05, "loss": 0.0276, "step": 1290 }, { "epoch": 1.9530516431924883, "grad_norm": 0.2072768658399582, "learning_rate": 7.636622992119536e-05, "loss": 0.0497, "step": 1300 }, { "epoch": 1.9530516431924883, "eval_loss": 0.039196692407131195, "eval_runtime": 156.2151, "eval_samples_per_second": 7.579, "eval_steps_per_second": 7.579, "step": 1300 }, { "epoch": 1.968075117370892, "grad_norm": 0.30601534247398376, "learning_rate": 7.591871218732902e-05, "loss": 0.036, "step": 1310 }, { "epoch": 1.9830985915492958, "grad_norm": 0.2544531226158142, "learning_rate": 7.54683369550616e-05, "loss": 0.0413, "step": 1320 }, { "epoch": 1.9981220657276997, "grad_norm": 0.18643656373023987, "learning_rate": 7.501515387757404e-05, "loss": 0.0295, "step": 1330 }, { "epoch": 2.013145539906103, "grad_norm": 0.2908141016960144, "learning_rate": 7.455921291760796e-05, "loss": 0.0365, "step": 1340 }, { "epoch": 2.028169014084507, "grad_norm": 0.08652251958847046, "learning_rate": 7.410056434195725e-05, "loss": 0.0183, "step": 1350 }, { "epoch": 2.028169014084507, "eval_loss": 0.04124961793422699, "eval_runtime": 156.2014, "eval_samples_per_second": 7.58, "eval_steps_per_second": 7.58, "step": 1350 }, { "epoch": 2.043192488262911, "grad_norm": 0.231898233294487, "learning_rate": 7.363925871592629e-05, "loss": 0.0264, "step": 1360 }, { "epoch": 2.0582159624413148, "grad_norm": 0.15739409625530243, "learning_rate": 7.317534689775528e-05, "loss": 0.0239, "step": 1370 }, { "epoch": 2.073239436619718, "grad_norm": 0.16826321184635162, "learning_rate": 7.270888003301304e-05, "loss": 0.0175, "step": 1380 }, { "epoch": 2.088262910798122, "grad_norm": 0.5138995051383972, "learning_rate": 7.22399095489584e-05, "loss": 0.022, "step": 1390 }, { "epoch": 2.103286384976526, "grad_norm": 0.1792786568403244, "learning_rate": 7.176848714887042e-05, "loss": 0.0269, "step": 1400 }, { "epoch": 2.103286384976526, "eval_loss": 0.038102250546216965, "eval_runtime": 156.1612, "eval_samples_per_second": 7.582, "eval_steps_per_second": 7.582, "step": 1400 }, { "epoch": 2.1183098591549294, "grad_norm": 0.21858704090118408, "learning_rate": 7.129466480634806e-05, "loss": 0.0251, "step": 1410 }, { "epoch": 2.1333333333333333, "grad_norm": 0.2904298007488251, "learning_rate": 7.081849475958042e-05, "loss": 0.0208, "step": 1420 }, { "epoch": 2.148356807511737, "grad_norm": 0.4409976899623871, "learning_rate": 7.034002950558723e-05, "loss": 0.0243, "step": 1430 }, { "epoch": 2.1633802816901406, "grad_norm": 0.19713151454925537, "learning_rate": 6.985932179443144e-05, "loss": 0.0253, "step": 1440 }, { "epoch": 2.1784037558685445, "grad_norm": 0.24242573976516724, "learning_rate": 6.937642462340342e-05, "loss": 0.0299, "step": 1450 }, { "epoch": 2.1784037558685445, "eval_loss": 0.0379047654569149, "eval_runtime": 156.2912, "eval_samples_per_second": 7.576, "eval_steps_per_second": 7.576, "step": 1450 }, { "epoch": 2.1934272300469484, "grad_norm": 0.19243745505809784, "learning_rate": 6.889139123117817e-05, "loss": 0.0184, "step": 1460 }, { "epoch": 2.2084507042253523, "grad_norm": 0.2577214241027832, "learning_rate": 6.840427509194575e-05, "loss": 0.0249, "step": 1470 }, { "epoch": 2.2234741784037557, "grad_norm": 0.2908743917942047, "learning_rate": 6.791512990951597e-05, "loss": 0.0165, "step": 1480 }, { "epoch": 2.2384976525821596, "grad_norm": 0.6111800670623779, "learning_rate": 6.74240096113975e-05, "loss": 0.0262, "step": 1490 }, { "epoch": 2.2535211267605635, "grad_norm": 0.1613549441099167, "learning_rate": 6.693096834285256e-05, "loss": 0.0129, "step": 1500 }, { "epoch": 2.2535211267605635, "eval_loss": 0.04337477684020996, "eval_runtime": 156.2934, "eval_samples_per_second": 7.575, "eval_steps_per_second": 7.575, "step": 1500 }, { "epoch": 2.2685446009389674, "grad_norm": 0.40178021788597107, "learning_rate": 6.643606046092732e-05, "loss": 0.0238, "step": 1510 }, { "epoch": 2.283568075117371, "grad_norm": 0.18915718793869019, "learning_rate": 6.593934052845929e-05, "loss": 0.0257, "step": 1520 }, { "epoch": 2.2985915492957747, "grad_norm": 0.35349389910697937, "learning_rate": 6.544086330806181e-05, "loss": 0.0268, "step": 1530 }, { "epoch": 2.3136150234741786, "grad_norm": 0.15677450597286224, "learning_rate": 6.494068375608646e-05, "loss": 0.0245, "step": 1540 }, { "epoch": 2.328638497652582, "grad_norm": 0.2012639194726944, "learning_rate": 6.443885701656432e-05, "loss": 0.0203, "step": 1550 }, { "epoch": 2.328638497652582, "eval_loss": 0.03834431990981102, "eval_runtime": 156.2857, "eval_samples_per_second": 7.576, "eval_steps_per_second": 7.576, "step": 1550 }, { "epoch": 2.343661971830986, "grad_norm": 0.28499725461006165, "learning_rate": 6.393543841512632e-05, "loss": 0.0187, "step": 1560 }, { "epoch": 2.35868544600939, "grad_norm": 0.27369531989097595, "learning_rate": 6.343048345290386e-05, "loss": 0.0143, "step": 1570 }, { "epoch": 2.3737089201877932, "grad_norm": 0.31129664182662964, "learning_rate": 6.292404780040961e-05, "loss": 0.0306, "step": 1580 }, { "epoch": 2.388732394366197, "grad_norm": 0.43185606598854065, "learning_rate": 6.241618729140018e-05, "loss": 0.0198, "step": 1590 }, { "epoch": 2.403755868544601, "grad_norm": 0.32471033930778503, "learning_rate": 6.190695791672042e-05, "loss": 0.0221, "step": 1600 }, { "epoch": 2.403755868544601, "eval_loss": 0.03971394523978233, "eval_runtime": 156.3498, "eval_samples_per_second": 7.573, "eval_steps_per_second": 7.573, "step": 1600 }, { "epoch": 2.418779342723005, "grad_norm": 0.13160324096679688, "learning_rate": 6.139641581813052e-05, "loss": 0.0194, "step": 1610 }, { "epoch": 2.4338028169014083, "grad_norm": 0.1444709450006485, "learning_rate": 6.088461728211642e-05, "loss": 0.0206, "step": 1620 }, { "epoch": 2.448826291079812, "grad_norm": 0.3128149211406708, "learning_rate": 6.0371618733684474e-05, "loss": 0.0151, "step": 1630 }, { "epoch": 2.463849765258216, "grad_norm": 0.1301877200603485, "learning_rate": 5.9857476730140485e-05, "loss": 0.0214, "step": 1640 }, { "epoch": 2.4788732394366195, "grad_norm": 0.2600902020931244, "learning_rate": 5.9342247954854466e-05, "loss": 0.023, "step": 1650 }, { "epoch": 2.4788732394366195, "eval_loss": 0.041283123195171356, "eval_runtime": 156.3863, "eval_samples_per_second": 7.571, "eval_steps_per_second": 7.571, "step": 1650 }, { "epoch": 2.4938967136150234, "grad_norm": 0.20478318631649017, "learning_rate": 5.8825989211011335e-05, "loss": 0.0242, "step": 1660 }, { "epoch": 2.5089201877934273, "grad_norm": 0.212309792637825, "learning_rate": 5.830875741534852e-05, "loss": 0.0187, "step": 1670 }, { "epoch": 2.523943661971831, "grad_norm": 0.2569638192653656, "learning_rate": 5.7790609591880826e-05, "loss": 0.0259, "step": 1680 }, { "epoch": 2.5389671361502346, "grad_norm": 0.305275022983551, "learning_rate": 5.727160286561386e-05, "loss": 0.0232, "step": 1690 }, { "epoch": 2.5539906103286385, "grad_norm": 0.22980214655399323, "learning_rate": 5.675179445624581e-05, "loss": 0.0257, "step": 1700 }, { "epoch": 2.5539906103286385, "eval_loss": 0.04087158292531967, "eval_runtime": 156.4861, "eval_samples_per_second": 7.566, "eval_steps_per_second": 7.566, "step": 1700 }, { "epoch": 2.5690140845070424, "grad_norm": 0.13337288796901703, "learning_rate": 5.62312416718593e-05, "loss": 0.018, "step": 1710 }, { "epoch": 2.584037558685446, "grad_norm": 0.06781081855297089, "learning_rate": 5.5710001902603116e-05, "loss": 0.0238, "step": 1720 }, { "epoch": 2.5990610328638497, "grad_norm": 0.15210840106010437, "learning_rate": 5.5188132614365094e-05, "loss": 0.0213, "step": 1730 }, { "epoch": 2.6140845070422536, "grad_norm": 0.18319003283977509, "learning_rate": 5.4665691342436565e-05, "loss": 0.0214, "step": 1740 }, { "epoch": 2.629107981220657, "grad_norm": 0.382030189037323, "learning_rate": 5.414273568516919e-05, "loss": 0.0241, "step": 1750 }, { "epoch": 2.629107981220657, "eval_loss": 0.03998855501413345, "eval_runtime": 156.5463, "eval_samples_per_second": 7.563, "eval_steps_per_second": 7.563, "step": 1750 }, { "epoch": 2.644131455399061, "grad_norm": 0.26402246952056885, "learning_rate": 5.361932329762481e-05, "loss": 0.0201, "step": 1760 }, { "epoch": 2.659154929577465, "grad_norm": 0.1534145027399063, "learning_rate": 5.309551188521914e-05, "loss": 0.0228, "step": 1770 }, { "epoch": 2.6741784037558687, "grad_norm": 0.4438458979129791, "learning_rate": 5.2571359197359704e-05, "loss": 0.0162, "step": 1780 }, { "epoch": 2.6892018779342726, "grad_norm": 0.21595679223537445, "learning_rate": 5.2046923021079175e-05, "loss": 0.0173, "step": 1790 }, { "epoch": 2.704225352112676, "grad_norm": 0.4059389531612396, "learning_rate": 5.1522261174664346e-05, "loss": 0.0258, "step": 1800 }, { "epoch": 2.704225352112676, "eval_loss": 0.040539611130952835, "eval_runtime": 156.6429, "eval_samples_per_second": 7.559, "eval_steps_per_second": 7.559, "step": 1800 }, { "epoch": 2.71924882629108, "grad_norm": 0.8589577078819275, "learning_rate": 5.0997431501281835e-05, "loss": 0.035, "step": 1810 }, { "epoch": 2.7342723004694838, "grad_norm": 0.2879532277584076, "learning_rate": 5.0472491862600915e-05, "loss": 0.0223, "step": 1820 }, { "epoch": 2.749295774647887, "grad_norm": 0.3457850217819214, "learning_rate": 4.994750013241435e-05, "loss": 0.0264, "step": 1830 }, { "epoch": 2.764319248826291, "grad_norm": 0.29999199509620667, "learning_rate": 4.9422514190257974e-05, "loss": 0.0242, "step": 1840 }, { "epoch": 2.779342723004695, "grad_norm": 0.09458961337804794, "learning_rate": 4.88975919150294e-05, "loss": 0.021, "step": 1850 }, { "epoch": 2.779342723004695, "eval_loss": 0.03918106481432915, "eval_runtime": 156.9396, "eval_samples_per_second": 7.544, "eval_steps_per_second": 7.544, "step": 1850 }, { "epoch": 2.7943661971830984, "grad_norm": 0.2146388590335846, "learning_rate": 4.83727911786071e-05, "loss": 0.0257, "step": 1860 }, { "epoch": 2.8093896713615023, "grad_norm": 0.3507349193096161, "learning_rate": 4.7848169839470145e-05, "loss": 0.0237, "step": 1870 }, { "epoch": 2.824413145539906, "grad_norm": 0.14694459736347198, "learning_rate": 4.7323785736319244e-05, "loss": 0.0271, "step": 1880 }, { "epoch": 2.8394366197183096, "grad_norm": 0.16568858921527863, "learning_rate": 4.679969668170024e-05, "loss": 0.0234, "step": 1890 }, { "epoch": 2.8544600938967135, "grad_norm": 0.1532352864742279, "learning_rate": 4.627596045563031e-05, "loss": 0.0253, "step": 1900 }, { "epoch": 2.8544600938967135, "eval_loss": 0.036843445152044296, "eval_runtime": 157.0438, "eval_samples_per_second": 7.539, "eval_steps_per_second": 7.539, "step": 1900 }, { "epoch": 2.8694835680751174, "grad_norm": 0.22552986443042755, "learning_rate": 4.575263479922783e-05, "loss": 0.0156, "step": 1910 }, { "epoch": 2.8845070422535213, "grad_norm": 0.1421150416135788, "learning_rate": 4.522977740834651e-05, "loss": 0.0142, "step": 1920 }, { "epoch": 2.8995305164319247, "grad_norm": 0.2992912232875824, "learning_rate": 4.4707445927214456e-05, "loss": 0.0169, "step": 1930 }, { "epoch": 2.9145539906103286, "grad_norm": 0.1591469943523407, "learning_rate": 4.4185697942079115e-05, "loss": 0.0255, "step": 1940 }, { "epoch": 2.9295774647887325, "grad_norm": 0.13977603614330292, "learning_rate": 4.366459097485832e-05, "loss": 0.018, "step": 1950 }, { "epoch": 2.9295774647887325, "eval_loss": 0.03921648487448692, "eval_runtime": 157.0461, "eval_samples_per_second": 7.539, "eval_steps_per_second": 7.539, "step": 1950 }, { "epoch": 2.9446009389671364, "grad_norm": 0.3342364728450775, "learning_rate": 4.314418247679866e-05, "loss": 0.0263, "step": 1960 }, { "epoch": 2.95962441314554, "grad_norm": 0.24284552037715912, "learning_rate": 4.26245298221416e-05, "loss": 0.014, "step": 1970 }, { "epoch": 2.9746478873239437, "grad_norm": 0.15542685985565186, "learning_rate": 4.2105690301798014e-05, "loss": 0.0176, "step": 1980 }, { "epoch": 2.9896713615023476, "grad_norm": 0.24745222926139832, "learning_rate": 4.158772111703194e-05, "loss": 0.0177, "step": 1990 }, { "epoch": 3.004694835680751, "grad_norm": 0.07768896967172623, "learning_rate": 4.107067937315429e-05, "loss": 0.0112, "step": 2000 }, { "epoch": 3.004694835680751, "eval_loss": 0.04066740348935127, "eval_runtime": 157.2285, "eval_samples_per_second": 7.53, "eval_steps_per_second": 7.53, "step": 2000 }, { "epoch": 3.019718309859155, "grad_norm": 0.1813950091600418, "learning_rate": 4.055462207322698e-05, "loss": 0.0121, "step": 2010 }, { "epoch": 3.034741784037559, "grad_norm": 0.22787263989448547, "learning_rate": 4.003960611177855e-05, "loss": 0.0118, "step": 2020 }, { "epoch": 3.0497652582159622, "grad_norm": 0.19959565997123718, "learning_rate": 3.952568826853152e-05, "loss": 0.0097, "step": 2030 }, { "epoch": 3.064788732394366, "grad_norm": 0.0704595223069191, "learning_rate": 3.901292520214256e-05, "loss": 0.0079, "step": 2040 }, { "epoch": 3.07981220657277, "grad_norm": 0.4440910816192627, "learning_rate": 3.850137344395598e-05, "loss": 0.0082, "step": 2050 }, { "epoch": 3.07981220657277, "eval_loss": 0.047751668840646744, "eval_runtime": 157.4059, "eval_samples_per_second": 7.522, "eval_steps_per_second": 7.522, "step": 2050 }, { "epoch": 3.094835680751174, "grad_norm": 0.4817383885383606, "learning_rate": 3.799108939177118e-05, "loss": 0.011, "step": 2060 }, { "epoch": 3.1098591549295773, "grad_norm": 0.15736213326454163, "learning_rate": 3.7482129303624934e-05, "loss": 0.0098, "step": 2070 }, { "epoch": 3.124882629107981, "grad_norm": 0.3768656551837921, "learning_rate": 3.697454929158901e-05, "loss": 0.0123, "step": 2080 }, { "epoch": 3.139906103286385, "grad_norm": 0.37794822454452515, "learning_rate": 3.6468405315583854e-05, "loss": 0.0155, "step": 2090 }, { "epoch": 3.1549295774647885, "grad_norm": 0.47674834728240967, "learning_rate": 3.59637531772092e-05, "loss": 0.0127, "step": 2100 }, { "epoch": 3.1549295774647885, "eval_loss": 0.046006690710783005, "eval_runtime": 157.5212, "eval_samples_per_second": 7.516, "eval_steps_per_second": 7.516, "step": 2100 }, { "epoch": 3.1699530516431924, "grad_norm": 0.2169833481311798, "learning_rate": 3.546064851359192e-05, "loss": 0.0048, "step": 2110 }, { "epoch": 3.1849765258215963, "grad_norm": 0.12608371675014496, "learning_rate": 3.495914679125212e-05, "loss": 0.0125, "step": 2120 }, { "epoch": 3.2, "grad_norm": 0.21189413964748383, "learning_rate": 3.445930329998819e-05, "loss": 0.0102, "step": 2130 }, { "epoch": 3.2150234741784036, "grad_norm": 0.29917919635772705, "learning_rate": 3.396117314678097e-05, "loss": 0.0173, "step": 2140 }, { "epoch": 3.2300469483568075, "grad_norm": 0.29451966285705566, "learning_rate": 3.3464811249718474e-05, "loss": 0.0112, "step": 2150 }, { "epoch": 3.2300469483568075, "eval_loss": 0.04488084837794304, "eval_runtime": 157.4858, "eval_samples_per_second": 7.518, "eval_steps_per_second": 7.518, "step": 2150 }, { "epoch": 3.2450704225352114, "grad_norm": 0.1548345535993576, "learning_rate": 3.297027233194114e-05, "loss": 0.0128, "step": 2160 }, { "epoch": 3.260093896713615, "grad_norm": 0.08690305054187775, "learning_rate": 3.2477610915608704e-05, "loss": 0.01, "step": 2170 }, { "epoch": 3.2751173708920187, "grad_norm": 0.26383867859840393, "learning_rate": 3.1986881315889315e-05, "loss": 0.014, "step": 2180 }, { "epoch": 3.2901408450704226, "grad_norm": 0.4667436480522156, "learning_rate": 3.149813763497124e-05, "loss": 0.0174, "step": 2190 }, { "epoch": 3.3051643192488265, "grad_norm": 0.3968455493450165, "learning_rate": 3.101143375609818e-05, "loss": 0.0145, "step": 2200 }, { "epoch": 3.3051643192488265, "eval_loss": 0.04522354528307915, "eval_runtime": 157.475, "eval_samples_per_second": 7.519, "eval_steps_per_second": 7.519, "step": 2200 }, { "epoch": 3.32018779342723, "grad_norm": 0.2882903814315796, "learning_rate": 3.0526823337628915e-05, "loss": 0.0143, "step": 2210 }, { "epoch": 3.335211267605634, "grad_norm": 0.2159668207168579, "learning_rate": 3.004435980712129e-05, "loss": 0.0094, "step": 2220 }, { "epoch": 3.3502347417840377, "grad_norm": 0.6046177744865417, "learning_rate": 2.9564096355442116e-05, "loss": 0.0133, "step": 2230 }, { "epoch": 3.365258215962441, "grad_norm": 0.30788397789001465, "learning_rate": 2.9086085930902824e-05, "loss": 0.0143, "step": 2240 }, { "epoch": 3.380281690140845, "grad_norm": 0.3591701090335846, "learning_rate": 2.8610381233422058e-05, "loss": 0.0086, "step": 2250 }, { "epoch": 3.380281690140845, "eval_loss": 0.04514958709478378, "eval_runtime": 157.5469, "eval_samples_per_second": 7.515, "eval_steps_per_second": 7.515, "step": 2250 }, { "epoch": 3.395305164319249, "grad_norm": 0.18443742394447327, "learning_rate": 2.8137034708715592e-05, "loss": 0.0106, "step": 2260 }, { "epoch": 3.4103286384976528, "grad_norm": 0.02844012901186943, "learning_rate": 2.7666098542514273e-05, "loss": 0.0066, "step": 2270 }, { "epoch": 3.425352112676056, "grad_norm": 0.5778037309646606, "learning_rate": 2.719762465481055e-05, "loss": 0.0117, "step": 2280 }, { "epoch": 3.44037558685446, "grad_norm": 0.17093347012996674, "learning_rate": 2.6731664694134473e-05, "loss": 0.0133, "step": 2290 }, { "epoch": 3.455399061032864, "grad_norm": 0.5244073271751404, "learning_rate": 2.6268270031859476e-05, "loss": 0.0161, "step": 2300 }, { "epoch": 3.455399061032864, "eval_loss": 0.04896237701177597, "eval_runtime": 157.594, "eval_samples_per_second": 7.513, "eval_steps_per_second": 7.513, "step": 2300 }, { "epoch": 3.4704225352112674, "grad_norm": 0.5648406147956848, "learning_rate": 2.580749175653877e-05, "loss": 0.0108, "step": 2310 }, { "epoch": 3.4854460093896713, "grad_norm": 0.5017344355583191, "learning_rate": 2.5349380668272905e-05, "loss": 0.0124, "step": 2320 }, { "epoch": 3.500469483568075, "grad_norm": 0.1720559000968933, "learning_rate": 2.489398727310908e-05, "loss": 0.013, "step": 2330 }, { "epoch": 3.5154929577464786, "grad_norm": 0.34457623958587646, "learning_rate": 2.4441361777473066e-05, "loss": 0.0107, "step": 2340 }, { "epoch": 3.5305164319248825, "grad_norm": 0.8733149766921997, "learning_rate": 2.3991554082633912e-05, "loss": 0.0134, "step": 2350 }, { "epoch": 3.5305164319248825, "eval_loss": 0.04652281105518341, "eval_runtime": 157.7619, "eval_samples_per_second": 7.505, "eval_steps_per_second": 7.505, "step": 2350 }, { "epoch": 3.5455399061032864, "grad_norm": 0.13055922091007233, "learning_rate": 2.354461377920239e-05, "loss": 0.0088, "step": 2360 }, { "epoch": 3.5605633802816903, "grad_norm": 0.20548763871192932, "learning_rate": 2.3100590141663807e-05, "loss": 0.0115, "step": 2370 }, { "epoch": 3.575586854460094, "grad_norm": 0.3360619843006134, "learning_rate": 2.265953212294551e-05, "loss": 0.0182, "step": 2380 }, { "epoch": 3.5906103286384976, "grad_norm": 0.6770099401473999, "learning_rate": 2.2221488349019903e-05, "loss": 0.0164, "step": 2390 }, { "epoch": 3.6056338028169015, "grad_norm": 0.31347426772117615, "learning_rate": 2.1786507113543457e-05, "loss": 0.009, "step": 2400 }, { "epoch": 3.6056338028169015, "eval_loss": 0.04605977609753609, "eval_runtime": 157.7592, "eval_samples_per_second": 7.505, "eval_steps_per_second": 7.505, "step": 2400 }, { "epoch": 3.6206572769953054, "grad_norm": 0.13689975440502167, "learning_rate": 2.1354636372532523e-05, "loss": 0.0176, "step": 2410 }, { "epoch": 3.635680751173709, "grad_norm": 0.2913866937160492, "learning_rate": 2.092592373907617e-05, "loss": 0.0101, "step": 2420 }, { "epoch": 3.6507042253521127, "grad_norm": 0.37444183230400085, "learning_rate": 2.0500416478086932e-05, "loss": 0.0075, "step": 2430 }, { "epoch": 3.6657276995305166, "grad_norm": 0.42321744561195374, "learning_rate": 2.0078161501089954e-05, "loss": 0.0117, "step": 2440 }, { "epoch": 3.68075117370892, "grad_norm": 0.15104345977306366, "learning_rate": 1.9659205361050982e-05, "loss": 0.0056, "step": 2450 }, { "epoch": 3.68075117370892, "eval_loss": 0.04802750423550606, "eval_runtime": 157.7669, "eval_samples_per_second": 7.505, "eval_steps_per_second": 7.505, "step": 2450 }, { "epoch": 3.695774647887324, "grad_norm": 0.15667098760604858, "learning_rate": 1.924359424724408e-05, "loss": 0.0154, "step": 2460 }, { "epoch": 3.710798122065728, "grad_norm": 0.49284836649894714, "learning_rate": 1.8831373980159296e-05, "loss": 0.0145, "step": 2470 }, { "epoch": 3.7258215962441312, "grad_norm": 0.22378303110599518, "learning_rate": 1.8422590006450947e-05, "loss": 0.0063, "step": 2480 }, { "epoch": 3.740845070422535, "grad_norm": 0.20172594487667084, "learning_rate": 1.801728739392731e-05, "loss": 0.0089, "step": 2490 }, { "epoch": 3.755868544600939, "grad_norm": 0.2898995280265808, "learning_rate": 1.7615510826581904e-05, "loss": 0.0069, "step": 2500 }, { "epoch": 3.755868544600939, "eval_loss": 0.04896940663456917, "eval_runtime": 157.7817, "eval_samples_per_second": 7.504, "eval_steps_per_second": 7.504, "step": 2500 }, { "epoch": 3.770892018779343, "grad_norm": 0.32213857769966125, "learning_rate": 1.7217304599667146e-05, "loss": 0.0164, "step": 2510 }, { "epoch": 3.7859154929577463, "grad_norm": 0.06365455687046051, "learning_rate": 1.6822712614810893e-05, "loss": 0.01, "step": 2520 }, { "epoch": 3.80093896713615, "grad_norm": 0.2050347775220871, "learning_rate": 1.643177837517631e-05, "loss": 0.0117, "step": 2530 }, { "epoch": 3.815962441314554, "grad_norm": 0.40109461545944214, "learning_rate": 1.6044544980665767e-05, "loss": 0.0108, "step": 2540 }, { "epoch": 3.830985915492958, "grad_norm": 0.5502837300300598, "learning_rate": 1.5661055123169126e-05, "loss": 0.007, "step": 2550 }, { "epoch": 3.830985915492958, "eval_loss": 0.04891159385442734, "eval_runtime": 157.7489, "eval_samples_per_second": 7.506, "eval_steps_per_second": 7.506, "step": 2550 }, { "epoch": 3.8460093896713614, "grad_norm": 0.17208245396614075, "learning_rate": 1.5281351081856974e-05, "loss": 0.0079, "step": 2560 }, { "epoch": 3.8610328638497653, "grad_norm": 0.2847214639186859, "learning_rate": 1.4905474718519491e-05, "loss": 0.0075, "step": 2570 }, { "epoch": 3.876056338028169, "grad_norm": 0.36303362250328064, "learning_rate": 1.453346747295119e-05, "loss": 0.0106, "step": 2580 }, { "epoch": 3.8910798122065726, "grad_norm": 0.09206058084964752, "learning_rate": 1.4165370358382274e-05, "loss": 0.0117, "step": 2590 }, { "epoch": 3.9061032863849765, "grad_norm": 0.7500032782554626, "learning_rate": 1.3801223956956994e-05, "loss": 0.0147, "step": 2600 }, { "epoch": 3.9061032863849765, "eval_loss": 0.048022232949733734, "eval_runtime": 157.7412, "eval_samples_per_second": 7.506, "eval_steps_per_second": 7.506, "step": 2600 }, { "epoch": 3.9211267605633804, "grad_norm": 0.37353694438934326, "learning_rate": 1.344106841525946e-05, "loss": 0.0101, "step": 2610 }, { "epoch": 3.936150234741784, "grad_norm": 0.3442070484161377, "learning_rate": 1.3084943439887659e-05, "loss": 0.0111, "step": 2620 }, { "epoch": 3.9511737089201877, "grad_norm": 0.17266932129859924, "learning_rate": 1.273288829307579e-05, "loss": 0.0069, "step": 2630 }, { "epoch": 3.9661971830985916, "grad_norm": 0.09121805429458618, "learning_rate": 1.2384941788365622e-05, "loss": 0.0073, "step": 2640 }, { "epoch": 3.981220657276995, "grad_norm": 0.37061765789985657, "learning_rate": 1.2041142286327477e-05, "loss": 0.0123, "step": 2650 }, { "epoch": 3.981220657276995, "eval_loss": 0.0474095456302166, "eval_runtime": 157.8962, "eval_samples_per_second": 7.499, "eval_steps_per_second": 7.499, "step": 2650 }, { "epoch": 3.996244131455399, "grad_norm": 0.28421324491500854, "learning_rate": 1.170152769033095e-05, "loss": 0.0058, "step": 2660 }, { "epoch": 4.011267605633803, "grad_norm": 0.1725592464208603, "learning_rate": 1.1366135442366127e-05, "loss": 0.0066, "step": 2670 }, { "epoch": 4.026291079812206, "grad_norm": 0.144903302192688, "learning_rate": 1.103500251891571e-05, "loss": 0.0061, "step": 2680 }, { "epoch": 4.041314553990611, "grad_norm": 0.09540867805480957, "learning_rate": 1.0708165426878325e-05, "loss": 0.0042, "step": 2690 }, { "epoch": 4.056338028169014, "grad_norm": 0.04649324715137482, "learning_rate": 1.0385660199543812e-05, "loss": 0.0053, "step": 2700 }, { "epoch": 4.056338028169014, "eval_loss": 0.05291583761572838, "eval_runtime": 157.9865, "eval_samples_per_second": 7.494, "eval_steps_per_second": 7.494, "step": 2700 }, { "epoch": 4.0713615023474174, "grad_norm": 0.020398231223225594, "learning_rate": 1.0067522392620537e-05, "loss": 0.0037, "step": 2710 }, { "epoch": 4.086384976525822, "grad_norm": 0.25772586464881897, "learning_rate": 9.753787080315385e-06, "loss": 0.0084, "step": 2720 }, { "epoch": 4.101408450704225, "grad_norm": 0.0540861040353775, "learning_rate": 9.444488851467042e-06, "loss": 0.0051, "step": 2730 }, { "epoch": 4.1164319248826295, "grad_norm": 0.19499164819717407, "learning_rate": 9.139661805732435e-06, "loss": 0.0043, "step": 2740 }, { "epoch": 4.131455399061033, "grad_norm": 0.46163612604141235, "learning_rate": 8.839339549827397e-06, "loss": 0.0064, "step": 2750 }, { "epoch": 4.131455399061033, "eval_loss": 0.05712565779685974, "eval_runtime": 157.9487, "eval_samples_per_second": 7.496, "eval_steps_per_second": 7.496, "step": 2750 }, { "epoch": 4.146478873239436, "grad_norm": 0.07052065432071686, "learning_rate": 8.543555193821634e-06, "loss": 0.0045, "step": 2760 }, { "epoch": 4.161502347417841, "grad_norm": 0.3251402974128723, "learning_rate": 8.252341347488251e-06, "loss": 0.0032, "step": 2770 }, { "epoch": 4.176525821596244, "grad_norm": 0.21811746060848236, "learning_rate": 7.965730116708681e-06, "loss": 0.005, "step": 2780 }, { "epoch": 4.191549295774648, "grad_norm": 0.08314239978790283, "learning_rate": 7.68375309993304e-06, "loss": 0.0025, "step": 2790 }, { "epoch": 4.206572769953052, "grad_norm": 0.07649940252304077, "learning_rate": 7.406441384696372e-06, "loss": 0.0064, "step": 2800 }, { "epoch": 4.206572769953052, "eval_loss": 0.05896175280213356, "eval_runtime": 157.6893, "eval_samples_per_second": 7.508, "eval_steps_per_second": 7.508, "step": 2800 }, { "epoch": 4.221596244131455, "grad_norm": 0.14924947917461395, "learning_rate": 7.133825544191464e-06, "loss": 0.0088, "step": 2810 }, { "epoch": 4.236619718309859, "grad_norm": 0.07150545716285706, "learning_rate": 6.865935633897996e-06, "loss": 0.0019, "step": 2820 }, { "epoch": 4.251643192488263, "grad_norm": 0.13238117098808289, "learning_rate": 6.602801188269081e-06, "loss": 0.0057, "step": 2830 }, { "epoch": 4.266666666666667, "grad_norm": 0.182158961892128, "learning_rate": 6.344451217475183e-06, "loss": 0.0029, "step": 2840 }, { "epoch": 4.28169014084507, "grad_norm": 0.12668107450008392, "learning_rate": 6.090914204205655e-06, "loss": 0.0038, "step": 2850 }, { "epoch": 4.28169014084507, "eval_loss": 0.060705944895744324, "eval_runtime": 157.8239, "eval_samples_per_second": 7.502, "eval_steps_per_second": 7.502, "step": 2850 }, { "epoch": 4.296713615023474, "grad_norm": 0.22536417841911316, "learning_rate": 5.842218100528679e-06, "loss": 0.0029, "step": 2860 }, { "epoch": 4.311737089201878, "grad_norm": 0.2726947069168091, "learning_rate": 5.598390324809555e-06, "loss": 0.0041, "step": 2870 }, { "epoch": 4.326760563380281, "grad_norm": 0.3938266634941101, "learning_rate": 5.359457758687841e-06, "loss": 0.0048, "step": 2880 }, { "epoch": 4.341784037558686, "grad_norm": 0.29681265354156494, "learning_rate": 5.125446744113743e-06, "loss": 0.0088, "step": 2890 }, { "epoch": 4.356807511737089, "grad_norm": 0.08293790370225906, "learning_rate": 4.896383080443934e-06, "loss": 0.0011, "step": 2900 }, { "epoch": 4.356807511737089, "eval_loss": 0.06272595375776291, "eval_runtime": 157.9854, "eval_samples_per_second": 7.494, "eval_steps_per_second": 7.494, "step": 2900 }, { "epoch": 4.371830985915493, "grad_norm": 0.15698398649692535, "learning_rate": 4.672292021597174e-06, "loss": 0.0053, "step": 2910 }, { "epoch": 4.386854460093897, "grad_norm": 0.25638025999069214, "learning_rate": 4.4531982732702145e-06, "loss": 0.0055, "step": 2920 }, { "epoch": 4.4018779342723, "grad_norm": 0.1589283049106598, "learning_rate": 4.239125990213883e-06, "loss": 0.0045, "step": 2930 }, { "epoch": 4.416901408450705, "grad_norm": 0.06673490256071091, "learning_rate": 4.030098773570174e-06, "loss": 0.001, "step": 2940 }, { "epoch": 4.431924882629108, "grad_norm": 0.12453540414571762, "learning_rate": 3.826139668270234e-06, "loss": 0.0034, "step": 2950 }, { "epoch": 4.431924882629108, "eval_loss": 0.06274248659610748, "eval_runtime": 157.8646, "eval_samples_per_second": 7.5, "eval_steps_per_second": 7.5, "step": 2950 }, { "epoch": 4.446948356807511, "grad_norm": 0.18250524997711182, "learning_rate": 3.6272711604936504e-06, "loss": 0.0074, "step": 2960 }, { "epoch": 4.461971830985916, "grad_norm": 0.026577528566122055, "learning_rate": 3.433515175189428e-06, "loss": 0.0033, "step": 2970 }, { "epoch": 4.476995305164319, "grad_norm": 0.03135993331670761, "learning_rate": 3.2448930736588e-06, "loss": 0.0093, "step": 2980 }, { "epoch": 4.492018779342723, "grad_norm": 0.054898951202631, "learning_rate": 3.061425651200117e-06, "loss": 0.0031, "step": 2990 }, { "epoch": 4.507042253521127, "grad_norm": 0.12721896171569824, "learning_rate": 2.883133134816296e-06, "loss": 0.0063, "step": 3000 }, { "epoch": 4.507042253521127, "eval_loss": 0.062312256544828415, "eval_runtime": 157.8769, "eval_samples_per_second": 7.5, "eval_steps_per_second": 7.5, "step": 3000 }, { "epoch": 4.52206572769953, "grad_norm": 0.050971172749996185, "learning_rate": 2.7100351809847326e-06, "loss": 0.0026, "step": 3010 }, { "epoch": 4.537089201877935, "grad_norm": 0.06001908332109451, "learning_rate": 2.542150873490251e-06, "loss": 0.0026, "step": 3020 }, { "epoch": 4.552112676056338, "grad_norm": 0.1522800624370575, "learning_rate": 2.3794987213211383e-06, "loss": 0.0039, "step": 3030 }, { "epoch": 4.567136150234742, "grad_norm": 0.19140265882015228, "learning_rate": 2.222096656628547e-06, "loss": 0.0041, "step": 3040 }, { "epoch": 4.582159624413146, "grad_norm": 0.031790439039468765, "learning_rate": 2.0699620327495174e-06, "loss": 0.0033, "step": 3050 }, { "epoch": 4.582159624413146, "eval_loss": 0.06346932053565979, "eval_runtime": 157.947, "eval_samples_per_second": 7.496, "eval_steps_per_second": 7.496, "step": 3050 }, { "epoch": 4.597183098591549, "grad_norm": 0.011058680713176727, "learning_rate": 1.9231116222937996e-06, "loss": 0.0014, "step": 3060 }, { "epoch": 4.612206572769953, "grad_norm": 0.051989588886499405, "learning_rate": 1.7815616152946523e-06, "loss": 0.0022, "step": 3070 }, { "epoch": 4.627230046948357, "grad_norm": 0.28735625743865967, "learning_rate": 1.6453276174240195e-06, "loss": 0.0016, "step": 3080 }, { "epoch": 4.642253521126761, "grad_norm": 0.7765727043151855, "learning_rate": 1.5144246482719114e-06, "loss": 0.0032, "step": 3090 }, { "epoch": 4.657276995305164, "grad_norm": 0.16394619643688202, "learning_rate": 1.3888671396905805e-06, "loss": 0.0048, "step": 3100 }, { "epoch": 4.657276995305164, "eval_loss": 0.06427957862615585, "eval_runtime": 157.8435, "eval_samples_per_second": 7.501, "eval_steps_per_second": 7.501, "step": 3100 }, { "epoch": 4.672300469483568, "grad_norm": 0.07136521488428116, "learning_rate": 1.2686689342034431e-06, "loss": 0.0075, "step": 3110 }, { "epoch": 4.687323943661972, "grad_norm": 0.06406821310520172, "learning_rate": 1.1538432834789227e-06, "loss": 0.0058, "step": 3120 }, { "epoch": 4.702347417840375, "grad_norm": 0.42130792140960693, "learning_rate": 1.044402846869491e-06, "loss": 0.0041, "step": 3130 }, { "epoch": 4.71737089201878, "grad_norm": 0.6147840023040771, "learning_rate": 9.403596900160073e-07, "loss": 0.003, "step": 3140 }, { "epoch": 4.732394366197183, "grad_norm": 0.4716566205024719, "learning_rate": 8.417252835174749e-07, "loss": 0.0044, "step": 3150 }, { "epoch": 4.732394366197183, "eval_loss": 0.06363623589277267, "eval_runtime": 157.8646, "eval_samples_per_second": 7.5, "eval_steps_per_second": 7.5, "step": 3150 }, { "epoch": 4.7474178403755865, "grad_norm": 0.2807024419307709, "learning_rate": 7.48510501666455e-07, "loss": 0.0079, "step": 3160 }, { "epoch": 4.762441314553991, "grad_norm": 0.3779400587081909, "learning_rate": 6.607256212501578e-07, "loss": 0.0041, "step": 3170 }, { "epoch": 4.777464788732394, "grad_norm": 0.3377034068107605, "learning_rate": 5.783803204174654e-07, "loss": 0.005, "step": 3180 }, { "epoch": 4.792488262910798, "grad_norm": 0.15139789879322052, "learning_rate": 5.014836776119358e-07, "loss": 0.0027, "step": 3190 }, { "epoch": 4.807511737089202, "grad_norm": 0.15307891368865967, "learning_rate": 4.300441705708924e-07, "loss": 0.0031, "step": 3200 }, { "epoch": 4.807511737089202, "eval_loss": 0.06359302997589111, "eval_runtime": 157.9251, "eval_samples_per_second": 7.497, "eval_steps_per_second": 7.497, "step": 3200 }, { "epoch": 4.822535211267605, "grad_norm": 0.28907057642936707, "learning_rate": 3.6406967539078796e-07, "loss": 0.0047, "step": 3210 }, { "epoch": 4.83755868544601, "grad_norm": 0.10809265822172165, "learning_rate": 3.0356746565887715e-07, "loss": 0.004, "step": 3220 }, { "epoch": 4.852582159624413, "grad_norm": 0.07856517285108566, "learning_rate": 2.485442116513026e-07, "loss": 0.0031, "step": 3230 }, { "epoch": 4.867605633802817, "grad_norm": 0.5275461673736572, "learning_rate": 1.9900597959770507e-07, "loss": 0.0087, "step": 3240 }, { "epoch": 4.882629107981221, "grad_norm": 0.014800317585468292, "learning_rate": 1.5495823101245866e-07, "loss": 0.0022, "step": 3250 }, { "epoch": 4.882629107981221, "eval_loss": 0.06390237808227539, "eval_runtime": 157.8776, "eval_samples_per_second": 7.499, "eval_steps_per_second": 7.499, "step": 3250 }, { "epoch": 4.897652582159624, "grad_norm": 0.04556208476424217, "learning_rate": 1.164058220925135e-07, "loss": 0.006, "step": 3260 }, { "epoch": 4.912676056338028, "grad_norm": 0.20600998401641846, "learning_rate": 8.335300318201844e-08, "loss": 0.0067, "step": 3270 }, { "epoch": 4.927699530516432, "grad_norm": 0.8991699814796448, "learning_rate": 5.5803418303745917e-08, "loss": 0.008, "step": 3280 }, { "epoch": 4.942723004694836, "grad_norm": 0.015831077471375465, "learning_rate": 3.3760104757313284e-08, "loss": 0.0015, "step": 3290 }, { "epoch": 4.957746478873239, "grad_norm": 0.1406923532485962, "learning_rate": 1.7225492784345156e-08, "loss": 0.0078, "step": 3300 }, { "epoch": 4.957746478873239, "eval_loss": 0.06379479169845581, "eval_runtime": 157.8707, "eval_samples_per_second": 7.5, "eval_steps_per_second": 7.5, "step": 3300 }, { "epoch": 4.972769953051643, "grad_norm": 0.062291864305734634, "learning_rate": 6.201405300532148e-09, "loss": 0.006, "step": 3310 }, { "epoch": 4.987793427230047, "grad_norm": 2.030796527862549, "learning_rate": 6.890576946805282e-10, "loss": 0.0032, "step": 3320 }, { "epoch": 4.995305164319249, "step": 3325, "total_flos": 8.345060490986127e+17, "train_loss": 0.04504659976081965, "train_runtime": 35784.009, "train_samples_per_second": 1.488, "train_steps_per_second": 0.093 } ], "logging_steps": 10, "max_steps": 3325, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.345060490986127e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }