| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 5.0, | |
| "eval_steps": 500, | |
| "global_step": 191805, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.013034071061755428, | |
| "grad_norm": 2.9903717041015625, | |
| "learning_rate": 4.9869659289382446e-05, | |
| "loss": 6.0967, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.026068142123510857, | |
| "grad_norm": 4.055346965789795, | |
| "learning_rate": 4.973931857876489e-05, | |
| "loss": 5.5622, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.039102213185266285, | |
| "grad_norm": 5.480607032775879, | |
| "learning_rate": 4.960897786814734e-05, | |
| "loss": 5.4371, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.052136284247021714, | |
| "grad_norm": 4.727134704589844, | |
| "learning_rate": 4.9478637157529784e-05, | |
| "loss": 5.3535, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.06517035530877714, | |
| "grad_norm": 4.737260341644287, | |
| "learning_rate": 4.934829644691223e-05, | |
| "loss": 5.2508, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.07820442637053257, | |
| "grad_norm": 5.066771984100342, | |
| "learning_rate": 4.921795573629467e-05, | |
| "loss": 5.199, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.091238497432288, | |
| "grad_norm": 3.627026319503784, | |
| "learning_rate": 4.908761502567713e-05, | |
| "loss": 5.084, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.10427256849404343, | |
| "grad_norm": 4.254016876220703, | |
| "learning_rate": 4.895727431505957e-05, | |
| "loss": 4.9441, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.11730663955579886, | |
| "grad_norm": 6.351306438446045, | |
| "learning_rate": 4.8826933604442015e-05, | |
| "loss": 4.8021, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.13034071061755428, | |
| "grad_norm": 7.492619037628174, | |
| "learning_rate": 4.869659289382446e-05, | |
| "loss": 4.6446, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.14337478167930973, | |
| "grad_norm": 6.017455577850342, | |
| "learning_rate": 4.856625218320691e-05, | |
| "loss": 4.4574, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.15640885274106514, | |
| "grad_norm": 5.2971343994140625, | |
| "learning_rate": 4.843591147258935e-05, | |
| "loss": 4.2184, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.16944292380282058, | |
| "grad_norm": 9.367820739746094, | |
| "learning_rate": 4.8305570761971796e-05, | |
| "loss": 4.101, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.182476994864576, | |
| "grad_norm": 7.676972389221191, | |
| "learning_rate": 4.817523005135424e-05, | |
| "loss": 3.9548, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.19551106592633144, | |
| "grad_norm": 6.3607563972473145, | |
| "learning_rate": 4.804488934073669e-05, | |
| "loss": 3.8584, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.20854513698808685, | |
| "grad_norm": 5.45451021194458, | |
| "learning_rate": 4.7914548630119134e-05, | |
| "loss": 3.7841, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.2215792080498423, | |
| "grad_norm": 16.199485778808594, | |
| "learning_rate": 4.778420791950158e-05, | |
| "loss": 3.6685, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.2346132791115977, | |
| "grad_norm": 6.077032089233398, | |
| "learning_rate": 4.765386720888402e-05, | |
| "loss": 3.6017, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.24764735017335315, | |
| "grad_norm": 11.489569664001465, | |
| "learning_rate": 4.752352649826647e-05, | |
| "loss": 3.5553, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.26068142123510857, | |
| "grad_norm": 4.917782783508301, | |
| "learning_rate": 4.7393185787648915e-05, | |
| "loss": 3.4537, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.273715492296864, | |
| "grad_norm": 5.945028781890869, | |
| "learning_rate": 4.7262845077031366e-05, | |
| "loss": 3.4442, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.28674956335861945, | |
| "grad_norm": 7.648957252502441, | |
| "learning_rate": 4.713250436641381e-05, | |
| "loss": 3.3772, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.29978363442037487, | |
| "grad_norm": 7.488467216491699, | |
| "learning_rate": 4.700216365579625e-05, | |
| "loss": 3.3026, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.3128177054821303, | |
| "grad_norm": 5.8792619705200195, | |
| "learning_rate": 4.68718229451787e-05, | |
| "loss": 3.2446, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.3258517765438857, | |
| "grad_norm": 10.038032531738281, | |
| "learning_rate": 4.674148223456115e-05, | |
| "loss": 3.216, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.33888584760564117, | |
| "grad_norm": 7.69769811630249, | |
| "learning_rate": 4.661114152394359e-05, | |
| "loss": 3.1869, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.3519199186673966, | |
| "grad_norm": 6.179595470428467, | |
| "learning_rate": 4.6480800813326034e-05, | |
| "loss": 3.1464, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.364953989729152, | |
| "grad_norm": 5.665715217590332, | |
| "learning_rate": 4.6350460102708484e-05, | |
| "loss": 3.079, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.3779880607909074, | |
| "grad_norm": 4.681985855102539, | |
| "learning_rate": 4.622011939209093e-05, | |
| "loss": 3.0724, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.3910221318526629, | |
| "grad_norm": 11.111820220947266, | |
| "learning_rate": 4.608977868147337e-05, | |
| "loss": 3.0356, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.4040562029144183, | |
| "grad_norm": 5.951188564300537, | |
| "learning_rate": 4.5959437970855815e-05, | |
| "loss": 3.01, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.4170902739761737, | |
| "grad_norm": 5.438151836395264, | |
| "learning_rate": 4.5829097260238266e-05, | |
| "loss": 2.9605, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.4301243450379291, | |
| "grad_norm": 10.49527645111084, | |
| "learning_rate": 4.569875654962071e-05, | |
| "loss": 2.9453, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.4431584160996846, | |
| "grad_norm": 6.611765384674072, | |
| "learning_rate": 4.556841583900316e-05, | |
| "loss": 2.9529, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.45619248716144, | |
| "grad_norm": 5.289289474487305, | |
| "learning_rate": 4.54380751283856e-05, | |
| "loss": 2.9081, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.4692265582231954, | |
| "grad_norm": 5.65715217590332, | |
| "learning_rate": 4.530773441776805e-05, | |
| "loss": 2.8152, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.48226062928495084, | |
| "grad_norm": 5.513209819793701, | |
| "learning_rate": 4.51773937071505e-05, | |
| "loss": 2.8664, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.4952947003467063, | |
| "grad_norm": 4.413240909576416, | |
| "learning_rate": 4.504705299653294e-05, | |
| "loss": 2.8854, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.5083287714084617, | |
| "grad_norm": 5.602241039276123, | |
| "learning_rate": 4.4916712285915384e-05, | |
| "loss": 2.8295, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.5213628424702171, | |
| "grad_norm": 8.221460342407227, | |
| "learning_rate": 4.478637157529783e-05, | |
| "loss": 2.7826, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.5343969135319726, | |
| "grad_norm": 5.350883483886719, | |
| "learning_rate": 4.465603086468028e-05, | |
| "loss": 2.7846, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.547430984593728, | |
| "grad_norm": 6.6059393882751465, | |
| "learning_rate": 4.452569015406272e-05, | |
| "loss": 2.7562, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.5604650556554834, | |
| "grad_norm": 7.050083637237549, | |
| "learning_rate": 4.4395349443445166e-05, | |
| "loss": 2.7102, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.5734991267172389, | |
| "grad_norm": 6.74811315536499, | |
| "learning_rate": 4.426500873282761e-05, | |
| "loss": 2.7215, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.5865331977789943, | |
| "grad_norm": 7.959073543548584, | |
| "learning_rate": 4.413466802221006e-05, | |
| "loss": 2.7185, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.5995672688407497, | |
| "grad_norm": 7.594911098480225, | |
| "learning_rate": 4.40043273115925e-05, | |
| "loss": 2.6624, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.6126013399025051, | |
| "grad_norm": 5.935075283050537, | |
| "learning_rate": 4.3873986600974954e-05, | |
| "loss": 2.6398, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 0.6256354109642606, | |
| "grad_norm": 7.0315961837768555, | |
| "learning_rate": 4.37436458903574e-05, | |
| "loss": 2.6571, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.638669482026016, | |
| "grad_norm": 6.930845260620117, | |
| "learning_rate": 4.361330517973984e-05, | |
| "loss": 2.6009, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 0.6517035530877714, | |
| "grad_norm": 14.607309341430664, | |
| "learning_rate": 4.348296446912229e-05, | |
| "loss": 2.6493, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.6647376241495269, | |
| "grad_norm": 5.613809108734131, | |
| "learning_rate": 4.3352623758504735e-05, | |
| "loss": 2.6042, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 0.6777716952112823, | |
| "grad_norm": 6.0553693771362305, | |
| "learning_rate": 4.322228304788718e-05, | |
| "loss": 2.6153, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.6908057662730377, | |
| "grad_norm": 8.716107368469238, | |
| "learning_rate": 4.309194233726962e-05, | |
| "loss": 2.5757, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 0.7038398373347932, | |
| "grad_norm": 7.430722713470459, | |
| "learning_rate": 4.296160162665207e-05, | |
| "loss": 2.5682, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 0.7168739083965486, | |
| "grad_norm": 9.687034606933594, | |
| "learning_rate": 4.2831260916034516e-05, | |
| "loss": 2.5377, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 0.729907979458304, | |
| "grad_norm": 3.729767084121704, | |
| "learning_rate": 4.270092020541696e-05, | |
| "loss": 2.5217, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.7429420505200595, | |
| "grad_norm": 9.692636489868164, | |
| "learning_rate": 4.25705794947994e-05, | |
| "loss": 2.4829, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 0.7559761215818148, | |
| "grad_norm": 8.260266304016113, | |
| "learning_rate": 4.2440238784181854e-05, | |
| "loss": 2.4971, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 0.7690101926435703, | |
| "grad_norm": 5.885035037994385, | |
| "learning_rate": 4.23098980735643e-05, | |
| "loss": 2.4823, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 0.7820442637053258, | |
| "grad_norm": 11.001029968261719, | |
| "learning_rate": 4.217955736294674e-05, | |
| "loss": 2.4583, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.7950783347670811, | |
| "grad_norm": 9.69256591796875, | |
| "learning_rate": 4.204921665232919e-05, | |
| "loss": 2.447, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 0.8081124058288366, | |
| "grad_norm": 15.954379081726074, | |
| "learning_rate": 4.191887594171164e-05, | |
| "loss": 2.4427, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 0.8211464768905921, | |
| "grad_norm": 5.421440124511719, | |
| "learning_rate": 4.1788535231094085e-05, | |
| "loss": 2.4181, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 0.8341805479523474, | |
| "grad_norm": 9.169551849365234, | |
| "learning_rate": 4.165819452047653e-05, | |
| "loss": 2.4105, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 0.8472146190141029, | |
| "grad_norm": 5.778009414672852, | |
| "learning_rate": 4.152785380985897e-05, | |
| "loss": 2.4145, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 0.8602486900758582, | |
| "grad_norm": 6.441959857940674, | |
| "learning_rate": 4.139751309924142e-05, | |
| "loss": 2.4334, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 0.8732827611376137, | |
| "grad_norm": 7.385718822479248, | |
| "learning_rate": 4.1267172388623866e-05, | |
| "loss": 2.392, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 0.8863168321993692, | |
| "grad_norm": 15.347734451293945, | |
| "learning_rate": 4.113683167800631e-05, | |
| "loss": 2.3981, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 0.8993509032611245, | |
| "grad_norm": 10.47854232788086, | |
| "learning_rate": 4.1006490967388754e-05, | |
| "loss": 2.3511, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 0.91238497432288, | |
| "grad_norm": 11.82073974609375, | |
| "learning_rate": 4.0876150256771204e-05, | |
| "loss": 2.3632, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 0.9254190453846355, | |
| "grad_norm": 8.932971954345703, | |
| "learning_rate": 4.074580954615365e-05, | |
| "loss": 2.3272, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 0.9384531164463908, | |
| "grad_norm": 11.068861961364746, | |
| "learning_rate": 4.061546883553609e-05, | |
| "loss": 2.3321, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 0.9514871875081463, | |
| "grad_norm": 5.649448871612549, | |
| "learning_rate": 4.0485128124918535e-05, | |
| "loss": 2.3498, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 0.9645212585699017, | |
| "grad_norm": 9.020928382873535, | |
| "learning_rate": 4.0354787414300985e-05, | |
| "loss": 2.3331, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 0.9775553296316571, | |
| "grad_norm": 12.966954231262207, | |
| "learning_rate": 4.0224446703683436e-05, | |
| "loss": 2.3095, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 0.9905894006934126, | |
| "grad_norm": 5.641653060913086, | |
| "learning_rate": 4.009410599306588e-05, | |
| "loss": 2.3127, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 1.003623471755168, | |
| "grad_norm": 8.139008522033691, | |
| "learning_rate": 3.996376528244832e-05, | |
| "loss": 2.2846, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 1.0166575428169233, | |
| "grad_norm": 7.005831241607666, | |
| "learning_rate": 3.9833424571830766e-05, | |
| "loss": 2.2518, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 1.029691613878679, | |
| "grad_norm": 3.906301975250244, | |
| "learning_rate": 3.970308386121322e-05, | |
| "loss": 2.2632, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 1.0427256849404343, | |
| "grad_norm": 4.201974391937256, | |
| "learning_rate": 3.957274315059566e-05, | |
| "loss": 2.2299, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 1.0557597560021896, | |
| "grad_norm": 6.107882022857666, | |
| "learning_rate": 3.9442402439978104e-05, | |
| "loss": 2.2016, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 1.0687938270639452, | |
| "grad_norm": 8.289084434509277, | |
| "learning_rate": 3.931206172936055e-05, | |
| "loss": 2.2227, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 1.0818278981257006, | |
| "grad_norm": 5.386382102966309, | |
| "learning_rate": 3.9181721018743e-05, | |
| "loss": 2.1849, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 1.094861969187456, | |
| "grad_norm": 5.536214828491211, | |
| "learning_rate": 3.905138030812544e-05, | |
| "loss": 2.2085, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 1.1078960402492115, | |
| "grad_norm": 67.06414031982422, | |
| "learning_rate": 3.8921039597507885e-05, | |
| "loss": 2.2039, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 1.1209301113109669, | |
| "grad_norm": 8.36019229888916, | |
| "learning_rate": 3.879069888689033e-05, | |
| "loss": 2.1925, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 1.1339641823727222, | |
| "grad_norm": 14.266386985778809, | |
| "learning_rate": 3.866035817627278e-05, | |
| "loss": 2.2101, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 1.1469982534344778, | |
| "grad_norm": 11.47070598602295, | |
| "learning_rate": 3.853001746565523e-05, | |
| "loss": 2.1402, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 1.1600323244962332, | |
| "grad_norm": 5.293683052062988, | |
| "learning_rate": 3.839967675503767e-05, | |
| "loss": 2.1872, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 1.1730663955579885, | |
| "grad_norm": 32.234737396240234, | |
| "learning_rate": 3.826933604442012e-05, | |
| "loss": 2.1357, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 1.1861004666197439, | |
| "grad_norm": 3.9005160331726074, | |
| "learning_rate": 3.813899533380256e-05, | |
| "loss": 2.1263, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 1.1991345376814995, | |
| "grad_norm": 9.012932777404785, | |
| "learning_rate": 3.800865462318501e-05, | |
| "loss": 2.1718, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 1.2121686087432548, | |
| "grad_norm": 8.86204719543457, | |
| "learning_rate": 3.7878313912567454e-05, | |
| "loss": 2.1718, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 1.2252026798050104, | |
| "grad_norm": 29.908674240112305, | |
| "learning_rate": 3.77479732019499e-05, | |
| "loss": 2.1227, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 1.2382367508667658, | |
| "grad_norm": 3.599839687347412, | |
| "learning_rate": 3.761763249133234e-05, | |
| "loss": 2.1301, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 1.2512708219285211, | |
| "grad_norm": 12.039328575134277, | |
| "learning_rate": 3.748729178071479e-05, | |
| "loss": 2.1226, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 1.2643048929902765, | |
| "grad_norm": 3.92248797416687, | |
| "learning_rate": 3.7356951070097236e-05, | |
| "loss": 2.156, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 1.277338964052032, | |
| "grad_norm": 22.514301300048828, | |
| "learning_rate": 3.722661035947968e-05, | |
| "loss": 2.1001, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 1.2903730351137874, | |
| "grad_norm": 4.8082990646362305, | |
| "learning_rate": 3.709626964886212e-05, | |
| "loss": 2.1167, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 1.303407106175543, | |
| "grad_norm": 7.884994983673096, | |
| "learning_rate": 3.696592893824457e-05, | |
| "loss": 2.1118, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 1.3164411772372984, | |
| "grad_norm": 4.282125949859619, | |
| "learning_rate": 3.6835588227627024e-05, | |
| "loss": 2.0749, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 1.3294752482990537, | |
| "grad_norm": 19.30133819580078, | |
| "learning_rate": 3.670524751700947e-05, | |
| "loss": 2.1081, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 1.342509319360809, | |
| "grad_norm": 3.800236463546753, | |
| "learning_rate": 3.657490680639191e-05, | |
| "loss": 2.0964, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 1.3555433904225647, | |
| "grad_norm": 5.734689235687256, | |
| "learning_rate": 3.6444566095774355e-05, | |
| "loss": 2.0736, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 1.36857746148432, | |
| "grad_norm": 7.496071815490723, | |
| "learning_rate": 3.6314225385156805e-05, | |
| "loss": 2.0545, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 1.3816115325460754, | |
| "grad_norm": 7.645195007324219, | |
| "learning_rate": 3.618388467453925e-05, | |
| "loss": 2.0407, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 1.394645603607831, | |
| "grad_norm": 22.738969802856445, | |
| "learning_rate": 3.605354396392169e-05, | |
| "loss": 2.0554, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 1.4076796746695863, | |
| "grad_norm": 9.185379028320312, | |
| "learning_rate": 3.5923203253304136e-05, | |
| "loss": 2.0364, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 1.4207137457313417, | |
| "grad_norm": 9.092364311218262, | |
| "learning_rate": 3.5792862542686586e-05, | |
| "loss": 2.023, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 1.433747816793097, | |
| "grad_norm": 3.8213064670562744, | |
| "learning_rate": 3.566252183206903e-05, | |
| "loss": 2.0429, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 1.4467818878548526, | |
| "grad_norm": 15.87769603729248, | |
| "learning_rate": 3.553218112145147e-05, | |
| "loss": 1.9853, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 1.459815958916608, | |
| "grad_norm": 8.585647583007812, | |
| "learning_rate": 3.540184041083392e-05, | |
| "loss": 2.0239, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 1.4728500299783636, | |
| "grad_norm": 4.249543190002441, | |
| "learning_rate": 3.527149970021637e-05, | |
| "loss": 2.0305, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 1.485884101040119, | |
| "grad_norm": 6.320367336273193, | |
| "learning_rate": 3.514115898959881e-05, | |
| "loss": 2.0173, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 1.4989181721018743, | |
| "grad_norm": 5.058931350708008, | |
| "learning_rate": 3.501081827898126e-05, | |
| "loss": 1.9641, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 1.5119522431636296, | |
| "grad_norm": 10.568583488464355, | |
| "learning_rate": 3.4880477568363705e-05, | |
| "loss": 2.035, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 1.524986314225385, | |
| "grad_norm": 6.535768985748291, | |
| "learning_rate": 3.475013685774615e-05, | |
| "loss": 1.9971, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 1.5380203852871406, | |
| "grad_norm": 11.262877464294434, | |
| "learning_rate": 3.46197961471286e-05, | |
| "loss": 2.0076, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 1.5510544563488962, | |
| "grad_norm": 8.998533248901367, | |
| "learning_rate": 3.448945543651104e-05, | |
| "loss": 1.986, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 1.5640885274106515, | |
| "grad_norm": 5.243868827819824, | |
| "learning_rate": 3.4359114725893486e-05, | |
| "loss": 2.0148, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 1.5771225984724069, | |
| "grad_norm": 6.43707275390625, | |
| "learning_rate": 3.422877401527593e-05, | |
| "loss": 1.9952, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 1.5901566695341622, | |
| "grad_norm": 10.8756742477417, | |
| "learning_rate": 3.409843330465838e-05, | |
| "loss": 1.9688, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 1.6031907405959176, | |
| "grad_norm": 3.6488418579101562, | |
| "learning_rate": 3.3968092594040824e-05, | |
| "loss": 1.9545, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 1.6162248116576732, | |
| "grad_norm": 3.8945696353912354, | |
| "learning_rate": 3.383775188342327e-05, | |
| "loss": 1.9692, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 1.6292588827194285, | |
| "grad_norm": 4.477757453918457, | |
| "learning_rate": 3.370741117280571e-05, | |
| "loss": 1.9559, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 1.6422929537811841, | |
| "grad_norm": 5.086141586303711, | |
| "learning_rate": 3.357707046218816e-05, | |
| "loss": 1.929, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 1.6553270248429395, | |
| "grad_norm": 5.249891757965088, | |
| "learning_rate": 3.3446729751570605e-05, | |
| "loss": 1.9686, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 1.6683610959046948, | |
| "grad_norm": 9.6456880569458, | |
| "learning_rate": 3.3316389040953055e-05, | |
| "loss": 1.952, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 1.6813951669664502, | |
| "grad_norm": 5.007114410400391, | |
| "learning_rate": 3.31860483303355e-05, | |
| "loss": 1.9229, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 1.6944292380282058, | |
| "grad_norm": 4.589148044586182, | |
| "learning_rate": 3.305570761971795e-05, | |
| "loss": 1.9296, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 1.7074633090899611, | |
| "grad_norm": 10.281172752380371, | |
| "learning_rate": 3.292536690910039e-05, | |
| "loss": 1.9153, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 1.7204973801517167, | |
| "grad_norm": 7.041563034057617, | |
| "learning_rate": 3.2795026198482837e-05, | |
| "loss": 1.9276, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 1.733531451213472, | |
| "grad_norm": 8.523409843444824, | |
| "learning_rate": 3.266468548786528e-05, | |
| "loss": 1.8871, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 1.7465655222752274, | |
| "grad_norm": 18.92120361328125, | |
| "learning_rate": 3.253434477724773e-05, | |
| "loss": 1.8963, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 1.7595995933369828, | |
| "grad_norm": 17.547399520874023, | |
| "learning_rate": 3.2404004066630174e-05, | |
| "loss": 1.9069, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 1.7726336643987382, | |
| "grad_norm": 9.223323822021484, | |
| "learning_rate": 3.227366335601262e-05, | |
| "loss": 1.9232, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 1.7856677354604937, | |
| "grad_norm": 17.263656616210938, | |
| "learning_rate": 3.214332264539506e-05, | |
| "loss": 1.89, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 1.7987018065222493, | |
| "grad_norm": 19.6173152923584, | |
| "learning_rate": 3.201298193477751e-05, | |
| "loss": 1.8764, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 1.8117358775840047, | |
| "grad_norm": 10.714072227478027, | |
| "learning_rate": 3.1882641224159955e-05, | |
| "loss": 1.9165, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 1.82476994864576, | |
| "grad_norm": 5.039360523223877, | |
| "learning_rate": 3.17523005135424e-05, | |
| "loss": 1.8422, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 1.8378040197075154, | |
| "grad_norm": 28.72756576538086, | |
| "learning_rate": 3.162195980292485e-05, | |
| "loss": 1.8819, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 1.8508380907692707, | |
| "grad_norm": 4.069336414337158, | |
| "learning_rate": 3.149161909230729e-05, | |
| "loss": 1.8769, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 1.8638721618310263, | |
| "grad_norm": 4.223635196685791, | |
| "learning_rate": 3.136127838168974e-05, | |
| "loss": 1.8799, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 1.8769062328927817, | |
| "grad_norm": 10.401415824890137, | |
| "learning_rate": 3.123093767107219e-05, | |
| "loss": 1.905, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 1.8899403039545373, | |
| "grad_norm": 5.064211368560791, | |
| "learning_rate": 3.110059696045463e-05, | |
| "loss": 1.827, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 1.9029743750162926, | |
| "grad_norm": 4.138282299041748, | |
| "learning_rate": 3.0970256249837074e-05, | |
| "loss": 1.8237, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 1.916008446078048, | |
| "grad_norm": 3.365440845489502, | |
| "learning_rate": 3.0839915539219525e-05, | |
| "loss": 1.8421, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 1.9290425171398033, | |
| "grad_norm": 7.819665431976318, | |
| "learning_rate": 3.070957482860197e-05, | |
| "loss": 1.8413, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 1.942076588201559, | |
| "grad_norm": 8.81440544128418, | |
| "learning_rate": 3.057923411798441e-05, | |
| "loss": 1.8633, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 1.9551106592633143, | |
| "grad_norm": 12.814815521240234, | |
| "learning_rate": 3.044889340736686e-05, | |
| "loss": 1.8255, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 1.9681447303250699, | |
| "grad_norm": 7.332582950592041, | |
| "learning_rate": 3.0318552696749302e-05, | |
| "loss": 1.8228, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 1.9811788013868252, | |
| "grad_norm": 6.4567694664001465, | |
| "learning_rate": 3.018821198613175e-05, | |
| "loss": 1.8514, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 1.9942128724485806, | |
| "grad_norm": 33.37932205200195, | |
| "learning_rate": 3.0057871275514193e-05, | |
| "loss": 1.8347, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 2.007246943510336, | |
| "grad_norm": 3.908621072769165, | |
| "learning_rate": 2.992753056489664e-05, | |
| "loss": 1.8015, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 2.0202810145720913, | |
| "grad_norm": 3.9100475311279297, | |
| "learning_rate": 2.979718985427909e-05, | |
| "loss": 1.8148, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 2.0333150856338467, | |
| "grad_norm": 4.988982200622559, | |
| "learning_rate": 2.9666849143661534e-05, | |
| "loss": 1.7508, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 2.0463491566956025, | |
| "grad_norm": 5.134647846221924, | |
| "learning_rate": 2.953650843304398e-05, | |
| "loss": 1.7613, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 2.059383227757358, | |
| "grad_norm": 6.9095845222473145, | |
| "learning_rate": 2.9406167722426425e-05, | |
| "loss": 1.8106, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 2.072417298819113, | |
| "grad_norm": 14.57297420501709, | |
| "learning_rate": 2.927582701180887e-05, | |
| "loss": 1.7387, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 2.0854513698808685, | |
| "grad_norm": 46.801937103271484, | |
| "learning_rate": 2.9145486301191315e-05, | |
| "loss": 1.7732, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 2.098485440942624, | |
| "grad_norm": 10.51559829711914, | |
| "learning_rate": 2.9015145590573762e-05, | |
| "loss": 1.779, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 2.1115195120043793, | |
| "grad_norm": 3.4089362621307373, | |
| "learning_rate": 2.8884804879956206e-05, | |
| "loss": 1.7613, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 2.124553583066135, | |
| "grad_norm": 6.211880207061768, | |
| "learning_rate": 2.8754464169338653e-05, | |
| "loss": 1.7656, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 2.1375876541278904, | |
| "grad_norm": 4.486207962036133, | |
| "learning_rate": 2.8624123458721096e-05, | |
| "loss": 1.7653, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 2.150621725189646, | |
| "grad_norm": 4.438023090362549, | |
| "learning_rate": 2.8493782748103543e-05, | |
| "loss": 1.758, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 2.163655796251401, | |
| "grad_norm": 5.200678825378418, | |
| "learning_rate": 2.8363442037485987e-05, | |
| "loss": 1.7487, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 2.1766898673131565, | |
| "grad_norm": 11.503108024597168, | |
| "learning_rate": 2.8233101326868434e-05, | |
| "loss": 1.7539, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 2.189723938374912, | |
| "grad_norm": 3.5593841075897217, | |
| "learning_rate": 2.8102760616250884e-05, | |
| "loss": 1.7604, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 2.2027580094366677, | |
| "grad_norm": 4.380959510803223, | |
| "learning_rate": 2.7972419905633328e-05, | |
| "loss": 1.7688, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 2.215792080498423, | |
| "grad_norm": 8.921208381652832, | |
| "learning_rate": 2.7842079195015775e-05, | |
| "loss": 1.7414, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 2.2288261515601784, | |
| "grad_norm": 4.622405529022217, | |
| "learning_rate": 2.771173848439822e-05, | |
| "loss": 1.7623, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 2.2418602226219337, | |
| "grad_norm": 27.651330947875977, | |
| "learning_rate": 2.7581397773780666e-05, | |
| "loss": 1.7172, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 2.254894293683689, | |
| "grad_norm": 4.457437992095947, | |
| "learning_rate": 2.745105706316311e-05, | |
| "loss": 1.7444, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 2.2679283647454445, | |
| "grad_norm": 5.793179988861084, | |
| "learning_rate": 2.7320716352545556e-05, | |
| "loss": 1.7386, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 2.2809624358072, | |
| "grad_norm": 3.3070342540740967, | |
| "learning_rate": 2.7190375641928e-05, | |
| "loss": 1.7066, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 2.2939965068689556, | |
| "grad_norm": 4.475468158721924, | |
| "learning_rate": 2.7060034931310447e-05, | |
| "loss": 1.7212, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 2.307030577930711, | |
| "grad_norm": 4.4862847328186035, | |
| "learning_rate": 2.692969422069289e-05, | |
| "loss": 1.7265, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 2.3200646489924663, | |
| "grad_norm": 3.608401298522949, | |
| "learning_rate": 2.6799353510075337e-05, | |
| "loss": 1.7324, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 2.3330987200542217, | |
| "grad_norm": 4.134375095367432, | |
| "learning_rate": 2.666901279945778e-05, | |
| "loss": 1.6866, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 2.346132791115977, | |
| "grad_norm": 4.030068874359131, | |
| "learning_rate": 2.6538672088840228e-05, | |
| "loss": 1.6955, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 2.3591668621777324, | |
| "grad_norm": 7.18529748916626, | |
| "learning_rate": 2.640833137822267e-05, | |
| "loss": 1.7119, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 2.3722009332394878, | |
| "grad_norm": 3.633330821990967, | |
| "learning_rate": 2.6277990667605122e-05, | |
| "loss": 1.737, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 2.3852350043012436, | |
| "grad_norm": 5.056845188140869, | |
| "learning_rate": 2.614764995698757e-05, | |
| "loss": 1.7121, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 2.398269075362999, | |
| "grad_norm": 3.203246831893921, | |
| "learning_rate": 2.6017309246370013e-05, | |
| "loss": 1.7096, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 2.4113031464247543, | |
| "grad_norm": 3.830634355545044, | |
| "learning_rate": 2.588696853575246e-05, | |
| "loss": 1.7047, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 2.4243372174865097, | |
| "grad_norm": 3.5095880031585693, | |
| "learning_rate": 2.5756627825134903e-05, | |
| "loss": 1.6875, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 2.437371288548265, | |
| "grad_norm": 13.952683448791504, | |
| "learning_rate": 2.562628711451735e-05, | |
| "loss": 1.727, | |
| "step": 93500 | |
| }, | |
| { | |
| "epoch": 2.450405359610021, | |
| "grad_norm": 4.152392387390137, | |
| "learning_rate": 2.5495946403899794e-05, | |
| "loss": 1.674, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 2.463439430671776, | |
| "grad_norm": 28.32253074645996, | |
| "learning_rate": 2.536560569328224e-05, | |
| "loss": 1.6635, | |
| "step": 94500 | |
| }, | |
| { | |
| "epoch": 2.4764735017335315, | |
| "grad_norm": 37.356117248535156, | |
| "learning_rate": 2.5235264982664684e-05, | |
| "loss": 1.6936, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 2.489507572795287, | |
| "grad_norm": 11.425202369689941, | |
| "learning_rate": 2.510492427204713e-05, | |
| "loss": 1.6635, | |
| "step": 95500 | |
| }, | |
| { | |
| "epoch": 2.5025416438570423, | |
| "grad_norm": 3.700289726257324, | |
| "learning_rate": 2.497458356142958e-05, | |
| "loss": 1.7051, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 2.5155757149187976, | |
| "grad_norm": 16.234506607055664, | |
| "learning_rate": 2.4844242850812025e-05, | |
| "loss": 1.676, | |
| "step": 96500 | |
| }, | |
| { | |
| "epoch": 2.528609785980553, | |
| "grad_norm": 3.4809882640838623, | |
| "learning_rate": 2.471390214019447e-05, | |
| "loss": 1.6795, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 2.5416438570423088, | |
| "grad_norm": 4.420949459075928, | |
| "learning_rate": 2.4583561429576916e-05, | |
| "loss": 1.6926, | |
| "step": 97500 | |
| }, | |
| { | |
| "epoch": 2.554677928104064, | |
| "grad_norm": 24.02429962158203, | |
| "learning_rate": 2.445322071895936e-05, | |
| "loss": 1.6479, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 2.5677119991658195, | |
| "grad_norm": 4.912638187408447, | |
| "learning_rate": 2.4322880008341807e-05, | |
| "loss": 1.6598, | |
| "step": 98500 | |
| }, | |
| { | |
| "epoch": 2.580746070227575, | |
| "grad_norm": 22.43536376953125, | |
| "learning_rate": 2.419253929772425e-05, | |
| "loss": 1.6532, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 2.59378014128933, | |
| "grad_norm": 4.317445755004883, | |
| "learning_rate": 2.40621985871067e-05, | |
| "loss": 1.6554, | |
| "step": 99500 | |
| }, | |
| { | |
| "epoch": 2.606814212351086, | |
| "grad_norm": 14.290596008300781, | |
| "learning_rate": 2.3931857876489144e-05, | |
| "loss": 1.6265, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 2.619848283412841, | |
| "grad_norm": 4.331130504608154, | |
| "learning_rate": 2.380151716587159e-05, | |
| "loss": 1.6706, | |
| "step": 100500 | |
| }, | |
| { | |
| "epoch": 2.6328823544745967, | |
| "grad_norm": 7.016634941101074, | |
| "learning_rate": 2.3671176455254035e-05, | |
| "loss": 1.649, | |
| "step": 101000 | |
| }, | |
| { | |
| "epoch": 2.645916425536352, | |
| "grad_norm": 5.680657386779785, | |
| "learning_rate": 2.3540835744636482e-05, | |
| "loss": 1.6126, | |
| "step": 101500 | |
| }, | |
| { | |
| "epoch": 2.6589504965981074, | |
| "grad_norm": 4.337413311004639, | |
| "learning_rate": 2.3410495034018925e-05, | |
| "loss": 1.6317, | |
| "step": 102000 | |
| }, | |
| { | |
| "epoch": 2.671984567659863, | |
| "grad_norm": 20.466943740844727, | |
| "learning_rate": 2.3280154323401372e-05, | |
| "loss": 1.6348, | |
| "step": 102500 | |
| }, | |
| { | |
| "epoch": 2.685018638721618, | |
| "grad_norm": 4.808228969573975, | |
| "learning_rate": 2.314981361278382e-05, | |
| "loss": 1.5979, | |
| "step": 103000 | |
| }, | |
| { | |
| "epoch": 2.698052709783374, | |
| "grad_norm": 4.296200752258301, | |
| "learning_rate": 2.3019472902166263e-05, | |
| "loss": 1.6281, | |
| "step": 103500 | |
| }, | |
| { | |
| "epoch": 2.7110867808451293, | |
| "grad_norm": 32.726078033447266, | |
| "learning_rate": 2.288913219154871e-05, | |
| "loss": 1.5966, | |
| "step": 104000 | |
| }, | |
| { | |
| "epoch": 2.7241208519068847, | |
| "grad_norm": 4.275684833526611, | |
| "learning_rate": 2.2758791480931154e-05, | |
| "loss": 1.6108, | |
| "step": 104500 | |
| }, | |
| { | |
| "epoch": 2.73715492296864, | |
| "grad_norm": 3.496002197265625, | |
| "learning_rate": 2.26284507703136e-05, | |
| "loss": 1.6026, | |
| "step": 105000 | |
| }, | |
| { | |
| "epoch": 2.7501889940303954, | |
| "grad_norm": 9.172469139099121, | |
| "learning_rate": 2.2498110059696044e-05, | |
| "loss": 1.631, | |
| "step": 105500 | |
| }, | |
| { | |
| "epoch": 2.7632230650921508, | |
| "grad_norm": 16.79161834716797, | |
| "learning_rate": 2.2367769349078495e-05, | |
| "loss": 1.6357, | |
| "step": 106000 | |
| }, | |
| { | |
| "epoch": 2.776257136153906, | |
| "grad_norm": 14.198761940002441, | |
| "learning_rate": 2.2237428638460938e-05, | |
| "loss": 1.6423, | |
| "step": 106500 | |
| }, | |
| { | |
| "epoch": 2.789291207215662, | |
| "grad_norm": 5.301556587219238, | |
| "learning_rate": 2.2107087927843385e-05, | |
| "loss": 1.6125, | |
| "step": 107000 | |
| }, | |
| { | |
| "epoch": 2.8023252782774173, | |
| "grad_norm": 26.385272979736328, | |
| "learning_rate": 2.197674721722583e-05, | |
| "loss": 1.6334, | |
| "step": 107500 | |
| }, | |
| { | |
| "epoch": 2.8153593493391726, | |
| "grad_norm": 9.757530212402344, | |
| "learning_rate": 2.1846406506608276e-05, | |
| "loss": 1.586, | |
| "step": 108000 | |
| }, | |
| { | |
| "epoch": 2.828393420400928, | |
| "grad_norm": 20.982559204101562, | |
| "learning_rate": 2.171606579599072e-05, | |
| "loss": 1.6066, | |
| "step": 108500 | |
| }, | |
| { | |
| "epoch": 2.8414274914626834, | |
| "grad_norm": 3.695369243621826, | |
| "learning_rate": 2.1585725085373166e-05, | |
| "loss": 1.6307, | |
| "step": 109000 | |
| }, | |
| { | |
| "epoch": 2.8544615625244387, | |
| "grad_norm": 14.864655494689941, | |
| "learning_rate": 2.1455384374755613e-05, | |
| "loss": 1.5847, | |
| "step": 109500 | |
| }, | |
| { | |
| "epoch": 2.867495633586194, | |
| "grad_norm": 3.9043121337890625, | |
| "learning_rate": 2.1325043664138057e-05, | |
| "loss": 1.5904, | |
| "step": 110000 | |
| }, | |
| { | |
| "epoch": 2.88052970464795, | |
| "grad_norm": 4.432578086853027, | |
| "learning_rate": 2.1194702953520504e-05, | |
| "loss": 1.6037, | |
| "step": 110500 | |
| }, | |
| { | |
| "epoch": 2.8935637757097052, | |
| "grad_norm": 6.775419235229492, | |
| "learning_rate": 2.1064362242902948e-05, | |
| "loss": 1.6052, | |
| "step": 111000 | |
| }, | |
| { | |
| "epoch": 2.9065978467714606, | |
| "grad_norm": 5.090266227722168, | |
| "learning_rate": 2.0934021532285395e-05, | |
| "loss": 1.5814, | |
| "step": 111500 | |
| }, | |
| { | |
| "epoch": 2.919631917833216, | |
| "grad_norm": 7.805962085723877, | |
| "learning_rate": 2.0803680821667838e-05, | |
| "loss": 1.6016, | |
| "step": 112000 | |
| }, | |
| { | |
| "epoch": 2.9326659888949713, | |
| "grad_norm": 6.22263240814209, | |
| "learning_rate": 2.067334011105029e-05, | |
| "loss": 1.564, | |
| "step": 112500 | |
| }, | |
| { | |
| "epoch": 2.945700059956727, | |
| "grad_norm": 23.055776596069336, | |
| "learning_rate": 2.0542999400432732e-05, | |
| "loss": 1.555, | |
| "step": 113000 | |
| }, | |
| { | |
| "epoch": 2.958734131018482, | |
| "grad_norm": 20.39297866821289, | |
| "learning_rate": 2.041265868981518e-05, | |
| "loss": 1.5306, | |
| "step": 113500 | |
| }, | |
| { | |
| "epoch": 2.971768202080238, | |
| "grad_norm": 5.571432113647461, | |
| "learning_rate": 2.0282317979197623e-05, | |
| "loss": 1.577, | |
| "step": 114000 | |
| }, | |
| { | |
| "epoch": 2.984802273141993, | |
| "grad_norm": 15.77784252166748, | |
| "learning_rate": 2.015197726858007e-05, | |
| "loss": 1.6165, | |
| "step": 114500 | |
| }, | |
| { | |
| "epoch": 2.9978363442037486, | |
| "grad_norm": 4.388451099395752, | |
| "learning_rate": 2.0021636557962513e-05, | |
| "loss": 1.544, | |
| "step": 115000 | |
| }, | |
| { | |
| "epoch": 3.010870415265504, | |
| "grad_norm": 2.794743776321411, | |
| "learning_rate": 1.989129584734496e-05, | |
| "loss": 1.561, | |
| "step": 115500 | |
| }, | |
| { | |
| "epoch": 3.0239044863272593, | |
| "grad_norm": 38.998512268066406, | |
| "learning_rate": 1.9760955136727407e-05, | |
| "loss": 1.5344, | |
| "step": 116000 | |
| }, | |
| { | |
| "epoch": 3.036938557389015, | |
| "grad_norm": 10.872420310974121, | |
| "learning_rate": 1.9630614426109854e-05, | |
| "loss": 1.5191, | |
| "step": 116500 | |
| }, | |
| { | |
| "epoch": 3.0499726284507704, | |
| "grad_norm": 4.433558464050293, | |
| "learning_rate": 1.9500273715492298e-05, | |
| "loss": 1.5093, | |
| "step": 117000 | |
| }, | |
| { | |
| "epoch": 3.063006699512526, | |
| "grad_norm": 3.8315622806549072, | |
| "learning_rate": 1.9369933004874745e-05, | |
| "loss": 1.5344, | |
| "step": 117500 | |
| }, | |
| { | |
| "epoch": 3.076040770574281, | |
| "grad_norm": 24.29652976989746, | |
| "learning_rate": 1.923959229425719e-05, | |
| "loss": 1.5557, | |
| "step": 118000 | |
| }, | |
| { | |
| "epoch": 3.0890748416360365, | |
| "grad_norm": 4.876192092895508, | |
| "learning_rate": 1.9109251583639636e-05, | |
| "loss": 1.5381, | |
| "step": 118500 | |
| }, | |
| { | |
| "epoch": 3.102108912697792, | |
| "grad_norm": 4.730300426483154, | |
| "learning_rate": 1.897891087302208e-05, | |
| "loss": 1.4977, | |
| "step": 119000 | |
| }, | |
| { | |
| "epoch": 3.1151429837595472, | |
| "grad_norm": 15.773541450500488, | |
| "learning_rate": 1.8848570162404526e-05, | |
| "loss": 1.5262, | |
| "step": 119500 | |
| }, | |
| { | |
| "epoch": 3.128177054821303, | |
| "grad_norm": 3.4133520126342773, | |
| "learning_rate": 1.8718229451786973e-05, | |
| "loss": 1.5142, | |
| "step": 120000 | |
| }, | |
| { | |
| "epoch": 3.1412111258830584, | |
| "grad_norm": 4.271722316741943, | |
| "learning_rate": 1.8587888741169417e-05, | |
| "loss": 1.5108, | |
| "step": 120500 | |
| }, | |
| { | |
| "epoch": 3.1542451969448138, | |
| "grad_norm": 4.478157997131348, | |
| "learning_rate": 1.8457548030551864e-05, | |
| "loss": 1.5111, | |
| "step": 121000 | |
| }, | |
| { | |
| "epoch": 3.167279268006569, | |
| "grad_norm": 6.74271821975708, | |
| "learning_rate": 1.8327207319934307e-05, | |
| "loss": 1.5359, | |
| "step": 121500 | |
| }, | |
| { | |
| "epoch": 3.1803133390683245, | |
| "grad_norm": 10.100676536560059, | |
| "learning_rate": 1.8196866609316754e-05, | |
| "loss": 1.4856, | |
| "step": 122000 | |
| }, | |
| { | |
| "epoch": 3.19334741013008, | |
| "grad_norm": 5.077882289886475, | |
| "learning_rate": 1.8066525898699198e-05, | |
| "loss": 1.5054, | |
| "step": 122500 | |
| }, | |
| { | |
| "epoch": 3.2063814811918356, | |
| "grad_norm": 4.155623912811279, | |
| "learning_rate": 1.793618518808165e-05, | |
| "loss": 1.5089, | |
| "step": 123000 | |
| }, | |
| { | |
| "epoch": 3.219415552253591, | |
| "grad_norm": 3.6238481998443604, | |
| "learning_rate": 1.7805844477464092e-05, | |
| "loss": 1.4933, | |
| "step": 123500 | |
| }, | |
| { | |
| "epoch": 3.2324496233153464, | |
| "grad_norm": 4.119343280792236, | |
| "learning_rate": 1.767550376684654e-05, | |
| "loss": 1.5215, | |
| "step": 124000 | |
| }, | |
| { | |
| "epoch": 3.2454836943771017, | |
| "grad_norm": 3.789219379425049, | |
| "learning_rate": 1.7545163056228983e-05, | |
| "loss": 1.4686, | |
| "step": 124500 | |
| }, | |
| { | |
| "epoch": 3.258517765438857, | |
| "grad_norm": 23.477462768554688, | |
| "learning_rate": 1.741482234561143e-05, | |
| "loss": 1.4928, | |
| "step": 125000 | |
| }, | |
| { | |
| "epoch": 3.2715518365006124, | |
| "grad_norm": 34.81294250488281, | |
| "learning_rate": 1.7284481634993873e-05, | |
| "loss": 1.5147, | |
| "step": 125500 | |
| }, | |
| { | |
| "epoch": 3.2845859075623682, | |
| "grad_norm": 3.911698579788208, | |
| "learning_rate": 1.715414092437632e-05, | |
| "loss": 1.498, | |
| "step": 126000 | |
| }, | |
| { | |
| "epoch": 3.2976199786241236, | |
| "grad_norm": 17.540603637695312, | |
| "learning_rate": 1.7023800213758767e-05, | |
| "loss": 1.5224, | |
| "step": 126500 | |
| }, | |
| { | |
| "epoch": 3.310654049685879, | |
| "grad_norm": 5.028404712677002, | |
| "learning_rate": 1.689345950314121e-05, | |
| "loss": 1.4782, | |
| "step": 127000 | |
| }, | |
| { | |
| "epoch": 3.3236881207476343, | |
| "grad_norm": 11.53537654876709, | |
| "learning_rate": 1.6763118792523658e-05, | |
| "loss": 1.4837, | |
| "step": 127500 | |
| }, | |
| { | |
| "epoch": 3.3367221918093897, | |
| "grad_norm": 3.8512253761291504, | |
| "learning_rate": 1.66327780819061e-05, | |
| "loss": 1.4528, | |
| "step": 128000 | |
| }, | |
| { | |
| "epoch": 3.349756262871145, | |
| "grad_norm": 3.932035207748413, | |
| "learning_rate": 1.650243737128855e-05, | |
| "loss": 1.5026, | |
| "step": 128500 | |
| }, | |
| { | |
| "epoch": 3.3627903339329004, | |
| "grad_norm": 4.325034141540527, | |
| "learning_rate": 1.6372096660670992e-05, | |
| "loss": 1.4717, | |
| "step": 129000 | |
| }, | |
| { | |
| "epoch": 3.375824404994656, | |
| "grad_norm": 7.62436580657959, | |
| "learning_rate": 1.6241755950053442e-05, | |
| "loss": 1.4677, | |
| "step": 129500 | |
| }, | |
| { | |
| "epoch": 3.3888584760564116, | |
| "grad_norm": 4.481779098510742, | |
| "learning_rate": 1.6111415239435886e-05, | |
| "loss": 1.487, | |
| "step": 130000 | |
| }, | |
| { | |
| "epoch": 3.401892547118167, | |
| "grad_norm": 4.1522536277771, | |
| "learning_rate": 1.5981074528818333e-05, | |
| "loss": 1.4724, | |
| "step": 130500 | |
| }, | |
| { | |
| "epoch": 3.4149266181799223, | |
| "grad_norm": 22.38875961303711, | |
| "learning_rate": 1.5850733818200777e-05, | |
| "loss": 1.4694, | |
| "step": 131000 | |
| }, | |
| { | |
| "epoch": 3.4279606892416776, | |
| "grad_norm": 5.144596099853516, | |
| "learning_rate": 1.5720393107583224e-05, | |
| "loss": 1.4792, | |
| "step": 131500 | |
| }, | |
| { | |
| "epoch": 3.440994760303433, | |
| "grad_norm": 4.0159912109375, | |
| "learning_rate": 1.5590052396965667e-05, | |
| "loss": 1.4535, | |
| "step": 132000 | |
| }, | |
| { | |
| "epoch": 3.454028831365189, | |
| "grad_norm": 4.164160251617432, | |
| "learning_rate": 1.5459711686348114e-05, | |
| "loss": 1.4516, | |
| "step": 132500 | |
| }, | |
| { | |
| "epoch": 3.467062902426944, | |
| "grad_norm": 4.1465349197387695, | |
| "learning_rate": 1.532937097573056e-05, | |
| "loss": 1.4383, | |
| "step": 133000 | |
| }, | |
| { | |
| "epoch": 3.4800969734886995, | |
| "grad_norm": 5.3553466796875, | |
| "learning_rate": 1.5199030265113007e-05, | |
| "loss": 1.4588, | |
| "step": 133500 | |
| }, | |
| { | |
| "epoch": 3.493131044550455, | |
| "grad_norm": 4.2381110191345215, | |
| "learning_rate": 1.5068689554495452e-05, | |
| "loss": 1.4607, | |
| "step": 134000 | |
| }, | |
| { | |
| "epoch": 3.5061651156122102, | |
| "grad_norm": 4.227059364318848, | |
| "learning_rate": 1.4938348843877897e-05, | |
| "loss": 1.4855, | |
| "step": 134500 | |
| }, | |
| { | |
| "epoch": 3.5191991866739656, | |
| "grad_norm": 4.23318338394165, | |
| "learning_rate": 1.4808008133260342e-05, | |
| "loss": 1.4452, | |
| "step": 135000 | |
| }, | |
| { | |
| "epoch": 3.5322332577357214, | |
| "grad_norm": 4.2789788246154785, | |
| "learning_rate": 1.4677667422642788e-05, | |
| "loss": 1.4471, | |
| "step": 135500 | |
| }, | |
| { | |
| "epoch": 3.5452673287974767, | |
| "grad_norm": 14.372062683105469, | |
| "learning_rate": 1.4547326712025236e-05, | |
| "loss": 1.4663, | |
| "step": 136000 | |
| }, | |
| { | |
| "epoch": 3.558301399859232, | |
| "grad_norm": 4.719635963439941, | |
| "learning_rate": 1.4416986001407682e-05, | |
| "loss": 1.4628, | |
| "step": 136500 | |
| }, | |
| { | |
| "epoch": 3.5713354709209875, | |
| "grad_norm": 4.603359222412109, | |
| "learning_rate": 1.4286645290790127e-05, | |
| "loss": 1.4464, | |
| "step": 137000 | |
| }, | |
| { | |
| "epoch": 3.584369541982743, | |
| "grad_norm": 4.167656421661377, | |
| "learning_rate": 1.4156304580172572e-05, | |
| "loss": 1.4816, | |
| "step": 137500 | |
| }, | |
| { | |
| "epoch": 3.597403613044498, | |
| "grad_norm": 3.9802513122558594, | |
| "learning_rate": 1.4025963869555018e-05, | |
| "loss": 1.4404, | |
| "step": 138000 | |
| }, | |
| { | |
| "epoch": 3.6104376841062535, | |
| "grad_norm": 4.956002235412598, | |
| "learning_rate": 1.3895623158937463e-05, | |
| "loss": 1.4463, | |
| "step": 138500 | |
| }, | |
| { | |
| "epoch": 3.6234717551680093, | |
| "grad_norm": 4.82868766784668, | |
| "learning_rate": 1.3765282448319908e-05, | |
| "loss": 1.429, | |
| "step": 139000 | |
| }, | |
| { | |
| "epoch": 3.6365058262297647, | |
| "grad_norm": 9.303766250610352, | |
| "learning_rate": 1.3634941737702355e-05, | |
| "loss": 1.4492, | |
| "step": 139500 | |
| }, | |
| { | |
| "epoch": 3.64953989729152, | |
| "grad_norm": 4.728789806365967, | |
| "learning_rate": 1.35046010270848e-05, | |
| "loss": 1.4599, | |
| "step": 140000 | |
| }, | |
| { | |
| "epoch": 3.6625739683532754, | |
| "grad_norm": 4.169735431671143, | |
| "learning_rate": 1.3374260316467246e-05, | |
| "loss": 1.4346, | |
| "step": 140500 | |
| }, | |
| { | |
| "epoch": 3.675608039415031, | |
| "grad_norm": 4.134032249450684, | |
| "learning_rate": 1.3243919605849691e-05, | |
| "loss": 1.426, | |
| "step": 141000 | |
| }, | |
| { | |
| "epoch": 3.6886421104767866, | |
| "grad_norm": 7.31259822845459, | |
| "learning_rate": 1.3113578895232136e-05, | |
| "loss": 1.4489, | |
| "step": 141500 | |
| }, | |
| { | |
| "epoch": 3.7016761815385415, | |
| "grad_norm": 41.01179885864258, | |
| "learning_rate": 1.2983238184614582e-05, | |
| "loss": 1.4594, | |
| "step": 142000 | |
| }, | |
| { | |
| "epoch": 3.7147102526002973, | |
| "grad_norm": 4.123907566070557, | |
| "learning_rate": 1.2852897473997027e-05, | |
| "loss": 1.4445, | |
| "step": 142500 | |
| }, | |
| { | |
| "epoch": 3.7277443236620527, | |
| "grad_norm": 12.47805404663086, | |
| "learning_rate": 1.2722556763379476e-05, | |
| "loss": 1.416, | |
| "step": 143000 | |
| }, | |
| { | |
| "epoch": 3.740778394723808, | |
| "grad_norm": 4.795707702636719, | |
| "learning_rate": 1.2592216052761921e-05, | |
| "loss": 1.449, | |
| "step": 143500 | |
| }, | |
| { | |
| "epoch": 3.7538124657855634, | |
| "grad_norm": 3.754809856414795, | |
| "learning_rate": 1.2461875342144366e-05, | |
| "loss": 1.4353, | |
| "step": 144000 | |
| }, | |
| { | |
| "epoch": 3.7668465368473187, | |
| "grad_norm": 4.847051620483398, | |
| "learning_rate": 1.2331534631526812e-05, | |
| "loss": 1.4081, | |
| "step": 144500 | |
| }, | |
| { | |
| "epoch": 3.7798806079090745, | |
| "grad_norm": 5.240978240966797, | |
| "learning_rate": 1.2201193920909257e-05, | |
| "loss": 1.4497, | |
| "step": 145000 | |
| }, | |
| { | |
| "epoch": 3.79291467897083, | |
| "grad_norm": 4.278606414794922, | |
| "learning_rate": 1.2070853210291704e-05, | |
| "loss": 1.4296, | |
| "step": 145500 | |
| }, | |
| { | |
| "epoch": 3.8059487500325853, | |
| "grad_norm": 24.963735580444336, | |
| "learning_rate": 1.194051249967415e-05, | |
| "loss": 1.4273, | |
| "step": 146000 | |
| }, | |
| { | |
| "epoch": 3.8189828210943406, | |
| "grad_norm": 3.3722941875457764, | |
| "learning_rate": 1.1810171789056595e-05, | |
| "loss": 1.3939, | |
| "step": 146500 | |
| }, | |
| { | |
| "epoch": 3.832016892156096, | |
| "grad_norm": 3.9926798343658447, | |
| "learning_rate": 1.1679831078439042e-05, | |
| "loss": 1.4149, | |
| "step": 147000 | |
| }, | |
| { | |
| "epoch": 3.8450509632178513, | |
| "grad_norm": 7.269467353820801, | |
| "learning_rate": 1.1549490367821487e-05, | |
| "loss": 1.4004, | |
| "step": 147500 | |
| }, | |
| { | |
| "epoch": 3.8580850342796067, | |
| "grad_norm": 5.596455097198486, | |
| "learning_rate": 1.1419149657203932e-05, | |
| "loss": 1.4133, | |
| "step": 148000 | |
| }, | |
| { | |
| "epoch": 3.8711191053413625, | |
| "grad_norm": 5.81203556060791, | |
| "learning_rate": 1.1288808946586377e-05, | |
| "loss": 1.4313, | |
| "step": 148500 | |
| }, | |
| { | |
| "epoch": 3.884153176403118, | |
| "grad_norm": 4.842901229858398, | |
| "learning_rate": 1.1158468235968823e-05, | |
| "loss": 1.4139, | |
| "step": 149000 | |
| }, | |
| { | |
| "epoch": 3.897187247464873, | |
| "grad_norm": 3.6464438438415527, | |
| "learning_rate": 1.1028127525351268e-05, | |
| "loss": 1.4189, | |
| "step": 149500 | |
| }, | |
| { | |
| "epoch": 3.9102213185266286, | |
| "grad_norm": 5.625620365142822, | |
| "learning_rate": 1.0897786814733713e-05, | |
| "loss": 1.4119, | |
| "step": 150000 | |
| }, | |
| { | |
| "epoch": 3.923255389588384, | |
| "grad_norm": 3.84614896774292, | |
| "learning_rate": 1.076744610411616e-05, | |
| "loss": 1.4094, | |
| "step": 150500 | |
| }, | |
| { | |
| "epoch": 3.9362894606501397, | |
| "grad_norm": 5.183802127838135, | |
| "learning_rate": 1.0637105393498606e-05, | |
| "loss": 1.4157, | |
| "step": 151000 | |
| }, | |
| { | |
| "epoch": 3.9493235317118947, | |
| "grad_norm": 4.6199140548706055, | |
| "learning_rate": 1.0506764682881051e-05, | |
| "loss": 1.4067, | |
| "step": 151500 | |
| }, | |
| { | |
| "epoch": 3.9623576027736505, | |
| "grad_norm": 5.642277717590332, | |
| "learning_rate": 1.0376423972263498e-05, | |
| "loss": 1.3994, | |
| "step": 152000 | |
| }, | |
| { | |
| "epoch": 3.975391673835406, | |
| "grad_norm": 4.15669584274292, | |
| "learning_rate": 1.0246083261645943e-05, | |
| "loss": 1.4304, | |
| "step": 152500 | |
| }, | |
| { | |
| "epoch": 3.988425744897161, | |
| "grad_norm": 4.729000568389893, | |
| "learning_rate": 1.0115742551028389e-05, | |
| "loss": 1.3979, | |
| "step": 153000 | |
| }, | |
| { | |
| "epoch": 4.001459815958917, | |
| "grad_norm": 3.2223262786865234, | |
| "learning_rate": 9.985401840410834e-06, | |
| "loss": 1.3897, | |
| "step": 153500 | |
| }, | |
| { | |
| "epoch": 4.014493887020672, | |
| "grad_norm": 4.223217964172363, | |
| "learning_rate": 9.855061129793281e-06, | |
| "loss": 1.3567, | |
| "step": 154000 | |
| }, | |
| { | |
| "epoch": 4.027527958082428, | |
| "grad_norm": 3.201354742050171, | |
| "learning_rate": 9.724720419175726e-06, | |
| "loss": 1.3796, | |
| "step": 154500 | |
| }, | |
| { | |
| "epoch": 4.040562029144183, | |
| "grad_norm": 31.99419593811035, | |
| "learning_rate": 9.594379708558171e-06, | |
| "loss": 1.3475, | |
| "step": 155000 | |
| }, | |
| { | |
| "epoch": 4.053596100205938, | |
| "grad_norm": 19.76371192932129, | |
| "learning_rate": 9.464038997940618e-06, | |
| "loss": 1.3278, | |
| "step": 155500 | |
| }, | |
| { | |
| "epoch": 4.066630171267693, | |
| "grad_norm": 3.462979316711426, | |
| "learning_rate": 9.333698287323064e-06, | |
| "loss": 1.3632, | |
| "step": 156000 | |
| }, | |
| { | |
| "epoch": 4.079664242329449, | |
| "grad_norm": 27.641897201538086, | |
| "learning_rate": 9.203357576705509e-06, | |
| "loss": 1.3203, | |
| "step": 156500 | |
| }, | |
| { | |
| "epoch": 4.092698313391205, | |
| "grad_norm": 3.934295654296875, | |
| "learning_rate": 9.073016866087954e-06, | |
| "loss": 1.3793, | |
| "step": 157000 | |
| }, | |
| { | |
| "epoch": 4.10573238445296, | |
| "grad_norm": 3.3237240314483643, | |
| "learning_rate": 8.9426761554704e-06, | |
| "loss": 1.3375, | |
| "step": 157500 | |
| }, | |
| { | |
| "epoch": 4.118766455514716, | |
| "grad_norm": 5.202388286590576, | |
| "learning_rate": 8.812335444852845e-06, | |
| "loss": 1.3852, | |
| "step": 158000 | |
| }, | |
| { | |
| "epoch": 4.131800526576471, | |
| "grad_norm": 28.595399856567383, | |
| "learning_rate": 8.68199473423529e-06, | |
| "loss": 1.3644, | |
| "step": 158500 | |
| }, | |
| { | |
| "epoch": 4.144834597638226, | |
| "grad_norm": 3.2022364139556885, | |
| "learning_rate": 8.551654023617737e-06, | |
| "loss": 1.3734, | |
| "step": 159000 | |
| }, | |
| { | |
| "epoch": 4.157868668699982, | |
| "grad_norm": 4.231220245361328, | |
| "learning_rate": 8.421313313000183e-06, | |
| "loss": 1.349, | |
| "step": 159500 | |
| }, | |
| { | |
| "epoch": 4.170902739761737, | |
| "grad_norm": 4.515881538391113, | |
| "learning_rate": 8.290972602382628e-06, | |
| "loss": 1.3392, | |
| "step": 160000 | |
| }, | |
| { | |
| "epoch": 4.183936810823493, | |
| "grad_norm": 3.6497957706451416, | |
| "learning_rate": 8.160631891765075e-06, | |
| "loss": 1.3495, | |
| "step": 160500 | |
| }, | |
| { | |
| "epoch": 4.196970881885248, | |
| "grad_norm": 16.680282592773438, | |
| "learning_rate": 8.03029118114752e-06, | |
| "loss": 1.3566, | |
| "step": 161000 | |
| }, | |
| { | |
| "epoch": 4.210004952947004, | |
| "grad_norm": 18.566879272460938, | |
| "learning_rate": 7.899950470529966e-06, | |
| "loss": 1.3248, | |
| "step": 161500 | |
| }, | |
| { | |
| "epoch": 4.2230390240087585, | |
| "grad_norm": 3.9700820446014404, | |
| "learning_rate": 7.769609759912413e-06, | |
| "loss": 1.3767, | |
| "step": 162000 | |
| }, | |
| { | |
| "epoch": 4.236073095070514, | |
| "grad_norm": 42.5576286315918, | |
| "learning_rate": 7.639269049294858e-06, | |
| "loss": 1.3346, | |
| "step": 162500 | |
| }, | |
| { | |
| "epoch": 4.24910716613227, | |
| "grad_norm": 7.013011455535889, | |
| "learning_rate": 7.508928338677302e-06, | |
| "loss": 1.3752, | |
| "step": 163000 | |
| }, | |
| { | |
| "epoch": 4.262141237194025, | |
| "grad_norm": 12.351140975952148, | |
| "learning_rate": 7.3785876280597476e-06, | |
| "loss": 1.3213, | |
| "step": 163500 | |
| }, | |
| { | |
| "epoch": 4.275175308255781, | |
| "grad_norm": 48.051631927490234, | |
| "learning_rate": 7.2482469174421946e-06, | |
| "loss": 1.3453, | |
| "step": 164000 | |
| }, | |
| { | |
| "epoch": 4.288209379317536, | |
| "grad_norm": 3.8004846572875977, | |
| "learning_rate": 7.11790620682464e-06, | |
| "loss": 1.3231, | |
| "step": 164500 | |
| }, | |
| { | |
| "epoch": 4.301243450379292, | |
| "grad_norm": 3.8865389823913574, | |
| "learning_rate": 6.987565496207085e-06, | |
| "loss": 1.3353, | |
| "step": 165000 | |
| }, | |
| { | |
| "epoch": 4.3142775214410465, | |
| "grad_norm": 4.471733093261719, | |
| "learning_rate": 6.857224785589532e-06, | |
| "loss": 1.3411, | |
| "step": 165500 | |
| }, | |
| { | |
| "epoch": 4.327311592502802, | |
| "grad_norm": 4.856067657470703, | |
| "learning_rate": 6.7268840749719775e-06, | |
| "loss": 1.3254, | |
| "step": 166000 | |
| }, | |
| { | |
| "epoch": 4.340345663564558, | |
| "grad_norm": 4.089067459106445, | |
| "learning_rate": 6.596543364354423e-06, | |
| "loss": 1.3676, | |
| "step": 166500 | |
| }, | |
| { | |
| "epoch": 4.353379734626313, | |
| "grad_norm": 4.231725215911865, | |
| "learning_rate": 6.466202653736869e-06, | |
| "loss": 1.3331, | |
| "step": 167000 | |
| }, | |
| { | |
| "epoch": 4.366413805688069, | |
| "grad_norm": 4.140297889709473, | |
| "learning_rate": 6.335861943119314e-06, | |
| "loss": 1.3338, | |
| "step": 167500 | |
| }, | |
| { | |
| "epoch": 4.379447876749824, | |
| "grad_norm": 3.1667165756225586, | |
| "learning_rate": 6.2055212325017595e-06, | |
| "loss": 1.3658, | |
| "step": 168000 | |
| }, | |
| { | |
| "epoch": 4.3924819478115795, | |
| "grad_norm": 4.982083797454834, | |
| "learning_rate": 6.075180521884206e-06, | |
| "loss": 1.3098, | |
| "step": 168500 | |
| }, | |
| { | |
| "epoch": 4.405516018873335, | |
| "grad_norm": 19.951147079467773, | |
| "learning_rate": 5.944839811266651e-06, | |
| "loss": 1.315, | |
| "step": 169000 | |
| }, | |
| { | |
| "epoch": 4.41855008993509, | |
| "grad_norm": 5.146533489227295, | |
| "learning_rate": 5.814499100649097e-06, | |
| "loss": 1.3322, | |
| "step": 169500 | |
| }, | |
| { | |
| "epoch": 4.431584160996846, | |
| "grad_norm": 4.29327917098999, | |
| "learning_rate": 5.684158390031543e-06, | |
| "loss": 1.3165, | |
| "step": 170000 | |
| }, | |
| { | |
| "epoch": 4.444618232058601, | |
| "grad_norm": 4.86635160446167, | |
| "learning_rate": 5.5538176794139886e-06, | |
| "loss": 1.3266, | |
| "step": 170500 | |
| }, | |
| { | |
| "epoch": 4.457652303120357, | |
| "grad_norm": 5.066024303436279, | |
| "learning_rate": 5.423476968796435e-06, | |
| "loss": 1.3201, | |
| "step": 171000 | |
| }, | |
| { | |
| "epoch": 4.470686374182112, | |
| "grad_norm": 5.111464500427246, | |
| "learning_rate": 5.293136258178879e-06, | |
| "loss": 1.3188, | |
| "step": 171500 | |
| }, | |
| { | |
| "epoch": 4.4837204452438675, | |
| "grad_norm": 4.428502082824707, | |
| "learning_rate": 5.162795547561325e-06, | |
| "loss": 1.3162, | |
| "step": 172000 | |
| }, | |
| { | |
| "epoch": 4.496754516305623, | |
| "grad_norm": 2.84608793258667, | |
| "learning_rate": 5.0324548369437715e-06, | |
| "loss": 1.3052, | |
| "step": 172500 | |
| }, | |
| { | |
| "epoch": 4.509788587367378, | |
| "grad_norm": 4.425991058349609, | |
| "learning_rate": 4.902114126326217e-06, | |
| "loss": 1.3252, | |
| "step": 173000 | |
| }, | |
| { | |
| "epoch": 4.522822658429134, | |
| "grad_norm": 21.735198974609375, | |
| "learning_rate": 4.771773415708663e-06, | |
| "loss": 1.3333, | |
| "step": 173500 | |
| }, | |
| { | |
| "epoch": 4.535856729490889, | |
| "grad_norm": 4.519357204437256, | |
| "learning_rate": 4.641432705091108e-06, | |
| "loss": 1.3115, | |
| "step": 174000 | |
| }, | |
| { | |
| "epoch": 4.548890800552645, | |
| "grad_norm": 25.662084579467773, | |
| "learning_rate": 4.511091994473554e-06, | |
| "loss": 1.3134, | |
| "step": 174500 | |
| }, | |
| { | |
| "epoch": 4.5619248716144, | |
| "grad_norm": 3.4979422092437744, | |
| "learning_rate": 4.3807512838560005e-06, | |
| "loss": 1.3202, | |
| "step": 175000 | |
| }, | |
| { | |
| "epoch": 4.574958942676155, | |
| "grad_norm": 4.444785118103027, | |
| "learning_rate": 4.250410573238446e-06, | |
| "loss": 1.3174, | |
| "step": 175500 | |
| }, | |
| { | |
| "epoch": 4.587993013737911, | |
| "grad_norm": 6.712714672088623, | |
| "learning_rate": 4.120069862620891e-06, | |
| "loss": 1.3343, | |
| "step": 176000 | |
| }, | |
| { | |
| "epoch": 4.601027084799666, | |
| "grad_norm": 4.870098114013672, | |
| "learning_rate": 3.9897291520033364e-06, | |
| "loss": 1.3312, | |
| "step": 176500 | |
| }, | |
| { | |
| "epoch": 4.614061155861422, | |
| "grad_norm": 4.5157928466796875, | |
| "learning_rate": 3.859388441385783e-06, | |
| "loss": 1.3133, | |
| "step": 177000 | |
| }, | |
| { | |
| "epoch": 4.627095226923177, | |
| "grad_norm": 3.297917366027832, | |
| "learning_rate": 3.7290477307682287e-06, | |
| "loss": 1.34, | |
| "step": 177500 | |
| }, | |
| { | |
| "epoch": 4.640129297984933, | |
| "grad_norm": 5.5820698738098145, | |
| "learning_rate": 3.598707020150674e-06, | |
| "loss": 1.2856, | |
| "step": 178000 | |
| }, | |
| { | |
| "epoch": 4.653163369046688, | |
| "grad_norm": 68.55699157714844, | |
| "learning_rate": 3.4683663095331198e-06, | |
| "loss": 1.3293, | |
| "step": 178500 | |
| }, | |
| { | |
| "epoch": 4.666197440108443, | |
| "grad_norm": 4.395013332366943, | |
| "learning_rate": 3.338025598915565e-06, | |
| "loss": 1.3156, | |
| "step": 179000 | |
| }, | |
| { | |
| "epoch": 4.679231511170199, | |
| "grad_norm": 4.131389141082764, | |
| "learning_rate": 3.2076848882980112e-06, | |
| "loss": 1.3349, | |
| "step": 179500 | |
| }, | |
| { | |
| "epoch": 4.692265582231954, | |
| "grad_norm": 3.2444746494293213, | |
| "learning_rate": 3.077344177680457e-06, | |
| "loss": 1.2882, | |
| "step": 180000 | |
| }, | |
| { | |
| "epoch": 4.70529965329371, | |
| "grad_norm": 6.894190788269043, | |
| "learning_rate": 2.9470034670629027e-06, | |
| "loss": 1.3064, | |
| "step": 180500 | |
| }, | |
| { | |
| "epoch": 4.718333724355465, | |
| "grad_norm": 4.13007926940918, | |
| "learning_rate": 2.816662756445348e-06, | |
| "loss": 1.3319, | |
| "step": 181000 | |
| }, | |
| { | |
| "epoch": 4.731367795417221, | |
| "grad_norm": 4.010223388671875, | |
| "learning_rate": 2.686322045827794e-06, | |
| "loss": 1.3289, | |
| "step": 181500 | |
| }, | |
| { | |
| "epoch": 4.7444018664789755, | |
| "grad_norm": 5.212350845336914, | |
| "learning_rate": 2.55598133521024e-06, | |
| "loss": 1.3052, | |
| "step": 182000 | |
| }, | |
| { | |
| "epoch": 4.757435937540731, | |
| "grad_norm": 4.112293243408203, | |
| "learning_rate": 2.4256406245926856e-06, | |
| "loss": 1.3178, | |
| "step": 182500 | |
| }, | |
| { | |
| "epoch": 4.770470008602487, | |
| "grad_norm": 4.711720943450928, | |
| "learning_rate": 2.295299913975131e-06, | |
| "loss": 1.3017, | |
| "step": 183000 | |
| }, | |
| { | |
| "epoch": 4.783504079664242, | |
| "grad_norm": 4.1918439865112305, | |
| "learning_rate": 2.1649592033575766e-06, | |
| "loss": 1.3368, | |
| "step": 183500 | |
| }, | |
| { | |
| "epoch": 4.796538150725998, | |
| "grad_norm": 4.53779411315918, | |
| "learning_rate": 2.0346184927400227e-06, | |
| "loss": 1.3103, | |
| "step": 184000 | |
| }, | |
| { | |
| "epoch": 4.809572221787754, | |
| "grad_norm": 2.9776086807250977, | |
| "learning_rate": 1.9042777821224683e-06, | |
| "loss": 1.3325, | |
| "step": 184500 | |
| }, | |
| { | |
| "epoch": 4.822606292849509, | |
| "grad_norm": 5.410048007965088, | |
| "learning_rate": 1.773937071504914e-06, | |
| "loss": 1.324, | |
| "step": 185000 | |
| }, | |
| { | |
| "epoch": 4.835640363911264, | |
| "grad_norm": 5.260219573974609, | |
| "learning_rate": 1.6435963608873595e-06, | |
| "loss": 1.3339, | |
| "step": 185500 | |
| }, | |
| { | |
| "epoch": 4.848674434973019, | |
| "grad_norm": 5.610768795013428, | |
| "learning_rate": 1.5132556502698054e-06, | |
| "loss": 1.2985, | |
| "step": 186000 | |
| }, | |
| { | |
| "epoch": 4.861708506034775, | |
| "grad_norm": 6.287191390991211, | |
| "learning_rate": 1.382914939652251e-06, | |
| "loss": 1.2973, | |
| "step": 186500 | |
| }, | |
| { | |
| "epoch": 4.87474257709653, | |
| "grad_norm": 32.12895202636719, | |
| "learning_rate": 1.2525742290346967e-06, | |
| "loss": 1.2914, | |
| "step": 187000 | |
| }, | |
| { | |
| "epoch": 4.887776648158286, | |
| "grad_norm": 15.296839714050293, | |
| "learning_rate": 1.1222335184171426e-06, | |
| "loss": 1.3231, | |
| "step": 187500 | |
| }, | |
| { | |
| "epoch": 4.900810719220042, | |
| "grad_norm": 4.650936126708984, | |
| "learning_rate": 9.918928077995881e-07, | |
| "loss": 1.2902, | |
| "step": 188000 | |
| }, | |
| { | |
| "epoch": 4.9138447902817965, | |
| "grad_norm": 25.2452335357666, | |
| "learning_rate": 8.615520971820338e-07, | |
| "loss": 1.2964, | |
| "step": 188500 | |
| }, | |
| { | |
| "epoch": 4.926878861343552, | |
| "grad_norm": 4.3756890296936035, | |
| "learning_rate": 7.312113865644796e-07, | |
| "loss": 1.3137, | |
| "step": 189000 | |
| }, | |
| { | |
| "epoch": 4.939912932405307, | |
| "grad_norm": 32.994510650634766, | |
| "learning_rate": 6.008706759469253e-07, | |
| "loss": 1.3033, | |
| "step": 189500 | |
| }, | |
| { | |
| "epoch": 4.952947003467063, | |
| "grad_norm": 3.0575180053710938, | |
| "learning_rate": 4.70529965329371e-07, | |
| "loss": 1.2992, | |
| "step": 190000 | |
| }, | |
| { | |
| "epoch": 4.965981074528818, | |
| "grad_norm": 4.4134135246276855, | |
| "learning_rate": 3.401892547118167e-07, | |
| "loss": 1.2839, | |
| "step": 190500 | |
| }, | |
| { | |
| "epoch": 4.979015145590574, | |
| "grad_norm": 40.072750091552734, | |
| "learning_rate": 2.0984854409426243e-07, | |
| "loss": 1.3057, | |
| "step": 191000 | |
| }, | |
| { | |
| "epoch": 4.99204921665233, | |
| "grad_norm": 19.755613327026367, | |
| "learning_rate": 7.950783347670812e-08, | |
| "loss": 1.2946, | |
| "step": 191500 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 191805, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 10000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.066567392204288e+17, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |