{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9991487623003845, "eval_steps": 500, "global_step": 2751, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010895842555075079, "grad_norm": 2.5097851753234863, "learning_rate": 1.9927299163940386e-05, "loss": 0.51, "step": 10 }, { "epoch": 0.021791685110150158, "grad_norm": 1.4778149127960205, "learning_rate": 1.985459832788077e-05, "loss": 0.0818, "step": 20 }, { "epoch": 0.03268752766522524, "grad_norm": 1.0869137048721313, "learning_rate": 1.978189749182116e-05, "loss": 0.1052, "step": 30 }, { "epoch": 0.043583370220300316, "grad_norm": 1.0245683193206787, "learning_rate": 1.970919665576154e-05, "loss": 0.0508, "step": 40 }, { "epoch": 0.054479212775375395, "grad_norm": 3.580275535583496, "learning_rate": 1.963649581970193e-05, "loss": 0.0336, "step": 50 }, { "epoch": 0.06537505533045047, "grad_norm": 13.708664894104004, "learning_rate": 1.9563794983642313e-05, "loss": 0.0247, "step": 60 }, { "epoch": 0.07627089788552556, "grad_norm": 2.38472843170166, "learning_rate": 1.9491094147582698e-05, "loss": 0.0474, "step": 70 }, { "epoch": 0.08716674044060063, "grad_norm": 2.3008601665496826, "learning_rate": 1.9418393311523086e-05, "loss": 0.0505, "step": 80 }, { "epoch": 0.09806258299567572, "grad_norm": 0.9727557301521301, "learning_rate": 1.9345692475463468e-05, "loss": 0.0291, "step": 90 }, { "epoch": 0.10895842555075079, "grad_norm": 0.017909426242113113, "learning_rate": 1.9272991639403856e-05, "loss": 0.0232, "step": 100 }, { "epoch": 0.11985426810582588, "grad_norm": 0.13610009849071503, "learning_rate": 1.920029080334424e-05, "loss": 0.0679, "step": 110 }, { "epoch": 0.13075011066090095, "grad_norm": 0.09146017581224442, "learning_rate": 1.9127589967284625e-05, "loss": 0.062, "step": 120 }, { "epoch": 0.14164595321597603, "grad_norm": 3.696361541748047, "learning_rate": 1.9054889131225013e-05, "loss": 0.0515, "step": 130 }, { "epoch": 0.15254179577105112, "grad_norm": 0.07528296858072281, "learning_rate": 1.8982188295165395e-05, "loss": 0.008, "step": 140 }, { "epoch": 0.16343763832612618, "grad_norm": 0.3899368345737457, "learning_rate": 1.8909487459105783e-05, "loss": 0.0309, "step": 150 }, { "epoch": 0.17433348088120126, "grad_norm": 0.5960955619812012, "learning_rate": 1.8836786623046168e-05, "loss": 0.0415, "step": 160 }, { "epoch": 0.18522932343627635, "grad_norm": 0.027237065136432648, "learning_rate": 1.8764085786986552e-05, "loss": 0.0257, "step": 170 }, { "epoch": 0.19612516599135144, "grad_norm": 6.851381778717041, "learning_rate": 1.8691384950926937e-05, "loss": 0.0585, "step": 180 }, { "epoch": 0.2070210085464265, "grad_norm": 1.518951416015625, "learning_rate": 1.8618684114867322e-05, "loss": 0.0173, "step": 190 }, { "epoch": 0.21791685110150158, "grad_norm": 16.334980010986328, "learning_rate": 1.854598327880771e-05, "loss": 0.0892, "step": 200 }, { "epoch": 0.22881269365657667, "grad_norm": 0.6327227354049683, "learning_rate": 1.847328244274809e-05, "loss": 0.0391, "step": 210 }, { "epoch": 0.23970853621165175, "grad_norm": 0.026528311893343925, "learning_rate": 1.840058160668848e-05, "loss": 0.0316, "step": 220 }, { "epoch": 0.2506043787667268, "grad_norm": 0.15849582850933075, "learning_rate": 1.8327880770628864e-05, "loss": 0.0306, "step": 230 }, { "epoch": 0.2615002213218019, "grad_norm": 8.983612060546875, "learning_rate": 1.825517993456925e-05, "loss": 0.0252, "step": 240 }, { "epoch": 0.272396063876877, "grad_norm": 1.4300966262817383, "learning_rate": 1.8182479098509634e-05, "loss": 0.0307, "step": 250 }, { "epoch": 0.28329190643195207, "grad_norm": 0.19248631596565247, "learning_rate": 1.810977826245002e-05, "loss": 0.034, "step": 260 }, { "epoch": 0.29418774898702715, "grad_norm": 0.0807420164346695, "learning_rate": 1.8037077426390407e-05, "loss": 0.0218, "step": 270 }, { "epoch": 0.30508359154210224, "grad_norm": 0.04030030593276024, "learning_rate": 1.796437659033079e-05, "loss": 0.0164, "step": 280 }, { "epoch": 0.31597943409717727, "grad_norm": 0.03919893503189087, "learning_rate": 1.7891675754271176e-05, "loss": 0.0207, "step": 290 }, { "epoch": 0.32687527665225236, "grad_norm": 0.9118878245353699, "learning_rate": 1.781897491821156e-05, "loss": 0.0254, "step": 300 }, { "epoch": 0.33777111920732744, "grad_norm": 0.09405702352523804, "learning_rate": 1.7746274082151945e-05, "loss": 0.0072, "step": 310 }, { "epoch": 0.3486669617624025, "grad_norm": 1.061004638671875, "learning_rate": 1.7673573246092334e-05, "loss": 0.0178, "step": 320 }, { "epoch": 0.3595628043174776, "grad_norm": 0.35136711597442627, "learning_rate": 1.7600872410032715e-05, "loss": 0.0268, "step": 330 }, { "epoch": 0.3704586468725527, "grad_norm": 0.33769288659095764, "learning_rate": 1.7528171573973103e-05, "loss": 0.0383, "step": 340 }, { "epoch": 0.3813544894276278, "grad_norm": 1.448626160621643, "learning_rate": 1.7455470737913488e-05, "loss": 0.0214, "step": 350 }, { "epoch": 0.39225033198270287, "grad_norm": 1.096685767173767, "learning_rate": 1.7382769901853873e-05, "loss": 0.0442, "step": 360 }, { "epoch": 0.4031461745377779, "grad_norm": 0.08582064509391785, "learning_rate": 1.7310069065794257e-05, "loss": 0.041, "step": 370 }, { "epoch": 0.414042017092853, "grad_norm": 0.5726041793823242, "learning_rate": 1.7237368229734642e-05, "loss": 0.02, "step": 380 }, { "epoch": 0.4249378596479281, "grad_norm": 0.27912572026252747, "learning_rate": 1.716466739367503e-05, "loss": 0.033, "step": 390 }, { "epoch": 0.43583370220300316, "grad_norm": 0.40194639563560486, "learning_rate": 1.7091966557615415e-05, "loss": 0.0297, "step": 400 }, { "epoch": 0.44672954475807825, "grad_norm": 0.4923015832901001, "learning_rate": 1.70192657215558e-05, "loss": 0.0473, "step": 410 }, { "epoch": 0.45762538731315333, "grad_norm": 0.4864579439163208, "learning_rate": 1.6946564885496184e-05, "loss": 0.0335, "step": 420 }, { "epoch": 0.4685212298682284, "grad_norm": 0.0577218122780323, "learning_rate": 1.687386404943657e-05, "loss": 0.0267, "step": 430 }, { "epoch": 0.4794170724233035, "grad_norm": 0.026588434353470802, "learning_rate": 1.6801163213376954e-05, "loss": 0.0242, "step": 440 }, { "epoch": 0.4903129149783786, "grad_norm": 1.106031060218811, "learning_rate": 1.6728462377317342e-05, "loss": 0.0412, "step": 450 }, { "epoch": 0.5012087575334536, "grad_norm": 2.185438394546509, "learning_rate": 1.6655761541257727e-05, "loss": 0.0168, "step": 460 }, { "epoch": 0.5121046000885288, "grad_norm": 0.2645202577114105, "learning_rate": 1.658306070519811e-05, "loss": 0.0225, "step": 470 }, { "epoch": 0.5230004426436038, "grad_norm": 0.26281026005744934, "learning_rate": 1.6510359869138496e-05, "loss": 0.0225, "step": 480 }, { "epoch": 0.5338962851986789, "grad_norm": 0.09611400961875916, "learning_rate": 1.643765903307888e-05, "loss": 0.0204, "step": 490 }, { "epoch": 0.544792127753754, "grad_norm": 0.2964985966682434, "learning_rate": 1.6364958197019266e-05, "loss": 0.0192, "step": 500 }, { "epoch": 0.555687970308829, "grad_norm": 3.2991862297058105, "learning_rate": 1.629225736095965e-05, "loss": 0.0395, "step": 510 }, { "epoch": 0.5665838128639041, "grad_norm": 0.9299785494804382, "learning_rate": 1.621955652490004e-05, "loss": 0.0213, "step": 520 }, { "epoch": 0.5774796554189792, "grad_norm": 1.7656854391098022, "learning_rate": 1.6146855688840423e-05, "loss": 0.0293, "step": 530 }, { "epoch": 0.5883754979740543, "grad_norm": 0.052940454334020615, "learning_rate": 1.6074154852780808e-05, "loss": 0.0349, "step": 540 }, { "epoch": 0.5992713405291293, "grad_norm": 0.6700181365013123, "learning_rate": 1.6001454016721193e-05, "loss": 0.0098, "step": 550 }, { "epoch": 0.6101671830842045, "grad_norm": 1.4992352724075317, "learning_rate": 1.5928753180661577e-05, "loss": 0.0209, "step": 560 }, { "epoch": 0.6210630256392795, "grad_norm": 0.6882705688476562, "learning_rate": 1.5856052344601966e-05, "loss": 0.0208, "step": 570 }, { "epoch": 0.6319588681943545, "grad_norm": 0.35566991567611694, "learning_rate": 1.578335150854235e-05, "loss": 0.0157, "step": 580 }, { "epoch": 0.6428547107494297, "grad_norm": 0.1365765929222107, "learning_rate": 1.5710650672482735e-05, "loss": 0.0207, "step": 590 }, { "epoch": 0.6537505533045047, "grad_norm": 0.010805984027683735, "learning_rate": 1.563794983642312e-05, "loss": 0.0386, "step": 600 }, { "epoch": 0.6646463958595799, "grad_norm": 0.33677366375923157, "learning_rate": 1.5565249000363505e-05, "loss": 0.0178, "step": 610 }, { "epoch": 0.6755422384146549, "grad_norm": 0.023768046870827675, "learning_rate": 1.5492548164303893e-05, "loss": 0.0115, "step": 620 }, { "epoch": 0.68643808096973, "grad_norm": 1.271041989326477, "learning_rate": 1.5419847328244274e-05, "loss": 0.0335, "step": 630 }, { "epoch": 0.697333923524805, "grad_norm": 0.39303043484687805, "learning_rate": 1.5347146492184662e-05, "loss": 0.0456, "step": 640 }, { "epoch": 0.7082297660798802, "grad_norm": 1.5450124740600586, "learning_rate": 1.5274445656125047e-05, "loss": 0.0206, "step": 650 }, { "epoch": 0.7191256086349552, "grad_norm": 0.12599903345108032, "learning_rate": 1.5201744820065432e-05, "loss": 0.0125, "step": 660 }, { "epoch": 0.7300214511900303, "grad_norm": 0.03158240765333176, "learning_rate": 1.5129043984005818e-05, "loss": 0.0019, "step": 670 }, { "epoch": 0.7409172937451054, "grad_norm": 1.2820944786071777, "learning_rate": 1.5056343147946201e-05, "loss": 0.0132, "step": 680 }, { "epoch": 0.7518131363001804, "grad_norm": 0.4018807113170624, "learning_rate": 1.4983642311886588e-05, "loss": 0.0274, "step": 690 }, { "epoch": 0.7627089788552556, "grad_norm": 0.7147946953773499, "learning_rate": 1.4910941475826972e-05, "loss": 0.0207, "step": 700 }, { "epoch": 0.7736048214103306, "grad_norm": 1.3514039516448975, "learning_rate": 1.4838240639767359e-05, "loss": 0.0088, "step": 710 }, { "epoch": 0.7845006639654057, "grad_norm": 0.10958287864923477, "learning_rate": 1.4765539803707745e-05, "loss": 0.0054, "step": 720 }, { "epoch": 0.7953965065204808, "grad_norm": 0.12291970103979111, "learning_rate": 1.4692838967648128e-05, "loss": 0.0154, "step": 730 }, { "epoch": 0.8062923490755558, "grad_norm": 0.056142911314964294, "learning_rate": 1.4620138131588515e-05, "loss": 0.0214, "step": 740 }, { "epoch": 0.817188191630631, "grad_norm": 0.08367596566677094, "learning_rate": 1.45474372955289e-05, "loss": 0.0074, "step": 750 }, { "epoch": 0.828084034185706, "grad_norm": 0.8847033381462097, "learning_rate": 1.4474736459469286e-05, "loss": 0.052, "step": 760 }, { "epoch": 0.8389798767407811, "grad_norm": 0.23346182703971863, "learning_rate": 1.4402035623409672e-05, "loss": 0.0238, "step": 770 }, { "epoch": 0.8498757192958561, "grad_norm": 0.7445326447486877, "learning_rate": 1.4329334787350055e-05, "loss": 0.0179, "step": 780 }, { "epoch": 0.8607715618509313, "grad_norm": 1.623715877532959, "learning_rate": 1.4256633951290442e-05, "loss": 0.0138, "step": 790 }, { "epoch": 0.8716674044060063, "grad_norm": 0.12205464392900467, "learning_rate": 1.4183933115230826e-05, "loss": 0.0182, "step": 800 }, { "epoch": 0.8825632469610815, "grad_norm": 0.015034107491374016, "learning_rate": 1.4111232279171211e-05, "loss": 0.0192, "step": 810 }, { "epoch": 0.8934590895161565, "grad_norm": 1.1116948127746582, "learning_rate": 1.4038531443111596e-05, "loss": 0.0329, "step": 820 }, { "epoch": 0.9043549320712315, "grad_norm": 0.35468608140945435, "learning_rate": 1.3965830607051982e-05, "loss": 0.0299, "step": 830 }, { "epoch": 0.9152507746263067, "grad_norm": 1.3069281578063965, "learning_rate": 1.3893129770992369e-05, "loss": 0.028, "step": 840 }, { "epoch": 0.9261466171813817, "grad_norm": 0.6548961997032166, "learning_rate": 1.3820428934932752e-05, "loss": 0.0125, "step": 850 }, { "epoch": 0.9370424597364568, "grad_norm": 0.016538333147764206, "learning_rate": 1.3747728098873138e-05, "loss": 0.0097, "step": 860 }, { "epoch": 0.9479383022915319, "grad_norm": 0.7220777273178101, "learning_rate": 1.3675027262813523e-05, "loss": 0.0281, "step": 870 }, { "epoch": 0.958834144846607, "grad_norm": 7.228305339813232, "learning_rate": 1.360232642675391e-05, "loss": 0.0095, "step": 880 }, { "epoch": 0.969729987401682, "grad_norm": 0.31951704621315, "learning_rate": 1.3529625590694292e-05, "loss": 0.0148, "step": 890 }, { "epoch": 0.9806258299567572, "grad_norm": 0.009546870365738869, "learning_rate": 1.3456924754634679e-05, "loss": 0.0051, "step": 900 }, { "epoch": 0.9915216725118322, "grad_norm": 2.050363063812256, "learning_rate": 1.3384223918575065e-05, "loss": 0.0306, "step": 910 }, { "epoch": 1.0032687527665225, "grad_norm": 1.1950825452804565, "learning_rate": 1.331152308251545e-05, "loss": 0.0061, "step": 920 }, { "epoch": 1.0141645953215976, "grad_norm": 0.02007538639008999, "learning_rate": 1.3238822246455837e-05, "loss": 0.005, "step": 930 }, { "epoch": 1.0250604378766728, "grad_norm": 0.053643591701984406, "learning_rate": 1.316612141039622e-05, "loss": 0.0093, "step": 940 }, { "epoch": 1.0359562804317477, "grad_norm": 0.13197128474712372, "learning_rate": 1.3093420574336606e-05, "loss": 0.0123, "step": 950 }, { "epoch": 1.0468521229868228, "grad_norm": 0.20932506024837494, "learning_rate": 1.3020719738276992e-05, "loss": 0.0267, "step": 960 }, { "epoch": 1.057747965541898, "grad_norm": 0.11939968913793564, "learning_rate": 1.2948018902217377e-05, "loss": 0.0042, "step": 970 }, { "epoch": 1.068643808096973, "grad_norm": 0.08671363443136215, "learning_rate": 1.2875318066157762e-05, "loss": 0.009, "step": 980 }, { "epoch": 1.079539650652048, "grad_norm": 0.025082537904381752, "learning_rate": 1.2802617230098147e-05, "loss": 0.0028, "step": 990 }, { "epoch": 1.0904354932071232, "grad_norm": 0.005358474794775248, "learning_rate": 1.2729916394038533e-05, "loss": 0.0017, "step": 1000 }, { "epoch": 1.1013313357621983, "grad_norm": 0.008662994019687176, "learning_rate": 1.2657215557978916e-05, "loss": 0.0013, "step": 1010 }, { "epoch": 1.1122271783172732, "grad_norm": 2.0191564559936523, "learning_rate": 1.2584514721919303e-05, "loss": 0.0179, "step": 1020 }, { "epoch": 1.1231230208723484, "grad_norm": 0.025384988635778427, "learning_rate": 1.2511813885859689e-05, "loss": 0.02, "step": 1030 }, { "epoch": 1.1340188634274235, "grad_norm": 0.011868833564221859, "learning_rate": 1.2439113049800074e-05, "loss": 0.0024, "step": 1040 }, { "epoch": 1.1449147059824987, "grad_norm": 0.010154581628739834, "learning_rate": 1.236641221374046e-05, "loss": 0.0053, "step": 1050 }, { "epoch": 1.1558105485375736, "grad_norm": 0.09402716159820557, "learning_rate": 1.2293711377680843e-05, "loss": 0.005, "step": 1060 }, { "epoch": 1.1667063910926487, "grad_norm": 0.3972262442111969, "learning_rate": 1.222101054162123e-05, "loss": 0.0065, "step": 1070 }, { "epoch": 1.1776022336477239, "grad_norm": 0.02627560682594776, "learning_rate": 1.2148309705561614e-05, "loss": 0.0192, "step": 1080 }, { "epoch": 1.1884980762027988, "grad_norm": 0.538215160369873, "learning_rate": 1.2075608869502e-05, "loss": 0.0073, "step": 1090 }, { "epoch": 1.199393918757874, "grad_norm": 0.48226070404052734, "learning_rate": 1.2002908033442387e-05, "loss": 0.0009, "step": 1100 }, { "epoch": 1.210289761312949, "grad_norm": 0.5596455335617065, "learning_rate": 1.193020719738277e-05, "loss": 0.0119, "step": 1110 }, { "epoch": 1.2211856038680242, "grad_norm": 0.03299971669912338, "learning_rate": 1.1857506361323157e-05, "loss": 0.0025, "step": 1120 }, { "epoch": 1.2320814464230991, "grad_norm": 0.03791365772485733, "learning_rate": 1.1784805525263541e-05, "loss": 0.0147, "step": 1130 }, { "epoch": 1.2429772889781743, "grad_norm": 0.6537386178970337, "learning_rate": 1.1712104689203926e-05, "loss": 0.0026, "step": 1140 }, { "epoch": 1.2538731315332494, "grad_norm": 0.02327698841691017, "learning_rate": 1.1639403853144313e-05, "loss": 0.0012, "step": 1150 }, { "epoch": 1.2647689740883243, "grad_norm": 0.024980690330266953, "learning_rate": 1.1566703017084697e-05, "loss": 0.0053, "step": 1160 }, { "epoch": 1.2756648166433995, "grad_norm": 0.01306835189461708, "learning_rate": 1.1494002181025084e-05, "loss": 0.0179, "step": 1170 }, { "epoch": 1.2865606591984746, "grad_norm": 0.005500817205756903, "learning_rate": 1.1421301344965467e-05, "loss": 0.0117, "step": 1180 }, { "epoch": 1.2974565017535498, "grad_norm": 2.294457197189331, "learning_rate": 1.1348600508905853e-05, "loss": 0.0065, "step": 1190 }, { "epoch": 1.3083523443086247, "grad_norm": 3.2596099376678467, "learning_rate": 1.1275899672846238e-05, "loss": 0.0128, "step": 1200 }, { "epoch": 1.3192481868636998, "grad_norm": 0.014325232245028019, "learning_rate": 1.1203198836786624e-05, "loss": 0.004, "step": 1210 }, { "epoch": 1.330144029418775, "grad_norm": 0.08742561936378479, "learning_rate": 1.1130498000727011e-05, "loss": 0.005, "step": 1220 }, { "epoch": 1.3410398719738499, "grad_norm": 0.06310788542032242, "learning_rate": 1.1057797164667394e-05, "loss": 0.0062, "step": 1230 }, { "epoch": 1.351935714528925, "grad_norm": 0.02661961503326893, "learning_rate": 1.098509632860778e-05, "loss": 0.001, "step": 1240 }, { "epoch": 1.3628315570840002, "grad_norm": 0.008728576824069023, "learning_rate": 1.0912395492548165e-05, "loss": 0.0065, "step": 1250 }, { "epoch": 1.3737273996390753, "grad_norm": 0.40287479758262634, "learning_rate": 1.0839694656488552e-05, "loss": 0.0115, "step": 1260 }, { "epoch": 1.3846232421941502, "grad_norm": 0.0008290009573101997, "learning_rate": 1.0766993820428935e-05, "loss": 0.0023, "step": 1270 }, { "epoch": 1.3955190847492254, "grad_norm": 0.20154079794883728, "learning_rate": 1.0694292984369321e-05, "loss": 0.004, "step": 1280 }, { "epoch": 1.4064149273043005, "grad_norm": 0.032378897070884705, "learning_rate": 1.0621592148309707e-05, "loss": 0.0103, "step": 1290 }, { "epoch": 1.4173107698593754, "grad_norm": 0.037077393382787704, "learning_rate": 1.0548891312250092e-05, "loss": 0.0048, "step": 1300 }, { "epoch": 1.4282066124144506, "grad_norm": 0.0009527279180474579, "learning_rate": 1.0476190476190477e-05, "loss": 0.0197, "step": 1310 }, { "epoch": 1.4391024549695257, "grad_norm": 0.6460732221603394, "learning_rate": 1.0403489640130862e-05, "loss": 0.0085, "step": 1320 }, { "epoch": 1.4499982975246009, "grad_norm": 0.18065184354782104, "learning_rate": 1.0330788804071248e-05, "loss": 0.0021, "step": 1330 }, { "epoch": 1.4608941400796758, "grad_norm": 0.08325136452913284, "learning_rate": 1.0258087968011631e-05, "loss": 0.0079, "step": 1340 }, { "epoch": 1.471789982634751, "grad_norm": 0.0035695817787200212, "learning_rate": 1.0185387131952018e-05, "loss": 0.0001, "step": 1350 }, { "epoch": 1.482685825189826, "grad_norm": 0.00448552705347538, "learning_rate": 1.0112686295892404e-05, "loss": 0.0004, "step": 1360 }, { "epoch": 1.493581667744901, "grad_norm": 0.027783585712313652, "learning_rate": 1.0039985459832789e-05, "loss": 0.011, "step": 1370 }, { "epoch": 1.5044775102999761, "grad_norm": 2.4403154850006104, "learning_rate": 9.967284623773175e-06, "loss": 0.0162, "step": 1380 }, { "epoch": 1.5153733528550513, "grad_norm": 0.031121332198381424, "learning_rate": 9.89458378771356e-06, "loss": 0.0019, "step": 1390 }, { "epoch": 1.5262691954101264, "grad_norm": 0.01372817624360323, "learning_rate": 9.821882951653945e-06, "loss": 0.0107, "step": 1400 }, { "epoch": 1.5371650379652015, "grad_norm": 0.015296364203095436, "learning_rate": 9.74918211559433e-06, "loss": 0.0107, "step": 1410 }, { "epoch": 1.5480608805202765, "grad_norm": 0.022742554545402527, "learning_rate": 9.676481279534716e-06, "loss": 0.0055, "step": 1420 }, { "epoch": 1.5589567230753516, "grad_norm": 0.005425534211099148, "learning_rate": 9.6037804434751e-06, "loss": 0.001, "step": 1430 }, { "epoch": 1.5698525656304265, "grad_norm": 0.0004977713688276708, "learning_rate": 9.531079607415487e-06, "loss": 0.0015, "step": 1440 }, { "epoch": 1.5807484081855017, "grad_norm": 0.016388392075896263, "learning_rate": 9.458378771355872e-06, "loss": 0.0213, "step": 1450 }, { "epoch": 1.5916442507405768, "grad_norm": 0.029239172115921974, "learning_rate": 9.385677935296256e-06, "loss": 0.0032, "step": 1460 }, { "epoch": 1.602540093295652, "grad_norm": 0.25184109807014465, "learning_rate": 9.312977099236641e-06, "loss": 0.0139, "step": 1470 }, { "epoch": 1.613435935850727, "grad_norm": 0.5452978014945984, "learning_rate": 9.240276263177028e-06, "loss": 0.001, "step": 1480 }, { "epoch": 1.624331778405802, "grad_norm": 0.00713045010343194, "learning_rate": 9.167575427117412e-06, "loss": 0.0068, "step": 1490 }, { "epoch": 1.6352276209608771, "grad_norm": 0.04856117442250252, "learning_rate": 9.094874591057799e-06, "loss": 0.013, "step": 1500 }, { "epoch": 1.646123463515952, "grad_norm": 0.6631866693496704, "learning_rate": 9.022173754998184e-06, "loss": 0.0118, "step": 1510 }, { "epoch": 1.6570193060710272, "grad_norm": 0.34849047660827637, "learning_rate": 8.949472918938568e-06, "loss": 0.004, "step": 1520 }, { "epoch": 1.6679151486261024, "grad_norm": 0.011874212883412838, "learning_rate": 8.876772082878955e-06, "loss": 0.002, "step": 1530 }, { "epoch": 1.6788109911811775, "grad_norm": 0.05654163286089897, "learning_rate": 8.80407124681934e-06, "loss": 0.0033, "step": 1540 }, { "epoch": 1.6897068337362526, "grad_norm": 0.05505364388227463, "learning_rate": 8.731370410759724e-06, "loss": 0.0016, "step": 1550 }, { "epoch": 1.7006026762913276, "grad_norm": 0.8052054047584534, "learning_rate": 8.658669574700109e-06, "loss": 0.0033, "step": 1560 }, { "epoch": 1.7114985188464027, "grad_norm": 0.001815033028833568, "learning_rate": 8.585968738640495e-06, "loss": 0.0026, "step": 1570 }, { "epoch": 1.7223943614014776, "grad_norm": 0.17480531334877014, "learning_rate": 8.51326790258088e-06, "loss": 0.0064, "step": 1580 }, { "epoch": 1.7332902039565528, "grad_norm": 0.005486777517944574, "learning_rate": 8.440567066521266e-06, "loss": 0.0208, "step": 1590 }, { "epoch": 1.744186046511628, "grad_norm": 0.10310015082359314, "learning_rate": 8.367866230461651e-06, "loss": 0.0005, "step": 1600 }, { "epoch": 1.755081889066703, "grad_norm": 0.008104170672595501, "learning_rate": 8.295165394402036e-06, "loss": 0.0087, "step": 1610 }, { "epoch": 1.7659777316217782, "grad_norm": 0.033456411212682724, "learning_rate": 8.22246455834242e-06, "loss": 0.0072, "step": 1620 }, { "epoch": 1.776873574176853, "grad_norm": 0.007005383726209402, "learning_rate": 8.149763722282807e-06, "loss": 0.014, "step": 1630 }, { "epoch": 1.7877694167319282, "grad_norm": 0.012260228395462036, "learning_rate": 8.077062886223192e-06, "loss": 0.0008, "step": 1640 }, { "epoch": 1.7986652592870032, "grad_norm": 0.0009957356378436089, "learning_rate": 8.004362050163578e-06, "loss": 0.0014, "step": 1650 }, { "epoch": 1.8095611018420783, "grad_norm": 0.005955096334218979, "learning_rate": 7.931661214103963e-06, "loss": 0.0005, "step": 1660 }, { "epoch": 1.8204569443971534, "grad_norm": 0.0004700123390648514, "learning_rate": 7.858960378044348e-06, "loss": 0.0028, "step": 1670 }, { "epoch": 1.8313527869522286, "grad_norm": 0.002416003029793501, "learning_rate": 7.786259541984733e-06, "loss": 0.0003, "step": 1680 }, { "epoch": 1.8422486295073037, "grad_norm": 0.028112288564443588, "learning_rate": 7.713558705925119e-06, "loss": 0.0318, "step": 1690 }, { "epoch": 1.8531444720623786, "grad_norm": 0.03914355859160423, "learning_rate": 7.640857869865504e-06, "loss": 0.0139, "step": 1700 }, { "epoch": 1.8640403146174538, "grad_norm": 4.869634628295898, "learning_rate": 7.568157033805889e-06, "loss": 0.0098, "step": 1710 }, { "epoch": 1.8749361571725287, "grad_norm": 1.1335488557815552, "learning_rate": 7.495456197746275e-06, "loss": 0.0174, "step": 1720 }, { "epoch": 1.8858319997276038, "grad_norm": 0.6747786402702332, "learning_rate": 7.42275536168666e-06, "loss": 0.0044, "step": 1730 }, { "epoch": 1.896727842282679, "grad_norm": 0.9970724582672119, "learning_rate": 7.350054525627045e-06, "loss": 0.0087, "step": 1740 }, { "epoch": 1.9076236848377541, "grad_norm": 0.16893063485622406, "learning_rate": 7.27735368956743e-06, "loss": 0.0032, "step": 1750 }, { "epoch": 1.9185195273928293, "grad_norm": 0.8119887709617615, "learning_rate": 7.204652853507816e-06, "loss": 0.0153, "step": 1760 }, { "epoch": 1.9294153699479044, "grad_norm": 0.006383243482559919, "learning_rate": 7.131952017448202e-06, "loss": 0.0034, "step": 1770 }, { "epoch": 1.9403112125029793, "grad_norm": 0.03637854754924774, "learning_rate": 7.059251181388587e-06, "loss": 0.0034, "step": 1780 }, { "epoch": 1.9512070550580543, "grad_norm": 0.04712774232029915, "learning_rate": 6.9865503453289714e-06, "loss": 0.0234, "step": 1790 }, { "epoch": 1.9621028976131294, "grad_norm": 6.268856525421143, "learning_rate": 6.913849509269357e-06, "loss": 0.0265, "step": 1800 }, { "epoch": 1.9729987401682045, "grad_norm": 0.6448054313659668, "learning_rate": 6.841148673209742e-06, "loss": 0.0057, "step": 1810 }, { "epoch": 1.9838945827232797, "grad_norm": 0.07000619918107986, "learning_rate": 6.768447837150128e-06, "loss": 0.0005, "step": 1820 }, { "epoch": 1.9947904252783548, "grad_norm": 0.012424224987626076, "learning_rate": 6.695747001090514e-06, "loss": 0.0039, "step": 1830 }, { "epoch": 2.006537505533045, "grad_norm": 0.08453727513551712, "learning_rate": 6.6230461650308985e-06, "loss": 0.0006, "step": 1840 }, { "epoch": 2.01743334808812, "grad_norm": 0.0390053391456604, "learning_rate": 6.550345328971284e-06, "loss": 0.0006, "step": 1850 }, { "epoch": 2.0283291906431953, "grad_norm": 0.013394408859312534, "learning_rate": 6.477644492911669e-06, "loss": 0.0049, "step": 1860 }, { "epoch": 2.0392250331982704, "grad_norm": 0.0027593837585300207, "learning_rate": 6.404943656852054e-06, "loss": 0.0008, "step": 1870 }, { "epoch": 2.0501208757533456, "grad_norm": 0.0010020197369158268, "learning_rate": 6.332242820792439e-06, "loss": 0.0023, "step": 1880 }, { "epoch": 2.0610167183084203, "grad_norm": 0.0010899041080847383, "learning_rate": 6.259541984732826e-06, "loss": 0.0005, "step": 1890 }, { "epoch": 2.0719125608634954, "grad_norm": 0.03333039954304695, "learning_rate": 6.18684114867321e-06, "loss": 0.0011, "step": 1900 }, { "epoch": 2.0828084034185705, "grad_norm": 0.002606542780995369, "learning_rate": 6.114140312613596e-06, "loss": 0.0062, "step": 1910 }, { "epoch": 2.0937042459736457, "grad_norm": 0.008523502387106419, "learning_rate": 6.041439476553981e-06, "loss": 0.0001, "step": 1920 }, { "epoch": 2.104600088528721, "grad_norm": 0.005313311703503132, "learning_rate": 5.968738640494366e-06, "loss": 0.0095, "step": 1930 }, { "epoch": 2.115495931083796, "grad_norm": 0.030115563422441483, "learning_rate": 5.896037804434751e-06, "loss": 0.0011, "step": 1940 }, { "epoch": 2.126391773638871, "grad_norm": 0.001531143207103014, "learning_rate": 5.823336968375137e-06, "loss": 0.0047, "step": 1950 }, { "epoch": 2.137287616193946, "grad_norm": 0.013100974261760712, "learning_rate": 5.750636132315522e-06, "loss": 0.0041, "step": 1960 }, { "epoch": 2.148183458749021, "grad_norm": 0.010219580493867397, "learning_rate": 5.677935296255908e-06, "loss": 0.0012, "step": 1970 }, { "epoch": 2.159079301304096, "grad_norm": 0.02304321527481079, "learning_rate": 5.6052344601962925e-06, "loss": 0.0006, "step": 1980 }, { "epoch": 2.1699751438591712, "grad_norm": 0.32716256380081177, "learning_rate": 5.532533624136678e-06, "loss": 0.0005, "step": 1990 }, { "epoch": 2.1808709864142464, "grad_norm": 0.003199178259819746, "learning_rate": 5.459832788077063e-06, "loss": 0.0002, "step": 2000 }, { "epoch": 2.1917668289693215, "grad_norm": 0.10407451540231705, "learning_rate": 5.387131952017448e-06, "loss": 0.0026, "step": 2010 }, { "epoch": 2.2026626715243967, "grad_norm": 0.0036433066707104445, "learning_rate": 5.314431115957834e-06, "loss": 0.0053, "step": 2020 }, { "epoch": 2.2135585140794714, "grad_norm": 0.22139491140842438, "learning_rate": 5.2417302798982195e-06, "loss": 0.0013, "step": 2030 }, { "epoch": 2.2244543566345465, "grad_norm": 0.00901265349239111, "learning_rate": 5.169029443838604e-06, "loss": 0.0004, "step": 2040 }, { "epoch": 2.2353501991896216, "grad_norm": 0.007596256677061319, "learning_rate": 5.09632860777899e-06, "loss": 0.0002, "step": 2050 }, { "epoch": 2.2462460417446968, "grad_norm": 0.05308268591761589, "learning_rate": 5.023627771719375e-06, "loss": 0.0001, "step": 2060 }, { "epoch": 2.257141884299772, "grad_norm": 0.005023419391363859, "learning_rate": 4.95092693565976e-06, "loss": 0.0001, "step": 2070 }, { "epoch": 2.268037726854847, "grad_norm": 0.09251435101032257, "learning_rate": 4.878226099600146e-06, "loss": 0.0008, "step": 2080 }, { "epoch": 2.278933569409922, "grad_norm": 0.0035660325083881617, "learning_rate": 4.8055252635405305e-06, "loss": 0.0029, "step": 2090 }, { "epoch": 2.2898294119649973, "grad_norm": 0.00022365724726114422, "learning_rate": 4.732824427480917e-06, "loss": 0.0, "step": 2100 }, { "epoch": 2.300725254520072, "grad_norm": 0.28966161608695984, "learning_rate": 4.660123591421302e-06, "loss": 0.0004, "step": 2110 }, { "epoch": 2.311621097075147, "grad_norm": 0.000494773150421679, "learning_rate": 4.5874227553616864e-06, "loss": 0.0003, "step": 2120 }, { "epoch": 2.3225169396302223, "grad_norm": 0.2110077142715454, "learning_rate": 4.514721919302073e-06, "loss": 0.0007, "step": 2130 }, { "epoch": 2.3334127821852975, "grad_norm": 0.0006416022079065442, "learning_rate": 4.442021083242458e-06, "loss": 0.0006, "step": 2140 }, { "epoch": 2.3443086247403726, "grad_norm": 0.0005581114673987031, "learning_rate": 4.369320247182842e-06, "loss": 0.0004, "step": 2150 }, { "epoch": 2.3552044672954477, "grad_norm": 0.0006430571665987372, "learning_rate": 4.296619411123229e-06, "loss": 0.0013, "step": 2160 }, { "epoch": 2.3661003098505224, "grad_norm": 0.0002313524018973112, "learning_rate": 4.2239185750636135e-06, "loss": 0.0011, "step": 2170 }, { "epoch": 2.3769961524055976, "grad_norm": 0.01299639604985714, "learning_rate": 4.151217739003999e-06, "loss": 0.0002, "step": 2180 }, { "epoch": 2.3878919949606727, "grad_norm": 0.036279868334531784, "learning_rate": 4.078516902944385e-06, "loss": 0.0, "step": 2190 }, { "epoch": 2.398787837515748, "grad_norm": 0.0004496763285715133, "learning_rate": 4.005816066884769e-06, "loss": 0.0, "step": 2200 }, { "epoch": 2.409683680070823, "grad_norm": 0.010034661740064621, "learning_rate": 3.933115230825155e-06, "loss": 0.0, "step": 2210 }, { "epoch": 2.420579522625898, "grad_norm": 0.0027114665135741234, "learning_rate": 3.860414394765541e-06, "loss": 0.0, "step": 2220 }, { "epoch": 2.4314753651809733, "grad_norm": 0.00021306249254848808, "learning_rate": 3.7877135587059253e-06, "loss": 0.0, "step": 2230 }, { "epoch": 2.4423712077360484, "grad_norm": 0.002327492693439126, "learning_rate": 3.7150127226463105e-06, "loss": 0.0, "step": 2240 }, { "epoch": 2.453267050291123, "grad_norm": 0.0042752730660140514, "learning_rate": 3.6423118865866965e-06, "loss": 0.0001, "step": 2250 }, { "epoch": 2.4641628928461983, "grad_norm": 0.5819891691207886, "learning_rate": 3.5696110505270817e-06, "loss": 0.0014, "step": 2260 }, { "epoch": 2.4750587354012734, "grad_norm": 0.0002232871629530564, "learning_rate": 3.4969102144674664e-06, "loss": 0.0, "step": 2270 }, { "epoch": 2.4859545779563486, "grad_norm": 0.0006547856028191745, "learning_rate": 3.4242093784078516e-06, "loss": 0.0, "step": 2280 }, { "epoch": 2.4968504205114237, "grad_norm": 0.007096582092344761, "learning_rate": 3.3515085423482376e-06, "loss": 0.0, "step": 2290 }, { "epoch": 2.507746263066499, "grad_norm": 0.007319641765207052, "learning_rate": 3.2788077062886227e-06, "loss": 0.0, "step": 2300 }, { "epoch": 2.5186421056215735, "grad_norm": 0.00013177268556319177, "learning_rate": 3.206106870229008e-06, "loss": 0.0, "step": 2310 }, { "epoch": 2.5295379481766487, "grad_norm": 0.001638653688132763, "learning_rate": 3.1334060341693935e-06, "loss": 0.0002, "step": 2320 }, { "epoch": 2.540433790731724, "grad_norm": 0.00048312891158275306, "learning_rate": 3.0607051981097786e-06, "loss": 0.0, "step": 2330 }, { "epoch": 2.551329633286799, "grad_norm": 0.001063148258253932, "learning_rate": 2.988004362050164e-06, "loss": 0.0001, "step": 2340 }, { "epoch": 2.562225475841874, "grad_norm": 0.005976190324872732, "learning_rate": 2.9153035259905494e-06, "loss": 0.0, "step": 2350 }, { "epoch": 2.5731213183969492, "grad_norm": 0.001030449871905148, "learning_rate": 2.8426026899309345e-06, "loss": 0.0001, "step": 2360 }, { "epoch": 2.5840171609520244, "grad_norm": 0.000677391595672816, "learning_rate": 2.7699018538713197e-06, "loss": 0.0016, "step": 2370 }, { "epoch": 2.5949130035070995, "grad_norm": 1.1224867105484009, "learning_rate": 2.6972010178117053e-06, "loss": 0.0036, "step": 2380 }, { "epoch": 2.6058088460621747, "grad_norm": 0.0026874279137700796, "learning_rate": 2.6245001817520905e-06, "loss": 0.0, "step": 2390 }, { "epoch": 2.6167046886172494, "grad_norm": 0.003862058976665139, "learning_rate": 2.5517993456924756e-06, "loss": 0.0001, "step": 2400 }, { "epoch": 2.6276005311723245, "grad_norm": 0.0830313041806221, "learning_rate": 2.4790985096328608e-06, "loss": 0.0014, "step": 2410 }, { "epoch": 2.6384963737273996, "grad_norm": 0.0019621718674898148, "learning_rate": 2.4063976735732464e-06, "loss": 0.0005, "step": 2420 }, { "epoch": 2.649392216282475, "grad_norm": 0.28306806087493896, "learning_rate": 2.3336968375136315e-06, "loss": 0.0002, "step": 2430 }, { "epoch": 2.66028805883755, "grad_norm": 0.004503046162426472, "learning_rate": 2.260996001454017e-06, "loss": 0.0, "step": 2440 }, { "epoch": 2.6711839013926246, "grad_norm": 0.0008729721885174513, "learning_rate": 2.1882951653944023e-06, "loss": 0.0008, "step": 2450 }, { "epoch": 2.6820797439476998, "grad_norm": 0.010283468291163445, "learning_rate": 2.1155943293347874e-06, "loss": 0.0, "step": 2460 }, { "epoch": 2.692975586502775, "grad_norm": 1.8014414308709092e-05, "learning_rate": 2.042893493275173e-06, "loss": 0.0012, "step": 2470 }, { "epoch": 2.70387142905785, "grad_norm": 0.0013227862073108554, "learning_rate": 1.970192657215558e-06, "loss": 0.0001, "step": 2480 }, { "epoch": 2.714767271612925, "grad_norm": 9.750492608873174e-05, "learning_rate": 1.8974918211559433e-06, "loss": 0.0012, "step": 2490 }, { "epoch": 2.7256631141680003, "grad_norm": 0.009569020941853523, "learning_rate": 1.824790985096329e-06, "loss": 0.0001, "step": 2500 }, { "epoch": 2.7365589567230755, "grad_norm": 0.00015347945736721158, "learning_rate": 1.752090149036714e-06, "loss": 0.0001, "step": 2510 }, { "epoch": 2.7474547992781506, "grad_norm": 0.0024864268489181995, "learning_rate": 1.6793893129770995e-06, "loss": 0.0002, "step": 2520 }, { "epoch": 2.7583506418332258, "grad_norm": 0.0018065335461869836, "learning_rate": 1.6066884769174848e-06, "loss": 0.0, "step": 2530 }, { "epoch": 2.7692464843883005, "grad_norm": 0.000252872530836612, "learning_rate": 1.53398764085787e-06, "loss": 0.0002, "step": 2540 }, { "epoch": 2.7801423269433756, "grad_norm": 0.0006220173672772944, "learning_rate": 1.4612868047982554e-06, "loss": 0.0, "step": 2550 }, { "epoch": 2.7910381694984507, "grad_norm": 0.00021657197794411331, "learning_rate": 1.3885859687386405e-06, "loss": 0.002, "step": 2560 }, { "epoch": 2.801934012053526, "grad_norm": 0.062267255038022995, "learning_rate": 1.315885132679026e-06, "loss": 0.0001, "step": 2570 }, { "epoch": 2.812829854608601, "grad_norm": 0.00383751024492085, "learning_rate": 1.2431842966194113e-06, "loss": 0.0002, "step": 2580 }, { "epoch": 2.8237256971636757, "grad_norm": 9.788275929167867e-05, "learning_rate": 1.1704834605597967e-06, "loss": 0.0006, "step": 2590 }, { "epoch": 2.834621539718751, "grad_norm": 0.0013275217497721314, "learning_rate": 1.0977826245001818e-06, "loss": 0.0002, "step": 2600 }, { "epoch": 2.845517382273826, "grad_norm": 0.0015028759371489286, "learning_rate": 1.0250817884405672e-06, "loss": 0.0, "step": 2610 }, { "epoch": 2.856413224828901, "grad_norm": 0.00014119225670583546, "learning_rate": 9.523809523809525e-07, "loss": 0.0, "step": 2620 }, { "epoch": 2.8673090673839763, "grad_norm": 0.007295021787285805, "learning_rate": 8.796801163213378e-07, "loss": 0.0, "step": 2630 }, { "epoch": 2.8782049099390514, "grad_norm": 2.5996017939178273e-05, "learning_rate": 8.069792802617231e-07, "loss": 0.0001, "step": 2640 }, { "epoch": 2.8891007524941266, "grad_norm": 0.00027592500555329025, "learning_rate": 7.342784442021084e-07, "loss": 0.0001, "step": 2650 }, { "epoch": 2.8999965950492017, "grad_norm": 0.0033551298547536135, "learning_rate": 6.615776081424936e-07, "loss": 0.0, "step": 2660 }, { "epoch": 2.910892437604277, "grad_norm": 0.0005961539573036134, "learning_rate": 5.88876772082879e-07, "loss": 0.0, "step": 2670 }, { "epoch": 2.9217882801593515, "grad_norm": 0.0015423846198245883, "learning_rate": 5.161759360232643e-07, "loss": 0.0003, "step": 2680 }, { "epoch": 2.9326841227144267, "grad_norm": 0.000448063132353127, "learning_rate": 4.434750999636496e-07, "loss": 0.0031, "step": 2690 }, { "epoch": 2.943579965269502, "grad_norm": 0.003001452423632145, "learning_rate": 3.7077426390403497e-07, "loss": 0.0, "step": 2700 }, { "epoch": 2.954475807824577, "grad_norm": 4.6965491492301226e-05, "learning_rate": 2.9807342784442023e-07, "loss": 0.0001, "step": 2710 }, { "epoch": 2.965371650379652, "grad_norm": 0.00013006600784137845, "learning_rate": 2.2537259178480555e-07, "loss": 0.001, "step": 2720 }, { "epoch": 2.9762674929347273, "grad_norm": 0.006912072654813528, "learning_rate": 1.5267175572519085e-07, "loss": 0.0, "step": 2730 }, { "epoch": 2.987163335489802, "grad_norm": 0.0006019837455824018, "learning_rate": 7.997091966557616e-08, "loss": 0.0, "step": 2740 }, { "epoch": 2.998059178044877, "grad_norm": 0.006343195680528879, "learning_rate": 7.2700836059614684e-09, "loss": 0.0005, "step": 2750 } ], "logging_steps": 10, "max_steps": 2751, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1303954889740124e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }