diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12985 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9996395746981438, + "eval_steps": 500, + "global_step": 1849, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005406379527842854, + "grad_norm": 5.835691452026367, + "learning_rate": 5.405405405405406e-08, + "loss": 0.8753, + "step": 1 + }, + { + "epoch": 0.001081275905568571, + "grad_norm": 5.94018030166626, + "learning_rate": 1.0810810810810812e-07, + "loss": 0.892, + "step": 2 + }, + { + "epoch": 0.0016219138583528565, + "grad_norm": 6.048556327819824, + "learning_rate": 1.6216216216216218e-07, + "loss": 0.8786, + "step": 3 + }, + { + "epoch": 0.002162551811137142, + "grad_norm": 5.985991477966309, + "learning_rate": 2.1621621621621625e-07, + "loss": 0.8707, + "step": 4 + }, + { + "epoch": 0.0027031897639214274, + "grad_norm": 5.788306713104248, + "learning_rate": 2.702702702702703e-07, + "loss": 0.8625, + "step": 5 + }, + { + "epoch": 0.003243827716705713, + "grad_norm": 6.294447422027588, + "learning_rate": 3.2432432432432436e-07, + "loss": 0.9267, + "step": 6 + }, + { + "epoch": 0.003784465669489998, + "grad_norm": 6.075927734375, + "learning_rate": 3.7837837837837843e-07, + "loss": 0.9002, + "step": 7 + }, + { + "epoch": 0.004325103622274284, + "grad_norm": 6.027771472930908, + "learning_rate": 4.324324324324325e-07, + "loss": 0.8858, + "step": 8 + }, + { + "epoch": 0.004865741575058569, + "grad_norm": 5.661100387573242, + "learning_rate": 4.864864864864865e-07, + "loss": 0.8382, + "step": 9 + }, + { + "epoch": 0.005406379527842855, + "grad_norm": 5.359391689300537, + "learning_rate": 5.405405405405406e-07, + "loss": 0.8345, + "step": 10 + }, + { + "epoch": 0.00594701748062714, + "grad_norm": 5.4135613441467285, + "learning_rate": 5.945945945945947e-07, + "loss": 0.8613, + "step": 11 + }, + { + "epoch": 0.006487655433411426, + "grad_norm": 5.188356876373291, + "learning_rate": 6.486486486486487e-07, + "loss": 0.8316, + "step": 12 + }, + { + "epoch": 0.0070282933861957105, + "grad_norm": 5.179812908172607, + "learning_rate": 7.027027027027028e-07, + "loss": 0.8574, + "step": 13 + }, + { + "epoch": 0.007568931338979996, + "grad_norm": 4.420322418212891, + "learning_rate": 7.567567567567569e-07, + "loss": 0.8381, + "step": 14 + }, + { + "epoch": 0.008109569291764282, + "grad_norm": 4.343968868255615, + "learning_rate": 8.108108108108109e-07, + "loss": 0.8244, + "step": 15 + }, + { + "epoch": 0.008650207244548567, + "grad_norm": 4.318638801574707, + "learning_rate": 8.64864864864865e-07, + "loss": 0.8635, + "step": 16 + }, + { + "epoch": 0.009190845197332853, + "grad_norm": 4.086209774017334, + "learning_rate": 9.189189189189191e-07, + "loss": 0.8179, + "step": 17 + }, + { + "epoch": 0.009731483150117138, + "grad_norm": 4.0021467208862305, + "learning_rate": 9.72972972972973e-07, + "loss": 0.7978, + "step": 18 + }, + { + "epoch": 0.010272121102901424, + "grad_norm": 2.3173251152038574, + "learning_rate": 1.027027027027027e-06, + "loss": 0.7756, + "step": 19 + }, + { + "epoch": 0.01081275905568571, + "grad_norm": 2.2132866382598877, + "learning_rate": 1.0810810810810812e-06, + "loss": 0.761, + "step": 20 + }, + { + "epoch": 0.011353397008469995, + "grad_norm": 2.2799596786499023, + "learning_rate": 1.1351351351351352e-06, + "loss": 0.7687, + "step": 21 + }, + { + "epoch": 0.01189403496125428, + "grad_norm": 2.0921261310577393, + "learning_rate": 1.1891891891891893e-06, + "loss": 0.7825, + "step": 22 + }, + { + "epoch": 0.012434672914038566, + "grad_norm": 1.862625002861023, + "learning_rate": 1.2432432432432434e-06, + "loss": 0.762, + "step": 23 + }, + { + "epoch": 0.012975310866822852, + "grad_norm": 1.887032389640808, + "learning_rate": 1.2972972972972974e-06, + "loss": 0.7788, + "step": 24 + }, + { + "epoch": 0.013515948819607137, + "grad_norm": 1.483746886253357, + "learning_rate": 1.3513513513513515e-06, + "loss": 0.7347, + "step": 25 + }, + { + "epoch": 0.014056586772391421, + "grad_norm": 2.0825283527374268, + "learning_rate": 1.4054054054054056e-06, + "loss": 0.733, + "step": 26 + }, + { + "epoch": 0.014597224725175707, + "grad_norm": 2.5244522094726562, + "learning_rate": 1.4594594594594596e-06, + "loss": 0.738, + "step": 27 + }, + { + "epoch": 0.015137862677959992, + "grad_norm": 2.7919321060180664, + "learning_rate": 1.5135135135135137e-06, + "loss": 0.7233, + "step": 28 + }, + { + "epoch": 0.015678500630744278, + "grad_norm": 2.6368916034698486, + "learning_rate": 1.5675675675675678e-06, + "loss": 0.7231, + "step": 29 + }, + { + "epoch": 0.016219138583528563, + "grad_norm": 2.4845619201660156, + "learning_rate": 1.6216216216216219e-06, + "loss": 0.7093, + "step": 30 + }, + { + "epoch": 0.01675977653631285, + "grad_norm": 2.3388469219207764, + "learning_rate": 1.675675675675676e-06, + "loss": 0.7384, + "step": 31 + }, + { + "epoch": 0.017300414489097134, + "grad_norm": 2.0697882175445557, + "learning_rate": 1.72972972972973e-06, + "loss": 0.724, + "step": 32 + }, + { + "epoch": 0.01784105244188142, + "grad_norm": 1.7929456233978271, + "learning_rate": 1.783783783783784e-06, + "loss": 0.7247, + "step": 33 + }, + { + "epoch": 0.018381690394665706, + "grad_norm": 1.1819241046905518, + "learning_rate": 1.8378378378378381e-06, + "loss": 0.6814, + "step": 34 + }, + { + "epoch": 0.01892232834744999, + "grad_norm": 0.9916073679924011, + "learning_rate": 1.8918918918918922e-06, + "loss": 0.7039, + "step": 35 + }, + { + "epoch": 0.019462966300234277, + "grad_norm": 1.0272109508514404, + "learning_rate": 1.945945945945946e-06, + "loss": 0.6968, + "step": 36 + }, + { + "epoch": 0.020003604253018562, + "grad_norm": 1.2608418464660645, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7107, + "step": 37 + }, + { + "epoch": 0.020544242205802848, + "grad_norm": 1.2174832820892334, + "learning_rate": 2.054054054054054e-06, + "loss": 0.6771, + "step": 38 + }, + { + "epoch": 0.021084880158587133, + "grad_norm": 1.0341556072235107, + "learning_rate": 2.1081081081081085e-06, + "loss": 0.6796, + "step": 39 + }, + { + "epoch": 0.02162551811137142, + "grad_norm": 0.9752543568611145, + "learning_rate": 2.1621621621621623e-06, + "loss": 0.6607, + "step": 40 + }, + { + "epoch": 0.022166156064155704, + "grad_norm": 0.8276258707046509, + "learning_rate": 2.2162162162162166e-06, + "loss": 0.6314, + "step": 41 + }, + { + "epoch": 0.02270679401693999, + "grad_norm": 0.9401139616966248, + "learning_rate": 2.2702702702702705e-06, + "loss": 0.6805, + "step": 42 + }, + { + "epoch": 0.023247431969724276, + "grad_norm": 0.7445069551467896, + "learning_rate": 2.3243243243243247e-06, + "loss": 0.6743, + "step": 43 + }, + { + "epoch": 0.02378806992250856, + "grad_norm": 0.6744734644889832, + "learning_rate": 2.3783783783783786e-06, + "loss": 0.6249, + "step": 44 + }, + { + "epoch": 0.024328707875292847, + "grad_norm": 0.8370453715324402, + "learning_rate": 2.432432432432433e-06, + "loss": 0.6669, + "step": 45 + }, + { + "epoch": 0.024869345828077132, + "grad_norm": 0.7774664759635925, + "learning_rate": 2.4864864864864867e-06, + "loss": 0.6698, + "step": 46 + }, + { + "epoch": 0.025409983780861418, + "grad_norm": 0.7434698343276978, + "learning_rate": 2.540540540540541e-06, + "loss": 0.6347, + "step": 47 + }, + { + "epoch": 0.025950621733645703, + "grad_norm": 0.7597132325172424, + "learning_rate": 2.594594594594595e-06, + "loss": 0.6217, + "step": 48 + }, + { + "epoch": 0.02649125968642999, + "grad_norm": 0.7307024598121643, + "learning_rate": 2.648648648648649e-06, + "loss": 0.6406, + "step": 49 + }, + { + "epoch": 0.027031897639214274, + "grad_norm": 0.6316570043563843, + "learning_rate": 2.702702702702703e-06, + "loss": 0.5985, + "step": 50 + }, + { + "epoch": 0.02757253559199856, + "grad_norm": 0.58868408203125, + "learning_rate": 2.7567567567567573e-06, + "loss": 0.6157, + "step": 51 + }, + { + "epoch": 0.028113173544782842, + "grad_norm": 0.6674655675888062, + "learning_rate": 2.810810810810811e-06, + "loss": 0.6382, + "step": 52 + }, + { + "epoch": 0.028653811497567128, + "grad_norm": 0.6769363880157471, + "learning_rate": 2.8648648648648654e-06, + "loss": 0.6284, + "step": 53 + }, + { + "epoch": 0.029194449450351413, + "grad_norm": 0.5872988104820251, + "learning_rate": 2.9189189189189193e-06, + "loss": 0.6095, + "step": 54 + }, + { + "epoch": 0.0297350874031357, + "grad_norm": 0.5742107033729553, + "learning_rate": 2.9729729729729736e-06, + "loss": 0.6609, + "step": 55 + }, + { + "epoch": 0.030275725355919984, + "grad_norm": 0.5116291046142578, + "learning_rate": 3.0270270270270274e-06, + "loss": 0.5959, + "step": 56 + }, + { + "epoch": 0.03081636330870427, + "grad_norm": 0.4740088880062103, + "learning_rate": 3.0810810810810817e-06, + "loss": 0.5925, + "step": 57 + }, + { + "epoch": 0.031357001261488555, + "grad_norm": 0.536938488483429, + "learning_rate": 3.1351351351351356e-06, + "loss": 0.6281, + "step": 58 + }, + { + "epoch": 0.03189763921427284, + "grad_norm": 0.5394325852394104, + "learning_rate": 3.1891891891891894e-06, + "loss": 0.604, + "step": 59 + }, + { + "epoch": 0.03243827716705713, + "grad_norm": 0.5232502222061157, + "learning_rate": 3.2432432432432437e-06, + "loss": 0.5988, + "step": 60 + }, + { + "epoch": 0.03297891511984141, + "grad_norm": 0.5132797360420227, + "learning_rate": 3.2972972972972976e-06, + "loss": 0.6177, + "step": 61 + }, + { + "epoch": 0.0335195530726257, + "grad_norm": 0.46831756830215454, + "learning_rate": 3.351351351351352e-06, + "loss": 0.6274, + "step": 62 + }, + { + "epoch": 0.03406019102540998, + "grad_norm": 0.4727836549282074, + "learning_rate": 3.4054054054054057e-06, + "loss": 0.6218, + "step": 63 + }, + { + "epoch": 0.03460082897819427, + "grad_norm": 0.4939032196998596, + "learning_rate": 3.45945945945946e-06, + "loss": 0.5846, + "step": 64 + }, + { + "epoch": 0.035141466930978554, + "grad_norm": 0.46765533089637756, + "learning_rate": 3.513513513513514e-06, + "loss": 0.6012, + "step": 65 + }, + { + "epoch": 0.03568210488376284, + "grad_norm": 0.41743600368499756, + "learning_rate": 3.567567567567568e-06, + "loss": 0.6112, + "step": 66 + }, + { + "epoch": 0.036222742836547125, + "grad_norm": 0.46013137698173523, + "learning_rate": 3.621621621621622e-06, + "loss": 0.5885, + "step": 67 + }, + { + "epoch": 0.03676338078933141, + "grad_norm": 0.4405370354652405, + "learning_rate": 3.6756756756756763e-06, + "loss": 0.6377, + "step": 68 + }, + { + "epoch": 0.0373040187421157, + "grad_norm": 0.41007307171821594, + "learning_rate": 3.72972972972973e-06, + "loss": 0.6025, + "step": 69 + }, + { + "epoch": 0.03784465669489998, + "grad_norm": 0.3900786340236664, + "learning_rate": 3.7837837837837844e-06, + "loss": 0.5646, + "step": 70 + }, + { + "epoch": 0.03838529464768427, + "grad_norm": 0.45452040433883667, + "learning_rate": 3.837837837837838e-06, + "loss": 0.5868, + "step": 71 + }, + { + "epoch": 0.03892593260046855, + "grad_norm": 0.4392114579677582, + "learning_rate": 3.891891891891892e-06, + "loss": 0.5681, + "step": 72 + }, + { + "epoch": 0.03946657055325284, + "grad_norm": 0.4254467189311981, + "learning_rate": 3.945945945945947e-06, + "loss": 0.6258, + "step": 73 + }, + { + "epoch": 0.040007208506037124, + "grad_norm": 0.44197121262550354, + "learning_rate": 4.000000000000001e-06, + "loss": 0.6088, + "step": 74 + }, + { + "epoch": 0.04054784645882141, + "grad_norm": 0.4894886910915375, + "learning_rate": 4.0540540540540545e-06, + "loss": 0.596, + "step": 75 + }, + { + "epoch": 0.041088484411605695, + "grad_norm": 0.4831041395664215, + "learning_rate": 4.108108108108108e-06, + "loss": 0.5854, + "step": 76 + }, + { + "epoch": 0.04162912236438998, + "grad_norm": 0.4371893107891083, + "learning_rate": 4.162162162162163e-06, + "loss": 0.5775, + "step": 77 + }, + { + "epoch": 0.04216976031717427, + "grad_norm": 0.4326671361923218, + "learning_rate": 4.216216216216217e-06, + "loss": 0.5871, + "step": 78 + }, + { + "epoch": 0.04271039826995855, + "grad_norm": 0.4272334575653076, + "learning_rate": 4.270270270270271e-06, + "loss": 0.5637, + "step": 79 + }, + { + "epoch": 0.04325103622274284, + "grad_norm": 0.4943785071372986, + "learning_rate": 4.324324324324325e-06, + "loss": 0.5609, + "step": 80 + }, + { + "epoch": 0.04379167417552712, + "grad_norm": 0.3759538531303406, + "learning_rate": 4.378378378378379e-06, + "loss": 0.5881, + "step": 81 + }, + { + "epoch": 0.04433231212831141, + "grad_norm": 0.4269590377807617, + "learning_rate": 4.432432432432433e-06, + "loss": 0.5822, + "step": 82 + }, + { + "epoch": 0.044872950081095694, + "grad_norm": 0.4101639986038208, + "learning_rate": 4.486486486486487e-06, + "loss": 0.5925, + "step": 83 + }, + { + "epoch": 0.04541358803387998, + "grad_norm": 0.40165165066719055, + "learning_rate": 4.540540540540541e-06, + "loss": 0.5304, + "step": 84 + }, + { + "epoch": 0.045954225986664266, + "grad_norm": 0.3949192762374878, + "learning_rate": 4.594594594594596e-06, + "loss": 0.5503, + "step": 85 + }, + { + "epoch": 0.04649486393944855, + "grad_norm": 0.4039553999900818, + "learning_rate": 4.6486486486486495e-06, + "loss": 0.5634, + "step": 86 + }, + { + "epoch": 0.04703550189223284, + "grad_norm": 0.4268937110900879, + "learning_rate": 4.702702702702703e-06, + "loss": 0.5854, + "step": 87 + }, + { + "epoch": 0.04757613984501712, + "grad_norm": 0.420392245054245, + "learning_rate": 4.756756756756757e-06, + "loss": 0.5554, + "step": 88 + }, + { + "epoch": 0.04811677779780141, + "grad_norm": 0.38791415095329285, + "learning_rate": 4.810810810810811e-06, + "loss": 0.5662, + "step": 89 + }, + { + "epoch": 0.04865741575058569, + "grad_norm": 0.38142862915992737, + "learning_rate": 4.864864864864866e-06, + "loss": 0.5621, + "step": 90 + }, + { + "epoch": 0.04919805370336998, + "grad_norm": 0.4095216691493988, + "learning_rate": 4.91891891891892e-06, + "loss": 0.5567, + "step": 91 + }, + { + "epoch": 0.049738691656154264, + "grad_norm": 0.40666601061820984, + "learning_rate": 4.9729729729729735e-06, + "loss": 0.6048, + "step": 92 + }, + { + "epoch": 0.05027932960893855, + "grad_norm": 0.37900641560554504, + "learning_rate": 5.027027027027027e-06, + "loss": 0.57, + "step": 93 + }, + { + "epoch": 0.050819967561722836, + "grad_norm": 0.4188796579837799, + "learning_rate": 5.081081081081082e-06, + "loss": 0.5547, + "step": 94 + }, + { + "epoch": 0.05136060551450712, + "grad_norm": 0.3917129933834076, + "learning_rate": 5.135135135135135e-06, + "loss": 0.5994, + "step": 95 + }, + { + "epoch": 0.05190124346729141, + "grad_norm": 0.3745051920413971, + "learning_rate": 5.18918918918919e-06, + "loss": 0.5645, + "step": 96 + }, + { + "epoch": 0.05244188142007569, + "grad_norm": 0.4055512249469757, + "learning_rate": 5.243243243243244e-06, + "loss": 0.5516, + "step": 97 + }, + { + "epoch": 0.05298251937285998, + "grad_norm": 0.40357547998428345, + "learning_rate": 5.297297297297298e-06, + "loss": 0.5414, + "step": 98 + }, + { + "epoch": 0.05352315732564426, + "grad_norm": 0.37698817253112793, + "learning_rate": 5.351351351351351e-06, + "loss": 0.5494, + "step": 99 + }, + { + "epoch": 0.05406379527842855, + "grad_norm": 0.45397427678108215, + "learning_rate": 5.405405405405406e-06, + "loss": 0.5574, + "step": 100 + }, + { + "epoch": 0.054604433231212834, + "grad_norm": 0.4567568600177765, + "learning_rate": 5.45945945945946e-06, + "loss": 0.578, + "step": 101 + }, + { + "epoch": 0.05514507118399712, + "grad_norm": 0.4371408224105835, + "learning_rate": 5.513513513513515e-06, + "loss": 0.5614, + "step": 102 + }, + { + "epoch": 0.0556857091367814, + "grad_norm": 0.4275542199611664, + "learning_rate": 5.567567567567568e-06, + "loss": 0.5884, + "step": 103 + }, + { + "epoch": 0.056226347089565684, + "grad_norm": 0.4597940742969513, + "learning_rate": 5.621621621621622e-06, + "loss": 0.5431, + "step": 104 + }, + { + "epoch": 0.05676698504234997, + "grad_norm": 0.47299715876579285, + "learning_rate": 5.675675675675676e-06, + "loss": 0.5648, + "step": 105 + }, + { + "epoch": 0.057307622995134255, + "grad_norm": 0.435273140668869, + "learning_rate": 5.729729729729731e-06, + "loss": 0.5281, + "step": 106 + }, + { + "epoch": 0.05784826094791854, + "grad_norm": 0.5185096859931946, + "learning_rate": 5.783783783783784e-06, + "loss": 0.5975, + "step": 107 + }, + { + "epoch": 0.058388898900702826, + "grad_norm": 0.4125944674015045, + "learning_rate": 5.837837837837839e-06, + "loss": 0.5627, + "step": 108 + }, + { + "epoch": 0.05892953685348711, + "grad_norm": 0.4223628640174866, + "learning_rate": 5.8918918918918924e-06, + "loss": 0.5734, + "step": 109 + }, + { + "epoch": 0.0594701748062714, + "grad_norm": 0.4168817698955536, + "learning_rate": 5.945945945945947e-06, + "loss": 0.5575, + "step": 110 + }, + { + "epoch": 0.06001081275905568, + "grad_norm": 0.4130726754665375, + "learning_rate": 6e-06, + "loss": 0.5319, + "step": 111 + }, + { + "epoch": 0.06055145071183997, + "grad_norm": 0.4145527482032776, + "learning_rate": 6.054054054054055e-06, + "loss": 0.5544, + "step": 112 + }, + { + "epoch": 0.061092088664624254, + "grad_norm": 0.47130388021469116, + "learning_rate": 6.108108108108109e-06, + "loss": 0.5454, + "step": 113 + }, + { + "epoch": 0.06163272661740854, + "grad_norm": 0.4709829092025757, + "learning_rate": 6.162162162162163e-06, + "loss": 0.5477, + "step": 114 + }, + { + "epoch": 0.062173364570192825, + "grad_norm": 0.4437786340713501, + "learning_rate": 6.2162162162162164e-06, + "loss": 0.5337, + "step": 115 + }, + { + "epoch": 0.06271400252297711, + "grad_norm": 0.44709983468055725, + "learning_rate": 6.270270270270271e-06, + "loss": 0.5584, + "step": 116 + }, + { + "epoch": 0.0632546404757614, + "grad_norm": 0.438896119594574, + "learning_rate": 6.324324324324325e-06, + "loss": 0.5374, + "step": 117 + }, + { + "epoch": 0.06379527842854568, + "grad_norm": 0.4727858901023865, + "learning_rate": 6.378378378378379e-06, + "loss": 0.5907, + "step": 118 + }, + { + "epoch": 0.06433591638132997, + "grad_norm": 0.5302643179893494, + "learning_rate": 6.432432432432433e-06, + "loss": 0.5583, + "step": 119 + }, + { + "epoch": 0.06487655433411425, + "grad_norm": 0.49883052706718445, + "learning_rate": 6.486486486486487e-06, + "loss": 0.5728, + "step": 120 + }, + { + "epoch": 0.06541719228689855, + "grad_norm": 0.45870596170425415, + "learning_rate": 6.540540540540541e-06, + "loss": 0.5372, + "step": 121 + }, + { + "epoch": 0.06595783023968282, + "grad_norm": 0.5096645951271057, + "learning_rate": 6.594594594594595e-06, + "loss": 0.5361, + "step": 122 + }, + { + "epoch": 0.06649846819246712, + "grad_norm": 0.5785489678382874, + "learning_rate": 6.648648648648649e-06, + "loss": 0.5449, + "step": 123 + }, + { + "epoch": 0.0670391061452514, + "grad_norm": 0.48732200264930725, + "learning_rate": 6.702702702702704e-06, + "loss": 0.5647, + "step": 124 + }, + { + "epoch": 0.06757974409803569, + "grad_norm": 0.5112847089767456, + "learning_rate": 6.7567567567567575e-06, + "loss": 0.5576, + "step": 125 + }, + { + "epoch": 0.06812038205081997, + "grad_norm": 0.5451067686080933, + "learning_rate": 6.810810810810811e-06, + "loss": 0.5321, + "step": 126 + }, + { + "epoch": 0.06866102000360426, + "grad_norm": 0.4540032744407654, + "learning_rate": 6.864864864864865e-06, + "loss": 0.5252, + "step": 127 + }, + { + "epoch": 0.06920165795638854, + "grad_norm": 0.48304489254951477, + "learning_rate": 6.91891891891892e-06, + "loss": 0.5905, + "step": 128 + }, + { + "epoch": 0.06974229590917283, + "grad_norm": 0.44558167457580566, + "learning_rate": 6.972972972972973e-06, + "loss": 0.569, + "step": 129 + }, + { + "epoch": 0.07028293386195711, + "grad_norm": 0.4131074547767639, + "learning_rate": 7.027027027027028e-06, + "loss": 0.5453, + "step": 130 + }, + { + "epoch": 0.0708235718147414, + "grad_norm": 0.3999936878681183, + "learning_rate": 7.0810810810810815e-06, + "loss": 0.5302, + "step": 131 + }, + { + "epoch": 0.07136420976752568, + "grad_norm": 0.43754464387893677, + "learning_rate": 7.135135135135136e-06, + "loss": 0.5523, + "step": 132 + }, + { + "epoch": 0.07190484772030997, + "grad_norm": 0.4095590114593506, + "learning_rate": 7.189189189189189e-06, + "loss": 0.5603, + "step": 133 + }, + { + "epoch": 0.07244548567309425, + "grad_norm": 0.4144512712955475, + "learning_rate": 7.243243243243244e-06, + "loss": 0.5747, + "step": 134 + }, + { + "epoch": 0.07298612362587854, + "grad_norm": 0.41802504658699036, + "learning_rate": 7.297297297297298e-06, + "loss": 0.5991, + "step": 135 + }, + { + "epoch": 0.07352676157866282, + "grad_norm": 0.42772114276885986, + "learning_rate": 7.3513513513513525e-06, + "loss": 0.5364, + "step": 136 + }, + { + "epoch": 0.07406739953144711, + "grad_norm": 0.4586148262023926, + "learning_rate": 7.4054054054054055e-06, + "loss": 0.5732, + "step": 137 + }, + { + "epoch": 0.0746080374842314, + "grad_norm": 0.4603786766529083, + "learning_rate": 7.45945945945946e-06, + "loss": 0.5481, + "step": 138 + }, + { + "epoch": 0.07514867543701567, + "grad_norm": 0.4603879153728485, + "learning_rate": 7.513513513513514e-06, + "loss": 0.5536, + "step": 139 + }, + { + "epoch": 0.07568931338979996, + "grad_norm": 0.46576988697052, + "learning_rate": 7.567567567567569e-06, + "loss": 0.5667, + "step": 140 + }, + { + "epoch": 0.07622995134258424, + "grad_norm": 0.4343358874320984, + "learning_rate": 7.621621621621622e-06, + "loss": 0.5316, + "step": 141 + }, + { + "epoch": 0.07677058929536854, + "grad_norm": 0.5072187185287476, + "learning_rate": 7.675675675675676e-06, + "loss": 0.5577, + "step": 142 + }, + { + "epoch": 0.07731122724815281, + "grad_norm": 0.3946337401866913, + "learning_rate": 7.72972972972973e-06, + "loss": 0.5349, + "step": 143 + }, + { + "epoch": 0.0778518652009371, + "grad_norm": 0.4240807294845581, + "learning_rate": 7.783783783783784e-06, + "loss": 0.5337, + "step": 144 + }, + { + "epoch": 0.07839250315372139, + "grad_norm": 0.4701116681098938, + "learning_rate": 7.837837837837838e-06, + "loss": 0.5159, + "step": 145 + }, + { + "epoch": 0.07893314110650568, + "grad_norm": 0.5360553860664368, + "learning_rate": 7.891891891891894e-06, + "loss": 0.5657, + "step": 146 + }, + { + "epoch": 0.07947377905928996, + "grad_norm": 0.46461114287376404, + "learning_rate": 7.945945945945946e-06, + "loss": 0.5631, + "step": 147 + }, + { + "epoch": 0.08001441701207425, + "grad_norm": 0.5180739760398865, + "learning_rate": 8.000000000000001e-06, + "loss": 0.5461, + "step": 148 + }, + { + "epoch": 0.08055505496485853, + "grad_norm": 0.4858033359050751, + "learning_rate": 8.054054054054055e-06, + "loss": 0.5493, + "step": 149 + }, + { + "epoch": 0.08109569291764282, + "grad_norm": 0.4371548593044281, + "learning_rate": 8.108108108108109e-06, + "loss": 0.5284, + "step": 150 + }, + { + "epoch": 0.0816363308704271, + "grad_norm": 0.46319711208343506, + "learning_rate": 8.162162162162163e-06, + "loss": 0.5401, + "step": 151 + }, + { + "epoch": 0.08217696882321139, + "grad_norm": 0.563747763633728, + "learning_rate": 8.216216216216217e-06, + "loss": 0.5532, + "step": 152 + }, + { + "epoch": 0.08271760677599567, + "grad_norm": 0.45033544301986694, + "learning_rate": 8.27027027027027e-06, + "loss": 0.5396, + "step": 153 + }, + { + "epoch": 0.08325824472877996, + "grad_norm": 0.5136594772338867, + "learning_rate": 8.324324324324326e-06, + "loss": 0.5382, + "step": 154 + }, + { + "epoch": 0.08379888268156424, + "grad_norm": 0.5209782123565674, + "learning_rate": 8.378378378378378e-06, + "loss": 0.5493, + "step": 155 + }, + { + "epoch": 0.08433952063434853, + "grad_norm": 0.5323936343193054, + "learning_rate": 8.432432432432434e-06, + "loss": 0.539, + "step": 156 + }, + { + "epoch": 0.08488015858713281, + "grad_norm": 0.4908245801925659, + "learning_rate": 8.486486486486488e-06, + "loss": 0.5233, + "step": 157 + }, + { + "epoch": 0.0854207965399171, + "grad_norm": 0.5659683346748352, + "learning_rate": 8.540540540540542e-06, + "loss": 0.5293, + "step": 158 + }, + { + "epoch": 0.08596143449270138, + "grad_norm": 0.5987280011177063, + "learning_rate": 8.594594594594595e-06, + "loss": 0.5394, + "step": 159 + }, + { + "epoch": 0.08650207244548568, + "grad_norm": 0.48582640290260315, + "learning_rate": 8.64864864864865e-06, + "loss": 0.5356, + "step": 160 + }, + { + "epoch": 0.08704271039826995, + "grad_norm": 0.5271715521812439, + "learning_rate": 8.702702702702703e-06, + "loss": 0.493, + "step": 161 + }, + { + "epoch": 0.08758334835105425, + "grad_norm": 0.668293833732605, + "learning_rate": 8.756756756756759e-06, + "loss": 0.4996, + "step": 162 + }, + { + "epoch": 0.08812398630383853, + "grad_norm": 0.4799709916114807, + "learning_rate": 8.810810810810811e-06, + "loss": 0.5161, + "step": 163 + }, + { + "epoch": 0.08866462425662282, + "grad_norm": 0.43464991450309753, + "learning_rate": 8.864864864864866e-06, + "loss": 0.5431, + "step": 164 + }, + { + "epoch": 0.0892052622094071, + "grad_norm": 0.5938690900802612, + "learning_rate": 8.91891891891892e-06, + "loss": 0.5677, + "step": 165 + }, + { + "epoch": 0.08974590016219139, + "grad_norm": 0.575090765953064, + "learning_rate": 8.972972972972974e-06, + "loss": 0.5506, + "step": 166 + }, + { + "epoch": 0.09028653811497567, + "grad_norm": 0.44394662976264954, + "learning_rate": 9.027027027027028e-06, + "loss": 0.5415, + "step": 167 + }, + { + "epoch": 0.09082717606775996, + "grad_norm": 0.45902568101882935, + "learning_rate": 9.081081081081082e-06, + "loss": 0.501, + "step": 168 + }, + { + "epoch": 0.09136781402054424, + "grad_norm": 0.6057916283607483, + "learning_rate": 9.135135135135136e-06, + "loss": 0.5383, + "step": 169 + }, + { + "epoch": 0.09190845197332853, + "grad_norm": 0.4782038927078247, + "learning_rate": 9.189189189189191e-06, + "loss": 0.5522, + "step": 170 + }, + { + "epoch": 0.09244908992611281, + "grad_norm": 0.5645440816879272, + "learning_rate": 9.243243243243243e-06, + "loss": 0.5395, + "step": 171 + }, + { + "epoch": 0.0929897278788971, + "grad_norm": 0.5265817046165466, + "learning_rate": 9.297297297297299e-06, + "loss": 0.5352, + "step": 172 + }, + { + "epoch": 0.09353036583168138, + "grad_norm": 0.504550039768219, + "learning_rate": 9.351351351351353e-06, + "loss": 0.5538, + "step": 173 + }, + { + "epoch": 0.09407100378446567, + "grad_norm": 0.5427402257919312, + "learning_rate": 9.405405405405407e-06, + "loss": 0.5307, + "step": 174 + }, + { + "epoch": 0.09461164173724995, + "grad_norm": 0.5136035084724426, + "learning_rate": 9.45945945945946e-06, + "loss": 0.5412, + "step": 175 + }, + { + "epoch": 0.09515227969003424, + "grad_norm": 0.5547382831573486, + "learning_rate": 9.513513513513514e-06, + "loss": 0.5452, + "step": 176 + }, + { + "epoch": 0.09569291764281852, + "grad_norm": 0.4894575774669647, + "learning_rate": 9.567567567567568e-06, + "loss": 0.5645, + "step": 177 + }, + { + "epoch": 0.09623355559560282, + "grad_norm": 0.521987795829773, + "learning_rate": 9.621621621621622e-06, + "loss": 0.5371, + "step": 178 + }, + { + "epoch": 0.0967741935483871, + "grad_norm": 0.48432838916778564, + "learning_rate": 9.675675675675676e-06, + "loss": 0.4921, + "step": 179 + }, + { + "epoch": 0.09731483150117139, + "grad_norm": 0.5000583529472351, + "learning_rate": 9.729729729729732e-06, + "loss": 0.5463, + "step": 180 + }, + { + "epoch": 0.09785546945395567, + "grad_norm": 0.5252205729484558, + "learning_rate": 9.783783783783785e-06, + "loss": 0.5571, + "step": 181 + }, + { + "epoch": 0.09839610740673996, + "grad_norm": 0.49048709869384766, + "learning_rate": 9.83783783783784e-06, + "loss": 0.5456, + "step": 182 + }, + { + "epoch": 0.09893674535952424, + "grad_norm": 0.5005491375923157, + "learning_rate": 9.891891891891893e-06, + "loss": 0.5263, + "step": 183 + }, + { + "epoch": 0.09947738331230853, + "grad_norm": 0.47705668210983276, + "learning_rate": 9.945945945945947e-06, + "loss": 0.5358, + "step": 184 + }, + { + "epoch": 0.10001802126509281, + "grad_norm": 0.4576117694377899, + "learning_rate": 1e-05, + "loss": 0.5025, + "step": 185 + }, + { + "epoch": 0.1005586592178771, + "grad_norm": 0.5019325613975525, + "learning_rate": 9.999991088865861e-06, + "loss": 0.5123, + "step": 186 + }, + { + "epoch": 0.10109929717066138, + "grad_norm": 0.47290462255477905, + "learning_rate": 9.999964355495207e-06, + "loss": 0.5141, + "step": 187 + }, + { + "epoch": 0.10163993512344567, + "grad_norm": 0.6163212656974792, + "learning_rate": 9.999919799983327e-06, + "loss": 0.5468, + "step": 188 + }, + { + "epoch": 0.10218057307622995, + "grad_norm": 0.5143344402313232, + "learning_rate": 9.99985742248904e-06, + "loss": 0.5433, + "step": 189 + }, + { + "epoch": 0.10272121102901424, + "grad_norm": 0.5387307405471802, + "learning_rate": 9.999777223234682e-06, + "loss": 0.5117, + "step": 190 + }, + { + "epoch": 0.10326184898179852, + "grad_norm": 0.6247243881225586, + "learning_rate": 9.999679202506126e-06, + "loss": 0.5298, + "step": 191 + }, + { + "epoch": 0.10380248693458281, + "grad_norm": 0.4776528477668762, + "learning_rate": 9.999563360652757e-06, + "loss": 0.5227, + "step": 192 + }, + { + "epoch": 0.10434312488736709, + "grad_norm": 0.5106874108314514, + "learning_rate": 9.999429698087491e-06, + "loss": 0.5197, + "step": 193 + }, + { + "epoch": 0.10488376284015138, + "grad_norm": 0.5041946768760681, + "learning_rate": 9.99927821528676e-06, + "loss": 0.5162, + "step": 194 + }, + { + "epoch": 0.10542440079293566, + "grad_norm": 0.5362434387207031, + "learning_rate": 9.999108912790521e-06, + "loss": 0.5103, + "step": 195 + }, + { + "epoch": 0.10596503874571996, + "grad_norm": 0.4504891335964203, + "learning_rate": 9.99892179120224e-06, + "loss": 0.5253, + "step": 196 + }, + { + "epoch": 0.10650567669850423, + "grad_norm": 0.5226731896400452, + "learning_rate": 9.99871685118891e-06, + "loss": 0.5392, + "step": 197 + }, + { + "epoch": 0.10704631465128853, + "grad_norm": 0.46790215373039246, + "learning_rate": 9.998494093481022e-06, + "loss": 0.4986, + "step": 198 + }, + { + "epoch": 0.1075869526040728, + "grad_norm": 0.4980153441429138, + "learning_rate": 9.998253518872592e-06, + "loss": 0.5568, + "step": 199 + }, + { + "epoch": 0.1081275905568571, + "grad_norm": 0.5192104578018188, + "learning_rate": 9.997995128221131e-06, + "loss": 0.564, + "step": 200 + }, + { + "epoch": 0.10866822850964138, + "grad_norm": 0.48999977111816406, + "learning_rate": 9.997718922447669e-06, + "loss": 0.5315, + "step": 201 + }, + { + "epoch": 0.10920886646242567, + "grad_norm": 0.5758963823318481, + "learning_rate": 9.99742490253672e-06, + "loss": 0.5428, + "step": 202 + }, + { + "epoch": 0.10974950441520995, + "grad_norm": 0.4924217164516449, + "learning_rate": 9.99711306953631e-06, + "loss": 0.5164, + "step": 203 + }, + { + "epoch": 0.11029014236799424, + "grad_norm": 0.5533744692802429, + "learning_rate": 9.99678342455795e-06, + "loss": 0.5286, + "step": 204 + }, + { + "epoch": 0.11083078032077852, + "grad_norm": 0.6308779120445251, + "learning_rate": 9.996435968776646e-06, + "loss": 0.5488, + "step": 205 + }, + { + "epoch": 0.1113714182735628, + "grad_norm": 0.49895963072776794, + "learning_rate": 9.996070703430888e-06, + "loss": 0.5228, + "step": 206 + }, + { + "epoch": 0.11191205622634709, + "grad_norm": 0.5760313868522644, + "learning_rate": 9.995687629822647e-06, + "loss": 0.5259, + "step": 207 + }, + { + "epoch": 0.11245269417913137, + "grad_norm": 0.5599102973937988, + "learning_rate": 9.99528674931737e-06, + "loss": 0.5191, + "step": 208 + }, + { + "epoch": 0.11299333213191566, + "grad_norm": 0.7119779586791992, + "learning_rate": 9.99486806334398e-06, + "loss": 0.5246, + "step": 209 + }, + { + "epoch": 0.11353397008469994, + "grad_norm": 0.5256053805351257, + "learning_rate": 9.994431573394861e-06, + "loss": 0.52, + "step": 210 + }, + { + "epoch": 0.11407460803748423, + "grad_norm": 0.6746430397033691, + "learning_rate": 9.993977281025862e-06, + "loss": 0.5519, + "step": 211 + }, + { + "epoch": 0.11461524599026851, + "grad_norm": 0.5388162732124329, + "learning_rate": 9.993505187856289e-06, + "loss": 0.5353, + "step": 212 + }, + { + "epoch": 0.1151558839430528, + "grad_norm": 0.6086297631263733, + "learning_rate": 9.993015295568893e-06, + "loss": 0.509, + "step": 213 + }, + { + "epoch": 0.11569652189583708, + "grad_norm": 0.5672162771224976, + "learning_rate": 9.992507605909873e-06, + "loss": 0.5215, + "step": 214 + }, + { + "epoch": 0.11623715984862137, + "grad_norm": 0.6110180616378784, + "learning_rate": 9.991982120688865e-06, + "loss": 0.5482, + "step": 215 + }, + { + "epoch": 0.11677779780140565, + "grad_norm": 0.5936868786811829, + "learning_rate": 9.99143884177894e-06, + "loss": 0.5016, + "step": 216 + }, + { + "epoch": 0.11731843575418995, + "grad_norm": 0.5300704836845398, + "learning_rate": 9.990877771116588e-06, + "loss": 0.5477, + "step": 217 + }, + { + "epoch": 0.11785907370697422, + "grad_norm": 0.5491863489151001, + "learning_rate": 9.99029891070172e-06, + "loss": 0.5311, + "step": 218 + }, + { + "epoch": 0.11839971165975852, + "grad_norm": 0.590416669845581, + "learning_rate": 9.989702262597656e-06, + "loss": 0.5245, + "step": 219 + }, + { + "epoch": 0.1189403496125428, + "grad_norm": 0.4550315737724304, + "learning_rate": 9.989087828931121e-06, + "loss": 0.513, + "step": 220 + }, + { + "epoch": 0.11948098756532709, + "grad_norm": 0.6941486597061157, + "learning_rate": 9.988455611892237e-06, + "loss": 0.5153, + "step": 221 + }, + { + "epoch": 0.12002162551811137, + "grad_norm": 0.5239282846450806, + "learning_rate": 9.987805613734508e-06, + "loss": 0.5329, + "step": 222 + }, + { + "epoch": 0.12056226347089566, + "grad_norm": 0.5700839161872864, + "learning_rate": 9.987137836774827e-06, + "loss": 0.5224, + "step": 223 + }, + { + "epoch": 0.12110290142367994, + "grad_norm": 0.6924121379852295, + "learning_rate": 9.986452283393452e-06, + "loss": 0.5256, + "step": 224 + }, + { + "epoch": 0.12164353937646423, + "grad_norm": 0.5878159999847412, + "learning_rate": 9.985748956034007e-06, + "loss": 0.5569, + "step": 225 + }, + { + "epoch": 0.12218417732924851, + "grad_norm": 0.5620668530464172, + "learning_rate": 9.985027857203469e-06, + "loss": 0.5307, + "step": 226 + }, + { + "epoch": 0.1227248152820328, + "grad_norm": 0.6018951535224915, + "learning_rate": 9.984288989472162e-06, + "loss": 0.517, + "step": 227 + }, + { + "epoch": 0.12326545323481708, + "grad_norm": 0.43050438165664673, + "learning_rate": 9.983532355473744e-06, + "loss": 0.5052, + "step": 228 + }, + { + "epoch": 0.12380609118760137, + "grad_norm": 0.5617673993110657, + "learning_rate": 9.982757957905204e-06, + "loss": 0.5355, + "step": 229 + }, + { + "epoch": 0.12434672914038565, + "grad_norm": 0.5786687135696411, + "learning_rate": 9.981965799526846e-06, + "loss": 0.523, + "step": 230 + }, + { + "epoch": 0.12488736709316994, + "grad_norm": 0.42592427134513855, + "learning_rate": 9.981155883162281e-06, + "loss": 0.5169, + "step": 231 + }, + { + "epoch": 0.12542800504595422, + "grad_norm": 0.4949047267436981, + "learning_rate": 9.980328211698418e-06, + "loss": 0.5368, + "step": 232 + }, + { + "epoch": 0.1259686429987385, + "grad_norm": 0.49621617794036865, + "learning_rate": 9.979482788085455e-06, + "loss": 0.5401, + "step": 233 + }, + { + "epoch": 0.1265092809515228, + "grad_norm": 0.5175781846046448, + "learning_rate": 9.978619615336866e-06, + "loss": 0.5167, + "step": 234 + }, + { + "epoch": 0.12704991890430709, + "grad_norm": 0.5909814834594727, + "learning_rate": 9.977738696529387e-06, + "loss": 0.5132, + "step": 235 + }, + { + "epoch": 0.12759055685709136, + "grad_norm": 0.43574607372283936, + "learning_rate": 9.976840034803014e-06, + "loss": 0.5355, + "step": 236 + }, + { + "epoch": 0.12813119480987564, + "grad_norm": 0.4887847602367401, + "learning_rate": 9.975923633360985e-06, + "loss": 0.5362, + "step": 237 + }, + { + "epoch": 0.12867183276265995, + "grad_norm": 0.5564954280853271, + "learning_rate": 9.974989495469771e-06, + "loss": 0.5436, + "step": 238 + }, + { + "epoch": 0.12921247071544423, + "grad_norm": 0.4811561405658722, + "learning_rate": 9.974037624459063e-06, + "loss": 0.5005, + "step": 239 + }, + { + "epoch": 0.1297531086682285, + "grad_norm": 0.5640829205513, + "learning_rate": 9.973068023721761e-06, + "loss": 0.4908, + "step": 240 + }, + { + "epoch": 0.13029374662101278, + "grad_norm": 0.558860719203949, + "learning_rate": 9.972080696713962e-06, + "loss": 0.5369, + "step": 241 + }, + { + "epoch": 0.1308343845737971, + "grad_norm": 0.5138370394706726, + "learning_rate": 9.971075646954946e-06, + "loss": 0.545, + "step": 242 + }, + { + "epoch": 0.13137502252658137, + "grad_norm": 0.558138906955719, + "learning_rate": 9.970052878027169e-06, + "loss": 0.5029, + "step": 243 + }, + { + "epoch": 0.13191566047936565, + "grad_norm": 0.5365882515907288, + "learning_rate": 9.969012393576241e-06, + "loss": 0.5164, + "step": 244 + }, + { + "epoch": 0.13245629843214993, + "grad_norm": 0.6326988935470581, + "learning_rate": 9.967954197310922e-06, + "loss": 0.5089, + "step": 245 + }, + { + "epoch": 0.13299693638493423, + "grad_norm": 0.5007409453392029, + "learning_rate": 9.966878293003102e-06, + "loss": 0.4909, + "step": 246 + }, + { + "epoch": 0.1335375743377185, + "grad_norm": 0.5736434459686279, + "learning_rate": 9.965784684487794e-06, + "loss": 0.5446, + "step": 247 + }, + { + "epoch": 0.1340782122905028, + "grad_norm": 0.4436694383621216, + "learning_rate": 9.964673375663114e-06, + "loss": 0.5096, + "step": 248 + }, + { + "epoch": 0.13461885024328707, + "grad_norm": 0.4315429627895355, + "learning_rate": 9.96354437049027e-06, + "loss": 0.506, + "step": 249 + }, + { + "epoch": 0.13515948819607138, + "grad_norm": 0.5133793354034424, + "learning_rate": 9.962397672993552e-06, + "loss": 0.5142, + "step": 250 + }, + { + "epoch": 0.13570012614885565, + "grad_norm": 0.4259757697582245, + "learning_rate": 9.961233287260305e-06, + "loss": 0.5126, + "step": 251 + }, + { + "epoch": 0.13624076410163993, + "grad_norm": 0.497167706489563, + "learning_rate": 9.96005121744093e-06, + "loss": 0.5345, + "step": 252 + }, + { + "epoch": 0.1367814020544242, + "grad_norm": 0.5054571628570557, + "learning_rate": 9.958851467748863e-06, + "loss": 0.5147, + "step": 253 + }, + { + "epoch": 0.13732204000720852, + "grad_norm": 0.48127052187919617, + "learning_rate": 9.957634042460551e-06, + "loss": 0.4974, + "step": 254 + }, + { + "epoch": 0.1378626779599928, + "grad_norm": 0.5029913187026978, + "learning_rate": 9.956398945915455e-06, + "loss": 0.5036, + "step": 255 + }, + { + "epoch": 0.13840331591277708, + "grad_norm": 0.4597805142402649, + "learning_rate": 9.955146182516015e-06, + "loss": 0.4872, + "step": 256 + }, + { + "epoch": 0.13894395386556135, + "grad_norm": 0.45765337347984314, + "learning_rate": 9.95387575672765e-06, + "loss": 0.5019, + "step": 257 + }, + { + "epoch": 0.13948459181834566, + "grad_norm": 0.5424937009811401, + "learning_rate": 9.952587673078738e-06, + "loss": 0.5358, + "step": 258 + }, + { + "epoch": 0.14002522977112994, + "grad_norm": 0.6009652018547058, + "learning_rate": 9.951281936160587e-06, + "loss": 0.533, + "step": 259 + }, + { + "epoch": 0.14056586772391422, + "grad_norm": 0.5036184191703796, + "learning_rate": 9.949958550627436e-06, + "loss": 0.4852, + "step": 260 + }, + { + "epoch": 0.1411065056766985, + "grad_norm": 0.6518417000770569, + "learning_rate": 9.948617521196438e-06, + "loss": 0.538, + "step": 261 + }, + { + "epoch": 0.1416471436294828, + "grad_norm": 0.5519372224807739, + "learning_rate": 9.947258852647623e-06, + "loss": 0.5216, + "step": 262 + }, + { + "epoch": 0.14218778158226708, + "grad_norm": 0.6252304911613464, + "learning_rate": 9.945882549823906e-06, + "loss": 0.5188, + "step": 263 + }, + { + "epoch": 0.14272841953505136, + "grad_norm": 0.5159186124801636, + "learning_rate": 9.944488617631053e-06, + "loss": 0.5178, + "step": 264 + }, + { + "epoch": 0.14326905748783564, + "grad_norm": 0.5060036182403564, + "learning_rate": 9.943077061037672e-06, + "loss": 0.5083, + "step": 265 + }, + { + "epoch": 0.14380969544061994, + "grad_norm": 0.579529345035553, + "learning_rate": 9.94164788507519e-06, + "loss": 0.4991, + "step": 266 + }, + { + "epoch": 0.14435033339340422, + "grad_norm": 0.5514207482337952, + "learning_rate": 9.940201094837838e-06, + "loss": 0.5154, + "step": 267 + }, + { + "epoch": 0.1448909713461885, + "grad_norm": 0.5269582867622375, + "learning_rate": 9.938736695482636e-06, + "loss": 0.4937, + "step": 268 + }, + { + "epoch": 0.14543160929897278, + "grad_norm": 0.5171031951904297, + "learning_rate": 9.937254692229363e-06, + "loss": 0.5024, + "step": 269 + }, + { + "epoch": 0.1459722472517571, + "grad_norm": 0.5391225814819336, + "learning_rate": 9.935755090360554e-06, + "loss": 0.4961, + "step": 270 + }, + { + "epoch": 0.14651288520454137, + "grad_norm": 0.47390222549438477, + "learning_rate": 9.93423789522147e-06, + "loss": 0.4855, + "step": 271 + }, + { + "epoch": 0.14705352315732564, + "grad_norm": 0.4980723261833191, + "learning_rate": 9.932703112220084e-06, + "loss": 0.5481, + "step": 272 + }, + { + "epoch": 0.14759416111010992, + "grad_norm": 0.5040086507797241, + "learning_rate": 9.931150746827055e-06, + "loss": 0.52, + "step": 273 + }, + { + "epoch": 0.14813479906289423, + "grad_norm": 0.5215423107147217, + "learning_rate": 9.929580804575718e-06, + "loss": 0.5113, + "step": 274 + }, + { + "epoch": 0.1486754370156785, + "grad_norm": 0.44872352480888367, + "learning_rate": 9.927993291062064e-06, + "loss": 0.4919, + "step": 275 + }, + { + "epoch": 0.1492160749684628, + "grad_norm": 0.5068681240081787, + "learning_rate": 9.926388211944707e-06, + "loss": 0.5243, + "step": 276 + }, + { + "epoch": 0.14975671292124706, + "grad_norm": 0.5543198585510254, + "learning_rate": 9.924765572944879e-06, + "loss": 0.5083, + "step": 277 + }, + { + "epoch": 0.15029735087403134, + "grad_norm": 0.4373819828033447, + "learning_rate": 9.9231253798464e-06, + "loss": 0.5, + "step": 278 + }, + { + "epoch": 0.15083798882681565, + "grad_norm": 0.4874158501625061, + "learning_rate": 9.921467638495666e-06, + "loss": 0.5336, + "step": 279 + }, + { + "epoch": 0.15137862677959993, + "grad_norm": 0.4793112874031067, + "learning_rate": 9.919792354801614e-06, + "loss": 0.5135, + "step": 280 + }, + { + "epoch": 0.1519192647323842, + "grad_norm": 0.4696117341518402, + "learning_rate": 9.91809953473572e-06, + "loss": 0.5018, + "step": 281 + }, + { + "epoch": 0.15245990268516849, + "grad_norm": 0.5002115964889526, + "learning_rate": 9.916389184331957e-06, + "loss": 0.5332, + "step": 282 + }, + { + "epoch": 0.1530005406379528, + "grad_norm": 0.4873870015144348, + "learning_rate": 9.914661309686796e-06, + "loss": 0.5238, + "step": 283 + }, + { + "epoch": 0.15354117859073707, + "grad_norm": 0.5112310647964478, + "learning_rate": 9.912915916959162e-06, + "loss": 0.5145, + "step": 284 + }, + { + "epoch": 0.15408181654352135, + "grad_norm": 0.5636197328567505, + "learning_rate": 9.911153012370427e-06, + "loss": 0.5095, + "step": 285 + }, + { + "epoch": 0.15462245449630563, + "grad_norm": 0.4921044409275055, + "learning_rate": 9.909372602204385e-06, + "loss": 0.5083, + "step": 286 + }, + { + "epoch": 0.15516309244908993, + "grad_norm": 0.5235900282859802, + "learning_rate": 9.907574692807223e-06, + "loss": 0.5126, + "step": 287 + }, + { + "epoch": 0.1557037304018742, + "grad_norm": 0.5021162033081055, + "learning_rate": 9.905759290587506e-06, + "loss": 0.5067, + "step": 288 + }, + { + "epoch": 0.1562443683546585, + "grad_norm": 0.5399244427680969, + "learning_rate": 9.903926402016153e-06, + "loss": 0.5082, + "step": 289 + }, + { + "epoch": 0.15678500630744277, + "grad_norm": 0.48548075556755066, + "learning_rate": 9.902076033626409e-06, + "loss": 0.5091, + "step": 290 + }, + { + "epoch": 0.15732564426022708, + "grad_norm": 0.5076701045036316, + "learning_rate": 9.900208192013825e-06, + "loss": 0.5002, + "step": 291 + }, + { + "epoch": 0.15786628221301136, + "grad_norm": 0.5822921991348267, + "learning_rate": 9.898322883836239e-06, + "loss": 0.5074, + "step": 292 + }, + { + "epoch": 0.15840692016579563, + "grad_norm": 0.4363133907318115, + "learning_rate": 9.896420115813741e-06, + "loss": 0.5065, + "step": 293 + }, + { + "epoch": 0.1589475581185799, + "grad_norm": 0.47977545857429504, + "learning_rate": 9.894499894728665e-06, + "loss": 0.477, + "step": 294 + }, + { + "epoch": 0.15948819607136422, + "grad_norm": 0.5231459736824036, + "learning_rate": 9.892562227425541e-06, + "loss": 0.5164, + "step": 295 + }, + { + "epoch": 0.1600288340241485, + "grad_norm": 0.4468756318092346, + "learning_rate": 9.890607120811104e-06, + "loss": 0.4966, + "step": 296 + }, + { + "epoch": 0.16056947197693278, + "grad_norm": 0.5080546736717224, + "learning_rate": 9.888634581854235e-06, + "loss": 0.4717, + "step": 297 + }, + { + "epoch": 0.16111010992971705, + "grad_norm": 0.6117534637451172, + "learning_rate": 9.88664461758596e-06, + "loss": 0.5435, + "step": 298 + }, + { + "epoch": 0.16165074788250136, + "grad_norm": 0.44875776767730713, + "learning_rate": 9.884637235099414e-06, + "loss": 0.5162, + "step": 299 + }, + { + "epoch": 0.16219138583528564, + "grad_norm": 0.5946277379989624, + "learning_rate": 9.882612441549817e-06, + "loss": 0.5086, + "step": 300 + }, + { + "epoch": 0.16273202378806992, + "grad_norm": 0.6129415035247803, + "learning_rate": 9.880570244154455e-06, + "loss": 0.5252, + "step": 301 + }, + { + "epoch": 0.1632726617408542, + "grad_norm": 0.5008973479270935, + "learning_rate": 9.878510650192644e-06, + "loss": 0.5181, + "step": 302 + }, + { + "epoch": 0.1638132996936385, + "grad_norm": 0.6963375210762024, + "learning_rate": 9.876433667005711e-06, + "loss": 0.5238, + "step": 303 + }, + { + "epoch": 0.16435393764642278, + "grad_norm": 0.4780685305595398, + "learning_rate": 9.874339301996968e-06, + "loss": 0.5107, + "step": 304 + }, + { + "epoch": 0.16489457559920706, + "grad_norm": 0.47462719678878784, + "learning_rate": 9.87222756263168e-06, + "loss": 0.5003, + "step": 305 + }, + { + "epoch": 0.16543521355199134, + "grad_norm": 0.6904825568199158, + "learning_rate": 9.870098456437045e-06, + "loss": 0.5256, + "step": 306 + }, + { + "epoch": 0.16597585150477565, + "grad_norm": 0.4498540759086609, + "learning_rate": 9.867951991002162e-06, + "loss": 0.5088, + "step": 307 + }, + { + "epoch": 0.16651648945755992, + "grad_norm": 0.5996077656745911, + "learning_rate": 9.865788173978011e-06, + "loss": 0.5045, + "step": 308 + }, + { + "epoch": 0.1670571274103442, + "grad_norm": 0.5990262627601624, + "learning_rate": 9.863607013077414e-06, + "loss": 0.499, + "step": 309 + }, + { + "epoch": 0.16759776536312848, + "grad_norm": 0.456752210855484, + "learning_rate": 9.86140851607502e-06, + "loss": 0.5118, + "step": 310 + }, + { + "epoch": 0.1681384033159128, + "grad_norm": 0.5640468001365662, + "learning_rate": 9.85919269080727e-06, + "loss": 0.5099, + "step": 311 + }, + { + "epoch": 0.16867904126869707, + "grad_norm": 0.4840059280395508, + "learning_rate": 9.856959545172369e-06, + "loss": 0.5047, + "step": 312 + }, + { + "epoch": 0.16921967922148134, + "grad_norm": 0.4675785005092621, + "learning_rate": 9.854709087130261e-06, + "loss": 0.5176, + "step": 313 + }, + { + "epoch": 0.16976031717426562, + "grad_norm": 0.48285573720932007, + "learning_rate": 9.852441324702599e-06, + "loss": 0.515, + "step": 314 + }, + { + "epoch": 0.17030095512704993, + "grad_norm": 0.5241273045539856, + "learning_rate": 9.850156265972722e-06, + "loss": 0.5132, + "step": 315 + }, + { + "epoch": 0.1708415930798342, + "grad_norm": 0.5342344641685486, + "learning_rate": 9.847853919085608e-06, + "loss": 0.5348, + "step": 316 + }, + { + "epoch": 0.1713822310326185, + "grad_norm": 0.5915160179138184, + "learning_rate": 9.845534292247872e-06, + "loss": 0.4903, + "step": 317 + }, + { + "epoch": 0.17192286898540277, + "grad_norm": 0.5160459280014038, + "learning_rate": 9.843197393727713e-06, + "loss": 0.5048, + "step": 318 + }, + { + "epoch": 0.17246350693818707, + "grad_norm": 0.4898647665977478, + "learning_rate": 9.8408432318549e-06, + "loss": 0.5076, + "step": 319 + }, + { + "epoch": 0.17300414489097135, + "grad_norm": 0.4921559989452362, + "learning_rate": 9.838471815020731e-06, + "loss": 0.5231, + "step": 320 + }, + { + "epoch": 0.17354478284375563, + "grad_norm": 0.5118461847305298, + "learning_rate": 9.836083151678014e-06, + "loss": 0.5057, + "step": 321 + }, + { + "epoch": 0.1740854207965399, + "grad_norm": 0.48118114471435547, + "learning_rate": 9.833677250341027e-06, + "loss": 0.5581, + "step": 322 + }, + { + "epoch": 0.17462605874932421, + "grad_norm": 0.4235428273677826, + "learning_rate": 9.831254119585497e-06, + "loss": 0.4938, + "step": 323 + }, + { + "epoch": 0.1751666967021085, + "grad_norm": 0.49432626366615295, + "learning_rate": 9.828813768048555e-06, + "loss": 0.503, + "step": 324 + }, + { + "epoch": 0.17570733465489277, + "grad_norm": 0.4676252007484436, + "learning_rate": 9.826356204428726e-06, + "loss": 0.5017, + "step": 325 + }, + { + "epoch": 0.17624797260767705, + "grad_norm": 0.48674294352531433, + "learning_rate": 9.823881437485882e-06, + "loss": 0.5122, + "step": 326 + }, + { + "epoch": 0.17678861056046136, + "grad_norm": 0.5041030645370483, + "learning_rate": 9.821389476041212e-06, + "loss": 0.5014, + "step": 327 + }, + { + "epoch": 0.17732924851324564, + "grad_norm": 0.5799689888954163, + "learning_rate": 9.8188803289772e-06, + "loss": 0.5245, + "step": 328 + }, + { + "epoch": 0.17786988646602991, + "grad_norm": 0.5120707750320435, + "learning_rate": 9.816354005237583e-06, + "loss": 0.4898, + "step": 329 + }, + { + "epoch": 0.1784105244188142, + "grad_norm": 0.5724909901618958, + "learning_rate": 9.813810513827324e-06, + "loss": 0.5398, + "step": 330 + }, + { + "epoch": 0.1789511623715985, + "grad_norm": 0.6064598560333252, + "learning_rate": 9.811249863812581e-06, + "loss": 0.4939, + "step": 331 + }, + { + "epoch": 0.17949180032438278, + "grad_norm": 0.5473314523696899, + "learning_rate": 9.808672064320672e-06, + "loss": 0.5113, + "step": 332 + }, + { + "epoch": 0.18003243827716706, + "grad_norm": 0.45414504408836365, + "learning_rate": 9.806077124540045e-06, + "loss": 0.4815, + "step": 333 + }, + { + "epoch": 0.18057307622995133, + "grad_norm": 0.48928454518318176, + "learning_rate": 9.803465053720242e-06, + "loss": 0.502, + "step": 334 + }, + { + "epoch": 0.18111371418273564, + "grad_norm": 0.4803617000579834, + "learning_rate": 9.800835861171869e-06, + "loss": 0.4851, + "step": 335 + }, + { + "epoch": 0.18165435213551992, + "grad_norm": 0.4396837055683136, + "learning_rate": 9.798189556266559e-06, + "loss": 0.4956, + "step": 336 + }, + { + "epoch": 0.1821949900883042, + "grad_norm": 0.5100455284118652, + "learning_rate": 9.795526148436945e-06, + "loss": 0.5282, + "step": 337 + }, + { + "epoch": 0.18273562804108848, + "grad_norm": 0.426038920879364, + "learning_rate": 9.792845647176621e-06, + "loss": 0.4697, + "step": 338 + }, + { + "epoch": 0.18327626599387278, + "grad_norm": 0.42569661140441895, + "learning_rate": 9.790148062040108e-06, + "loss": 0.4948, + "step": 339 + }, + { + "epoch": 0.18381690394665706, + "grad_norm": 0.5453207492828369, + "learning_rate": 9.787433402642823e-06, + "loss": 0.5067, + "step": 340 + }, + { + "epoch": 0.18435754189944134, + "grad_norm": 0.5373483300209045, + "learning_rate": 9.784701678661045e-06, + "loss": 0.5227, + "step": 341 + }, + { + "epoch": 0.18489817985222562, + "grad_norm": 0.5656136274337769, + "learning_rate": 9.781952899831876e-06, + "loss": 0.5163, + "step": 342 + }, + { + "epoch": 0.1854388178050099, + "grad_norm": 0.5008209347724915, + "learning_rate": 9.779187075953215e-06, + "loss": 0.5129, + "step": 343 + }, + { + "epoch": 0.1859794557577942, + "grad_norm": 0.583916962146759, + "learning_rate": 9.776404216883709e-06, + "loss": 0.503, + "step": 344 + }, + { + "epoch": 0.18652009371057848, + "grad_norm": 0.5473840832710266, + "learning_rate": 9.77360433254273e-06, + "loss": 0.5407, + "step": 345 + }, + { + "epoch": 0.18706073166336276, + "grad_norm": 0.5509775280952454, + "learning_rate": 9.770787432910336e-06, + "loss": 0.4781, + "step": 346 + }, + { + "epoch": 0.18760136961614704, + "grad_norm": 0.6003471612930298, + "learning_rate": 9.767953528027238e-06, + "loss": 0.5375, + "step": 347 + }, + { + "epoch": 0.18814200756893135, + "grad_norm": 0.43554067611694336, + "learning_rate": 9.765102627994757e-06, + "loss": 0.4962, + "step": 348 + }, + { + "epoch": 0.18868264552171563, + "grad_norm": 0.6562443375587463, + "learning_rate": 9.762234742974793e-06, + "loss": 0.4952, + "step": 349 + }, + { + "epoch": 0.1892232834744999, + "grad_norm": 0.5603171586990356, + "learning_rate": 9.759349883189788e-06, + "loss": 0.4783, + "step": 350 + }, + { + "epoch": 0.18976392142728418, + "grad_norm": 0.5634536147117615, + "learning_rate": 9.756448058922697e-06, + "loss": 0.4884, + "step": 351 + }, + { + "epoch": 0.1903045593800685, + "grad_norm": 0.4945487082004547, + "learning_rate": 9.753529280516931e-06, + "loss": 0.5012, + "step": 352 + }, + { + "epoch": 0.19084519733285277, + "grad_norm": 0.588554322719574, + "learning_rate": 9.750593558376347e-06, + "loss": 0.5098, + "step": 353 + }, + { + "epoch": 0.19138583528563705, + "grad_norm": 0.4570363163948059, + "learning_rate": 9.747640902965185e-06, + "loss": 0.4675, + "step": 354 + }, + { + "epoch": 0.19192647323842132, + "grad_norm": 0.6398181915283203, + "learning_rate": 9.74467132480805e-06, + "loss": 0.5402, + "step": 355 + }, + { + "epoch": 0.19246711119120563, + "grad_norm": 0.4467182159423828, + "learning_rate": 9.741684834489866e-06, + "loss": 0.4916, + "step": 356 + }, + { + "epoch": 0.1930077491439899, + "grad_norm": 0.5539352297782898, + "learning_rate": 9.738681442655842e-06, + "loss": 0.4996, + "step": 357 + }, + { + "epoch": 0.1935483870967742, + "grad_norm": 0.5078352689743042, + "learning_rate": 9.735661160011424e-06, + "loss": 0.5055, + "step": 358 + }, + { + "epoch": 0.19408902504955847, + "grad_norm": 0.4970700442790985, + "learning_rate": 9.732623997322274e-06, + "loss": 0.5247, + "step": 359 + }, + { + "epoch": 0.19462966300234277, + "grad_norm": 0.4787460267543793, + "learning_rate": 9.729569965414214e-06, + "loss": 0.5114, + "step": 360 + }, + { + "epoch": 0.19517030095512705, + "grad_norm": 0.5646668672561646, + "learning_rate": 9.726499075173201e-06, + "loss": 0.4905, + "step": 361 + }, + { + "epoch": 0.19571093890791133, + "grad_norm": 0.5013266801834106, + "learning_rate": 9.723411337545283e-06, + "loss": 0.5112, + "step": 362 + }, + { + "epoch": 0.1962515768606956, + "grad_norm": 0.5415170192718506, + "learning_rate": 9.720306763536553e-06, + "loss": 0.4905, + "step": 363 + }, + { + "epoch": 0.19679221481347992, + "grad_norm": 0.443282812833786, + "learning_rate": 9.717185364213127e-06, + "loss": 0.5082, + "step": 364 + }, + { + "epoch": 0.1973328527662642, + "grad_norm": 0.6220279932022095, + "learning_rate": 9.714047150701082e-06, + "loss": 0.5117, + "step": 365 + }, + { + "epoch": 0.19787349071904847, + "grad_norm": 0.46816128492355347, + "learning_rate": 9.710892134186438e-06, + "loss": 0.5067, + "step": 366 + }, + { + "epoch": 0.19841412867183275, + "grad_norm": 0.578245222568512, + "learning_rate": 9.707720325915105e-06, + "loss": 0.518, + "step": 367 + }, + { + "epoch": 0.19895476662461706, + "grad_norm": 0.5336887240409851, + "learning_rate": 9.704531737192847e-06, + "loss": 0.513, + "step": 368 + }, + { + "epoch": 0.19949540457740134, + "grad_norm": 0.5477902889251709, + "learning_rate": 9.701326379385238e-06, + "loss": 0.4937, + "step": 369 + }, + { + "epoch": 0.20003604253018561, + "grad_norm": 0.486288845539093, + "learning_rate": 9.698104263917632e-06, + "loss": 0.4928, + "step": 370 + }, + { + "epoch": 0.2005766804829699, + "grad_norm": 0.4892939031124115, + "learning_rate": 9.694865402275105e-06, + "loss": 0.5004, + "step": 371 + }, + { + "epoch": 0.2011173184357542, + "grad_norm": 0.4445298910140991, + "learning_rate": 9.691609806002433e-06, + "loss": 0.4874, + "step": 372 + }, + { + "epoch": 0.20165795638853848, + "grad_norm": 0.4402690827846527, + "learning_rate": 9.688337486704038e-06, + "loss": 0.4714, + "step": 373 + }, + { + "epoch": 0.20219859434132276, + "grad_norm": 0.5016452670097351, + "learning_rate": 9.68504845604395e-06, + "loss": 0.5061, + "step": 374 + }, + { + "epoch": 0.20273923229410704, + "grad_norm": 0.4491643011569977, + "learning_rate": 9.681742725745762e-06, + "loss": 0.5089, + "step": 375 + }, + { + "epoch": 0.20327987024689134, + "grad_norm": 0.45842084288597107, + "learning_rate": 9.678420307592602e-06, + "loss": 0.4886, + "step": 376 + }, + { + "epoch": 0.20382050819967562, + "grad_norm": 0.4361850619316101, + "learning_rate": 9.675081213427076e-06, + "loss": 0.5183, + "step": 377 + }, + { + "epoch": 0.2043611461524599, + "grad_norm": 0.5042484998703003, + "learning_rate": 9.671725455151226e-06, + "loss": 0.4974, + "step": 378 + }, + { + "epoch": 0.20490178410524418, + "grad_norm": 0.47856807708740234, + "learning_rate": 9.668353044726498e-06, + "loss": 0.4957, + "step": 379 + }, + { + "epoch": 0.20544242205802848, + "grad_norm": 0.5412836074829102, + "learning_rate": 9.664963994173695e-06, + "loss": 0.5324, + "step": 380 + }, + { + "epoch": 0.20598306001081276, + "grad_norm": 0.5093710422515869, + "learning_rate": 9.66155831557293e-06, + "loss": 0.5335, + "step": 381 + }, + { + "epoch": 0.20652369796359704, + "grad_norm": 0.4847930073738098, + "learning_rate": 9.658136021063585e-06, + "loss": 0.4609, + "step": 382 + }, + { + "epoch": 0.20706433591638132, + "grad_norm": 0.48776888847351074, + "learning_rate": 9.65469712284427e-06, + "loss": 0.5006, + "step": 383 + }, + { + "epoch": 0.20760497386916563, + "grad_norm": 0.5098462104797363, + "learning_rate": 9.651241633172782e-06, + "loss": 0.5021, + "step": 384 + }, + { + "epoch": 0.2081456118219499, + "grad_norm": 0.4696345031261444, + "learning_rate": 9.647769564366048e-06, + "loss": 0.4987, + "step": 385 + }, + { + "epoch": 0.20868624977473418, + "grad_norm": 0.5066925883293152, + "learning_rate": 9.644280928800101e-06, + "loss": 0.5182, + "step": 386 + }, + { + "epoch": 0.20922688772751846, + "grad_norm": 0.5005125403404236, + "learning_rate": 9.640775738910019e-06, + "loss": 0.5005, + "step": 387 + }, + { + "epoch": 0.20976752568030277, + "grad_norm": 0.43679603934288025, + "learning_rate": 9.63725400718989e-06, + "loss": 0.4928, + "step": 388 + }, + { + "epoch": 0.21030816363308705, + "grad_norm": 0.5588410496711731, + "learning_rate": 9.633715746192762e-06, + "loss": 0.5374, + "step": 389 + }, + { + "epoch": 0.21084880158587133, + "grad_norm": 0.4636860489845276, + "learning_rate": 9.630160968530601e-06, + "loss": 0.5011, + "step": 390 + }, + { + "epoch": 0.2113894395386556, + "grad_norm": 0.5035771131515503, + "learning_rate": 9.626589686874252e-06, + "loss": 0.5152, + "step": 391 + }, + { + "epoch": 0.2119300774914399, + "grad_norm": 0.5176652669906616, + "learning_rate": 9.62300191395338e-06, + "loss": 0.4931, + "step": 392 + }, + { + "epoch": 0.2124707154442242, + "grad_norm": 0.4670732319355011, + "learning_rate": 9.619397662556434e-06, + "loss": 0.5099, + "step": 393 + }, + { + "epoch": 0.21301135339700847, + "grad_norm": 0.5705310702323914, + "learning_rate": 9.615776945530603e-06, + "loss": 0.5244, + "step": 394 + }, + { + "epoch": 0.21355199134979275, + "grad_norm": 0.48803743720054626, + "learning_rate": 9.612139775781766e-06, + "loss": 0.4953, + "step": 395 + }, + { + "epoch": 0.21409262930257705, + "grad_norm": 0.6569446921348572, + "learning_rate": 9.608486166274444e-06, + "loss": 0.5086, + "step": 396 + }, + { + "epoch": 0.21463326725536133, + "grad_norm": 0.5075775980949402, + "learning_rate": 9.60481613003176e-06, + "loss": 0.5471, + "step": 397 + }, + { + "epoch": 0.2151739052081456, + "grad_norm": 0.5804044604301453, + "learning_rate": 9.601129680135386e-06, + "loss": 0.5075, + "step": 398 + }, + { + "epoch": 0.2157145431609299, + "grad_norm": 0.4748011827468872, + "learning_rate": 9.597426829725504e-06, + "loss": 0.4741, + "step": 399 + }, + { + "epoch": 0.2162551811137142, + "grad_norm": 0.5516924858093262, + "learning_rate": 9.593707592000751e-06, + "loss": 0.5141, + "step": 400 + }, + { + "epoch": 0.21679581906649847, + "grad_norm": 0.49007630348205566, + "learning_rate": 9.58997198021818e-06, + "loss": 0.4965, + "step": 401 + }, + { + "epoch": 0.21733645701928275, + "grad_norm": 0.49923837184906006, + "learning_rate": 9.586220007693205e-06, + "loss": 0.4953, + "step": 402 + }, + { + "epoch": 0.21787709497206703, + "grad_norm": 0.4540408253669739, + "learning_rate": 9.582451687799557e-06, + "loss": 0.4577, + "step": 403 + }, + { + "epoch": 0.21841773292485134, + "grad_norm": 0.4953221380710602, + "learning_rate": 9.578667033969238e-06, + "loss": 0.5014, + "step": 404 + }, + { + "epoch": 0.21895837087763562, + "grad_norm": 0.5561023354530334, + "learning_rate": 9.574866059692471e-06, + "loss": 0.5158, + "step": 405 + }, + { + "epoch": 0.2194990088304199, + "grad_norm": 0.42774730920791626, + "learning_rate": 9.571048778517655e-06, + "loss": 0.4998, + "step": 406 + }, + { + "epoch": 0.22003964678320417, + "grad_norm": 0.5168089866638184, + "learning_rate": 9.567215204051307e-06, + "loss": 0.5156, + "step": 407 + }, + { + "epoch": 0.22058028473598848, + "grad_norm": 0.45053431391716003, + "learning_rate": 9.563365349958032e-06, + "loss": 0.5138, + "step": 408 + }, + { + "epoch": 0.22112092268877276, + "grad_norm": 0.4745253026485443, + "learning_rate": 9.55949922996045e-06, + "loss": 0.5122, + "step": 409 + }, + { + "epoch": 0.22166156064155704, + "grad_norm": 0.49608004093170166, + "learning_rate": 9.555616857839171e-06, + "loss": 0.4882, + "step": 410 + }, + { + "epoch": 0.22220219859434132, + "grad_norm": 0.46152451634407043, + "learning_rate": 9.551718247432732e-06, + "loss": 0.4709, + "step": 411 + }, + { + "epoch": 0.2227428365471256, + "grad_norm": 0.4751157760620117, + "learning_rate": 9.547803412637542e-06, + "loss": 0.4729, + "step": 412 + }, + { + "epoch": 0.2232834744999099, + "grad_norm": 0.5150904655456543, + "learning_rate": 9.543872367407854e-06, + "loss": 0.5082, + "step": 413 + }, + { + "epoch": 0.22382411245269418, + "grad_norm": 0.46366235613822937, + "learning_rate": 9.539925125755695e-06, + "loss": 0.4977, + "step": 414 + }, + { + "epoch": 0.22436475040547846, + "grad_norm": 0.5216416120529175, + "learning_rate": 9.535961701750825e-06, + "loss": 0.513, + "step": 415 + }, + { + "epoch": 0.22490538835826274, + "grad_norm": 0.465658962726593, + "learning_rate": 9.531982109520686e-06, + "loss": 0.4844, + "step": 416 + }, + { + "epoch": 0.22544602631104704, + "grad_norm": 0.48046615719795227, + "learning_rate": 9.527986363250348e-06, + "loss": 0.4781, + "step": 417 + }, + { + "epoch": 0.22598666426383132, + "grad_norm": 0.47210782766342163, + "learning_rate": 9.523974477182465e-06, + "loss": 0.4951, + "step": 418 + }, + { + "epoch": 0.2265273022166156, + "grad_norm": 0.4686312973499298, + "learning_rate": 9.519946465617217e-06, + "loss": 0.5024, + "step": 419 + }, + { + "epoch": 0.22706794016939988, + "grad_norm": 0.46087512373924255, + "learning_rate": 9.515902342912268e-06, + "loss": 0.4894, + "step": 420 + }, + { + "epoch": 0.22760857812218419, + "grad_norm": 0.5393832921981812, + "learning_rate": 9.511842123482703e-06, + "loss": 0.4985, + "step": 421 + }, + { + "epoch": 0.22814921607496846, + "grad_norm": 0.47389426827430725, + "learning_rate": 9.507765821800988e-06, + "loss": 0.5029, + "step": 422 + }, + { + "epoch": 0.22868985402775274, + "grad_norm": 0.5276787281036377, + "learning_rate": 9.503673452396909e-06, + "loss": 0.5029, + "step": 423 + }, + { + "epoch": 0.22923049198053702, + "grad_norm": 0.5731882452964783, + "learning_rate": 9.499565029857529e-06, + "loss": 0.5116, + "step": 424 + }, + { + "epoch": 0.22977112993332133, + "grad_norm": 0.48996415734291077, + "learning_rate": 9.49544056882713e-06, + "loss": 0.5103, + "step": 425 + }, + { + "epoch": 0.2303117678861056, + "grad_norm": 0.5771133303642273, + "learning_rate": 9.491300084007162e-06, + "loss": 0.4913, + "step": 426 + }, + { + "epoch": 0.23085240583888988, + "grad_norm": 0.42285990715026855, + "learning_rate": 9.48714359015619e-06, + "loss": 0.4914, + "step": 427 + }, + { + "epoch": 0.23139304379167416, + "grad_norm": 0.595855712890625, + "learning_rate": 9.482971102089845e-06, + "loss": 0.518, + "step": 428 + }, + { + "epoch": 0.23193368174445847, + "grad_norm": 0.41206789016723633, + "learning_rate": 9.478782634680765e-06, + "loss": 0.4696, + "step": 429 + }, + { + "epoch": 0.23247431969724275, + "grad_norm": 0.57419753074646, + "learning_rate": 9.47457820285855e-06, + "loss": 0.4933, + "step": 430 + }, + { + "epoch": 0.23301495765002703, + "grad_norm": 0.4686283469200134, + "learning_rate": 9.470357821609703e-06, + "loss": 0.4914, + "step": 431 + }, + { + "epoch": 0.2335555956028113, + "grad_norm": 0.4878191351890564, + "learning_rate": 9.466121505977577e-06, + "loss": 0.4819, + "step": 432 + }, + { + "epoch": 0.2340962335555956, + "grad_norm": 0.5020343661308289, + "learning_rate": 9.461869271062322e-06, + "loss": 0.508, + "step": 433 + }, + { + "epoch": 0.2346368715083799, + "grad_norm": 0.48259827494621277, + "learning_rate": 9.457601132020832e-06, + "loss": 0.4999, + "step": 434 + }, + { + "epoch": 0.23517750946116417, + "grad_norm": 0.439022034406662, + "learning_rate": 9.453317104066693e-06, + "loss": 0.4849, + "step": 435 + }, + { + "epoch": 0.23571814741394845, + "grad_norm": 0.43228328227996826, + "learning_rate": 9.44901720247012e-06, + "loss": 0.4841, + "step": 436 + }, + { + "epoch": 0.23625878536673275, + "grad_norm": 0.43123963475227356, + "learning_rate": 9.444701442557917e-06, + "loss": 0.5022, + "step": 437 + }, + { + "epoch": 0.23679942331951703, + "grad_norm": 0.4314519166946411, + "learning_rate": 9.440369839713407e-06, + "loss": 0.479, + "step": 438 + }, + { + "epoch": 0.2373400612723013, + "grad_norm": 0.4455517828464508, + "learning_rate": 9.436022409376391e-06, + "loss": 0.5356, + "step": 439 + }, + { + "epoch": 0.2378806992250856, + "grad_norm": 0.4760914742946625, + "learning_rate": 9.431659167043079e-06, + "loss": 0.5081, + "step": 440 + }, + { + "epoch": 0.2384213371778699, + "grad_norm": 0.4999728798866272, + "learning_rate": 9.427280128266049e-06, + "loss": 0.518, + "step": 441 + }, + { + "epoch": 0.23896197513065418, + "grad_norm": 0.397093802690506, + "learning_rate": 9.422885308654183e-06, + "loss": 0.5257, + "step": 442 + }, + { + "epoch": 0.23950261308343845, + "grad_norm": 0.4785946011543274, + "learning_rate": 9.418474723872609e-06, + "loss": 0.4708, + "step": 443 + }, + { + "epoch": 0.24004325103622273, + "grad_norm": 0.42722031474113464, + "learning_rate": 9.414048389642652e-06, + "loss": 0.508, + "step": 444 + }, + { + "epoch": 0.24058388898900704, + "grad_norm": 0.4705936908721924, + "learning_rate": 9.409606321741776e-06, + "loss": 0.4882, + "step": 445 + }, + { + "epoch": 0.24112452694179132, + "grad_norm": 0.5673285722732544, + "learning_rate": 9.405148536003527e-06, + "loss": 0.4983, + "step": 446 + }, + { + "epoch": 0.2416651648945756, + "grad_norm": 0.48440027236938477, + "learning_rate": 9.400675048317473e-06, + "loss": 0.5226, + "step": 447 + }, + { + "epoch": 0.24220580284735987, + "grad_norm": 0.5096259117126465, + "learning_rate": 9.396185874629158e-06, + "loss": 0.4979, + "step": 448 + }, + { + "epoch": 0.24274644080014418, + "grad_norm": 0.5506010055541992, + "learning_rate": 9.391681030940031e-06, + "loss": 0.4923, + "step": 449 + }, + { + "epoch": 0.24328707875292846, + "grad_norm": 0.5160098075866699, + "learning_rate": 9.387160533307398e-06, + "loss": 0.5082, + "step": 450 + }, + { + "epoch": 0.24382771670571274, + "grad_norm": 0.483447790145874, + "learning_rate": 9.382624397844363e-06, + "loss": 0.5007, + "step": 451 + }, + { + "epoch": 0.24436835465849702, + "grad_norm": 0.4803040325641632, + "learning_rate": 9.378072640719773e-06, + "loss": 0.4877, + "step": 452 + }, + { + "epoch": 0.24490899261128132, + "grad_norm": 0.5114092230796814, + "learning_rate": 9.373505278158152e-06, + "loss": 0.5008, + "step": 453 + }, + { + "epoch": 0.2454496305640656, + "grad_norm": 0.45896992087364197, + "learning_rate": 9.368922326439655e-06, + "loss": 0.4837, + "step": 454 + }, + { + "epoch": 0.24599026851684988, + "grad_norm": 0.5090027451515198, + "learning_rate": 9.364323801900002e-06, + "loss": 0.478, + "step": 455 + }, + { + "epoch": 0.24653090646963416, + "grad_norm": 0.441892147064209, + "learning_rate": 9.359709720930417e-06, + "loss": 0.511, + "step": 456 + }, + { + "epoch": 0.24707154442241847, + "grad_norm": 0.45809268951416016, + "learning_rate": 9.355080099977579e-06, + "loss": 0.5109, + "step": 457 + }, + { + "epoch": 0.24761218237520274, + "grad_norm": 0.45157548785209656, + "learning_rate": 9.350434955543557e-06, + "loss": 0.4929, + "step": 458 + }, + { + "epoch": 0.24815282032798702, + "grad_norm": 0.4648664891719818, + "learning_rate": 9.345774304185756e-06, + "loss": 0.4809, + "step": 459 + }, + { + "epoch": 0.2486934582807713, + "grad_norm": 0.5375930666923523, + "learning_rate": 9.341098162516848e-06, + "loss": 0.4998, + "step": 460 + }, + { + "epoch": 0.2492340962335556, + "grad_norm": 0.42750316858291626, + "learning_rate": 9.336406547204726e-06, + "loss": 0.4819, + "step": 461 + }, + { + "epoch": 0.2497747341863399, + "grad_norm": 0.5194140076637268, + "learning_rate": 9.331699474972434e-06, + "loss": 0.4908, + "step": 462 + }, + { + "epoch": 0.2503153721391242, + "grad_norm": 0.5086356401443481, + "learning_rate": 9.326976962598113e-06, + "loss": 0.4798, + "step": 463 + }, + { + "epoch": 0.25085601009190844, + "grad_norm": 0.4391303062438965, + "learning_rate": 9.322239026914938e-06, + "loss": 0.4989, + "step": 464 + }, + { + "epoch": 0.25139664804469275, + "grad_norm": 0.5237901210784912, + "learning_rate": 9.317485684811065e-06, + "loss": 0.5179, + "step": 465 + }, + { + "epoch": 0.251937285997477, + "grad_norm": 0.38452693819999695, + "learning_rate": 9.31271695322956e-06, + "loss": 0.4581, + "step": 466 + }, + { + "epoch": 0.2524779239502613, + "grad_norm": 0.4975660443305969, + "learning_rate": 9.307932849168341e-06, + "loss": 0.4567, + "step": 467 + }, + { + "epoch": 0.2530185619030456, + "grad_norm": 0.4287153482437134, + "learning_rate": 9.303133389680134e-06, + "loss": 0.5002, + "step": 468 + }, + { + "epoch": 0.25355919985582986, + "grad_norm": 0.5507779717445374, + "learning_rate": 9.298318591872381e-06, + "loss": 0.4925, + "step": 469 + }, + { + "epoch": 0.25409983780861417, + "grad_norm": 0.4791695475578308, + "learning_rate": 9.293488472907213e-06, + "loss": 0.5269, + "step": 470 + }, + { + "epoch": 0.2546404757613985, + "grad_norm": 0.44029316306114197, + "learning_rate": 9.288643050001362e-06, + "loss": 0.4994, + "step": 471 + }, + { + "epoch": 0.25518111371418273, + "grad_norm": 0.40624865889549255, + "learning_rate": 9.283782340426112e-06, + "loss": 0.4808, + "step": 472 + }, + { + "epoch": 0.25572175166696703, + "grad_norm": 0.44208696484565735, + "learning_rate": 9.278906361507238e-06, + "loss": 0.4944, + "step": 473 + }, + { + "epoch": 0.2562623896197513, + "grad_norm": 0.45168185234069824, + "learning_rate": 9.274015130624943e-06, + "loss": 0.4632, + "step": 474 + }, + { + "epoch": 0.2568030275725356, + "grad_norm": 0.4674747586250305, + "learning_rate": 9.26910866521379e-06, + "loss": 0.5001, + "step": 475 + }, + { + "epoch": 0.2573436655253199, + "grad_norm": 0.4525250792503357, + "learning_rate": 9.264186982762649e-06, + "loss": 0.4734, + "step": 476 + }, + { + "epoch": 0.25788430347810415, + "grad_norm": 0.48848944902420044, + "learning_rate": 9.25925010081463e-06, + "loss": 0.5054, + "step": 477 + }, + { + "epoch": 0.25842494143088846, + "grad_norm": 0.4272785484790802, + "learning_rate": 9.254298036967015e-06, + "loss": 0.4886, + "step": 478 + }, + { + "epoch": 0.25896557938367276, + "grad_norm": 0.6393560171127319, + "learning_rate": 9.249330808871213e-06, + "loss": 0.4915, + "step": 479 + }, + { + "epoch": 0.259506217336457, + "grad_norm": 0.4338541328907013, + "learning_rate": 9.244348434232676e-06, + "loss": 0.4974, + "step": 480 + }, + { + "epoch": 0.2600468552892413, + "grad_norm": 0.5929692387580872, + "learning_rate": 9.239350930810843e-06, + "loss": 0.4978, + "step": 481 + }, + { + "epoch": 0.26058749324202557, + "grad_norm": 0.523623526096344, + "learning_rate": 9.23433831641909e-06, + "loss": 0.5338, + "step": 482 + }, + { + "epoch": 0.2611281311948099, + "grad_norm": 0.45850881934165955, + "learning_rate": 9.229310608924643e-06, + "loss": 0.4928, + "step": 483 + }, + { + "epoch": 0.2616687691475942, + "grad_norm": 0.566050112247467, + "learning_rate": 9.224267826248536e-06, + "loss": 0.4998, + "step": 484 + }, + { + "epoch": 0.26220940710037843, + "grad_norm": 0.5277413129806519, + "learning_rate": 9.219209986365533e-06, + "loss": 0.4935, + "step": 485 + }, + { + "epoch": 0.26275004505316274, + "grad_norm": 0.5275083780288696, + "learning_rate": 9.21413710730407e-06, + "loss": 0.5195, + "step": 486 + }, + { + "epoch": 0.263290683005947, + "grad_norm": 0.4609350860118866, + "learning_rate": 9.20904920714619e-06, + "loss": 0.5127, + "step": 487 + }, + { + "epoch": 0.2638313209587313, + "grad_norm": 0.5389688611030579, + "learning_rate": 9.203946304027476e-06, + "loss": 0.501, + "step": 488 + }, + { + "epoch": 0.2643719589115156, + "grad_norm": 0.41184860467910767, + "learning_rate": 9.198828416136991e-06, + "loss": 0.4591, + "step": 489 + }, + { + "epoch": 0.26491259686429985, + "grad_norm": 0.5245027542114258, + "learning_rate": 9.193695561717207e-06, + "loss": 0.4657, + "step": 490 + }, + { + "epoch": 0.26545323481708416, + "grad_norm": 0.4098646938800812, + "learning_rate": 9.188547759063948e-06, + "loss": 0.5049, + "step": 491 + }, + { + "epoch": 0.26599387276986847, + "grad_norm": 0.5717266798019409, + "learning_rate": 9.183385026526317e-06, + "loss": 0.5137, + "step": 492 + }, + { + "epoch": 0.2665345107226527, + "grad_norm": 0.48556917905807495, + "learning_rate": 9.178207382506634e-06, + "loss": 0.4805, + "step": 493 + }, + { + "epoch": 0.267075148675437, + "grad_norm": 0.4661046862602234, + "learning_rate": 9.173014845460375e-06, + "loss": 0.4607, + "step": 494 + }, + { + "epoch": 0.2676157866282213, + "grad_norm": 0.4387494921684265, + "learning_rate": 9.167807433896091e-06, + "loss": 0.4992, + "step": 495 + }, + { + "epoch": 0.2681564245810056, + "grad_norm": 0.5127182006835938, + "learning_rate": 9.162585166375367e-06, + "loss": 0.5167, + "step": 496 + }, + { + "epoch": 0.2686970625337899, + "grad_norm": 0.42990031838417053, + "learning_rate": 9.157348061512728e-06, + "loss": 0.4858, + "step": 497 + }, + { + "epoch": 0.26923770048657414, + "grad_norm": 0.4679833650588989, + "learning_rate": 9.152096137975593e-06, + "loss": 0.5, + "step": 498 + }, + { + "epoch": 0.26977833843935844, + "grad_norm": 0.4706438183784485, + "learning_rate": 9.146829414484198e-06, + "loss": 0.4923, + "step": 499 + }, + { + "epoch": 0.27031897639214275, + "grad_norm": 0.4618656635284424, + "learning_rate": 9.14154790981154e-06, + "loss": 0.5015, + "step": 500 + }, + { + "epoch": 0.270859614344927, + "grad_norm": 0.473890095949173, + "learning_rate": 9.136251642783294e-06, + "loss": 0.5052, + "step": 501 + }, + { + "epoch": 0.2714002522977113, + "grad_norm": 0.4302096962928772, + "learning_rate": 9.130940632277757e-06, + "loss": 0.5216, + "step": 502 + }, + { + "epoch": 0.27194089025049556, + "grad_norm": 0.45936453342437744, + "learning_rate": 9.125614897225785e-06, + "loss": 0.5082, + "step": 503 + }, + { + "epoch": 0.27248152820327987, + "grad_norm": 0.42327070236206055, + "learning_rate": 9.120274456610708e-06, + "loss": 0.4788, + "step": 504 + }, + { + "epoch": 0.27302216615606417, + "grad_norm": 0.4344567656517029, + "learning_rate": 9.114919329468283e-06, + "loss": 0.4613, + "step": 505 + }, + { + "epoch": 0.2735628041088484, + "grad_norm": 0.44039013981819153, + "learning_rate": 9.10954953488661e-06, + "loss": 0.4888, + "step": 506 + }, + { + "epoch": 0.27410344206163273, + "grad_norm": 0.4592479169368744, + "learning_rate": 9.104165092006075e-06, + "loss": 0.4997, + "step": 507 + }, + { + "epoch": 0.27464408001441704, + "grad_norm": 0.4962833523750305, + "learning_rate": 9.098766020019273e-06, + "loss": 0.4599, + "step": 508 + }, + { + "epoch": 0.2751847179672013, + "grad_norm": 0.4496838450431824, + "learning_rate": 9.09335233817095e-06, + "loss": 0.4969, + "step": 509 + }, + { + "epoch": 0.2757253559199856, + "grad_norm": 0.44309988617897034, + "learning_rate": 9.08792406575792e-06, + "loss": 0.5136, + "step": 510 + }, + { + "epoch": 0.27626599387276984, + "grad_norm": 0.4315456748008728, + "learning_rate": 9.082481222129008e-06, + "loss": 0.4811, + "step": 511 + }, + { + "epoch": 0.27680663182555415, + "grad_norm": 0.45502516627311707, + "learning_rate": 9.07702382668498e-06, + "loss": 0.4764, + "step": 512 + }, + { + "epoch": 0.27734726977833846, + "grad_norm": 0.41209426522254944, + "learning_rate": 9.071551898878471e-06, + "loss": 0.468, + "step": 513 + }, + { + "epoch": 0.2778879077311227, + "grad_norm": 0.4675307273864746, + "learning_rate": 9.066065458213908e-06, + "loss": 0.4903, + "step": 514 + }, + { + "epoch": 0.278428545683907, + "grad_norm": 0.46809569001197815, + "learning_rate": 9.06056452424746e-06, + "loss": 0.5279, + "step": 515 + }, + { + "epoch": 0.2789691836366913, + "grad_norm": 0.5060775876045227, + "learning_rate": 9.055049116586951e-06, + "loss": 0.4866, + "step": 516 + }, + { + "epoch": 0.27950982158947557, + "grad_norm": 0.5153331756591797, + "learning_rate": 9.049519254891793e-06, + "loss": 0.5124, + "step": 517 + }, + { + "epoch": 0.2800504595422599, + "grad_norm": 0.4824049472808838, + "learning_rate": 9.04397495887292e-06, + "loss": 0.4711, + "step": 518 + }, + { + "epoch": 0.28059109749504413, + "grad_norm": 0.4388289153575897, + "learning_rate": 9.038416248292725e-06, + "loss": 0.4769, + "step": 519 + }, + { + "epoch": 0.28113173544782843, + "grad_norm": 0.5226829051971436, + "learning_rate": 9.03284314296497e-06, + "loss": 0.5135, + "step": 520 + }, + { + "epoch": 0.28167237340061274, + "grad_norm": 0.436776340007782, + "learning_rate": 9.02725566275473e-06, + "loss": 0.5041, + "step": 521 + }, + { + "epoch": 0.282213011353397, + "grad_norm": 0.5271431803703308, + "learning_rate": 9.021653827578322e-06, + "loss": 0.4836, + "step": 522 + }, + { + "epoch": 0.2827536493061813, + "grad_norm": 0.5045011043548584, + "learning_rate": 9.016037657403225e-06, + "loss": 0.487, + "step": 523 + }, + { + "epoch": 0.2832942872589656, + "grad_norm": 0.4657753109931946, + "learning_rate": 9.01040717224802e-06, + "loss": 0.4895, + "step": 524 + }, + { + "epoch": 0.28383492521174986, + "grad_norm": 0.4497890770435333, + "learning_rate": 9.004762392182307e-06, + "loss": 0.4913, + "step": 525 + }, + { + "epoch": 0.28437556316453416, + "grad_norm": 0.5092249512672424, + "learning_rate": 8.999103337326646e-06, + "loss": 0.5007, + "step": 526 + }, + { + "epoch": 0.2849162011173184, + "grad_norm": 0.48387986421585083, + "learning_rate": 8.993430027852476e-06, + "loss": 0.5121, + "step": 527 + }, + { + "epoch": 0.2854568390701027, + "grad_norm": 0.4973430037498474, + "learning_rate": 8.987742483982044e-06, + "loss": 0.4928, + "step": 528 + }, + { + "epoch": 0.285997477022887, + "grad_norm": 0.44857463240623474, + "learning_rate": 8.982040725988337e-06, + "loss": 0.5141, + "step": 529 + }, + { + "epoch": 0.2865381149756713, + "grad_norm": 0.48959702253341675, + "learning_rate": 8.976324774195005e-06, + "loss": 0.5099, + "step": 530 + }, + { + "epoch": 0.2870787529284556, + "grad_norm": 0.4419333040714264, + "learning_rate": 8.970594648976299e-06, + "loss": 0.4859, + "step": 531 + }, + { + "epoch": 0.2876193908812399, + "grad_norm": 0.43844538927078247, + "learning_rate": 8.964850370756978e-06, + "loss": 0.4931, + "step": 532 + }, + { + "epoch": 0.28816002883402414, + "grad_norm": 0.5034130811691284, + "learning_rate": 8.95909196001226e-06, + "loss": 0.4897, + "step": 533 + }, + { + "epoch": 0.28870066678680845, + "grad_norm": 0.4126366972923279, + "learning_rate": 8.953319437267731e-06, + "loss": 0.5, + "step": 534 + }, + { + "epoch": 0.2892413047395927, + "grad_norm": 0.4978686571121216, + "learning_rate": 8.947532823099284e-06, + "loss": 0.4674, + "step": 535 + }, + { + "epoch": 0.289781942692377, + "grad_norm": 0.49937352538108826, + "learning_rate": 8.941732138133032e-06, + "loss": 0.4687, + "step": 536 + }, + { + "epoch": 0.2903225806451613, + "grad_norm": 0.5455271601676941, + "learning_rate": 8.935917403045251e-06, + "loss": 0.4956, + "step": 537 + }, + { + "epoch": 0.29086321859794556, + "grad_norm": 0.46256354451179504, + "learning_rate": 8.930088638562296e-06, + "loss": 0.4607, + "step": 538 + }, + { + "epoch": 0.29140385655072987, + "grad_norm": 0.5940908193588257, + "learning_rate": 8.924245865460523e-06, + "loss": 0.4889, + "step": 539 + }, + { + "epoch": 0.2919444945035142, + "grad_norm": 0.4837222099304199, + "learning_rate": 8.918389104566232e-06, + "loss": 0.5081, + "step": 540 + }, + { + "epoch": 0.2924851324562984, + "grad_norm": 0.5419607162475586, + "learning_rate": 8.912518376755572e-06, + "loss": 0.4951, + "step": 541 + }, + { + "epoch": 0.29302577040908273, + "grad_norm": 0.47228190302848816, + "learning_rate": 8.906633702954482e-06, + "loss": 0.4769, + "step": 542 + }, + { + "epoch": 0.293566408361867, + "grad_norm": 0.4912036061286926, + "learning_rate": 8.900735104138605e-06, + "loss": 0.4911, + "step": 543 + }, + { + "epoch": 0.2941070463146513, + "grad_norm": 0.5826781392097473, + "learning_rate": 8.894822601333228e-06, + "loss": 0.4808, + "step": 544 + }, + { + "epoch": 0.2946476842674356, + "grad_norm": 0.5110523104667664, + "learning_rate": 8.888896215613192e-06, + "loss": 0.4882, + "step": 545 + }, + { + "epoch": 0.29518832222021985, + "grad_norm": 0.5374977588653564, + "learning_rate": 8.882955968102822e-06, + "loss": 0.488, + "step": 546 + }, + { + "epoch": 0.29572896017300415, + "grad_norm": 0.49603360891342163, + "learning_rate": 8.877001879975857e-06, + "loss": 0.4979, + "step": 547 + }, + { + "epoch": 0.29626959812578846, + "grad_norm": 0.5069612264633179, + "learning_rate": 8.87103397245537e-06, + "loss": 0.498, + "step": 548 + }, + { + "epoch": 0.2968102360785727, + "grad_norm": 0.5196537375450134, + "learning_rate": 8.865052266813686e-06, + "loss": 0.4882, + "step": 549 + }, + { + "epoch": 0.297350874031357, + "grad_norm": 0.4779670536518097, + "learning_rate": 8.85905678437232e-06, + "loss": 0.483, + "step": 550 + }, + { + "epoch": 0.29789151198414127, + "grad_norm": 0.5233485102653503, + "learning_rate": 8.853047546501893e-06, + "loss": 0.4952, + "step": 551 + }, + { + "epoch": 0.2984321499369256, + "grad_norm": 0.4517951011657715, + "learning_rate": 8.847024574622051e-06, + "loss": 0.5205, + "step": 552 + }, + { + "epoch": 0.2989727878897099, + "grad_norm": 0.5014324188232422, + "learning_rate": 8.840987890201404e-06, + "loss": 0.501, + "step": 553 + }, + { + "epoch": 0.29951342584249413, + "grad_norm": 0.45734500885009766, + "learning_rate": 8.834937514757428e-06, + "loss": 0.473, + "step": 554 + }, + { + "epoch": 0.30005406379527844, + "grad_norm": 0.4451104998588562, + "learning_rate": 8.828873469856408e-06, + "loss": 0.5119, + "step": 555 + }, + { + "epoch": 0.3005947017480627, + "grad_norm": 0.4849705398082733, + "learning_rate": 8.822795777113352e-06, + "loss": 0.499, + "step": 556 + }, + { + "epoch": 0.301135339700847, + "grad_norm": 0.5042188167572021, + "learning_rate": 8.816704458191913e-06, + "loss": 0.512, + "step": 557 + }, + { + "epoch": 0.3016759776536313, + "grad_norm": 0.4437592923641205, + "learning_rate": 8.810599534804315e-06, + "loss": 0.5103, + "step": 558 + }, + { + "epoch": 0.30221661560641555, + "grad_norm": 0.46505382657051086, + "learning_rate": 8.804481028711274e-06, + "loss": 0.4759, + "step": 559 + }, + { + "epoch": 0.30275725355919986, + "grad_norm": 0.5045806169509888, + "learning_rate": 8.798348961721925e-06, + "loss": 0.4896, + "step": 560 + }, + { + "epoch": 0.30329789151198416, + "grad_norm": 0.4548962116241455, + "learning_rate": 8.792203355693731e-06, + "loss": 0.4548, + "step": 561 + }, + { + "epoch": 0.3038385294647684, + "grad_norm": 0.4087465703487396, + "learning_rate": 8.786044232532423e-06, + "loss": 0.4975, + "step": 562 + }, + { + "epoch": 0.3043791674175527, + "grad_norm": 0.5508294105529785, + "learning_rate": 8.77987161419191e-06, + "loss": 0.4907, + "step": 563 + }, + { + "epoch": 0.30491980537033697, + "grad_norm": 0.53659987449646, + "learning_rate": 8.773685522674205e-06, + "loss": 0.494, + "step": 564 + }, + { + "epoch": 0.3054604433231213, + "grad_norm": 0.5059062242507935, + "learning_rate": 8.767485980029342e-06, + "loss": 0.4904, + "step": 565 + }, + { + "epoch": 0.3060010812759056, + "grad_norm": 0.462995320558548, + "learning_rate": 8.761273008355306e-06, + "loss": 0.483, + "step": 566 + }, + { + "epoch": 0.30654171922868984, + "grad_norm": 0.6344414353370667, + "learning_rate": 8.755046629797944e-06, + "loss": 0.5041, + "step": 567 + }, + { + "epoch": 0.30708235718147414, + "grad_norm": 0.455564022064209, + "learning_rate": 8.748806866550895e-06, + "loss": 0.4515, + "step": 568 + }, + { + "epoch": 0.30762299513425845, + "grad_norm": 0.5680974721908569, + "learning_rate": 8.742553740855507e-06, + "loss": 0.5019, + "step": 569 + }, + { + "epoch": 0.3081636330870427, + "grad_norm": 0.500893771648407, + "learning_rate": 8.736287275000755e-06, + "loss": 0.5346, + "step": 570 + }, + { + "epoch": 0.308704271039827, + "grad_norm": 0.43272846937179565, + "learning_rate": 8.730007491323167e-06, + "loss": 0.4932, + "step": 571 + }, + { + "epoch": 0.30924490899261126, + "grad_norm": 0.4665529727935791, + "learning_rate": 8.723714412206741e-06, + "loss": 0.5207, + "step": 572 + }, + { + "epoch": 0.30978554694539556, + "grad_norm": 0.5202204585075378, + "learning_rate": 8.717408060082865e-06, + "loss": 0.5138, + "step": 573 + }, + { + "epoch": 0.31032618489817987, + "grad_norm": 0.4143356680870056, + "learning_rate": 8.711088457430239e-06, + "loss": 0.4977, + "step": 574 + }, + { + "epoch": 0.3108668228509641, + "grad_norm": 0.4690699875354767, + "learning_rate": 8.704755626774796e-06, + "loss": 0.4854, + "step": 575 + }, + { + "epoch": 0.3114074608037484, + "grad_norm": 0.5123147964477539, + "learning_rate": 8.698409590689616e-06, + "loss": 0.493, + "step": 576 + }, + { + "epoch": 0.31194809875653273, + "grad_norm": 0.38649624586105347, + "learning_rate": 8.692050371794849e-06, + "loss": 0.4737, + "step": 577 + }, + { + "epoch": 0.312488736709317, + "grad_norm": 0.4339773952960968, + "learning_rate": 8.685677992757637e-06, + "loss": 0.485, + "step": 578 + }, + { + "epoch": 0.3130293746621013, + "grad_norm": 0.5341055989265442, + "learning_rate": 8.67929247629203e-06, + "loss": 0.4933, + "step": 579 + }, + { + "epoch": 0.31357001261488554, + "grad_norm": 0.38680076599121094, + "learning_rate": 8.672893845158908e-06, + "loss": 0.5145, + "step": 580 + }, + { + "epoch": 0.31411065056766985, + "grad_norm": 0.5113509893417358, + "learning_rate": 8.66648212216589e-06, + "loss": 0.4792, + "step": 581 + }, + { + "epoch": 0.31465128852045415, + "grad_norm": 0.41235169768333435, + "learning_rate": 8.660057330167267e-06, + "loss": 0.4734, + "step": 582 + }, + { + "epoch": 0.3151919264732384, + "grad_norm": 0.4803949296474457, + "learning_rate": 8.653619492063916e-06, + "loss": 0.4898, + "step": 583 + }, + { + "epoch": 0.3157325644260227, + "grad_norm": 0.4755302965641022, + "learning_rate": 8.647168630803208e-06, + "loss": 0.4968, + "step": 584 + }, + { + "epoch": 0.316273202378807, + "grad_norm": 0.4384644329547882, + "learning_rate": 8.640704769378943e-06, + "loss": 0.4922, + "step": 585 + }, + { + "epoch": 0.31681384033159127, + "grad_norm": 0.5590741038322449, + "learning_rate": 8.634227930831252e-06, + "loss": 0.4828, + "step": 586 + }, + { + "epoch": 0.3173544782843756, + "grad_norm": 0.45347392559051514, + "learning_rate": 8.627738138246529e-06, + "loss": 0.468, + "step": 587 + }, + { + "epoch": 0.3178951162371598, + "grad_norm": 0.4747249186038971, + "learning_rate": 8.621235414757337e-06, + "loss": 0.4905, + "step": 588 + }, + { + "epoch": 0.31843575418994413, + "grad_norm": 0.4371531903743744, + "learning_rate": 8.61471978354233e-06, + "loss": 0.4806, + "step": 589 + }, + { + "epoch": 0.31897639214272844, + "grad_norm": 0.5220611095428467, + "learning_rate": 8.608191267826179e-06, + "loss": 0.4872, + "step": 590 + }, + { + "epoch": 0.3195170300955127, + "grad_norm": 0.46987447142601013, + "learning_rate": 8.60164989087947e-06, + "loss": 0.523, + "step": 591 + }, + { + "epoch": 0.320057668048297, + "grad_norm": 0.5529326796531677, + "learning_rate": 8.595095676018645e-06, + "loss": 0.5054, + "step": 592 + }, + { + "epoch": 0.3205983060010813, + "grad_norm": 0.4231288731098175, + "learning_rate": 8.588528646605893e-06, + "loss": 0.4985, + "step": 593 + }, + { + "epoch": 0.32113894395386555, + "grad_norm": 0.5058132410049438, + "learning_rate": 8.581948826049086e-06, + "loss": 0.4989, + "step": 594 + }, + { + "epoch": 0.32167958190664986, + "grad_norm": 0.43005961179733276, + "learning_rate": 8.575356237801695e-06, + "loss": 0.4898, + "step": 595 + }, + { + "epoch": 0.3222202198594341, + "grad_norm": 0.4978388547897339, + "learning_rate": 8.56875090536269e-06, + "loss": 0.4666, + "step": 596 + }, + { + "epoch": 0.3227608578122184, + "grad_norm": 0.5346764326095581, + "learning_rate": 8.562132852276474e-06, + "loss": 0.4828, + "step": 597 + }, + { + "epoch": 0.3233014957650027, + "grad_norm": 0.5223882794380188, + "learning_rate": 8.555502102132792e-06, + "loss": 0.4619, + "step": 598 + }, + { + "epoch": 0.323842133717787, + "grad_norm": 0.5179974436759949, + "learning_rate": 8.548858678566643e-06, + "loss": 0.46, + "step": 599 + }, + { + "epoch": 0.3243827716705713, + "grad_norm": 0.44731220602989197, + "learning_rate": 8.542202605258204e-06, + "loss": 0.4712, + "step": 600 + }, + { + "epoch": 0.3249234096233556, + "grad_norm": 0.5143573880195618, + "learning_rate": 8.535533905932739e-06, + "loss": 0.4841, + "step": 601 + }, + { + "epoch": 0.32546404757613984, + "grad_norm": 0.40094849467277527, + "learning_rate": 8.528852604360518e-06, + "loss": 0.4825, + "step": 602 + }, + { + "epoch": 0.32600468552892414, + "grad_norm": 0.4948832094669342, + "learning_rate": 8.52215872435673e-06, + "loss": 0.5066, + "step": 603 + }, + { + "epoch": 0.3265453234817084, + "grad_norm": 0.43757307529449463, + "learning_rate": 8.515452289781403e-06, + "loss": 0.4831, + "step": 604 + }, + { + "epoch": 0.3270859614344927, + "grad_norm": 0.4551491141319275, + "learning_rate": 8.50873332453931e-06, + "loss": 0.4628, + "step": 605 + }, + { + "epoch": 0.327626599387277, + "grad_norm": 0.45467811822891235, + "learning_rate": 8.50200185257989e-06, + "loss": 0.4813, + "step": 606 + }, + { + "epoch": 0.32816723734006126, + "grad_norm": 0.4058467745780945, + "learning_rate": 8.495257897897166e-06, + "loss": 0.4756, + "step": 607 + }, + { + "epoch": 0.32870787529284556, + "grad_norm": 0.4405519664287567, + "learning_rate": 8.48850148452965e-06, + "loss": 0.4964, + "step": 608 + }, + { + "epoch": 0.32924851324562987, + "grad_norm": 0.39016538858413696, + "learning_rate": 8.481732636560266e-06, + "loss": 0.4736, + "step": 609 + }, + { + "epoch": 0.3297891511984141, + "grad_norm": 0.4444190561771393, + "learning_rate": 8.474951378116253e-06, + "loss": 0.4904, + "step": 610 + }, + { + "epoch": 0.3303297891511984, + "grad_norm": 0.45072081685066223, + "learning_rate": 8.468157733369102e-06, + "loss": 0.4906, + "step": 611 + }, + { + "epoch": 0.3308704271039827, + "grad_norm": 0.40805482864379883, + "learning_rate": 8.461351726534438e-06, + "loss": 0.4781, + "step": 612 + }, + { + "epoch": 0.331411065056767, + "grad_norm": 0.4450664520263672, + "learning_rate": 8.454533381871957e-06, + "loss": 0.4985, + "step": 613 + }, + { + "epoch": 0.3319517030095513, + "grad_norm": 0.465251088142395, + "learning_rate": 8.447702723685335e-06, + "loss": 0.4911, + "step": 614 + }, + { + "epoch": 0.33249234096233554, + "grad_norm": 0.49976640939712524, + "learning_rate": 8.440859776322137e-06, + "loss": 0.4977, + "step": 615 + }, + { + "epoch": 0.33303297891511985, + "grad_norm": 0.5387666821479797, + "learning_rate": 8.43400456417373e-06, + "loss": 0.4938, + "step": 616 + }, + { + "epoch": 0.3335736168679041, + "grad_norm": 0.3886103630065918, + "learning_rate": 8.4271371116752e-06, + "loss": 0.4753, + "step": 617 + }, + { + "epoch": 0.3341142548206884, + "grad_norm": 0.4702761173248291, + "learning_rate": 8.420257443305264e-06, + "loss": 0.4991, + "step": 618 + }, + { + "epoch": 0.3346548927734727, + "grad_norm": 0.4505619406700134, + "learning_rate": 8.41336558358618e-06, + "loss": 0.4913, + "step": 619 + }, + { + "epoch": 0.33519553072625696, + "grad_norm": 0.4063532054424286, + "learning_rate": 8.406461557083666e-06, + "loss": 0.4866, + "step": 620 + }, + { + "epoch": 0.33573616867904127, + "grad_norm": 0.44660359621047974, + "learning_rate": 8.399545388406798e-06, + "loss": 0.5116, + "step": 621 + }, + { + "epoch": 0.3362768066318256, + "grad_norm": 0.41896873712539673, + "learning_rate": 8.392617102207945e-06, + "loss": 0.4966, + "step": 622 + }, + { + "epoch": 0.3368174445846098, + "grad_norm": 0.44783973693847656, + "learning_rate": 8.38567672318266e-06, + "loss": 0.4843, + "step": 623 + }, + { + "epoch": 0.33735808253739413, + "grad_norm": 0.4662441611289978, + "learning_rate": 8.3787242760696e-06, + "loss": 0.4646, + "step": 624 + }, + { + "epoch": 0.3378987204901784, + "grad_norm": 0.44338029623031616, + "learning_rate": 8.371759785650444e-06, + "loss": 0.4847, + "step": 625 + }, + { + "epoch": 0.3384393584429627, + "grad_norm": 0.4882574677467346, + "learning_rate": 8.364783276749794e-06, + "loss": 0.489, + "step": 626 + }, + { + "epoch": 0.338979996395747, + "grad_norm": 0.4160021245479584, + "learning_rate": 8.357794774235094e-06, + "loss": 0.504, + "step": 627 + }, + { + "epoch": 0.33952063434853125, + "grad_norm": 0.40824034810066223, + "learning_rate": 8.350794303016533e-06, + "loss": 0.4683, + "step": 628 + }, + { + "epoch": 0.34006127230131555, + "grad_norm": 0.5091646909713745, + "learning_rate": 8.343781888046971e-06, + "loss": 0.5207, + "step": 629 + }, + { + "epoch": 0.34060191025409986, + "grad_norm": 0.43365058302879333, + "learning_rate": 8.336757554321832e-06, + "loss": 0.5062, + "step": 630 + }, + { + "epoch": 0.3411425482068841, + "grad_norm": 0.4349713623523712, + "learning_rate": 8.329721326879032e-06, + "loss": 0.4745, + "step": 631 + }, + { + "epoch": 0.3416831861596684, + "grad_norm": 0.4307449460029602, + "learning_rate": 8.322673230798877e-06, + "loss": 0.4845, + "step": 632 + }, + { + "epoch": 0.34222382411245267, + "grad_norm": 0.4474685788154602, + "learning_rate": 8.315613291203977e-06, + "loss": 0.4928, + "step": 633 + }, + { + "epoch": 0.342764462065237, + "grad_norm": 0.4700278043746948, + "learning_rate": 8.30854153325916e-06, + "loss": 0.5178, + "step": 634 + }, + { + "epoch": 0.3433051000180213, + "grad_norm": 0.42883479595184326, + "learning_rate": 8.30145798217138e-06, + "loss": 0.4828, + "step": 635 + }, + { + "epoch": 0.34384573797080553, + "grad_norm": 0.45476993918418884, + "learning_rate": 8.294362663189626e-06, + "loss": 0.4882, + "step": 636 + }, + { + "epoch": 0.34438637592358984, + "grad_norm": 0.40888112783432007, + "learning_rate": 8.287255601604834e-06, + "loss": 0.4728, + "step": 637 + }, + { + "epoch": 0.34492701387637414, + "grad_norm": 0.41027674078941345, + "learning_rate": 8.280136822749796e-06, + "loss": 0.4612, + "step": 638 + }, + { + "epoch": 0.3454676518291584, + "grad_norm": 0.3990921080112457, + "learning_rate": 8.27300635199907e-06, + "loss": 0.4977, + "step": 639 + }, + { + "epoch": 0.3460082897819427, + "grad_norm": 0.39647355675697327, + "learning_rate": 8.265864214768883e-06, + "loss": 0.4817, + "step": 640 + }, + { + "epoch": 0.34654892773472695, + "grad_norm": 0.40276244282722473, + "learning_rate": 8.25871043651706e-06, + "loss": 0.4652, + "step": 641 + }, + { + "epoch": 0.34708956568751126, + "grad_norm": 0.4234772026538849, + "learning_rate": 8.25154504274291e-06, + "loss": 0.4882, + "step": 642 + }, + { + "epoch": 0.34763020364029557, + "grad_norm": 0.49208852648735046, + "learning_rate": 8.244368058987145e-06, + "loss": 0.5014, + "step": 643 + }, + { + "epoch": 0.3481708415930798, + "grad_norm": 0.3916458189487457, + "learning_rate": 8.237179510831792e-06, + "loss": 0.4678, + "step": 644 + }, + { + "epoch": 0.3487114795458641, + "grad_norm": 0.43457022309303284, + "learning_rate": 8.229979423900095e-06, + "loss": 0.4866, + "step": 645 + }, + { + "epoch": 0.34925211749864843, + "grad_norm": 0.44561585783958435, + "learning_rate": 8.222767823856435e-06, + "loss": 0.491, + "step": 646 + }, + { + "epoch": 0.3497927554514327, + "grad_norm": 0.3655422031879425, + "learning_rate": 8.215544736406223e-06, + "loss": 0.4497, + "step": 647 + }, + { + "epoch": 0.350333393404217, + "grad_norm": 0.47153764963150024, + "learning_rate": 8.20831018729582e-06, + "loss": 0.5342, + "step": 648 + }, + { + "epoch": 0.35087403135700124, + "grad_norm": 0.4188770651817322, + "learning_rate": 8.20106420231244e-06, + "loss": 0.4742, + "step": 649 + }, + { + "epoch": 0.35141466930978554, + "grad_norm": 0.45185157656669617, + "learning_rate": 8.193806807284064e-06, + "loss": 0.4623, + "step": 650 + }, + { + "epoch": 0.35195530726256985, + "grad_norm": 0.49852705001831055, + "learning_rate": 8.186538028079338e-06, + "loss": 0.4728, + "step": 651 + }, + { + "epoch": 0.3524959452153541, + "grad_norm": 0.46935099363327026, + "learning_rate": 8.179257890607489e-06, + "loss": 0.4709, + "step": 652 + }, + { + "epoch": 0.3530365831681384, + "grad_norm": 0.4840806722640991, + "learning_rate": 8.171966420818227e-06, + "loss": 0.4988, + "step": 653 + }, + { + "epoch": 0.3535772211209227, + "grad_norm": 0.45912444591522217, + "learning_rate": 8.164663644701662e-06, + "loss": 0.4933, + "step": 654 + }, + { + "epoch": 0.35411785907370696, + "grad_norm": 0.4942997694015503, + "learning_rate": 8.157349588288202e-06, + "loss": 0.4731, + "step": 655 + }, + { + "epoch": 0.35465849702649127, + "grad_norm": 0.4257151782512665, + "learning_rate": 8.150024277648458e-06, + "loss": 0.487, + "step": 656 + }, + { + "epoch": 0.3551991349792755, + "grad_norm": 0.544456958770752, + "learning_rate": 8.142687738893161e-06, + "loss": 0.5117, + "step": 657 + }, + { + "epoch": 0.35573977293205983, + "grad_norm": 0.5224266648292542, + "learning_rate": 8.135339998173064e-06, + "loss": 0.4894, + "step": 658 + }, + { + "epoch": 0.35628041088484413, + "grad_norm": 0.46919405460357666, + "learning_rate": 8.12798108167885e-06, + "loss": 0.4719, + "step": 659 + }, + { + "epoch": 0.3568210488376284, + "grad_norm": 0.7049233913421631, + "learning_rate": 8.120611015641036e-06, + "loss": 0.5216, + "step": 660 + }, + { + "epoch": 0.3573616867904127, + "grad_norm": 0.4920409619808197, + "learning_rate": 8.113229826329876e-06, + "loss": 0.4922, + "step": 661 + }, + { + "epoch": 0.357902324743197, + "grad_norm": 0.5132108330726624, + "learning_rate": 8.105837540055284e-06, + "loss": 0.4894, + "step": 662 + }, + { + "epoch": 0.35844296269598125, + "grad_norm": 0.5030608177185059, + "learning_rate": 8.098434183166716e-06, + "loss": 0.4765, + "step": 663 + }, + { + "epoch": 0.35898360064876556, + "grad_norm": 0.5627034306526184, + "learning_rate": 8.091019782053097e-06, + "loss": 0.4862, + "step": 664 + }, + { + "epoch": 0.3595242386015498, + "grad_norm": 0.5231481790542603, + "learning_rate": 8.083594363142717e-06, + "loss": 0.5224, + "step": 665 + }, + { + "epoch": 0.3600648765543341, + "grad_norm": 0.5793126225471497, + "learning_rate": 8.076157952903134e-06, + "loss": 0.5063, + "step": 666 + }, + { + "epoch": 0.3606055145071184, + "grad_norm": 0.5222432017326355, + "learning_rate": 8.068710577841093e-06, + "loss": 0.4578, + "step": 667 + }, + { + "epoch": 0.36114615245990267, + "grad_norm": 0.4852503836154938, + "learning_rate": 8.061252264502415e-06, + "loss": 0.4936, + "step": 668 + }, + { + "epoch": 0.361686790412687, + "grad_norm": 0.4867151379585266, + "learning_rate": 8.053783039471909e-06, + "loss": 0.4957, + "step": 669 + }, + { + "epoch": 0.3622274283654713, + "grad_norm": 0.45587924122810364, + "learning_rate": 8.046302929373286e-06, + "loss": 0.4655, + "step": 670 + }, + { + "epoch": 0.36276806631825553, + "grad_norm": 0.47155067324638367, + "learning_rate": 8.038811960869051e-06, + "loss": 0.487, + "step": 671 + }, + { + "epoch": 0.36330870427103984, + "grad_norm": 0.4418538510799408, + "learning_rate": 8.031310160660411e-06, + "loss": 0.4724, + "step": 672 + }, + { + "epoch": 0.3638493422238241, + "grad_norm": 0.404681533575058, + "learning_rate": 8.023797555487188e-06, + "loss": 0.4892, + "step": 673 + }, + { + "epoch": 0.3643899801766084, + "grad_norm": 0.45918798446655273, + "learning_rate": 8.016274172127715e-06, + "loss": 0.5012, + "step": 674 + }, + { + "epoch": 0.3649306181293927, + "grad_norm": 0.4333856701850891, + "learning_rate": 8.008740037398742e-06, + "loss": 0.4868, + "step": 675 + }, + { + "epoch": 0.36547125608217695, + "grad_norm": 0.3893619477748871, + "learning_rate": 8.001195178155344e-06, + "loss": 0.4684, + "step": 676 + }, + { + "epoch": 0.36601189403496126, + "grad_norm": 0.434662789106369, + "learning_rate": 7.99363962129082e-06, + "loss": 0.4814, + "step": 677 + }, + { + "epoch": 0.36655253198774557, + "grad_norm": 0.4455735981464386, + "learning_rate": 7.986073393736607e-06, + "loss": 0.4856, + "step": 678 + }, + { + "epoch": 0.3670931699405298, + "grad_norm": 0.39852529764175415, + "learning_rate": 7.978496522462167e-06, + "loss": 0.4811, + "step": 679 + }, + { + "epoch": 0.3676338078933141, + "grad_norm": 0.42947253584861755, + "learning_rate": 7.97090903447491e-06, + "loss": 0.4687, + "step": 680 + }, + { + "epoch": 0.3681744458460984, + "grad_norm": 0.4797091782093048, + "learning_rate": 7.963310956820085e-06, + "loss": 0.4892, + "step": 681 + }, + { + "epoch": 0.3687150837988827, + "grad_norm": 0.449841171503067, + "learning_rate": 7.955702316580686e-06, + "loss": 0.4918, + "step": 682 + }, + { + "epoch": 0.369255721751667, + "grad_norm": 0.4748377203941345, + "learning_rate": 7.94808314087736e-06, + "loss": 0.5122, + "step": 683 + }, + { + "epoch": 0.36979635970445124, + "grad_norm": 0.45039764046669006, + "learning_rate": 7.940453456868304e-06, + "loss": 0.4818, + "step": 684 + }, + { + "epoch": 0.37033699765723554, + "grad_norm": 0.44947609305381775, + "learning_rate": 7.932813291749177e-06, + "loss": 0.4906, + "step": 685 + }, + { + "epoch": 0.3708776356100198, + "grad_norm": 0.46139925718307495, + "learning_rate": 7.925162672752989e-06, + "loss": 0.4888, + "step": 686 + }, + { + "epoch": 0.3714182735628041, + "grad_norm": 0.49717041850090027, + "learning_rate": 7.917501627150019e-06, + "loss": 0.4981, + "step": 687 + }, + { + "epoch": 0.3719589115155884, + "grad_norm": 0.5299156904220581, + "learning_rate": 7.90983018224771e-06, + "loss": 0.4948, + "step": 688 + }, + { + "epoch": 0.37249954946837266, + "grad_norm": 0.4835863411426544, + "learning_rate": 7.902148365390567e-06, + "loss": 0.4915, + "step": 689 + }, + { + "epoch": 0.37304018742115697, + "grad_norm": 0.5481423735618591, + "learning_rate": 7.894456203960075e-06, + "loss": 0.4889, + "step": 690 + }, + { + "epoch": 0.37358082537394127, + "grad_norm": 0.419506311416626, + "learning_rate": 7.886753725374586e-06, + "loss": 0.4839, + "step": 691 + }, + { + "epoch": 0.3741214633267255, + "grad_norm": 0.429317831993103, + "learning_rate": 7.879040957089229e-06, + "loss": 0.4709, + "step": 692 + }, + { + "epoch": 0.37466210127950983, + "grad_norm": 0.47440069913864136, + "learning_rate": 7.871317926595804e-06, + "loss": 0.4777, + "step": 693 + }, + { + "epoch": 0.3752027392322941, + "grad_norm": 0.5026919841766357, + "learning_rate": 7.8635846614227e-06, + "loss": 0.4749, + "step": 694 + }, + { + "epoch": 0.3757433771850784, + "grad_norm": 0.39894619584083557, + "learning_rate": 7.855841189134784e-06, + "loss": 0.4627, + "step": 695 + }, + { + "epoch": 0.3762840151378627, + "grad_norm": 0.4291825294494629, + "learning_rate": 7.848087537333298e-06, + "loss": 0.4667, + "step": 696 + }, + { + "epoch": 0.37682465309064694, + "grad_norm": 0.3873690068721771, + "learning_rate": 7.84032373365578e-06, + "loss": 0.4774, + "step": 697 + }, + { + "epoch": 0.37736529104343125, + "grad_norm": 0.3892592489719391, + "learning_rate": 7.832549805775945e-06, + "loss": 0.5081, + "step": 698 + }, + { + "epoch": 0.37790592899621556, + "grad_norm": 0.40017616748809814, + "learning_rate": 7.8247657814036e-06, + "loss": 0.4783, + "step": 699 + }, + { + "epoch": 0.3784465669489998, + "grad_norm": 0.41424062848091125, + "learning_rate": 7.81697168828454e-06, + "loss": 0.4904, + "step": 700 + }, + { + "epoch": 0.3789872049017841, + "grad_norm": 0.42307886481285095, + "learning_rate": 7.809167554200446e-06, + "loss": 0.4948, + "step": 701 + }, + { + "epoch": 0.37952784285456836, + "grad_norm": 0.3783946931362152, + "learning_rate": 7.801353406968795e-06, + "loss": 0.4778, + "step": 702 + }, + { + "epoch": 0.38006848080735267, + "grad_norm": 0.39104709029197693, + "learning_rate": 7.793529274442753e-06, + "loss": 0.4661, + "step": 703 + }, + { + "epoch": 0.380609118760137, + "grad_norm": 0.4255848824977875, + "learning_rate": 7.785695184511074e-06, + "loss": 0.4629, + "step": 704 + }, + { + "epoch": 0.38114975671292123, + "grad_norm": 0.37515324354171753, + "learning_rate": 7.777851165098012e-06, + "loss": 0.4744, + "step": 705 + }, + { + "epoch": 0.38169039466570553, + "grad_norm": 0.43768253922462463, + "learning_rate": 7.769997244163209e-06, + "loss": 0.4941, + "step": 706 + }, + { + "epoch": 0.38223103261848984, + "grad_norm": 0.4131484925746918, + "learning_rate": 7.762133449701603e-06, + "loss": 0.4912, + "step": 707 + }, + { + "epoch": 0.3827716705712741, + "grad_norm": 0.42033693194389343, + "learning_rate": 7.754259809743325e-06, + "loss": 0.4606, + "step": 708 + }, + { + "epoch": 0.3833123085240584, + "grad_norm": 0.3850057125091553, + "learning_rate": 7.746376352353599e-06, + "loss": 0.4715, + "step": 709 + }, + { + "epoch": 0.38385294647684265, + "grad_norm": 0.4940873086452484, + "learning_rate": 7.738483105632644e-06, + "loss": 0.5257, + "step": 710 + }, + { + "epoch": 0.38439358442962696, + "grad_norm": 0.4676145315170288, + "learning_rate": 7.730580097715575e-06, + "loss": 0.4917, + "step": 711 + }, + { + "epoch": 0.38493422238241126, + "grad_norm": 0.41766825318336487, + "learning_rate": 7.722667356772291e-06, + "loss": 0.4738, + "step": 712 + }, + { + "epoch": 0.3854748603351955, + "grad_norm": 0.4553625285625458, + "learning_rate": 7.714744911007395e-06, + "loss": 0.4839, + "step": 713 + }, + { + "epoch": 0.3860154982879798, + "grad_norm": 0.3935423195362091, + "learning_rate": 7.706812788660075e-06, + "loss": 0.4889, + "step": 714 + }, + { + "epoch": 0.3865561362407641, + "grad_norm": 0.42542070150375366, + "learning_rate": 7.698871018004016e-06, + "loss": 0.5031, + "step": 715 + }, + { + "epoch": 0.3870967741935484, + "grad_norm": 0.48952916264533997, + "learning_rate": 7.690919627347292e-06, + "loss": 0.4594, + "step": 716 + }, + { + "epoch": 0.3876374121463327, + "grad_norm": 0.4403664469718933, + "learning_rate": 7.682958645032265e-06, + "loss": 0.458, + "step": 717 + }, + { + "epoch": 0.38817805009911693, + "grad_norm": 0.3886347711086273, + "learning_rate": 7.674988099435487e-06, + "loss": 0.471, + "step": 718 + }, + { + "epoch": 0.38871868805190124, + "grad_norm": 0.4349733889102936, + "learning_rate": 7.667008018967598e-06, + "loss": 0.4992, + "step": 719 + }, + { + "epoch": 0.38925932600468555, + "grad_norm": 0.48294758796691895, + "learning_rate": 7.65901843207323e-06, + "loss": 0.487, + "step": 720 + }, + { + "epoch": 0.3897999639574698, + "grad_norm": 0.4309777319431305, + "learning_rate": 7.651019367230886e-06, + "loss": 0.4614, + "step": 721 + }, + { + "epoch": 0.3903406019102541, + "grad_norm": 0.4028390049934387, + "learning_rate": 7.643010852952871e-06, + "loss": 0.4942, + "step": 722 + }, + { + "epoch": 0.3908812398630384, + "grad_norm": 0.4195624887943268, + "learning_rate": 7.634992917785156e-06, + "loss": 0.4883, + "step": 723 + }, + { + "epoch": 0.39142187781582266, + "grad_norm": 0.38650834560394287, + "learning_rate": 7.626965590307305e-06, + "loss": 0.5055, + "step": 724 + }, + { + "epoch": 0.39196251576860697, + "grad_norm": 0.4400707185268402, + "learning_rate": 7.6189288991323505e-06, + "loss": 0.4653, + "step": 725 + }, + { + "epoch": 0.3925031537213912, + "grad_norm": 0.3416772484779358, + "learning_rate": 7.610882872906709e-06, + "loss": 0.4676, + "step": 726 + }, + { + "epoch": 0.3930437916741755, + "grad_norm": 0.44608405232429504, + "learning_rate": 7.602827540310065e-06, + "loss": 0.4804, + "step": 727 + }, + { + "epoch": 0.39358442962695983, + "grad_norm": 0.39614206552505493, + "learning_rate": 7.594762930055281e-06, + "loss": 0.4783, + "step": 728 + }, + { + "epoch": 0.3941250675797441, + "grad_norm": 0.4028596580028534, + "learning_rate": 7.586689070888284e-06, + "loss": 0.4848, + "step": 729 + }, + { + "epoch": 0.3946657055325284, + "grad_norm": 0.37662696838378906, + "learning_rate": 7.578605991587974e-06, + "loss": 0.4751, + "step": 730 + }, + { + "epoch": 0.3952063434853127, + "grad_norm": 0.480343222618103, + "learning_rate": 7.570513720966108e-06, + "loss": 0.4842, + "step": 731 + }, + { + "epoch": 0.39574698143809695, + "grad_norm": 0.43461012840270996, + "learning_rate": 7.562412287867214e-06, + "loss": 0.445, + "step": 732 + }, + { + "epoch": 0.39628761939088125, + "grad_norm": 0.37615707516670227, + "learning_rate": 7.5543017211684745e-06, + "loss": 0.5055, + "step": 733 + }, + { + "epoch": 0.3968282573436655, + "grad_norm": 0.3956639766693115, + "learning_rate": 7.5461820497796255e-06, + "loss": 0.4696, + "step": 734 + }, + { + "epoch": 0.3973688952964498, + "grad_norm": 0.5006547570228577, + "learning_rate": 7.5380533026428625e-06, + "loss": 0.5176, + "step": 735 + }, + { + "epoch": 0.3979095332492341, + "grad_norm": 0.37995660305023193, + "learning_rate": 7.529915508732725e-06, + "loss": 0.4962, + "step": 736 + }, + { + "epoch": 0.39845017120201837, + "grad_norm": 0.41486889123916626, + "learning_rate": 7.521768697056004e-06, + "loss": 0.489, + "step": 737 + }, + { + "epoch": 0.3989908091548027, + "grad_norm": 0.49411487579345703, + "learning_rate": 7.513612896651632e-06, + "loss": 0.4543, + "step": 738 + }, + { + "epoch": 0.399531447107587, + "grad_norm": 0.4323795437812805, + "learning_rate": 7.505448136590583e-06, + "loss": 0.4707, + "step": 739 + }, + { + "epoch": 0.40007208506037123, + "grad_norm": 0.3963027596473694, + "learning_rate": 7.497274445975762e-06, + "loss": 0.4838, + "step": 740 + }, + { + "epoch": 0.40061272301315554, + "grad_norm": 0.3969276547431946, + "learning_rate": 7.489091853941914e-06, + "loss": 0.484, + "step": 741 + }, + { + "epoch": 0.4011533609659398, + "grad_norm": 0.43876469135284424, + "learning_rate": 7.480900389655508e-06, + "loss": 0.464, + "step": 742 + }, + { + "epoch": 0.4016939989187241, + "grad_norm": 0.39766424894332886, + "learning_rate": 7.4727000823146386e-06, + "loss": 0.4979, + "step": 743 + }, + { + "epoch": 0.4022346368715084, + "grad_norm": 0.3988225758075714, + "learning_rate": 7.464490961148921e-06, + "loss": 0.4645, + "step": 744 + }, + { + "epoch": 0.40277527482429265, + "grad_norm": 0.5512990951538086, + "learning_rate": 7.4562730554193875e-06, + "loss": 0.4922, + "step": 745 + }, + { + "epoch": 0.40331591277707696, + "grad_norm": 0.4058268368244171, + "learning_rate": 7.448046394418383e-06, + "loss": 0.4796, + "step": 746 + }, + { + "epoch": 0.40385655072986126, + "grad_norm": 0.3984447717666626, + "learning_rate": 7.439811007469457e-06, + "loss": 0.4694, + "step": 747 + }, + { + "epoch": 0.4043971886826455, + "grad_norm": 0.41148069500923157, + "learning_rate": 7.431566923927267e-06, + "loss": 0.4757, + "step": 748 + }, + { + "epoch": 0.4049378266354298, + "grad_norm": 0.429737389087677, + "learning_rate": 7.423314173177467e-06, + "loss": 0.4949, + "step": 749 + }, + { + "epoch": 0.40547846458821407, + "grad_norm": 0.35777562856674194, + "learning_rate": 7.415052784636603e-06, + "loss": 0.4704, + "step": 750 + }, + { + "epoch": 0.4060191025409984, + "grad_norm": 0.40855300426483154, + "learning_rate": 7.406782787752011e-06, + "loss": 0.4648, + "step": 751 + }, + { + "epoch": 0.4065597404937827, + "grad_norm": 0.4542819857597351, + "learning_rate": 7.398504212001714e-06, + "loss": 0.4896, + "step": 752 + }, + { + "epoch": 0.40710037844656694, + "grad_norm": 0.3928791880607605, + "learning_rate": 7.390217086894309e-06, + "loss": 0.5082, + "step": 753 + }, + { + "epoch": 0.40764101639935124, + "grad_norm": 0.39957326650619507, + "learning_rate": 7.3819214419688725e-06, + "loss": 0.4752, + "step": 754 + }, + { + "epoch": 0.4081816543521355, + "grad_norm": 0.4252714216709137, + "learning_rate": 7.373617306794844e-06, + "loss": 0.4828, + "step": 755 + }, + { + "epoch": 0.4087222923049198, + "grad_norm": 0.45451775193214417, + "learning_rate": 7.365304710971928e-06, + "loss": 0.4776, + "step": 756 + }, + { + "epoch": 0.4092629302577041, + "grad_norm": 0.42666348814964294, + "learning_rate": 7.3569836841299905e-06, + "loss": 0.4805, + "step": 757 + }, + { + "epoch": 0.40980356821048836, + "grad_norm": 0.44532275199890137, + "learning_rate": 7.348654255928941e-06, + "loss": 0.4683, + "step": 758 + }, + { + "epoch": 0.41034420616327266, + "grad_norm": 0.540521502494812, + "learning_rate": 7.340316456058644e-06, + "loss": 0.4717, + "step": 759 + }, + { + "epoch": 0.41088484411605697, + "grad_norm": 0.4525167942047119, + "learning_rate": 7.331970314238799e-06, + "loss": 0.4782, + "step": 760 + }, + { + "epoch": 0.4114254820688412, + "grad_norm": 0.44031041860580444, + "learning_rate": 7.323615860218844e-06, + "loss": 0.4947, + "step": 761 + }, + { + "epoch": 0.4119661200216255, + "grad_norm": 0.5076286792755127, + "learning_rate": 7.31525312377784e-06, + "loss": 0.4638, + "step": 762 + }, + { + "epoch": 0.4125067579744098, + "grad_norm": 0.4349268674850464, + "learning_rate": 7.306882134724376e-06, + "loss": 0.4782, + "step": 763 + }, + { + "epoch": 0.4130473959271941, + "grad_norm": 0.44942405819892883, + "learning_rate": 7.298502922896453e-06, + "loss": 0.5115, + "step": 764 + }, + { + "epoch": 0.4135880338799784, + "grad_norm": 0.4894666075706482, + "learning_rate": 7.290115518161385e-06, + "loss": 0.5035, + "step": 765 + }, + { + "epoch": 0.41412867183276264, + "grad_norm": 0.3696083128452301, + "learning_rate": 7.281719950415686e-06, + "loss": 0.4537, + "step": 766 + }, + { + "epoch": 0.41466930978554695, + "grad_norm": 0.40760135650634766, + "learning_rate": 7.273316249584969e-06, + "loss": 0.4418, + "step": 767 + }, + { + "epoch": 0.41520994773833125, + "grad_norm": 0.47130587697029114, + "learning_rate": 7.2649044456238334e-06, + "loss": 0.4972, + "step": 768 + }, + { + "epoch": 0.4157505856911155, + "grad_norm": 0.4279153048992157, + "learning_rate": 7.256484568515769e-06, + "loss": 0.4963, + "step": 769 + }, + { + "epoch": 0.4162912236438998, + "grad_norm": 0.4578527808189392, + "learning_rate": 7.248056648273034e-06, + "loss": 0.4888, + "step": 770 + }, + { + "epoch": 0.41683186159668406, + "grad_norm": 0.42948973178863525, + "learning_rate": 7.239620714936561e-06, + "loss": 0.4877, + "step": 771 + }, + { + "epoch": 0.41737249954946837, + "grad_norm": 0.44241803884506226, + "learning_rate": 7.231176798575843e-06, + "loss": 0.4791, + "step": 772 + }, + { + "epoch": 0.4179131375022527, + "grad_norm": 0.48078736662864685, + "learning_rate": 7.22272492928883e-06, + "loss": 0.4744, + "step": 773 + }, + { + "epoch": 0.4184537754550369, + "grad_norm": 0.4113394618034363, + "learning_rate": 7.214265137201817e-06, + "loss": 0.4807, + "step": 774 + }, + { + "epoch": 0.41899441340782123, + "grad_norm": 0.48756808042526245, + "learning_rate": 7.205797452469341e-06, + "loss": 0.5125, + "step": 775 + }, + { + "epoch": 0.41953505136060554, + "grad_norm": 0.47686880826950073, + "learning_rate": 7.197321905274071e-06, + "loss": 0.4727, + "step": 776 + }, + { + "epoch": 0.4200756893133898, + "grad_norm": 0.42904406785964966, + "learning_rate": 7.188838525826702e-06, + "loss": 0.4668, + "step": 777 + }, + { + "epoch": 0.4206163272661741, + "grad_norm": 0.4416086971759796, + "learning_rate": 7.18034734436585e-06, + "loss": 0.4768, + "step": 778 + }, + { + "epoch": 0.42115696521895835, + "grad_norm": 0.4766569137573242, + "learning_rate": 7.171848391157935e-06, + "loss": 0.4908, + "step": 779 + }, + { + "epoch": 0.42169760317174265, + "grad_norm": 0.39499804377555847, + "learning_rate": 7.163341696497084e-06, + "loss": 0.4706, + "step": 780 + }, + { + "epoch": 0.42223824112452696, + "grad_norm": 0.4001219868659973, + "learning_rate": 7.154827290705012e-06, + "loss": 0.4891, + "step": 781 + }, + { + "epoch": 0.4227788790773112, + "grad_norm": 0.4044910669326782, + "learning_rate": 7.146305204130928e-06, + "loss": 0.4542, + "step": 782 + }, + { + "epoch": 0.4233195170300955, + "grad_norm": 0.3734391927719116, + "learning_rate": 7.137775467151411e-06, + "loss": 0.4741, + "step": 783 + }, + { + "epoch": 0.4238601549828798, + "grad_norm": 0.4248086214065552, + "learning_rate": 7.129238110170315e-06, + "loss": 0.4902, + "step": 784 + }, + { + "epoch": 0.4244007929356641, + "grad_norm": 0.3903302550315857, + "learning_rate": 7.120693163618656e-06, + "loss": 0.4493, + "step": 785 + }, + { + "epoch": 0.4249414308884484, + "grad_norm": 0.412576824426651, + "learning_rate": 7.112140657954495e-06, + "loss": 0.4939, + "step": 786 + }, + { + "epoch": 0.42548206884123263, + "grad_norm": 0.39094048738479614, + "learning_rate": 7.103580623662845e-06, + "loss": 0.4403, + "step": 787 + }, + { + "epoch": 0.42602270679401694, + "grad_norm": 0.4399993419647217, + "learning_rate": 7.0950130912555515e-06, + "loss": 0.4582, + "step": 788 + }, + { + "epoch": 0.42656334474680124, + "grad_norm": 0.41360506415367126, + "learning_rate": 7.086438091271186e-06, + "loss": 0.4824, + "step": 789 + }, + { + "epoch": 0.4271039826995855, + "grad_norm": 0.44524022936820984, + "learning_rate": 7.077855654274939e-06, + "loss": 0.4711, + "step": 790 + }, + { + "epoch": 0.4276446206523698, + "grad_norm": 0.4516412913799286, + "learning_rate": 7.069265810858509e-06, + "loss": 0.4893, + "step": 791 + }, + { + "epoch": 0.4281852586051541, + "grad_norm": 0.4142738878726959, + "learning_rate": 7.0606685916399945e-06, + "loss": 0.4555, + "step": 792 + }, + { + "epoch": 0.42872589655793836, + "grad_norm": 0.4141763746738434, + "learning_rate": 7.052064027263785e-06, + "loss": 0.4387, + "step": 793 + }, + { + "epoch": 0.42926653451072266, + "grad_norm": 0.47192102670669556, + "learning_rate": 7.043452148400452e-06, + "loss": 0.4812, + "step": 794 + }, + { + "epoch": 0.4298071724635069, + "grad_norm": 0.43239060044288635, + "learning_rate": 7.034832985746638e-06, + "loss": 0.503, + "step": 795 + }, + { + "epoch": 0.4303478104162912, + "grad_norm": 0.46152111887931824, + "learning_rate": 7.026206570024949e-06, + "loss": 0.4824, + "step": 796 + }, + { + "epoch": 0.4308884483690755, + "grad_norm": 0.4815097451210022, + "learning_rate": 7.017572931983846e-06, + "loss": 0.4558, + "step": 797 + }, + { + "epoch": 0.4314290863218598, + "grad_norm": 0.4330691397190094, + "learning_rate": 7.00893210239753e-06, + "loss": 0.4493, + "step": 798 + }, + { + "epoch": 0.4319697242746441, + "grad_norm": 0.44782209396362305, + "learning_rate": 7.000284112065836e-06, + "loss": 0.4737, + "step": 799 + }, + { + "epoch": 0.4325103622274284, + "grad_norm": 0.43315109610557556, + "learning_rate": 6.9916289918141265e-06, + "loss": 0.4533, + "step": 800 + }, + { + "epoch": 0.43305100018021264, + "grad_norm": 0.42613768577575684, + "learning_rate": 6.982966772493176e-06, + "loss": 0.4517, + "step": 801 + }, + { + "epoch": 0.43359163813299695, + "grad_norm": 0.39703595638275146, + "learning_rate": 6.974297484979066e-06, + "loss": 0.4651, + "step": 802 + }, + { + "epoch": 0.4341322760857812, + "grad_norm": 0.4882420599460602, + "learning_rate": 6.965621160173066e-06, + "loss": 0.4816, + "step": 803 + }, + { + "epoch": 0.4346729140385655, + "grad_norm": 0.4013866186141968, + "learning_rate": 6.9569378290015375e-06, + "loss": 0.4545, + "step": 804 + }, + { + "epoch": 0.4352135519913498, + "grad_norm": 0.41527172923088074, + "learning_rate": 6.948247522415811e-06, + "loss": 0.4567, + "step": 805 + }, + { + "epoch": 0.43575418994413406, + "grad_norm": 0.4296116530895233, + "learning_rate": 6.939550271392079e-06, + "loss": 0.4499, + "step": 806 + }, + { + "epoch": 0.43629482789691837, + "grad_norm": 0.4318053722381592, + "learning_rate": 6.930846106931292e-06, + "loss": 0.4873, + "step": 807 + }, + { + "epoch": 0.4368354658497027, + "grad_norm": 0.39466047286987305, + "learning_rate": 6.922135060059043e-06, + "loss": 0.4705, + "step": 808 + }, + { + "epoch": 0.4373761038024869, + "grad_norm": 0.4394930601119995, + "learning_rate": 6.913417161825449e-06, + "loss": 0.4816, + "step": 809 + }, + { + "epoch": 0.43791674175527123, + "grad_norm": 0.4361076354980469, + "learning_rate": 6.904692443305059e-06, + "loss": 0.496, + "step": 810 + }, + { + "epoch": 0.4384573797080555, + "grad_norm": 0.4172879755496979, + "learning_rate": 6.895960935596728e-06, + "loss": 0.49, + "step": 811 + }, + { + "epoch": 0.4389980176608398, + "grad_norm": 0.4652804732322693, + "learning_rate": 6.8872226698235065e-06, + "loss": 0.468, + "step": 812 + }, + { + "epoch": 0.4395386556136241, + "grad_norm": 0.39598575234413147, + "learning_rate": 6.8784776771325426e-06, + "loss": 0.4487, + "step": 813 + }, + { + "epoch": 0.44007929356640835, + "grad_norm": 0.5070300102233887, + "learning_rate": 6.869725988694955e-06, + "loss": 0.453, + "step": 814 + }, + { + "epoch": 0.44061993151919265, + "grad_norm": 0.4058806598186493, + "learning_rate": 6.860967635705732e-06, + "loss": 0.4652, + "step": 815 + }, + { + "epoch": 0.44116056947197696, + "grad_norm": 0.4511767029762268, + "learning_rate": 6.8522026493836144e-06, + "loss": 0.4801, + "step": 816 + }, + { + "epoch": 0.4417012074247612, + "grad_norm": 0.435837984085083, + "learning_rate": 6.843431060970995e-06, + "loss": 0.4772, + "step": 817 + }, + { + "epoch": 0.4422418453775455, + "grad_norm": 0.4215855002403259, + "learning_rate": 6.834652901733789e-06, + "loss": 0.4633, + "step": 818 + }, + { + "epoch": 0.44278248333032977, + "grad_norm": 0.43643590807914734, + "learning_rate": 6.825868202961343e-06, + "loss": 0.4891, + "step": 819 + }, + { + "epoch": 0.4433231212831141, + "grad_norm": 0.42145484685897827, + "learning_rate": 6.8170769959663045e-06, + "loss": 0.4537, + "step": 820 + }, + { + "epoch": 0.4438637592358984, + "grad_norm": 0.44420287013053894, + "learning_rate": 6.808279312084525e-06, + "loss": 0.4894, + "step": 821 + }, + { + "epoch": 0.44440439718868263, + "grad_norm": 0.4486287534236908, + "learning_rate": 6.799475182674942e-06, + "loss": 0.4629, + "step": 822 + }, + { + "epoch": 0.44494503514146694, + "grad_norm": 0.4760923385620117, + "learning_rate": 6.790664639119464e-06, + "loss": 0.4876, + "step": 823 + }, + { + "epoch": 0.4454856730942512, + "grad_norm": 0.46989408135414124, + "learning_rate": 6.781847712822869e-06, + "loss": 0.4719, + "step": 824 + }, + { + "epoch": 0.4460263110470355, + "grad_norm": 0.46125340461730957, + "learning_rate": 6.773024435212678e-06, + "loss": 0.4733, + "step": 825 + }, + { + "epoch": 0.4465669489998198, + "grad_norm": 0.45252951979637146, + "learning_rate": 6.76419483773906e-06, + "loss": 0.4937, + "step": 826 + }, + { + "epoch": 0.44710758695260405, + "grad_norm": 0.40651631355285645, + "learning_rate": 6.755358951874701e-06, + "loss": 0.4674, + "step": 827 + }, + { + "epoch": 0.44764822490538836, + "grad_norm": 0.5222921967506409, + "learning_rate": 6.7465168091147094e-06, + "loss": 0.4767, + "step": 828 + }, + { + "epoch": 0.44818886285817267, + "grad_norm": 0.34470462799072266, + "learning_rate": 6.737668440976494e-06, + "loss": 0.4687, + "step": 829 + }, + { + "epoch": 0.4487295008109569, + "grad_norm": 0.5007984042167664, + "learning_rate": 6.728813878999652e-06, + "loss": 0.5117, + "step": 830 + }, + { + "epoch": 0.4492701387637412, + "grad_norm": 0.43586406111717224, + "learning_rate": 6.719953154745857e-06, + "loss": 0.4643, + "step": 831 + }, + { + "epoch": 0.4498107767165255, + "grad_norm": 0.3484852910041809, + "learning_rate": 6.7110862997987525e-06, + "loss": 0.4711, + "step": 832 + }, + { + "epoch": 0.4503514146693098, + "grad_norm": 0.5028504133224487, + "learning_rate": 6.70221334576383e-06, + "loss": 0.4679, + "step": 833 + }, + { + "epoch": 0.4508920526220941, + "grad_norm": 0.4104493260383606, + "learning_rate": 6.693334324268328e-06, + "loss": 0.4639, + "step": 834 + }, + { + "epoch": 0.45143269057487834, + "grad_norm": 0.40598517656326294, + "learning_rate": 6.684449266961101e-06, + "loss": 0.4794, + "step": 835 + }, + { + "epoch": 0.45197332852766264, + "grad_norm": 0.3934710621833801, + "learning_rate": 6.675558205512527e-06, + "loss": 0.4734, + "step": 836 + }, + { + "epoch": 0.45251396648044695, + "grad_norm": 0.39573705196380615, + "learning_rate": 6.666661171614382e-06, + "loss": 0.4828, + "step": 837 + }, + { + "epoch": 0.4530546044332312, + "grad_norm": 0.39194008708000183, + "learning_rate": 6.657758196979732e-06, + "loss": 0.4396, + "step": 838 + }, + { + "epoch": 0.4535952423860155, + "grad_norm": 0.38524988293647766, + "learning_rate": 6.648849313342816e-06, + "loss": 0.5242, + "step": 839 + }, + { + "epoch": 0.45413588033879976, + "grad_norm": 0.4037633538246155, + "learning_rate": 6.6399345524589366e-06, + "loss": 0.4575, + "step": 840 + }, + { + "epoch": 0.45467651829158406, + "grad_norm": 0.3950909376144409, + "learning_rate": 6.631013946104348e-06, + "loss": 0.4784, + "step": 841 + }, + { + "epoch": 0.45521715624436837, + "grad_norm": 0.3955569565296173, + "learning_rate": 6.622087526076135e-06, + "loss": 0.4869, + "step": 842 + }, + { + "epoch": 0.4557577941971526, + "grad_norm": 0.425361692905426, + "learning_rate": 6.613155324192111e-06, + "loss": 0.4649, + "step": 843 + }, + { + "epoch": 0.45629843214993693, + "grad_norm": 0.41891539096832275, + "learning_rate": 6.604217372290693e-06, + "loss": 0.5171, + "step": 844 + }, + { + "epoch": 0.45683907010272123, + "grad_norm": 0.41283082962036133, + "learning_rate": 6.5952737022308e-06, + "loss": 0.4919, + "step": 845 + }, + { + "epoch": 0.4573797080555055, + "grad_norm": 0.49672916531562805, + "learning_rate": 6.586324345891727e-06, + "loss": 0.472, + "step": 846 + }, + { + "epoch": 0.4579203460082898, + "grad_norm": 0.4132554829120636, + "learning_rate": 6.57736933517304e-06, + "loss": 0.4787, + "step": 847 + }, + { + "epoch": 0.45846098396107404, + "grad_norm": 0.4273938834667206, + "learning_rate": 6.568408701994459e-06, + "loss": 0.4754, + "step": 848 + }, + { + "epoch": 0.45900162191385835, + "grad_norm": 0.47689104080200195, + "learning_rate": 6.559442478295745e-06, + "loss": 0.4945, + "step": 849 + }, + { + "epoch": 0.45954225986664266, + "grad_norm": 0.4251175820827484, + "learning_rate": 6.550470696036591e-06, + "loss": 0.4706, + "step": 850 + }, + { + "epoch": 0.4600828978194269, + "grad_norm": 0.5029627680778503, + "learning_rate": 6.541493387196496e-06, + "loss": 0.464, + "step": 851 + }, + { + "epoch": 0.4606235357722112, + "grad_norm": 0.43976134061813354, + "learning_rate": 6.5325105837746604e-06, + "loss": 0.4781, + "step": 852 + }, + { + "epoch": 0.4611641737249955, + "grad_norm": 0.45708853006362915, + "learning_rate": 6.523522317789874e-06, + "loss": 0.4608, + "step": 853 + }, + { + "epoch": 0.46170481167777977, + "grad_norm": 0.47303131222724915, + "learning_rate": 6.514528621280391e-06, + "loss": 0.5097, + "step": 854 + }, + { + "epoch": 0.4622454496305641, + "grad_norm": 0.4358423948287964, + "learning_rate": 6.5055295263038286e-06, + "loss": 0.4851, + "step": 855 + }, + { + "epoch": 0.4627860875833483, + "grad_norm": 0.41981184482574463, + "learning_rate": 6.496525064937042e-06, + "loss": 0.5024, + "step": 856 + }, + { + "epoch": 0.46332672553613263, + "grad_norm": 0.4504159688949585, + "learning_rate": 6.487515269276015e-06, + "loss": 0.468, + "step": 857 + }, + { + "epoch": 0.46386736348891694, + "grad_norm": 0.5347965955734253, + "learning_rate": 6.478500171435751e-06, + "loss": 0.4995, + "step": 858 + }, + { + "epoch": 0.4644080014417012, + "grad_norm": 0.39357173442840576, + "learning_rate": 6.469479803550144e-06, + "loss": 0.4734, + "step": 859 + }, + { + "epoch": 0.4649486393944855, + "grad_norm": 0.38938194513320923, + "learning_rate": 6.460454197771881e-06, + "loss": 0.5136, + "step": 860 + }, + { + "epoch": 0.4654892773472698, + "grad_norm": 0.47099724411964417, + "learning_rate": 6.451423386272312e-06, + "loss": 0.4934, + "step": 861 + }, + { + "epoch": 0.46602991530005405, + "grad_norm": 0.4051066040992737, + "learning_rate": 6.442387401241349e-06, + "loss": 0.4642, + "step": 862 + }, + { + "epoch": 0.46657055325283836, + "grad_norm": 0.43694064021110535, + "learning_rate": 6.433346274887341e-06, + "loss": 0.484, + "step": 863 + }, + { + "epoch": 0.4671111912056226, + "grad_norm": 0.42491498589515686, + "learning_rate": 6.4243000394369626e-06, + "loss": 0.4813, + "step": 864 + }, + { + "epoch": 0.4676518291584069, + "grad_norm": 0.40995004773139954, + "learning_rate": 6.415248727135103e-06, + "loss": 0.446, + "step": 865 + }, + { + "epoch": 0.4681924671111912, + "grad_norm": 0.440571129322052, + "learning_rate": 6.406192370244742e-06, + "loss": 0.4958, + "step": 866 + }, + { + "epoch": 0.4687331050639755, + "grad_norm": 0.43054115772247314, + "learning_rate": 6.397131001046849e-06, + "loss": 0.4953, + "step": 867 + }, + { + "epoch": 0.4692737430167598, + "grad_norm": 0.4248604476451874, + "learning_rate": 6.38806465184025e-06, + "loss": 0.4782, + "step": 868 + }, + { + "epoch": 0.4698143809695441, + "grad_norm": 0.3913900852203369, + "learning_rate": 6.378993354941529e-06, + "loss": 0.4648, + "step": 869 + }, + { + "epoch": 0.47035501892232834, + "grad_norm": 0.41105592250823975, + "learning_rate": 6.3699171426849036e-06, + "loss": 0.4961, + "step": 870 + }, + { + "epoch": 0.47089565687511264, + "grad_norm": 0.3705733120441437, + "learning_rate": 6.3608360474221106e-06, + "loss": 0.4559, + "step": 871 + }, + { + "epoch": 0.4714362948278969, + "grad_norm": 0.3702208995819092, + "learning_rate": 6.3517501015222924e-06, + "loss": 0.4727, + "step": 872 + }, + { + "epoch": 0.4719769327806812, + "grad_norm": 0.3946548104286194, + "learning_rate": 6.342659337371884e-06, + "loss": 0.4848, + "step": 873 + }, + { + "epoch": 0.4725175707334655, + "grad_norm": 0.3584488034248352, + "learning_rate": 6.333563787374493e-06, + "loss": 0.458, + "step": 874 + }, + { + "epoch": 0.47305820868624976, + "grad_norm": 0.36788806319236755, + "learning_rate": 6.3244634839507834e-06, + "loss": 0.4674, + "step": 875 + }, + { + "epoch": 0.47359884663903407, + "grad_norm": 0.3621140420436859, + "learning_rate": 6.315358459538367e-06, + "loss": 0.4718, + "step": 876 + }, + { + "epoch": 0.47413948459181837, + "grad_norm": 0.375000923871994, + "learning_rate": 6.3062487465916825e-06, + "loss": 0.4812, + "step": 877 + }, + { + "epoch": 0.4746801225446026, + "grad_norm": 0.44487372040748596, + "learning_rate": 6.297134377581877e-06, + "loss": 0.4746, + "step": 878 + }, + { + "epoch": 0.47522076049738693, + "grad_norm": 0.3941539525985718, + "learning_rate": 6.2880153849966966e-06, + "loss": 0.4762, + "step": 879 + }, + { + "epoch": 0.4757613984501712, + "grad_norm": 0.4312276244163513, + "learning_rate": 6.2788918013403695e-06, + "loss": 0.4952, + "step": 880 + }, + { + "epoch": 0.4763020364029555, + "grad_norm": 0.41254860162734985, + "learning_rate": 6.269763659133486e-06, + "loss": 0.4963, + "step": 881 + }, + { + "epoch": 0.4768426743557398, + "grad_norm": 0.40565529465675354, + "learning_rate": 6.2606309909128845e-06, + "loss": 0.4787, + "step": 882 + }, + { + "epoch": 0.47738331230852404, + "grad_norm": 0.4065866470336914, + "learning_rate": 6.251493829231539e-06, + "loss": 0.4723, + "step": 883 + }, + { + "epoch": 0.47792395026130835, + "grad_norm": 0.43057599663734436, + "learning_rate": 6.24235220665844e-06, + "loss": 0.4792, + "step": 884 + }, + { + "epoch": 0.47846458821409266, + "grad_norm": 0.3824271857738495, + "learning_rate": 6.233206155778476e-06, + "loss": 0.4817, + "step": 885 + }, + { + "epoch": 0.4790052261668769, + "grad_norm": 0.46750858426094055, + "learning_rate": 6.224055709192323e-06, + "loss": 0.4613, + "step": 886 + }, + { + "epoch": 0.4795458641196612, + "grad_norm": 0.45582181215286255, + "learning_rate": 6.21490089951632e-06, + "loss": 0.4958, + "step": 887 + }, + { + "epoch": 0.48008650207244546, + "grad_norm": 0.4065682291984558, + "learning_rate": 6.205741759382365e-06, + "loss": 0.4928, + "step": 888 + }, + { + "epoch": 0.48062714002522977, + "grad_norm": 0.4958328604698181, + "learning_rate": 6.1965783214377895e-06, + "loss": 0.4648, + "step": 889 + }, + { + "epoch": 0.4811677779780141, + "grad_norm": 0.38294878602027893, + "learning_rate": 6.187410618345241e-06, + "loss": 0.4905, + "step": 890 + }, + { + "epoch": 0.48170841593079833, + "grad_norm": 0.471400648355484, + "learning_rate": 6.178238682782574e-06, + "loss": 0.4835, + "step": 891 + }, + { + "epoch": 0.48224905388358263, + "grad_norm": 0.5207462310791016, + "learning_rate": 6.169062547442724e-06, + "loss": 0.4912, + "step": 892 + }, + { + "epoch": 0.4827896918363669, + "grad_norm": 0.4611525535583496, + "learning_rate": 6.159882245033606e-06, + "loss": 0.5036, + "step": 893 + }, + { + "epoch": 0.4833303297891512, + "grad_norm": 0.4598880708217621, + "learning_rate": 6.150697808277979e-06, + "loss": 0.4472, + "step": 894 + }, + { + "epoch": 0.4838709677419355, + "grad_norm": 0.4637363851070404, + "learning_rate": 6.141509269913343e-06, + "loss": 0.4849, + "step": 895 + }, + { + "epoch": 0.48441160569471975, + "grad_norm": 0.5085048675537109, + "learning_rate": 6.132316662691815e-06, + "loss": 0.467, + "step": 896 + }, + { + "epoch": 0.48495224364750406, + "grad_norm": 0.4294266104698181, + "learning_rate": 6.123120019380021e-06, + "loss": 0.4781, + "step": 897 + }, + { + "epoch": 0.48549288160028836, + "grad_norm": 0.47353968024253845, + "learning_rate": 6.1139193727589665e-06, + "loss": 0.4969, + "step": 898 + }, + { + "epoch": 0.4860335195530726, + "grad_norm": 0.5444623827934265, + "learning_rate": 6.1047147556239325e-06, + "loss": 0.4863, + "step": 899 + }, + { + "epoch": 0.4865741575058569, + "grad_norm": 0.39669960737228394, + "learning_rate": 6.095506200784349e-06, + "loss": 0.485, + "step": 900 + }, + { + "epoch": 0.48711479545864117, + "grad_norm": 0.5122884511947632, + "learning_rate": 6.086293741063685e-06, + "loss": 0.4742, + "step": 901 + }, + { + "epoch": 0.4876554334114255, + "grad_norm": 0.4148077070713043, + "learning_rate": 6.077077409299323e-06, + "loss": 0.4846, + "step": 902 + }, + { + "epoch": 0.4881960713642098, + "grad_norm": 0.3915378153324127, + "learning_rate": 6.067857238342451e-06, + "loss": 0.4665, + "step": 903 + }, + { + "epoch": 0.48873670931699403, + "grad_norm": 0.47468093037605286, + "learning_rate": 6.058633261057945e-06, + "loss": 0.5034, + "step": 904 + }, + { + "epoch": 0.48927734726977834, + "grad_norm": 0.3712773323059082, + "learning_rate": 6.049405510324237e-06, + "loss": 0.4721, + "step": 905 + }, + { + "epoch": 0.48981798522256265, + "grad_norm": 0.4807882010936737, + "learning_rate": 6.040174019033226e-06, + "loss": 0.4779, + "step": 906 + }, + { + "epoch": 0.4903586231753469, + "grad_norm": 0.4945826530456543, + "learning_rate": 6.030938820090128e-06, + "loss": 0.4796, + "step": 907 + }, + { + "epoch": 0.4908992611281312, + "grad_norm": 0.41042837500572205, + "learning_rate": 6.021699946413384e-06, + "loss": 0.491, + "step": 908 + }, + { + "epoch": 0.49143989908091545, + "grad_norm": 0.5063190460205078, + "learning_rate": 6.012457430934532e-06, + "loss": 0.464, + "step": 909 + }, + { + "epoch": 0.49198053703369976, + "grad_norm": 0.4003697335720062, + "learning_rate": 6.003211306598089e-06, + "loss": 0.4639, + "step": 910 + }, + { + "epoch": 0.49252117498648407, + "grad_norm": 0.4417046010494232, + "learning_rate": 5.993961606361436e-06, + "loss": 0.4691, + "step": 911 + }, + { + "epoch": 0.4930618129392683, + "grad_norm": 0.45925286412239075, + "learning_rate": 5.984708363194702e-06, + "loss": 0.4819, + "step": 912 + }, + { + "epoch": 0.4936024508920526, + "grad_norm": 0.44486504793167114, + "learning_rate": 5.975451610080643e-06, + "loss": 0.5092, + "step": 913 + }, + { + "epoch": 0.49414308884483693, + "grad_norm": 0.409686803817749, + "learning_rate": 5.966191380014524e-06, + "loss": 0.4812, + "step": 914 + }, + { + "epoch": 0.4946837267976212, + "grad_norm": 0.4472230076789856, + "learning_rate": 5.956927706004012e-06, + "loss": 0.4983, + "step": 915 + }, + { + "epoch": 0.4952243647504055, + "grad_norm": 0.5017169117927551, + "learning_rate": 5.947660621069038e-06, + "loss": 0.474, + "step": 916 + }, + { + "epoch": 0.49576500270318974, + "grad_norm": 0.4999132752418518, + "learning_rate": 5.938390158241701e-06, + "loss": 0.4736, + "step": 917 + }, + { + "epoch": 0.49630564065597405, + "grad_norm": 0.42812034487724304, + "learning_rate": 5.929116350566132e-06, + "loss": 0.4802, + "step": 918 + }, + { + "epoch": 0.49684627860875835, + "grad_norm": 0.3980429470539093, + "learning_rate": 5.919839231098392e-06, + "loss": 0.4909, + "step": 919 + }, + { + "epoch": 0.4973869165615426, + "grad_norm": 0.4948299527168274, + "learning_rate": 5.910558832906341e-06, + "loss": 0.4603, + "step": 920 + }, + { + "epoch": 0.4979275545143269, + "grad_norm": 0.3767414689064026, + "learning_rate": 5.90127518906953e-06, + "loss": 0.4828, + "step": 921 + }, + { + "epoch": 0.4984681924671112, + "grad_norm": 0.4088006019592285, + "learning_rate": 5.891988332679075e-06, + "loss": 0.4912, + "step": 922 + }, + { + "epoch": 0.49900883041989547, + "grad_norm": 0.40145042538642883, + "learning_rate": 5.882698296837549e-06, + "loss": 0.4714, + "step": 923 + }, + { + "epoch": 0.4995494683726798, + "grad_norm": 0.3793385922908783, + "learning_rate": 5.87340511465885e-06, + "loss": 0.4727, + "step": 924 + }, + { + "epoch": 0.500090106325464, + "grad_norm": 0.40813931822776794, + "learning_rate": 5.864108819268098e-06, + "loss": 0.5169, + "step": 925 + }, + { + "epoch": 0.5006307442782484, + "grad_norm": 0.40652281045913696, + "learning_rate": 5.8548094438015065e-06, + "loss": 0.4698, + "step": 926 + }, + { + "epoch": 0.5011713822310326, + "grad_norm": 0.37487828731536865, + "learning_rate": 5.8455070214062685e-06, + "loss": 0.4714, + "step": 927 + }, + { + "epoch": 0.5017120201838169, + "grad_norm": 0.36619752645492554, + "learning_rate": 5.8362015852404365e-06, + "loss": 0.4425, + "step": 928 + }, + { + "epoch": 0.5022526581366011, + "grad_norm": 0.4049976170063019, + "learning_rate": 5.826893168472807e-06, + "loss": 0.4801, + "step": 929 + }, + { + "epoch": 0.5027932960893855, + "grad_norm": 0.36154064536094666, + "learning_rate": 5.8175818042828e-06, + "loss": 0.4694, + "step": 930 + }, + { + "epoch": 0.5033339340421698, + "grad_norm": 0.3709295690059662, + "learning_rate": 5.808267525860343e-06, + "loss": 0.4757, + "step": 931 + }, + { + "epoch": 0.503874571994954, + "grad_norm": 0.4130777418613434, + "learning_rate": 5.798950366405748e-06, + "loss": 0.4681, + "step": 932 + }, + { + "epoch": 0.5044152099477384, + "grad_norm": 0.42704373598098755, + "learning_rate": 5.789630359129599e-06, + "loss": 0.5039, + "step": 933 + }, + { + "epoch": 0.5049558479005226, + "grad_norm": 0.398743212223053, + "learning_rate": 5.780307537252629e-06, + "loss": 0.4582, + "step": 934 + }, + { + "epoch": 0.5054964858533069, + "grad_norm": 0.37964576482772827, + "learning_rate": 5.770981934005606e-06, + "loss": 0.4758, + "step": 935 + }, + { + "epoch": 0.5060371238060912, + "grad_norm": 0.4298136234283447, + "learning_rate": 5.76165358262921e-06, + "loss": 0.4637, + "step": 936 + }, + { + "epoch": 0.5065777617588755, + "grad_norm": 0.3951083719730377, + "learning_rate": 5.752322516373916e-06, + "loss": 0.4877, + "step": 937 + }, + { + "epoch": 0.5071183997116597, + "grad_norm": 0.4326460063457489, + "learning_rate": 5.742988768499879e-06, + "loss": 0.4555, + "step": 938 + }, + { + "epoch": 0.5076590376644441, + "grad_norm": 0.3702709376811981, + "learning_rate": 5.733652372276809e-06, + "loss": 0.495, + "step": 939 + }, + { + "epoch": 0.5081996756172283, + "grad_norm": 0.41388028860092163, + "learning_rate": 5.724313360983859e-06, + "loss": 0.4855, + "step": 940 + }, + { + "epoch": 0.5087403135700126, + "grad_norm": 0.40203750133514404, + "learning_rate": 5.7149717679095026e-06, + "loss": 0.4885, + "step": 941 + }, + { + "epoch": 0.509280951522797, + "grad_norm": 0.39383724331855774, + "learning_rate": 5.705627626351415e-06, + "loss": 0.4634, + "step": 942 + }, + { + "epoch": 0.5098215894755812, + "grad_norm": 0.37974879145622253, + "learning_rate": 5.6962809696163536e-06, + "loss": 0.4613, + "step": 943 + }, + { + "epoch": 0.5103622274283655, + "grad_norm": 0.44113990664482117, + "learning_rate": 5.686931831020044e-06, + "loss": 0.4702, + "step": 944 + }, + { + "epoch": 0.5109028653811497, + "grad_norm": 0.3676075339317322, + "learning_rate": 5.6775802438870596e-06, + "loss": 0.4384, + "step": 945 + }, + { + "epoch": 0.5114435033339341, + "grad_norm": 0.3680996894836426, + "learning_rate": 5.668226241550698e-06, + "loss": 0.4823, + "step": 946 + }, + { + "epoch": 0.5119841412867183, + "grad_norm": 0.37592819333076477, + "learning_rate": 5.658869857352866e-06, + "loss": 0.4891, + "step": 947 + }, + { + "epoch": 0.5125247792395026, + "grad_norm": 0.38988274335861206, + "learning_rate": 5.649511124643962e-06, + "loss": 0.5056, + "step": 948 + }, + { + "epoch": 0.5130654171922869, + "grad_norm": 0.3609144687652588, + "learning_rate": 5.640150076782755e-06, + "loss": 0.4541, + "step": 949 + }, + { + "epoch": 0.5136060551450712, + "grad_norm": 0.41311243176460266, + "learning_rate": 5.630786747136269e-06, + "loss": 0.4958, + "step": 950 + }, + { + "epoch": 0.5141466930978554, + "grad_norm": 0.3682233393192291, + "learning_rate": 5.621421169079655e-06, + "loss": 0.4919, + "step": 951 + }, + { + "epoch": 0.5146873310506398, + "grad_norm": 0.36114174127578735, + "learning_rate": 5.612053375996082e-06, + "loss": 0.4825, + "step": 952 + }, + { + "epoch": 0.515227969003424, + "grad_norm": 0.40716037154197693, + "learning_rate": 5.6026834012766155e-06, + "loss": 0.4529, + "step": 953 + }, + { + "epoch": 0.5157686069562083, + "grad_norm": 0.40229591727256775, + "learning_rate": 5.593311278320097e-06, + "loss": 0.4857, + "step": 954 + }, + { + "epoch": 0.5163092449089927, + "grad_norm": 0.3767363727092743, + "learning_rate": 5.583937040533023e-06, + "loss": 0.4939, + "step": 955 + }, + { + "epoch": 0.5168498828617769, + "grad_norm": 0.3691665232181549, + "learning_rate": 5.574560721329431e-06, + "loss": 0.4705, + "step": 956 + }, + { + "epoch": 0.5173905208145612, + "grad_norm": 0.38738739490509033, + "learning_rate": 5.565182354130776e-06, + "loss": 0.4676, + "step": 957 + }, + { + "epoch": 0.5179311587673455, + "grad_norm": 0.370933473110199, + "learning_rate": 5.555801972365812e-06, + "loss": 0.4713, + "step": 958 + }, + { + "epoch": 0.5184717967201298, + "grad_norm": 0.38803374767303467, + "learning_rate": 5.5464196094704745e-06, + "loss": 0.4836, + "step": 959 + }, + { + "epoch": 0.519012434672914, + "grad_norm": 0.396415114402771, + "learning_rate": 5.537035298887764e-06, + "loss": 0.4738, + "step": 960 + }, + { + "epoch": 0.5195530726256983, + "grad_norm": 0.4398602247238159, + "learning_rate": 5.527649074067618e-06, + "loss": 0.5144, + "step": 961 + }, + { + "epoch": 0.5200937105784826, + "grad_norm": 0.3995663821697235, + "learning_rate": 5.5182609684668024e-06, + "loss": 0.5048, + "step": 962 + }, + { + "epoch": 0.5206343485312669, + "grad_norm": 0.41245123744010925, + "learning_rate": 5.508871015548781e-06, + "loss": 0.4697, + "step": 963 + }, + { + "epoch": 0.5211749864840511, + "grad_norm": 0.3744891285896301, + "learning_rate": 5.49947924878361e-06, + "loss": 0.4669, + "step": 964 + }, + { + "epoch": 0.5217156244368355, + "grad_norm": 0.42122045159339905, + "learning_rate": 5.490085701647805e-06, + "loss": 0.4859, + "step": 965 + }, + { + "epoch": 0.5222562623896198, + "grad_norm": 0.3975967466831207, + "learning_rate": 5.480690407624227e-06, + "loss": 0.4549, + "step": 966 + }, + { + "epoch": 0.522796900342404, + "grad_norm": 0.41702109575271606, + "learning_rate": 5.47129340020197e-06, + "loss": 0.4944, + "step": 967 + }, + { + "epoch": 0.5233375382951884, + "grad_norm": 0.4080027937889099, + "learning_rate": 5.461894712876228e-06, + "loss": 0.4705, + "step": 968 + }, + { + "epoch": 0.5238781762479726, + "grad_norm": 0.42316797375679016, + "learning_rate": 5.45249437914819e-06, + "loss": 0.4567, + "step": 969 + }, + { + "epoch": 0.5244188142007569, + "grad_norm": 0.3980869948863983, + "learning_rate": 5.443092432524906e-06, + "loss": 0.4738, + "step": 970 + }, + { + "epoch": 0.5249594521535412, + "grad_norm": 0.42186301946640015, + "learning_rate": 5.433688906519183e-06, + "loss": 0.4854, + "step": 971 + }, + { + "epoch": 0.5255000901063255, + "grad_norm": 0.3881399631500244, + "learning_rate": 5.424283834649451e-06, + "loss": 0.4716, + "step": 972 + }, + { + "epoch": 0.5260407280591097, + "grad_norm": 0.4174569249153137, + "learning_rate": 5.414877250439654e-06, + "loss": 0.4897, + "step": 973 + }, + { + "epoch": 0.526581366011894, + "grad_norm": 0.44264698028564453, + "learning_rate": 5.405469187419126e-06, + "loss": 0.4815, + "step": 974 + }, + { + "epoch": 0.5271220039646783, + "grad_norm": 0.32237735390663147, + "learning_rate": 5.39605967912247e-06, + "loss": 0.4508, + "step": 975 + }, + { + "epoch": 0.5276626419174626, + "grad_norm": 0.47621771693229675, + "learning_rate": 5.386648759089441e-06, + "loss": 0.5093, + "step": 976 + }, + { + "epoch": 0.5282032798702468, + "grad_norm": 0.4255550801753998, + "learning_rate": 5.3772364608648304e-06, + "loss": 0.5067, + "step": 977 + }, + { + "epoch": 0.5287439178230312, + "grad_norm": 0.35732802748680115, + "learning_rate": 5.367822817998338e-06, + "loss": 0.4611, + "step": 978 + }, + { + "epoch": 0.5292845557758155, + "grad_norm": 0.3863273859024048, + "learning_rate": 5.358407864044456e-06, + "loss": 0.4833, + "step": 979 + }, + { + "epoch": 0.5298251937285997, + "grad_norm": 0.3596065938472748, + "learning_rate": 5.348991632562355e-06, + "loss": 0.5005, + "step": 980 + }, + { + "epoch": 0.5303658316813841, + "grad_norm": 0.39100930094718933, + "learning_rate": 5.339574157115752e-06, + "loss": 0.4815, + "step": 981 + }, + { + "epoch": 0.5309064696341683, + "grad_norm": 0.38288214802742004, + "learning_rate": 5.330155471272804e-06, + "loss": 0.4522, + "step": 982 + }, + { + "epoch": 0.5314471075869526, + "grad_norm": 0.3723912835121155, + "learning_rate": 5.320735608605979e-06, + "loss": 0.4618, + "step": 983 + }, + { + "epoch": 0.5319877455397369, + "grad_norm": 0.3716428279876709, + "learning_rate": 5.311314602691943e-06, + "loss": 0.4565, + "step": 984 + }, + { + "epoch": 0.5325283834925212, + "grad_norm": 0.4149164855480194, + "learning_rate": 5.301892487111431e-06, + "loss": 0.4597, + "step": 985 + }, + { + "epoch": 0.5330690214453054, + "grad_norm": 0.45434221625328064, + "learning_rate": 5.292469295449141e-06, + "loss": 0.472, + "step": 986 + }, + { + "epoch": 0.5336096593980898, + "grad_norm": 0.35414353013038635, + "learning_rate": 5.2830450612936e-06, + "loss": 0.4724, + "step": 987 + }, + { + "epoch": 0.534150297350874, + "grad_norm": 0.41073864698410034, + "learning_rate": 5.273619818237058e-06, + "loss": 0.4936, + "step": 988 + }, + { + "epoch": 0.5346909353036583, + "grad_norm": 0.3850623667240143, + "learning_rate": 5.264193599875353e-06, + "loss": 0.4932, + "step": 989 + }, + { + "epoch": 0.5352315732564425, + "grad_norm": 0.3790433406829834, + "learning_rate": 5.254766439807807e-06, + "loss": 0.4615, + "step": 990 + }, + { + "epoch": 0.5357722112092269, + "grad_norm": 0.3931194841861725, + "learning_rate": 5.245338371637091e-06, + "loss": 0.4915, + "step": 991 + }, + { + "epoch": 0.5363128491620112, + "grad_norm": 0.36426377296447754, + "learning_rate": 5.235909428969119e-06, + "loss": 0.4598, + "step": 992 + }, + { + "epoch": 0.5368534871147954, + "grad_norm": 0.381425142288208, + "learning_rate": 5.226479645412923e-06, + "loss": 0.4681, + "step": 993 + }, + { + "epoch": 0.5373941250675798, + "grad_norm": 0.38973844051361084, + "learning_rate": 5.2170490545805255e-06, + "loss": 0.4858, + "step": 994 + }, + { + "epoch": 0.537934763020364, + "grad_norm": 0.35494348406791687, + "learning_rate": 5.207617690086831e-06, + "loss": 0.4866, + "step": 995 + }, + { + "epoch": 0.5384754009731483, + "grad_norm": 0.38420799374580383, + "learning_rate": 5.1981855855495035e-06, + "loss": 0.4588, + "step": 996 + }, + { + "epoch": 0.5390160389259326, + "grad_norm": 0.4316636621952057, + "learning_rate": 5.188752774588841e-06, + "loss": 0.4792, + "step": 997 + }, + { + "epoch": 0.5395566768787169, + "grad_norm": 0.3582872748374939, + "learning_rate": 5.179319290827661e-06, + "loss": 0.4911, + "step": 998 + }, + { + "epoch": 0.5400973148315011, + "grad_norm": 0.38559308648109436, + "learning_rate": 5.16988516789118e-06, + "loss": 0.4771, + "step": 999 + }, + { + "epoch": 0.5406379527842855, + "grad_norm": 0.4075615406036377, + "learning_rate": 5.16045043940689e-06, + "loss": 0.494, + "step": 1000 + }, + { + "epoch": 0.5411785907370698, + "grad_norm": 0.39124950766563416, + "learning_rate": 5.151015139004445e-06, + "loss": 0.4745, + "step": 1001 + }, + { + "epoch": 0.541719228689854, + "grad_norm": 0.42799654603004456, + "learning_rate": 5.141579300315536e-06, + "loss": 0.4366, + "step": 1002 + }, + { + "epoch": 0.5422598666426384, + "grad_norm": 0.4033052921295166, + "learning_rate": 5.132142956973773e-06, + "loss": 0.4618, + "step": 1003 + }, + { + "epoch": 0.5428005045954226, + "grad_norm": 0.41250860691070557, + "learning_rate": 5.122706142614562e-06, + "loss": 0.4412, + "step": 1004 + }, + { + "epoch": 0.5433411425482069, + "grad_norm": 0.37778806686401367, + "learning_rate": 5.113268890874994e-06, + "loss": 0.4824, + "step": 1005 + }, + { + "epoch": 0.5438817805009911, + "grad_norm": 0.39247915148735046, + "learning_rate": 5.103831235393714e-06, + "loss": 0.4853, + "step": 1006 + }, + { + "epoch": 0.5444224184537755, + "grad_norm": 0.3986344039440155, + "learning_rate": 5.094393209810806e-06, + "loss": 0.4623, + "step": 1007 + }, + { + "epoch": 0.5449630564065597, + "grad_norm": 0.3868162930011749, + "learning_rate": 5.084954847767677e-06, + "loss": 0.4603, + "step": 1008 + }, + { + "epoch": 0.545503694359344, + "grad_norm": 0.39990749955177307, + "learning_rate": 5.07551618290693e-06, + "loss": 0.4866, + "step": 1009 + }, + { + "epoch": 0.5460443323121283, + "grad_norm": 0.3816811442375183, + "learning_rate": 5.06607724887225e-06, + "loss": 0.4705, + "step": 1010 + }, + { + "epoch": 0.5465849702649126, + "grad_norm": 0.39409664273262024, + "learning_rate": 5.056638079308277e-06, + "loss": 0.4685, + "step": 1011 + }, + { + "epoch": 0.5471256082176968, + "grad_norm": 0.37422487139701843, + "learning_rate": 5.047198707860496e-06, + "loss": 0.4707, + "step": 1012 + }, + { + "epoch": 0.5476662461704812, + "grad_norm": 0.37773483991622925, + "learning_rate": 5.037759168175109e-06, + "loss": 0.488, + "step": 1013 + }, + { + "epoch": 0.5482068841232655, + "grad_norm": 0.4065093994140625, + "learning_rate": 5.028319493898916e-06, + "loss": 0.5103, + "step": 1014 + }, + { + "epoch": 0.5487475220760497, + "grad_norm": 0.40094494819641113, + "learning_rate": 5.018879718679199e-06, + "loss": 0.4757, + "step": 1015 + }, + { + "epoch": 0.5492881600288341, + "grad_norm": 0.3680800199508667, + "learning_rate": 5.009439876163601e-06, + "loss": 0.4682, + "step": 1016 + }, + { + "epoch": 0.5498287979816183, + "grad_norm": 0.3533041775226593, + "learning_rate": 5e-06, + "loss": 0.4541, + "step": 1017 + }, + { + "epoch": 0.5503694359344026, + "grad_norm": 0.38887202739715576, + "learning_rate": 4.9905601238364006e-06, + "loss": 0.4604, + "step": 1018 + }, + { + "epoch": 0.5509100738871869, + "grad_norm": 0.38735654950141907, + "learning_rate": 4.981120281320801e-06, + "loss": 0.474, + "step": 1019 + }, + { + "epoch": 0.5514507118399712, + "grad_norm": 0.3694019615650177, + "learning_rate": 4.971680506101086e-06, + "loss": 0.4734, + "step": 1020 + }, + { + "epoch": 0.5519913497927554, + "grad_norm": 0.36134111881256104, + "learning_rate": 4.9622408318248925e-06, + "loss": 0.454, + "step": 1021 + }, + { + "epoch": 0.5525319877455397, + "grad_norm": 0.40273579955101013, + "learning_rate": 4.952801292139505e-06, + "loss": 0.5016, + "step": 1022 + }, + { + "epoch": 0.553072625698324, + "grad_norm": 0.39331087470054626, + "learning_rate": 4.9433619206917234e-06, + "loss": 0.4658, + "step": 1023 + }, + { + "epoch": 0.5536132636511083, + "grad_norm": 0.40813592076301575, + "learning_rate": 4.933922751127753e-06, + "loss": 0.4767, + "step": 1024 + }, + { + "epoch": 0.5541539016038926, + "grad_norm": 0.37781664729118347, + "learning_rate": 4.924483817093071e-06, + "loss": 0.4625, + "step": 1025 + }, + { + "epoch": 0.5546945395566769, + "grad_norm": 0.420913964509964, + "learning_rate": 4.915045152232324e-06, + "loss": 0.4834, + "step": 1026 + }, + { + "epoch": 0.5552351775094612, + "grad_norm": 0.3994501233100891, + "learning_rate": 4.9056067901891945e-06, + "loss": 0.496, + "step": 1027 + }, + { + "epoch": 0.5557758154622454, + "grad_norm": 0.471672922372818, + "learning_rate": 4.896168764606289e-06, + "loss": 0.4774, + "step": 1028 + }, + { + "epoch": 0.5563164534150298, + "grad_norm": 0.4291553497314453, + "learning_rate": 4.886731109125007e-06, + "loss": 0.4909, + "step": 1029 + }, + { + "epoch": 0.556857091367814, + "grad_norm": 0.3480027914047241, + "learning_rate": 4.87729385738544e-06, + "loss": 0.4796, + "step": 1030 + }, + { + "epoch": 0.5573977293205983, + "grad_norm": 0.448752760887146, + "learning_rate": 4.867857043026229e-06, + "loss": 0.4718, + "step": 1031 + }, + { + "epoch": 0.5579383672733826, + "grad_norm": 0.429802805185318, + "learning_rate": 4.858420699684464e-06, + "loss": 0.4695, + "step": 1032 + }, + { + "epoch": 0.5584790052261669, + "grad_norm": 0.40032947063446045, + "learning_rate": 4.848984860995557e-06, + "loss": 0.4741, + "step": 1033 + }, + { + "epoch": 0.5590196431789511, + "grad_norm": 0.35695409774780273, + "learning_rate": 4.839549560593111e-06, + "loss": 0.4628, + "step": 1034 + }, + { + "epoch": 0.5595602811317354, + "grad_norm": 0.35745665431022644, + "learning_rate": 4.830114832108822e-06, + "loss": 0.4973, + "step": 1035 + }, + { + "epoch": 0.5601009190845198, + "grad_norm": 0.37396466732025146, + "learning_rate": 4.82068070917234e-06, + "loss": 0.463, + "step": 1036 + }, + { + "epoch": 0.560641557037304, + "grad_norm": 0.3474443554878235, + "learning_rate": 4.81124722541116e-06, + "loss": 0.4761, + "step": 1037 + }, + { + "epoch": 0.5611821949900883, + "grad_norm": 0.40390846133232117, + "learning_rate": 4.801814414450498e-06, + "loss": 0.4875, + "step": 1038 + }, + { + "epoch": 0.5617228329428726, + "grad_norm": 0.3489498794078827, + "learning_rate": 4.7923823099131694e-06, + "loss": 0.4772, + "step": 1039 + }, + { + "epoch": 0.5622634708956569, + "grad_norm": 0.4154996871948242, + "learning_rate": 4.782950945419475e-06, + "loss": 0.4778, + "step": 1040 + }, + { + "epoch": 0.5628041088484411, + "grad_norm": 0.36867937445640564, + "learning_rate": 4.7735203545870794e-06, + "loss": 0.474, + "step": 1041 + }, + { + "epoch": 0.5633447468012255, + "grad_norm": 0.3696093261241913, + "learning_rate": 4.764090571030882e-06, + "loss": 0.4813, + "step": 1042 + }, + { + "epoch": 0.5638853847540097, + "grad_norm": 0.3766731023788452, + "learning_rate": 4.75466162836291e-06, + "loss": 0.4941, + "step": 1043 + }, + { + "epoch": 0.564426022706794, + "grad_norm": 0.39763715863227844, + "learning_rate": 4.745233560192195e-06, + "loss": 0.4716, + "step": 1044 + }, + { + "epoch": 0.5649666606595783, + "grad_norm": 0.3585834801197052, + "learning_rate": 4.735806400124648e-06, + "loss": 0.474, + "step": 1045 + }, + { + "epoch": 0.5655072986123626, + "grad_norm": 0.36759787797927856, + "learning_rate": 4.726380181762943e-06, + "loss": 0.4849, + "step": 1046 + }, + { + "epoch": 0.5660479365651468, + "grad_norm": 0.37226635217666626, + "learning_rate": 4.716954938706401e-06, + "loss": 0.4766, + "step": 1047 + }, + { + "epoch": 0.5665885745179312, + "grad_norm": 0.392599880695343, + "learning_rate": 4.707530704550861e-06, + "loss": 0.4906, + "step": 1048 + }, + { + "epoch": 0.5671292124707155, + "grad_norm": 0.3526148200035095, + "learning_rate": 4.69810751288857e-06, + "loss": 0.4418, + "step": 1049 + }, + { + "epoch": 0.5676698504234997, + "grad_norm": 0.37177005410194397, + "learning_rate": 4.688685397308061e-06, + "loss": 0.5155, + "step": 1050 + }, + { + "epoch": 0.568210488376284, + "grad_norm": 0.42093127965927124, + "learning_rate": 4.679264391394022e-06, + "loss": 0.4817, + "step": 1051 + }, + { + "epoch": 0.5687511263290683, + "grad_norm": 0.3944513499736786, + "learning_rate": 4.669844528727197e-06, + "loss": 0.4964, + "step": 1052 + }, + { + "epoch": 0.5692917642818526, + "grad_norm": 0.339263379573822, + "learning_rate": 4.660425842884249e-06, + "loss": 0.4467, + "step": 1053 + }, + { + "epoch": 0.5698324022346368, + "grad_norm": 0.3937620520591736, + "learning_rate": 4.651008367437646e-06, + "loss": 0.4803, + "step": 1054 + }, + { + "epoch": 0.5703730401874212, + "grad_norm": 0.36690381169319153, + "learning_rate": 4.641592135955545e-06, + "loss": 0.4529, + "step": 1055 + }, + { + "epoch": 0.5709136781402054, + "grad_norm": 0.373169481754303, + "learning_rate": 4.6321771820016635e-06, + "loss": 0.4899, + "step": 1056 + }, + { + "epoch": 0.5714543160929897, + "grad_norm": 0.36590346693992615, + "learning_rate": 4.62276353913517e-06, + "loss": 0.4827, + "step": 1057 + }, + { + "epoch": 0.571994954045774, + "grad_norm": 0.3776438534259796, + "learning_rate": 4.6133512409105595e-06, + "loss": 0.4724, + "step": 1058 + }, + { + "epoch": 0.5725355919985583, + "grad_norm": 0.3477265536785126, + "learning_rate": 4.603940320877533e-06, + "loss": 0.4912, + "step": 1059 + }, + { + "epoch": 0.5730762299513426, + "grad_norm": 0.340049684047699, + "learning_rate": 4.594530812580876e-06, + "loss": 0.4825, + "step": 1060 + }, + { + "epoch": 0.5736168679041269, + "grad_norm": 0.3493627905845642, + "learning_rate": 4.585122749560347e-06, + "loss": 0.4692, + "step": 1061 + }, + { + "epoch": 0.5741575058569112, + "grad_norm": 0.34757259488105774, + "learning_rate": 4.575716165350549e-06, + "loss": 0.4381, + "step": 1062 + }, + { + "epoch": 0.5746981438096954, + "grad_norm": 0.3675667345523834, + "learning_rate": 4.566311093480818e-06, + "loss": 0.4431, + "step": 1063 + }, + { + "epoch": 0.5752387817624798, + "grad_norm": 0.3604763448238373, + "learning_rate": 4.556907567475094e-06, + "loss": 0.4832, + "step": 1064 + }, + { + "epoch": 0.575779419715264, + "grad_norm": 0.3719836473464966, + "learning_rate": 4.547505620851812e-06, + "loss": 0.4634, + "step": 1065 + }, + { + "epoch": 0.5763200576680483, + "grad_norm": 0.3803561329841614, + "learning_rate": 4.538105287123772e-06, + "loss": 0.4679, + "step": 1066 + }, + { + "epoch": 0.5768606956208325, + "grad_norm": 0.35499629378318787, + "learning_rate": 4.528706599798033e-06, + "loss": 0.4648, + "step": 1067 + }, + { + "epoch": 0.5774013335736169, + "grad_norm": 0.3680323362350464, + "learning_rate": 4.5193095923757745e-06, + "loss": 0.4622, + "step": 1068 + }, + { + "epoch": 0.5779419715264011, + "grad_norm": 0.3444634974002838, + "learning_rate": 4.509914298352197e-06, + "loss": 0.4707, + "step": 1069 + }, + { + "epoch": 0.5784826094791854, + "grad_norm": 0.3498629927635193, + "learning_rate": 4.5005207512163914e-06, + "loss": 0.4618, + "step": 1070 + }, + { + "epoch": 0.5790232474319698, + "grad_norm": 0.3526155352592468, + "learning_rate": 4.491128984451219e-06, + "loss": 0.4661, + "step": 1071 + }, + { + "epoch": 0.579563885384754, + "grad_norm": 0.3570224344730377, + "learning_rate": 4.481739031533201e-06, + "loss": 0.4846, + "step": 1072 + }, + { + "epoch": 0.5801045233375383, + "grad_norm": 0.3742653727531433, + "learning_rate": 4.472350925932384e-06, + "loss": 0.4589, + "step": 1073 + }, + { + "epoch": 0.5806451612903226, + "grad_norm": 0.3513067662715912, + "learning_rate": 4.462964701112237e-06, + "loss": 0.4401, + "step": 1074 + }, + { + "epoch": 0.5811857992431069, + "grad_norm": 0.3835412561893463, + "learning_rate": 4.453580390529526e-06, + "loss": 0.4607, + "step": 1075 + }, + { + "epoch": 0.5817264371958911, + "grad_norm": 0.35909005999565125, + "learning_rate": 4.444198027634191e-06, + "loss": 0.4408, + "step": 1076 + }, + { + "epoch": 0.5822670751486755, + "grad_norm": 0.3801157474517822, + "learning_rate": 4.434817645869226e-06, + "loss": 0.4678, + "step": 1077 + }, + { + "epoch": 0.5828077131014597, + "grad_norm": 0.36517688632011414, + "learning_rate": 4.42543927867057e-06, + "loss": 0.4884, + "step": 1078 + }, + { + "epoch": 0.583348351054244, + "grad_norm": 0.41169390082359314, + "learning_rate": 4.416062959466978e-06, + "loss": 0.491, + "step": 1079 + }, + { + "epoch": 0.5838889890070283, + "grad_norm": 0.4015640616416931, + "learning_rate": 4.4066887216799055e-06, + "loss": 0.5044, + "step": 1080 + }, + { + "epoch": 0.5844296269598126, + "grad_norm": 0.3767825663089752, + "learning_rate": 4.397316598723385e-06, + "loss": 0.4729, + "step": 1081 + }, + { + "epoch": 0.5849702649125968, + "grad_norm": 0.37044841051101685, + "learning_rate": 4.38794662400392e-06, + "loss": 0.4901, + "step": 1082 + }, + { + "epoch": 0.5855109028653811, + "grad_norm": 0.3824319839477539, + "learning_rate": 4.3785788309203466e-06, + "loss": 0.454, + "step": 1083 + }, + { + "epoch": 0.5860515408181655, + "grad_norm": 0.42856699228286743, + "learning_rate": 4.369213252863733e-06, + "loss": 0.4726, + "step": 1084 + }, + { + "epoch": 0.5865921787709497, + "grad_norm": 0.3541294038295746, + "learning_rate": 4.359849923217246e-06, + "loss": 0.4546, + "step": 1085 + }, + { + "epoch": 0.587132816723734, + "grad_norm": 0.4067661166191101, + "learning_rate": 4.350488875356041e-06, + "loss": 0.4584, + "step": 1086 + }, + { + "epoch": 0.5876734546765183, + "grad_norm": 0.39768409729003906, + "learning_rate": 4.341130142647136e-06, + "loss": 0.4699, + "step": 1087 + }, + { + "epoch": 0.5882140926293026, + "grad_norm": 0.35365110635757446, + "learning_rate": 4.331773758449303e-06, + "loss": 0.4863, + "step": 1088 + }, + { + "epoch": 0.5887547305820868, + "grad_norm": 0.4205717444419861, + "learning_rate": 4.322419756112943e-06, + "loss": 0.4603, + "step": 1089 + }, + { + "epoch": 0.5892953685348712, + "grad_norm": 0.3971540033817291, + "learning_rate": 4.313068168979957e-06, + "loss": 0.4485, + "step": 1090 + }, + { + "epoch": 0.5898360064876554, + "grad_norm": 0.3554147183895111, + "learning_rate": 4.303719030383648e-06, + "loss": 0.47, + "step": 1091 + }, + { + "epoch": 0.5903766444404397, + "grad_norm": 0.4239473044872284, + "learning_rate": 4.294372373648587e-06, + "loss": 0.5029, + "step": 1092 + }, + { + "epoch": 0.590917282393224, + "grad_norm": 0.3685072958469391, + "learning_rate": 4.285028232090499e-06, + "loss": 0.4611, + "step": 1093 + }, + { + "epoch": 0.5914579203460083, + "grad_norm": 0.39185526967048645, + "learning_rate": 4.275686639016142e-06, + "loss": 0.454, + "step": 1094 + }, + { + "epoch": 0.5919985582987926, + "grad_norm": 0.3944680094718933, + "learning_rate": 4.266347627723192e-06, + "loss": 0.4956, + "step": 1095 + }, + { + "epoch": 0.5925391962515769, + "grad_norm": 0.3366733491420746, + "learning_rate": 4.257011231500122e-06, + "loss": 0.4424, + "step": 1096 + }, + { + "epoch": 0.5930798342043612, + "grad_norm": 0.4068387746810913, + "learning_rate": 4.247677483626085e-06, + "loss": 0.4794, + "step": 1097 + }, + { + "epoch": 0.5936204721571454, + "grad_norm": 0.4122190773487091, + "learning_rate": 4.238346417370793e-06, + "loss": 0.4769, + "step": 1098 + }, + { + "epoch": 0.5941611101099297, + "grad_norm": 0.38368502259254456, + "learning_rate": 4.229018065994396e-06, + "loss": 0.4576, + "step": 1099 + }, + { + "epoch": 0.594701748062714, + "grad_norm": 0.33293166756629944, + "learning_rate": 4.2196924627473715e-06, + "loss": 0.4749, + "step": 1100 + }, + { + "epoch": 0.5952423860154983, + "grad_norm": 0.43737560510635376, + "learning_rate": 4.210369640870403e-06, + "loss": 0.4989, + "step": 1101 + }, + { + "epoch": 0.5957830239682825, + "grad_norm": 0.4038828909397125, + "learning_rate": 4.201049633594254e-06, + "loss": 0.4808, + "step": 1102 + }, + { + "epoch": 0.5963236619210669, + "grad_norm": 0.3759760856628418, + "learning_rate": 4.1917324741396595e-06, + "loss": 0.482, + "step": 1103 + }, + { + "epoch": 0.5968642998738511, + "grad_norm": 0.3900810778141022, + "learning_rate": 4.1824181957172014e-06, + "loss": 0.5048, + "step": 1104 + }, + { + "epoch": 0.5974049378266354, + "grad_norm": 0.3988158106803894, + "learning_rate": 4.173106831527194e-06, + "loss": 0.4889, + "step": 1105 + }, + { + "epoch": 0.5979455757794198, + "grad_norm": 0.393657386302948, + "learning_rate": 4.163798414759566e-06, + "loss": 0.4676, + "step": 1106 + }, + { + "epoch": 0.598486213732204, + "grad_norm": 0.3507223427295685, + "learning_rate": 4.154492978593733e-06, + "loss": 0.4462, + "step": 1107 + }, + { + "epoch": 0.5990268516849883, + "grad_norm": 0.3565762937068939, + "learning_rate": 4.145190556198494e-06, + "loss": 0.4725, + "step": 1108 + }, + { + "epoch": 0.5995674896377726, + "grad_norm": 0.3660755157470703, + "learning_rate": 4.135891180731903e-06, + "loss": 0.4607, + "step": 1109 + }, + { + "epoch": 0.6001081275905569, + "grad_norm": 0.3648075461387634, + "learning_rate": 4.1265948853411506e-06, + "loss": 0.4446, + "step": 1110 + }, + { + "epoch": 0.6006487655433411, + "grad_norm": 0.3925120532512665, + "learning_rate": 4.1173017031624544e-06, + "loss": 0.4767, + "step": 1111 + }, + { + "epoch": 0.6011894034961254, + "grad_norm": 0.35011157393455505, + "learning_rate": 4.108011667320926e-06, + "loss": 0.4739, + "step": 1112 + }, + { + "epoch": 0.6017300414489097, + "grad_norm": 0.3578500747680664, + "learning_rate": 4.098724810930472e-06, + "loss": 0.4899, + "step": 1113 + }, + { + "epoch": 0.602270679401694, + "grad_norm": 0.40307337045669556, + "learning_rate": 4.08944116709366e-06, + "loss": 0.4923, + "step": 1114 + }, + { + "epoch": 0.6028113173544782, + "grad_norm": 0.39985835552215576, + "learning_rate": 4.08016076890161e-06, + "loss": 0.466, + "step": 1115 + }, + { + "epoch": 0.6033519553072626, + "grad_norm": 0.40574347972869873, + "learning_rate": 4.0708836494338695e-06, + "loss": 0.478, + "step": 1116 + }, + { + "epoch": 0.6038925932600469, + "grad_norm": 0.3304382264614105, + "learning_rate": 4.061609841758302e-06, + "loss": 0.4487, + "step": 1117 + }, + { + "epoch": 0.6044332312128311, + "grad_norm": 0.3854810893535614, + "learning_rate": 4.0523393789309625e-06, + "loss": 0.4704, + "step": 1118 + }, + { + "epoch": 0.6049738691656155, + "grad_norm": 0.3859521746635437, + "learning_rate": 4.04307229399599e-06, + "loss": 0.446, + "step": 1119 + }, + { + "epoch": 0.6055145071183997, + "grad_norm": 0.3449578881263733, + "learning_rate": 4.0338086199854765e-06, + "loss": 0.4592, + "step": 1120 + }, + { + "epoch": 0.606055145071184, + "grad_norm": 0.381989985704422, + "learning_rate": 4.02454838991936e-06, + "loss": 0.4553, + "step": 1121 + }, + { + "epoch": 0.6065957830239683, + "grad_norm": 0.4080401659011841, + "learning_rate": 4.0152916368053e-06, + "loss": 0.5008, + "step": 1122 + }, + { + "epoch": 0.6071364209767526, + "grad_norm": 0.35274338722229004, + "learning_rate": 4.006038393638565e-06, + "loss": 0.4687, + "step": 1123 + }, + { + "epoch": 0.6076770589295368, + "grad_norm": 0.3596242070198059, + "learning_rate": 3.996788693401914e-06, + "loss": 0.4852, + "step": 1124 + }, + { + "epoch": 0.6082176968823212, + "grad_norm": 0.44486021995544434, + "learning_rate": 3.987542569065469e-06, + "loss": 0.4747, + "step": 1125 + }, + { + "epoch": 0.6087583348351054, + "grad_norm": 0.3718114495277405, + "learning_rate": 3.978300053586617e-06, + "loss": 0.5005, + "step": 1126 + }, + { + "epoch": 0.6092989727878897, + "grad_norm": 0.4019679129123688, + "learning_rate": 3.969061179909872e-06, + "loss": 0.4566, + "step": 1127 + }, + { + "epoch": 0.6098396107406739, + "grad_norm": 0.3548070788383484, + "learning_rate": 3.959825980966777e-06, + "loss": 0.4627, + "step": 1128 + }, + { + "epoch": 0.6103802486934583, + "grad_norm": 0.35685357451438904, + "learning_rate": 3.9505944896757635e-06, + "loss": 0.491, + "step": 1129 + }, + { + "epoch": 0.6109208866462426, + "grad_norm": 0.3496491312980652, + "learning_rate": 3.941366738942058e-06, + "loss": 0.4934, + "step": 1130 + }, + { + "epoch": 0.6114615245990268, + "grad_norm": 0.39106976985931396, + "learning_rate": 3.932142761657549e-06, + "loss": 0.4896, + "step": 1131 + }, + { + "epoch": 0.6120021625518112, + "grad_norm": 0.39134979248046875, + "learning_rate": 3.922922590700679e-06, + "loss": 0.5008, + "step": 1132 + }, + { + "epoch": 0.6125428005045954, + "grad_norm": 0.3936554193496704, + "learning_rate": 3.913706258936317e-06, + "loss": 0.481, + "step": 1133 + }, + { + "epoch": 0.6130834384573797, + "grad_norm": 0.3407648503780365, + "learning_rate": 3.904493799215652e-06, + "loss": 0.4668, + "step": 1134 + }, + { + "epoch": 0.613624076410164, + "grad_norm": 0.3679908514022827, + "learning_rate": 3.895285244376068e-06, + "loss": 0.4934, + "step": 1135 + }, + { + "epoch": 0.6141647143629483, + "grad_norm": 0.4318563640117645, + "learning_rate": 3.886080627241034e-06, + "loss": 0.4423, + "step": 1136 + }, + { + "epoch": 0.6147053523157325, + "grad_norm": 0.3628011643886566, + "learning_rate": 3.876879980619982e-06, + "loss": 0.4939, + "step": 1137 + }, + { + "epoch": 0.6152459902685169, + "grad_norm": 0.375052809715271, + "learning_rate": 3.8676833373081864e-06, + "loss": 0.4655, + "step": 1138 + }, + { + "epoch": 0.6157866282213011, + "grad_norm": 0.39029353857040405, + "learning_rate": 3.8584907300866595e-06, + "loss": 0.4855, + "step": 1139 + }, + { + "epoch": 0.6163272661740854, + "grad_norm": 0.4070568084716797, + "learning_rate": 3.8493021917220225e-06, + "loss": 0.4526, + "step": 1140 + }, + { + "epoch": 0.6168679041268698, + "grad_norm": 0.4272588789463043, + "learning_rate": 3.840117754966396e-06, + "loss": 0.4892, + "step": 1141 + }, + { + "epoch": 0.617408542079654, + "grad_norm": 0.4317014217376709, + "learning_rate": 3.8309374525572765e-06, + "loss": 0.4817, + "step": 1142 + }, + { + "epoch": 0.6179491800324383, + "grad_norm": 0.4254457950592041, + "learning_rate": 3.821761317217428e-06, + "loss": 0.4631, + "step": 1143 + }, + { + "epoch": 0.6184898179852225, + "grad_norm": 0.36125367879867554, + "learning_rate": 3.81258938165476e-06, + "loss": 0.4893, + "step": 1144 + }, + { + "epoch": 0.6190304559380069, + "grad_norm": 0.3904247581958771, + "learning_rate": 3.803421678562213e-06, + "loss": 0.4599, + "step": 1145 + }, + { + "epoch": 0.6195710938907911, + "grad_norm": 0.41387060284614563, + "learning_rate": 3.794258240617636e-06, + "loss": 0.4982, + "step": 1146 + }, + { + "epoch": 0.6201117318435754, + "grad_norm": 0.349608838558197, + "learning_rate": 3.7850991004836813e-06, + "loss": 0.4859, + "step": 1147 + }, + { + "epoch": 0.6206523697963597, + "grad_norm": 0.3353005647659302, + "learning_rate": 3.7759442908076786e-06, + "loss": 0.481, + "step": 1148 + }, + { + "epoch": 0.621193007749144, + "grad_norm": 0.3263114392757416, + "learning_rate": 3.7667938442215247e-06, + "loss": 0.4775, + "step": 1149 + }, + { + "epoch": 0.6217336457019282, + "grad_norm": 0.44235607981681824, + "learning_rate": 3.7576477933415612e-06, + "loss": 0.4519, + "step": 1150 + }, + { + "epoch": 0.6222742836547126, + "grad_norm": 0.4153715670108795, + "learning_rate": 3.748506170768462e-06, + "loss": 0.474, + "step": 1151 + }, + { + "epoch": 0.6228149216074969, + "grad_norm": 0.34854984283447266, + "learning_rate": 3.739369009087117e-06, + "loss": 0.4795, + "step": 1152 + }, + { + "epoch": 0.6233555595602811, + "grad_norm": 0.37851929664611816, + "learning_rate": 3.7302363408665155e-06, + "loss": 0.5126, + "step": 1153 + }, + { + "epoch": 0.6238961975130655, + "grad_norm": 0.3372579514980316, + "learning_rate": 3.721108198659633e-06, + "loss": 0.4704, + "step": 1154 + }, + { + "epoch": 0.6244368354658497, + "grad_norm": 0.39567941427230835, + "learning_rate": 3.7119846150033047e-06, + "loss": 0.486, + "step": 1155 + }, + { + "epoch": 0.624977473418634, + "grad_norm": 0.3714936673641205, + "learning_rate": 3.702865622418125e-06, + "loss": 0.4449, + "step": 1156 + }, + { + "epoch": 0.6255181113714183, + "grad_norm": 0.35422173142433167, + "learning_rate": 3.693751253408319e-06, + "loss": 0.4686, + "step": 1157 + }, + { + "epoch": 0.6260587493242026, + "grad_norm": 0.32026010751724243, + "learning_rate": 3.6846415404616344e-06, + "loss": 0.4818, + "step": 1158 + }, + { + "epoch": 0.6265993872769868, + "grad_norm": 0.3986067771911621, + "learning_rate": 3.6755365160492187e-06, + "loss": 0.4787, + "step": 1159 + }, + { + "epoch": 0.6271400252297711, + "grad_norm": 0.330572247505188, + "learning_rate": 3.6664362126255087e-06, + "loss": 0.4725, + "step": 1160 + }, + { + "epoch": 0.6276806631825554, + "grad_norm": 0.3532167971134186, + "learning_rate": 3.657340662628116e-06, + "loss": 0.4659, + "step": 1161 + }, + { + "epoch": 0.6282213011353397, + "grad_norm": 0.33118587732315063, + "learning_rate": 3.648249898477707e-06, + "loss": 0.4868, + "step": 1162 + }, + { + "epoch": 0.628761939088124, + "grad_norm": 0.36354872584342957, + "learning_rate": 3.6391639525778915e-06, + "loss": 0.4936, + "step": 1163 + }, + { + "epoch": 0.6293025770409083, + "grad_norm": 0.3409719467163086, + "learning_rate": 3.6300828573150977e-06, + "loss": 0.4667, + "step": 1164 + }, + { + "epoch": 0.6298432149936926, + "grad_norm": 0.35681483149528503, + "learning_rate": 3.621006645058472e-06, + "loss": 0.4615, + "step": 1165 + }, + { + "epoch": 0.6303838529464768, + "grad_norm": 0.3582918643951416, + "learning_rate": 3.6119353481597504e-06, + "loss": 0.4789, + "step": 1166 + }, + { + "epoch": 0.6309244908992612, + "grad_norm": 0.39356887340545654, + "learning_rate": 3.6028689989531533e-06, + "loss": 0.4494, + "step": 1167 + }, + { + "epoch": 0.6314651288520454, + "grad_norm": 0.36148393154144287, + "learning_rate": 3.593807629755258e-06, + "loss": 0.4842, + "step": 1168 + }, + { + "epoch": 0.6320057668048297, + "grad_norm": 0.35367411375045776, + "learning_rate": 3.584751272864899e-06, + "loss": 0.4716, + "step": 1169 + }, + { + "epoch": 0.632546404757614, + "grad_norm": 0.3774411976337433, + "learning_rate": 3.575699960563038e-06, + "loss": 0.4452, + "step": 1170 + }, + { + "epoch": 0.6330870427103983, + "grad_norm": 0.36172568798065186, + "learning_rate": 3.566653725112661e-06, + "loss": 0.4657, + "step": 1171 + }, + { + "epoch": 0.6336276806631825, + "grad_norm": 0.368366539478302, + "learning_rate": 3.557612598758652e-06, + "loss": 0.4793, + "step": 1172 + }, + { + "epoch": 0.6341683186159668, + "grad_norm": 0.36106520891189575, + "learning_rate": 3.5485766137276894e-06, + "loss": 0.4885, + "step": 1173 + }, + { + "epoch": 0.6347089565687511, + "grad_norm": 0.3237360119819641, + "learning_rate": 3.5395458022281205e-06, + "loss": 0.4771, + "step": 1174 + }, + { + "epoch": 0.6352495945215354, + "grad_norm": 0.3604816794395447, + "learning_rate": 3.5305201964498557e-06, + "loss": 0.4931, + "step": 1175 + }, + { + "epoch": 0.6357902324743196, + "grad_norm": 0.3982067108154297, + "learning_rate": 3.5214998285642517e-06, + "loss": 0.4677, + "step": 1176 + }, + { + "epoch": 0.636330870427104, + "grad_norm": 0.3789684474468231, + "learning_rate": 3.5124847307239863e-06, + "loss": 0.4994, + "step": 1177 + }, + { + "epoch": 0.6368715083798883, + "grad_norm": 0.3449534773826599, + "learning_rate": 3.5034749350629593e-06, + "loss": 0.4724, + "step": 1178 + }, + { + "epoch": 0.6374121463326725, + "grad_norm": 0.361224889755249, + "learning_rate": 3.4944704736961722e-06, + "loss": 0.4406, + "step": 1179 + }, + { + "epoch": 0.6379527842854569, + "grad_norm": 0.41354840993881226, + "learning_rate": 3.4854713787196105e-06, + "loss": 0.4721, + "step": 1180 + }, + { + "epoch": 0.6384934222382411, + "grad_norm": 0.351012647151947, + "learning_rate": 3.4764776822101275e-06, + "loss": 0.4582, + "step": 1181 + }, + { + "epoch": 0.6390340601910254, + "grad_norm": 0.39335301518440247, + "learning_rate": 3.4674894162253404e-06, + "loss": 0.4826, + "step": 1182 + }, + { + "epoch": 0.6395746981438097, + "grad_norm": 0.34970593452453613, + "learning_rate": 3.458506612803505e-06, + "loss": 0.4828, + "step": 1183 + }, + { + "epoch": 0.640115336096594, + "grad_norm": 0.38310420513153076, + "learning_rate": 3.4495293039634113e-06, + "loss": 0.4851, + "step": 1184 + }, + { + "epoch": 0.6406559740493782, + "grad_norm": 0.4005867838859558, + "learning_rate": 3.440557521704256e-06, + "loss": 0.4729, + "step": 1185 + }, + { + "epoch": 0.6411966120021626, + "grad_norm": 0.4233829975128174, + "learning_rate": 3.4315912980055433e-06, + "loss": 0.4519, + "step": 1186 + }, + { + "epoch": 0.6417372499549469, + "grad_norm": 0.3797195851802826, + "learning_rate": 3.4226306648269616e-06, + "loss": 0.4758, + "step": 1187 + }, + { + "epoch": 0.6422778879077311, + "grad_norm": 0.35736843943595886, + "learning_rate": 3.413675654108275e-06, + "loss": 0.4627, + "step": 1188 + }, + { + "epoch": 0.6428185258605154, + "grad_norm": 0.3385578989982605, + "learning_rate": 3.4047262977692014e-06, + "loss": 0.471, + "step": 1189 + }, + { + "epoch": 0.6433591638132997, + "grad_norm": 0.3547685146331787, + "learning_rate": 3.3957826277093074e-06, + "loss": 0.4813, + "step": 1190 + }, + { + "epoch": 0.643899801766084, + "grad_norm": 0.3947050869464874, + "learning_rate": 3.3868446758078897e-06, + "loss": 0.4988, + "step": 1191 + }, + { + "epoch": 0.6444404397188682, + "grad_norm": 0.3719642162322998, + "learning_rate": 3.3779124739238657e-06, + "loss": 0.4632, + "step": 1192 + }, + { + "epoch": 0.6449810776716526, + "grad_norm": 0.3777051866054535, + "learning_rate": 3.3689860538956547e-06, + "loss": 0.4482, + "step": 1193 + }, + { + "epoch": 0.6455217156244368, + "grad_norm": 0.3597317337989807, + "learning_rate": 3.3600654475410643e-06, + "loss": 0.4687, + "step": 1194 + }, + { + "epoch": 0.6460623535772211, + "grad_norm": 0.4228699803352356, + "learning_rate": 3.351150686657185e-06, + "loss": 0.4585, + "step": 1195 + }, + { + "epoch": 0.6466029915300054, + "grad_norm": 0.3539244532585144, + "learning_rate": 3.3422418030202696e-06, + "loss": 0.4474, + "step": 1196 + }, + { + "epoch": 0.6471436294827897, + "grad_norm": 0.3758021891117096, + "learning_rate": 3.3333388283856195e-06, + "loss": 0.4838, + "step": 1197 + }, + { + "epoch": 0.647684267435574, + "grad_norm": 0.3555963635444641, + "learning_rate": 3.324441794487475e-06, + "loss": 0.4614, + "step": 1198 + }, + { + "epoch": 0.6482249053883583, + "grad_norm": 0.3526434302330017, + "learning_rate": 3.3155507330389004e-06, + "loss": 0.4629, + "step": 1199 + }, + { + "epoch": 0.6487655433411426, + "grad_norm": 0.38898134231567383, + "learning_rate": 3.306665675731674e-06, + "loss": 0.4907, + "step": 1200 + }, + { + "epoch": 0.6493061812939268, + "grad_norm": 0.32450518012046814, + "learning_rate": 3.297786654236169e-06, + "loss": 0.4781, + "step": 1201 + }, + { + "epoch": 0.6498468192467112, + "grad_norm": 0.36460715532302856, + "learning_rate": 3.28891370020125e-06, + "loss": 0.4958, + "step": 1202 + }, + { + "epoch": 0.6503874571994954, + "grad_norm": 0.34277579188346863, + "learning_rate": 3.280046845254145e-06, + "loss": 0.4979, + "step": 1203 + }, + { + "epoch": 0.6509280951522797, + "grad_norm": 0.33557090163230896, + "learning_rate": 3.2711861210003503e-06, + "loss": 0.4811, + "step": 1204 + }, + { + "epoch": 0.6514687331050639, + "grad_norm": 0.3325314521789551, + "learning_rate": 3.2623315590235076e-06, + "loss": 0.4755, + "step": 1205 + }, + { + "epoch": 0.6520093710578483, + "grad_norm": 0.3976009488105774, + "learning_rate": 3.2534831908852914e-06, + "loss": 0.4736, + "step": 1206 + }, + { + "epoch": 0.6525500090106325, + "grad_norm": 0.3497442305088043, + "learning_rate": 3.244641048125301e-06, + "loss": 0.4567, + "step": 1207 + }, + { + "epoch": 0.6530906469634168, + "grad_norm": 0.3687249422073364, + "learning_rate": 3.235805162260942e-06, + "loss": 0.4592, + "step": 1208 + }, + { + "epoch": 0.6536312849162011, + "grad_norm": 0.36228495836257935, + "learning_rate": 3.226975564787322e-06, + "loss": 0.4593, + "step": 1209 + }, + { + "epoch": 0.6541719228689854, + "grad_norm": 0.34000781178474426, + "learning_rate": 3.218152287177133e-06, + "loss": 0.4895, + "step": 1210 + }, + { + "epoch": 0.6547125608217697, + "grad_norm": 0.37587717175483704, + "learning_rate": 3.2093353608805368e-06, + "loss": 0.4689, + "step": 1211 + }, + { + "epoch": 0.655253198774554, + "grad_norm": 0.35287749767303467, + "learning_rate": 3.2005248173250593e-06, + "loss": 0.4846, + "step": 1212 + }, + { + "epoch": 0.6557938367273383, + "grad_norm": 0.4108916223049164, + "learning_rate": 3.1917206879154762e-06, + "loss": 0.4592, + "step": 1213 + }, + { + "epoch": 0.6563344746801225, + "grad_norm": 0.3818206489086151, + "learning_rate": 3.1829230040336967e-06, + "loss": 0.4836, + "step": 1214 + }, + { + "epoch": 0.6568751126329069, + "grad_norm": 0.38580086827278137, + "learning_rate": 3.1741317970386597e-06, + "loss": 0.4723, + "step": 1215 + }, + { + "epoch": 0.6574157505856911, + "grad_norm": 0.37795159220695496, + "learning_rate": 3.1653470982662114e-06, + "loss": 0.472, + "step": 1216 + }, + { + "epoch": 0.6579563885384754, + "grad_norm": 0.46705180406570435, + "learning_rate": 3.1565689390290067e-06, + "loss": 0.4617, + "step": 1217 + }, + { + "epoch": 0.6584970264912597, + "grad_norm": 0.3606800138950348, + "learning_rate": 3.147797350616385e-06, + "loss": 0.494, + "step": 1218 + }, + { + "epoch": 0.659037664444044, + "grad_norm": 0.335356742143631, + "learning_rate": 3.139032364294271e-06, + "loss": 0.4466, + "step": 1219 + }, + { + "epoch": 0.6595783023968282, + "grad_norm": 0.3441433906555176, + "learning_rate": 3.130274011305047e-06, + "loss": 0.4745, + "step": 1220 + }, + { + "epoch": 0.6601189403496125, + "grad_norm": 0.33748725056648254, + "learning_rate": 3.1215223228674587e-06, + "loss": 0.4989, + "step": 1221 + }, + { + "epoch": 0.6606595783023969, + "grad_norm": 0.3482097387313843, + "learning_rate": 3.1127773301764935e-06, + "loss": 0.4774, + "step": 1222 + }, + { + "epoch": 0.6612002162551811, + "grad_norm": 0.3453892171382904, + "learning_rate": 3.1040390644032746e-06, + "loss": 0.4759, + "step": 1223 + }, + { + "epoch": 0.6617408542079654, + "grad_norm": 0.36808252334594727, + "learning_rate": 3.095307556694942e-06, + "loss": 0.4692, + "step": 1224 + }, + { + "epoch": 0.6622814921607497, + "grad_norm": 0.4075857102870941, + "learning_rate": 3.0865828381745515e-06, + "loss": 0.4607, + "step": 1225 + }, + { + "epoch": 0.662822130113534, + "grad_norm": 0.3461955189704895, + "learning_rate": 3.077864939940959e-06, + "loss": 0.4696, + "step": 1226 + }, + { + "epoch": 0.6633627680663182, + "grad_norm": 0.3581470847129822, + "learning_rate": 3.0691538930687076e-06, + "loss": 0.468, + "step": 1227 + }, + { + "epoch": 0.6639034060191026, + "grad_norm": 0.34455424547195435, + "learning_rate": 3.0604497286079227e-06, + "loss": 0.4435, + "step": 1228 + }, + { + "epoch": 0.6644440439718868, + "grad_norm": 0.35402852296829224, + "learning_rate": 3.051752477584191e-06, + "loss": 0.4831, + "step": 1229 + }, + { + "epoch": 0.6649846819246711, + "grad_norm": 0.35135895013809204, + "learning_rate": 3.043062170998464e-06, + "loss": 0.4827, + "step": 1230 + }, + { + "epoch": 0.6655253198774554, + "grad_norm": 0.3202992081642151, + "learning_rate": 3.0343788398269342e-06, + "loss": 0.4442, + "step": 1231 + }, + { + "epoch": 0.6660659578302397, + "grad_norm": 0.3709985613822937, + "learning_rate": 3.025702515020937e-06, + "loss": 0.4635, + "step": 1232 + }, + { + "epoch": 0.666606595783024, + "grad_norm": 0.358951598405838, + "learning_rate": 3.0170332275068247e-06, + "loss": 0.4464, + "step": 1233 + }, + { + "epoch": 0.6671472337358082, + "grad_norm": 0.3573668897151947, + "learning_rate": 3.0083710081858748e-06, + "loss": 0.48, + "step": 1234 + }, + { + "epoch": 0.6676878716885926, + "grad_norm": 0.34597110748291016, + "learning_rate": 2.9997158879341647e-06, + "loss": 0.4718, + "step": 1235 + }, + { + "epoch": 0.6682285096413768, + "grad_norm": 0.3602089583873749, + "learning_rate": 2.9910678976024733e-06, + "loss": 0.449, + "step": 1236 + }, + { + "epoch": 0.6687691475941611, + "grad_norm": 0.34547102451324463, + "learning_rate": 2.982427068016155e-06, + "loss": 0.4584, + "step": 1237 + }, + { + "epoch": 0.6693097855469454, + "grad_norm": 0.3194965124130249, + "learning_rate": 2.9737934299750514e-06, + "loss": 0.475, + "step": 1238 + }, + { + "epoch": 0.6698504234997297, + "grad_norm": 0.37800660729408264, + "learning_rate": 2.965167014253363e-06, + "loss": 0.4739, + "step": 1239 + }, + { + "epoch": 0.6703910614525139, + "grad_norm": 0.35336002707481384, + "learning_rate": 2.956547851599548e-06, + "loss": 0.4768, + "step": 1240 + }, + { + "epoch": 0.6709316994052983, + "grad_norm": 0.3484102785587311, + "learning_rate": 2.947935972736217e-06, + "loss": 0.4606, + "step": 1241 + }, + { + "epoch": 0.6714723373580825, + "grad_norm": 0.33098798990249634, + "learning_rate": 2.9393314083600076e-06, + "loss": 0.4737, + "step": 1242 + }, + { + "epoch": 0.6720129753108668, + "grad_norm": 0.33991244435310364, + "learning_rate": 2.930734189141492e-06, + "loss": 0.4609, + "step": 1243 + }, + { + "epoch": 0.6725536132636512, + "grad_norm": 0.3231086730957031, + "learning_rate": 2.922144345725062e-06, + "loss": 0.4668, + "step": 1244 + }, + { + "epoch": 0.6730942512164354, + "grad_norm": 0.3015232980251312, + "learning_rate": 2.9135619087288153e-06, + "loss": 0.4378, + "step": 1245 + }, + { + "epoch": 0.6736348891692197, + "grad_norm": 0.3297620415687561, + "learning_rate": 2.9049869087444493e-06, + "loss": 0.4843, + "step": 1246 + }, + { + "epoch": 0.674175527122004, + "grad_norm": 0.3333333134651184, + "learning_rate": 2.8964193763371546e-06, + "loss": 0.4523, + "step": 1247 + }, + { + "epoch": 0.6747161650747883, + "grad_norm": 0.3433511257171631, + "learning_rate": 2.887859342045506e-06, + "loss": 0.4698, + "step": 1248 + }, + { + "epoch": 0.6752568030275725, + "grad_norm": 0.3787667751312256, + "learning_rate": 2.879306836381345e-06, + "loss": 0.4556, + "step": 1249 + }, + { + "epoch": 0.6757974409803568, + "grad_norm": 0.3427095413208008, + "learning_rate": 2.8707618898296864e-06, + "loss": 0.4689, + "step": 1250 + }, + { + "epoch": 0.6763380789331411, + "grad_norm": 0.3421780467033386, + "learning_rate": 2.862224532848591e-06, + "loss": 0.4805, + "step": 1251 + }, + { + "epoch": 0.6768787168859254, + "grad_norm": 0.37503376603126526, + "learning_rate": 2.853694795869074e-06, + "loss": 0.4946, + "step": 1252 + }, + { + "epoch": 0.6774193548387096, + "grad_norm": 0.33137035369873047, + "learning_rate": 2.845172709294989e-06, + "loss": 0.4447, + "step": 1253 + }, + { + "epoch": 0.677959992791494, + "grad_norm": 0.38808560371398926, + "learning_rate": 2.8366583035029194e-06, + "loss": 0.4918, + "step": 1254 + }, + { + "epoch": 0.6785006307442782, + "grad_norm": 0.31489184498786926, + "learning_rate": 2.8281516088420665e-06, + "loss": 0.4527, + "step": 1255 + }, + { + "epoch": 0.6790412686970625, + "grad_norm": 0.33485811948776245, + "learning_rate": 2.819652655634151e-06, + "loss": 0.4692, + "step": 1256 + }, + { + "epoch": 0.6795819066498469, + "grad_norm": 0.3656138777732849, + "learning_rate": 2.8111614741732975e-06, + "loss": 0.4612, + "step": 1257 + }, + { + "epoch": 0.6801225446026311, + "grad_norm": 0.3511470854282379, + "learning_rate": 2.802678094725931e-06, + "loss": 0.4532, + "step": 1258 + }, + { + "epoch": 0.6806631825554154, + "grad_norm": 0.3320247530937195, + "learning_rate": 2.794202547530661e-06, + "loss": 0.4452, + "step": 1259 + }, + { + "epoch": 0.6812038205081997, + "grad_norm": 0.3250149190425873, + "learning_rate": 2.785734862798184e-06, + "loss": 0.4722, + "step": 1260 + }, + { + "epoch": 0.681744458460984, + "grad_norm": 0.34146052598953247, + "learning_rate": 2.77727507071117e-06, + "loss": 0.4769, + "step": 1261 + }, + { + "epoch": 0.6822850964137682, + "grad_norm": 0.3707115054130554, + "learning_rate": 2.768823201424158e-06, + "loss": 0.4716, + "step": 1262 + }, + { + "epoch": 0.6828257343665526, + "grad_norm": 0.3220188617706299, + "learning_rate": 2.7603792850634402e-06, + "loss": 0.4858, + "step": 1263 + }, + { + "epoch": 0.6833663723193368, + "grad_norm": 0.3371463716030121, + "learning_rate": 2.7519433517269665e-06, + "loss": 0.4778, + "step": 1264 + }, + { + "epoch": 0.6839070102721211, + "grad_norm": 0.33046388626098633, + "learning_rate": 2.7435154314842337e-06, + "loss": 0.449, + "step": 1265 + }, + { + "epoch": 0.6844476482249053, + "grad_norm": 0.33768966794013977, + "learning_rate": 2.7350955543761682e-06, + "loss": 0.4591, + "step": 1266 + }, + { + "epoch": 0.6849882861776897, + "grad_norm": 0.35337385535240173, + "learning_rate": 2.7266837504150345e-06, + "loss": 0.4649, + "step": 1267 + }, + { + "epoch": 0.685528924130474, + "grad_norm": 0.4103326201438904, + "learning_rate": 2.7182800495843166e-06, + "loss": 0.4778, + "step": 1268 + }, + { + "epoch": 0.6860695620832582, + "grad_norm": 0.3523549437522888, + "learning_rate": 2.7098844818386164e-06, + "loss": 0.4656, + "step": 1269 + }, + { + "epoch": 0.6866102000360426, + "grad_norm": 0.3538891077041626, + "learning_rate": 2.7014970771035474e-06, + "loss": 0.4821, + "step": 1270 + }, + { + "epoch": 0.6871508379888268, + "grad_norm": 0.3851732313632965, + "learning_rate": 2.6931178652756262e-06, + "loss": 0.4928, + "step": 1271 + }, + { + "epoch": 0.6876914759416111, + "grad_norm": 0.3899889290332794, + "learning_rate": 2.6847468762221616e-06, + "loss": 0.4867, + "step": 1272 + }, + { + "epoch": 0.6882321138943954, + "grad_norm": 0.4064536690711975, + "learning_rate": 2.6763841397811576e-06, + "loss": 0.4629, + "step": 1273 + }, + { + "epoch": 0.6887727518471797, + "grad_norm": 0.34950196743011475, + "learning_rate": 2.668029685761201e-06, + "loss": 0.4284, + "step": 1274 + }, + { + "epoch": 0.6893133897999639, + "grad_norm": 0.36954036355018616, + "learning_rate": 2.6596835439413584e-06, + "loss": 0.4834, + "step": 1275 + }, + { + "epoch": 0.6898540277527483, + "grad_norm": 0.3687545657157898, + "learning_rate": 2.6513457440710612e-06, + "loss": 0.4632, + "step": 1276 + }, + { + "epoch": 0.6903946657055325, + "grad_norm": 0.41740676760673523, + "learning_rate": 2.6430163158700116e-06, + "loss": 0.4802, + "step": 1277 + }, + { + "epoch": 0.6909353036583168, + "grad_norm": 0.3302917182445526, + "learning_rate": 2.634695289028072e-06, + "loss": 0.4417, + "step": 1278 + }, + { + "epoch": 0.6914759416111012, + "grad_norm": 0.3250352144241333, + "learning_rate": 2.6263826932051562e-06, + "loss": 0.465, + "step": 1279 + }, + { + "epoch": 0.6920165795638854, + "grad_norm": 0.35972073674201965, + "learning_rate": 2.6180785580311284e-06, + "loss": 0.467, + "step": 1280 + }, + { + "epoch": 0.6925572175166697, + "grad_norm": 0.34688135981559753, + "learning_rate": 2.609782913105691e-06, + "loss": 0.4528, + "step": 1281 + }, + { + "epoch": 0.6930978554694539, + "grad_norm": 0.38405537605285645, + "learning_rate": 2.601495787998288e-06, + "loss": 0.4902, + "step": 1282 + }, + { + "epoch": 0.6936384934222383, + "grad_norm": 0.38303545117378235, + "learning_rate": 2.59321721224799e-06, + "loss": 0.4536, + "step": 1283 + }, + { + "epoch": 0.6941791313750225, + "grad_norm": 0.34185272455215454, + "learning_rate": 2.5849472153634003e-06, + "loss": 0.4441, + "step": 1284 + }, + { + "epoch": 0.6947197693278068, + "grad_norm": 0.41703319549560547, + "learning_rate": 2.576685826822535e-06, + "loss": 0.4781, + "step": 1285 + }, + { + "epoch": 0.6952604072805911, + "grad_norm": 0.3938962519168854, + "learning_rate": 2.568433076072734e-06, + "loss": 0.4755, + "step": 1286 + }, + { + "epoch": 0.6958010452333754, + "grad_norm": 0.36541613936424255, + "learning_rate": 2.5601889925305433e-06, + "loss": 0.4789, + "step": 1287 + }, + { + "epoch": 0.6963416831861596, + "grad_norm": 0.3563724458217621, + "learning_rate": 2.5519536055816194e-06, + "loss": 0.4781, + "step": 1288 + }, + { + "epoch": 0.696882321138944, + "grad_norm": 0.34606099128723145, + "learning_rate": 2.5437269445806146e-06, + "loss": 0.4485, + "step": 1289 + }, + { + "epoch": 0.6974229590917282, + "grad_norm": 0.42106860876083374, + "learning_rate": 2.5355090388510806e-06, + "loss": 0.4739, + "step": 1290 + }, + { + "epoch": 0.6979635970445125, + "grad_norm": 0.4748215973377228, + "learning_rate": 2.527299917685362e-06, + "loss": 0.4659, + "step": 1291 + }, + { + "epoch": 0.6985042349972969, + "grad_norm": 0.38426297903060913, + "learning_rate": 2.519099610344492e-06, + "loss": 0.4976, + "step": 1292 + }, + { + "epoch": 0.6990448729500811, + "grad_norm": 0.3022260069847107, + "learning_rate": 2.5109081460580875e-06, + "loss": 0.4347, + "step": 1293 + }, + { + "epoch": 0.6995855109028654, + "grad_norm": 0.35534849762916565, + "learning_rate": 2.502725554024239e-06, + "loss": 0.4635, + "step": 1294 + }, + { + "epoch": 0.7001261488556497, + "grad_norm": 0.35701191425323486, + "learning_rate": 2.494551863409418e-06, + "loss": 0.4827, + "step": 1295 + }, + { + "epoch": 0.700666786808434, + "grad_norm": 0.37284964323043823, + "learning_rate": 2.4863871033483693e-06, + "loss": 0.4857, + "step": 1296 + }, + { + "epoch": 0.7012074247612182, + "grad_norm": 0.35588300228118896, + "learning_rate": 2.478231302943997e-06, + "loss": 0.4709, + "step": 1297 + }, + { + "epoch": 0.7017480627140025, + "grad_norm": 0.3689674735069275, + "learning_rate": 2.470084491267278e-06, + "loss": 0.4734, + "step": 1298 + }, + { + "epoch": 0.7022887006667868, + "grad_norm": 0.3724137246608734, + "learning_rate": 2.46194669735714e-06, + "loss": 0.4638, + "step": 1299 + }, + { + "epoch": 0.7028293386195711, + "grad_norm": 0.42699527740478516, + "learning_rate": 2.4538179502203753e-06, + "loss": 0.5024, + "step": 1300 + }, + { + "epoch": 0.7033699765723553, + "grad_norm": 0.35276705026626587, + "learning_rate": 2.445698278831528e-06, + "loss": 0.4693, + "step": 1301 + }, + { + "epoch": 0.7039106145251397, + "grad_norm": 0.3660089075565338, + "learning_rate": 2.437587712132787e-06, + "loss": 0.4544, + "step": 1302 + }, + { + "epoch": 0.704451252477924, + "grad_norm": 0.3719891905784607, + "learning_rate": 2.429486279033892e-06, + "loss": 0.4498, + "step": 1303 + }, + { + "epoch": 0.7049918904307082, + "grad_norm": 0.34804993867874146, + "learning_rate": 2.4213940084120274e-06, + "loss": 0.4814, + "step": 1304 + }, + { + "epoch": 0.7055325283834926, + "grad_norm": 0.35139375925064087, + "learning_rate": 2.4133109291117156e-06, + "loss": 0.472, + "step": 1305 + }, + { + "epoch": 0.7060731663362768, + "grad_norm": 0.3766683042049408, + "learning_rate": 2.405237069944721e-06, + "loss": 0.4688, + "step": 1306 + }, + { + "epoch": 0.7066138042890611, + "grad_norm": 0.37273478507995605, + "learning_rate": 2.397172459689936e-06, + "loss": 0.5089, + "step": 1307 + }, + { + "epoch": 0.7071544422418454, + "grad_norm": 0.33368778228759766, + "learning_rate": 2.3891171270932923e-06, + "loss": 0.4666, + "step": 1308 + }, + { + "epoch": 0.7076950801946297, + "grad_norm": 0.3467375338077545, + "learning_rate": 2.3810711008676495e-06, + "loss": 0.467, + "step": 1309 + }, + { + "epoch": 0.7082357181474139, + "grad_norm": 0.39090535044670105, + "learning_rate": 2.3730344096926974e-06, + "loss": 0.4934, + "step": 1310 + }, + { + "epoch": 0.7087763561001982, + "grad_norm": 0.35073044896125793, + "learning_rate": 2.3650070822148447e-06, + "loss": 0.4396, + "step": 1311 + }, + { + "epoch": 0.7093169940529825, + "grad_norm": 0.321822851896286, + "learning_rate": 2.3569891470471308e-06, + "loss": 0.4375, + "step": 1312 + }, + { + "epoch": 0.7098576320057668, + "grad_norm": 0.3142220377922058, + "learning_rate": 2.3489806327691156e-06, + "loss": 0.4711, + "step": 1313 + }, + { + "epoch": 0.710398269958551, + "grad_norm": 0.3423789143562317, + "learning_rate": 2.3409815679267733e-06, + "loss": 0.4384, + "step": 1314 + }, + { + "epoch": 0.7109389079113354, + "grad_norm": 0.3237120509147644, + "learning_rate": 2.3329919810324036e-06, + "loss": 0.4744, + "step": 1315 + }, + { + "epoch": 0.7114795458641197, + "grad_norm": 0.3413024842739105, + "learning_rate": 2.325011900564515e-06, + "loss": 0.4676, + "step": 1316 + }, + { + "epoch": 0.7120201838169039, + "grad_norm": 0.3537076711654663, + "learning_rate": 2.3170413549677367e-06, + "loss": 0.4839, + "step": 1317 + }, + { + "epoch": 0.7125608217696883, + "grad_norm": 0.3629612624645233, + "learning_rate": 2.3090803726527083e-06, + "loss": 0.5076, + "step": 1318 + }, + { + "epoch": 0.7131014597224725, + "grad_norm": 0.32516616582870483, + "learning_rate": 2.301128981995985e-06, + "loss": 0.4485, + "step": 1319 + }, + { + "epoch": 0.7136420976752568, + "grad_norm": 0.38808518648147583, + "learning_rate": 2.293187211339926e-06, + "loss": 0.4638, + "step": 1320 + }, + { + "epoch": 0.7141827356280411, + "grad_norm": 0.3397766351699829, + "learning_rate": 2.2852550889926067e-06, + "loss": 0.4937, + "step": 1321 + }, + { + "epoch": 0.7147233735808254, + "grad_norm": 0.3628048896789551, + "learning_rate": 2.2773326432277097e-06, + "loss": 0.4832, + "step": 1322 + }, + { + "epoch": 0.7152640115336096, + "grad_norm": 0.32281026244163513, + "learning_rate": 2.2694199022844284e-06, + "loss": 0.4313, + "step": 1323 + }, + { + "epoch": 0.715804649486394, + "grad_norm": 0.33235985040664673, + "learning_rate": 2.261516894367356e-06, + "loss": 0.4402, + "step": 1324 + }, + { + "epoch": 0.7163452874391782, + "grad_norm": 0.3623601198196411, + "learning_rate": 2.2536236476464007e-06, + "loss": 0.4644, + "step": 1325 + }, + { + "epoch": 0.7168859253919625, + "grad_norm": 0.32898762822151184, + "learning_rate": 2.2457401902566745e-06, + "loss": 0.4539, + "step": 1326 + }, + { + "epoch": 0.7174265633447467, + "grad_norm": 0.34005796909332275, + "learning_rate": 2.2378665502983976e-06, + "loss": 0.4869, + "step": 1327 + }, + { + "epoch": 0.7179672012975311, + "grad_norm": 0.3692139983177185, + "learning_rate": 2.2300027558367917e-06, + "loss": 0.4671, + "step": 1328 + }, + { + "epoch": 0.7185078392503154, + "grad_norm": 0.3715006113052368, + "learning_rate": 2.2221488349019903e-06, + "loss": 0.4843, + "step": 1329 + }, + { + "epoch": 0.7190484772030996, + "grad_norm": 0.30792778730392456, + "learning_rate": 2.2143048154889272e-06, + "loss": 0.4696, + "step": 1330 + }, + { + "epoch": 0.719589115155884, + "grad_norm": 0.3531022071838379, + "learning_rate": 2.2064707255572494e-06, + "loss": 0.4716, + "step": 1331 + }, + { + "epoch": 0.7201297531086682, + "grad_norm": 0.3335738182067871, + "learning_rate": 2.1986465930312067e-06, + "loss": 0.4389, + "step": 1332 + }, + { + "epoch": 0.7206703910614525, + "grad_norm": 0.3675576150417328, + "learning_rate": 2.1908324457995556e-06, + "loss": 0.4436, + "step": 1333 + }, + { + "epoch": 0.7212110290142368, + "grad_norm": 0.33168819546699524, + "learning_rate": 2.1830283117154616e-06, + "loss": 0.4308, + "step": 1334 + }, + { + "epoch": 0.7217516669670211, + "grad_norm": 0.36723217368125916, + "learning_rate": 2.1752342185964003e-06, + "loss": 0.4769, + "step": 1335 + }, + { + "epoch": 0.7222923049198053, + "grad_norm": 0.36966997385025024, + "learning_rate": 2.1674501942240567e-06, + "loss": 0.4624, + "step": 1336 + }, + { + "epoch": 0.7228329428725897, + "grad_norm": 0.3729422986507416, + "learning_rate": 2.159676266344222e-06, + "loss": 0.4774, + "step": 1337 + }, + { + "epoch": 0.723373580825374, + "grad_norm": 0.3431287407875061, + "learning_rate": 2.151912462666703e-06, + "loss": 0.4451, + "step": 1338 + }, + { + "epoch": 0.7239142187781582, + "grad_norm": 0.3624579906463623, + "learning_rate": 2.144158810865217e-06, + "loss": 0.4711, + "step": 1339 + }, + { + "epoch": 0.7244548567309426, + "grad_norm": 0.3302533030509949, + "learning_rate": 2.1364153385773007e-06, + "loss": 0.4802, + "step": 1340 + }, + { + "epoch": 0.7249954946837268, + "grad_norm": 0.34277042746543884, + "learning_rate": 2.128682073404197e-06, + "loss": 0.4794, + "step": 1341 + }, + { + "epoch": 0.7255361326365111, + "grad_norm": 0.3795274794101715, + "learning_rate": 2.1209590429107734e-06, + "loss": 0.4614, + "step": 1342 + }, + { + "epoch": 0.7260767705892953, + "grad_norm": 0.3579147458076477, + "learning_rate": 2.1132462746254147e-06, + "loss": 0.4946, + "step": 1343 + }, + { + "epoch": 0.7266174085420797, + "grad_norm": 0.36164700984954834, + "learning_rate": 2.1055437960399266e-06, + "loss": 0.4427, + "step": 1344 + }, + { + "epoch": 0.7271580464948639, + "grad_norm": 0.3287413418292999, + "learning_rate": 2.0978516346094342e-06, + "loss": 0.4723, + "step": 1345 + }, + { + "epoch": 0.7276986844476482, + "grad_norm": 0.33054518699645996, + "learning_rate": 2.0901698177522944e-06, + "loss": 0.4589, + "step": 1346 + }, + { + "epoch": 0.7282393224004325, + "grad_norm": 0.3471773862838745, + "learning_rate": 2.082498372849983e-06, + "loss": 0.4746, + "step": 1347 + }, + { + "epoch": 0.7287799603532168, + "grad_norm": 0.34974491596221924, + "learning_rate": 2.074837327247012e-06, + "loss": 0.487, + "step": 1348 + }, + { + "epoch": 0.729320598306001, + "grad_norm": 0.3569546937942505, + "learning_rate": 2.067186708250826e-06, + "loss": 0.4852, + "step": 1349 + }, + { + "epoch": 0.7298612362587854, + "grad_norm": 0.31537672877311707, + "learning_rate": 2.059546543131696e-06, + "loss": 0.4561, + "step": 1350 + }, + { + "epoch": 0.7304018742115697, + "grad_norm": 0.2963314950466156, + "learning_rate": 2.051916859122641e-06, + "loss": 0.4487, + "step": 1351 + }, + { + "epoch": 0.7309425121643539, + "grad_norm": 0.31170910596847534, + "learning_rate": 2.0442976834193146e-06, + "loss": 0.4585, + "step": 1352 + }, + { + "epoch": 0.7314831501171383, + "grad_norm": 0.3321332037448883, + "learning_rate": 2.036689043179917e-06, + "loss": 0.4635, + "step": 1353 + }, + { + "epoch": 0.7320237880699225, + "grad_norm": 0.34618934988975525, + "learning_rate": 2.0290909655250913e-06, + "loss": 0.4654, + "step": 1354 + }, + { + "epoch": 0.7325644260227068, + "grad_norm": 0.3425121009349823, + "learning_rate": 2.0215034775378336e-06, + "loss": 0.4692, + "step": 1355 + }, + { + "epoch": 0.7331050639754911, + "grad_norm": 0.33614981174468994, + "learning_rate": 2.013926606263394e-06, + "loss": 0.473, + "step": 1356 + }, + { + "epoch": 0.7336457019282754, + "grad_norm": 0.3264980614185333, + "learning_rate": 2.0063603787091788e-06, + "loss": 0.452, + "step": 1357 + }, + { + "epoch": 0.7341863398810596, + "grad_norm": 0.33285725116729736, + "learning_rate": 1.9988048218446577e-06, + "loss": 0.4593, + "step": 1358 + }, + { + "epoch": 0.7347269778338439, + "grad_norm": 0.367969274520874, + "learning_rate": 1.9912599626012593e-06, + "loss": 0.4649, + "step": 1359 + }, + { + "epoch": 0.7352676157866282, + "grad_norm": 0.33582252264022827, + "learning_rate": 1.9837258278722855e-06, + "loss": 0.4726, + "step": 1360 + }, + { + "epoch": 0.7358082537394125, + "grad_norm": 0.3242082893848419, + "learning_rate": 1.976202444512813e-06, + "loss": 0.4946, + "step": 1361 + }, + { + "epoch": 0.7363488916921967, + "grad_norm": 0.33312639594078064, + "learning_rate": 1.96868983933959e-06, + "loss": 0.4802, + "step": 1362 + }, + { + "epoch": 0.7368895296449811, + "grad_norm": 0.3373234272003174, + "learning_rate": 1.9611880391309524e-06, + "loss": 0.4847, + "step": 1363 + }, + { + "epoch": 0.7374301675977654, + "grad_norm": 0.3237764835357666, + "learning_rate": 1.9536970706267156e-06, + "loss": 0.4641, + "step": 1364 + }, + { + "epoch": 0.7379708055505496, + "grad_norm": 0.33678045868873596, + "learning_rate": 1.946216960528092e-06, + "loss": 0.476, + "step": 1365 + }, + { + "epoch": 0.738511443503334, + "grad_norm": 0.3523315489292145, + "learning_rate": 1.9387477354975885e-06, + "loss": 0.4559, + "step": 1366 + }, + { + "epoch": 0.7390520814561182, + "grad_norm": 0.3299519121646881, + "learning_rate": 1.9312894221589085e-06, + "loss": 0.4554, + "step": 1367 + }, + { + "epoch": 0.7395927194089025, + "grad_norm": 0.3041582703590393, + "learning_rate": 1.9238420470968665e-06, + "loss": 0.4544, + "step": 1368 + }, + { + "epoch": 0.7401333573616868, + "grad_norm": 0.30898869037628174, + "learning_rate": 1.9164056368572847e-06, + "loss": 0.4635, + "step": 1369 + }, + { + "epoch": 0.7406739953144711, + "grad_norm": 0.3286183178424835, + "learning_rate": 1.9089802179469036e-06, + "loss": 0.4673, + "step": 1370 + }, + { + "epoch": 0.7412146332672553, + "grad_norm": 0.3371999263763428, + "learning_rate": 1.9015658168332863e-06, + "loss": 0.4848, + "step": 1371 + }, + { + "epoch": 0.7417552712200396, + "grad_norm": 0.33480530977249146, + "learning_rate": 1.8941624599447178e-06, + "loss": 0.4687, + "step": 1372 + }, + { + "epoch": 0.742295909172824, + "grad_norm": 0.3701348304748535, + "learning_rate": 1.8867701736701238e-06, + "loss": 0.4683, + "step": 1373 + }, + { + "epoch": 0.7428365471256082, + "grad_norm": 0.33396846055984497, + "learning_rate": 1.8793889843589647e-06, + "loss": 0.4728, + "step": 1374 + }, + { + "epoch": 0.7433771850783925, + "grad_norm": 0.3385055363178253, + "learning_rate": 1.87201891832115e-06, + "loss": 0.4715, + "step": 1375 + }, + { + "epoch": 0.7439178230311768, + "grad_norm": 0.33360961079597473, + "learning_rate": 1.8646600018269356e-06, + "loss": 0.4777, + "step": 1376 + }, + { + "epoch": 0.7444584609839611, + "grad_norm": 0.2998606264591217, + "learning_rate": 1.8573122611068406e-06, + "loss": 0.4671, + "step": 1377 + }, + { + "epoch": 0.7449990989367453, + "grad_norm": 0.3257642984390259, + "learning_rate": 1.8499757223515442e-06, + "loss": 0.4966, + "step": 1378 + }, + { + "epoch": 0.7455397368895297, + "grad_norm": 0.3353062868118286, + "learning_rate": 1.8426504117118011e-06, + "loss": 0.4495, + "step": 1379 + }, + { + "epoch": 0.7460803748423139, + "grad_norm": 0.3439493179321289, + "learning_rate": 1.8353363552983382e-06, + "loss": 0.4777, + "step": 1380 + }, + { + "epoch": 0.7466210127950982, + "grad_norm": 0.38241538405418396, + "learning_rate": 1.8280335791817733e-06, + "loss": 0.4726, + "step": 1381 + }, + { + "epoch": 0.7471616507478825, + "grad_norm": 0.3326997458934784, + "learning_rate": 1.8207421093925127e-06, + "loss": 0.4798, + "step": 1382 + }, + { + "epoch": 0.7477022887006668, + "grad_norm": 0.33782845735549927, + "learning_rate": 1.8134619719206624e-06, + "loss": 0.4439, + "step": 1383 + }, + { + "epoch": 0.748242926653451, + "grad_norm": 0.34692955017089844, + "learning_rate": 1.8061931927159377e-06, + "loss": 0.4354, + "step": 1384 + }, + { + "epoch": 0.7487835646062354, + "grad_norm": 0.3597027659416199, + "learning_rate": 1.7989357976875603e-06, + "loss": 0.4989, + "step": 1385 + }, + { + "epoch": 0.7493242025590197, + "grad_norm": 0.3161109983921051, + "learning_rate": 1.7916898127041815e-06, + "loss": 0.4341, + "step": 1386 + }, + { + "epoch": 0.7498648405118039, + "grad_norm": 0.32751956582069397, + "learning_rate": 1.7844552635937784e-06, + "loss": 0.4657, + "step": 1387 + }, + { + "epoch": 0.7504054784645882, + "grad_norm": 0.32368284463882446, + "learning_rate": 1.7772321761435674e-06, + "loss": 0.4705, + "step": 1388 + }, + { + "epoch": 0.7509461164173725, + "grad_norm": 0.3560972511768341, + "learning_rate": 1.7700205760999061e-06, + "loss": 0.4442, + "step": 1389 + }, + { + "epoch": 0.7514867543701568, + "grad_norm": 0.35852131247520447, + "learning_rate": 1.76282048916821e-06, + "loss": 0.4745, + "step": 1390 + }, + { + "epoch": 0.752027392322941, + "grad_norm": 0.35611191391944885, + "learning_rate": 1.7556319410128557e-06, + "loss": 0.4735, + "step": 1391 + }, + { + "epoch": 0.7525680302757254, + "grad_norm": 0.33755627274513245, + "learning_rate": 1.7484549572570913e-06, + "loss": 0.4822, + "step": 1392 + }, + { + "epoch": 0.7531086682285096, + "grad_norm": 0.33882173895835876, + "learning_rate": 1.7412895634829391e-06, + "loss": 0.4906, + "step": 1393 + }, + { + "epoch": 0.7536493061812939, + "grad_norm": 0.3680201768875122, + "learning_rate": 1.7341357852311175e-06, + "loss": 0.5208, + "step": 1394 + }, + { + "epoch": 0.7541899441340782, + "grad_norm": 0.3476288914680481, + "learning_rate": 1.726993648000933e-06, + "loss": 0.4545, + "step": 1395 + }, + { + "epoch": 0.7547305820868625, + "grad_norm": 0.35119006037712097, + "learning_rate": 1.7198631772502057e-06, + "loss": 0.4769, + "step": 1396 + }, + { + "epoch": 0.7552712200396468, + "grad_norm": 0.3414047360420227, + "learning_rate": 1.7127443983951687e-06, + "loss": 0.455, + "step": 1397 + }, + { + "epoch": 0.7558118579924311, + "grad_norm": 0.3647000193595886, + "learning_rate": 1.7056373368103756e-06, + "loss": 0.5124, + "step": 1398 + }, + { + "epoch": 0.7563524959452154, + "grad_norm": 0.34969907999038696, + "learning_rate": 1.6985420178286216e-06, + "loss": 0.4956, + "step": 1399 + }, + { + "epoch": 0.7568931338979996, + "grad_norm": 0.3402901887893677, + "learning_rate": 1.6914584667408408e-06, + "loss": 0.455, + "step": 1400 + }, + { + "epoch": 0.757433771850784, + "grad_norm": 0.36687296628952026, + "learning_rate": 1.6843867087960252e-06, + "loss": 0.4694, + "step": 1401 + }, + { + "epoch": 0.7579744098035682, + "grad_norm": 0.34027478098869324, + "learning_rate": 1.6773267692011242e-06, + "loss": 0.4763, + "step": 1402 + }, + { + "epoch": 0.7585150477563525, + "grad_norm": 0.30456939339637756, + "learning_rate": 1.6702786731209681e-06, + "loss": 0.4545, + "step": 1403 + }, + { + "epoch": 0.7590556857091367, + "grad_norm": 0.32559987902641296, + "learning_rate": 1.6632424456781675e-06, + "loss": 0.484, + "step": 1404 + }, + { + "epoch": 0.7595963236619211, + "grad_norm": 0.3651071786880493, + "learning_rate": 1.6562181119530314e-06, + "loss": 0.4981, + "step": 1405 + }, + { + "epoch": 0.7601369616147053, + "grad_norm": 0.36218157410621643, + "learning_rate": 1.649205696983468e-06, + "loss": 0.4905, + "step": 1406 + }, + { + "epoch": 0.7606775995674896, + "grad_norm": 0.35047683119773865, + "learning_rate": 1.642205225764908e-06, + "loss": 0.4697, + "step": 1407 + }, + { + "epoch": 0.761218237520274, + "grad_norm": 0.3518739342689514, + "learning_rate": 1.635216723250206e-06, + "loss": 0.4665, + "step": 1408 + }, + { + "epoch": 0.7617588754730582, + "grad_norm": 0.3698537349700928, + "learning_rate": 1.6282402143495568e-06, + "loss": 0.4555, + "step": 1409 + }, + { + "epoch": 0.7622995134258425, + "grad_norm": 0.3216371536254883, + "learning_rate": 1.6212757239304e-06, + "loss": 0.4591, + "step": 1410 + }, + { + "epoch": 0.7628401513786268, + "grad_norm": 0.33843740820884705, + "learning_rate": 1.6143232768173428e-06, + "loss": 0.4754, + "step": 1411 + }, + { + "epoch": 0.7633807893314111, + "grad_norm": 0.3162449300289154, + "learning_rate": 1.6073828977920564e-06, + "loss": 0.4683, + "step": 1412 + }, + { + "epoch": 0.7639214272841953, + "grad_norm": 0.3222675025463104, + "learning_rate": 1.6004546115932023e-06, + "loss": 0.4774, + "step": 1413 + }, + { + "epoch": 0.7644620652369797, + "grad_norm": 0.3448139727115631, + "learning_rate": 1.5935384429163376e-06, + "loss": 0.4532, + "step": 1414 + }, + { + "epoch": 0.7650027031897639, + "grad_norm": 0.3369153141975403, + "learning_rate": 1.5866344164138214e-06, + "loss": 0.4624, + "step": 1415 + }, + { + "epoch": 0.7655433411425482, + "grad_norm": 0.3553886115550995, + "learning_rate": 1.5797425566947378e-06, + "loss": 0.4782, + "step": 1416 + }, + { + "epoch": 0.7660839790953325, + "grad_norm": 0.31446409225463867, + "learning_rate": 1.572862888324801e-06, + "loss": 0.4476, + "step": 1417 + }, + { + "epoch": 0.7666246170481168, + "grad_norm": 0.3353008031845093, + "learning_rate": 1.5659954358262724e-06, + "loss": 0.4717, + "step": 1418 + }, + { + "epoch": 0.767165255000901, + "grad_norm": 0.3172055780887604, + "learning_rate": 1.5591402236778647e-06, + "loss": 0.4599, + "step": 1419 + }, + { + "epoch": 0.7677058929536853, + "grad_norm": 0.3034406900405884, + "learning_rate": 1.5522972763146653e-06, + "loss": 0.439, + "step": 1420 + }, + { + "epoch": 0.7682465309064697, + "grad_norm": 0.36666351556777954, + "learning_rate": 1.5454666181280437e-06, + "loss": 0.498, + "step": 1421 + }, + { + "epoch": 0.7687871688592539, + "grad_norm": 0.36715036630630493, + "learning_rate": 1.5386482734655633e-06, + "loss": 0.489, + "step": 1422 + }, + { + "epoch": 0.7693278068120382, + "grad_norm": 0.3549302816390991, + "learning_rate": 1.5318422666308997e-06, + "loss": 0.4685, + "step": 1423 + }, + { + "epoch": 0.7698684447648225, + "grad_norm": 0.3297741413116455, + "learning_rate": 1.5250486218837458e-06, + "loss": 0.493, + "step": 1424 + }, + { + "epoch": 0.7704090827176068, + "grad_norm": 0.3559105396270752, + "learning_rate": 1.5182673634397365e-06, + "loss": 0.4517, + "step": 1425 + }, + { + "epoch": 0.770949720670391, + "grad_norm": 0.3720037639141083, + "learning_rate": 1.5114985154703505e-06, + "loss": 0.4604, + "step": 1426 + }, + { + "epoch": 0.7714903586231754, + "grad_norm": 0.384896844625473, + "learning_rate": 1.5047421021028353e-06, + "loss": 0.4563, + "step": 1427 + }, + { + "epoch": 0.7720309965759596, + "grad_norm": 0.35539141297340393, + "learning_rate": 1.4979981474201106e-06, + "loss": 0.458, + "step": 1428 + }, + { + "epoch": 0.7725716345287439, + "grad_norm": 0.3983554244041443, + "learning_rate": 1.4912666754606914e-06, + "loss": 0.4836, + "step": 1429 + }, + { + "epoch": 0.7731122724815283, + "grad_norm": 0.34799396991729736, + "learning_rate": 1.4845477102185974e-06, + "loss": 0.4661, + "step": 1430 + }, + { + "epoch": 0.7736529104343125, + "grad_norm": 0.3554874360561371, + "learning_rate": 1.4778412756432709e-06, + "loss": 0.479, + "step": 1431 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 0.37535446882247925, + "learning_rate": 1.471147395639484e-06, + "loss": 0.4847, + "step": 1432 + }, + { + "epoch": 0.7747341863398811, + "grad_norm": 0.3827463686466217, + "learning_rate": 1.4644660940672628e-06, + "loss": 0.4806, + "step": 1433 + }, + { + "epoch": 0.7752748242926654, + "grad_norm": 0.3442119359970093, + "learning_rate": 1.457797394741798e-06, + "loss": 0.5032, + "step": 1434 + }, + { + "epoch": 0.7758154622454496, + "grad_norm": 0.35367828607559204, + "learning_rate": 1.451141321433358e-06, + "loss": 0.4948, + "step": 1435 + }, + { + "epoch": 0.7763561001982339, + "grad_norm": 0.3889831304550171, + "learning_rate": 1.4444978978672103e-06, + "loss": 0.4688, + "step": 1436 + }, + { + "epoch": 0.7768967381510182, + "grad_norm": 0.3154084086418152, + "learning_rate": 1.4378671477235268e-06, + "loss": 0.4786, + "step": 1437 + }, + { + "epoch": 0.7774373761038025, + "grad_norm": 0.34436705708503723, + "learning_rate": 1.431249094637311e-06, + "loss": 0.472, + "step": 1438 + }, + { + "epoch": 0.7779780140565867, + "grad_norm": 0.3308955132961273, + "learning_rate": 1.4246437621983057e-06, + "loss": 0.4739, + "step": 1439 + }, + { + "epoch": 0.7785186520093711, + "grad_norm": 0.319321870803833, + "learning_rate": 1.418051173950914e-06, + "loss": 0.4603, + "step": 1440 + }, + { + "epoch": 0.7790592899621553, + "grad_norm": 0.31957656145095825, + "learning_rate": 1.4114713533941082e-06, + "loss": 0.4533, + "step": 1441 + }, + { + "epoch": 0.7795999279149396, + "grad_norm": 0.3338441848754883, + "learning_rate": 1.4049043239813575e-06, + "loss": 0.4805, + "step": 1442 + }, + { + "epoch": 0.780140565867724, + "grad_norm": 0.33430007100105286, + "learning_rate": 1.3983501091205298e-06, + "loss": 0.494, + "step": 1443 + }, + { + "epoch": 0.7806812038205082, + "grad_norm": 0.3184848129749298, + "learning_rate": 1.3918087321738244e-06, + "loss": 0.48, + "step": 1444 + }, + { + "epoch": 0.7812218417732925, + "grad_norm": 0.37600019574165344, + "learning_rate": 1.3852802164576717e-06, + "loss": 0.4767, + "step": 1445 + }, + { + "epoch": 0.7817624797260768, + "grad_norm": 0.3498574197292328, + "learning_rate": 1.3787645852426663e-06, + "loss": 0.4664, + "step": 1446 + }, + { + "epoch": 0.7823031176788611, + "grad_norm": 0.3013286292552948, + "learning_rate": 1.3722618617534727e-06, + "loss": 0.4444, + "step": 1447 + }, + { + "epoch": 0.7828437556316453, + "grad_norm": 0.32576611638069153, + "learning_rate": 1.3657720691687481e-06, + "loss": 0.4616, + "step": 1448 + }, + { + "epoch": 0.7833843935844296, + "grad_norm": 0.3226719796657562, + "learning_rate": 1.3592952306210589e-06, + "loss": 0.4776, + "step": 1449 + }, + { + "epoch": 0.7839250315372139, + "grad_norm": 0.30378836393356323, + "learning_rate": 1.3528313691967926e-06, + "loss": 0.4671, + "step": 1450 + }, + { + "epoch": 0.7844656694899982, + "grad_norm": 0.36785420775413513, + "learning_rate": 1.3463805079360854e-06, + "loss": 0.4758, + "step": 1451 + }, + { + "epoch": 0.7850063074427824, + "grad_norm": 0.332956463098526, + "learning_rate": 1.3399426698327329e-06, + "loss": 0.4767, + "step": 1452 + }, + { + "epoch": 0.7855469453955668, + "grad_norm": 0.34059152007102966, + "learning_rate": 1.3335178778341123e-06, + "loss": 0.4863, + "step": 1453 + }, + { + "epoch": 0.786087583348351, + "grad_norm": 0.3034917712211609, + "learning_rate": 1.3271061548410947e-06, + "loss": 0.4506, + "step": 1454 + }, + { + "epoch": 0.7866282213011353, + "grad_norm": 0.30233845114707947, + "learning_rate": 1.3207075237079702e-06, + "loss": 0.4683, + "step": 1455 + }, + { + "epoch": 0.7871688592539197, + "grad_norm": 0.311905175447464, + "learning_rate": 1.3143220072423647e-06, + "loss": 0.44, + "step": 1456 + }, + { + "epoch": 0.7877094972067039, + "grad_norm": 0.3118400275707245, + "learning_rate": 1.307949628205153e-06, + "loss": 0.4273, + "step": 1457 + }, + { + "epoch": 0.7882501351594882, + "grad_norm": 0.3080877363681793, + "learning_rate": 1.301590409310387e-06, + "loss": 0.4283, + "step": 1458 + }, + { + "epoch": 0.7887907731122725, + "grad_norm": 0.34099653363227844, + "learning_rate": 1.2952443732252058e-06, + "loss": 0.4688, + "step": 1459 + }, + { + "epoch": 0.7893314110650568, + "grad_norm": 0.3410574495792389, + "learning_rate": 1.2889115425697612e-06, + "loss": 0.4751, + "step": 1460 + }, + { + "epoch": 0.789872049017841, + "grad_norm": 0.3065154552459717, + "learning_rate": 1.282591939917136e-06, + "loss": 0.4832, + "step": 1461 + }, + { + "epoch": 0.7904126869706254, + "grad_norm": 0.3287332355976105, + "learning_rate": 1.2762855877932617e-06, + "loss": 0.4662, + "step": 1462 + }, + { + "epoch": 0.7909533249234096, + "grad_norm": 0.3344971239566803, + "learning_rate": 1.269992508676835e-06, + "loss": 0.4654, + "step": 1463 + }, + { + "epoch": 0.7914939628761939, + "grad_norm": 0.3270774781703949, + "learning_rate": 1.2637127249992465e-06, + "loss": 0.456, + "step": 1464 + }, + { + "epoch": 0.7920346008289781, + "grad_norm": 0.31552377343177795, + "learning_rate": 1.257446259144494e-06, + "loss": 0.4917, + "step": 1465 + }, + { + "epoch": 0.7925752387817625, + "grad_norm": 0.3612023591995239, + "learning_rate": 1.2511931334491068e-06, + "loss": 0.5151, + "step": 1466 + }, + { + "epoch": 0.7931158767345468, + "grad_norm": 0.326369047164917, + "learning_rate": 1.2449533702020578e-06, + "loss": 0.4871, + "step": 1467 + }, + { + "epoch": 0.793656514687331, + "grad_norm": 0.32858172059059143, + "learning_rate": 1.238726991644696e-06, + "loss": 0.4776, + "step": 1468 + }, + { + "epoch": 0.7941971526401154, + "grad_norm": 0.30760154128074646, + "learning_rate": 1.232514019970658e-06, + "loss": 0.4504, + "step": 1469 + }, + { + "epoch": 0.7947377905928996, + "grad_norm": 0.3285392224788666, + "learning_rate": 1.2263144773257967e-06, + "loss": 0.4873, + "step": 1470 + }, + { + "epoch": 0.7952784285456839, + "grad_norm": 0.3165012001991272, + "learning_rate": 1.2201283858080903e-06, + "loss": 0.4435, + "step": 1471 + }, + { + "epoch": 0.7958190664984682, + "grad_norm": 0.3612854480743408, + "learning_rate": 1.2139557674675773e-06, + "loss": 0.4748, + "step": 1472 + }, + { + "epoch": 0.7963597044512525, + "grad_norm": 0.3201451003551483, + "learning_rate": 1.2077966443062706e-06, + "loss": 0.499, + "step": 1473 + }, + { + "epoch": 0.7969003424040367, + "grad_norm": 0.3280227780342102, + "learning_rate": 1.2016510382780772e-06, + "loss": 0.4836, + "step": 1474 + }, + { + "epoch": 0.7974409803568211, + "grad_norm": 0.28860074281692505, + "learning_rate": 1.1955189712887272e-06, + "loss": 0.4607, + "step": 1475 + }, + { + "epoch": 0.7979816183096053, + "grad_norm": 0.3293115198612213, + "learning_rate": 1.189400465195687e-06, + "loss": 0.4633, + "step": 1476 + }, + { + "epoch": 0.7985222562623896, + "grad_norm": 0.3143051862716675, + "learning_rate": 1.183295541808089e-06, + "loss": 0.4549, + "step": 1477 + }, + { + "epoch": 0.799062894215174, + "grad_norm": 0.30853116512298584, + "learning_rate": 1.1772042228866493e-06, + "loss": 0.4824, + "step": 1478 + }, + { + "epoch": 0.7996035321679582, + "grad_norm": 0.32995665073394775, + "learning_rate": 1.1711265301435937e-06, + "loss": 0.4844, + "step": 1479 + }, + { + "epoch": 0.8001441701207425, + "grad_norm": 0.3321036100387573, + "learning_rate": 1.165062485242574e-06, + "loss": 0.4591, + "step": 1480 + }, + { + "epoch": 0.8006848080735267, + "grad_norm": 0.29632872343063354, + "learning_rate": 1.159012109798598e-06, + "loss": 0.4732, + "step": 1481 + }, + { + "epoch": 0.8012254460263111, + "grad_norm": 0.29381921887397766, + "learning_rate": 1.1529754253779486e-06, + "loss": 0.4685, + "step": 1482 + }, + { + "epoch": 0.8017660839790953, + "grad_norm": 0.321435809135437, + "learning_rate": 1.1469524534981091e-06, + "loss": 0.4575, + "step": 1483 + }, + { + "epoch": 0.8023067219318796, + "grad_norm": 0.3351998031139374, + "learning_rate": 1.1409432156276805e-06, + "loss": 0.4951, + "step": 1484 + }, + { + "epoch": 0.8028473598846639, + "grad_norm": 0.32599401473999023, + "learning_rate": 1.134947733186315e-06, + "loss": 0.4516, + "step": 1485 + }, + { + "epoch": 0.8033879978374482, + "grad_norm": 0.31513264775276184, + "learning_rate": 1.1289660275446318e-06, + "loss": 0.4816, + "step": 1486 + }, + { + "epoch": 0.8039286357902324, + "grad_norm": 0.3273223042488098, + "learning_rate": 1.1229981200241424e-06, + "loss": 0.4561, + "step": 1487 + }, + { + "epoch": 0.8044692737430168, + "grad_norm": 0.2985652685165405, + "learning_rate": 1.1170440318971788e-06, + "loss": 0.4808, + "step": 1488 + }, + { + "epoch": 0.805009911695801, + "grad_norm": 0.318036288022995, + "learning_rate": 1.1111037843868095e-06, + "loss": 0.4812, + "step": 1489 + }, + { + "epoch": 0.8055505496485853, + "grad_norm": 0.326748788356781, + "learning_rate": 1.1051773986667735e-06, + "loss": 0.4737, + "step": 1490 + }, + { + "epoch": 0.8060911876013697, + "grad_norm": 0.3135136663913727, + "learning_rate": 1.0992648958613961e-06, + "loss": 0.4618, + "step": 1491 + }, + { + "epoch": 0.8066318255541539, + "grad_norm": 0.2945253551006317, + "learning_rate": 1.0933662970455217e-06, + "loss": 0.4769, + "step": 1492 + }, + { + "epoch": 0.8071724635069382, + "grad_norm": 0.3053777813911438, + "learning_rate": 1.0874816232444297e-06, + "loss": 0.4672, + "step": 1493 + }, + { + "epoch": 0.8077131014597225, + "grad_norm": 0.3037196695804596, + "learning_rate": 1.081610895433769e-06, + "loss": 0.4399, + "step": 1494 + }, + { + "epoch": 0.8082537394125068, + "grad_norm": 0.30045562982559204, + "learning_rate": 1.0757541345394768e-06, + "loss": 0.4467, + "step": 1495 + }, + { + "epoch": 0.808794377365291, + "grad_norm": 0.2896043062210083, + "learning_rate": 1.0699113614377065e-06, + "loss": 0.4441, + "step": 1496 + }, + { + "epoch": 0.8093350153180753, + "grad_norm": 0.2978215217590332, + "learning_rate": 1.0640825969547498e-06, + "loss": 0.4531, + "step": 1497 + }, + { + "epoch": 0.8098756532708596, + "grad_norm": 0.32296404242515564, + "learning_rate": 1.058267861866969e-06, + "loss": 0.4619, + "step": 1498 + }, + { + "epoch": 0.8104162912236439, + "grad_norm": 0.32622790336608887, + "learning_rate": 1.0524671769007177e-06, + "loss": 0.4852, + "step": 1499 + }, + { + "epoch": 0.8109569291764281, + "grad_norm": 0.31101804971694946, + "learning_rate": 1.0466805627322685e-06, + "loss": 0.4534, + "step": 1500 + }, + { + "epoch": 0.8114975671292125, + "grad_norm": 0.29956305027008057, + "learning_rate": 1.0409080399877413e-06, + "loss": 0.4594, + "step": 1501 + }, + { + "epoch": 0.8120382050819968, + "grad_norm": 0.320745050907135, + "learning_rate": 1.035149629243023e-06, + "loss": 0.4689, + "step": 1502 + }, + { + "epoch": 0.812578843034781, + "grad_norm": 0.30797460675239563, + "learning_rate": 1.0294053510237028e-06, + "loss": 0.4573, + "step": 1503 + }, + { + "epoch": 0.8131194809875654, + "grad_norm": 0.3377775549888611, + "learning_rate": 1.0236752258049954e-06, + "loss": 0.4752, + "step": 1504 + }, + { + "epoch": 0.8136601189403496, + "grad_norm": 0.3108988106250763, + "learning_rate": 1.017959274011665e-06, + "loss": 0.4693, + "step": 1505 + }, + { + "epoch": 0.8142007568931339, + "grad_norm": 0.33605459332466125, + "learning_rate": 1.0122575160179582e-06, + "loss": 0.5001, + "step": 1506 + }, + { + "epoch": 0.8147413948459182, + "grad_norm": 0.33161184191703796, + "learning_rate": 1.0065699721475253e-06, + "loss": 0.4664, + "step": 1507 + }, + { + "epoch": 0.8152820327987025, + "grad_norm": 0.31771883368492126, + "learning_rate": 1.0008966626733541e-06, + "loss": 0.4783, + "step": 1508 + }, + { + "epoch": 0.8158226707514867, + "grad_norm": 0.3252415657043457, + "learning_rate": 9.95237607817694e-07, + "loss": 0.473, + "step": 1509 + }, + { + "epoch": 0.816363308704271, + "grad_norm": 0.33228546380996704, + "learning_rate": 9.895928277519822e-07, + "loss": 0.4814, + "step": 1510 + }, + { + "epoch": 0.8169039466570553, + "grad_norm": 0.3342330753803253, + "learning_rate": 9.83962342596776e-07, + "loss": 0.4642, + "step": 1511 + }, + { + "epoch": 0.8174445846098396, + "grad_norm": 0.3376791477203369, + "learning_rate": 9.783461724216793e-07, + "loss": 0.4795, + "step": 1512 + }, + { + "epoch": 0.8179852225626238, + "grad_norm": 0.3592437505722046, + "learning_rate": 9.7274433724527e-07, + "loss": 0.4655, + "step": 1513 + }, + { + "epoch": 0.8185258605154082, + "grad_norm": 0.3259372115135193, + "learning_rate": 9.671568570350321e-07, + "loss": 0.4775, + "step": 1514 + }, + { + "epoch": 0.8190664984681925, + "grad_norm": 0.3325469493865967, + "learning_rate": 9.615837517072758e-07, + "loss": 0.476, + "step": 1515 + }, + { + "epoch": 0.8196071364209767, + "grad_norm": 0.2931354343891144, + "learning_rate": 9.560250411270794e-07, + "loss": 0.4563, + "step": 1516 + }, + { + "epoch": 0.8201477743737611, + "grad_norm": 0.320726215839386, + "learning_rate": 9.504807451082088e-07, + "loss": 0.4857, + "step": 1517 + }, + { + "epoch": 0.8206884123265453, + "grad_norm": 0.32242387533187866, + "learning_rate": 9.449508834130517e-07, + "loss": 0.4836, + "step": 1518 + }, + { + "epoch": 0.8212290502793296, + "grad_norm": 0.3245270252227783, + "learning_rate": 9.394354757525404e-07, + "loss": 0.4671, + "step": 1519 + }, + { + "epoch": 0.8217696882321139, + "grad_norm": 0.3327704668045044, + "learning_rate": 9.339345417860918e-07, + "loss": 0.485, + "step": 1520 + }, + { + "epoch": 0.8223103261848982, + "grad_norm": 0.30964258313179016, + "learning_rate": 9.284481011215318e-07, + "loss": 0.4814, + "step": 1521 + }, + { + "epoch": 0.8228509641376824, + "grad_norm": 0.3185735046863556, + "learning_rate": 9.229761733150205e-07, + "loss": 0.4372, + "step": 1522 + }, + { + "epoch": 0.8233916020904668, + "grad_norm": 0.33612483739852905, + "learning_rate": 9.175187778709937e-07, + "loss": 0.4454, + "step": 1523 + }, + { + "epoch": 0.823932240043251, + "grad_norm": 0.31606701016426086, + "learning_rate": 9.120759342420821e-07, + "loss": 0.4578, + "step": 1524 + }, + { + "epoch": 0.8244728779960353, + "grad_norm": 0.31590044498443604, + "learning_rate": 9.066476618290515e-07, + "loss": 0.4715, + "step": 1525 + }, + { + "epoch": 0.8250135159488196, + "grad_norm": 0.2985929548740387, + "learning_rate": 9.012339799807263e-07, + "loss": 0.4654, + "step": 1526 + }, + { + "epoch": 0.8255541539016039, + "grad_norm": 0.35035449266433716, + "learning_rate": 8.95834907993926e-07, + "loss": 0.4557, + "step": 1527 + }, + { + "epoch": 0.8260947918543882, + "grad_norm": 0.3264734148979187, + "learning_rate": 8.904504651133905e-07, + "loss": 0.4634, + "step": 1528 + }, + { + "epoch": 0.8266354298071724, + "grad_norm": 0.32273826003074646, + "learning_rate": 8.850806705317183e-07, + "loss": 0.4608, + "step": 1529 + }, + { + "epoch": 0.8271760677599568, + "grad_norm": 0.29928576946258545, + "learning_rate": 8.797255433892926e-07, + "loss": 0.4553, + "step": 1530 + }, + { + "epoch": 0.827716705712741, + "grad_norm": 0.30989155173301697, + "learning_rate": 8.743851027742172e-07, + "loss": 0.4553, + "step": 1531 + }, + { + "epoch": 0.8282573436655253, + "grad_norm": 0.337877482175827, + "learning_rate": 8.690593677222431e-07, + "loss": 0.442, + "step": 1532 + }, + { + "epoch": 0.8287979816183096, + "grad_norm": 0.34234437346458435, + "learning_rate": 8.637483572167077e-07, + "loss": 0.4639, + "step": 1533 + }, + { + "epoch": 0.8293386195710939, + "grad_norm": 0.3071967363357544, + "learning_rate": 8.584520901884608e-07, + "loss": 0.4784, + "step": 1534 + }, + { + "epoch": 0.8298792575238781, + "grad_norm": 0.32178065180778503, + "learning_rate": 8.531705855158024e-07, + "loss": 0.4483, + "step": 1535 + }, + { + "epoch": 0.8304198954766625, + "grad_norm": 0.33801355957984924, + "learning_rate": 8.479038620244089e-07, + "loss": 0.4655, + "step": 1536 + }, + { + "epoch": 0.8309605334294468, + "grad_norm": 0.3588986396789551, + "learning_rate": 8.426519384872733e-07, + "loss": 0.4633, + "step": 1537 + }, + { + "epoch": 0.831501171382231, + "grad_norm": 0.3052517771720886, + "learning_rate": 8.374148336246352e-07, + "loss": 0.4667, + "step": 1538 + }, + { + "epoch": 0.8320418093350154, + "grad_norm": 0.32133039832115173, + "learning_rate": 8.321925661039088e-07, + "loss": 0.4773, + "step": 1539 + }, + { + "epoch": 0.8325824472877996, + "grad_norm": 0.3321249783039093, + "learning_rate": 8.269851545396279e-07, + "loss": 0.4829, + "step": 1540 + }, + { + "epoch": 0.8331230852405839, + "grad_norm": 0.3377304673194885, + "learning_rate": 8.217926174933665e-07, + "loss": 0.4632, + "step": 1541 + }, + { + "epoch": 0.8336637231933681, + "grad_norm": 0.3176397383213043, + "learning_rate": 8.166149734736845e-07, + "loss": 0.4814, + "step": 1542 + }, + { + "epoch": 0.8342043611461525, + "grad_norm": 0.33456897735595703, + "learning_rate": 8.114522409360531e-07, + "loss": 0.4667, + "step": 1543 + }, + { + "epoch": 0.8347449990989367, + "grad_norm": 0.29794400930404663, + "learning_rate": 8.063044382827945e-07, + "loss": 0.4635, + "step": 1544 + }, + { + "epoch": 0.835285637051721, + "grad_norm": 0.3277740478515625, + "learning_rate": 8.011715838630107e-07, + "loss": 0.4805, + "step": 1545 + }, + { + "epoch": 0.8358262750045053, + "grad_norm": 0.3014332950115204, + "learning_rate": 7.960536959725252e-07, + "loss": 0.4587, + "step": 1546 + }, + { + "epoch": 0.8363669129572896, + "grad_norm": 0.3108069598674774, + "learning_rate": 7.909507928538107e-07, + "loss": 0.452, + "step": 1547 + }, + { + "epoch": 0.8369075509100738, + "grad_norm": 0.3274773061275482, + "learning_rate": 7.858628926959311e-07, + "loss": 0.4805, + "step": 1548 + }, + { + "epoch": 0.8374481888628582, + "grad_norm": 0.31421011686325073, + "learning_rate": 7.807900136344676e-07, + "loss": 0.4582, + "step": 1549 + }, + { + "epoch": 0.8379888268156425, + "grad_norm": 0.30205944180488586, + "learning_rate": 7.757321737514645e-07, + "loss": 0.4902, + "step": 1550 + }, + { + "epoch": 0.8385294647684267, + "grad_norm": 0.33638957142829895, + "learning_rate": 7.706893910753571e-07, + "loss": 0.4946, + "step": 1551 + }, + { + "epoch": 0.8390701027212111, + "grad_norm": 0.3180740475654602, + "learning_rate": 7.656616835809122e-07, + "loss": 0.4895, + "step": 1552 + }, + { + "epoch": 0.8396107406739953, + "grad_norm": 0.3404751121997833, + "learning_rate": 7.606490691891577e-07, + "loss": 0.4654, + "step": 1553 + }, + { + "epoch": 0.8401513786267796, + "grad_norm": 0.32554861903190613, + "learning_rate": 7.556515657673274e-07, + "loss": 0.4784, + "step": 1554 + }, + { + "epoch": 0.8406920165795639, + "grad_norm": 0.30992451310157776, + "learning_rate": 7.506691911287883e-07, + "loss": 0.4746, + "step": 1555 + }, + { + "epoch": 0.8412326545323482, + "grad_norm": 0.2857951521873474, + "learning_rate": 7.457019630329848e-07, + "loss": 0.435, + "step": 1556 + }, + { + "epoch": 0.8417732924851324, + "grad_norm": 0.3287476599216461, + "learning_rate": 7.407498991853729e-07, + "loss": 0.4949, + "step": 1557 + }, + { + "epoch": 0.8423139304379167, + "grad_norm": 0.3094945251941681, + "learning_rate": 7.358130172373523e-07, + "loss": 0.4399, + "step": 1558 + }, + { + "epoch": 0.842854568390701, + "grad_norm": 0.33573803305625916, + "learning_rate": 7.308913347862112e-07, + "loss": 0.4661, + "step": 1559 + }, + { + "epoch": 0.8433952063434853, + "grad_norm": 0.32553064823150635, + "learning_rate": 7.259848693750582e-07, + "loss": 0.4646, + "step": 1560 + }, + { + "epoch": 0.8439358442962696, + "grad_norm": 0.3170788288116455, + "learning_rate": 7.210936384927631e-07, + "loss": 0.4743, + "step": 1561 + }, + { + "epoch": 0.8444764822490539, + "grad_norm": 0.2907802164554596, + "learning_rate": 7.162176595738895e-07, + "loss": 0.4486, + "step": 1562 + }, + { + "epoch": 0.8450171202018382, + "grad_norm": 0.31983017921447754, + "learning_rate": 7.113569499986401e-07, + "loss": 0.4635, + "step": 1563 + }, + { + "epoch": 0.8455577581546224, + "grad_norm": 0.2787870168685913, + "learning_rate": 7.065115270927875e-07, + "loss": 0.4483, + "step": 1564 + }, + { + "epoch": 0.8460983961074068, + "grad_norm": 0.31309157609939575, + "learning_rate": 7.01681408127618e-07, + "loss": 0.4546, + "step": 1565 + }, + { + "epoch": 0.846639034060191, + "grad_norm": 0.29454049468040466, + "learning_rate": 6.968666103198679e-07, + "loss": 0.4706, + "step": 1566 + }, + { + "epoch": 0.8471796720129753, + "grad_norm": 0.31175103783607483, + "learning_rate": 6.920671508316584e-07, + "loss": 0.4844, + "step": 1567 + }, + { + "epoch": 0.8477203099657596, + "grad_norm": 0.3265729546546936, + "learning_rate": 6.872830467704417e-07, + "loss": 0.4614, + "step": 1568 + }, + { + "epoch": 0.8482609479185439, + "grad_norm": 0.310225248336792, + "learning_rate": 6.825143151889358e-07, + "loss": 0.4786, + "step": 1569 + }, + { + "epoch": 0.8488015858713281, + "grad_norm": 0.31501367688179016, + "learning_rate": 6.777609730850615e-07, + "loss": 0.4904, + "step": 1570 + }, + { + "epoch": 0.8493422238241125, + "grad_norm": 0.31551438570022583, + "learning_rate": 6.730230374018886e-07, + "loss": 0.4804, + "step": 1571 + }, + { + "epoch": 0.8498828617768968, + "grad_norm": 0.2776893079280853, + "learning_rate": 6.683005250275676e-07, + "loss": 0.4579, + "step": 1572 + }, + { + "epoch": 0.850423499729681, + "grad_norm": 0.29054975509643555, + "learning_rate": 6.635934527952747e-07, + "loss": 0.4565, + "step": 1573 + }, + { + "epoch": 0.8509641376824653, + "grad_norm": 0.30911266803741455, + "learning_rate": 6.589018374831529e-07, + "loss": 0.4731, + "step": 1574 + }, + { + "epoch": 0.8515047756352496, + "grad_norm": 0.3118568956851959, + "learning_rate": 6.542256958142456e-07, + "loss": 0.4669, + "step": 1575 + }, + { + "epoch": 0.8520454135880339, + "grad_norm": 0.29929646849632263, + "learning_rate": 6.495650444564433e-07, + "loss": 0.4549, + "step": 1576 + }, + { + "epoch": 0.8525860515408181, + "grad_norm": 0.3033123314380646, + "learning_rate": 6.449199000224221e-07, + "loss": 0.4539, + "step": 1577 + }, + { + "epoch": 0.8531266894936025, + "grad_norm": 0.30189841985702515, + "learning_rate": 6.402902790695842e-07, + "loss": 0.4844, + "step": 1578 + }, + { + "epoch": 0.8536673274463867, + "grad_norm": 0.3111442029476166, + "learning_rate": 6.356761980999998e-07, + "loss": 0.4744, + "step": 1579 + }, + { + "epoch": 0.854207965399171, + "grad_norm": 0.2925800681114197, + "learning_rate": 6.310776735603452e-07, + "loss": 0.4781, + "step": 1580 + }, + { + "epoch": 0.8547486033519553, + "grad_norm": 0.2876686453819275, + "learning_rate": 6.264947218418482e-07, + "loss": 0.4422, + "step": 1581 + }, + { + "epoch": 0.8552892413047396, + "grad_norm": 0.29705557227134705, + "learning_rate": 6.219273592802278e-07, + "loss": 0.4675, + "step": 1582 + }, + { + "epoch": 0.8558298792575239, + "grad_norm": 0.2854503095149994, + "learning_rate": 6.173756021556377e-07, + "loss": 0.4537, + "step": 1583 + }, + { + "epoch": 0.8563705172103082, + "grad_norm": 0.31599560379981995, + "learning_rate": 6.128394666926035e-07, + "loss": 0.5045, + "step": 1584 + }, + { + "epoch": 0.8569111551630925, + "grad_norm": 0.33695438504219055, + "learning_rate": 6.083189690599712e-07, + "loss": 0.4945, + "step": 1585 + }, + { + "epoch": 0.8574517931158767, + "grad_norm": 0.3071337342262268, + "learning_rate": 6.038141253708429e-07, + "loss": 0.4765, + "step": 1586 + }, + { + "epoch": 0.857992431068661, + "grad_norm": 0.3113190829753876, + "learning_rate": 5.993249516825278e-07, + "loss": 0.4864, + "step": 1587 + }, + { + "epoch": 0.8585330690214453, + "grad_norm": 0.26970189809799194, + "learning_rate": 5.948514639964748e-07, + "loss": 0.4399, + "step": 1588 + }, + { + "epoch": 0.8590737069742296, + "grad_norm": 0.3173854947090149, + "learning_rate": 5.903936782582253e-07, + "loss": 0.4575, + "step": 1589 + }, + { + "epoch": 0.8596143449270138, + "grad_norm": 0.2729853689670563, + "learning_rate": 5.859516103573492e-07, + "loss": 0.4714, + "step": 1590 + }, + { + "epoch": 0.8601549828797982, + "grad_norm": 0.3082450330257416, + "learning_rate": 5.815252761273927e-07, + "loss": 0.4524, + "step": 1591 + }, + { + "epoch": 0.8606956208325824, + "grad_norm": 0.3227364718914032, + "learning_rate": 5.771146913458187e-07, + "loss": 0.4739, + "step": 1592 + }, + { + "epoch": 0.8612362587853667, + "grad_norm": 0.3056773841381073, + "learning_rate": 5.727198717339511e-07, + "loss": 0.4696, + "step": 1593 + }, + { + "epoch": 0.861776896738151, + "grad_norm": 0.3208467364311218, + "learning_rate": 5.683408329569212e-07, + "loss": 0.4797, + "step": 1594 + }, + { + "epoch": 0.8623175346909353, + "grad_norm": 0.312785267829895, + "learning_rate": 5.6397759062361e-07, + "loss": 0.4487, + "step": 1595 + }, + { + "epoch": 0.8628581726437196, + "grad_norm": 0.31125733256340027, + "learning_rate": 5.596301602865938e-07, + "loss": 0.4564, + "step": 1596 + }, + { + "epoch": 0.8633988105965039, + "grad_norm": 0.2956276834011078, + "learning_rate": 5.55298557442085e-07, + "loss": 0.4526, + "step": 1597 + }, + { + "epoch": 0.8639394485492882, + "grad_norm": 0.30352863669395447, + "learning_rate": 5.509827975298809e-07, + "loss": 0.4756, + "step": 1598 + }, + { + "epoch": 0.8644800865020724, + "grad_norm": 0.30768123269081116, + "learning_rate": 5.466828959333087e-07, + "loss": 0.4793, + "step": 1599 + }, + { + "epoch": 0.8650207244548568, + "grad_norm": 0.32290348410606384, + "learning_rate": 5.423988679791686e-07, + "loss": 0.4746, + "step": 1600 + }, + { + "epoch": 0.865561362407641, + "grad_norm": 0.32862550020217896, + "learning_rate": 5.381307289376786e-07, + "loss": 0.4604, + "step": 1601 + }, + { + "epoch": 0.8661020003604253, + "grad_norm": 0.32285216450691223, + "learning_rate": 5.338784940224239e-07, + "loss": 0.4757, + "step": 1602 + }, + { + "epoch": 0.8666426383132095, + "grad_norm": 0.31145691871643066, + "learning_rate": 5.296421783902972e-07, + "loss": 0.4591, + "step": 1603 + }, + { + "epoch": 0.8671832762659939, + "grad_norm": 0.28722888231277466, + "learning_rate": 5.254217971414499e-07, + "loss": 0.4738, + "step": 1604 + }, + { + "epoch": 0.8677239142187781, + "grad_norm": 0.32165801525115967, + "learning_rate": 5.212173653192365e-07, + "loss": 0.498, + "step": 1605 + }, + { + "epoch": 0.8682645521715624, + "grad_norm": 0.3288798928260803, + "learning_rate": 5.170288979101573e-07, + "loss": 0.4666, + "step": 1606 + }, + { + "epoch": 0.8688051901243468, + "grad_norm": 0.3006567656993866, + "learning_rate": 5.128564098438116e-07, + "loss": 0.4625, + "step": 1607 + }, + { + "epoch": 0.869345828077131, + "grad_norm": 0.3048226833343506, + "learning_rate": 5.086999159928391e-07, + "loss": 0.4746, + "step": 1608 + }, + { + "epoch": 0.8698864660299153, + "grad_norm": 0.2663028836250305, + "learning_rate": 5.045594311728708e-07, + "loss": 0.4526, + "step": 1609 + }, + { + "epoch": 0.8704271039826996, + "grad_norm": 0.35595017671585083, + "learning_rate": 5.00434970142471e-07, + "loss": 0.4808, + "step": 1610 + }, + { + "epoch": 0.8709677419354839, + "grad_norm": 0.33726397156715393, + "learning_rate": 4.963265476030916e-07, + "loss": 0.4678, + "step": 1611 + }, + { + "epoch": 0.8715083798882681, + "grad_norm": 0.29663407802581787, + "learning_rate": 4.922341781990131e-07, + "loss": 0.4845, + "step": 1612 + }, + { + "epoch": 0.8720490178410525, + "grad_norm": 0.31134849786758423, + "learning_rate": 4.881578765172979e-07, + "loss": 0.5056, + "step": 1613 + }, + { + "epoch": 0.8725896557938367, + "grad_norm": 0.3079214096069336, + "learning_rate": 4.840976570877332e-07, + "loss": 0.4556, + "step": 1614 + }, + { + "epoch": 0.873130293746621, + "grad_norm": 0.3087759017944336, + "learning_rate": 4.800535343827834e-07, + "loss": 0.4654, + "step": 1615 + }, + { + "epoch": 0.8736709316994054, + "grad_norm": 0.3534053862094879, + "learning_rate": 4.7602552281753647e-07, + "loss": 0.4545, + "step": 1616 + }, + { + "epoch": 0.8742115696521896, + "grad_norm": 0.30727800726890564, + "learning_rate": 4.720136367496536e-07, + "loss": 0.4578, + "step": 1617 + }, + { + "epoch": 0.8747522076049739, + "grad_norm": 0.3113386631011963, + "learning_rate": 4.6801789047931535e-07, + "loss": 0.4606, + "step": 1618 + }, + { + "epoch": 0.8752928455577581, + "grad_norm": 0.30020859837532043, + "learning_rate": 4.6403829824917643e-07, + "loss": 0.4627, + "step": 1619 + }, + { + "epoch": 0.8758334835105425, + "grad_norm": 0.3038908541202545, + "learning_rate": 4.6007487424430565e-07, + "loss": 0.4521, + "step": 1620 + }, + { + "epoch": 0.8763741214633267, + "grad_norm": 0.28542560338974, + "learning_rate": 4.5612763259214653e-07, + "loss": 0.4602, + "step": 1621 + }, + { + "epoch": 0.876914759416111, + "grad_norm": 0.29688113927841187, + "learning_rate": 4.52196587362459e-07, + "loss": 0.461, + "step": 1622 + }, + { + "epoch": 0.8774553973688953, + "grad_norm": 0.30379223823547363, + "learning_rate": 4.4828175256727056e-07, + "loss": 0.4739, + "step": 1623 + }, + { + "epoch": 0.8779960353216796, + "grad_norm": 0.27639734745025635, + "learning_rate": 4.4438314216082856e-07, + "loss": 0.4268, + "step": 1624 + }, + { + "epoch": 0.8785366732744638, + "grad_norm": 0.2714707553386688, + "learning_rate": 4.405007700395497e-07, + "loss": 0.4475, + "step": 1625 + }, + { + "epoch": 0.8790773112272482, + "grad_norm": 0.31498268246650696, + "learning_rate": 4.3663465004196995e-07, + "loss": 0.4623, + "step": 1626 + }, + { + "epoch": 0.8796179491800324, + "grad_norm": 0.29611799120903015, + "learning_rate": 4.3278479594869307e-07, + "loss": 0.4435, + "step": 1627 + }, + { + "epoch": 0.8801585871328167, + "grad_norm": 0.3016403615474701, + "learning_rate": 4.289512214823466e-07, + "loss": 0.4643, + "step": 1628 + }, + { + "epoch": 0.8806992250856011, + "grad_norm": 0.3306654393672943, + "learning_rate": 4.251339403075294e-07, + "loss": 0.4731, + "step": 1629 + }, + { + "epoch": 0.8812398630383853, + "grad_norm": 0.2894127368927002, + "learning_rate": 4.21332966030763e-07, + "loss": 0.4743, + "step": 1630 + }, + { + "epoch": 0.8817805009911696, + "grad_norm": 0.296880304813385, + "learning_rate": 4.175483122004448e-07, + "loss": 0.4616, + "step": 1631 + }, + { + "epoch": 0.8823211389439539, + "grad_norm": 0.2968302369117737, + "learning_rate": 4.1377999230679646e-07, + "loss": 0.4585, + "step": 1632 + }, + { + "epoch": 0.8828617768967382, + "grad_norm": 0.3259296119213104, + "learning_rate": 4.100280197818207e-07, + "loss": 0.4732, + "step": 1633 + }, + { + "epoch": 0.8834024148495224, + "grad_norm": 0.301943838596344, + "learning_rate": 4.062924079992492e-07, + "loss": 0.4769, + "step": 1634 + }, + { + "epoch": 0.8839430528023067, + "grad_norm": 0.31166207790374756, + "learning_rate": 4.025731702744978e-07, + "loss": 0.4528, + "step": 1635 + }, + { + "epoch": 0.884483690755091, + "grad_norm": 0.3266367018222809, + "learning_rate": 3.9887031986461546e-07, + "loss": 0.4766, + "step": 1636 + }, + { + "epoch": 0.8850243287078753, + "grad_norm": 0.29861709475517273, + "learning_rate": 3.9518386996824196e-07, + "loss": 0.4517, + "step": 1637 + }, + { + "epoch": 0.8855649666606595, + "grad_norm": 0.2750012278556824, + "learning_rate": 3.9151383372555696e-07, + "loss": 0.4419, + "step": 1638 + }, + { + "epoch": 0.8861056046134439, + "grad_norm": 0.29816916584968567, + "learning_rate": 3.8786022421823497e-07, + "loss": 0.4679, + "step": 1639 + }, + { + "epoch": 0.8866462425662281, + "grad_norm": 0.29293686151504517, + "learning_rate": 3.84223054469397e-07, + "loss": 0.4898, + "step": 1640 + }, + { + "epoch": 0.8871868805190124, + "grad_norm": 0.3161197900772095, + "learning_rate": 3.8060233744356634e-07, + "loss": 0.4795, + "step": 1641 + }, + { + "epoch": 0.8877275184717968, + "grad_norm": 0.2860853970050812, + "learning_rate": 3.76998086046621e-07, + "loss": 0.4689, + "step": 1642 + }, + { + "epoch": 0.888268156424581, + "grad_norm": 0.29909902811050415, + "learning_rate": 3.7341031312574827e-07, + "loss": 0.4548, + "step": 1643 + }, + { + "epoch": 0.8888087943773653, + "grad_norm": 0.3215901553630829, + "learning_rate": 3.6983903146939894e-07, + "loss": 0.4874, + "step": 1644 + }, + { + "epoch": 0.8893494323301496, + "grad_norm": 0.2928643524646759, + "learning_rate": 3.6628425380723975e-07, + "loss": 0.4469, + "step": 1645 + }, + { + "epoch": 0.8898900702829339, + "grad_norm": 0.3444850444793701, + "learning_rate": 3.627459928101118e-07, + "loss": 0.4671, + "step": 1646 + }, + { + "epoch": 0.8904307082357181, + "grad_norm": 0.30591514706611633, + "learning_rate": 3.5922426108998154e-07, + "loss": 0.4835, + "step": 1647 + }, + { + "epoch": 0.8909713461885024, + "grad_norm": 0.33051443099975586, + "learning_rate": 3.5571907119990033e-07, + "loss": 0.453, + "step": 1648 + }, + { + "epoch": 0.8915119841412867, + "grad_norm": 0.29916903376579285, + "learning_rate": 3.522304356339529e-07, + "loss": 0.4541, + "step": 1649 + }, + { + "epoch": 0.892052622094071, + "grad_norm": 0.31293609738349915, + "learning_rate": 3.4875836682722096e-07, + "loss": 0.4572, + "step": 1650 + }, + { + "epoch": 0.8925932600468552, + "grad_norm": 0.3154950439929962, + "learning_rate": 3.45302877155731e-07, + "loss": 0.4799, + "step": 1651 + }, + { + "epoch": 0.8931338979996396, + "grad_norm": 0.2948947846889496, + "learning_rate": 3.418639789364175e-07, + "loss": 0.4914, + "step": 1652 + }, + { + "epoch": 0.8936745359524239, + "grad_norm": 0.32319676876068115, + "learning_rate": 3.3844168442707213e-07, + "loss": 0.4846, + "step": 1653 + }, + { + "epoch": 0.8942151739052081, + "grad_norm": 0.30142778158187866, + "learning_rate": 3.350360058263058e-07, + "loss": 0.4656, + "step": 1654 + }, + { + "epoch": 0.8947558118579925, + "grad_norm": 0.31285157799720764, + "learning_rate": 3.3164695527350244e-07, + "loss": 0.4622, + "step": 1655 + }, + { + "epoch": 0.8952964498107767, + "grad_norm": 0.3004051744937897, + "learning_rate": 3.2827454484877564e-07, + "loss": 0.4715, + "step": 1656 + }, + { + "epoch": 0.895837087763561, + "grad_norm": 0.3092837631702423, + "learning_rate": 3.2491878657292643e-07, + "loss": 0.486, + "step": 1657 + }, + { + "epoch": 0.8963777257163453, + "grad_norm": 0.31189343333244324, + "learning_rate": 3.215796924073983e-07, + "loss": 0.4275, + "step": 1658 + }, + { + "epoch": 0.8969183636691296, + "grad_norm": 0.2800220549106598, + "learning_rate": 3.1825727425423837e-07, + "loss": 0.4642, + "step": 1659 + }, + { + "epoch": 0.8974590016219138, + "grad_norm": 0.31216147541999817, + "learning_rate": 3.149515439560524e-07, + "loss": 0.4516, + "step": 1660 + }, + { + "epoch": 0.8979996395746982, + "grad_norm": 0.30997899174690247, + "learning_rate": 3.116625132959633e-07, + "loss": 0.4794, + "step": 1661 + }, + { + "epoch": 0.8985402775274824, + "grad_norm": 0.31042930483818054, + "learning_rate": 3.083901939975675e-07, + "loss": 0.4699, + "step": 1662 + }, + { + "epoch": 0.8990809154802667, + "grad_norm": 0.3064749836921692, + "learning_rate": 3.051345977248954e-07, + "loss": 0.4366, + "step": 1663 + }, + { + "epoch": 0.899621553433051, + "grad_norm": 0.27594202756881714, + "learning_rate": 3.018957360823699e-07, + "loss": 0.4572, + "step": 1664 + }, + { + "epoch": 0.9001621913858353, + "grad_norm": 0.2951033115386963, + "learning_rate": 2.986736206147628e-07, + "loss": 0.4475, + "step": 1665 + }, + { + "epoch": 0.9007028293386196, + "grad_norm": 0.29796889424324036, + "learning_rate": 2.9546826280715536e-07, + "loss": 0.4708, + "step": 1666 + }, + { + "epoch": 0.9012434672914038, + "grad_norm": 0.32082822918891907, + "learning_rate": 2.9227967408489653e-07, + "loss": 0.4723, + "step": 1667 + }, + { + "epoch": 0.9017841052441882, + "grad_norm": 0.31180939078330994, + "learning_rate": 2.891078658135632e-07, + "loss": 0.4659, + "step": 1668 + }, + { + "epoch": 0.9023247431969724, + "grad_norm": 0.30132412910461426, + "learning_rate": 2.859528492989194e-07, + "loss": 0.4867, + "step": 1669 + }, + { + "epoch": 0.9028653811497567, + "grad_norm": 0.3208564221858978, + "learning_rate": 2.828146357868755e-07, + "loss": 0.4589, + "step": 1670 + }, + { + "epoch": 0.903406019102541, + "grad_norm": 0.2799170911312103, + "learning_rate": 2.796932364634475e-07, + "loss": 0.4563, + "step": 1671 + }, + { + "epoch": 0.9039466570553253, + "grad_norm": 0.2884426712989807, + "learning_rate": 2.765886624547182e-07, + "loss": 0.4782, + "step": 1672 + }, + { + "epoch": 0.9044872950081095, + "grad_norm": 0.28514596819877625, + "learning_rate": 2.7350092482679836e-07, + "loss": 0.4549, + "step": 1673 + }, + { + "epoch": 0.9050279329608939, + "grad_norm": 0.3076411485671997, + "learning_rate": 2.7043003458578685e-07, + "loss": 0.4737, + "step": 1674 + }, + { + "epoch": 0.9055685709136782, + "grad_norm": 0.34531399607658386, + "learning_rate": 2.673760026777272e-07, + "loss": 0.479, + "step": 1675 + }, + { + "epoch": 0.9061092088664624, + "grad_norm": 0.32933852076530457, + "learning_rate": 2.6433883998857657e-07, + "loss": 0.4612, + "step": 1676 + }, + { + "epoch": 0.9066498468192468, + "grad_norm": 0.2830871045589447, + "learning_rate": 2.61318557344159e-07, + "loss": 0.4248, + "step": 1677 + }, + { + "epoch": 0.907190484772031, + "grad_norm": 0.33129066228866577, + "learning_rate": 2.5831516551013405e-07, + "loss": 0.4706, + "step": 1678 + }, + { + "epoch": 0.9077311227248153, + "grad_norm": 0.3337078392505646, + "learning_rate": 2.553286751919509e-07, + "loss": 0.4877, + "step": 1679 + }, + { + "epoch": 0.9082717606775995, + "grad_norm": 0.3119748830795288, + "learning_rate": 2.523590970348166e-07, + "loss": 0.4523, + "step": 1680 + }, + { + "epoch": 0.9088123986303839, + "grad_norm": 0.29456672072410583, + "learning_rate": 2.4940644162365523e-07, + "loss": 0.4758, + "step": 1681 + }, + { + "epoch": 0.9093530365831681, + "grad_norm": 0.3045862317085266, + "learning_rate": 2.46470719483069e-07, + "loss": 0.4765, + "step": 1682 + }, + { + "epoch": 0.9098936745359524, + "grad_norm": 0.2948693037033081, + "learning_rate": 2.435519410773052e-07, + "loss": 0.4615, + "step": 1683 + }, + { + "epoch": 0.9104343124887367, + "grad_norm": 0.30374738574028015, + "learning_rate": 2.4065011681021266e-07, + "loss": 0.4846, + "step": 1684 + }, + { + "epoch": 0.910974950441521, + "grad_norm": 0.3169853985309601, + "learning_rate": 2.3776525702520925e-07, + "loss": 0.4746, + "step": 1685 + }, + { + "epoch": 0.9115155883943052, + "grad_norm": 0.297050803899765, + "learning_rate": 2.3489737200524498e-07, + "loss": 0.4785, + "step": 1686 + }, + { + "epoch": 0.9120562263470896, + "grad_norm": 0.3017227351665497, + "learning_rate": 2.3204647197276387e-07, + "loss": 0.4769, + "step": 1687 + }, + { + "epoch": 0.9125968642998739, + "grad_norm": 0.3048754334449768, + "learning_rate": 2.29212567089665e-07, + "loss": 0.4772, + "step": 1688 + }, + { + "epoch": 0.9131375022526581, + "grad_norm": 0.3351805508136749, + "learning_rate": 2.2639566745727203e-07, + "loss": 0.4579, + "step": 1689 + }, + { + "epoch": 0.9136781402054425, + "grad_norm": 0.32740268111228943, + "learning_rate": 2.2359578311629272e-07, + "loss": 0.4945, + "step": 1690 + }, + { + "epoch": 0.9142187781582267, + "grad_norm": 0.3174039125442505, + "learning_rate": 2.2081292404678655e-07, + "loss": 0.4647, + "step": 1691 + }, + { + "epoch": 0.914759416111011, + "grad_norm": 0.30359378457069397, + "learning_rate": 2.1804710016812337e-07, + "loss": 0.469, + "step": 1692 + }, + { + "epoch": 0.9153000540637953, + "grad_norm": 0.29310646653175354, + "learning_rate": 2.152983213389559e-07, + "loss": 0.4867, + "step": 1693 + }, + { + "epoch": 0.9158406920165796, + "grad_norm": 0.3022826910018921, + "learning_rate": 2.1256659735717777e-07, + "loss": 0.4552, + "step": 1694 + }, + { + "epoch": 0.9163813299693638, + "grad_norm": 0.2924525737762451, + "learning_rate": 2.0985193795989345e-07, + "loss": 0.4371, + "step": 1695 + }, + { + "epoch": 0.9169219679221481, + "grad_norm": 0.3041470944881439, + "learning_rate": 2.071543528233805e-07, + "loss": 0.5015, + "step": 1696 + }, + { + "epoch": 0.9174626058749324, + "grad_norm": 0.299441397190094, + "learning_rate": 2.0447385156305565e-07, + "loss": 0.4557, + "step": 1697 + }, + { + "epoch": 0.9180032438277167, + "grad_norm": 0.29456791281700134, + "learning_rate": 2.0181044373344172e-07, + "loss": 0.4744, + "step": 1698 + }, + { + "epoch": 0.918543881780501, + "grad_norm": 0.31137824058532715, + "learning_rate": 1.9916413882813235e-07, + "loss": 0.4564, + "step": 1699 + }, + { + "epoch": 0.9190845197332853, + "grad_norm": 0.29632532596588135, + "learning_rate": 1.9653494627975888e-07, + "loss": 0.4657, + "step": 1700 + }, + { + "epoch": 0.9196251576860696, + "grad_norm": 0.28566405177116394, + "learning_rate": 1.9392287545995536e-07, + "loss": 0.4983, + "step": 1701 + }, + { + "epoch": 0.9201657956388538, + "grad_norm": 0.31092479825019836, + "learning_rate": 1.913279356793285e-07, + "loss": 0.4584, + "step": 1702 + }, + { + "epoch": 0.9207064335916382, + "grad_norm": 0.2855190634727478, + "learning_rate": 1.8875013618742e-07, + "loss": 0.4444, + "step": 1703 + }, + { + "epoch": 0.9212470715444224, + "grad_norm": 0.29378846287727356, + "learning_rate": 1.8618948617267764e-07, + "loss": 0.4697, + "step": 1704 + }, + { + "epoch": 0.9217877094972067, + "grad_norm": 0.30169007182121277, + "learning_rate": 1.8364599476241862e-07, + "loss": 0.4616, + "step": 1705 + }, + { + "epoch": 0.922328347449991, + "grad_norm": 0.3052110970020294, + "learning_rate": 1.8111967102280082e-07, + "loss": 0.4854, + "step": 1706 + }, + { + "epoch": 0.9228689854027753, + "grad_norm": 0.3236134350299835, + "learning_rate": 1.7861052395878764e-07, + "loss": 0.4437, + "step": 1707 + }, + { + "epoch": 0.9234096233555595, + "grad_norm": 0.29827356338500977, + "learning_rate": 1.7611856251411818e-07, + "loss": 0.4458, + "step": 1708 + }, + { + "epoch": 0.9239502613083439, + "grad_norm": 0.2997409701347351, + "learning_rate": 1.7364379557127387e-07, + "loss": 0.4891, + "step": 1709 + }, + { + "epoch": 0.9244908992611282, + "grad_norm": 0.2794349491596222, + "learning_rate": 1.711862319514457e-07, + "loss": 0.4808, + "step": 1710 + }, + { + "epoch": 0.9250315372139124, + "grad_norm": 0.299027681350708, + "learning_rate": 1.6874588041450535e-07, + "loss": 0.4743, + "step": 1711 + }, + { + "epoch": 0.9255721751666967, + "grad_norm": 0.291069895029068, + "learning_rate": 1.6632274965897365e-07, + "loss": 0.4791, + "step": 1712 + }, + { + "epoch": 0.926112813119481, + "grad_norm": 0.3018452227115631, + "learning_rate": 1.639168483219872e-07, + "loss": 0.479, + "step": 1713 + }, + { + "epoch": 0.9266534510722653, + "grad_norm": 0.2994323670864105, + "learning_rate": 1.6152818497926993e-07, + "loss": 0.4573, + "step": 1714 + }, + { + "epoch": 0.9271940890250495, + "grad_norm": 0.27843761444091797, + "learning_rate": 1.5915676814510173e-07, + "loss": 0.4585, + "step": 1715 + }, + { + "epoch": 0.9277347269778339, + "grad_norm": 0.30757150053977966, + "learning_rate": 1.5680260627228772e-07, + "loss": 0.4708, + "step": 1716 + }, + { + "epoch": 0.9282753649306181, + "grad_norm": 0.31326669454574585, + "learning_rate": 1.5446570775212944e-07, + "loss": 0.4671, + "step": 1717 + }, + { + "epoch": 0.9288160028834024, + "grad_norm": 0.31248635053634644, + "learning_rate": 1.5214608091439265e-07, + "loss": 0.4639, + "step": 1718 + }, + { + "epoch": 0.9293566408361867, + "grad_norm": 0.29233023524284363, + "learning_rate": 1.4984373402728014e-07, + "loss": 0.4691, + "step": 1719 + }, + { + "epoch": 0.929897278788971, + "grad_norm": 0.32531052827835083, + "learning_rate": 1.4755867529740064e-07, + "loss": 0.5093, + "step": 1720 + }, + { + "epoch": 0.9304379167417552, + "grad_norm": 0.35300230979919434, + "learning_rate": 1.4529091286973994e-07, + "loss": 0.488, + "step": 1721 + }, + { + "epoch": 0.9309785546945396, + "grad_norm": 0.3052767813205719, + "learning_rate": 1.4304045482763263e-07, + "loss": 0.4674, + "step": 1722 + }, + { + "epoch": 0.9315191926473239, + "grad_norm": 0.2952291667461395, + "learning_rate": 1.408073091927309e-07, + "loss": 0.4572, + "step": 1723 + }, + { + "epoch": 0.9320598306001081, + "grad_norm": 0.2960074841976166, + "learning_rate": 1.3859148392498023e-07, + "loss": 0.464, + "step": 1724 + }, + { + "epoch": 0.9326004685528924, + "grad_norm": 0.28832143545150757, + "learning_rate": 1.3639298692258606e-07, + "loss": 0.4699, + "step": 1725 + }, + { + "epoch": 0.9331411065056767, + "grad_norm": 0.29948270320892334, + "learning_rate": 1.342118260219899e-07, + "loss": 0.4722, + "step": 1726 + }, + { + "epoch": 0.933681744458461, + "grad_norm": 0.28257155418395996, + "learning_rate": 1.320480089978382e-07, + "loss": 0.4768, + "step": 1727 + }, + { + "epoch": 0.9342223824112452, + "grad_norm": 0.2848031520843506, + "learning_rate": 1.2990154356295636e-07, + "loss": 0.453, + "step": 1728 + }, + { + "epoch": 0.9347630203640296, + "grad_norm": 0.29781103134155273, + "learning_rate": 1.2777243736832202e-07, + "loss": 0.4798, + "step": 1729 + }, + { + "epoch": 0.9353036583168138, + "grad_norm": 0.30655109882354736, + "learning_rate": 1.2566069800303393e-07, + "loss": 0.4732, + "step": 1730 + }, + { + "epoch": 0.9358442962695981, + "grad_norm": 0.30582815408706665, + "learning_rate": 1.2356633299429044e-07, + "loss": 0.4601, + "step": 1731 + }, + { + "epoch": 0.9363849342223824, + "grad_norm": 0.2904635965824127, + "learning_rate": 1.2148934980735772e-07, + "loss": 0.4922, + "step": 1732 + }, + { + "epoch": 0.9369255721751667, + "grad_norm": 0.30475184321403503, + "learning_rate": 1.1942975584554594e-07, + "loss": 0.4747, + "step": 1733 + }, + { + "epoch": 0.937466210127951, + "grad_norm": 0.29005348682403564, + "learning_rate": 1.1738755845018323e-07, + "loss": 0.4601, + "step": 1734 + }, + { + "epoch": 0.9380068480807353, + "grad_norm": 0.29658186435699463, + "learning_rate": 1.1536276490058784e-07, + "loss": 0.4417, + "step": 1735 + }, + { + "epoch": 0.9385474860335196, + "grad_norm": 0.2936530113220215, + "learning_rate": 1.1335538241404099e-07, + "loss": 0.4629, + "step": 1736 + }, + { + "epoch": 0.9390881239863038, + "grad_norm": 0.3370441198348999, + "learning_rate": 1.1136541814576574e-07, + "loss": 0.4861, + "step": 1737 + }, + { + "epoch": 0.9396287619390882, + "grad_norm": 0.2809031009674072, + "learning_rate": 1.0939287918889652e-07, + "loss": 0.4676, + "step": 1738 + }, + { + "epoch": 0.9401693998918724, + "grad_norm": 0.29533958435058594, + "learning_rate": 1.0743777257445853e-07, + "loss": 0.4947, + "step": 1739 + }, + { + "epoch": 0.9407100378446567, + "grad_norm": 0.2839764654636383, + "learning_rate": 1.055001052713378e-07, + "loss": 0.4924, + "step": 1740 + }, + { + "epoch": 0.9412506757974409, + "grad_norm": 0.31136807799339294, + "learning_rate": 1.0357988418625897e-07, + "loss": 0.467, + "step": 1741 + }, + { + "epoch": 0.9417913137502253, + "grad_norm": 0.3142279386520386, + "learning_rate": 1.0167711616376196e-07, + "loss": 0.4629, + "step": 1742 + }, + { + "epoch": 0.9423319517030095, + "grad_norm": 0.27500444650650024, + "learning_rate": 9.979180798617538e-08, + "loss": 0.4686, + "step": 1743 + }, + { + "epoch": 0.9428725896557938, + "grad_norm": 0.3138892650604248, + "learning_rate": 9.792396637359203e-08, + "loss": 0.4638, + "step": 1744 + }, + { + "epoch": 0.9434132276085782, + "grad_norm": 0.3079321086406708, + "learning_rate": 9.607359798384785e-08, + "loss": 0.4491, + "step": 1745 + }, + { + "epoch": 0.9439538655613624, + "grad_norm": 0.29620155692100525, + "learning_rate": 9.424070941249419e-08, + "loss": 0.4593, + "step": 1746 + }, + { + "epoch": 0.9444945035141467, + "grad_norm": 0.32409822940826416, + "learning_rate": 9.242530719277776e-08, + "loss": 0.4248, + "step": 1747 + }, + { + "epoch": 0.945035141466931, + "grad_norm": 0.2947947680950165, + "learning_rate": 9.062739779561624e-08, + "loss": 0.4769, + "step": 1748 + }, + { + "epoch": 0.9455757794197153, + "grad_norm": 0.28672558069229126, + "learning_rate": 8.884698762957334e-08, + "loss": 0.4895, + "step": 1749 + }, + { + "epoch": 0.9461164173724995, + "grad_norm": 0.2779400050640106, + "learning_rate": 8.708408304083927e-08, + "loss": 0.4694, + "step": 1750 + }, + { + "epoch": 0.9466570553252839, + "grad_norm": 0.3198831379413605, + "learning_rate": 8.53386903132053e-08, + "loss": 0.4591, + "step": 1751 + }, + { + "epoch": 0.9471976932780681, + "grad_norm": 0.3286699652671814, + "learning_rate": 8.361081566804318e-08, + "loss": 0.4965, + "step": 1752 + }, + { + "epoch": 0.9477383312308524, + "grad_norm": 0.2861204743385315, + "learning_rate": 8.190046526428241e-08, + "loss": 0.4464, + "step": 1753 + }, + { + "epoch": 0.9482789691836367, + "grad_norm": 0.28904879093170166, + "learning_rate": 8.020764519838686e-08, + "loss": 0.4665, + "step": 1754 + }, + { + "epoch": 0.948819607136421, + "grad_norm": 0.308715283870697, + "learning_rate": 7.853236150433541e-08, + "loss": 0.4604, + "step": 1755 + }, + { + "epoch": 0.9493602450892052, + "grad_norm": 0.2850310206413269, + "learning_rate": 7.687462015360026e-08, + "loss": 0.4536, + "step": 1756 + }, + { + "epoch": 0.9499008830419895, + "grad_norm": 0.32719627022743225, + "learning_rate": 7.523442705512196e-08, + "loss": 0.4873, + "step": 1757 + }, + { + "epoch": 0.9504415209947739, + "grad_norm": 0.29945439100265503, + "learning_rate": 7.36117880552939e-08, + "loss": 0.4827, + "step": 1758 + }, + { + "epoch": 0.9509821589475581, + "grad_norm": 0.31181010603904724, + "learning_rate": 7.200670893793727e-08, + "loss": 0.4435, + "step": 1759 + }, + { + "epoch": 0.9515227969003424, + "grad_norm": 0.29942837357521057, + "learning_rate": 7.041919542428221e-08, + "loss": 0.4747, + "step": 1760 + }, + { + "epoch": 0.9520634348531267, + "grad_norm": 0.3144790232181549, + "learning_rate": 6.884925317294678e-08, + "loss": 0.481, + "step": 1761 + }, + { + "epoch": 0.952604072805911, + "grad_norm": 0.321038156747818, + "learning_rate": 6.72968877799185e-08, + "loss": 0.4891, + "step": 1762 + }, + { + "epoch": 0.9531447107586952, + "grad_norm": 0.29290351271629333, + "learning_rate": 6.576210477853007e-08, + "loss": 0.4755, + "step": 1763 + }, + { + "epoch": 0.9536853487114796, + "grad_norm": 0.31532248854637146, + "learning_rate": 6.424490963944597e-08, + "loss": 0.4916, + "step": 1764 + }, + { + "epoch": 0.9542259866642638, + "grad_norm": 0.29246556758880615, + "learning_rate": 6.274530777063747e-08, + "loss": 0.4765, + "step": 1765 + }, + { + "epoch": 0.9547666246170481, + "grad_norm": 0.28569135069847107, + "learning_rate": 6.126330451736495e-08, + "loss": 0.4446, + "step": 1766 + }, + { + "epoch": 0.9553072625698324, + "grad_norm": 0.2923862338066101, + "learning_rate": 5.97989051621617e-08, + "loss": 0.4793, + "step": 1767 + }, + { + "epoch": 0.9558479005226167, + "grad_norm": 0.27991148829460144, + "learning_rate": 5.835211492481063e-08, + "loss": 0.4756, + "step": 1768 + }, + { + "epoch": 0.956388538475401, + "grad_norm": 0.30270445346832275, + "learning_rate": 5.6922938962329364e-08, + "loss": 0.4498, + "step": 1769 + }, + { + "epoch": 0.9569291764281853, + "grad_norm": 0.29934385418891907, + "learning_rate": 5.551138236894793e-08, + "loss": 0.4677, + "step": 1770 + }, + { + "epoch": 0.9574698143809696, + "grad_norm": 0.2954845428466797, + "learning_rate": 5.411745017609493e-08, + "loss": 0.4762, + "step": 1771 + }, + { + "epoch": 0.9580104523337538, + "grad_norm": 0.29608166217803955, + "learning_rate": 5.274114735237812e-08, + "loss": 0.4584, + "step": 1772 + }, + { + "epoch": 0.9585510902865381, + "grad_norm": 0.3020345866680145, + "learning_rate": 5.138247880356384e-08, + "loss": 0.4686, + "step": 1773 + }, + { + "epoch": 0.9590917282393224, + "grad_norm": 0.3264354169368744, + "learning_rate": 5.004144937256372e-08, + "loss": 0.4808, + "step": 1774 + }, + { + "epoch": 0.9596323661921067, + "grad_norm": 0.29560211300849915, + "learning_rate": 4.8718063839414683e-08, + "loss": 0.4731, + "step": 1775 + }, + { + "epoch": 0.9601730041448909, + "grad_norm": 0.28680717945098877, + "learning_rate": 4.741232692126396e-08, + "loss": 0.4502, + "step": 1776 + }, + { + "epoch": 0.9607136420976753, + "grad_norm": 0.30334657430648804, + "learning_rate": 4.612424327234966e-08, + "loss": 0.4602, + "step": 1777 + }, + { + "epoch": 0.9612542800504595, + "grad_norm": 0.28252384066581726, + "learning_rate": 4.485381748398576e-08, + "loss": 0.4527, + "step": 1778 + }, + { + "epoch": 0.9617949180032438, + "grad_norm": 0.3101551830768585, + "learning_rate": 4.360105408454718e-08, + "loss": 0.4967, + "step": 1779 + }, + { + "epoch": 0.9623355559560282, + "grad_norm": 0.2830714285373688, + "learning_rate": 4.236595753944972e-08, + "loss": 0.4613, + "step": 1780 + }, + { + "epoch": 0.9628761939088124, + "grad_norm": 0.298048198223114, + "learning_rate": 4.114853225113902e-08, + "loss": 0.459, + "step": 1781 + }, + { + "epoch": 0.9634168318615967, + "grad_norm": 0.30758899450302124, + "learning_rate": 3.994878255907053e-08, + "loss": 0.4355, + "step": 1782 + }, + { + "epoch": 0.963957469814381, + "grad_norm": 0.31392624974250793, + "learning_rate": 3.8766712739696786e-08, + "loss": 0.434, + "step": 1783 + }, + { + "epoch": 0.9644981077671653, + "grad_norm": 0.3108672797679901, + "learning_rate": 3.7602327006450166e-08, + "loss": 0.4323, + "step": 1784 + }, + { + "epoch": 0.9650387457199495, + "grad_norm": 0.3078506290912628, + "learning_rate": 3.645562950973014e-08, + "loss": 0.4671, + "step": 1785 + }, + { + "epoch": 0.9655793836727338, + "grad_norm": 0.29575785994529724, + "learning_rate": 3.5326624336886604e-08, + "loss": 0.4592, + "step": 1786 + }, + { + "epoch": 0.9661200216255181, + "grad_norm": 0.2994788885116577, + "learning_rate": 3.4215315512206584e-08, + "loss": 0.4864, + "step": 1787 + }, + { + "epoch": 0.9666606595783024, + "grad_norm": 0.2929299771785736, + "learning_rate": 3.312170699689865e-08, + "loss": 0.4609, + "step": 1788 + }, + { + "epoch": 0.9672012975310866, + "grad_norm": 0.31516411900520325, + "learning_rate": 3.204580268907909e-08, + "loss": 0.4905, + "step": 1789 + }, + { + "epoch": 0.967741935483871, + "grad_norm": 0.31366169452667236, + "learning_rate": 3.0987606423759644e-08, + "loss": 0.4793, + "step": 1790 + }, + { + "epoch": 0.9682825734366552, + "grad_norm": 0.30144020915031433, + "learning_rate": 2.9947121972832e-08, + "loss": 0.4619, + "step": 1791 + }, + { + "epoch": 0.9688232113894395, + "grad_norm": 0.2976379692554474, + "learning_rate": 2.8924353045054475e-08, + "loss": 0.4555, + "step": 1792 + }, + { + "epoch": 0.9693638493422239, + "grad_norm": 0.3140285909175873, + "learning_rate": 2.7919303286039202e-08, + "loss": 0.4592, + "step": 1793 + }, + { + "epoch": 0.9699044872950081, + "grad_norm": 0.28508275747299194, + "learning_rate": 2.693197627823996e-08, + "loss": 0.4784, + "step": 1794 + }, + { + "epoch": 0.9704451252477924, + "grad_norm": 0.29108139872550964, + "learning_rate": 2.5962375540937724e-08, + "loss": 0.4857, + "step": 1795 + }, + { + "epoch": 0.9709857632005767, + "grad_norm": 0.3323630094528198, + "learning_rate": 2.5010504530229574e-08, + "loss": 0.4616, + "step": 1796 + }, + { + "epoch": 0.971526401153361, + "grad_norm": 0.31090638041496277, + "learning_rate": 2.4076366639015914e-08, + "loss": 0.4782, + "step": 1797 + }, + { + "epoch": 0.9720670391061452, + "grad_norm": 0.31540656089782715, + "learning_rate": 2.3159965196987156e-08, + "loss": 0.4851, + "step": 1798 + }, + { + "epoch": 0.9726076770589296, + "grad_norm": 0.283552885055542, + "learning_rate": 2.2261303470614282e-08, + "loss": 0.4473, + "step": 1799 + }, + { + "epoch": 0.9731483150117138, + "grad_norm": 0.2891945540904999, + "learning_rate": 2.1380384663135523e-08, + "loss": 0.4574, + "step": 1800 + }, + { + "epoch": 0.9736889529644981, + "grad_norm": 0.2970227897167206, + "learning_rate": 2.0517211914545254e-08, + "loss": 0.4785, + "step": 1801 + }, + { + "epoch": 0.9742295909172823, + "grad_norm": 0.30604010820388794, + "learning_rate": 1.967178830158234e-08, + "loss": 0.4464, + "step": 1802 + }, + { + "epoch": 0.9747702288700667, + "grad_norm": 0.29695698618888855, + "learning_rate": 1.8844116837719582e-08, + "loss": 0.498, + "step": 1803 + }, + { + "epoch": 0.975310866822851, + "grad_norm": 0.29107484221458435, + "learning_rate": 1.803420047315485e-08, + "loss": 0.4752, + "step": 1804 + }, + { + "epoch": 0.9758515047756352, + "grad_norm": 0.2813703119754791, + "learning_rate": 1.724204209479663e-08, + "loss": 0.4509, + "step": 1805 + }, + { + "epoch": 0.9763921427284196, + "grad_norm": 0.2851557731628418, + "learning_rate": 1.646764452625682e-08, + "loss": 0.4704, + "step": 1806 + }, + { + "epoch": 0.9769327806812038, + "grad_norm": 0.2960273325443268, + "learning_rate": 1.5711010527839633e-08, + "loss": 0.4505, + "step": 1807 + }, + { + "epoch": 0.9774734186339881, + "grad_norm": 0.3089202642440796, + "learning_rate": 1.4972142796532696e-08, + "loss": 0.4747, + "step": 1808 + }, + { + "epoch": 0.9780140565867724, + "grad_norm": 0.30507782101631165, + "learning_rate": 1.4251043965994304e-08, + "loss": 0.4496, + "step": 1809 + }, + { + "epoch": 0.9785546945395567, + "grad_norm": 0.29044288396835327, + "learning_rate": 1.3547716606548967e-08, + "loss": 0.4645, + "step": 1810 + }, + { + "epoch": 0.9790953324923409, + "grad_norm": 0.2851908802986145, + "learning_rate": 1.2862163225174084e-08, + "loss": 0.4587, + "step": 1811 + }, + { + "epoch": 0.9796359704451253, + "grad_norm": 0.2960631251335144, + "learning_rate": 1.2194386265492742e-08, + "loss": 0.4429, + "step": 1812 + }, + { + "epoch": 0.9801766083979095, + "grad_norm": 0.3134407103061676, + "learning_rate": 1.1544388107765924e-08, + "loss": 0.4452, + "step": 1813 + }, + { + "epoch": 0.9807172463506938, + "grad_norm": 0.29251259565353394, + "learning_rate": 1.0912171068880318e-08, + "loss": 0.4647, + "step": 1814 + }, + { + "epoch": 0.9812578843034782, + "grad_norm": 0.3051975667476654, + "learning_rate": 1.029773740234552e-08, + "loss": 0.4604, + "step": 1815 + }, + { + "epoch": 0.9817985222562624, + "grad_norm": 0.31170353293418884, + "learning_rate": 9.701089298281285e-09, + "loss": 0.453, + "step": 1816 + }, + { + "epoch": 0.9823391602090467, + "grad_norm": 0.2902314066886902, + "learning_rate": 9.12222888341252e-09, + "loss": 0.4533, + "step": 1817 + }, + { + "epoch": 0.9828797981618309, + "grad_norm": 0.3045901656150818, + "learning_rate": 8.561158221060406e-09, + "loss": 0.4514, + "step": 1818 + }, + { + "epoch": 0.9834204361146153, + "grad_norm": 0.29368510842323303, + "learning_rate": 8.017879311134624e-09, + "loss": 0.4808, + "step": 1819 + }, + { + "epoch": 0.9839610740673995, + "grad_norm": 0.27795591950416565, + "learning_rate": 7.492394090128364e-09, + "loss": 0.4974, + "step": 1820 + }, + { + "epoch": 0.9845017120201838, + "grad_norm": 0.29451608657836914, + "learning_rate": 6.98470443110888e-09, + "loss": 0.4874, + "step": 1821 + }, + { + "epoch": 0.9850423499729681, + "grad_norm": 0.29039186239242554, + "learning_rate": 6.4948121437125035e-09, + "loss": 0.4587, + "step": 1822 + }, + { + "epoch": 0.9855829879257524, + "grad_norm": 0.29998692870140076, + "learning_rate": 6.022718974137976e-09, + "loss": 0.4895, + "step": 1823 + }, + { + "epoch": 0.9861236258785366, + "grad_norm": 0.27220648527145386, + "learning_rate": 5.568426605139232e-09, + "loss": 0.475, + "step": 1824 + }, + { + "epoch": 0.986664263831321, + "grad_norm": 0.3113320767879486, + "learning_rate": 5.131936656020409e-09, + "loss": 0.4675, + "step": 1825 + }, + { + "epoch": 0.9872049017841052, + "grad_norm": 0.2876143157482147, + "learning_rate": 4.713250682629733e-09, + "loss": 0.445, + "step": 1826 + }, + { + "epoch": 0.9877455397368895, + "grad_norm": 0.2922397553920746, + "learning_rate": 4.312370177353975e-09, + "loss": 0.4666, + "step": 1827 + }, + { + "epoch": 0.9882861776896739, + "grad_norm": 0.3193817436695099, + "learning_rate": 3.929296569112895e-09, + "loss": 0.4823, + "step": 1828 + }, + { + "epoch": 0.9888268156424581, + "grad_norm": 0.30332955718040466, + "learning_rate": 3.5640312233548024e-09, + "loss": 0.4637, + "step": 1829 + }, + { + "epoch": 0.9893674535952424, + "grad_norm": 0.28622302412986755, + "learning_rate": 3.2165754420510063e-09, + "loss": 0.4491, + "step": 1830 + }, + { + "epoch": 0.9899080915480267, + "grad_norm": 0.2920747399330139, + "learning_rate": 2.886930463691928e-09, + "loss": 0.4712, + "step": 1831 + }, + { + "epoch": 0.990448729500811, + "grad_norm": 0.2950655221939087, + "learning_rate": 2.5750974632809955e-09, + "loss": 0.4571, + "step": 1832 + }, + { + "epoch": 0.9909893674535952, + "grad_norm": 0.30665168166160583, + "learning_rate": 2.2810775523329775e-09, + "loss": 0.4675, + "step": 1833 + }, + { + "epoch": 0.9915300054063795, + "grad_norm": 0.30462467670440674, + "learning_rate": 2.0048717788684335e-09, + "loss": 0.4618, + "step": 1834 + }, + { + "epoch": 0.9920706433591638, + "grad_norm": 0.28570374846458435, + "learning_rate": 1.746481127409827e-09, + "loss": 0.4415, + "step": 1835 + }, + { + "epoch": 0.9926112813119481, + "grad_norm": 0.33271685242652893, + "learning_rate": 1.5059065189787502e-09, + "loss": 0.4603, + "step": 1836 + }, + { + "epoch": 0.9931519192647323, + "grad_norm": 0.3160949647426605, + "learning_rate": 1.2831488110920386e-09, + "loss": 0.4551, + "step": 1837 + }, + { + "epoch": 0.9936925572175167, + "grad_norm": 0.2587985694408417, + "learning_rate": 1.07820879775955e-09, + "loss": 0.4744, + "step": 1838 + }, + { + "epoch": 0.994233195170301, + "grad_norm": 0.32415860891342163, + "learning_rate": 8.910872094802792e-10, + "loss": 0.4525, + "step": 1839 + }, + { + "epoch": 0.9947738331230852, + "grad_norm": 0.27735623717308044, + "learning_rate": 7.217847132401367e-10, + "loss": 0.4508, + "step": 1840 + }, + { + "epoch": 0.9953144710758696, + "grad_norm": 0.2976240813732147, + "learning_rate": 5.703019125102849e-10, + "loss": 0.5082, + "step": 1841 + }, + { + "epoch": 0.9958551090286538, + "grad_norm": 0.28493812680244446, + "learning_rate": 4.3663934724436086e-10, + "loss": 0.4465, + "step": 1842 + }, + { + "epoch": 0.9963957469814381, + "grad_norm": 0.3043738603591919, + "learning_rate": 3.20797493876257e-10, + "loss": 0.4566, + "step": 1843 + }, + { + "epoch": 0.9969363849342224, + "grad_norm": 0.3110690116882324, + "learning_rate": 2.227767653190105e-10, + "loss": 0.5034, + "step": 1844 + }, + { + "epoch": 0.9974770228870067, + "grad_norm": 0.29455140233039856, + "learning_rate": 1.4257751096202755e-10, + "loss": 0.4548, + "step": 1845 + }, + { + "epoch": 0.9980176608397909, + "grad_norm": 0.2990035116672516, + "learning_rate": 8.020001667330412e-11, + "loss": 0.4927, + "step": 1846 + }, + { + "epoch": 0.9985582987925753, + "grad_norm": 0.29310280084609985, + "learning_rate": 3.564450479387471e-11, + "loss": 0.4643, + "step": 1847 + }, + { + "epoch": 0.9990989367453595, + "grad_norm": 0.2967851161956787, + "learning_rate": 8.911134139477639e-12, + "loss": 0.4731, + "step": 1848 + }, + { + "epoch": 0.9996395746981438, + "grad_norm": 0.3127877712249756, + "learning_rate": 0.0, + "loss": 0.4589, + "step": 1849 + }, + { + "epoch": 0.9996395746981438, + "step": 1849, + "total_flos": 2827754950098944.0, + "train_loss": 0.4939856140145487, + "train_runtime": 65758.0763, + "train_samples_per_second": 2.7, + "train_steps_per_second": 0.028 + } + ], + "logging_steps": 1.0, + "max_steps": 1849, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2827754950098944.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}