{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.091220068415051, "eval_steps": 500, "global_step": 900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004561003420752566, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 2.0969, "step": 1 }, { "epoch": 0.009122006841505131, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 2.1202, "step": 2 }, { "epoch": 0.013683010262257697, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 1.9863, "step": 3 }, { "epoch": 0.018244013683010263, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 2.1056, "step": 4 }, { "epoch": 0.02280501710376283, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 2.1682, "step": 5 }, { "epoch": 0.027366020524515394, "grad_norm": 126.99530029296875, "learning_rate": 0.0, "loss": 2.16, "step": 6 }, { "epoch": 0.03192702394526796, "grad_norm": 136.298583984375, "learning_rate": 3.0303030303030305e-06, "loss": 2.1097, "step": 7 }, { "epoch": 0.036488027366020526, "grad_norm": 118.9027328491211, "learning_rate": 6.060606060606061e-06, "loss": 2.0874, "step": 8 }, { "epoch": 0.04104903078677309, "grad_norm": 95.98336791992188, "learning_rate": 9.090909090909091e-06, "loss": 2.0198, "step": 9 }, { "epoch": 0.04561003420752566, "grad_norm": 91.16958618164062, "learning_rate": 1.2121212121212122e-05, "loss": 1.9522, "step": 10 }, { "epoch": 0.05017103762827822, "grad_norm": 66.85772705078125, "learning_rate": 1.5151515151515153e-05, "loss": 1.6778, "step": 11 }, { "epoch": 0.05473204104903079, "grad_norm": 53.472843170166016, "learning_rate": 1.8181818181818182e-05, "loss": 1.4714, "step": 12 }, { "epoch": 0.059293044469783354, "grad_norm": 49.19029235839844, "learning_rate": 2.1212121212121215e-05, "loss": 1.294, "step": 13 }, { "epoch": 0.06385404789053592, "grad_norm": 50.0140266418457, "learning_rate": 2.4242424242424244e-05, "loss": 1.4804, "step": 14 }, { "epoch": 0.06841505131128849, "grad_norm": 46.8694953918457, "learning_rate": 2.7272727272727273e-05, "loss": 1.1335, "step": 15 }, { "epoch": 0.07297605473204105, "grad_norm": 43.30156326293945, "learning_rate": 3.0303030303030306e-05, "loss": 1.2229, "step": 16 }, { "epoch": 0.07753705815279362, "grad_norm": 51.07203674316406, "learning_rate": 3.3333333333333335e-05, "loss": 1.2547, "step": 17 }, { "epoch": 0.08209806157354618, "grad_norm": 62.249114990234375, "learning_rate": 3.6363636363636364e-05, "loss": 1.0567, "step": 18 }, { "epoch": 0.08665906499429875, "grad_norm": 47.2119026184082, "learning_rate": 3.939393939393939e-05, "loss": 0.9491, "step": 19 }, { "epoch": 0.09122006841505131, "grad_norm": 42.218753814697266, "learning_rate": 4.242424242424243e-05, "loss": 0.8794, "step": 20 }, { "epoch": 0.09578107183580388, "grad_norm": 42.218753814697266, "learning_rate": 4.545454545454546e-05, "loss": 1.1536, "step": 21 }, { "epoch": 0.10034207525655645, "grad_norm": 44.523101806640625, "learning_rate": 4.545454545454546e-05, "loss": 0.7646, "step": 22 }, { "epoch": 0.10490307867730901, "grad_norm": 63.81179428100586, "learning_rate": 4.848484848484849e-05, "loss": 0.9237, "step": 23 }, { "epoch": 0.10946408209806158, "grad_norm": 46.185333251953125, "learning_rate": 5.151515151515152e-05, "loss": 0.9475, "step": 24 }, { "epoch": 0.11402508551881414, "grad_norm": 50.09680938720703, "learning_rate": 5.4545454545454546e-05, "loss": 0.6559, "step": 25 }, { "epoch": 0.11858608893956671, "grad_norm": 57.93541717529297, "learning_rate": 5.757575757575758e-05, "loss": 0.9655, "step": 26 }, { "epoch": 0.12314709236031927, "grad_norm": 44.12418746948242, "learning_rate": 6.060606060606061e-05, "loss": 0.991, "step": 27 }, { "epoch": 0.12770809578107184, "grad_norm": 55.63926315307617, "learning_rate": 6.363636363636364e-05, "loss": 0.9469, "step": 28 }, { "epoch": 0.1322690992018244, "grad_norm": 52.04874038696289, "learning_rate": 6.666666666666667e-05, "loss": 0.765, "step": 29 }, { "epoch": 0.13683010262257697, "grad_norm": 78.61589813232422, "learning_rate": 6.96969696969697e-05, "loss": 0.8077, "step": 30 }, { "epoch": 0.14139110604332952, "grad_norm": 78.61589813232422, "learning_rate": 7.272727272727273e-05, "loss": 0.6946, "step": 31 }, { "epoch": 0.1459521094640821, "grad_norm": 45.660404205322266, "learning_rate": 7.272727272727273e-05, "loss": 0.8566, "step": 32 }, { "epoch": 0.15051311288483465, "grad_norm": 45.660404205322266, "learning_rate": 7.575757575757576e-05, "loss": 0.9645, "step": 33 }, { "epoch": 0.15507411630558723, "grad_norm": 45.660404205322266, "learning_rate": 7.575757575757576e-05, "loss": 0.8577, "step": 34 }, { "epoch": 0.15963511972633979, "grad_norm": 44.082706451416016, "learning_rate": 7.575757575757576e-05, "loss": 0.6715, "step": 35 }, { "epoch": 0.16419612314709237, "grad_norm": 723.3299560546875, "learning_rate": 7.878787878787879e-05, "loss": 0.9595, "step": 36 }, { "epoch": 0.16875712656784492, "grad_norm": 102.72968292236328, "learning_rate": 8.181818181818183e-05, "loss": 0.8705, "step": 37 }, { "epoch": 0.1733181299885975, "grad_norm": 51.577972412109375, "learning_rate": 8.484848484848486e-05, "loss": 0.9124, "step": 38 }, { "epoch": 0.17787913340935005, "grad_norm": 79.64832305908203, "learning_rate": 8.787878787878789e-05, "loss": 0.8608, "step": 39 }, { "epoch": 0.18244013683010263, "grad_norm": 74.03942108154297, "learning_rate": 9.090909090909092e-05, "loss": 0.7678, "step": 40 }, { "epoch": 0.18700114025085518, "grad_norm": 75.6192855834961, "learning_rate": 9.393939393939395e-05, "loss": 0.8841, "step": 41 }, { "epoch": 0.19156214367160776, "grad_norm": 151.26239013671875, "learning_rate": 9.696969696969698e-05, "loss": 0.6354, "step": 42 }, { "epoch": 0.1961231470923603, "grad_norm": 63.19050598144531, "learning_rate": 0.0001, "loss": 1.0635, "step": 43 }, { "epoch": 0.2006841505131129, "grad_norm": 69.78765869140625, "learning_rate": 9.999978327420663e-05, "loss": 0.5772, "step": 44 }, { "epoch": 0.20524515393386544, "grad_norm": 74.76192474365234, "learning_rate": 9.99991330987053e-05, "loss": 0.8419, "step": 45 }, { "epoch": 0.20980615735461802, "grad_norm": 65.3372802734375, "learning_rate": 9.999804947913241e-05, "loss": 0.7743, "step": 46 }, { "epoch": 0.21436716077537057, "grad_norm": 84.05085754394531, "learning_rate": 9.999653242488188e-05, "loss": 0.8496, "step": 47 }, { "epoch": 0.21892816419612315, "grad_norm": 45.334293365478516, "learning_rate": 9.999458194910512e-05, "loss": 0.779, "step": 48 }, { "epoch": 0.2234891676168757, "grad_norm": 59.37651443481445, "learning_rate": 9.999219806871085e-05, "loss": 0.7776, "step": 49 }, { "epoch": 0.22805017103762829, "grad_norm": 44.242713928222656, "learning_rate": 9.998938080436503e-05, "loss": 0.7422, "step": 50 }, { "epoch": 0.23261117445838084, "grad_norm": 76.85882568359375, "learning_rate": 9.998613018049059e-05, "loss": 0.8527, "step": 51 }, { "epoch": 0.23717217787913342, "grad_norm": 53.421348571777344, "learning_rate": 9.99824462252673e-05, "loss": 0.6793, "step": 52 }, { "epoch": 0.24173318129988597, "grad_norm": 74.96648406982422, "learning_rate": 9.997832897063148e-05, "loss": 0.645, "step": 53 }, { "epoch": 0.24629418472063855, "grad_norm": 36.90277099609375, "learning_rate": 9.997377845227576e-05, "loss": 0.6156, "step": 54 }, { "epoch": 0.2508551881413911, "grad_norm": 45.15210723876953, "learning_rate": 9.996879470964868e-05, "loss": 0.689, "step": 55 }, { "epoch": 0.2554161915621437, "grad_norm": 84.45231628417969, "learning_rate": 9.996337778595453e-05, "loss": 1.1516, "step": 56 }, { "epoch": 0.25997719498289623, "grad_norm": 74.82112121582031, "learning_rate": 9.995752772815274e-05, "loss": 0.7793, "step": 57 }, { "epoch": 0.2645381984036488, "grad_norm": 46.81865692138672, "learning_rate": 9.995124458695768e-05, "loss": 0.677, "step": 58 }, { "epoch": 0.2690992018244014, "grad_norm": 50.14031219482422, "learning_rate": 9.994452841683808e-05, "loss": 0.7934, "step": 59 }, { "epoch": 0.27366020524515394, "grad_norm": 70.68441772460938, "learning_rate": 9.993737927601663e-05, "loss": 0.694, "step": 60 }, { "epoch": 0.2782212086659065, "grad_norm": 41.65163040161133, "learning_rate": 9.992979722646948e-05, "loss": 0.6657, "step": 61 }, { "epoch": 0.28278221208665905, "grad_norm": 43.47587203979492, "learning_rate": 9.992178233392564e-05, "loss": 0.6069, "step": 62 }, { "epoch": 0.28734321550741165, "grad_norm": 46.3960075378418, "learning_rate": 9.991333466786648e-05, "loss": 0.7959, "step": 63 }, { "epoch": 0.2919042189281642, "grad_norm": 36.781761169433594, "learning_rate": 9.990445430152507e-05, "loss": 0.6845, "step": 64 }, { "epoch": 0.29646522234891676, "grad_norm": 47.75035095214844, "learning_rate": 9.989514131188559e-05, "loss": 0.9177, "step": 65 }, { "epoch": 0.3010262257696693, "grad_norm": 45.957950592041016, "learning_rate": 9.988539577968265e-05, "loss": 0.7967, "step": 66 }, { "epoch": 0.3055872291904219, "grad_norm": 26.79014015197754, "learning_rate": 9.987521778940058e-05, "loss": 0.4885, "step": 67 }, { "epoch": 0.31014823261117447, "grad_norm": 46.98130416870117, "learning_rate": 9.986460742927271e-05, "loss": 0.7922, "step": 68 }, { "epoch": 0.314709236031927, "grad_norm": 29.98710823059082, "learning_rate": 9.985356479128056e-05, "loss": 0.6239, "step": 69 }, { "epoch": 0.31927023945267957, "grad_norm": 29.05475616455078, "learning_rate": 9.984208997115312e-05, "loss": 0.5977, "step": 70 }, { "epoch": 0.3238312428734322, "grad_norm": 30.07978057861328, "learning_rate": 9.9830183068366e-05, "loss": 0.6691, "step": 71 }, { "epoch": 0.32839224629418473, "grad_norm": 44.7830810546875, "learning_rate": 9.981784418614048e-05, "loss": 0.8664, "step": 72 }, { "epoch": 0.3329532497149373, "grad_norm": 36.961578369140625, "learning_rate": 9.980507343144273e-05, "loss": 0.6482, "step": 73 }, { "epoch": 0.33751425313568983, "grad_norm": 40.433258056640625, "learning_rate": 9.979187091498284e-05, "loss": 0.7933, "step": 74 }, { "epoch": 0.34207525655644244, "grad_norm": 21.32088279724121, "learning_rate": 9.977823675121383e-05, "loss": 0.562, "step": 75 }, { "epoch": 0.346636259977195, "grad_norm": 36.999900817871094, "learning_rate": 9.97641710583307e-05, "loss": 0.7949, "step": 76 }, { "epoch": 0.35119726339794755, "grad_norm": 34.943058013916016, "learning_rate": 9.974967395826941e-05, "loss": 0.6669, "step": 77 }, { "epoch": 0.3557582668187001, "grad_norm": 40.3181037902832, "learning_rate": 9.973474557670575e-05, "loss": 0.65, "step": 78 }, { "epoch": 0.3603192702394527, "grad_norm": 26.141529083251953, "learning_rate": 9.971938604305435e-05, "loss": 0.5017, "step": 79 }, { "epoch": 0.36488027366020526, "grad_norm": 27.214019775390625, "learning_rate": 9.970359549046749e-05, "loss": 0.5175, "step": 80 }, { "epoch": 0.3694412770809578, "grad_norm": 30.315534591674805, "learning_rate": 9.968737405583396e-05, "loss": 0.6422, "step": 81 }, { "epoch": 0.37400228050171036, "grad_norm": 38.341400146484375, "learning_rate": 9.967072187977795e-05, "loss": 0.5456, "step": 82 }, { "epoch": 0.37856328392246297, "grad_norm": 24.291261672973633, "learning_rate": 9.965363910665761e-05, "loss": 0.512, "step": 83 }, { "epoch": 0.3831242873432155, "grad_norm": 27.774852752685547, "learning_rate": 9.963612588456412e-05, "loss": 0.5651, "step": 84 }, { "epoch": 0.38768529076396807, "grad_norm": 28.017744064331055, "learning_rate": 9.961818236532012e-05, "loss": 0.5831, "step": 85 }, { "epoch": 0.3922462941847206, "grad_norm": 52.88800811767578, "learning_rate": 9.959980870447854e-05, "loss": 0.609, "step": 86 }, { "epoch": 0.39680729760547323, "grad_norm": 41.79977035522461, "learning_rate": 9.958100506132127e-05, "loss": 0.9048, "step": 87 }, { "epoch": 0.4013683010262258, "grad_norm": 22.837493896484375, "learning_rate": 9.956177159885765e-05, "loss": 0.526, "step": 88 }, { "epoch": 0.40592930444697833, "grad_norm": 32.65010070800781, "learning_rate": 9.954210848382318e-05, "loss": 0.7481, "step": 89 }, { "epoch": 0.4104903078677309, "grad_norm": 27.46462631225586, "learning_rate": 9.952201588667804e-05, "loss": 0.56, "step": 90 }, { "epoch": 0.4150513112884835, "grad_norm": 24.279617309570312, "learning_rate": 9.950149398160562e-05, "loss": 0.568, "step": 91 }, { "epoch": 0.41961231470923605, "grad_norm": 27.96613311767578, "learning_rate": 9.94805429465109e-05, "loss": 0.5389, "step": 92 }, { "epoch": 0.4241733181299886, "grad_norm": 29.904308319091797, "learning_rate": 9.945916296301913e-05, "loss": 0.6791, "step": 93 }, { "epoch": 0.42873432155074115, "grad_norm": 28.36408042907715, "learning_rate": 9.943735421647404e-05, "loss": 0.6191, "step": 94 }, { "epoch": 0.43329532497149376, "grad_norm": 28.37791633605957, "learning_rate": 9.941511689593633e-05, "loss": 0.6578, "step": 95 }, { "epoch": 0.4378563283922463, "grad_norm": 29.4257869720459, "learning_rate": 9.939245119418207e-05, "loss": 0.7219, "step": 96 }, { "epoch": 0.44241733181299886, "grad_norm": 31.684606552124023, "learning_rate": 9.936935730770093e-05, "loss": 0.7971, "step": 97 }, { "epoch": 0.4469783352337514, "grad_norm": 41.304290771484375, "learning_rate": 9.934583543669453e-05, "loss": 0.5916, "step": 98 }, { "epoch": 0.45153933865450396, "grad_norm": 45.91229248046875, "learning_rate": 9.932188578507476e-05, "loss": 0.6905, "step": 99 }, { "epoch": 0.45610034207525657, "grad_norm": 18.211179733276367, "learning_rate": 9.929750856046187e-05, "loss": 0.4074, "step": 100 }, { "epoch": 0.4606613454960091, "grad_norm": 22.817419052124023, "learning_rate": 9.92727039741828e-05, "loss": 0.5474, "step": 101 }, { "epoch": 0.4652223489167617, "grad_norm": 17.524913787841797, "learning_rate": 9.924747224126932e-05, "loss": 0.3943, "step": 102 }, { "epoch": 0.4697833523375142, "grad_norm": 29.555734634399414, "learning_rate": 9.922181358045607e-05, "loss": 0.4851, "step": 103 }, { "epoch": 0.47434435575826683, "grad_norm": 17.645509719848633, "learning_rate": 9.919572821417886e-05, "loss": 0.4485, "step": 104 }, { "epoch": 0.4789053591790194, "grad_norm": 27.566722869873047, "learning_rate": 9.916921636857253e-05, "loss": 0.5754, "step": 105 }, { "epoch": 0.48346636259977194, "grad_norm": 23.578996658325195, "learning_rate": 9.91422782734691e-05, "loss": 0.5258, "step": 106 }, { "epoch": 0.4880273660205245, "grad_norm": 82.11543273925781, "learning_rate": 9.911491416239578e-05, "loss": 0.6878, "step": 107 }, { "epoch": 0.4925883694412771, "grad_norm": 23.49557876586914, "learning_rate": 9.908712427257291e-05, "loss": 0.6356, "step": 108 }, { "epoch": 0.49714937286202965, "grad_norm": 35.3658447265625, "learning_rate": 9.905890884491195e-05, "loss": 0.5214, "step": 109 }, { "epoch": 0.5017103762827823, "grad_norm": 31.83234214782715, "learning_rate": 9.903026812401333e-05, "loss": 0.4928, "step": 110 }, { "epoch": 0.5062713797035348, "grad_norm": 27.706256866455078, "learning_rate": 9.900120235816435e-05, "loss": 0.7318, "step": 111 }, { "epoch": 0.5108323831242874, "grad_norm": 34.21133041381836, "learning_rate": 9.897171179933707e-05, "loss": 0.4351, "step": 112 }, { "epoch": 0.5153933865450399, "grad_norm": 27.63327407836914, "learning_rate": 9.894179670318606e-05, "loss": 0.4016, "step": 113 }, { "epoch": 0.5199543899657925, "grad_norm": 19.61988639831543, "learning_rate": 9.891145732904627e-05, "loss": 0.4805, "step": 114 }, { "epoch": 0.5245153933865451, "grad_norm": 32.44860076904297, "learning_rate": 9.88806939399307e-05, "loss": 0.6809, "step": 115 }, { "epoch": 0.5290763968072976, "grad_norm": 32.945152282714844, "learning_rate": 9.884950680252811e-05, "loss": 0.7838, "step": 116 }, { "epoch": 0.5336374002280502, "grad_norm": 26.530765533447266, "learning_rate": 9.881789618720081e-05, "loss": 0.6608, "step": 117 }, { "epoch": 0.5381984036488028, "grad_norm": 29.559206008911133, "learning_rate": 9.878586236798222e-05, "loss": 0.6232, "step": 118 }, { "epoch": 0.5427594070695553, "grad_norm": 18.236879348754883, "learning_rate": 9.875340562257453e-05, "loss": 0.4325, "step": 119 }, { "epoch": 0.5473204104903079, "grad_norm": 16.536705017089844, "learning_rate": 9.872052623234632e-05, "loss": 0.3459, "step": 120 }, { "epoch": 0.5518814139110604, "grad_norm": 32.01934051513672, "learning_rate": 9.868722448233004e-05, "loss": 0.6023, "step": 121 }, { "epoch": 0.556442417331813, "grad_norm": 19.95067024230957, "learning_rate": 9.865350066121961e-05, "loss": 0.4983, "step": 122 }, { "epoch": 0.5610034207525656, "grad_norm": 17.09720802307129, "learning_rate": 9.861935506136793e-05, "loss": 0.5208, "step": 123 }, { "epoch": 0.5655644241733181, "grad_norm": 40.02044677734375, "learning_rate": 9.85847879787843e-05, "loss": 0.4965, "step": 124 }, { "epoch": 0.5701254275940707, "grad_norm": 24.024381637573242, "learning_rate": 9.854979971313182e-05, "loss": 0.6857, "step": 125 }, { "epoch": 0.5746864310148233, "grad_norm": 23.170162200927734, "learning_rate": 9.85143905677249e-05, "loss": 0.4628, "step": 126 }, { "epoch": 0.5792474344355758, "grad_norm": 32.29564666748047, "learning_rate": 9.847856084952653e-05, "loss": 0.5976, "step": 127 }, { "epoch": 0.5838084378563284, "grad_norm": 22.218772888183594, "learning_rate": 9.844231086914571e-05, "loss": 0.4436, "step": 128 }, { "epoch": 0.5883694412770809, "grad_norm": 16.03778648376465, "learning_rate": 9.84056409408346e-05, "loss": 0.4686, "step": 129 }, { "epoch": 0.5929304446978335, "grad_norm": 23.267412185668945, "learning_rate": 9.836855138248605e-05, "loss": 0.5099, "step": 130 }, { "epoch": 0.5974914481185861, "grad_norm": 38.55447006225586, "learning_rate": 9.833104251563056e-05, "loss": 0.6403, "step": 131 }, { "epoch": 0.6020524515393386, "grad_norm": 23.0700740814209, "learning_rate": 9.829311466543373e-05, "loss": 0.4528, "step": 132 }, { "epoch": 0.6066134549600912, "grad_norm": 18.824100494384766, "learning_rate": 9.825476816069326e-05, "loss": 0.5267, "step": 133 }, { "epoch": 0.6111744583808438, "grad_norm": 30.157243728637695, "learning_rate": 9.821600333383625e-05, "loss": 0.5274, "step": 134 }, { "epoch": 0.6157354618015963, "grad_norm": 17.902389526367188, "learning_rate": 9.817682052091618e-05, "loss": 0.4983, "step": 135 }, { "epoch": 0.6202964652223489, "grad_norm": 23.980289459228516, "learning_rate": 9.813722006161013e-05, "loss": 0.5766, "step": 136 }, { "epoch": 0.6248574686431014, "grad_norm": 18.97156524658203, "learning_rate": 9.809720229921572e-05, "loss": 0.4777, "step": 137 }, { "epoch": 0.629418472063854, "grad_norm": 21.262067794799805, "learning_rate": 9.805676758064821e-05, "loss": 0.6721, "step": 138 }, { "epoch": 0.6339794754846066, "grad_norm": 17.919252395629883, "learning_rate": 9.801591625643745e-05, "loss": 0.3766, "step": 139 }, { "epoch": 0.6385404789053591, "grad_norm": 14.64062213897705, "learning_rate": 9.797464868072488e-05, "loss": 0.4098, "step": 140 }, { "epoch": 0.6431014823261118, "grad_norm": 24.484783172607422, "learning_rate": 9.79329652112604e-05, "loss": 0.6009, "step": 141 }, { "epoch": 0.6476624857468644, "grad_norm": 23.56806182861328, "learning_rate": 9.789086620939936e-05, "loss": 0.5756, "step": 142 }, { "epoch": 0.6522234891676169, "grad_norm": 35.71012496948242, "learning_rate": 9.784835204009932e-05, "loss": 0.6669, "step": 143 }, { "epoch": 0.6567844925883695, "grad_norm": 25.027629852294922, "learning_rate": 9.780542307191698e-05, "loss": 0.7502, "step": 144 }, { "epoch": 0.661345496009122, "grad_norm": 20.506362915039062, "learning_rate": 9.77620796770049e-05, "loss": 0.5575, "step": 145 }, { "epoch": 0.6659064994298746, "grad_norm": 18.51320457458496, "learning_rate": 9.771832223110839e-05, "loss": 0.4005, "step": 146 }, { "epoch": 0.6704675028506272, "grad_norm": 17.1466007232666, "learning_rate": 9.76741511135621e-05, "loss": 0.513, "step": 147 }, { "epoch": 0.6750285062713797, "grad_norm": 17.755672454833984, "learning_rate": 9.762956670728685e-05, "loss": 0.5208, "step": 148 }, { "epoch": 0.6795895096921323, "grad_norm": 17.082569122314453, "learning_rate": 9.758456939878629e-05, "loss": 0.5182, "step": 149 }, { "epoch": 0.6841505131128849, "grad_norm": 19.6417179107666, "learning_rate": 9.753915957814352e-05, "loss": 0.6026, "step": 150 }, { "epoch": 0.6887115165336374, "grad_norm": 16.423038482666016, "learning_rate": 9.74933376390177e-05, "loss": 0.4535, "step": 151 }, { "epoch": 0.69327251995439, "grad_norm": 12.587494850158691, "learning_rate": 9.744710397864067e-05, "loss": 0.3239, "step": 152 }, { "epoch": 0.6978335233751425, "grad_norm": 16.246538162231445, "learning_rate": 9.740045899781352e-05, "loss": 0.4221, "step": 153 }, { "epoch": 0.7023945267958951, "grad_norm": 20.31894302368164, "learning_rate": 9.735340310090307e-05, "loss": 0.3357, "step": 154 }, { "epoch": 0.7069555302166477, "grad_norm": 39.6623420715332, "learning_rate": 9.730593669583836e-05, "loss": 0.5047, "step": 155 }, { "epoch": 0.7115165336374002, "grad_norm": 16.25356101989746, "learning_rate": 9.725806019410717e-05, "loss": 0.5985, "step": 156 }, { "epoch": 0.7160775370581528, "grad_norm": 18.692333221435547, "learning_rate": 9.720977401075242e-05, "loss": 0.4652, "step": 157 }, { "epoch": 0.7206385404789054, "grad_norm": 15.554421424865723, "learning_rate": 9.716107856436855e-05, "loss": 0.3915, "step": 158 }, { "epoch": 0.7251995438996579, "grad_norm": 14.215270042419434, "learning_rate": 9.711197427709796e-05, "loss": 0.4865, "step": 159 }, { "epoch": 0.7297605473204105, "grad_norm": 21.14404296875, "learning_rate": 9.706246157462726e-05, "loss": 0.4058, "step": 160 }, { "epoch": 0.734321550741163, "grad_norm": 24.879043579101562, "learning_rate": 9.701254088618362e-05, "loss": 0.4697, "step": 161 }, { "epoch": 0.7388825541619156, "grad_norm": 20.374792098999023, "learning_rate": 9.696221264453109e-05, "loss": 0.3389, "step": 162 }, { "epoch": 0.7434435575826682, "grad_norm": 21.55912208557129, "learning_rate": 9.69114772859668e-05, "loss": 0.5669, "step": 163 }, { "epoch": 0.7480045610034207, "grad_norm": 13.084688186645508, "learning_rate": 9.686033525031719e-05, "loss": 0.3422, "step": 164 }, { "epoch": 0.7525655644241733, "grad_norm": 16.64484405517578, "learning_rate": 9.680878698093417e-05, "loss": 0.5166, "step": 165 }, { "epoch": 0.7571265678449259, "grad_norm": 18.273216247558594, "learning_rate": 9.675683292469132e-05, "loss": 0.5568, "step": 166 }, { "epoch": 0.7616875712656784, "grad_norm": 26.35822868347168, "learning_rate": 9.670447353198e-05, "loss": 0.6115, "step": 167 }, { "epoch": 0.766248574686431, "grad_norm": 18.36164665222168, "learning_rate": 9.665170925670548e-05, "loss": 0.3441, "step": 168 }, { "epoch": 0.7708095781071835, "grad_norm": 17.316587448120117, "learning_rate": 9.659854055628291e-05, "loss": 0.451, "step": 169 }, { "epoch": 0.7753705815279361, "grad_norm": 23.489328384399414, "learning_rate": 9.654496789163345e-05, "loss": 0.5535, "step": 170 }, { "epoch": 0.7799315849486887, "grad_norm": 19.27407455444336, "learning_rate": 9.649099172718021e-05, "loss": 0.5019, "step": 171 }, { "epoch": 0.7844925883694412, "grad_norm": 11.64967155456543, "learning_rate": 9.643661253084431e-05, "loss": 0.3258, "step": 172 }, { "epoch": 0.7890535917901939, "grad_norm": 19.625473022460938, "learning_rate": 9.638183077404069e-05, "loss": 0.3288, "step": 173 }, { "epoch": 0.7936145952109465, "grad_norm": 18.685232162475586, "learning_rate": 9.632664693167416e-05, "loss": 0.3644, "step": 174 }, { "epoch": 0.798175598631699, "grad_norm": 26.637842178344727, "learning_rate": 9.627106148213522e-05, "loss": 0.644, "step": 175 }, { "epoch": 0.8027366020524516, "grad_norm": 23.631614685058594, "learning_rate": 9.621507490729585e-05, "loss": 0.3727, "step": 176 }, { "epoch": 0.8072976054732041, "grad_norm": 15.275004386901855, "learning_rate": 9.615868769250546e-05, "loss": 0.3924, "step": 177 }, { "epoch": 0.8118586088939567, "grad_norm": 15.47768497467041, "learning_rate": 9.610190032658663e-05, "loss": 0.4487, "step": 178 }, { "epoch": 0.8164196123147093, "grad_norm": 14.71181583404541, "learning_rate": 9.604471330183083e-05, "loss": 0.32, "step": 179 }, { "epoch": 0.8209806157354618, "grad_norm": 9.841964721679688, "learning_rate": 9.598712711399416e-05, "loss": 0.2505, "step": 180 }, { "epoch": 0.8255416191562144, "grad_norm": 14.165433883666992, "learning_rate": 9.592914226229314e-05, "loss": 0.4393, "step": 181 }, { "epoch": 0.830102622576967, "grad_norm": 23.092065811157227, "learning_rate": 9.587075924940028e-05, "loss": 0.4625, "step": 182 }, { "epoch": 0.8346636259977195, "grad_norm": 17.662593841552734, "learning_rate": 9.581197858143978e-05, "loss": 0.4665, "step": 183 }, { "epoch": 0.8392246294184721, "grad_norm": 16.949716567993164, "learning_rate": 9.575280076798309e-05, "loss": 0.4784, "step": 184 }, { "epoch": 0.8437856328392246, "grad_norm": 18.952489852905273, "learning_rate": 9.569322632204458e-05, "loss": 0.3888, "step": 185 }, { "epoch": 0.8483466362599772, "grad_norm": 16.350954055786133, "learning_rate": 9.563325576007701e-05, "loss": 0.3935, "step": 186 }, { "epoch": 0.8529076396807298, "grad_norm": 16.852394104003906, "learning_rate": 9.557288960196707e-05, "loss": 0.4025, "step": 187 }, { "epoch": 0.8574686431014823, "grad_norm": 14.821525573730469, "learning_rate": 9.551212837103092e-05, "loss": 0.3752, "step": 188 }, { "epoch": 0.8620296465222349, "grad_norm": 20.433399200439453, "learning_rate": 9.545097259400958e-05, "loss": 0.3219, "step": 189 }, { "epoch": 0.8665906499429875, "grad_norm": 11.883235931396484, "learning_rate": 9.538942280106443e-05, "loss": 0.3892, "step": 190 }, { "epoch": 0.87115165336374, "grad_norm": 14.423933029174805, "learning_rate": 9.53274795257726e-05, "loss": 0.3798, "step": 191 }, { "epoch": 0.8757126567844926, "grad_norm": 15.010958671569824, "learning_rate": 9.526514330512225e-05, "loss": 0.3801, "step": 192 }, { "epoch": 0.8802736602052451, "grad_norm": 21.800418853759766, "learning_rate": 9.520241467950811e-05, "loss": 0.4404, "step": 193 }, { "epoch": 0.8848346636259977, "grad_norm": 15.904304504394531, "learning_rate": 9.513929419272662e-05, "loss": 0.3278, "step": 194 }, { "epoch": 0.8893956670467503, "grad_norm": 10.985480308532715, "learning_rate": 9.507578239197126e-05, "loss": 0.2883, "step": 195 }, { "epoch": 0.8939566704675028, "grad_norm": 10.487696647644043, "learning_rate": 9.501187982782785e-05, "loss": 0.2636, "step": 196 }, { "epoch": 0.8985176738882554, "grad_norm": 19.759944915771484, "learning_rate": 9.494758705426978e-05, "loss": 0.3749, "step": 197 }, { "epoch": 0.9030786773090079, "grad_norm": 17.322166442871094, "learning_rate": 9.48829046286531e-05, "loss": 0.4068, "step": 198 }, { "epoch": 0.9076396807297605, "grad_norm": 15.3864107131958, "learning_rate": 9.481783311171183e-05, "loss": 0.3576, "step": 199 }, { "epoch": 0.9122006841505131, "grad_norm": 13.966897964477539, "learning_rate": 9.475237306755302e-05, "loss": 0.4239, "step": 200 }, { "epoch": 0.9167616875712656, "grad_norm": 14.596879005432129, "learning_rate": 9.468652506365187e-05, "loss": 0.3637, "step": 201 }, { "epoch": 0.9213226909920182, "grad_norm": 20.099353790283203, "learning_rate": 9.46202896708468e-05, "loss": 0.5008, "step": 202 }, { "epoch": 0.9258836944127709, "grad_norm": 14.773473739624023, "learning_rate": 9.455366746333454e-05, "loss": 0.3506, "step": 203 }, { "epoch": 0.9304446978335233, "grad_norm": 18.689729690551758, "learning_rate": 9.448665901866514e-05, "loss": 0.4078, "step": 204 }, { "epoch": 0.935005701254276, "grad_norm": 13.453817367553711, "learning_rate": 9.441926491773691e-05, "loss": 0.3253, "step": 205 }, { "epoch": 0.9395667046750285, "grad_norm": 14.93052864074707, "learning_rate": 9.435148574479144e-05, "loss": 0.3576, "step": 206 }, { "epoch": 0.9441277080957811, "grad_norm": 11.697999000549316, "learning_rate": 9.428332208740857e-05, "loss": 0.3115, "step": 207 }, { "epoch": 0.9486887115165337, "grad_norm": 13.518777847290039, "learning_rate": 9.421477453650118e-05, "loss": 0.364, "step": 208 }, { "epoch": 0.9532497149372862, "grad_norm": 10.434165000915527, "learning_rate": 9.414584368631019e-05, "loss": 0.2677, "step": 209 }, { "epoch": 0.9578107183580388, "grad_norm": 16.765907287597656, "learning_rate": 9.407653013439928e-05, "loss": 0.5504, "step": 210 }, { "epoch": 0.9623717217787914, "grad_norm": 10.962894439697266, "learning_rate": 9.400683448164987e-05, "loss": 0.2913, "step": 211 }, { "epoch": 0.9669327251995439, "grad_norm": 27.222328186035156, "learning_rate": 9.393675733225578e-05, "loss": 0.6258, "step": 212 }, { "epoch": 0.9714937286202965, "grad_norm": 17.89396095275879, "learning_rate": 9.386629929371804e-05, "loss": 0.3468, "step": 213 }, { "epoch": 0.976054732041049, "grad_norm": 11.917913436889648, "learning_rate": 9.379546097683962e-05, "loss": 0.3384, "step": 214 }, { "epoch": 0.9806157354618016, "grad_norm": 16.30259895324707, "learning_rate": 9.372424299572013e-05, "loss": 0.4395, "step": 215 }, { "epoch": 0.9851767388825542, "grad_norm": 19.039505004882812, "learning_rate": 9.365264596775051e-05, "loss": 0.4235, "step": 216 }, { "epoch": 0.9897377423033067, "grad_norm": 21.45336151123047, "learning_rate": 9.35806705136077e-05, "loss": 0.3146, "step": 217 }, { "epoch": 0.9942987457240593, "grad_norm": 13.630745887756348, "learning_rate": 9.350831725724916e-05, "loss": 0.3927, "step": 218 }, { "epoch": 0.9988597491448119, "grad_norm": 13.76926326751709, "learning_rate": 9.343558682590756e-05, "loss": 0.3581, "step": 219 }, { "epoch": 1.0, "grad_norm": 14.069113731384277, "learning_rate": 9.336247985008534e-05, "loss": 0.2267, "step": 220 }, { "epoch": 1.0045610034207526, "grad_norm": 9.834161758422852, "learning_rate": 9.328899696354918e-05, "loss": 0.2113, "step": 221 }, { "epoch": 1.0091220068415052, "grad_norm": 8.437716484069824, "learning_rate": 9.321513880332458e-05, "loss": 0.2404, "step": 222 }, { "epoch": 1.0136830102622576, "grad_norm": 10.78850269317627, "learning_rate": 9.314090600969024e-05, "loss": 0.1706, "step": 223 }, { "epoch": 1.0182440136830102, "grad_norm": 10.366409301757812, "learning_rate": 9.306629922617261e-05, "loss": 0.2395, "step": 224 }, { "epoch": 1.0228050171037628, "grad_norm": 14.871321678161621, "learning_rate": 9.29913190995403e-05, "loss": 0.2993, "step": 225 }, { "epoch": 1.0273660205245154, "grad_norm": 12.021495819091797, "learning_rate": 9.291596627979836e-05, "loss": 0.2149, "step": 226 }, { "epoch": 1.031927023945268, "grad_norm": 14.372687339782715, "learning_rate": 9.284024142018281e-05, "loss": 0.2743, "step": 227 }, { "epoch": 1.0364880273660204, "grad_norm": 16.323156356811523, "learning_rate": 9.276414517715484e-05, "loss": 0.343, "step": 228 }, { "epoch": 1.041049030786773, "grad_norm": 14.0962495803833, "learning_rate": 9.268767821039521e-05, "loss": 0.2017, "step": 229 }, { "epoch": 1.0456100342075256, "grad_norm": 13.36721420288086, "learning_rate": 9.261084118279847e-05, "loss": 0.2844, "step": 230 }, { "epoch": 1.0501710376282782, "grad_norm": 16.017093658447266, "learning_rate": 9.253363476046725e-05, "loss": 0.2139, "step": 231 }, { "epoch": 1.0547320410490308, "grad_norm": 13.20304012298584, "learning_rate": 9.245605961270649e-05, "loss": 0.1957, "step": 232 }, { "epoch": 1.0592930444697835, "grad_norm": 11.656867027282715, "learning_rate": 9.23781164120176e-05, "loss": 0.2862, "step": 233 }, { "epoch": 1.0638540478905358, "grad_norm": 22.73741340637207, "learning_rate": 9.229980583409266e-05, "loss": 0.5163, "step": 234 }, { "epoch": 1.0684150513112884, "grad_norm": 13.677047729492188, "learning_rate": 9.222112855780856e-05, "loss": 0.304, "step": 235 }, { "epoch": 1.072976054732041, "grad_norm": 18.445669174194336, "learning_rate": 9.214208526522109e-05, "loss": 0.4152, "step": 236 }, { "epoch": 1.0775370581527937, "grad_norm": 7.721029758453369, "learning_rate": 9.206267664155907e-05, "loss": 0.1688, "step": 237 }, { "epoch": 1.0820980615735463, "grad_norm": 14.50108528137207, "learning_rate": 9.198290337521838e-05, "loss": 0.3409, "step": 238 }, { "epoch": 1.0866590649942987, "grad_norm": 9.616985321044922, "learning_rate": 9.190276615775599e-05, "loss": 0.212, "step": 239 }, { "epoch": 1.0912200684150513, "grad_norm": 10.23328685760498, "learning_rate": 9.182226568388401e-05, "loss": 0.2361, "step": 240 }, { "epoch": 1.0957810718358039, "grad_norm": 8.819774627685547, "learning_rate": 9.174140265146356e-05, "loss": 0.2378, "step": 241 }, { "epoch": 1.1003420752565565, "grad_norm": 9.800360679626465, "learning_rate": 9.166017776149887e-05, "loss": 0.1975, "step": 242 }, { "epoch": 1.104903078677309, "grad_norm": 14.380069732666016, "learning_rate": 9.157859171813107e-05, "loss": 0.1747, "step": 243 }, { "epoch": 1.1094640820980617, "grad_norm": 11.026459693908691, "learning_rate": 9.149664522863217e-05, "loss": 0.2154, "step": 244 }, { "epoch": 1.114025085518814, "grad_norm": 14.39684009552002, "learning_rate": 9.141433900339887e-05, "loss": 0.2274, "step": 245 }, { "epoch": 1.1185860889395667, "grad_norm": 12.926016807556152, "learning_rate": 9.133167375594647e-05, "loss": 0.2368, "step": 246 }, { "epoch": 1.1231470923603193, "grad_norm": 11.235928535461426, "learning_rate": 9.12486502029026e-05, "loss": 0.1921, "step": 247 }, { "epoch": 1.127708095781072, "grad_norm": 8.581406593322754, "learning_rate": 9.11652690640011e-05, "loss": 0.1573, "step": 248 }, { "epoch": 1.1322690992018245, "grad_norm": 12.511028289794922, "learning_rate": 9.10815310620757e-05, "loss": 0.2574, "step": 249 }, { "epoch": 1.1368301026225769, "grad_norm": 13.827507019042969, "learning_rate": 9.099743692305379e-05, "loss": 0.2751, "step": 250 }, { "epoch": 1.1413911060433295, "grad_norm": 16.066164016723633, "learning_rate": 9.091298737595014e-05, "loss": 0.2848, "step": 251 }, { "epoch": 1.145952109464082, "grad_norm": 19.021018981933594, "learning_rate": 9.082818315286055e-05, "loss": 0.2962, "step": 252 }, { "epoch": 1.1505131128848347, "grad_norm": 10.119819641113281, "learning_rate": 9.074302498895552e-05, "loss": 0.2158, "step": 253 }, { "epoch": 1.1550741163055873, "grad_norm": 11.308869361877441, "learning_rate": 9.065751362247388e-05, "loss": 0.2406, "step": 254 }, { "epoch": 1.1596351197263397, "grad_norm": 13.046134948730469, "learning_rate": 9.057164979471635e-05, "loss": 0.2534, "step": 255 }, { "epoch": 1.1641961231470923, "grad_norm": 12.656744003295898, "learning_rate": 9.048543425003923e-05, "loss": 0.2888, "step": 256 }, { "epoch": 1.168757126567845, "grad_norm": 11.619269371032715, "learning_rate": 9.039886773584779e-05, "loss": 0.2209, "step": 257 }, { "epoch": 1.1733181299885975, "grad_norm": 9.45288372039795, "learning_rate": 9.031195100258987e-05, "loss": 0.1455, "step": 258 }, { "epoch": 1.1778791334093501, "grad_norm": 8.505230903625488, "learning_rate": 9.02246848037494e-05, "loss": 0.1666, "step": 259 }, { "epoch": 1.1824401368301025, "grad_norm": 18.330678939819336, "learning_rate": 9.013706989583983e-05, "loss": 0.2517, "step": 260 }, { "epoch": 1.1870011402508551, "grad_norm": 14.25640869140625, "learning_rate": 9.00491070383976e-05, "loss": 0.3547, "step": 261 }, { "epoch": 1.1915621436716077, "grad_norm": 9.561500549316406, "learning_rate": 8.996079699397547e-05, "loss": 0.2168, "step": 262 }, { "epoch": 1.1961231470923603, "grad_norm": 12.185916900634766, "learning_rate": 8.987214052813604e-05, "loss": 0.1639, "step": 263 }, { "epoch": 1.200684150513113, "grad_norm": 10.289037704467773, "learning_rate": 8.978313840944503e-05, "loss": 0.1805, "step": 264 }, { "epoch": 1.2052451539338653, "grad_norm": 10.741447448730469, "learning_rate": 8.969379140946464e-05, "loss": 0.2754, "step": 265 }, { "epoch": 1.209806157354618, "grad_norm": 10.507063865661621, "learning_rate": 8.960410030274681e-05, "loss": 0.2606, "step": 266 }, { "epoch": 1.2143671607753705, "grad_norm": 15.26578140258789, "learning_rate": 8.951406586682662e-05, "loss": 0.3271, "step": 267 }, { "epoch": 1.2189281641961232, "grad_norm": 12.20109748840332, "learning_rate": 8.942368888221545e-05, "loss": 0.2345, "step": 268 }, { "epoch": 1.2234891676168758, "grad_norm": 9.984328269958496, "learning_rate": 8.933297013239424e-05, "loss": 0.1968, "step": 269 }, { "epoch": 1.2280501710376284, "grad_norm": 12.456114768981934, "learning_rate": 8.924191040380671e-05, "loss": 0.2624, "step": 270 }, { "epoch": 1.2326111744583808, "grad_norm": 11.822625160217285, "learning_rate": 8.915051048585256e-05, "loss": 0.2642, "step": 271 }, { "epoch": 1.2371721778791334, "grad_norm": 14.391879081726074, "learning_rate": 8.905877117088054e-05, "loss": 0.2378, "step": 272 }, { "epoch": 1.241733181299886, "grad_norm": 10.977947235107422, "learning_rate": 8.896669325418172e-05, "loss": 0.2302, "step": 273 }, { "epoch": 1.2462941847206386, "grad_norm": 10.284666061401367, "learning_rate": 8.887427753398248e-05, "loss": 0.2304, "step": 274 }, { "epoch": 1.2508551881413912, "grad_norm": 12.455047607421875, "learning_rate": 8.87815248114376e-05, "loss": 0.2586, "step": 275 }, { "epoch": 1.2554161915621438, "grad_norm": 8.457530975341797, "learning_rate": 8.868843589062339e-05, "loss": 0.1605, "step": 276 }, { "epoch": 1.2599771949828962, "grad_norm": 9.678699493408203, "learning_rate": 8.859501157853066e-05, "loss": 0.1834, "step": 277 }, { "epoch": 1.2645381984036488, "grad_norm": 10.811609268188477, "learning_rate": 8.850125268505774e-05, "loss": 0.2116, "step": 278 }, { "epoch": 1.2690992018244014, "grad_norm": 17.398542404174805, "learning_rate": 8.840716002300347e-05, "loss": 0.2112, "step": 279 }, { "epoch": 1.273660205245154, "grad_norm": 20.152509689331055, "learning_rate": 8.831273440806009e-05, "loss": 0.2475, "step": 280 }, { "epoch": 1.2782212086659066, "grad_norm": 9.431918144226074, "learning_rate": 8.821797665880625e-05, "loss": 0.1543, "step": 281 }, { "epoch": 1.282782212086659, "grad_norm": 16.22393798828125, "learning_rate": 8.812288759669994e-05, "loss": 0.2396, "step": 282 }, { "epoch": 1.2873432155074116, "grad_norm": 22.462894439697266, "learning_rate": 8.802746804607118e-05, "loss": 0.4583, "step": 283 }, { "epoch": 1.2919042189281642, "grad_norm": 9.348506927490234, "learning_rate": 8.793171883411515e-05, "loss": 0.1537, "step": 284 }, { "epoch": 1.2964652223489168, "grad_norm": 11.54738998413086, "learning_rate": 8.783564079088477e-05, "loss": 0.1724, "step": 285 }, { "epoch": 1.3010262257696694, "grad_norm": 14.7079439163208, "learning_rate": 8.773923474928365e-05, "loss": 0.2363, "step": 286 }, { "epoch": 1.3055872291904218, "grad_norm": 10.920402526855469, "learning_rate": 8.764250154505885e-05, "loss": 0.2441, "step": 287 }, { "epoch": 1.3101482326111744, "grad_norm": 13.201237678527832, "learning_rate": 8.754544201679353e-05, "loss": 0.2623, "step": 288 }, { "epoch": 1.314709236031927, "grad_norm": 8.716439247131348, "learning_rate": 8.744805700589989e-05, "loss": 0.2039, "step": 289 }, { "epoch": 1.3192702394526796, "grad_norm": 18.450355529785156, "learning_rate": 8.735034735661162e-05, "loss": 0.2247, "step": 290 }, { "epoch": 1.3238312428734322, "grad_norm": 9.224489212036133, "learning_rate": 8.725231391597681e-05, "loss": 0.2552, "step": 291 }, { "epoch": 1.3283922462941846, "grad_norm": 6.751110553741455, "learning_rate": 8.715395753385048e-05, "loss": 0.1663, "step": 292 }, { "epoch": 1.3329532497149372, "grad_norm": 12.301689147949219, "learning_rate": 8.705527906288718e-05, "loss": 0.2175, "step": 293 }, { "epoch": 1.3375142531356898, "grad_norm": 10.624826431274414, "learning_rate": 8.695627935853373e-05, "loss": 0.1919, "step": 294 }, { "epoch": 1.3420752565564424, "grad_norm": 11.931551933288574, "learning_rate": 8.68569592790217e-05, "loss": 0.2923, "step": 295 }, { "epoch": 1.346636259977195, "grad_norm": 10.668767929077148, "learning_rate": 8.675731968536002e-05, "loss": 0.2388, "step": 296 }, { "epoch": 1.3511972633979474, "grad_norm": 12.659479141235352, "learning_rate": 8.66573614413275e-05, "loss": 0.2413, "step": 297 }, { "epoch": 1.3557582668187, "grad_norm": 17.342519760131836, "learning_rate": 8.655708541346533e-05, "loss": 0.2613, "step": 298 }, { "epoch": 1.3603192702394526, "grad_norm": 11.727388381958008, "learning_rate": 8.645649247106955e-05, "loss": 0.2109, "step": 299 }, { "epoch": 1.3648802736602053, "grad_norm": 12.136419296264648, "learning_rate": 8.635558348618359e-05, "loss": 0.2467, "step": 300 }, { "epoch": 1.3694412770809579, "grad_norm": 10.73682975769043, "learning_rate": 8.625435933359062e-05, "loss": 0.1937, "step": 301 }, { "epoch": 1.3740022805017102, "grad_norm": 11.996789932250977, "learning_rate": 8.615282089080609e-05, "loss": 0.2655, "step": 302 }, { "epoch": 1.378563283922463, "grad_norm": 15.302202224731445, "learning_rate": 8.605096903806991e-05, "loss": 0.2487, "step": 303 }, { "epoch": 1.3831242873432155, "grad_norm": 9.55948543548584, "learning_rate": 8.594880465833908e-05, "loss": 0.1708, "step": 304 }, { "epoch": 1.387685290763968, "grad_norm": 12.158270835876465, "learning_rate": 8.584632863727982e-05, "loss": 0.2452, "step": 305 }, { "epoch": 1.3922462941847207, "grad_norm": 8.76557731628418, "learning_rate": 8.574354186326001e-05, "loss": 0.193, "step": 306 }, { "epoch": 1.3968072976054733, "grad_norm": 9.248729705810547, "learning_rate": 8.564044522734147e-05, "loss": 0.2264, "step": 307 }, { "epoch": 1.401368301026226, "grad_norm": 9.079157829284668, "learning_rate": 8.55370396232722e-05, "loss": 0.1809, "step": 308 }, { "epoch": 1.4059293044469783, "grad_norm": 9.482975959777832, "learning_rate": 8.543332594747865e-05, "loss": 0.1772, "step": 309 }, { "epoch": 1.4104903078677309, "grad_norm": 9.198452949523926, "learning_rate": 8.532930509905799e-05, "loss": 0.2047, "step": 310 }, { "epoch": 1.4150513112884835, "grad_norm": 8.783437728881836, "learning_rate": 8.522497797977024e-05, "loss": 0.2247, "step": 311 }, { "epoch": 1.419612314709236, "grad_norm": 14.365094184875488, "learning_rate": 8.512034549403053e-05, "loss": 0.2208, "step": 312 }, { "epoch": 1.4241733181299887, "grad_norm": 14.370891571044922, "learning_rate": 8.501540854890118e-05, "loss": 0.2326, "step": 313 }, { "epoch": 1.428734321550741, "grad_norm": 10.391968727111816, "learning_rate": 8.491016805408387e-05, "loss": 0.1751, "step": 314 }, { "epoch": 1.4332953249714937, "grad_norm": 11.512178421020508, "learning_rate": 8.480462492191186e-05, "loss": 0.2978, "step": 315 }, { "epoch": 1.4378563283922463, "grad_norm": 12.58578872680664, "learning_rate": 8.469878006734185e-05, "loss": 0.2706, "step": 316 }, { "epoch": 1.442417331812999, "grad_norm": 8.530269622802734, "learning_rate": 8.459263440794627e-05, "loss": 0.1755, "step": 317 }, { "epoch": 1.4469783352337515, "grad_norm": 8.248932838439941, "learning_rate": 8.448618886390522e-05, "loss": 0.1483, "step": 318 }, { "epoch": 1.451539338654504, "grad_norm": 18.134685516357422, "learning_rate": 8.437944435799848e-05, "loss": 0.1938, "step": 319 }, { "epoch": 1.4561003420752565, "grad_norm": 8.072942733764648, "learning_rate": 8.427240181559754e-05, "loss": 0.1573, "step": 320 }, { "epoch": 1.4606613454960091, "grad_norm": 11.139336585998535, "learning_rate": 8.416506216465765e-05, "loss": 0.2272, "step": 321 }, { "epoch": 1.4652223489167617, "grad_norm": 12.186053276062012, "learning_rate": 8.405742633570961e-05, "loss": 0.1716, "step": 322 }, { "epoch": 1.4697833523375143, "grad_norm": 10.145633697509766, "learning_rate": 8.394949526185185e-05, "loss": 0.1913, "step": 323 }, { "epoch": 1.4743443557582667, "grad_norm": 9.266544342041016, "learning_rate": 8.384126987874228e-05, "loss": 0.1642, "step": 324 }, { "epoch": 1.4789053591790193, "grad_norm": 11.050301551818848, "learning_rate": 8.373275112459016e-05, "loss": 0.2253, "step": 325 }, { "epoch": 1.483466362599772, "grad_norm": 8.425420761108398, "learning_rate": 8.362393994014805e-05, "loss": 0.1826, "step": 326 }, { "epoch": 1.4880273660205245, "grad_norm": 14.498648643493652, "learning_rate": 8.35148372687035e-05, "loss": 0.2432, "step": 327 }, { "epoch": 1.4925883694412772, "grad_norm": 14.982590675354004, "learning_rate": 8.340544405607111e-05, "loss": 0.1724, "step": 328 }, { "epoch": 1.4971493728620295, "grad_norm": 11.120532035827637, "learning_rate": 8.329576125058406e-05, "loss": 0.1461, "step": 329 }, { "epoch": 1.5017103762827824, "grad_norm": 17.683189392089844, "learning_rate": 8.318578980308609e-05, "loss": 0.3342, "step": 330 }, { "epoch": 1.5062713797035348, "grad_norm": 17.191091537475586, "learning_rate": 8.307553066692314e-05, "loss": 0.2188, "step": 331 }, { "epoch": 1.5108323831242874, "grad_norm": 8.582752227783203, "learning_rate": 8.29649847979352e-05, "loss": 0.1141, "step": 332 }, { "epoch": 1.51539338654504, "grad_norm": 9.13406753540039, "learning_rate": 8.28541531544479e-05, "loss": 0.1767, "step": 333 }, { "epoch": 1.5199543899657924, "grad_norm": 8.726181030273438, "learning_rate": 8.274303669726426e-05, "loss": 0.1348, "step": 334 }, { "epoch": 1.5245153933865452, "grad_norm": 10.707447052001953, "learning_rate": 8.263163638965639e-05, "loss": 0.2005, "step": 335 }, { "epoch": 1.5290763968072976, "grad_norm": 12.0310640335083, "learning_rate": 8.25199531973571e-05, "loss": 0.1985, "step": 336 }, { "epoch": 1.5336374002280502, "grad_norm": 9.672492027282715, "learning_rate": 8.24079880885515e-05, "loss": 0.2014, "step": 337 }, { "epoch": 1.5381984036488028, "grad_norm": 9.297097206115723, "learning_rate": 8.22957420338687e-05, "loss": 0.1302, "step": 338 }, { "epoch": 1.5427594070695552, "grad_norm": 25.988061904907227, "learning_rate": 8.218321600637329e-05, "loss": 0.2899, "step": 339 }, { "epoch": 1.547320410490308, "grad_norm": 9.74842643737793, "learning_rate": 8.2070410981557e-05, "loss": 0.1612, "step": 340 }, { "epoch": 1.5518814139110604, "grad_norm": 10.73891544342041, "learning_rate": 8.195732793733014e-05, "loss": 0.2282, "step": 341 }, { "epoch": 1.556442417331813, "grad_norm": 15.269837379455566, "learning_rate": 8.184396785401322e-05, "loss": 0.1585, "step": 342 }, { "epoch": 1.5610034207525656, "grad_norm": 7.805790901184082, "learning_rate": 8.173033171432841e-05, "loss": 0.1117, "step": 343 }, { "epoch": 1.565564424173318, "grad_norm": 9.819446563720703, "learning_rate": 8.1616420503391e-05, "loss": 0.2143, "step": 344 }, { "epoch": 1.5701254275940708, "grad_norm": 8.949931144714355, "learning_rate": 8.15022352087009e-05, "loss": 0.2139, "step": 345 }, { "epoch": 1.5746864310148232, "grad_norm": 14.177704811096191, "learning_rate": 8.138777682013403e-05, "loss": 0.2733, "step": 346 }, { "epoch": 1.5792474344355758, "grad_norm": 10.694663047790527, "learning_rate": 8.127304632993382e-05, "loss": 0.1532, "step": 347 }, { "epoch": 1.5838084378563284, "grad_norm": 14.421151161193848, "learning_rate": 8.115804473270253e-05, "loss": 0.1349, "step": 348 }, { "epoch": 1.5883694412770808, "grad_norm": 9.572623252868652, "learning_rate": 8.104277302539264e-05, "loss": 0.1852, "step": 349 }, { "epoch": 1.5929304446978336, "grad_norm": 8.018699645996094, "learning_rate": 8.092723220729825e-05, "loss": 0.1398, "step": 350 }, { "epoch": 1.597491448118586, "grad_norm": 10.331695556640625, "learning_rate": 8.081142328004637e-05, "loss": 0.1678, "step": 351 }, { "epoch": 1.6020524515393386, "grad_norm": 8.879880905151367, "learning_rate": 8.069534724758827e-05, "loss": 0.1527, "step": 352 }, { "epoch": 1.6066134549600912, "grad_norm": 11.865134239196777, "learning_rate": 8.057900511619076e-05, "loss": 0.174, "step": 353 }, { "epoch": 1.6111744583808438, "grad_norm": 20.736913681030273, "learning_rate": 8.046239789442749e-05, "loss": 0.14, "step": 354 }, { "epoch": 1.6157354618015964, "grad_norm": 8.29340648651123, "learning_rate": 8.034552659317012e-05, "loss": 0.1924, "step": 355 }, { "epoch": 1.6202964652223488, "grad_norm": 14.969886779785156, "learning_rate": 8.02283922255797e-05, "loss": 0.1776, "step": 356 }, { "epoch": 1.6248574686431014, "grad_norm": 41.689517974853516, "learning_rate": 8.011099580709778e-05, "loss": 0.1337, "step": 357 }, { "epoch": 1.629418472063854, "grad_norm": 9.815425872802734, "learning_rate": 7.999333835543763e-05, "loss": 0.1959, "step": 358 }, { "epoch": 1.6339794754846066, "grad_norm": 12.40318775177002, "learning_rate": 7.987542089057542e-05, "loss": 0.1968, "step": 359 }, { "epoch": 1.6385404789053593, "grad_norm": 8.287771224975586, "learning_rate": 7.975724443474143e-05, "loss": 0.1082, "step": 360 }, { "epoch": 1.6431014823261116, "grad_norm": 9.289151191711426, "learning_rate": 7.963881001241107e-05, "loss": 0.1176, "step": 361 }, { "epoch": 1.6476624857468645, "grad_norm": 12.972766876220703, "learning_rate": 7.952011865029614e-05, "loss": 0.2185, "step": 362 }, { "epoch": 1.6522234891676169, "grad_norm": 11.908880233764648, "learning_rate": 7.940117137733579e-05, "loss": 0.177, "step": 363 }, { "epoch": 1.6567844925883695, "grad_norm": 8.70804500579834, "learning_rate": 7.928196922468772e-05, "loss": 0.143, "step": 364 }, { "epoch": 1.661345496009122, "grad_norm": 11.03876781463623, "learning_rate": 7.916251322571918e-05, "loss": 0.1837, "step": 365 }, { "epoch": 1.6659064994298745, "grad_norm": 12.949993133544922, "learning_rate": 7.904280441599801e-05, "loss": 0.1652, "step": 366 }, { "epoch": 1.6704675028506273, "grad_norm": 10.857973098754883, "learning_rate": 7.892284383328367e-05, "loss": 0.1575, "step": 367 }, { "epoch": 1.6750285062713797, "grad_norm": 8.548442840576172, "learning_rate": 7.88026325175183e-05, "loss": 0.125, "step": 368 }, { "epoch": 1.6795895096921323, "grad_norm": 7.094759464263916, "learning_rate": 7.868217151081755e-05, "loss": 0.132, "step": 369 }, { "epoch": 1.6841505131128849, "grad_norm": 8.124651908874512, "learning_rate": 7.856146185746175e-05, "loss": 0.164, "step": 370 }, { "epoch": 1.6887115165336373, "grad_norm": 10.263216018676758, "learning_rate": 7.844050460388671e-05, "loss": 0.1476, "step": 371 }, { "epoch": 1.69327251995439, "grad_norm": 12.262899398803711, "learning_rate": 7.831930079867469e-05, "loss": 0.1952, "step": 372 }, { "epoch": 1.6978335233751425, "grad_norm": 8.35619831085205, "learning_rate": 7.819785149254532e-05, "loss": 0.1429, "step": 373 }, { "epoch": 1.702394526795895, "grad_norm": 10.857168197631836, "learning_rate": 7.807615773834652e-05, "loss": 0.1307, "step": 374 }, { "epoch": 1.7069555302166477, "grad_norm": 7.174655914306641, "learning_rate": 7.795422059104527e-05, "loss": 0.1304, "step": 375 }, { "epoch": 1.7115165336374, "grad_norm": 14.329642295837402, "learning_rate": 7.78320411077186e-05, "loss": 0.1997, "step": 376 }, { "epoch": 1.716077537058153, "grad_norm": 8.973917961120605, "learning_rate": 7.77096203475443e-05, "loss": 0.1583, "step": 377 }, { "epoch": 1.7206385404789053, "grad_norm": 6.451292514801025, "learning_rate": 7.758695937179185e-05, "loss": 0.1201, "step": 378 }, { "epoch": 1.725199543899658, "grad_norm": 6.503537178039551, "learning_rate": 7.746405924381313e-05, "loss": 0.0973, "step": 379 }, { "epoch": 1.7297605473204105, "grad_norm": 6.6744608879089355, "learning_rate": 7.734092102903323e-05, "loss": 0.1019, "step": 380 }, { "epoch": 1.734321550741163, "grad_norm": 9.03148365020752, "learning_rate": 7.721754579494127e-05, "loss": 0.1592, "step": 381 }, { "epoch": 1.7388825541619157, "grad_norm": 7.3868632316589355, "learning_rate": 7.709393461108107e-05, "loss": 0.1457, "step": 382 }, { "epoch": 1.7434435575826681, "grad_norm": 11.139805793762207, "learning_rate": 7.697008854904191e-05, "loss": 0.1512, "step": 383 }, { "epoch": 1.7480045610034207, "grad_norm": 9.616064071655273, "learning_rate": 7.68460086824492e-05, "loss": 0.194, "step": 384 }, { "epoch": 1.7525655644241733, "grad_norm": 8.999774932861328, "learning_rate": 7.672169608695525e-05, "loss": 0.1654, "step": 385 }, { "epoch": 1.757126567844926, "grad_norm": 12.37429141998291, "learning_rate": 7.659715184022994e-05, "loss": 0.2122, "step": 386 }, { "epoch": 1.7616875712656785, "grad_norm": 9.472933769226074, "learning_rate": 7.647237702195123e-05, "loss": 0.1587, "step": 387 }, { "epoch": 1.766248574686431, "grad_norm": 10.54593563079834, "learning_rate": 7.634737271379603e-05, "loss": 0.2103, "step": 388 }, { "epoch": 1.7708095781071835, "grad_norm": 6.688052654266357, "learning_rate": 7.622213999943062e-05, "loss": 0.0989, "step": 389 }, { "epoch": 1.7753705815279361, "grad_norm": 10.251477241516113, "learning_rate": 7.609667996450141e-05, "loss": 0.2219, "step": 390 }, { "epoch": 1.7799315849486887, "grad_norm": 6.267465591430664, "learning_rate": 7.59709936966254e-05, "loss": 0.0958, "step": 391 }, { "epoch": 1.7844925883694414, "grad_norm": 8.224940299987793, "learning_rate": 7.584508228538085e-05, "loss": 0.1312, "step": 392 }, { "epoch": 1.7890535917901937, "grad_norm": 12.063385963439941, "learning_rate": 7.571894682229775e-05, "loss": 0.1833, "step": 393 }, { "epoch": 1.7936145952109466, "grad_norm": 8.511308670043945, "learning_rate": 7.559258840084848e-05, "loss": 0.1442, "step": 394 }, { "epoch": 1.798175598631699, "grad_norm": 8.30827522277832, "learning_rate": 7.546600811643816e-05, "loss": 0.1438, "step": 395 }, { "epoch": 1.8027366020524516, "grad_norm": 11.280699729919434, "learning_rate": 7.533920706639531e-05, "loss": 0.2558, "step": 396 }, { "epoch": 1.8072976054732042, "grad_norm": 6.188623905181885, "learning_rate": 7.521218634996226e-05, "loss": 0.1072, "step": 397 }, { "epoch": 1.8118586088939566, "grad_norm": 15.961888313293457, "learning_rate": 7.508494706828564e-05, "loss": 0.1619, "step": 398 }, { "epoch": 1.8164196123147094, "grad_norm": 9.33893871307373, "learning_rate": 7.49574903244068e-05, "loss": 0.1995, "step": 399 }, { "epoch": 1.8209806157354618, "grad_norm": 12.080733299255371, "learning_rate": 7.482981722325232e-05, "loss": 0.1647, "step": 400 }, { "epoch": 1.8255416191562144, "grad_norm": 8.13494873046875, "learning_rate": 7.470192887162435e-05, "loss": 0.1278, "step": 401 }, { "epoch": 1.830102622576967, "grad_norm": 9.168209075927734, "learning_rate": 7.457382637819108e-05, "loss": 0.1244, "step": 402 }, { "epoch": 1.8346636259977194, "grad_norm": 8.094377517700195, "learning_rate": 7.444551085347707e-05, "loss": 0.1066, "step": 403 }, { "epoch": 1.8392246294184722, "grad_norm": 9.4691743850708, "learning_rate": 7.43169834098537e-05, "loss": 0.1378, "step": 404 }, { "epoch": 1.8437856328392246, "grad_norm": 9.721671104431152, "learning_rate": 7.418824516152943e-05, "loss": 0.1247, "step": 405 }, { "epoch": 1.8483466362599772, "grad_norm": 8.888439178466797, "learning_rate": 7.405929722454026e-05, "loss": 0.1256, "step": 406 }, { "epoch": 1.8529076396807298, "grad_norm": 7.914821147918701, "learning_rate": 7.393014071673992e-05, "loss": 0.1293, "step": 407 }, { "epoch": 1.8574686431014822, "grad_norm": 10.728132247924805, "learning_rate": 7.380077675779027e-05, "loss": 0.1738, "step": 408 }, { "epoch": 1.862029646522235, "grad_norm": 13.084773063659668, "learning_rate": 7.36712064691516e-05, "loss": 0.1478, "step": 409 }, { "epoch": 1.8665906499429874, "grad_norm": 6.1122660636901855, "learning_rate": 7.354143097407283e-05, "loss": 0.084, "step": 410 }, { "epoch": 1.87115165336374, "grad_norm": 16.69349479675293, "learning_rate": 7.341145139758185e-05, "loss": 0.1161, "step": 411 }, { "epoch": 1.8757126567844926, "grad_norm": 18.03197479248047, "learning_rate": 7.328126886647575e-05, "loss": 0.2555, "step": 412 }, { "epoch": 1.880273660205245, "grad_norm": 5.904569149017334, "learning_rate": 7.315088450931103e-05, "loss": 0.0978, "step": 413 }, { "epoch": 1.8848346636259978, "grad_norm": 8.850961685180664, "learning_rate": 7.302029945639377e-05, "loss": 0.1636, "step": 414 }, { "epoch": 1.8893956670467502, "grad_norm": 8.666600227355957, "learning_rate": 7.288951483976998e-05, "loss": 0.1544, "step": 415 }, { "epoch": 1.8939566704675028, "grad_norm": 8.048266410827637, "learning_rate": 7.275853179321565e-05, "loss": 0.1148, "step": 416 }, { "epoch": 1.8985176738882554, "grad_norm": 9.665177345275879, "learning_rate": 7.262735145222696e-05, "loss": 0.1452, "step": 417 }, { "epoch": 1.9030786773090078, "grad_norm": 6.529131889343262, "learning_rate": 7.249597495401043e-05, "loss": 0.0976, "step": 418 }, { "epoch": 1.9076396807297606, "grad_norm": 6.697221755981445, "learning_rate": 7.236440343747313e-05, "loss": 0.1207, "step": 419 }, { "epoch": 1.912200684150513, "grad_norm": 7.641704559326172, "learning_rate": 7.223263804321269e-05, "loss": 0.1102, "step": 420 }, { "epoch": 1.9167616875712656, "grad_norm": 5.448543071746826, "learning_rate": 7.21006799135075e-05, "loss": 0.0969, "step": 421 }, { "epoch": 1.9213226909920182, "grad_norm": 12.550832748413086, "learning_rate": 7.196853019230676e-05, "loss": 0.1629, "step": 422 }, { "epoch": 1.9258836944127709, "grad_norm": 7.084536552429199, "learning_rate": 7.183619002522062e-05, "loss": 0.1378, "step": 423 }, { "epoch": 1.9304446978335235, "grad_norm": 8.91976547241211, "learning_rate": 7.170366055951017e-05, "loss": 0.1177, "step": 424 }, { "epoch": 1.9350057012542758, "grad_norm": 9.238527297973633, "learning_rate": 7.157094294407756e-05, "loss": 0.1522, "step": 425 }, { "epoch": 1.9395667046750285, "grad_norm": 8.187129974365234, "learning_rate": 7.143803832945601e-05, "loss": 0.1134, "step": 426 }, { "epoch": 1.944127708095781, "grad_norm": 7.621769905090332, "learning_rate": 7.130494786779987e-05, "loss": 0.1011, "step": 427 }, { "epoch": 1.9486887115165337, "grad_norm": 11.57784652709961, "learning_rate": 7.117167271287453e-05, "loss": 0.1254, "step": 428 }, { "epoch": 1.9532497149372863, "grad_norm": 8.105171203613281, "learning_rate": 7.103821402004654e-05, "loss": 0.0994, "step": 429 }, { "epoch": 1.9578107183580387, "grad_norm": 10.137523651123047, "learning_rate": 7.090457294627358e-05, "loss": 0.0976, "step": 430 }, { "epoch": 1.9623717217787915, "grad_norm": 5.640718936920166, "learning_rate": 7.077075065009433e-05, "loss": 0.0887, "step": 431 }, { "epoch": 1.9669327251995439, "grad_norm": 10.016772270202637, "learning_rate": 7.063674829161853e-05, "loss": 0.1036, "step": 432 }, { "epoch": 1.9714937286202965, "grad_norm": 8.870481491088867, "learning_rate": 7.050256703251688e-05, "loss": 0.0973, "step": 433 }, { "epoch": 1.976054732041049, "grad_norm": 7.390217304229736, "learning_rate": 7.036820803601099e-05, "loss": 0.0966, "step": 434 }, { "epoch": 1.9806157354618015, "grad_norm": 9.348631858825684, "learning_rate": 7.023367246686323e-05, "loss": 0.1119, "step": 435 }, { "epoch": 1.9851767388825543, "grad_norm": 7.239314556121826, "learning_rate": 7.009896149136674e-05, "loss": 0.1167, "step": 436 }, { "epoch": 1.9897377423033067, "grad_norm": 7.269038200378418, "learning_rate": 6.996407627733526e-05, "loss": 0.123, "step": 437 }, { "epoch": 1.9942987457240593, "grad_norm": 10.885858535766602, "learning_rate": 6.982901799409294e-05, "loss": 0.1401, "step": 438 }, { "epoch": 1.998859749144812, "grad_norm": 13.753951072692871, "learning_rate": 6.969378781246436e-05, "loss": 0.0994, "step": 439 }, { "epoch": 2.0, "grad_norm": 21.25026512145996, "learning_rate": 6.955838690476426e-05, "loss": 0.1222, "step": 440 }, { "epoch": 2.0045610034207524, "grad_norm": 6.62643575668335, "learning_rate": 6.942281644478739e-05, "loss": 0.0698, "step": 441 }, { "epoch": 2.009122006841505, "grad_norm": 7.465953826904297, "learning_rate": 6.928707760779838e-05, "loss": 0.0795, "step": 442 }, { "epoch": 2.0136830102622576, "grad_norm": 7.87470006942749, "learning_rate": 6.915117157052149e-05, "loss": 0.0733, "step": 443 }, { "epoch": 2.0182440136830104, "grad_norm": 7.7158966064453125, "learning_rate": 6.90150995111305e-05, "loss": 0.0761, "step": 444 }, { "epoch": 2.022805017103763, "grad_norm": 9.388237953186035, "learning_rate": 6.887886260923842e-05, "loss": 0.111, "step": 445 }, { "epoch": 2.027366020524515, "grad_norm": 10.703797340393066, "learning_rate": 6.874246204588724e-05, "loss": 0.1158, "step": 446 }, { "epoch": 2.031927023945268, "grad_norm": 6.636610507965088, "learning_rate": 6.860589900353778e-05, "loss": 0.078, "step": 447 }, { "epoch": 2.0364880273660204, "grad_norm": 6.754958629608154, "learning_rate": 6.84691746660594e-05, "loss": 0.0676, "step": 448 }, { "epoch": 2.0410490307867732, "grad_norm": 8.061761856079102, "learning_rate": 6.833229021871974e-05, "loss": 0.0781, "step": 449 }, { "epoch": 2.0456100342075256, "grad_norm": 5.99964714050293, "learning_rate": 6.819524684817438e-05, "loss": 0.0645, "step": 450 }, { "epoch": 2.050171037628278, "grad_norm": 6.639948844909668, "learning_rate": 6.805804574245666e-05, "loss": 0.0721, "step": 451 }, { "epoch": 2.054732041049031, "grad_norm": 6.918362140655518, "learning_rate": 6.792068809096734e-05, "loss": 0.1027, "step": 452 }, { "epoch": 2.0592930444697832, "grad_norm": 6.9611616134643555, "learning_rate": 6.778317508446423e-05, "loss": 0.0902, "step": 453 }, { "epoch": 2.063854047890536, "grad_norm": 5.7177019119262695, "learning_rate": 6.764550791505197e-05, "loss": 0.0544, "step": 454 }, { "epoch": 2.0684150513112884, "grad_norm": 7.697108745574951, "learning_rate": 6.750768777617162e-05, "loss": 0.0673, "step": 455 }, { "epoch": 2.072976054732041, "grad_norm": 3.653858184814453, "learning_rate": 6.736971586259033e-05, "loss": 0.0413, "step": 456 }, { "epoch": 2.0775370581527937, "grad_norm": 6.587297439575195, "learning_rate": 6.723159337039097e-05, "loss": 0.0537, "step": 457 }, { "epoch": 2.082098061573546, "grad_norm": 5.920407295227051, "learning_rate": 6.709332149696185e-05, "loss": 0.0555, "step": 458 }, { "epoch": 2.086659064994299, "grad_norm": 5.50054407119751, "learning_rate": 6.695490144098621e-05, "loss": 0.0756, "step": 459 }, { "epoch": 2.0912200684150513, "grad_norm": 8.171920776367188, "learning_rate": 6.681633440243194e-05, "loss": 0.0817, "step": 460 }, { "epoch": 2.095781071835804, "grad_norm": 7.142725944519043, "learning_rate": 6.667762158254104e-05, "loss": 0.0579, "step": 461 }, { "epoch": 2.1003420752565565, "grad_norm": 6.230417251586914, "learning_rate": 6.653876418381937e-05, "loss": 0.0778, "step": 462 }, { "epoch": 2.104903078677309, "grad_norm": 7.222645282745361, "learning_rate": 6.639976341002614e-05, "loss": 0.0471, "step": 463 }, { "epoch": 2.1094640820980617, "grad_norm": 8.968223571777344, "learning_rate": 6.626062046616345e-05, "loss": 0.0631, "step": 464 }, { "epoch": 2.114025085518814, "grad_norm": 6.115957736968994, "learning_rate": 6.612133655846592e-05, "loss": 0.0605, "step": 465 }, { "epoch": 2.118586088939567, "grad_norm": 12.084288597106934, "learning_rate": 6.598191289439016e-05, "loss": 0.1068, "step": 466 }, { "epoch": 2.1231470923603193, "grad_norm": 6.62805700302124, "learning_rate": 6.584235068260432e-05, "loss": 0.0812, "step": 467 }, { "epoch": 2.1277080957810717, "grad_norm": 9.849166870117188, "learning_rate": 6.570265113297764e-05, "loss": 0.0972, "step": 468 }, { "epoch": 2.1322690992018245, "grad_norm": 8.004566192626953, "learning_rate": 6.556281545656999e-05, "loss": 0.0602, "step": 469 }, { "epoch": 2.136830102622577, "grad_norm": 5.589608192443848, "learning_rate": 6.542284486562124e-05, "loss": 0.0537, "step": 470 }, { "epoch": 2.1413911060433297, "grad_norm": 5.782744884490967, "learning_rate": 6.528274057354092e-05, "loss": 0.071, "step": 471 }, { "epoch": 2.145952109464082, "grad_norm": 5.03806734085083, "learning_rate": 6.514250379489753e-05, "loss": 0.052, "step": 472 }, { "epoch": 2.1505131128848345, "grad_norm": 8.631730079650879, "learning_rate": 6.500213574540823e-05, "loss": 0.0711, "step": 473 }, { "epoch": 2.1550741163055873, "grad_norm": 3.648717164993286, "learning_rate": 6.486163764192806e-05, "loss": 0.0558, "step": 474 }, { "epoch": 2.1596351197263397, "grad_norm": 5.878966808319092, "learning_rate": 6.472101070243952e-05, "loss": 0.0377, "step": 475 }, { "epoch": 2.1641961231470925, "grad_norm": 6.6274919509887695, "learning_rate": 6.458025614604203e-05, "loss": 0.063, "step": 476 }, { "epoch": 2.168757126567845, "grad_norm": 5.117002964019775, "learning_rate": 6.44393751929413e-05, "loss": 0.0675, "step": 477 }, { "epoch": 2.1733181299885973, "grad_norm": 4.451428413391113, "learning_rate": 6.429836906443879e-05, "loss": 0.0437, "step": 478 }, { "epoch": 2.17787913340935, "grad_norm": 7.2544755935668945, "learning_rate": 6.415723898292112e-05, "loss": 0.0816, "step": 479 }, { "epoch": 2.1824401368301025, "grad_norm": 7.115444183349609, "learning_rate": 6.401598617184939e-05, "loss": 0.0632, "step": 480 }, { "epoch": 2.1870011402508553, "grad_norm": 6.341275215148926, "learning_rate": 6.387461185574874e-05, "loss": 0.045, "step": 481 }, { "epoch": 2.1915621436716077, "grad_norm": 4.9018025398254395, "learning_rate": 6.373311726019763e-05, "loss": 0.0449, "step": 482 }, { "epoch": 2.19612314709236, "grad_norm": 6.2423906326293945, "learning_rate": 6.359150361181715e-05, "loss": 0.0609, "step": 483 }, { "epoch": 2.200684150513113, "grad_norm": 7.57888126373291, "learning_rate": 6.344977213826054e-05, "loss": 0.094, "step": 484 }, { "epoch": 2.2052451539338653, "grad_norm": 7.0582475662231445, "learning_rate": 6.330792406820242e-05, "loss": 0.0598, "step": 485 }, { "epoch": 2.209806157354618, "grad_norm": 6.263000011444092, "learning_rate": 6.316596063132822e-05, "loss": 0.0594, "step": 486 }, { "epoch": 2.2143671607753705, "grad_norm": 5.4891862869262695, "learning_rate": 6.302388305832351e-05, "loss": 0.0512, "step": 487 }, { "epoch": 2.2189281641961234, "grad_norm": 7.57410192489624, "learning_rate": 6.288169258086322e-05, "loss": 0.0746, "step": 488 }, { "epoch": 2.2234891676168758, "grad_norm": 7.4229631423950195, "learning_rate": 6.273939043160118e-05, "loss": 0.0609, "step": 489 }, { "epoch": 2.228050171037628, "grad_norm": 12.584153175354004, "learning_rate": 6.259697784415918e-05, "loss": 0.1267, "step": 490 }, { "epoch": 2.232611174458381, "grad_norm": 8.015351295471191, "learning_rate": 6.245445605311649e-05, "loss": 0.0611, "step": 491 }, { "epoch": 2.2371721778791334, "grad_norm": 8.479742050170898, "learning_rate": 6.231182629399901e-05, "loss": 0.052, "step": 492 }, { "epoch": 2.241733181299886, "grad_norm": 7.191579341888428, "learning_rate": 6.21690898032687e-05, "loss": 0.0738, "step": 493 }, { "epoch": 2.2462941847206386, "grad_norm": 6.246610641479492, "learning_rate": 6.202624781831268e-05, "loss": 0.0577, "step": 494 }, { "epoch": 2.250855188141391, "grad_norm": 4.082911968231201, "learning_rate": 6.188330157743267e-05, "loss": 0.0404, "step": 495 }, { "epoch": 2.255416191562144, "grad_norm": 5.735588550567627, "learning_rate": 6.174025231983416e-05, "loss": 0.0529, "step": 496 }, { "epoch": 2.259977194982896, "grad_norm": 9.69885540008545, "learning_rate": 6.159710128561575e-05, "loss": 0.0574, "step": 497 }, { "epoch": 2.264538198403649, "grad_norm": 7.707938194274902, "learning_rate": 6.145384971575823e-05, "loss": 0.0704, "step": 498 }, { "epoch": 2.2690992018244014, "grad_norm": 5.515017032623291, "learning_rate": 6.131049885211404e-05, "loss": 0.0501, "step": 499 }, { "epoch": 2.2736602052451538, "grad_norm": 7.536128997802734, "learning_rate": 6.116704993739635e-05, "loss": 0.0669, "step": 500 }, { "epoch": 2.2782212086659066, "grad_norm": 7.009504795074463, "learning_rate": 6.102350421516837e-05, "loss": 0.0609, "step": 501 }, { "epoch": 2.282782212086659, "grad_norm": 6.848779678344727, "learning_rate": 6.087986292983252e-05, "loss": 0.0742, "step": 502 }, { "epoch": 2.287343215507412, "grad_norm": 6.385640621185303, "learning_rate": 6.073612732661966e-05, "loss": 0.0537, "step": 503 }, { "epoch": 2.291904218928164, "grad_norm": 6.395091533660889, "learning_rate": 6.059229865157829e-05, "loss": 0.042, "step": 504 }, { "epoch": 2.2964652223489166, "grad_norm": 9.047046661376953, "learning_rate": 6.044837815156377e-05, "loss": 0.0676, "step": 505 }, { "epoch": 2.3010262257696694, "grad_norm": 4.578718662261963, "learning_rate": 6.030436707422745e-05, "loss": 0.0509, "step": 506 }, { "epoch": 2.305587229190422, "grad_norm": 5.892753601074219, "learning_rate": 6.016026666800597e-05, "loss": 0.0484, "step": 507 }, { "epoch": 2.3101482326111746, "grad_norm": 5.961977958679199, "learning_rate": 6.001607818211031e-05, "loss": 0.0653, "step": 508 }, { "epoch": 2.314709236031927, "grad_norm": 5.6413397789001465, "learning_rate": 5.987180286651503e-05, "loss": 0.0468, "step": 509 }, { "epoch": 2.3192702394526794, "grad_norm": 5.839052677154541, "learning_rate": 5.9727441971947395e-05, "loss": 0.0458, "step": 510 }, { "epoch": 2.3238312428734322, "grad_norm": 3.717437982559204, "learning_rate": 5.958299674987663e-05, "loss": 0.0322, "step": 511 }, { "epoch": 2.3283922462941846, "grad_norm": 5.86605978012085, "learning_rate": 5.943846845250291e-05, "loss": 0.0425, "step": 512 }, { "epoch": 2.3329532497149374, "grad_norm": 3.296215534210205, "learning_rate": 5.9293858332746644e-05, "loss": 0.034, "step": 513 }, { "epoch": 2.33751425313569, "grad_norm": 4.8171186447143555, "learning_rate": 5.9149167644237555e-05, "loss": 0.0427, "step": 514 }, { "epoch": 2.342075256556442, "grad_norm": 6.483091354370117, "learning_rate": 5.90043976413038e-05, "loss": 0.0545, "step": 515 }, { "epoch": 2.346636259977195, "grad_norm": 4.027348041534424, "learning_rate": 5.885954957896115e-05, "loss": 0.0376, "step": 516 }, { "epoch": 2.3511972633979474, "grad_norm": 3.8197691440582275, "learning_rate": 5.871462471290202e-05, "loss": 0.0287, "step": 517 }, { "epoch": 2.3557582668187003, "grad_norm": 15.101433753967285, "learning_rate": 5.8569624299484716e-05, "loss": 0.0699, "step": 518 }, { "epoch": 2.3603192702394526, "grad_norm": 6.565672397613525, "learning_rate": 5.842454959572239e-05, "loss": 0.0715, "step": 519 }, { "epoch": 2.364880273660205, "grad_norm": 5.648789405822754, "learning_rate": 5.827940185927227e-05, "loss": 0.066, "step": 520 }, { "epoch": 2.369441277080958, "grad_norm": 6.39064359664917, "learning_rate": 5.813418234842467e-05, "loss": 0.0425, "step": 521 }, { "epoch": 2.3740022805017102, "grad_norm": 6.164553165435791, "learning_rate": 5.798889232209217e-05, "loss": 0.0491, "step": 522 }, { "epoch": 2.378563283922463, "grad_norm": 6.937675476074219, "learning_rate": 5.78435330397986e-05, "loss": 0.0354, "step": 523 }, { "epoch": 2.3831242873432155, "grad_norm": 5.974575996398926, "learning_rate": 5.769810576166818e-05, "loss": 0.0504, "step": 524 }, { "epoch": 2.387685290763968, "grad_norm": 6.108855247497559, "learning_rate": 5.755261174841461e-05, "loss": 0.0597, "step": 525 }, { "epoch": 2.3922462941847207, "grad_norm": 5.981025695800781, "learning_rate": 5.740705226133013e-05, "loss": 0.062, "step": 526 }, { "epoch": 2.396807297605473, "grad_norm": 6.320438861846924, "learning_rate": 5.726142856227452e-05, "loss": 0.0499, "step": 527 }, { "epoch": 2.401368301026226, "grad_norm": 4.965454578399658, "learning_rate": 5.7115741913664264e-05, "loss": 0.0432, "step": 528 }, { "epoch": 2.4059293044469783, "grad_norm": 7.857591152191162, "learning_rate": 5.696999357846153e-05, "loss": 0.0564, "step": 529 }, { "epoch": 2.4104903078677307, "grad_norm": 5.360653877258301, "learning_rate": 5.682418482016329e-05, "loss": 0.037, "step": 530 }, { "epoch": 2.4150513112884835, "grad_norm": 5.187353610992432, "learning_rate": 5.6678316902790266e-05, "loss": 0.0434, "step": 531 }, { "epoch": 2.419612314709236, "grad_norm": 7.093838691711426, "learning_rate": 5.653239109087608e-05, "loss": 0.0816, "step": 532 }, { "epoch": 2.4241733181299887, "grad_norm": 8.862817764282227, "learning_rate": 5.6386408649456205e-05, "loss": 0.1088, "step": 533 }, { "epoch": 2.428734321550741, "grad_norm": 5.698467254638672, "learning_rate": 5.624037084405708e-05, "loss": 0.0764, "step": 534 }, { "epoch": 2.433295324971494, "grad_norm": 7.893596649169922, "learning_rate": 5.609427894068507e-05, "loss": 0.0743, "step": 535 }, { "epoch": 2.4378563283922463, "grad_norm": 3.882078170776367, "learning_rate": 5.594813420581554e-05, "loss": 0.0395, "step": 536 }, { "epoch": 2.4424173318129987, "grad_norm": 5.990970611572266, "learning_rate": 5.580193790638181e-05, "loss": 0.0434, "step": 537 }, { "epoch": 2.4469783352337515, "grad_norm": 6.1614789962768555, "learning_rate": 5.565569130976422e-05, "loss": 0.043, "step": 538 }, { "epoch": 2.451539338654504, "grad_norm": 4.174839973449707, "learning_rate": 5.5509395683779185e-05, "loss": 0.0583, "step": 539 }, { "epoch": 2.4561003420752567, "grad_norm": 3.416801929473877, "learning_rate": 5.536305229666815e-05, "loss": 0.034, "step": 540 }, { "epoch": 2.460661345496009, "grad_norm": 5.814635276794434, "learning_rate": 5.521666241708655e-05, "loss": 0.0409, "step": 541 }, { "epoch": 2.4652223489167615, "grad_norm": 4.838456153869629, "learning_rate": 5.5070227314092896e-05, "loss": 0.0428, "step": 542 }, { "epoch": 2.4697833523375143, "grad_norm": 7.684220790863037, "learning_rate": 5.492374825713775e-05, "loss": 0.0663, "step": 543 }, { "epoch": 2.4743443557582667, "grad_norm": 3.3523683547973633, "learning_rate": 5.47772265160527e-05, "loss": 0.0315, "step": 544 }, { "epoch": 2.4789053591790196, "grad_norm": 5.440591812133789, "learning_rate": 5.46306633610394e-05, "loss": 0.053, "step": 545 }, { "epoch": 2.483466362599772, "grad_norm": 4.606085300445557, "learning_rate": 5.448406006265846e-05, "loss": 0.0345, "step": 546 }, { "epoch": 2.4880273660205243, "grad_norm": 6.1201887130737305, "learning_rate": 5.433741789181853e-05, "loss": 0.0673, "step": 547 }, { "epoch": 2.492588369441277, "grad_norm": 7.997361660003662, "learning_rate": 5.419073811976525e-05, "loss": 0.0764, "step": 548 }, { "epoch": 2.4971493728620295, "grad_norm": 4.388640880584717, "learning_rate": 5.4044022018070214e-05, "loss": 0.0414, "step": 549 }, { "epoch": 2.5017103762827824, "grad_norm": 4.9629645347595215, "learning_rate": 5.3897270858619966e-05, "loss": 0.0424, "step": 550 }, { "epoch": 2.5062713797035348, "grad_norm": 7.596857070922852, "learning_rate": 5.3750485913604965e-05, "loss": 0.0453, "step": 551 }, { "epoch": 2.5108323831242876, "grad_norm": 5.5651068687438965, "learning_rate": 5.360366845550856e-05, "loss": 0.0339, "step": 552 }, { "epoch": 2.51539338654504, "grad_norm": 3.2136380672454834, "learning_rate": 5.345681975709594e-05, "loss": 0.0224, "step": 553 }, { "epoch": 2.5199543899657924, "grad_norm": 4.0387864112854, "learning_rate": 5.330994109140315e-05, "loss": 0.0296, "step": 554 }, { "epoch": 2.524515393386545, "grad_norm": 5.669864654541016, "learning_rate": 5.316303373172601e-05, "loss": 0.0543, "step": 555 }, { "epoch": 2.5290763968072976, "grad_norm": 3.9306421279907227, "learning_rate": 5.301609895160906e-05, "loss": 0.0374, "step": 556 }, { "epoch": 2.5336374002280504, "grad_norm": 3.963334321975708, "learning_rate": 5.286913802483459e-05, "loss": 0.0304, "step": 557 }, { "epoch": 2.538198403648803, "grad_norm": 4.443750858306885, "learning_rate": 5.2722152225411503e-05, "loss": 0.0397, "step": 558 }, { "epoch": 2.542759407069555, "grad_norm": 5.408681869506836, "learning_rate": 5.25751428275644e-05, "loss": 0.0408, "step": 559 }, { "epoch": 2.547320410490308, "grad_norm": 8.279979705810547, "learning_rate": 5.242811110572242e-05, "loss": 0.0392, "step": 560 }, { "epoch": 2.5518814139110604, "grad_norm": 4.709146022796631, "learning_rate": 5.228105833450819e-05, "loss": 0.0377, "step": 561 }, { "epoch": 2.556442417331813, "grad_norm": 7.52549409866333, "learning_rate": 5.213398578872688e-05, "loss": 0.0353, "step": 562 }, { "epoch": 2.5610034207525656, "grad_norm": 5.7986602783203125, "learning_rate": 5.198689474335503e-05, "loss": 0.0564, "step": 563 }, { "epoch": 2.565564424173318, "grad_norm": 6.1219611167907715, "learning_rate": 5.183978647352961e-05, "loss": 0.0441, "step": 564 }, { "epoch": 2.570125427594071, "grad_norm": 4.516667366027832, "learning_rate": 5.169266225453686e-05, "loss": 0.0316, "step": 565 }, { "epoch": 2.574686431014823, "grad_norm": 5.2199625968933105, "learning_rate": 5.154552336180132e-05, "loss": 0.0369, "step": 566 }, { "epoch": 2.579247434435576, "grad_norm": 4.977226257324219, "learning_rate": 5.139837107087468e-05, "loss": 0.0383, "step": 567 }, { "epoch": 2.5838084378563284, "grad_norm": 3.4498722553253174, "learning_rate": 5.1251206657424864e-05, "loss": 0.0215, "step": 568 }, { "epoch": 2.588369441277081, "grad_norm": 4.04592227935791, "learning_rate": 5.110403139722484e-05, "loss": 0.0249, "step": 569 }, { "epoch": 2.5929304446978336, "grad_norm": 6.6897969245910645, "learning_rate": 5.0956846566141595e-05, "loss": 0.0463, "step": 570 }, { "epoch": 2.597491448118586, "grad_norm": 5.076176166534424, "learning_rate": 5.080965344012508e-05, "loss": 0.0426, "step": 571 }, { "epoch": 2.602052451539339, "grad_norm": 11.544487953186035, "learning_rate": 5.066245329519721e-05, "loss": 0.0356, "step": 572 }, { "epoch": 2.6066134549600912, "grad_norm": 6.120387077331543, "learning_rate": 5.0515247407440705e-05, "loss": 0.0451, "step": 573 }, { "epoch": 2.6111744583808436, "grad_norm": 5.812496185302734, "learning_rate": 5.036803705298808e-05, "loss": 0.0293, "step": 574 }, { "epoch": 2.6157354618015964, "grad_norm": 4.080401420593262, "learning_rate": 5.022082350801055e-05, "loss": 0.032, "step": 575 }, { "epoch": 2.620296465222349, "grad_norm": 4.283697128295898, "learning_rate": 5.007360804870702e-05, "loss": 0.0161, "step": 576 }, { "epoch": 2.6248574686431017, "grad_norm": 5.630773544311523, "learning_rate": 4.9926391951292985e-05, "loss": 0.0428, "step": 577 }, { "epoch": 2.629418472063854, "grad_norm": 5.993396759033203, "learning_rate": 4.977917649198945e-05, "loss": 0.038, "step": 578 }, { "epoch": 2.6339794754846064, "grad_norm": 5.899278163909912, "learning_rate": 4.963196294701194e-05, "loss": 0.048, "step": 579 }, { "epoch": 2.6385404789053593, "grad_norm": 5.6876091957092285, "learning_rate": 4.9484752592559306e-05, "loss": 0.0358, "step": 580 }, { "epoch": 2.6431014823261116, "grad_norm": 8.28043270111084, "learning_rate": 4.9337546704802806e-05, "loss": 0.0446, "step": 581 }, { "epoch": 2.6476624857468645, "grad_norm": 3.0159778594970703, "learning_rate": 4.919034655987493e-05, "loss": 0.0202, "step": 582 }, { "epoch": 2.652223489167617, "grad_norm": 3.556821823120117, "learning_rate": 4.904315343385844e-05, "loss": 0.0359, "step": 583 }, { "epoch": 2.6567844925883692, "grad_norm": 9.480207443237305, "learning_rate": 4.889596860277519e-05, "loss": 0.0292, "step": 584 }, { "epoch": 2.661345496009122, "grad_norm": 4.381405830383301, "learning_rate": 4.8748793342575134e-05, "loss": 0.0432, "step": 585 }, { "epoch": 2.6659064994298745, "grad_norm": 3.772207260131836, "learning_rate": 4.860162892912532e-05, "loss": 0.0172, "step": 586 }, { "epoch": 2.6704675028506273, "grad_norm": 4.178829193115234, "learning_rate": 4.84544766381987e-05, "loss": 0.0344, "step": 587 }, { "epoch": 2.6750285062713797, "grad_norm": 12.805524826049805, "learning_rate": 4.830733774546315e-05, "loss": 0.0377, "step": 588 }, { "epoch": 2.679589509692132, "grad_norm": 5.325319290161133, "learning_rate": 4.8160213526470403e-05, "loss": 0.0533, "step": 589 }, { "epoch": 2.684150513112885, "grad_norm": 6.815293788909912, "learning_rate": 4.801310525664498e-05, "loss": 0.0256, "step": 590 }, { "epoch": 2.6887115165336373, "grad_norm": 5.803644180297852, "learning_rate": 4.7866014211273135e-05, "loss": 0.0179, "step": 591 }, { "epoch": 2.69327251995439, "grad_norm": 4.223587512969971, "learning_rate": 4.7718941665491825e-05, "loss": 0.0337, "step": 592 }, { "epoch": 2.6978335233751425, "grad_norm": 3.5225670337677, "learning_rate": 4.7571888894277604e-05, "loss": 0.0251, "step": 593 }, { "epoch": 2.702394526795895, "grad_norm": 4.653651714324951, "learning_rate": 4.7424857172435596e-05, "loss": 0.0309, "step": 594 }, { "epoch": 2.7069555302166477, "grad_norm": 4.962986469268799, "learning_rate": 4.72778477745885e-05, "loss": 0.0375, "step": 595 }, { "epoch": 2.7115165336374, "grad_norm": 7.884991645812988, "learning_rate": 4.713086197516542e-05, "loss": 0.0619, "step": 596 }, { "epoch": 2.716077537058153, "grad_norm": 6.011470317840576, "learning_rate": 4.698390104839096e-05, "loss": 0.0304, "step": 597 }, { "epoch": 2.7206385404789053, "grad_norm": 4.3046159744262695, "learning_rate": 4.683696626827401e-05, "loss": 0.0251, "step": 598 }, { "epoch": 2.7251995438996577, "grad_norm": 4.003452301025391, "learning_rate": 4.669005890859686e-05, "loss": 0.0231, "step": 599 }, { "epoch": 2.7297605473204105, "grad_norm": 4.7901530265808105, "learning_rate": 4.654318024290407e-05, "loss": 0.0425, "step": 600 }, { "epoch": 2.734321550741163, "grad_norm": 4.104437351226807, "learning_rate": 4.639633154449146e-05, "loss": 0.0284, "step": 601 }, { "epoch": 2.7388825541619157, "grad_norm": 3.93487811088562, "learning_rate": 4.624951408639503e-05, "loss": 0.0294, "step": 602 }, { "epoch": 2.743443557582668, "grad_norm": 6.138600826263428, "learning_rate": 4.610272914138004e-05, "loss": 0.0315, "step": 603 }, { "epoch": 2.7480045610034205, "grad_norm": 6.667906761169434, "learning_rate": 4.59559779819298e-05, "loss": 0.0395, "step": 604 }, { "epoch": 2.7525655644241733, "grad_norm": 4.121731281280518, "learning_rate": 4.5809261880234764e-05, "loss": 0.0319, "step": 605 }, { "epoch": 2.757126567844926, "grad_norm": 3.8120853900909424, "learning_rate": 4.566258210818148e-05, "loss": 0.029, "step": 606 }, { "epoch": 2.7616875712656785, "grad_norm": 3.287109851837158, "learning_rate": 4.5515939937341556e-05, "loss": 0.0224, "step": 607 }, { "epoch": 2.766248574686431, "grad_norm": 3.322906970977783, "learning_rate": 4.5369336638960616e-05, "loss": 0.0233, "step": 608 }, { "epoch": 2.7708095781071833, "grad_norm": 1.5358067750930786, "learning_rate": 4.522277348394731e-05, "loss": 0.0088, "step": 609 }, { "epoch": 2.775370581527936, "grad_norm": 4.789572715759277, "learning_rate": 4.507625174286226e-05, "loss": 0.0357, "step": 610 }, { "epoch": 2.779931584948689, "grad_norm": 3.2839534282684326, "learning_rate": 4.492977268590711e-05, "loss": 0.0237, "step": 611 }, { "epoch": 2.7844925883694414, "grad_norm": 4.322288513183594, "learning_rate": 4.478333758291347e-05, "loss": 0.0387, "step": 612 }, { "epoch": 2.7890535917901937, "grad_norm": 6.475346088409424, "learning_rate": 4.4636947703331864e-05, "loss": 0.0426, "step": 613 }, { "epoch": 2.7936145952109466, "grad_norm": 4.861753940582275, "learning_rate": 4.449060431622082e-05, "loss": 0.0286, "step": 614 }, { "epoch": 2.798175598631699, "grad_norm": 2.6957809925079346, "learning_rate": 4.434430869023579e-05, "loss": 0.0222, "step": 615 }, { "epoch": 2.802736602052452, "grad_norm": 6.314004898071289, "learning_rate": 4.419806209361822e-05, "loss": 0.033, "step": 616 }, { "epoch": 2.807297605473204, "grad_norm": 5.230919361114502, "learning_rate": 4.405186579418448e-05, "loss": 0.0195, "step": 617 }, { "epoch": 2.8118586088939566, "grad_norm": 4.417494773864746, "learning_rate": 4.390572105931492e-05, "loss": 0.0372, "step": 618 }, { "epoch": 2.8164196123147094, "grad_norm": 6.1748552322387695, "learning_rate": 4.375962915594292e-05, "loss": 0.0284, "step": 619 }, { "epoch": 2.8209806157354618, "grad_norm": 8.319523811340332, "learning_rate": 4.36135913505438e-05, "loss": 0.0392, "step": 620 }, { "epoch": 2.8255416191562146, "grad_norm": 6.181365489959717, "learning_rate": 4.346760890912394e-05, "loss": 0.0521, "step": 621 }, { "epoch": 2.830102622576967, "grad_norm": 4.259495735168457, "learning_rate": 4.3321683097209745e-05, "loss": 0.0181, "step": 622 }, { "epoch": 2.8346636259977194, "grad_norm": 6.773824214935303, "learning_rate": 4.317581517983673e-05, "loss": 0.0229, "step": 623 }, { "epoch": 2.839224629418472, "grad_norm": 7.4220356941223145, "learning_rate": 4.303000642153847e-05, "loss": 0.0316, "step": 624 }, { "epoch": 2.8437856328392246, "grad_norm": 6.611977577209473, "learning_rate": 4.288425808633575e-05, "loss": 0.0333, "step": 625 }, { "epoch": 2.8483466362599774, "grad_norm": 4.086733818054199, "learning_rate": 4.27385714377255e-05, "loss": 0.022, "step": 626 }, { "epoch": 2.85290763968073, "grad_norm": 3.454923629760742, "learning_rate": 4.259294773866987e-05, "loss": 0.0273, "step": 627 }, { "epoch": 2.857468643101482, "grad_norm": 2.6385574340820312, "learning_rate": 4.2447388251585384e-05, "loss": 0.0167, "step": 628 }, { "epoch": 2.862029646522235, "grad_norm": 3.3853583335876465, "learning_rate": 4.230189423833183e-05, "loss": 0.0261, "step": 629 }, { "epoch": 2.8665906499429874, "grad_norm": 2.6701831817626953, "learning_rate": 4.215646696020141e-05, "loss": 0.0251, "step": 630 }, { "epoch": 2.8711516533637402, "grad_norm": 2.350428342819214, "learning_rate": 4.201110767790784e-05, "loss": 0.013, "step": 631 }, { "epoch": 2.8757126567844926, "grad_norm": 4.5163373947143555, "learning_rate": 4.186581765157534e-05, "loss": 0.0333, "step": 632 }, { "epoch": 2.880273660205245, "grad_norm": 4.357926368713379, "learning_rate": 4.172059814072776e-05, "loss": 0.0204, "step": 633 }, { "epoch": 2.884834663625998, "grad_norm": 5.049153804779053, "learning_rate": 4.157545040427763e-05, "loss": 0.0508, "step": 634 }, { "epoch": 2.88939566704675, "grad_norm": 3.518669366836548, "learning_rate": 4.143037570051529e-05, "loss": 0.0267, "step": 635 }, { "epoch": 2.893956670467503, "grad_norm": 2.3187649250030518, "learning_rate": 4.1285375287097976e-05, "loss": 0.0144, "step": 636 }, { "epoch": 2.8985176738882554, "grad_norm": 5.340627193450928, "learning_rate": 4.114045042103887e-05, "loss": 0.0221, "step": 637 }, { "epoch": 2.903078677309008, "grad_norm": 5.597445487976074, "learning_rate": 4.099560235869621e-05, "loss": 0.0217, "step": 638 }, { "epoch": 2.9076396807297606, "grad_norm": 10.440146446228027, "learning_rate": 4.085083235576246e-05, "loss": 0.0383, "step": 639 }, { "epoch": 2.912200684150513, "grad_norm": 6.332849502563477, "learning_rate": 4.070614166725337e-05, "loss": 0.0341, "step": 640 }, { "epoch": 2.916761687571266, "grad_norm": 3.814603090286255, "learning_rate": 4.056153154749711e-05, "loss": 0.0336, "step": 641 }, { "epoch": 2.9213226909920182, "grad_norm": 6.715717315673828, "learning_rate": 4.04170032501234e-05, "loss": 0.0396, "step": 642 }, { "epoch": 2.9258836944127706, "grad_norm": 5.1199140548706055, "learning_rate": 4.02725580280526e-05, "loss": 0.0311, "step": 643 }, { "epoch": 2.9304446978335235, "grad_norm": 2.676060438156128, "learning_rate": 4.012819713348499e-05, "loss": 0.0188, "step": 644 }, { "epoch": 2.935005701254276, "grad_norm": 7.360265254974365, "learning_rate": 3.9983921817889694e-05, "loss": 0.0318, "step": 645 }, { "epoch": 2.9395667046750287, "grad_norm": 2.801821708679199, "learning_rate": 3.9839733331994036e-05, "loss": 0.0176, "step": 646 }, { "epoch": 2.944127708095781, "grad_norm": 6.362011432647705, "learning_rate": 3.9695632925772555e-05, "loss": 0.0353, "step": 647 }, { "epoch": 2.9486887115165334, "grad_norm": 3.604642152786255, "learning_rate": 3.955162184843625e-05, "loss": 0.0298, "step": 648 }, { "epoch": 2.9532497149372863, "grad_norm": 4.035106182098389, "learning_rate": 3.940770134842172e-05, "loss": 0.0312, "step": 649 }, { "epoch": 2.9578107183580387, "grad_norm": 3.8275413513183594, "learning_rate": 3.9263872673380356e-05, "loss": 0.0242, "step": 650 }, { "epoch": 2.9623717217787915, "grad_norm": 6.57460355758667, "learning_rate": 3.912013707016748e-05, "loss": 0.0362, "step": 651 }, { "epoch": 2.966932725199544, "grad_norm": 2.4332528114318848, "learning_rate": 3.897649578483163e-05, "loss": 0.0124, "step": 652 }, { "epoch": 2.9714937286202963, "grad_norm": 2.740542411804199, "learning_rate": 3.883295006260366e-05, "loss": 0.0141, "step": 653 }, { "epoch": 2.976054732041049, "grad_norm": 4.145965576171875, "learning_rate": 3.868950114788597e-05, "loss": 0.0227, "step": 654 }, { "epoch": 2.9806157354618015, "grad_norm": 3.8210062980651855, "learning_rate": 3.8546150284241784e-05, "loss": 0.0256, "step": 655 }, { "epoch": 2.9851767388825543, "grad_norm": 2.960423469543457, "learning_rate": 3.840289871438427e-05, "loss": 0.0234, "step": 656 }, { "epoch": 2.9897377423033067, "grad_norm": 2.305687189102173, "learning_rate": 3.8259747680165835e-05, "loss": 0.0167, "step": 657 }, { "epoch": 2.994298745724059, "grad_norm": 3.4394383430480957, "learning_rate": 3.811669842256733e-05, "loss": 0.0188, "step": 658 }, { "epoch": 2.998859749144812, "grad_norm": 4.973351955413818, "learning_rate": 3.7973752181687335e-05, "loss": 0.0154, "step": 659 }, { "epoch": 3.0, "grad_norm": 8.8117036819458, "learning_rate": 3.78309101967313e-05, "loss": 0.0315, "step": 660 }, { "epoch": 3.0045610034207524, "grad_norm": 2.305928945541382, "learning_rate": 3.768817370600098e-05, "loss": 0.0068, "step": 661 }, { "epoch": 3.009122006841505, "grad_norm": 3.3252930641174316, "learning_rate": 3.754554394688353e-05, "loss": 0.0249, "step": 662 }, { "epoch": 3.0136830102622576, "grad_norm": 2.8702948093414307, "learning_rate": 3.740302215584083e-05, "loss": 0.0141, "step": 663 }, { "epoch": 3.0182440136830104, "grad_norm": 1.2875800132751465, "learning_rate": 3.726060956839884e-05, "loss": 0.0088, "step": 664 }, { "epoch": 3.022805017103763, "grad_norm": 2.241077184677124, "learning_rate": 3.7118307419136784e-05, "loss": 0.0158, "step": 665 }, { "epoch": 3.027366020524515, "grad_norm": 3.223511219024658, "learning_rate": 3.697611694167652e-05, "loss": 0.0172, "step": 666 }, { "epoch": 3.031927023945268, "grad_norm": 1.6573007106781006, "learning_rate": 3.683403936867179e-05, "loss": 0.0061, "step": 667 }, { "epoch": 3.0364880273660204, "grad_norm": 2.708397388458252, "learning_rate": 3.6692075931797586e-05, "loss": 0.0189, "step": 668 }, { "epoch": 3.0410490307867732, "grad_norm": 1.0548919439315796, "learning_rate": 3.6550227861739474e-05, "loss": 0.0057, "step": 669 }, { "epoch": 3.0456100342075256, "grad_norm": 2.2885780334472656, "learning_rate": 3.640849638818286e-05, "loss": 0.0114, "step": 670 }, { "epoch": 3.050171037628278, "grad_norm": 2.5884759426116943, "learning_rate": 3.6266882739802385e-05, "loss": 0.0135, "step": 671 }, { "epoch": 3.054732041049031, "grad_norm": 1.9603959321975708, "learning_rate": 3.612538814425127e-05, "loss": 0.0078, "step": 672 }, { "epoch": 3.0592930444697832, "grad_norm": 1.8305693864822388, "learning_rate": 3.598401382815062e-05, "loss": 0.0191, "step": 673 }, { "epoch": 3.063854047890536, "grad_norm": 6.010728359222412, "learning_rate": 3.584276101707892e-05, "loss": 0.0228, "step": 674 }, { "epoch": 3.0684150513112884, "grad_norm": 0.7543220520019531, "learning_rate": 3.570163093556123e-05, "loss": 0.0049, "step": 675 }, { "epoch": 3.072976054732041, "grad_norm": 3.3827199935913086, "learning_rate": 3.556062480705871e-05, "loss": 0.0205, "step": 676 }, { "epoch": 3.0775370581527937, "grad_norm": 3.6950876712799072, "learning_rate": 3.541974385395799e-05, "loss": 0.013, "step": 677 }, { "epoch": 3.082098061573546, "grad_norm": 2.4483461380004883, "learning_rate": 3.527898929756049e-05, "loss": 0.0112, "step": 678 }, { "epoch": 3.086659064994299, "grad_norm": 1.083269715309143, "learning_rate": 3.5138362358071955e-05, "loss": 0.0051, "step": 679 }, { "epoch": 3.0912200684150513, "grad_norm": 3.507185459136963, "learning_rate": 3.4997864254591786e-05, "loss": 0.0184, "step": 680 }, { "epoch": 3.095781071835804, "grad_norm": 1.244012713432312, "learning_rate": 3.4857496205102474e-05, "loss": 0.0058, "step": 681 }, { "epoch": 3.1003420752565565, "grad_norm": 1.7855976819992065, "learning_rate": 3.47172594264591e-05, "loss": 0.0081, "step": 682 }, { "epoch": 3.104903078677309, "grad_norm": 3.3493175506591797, "learning_rate": 3.457715513437878e-05, "loss": 0.0099, "step": 683 }, { "epoch": 3.1094640820980617, "grad_norm": 2.2510809898376465, "learning_rate": 3.443718454343003e-05, "loss": 0.0103, "step": 684 }, { "epoch": 3.114025085518814, "grad_norm": 2.8858158588409424, "learning_rate": 3.429734886702235e-05, "loss": 0.0201, "step": 685 }, { "epoch": 3.118586088939567, "grad_norm": 2.9078762531280518, "learning_rate": 3.415764931739569e-05, "loss": 0.0126, "step": 686 }, { "epoch": 3.1231470923603193, "grad_norm": 11.870075225830078, "learning_rate": 3.401808710560984e-05, "loss": 0.0367, "step": 687 }, { "epoch": 3.1277080957810717, "grad_norm": 2.278918504714966, "learning_rate": 3.3878663441534074e-05, "loss": 0.0131, "step": 688 }, { "epoch": 3.1322690992018245, "grad_norm": 2.479530096054077, "learning_rate": 3.3739379533836545e-05, "loss": 0.0157, "step": 689 }, { "epoch": 3.136830102622577, "grad_norm": 5.716989517211914, "learning_rate": 3.360023658997387e-05, "loss": 0.0121, "step": 690 }, { "epoch": 3.1413911060433297, "grad_norm": 3.942920446395874, "learning_rate": 3.346123581618064e-05, "loss": 0.0119, "step": 691 }, { "epoch": 3.145952109464082, "grad_norm": 3.193537712097168, "learning_rate": 3.332237841745898e-05, "loss": 0.0214, "step": 692 }, { "epoch": 3.1505131128848345, "grad_norm": 6.671420574188232, "learning_rate": 3.318366559756807e-05, "loss": 0.0141, "step": 693 }, { "epoch": 3.1550741163055873, "grad_norm": 1.345292329788208, "learning_rate": 3.304509855901379e-05, "loss": 0.0052, "step": 694 }, { "epoch": 3.1596351197263397, "grad_norm": 1.9885728359222412, "learning_rate": 3.290667850303816e-05, "loss": 0.0139, "step": 695 }, { "epoch": 3.1641961231470925, "grad_norm": 2.2747621536254883, "learning_rate": 3.276840662960904e-05, "loss": 0.0111, "step": 696 }, { "epoch": 3.168757126567845, "grad_norm": 1.8984365463256836, "learning_rate": 3.26302841374097e-05, "loss": 0.0094, "step": 697 }, { "epoch": 3.1733181299885973, "grad_norm": 1.0688873529434204, "learning_rate": 3.2492312223828395e-05, "loss": 0.0066, "step": 698 }, { "epoch": 3.17787913340935, "grad_norm": 3.808332681655884, "learning_rate": 3.235449208494804e-05, "loss": 0.0267, "step": 699 }, { "epoch": 3.1824401368301025, "grad_norm": 2.2355008125305176, "learning_rate": 3.221682491553578e-05, "loss": 0.0098, "step": 700 }, { "epoch": 3.1870011402508553, "grad_norm": 2.873753547668457, "learning_rate": 3.207931190903267e-05, "loss": 0.0211, "step": 701 }, { "epoch": 3.1915621436716077, "grad_norm": 1.0170818567276, "learning_rate": 3.194195425754333e-05, "loss": 0.0042, "step": 702 }, { "epoch": 3.19612314709236, "grad_norm": 1.7715719938278198, "learning_rate": 3.180475315182563e-05, "loss": 0.0042, "step": 703 }, { "epoch": 3.200684150513113, "grad_norm": 3.1615145206451416, "learning_rate": 3.166770978128027e-05, "loss": 0.0096, "step": 704 }, { "epoch": 3.2052451539338653, "grad_norm": 1.4706147909164429, "learning_rate": 3.1530825333940606e-05, "loss": 0.0069, "step": 705 }, { "epoch": 3.209806157354618, "grad_norm": 1.4655320644378662, "learning_rate": 3.139410099646223e-05, "loss": 0.007, "step": 706 }, { "epoch": 3.2143671607753705, "grad_norm": 2.010169506072998, "learning_rate": 3.1257537954112784e-05, "loss": 0.0061, "step": 707 }, { "epoch": 3.2189281641961234, "grad_norm": 2.4863297939300537, "learning_rate": 3.112113739076161e-05, "loss": 0.0135, "step": 708 }, { "epoch": 3.2234891676168758, "grad_norm": 0.9651201963424683, "learning_rate": 3.09849004888695e-05, "loss": 0.0059, "step": 709 }, { "epoch": 3.228050171037628, "grad_norm": 1.4260286092758179, "learning_rate": 3.084882842947851e-05, "loss": 0.0058, "step": 710 }, { "epoch": 3.232611174458381, "grad_norm": 2.2575788497924805, "learning_rate": 3.071292239220164e-05, "loss": 0.0072, "step": 711 }, { "epoch": 3.2371721778791334, "grad_norm": 2.315737247467041, "learning_rate": 3.057718355521262e-05, "loss": 0.0072, "step": 712 }, { "epoch": 3.241733181299886, "grad_norm": 1.3366587162017822, "learning_rate": 3.0441613095235755e-05, "loss": 0.0079, "step": 713 }, { "epoch": 3.2462941847206386, "grad_norm": 1.586226224899292, "learning_rate": 3.0306212187535653e-05, "loss": 0.0087, "step": 714 }, { "epoch": 3.250855188141391, "grad_norm": 1.7931245565414429, "learning_rate": 3.0170982005907066e-05, "loss": 0.0077, "step": 715 }, { "epoch": 3.255416191562144, "grad_norm": 3.0318455696105957, "learning_rate": 3.003592372266476e-05, "loss": 0.0055, "step": 716 }, { "epoch": 3.259977194982896, "grad_norm": 1.366331934928894, "learning_rate": 2.990103850863327e-05, "loss": 0.0047, "step": 717 }, { "epoch": 3.264538198403649, "grad_norm": 0.7723425030708313, "learning_rate": 2.9766327533136774e-05, "loss": 0.0049, "step": 718 }, { "epoch": 3.2690992018244014, "grad_norm": 3.0485169887542725, "learning_rate": 2.963179196398902e-05, "loss": 0.007, "step": 719 }, { "epoch": 3.2736602052451538, "grad_norm": 1.4431509971618652, "learning_rate": 2.9497432967483124e-05, "loss": 0.0051, "step": 720 }, { "epoch": 3.2782212086659066, "grad_norm": 1.8810582160949707, "learning_rate": 2.9363251708381477e-05, "loss": 0.0069, "step": 721 }, { "epoch": 3.282782212086659, "grad_norm": 0.9738725423812866, "learning_rate": 2.9229249349905684e-05, "loss": 0.0049, "step": 722 }, { "epoch": 3.287343215507412, "grad_norm": 2.8499152660369873, "learning_rate": 2.9095427053726442e-05, "loss": 0.0093, "step": 723 }, { "epoch": 3.291904218928164, "grad_norm": 2.7052316665649414, "learning_rate": 2.896178597995347e-05, "loss": 0.0109, "step": 724 }, { "epoch": 3.2964652223489166, "grad_norm": 0.7764227986335754, "learning_rate": 2.882832728712551e-05, "loss": 0.0046, "step": 725 }, { "epoch": 3.3010262257696694, "grad_norm": 2.247260570526123, "learning_rate": 2.869505213220014e-05, "loss": 0.0078, "step": 726 }, { "epoch": 3.305587229190422, "grad_norm": 0.8493082523345947, "learning_rate": 2.8561961670543995e-05, "loss": 0.0049, "step": 727 }, { "epoch": 3.3101482326111746, "grad_norm": 0.709747850894928, "learning_rate": 2.8429057055922448e-05, "loss": 0.0039, "step": 728 }, { "epoch": 3.314709236031927, "grad_norm": 1.864812970161438, "learning_rate": 2.8296339440489837e-05, "loss": 0.0094, "step": 729 }, { "epoch": 3.3192702394526794, "grad_norm": 2.6070640087127686, "learning_rate": 2.8163809974779405e-05, "loss": 0.0127, "step": 730 }, { "epoch": 3.3238312428734322, "grad_norm": 0.6544182896614075, "learning_rate": 2.8031469807693257e-05, "loss": 0.0029, "step": 731 }, { "epoch": 3.3283922462941846, "grad_norm": 2.191878318786621, "learning_rate": 2.789932008649252e-05, "loss": 0.0075, "step": 732 }, { "epoch": 3.3329532497149374, "grad_norm": 1.73975670337677, "learning_rate": 2.776736195678734e-05, "loss": 0.0083, "step": 733 }, { "epoch": 3.33751425313569, "grad_norm": 1.4151902198791504, "learning_rate": 2.7635596562526865e-05, "loss": 0.0071, "step": 734 }, { "epoch": 3.342075256556442, "grad_norm": 2.084052562713623, "learning_rate": 2.7504025045989577e-05, "loss": 0.0098, "step": 735 }, { "epoch": 3.346636259977195, "grad_norm": 1.386006474494934, "learning_rate": 2.737264854777306e-05, "loss": 0.0083, "step": 736 }, { "epoch": 3.3511972633979474, "grad_norm": 2.138157606124878, "learning_rate": 2.724146820678436e-05, "loss": 0.0061, "step": 737 }, { "epoch": 3.3557582668187003, "grad_norm": 0.871894896030426, "learning_rate": 2.7110485160230037e-05, "loss": 0.0047, "step": 738 }, { "epoch": 3.3603192702394526, "grad_norm": 2.6974539756774902, "learning_rate": 2.6979700543606245e-05, "loss": 0.0063, "step": 739 }, { "epoch": 3.364880273660205, "grad_norm": 1.140648603439331, "learning_rate": 2.6849115490689013e-05, "loss": 0.0046, "step": 740 }, { "epoch": 3.369441277080958, "grad_norm": 5.994167804718018, "learning_rate": 2.6718731133524265e-05, "loss": 0.0116, "step": 741 }, { "epoch": 3.3740022805017102, "grad_norm": 2.2735085487365723, "learning_rate": 2.6588548602418156e-05, "loss": 0.0092, "step": 742 }, { "epoch": 3.378563283922463, "grad_norm": 1.1353788375854492, "learning_rate": 2.6458569025927183e-05, "loss": 0.0038, "step": 743 }, { "epoch": 3.3831242873432155, "grad_norm": 1.2895104885101318, "learning_rate": 2.6328793530848405e-05, "loss": 0.0053, "step": 744 }, { "epoch": 3.387685290763968, "grad_norm": 1.2267273664474487, "learning_rate": 2.6199223242209747e-05, "loss": 0.0054, "step": 745 }, { "epoch": 3.3922462941847207, "grad_norm": 1.287307858467102, "learning_rate": 2.6069859283260097e-05, "loss": 0.005, "step": 746 }, { "epoch": 3.396807297605473, "grad_norm": 0.8949470520019531, "learning_rate": 2.5940702775459747e-05, "loss": 0.0046, "step": 747 }, { "epoch": 3.401368301026226, "grad_norm": 3.3357739448547363, "learning_rate": 2.5811754838470583e-05, "loss": 0.0076, "step": 748 }, { "epoch": 3.4059293044469783, "grad_norm": 7.779567718505859, "learning_rate": 2.5683016590146318e-05, "loss": 0.0072, "step": 749 }, { "epoch": 3.4104903078677307, "grad_norm": 2.3621115684509277, "learning_rate": 2.5554489146522958e-05, "loss": 0.0091, "step": 750 }, { "epoch": 3.4150513112884835, "grad_norm": 1.8572039604187012, "learning_rate": 2.542617362180893e-05, "loss": 0.0074, "step": 751 }, { "epoch": 3.419612314709236, "grad_norm": 2.4851315021514893, "learning_rate": 2.5298071128375644e-05, "loss": 0.0045, "step": 752 }, { "epoch": 3.4241733181299887, "grad_norm": 2.945042371749878, "learning_rate": 2.5170182776747687e-05, "loss": 0.009, "step": 753 }, { "epoch": 3.428734321550741, "grad_norm": 2.0146474838256836, "learning_rate": 2.5042509675593195e-05, "loss": 0.0115, "step": 754 }, { "epoch": 3.433295324971494, "grad_norm": 0.28727975487709045, "learning_rate": 2.491505293171438e-05, "loss": 0.0022, "step": 755 }, { "epoch": 3.4378563283922463, "grad_norm": 4.880654335021973, "learning_rate": 2.478781365003775e-05, "loss": 0.0218, "step": 756 }, { "epoch": 3.4424173318129987, "grad_norm": 0.8212169408798218, "learning_rate": 2.46607929336047e-05, "loss": 0.0037, "step": 757 }, { "epoch": 3.4469783352337515, "grad_norm": 2.413165330886841, "learning_rate": 2.4533991883561868e-05, "loss": 0.0115, "step": 758 }, { "epoch": 3.451539338654504, "grad_norm": 1.3072090148925781, "learning_rate": 2.440741159915153e-05, "loss": 0.0057, "step": 759 }, { "epoch": 3.4561003420752567, "grad_norm": 2.672323703765869, "learning_rate": 2.4281053177702256e-05, "loss": 0.0105, "step": 760 }, { "epoch": 3.460661345496009, "grad_norm": 1.8092498779296875, "learning_rate": 2.4154917714619164e-05, "loss": 0.0055, "step": 761 }, { "epoch": 3.4652223489167615, "grad_norm": 2.4887306690216064, "learning_rate": 2.40290063033746e-05, "loss": 0.0068, "step": 762 }, { "epoch": 3.4697833523375143, "grad_norm": 1.1760387420654297, "learning_rate": 2.3903320035498605e-05, "loss": 0.0049, "step": 763 }, { "epoch": 3.4743443557582667, "grad_norm": 1.3489493131637573, "learning_rate": 2.3777860000569384e-05, "loss": 0.004, "step": 764 }, { "epoch": 3.4789053591790196, "grad_norm": 4.36347770690918, "learning_rate": 2.365262728620398e-05, "loss": 0.0042, "step": 765 }, { "epoch": 3.483466362599772, "grad_norm": 0.5808774828910828, "learning_rate": 2.352762297804879e-05, "loss": 0.0029, "step": 766 }, { "epoch": 3.4880273660205243, "grad_norm": 1.9665743112564087, "learning_rate": 2.340284815977007e-05, "loss": 0.0088, "step": 767 }, { "epoch": 3.492588369441277, "grad_norm": 5.437419891357422, "learning_rate": 2.327830391304475e-05, "loss": 0.0414, "step": 768 }, { "epoch": 3.4971493728620295, "grad_norm": 0.9246713519096375, "learning_rate": 2.315399131755081e-05, "loss": 0.0055, "step": 769 }, { "epoch": 3.5017103762827824, "grad_norm": 1.88973069190979, "learning_rate": 2.3029911450958113e-05, "loss": 0.007, "step": 770 }, { "epoch": 3.5062713797035348, "grad_norm": 5.172039985656738, "learning_rate": 2.2906065388918934e-05, "loss": 0.0104, "step": 771 }, { "epoch": 3.5108323831242876, "grad_norm": 2.3819446563720703, "learning_rate": 2.278245420505873e-05, "loss": 0.0125, "step": 772 }, { "epoch": 3.51539338654504, "grad_norm": 0.6151133179664612, "learning_rate": 2.2659078970966784e-05, "loss": 0.0036, "step": 773 }, { "epoch": 3.5199543899657924, "grad_norm": 1.6557809114456177, "learning_rate": 2.2535940756186897e-05, "loss": 0.0095, "step": 774 }, { "epoch": 3.524515393386545, "grad_norm": 1.1430513858795166, "learning_rate": 2.2413040628208165e-05, "loss": 0.0049, "step": 775 }, { "epoch": 3.5290763968072976, "grad_norm": 1.5872365236282349, "learning_rate": 2.22903796524557e-05, "loss": 0.0045, "step": 776 }, { "epoch": 3.5336374002280504, "grad_norm": 1.7567164897918701, "learning_rate": 2.2167958892281404e-05, "loss": 0.0072, "step": 777 }, { "epoch": 3.538198403648803, "grad_norm": 3.414562702178955, "learning_rate": 2.2045779408954738e-05, "loss": 0.015, "step": 778 }, { "epoch": 3.542759407069555, "grad_norm": 1.1695743799209595, "learning_rate": 2.192384226165349e-05, "loss": 0.0063, "step": 779 }, { "epoch": 3.547320410490308, "grad_norm": 1.2887126207351685, "learning_rate": 2.180214850745467e-05, "loss": 0.0042, "step": 780 }, { "epoch": 3.5518814139110604, "grad_norm": 6.278741359710693, "learning_rate": 2.1680699201325326e-05, "loss": 0.0096, "step": 781 }, { "epoch": 3.556442417331813, "grad_norm": 1.6705055236816406, "learning_rate": 2.1559495396113307e-05, "loss": 0.0064, "step": 782 }, { "epoch": 3.5610034207525656, "grad_norm": 1.6865471601486206, "learning_rate": 2.1438538142538273e-05, "loss": 0.0066, "step": 783 }, { "epoch": 3.565564424173318, "grad_norm": 1.117870569229126, "learning_rate": 2.131782848918245e-05, "loss": 0.0039, "step": 784 }, { "epoch": 3.570125427594071, "grad_norm": 0.6459174752235413, "learning_rate": 2.119736748248172e-05, "loss": 0.0031, "step": 785 }, { "epoch": 3.574686431014823, "grad_norm": 2.9996261596679688, "learning_rate": 2.1077156166716323e-05, "loss": 0.007, "step": 786 }, { "epoch": 3.579247434435576, "grad_norm": 2.0634710788726807, "learning_rate": 2.0957195584001986e-05, "loss": 0.0076, "step": 787 }, { "epoch": 3.5838084378563284, "grad_norm": 3.3125040531158447, "learning_rate": 2.083748677428083e-05, "loss": 0.0137, "step": 788 }, { "epoch": 3.588369441277081, "grad_norm": 4.13361120223999, "learning_rate": 2.0718030775312285e-05, "loss": 0.0193, "step": 789 }, { "epoch": 3.5929304446978336, "grad_norm": 1.7809717655181885, "learning_rate": 2.0598828622664213e-05, "loss": 0.006, "step": 790 }, { "epoch": 3.597491448118586, "grad_norm": 12.555645942687988, "learning_rate": 2.0479881349703883e-05, "loss": 0.0169, "step": 791 }, { "epoch": 3.602052451539339, "grad_norm": 2.071539878845215, "learning_rate": 2.0361189987588918e-05, "loss": 0.0072, "step": 792 }, { "epoch": 3.6066134549600912, "grad_norm": 1.3284027576446533, "learning_rate": 2.024275556525858e-05, "loss": 0.006, "step": 793 }, { "epoch": 3.6111744583808436, "grad_norm": 0.8479968905448914, "learning_rate": 2.012457910942458e-05, "loss": 0.0039, "step": 794 }, { "epoch": 3.6157354618015964, "grad_norm": 2.979787826538086, "learning_rate": 2.0006661644562375e-05, "loss": 0.0092, "step": 795 }, { "epoch": 3.620296465222349, "grad_norm": 0.8467869758605957, "learning_rate": 1.988900419290224e-05, "loss": 0.0033, "step": 796 }, { "epoch": 3.6248574686431017, "grad_norm": 0.7672457098960876, "learning_rate": 1.9771607774420307e-05, "loss": 0.0038, "step": 797 }, { "epoch": 3.629418472063854, "grad_norm": 1.3988791704177856, "learning_rate": 1.9654473406829903e-05, "loss": 0.0059, "step": 798 }, { "epoch": 3.6339794754846064, "grad_norm": 1.741504430770874, "learning_rate": 1.953760210557254e-05, "loss": 0.0096, "step": 799 }, { "epoch": 3.6385404789053593, "grad_norm": 4.00645637512207, "learning_rate": 1.942099488380923e-05, "loss": 0.0098, "step": 800 }, { "epoch": 3.6431014823261116, "grad_norm": 0.9082571864128113, "learning_rate": 1.9304652752411734e-05, "loss": 0.0038, "step": 801 }, { "epoch": 3.6476624857468645, "grad_norm": 0.9427306652069092, "learning_rate": 1.9188576719953633e-05, "loss": 0.0051, "step": 802 }, { "epoch": 3.652223489167617, "grad_norm": 0.5667453408241272, "learning_rate": 1.9072767792701768e-05, "loss": 0.0029, "step": 803 }, { "epoch": 3.6567844925883692, "grad_norm": 1.584375262260437, "learning_rate": 1.895722697460737e-05, "loss": 0.0042, "step": 804 }, { "epoch": 3.661345496009122, "grad_norm": 4.02076530456543, "learning_rate": 1.884195526729748e-05, "loss": 0.0092, "step": 805 }, { "epoch": 3.6659064994298745, "grad_norm": 2.488560438156128, "learning_rate": 1.8726953670066193e-05, "loss": 0.0041, "step": 806 }, { "epoch": 3.6704675028506273, "grad_norm": 1.546533226966858, "learning_rate": 1.861222317986598e-05, "loss": 0.0074, "step": 807 }, { "epoch": 3.6750285062713797, "grad_norm": 2.984379529953003, "learning_rate": 1.8497764791299117e-05, "loss": 0.0091, "step": 808 }, { "epoch": 3.679589509692132, "grad_norm": 1.3167816400527954, "learning_rate": 1.8383579496609004e-05, "loss": 0.0063, "step": 809 }, { "epoch": 3.684150513112885, "grad_norm": 2.830770492553711, "learning_rate": 1.8269668285671587e-05, "loss": 0.0059, "step": 810 }, { "epoch": 3.6887115165336373, "grad_norm": 1.9150404930114746, "learning_rate": 1.8156032145986784e-05, "loss": 0.003, "step": 811 }, { "epoch": 3.69327251995439, "grad_norm": 0.5943277478218079, "learning_rate": 1.8042672062669863e-05, "loss": 0.0028, "step": 812 }, { "epoch": 3.6978335233751425, "grad_norm": 0.7251271605491638, "learning_rate": 1.7929589018443016e-05, "loss": 0.004, "step": 813 }, { "epoch": 3.702394526795895, "grad_norm": 0.5687354207038879, "learning_rate": 1.7816783993626712e-05, "loss": 0.0032, "step": 814 }, { "epoch": 3.7069555302166477, "grad_norm": 0.7953961491584778, "learning_rate": 1.7704257966131304e-05, "loss": 0.0041, "step": 815 }, { "epoch": 3.7115165336374, "grad_norm": 1.1048918962478638, "learning_rate": 1.759201191144852e-05, "loss": 0.0035, "step": 816 }, { "epoch": 3.716077537058153, "grad_norm": 1.2720906734466553, "learning_rate": 1.7480046802642906e-05, "loss": 0.0043, "step": 817 }, { "epoch": 3.7206385404789053, "grad_norm": 4.515064239501953, "learning_rate": 1.7368363610343617e-05, "loss": 0.009, "step": 818 }, { "epoch": 3.7251995438996577, "grad_norm": 2.149350166320801, "learning_rate": 1.725696330273575e-05, "loss": 0.007, "step": 819 }, { "epoch": 3.7297605473204105, "grad_norm": 0.4950422942638397, "learning_rate": 1.714584684555211e-05, "loss": 0.0028, "step": 820 }, { "epoch": 3.734321550741163, "grad_norm": 1.4185466766357422, "learning_rate": 1.703501520206482e-05, "loss": 0.005, "step": 821 }, { "epoch": 3.7388825541619157, "grad_norm": 3.5513916015625, "learning_rate": 1.692446933307687e-05, "loss": 0.0053, "step": 822 }, { "epoch": 3.743443557582668, "grad_norm": 2.450361728668213, "learning_rate": 1.6814210196913927e-05, "loss": 0.0083, "step": 823 }, { "epoch": 3.7480045610034205, "grad_norm": 0.6304205656051636, "learning_rate": 1.6704238749415957e-05, "loss": 0.0042, "step": 824 }, { "epoch": 3.7525655644241733, "grad_norm": 2.1942145824432373, "learning_rate": 1.6594555943928887e-05, "loss": 0.0046, "step": 825 }, { "epoch": 3.757126567844926, "grad_norm": 0.7358046770095825, "learning_rate": 1.6485162731296495e-05, "loss": 0.0027, "step": 826 }, { "epoch": 3.7616875712656785, "grad_norm": 3.021874189376831, "learning_rate": 1.6376060059851963e-05, "loss": 0.0092, "step": 827 }, { "epoch": 3.766248574686431, "grad_norm": 0.618632435798645, "learning_rate": 1.6267248875409835e-05, "loss": 0.0033, "step": 828 }, { "epoch": 3.7708095781071833, "grad_norm": 1.160000205039978, "learning_rate": 1.6158730121257737e-05, "loss": 0.0047, "step": 829 }, { "epoch": 3.775370581527936, "grad_norm": 1.2812681198120117, "learning_rate": 1.6050504738148152e-05, "loss": 0.005, "step": 830 }, { "epoch": 3.779931584948689, "grad_norm": 1.2367734909057617, "learning_rate": 1.5942573664290412e-05, "loss": 0.0058, "step": 831 }, { "epoch": 3.7844925883694414, "grad_norm": 1.0198251008987427, "learning_rate": 1.5834937835342366e-05, "loss": 0.0039, "step": 832 }, { "epoch": 3.7890535917901937, "grad_norm": 0.9254816174507141, "learning_rate": 1.5727598184402464e-05, "loss": 0.003, "step": 833 }, { "epoch": 3.7936145952109466, "grad_norm": 1.487305998802185, "learning_rate": 1.562055564200154e-05, "loss": 0.0049, "step": 834 }, { "epoch": 3.798175598631699, "grad_norm": 0.8291229009628296, "learning_rate": 1.5513811136094787e-05, "loss": 0.0042, "step": 835 }, { "epoch": 3.802736602052452, "grad_norm": 0.4022439122200012, "learning_rate": 1.5407365592053735e-05, "loss": 0.0023, "step": 836 }, { "epoch": 3.807297605473204, "grad_norm": 1.640064001083374, "learning_rate": 1.5301219932658156e-05, "loss": 0.0057, "step": 837 }, { "epoch": 3.8118586088939566, "grad_norm": 1.2258837223052979, "learning_rate": 1.5195375078088147e-05, "loss": 0.0034, "step": 838 }, { "epoch": 3.8164196123147094, "grad_norm": 1.264310359954834, "learning_rate": 1.5089831945916133e-05, "loss": 0.0057, "step": 839 }, { "epoch": 3.8209806157354618, "grad_norm": 2.701324939727783, "learning_rate": 1.4984591451098845e-05, "loss": 0.0086, "step": 840 }, { "epoch": 3.8255416191562146, "grad_norm": 3.1908340454101562, "learning_rate": 1.4879654505969498e-05, "loss": 0.0085, "step": 841 }, { "epoch": 3.830102622576967, "grad_norm": 0.7210344672203064, "learning_rate": 1.4775022020229756e-05, "loss": 0.0028, "step": 842 }, { "epoch": 3.8346636259977194, "grad_norm": 2.616572380065918, "learning_rate": 1.4670694900942005e-05, "loss": 0.0069, "step": 843 }, { "epoch": 3.839224629418472, "grad_norm": 1.0061376094818115, "learning_rate": 1.4566674052521357e-05, "loss": 0.0035, "step": 844 }, { "epoch": 3.8437856328392246, "grad_norm": 0.5519426465034485, "learning_rate": 1.4462960376727813e-05, "loss": 0.0025, "step": 845 }, { "epoch": 3.8483466362599774, "grad_norm": 1.0319393873214722, "learning_rate": 1.4359554772658552e-05, "loss": 0.0037, "step": 846 }, { "epoch": 3.85290763968073, "grad_norm": 5.387468338012695, "learning_rate": 1.4256458136739998e-05, "loss": 0.0131, "step": 847 }, { "epoch": 3.857468643101482, "grad_norm": 1.1421221494674683, "learning_rate": 1.415367136272019e-05, "loss": 0.0038, "step": 848 }, { "epoch": 3.862029646522235, "grad_norm": 1.2745294570922852, "learning_rate": 1.4051195341660939e-05, "loss": 0.0063, "step": 849 }, { "epoch": 3.8665906499429874, "grad_norm": 3.796590805053711, "learning_rate": 1.3949030961930077e-05, "loss": 0.0102, "step": 850 }, { "epoch": 3.8711516533637402, "grad_norm": 1.4126759767532349, "learning_rate": 1.3847179109193925e-05, "loss": 0.0043, "step": 851 }, { "epoch": 3.8757126567844926, "grad_norm": 0.6939073801040649, "learning_rate": 1.374564066640937e-05, "loss": 0.0038, "step": 852 }, { "epoch": 3.880273660205245, "grad_norm": 0.8315445780754089, "learning_rate": 1.3644416513816416e-05, "loss": 0.0051, "step": 853 }, { "epoch": 3.884834663625998, "grad_norm": 0.7290100455284119, "learning_rate": 1.3543507528930472e-05, "loss": 0.0035, "step": 854 }, { "epoch": 3.88939566704675, "grad_norm": 3.279763698577881, "learning_rate": 1.3442914586534688e-05, "loss": 0.0112, "step": 855 }, { "epoch": 3.893956670467503, "grad_norm": 0.7284213900566101, "learning_rate": 1.3342638558672504e-05, "loss": 0.0032, "step": 856 }, { "epoch": 3.8985176738882554, "grad_norm": 1.461885690689087, "learning_rate": 1.3242680314639993e-05, "loss": 0.0046, "step": 857 }, { "epoch": 3.903078677309008, "grad_norm": 0.36676260828971863, "learning_rate": 1.31430407209783e-05, "loss": 0.0023, "step": 858 }, { "epoch": 3.9076396807297606, "grad_norm": 0.5633653402328491, "learning_rate": 1.3043720641466289e-05, "loss": 0.0025, "step": 859 }, { "epoch": 3.912200684150513, "grad_norm": 0.9612581133842468, "learning_rate": 1.2944720937112836e-05, "loss": 0.0023, "step": 860 }, { "epoch": 3.916761687571266, "grad_norm": 2.462862491607666, "learning_rate": 1.284604246614955e-05, "loss": 0.0048, "step": 861 }, { "epoch": 3.9213226909920182, "grad_norm": 0.6819082498550415, "learning_rate": 1.2747686084023192e-05, "loss": 0.003, "step": 862 }, { "epoch": 3.9258836944127706, "grad_norm": 0.4367881417274475, "learning_rate": 1.2649652643388382e-05, "loss": 0.0024, "step": 863 }, { "epoch": 3.9304446978335235, "grad_norm": 0.6573525667190552, "learning_rate": 1.2551942994100136e-05, "loss": 0.0029, "step": 864 }, { "epoch": 3.935005701254276, "grad_norm": 1.2858328819274902, "learning_rate": 1.2454557983206477e-05, "loss": 0.0039, "step": 865 }, { "epoch": 3.9395667046750287, "grad_norm": 0.46625810861587524, "learning_rate": 1.2357498454941175e-05, "loss": 0.0029, "step": 866 }, { "epoch": 3.944127708095781, "grad_norm": 0.39558151364326477, "learning_rate": 1.2260765250716356e-05, "loss": 0.0021, "step": 867 }, { "epoch": 3.9486887115165334, "grad_norm": 0.9131205677986145, "learning_rate": 1.2164359209115234e-05, "loss": 0.0032, "step": 868 }, { "epoch": 3.9532497149372863, "grad_norm": 0.8359415531158447, "learning_rate": 1.2068281165884864e-05, "loss": 0.0038, "step": 869 }, { "epoch": 3.9578107183580387, "grad_norm": 0.5636485815048218, "learning_rate": 1.1972531953928823e-05, "loss": 0.0025, "step": 870 }, { "epoch": 3.9623717217787915, "grad_norm": 1.0502080917358398, "learning_rate": 1.1877112403300079e-05, "loss": 0.0032, "step": 871 }, { "epoch": 3.966932725199544, "grad_norm": 1.4225945472717285, "learning_rate": 1.1782023341193754e-05, "loss": 0.005, "step": 872 }, { "epoch": 3.9714937286202963, "grad_norm": 0.8865280747413635, "learning_rate": 1.1687265591939927e-05, "loss": 0.0036, "step": 873 }, { "epoch": 3.976054732041049, "grad_norm": 0.7002689242362976, "learning_rate": 1.1592839976996555e-05, "loss": 0.0034, "step": 874 }, { "epoch": 3.9806157354618015, "grad_norm": 1.3850862979888916, "learning_rate": 1.1498747314942255e-05, "loss": 0.0052, "step": 875 }, { "epoch": 3.9851767388825543, "grad_norm": 1.3568379878997803, "learning_rate": 1.1404988421469348e-05, "loss": 0.0037, "step": 876 }, { "epoch": 3.9897377423033067, "grad_norm": 0.815382719039917, "learning_rate": 1.1311564109376621e-05, "loss": 0.0036, "step": 877 }, { "epoch": 3.994298745724059, "grad_norm": 2.1018550395965576, "learning_rate": 1.121847518856241e-05, "loss": 0.0048, "step": 878 }, { "epoch": 3.998859749144812, "grad_norm": 0.6212396025657654, "learning_rate": 1.1125722466017547e-05, "loss": 0.0025, "step": 879 }, { "epoch": 4.0, "grad_norm": 0.9469783306121826, "learning_rate": 1.1033306745818283e-05, "loss": 0.0028, "step": 880 }, { "epoch": 4.004561003420752, "grad_norm": 0.6264204382896423, "learning_rate": 1.0941228829119453e-05, "loss": 0.0033, "step": 881 }, { "epoch": 4.009122006841505, "grad_norm": 0.27060914039611816, "learning_rate": 1.0849489514147459e-05, "loss": 0.0016, "step": 882 }, { "epoch": 4.013683010262258, "grad_norm": 0.47667694091796875, "learning_rate": 1.0758089596193282e-05, "loss": 0.0028, "step": 883 }, { "epoch": 4.01824401368301, "grad_norm": 0.9456324577331543, "learning_rate": 1.066702986760577e-05, "loss": 0.0029, "step": 884 }, { "epoch": 4.022805017103763, "grad_norm": 1.1689949035644531, "learning_rate": 1.057631111778456e-05, "loss": 0.0034, "step": 885 }, { "epoch": 4.027366020524515, "grad_norm": 0.25507575273513794, "learning_rate": 1.0485934133173387e-05, "loss": 0.002, "step": 886 }, { "epoch": 4.031927023945268, "grad_norm": 0.37119409441947937, "learning_rate": 1.0395899697253208e-05, "loss": 0.0022, "step": 887 }, { "epoch": 4.036488027366021, "grad_norm": 0.3143307864665985, "learning_rate": 1.0306208590535382e-05, "loss": 0.0021, "step": 888 }, { "epoch": 4.041049030786773, "grad_norm": 0.3163100481033325, "learning_rate": 1.0216861590554983e-05, "loss": 0.0021, "step": 889 }, { "epoch": 4.045610034207526, "grad_norm": 0.4894810914993286, "learning_rate": 1.012785947186397e-05, "loss": 0.0024, "step": 890 }, { "epoch": 4.050171037628278, "grad_norm": 0.3152639865875244, "learning_rate": 1.0039203006024527e-05, "loss": 0.0025, "step": 891 }, { "epoch": 4.05473204104903, "grad_norm": 1.619958519935608, "learning_rate": 9.95089296160241e-06, "loss": 0.0054, "step": 892 }, { "epoch": 4.059293044469784, "grad_norm": 0.5192055106163025, "learning_rate": 9.862930104160162e-06, "loss": 0.0027, "step": 893 }, { "epoch": 4.063854047890536, "grad_norm": 2.080965518951416, "learning_rate": 9.775315196250612e-06, "loss": 0.0042, "step": 894 }, { "epoch": 4.068415051311288, "grad_norm": 0.3620293140411377, "learning_rate": 9.688048997410143e-06, "loss": 0.0022, "step": 895 }, { "epoch": 4.072976054732041, "grad_norm": 0.37922972440719604, "learning_rate": 9.601132264152223e-06, "loss": 0.002, "step": 896 }, { "epoch": 4.077537058152793, "grad_norm": 0.3833453059196472, "learning_rate": 9.51456574996078e-06, "loss": 0.0022, "step": 897 }, { "epoch": 4.0820980615735465, "grad_norm": 0.41054201126098633, "learning_rate": 9.428350205283648e-06, "loss": 0.0024, "step": 898 }, { "epoch": 4.086659064994299, "grad_norm": 0.3207608461380005, "learning_rate": 9.342486377526133e-06, "loss": 0.002, "step": 899 }, { "epoch": 4.091220068415051, "grad_norm": 0.4284456968307495, "learning_rate": 9.256975011044483e-06, "loss": 0.0022, "step": 900 } ], "logging_steps": 1.0, "max_steps": 1100, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.659402853631918e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }