diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,43421 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.999838657631494, + "eval_steps": 500, + "global_step": 6197, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00016134236850596966, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 11.8734, + "step": 1 + }, + { + "epoch": 0.00032268473701193933, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 12.0235, + "step": 2 + }, + { + "epoch": 0.000484027105517909, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 12.5059, + "step": 3 + }, + { + "epoch": 0.0006453694740238787, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 12.353, + "step": 4 + }, + { + "epoch": 0.0008067118425298483, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 11.9095, + "step": 5 + }, + { + "epoch": 0.000968054211035818, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 11.7864, + "step": 6 + }, + { + "epoch": 0.0011293965795417876, + "grad_norm": 121.9916763305664, + "learning_rate": 5.376344086021506e-07, + "loss": 12.4286, + "step": 7 + }, + { + "epoch": 0.0012907389480477573, + "grad_norm": 119.27645874023438, + "learning_rate": 1.0752688172043011e-06, + "loss": 12.1422, + "step": 8 + }, + { + "epoch": 0.001452081316553727, + "grad_norm": 118.97579193115234, + "learning_rate": 1.6129032258064516e-06, + "loss": 12.24, + "step": 9 + }, + { + "epoch": 0.0016134236850596966, + "grad_norm": 118.97579193115234, + "learning_rate": 1.6129032258064516e-06, + "loss": 12.033, + "step": 10 + }, + { + "epoch": 0.0017747660535656663, + "grad_norm": 113.35755157470703, + "learning_rate": 2.1505376344086023e-06, + "loss": 11.746, + "step": 11 + }, + { + "epoch": 0.001936108422071636, + "grad_norm": 118.38843536376953, + "learning_rate": 2.688172043010753e-06, + "loss": 11.9291, + "step": 12 + }, + { + "epoch": 0.002097450790577606, + "grad_norm": 116.94325256347656, + "learning_rate": 3.225806451612903e-06, + "loss": 11.7795, + "step": 13 + }, + { + "epoch": 0.002258793159083575, + "grad_norm": 114.86215209960938, + "learning_rate": 3.763440860215054e-06, + "loss": 11.5286, + "step": 14 + }, + { + "epoch": 0.002420135527589545, + "grad_norm": 112.50050354003906, + "learning_rate": 4.3010752688172045e-06, + "loss": 11.4632, + "step": 15 + }, + { + "epoch": 0.0025814778960955146, + "grad_norm": 112.45469665527344, + "learning_rate": 4.838709677419355e-06, + "loss": 11.5083, + "step": 16 + }, + { + "epoch": 0.0027428202646014844, + "grad_norm": 102.28872680664062, + "learning_rate": 5.376344086021506e-06, + "loss": 10.6307, + "step": 17 + }, + { + "epoch": 0.002904162633107454, + "grad_norm": 102.35145568847656, + "learning_rate": 5.9139784946236566e-06, + "loss": 10.6356, + "step": 18 + }, + { + "epoch": 0.003065505001613424, + "grad_norm": 99.08859252929688, + "learning_rate": 6.451612903225806e-06, + "loss": 10.1105, + "step": 19 + }, + { + "epoch": 0.003226847370119393, + "grad_norm": 99.42597198486328, + "learning_rate": 6.989247311827957e-06, + "loss": 10.0749, + "step": 20 + }, + { + "epoch": 0.003388189738625363, + "grad_norm": 96.81177520751953, + "learning_rate": 7.526881720430108e-06, + "loss": 10.0479, + "step": 21 + }, + { + "epoch": 0.0035495321071313327, + "grad_norm": 87.56671142578125, + "learning_rate": 8.064516129032258e-06, + "loss": 9.3494, + "step": 22 + }, + { + "epoch": 0.0037108744756373024, + "grad_norm": 89.21701049804688, + "learning_rate": 8.602150537634409e-06, + "loss": 9.0056, + "step": 23 + }, + { + "epoch": 0.003872216844143272, + "grad_norm": 84.08562469482422, + "learning_rate": 9.13978494623656e-06, + "loss": 8.6993, + "step": 24 + }, + { + "epoch": 0.004033559212649242, + "grad_norm": 83.88109588623047, + "learning_rate": 9.67741935483871e-06, + "loss": 8.5553, + "step": 25 + }, + { + "epoch": 0.004194901581155212, + "grad_norm": 78.00149536132812, + "learning_rate": 1.0215053763440861e-05, + "loss": 8.0172, + "step": 26 + }, + { + "epoch": 0.004356243949661181, + "grad_norm": 76.08988189697266, + "learning_rate": 1.0752688172043012e-05, + "loss": 7.7614, + "step": 27 + }, + { + "epoch": 0.00451758631816715, + "grad_norm": 73.39444732666016, + "learning_rate": 1.129032258064516e-05, + "loss": 7.2832, + "step": 28 + }, + { + "epoch": 0.00467892868667312, + "grad_norm": 69.82600402832031, + "learning_rate": 1.1827956989247313e-05, + "loss": 7.169, + "step": 29 + }, + { + "epoch": 0.00484027105517909, + "grad_norm": 68.02349853515625, + "learning_rate": 1.2365591397849464e-05, + "loss": 6.8744, + "step": 30 + }, + { + "epoch": 0.0050016134236850595, + "grad_norm": 68.92740631103516, + "learning_rate": 1.2903225806451613e-05, + "loss": 6.814, + "step": 31 + }, + { + "epoch": 0.005162955792191029, + "grad_norm": 62.0346794128418, + "learning_rate": 1.3440860215053763e-05, + "loss": 6.4574, + "step": 32 + }, + { + "epoch": 0.005324298160696999, + "grad_norm": 61.29989242553711, + "learning_rate": 1.3978494623655914e-05, + "loss": 6.2698, + "step": 33 + }, + { + "epoch": 0.005485640529202969, + "grad_norm": 56.97321701049805, + "learning_rate": 1.4516129032258066e-05, + "loss": 6.0728, + "step": 34 + }, + { + "epoch": 0.0056469828977089385, + "grad_norm": 54.352840423583984, + "learning_rate": 1.5053763440860215e-05, + "loss": 5.9087, + "step": 35 + }, + { + "epoch": 0.005808325266214908, + "grad_norm": 49.80109786987305, + "learning_rate": 1.5591397849462366e-05, + "loss": 5.4298, + "step": 36 + }, + { + "epoch": 0.005969667634720878, + "grad_norm": 48.01213455200195, + "learning_rate": 1.6129032258064517e-05, + "loss": 5.3382, + "step": 37 + }, + { + "epoch": 0.006131010003226848, + "grad_norm": 48.508453369140625, + "learning_rate": 1.6666666666666667e-05, + "loss": 5.5643, + "step": 38 + }, + { + "epoch": 0.0062923523717328175, + "grad_norm": 46.24903106689453, + "learning_rate": 1.7204301075268818e-05, + "loss": 4.8385, + "step": 39 + }, + { + "epoch": 0.006453694740238786, + "grad_norm": 43.764591217041016, + "learning_rate": 1.774193548387097e-05, + "loss": 4.8629, + "step": 40 + }, + { + "epoch": 0.006615037108744756, + "grad_norm": 41.214168548583984, + "learning_rate": 1.827956989247312e-05, + "loss": 4.8244, + "step": 41 + }, + { + "epoch": 0.006776379477250726, + "grad_norm": 41.70524215698242, + "learning_rate": 1.881720430107527e-05, + "loss": 4.714, + "step": 42 + }, + { + "epoch": 0.006937721845756696, + "grad_norm": 41.5999755859375, + "learning_rate": 1.935483870967742e-05, + "loss": 4.4334, + "step": 43 + }, + { + "epoch": 0.007099064214262665, + "grad_norm": 38.3800048828125, + "learning_rate": 1.989247311827957e-05, + "loss": 4.2036, + "step": 44 + }, + { + "epoch": 0.007260406582768635, + "grad_norm": 34.799400329589844, + "learning_rate": 2.0430107526881722e-05, + "loss": 4.1716, + "step": 45 + }, + { + "epoch": 0.007421748951274605, + "grad_norm": 32.226043701171875, + "learning_rate": 2.0967741935483873e-05, + "loss": 3.908, + "step": 46 + }, + { + "epoch": 0.0075830913197805746, + "grad_norm": 31.816619873046875, + "learning_rate": 2.1505376344086024e-05, + "loss": 4.101, + "step": 47 + }, + { + "epoch": 0.007744433688286544, + "grad_norm": 26.555458068847656, + "learning_rate": 2.2043010752688174e-05, + "loss": 3.8238, + "step": 48 + }, + { + "epoch": 0.007905776056792513, + "grad_norm": 26.600265502929688, + "learning_rate": 2.258064516129032e-05, + "loss": 3.7415, + "step": 49 + }, + { + "epoch": 0.008067118425298484, + "grad_norm": 25.45236587524414, + "learning_rate": 2.3118279569892472e-05, + "loss": 3.4973, + "step": 50 + }, + { + "epoch": 0.008228460793804453, + "grad_norm": 26.451797485351562, + "learning_rate": 2.3655913978494626e-05, + "loss": 3.7219, + "step": 51 + }, + { + "epoch": 0.008389803162310423, + "grad_norm": 27.508174896240234, + "learning_rate": 2.4193548387096777e-05, + "loss": 3.522, + "step": 52 + }, + { + "epoch": 0.008551145530816392, + "grad_norm": 26.211048126220703, + "learning_rate": 2.4731182795698928e-05, + "loss": 3.5366, + "step": 53 + }, + { + "epoch": 0.008712487899322363, + "grad_norm": 27.209714889526367, + "learning_rate": 2.5268817204301075e-05, + "loss": 3.2962, + "step": 54 + }, + { + "epoch": 0.008873830267828332, + "grad_norm": 26.76357078552246, + "learning_rate": 2.5806451612903226e-05, + "loss": 3.7067, + "step": 55 + }, + { + "epoch": 0.0090351726363343, + "grad_norm": 25.661449432373047, + "learning_rate": 2.6344086021505376e-05, + "loss": 3.5572, + "step": 56 + }, + { + "epoch": 0.009196515004840271, + "grad_norm": 25.68983268737793, + "learning_rate": 2.6881720430107527e-05, + "loss": 3.5089, + "step": 57 + }, + { + "epoch": 0.00935785737334624, + "grad_norm": 25.6807918548584, + "learning_rate": 2.7419354838709678e-05, + "loss": 3.4321, + "step": 58 + }, + { + "epoch": 0.00951919974185221, + "grad_norm": 24.658496856689453, + "learning_rate": 2.7956989247311828e-05, + "loss": 3.218, + "step": 59 + }, + { + "epoch": 0.00968054211035818, + "grad_norm": 24.265926361083984, + "learning_rate": 2.8494623655913982e-05, + "loss": 3.1398, + "step": 60 + }, + { + "epoch": 0.00984188447886415, + "grad_norm": 23.916343688964844, + "learning_rate": 2.9032258064516133e-05, + "loss": 3.4459, + "step": 61 + }, + { + "epoch": 0.010003226847370119, + "grad_norm": 22.103166580200195, + "learning_rate": 2.9569892473118284e-05, + "loss": 3.1716, + "step": 62 + }, + { + "epoch": 0.01016456921587609, + "grad_norm": 20.560914993286133, + "learning_rate": 3.010752688172043e-05, + "loss": 3.0084, + "step": 63 + }, + { + "epoch": 0.010325911584382058, + "grad_norm": 20.613006591796875, + "learning_rate": 3.0645161290322585e-05, + "loss": 3.0275, + "step": 64 + }, + { + "epoch": 0.010487253952888029, + "grad_norm": 17.877843856811523, + "learning_rate": 3.118279569892473e-05, + "loss": 2.908, + "step": 65 + }, + { + "epoch": 0.010648596321393998, + "grad_norm": 16.064481735229492, + "learning_rate": 3.172043010752688e-05, + "loss": 3.1454, + "step": 66 + }, + { + "epoch": 0.010809938689899969, + "grad_norm": 13.223140716552734, + "learning_rate": 3.2258064516129034e-05, + "loss": 2.6419, + "step": 67 + }, + { + "epoch": 0.010971281058405937, + "grad_norm": 10.892476081848145, + "learning_rate": 3.279569892473118e-05, + "loss": 2.614, + "step": 68 + }, + { + "epoch": 0.011132623426911906, + "grad_norm": 10.642965316772461, + "learning_rate": 3.3333333333333335e-05, + "loss": 2.8108, + "step": 69 + }, + { + "epoch": 0.011293965795417877, + "grad_norm": 7.963952541351318, + "learning_rate": 3.387096774193548e-05, + "loss": 2.832, + "step": 70 + }, + { + "epoch": 0.011455308163923846, + "grad_norm": 9.012112617492676, + "learning_rate": 3.4408602150537636e-05, + "loss": 2.9791, + "step": 71 + }, + { + "epoch": 0.011616650532429816, + "grad_norm": 10.27229118347168, + "learning_rate": 3.494623655913979e-05, + "loss": 2.8891, + "step": 72 + }, + { + "epoch": 0.011777992900935785, + "grad_norm": 7.0006279945373535, + "learning_rate": 3.548387096774194e-05, + "loss": 2.733, + "step": 73 + }, + { + "epoch": 0.011939335269441756, + "grad_norm": 9.03067398071289, + "learning_rate": 3.602150537634409e-05, + "loss": 2.6866, + "step": 74 + }, + { + "epoch": 0.012100677637947725, + "grad_norm": 8.046092987060547, + "learning_rate": 3.655913978494624e-05, + "loss": 2.7973, + "step": 75 + }, + { + "epoch": 0.012262020006453695, + "grad_norm": 8.398710250854492, + "learning_rate": 3.7096774193548386e-05, + "loss": 2.7915, + "step": 76 + }, + { + "epoch": 0.012423362374959664, + "grad_norm": 6.430469989776611, + "learning_rate": 3.763440860215054e-05, + "loss": 2.6369, + "step": 77 + }, + { + "epoch": 0.012584704743465635, + "grad_norm": 6.532980918884277, + "learning_rate": 3.817204301075269e-05, + "loss": 2.6752, + "step": 78 + }, + { + "epoch": 0.012746047111971604, + "grad_norm": 7.386938095092773, + "learning_rate": 3.870967741935484e-05, + "loss": 2.6912, + "step": 79 + }, + { + "epoch": 0.012907389480477573, + "grad_norm": 7.101880073547363, + "learning_rate": 3.924731182795699e-05, + "loss": 2.7584, + "step": 80 + }, + { + "epoch": 0.013068731848983543, + "grad_norm": 8.289556503295898, + "learning_rate": 3.978494623655914e-05, + "loss": 2.6468, + "step": 81 + }, + { + "epoch": 0.013230074217489512, + "grad_norm": 10.011824607849121, + "learning_rate": 4.032258064516129e-05, + "loss": 2.8707, + "step": 82 + }, + { + "epoch": 0.013391416585995483, + "grad_norm": 9.494156837463379, + "learning_rate": 4.0860215053763444e-05, + "loss": 2.7115, + "step": 83 + }, + { + "epoch": 0.013552758954501452, + "grad_norm": 8.189593315124512, + "learning_rate": 4.13978494623656e-05, + "loss": 2.833, + "step": 84 + }, + { + "epoch": 0.013714101323007422, + "grad_norm": 6.510471820831299, + "learning_rate": 4.1935483870967746e-05, + "loss": 2.7779, + "step": 85 + }, + { + "epoch": 0.013875443691513391, + "grad_norm": 7.422402858734131, + "learning_rate": 4.247311827956989e-05, + "loss": 2.7875, + "step": 86 + }, + { + "epoch": 0.014036786060019362, + "grad_norm": 15.020564079284668, + "learning_rate": 4.301075268817205e-05, + "loss": 2.7667, + "step": 87 + }, + { + "epoch": 0.01419812842852533, + "grad_norm": 8.193143844604492, + "learning_rate": 4.3548387096774194e-05, + "loss": 2.9937, + "step": 88 + }, + { + "epoch": 0.014359470797031301, + "grad_norm": 9.4781494140625, + "learning_rate": 4.408602150537635e-05, + "loss": 2.6305, + "step": 89 + }, + { + "epoch": 0.01452081316553727, + "grad_norm": 10.007437705993652, + "learning_rate": 4.4623655913978496e-05, + "loss": 2.6819, + "step": 90 + }, + { + "epoch": 0.014682155534043239, + "grad_norm": 8.336685180664062, + "learning_rate": 4.516129032258064e-05, + "loss": 2.6643, + "step": 91 + }, + { + "epoch": 0.01484349790254921, + "grad_norm": 8.228814125061035, + "learning_rate": 4.56989247311828e-05, + "loss": 2.6335, + "step": 92 + }, + { + "epoch": 0.015004840271055178, + "grad_norm": 7.648069858551025, + "learning_rate": 4.6236559139784944e-05, + "loss": 2.8316, + "step": 93 + }, + { + "epoch": 0.015166182639561149, + "grad_norm": 8.510735511779785, + "learning_rate": 4.67741935483871e-05, + "loss": 2.894, + "step": 94 + }, + { + "epoch": 0.015327525008067118, + "grad_norm": 5.819738864898682, + "learning_rate": 4.731182795698925e-05, + "loss": 2.7905, + "step": 95 + }, + { + "epoch": 0.015488867376573089, + "grad_norm": 6.688803195953369, + "learning_rate": 4.78494623655914e-05, + "loss": 2.7318, + "step": 96 + }, + { + "epoch": 0.01565020974507906, + "grad_norm": 8.233120918273926, + "learning_rate": 4.8387096774193554e-05, + "loss": 2.5895, + "step": 97 + }, + { + "epoch": 0.015811552113585026, + "grad_norm": 6.3282670974731445, + "learning_rate": 4.89247311827957e-05, + "loss": 2.7843, + "step": 98 + }, + { + "epoch": 0.015972894482090997, + "grad_norm": 6.703812599182129, + "learning_rate": 4.9462365591397855e-05, + "loss": 2.9789, + "step": 99 + }, + { + "epoch": 0.016134236850596968, + "grad_norm": 8.786787033081055, + "learning_rate": 5e-05, + "loss": 2.4158, + "step": 100 + }, + { + "epoch": 0.016295579219102935, + "grad_norm": 7.690980434417725, + "learning_rate": 5.053763440860215e-05, + "loss": 2.7758, + "step": 101 + }, + { + "epoch": 0.016456921587608905, + "grad_norm": 7.197218418121338, + "learning_rate": 5.1075268817204304e-05, + "loss": 2.5494, + "step": 102 + }, + { + "epoch": 0.016618263956114876, + "grad_norm": 5.804955005645752, + "learning_rate": 5.161290322580645e-05, + "loss": 2.7368, + "step": 103 + }, + { + "epoch": 0.016779606324620847, + "grad_norm": 8.487010955810547, + "learning_rate": 5.2150537634408605e-05, + "loss": 2.6821, + "step": 104 + }, + { + "epoch": 0.016940948693126814, + "grad_norm": 9.467545509338379, + "learning_rate": 5.268817204301075e-05, + "loss": 2.5257, + "step": 105 + }, + { + "epoch": 0.017102291061632784, + "grad_norm": 8.017814636230469, + "learning_rate": 5.32258064516129e-05, + "loss": 2.8659, + "step": 106 + }, + { + "epoch": 0.017263633430138755, + "grad_norm": 8.576655387878418, + "learning_rate": 5.3763440860215054e-05, + "loss": 2.8219, + "step": 107 + }, + { + "epoch": 0.017424975798644726, + "grad_norm": 6.280098915100098, + "learning_rate": 5.43010752688172e-05, + "loss": 2.6667, + "step": 108 + }, + { + "epoch": 0.017586318167150693, + "grad_norm": 8.505315780639648, + "learning_rate": 5.4838709677419355e-05, + "loss": 2.8887, + "step": 109 + }, + { + "epoch": 0.017747660535656663, + "grad_norm": 8.483502388000488, + "learning_rate": 5.53763440860215e-05, + "loss": 2.7645, + "step": 110 + }, + { + "epoch": 0.017909002904162634, + "grad_norm": 7.418360233306885, + "learning_rate": 5.5913978494623656e-05, + "loss": 2.6832, + "step": 111 + }, + { + "epoch": 0.0180703452726686, + "grad_norm": 9.711334228515625, + "learning_rate": 5.645161290322582e-05, + "loss": 2.4625, + "step": 112 + }, + { + "epoch": 0.01823168764117457, + "grad_norm": 7.5127034187316895, + "learning_rate": 5.6989247311827965e-05, + "loss": 2.7079, + "step": 113 + }, + { + "epoch": 0.018393030009680542, + "grad_norm": 8.565291404724121, + "learning_rate": 5.752688172043011e-05, + "loss": 2.6694, + "step": 114 + }, + { + "epoch": 0.018554372378186513, + "grad_norm": 6.193171501159668, + "learning_rate": 5.8064516129032266e-05, + "loss": 2.7512, + "step": 115 + }, + { + "epoch": 0.01871571474669248, + "grad_norm": 10.021088600158691, + "learning_rate": 5.860215053763441e-05, + "loss": 2.8323, + "step": 116 + }, + { + "epoch": 0.01887705711519845, + "grad_norm": 9.823052406311035, + "learning_rate": 5.913978494623657e-05, + "loss": 2.5964, + "step": 117 + }, + { + "epoch": 0.01903839948370442, + "grad_norm": 8.563424110412598, + "learning_rate": 5.9677419354838715e-05, + "loss": 2.5162, + "step": 118 + }, + { + "epoch": 0.019199741852210392, + "grad_norm": 7.295447826385498, + "learning_rate": 6.021505376344086e-05, + "loss": 2.6978, + "step": 119 + }, + { + "epoch": 0.01936108422071636, + "grad_norm": 6.0475993156433105, + "learning_rate": 6.0752688172043016e-05, + "loss": 2.6616, + "step": 120 + }, + { + "epoch": 0.01952242658922233, + "grad_norm": 7.451548099517822, + "learning_rate": 6.129032258064517e-05, + "loss": 2.6365, + "step": 121 + }, + { + "epoch": 0.0196837689577283, + "grad_norm": 8.676538467407227, + "learning_rate": 6.182795698924732e-05, + "loss": 2.6597, + "step": 122 + }, + { + "epoch": 0.01984511132623427, + "grad_norm": 7.062118053436279, + "learning_rate": 6.236559139784946e-05, + "loss": 2.6511, + "step": 123 + }, + { + "epoch": 0.020006453694740238, + "grad_norm": 7.9792866706848145, + "learning_rate": 6.290322580645161e-05, + "loss": 2.4457, + "step": 124 + }, + { + "epoch": 0.02016779606324621, + "grad_norm": 8.919709205627441, + "learning_rate": 6.344086021505376e-05, + "loss": 2.6601, + "step": 125 + }, + { + "epoch": 0.02032913843175218, + "grad_norm": 7.2807183265686035, + "learning_rate": 6.397849462365592e-05, + "loss": 2.6229, + "step": 126 + }, + { + "epoch": 0.020490480800258146, + "grad_norm": 7.657270431518555, + "learning_rate": 6.451612903225807e-05, + "loss": 2.6337, + "step": 127 + }, + { + "epoch": 0.020651823168764117, + "grad_norm": 9.673775672912598, + "learning_rate": 6.505376344086021e-05, + "loss": 2.6504, + "step": 128 + }, + { + "epoch": 0.020813165537270088, + "grad_norm": 6.2916059494018555, + "learning_rate": 6.559139784946236e-05, + "loss": 2.5548, + "step": 129 + }, + { + "epoch": 0.020974507905776058, + "grad_norm": 6.704552173614502, + "learning_rate": 6.612903225806452e-05, + "loss": 2.8779, + "step": 130 + }, + { + "epoch": 0.021135850274282025, + "grad_norm": 7.804670333862305, + "learning_rate": 6.666666666666667e-05, + "loss": 2.7628, + "step": 131 + }, + { + "epoch": 0.021297192642787996, + "grad_norm": 7.032751083374023, + "learning_rate": 6.720430107526882e-05, + "loss": 2.5618, + "step": 132 + }, + { + "epoch": 0.021458535011293967, + "grad_norm": 6.686706066131592, + "learning_rate": 6.774193548387096e-05, + "loss": 2.4971, + "step": 133 + }, + { + "epoch": 0.021619877379799937, + "grad_norm": 7.140244960784912, + "learning_rate": 6.827956989247311e-05, + "loss": 2.7566, + "step": 134 + }, + { + "epoch": 0.021781219748305904, + "grad_norm": 6.7180023193359375, + "learning_rate": 6.881720430107527e-05, + "loss": 2.6035, + "step": 135 + }, + { + "epoch": 0.021942562116811875, + "grad_norm": 7.563843727111816, + "learning_rate": 6.935483870967743e-05, + "loss": 2.8565, + "step": 136 + }, + { + "epoch": 0.022103904485317846, + "grad_norm": 5.411715984344482, + "learning_rate": 6.989247311827958e-05, + "loss": 2.6123, + "step": 137 + }, + { + "epoch": 0.022265246853823813, + "grad_norm": 7.0890278816223145, + "learning_rate": 7.043010752688173e-05, + "loss": 2.6014, + "step": 138 + }, + { + "epoch": 0.022426589222329783, + "grad_norm": 8.12354850769043, + "learning_rate": 7.096774193548388e-05, + "loss": 2.5794, + "step": 139 + }, + { + "epoch": 0.022587931590835754, + "grad_norm": 7.8680100440979, + "learning_rate": 7.150537634408602e-05, + "loss": 3.1132, + "step": 140 + }, + { + "epoch": 0.022749273959341725, + "grad_norm": 7.589081287384033, + "learning_rate": 7.204301075268818e-05, + "loss": 2.6502, + "step": 141 + }, + { + "epoch": 0.02291061632784769, + "grad_norm": 7.087393283843994, + "learning_rate": 7.258064516129033e-05, + "loss": 2.4192, + "step": 142 + }, + { + "epoch": 0.023071958696353662, + "grad_norm": 8.530641555786133, + "learning_rate": 7.311827956989248e-05, + "loss": 2.7432, + "step": 143 + }, + { + "epoch": 0.023233301064859633, + "grad_norm": 10.033088684082031, + "learning_rate": 7.365591397849463e-05, + "loss": 2.3456, + "step": 144 + }, + { + "epoch": 0.023394643433365604, + "grad_norm": 7.166675567626953, + "learning_rate": 7.419354838709677e-05, + "loss": 2.4632, + "step": 145 + }, + { + "epoch": 0.02355598580187157, + "grad_norm": 5.421966552734375, + "learning_rate": 7.473118279569893e-05, + "loss": 2.5053, + "step": 146 + }, + { + "epoch": 0.02371732817037754, + "grad_norm": 6.469846725463867, + "learning_rate": 7.526881720430108e-05, + "loss": 2.6275, + "step": 147 + }, + { + "epoch": 0.023878670538883512, + "grad_norm": 8.492588996887207, + "learning_rate": 7.580645161290323e-05, + "loss": 2.4239, + "step": 148 + }, + { + "epoch": 0.02404001290738948, + "grad_norm": 5.68565034866333, + "learning_rate": 7.634408602150538e-05, + "loss": 2.5063, + "step": 149 + }, + { + "epoch": 0.02420135527589545, + "grad_norm": 6.721501350402832, + "learning_rate": 7.688172043010752e-05, + "loss": 2.6168, + "step": 150 + }, + { + "epoch": 0.02436269764440142, + "grad_norm": 6.561045169830322, + "learning_rate": 7.741935483870968e-05, + "loss": 2.5427, + "step": 151 + }, + { + "epoch": 0.02452404001290739, + "grad_norm": 6.798069000244141, + "learning_rate": 7.795698924731183e-05, + "loss": 2.5517, + "step": 152 + }, + { + "epoch": 0.024685382381413358, + "grad_norm": 7.483495712280273, + "learning_rate": 7.849462365591398e-05, + "loss": 2.61, + "step": 153 + }, + { + "epoch": 0.02484672474991933, + "grad_norm": 6.829707622528076, + "learning_rate": 7.903225806451613e-05, + "loss": 2.6721, + "step": 154 + }, + { + "epoch": 0.0250080671184253, + "grad_norm": 6.352954387664795, + "learning_rate": 7.956989247311829e-05, + "loss": 2.6246, + "step": 155 + }, + { + "epoch": 0.02516940948693127, + "grad_norm": 7.2489213943481445, + "learning_rate": 8.010752688172043e-05, + "loss": 2.6004, + "step": 156 + }, + { + "epoch": 0.025330751855437237, + "grad_norm": 6.657808780670166, + "learning_rate": 8.064516129032258e-05, + "loss": 2.6013, + "step": 157 + }, + { + "epoch": 0.025492094223943208, + "grad_norm": 9.814123153686523, + "learning_rate": 8.118279569892473e-05, + "loss": 2.5923, + "step": 158 + }, + { + "epoch": 0.025653436592449178, + "grad_norm": 7.6293158531188965, + "learning_rate": 8.172043010752689e-05, + "loss": 2.5834, + "step": 159 + }, + { + "epoch": 0.025814778960955145, + "grad_norm": 6.346309661865234, + "learning_rate": 8.225806451612904e-05, + "loss": 2.7438, + "step": 160 + }, + { + "epoch": 0.025976121329461116, + "grad_norm": 6.685229301452637, + "learning_rate": 8.27956989247312e-05, + "loss": 2.6558, + "step": 161 + }, + { + "epoch": 0.026137463697967087, + "grad_norm": 5.442112922668457, + "learning_rate": 8.333333333333334e-05, + "loss": 2.6434, + "step": 162 + }, + { + "epoch": 0.026298806066473057, + "grad_norm": 7.593375205993652, + "learning_rate": 8.387096774193549e-05, + "loss": 2.6782, + "step": 163 + }, + { + "epoch": 0.026460148434979024, + "grad_norm": 7.5932393074035645, + "learning_rate": 8.440860215053764e-05, + "loss": 2.4522, + "step": 164 + }, + { + "epoch": 0.026621490803484995, + "grad_norm": 6.538851737976074, + "learning_rate": 8.494623655913979e-05, + "loss": 2.4436, + "step": 165 + }, + { + "epoch": 0.026782833171990966, + "grad_norm": 7.262729644775391, + "learning_rate": 8.548387096774195e-05, + "loss": 2.6256, + "step": 166 + }, + { + "epoch": 0.026944175540496936, + "grad_norm": 6.7534613609313965, + "learning_rate": 8.60215053763441e-05, + "loss": 2.8376, + "step": 167 + }, + { + "epoch": 0.027105517909002903, + "grad_norm": 7.417628765106201, + "learning_rate": 8.655913978494624e-05, + "loss": 2.7797, + "step": 168 + }, + { + "epoch": 0.027266860277508874, + "grad_norm": 8.517837524414062, + "learning_rate": 8.709677419354839e-05, + "loss": 2.743, + "step": 169 + }, + { + "epoch": 0.027428202646014845, + "grad_norm": 7.477149486541748, + "learning_rate": 8.763440860215054e-05, + "loss": 2.5709, + "step": 170 + }, + { + "epoch": 0.02758954501452081, + "grad_norm": 11.132448196411133, + "learning_rate": 8.81720430107527e-05, + "loss": 2.7476, + "step": 171 + }, + { + "epoch": 0.027750887383026782, + "grad_norm": 7.404602527618408, + "learning_rate": 8.870967741935484e-05, + "loss": 2.6269, + "step": 172 + }, + { + "epoch": 0.027912229751532753, + "grad_norm": 9.709242820739746, + "learning_rate": 8.924731182795699e-05, + "loss": 2.5831, + "step": 173 + }, + { + "epoch": 0.028073572120038724, + "grad_norm": 8.573921203613281, + "learning_rate": 8.978494623655914e-05, + "loss": 2.6558, + "step": 174 + }, + { + "epoch": 0.02823491448854469, + "grad_norm": 7.5375189781188965, + "learning_rate": 9.032258064516129e-05, + "loss": 2.7978, + "step": 175 + }, + { + "epoch": 0.02839625685705066, + "grad_norm": 5.954165935516357, + "learning_rate": 9.086021505376345e-05, + "loss": 2.5566, + "step": 176 + }, + { + "epoch": 0.028557599225556632, + "grad_norm": 6.07686710357666, + "learning_rate": 9.13978494623656e-05, + "loss": 2.5547, + "step": 177 + }, + { + "epoch": 0.028718941594062602, + "grad_norm": 8.48181438446045, + "learning_rate": 9.193548387096774e-05, + "loss": 2.6522, + "step": 178 + }, + { + "epoch": 0.02888028396256857, + "grad_norm": 5.5955424308776855, + "learning_rate": 9.247311827956989e-05, + "loss": 2.587, + "step": 179 + }, + { + "epoch": 0.02904162633107454, + "grad_norm": 6.64224100112915, + "learning_rate": 9.301075268817204e-05, + "loss": 2.464, + "step": 180 + }, + { + "epoch": 0.02920296869958051, + "grad_norm": 5.745777606964111, + "learning_rate": 9.35483870967742e-05, + "loss": 2.4542, + "step": 181 + }, + { + "epoch": 0.029364311068086478, + "grad_norm": 8.085434913635254, + "learning_rate": 9.408602150537636e-05, + "loss": 2.5606, + "step": 182 + }, + { + "epoch": 0.02952565343659245, + "grad_norm": 5.5775980949401855, + "learning_rate": 9.46236559139785e-05, + "loss": 2.5326, + "step": 183 + }, + { + "epoch": 0.02968699580509842, + "grad_norm": 5.335843563079834, + "learning_rate": 9.516129032258065e-05, + "loss": 2.7036, + "step": 184 + }, + { + "epoch": 0.02984833817360439, + "grad_norm": 7.278665542602539, + "learning_rate": 9.56989247311828e-05, + "loss": 2.4229, + "step": 185 + }, + { + "epoch": 0.030009680542110357, + "grad_norm": 6.640331268310547, + "learning_rate": 9.623655913978496e-05, + "loss": 2.6562, + "step": 186 + }, + { + "epoch": 0.030171022910616328, + "grad_norm": 8.202140808105469, + "learning_rate": 9.677419354838711e-05, + "loss": 2.6288, + "step": 187 + }, + { + "epoch": 0.030332365279122298, + "grad_norm": 7.170082092285156, + "learning_rate": 9.731182795698925e-05, + "loss": 2.4985, + "step": 188 + }, + { + "epoch": 0.03049370764762827, + "grad_norm": 6.576979160308838, + "learning_rate": 9.78494623655914e-05, + "loss": 2.4262, + "step": 189 + }, + { + "epoch": 0.030655050016134236, + "grad_norm": 4.620917320251465, + "learning_rate": 9.838709677419355e-05, + "loss": 2.4553, + "step": 190 + }, + { + "epoch": 0.030816392384640207, + "grad_norm": 9.18274211883545, + "learning_rate": 9.892473118279571e-05, + "loss": 2.4282, + "step": 191 + }, + { + "epoch": 0.030977734753146177, + "grad_norm": 6.6460676193237305, + "learning_rate": 9.946236559139786e-05, + "loss": 2.4051, + "step": 192 + }, + { + "epoch": 0.031139077121652144, + "grad_norm": 6.2056355476379395, + "learning_rate": 0.0001, + "loss": 2.5354, + "step": 193 + }, + { + "epoch": 0.03130041949015812, + "grad_norm": 8.653217315673828, + "learning_rate": 9.999999317344175e-05, + "loss": 2.5603, + "step": 194 + }, + { + "epoch": 0.03146176185866408, + "grad_norm": 7.770074844360352, + "learning_rate": 9.999997269376886e-05, + "loss": 2.6073, + "step": 195 + }, + { + "epoch": 0.03162310422717005, + "grad_norm": 6.948127746582031, + "learning_rate": 9.999993856098693e-05, + "loss": 2.5165, + "step": 196 + }, + { + "epoch": 0.03178444659567602, + "grad_norm": 10.14613151550293, + "learning_rate": 9.999989077510529e-05, + "loss": 2.5069, + "step": 197 + }, + { + "epoch": 0.031945788964181994, + "grad_norm": 5.593254089355469, + "learning_rate": 9.999982933613696e-05, + "loss": 2.4338, + "step": 198 + }, + { + "epoch": 0.032107131332687965, + "grad_norm": 5.369663238525391, + "learning_rate": 9.999975424409873e-05, + "loss": 2.4927, + "step": 199 + }, + { + "epoch": 0.032268473701193935, + "grad_norm": 5.942346096038818, + "learning_rate": 9.999966549901113e-05, + "loss": 2.5401, + "step": 200 + }, + { + "epoch": 0.032429816069699906, + "grad_norm": 7.377420425415039, + "learning_rate": 9.999956310089834e-05, + "loss": 2.3675, + "step": 201 + }, + { + "epoch": 0.03259115843820587, + "grad_norm": 4.923562526702881, + "learning_rate": 9.999944704978836e-05, + "loss": 2.4815, + "step": 202 + }, + { + "epoch": 0.03275250080671184, + "grad_norm": 5.832359790802002, + "learning_rate": 9.999931734571286e-05, + "loss": 2.7102, + "step": 203 + }, + { + "epoch": 0.03291384317521781, + "grad_norm": 8.515419006347656, + "learning_rate": 9.999917398870729e-05, + "loss": 2.7848, + "step": 204 + }, + { + "epoch": 0.03307518554372378, + "grad_norm": 6.8526716232299805, + "learning_rate": 9.999901697881076e-05, + "loss": 2.4864, + "step": 205 + }, + { + "epoch": 0.03323652791222975, + "grad_norm": 5.681766033172607, + "learning_rate": 9.999884631606615e-05, + "loss": 2.6268, + "step": 206 + }, + { + "epoch": 0.03339787028073572, + "grad_norm": 6.9704718589782715, + "learning_rate": 9.999866200052008e-05, + "loss": 2.2992, + "step": 207 + }, + { + "epoch": 0.03355921264924169, + "grad_norm": 6.220224857330322, + "learning_rate": 9.999846403222286e-05, + "loss": 2.5179, + "step": 208 + }, + { + "epoch": 0.033720555017747664, + "grad_norm": 5.515377521514893, + "learning_rate": 9.999825241122856e-05, + "loss": 2.4122, + "step": 209 + }, + { + "epoch": 0.03388189738625363, + "grad_norm": 9.85246753692627, + "learning_rate": 9.999802713759495e-05, + "loss": 2.8617, + "step": 210 + }, + { + "epoch": 0.0340432397547596, + "grad_norm": 8.233046531677246, + "learning_rate": 9.999778821138357e-05, + "loss": 2.5484, + "step": 211 + }, + { + "epoch": 0.03420458212326557, + "grad_norm": 7.567295551300049, + "learning_rate": 9.999753563265963e-05, + "loss": 2.3982, + "step": 212 + }, + { + "epoch": 0.03436592449177154, + "grad_norm": 6.545775890350342, + "learning_rate": 9.999726940149212e-05, + "loss": 2.3446, + "step": 213 + }, + { + "epoch": 0.03452726686027751, + "grad_norm": 6.2413716316223145, + "learning_rate": 9.999698951795374e-05, + "loss": 2.4994, + "step": 214 + }, + { + "epoch": 0.03468860922878348, + "grad_norm": 7.410308361053467, + "learning_rate": 9.999669598212092e-05, + "loss": 2.8418, + "step": 215 + }, + { + "epoch": 0.03484995159728945, + "grad_norm": 7.597142696380615, + "learning_rate": 9.999638879407378e-05, + "loss": 2.6682, + "step": 216 + }, + { + "epoch": 0.035011293965795415, + "grad_norm": 7.289877414703369, + "learning_rate": 9.999606795389622e-05, + "loss": 2.4565, + "step": 217 + }, + { + "epoch": 0.035172636334301385, + "grad_norm": 7.676548004150391, + "learning_rate": 9.999573346167588e-05, + "loss": 2.6301, + "step": 218 + }, + { + "epoch": 0.035333978702807356, + "grad_norm": 6.4527363777160645, + "learning_rate": 9.999538531750405e-05, + "loss": 2.5459, + "step": 219 + }, + { + "epoch": 0.03549532107131333, + "grad_norm": 6.0173726081848145, + "learning_rate": 9.999502352147583e-05, + "loss": 2.7194, + "step": 220 + }, + { + "epoch": 0.0356566634398193, + "grad_norm": 6.561513423919678, + "learning_rate": 9.999464807368999e-05, + "loss": 2.458, + "step": 221 + }, + { + "epoch": 0.03581800580832527, + "grad_norm": 7.991656303405762, + "learning_rate": 9.999425897424906e-05, + "loss": 2.904, + "step": 222 + }, + { + "epoch": 0.03597934817683124, + "grad_norm": 4.952508449554443, + "learning_rate": 9.99938562232593e-05, + "loss": 2.6497, + "step": 223 + }, + { + "epoch": 0.0361406905453372, + "grad_norm": 5.483695030212402, + "learning_rate": 9.999343982083065e-05, + "loss": 2.4902, + "step": 224 + }, + { + "epoch": 0.03630203291384317, + "grad_norm": 6.592175483703613, + "learning_rate": 9.999300976707687e-05, + "loss": 2.616, + "step": 225 + }, + { + "epoch": 0.03646337528234914, + "grad_norm": 8.33538818359375, + "learning_rate": 9.999256606211533e-05, + "loss": 2.7458, + "step": 226 + }, + { + "epoch": 0.036624717650855114, + "grad_norm": 6.011215686798096, + "learning_rate": 9.999210870606723e-05, + "loss": 2.7323, + "step": 227 + }, + { + "epoch": 0.036786060019361085, + "grad_norm": 9.141729354858398, + "learning_rate": 9.999163769905744e-05, + "loss": 2.4671, + "step": 228 + }, + { + "epoch": 0.036947402387867055, + "grad_norm": 9.175477981567383, + "learning_rate": 9.999115304121457e-05, + "loss": 2.5845, + "step": 229 + }, + { + "epoch": 0.037108744756373026, + "grad_norm": 10.001459121704102, + "learning_rate": 9.9990654732671e-05, + "loss": 2.804, + "step": 230 + }, + { + "epoch": 0.037270087124878996, + "grad_norm": 8.348021507263184, + "learning_rate": 9.999014277356276e-05, + "loss": 2.6492, + "step": 231 + }, + { + "epoch": 0.03743142949338496, + "grad_norm": 6.260481357574463, + "learning_rate": 9.998961716402965e-05, + "loss": 2.4399, + "step": 232 + }, + { + "epoch": 0.03759277186189093, + "grad_norm": 4.412412643432617, + "learning_rate": 9.998907790421522e-05, + "loss": 2.5457, + "step": 233 + }, + { + "epoch": 0.0377541142303969, + "grad_norm": 6.601975440979004, + "learning_rate": 9.998852499426668e-05, + "loss": 2.2956, + "step": 234 + }, + { + "epoch": 0.03791545659890287, + "grad_norm": 5.505072116851807, + "learning_rate": 9.998795843433503e-05, + "loss": 2.4495, + "step": 235 + }, + { + "epoch": 0.03807679896740884, + "grad_norm": 6.530910491943359, + "learning_rate": 9.998737822457498e-05, + "loss": 2.3748, + "step": 236 + }, + { + "epoch": 0.03823814133591481, + "grad_norm": 7.663618087768555, + "learning_rate": 9.998678436514497e-05, + "loss": 2.4358, + "step": 237 + }, + { + "epoch": 0.038399483704420784, + "grad_norm": 8.565320014953613, + "learning_rate": 9.998617685620714e-05, + "loss": 2.68, + "step": 238 + }, + { + "epoch": 0.03856082607292675, + "grad_norm": 8.876402854919434, + "learning_rate": 9.998555569792741e-05, + "loss": 2.5005, + "step": 239 + }, + { + "epoch": 0.03872216844143272, + "grad_norm": 5.009706497192383, + "learning_rate": 9.998492089047538e-05, + "loss": 2.3758, + "step": 240 + }, + { + "epoch": 0.03888351080993869, + "grad_norm": 4.652370452880859, + "learning_rate": 9.998427243402437e-05, + "loss": 2.5858, + "step": 241 + }, + { + "epoch": 0.03904485317844466, + "grad_norm": 5.316776752471924, + "learning_rate": 9.998361032875145e-05, + "loss": 2.4996, + "step": 242 + }, + { + "epoch": 0.03920619554695063, + "grad_norm": 6.4909138679504395, + "learning_rate": 9.998293457483745e-05, + "loss": 2.5396, + "step": 243 + }, + { + "epoch": 0.0393675379154566, + "grad_norm": 6.99546480178833, + "learning_rate": 9.998224517246689e-05, + "loss": 2.6507, + "step": 244 + }, + { + "epoch": 0.03952888028396257, + "grad_norm": 8.02343463897705, + "learning_rate": 9.998154212182797e-05, + "loss": 2.5949, + "step": 245 + }, + { + "epoch": 0.03969022265246854, + "grad_norm": 7.2550950050354, + "learning_rate": 9.998082542311273e-05, + "loss": 2.5827, + "step": 246 + }, + { + "epoch": 0.039851565020974505, + "grad_norm": 7.897508144378662, + "learning_rate": 9.998009507651684e-05, + "loss": 2.4193, + "step": 247 + }, + { + "epoch": 0.040012907389480476, + "grad_norm": 6.488155364990234, + "learning_rate": 9.997935108223972e-05, + "loss": 2.5488, + "step": 248 + }, + { + "epoch": 0.04017424975798645, + "grad_norm": 7.670787334442139, + "learning_rate": 9.997859344048455e-05, + "loss": 2.4619, + "step": 249 + }, + { + "epoch": 0.04033559212649242, + "grad_norm": 8.569957733154297, + "learning_rate": 9.997782215145821e-05, + "loss": 2.4712, + "step": 250 + }, + { + "epoch": 0.04049693449499839, + "grad_norm": 4.621592998504639, + "learning_rate": 9.997703721537131e-05, + "loss": 2.601, + "step": 251 + }, + { + "epoch": 0.04065827686350436, + "grad_norm": 5.4416303634643555, + "learning_rate": 9.997623863243817e-05, + "loss": 2.1628, + "step": 252 + }, + { + "epoch": 0.04081961923201033, + "grad_norm": 7.547059059143066, + "learning_rate": 9.997542640287687e-05, + "loss": 2.3897, + "step": 253 + }, + { + "epoch": 0.04098096160051629, + "grad_norm": 7.269131183624268, + "learning_rate": 9.997460052690918e-05, + "loss": 2.5672, + "step": 254 + }, + { + "epoch": 0.04114230396902226, + "grad_norm": 6.176670551300049, + "learning_rate": 9.997376100476063e-05, + "loss": 2.4091, + "step": 255 + }, + { + "epoch": 0.041303646337528234, + "grad_norm": 4.447066783905029, + "learning_rate": 9.997290783666049e-05, + "loss": 2.4808, + "step": 256 + }, + { + "epoch": 0.041464988706034205, + "grad_norm": 5.186746597290039, + "learning_rate": 9.997204102284167e-05, + "loss": 2.5325, + "step": 257 + }, + { + "epoch": 0.041626331074540175, + "grad_norm": 6.026534080505371, + "learning_rate": 9.99711605635409e-05, + "loss": 2.2731, + "step": 258 + }, + { + "epoch": 0.041787673443046146, + "grad_norm": 6.029815196990967, + "learning_rate": 9.997026645899859e-05, + "loss": 2.4693, + "step": 259 + }, + { + "epoch": 0.041949015811552116, + "grad_norm": 6.355932235717773, + "learning_rate": 9.996935870945891e-05, + "loss": 2.6484, + "step": 260 + }, + { + "epoch": 0.04211035818005808, + "grad_norm": 7.226869106292725, + "learning_rate": 9.996843731516969e-05, + "loss": 2.4643, + "step": 261 + }, + { + "epoch": 0.04227170054856405, + "grad_norm": 6.115468502044678, + "learning_rate": 9.996750227638257e-05, + "loss": 2.3783, + "step": 262 + }, + { + "epoch": 0.04243304291707002, + "grad_norm": 8.106987953186035, + "learning_rate": 9.996655359335282e-05, + "loss": 2.3988, + "step": 263 + }, + { + "epoch": 0.04259438528557599, + "grad_norm": 4.741427421569824, + "learning_rate": 9.996559126633957e-05, + "loss": 2.4387, + "step": 264 + }, + { + "epoch": 0.04275572765408196, + "grad_norm": 9.366172790527344, + "learning_rate": 9.996461529560553e-05, + "loss": 2.3502, + "step": 265 + }, + { + "epoch": 0.04291707002258793, + "grad_norm": 7.219048023223877, + "learning_rate": 9.99636256814172e-05, + "loss": 2.5856, + "step": 266 + }, + { + "epoch": 0.043078412391093904, + "grad_norm": 7.835181713104248, + "learning_rate": 9.996262242404484e-05, + "loss": 2.6606, + "step": 267 + }, + { + "epoch": 0.043239754759599874, + "grad_norm": 7.519559860229492, + "learning_rate": 9.99616055237624e-05, + "loss": 2.8062, + "step": 268 + }, + { + "epoch": 0.04340109712810584, + "grad_norm": 4.6615376472473145, + "learning_rate": 9.996057498084753e-05, + "loss": 2.3809, + "step": 269 + }, + { + "epoch": 0.04356243949661181, + "grad_norm": 5.479961395263672, + "learning_rate": 9.995953079558165e-05, + "loss": 2.5693, + "step": 270 + }, + { + "epoch": 0.04372378186511778, + "grad_norm": 5.388136386871338, + "learning_rate": 9.99584729682499e-05, + "loss": 2.4341, + "step": 271 + }, + { + "epoch": 0.04388512423362375, + "grad_norm": 7.276689529418945, + "learning_rate": 9.99574014991411e-05, + "loss": 2.4584, + "step": 272 + }, + { + "epoch": 0.04404646660212972, + "grad_norm": 4.907256603240967, + "learning_rate": 9.995631638854785e-05, + "loss": 2.1793, + "step": 273 + }, + { + "epoch": 0.04420780897063569, + "grad_norm": 3.8604819774627686, + "learning_rate": 9.995521763676645e-05, + "loss": 2.406, + "step": 274 + }, + { + "epoch": 0.04436915133914166, + "grad_norm": 5.270939826965332, + "learning_rate": 9.995410524409692e-05, + "loss": 2.4819, + "step": 275 + }, + { + "epoch": 0.044530493707647625, + "grad_norm": 6.114327430725098, + "learning_rate": 9.995297921084303e-05, + "loss": 2.4482, + "step": 276 + }, + { + "epoch": 0.044691836076153596, + "grad_norm": 5.704726696014404, + "learning_rate": 9.995183953731225e-05, + "loss": 2.4674, + "step": 277 + }, + { + "epoch": 0.04485317844465957, + "grad_norm": 5.346158504486084, + "learning_rate": 9.995068622381577e-05, + "loss": 2.279, + "step": 278 + }, + { + "epoch": 0.04501452081316554, + "grad_norm": 8.070467948913574, + "learning_rate": 9.994951927066853e-05, + "loss": 2.5467, + "step": 279 + }, + { + "epoch": 0.04517586318167151, + "grad_norm": 4.506627082824707, + "learning_rate": 9.994833867818917e-05, + "loss": 2.5589, + "step": 280 + }, + { + "epoch": 0.04533720555017748, + "grad_norm": 5.788137912750244, + "learning_rate": 9.994714444670007e-05, + "loss": 2.4006, + "step": 281 + }, + { + "epoch": 0.04549854791868345, + "grad_norm": 6.531043529510498, + "learning_rate": 9.994593657652733e-05, + "loss": 2.4025, + "step": 282 + }, + { + "epoch": 0.04565989028718941, + "grad_norm": 6.962115287780762, + "learning_rate": 9.994471506800079e-05, + "loss": 2.5754, + "step": 283 + }, + { + "epoch": 0.04582123265569538, + "grad_norm": 5.1116180419921875, + "learning_rate": 9.994347992145395e-05, + "loss": 2.3522, + "step": 284 + }, + { + "epoch": 0.045982575024201354, + "grad_norm": 8.793933868408203, + "learning_rate": 9.994223113722415e-05, + "loss": 2.5642, + "step": 285 + }, + { + "epoch": 0.046143917392707325, + "grad_norm": 6.9725751876831055, + "learning_rate": 9.994096871565233e-05, + "loss": 2.6067, + "step": 286 + }, + { + "epoch": 0.046305259761213295, + "grad_norm": 6.1990790367126465, + "learning_rate": 9.993969265708323e-05, + "loss": 2.498, + "step": 287 + }, + { + "epoch": 0.046466602129719266, + "grad_norm": 5.293033123016357, + "learning_rate": 9.99384029618653e-05, + "loss": 2.276, + "step": 288 + }, + { + "epoch": 0.046627944498225236, + "grad_norm": 6.670391082763672, + "learning_rate": 9.99370996303507e-05, + "loss": 2.4333, + "step": 289 + }, + { + "epoch": 0.04678928686673121, + "grad_norm": 7.920434474945068, + "learning_rate": 9.993578266289532e-05, + "loss": 2.2344, + "step": 290 + }, + { + "epoch": 0.04695062923523717, + "grad_norm": 5.005353927612305, + "learning_rate": 9.993445205985877e-05, + "loss": 2.4097, + "step": 291 + }, + { + "epoch": 0.04711197160374314, + "grad_norm": 4.682715892791748, + "learning_rate": 9.99331078216044e-05, + "loss": 2.8777, + "step": 292 + }, + { + "epoch": 0.04727331397224911, + "grad_norm": 6.497574806213379, + "learning_rate": 9.993174994849926e-05, + "loss": 2.3354, + "step": 293 + }, + { + "epoch": 0.04743465634075508, + "grad_norm": 5.930582523345947, + "learning_rate": 9.993037844091413e-05, + "loss": 2.4212, + "step": 294 + }, + { + "epoch": 0.04759599870926105, + "grad_norm": 5.225839138031006, + "learning_rate": 9.992899329922354e-05, + "loss": 2.4619, + "step": 295 + }, + { + "epoch": 0.047757341077767024, + "grad_norm": 7.46838903427124, + "learning_rate": 9.99275945238057e-05, + "loss": 2.5483, + "step": 296 + }, + { + "epoch": 0.047918683446272994, + "grad_norm": 8.18468189239502, + "learning_rate": 9.992618211504256e-05, + "loss": 2.4418, + "step": 297 + }, + { + "epoch": 0.04808002581477896, + "grad_norm": 6.2516608238220215, + "learning_rate": 9.992475607331981e-05, + "loss": 2.6307, + "step": 298 + }, + { + "epoch": 0.04824136818328493, + "grad_norm": 5.004658222198486, + "learning_rate": 9.992331639902685e-05, + "loss": 2.3118, + "step": 299 + }, + { + "epoch": 0.0484027105517909, + "grad_norm": 5.056971549987793, + "learning_rate": 9.99218630925568e-05, + "loss": 2.411, + "step": 300 + }, + { + "epoch": 0.04856405292029687, + "grad_norm": 7.776693344116211, + "learning_rate": 9.992039615430648e-05, + "loss": 2.3621, + "step": 301 + }, + { + "epoch": 0.04872539528880284, + "grad_norm": 4.139708042144775, + "learning_rate": 9.991891558467648e-05, + "loss": 2.2396, + "step": 302 + }, + { + "epoch": 0.04888673765730881, + "grad_norm": 8.121435165405273, + "learning_rate": 9.991742138407107e-05, + "loss": 2.7522, + "step": 303 + }, + { + "epoch": 0.04904808002581478, + "grad_norm": 6.5305681228637695, + "learning_rate": 9.991591355289827e-05, + "loss": 2.92, + "step": 304 + }, + { + "epoch": 0.049209422394320745, + "grad_norm": 7.091409683227539, + "learning_rate": 9.99143920915698e-05, + "loss": 2.511, + "step": 305 + }, + { + "epoch": 0.049370764762826716, + "grad_norm": 6.0481486320495605, + "learning_rate": 9.991285700050115e-05, + "loss": 2.2692, + "step": 306 + }, + { + "epoch": 0.04953210713133269, + "grad_norm": 5.499849796295166, + "learning_rate": 9.991130828011145e-05, + "loss": 2.6419, + "step": 307 + }, + { + "epoch": 0.04969344949983866, + "grad_norm": 6.164821147918701, + "learning_rate": 9.990974593082364e-05, + "loss": 2.626, + "step": 308 + }, + { + "epoch": 0.04985479186834463, + "grad_norm": 5.402545928955078, + "learning_rate": 9.99081699530643e-05, + "loss": 2.3183, + "step": 309 + }, + { + "epoch": 0.0500161342368506, + "grad_norm": 6.041627407073975, + "learning_rate": 9.990658034726379e-05, + "loss": 2.5301, + "step": 310 + }, + { + "epoch": 0.05017747660535657, + "grad_norm": 7.5806050300598145, + "learning_rate": 9.990497711385617e-05, + "loss": 2.4254, + "step": 311 + }, + { + "epoch": 0.05033881897386254, + "grad_norm": 7.522918701171875, + "learning_rate": 9.990336025327922e-05, + "loss": 2.4936, + "step": 312 + }, + { + "epoch": 0.0505001613423685, + "grad_norm": 6.263611793518066, + "learning_rate": 9.990172976597445e-05, + "loss": 2.4999, + "step": 313 + }, + { + "epoch": 0.050661503710874474, + "grad_norm": 4.018587112426758, + "learning_rate": 9.990008565238707e-05, + "loss": 2.3965, + "step": 314 + }, + { + "epoch": 0.050822846079380445, + "grad_norm": 4.3108906745910645, + "learning_rate": 9.989842791296603e-05, + "loss": 2.4771, + "step": 315 + }, + { + "epoch": 0.050984188447886415, + "grad_norm": 5.848104476928711, + "learning_rate": 9.989675654816402e-05, + "loss": 2.4651, + "step": 316 + }, + { + "epoch": 0.051145530816392386, + "grad_norm": 3.9502875804901123, + "learning_rate": 9.989507155843738e-05, + "loss": 2.2438, + "step": 317 + }, + { + "epoch": 0.051306873184898356, + "grad_norm": 5.53761625289917, + "learning_rate": 9.989337294424627e-05, + "loss": 2.3552, + "step": 318 + }, + { + "epoch": 0.05146821555340433, + "grad_norm": 6.956762313842773, + "learning_rate": 9.989166070605447e-05, + "loss": 2.5993, + "step": 319 + }, + { + "epoch": 0.05162955792191029, + "grad_norm": 6.241908073425293, + "learning_rate": 9.988993484432957e-05, + "loss": 2.5251, + "step": 320 + }, + { + "epoch": 0.05179090029041626, + "grad_norm": 6.745074272155762, + "learning_rate": 9.988819535954281e-05, + "loss": 2.4227, + "step": 321 + }, + { + "epoch": 0.05195224265892223, + "grad_norm": 7.87310791015625, + "learning_rate": 9.988644225216918e-05, + "loss": 2.2028, + "step": 322 + }, + { + "epoch": 0.0521135850274282, + "grad_norm": 9.180561065673828, + "learning_rate": 9.988467552268741e-05, + "loss": 2.453, + "step": 323 + }, + { + "epoch": 0.05227492739593417, + "grad_norm": 4.864908218383789, + "learning_rate": 9.988289517157989e-05, + "loss": 2.3643, + "step": 324 + }, + { + "epoch": 0.052436269764440144, + "grad_norm": 9.718785285949707, + "learning_rate": 9.988110119933281e-05, + "loss": 2.42, + "step": 325 + }, + { + "epoch": 0.052597612132946114, + "grad_norm": 8.330720901489258, + "learning_rate": 9.9879293606436e-05, + "loss": 2.6186, + "step": 326 + }, + { + "epoch": 0.05275895450145208, + "grad_norm": 5.9520392417907715, + "learning_rate": 9.987747239338306e-05, + "loss": 2.5718, + "step": 327 + }, + { + "epoch": 0.05292029686995805, + "grad_norm": 7.140975475311279, + "learning_rate": 9.987563756067129e-05, + "loss": 2.3265, + "step": 328 + }, + { + "epoch": 0.05308163923846402, + "grad_norm": 6.457657337188721, + "learning_rate": 9.987378910880172e-05, + "loss": 2.64, + "step": 329 + }, + { + "epoch": 0.05324298160696999, + "grad_norm": 3.6005430221557617, + "learning_rate": 9.987192703827907e-05, + "loss": 2.314, + "step": 330 + }, + { + "epoch": 0.05340432397547596, + "grad_norm": 3.92282772064209, + "learning_rate": 9.987005134961185e-05, + "loss": 2.4194, + "step": 331 + }, + { + "epoch": 0.05356566634398193, + "grad_norm": 4.9452619552612305, + "learning_rate": 9.986816204331221e-05, + "loss": 2.6117, + "step": 332 + }, + { + "epoch": 0.0537270087124879, + "grad_norm": 4.207228183746338, + "learning_rate": 9.986625911989604e-05, + "loss": 2.3704, + "step": 333 + }, + { + "epoch": 0.05388835108099387, + "grad_norm": 4.267602443695068, + "learning_rate": 9.986434257988298e-05, + "loss": 2.3398, + "step": 334 + }, + { + "epoch": 0.054049693449499836, + "grad_norm": 5.442686080932617, + "learning_rate": 9.986241242379633e-05, + "loss": 2.6067, + "step": 335 + }, + { + "epoch": 0.05421103581800581, + "grad_norm": 5.375389575958252, + "learning_rate": 9.986046865216317e-05, + "loss": 2.3361, + "step": 336 + }, + { + "epoch": 0.05437237818651178, + "grad_norm": 5.796661853790283, + "learning_rate": 9.985851126551428e-05, + "loss": 2.5738, + "step": 337 + }, + { + "epoch": 0.05453372055501775, + "grad_norm": 5.076924800872803, + "learning_rate": 9.985654026438411e-05, + "loss": 2.8978, + "step": 338 + }, + { + "epoch": 0.05469506292352372, + "grad_norm": 5.532252788543701, + "learning_rate": 9.985455564931092e-05, + "loss": 2.4838, + "step": 339 + }, + { + "epoch": 0.05485640529202969, + "grad_norm": 5.251373767852783, + "learning_rate": 9.985255742083657e-05, + "loss": 2.5022, + "step": 340 + }, + { + "epoch": 0.05501774766053566, + "grad_norm": 5.868371486663818, + "learning_rate": 9.985054557950674e-05, + "loss": 2.5384, + "step": 341 + }, + { + "epoch": 0.05517909002904162, + "grad_norm": 5.244610786437988, + "learning_rate": 9.984852012587081e-05, + "loss": 2.6677, + "step": 342 + }, + { + "epoch": 0.055340432397547594, + "grad_norm": 5.554970741271973, + "learning_rate": 9.98464810604818e-05, + "loss": 2.3778, + "step": 343 + }, + { + "epoch": 0.055501774766053565, + "grad_norm": 5.125883102416992, + "learning_rate": 9.984442838389654e-05, + "loss": 2.5587, + "step": 344 + }, + { + "epoch": 0.055663117134559535, + "grad_norm": 5.0995588302612305, + "learning_rate": 9.984236209667553e-05, + "loss": 2.4692, + "step": 345 + }, + { + "epoch": 0.055824459503065506, + "grad_norm": 5.538670539855957, + "learning_rate": 9.9840282199383e-05, + "loss": 2.5167, + "step": 346 + }, + { + "epoch": 0.055985801871571476, + "grad_norm": 7.013861179351807, + "learning_rate": 9.983818869258687e-05, + "loss": 2.4946, + "step": 347 + }, + { + "epoch": 0.05614714424007745, + "grad_norm": 6.362895965576172, + "learning_rate": 9.983608157685882e-05, + "loss": 2.3693, + "step": 348 + }, + { + "epoch": 0.05630848660858341, + "grad_norm": 8.72132396697998, + "learning_rate": 9.983396085277421e-05, + "loss": 2.5072, + "step": 349 + }, + { + "epoch": 0.05646982897708938, + "grad_norm": 6.60469913482666, + "learning_rate": 9.983182652091214e-05, + "loss": 2.5737, + "step": 350 + }, + { + "epoch": 0.05663117134559535, + "grad_norm": 6.866801738739014, + "learning_rate": 9.982967858185542e-05, + "loss": 2.2838, + "step": 351 + }, + { + "epoch": 0.05679251371410132, + "grad_norm": 5.227248668670654, + "learning_rate": 9.982751703619055e-05, + "loss": 2.1749, + "step": 352 + }, + { + "epoch": 0.05695385608260729, + "grad_norm": 5.646151065826416, + "learning_rate": 9.982534188450778e-05, + "loss": 3.0098, + "step": 353 + }, + { + "epoch": 0.057115198451113264, + "grad_norm": 5.992406845092773, + "learning_rate": 9.982315312740107e-05, + "loss": 2.3147, + "step": 354 + }, + { + "epoch": 0.057276540819619234, + "grad_norm": 5.548222064971924, + "learning_rate": 9.982095076546807e-05, + "loss": 2.3271, + "step": 355 + }, + { + "epoch": 0.057437883188125205, + "grad_norm": 5.728975296020508, + "learning_rate": 9.981873479931018e-05, + "loss": 2.4881, + "step": 356 + }, + { + "epoch": 0.05759922555663117, + "grad_norm": 5.330463409423828, + "learning_rate": 9.981650522953248e-05, + "loss": 2.653, + "step": 357 + }, + { + "epoch": 0.05776056792513714, + "grad_norm": 6.157469272613525, + "learning_rate": 9.981426205674381e-05, + "loss": 2.6039, + "step": 358 + }, + { + "epoch": 0.05792191029364311, + "grad_norm": 7.181394100189209, + "learning_rate": 9.981200528155666e-05, + "loss": 2.4105, + "step": 359 + }, + { + "epoch": 0.05808325266214908, + "grad_norm": 12.247302055358887, + "learning_rate": 9.980973490458728e-05, + "loss": 2.2891, + "step": 360 + }, + { + "epoch": 0.05824459503065505, + "grad_norm": 5.332772254943848, + "learning_rate": 9.980745092645564e-05, + "loss": 2.3454, + "step": 361 + }, + { + "epoch": 0.05840593739916102, + "grad_norm": 4.680450439453125, + "learning_rate": 9.98051533477854e-05, + "loss": 2.715, + "step": 362 + }, + { + "epoch": 0.05856727976766699, + "grad_norm": 4.383055686950684, + "learning_rate": 9.980284216920393e-05, + "loss": 2.6565, + "step": 363 + }, + { + "epoch": 0.058728622136172956, + "grad_norm": 8.36205768585205, + "learning_rate": 9.980051739134233e-05, + "loss": 2.3592, + "step": 364 + }, + { + "epoch": 0.05888996450467893, + "grad_norm": 11.09957218170166, + "learning_rate": 9.979817901483544e-05, + "loss": 2.4009, + "step": 365 + }, + { + "epoch": 0.0590513068731849, + "grad_norm": 4.821877479553223, + "learning_rate": 9.979582704032175e-05, + "loss": 2.131, + "step": 366 + }, + { + "epoch": 0.05921264924169087, + "grad_norm": 4.984494686126709, + "learning_rate": 9.979346146844351e-05, + "loss": 2.4964, + "step": 367 + }, + { + "epoch": 0.05937399161019684, + "grad_norm": 4.013694763183594, + "learning_rate": 9.979108229984663e-05, + "loss": 2.3903, + "step": 368 + }, + { + "epoch": 0.05953533397870281, + "grad_norm": 7.097440242767334, + "learning_rate": 9.978868953518084e-05, + "loss": 2.4343, + "step": 369 + }, + { + "epoch": 0.05969667634720878, + "grad_norm": 8.406354904174805, + "learning_rate": 9.978628317509947e-05, + "loss": 2.7057, + "step": 370 + }, + { + "epoch": 0.05985801871571474, + "grad_norm": 5.988000869750977, + "learning_rate": 9.978386322025961e-05, + "loss": 2.4413, + "step": 371 + }, + { + "epoch": 0.060019361084220714, + "grad_norm": 5.423110008239746, + "learning_rate": 9.978142967132207e-05, + "loss": 2.403, + "step": 372 + }, + { + "epoch": 0.060180703452726685, + "grad_norm": 4.012385368347168, + "learning_rate": 9.977898252895134e-05, + "loss": 2.5389, + "step": 373 + }, + { + "epoch": 0.060342045821232655, + "grad_norm": 6.347021102905273, + "learning_rate": 9.977652179381566e-05, + "loss": 2.5845, + "step": 374 + }, + { + "epoch": 0.060503388189738626, + "grad_norm": 7.557514190673828, + "learning_rate": 9.977404746658696e-05, + "loss": 2.5214, + "step": 375 + }, + { + "epoch": 0.060664730558244596, + "grad_norm": 5.463866710662842, + "learning_rate": 9.977155954794089e-05, + "loss": 2.6464, + "step": 376 + }, + { + "epoch": 0.06082607292675057, + "grad_norm": 4.601561546325684, + "learning_rate": 9.976905803855679e-05, + "loss": 2.3969, + "step": 377 + }, + { + "epoch": 0.06098741529525654, + "grad_norm": 6.171661853790283, + "learning_rate": 9.976654293911776e-05, + "loss": 2.3105, + "step": 378 + }, + { + "epoch": 0.0611487576637625, + "grad_norm": 5.989907264709473, + "learning_rate": 9.976401425031054e-05, + "loss": 2.4348, + "step": 379 + }, + { + "epoch": 0.06131010003226847, + "grad_norm": 7.300041198730469, + "learning_rate": 9.976147197282565e-05, + "loss": 2.4743, + "step": 380 + }, + { + "epoch": 0.06147144240077444, + "grad_norm": 5.653286933898926, + "learning_rate": 9.975891610735728e-05, + "loss": 2.3747, + "step": 381 + }, + { + "epoch": 0.06163278476928041, + "grad_norm": 6.075132369995117, + "learning_rate": 9.975634665460332e-05, + "loss": 2.4115, + "step": 382 + }, + { + "epoch": 0.061794127137786384, + "grad_norm": 4.353718280792236, + "learning_rate": 9.975376361526543e-05, + "loss": 2.5368, + "step": 383 + }, + { + "epoch": 0.061955469506292354, + "grad_norm": 7.838526248931885, + "learning_rate": 9.975116699004892e-05, + "loss": 2.6441, + "step": 384 + }, + { + "epoch": 0.062116811874798325, + "grad_norm": 5.915657043457031, + "learning_rate": 9.974855677966283e-05, + "loss": 2.4787, + "step": 385 + }, + { + "epoch": 0.06227815424330429, + "grad_norm": 12.826671600341797, + "learning_rate": 9.974593298481991e-05, + "loss": 2.3556, + "step": 386 + }, + { + "epoch": 0.06243949661181026, + "grad_norm": 5.417259693145752, + "learning_rate": 9.97432956062366e-05, + "loss": 2.2902, + "step": 387 + }, + { + "epoch": 0.06260083898031624, + "grad_norm": 6.576791286468506, + "learning_rate": 9.974064464463313e-05, + "loss": 2.6462, + "step": 388 + }, + { + "epoch": 0.06276218134882221, + "grad_norm": 4.490204811096191, + "learning_rate": 9.973798010073332e-05, + "loss": 2.4747, + "step": 389 + }, + { + "epoch": 0.06292352371732816, + "grad_norm": 4.300869464874268, + "learning_rate": 9.973530197526477e-05, + "loss": 2.3842, + "step": 390 + }, + { + "epoch": 0.06308486608583413, + "grad_norm": 6.780396938323975, + "learning_rate": 9.973261026895877e-05, + "loss": 2.2796, + "step": 391 + }, + { + "epoch": 0.0632462084543401, + "grad_norm": 7.646263599395752, + "learning_rate": 9.972990498255034e-05, + "loss": 2.2162, + "step": 392 + }, + { + "epoch": 0.06340755082284608, + "grad_norm": 4.344062805175781, + "learning_rate": 9.97271861167782e-05, + "loss": 2.519, + "step": 393 + }, + { + "epoch": 0.06356889319135205, + "grad_norm": 5.198149681091309, + "learning_rate": 9.972445367238474e-05, + "loss": 2.4272, + "step": 394 + }, + { + "epoch": 0.06373023555985802, + "grad_norm": 7.867514610290527, + "learning_rate": 9.972170765011611e-05, + "loss": 2.6917, + "step": 395 + }, + { + "epoch": 0.06389157792836399, + "grad_norm": 6.376491069793701, + "learning_rate": 9.971894805072212e-05, + "loss": 2.5755, + "step": 396 + }, + { + "epoch": 0.06405292029686996, + "grad_norm": 6.178385257720947, + "learning_rate": 9.971617487495635e-05, + "loss": 2.8525, + "step": 397 + }, + { + "epoch": 0.06421426266537593, + "grad_norm": 5.063424587249756, + "learning_rate": 9.971338812357603e-05, + "loss": 2.4475, + "step": 398 + }, + { + "epoch": 0.0643756050338819, + "grad_norm": 5.792573928833008, + "learning_rate": 9.971058779734211e-05, + "loss": 2.4081, + "step": 399 + }, + { + "epoch": 0.06453694740238787, + "grad_norm": 5.929740905761719, + "learning_rate": 9.970777389701926e-05, + "loss": 2.5581, + "step": 400 + }, + { + "epoch": 0.06469828977089384, + "grad_norm": 5.673580169677734, + "learning_rate": 9.970494642337585e-05, + "loss": 2.4403, + "step": 401 + }, + { + "epoch": 0.06485963213939981, + "grad_norm": 10.702649116516113, + "learning_rate": 9.970210537718395e-05, + "loss": 2.5654, + "step": 402 + }, + { + "epoch": 0.06502097450790578, + "grad_norm": 4.5370564460754395, + "learning_rate": 9.969925075921936e-05, + "loss": 2.5331, + "step": 403 + }, + { + "epoch": 0.06518231687641174, + "grad_norm": 6.114105224609375, + "learning_rate": 9.969638257026156e-05, + "loss": 2.3586, + "step": 404 + }, + { + "epoch": 0.06534365924491771, + "grad_norm": 5.066039085388184, + "learning_rate": 9.969350081109375e-05, + "loss": 2.5332, + "step": 405 + }, + { + "epoch": 0.06550500161342368, + "grad_norm": 4.686367034912109, + "learning_rate": 9.96906054825028e-05, + "loss": 2.4109, + "step": 406 + }, + { + "epoch": 0.06566634398192965, + "grad_norm": 5.096669673919678, + "learning_rate": 9.968769658527935e-05, + "loss": 2.372, + "step": 407 + }, + { + "epoch": 0.06582768635043562, + "grad_norm": 5.357354164123535, + "learning_rate": 9.968477412021769e-05, + "loss": 2.2959, + "step": 408 + }, + { + "epoch": 0.06598902871894159, + "grad_norm": 7.111669063568115, + "learning_rate": 9.968183808811586e-05, + "loss": 2.3232, + "step": 409 + }, + { + "epoch": 0.06615037108744756, + "grad_norm": 6.305822849273682, + "learning_rate": 9.967888848977556e-05, + "loss": 2.2802, + "step": 410 + }, + { + "epoch": 0.06631171345595353, + "grad_norm": 5.072010517120361, + "learning_rate": 9.96759253260022e-05, + "loss": 2.4779, + "step": 411 + }, + { + "epoch": 0.0664730558244595, + "grad_norm": 5.068328857421875, + "learning_rate": 9.967294859760494e-05, + "loss": 2.717, + "step": 412 + }, + { + "epoch": 0.06663439819296547, + "grad_norm": 5.075811862945557, + "learning_rate": 9.966995830539658e-05, + "loss": 2.499, + "step": 413 + }, + { + "epoch": 0.06679574056147145, + "grad_norm": 8.059813499450684, + "learning_rate": 9.966695445019369e-05, + "loss": 2.4381, + "step": 414 + }, + { + "epoch": 0.06695708292997742, + "grad_norm": 4.285947799682617, + "learning_rate": 9.96639370328165e-05, + "loss": 2.453, + "step": 415 + }, + { + "epoch": 0.06711842529848339, + "grad_norm": 4.399743556976318, + "learning_rate": 9.966090605408892e-05, + "loss": 2.507, + "step": 416 + }, + { + "epoch": 0.06727976766698936, + "grad_norm": 6.607447147369385, + "learning_rate": 9.965786151483867e-05, + "loss": 2.2949, + "step": 417 + }, + { + "epoch": 0.06744111003549533, + "grad_norm": 4.112335205078125, + "learning_rate": 9.965480341589701e-05, + "loss": 2.4463, + "step": 418 + }, + { + "epoch": 0.06760245240400128, + "grad_norm": 6.776587963104248, + "learning_rate": 9.965173175809906e-05, + "loss": 2.5205, + "step": 419 + }, + { + "epoch": 0.06776379477250725, + "grad_norm": 5.577247619628906, + "learning_rate": 9.964864654228353e-05, + "loss": 2.2608, + "step": 420 + }, + { + "epoch": 0.06792513714101323, + "grad_norm": 5.070724010467529, + "learning_rate": 9.96455477692929e-05, + "loss": 2.3995, + "step": 421 + }, + { + "epoch": 0.0680864795095192, + "grad_norm": 4.874788284301758, + "learning_rate": 9.964243543997331e-05, + "loss": 2.3424, + "step": 422 + }, + { + "epoch": 0.06824782187802517, + "grad_norm": 4.443953514099121, + "learning_rate": 9.963930955517464e-05, + "loss": 2.3932, + "step": 423 + }, + { + "epoch": 0.06840916424653114, + "grad_norm": 4.812726020812988, + "learning_rate": 9.963617011575046e-05, + "loss": 2.5151, + "step": 424 + }, + { + "epoch": 0.06857050661503711, + "grad_norm": 6.698811054229736, + "learning_rate": 9.9633017122558e-05, + "loss": 2.4978, + "step": 425 + }, + { + "epoch": 0.06873184898354308, + "grad_norm": 3.6164677143096924, + "learning_rate": 9.962985057645824e-05, + "loss": 2.4074, + "step": 426 + }, + { + "epoch": 0.06889319135204905, + "grad_norm": 5.184870719909668, + "learning_rate": 9.962667047831584e-05, + "loss": 2.4073, + "step": 427 + }, + { + "epoch": 0.06905453372055502, + "grad_norm": 5.24127197265625, + "learning_rate": 9.962347682899917e-05, + "loss": 2.3406, + "step": 428 + }, + { + "epoch": 0.06921587608906099, + "grad_norm": 6.580834865570068, + "learning_rate": 9.962026962938032e-05, + "loss": 2.4803, + "step": 429 + }, + { + "epoch": 0.06937721845756696, + "grad_norm": 5.9285078048706055, + "learning_rate": 9.961704888033499e-05, + "loss": 2.5044, + "step": 430 + }, + { + "epoch": 0.06953856082607293, + "grad_norm": 4.945661544799805, + "learning_rate": 9.96138145827427e-05, + "loss": 2.5625, + "step": 431 + }, + { + "epoch": 0.0696999031945789, + "grad_norm": 4.674883842468262, + "learning_rate": 9.961056673748661e-05, + "loss": 2.2854, + "step": 432 + }, + { + "epoch": 0.06986124556308487, + "grad_norm": 5.121710300445557, + "learning_rate": 9.960730534545358e-05, + "loss": 2.5345, + "step": 433 + }, + { + "epoch": 0.07002258793159083, + "grad_norm": 6.434323310852051, + "learning_rate": 9.960403040753415e-05, + "loss": 2.3152, + "step": 434 + }, + { + "epoch": 0.0701839303000968, + "grad_norm": 4.598944664001465, + "learning_rate": 9.96007419246226e-05, + "loss": 2.4321, + "step": 435 + }, + { + "epoch": 0.07034527266860277, + "grad_norm": 5.111134052276611, + "learning_rate": 9.95974398976169e-05, + "loss": 2.4378, + "step": 436 + }, + { + "epoch": 0.07050661503710874, + "grad_norm": 4.676234245300293, + "learning_rate": 9.959412432741869e-05, + "loss": 2.4166, + "step": 437 + }, + { + "epoch": 0.07066795740561471, + "grad_norm": 9.32239818572998, + "learning_rate": 9.959079521493334e-05, + "loss": 2.5017, + "step": 438 + }, + { + "epoch": 0.07082929977412068, + "grad_norm": 6.225765705108643, + "learning_rate": 9.958745256106991e-05, + "loss": 2.4031, + "step": 439 + }, + { + "epoch": 0.07099064214262665, + "grad_norm": 5.544423580169678, + "learning_rate": 9.958409636674113e-05, + "loss": 2.3907, + "step": 440 + }, + { + "epoch": 0.07115198451113262, + "grad_norm": 5.300071716308594, + "learning_rate": 9.958072663286348e-05, + "loss": 2.5297, + "step": 441 + }, + { + "epoch": 0.0713133268796386, + "grad_norm": 7.311100006103516, + "learning_rate": 9.957734336035707e-05, + "loss": 2.5866, + "step": 442 + }, + { + "epoch": 0.07147466924814457, + "grad_norm": 5.8680524826049805, + "learning_rate": 9.957394655014579e-05, + "loss": 2.2837, + "step": 443 + }, + { + "epoch": 0.07163601161665054, + "grad_norm": 6.737392902374268, + "learning_rate": 9.957053620315715e-05, + "loss": 2.2766, + "step": 444 + }, + { + "epoch": 0.0717973539851565, + "grad_norm": 5.466081619262695, + "learning_rate": 9.95671123203224e-05, + "loss": 2.1729, + "step": 445 + }, + { + "epoch": 0.07195869635366248, + "grad_norm": 4.970895290374756, + "learning_rate": 9.956367490257645e-05, + "loss": 2.4309, + "step": 446 + }, + { + "epoch": 0.07212003872216845, + "grad_norm": 4.9316253662109375, + "learning_rate": 9.956022395085798e-05, + "loss": 2.516, + "step": 447 + }, + { + "epoch": 0.0722813810906744, + "grad_norm": 5.837429046630859, + "learning_rate": 9.955675946610924e-05, + "loss": 2.4396, + "step": 448 + }, + { + "epoch": 0.07244272345918037, + "grad_norm": 5.840031147003174, + "learning_rate": 9.955328144927633e-05, + "loss": 2.5762, + "step": 449 + }, + { + "epoch": 0.07260406582768635, + "grad_norm": 5.159872055053711, + "learning_rate": 9.954978990130892e-05, + "loss": 2.6101, + "step": 450 + }, + { + "epoch": 0.07276540819619232, + "grad_norm": 7.33108377456665, + "learning_rate": 9.954628482316042e-05, + "loss": 2.4867, + "step": 451 + }, + { + "epoch": 0.07292675056469829, + "grad_norm": 4.924890041351318, + "learning_rate": 9.954276621578795e-05, + "loss": 2.3232, + "step": 452 + }, + { + "epoch": 0.07308809293320426, + "grad_norm": 6.319643497467041, + "learning_rate": 9.95392340801523e-05, + "loss": 2.2449, + "step": 453 + }, + { + "epoch": 0.07324943530171023, + "grad_norm": 4.542699337005615, + "learning_rate": 9.953568841721797e-05, + "loss": 2.2596, + "step": 454 + }, + { + "epoch": 0.0734107776702162, + "grad_norm": 5.186656475067139, + "learning_rate": 9.953212922795314e-05, + "loss": 2.5219, + "step": 455 + }, + { + "epoch": 0.07357212003872217, + "grad_norm": 5.568697929382324, + "learning_rate": 9.952855651332968e-05, + "loss": 2.2704, + "step": 456 + }, + { + "epoch": 0.07373346240722814, + "grad_norm": 5.325469493865967, + "learning_rate": 9.95249702743232e-05, + "loss": 2.3298, + "step": 457 + }, + { + "epoch": 0.07389480477573411, + "grad_norm": 9.226958274841309, + "learning_rate": 9.952137051191292e-05, + "loss": 2.4681, + "step": 458 + }, + { + "epoch": 0.07405614714424008, + "grad_norm": 9.054658889770508, + "learning_rate": 9.951775722708184e-05, + "loss": 2.4207, + "step": 459 + }, + { + "epoch": 0.07421748951274605, + "grad_norm": 4.462258338928223, + "learning_rate": 9.951413042081659e-05, + "loss": 2.1436, + "step": 460 + }, + { + "epoch": 0.07437883188125202, + "grad_norm": 4.8121209144592285, + "learning_rate": 9.951049009410751e-05, + "loss": 2.2229, + "step": 461 + }, + { + "epoch": 0.07454017424975799, + "grad_norm": 5.110342979431152, + "learning_rate": 9.950683624794865e-05, + "loss": 2.6386, + "step": 462 + }, + { + "epoch": 0.07470151661826395, + "grad_norm": 5.061398983001709, + "learning_rate": 9.950316888333775e-05, + "loss": 2.3988, + "step": 463 + }, + { + "epoch": 0.07486285898676992, + "grad_norm": 6.079970836639404, + "learning_rate": 9.949948800127619e-05, + "loss": 2.5707, + "step": 464 + }, + { + "epoch": 0.07502420135527589, + "grad_norm": 5.802469253540039, + "learning_rate": 9.949579360276912e-05, + "loss": 2.5647, + "step": 465 + }, + { + "epoch": 0.07518554372378186, + "grad_norm": 4.262973785400391, + "learning_rate": 9.949208568882531e-05, + "loss": 2.3063, + "step": 466 + }, + { + "epoch": 0.07534688609228783, + "grad_norm": 7.107011795043945, + "learning_rate": 9.948836426045728e-05, + "loss": 2.4295, + "step": 467 + }, + { + "epoch": 0.0755082284607938, + "grad_norm": 5.002132892608643, + "learning_rate": 9.948462931868119e-05, + "loss": 2.4616, + "step": 468 + }, + { + "epoch": 0.07566957082929977, + "grad_norm": 4.140540599822998, + "learning_rate": 9.948088086451691e-05, + "loss": 2.309, + "step": 469 + }, + { + "epoch": 0.07583091319780574, + "grad_norm": 5.206024169921875, + "learning_rate": 9.947711889898802e-05, + "loss": 2.3658, + "step": 470 + }, + { + "epoch": 0.07599225556631171, + "grad_norm": 4.66822624206543, + "learning_rate": 9.947334342312176e-05, + "loss": 2.3782, + "step": 471 + }, + { + "epoch": 0.07615359793481769, + "grad_norm": 4.914458274841309, + "learning_rate": 9.946955443794908e-05, + "loss": 2.3358, + "step": 472 + }, + { + "epoch": 0.07631494030332366, + "grad_norm": 4.964338779449463, + "learning_rate": 9.946575194450458e-05, + "loss": 2.549, + "step": 473 + }, + { + "epoch": 0.07647628267182963, + "grad_norm": 8.110756874084473, + "learning_rate": 9.946193594382662e-05, + "loss": 2.5279, + "step": 474 + }, + { + "epoch": 0.0766376250403356, + "grad_norm": 8.146491050720215, + "learning_rate": 9.945810643695717e-05, + "loss": 2.3447, + "step": 475 + }, + { + "epoch": 0.07679896740884157, + "grad_norm": 4.413754463195801, + "learning_rate": 9.945426342494195e-05, + "loss": 2.3938, + "step": 476 + }, + { + "epoch": 0.07696030977734754, + "grad_norm": 5.769023418426514, + "learning_rate": 9.945040690883033e-05, + "loss": 2.3792, + "step": 477 + }, + { + "epoch": 0.0771216521458535, + "grad_norm": 5.316256999969482, + "learning_rate": 9.944653688967537e-05, + "loss": 2.3402, + "step": 478 + }, + { + "epoch": 0.07728299451435947, + "grad_norm": 5.797434329986572, + "learning_rate": 9.944265336853385e-05, + "loss": 2.4458, + "step": 479 + }, + { + "epoch": 0.07744433688286544, + "grad_norm": 4.082608699798584, + "learning_rate": 9.94387563464662e-05, + "loss": 2.4178, + "step": 480 + }, + { + "epoch": 0.0776056792513714, + "grad_norm": 5.853196144104004, + "learning_rate": 9.943484582453653e-05, + "loss": 2.2416, + "step": 481 + }, + { + "epoch": 0.07776702161987738, + "grad_norm": 7.1680588722229, + "learning_rate": 9.94309218038127e-05, + "loss": 2.1606, + "step": 482 + }, + { + "epoch": 0.07792836398838335, + "grad_norm": 4.1502814292907715, + "learning_rate": 9.942698428536616e-05, + "loss": 2.4373, + "step": 483 + }, + { + "epoch": 0.07808970635688932, + "grad_norm": 3.6260924339294434, + "learning_rate": 9.942303327027216e-05, + "loss": 2.4671, + "step": 484 + }, + { + "epoch": 0.07825104872539529, + "grad_norm": 4.55759334564209, + "learning_rate": 9.941906875960952e-05, + "loss": 2.4586, + "step": 485 + }, + { + "epoch": 0.07841239109390126, + "grad_norm": 4.005107402801514, + "learning_rate": 9.941509075446081e-05, + "loss": 2.5739, + "step": 486 + }, + { + "epoch": 0.07857373346240723, + "grad_norm": 4.9014506340026855, + "learning_rate": 9.94110992559123e-05, + "loss": 2.5555, + "step": 487 + }, + { + "epoch": 0.0787350758309132, + "grad_norm": 6.068490982055664, + "learning_rate": 9.940709426505388e-05, + "loss": 2.2641, + "step": 488 + }, + { + "epoch": 0.07889641819941917, + "grad_norm": 3.5387156009674072, + "learning_rate": 9.94030757829792e-05, + "loss": 2.422, + "step": 489 + }, + { + "epoch": 0.07905776056792514, + "grad_norm": 5.17158317565918, + "learning_rate": 9.939904381078553e-05, + "loss": 2.4981, + "step": 490 + }, + { + "epoch": 0.07921910293643111, + "grad_norm": 6.625000476837158, + "learning_rate": 9.939499834957386e-05, + "loss": 2.6917, + "step": 491 + }, + { + "epoch": 0.07938044530493708, + "grad_norm": 5.97782039642334, + "learning_rate": 9.939093940044885e-05, + "loss": 2.2068, + "step": 492 + }, + { + "epoch": 0.07954178767344304, + "grad_norm": 5.146000862121582, + "learning_rate": 9.938686696451884e-05, + "loss": 2.2952, + "step": 493 + }, + { + "epoch": 0.07970313004194901, + "grad_norm": 4.428076267242432, + "learning_rate": 9.938278104289586e-05, + "loss": 2.3511, + "step": 494 + }, + { + "epoch": 0.07986447241045498, + "grad_norm": 4.798122882843018, + "learning_rate": 9.937868163669565e-05, + "loss": 2.3543, + "step": 495 + }, + { + "epoch": 0.08002581477896095, + "grad_norm": 6.988558292388916, + "learning_rate": 9.937456874703757e-05, + "loss": 2.3155, + "step": 496 + }, + { + "epoch": 0.08018715714746692, + "grad_norm": 4.669104099273682, + "learning_rate": 9.93704423750447e-05, + "loss": 2.4429, + "step": 497 + }, + { + "epoch": 0.0803484995159729, + "grad_norm": 5.921403884887695, + "learning_rate": 9.93663025218438e-05, + "loss": 2.5554, + "step": 498 + }, + { + "epoch": 0.08050984188447886, + "grad_norm": 4.644062042236328, + "learning_rate": 9.93621491885653e-05, + "loss": 2.4043, + "step": 499 + }, + { + "epoch": 0.08067118425298483, + "grad_norm": 5.473841667175293, + "learning_rate": 9.935798237634335e-05, + "loss": 2.4431, + "step": 500 + }, + { + "epoch": 0.0808325266214908, + "grad_norm": 3.7958078384399414, + "learning_rate": 9.935380208631572e-05, + "loss": 2.4083, + "step": 501 + }, + { + "epoch": 0.08099386898999678, + "grad_norm": 6.851291179656982, + "learning_rate": 9.93496083196239e-05, + "loss": 2.4766, + "step": 502 + }, + { + "epoch": 0.08115521135850275, + "grad_norm": 8.067902565002441, + "learning_rate": 9.934540107741304e-05, + "loss": 2.3541, + "step": 503 + }, + { + "epoch": 0.08131655372700872, + "grad_norm": 4.656294822692871, + "learning_rate": 9.934118036083199e-05, + "loss": 2.3281, + "step": 504 + }, + { + "epoch": 0.08147789609551469, + "grad_norm": 5.242002487182617, + "learning_rate": 9.933694617103327e-05, + "loss": 2.3153, + "step": 505 + }, + { + "epoch": 0.08163923846402066, + "grad_norm": 3.5532069206237793, + "learning_rate": 9.933269850917309e-05, + "loss": 2.2761, + "step": 506 + }, + { + "epoch": 0.08180058083252661, + "grad_norm": 3.949692726135254, + "learning_rate": 9.932843737641127e-05, + "loss": 2.4368, + "step": 507 + }, + { + "epoch": 0.08196192320103259, + "grad_norm": 3.7503204345703125, + "learning_rate": 9.932416277391143e-05, + "loss": 2.3481, + "step": 508 + }, + { + "epoch": 0.08212326556953856, + "grad_norm": 4.7445478439331055, + "learning_rate": 9.931987470284077e-05, + "loss": 2.4143, + "step": 509 + }, + { + "epoch": 0.08228460793804453, + "grad_norm": 4.674653053283691, + "learning_rate": 9.931557316437021e-05, + "loss": 2.4809, + "step": 510 + }, + { + "epoch": 0.0824459503065505, + "grad_norm": 4.840732097625732, + "learning_rate": 9.931125815967434e-05, + "loss": 2.469, + "step": 511 + }, + { + "epoch": 0.08260729267505647, + "grad_norm": 4.673700332641602, + "learning_rate": 9.930692968993143e-05, + "loss": 2.6427, + "step": 512 + }, + { + "epoch": 0.08276863504356244, + "grad_norm": 6.143237113952637, + "learning_rate": 9.93025877563234e-05, + "loss": 2.1935, + "step": 513 + }, + { + "epoch": 0.08292997741206841, + "grad_norm": 4.747680187225342, + "learning_rate": 9.929823236003589e-05, + "loss": 2.3936, + "step": 514 + }, + { + "epoch": 0.08309131978057438, + "grad_norm": 4.849829196929932, + "learning_rate": 9.929386350225818e-05, + "loss": 2.6919, + "step": 515 + }, + { + "epoch": 0.08325266214908035, + "grad_norm": 6.0291924476623535, + "learning_rate": 9.928948118418326e-05, + "loss": 2.3324, + "step": 516 + }, + { + "epoch": 0.08341400451758632, + "grad_norm": 3.673546075820923, + "learning_rate": 9.928508540700774e-05, + "loss": 1.9863, + "step": 517 + }, + { + "epoch": 0.08357534688609229, + "grad_norm": 4.3796706199646, + "learning_rate": 9.928067617193199e-05, + "loss": 2.3091, + "step": 518 + }, + { + "epoch": 0.08373668925459826, + "grad_norm": 4.878311634063721, + "learning_rate": 9.927625348015996e-05, + "loss": 2.4597, + "step": 519 + }, + { + "epoch": 0.08389803162310423, + "grad_norm": 4.7062482833862305, + "learning_rate": 9.927181733289935e-05, + "loss": 2.447, + "step": 520 + }, + { + "epoch": 0.0840593739916102, + "grad_norm": 6.62054967880249, + "learning_rate": 9.92673677313615e-05, + "loss": 2.2419, + "step": 521 + }, + { + "epoch": 0.08422071636011616, + "grad_norm": 6.965595722198486, + "learning_rate": 9.926290467676141e-05, + "loss": 2.4569, + "step": 522 + }, + { + "epoch": 0.08438205872862213, + "grad_norm": 5.016797065734863, + "learning_rate": 9.925842817031781e-05, + "loss": 2.5768, + "step": 523 + }, + { + "epoch": 0.0845434010971281, + "grad_norm": 5.2972941398620605, + "learning_rate": 9.925393821325301e-05, + "loss": 2.3541, + "step": 524 + }, + { + "epoch": 0.08470474346563407, + "grad_norm": 3.7381811141967773, + "learning_rate": 9.924943480679311e-05, + "loss": 2.4922, + "step": 525 + }, + { + "epoch": 0.08486608583414004, + "grad_norm": 4.882343292236328, + "learning_rate": 9.924491795216777e-05, + "loss": 2.2613, + "step": 526 + }, + { + "epoch": 0.08502742820264601, + "grad_norm": 6.2732834815979, + "learning_rate": 9.924038765061042e-05, + "loss": 2.3453, + "step": 527 + }, + { + "epoch": 0.08518877057115198, + "grad_norm": 4.926011562347412, + "learning_rate": 9.923584390335805e-05, + "loss": 2.1763, + "step": 528 + }, + { + "epoch": 0.08535011293965795, + "grad_norm": 3.1853888034820557, + "learning_rate": 9.923128671165145e-05, + "loss": 2.166, + "step": 529 + }, + { + "epoch": 0.08551145530816393, + "grad_norm": 5.0853376388549805, + "learning_rate": 9.922671607673499e-05, + "loss": 2.3266, + "step": 530 + }, + { + "epoch": 0.0856727976766699, + "grad_norm": 5.984068870544434, + "learning_rate": 9.922213199985673e-05, + "loss": 2.4452, + "step": 531 + }, + { + "epoch": 0.08583414004517587, + "grad_norm": 4.2504143714904785, + "learning_rate": 9.921753448226843e-05, + "loss": 2.3857, + "step": 532 + }, + { + "epoch": 0.08599548241368184, + "grad_norm": 4.222536563873291, + "learning_rate": 9.921292352522548e-05, + "loss": 2.7227, + "step": 533 + }, + { + "epoch": 0.08615682478218781, + "grad_norm": 6.777401447296143, + "learning_rate": 9.920829912998696e-05, + "loss": 2.4183, + "step": 534 + }, + { + "epoch": 0.08631816715069378, + "grad_norm": 4.108943462371826, + "learning_rate": 9.920366129781564e-05, + "loss": 2.1958, + "step": 535 + }, + { + "epoch": 0.08647950951919975, + "grad_norm": 6.230100154876709, + "learning_rate": 9.919901002997792e-05, + "loss": 2.4445, + "step": 536 + }, + { + "epoch": 0.0866408518877057, + "grad_norm": 7.756218433380127, + "learning_rate": 9.919434532774387e-05, + "loss": 2.4117, + "step": 537 + }, + { + "epoch": 0.08680219425621168, + "grad_norm": 5.887117385864258, + "learning_rate": 9.918966719238726e-05, + "loss": 2.3651, + "step": 538 + }, + { + "epoch": 0.08696353662471765, + "grad_norm": 5.209339618682861, + "learning_rate": 9.918497562518554e-05, + "loss": 2.3272, + "step": 539 + }, + { + "epoch": 0.08712487899322362, + "grad_norm": 4.876028060913086, + "learning_rate": 9.918027062741976e-05, + "loss": 2.3033, + "step": 540 + }, + { + "epoch": 0.08728622136172959, + "grad_norm": 4.886802673339844, + "learning_rate": 9.917555220037468e-05, + "loss": 2.3048, + "step": 541 + }, + { + "epoch": 0.08744756373023556, + "grad_norm": 4.376396179199219, + "learning_rate": 9.917082034533875e-05, + "loss": 2.4459, + "step": 542 + }, + { + "epoch": 0.08760890609874153, + "grad_norm": 4.801101207733154, + "learning_rate": 9.916607506360407e-05, + "loss": 2.3303, + "step": 543 + }, + { + "epoch": 0.0877702484672475, + "grad_norm": 4.449361801147461, + "learning_rate": 9.916131635646635e-05, + "loss": 2.1947, + "step": 544 + }, + { + "epoch": 0.08793159083575347, + "grad_norm": 6.665609836578369, + "learning_rate": 9.915654422522505e-05, + "loss": 2.5422, + "step": 545 + }, + { + "epoch": 0.08809293320425944, + "grad_norm": 6.845566749572754, + "learning_rate": 9.915175867118324e-05, + "loss": 2.5268, + "step": 546 + }, + { + "epoch": 0.08825427557276541, + "grad_norm": 6.118831634521484, + "learning_rate": 9.914695969564769e-05, + "loss": 2.3306, + "step": 547 + }, + { + "epoch": 0.08841561794127138, + "grad_norm": 6.415126800537109, + "learning_rate": 9.914214729992881e-05, + "loss": 2.2492, + "step": 548 + }, + { + "epoch": 0.08857696030977735, + "grad_norm": 4.820186614990234, + "learning_rate": 9.913732148534068e-05, + "loss": 2.2528, + "step": 549 + }, + { + "epoch": 0.08873830267828332, + "grad_norm": 5.117327690124512, + "learning_rate": 9.913248225320106e-05, + "loss": 2.4017, + "step": 550 + }, + { + "epoch": 0.08889964504678928, + "grad_norm": 5.227145671844482, + "learning_rate": 9.912762960483138e-05, + "loss": 2.2688, + "step": 551 + }, + { + "epoch": 0.08906098741529525, + "grad_norm": 5.166123867034912, + "learning_rate": 9.912276354155666e-05, + "loss": 2.3175, + "step": 552 + }, + { + "epoch": 0.08922232978380122, + "grad_norm": 7.08079195022583, + "learning_rate": 9.911788406470569e-05, + "loss": 2.3965, + "step": 553 + }, + { + "epoch": 0.08938367215230719, + "grad_norm": 3.4850637912750244, + "learning_rate": 9.911299117561085e-05, + "loss": 2.4604, + "step": 554 + }, + { + "epoch": 0.08954501452081316, + "grad_norm": 5.370327949523926, + "learning_rate": 9.910808487560821e-05, + "loss": 2.4105, + "step": 555 + }, + { + "epoch": 0.08970635688931913, + "grad_norm": 6.220058441162109, + "learning_rate": 9.910316516603748e-05, + "loss": 2.4818, + "step": 556 + }, + { + "epoch": 0.0898676992578251, + "grad_norm": 4.778527736663818, + "learning_rate": 9.909823204824206e-05, + "loss": 2.335, + "step": 557 + }, + { + "epoch": 0.09002904162633107, + "grad_norm": 5.088039875030518, + "learning_rate": 9.9093285523569e-05, + "loss": 2.512, + "step": 558 + }, + { + "epoch": 0.09019038399483705, + "grad_norm": 4.244760513305664, + "learning_rate": 9.908832559336902e-05, + "loss": 2.5163, + "step": 559 + }, + { + "epoch": 0.09035172636334302, + "grad_norm": 3.665497303009033, + "learning_rate": 9.908335225899647e-05, + "loss": 2.288, + "step": 560 + }, + { + "epoch": 0.09051306873184899, + "grad_norm": 5.0853986740112305, + "learning_rate": 9.907836552180938e-05, + "loss": 2.3966, + "step": 561 + }, + { + "epoch": 0.09067441110035496, + "grad_norm": 4.697176933288574, + "learning_rate": 9.907336538316944e-05, + "loss": 2.3659, + "step": 562 + }, + { + "epoch": 0.09083575346886093, + "grad_norm": 4.051960468292236, + "learning_rate": 9.906835184444203e-05, + "loss": 2.5436, + "step": 563 + }, + { + "epoch": 0.0909970958373669, + "grad_norm": 4.65770149230957, + "learning_rate": 9.906332490699613e-05, + "loss": 2.2301, + "step": 564 + }, + { + "epoch": 0.09115843820587287, + "grad_norm": 5.872143745422363, + "learning_rate": 9.905828457220442e-05, + "loss": 2.2801, + "step": 565 + }, + { + "epoch": 0.09131978057437883, + "grad_norm": 5.051412105560303, + "learning_rate": 9.90532308414432e-05, + "loss": 2.4987, + "step": 566 + }, + { + "epoch": 0.0914811229428848, + "grad_norm": 5.8648271560668945, + "learning_rate": 9.904816371609249e-05, + "loss": 2.4338, + "step": 567 + }, + { + "epoch": 0.09164246531139077, + "grad_norm": 5.009479522705078, + "learning_rate": 9.90430831975359e-05, + "loss": 2.4364, + "step": 568 + }, + { + "epoch": 0.09180380767989674, + "grad_norm": 5.303811550140381, + "learning_rate": 9.903798928716074e-05, + "loss": 2.1138, + "step": 569 + }, + { + "epoch": 0.09196515004840271, + "grad_norm": 6.00145149230957, + "learning_rate": 9.903288198635798e-05, + "loss": 2.3861, + "step": 570 + }, + { + "epoch": 0.09212649241690868, + "grad_norm": 4.163204669952393, + "learning_rate": 9.902776129652223e-05, + "loss": 2.2934, + "step": 571 + }, + { + "epoch": 0.09228783478541465, + "grad_norm": 4.87972354888916, + "learning_rate": 9.902262721905171e-05, + "loss": 2.2856, + "step": 572 + }, + { + "epoch": 0.09244917715392062, + "grad_norm": 5.656421184539795, + "learning_rate": 9.901747975534841e-05, + "loss": 2.3445, + "step": 573 + }, + { + "epoch": 0.09261051952242659, + "grad_norm": 6.181389808654785, + "learning_rate": 9.901231890681786e-05, + "loss": 2.3808, + "step": 574 + }, + { + "epoch": 0.09277186189093256, + "grad_norm": 4.321932315826416, + "learning_rate": 9.900714467486932e-05, + "loss": 2.3646, + "step": 575 + }, + { + "epoch": 0.09293320425943853, + "grad_norm": 4.5859503746032715, + "learning_rate": 9.900195706091566e-05, + "loss": 2.2895, + "step": 576 + }, + { + "epoch": 0.0930945466279445, + "grad_norm": 6.927850246429443, + "learning_rate": 9.899675606637345e-05, + "loss": 2.3984, + "step": 577 + }, + { + "epoch": 0.09325588899645047, + "grad_norm": 6.717582702636719, + "learning_rate": 9.899154169266283e-05, + "loss": 2.124, + "step": 578 + }, + { + "epoch": 0.09341723136495644, + "grad_norm": 7.106042385101318, + "learning_rate": 9.898631394120771e-05, + "loss": 2.3456, + "step": 579 + }, + { + "epoch": 0.09357857373346241, + "grad_norm": 7.629501819610596, + "learning_rate": 9.898107281343556e-05, + "loss": 2.2063, + "step": 580 + }, + { + "epoch": 0.09373991610196837, + "grad_norm": 4.966427326202393, + "learning_rate": 9.897581831077754e-05, + "loss": 2.2868, + "step": 581 + }, + { + "epoch": 0.09390125847047434, + "grad_norm": 5.79547643661499, + "learning_rate": 9.897055043466848e-05, + "loss": 2.3294, + "step": 582 + }, + { + "epoch": 0.09406260083898031, + "grad_norm": 4.90958833694458, + "learning_rate": 9.896526918654678e-05, + "loss": 2.5938, + "step": 583 + }, + { + "epoch": 0.09422394320748628, + "grad_norm": 4.303000450134277, + "learning_rate": 9.895997456785463e-05, + "loss": 2.3196, + "step": 584 + }, + { + "epoch": 0.09438528557599225, + "grad_norm": 4.9478044509887695, + "learning_rate": 9.89546665800377e-05, + "loss": 2.5005, + "step": 585 + }, + { + "epoch": 0.09454662794449822, + "grad_norm": 6.02296781539917, + "learning_rate": 9.894934522454547e-05, + "loss": 2.2884, + "step": 586 + }, + { + "epoch": 0.0947079703130042, + "grad_norm": 5.697391510009766, + "learning_rate": 9.894401050283099e-05, + "loss": 2.2631, + "step": 587 + }, + { + "epoch": 0.09486931268151017, + "grad_norm": 6.0295305252075195, + "learning_rate": 9.893866241635096e-05, + "loss": 2.4042, + "step": 588 + }, + { + "epoch": 0.09503065505001614, + "grad_norm": 4.033676624298096, + "learning_rate": 9.893330096656574e-05, + "loss": 2.5702, + "step": 589 + }, + { + "epoch": 0.0951919974185221, + "grad_norm": 4.621325492858887, + "learning_rate": 9.892792615493934e-05, + "loss": 2.4732, + "step": 590 + }, + { + "epoch": 0.09535333978702808, + "grad_norm": 4.931112766265869, + "learning_rate": 9.892253798293942e-05, + "loss": 2.3272, + "step": 591 + }, + { + "epoch": 0.09551468215553405, + "grad_norm": 10.282770156860352, + "learning_rate": 9.89171364520373e-05, + "loss": 2.4638, + "step": 592 + }, + { + "epoch": 0.09567602452404002, + "grad_norm": 6.483240604400635, + "learning_rate": 9.891172156370792e-05, + "loss": 2.3649, + "step": 593 + }, + { + "epoch": 0.09583736689254599, + "grad_norm": 6.331130027770996, + "learning_rate": 9.89062933194299e-05, + "loss": 2.359, + "step": 594 + }, + { + "epoch": 0.09599870926105195, + "grad_norm": 6.046228885650635, + "learning_rate": 9.890085172068544e-05, + "loss": 2.288, + "step": 595 + }, + { + "epoch": 0.09616005162955792, + "grad_norm": 4.380414962768555, + "learning_rate": 9.88953967689605e-05, + "loss": 2.3372, + "step": 596 + }, + { + "epoch": 0.09632139399806389, + "grad_norm": 4.5679826736450195, + "learning_rate": 9.888992846574456e-05, + "loss": 2.5387, + "step": 597 + }, + { + "epoch": 0.09648273636656986, + "grad_norm": 6.251746654510498, + "learning_rate": 9.888444681253086e-05, + "loss": 2.3858, + "step": 598 + }, + { + "epoch": 0.09664407873507583, + "grad_norm": 4.814467430114746, + "learning_rate": 9.887895181081622e-05, + "loss": 2.3785, + "step": 599 + }, + { + "epoch": 0.0968054211035818, + "grad_norm": 4.712541103363037, + "learning_rate": 9.88734434621011e-05, + "loss": 2.2192, + "step": 600 + }, + { + "epoch": 0.09696676347208777, + "grad_norm": 6.289758205413818, + "learning_rate": 9.886792176788964e-05, + "loss": 2.1972, + "step": 601 + }, + { + "epoch": 0.09712810584059374, + "grad_norm": 4.202297210693359, + "learning_rate": 9.886238672968959e-05, + "loss": 2.3075, + "step": 602 + }, + { + "epoch": 0.09728944820909971, + "grad_norm": 4.969404220581055, + "learning_rate": 9.885683834901238e-05, + "loss": 2.6972, + "step": 603 + }, + { + "epoch": 0.09745079057760568, + "grad_norm": 5.10714054107666, + "learning_rate": 9.885127662737306e-05, + "loss": 2.2584, + "step": 604 + }, + { + "epoch": 0.09761213294611165, + "grad_norm": 4.227580547332764, + "learning_rate": 9.88457015662903e-05, + "loss": 2.2547, + "step": 605 + }, + { + "epoch": 0.09777347531461762, + "grad_norm": 4.349273204803467, + "learning_rate": 9.884011316728648e-05, + "loss": 2.3444, + "step": 606 + }, + { + "epoch": 0.09793481768312359, + "grad_norm": 5.646902561187744, + "learning_rate": 9.883451143188753e-05, + "loss": 2.251, + "step": 607 + }, + { + "epoch": 0.09809616005162956, + "grad_norm": 4.5992655754089355, + "learning_rate": 9.882889636162313e-05, + "loss": 2.2226, + "step": 608 + }, + { + "epoch": 0.09825750242013553, + "grad_norm": 6.841413497924805, + "learning_rate": 9.882326795802652e-05, + "loss": 2.4206, + "step": 609 + }, + { + "epoch": 0.09841884478864149, + "grad_norm": 6.097044467926025, + "learning_rate": 9.881762622263459e-05, + "loss": 2.5558, + "step": 610 + }, + { + "epoch": 0.09858018715714746, + "grad_norm": 5.023830890655518, + "learning_rate": 9.88119711569879e-05, + "loss": 2.3713, + "step": 611 + }, + { + "epoch": 0.09874152952565343, + "grad_norm": 4.367787837982178, + "learning_rate": 9.880630276263066e-05, + "loss": 2.4648, + "step": 612 + }, + { + "epoch": 0.0989028718941594, + "grad_norm": 5.2165093421936035, + "learning_rate": 9.880062104111064e-05, + "loss": 2.1164, + "step": 613 + }, + { + "epoch": 0.09906421426266537, + "grad_norm": 4.974855422973633, + "learning_rate": 9.879492599397935e-05, + "loss": 2.4089, + "step": 614 + }, + { + "epoch": 0.09922555663117134, + "grad_norm": 5.106447696685791, + "learning_rate": 9.878921762279185e-05, + "loss": 2.3289, + "step": 615 + }, + { + "epoch": 0.09938689899967731, + "grad_norm": 5.060490608215332, + "learning_rate": 9.878349592910692e-05, + "loss": 2.2726, + "step": 616 + }, + { + "epoch": 0.09954824136818329, + "grad_norm": 4.20667839050293, + "learning_rate": 9.877776091448694e-05, + "loss": 2.2952, + "step": 617 + }, + { + "epoch": 0.09970958373668926, + "grad_norm": 5.2343525886535645, + "learning_rate": 9.87720125804979e-05, + "loss": 2.278, + "step": 618 + }, + { + "epoch": 0.09987092610519523, + "grad_norm": 3.2236578464508057, + "learning_rate": 9.876625092870947e-05, + "loss": 2.2452, + "step": 619 + }, + { + "epoch": 0.1000322684737012, + "grad_norm": 5.387293338775635, + "learning_rate": 9.876047596069493e-05, + "loss": 2.3576, + "step": 620 + }, + { + "epoch": 0.10019361084220717, + "grad_norm": 6.220782279968262, + "learning_rate": 9.875468767803122e-05, + "loss": 2.5285, + "step": 621 + }, + { + "epoch": 0.10035495321071314, + "grad_norm": 5.5559258460998535, + "learning_rate": 9.87488860822989e-05, + "loss": 2.3215, + "step": 622 + }, + { + "epoch": 0.10051629557921911, + "grad_norm": 5.513997554779053, + "learning_rate": 9.874307117508214e-05, + "loss": 2.1272, + "step": 623 + }, + { + "epoch": 0.10067763794772508, + "grad_norm": 5.5388007164001465, + "learning_rate": 9.873724295796881e-05, + "loss": 2.5749, + "step": 624 + }, + { + "epoch": 0.10083898031623104, + "grad_norm": 4.577971935272217, + "learning_rate": 9.873140143255036e-05, + "loss": 2.4018, + "step": 625 + }, + { + "epoch": 0.101000322684737, + "grad_norm": 4.911154747009277, + "learning_rate": 9.872554660042188e-05, + "loss": 2.4153, + "step": 626 + }, + { + "epoch": 0.10116166505324298, + "grad_norm": 5.559174060821533, + "learning_rate": 9.871967846318213e-05, + "loss": 2.2745, + "step": 627 + }, + { + "epoch": 0.10132300742174895, + "grad_norm": 5.209252834320068, + "learning_rate": 9.871379702243345e-05, + "loss": 2.2871, + "step": 628 + }, + { + "epoch": 0.10148434979025492, + "grad_norm": 5.395344257354736, + "learning_rate": 9.870790227978186e-05, + "loss": 2.2349, + "step": 629 + }, + { + "epoch": 0.10164569215876089, + "grad_norm": 4.204890727996826, + "learning_rate": 9.870199423683697e-05, + "loss": 2.2719, + "step": 630 + }, + { + "epoch": 0.10180703452726686, + "grad_norm": 6.293933868408203, + "learning_rate": 9.869607289521207e-05, + "loss": 2.3921, + "step": 631 + }, + { + "epoch": 0.10196837689577283, + "grad_norm": 5.962689399719238, + "learning_rate": 9.869013825652405e-05, + "loss": 2.464, + "step": 632 + }, + { + "epoch": 0.1021297192642788, + "grad_norm": 5.058944225311279, + "learning_rate": 9.868419032239342e-05, + "loss": 2.0913, + "step": 633 + }, + { + "epoch": 0.10229106163278477, + "grad_norm": 5.054147720336914, + "learning_rate": 9.867822909444434e-05, + "loss": 2.4864, + "step": 634 + }, + { + "epoch": 0.10245240400129074, + "grad_norm": 5.315404891967773, + "learning_rate": 9.867225457430461e-05, + "loss": 2.3332, + "step": 635 + }, + { + "epoch": 0.10261374636979671, + "grad_norm": 5.104187488555908, + "learning_rate": 9.866626676360564e-05, + "loss": 2.3409, + "step": 636 + }, + { + "epoch": 0.10277508873830268, + "grad_norm": 3.9855387210845947, + "learning_rate": 9.866026566398248e-05, + "loss": 2.1252, + "step": 637 + }, + { + "epoch": 0.10293643110680865, + "grad_norm": 5.759126663208008, + "learning_rate": 9.86542512770738e-05, + "loss": 2.3079, + "step": 638 + }, + { + "epoch": 0.10309777347531461, + "grad_norm": 7.466570854187012, + "learning_rate": 9.864822360452188e-05, + "loss": 2.2168, + "step": 639 + }, + { + "epoch": 0.10325911584382058, + "grad_norm": 8.343269348144531, + "learning_rate": 9.86421826479727e-05, + "loss": 1.9994, + "step": 640 + }, + { + "epoch": 0.10342045821232655, + "grad_norm": 5.27672815322876, + "learning_rate": 9.863612840907577e-05, + "loss": 2.3302, + "step": 641 + }, + { + "epoch": 0.10358180058083252, + "grad_norm": 4.862626552581787, + "learning_rate": 9.86300608894843e-05, + "loss": 2.4057, + "step": 642 + }, + { + "epoch": 0.1037431429493385, + "grad_norm": 4.641247272491455, + "learning_rate": 9.862398009085511e-05, + "loss": 2.2996, + "step": 643 + }, + { + "epoch": 0.10390448531784446, + "grad_norm": 5.97214412689209, + "learning_rate": 9.86178860148486e-05, + "loss": 2.1299, + "step": 644 + }, + { + "epoch": 0.10406582768635043, + "grad_norm": 6.091129779815674, + "learning_rate": 9.861177866312887e-05, + "loss": 2.3089, + "step": 645 + }, + { + "epoch": 0.1042271700548564, + "grad_norm": 5.053765773773193, + "learning_rate": 9.86056580373636e-05, + "loss": 2.2826, + "step": 646 + }, + { + "epoch": 0.10438851242336238, + "grad_norm": 5.303426742553711, + "learning_rate": 9.859952413922407e-05, + "loss": 2.3649, + "step": 647 + }, + { + "epoch": 0.10454985479186835, + "grad_norm": 5.3501715660095215, + "learning_rate": 9.859337697038526e-05, + "loss": 2.2318, + "step": 648 + }, + { + "epoch": 0.10471119716037432, + "grad_norm": 5.797842979431152, + "learning_rate": 9.858721653252571e-05, + "loss": 2.3028, + "step": 649 + }, + { + "epoch": 0.10487253952888029, + "grad_norm": 4.831730365753174, + "learning_rate": 9.858104282732759e-05, + "loss": 2.3065, + "step": 650 + }, + { + "epoch": 0.10503388189738626, + "grad_norm": 7.14301061630249, + "learning_rate": 9.857485585647675e-05, + "loss": 2.5492, + "step": 651 + }, + { + "epoch": 0.10519522426589223, + "grad_norm": 5.6980061531066895, + "learning_rate": 9.856865562166256e-05, + "loss": 2.1222, + "step": 652 + }, + { + "epoch": 0.1053565666343982, + "grad_norm": 4.008930206298828, + "learning_rate": 9.856244212457813e-05, + "loss": 2.0549, + "step": 653 + }, + { + "epoch": 0.10551790900290416, + "grad_norm": 5.521589756011963, + "learning_rate": 9.855621536692008e-05, + "loss": 2.4834, + "step": 654 + }, + { + "epoch": 0.10567925137141013, + "grad_norm": 4.837742805480957, + "learning_rate": 9.854997535038873e-05, + "loss": 2.4236, + "step": 655 + }, + { + "epoch": 0.1058405937399161, + "grad_norm": 4.2191081047058105, + "learning_rate": 9.854372207668799e-05, + "loss": 2.5142, + "step": 656 + }, + { + "epoch": 0.10600193610842207, + "grad_norm": 5.554219722747803, + "learning_rate": 9.85374555475254e-05, + "loss": 2.275, + "step": 657 + }, + { + "epoch": 0.10616327847692804, + "grad_norm": 5.10724401473999, + "learning_rate": 9.85311757646121e-05, + "loss": 2.3817, + "step": 658 + }, + { + "epoch": 0.10632462084543401, + "grad_norm": 4.961881160736084, + "learning_rate": 9.852488272966286e-05, + "loss": 1.9945, + "step": 659 + }, + { + "epoch": 0.10648596321393998, + "grad_norm": 5.993134498596191, + "learning_rate": 9.85185764443961e-05, + "loss": 2.5539, + "step": 660 + }, + { + "epoch": 0.10664730558244595, + "grad_norm": 4.784736633300781, + "learning_rate": 9.85122569105338e-05, + "loss": 2.3461, + "step": 661 + }, + { + "epoch": 0.10680864795095192, + "grad_norm": 7.389451503753662, + "learning_rate": 9.850592412980159e-05, + "loss": 2.3742, + "step": 662 + }, + { + "epoch": 0.10696999031945789, + "grad_norm": 4.962615489959717, + "learning_rate": 9.849957810392872e-05, + "loss": 2.3071, + "step": 663 + }, + { + "epoch": 0.10713133268796386, + "grad_norm": 5.416285037994385, + "learning_rate": 9.849321883464806e-05, + "loss": 2.0281, + "step": 664 + }, + { + "epoch": 0.10729267505646983, + "grad_norm": 4.267657279968262, + "learning_rate": 9.848684632369605e-05, + "loss": 2.2553, + "step": 665 + }, + { + "epoch": 0.1074540174249758, + "grad_norm": 5.005043029785156, + "learning_rate": 9.848046057281284e-05, + "loss": 2.3333, + "step": 666 + }, + { + "epoch": 0.10761535979348177, + "grad_norm": 5.271666526794434, + "learning_rate": 9.847406158374209e-05, + "loss": 2.4515, + "step": 667 + }, + { + "epoch": 0.10777670216198774, + "grad_norm": 5.715791702270508, + "learning_rate": 9.846764935823113e-05, + "loss": 2.1334, + "step": 668 + }, + { + "epoch": 0.1079380445304937, + "grad_norm": 7.363461017608643, + "learning_rate": 9.846122389803093e-05, + "loss": 2.2951, + "step": 669 + }, + { + "epoch": 0.10809938689899967, + "grad_norm": 4.60572624206543, + "learning_rate": 9.845478520489599e-05, + "loss": 2.2605, + "step": 670 + }, + { + "epoch": 0.10826072926750564, + "grad_norm": 4.57849645614624, + "learning_rate": 9.844833328058452e-05, + "loss": 2.5506, + "step": 671 + }, + { + "epoch": 0.10842207163601161, + "grad_norm": 5.1598944664001465, + "learning_rate": 9.844186812685827e-05, + "loss": 2.2983, + "step": 672 + }, + { + "epoch": 0.10858341400451758, + "grad_norm": 5.090889930725098, + "learning_rate": 9.843538974548265e-05, + "loss": 2.1432, + "step": 673 + }, + { + "epoch": 0.10874475637302355, + "grad_norm": 3.7854175567626953, + "learning_rate": 9.842889813822665e-05, + "loss": 2.4845, + "step": 674 + }, + { + "epoch": 0.10890609874152953, + "grad_norm": 4.897592067718506, + "learning_rate": 9.842239330686287e-05, + "loss": 2.33, + "step": 675 + }, + { + "epoch": 0.1090674411100355, + "grad_norm": 3.634265184402466, + "learning_rate": 9.841587525316756e-05, + "loss": 2.5112, + "step": 676 + }, + { + "epoch": 0.10922878347854147, + "grad_norm": 5.599669456481934, + "learning_rate": 9.840934397892054e-05, + "loss": 2.5141, + "step": 677 + }, + { + "epoch": 0.10939012584704744, + "grad_norm": 3.9483070373535156, + "learning_rate": 9.840279948590528e-05, + "loss": 2.2176, + "step": 678 + }, + { + "epoch": 0.10955146821555341, + "grad_norm": 4.629967212677002, + "learning_rate": 9.83962417759088e-05, + "loss": 2.2878, + "step": 679 + }, + { + "epoch": 0.10971281058405938, + "grad_norm": 4.446599006652832, + "learning_rate": 9.838967085072177e-05, + "loss": 2.2885, + "step": 680 + }, + { + "epoch": 0.10987415295256535, + "grad_norm": 5.427147388458252, + "learning_rate": 9.838308671213847e-05, + "loss": 1.9743, + "step": 681 + }, + { + "epoch": 0.11003549532107132, + "grad_norm": 5.741102695465088, + "learning_rate": 9.83764893619568e-05, + "loss": 2.2042, + "step": 682 + }, + { + "epoch": 0.11019683768957728, + "grad_norm": 5.572746276855469, + "learning_rate": 9.836987880197822e-05, + "loss": 2.2499, + "step": 683 + }, + { + "epoch": 0.11035818005808325, + "grad_norm": 4.086371421813965, + "learning_rate": 9.836325503400781e-05, + "loss": 2.4301, + "step": 684 + }, + { + "epoch": 0.11051952242658922, + "grad_norm": 6.250503063201904, + "learning_rate": 9.83566180598543e-05, + "loss": 2.1814, + "step": 685 + }, + { + "epoch": 0.11068086479509519, + "grad_norm": 5.577382564544678, + "learning_rate": 9.834996788133002e-05, + "loss": 2.1989, + "step": 686 + }, + { + "epoch": 0.11084220716360116, + "grad_norm": 5.428224563598633, + "learning_rate": 9.834330450025082e-05, + "loss": 2.1685, + "step": 687 + }, + { + "epoch": 0.11100354953210713, + "grad_norm": 4.101722717285156, + "learning_rate": 9.833662791843627e-05, + "loss": 2.2781, + "step": 688 + }, + { + "epoch": 0.1111648919006131, + "grad_norm": 6.424073219299316, + "learning_rate": 9.832993813770947e-05, + "loss": 2.2648, + "step": 689 + }, + { + "epoch": 0.11132623426911907, + "grad_norm": 4.855088233947754, + "learning_rate": 9.832323515989717e-05, + "loss": 2.1524, + "step": 690 + }, + { + "epoch": 0.11148757663762504, + "grad_norm": 7.551122665405273, + "learning_rate": 9.831651898682968e-05, + "loss": 2.0951, + "step": 691 + }, + { + "epoch": 0.11164891900613101, + "grad_norm": 4.96258020401001, + "learning_rate": 9.830978962034093e-05, + "loss": 2.3655, + "step": 692 + }, + { + "epoch": 0.11181026137463698, + "grad_norm": 5.927750587463379, + "learning_rate": 9.830304706226847e-05, + "loss": 2.0535, + "step": 693 + }, + { + "epoch": 0.11197160374314295, + "grad_norm": 4.051743507385254, + "learning_rate": 9.829629131445342e-05, + "loss": 2.3049, + "step": 694 + }, + { + "epoch": 0.11213294611164892, + "grad_norm": 4.13598108291626, + "learning_rate": 9.828952237874055e-05, + "loss": 2.3451, + "step": 695 + }, + { + "epoch": 0.1122942884801549, + "grad_norm": 5.234704494476318, + "learning_rate": 9.828274025697817e-05, + "loss": 2.3477, + "step": 696 + }, + { + "epoch": 0.11245563084866086, + "grad_norm": 3.897089719772339, + "learning_rate": 9.827594495101823e-05, + "loss": 2.0933, + "step": 697 + }, + { + "epoch": 0.11261697321716682, + "grad_norm": 4.481931686401367, + "learning_rate": 9.826913646271631e-05, + "loss": 2.3839, + "step": 698 + }, + { + "epoch": 0.11277831558567279, + "grad_norm": 4.5691986083984375, + "learning_rate": 9.826231479393148e-05, + "loss": 2.362, + "step": 699 + }, + { + "epoch": 0.11293965795417876, + "grad_norm": 3.9855291843414307, + "learning_rate": 9.825547994652655e-05, + "loss": 2.4411, + "step": 700 + }, + { + "epoch": 0.11310100032268473, + "grad_norm": 4.106456756591797, + "learning_rate": 9.824863192236784e-05, + "loss": 2.2892, + "step": 701 + }, + { + "epoch": 0.1132623426911907, + "grad_norm": 4.03942346572876, + "learning_rate": 9.824177072332526e-05, + "loss": 2.3969, + "step": 702 + }, + { + "epoch": 0.11342368505969667, + "grad_norm": 5.276364803314209, + "learning_rate": 9.823489635127236e-05, + "loss": 2.1279, + "step": 703 + }, + { + "epoch": 0.11358502742820265, + "grad_norm": 3.4048545360565186, + "learning_rate": 9.822800880808628e-05, + "loss": 2.3158, + "step": 704 + }, + { + "epoch": 0.11374636979670862, + "grad_norm": 4.882122039794922, + "learning_rate": 9.822110809564774e-05, + "loss": 2.2801, + "step": 705 + }, + { + "epoch": 0.11390771216521459, + "grad_norm": 4.997471332550049, + "learning_rate": 9.821419421584107e-05, + "loss": 2.0774, + "step": 706 + }, + { + "epoch": 0.11406905453372056, + "grad_norm": 5.2324090003967285, + "learning_rate": 9.82072671705542e-05, + "loss": 2.1501, + "step": 707 + }, + { + "epoch": 0.11423039690222653, + "grad_norm": 4.585089683532715, + "learning_rate": 9.820032696167863e-05, + "loss": 2.3784, + "step": 708 + }, + { + "epoch": 0.1143917392707325, + "grad_norm": 4.679159641265869, + "learning_rate": 9.819337359110945e-05, + "loss": 2.0806, + "step": 709 + }, + { + "epoch": 0.11455308163923847, + "grad_norm": 5.738368511199951, + "learning_rate": 9.81864070607454e-05, + "loss": 2.3261, + "step": 710 + }, + { + "epoch": 0.11471442400774444, + "grad_norm": 5.211796283721924, + "learning_rate": 9.817942737248878e-05, + "loss": 2.1125, + "step": 711 + }, + { + "epoch": 0.11487576637625041, + "grad_norm": 6.789905548095703, + "learning_rate": 9.817243452824545e-05, + "loss": 2.3248, + "step": 712 + }, + { + "epoch": 0.11503710874475637, + "grad_norm": 6.678147315979004, + "learning_rate": 9.81654285299249e-05, + "loss": 2.1509, + "step": 713 + }, + { + "epoch": 0.11519845111326234, + "grad_norm": 3.517925500869751, + "learning_rate": 9.815840937944022e-05, + "loss": 2.512, + "step": 714 + }, + { + "epoch": 0.11535979348176831, + "grad_norm": 6.970942497253418, + "learning_rate": 9.815137707870805e-05, + "loss": 2.0529, + "step": 715 + }, + { + "epoch": 0.11552113585027428, + "grad_norm": 6.183144569396973, + "learning_rate": 9.814433162964868e-05, + "loss": 2.4695, + "step": 716 + }, + { + "epoch": 0.11568247821878025, + "grad_norm": 4.125637054443359, + "learning_rate": 9.813727303418594e-05, + "loss": 2.2679, + "step": 717 + }, + { + "epoch": 0.11584382058728622, + "grad_norm": 7.293137073516846, + "learning_rate": 9.813020129424726e-05, + "loss": 2.2164, + "step": 718 + }, + { + "epoch": 0.11600516295579219, + "grad_norm": 4.501054286956787, + "learning_rate": 9.812311641176366e-05, + "loss": 2.2489, + "step": 719 + }, + { + "epoch": 0.11616650532429816, + "grad_norm": 4.464605331420898, + "learning_rate": 9.811601838866979e-05, + "loss": 2.0857, + "step": 720 + }, + { + "epoch": 0.11632784769280413, + "grad_norm": 3.8551461696624756, + "learning_rate": 9.810890722690381e-05, + "loss": 2.1855, + "step": 721 + }, + { + "epoch": 0.1164891900613101, + "grad_norm": 6.24691104888916, + "learning_rate": 9.810178292840753e-05, + "loss": 2.0357, + "step": 722 + }, + { + "epoch": 0.11665053242981607, + "grad_norm": 6.160900115966797, + "learning_rate": 9.809464549512633e-05, + "loss": 2.1341, + "step": 723 + }, + { + "epoch": 0.11681187479832204, + "grad_norm": 5.17367696762085, + "learning_rate": 9.808749492900918e-05, + "loss": 2.1418, + "step": 724 + }, + { + "epoch": 0.11697321716682801, + "grad_norm": 7.1499786376953125, + "learning_rate": 9.808033123200859e-05, + "loss": 2.2524, + "step": 725 + }, + { + "epoch": 0.11713455953533398, + "grad_norm": 5.599761486053467, + "learning_rate": 9.807315440608076e-05, + "loss": 2.243, + "step": 726 + }, + { + "epoch": 0.11729590190383996, + "grad_norm": 4.063814163208008, + "learning_rate": 9.806596445318537e-05, + "loss": 2.0576, + "step": 727 + }, + { + "epoch": 0.11745724427234591, + "grad_norm": 4.227038860321045, + "learning_rate": 9.805876137528571e-05, + "loss": 2.2421, + "step": 728 + }, + { + "epoch": 0.11761858664085188, + "grad_norm": 5.211657524108887, + "learning_rate": 9.805154517434871e-05, + "loss": 2.5321, + "step": 729 + }, + { + "epoch": 0.11777992900935785, + "grad_norm": 4.9539031982421875, + "learning_rate": 9.804431585234483e-05, + "loss": 2.5082, + "step": 730 + }, + { + "epoch": 0.11794127137786382, + "grad_norm": 4.826414108276367, + "learning_rate": 9.803707341124812e-05, + "loss": 2.186, + "step": 731 + }, + { + "epoch": 0.1181026137463698, + "grad_norm": 5.114047527313232, + "learning_rate": 9.802981785303621e-05, + "loss": 2.2141, + "step": 732 + }, + { + "epoch": 0.11826395611487577, + "grad_norm": 6.016683578491211, + "learning_rate": 9.802254917969032e-05, + "loss": 2.2963, + "step": 733 + }, + { + "epoch": 0.11842529848338174, + "grad_norm": 7.989525318145752, + "learning_rate": 9.801526739319528e-05, + "loss": 2.2502, + "step": 734 + }, + { + "epoch": 0.1185866408518877, + "grad_norm": 4.647066593170166, + "learning_rate": 9.800797249553943e-05, + "loss": 2.2818, + "step": 735 + }, + { + "epoch": 0.11874798322039368, + "grad_norm": 4.587214469909668, + "learning_rate": 9.800066448871477e-05, + "loss": 2.6027, + "step": 736 + }, + { + "epoch": 0.11890932558889965, + "grad_norm": 4.695929527282715, + "learning_rate": 9.799334337471681e-05, + "loss": 1.9805, + "step": 737 + }, + { + "epoch": 0.11907066795740562, + "grad_norm": 5.0293707847595215, + "learning_rate": 9.798600915554468e-05, + "loss": 2.3194, + "step": 738 + }, + { + "epoch": 0.11923201032591159, + "grad_norm": 5.731796741485596, + "learning_rate": 9.79786618332011e-05, + "loss": 2.1848, + "step": 739 + }, + { + "epoch": 0.11939335269441756, + "grad_norm": 5.218835353851318, + "learning_rate": 9.79713014096923e-05, + "loss": 2.0952, + "step": 740 + }, + { + "epoch": 0.11955469506292353, + "grad_norm": 4.538522720336914, + "learning_rate": 9.79639278870282e-05, + "loss": 1.9968, + "step": 741 + }, + { + "epoch": 0.11971603743142949, + "grad_norm": 5.75065803527832, + "learning_rate": 9.795654126722217e-05, + "loss": 2.2524, + "step": 742 + }, + { + "epoch": 0.11987737979993546, + "grad_norm": 4.09686803817749, + "learning_rate": 9.794914155229124e-05, + "loss": 2.0196, + "step": 743 + }, + { + "epoch": 0.12003872216844143, + "grad_norm": 3.8854429721832275, + "learning_rate": 9.794172874425602e-05, + "loss": 2.1736, + "step": 744 + }, + { + "epoch": 0.1202000645369474, + "grad_norm": 4.692524433135986, + "learning_rate": 9.793430284514062e-05, + "loss": 2.2808, + "step": 745 + }, + { + "epoch": 0.12036140690545337, + "grad_norm": 4.936270236968994, + "learning_rate": 9.792686385697282e-05, + "loss": 2.1992, + "step": 746 + }, + { + "epoch": 0.12052274927395934, + "grad_norm": 4.312668323516846, + "learning_rate": 9.79194117817839e-05, + "loss": 2.1661, + "step": 747 + }, + { + "epoch": 0.12068409164246531, + "grad_norm": 6.014287948608398, + "learning_rate": 9.791194662160874e-05, + "loss": 2.1115, + "step": 748 + }, + { + "epoch": 0.12084543401097128, + "grad_norm": 4.760694980621338, + "learning_rate": 9.79044683784858e-05, + "loss": 2.0568, + "step": 749 + }, + { + "epoch": 0.12100677637947725, + "grad_norm": 4.866275310516357, + "learning_rate": 9.78969770544571e-05, + "loss": 2.0202, + "step": 750 + }, + { + "epoch": 0.12116811874798322, + "grad_norm": 5.796637535095215, + "learning_rate": 9.788947265156827e-05, + "loss": 2.2872, + "step": 751 + }, + { + "epoch": 0.12132946111648919, + "grad_norm": 5.722478866577148, + "learning_rate": 9.788195517186845e-05, + "loss": 2.2423, + "step": 752 + }, + { + "epoch": 0.12149080348499516, + "grad_norm": 6.461528778076172, + "learning_rate": 9.787442461741037e-05, + "loss": 2.3383, + "step": 753 + }, + { + "epoch": 0.12165214585350113, + "grad_norm": 4.787562370300293, + "learning_rate": 9.786688099025037e-05, + "loss": 2.3742, + "step": 754 + }, + { + "epoch": 0.1218134882220071, + "grad_norm": 5.212571144104004, + "learning_rate": 9.78593242924483e-05, + "loss": 2.2332, + "step": 755 + }, + { + "epoch": 0.12197483059051308, + "grad_norm": 5.459593296051025, + "learning_rate": 9.785175452606762e-05, + "loss": 2.0847, + "step": 756 + }, + { + "epoch": 0.12213617295901903, + "grad_norm": 7.659170627593994, + "learning_rate": 9.784417169317539e-05, + "loss": 2.2558, + "step": 757 + }, + { + "epoch": 0.122297515327525, + "grad_norm": 7.307011127471924, + "learning_rate": 9.783657579584213e-05, + "loss": 2.3545, + "step": 758 + }, + { + "epoch": 0.12245885769603097, + "grad_norm": 5.335236549377441, + "learning_rate": 9.782896683614204e-05, + "loss": 2.0521, + "step": 759 + }, + { + "epoch": 0.12262020006453694, + "grad_norm": 4.387326717376709, + "learning_rate": 9.782134481615281e-05, + "loss": 2.3457, + "step": 760 + }, + { + "epoch": 0.12278154243304291, + "grad_norm": 4.305408000946045, + "learning_rate": 9.781370973795576e-05, + "loss": 2.1857, + "step": 761 + }, + { + "epoch": 0.12294288480154889, + "grad_norm": 5.678192615509033, + "learning_rate": 9.780606160363572e-05, + "loss": 2.3804, + "step": 762 + }, + { + "epoch": 0.12310422717005486, + "grad_norm": 5.5075178146362305, + "learning_rate": 9.779840041528109e-05, + "loss": 2.1147, + "step": 763 + }, + { + "epoch": 0.12326556953856083, + "grad_norm": 6.328161239624023, + "learning_rate": 9.77907261749839e-05, + "loss": 2.4973, + "step": 764 + }, + { + "epoch": 0.1234269119070668, + "grad_norm": 4.356446743011475, + "learning_rate": 9.778303888483965e-05, + "loss": 2.2695, + "step": 765 + }, + { + "epoch": 0.12358825427557277, + "grad_norm": 4.555844783782959, + "learning_rate": 9.777533854694747e-05, + "loss": 2.5355, + "step": 766 + }, + { + "epoch": 0.12374959664407874, + "grad_norm": 5.903046131134033, + "learning_rate": 9.776762516341003e-05, + "loss": 2.0878, + "step": 767 + }, + { + "epoch": 0.12391093901258471, + "grad_norm": 6.699967384338379, + "learning_rate": 9.775989873633357e-05, + "loss": 2.1891, + "step": 768 + }, + { + "epoch": 0.12407228138109068, + "grad_norm": 4.067835330963135, + "learning_rate": 9.775215926782788e-05, + "loss": 2.1916, + "step": 769 + }, + { + "epoch": 0.12423362374959665, + "grad_norm": 5.333219528198242, + "learning_rate": 9.774440676000631e-05, + "loss": 2.2094, + "step": 770 + }, + { + "epoch": 0.12439496611810262, + "grad_norm": 3.9629952907562256, + "learning_rate": 9.773664121498579e-05, + "loss": 2.6525, + "step": 771 + }, + { + "epoch": 0.12455630848660858, + "grad_norm": 5.467307090759277, + "learning_rate": 9.772886263488679e-05, + "loss": 2.3546, + "step": 772 + }, + { + "epoch": 0.12471765085511455, + "grad_norm": 5.498898029327393, + "learning_rate": 9.772107102183336e-05, + "loss": 2.4372, + "step": 773 + }, + { + "epoch": 0.12487899322362052, + "grad_norm": 4.510770320892334, + "learning_rate": 9.771326637795308e-05, + "loss": 2.2908, + "step": 774 + }, + { + "epoch": 0.1250403355921265, + "grad_norm": 5.672365188598633, + "learning_rate": 9.770544870537711e-05, + "loss": 2.288, + "step": 775 + }, + { + "epoch": 0.12520167796063247, + "grad_norm": 5.459904193878174, + "learning_rate": 9.769761800624016e-05, + "loss": 2.1396, + "step": 776 + }, + { + "epoch": 0.12536302032913843, + "grad_norm": 3.5810322761535645, + "learning_rate": 9.768977428268051e-05, + "loss": 2.2168, + "step": 777 + }, + { + "epoch": 0.12552436269764441, + "grad_norm": 4.152242183685303, + "learning_rate": 9.768191753683998e-05, + "loss": 2.214, + "step": 778 + }, + { + "epoch": 0.12568570506615037, + "grad_norm": 6.909939765930176, + "learning_rate": 9.767404777086393e-05, + "loss": 2.1607, + "step": 779 + }, + { + "epoch": 0.12584704743465633, + "grad_norm": 6.329325199127197, + "learning_rate": 9.766616498690133e-05, + "loss": 2.4425, + "step": 780 + }, + { + "epoch": 0.1260083898031623, + "grad_norm": 3.946126937866211, + "learning_rate": 9.765826918710466e-05, + "loss": 2.1719, + "step": 781 + }, + { + "epoch": 0.12616973217166827, + "grad_norm": 4.491634368896484, + "learning_rate": 9.765036037362996e-05, + "loss": 2.2366, + "step": 782 + }, + { + "epoch": 0.12633107454017425, + "grad_norm": 4.341719150543213, + "learning_rate": 9.764243854863682e-05, + "loss": 2.2991, + "step": 783 + }, + { + "epoch": 0.1264924169086802, + "grad_norm": 6.946076393127441, + "learning_rate": 9.763450371428841e-05, + "loss": 2.2794, + "step": 784 + }, + { + "epoch": 0.1266537592771862, + "grad_norm": 5.051512241363525, + "learning_rate": 9.762655587275142e-05, + "loss": 2.0915, + "step": 785 + }, + { + "epoch": 0.12681510164569215, + "grad_norm": 3.9053547382354736, + "learning_rate": 9.761859502619612e-05, + "loss": 2.3005, + "step": 786 + }, + { + "epoch": 0.12697644401419814, + "grad_norm": 5.447360515594482, + "learning_rate": 9.761062117679632e-05, + "loss": 2.1376, + "step": 787 + }, + { + "epoch": 0.1271377863827041, + "grad_norm": 4.656970977783203, + "learning_rate": 9.760263432672936e-05, + "loss": 2.0274, + "step": 788 + }, + { + "epoch": 0.12729912875121008, + "grad_norm": 3.9776928424835205, + "learning_rate": 9.759463447817616e-05, + "loss": 2.3201, + "step": 789 + }, + { + "epoch": 0.12746047111971603, + "grad_norm": 4.438282489776611, + "learning_rate": 9.758662163332118e-05, + "loss": 2.2339, + "step": 790 + }, + { + "epoch": 0.12762181348822202, + "grad_norm": 4.395122051239014, + "learning_rate": 9.75785957943524e-05, + "loss": 2.3393, + "step": 791 + }, + { + "epoch": 0.12778315585672798, + "grad_norm": 5.506349563598633, + "learning_rate": 9.75705569634614e-05, + "loss": 2.1605, + "step": 792 + }, + { + "epoch": 0.12794449822523396, + "grad_norm": 5.949671268463135, + "learning_rate": 9.756250514284328e-05, + "loss": 2.3274, + "step": 793 + }, + { + "epoch": 0.12810584059373992, + "grad_norm": 4.912148475646973, + "learning_rate": 9.755444033469669e-05, + "loss": 2.1012, + "step": 794 + }, + { + "epoch": 0.12826718296224587, + "grad_norm": 7.435208320617676, + "learning_rate": 9.754636254122381e-05, + "loss": 2.3464, + "step": 795 + }, + { + "epoch": 0.12842852533075186, + "grad_norm": 4.349523067474365, + "learning_rate": 9.75382717646304e-05, + "loss": 2.0887, + "step": 796 + }, + { + "epoch": 0.12858986769925781, + "grad_norm": 4.330012321472168, + "learning_rate": 9.753016800712573e-05, + "loss": 2.2709, + "step": 797 + }, + { + "epoch": 0.1287512100677638, + "grad_norm": 5.999225616455078, + "learning_rate": 9.752205127092265e-05, + "loss": 1.959, + "step": 798 + }, + { + "epoch": 0.12891255243626976, + "grad_norm": 5.202185153961182, + "learning_rate": 9.751392155823752e-05, + "loss": 2.1121, + "step": 799 + }, + { + "epoch": 0.12907389480477574, + "grad_norm": 4.725948333740234, + "learning_rate": 9.750577887129027e-05, + "loss": 2.1956, + "step": 800 + }, + { + "epoch": 0.1292352371732817, + "grad_norm": 6.285621643066406, + "learning_rate": 9.749762321230433e-05, + "loss": 2.4139, + "step": 801 + }, + { + "epoch": 0.12939657954178768, + "grad_norm": 4.640841007232666, + "learning_rate": 9.748945458350673e-05, + "loss": 2.06, + "step": 802 + }, + { + "epoch": 0.12955792191029364, + "grad_norm": 3.547497510910034, + "learning_rate": 9.748127298712803e-05, + "loss": 2.2075, + "step": 803 + }, + { + "epoch": 0.12971926427879962, + "grad_norm": 4.867875099182129, + "learning_rate": 9.747307842540229e-05, + "loss": 2.2949, + "step": 804 + }, + { + "epoch": 0.12988060664730558, + "grad_norm": 5.806981086730957, + "learning_rate": 9.746487090056713e-05, + "loss": 2.2565, + "step": 805 + }, + { + "epoch": 0.13004194901581156, + "grad_norm": 4.769914150238037, + "learning_rate": 9.745665041486374e-05, + "loss": 2.1801, + "step": 806 + }, + { + "epoch": 0.13020329138431752, + "grad_norm": 4.5631890296936035, + "learning_rate": 9.744841697053681e-05, + "loss": 2.0623, + "step": 807 + }, + { + "epoch": 0.13036463375282348, + "grad_norm": 4.284159183502197, + "learning_rate": 9.744017056983459e-05, + "loss": 2.2616, + "step": 808 + }, + { + "epoch": 0.13052597612132946, + "grad_norm": 4.25748348236084, + "learning_rate": 9.743191121500887e-05, + "loss": 2.0961, + "step": 809 + }, + { + "epoch": 0.13068731848983542, + "grad_norm": 5.114495277404785, + "learning_rate": 9.742363890831494e-05, + "loss": 2.3635, + "step": 810 + }, + { + "epoch": 0.1308486608583414, + "grad_norm": 4.6019721031188965, + "learning_rate": 9.741535365201168e-05, + "loss": 2.3813, + "step": 811 + }, + { + "epoch": 0.13101000322684736, + "grad_norm": 4.860962390899658, + "learning_rate": 9.740705544836146e-05, + "loss": 2.0449, + "step": 812 + }, + { + "epoch": 0.13117134559535334, + "grad_norm": 3.8722214698791504, + "learning_rate": 9.739874429963023e-05, + "loss": 2.1397, + "step": 813 + }, + { + "epoch": 0.1313326879638593, + "grad_norm": 5.776529312133789, + "learning_rate": 9.739042020808746e-05, + "loss": 2.1399, + "step": 814 + }, + { + "epoch": 0.13149403033236529, + "grad_norm": 5.5494232177734375, + "learning_rate": 9.73820831760061e-05, + "loss": 2.3765, + "step": 815 + }, + { + "epoch": 0.13165537270087124, + "grad_norm": 5.315171241760254, + "learning_rate": 9.737373320566272e-05, + "loss": 2.1391, + "step": 816 + }, + { + "epoch": 0.13181671506937723, + "grad_norm": 4.855938911437988, + "learning_rate": 9.736537029933738e-05, + "loss": 2.1547, + "step": 817 + }, + { + "epoch": 0.13197805743788318, + "grad_norm": 6.183468341827393, + "learning_rate": 9.735699445931365e-05, + "loss": 2.1848, + "step": 818 + }, + { + "epoch": 0.13213939980638917, + "grad_norm": 7.368840217590332, + "learning_rate": 9.734860568787868e-05, + "loss": 1.9264, + "step": 819 + }, + { + "epoch": 0.13230074217489513, + "grad_norm": 7.0486741065979, + "learning_rate": 9.734020398732311e-05, + "loss": 2.4843, + "step": 820 + }, + { + "epoch": 0.1324620845434011, + "grad_norm": 6.184607028961182, + "learning_rate": 9.733178935994115e-05, + "loss": 2.1397, + "step": 821 + }, + { + "epoch": 0.13262342691190707, + "grad_norm": 4.931450366973877, + "learning_rate": 9.73233618080305e-05, + "loss": 2.1738, + "step": 822 + }, + { + "epoch": 0.13278476928041302, + "grad_norm": 4.700014114379883, + "learning_rate": 9.73149213338924e-05, + "loss": 1.9448, + "step": 823 + }, + { + "epoch": 0.132946111648919, + "grad_norm": 4.283225059509277, + "learning_rate": 9.730646793983165e-05, + "loss": 2.2092, + "step": 824 + }, + { + "epoch": 0.13310745401742496, + "grad_norm": 5.251535892486572, + "learning_rate": 9.729800162815652e-05, + "loss": 2.2563, + "step": 825 + }, + { + "epoch": 0.13326879638593095, + "grad_norm": 4.738597393035889, + "learning_rate": 9.728952240117888e-05, + "loss": 2.2449, + "step": 826 + }, + { + "epoch": 0.1334301387544369, + "grad_norm": 3.9104974269866943, + "learning_rate": 9.728103026121407e-05, + "loss": 2.1812, + "step": 827 + }, + { + "epoch": 0.1335914811229429, + "grad_norm": 5.504037857055664, + "learning_rate": 9.727252521058097e-05, + "loss": 2.378, + "step": 828 + }, + { + "epoch": 0.13375282349144885, + "grad_norm": 5.019674777984619, + "learning_rate": 9.726400725160198e-05, + "loss": 2.162, + "step": 829 + }, + { + "epoch": 0.13391416585995483, + "grad_norm": 4.628396511077881, + "learning_rate": 9.725547638660305e-05, + "loss": 2.2211, + "step": 830 + }, + { + "epoch": 0.1340755082284608, + "grad_norm": 4.508102893829346, + "learning_rate": 9.724693261791364e-05, + "loss": 2.0691, + "step": 831 + }, + { + "epoch": 0.13423685059696677, + "grad_norm": 4.798797607421875, + "learning_rate": 9.723837594786672e-05, + "loss": 2.0428, + "step": 832 + }, + { + "epoch": 0.13439819296547273, + "grad_norm": 4.994052886962891, + "learning_rate": 9.722980637879879e-05, + "loss": 2.328, + "step": 833 + }, + { + "epoch": 0.1345595353339787, + "grad_norm": 6.477321624755859, + "learning_rate": 9.722122391304988e-05, + "loss": 2.5859, + "step": 834 + }, + { + "epoch": 0.13472087770248467, + "grad_norm": 4.700345993041992, + "learning_rate": 9.721262855296357e-05, + "loss": 2.0088, + "step": 835 + }, + { + "epoch": 0.13488222007099065, + "grad_norm": 5.024048805236816, + "learning_rate": 9.72040203008869e-05, + "loss": 2.4683, + "step": 836 + }, + { + "epoch": 0.1350435624394966, + "grad_norm": 4.9408979415893555, + "learning_rate": 9.719539915917043e-05, + "loss": 2.2342, + "step": 837 + }, + { + "epoch": 0.13520490480800257, + "grad_norm": 4.616140842437744, + "learning_rate": 9.718676513016832e-05, + "loss": 2.1205, + "step": 838 + }, + { + "epoch": 0.13536624717650855, + "grad_norm": 5.232850551605225, + "learning_rate": 9.717811821623817e-05, + "loss": 2.1548, + "step": 839 + }, + { + "epoch": 0.1355275895450145, + "grad_norm": 5.0068583488464355, + "learning_rate": 9.716945841974115e-05, + "loss": 2.0153, + "step": 840 + }, + { + "epoch": 0.1356889319135205, + "grad_norm": 5.102565288543701, + "learning_rate": 9.716078574304189e-05, + "loss": 2.1854, + "step": 841 + }, + { + "epoch": 0.13585027428202645, + "grad_norm": 5.298509120941162, + "learning_rate": 9.715210018850859e-05, + "loss": 2.1734, + "step": 842 + }, + { + "epoch": 0.13601161665053244, + "grad_norm": 5.077696800231934, + "learning_rate": 9.714340175851297e-05, + "loss": 2.0086, + "step": 843 + }, + { + "epoch": 0.1361729590190384, + "grad_norm": 4.419206619262695, + "learning_rate": 9.713469045543022e-05, + "loss": 1.9763, + "step": 844 + }, + { + "epoch": 0.13633430138754438, + "grad_norm": 5.292459011077881, + "learning_rate": 9.712596628163906e-05, + "loss": 2.1624, + "step": 845 + }, + { + "epoch": 0.13649564375605033, + "grad_norm": 5.468176364898682, + "learning_rate": 9.711722923952173e-05, + "loss": 2.0919, + "step": 846 + }, + { + "epoch": 0.13665698612455632, + "grad_norm": 4.132210731506348, + "learning_rate": 9.710847933146403e-05, + "loss": 2.2685, + "step": 847 + }, + { + "epoch": 0.13681832849306227, + "grad_norm": 4.604116916656494, + "learning_rate": 9.709971655985518e-05, + "loss": 2.3708, + "step": 848 + }, + { + "epoch": 0.13697967086156826, + "grad_norm": 4.225286960601807, + "learning_rate": 9.709094092708799e-05, + "loss": 2.3804, + "step": 849 + }, + { + "epoch": 0.13714101323007422, + "grad_norm": 7.442013740539551, + "learning_rate": 9.708215243555875e-05, + "loss": 2.2357, + "step": 850 + }, + { + "epoch": 0.1373023555985802, + "grad_norm": 6.1611127853393555, + "learning_rate": 9.707335108766726e-05, + "loss": 2.1841, + "step": 851 + }, + { + "epoch": 0.13746369796708616, + "grad_norm": 5.973389148712158, + "learning_rate": 9.706453688581684e-05, + "loss": 2.0254, + "step": 852 + }, + { + "epoch": 0.1376250403355921, + "grad_norm": 5.30540657043457, + "learning_rate": 9.705570983241432e-05, + "loss": 2.277, + "step": 853 + }, + { + "epoch": 0.1377863827040981, + "grad_norm": 5.778249740600586, + "learning_rate": 9.704686992987005e-05, + "loss": 2.2288, + "step": 854 + }, + { + "epoch": 0.13794772507260405, + "grad_norm": 6.66129207611084, + "learning_rate": 9.703801718059783e-05, + "loss": 2.2849, + "step": 855 + }, + { + "epoch": 0.13810906744111004, + "grad_norm": 5.594348907470703, + "learning_rate": 9.702915158701506e-05, + "loss": 2.1925, + "step": 856 + }, + { + "epoch": 0.138270409809616, + "grad_norm": 4.9399614334106445, + "learning_rate": 9.702027315154257e-05, + "loss": 2.1171, + "step": 857 + }, + { + "epoch": 0.13843175217812198, + "grad_norm": 6.186155796051025, + "learning_rate": 9.701138187660473e-05, + "loss": 2.3412, + "step": 858 + }, + { + "epoch": 0.13859309454662794, + "grad_norm": 7.687668800354004, + "learning_rate": 9.700247776462943e-05, + "loss": 2.2036, + "step": 859 + }, + { + "epoch": 0.13875443691513392, + "grad_norm": 10.154778480529785, + "learning_rate": 9.699356081804803e-05, + "loss": 2.5356, + "step": 860 + }, + { + "epoch": 0.13891577928363988, + "grad_norm": 9.834787368774414, + "learning_rate": 9.698463103929542e-05, + "loss": 2.1638, + "step": 861 + }, + { + "epoch": 0.13907712165214586, + "grad_norm": 6.674441337585449, + "learning_rate": 9.697568843081e-05, + "loss": 2.0882, + "step": 862 + }, + { + "epoch": 0.13923846402065182, + "grad_norm": 4.7841315269470215, + "learning_rate": 9.696673299503361e-05, + "loss": 2.0868, + "step": 863 + }, + { + "epoch": 0.1393998063891578, + "grad_norm": 5.529294013977051, + "learning_rate": 9.695776473441169e-05, + "loss": 2.1338, + "step": 864 + }, + { + "epoch": 0.13956114875766376, + "grad_norm": 5.372847557067871, + "learning_rate": 9.694878365139313e-05, + "loss": 2.2776, + "step": 865 + }, + { + "epoch": 0.13972249112616975, + "grad_norm": 5.5793352127075195, + "learning_rate": 9.693978974843032e-05, + "loss": 2.3289, + "step": 866 + }, + { + "epoch": 0.1398838334946757, + "grad_norm": 4.251262664794922, + "learning_rate": 9.693078302797914e-05, + "loss": 2.059, + "step": 867 + }, + { + "epoch": 0.14004517586318166, + "grad_norm": 5.220300674438477, + "learning_rate": 9.6921763492499e-05, + "loss": 2.5555, + "step": 868 + }, + { + "epoch": 0.14020651823168764, + "grad_norm": 5.264746189117432, + "learning_rate": 9.691273114445278e-05, + "loss": 2.3829, + "step": 869 + }, + { + "epoch": 0.1403678606001936, + "grad_norm": 4.5468974113464355, + "learning_rate": 9.69036859863069e-05, + "loss": 2.1567, + "step": 870 + }, + { + "epoch": 0.14052920296869958, + "grad_norm": 6.331204414367676, + "learning_rate": 9.689462802053125e-05, + "loss": 2.1218, + "step": 871 + }, + { + "epoch": 0.14069054533720554, + "grad_norm": 7.563364505767822, + "learning_rate": 9.688555724959918e-05, + "loss": 2.279, + "step": 872 + }, + { + "epoch": 0.14085188770571153, + "grad_norm": 5.659290790557861, + "learning_rate": 9.687647367598762e-05, + "loss": 2.4341, + "step": 873 + }, + { + "epoch": 0.14101323007421748, + "grad_norm": 4.670441150665283, + "learning_rate": 9.686737730217695e-05, + "loss": 2.3573, + "step": 874 + }, + { + "epoch": 0.14117457244272347, + "grad_norm": 3.848391056060791, + "learning_rate": 9.685826813065102e-05, + "loss": 2.368, + "step": 875 + }, + { + "epoch": 0.14133591481122942, + "grad_norm": 4.092101573944092, + "learning_rate": 9.684914616389721e-05, + "loss": 2.124, + "step": 876 + }, + { + "epoch": 0.1414972571797354, + "grad_norm": 4.652859687805176, + "learning_rate": 9.684001140440639e-05, + "loss": 2.1059, + "step": 877 + }, + { + "epoch": 0.14165859954824137, + "grad_norm": 5.927852630615234, + "learning_rate": 9.683086385467293e-05, + "loss": 2.2456, + "step": 878 + }, + { + "epoch": 0.14181994191674735, + "grad_norm": 7.353157997131348, + "learning_rate": 9.682170351719465e-05, + "loss": 2.3135, + "step": 879 + }, + { + "epoch": 0.1419812842852533, + "grad_norm": 4.424384593963623, + "learning_rate": 9.681253039447294e-05, + "loss": 2.2472, + "step": 880 + }, + { + "epoch": 0.1421426266537593, + "grad_norm": 8.861557006835938, + "learning_rate": 9.680334448901258e-05, + "loss": 2.0083, + "step": 881 + }, + { + "epoch": 0.14230396902226525, + "grad_norm": 5.569779872894287, + "learning_rate": 9.679414580332194e-05, + "loss": 2.1191, + "step": 882 + }, + { + "epoch": 0.1424653113907712, + "grad_norm": 6.622554779052734, + "learning_rate": 9.67849343399128e-05, + "loss": 2.4547, + "step": 883 + }, + { + "epoch": 0.1426266537592772, + "grad_norm": 6.614085674285889, + "learning_rate": 9.67757101013005e-05, + "loss": 2.1192, + "step": 884 + }, + { + "epoch": 0.14278799612778315, + "grad_norm": 4.401320934295654, + "learning_rate": 9.676647309000379e-05, + "loss": 2.0062, + "step": 885 + }, + { + "epoch": 0.14294933849628913, + "grad_norm": 4.417647361755371, + "learning_rate": 9.6757223308545e-05, + "loss": 2.3803, + "step": 886 + }, + { + "epoch": 0.1431106808647951, + "grad_norm": 6.003297328948975, + "learning_rate": 9.674796075944985e-05, + "loss": 2.3687, + "step": 887 + }, + { + "epoch": 0.14327202323330107, + "grad_norm": 5.431515693664551, + "learning_rate": 9.673868544524762e-05, + "loss": 2.4209, + "step": 888 + }, + { + "epoch": 0.14343336560180703, + "grad_norm": 3.7381820678710938, + "learning_rate": 9.672939736847103e-05, + "loss": 2.1941, + "step": 889 + }, + { + "epoch": 0.143594707970313, + "grad_norm": 5.933220386505127, + "learning_rate": 9.672009653165632e-05, + "loss": 2.1986, + "step": 890 + }, + { + "epoch": 0.14375605033881897, + "grad_norm": 3.514514684677124, + "learning_rate": 9.67107829373432e-05, + "loss": 1.9916, + "step": 891 + }, + { + "epoch": 0.14391739270732495, + "grad_norm": 5.188986301422119, + "learning_rate": 9.670145658807485e-05, + "loss": 2.1056, + "step": 892 + }, + { + "epoch": 0.1440787350758309, + "grad_norm": 4.180763244628906, + "learning_rate": 9.669211748639795e-05, + "loss": 2.2095, + "step": 893 + }, + { + "epoch": 0.1442400774443369, + "grad_norm": 3.691096544265747, + "learning_rate": 9.668276563486266e-05, + "loss": 2.35, + "step": 894 + }, + { + "epoch": 0.14440141981284285, + "grad_norm": 4.197265148162842, + "learning_rate": 9.667340103602261e-05, + "loss": 2.1548, + "step": 895 + }, + { + "epoch": 0.1445627621813488, + "grad_norm": 4.846789360046387, + "learning_rate": 9.666402369243492e-05, + "loss": 2.2217, + "step": 896 + }, + { + "epoch": 0.1447241045498548, + "grad_norm": 3.7879931926727295, + "learning_rate": 9.665463360666021e-05, + "loss": 2.2129, + "step": 897 + }, + { + "epoch": 0.14488544691836075, + "grad_norm": 6.429107666015625, + "learning_rate": 9.664523078126253e-05, + "loss": 2.1699, + "step": 898 + }, + { + "epoch": 0.14504678928686673, + "grad_norm": 5.464096546173096, + "learning_rate": 9.663581521880945e-05, + "loss": 2.094, + "step": 899 + }, + { + "epoch": 0.1452081316553727, + "grad_norm": 5.067931175231934, + "learning_rate": 9.6626386921872e-05, + "loss": 2.3702, + "step": 900 + }, + { + "epoch": 0.14536947402387868, + "grad_norm": 5.132601261138916, + "learning_rate": 9.661694589302471e-05, + "loss": 2.0874, + "step": 901 + }, + { + "epoch": 0.14553081639238463, + "grad_norm": 5.8880391120910645, + "learning_rate": 9.660749213484555e-05, + "loss": 2.1288, + "step": 902 + }, + { + "epoch": 0.14569215876089062, + "grad_norm": 4.455813884735107, + "learning_rate": 9.6598025649916e-05, + "loss": 2.197, + "step": 903 + }, + { + "epoch": 0.14585350112939657, + "grad_norm": 4.309800624847412, + "learning_rate": 9.658854644082098e-05, + "loss": 2.3049, + "step": 904 + }, + { + "epoch": 0.14601484349790256, + "grad_norm": 6.354419708251953, + "learning_rate": 9.657905451014893e-05, + "loss": 2.048, + "step": 905 + }, + { + "epoch": 0.14617618586640851, + "grad_norm": 4.877176761627197, + "learning_rate": 9.656954986049171e-05, + "loss": 2.2906, + "step": 906 + }, + { + "epoch": 0.1463375282349145, + "grad_norm": 4.866892337799072, + "learning_rate": 9.656003249444471e-05, + "loss": 2.2998, + "step": 907 + }, + { + "epoch": 0.14649887060342046, + "grad_norm": 3.9700560569763184, + "learning_rate": 9.655050241460675e-05, + "loss": 2.3659, + "step": 908 + }, + { + "epoch": 0.14666021297192644, + "grad_norm": 5.312385082244873, + "learning_rate": 9.654095962358014e-05, + "loss": 2.1026, + "step": 909 + }, + { + "epoch": 0.1468215553404324, + "grad_norm": 5.85587215423584, + "learning_rate": 9.653140412397064e-05, + "loss": 2.0669, + "step": 910 + }, + { + "epoch": 0.14698289770893835, + "grad_norm": 5.001843452453613, + "learning_rate": 9.652183591838752e-05, + "loss": 2.2449, + "step": 911 + }, + { + "epoch": 0.14714424007744434, + "grad_norm": 4.770449638366699, + "learning_rate": 9.65122550094435e-05, + "loss": 2.1298, + "step": 912 + }, + { + "epoch": 0.1473055824459503, + "grad_norm": 6.339503288269043, + "learning_rate": 9.650266139975474e-05, + "loss": 2.2039, + "step": 913 + }, + { + "epoch": 0.14746692481445628, + "grad_norm": 5.899519920349121, + "learning_rate": 9.649305509194092e-05, + "loss": 2.0838, + "step": 914 + }, + { + "epoch": 0.14762826718296224, + "grad_norm": 6.260024547576904, + "learning_rate": 9.648343608862515e-05, + "loss": 2.2062, + "step": 915 + }, + { + "epoch": 0.14778960955146822, + "grad_norm": 4.567520618438721, + "learning_rate": 9.647380439243399e-05, + "loss": 2.1741, + "step": 916 + }, + { + "epoch": 0.14795095191997418, + "grad_norm": 4.420521259307861, + "learning_rate": 9.646416000599754e-05, + "loss": 2.3293, + "step": 917 + }, + { + "epoch": 0.14811229428848016, + "grad_norm": 4.748103618621826, + "learning_rate": 9.64545029319493e-05, + "loss": 2.1784, + "step": 918 + }, + { + "epoch": 0.14827363665698612, + "grad_norm": 6.980162620544434, + "learning_rate": 9.644483317292623e-05, + "loss": 2.1, + "step": 919 + }, + { + "epoch": 0.1484349790254921, + "grad_norm": 5.445437431335449, + "learning_rate": 9.643515073156881e-05, + "loss": 2.2574, + "step": 920 + }, + { + "epoch": 0.14859632139399806, + "grad_norm": 6.650381088256836, + "learning_rate": 9.642545561052095e-05, + "loss": 2.0731, + "step": 921 + }, + { + "epoch": 0.14875766376250404, + "grad_norm": 3.9137048721313477, + "learning_rate": 9.641574781242999e-05, + "loss": 2.1378, + "step": 922 + }, + { + "epoch": 0.14891900613101, + "grad_norm": 6.624279499053955, + "learning_rate": 9.640602733994679e-05, + "loss": 2.2704, + "step": 923 + }, + { + "epoch": 0.14908034849951599, + "grad_norm": 4.790276050567627, + "learning_rate": 9.639629419572565e-05, + "loss": 2.2818, + "step": 924 + }, + { + "epoch": 0.14924169086802194, + "grad_norm": 4.834343433380127, + "learning_rate": 9.638654838242429e-05, + "loss": 2.266, + "step": 925 + }, + { + "epoch": 0.1494030332365279, + "grad_norm": 5.247141361236572, + "learning_rate": 9.637678990270396e-05, + "loss": 2.2551, + "step": 926 + }, + { + "epoch": 0.14956437560503388, + "grad_norm": 5.176708698272705, + "learning_rate": 9.636701875922933e-05, + "loss": 2.2663, + "step": 927 + }, + { + "epoch": 0.14972571797353984, + "grad_norm": 5.154667377471924, + "learning_rate": 9.635723495466851e-05, + "loss": 2.3111, + "step": 928 + }, + { + "epoch": 0.14988706034204582, + "grad_norm": 3.496387004852295, + "learning_rate": 9.63474384916931e-05, + "loss": 2.2975, + "step": 929 + }, + { + "epoch": 0.15004840271055178, + "grad_norm": 6.210982799530029, + "learning_rate": 9.633762937297814e-05, + "loss": 2.1714, + "step": 930 + }, + { + "epoch": 0.15020974507905777, + "grad_norm": 4.316873550415039, + "learning_rate": 9.632780760120215e-05, + "loss": 2.1158, + "step": 931 + }, + { + "epoch": 0.15037108744756372, + "grad_norm": 4.4215874671936035, + "learning_rate": 9.631797317904707e-05, + "loss": 2.2234, + "step": 932 + }, + { + "epoch": 0.1505324298160697, + "grad_norm": 6.841871738433838, + "learning_rate": 9.630812610919832e-05, + "loss": 2.1703, + "step": 933 + }, + { + "epoch": 0.15069377218457566, + "grad_norm": 4.790676116943359, + "learning_rate": 9.629826639434475e-05, + "loss": 2.0918, + "step": 934 + }, + { + "epoch": 0.15085511455308165, + "grad_norm": 5.36135721206665, + "learning_rate": 9.628839403717868e-05, + "loss": 2.002, + "step": 935 + }, + { + "epoch": 0.1510164569215876, + "grad_norm": 4.241684436798096, + "learning_rate": 9.627850904039588e-05, + "loss": 2.2189, + "step": 936 + }, + { + "epoch": 0.1511777992900936, + "grad_norm": 3.8772034645080566, + "learning_rate": 9.626861140669558e-05, + "loss": 2.3375, + "step": 937 + }, + { + "epoch": 0.15133914165859955, + "grad_norm": 6.061500549316406, + "learning_rate": 9.625870113878044e-05, + "loss": 2.1103, + "step": 938 + }, + { + "epoch": 0.15150048402710553, + "grad_norm": 4.246634006500244, + "learning_rate": 9.624877823935659e-05, + "loss": 2.2886, + "step": 939 + }, + { + "epoch": 0.1516618263956115, + "grad_norm": 4.799205780029297, + "learning_rate": 9.623884271113359e-05, + "loss": 2.2289, + "step": 940 + }, + { + "epoch": 0.15182316876411744, + "grad_norm": 4.209903240203857, + "learning_rate": 9.622889455682446e-05, + "loss": 2.114, + "step": 941 + }, + { + "epoch": 0.15198451113262343, + "grad_norm": 4.7471604347229, + "learning_rate": 9.621893377914567e-05, + "loss": 2.4034, + "step": 942 + }, + { + "epoch": 0.15214585350112939, + "grad_norm": 4.916505813598633, + "learning_rate": 9.620896038081713e-05, + "loss": 2.1852, + "step": 943 + }, + { + "epoch": 0.15230719586963537, + "grad_norm": 5.238925933837891, + "learning_rate": 9.619897436456221e-05, + "loss": 2.2174, + "step": 944 + }, + { + "epoch": 0.15246853823814133, + "grad_norm": 4.896865367889404, + "learning_rate": 9.61889757331077e-05, + "loss": 2.0628, + "step": 945 + }, + { + "epoch": 0.1526298806066473, + "grad_norm": 4.1599884033203125, + "learning_rate": 9.617896448918386e-05, + "loss": 2.1821, + "step": 946 + }, + { + "epoch": 0.15279122297515327, + "grad_norm": 5.072725772857666, + "learning_rate": 9.616894063552438e-05, + "loss": 2.3195, + "step": 947 + }, + { + "epoch": 0.15295256534365925, + "grad_norm": 4.6819071769714355, + "learning_rate": 9.615890417486639e-05, + "loss": 2.1734, + "step": 948 + }, + { + "epoch": 0.1531139077121652, + "grad_norm": 4.185474872589111, + "learning_rate": 9.614885510995047e-05, + "loss": 2.0745, + "step": 949 + }, + { + "epoch": 0.1532752500806712, + "grad_norm": 4.95952844619751, + "learning_rate": 9.613879344352066e-05, + "loss": 2.4051, + "step": 950 + }, + { + "epoch": 0.15343659244917715, + "grad_norm": 5.198223114013672, + "learning_rate": 9.61287191783244e-05, + "loss": 2.4239, + "step": 951 + }, + { + "epoch": 0.15359793481768313, + "grad_norm": 7.1581220626831055, + "learning_rate": 9.611863231711261e-05, + "loss": 2.254, + "step": 952 + }, + { + "epoch": 0.1537592771861891, + "grad_norm": 5.759435653686523, + "learning_rate": 9.610853286263963e-05, + "loss": 2.1716, + "step": 953 + }, + { + "epoch": 0.15392061955469508, + "grad_norm": 5.360166549682617, + "learning_rate": 9.609842081766321e-05, + "loss": 2.0779, + "step": 954 + }, + { + "epoch": 0.15408196192320103, + "grad_norm": 5.1558146476745605, + "learning_rate": 9.608829618494462e-05, + "loss": 2.4345, + "step": 955 + }, + { + "epoch": 0.154243304291707, + "grad_norm": 6.02689266204834, + "learning_rate": 9.607815896724846e-05, + "loss": 2.2129, + "step": 956 + }, + { + "epoch": 0.15440464666021297, + "grad_norm": 5.901403427124023, + "learning_rate": 9.606800916734286e-05, + "loss": 2.1403, + "step": 957 + }, + { + "epoch": 0.15456598902871893, + "grad_norm": 5.751607894897461, + "learning_rate": 9.605784678799934e-05, + "loss": 2.28, + "step": 958 + }, + { + "epoch": 0.15472733139722492, + "grad_norm": 4.285133361816406, + "learning_rate": 9.604767183199287e-05, + "loss": 2.0941, + "step": 959 + }, + { + "epoch": 0.15488867376573087, + "grad_norm": 4.502032279968262, + "learning_rate": 9.603748430210183e-05, + "loss": 2.2289, + "step": 960 + }, + { + "epoch": 0.15505001613423686, + "grad_norm": 4.904419422149658, + "learning_rate": 9.602728420110806e-05, + "loss": 2.0168, + "step": 961 + }, + { + "epoch": 0.1552113585027428, + "grad_norm": 4.548860549926758, + "learning_rate": 9.601707153179682e-05, + "loss": 2.139, + "step": 962 + }, + { + "epoch": 0.1553727008712488, + "grad_norm": 4.90744686126709, + "learning_rate": 9.600684629695682e-05, + "loss": 2.0714, + "step": 963 + }, + { + "epoch": 0.15553404323975475, + "grad_norm": 4.9324235916137695, + "learning_rate": 9.599660849938016e-05, + "loss": 2.038, + "step": 964 + }, + { + "epoch": 0.15569538560826074, + "grad_norm": 6.045592784881592, + "learning_rate": 9.598635814186241e-05, + "loss": 1.9441, + "step": 965 + }, + { + "epoch": 0.1558567279767667, + "grad_norm": 4.217911243438721, + "learning_rate": 9.597609522720257e-05, + "loss": 2.0476, + "step": 966 + }, + { + "epoch": 0.15601807034527268, + "grad_norm": 7.424112319946289, + "learning_rate": 9.596581975820303e-05, + "loss": 2.3635, + "step": 967 + }, + { + "epoch": 0.15617941271377864, + "grad_norm": 3.862766742706299, + "learning_rate": 9.595553173766965e-05, + "loss": 2.0969, + "step": 968 + }, + { + "epoch": 0.15634075508228462, + "grad_norm": 6.841447830200195, + "learning_rate": 9.59452311684117e-05, + "loss": 2.4406, + "step": 969 + }, + { + "epoch": 0.15650209745079058, + "grad_norm": 5.120818138122559, + "learning_rate": 9.593491805324189e-05, + "loss": 1.978, + "step": 970 + }, + { + "epoch": 0.15666343981929653, + "grad_norm": 4.624609470367432, + "learning_rate": 9.59245923949763e-05, + "loss": 2.2101, + "step": 971 + }, + { + "epoch": 0.15682478218780252, + "grad_norm": 5.428524971008301, + "learning_rate": 9.591425419643452e-05, + "loss": 2.3038, + "step": 972 + }, + { + "epoch": 0.15698612455630848, + "grad_norm": 5.182520866394043, + "learning_rate": 9.590390346043951e-05, + "loss": 2.06, + "step": 973 + }, + { + "epoch": 0.15714746692481446, + "grad_norm": 4.55204963684082, + "learning_rate": 9.589354018981767e-05, + "loss": 2.2339, + "step": 974 + }, + { + "epoch": 0.15730880929332042, + "grad_norm": 4.689589023590088, + "learning_rate": 9.58831643873988e-05, + "loss": 2.0099, + "step": 975 + }, + { + "epoch": 0.1574701516618264, + "grad_norm": 4.149195194244385, + "learning_rate": 9.587277605601617e-05, + "loss": 2.1516, + "step": 976 + }, + { + "epoch": 0.15763149403033236, + "grad_norm": 4.985833644866943, + "learning_rate": 9.58623751985064e-05, + "loss": 2.2491, + "step": 977 + }, + { + "epoch": 0.15779283639883834, + "grad_norm": 5.3002800941467285, + "learning_rate": 9.585196181770963e-05, + "loss": 2.4263, + "step": 978 + }, + { + "epoch": 0.1579541787673443, + "grad_norm": 6.448243618011475, + "learning_rate": 9.584153591646932e-05, + "loss": 2.2362, + "step": 979 + }, + { + "epoch": 0.15811552113585028, + "grad_norm": 7.038323879241943, + "learning_rate": 9.583109749763239e-05, + "loss": 2.3127, + "step": 980 + }, + { + "epoch": 0.15827686350435624, + "grad_norm": 5.628559112548828, + "learning_rate": 9.582064656404921e-05, + "loss": 2.2545, + "step": 981 + }, + { + "epoch": 0.15843820587286223, + "grad_norm": 4.454617023468018, + "learning_rate": 9.58101831185735e-05, + "loss": 2.1904, + "step": 982 + }, + { + "epoch": 0.15859954824136818, + "grad_norm": 5.519943714141846, + "learning_rate": 9.579970716406245e-05, + "loss": 2.2977, + "step": 983 + }, + { + "epoch": 0.15876089060987417, + "grad_norm": 5.4137349128723145, + "learning_rate": 9.578921870337667e-05, + "loss": 1.9913, + "step": 984 + }, + { + "epoch": 0.15892223297838012, + "grad_norm": 5.2817583084106445, + "learning_rate": 9.577871773938011e-05, + "loss": 2.0741, + "step": 985 + }, + { + "epoch": 0.15908357534688608, + "grad_norm": 4.221789836883545, + "learning_rate": 9.576820427494025e-05, + "loss": 2.3467, + "step": 986 + }, + { + "epoch": 0.15924491771539206, + "grad_norm": 5.049351692199707, + "learning_rate": 9.575767831292788e-05, + "loss": 2.0332, + "step": 987 + }, + { + "epoch": 0.15940626008389802, + "grad_norm": 5.472332000732422, + "learning_rate": 9.574713985621725e-05, + "loss": 2.2033, + "step": 988 + }, + { + "epoch": 0.159567602452404, + "grad_norm": 4.4902849197387695, + "learning_rate": 9.573658890768602e-05, + "loss": 1.872, + "step": 989 + }, + { + "epoch": 0.15972894482090996, + "grad_norm": 6.237600326538086, + "learning_rate": 9.572602547021526e-05, + "loss": 2.4649, + "step": 990 + }, + { + "epoch": 0.15989028718941595, + "grad_norm": 4.866677761077881, + "learning_rate": 9.571544954668945e-05, + "loss": 2.2058, + "step": 991 + }, + { + "epoch": 0.1600516295579219, + "grad_norm": 4.447527885437012, + "learning_rate": 9.570486113999646e-05, + "loss": 2.2357, + "step": 992 + }, + { + "epoch": 0.1602129719264279, + "grad_norm": 5.310793876647949, + "learning_rate": 9.569426025302759e-05, + "loss": 2.3725, + "step": 993 + }, + { + "epoch": 0.16037431429493385, + "grad_norm": 5.911520481109619, + "learning_rate": 9.568364688867757e-05, + "loss": 2.1938, + "step": 994 + }, + { + "epoch": 0.16053565666343983, + "grad_norm": 4.786045551300049, + "learning_rate": 9.567302104984446e-05, + "loss": 2.1026, + "step": 995 + }, + { + "epoch": 0.1606969990319458, + "grad_norm": 3.7093453407287598, + "learning_rate": 9.566238273942982e-05, + "loss": 2.1269, + "step": 996 + }, + { + "epoch": 0.16085834140045177, + "grad_norm": 5.790320873260498, + "learning_rate": 9.565173196033855e-05, + "loss": 2.136, + "step": 997 + }, + { + "epoch": 0.16101968376895773, + "grad_norm": 7.40656852722168, + "learning_rate": 9.564106871547899e-05, + "loss": 2.1995, + "step": 998 + }, + { + "epoch": 0.16118102613746368, + "grad_norm": 4.74056339263916, + "learning_rate": 9.563039300776287e-05, + "loss": 2.0488, + "step": 999 + }, + { + "epoch": 0.16134236850596967, + "grad_norm": 6.514090538024902, + "learning_rate": 9.56197048401053e-05, + "loss": 2.2703, + "step": 1000 + }, + { + "epoch": 0.16150371087447563, + "grad_norm": 6.964737892150879, + "learning_rate": 9.560900421542483e-05, + "loss": 1.9261, + "step": 1001 + }, + { + "epoch": 0.1616650532429816, + "grad_norm": 5.097550868988037, + "learning_rate": 9.55982911366434e-05, + "loss": 2.4466, + "step": 1002 + }, + { + "epoch": 0.16182639561148757, + "grad_norm": 5.489377021789551, + "learning_rate": 9.558756560668636e-05, + "loss": 2.5064, + "step": 1003 + }, + { + "epoch": 0.16198773797999355, + "grad_norm": 4.444252967834473, + "learning_rate": 9.557682762848244e-05, + "loss": 2.2717, + "step": 1004 + }, + { + "epoch": 0.1621490803484995, + "grad_norm": 5.357511043548584, + "learning_rate": 9.556607720496376e-05, + "loss": 2.1742, + "step": 1005 + }, + { + "epoch": 0.1623104227170055, + "grad_norm": 5.26129150390625, + "learning_rate": 9.555531433906587e-05, + "loss": 2.2445, + "step": 1006 + }, + { + "epoch": 0.16247176508551145, + "grad_norm": 3.4206292629241943, + "learning_rate": 9.55445390337277e-05, + "loss": 2.3338, + "step": 1007 + }, + { + "epoch": 0.16263310745401743, + "grad_norm": 4.6428446769714355, + "learning_rate": 9.55337512918916e-05, + "loss": 2.0477, + "step": 1008 + }, + { + "epoch": 0.1627944498225234, + "grad_norm": 4.148924350738525, + "learning_rate": 9.552295111650326e-05, + "loss": 2.3308, + "step": 1009 + }, + { + "epoch": 0.16295579219102937, + "grad_norm": 4.782018661499023, + "learning_rate": 9.551213851051183e-05, + "loss": 2.0843, + "step": 1010 + }, + { + "epoch": 0.16311713455953533, + "grad_norm": 5.063188076019287, + "learning_rate": 9.550131347686981e-05, + "loss": 2.3427, + "step": 1011 + }, + { + "epoch": 0.16327847692804132, + "grad_norm": 3.833597421646118, + "learning_rate": 9.549047601853311e-05, + "loss": 2.0908, + "step": 1012 + }, + { + "epoch": 0.16343981929654727, + "grad_norm": 4.0963215827941895, + "learning_rate": 9.547962613846105e-05, + "loss": 2.2118, + "step": 1013 + }, + { + "epoch": 0.16360116166505323, + "grad_norm": 5.363045692443848, + "learning_rate": 9.54687638396163e-05, + "loss": 2.2259, + "step": 1014 + }, + { + "epoch": 0.16376250403355921, + "grad_norm": 6.040219783782959, + "learning_rate": 9.545788912496496e-05, + "loss": 2.058, + "step": 1015 + }, + { + "epoch": 0.16392384640206517, + "grad_norm": 5.767908096313477, + "learning_rate": 9.54470019974765e-05, + "loss": 1.9915, + "step": 1016 + }, + { + "epoch": 0.16408518877057116, + "grad_norm": 5.240634441375732, + "learning_rate": 9.543610246012377e-05, + "loss": 2.3411, + "step": 1017 + }, + { + "epoch": 0.1642465311390771, + "grad_norm": 4.714745998382568, + "learning_rate": 9.542519051588305e-05, + "loss": 2.1866, + "step": 1018 + }, + { + "epoch": 0.1644078735075831, + "grad_norm": 4.932572364807129, + "learning_rate": 9.541426616773396e-05, + "loss": 2.1218, + "step": 1019 + }, + { + "epoch": 0.16456921587608905, + "grad_norm": 6.019719123840332, + "learning_rate": 9.540332941865953e-05, + "loss": 2.289, + "step": 1020 + }, + { + "epoch": 0.16473055824459504, + "grad_norm": 5.029490947723389, + "learning_rate": 9.539238027164619e-05, + "loss": 1.8102, + "step": 1021 + }, + { + "epoch": 0.164891900613101, + "grad_norm": 6.065807342529297, + "learning_rate": 9.538141872968371e-05, + "loss": 2.3399, + "step": 1022 + }, + { + "epoch": 0.16505324298160698, + "grad_norm": 6.009767055511475, + "learning_rate": 9.537044479576531e-05, + "loss": 2.0795, + "step": 1023 + }, + { + "epoch": 0.16521458535011294, + "grad_norm": 5.45097541809082, + "learning_rate": 9.535945847288754e-05, + "loss": 2.2277, + "step": 1024 + }, + { + "epoch": 0.16537592771861892, + "grad_norm": 4.281061172485352, + "learning_rate": 9.534845976405035e-05, + "loss": 2.1457, + "step": 1025 + }, + { + "epoch": 0.16553727008712488, + "grad_norm": 5.280691146850586, + "learning_rate": 9.533744867225707e-05, + "loss": 2.2294, + "step": 1026 + }, + { + "epoch": 0.16569861245563086, + "grad_norm": 5.329169750213623, + "learning_rate": 9.532642520051442e-05, + "loss": 2.1606, + "step": 1027 + }, + { + "epoch": 0.16585995482413682, + "grad_norm": 4.205093860626221, + "learning_rate": 9.53153893518325e-05, + "loss": 2.1819, + "step": 1028 + }, + { + "epoch": 0.16602129719264277, + "grad_norm": 4.498367786407471, + "learning_rate": 9.530434112922477e-05, + "loss": 2.2947, + "step": 1029 + }, + { + "epoch": 0.16618263956114876, + "grad_norm": 3.871359348297119, + "learning_rate": 9.52932805357081e-05, + "loss": 2.3221, + "step": 1030 + }, + { + "epoch": 0.16634398192965472, + "grad_norm": 4.586424350738525, + "learning_rate": 9.528220757430272e-05, + "loss": 2.2141, + "step": 1031 + }, + { + "epoch": 0.1665053242981607, + "grad_norm": 4.1155009269714355, + "learning_rate": 9.527112224803223e-05, + "loss": 1.9745, + "step": 1032 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 3.8675432205200195, + "learning_rate": 9.52600245599236e-05, + "loss": 2.2398, + "step": 1033 + }, + { + "epoch": 0.16682800903517264, + "grad_norm": 4.718024253845215, + "learning_rate": 9.524891451300721e-05, + "loss": 2.1174, + "step": 1034 + }, + { + "epoch": 0.1669893514036786, + "grad_norm": 4.081422805786133, + "learning_rate": 9.523779211031682e-05, + "loss": 2.0654, + "step": 1035 + }, + { + "epoch": 0.16715069377218458, + "grad_norm": 5.289209842681885, + "learning_rate": 9.522665735488949e-05, + "loss": 2.1711, + "step": 1036 + }, + { + "epoch": 0.16731203614069054, + "grad_norm": 4.412351608276367, + "learning_rate": 9.521551024976573e-05, + "loss": 2.0829, + "step": 1037 + }, + { + "epoch": 0.16747337850919652, + "grad_norm": 5.0605363845825195, + "learning_rate": 9.520435079798939e-05, + "loss": 2.0377, + "step": 1038 + }, + { + "epoch": 0.16763472087770248, + "grad_norm": 4.3810811042785645, + "learning_rate": 9.51931790026077e-05, + "loss": 1.9752, + "step": 1039 + }, + { + "epoch": 0.16779606324620847, + "grad_norm": 5.456241130828857, + "learning_rate": 9.518199486667123e-05, + "loss": 2.257, + "step": 1040 + }, + { + "epoch": 0.16795740561471442, + "grad_norm": 4.840457439422607, + "learning_rate": 9.517079839323398e-05, + "loss": 2.3499, + "step": 1041 + }, + { + "epoch": 0.1681187479832204, + "grad_norm": 5.18924617767334, + "learning_rate": 9.515958958535326e-05, + "loss": 2.1178, + "step": 1042 + }, + { + "epoch": 0.16828009035172636, + "grad_norm": 5.006588935852051, + "learning_rate": 9.514836844608982e-05, + "loss": 2.0868, + "step": 1043 + }, + { + "epoch": 0.16844143272023232, + "grad_norm": 5.704937934875488, + "learning_rate": 9.513713497850767e-05, + "loss": 2.0938, + "step": 1044 + }, + { + "epoch": 0.1686027750887383, + "grad_norm": 4.047709941864014, + "learning_rate": 9.512588918567429e-05, + "loss": 2.0518, + "step": 1045 + }, + { + "epoch": 0.16876411745724426, + "grad_norm": 4.593513488769531, + "learning_rate": 9.511463107066045e-05, + "loss": 2.1238, + "step": 1046 + }, + { + "epoch": 0.16892545982575025, + "grad_norm": 6.242722511291504, + "learning_rate": 9.510336063654034e-05, + "loss": 2.0898, + "step": 1047 + }, + { + "epoch": 0.1690868021942562, + "grad_norm": 4.933840274810791, + "learning_rate": 9.509207788639147e-05, + "loss": 2.0248, + "step": 1048 + }, + { + "epoch": 0.1692481445627622, + "grad_norm": 4.743569850921631, + "learning_rate": 9.508078282329478e-05, + "loss": 2.2515, + "step": 1049 + }, + { + "epoch": 0.16940948693126814, + "grad_norm": 4.313886642456055, + "learning_rate": 9.506947545033447e-05, + "loss": 2.1975, + "step": 1050 + }, + { + "epoch": 0.16957082929977413, + "grad_norm": 5.031490325927734, + "learning_rate": 9.505815577059817e-05, + "loss": 2.1413, + "step": 1051 + }, + { + "epoch": 0.16973217166828009, + "grad_norm": 6.796832084655762, + "learning_rate": 9.50468237871769e-05, + "loss": 2.0446, + "step": 1052 + }, + { + "epoch": 0.16989351403678607, + "grad_norm": 4.098788738250732, + "learning_rate": 9.503547950316494e-05, + "loss": 2.507, + "step": 1053 + }, + { + "epoch": 0.17005485640529203, + "grad_norm": 4.281834125518799, + "learning_rate": 9.502412292166004e-05, + "loss": 2.1102, + "step": 1054 + }, + { + "epoch": 0.170216198773798, + "grad_norm": 5.966123104095459, + "learning_rate": 9.501275404576323e-05, + "loss": 2.2269, + "step": 1055 + }, + { + "epoch": 0.17037754114230397, + "grad_norm": 4.702624320983887, + "learning_rate": 9.500137287857889e-05, + "loss": 2.001, + "step": 1056 + }, + { + "epoch": 0.17053888351080995, + "grad_norm": 6.717199325561523, + "learning_rate": 9.498997942321483e-05, + "loss": 2.2389, + "step": 1057 + }, + { + "epoch": 0.1707002258793159, + "grad_norm": 5.48018741607666, + "learning_rate": 9.497857368278218e-05, + "loss": 2.1897, + "step": 1058 + }, + { + "epoch": 0.17086156824782187, + "grad_norm": 5.843608379364014, + "learning_rate": 9.496715566039538e-05, + "loss": 2.1196, + "step": 1059 + }, + { + "epoch": 0.17102291061632785, + "grad_norm": 4.224806785583496, + "learning_rate": 9.495572535917229e-05, + "loss": 2.2262, + "step": 1060 + }, + { + "epoch": 0.1711842529848338, + "grad_norm": 4.144807815551758, + "learning_rate": 9.494428278223409e-05, + "loss": 1.8662, + "step": 1061 + }, + { + "epoch": 0.1713455953533398, + "grad_norm": 4.138956069946289, + "learning_rate": 9.493282793270531e-05, + "loss": 2.0499, + "step": 1062 + }, + { + "epoch": 0.17150693772184575, + "grad_norm": 4.989041328430176, + "learning_rate": 9.492136081371384e-05, + "loss": 2.3913, + "step": 1063 + }, + { + "epoch": 0.17166828009035173, + "grad_norm": 4.749313831329346, + "learning_rate": 9.490988142839091e-05, + "loss": 2.1736, + "step": 1064 + }, + { + "epoch": 0.1718296224588577, + "grad_norm": 5.544332504272461, + "learning_rate": 9.489838977987114e-05, + "loss": 2.1189, + "step": 1065 + }, + { + "epoch": 0.17199096482736367, + "grad_norm": 4.9664626121521, + "learning_rate": 9.488688587129242e-05, + "loss": 2.3713, + "step": 1066 + }, + { + "epoch": 0.17215230719586963, + "grad_norm": 5.020138740539551, + "learning_rate": 9.487536970579606e-05, + "loss": 2.177, + "step": 1067 + }, + { + "epoch": 0.17231364956437561, + "grad_norm": 3.4289164543151855, + "learning_rate": 9.48638412865267e-05, + "loss": 2.0738, + "step": 1068 + }, + { + "epoch": 0.17247499193288157, + "grad_norm": 5.5560221672058105, + "learning_rate": 9.48523006166323e-05, + "loss": 2.151, + "step": 1069 + }, + { + "epoch": 0.17263633430138756, + "grad_norm": 6.141683101654053, + "learning_rate": 9.48407476992642e-05, + "loss": 2.8241, + "step": 1070 + }, + { + "epoch": 0.1727976766698935, + "grad_norm": 4.250937461853027, + "learning_rate": 9.482918253757705e-05, + "loss": 2.1673, + "step": 1071 + }, + { + "epoch": 0.1729590190383995, + "grad_norm": 4.556088924407959, + "learning_rate": 9.481760513472885e-05, + "loss": 2.2115, + "step": 1072 + }, + { + "epoch": 0.17312036140690545, + "grad_norm": 4.4660773277282715, + "learning_rate": 9.480601549388097e-05, + "loss": 2.1846, + "step": 1073 + }, + { + "epoch": 0.1732817037754114, + "grad_norm": 4.742733955383301, + "learning_rate": 9.479441361819811e-05, + "loss": 2.2282, + "step": 1074 + }, + { + "epoch": 0.1734430461439174, + "grad_norm": 5.606072425842285, + "learning_rate": 9.47827995108483e-05, + "loss": 1.9925, + "step": 1075 + }, + { + "epoch": 0.17360438851242335, + "grad_norm": 6.388113021850586, + "learning_rate": 9.47711731750029e-05, + "loss": 2.2225, + "step": 1076 + }, + { + "epoch": 0.17376573088092934, + "grad_norm": 4.076513290405273, + "learning_rate": 9.475953461383664e-05, + "loss": 1.8734, + "step": 1077 + }, + { + "epoch": 0.1739270732494353, + "grad_norm": 4.824914455413818, + "learning_rate": 9.474788383052756e-05, + "loss": 2.1996, + "step": 1078 + }, + { + "epoch": 0.17408841561794128, + "grad_norm": 6.446959018707275, + "learning_rate": 9.473622082825707e-05, + "loss": 2.2974, + "step": 1079 + }, + { + "epoch": 0.17424975798644723, + "grad_norm": 4.997390270233154, + "learning_rate": 9.47245456102099e-05, + "loss": 2.3537, + "step": 1080 + }, + { + "epoch": 0.17441110035495322, + "grad_norm": 6.039966583251953, + "learning_rate": 9.471285817957407e-05, + "loss": 2.2296, + "step": 1081 + }, + { + "epoch": 0.17457244272345918, + "grad_norm": 4.892731189727783, + "learning_rate": 9.4701158539541e-05, + "loss": 2.1276, + "step": 1082 + }, + { + "epoch": 0.17473378509196516, + "grad_norm": 4.1530442237854, + "learning_rate": 9.468944669330545e-05, + "loss": 2.0831, + "step": 1083 + }, + { + "epoch": 0.17489512746047112, + "grad_norm": 4.372376441955566, + "learning_rate": 9.467772264406545e-05, + "loss": 2.1864, + "step": 1084 + }, + { + "epoch": 0.1750564698289771, + "grad_norm": 3.467975616455078, + "learning_rate": 9.46659863950224e-05, + "loss": 2.2786, + "step": 1085 + }, + { + "epoch": 0.17521781219748306, + "grad_norm": 4.543912410736084, + "learning_rate": 9.465423794938104e-05, + "loss": 2.0789, + "step": 1086 + }, + { + "epoch": 0.17537915456598901, + "grad_norm": 5.803583145141602, + "learning_rate": 9.464247731034943e-05, + "loss": 2.1541, + "step": 1087 + }, + { + "epoch": 0.175540496934495, + "grad_norm": 5.195874214172363, + "learning_rate": 9.463070448113893e-05, + "loss": 2.3098, + "step": 1088 + }, + { + "epoch": 0.17570183930300096, + "grad_norm": 4.555354118347168, + "learning_rate": 9.461891946496428e-05, + "loss": 2.0214, + "step": 1089 + }, + { + "epoch": 0.17586318167150694, + "grad_norm": 7.192603588104248, + "learning_rate": 9.460712226504353e-05, + "loss": 2.0761, + "step": 1090 + }, + { + "epoch": 0.1760245240400129, + "grad_norm": 4.054807186126709, + "learning_rate": 9.459531288459803e-05, + "loss": 2.1649, + "step": 1091 + }, + { + "epoch": 0.17618586640851888, + "grad_norm": 4.142963409423828, + "learning_rate": 9.458349132685249e-05, + "loss": 2.1988, + "step": 1092 + }, + { + "epoch": 0.17634720877702484, + "grad_norm": 4.702714920043945, + "learning_rate": 9.457165759503493e-05, + "loss": 2.1305, + "step": 1093 + }, + { + "epoch": 0.17650855114553082, + "grad_norm": 3.626248836517334, + "learning_rate": 9.455981169237668e-05, + "loss": 2.0853, + "step": 1094 + }, + { + "epoch": 0.17666989351403678, + "grad_norm": 4.734299659729004, + "learning_rate": 9.454795362211244e-05, + "loss": 2.1626, + "step": 1095 + }, + { + "epoch": 0.17683123588254276, + "grad_norm": 5.6444783210754395, + "learning_rate": 9.45360833874802e-05, + "loss": 2.2003, + "step": 1096 + }, + { + "epoch": 0.17699257825104872, + "grad_norm": 4.0697407722473145, + "learning_rate": 9.452420099172124e-05, + "loss": 2.2748, + "step": 1097 + }, + { + "epoch": 0.1771539206195547, + "grad_norm": 4.217678070068359, + "learning_rate": 9.451230643808023e-05, + "loss": 2.1378, + "step": 1098 + }, + { + "epoch": 0.17731526298806066, + "grad_norm": 4.668602466583252, + "learning_rate": 9.450039972980509e-05, + "loss": 2.2964, + "step": 1099 + }, + { + "epoch": 0.17747660535656665, + "grad_norm": 4.176945686340332, + "learning_rate": 9.448848087014712e-05, + "loss": 2.1765, + "step": 1100 + }, + { + "epoch": 0.1776379477250726, + "grad_norm": 4.385271072387695, + "learning_rate": 9.447654986236092e-05, + "loss": 2.1224, + "step": 1101 + }, + { + "epoch": 0.17779929009357856, + "grad_norm": 3.5701985359191895, + "learning_rate": 9.446460670970436e-05, + "loss": 2.0128, + "step": 1102 + }, + { + "epoch": 0.17796063246208454, + "grad_norm": 4.596713542938232, + "learning_rate": 9.44526514154387e-05, + "loss": 2.1401, + "step": 1103 + }, + { + "epoch": 0.1781219748305905, + "grad_norm": 5.741366863250732, + "learning_rate": 9.444068398282848e-05, + "loss": 2.3957, + "step": 1104 + }, + { + "epoch": 0.17828331719909649, + "grad_norm": 4.24362850189209, + "learning_rate": 9.442870441514154e-05, + "loss": 2.2815, + "step": 1105 + }, + { + "epoch": 0.17844465956760244, + "grad_norm": 3.9866597652435303, + "learning_rate": 9.441671271564906e-05, + "loss": 2.0728, + "step": 1106 + }, + { + "epoch": 0.17860600193610843, + "grad_norm": 3.9969146251678467, + "learning_rate": 9.440470888762552e-05, + "loss": 2.3408, + "step": 1107 + }, + { + "epoch": 0.17876734430461438, + "grad_norm": 4.508840084075928, + "learning_rate": 9.439269293434868e-05, + "loss": 2.0563, + "step": 1108 + }, + { + "epoch": 0.17892868667312037, + "grad_norm": 5.449844837188721, + "learning_rate": 9.438066485909969e-05, + "loss": 2.2244, + "step": 1109 + }, + { + "epoch": 0.17909002904162633, + "grad_norm": 4.140756130218506, + "learning_rate": 9.436862466516294e-05, + "loss": 1.9053, + "step": 1110 + }, + { + "epoch": 0.1792513714101323, + "grad_norm": 4.184217929840088, + "learning_rate": 9.435657235582616e-05, + "loss": 2.1351, + "step": 1111 + }, + { + "epoch": 0.17941271377863827, + "grad_norm": 3.90503191947937, + "learning_rate": 9.43445079343804e-05, + "loss": 2.278, + "step": 1112 + }, + { + "epoch": 0.17957405614714425, + "grad_norm": 3.791114568710327, + "learning_rate": 9.433243140411996e-05, + "loss": 1.9327, + "step": 1113 + }, + { + "epoch": 0.1797353985156502, + "grad_norm": 4.18504524230957, + "learning_rate": 9.432034276834252e-05, + "loss": 1.948, + "step": 1114 + }, + { + "epoch": 0.1798967408841562, + "grad_norm": 4.193762302398682, + "learning_rate": 9.4308242030349e-05, + "loss": 1.9719, + "step": 1115 + }, + { + "epoch": 0.18005808325266215, + "grad_norm": 4.792637825012207, + "learning_rate": 9.429612919344368e-05, + "loss": 2.0218, + "step": 1116 + }, + { + "epoch": 0.1802194256211681, + "grad_norm": 6.854551792144775, + "learning_rate": 9.428400426093413e-05, + "loss": 2.1162, + "step": 1117 + }, + { + "epoch": 0.1803807679896741, + "grad_norm": 4.02646017074585, + "learning_rate": 9.427186723613117e-05, + "loss": 2.0172, + "step": 1118 + }, + { + "epoch": 0.18054211035818005, + "grad_norm": 4.791799068450928, + "learning_rate": 9.425971812234901e-05, + "loss": 2.0845, + "step": 1119 + }, + { + "epoch": 0.18070345272668603, + "grad_norm": 4.092919826507568, + "learning_rate": 9.424755692290507e-05, + "loss": 2.3227, + "step": 1120 + }, + { + "epoch": 0.180864795095192, + "grad_norm": 5.270394325256348, + "learning_rate": 9.423538364112017e-05, + "loss": 2.1156, + "step": 1121 + }, + { + "epoch": 0.18102613746369797, + "grad_norm": 3.8187763690948486, + "learning_rate": 9.422319828031832e-05, + "loss": 2.4983, + "step": 1122 + }, + { + "epoch": 0.18118747983220393, + "grad_norm": 4.516146659851074, + "learning_rate": 9.421100084382693e-05, + "loss": 1.8625, + "step": 1123 + }, + { + "epoch": 0.1813488222007099, + "grad_norm": 5.56035041809082, + "learning_rate": 9.419879133497663e-05, + "loss": 2.1817, + "step": 1124 + }, + { + "epoch": 0.18151016456921587, + "grad_norm": 4.905139923095703, + "learning_rate": 9.418656975710136e-05, + "loss": 1.8659, + "step": 1125 + }, + { + "epoch": 0.18167150693772185, + "grad_norm": 5.2623724937438965, + "learning_rate": 9.417433611353842e-05, + "loss": 2.1334, + "step": 1126 + }, + { + "epoch": 0.1818328493062278, + "grad_norm": 4.874891757965088, + "learning_rate": 9.416209040762833e-05, + "loss": 2.2453, + "step": 1127 + }, + { + "epoch": 0.1819941916747338, + "grad_norm": 4.088444232940674, + "learning_rate": 9.414983264271493e-05, + "loss": 1.947, + "step": 1128 + }, + { + "epoch": 0.18215553404323975, + "grad_norm": 5.268361568450928, + "learning_rate": 9.413756282214537e-05, + "loss": 2.017, + "step": 1129 + }, + { + "epoch": 0.18231687641174574, + "grad_norm": 4.026107311248779, + "learning_rate": 9.412528094927007e-05, + "loss": 2.1563, + "step": 1130 + }, + { + "epoch": 0.1824782187802517, + "grad_norm": 4.318005084991455, + "learning_rate": 9.411298702744274e-05, + "loss": 2.0524, + "step": 1131 + }, + { + "epoch": 0.18263956114875765, + "grad_norm": 4.455986022949219, + "learning_rate": 9.410068106002036e-05, + "loss": 2.0646, + "step": 1132 + }, + { + "epoch": 0.18280090351726364, + "grad_norm": 4.1529035568237305, + "learning_rate": 9.408836305036328e-05, + "loss": 2.2399, + "step": 1133 + }, + { + "epoch": 0.1829622458857696, + "grad_norm": 4.354226112365723, + "learning_rate": 9.407603300183507e-05, + "loss": 2.1329, + "step": 1134 + }, + { + "epoch": 0.18312358825427558, + "grad_norm": 4.02735710144043, + "learning_rate": 9.406369091780257e-05, + "loss": 2.2201, + "step": 1135 + }, + { + "epoch": 0.18328493062278153, + "grad_norm": 5.632328510284424, + "learning_rate": 9.405133680163598e-05, + "loss": 2.1951, + "step": 1136 + }, + { + "epoch": 0.18344627299128752, + "grad_norm": 5.8479838371276855, + "learning_rate": 9.40389706567087e-05, + "loss": 2.2726, + "step": 1137 + }, + { + "epoch": 0.18360761535979347, + "grad_norm": 5.934751033782959, + "learning_rate": 9.402659248639749e-05, + "loss": 2.161, + "step": 1138 + }, + { + "epoch": 0.18376895772829946, + "grad_norm": 3.6951584815979004, + "learning_rate": 9.401420229408236e-05, + "loss": 2.1799, + "step": 1139 + }, + { + "epoch": 0.18393030009680542, + "grad_norm": 3.9756414890289307, + "learning_rate": 9.40018000831466e-05, + "loss": 1.9636, + "step": 1140 + }, + { + "epoch": 0.1840916424653114, + "grad_norm": 4.538594722747803, + "learning_rate": 9.398938585697678e-05, + "loss": 2.0378, + "step": 1141 + }, + { + "epoch": 0.18425298483381736, + "grad_norm": 4.996496677398682, + "learning_rate": 9.397695961896275e-05, + "loss": 2.0256, + "step": 1142 + }, + { + "epoch": 0.18441432720232334, + "grad_norm": 6.864620685577393, + "learning_rate": 9.396452137249769e-05, + "loss": 2.2353, + "step": 1143 + }, + { + "epoch": 0.1845756695708293, + "grad_norm": 5.642723083496094, + "learning_rate": 9.395207112097797e-05, + "loss": 2.1499, + "step": 1144 + }, + { + "epoch": 0.18473701193933528, + "grad_norm": 5.145697593688965, + "learning_rate": 9.393960886780329e-05, + "loss": 2.3424, + "step": 1145 + }, + { + "epoch": 0.18489835430784124, + "grad_norm": 4.262571334838867, + "learning_rate": 9.392713461637665e-05, + "loss": 2.4949, + "step": 1146 + }, + { + "epoch": 0.1850596966763472, + "grad_norm": 5.42431116104126, + "learning_rate": 9.391464837010428e-05, + "loss": 2.4643, + "step": 1147 + }, + { + "epoch": 0.18522103904485318, + "grad_norm": 5.535486221313477, + "learning_rate": 9.390215013239569e-05, + "loss": 1.8437, + "step": 1148 + }, + { + "epoch": 0.18538238141335914, + "grad_norm": 4.056037425994873, + "learning_rate": 9.38896399066637e-05, + "loss": 2.2023, + "step": 1149 + }, + { + "epoch": 0.18554372378186512, + "grad_norm": 4.414604187011719, + "learning_rate": 9.387711769632439e-05, + "loss": 2.1182, + "step": 1150 + }, + { + "epoch": 0.18570506615037108, + "grad_norm": 3.6318039894104004, + "learning_rate": 9.386458350479707e-05, + "loss": 1.9319, + "step": 1151 + }, + { + "epoch": 0.18586640851887706, + "grad_norm": 4.180037498474121, + "learning_rate": 9.385203733550438e-05, + "loss": 1.9944, + "step": 1152 + }, + { + "epoch": 0.18602775088738302, + "grad_norm": 4.404853820800781, + "learning_rate": 9.38394791918722e-05, + "loss": 2.4426, + "step": 1153 + }, + { + "epoch": 0.186189093255889, + "grad_norm": 4.873829364776611, + "learning_rate": 9.382690907732968e-05, + "loss": 2.1799, + "step": 1154 + }, + { + "epoch": 0.18635043562439496, + "grad_norm": 5.15049934387207, + "learning_rate": 9.381432699530925e-05, + "loss": 2.2582, + "step": 1155 + }, + { + "epoch": 0.18651177799290095, + "grad_norm": 4.201823711395264, + "learning_rate": 9.380173294924662e-05, + "loss": 2.1879, + "step": 1156 + }, + { + "epoch": 0.1866731203614069, + "grad_norm": 3.9464304447174072, + "learning_rate": 9.378912694258073e-05, + "loss": 2.0987, + "step": 1157 + }, + { + "epoch": 0.1868344627299129, + "grad_norm": 6.513865947723389, + "learning_rate": 9.377650897875379e-05, + "loss": 2.3973, + "step": 1158 + }, + { + "epoch": 0.18699580509841884, + "grad_norm": 4.84373664855957, + "learning_rate": 9.376387906121132e-05, + "loss": 2.0951, + "step": 1159 + }, + { + "epoch": 0.18715714746692483, + "grad_norm": 6.011229515075684, + "learning_rate": 9.375123719340206e-05, + "loss": 2.0646, + "step": 1160 + }, + { + "epoch": 0.18731848983543078, + "grad_norm": 4.3001837730407715, + "learning_rate": 9.373858337877803e-05, + "loss": 2.047, + "step": 1161 + }, + { + "epoch": 0.18747983220393674, + "grad_norm": 6.261403560638428, + "learning_rate": 9.372591762079452e-05, + "loss": 2.0702, + "step": 1162 + }, + { + "epoch": 0.18764117457244273, + "grad_norm": 4.611581325531006, + "learning_rate": 9.371323992291006e-05, + "loss": 2.4375, + "step": 1163 + }, + { + "epoch": 0.18780251694094868, + "grad_norm": 3.757481575012207, + "learning_rate": 9.370055028858647e-05, + "loss": 2.2058, + "step": 1164 + }, + { + "epoch": 0.18796385930945467, + "grad_norm": 3.8578453063964844, + "learning_rate": 9.368784872128878e-05, + "loss": 2.0362, + "step": 1165 + }, + { + "epoch": 0.18812520167796062, + "grad_norm": 5.547084808349609, + "learning_rate": 9.367513522448531e-05, + "loss": 2.2821, + "step": 1166 + }, + { + "epoch": 0.1882865440464666, + "grad_norm": 6.220110893249512, + "learning_rate": 9.366240980164767e-05, + "loss": 2.2423, + "step": 1167 + }, + { + "epoch": 0.18844788641497257, + "grad_norm": 7.139183044433594, + "learning_rate": 9.364967245625067e-05, + "loss": 2.2226, + "step": 1168 + }, + { + "epoch": 0.18860922878347855, + "grad_norm": 6.001524925231934, + "learning_rate": 9.363692319177241e-05, + "loss": 2.1597, + "step": 1169 + }, + { + "epoch": 0.1887705711519845, + "grad_norm": 4.458691596984863, + "learning_rate": 9.362416201169425e-05, + "loss": 1.9492, + "step": 1170 + }, + { + "epoch": 0.1889319135204905, + "grad_norm": 3.713860511779785, + "learning_rate": 9.361138891950073e-05, + "loss": 2.0342, + "step": 1171 + }, + { + "epoch": 0.18909325588899645, + "grad_norm": 5.495253086090088, + "learning_rate": 9.359860391867975e-05, + "loss": 2.3154, + "step": 1172 + }, + { + "epoch": 0.18925459825750243, + "grad_norm": 5.796265125274658, + "learning_rate": 9.35858070127224e-05, + "loss": 2.3185, + "step": 1173 + }, + { + "epoch": 0.1894159406260084, + "grad_norm": 4.879892826080322, + "learning_rate": 9.357299820512304e-05, + "loss": 2.1154, + "step": 1174 + }, + { + "epoch": 0.18957728299451435, + "grad_norm": 4.1518659591674805, + "learning_rate": 9.356017749937925e-05, + "loss": 2.3264, + "step": 1175 + }, + { + "epoch": 0.18973862536302033, + "grad_norm": 5.204110145568848, + "learning_rate": 9.354734489899191e-05, + "loss": 2.3976, + "step": 1176 + }, + { + "epoch": 0.1898999677315263, + "grad_norm": 3.671693801879883, + "learning_rate": 9.35345004074651e-05, + "loss": 2.1576, + "step": 1177 + }, + { + "epoch": 0.19006131010003227, + "grad_norm": 4.392444133758545, + "learning_rate": 9.352164402830618e-05, + "loss": 2.3819, + "step": 1178 + }, + { + "epoch": 0.19022265246853823, + "grad_norm": 5.3956193923950195, + "learning_rate": 9.350877576502573e-05, + "loss": 2.2044, + "step": 1179 + }, + { + "epoch": 0.1903839948370442, + "grad_norm": 4.536787509918213, + "learning_rate": 9.34958956211376e-05, + "loss": 1.9926, + "step": 1180 + }, + { + "epoch": 0.19054533720555017, + "grad_norm": 7.908175945281982, + "learning_rate": 9.348300360015885e-05, + "loss": 2.3019, + "step": 1181 + }, + { + "epoch": 0.19070667957405615, + "grad_norm": 5.751628875732422, + "learning_rate": 9.347009970560984e-05, + "loss": 2.2324, + "step": 1182 + }, + { + "epoch": 0.1908680219425621, + "grad_norm": 3.7884788513183594, + "learning_rate": 9.345718394101411e-05, + "loss": 2.0753, + "step": 1183 + }, + { + "epoch": 0.1910293643110681, + "grad_norm": 5.273193359375, + "learning_rate": 9.344425630989848e-05, + "loss": 2.2719, + "step": 1184 + }, + { + "epoch": 0.19119070667957405, + "grad_norm": 3.5896897315979004, + "learning_rate": 9.343131681579301e-05, + "loss": 2.202, + "step": 1185 + }, + { + "epoch": 0.19135204904808004, + "grad_norm": 4.108644485473633, + "learning_rate": 9.341836546223095e-05, + "loss": 2.0847, + "step": 1186 + }, + { + "epoch": 0.191513391416586, + "grad_norm": 4.217356204986572, + "learning_rate": 9.340540225274887e-05, + "loss": 2.2478, + "step": 1187 + }, + { + "epoch": 0.19167473378509198, + "grad_norm": 6.304388046264648, + "learning_rate": 9.339242719088651e-05, + "loss": 2.1341, + "step": 1188 + }, + { + "epoch": 0.19183607615359793, + "grad_norm": 5.135892391204834, + "learning_rate": 9.337944028018688e-05, + "loss": 1.9682, + "step": 1189 + }, + { + "epoch": 0.1919974185221039, + "grad_norm": 4.361384391784668, + "learning_rate": 9.336644152419622e-05, + "loss": 2.0092, + "step": 1190 + }, + { + "epoch": 0.19215876089060988, + "grad_norm": 4.100388526916504, + "learning_rate": 9.335343092646399e-05, + "loss": 1.9264, + "step": 1191 + }, + { + "epoch": 0.19232010325911583, + "grad_norm": 5.539300918579102, + "learning_rate": 9.334040849054289e-05, + "loss": 2.0711, + "step": 1192 + }, + { + "epoch": 0.19248144562762182, + "grad_norm": 3.903207540512085, + "learning_rate": 9.332737421998886e-05, + "loss": 2.1269, + "step": 1193 + }, + { + "epoch": 0.19264278799612777, + "grad_norm": 4.74282169342041, + "learning_rate": 9.331432811836108e-05, + "loss": 2.1357, + "step": 1194 + }, + { + "epoch": 0.19280413036463376, + "grad_norm": 4.196990013122559, + "learning_rate": 9.330127018922194e-05, + "loss": 2.1716, + "step": 1195 + }, + { + "epoch": 0.19296547273313971, + "grad_norm": 5.20350980758667, + "learning_rate": 9.328820043613707e-05, + "loss": 2.2362, + "step": 1196 + }, + { + "epoch": 0.1931268151016457, + "grad_norm": 5.489796161651611, + "learning_rate": 9.327511886267532e-05, + "loss": 2.2222, + "step": 1197 + }, + { + "epoch": 0.19328815747015166, + "grad_norm": 6.44775915145874, + "learning_rate": 9.32620254724088e-05, + "loss": 2.1585, + "step": 1198 + }, + { + "epoch": 0.19344949983865764, + "grad_norm": 5.332704544067383, + "learning_rate": 9.324892026891279e-05, + "loss": 2.1961, + "step": 1199 + }, + { + "epoch": 0.1936108422071636, + "grad_norm": 5.2855095863342285, + "learning_rate": 9.323580325576584e-05, + "loss": 2.1933, + "step": 1200 + }, + { + "epoch": 0.19377218457566958, + "grad_norm": 4.12716007232666, + "learning_rate": 9.322267443654972e-05, + "loss": 1.9773, + "step": 1201 + }, + { + "epoch": 0.19393352694417554, + "grad_norm": 5.293431758880615, + "learning_rate": 9.320953381484943e-05, + "loss": 2.0362, + "step": 1202 + }, + { + "epoch": 0.19409486931268152, + "grad_norm": 5.689045429229736, + "learning_rate": 9.319638139425313e-05, + "loss": 2.5079, + "step": 1203 + }, + { + "epoch": 0.19425621168118748, + "grad_norm": 5.536917686462402, + "learning_rate": 9.318321717835228e-05, + "loss": 2.2431, + "step": 1204 + }, + { + "epoch": 0.19441755404969344, + "grad_norm": 6.199563980102539, + "learning_rate": 9.317004117074154e-05, + "loss": 2.0321, + "step": 1205 + }, + { + "epoch": 0.19457889641819942, + "grad_norm": 5.072630405426025, + "learning_rate": 9.315685337501876e-05, + "loss": 2.2442, + "step": 1206 + }, + { + "epoch": 0.19474023878670538, + "grad_norm": 4.364281177520752, + "learning_rate": 9.314365379478506e-05, + "loss": 2.16, + "step": 1207 + }, + { + "epoch": 0.19490158115521136, + "grad_norm": 4.033108711242676, + "learning_rate": 9.313044243364473e-05, + "loss": 2.335, + "step": 1208 + }, + { + "epoch": 0.19506292352371732, + "grad_norm": 4.639063358306885, + "learning_rate": 9.311721929520527e-05, + "loss": 2.1587, + "step": 1209 + }, + { + "epoch": 0.1952242658922233, + "grad_norm": 4.656102180480957, + "learning_rate": 9.310398438307746e-05, + "loss": 2.081, + "step": 1210 + }, + { + "epoch": 0.19538560826072926, + "grad_norm": 4.852657318115234, + "learning_rate": 9.309073770087524e-05, + "loss": 1.9645, + "step": 1211 + }, + { + "epoch": 0.19554695062923524, + "grad_norm": 4.050775527954102, + "learning_rate": 9.30774792522158e-05, + "loss": 2.1736, + "step": 1212 + }, + { + "epoch": 0.1957082929977412, + "grad_norm": 3.999640703201294, + "learning_rate": 9.306420904071948e-05, + "loss": 2.1352, + "step": 1213 + }, + { + "epoch": 0.19586963536624719, + "grad_norm": 5.520906448364258, + "learning_rate": 9.305092707000992e-05, + "loss": 2.4808, + "step": 1214 + }, + { + "epoch": 0.19603097773475314, + "grad_norm": 5.296695232391357, + "learning_rate": 9.30376333437139e-05, + "loss": 1.9308, + "step": 1215 + }, + { + "epoch": 0.19619232010325913, + "grad_norm": 3.6606791019439697, + "learning_rate": 9.302432786546142e-05, + "loss": 2.127, + "step": 1216 + }, + { + "epoch": 0.19635366247176508, + "grad_norm": 5.4831862449646, + "learning_rate": 9.301101063888575e-05, + "loss": 2.2932, + "step": 1217 + }, + { + "epoch": 0.19651500484027107, + "grad_norm": 5.197719097137451, + "learning_rate": 9.29976816676233e-05, + "loss": 2.1172, + "step": 1218 + }, + { + "epoch": 0.19667634720877702, + "grad_norm": 4.50950813293457, + "learning_rate": 9.29843409553137e-05, + "loss": 2.3063, + "step": 1219 + }, + { + "epoch": 0.19683768957728298, + "grad_norm": 5.969484329223633, + "learning_rate": 9.297098850559982e-05, + "loss": 2.2218, + "step": 1220 + }, + { + "epoch": 0.19699903194578897, + "grad_norm": 7.5558295249938965, + "learning_rate": 9.295762432212767e-05, + "loss": 2.0204, + "step": 1221 + }, + { + "epoch": 0.19716037431429492, + "grad_norm": 4.77316951751709, + "learning_rate": 9.294424840854654e-05, + "loss": 1.8611, + "step": 1222 + }, + { + "epoch": 0.1973217166828009, + "grad_norm": 4.685102939605713, + "learning_rate": 9.29308607685089e-05, + "loss": 1.8307, + "step": 1223 + }, + { + "epoch": 0.19748305905130686, + "grad_norm": 4.8697099685668945, + "learning_rate": 9.291746140567036e-05, + "loss": 1.977, + "step": 1224 + }, + { + "epoch": 0.19764440141981285, + "grad_norm": 5.616896629333496, + "learning_rate": 9.290405032368983e-05, + "loss": 2.0616, + "step": 1225 + }, + { + "epoch": 0.1978057437883188, + "grad_norm": 5.136362552642822, + "learning_rate": 9.289062752622934e-05, + "loss": 2.06, + "step": 1226 + }, + { + "epoch": 0.1979670861568248, + "grad_norm": 4.789569854736328, + "learning_rate": 9.287719301695418e-05, + "loss": 2.1118, + "step": 1227 + }, + { + "epoch": 0.19812842852533075, + "grad_norm": 4.089688301086426, + "learning_rate": 9.286374679953279e-05, + "loss": 1.865, + "step": 1228 + }, + { + "epoch": 0.19828977089383673, + "grad_norm": 3.7538259029388428, + "learning_rate": 9.28502888776368e-05, + "loss": 1.9576, + "step": 1229 + }, + { + "epoch": 0.1984511132623427, + "grad_norm": 3.620793342590332, + "learning_rate": 9.283681925494111e-05, + "loss": 2.0583, + "step": 1230 + }, + { + "epoch": 0.19861245563084867, + "grad_norm": 5.347387790679932, + "learning_rate": 9.282333793512375e-05, + "loss": 2.4643, + "step": 1231 + }, + { + "epoch": 0.19877379799935463, + "grad_norm": 4.361889839172363, + "learning_rate": 9.280984492186594e-05, + "loss": 2.2125, + "step": 1232 + }, + { + "epoch": 0.1989351403678606, + "grad_norm": 5.193058013916016, + "learning_rate": 9.279634021885212e-05, + "loss": 2.2039, + "step": 1233 + }, + { + "epoch": 0.19909648273636657, + "grad_norm": 6.2152886390686035, + "learning_rate": 9.278282382976995e-05, + "loss": 2.2702, + "step": 1234 + }, + { + "epoch": 0.19925782510487253, + "grad_norm": 4.003117561340332, + "learning_rate": 9.276929575831021e-05, + "loss": 2.1436, + "step": 1235 + }, + { + "epoch": 0.1994191674733785, + "grad_norm": 4.4928879737854, + "learning_rate": 9.27557560081669e-05, + "loss": 2.1452, + "step": 1236 + }, + { + "epoch": 0.19958050984188447, + "grad_norm": 5.384605407714844, + "learning_rate": 9.274220458303727e-05, + "loss": 2.1992, + "step": 1237 + }, + { + "epoch": 0.19974185221039045, + "grad_norm": 4.7580246925354, + "learning_rate": 9.272864148662163e-05, + "loss": 2.0621, + "step": 1238 + }, + { + "epoch": 0.1999031945788964, + "grad_norm": 4.127257823944092, + "learning_rate": 9.271506672262362e-05, + "loss": 2.2737, + "step": 1239 + }, + { + "epoch": 0.2000645369474024, + "grad_norm": 4.529548645019531, + "learning_rate": 9.270148029474994e-05, + "loss": 2.0339, + "step": 1240 + }, + { + "epoch": 0.20022587931590835, + "grad_norm": 6.915964603424072, + "learning_rate": 9.268788220671056e-05, + "loss": 2.1123, + "step": 1241 + }, + { + "epoch": 0.20038722168441434, + "grad_norm": 4.442663192749023, + "learning_rate": 9.267427246221863e-05, + "loss": 2.2201, + "step": 1242 + }, + { + "epoch": 0.2005485640529203, + "grad_norm": 3.644043207168579, + "learning_rate": 9.26606510649904e-05, + "loss": 2.0024, + "step": 1243 + }, + { + "epoch": 0.20070990642142628, + "grad_norm": 5.306570053100586, + "learning_rate": 9.264701801874539e-05, + "loss": 2.1486, + "step": 1244 + }, + { + "epoch": 0.20087124878993223, + "grad_norm": 4.061726093292236, + "learning_rate": 9.263337332720629e-05, + "loss": 2.1827, + "step": 1245 + }, + { + "epoch": 0.20103259115843822, + "grad_norm": 4.937375545501709, + "learning_rate": 9.261971699409893e-05, + "loss": 2.1337, + "step": 1246 + }, + { + "epoch": 0.20119393352694417, + "grad_norm": 6.167418956756592, + "learning_rate": 9.260604902315233e-05, + "loss": 2.0779, + "step": 1247 + }, + { + "epoch": 0.20135527589545016, + "grad_norm": 3.956418752670288, + "learning_rate": 9.259236941809873e-05, + "loss": 2.0061, + "step": 1248 + }, + { + "epoch": 0.20151661826395612, + "grad_norm": 4.3427252769470215, + "learning_rate": 9.257867818267348e-05, + "loss": 2.1665, + "step": 1249 + }, + { + "epoch": 0.20167796063246207, + "grad_norm": 4.450231075286865, + "learning_rate": 9.256497532061515e-05, + "loss": 2.4017, + "step": 1250 + }, + { + "epoch": 0.20183930300096806, + "grad_norm": 4.20451021194458, + "learning_rate": 9.25512608356655e-05, + "loss": 1.9089, + "step": 1251 + }, + { + "epoch": 0.202000645369474, + "grad_norm": 4.522383689880371, + "learning_rate": 9.253753473156943e-05, + "loss": 2.0234, + "step": 1252 + }, + { + "epoch": 0.20216198773798, + "grad_norm": 4.063665390014648, + "learning_rate": 9.252379701207499e-05, + "loss": 2.1226, + "step": 1253 + }, + { + "epoch": 0.20232333010648595, + "grad_norm": 4.9527740478515625, + "learning_rate": 9.251004768093348e-05, + "loss": 2.3531, + "step": 1254 + }, + { + "epoch": 0.20248467247499194, + "grad_norm": 5.102415561676025, + "learning_rate": 9.249628674189927e-05, + "loss": 2.2128, + "step": 1255 + }, + { + "epoch": 0.2026460148434979, + "grad_norm": 5.253018379211426, + "learning_rate": 9.248251419873002e-05, + "loss": 2.1547, + "step": 1256 + }, + { + "epoch": 0.20280735721200388, + "grad_norm": 4.89177131652832, + "learning_rate": 9.246873005518644e-05, + "loss": 2.1247, + "step": 1257 + }, + { + "epoch": 0.20296869958050984, + "grad_norm": 4.267810821533203, + "learning_rate": 9.245493431503249e-05, + "loss": 2.1507, + "step": 1258 + }, + { + "epoch": 0.20313004194901582, + "grad_norm": 4.486478328704834, + "learning_rate": 9.244112698203524e-05, + "loss": 1.9206, + "step": 1259 + }, + { + "epoch": 0.20329138431752178, + "grad_norm": 3.4107930660247803, + "learning_rate": 9.242730805996499e-05, + "loss": 1.9769, + "step": 1260 + }, + { + "epoch": 0.20345272668602776, + "grad_norm": 4.6431565284729, + "learning_rate": 9.241347755259514e-05, + "loss": 2.3648, + "step": 1261 + }, + { + "epoch": 0.20361406905453372, + "grad_norm": 5.335633277893066, + "learning_rate": 9.239963546370227e-05, + "loss": 2.3202, + "step": 1262 + }, + { + "epoch": 0.2037754114230397, + "grad_norm": 5.694500923156738, + "learning_rate": 9.238578179706616e-05, + "loss": 1.994, + "step": 1263 + }, + { + "epoch": 0.20393675379154566, + "grad_norm": 5.830473899841309, + "learning_rate": 9.237191655646972e-05, + "loss": 2.0949, + "step": 1264 + }, + { + "epoch": 0.20409809616005162, + "grad_norm": 4.1669487953186035, + "learning_rate": 9.235803974569901e-05, + "loss": 2.1442, + "step": 1265 + }, + { + "epoch": 0.2042594385285576, + "grad_norm": 4.0471930503845215, + "learning_rate": 9.234415136854328e-05, + "loss": 2.0706, + "step": 1266 + }, + { + "epoch": 0.20442078089706356, + "grad_norm": 5.178165912628174, + "learning_rate": 9.23302514287949e-05, + "loss": 2.2316, + "step": 1267 + }, + { + "epoch": 0.20458212326556954, + "grad_norm": 3.4137814044952393, + "learning_rate": 9.231633993024944e-05, + "loss": 2.3389, + "step": 1268 + }, + { + "epoch": 0.2047434656340755, + "grad_norm": 5.766005992889404, + "learning_rate": 9.230241687670561e-05, + "loss": 2.3266, + "step": 1269 + }, + { + "epoch": 0.20490480800258148, + "grad_norm": 5.218629837036133, + "learning_rate": 9.228848227196528e-05, + "loss": 2.3806, + "step": 1270 + }, + { + "epoch": 0.20506615037108744, + "grad_norm": 4.675017833709717, + "learning_rate": 9.227453611983341e-05, + "loss": 2.3105, + "step": 1271 + }, + { + "epoch": 0.20522749273959343, + "grad_norm": 4.16404390335083, + "learning_rate": 9.226057842411823e-05, + "loss": 2.105, + "step": 1272 + }, + { + "epoch": 0.20538883510809938, + "grad_norm": 4.810925006866455, + "learning_rate": 9.224660918863104e-05, + "loss": 2.0151, + "step": 1273 + }, + { + "epoch": 0.20555017747660537, + "grad_norm": 4.716014862060547, + "learning_rate": 9.22326284171863e-05, + "loss": 1.9447, + "step": 1274 + }, + { + "epoch": 0.20571151984511132, + "grad_norm": 3.6420786380767822, + "learning_rate": 9.221863611360164e-05, + "loss": 2.0919, + "step": 1275 + }, + { + "epoch": 0.2058728622136173, + "grad_norm": 4.337446689605713, + "learning_rate": 9.220463228169785e-05, + "loss": 2.0707, + "step": 1276 + }, + { + "epoch": 0.20603420458212326, + "grad_norm": 3.9118850231170654, + "learning_rate": 9.219061692529882e-05, + "loss": 2.1869, + "step": 1277 + }, + { + "epoch": 0.20619554695062922, + "grad_norm": 6.766507148742676, + "learning_rate": 9.217659004823162e-05, + "loss": 2.1102, + "step": 1278 + }, + { + "epoch": 0.2063568893191352, + "grad_norm": 4.498598575592041, + "learning_rate": 9.216255165432648e-05, + "loss": 2.1412, + "step": 1279 + }, + { + "epoch": 0.20651823168764116, + "grad_norm": 5.993344306945801, + "learning_rate": 9.214850174741677e-05, + "loss": 2.0669, + "step": 1280 + }, + { + "epoch": 0.20667957405614715, + "grad_norm": 5.468132495880127, + "learning_rate": 9.213444033133893e-05, + "loss": 2.1463, + "step": 1281 + }, + { + "epoch": 0.2068409164246531, + "grad_norm": 4.580942630767822, + "learning_rate": 9.212036740993266e-05, + "loss": 2.1803, + "step": 1282 + }, + { + "epoch": 0.2070022587931591, + "grad_norm": 4.758857250213623, + "learning_rate": 9.210628298704072e-05, + "loss": 2.1023, + "step": 1283 + }, + { + "epoch": 0.20716360116166505, + "grad_norm": 5.4867072105407715, + "learning_rate": 9.209218706650902e-05, + "loss": 2.3216, + "step": 1284 + }, + { + "epoch": 0.20732494353017103, + "grad_norm": 3.8598763942718506, + "learning_rate": 9.207807965218668e-05, + "loss": 2.4166, + "step": 1285 + }, + { + "epoch": 0.207486285898677, + "grad_norm": 7.519045352935791, + "learning_rate": 9.206396074792585e-05, + "loss": 2.1097, + "step": 1286 + }, + { + "epoch": 0.20764762826718297, + "grad_norm": 5.47310733795166, + "learning_rate": 9.204983035758187e-05, + "loss": 1.8827, + "step": 1287 + }, + { + "epoch": 0.20780897063568893, + "grad_norm": 5.6422505378723145, + "learning_rate": 9.203568848501327e-05, + "loss": 2.1591, + "step": 1288 + }, + { + "epoch": 0.2079703130041949, + "grad_norm": 4.1179938316345215, + "learning_rate": 9.202153513408162e-05, + "loss": 1.9835, + "step": 1289 + }, + { + "epoch": 0.20813165537270087, + "grad_norm": 5.341106414794922, + "learning_rate": 9.200737030865168e-05, + "loss": 2.3916, + "step": 1290 + }, + { + "epoch": 0.20829299774120685, + "grad_norm": 5.809384346008301, + "learning_rate": 9.199319401259131e-05, + "loss": 1.9919, + "step": 1291 + }, + { + "epoch": 0.2084543401097128, + "grad_norm": 5.8908514976501465, + "learning_rate": 9.197900624977156e-05, + "loss": 2.2108, + "step": 1292 + }, + { + "epoch": 0.20861568247821877, + "grad_norm": 5.788384437561035, + "learning_rate": 9.196480702406653e-05, + "loss": 2.1998, + "step": 1293 + }, + { + "epoch": 0.20877702484672475, + "grad_norm": 8.431769371032715, + "learning_rate": 9.195059633935352e-05, + "loss": 2.5768, + "step": 1294 + }, + { + "epoch": 0.2089383672152307, + "grad_norm": 4.767300605773926, + "learning_rate": 9.193637419951294e-05, + "loss": 1.9114, + "step": 1295 + }, + { + "epoch": 0.2090997095837367, + "grad_norm": 4.139121055603027, + "learning_rate": 9.19221406084283e-05, + "loss": 2.0196, + "step": 1296 + }, + { + "epoch": 0.20926105195224265, + "grad_norm": 4.451340675354004, + "learning_rate": 9.190789556998627e-05, + "loss": 1.9757, + "step": 1297 + }, + { + "epoch": 0.20942239432074863, + "grad_norm": 5.227597713470459, + "learning_rate": 9.189363908807663e-05, + "loss": 2.0621, + "step": 1298 + }, + { + "epoch": 0.2095837366892546, + "grad_norm": 5.614994049072266, + "learning_rate": 9.187937116659229e-05, + "loss": 2.3018, + "step": 1299 + }, + { + "epoch": 0.20974507905776058, + "grad_norm": 9.214066505432129, + "learning_rate": 9.186509180942928e-05, + "loss": 2.4219, + "step": 1300 + }, + { + "epoch": 0.20990642142626653, + "grad_norm": 5.049530982971191, + "learning_rate": 9.185080102048675e-05, + "loss": 2.4699, + "step": 1301 + }, + { + "epoch": 0.21006776379477252, + "grad_norm": 4.416045188903809, + "learning_rate": 9.1836498803667e-05, + "loss": 2.1083, + "step": 1302 + }, + { + "epoch": 0.21022910616327847, + "grad_norm": 4.00837516784668, + "learning_rate": 9.182218516287539e-05, + "loss": 1.8756, + "step": 1303 + }, + { + "epoch": 0.21039044853178446, + "grad_norm": 4.334080696105957, + "learning_rate": 9.180786010202045e-05, + "loss": 2.4178, + "step": 1304 + }, + { + "epoch": 0.21055179090029041, + "grad_norm": 3.3530056476593018, + "learning_rate": 9.179352362501384e-05, + "loss": 2.25, + "step": 1305 + }, + { + "epoch": 0.2107131332687964, + "grad_norm": 5.390597820281982, + "learning_rate": 9.177917573577026e-05, + "loss": 2.2067, + "step": 1306 + }, + { + "epoch": 0.21087447563730236, + "grad_norm": 6.295938491821289, + "learning_rate": 9.176481643820762e-05, + "loss": 2.1304, + "step": 1307 + }, + { + "epoch": 0.2110358180058083, + "grad_norm": 4.074714660644531, + "learning_rate": 9.17504457362469e-05, + "loss": 2.2887, + "step": 1308 + }, + { + "epoch": 0.2111971603743143, + "grad_norm": 4.701354026794434, + "learning_rate": 9.173606363381219e-05, + "loss": 2.4589, + "step": 1309 + }, + { + "epoch": 0.21135850274282025, + "grad_norm": 5.782101154327393, + "learning_rate": 9.172167013483068e-05, + "loss": 2.0267, + "step": 1310 + }, + { + "epoch": 0.21151984511132624, + "grad_norm": 4.437861919403076, + "learning_rate": 9.170726524323273e-05, + "loss": 2.1448, + "step": 1311 + }, + { + "epoch": 0.2116811874798322, + "grad_norm": 5.867735385894775, + "learning_rate": 9.169284896295174e-05, + "loss": 2.0455, + "step": 1312 + }, + { + "epoch": 0.21184252984833818, + "grad_norm": 3.6233768463134766, + "learning_rate": 9.167842129792428e-05, + "loss": 2.3327, + "step": 1313 + }, + { + "epoch": 0.21200387221684414, + "grad_norm": 7.002150535583496, + "learning_rate": 9.166398225208999e-05, + "loss": 2.0155, + "step": 1314 + }, + { + "epoch": 0.21216521458535012, + "grad_norm": 4.855229377746582, + "learning_rate": 9.164953182939162e-05, + "loss": 2.0364, + "step": 1315 + }, + { + "epoch": 0.21232655695385608, + "grad_norm": 5.568299293518066, + "learning_rate": 9.163507003377506e-05, + "loss": 2.155, + "step": 1316 + }, + { + "epoch": 0.21248789932236206, + "grad_norm": 5.044248580932617, + "learning_rate": 9.162059686918924e-05, + "loss": 2.2754, + "step": 1317 + }, + { + "epoch": 0.21264924169086802, + "grad_norm": 6.40256929397583, + "learning_rate": 9.160611233958629e-05, + "loss": 2.1125, + "step": 1318 + }, + { + "epoch": 0.212810584059374, + "grad_norm": 4.975361347198486, + "learning_rate": 9.159161644892135e-05, + "loss": 2.1204, + "step": 1319 + }, + { + "epoch": 0.21297192642787996, + "grad_norm": 4.8289031982421875, + "learning_rate": 9.157710920115273e-05, + "loss": 2.0861, + "step": 1320 + }, + { + "epoch": 0.21313326879638594, + "grad_norm": 3.993028163909912, + "learning_rate": 9.156259060024177e-05, + "loss": 2.1801, + "step": 1321 + }, + { + "epoch": 0.2132946111648919, + "grad_norm": 5.233036994934082, + "learning_rate": 9.1548060650153e-05, + "loss": 2.1827, + "step": 1322 + }, + { + "epoch": 0.21345595353339786, + "grad_norm": 4.861788749694824, + "learning_rate": 9.153351935485397e-05, + "loss": 2.155, + "step": 1323 + }, + { + "epoch": 0.21361729590190384, + "grad_norm": 4.867989540100098, + "learning_rate": 9.151896671831538e-05, + "loss": 2.3657, + "step": 1324 + }, + { + "epoch": 0.2137786382704098, + "grad_norm": 5.0348334312438965, + "learning_rate": 9.1504402744511e-05, + "loss": 2.1548, + "step": 1325 + }, + { + "epoch": 0.21393998063891578, + "grad_norm": 4.9925994873046875, + "learning_rate": 9.14898274374177e-05, + "loss": 1.9949, + "step": 1326 + }, + { + "epoch": 0.21410132300742174, + "grad_norm": 4.181024551391602, + "learning_rate": 9.147524080101544e-05, + "loss": 2.0769, + "step": 1327 + }, + { + "epoch": 0.21426266537592772, + "grad_norm": 4.714298248291016, + "learning_rate": 9.14606428392873e-05, + "loss": 2.1353, + "step": 1328 + }, + { + "epoch": 0.21442400774443368, + "grad_norm": 4.4157938957214355, + "learning_rate": 9.144603355621941e-05, + "loss": 2.2024, + "step": 1329 + }, + { + "epoch": 0.21458535011293967, + "grad_norm": 3.2527670860290527, + "learning_rate": 9.143141295580104e-05, + "loss": 1.9403, + "step": 1330 + }, + { + "epoch": 0.21474669248144562, + "grad_norm": 3.534852981567383, + "learning_rate": 9.14167810420245e-05, + "loss": 2.0353, + "step": 1331 + }, + { + "epoch": 0.2149080348499516, + "grad_norm": 4.721923828125, + "learning_rate": 9.140213781888524e-05, + "loss": 2.0659, + "step": 1332 + }, + { + "epoch": 0.21506937721845756, + "grad_norm": 4.635398864746094, + "learning_rate": 9.138748329038177e-05, + "loss": 2.2004, + "step": 1333 + }, + { + "epoch": 0.21523071958696355, + "grad_norm": 5.3266520500183105, + "learning_rate": 9.137281746051565e-05, + "loss": 2.8019, + "step": 1334 + }, + { + "epoch": 0.2153920619554695, + "grad_norm": 4.811481952667236, + "learning_rate": 9.135814033329162e-05, + "loss": 2.2637, + "step": 1335 + }, + { + "epoch": 0.2155534043239755, + "grad_norm": 4.6001458168029785, + "learning_rate": 9.134345191271742e-05, + "loss": 2.2032, + "step": 1336 + }, + { + "epoch": 0.21571474669248145, + "grad_norm": 4.366300106048584, + "learning_rate": 9.13287522028039e-05, + "loss": 1.971, + "step": 1337 + }, + { + "epoch": 0.2158760890609874, + "grad_norm": 4.729623794555664, + "learning_rate": 9.131404120756502e-05, + "loss": 1.9098, + "step": 1338 + }, + { + "epoch": 0.2160374314294934, + "grad_norm": 3.6516096591949463, + "learning_rate": 9.129931893101778e-05, + "loss": 2.0485, + "step": 1339 + }, + { + "epoch": 0.21619877379799934, + "grad_norm": 3.8406763076782227, + "learning_rate": 9.12845853771823e-05, + "loss": 2.0777, + "step": 1340 + }, + { + "epoch": 0.21636011616650533, + "grad_norm": 4.912896633148193, + "learning_rate": 9.126984055008172e-05, + "loss": 2.0741, + "step": 1341 + }, + { + "epoch": 0.21652145853501129, + "grad_norm": 4.041453838348389, + "learning_rate": 9.125508445374233e-05, + "loss": 2.1465, + "step": 1342 + }, + { + "epoch": 0.21668280090351727, + "grad_norm": 4.528754234313965, + "learning_rate": 9.124031709219346e-05, + "loss": 2.2577, + "step": 1343 + }, + { + "epoch": 0.21684414327202323, + "grad_norm": 4.499135971069336, + "learning_rate": 9.122553846946751e-05, + "loss": 2.2312, + "step": 1344 + }, + { + "epoch": 0.2170054856405292, + "grad_norm": 5.06335973739624, + "learning_rate": 9.121074858959997e-05, + "loss": 1.9082, + "step": 1345 + }, + { + "epoch": 0.21716682800903517, + "grad_norm": 4.791558742523193, + "learning_rate": 9.119594745662941e-05, + "loss": 2.006, + "step": 1346 + }, + { + "epoch": 0.21732817037754115, + "grad_norm": 4.483156204223633, + "learning_rate": 9.118113507459743e-05, + "loss": 2.1619, + "step": 1347 + }, + { + "epoch": 0.2174895127460471, + "grad_norm": 4.789119243621826, + "learning_rate": 9.116631144754877e-05, + "loss": 2.2854, + "step": 1348 + }, + { + "epoch": 0.2176508551145531, + "grad_norm": 5.603588104248047, + "learning_rate": 9.115147657953118e-05, + "loss": 2.3163, + "step": 1349 + }, + { + "epoch": 0.21781219748305905, + "grad_norm": 4.183100700378418, + "learning_rate": 9.113663047459553e-05, + "loss": 2.0677, + "step": 1350 + }, + { + "epoch": 0.21797353985156503, + "grad_norm": 4.857205390930176, + "learning_rate": 9.11217731367957e-05, + "loss": 2.3799, + "step": 1351 + }, + { + "epoch": 0.218134882220071, + "grad_norm": 3.7694199085235596, + "learning_rate": 9.110690457018868e-05, + "loss": 1.9902, + "step": 1352 + }, + { + "epoch": 0.21829622458857695, + "grad_norm": 4.458879470825195, + "learning_rate": 9.109202477883453e-05, + "loss": 1.9408, + "step": 1353 + }, + { + "epoch": 0.21845756695708293, + "grad_norm": 4.134251117706299, + "learning_rate": 9.107713376679634e-05, + "loss": 2.1774, + "step": 1354 + }, + { + "epoch": 0.2186189093255889, + "grad_norm": 4.250706195831299, + "learning_rate": 9.10622315381403e-05, + "loss": 2.1894, + "step": 1355 + }, + { + "epoch": 0.21878025169409487, + "grad_norm": 4.276532173156738, + "learning_rate": 9.104731809693563e-05, + "loss": 2.0367, + "step": 1356 + }, + { + "epoch": 0.21894159406260083, + "grad_norm": 5.151341915130615, + "learning_rate": 9.103239344725465e-05, + "loss": 2.1474, + "step": 1357 + }, + { + "epoch": 0.21910293643110682, + "grad_norm": 4.042210102081299, + "learning_rate": 9.10174575931727e-05, + "loss": 2.1367, + "step": 1358 + }, + { + "epoch": 0.21926427879961277, + "grad_norm": 4.783393859863281, + "learning_rate": 9.100251053876822e-05, + "loss": 1.93, + "step": 1359 + }, + { + "epoch": 0.21942562116811876, + "grad_norm": 4.063416957855225, + "learning_rate": 9.098755228812268e-05, + "loss": 1.977, + "step": 1360 + }, + { + "epoch": 0.2195869635366247, + "grad_norm": 6.108659744262695, + "learning_rate": 9.097258284532061e-05, + "loss": 2.1322, + "step": 1361 + }, + { + "epoch": 0.2197483059051307, + "grad_norm": 4.696874141693115, + "learning_rate": 9.09576022144496e-05, + "loss": 1.9444, + "step": 1362 + }, + { + "epoch": 0.21990964827363665, + "grad_norm": 4.419808387756348, + "learning_rate": 9.094261039960027e-05, + "loss": 2.1688, + "step": 1363 + }, + { + "epoch": 0.22007099064214264, + "grad_norm": 4.2608442306518555, + "learning_rate": 9.092760740486639e-05, + "loss": 2.2771, + "step": 1364 + }, + { + "epoch": 0.2202323330106486, + "grad_norm": 4.203035354614258, + "learning_rate": 9.091259323434465e-05, + "loss": 2.0907, + "step": 1365 + }, + { + "epoch": 0.22039367537915455, + "grad_norm": 5.264461040496826, + "learning_rate": 9.089756789213488e-05, + "loss": 2.1593, + "step": 1366 + }, + { + "epoch": 0.22055501774766054, + "grad_norm": 4.781998634338379, + "learning_rate": 9.088253138233993e-05, + "loss": 2.1517, + "step": 1367 + }, + { + "epoch": 0.2207163601161665, + "grad_norm": 4.442568778991699, + "learning_rate": 9.08674837090657e-05, + "loss": 2.0565, + "step": 1368 + }, + { + "epoch": 0.22087770248467248, + "grad_norm": 4.468411922454834, + "learning_rate": 9.085242487642116e-05, + "loss": 2.2194, + "step": 1369 + }, + { + "epoch": 0.22103904485317843, + "grad_norm": 4.508581161499023, + "learning_rate": 9.083735488851828e-05, + "loss": 1.977, + "step": 1370 + }, + { + "epoch": 0.22120038722168442, + "grad_norm": 4.347281455993652, + "learning_rate": 9.082227374947214e-05, + "loss": 2.312, + "step": 1371 + }, + { + "epoch": 0.22136172959019038, + "grad_norm": 5.029078483581543, + "learning_rate": 9.08071814634008e-05, + "loss": 2.2434, + "step": 1372 + }, + { + "epoch": 0.22152307195869636, + "grad_norm": 5.283694267272949, + "learning_rate": 9.079207803442542e-05, + "loss": 2.3074, + "step": 1373 + }, + { + "epoch": 0.22168441432720232, + "grad_norm": 6.727270126342773, + "learning_rate": 9.077696346667015e-05, + "loss": 2.2142, + "step": 1374 + }, + { + "epoch": 0.2218457566957083, + "grad_norm": 4.587703227996826, + "learning_rate": 9.076183776426224e-05, + "loss": 2.2143, + "step": 1375 + }, + { + "epoch": 0.22200709906421426, + "grad_norm": 5.426577568054199, + "learning_rate": 9.074670093133193e-05, + "loss": 2.2577, + "step": 1376 + }, + { + "epoch": 0.22216844143272024, + "grad_norm": 4.498953819274902, + "learning_rate": 9.073155297201252e-05, + "loss": 2.0291, + "step": 1377 + }, + { + "epoch": 0.2223297838012262, + "grad_norm": 4.776580333709717, + "learning_rate": 9.071639389044036e-05, + "loss": 2.0842, + "step": 1378 + }, + { + "epoch": 0.22249112616973218, + "grad_norm": 4.266298294067383, + "learning_rate": 9.070122369075481e-05, + "loss": 2.2213, + "step": 1379 + }, + { + "epoch": 0.22265246853823814, + "grad_norm": 4.398402214050293, + "learning_rate": 9.068604237709828e-05, + "loss": 1.9287, + "step": 1380 + }, + { + "epoch": 0.2228138109067441, + "grad_norm": 5.507858753204346, + "learning_rate": 9.067084995361623e-05, + "loss": 1.8418, + "step": 1381 + }, + { + "epoch": 0.22297515327525008, + "grad_norm": 4.711968421936035, + "learning_rate": 9.065564642445711e-05, + "loss": 2.0282, + "step": 1382 + }, + { + "epoch": 0.22313649564375604, + "grad_norm": 4.0508575439453125, + "learning_rate": 9.064043179377249e-05, + "loss": 2.3358, + "step": 1383 + }, + { + "epoch": 0.22329783801226202, + "grad_norm": 4.1921467781066895, + "learning_rate": 9.062520606571682e-05, + "loss": 1.908, + "step": 1384 + }, + { + "epoch": 0.22345918038076798, + "grad_norm": 4.855215549468994, + "learning_rate": 9.060996924444776e-05, + "loss": 2.2436, + "step": 1385 + }, + { + "epoch": 0.22362052274927396, + "grad_norm": 3.554455518722534, + "learning_rate": 9.059472133412587e-05, + "loss": 2.0395, + "step": 1386 + }, + { + "epoch": 0.22378186511777992, + "grad_norm": 5.087851047515869, + "learning_rate": 9.05794623389148e-05, + "loss": 2.0126, + "step": 1387 + }, + { + "epoch": 0.2239432074862859, + "grad_norm": 5.585208415985107, + "learning_rate": 9.056419226298117e-05, + "loss": 1.9502, + "step": 1388 + }, + { + "epoch": 0.22410454985479186, + "grad_norm": 4.503779411315918, + "learning_rate": 9.054891111049468e-05, + "loss": 2.0783, + "step": 1389 + }, + { + "epoch": 0.22426589222329785, + "grad_norm": 4.334665298461914, + "learning_rate": 9.053361888562807e-05, + "loss": 2.0684, + "step": 1390 + }, + { + "epoch": 0.2244272345918038, + "grad_norm": 5.303945541381836, + "learning_rate": 9.051831559255704e-05, + "loss": 2.1628, + "step": 1391 + }, + { + "epoch": 0.2245885769603098, + "grad_norm": 5.3270063400268555, + "learning_rate": 9.050300123546033e-05, + "loss": 2.048, + "step": 1392 + }, + { + "epoch": 0.22474991932881574, + "grad_norm": 5.550933837890625, + "learning_rate": 9.048767581851973e-05, + "loss": 2.3384, + "step": 1393 + }, + { + "epoch": 0.22491126169732173, + "grad_norm": 6.187167167663574, + "learning_rate": 9.047233934592005e-05, + "loss": 2.0726, + "step": 1394 + }, + { + "epoch": 0.22507260406582769, + "grad_norm": 4.272796630859375, + "learning_rate": 9.045699182184909e-05, + "loss": 2.0404, + "step": 1395 + }, + { + "epoch": 0.22523394643433364, + "grad_norm": 4.191316604614258, + "learning_rate": 9.044163325049766e-05, + "loss": 2.1977, + "step": 1396 + }, + { + "epoch": 0.22539528880283963, + "grad_norm": 4.133779525756836, + "learning_rate": 9.042626363605964e-05, + "loss": 2.0868, + "step": 1397 + }, + { + "epoch": 0.22555663117134558, + "grad_norm": 5.127355575561523, + "learning_rate": 9.041088298273186e-05, + "loss": 1.9973, + "step": 1398 + }, + { + "epoch": 0.22571797353985157, + "grad_norm": 4.5121073722839355, + "learning_rate": 9.039549129471423e-05, + "loss": 2.324, + "step": 1399 + }, + { + "epoch": 0.22587931590835753, + "grad_norm": 4.350682258605957, + "learning_rate": 9.038008857620963e-05, + "loss": 2.159, + "step": 1400 + }, + { + "epoch": 0.2260406582768635, + "grad_norm": 4.376583576202393, + "learning_rate": 9.036467483142394e-05, + "loss": 2.1031, + "step": 1401 + }, + { + "epoch": 0.22620200064536947, + "grad_norm": 4.949342727661133, + "learning_rate": 9.034925006456611e-05, + "loss": 2.3473, + "step": 1402 + }, + { + "epoch": 0.22636334301387545, + "grad_norm": 5.014150142669678, + "learning_rate": 9.033381427984803e-05, + "loss": 2.3026, + "step": 1403 + }, + { + "epoch": 0.2265246853823814, + "grad_norm": 6.434081554412842, + "learning_rate": 9.031836748148465e-05, + "loss": 2.0081, + "step": 1404 + }, + { + "epoch": 0.2266860277508874, + "grad_norm": 6.950235843658447, + "learning_rate": 9.030290967369392e-05, + "loss": 2.3194, + "step": 1405 + }, + { + "epoch": 0.22684737011939335, + "grad_norm": 6.151453971862793, + "learning_rate": 9.028744086069674e-05, + "loss": 2.1002, + "step": 1406 + }, + { + "epoch": 0.22700871248789933, + "grad_norm": 7.216166973114014, + "learning_rate": 9.027196104671712e-05, + "loss": 2.1609, + "step": 1407 + }, + { + "epoch": 0.2271700548564053, + "grad_norm": 3.784708023071289, + "learning_rate": 9.025647023598196e-05, + "loss": 2.3474, + "step": 1408 + }, + { + "epoch": 0.22733139722491127, + "grad_norm": 5.03326416015625, + "learning_rate": 9.024096843272124e-05, + "loss": 2.1629, + "step": 1409 + }, + { + "epoch": 0.22749273959341723, + "grad_norm": 4.098717212677002, + "learning_rate": 9.022545564116793e-05, + "loss": 1.9903, + "step": 1410 + }, + { + "epoch": 0.2276540819619232, + "grad_norm": 5.1551361083984375, + "learning_rate": 9.020993186555796e-05, + "loss": 2.1439, + "step": 1411 + }, + { + "epoch": 0.22781542433042917, + "grad_norm": 5.542691707611084, + "learning_rate": 9.019439711013031e-05, + "loss": 2.0417, + "step": 1412 + }, + { + "epoch": 0.22797676669893513, + "grad_norm": 4.643231391906738, + "learning_rate": 9.017885137912694e-05, + "loss": 2.0455, + "step": 1413 + }, + { + "epoch": 0.2281381090674411, + "grad_norm": 4.5204267501831055, + "learning_rate": 9.016329467679281e-05, + "loss": 2.1043, + "step": 1414 + }, + { + "epoch": 0.22829945143594707, + "grad_norm": 4.192257881164551, + "learning_rate": 9.014772700737584e-05, + "loss": 2.1372, + "step": 1415 + }, + { + "epoch": 0.22846079380445306, + "grad_norm": 4.215038776397705, + "learning_rate": 9.013214837512697e-05, + "loss": 2.002, + "step": 1416 + }, + { + "epoch": 0.228622136172959, + "grad_norm": 7.06200647354126, + "learning_rate": 9.011655878430019e-05, + "loss": 2.2708, + "step": 1417 + }, + { + "epoch": 0.228783478541465, + "grad_norm": 4.816001892089844, + "learning_rate": 9.010095823915237e-05, + "loss": 2.0151, + "step": 1418 + }, + { + "epoch": 0.22894482090997095, + "grad_norm": 5.204929828643799, + "learning_rate": 9.008534674394348e-05, + "loss": 1.9334, + "step": 1419 + }, + { + "epoch": 0.22910616327847694, + "grad_norm": 4.745861530303955, + "learning_rate": 9.006972430293639e-05, + "loss": 2.2435, + "step": 1420 + }, + { + "epoch": 0.2292675056469829, + "grad_norm": 5.817446708679199, + "learning_rate": 9.005409092039703e-05, + "loss": 1.959, + "step": 1421 + }, + { + "epoch": 0.22942884801548888, + "grad_norm": 4.074014186859131, + "learning_rate": 9.003844660059428e-05, + "loss": 1.9885, + "step": 1422 + }, + { + "epoch": 0.22959019038399484, + "grad_norm": 4.622272491455078, + "learning_rate": 9.00227913478e-05, + "loss": 2.1142, + "step": 1423 + }, + { + "epoch": 0.22975153275250082, + "grad_norm": 4.587558269500732, + "learning_rate": 9.000712516628907e-05, + "loss": 2.1376, + "step": 1424 + }, + { + "epoch": 0.22991287512100678, + "grad_norm": 4.234425067901611, + "learning_rate": 8.999144806033932e-05, + "loss": 1.9972, + "step": 1425 + }, + { + "epoch": 0.23007421748951273, + "grad_norm": 4.4645280838012695, + "learning_rate": 8.997576003423159e-05, + "loss": 1.9112, + "step": 1426 + }, + { + "epoch": 0.23023555985801872, + "grad_norm": 4.017051696777344, + "learning_rate": 8.996006109224968e-05, + "loss": 2.0565, + "step": 1427 + }, + { + "epoch": 0.23039690222652467, + "grad_norm": 5.7819132804870605, + "learning_rate": 8.994435123868038e-05, + "loss": 2.3573, + "step": 1428 + }, + { + "epoch": 0.23055824459503066, + "grad_norm": 4.956873416900635, + "learning_rate": 8.992863047781345e-05, + "loss": 2.1731, + "step": 1429 + }, + { + "epoch": 0.23071958696353662, + "grad_norm": 4.781367301940918, + "learning_rate": 8.991289881394167e-05, + "loss": 2.189, + "step": 1430 + }, + { + "epoch": 0.2308809293320426, + "grad_norm": 4.809574604034424, + "learning_rate": 8.989715625136072e-05, + "loss": 2.0249, + "step": 1431 + }, + { + "epoch": 0.23104227170054856, + "grad_norm": 5.7442755699157715, + "learning_rate": 8.988140279436934e-05, + "loss": 2.0393, + "step": 1432 + }, + { + "epoch": 0.23120361406905454, + "grad_norm": 5.626896858215332, + "learning_rate": 8.986563844726918e-05, + "loss": 2.0689, + "step": 1433 + }, + { + "epoch": 0.2313649564375605, + "grad_norm": 5.195352077484131, + "learning_rate": 8.984986321436491e-05, + "loss": 1.996, + "step": 1434 + }, + { + "epoch": 0.23152629880606648, + "grad_norm": 4.860560417175293, + "learning_rate": 8.983407709996414e-05, + "loss": 2.1372, + "step": 1435 + }, + { + "epoch": 0.23168764117457244, + "grad_norm": 3.8607332706451416, + "learning_rate": 8.981828010837745e-05, + "loss": 1.8863, + "step": 1436 + }, + { + "epoch": 0.23184898354307842, + "grad_norm": 3.993809223175049, + "learning_rate": 8.980247224391843e-05, + "loss": 1.9653, + "step": 1437 + }, + { + "epoch": 0.23201032591158438, + "grad_norm": 4.640903949737549, + "learning_rate": 8.978665351090358e-05, + "loss": 2.0526, + "step": 1438 + }, + { + "epoch": 0.23217166828009037, + "grad_norm": 7.137829303741455, + "learning_rate": 8.977082391365243e-05, + "loss": 2.0915, + "step": 1439 + }, + { + "epoch": 0.23233301064859632, + "grad_norm": 4.12993860244751, + "learning_rate": 8.975498345648745e-05, + "loss": 2.1223, + "step": 1440 + }, + { + "epoch": 0.23249435301710228, + "grad_norm": 3.8784449100494385, + "learning_rate": 8.973913214373404e-05, + "loss": 2.2804, + "step": 1441 + }, + { + "epoch": 0.23265569538560826, + "grad_norm": 4.640244007110596, + "learning_rate": 8.972326997972062e-05, + "loss": 2.1302, + "step": 1442 + }, + { + "epoch": 0.23281703775411422, + "grad_norm": 5.461720943450928, + "learning_rate": 8.970739696877854e-05, + "loss": 1.8691, + "step": 1443 + }, + { + "epoch": 0.2329783801226202, + "grad_norm": 3.66225004196167, + "learning_rate": 8.969151311524214e-05, + "loss": 2.1667, + "step": 1444 + }, + { + "epoch": 0.23313972249112616, + "grad_norm": 5.224299430847168, + "learning_rate": 8.967561842344867e-05, + "loss": 2.0891, + "step": 1445 + }, + { + "epoch": 0.23330106485963215, + "grad_norm": 4.650608062744141, + "learning_rate": 8.96597128977384e-05, + "loss": 2.1573, + "step": 1446 + }, + { + "epoch": 0.2334624072281381, + "grad_norm": 5.109067916870117, + "learning_rate": 8.964379654245452e-05, + "loss": 1.9191, + "step": 1447 + }, + { + "epoch": 0.2336237495966441, + "grad_norm": 8.680831909179688, + "learning_rate": 8.962786936194318e-05, + "loss": 2.0666, + "step": 1448 + }, + { + "epoch": 0.23378509196515004, + "grad_norm": 4.50174617767334, + "learning_rate": 8.96119313605535e-05, + "loss": 2.0207, + "step": 1449 + }, + { + "epoch": 0.23394643433365603, + "grad_norm": 3.517400026321411, + "learning_rate": 8.959598254263754e-05, + "loss": 1.9692, + "step": 1450 + }, + { + "epoch": 0.23410777670216198, + "grad_norm": 4.062027454376221, + "learning_rate": 8.958002291255035e-05, + "loss": 2.2276, + "step": 1451 + }, + { + "epoch": 0.23426911907066797, + "grad_norm": 4.07244873046875, + "learning_rate": 8.956405247464987e-05, + "loss": 2.1075, + "step": 1452 + }, + { + "epoch": 0.23443046143917393, + "grad_norm": 3.9593074321746826, + "learning_rate": 8.954807123329704e-05, + "loss": 2.1666, + "step": 1453 + }, + { + "epoch": 0.2345918038076799, + "grad_norm": 4.045650005340576, + "learning_rate": 8.953207919285573e-05, + "loss": 2.0366, + "step": 1454 + }, + { + "epoch": 0.23475314617618587, + "grad_norm": 3.8713786602020264, + "learning_rate": 8.951607635769275e-05, + "loss": 2.041, + "step": 1455 + }, + { + "epoch": 0.23491448854469182, + "grad_norm": 3.8852434158325195, + "learning_rate": 8.95000627321779e-05, + "loss": 2.0126, + "step": 1456 + }, + { + "epoch": 0.2350758309131978, + "grad_norm": 4.7137908935546875, + "learning_rate": 8.948403832068389e-05, + "loss": 2.0859, + "step": 1457 + }, + { + "epoch": 0.23523717328170377, + "grad_norm": 5.0109333992004395, + "learning_rate": 8.946800312758638e-05, + "loss": 2.4074, + "step": 1458 + }, + { + "epoch": 0.23539851565020975, + "grad_norm": 4.378200054168701, + "learning_rate": 8.945195715726396e-05, + "loss": 2.3619, + "step": 1459 + }, + { + "epoch": 0.2355598580187157, + "grad_norm": 4.874209403991699, + "learning_rate": 8.943590041409822e-05, + "loss": 1.8734, + "step": 1460 + }, + { + "epoch": 0.2357212003872217, + "grad_norm": 3.297186851501465, + "learning_rate": 8.94198329024736e-05, + "loss": 2.1214, + "step": 1461 + }, + { + "epoch": 0.23588254275572765, + "grad_norm": 3.798933744430542, + "learning_rate": 8.940375462677757e-05, + "loss": 2.1703, + "step": 1462 + }, + { + "epoch": 0.23604388512423363, + "grad_norm": 3.941425085067749, + "learning_rate": 8.93876655914005e-05, + "loss": 1.998, + "step": 1463 + }, + { + "epoch": 0.2362052274927396, + "grad_norm": 4.085728645324707, + "learning_rate": 8.937156580073569e-05, + "loss": 2.0896, + "step": 1464 + }, + { + "epoch": 0.23636656986124557, + "grad_norm": 4.89539909362793, + "learning_rate": 8.935545525917937e-05, + "loss": 2.005, + "step": 1465 + }, + { + "epoch": 0.23652791222975153, + "grad_norm": 5.450769901275635, + "learning_rate": 8.933933397113075e-05, + "loss": 2.1447, + "step": 1466 + }, + { + "epoch": 0.23668925459825751, + "grad_norm": 4.819671630859375, + "learning_rate": 8.932320194099194e-05, + "loss": 2.2535, + "step": 1467 + }, + { + "epoch": 0.23685059696676347, + "grad_norm": 6.937513828277588, + "learning_rate": 8.930705917316797e-05, + "loss": 2.2115, + "step": 1468 + }, + { + "epoch": 0.23701193933526943, + "grad_norm": 6.169703483581543, + "learning_rate": 8.929090567206685e-05, + "loss": 1.9505, + "step": 1469 + }, + { + "epoch": 0.2371732817037754, + "grad_norm": 4.902163982391357, + "learning_rate": 8.927474144209947e-05, + "loss": 2.0367, + "step": 1470 + }, + { + "epoch": 0.23733462407228137, + "grad_norm": 6.251935958862305, + "learning_rate": 8.92585664876797e-05, + "loss": 2.0549, + "step": 1471 + }, + { + "epoch": 0.23749596644078735, + "grad_norm": 4.960795879364014, + "learning_rate": 8.924238081322427e-05, + "loss": 1.9314, + "step": 1472 + }, + { + "epoch": 0.2376573088092933, + "grad_norm": 4.389493465423584, + "learning_rate": 8.922618442315291e-05, + "loss": 2.1619, + "step": 1473 + }, + { + "epoch": 0.2378186511777993, + "grad_norm": 3.819166421890259, + "learning_rate": 8.920997732188823e-05, + "loss": 2.1415, + "step": 1474 + }, + { + "epoch": 0.23797999354630525, + "grad_norm": 4.902157783508301, + "learning_rate": 8.919375951385579e-05, + "loss": 2.1946, + "step": 1475 + }, + { + "epoch": 0.23814133591481124, + "grad_norm": 4.391149997711182, + "learning_rate": 8.917753100348405e-05, + "loss": 1.8695, + "step": 1476 + }, + { + "epoch": 0.2383026782833172, + "grad_norm": 4.471001625061035, + "learning_rate": 8.916129179520442e-05, + "loss": 2.2006, + "step": 1477 + }, + { + "epoch": 0.23846402065182318, + "grad_norm": 3.5523102283477783, + "learning_rate": 8.914504189345119e-05, + "loss": 2.0731, + "step": 1478 + }, + { + "epoch": 0.23862536302032913, + "grad_norm": 3.7575554847717285, + "learning_rate": 8.912878130266162e-05, + "loss": 2.3713, + "step": 1479 + }, + { + "epoch": 0.23878670538883512, + "grad_norm": 4.426853179931641, + "learning_rate": 8.911251002727588e-05, + "loss": 1.8838, + "step": 1480 + }, + { + "epoch": 0.23894804775734108, + "grad_norm": 4.108395576477051, + "learning_rate": 8.909622807173698e-05, + "loss": 2.2149, + "step": 1481 + }, + { + "epoch": 0.23910939012584706, + "grad_norm": 4.647747993469238, + "learning_rate": 8.907993544049098e-05, + "loss": 2.0565, + "step": 1482 + }, + { + "epoch": 0.23927073249435302, + "grad_norm": 3.7445273399353027, + "learning_rate": 8.906363213798674e-05, + "loss": 2.2598, + "step": 1483 + }, + { + "epoch": 0.23943207486285897, + "grad_norm": 4.759930610656738, + "learning_rate": 8.904731816867609e-05, + "loss": 2.1619, + "step": 1484 + }, + { + "epoch": 0.23959341723136496, + "grad_norm": 6.480990886688232, + "learning_rate": 8.903099353701376e-05, + "loss": 2.2404, + "step": 1485 + }, + { + "epoch": 0.23975475959987091, + "grad_norm": 4.645752429962158, + "learning_rate": 8.90146582474574e-05, + "loss": 2.1223, + "step": 1486 + }, + { + "epoch": 0.2399161019683769, + "grad_norm": 4.475593566894531, + "learning_rate": 8.899831230446754e-05, + "loss": 2.1774, + "step": 1487 + }, + { + "epoch": 0.24007744433688286, + "grad_norm": 3.9450621604919434, + "learning_rate": 8.898195571250768e-05, + "loss": 2.2844, + "step": 1488 + }, + { + "epoch": 0.24023878670538884, + "grad_norm": 4.216766357421875, + "learning_rate": 8.896558847604414e-05, + "loss": 2.1326, + "step": 1489 + }, + { + "epoch": 0.2404001290738948, + "grad_norm": 4.920373916625977, + "learning_rate": 8.894921059954622e-05, + "loss": 1.9147, + "step": 1490 + }, + { + "epoch": 0.24056147144240078, + "grad_norm": 5.657355785369873, + "learning_rate": 8.893282208748612e-05, + "loss": 1.9171, + "step": 1491 + }, + { + "epoch": 0.24072281381090674, + "grad_norm": 3.456425666809082, + "learning_rate": 8.891642294433891e-05, + "loss": 2.079, + "step": 1492 + }, + { + "epoch": 0.24088415617941272, + "grad_norm": 4.492105007171631, + "learning_rate": 8.890001317458257e-05, + "loss": 2.1169, + "step": 1493 + }, + { + "epoch": 0.24104549854791868, + "grad_norm": 5.125545501708984, + "learning_rate": 8.888359278269798e-05, + "loss": 2.0139, + "step": 1494 + }, + { + "epoch": 0.24120684091642466, + "grad_norm": 4.284847736358643, + "learning_rate": 8.886716177316895e-05, + "loss": 1.9375, + "step": 1495 + }, + { + "epoch": 0.24136818328493062, + "grad_norm": 4.1635212898254395, + "learning_rate": 8.885072015048217e-05, + "loss": 2.2006, + "step": 1496 + }, + { + "epoch": 0.2415295256534366, + "grad_norm": 5.857314586639404, + "learning_rate": 8.883426791912723e-05, + "loss": 2.0958, + "step": 1497 + }, + { + "epoch": 0.24169086802194256, + "grad_norm": 4.1824259757995605, + "learning_rate": 8.88178050835966e-05, + "loss": 2.3473, + "step": 1498 + }, + { + "epoch": 0.24185221039044852, + "grad_norm": 4.814222812652588, + "learning_rate": 8.88013316483857e-05, + "loss": 2.2205, + "step": 1499 + }, + { + "epoch": 0.2420135527589545, + "grad_norm": 6.895967960357666, + "learning_rate": 8.878484761799273e-05, + "loss": 2.5106, + "step": 1500 + }, + { + "epoch": 0.24217489512746046, + "grad_norm": 4.180379390716553, + "learning_rate": 8.876835299691891e-05, + "loss": 2.0276, + "step": 1501 + }, + { + "epoch": 0.24233623749596644, + "grad_norm": 4.665975570678711, + "learning_rate": 8.875184778966829e-05, + "loss": 1.888, + "step": 1502 + }, + { + "epoch": 0.2424975798644724, + "grad_norm": 4.229085922241211, + "learning_rate": 8.873533200074784e-05, + "loss": 2.3136, + "step": 1503 + }, + { + "epoch": 0.24265892223297839, + "grad_norm": 5.273557186126709, + "learning_rate": 8.871880563466736e-05, + "loss": 2.1664, + "step": 1504 + }, + { + "epoch": 0.24282026460148434, + "grad_norm": 3.392350196838379, + "learning_rate": 8.870226869593961e-05, + "loss": 2.0867, + "step": 1505 + }, + { + "epoch": 0.24298160696999033, + "grad_norm": 4.525039196014404, + "learning_rate": 8.86857211890802e-05, + "loss": 1.8333, + "step": 1506 + }, + { + "epoch": 0.24314294933849628, + "grad_norm": 3.7613155841827393, + "learning_rate": 8.86691631186076e-05, + "loss": 2.1106, + "step": 1507 + }, + { + "epoch": 0.24330429170700227, + "grad_norm": 5.067121982574463, + "learning_rate": 8.865259448904324e-05, + "loss": 2.0235, + "step": 1508 + }, + { + "epoch": 0.24346563407550822, + "grad_norm": 3.9605300426483154, + "learning_rate": 8.863601530491137e-05, + "loss": 2.0636, + "step": 1509 + }, + { + "epoch": 0.2436269764440142, + "grad_norm": 4.152029037475586, + "learning_rate": 8.861942557073912e-05, + "loss": 2.231, + "step": 1510 + }, + { + "epoch": 0.24378831881252017, + "grad_norm": 4.132868766784668, + "learning_rate": 8.860282529105657e-05, + "loss": 2.0088, + "step": 1511 + }, + { + "epoch": 0.24394966118102615, + "grad_norm": 5.009464740753174, + "learning_rate": 8.858621447039657e-05, + "loss": 2.2279, + "step": 1512 + }, + { + "epoch": 0.2441110035495321, + "grad_norm": 3.118217706680298, + "learning_rate": 8.856959311329495e-05, + "loss": 2.0143, + "step": 1513 + }, + { + "epoch": 0.24427234591803806, + "grad_norm": 5.521666526794434, + "learning_rate": 8.855296122429038e-05, + "loss": 1.9785, + "step": 1514 + }, + { + "epoch": 0.24443368828654405, + "grad_norm": 6.3549485206604, + "learning_rate": 8.853631880792436e-05, + "loss": 2.0396, + "step": 1515 + }, + { + "epoch": 0.24459503065505, + "grad_norm": 3.5283477306365967, + "learning_rate": 8.851966586874138e-05, + "loss": 2.0881, + "step": 1516 + }, + { + "epoch": 0.244756373023556, + "grad_norm": 4.201773643493652, + "learning_rate": 8.850300241128866e-05, + "loss": 2.0129, + "step": 1517 + }, + { + "epoch": 0.24491771539206195, + "grad_norm": 5.038053512573242, + "learning_rate": 8.848632844011639e-05, + "loss": 2.2061, + "step": 1518 + }, + { + "epoch": 0.24507905776056793, + "grad_norm": 4.533333778381348, + "learning_rate": 8.846964395977762e-05, + "loss": 1.9826, + "step": 1519 + }, + { + "epoch": 0.2452404001290739, + "grad_norm": 4.284814357757568, + "learning_rate": 8.845294897482822e-05, + "loss": 2.1562, + "step": 1520 + }, + { + "epoch": 0.24540174249757987, + "grad_norm": 5.211300849914551, + "learning_rate": 8.843624348982698e-05, + "loss": 2.3218, + "step": 1521 + }, + { + "epoch": 0.24556308486608583, + "grad_norm": 6.453266620635986, + "learning_rate": 8.841952750933554e-05, + "loss": 2.0731, + "step": 1522 + }, + { + "epoch": 0.2457244272345918, + "grad_norm": 4.951452255249023, + "learning_rate": 8.84028010379184e-05, + "loss": 2.3241, + "step": 1523 + }, + { + "epoch": 0.24588576960309777, + "grad_norm": 5.738493919372559, + "learning_rate": 8.838606408014292e-05, + "loss": 2.0992, + "step": 1524 + }, + { + "epoch": 0.24604711197160375, + "grad_norm": 4.673913478851318, + "learning_rate": 8.836931664057935e-05, + "loss": 2.332, + "step": 1525 + }, + { + "epoch": 0.2462084543401097, + "grad_norm": 3.725369930267334, + "learning_rate": 8.835255872380078e-05, + "loss": 2.0166, + "step": 1526 + }, + { + "epoch": 0.2463697967086157, + "grad_norm": 4.07654333114624, + "learning_rate": 8.833579033438316e-05, + "loss": 2.0895, + "step": 1527 + }, + { + "epoch": 0.24653113907712165, + "grad_norm": 3.596895933151245, + "learning_rate": 8.831901147690532e-05, + "loss": 1.9721, + "step": 1528 + }, + { + "epoch": 0.2466924814456276, + "grad_norm": 5.355297565460205, + "learning_rate": 8.83022221559489e-05, + "loss": 2.257, + "step": 1529 + }, + { + "epoch": 0.2468538238141336, + "grad_norm": 4.837414741516113, + "learning_rate": 8.828542237609846e-05, + "loss": 2.064, + "step": 1530 + }, + { + "epoch": 0.24701516618263955, + "grad_norm": 5.691197395324707, + "learning_rate": 8.82686121419414e-05, + "loss": 2.2296, + "step": 1531 + }, + { + "epoch": 0.24717650855114554, + "grad_norm": 6.874438762664795, + "learning_rate": 8.825179145806794e-05, + "loss": 1.9951, + "step": 1532 + }, + { + "epoch": 0.2473378509196515, + "grad_norm": 5.810023784637451, + "learning_rate": 8.823496032907116e-05, + "loss": 2.2515, + "step": 1533 + }, + { + "epoch": 0.24749919328815748, + "grad_norm": 6.254134178161621, + "learning_rate": 8.821811875954704e-05, + "loss": 2.3287, + "step": 1534 + }, + { + "epoch": 0.24766053565666343, + "grad_norm": 4.213715076446533, + "learning_rate": 8.820126675409435e-05, + "loss": 2.0134, + "step": 1535 + }, + { + "epoch": 0.24782187802516942, + "grad_norm": 8.564623832702637, + "learning_rate": 8.818440431731476e-05, + "loss": 2.1856, + "step": 1536 + }, + { + "epoch": 0.24798322039367537, + "grad_norm": 6.99215030670166, + "learning_rate": 8.816753145381276e-05, + "loss": 2.3262, + "step": 1537 + }, + { + "epoch": 0.24814456276218136, + "grad_norm": 5.778186321258545, + "learning_rate": 8.815064816819569e-05, + "loss": 1.9285, + "step": 1538 + }, + { + "epoch": 0.24830590513068732, + "grad_norm": 3.7695446014404297, + "learning_rate": 8.813375446507373e-05, + "loss": 2.2911, + "step": 1539 + }, + { + "epoch": 0.2484672474991933, + "grad_norm": 3.9019341468811035, + "learning_rate": 8.811685034905993e-05, + "loss": 1.9976, + "step": 1540 + }, + { + "epoch": 0.24862858986769926, + "grad_norm": 4.127923965454102, + "learning_rate": 8.809993582477016e-05, + "loss": 2.4042, + "step": 1541 + }, + { + "epoch": 0.24878993223620524, + "grad_norm": 6.135165214538574, + "learning_rate": 8.808301089682315e-05, + "loss": 2.1746, + "step": 1542 + }, + { + "epoch": 0.2489512746047112, + "grad_norm": 4.654680252075195, + "learning_rate": 8.806607556984044e-05, + "loss": 2.1586, + "step": 1543 + }, + { + "epoch": 0.24911261697321715, + "grad_norm": 4.3369550704956055, + "learning_rate": 8.804912984844645e-05, + "loss": 2.0705, + "step": 1544 + }, + { + "epoch": 0.24927395934172314, + "grad_norm": 3.8217198848724365, + "learning_rate": 8.80321737372684e-05, + "loss": 1.9911, + "step": 1545 + }, + { + "epoch": 0.2494353017102291, + "grad_norm": 4.708687782287598, + "learning_rate": 8.801520724093638e-05, + "loss": 2.1225, + "step": 1546 + }, + { + "epoch": 0.24959664407873508, + "grad_norm": 3.5129055976867676, + "learning_rate": 8.79982303640833e-05, + "loss": 2.0926, + "step": 1547 + }, + { + "epoch": 0.24975798644724104, + "grad_norm": 4.3811540603637695, + "learning_rate": 8.79812431113449e-05, + "loss": 2.1391, + "step": 1548 + }, + { + "epoch": 0.24991932881574702, + "grad_norm": 6.042469501495361, + "learning_rate": 8.796424548735974e-05, + "loss": 2.0243, + "step": 1549 + }, + { + "epoch": 0.250080671184253, + "grad_norm": 3.6431429386138916, + "learning_rate": 8.794723749676927e-05, + "loss": 2.1022, + "step": 1550 + }, + { + "epoch": 0.25024201355275894, + "grad_norm": 5.046748638153076, + "learning_rate": 8.793021914421771e-05, + "loss": 2.1472, + "step": 1551 + }, + { + "epoch": 0.25040335592126495, + "grad_norm": 5.241067886352539, + "learning_rate": 8.791319043435214e-05, + "loss": 2.1424, + "step": 1552 + }, + { + "epoch": 0.2505646982897709, + "grad_norm": 4.297329425811768, + "learning_rate": 8.789615137182244e-05, + "loss": 2.029, + "step": 1553 + }, + { + "epoch": 0.25072604065827686, + "grad_norm": 5.346684455871582, + "learning_rate": 8.787910196128134e-05, + "loss": 2.1146, + "step": 1554 + }, + { + "epoch": 0.2508873830267828, + "grad_norm": 5.6350016593933105, + "learning_rate": 8.78620422073844e-05, + "loss": 2.3859, + "step": 1555 + }, + { + "epoch": 0.25104872539528883, + "grad_norm": 5.491115570068359, + "learning_rate": 8.784497211479001e-05, + "loss": 2.1892, + "step": 1556 + }, + { + "epoch": 0.2512100677637948, + "grad_norm": 4.584228515625, + "learning_rate": 8.782789168815937e-05, + "loss": 2.0268, + "step": 1557 + }, + { + "epoch": 0.25137141013230074, + "grad_norm": 7.174254417419434, + "learning_rate": 8.781080093215645e-05, + "loss": 2.1958, + "step": 1558 + }, + { + "epoch": 0.2515327525008067, + "grad_norm": 6.113037586212158, + "learning_rate": 8.779369985144816e-05, + "loss": 1.9857, + "step": 1559 + }, + { + "epoch": 0.25169409486931266, + "grad_norm": 4.866761207580566, + "learning_rate": 8.77765884507041e-05, + "loss": 2.0729, + "step": 1560 + }, + { + "epoch": 0.25185543723781867, + "grad_norm": 5.130221843719482, + "learning_rate": 8.775946673459681e-05, + "loss": 2.0158, + "step": 1561 + }, + { + "epoch": 0.2520167796063246, + "grad_norm": 4.172994136810303, + "learning_rate": 8.774233470780154e-05, + "loss": 2.114, + "step": 1562 + }, + { + "epoch": 0.2521781219748306, + "grad_norm": 4.171538352966309, + "learning_rate": 8.772519237499642e-05, + "loss": 1.9266, + "step": 1563 + }, + { + "epoch": 0.25233946434333654, + "grad_norm": 6.2082929611206055, + "learning_rate": 8.770803974086237e-05, + "loss": 2.2176, + "step": 1564 + }, + { + "epoch": 0.25250080671184255, + "grad_norm": 5.585021495819092, + "learning_rate": 8.769087681008311e-05, + "loss": 2.1608, + "step": 1565 + }, + { + "epoch": 0.2526621490803485, + "grad_norm": 4.160464763641357, + "learning_rate": 8.767370358734522e-05, + "loss": 2.0601, + "step": 1566 + }, + { + "epoch": 0.25282349144885446, + "grad_norm": 5.117087364196777, + "learning_rate": 8.765652007733805e-05, + "loss": 2.1567, + "step": 1567 + }, + { + "epoch": 0.2529848338173604, + "grad_norm": 4.678501129150391, + "learning_rate": 8.763932628475378e-05, + "loss": 2.1355, + "step": 1568 + }, + { + "epoch": 0.25314617618586643, + "grad_norm": 3.966777801513672, + "learning_rate": 8.762212221428736e-05, + "loss": 2.1416, + "step": 1569 + }, + { + "epoch": 0.2533075185543724, + "grad_norm": 5.832237243652344, + "learning_rate": 8.760490787063659e-05, + "loss": 2.0834, + "step": 1570 + }, + { + "epoch": 0.25346886092287835, + "grad_norm": 9.335321426391602, + "learning_rate": 8.758768325850206e-05, + "loss": 1.8407, + "step": 1571 + }, + { + "epoch": 0.2536302032913843, + "grad_norm": 4.441195487976074, + "learning_rate": 8.757044838258715e-05, + "loss": 1.8863, + "step": 1572 + }, + { + "epoch": 0.25379154565989026, + "grad_norm": 5.355289936065674, + "learning_rate": 8.755320324759808e-05, + "loss": 2.0587, + "step": 1573 + }, + { + "epoch": 0.2539528880283963, + "grad_norm": 4.481767654418945, + "learning_rate": 8.753594785824383e-05, + "loss": 1.917, + "step": 1574 + }, + { + "epoch": 0.25411423039690223, + "grad_norm": 4.7891387939453125, + "learning_rate": 8.75186822192362e-05, + "loss": 1.9565, + "step": 1575 + }, + { + "epoch": 0.2542755727654082, + "grad_norm": 5.2641921043396, + "learning_rate": 8.750140633528978e-05, + "loss": 2.3175, + "step": 1576 + }, + { + "epoch": 0.25443691513391414, + "grad_norm": 4.2695441246032715, + "learning_rate": 8.748412021112197e-05, + "loss": 1.8745, + "step": 1577 + }, + { + "epoch": 0.25459825750242016, + "grad_norm": 4.857851982116699, + "learning_rate": 8.746682385145295e-05, + "loss": 1.8918, + "step": 1578 + }, + { + "epoch": 0.2547595998709261, + "grad_norm": 4.307529926300049, + "learning_rate": 8.744951726100573e-05, + "loss": 2.0401, + "step": 1579 + }, + { + "epoch": 0.25492094223943207, + "grad_norm": 3.3972277641296387, + "learning_rate": 8.743220044450604e-05, + "loss": 2.0949, + "step": 1580 + }, + { + "epoch": 0.255082284607938, + "grad_norm": 3.8061370849609375, + "learning_rate": 8.741487340668251e-05, + "loss": 1.8395, + "step": 1581 + }, + { + "epoch": 0.25524362697644404, + "grad_norm": 4.64457893371582, + "learning_rate": 8.739753615226644e-05, + "loss": 2.0093, + "step": 1582 + }, + { + "epoch": 0.25540496934495, + "grad_norm": 3.7927417755126953, + "learning_rate": 8.738018868599205e-05, + "loss": 2.2221, + "step": 1583 + }, + { + "epoch": 0.25556631171345595, + "grad_norm": 4.176934242248535, + "learning_rate": 8.736283101259621e-05, + "loss": 2.2713, + "step": 1584 + }, + { + "epoch": 0.2557276540819619, + "grad_norm": 4.73375940322876, + "learning_rate": 8.734546313681869e-05, + "loss": 2.1128, + "step": 1585 + }, + { + "epoch": 0.2558889964504679, + "grad_norm": 3.8222851753234863, + "learning_rate": 8.732808506340199e-05, + "loss": 2.1663, + "step": 1586 + }, + { + "epoch": 0.2560503388189739, + "grad_norm": 5.634716510772705, + "learning_rate": 8.731069679709141e-05, + "loss": 2.0878, + "step": 1587 + }, + { + "epoch": 0.25621168118747983, + "grad_norm": 4.924232006072998, + "learning_rate": 8.729329834263503e-05, + "loss": 1.9505, + "step": 1588 + }, + { + "epoch": 0.2563730235559858, + "grad_norm": 5.2137837409973145, + "learning_rate": 8.72758897047837e-05, + "loss": 1.916, + "step": 1589 + }, + { + "epoch": 0.25653436592449175, + "grad_norm": 4.130535125732422, + "learning_rate": 8.725847088829108e-05, + "loss": 1.9296, + "step": 1590 + }, + { + "epoch": 0.25669570829299776, + "grad_norm": 3.3059771060943604, + "learning_rate": 8.724104189791359e-05, + "loss": 1.9605, + "step": 1591 + }, + { + "epoch": 0.2568570506615037, + "grad_norm": 4.49686336517334, + "learning_rate": 8.722360273841044e-05, + "loss": 2.0861, + "step": 1592 + }, + { + "epoch": 0.2570183930300097, + "grad_norm": 3.792386054992676, + "learning_rate": 8.720615341454357e-05, + "loss": 2.1605, + "step": 1593 + }, + { + "epoch": 0.25717973539851563, + "grad_norm": 5.434024333953857, + "learning_rate": 8.718869393107778e-05, + "loss": 2.1165, + "step": 1594 + }, + { + "epoch": 0.25734107776702164, + "grad_norm": 4.544057369232178, + "learning_rate": 8.717122429278055e-05, + "loss": 2.2117, + "step": 1595 + }, + { + "epoch": 0.2575024201355276, + "grad_norm": 4.096789836883545, + "learning_rate": 8.715374450442223e-05, + "loss": 2.1793, + "step": 1596 + }, + { + "epoch": 0.25766376250403356, + "grad_norm": 4.786748886108398, + "learning_rate": 8.713625457077585e-05, + "loss": 1.9036, + "step": 1597 + }, + { + "epoch": 0.2578251048725395, + "grad_norm": 5.406977653503418, + "learning_rate": 8.711875449661728e-05, + "loss": 2.1494, + "step": 1598 + }, + { + "epoch": 0.2579864472410455, + "grad_norm": 3.7704203128814697, + "learning_rate": 8.710124428672513e-05, + "loss": 2.3003, + "step": 1599 + }, + { + "epoch": 0.2581477896095515, + "grad_norm": 4.686526298522949, + "learning_rate": 8.708372394588076e-05, + "loss": 2.1823, + "step": 1600 + }, + { + "epoch": 0.25830913197805744, + "grad_norm": 3.888341188430786, + "learning_rate": 8.706619347886831e-05, + "loss": 2.0202, + "step": 1601 + }, + { + "epoch": 0.2584704743465634, + "grad_norm": 4.793549537658691, + "learning_rate": 8.704865289047473e-05, + "loss": 2.0186, + "step": 1602 + }, + { + "epoch": 0.25863181671506935, + "grad_norm": 4.393258094787598, + "learning_rate": 8.703110218548964e-05, + "loss": 2.0682, + "step": 1603 + }, + { + "epoch": 0.25879315908357536, + "grad_norm": 5.088795185089111, + "learning_rate": 8.701354136870552e-05, + "loss": 1.816, + "step": 1604 + }, + { + "epoch": 0.2589545014520813, + "grad_norm": 4.1309356689453125, + "learning_rate": 8.699597044491756e-05, + "loss": 1.9896, + "step": 1605 + }, + { + "epoch": 0.2591158438205873, + "grad_norm": 4.061094760894775, + "learning_rate": 8.69783894189237e-05, + "loss": 1.9654, + "step": 1606 + }, + { + "epoch": 0.25927718618909323, + "grad_norm": 4.1488542556762695, + "learning_rate": 8.696079829552468e-05, + "loss": 2.1103, + "step": 1607 + }, + { + "epoch": 0.25943852855759925, + "grad_norm": 3.9607887268066406, + "learning_rate": 8.694319707952394e-05, + "loss": 2.4623, + "step": 1608 + }, + { + "epoch": 0.2595998709261052, + "grad_norm": 3.6128292083740234, + "learning_rate": 8.692558577572774e-05, + "loss": 1.9496, + "step": 1609 + }, + { + "epoch": 0.25976121329461116, + "grad_norm": 4.164627552032471, + "learning_rate": 8.690796438894504e-05, + "loss": 2.0055, + "step": 1610 + }, + { + "epoch": 0.2599225556631171, + "grad_norm": 4.3762335777282715, + "learning_rate": 8.689033292398759e-05, + "loss": 2.0936, + "step": 1611 + }, + { + "epoch": 0.26008389803162313, + "grad_norm": 4.450123310089111, + "learning_rate": 8.687269138566988e-05, + "loss": 1.9501, + "step": 1612 + }, + { + "epoch": 0.2602452404001291, + "grad_norm": 4.878677845001221, + "learning_rate": 8.685503977880916e-05, + "loss": 1.9101, + "step": 1613 + }, + { + "epoch": 0.26040658276863504, + "grad_norm": 3.2923424243927, + "learning_rate": 8.683737810822539e-05, + "loss": 1.981, + "step": 1614 + }, + { + "epoch": 0.260567925137141, + "grad_norm": 4.8123908042907715, + "learning_rate": 8.681970637874132e-05, + "loss": 2.1394, + "step": 1615 + }, + { + "epoch": 0.26072926750564696, + "grad_norm": 3.266145944595337, + "learning_rate": 8.680202459518244e-05, + "loss": 1.806, + "step": 1616 + }, + { + "epoch": 0.26089060987415297, + "grad_norm": 5.465076923370361, + "learning_rate": 8.678433276237698e-05, + "loss": 2.3119, + "step": 1617 + }, + { + "epoch": 0.2610519522426589, + "grad_norm": 4.121337890625, + "learning_rate": 8.676663088515591e-05, + "loss": 1.9882, + "step": 1618 + }, + { + "epoch": 0.2612132946111649, + "grad_norm": 3.6586122512817383, + "learning_rate": 8.674891896835293e-05, + "loss": 2.076, + "step": 1619 + }, + { + "epoch": 0.26137463697967084, + "grad_norm": 4.997232437133789, + "learning_rate": 8.673119701680452e-05, + "loss": 1.8912, + "step": 1620 + }, + { + "epoch": 0.26153597934817685, + "grad_norm": 7.334474563598633, + "learning_rate": 8.671346503534988e-05, + "loss": 2.1024, + "step": 1621 + }, + { + "epoch": 0.2616973217166828, + "grad_norm": 3.8910131454467773, + "learning_rate": 8.669572302883094e-05, + "loss": 1.9326, + "step": 1622 + }, + { + "epoch": 0.26185866408518876, + "grad_norm": 4.191247940063477, + "learning_rate": 8.667797100209234e-05, + "loss": 1.7807, + "step": 1623 + }, + { + "epoch": 0.2620200064536947, + "grad_norm": 5.892439365386963, + "learning_rate": 8.666020895998153e-05, + "loss": 2.1133, + "step": 1624 + }, + { + "epoch": 0.26218134882220073, + "grad_norm": 3.86570405960083, + "learning_rate": 8.664243690734865e-05, + "loss": 1.8862, + "step": 1625 + }, + { + "epoch": 0.2623426911907067, + "grad_norm": 3.905013084411621, + "learning_rate": 8.662465484904656e-05, + "loss": 2.1019, + "step": 1626 + }, + { + "epoch": 0.26250403355921265, + "grad_norm": 4.451311111450195, + "learning_rate": 8.66068627899309e-05, + "loss": 1.981, + "step": 1627 + }, + { + "epoch": 0.2626653759277186, + "grad_norm": 5.009078502655029, + "learning_rate": 8.658906073485998e-05, + "loss": 2.1871, + "step": 1628 + }, + { + "epoch": 0.2628267182962246, + "grad_norm": 5.540653228759766, + "learning_rate": 8.657124868869489e-05, + "loss": 2.0475, + "step": 1629 + }, + { + "epoch": 0.26298806066473057, + "grad_norm": 5.579919338226318, + "learning_rate": 8.655342665629943e-05, + "loss": 2.2816, + "step": 1630 + }, + { + "epoch": 0.26314940303323653, + "grad_norm": 4.403649806976318, + "learning_rate": 8.653559464254008e-05, + "loss": 2.0569, + "step": 1631 + }, + { + "epoch": 0.2633107454017425, + "grad_norm": 3.9714784622192383, + "learning_rate": 8.651775265228617e-05, + "loss": 2.0732, + "step": 1632 + }, + { + "epoch": 0.26347208777024844, + "grad_norm": 5.194519996643066, + "learning_rate": 8.649990069040961e-05, + "loss": 1.9404, + "step": 1633 + }, + { + "epoch": 0.26363343013875445, + "grad_norm": 4.312530040740967, + "learning_rate": 8.648203876178514e-05, + "loss": 2.0072, + "step": 1634 + }, + { + "epoch": 0.2637947725072604, + "grad_norm": 4.333287715911865, + "learning_rate": 8.646416687129013e-05, + "loss": 2.1602, + "step": 1635 + }, + { + "epoch": 0.26395611487576637, + "grad_norm": 4.85657262802124, + "learning_rate": 8.644628502380479e-05, + "loss": 2.0286, + "step": 1636 + }, + { + "epoch": 0.2641174572442723, + "grad_norm": 4.16322135925293, + "learning_rate": 8.642839322421192e-05, + "loss": 2.2035, + "step": 1637 + }, + { + "epoch": 0.26427879961277834, + "grad_norm": 3.8348686695098877, + "learning_rate": 8.641049147739713e-05, + "loss": 1.9357, + "step": 1638 + }, + { + "epoch": 0.2644401419812843, + "grad_norm": 5.0198774337768555, + "learning_rate": 8.63925797882487e-05, + "loss": 2.0692, + "step": 1639 + }, + { + "epoch": 0.26460148434979025, + "grad_norm": 4.261548042297363, + "learning_rate": 8.637465816165763e-05, + "loss": 1.8413, + "step": 1640 + }, + { + "epoch": 0.2647628267182962, + "grad_norm": 4.39458703994751, + "learning_rate": 8.635672660251765e-05, + "loss": 2.0827, + "step": 1641 + }, + { + "epoch": 0.2649241690868022, + "grad_norm": 4.479990005493164, + "learning_rate": 8.63387851157252e-05, + "loss": 1.9944, + "step": 1642 + }, + { + "epoch": 0.2650855114553082, + "grad_norm": 5.603365421295166, + "learning_rate": 8.632083370617941e-05, + "loss": 2.3669, + "step": 1643 + }, + { + "epoch": 0.26524685382381413, + "grad_norm": 4.003785610198975, + "learning_rate": 8.630287237878214e-05, + "loss": 2.1475, + "step": 1644 + }, + { + "epoch": 0.2654081961923201, + "grad_norm": 4.122474670410156, + "learning_rate": 8.628490113843797e-05, + "loss": 1.9272, + "step": 1645 + }, + { + "epoch": 0.26556953856082605, + "grad_norm": 4.912667751312256, + "learning_rate": 8.626691999005414e-05, + "loss": 2.0954, + "step": 1646 + }, + { + "epoch": 0.26573088092933206, + "grad_norm": 5.25667667388916, + "learning_rate": 8.624892893854062e-05, + "loss": 2.0019, + "step": 1647 + }, + { + "epoch": 0.265892223297838, + "grad_norm": 6.928044319152832, + "learning_rate": 8.623092798881012e-05, + "loss": 2.2042, + "step": 1648 + }, + { + "epoch": 0.26605356566634397, + "grad_norm": 5.584704875946045, + "learning_rate": 8.6212917145778e-05, + "loss": 2.2149, + "step": 1649 + }, + { + "epoch": 0.26621490803484993, + "grad_norm": 4.9668121337890625, + "learning_rate": 8.619489641436236e-05, + "loss": 2.192, + "step": 1650 + }, + { + "epoch": 0.26637625040335594, + "grad_norm": 4.284021377563477, + "learning_rate": 8.617686579948397e-05, + "loss": 2.0923, + "step": 1651 + }, + { + "epoch": 0.2665375927718619, + "grad_norm": 3.5677449703216553, + "learning_rate": 8.61588253060663e-05, + "loss": 2.0471, + "step": 1652 + }, + { + "epoch": 0.26669893514036785, + "grad_norm": 5.858703136444092, + "learning_rate": 8.614077493903553e-05, + "loss": 2.2517, + "step": 1653 + }, + { + "epoch": 0.2668602775088738, + "grad_norm": 6.740094184875488, + "learning_rate": 8.612271470332057e-05, + "loss": 2.2777, + "step": 1654 + }, + { + "epoch": 0.2670216198773798, + "grad_norm": 6.313733100891113, + "learning_rate": 8.610464460385296e-05, + "loss": 1.8602, + "step": 1655 + }, + { + "epoch": 0.2671829622458858, + "grad_norm": 4.579122066497803, + "learning_rate": 8.608656464556699e-05, + "loss": 1.9912, + "step": 1656 + }, + { + "epoch": 0.26734430461439174, + "grad_norm": 4.619999885559082, + "learning_rate": 8.606847483339957e-05, + "loss": 2.1888, + "step": 1657 + }, + { + "epoch": 0.2675056469828977, + "grad_norm": 5.256847858428955, + "learning_rate": 8.605037517229037e-05, + "loss": 2.135, + "step": 1658 + }, + { + "epoch": 0.2676669893514037, + "grad_norm": 5.907052040100098, + "learning_rate": 8.603226566718174e-05, + "loss": 2.1339, + "step": 1659 + }, + { + "epoch": 0.26782833171990966, + "grad_norm": 4.06528902053833, + "learning_rate": 8.601414632301869e-05, + "loss": 2.0343, + "step": 1660 + }, + { + "epoch": 0.2679896740884156, + "grad_norm": 3.8804450035095215, + "learning_rate": 8.599601714474894e-05, + "loss": 2.0802, + "step": 1661 + }, + { + "epoch": 0.2681510164569216, + "grad_norm": 4.992329120635986, + "learning_rate": 8.597787813732286e-05, + "loss": 1.9986, + "step": 1662 + }, + { + "epoch": 0.26831235882542753, + "grad_norm": 4.483761787414551, + "learning_rate": 8.595972930569356e-05, + "loss": 2.1974, + "step": 1663 + }, + { + "epoch": 0.26847370119393354, + "grad_norm": 4.82814359664917, + "learning_rate": 8.594157065481679e-05, + "loss": 1.8907, + "step": 1664 + }, + { + "epoch": 0.2686350435624395, + "grad_norm": 4.251171112060547, + "learning_rate": 8.592340218965099e-05, + "loss": 2.3435, + "step": 1665 + }, + { + "epoch": 0.26879638593094546, + "grad_norm": 4.5502166748046875, + "learning_rate": 8.590522391515729e-05, + "loss": 2.0974, + "step": 1666 + }, + { + "epoch": 0.2689577282994514, + "grad_norm": 4.192358016967773, + "learning_rate": 8.588703583629948e-05, + "loss": 2.2086, + "step": 1667 + }, + { + "epoch": 0.2691190706679574, + "grad_norm": 4.075906753540039, + "learning_rate": 8.586883795804406e-05, + "loss": 2.0219, + "step": 1668 + }, + { + "epoch": 0.2692804130364634, + "grad_norm": 5.171660900115967, + "learning_rate": 8.585063028536016e-05, + "loss": 2.0953, + "step": 1669 + }, + { + "epoch": 0.26944175540496934, + "grad_norm": 3.467822313308716, + "learning_rate": 8.583241282321963e-05, + "loss": 2.4836, + "step": 1670 + }, + { + "epoch": 0.2696030977734753, + "grad_norm": 3.6750383377075195, + "learning_rate": 8.581418557659695e-05, + "loss": 1.9712, + "step": 1671 + }, + { + "epoch": 0.2697644401419813, + "grad_norm": 5.118645191192627, + "learning_rate": 8.579594855046933e-05, + "loss": 2.0233, + "step": 1672 + }, + { + "epoch": 0.26992578251048727, + "grad_norm": 4.578837871551514, + "learning_rate": 8.577770174981658e-05, + "loss": 2.3076, + "step": 1673 + }, + { + "epoch": 0.2700871248789932, + "grad_norm": 4.3101487159729, + "learning_rate": 8.575944517962125e-05, + "loss": 2.1367, + "step": 1674 + }, + { + "epoch": 0.2702484672474992, + "grad_norm": 3.4827797412872314, + "learning_rate": 8.574117884486847e-05, + "loss": 1.9562, + "step": 1675 + }, + { + "epoch": 0.27040980961600514, + "grad_norm": 4.5507683753967285, + "learning_rate": 8.572290275054613e-05, + "loss": 2.0804, + "step": 1676 + }, + { + "epoch": 0.27057115198451115, + "grad_norm": 3.2505276203155518, + "learning_rate": 8.570461690164474e-05, + "loss": 1.7681, + "step": 1677 + }, + { + "epoch": 0.2707324943530171, + "grad_norm": 4.711097717285156, + "learning_rate": 8.568632130315745e-05, + "loss": 2.3152, + "step": 1678 + }, + { + "epoch": 0.27089383672152306, + "grad_norm": 3.9545562267303467, + "learning_rate": 8.566801596008013e-05, + "loss": 2.1795, + "step": 1679 + }, + { + "epoch": 0.271055179090029, + "grad_norm": 4.976853370666504, + "learning_rate": 8.564970087741126e-05, + "loss": 1.9317, + "step": 1680 + }, + { + "epoch": 0.27121652145853503, + "grad_norm": 4.758996963500977, + "learning_rate": 8.5631376060152e-05, + "loss": 2.2757, + "step": 1681 + }, + { + "epoch": 0.271377863827041, + "grad_norm": 4.466953277587891, + "learning_rate": 8.561304151330617e-05, + "loss": 1.7344, + "step": 1682 + }, + { + "epoch": 0.27153920619554694, + "grad_norm": 6.375881195068359, + "learning_rate": 8.559469724188027e-05, + "loss": 2.0868, + "step": 1683 + }, + { + "epoch": 0.2717005485640529, + "grad_norm": 3.204301357269287, + "learning_rate": 8.55763432508834e-05, + "loss": 2.1115, + "step": 1684 + }, + { + "epoch": 0.2718618909325589, + "grad_norm": 5.237433910369873, + "learning_rate": 8.555797954532733e-05, + "loss": 1.9778, + "step": 1685 + }, + { + "epoch": 0.27202323330106487, + "grad_norm": 3.665327310562134, + "learning_rate": 8.553960613022652e-05, + "loss": 2.0409, + "step": 1686 + }, + { + "epoch": 0.2721845756695708, + "grad_norm": 4.775506019592285, + "learning_rate": 8.552122301059806e-05, + "loss": 2.071, + "step": 1687 + }, + { + "epoch": 0.2723459180380768, + "grad_norm": 3.734705686569214, + "learning_rate": 8.550283019146167e-05, + "loss": 2.0403, + "step": 1688 + }, + { + "epoch": 0.2725072604065828, + "grad_norm": 6.832278251647949, + "learning_rate": 8.548442767783975e-05, + "loss": 1.9087, + "step": 1689 + }, + { + "epoch": 0.27266860277508875, + "grad_norm": 7.069394111633301, + "learning_rate": 8.546601547475734e-05, + "loss": 2.4925, + "step": 1690 + }, + { + "epoch": 0.2728299451435947, + "grad_norm": 4.048940658569336, + "learning_rate": 8.54475935872421e-05, + "loss": 2.0747, + "step": 1691 + }, + { + "epoch": 0.27299128751210067, + "grad_norm": 4.290622234344482, + "learning_rate": 8.542916202032436e-05, + "loss": 2.2706, + "step": 1692 + }, + { + "epoch": 0.2731526298806066, + "grad_norm": 3.9478087425231934, + "learning_rate": 8.541072077903709e-05, + "loss": 2.2192, + "step": 1693 + }, + { + "epoch": 0.27331397224911264, + "grad_norm": 4.618847370147705, + "learning_rate": 8.53922698684159e-05, + "loss": 2.232, + "step": 1694 + }, + { + "epoch": 0.2734753146176186, + "grad_norm": 3.8600399494171143, + "learning_rate": 8.537380929349903e-05, + "loss": 2.0081, + "step": 1695 + }, + { + "epoch": 0.27363665698612455, + "grad_norm": 5.539452075958252, + "learning_rate": 8.535533905932738e-05, + "loss": 2.3145, + "step": 1696 + }, + { + "epoch": 0.2737979993546305, + "grad_norm": 5.374900817871094, + "learning_rate": 8.533685917094447e-05, + "loss": 1.9882, + "step": 1697 + }, + { + "epoch": 0.2739593417231365, + "grad_norm": 3.646822929382324, + "learning_rate": 8.531836963339645e-05, + "loss": 2.0992, + "step": 1698 + }, + { + "epoch": 0.2741206840916425, + "grad_norm": 5.130327224731445, + "learning_rate": 8.529987045173213e-05, + "loss": 2.0718, + "step": 1699 + }, + { + "epoch": 0.27428202646014843, + "grad_norm": 4.452453136444092, + "learning_rate": 8.528136163100295e-05, + "loss": 2.2012, + "step": 1700 + }, + { + "epoch": 0.2744433688286544, + "grad_norm": 3.7064199447631836, + "learning_rate": 8.526284317626294e-05, + "loss": 2.1222, + "step": 1701 + }, + { + "epoch": 0.2746047111971604, + "grad_norm": 4.690736293792725, + "learning_rate": 8.52443150925688e-05, + "loss": 2.171, + "step": 1702 + }, + { + "epoch": 0.27476605356566636, + "grad_norm": 3.434121608734131, + "learning_rate": 8.52257773849799e-05, + "loss": 2.1656, + "step": 1703 + }, + { + "epoch": 0.2749273959341723, + "grad_norm": 3.749788284301758, + "learning_rate": 8.520723005855813e-05, + "loss": 2.0383, + "step": 1704 + }, + { + "epoch": 0.27508873830267827, + "grad_norm": 4.258902072906494, + "learning_rate": 8.518867311836808e-05, + "loss": 2.0491, + "step": 1705 + }, + { + "epoch": 0.2752500806711842, + "grad_norm": 4.509291648864746, + "learning_rate": 8.517010656947696e-05, + "loss": 2.1671, + "step": 1706 + }, + { + "epoch": 0.27541142303969024, + "grad_norm": 4.518990516662598, + "learning_rate": 8.515153041695459e-05, + "loss": 2.111, + "step": 1707 + }, + { + "epoch": 0.2755727654081962, + "grad_norm": 6.002044200897217, + "learning_rate": 8.513294466587342e-05, + "loss": 1.9611, + "step": 1708 + }, + { + "epoch": 0.27573410777670215, + "grad_norm": 5.309200286865234, + "learning_rate": 8.511434932130855e-05, + "loss": 1.8995, + "step": 1709 + }, + { + "epoch": 0.2758954501452081, + "grad_norm": 2.978005886077881, + "learning_rate": 8.50957443883376e-05, + "loss": 2.0993, + "step": 1710 + }, + { + "epoch": 0.2760567925137141, + "grad_norm": 4.166509628295898, + "learning_rate": 8.507712987204094e-05, + "loss": 2.0326, + "step": 1711 + }, + { + "epoch": 0.2762181348822201, + "grad_norm": 4.222262859344482, + "learning_rate": 8.505850577750145e-05, + "loss": 1.9066, + "step": 1712 + }, + { + "epoch": 0.27637947725072604, + "grad_norm": 3.854065418243408, + "learning_rate": 8.503987210980471e-05, + "loss": 2.0014, + "step": 1713 + }, + { + "epoch": 0.276540819619232, + "grad_norm": 4.566579341888428, + "learning_rate": 8.502122887403883e-05, + "loss": 2.0936, + "step": 1714 + }, + { + "epoch": 0.276702161987738, + "grad_norm": 5.428273677825928, + "learning_rate": 8.50025760752946e-05, + "loss": 2.0519, + "step": 1715 + }, + { + "epoch": 0.27686350435624396, + "grad_norm": 6.084582805633545, + "learning_rate": 8.498391371866538e-05, + "loss": 2.0133, + "step": 1716 + }, + { + "epoch": 0.2770248467247499, + "grad_norm": 7.042144775390625, + "learning_rate": 8.496524180924718e-05, + "loss": 2.174, + "step": 1717 + }, + { + "epoch": 0.2771861890932559, + "grad_norm": 3.311413288116455, + "learning_rate": 8.494656035213857e-05, + "loss": 2.0763, + "step": 1718 + }, + { + "epoch": 0.27734753146176183, + "grad_norm": 4.330287456512451, + "learning_rate": 8.492786935244078e-05, + "loss": 2.0849, + "step": 1719 + }, + { + "epoch": 0.27750887383026784, + "grad_norm": 3.794973611831665, + "learning_rate": 8.490916881525759e-05, + "loss": 2.2467, + "step": 1720 + }, + { + "epoch": 0.2776702161987738, + "grad_norm": 3.6375041007995605, + "learning_rate": 8.489045874569544e-05, + "loss": 2.0532, + "step": 1721 + }, + { + "epoch": 0.27783155856727976, + "grad_norm": 4.892178058624268, + "learning_rate": 8.487173914886331e-05, + "loss": 2.3658, + "step": 1722 + }, + { + "epoch": 0.2779929009357857, + "grad_norm": 4.804330348968506, + "learning_rate": 8.485301002987284e-05, + "loss": 2.0655, + "step": 1723 + }, + { + "epoch": 0.2781542433042917, + "grad_norm": 5.6723432540893555, + "learning_rate": 8.483427139383826e-05, + "loss": 1.9983, + "step": 1724 + }, + { + "epoch": 0.2783155856727977, + "grad_norm": 4.15791130065918, + "learning_rate": 8.481552324587636e-05, + "loss": 2.0655, + "step": 1725 + }, + { + "epoch": 0.27847692804130364, + "grad_norm": 3.6118197441101074, + "learning_rate": 8.479676559110656e-05, + "loss": 2.0098, + "step": 1726 + }, + { + "epoch": 0.2786382704098096, + "grad_norm": 4.086574554443359, + "learning_rate": 8.477799843465088e-05, + "loss": 1.9056, + "step": 1727 + }, + { + "epoch": 0.2787996127783156, + "grad_norm": 3.7872986793518066, + "learning_rate": 8.475922178163392e-05, + "loss": 2.1815, + "step": 1728 + }, + { + "epoch": 0.27896095514682157, + "grad_norm": 4.921558856964111, + "learning_rate": 8.474043563718285e-05, + "loss": 1.8451, + "step": 1729 + }, + { + "epoch": 0.2791222975153275, + "grad_norm": 4.6860833168029785, + "learning_rate": 8.47216400064275e-05, + "loss": 1.5947, + "step": 1730 + }, + { + "epoch": 0.2792836398838335, + "grad_norm": 4.63102912902832, + "learning_rate": 8.470283489450022e-05, + "loss": 2.1791, + "step": 1731 + }, + { + "epoch": 0.2794449822523395, + "grad_norm": 4.210811614990234, + "learning_rate": 8.468402030653597e-05, + "loss": 1.9916, + "step": 1732 + }, + { + "epoch": 0.27960632462084545, + "grad_norm": 4.811304569244385, + "learning_rate": 8.466519624767235e-05, + "loss": 1.7945, + "step": 1733 + }, + { + "epoch": 0.2797676669893514, + "grad_norm": 5.427934646606445, + "learning_rate": 8.464636272304945e-05, + "loss": 1.9227, + "step": 1734 + }, + { + "epoch": 0.27992900935785736, + "grad_norm": 4.605364799499512, + "learning_rate": 8.462751973781003e-05, + "loss": 1.9805, + "step": 1735 + }, + { + "epoch": 0.2800903517263633, + "grad_norm": 4.8616862297058105, + "learning_rate": 8.460866729709937e-05, + "loss": 1.9491, + "step": 1736 + }, + { + "epoch": 0.28025169409486933, + "grad_norm": 4.779373645782471, + "learning_rate": 8.458980540606541e-05, + "loss": 1.9869, + "step": 1737 + }, + { + "epoch": 0.2804130364633753, + "grad_norm": 3.774815559387207, + "learning_rate": 8.457093406985857e-05, + "loss": 2.0055, + "step": 1738 + }, + { + "epoch": 0.28057437883188124, + "grad_norm": 5.13038444519043, + "learning_rate": 8.455205329363193e-05, + "loss": 1.8714, + "step": 1739 + }, + { + "epoch": 0.2807357212003872, + "grad_norm": 4.071251392364502, + "learning_rate": 8.453316308254111e-05, + "loss": 2.0485, + "step": 1740 + }, + { + "epoch": 0.2808970635688932, + "grad_norm": 4.834817409515381, + "learning_rate": 8.451426344174433e-05, + "loss": 2.0874, + "step": 1741 + }, + { + "epoch": 0.28105840593739917, + "grad_norm": 4.914193630218506, + "learning_rate": 8.449535437640234e-05, + "loss": 2.0483, + "step": 1742 + }, + { + "epoch": 0.2812197483059051, + "grad_norm": 8.055091857910156, + "learning_rate": 8.44764358916785e-05, + "loss": 2.4413, + "step": 1743 + }, + { + "epoch": 0.2813810906744111, + "grad_norm": 4.535501956939697, + "learning_rate": 8.445750799273877e-05, + "loss": 2.0976, + "step": 1744 + }, + { + "epoch": 0.2815424330429171, + "grad_norm": 9.10206127166748, + "learning_rate": 8.44385706847516e-05, + "loss": 2.1505, + "step": 1745 + }, + { + "epoch": 0.28170377541142305, + "grad_norm": 4.140308380126953, + "learning_rate": 8.44196239728881e-05, + "loss": 1.9627, + "step": 1746 + }, + { + "epoch": 0.281865117779929, + "grad_norm": 6.006581783294678, + "learning_rate": 8.440066786232186e-05, + "loss": 1.8477, + "step": 1747 + }, + { + "epoch": 0.28202646014843497, + "grad_norm": 4.237965106964111, + "learning_rate": 8.43817023582291e-05, + "loss": 2.0056, + "step": 1748 + }, + { + "epoch": 0.2821878025169409, + "grad_norm": 5.994650363922119, + "learning_rate": 8.436272746578859e-05, + "loss": 1.9832, + "step": 1749 + }, + { + "epoch": 0.28234914488544693, + "grad_norm": 5.219415187835693, + "learning_rate": 8.434374319018165e-05, + "loss": 2.0885, + "step": 1750 + }, + { + "epoch": 0.2825104872539529, + "grad_norm": 5.175772190093994, + "learning_rate": 8.432474953659219e-05, + "loss": 2.3823, + "step": 1751 + }, + { + "epoch": 0.28267182962245885, + "grad_norm": 6.690397262573242, + "learning_rate": 8.430574651020664e-05, + "loss": 1.9578, + "step": 1752 + }, + { + "epoch": 0.2828331719909648, + "grad_norm": 4.6421284675598145, + "learning_rate": 8.428673411621401e-05, + "loss": 2.0015, + "step": 1753 + }, + { + "epoch": 0.2829945143594708, + "grad_norm": 3.951106548309326, + "learning_rate": 8.426771235980587e-05, + "loss": 2.1639, + "step": 1754 + }, + { + "epoch": 0.2831558567279768, + "grad_norm": 4.136624813079834, + "learning_rate": 8.424868124617636e-05, + "loss": 2.0264, + "step": 1755 + }, + { + "epoch": 0.28331719909648273, + "grad_norm": 5.540128707885742, + "learning_rate": 8.422964078052213e-05, + "loss": 2.165, + "step": 1756 + }, + { + "epoch": 0.2834785414649887, + "grad_norm": 3.8847310543060303, + "learning_rate": 8.421059096804244e-05, + "loss": 2.0028, + "step": 1757 + }, + { + "epoch": 0.2836398838334947, + "grad_norm": 3.25927472114563, + "learning_rate": 8.419153181393909e-05, + "loss": 2.08, + "step": 1758 + }, + { + "epoch": 0.28380122620200066, + "grad_norm": 4.559014797210693, + "learning_rate": 8.417246332341637e-05, + "loss": 2.0188, + "step": 1759 + }, + { + "epoch": 0.2839625685705066, + "grad_norm": 5.007056713104248, + "learning_rate": 8.41533855016812e-05, + "loss": 2.2219, + "step": 1760 + }, + { + "epoch": 0.28412391093901257, + "grad_norm": 5.803976058959961, + "learning_rate": 8.413429835394302e-05, + "loss": 2.1245, + "step": 1761 + }, + { + "epoch": 0.2842852533075186, + "grad_norm": 3.300891876220703, + "learning_rate": 8.411520188541379e-05, + "loss": 2.117, + "step": 1762 + }, + { + "epoch": 0.28444659567602454, + "grad_norm": 3.984173536300659, + "learning_rate": 8.409609610130804e-05, + "loss": 1.945, + "step": 1763 + }, + { + "epoch": 0.2846079380445305, + "grad_norm": 8.194157600402832, + "learning_rate": 8.407698100684284e-05, + "loss": 2.1773, + "step": 1764 + }, + { + "epoch": 0.28476928041303645, + "grad_norm": 5.650846481323242, + "learning_rate": 8.405785660723783e-05, + "loss": 2.1189, + "step": 1765 + }, + { + "epoch": 0.2849306227815424, + "grad_norm": 3.9818012714385986, + "learning_rate": 8.403872290771513e-05, + "loss": 1.8912, + "step": 1766 + }, + { + "epoch": 0.2850919651500484, + "grad_norm": 6.198014259338379, + "learning_rate": 8.401957991349945e-05, + "loss": 2.1052, + "step": 1767 + }, + { + "epoch": 0.2852533075185544, + "grad_norm": 4.0621161460876465, + "learning_rate": 8.400042762981799e-05, + "loss": 2.1539, + "step": 1768 + }, + { + "epoch": 0.28541464988706033, + "grad_norm": 5.210166931152344, + "learning_rate": 8.398126606190056e-05, + "loss": 2.1784, + "step": 1769 + }, + { + "epoch": 0.2855759922555663, + "grad_norm": 5.423094749450684, + "learning_rate": 8.396209521497942e-05, + "loss": 1.9488, + "step": 1770 + }, + { + "epoch": 0.2857373346240723, + "grad_norm": 5.281177520751953, + "learning_rate": 8.394291509428945e-05, + "loss": 2.3128, + "step": 1771 + }, + { + "epoch": 0.28589867699257826, + "grad_norm": 3.8413915634155273, + "learning_rate": 8.3923725705068e-05, + "loss": 2.0873, + "step": 1772 + }, + { + "epoch": 0.2860600193610842, + "grad_norm": 4.192770481109619, + "learning_rate": 8.390452705255495e-05, + "loss": 1.9095, + "step": 1773 + }, + { + "epoch": 0.2862213617295902, + "grad_norm": 4.927295684814453, + "learning_rate": 8.388531914199275e-05, + "loss": 1.8837, + "step": 1774 + }, + { + "epoch": 0.2863827040980962, + "grad_norm": 3.406172037124634, + "learning_rate": 8.386610197862636e-05, + "loss": 2.0226, + "step": 1775 + }, + { + "epoch": 0.28654404646660214, + "grad_norm": 5.010184288024902, + "learning_rate": 8.384687556770326e-05, + "loss": 1.9072, + "step": 1776 + }, + { + "epoch": 0.2867053888351081, + "grad_norm": 4.786250591278076, + "learning_rate": 8.382763991447344e-05, + "loss": 2.0567, + "step": 1777 + }, + { + "epoch": 0.28686673120361406, + "grad_norm": 5.376852035522461, + "learning_rate": 8.380839502418945e-05, + "loss": 2.1561, + "step": 1778 + }, + { + "epoch": 0.28702807357212, + "grad_norm": 5.463832378387451, + "learning_rate": 8.378914090210634e-05, + "loss": 2.2231, + "step": 1779 + }, + { + "epoch": 0.287189415940626, + "grad_norm": 3.5489230155944824, + "learning_rate": 8.37698775534817e-05, + "loss": 2.2164, + "step": 1780 + }, + { + "epoch": 0.287350758309132, + "grad_norm": 5.189237117767334, + "learning_rate": 8.375060498357561e-05, + "loss": 2.1313, + "step": 1781 + }, + { + "epoch": 0.28751210067763794, + "grad_norm": 3.619138479232788, + "learning_rate": 8.373132319765066e-05, + "loss": 2.2953, + "step": 1782 + }, + { + "epoch": 0.2876734430461439, + "grad_norm": 4.930722236633301, + "learning_rate": 8.371203220097202e-05, + "loss": 2.4205, + "step": 1783 + }, + { + "epoch": 0.2878347854146499, + "grad_norm": 4.0688323974609375, + "learning_rate": 8.369273199880731e-05, + "loss": 2.1226, + "step": 1784 + }, + { + "epoch": 0.28799612778315586, + "grad_norm": 3.675079584121704, + "learning_rate": 8.367342259642672e-05, + "loss": 2.0162, + "step": 1785 + }, + { + "epoch": 0.2881574701516618, + "grad_norm": 4.427017688751221, + "learning_rate": 8.365410399910288e-05, + "loss": 2.3975, + "step": 1786 + }, + { + "epoch": 0.2883188125201678, + "grad_norm": 3.828359603881836, + "learning_rate": 8.363477621211099e-05, + "loss": 2.1091, + "step": 1787 + }, + { + "epoch": 0.2884801548886738, + "grad_norm": 3.7655129432678223, + "learning_rate": 8.361543924072873e-05, + "loss": 1.8609, + "step": 1788 + }, + { + "epoch": 0.28864149725717975, + "grad_norm": 4.367695331573486, + "learning_rate": 8.359609309023632e-05, + "loss": 2.1469, + "step": 1789 + }, + { + "epoch": 0.2888028396256857, + "grad_norm": 4.422239303588867, + "learning_rate": 8.357673776591643e-05, + "loss": 1.9042, + "step": 1790 + }, + { + "epoch": 0.28896418199419166, + "grad_norm": 3.3966128826141357, + "learning_rate": 8.355737327305433e-05, + "loss": 2.1162, + "step": 1791 + }, + { + "epoch": 0.2891255243626976, + "grad_norm": 5.572344779968262, + "learning_rate": 8.353799961693767e-05, + "loss": 2.0833, + "step": 1792 + }, + { + "epoch": 0.28928686673120363, + "grad_norm": 6.192033290863037, + "learning_rate": 8.351861680285668e-05, + "loss": 2.2833, + "step": 1793 + }, + { + "epoch": 0.2894482090997096, + "grad_norm": 3.767331600189209, + "learning_rate": 8.34992248361041e-05, + "loss": 2.1159, + "step": 1794 + }, + { + "epoch": 0.28960955146821554, + "grad_norm": 3.587618589401245, + "learning_rate": 8.347982372197514e-05, + "loss": 2.023, + "step": 1795 + }, + { + "epoch": 0.2897708938367215, + "grad_norm": 3.6493425369262695, + "learning_rate": 8.346041346576751e-05, + "loss": 2.3199, + "step": 1796 + }, + { + "epoch": 0.2899322362052275, + "grad_norm": 4.177402973175049, + "learning_rate": 8.344099407278141e-05, + "loss": 1.7962, + "step": 1797 + }, + { + "epoch": 0.29009357857373347, + "grad_norm": 4.965822219848633, + "learning_rate": 8.342156554831955e-05, + "loss": 1.8738, + "step": 1798 + }, + { + "epoch": 0.2902549209422394, + "grad_norm": 4.516384124755859, + "learning_rate": 8.340212789768712e-05, + "loss": 2.319, + "step": 1799 + }, + { + "epoch": 0.2904162633107454, + "grad_norm": 4.535025119781494, + "learning_rate": 8.338268112619183e-05, + "loss": 2.0098, + "step": 1800 + }, + { + "epoch": 0.2905776056792514, + "grad_norm": 3.820551633834839, + "learning_rate": 8.336322523914385e-05, + "loss": 1.9986, + "step": 1801 + }, + { + "epoch": 0.29073894804775735, + "grad_norm": 3.7913529872894287, + "learning_rate": 8.334376024185584e-05, + "loss": 2.091, + "step": 1802 + }, + { + "epoch": 0.2909002904162633, + "grad_norm": 4.389832496643066, + "learning_rate": 8.332428613964298e-05, + "loss": 2.1236, + "step": 1803 + }, + { + "epoch": 0.29106163278476926, + "grad_norm": 5.472838401794434, + "learning_rate": 8.33048029378229e-05, + "loss": 1.8827, + "step": 1804 + }, + { + "epoch": 0.2912229751532753, + "grad_norm": 4.006913661956787, + "learning_rate": 8.328531064171572e-05, + "loss": 1.9958, + "step": 1805 + }, + { + "epoch": 0.29138431752178123, + "grad_norm": 5.290530681610107, + "learning_rate": 8.326580925664406e-05, + "loss": 1.8631, + "step": 1806 + }, + { + "epoch": 0.2915456598902872, + "grad_norm": 3.8902676105499268, + "learning_rate": 8.324629878793303e-05, + "loss": 2.2186, + "step": 1807 + }, + { + "epoch": 0.29170700225879315, + "grad_norm": 5.456364154815674, + "learning_rate": 8.322677924091018e-05, + "loss": 2.1408, + "step": 1808 + }, + { + "epoch": 0.2918683446272991, + "grad_norm": 4.808485984802246, + "learning_rate": 8.320725062090557e-05, + "loss": 1.6919, + "step": 1809 + }, + { + "epoch": 0.2920296869958051, + "grad_norm": 4.256877899169922, + "learning_rate": 8.318771293325174e-05, + "loss": 2.2299, + "step": 1810 + }, + { + "epoch": 0.2921910293643111, + "grad_norm": 4.910990238189697, + "learning_rate": 8.316816618328367e-05, + "loss": 2.223, + "step": 1811 + }, + { + "epoch": 0.29235237173281703, + "grad_norm": 3.7441246509552, + "learning_rate": 8.314861037633889e-05, + "loss": 1.9777, + "step": 1812 + }, + { + "epoch": 0.292513714101323, + "grad_norm": 4.108473777770996, + "learning_rate": 8.312904551775731e-05, + "loss": 2.0013, + "step": 1813 + }, + { + "epoch": 0.292675056469829, + "grad_norm": 4.527835369110107, + "learning_rate": 8.310947161288136e-05, + "loss": 2.033, + "step": 1814 + }, + { + "epoch": 0.29283639883833495, + "grad_norm": 4.592922687530518, + "learning_rate": 8.308988866705596e-05, + "loss": 2.0957, + "step": 1815 + }, + { + "epoch": 0.2929977412068409, + "grad_norm": 4.6102800369262695, + "learning_rate": 8.307029668562847e-05, + "loss": 1.9809, + "step": 1816 + }, + { + "epoch": 0.29315908357534687, + "grad_norm": 5.48193883895874, + "learning_rate": 8.30506956739487e-05, + "loss": 1.9832, + "step": 1817 + }, + { + "epoch": 0.2933204259438529, + "grad_norm": 5.1117682456970215, + "learning_rate": 8.303108563736894e-05, + "loss": 1.9945, + "step": 1818 + }, + { + "epoch": 0.29348176831235884, + "grad_norm": 4.609536647796631, + "learning_rate": 8.3011466581244e-05, + "loss": 1.8112, + "step": 1819 + }, + { + "epoch": 0.2936431106808648, + "grad_norm": 4.317946910858154, + "learning_rate": 8.299183851093108e-05, + "loss": 1.9708, + "step": 1820 + }, + { + "epoch": 0.29380445304937075, + "grad_norm": 4.565094947814941, + "learning_rate": 8.297220143178986e-05, + "loss": 2.0382, + "step": 1821 + }, + { + "epoch": 0.2939657954178767, + "grad_norm": 3.6840734481811523, + "learning_rate": 8.295255534918248e-05, + "loss": 1.9289, + "step": 1822 + }, + { + "epoch": 0.2941271377863827, + "grad_norm": 3.8791749477386475, + "learning_rate": 8.293290026847356e-05, + "loss": 1.9768, + "step": 1823 + }, + { + "epoch": 0.2942884801548887, + "grad_norm": 3.9318950176239014, + "learning_rate": 8.291323619503018e-05, + "loss": 1.9546, + "step": 1824 + }, + { + "epoch": 0.29444982252339463, + "grad_norm": 4.0623884201049805, + "learning_rate": 8.289356313422182e-05, + "loss": 1.8619, + "step": 1825 + }, + { + "epoch": 0.2946111648919006, + "grad_norm": 3.451681137084961, + "learning_rate": 8.287388109142046e-05, + "loss": 1.8466, + "step": 1826 + }, + { + "epoch": 0.2947725072604066, + "grad_norm": 3.832487106323242, + "learning_rate": 8.285419007200055e-05, + "loss": 2.1728, + "step": 1827 + }, + { + "epoch": 0.29493384962891256, + "grad_norm": 5.565883159637451, + "learning_rate": 8.283449008133894e-05, + "loss": 2.1671, + "step": 1828 + }, + { + "epoch": 0.2950951919974185, + "grad_norm": 5.599283218383789, + "learning_rate": 8.281478112481497e-05, + "loss": 2.007, + "step": 1829 + }, + { + "epoch": 0.2952565343659245, + "grad_norm": 4.877832889556885, + "learning_rate": 8.279506320781041e-05, + "loss": 2.3093, + "step": 1830 + }, + { + "epoch": 0.2954178767344305, + "grad_norm": 4.901605606079102, + "learning_rate": 8.277533633570948e-05, + "loss": 1.7875, + "step": 1831 + }, + { + "epoch": 0.29557921910293644, + "grad_norm": 4.114346981048584, + "learning_rate": 8.275560051389884e-05, + "loss": 2.053, + "step": 1832 + }, + { + "epoch": 0.2957405614714424, + "grad_norm": 4.338045120239258, + "learning_rate": 8.273585574776758e-05, + "loss": 1.9956, + "step": 1833 + }, + { + "epoch": 0.29590190383994835, + "grad_norm": 4.81793212890625, + "learning_rate": 8.27161020427073e-05, + "loss": 1.901, + "step": 1834 + }, + { + "epoch": 0.29606324620845437, + "grad_norm": 4.17056941986084, + "learning_rate": 8.269633940411196e-05, + "loss": 1.873, + "step": 1835 + }, + { + "epoch": 0.2962245885769603, + "grad_norm": 5.027039527893066, + "learning_rate": 8.267656783737801e-05, + "loss": 2.2233, + "step": 1836 + }, + { + "epoch": 0.2963859309454663, + "grad_norm": 5.195655345916748, + "learning_rate": 8.26567873479043e-05, + "loss": 2.1567, + "step": 1837 + }, + { + "epoch": 0.29654727331397224, + "grad_norm": 4.910747051239014, + "learning_rate": 8.263699794109215e-05, + "loss": 1.8349, + "step": 1838 + }, + { + "epoch": 0.2967086156824782, + "grad_norm": 3.8523197174072266, + "learning_rate": 8.261719962234529e-05, + "loss": 2.0515, + "step": 1839 + }, + { + "epoch": 0.2968699580509842, + "grad_norm": 4.3005757331848145, + "learning_rate": 8.259739239706991e-05, + "loss": 1.9005, + "step": 1840 + }, + { + "epoch": 0.29703130041949016, + "grad_norm": 3.722055673599243, + "learning_rate": 8.257757627067459e-05, + "loss": 2.0472, + "step": 1841 + }, + { + "epoch": 0.2971926427879961, + "grad_norm": 3.796337604522705, + "learning_rate": 8.255775124857042e-05, + "loss": 1.927, + "step": 1842 + }, + { + "epoch": 0.2973539851565021, + "grad_norm": 4.589344024658203, + "learning_rate": 8.253791733617082e-05, + "loss": 1.9603, + "step": 1843 + }, + { + "epoch": 0.2975153275250081, + "grad_norm": 4.884768962860107, + "learning_rate": 8.251807453889171e-05, + "loss": 2.164, + "step": 1844 + }, + { + "epoch": 0.29767666989351405, + "grad_norm": 6.825287342071533, + "learning_rate": 8.249822286215139e-05, + "loss": 2.1274, + "step": 1845 + }, + { + "epoch": 0.29783801226202, + "grad_norm": 4.882968902587891, + "learning_rate": 8.247836231137061e-05, + "loss": 2.2944, + "step": 1846 + }, + { + "epoch": 0.29799935463052596, + "grad_norm": 3.7253918647766113, + "learning_rate": 8.245849289197253e-05, + "loss": 2.0048, + "step": 1847 + }, + { + "epoch": 0.29816069699903197, + "grad_norm": 5.805208206176758, + "learning_rate": 8.243861460938278e-05, + "loss": 2.0602, + "step": 1848 + }, + { + "epoch": 0.29832203936753793, + "grad_norm": 7.223762035369873, + "learning_rate": 8.241872746902935e-05, + "loss": 2.3192, + "step": 1849 + }, + { + "epoch": 0.2984833817360439, + "grad_norm": 5.2446208000183105, + "learning_rate": 8.239883147634263e-05, + "loss": 2.0628, + "step": 1850 + }, + { + "epoch": 0.29864472410454984, + "grad_norm": 7.644248962402344, + "learning_rate": 8.23789266367555e-05, + "loss": 2.1512, + "step": 1851 + }, + { + "epoch": 0.2988060664730558, + "grad_norm": 4.67316198348999, + "learning_rate": 8.235901295570324e-05, + "loss": 2.0144, + "step": 1852 + }, + { + "epoch": 0.2989674088415618, + "grad_norm": 4.881799697875977, + "learning_rate": 8.23390904386235e-05, + "loss": 1.8919, + "step": 1853 + }, + { + "epoch": 0.29912875121006777, + "grad_norm": 5.2973480224609375, + "learning_rate": 8.231915909095637e-05, + "loss": 2.0731, + "step": 1854 + }, + { + "epoch": 0.2992900935785737, + "grad_norm": 5.177144527435303, + "learning_rate": 8.229921891814436e-05, + "loss": 1.953, + "step": 1855 + }, + { + "epoch": 0.2994514359470797, + "grad_norm": 4.5609893798828125, + "learning_rate": 8.227926992563237e-05, + "loss": 1.8789, + "step": 1856 + }, + { + "epoch": 0.2996127783155857, + "grad_norm": 6.363867282867432, + "learning_rate": 8.225931211886772e-05, + "loss": 2.0935, + "step": 1857 + }, + { + "epoch": 0.29977412068409165, + "grad_norm": 4.509897708892822, + "learning_rate": 8.223934550330015e-05, + "loss": 2.0326, + "step": 1858 + }, + { + "epoch": 0.2999354630525976, + "grad_norm": 3.1676878929138184, + "learning_rate": 8.221937008438178e-05, + "loss": 2.1461, + "step": 1859 + }, + { + "epoch": 0.30009680542110356, + "grad_norm": 3.9433722496032715, + "learning_rate": 8.219938586756712e-05, + "loss": 2.073, + "step": 1860 + }, + { + "epoch": 0.3002581477896096, + "grad_norm": 4.704596996307373, + "learning_rate": 8.217939285831316e-05, + "loss": 2.3441, + "step": 1861 + }, + { + "epoch": 0.30041949015811553, + "grad_norm": 4.053814888000488, + "learning_rate": 8.21593910620792e-05, + "loss": 2.0403, + "step": 1862 + }, + { + "epoch": 0.3005808325266215, + "grad_norm": 5.388185024261475, + "learning_rate": 8.213938048432697e-05, + "loss": 1.9212, + "step": 1863 + }, + { + "epoch": 0.30074217489512745, + "grad_norm": 3.7790966033935547, + "learning_rate": 8.211936113052063e-05, + "loss": 2.034, + "step": 1864 + }, + { + "epoch": 0.30090351726363346, + "grad_norm": 3.5199756622314453, + "learning_rate": 8.20993330061267e-05, + "loss": 2.0374, + "step": 1865 + }, + { + "epoch": 0.3010648596321394, + "grad_norm": 5.511442184448242, + "learning_rate": 8.207929611661411e-05, + "loss": 2.457, + "step": 1866 + }, + { + "epoch": 0.30122620200064537, + "grad_norm": 4.7877655029296875, + "learning_rate": 8.205925046745419e-05, + "loss": 2.0208, + "step": 1867 + }, + { + "epoch": 0.30138754436915133, + "grad_norm": 4.138000965118408, + "learning_rate": 8.203919606412063e-05, + "loss": 1.9117, + "step": 1868 + }, + { + "epoch": 0.3015488867376573, + "grad_norm": 5.232863426208496, + "learning_rate": 8.201913291208954e-05, + "loss": 1.958, + "step": 1869 + }, + { + "epoch": 0.3017102291061633, + "grad_norm": 4.581702709197998, + "learning_rate": 8.199906101683941e-05, + "loss": 2.271, + "step": 1870 + }, + { + "epoch": 0.30187157147466925, + "grad_norm": 4.821216583251953, + "learning_rate": 8.19789803838511e-05, + "loss": 1.9112, + "step": 1871 + }, + { + "epoch": 0.3020329138431752, + "grad_norm": 6.259906768798828, + "learning_rate": 8.195889101860793e-05, + "loss": 2.0971, + "step": 1872 + }, + { + "epoch": 0.30219425621168117, + "grad_norm": 3.859330892562866, + "learning_rate": 8.19387929265955e-05, + "loss": 2.0736, + "step": 1873 + }, + { + "epoch": 0.3023555985801872, + "grad_norm": 3.745858907699585, + "learning_rate": 8.191868611330184e-05, + "loss": 2.0505, + "step": 1874 + }, + { + "epoch": 0.30251694094869314, + "grad_norm": 4.091365814208984, + "learning_rate": 8.189857058421741e-05, + "loss": 2.2169, + "step": 1875 + }, + { + "epoch": 0.3026782833171991, + "grad_norm": 4.674230098724365, + "learning_rate": 8.187844634483496e-05, + "loss": 2.0723, + "step": 1876 + }, + { + "epoch": 0.30283962568570505, + "grad_norm": 4.745269775390625, + "learning_rate": 8.185831340064967e-05, + "loss": 2.1235, + "step": 1877 + }, + { + "epoch": 0.30300096805421106, + "grad_norm": 6.834924221038818, + "learning_rate": 8.18381717571591e-05, + "loss": 2.2466, + "step": 1878 + }, + { + "epoch": 0.303162310422717, + "grad_norm": 5.590103626251221, + "learning_rate": 8.181802141986317e-05, + "loss": 2.1019, + "step": 1879 + }, + { + "epoch": 0.303323652791223, + "grad_norm": 4.22111701965332, + "learning_rate": 8.179786239426417e-05, + "loss": 2.0257, + "step": 1880 + }, + { + "epoch": 0.30348499515972893, + "grad_norm": 3.378553867340088, + "learning_rate": 8.177769468586677e-05, + "loss": 1.9401, + "step": 1881 + }, + { + "epoch": 0.3036463375282349, + "grad_norm": 4.3551506996154785, + "learning_rate": 8.175751830017801e-05, + "loss": 2.0098, + "step": 1882 + }, + { + "epoch": 0.3038076798967409, + "grad_norm": 6.12330961227417, + "learning_rate": 8.173733324270733e-05, + "loss": 1.6995, + "step": 1883 + }, + { + "epoch": 0.30396902226524686, + "grad_norm": 3.480078935623169, + "learning_rate": 8.171713951896647e-05, + "loss": 2.1602, + "step": 1884 + }, + { + "epoch": 0.3041303646337528, + "grad_norm": 5.214806079864502, + "learning_rate": 8.169693713446959e-05, + "loss": 2.0116, + "step": 1885 + }, + { + "epoch": 0.30429170700225877, + "grad_norm": 3.7905850410461426, + "learning_rate": 8.167672609473323e-05, + "loss": 2.0998, + "step": 1886 + }, + { + "epoch": 0.3044530493707648, + "grad_norm": 4.413736820220947, + "learning_rate": 8.16565064052762e-05, + "loss": 1.8602, + "step": 1887 + }, + { + "epoch": 0.30461439173927074, + "grad_norm": 3.732191801071167, + "learning_rate": 8.16362780716198e-05, + "loss": 1.9877, + "step": 1888 + }, + { + "epoch": 0.3047757341077767, + "grad_norm": 7.122950077056885, + "learning_rate": 8.161604109928757e-05, + "loss": 2.2097, + "step": 1889 + }, + { + "epoch": 0.30493707647628265, + "grad_norm": 4.7883806228637695, + "learning_rate": 8.159579549380552e-05, + "loss": 1.9594, + "step": 1890 + }, + { + "epoch": 0.30509841884478867, + "grad_norm": 3.7021067142486572, + "learning_rate": 8.157554126070191e-05, + "loss": 2.0678, + "step": 1891 + }, + { + "epoch": 0.3052597612132946, + "grad_norm": 4.850696086883545, + "learning_rate": 8.155527840550746e-05, + "loss": 2.0929, + "step": 1892 + }, + { + "epoch": 0.3054211035818006, + "grad_norm": 4.618776798248291, + "learning_rate": 8.153500693375515e-05, + "loss": 1.9542, + "step": 1893 + }, + { + "epoch": 0.30558244595030654, + "grad_norm": 5.683887481689453, + "learning_rate": 8.151472685098036e-05, + "loss": 1.956, + "step": 1894 + }, + { + "epoch": 0.3057437883188125, + "grad_norm": 5.511989593505859, + "learning_rate": 8.149443816272083e-05, + "loss": 2.067, + "step": 1895 + }, + { + "epoch": 0.3059051306873185, + "grad_norm": 4.221396446228027, + "learning_rate": 8.147414087451664e-05, + "loss": 1.8161, + "step": 1896 + }, + { + "epoch": 0.30606647305582446, + "grad_norm": 4.867199420928955, + "learning_rate": 8.14538349919102e-05, + "loss": 2.18, + "step": 1897 + }, + { + "epoch": 0.3062278154243304, + "grad_norm": 5.007112979888916, + "learning_rate": 8.14335205204463e-05, + "loss": 2.2441, + "step": 1898 + }, + { + "epoch": 0.3063891577928364, + "grad_norm": 3.9696736335754395, + "learning_rate": 8.141319746567204e-05, + "loss": 1.874, + "step": 1899 + }, + { + "epoch": 0.3065505001613424, + "grad_norm": 5.212623119354248, + "learning_rate": 8.139286583313689e-05, + "loss": 1.9406, + "step": 1900 + }, + { + "epoch": 0.30671184252984834, + "grad_norm": 4.380959987640381, + "learning_rate": 8.137252562839265e-05, + "loss": 1.8606, + "step": 1901 + }, + { + "epoch": 0.3068731848983543, + "grad_norm": 3.6407463550567627, + "learning_rate": 8.135217685699345e-05, + "loss": 2.0841, + "step": 1902 + }, + { + "epoch": 0.30703452726686026, + "grad_norm": 4.7653608322143555, + "learning_rate": 8.133181952449582e-05, + "loss": 2.106, + "step": 1903 + }, + { + "epoch": 0.30719586963536627, + "grad_norm": 5.136886119842529, + "learning_rate": 8.131145363645851e-05, + "loss": 2.1021, + "step": 1904 + }, + { + "epoch": 0.3073572120038722, + "grad_norm": 3.9044461250305176, + "learning_rate": 8.129107919844274e-05, + "loss": 2.0675, + "step": 1905 + }, + { + "epoch": 0.3075185543723782, + "grad_norm": 3.826951265335083, + "learning_rate": 8.127069621601198e-05, + "loss": 1.9938, + "step": 1906 + }, + { + "epoch": 0.30767989674088414, + "grad_norm": 4.828694820404053, + "learning_rate": 8.125030469473202e-05, + "loss": 2.0543, + "step": 1907 + }, + { + "epoch": 0.30784123910939015, + "grad_norm": 5.609941005706787, + "learning_rate": 8.122990464017109e-05, + "loss": 2.1488, + "step": 1908 + }, + { + "epoch": 0.3080025814778961, + "grad_norm": 4.673035144805908, + "learning_rate": 8.12094960578996e-05, + "loss": 1.8555, + "step": 1909 + }, + { + "epoch": 0.30816392384640207, + "grad_norm": 3.8414700031280518, + "learning_rate": 8.118907895349039e-05, + "loss": 2.1498, + "step": 1910 + }, + { + "epoch": 0.308325266214908, + "grad_norm": 4.503723621368408, + "learning_rate": 8.116865333251864e-05, + "loss": 2.1516, + "step": 1911 + }, + { + "epoch": 0.308486608583414, + "grad_norm": 3.872340440750122, + "learning_rate": 8.114821920056177e-05, + "loss": 2.0245, + "step": 1912 + }, + { + "epoch": 0.30864795095192, + "grad_norm": 4.699495315551758, + "learning_rate": 8.112777656319959e-05, + "loss": 2.2828, + "step": 1913 + }, + { + "epoch": 0.30880929332042595, + "grad_norm": 5.339008331298828, + "learning_rate": 8.110732542601423e-05, + "loss": 1.9724, + "step": 1914 + }, + { + "epoch": 0.3089706356889319, + "grad_norm": 4.341615676879883, + "learning_rate": 8.10868657945901e-05, + "loss": 2.0989, + "step": 1915 + }, + { + "epoch": 0.30913197805743786, + "grad_norm": 5.45268440246582, + "learning_rate": 8.106639767451396e-05, + "loss": 2.2122, + "step": 1916 + }, + { + "epoch": 0.3092933204259439, + "grad_norm": 6.427046298980713, + "learning_rate": 8.104592107137489e-05, + "loss": 2.0737, + "step": 1917 + }, + { + "epoch": 0.30945466279444983, + "grad_norm": 4.466939926147461, + "learning_rate": 8.102543599076427e-05, + "loss": 2.2812, + "step": 1918 + }, + { + "epoch": 0.3096160051629558, + "grad_norm": 3.7878451347351074, + "learning_rate": 8.100494243827582e-05, + "loss": 2.1585, + "step": 1919 + }, + { + "epoch": 0.30977734753146174, + "grad_norm": 5.668677806854248, + "learning_rate": 8.098444041950553e-05, + "loss": 1.9682, + "step": 1920 + }, + { + "epoch": 0.30993868989996776, + "grad_norm": 4.185689926147461, + "learning_rate": 8.096392994005177e-05, + "loss": 2.0743, + "step": 1921 + }, + { + "epoch": 0.3101000322684737, + "grad_norm": 7.4727463722229, + "learning_rate": 8.094341100551512e-05, + "loss": 2.4402, + "step": 1922 + }, + { + "epoch": 0.31026137463697967, + "grad_norm": 4.48516845703125, + "learning_rate": 8.092288362149858e-05, + "loss": 2.38, + "step": 1923 + }, + { + "epoch": 0.3104227170054856, + "grad_norm": 4.385977745056152, + "learning_rate": 8.09023477936074e-05, + "loss": 2.1088, + "step": 1924 + }, + { + "epoch": 0.3105840593739916, + "grad_norm": 3.730656385421753, + "learning_rate": 8.088180352744911e-05, + "loss": 2.1832, + "step": 1925 + }, + { + "epoch": 0.3107454017424976, + "grad_norm": 5.2465972900390625, + "learning_rate": 8.08612508286336e-05, + "loss": 2.1811, + "step": 1926 + }, + { + "epoch": 0.31090674411100355, + "grad_norm": 4.33252477645874, + "learning_rate": 8.084068970277305e-05, + "loss": 2.1834, + "step": 1927 + }, + { + "epoch": 0.3110680864795095, + "grad_norm": 7.046680450439453, + "learning_rate": 8.082012015548188e-05, + "loss": 2.371, + "step": 1928 + }, + { + "epoch": 0.31122942884801547, + "grad_norm": 4.966737270355225, + "learning_rate": 8.07995421923769e-05, + "loss": 2.0372, + "step": 1929 + }, + { + "epoch": 0.3113907712165215, + "grad_norm": 3.923862934112549, + "learning_rate": 8.077895581907718e-05, + "loss": 2.218, + "step": 1930 + }, + { + "epoch": 0.31155211358502743, + "grad_norm": 4.700870990753174, + "learning_rate": 8.075836104120407e-05, + "loss": 2.0869, + "step": 1931 + }, + { + "epoch": 0.3117134559535334, + "grad_norm": 4.45346736907959, + "learning_rate": 8.073775786438122e-05, + "loss": 2.0875, + "step": 1932 + }, + { + "epoch": 0.31187479832203935, + "grad_norm": 5.010293483734131, + "learning_rate": 8.071714629423459e-05, + "loss": 2.1765, + "step": 1933 + }, + { + "epoch": 0.31203614069054536, + "grad_norm": 3.4277050495147705, + "learning_rate": 8.069652633639241e-05, + "loss": 1.8646, + "step": 1934 + }, + { + "epoch": 0.3121974830590513, + "grad_norm": 4.554781913757324, + "learning_rate": 8.067589799648523e-05, + "loss": 1.6562, + "step": 1935 + }, + { + "epoch": 0.3123588254275573, + "grad_norm": 4.1686296463012695, + "learning_rate": 8.065526128014587e-05, + "loss": 2.0406, + "step": 1936 + }, + { + "epoch": 0.31252016779606323, + "grad_norm": 4.863027572631836, + "learning_rate": 8.063461619300943e-05, + "loss": 2.0048, + "step": 1937 + }, + { + "epoch": 0.31268151016456924, + "grad_norm": 4.528186321258545, + "learning_rate": 8.061396274071333e-05, + "loss": 1.8828, + "step": 1938 + }, + { + "epoch": 0.3128428525330752, + "grad_norm": 3.636806011199951, + "learning_rate": 8.059330092889723e-05, + "loss": 2.135, + "step": 1939 + }, + { + "epoch": 0.31300419490158116, + "grad_norm": 5.246001720428467, + "learning_rate": 8.057263076320309e-05, + "loss": 2.1713, + "step": 1940 + }, + { + "epoch": 0.3131655372700871, + "grad_norm": 4.590300559997559, + "learning_rate": 8.055195224927517e-05, + "loss": 1.9298, + "step": 1941 + }, + { + "epoch": 0.31332687963859307, + "grad_norm": 4.156294822692871, + "learning_rate": 8.053126539275997e-05, + "loss": 2.1276, + "step": 1942 + }, + { + "epoch": 0.3134882220070991, + "grad_norm": 7.552245140075684, + "learning_rate": 8.051057019930631e-05, + "loss": 2.1637, + "step": 1943 + }, + { + "epoch": 0.31364956437560504, + "grad_norm": 3.9365739822387695, + "learning_rate": 8.048986667456527e-05, + "loss": 2.2709, + "step": 1944 + }, + { + "epoch": 0.313810906744111, + "grad_norm": 5.453755855560303, + "learning_rate": 8.046915482419018e-05, + "loss": 2.1096, + "step": 1945 + }, + { + "epoch": 0.31397224911261695, + "grad_norm": 3.7501649856567383, + "learning_rate": 8.044843465383669e-05, + "loss": 2.0991, + "step": 1946 + }, + { + "epoch": 0.31413359148112296, + "grad_norm": 3.9636921882629395, + "learning_rate": 8.042770616916269e-05, + "loss": 2.0603, + "step": 1947 + }, + { + "epoch": 0.3142949338496289, + "grad_norm": 5.861011505126953, + "learning_rate": 8.040696937582832e-05, + "loss": 1.9778, + "step": 1948 + }, + { + "epoch": 0.3144562762181349, + "grad_norm": 5.550604820251465, + "learning_rate": 8.038622427949607e-05, + "loss": 2.007, + "step": 1949 + }, + { + "epoch": 0.31461761858664083, + "grad_norm": 5.225160121917725, + "learning_rate": 8.036547088583062e-05, + "loss": 2.3065, + "step": 1950 + }, + { + "epoch": 0.31477896095514685, + "grad_norm": 4.004058361053467, + "learning_rate": 8.034470920049892e-05, + "loss": 2.1883, + "step": 1951 + }, + { + "epoch": 0.3149403033236528, + "grad_norm": 4.266819477081299, + "learning_rate": 8.032393922917024e-05, + "loss": 1.8595, + "step": 1952 + }, + { + "epoch": 0.31510164569215876, + "grad_norm": 4.176856994628906, + "learning_rate": 8.030316097751606e-05, + "loss": 1.904, + "step": 1953 + }, + { + "epoch": 0.3152629880606647, + "grad_norm": 3.6861655712127686, + "learning_rate": 8.028237445121013e-05, + "loss": 2.0782, + "step": 1954 + }, + { + "epoch": 0.3154243304291707, + "grad_norm": 4.144218444824219, + "learning_rate": 8.026157965592849e-05, + "loss": 1.9113, + "step": 1955 + }, + { + "epoch": 0.3155856727976767, + "grad_norm": 4.056292533874512, + "learning_rate": 8.024077659734938e-05, + "loss": 1.9954, + "step": 1956 + }, + { + "epoch": 0.31574701516618264, + "grad_norm": 4.331222057342529, + "learning_rate": 8.021996528115335e-05, + "loss": 2.058, + "step": 1957 + }, + { + "epoch": 0.3159083575346886, + "grad_norm": 5.137801647186279, + "learning_rate": 8.01991457130232e-05, + "loss": 2.0594, + "step": 1958 + }, + { + "epoch": 0.31606969990319456, + "grad_norm": 3.8450074195861816, + "learning_rate": 8.017831789864394e-05, + "loss": 1.9862, + "step": 1959 + }, + { + "epoch": 0.31623104227170057, + "grad_norm": 3.9215176105499268, + "learning_rate": 8.01574818437029e-05, + "loss": 1.8986, + "step": 1960 + }, + { + "epoch": 0.3163923846402065, + "grad_norm": 4.650374412536621, + "learning_rate": 8.013663755388958e-05, + "loss": 1.9499, + "step": 1961 + }, + { + "epoch": 0.3165537270087125, + "grad_norm": 4.070428848266602, + "learning_rate": 8.01157850348958e-05, + "loss": 2.1298, + "step": 1962 + }, + { + "epoch": 0.31671506937721844, + "grad_norm": 4.45579719543457, + "learning_rate": 8.009492429241559e-05, + "loss": 2.3318, + "step": 1963 + }, + { + "epoch": 0.31687641174572445, + "grad_norm": 5.952184677124023, + "learning_rate": 8.007405533214524e-05, + "loss": 2.2048, + "step": 1964 + }, + { + "epoch": 0.3170377541142304, + "grad_norm": 4.4508376121521, + "learning_rate": 8.005317815978326e-05, + "loss": 2.0861, + "step": 1965 + }, + { + "epoch": 0.31719909648273636, + "grad_norm": 7.377951622009277, + "learning_rate": 8.003229278103043e-05, + "loss": 2.0022, + "step": 1966 + }, + { + "epoch": 0.3173604388512423, + "grad_norm": 5.454758644104004, + "learning_rate": 8.001139920158976e-05, + "loss": 1.9503, + "step": 1967 + }, + { + "epoch": 0.31752178121974833, + "grad_norm": 5.345936298370361, + "learning_rate": 7.999049742716649e-05, + "loss": 2.0027, + "step": 1968 + }, + { + "epoch": 0.3176831235882543, + "grad_norm": 5.304137229919434, + "learning_rate": 7.996958746346812e-05, + "loss": 2.0196, + "step": 1969 + }, + { + "epoch": 0.31784446595676025, + "grad_norm": 8.837878227233887, + "learning_rate": 7.994866931620438e-05, + "loss": 2.1589, + "step": 1970 + }, + { + "epoch": 0.3180058083252662, + "grad_norm": 4.453640460968018, + "learning_rate": 7.99277429910872e-05, + "loss": 1.9992, + "step": 1971 + }, + { + "epoch": 0.31816715069377216, + "grad_norm": 5.355186462402344, + "learning_rate": 7.99068084938308e-05, + "loss": 2.0244, + "step": 1972 + }, + { + "epoch": 0.3183284930622782, + "grad_norm": 4.449441432952881, + "learning_rate": 7.988586583015156e-05, + "loss": 2.0835, + "step": 1973 + }, + { + "epoch": 0.31848983543078413, + "grad_norm": 4.989075660705566, + "learning_rate": 7.986491500576818e-05, + "loss": 2.3297, + "step": 1974 + }, + { + "epoch": 0.3186511777992901, + "grad_norm": 4.702456951141357, + "learning_rate": 7.984395602640153e-05, + "loss": 1.8845, + "step": 1975 + }, + { + "epoch": 0.31881252016779604, + "grad_norm": 4.21317195892334, + "learning_rate": 7.982298889777471e-05, + "loss": 1.8683, + "step": 1976 + }, + { + "epoch": 0.31897386253630206, + "grad_norm": 3.82844877243042, + "learning_rate": 7.980201362561305e-05, + "loss": 2.1639, + "step": 1977 + }, + { + "epoch": 0.319135204904808, + "grad_norm": 5.126898765563965, + "learning_rate": 7.978103021564412e-05, + "loss": 1.8773, + "step": 1978 + }, + { + "epoch": 0.31929654727331397, + "grad_norm": 5.022282600402832, + "learning_rate": 7.976003867359767e-05, + "loss": 1.818, + "step": 1979 + }, + { + "epoch": 0.3194578896418199, + "grad_norm": 5.216141700744629, + "learning_rate": 7.973903900520574e-05, + "loss": 1.9978, + "step": 1980 + }, + { + "epoch": 0.31961923201032594, + "grad_norm": 7.017202377319336, + "learning_rate": 7.971803121620251e-05, + "loss": 1.9971, + "step": 1981 + }, + { + "epoch": 0.3197805743788319, + "grad_norm": 4.33177375793457, + "learning_rate": 7.969701531232445e-05, + "loss": 2.0671, + "step": 1982 + }, + { + "epoch": 0.31994191674733785, + "grad_norm": 3.8065788745880127, + "learning_rate": 7.967599129931019e-05, + "loss": 1.9905, + "step": 1983 + }, + { + "epoch": 0.3201032591158438, + "grad_norm": 4.233896255493164, + "learning_rate": 7.96549591829006e-05, + "loss": 1.9748, + "step": 1984 + }, + { + "epoch": 0.32026460148434976, + "grad_norm": 4.714889049530029, + "learning_rate": 7.963391896883874e-05, + "loss": 1.9152, + "step": 1985 + }, + { + "epoch": 0.3204259438528558, + "grad_norm": 5.847426891326904, + "learning_rate": 7.961287066286994e-05, + "loss": 2.0224, + "step": 1986 + }, + { + "epoch": 0.32058728622136173, + "grad_norm": 3.1339242458343506, + "learning_rate": 7.959181427074167e-05, + "loss": 2.0242, + "step": 1987 + }, + { + "epoch": 0.3207486285898677, + "grad_norm": 4.087879180908203, + "learning_rate": 7.957074979820365e-05, + "loss": 2.1799, + "step": 1988 + }, + { + "epoch": 0.32090997095837365, + "grad_norm": 4.901279926300049, + "learning_rate": 7.954967725100779e-05, + "loss": 2.2795, + "step": 1989 + }, + { + "epoch": 0.32107131332687966, + "grad_norm": 4.170873165130615, + "learning_rate": 7.95285966349082e-05, + "loss": 2.1406, + "step": 1990 + }, + { + "epoch": 0.3212326556953856, + "grad_norm": 3.6736488342285156, + "learning_rate": 7.950750795566123e-05, + "loss": 1.9033, + "step": 1991 + }, + { + "epoch": 0.3213939980638916, + "grad_norm": 4.261050224304199, + "learning_rate": 7.948641121902537e-05, + "loss": 2.2996, + "step": 1992 + }, + { + "epoch": 0.32155534043239753, + "grad_norm": 4.296965599060059, + "learning_rate": 7.946530643076138e-05, + "loss": 2.2059, + "step": 1993 + }, + { + "epoch": 0.32171668280090354, + "grad_norm": 3.860304594039917, + "learning_rate": 7.944419359663213e-05, + "loss": 1.8987, + "step": 1994 + }, + { + "epoch": 0.3218780251694095, + "grad_norm": 4.054232120513916, + "learning_rate": 7.94230727224028e-05, + "loss": 1.8697, + "step": 1995 + }, + { + "epoch": 0.32203936753791546, + "grad_norm": 4.921481132507324, + "learning_rate": 7.940194381384066e-05, + "loss": 2.0686, + "step": 1996 + }, + { + "epoch": 0.3222007099064214, + "grad_norm": 3.2399144172668457, + "learning_rate": 7.938080687671524e-05, + "loss": 1.9011, + "step": 1997 + }, + { + "epoch": 0.32236205227492737, + "grad_norm": 3.696622848510742, + "learning_rate": 7.935966191679824e-05, + "loss": 2.0503, + "step": 1998 + }, + { + "epoch": 0.3225233946434334, + "grad_norm": 4.244028568267822, + "learning_rate": 7.933850893986354e-05, + "loss": 1.9405, + "step": 1999 + }, + { + "epoch": 0.32268473701193934, + "grad_norm": 7.01930570602417, + "learning_rate": 7.931734795168724e-05, + "loss": 1.8827, + "step": 2000 + }, + { + "epoch": 0.3228460793804453, + "grad_norm": 4.2505645751953125, + "learning_rate": 7.92961789580476e-05, + "loss": 1.9412, + "step": 2001 + }, + { + "epoch": 0.32300742174895125, + "grad_norm": 3.3729472160339355, + "learning_rate": 7.927500196472506e-05, + "loss": 2.2648, + "step": 2002 + }, + { + "epoch": 0.32316876411745726, + "grad_norm": 4.692650318145752, + "learning_rate": 7.925381697750229e-05, + "loss": 2.4044, + "step": 2003 + }, + { + "epoch": 0.3233301064859632, + "grad_norm": 3.2933013439178467, + "learning_rate": 7.923262400216408e-05, + "loss": 2.0314, + "step": 2004 + }, + { + "epoch": 0.3234914488544692, + "grad_norm": 6.476714134216309, + "learning_rate": 7.921142304449745e-05, + "loss": 1.7739, + "step": 2005 + }, + { + "epoch": 0.32365279122297513, + "grad_norm": 5.154987812042236, + "learning_rate": 7.91902141102916e-05, + "loss": 2.0001, + "step": 2006 + }, + { + "epoch": 0.32381413359148115, + "grad_norm": 3.7503819465637207, + "learning_rate": 7.916899720533786e-05, + "loss": 2.227, + "step": 2007 + }, + { + "epoch": 0.3239754759599871, + "grad_norm": 4.138298511505127, + "learning_rate": 7.914777233542978e-05, + "loss": 1.9531, + "step": 2008 + }, + { + "epoch": 0.32413681832849306, + "grad_norm": 3.416266918182373, + "learning_rate": 7.912653950636306e-05, + "loss": 2.1842, + "step": 2009 + }, + { + "epoch": 0.324298160696999, + "grad_norm": 4.819826126098633, + "learning_rate": 7.91052987239356e-05, + "loss": 2.0731, + "step": 2010 + }, + { + "epoch": 0.32445950306550503, + "grad_norm": 4.643051624298096, + "learning_rate": 7.908404999394746e-05, + "loss": 2.1121, + "step": 2011 + }, + { + "epoch": 0.324620845434011, + "grad_norm": 3.803864002227783, + "learning_rate": 7.906279332220087e-05, + "loss": 1.831, + "step": 2012 + }, + { + "epoch": 0.32478218780251694, + "grad_norm": 5.006372928619385, + "learning_rate": 7.904152871450022e-05, + "loss": 1.8421, + "step": 2013 + }, + { + "epoch": 0.3249435301710229, + "grad_norm": 4.55983829498291, + "learning_rate": 7.902025617665205e-05, + "loss": 2.1396, + "step": 2014 + }, + { + "epoch": 0.32510487253952886, + "grad_norm": 5.165456771850586, + "learning_rate": 7.899897571446513e-05, + "loss": 2.0341, + "step": 2015 + }, + { + "epoch": 0.32526621490803487, + "grad_norm": 4.067203998565674, + "learning_rate": 7.897768733375033e-05, + "loss": 1.9253, + "step": 2016 + }, + { + "epoch": 0.3254275572765408, + "grad_norm": 3.1523919105529785, + "learning_rate": 7.895639104032071e-05, + "loss": 2.0588, + "step": 2017 + }, + { + "epoch": 0.3255888996450468, + "grad_norm": 3.850659132003784, + "learning_rate": 7.893508683999149e-05, + "loss": 1.8334, + "step": 2018 + }, + { + "epoch": 0.32575024201355274, + "grad_norm": 4.036777496337891, + "learning_rate": 7.891377473858002e-05, + "loss": 1.9177, + "step": 2019 + }, + { + "epoch": 0.32591158438205875, + "grad_norm": 6.105804443359375, + "learning_rate": 7.889245474190588e-05, + "loss": 2.1875, + "step": 2020 + }, + { + "epoch": 0.3260729267505647, + "grad_norm": 4.579841613769531, + "learning_rate": 7.88711268557907e-05, + "loss": 1.841, + "step": 2021 + }, + { + "epoch": 0.32623426911907066, + "grad_norm": 3.5988316535949707, + "learning_rate": 7.884979108605837e-05, + "loss": 2.1727, + "step": 2022 + }, + { + "epoch": 0.3263956114875766, + "grad_norm": 3.7694644927978516, + "learning_rate": 7.882844743853484e-05, + "loss": 2.1283, + "step": 2023 + }, + { + "epoch": 0.32655695385608263, + "grad_norm": 4.465296745300293, + "learning_rate": 7.880709591904832e-05, + "loss": 1.9834, + "step": 2024 + }, + { + "epoch": 0.3267182962245886, + "grad_norm": 3.46445631980896, + "learning_rate": 7.878573653342904e-05, + "loss": 2.144, + "step": 2025 + }, + { + "epoch": 0.32687963859309455, + "grad_norm": 4.885611057281494, + "learning_rate": 7.876436928750947e-05, + "loss": 1.9757, + "step": 2026 + }, + { + "epoch": 0.3270409809616005, + "grad_norm": 4.627254962921143, + "learning_rate": 7.874299418712421e-05, + "loss": 1.8614, + "step": 2027 + }, + { + "epoch": 0.32720232333010646, + "grad_norm": 3.813387632369995, + "learning_rate": 7.872161123810999e-05, + "loss": 2.1079, + "step": 2028 + }, + { + "epoch": 0.32736366569861247, + "grad_norm": 4.278948783874512, + "learning_rate": 7.870022044630569e-05, + "loss": 2.0231, + "step": 2029 + }, + { + "epoch": 0.32752500806711843, + "grad_norm": 4.185835361480713, + "learning_rate": 7.86788218175523e-05, + "loss": 2.0688, + "step": 2030 + }, + { + "epoch": 0.3276863504356244, + "grad_norm": 4.226234436035156, + "learning_rate": 7.865741535769303e-05, + "loss": 1.9921, + "step": 2031 + }, + { + "epoch": 0.32784769280413034, + "grad_norm": 3.9076058864593506, + "learning_rate": 7.863600107257314e-05, + "loss": 2.0349, + "step": 2032 + }, + { + "epoch": 0.32800903517263635, + "grad_norm": 3.57064151763916, + "learning_rate": 7.861457896804007e-05, + "loss": 1.9627, + "step": 2033 + }, + { + "epoch": 0.3281703775411423, + "grad_norm": 4.979307174682617, + "learning_rate": 7.859314904994339e-05, + "loss": 2.4244, + "step": 2034 + }, + { + "epoch": 0.32833171990964827, + "grad_norm": 4.064205169677734, + "learning_rate": 7.857171132413483e-05, + "loss": 2.1526, + "step": 2035 + }, + { + "epoch": 0.3284930622781542, + "grad_norm": 3.6024906635284424, + "learning_rate": 7.855026579646818e-05, + "loss": 1.9607, + "step": 2036 + }, + { + "epoch": 0.32865440464666024, + "grad_norm": 5.789567947387695, + "learning_rate": 7.852881247279944e-05, + "loss": 2.086, + "step": 2037 + }, + { + "epoch": 0.3288157470151662, + "grad_norm": 4.560086727142334, + "learning_rate": 7.85073513589867e-05, + "loss": 1.9153, + "step": 2038 + }, + { + "epoch": 0.32897708938367215, + "grad_norm": 4.217399597167969, + "learning_rate": 7.848588246089017e-05, + "loss": 2.1856, + "step": 2039 + }, + { + "epoch": 0.3291384317521781, + "grad_norm": 5.792709827423096, + "learning_rate": 7.84644057843722e-05, + "loss": 2.0268, + "step": 2040 + }, + { + "epoch": 0.3292997741206841, + "grad_norm": 4.474367141723633, + "learning_rate": 7.844292133529727e-05, + "loss": 2.0528, + "step": 2041 + }, + { + "epoch": 0.3294611164891901, + "grad_norm": 3.6288115978240967, + "learning_rate": 7.842142911953197e-05, + "loss": 1.9576, + "step": 2042 + }, + { + "epoch": 0.32962245885769603, + "grad_norm": 3.5075695514678955, + "learning_rate": 7.839992914294499e-05, + "loss": 1.9709, + "step": 2043 + }, + { + "epoch": 0.329783801226202, + "grad_norm": 3.612318277359009, + "learning_rate": 7.837842141140721e-05, + "loss": 1.9535, + "step": 2044 + }, + { + "epoch": 0.32994514359470795, + "grad_norm": 3.621208667755127, + "learning_rate": 7.835690593079156e-05, + "loss": 2.074, + "step": 2045 + }, + { + "epoch": 0.33010648596321396, + "grad_norm": 5.363526821136475, + "learning_rate": 7.833538270697309e-05, + "loss": 1.8701, + "step": 2046 + }, + { + "epoch": 0.3302678283317199, + "grad_norm": 4.517481803894043, + "learning_rate": 7.831385174582901e-05, + "loss": 1.9816, + "step": 2047 + }, + { + "epoch": 0.33042917070022587, + "grad_norm": 5.250185489654541, + "learning_rate": 7.829231305323858e-05, + "loss": 2.1113, + "step": 2048 + }, + { + "epoch": 0.33059051306873183, + "grad_norm": 3.945758581161499, + "learning_rate": 7.827076663508326e-05, + "loss": 1.8457, + "step": 2049 + }, + { + "epoch": 0.33075185543723784, + "grad_norm": 4.452152729034424, + "learning_rate": 7.82492124972465e-05, + "loss": 2.061, + "step": 2050 + }, + { + "epoch": 0.3309131978057438, + "grad_norm": 3.809438467025757, + "learning_rate": 7.822765064561397e-05, + "loss": 2.1061, + "step": 2051 + }, + { + "epoch": 0.33107454017424975, + "grad_norm": 3.6477572917938232, + "learning_rate": 7.820608108607339e-05, + "loss": 1.9717, + "step": 2052 + }, + { + "epoch": 0.3312358825427557, + "grad_norm": 5.503085613250732, + "learning_rate": 7.818450382451457e-05, + "loss": 2.0585, + "step": 2053 + }, + { + "epoch": 0.3313972249112617, + "grad_norm": 3.4868550300598145, + "learning_rate": 7.816291886682947e-05, + "loss": 2.0033, + "step": 2054 + }, + { + "epoch": 0.3315585672797677, + "grad_norm": 4.83896541595459, + "learning_rate": 7.814132621891215e-05, + "loss": 2.079, + "step": 2055 + }, + { + "epoch": 0.33171990964827364, + "grad_norm": 4.1418890953063965, + "learning_rate": 7.81197258866587e-05, + "loss": 1.8944, + "step": 2056 + }, + { + "epoch": 0.3318812520167796, + "grad_norm": 4.504819393157959, + "learning_rate": 7.809811787596739e-05, + "loss": 2.1623, + "step": 2057 + }, + { + "epoch": 0.33204259438528555, + "grad_norm": 5.678425312042236, + "learning_rate": 7.807650219273853e-05, + "loss": 2.2039, + "step": 2058 + }, + { + "epoch": 0.33220393675379156, + "grad_norm": 3.4308807849884033, + "learning_rate": 7.805487884287457e-05, + "loss": 2.099, + "step": 2059 + }, + { + "epoch": 0.3323652791222975, + "grad_norm": 4.822578430175781, + "learning_rate": 7.803324783228004e-05, + "loss": 2.0879, + "step": 2060 + }, + { + "epoch": 0.3325266214908035, + "grad_norm": 3.984004020690918, + "learning_rate": 7.801160916686152e-05, + "loss": 1.9915, + "step": 2061 + }, + { + "epoch": 0.33268796385930943, + "grad_norm": 4.755062103271484, + "learning_rate": 7.798996285252773e-05, + "loss": 1.9483, + "step": 2062 + }, + { + "epoch": 0.33284930622781544, + "grad_norm": 3.7323362827301025, + "learning_rate": 7.796830889518949e-05, + "loss": 1.9571, + "step": 2063 + }, + { + "epoch": 0.3330106485963214, + "grad_norm": 4.8012213706970215, + "learning_rate": 7.794664730075964e-05, + "loss": 1.9506, + "step": 2064 + }, + { + "epoch": 0.33317199096482736, + "grad_norm": 4.999101638793945, + "learning_rate": 7.792497807515317e-05, + "loss": 2.1258, + "step": 2065 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 6.21721076965332, + "learning_rate": 7.790330122428711e-05, + "loss": 1.9964, + "step": 2066 + }, + { + "epoch": 0.3334946757018393, + "grad_norm": 3.7509355545043945, + "learning_rate": 7.788161675408061e-05, + "loss": 1.9618, + "step": 2067 + }, + { + "epoch": 0.3336560180703453, + "grad_norm": 5.406489372253418, + "learning_rate": 7.785992467045487e-05, + "loss": 1.9184, + "step": 2068 + }, + { + "epoch": 0.33381736043885124, + "grad_norm": 4.889528274536133, + "learning_rate": 7.783822497933321e-05, + "loss": 2.2079, + "step": 2069 + }, + { + "epoch": 0.3339787028073572, + "grad_norm": 5.023280143737793, + "learning_rate": 7.781651768664095e-05, + "loss": 1.9985, + "step": 2070 + }, + { + "epoch": 0.33414004517586315, + "grad_norm": 4.156134128570557, + "learning_rate": 7.779480279830557e-05, + "loss": 1.9568, + "step": 2071 + }, + { + "epoch": 0.33430138754436917, + "grad_norm": 4.915197849273682, + "learning_rate": 7.777308032025657e-05, + "loss": 1.9985, + "step": 2072 + }, + { + "epoch": 0.3344627299128751, + "grad_norm": 4.4993157386779785, + "learning_rate": 7.775135025842554e-05, + "loss": 2.2353, + "step": 2073 + }, + { + "epoch": 0.3346240722813811, + "grad_norm": 4.201979160308838, + "learning_rate": 7.772961261874615e-05, + "loss": 2.0364, + "step": 2074 + }, + { + "epoch": 0.33478541464988704, + "grad_norm": 5.750626087188721, + "learning_rate": 7.770786740715414e-05, + "loss": 1.9875, + "step": 2075 + }, + { + "epoch": 0.33494675701839305, + "grad_norm": 4.650112152099609, + "learning_rate": 7.768611462958728e-05, + "loss": 1.8518, + "step": 2076 + }, + { + "epoch": 0.335108099386899, + "grad_norm": 4.810061454772949, + "learning_rate": 7.766435429198546e-05, + "loss": 1.9889, + "step": 2077 + }, + { + "epoch": 0.33526944175540496, + "grad_norm": 4.015198707580566, + "learning_rate": 7.76425864002906e-05, + "loss": 2.1328, + "step": 2078 + }, + { + "epoch": 0.3354307841239109, + "grad_norm": 4.309876918792725, + "learning_rate": 7.762081096044668e-05, + "loss": 1.9792, + "step": 2079 + }, + { + "epoch": 0.33559212649241693, + "grad_norm": 4.62183141708374, + "learning_rate": 7.759902797839979e-05, + "loss": 1.8456, + "step": 2080 + }, + { + "epoch": 0.3357534688609229, + "grad_norm": 5.688324451446533, + "learning_rate": 7.757723746009799e-05, + "loss": 2.2172, + "step": 2081 + }, + { + "epoch": 0.33591481122942884, + "grad_norm": 3.3715908527374268, + "learning_rate": 7.755543941149149e-05, + "loss": 2.1086, + "step": 2082 + }, + { + "epoch": 0.3360761535979348, + "grad_norm": 4.781265735626221, + "learning_rate": 7.753363383853249e-05, + "loss": 2.058, + "step": 2083 + }, + { + "epoch": 0.3362374959664408, + "grad_norm": 5.9148783683776855, + "learning_rate": 7.751182074717527e-05, + "loss": 1.6179, + "step": 2084 + }, + { + "epoch": 0.33639883833494677, + "grad_norm": 5.1858720779418945, + "learning_rate": 7.74900001433762e-05, + "loss": 1.9247, + "step": 2085 + }, + { + "epoch": 0.3365601807034527, + "grad_norm": 5.318194389343262, + "learning_rate": 7.746817203309362e-05, + "loss": 1.8551, + "step": 2086 + }, + { + "epoch": 0.3367215230719587, + "grad_norm": 4.078609943389893, + "learning_rate": 7.744633642228798e-05, + "loss": 1.9068, + "step": 2087 + }, + { + "epoch": 0.33688286544046464, + "grad_norm": 3.140766143798828, + "learning_rate": 7.742449331692177e-05, + "loss": 2.0437, + "step": 2088 + }, + { + "epoch": 0.33704420780897065, + "grad_norm": 6.085255146026611, + "learning_rate": 7.740264272295953e-05, + "loss": 2.4435, + "step": 2089 + }, + { + "epoch": 0.3372055501774766, + "grad_norm": 3.4636056423187256, + "learning_rate": 7.73807846463678e-05, + "loss": 2.0657, + "step": 2090 + }, + { + "epoch": 0.33736689254598257, + "grad_norm": 4.229785919189453, + "learning_rate": 7.735891909311524e-05, + "loss": 2.193, + "step": 2091 + }, + { + "epoch": 0.3375282349144885, + "grad_norm": 4.070128440856934, + "learning_rate": 7.733704606917247e-05, + "loss": 2.0491, + "step": 2092 + }, + { + "epoch": 0.33768957728299454, + "grad_norm": 3.961073398590088, + "learning_rate": 7.73151655805122e-05, + "loss": 1.8331, + "step": 2093 + }, + { + "epoch": 0.3378509196515005, + "grad_norm": 5.665839672088623, + "learning_rate": 7.729327763310919e-05, + "loss": 2.0938, + "step": 2094 + }, + { + "epoch": 0.33801226202000645, + "grad_norm": 3.713109254837036, + "learning_rate": 7.727138223294019e-05, + "loss": 2.1567, + "step": 2095 + }, + { + "epoch": 0.3381736043885124, + "grad_norm": 5.792564392089844, + "learning_rate": 7.724947938598401e-05, + "loss": 2.0109, + "step": 2096 + }, + { + "epoch": 0.3383349467570184, + "grad_norm": 4.827317237854004, + "learning_rate": 7.722756909822151e-05, + "loss": 1.9553, + "step": 2097 + }, + { + "epoch": 0.3384962891255244, + "grad_norm": 4.299433708190918, + "learning_rate": 7.720565137563554e-05, + "loss": 1.9985, + "step": 2098 + }, + { + "epoch": 0.33865763149403033, + "grad_norm": 4.724922180175781, + "learning_rate": 7.718372622421101e-05, + "loss": 2.0317, + "step": 2099 + }, + { + "epoch": 0.3388189738625363, + "grad_norm": 4.753623962402344, + "learning_rate": 7.716179364993486e-05, + "loss": 1.8099, + "step": 2100 + }, + { + "epoch": 0.33898031623104224, + "grad_norm": 6.869202136993408, + "learning_rate": 7.713985365879606e-05, + "loss": 1.955, + "step": 2101 + }, + { + "epoch": 0.33914165859954826, + "grad_norm": 5.462575912475586, + "learning_rate": 7.711790625678559e-05, + "loss": 1.798, + "step": 2102 + }, + { + "epoch": 0.3393030009680542, + "grad_norm": 7.04573392868042, + "learning_rate": 7.709595144989643e-05, + "loss": 2.1349, + "step": 2103 + }, + { + "epoch": 0.33946434333656017, + "grad_norm": 5.625308990478516, + "learning_rate": 7.707398924412365e-05, + "loss": 2.0368, + "step": 2104 + }, + { + "epoch": 0.3396256857050661, + "grad_norm": 4.275908946990967, + "learning_rate": 7.705201964546429e-05, + "loss": 1.9984, + "step": 2105 + }, + { + "epoch": 0.33978702807357214, + "grad_norm": 5.542468547821045, + "learning_rate": 7.70300426599174e-05, + "loss": 2.0229, + "step": 2106 + }, + { + "epoch": 0.3399483704420781, + "grad_norm": 3.9115285873413086, + "learning_rate": 7.70080582934841e-05, + "loss": 1.965, + "step": 2107 + }, + { + "epoch": 0.34010971281058405, + "grad_norm": 4.481706619262695, + "learning_rate": 7.698606655216745e-05, + "loss": 1.9295, + "step": 2108 + }, + { + "epoch": 0.34027105517909, + "grad_norm": 4.542694091796875, + "learning_rate": 7.69640674419726e-05, + "loss": 2.0746, + "step": 2109 + }, + { + "epoch": 0.340432397547596, + "grad_norm": 3.7146451473236084, + "learning_rate": 7.694206096890666e-05, + "loss": 1.7149, + "step": 2110 + }, + { + "epoch": 0.340593739916102, + "grad_norm": 5.7919602394104, + "learning_rate": 7.692004713897878e-05, + "loss": 1.8816, + "step": 2111 + }, + { + "epoch": 0.34075508228460794, + "grad_norm": 5.574487686157227, + "learning_rate": 7.689802595820013e-05, + "loss": 2.1589, + "step": 2112 + }, + { + "epoch": 0.3409164246531139, + "grad_norm": 4.516279220581055, + "learning_rate": 7.68759974325838e-05, + "loss": 2.1939, + "step": 2113 + }, + { + "epoch": 0.3410777670216199, + "grad_norm": 3.7355384826660156, + "learning_rate": 7.685396156814502e-05, + "loss": 2.0675, + "step": 2114 + }, + { + "epoch": 0.34123910939012586, + "grad_norm": 4.5356035232543945, + "learning_rate": 7.683191837090092e-05, + "loss": 1.8799, + "step": 2115 + }, + { + "epoch": 0.3414004517586318, + "grad_norm": 3.9328882694244385, + "learning_rate": 7.680986784687065e-05, + "loss": 2.1635, + "step": 2116 + }, + { + "epoch": 0.3415617941271378, + "grad_norm": 4.58782434463501, + "learning_rate": 7.678781000207542e-05, + "loss": 2.163, + "step": 2117 + }, + { + "epoch": 0.34172313649564373, + "grad_norm": 3.2896623611450195, + "learning_rate": 7.676574484253837e-05, + "loss": 2.2619, + "step": 2118 + }, + { + "epoch": 0.34188447886414974, + "grad_norm": 4.790579319000244, + "learning_rate": 7.674367237428466e-05, + "loss": 1.8289, + "step": 2119 + }, + { + "epoch": 0.3420458212326557, + "grad_norm": 5.787225723266602, + "learning_rate": 7.672159260334148e-05, + "loss": 2.2768, + "step": 2120 + }, + { + "epoch": 0.34220716360116166, + "grad_norm": 3.641658067703247, + "learning_rate": 7.669950553573795e-05, + "loss": 1.8741, + "step": 2121 + }, + { + "epoch": 0.3423685059696676, + "grad_norm": 3.926100254058838, + "learning_rate": 7.667741117750522e-05, + "loss": 2.0661, + "step": 2122 + }, + { + "epoch": 0.3425298483381736, + "grad_norm": 4.437743663787842, + "learning_rate": 7.665530953467643e-05, + "loss": 2.1468, + "step": 2123 + }, + { + "epoch": 0.3426911907066796, + "grad_norm": 5.147749423980713, + "learning_rate": 7.663320061328673e-05, + "loss": 1.9784, + "step": 2124 + }, + { + "epoch": 0.34285253307518554, + "grad_norm": 7.403375148773193, + "learning_rate": 7.661108441937321e-05, + "loss": 2.1373, + "step": 2125 + }, + { + "epoch": 0.3430138754436915, + "grad_norm": 4.105249881744385, + "learning_rate": 7.658896095897498e-05, + "loss": 1.8298, + "step": 2126 + }, + { + "epoch": 0.3431752178121975, + "grad_norm": 4.507058620452881, + "learning_rate": 7.656683023813311e-05, + "loss": 2.0232, + "step": 2127 + }, + { + "epoch": 0.34333656018070347, + "grad_norm": 4.855741500854492, + "learning_rate": 7.654469226289067e-05, + "loss": 2.0097, + "step": 2128 + }, + { + "epoch": 0.3434979025492094, + "grad_norm": 6.300288677215576, + "learning_rate": 7.652254703929273e-05, + "loss": 2.1869, + "step": 2129 + }, + { + "epoch": 0.3436592449177154, + "grad_norm": 3.5406997203826904, + "learning_rate": 7.650039457338628e-05, + "loss": 2.0674, + "step": 2130 + }, + { + "epoch": 0.34382058728622134, + "grad_norm": 3.7372398376464844, + "learning_rate": 7.647823487122034e-05, + "loss": 2.083, + "step": 2131 + }, + { + "epoch": 0.34398192965472735, + "grad_norm": 3.858139753341675, + "learning_rate": 7.645606793884592e-05, + "loss": 2.0578, + "step": 2132 + }, + { + "epoch": 0.3441432720232333, + "grad_norm": 4.632990837097168, + "learning_rate": 7.643389378231592e-05, + "loss": 2.0941, + "step": 2133 + }, + { + "epoch": 0.34430461439173926, + "grad_norm": 3.7732186317443848, + "learning_rate": 7.64117124076853e-05, + "loss": 2.1238, + "step": 2134 + }, + { + "epoch": 0.3444659567602452, + "grad_norm": 4.680307865142822, + "learning_rate": 7.638952382101094e-05, + "loss": 2.0591, + "step": 2135 + }, + { + "epoch": 0.34462729912875123, + "grad_norm": 4.768545627593994, + "learning_rate": 7.636732802835174e-05, + "loss": 2.0028, + "step": 2136 + }, + { + "epoch": 0.3447886414972572, + "grad_norm": 3.762096643447876, + "learning_rate": 7.63451250357685e-05, + "loss": 1.9523, + "step": 2137 + }, + { + "epoch": 0.34494998386576314, + "grad_norm": 5.83837366104126, + "learning_rate": 7.632291484932403e-05, + "loss": 2.1163, + "step": 2138 + }, + { + "epoch": 0.3451113262342691, + "grad_norm": 5.010060787200928, + "learning_rate": 7.63006974750831e-05, + "loss": 2.0617, + "step": 2139 + }, + { + "epoch": 0.3452726686027751, + "grad_norm": 4.351550579071045, + "learning_rate": 7.627847291911243e-05, + "loss": 2.0207, + "step": 2140 + }, + { + "epoch": 0.34543401097128107, + "grad_norm": 5.060973644256592, + "learning_rate": 7.625624118748074e-05, + "loss": 2.1295, + "step": 2141 + }, + { + "epoch": 0.345595353339787, + "grad_norm": 4.052987575531006, + "learning_rate": 7.623400228625863e-05, + "loss": 2.0639, + "step": 2142 + }, + { + "epoch": 0.345756695708293, + "grad_norm": 4.111716270446777, + "learning_rate": 7.621175622151873e-05, + "loss": 1.9512, + "step": 2143 + }, + { + "epoch": 0.345918038076799, + "grad_norm": 4.8807783126831055, + "learning_rate": 7.618950299933562e-05, + "loss": 2.2095, + "step": 2144 + }, + { + "epoch": 0.34607938044530495, + "grad_norm": 4.844640254974365, + "learning_rate": 7.61672426257858e-05, + "loss": 2.1281, + "step": 2145 + }, + { + "epoch": 0.3462407228138109, + "grad_norm": 4.600811004638672, + "learning_rate": 7.614497510694774e-05, + "loss": 2.0428, + "step": 2146 + }, + { + "epoch": 0.34640206518231687, + "grad_norm": 3.9224560260772705, + "learning_rate": 7.612270044890184e-05, + "loss": 2.0553, + "step": 2147 + }, + { + "epoch": 0.3465634075508228, + "grad_norm": 3.4691977500915527, + "learning_rate": 7.610041865773049e-05, + "loss": 2.063, + "step": 2148 + }, + { + "epoch": 0.34672474991932883, + "grad_norm": 4.167876720428467, + "learning_rate": 7.607812973951802e-05, + "loss": 1.8786, + "step": 2149 + }, + { + "epoch": 0.3468860922878348, + "grad_norm": 4.364349842071533, + "learning_rate": 7.605583370035069e-05, + "loss": 2.2173, + "step": 2150 + }, + { + "epoch": 0.34704743465634075, + "grad_norm": 3.3503668308258057, + "learning_rate": 7.603353054631667e-05, + "loss": 2.1548, + "step": 2151 + }, + { + "epoch": 0.3472087770248467, + "grad_norm": 4.5009331703186035, + "learning_rate": 7.601122028350617e-05, + "loss": 2.246, + "step": 2152 + }, + { + "epoch": 0.3473701193933527, + "grad_norm": 5.919765472412109, + "learning_rate": 7.598890291801124e-05, + "loss": 2.2103, + "step": 2153 + }, + { + "epoch": 0.3475314617618587, + "grad_norm": 3.6515698432922363, + "learning_rate": 7.596657845592594e-05, + "loss": 2.0917, + "step": 2154 + }, + { + "epoch": 0.34769280413036463, + "grad_norm": 4.1915693283081055, + "learning_rate": 7.59442469033462e-05, + "loss": 2.2508, + "step": 2155 + }, + { + "epoch": 0.3478541464988706, + "grad_norm": 3.4758102893829346, + "learning_rate": 7.592190826636997e-05, + "loss": 2.1832, + "step": 2156 + }, + { + "epoch": 0.3480154888673766, + "grad_norm": 5.346054553985596, + "learning_rate": 7.589956255109705e-05, + "loss": 2.0014, + "step": 2157 + }, + { + "epoch": 0.34817683123588256, + "grad_norm": 6.513914585113525, + "learning_rate": 7.587720976362927e-05, + "loss": 2.1498, + "step": 2158 + }, + { + "epoch": 0.3483381736043885, + "grad_norm": 6.071094512939453, + "learning_rate": 7.585484991007027e-05, + "loss": 1.9775, + "step": 2159 + }, + { + "epoch": 0.34849951597289447, + "grad_norm": 3.1601836681365967, + "learning_rate": 7.583248299652571e-05, + "loss": 2.1196, + "step": 2160 + }, + { + "epoch": 0.3486608583414004, + "grad_norm": 3.5352375507354736, + "learning_rate": 7.581010902910316e-05, + "loss": 1.8535, + "step": 2161 + }, + { + "epoch": 0.34882220070990644, + "grad_norm": 4.067091464996338, + "learning_rate": 7.578772801391209e-05, + "loss": 1.9708, + "step": 2162 + }, + { + "epoch": 0.3489835430784124, + "grad_norm": 4.216934680938721, + "learning_rate": 7.576533995706394e-05, + "loss": 1.846, + "step": 2163 + }, + { + "epoch": 0.34914488544691835, + "grad_norm": 5.02567195892334, + "learning_rate": 7.574294486467204e-05, + "loss": 1.9846, + "step": 2164 + }, + { + "epoch": 0.3493062278154243, + "grad_norm": 4.7987165451049805, + "learning_rate": 7.572054274285161e-05, + "loss": 1.9039, + "step": 2165 + }, + { + "epoch": 0.3494675701839303, + "grad_norm": 4.404263019561768, + "learning_rate": 7.569813359771986e-05, + "loss": 2.0985, + "step": 2166 + }, + { + "epoch": 0.3496289125524363, + "grad_norm": 4.553297996520996, + "learning_rate": 7.567571743539585e-05, + "loss": 2.0523, + "step": 2167 + }, + { + "epoch": 0.34979025492094223, + "grad_norm": 4.184520244598389, + "learning_rate": 7.565329426200065e-05, + "loss": 2.0261, + "step": 2168 + }, + { + "epoch": 0.3499515972894482, + "grad_norm": 3.7621541023254395, + "learning_rate": 7.563086408365712e-05, + "loss": 1.8697, + "step": 2169 + }, + { + "epoch": 0.3501129396579542, + "grad_norm": 3.4361226558685303, + "learning_rate": 7.560842690649014e-05, + "loss": 2.0835, + "step": 2170 + }, + { + "epoch": 0.35027428202646016, + "grad_norm": 4.431186199188232, + "learning_rate": 7.558598273662643e-05, + "loss": 1.9047, + "step": 2171 + }, + { + "epoch": 0.3504356243949661, + "grad_norm": 4.451605319976807, + "learning_rate": 7.556353158019467e-05, + "loss": 1.934, + "step": 2172 + }, + { + "epoch": 0.3505969667634721, + "grad_norm": 3.9445595741271973, + "learning_rate": 7.55410734433254e-05, + "loss": 2.0204, + "step": 2173 + }, + { + "epoch": 0.35075830913197803, + "grad_norm": 3.83076548576355, + "learning_rate": 7.551860833215112e-05, + "loss": 1.9018, + "step": 2174 + }, + { + "epoch": 0.35091965150048404, + "grad_norm": 3.7835171222686768, + "learning_rate": 7.549613625280617e-05, + "loss": 1.8462, + "step": 2175 + }, + { + "epoch": 0.35108099386899, + "grad_norm": 4.310787200927734, + "learning_rate": 7.547365721142687e-05, + "loss": 2.0906, + "step": 2176 + }, + { + "epoch": 0.35124233623749596, + "grad_norm": 5.287550449371338, + "learning_rate": 7.545117121415136e-05, + "loss": 2.0887, + "step": 2177 + }, + { + "epoch": 0.3514036786060019, + "grad_norm": 4.817923069000244, + "learning_rate": 7.542867826711974e-05, + "loss": 2.009, + "step": 2178 + }, + { + "epoch": 0.3515650209745079, + "grad_norm": 3.8996381759643555, + "learning_rate": 7.5406178376474e-05, + "loss": 2.0563, + "step": 2179 + }, + { + "epoch": 0.3517263633430139, + "grad_norm": 3.9155139923095703, + "learning_rate": 7.538367154835797e-05, + "loss": 1.9556, + "step": 2180 + }, + { + "epoch": 0.35188770571151984, + "grad_norm": 4.121738433837891, + "learning_rate": 7.536115778891746e-05, + "loss": 1.9905, + "step": 2181 + }, + { + "epoch": 0.3520490480800258, + "grad_norm": 4.250374794006348, + "learning_rate": 7.533863710430012e-05, + "loss": 2.1936, + "step": 2182 + }, + { + "epoch": 0.3522103904485318, + "grad_norm": 3.8731632232666016, + "learning_rate": 7.531610950065549e-05, + "loss": 2.261, + "step": 2183 + }, + { + "epoch": 0.35237173281703776, + "grad_norm": 3.329829216003418, + "learning_rate": 7.529357498413501e-05, + "loss": 1.903, + "step": 2184 + }, + { + "epoch": 0.3525330751855437, + "grad_norm": 3.554926633834839, + "learning_rate": 7.5271033560892e-05, + "loss": 2.0567, + "step": 2185 + }, + { + "epoch": 0.3526944175540497, + "grad_norm": 4.352489471435547, + "learning_rate": 7.524848523708168e-05, + "loss": 1.9621, + "step": 2186 + }, + { + "epoch": 0.3528557599225557, + "grad_norm": 4.469099521636963, + "learning_rate": 7.522593001886118e-05, + "loss": 1.787, + "step": 2187 + }, + { + "epoch": 0.35301710229106165, + "grad_norm": 4.401259899139404, + "learning_rate": 7.520336791238942e-05, + "loss": 2.2623, + "step": 2188 + }, + { + "epoch": 0.3531784446595676, + "grad_norm": 4.640010833740234, + "learning_rate": 7.518079892382732e-05, + "loss": 1.9496, + "step": 2189 + }, + { + "epoch": 0.35333978702807356, + "grad_norm": 5.381575107574463, + "learning_rate": 7.515822305933758e-05, + "loss": 2.1458, + "step": 2190 + }, + { + "epoch": 0.3535011293965795, + "grad_norm": 4.868968486785889, + "learning_rate": 7.513564032508484e-05, + "loss": 2.3643, + "step": 2191 + }, + { + "epoch": 0.35366247176508553, + "grad_norm": 3.7936339378356934, + "learning_rate": 7.511305072723559e-05, + "loss": 2.0604, + "step": 2192 + }, + { + "epoch": 0.3538238141335915, + "grad_norm": 4.639528751373291, + "learning_rate": 7.509045427195819e-05, + "loss": 2.0235, + "step": 2193 + }, + { + "epoch": 0.35398515650209744, + "grad_norm": 3.7656235694885254, + "learning_rate": 7.506785096542289e-05, + "loss": 1.9896, + "step": 2194 + }, + { + "epoch": 0.3541464988706034, + "grad_norm": 3.891155242919922, + "learning_rate": 7.50452408138018e-05, + "loss": 1.972, + "step": 2195 + }, + { + "epoch": 0.3543078412391094, + "grad_norm": 4.294320583343506, + "learning_rate": 7.502262382326888e-05, + "loss": 2.0749, + "step": 2196 + }, + { + "epoch": 0.35446918360761537, + "grad_norm": 4.284241676330566, + "learning_rate": 7.500000000000001e-05, + "loss": 1.8442, + "step": 2197 + }, + { + "epoch": 0.3546305259761213, + "grad_norm": 4.18466854095459, + "learning_rate": 7.49773693501729e-05, + "loss": 2.0265, + "step": 2198 + }, + { + "epoch": 0.3547918683446273, + "grad_norm": 5.000754356384277, + "learning_rate": 7.495473187996709e-05, + "loss": 2.0031, + "step": 2199 + }, + { + "epoch": 0.3549532107131333, + "grad_norm": 4.0818705558776855, + "learning_rate": 7.493208759556406e-05, + "loss": 1.8871, + "step": 2200 + }, + { + "epoch": 0.35511455308163925, + "grad_norm": 4.289008140563965, + "learning_rate": 7.49094365031471e-05, + "loss": 1.9596, + "step": 2201 + }, + { + "epoch": 0.3552758954501452, + "grad_norm": 4.734685897827148, + "learning_rate": 7.48867786089014e-05, + "loss": 2.2165, + "step": 2202 + }, + { + "epoch": 0.35543723781865116, + "grad_norm": 3.9588840007781982, + "learning_rate": 7.48641139190139e-05, + "loss": 1.9112, + "step": 2203 + }, + { + "epoch": 0.3555985801871571, + "grad_norm": 5.025766372680664, + "learning_rate": 7.484144243967353e-05, + "loss": 1.9741, + "step": 2204 + }, + { + "epoch": 0.35575992255566313, + "grad_norm": 5.695304870605469, + "learning_rate": 7.481876417707102e-05, + "loss": 1.9144, + "step": 2205 + }, + { + "epoch": 0.3559212649241691, + "grad_norm": 5.085302829742432, + "learning_rate": 7.479607913739894e-05, + "loss": 2.021, + "step": 2206 + }, + { + "epoch": 0.35608260729267505, + "grad_norm": 4.136185169219971, + "learning_rate": 7.47733873268517e-05, + "loss": 1.879, + "step": 2207 + }, + { + "epoch": 0.356243949661181, + "grad_norm": 6.267850875854492, + "learning_rate": 7.475068875162561e-05, + "loss": 1.9467, + "step": 2208 + }, + { + "epoch": 0.356405292029687, + "grad_norm": 4.190792083740234, + "learning_rate": 7.472798341791877e-05, + "loss": 1.9547, + "step": 2209 + }, + { + "epoch": 0.35656663439819297, + "grad_norm": 6.514063358306885, + "learning_rate": 7.470527133193116e-05, + "loss": 1.6638, + "step": 2210 + }, + { + "epoch": 0.35672797676669893, + "grad_norm": 4.96861457824707, + "learning_rate": 7.46825524998646e-05, + "loss": 1.9349, + "step": 2211 + }, + { + "epoch": 0.3568893191352049, + "grad_norm": 4.728672504425049, + "learning_rate": 7.465982692792275e-05, + "loss": 1.8288, + "step": 2212 + }, + { + "epoch": 0.3570506615037109, + "grad_norm": 3.9773061275482178, + "learning_rate": 7.46370946223111e-05, + "loss": 1.7914, + "step": 2213 + }, + { + "epoch": 0.35721200387221685, + "grad_norm": 4.387299060821533, + "learning_rate": 7.461435558923698e-05, + "loss": 1.8791, + "step": 2214 + }, + { + "epoch": 0.3573733462407228, + "grad_norm": 4.533514499664307, + "learning_rate": 7.459160983490959e-05, + "loss": 2.2867, + "step": 2215 + }, + { + "epoch": 0.35753468860922877, + "grad_norm": 4.0857977867126465, + "learning_rate": 7.456885736553989e-05, + "loss": 2.14, + "step": 2216 + }, + { + "epoch": 0.3576960309777348, + "grad_norm": 4.624113082885742, + "learning_rate": 7.454609818734076e-05, + "loss": 1.7061, + "step": 2217 + }, + { + "epoch": 0.35785737334624074, + "grad_norm": 5.158027648925781, + "learning_rate": 7.452333230652688e-05, + "loss": 1.7976, + "step": 2218 + }, + { + "epoch": 0.3580187157147467, + "grad_norm": 4.00270938873291, + "learning_rate": 7.450055972931473e-05, + "loss": 2.1349, + "step": 2219 + }, + { + "epoch": 0.35818005808325265, + "grad_norm": 5.796380519866943, + "learning_rate": 7.447778046192267e-05, + "loss": 1.9829, + "step": 2220 + }, + { + "epoch": 0.3583414004517586, + "grad_norm": 4.402221202850342, + "learning_rate": 7.445499451057084e-05, + "loss": 2.1976, + "step": 2221 + }, + { + "epoch": 0.3585027428202646, + "grad_norm": 3.4079225063323975, + "learning_rate": 7.443220188148123e-05, + "loss": 2.0053, + "step": 2222 + }, + { + "epoch": 0.3586640851887706, + "grad_norm": 5.272507190704346, + "learning_rate": 7.440940258087764e-05, + "loss": 2.1123, + "step": 2223 + }, + { + "epoch": 0.35882542755727653, + "grad_norm": 4.93818998336792, + "learning_rate": 7.43865966149857e-05, + "loss": 1.9708, + "step": 2224 + }, + { + "epoch": 0.3589867699257825, + "grad_norm": 4.46998405456543, + "learning_rate": 7.436378399003288e-05, + "loss": 2.1915, + "step": 2225 + }, + { + "epoch": 0.3591481122942885, + "grad_norm": 5.026282787322998, + "learning_rate": 7.434096471224842e-05, + "loss": 1.7489, + "step": 2226 + }, + { + "epoch": 0.35930945466279446, + "grad_norm": 5.975449562072754, + "learning_rate": 7.431813878786343e-05, + "loss": 1.9611, + "step": 2227 + }, + { + "epoch": 0.3594707970313004, + "grad_norm": 3.4423348903656006, + "learning_rate": 7.42953062231108e-05, + "loss": 2.0725, + "step": 2228 + }, + { + "epoch": 0.35963213939980637, + "grad_norm": 3.4423348903656006, + "learning_rate": 7.42953062231108e-05, + "loss": 2.1422, + "step": 2229 + }, + { + "epoch": 0.3597934817683124, + "grad_norm": 5.178089141845703, + "learning_rate": 7.427246702422525e-05, + "loss": 1.9105, + "step": 2230 + }, + { + "epoch": 0.35995482413681834, + "grad_norm": 4.918189525604248, + "learning_rate": 7.42496211974433e-05, + "loss": 2.1764, + "step": 2231 + }, + { + "epoch": 0.3601161665053243, + "grad_norm": 4.000021934509277, + "learning_rate": 7.422676874900329e-05, + "loss": 2.0969, + "step": 2232 + }, + { + "epoch": 0.36027750887383025, + "grad_norm": 5.2618536949157715, + "learning_rate": 7.420390968514535e-05, + "loss": 1.9344, + "step": 2233 + }, + { + "epoch": 0.3604388512423362, + "grad_norm": 6.015827178955078, + "learning_rate": 7.418104401211143e-05, + "loss": 2.1002, + "step": 2234 + }, + { + "epoch": 0.3606001936108422, + "grad_norm": 4.103837013244629, + "learning_rate": 7.41581717361453e-05, + "loss": 2.0114, + "step": 2235 + }, + { + "epoch": 0.3607615359793482, + "grad_norm": 4.670347213745117, + "learning_rate": 7.41352928634925e-05, + "loss": 2.2096, + "step": 2236 + }, + { + "epoch": 0.36092287834785414, + "grad_norm": 3.644935131072998, + "learning_rate": 7.41124074004004e-05, + "loss": 2.0364, + "step": 2237 + }, + { + "epoch": 0.3610842207163601, + "grad_norm": 5.363805294036865, + "learning_rate": 7.408951535311815e-05, + "loss": 2.0182, + "step": 2238 + }, + { + "epoch": 0.3612455630848661, + "grad_norm": 5.148021697998047, + "learning_rate": 7.40666167278967e-05, + "loss": 2.1056, + "step": 2239 + }, + { + "epoch": 0.36140690545337206, + "grad_norm": 5.083660125732422, + "learning_rate": 7.404371153098883e-05, + "loss": 1.9729, + "step": 2240 + }, + { + "epoch": 0.361568247821878, + "grad_norm": 4.589669704437256, + "learning_rate": 7.402079976864905e-05, + "loss": 2.0438, + "step": 2241 + }, + { + "epoch": 0.361729590190384, + "grad_norm": 5.363358020782471, + "learning_rate": 7.399788144713374e-05, + "loss": 1.9283, + "step": 2242 + }, + { + "epoch": 0.36189093255889, + "grad_norm": 4.749497890472412, + "learning_rate": 7.3974956572701e-05, + "loss": 2.1566, + "step": 2243 + }, + { + "epoch": 0.36205227492739595, + "grad_norm": 4.069309234619141, + "learning_rate": 7.395202515161073e-05, + "loss": 1.7986, + "step": 2244 + }, + { + "epoch": 0.3622136172959019, + "grad_norm": 3.9743974208831787, + "learning_rate": 7.392908719012468e-05, + "loss": 2.138, + "step": 2245 + }, + { + "epoch": 0.36237495966440786, + "grad_norm": 4.392673969268799, + "learning_rate": 7.390614269450634e-05, + "loss": 1.753, + "step": 2246 + }, + { + "epoch": 0.36253630203291387, + "grad_norm": 5.113845348358154, + "learning_rate": 7.388319167102097e-05, + "loss": 1.8796, + "step": 2247 + }, + { + "epoch": 0.3626976444014198, + "grad_norm": 3.84138560295105, + "learning_rate": 7.386023412593563e-05, + "loss": 2.0539, + "step": 2248 + }, + { + "epoch": 0.3628589867699258, + "grad_norm": 3.8947153091430664, + "learning_rate": 7.383727006551916e-05, + "loss": 2.0031, + "step": 2249 + }, + { + "epoch": 0.36302032913843174, + "grad_norm": 3.7585270404815674, + "learning_rate": 7.381429949604218e-05, + "loss": 1.9055, + "step": 2250 + }, + { + "epoch": 0.3631816715069377, + "grad_norm": 4.749552249908447, + "learning_rate": 7.379132242377712e-05, + "loss": 1.9108, + "step": 2251 + }, + { + "epoch": 0.3633430138754437, + "grad_norm": 5.605308532714844, + "learning_rate": 7.37683388549981e-05, + "loss": 1.9858, + "step": 2252 + }, + { + "epoch": 0.36350435624394967, + "grad_norm": 4.167489051818848, + "learning_rate": 7.374534879598109e-05, + "loss": 1.9966, + "step": 2253 + }, + { + "epoch": 0.3636656986124556, + "grad_norm": 4.343514442443848, + "learning_rate": 7.372235225300382e-05, + "loss": 2.2502, + "step": 2254 + }, + { + "epoch": 0.3638270409809616, + "grad_norm": 4.107678413391113, + "learning_rate": 7.369934923234577e-05, + "loss": 2.0626, + "step": 2255 + }, + { + "epoch": 0.3639883833494676, + "grad_norm": 3.700336217880249, + "learning_rate": 7.36763397402882e-05, + "loss": 1.8523, + "step": 2256 + }, + { + "epoch": 0.36414972571797355, + "grad_norm": 4.870800495147705, + "learning_rate": 7.365332378311414e-05, + "loss": 2.0273, + "step": 2257 + }, + { + "epoch": 0.3643110680864795, + "grad_norm": 5.010525226593018, + "learning_rate": 7.363030136710836e-05, + "loss": 2.1416, + "step": 2258 + }, + { + "epoch": 0.36447241045498546, + "grad_norm": 5.208023548126221, + "learning_rate": 7.360727249855744e-05, + "loss": 1.8927, + "step": 2259 + }, + { + "epoch": 0.3646337528234915, + "grad_norm": 3.7048261165618896, + "learning_rate": 7.35842371837497e-05, + "loss": 2.0855, + "step": 2260 + }, + { + "epoch": 0.36479509519199743, + "grad_norm": 3.731630563735962, + "learning_rate": 7.356119542897518e-05, + "loss": 1.9924, + "step": 2261 + }, + { + "epoch": 0.3649564375605034, + "grad_norm": 4.801681995391846, + "learning_rate": 7.353814724052576e-05, + "loss": 2.1764, + "step": 2262 + }, + { + "epoch": 0.36511777992900935, + "grad_norm": 5.089466094970703, + "learning_rate": 7.3515092624695e-05, + "loss": 1.8582, + "step": 2263 + }, + { + "epoch": 0.3652791222975153, + "grad_norm": 4.4316840171813965, + "learning_rate": 7.349203158777826e-05, + "loss": 2.1998, + "step": 2264 + }, + { + "epoch": 0.3654404646660213, + "grad_norm": 3.751309394836426, + "learning_rate": 7.346896413607262e-05, + "loss": 2.053, + "step": 2265 + }, + { + "epoch": 0.36560180703452727, + "grad_norm": 2.876962661743164, + "learning_rate": 7.344589027587697e-05, + "loss": 2.16, + "step": 2266 + }, + { + "epoch": 0.3657631494030332, + "grad_norm": 4.779649257659912, + "learning_rate": 7.34228100134919e-05, + "loss": 1.84, + "step": 2267 + }, + { + "epoch": 0.3659244917715392, + "grad_norm": 3.540619373321533, + "learning_rate": 7.339972335521972e-05, + "loss": 1.972, + "step": 2268 + }, + { + "epoch": 0.3660858341400452, + "grad_norm": 4.27071475982666, + "learning_rate": 7.33766303073646e-05, + "loss": 1.913, + "step": 2269 + }, + { + "epoch": 0.36624717650855115, + "grad_norm": 5.800755977630615, + "learning_rate": 7.335353087623231e-05, + "loss": 1.8588, + "step": 2270 + }, + { + "epoch": 0.3664085188770571, + "grad_norm": 4.293838977813721, + "learning_rate": 7.333042506813048e-05, + "loss": 2.1418, + "step": 2271 + }, + { + "epoch": 0.36656986124556307, + "grad_norm": 3.397761821746826, + "learning_rate": 7.330731288936843e-05, + "loss": 2.0779, + "step": 2272 + }, + { + "epoch": 0.3667312036140691, + "grad_norm": 3.5220279693603516, + "learning_rate": 7.32841943462572e-05, + "loss": 1.8257, + "step": 2273 + }, + { + "epoch": 0.36689254598257504, + "grad_norm": 3.7657272815704346, + "learning_rate": 7.32610694451096e-05, + "loss": 1.8189, + "step": 2274 + }, + { + "epoch": 0.367053888351081, + "grad_norm": 4.797881126403809, + "learning_rate": 7.32379381922402e-05, + "loss": 2.0172, + "step": 2275 + }, + { + "epoch": 0.36721523071958695, + "grad_norm": 3.328403949737549, + "learning_rate": 7.321480059396523e-05, + "loss": 1.9931, + "step": 2276 + }, + { + "epoch": 0.3673765730880929, + "grad_norm": 6.129566192626953, + "learning_rate": 7.319165665660273e-05, + "loss": 1.9476, + "step": 2277 + }, + { + "epoch": 0.3675379154565989, + "grad_norm": 4.815751075744629, + "learning_rate": 7.316850638647243e-05, + "loss": 1.9807, + "step": 2278 + }, + { + "epoch": 0.3676992578251049, + "grad_norm": 5.611696243286133, + "learning_rate": 7.31453497898958e-05, + "loss": 1.8491, + "step": 2279 + }, + { + "epoch": 0.36786060019361083, + "grad_norm": 4.9298224449157715, + "learning_rate": 7.312218687319603e-05, + "loss": 1.8654, + "step": 2280 + }, + { + "epoch": 0.3680219425621168, + "grad_norm": 4.291321754455566, + "learning_rate": 7.309901764269802e-05, + "loss": 1.9494, + "step": 2281 + }, + { + "epoch": 0.3681832849306228, + "grad_norm": 4.082421779632568, + "learning_rate": 7.307584210472844e-05, + "loss": 2.0036, + "step": 2282 + }, + { + "epoch": 0.36834462729912876, + "grad_norm": 4.3880743980407715, + "learning_rate": 7.305266026561565e-05, + "loss": 1.9789, + "step": 2283 + }, + { + "epoch": 0.3685059696676347, + "grad_norm": 3.8602914810180664, + "learning_rate": 7.302947213168974e-05, + "loss": 1.8724, + "step": 2284 + }, + { + "epoch": 0.36866731203614067, + "grad_norm": 4.736367702484131, + "learning_rate": 7.300627770928252e-05, + "loss": 1.9731, + "step": 2285 + }, + { + "epoch": 0.3688286544046467, + "grad_norm": 5.397782802581787, + "learning_rate": 7.298307700472748e-05, + "loss": 1.9259, + "step": 2286 + }, + { + "epoch": 0.36898999677315264, + "grad_norm": 5.391665935516357, + "learning_rate": 7.295987002435989e-05, + "loss": 1.9912, + "step": 2287 + }, + { + "epoch": 0.3691513391416586, + "grad_norm": 4.335827350616455, + "learning_rate": 7.29366567745167e-05, + "loss": 1.7932, + "step": 2288 + }, + { + "epoch": 0.36931268151016455, + "grad_norm": 3.538144826889038, + "learning_rate": 7.291343726153656e-05, + "loss": 1.9518, + "step": 2289 + }, + { + "epoch": 0.36947402387867057, + "grad_norm": 3.7503881454467773, + "learning_rate": 7.289021149175986e-05, + "loss": 2.0434, + "step": 2290 + }, + { + "epoch": 0.3696353662471765, + "grad_norm": 3.6585474014282227, + "learning_rate": 7.286697947152867e-05, + "loss": 1.9967, + "step": 2291 + }, + { + "epoch": 0.3697967086156825, + "grad_norm": 4.704679489135742, + "learning_rate": 7.28437412071868e-05, + "loss": 2.1335, + "step": 2292 + }, + { + "epoch": 0.36995805098418844, + "grad_norm": 4.349279880523682, + "learning_rate": 7.28204967050797e-05, + "loss": 1.9715, + "step": 2293 + }, + { + "epoch": 0.3701193933526944, + "grad_norm": 4.37742805480957, + "learning_rate": 7.279724597155462e-05, + "loss": 1.7624, + "step": 2294 + }, + { + "epoch": 0.3702807357212004, + "grad_norm": 4.009250640869141, + "learning_rate": 7.277398901296044e-05, + "loss": 2.0692, + "step": 2295 + }, + { + "epoch": 0.37044207808970636, + "grad_norm": 5.741460800170898, + "learning_rate": 7.275072583564775e-05, + "loss": 1.9758, + "step": 2296 + }, + { + "epoch": 0.3706034204582123, + "grad_norm": 3.7926852703094482, + "learning_rate": 7.272745644596887e-05, + "loss": 1.9235, + "step": 2297 + }, + { + "epoch": 0.3707647628267183, + "grad_norm": 3.9284520149230957, + "learning_rate": 7.270418085027776e-05, + "loss": 2.3003, + "step": 2298 + }, + { + "epoch": 0.3709261051952243, + "grad_norm": 4.326069355010986, + "learning_rate": 7.268089905493013e-05, + "loss": 2.0143, + "step": 2299 + }, + { + "epoch": 0.37108744756373024, + "grad_norm": 5.114227771759033, + "learning_rate": 7.265761106628337e-05, + "loss": 2.0741, + "step": 2300 + }, + { + "epoch": 0.3712487899322362, + "grad_norm": 5.760619640350342, + "learning_rate": 7.263431689069651e-05, + "loss": 2.0508, + "step": 2301 + }, + { + "epoch": 0.37141013230074216, + "grad_norm": 5.333743095397949, + "learning_rate": 7.261101653453038e-05, + "loss": 1.8713, + "step": 2302 + }, + { + "epoch": 0.37157147466924817, + "grad_norm": 4.196969509124756, + "learning_rate": 7.258771000414735e-05, + "loss": 2.0845, + "step": 2303 + }, + { + "epoch": 0.3717328170377541, + "grad_norm": 3.85819673538208, + "learning_rate": 7.256439730591162e-05, + "loss": 2.2329, + "step": 2304 + }, + { + "epoch": 0.3718941594062601, + "grad_norm": 5.132253170013428, + "learning_rate": 7.2541078446189e-05, + "loss": 1.9991, + "step": 2305 + }, + { + "epoch": 0.37205550177476604, + "grad_norm": 6.330166339874268, + "learning_rate": 7.251775343134694e-05, + "loss": 1.9949, + "step": 2306 + }, + { + "epoch": 0.372216844143272, + "grad_norm": 3.9378414154052734, + "learning_rate": 7.249442226775469e-05, + "loss": 1.917, + "step": 2307 + }, + { + "epoch": 0.372378186511778, + "grad_norm": 4.884683132171631, + "learning_rate": 7.247108496178307e-05, + "loss": 2.036, + "step": 2308 + }, + { + "epoch": 0.37253952888028397, + "grad_norm": 3.73783540725708, + "learning_rate": 7.244774151980466e-05, + "loss": 1.905, + "step": 2309 + }, + { + "epoch": 0.3727008712487899, + "grad_norm": 4.066815376281738, + "learning_rate": 7.242439194819364e-05, + "loss": 2.1589, + "step": 2310 + }, + { + "epoch": 0.3728622136172959, + "grad_norm": 4.2685770988464355, + "learning_rate": 7.240103625332589e-05, + "loss": 2.2511, + "step": 2311 + }, + { + "epoch": 0.3730235559858019, + "grad_norm": 3.9241483211517334, + "learning_rate": 7.237767444157899e-05, + "loss": 1.9503, + "step": 2312 + }, + { + "epoch": 0.37318489835430785, + "grad_norm": 3.376739025115967, + "learning_rate": 7.235430651933217e-05, + "loss": 2.0288, + "step": 2313 + }, + { + "epoch": 0.3733462407228138, + "grad_norm": 3.62248158454895, + "learning_rate": 7.233093249296631e-05, + "loss": 2.0128, + "step": 2314 + }, + { + "epoch": 0.37350758309131976, + "grad_norm": 3.5569229125976562, + "learning_rate": 7.230755236886401e-05, + "loss": 2.3202, + "step": 2315 + }, + { + "epoch": 0.3736689254598258, + "grad_norm": 2.885396718978882, + "learning_rate": 7.228416615340949e-05, + "loss": 1.9526, + "step": 2316 + }, + { + "epoch": 0.37383026782833173, + "grad_norm": 3.926088571548462, + "learning_rate": 7.226077385298862e-05, + "loss": 1.8278, + "step": 2317 + }, + { + "epoch": 0.3739916101968377, + "grad_norm": 4.93345832824707, + "learning_rate": 7.223737547398898e-05, + "loss": 2.3146, + "step": 2318 + }, + { + "epoch": 0.37415295256534364, + "grad_norm": 3.6649792194366455, + "learning_rate": 7.221397102279979e-05, + "loss": 1.9611, + "step": 2319 + }, + { + "epoch": 0.37431429493384966, + "grad_norm": 4.375561714172363, + "learning_rate": 7.21905605058119e-05, + "loss": 1.8566, + "step": 2320 + }, + { + "epoch": 0.3744756373023556, + "grad_norm": 3.7766683101654053, + "learning_rate": 7.216714392941785e-05, + "loss": 2.3165, + "step": 2321 + }, + { + "epoch": 0.37463697967086157, + "grad_norm": 3.7465713024139404, + "learning_rate": 7.214372130001184e-05, + "loss": 2.0361, + "step": 2322 + }, + { + "epoch": 0.3747983220393675, + "grad_norm": 4.272165775299072, + "learning_rate": 7.212029262398972e-05, + "loss": 1.925, + "step": 2323 + }, + { + "epoch": 0.3749596644078735, + "grad_norm": 4.022883415222168, + "learning_rate": 7.209685790774892e-05, + "loss": 1.8744, + "step": 2324 + }, + { + "epoch": 0.3751210067763795, + "grad_norm": 4.182277202606201, + "learning_rate": 7.207341715768863e-05, + "loss": 2.1156, + "step": 2325 + }, + { + "epoch": 0.37528234914488545, + "grad_norm": 3.665260076522827, + "learning_rate": 7.20499703802096e-05, + "loss": 2.047, + "step": 2326 + }, + { + "epoch": 0.3754436915133914, + "grad_norm": 4.080063819885254, + "learning_rate": 7.202651758171431e-05, + "loss": 1.9225, + "step": 2327 + }, + { + "epoch": 0.37560503388189737, + "grad_norm": 6.866401672363281, + "learning_rate": 7.200305876860678e-05, + "loss": 2.1732, + "step": 2328 + }, + { + "epoch": 0.3757663762504034, + "grad_norm": 4.220969200134277, + "learning_rate": 7.19795939472928e-05, + "loss": 2.1178, + "step": 2329 + }, + { + "epoch": 0.37592771861890933, + "grad_norm": 5.637753486633301, + "learning_rate": 7.195612312417965e-05, + "loss": 2.1014, + "step": 2330 + }, + { + "epoch": 0.3760890609874153, + "grad_norm": 4.461935043334961, + "learning_rate": 7.193264630567635e-05, + "loss": 2.0136, + "step": 2331 + }, + { + "epoch": 0.37625040335592125, + "grad_norm": 4.913003444671631, + "learning_rate": 7.190916349819356e-05, + "loss": 2.0294, + "step": 2332 + }, + { + "epoch": 0.37641174572442726, + "grad_norm": 4.060622215270996, + "learning_rate": 7.188567470814354e-05, + "loss": 1.9674, + "step": 2333 + }, + { + "epoch": 0.3765730880929332, + "grad_norm": 4.1861348152160645, + "learning_rate": 7.18621799419402e-05, + "loss": 1.9682, + "step": 2334 + }, + { + "epoch": 0.3767344304614392, + "grad_norm": 3.848463296890259, + "learning_rate": 7.183867920599906e-05, + "loss": 1.8223, + "step": 2335 + }, + { + "epoch": 0.37689577282994513, + "grad_norm": 6.9484968185424805, + "learning_rate": 7.181517250673728e-05, + "loss": 1.9903, + "step": 2336 + }, + { + "epoch": 0.3770571151984511, + "grad_norm": 4.20919942855835, + "learning_rate": 7.179165985057368e-05, + "loss": 1.835, + "step": 2337 + }, + { + "epoch": 0.3772184575669571, + "grad_norm": 3.890127658843994, + "learning_rate": 7.176814124392866e-05, + "loss": 1.8775, + "step": 2338 + }, + { + "epoch": 0.37737979993546306, + "grad_norm": 6.341292381286621, + "learning_rate": 7.174461669322427e-05, + "loss": 2.2324, + "step": 2339 + }, + { + "epoch": 0.377541142303969, + "grad_norm": 4.031415939331055, + "learning_rate": 7.172108620488419e-05, + "loss": 1.7621, + "step": 2340 + }, + { + "epoch": 0.37770248467247497, + "grad_norm": 5.048474311828613, + "learning_rate": 7.16975497853337e-05, + "loss": 2.0766, + "step": 2341 + }, + { + "epoch": 0.377863827040981, + "grad_norm": 4.281801223754883, + "learning_rate": 7.16740074409997e-05, + "loss": 1.9402, + "step": 2342 + }, + { + "epoch": 0.37802516940948694, + "grad_norm": 4.380557060241699, + "learning_rate": 7.165045917831074e-05, + "loss": 2.0619, + "step": 2343 + }, + { + "epoch": 0.3781865117779929, + "grad_norm": 4.43021821975708, + "learning_rate": 7.162690500369694e-05, + "loss": 1.9965, + "step": 2344 + }, + { + "epoch": 0.37834785414649885, + "grad_norm": 4.685453414916992, + "learning_rate": 7.160334492359007e-05, + "loss": 1.942, + "step": 2345 + }, + { + "epoch": 0.37850919651500486, + "grad_norm": 4.532835483551025, + "learning_rate": 7.157977894442349e-05, + "loss": 1.9803, + "step": 2346 + }, + { + "epoch": 0.3786705388835108, + "grad_norm": 4.49874210357666, + "learning_rate": 7.155620707263223e-05, + "loss": 2.0574, + "step": 2347 + }, + { + "epoch": 0.3788318812520168, + "grad_norm": 5.898449897766113, + "learning_rate": 7.15326293146528e-05, + "loss": 1.9402, + "step": 2348 + }, + { + "epoch": 0.37899322362052273, + "grad_norm": 4.97932767868042, + "learning_rate": 7.150904567692348e-05, + "loss": 2.173, + "step": 2349 + }, + { + "epoch": 0.3791545659890287, + "grad_norm": 3.5668933391571045, + "learning_rate": 7.148545616588398e-05, + "loss": 1.9776, + "step": 2350 + }, + { + "epoch": 0.3793159083575347, + "grad_norm": 4.622684955596924, + "learning_rate": 7.146186078797578e-05, + "loss": 1.9501, + "step": 2351 + }, + { + "epoch": 0.37947725072604066, + "grad_norm": 3.8975729942321777, + "learning_rate": 7.143825954964187e-05, + "loss": 1.9992, + "step": 2352 + }, + { + "epoch": 0.3796385930945466, + "grad_norm": 4.953866004943848, + "learning_rate": 7.141465245732686e-05, + "loss": 1.9886, + "step": 2353 + }, + { + "epoch": 0.3797999354630526, + "grad_norm": 4.589084625244141, + "learning_rate": 7.139103951747695e-05, + "loss": 2.0232, + "step": 2354 + }, + { + "epoch": 0.3799612778315586, + "grad_norm": 5.781042575836182, + "learning_rate": 7.136742073653994e-05, + "loss": 1.9773, + "step": 2355 + }, + { + "epoch": 0.38012262020006454, + "grad_norm": 3.937396287918091, + "learning_rate": 7.134379612096525e-05, + "loss": 2.1002, + "step": 2356 + }, + { + "epoch": 0.3802839625685705, + "grad_norm": 4.775443077087402, + "learning_rate": 7.132016567720385e-05, + "loss": 2.2923, + "step": 2357 + }, + { + "epoch": 0.38044530493707646, + "grad_norm": 5.597178936004639, + "learning_rate": 7.129652941170835e-05, + "loss": 2.1866, + "step": 2358 + }, + { + "epoch": 0.38060664730558247, + "grad_norm": 3.9723527431488037, + "learning_rate": 7.12728873309329e-05, + "loss": 2.016, + "step": 2359 + }, + { + "epoch": 0.3807679896740884, + "grad_norm": 4.569044589996338, + "learning_rate": 7.124923944133326e-05, + "loss": 2.0614, + "step": 2360 + }, + { + "epoch": 0.3809293320425944, + "grad_norm": 4.657077312469482, + "learning_rate": 7.12255857493668e-05, + "loss": 2.1499, + "step": 2361 + }, + { + "epoch": 0.38109067441110034, + "grad_norm": 4.39790153503418, + "learning_rate": 7.120192626149242e-05, + "loss": 1.9213, + "step": 2362 + }, + { + "epoch": 0.38125201677960635, + "grad_norm": 4.823693752288818, + "learning_rate": 7.117826098417068e-05, + "loss": 2.2264, + "step": 2363 + }, + { + "epoch": 0.3814133591481123, + "grad_norm": 4.211658477783203, + "learning_rate": 7.115458992386364e-05, + "loss": 2.1787, + "step": 2364 + }, + { + "epoch": 0.38157470151661826, + "grad_norm": 3.9508187770843506, + "learning_rate": 7.113091308703498e-05, + "loss": 1.8984, + "step": 2365 + }, + { + "epoch": 0.3817360438851242, + "grad_norm": 3.8432865142822266, + "learning_rate": 7.110723048014996e-05, + "loss": 1.7879, + "step": 2366 + }, + { + "epoch": 0.3818973862536302, + "grad_norm": 7.136585235595703, + "learning_rate": 7.108354210967541e-05, + "loss": 1.9045, + "step": 2367 + }, + { + "epoch": 0.3820587286221362, + "grad_norm": 5.7573018074035645, + "learning_rate": 7.105984798207972e-05, + "loss": 2.1248, + "step": 2368 + }, + { + "epoch": 0.38222007099064215, + "grad_norm": 4.350116729736328, + "learning_rate": 7.103614810383288e-05, + "loss": 1.9145, + "step": 2369 + }, + { + "epoch": 0.3823814133591481, + "grad_norm": 3.821668863296509, + "learning_rate": 7.101244248140642e-05, + "loss": 2.0725, + "step": 2370 + }, + { + "epoch": 0.38254275572765406, + "grad_norm": 5.314784526824951, + "learning_rate": 7.098873112127345e-05, + "loss": 2.022, + "step": 2371 + }, + { + "epoch": 0.3827040980961601, + "grad_norm": 3.54974102973938, + "learning_rate": 7.096501402990865e-05, + "loss": 1.7158, + "step": 2372 + }, + { + "epoch": 0.38286544046466603, + "grad_norm": 4.784726619720459, + "learning_rate": 7.09412912137883e-05, + "loss": 2.0397, + "step": 2373 + }, + { + "epoch": 0.383026782833172, + "grad_norm": 4.838008403778076, + "learning_rate": 7.091756267939015e-05, + "loss": 2.0775, + "step": 2374 + }, + { + "epoch": 0.38318812520167794, + "grad_norm": 4.776695728302002, + "learning_rate": 7.089382843319361e-05, + "loss": 2.2584, + "step": 2375 + }, + { + "epoch": 0.38334946757018395, + "grad_norm": 3.448340654373169, + "learning_rate": 7.087008848167959e-05, + "loss": 1.8765, + "step": 2376 + }, + { + "epoch": 0.3835108099386899, + "grad_norm": 5.133579730987549, + "learning_rate": 7.084634283133059e-05, + "loss": 2.2597, + "step": 2377 + }, + { + "epoch": 0.38367215230719587, + "grad_norm": 4.340801239013672, + "learning_rate": 7.082259148863064e-05, + "loss": 2.0125, + "step": 2378 + }, + { + "epoch": 0.3838334946757018, + "grad_norm": 6.2008376121521, + "learning_rate": 7.079883446006535e-05, + "loss": 2.0045, + "step": 2379 + }, + { + "epoch": 0.3839948370442078, + "grad_norm": 6.701147079467773, + "learning_rate": 7.077507175212183e-05, + "loss": 1.8485, + "step": 2380 + }, + { + "epoch": 0.3841561794127138, + "grad_norm": 3.891029119491577, + "learning_rate": 7.075130337128884e-05, + "loss": 2.1758, + "step": 2381 + }, + { + "epoch": 0.38431752178121975, + "grad_norm": 4.597936153411865, + "learning_rate": 7.07275293240566e-05, + "loss": 2.1513, + "step": 2382 + }, + { + "epoch": 0.3844788641497257, + "grad_norm": 4.322336196899414, + "learning_rate": 7.07037496169169e-05, + "loss": 1.8242, + "step": 2383 + }, + { + "epoch": 0.38464020651823166, + "grad_norm": 5.00579309463501, + "learning_rate": 7.067996425636308e-05, + "loss": 1.8396, + "step": 2384 + }, + { + "epoch": 0.3848015488867377, + "grad_norm": 5.800734519958496, + "learning_rate": 7.065617324889006e-05, + "loss": 1.9924, + "step": 2385 + }, + { + "epoch": 0.38496289125524363, + "grad_norm": 4.476178169250488, + "learning_rate": 7.063237660099422e-05, + "loss": 2.2004, + "step": 2386 + }, + { + "epoch": 0.3851242336237496, + "grad_norm": 6.143631458282471, + "learning_rate": 7.060857431917358e-05, + "loss": 1.8849, + "step": 2387 + }, + { + "epoch": 0.38528557599225555, + "grad_norm": 4.4568328857421875, + "learning_rate": 7.058476640992759e-05, + "loss": 1.9289, + "step": 2388 + }, + { + "epoch": 0.38544691836076156, + "grad_norm": 4.568746089935303, + "learning_rate": 7.056095287975733e-05, + "loss": 2.0947, + "step": 2389 + }, + { + "epoch": 0.3856082607292675, + "grad_norm": 3.585322380065918, + "learning_rate": 7.053713373516538e-05, + "loss": 1.8935, + "step": 2390 + }, + { + "epoch": 0.3857696030977735, + "grad_norm": 3.811159610748291, + "learning_rate": 7.051330898265582e-05, + "loss": 1.9406, + "step": 2391 + }, + { + "epoch": 0.38593094546627943, + "grad_norm": 4.124796390533447, + "learning_rate": 7.048947862873434e-05, + "loss": 1.9188, + "step": 2392 + }, + { + "epoch": 0.38609228783478544, + "grad_norm": 4.1473846435546875, + "learning_rate": 7.046564267990807e-05, + "loss": 1.856, + "step": 2393 + }, + { + "epoch": 0.3862536302032914, + "grad_norm": 4.339417934417725, + "learning_rate": 7.044180114268572e-05, + "loss": 2.0046, + "step": 2394 + }, + { + "epoch": 0.38641497257179735, + "grad_norm": 5.697010040283203, + "learning_rate": 7.041795402357753e-05, + "loss": 2.0657, + "step": 2395 + }, + { + "epoch": 0.3865763149403033, + "grad_norm": 4.324869155883789, + "learning_rate": 7.039410132909524e-05, + "loss": 2.2903, + "step": 2396 + }, + { + "epoch": 0.38673765730880927, + "grad_norm": 3.6093547344207764, + "learning_rate": 7.037024306575212e-05, + "loss": 1.9665, + "step": 2397 + }, + { + "epoch": 0.3868989996773153, + "grad_norm": 3.4312937259674072, + "learning_rate": 7.034637924006297e-05, + "loss": 1.7842, + "step": 2398 + }, + { + "epoch": 0.38706034204582124, + "grad_norm": 4.827066898345947, + "learning_rate": 7.032250985854409e-05, + "loss": 2.0614, + "step": 2399 + }, + { + "epoch": 0.3872216844143272, + "grad_norm": 3.1605043411254883, + "learning_rate": 7.029863492771332e-05, + "loss": 1.7812, + "step": 2400 + }, + { + "epoch": 0.38738302678283315, + "grad_norm": 5.1260294914245605, + "learning_rate": 7.027475445409e-05, + "loss": 2.0561, + "step": 2401 + }, + { + "epoch": 0.38754436915133916, + "grad_norm": 4.700570106506348, + "learning_rate": 7.025086844419499e-05, + "loss": 1.9542, + "step": 2402 + }, + { + "epoch": 0.3877057115198451, + "grad_norm": 4.195440769195557, + "learning_rate": 7.022697690455065e-05, + "loss": 2.1843, + "step": 2403 + }, + { + "epoch": 0.3878670538883511, + "grad_norm": 4.5467095375061035, + "learning_rate": 7.020307984168088e-05, + "loss": 1.8634, + "step": 2404 + }, + { + "epoch": 0.38802839625685703, + "grad_norm": 3.495610237121582, + "learning_rate": 7.017917726211106e-05, + "loss": 1.8406, + "step": 2405 + }, + { + "epoch": 0.38818973862536305, + "grad_norm": 4.1670708656311035, + "learning_rate": 7.015526917236806e-05, + "loss": 2.0324, + "step": 2406 + }, + { + "epoch": 0.388351080993869, + "grad_norm": 4.634245872497559, + "learning_rate": 7.013135557898032e-05, + "loss": 2.1647, + "step": 2407 + }, + { + "epoch": 0.38851242336237496, + "grad_norm": 4.569153785705566, + "learning_rate": 7.01074364884777e-05, + "loss": 1.957, + "step": 2408 + }, + { + "epoch": 0.3886737657308809, + "grad_norm": 3.983964681625366, + "learning_rate": 7.008351190739162e-05, + "loss": 2.2292, + "step": 2409 + }, + { + "epoch": 0.3888351080993869, + "grad_norm": 4.7621331214904785, + "learning_rate": 7.0059581842255e-05, + "loss": 2.4628, + "step": 2410 + }, + { + "epoch": 0.3889964504678929, + "grad_norm": 3.6143999099731445, + "learning_rate": 7.003564629960222e-05, + "loss": 2.0751, + "step": 2411 + }, + { + "epoch": 0.38915779283639884, + "grad_norm": 4.416424751281738, + "learning_rate": 7.001170528596917e-05, + "loss": 1.8244, + "step": 2412 + }, + { + "epoch": 0.3893191352049048, + "grad_norm": 3.624743700027466, + "learning_rate": 6.998775880789326e-05, + "loss": 2.0942, + "step": 2413 + }, + { + "epoch": 0.38948047757341075, + "grad_norm": 5.6762776374816895, + "learning_rate": 6.996380687191335e-05, + "loss": 2.1162, + "step": 2414 + }, + { + "epoch": 0.38964181994191677, + "grad_norm": 4.099328517913818, + "learning_rate": 6.993984948456981e-05, + "loss": 2.1135, + "step": 2415 + }, + { + "epoch": 0.3898031623104227, + "grad_norm": 4.025030136108398, + "learning_rate": 6.991588665240454e-05, + "loss": 1.9314, + "step": 2416 + }, + { + "epoch": 0.3899645046789287, + "grad_norm": 4.785827159881592, + "learning_rate": 6.989191838196082e-05, + "loss": 1.8687, + "step": 2417 + }, + { + "epoch": 0.39012584704743464, + "grad_norm": 4.294789791107178, + "learning_rate": 6.986794467978355e-05, + "loss": 1.9542, + "step": 2418 + }, + { + "epoch": 0.39028718941594065, + "grad_norm": 4.796219825744629, + "learning_rate": 6.984396555241899e-05, + "loss": 2.0095, + "step": 2419 + }, + { + "epoch": 0.3904485317844466, + "grad_norm": 4.830128192901611, + "learning_rate": 6.981998100641497e-05, + "loss": 2.1175, + "step": 2420 + }, + { + "epoch": 0.39060987415295256, + "grad_norm": 4.1153950691223145, + "learning_rate": 6.979599104832075e-05, + "loss": 1.9986, + "step": 2421 + }, + { + "epoch": 0.3907712165214585, + "grad_norm": 4.315544605255127, + "learning_rate": 6.977199568468709e-05, + "loss": 1.9064, + "step": 2422 + }, + { + "epoch": 0.39093255888996453, + "grad_norm": 3.3176615238189697, + "learning_rate": 6.974799492206622e-05, + "loss": 1.8931, + "step": 2423 + }, + { + "epoch": 0.3910939012584705, + "grad_norm": 4.308636665344238, + "learning_rate": 6.972398876701187e-05, + "loss": 2.1158, + "step": 2424 + }, + { + "epoch": 0.39125524362697645, + "grad_norm": 5.303024768829346, + "learning_rate": 6.969997722607916e-05, + "loss": 2.2496, + "step": 2425 + }, + { + "epoch": 0.3914165859954824, + "grad_norm": 4.133060455322266, + "learning_rate": 6.967596030582478e-05, + "loss": 2.0026, + "step": 2426 + }, + { + "epoch": 0.39157792836398836, + "grad_norm": 4.185387134552002, + "learning_rate": 6.965193801280683e-05, + "loss": 2.1184, + "step": 2427 + }, + { + "epoch": 0.39173927073249437, + "grad_norm": 3.4608633518218994, + "learning_rate": 6.96279103535849e-05, + "loss": 1.954, + "step": 2428 + }, + { + "epoch": 0.39190061310100033, + "grad_norm": 4.845871448516846, + "learning_rate": 6.960387733472003e-05, + "loss": 1.9661, + "step": 2429 + }, + { + "epoch": 0.3920619554695063, + "grad_norm": 4.5562286376953125, + "learning_rate": 6.957983896277473e-05, + "loss": 1.9637, + "step": 2430 + }, + { + "epoch": 0.39222329783801224, + "grad_norm": 4.882256031036377, + "learning_rate": 6.9555795244313e-05, + "loss": 2.403, + "step": 2431 + }, + { + "epoch": 0.39238464020651825, + "grad_norm": 4.715920925140381, + "learning_rate": 6.953174618590026e-05, + "loss": 1.6436, + "step": 2432 + }, + { + "epoch": 0.3925459825750242, + "grad_norm": 4.414078235626221, + "learning_rate": 6.950769179410336e-05, + "loss": 1.8671, + "step": 2433 + }, + { + "epoch": 0.39270732494353017, + "grad_norm": 4.121337890625, + "learning_rate": 6.948363207549073e-05, + "loss": 1.9667, + "step": 2434 + }, + { + "epoch": 0.3928686673120361, + "grad_norm": 4.242137432098389, + "learning_rate": 6.945956703663211e-05, + "loss": 1.8209, + "step": 2435 + }, + { + "epoch": 0.39303000968054214, + "grad_norm": 3.7549500465393066, + "learning_rate": 6.943549668409879e-05, + "loss": 1.8583, + "step": 2436 + }, + { + "epoch": 0.3931913520490481, + "grad_norm": 4.3018879890441895, + "learning_rate": 6.941142102446342e-05, + "loss": 2.0814, + "step": 2437 + }, + { + "epoch": 0.39335269441755405, + "grad_norm": 4.0712971687316895, + "learning_rate": 6.938734006430024e-05, + "loss": 1.7733, + "step": 2438 + }, + { + "epoch": 0.39351403678606, + "grad_norm": 4.466480255126953, + "learning_rate": 6.936325381018478e-05, + "loss": 1.6663, + "step": 2439 + }, + { + "epoch": 0.39367537915456596, + "grad_norm": 5.666343688964844, + "learning_rate": 6.933916226869414e-05, + "loss": 1.8364, + "step": 2440 + }, + { + "epoch": 0.393836721523072, + "grad_norm": 3.993345260620117, + "learning_rate": 6.931506544640677e-05, + "loss": 1.7569, + "step": 2441 + }, + { + "epoch": 0.39399806389157793, + "grad_norm": 4.941212177276611, + "learning_rate": 6.929096334990264e-05, + "loss": 2.1538, + "step": 2442 + }, + { + "epoch": 0.3941594062600839, + "grad_norm": 3.743314504623413, + "learning_rate": 6.92668559857631e-05, + "loss": 1.9386, + "step": 2443 + }, + { + "epoch": 0.39432074862858985, + "grad_norm": 3.9533352851867676, + "learning_rate": 6.924274336057099e-05, + "loss": 1.9302, + "step": 2444 + }, + { + "epoch": 0.39448209099709586, + "grad_norm": 3.6654293537139893, + "learning_rate": 6.921862548091051e-05, + "loss": 1.9921, + "step": 2445 + }, + { + "epoch": 0.3946434333656018, + "grad_norm": 5.150160789489746, + "learning_rate": 6.91945023533674e-05, + "loss": 2.0982, + "step": 2446 + }, + { + "epoch": 0.39480477573410777, + "grad_norm": 3.7324631214141846, + "learning_rate": 6.917037398452876e-05, + "loss": 1.7244, + "step": 2447 + }, + { + "epoch": 0.39496611810261373, + "grad_norm": 4.152327537536621, + "learning_rate": 6.914624038098312e-05, + "loss": 2.1647, + "step": 2448 + }, + { + "epoch": 0.39512746047111974, + "grad_norm": 4.865609645843506, + "learning_rate": 6.912210154932049e-05, + "loss": 1.8787, + "step": 2449 + }, + { + "epoch": 0.3952888028396257, + "grad_norm": 3.872842311859131, + "learning_rate": 6.909795749613223e-05, + "loss": 2.1031, + "step": 2450 + }, + { + "epoch": 0.39545014520813165, + "grad_norm": 5.2089056968688965, + "learning_rate": 6.90738082280112e-05, + "loss": 1.7995, + "step": 2451 + }, + { + "epoch": 0.3956114875766376, + "grad_norm": 4.466630935668945, + "learning_rate": 6.904965375155167e-05, + "loss": 2.015, + "step": 2452 + }, + { + "epoch": 0.39577282994514357, + "grad_norm": 3.957719326019287, + "learning_rate": 6.902549407334929e-05, + "loss": 2.003, + "step": 2453 + }, + { + "epoch": 0.3959341723136496, + "grad_norm": 3.738546848297119, + "learning_rate": 6.900132920000117e-05, + "loss": 1.8192, + "step": 2454 + }, + { + "epoch": 0.39609551468215554, + "grad_norm": 3.4601757526397705, + "learning_rate": 6.897715913810582e-05, + "loss": 2.0163, + "step": 2455 + }, + { + "epoch": 0.3962568570506615, + "grad_norm": 3.8311967849731445, + "learning_rate": 6.89529838942632e-05, + "loss": 1.9218, + "step": 2456 + }, + { + "epoch": 0.39641819941916745, + "grad_norm": 3.2115988731384277, + "learning_rate": 6.892880347507461e-05, + "loss": 1.9858, + "step": 2457 + }, + { + "epoch": 0.39657954178767346, + "grad_norm": 4.476034164428711, + "learning_rate": 6.890461788714286e-05, + "loss": 2.0909, + "step": 2458 + }, + { + "epoch": 0.3967408841561794, + "grad_norm": 4.0941033363342285, + "learning_rate": 6.88804271370721e-05, + "loss": 1.9763, + "step": 2459 + }, + { + "epoch": 0.3969022265246854, + "grad_norm": 4.4979987144470215, + "learning_rate": 6.88562312314679e-05, + "loss": 1.9758, + "step": 2460 + }, + { + "epoch": 0.39706356889319133, + "grad_norm": 5.366666793823242, + "learning_rate": 6.883203017693726e-05, + "loss": 2.2577, + "step": 2461 + }, + { + "epoch": 0.39722491126169734, + "grad_norm": 4.031561374664307, + "learning_rate": 6.880782398008862e-05, + "loss": 1.8966, + "step": 2462 + }, + { + "epoch": 0.3973862536302033, + "grad_norm": 5.680191993713379, + "learning_rate": 6.878361264753171e-05, + "loss": 2.2706, + "step": 2463 + }, + { + "epoch": 0.39754759599870926, + "grad_norm": 4.758927822113037, + "learning_rate": 6.875939618587779e-05, + "loss": 1.8676, + "step": 2464 + }, + { + "epoch": 0.3977089383672152, + "grad_norm": 6.526544094085693, + "learning_rate": 6.873517460173941e-05, + "loss": 2.2248, + "step": 2465 + }, + { + "epoch": 0.3978702807357212, + "grad_norm": 4.234228134155273, + "learning_rate": 6.87109479017306e-05, + "loss": 1.7944, + "step": 2466 + }, + { + "epoch": 0.3980316231042272, + "grad_norm": 4.572405815124512, + "learning_rate": 6.868671609246678e-05, + "loss": 1.7055, + "step": 2467 + }, + { + "epoch": 0.39819296547273314, + "grad_norm": 3.432278633117676, + "learning_rate": 6.866247918056471e-05, + "loss": 2.0137, + "step": 2468 + }, + { + "epoch": 0.3983543078412391, + "grad_norm": 5.302343845367432, + "learning_rate": 6.86382371726426e-05, + "loss": 2.0285, + "step": 2469 + }, + { + "epoch": 0.39851565020974505, + "grad_norm": 4.160001277923584, + "learning_rate": 6.861399007532002e-05, + "loss": 2.3, + "step": 2470 + }, + { + "epoch": 0.39867699257825107, + "grad_norm": 4.059791088104248, + "learning_rate": 6.858973789521793e-05, + "loss": 2.1066, + "step": 2471 + }, + { + "epoch": 0.398838334946757, + "grad_norm": 4.373169422149658, + "learning_rate": 6.85654806389587e-05, + "loss": 2.0145, + "step": 2472 + }, + { + "epoch": 0.398999677315263, + "grad_norm": 3.703003168106079, + "learning_rate": 6.854121831316607e-05, + "loss": 1.822, + "step": 2473 + }, + { + "epoch": 0.39916101968376894, + "grad_norm": 3.666779041290283, + "learning_rate": 6.851695092446517e-05, + "loss": 1.941, + "step": 2474 + }, + { + "epoch": 0.39932236205227495, + "grad_norm": 5.37600564956665, + "learning_rate": 6.84926784794825e-05, + "loss": 2.2424, + "step": 2475 + }, + { + "epoch": 0.3994837044207809, + "grad_norm": 3.541210651397705, + "learning_rate": 6.846840098484596e-05, + "loss": 1.9848, + "step": 2476 + }, + { + "epoch": 0.39964504678928686, + "grad_norm": 3.728093385696411, + "learning_rate": 6.844411844718481e-05, + "loss": 2.1511, + "step": 2477 + }, + { + "epoch": 0.3998063891577928, + "grad_norm": 5.245541572570801, + "learning_rate": 6.841983087312971e-05, + "loss": 1.7297, + "step": 2478 + }, + { + "epoch": 0.39996773152629883, + "grad_norm": 4.711781024932861, + "learning_rate": 6.839553826931267e-05, + "loss": 1.9193, + "step": 2479 + }, + { + "epoch": 0.4001290738948048, + "grad_norm": 3.592806100845337, + "learning_rate": 6.837124064236709e-05, + "loss": 1.9462, + "step": 2480 + }, + { + "epoch": 0.40029041626331074, + "grad_norm": 7.024892807006836, + "learning_rate": 6.834693799892773e-05, + "loss": 1.9904, + "step": 2481 + }, + { + "epoch": 0.4004517586318167, + "grad_norm": 4.028841018676758, + "learning_rate": 6.832263034563073e-05, + "loss": 1.914, + "step": 2482 + }, + { + "epoch": 0.40061310100032266, + "grad_norm": 3.887765645980835, + "learning_rate": 6.829831768911361e-05, + "loss": 1.9009, + "step": 2483 + }, + { + "epoch": 0.40077444336882867, + "grad_norm": 5.264831066131592, + "learning_rate": 6.827400003601522e-05, + "loss": 1.9565, + "step": 2484 + }, + { + "epoch": 0.4009357857373346, + "grad_norm": 6.523907661437988, + "learning_rate": 6.82496773929758e-05, + "loss": 2.0184, + "step": 2485 + }, + { + "epoch": 0.4010971281058406, + "grad_norm": 4.7822160720825195, + "learning_rate": 6.822534976663695e-05, + "loss": 1.7987, + "step": 2486 + }, + { + "epoch": 0.40125847047434654, + "grad_norm": 3.6801819801330566, + "learning_rate": 6.820101716364162e-05, + "loss": 2.1259, + "step": 2487 + }, + { + "epoch": 0.40141981284285255, + "grad_norm": 4.366987228393555, + "learning_rate": 6.817667959063414e-05, + "loss": 2.0377, + "step": 2488 + }, + { + "epoch": 0.4015811552113585, + "grad_norm": 4.573003768920898, + "learning_rate": 6.815233705426019e-05, + "loss": 1.8376, + "step": 2489 + }, + { + "epoch": 0.40174249757986447, + "grad_norm": 3.706190347671509, + "learning_rate": 6.812798956116677e-05, + "loss": 1.9558, + "step": 2490 + }, + { + "epoch": 0.4019038399483704, + "grad_norm": 4.338067054748535, + "learning_rate": 6.81036371180023e-05, + "loss": 1.9312, + "step": 2491 + }, + { + "epoch": 0.40206518231687643, + "grad_norm": 5.0187907218933105, + "learning_rate": 6.807927973141651e-05, + "loss": 2.0904, + "step": 2492 + }, + { + "epoch": 0.4022265246853824, + "grad_norm": 4.582776069641113, + "learning_rate": 6.805491740806043e-05, + "loss": 1.9393, + "step": 2493 + }, + { + "epoch": 0.40238786705388835, + "grad_norm": 3.3647243976593018, + "learning_rate": 6.803055015458656e-05, + "loss": 2.0611, + "step": 2494 + }, + { + "epoch": 0.4025492094223943, + "grad_norm": 5.384952068328857, + "learning_rate": 6.800617797764865e-05, + "loss": 2.0093, + "step": 2495 + }, + { + "epoch": 0.4027105517909003, + "grad_norm": 4.835544109344482, + "learning_rate": 6.798180088390183e-05, + "loss": 1.7988, + "step": 2496 + }, + { + "epoch": 0.4028718941594063, + "grad_norm": 3.5792434215545654, + "learning_rate": 6.795741888000256e-05, + "loss": 1.949, + "step": 2497 + }, + { + "epoch": 0.40303323652791223, + "grad_norm": 3.372053861618042, + "learning_rate": 6.793303197260864e-05, + "loss": 1.8905, + "step": 2498 + }, + { + "epoch": 0.4031945788964182, + "grad_norm": 4.386721134185791, + "learning_rate": 6.790864016837923e-05, + "loss": 1.9178, + "step": 2499 + }, + { + "epoch": 0.40335592126492414, + "grad_norm": 3.8055319786071777, + "learning_rate": 6.788424347397482e-05, + "loss": 1.9185, + "step": 2500 + }, + { + "epoch": 0.40351726363343016, + "grad_norm": 4.16497278213501, + "learning_rate": 6.785984189605721e-05, + "loss": 2.0803, + "step": 2501 + }, + { + "epoch": 0.4036786060019361, + "grad_norm": 4.593762397766113, + "learning_rate": 6.783543544128957e-05, + "loss": 1.9773, + "step": 2502 + }, + { + "epoch": 0.40383994837044207, + "grad_norm": 4.823954105377197, + "learning_rate": 6.781102411633635e-05, + "loss": 2.1822, + "step": 2503 + }, + { + "epoch": 0.404001290738948, + "grad_norm": 4.050856590270996, + "learning_rate": 6.77866079278634e-05, + "loss": 2.1946, + "step": 2504 + }, + { + "epoch": 0.40416263310745404, + "grad_norm": 5.379467487335205, + "learning_rate": 6.776218688253784e-05, + "loss": 2.0267, + "step": 2505 + }, + { + "epoch": 0.40432397547596, + "grad_norm": 5.673772811889648, + "learning_rate": 6.773776098702816e-05, + "loss": 2.0904, + "step": 2506 + }, + { + "epoch": 0.40448531784446595, + "grad_norm": 3.2573928833007812, + "learning_rate": 6.771333024800411e-05, + "loss": 1.9578, + "step": 2507 + }, + { + "epoch": 0.4046466602129719, + "grad_norm": 3.8524608612060547, + "learning_rate": 6.768889467213684e-05, + "loss": 1.8406, + "step": 2508 + }, + { + "epoch": 0.4048080025814779, + "grad_norm": 4.494475841522217, + "learning_rate": 6.766445426609877e-05, + "loss": 1.7922, + "step": 2509 + }, + { + "epoch": 0.4049693449499839, + "grad_norm": 4.574101448059082, + "learning_rate": 6.764000903656366e-05, + "loss": 1.7937, + "step": 2510 + }, + { + "epoch": 0.40513068731848983, + "grad_norm": 4.136654853820801, + "learning_rate": 6.76155589902066e-05, + "loss": 1.8986, + "step": 2511 + }, + { + "epoch": 0.4052920296869958, + "grad_norm": 3.4168479442596436, + "learning_rate": 6.759110413370395e-05, + "loss": 2.0704, + "step": 2512 + }, + { + "epoch": 0.40545337205550175, + "grad_norm": 4.0800557136535645, + "learning_rate": 6.756664447373344e-05, + "loss": 2.0706, + "step": 2513 + }, + { + "epoch": 0.40561471442400776, + "grad_norm": 4.138115882873535, + "learning_rate": 6.754218001697402e-05, + "loss": 2.1694, + "step": 2514 + }, + { + "epoch": 0.4057760567925137, + "grad_norm": 5.320624828338623, + "learning_rate": 6.751771077010607e-05, + "loss": 1.8025, + "step": 2515 + }, + { + "epoch": 0.4059373991610197, + "grad_norm": 4.6533379554748535, + "learning_rate": 6.74932367398112e-05, + "loss": 1.9051, + "step": 2516 + }, + { + "epoch": 0.40609874152952563, + "grad_norm": 3.4974732398986816, + "learning_rate": 6.746875793277233e-05, + "loss": 1.7668, + "step": 2517 + }, + { + "epoch": 0.40626008389803164, + "grad_norm": 4.321040153503418, + "learning_rate": 6.744427435567373e-05, + "loss": 1.7968, + "step": 2518 + }, + { + "epoch": 0.4064214262665376, + "grad_norm": 5.548377990722656, + "learning_rate": 6.741978601520092e-05, + "loss": 1.7595, + "step": 2519 + }, + { + "epoch": 0.40658276863504356, + "grad_norm": 4.565524578094482, + "learning_rate": 6.739529291804076e-05, + "loss": 2.0442, + "step": 2520 + }, + { + "epoch": 0.4067441110035495, + "grad_norm": 3.937290668487549, + "learning_rate": 6.737079507088139e-05, + "loss": 1.8561, + "step": 2521 + }, + { + "epoch": 0.4069054533720555, + "grad_norm": 4.6694183349609375, + "learning_rate": 6.734629248041226e-05, + "loss": 2.262, + "step": 2522 + }, + { + "epoch": 0.4070667957405615, + "grad_norm": 4.72750186920166, + "learning_rate": 6.732178515332406e-05, + "loss": 2.0002, + "step": 2523 + }, + { + "epoch": 0.40722813810906744, + "grad_norm": 4.05148458480835, + "learning_rate": 6.729727309630885e-05, + "loss": 2.0543, + "step": 2524 + }, + { + "epoch": 0.4073894804775734, + "grad_norm": 4.776993274688721, + "learning_rate": 6.727275631605995e-05, + "loss": 1.8568, + "step": 2525 + }, + { + "epoch": 0.4075508228460794, + "grad_norm": 5.503616809844971, + "learning_rate": 6.724823481927198e-05, + "loss": 1.936, + "step": 2526 + }, + { + "epoch": 0.40771216521458536, + "grad_norm": 6.97620964050293, + "learning_rate": 6.72237086126408e-05, + "loss": 1.9444, + "step": 2527 + }, + { + "epoch": 0.4078735075830913, + "grad_norm": 5.984745025634766, + "learning_rate": 6.719917770286362e-05, + "loss": 2.0345, + "step": 2528 + }, + { + "epoch": 0.4080348499515973, + "grad_norm": 4.583484172821045, + "learning_rate": 6.717464209663891e-05, + "loss": 1.9219, + "step": 2529 + }, + { + "epoch": 0.40819619232010323, + "grad_norm": 3.649150848388672, + "learning_rate": 6.715010180066641e-05, + "loss": 1.9231, + "step": 2530 + }, + { + "epoch": 0.40835753468860925, + "grad_norm": 4.01031494140625, + "learning_rate": 6.712555682164715e-05, + "loss": 2.0998, + "step": 2531 + }, + { + "epoch": 0.4085188770571152, + "grad_norm": 4.0240349769592285, + "learning_rate": 6.710100716628344e-05, + "loss": 1.8933, + "step": 2532 + }, + { + "epoch": 0.40868021942562116, + "grad_norm": 3.484869956970215, + "learning_rate": 6.707645284127887e-05, + "loss": 1.8798, + "step": 2533 + }, + { + "epoch": 0.4088415617941271, + "grad_norm": 5.766565799713135, + "learning_rate": 6.70518938533383e-05, + "loss": 1.8677, + "step": 2534 + }, + { + "epoch": 0.40900290416263313, + "grad_norm": 5.190849781036377, + "learning_rate": 6.702733020916786e-05, + "loss": 2.1769, + "step": 2535 + }, + { + "epoch": 0.4091642465311391, + "grad_norm": 3.1495325565338135, + "learning_rate": 6.700276191547496e-05, + "loss": 1.9317, + "step": 2536 + }, + { + "epoch": 0.40932558889964504, + "grad_norm": 3.671961545944214, + "learning_rate": 6.697818897896828e-05, + "loss": 2.2158, + "step": 2537 + }, + { + "epoch": 0.409486931268151, + "grad_norm": 4.503951549530029, + "learning_rate": 6.695361140635776e-05, + "loss": 1.8565, + "step": 2538 + }, + { + "epoch": 0.409648273636657, + "grad_norm": 3.85292649269104, + "learning_rate": 6.69290292043546e-05, + "loss": 1.5461, + "step": 2539 + }, + { + "epoch": 0.40980961600516297, + "grad_norm": 6.35936975479126, + "learning_rate": 6.690444237967129e-05, + "loss": 1.8699, + "step": 2540 + }, + { + "epoch": 0.4099709583736689, + "grad_norm": 4.711306095123291, + "learning_rate": 6.687985093902155e-05, + "loss": 1.9941, + "step": 2541 + }, + { + "epoch": 0.4101323007421749, + "grad_norm": 3.86368989944458, + "learning_rate": 6.685525488912037e-05, + "loss": 1.7849, + "step": 2542 + }, + { + "epoch": 0.41029364311068084, + "grad_norm": 5.367621421813965, + "learning_rate": 6.683065423668403e-05, + "loss": 1.8176, + "step": 2543 + }, + { + "epoch": 0.41045498547918685, + "grad_norm": 5.0536208152771, + "learning_rate": 6.680604898843002e-05, + "loss": 1.9325, + "step": 2544 + }, + { + "epoch": 0.4106163278476928, + "grad_norm": 4.238080024719238, + "learning_rate": 6.678143915107713e-05, + "loss": 2.1431, + "step": 2545 + }, + { + "epoch": 0.41077767021619876, + "grad_norm": 4.902920246124268, + "learning_rate": 6.675682473134536e-05, + "loss": 1.8925, + "step": 2546 + }, + { + "epoch": 0.4109390125847047, + "grad_norm": 3.542854070663452, + "learning_rate": 6.673220573595598e-05, + "loss": 1.9062, + "step": 2547 + }, + { + "epoch": 0.41110035495321073, + "grad_norm": 4.582688331604004, + "learning_rate": 6.670758217163151e-05, + "loss": 2.0112, + "step": 2548 + }, + { + "epoch": 0.4112616973217167, + "grad_norm": 4.175585746765137, + "learning_rate": 6.668295404509574e-05, + "loss": 2.021, + "step": 2549 + }, + { + "epoch": 0.41142303969022265, + "grad_norm": 4.21159553527832, + "learning_rate": 6.665832136307366e-05, + "loss": 1.8985, + "step": 2550 + }, + { + "epoch": 0.4115843820587286, + "grad_norm": 4.019961357116699, + "learning_rate": 6.663368413229155e-05, + "loss": 1.855, + "step": 2551 + }, + { + "epoch": 0.4117457244272346, + "grad_norm": 3.089282989501953, + "learning_rate": 6.660904235947687e-05, + "loss": 1.7267, + "step": 2552 + }, + { + "epoch": 0.4119070667957406, + "grad_norm": 4.0074663162231445, + "learning_rate": 6.65843960513584e-05, + "loss": 1.857, + "step": 2553 + }, + { + "epoch": 0.41206840916424653, + "grad_norm": 3.842273235321045, + "learning_rate": 6.655974521466608e-05, + "loss": 2.0801, + "step": 2554 + }, + { + "epoch": 0.4122297515327525, + "grad_norm": 3.413011312484741, + "learning_rate": 6.653508985613117e-05, + "loss": 1.8709, + "step": 2555 + }, + { + "epoch": 0.41239109390125844, + "grad_norm": 3.932624340057373, + "learning_rate": 6.651042998248608e-05, + "loss": 1.8626, + "step": 2556 + }, + { + "epoch": 0.41255243626976446, + "grad_norm": 5.163025856018066, + "learning_rate": 6.648576560046452e-05, + "loss": 2.1256, + "step": 2557 + }, + { + "epoch": 0.4127137786382704, + "grad_norm": 5.4284186363220215, + "learning_rate": 6.64610967168014e-05, + "loss": 1.775, + "step": 2558 + }, + { + "epoch": 0.41287512100677637, + "grad_norm": 4.206260681152344, + "learning_rate": 6.643642333823286e-05, + "loss": 1.9807, + "step": 2559 + }, + { + "epoch": 0.4130364633752823, + "grad_norm": 3.877509593963623, + "learning_rate": 6.641174547149624e-05, + "loss": 1.7632, + "step": 2560 + }, + { + "epoch": 0.41319780574378834, + "grad_norm": 4.569438934326172, + "learning_rate": 6.638706312333018e-05, + "loss": 2.0044, + "step": 2561 + }, + { + "epoch": 0.4133591481122943, + "grad_norm": 4.021951198577881, + "learning_rate": 6.636237630047448e-05, + "loss": 1.8816, + "step": 2562 + }, + { + "epoch": 0.41352049048080025, + "grad_norm": 4.502031326293945, + "learning_rate": 6.633768500967019e-05, + "loss": 1.9982, + "step": 2563 + }, + { + "epoch": 0.4136818328493062, + "grad_norm": 5.826133728027344, + "learning_rate": 6.631298925765955e-05, + "loss": 2.087, + "step": 2564 + }, + { + "epoch": 0.4138431752178122, + "grad_norm": 3.8425328731536865, + "learning_rate": 6.628828905118608e-05, + "loss": 1.9743, + "step": 2565 + }, + { + "epoch": 0.4140045175863182, + "grad_norm": 4.0037760734558105, + "learning_rate": 6.626358439699442e-05, + "loss": 2.1166, + "step": 2566 + }, + { + "epoch": 0.41416585995482413, + "grad_norm": 3.5549118518829346, + "learning_rate": 6.623887530183051e-05, + "loss": 2.0896, + "step": 2567 + }, + { + "epoch": 0.4143272023233301, + "grad_norm": 5.458221435546875, + "learning_rate": 6.621416177244148e-05, + "loss": 1.9489, + "step": 2568 + }, + { + "epoch": 0.4144885446918361, + "grad_norm": 5.151551246643066, + "learning_rate": 6.618944381557568e-05, + "loss": 1.8281, + "step": 2569 + }, + { + "epoch": 0.41464988706034206, + "grad_norm": 3.459108829498291, + "learning_rate": 6.616472143798261e-05, + "loss": 1.8917, + "step": 2570 + }, + { + "epoch": 0.414811229428848, + "grad_norm": 3.567786693572998, + "learning_rate": 6.613999464641304e-05, + "loss": 1.9488, + "step": 2571 + }, + { + "epoch": 0.414972571797354, + "grad_norm": 6.768561840057373, + "learning_rate": 6.611526344761893e-05, + "loss": 2.0894, + "step": 2572 + }, + { + "epoch": 0.41513391416585993, + "grad_norm": 4.484906196594238, + "learning_rate": 6.609052784835342e-05, + "loss": 1.8344, + "step": 2573 + }, + { + "epoch": 0.41529525653436594, + "grad_norm": 3.241314649581909, + "learning_rate": 6.60657878553709e-05, + "loss": 1.8785, + "step": 2574 + }, + { + "epoch": 0.4154565989028719, + "grad_norm": 4.768258094787598, + "learning_rate": 6.604104347542693e-05, + "loss": 2.0155, + "step": 2575 + }, + { + "epoch": 0.41561794127137786, + "grad_norm": 4.974591255187988, + "learning_rate": 6.601629471527822e-05, + "loss": 1.9884, + "step": 2576 + }, + { + "epoch": 0.4157792836398838, + "grad_norm": 4.00331974029541, + "learning_rate": 6.599154158168278e-05, + "loss": 1.8776, + "step": 2577 + }, + { + "epoch": 0.4159406260083898, + "grad_norm": 3.2258119583129883, + "learning_rate": 6.596678408139973e-05, + "loss": 1.9786, + "step": 2578 + }, + { + "epoch": 0.4161019683768958, + "grad_norm": 4.530983924865723, + "learning_rate": 6.594202222118942e-05, + "loss": 2.0937, + "step": 2579 + }, + { + "epoch": 0.41626331074540174, + "grad_norm": 3.984894275665283, + "learning_rate": 6.591725600781336e-05, + "loss": 1.9892, + "step": 2580 + }, + { + "epoch": 0.4164246531139077, + "grad_norm": 3.5328073501586914, + "learning_rate": 6.589248544803431e-05, + "loss": 2.114, + "step": 2581 + }, + { + "epoch": 0.4165859954824137, + "grad_norm": 4.023244857788086, + "learning_rate": 6.586771054861613e-05, + "loss": 1.9073, + "step": 2582 + }, + { + "epoch": 0.41674733785091966, + "grad_norm": 4.235401153564453, + "learning_rate": 6.584293131632396e-05, + "loss": 1.9853, + "step": 2583 + }, + { + "epoch": 0.4169086802194256, + "grad_norm": 3.4555716514587402, + "learning_rate": 6.581814775792403e-05, + "loss": 1.9779, + "step": 2584 + }, + { + "epoch": 0.4170700225879316, + "grad_norm": 4.903307914733887, + "learning_rate": 6.579335988018383e-05, + "loss": 1.9086, + "step": 2585 + }, + { + "epoch": 0.41723136495643753, + "grad_norm": 4.533055782318115, + "learning_rate": 6.576856768987197e-05, + "loss": 2.2401, + "step": 2586 + }, + { + "epoch": 0.41739270732494355, + "grad_norm": 5.497504234313965, + "learning_rate": 6.574377119375829e-05, + "loss": 2.0772, + "step": 2587 + }, + { + "epoch": 0.4175540496934495, + "grad_norm": 5.563016891479492, + "learning_rate": 6.571897039861377e-05, + "loss": 1.8796, + "step": 2588 + }, + { + "epoch": 0.41771539206195546, + "grad_norm": 5.664429664611816, + "learning_rate": 6.569416531121056e-05, + "loss": 1.6566, + "step": 2589 + }, + { + "epoch": 0.4178767344304614, + "grad_norm": 5.091784477233887, + "learning_rate": 6.5669355938322e-05, + "loss": 2.0411, + "step": 2590 + }, + { + "epoch": 0.41803807679896743, + "grad_norm": 3.325024366378784, + "learning_rate": 6.564454228672259e-05, + "loss": 1.9708, + "step": 2591 + }, + { + "epoch": 0.4181994191674734, + "grad_norm": 5.186642169952393, + "learning_rate": 6.561972436318801e-05, + "loss": 2.0746, + "step": 2592 + }, + { + "epoch": 0.41836076153597934, + "grad_norm": 5.5119524002075195, + "learning_rate": 6.559490217449513e-05, + "loss": 1.7696, + "step": 2593 + }, + { + "epoch": 0.4185221039044853, + "grad_norm": 5.138498306274414, + "learning_rate": 6.55700757274219e-05, + "loss": 2.1851, + "step": 2594 + }, + { + "epoch": 0.4186834462729913, + "grad_norm": 4.437428951263428, + "learning_rate": 6.554524502874752e-05, + "loss": 2.0936, + "step": 2595 + }, + { + "epoch": 0.41884478864149727, + "grad_norm": 2.9758529663085938, + "learning_rate": 6.55204100852523e-05, + "loss": 1.9382, + "step": 2596 + }, + { + "epoch": 0.4190061310100032, + "grad_norm": 3.4663944244384766, + "learning_rate": 6.549557090371776e-05, + "loss": 2.0537, + "step": 2597 + }, + { + "epoch": 0.4191674733785092, + "grad_norm": 4.565867900848389, + "learning_rate": 6.547072749092652e-05, + "loss": 1.965, + "step": 2598 + }, + { + "epoch": 0.4193288157470152, + "grad_norm": 3.739332914352417, + "learning_rate": 6.544587985366237e-05, + "loss": 2.1783, + "step": 2599 + }, + { + "epoch": 0.41949015811552115, + "grad_norm": 4.538175106048584, + "learning_rate": 6.54210279987103e-05, + "loss": 2.411, + "step": 2600 + }, + { + "epoch": 0.4196515004840271, + "grad_norm": 7.21381139755249, + "learning_rate": 6.539617193285639e-05, + "loss": 2.1074, + "step": 2601 + }, + { + "epoch": 0.41981284285253306, + "grad_norm": 5.1080193519592285, + "learning_rate": 6.537131166288789e-05, + "loss": 2.015, + "step": 2602 + }, + { + "epoch": 0.419974185221039, + "grad_norm": 4.051692962646484, + "learning_rate": 6.534644719559321e-05, + "loss": 1.9574, + "step": 2603 + }, + { + "epoch": 0.42013552758954503, + "grad_norm": 3.8842334747314453, + "learning_rate": 6.532157853776191e-05, + "loss": 1.9782, + "step": 2604 + }, + { + "epoch": 0.420296869958051, + "grad_norm": 3.123093843460083, + "learning_rate": 6.529670569618467e-05, + "loss": 1.9015, + "step": 2605 + }, + { + "epoch": 0.42045821232655695, + "grad_norm": 5.435220241546631, + "learning_rate": 6.527182867765332e-05, + "loss": 2.1036, + "step": 2606 + }, + { + "epoch": 0.4206195546950629, + "grad_norm": 3.8733108043670654, + "learning_rate": 6.524694748896086e-05, + "loss": 1.9209, + "step": 2607 + }, + { + "epoch": 0.4207808970635689, + "grad_norm": 3.982532024383545, + "learning_rate": 6.522206213690141e-05, + "loss": 1.9799, + "step": 2608 + }, + { + "epoch": 0.42094223943207487, + "grad_norm": 4.876627445220947, + "learning_rate": 6.519717262827018e-05, + "loss": 1.9781, + "step": 2609 + }, + { + "epoch": 0.42110358180058083, + "grad_norm": 4.795150279998779, + "learning_rate": 6.517227896986359e-05, + "loss": 1.9817, + "step": 2610 + }, + { + "epoch": 0.4212649241690868, + "grad_norm": 4.908081531524658, + "learning_rate": 6.514738116847915e-05, + "loss": 1.6965, + "step": 2611 + }, + { + "epoch": 0.4214262665375928, + "grad_norm": 5.139437198638916, + "learning_rate": 6.512247923091552e-05, + "loss": 2.0079, + "step": 2612 + }, + { + "epoch": 0.42158760890609875, + "grad_norm": 5.754101276397705, + "learning_rate": 6.509757316397248e-05, + "loss": 1.9483, + "step": 2613 + }, + { + "epoch": 0.4217489512746047, + "grad_norm": 3.195584297180176, + "learning_rate": 6.507266297445092e-05, + "loss": 1.8382, + "step": 2614 + }, + { + "epoch": 0.42191029364311067, + "grad_norm": 5.019852638244629, + "learning_rate": 6.50477486691529e-05, + "loss": 2.0303, + "step": 2615 + }, + { + "epoch": 0.4220716360116166, + "grad_norm": 3.8010637760162354, + "learning_rate": 6.502283025488157e-05, + "loss": 2.1413, + "step": 2616 + }, + { + "epoch": 0.42223297838012264, + "grad_norm": 4.250025749206543, + "learning_rate": 6.49979077384412e-05, + "loss": 2.0895, + "step": 2617 + }, + { + "epoch": 0.4223943207486286, + "grad_norm": 4.784327030181885, + "learning_rate": 6.497298112663721e-05, + "loss": 1.9401, + "step": 2618 + }, + { + "epoch": 0.42255566311713455, + "grad_norm": 5.285989761352539, + "learning_rate": 6.49480504262761e-05, + "loss": 2.1307, + "step": 2619 + }, + { + "epoch": 0.4227170054856405, + "grad_norm": 3.9630815982818604, + "learning_rate": 6.49231156441655e-05, + "loss": 2.1602, + "step": 2620 + }, + { + "epoch": 0.4228783478541465, + "grad_norm": 4.505052089691162, + "learning_rate": 6.489817678711418e-05, + "loss": 2.1739, + "step": 2621 + }, + { + "epoch": 0.4230396902226525, + "grad_norm": 4.205039024353027, + "learning_rate": 6.487323386193199e-05, + "loss": 2.0078, + "step": 2622 + }, + { + "epoch": 0.42320103259115843, + "grad_norm": 4.105075359344482, + "learning_rate": 6.48482868754299e-05, + "loss": 2.0093, + "step": 2623 + }, + { + "epoch": 0.4233623749596644, + "grad_norm": 5.950715065002441, + "learning_rate": 6.482333583442002e-05, + "loss": 2.0699, + "step": 2624 + }, + { + "epoch": 0.4235237173281704, + "grad_norm": 3.457017183303833, + "learning_rate": 6.479838074571551e-05, + "loss": 2.0035, + "step": 2625 + }, + { + "epoch": 0.42368505969667636, + "grad_norm": 3.98242449760437, + "learning_rate": 6.477342161613068e-05, + "loss": 2.0604, + "step": 2626 + }, + { + "epoch": 0.4238464020651823, + "grad_norm": 4.566737651824951, + "learning_rate": 6.47484584524809e-05, + "loss": 2.2198, + "step": 2627 + }, + { + "epoch": 0.42400774443368827, + "grad_norm": 3.8550214767456055, + "learning_rate": 6.472349126158272e-05, + "loss": 1.9878, + "step": 2628 + }, + { + "epoch": 0.4241690868021943, + "grad_norm": 4.4448747634887695, + "learning_rate": 6.46985200502537e-05, + "loss": 1.8718, + "step": 2629 + }, + { + "epoch": 0.42433042917070024, + "grad_norm": 3.811269998550415, + "learning_rate": 6.467354482531253e-05, + "loss": 1.9142, + "step": 2630 + }, + { + "epoch": 0.4244917715392062, + "grad_norm": 4.5046796798706055, + "learning_rate": 6.464856559357903e-05, + "loss": 1.9578, + "step": 2631 + }, + { + "epoch": 0.42465311390771215, + "grad_norm": 5.346692085266113, + "learning_rate": 6.462358236187409e-05, + "loss": 2.0217, + "step": 2632 + }, + { + "epoch": 0.4248144562762181, + "grad_norm": 4.623473167419434, + "learning_rate": 6.459859513701967e-05, + "loss": 2.1269, + "step": 2633 + }, + { + "epoch": 0.4249757986447241, + "grad_norm": 3.123763084411621, + "learning_rate": 6.457360392583884e-05, + "loss": 1.9277, + "step": 2634 + }, + { + "epoch": 0.4251371410132301, + "grad_norm": 4.0693769454956055, + "learning_rate": 6.454860873515577e-05, + "loss": 1.9822, + "step": 2635 + }, + { + "epoch": 0.42529848338173604, + "grad_norm": 3.4166367053985596, + "learning_rate": 6.45236095717957e-05, + "loss": 1.9156, + "step": 2636 + }, + { + "epoch": 0.425459825750242, + "grad_norm": 3.774885416030884, + "learning_rate": 6.449860644258497e-05, + "loss": 1.8419, + "step": 2637 + }, + { + "epoch": 0.425621168118748, + "grad_norm": 3.373805046081543, + "learning_rate": 6.447359935435097e-05, + "loss": 1.9435, + "step": 2638 + }, + { + "epoch": 0.42578251048725396, + "grad_norm": 3.160585880279541, + "learning_rate": 6.444858831392223e-05, + "loss": 2.094, + "step": 2639 + }, + { + "epoch": 0.4259438528557599, + "grad_norm": 3.7261202335357666, + "learning_rate": 6.442357332812828e-05, + "loss": 1.786, + "step": 2640 + }, + { + "epoch": 0.4261051952242659, + "grad_norm": 5.06633186340332, + "learning_rate": 6.439855440379978e-05, + "loss": 1.9268, + "step": 2641 + }, + { + "epoch": 0.4262665375927719, + "grad_norm": 4.561039447784424, + "learning_rate": 6.437353154776849e-05, + "loss": 2.221, + "step": 2642 + }, + { + "epoch": 0.42642787996127784, + "grad_norm": 5.25416898727417, + "learning_rate": 6.434850476686715e-05, + "loss": 2.082, + "step": 2643 + }, + { + "epoch": 0.4265892223297838, + "grad_norm": 4.456058979034424, + "learning_rate": 6.43234740679297e-05, + "loss": 1.682, + "step": 2644 + }, + { + "epoch": 0.42675056469828976, + "grad_norm": 4.3220744132995605, + "learning_rate": 6.429843945779104e-05, + "loss": 2.0635, + "step": 2645 + }, + { + "epoch": 0.4269119070667957, + "grad_norm": 4.297271251678467, + "learning_rate": 6.427340094328718e-05, + "loss": 1.9167, + "step": 2646 + }, + { + "epoch": 0.4270732494353017, + "grad_norm": 4.471283435821533, + "learning_rate": 6.424835853125521e-05, + "loss": 1.9748, + "step": 2647 + }, + { + "epoch": 0.4272345918038077, + "grad_norm": 4.177629470825195, + "learning_rate": 6.422331222853326e-05, + "loss": 2.0103, + "step": 2648 + }, + { + "epoch": 0.42739593417231364, + "grad_norm": 4.565670967102051, + "learning_rate": 6.419826204196052e-05, + "loss": 1.8253, + "step": 2649 + }, + { + "epoch": 0.4275572765408196, + "grad_norm": 3.8919923305511475, + "learning_rate": 6.417320797837727e-05, + "loss": 1.925, + "step": 2650 + }, + { + "epoch": 0.4277186189093256, + "grad_norm": 4.406559467315674, + "learning_rate": 6.414815004462483e-05, + "loss": 1.8349, + "step": 2651 + }, + { + "epoch": 0.42787996127783157, + "grad_norm": 3.911963939666748, + "learning_rate": 6.412308824754557e-05, + "loss": 1.7009, + "step": 2652 + }, + { + "epoch": 0.4280413036463375, + "grad_norm": 4.386362552642822, + "learning_rate": 6.409802259398293e-05, + "loss": 2.1729, + "step": 2653 + }, + { + "epoch": 0.4282026460148435, + "grad_norm": 4.710752964019775, + "learning_rate": 6.407295309078138e-05, + "loss": 2.0782, + "step": 2654 + }, + { + "epoch": 0.4283639883833495, + "grad_norm": 4.254208087921143, + "learning_rate": 6.404787974478649e-05, + "loss": 1.9382, + "step": 2655 + }, + { + "epoch": 0.42852533075185545, + "grad_norm": 4.716608047485352, + "learning_rate": 6.402280256284481e-05, + "loss": 2.0621, + "step": 2656 + }, + { + "epoch": 0.4286866731203614, + "grad_norm": 4.185356616973877, + "learning_rate": 6.3997721551804e-05, + "loss": 1.8992, + "step": 2657 + }, + { + "epoch": 0.42884801548886736, + "grad_norm": 4.865534782409668, + "learning_rate": 6.397263671851273e-05, + "loss": 2.0066, + "step": 2658 + }, + { + "epoch": 0.4290093578573733, + "grad_norm": 5.55830717086792, + "learning_rate": 6.39475480698207e-05, + "loss": 1.7723, + "step": 2659 + }, + { + "epoch": 0.42917070022587933, + "grad_norm": 3.3979928493499756, + "learning_rate": 6.392245561257871e-05, + "loss": 1.9591, + "step": 2660 + }, + { + "epoch": 0.4293320425943853, + "grad_norm": 7.676721096038818, + "learning_rate": 6.389735935363855e-05, + "loss": 2.1379, + "step": 2661 + }, + { + "epoch": 0.42949338496289124, + "grad_norm": 4.122786521911621, + "learning_rate": 6.387225929985306e-05, + "loss": 1.9849, + "step": 2662 + }, + { + "epoch": 0.4296547273313972, + "grad_norm": 4.122786521911621, + "learning_rate": 6.387225929985306e-05, + "loss": 1.912, + "step": 2663 + }, + { + "epoch": 0.4298160696999032, + "grad_norm": 3.77955961227417, + "learning_rate": 6.384715545807613e-05, + "loss": 2.0548, + "step": 2664 + }, + { + "epoch": 0.42997741206840917, + "grad_norm": 6.101278781890869, + "learning_rate": 6.382204783516267e-05, + "loss": 2.0458, + "step": 2665 + }, + { + "epoch": 0.4301387544369151, + "grad_norm": 3.9506847858428955, + "learning_rate": 6.379693643796863e-05, + "loss": 1.8902, + "step": 2666 + }, + { + "epoch": 0.4303000968054211, + "grad_norm": 5.288346290588379, + "learning_rate": 6.377182127335096e-05, + "loss": 1.8848, + "step": 2667 + }, + { + "epoch": 0.4304614391739271, + "grad_norm": 4.493162155151367, + "learning_rate": 6.374670234816768e-05, + "loss": 2.0725, + "step": 2668 + }, + { + "epoch": 0.43062278154243305, + "grad_norm": 3.863415241241455, + "learning_rate": 6.372157966927785e-05, + "loss": 1.9583, + "step": 2669 + }, + { + "epoch": 0.430784123910939, + "grad_norm": 4.328066349029541, + "learning_rate": 6.369645324354149e-05, + "loss": 1.9972, + "step": 2670 + }, + { + "epoch": 0.43094546627944497, + "grad_norm": 4.514448165893555, + "learning_rate": 6.36713230778197e-05, + "loss": 1.8807, + "step": 2671 + }, + { + "epoch": 0.431106808647951, + "grad_norm": 4.6742329597473145, + "learning_rate": 6.364618917897456e-05, + "loss": 1.8058, + "step": 2672 + }, + { + "epoch": 0.43126815101645694, + "grad_norm": 3.9169187545776367, + "learning_rate": 6.362105155386923e-05, + "loss": 2.0142, + "step": 2673 + }, + { + "epoch": 0.4314294933849629, + "grad_norm": 3.444545269012451, + "learning_rate": 6.359591020936781e-05, + "loss": 2.0093, + "step": 2674 + }, + { + "epoch": 0.43159083575346885, + "grad_norm": 3.6630542278289795, + "learning_rate": 6.357076515233548e-05, + "loss": 1.8976, + "step": 2675 + }, + { + "epoch": 0.4317521781219748, + "grad_norm": 5.304434776306152, + "learning_rate": 6.35456163896384e-05, + "loss": 1.9855, + "step": 2676 + }, + { + "epoch": 0.4319135204904808, + "grad_norm": 4.996885776519775, + "learning_rate": 6.352046392814375e-05, + "loss": 1.9368, + "step": 2677 + }, + { + "epoch": 0.4320748628589868, + "grad_norm": 4.649833679199219, + "learning_rate": 6.34953077747197e-05, + "loss": 1.9488, + "step": 2678 + }, + { + "epoch": 0.43223620522749273, + "grad_norm": 3.217017889022827, + "learning_rate": 6.347014793623547e-05, + "loss": 2.1048, + "step": 2679 + }, + { + "epoch": 0.4323975475959987, + "grad_norm": 3.99094820022583, + "learning_rate": 6.344498441956127e-05, + "loss": 2.0496, + "step": 2680 + }, + { + "epoch": 0.4325588899645047, + "grad_norm": 3.524550199508667, + "learning_rate": 6.341981723156829e-05, + "loss": 1.8614, + "step": 2681 + }, + { + "epoch": 0.43272023233301066, + "grad_norm": 3.9145429134368896, + "learning_rate": 6.339464637912874e-05, + "loss": 1.9051, + "step": 2682 + }, + { + "epoch": 0.4328815747015166, + "grad_norm": 3.8224198818206787, + "learning_rate": 6.336947186911585e-05, + "loss": 2.0474, + "step": 2683 + }, + { + "epoch": 0.43304291707002257, + "grad_norm": 4.108032703399658, + "learning_rate": 6.334429370840381e-05, + "loss": 1.9799, + "step": 2684 + }, + { + "epoch": 0.4332042594385286, + "grad_norm": 3.7723186016082764, + "learning_rate": 6.331911190386785e-05, + "loss": 1.7414, + "step": 2685 + }, + { + "epoch": 0.43336560180703454, + "grad_norm": 4.491466045379639, + "learning_rate": 6.329392646238416e-05, + "loss": 2.1263, + "step": 2686 + }, + { + "epoch": 0.4335269441755405, + "grad_norm": 8.540251731872559, + "learning_rate": 6.326873739082993e-05, + "loss": 2.3336, + "step": 2687 + }, + { + "epoch": 0.43368828654404645, + "grad_norm": 4.491308212280273, + "learning_rate": 6.324354469608335e-05, + "loss": 2.144, + "step": 2688 + }, + { + "epoch": 0.4338496289125524, + "grad_norm": 3.9718141555786133, + "learning_rate": 6.32183483850236e-05, + "loss": 1.7607, + "step": 2689 + }, + { + "epoch": 0.4340109712810584, + "grad_norm": 4.296075820922852, + "learning_rate": 6.319314846453086e-05, + "loss": 1.9612, + "step": 2690 + }, + { + "epoch": 0.4341723136495644, + "grad_norm": 4.1339216232299805, + "learning_rate": 6.316794494148625e-05, + "loss": 1.932, + "step": 2691 + }, + { + "epoch": 0.43433365601807034, + "grad_norm": 4.131509780883789, + "learning_rate": 6.31427378227719e-05, + "loss": 1.9349, + "step": 2692 + }, + { + "epoch": 0.4344949983865763, + "grad_norm": 3.372370719909668, + "learning_rate": 6.311752711527095e-05, + "loss": 1.8986, + "step": 2693 + }, + { + "epoch": 0.4346563407550823, + "grad_norm": 4.359404563903809, + "learning_rate": 6.309231282586748e-05, + "loss": 1.7414, + "step": 2694 + }, + { + "epoch": 0.43481768312358826, + "grad_norm": 7.026899337768555, + "learning_rate": 6.306709496144654e-05, + "loss": 1.9072, + "step": 2695 + }, + { + "epoch": 0.4349790254920942, + "grad_norm": 5.448616027832031, + "learning_rate": 6.304187352889423e-05, + "loss": 1.8877, + "step": 2696 + }, + { + "epoch": 0.4351403678606002, + "grad_norm": 3.7698123455047607, + "learning_rate": 6.301664853509754e-05, + "loss": 2.2127, + "step": 2697 + }, + { + "epoch": 0.4353017102291062, + "grad_norm": 4.5785441398620605, + "learning_rate": 6.299141998694448e-05, + "loss": 1.8905, + "step": 2698 + }, + { + "epoch": 0.43546305259761214, + "grad_norm": 4.515157699584961, + "learning_rate": 6.2966187891324e-05, + "loss": 1.84, + "step": 2699 + }, + { + "epoch": 0.4356243949661181, + "grad_norm": 5.423101425170898, + "learning_rate": 6.294095225512603e-05, + "loss": 1.8639, + "step": 2700 + }, + { + "epoch": 0.43578573733462406, + "grad_norm": 5.622671127319336, + "learning_rate": 6.29157130852415e-05, + "loss": 1.9387, + "step": 2701 + }, + { + "epoch": 0.43594707970313007, + "grad_norm": 3.312377691268921, + "learning_rate": 6.289047038856226e-05, + "loss": 2.1385, + "step": 2702 + }, + { + "epoch": 0.436108422071636, + "grad_norm": 4.007981777191162, + "learning_rate": 6.286522417198115e-05, + "loss": 1.9194, + "step": 2703 + }, + { + "epoch": 0.436269764440142, + "grad_norm": 4.537135124206543, + "learning_rate": 6.283997444239194e-05, + "loss": 1.9863, + "step": 2704 + }, + { + "epoch": 0.43643110680864794, + "grad_norm": 3.8456368446350098, + "learning_rate": 6.281472120668939e-05, + "loss": 2.132, + "step": 2705 + }, + { + "epoch": 0.4365924491771539, + "grad_norm": 4.065152645111084, + "learning_rate": 6.278946447176923e-05, + "loss": 1.8911, + "step": 2706 + }, + { + "epoch": 0.4367537915456599, + "grad_norm": 3.647221565246582, + "learning_rate": 6.27642042445281e-05, + "loss": 1.9615, + "step": 2707 + }, + { + "epoch": 0.43691513391416587, + "grad_norm": 6.745212554931641, + "learning_rate": 6.27389405318636e-05, + "loss": 1.9975, + "step": 2708 + }, + { + "epoch": 0.4370764762826718, + "grad_norm": 4.661088466644287, + "learning_rate": 6.271367334067431e-05, + "loss": 2.1729, + "step": 2709 + }, + { + "epoch": 0.4372378186511778, + "grad_norm": 3.776611328125, + "learning_rate": 6.268840267785976e-05, + "loss": 1.9579, + "step": 2710 + }, + { + "epoch": 0.4373991610196838, + "grad_norm": 4.246987342834473, + "learning_rate": 6.266312855032042e-05, + "loss": 1.9564, + "step": 2711 + }, + { + "epoch": 0.43756050338818975, + "grad_norm": 5.6204752922058105, + "learning_rate": 6.26378509649577e-05, + "loss": 2.0972, + "step": 2712 + }, + { + "epoch": 0.4377218457566957, + "grad_norm": 4.941777229309082, + "learning_rate": 6.261256992867392e-05, + "loss": 2.0689, + "step": 2713 + }, + { + "epoch": 0.43788318812520166, + "grad_norm": 4.1518964767456055, + "learning_rate": 6.258728544837243e-05, + "loss": 1.899, + "step": 2714 + }, + { + "epoch": 0.4380445304937077, + "grad_norm": 4.4116411209106445, + "learning_rate": 6.256199753095745e-05, + "loss": 1.8749, + "step": 2715 + }, + { + "epoch": 0.43820587286221363, + "grad_norm": 4.02710485458374, + "learning_rate": 6.253670618333417e-05, + "loss": 1.9701, + "step": 2716 + }, + { + "epoch": 0.4383672152307196, + "grad_norm": 3.8983218669891357, + "learning_rate": 6.251141141240866e-05, + "loss": 1.9445, + "step": 2717 + }, + { + "epoch": 0.43852855759922554, + "grad_norm": 5.86122989654541, + "learning_rate": 6.2486113225088e-05, + "loss": 1.9994, + "step": 2718 + }, + { + "epoch": 0.4386898999677315, + "grad_norm": 4.823337078094482, + "learning_rate": 6.246081162828016e-05, + "loss": 1.9845, + "step": 2719 + }, + { + "epoch": 0.4388512423362375, + "grad_norm": 3.7847797870635986, + "learning_rate": 6.243550662889408e-05, + "loss": 1.5948, + "step": 2720 + }, + { + "epoch": 0.43901258470474347, + "grad_norm": 3.4540817737579346, + "learning_rate": 6.241019823383959e-05, + "loss": 2.0022, + "step": 2721 + }, + { + "epoch": 0.4391739270732494, + "grad_norm": 4.29924201965332, + "learning_rate": 6.238488645002744e-05, + "loss": 1.8881, + "step": 2722 + }, + { + "epoch": 0.4393352694417554, + "grad_norm": 4.001747131347656, + "learning_rate": 6.235957128436936e-05, + "loss": 2.1782, + "step": 2723 + }, + { + "epoch": 0.4394966118102614, + "grad_norm": 3.5031304359436035, + "learning_rate": 6.233425274377794e-05, + "loss": 2.0432, + "step": 2724 + }, + { + "epoch": 0.43965795417876735, + "grad_norm": 4.947254657745361, + "learning_rate": 6.230893083516672e-05, + "loss": 1.9521, + "step": 2725 + }, + { + "epoch": 0.4398192965472733, + "grad_norm": 4.556342124938965, + "learning_rate": 6.228360556545016e-05, + "loss": 1.8621, + "step": 2726 + }, + { + "epoch": 0.43998063891577927, + "grad_norm": 5.173356533050537, + "learning_rate": 6.225827694154364e-05, + "loss": 2.0605, + "step": 2727 + }, + { + "epoch": 0.4401419812842853, + "grad_norm": 5.181009292602539, + "learning_rate": 6.22329449703635e-05, + "loss": 1.9213, + "step": 2728 + }, + { + "epoch": 0.44030332365279123, + "grad_norm": 4.707743167877197, + "learning_rate": 6.220760965882686e-05, + "loss": 2.0548, + "step": 2729 + }, + { + "epoch": 0.4404646660212972, + "grad_norm": 4.754821300506592, + "learning_rate": 6.218227101385189e-05, + "loss": 2.2322, + "step": 2730 + }, + { + "epoch": 0.44062600838980315, + "grad_norm": 3.5412001609802246, + "learning_rate": 6.215692904235762e-05, + "loss": 1.9829, + "step": 2731 + }, + { + "epoch": 0.4407873507583091, + "grad_norm": 3.518568992614746, + "learning_rate": 6.213158375126398e-05, + "loss": 1.7684, + "step": 2732 + }, + { + "epoch": 0.4409486931268151, + "grad_norm": 4.005379676818848, + "learning_rate": 6.21062351474918e-05, + "loss": 1.8206, + "step": 2733 + }, + { + "epoch": 0.4411100354953211, + "grad_norm": 4.170416355133057, + "learning_rate": 6.208088323796286e-05, + "loss": 1.9699, + "step": 2734 + }, + { + "epoch": 0.44127137786382703, + "grad_norm": 3.935908317565918, + "learning_rate": 6.20555280295998e-05, + "loss": 1.7982, + "step": 2735 + }, + { + "epoch": 0.441432720232333, + "grad_norm": 5.015724182128906, + "learning_rate": 6.203016952932614e-05, + "loss": 1.8578, + "step": 2736 + }, + { + "epoch": 0.441594062600839, + "grad_norm": 6.472920894622803, + "learning_rate": 6.200480774406637e-05, + "loss": 2.1805, + "step": 2737 + }, + { + "epoch": 0.44175540496934496, + "grad_norm": 4.625115394592285, + "learning_rate": 6.197944268074583e-05, + "loss": 1.9196, + "step": 2738 + }, + { + "epoch": 0.4419167473378509, + "grad_norm": 4.94097375869751, + "learning_rate": 6.195407434629077e-05, + "loss": 1.9293, + "step": 2739 + }, + { + "epoch": 0.44207808970635687, + "grad_norm": 5.129204273223877, + "learning_rate": 6.192870274762831e-05, + "loss": 1.8207, + "step": 2740 + }, + { + "epoch": 0.4422394320748629, + "grad_norm": 4.711706638336182, + "learning_rate": 6.190332789168648e-05, + "loss": 2.3755, + "step": 2741 + }, + { + "epoch": 0.44240077444336884, + "grad_norm": 6.469273567199707, + "learning_rate": 6.187794978539419e-05, + "loss": 2.1367, + "step": 2742 + }, + { + "epoch": 0.4425621168118748, + "grad_norm": 6.1164870262146, + "learning_rate": 6.185256843568127e-05, + "loss": 2.1863, + "step": 2743 + }, + { + "epoch": 0.44272345918038075, + "grad_norm": 4.332474708557129, + "learning_rate": 6.18271838494784e-05, + "loss": 1.992, + "step": 2744 + }, + { + "epoch": 0.44288480154888676, + "grad_norm": 5.556947708129883, + "learning_rate": 6.180179603371715e-05, + "loss": 1.9286, + "step": 2745 + }, + { + "epoch": 0.4430461439173927, + "grad_norm": 4.1797990798950195, + "learning_rate": 6.177640499532996e-05, + "loss": 1.7601, + "step": 2746 + }, + { + "epoch": 0.4432074862858987, + "grad_norm": 3.9618661403656006, + "learning_rate": 6.175101074125019e-05, + "loss": 1.8364, + "step": 2747 + }, + { + "epoch": 0.44336882865440463, + "grad_norm": 5.564877033233643, + "learning_rate": 6.172561327841206e-05, + "loss": 2.0964, + "step": 2748 + }, + { + "epoch": 0.4435301710229106, + "grad_norm": 5.803670883178711, + "learning_rate": 6.170021261375063e-05, + "loss": 2.225, + "step": 2749 + }, + { + "epoch": 0.4436915133914166, + "grad_norm": 3.9706003665924072, + "learning_rate": 6.167480875420188e-05, + "loss": 1.9598, + "step": 2750 + }, + { + "epoch": 0.44385285575992256, + "grad_norm": 4.746767044067383, + "learning_rate": 6.164940170670266e-05, + "loss": 1.9101, + "step": 2751 + }, + { + "epoch": 0.4440141981284285, + "grad_norm": 3.3168787956237793, + "learning_rate": 6.162399147819066e-05, + "loss": 2.02, + "step": 2752 + }, + { + "epoch": 0.4441755404969345, + "grad_norm": 6.952641487121582, + "learning_rate": 6.159857807560449e-05, + "loss": 2.2001, + "step": 2753 + }, + { + "epoch": 0.4443368828654405, + "grad_norm": 2.936422348022461, + "learning_rate": 6.157316150588355e-05, + "loss": 1.9248, + "step": 2754 + }, + { + "epoch": 0.44449822523394644, + "grad_norm": 5.582231521606445, + "learning_rate": 6.154774177596816e-05, + "loss": 1.9168, + "step": 2755 + }, + { + "epoch": 0.4446595676024524, + "grad_norm": 3.6427130699157715, + "learning_rate": 6.15223188927995e-05, + "loss": 1.774, + "step": 2756 + }, + { + "epoch": 0.44482090997095836, + "grad_norm": 3.65397047996521, + "learning_rate": 6.149689286331958e-05, + "loss": 2.0293, + "step": 2757 + }, + { + "epoch": 0.44498225233946437, + "grad_norm": 4.262415885925293, + "learning_rate": 6.147146369447131e-05, + "loss": 1.8036, + "step": 2758 + }, + { + "epoch": 0.4451435947079703, + "grad_norm": 5.0748209953308105, + "learning_rate": 6.144603139319845e-05, + "loss": 2.052, + "step": 2759 + }, + { + "epoch": 0.4453049370764763, + "grad_norm": 3.724123954772949, + "learning_rate": 6.142059596644558e-05, + "loss": 1.9632, + "step": 2760 + }, + { + "epoch": 0.44546627944498224, + "grad_norm": 4.626696586608887, + "learning_rate": 6.139515742115816e-05, + "loss": 1.8951, + "step": 2761 + }, + { + "epoch": 0.4456276218134882, + "grad_norm": 4.935206890106201, + "learning_rate": 6.13697157642825e-05, + "loss": 1.9481, + "step": 2762 + }, + { + "epoch": 0.4457889641819942, + "grad_norm": 4.78384256362915, + "learning_rate": 6.134427100276579e-05, + "loss": 1.7873, + "step": 2763 + }, + { + "epoch": 0.44595030655050016, + "grad_norm": 5.536062717437744, + "learning_rate": 6.131882314355599e-05, + "loss": 1.8287, + "step": 2764 + }, + { + "epoch": 0.4461116489190061, + "grad_norm": 3.8308053016662598, + "learning_rate": 6.129337219360196e-05, + "loss": 2.2945, + "step": 2765 + }, + { + "epoch": 0.4462729912875121, + "grad_norm": 3.763317584991455, + "learning_rate": 6.126791815985343e-05, + "loss": 2.2219, + "step": 2766 + }, + { + "epoch": 0.4464343336560181, + "grad_norm": 3.8848650455474854, + "learning_rate": 6.12424610492609e-05, + "loss": 1.9936, + "step": 2767 + }, + { + "epoch": 0.44659567602452405, + "grad_norm": 4.475578308105469, + "learning_rate": 6.121700086877575e-05, + "loss": 2.0197, + "step": 2768 + }, + { + "epoch": 0.44675701839303, + "grad_norm": 5.381459712982178, + "learning_rate": 6.119153762535021e-05, + "loss": 2.2272, + "step": 2769 + }, + { + "epoch": 0.44691836076153596, + "grad_norm": 4.24346399307251, + "learning_rate": 6.116607132593733e-05, + "loss": 2.2393, + "step": 2770 + }, + { + "epoch": 0.44707970313004197, + "grad_norm": 5.122096538543701, + "learning_rate": 6.114060197749101e-05, + "loss": 1.9709, + "step": 2771 + }, + { + "epoch": 0.44724104549854793, + "grad_norm": 5.45127534866333, + "learning_rate": 6.111512958696594e-05, + "loss": 2.0853, + "step": 2772 + }, + { + "epoch": 0.4474023878670539, + "grad_norm": 4.182010173797607, + "learning_rate": 6.10896541613177e-05, + "loss": 1.8845, + "step": 2773 + }, + { + "epoch": 0.44756373023555984, + "grad_norm": 4.00731897354126, + "learning_rate": 6.106417570750265e-05, + "loss": 2.0787, + "step": 2774 + }, + { + "epoch": 0.44772507260406585, + "grad_norm": 5.120847702026367, + "learning_rate": 6.1038694232478e-05, + "loss": 1.6975, + "step": 2775 + }, + { + "epoch": 0.4478864149725718, + "grad_norm": 4.001104354858398, + "learning_rate": 6.1013209743201784e-05, + "loss": 1.9372, + "step": 2776 + }, + { + "epoch": 0.44804775734107777, + "grad_norm": 3.4694342613220215, + "learning_rate": 6.098772224663285e-05, + "loss": 2.0223, + "step": 2777 + }, + { + "epoch": 0.4482090997095837, + "grad_norm": 3.6993019580841064, + "learning_rate": 6.09622317497309e-05, + "loss": 1.9419, + "step": 2778 + }, + { + "epoch": 0.4483704420780897, + "grad_norm": 4.861392974853516, + "learning_rate": 6.093673825945638e-05, + "loss": 2.0335, + "step": 2779 + }, + { + "epoch": 0.4485317844465957, + "grad_norm": 4.051278114318848, + "learning_rate": 6.0911241782770644e-05, + "loss": 1.7612, + "step": 2780 + }, + { + "epoch": 0.44869312681510165, + "grad_norm": 3.8256630897521973, + "learning_rate": 6.08857423266358e-05, + "loss": 1.8444, + "step": 2781 + }, + { + "epoch": 0.4488544691836076, + "grad_norm": 4.158143043518066, + "learning_rate": 6.086023989801478e-05, + "loss": 2.157, + "step": 2782 + }, + { + "epoch": 0.44901581155211356, + "grad_norm": 4.315159797668457, + "learning_rate": 6.0834734503871374e-05, + "loss": 1.8622, + "step": 2783 + }, + { + "epoch": 0.4491771539206196, + "grad_norm": 3.9262237548828125, + "learning_rate": 6.0809226151170104e-05, + "loss": 1.8943, + "step": 2784 + }, + { + "epoch": 0.44933849628912553, + "grad_norm": 4.216813564300537, + "learning_rate": 6.078371484687635e-05, + "loss": 1.8937, + "step": 2785 + }, + { + "epoch": 0.4494998386576315, + "grad_norm": 4.525496482849121, + "learning_rate": 6.0758200597956306e-05, + "loss": 2.1272, + "step": 2786 + }, + { + "epoch": 0.44966118102613745, + "grad_norm": 3.2141411304473877, + "learning_rate": 6.0732683411376935e-05, + "loss": 1.9563, + "step": 2787 + }, + { + "epoch": 0.44982252339464346, + "grad_norm": 3.9821486473083496, + "learning_rate": 6.070716329410602e-05, + "loss": 2.0271, + "step": 2788 + }, + { + "epoch": 0.4499838657631494, + "grad_norm": 4.4023356437683105, + "learning_rate": 6.068164025311215e-05, + "loss": 2.0083, + "step": 2789 + }, + { + "epoch": 0.45014520813165537, + "grad_norm": 6.945428848266602, + "learning_rate": 6.065611429536471e-05, + "loss": 2.1312, + "step": 2790 + }, + { + "epoch": 0.45030655050016133, + "grad_norm": 4.356259346008301, + "learning_rate": 6.0630585427833876e-05, + "loss": 1.9445, + "step": 2791 + }, + { + "epoch": 0.4504678928686673, + "grad_norm": 3.8623836040496826, + "learning_rate": 6.060505365749061e-05, + "loss": 1.8719, + "step": 2792 + }, + { + "epoch": 0.4506292352371733, + "grad_norm": 8.641510009765625, + "learning_rate": 6.057951899130668e-05, + "loss": 2.0761, + "step": 2793 + }, + { + "epoch": 0.45079057760567925, + "grad_norm": 3.494572162628174, + "learning_rate": 6.055398143625465e-05, + "loss": 1.9555, + "step": 2794 + }, + { + "epoch": 0.4509519199741852, + "grad_norm": 4.154002666473389, + "learning_rate": 6.0528440999307846e-05, + "loss": 1.9841, + "step": 2795 + }, + { + "epoch": 0.45111326234269117, + "grad_norm": 3.343074083328247, + "learning_rate": 6.050289768744042e-05, + "loss": 1.742, + "step": 2796 + }, + { + "epoch": 0.4512746047111972, + "grad_norm": 4.662927150726318, + "learning_rate": 6.0477351507627276e-05, + "loss": 2.1056, + "step": 2797 + }, + { + "epoch": 0.45143594707970314, + "grad_norm": 5.556552886962891, + "learning_rate": 6.045180246684412e-05, + "loss": 1.9523, + "step": 2798 + }, + { + "epoch": 0.4515972894482091, + "grad_norm": 4.1540679931640625, + "learning_rate": 6.042625057206742e-05, + "loss": 1.9353, + "step": 2799 + }, + { + "epoch": 0.45175863181671505, + "grad_norm": 4.263247013092041, + "learning_rate": 6.0400695830274453e-05, + "loss": 1.9561, + "step": 2800 + }, + { + "epoch": 0.45191997418522106, + "grad_norm": 3.8728561401367188, + "learning_rate": 6.037513824844326e-05, + "loss": 1.9391, + "step": 2801 + }, + { + "epoch": 0.452081316553727, + "grad_norm": 3.4104127883911133, + "learning_rate": 6.034957783355264e-05, + "loss": 1.8916, + "step": 2802 + }, + { + "epoch": 0.452242658922233, + "grad_norm": 4.290063381195068, + "learning_rate": 6.032401459258217e-05, + "loss": 1.7813, + "step": 2803 + }, + { + "epoch": 0.45240400129073893, + "grad_norm": 4.381455898284912, + "learning_rate": 6.029844853251223e-05, + "loss": 2.1467, + "step": 2804 + }, + { + "epoch": 0.45256534365924495, + "grad_norm": 5.716737747192383, + "learning_rate": 6.0272879660323934e-05, + "loss": 1.8457, + "step": 2805 + }, + { + "epoch": 0.4527266860277509, + "grad_norm": 5.184352397918701, + "learning_rate": 6.024730798299918e-05, + "loss": 2.0207, + "step": 2806 + }, + { + "epoch": 0.45288802839625686, + "grad_norm": 5.929523944854736, + "learning_rate": 6.022173350752064e-05, + "loss": 1.9734, + "step": 2807 + }, + { + "epoch": 0.4530493707647628, + "grad_norm": 3.670257329940796, + "learning_rate": 6.0196156240871726e-05, + "loss": 1.8307, + "step": 2808 + }, + { + "epoch": 0.45321071313326877, + "grad_norm": 4.820164680480957, + "learning_rate": 6.017057619003663e-05, + "loss": 2.0091, + "step": 2809 + }, + { + "epoch": 0.4533720555017748, + "grad_norm": 5.240920066833496, + "learning_rate": 6.01449933620003e-05, + "loss": 1.9866, + "step": 2810 + }, + { + "epoch": 0.45353339787028074, + "grad_norm": 4.092869758605957, + "learning_rate": 6.011940776374846e-05, + "loss": 2.2141, + "step": 2811 + }, + { + "epoch": 0.4536947402387867, + "grad_norm": 4.588827610015869, + "learning_rate": 6.009381940226755e-05, + "loss": 1.9181, + "step": 2812 + }, + { + "epoch": 0.45385608260729265, + "grad_norm": 4.388471603393555, + "learning_rate": 6.006822828454478e-05, + "loss": 1.9646, + "step": 2813 + }, + { + "epoch": 0.45401742497579867, + "grad_norm": 4.334965229034424, + "learning_rate": 6.004263441756815e-05, + "loss": 1.9277, + "step": 2814 + }, + { + "epoch": 0.4541787673443046, + "grad_norm": 4.454052448272705, + "learning_rate": 6.001703780832636e-05, + "loss": 2.1556, + "step": 2815 + }, + { + "epoch": 0.4543401097128106, + "grad_norm": 3.230821132659912, + "learning_rate": 5.99914384638089e-05, + "loss": 1.6343, + "step": 2816 + }, + { + "epoch": 0.45450145208131654, + "grad_norm": 3.8127543926239014, + "learning_rate": 5.9965836391005966e-05, + "loss": 1.9881, + "step": 2817 + }, + { + "epoch": 0.45466279444982255, + "grad_norm": 3.7591707706451416, + "learning_rate": 5.9940231596908527e-05, + "loss": 2.0796, + "step": 2818 + }, + { + "epoch": 0.4548241368183285, + "grad_norm": 5.247003555297852, + "learning_rate": 5.991462408850828e-05, + "loss": 1.9372, + "step": 2819 + }, + { + "epoch": 0.45498547918683446, + "grad_norm": 3.789271116256714, + "learning_rate": 5.988901387279768e-05, + "loss": 2.0667, + "step": 2820 + }, + { + "epoch": 0.4551468215553404, + "grad_norm": 4.7549147605896, + "learning_rate": 5.986340095676992e-05, + "loss": 1.8772, + "step": 2821 + }, + { + "epoch": 0.4553081639238464, + "grad_norm": 5.16868782043457, + "learning_rate": 5.983778534741891e-05, + "loss": 1.9445, + "step": 2822 + }, + { + "epoch": 0.4554695062923524, + "grad_norm": 3.538343906402588, + "learning_rate": 5.98121670517393e-05, + "loss": 1.9306, + "step": 2823 + }, + { + "epoch": 0.45563084866085835, + "grad_norm": 3.2506930828094482, + "learning_rate": 5.97865460767265e-05, + "loss": 2.1249, + "step": 2824 + }, + { + "epoch": 0.4557921910293643, + "grad_norm": 3.847710609436035, + "learning_rate": 5.976092242937663e-05, + "loss": 1.8947, + "step": 2825 + }, + { + "epoch": 0.45595353339787026, + "grad_norm": 4.152834415435791, + "learning_rate": 5.9735296116686526e-05, + "loss": 1.9749, + "step": 2826 + }, + { + "epoch": 0.45611487576637627, + "grad_norm": 4.620455741882324, + "learning_rate": 5.970966714565379e-05, + "loss": 1.866, + "step": 2827 + }, + { + "epoch": 0.4562762181348822, + "grad_norm": 3.5664255619049072, + "learning_rate": 5.9684035523276716e-05, + "loss": 1.8183, + "step": 2828 + }, + { + "epoch": 0.4564375605033882, + "grad_norm": 4.250553607940674, + "learning_rate": 5.9658401256554354e-05, + "loss": 1.916, + "step": 2829 + }, + { + "epoch": 0.45659890287189414, + "grad_norm": 5.06902551651001, + "learning_rate": 5.963276435248642e-05, + "loss": 2.0314, + "step": 2830 + }, + { + "epoch": 0.45676024524040015, + "grad_norm": 3.892254590988159, + "learning_rate": 5.9607124818073426e-05, + "loss": 2.0672, + "step": 2831 + }, + { + "epoch": 0.4569215876089061, + "grad_norm": 4.237087249755859, + "learning_rate": 5.958148266031654e-05, + "loss": 2.0516, + "step": 2832 + }, + { + "epoch": 0.45708292997741207, + "grad_norm": 3.9006803035736084, + "learning_rate": 5.955583788621766e-05, + "loss": 1.8418, + "step": 2833 + }, + { + "epoch": 0.457244272345918, + "grad_norm": 4.705850601196289, + "learning_rate": 5.9530190502779425e-05, + "loss": 1.7653, + "step": 2834 + }, + { + "epoch": 0.457405614714424, + "grad_norm": 4.514369010925293, + "learning_rate": 5.950454051700518e-05, + "loss": 1.7485, + "step": 2835 + }, + { + "epoch": 0.45756695708293, + "grad_norm": 3.989203453063965, + "learning_rate": 5.947888793589894e-05, + "loss": 1.9439, + "step": 2836 + }, + { + "epoch": 0.45772829945143595, + "grad_norm": 3.7708961963653564, + "learning_rate": 5.945323276646548e-05, + "loss": 1.8588, + "step": 2837 + }, + { + "epoch": 0.4578896418199419, + "grad_norm": 3.7233071327209473, + "learning_rate": 5.942757501571026e-05, + "loss": 1.875, + "step": 2838 + }, + { + "epoch": 0.45805098418844786, + "grad_norm": 3.7089688777923584, + "learning_rate": 5.940191469063943e-05, + "loss": 1.7894, + "step": 2839 + }, + { + "epoch": 0.4582123265569539, + "grad_norm": 4.522425174713135, + "learning_rate": 5.937625179825988e-05, + "loss": 2.1627, + "step": 2840 + }, + { + "epoch": 0.45837366892545983, + "grad_norm": 6.098073959350586, + "learning_rate": 5.9350586345579165e-05, + "loss": 1.8362, + "step": 2841 + }, + { + "epoch": 0.4585350112939658, + "grad_norm": 5.885412693023682, + "learning_rate": 5.932491833960556e-05, + "loss": 1.9232, + "step": 2842 + }, + { + "epoch": 0.45869635366247175, + "grad_norm": 4.87391996383667, + "learning_rate": 5.929924778734801e-05, + "loss": 1.7168, + "step": 2843 + }, + { + "epoch": 0.45885769603097776, + "grad_norm": 3.8623905181884766, + "learning_rate": 5.9273574695816204e-05, + "loss": 1.8761, + "step": 2844 + }, + { + "epoch": 0.4590190383994837, + "grad_norm": 5.439482688903809, + "learning_rate": 5.924789907202048e-05, + "loss": 1.9109, + "step": 2845 + }, + { + "epoch": 0.45918038076798967, + "grad_norm": 6.422399520874023, + "learning_rate": 5.922222092297188e-05, + "loss": 1.718, + "step": 2846 + }, + { + "epoch": 0.4593417231364956, + "grad_norm": 4.699904441833496, + "learning_rate": 5.919654025568215e-05, + "loss": 2.2407, + "step": 2847 + }, + { + "epoch": 0.45950306550500164, + "grad_norm": 3.374533176422119, + "learning_rate": 5.917085707716372e-05, + "loss": 2.0182, + "step": 2848 + }, + { + "epoch": 0.4596644078735076, + "grad_norm": 6.88772439956665, + "learning_rate": 5.914517139442968e-05, + "loss": 1.9034, + "step": 2849 + }, + { + "epoch": 0.45982575024201355, + "grad_norm": 4.0415940284729, + "learning_rate": 5.9119483214493844e-05, + "loss": 1.9439, + "step": 2850 + }, + { + "epoch": 0.4599870926105195, + "grad_norm": 5.267615795135498, + "learning_rate": 5.9093792544370665e-05, + "loss": 1.7653, + "step": 2851 + }, + { + "epoch": 0.46014843497902547, + "grad_norm": 4.279847621917725, + "learning_rate": 5.9068099391075296e-05, + "loss": 1.7213, + "step": 2852 + }, + { + "epoch": 0.4603097773475315, + "grad_norm": 4.328471660614014, + "learning_rate": 5.904240376162358e-05, + "loss": 1.8291, + "step": 2853 + }, + { + "epoch": 0.46047111971603744, + "grad_norm": 4.959749698638916, + "learning_rate": 5.901670566303205e-05, + "loss": 2.0452, + "step": 2854 + }, + { + "epoch": 0.4606324620845434, + "grad_norm": 3.369969606399536, + "learning_rate": 5.899100510231783e-05, + "loss": 1.9867, + "step": 2855 + }, + { + "epoch": 0.46079380445304935, + "grad_norm": 4.172391891479492, + "learning_rate": 5.8965302086498816e-05, + "loss": 1.9064, + "step": 2856 + }, + { + "epoch": 0.46095514682155536, + "grad_norm": 3.6533727645874023, + "learning_rate": 5.893959662259353e-05, + "loss": 2.2306, + "step": 2857 + }, + { + "epoch": 0.4611164891900613, + "grad_norm": 4.612024307250977, + "learning_rate": 5.891388871762116e-05, + "loss": 1.8467, + "step": 2858 + }, + { + "epoch": 0.4612778315585673, + "grad_norm": 4.94940185546875, + "learning_rate": 5.8888178378601565e-05, + "loss": 1.77, + "step": 2859 + }, + { + "epoch": 0.46143917392707323, + "grad_norm": 5.480003356933594, + "learning_rate": 5.8862465612555286e-05, + "loss": 2.1171, + "step": 2860 + }, + { + "epoch": 0.46160051629557924, + "grad_norm": 4.599255084991455, + "learning_rate": 5.8836750426503487e-05, + "loss": 2.2328, + "step": 2861 + }, + { + "epoch": 0.4617618586640852, + "grad_norm": 4.497989177703857, + "learning_rate": 5.881103282746803e-05, + "loss": 1.8345, + "step": 2862 + }, + { + "epoch": 0.46192320103259116, + "grad_norm": 3.771289348602295, + "learning_rate": 5.8785312822471405e-05, + "loss": 2.0867, + "step": 2863 + }, + { + "epoch": 0.4620845434010971, + "grad_norm": 5.573981285095215, + "learning_rate": 5.8759590418536806e-05, + "loss": 1.6688, + "step": 2864 + }, + { + "epoch": 0.46224588576960307, + "grad_norm": 4.295119285583496, + "learning_rate": 5.873386562268803e-05, + "loss": 2.3071, + "step": 2865 + }, + { + "epoch": 0.4624072281381091, + "grad_norm": 4.947121620178223, + "learning_rate": 5.8708138441949556e-05, + "loss": 1.8318, + "step": 2866 + }, + { + "epoch": 0.46256857050661504, + "grad_norm": 3.5124878883361816, + "learning_rate": 5.868240888334653e-05, + "loss": 1.8992, + "step": 2867 + }, + { + "epoch": 0.462729912875121, + "grad_norm": 3.6015613079071045, + "learning_rate": 5.865667695390468e-05, + "loss": 2.0089, + "step": 2868 + }, + { + "epoch": 0.46289125524362695, + "grad_norm": 5.1361308097839355, + "learning_rate": 5.863094266065046e-05, + "loss": 2.0508, + "step": 2869 + }, + { + "epoch": 0.46305259761213297, + "grad_norm": 4.714128017425537, + "learning_rate": 5.860520601061093e-05, + "loss": 1.7286, + "step": 2870 + }, + { + "epoch": 0.4632139399806389, + "grad_norm": 4.3609938621521, + "learning_rate": 5.857946701081379e-05, + "loss": 2.0369, + "step": 2871 + }, + { + "epoch": 0.4633752823491449, + "grad_norm": 5.165975570678711, + "learning_rate": 5.855372566828741e-05, + "loss": 1.9922, + "step": 2872 + }, + { + "epoch": 0.46353662471765084, + "grad_norm": 3.714384078979492, + "learning_rate": 5.8527981990060756e-05, + "loss": 1.882, + "step": 2873 + }, + { + "epoch": 0.46369796708615685, + "grad_norm": 4.203963279724121, + "learning_rate": 5.850223598316347e-05, + "loss": 2.0468, + "step": 2874 + }, + { + "epoch": 0.4638593094546628, + "grad_norm": 5.084408760070801, + "learning_rate": 5.8476487654625814e-05, + "loss": 2.1784, + "step": 2875 + }, + { + "epoch": 0.46402065182316876, + "grad_norm": 5.743606090545654, + "learning_rate": 5.8450737011478686e-05, + "loss": 2.0263, + "step": 2876 + }, + { + "epoch": 0.4641819941916747, + "grad_norm": 4.035053730010986, + "learning_rate": 5.842498406075363e-05, + "loss": 1.9945, + "step": 2877 + }, + { + "epoch": 0.46434333656018073, + "grad_norm": 5.210145950317383, + "learning_rate": 5.8399228809482796e-05, + "loss": 2.2279, + "step": 2878 + }, + { + "epoch": 0.4645046789286867, + "grad_norm": 3.959944009780884, + "learning_rate": 5.8373471264698975e-05, + "loss": 1.8902, + "step": 2879 + }, + { + "epoch": 0.46466602129719264, + "grad_norm": 3.5130996704101562, + "learning_rate": 5.834771143343558e-05, + "loss": 1.924, + "step": 2880 + }, + { + "epoch": 0.4648273636656986, + "grad_norm": 3.5183193683624268, + "learning_rate": 5.832194932272664e-05, + "loss": 1.9975, + "step": 2881 + }, + { + "epoch": 0.46498870603420456, + "grad_norm": 5.39290189743042, + "learning_rate": 5.8296184939606834e-05, + "loss": 1.8334, + "step": 2882 + }, + { + "epoch": 0.46515004840271057, + "grad_norm": 3.941741943359375, + "learning_rate": 5.827041829111144e-05, + "loss": 1.9553, + "step": 2883 + }, + { + "epoch": 0.4653113907712165, + "grad_norm": 5.16987419128418, + "learning_rate": 5.824464938427636e-05, + "loss": 1.9728, + "step": 2884 + }, + { + "epoch": 0.4654727331397225, + "grad_norm": 4.827245235443115, + "learning_rate": 5.82188782261381e-05, + "loss": 1.8847, + "step": 2885 + }, + { + "epoch": 0.46563407550822844, + "grad_norm": 4.589350700378418, + "learning_rate": 5.81931048237338e-05, + "loss": 1.8077, + "step": 2886 + }, + { + "epoch": 0.46579541787673445, + "grad_norm": 5.262085437774658, + "learning_rate": 5.8167329184101216e-05, + "loss": 2.1506, + "step": 2887 + }, + { + "epoch": 0.4659567602452404, + "grad_norm": 4.675025939941406, + "learning_rate": 5.81415513142787e-05, + "loss": 1.7157, + "step": 2888 + }, + { + "epoch": 0.46611810261374637, + "grad_norm": 5.154076099395752, + "learning_rate": 5.8115771221305204e-05, + "loss": 2.0694, + "step": 2889 + }, + { + "epoch": 0.4662794449822523, + "grad_norm": 3.6449477672576904, + "learning_rate": 5.8089988912220306e-05, + "loss": 1.7284, + "step": 2890 + }, + { + "epoch": 0.46644078735075833, + "grad_norm": 4.152365684509277, + "learning_rate": 5.806420439406419e-05, + "loss": 1.9364, + "step": 2891 + }, + { + "epoch": 0.4666021297192643, + "grad_norm": 6.4119553565979, + "learning_rate": 5.8038417673877644e-05, + "loss": 1.834, + "step": 2892 + }, + { + "epoch": 0.46676347208777025, + "grad_norm": 4.654435157775879, + "learning_rate": 5.8012628758702025e-05, + "loss": 2.0098, + "step": 2893 + }, + { + "epoch": 0.4669248144562762, + "grad_norm": 5.395243167877197, + "learning_rate": 5.798683765557933e-05, + "loss": 2.0146, + "step": 2894 + }, + { + "epoch": 0.46708615682478216, + "grad_norm": 5.3816609382629395, + "learning_rate": 5.796104437155213e-05, + "loss": 2.2618, + "step": 2895 + }, + { + "epoch": 0.4672474991932882, + "grad_norm": 3.4671974182128906, + "learning_rate": 5.79352489136636e-05, + "loss": 1.8487, + "step": 2896 + }, + { + "epoch": 0.46740884156179413, + "grad_norm": 4.630336761474609, + "learning_rate": 5.790945128895753e-05, + "loss": 1.8822, + "step": 2897 + }, + { + "epoch": 0.4675701839303001, + "grad_norm": 3.7400383949279785, + "learning_rate": 5.7883651504478257e-05, + "loss": 1.987, + "step": 2898 + }, + { + "epoch": 0.46773152629880604, + "grad_norm": 5.043737888336182, + "learning_rate": 5.7857849567270725e-05, + "loss": 2.0825, + "step": 2899 + }, + { + "epoch": 0.46789286866731206, + "grad_norm": 4.905903339385986, + "learning_rate": 5.7832045484380495e-05, + "loss": 1.9055, + "step": 2900 + }, + { + "epoch": 0.468054211035818, + "grad_norm": 4.365562915802002, + "learning_rate": 5.7806239262853665e-05, + "loss": 2.0351, + "step": 2901 + }, + { + "epoch": 0.46821555340432397, + "grad_norm": 4.589780807495117, + "learning_rate": 5.778043090973696e-05, + "loss": 1.7406, + "step": 2902 + }, + { + "epoch": 0.4683768957728299, + "grad_norm": 3.2420592308044434, + "learning_rate": 5.775462043207766e-05, + "loss": 2.1973, + "step": 2903 + }, + { + "epoch": 0.46853823814133594, + "grad_norm": 4.705362796783447, + "learning_rate": 5.7728807836923624e-05, + "loss": 1.8936, + "step": 2904 + }, + { + "epoch": 0.4686995805098419, + "grad_norm": 3.618006944656372, + "learning_rate": 5.770299313132334e-05, + "loss": 2.0016, + "step": 2905 + }, + { + "epoch": 0.46886092287834785, + "grad_norm": 3.33437442779541, + "learning_rate": 5.767717632232579e-05, + "loss": 1.8208, + "step": 2906 + }, + { + "epoch": 0.4690222652468538, + "grad_norm": 3.5410573482513428, + "learning_rate": 5.7651357416980575e-05, + "loss": 2.1685, + "step": 2907 + }, + { + "epoch": 0.4691836076153598, + "grad_norm": 4.212526798248291, + "learning_rate": 5.76255364223379e-05, + "loss": 1.9039, + "step": 2908 + }, + { + "epoch": 0.4693449499838658, + "grad_norm": 3.523818016052246, + "learning_rate": 5.759971334544847e-05, + "loss": 1.97, + "step": 2909 + }, + { + "epoch": 0.46950629235237173, + "grad_norm": 4.60017728805542, + "learning_rate": 5.7573888193363603e-05, + "loss": 1.8066, + "step": 2910 + }, + { + "epoch": 0.4696676347208777, + "grad_norm": 4.738743782043457, + "learning_rate": 5.754806097313516e-05, + "loss": 1.8601, + "step": 2911 + }, + { + "epoch": 0.46982897708938365, + "grad_norm": 4.0911478996276855, + "learning_rate": 5.752223169181563e-05, + "loss": 2.3781, + "step": 2912 + }, + { + "epoch": 0.46999031945788966, + "grad_norm": 4.628448009490967, + "learning_rate": 5.749640035645798e-05, + "loss": 2.0009, + "step": 2913 + }, + { + "epoch": 0.4701516618263956, + "grad_norm": 6.537326812744141, + "learning_rate": 5.747056697411577e-05, + "loss": 1.755, + "step": 2914 + }, + { + "epoch": 0.4703130041949016, + "grad_norm": 4.529343128204346, + "learning_rate": 5.7444731551843145e-05, + "loss": 1.8524, + "step": 2915 + }, + { + "epoch": 0.47047434656340753, + "grad_norm": 6.468570709228516, + "learning_rate": 5.7418894096694785e-05, + "loss": 1.9736, + "step": 2916 + }, + { + "epoch": 0.47063568893191354, + "grad_norm": 4.821703910827637, + "learning_rate": 5.739305461572591e-05, + "loss": 1.9003, + "step": 2917 + }, + { + "epoch": 0.4707970313004195, + "grad_norm": 4.382984161376953, + "learning_rate": 5.736721311599232e-05, + "loss": 1.9844, + "step": 2918 + }, + { + "epoch": 0.47095837366892546, + "grad_norm": 5.662232398986816, + "learning_rate": 5.734136960455035e-05, + "loss": 2.1265, + "step": 2919 + }, + { + "epoch": 0.4711197160374314, + "grad_norm": 4.168646812438965, + "learning_rate": 5.731552408845689e-05, + "loss": 2.1374, + "step": 2920 + }, + { + "epoch": 0.4712810584059374, + "grad_norm": 4.660909175872803, + "learning_rate": 5.728967657476936e-05, + "loss": 1.6488, + "step": 2921 + }, + { + "epoch": 0.4714424007744434, + "grad_norm": 4.780895709991455, + "learning_rate": 5.7263827070545775e-05, + "loss": 1.76, + "step": 2922 + }, + { + "epoch": 0.47160374314294934, + "grad_norm": 5.367981910705566, + "learning_rate": 5.723797558284464e-05, + "loss": 2.1334, + "step": 2923 + }, + { + "epoch": 0.4717650855114553, + "grad_norm": 4.828367710113525, + "learning_rate": 5.721212211872502e-05, + "loss": 1.9658, + "step": 2924 + }, + { + "epoch": 0.47192642787996125, + "grad_norm": 4.586594581604004, + "learning_rate": 5.718626668524655e-05, + "loss": 1.7567, + "step": 2925 + }, + { + "epoch": 0.47208777024846726, + "grad_norm": 4.738962650299072, + "learning_rate": 5.716040928946935e-05, + "loss": 1.915, + "step": 2926 + }, + { + "epoch": 0.4722491126169732, + "grad_norm": 3.831724166870117, + "learning_rate": 5.7134549938454095e-05, + "loss": 2.1708, + "step": 2927 + }, + { + "epoch": 0.4724104549854792, + "grad_norm": 4.1255879402160645, + "learning_rate": 5.710868863926202e-05, + "loss": 1.927, + "step": 2928 + }, + { + "epoch": 0.47257179735398513, + "grad_norm": 3.5976734161376953, + "learning_rate": 5.708282539895485e-05, + "loss": 1.9019, + "step": 2929 + }, + { + "epoch": 0.47273313972249115, + "grad_norm": 4.1457600593566895, + "learning_rate": 5.70569602245949e-05, + "loss": 1.9118, + "step": 2930 + }, + { + "epoch": 0.4728944820909971, + "grad_norm": 3.9562318325042725, + "learning_rate": 5.7031093123244925e-05, + "loss": 1.9082, + "step": 2931 + }, + { + "epoch": 0.47305582445950306, + "grad_norm": 3.8960869312286377, + "learning_rate": 5.700522410196828e-05, + "loss": 1.9282, + "step": 2932 + }, + { + "epoch": 0.473217166828009, + "grad_norm": 4.371620178222656, + "learning_rate": 5.697935316782883e-05, + "loss": 1.9379, + "step": 2933 + }, + { + "epoch": 0.47337850919651503, + "grad_norm": 3.91361665725708, + "learning_rate": 5.695348032789093e-05, + "loss": 1.9144, + "step": 2934 + }, + { + "epoch": 0.473539851565021, + "grad_norm": 4.016637325286865, + "learning_rate": 5.692760558921949e-05, + "loss": 1.9151, + "step": 2935 + }, + { + "epoch": 0.47370119393352694, + "grad_norm": 5.121694087982178, + "learning_rate": 5.690172895887993e-05, + "loss": 2.1035, + "step": 2936 + }, + { + "epoch": 0.4738625363020329, + "grad_norm": 5.147009372711182, + "learning_rate": 5.687585044393819e-05, + "loss": 1.8547, + "step": 2937 + }, + { + "epoch": 0.47402387867053886, + "grad_norm": 5.944944381713867, + "learning_rate": 5.684997005146071e-05, + "loss": 2.0241, + "step": 2938 + }, + { + "epoch": 0.47418522103904487, + "grad_norm": 4.913337707519531, + "learning_rate": 5.6824087788514424e-05, + "loss": 1.8984, + "step": 2939 + }, + { + "epoch": 0.4743465634075508, + "grad_norm": 4.556769847869873, + "learning_rate": 5.679820366216684e-05, + "loss": 2.1496, + "step": 2940 + }, + { + "epoch": 0.4745079057760568, + "grad_norm": 4.924295902252197, + "learning_rate": 5.677231767948592e-05, + "loss": 1.9009, + "step": 2941 + }, + { + "epoch": 0.47466924814456274, + "grad_norm": 4.699732303619385, + "learning_rate": 5.674642984754016e-05, + "loss": 1.8048, + "step": 2942 + }, + { + "epoch": 0.47483059051306875, + "grad_norm": 5.167842864990234, + "learning_rate": 5.672054017339855e-05, + "loss": 1.8607, + "step": 2943 + }, + { + "epoch": 0.4749919328815747, + "grad_norm": 4.477108478546143, + "learning_rate": 5.669464866413058e-05, + "loss": 1.9929, + "step": 2944 + }, + { + "epoch": 0.47515327525008066, + "grad_norm": 4.411334037780762, + "learning_rate": 5.666875532680624e-05, + "loss": 2.0333, + "step": 2945 + }, + { + "epoch": 0.4753146176185866, + "grad_norm": 4.497818946838379, + "learning_rate": 5.664286016849604e-05, + "loss": 1.9406, + "step": 2946 + }, + { + "epoch": 0.47547595998709263, + "grad_norm": 5.784490585327148, + "learning_rate": 5.661696319627097e-05, + "loss": 1.9808, + "step": 2947 + }, + { + "epoch": 0.4756373023555986, + "grad_norm": 3.4934520721435547, + "learning_rate": 5.65910644172025e-05, + "loss": 1.8308, + "step": 2948 + }, + { + "epoch": 0.47579864472410455, + "grad_norm": 5.852362155914307, + "learning_rate": 5.656516383836262e-05, + "loss": 2.1092, + "step": 2949 + }, + { + "epoch": 0.4759599870926105, + "grad_norm": 5.923304080963135, + "learning_rate": 5.6539261466823814e-05, + "loss": 1.7185, + "step": 2950 + }, + { + "epoch": 0.4761213294611165, + "grad_norm": 4.36501407623291, + "learning_rate": 5.651335730965902e-05, + "loss": 1.9015, + "step": 2951 + }, + { + "epoch": 0.4762826718296225, + "grad_norm": 5.649499893188477, + "learning_rate": 5.648745137394171e-05, + "loss": 1.9853, + "step": 2952 + }, + { + "epoch": 0.47644401419812843, + "grad_norm": 3.385308265686035, + "learning_rate": 5.646154366674582e-05, + "loss": 1.9835, + "step": 2953 + }, + { + "epoch": 0.4766053565666344, + "grad_norm": 4.42126989364624, + "learning_rate": 5.643563419514576e-05, + "loss": 1.9702, + "step": 2954 + }, + { + "epoch": 0.47676669893514034, + "grad_norm": 4.301641464233398, + "learning_rate": 5.6409722966216436e-05, + "loss": 1.811, + "step": 2955 + }, + { + "epoch": 0.47692804130364636, + "grad_norm": 4.124393463134766, + "learning_rate": 5.638380998703322e-05, + "loss": 2.02, + "step": 2956 + }, + { + "epoch": 0.4770893836721523, + "grad_norm": 4.3341827392578125, + "learning_rate": 5.6357895264671976e-05, + "loss": 1.9654, + "step": 2957 + }, + { + "epoch": 0.47725072604065827, + "grad_norm": 5.749555587768555, + "learning_rate": 5.633197880620904e-05, + "loss": 2.0513, + "step": 2958 + }, + { + "epoch": 0.4774120684091642, + "grad_norm": 4.620724201202393, + "learning_rate": 5.63060606187212e-05, + "loss": 2.005, + "step": 2959 + }, + { + "epoch": 0.47757341077767024, + "grad_norm": 5.979537010192871, + "learning_rate": 5.6280140709285765e-05, + "loss": 2.0375, + "step": 2960 + }, + { + "epoch": 0.4777347531461762, + "grad_norm": 4.459228515625, + "learning_rate": 5.625421908498048e-05, + "loss": 1.948, + "step": 2961 + }, + { + "epoch": 0.47789609551468215, + "grad_norm": 4.105099201202393, + "learning_rate": 5.622829575288355e-05, + "loss": 1.8121, + "step": 2962 + }, + { + "epoch": 0.4780574378831881, + "grad_norm": 6.522353649139404, + "learning_rate": 5.620237072007367e-05, + "loss": 2.0249, + "step": 2963 + }, + { + "epoch": 0.4782187802516941, + "grad_norm": 4.792418479919434, + "learning_rate": 5.617644399363e-05, + "loss": 2.0696, + "step": 2964 + }, + { + "epoch": 0.4783801226202001, + "grad_norm": 4.468806743621826, + "learning_rate": 5.6150515580632146e-05, + "loss": 2.0892, + "step": 2965 + }, + { + "epoch": 0.47854146498870603, + "grad_norm": 3.8313448429107666, + "learning_rate": 5.6124585488160165e-05, + "loss": 2.0339, + "step": 2966 + }, + { + "epoch": 0.478702807357212, + "grad_norm": 3.5952653884887695, + "learning_rate": 5.6098653723294604e-05, + "loss": 1.9474, + "step": 2967 + }, + { + "epoch": 0.47886414972571795, + "grad_norm": 4.359584808349609, + "learning_rate": 5.6072720293116453e-05, + "loss": 1.9697, + "step": 2968 + }, + { + "epoch": 0.47902549209422396, + "grad_norm": 3.3227756023406982, + "learning_rate": 5.604678520470714e-05, + "loss": 1.8438, + "step": 2969 + }, + { + "epoch": 0.4791868344627299, + "grad_norm": 4.508224010467529, + "learning_rate": 5.6020848465148565e-05, + "loss": 1.7845, + "step": 2970 + }, + { + "epoch": 0.4793481768312359, + "grad_norm": 3.500623941421509, + "learning_rate": 5.599491008152309e-05, + "loss": 1.8891, + "step": 2971 + }, + { + "epoch": 0.47950951919974183, + "grad_norm": 5.407317638397217, + "learning_rate": 5.59689700609135e-05, + "loss": 1.9803, + "step": 2972 + }, + { + "epoch": 0.47967086156824784, + "grad_norm": 3.8324880599975586, + "learning_rate": 5.5943028410403034e-05, + "loss": 2.0477, + "step": 2973 + }, + { + "epoch": 0.4798322039367538, + "grad_norm": 3.5237014293670654, + "learning_rate": 5.5917085137075375e-05, + "loss": 1.7231, + "step": 2974 + }, + { + "epoch": 0.47999354630525976, + "grad_norm": 4.768789291381836, + "learning_rate": 5.589114024801468e-05, + "loss": 2.037, + "step": 2975 + }, + { + "epoch": 0.4801548886737657, + "grad_norm": 5.117864608764648, + "learning_rate": 5.586519375030549e-05, + "loss": 2.1124, + "step": 2976 + }, + { + "epoch": 0.4803162310422717, + "grad_norm": 3.719688653945923, + "learning_rate": 5.583924565103283e-05, + "loss": 1.9257, + "step": 2977 + }, + { + "epoch": 0.4804775734107777, + "grad_norm": 3.956259250640869, + "learning_rate": 5.5813295957282155e-05, + "loss": 1.8487, + "step": 2978 + }, + { + "epoch": 0.48063891577928364, + "grad_norm": 4.621438503265381, + "learning_rate": 5.578734467613933e-05, + "loss": 2.2352, + "step": 2979 + }, + { + "epoch": 0.4808002581477896, + "grad_norm": 6.483243465423584, + "learning_rate": 5.576139181469069e-05, + "loss": 2.1027, + "step": 2980 + }, + { + "epoch": 0.4809616005162956, + "grad_norm": 5.146411418914795, + "learning_rate": 5.573543738002298e-05, + "loss": 2.0156, + "step": 2981 + }, + { + "epoch": 0.48112294288480156, + "grad_norm": 4.754197597503662, + "learning_rate": 5.570948137922336e-05, + "loss": 2.1035, + "step": 2982 + }, + { + "epoch": 0.4812842852533075, + "grad_norm": 3.280088186264038, + "learning_rate": 5.568352381937947e-05, + "loss": 1.8788, + "step": 2983 + }, + { + "epoch": 0.4814456276218135, + "grad_norm": 6.88657283782959, + "learning_rate": 5.5657564707579315e-05, + "loss": 1.8554, + "step": 2984 + }, + { + "epoch": 0.48160696999031943, + "grad_norm": 5.31023645401001, + "learning_rate": 5.563160405091136e-05, + "loss": 1.7064, + "step": 2985 + }, + { + "epoch": 0.48176831235882545, + "grad_norm": 4.084466934204102, + "learning_rate": 5.5605641856464483e-05, + "loss": 1.9841, + "step": 2986 + }, + { + "epoch": 0.4819296547273314, + "grad_norm": 3.7623586654663086, + "learning_rate": 5.557967813132797e-05, + "loss": 1.9674, + "step": 2987 + }, + { + "epoch": 0.48209099709583736, + "grad_norm": 5.303673267364502, + "learning_rate": 5.555371288259155e-05, + "loss": 2.184, + "step": 2988 + }, + { + "epoch": 0.4822523394643433, + "grad_norm": 3.9341800212860107, + "learning_rate": 5.552774611734535e-05, + "loss": 2.0964, + "step": 2989 + }, + { + "epoch": 0.48241368183284933, + "grad_norm": 3.611407518386841, + "learning_rate": 5.550177784267991e-05, + "loss": 1.8166, + "step": 2990 + }, + { + "epoch": 0.4825750242013553, + "grad_norm": 4.035516262054443, + "learning_rate": 5.547580806568621e-05, + "loss": 1.7636, + "step": 2991 + }, + { + "epoch": 0.48273636656986124, + "grad_norm": 4.942079544067383, + "learning_rate": 5.544983679345559e-05, + "loss": 2.0076, + "step": 2992 + }, + { + "epoch": 0.4828977089383672, + "grad_norm": 3.7774479389190674, + "learning_rate": 5.542386403307984e-05, + "loss": 2.0257, + "step": 2993 + }, + { + "epoch": 0.4830590513068732, + "grad_norm": 5.655243873596191, + "learning_rate": 5.5397889791651145e-05, + "loss": 1.774, + "step": 2994 + }, + { + "epoch": 0.48322039367537917, + "grad_norm": 6.693843841552734, + "learning_rate": 5.5371914076262085e-05, + "loss": 2.0392, + "step": 2995 + }, + { + "epoch": 0.4833817360438851, + "grad_norm": 6.32671594619751, + "learning_rate": 5.534593689400565e-05, + "loss": 1.8613, + "step": 2996 + }, + { + "epoch": 0.4835430784123911, + "grad_norm": 5.015989780426025, + "learning_rate": 5.531995825197522e-05, + "loss": 1.8816, + "step": 2997 + }, + { + "epoch": 0.48370442078089704, + "grad_norm": 4.0533952713012695, + "learning_rate": 5.5293978157264605e-05, + "loss": 1.7802, + "step": 2998 + }, + { + "epoch": 0.48386576314940305, + "grad_norm": 3.7793238162994385, + "learning_rate": 5.5267996616967966e-05, + "loss": 2.1053, + "step": 2999 + }, + { + "epoch": 0.484027105517909, + "grad_norm": 7.2383222579956055, + "learning_rate": 5.524201363817991e-05, + "loss": 2.0855, + "step": 3000 + }, + { + "epoch": 0.48418844788641496, + "grad_norm": 6.114560127258301, + "learning_rate": 5.521602922799539e-05, + "loss": 1.9759, + "step": 3001 + }, + { + "epoch": 0.4843497902549209, + "grad_norm": 6.07392692565918, + "learning_rate": 5.519004339350977e-05, + "loss": 2.1192, + "step": 3002 + }, + { + "epoch": 0.48451113262342693, + "grad_norm": 4.475742340087891, + "learning_rate": 5.516405614181883e-05, + "loss": 1.8879, + "step": 3003 + }, + { + "epoch": 0.4846724749919329, + "grad_norm": 3.745962619781494, + "learning_rate": 5.513806748001866e-05, + "loss": 2.0256, + "step": 3004 + }, + { + "epoch": 0.48483381736043885, + "grad_norm": 4.5964508056640625, + "learning_rate": 5.5112077415205834e-05, + "loss": 1.9467, + "step": 3005 + }, + { + "epoch": 0.4849951597289448, + "grad_norm": 6.5591936111450195, + "learning_rate": 5.508608595447724e-05, + "loss": 2.0158, + "step": 3006 + }, + { + "epoch": 0.4851565020974508, + "grad_norm": 4.011103630065918, + "learning_rate": 5.506009310493014e-05, + "loss": 1.945, + "step": 3007 + }, + { + "epoch": 0.48531784446595677, + "grad_norm": 4.502754211425781, + "learning_rate": 5.5034098873662244e-05, + "loss": 1.9072, + "step": 3008 + }, + { + "epoch": 0.48547918683446273, + "grad_norm": 4.582625865936279, + "learning_rate": 5.5008103267771585e-05, + "loss": 1.8755, + "step": 3009 + }, + { + "epoch": 0.4856405292029687, + "grad_norm": 6.4384989738464355, + "learning_rate": 5.498210629435656e-05, + "loss": 2.1877, + "step": 3010 + }, + { + "epoch": 0.48580187157147464, + "grad_norm": 4.0016255378723145, + "learning_rate": 5.495610796051599e-05, + "loss": 1.9966, + "step": 3011 + }, + { + "epoch": 0.48596321393998065, + "grad_norm": 5.0590434074401855, + "learning_rate": 5.493010827334904e-05, + "loss": 2.0301, + "step": 3012 + }, + { + "epoch": 0.4861245563084866, + "grad_norm": 5.322793006896973, + "learning_rate": 5.490410723995524e-05, + "loss": 1.7476, + "step": 3013 + }, + { + "epoch": 0.48628589867699257, + "grad_norm": 3.817650556564331, + "learning_rate": 5.487810486743448e-05, + "loss": 1.8565, + "step": 3014 + }, + { + "epoch": 0.4864472410454985, + "grad_norm": 4.500606536865234, + "learning_rate": 5.485210116288704e-05, + "loss": 2.0309, + "step": 3015 + }, + { + "epoch": 0.48660858341400454, + "grad_norm": 5.041215896606445, + "learning_rate": 5.482609613341355e-05, + "loss": 2.004, + "step": 3016 + }, + { + "epoch": 0.4867699257825105, + "grad_norm": 4.371964454650879, + "learning_rate": 5.4800089786115e-05, + "loss": 1.6451, + "step": 3017 + }, + { + "epoch": 0.48693126815101645, + "grad_norm": 3.7129971981048584, + "learning_rate": 5.477408212809277e-05, + "loss": 1.8587, + "step": 3018 + }, + { + "epoch": 0.4870926105195224, + "grad_norm": 3.990514039993286, + "learning_rate": 5.4748073166448545e-05, + "loss": 1.9835, + "step": 3019 + }, + { + "epoch": 0.4872539528880284, + "grad_norm": 3.36611270904541, + "learning_rate": 5.472206290828438e-05, + "loss": 1.8937, + "step": 3020 + }, + { + "epoch": 0.4874152952565344, + "grad_norm": 3.641436815261841, + "learning_rate": 5.4696051360702725e-05, + "loss": 2.0468, + "step": 3021 + }, + { + "epoch": 0.48757663762504033, + "grad_norm": 3.7677080631256104, + "learning_rate": 5.467003853080634e-05, + "loss": 1.9942, + "step": 3022 + }, + { + "epoch": 0.4877379799935463, + "grad_norm": 3.343273639678955, + "learning_rate": 5.464402442569837e-05, + "loss": 1.9385, + "step": 3023 + }, + { + "epoch": 0.4878993223620523, + "grad_norm": 5.741142749786377, + "learning_rate": 5.461800905248225e-05, + "loss": 1.9521, + "step": 3024 + }, + { + "epoch": 0.48806066473055826, + "grad_norm": 4.08474588394165, + "learning_rate": 5.459199241826183e-05, + "loss": 2.1862, + "step": 3025 + }, + { + "epoch": 0.4882220070990642, + "grad_norm": 3.6710474491119385, + "learning_rate": 5.456597453014125e-05, + "loss": 2.0495, + "step": 3026 + }, + { + "epoch": 0.48838334946757017, + "grad_norm": 4.435654640197754, + "learning_rate": 5.453995539522503e-05, + "loss": 1.8065, + "step": 3027 + }, + { + "epoch": 0.48854469183607613, + "grad_norm": 4.730984210968018, + "learning_rate": 5.451393502061801e-05, + "loss": 1.7733, + "step": 3028 + }, + { + "epoch": 0.48870603420458214, + "grad_norm": 3.4625661373138428, + "learning_rate": 5.448791341342538e-05, + "loss": 1.9444, + "step": 3029 + }, + { + "epoch": 0.4888673765730881, + "grad_norm": 4.53637170791626, + "learning_rate": 5.446189058075265e-05, + "loss": 1.9885, + "step": 3030 + }, + { + "epoch": 0.48902871894159405, + "grad_norm": 5.451572895050049, + "learning_rate": 5.4435866529705706e-05, + "loss": 1.8764, + "step": 3031 + }, + { + "epoch": 0.4891900613101, + "grad_norm": 6.030241012573242, + "learning_rate": 5.4409841267390684e-05, + "loss": 2.2974, + "step": 3032 + }, + { + "epoch": 0.489351403678606, + "grad_norm": 3.6585793495178223, + "learning_rate": 5.4383814800914135e-05, + "loss": 2.0521, + "step": 3033 + }, + { + "epoch": 0.489512746047112, + "grad_norm": 4.740541934967041, + "learning_rate": 5.435778713738292e-05, + "loss": 1.9246, + "step": 3034 + }, + { + "epoch": 0.48967408841561794, + "grad_norm": 4.058865547180176, + "learning_rate": 5.433175828390418e-05, + "loss": 2.0038, + "step": 3035 + }, + { + "epoch": 0.4898354307841239, + "grad_norm": 4.699848651885986, + "learning_rate": 5.430572824758543e-05, + "loss": 1.8504, + "step": 3036 + }, + { + "epoch": 0.4899967731526299, + "grad_norm": 3.6751012802124023, + "learning_rate": 5.4279697035534496e-05, + "loss": 2.1574, + "step": 3037 + }, + { + "epoch": 0.49015811552113586, + "grad_norm": 4.686280250549316, + "learning_rate": 5.4253664654859515e-05, + "loss": 1.8434, + "step": 3038 + }, + { + "epoch": 0.4903194578896418, + "grad_norm": 7.2674880027771, + "learning_rate": 5.4227631112668955e-05, + "loss": 1.946, + "step": 3039 + }, + { + "epoch": 0.4904808002581478, + "grad_norm": 4.655464172363281, + "learning_rate": 5.4201596416071585e-05, + "loss": 1.9058, + "step": 3040 + }, + { + "epoch": 0.49064214262665373, + "grad_norm": 5.856889724731445, + "learning_rate": 5.417556057217652e-05, + "loss": 1.8253, + "step": 3041 + }, + { + "epoch": 0.49080348499515974, + "grad_norm": 5.061101913452148, + "learning_rate": 5.4149523588093156e-05, + "loss": 1.9645, + "step": 3042 + }, + { + "epoch": 0.4909648273636657, + "grad_norm": 5.176707744598389, + "learning_rate": 5.41234854709312e-05, + "loss": 1.7489, + "step": 3043 + }, + { + "epoch": 0.49112616973217166, + "grad_norm": 4.296136856079102, + "learning_rate": 5.4097446227800716e-05, + "loss": 1.9497, + "step": 3044 + }, + { + "epoch": 0.4912875121006776, + "grad_norm": 4.513986587524414, + "learning_rate": 5.4071405865812e-05, + "loss": 1.6907, + "step": 3045 + }, + { + "epoch": 0.4914488544691836, + "grad_norm": 3.427591323852539, + "learning_rate": 5.404536439207571e-05, + "loss": 1.9645, + "step": 3046 + }, + { + "epoch": 0.4916101968376896, + "grad_norm": 4.151731967926025, + "learning_rate": 5.401932181370281e-05, + "loss": 1.9425, + "step": 3047 + }, + { + "epoch": 0.49177153920619554, + "grad_norm": 5.4362711906433105, + "learning_rate": 5.3993278137804505e-05, + "loss": 1.9941, + "step": 3048 + }, + { + "epoch": 0.4919328815747015, + "grad_norm": 5.981296539306641, + "learning_rate": 5.3967233371492385e-05, + "loss": 1.9203, + "step": 3049 + }, + { + "epoch": 0.4920942239432075, + "grad_norm": 5.723951816558838, + "learning_rate": 5.3941187521878265e-05, + "loss": 1.9091, + "step": 3050 + }, + { + "epoch": 0.49225556631171347, + "grad_norm": 7.49515962600708, + "learning_rate": 5.391514059607431e-05, + "loss": 2.1372, + "step": 3051 + }, + { + "epoch": 0.4924169086802194, + "grad_norm": 4.525052547454834, + "learning_rate": 5.388909260119295e-05, + "loss": 1.9937, + "step": 3052 + }, + { + "epoch": 0.4925782510487254, + "grad_norm": 5.0868611335754395, + "learning_rate": 5.386304354434688e-05, + "loss": 1.8576, + "step": 3053 + }, + { + "epoch": 0.4927395934172314, + "grad_norm": 4.022094249725342, + "learning_rate": 5.383699343264915e-05, + "loss": 1.7156, + "step": 3054 + }, + { + "epoch": 0.49290093578573735, + "grad_norm": 4.555119037628174, + "learning_rate": 5.381094227321305e-05, + "loss": 2.0519, + "step": 3055 + }, + { + "epoch": 0.4930622781542433, + "grad_norm": 6.8928022384643555, + "learning_rate": 5.3784890073152184e-05, + "loss": 1.9971, + "step": 3056 + }, + { + "epoch": 0.49322362052274926, + "grad_norm": 3.7264394760131836, + "learning_rate": 5.375883683958041e-05, + "loss": 1.9227, + "step": 3057 + }, + { + "epoch": 0.4933849628912552, + "grad_norm": 6.371485710144043, + "learning_rate": 5.3732782579611885e-05, + "loss": 2.0955, + "step": 3058 + }, + { + "epoch": 0.49354630525976123, + "grad_norm": 5.1360063552856445, + "learning_rate": 5.370672730036105e-05, + "loss": 2.2286, + "step": 3059 + }, + { + "epoch": 0.4937076476282672, + "grad_norm": 4.728470802307129, + "learning_rate": 5.368067100894263e-05, + "loss": 2.0495, + "step": 3060 + }, + { + "epoch": 0.49386898999677314, + "grad_norm": 5.989841461181641, + "learning_rate": 5.36546137124716e-05, + "loss": 2.1039, + "step": 3061 + }, + { + "epoch": 0.4940303323652791, + "grad_norm": 4.98484468460083, + "learning_rate": 5.362855541806324e-05, + "loss": 1.7701, + "step": 3062 + }, + { + "epoch": 0.4941916747337851, + "grad_norm": 4.288937568664551, + "learning_rate": 5.360249613283308e-05, + "loss": 2.1159, + "step": 3063 + }, + { + "epoch": 0.49435301710229107, + "grad_norm": 4.535952091217041, + "learning_rate": 5.357643586389693e-05, + "loss": 2.1569, + "step": 3064 + }, + { + "epoch": 0.494514359470797, + "grad_norm": 4.4559102058410645, + "learning_rate": 5.355037461837088e-05, + "loss": 2.1523, + "step": 3065 + }, + { + "epoch": 0.494675701839303, + "grad_norm": 5.075308799743652, + "learning_rate": 5.3524312403371257e-05, + "loss": 1.8285, + "step": 3066 + }, + { + "epoch": 0.494837044207809, + "grad_norm": 4.463019847869873, + "learning_rate": 5.349824922601467e-05, + "loss": 2.0525, + "step": 3067 + }, + { + "epoch": 0.49499838657631495, + "grad_norm": 6.228049278259277, + "learning_rate": 5.3472185093418e-05, + "loss": 2.192, + "step": 3068 + }, + { + "epoch": 0.4951597289448209, + "grad_norm": 5.267539024353027, + "learning_rate": 5.34461200126984e-05, + "loss": 1.9324, + "step": 3069 + }, + { + "epoch": 0.49532107131332687, + "grad_norm": 4.502386093139648, + "learning_rate": 5.342005399097323e-05, + "loss": 2.1786, + "step": 3070 + }, + { + "epoch": 0.4954824136818328, + "grad_norm": 3.724005937576294, + "learning_rate": 5.339398703536014e-05, + "loss": 1.9359, + "step": 3071 + }, + { + "epoch": 0.49564375605033884, + "grad_norm": 3.993429183959961, + "learning_rate": 5.336791915297705e-05, + "loss": 1.9509, + "step": 3072 + }, + { + "epoch": 0.4958050984188448, + "grad_norm": 3.7583045959472656, + "learning_rate": 5.33418503509421e-05, + "loss": 1.9695, + "step": 3073 + }, + { + "epoch": 0.49596644078735075, + "grad_norm": 5.138786792755127, + "learning_rate": 5.331578063637371e-05, + "loss": 1.8942, + "step": 3074 + }, + { + "epoch": 0.4961277831558567, + "grad_norm": 3.136106014251709, + "learning_rate": 5.3289710016390535e-05, + "loss": 2.0119, + "step": 3075 + }, + { + "epoch": 0.4962891255243627, + "grad_norm": 3.523416757583618, + "learning_rate": 5.326363849811148e-05, + "loss": 1.6708, + "step": 3076 + }, + { + "epoch": 0.4964504678928687, + "grad_norm": 6.248582363128662, + "learning_rate": 5.3237566088655686e-05, + "loss": 1.9164, + "step": 3077 + }, + { + "epoch": 0.49661181026137463, + "grad_norm": 5.099051475524902, + "learning_rate": 5.321149279514256e-05, + "loss": 2.0021, + "step": 3078 + }, + { + "epoch": 0.4967731526298806, + "grad_norm": 4.510593414306641, + "learning_rate": 5.318541862469172e-05, + "loss": 1.9441, + "step": 3079 + }, + { + "epoch": 0.4969344949983866, + "grad_norm": 3.972517251968384, + "learning_rate": 5.315934358442306e-05, + "loss": 1.8179, + "step": 3080 + }, + { + "epoch": 0.49709583736689256, + "grad_norm": 4.122412204742432, + "learning_rate": 5.313326768145668e-05, + "loss": 2.0284, + "step": 3081 + }, + { + "epoch": 0.4972571797353985, + "grad_norm": 4.420566558837891, + "learning_rate": 5.310719092291292e-05, + "loss": 1.9152, + "step": 3082 + }, + { + "epoch": 0.49741852210390447, + "grad_norm": 4.935243129730225, + "learning_rate": 5.308111331591237e-05, + "loss": 1.9614, + "step": 3083 + }, + { + "epoch": 0.4975798644724105, + "grad_norm": 3.7829785346984863, + "learning_rate": 5.3055034867575826e-05, + "loss": 2.0301, + "step": 3084 + }, + { + "epoch": 0.49774120684091644, + "grad_norm": 5.601998805999756, + "learning_rate": 5.302895558502435e-05, + "loss": 1.9055, + "step": 3085 + }, + { + "epoch": 0.4979025492094224, + "grad_norm": 3.415437936782837, + "learning_rate": 5.300287547537921e-05, + "loss": 1.8584, + "step": 3086 + }, + { + "epoch": 0.49806389157792835, + "grad_norm": 3.8595170974731445, + "learning_rate": 5.297679454576189e-05, + "loss": 1.8926, + "step": 3087 + }, + { + "epoch": 0.4982252339464343, + "grad_norm": 5.454835891723633, + "learning_rate": 5.295071280329411e-05, + "loss": 2.0281, + "step": 3088 + }, + { + "epoch": 0.4983865763149403, + "grad_norm": 4.356043815612793, + "learning_rate": 5.292463025509783e-05, + "loss": 1.8235, + "step": 3089 + }, + { + "epoch": 0.4985479186834463, + "grad_norm": 4.273855686187744, + "learning_rate": 5.2898546908295196e-05, + "loss": 1.9884, + "step": 3090 + }, + { + "epoch": 0.49870926105195224, + "grad_norm": 4.208919048309326, + "learning_rate": 5.287246277000859e-05, + "loss": 1.9723, + "step": 3091 + }, + { + "epoch": 0.4988706034204582, + "grad_norm": 4.538530349731445, + "learning_rate": 5.284637784736059e-05, + "loss": 1.8405, + "step": 3092 + }, + { + "epoch": 0.4990319457889642, + "grad_norm": 4.146410942077637, + "learning_rate": 5.282029214747404e-05, + "loss": 1.9356, + "step": 3093 + }, + { + "epoch": 0.49919328815747016, + "grad_norm": 3.9748480319976807, + "learning_rate": 5.279420567747195e-05, + "loss": 1.8358, + "step": 3094 + }, + { + "epoch": 0.4993546305259761, + "grad_norm": 3.9157555103302, + "learning_rate": 5.276811844447754e-05, + "loss": 1.9643, + "step": 3095 + }, + { + "epoch": 0.4995159728944821, + "grad_norm": 5.487942218780518, + "learning_rate": 5.274203045561426e-05, + "loss": 2.1133, + "step": 3096 + }, + { + "epoch": 0.4996773152629881, + "grad_norm": 4.531738758087158, + "learning_rate": 5.2715941718005747e-05, + "loss": 1.7476, + "step": 3097 + }, + { + "epoch": 0.49983865763149404, + "grad_norm": 5.055044174194336, + "learning_rate": 5.268985223877586e-05, + "loss": 1.9826, + "step": 3098 + }, + { + "epoch": 0.5, + "grad_norm": 3.6202011108398438, + "learning_rate": 5.266376202504866e-05, + "loss": 1.9412, + "step": 3099 + }, + { + "epoch": 0.500161342368506, + "grad_norm": 3.817711591720581, + "learning_rate": 5.263767108394839e-05, + "loss": 1.8635, + "step": 3100 + }, + { + "epoch": 0.5003226847370119, + "grad_norm": 5.228734493255615, + "learning_rate": 5.261157942259951e-05, + "loss": 2.0059, + "step": 3101 + }, + { + "epoch": 0.5004840271055179, + "grad_norm": 3.718454360961914, + "learning_rate": 5.258548704812667e-05, + "loss": 1.7086, + "step": 3102 + }, + { + "epoch": 0.5006453694740238, + "grad_norm": 3.11460018157959, + "learning_rate": 5.255939396765471e-05, + "loss": 1.7851, + "step": 3103 + }, + { + "epoch": 0.5008067118425299, + "grad_norm": 4.518640041351318, + "learning_rate": 5.253330018830868e-05, + "loss": 1.8237, + "step": 3104 + }, + { + "epoch": 0.5009680542110359, + "grad_norm": 7.6932148933410645, + "learning_rate": 5.250720571721378e-05, + "loss": 2.1752, + "step": 3105 + }, + { + "epoch": 0.5011293965795418, + "grad_norm": 5.212931156158447, + "learning_rate": 5.248111056149545e-05, + "loss": 1.7233, + "step": 3106 + }, + { + "epoch": 0.5012907389480478, + "grad_norm": 5.633206844329834, + "learning_rate": 5.2455014728279304e-05, + "loss": 1.8938, + "step": 3107 + }, + { + "epoch": 0.5014520813165537, + "grad_norm": 3.8108067512512207, + "learning_rate": 5.2428918224691107e-05, + "loss": 1.8824, + "step": 3108 + }, + { + "epoch": 0.5016134236850597, + "grad_norm": 4.1044087409973145, + "learning_rate": 5.240282105785683e-05, + "loss": 1.7429, + "step": 3109 + }, + { + "epoch": 0.5017747660535656, + "grad_norm": 6.116450309753418, + "learning_rate": 5.237672323490266e-05, + "loss": 1.8687, + "step": 3110 + }, + { + "epoch": 0.5019361084220716, + "grad_norm": 5.414731502532959, + "learning_rate": 5.2350624762954884e-05, + "loss": 1.8503, + "step": 3111 + }, + { + "epoch": 0.5020974507905777, + "grad_norm": 4.763272762298584, + "learning_rate": 5.232452564914004e-05, + "loss": 2.0168, + "step": 3112 + }, + { + "epoch": 0.5022587931590836, + "grad_norm": 4.471733093261719, + "learning_rate": 5.2298425900584805e-05, + "loss": 1.8949, + "step": 3113 + }, + { + "epoch": 0.5024201355275896, + "grad_norm": 3.62532901763916, + "learning_rate": 5.2272325524416034e-05, + "loss": 1.7583, + "step": 3114 + }, + { + "epoch": 0.5025814778960955, + "grad_norm": 4.908361911773682, + "learning_rate": 5.2246224527760765e-05, + "loss": 2.134, + "step": 3115 + }, + { + "epoch": 0.5027428202646015, + "grad_norm": 5.122865676879883, + "learning_rate": 5.22201229177462e-05, + "loss": 2.0385, + "step": 3116 + }, + { + "epoch": 0.5029041626331074, + "grad_norm": 7.8537774085998535, + "learning_rate": 5.219402070149968e-05, + "loss": 1.8947, + "step": 3117 + }, + { + "epoch": 0.5030655050016134, + "grad_norm": 6.622297763824463, + "learning_rate": 5.2167917886148765e-05, + "loss": 1.8418, + "step": 3118 + }, + { + "epoch": 0.5032268473701194, + "grad_norm": 5.4166765213012695, + "learning_rate": 5.2141814478821146e-05, + "loss": 2.0358, + "step": 3119 + }, + { + "epoch": 0.5033881897386253, + "grad_norm": 5.115591526031494, + "learning_rate": 5.211571048664469e-05, + "loss": 1.8973, + "step": 3120 + }, + { + "epoch": 0.5035495321071314, + "grad_norm": 4.430712699890137, + "learning_rate": 5.2089605916747374e-05, + "loss": 1.9662, + "step": 3121 + }, + { + "epoch": 0.5037108744756373, + "grad_norm": 5.537373065948486, + "learning_rate": 5.20635007762574e-05, + "loss": 2.0732, + "step": 3122 + }, + { + "epoch": 0.5038722168441433, + "grad_norm": 3.0885260105133057, + "learning_rate": 5.203739507230311e-05, + "loss": 1.8567, + "step": 3123 + }, + { + "epoch": 0.5040335592126493, + "grad_norm": 4.156231880187988, + "learning_rate": 5.201128881201296e-05, + "loss": 1.6745, + "step": 3124 + }, + { + "epoch": 0.5041949015811552, + "grad_norm": 4.524260520935059, + "learning_rate": 5.1985182002515595e-05, + "loss": 2.0716, + "step": 3125 + }, + { + "epoch": 0.5043562439496612, + "grad_norm": 4.900241851806641, + "learning_rate": 5.195907465093982e-05, + "loss": 1.7979, + "step": 3126 + }, + { + "epoch": 0.5045175863181671, + "grad_norm": 3.2450215816497803, + "learning_rate": 5.1932966764414545e-05, + "loss": 1.8963, + "step": 3127 + }, + { + "epoch": 0.5046789286866731, + "grad_norm": 6.959305286407471, + "learning_rate": 5.190685835006888e-05, + "loss": 2.1063, + "step": 3128 + }, + { + "epoch": 0.504840271055179, + "grad_norm": 5.322142124176025, + "learning_rate": 5.188074941503203e-05, + "loss": 2.1626, + "step": 3129 + }, + { + "epoch": 0.5050016134236851, + "grad_norm": 4.897068500518799, + "learning_rate": 5.185463996643335e-05, + "loss": 2.1379, + "step": 3130 + }, + { + "epoch": 0.5051629557921911, + "grad_norm": 4.031672477722168, + "learning_rate": 5.182853001140235e-05, + "loss": 1.9897, + "step": 3131 + }, + { + "epoch": 0.505324298160697, + "grad_norm": 3.338622570037842, + "learning_rate": 5.180241955706872e-05, + "loss": 2.0872, + "step": 3132 + }, + { + "epoch": 0.505485640529203, + "grad_norm": 4.088404655456543, + "learning_rate": 5.1776308610562175e-05, + "loss": 1.9367, + "step": 3133 + }, + { + "epoch": 0.5056469828977089, + "grad_norm": 4.4865217208862305, + "learning_rate": 5.175019717901267e-05, + "loss": 1.9833, + "step": 3134 + }, + { + "epoch": 0.5058083252662149, + "grad_norm": 5.236238956451416, + "learning_rate": 5.172408526955025e-05, + "loss": 1.9675, + "step": 3135 + }, + { + "epoch": 0.5059696676347208, + "grad_norm": 4.071722984313965, + "learning_rate": 5.169797288930508e-05, + "loss": 1.9855, + "step": 3136 + }, + { + "epoch": 0.5061310100032268, + "grad_norm": 3.554135322570801, + "learning_rate": 5.1671860045407484e-05, + "loss": 2.0495, + "step": 3137 + }, + { + "epoch": 0.5062923523717329, + "grad_norm": 4.239428520202637, + "learning_rate": 5.164574674498788e-05, + "loss": 1.9573, + "step": 3138 + }, + { + "epoch": 0.5064536947402388, + "grad_norm": 4.632482528686523, + "learning_rate": 5.1619632995176845e-05, + "loss": 1.8922, + "step": 3139 + }, + { + "epoch": 0.5066150371087448, + "grad_norm": 4.101121425628662, + "learning_rate": 5.1593518803105055e-05, + "loss": 1.8699, + "step": 3140 + }, + { + "epoch": 0.5067763794772507, + "grad_norm": 4.697978973388672, + "learning_rate": 5.1567404175903286e-05, + "loss": 1.961, + "step": 3141 + }, + { + "epoch": 0.5069377218457567, + "grad_norm": 3.9164650440216064, + "learning_rate": 5.15412891207025e-05, + "loss": 1.8541, + "step": 3142 + }, + { + "epoch": 0.5070990642142627, + "grad_norm": 4.599571228027344, + "learning_rate": 5.151517364463371e-05, + "loss": 1.9727, + "step": 3143 + }, + { + "epoch": 0.5072604065827686, + "grad_norm": 5.059162139892578, + "learning_rate": 5.1489057754828075e-05, + "loss": 1.9445, + "step": 3144 + }, + { + "epoch": 0.5074217489512746, + "grad_norm": 3.971083641052246, + "learning_rate": 5.146294145841687e-05, + "loss": 1.88, + "step": 3145 + }, + { + "epoch": 0.5075830913197805, + "grad_norm": 4.057460308074951, + "learning_rate": 5.1436824762531444e-05, + "loss": 2.0365, + "step": 3146 + }, + { + "epoch": 0.5077444336882866, + "grad_norm": 3.408555507659912, + "learning_rate": 5.14107076743033e-05, + "loss": 1.9953, + "step": 3147 + }, + { + "epoch": 0.5079057760567925, + "grad_norm": 4.3520660400390625, + "learning_rate": 5.1384590200864047e-05, + "loss": 2.0391, + "step": 3148 + }, + { + "epoch": 0.5080671184252985, + "grad_norm": 4.447436332702637, + "learning_rate": 5.1358472349345366e-05, + "loss": 1.8032, + "step": 3149 + }, + { + "epoch": 0.5082284607938045, + "grad_norm": 3.5266644954681396, + "learning_rate": 5.1332354126879055e-05, + "loss": 1.8932, + "step": 3150 + }, + { + "epoch": 0.5083898031623104, + "grad_norm": 4.616915702819824, + "learning_rate": 5.1306235540597016e-05, + "loss": 1.9985, + "step": 3151 + }, + { + "epoch": 0.5085511455308164, + "grad_norm": 4.386703968048096, + "learning_rate": 5.128011659763125e-05, + "loss": 1.8963, + "step": 3152 + }, + { + "epoch": 0.5087124878993223, + "grad_norm": 3.514730215072632, + "learning_rate": 5.125399730511388e-05, + "loss": 2.1354, + "step": 3153 + }, + { + "epoch": 0.5088738302678283, + "grad_norm": 4.217443943023682, + "learning_rate": 5.1227877670177084e-05, + "loss": 1.8325, + "step": 3154 + }, + { + "epoch": 0.5090351726363344, + "grad_norm": 4.520450592041016, + "learning_rate": 5.1201757699953134e-05, + "loss": 1.9591, + "step": 3155 + }, + { + "epoch": 0.5091965150048403, + "grad_norm": 4.804542064666748, + "learning_rate": 5.117563740157444e-05, + "loss": 1.9719, + "step": 3156 + }, + { + "epoch": 0.5093578573733463, + "grad_norm": 4.119200229644775, + "learning_rate": 5.1149516782173465e-05, + "loss": 2.1569, + "step": 3157 + }, + { + "epoch": 0.5095191997418522, + "grad_norm": 5.18499231338501, + "learning_rate": 5.112339584888275e-05, + "loss": 2.0214, + "step": 3158 + }, + { + "epoch": 0.5096805421103582, + "grad_norm": 4.811190128326416, + "learning_rate": 5.1097274608834955e-05, + "loss": 2.2339, + "step": 3159 + }, + { + "epoch": 0.5098418844788641, + "grad_norm": 5.532687187194824, + "learning_rate": 5.107115306916278e-05, + "loss": 1.6811, + "step": 3160 + }, + { + "epoch": 0.5100032268473701, + "grad_norm": 4.852935314178467, + "learning_rate": 5.104503123699906e-05, + "loss": 2.2254, + "step": 3161 + }, + { + "epoch": 0.510164569215876, + "grad_norm": 5.2240495681762695, + "learning_rate": 5.101890911947668e-05, + "loss": 1.9324, + "step": 3162 + }, + { + "epoch": 0.510325911584382, + "grad_norm": 4.980324745178223, + "learning_rate": 5.099278672372859e-05, + "loss": 1.7929, + "step": 3163 + }, + { + "epoch": 0.5104872539528881, + "grad_norm": 5.097092628479004, + "learning_rate": 5.096666405688786e-05, + "loss": 2.1415, + "step": 3164 + }, + { + "epoch": 0.510648596321394, + "grad_norm": 6.43392276763916, + "learning_rate": 5.094054112608758e-05, + "loss": 2.025, + "step": 3165 + }, + { + "epoch": 0.5108099386899, + "grad_norm": 6.511987686157227, + "learning_rate": 5.0914417938460946e-05, + "loss": 1.8817, + "step": 3166 + }, + { + "epoch": 0.510971281058406, + "grad_norm": 2.9716813564300537, + "learning_rate": 5.0888294501141245e-05, + "loss": 1.7831, + "step": 3167 + }, + { + "epoch": 0.5111326234269119, + "grad_norm": 4.300621032714844, + "learning_rate": 5.0862170821261746e-05, + "loss": 1.8249, + "step": 3168 + }, + { + "epoch": 0.5112939657954179, + "grad_norm": 4.005673408508301, + "learning_rate": 5.083604690595589e-05, + "loss": 2.0527, + "step": 3169 + }, + { + "epoch": 0.5114553081639238, + "grad_norm": 3.7447731494903564, + "learning_rate": 5.080992276235712e-05, + "loss": 1.8692, + "step": 3170 + }, + { + "epoch": 0.5116166505324298, + "grad_norm": 4.364311695098877, + "learning_rate": 5.078379839759895e-05, + "loss": 2.2357, + "step": 3171 + }, + { + "epoch": 0.5117779929009358, + "grad_norm": 3.762906789779663, + "learning_rate": 5.0757673818814956e-05, + "loss": 2.1641, + "step": 3172 + }, + { + "epoch": 0.5119393352694418, + "grad_norm": 5.468181133270264, + "learning_rate": 5.073154903313878e-05, + "loss": 2.2245, + "step": 3173 + }, + { + "epoch": 0.5121006776379478, + "grad_norm": 4.8055500984191895, + "learning_rate": 5.070542404770413e-05, + "loss": 1.8051, + "step": 3174 + }, + { + "epoch": 0.5122620200064537, + "grad_norm": 4.544543743133545, + "learning_rate": 5.0679298869644745e-05, + "loss": 2.0303, + "step": 3175 + }, + { + "epoch": 0.5124233623749597, + "grad_norm": 5.0618510246276855, + "learning_rate": 5.065317350609443e-05, + "loss": 1.7931, + "step": 3176 + }, + { + "epoch": 0.5125847047434656, + "grad_norm": 3.694225549697876, + "learning_rate": 5.062704796418703e-05, + "loss": 2.0482, + "step": 3177 + }, + { + "epoch": 0.5127460471119716, + "grad_norm": 4.4860358238220215, + "learning_rate": 5.060092225105646e-05, + "loss": 1.9537, + "step": 3178 + }, + { + "epoch": 0.5129073894804775, + "grad_norm": 5.659295558929443, + "learning_rate": 5.0574796373836654e-05, + "loss": 1.79, + "step": 3179 + }, + { + "epoch": 0.5130687318489835, + "grad_norm": 4.734598636627197, + "learning_rate": 5.0548670339661605e-05, + "loss": 1.7415, + "step": 3180 + }, + { + "epoch": 0.5132300742174896, + "grad_norm": 4.6034955978393555, + "learning_rate": 5.052254415566536e-05, + "loss": 1.9392, + "step": 3181 + }, + { + "epoch": 0.5133914165859955, + "grad_norm": 4.362708568572998, + "learning_rate": 5.049641782898199e-05, + "loss": 2.0239, + "step": 3182 + }, + { + "epoch": 0.5135527589545015, + "grad_norm": 3.8116865158081055, + "learning_rate": 5.047029136674563e-05, + "loss": 1.8929, + "step": 3183 + }, + { + "epoch": 0.5137141013230074, + "grad_norm": 4.168476104736328, + "learning_rate": 5.044416477609038e-05, + "loss": 1.905, + "step": 3184 + }, + { + "epoch": 0.5138754436915134, + "grad_norm": 4.413907051086426, + "learning_rate": 5.041803806415049e-05, + "loss": 1.9336, + "step": 3185 + }, + { + "epoch": 0.5140367860600193, + "grad_norm": 4.9159770011901855, + "learning_rate": 5.039191123806013e-05, + "loss": 1.9518, + "step": 3186 + }, + { + "epoch": 0.5141981284285253, + "grad_norm": 3.9071576595306396, + "learning_rate": 5.03657843049536e-05, + "loss": 1.8817, + "step": 3187 + }, + { + "epoch": 0.5143594707970313, + "grad_norm": 3.7660021781921387, + "learning_rate": 5.033965727196513e-05, + "loss": 1.8742, + "step": 3188 + }, + { + "epoch": 0.5145208131655372, + "grad_norm": 5.279468536376953, + "learning_rate": 5.031353014622907e-05, + "loss": 1.8084, + "step": 3189 + }, + { + "epoch": 0.5146821555340433, + "grad_norm": 4.2043681144714355, + "learning_rate": 5.0287402934879725e-05, + "loss": 2.1356, + "step": 3190 + }, + { + "epoch": 0.5148434979025492, + "grad_norm": 4.737074851989746, + "learning_rate": 5.026127564505147e-05, + "loss": 2.0505, + "step": 3191 + }, + { + "epoch": 0.5150048402710552, + "grad_norm": 3.856682062149048, + "learning_rate": 5.0235148283878675e-05, + "loss": 1.8909, + "step": 3192 + }, + { + "epoch": 0.5151661826395612, + "grad_norm": 3.9248077869415283, + "learning_rate": 5.020902085849575e-05, + "loss": 1.8531, + "step": 3193 + }, + { + "epoch": 0.5153275250080671, + "grad_norm": 3.782620668411255, + "learning_rate": 5.018289337603709e-05, + "loss": 2.0022, + "step": 3194 + }, + { + "epoch": 0.5154888673765731, + "grad_norm": 6.275498390197754, + "learning_rate": 5.0156765843637156e-05, + "loss": 2.1021, + "step": 3195 + }, + { + "epoch": 0.515650209745079, + "grad_norm": 5.375777244567871, + "learning_rate": 5.013063826843036e-05, + "loss": 1.7244, + "step": 3196 + }, + { + "epoch": 0.515811552113585, + "grad_norm": 4.985101699829102, + "learning_rate": 5.01045106575512e-05, + "loss": 1.7357, + "step": 3197 + }, + { + "epoch": 0.515972894482091, + "grad_norm": 4.513815879821777, + "learning_rate": 5.007838301813409e-05, + "loss": 1.8166, + "step": 3198 + }, + { + "epoch": 0.516134236850597, + "grad_norm": 3.676476240158081, + "learning_rate": 5.0052255357313536e-05, + "loss": 1.7123, + "step": 3199 + }, + { + "epoch": 0.516295579219103, + "grad_norm": 4.253829479217529, + "learning_rate": 5.002612768222401e-05, + "loss": 1.8319, + "step": 3200 + }, + { + "epoch": 0.5164569215876089, + "grad_norm": 3.100123167037964, + "learning_rate": 5e-05, + "loss": 1.9244, + "step": 3201 + }, + { + "epoch": 0.5166182639561149, + "grad_norm": 3.931265115737915, + "learning_rate": 4.997387231777601e-05, + "loss": 2.0529, + "step": 3202 + }, + { + "epoch": 0.5167796063246208, + "grad_norm": 3.888005256652832, + "learning_rate": 4.9947744642686476e-05, + "loss": 1.9436, + "step": 3203 + }, + { + "epoch": 0.5169409486931268, + "grad_norm": 4.62919807434082, + "learning_rate": 4.9921616981865926e-05, + "loss": 1.9782, + "step": 3204 + }, + { + "epoch": 0.5171022910616327, + "grad_norm": 3.9387950897216797, + "learning_rate": 4.9895489342448814e-05, + "loss": 1.894, + "step": 3205 + }, + { + "epoch": 0.5172636334301387, + "grad_norm": 7.06707763671875, + "learning_rate": 4.9869361731569645e-05, + "loss": 1.8967, + "step": 3206 + }, + { + "epoch": 0.5174249757986448, + "grad_norm": 4.015041351318359, + "learning_rate": 4.984323415636285e-05, + "loss": 1.8313, + "step": 3207 + }, + { + "epoch": 0.5175863181671507, + "grad_norm": 4.058426856994629, + "learning_rate": 4.9817106623962915e-05, + "loss": 1.9187, + "step": 3208 + }, + { + "epoch": 0.5177476605356567, + "grad_norm": 4.128994941711426, + "learning_rate": 4.9790979141504254e-05, + "loss": 1.7979, + "step": 3209 + }, + { + "epoch": 0.5179090029041626, + "grad_norm": 3.9643068313598633, + "learning_rate": 4.9764851716121337e-05, + "loss": 1.8926, + "step": 3210 + }, + { + "epoch": 0.5180703452726686, + "grad_norm": 5.316287994384766, + "learning_rate": 4.973872435494853e-05, + "loss": 1.7501, + "step": 3211 + }, + { + "epoch": 0.5182316876411746, + "grad_norm": 6.077383995056152, + "learning_rate": 4.971259706512029e-05, + "loss": 1.9895, + "step": 3212 + }, + { + "epoch": 0.5183930300096805, + "grad_norm": 3.991368055343628, + "learning_rate": 4.968646985377093e-05, + "loss": 1.8354, + "step": 3213 + }, + { + "epoch": 0.5185543723781865, + "grad_norm": 5.030738353729248, + "learning_rate": 4.966034272803488e-05, + "loss": 2.3001, + "step": 3214 + }, + { + "epoch": 0.5187157147466925, + "grad_norm": 4.330802917480469, + "learning_rate": 4.9634215695046425e-05, + "loss": 1.9873, + "step": 3215 + }, + { + "epoch": 0.5188770571151985, + "grad_norm": 4.938310623168945, + "learning_rate": 4.960808876193987e-05, + "loss": 1.9398, + "step": 3216 + }, + { + "epoch": 0.5190383994837044, + "grad_norm": 3.9098732471466064, + "learning_rate": 4.9581961935849536e-05, + "loss": 1.8656, + "step": 3217 + }, + { + "epoch": 0.5191997418522104, + "grad_norm": 4.209815979003906, + "learning_rate": 4.955583522390962e-05, + "loss": 2.0892, + "step": 3218 + }, + { + "epoch": 0.5193610842207164, + "grad_norm": 3.854658842086792, + "learning_rate": 4.95297086332544e-05, + "loss": 1.8602, + "step": 3219 + }, + { + "epoch": 0.5195224265892223, + "grad_norm": 3.757833480834961, + "learning_rate": 4.9503582171018e-05, + "loss": 1.9197, + "step": 3220 + }, + { + "epoch": 0.5196837689577283, + "grad_norm": 5.500919818878174, + "learning_rate": 4.9477455844334645e-05, + "loss": 2.1579, + "step": 3221 + }, + { + "epoch": 0.5198451113262342, + "grad_norm": 4.940765380859375, + "learning_rate": 4.945132966033839e-05, + "loss": 2.0228, + "step": 3222 + }, + { + "epoch": 0.5200064536947402, + "grad_norm": 3.5651445388793945, + "learning_rate": 4.942520362616336e-05, + "loss": 1.8485, + "step": 3223 + }, + { + "epoch": 0.5201677960632463, + "grad_norm": 4.323605537414551, + "learning_rate": 4.9399077748943554e-05, + "loss": 1.8058, + "step": 3224 + }, + { + "epoch": 0.5203291384317522, + "grad_norm": 4.6020708084106445, + "learning_rate": 4.937295203581297e-05, + "loss": 1.903, + "step": 3225 + }, + { + "epoch": 0.5204904808002582, + "grad_norm": 5.301663875579834, + "learning_rate": 4.934682649390557e-05, + "loss": 2.0427, + "step": 3226 + }, + { + "epoch": 0.5206518231687641, + "grad_norm": 6.134703159332275, + "learning_rate": 4.932070113035527e-05, + "loss": 1.9043, + "step": 3227 + }, + { + "epoch": 0.5208131655372701, + "grad_norm": 4.341734886169434, + "learning_rate": 4.929457595229589e-05, + "loss": 1.8886, + "step": 3228 + }, + { + "epoch": 0.520974507905776, + "grad_norm": 4.849643707275391, + "learning_rate": 4.926845096686122e-05, + "loss": 2.1224, + "step": 3229 + }, + { + "epoch": 0.521135850274282, + "grad_norm": 4.229915142059326, + "learning_rate": 4.924232618118507e-05, + "loss": 1.8808, + "step": 3230 + }, + { + "epoch": 0.521297192642788, + "grad_norm": 5.061223983764648, + "learning_rate": 4.9216201602401065e-05, + "loss": 1.8286, + "step": 3231 + }, + { + "epoch": 0.5214585350112939, + "grad_norm": 4.838642120361328, + "learning_rate": 4.91900772376429e-05, + "loss": 1.771, + "step": 3232 + }, + { + "epoch": 0.5216198773798, + "grad_norm": 4.382593154907227, + "learning_rate": 4.9163953094044114e-05, + "loss": 2.0494, + "step": 3233 + }, + { + "epoch": 0.5217812197483059, + "grad_norm": 4.9571709632873535, + "learning_rate": 4.913782917873826e-05, + "loss": 2.0606, + "step": 3234 + }, + { + "epoch": 0.5219425621168119, + "grad_norm": 5.578112602233887, + "learning_rate": 4.911170549885877e-05, + "loss": 2.061, + "step": 3235 + }, + { + "epoch": 0.5221039044853178, + "grad_norm": 4.324019908905029, + "learning_rate": 4.908558206153906e-05, + "loss": 2.0398, + "step": 3236 + }, + { + "epoch": 0.5222652468538238, + "grad_norm": 4.468944549560547, + "learning_rate": 4.905945887391242e-05, + "loss": 1.8025, + "step": 3237 + }, + { + "epoch": 0.5224265892223298, + "grad_norm": 4.0166120529174805, + "learning_rate": 4.903333594311215e-05, + "loss": 2.0221, + "step": 3238 + }, + { + "epoch": 0.5225879315908357, + "grad_norm": 4.2127180099487305, + "learning_rate": 4.900721327627143e-05, + "loss": 2.0657, + "step": 3239 + }, + { + "epoch": 0.5227492739593417, + "grad_norm": 4.6842732429504395, + "learning_rate": 4.898109088052333e-05, + "loss": 2.1807, + "step": 3240 + }, + { + "epoch": 0.5229106163278477, + "grad_norm": 5.23138952255249, + "learning_rate": 4.895496876300096e-05, + "loss": 2.0688, + "step": 3241 + }, + { + "epoch": 0.5230719586963537, + "grad_norm": 4.115994453430176, + "learning_rate": 4.892884693083723e-05, + "loss": 1.8319, + "step": 3242 + }, + { + "epoch": 0.5232333010648597, + "grad_norm": 3.9842920303344727, + "learning_rate": 4.890272539116507e-05, + "loss": 1.8929, + "step": 3243 + }, + { + "epoch": 0.5233946434333656, + "grad_norm": 4.264963150024414, + "learning_rate": 4.887660415111727e-05, + "loss": 1.9543, + "step": 3244 + }, + { + "epoch": 0.5235559858018716, + "grad_norm": 4.318862438201904, + "learning_rate": 4.8850483217826546e-05, + "loss": 1.7397, + "step": 3245 + }, + { + "epoch": 0.5237173281703775, + "grad_norm": 4.763772487640381, + "learning_rate": 4.882436259842556e-05, + "loss": 1.9595, + "step": 3246 + }, + { + "epoch": 0.5238786705388835, + "grad_norm": 4.464093208312988, + "learning_rate": 4.879824230004688e-05, + "loss": 1.7959, + "step": 3247 + }, + { + "epoch": 0.5240400129073894, + "grad_norm": 5.059757232666016, + "learning_rate": 4.877212232982292e-05, + "loss": 2.1096, + "step": 3248 + }, + { + "epoch": 0.5242013552758954, + "grad_norm": 3.138390302658081, + "learning_rate": 4.874600269488613e-05, + "loss": 2.0268, + "step": 3249 + }, + { + "epoch": 0.5243626976444015, + "grad_norm": 4.7847700119018555, + "learning_rate": 4.8719883402368745e-05, + "loss": 1.7649, + "step": 3250 + }, + { + "epoch": 0.5245240400129074, + "grad_norm": 5.590267181396484, + "learning_rate": 4.8693764459402996e-05, + "loss": 2.3039, + "step": 3251 + }, + { + "epoch": 0.5246853823814134, + "grad_norm": 3.87428617477417, + "learning_rate": 4.866764587312097e-05, + "loss": 1.8538, + "step": 3252 + }, + { + "epoch": 0.5248467247499193, + "grad_norm": 4.53293514251709, + "learning_rate": 4.8641527650654646e-05, + "loss": 1.7762, + "step": 3253 + }, + { + "epoch": 0.5250080671184253, + "grad_norm": 5.136706829071045, + "learning_rate": 4.861540979913597e-05, + "loss": 1.9608, + "step": 3254 + }, + { + "epoch": 0.5251694094869312, + "grad_norm": 3.199901580810547, + "learning_rate": 4.858929232569671e-05, + "loss": 2.3918, + "step": 3255 + }, + { + "epoch": 0.5253307518554372, + "grad_norm": 5.633172035217285, + "learning_rate": 4.8563175237468575e-05, + "loss": 2.1478, + "step": 3256 + }, + { + "epoch": 0.5254920942239432, + "grad_norm": 5.483176231384277, + "learning_rate": 4.853705854158315e-05, + "loss": 1.8405, + "step": 3257 + }, + { + "epoch": 0.5256534365924492, + "grad_norm": 6.112262725830078, + "learning_rate": 4.8510942245171937e-05, + "loss": 1.9673, + "step": 3258 + }, + { + "epoch": 0.5258147789609552, + "grad_norm": 3.62973690032959, + "learning_rate": 4.8484826355366295e-05, + "loss": 1.7978, + "step": 3259 + }, + { + "epoch": 0.5259761213294611, + "grad_norm": 3.750770330429077, + "learning_rate": 4.845871087929751e-05, + "loss": 1.8656, + "step": 3260 + }, + { + "epoch": 0.5261374636979671, + "grad_norm": 5.85239315032959, + "learning_rate": 4.8432595824096705e-05, + "loss": 1.9442, + "step": 3261 + }, + { + "epoch": 0.5262988060664731, + "grad_norm": 5.854916572570801, + "learning_rate": 4.8406481196894956e-05, + "loss": 1.8402, + "step": 3262 + }, + { + "epoch": 0.526460148434979, + "grad_norm": 5.106485843658447, + "learning_rate": 4.838036700482316e-05, + "loss": 1.9989, + "step": 3263 + }, + { + "epoch": 0.526621490803485, + "grad_norm": 3.8806090354919434, + "learning_rate": 4.8354253255012134e-05, + "loss": 1.9908, + "step": 3264 + }, + { + "epoch": 0.5267828331719909, + "grad_norm": 3.4543380737304688, + "learning_rate": 4.8328139954592534e-05, + "loss": 1.8149, + "step": 3265 + }, + { + "epoch": 0.5269441755404969, + "grad_norm": 4.389228343963623, + "learning_rate": 4.830202711069493e-05, + "loss": 1.9839, + "step": 3266 + }, + { + "epoch": 0.527105517909003, + "grad_norm": 3.474240779876709, + "learning_rate": 4.827591473044978e-05, + "loss": 2.18, + "step": 3267 + }, + { + "epoch": 0.5272668602775089, + "grad_norm": 3.6651768684387207, + "learning_rate": 4.824980282098734e-05, + "loss": 2.0556, + "step": 3268 + }, + { + "epoch": 0.5274282026460149, + "grad_norm": 3.4399147033691406, + "learning_rate": 4.8223691389437844e-05, + "loss": 1.9626, + "step": 3269 + }, + { + "epoch": 0.5275895450145208, + "grad_norm": 4.90449857711792, + "learning_rate": 4.8197580442931295e-05, + "loss": 1.8866, + "step": 3270 + }, + { + "epoch": 0.5277508873830268, + "grad_norm": 4.023404121398926, + "learning_rate": 4.817146998859765e-05, + "loss": 2.0522, + "step": 3271 + }, + { + "epoch": 0.5279122297515327, + "grad_norm": 3.915118932723999, + "learning_rate": 4.814536003356666e-05, + "loss": 2.2881, + "step": 3272 + }, + { + "epoch": 0.5280735721200387, + "grad_norm": 4.261370658874512, + "learning_rate": 4.811925058496798e-05, + "loss": 2.1968, + "step": 3273 + }, + { + "epoch": 0.5282349144885446, + "grad_norm": 3.4954028129577637, + "learning_rate": 4.8093141649931126e-05, + "loss": 1.8929, + "step": 3274 + }, + { + "epoch": 0.5283962568570507, + "grad_norm": 3.987055778503418, + "learning_rate": 4.806703323558546e-05, + "loss": 2.2058, + "step": 3275 + }, + { + "epoch": 0.5285575992255567, + "grad_norm": 4.587975025177002, + "learning_rate": 4.804092534906018e-05, + "loss": 1.9484, + "step": 3276 + }, + { + "epoch": 0.5287189415940626, + "grad_norm": 4.985721111297607, + "learning_rate": 4.801481799748441e-05, + "loss": 1.8801, + "step": 3277 + }, + { + "epoch": 0.5288802839625686, + "grad_norm": 3.6657228469848633, + "learning_rate": 4.798871118798707e-05, + "loss": 1.8155, + "step": 3278 + }, + { + "epoch": 0.5290416263310745, + "grad_norm": 4.231720447540283, + "learning_rate": 4.796260492769691e-05, + "loss": 1.9869, + "step": 3279 + }, + { + "epoch": 0.5292029686995805, + "grad_norm": 3.9197402000427246, + "learning_rate": 4.7936499223742616e-05, + "loss": 2.2224, + "step": 3280 + }, + { + "epoch": 0.5293643110680865, + "grad_norm": 3.874336004257202, + "learning_rate": 4.791039408325264e-05, + "loss": 2.0934, + "step": 3281 + }, + { + "epoch": 0.5295256534365924, + "grad_norm": 4.764644145965576, + "learning_rate": 4.788428951335534e-05, + "loss": 1.8271, + "step": 3282 + }, + { + "epoch": 0.5296869958050984, + "grad_norm": 4.530116081237793, + "learning_rate": 4.785818552117886e-05, + "loss": 1.7182, + "step": 3283 + }, + { + "epoch": 0.5298483381736044, + "grad_norm": 3.121483564376831, + "learning_rate": 4.7832082113851247e-05, + "loss": 2.1423, + "step": 3284 + }, + { + "epoch": 0.5300096805421104, + "grad_norm": 3.517976760864258, + "learning_rate": 4.780597929850032e-05, + "loss": 1.7439, + "step": 3285 + }, + { + "epoch": 0.5301710229106164, + "grad_norm": 4.023410797119141, + "learning_rate": 4.777987708225382e-05, + "loss": 1.7658, + "step": 3286 + }, + { + "epoch": 0.5303323652791223, + "grad_norm": 3.6459848880767822, + "learning_rate": 4.775377547223924e-05, + "loss": 2.0099, + "step": 3287 + }, + { + "epoch": 0.5304937076476283, + "grad_norm": 3.7603046894073486, + "learning_rate": 4.772767447558398e-05, + "loss": 1.8944, + "step": 3288 + }, + { + "epoch": 0.5306550500161342, + "grad_norm": 3.734020948410034, + "learning_rate": 4.77015740994152e-05, + "loss": 1.7808, + "step": 3289 + }, + { + "epoch": 0.5308163923846402, + "grad_norm": 3.546699285507202, + "learning_rate": 4.767547435085997e-05, + "loss": 1.9671, + "step": 3290 + }, + { + "epoch": 0.5309777347531461, + "grad_norm": 4.612760543823242, + "learning_rate": 4.7649375237045135e-05, + "loss": 1.9822, + "step": 3291 + }, + { + "epoch": 0.5311390771216521, + "grad_norm": 4.891711235046387, + "learning_rate": 4.762327676509736e-05, + "loss": 1.7121, + "step": 3292 + }, + { + "epoch": 0.5313004194901582, + "grad_norm": 4.840190410614014, + "learning_rate": 4.759717894214318e-05, + "loss": 1.9484, + "step": 3293 + }, + { + "epoch": 0.5314617618586641, + "grad_norm": 3.971648931503296, + "learning_rate": 4.7571081775308905e-05, + "loss": 2.027, + "step": 3294 + }, + { + "epoch": 0.5316231042271701, + "grad_norm": 3.5140857696533203, + "learning_rate": 4.754498527172072e-05, + "loss": 1.935, + "step": 3295 + }, + { + "epoch": 0.531784446595676, + "grad_norm": 3.765691041946411, + "learning_rate": 4.751888943850455e-05, + "loss": 1.8352, + "step": 3296 + }, + { + "epoch": 0.531945788964182, + "grad_norm": 6.4853034019470215, + "learning_rate": 4.7492794282786236e-05, + "loss": 1.9982, + "step": 3297 + }, + { + "epoch": 0.5321071313326879, + "grad_norm": 4.060369491577148, + "learning_rate": 4.7466699811691326e-05, + "loss": 1.8903, + "step": 3298 + }, + { + "epoch": 0.5322684737011939, + "grad_norm": 8.457684516906738, + "learning_rate": 4.74406060323453e-05, + "loss": 2.0126, + "step": 3299 + }, + { + "epoch": 0.5324298160696999, + "grad_norm": 4.315256118774414, + "learning_rate": 4.741451295187332e-05, + "loss": 1.9478, + "step": 3300 + }, + { + "epoch": 0.5325911584382059, + "grad_norm": 3.9448628425598145, + "learning_rate": 4.7388420577400496e-05, + "loss": 1.6127, + "step": 3301 + }, + { + "epoch": 0.5327525008067119, + "grad_norm": 4.897828578948975, + "learning_rate": 4.736232891605161e-05, + "loss": 2.1037, + "step": 3302 + }, + { + "epoch": 0.5329138431752178, + "grad_norm": 5.935902118682861, + "learning_rate": 4.733623797495136e-05, + "loss": 1.8929, + "step": 3303 + }, + { + "epoch": 0.5330751855437238, + "grad_norm": 4.652507781982422, + "learning_rate": 4.731014776122416e-05, + "loss": 1.8967, + "step": 3304 + }, + { + "epoch": 0.5332365279122298, + "grad_norm": 3.986088275909424, + "learning_rate": 4.728405828199427e-05, + "loss": 1.9336, + "step": 3305 + }, + { + "epoch": 0.5333978702807357, + "grad_norm": 4.08700704574585, + "learning_rate": 4.725796954438577e-05, + "loss": 1.8997, + "step": 3306 + }, + { + "epoch": 0.5335592126492417, + "grad_norm": 5.69546365737915, + "learning_rate": 4.723188155552247e-05, + "loss": 1.9908, + "step": 3307 + }, + { + "epoch": 0.5337205550177476, + "grad_norm": 4.448991298675537, + "learning_rate": 4.720579432252807e-05, + "loss": 2.1116, + "step": 3308 + }, + { + "epoch": 0.5338818973862536, + "grad_norm": 3.5392987728118896, + "learning_rate": 4.717970785252595e-05, + "loss": 1.6317, + "step": 3309 + }, + { + "epoch": 0.5340432397547596, + "grad_norm": 3.903165102005005, + "learning_rate": 4.715362215263941e-05, + "loss": 1.8261, + "step": 3310 + }, + { + "epoch": 0.5342045821232656, + "grad_norm": 4.5423359870910645, + "learning_rate": 4.712753722999143e-05, + "loss": 1.8005, + "step": 3311 + }, + { + "epoch": 0.5343659244917716, + "grad_norm": 4.870762348175049, + "learning_rate": 4.710145309170481e-05, + "loss": 2.1654, + "step": 3312 + }, + { + "epoch": 0.5345272668602775, + "grad_norm": 4.262883186340332, + "learning_rate": 4.7075369744902175e-05, + "loss": 1.9795, + "step": 3313 + }, + { + "epoch": 0.5346886092287835, + "grad_norm": 4.609106063842773, + "learning_rate": 4.70492871967059e-05, + "loss": 1.7828, + "step": 3314 + }, + { + "epoch": 0.5348499515972894, + "grad_norm": 4.6488494873046875, + "learning_rate": 4.7023205454238136e-05, + "loss": 2.0667, + "step": 3315 + }, + { + "epoch": 0.5350112939657954, + "grad_norm": 5.292774677276611, + "learning_rate": 4.69971245246208e-05, + "loss": 1.8568, + "step": 3316 + }, + { + "epoch": 0.5351726363343013, + "grad_norm": 4.1986188888549805, + "learning_rate": 4.6971044414975666e-05, + "loss": 2.1148, + "step": 3317 + }, + { + "epoch": 0.5353339787028074, + "grad_norm": 4.246354579925537, + "learning_rate": 4.6944965132424185e-05, + "loss": 1.8744, + "step": 3318 + }, + { + "epoch": 0.5354953210713134, + "grad_norm": 6.183987140655518, + "learning_rate": 4.691888668408766e-05, + "loss": 2.0153, + "step": 3319 + }, + { + "epoch": 0.5356566634398193, + "grad_norm": 4.5026044845581055, + "learning_rate": 4.689280907708709e-05, + "loss": 1.8281, + "step": 3320 + }, + { + "epoch": 0.5358180058083253, + "grad_norm": 4.956146717071533, + "learning_rate": 4.686673231854334e-05, + "loss": 1.7823, + "step": 3321 + }, + { + "epoch": 0.5359793481768312, + "grad_norm": 5.603353977203369, + "learning_rate": 4.684065641557695e-05, + "loss": 1.9652, + "step": 3322 + }, + { + "epoch": 0.5361406905453372, + "grad_norm": 4.635777473449707, + "learning_rate": 4.681458137530829e-05, + "loss": 2.0288, + "step": 3323 + }, + { + "epoch": 0.5363020329138432, + "grad_norm": 4.934343338012695, + "learning_rate": 4.6788507204857446e-05, + "loss": 1.888, + "step": 3324 + }, + { + "epoch": 0.5364633752823491, + "grad_norm": 5.475190162658691, + "learning_rate": 4.6762433911344325e-05, + "loss": 1.7964, + "step": 3325 + }, + { + "epoch": 0.5366247176508551, + "grad_norm": 4.2750396728515625, + "learning_rate": 4.673636150188852e-05, + "loss": 1.8277, + "step": 3326 + }, + { + "epoch": 0.5367860600193611, + "grad_norm": 3.8814730644226074, + "learning_rate": 4.671028998360947e-05, + "loss": 1.9308, + "step": 3327 + }, + { + "epoch": 0.5369474023878671, + "grad_norm": 4.442133903503418, + "learning_rate": 4.6684219363626306e-05, + "loss": 1.9073, + "step": 3328 + }, + { + "epoch": 0.537108744756373, + "grad_norm": 3.948798894882202, + "learning_rate": 4.66581496490579e-05, + "loss": 1.8738, + "step": 3329 + }, + { + "epoch": 0.537270087124879, + "grad_norm": 4.171131610870361, + "learning_rate": 4.663208084702297e-05, + "loss": 1.837, + "step": 3330 + }, + { + "epoch": 0.537431429493385, + "grad_norm": 5.258491039276123, + "learning_rate": 4.6606012964639874e-05, + "loss": 2.0329, + "step": 3331 + }, + { + "epoch": 0.5375927718618909, + "grad_norm": 4.995650291442871, + "learning_rate": 4.6579946009026786e-05, + "loss": 1.9554, + "step": 3332 + }, + { + "epoch": 0.5377541142303969, + "grad_norm": 4.612634658813477, + "learning_rate": 4.655387998730161e-05, + "loss": 1.926, + "step": 3333 + }, + { + "epoch": 0.5379154565989028, + "grad_norm": 3.0606601238250732, + "learning_rate": 4.6527814906582e-05, + "loss": 1.9066, + "step": 3334 + }, + { + "epoch": 0.5380767989674088, + "grad_norm": 4.619715213775635, + "learning_rate": 4.6501750773985326e-05, + "loss": 1.632, + "step": 3335 + }, + { + "epoch": 0.5382381413359149, + "grad_norm": 3.8743972778320312, + "learning_rate": 4.647568759662876e-05, + "loss": 2.0805, + "step": 3336 + }, + { + "epoch": 0.5383994837044208, + "grad_norm": 5.274535179138184, + "learning_rate": 4.644962538162913e-05, + "loss": 1.919, + "step": 3337 + }, + { + "epoch": 0.5385608260729268, + "grad_norm": 5.369234561920166, + "learning_rate": 4.642356413610308e-05, + "loss": 1.9116, + "step": 3338 + }, + { + "epoch": 0.5387221684414327, + "grad_norm": 4.213049411773682, + "learning_rate": 4.6397503867166926e-05, + "loss": 1.8896, + "step": 3339 + }, + { + "epoch": 0.5388835108099387, + "grad_norm": 4.885444164276123, + "learning_rate": 4.637144458193677e-05, + "loss": 1.8171, + "step": 3340 + }, + { + "epoch": 0.5390448531784446, + "grad_norm": 3.53621768951416, + "learning_rate": 4.634538628752841e-05, + "loss": 1.9278, + "step": 3341 + }, + { + "epoch": 0.5392061955469506, + "grad_norm": 4.078290939331055, + "learning_rate": 4.631932899105739e-05, + "loss": 2.1414, + "step": 3342 + }, + { + "epoch": 0.5393675379154566, + "grad_norm": 5.2463908195495605, + "learning_rate": 4.629327269963897e-05, + "loss": 2.2, + "step": 3343 + }, + { + "epoch": 0.5395288802839626, + "grad_norm": 4.759715557098389, + "learning_rate": 4.6267217420388126e-05, + "loss": 1.9083, + "step": 3344 + }, + { + "epoch": 0.5396902226524686, + "grad_norm": 4.195973873138428, + "learning_rate": 4.6241163160419616e-05, + "loss": 1.8039, + "step": 3345 + }, + { + "epoch": 0.5398515650209745, + "grad_norm": 4.239854335784912, + "learning_rate": 4.621510992684783e-05, + "loss": 2.1011, + "step": 3346 + }, + { + "epoch": 0.5400129073894805, + "grad_norm": 3.7908120155334473, + "learning_rate": 4.618905772678696e-05, + "loss": 1.8025, + "step": 3347 + }, + { + "epoch": 0.5401742497579864, + "grad_norm": 4.560600757598877, + "learning_rate": 4.616300656735085e-05, + "loss": 2.0401, + "step": 3348 + }, + { + "epoch": 0.5403355921264924, + "grad_norm": 3.953225612640381, + "learning_rate": 4.613695645565312e-05, + "loss": 1.8735, + "step": 3349 + }, + { + "epoch": 0.5404969344949984, + "grad_norm": 4.011920928955078, + "learning_rate": 4.611090739880707e-05, + "loss": 1.9614, + "step": 3350 + }, + { + "epoch": 0.5406582768635043, + "grad_norm": 4.15378475189209, + "learning_rate": 4.6084859403925704e-05, + "loss": 1.9246, + "step": 3351 + }, + { + "epoch": 0.5408196192320103, + "grad_norm": 3.286672830581665, + "learning_rate": 4.6058812478121726e-05, + "loss": 1.9275, + "step": 3352 + }, + { + "epoch": 0.5409809616005163, + "grad_norm": 4.174089431762695, + "learning_rate": 4.6032766628507626e-05, + "loss": 1.9762, + "step": 3353 + }, + { + "epoch": 0.5411423039690223, + "grad_norm": 4.887900352478027, + "learning_rate": 4.600672186219551e-05, + "loss": 2.0994, + "step": 3354 + }, + { + "epoch": 0.5413036463375283, + "grad_norm": 3.994049310684204, + "learning_rate": 4.598067818629721e-05, + "loss": 1.5674, + "step": 3355 + }, + { + "epoch": 0.5414649887060342, + "grad_norm": 3.9422547817230225, + "learning_rate": 4.5954635607924306e-05, + "loss": 2.0313, + "step": 3356 + }, + { + "epoch": 0.5416263310745402, + "grad_norm": 3.7704057693481445, + "learning_rate": 4.5928594134188006e-05, + "loss": 1.5559, + "step": 3357 + }, + { + "epoch": 0.5417876734430461, + "grad_norm": 4.104972839355469, + "learning_rate": 4.590255377219931e-05, + "loss": 1.7469, + "step": 3358 + }, + { + "epoch": 0.5419490158115521, + "grad_norm": 4.480334281921387, + "learning_rate": 4.5876514529068805e-05, + "loss": 1.8588, + "step": 3359 + }, + { + "epoch": 0.542110358180058, + "grad_norm": 3.6700620651245117, + "learning_rate": 4.5850476411906856e-05, + "loss": 1.9828, + "step": 3360 + }, + { + "epoch": 0.5422717005485641, + "grad_norm": 4.9810357093811035, + "learning_rate": 4.582443942782348e-05, + "loss": 2.0797, + "step": 3361 + }, + { + "epoch": 0.5424330429170701, + "grad_norm": 3.96756911277771, + "learning_rate": 4.579840358392842e-05, + "loss": 1.8334, + "step": 3362 + }, + { + "epoch": 0.542594385285576, + "grad_norm": 5.304398536682129, + "learning_rate": 4.577236888733105e-05, + "loss": 1.9863, + "step": 3363 + }, + { + "epoch": 0.542755727654082, + "grad_norm": 3.4439711570739746, + "learning_rate": 4.5746335345140497e-05, + "loss": 1.7999, + "step": 3364 + }, + { + "epoch": 0.5429170700225879, + "grad_norm": 5.821254730224609, + "learning_rate": 4.57203029644655e-05, + "loss": 1.9819, + "step": 3365 + }, + { + "epoch": 0.5430784123910939, + "grad_norm": 3.3282694816589355, + "learning_rate": 4.569427175241458e-05, + "loss": 1.7094, + "step": 3366 + }, + { + "epoch": 0.5432397547595998, + "grad_norm": 5.027764320373535, + "learning_rate": 4.566824171609584e-05, + "loss": 2.05, + "step": 3367 + }, + { + "epoch": 0.5434010971281058, + "grad_norm": 3.789264678955078, + "learning_rate": 4.564221286261709e-05, + "loss": 1.8987, + "step": 3368 + }, + { + "epoch": 0.5435624394966118, + "grad_norm": 4.646523952484131, + "learning_rate": 4.561618519908587e-05, + "loss": 2.1439, + "step": 3369 + }, + { + "epoch": 0.5437237818651178, + "grad_norm": 5.464221954345703, + "learning_rate": 4.559015873260933e-05, + "loss": 1.8975, + "step": 3370 + }, + { + "epoch": 0.5438851242336238, + "grad_norm": 3.8311192989349365, + "learning_rate": 4.5564133470294325e-05, + "loss": 1.9645, + "step": 3371 + }, + { + "epoch": 0.5440464666021297, + "grad_norm": 3.876864194869995, + "learning_rate": 4.553810941924735e-05, + "loss": 1.8745, + "step": 3372 + }, + { + "epoch": 0.5442078089706357, + "grad_norm": 4.448172569274902, + "learning_rate": 4.551208658657463e-05, + "loss": 1.8245, + "step": 3373 + }, + { + "epoch": 0.5443691513391417, + "grad_norm": 3.681504249572754, + "learning_rate": 4.548606497938199e-05, + "loss": 1.7546, + "step": 3374 + }, + { + "epoch": 0.5445304937076476, + "grad_norm": 5.545989990234375, + "learning_rate": 4.546004460477498e-05, + "loss": 1.7352, + "step": 3375 + }, + { + "epoch": 0.5446918360761536, + "grad_norm": 3.8270070552825928, + "learning_rate": 4.543402546985875e-05, + "loss": 1.8872, + "step": 3376 + }, + { + "epoch": 0.5448531784446595, + "grad_norm": 3.8583481311798096, + "learning_rate": 4.5408007581738185e-05, + "loss": 2.1329, + "step": 3377 + }, + { + "epoch": 0.5450145208131656, + "grad_norm": 3.3820762634277344, + "learning_rate": 4.5381990947517766e-05, + "loss": 1.9026, + "step": 3378 + }, + { + "epoch": 0.5451758631816715, + "grad_norm": 5.060296058654785, + "learning_rate": 4.535597557430164e-05, + "loss": 2.004, + "step": 3379 + }, + { + "epoch": 0.5453372055501775, + "grad_norm": 4.134612083435059, + "learning_rate": 4.532996146919367e-05, + "loss": 1.7844, + "step": 3380 + }, + { + "epoch": 0.5454985479186835, + "grad_norm": 3.6750853061676025, + "learning_rate": 4.5303948639297287e-05, + "loss": 2.001, + "step": 3381 + }, + { + "epoch": 0.5456598902871894, + "grad_norm": 4.092704772949219, + "learning_rate": 4.527793709171564e-05, + "loss": 1.9174, + "step": 3382 + }, + { + "epoch": 0.5458212326556954, + "grad_norm": 4.162294864654541, + "learning_rate": 4.525192683355147e-05, + "loss": 1.9129, + "step": 3383 + }, + { + "epoch": 0.5459825750242013, + "grad_norm": 4.528878211975098, + "learning_rate": 4.5225917871907245e-05, + "loss": 2.0338, + "step": 3384 + }, + { + "epoch": 0.5461439173927073, + "grad_norm": 3.582669973373413, + "learning_rate": 4.5199910213884996e-05, + "loss": 1.9855, + "step": 3385 + }, + { + "epoch": 0.5463052597612132, + "grad_norm": 4.393901348114014, + "learning_rate": 4.517390386658646e-05, + "loss": 1.8243, + "step": 3386 + }, + { + "epoch": 0.5464666021297193, + "grad_norm": 6.063263893127441, + "learning_rate": 4.514789883711296e-05, + "loss": 2.0507, + "step": 3387 + }, + { + "epoch": 0.5466279444982253, + "grad_norm": 4.6668782234191895, + "learning_rate": 4.5121895132565534e-05, + "loss": 1.8884, + "step": 3388 + }, + { + "epoch": 0.5467892868667312, + "grad_norm": 5.516592025756836, + "learning_rate": 4.509589276004477e-05, + "loss": 1.9339, + "step": 3389 + }, + { + "epoch": 0.5469506292352372, + "grad_norm": 3.8053150177001953, + "learning_rate": 4.5069891726650974e-05, + "loss": 2.1245, + "step": 3390 + }, + { + "epoch": 0.5471119716037431, + "grad_norm": 3.28340220451355, + "learning_rate": 4.504389203948403e-05, + "loss": 1.8647, + "step": 3391 + }, + { + "epoch": 0.5472733139722491, + "grad_norm": 4.491049289703369, + "learning_rate": 4.501789370564345e-05, + "loss": 1.7355, + "step": 3392 + }, + { + "epoch": 0.547434656340755, + "grad_norm": 3.9153873920440674, + "learning_rate": 4.499189673222845e-05, + "loss": 1.7228, + "step": 3393 + }, + { + "epoch": 0.547595998709261, + "grad_norm": 4.5766987800598145, + "learning_rate": 4.496590112633776e-05, + "loss": 2.1064, + "step": 3394 + }, + { + "epoch": 0.547757341077767, + "grad_norm": 4.931046962738037, + "learning_rate": 4.493990689506987e-05, + "loss": 1.8381, + "step": 3395 + }, + { + "epoch": 0.547918683446273, + "grad_norm": 4.126611232757568, + "learning_rate": 4.491391404552278e-05, + "loss": 2.0963, + "step": 3396 + }, + { + "epoch": 0.548080025814779, + "grad_norm": 3.649404287338257, + "learning_rate": 4.488792258479418e-05, + "loss": 1.7649, + "step": 3397 + }, + { + "epoch": 0.548241368183285, + "grad_norm": 5.112252712249756, + "learning_rate": 4.486193251998134e-05, + "loss": 1.6735, + "step": 3398 + }, + { + "epoch": 0.5484027105517909, + "grad_norm": 3.7966115474700928, + "learning_rate": 4.483594385818118e-05, + "loss": 1.7547, + "step": 3399 + }, + { + "epoch": 0.5485640529202969, + "grad_norm": 4.304330348968506, + "learning_rate": 4.4809956606490226e-05, + "loss": 2.1255, + "step": 3400 + }, + { + "epoch": 0.5487253952888028, + "grad_norm": 4.657885551452637, + "learning_rate": 4.478397077200463e-05, + "loss": 2.0066, + "step": 3401 + }, + { + "epoch": 0.5488867376573088, + "grad_norm": 4.4623260498046875, + "learning_rate": 4.4757986361820094e-05, + "loss": 2.3324, + "step": 3402 + }, + { + "epoch": 0.5490480800258147, + "grad_norm": 4.30092191696167, + "learning_rate": 4.473200338303204e-05, + "loss": 1.8566, + "step": 3403 + }, + { + "epoch": 0.5492094223943208, + "grad_norm": 4.318838119506836, + "learning_rate": 4.470602184273543e-05, + "loss": 1.93, + "step": 3404 + }, + { + "epoch": 0.5493707647628268, + "grad_norm": 4.305547714233398, + "learning_rate": 4.468004174802479e-05, + "loss": 1.9673, + "step": 3405 + }, + { + "epoch": 0.5495321071313327, + "grad_norm": 5.350564479827881, + "learning_rate": 4.465406310599438e-05, + "loss": 2.031, + "step": 3406 + }, + { + "epoch": 0.5496934494998387, + "grad_norm": 5.960731506347656, + "learning_rate": 4.462808592373792e-05, + "loss": 1.7523, + "step": 3407 + }, + { + "epoch": 0.5498547918683446, + "grad_norm": 3.5292716026306152, + "learning_rate": 4.460211020834887e-05, + "loss": 1.8345, + "step": 3408 + }, + { + "epoch": 0.5500161342368506, + "grad_norm": 4.717067241668701, + "learning_rate": 4.4576135966920165e-05, + "loss": 2.1662, + "step": 3409 + }, + { + "epoch": 0.5501774766053565, + "grad_norm": 4.258657455444336, + "learning_rate": 4.455016320654442e-05, + "loss": 1.9488, + "step": 3410 + }, + { + "epoch": 0.5503388189738625, + "grad_norm": 3.420463800430298, + "learning_rate": 4.452419193431379e-05, + "loss": 1.8774, + "step": 3411 + }, + { + "epoch": 0.5505001613423685, + "grad_norm": 5.453540325164795, + "learning_rate": 4.4498222157320094e-05, + "loss": 2.1955, + "step": 3412 + }, + { + "epoch": 0.5506615037108745, + "grad_norm": 4.927591800689697, + "learning_rate": 4.447225388265465e-05, + "loss": 1.9469, + "step": 3413 + }, + { + "epoch": 0.5508228460793805, + "grad_norm": 6.436439037322998, + "learning_rate": 4.4446287117408456e-05, + "loss": 2.0445, + "step": 3414 + }, + { + "epoch": 0.5509841884478864, + "grad_norm": 4.598954200744629, + "learning_rate": 4.4420321868672026e-05, + "loss": 1.9913, + "step": 3415 + }, + { + "epoch": 0.5511455308163924, + "grad_norm": 3.52225399017334, + "learning_rate": 4.439435814353553e-05, + "loss": 2.1382, + "step": 3416 + }, + { + "epoch": 0.5513068731848983, + "grad_norm": 3.898519992828369, + "learning_rate": 4.436839594908866e-05, + "loss": 1.9294, + "step": 3417 + }, + { + "epoch": 0.5514682155534043, + "grad_norm": 3.636260747909546, + "learning_rate": 4.43424352924207e-05, + "loss": 1.9964, + "step": 3418 + }, + { + "epoch": 0.5516295579219103, + "grad_norm": 4.266148090362549, + "learning_rate": 4.431647618062055e-05, + "loss": 2.0357, + "step": 3419 + }, + { + "epoch": 0.5517909002904162, + "grad_norm": 4.997242450714111, + "learning_rate": 4.4290518620776645e-05, + "loss": 2.213, + "step": 3420 + }, + { + "epoch": 0.5519522426589223, + "grad_norm": 4.129906177520752, + "learning_rate": 4.4264562619977044e-05, + "loss": 1.9439, + "step": 3421 + }, + { + "epoch": 0.5521135850274282, + "grad_norm": 5.237497806549072, + "learning_rate": 4.423860818530932e-05, + "loss": 1.9271, + "step": 3422 + }, + { + "epoch": 0.5522749273959342, + "grad_norm": 3.959937810897827, + "learning_rate": 4.4212655323860684e-05, + "loss": 1.8816, + "step": 3423 + }, + { + "epoch": 0.5524362697644402, + "grad_norm": 3.959937810897827, + "learning_rate": 4.4212655323860684e-05, + "loss": 2.0413, + "step": 3424 + }, + { + "epoch": 0.5525976121329461, + "grad_norm": 5.576602458953857, + "learning_rate": 4.418670404271785e-05, + "loss": 1.9931, + "step": 3425 + }, + { + "epoch": 0.5527589545014521, + "grad_norm": 6.255558967590332, + "learning_rate": 4.416075434896717e-05, + "loss": 2.0849, + "step": 3426 + }, + { + "epoch": 0.552920296869958, + "grad_norm": 6.220365524291992, + "learning_rate": 4.413480624969452e-05, + "loss": 1.7052, + "step": 3427 + }, + { + "epoch": 0.553081639238464, + "grad_norm": 4.45718240737915, + "learning_rate": 4.410885975198533e-05, + "loss": 1.9092, + "step": 3428 + }, + { + "epoch": 0.5532429816069699, + "grad_norm": 4.006811618804932, + "learning_rate": 4.408291486292462e-05, + "loss": 1.734, + "step": 3429 + }, + { + "epoch": 0.553404323975476, + "grad_norm": 3.4543488025665283, + "learning_rate": 4.405697158959698e-05, + "loss": 1.7235, + "step": 3430 + }, + { + "epoch": 0.553565666343982, + "grad_norm": 5.664391040802002, + "learning_rate": 4.403102993908653e-05, + "loss": 1.8804, + "step": 3431 + }, + { + "epoch": 0.5537270087124879, + "grad_norm": 5.415212154388428, + "learning_rate": 4.400508991847692e-05, + "loss": 1.9773, + "step": 3432 + }, + { + "epoch": 0.5538883510809939, + "grad_norm": 8.737555503845215, + "learning_rate": 4.3979151534851446e-05, + "loss": 1.9354, + "step": 3433 + }, + { + "epoch": 0.5540496934494998, + "grad_norm": 4.1277289390563965, + "learning_rate": 4.395321479529287e-05, + "loss": 1.9464, + "step": 3434 + }, + { + "epoch": 0.5542110358180058, + "grad_norm": 3.8060250282287598, + "learning_rate": 4.3927279706883565e-05, + "loss": 1.6957, + "step": 3435 + }, + { + "epoch": 0.5543723781865117, + "grad_norm": 5.43300199508667, + "learning_rate": 4.39013462767054e-05, + "loss": 2.0556, + "step": 3436 + }, + { + "epoch": 0.5545337205550177, + "grad_norm": 5.492201328277588, + "learning_rate": 4.3875414511839847e-05, + "loss": 1.8884, + "step": 3437 + }, + { + "epoch": 0.5546950629235237, + "grad_norm": 5.079095840454102, + "learning_rate": 4.3849484419367866e-05, + "loss": 1.6756, + "step": 3438 + }, + { + "epoch": 0.5548564052920297, + "grad_norm": 4.2949604988098145, + "learning_rate": 4.382355600637002e-05, + "loss": 1.5468, + "step": 3439 + }, + { + "epoch": 0.5550177476605357, + "grad_norm": 5.174224853515625, + "learning_rate": 4.3797629279926325e-05, + "loss": 1.9101, + "step": 3440 + }, + { + "epoch": 0.5551790900290416, + "grad_norm": 4.244043350219727, + "learning_rate": 4.377170424711646e-05, + "loss": 1.9175, + "step": 3441 + }, + { + "epoch": 0.5553404323975476, + "grad_norm": 4.246883869171143, + "learning_rate": 4.374578091501954e-05, + "loss": 1.6105, + "step": 3442 + }, + { + "epoch": 0.5555017747660536, + "grad_norm": 3.899681806564331, + "learning_rate": 4.371985929071424e-05, + "loss": 2.0417, + "step": 3443 + }, + { + "epoch": 0.5556631171345595, + "grad_norm": 4.2130632400512695, + "learning_rate": 4.3693939381278815e-05, + "loss": 2.2013, + "step": 3444 + }, + { + "epoch": 0.5558244595030655, + "grad_norm": 4.7803754806518555, + "learning_rate": 4.3668021193790974e-05, + "loss": 2.0384, + "step": 3445 + }, + { + "epoch": 0.5559858018715714, + "grad_norm": 4.410373210906982, + "learning_rate": 4.364210473532804e-05, + "loss": 1.9194, + "step": 3446 + }, + { + "epoch": 0.5561471442400775, + "grad_norm": 4.31472110748291, + "learning_rate": 4.36161900129668e-05, + "loss": 1.9001, + "step": 3447 + }, + { + "epoch": 0.5563084866085835, + "grad_norm": 6.867047309875488, + "learning_rate": 4.359027703378357e-05, + "loss": 1.9496, + "step": 3448 + }, + { + "epoch": 0.5564698289770894, + "grad_norm": 4.220922946929932, + "learning_rate": 4.356436580485424e-05, + "loss": 1.6947, + "step": 3449 + }, + { + "epoch": 0.5566311713455954, + "grad_norm": 4.422349452972412, + "learning_rate": 4.3538456333254186e-05, + "loss": 1.8319, + "step": 3450 + }, + { + "epoch": 0.5567925137141013, + "grad_norm": 3.9616305828094482, + "learning_rate": 4.351254862605828e-05, + "loss": 1.9927, + "step": 3451 + }, + { + "epoch": 0.5569538560826073, + "grad_norm": 4.980384826660156, + "learning_rate": 4.3486642690340986e-05, + "loss": 1.774, + "step": 3452 + }, + { + "epoch": 0.5571151984511132, + "grad_norm": 4.41873025894165, + "learning_rate": 4.346073853317619e-05, + "loss": 1.8737, + "step": 3453 + }, + { + "epoch": 0.5572765408196192, + "grad_norm": 3.8804445266723633, + "learning_rate": 4.343483616163739e-05, + "loss": 1.7567, + "step": 3454 + }, + { + "epoch": 0.5574378831881251, + "grad_norm": 3.948648691177368, + "learning_rate": 4.340893558279753e-05, + "loss": 1.7944, + "step": 3455 + }, + { + "epoch": 0.5575992255566312, + "grad_norm": 3.5727169513702393, + "learning_rate": 4.338303680372905e-05, + "loss": 1.6433, + "step": 3456 + }, + { + "epoch": 0.5577605679251372, + "grad_norm": 3.9678032398223877, + "learning_rate": 4.335713983150398e-05, + "loss": 1.9791, + "step": 3457 + }, + { + "epoch": 0.5579219102936431, + "grad_norm": 4.215176105499268, + "learning_rate": 4.333124467319377e-05, + "loss": 1.9158, + "step": 3458 + }, + { + "epoch": 0.5580832526621491, + "grad_norm": 4.337172985076904, + "learning_rate": 4.330535133586944e-05, + "loss": 2.0593, + "step": 3459 + }, + { + "epoch": 0.558244595030655, + "grad_norm": 4.706808090209961, + "learning_rate": 4.3279459826601455e-05, + "loss": 2.1515, + "step": 3460 + }, + { + "epoch": 0.558405937399161, + "grad_norm": 4.440176963806152, + "learning_rate": 4.325357015245985e-05, + "loss": 1.8315, + "step": 3461 + }, + { + "epoch": 0.558567279767667, + "grad_norm": 4.729047775268555, + "learning_rate": 4.322768232051407e-05, + "loss": 2.0323, + "step": 3462 + }, + { + "epoch": 0.5587286221361729, + "grad_norm": 5.698724746704102, + "learning_rate": 4.320179633783317e-05, + "loss": 2.3371, + "step": 3463 + }, + { + "epoch": 0.558889964504679, + "grad_norm": 3.6514177322387695, + "learning_rate": 4.317591221148557e-05, + "loss": 1.6228, + "step": 3464 + }, + { + "epoch": 0.5590513068731849, + "grad_norm": 5.009037971496582, + "learning_rate": 4.315002994853931e-05, + "loss": 1.9361, + "step": 3465 + }, + { + "epoch": 0.5592126492416909, + "grad_norm": 4.258859634399414, + "learning_rate": 4.312414955606181e-05, + "loss": 2.0629, + "step": 3466 + }, + { + "epoch": 0.5593739916101969, + "grad_norm": 4.319770812988281, + "learning_rate": 4.3098271041120076e-05, + "loss": 1.9236, + "step": 3467 + }, + { + "epoch": 0.5595353339787028, + "grad_norm": 4.81157112121582, + "learning_rate": 4.3072394410780515e-05, + "loss": 1.85, + "step": 3468 + }, + { + "epoch": 0.5596966763472088, + "grad_norm": 4.622674942016602, + "learning_rate": 4.3046519672109084e-05, + "loss": 1.7307, + "step": 3469 + }, + { + "epoch": 0.5598580187157147, + "grad_norm": 4.097692489624023, + "learning_rate": 4.30206468321712e-05, + "loss": 1.8736, + "step": 3470 + }, + { + "epoch": 0.5600193610842207, + "grad_norm": 4.507349967956543, + "learning_rate": 4.2994775898031726e-05, + "loss": 1.8505, + "step": 3471 + }, + { + "epoch": 0.5601807034527266, + "grad_norm": 3.4053497314453125, + "learning_rate": 4.29689068767551e-05, + "loss": 1.9851, + "step": 3472 + }, + { + "epoch": 0.5603420458212327, + "grad_norm": 5.3332624435424805, + "learning_rate": 4.2943039775405116e-05, + "loss": 1.9102, + "step": 3473 + }, + { + "epoch": 0.5605033881897387, + "grad_norm": 4.859854698181152, + "learning_rate": 4.291717460104516e-05, + "loss": 1.8431, + "step": 3474 + }, + { + "epoch": 0.5606647305582446, + "grad_norm": 5.216459274291992, + "learning_rate": 4.289131136073799e-05, + "loss": 2.0909, + "step": 3475 + }, + { + "epoch": 0.5608260729267506, + "grad_norm": 5.3039727210998535, + "learning_rate": 4.286545006154591e-05, + "loss": 1.9839, + "step": 3476 + }, + { + "epoch": 0.5609874152952565, + "grad_norm": 3.725084066390991, + "learning_rate": 4.283959071053066e-05, + "loss": 1.6874, + "step": 3477 + }, + { + "epoch": 0.5611487576637625, + "grad_norm": 4.183096885681152, + "learning_rate": 4.281373331475347e-05, + "loss": 1.7348, + "step": 3478 + }, + { + "epoch": 0.5613101000322684, + "grad_norm": 5.283970832824707, + "learning_rate": 4.2787877881274974e-05, + "loss": 1.8059, + "step": 3479 + }, + { + "epoch": 0.5614714424007744, + "grad_norm": 7.493622303009033, + "learning_rate": 4.276202441715538e-05, + "loss": 2.048, + "step": 3480 + }, + { + "epoch": 0.5616327847692804, + "grad_norm": 3.493828773498535, + "learning_rate": 4.273617292945425e-05, + "loss": 2.0378, + "step": 3481 + }, + { + "epoch": 0.5617941271377864, + "grad_norm": 4.150033473968506, + "learning_rate": 4.2710323425230644e-05, + "loss": 1.8063, + "step": 3482 + }, + { + "epoch": 0.5619554695062924, + "grad_norm": 3.346498966217041, + "learning_rate": 4.2684475911543145e-05, + "loss": 1.9965, + "step": 3483 + }, + { + "epoch": 0.5621168118747983, + "grad_norm": 7.276752471923828, + "learning_rate": 4.2658630395449665e-05, + "loss": 2.0183, + "step": 3484 + }, + { + "epoch": 0.5622781542433043, + "grad_norm": 4.562815189361572, + "learning_rate": 4.26327868840077e-05, + "loss": 1.982, + "step": 3485 + }, + { + "epoch": 0.5624394966118103, + "grad_norm": 4.3584394454956055, + "learning_rate": 4.26069453842741e-05, + "loss": 1.5823, + "step": 3486 + }, + { + "epoch": 0.5626008389803162, + "grad_norm": 3.395272970199585, + "learning_rate": 4.258110590330523e-05, + "loss": 2.1431, + "step": 3487 + }, + { + "epoch": 0.5627621813488222, + "grad_norm": 3.7238121032714844, + "learning_rate": 4.255526844815685e-05, + "loss": 2.133, + "step": 3488 + }, + { + "epoch": 0.5629235237173281, + "grad_norm": 3.6202895641326904, + "learning_rate": 4.252943302588423e-05, + "loss": 1.9198, + "step": 3489 + }, + { + "epoch": 0.5630848660858342, + "grad_norm": 5.4320902824401855, + "learning_rate": 4.2503599643542024e-05, + "loss": 2.0379, + "step": 3490 + }, + { + "epoch": 0.5632462084543401, + "grad_norm": 5.918498516082764, + "learning_rate": 4.247776830818439e-05, + "loss": 1.9996, + "step": 3491 + }, + { + "epoch": 0.5634075508228461, + "grad_norm": 4.040607929229736, + "learning_rate": 4.245193902686483e-05, + "loss": 1.7064, + "step": 3492 + }, + { + "epoch": 0.5635688931913521, + "grad_norm": 4.910560131072998, + "learning_rate": 4.2426111806636415e-05, + "loss": 1.8541, + "step": 3493 + }, + { + "epoch": 0.563730235559858, + "grad_norm": 4.844120025634766, + "learning_rate": 4.240028665455156e-05, + "loss": 2.1311, + "step": 3494 + }, + { + "epoch": 0.563891577928364, + "grad_norm": 4.727542877197266, + "learning_rate": 4.2374463577662116e-05, + "loss": 2.1812, + "step": 3495 + }, + { + "epoch": 0.5640529202968699, + "grad_norm": 4.263223648071289, + "learning_rate": 4.234864258301943e-05, + "loss": 1.9294, + "step": 3496 + }, + { + "epoch": 0.5642142626653759, + "grad_norm": 4.178357124328613, + "learning_rate": 4.232282367767422e-05, + "loss": 1.8706, + "step": 3497 + }, + { + "epoch": 0.5643756050338818, + "grad_norm": 4.068645000457764, + "learning_rate": 4.229700686867668e-05, + "loss": 2.0063, + "step": 3498 + }, + { + "epoch": 0.5645369474023879, + "grad_norm": 3.6699161529541016, + "learning_rate": 4.227119216307637e-05, + "loss": 2.024, + "step": 3499 + }, + { + "epoch": 0.5646982897708939, + "grad_norm": 4.167466163635254, + "learning_rate": 4.224537956792235e-05, + "loss": 2.0087, + "step": 3500 + }, + { + "epoch": 0.5648596321393998, + "grad_norm": 4.946616172790527, + "learning_rate": 4.221956909026304e-05, + "loss": 2.1271, + "step": 3501 + }, + { + "epoch": 0.5650209745079058, + "grad_norm": 5.21272087097168, + "learning_rate": 4.2193760737146346e-05, + "loss": 2.0256, + "step": 3502 + }, + { + "epoch": 0.5651823168764117, + "grad_norm": 5.247081279754639, + "learning_rate": 4.21679545156195e-05, + "loss": 1.7813, + "step": 3503 + }, + { + "epoch": 0.5653436592449177, + "grad_norm": 3.512169599533081, + "learning_rate": 4.214215043272928e-05, + "loss": 1.9385, + "step": 3504 + }, + { + "epoch": 0.5655050016134237, + "grad_norm": 5.469997406005859, + "learning_rate": 4.211634849552175e-05, + "loss": 1.9733, + "step": 3505 + }, + { + "epoch": 0.5656663439819296, + "grad_norm": 4.067634582519531, + "learning_rate": 4.209054871104249e-05, + "loss": 1.8343, + "step": 3506 + }, + { + "epoch": 0.5658276863504357, + "grad_norm": 5.573502540588379, + "learning_rate": 4.2064751086336405e-05, + "loss": 1.5979, + "step": 3507 + }, + { + "epoch": 0.5659890287189416, + "grad_norm": 3.959174156188965, + "learning_rate": 4.203895562844789e-05, + "loss": 1.8609, + "step": 3508 + }, + { + "epoch": 0.5661503710874476, + "grad_norm": 3.4906005859375, + "learning_rate": 4.2013162344420695e-05, + "loss": 1.7882, + "step": 3509 + }, + { + "epoch": 0.5663117134559535, + "grad_norm": 4.062140941619873, + "learning_rate": 4.198737124129799e-05, + "loss": 2.0697, + "step": 3510 + }, + { + "epoch": 0.5664730558244595, + "grad_norm": 5.04123592376709, + "learning_rate": 4.196158232612238e-05, + "loss": 1.9035, + "step": 3511 + }, + { + "epoch": 0.5666343981929655, + "grad_norm": 3.7623865604400635, + "learning_rate": 4.193579560593581e-05, + "loss": 2.055, + "step": 3512 + }, + { + "epoch": 0.5667957405614714, + "grad_norm": 5.195204257965088, + "learning_rate": 4.19100110877797e-05, + "loss": 1.8476, + "step": 3513 + }, + { + "epoch": 0.5669570829299774, + "grad_norm": 4.111303329467773, + "learning_rate": 4.188422877869481e-05, + "loss": 1.6723, + "step": 3514 + }, + { + "epoch": 0.5671184252984833, + "grad_norm": 4.042357921600342, + "learning_rate": 4.1858448685721306e-05, + "loss": 1.8876, + "step": 3515 + }, + { + "epoch": 0.5672797676669894, + "grad_norm": 3.8034486770629883, + "learning_rate": 4.183267081589878e-05, + "loss": 1.9028, + "step": 3516 + }, + { + "epoch": 0.5674411100354954, + "grad_norm": 7.3269853591918945, + "learning_rate": 4.18068951762662e-05, + "loss": 1.7978, + "step": 3517 + }, + { + "epoch": 0.5676024524040013, + "grad_norm": 4.7104692459106445, + "learning_rate": 4.178112177386192e-05, + "loss": 1.8569, + "step": 3518 + }, + { + "epoch": 0.5677637947725073, + "grad_norm": 4.491191387176514, + "learning_rate": 4.175535061572365e-05, + "loss": 2.2193, + "step": 3519 + }, + { + "epoch": 0.5679251371410132, + "grad_norm": 4.737609386444092, + "learning_rate": 4.172958170888858e-05, + "loss": 1.9274, + "step": 3520 + }, + { + "epoch": 0.5680864795095192, + "grad_norm": 4.3185343742370605, + "learning_rate": 4.170381506039317e-05, + "loss": 1.8375, + "step": 3521 + }, + { + "epoch": 0.5682478218780251, + "grad_norm": 5.701799392700195, + "learning_rate": 4.1678050677273375e-05, + "loss": 1.8371, + "step": 3522 + }, + { + "epoch": 0.5684091642465311, + "grad_norm": 4.154476642608643, + "learning_rate": 4.165228856656443e-05, + "loss": 1.9134, + "step": 3523 + }, + { + "epoch": 0.5685705066150372, + "grad_norm": 5.257763862609863, + "learning_rate": 4.162652873530104e-05, + "loss": 1.8649, + "step": 3524 + }, + { + "epoch": 0.5687318489835431, + "grad_norm": 4.1844048500061035, + "learning_rate": 4.1600771190517216e-05, + "loss": 1.8578, + "step": 3525 + }, + { + "epoch": 0.5688931913520491, + "grad_norm": 4.613462924957275, + "learning_rate": 4.1575015939246384e-05, + "loss": 2.0878, + "step": 3526 + }, + { + "epoch": 0.569054533720555, + "grad_norm": 4.987559795379639, + "learning_rate": 4.154926298852131e-05, + "loss": 1.9917, + "step": 3527 + }, + { + "epoch": 0.569215876089061, + "grad_norm": 5.124747276306152, + "learning_rate": 4.15235123453742e-05, + "loss": 1.9757, + "step": 3528 + }, + { + "epoch": 0.569377218457567, + "grad_norm": 3.884608745574951, + "learning_rate": 4.149776401683654e-05, + "loss": 1.8557, + "step": 3529 + }, + { + "epoch": 0.5695385608260729, + "grad_norm": 4.390828609466553, + "learning_rate": 4.147201800993926e-05, + "loss": 1.8436, + "step": 3530 + }, + { + "epoch": 0.5696999031945789, + "grad_norm": 4.279433727264404, + "learning_rate": 4.144627433171262e-05, + "loss": 1.9243, + "step": 3531 + }, + { + "epoch": 0.5698612455630848, + "grad_norm": 3.7821900844573975, + "learning_rate": 4.142053298918622e-05, + "loss": 2.0182, + "step": 3532 + }, + { + "epoch": 0.5700225879315909, + "grad_norm": 3.6471571922302246, + "learning_rate": 4.139479398938909e-05, + "loss": 1.8736, + "step": 3533 + }, + { + "epoch": 0.5701839303000968, + "grad_norm": 4.540952205657959, + "learning_rate": 4.136905733934955e-05, + "loss": 2.1067, + "step": 3534 + }, + { + "epoch": 0.5703452726686028, + "grad_norm": 4.5960469245910645, + "learning_rate": 4.134332304609533e-05, + "loss": 1.7422, + "step": 3535 + }, + { + "epoch": 0.5705066150371088, + "grad_norm": 5.430549621582031, + "learning_rate": 4.131759111665349e-05, + "loss": 2.1988, + "step": 3536 + }, + { + "epoch": 0.5706679574056147, + "grad_norm": 5.013370990753174, + "learning_rate": 4.1291861558050456e-05, + "loss": 2.0637, + "step": 3537 + }, + { + "epoch": 0.5708292997741207, + "grad_norm": 3.6673946380615234, + "learning_rate": 4.126613437731197e-05, + "loss": 1.6338, + "step": 3538 + }, + { + "epoch": 0.5709906421426266, + "grad_norm": 4.181926250457764, + "learning_rate": 4.1240409581463206e-05, + "loss": 1.7872, + "step": 3539 + }, + { + "epoch": 0.5711519845111326, + "grad_norm": 3.8125319480895996, + "learning_rate": 4.121468717752859e-05, + "loss": 1.7137, + "step": 3540 + }, + { + "epoch": 0.5713133268796385, + "grad_norm": 5.434143543243408, + "learning_rate": 4.118896717253199e-05, + "loss": 2.1284, + "step": 3541 + }, + { + "epoch": 0.5714746692481446, + "grad_norm": 5.147502422332764, + "learning_rate": 4.116324957349652e-05, + "loss": 1.7338, + "step": 3542 + }, + { + "epoch": 0.5716360116166506, + "grad_norm": 4.157900810241699, + "learning_rate": 4.113753438744472e-05, + "loss": 1.8178, + "step": 3543 + }, + { + "epoch": 0.5717973539851565, + "grad_norm": 4.950132846832275, + "learning_rate": 4.1111821621398446e-05, + "loss": 1.7818, + "step": 3544 + }, + { + "epoch": 0.5719586963536625, + "grad_norm": 5.8169732093811035, + "learning_rate": 4.1086111282378846e-05, + "loss": 1.8208, + "step": 3545 + }, + { + "epoch": 0.5721200387221684, + "grad_norm": 3.5307111740112305, + "learning_rate": 4.1060403377406486e-05, + "loss": 1.8114, + "step": 3546 + }, + { + "epoch": 0.5722813810906744, + "grad_norm": 4.841145038604736, + "learning_rate": 4.103469791350119e-05, + "loss": 1.7671, + "step": 3547 + }, + { + "epoch": 0.5724427234591803, + "grad_norm": 3.4705560207366943, + "learning_rate": 4.100899489768219e-05, + "loss": 1.8366, + "step": 3548 + }, + { + "epoch": 0.5726040658276863, + "grad_norm": 3.809705972671509, + "learning_rate": 4.098329433696797e-05, + "loss": 2.1217, + "step": 3549 + }, + { + "epoch": 0.5727654081961924, + "grad_norm": 4.063918590545654, + "learning_rate": 4.095759623837643e-05, + "loss": 1.8421, + "step": 3550 + }, + { + "epoch": 0.5729267505646983, + "grad_norm": 6.015925407409668, + "learning_rate": 4.093190060892471e-05, + "loss": 1.6947, + "step": 3551 + }, + { + "epoch": 0.5730880929332043, + "grad_norm": 5.437800407409668, + "learning_rate": 4.090620745562935e-05, + "loss": 1.9608, + "step": 3552 + }, + { + "epoch": 0.5732494353017102, + "grad_norm": 4.587212562561035, + "learning_rate": 4.088051678550617e-05, + "loss": 1.965, + "step": 3553 + }, + { + "epoch": 0.5734107776702162, + "grad_norm": 5.168732166290283, + "learning_rate": 4.085482860557033e-05, + "loss": 1.936, + "step": 3554 + }, + { + "epoch": 0.5735721200387222, + "grad_norm": 4.875430583953857, + "learning_rate": 4.0829142922836284e-05, + "loss": 1.8593, + "step": 3555 + }, + { + "epoch": 0.5737334624072281, + "grad_norm": 4.574604511260986, + "learning_rate": 4.0803459744317854e-05, + "loss": 1.943, + "step": 3556 + }, + { + "epoch": 0.5738948047757341, + "grad_norm": 3.4781699180603027, + "learning_rate": 4.077777907702814e-05, + "loss": 1.8509, + "step": 3557 + }, + { + "epoch": 0.57405614714424, + "grad_norm": 6.551433563232422, + "learning_rate": 4.0752100927979535e-05, + "loss": 2.0652, + "step": 3558 + }, + { + "epoch": 0.5742174895127461, + "grad_norm": 4.630098819732666, + "learning_rate": 4.072642530418382e-05, + "loss": 1.8877, + "step": 3559 + }, + { + "epoch": 0.574378831881252, + "grad_norm": 5.285862445831299, + "learning_rate": 4.0700752212651996e-05, + "loss": 2.2135, + "step": 3560 + }, + { + "epoch": 0.574540174249758, + "grad_norm": 3.9878244400024414, + "learning_rate": 4.067508166039446e-05, + "loss": 1.8804, + "step": 3561 + }, + { + "epoch": 0.574701516618264, + "grad_norm": 5.5384979248046875, + "learning_rate": 4.064941365442084e-05, + "loss": 1.9407, + "step": 3562 + }, + { + "epoch": 0.5748628589867699, + "grad_norm": 5.246916770935059, + "learning_rate": 4.062374820174013e-05, + "loss": 1.9085, + "step": 3563 + }, + { + "epoch": 0.5750242013552759, + "grad_norm": 4.07591438293457, + "learning_rate": 4.0598085309360575e-05, + "loss": 1.792, + "step": 3564 + }, + { + "epoch": 0.5751855437237818, + "grad_norm": 3.93082594871521, + "learning_rate": 4.057242498428976e-05, + "loss": 1.9746, + "step": 3565 + }, + { + "epoch": 0.5753468860922878, + "grad_norm": 4.674701690673828, + "learning_rate": 4.0546767233534525e-05, + "loss": 1.938, + "step": 3566 + }, + { + "epoch": 0.5755082284607939, + "grad_norm": 4.735285758972168, + "learning_rate": 4.0521112064101075e-05, + "loss": 1.6369, + "step": 3567 + }, + { + "epoch": 0.5756695708292998, + "grad_norm": 3.8902482986450195, + "learning_rate": 4.049545948299482e-05, + "loss": 2.0786, + "step": 3568 + }, + { + "epoch": 0.5758309131978058, + "grad_norm": 3.805572509765625, + "learning_rate": 4.046980949722058e-05, + "loss": 1.7551, + "step": 3569 + }, + { + "epoch": 0.5759922555663117, + "grad_norm": 5.669454097747803, + "learning_rate": 4.044416211378236e-05, + "loss": 2.1775, + "step": 3570 + }, + { + "epoch": 0.5761535979348177, + "grad_norm": 4.8873820304870605, + "learning_rate": 4.0418517339683474e-05, + "loss": 1.8586, + "step": 3571 + }, + { + "epoch": 0.5763149403033236, + "grad_norm": 6.248045921325684, + "learning_rate": 4.039287518192659e-05, + "loss": 1.9961, + "step": 3572 + }, + { + "epoch": 0.5764762826718296, + "grad_norm": 4.519307613372803, + "learning_rate": 4.036723564751358e-05, + "loss": 2.2647, + "step": 3573 + }, + { + "epoch": 0.5766376250403356, + "grad_norm": 4.497463226318359, + "learning_rate": 4.034159874344566e-05, + "loss": 1.5972, + "step": 3574 + }, + { + "epoch": 0.5767989674088415, + "grad_norm": 4.406150817871094, + "learning_rate": 4.031596447672328e-05, + "loss": 1.9623, + "step": 3575 + }, + { + "epoch": 0.5769603097773476, + "grad_norm": 4.784985542297363, + "learning_rate": 4.029033285434623e-05, + "loss": 1.8943, + "step": 3576 + }, + { + "epoch": 0.5771216521458535, + "grad_norm": 4.867000102996826, + "learning_rate": 4.026470388331347e-05, + "loss": 1.8319, + "step": 3577 + }, + { + "epoch": 0.5772829945143595, + "grad_norm": 5.884014129638672, + "learning_rate": 4.0239077570623385e-05, + "loss": 1.8488, + "step": 3578 + }, + { + "epoch": 0.5774443368828654, + "grad_norm": 4.11932897567749, + "learning_rate": 4.02134539232735e-05, + "loss": 1.8305, + "step": 3579 + }, + { + "epoch": 0.5776056792513714, + "grad_norm": 6.096425533294678, + "learning_rate": 4.0187832948260705e-05, + "loss": 1.7353, + "step": 3580 + }, + { + "epoch": 0.5777670216198774, + "grad_norm": 4.391887664794922, + "learning_rate": 4.0162214652581116e-05, + "loss": 1.8747, + "step": 3581 + }, + { + "epoch": 0.5779283639883833, + "grad_norm": 5.163760185241699, + "learning_rate": 4.013659904323009e-05, + "loss": 1.913, + "step": 3582 + }, + { + "epoch": 0.5780897063568893, + "grad_norm": 5.652235507965088, + "learning_rate": 4.011098612720233e-05, + "loss": 2.1561, + "step": 3583 + }, + { + "epoch": 0.5782510487253952, + "grad_norm": 5.187814235687256, + "learning_rate": 4.0085375911491733e-05, + "loss": 2.0876, + "step": 3584 + }, + { + "epoch": 0.5784123910939013, + "grad_norm": 4.556102752685547, + "learning_rate": 4.0059768403091505e-05, + "loss": 1.8114, + "step": 3585 + }, + { + "epoch": 0.5785737334624073, + "grad_norm": 4.740772247314453, + "learning_rate": 4.0034163608994045e-05, + "loss": 1.6901, + "step": 3586 + }, + { + "epoch": 0.5787350758309132, + "grad_norm": 3.779283046722412, + "learning_rate": 4.000856153619112e-05, + "loss": 1.9207, + "step": 3587 + }, + { + "epoch": 0.5788964181994192, + "grad_norm": 6.24001407623291, + "learning_rate": 3.998296219167364e-05, + "loss": 1.7858, + "step": 3588 + }, + { + "epoch": 0.5790577605679251, + "grad_norm": 5.309969902038574, + "learning_rate": 3.995736558243186e-05, + "loss": 1.8324, + "step": 3589 + }, + { + "epoch": 0.5792191029364311, + "grad_norm": 4.181309223175049, + "learning_rate": 3.993177171545522e-05, + "loss": 2.1065, + "step": 3590 + }, + { + "epoch": 0.579380445304937, + "grad_norm": 4.735886573791504, + "learning_rate": 3.990618059773247e-05, + "loss": 2.1249, + "step": 3591 + }, + { + "epoch": 0.579541787673443, + "grad_norm": 5.766643047332764, + "learning_rate": 3.9880592236251554e-05, + "loss": 1.9642, + "step": 3592 + }, + { + "epoch": 0.5797031300419491, + "grad_norm": 3.530975818634033, + "learning_rate": 3.985500663799972e-05, + "loss": 1.9122, + "step": 3593 + }, + { + "epoch": 0.579864472410455, + "grad_norm": 5.174084663391113, + "learning_rate": 3.982942380996338e-05, + "loss": 1.9706, + "step": 3594 + }, + { + "epoch": 0.580025814778961, + "grad_norm": 3.8036489486694336, + "learning_rate": 3.980384375912829e-05, + "loss": 1.6745, + "step": 3595 + }, + { + "epoch": 0.5801871571474669, + "grad_norm": 3.690394639968872, + "learning_rate": 3.977826649247938e-05, + "loss": 2.0887, + "step": 3596 + }, + { + "epoch": 0.5803484995159729, + "grad_norm": 5.070824146270752, + "learning_rate": 3.9752692017000827e-05, + "loss": 1.8814, + "step": 3597 + }, + { + "epoch": 0.5805098418844788, + "grad_norm": 3.071666955947876, + "learning_rate": 3.9727120339676084e-05, + "loss": 1.8724, + "step": 3598 + }, + { + "epoch": 0.5806711842529848, + "grad_norm": 7.061601638793945, + "learning_rate": 3.9701551467487776e-05, + "loss": 2.0402, + "step": 3599 + }, + { + "epoch": 0.5808325266214908, + "grad_norm": 5.6859002113342285, + "learning_rate": 3.9675985407417836e-05, + "loss": 1.9293, + "step": 3600 + }, + { + "epoch": 0.5809938689899967, + "grad_norm": 4.9450554847717285, + "learning_rate": 3.965042216644738e-05, + "loss": 1.7457, + "step": 3601 + }, + { + "epoch": 0.5811552113585028, + "grad_norm": 4.177241325378418, + "learning_rate": 3.962486175155675e-05, + "loss": 1.8122, + "step": 3602 + }, + { + "epoch": 0.5813165537270087, + "grad_norm": 5.32080078125, + "learning_rate": 3.9599304169725545e-05, + "loss": 1.9454, + "step": 3603 + }, + { + "epoch": 0.5814778960955147, + "grad_norm": 5.042309284210205, + "learning_rate": 3.957374942793258e-05, + "loss": 1.8161, + "step": 3604 + }, + { + "epoch": 0.5816392384640207, + "grad_norm": 5.364151477813721, + "learning_rate": 3.954819753315588e-05, + "loss": 1.9463, + "step": 3605 + }, + { + "epoch": 0.5818005808325266, + "grad_norm": 4.314451694488525, + "learning_rate": 3.9522648492372735e-05, + "loss": 1.6316, + "step": 3606 + }, + { + "epoch": 0.5819619232010326, + "grad_norm": 4.097662925720215, + "learning_rate": 3.94971023125596e-05, + "loss": 1.9523, + "step": 3607 + }, + { + "epoch": 0.5821232655695385, + "grad_norm": 5.013316631317139, + "learning_rate": 3.947155900069216e-05, + "loss": 1.9738, + "step": 3608 + }, + { + "epoch": 0.5822846079380445, + "grad_norm": 4.259307384490967, + "learning_rate": 3.944601856374537e-05, + "loss": 1.6727, + "step": 3609 + }, + { + "epoch": 0.5824459503065506, + "grad_norm": 4.392331600189209, + "learning_rate": 3.942048100869333e-05, + "loss": 1.9385, + "step": 3610 + }, + { + "epoch": 0.5826072926750565, + "grad_norm": 3.544884204864502, + "learning_rate": 3.939494634250941e-05, + "loss": 1.6515, + "step": 3611 + }, + { + "epoch": 0.5827686350435625, + "grad_norm": 4.88466739654541, + "learning_rate": 3.936941457216614e-05, + "loss": 1.7893, + "step": 3612 + }, + { + "epoch": 0.5829299774120684, + "grad_norm": 4.783680438995361, + "learning_rate": 3.934388570463531e-05, + "loss": 2.0079, + "step": 3613 + }, + { + "epoch": 0.5830913197805744, + "grad_norm": 4.30138635635376, + "learning_rate": 3.931835974688785e-05, + "loss": 1.9392, + "step": 3614 + }, + { + "epoch": 0.5832526621490803, + "grad_norm": 3.4884119033813477, + "learning_rate": 3.929283670589399e-05, + "loss": 1.8283, + "step": 3615 + }, + { + "epoch": 0.5834140045175863, + "grad_norm": 5.524956703186035, + "learning_rate": 3.926731658862307e-05, + "loss": 1.9238, + "step": 3616 + }, + { + "epoch": 0.5835753468860922, + "grad_norm": 5.218562126159668, + "learning_rate": 3.9241799402043705e-05, + "loss": 1.9534, + "step": 3617 + }, + { + "epoch": 0.5837366892545982, + "grad_norm": 5.083594799041748, + "learning_rate": 3.9216285153123646e-05, + "loss": 2.1464, + "step": 3618 + }, + { + "epoch": 0.5838980316231043, + "grad_norm": 4.488029956817627, + "learning_rate": 3.919077384882991e-05, + "loss": 1.8756, + "step": 3619 + }, + { + "epoch": 0.5840593739916102, + "grad_norm": 6.185628414154053, + "learning_rate": 3.9165265496128644e-05, + "loss": 1.8962, + "step": 3620 + }, + { + "epoch": 0.5842207163601162, + "grad_norm": 5.930531024932861, + "learning_rate": 3.9139760101985225e-05, + "loss": 2.1463, + "step": 3621 + }, + { + "epoch": 0.5843820587286221, + "grad_norm": 7.0362629890441895, + "learning_rate": 3.911425767336421e-05, + "loss": 1.9975, + "step": 3622 + }, + { + "epoch": 0.5845434010971281, + "grad_norm": 4.510569095611572, + "learning_rate": 3.908875821722937e-05, + "loss": 1.8865, + "step": 3623 + }, + { + "epoch": 0.5847047434656341, + "grad_norm": 7.450949192047119, + "learning_rate": 3.9063261740543636e-05, + "loss": 1.6148, + "step": 3624 + }, + { + "epoch": 0.58486608583414, + "grad_norm": 5.08569860458374, + "learning_rate": 3.903776825026911e-05, + "loss": 1.8569, + "step": 3625 + }, + { + "epoch": 0.585027428202646, + "grad_norm": 3.9076004028320312, + "learning_rate": 3.901227775336715e-05, + "loss": 1.9772, + "step": 3626 + }, + { + "epoch": 0.585188770571152, + "grad_norm": 4.3509955406188965, + "learning_rate": 3.8986790256798214e-05, + "loss": 1.8251, + "step": 3627 + }, + { + "epoch": 0.585350112939658, + "grad_norm": 6.478168964385986, + "learning_rate": 3.896130576752201e-05, + "loss": 1.8171, + "step": 3628 + }, + { + "epoch": 0.585511455308164, + "grad_norm": 4.264913082122803, + "learning_rate": 3.893582429249735e-05, + "loss": 1.9046, + "step": 3629 + }, + { + "epoch": 0.5856727976766699, + "grad_norm": 5.290229320526123, + "learning_rate": 3.891034583868231e-05, + "loss": 1.9554, + "step": 3630 + }, + { + "epoch": 0.5858341400451759, + "grad_norm": 4.591266632080078, + "learning_rate": 3.8884870413034064e-05, + "loss": 1.9888, + "step": 3631 + }, + { + "epoch": 0.5859954824136818, + "grad_norm": 4.453161239624023, + "learning_rate": 3.885939802250901e-05, + "loss": 2.1186, + "step": 3632 + }, + { + "epoch": 0.5861568247821878, + "grad_norm": 4.925012111663818, + "learning_rate": 3.883392867406269e-05, + "loss": 1.849, + "step": 3633 + }, + { + "epoch": 0.5863181671506937, + "grad_norm": 5.076260089874268, + "learning_rate": 3.8808462374649803e-05, + "loss": 1.8674, + "step": 3634 + }, + { + "epoch": 0.5864795095191997, + "grad_norm": 4.231884002685547, + "learning_rate": 3.878299913122427e-05, + "loss": 1.9519, + "step": 3635 + }, + { + "epoch": 0.5866408518877058, + "grad_norm": 3.9470291137695312, + "learning_rate": 3.875753895073913e-05, + "loss": 2.0958, + "step": 3636 + }, + { + "epoch": 0.5868021942562117, + "grad_norm": 3.966118335723877, + "learning_rate": 3.87320818401466e-05, + "loss": 1.689, + "step": 3637 + }, + { + "epoch": 0.5869635366247177, + "grad_norm": 4.230781078338623, + "learning_rate": 3.8706627806398046e-05, + "loss": 1.8142, + "step": 3638 + }, + { + "epoch": 0.5871248789932236, + "grad_norm": 3.732171058654785, + "learning_rate": 3.868117685644403e-05, + "loss": 1.8904, + "step": 3639 + }, + { + "epoch": 0.5872862213617296, + "grad_norm": 4.025249004364014, + "learning_rate": 3.865572899723422e-05, + "loss": 2.0185, + "step": 3640 + }, + { + "epoch": 0.5874475637302355, + "grad_norm": 4.11497163772583, + "learning_rate": 3.8630284235717504e-05, + "loss": 1.7455, + "step": 3641 + }, + { + "epoch": 0.5876089060987415, + "grad_norm": 4.722634792327881, + "learning_rate": 3.860484257884184e-05, + "loss": 1.8329, + "step": 3642 + }, + { + "epoch": 0.5877702484672475, + "grad_norm": 4.155136585235596, + "learning_rate": 3.857940403355444e-05, + "loss": 1.8166, + "step": 3643 + }, + { + "epoch": 0.5879315908357534, + "grad_norm": 4.59343957901001, + "learning_rate": 3.855396860680155e-05, + "loss": 2.0221, + "step": 3644 + }, + { + "epoch": 0.5880929332042595, + "grad_norm": 4.209593296051025, + "learning_rate": 3.8528536305528695e-05, + "loss": 1.9369, + "step": 3645 + }, + { + "epoch": 0.5882542755727654, + "grad_norm": 4.7490363121032715, + "learning_rate": 3.850310713668044e-05, + "loss": 1.8632, + "step": 3646 + }, + { + "epoch": 0.5884156179412714, + "grad_norm": 4.539514541625977, + "learning_rate": 3.847768110720052e-05, + "loss": 1.9854, + "step": 3647 + }, + { + "epoch": 0.5885769603097774, + "grad_norm": 3.969621419906616, + "learning_rate": 3.845225822403186e-05, + "loss": 1.8744, + "step": 3648 + }, + { + "epoch": 0.5887383026782833, + "grad_norm": 4.80803108215332, + "learning_rate": 3.842683849411646e-05, + "loss": 2.061, + "step": 3649 + }, + { + "epoch": 0.5888996450467893, + "grad_norm": 5.44177770614624, + "learning_rate": 3.840142192439552e-05, + "loss": 2.037, + "step": 3650 + }, + { + "epoch": 0.5890609874152952, + "grad_norm": 4.776845932006836, + "learning_rate": 3.837600852180933e-05, + "loss": 1.8394, + "step": 3651 + }, + { + "epoch": 0.5892223297838012, + "grad_norm": 3.8401901721954346, + "learning_rate": 3.835059829329735e-05, + "loss": 1.9709, + "step": 3652 + }, + { + "epoch": 0.5893836721523072, + "grad_norm": 4.661105155944824, + "learning_rate": 3.832519124579811e-05, + "loss": 1.9098, + "step": 3653 + }, + { + "epoch": 0.5895450145208132, + "grad_norm": 4.342999458312988, + "learning_rate": 3.829978738624938e-05, + "loss": 1.9601, + "step": 3654 + }, + { + "epoch": 0.5897063568893192, + "grad_norm": 5.721419334411621, + "learning_rate": 3.827438672158795e-05, + "loss": 1.7262, + "step": 3655 + }, + { + "epoch": 0.5898676992578251, + "grad_norm": 3.692500352859497, + "learning_rate": 3.824898925874982e-05, + "loss": 2.0584, + "step": 3656 + }, + { + "epoch": 0.5900290416263311, + "grad_norm": 5.073899269104004, + "learning_rate": 3.822359500467006e-05, + "loss": 2.1524, + "step": 3657 + }, + { + "epoch": 0.590190383994837, + "grad_norm": 3.4609923362731934, + "learning_rate": 3.819820396628287e-05, + "loss": 1.9398, + "step": 3658 + }, + { + "epoch": 0.590351726363343, + "grad_norm": 4.638036251068115, + "learning_rate": 3.8172816150521616e-05, + "loss": 1.7962, + "step": 3659 + }, + { + "epoch": 0.590513068731849, + "grad_norm": 3.909994125366211, + "learning_rate": 3.8147431564318745e-05, + "loss": 2.0509, + "step": 3660 + }, + { + "epoch": 0.5906744111003549, + "grad_norm": 5.345953464508057, + "learning_rate": 3.812205021460582e-05, + "loss": 1.9027, + "step": 3661 + }, + { + "epoch": 0.590835753468861, + "grad_norm": 4.3474626541137695, + "learning_rate": 3.809667210831353e-05, + "loss": 1.7642, + "step": 3662 + }, + { + "epoch": 0.5909970958373669, + "grad_norm": 4.156850814819336, + "learning_rate": 3.807129725237171e-05, + "loss": 2.0627, + "step": 3663 + }, + { + "epoch": 0.5911584382058729, + "grad_norm": 4.340262413024902, + "learning_rate": 3.8045925653709233e-05, + "loss": 2.0942, + "step": 3664 + }, + { + "epoch": 0.5913197805743788, + "grad_norm": 5.7656683921813965, + "learning_rate": 3.8020557319254174e-05, + "loss": 2.28, + "step": 3665 + }, + { + "epoch": 0.5914811229428848, + "grad_norm": 5.124190807342529, + "learning_rate": 3.799519225593362e-05, + "loss": 2.0985, + "step": 3666 + }, + { + "epoch": 0.5916424653113908, + "grad_norm": 3.294722557067871, + "learning_rate": 3.7969830470673866e-05, + "loss": 1.9777, + "step": 3667 + }, + { + "epoch": 0.5918038076798967, + "grad_norm": 5.506333351135254, + "learning_rate": 3.794447197040022e-05, + "loss": 1.8562, + "step": 3668 + }, + { + "epoch": 0.5919651500484027, + "grad_norm": 4.190850734710693, + "learning_rate": 3.7919116762037146e-05, + "loss": 1.8515, + "step": 3669 + }, + { + "epoch": 0.5921264924169087, + "grad_norm": 4.269705295562744, + "learning_rate": 3.789376485250821e-05, + "loss": 1.9124, + "step": 3670 + }, + { + "epoch": 0.5922878347854147, + "grad_norm": 5.131028175354004, + "learning_rate": 3.786841624873604e-05, + "loss": 1.8684, + "step": 3671 + }, + { + "epoch": 0.5924491771539206, + "grad_norm": 3.8137192726135254, + "learning_rate": 3.784307095764241e-05, + "loss": 1.8912, + "step": 3672 + }, + { + "epoch": 0.5926105195224266, + "grad_norm": 4.432575702667236, + "learning_rate": 3.781772898614812e-05, + "loss": 2.1149, + "step": 3673 + }, + { + "epoch": 0.5927718618909326, + "grad_norm": 3.3182921409606934, + "learning_rate": 3.779239034117316e-05, + "loss": 1.8549, + "step": 3674 + }, + { + "epoch": 0.5929332042594385, + "grad_norm": 4.513149738311768, + "learning_rate": 3.776705502963652e-05, + "loss": 2.0607, + "step": 3675 + }, + { + "epoch": 0.5930945466279445, + "grad_norm": 4.182868480682373, + "learning_rate": 3.774172305845636e-05, + "loss": 2.0251, + "step": 3676 + }, + { + "epoch": 0.5932558889964504, + "grad_norm": 3.6221394538879395, + "learning_rate": 3.771639443454984e-05, + "loss": 1.7068, + "step": 3677 + }, + { + "epoch": 0.5934172313649564, + "grad_norm": 3.82957124710083, + "learning_rate": 3.76910691648333e-05, + "loss": 1.7472, + "step": 3678 + }, + { + "epoch": 0.5935785737334625, + "grad_norm": 5.206719398498535, + "learning_rate": 3.7665747256222075e-05, + "loss": 1.9413, + "step": 3679 + }, + { + "epoch": 0.5937399161019684, + "grad_norm": 3.959946393966675, + "learning_rate": 3.764042871563066e-05, + "loss": 1.875, + "step": 3680 + }, + { + "epoch": 0.5939012584704744, + "grad_norm": 4.654921054840088, + "learning_rate": 3.761511354997256e-05, + "loss": 2.055, + "step": 3681 + }, + { + "epoch": 0.5940626008389803, + "grad_norm": 5.93565559387207, + "learning_rate": 3.758980176616042e-05, + "loss": 1.8814, + "step": 3682 + }, + { + "epoch": 0.5942239432074863, + "grad_norm": 5.584222793579102, + "learning_rate": 3.7564493371105934e-05, + "loss": 1.9983, + "step": 3683 + }, + { + "epoch": 0.5943852855759922, + "grad_norm": 4.700340747833252, + "learning_rate": 3.753918837171984e-05, + "loss": 1.9201, + "step": 3684 + }, + { + "epoch": 0.5945466279444982, + "grad_norm": 4.035298824310303, + "learning_rate": 3.7513886774912024e-05, + "loss": 2.1135, + "step": 3685 + }, + { + "epoch": 0.5947079703130042, + "grad_norm": 3.782871723175049, + "learning_rate": 3.7488588587591356e-05, + "loss": 1.8368, + "step": 3686 + }, + { + "epoch": 0.5948693126815101, + "grad_norm": 4.397482872009277, + "learning_rate": 3.7463293816665866e-05, + "loss": 1.9948, + "step": 3687 + }, + { + "epoch": 0.5950306550500162, + "grad_norm": 4.766461372375488, + "learning_rate": 3.7438002469042565e-05, + "loss": 2.1219, + "step": 3688 + }, + { + "epoch": 0.5951919974185221, + "grad_norm": 4.941188812255859, + "learning_rate": 3.741271455162757e-05, + "loss": 2.0298, + "step": 3689 + }, + { + "epoch": 0.5953533397870281, + "grad_norm": 3.774362325668335, + "learning_rate": 3.738743007132608e-05, + "loss": 2.0194, + "step": 3690 + }, + { + "epoch": 0.595514682155534, + "grad_norm": 3.833545207977295, + "learning_rate": 3.736214903504233e-05, + "loss": 1.9942, + "step": 3691 + }, + { + "epoch": 0.59567602452404, + "grad_norm": 3.833545207977295, + "learning_rate": 3.736214903504233e-05, + "loss": 1.9729, + "step": 3692 + }, + { + "epoch": 0.595837366892546, + "grad_norm": 4.3665313720703125, + "learning_rate": 3.7336871449679586e-05, + "loss": 1.8651, + "step": 3693 + }, + { + "epoch": 0.5959987092610519, + "grad_norm": 4.348609924316406, + "learning_rate": 3.7311597322140246e-05, + "loss": 2.2343, + "step": 3694 + }, + { + "epoch": 0.5961600516295579, + "grad_norm": 3.520056962966919, + "learning_rate": 3.728632665932569e-05, + "loss": 1.8885, + "step": 3695 + }, + { + "epoch": 0.5963213939980639, + "grad_norm": 5.269471168518066, + "learning_rate": 3.726105946813642e-05, + "loss": 1.9451, + "step": 3696 + }, + { + "epoch": 0.5964827363665699, + "grad_norm": 4.60741662979126, + "learning_rate": 3.723579575547194e-05, + "loss": 1.8556, + "step": 3697 + }, + { + "epoch": 0.5966440787350759, + "grad_norm": 4.3959641456604, + "learning_rate": 3.721053552823078e-05, + "loss": 1.7342, + "step": 3698 + }, + { + "epoch": 0.5968054211035818, + "grad_norm": 4.219175815582275, + "learning_rate": 3.7185278793310605e-05, + "loss": 1.8752, + "step": 3699 + }, + { + "epoch": 0.5969667634720878, + "grad_norm": 3.3393092155456543, + "learning_rate": 3.716002555760806e-05, + "loss": 1.8868, + "step": 3700 + }, + { + "epoch": 0.5971281058405937, + "grad_norm": 3.608086109161377, + "learning_rate": 3.713477582801886e-05, + "loss": 1.9005, + "step": 3701 + }, + { + "epoch": 0.5972894482090997, + "grad_norm": 4.510677337646484, + "learning_rate": 3.710952961143773e-05, + "loss": 1.844, + "step": 3702 + }, + { + "epoch": 0.5974507905776056, + "grad_norm": 3.330909013748169, + "learning_rate": 3.7084286914758505e-05, + "loss": 1.8193, + "step": 3703 + }, + { + "epoch": 0.5976121329461116, + "grad_norm": 3.9209372997283936, + "learning_rate": 3.705904774487396e-05, + "loss": 1.9915, + "step": 3704 + }, + { + "epoch": 0.5977734753146177, + "grad_norm": 4.028830051422119, + "learning_rate": 3.703381210867601e-05, + "loss": 1.9291, + "step": 3705 + }, + { + "epoch": 0.5979348176831236, + "grad_norm": 3.884798288345337, + "learning_rate": 3.7008580013055524e-05, + "loss": 2.0992, + "step": 3706 + }, + { + "epoch": 0.5980961600516296, + "grad_norm": 4.469210624694824, + "learning_rate": 3.698335146490246e-05, + "loss": 1.9829, + "step": 3707 + }, + { + "epoch": 0.5982575024201355, + "grad_norm": 5.87947940826416, + "learning_rate": 3.695812647110577e-05, + "loss": 1.7308, + "step": 3708 + }, + { + "epoch": 0.5984188447886415, + "grad_norm": 4.097621917724609, + "learning_rate": 3.6932905038553464e-05, + "loss": 1.919, + "step": 3709 + }, + { + "epoch": 0.5985801871571474, + "grad_norm": 5.142523288726807, + "learning_rate": 3.690768717413254e-05, + "loss": 2.1386, + "step": 3710 + }, + { + "epoch": 0.5987415295256534, + "grad_norm": 4.307010650634766, + "learning_rate": 3.6882472884729066e-05, + "loss": 1.7298, + "step": 3711 + }, + { + "epoch": 0.5989028718941594, + "grad_norm": 4.007396697998047, + "learning_rate": 3.6857262177228125e-05, + "loss": 1.7682, + "step": 3712 + }, + { + "epoch": 0.5990642142626654, + "grad_norm": 3.942392110824585, + "learning_rate": 3.683205505851377e-05, + "loss": 1.9615, + "step": 3713 + }, + { + "epoch": 0.5992255566311714, + "grad_norm": 6.172910690307617, + "learning_rate": 3.680685153546916e-05, + "loss": 2.005, + "step": 3714 + }, + { + "epoch": 0.5993868989996773, + "grad_norm": 4.684194087982178, + "learning_rate": 3.6781651614976386e-05, + "loss": 1.753, + "step": 3715 + }, + { + "epoch": 0.5995482413681833, + "grad_norm": 5.116608142852783, + "learning_rate": 3.675645530391665e-05, + "loss": 2.0317, + "step": 3716 + }, + { + "epoch": 0.5997095837366893, + "grad_norm": 3.8065035343170166, + "learning_rate": 3.673126260917006e-05, + "loss": 2.0698, + "step": 3717 + }, + { + "epoch": 0.5998709261051952, + "grad_norm": 5.370408535003662, + "learning_rate": 3.670607353761584e-05, + "loss": 1.9875, + "step": 3718 + }, + { + "epoch": 0.6000322684737012, + "grad_norm": 3.7742254734039307, + "learning_rate": 3.668088809613215e-05, + "loss": 1.7665, + "step": 3719 + }, + { + "epoch": 0.6001936108422071, + "grad_norm": 4.05025053024292, + "learning_rate": 3.665570629159619e-05, + "loss": 1.7383, + "step": 3720 + }, + { + "epoch": 0.6003549532107131, + "grad_norm": 3.3534834384918213, + "learning_rate": 3.663052813088417e-05, + "loss": 1.8674, + "step": 3721 + }, + { + "epoch": 0.6005162955792192, + "grad_norm": 5.025367259979248, + "learning_rate": 3.6605353620871266e-05, + "loss": 1.9043, + "step": 3722 + }, + { + "epoch": 0.6006776379477251, + "grad_norm": 4.877417087554932, + "learning_rate": 3.6580182768431735e-05, + "loss": 1.6918, + "step": 3723 + }, + { + "epoch": 0.6008389803162311, + "grad_norm": 4.852436542510986, + "learning_rate": 3.6555015580438745e-05, + "loss": 2.0019, + "step": 3724 + }, + { + "epoch": 0.601000322684737, + "grad_norm": 5.720123767852783, + "learning_rate": 3.6529852063764545e-05, + "loss": 1.8533, + "step": 3725 + }, + { + "epoch": 0.601161665053243, + "grad_norm": 4.051018714904785, + "learning_rate": 3.65046922252803e-05, + "loss": 2.2468, + "step": 3726 + }, + { + "epoch": 0.6013230074217489, + "grad_norm": 4.266456604003906, + "learning_rate": 3.6479536071856265e-05, + "loss": 1.7491, + "step": 3727 + }, + { + "epoch": 0.6014843497902549, + "grad_norm": 3.0934205055236816, + "learning_rate": 3.645438361036161e-05, + "loss": 1.9993, + "step": 3728 + }, + { + "epoch": 0.6016456921587608, + "grad_norm": 4.6057329177856445, + "learning_rate": 3.6429234847664535e-05, + "loss": 1.8901, + "step": 3729 + }, + { + "epoch": 0.6018070345272669, + "grad_norm": 7.07024621963501, + "learning_rate": 3.640408979063219e-05, + "loss": 2.274, + "step": 3730 + }, + { + "epoch": 0.6019683768957729, + "grad_norm": 3.6758999824523926, + "learning_rate": 3.6378948446130786e-05, + "loss": 1.6481, + "step": 3731 + }, + { + "epoch": 0.6021297192642788, + "grad_norm": 3.463942050933838, + "learning_rate": 3.6353810821025436e-05, + "loss": 1.7646, + "step": 3732 + }, + { + "epoch": 0.6022910616327848, + "grad_norm": 5.796482086181641, + "learning_rate": 3.632867692218032e-05, + "loss": 2.008, + "step": 3733 + }, + { + "epoch": 0.6024524040012907, + "grad_norm": 4.755577087402344, + "learning_rate": 3.630354675645853e-05, + "loss": 1.8181, + "step": 3734 + }, + { + "epoch": 0.6026137463697967, + "grad_norm": 4.626229763031006, + "learning_rate": 3.627842033072216e-05, + "loss": 1.7967, + "step": 3735 + }, + { + "epoch": 0.6027750887383027, + "grad_norm": 5.281877517700195, + "learning_rate": 3.625329765183233e-05, + "loss": 1.9245, + "step": 3736 + }, + { + "epoch": 0.6029364311068086, + "grad_norm": 4.206997871398926, + "learning_rate": 3.6228178726649047e-05, + "loss": 1.9532, + "step": 3737 + }, + { + "epoch": 0.6030977734753146, + "grad_norm": 3.6428096294403076, + "learning_rate": 3.620306356203139e-05, + "loss": 1.7949, + "step": 3738 + }, + { + "epoch": 0.6032591158438206, + "grad_norm": 6.974172115325928, + "learning_rate": 3.6177952164837335e-05, + "loss": 1.8432, + "step": 3739 + }, + { + "epoch": 0.6034204582123266, + "grad_norm": 4.038150310516357, + "learning_rate": 3.615284454192388e-05, + "loss": 1.8784, + "step": 3740 + }, + { + "epoch": 0.6035818005808326, + "grad_norm": 7.101322650909424, + "learning_rate": 3.612774070014694e-05, + "loss": 2.1201, + "step": 3741 + }, + { + "epoch": 0.6037431429493385, + "grad_norm": 9.740883827209473, + "learning_rate": 3.610264064636146e-05, + "loss": 2.1495, + "step": 3742 + }, + { + "epoch": 0.6039044853178445, + "grad_norm": 5.0577712059021, + "learning_rate": 3.607754438742129e-05, + "loss": 1.9048, + "step": 3743 + }, + { + "epoch": 0.6040658276863504, + "grad_norm": 6.079409599304199, + "learning_rate": 3.605245193017931e-05, + "loss": 1.7851, + "step": 3744 + }, + { + "epoch": 0.6042271700548564, + "grad_norm": 4.561661243438721, + "learning_rate": 3.602736328148728e-05, + "loss": 1.6548, + "step": 3745 + }, + { + "epoch": 0.6043885124233623, + "grad_norm": 4.185081481933594, + "learning_rate": 3.600227844819601e-05, + "loss": 1.7029, + "step": 3746 + }, + { + "epoch": 0.6045498547918683, + "grad_norm": 4.4725117683410645, + "learning_rate": 3.5977197437155205e-05, + "loss": 1.8245, + "step": 3747 + }, + { + "epoch": 0.6047111971603744, + "grad_norm": 4.323187351226807, + "learning_rate": 3.5952120255213526e-05, + "loss": 1.9139, + "step": 3748 + }, + { + "epoch": 0.6048725395288803, + "grad_norm": 4.430797100067139, + "learning_rate": 3.592704690921863e-05, + "loss": 1.9795, + "step": 3749 + }, + { + "epoch": 0.6050338818973863, + "grad_norm": 5.248645782470703, + "learning_rate": 3.5901977406017085e-05, + "loss": 1.9281, + "step": 3750 + }, + { + "epoch": 0.6051952242658922, + "grad_norm": 3.7061660289764404, + "learning_rate": 3.5876911752454447e-05, + "loss": 1.6232, + "step": 3751 + }, + { + "epoch": 0.6053565666343982, + "grad_norm": 4.320940971374512, + "learning_rate": 3.585184995537518e-05, + "loss": 1.8486, + "step": 3752 + }, + { + "epoch": 0.6055179090029041, + "grad_norm": 3.9274215698242188, + "learning_rate": 3.5826792021622744e-05, + "loss": 1.7725, + "step": 3753 + }, + { + "epoch": 0.6056792513714101, + "grad_norm": 4.034487724304199, + "learning_rate": 3.580173795803948e-05, + "loss": 1.917, + "step": 3754 + }, + { + "epoch": 0.6058405937399161, + "grad_norm": 3.7273917198181152, + "learning_rate": 3.577668777146676e-05, + "loss": 1.933, + "step": 3755 + }, + { + "epoch": 0.6060019361084221, + "grad_norm": 5.619368553161621, + "learning_rate": 3.57516414687448e-05, + "loss": 2.0396, + "step": 3756 + }, + { + "epoch": 0.6061632784769281, + "grad_norm": 4.922757625579834, + "learning_rate": 3.572659905671283e-05, + "loss": 1.8759, + "step": 3757 + }, + { + "epoch": 0.606324620845434, + "grad_norm": 4.182972431182861, + "learning_rate": 3.5701560542208965e-05, + "loss": 1.9355, + "step": 3758 + }, + { + "epoch": 0.60648596321394, + "grad_norm": 3.7297675609588623, + "learning_rate": 3.5676525932070317e-05, + "loss": 1.9765, + "step": 3759 + }, + { + "epoch": 0.606647305582446, + "grad_norm": 4.987452030181885, + "learning_rate": 3.565149523313286e-05, + "loss": 1.7428, + "step": 3760 + }, + { + "epoch": 0.6068086479509519, + "grad_norm": 4.672898292541504, + "learning_rate": 3.562646845223153e-05, + "loss": 1.8289, + "step": 3761 + }, + { + "epoch": 0.6069699903194579, + "grad_norm": 5.305594444274902, + "learning_rate": 3.560144559620023e-05, + "loss": 1.8384, + "step": 3762 + }, + { + "epoch": 0.6071313326879638, + "grad_norm": 3.834249973297119, + "learning_rate": 3.5576426671871736e-05, + "loss": 1.5013, + "step": 3763 + }, + { + "epoch": 0.6072926750564698, + "grad_norm": 4.112975120544434, + "learning_rate": 3.55514116860778e-05, + "loss": 1.889, + "step": 3764 + }, + { + "epoch": 0.6074540174249758, + "grad_norm": 4.56207799911499, + "learning_rate": 3.552640064564903e-05, + "loss": 1.7246, + "step": 3765 + }, + { + "epoch": 0.6076153597934818, + "grad_norm": 4.238980770111084, + "learning_rate": 3.550139355741504e-05, + "loss": 2.1032, + "step": 3766 + }, + { + "epoch": 0.6077767021619878, + "grad_norm": 4.23140287399292, + "learning_rate": 3.54763904282043e-05, + "loss": 1.91, + "step": 3767 + }, + { + "epoch": 0.6079380445304937, + "grad_norm": 5.2451863288879395, + "learning_rate": 3.5451391264844244e-05, + "loss": 1.7813, + "step": 3768 + }, + { + "epoch": 0.6080993868989997, + "grad_norm": 4.276443004608154, + "learning_rate": 3.542639607416116e-05, + "loss": 1.9734, + "step": 3769 + }, + { + "epoch": 0.6082607292675056, + "grad_norm": 3.4733471870422363, + "learning_rate": 3.540140486298035e-05, + "loss": 1.8424, + "step": 3770 + }, + { + "epoch": 0.6084220716360116, + "grad_norm": 4.838557720184326, + "learning_rate": 3.5376417638125914e-05, + "loss": 1.9435, + "step": 3771 + }, + { + "epoch": 0.6085834140045175, + "grad_norm": 4.959713935852051, + "learning_rate": 3.535143440642097e-05, + "loss": 2.0424, + "step": 3772 + }, + { + "epoch": 0.6087447563730236, + "grad_norm": 6.249136447906494, + "learning_rate": 3.532645517468748e-05, + "loss": 1.7399, + "step": 3773 + }, + { + "epoch": 0.6089060987415296, + "grad_norm": 4.988114356994629, + "learning_rate": 3.5301479949746314e-05, + "loss": 2.0397, + "step": 3774 + }, + { + "epoch": 0.6090674411100355, + "grad_norm": 3.851431369781494, + "learning_rate": 3.52765087384173e-05, + "loss": 1.8279, + "step": 3775 + }, + { + "epoch": 0.6092287834785415, + "grad_norm": 3.687272787094116, + "learning_rate": 3.5251541547519094e-05, + "loss": 2.0134, + "step": 3776 + }, + { + "epoch": 0.6093901258470474, + "grad_norm": 4.138549327850342, + "learning_rate": 3.522657838386933e-05, + "loss": 1.9043, + "step": 3777 + }, + { + "epoch": 0.6095514682155534, + "grad_norm": 4.868224143981934, + "learning_rate": 3.520161925428449e-05, + "loss": 1.8726, + "step": 3778 + }, + { + "epoch": 0.6097128105840594, + "grad_norm": 6.811543941497803, + "learning_rate": 3.5176664165579986e-05, + "loss": 2.0306, + "step": 3779 + }, + { + "epoch": 0.6098741529525653, + "grad_norm": 3.8094887733459473, + "learning_rate": 3.5151713124570086e-05, + "loss": 1.7307, + "step": 3780 + }, + { + "epoch": 0.6100354953210713, + "grad_norm": 5.4980950355529785, + "learning_rate": 3.512676613806802e-05, + "loss": 1.799, + "step": 3781 + }, + { + "epoch": 0.6101968376895773, + "grad_norm": 4.2599711418151855, + "learning_rate": 3.510182321288582e-05, + "loss": 1.9913, + "step": 3782 + }, + { + "epoch": 0.6103581800580833, + "grad_norm": 5.045507907867432, + "learning_rate": 3.507688435583451e-05, + "loss": 1.6917, + "step": 3783 + }, + { + "epoch": 0.6105195224265892, + "grad_norm": 4.295395374298096, + "learning_rate": 3.5051949573723926e-05, + "loss": 1.7401, + "step": 3784 + }, + { + "epoch": 0.6106808647950952, + "grad_norm": 4.576744556427002, + "learning_rate": 3.50270188733628e-05, + "loss": 2.2997, + "step": 3785 + }, + { + "epoch": 0.6108422071636012, + "grad_norm": 4.532271385192871, + "learning_rate": 3.5002092261558814e-05, + "loss": 1.7825, + "step": 3786 + }, + { + "epoch": 0.6110035495321071, + "grad_norm": 3.389321804046631, + "learning_rate": 3.497716974511844e-05, + "loss": 1.8228, + "step": 3787 + }, + { + "epoch": 0.6111648919006131, + "grad_norm": 4.300199031829834, + "learning_rate": 3.495225133084712e-05, + "loss": 1.9312, + "step": 3788 + }, + { + "epoch": 0.611326234269119, + "grad_norm": 5.128062725067139, + "learning_rate": 3.4927337025549077e-05, + "loss": 2.3558, + "step": 3789 + }, + { + "epoch": 0.611487576637625, + "grad_norm": 5.287378311157227, + "learning_rate": 3.4902426836027534e-05, + "loss": 1.8695, + "step": 3790 + }, + { + "epoch": 0.611648919006131, + "grad_norm": 4.9498748779296875, + "learning_rate": 3.4877520769084484e-05, + "loss": 2.448, + "step": 3791 + }, + { + "epoch": 0.611810261374637, + "grad_norm": 3.59617280960083, + "learning_rate": 3.4852618831520855e-05, + "loss": 1.7811, + "step": 3792 + }, + { + "epoch": 0.611971603743143, + "grad_norm": 3.691159725189209, + "learning_rate": 3.482772103013641e-05, + "loss": 1.926, + "step": 3793 + }, + { + "epoch": 0.6121329461116489, + "grad_norm": 4.8668532371521, + "learning_rate": 3.480282737172983e-05, + "loss": 1.8611, + "step": 3794 + }, + { + "epoch": 0.6122942884801549, + "grad_norm": 4.09564208984375, + "learning_rate": 3.477793786309861e-05, + "loss": 2.0656, + "step": 3795 + }, + { + "epoch": 0.6124556308486608, + "grad_norm": 4.320098400115967, + "learning_rate": 3.4753052511039155e-05, + "loss": 1.7872, + "step": 3796 + }, + { + "epoch": 0.6126169732171668, + "grad_norm": 3.934811592102051, + "learning_rate": 3.4728171322346694e-05, + "loss": 1.7117, + "step": 3797 + }, + { + "epoch": 0.6127783155856728, + "grad_norm": 5.060281753540039, + "learning_rate": 3.470329430381535e-05, + "loss": 1.7985, + "step": 3798 + }, + { + "epoch": 0.6129396579541788, + "grad_norm": 4.571928977966309, + "learning_rate": 3.467842146223812e-05, + "loss": 1.9876, + "step": 3799 + }, + { + "epoch": 0.6131010003226848, + "grad_norm": 5.835862159729004, + "learning_rate": 3.4653552804406805e-05, + "loss": 1.7337, + "step": 3800 + }, + { + "epoch": 0.6132623426911907, + "grad_norm": 4.562862396240234, + "learning_rate": 3.462868833711214e-05, + "loss": 1.8781, + "step": 3801 + }, + { + "epoch": 0.6134236850596967, + "grad_norm": 4.681947231292725, + "learning_rate": 3.460382806714362e-05, + "loss": 1.9091, + "step": 3802 + }, + { + "epoch": 0.6135850274282026, + "grad_norm": 4.201179504394531, + "learning_rate": 3.457897200128971e-05, + "loss": 1.7072, + "step": 3803 + }, + { + "epoch": 0.6137463697967086, + "grad_norm": 4.7126383781433105, + "learning_rate": 3.455412014633763e-05, + "loss": 1.9345, + "step": 3804 + }, + { + "epoch": 0.6139077121652146, + "grad_norm": 5.711738109588623, + "learning_rate": 3.452927250907349e-05, + "loss": 2.0958, + "step": 3805 + }, + { + "epoch": 0.6140690545337205, + "grad_norm": 3.8236541748046875, + "learning_rate": 3.450442909628224e-05, + "loss": 1.8577, + "step": 3806 + }, + { + "epoch": 0.6142303969022265, + "grad_norm": 3.954463243484497, + "learning_rate": 3.4479589914747706e-05, + "loss": 1.7468, + "step": 3807 + }, + { + "epoch": 0.6143917392707325, + "grad_norm": 3.7589309215545654, + "learning_rate": 3.445475497125249e-05, + "loss": 1.8477, + "step": 3808 + }, + { + "epoch": 0.6145530816392385, + "grad_norm": 3.4328806400299072, + "learning_rate": 3.4429924272578116e-05, + "loss": 1.6261, + "step": 3809 + }, + { + "epoch": 0.6147144240077445, + "grad_norm": 3.705533027648926, + "learning_rate": 3.4405097825504906e-05, + "loss": 1.6058, + "step": 3810 + }, + { + "epoch": 0.6148757663762504, + "grad_norm": 4.693033695220947, + "learning_rate": 3.4380275636811986e-05, + "loss": 2.0203, + "step": 3811 + }, + { + "epoch": 0.6150371087447564, + "grad_norm": 3.9044811725616455, + "learning_rate": 3.435545771327743e-05, + "loss": 1.9886, + "step": 3812 + }, + { + "epoch": 0.6151984511132623, + "grad_norm": 4.10264778137207, + "learning_rate": 3.433064406167801e-05, + "loss": 1.8557, + "step": 3813 + }, + { + "epoch": 0.6153597934817683, + "grad_norm": 4.515639781951904, + "learning_rate": 3.4305834688789465e-05, + "loss": 1.9711, + "step": 3814 + }, + { + "epoch": 0.6155211358502742, + "grad_norm": 5.027278900146484, + "learning_rate": 3.428102960138625e-05, + "loss": 1.9102, + "step": 3815 + }, + { + "epoch": 0.6156824782187803, + "grad_norm": 3.451860189437866, + "learning_rate": 3.4256228806241733e-05, + "loss": 1.9471, + "step": 3816 + }, + { + "epoch": 0.6158438205872863, + "grad_norm": 4.987911701202393, + "learning_rate": 3.423143231012803e-05, + "loss": 1.6859, + "step": 3817 + }, + { + "epoch": 0.6160051629557922, + "grad_norm": 4.156739234924316, + "learning_rate": 3.4206640119816187e-05, + "loss": 1.6867, + "step": 3818 + }, + { + "epoch": 0.6161665053242982, + "grad_norm": 4.318131446838379, + "learning_rate": 3.418185224207597e-05, + "loss": 1.9572, + "step": 3819 + }, + { + "epoch": 0.6163278476928041, + "grad_norm": 4.359016418457031, + "learning_rate": 3.4157068683676055e-05, + "loss": 1.8555, + "step": 3820 + }, + { + "epoch": 0.6164891900613101, + "grad_norm": 5.2175140380859375, + "learning_rate": 3.4132289451383866e-05, + "loss": 1.9462, + "step": 3821 + }, + { + "epoch": 0.616650532429816, + "grad_norm": 4.045276165008545, + "learning_rate": 3.410751455196571e-05, + "loss": 1.8704, + "step": 3822 + }, + { + "epoch": 0.616811874798322, + "grad_norm": 4.262764930725098, + "learning_rate": 3.4082743992186655e-05, + "loss": 1.9725, + "step": 3823 + }, + { + "epoch": 0.616973217166828, + "grad_norm": 4.094369888305664, + "learning_rate": 3.405797777881059e-05, + "loss": 1.8869, + "step": 3824 + }, + { + "epoch": 0.617134559535334, + "grad_norm": 3.974959135055542, + "learning_rate": 3.4033215918600285e-05, + "loss": 1.9153, + "step": 3825 + }, + { + "epoch": 0.61729590190384, + "grad_norm": 3.951637029647827, + "learning_rate": 3.400845841831723e-05, + "loss": 2.0925, + "step": 3826 + }, + { + "epoch": 0.6174572442723459, + "grad_norm": 4.118864059448242, + "learning_rate": 3.3983705284721795e-05, + "loss": 1.6755, + "step": 3827 + }, + { + "epoch": 0.6176185866408519, + "grad_norm": 5.3612775802612305, + "learning_rate": 3.3958956524573085e-05, + "loss": 1.8371, + "step": 3828 + }, + { + "epoch": 0.6177799290093579, + "grad_norm": 5.615139484405518, + "learning_rate": 3.3934212144629104e-05, + "loss": 2.0056, + "step": 3829 + }, + { + "epoch": 0.6179412713778638, + "grad_norm": 3.605287551879883, + "learning_rate": 3.390947215164657e-05, + "loss": 1.8722, + "step": 3830 + }, + { + "epoch": 0.6181026137463698, + "grad_norm": 3.9551656246185303, + "learning_rate": 3.388473655238109e-05, + "loss": 1.7807, + "step": 3831 + }, + { + "epoch": 0.6182639561148757, + "grad_norm": 4.237039566040039, + "learning_rate": 3.386000535358696e-05, + "loss": 1.7805, + "step": 3832 + }, + { + "epoch": 0.6184252984833818, + "grad_norm": 4.651735305786133, + "learning_rate": 3.38352785620174e-05, + "loss": 1.7804, + "step": 3833 + }, + { + "epoch": 0.6185866408518877, + "grad_norm": 4.280829906463623, + "learning_rate": 3.381055618442434e-05, + "loss": 1.9884, + "step": 3834 + }, + { + "epoch": 0.6187479832203937, + "grad_norm": 4.486728191375732, + "learning_rate": 3.378583822755853e-05, + "loss": 1.7981, + "step": 3835 + }, + { + "epoch": 0.6189093255888997, + "grad_norm": 4.050109386444092, + "learning_rate": 3.376112469816951e-05, + "loss": 1.9284, + "step": 3836 + }, + { + "epoch": 0.6190706679574056, + "grad_norm": 4.030494689941406, + "learning_rate": 3.37364156030056e-05, + "loss": 1.926, + "step": 3837 + }, + { + "epoch": 0.6192320103259116, + "grad_norm": 4.372687816619873, + "learning_rate": 3.3711710948813956e-05, + "loss": 1.7221, + "step": 3838 + }, + { + "epoch": 0.6193933526944175, + "grad_norm": 5.265523433685303, + "learning_rate": 3.368701074234045e-05, + "loss": 2.1795, + "step": 3839 + }, + { + "epoch": 0.6195546950629235, + "grad_norm": 4.2426629066467285, + "learning_rate": 3.366231499032983e-05, + "loss": 2.1742, + "step": 3840 + }, + { + "epoch": 0.6197160374314294, + "grad_norm": 6.021782398223877, + "learning_rate": 3.363762369952552e-05, + "loss": 2.0044, + "step": 3841 + }, + { + "epoch": 0.6198773797999355, + "grad_norm": 4.393467426300049, + "learning_rate": 3.3612936876669834e-05, + "loss": 1.8005, + "step": 3842 + }, + { + "epoch": 0.6200387221684415, + "grad_norm": 4.665099620819092, + "learning_rate": 3.358825452850376e-05, + "loss": 2.0681, + "step": 3843 + }, + { + "epoch": 0.6202000645369474, + "grad_norm": 4.036859035491943, + "learning_rate": 3.356357666176716e-05, + "loss": 1.7972, + "step": 3844 + }, + { + "epoch": 0.6203614069054534, + "grad_norm": 5.150184154510498, + "learning_rate": 3.353890328319861e-05, + "loss": 2.0301, + "step": 3845 + }, + { + "epoch": 0.6205227492739593, + "grad_norm": 4.837057113647461, + "learning_rate": 3.3514234399535485e-05, + "loss": 2.0091, + "step": 3846 + }, + { + "epoch": 0.6206840916424653, + "grad_norm": 5.241817951202393, + "learning_rate": 3.3489570017513914e-05, + "loss": 2.0307, + "step": 3847 + }, + { + "epoch": 0.6208454340109713, + "grad_norm": 5.212489128112793, + "learning_rate": 3.3464910143868844e-05, + "loss": 1.9736, + "step": 3848 + }, + { + "epoch": 0.6210067763794772, + "grad_norm": 4.452176094055176, + "learning_rate": 3.3440254785333936e-05, + "loss": 1.8269, + "step": 3849 + }, + { + "epoch": 0.6211681187479832, + "grad_norm": 4.795321941375732, + "learning_rate": 3.341560394864162e-05, + "loss": 2.0215, + "step": 3850 + }, + { + "epoch": 0.6213294611164892, + "grad_norm": 4.669750213623047, + "learning_rate": 3.3390957640523145e-05, + "loss": 1.9653, + "step": 3851 + }, + { + "epoch": 0.6214908034849952, + "grad_norm": 4.048704147338867, + "learning_rate": 3.3366315867708466e-05, + "loss": 1.6575, + "step": 3852 + }, + { + "epoch": 0.6216521458535011, + "grad_norm": 3.779632091522217, + "learning_rate": 3.334167863692634e-05, + "loss": 1.9369, + "step": 3853 + }, + { + "epoch": 0.6218134882220071, + "grad_norm": 4.874792575836182, + "learning_rate": 3.331704595490426e-05, + "loss": 1.8517, + "step": 3854 + }, + { + "epoch": 0.6219748305905131, + "grad_norm": 6.897017478942871, + "learning_rate": 3.3292417828368495e-05, + "loss": 2.0271, + "step": 3855 + }, + { + "epoch": 0.622136172959019, + "grad_norm": 4.712216377258301, + "learning_rate": 3.326779426404402e-05, + "loss": 1.8385, + "step": 3856 + }, + { + "epoch": 0.622297515327525, + "grad_norm": 6.182539463043213, + "learning_rate": 3.324317526865465e-05, + "loss": 2.05, + "step": 3857 + }, + { + "epoch": 0.6224588576960309, + "grad_norm": 4.800398826599121, + "learning_rate": 3.321856084892287e-05, + "loss": 1.9153, + "step": 3858 + }, + { + "epoch": 0.622620200064537, + "grad_norm": 4.936999797821045, + "learning_rate": 3.319395101156998e-05, + "loss": 1.6719, + "step": 3859 + }, + { + "epoch": 0.622781542433043, + "grad_norm": 3.894695997238159, + "learning_rate": 3.316934576331598e-05, + "loss": 1.8424, + "step": 3860 + }, + { + "epoch": 0.6229428848015489, + "grad_norm": 5.069485664367676, + "learning_rate": 3.314474511087964e-05, + "loss": 1.6743, + "step": 3861 + }, + { + "epoch": 0.6231042271700549, + "grad_norm": 4.204349517822266, + "learning_rate": 3.312014906097848e-05, + "loss": 1.8606, + "step": 3862 + }, + { + "epoch": 0.6232655695385608, + "grad_norm": 3.7627949714660645, + "learning_rate": 3.309555762032873e-05, + "loss": 1.7485, + "step": 3863 + }, + { + "epoch": 0.6234269119070668, + "grad_norm": 4.631115436553955, + "learning_rate": 3.307097079564542e-05, + "loss": 1.7235, + "step": 3864 + }, + { + "epoch": 0.6235882542755727, + "grad_norm": 8.076692581176758, + "learning_rate": 3.304638859364225e-05, + "loss": 2.1334, + "step": 3865 + }, + { + "epoch": 0.6237495966440787, + "grad_norm": 5.298736095428467, + "learning_rate": 3.302181102103173e-05, + "loss": 1.6522, + "step": 3866 + }, + { + "epoch": 0.6239109390125847, + "grad_norm": 5.513214111328125, + "learning_rate": 3.299723808452504e-05, + "loss": 1.7262, + "step": 3867 + }, + { + "epoch": 0.6240722813810907, + "grad_norm": 4.249666690826416, + "learning_rate": 3.297266979083215e-05, + "loss": 1.94, + "step": 3868 + }, + { + "epoch": 0.6242336237495967, + "grad_norm": 3.5764763355255127, + "learning_rate": 3.29481061466617e-05, + "loss": 1.6686, + "step": 3869 + }, + { + "epoch": 0.6243949661181026, + "grad_norm": 4.225545406341553, + "learning_rate": 3.292354715872113e-05, + "loss": 1.8488, + "step": 3870 + }, + { + "epoch": 0.6245563084866086, + "grad_norm": 3.946922540664673, + "learning_rate": 3.289899283371657e-05, + "loss": 1.8323, + "step": 3871 + }, + { + "epoch": 0.6247176508551145, + "grad_norm": 3.8466291427612305, + "learning_rate": 3.287444317835285e-05, + "loss": 2.327, + "step": 3872 + }, + { + "epoch": 0.6248789932236205, + "grad_norm": 3.4624412059783936, + "learning_rate": 3.2849898199333605e-05, + "loss": 1.8043, + "step": 3873 + }, + { + "epoch": 0.6250403355921265, + "grad_norm": 4.060080051422119, + "learning_rate": 3.28253579033611e-05, + "loss": 1.8283, + "step": 3874 + }, + { + "epoch": 0.6252016779606324, + "grad_norm": 3.958866834640503, + "learning_rate": 3.280082229713639e-05, + "loss": 1.7741, + "step": 3875 + }, + { + "epoch": 0.6253630203291385, + "grad_norm": 4.636531829833984, + "learning_rate": 3.27762913873592e-05, + "loss": 1.7591, + "step": 3876 + }, + { + "epoch": 0.6255243626976444, + "grad_norm": 4.473749160766602, + "learning_rate": 3.275176518072804e-05, + "loss": 1.9947, + "step": 3877 + }, + { + "epoch": 0.6256857050661504, + "grad_norm": 4.48061466217041, + "learning_rate": 3.2727243683940045e-05, + "loss": 1.7359, + "step": 3878 + }, + { + "epoch": 0.6258470474346564, + "grad_norm": 3.749462604522705, + "learning_rate": 3.2702726903691156e-05, + "loss": 1.7407, + "step": 3879 + }, + { + "epoch": 0.6260083898031623, + "grad_norm": 3.427985429763794, + "learning_rate": 3.267821484667594e-05, + "loss": 1.7586, + "step": 3880 + }, + { + "epoch": 0.6261697321716683, + "grad_norm": 3.203972339630127, + "learning_rate": 3.265370751958776e-05, + "loss": 1.9288, + "step": 3881 + }, + { + "epoch": 0.6263310745401742, + "grad_norm": 6.207522392272949, + "learning_rate": 3.2629204929118605e-05, + "loss": 1.9522, + "step": 3882 + }, + { + "epoch": 0.6264924169086802, + "grad_norm": 5.126103401184082, + "learning_rate": 3.260470708195924e-05, + "loss": 1.9996, + "step": 3883 + }, + { + "epoch": 0.6266537592771861, + "grad_norm": 4.193669319152832, + "learning_rate": 3.258021398479907e-05, + "loss": 1.6116, + "step": 3884 + }, + { + "epoch": 0.6268151016456922, + "grad_norm": 5.127233028411865, + "learning_rate": 3.255572564432628e-05, + "loss": 2.2015, + "step": 3885 + }, + { + "epoch": 0.6269764440141982, + "grad_norm": 3.4576432704925537, + "learning_rate": 3.253124206722768e-05, + "loss": 1.8934, + "step": 3886 + }, + { + "epoch": 0.6271377863827041, + "grad_norm": 3.747927665710449, + "learning_rate": 3.250676326018882e-05, + "loss": 2.0763, + "step": 3887 + }, + { + "epoch": 0.6272991287512101, + "grad_norm": 6.565838813781738, + "learning_rate": 3.248228922989396e-05, + "loss": 2.1956, + "step": 3888 + }, + { + "epoch": 0.627460471119716, + "grad_norm": 5.520340919494629, + "learning_rate": 3.245781998302599e-05, + "loss": 1.9575, + "step": 3889 + }, + { + "epoch": 0.627621813488222, + "grad_norm": 4.918091297149658, + "learning_rate": 3.2433355526266595e-05, + "loss": 1.9128, + "step": 3890 + }, + { + "epoch": 0.627783155856728, + "grad_norm": 4.920687675476074, + "learning_rate": 3.2408895866296056e-05, + "loss": 1.9027, + "step": 3891 + }, + { + "epoch": 0.6279444982252339, + "grad_norm": 4.013607978820801, + "learning_rate": 3.2384441009793395e-05, + "loss": 1.8993, + "step": 3892 + }, + { + "epoch": 0.6281058405937399, + "grad_norm": 4.493187427520752, + "learning_rate": 3.235999096343633e-05, + "loss": 1.8503, + "step": 3893 + }, + { + "epoch": 0.6282671829622459, + "grad_norm": 3.9341964721679688, + "learning_rate": 3.233554573390123e-05, + "loss": 1.9401, + "step": 3894 + }, + { + "epoch": 0.6284285253307519, + "grad_norm": 5.569158554077148, + "learning_rate": 3.231110532786316e-05, + "loss": 1.9528, + "step": 3895 + }, + { + "epoch": 0.6285898676992578, + "grad_norm": 3.7844202518463135, + "learning_rate": 3.2286669751995904e-05, + "loss": 1.9074, + "step": 3896 + }, + { + "epoch": 0.6287512100677638, + "grad_norm": 5.982578277587891, + "learning_rate": 3.226223901297185e-05, + "loss": 1.8828, + "step": 3897 + }, + { + "epoch": 0.6289125524362698, + "grad_norm": 4.350557804107666, + "learning_rate": 3.2237813117462166e-05, + "loss": 1.8033, + "step": 3898 + }, + { + "epoch": 0.6290738948047757, + "grad_norm": 3.943612813949585, + "learning_rate": 3.2213392072136616e-05, + "loss": 2.1312, + "step": 3899 + }, + { + "epoch": 0.6292352371732817, + "grad_norm": 4.545783519744873, + "learning_rate": 3.218897588366365e-05, + "loss": 1.8734, + "step": 3900 + }, + { + "epoch": 0.6293965795417876, + "grad_norm": 3.6101181507110596, + "learning_rate": 3.2164564558710456e-05, + "loss": 2.0215, + "step": 3901 + }, + { + "epoch": 0.6295579219102937, + "grad_norm": 4.474806308746338, + "learning_rate": 3.2140158103942794e-05, + "loss": 1.8417, + "step": 3902 + }, + { + "epoch": 0.6297192642787997, + "grad_norm": 3.520000696182251, + "learning_rate": 3.2115756526025195e-05, + "loss": 1.9087, + "step": 3903 + }, + { + "epoch": 0.6298806066473056, + "grad_norm": 5.004185199737549, + "learning_rate": 3.209135983162077e-05, + "loss": 1.9131, + "step": 3904 + }, + { + "epoch": 0.6300419490158116, + "grad_norm": 4.817507266998291, + "learning_rate": 3.2066968027391374e-05, + "loss": 1.7474, + "step": 3905 + }, + { + "epoch": 0.6302032913843175, + "grad_norm": 3.5767366886138916, + "learning_rate": 3.204258111999745e-05, + "loss": 1.7991, + "step": 3906 + }, + { + "epoch": 0.6303646337528235, + "grad_norm": 4.406589984893799, + "learning_rate": 3.201819911609819e-05, + "loss": 1.7198, + "step": 3907 + }, + { + "epoch": 0.6305259761213294, + "grad_norm": 4.156294822692871, + "learning_rate": 3.199382202235135e-05, + "loss": 1.7961, + "step": 3908 + }, + { + "epoch": 0.6306873184898354, + "grad_norm": 6.422266483306885, + "learning_rate": 3.1969449845413454e-05, + "loss": 1.7938, + "step": 3909 + }, + { + "epoch": 0.6308486608583413, + "grad_norm": 4.197117805480957, + "learning_rate": 3.194508259193958e-05, + "loss": 1.7533, + "step": 3910 + }, + { + "epoch": 0.6310100032268474, + "grad_norm": 5.0357890129089355, + "learning_rate": 3.192072026858352e-05, + "loss": 1.9352, + "step": 3911 + }, + { + "epoch": 0.6311713455953534, + "grad_norm": 3.7781126499176025, + "learning_rate": 3.189636288199771e-05, + "loss": 1.8253, + "step": 3912 + }, + { + "epoch": 0.6313326879638593, + "grad_norm": 4.784677505493164, + "learning_rate": 3.187201043883323e-05, + "loss": 1.9062, + "step": 3913 + }, + { + "epoch": 0.6314940303323653, + "grad_norm": 4.113647937774658, + "learning_rate": 3.184766294573983e-05, + "loss": 1.644, + "step": 3914 + }, + { + "epoch": 0.6316553727008712, + "grad_norm": 4.529688358306885, + "learning_rate": 3.1823320409365865e-05, + "loss": 2.0294, + "step": 3915 + }, + { + "epoch": 0.6318167150693772, + "grad_norm": 3.490339994430542, + "learning_rate": 3.179898283635839e-05, + "loss": 1.8773, + "step": 3916 + }, + { + "epoch": 0.6319780574378832, + "grad_norm": 5.333843231201172, + "learning_rate": 3.1774650233363055e-05, + "loss": 1.9404, + "step": 3917 + }, + { + "epoch": 0.6321393998063891, + "grad_norm": 3.6392922401428223, + "learning_rate": 3.175032260702422e-05, + "loss": 2.0004, + "step": 3918 + }, + { + "epoch": 0.6323007421748952, + "grad_norm": 6.618895053863525, + "learning_rate": 3.172599996398479e-05, + "loss": 1.8705, + "step": 3919 + }, + { + "epoch": 0.6324620845434011, + "grad_norm": 4.968805313110352, + "learning_rate": 3.170168231088641e-05, + "loss": 1.7857, + "step": 3920 + }, + { + "epoch": 0.6326234269119071, + "grad_norm": 5.2753705978393555, + "learning_rate": 3.167736965436927e-05, + "loss": 1.7994, + "step": 3921 + }, + { + "epoch": 0.632784769280413, + "grad_norm": 4.1668195724487305, + "learning_rate": 3.165306200107229e-05, + "loss": 1.9737, + "step": 3922 + }, + { + "epoch": 0.632946111648919, + "grad_norm": 6.095056056976318, + "learning_rate": 3.162875935763294e-05, + "loss": 1.6118, + "step": 3923 + }, + { + "epoch": 0.633107454017425, + "grad_norm": 3.8742151260375977, + "learning_rate": 3.1604461730687346e-05, + "loss": 1.7393, + "step": 3924 + }, + { + "epoch": 0.6332687963859309, + "grad_norm": 5.468986511230469, + "learning_rate": 3.158016912687032e-05, + "loss": 1.8525, + "step": 3925 + }, + { + "epoch": 0.6334301387544369, + "grad_norm": 4.554811954498291, + "learning_rate": 3.15558815528152e-05, + "loss": 2.0176, + "step": 3926 + }, + { + "epoch": 0.6335914811229428, + "grad_norm": 4.299023628234863, + "learning_rate": 3.153159901515406e-05, + "loss": 2.0495, + "step": 3927 + }, + { + "epoch": 0.6337528234914489, + "grad_norm": 5.944100856781006, + "learning_rate": 3.150732152051751e-05, + "loss": 1.8224, + "step": 3928 + }, + { + "epoch": 0.6339141658599549, + "grad_norm": 4.19835901260376, + "learning_rate": 3.148304907553485e-05, + "loss": 2.0148, + "step": 3929 + }, + { + "epoch": 0.6340755082284608, + "grad_norm": 3.8810858726501465, + "learning_rate": 3.145878168683395e-05, + "loss": 1.9752, + "step": 3930 + }, + { + "epoch": 0.6342368505969668, + "grad_norm": 5.0286736488342285, + "learning_rate": 3.143451936104131e-05, + "loss": 1.9513, + "step": 3931 + }, + { + "epoch": 0.6343981929654727, + "grad_norm": 4.726410388946533, + "learning_rate": 3.1410262104782085e-05, + "loss": 1.7122, + "step": 3932 + }, + { + "epoch": 0.6345595353339787, + "grad_norm": 3.8950557708740234, + "learning_rate": 3.138600992468e-05, + "loss": 1.8204, + "step": 3933 + }, + { + "epoch": 0.6347208777024846, + "grad_norm": 3.7046666145324707, + "learning_rate": 3.136176282735741e-05, + "loss": 1.9165, + "step": 3934 + }, + { + "epoch": 0.6348822200709906, + "grad_norm": 3.938035249710083, + "learning_rate": 3.1337520819435303e-05, + "loss": 1.7226, + "step": 3935 + }, + { + "epoch": 0.6350435624394967, + "grad_norm": 7.021340370178223, + "learning_rate": 3.131328390753324e-05, + "loss": 1.8995, + "step": 3936 + }, + { + "epoch": 0.6352049048080026, + "grad_norm": 4.165858745574951, + "learning_rate": 3.12890520982694e-05, + "loss": 1.9557, + "step": 3937 + }, + { + "epoch": 0.6353662471765086, + "grad_norm": 3.9283430576324463, + "learning_rate": 3.126482539826061e-05, + "loss": 1.8387, + "step": 3938 + }, + { + "epoch": 0.6355275895450145, + "grad_norm": 3.8701705932617188, + "learning_rate": 3.124060381412223e-05, + "loss": 1.8546, + "step": 3939 + }, + { + "epoch": 0.6356889319135205, + "grad_norm": 3.4759013652801514, + "learning_rate": 3.1216387352468305e-05, + "loss": 1.8128, + "step": 3940 + }, + { + "epoch": 0.6358502742820265, + "grad_norm": 4.323712348937988, + "learning_rate": 3.119217601991139e-05, + "loss": 1.797, + "step": 3941 + }, + { + "epoch": 0.6360116166505324, + "grad_norm": 3.5588533878326416, + "learning_rate": 3.1167969823062734e-05, + "loss": 1.9885, + "step": 3942 + }, + { + "epoch": 0.6361729590190384, + "grad_norm": 3.617077589035034, + "learning_rate": 3.11437687685321e-05, + "loss": 2.1947, + "step": 3943 + }, + { + "epoch": 0.6363343013875443, + "grad_norm": 4.54804801940918, + "learning_rate": 3.1119572862927916e-05, + "loss": 1.6814, + "step": 3944 + }, + { + "epoch": 0.6364956437560504, + "grad_norm": 3.9997711181640625, + "learning_rate": 3.109538211285714e-05, + "loss": 1.8827, + "step": 3945 + }, + { + "epoch": 0.6366569861245563, + "grad_norm": 3.9156370162963867, + "learning_rate": 3.10711965249254e-05, + "loss": 1.8592, + "step": 3946 + }, + { + "epoch": 0.6368183284930623, + "grad_norm": 4.000783443450928, + "learning_rate": 3.10470161057368e-05, + "loss": 1.7229, + "step": 3947 + }, + { + "epoch": 0.6369796708615683, + "grad_norm": 4.192603588104248, + "learning_rate": 3.1022840861894174e-05, + "loss": 1.7745, + "step": 3948 + }, + { + "epoch": 0.6371410132300742, + "grad_norm": 5.290771484375, + "learning_rate": 3.0998670799998844e-05, + "loss": 2.181, + "step": 3949 + }, + { + "epoch": 0.6373023555985802, + "grad_norm": 4.79047155380249, + "learning_rate": 3.0974505926650724e-05, + "loss": 1.8853, + "step": 3950 + }, + { + "epoch": 0.6374636979670861, + "grad_norm": 4.0281171798706055, + "learning_rate": 3.095034624844835e-05, + "loss": 1.9216, + "step": 3951 + }, + { + "epoch": 0.6376250403355921, + "grad_norm": 3.3467166423797607, + "learning_rate": 3.092619177198881e-05, + "loss": 1.8407, + "step": 3952 + }, + { + "epoch": 0.637786382704098, + "grad_norm": 4.751272678375244, + "learning_rate": 3.090204250386779e-05, + "loss": 1.7854, + "step": 3953 + }, + { + "epoch": 0.6379477250726041, + "grad_norm": 6.508646488189697, + "learning_rate": 3.087789845067953e-05, + "loss": 1.8706, + "step": 3954 + }, + { + "epoch": 0.6381090674411101, + "grad_norm": 3.634064197540283, + "learning_rate": 3.0853759619016896e-05, + "loss": 1.733, + "step": 3955 + }, + { + "epoch": 0.638270409809616, + "grad_norm": 7.2373151779174805, + "learning_rate": 3.0829626015471245e-05, + "loss": 1.9196, + "step": 3956 + }, + { + "epoch": 0.638431752178122, + "grad_norm": 4.87274694442749, + "learning_rate": 3.080549764663261e-05, + "loss": 1.8556, + "step": 3957 + }, + { + "epoch": 0.6385930945466279, + "grad_norm": 3.414288282394409, + "learning_rate": 3.078137451908949e-05, + "loss": 1.9769, + "step": 3958 + }, + { + "epoch": 0.6387544369151339, + "grad_norm": 4.94952392578125, + "learning_rate": 3.0757256639429025e-05, + "loss": 1.8745, + "step": 3959 + }, + { + "epoch": 0.6389157792836399, + "grad_norm": 4.189605236053467, + "learning_rate": 3.07331440142369e-05, + "loss": 1.6985, + "step": 3960 + }, + { + "epoch": 0.6390771216521458, + "grad_norm": 3.7005834579467773, + "learning_rate": 3.070903665009738e-05, + "loss": 1.8928, + "step": 3961 + }, + { + "epoch": 0.6392384640206519, + "grad_norm": 5.443114757537842, + "learning_rate": 3.0684934553593244e-05, + "loss": 1.8405, + "step": 3962 + }, + { + "epoch": 0.6393998063891578, + "grad_norm": 4.24349308013916, + "learning_rate": 3.066083773130588e-05, + "loss": 1.8022, + "step": 3963 + }, + { + "epoch": 0.6395611487576638, + "grad_norm": 3.877264976501465, + "learning_rate": 3.0636746189815235e-05, + "loss": 2.1855, + "step": 3964 + }, + { + "epoch": 0.6397224911261697, + "grad_norm": 5.466297149658203, + "learning_rate": 3.0612659935699774e-05, + "loss": 2.0294, + "step": 3965 + }, + { + "epoch": 0.6398838334946757, + "grad_norm": 4.056169509887695, + "learning_rate": 3.058857897553659e-05, + "loss": 1.9896, + "step": 3966 + }, + { + "epoch": 0.6400451758631817, + "grad_norm": 4.240475654602051, + "learning_rate": 3.0564503315901226e-05, + "loss": 1.8254, + "step": 3967 + }, + { + "epoch": 0.6402065182316876, + "grad_norm": 3.5806643962860107, + "learning_rate": 3.0540432963367905e-05, + "loss": 1.7046, + "step": 3968 + }, + { + "epoch": 0.6403678606001936, + "grad_norm": 4.3338117599487305, + "learning_rate": 3.051636792450928e-05, + "loss": 1.9001, + "step": 3969 + }, + { + "epoch": 0.6405292029686995, + "grad_norm": 3.7095282077789307, + "learning_rate": 3.0492308205896635e-05, + "loss": 1.5849, + "step": 3970 + }, + { + "epoch": 0.6406905453372056, + "grad_norm": 4.299651145935059, + "learning_rate": 3.0468253814099756e-05, + "loss": 1.8346, + "step": 3971 + }, + { + "epoch": 0.6408518877057116, + "grad_norm": 5.332083225250244, + "learning_rate": 3.044420475568701e-05, + "loss": 1.8271, + "step": 3972 + }, + { + "epoch": 0.6410132300742175, + "grad_norm": 4.321498394012451, + "learning_rate": 3.042016103722526e-05, + "loss": 1.8567, + "step": 3973 + }, + { + "epoch": 0.6411745724427235, + "grad_norm": 4.005444526672363, + "learning_rate": 3.039612266527998e-05, + "loss": 1.6229, + "step": 3974 + }, + { + "epoch": 0.6413359148112294, + "grad_norm": 3.9515342712402344, + "learning_rate": 3.0372089646415125e-05, + "loss": 1.938, + "step": 3975 + }, + { + "epoch": 0.6414972571797354, + "grad_norm": 4.833498954772949, + "learning_rate": 3.0348061987193178e-05, + "loss": 1.8208, + "step": 3976 + }, + { + "epoch": 0.6416585995482413, + "grad_norm": 4.180032253265381, + "learning_rate": 3.0324039694175233e-05, + "loss": 1.9654, + "step": 3977 + }, + { + "epoch": 0.6418199419167473, + "grad_norm": 5.152658462524414, + "learning_rate": 3.030002277392085e-05, + "loss": 1.9309, + "step": 3978 + }, + { + "epoch": 0.6419812842852534, + "grad_norm": 3.8878352642059326, + "learning_rate": 3.0276011232988145e-05, + "loss": 1.8735, + "step": 3979 + }, + { + "epoch": 0.6421426266537593, + "grad_norm": 5.087482929229736, + "learning_rate": 3.0252005077933775e-05, + "loss": 1.86, + "step": 3980 + }, + { + "epoch": 0.6423039690222653, + "grad_norm": 3.9759790897369385, + "learning_rate": 3.0228004315312917e-05, + "loss": 1.6493, + "step": 3981 + }, + { + "epoch": 0.6424653113907712, + "grad_norm": 6.068326473236084, + "learning_rate": 3.0204008951679246e-05, + "loss": 1.9676, + "step": 3982 + }, + { + "epoch": 0.6426266537592772, + "grad_norm": 4.612680912017822, + "learning_rate": 3.018001899358504e-05, + "loss": 1.8544, + "step": 3983 + }, + { + "epoch": 0.6427879961277831, + "grad_norm": 4.5577826499938965, + "learning_rate": 3.0156034447581005e-05, + "loss": 1.9273, + "step": 3984 + }, + { + "epoch": 0.6429493384962891, + "grad_norm": 3.682194948196411, + "learning_rate": 3.0132055320216468e-05, + "loss": 1.7893, + "step": 3985 + }, + { + "epoch": 0.6431106808647951, + "grad_norm": 4.775866508483887, + "learning_rate": 3.0108081618039167e-05, + "loss": 1.8938, + "step": 3986 + }, + { + "epoch": 0.643272023233301, + "grad_norm": 4.182424545288086, + "learning_rate": 3.008411334759548e-05, + "loss": 1.809, + "step": 3987 + }, + { + "epoch": 0.6434333656018071, + "grad_norm": 3.62164568901062, + "learning_rate": 3.0060150515430198e-05, + "loss": 2.0824, + "step": 3988 + }, + { + "epoch": 0.643594707970313, + "grad_norm": 3.801424503326416, + "learning_rate": 3.0036193128086665e-05, + "loss": 1.654, + "step": 3989 + }, + { + "epoch": 0.643756050338819, + "grad_norm": 4.317459583282471, + "learning_rate": 3.001224119210676e-05, + "loss": 1.8066, + "step": 3990 + }, + { + "epoch": 0.643917392707325, + "grad_norm": 4.665870666503906, + "learning_rate": 2.9988294714030833e-05, + "loss": 2.182, + "step": 3991 + }, + { + "epoch": 0.6440787350758309, + "grad_norm": 4.350362777709961, + "learning_rate": 2.9964353700397797e-05, + "loss": 1.8528, + "step": 3992 + }, + { + "epoch": 0.6442400774443369, + "grad_norm": 3.7095420360565186, + "learning_rate": 2.9940418157745004e-05, + "loss": 1.8254, + "step": 3993 + }, + { + "epoch": 0.6444014198128428, + "grad_norm": 4.114126205444336, + "learning_rate": 2.9916488092608387e-05, + "loss": 2.025, + "step": 3994 + }, + { + "epoch": 0.6445627621813488, + "grad_norm": 5.245405673980713, + "learning_rate": 2.9892563511522304e-05, + "loss": 1.8729, + "step": 3995 + }, + { + "epoch": 0.6447241045498547, + "grad_norm": 3.8701298236846924, + "learning_rate": 2.98686444210197e-05, + "loss": 1.8739, + "step": 3996 + }, + { + "epoch": 0.6448854469183608, + "grad_norm": 5.725188255310059, + "learning_rate": 2.9844730827631943e-05, + "loss": 1.9052, + "step": 3997 + }, + { + "epoch": 0.6450467892868668, + "grad_norm": 4.8308000564575195, + "learning_rate": 2.9820822737888965e-05, + "loss": 1.934, + "step": 3998 + }, + { + "epoch": 0.6452081316553727, + "grad_norm": 4.190275192260742, + "learning_rate": 2.979692015831913e-05, + "loss": 1.7112, + "step": 3999 + }, + { + "epoch": 0.6453694740238787, + "grad_norm": 3.8999693393707275, + "learning_rate": 2.9773023095449355e-05, + "loss": 1.8895, + "step": 4000 + }, + { + "epoch": 0.6455308163923846, + "grad_norm": 5.109170913696289, + "learning_rate": 2.9749131555805033e-05, + "loss": 1.7905, + "step": 4001 + }, + { + "epoch": 0.6456921587608906, + "grad_norm": 5.036904335021973, + "learning_rate": 2.9725245545910008e-05, + "loss": 1.8134, + "step": 4002 + }, + { + "epoch": 0.6458535011293965, + "grad_norm": 4.558663368225098, + "learning_rate": 2.97013650722867e-05, + "loss": 2.1622, + "step": 4003 + }, + { + "epoch": 0.6460148434979025, + "grad_norm": 6.16807222366333, + "learning_rate": 2.9677490141455916e-05, + "loss": 1.7927, + "step": 4004 + }, + { + "epoch": 0.6461761858664086, + "grad_norm": 5.861327171325684, + "learning_rate": 2.965362075993705e-05, + "loss": 1.9503, + "step": 4005 + }, + { + "epoch": 0.6463375282349145, + "grad_norm": 3.5244321823120117, + "learning_rate": 2.9629756934247883e-05, + "loss": 2.043, + "step": 4006 + }, + { + "epoch": 0.6464988706034205, + "grad_norm": 4.2635416984558105, + "learning_rate": 2.9605898670904774e-05, + "loss": 1.99, + "step": 4007 + }, + { + "epoch": 0.6466602129719264, + "grad_norm": 5.670888423919678, + "learning_rate": 2.958204597642248e-05, + "loss": 1.8356, + "step": 4008 + }, + { + "epoch": 0.6468215553404324, + "grad_norm": 5.056757926940918, + "learning_rate": 2.955819885731429e-05, + "loss": 1.6895, + "step": 4009 + }, + { + "epoch": 0.6469828977089384, + "grad_norm": 4.226497650146484, + "learning_rate": 2.9534357320091937e-05, + "loss": 1.9286, + "step": 4010 + }, + { + "epoch": 0.6471442400774443, + "grad_norm": 4.541499614715576, + "learning_rate": 2.9510521371265676e-05, + "loss": 1.7026, + "step": 4011 + }, + { + "epoch": 0.6473055824459503, + "grad_norm": 5.744909763336182, + "learning_rate": 2.948669101734419e-05, + "loss": 2.0614, + "step": 4012 + }, + { + "epoch": 0.6474669248144562, + "grad_norm": 5.761630535125732, + "learning_rate": 2.946286626483463e-05, + "loss": 1.7397, + "step": 4013 + }, + { + "epoch": 0.6476282671829623, + "grad_norm": 3.778589963912964, + "learning_rate": 2.943904712024268e-05, + "loss": 1.7743, + "step": 4014 + }, + { + "epoch": 0.6477896095514682, + "grad_norm": 5.145801067352295, + "learning_rate": 2.941523359007241e-05, + "loss": 2.186, + "step": 4015 + }, + { + "epoch": 0.6479509519199742, + "grad_norm": 3.8931326866149902, + "learning_rate": 2.9391425680826444e-05, + "loss": 1.8601, + "step": 4016 + }, + { + "epoch": 0.6481122942884802, + "grad_norm": 4.508549690246582, + "learning_rate": 2.9367623399005782e-05, + "loss": 1.8606, + "step": 4017 + }, + { + "epoch": 0.6482736366569861, + "grad_norm": 5.005433559417725, + "learning_rate": 2.9343826751109955e-05, + "loss": 1.7911, + "step": 4018 + }, + { + "epoch": 0.6484349790254921, + "grad_norm": 4.451712608337402, + "learning_rate": 2.932003574363692e-05, + "loss": 2.0797, + "step": 4019 + }, + { + "epoch": 0.648596321393998, + "grad_norm": 4.714862823486328, + "learning_rate": 2.9296250383083118e-05, + "loss": 1.7912, + "step": 4020 + }, + { + "epoch": 0.648757663762504, + "grad_norm": 6.01436710357666, + "learning_rate": 2.9272470675943408e-05, + "loss": 1.602, + "step": 4021 + }, + { + "epoch": 0.6489190061310101, + "grad_norm": 3.64809250831604, + "learning_rate": 2.924869662871117e-05, + "loss": 1.8688, + "step": 4022 + }, + { + "epoch": 0.649080348499516, + "grad_norm": 4.9753546714782715, + "learning_rate": 2.922492824787816e-05, + "loss": 1.9459, + "step": 4023 + }, + { + "epoch": 0.649241690868022, + "grad_norm": 6.607090473175049, + "learning_rate": 2.9201165539934673e-05, + "loss": 1.8438, + "step": 4024 + }, + { + "epoch": 0.6494030332365279, + "grad_norm": 6.046296119689941, + "learning_rate": 2.917740851136939e-05, + "loss": 2.0968, + "step": 4025 + }, + { + "epoch": 0.6495643756050339, + "grad_norm": 4.133302688598633, + "learning_rate": 2.9153657168669428e-05, + "loss": 1.9329, + "step": 4026 + }, + { + "epoch": 0.6497257179735398, + "grad_norm": 4.30959939956665, + "learning_rate": 2.912991151832043e-05, + "loss": 1.9147, + "step": 4027 + }, + { + "epoch": 0.6498870603420458, + "grad_norm": 5.278558254241943, + "learning_rate": 2.91061715668064e-05, + "loss": 1.8902, + "step": 4028 + }, + { + "epoch": 0.6500484027105518, + "grad_norm": 7.491674423217773, + "learning_rate": 2.9082437320609867e-05, + "loss": 2.0245, + "step": 4029 + }, + { + "epoch": 0.6502097450790577, + "grad_norm": 3.6092584133148193, + "learning_rate": 2.9058708786211718e-05, + "loss": 1.7284, + "step": 4030 + }, + { + "epoch": 0.6503710874475638, + "grad_norm": 4.31028413772583, + "learning_rate": 2.9034985970091355e-05, + "loss": 1.6524, + "step": 4031 + }, + { + "epoch": 0.6505324298160697, + "grad_norm": 5.1833672523498535, + "learning_rate": 2.9011268878726556e-05, + "loss": 1.7917, + "step": 4032 + }, + { + "epoch": 0.6506937721845757, + "grad_norm": 3.9345691204071045, + "learning_rate": 2.89875575185936e-05, + "loss": 2.0056, + "step": 4033 + }, + { + "epoch": 0.6508551145530816, + "grad_norm": 4.322170257568359, + "learning_rate": 2.8963851896167128e-05, + "loss": 1.9763, + "step": 4034 + }, + { + "epoch": 0.6510164569215876, + "grad_norm": 3.895662307739258, + "learning_rate": 2.8940152017920286e-05, + "loss": 1.9022, + "step": 4035 + }, + { + "epoch": 0.6511777992900936, + "grad_norm": 5.397833824157715, + "learning_rate": 2.891645789032459e-05, + "loss": 1.9762, + "step": 4036 + }, + { + "epoch": 0.6513391416585995, + "grad_norm": 5.5054826736450195, + "learning_rate": 2.889276951985005e-05, + "loss": 2.0662, + "step": 4037 + }, + { + "epoch": 0.6515004840271055, + "grad_norm": 5.851878643035889, + "learning_rate": 2.886908691296504e-05, + "loss": 2.0854, + "step": 4038 + }, + { + "epoch": 0.6516618263956114, + "grad_norm": 5.049559116363525, + "learning_rate": 2.884541007613637e-05, + "loss": 1.9758, + "step": 4039 + }, + { + "epoch": 0.6518231687641175, + "grad_norm": 3.9392457008361816, + "learning_rate": 2.8821739015829337e-05, + "loss": 1.833, + "step": 4040 + }, + { + "epoch": 0.6519845111326235, + "grad_norm": 5.671417236328125, + "learning_rate": 2.879807373850759e-05, + "loss": 1.9672, + "step": 4041 + }, + { + "epoch": 0.6521458535011294, + "grad_norm": 5.260490417480469, + "learning_rate": 2.8774414250633212e-05, + "loss": 1.8732, + "step": 4042 + }, + { + "epoch": 0.6523071958696354, + "grad_norm": 5.128913879394531, + "learning_rate": 2.8750760558666757e-05, + "loss": 1.7957, + "step": 4043 + }, + { + "epoch": 0.6524685382381413, + "grad_norm": 4.581787586212158, + "learning_rate": 2.872711266906713e-05, + "loss": 1.7629, + "step": 4044 + }, + { + "epoch": 0.6526298806066473, + "grad_norm": 4.499758243560791, + "learning_rate": 2.870347058829167e-05, + "loss": 2.0256, + "step": 4045 + }, + { + "epoch": 0.6527912229751532, + "grad_norm": 3.7136127948760986, + "learning_rate": 2.867983432279616e-05, + "loss": 1.8258, + "step": 4046 + }, + { + "epoch": 0.6529525653436592, + "grad_norm": 3.6342873573303223, + "learning_rate": 2.865620387903476e-05, + "loss": 1.8578, + "step": 4047 + }, + { + "epoch": 0.6531139077121653, + "grad_norm": 3.641798973083496, + "learning_rate": 2.8632579263460068e-05, + "loss": 1.7957, + "step": 4048 + }, + { + "epoch": 0.6532752500806712, + "grad_norm": 3.681467056274414, + "learning_rate": 2.8608960482523056e-05, + "loss": 1.9007, + "step": 4049 + }, + { + "epoch": 0.6534365924491772, + "grad_norm": 4.51108455657959, + "learning_rate": 2.8585347542673156e-05, + "loss": 1.7851, + "step": 4050 + }, + { + "epoch": 0.6535979348176831, + "grad_norm": 3.876131534576416, + "learning_rate": 2.8561740450358142e-05, + "loss": 1.668, + "step": 4051 + }, + { + "epoch": 0.6537592771861891, + "grad_norm": 3.4206247329711914, + "learning_rate": 2.853813921202423e-05, + "loss": 1.99, + "step": 4052 + }, + { + "epoch": 0.653920619554695, + "grad_norm": 4.4058122634887695, + "learning_rate": 2.8514543834116037e-05, + "loss": 1.8108, + "step": 4053 + }, + { + "epoch": 0.654081961923201, + "grad_norm": 5.689077377319336, + "learning_rate": 2.8490954323076546e-05, + "loss": 2.0066, + "step": 4054 + }, + { + "epoch": 0.654243304291707, + "grad_norm": 5.168551921844482, + "learning_rate": 2.8467370685347205e-05, + "loss": 1.8448, + "step": 4055 + }, + { + "epoch": 0.6544046466602129, + "grad_norm": 4.483346939086914, + "learning_rate": 2.844379292736778e-05, + "loss": 1.6863, + "step": 4056 + }, + { + "epoch": 0.654565989028719, + "grad_norm": 4.934708118438721, + "learning_rate": 2.84202210555765e-05, + "loss": 1.993, + "step": 4057 + }, + { + "epoch": 0.6547273313972249, + "grad_norm": 3.775606155395508, + "learning_rate": 2.8396655076409923e-05, + "loss": 1.8005, + "step": 4058 + }, + { + "epoch": 0.6548886737657309, + "grad_norm": 3.6856369972229004, + "learning_rate": 2.837309499630306e-05, + "loss": 1.964, + "step": 4059 + }, + { + "epoch": 0.6550500161342369, + "grad_norm": 4.163701057434082, + "learning_rate": 2.834954082168928e-05, + "loss": 1.9154, + "step": 4060 + }, + { + "epoch": 0.6552113585027428, + "grad_norm": 4.498649597167969, + "learning_rate": 2.8325992559000313e-05, + "loss": 1.6676, + "step": 4061 + }, + { + "epoch": 0.6553727008712488, + "grad_norm": 4.522988319396973, + "learning_rate": 2.830245021466631e-05, + "loss": 1.7289, + "step": 4062 + }, + { + "epoch": 0.6555340432397547, + "grad_norm": 3.821260690689087, + "learning_rate": 2.8278913795115825e-05, + "loss": 1.9413, + "step": 4063 + }, + { + "epoch": 0.6556953856082607, + "grad_norm": 4.8176093101501465, + "learning_rate": 2.825538330677575e-05, + "loss": 1.5925, + "step": 4064 + }, + { + "epoch": 0.6558567279767668, + "grad_norm": 4.966172695159912, + "learning_rate": 2.823185875607135e-05, + "loss": 1.8659, + "step": 4065 + }, + { + "epoch": 0.6560180703452727, + "grad_norm": 4.94246768951416, + "learning_rate": 2.8208340149426338e-05, + "loss": 1.8868, + "step": 4066 + }, + { + "epoch": 0.6561794127137787, + "grad_norm": 6.051825523376465, + "learning_rate": 2.818482749326272e-05, + "loss": 1.6712, + "step": 4067 + }, + { + "epoch": 0.6563407550822846, + "grad_norm": 6.057215213775635, + "learning_rate": 2.8161320794000955e-05, + "loss": 1.6316, + "step": 4068 + }, + { + "epoch": 0.6565020974507906, + "grad_norm": 3.5899481773376465, + "learning_rate": 2.8137820058059804e-05, + "loss": 1.8105, + "step": 4069 + }, + { + "epoch": 0.6566634398192965, + "grad_norm": 4.588062763214111, + "learning_rate": 2.8114325291856465e-05, + "loss": 1.7202, + "step": 4070 + }, + { + "epoch": 0.6568247821878025, + "grad_norm": 5.241124629974365, + "learning_rate": 2.8090836501806432e-05, + "loss": 1.7447, + "step": 4071 + }, + { + "epoch": 0.6569861245563084, + "grad_norm": 5.199393272399902, + "learning_rate": 2.806735369432365e-05, + "loss": 1.8149, + "step": 4072 + }, + { + "epoch": 0.6571474669248144, + "grad_norm": 3.5903971195220947, + "learning_rate": 2.8043876875820363e-05, + "loss": 1.6505, + "step": 4073 + }, + { + "epoch": 0.6573088092933205, + "grad_norm": 4.2009406089782715, + "learning_rate": 2.802040605270722e-05, + "loss": 1.6893, + "step": 4074 + }, + { + "epoch": 0.6574701516618264, + "grad_norm": 4.394023418426514, + "learning_rate": 2.799694123139322e-05, + "loss": 1.9336, + "step": 4075 + }, + { + "epoch": 0.6576314940303324, + "grad_norm": 4.379487991333008, + "learning_rate": 2.797348241828569e-05, + "loss": 1.9616, + "step": 4076 + }, + { + "epoch": 0.6577928363988383, + "grad_norm": 3.5809361934661865, + "learning_rate": 2.7950029619790397e-05, + "loss": 1.555, + "step": 4077 + }, + { + "epoch": 0.6579541787673443, + "grad_norm": 4.558257102966309, + "learning_rate": 2.7926582842311378e-05, + "loss": 1.8658, + "step": 4078 + }, + { + "epoch": 0.6581155211358503, + "grad_norm": 4.692923545837402, + "learning_rate": 2.790314209225109e-05, + "loss": 1.6488, + "step": 4079 + }, + { + "epoch": 0.6582768635043562, + "grad_norm": 4.130902290344238, + "learning_rate": 2.787970737601031e-05, + "loss": 1.9426, + "step": 4080 + }, + { + "epoch": 0.6584382058728622, + "grad_norm": 4.575941562652588, + "learning_rate": 2.785627869998817e-05, + "loss": 1.8641, + "step": 4081 + }, + { + "epoch": 0.6585995482413682, + "grad_norm": 3.7854466438293457, + "learning_rate": 2.7832856070582146e-05, + "loss": 1.8893, + "step": 4082 + }, + { + "epoch": 0.6587608906098742, + "grad_norm": 4.291102886199951, + "learning_rate": 2.7809439494188117e-05, + "loss": 1.8129, + "step": 4083 + }, + { + "epoch": 0.6589222329783802, + "grad_norm": 5.6730194091796875, + "learning_rate": 2.7786028977200225e-05, + "loss": 1.7007, + "step": 4084 + }, + { + "epoch": 0.6590835753468861, + "grad_norm": 4.0196404457092285, + "learning_rate": 2.7762624526011038e-05, + "loss": 1.7915, + "step": 4085 + }, + { + "epoch": 0.6592449177153921, + "grad_norm": 4.962471008300781, + "learning_rate": 2.773922614701139e-05, + "loss": 2.0735, + "step": 4086 + }, + { + "epoch": 0.659406260083898, + "grad_norm": 5.386277675628662, + "learning_rate": 2.7715833846590532e-05, + "loss": 1.752, + "step": 4087 + }, + { + "epoch": 0.659567602452404, + "grad_norm": 5.766901016235352, + "learning_rate": 2.769244763113601e-05, + "loss": 1.8034, + "step": 4088 + }, + { + "epoch": 0.6597289448209099, + "grad_norm": 4.224059581756592, + "learning_rate": 2.7669067507033697e-05, + "loss": 1.8776, + "step": 4089 + }, + { + "epoch": 0.6598902871894159, + "grad_norm": 5.296021938323975, + "learning_rate": 2.7645693480667856e-05, + "loss": 1.7963, + "step": 4090 + }, + { + "epoch": 0.660051629557922, + "grad_norm": 4.75903844833374, + "learning_rate": 2.7622325558421026e-05, + "loss": 2.0486, + "step": 4091 + }, + { + "epoch": 0.6602129719264279, + "grad_norm": 3.8697707653045654, + "learning_rate": 2.7598963746674132e-05, + "loss": 1.9202, + "step": 4092 + }, + { + "epoch": 0.6603743142949339, + "grad_norm": 5.087195873260498, + "learning_rate": 2.7575608051806374e-05, + "loss": 2.0034, + "step": 4093 + }, + { + "epoch": 0.6605356566634398, + "grad_norm": 4.344791889190674, + "learning_rate": 2.7552258480195347e-05, + "loss": 1.822, + "step": 4094 + }, + { + "epoch": 0.6606969990319458, + "grad_norm": 3.8124961853027344, + "learning_rate": 2.7528915038216908e-05, + "loss": 1.8831, + "step": 4095 + }, + { + "epoch": 0.6608583414004517, + "grad_norm": 4.018934726715088, + "learning_rate": 2.750557773224531e-05, + "loss": 1.7947, + "step": 4096 + }, + { + "epoch": 0.6610196837689577, + "grad_norm": 3.811086654663086, + "learning_rate": 2.7482246568653043e-05, + "loss": 1.9091, + "step": 4097 + }, + { + "epoch": 0.6611810261374637, + "grad_norm": 5.172258377075195, + "learning_rate": 2.745892155381101e-05, + "loss": 1.8699, + "step": 4098 + }, + { + "epoch": 0.6613423685059696, + "grad_norm": 4.130087375640869, + "learning_rate": 2.7435602694088386e-05, + "loss": 1.8169, + "step": 4099 + }, + { + "epoch": 0.6615037108744757, + "grad_norm": 5.704213619232178, + "learning_rate": 2.7412289995852657e-05, + "loss": 1.8186, + "step": 4100 + }, + { + "epoch": 0.6616650532429816, + "grad_norm": 3.660961389541626, + "learning_rate": 2.7388983465469665e-05, + "loss": 1.9181, + "step": 4101 + }, + { + "epoch": 0.6618263956114876, + "grad_norm": 4.935924530029297, + "learning_rate": 2.7365683109303498e-05, + "loss": 1.9634, + "step": 4102 + }, + { + "epoch": 0.6619877379799936, + "grad_norm": 4.407436370849609, + "learning_rate": 2.7342388933716668e-05, + "loss": 1.8782, + "step": 4103 + }, + { + "epoch": 0.6621490803484995, + "grad_norm": 4.4017205238342285, + "learning_rate": 2.731910094506988e-05, + "loss": 1.75, + "step": 4104 + }, + { + "epoch": 0.6623104227170055, + "grad_norm": 3.8433196544647217, + "learning_rate": 2.7295819149722258e-05, + "loss": 1.9688, + "step": 4105 + }, + { + "epoch": 0.6624717650855114, + "grad_norm": 3.9323267936706543, + "learning_rate": 2.7272543554031137e-05, + "loss": 1.6916, + "step": 4106 + }, + { + "epoch": 0.6626331074540174, + "grad_norm": 4.648200988769531, + "learning_rate": 2.7249274164352255e-05, + "loss": 1.8686, + "step": 4107 + }, + { + "epoch": 0.6627944498225234, + "grad_norm": 4.506359577178955, + "learning_rate": 2.7226010987039552e-05, + "loss": 2.0056, + "step": 4108 + }, + { + "epoch": 0.6629557921910294, + "grad_norm": 4.010270595550537, + "learning_rate": 2.7202754028445376e-05, + "loss": 1.9273, + "step": 4109 + }, + { + "epoch": 0.6631171345595354, + "grad_norm": 5.485168933868408, + "learning_rate": 2.717950329492028e-05, + "loss": 1.9363, + "step": 4110 + }, + { + "epoch": 0.6632784769280413, + "grad_norm": 4.302667140960693, + "learning_rate": 2.7156258792813218e-05, + "loss": 2.0121, + "step": 4111 + }, + { + "epoch": 0.6634398192965473, + "grad_norm": 5.655758857727051, + "learning_rate": 2.713302052847132e-05, + "loss": 2.0036, + "step": 4112 + }, + { + "epoch": 0.6636011616650532, + "grad_norm": 3.56538724899292, + "learning_rate": 2.710978850824014e-05, + "loss": 2.0621, + "step": 4113 + }, + { + "epoch": 0.6637625040335592, + "grad_norm": 4.988595008850098, + "learning_rate": 2.708656273846345e-05, + "loss": 1.692, + "step": 4114 + }, + { + "epoch": 0.6639238464020651, + "grad_norm": 4.9897613525390625, + "learning_rate": 2.7063343225483308e-05, + "loss": 1.8756, + "step": 4115 + }, + { + "epoch": 0.6640851887705711, + "grad_norm": 3.808546781539917, + "learning_rate": 2.7040129975640123e-05, + "loss": 2.2095, + "step": 4116 + }, + { + "epoch": 0.6642465311390772, + "grad_norm": 4.039368629455566, + "learning_rate": 2.701692299527252e-05, + "loss": 1.7944, + "step": 4117 + }, + { + "epoch": 0.6644078735075831, + "grad_norm": 5.334078311920166, + "learning_rate": 2.69937222907175e-05, + "loss": 1.7066, + "step": 4118 + }, + { + "epoch": 0.6645692158760891, + "grad_norm": 3.4958536624908447, + "learning_rate": 2.697052786831027e-05, + "loss": 1.8442, + "step": 4119 + }, + { + "epoch": 0.664730558244595, + "grad_norm": 4.124898433685303, + "learning_rate": 2.6947339734384364e-05, + "loss": 1.7974, + "step": 4120 + }, + { + "epoch": 0.664891900613101, + "grad_norm": 4.844030857086182, + "learning_rate": 2.6924157895271563e-05, + "loss": 1.7956, + "step": 4121 + }, + { + "epoch": 0.665053242981607, + "grad_norm": 4.001951694488525, + "learning_rate": 2.6900982357301997e-05, + "loss": 1.9523, + "step": 4122 + }, + { + "epoch": 0.6652145853501129, + "grad_norm": 4.164713382720947, + "learning_rate": 2.687781312680398e-05, + "loss": 1.5221, + "step": 4123 + }, + { + "epoch": 0.6653759277186189, + "grad_norm": 4.255041122436523, + "learning_rate": 2.685465021010421e-05, + "loss": 2.0482, + "step": 4124 + }, + { + "epoch": 0.6655372700871249, + "grad_norm": 3.6462719440460205, + "learning_rate": 2.683149361352756e-05, + "loss": 1.8136, + "step": 4125 + }, + { + "epoch": 0.6656986124556309, + "grad_norm": 4.150946617126465, + "learning_rate": 2.680834334339727e-05, + "loss": 2.3349, + "step": 4126 + }, + { + "epoch": 0.6658599548241368, + "grad_norm": 5.911656379699707, + "learning_rate": 2.6785199406034784e-05, + "loss": 1.6946, + "step": 4127 + }, + { + "epoch": 0.6660212971926428, + "grad_norm": 4.190311431884766, + "learning_rate": 2.676206180775982e-05, + "loss": 1.7547, + "step": 4128 + }, + { + "epoch": 0.6661826395611488, + "grad_norm": 4.114874839782715, + "learning_rate": 2.6738930554890418e-05, + "loss": 1.8269, + "step": 4129 + }, + { + "epoch": 0.6663439819296547, + "grad_norm": 3.957735061645508, + "learning_rate": 2.671580565374282e-05, + "loss": 1.701, + "step": 4130 + }, + { + "epoch": 0.6665053242981607, + "grad_norm": 6.421628475189209, + "learning_rate": 2.6692687110631597e-05, + "loss": 1.6187, + "step": 4131 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 6.102411270141602, + "learning_rate": 2.6669574931869523e-05, + "loss": 1.9229, + "step": 4132 + }, + { + "epoch": 0.6668280090351726, + "grad_norm": 3.5106594562530518, + "learning_rate": 2.6646469123767694e-05, + "loss": 1.8732, + "step": 4133 + }, + { + "epoch": 0.6669893514036787, + "grad_norm": 4.111449241638184, + "learning_rate": 2.6623369692635404e-05, + "loss": 1.6204, + "step": 4134 + }, + { + "epoch": 0.6671506937721846, + "grad_norm": 4.131481647491455, + "learning_rate": 2.6600276644780275e-05, + "loss": 2.0642, + "step": 4135 + }, + { + "epoch": 0.6673120361406906, + "grad_norm": 5.979979991912842, + "learning_rate": 2.6577189986508123e-05, + "loss": 2.0188, + "step": 4136 + }, + { + "epoch": 0.6674733785091965, + "grad_norm": 4.239624500274658, + "learning_rate": 2.6554109724123027e-05, + "loss": 1.9138, + "step": 4137 + }, + { + "epoch": 0.6676347208777025, + "grad_norm": 5.137545108795166, + "learning_rate": 2.6531035863927378e-05, + "loss": 1.9357, + "step": 4138 + }, + { + "epoch": 0.6677960632462084, + "grad_norm": 4.375874042510986, + "learning_rate": 2.650796841222176e-05, + "loss": 1.7126, + "step": 4139 + }, + { + "epoch": 0.6679574056147144, + "grad_norm": 4.2277021408081055, + "learning_rate": 2.648490737530503e-05, + "loss": 1.8765, + "step": 4140 + }, + { + "epoch": 0.6681187479832204, + "grad_norm": 5.449302673339844, + "learning_rate": 2.646185275947426e-05, + "loss": 1.6811, + "step": 4141 + }, + { + "epoch": 0.6682800903517263, + "grad_norm": 4.741269111633301, + "learning_rate": 2.6438804571024835e-05, + "loss": 1.8743, + "step": 4142 + }, + { + "epoch": 0.6684414327202324, + "grad_norm": 5.969488143920898, + "learning_rate": 2.641576281625031e-05, + "loss": 2.1562, + "step": 4143 + }, + { + "epoch": 0.6686027750887383, + "grad_norm": 4.281977653503418, + "learning_rate": 2.6392727501442572e-05, + "loss": 1.8982, + "step": 4144 + }, + { + "epoch": 0.6687641174572443, + "grad_norm": 3.7031009197235107, + "learning_rate": 2.6369698632891638e-05, + "loss": 1.972, + "step": 4145 + }, + { + "epoch": 0.6689254598257502, + "grad_norm": 4.476454257965088, + "learning_rate": 2.6346676216885873e-05, + "loss": 1.9243, + "step": 4146 + }, + { + "epoch": 0.6690868021942562, + "grad_norm": 4.0415730476379395, + "learning_rate": 2.6323660259711795e-05, + "loss": 1.7471, + "step": 4147 + }, + { + "epoch": 0.6692481445627622, + "grad_norm": 4.301098346710205, + "learning_rate": 2.6300650767654234e-05, + "loss": 1.8926, + "step": 4148 + }, + { + "epoch": 0.6694094869312681, + "grad_norm": 5.061169147491455, + "learning_rate": 2.627764774699617e-05, + "loss": 1.8014, + "step": 4149 + }, + { + "epoch": 0.6695708292997741, + "grad_norm": 4.341492652893066, + "learning_rate": 2.625465120401891e-05, + "loss": 1.9786, + "step": 4150 + }, + { + "epoch": 0.6697321716682801, + "grad_norm": 5.11741304397583, + "learning_rate": 2.623166114500192e-05, + "loss": 2.1888, + "step": 4151 + }, + { + "epoch": 0.6698935140367861, + "grad_norm": 4.154516220092773, + "learning_rate": 2.6208677576222896e-05, + "loss": 1.7038, + "step": 4152 + }, + { + "epoch": 0.670054856405292, + "grad_norm": 3.9316623210906982, + "learning_rate": 2.6185700503957823e-05, + "loss": 1.8391, + "step": 4153 + }, + { + "epoch": 0.670216198773798, + "grad_norm": 7.304197788238525, + "learning_rate": 2.6162729934480844e-05, + "loss": 2.0465, + "step": 4154 + }, + { + "epoch": 0.670377541142304, + "grad_norm": 3.8144609928131104, + "learning_rate": 2.6139765874064382e-05, + "loss": 1.7615, + "step": 4155 + }, + { + "epoch": 0.6705388835108099, + "grad_norm": 3.904538869857788, + "learning_rate": 2.6116808328979054e-05, + "loss": 2.3631, + "step": 4156 + }, + { + "epoch": 0.6707002258793159, + "grad_norm": 5.528686046600342, + "learning_rate": 2.6093857305493664e-05, + "loss": 1.728, + "step": 4157 + }, + { + "epoch": 0.6708615682478218, + "grad_norm": 4.866077899932861, + "learning_rate": 2.6070912809875324e-05, + "loss": 1.9217, + "step": 4158 + }, + { + "epoch": 0.6710229106163278, + "grad_norm": 3.7308342456817627, + "learning_rate": 2.6047974848389285e-05, + "loss": 1.8429, + "step": 4159 + }, + { + "epoch": 0.6711842529848339, + "grad_norm": 4.611958026885986, + "learning_rate": 2.602504342729902e-05, + "loss": 1.5838, + "step": 4160 + }, + { + "epoch": 0.6713455953533398, + "grad_norm": 6.402384281158447, + "learning_rate": 2.6002118552866284e-05, + "loss": 2.021, + "step": 4161 + }, + { + "epoch": 0.6715069377218458, + "grad_norm": 4.659445762634277, + "learning_rate": 2.5979200231350946e-05, + "loss": 1.8546, + "step": 4162 + }, + { + "epoch": 0.6716682800903517, + "grad_norm": 3.7538390159606934, + "learning_rate": 2.595628846901118e-05, + "loss": 1.7822, + "step": 4163 + }, + { + "epoch": 0.6718296224588577, + "grad_norm": 4.030730724334717, + "learning_rate": 2.593338327210332e-05, + "loss": 1.5749, + "step": 4164 + }, + { + "epoch": 0.6719909648273636, + "grad_norm": 3.891728401184082, + "learning_rate": 2.5910484646881862e-05, + "loss": 1.7048, + "step": 4165 + }, + { + "epoch": 0.6721523071958696, + "grad_norm": 3.5389225482940674, + "learning_rate": 2.5887592599599618e-05, + "loss": 1.6243, + "step": 4166 + }, + { + "epoch": 0.6723136495643756, + "grad_norm": 4.831793785095215, + "learning_rate": 2.586470713650751e-05, + "loss": 2.1921, + "step": 4167 + }, + { + "epoch": 0.6724749919328816, + "grad_norm": 4.646327495574951, + "learning_rate": 2.5841828263854717e-05, + "loss": 1.8703, + "step": 4168 + }, + { + "epoch": 0.6726363343013876, + "grad_norm": 3.7683122158050537, + "learning_rate": 2.581895598788857e-05, + "loss": 1.7485, + "step": 4169 + }, + { + "epoch": 0.6727976766698935, + "grad_norm": 4.0349626541137695, + "learning_rate": 2.5796090314854663e-05, + "loss": 1.7957, + "step": 4170 + }, + { + "epoch": 0.6729590190383995, + "grad_norm": 3.9039664268493652, + "learning_rate": 2.577323125099671e-05, + "loss": 2.0349, + "step": 4171 + }, + { + "epoch": 0.6731203614069055, + "grad_norm": 3.512286901473999, + "learning_rate": 2.5750378802556707e-05, + "loss": 1.8589, + "step": 4172 + }, + { + "epoch": 0.6732817037754114, + "grad_norm": 5.282522201538086, + "learning_rate": 2.5727532975774737e-05, + "loss": 1.8648, + "step": 4173 + }, + { + "epoch": 0.6734430461439174, + "grad_norm": 5.973352909088135, + "learning_rate": 2.57046937768892e-05, + "loss": 1.8859, + "step": 4174 + }, + { + "epoch": 0.6736043885124233, + "grad_norm": 4.13266658782959, + "learning_rate": 2.5681861212136578e-05, + "loss": 1.8761, + "step": 4175 + }, + { + "epoch": 0.6737657308809293, + "grad_norm": 5.284298896789551, + "learning_rate": 2.5659035287751575e-05, + "loss": 2.4626, + "step": 4176 + }, + { + "epoch": 0.6739270732494353, + "grad_norm": 3.9527158737182617, + "learning_rate": 2.563621600996714e-05, + "loss": 2.0104, + "step": 4177 + }, + { + "epoch": 0.6740884156179413, + "grad_norm": 5.228986740112305, + "learning_rate": 2.5613403385014323e-05, + "loss": 2.0879, + "step": 4178 + }, + { + "epoch": 0.6742497579864473, + "grad_norm": 3.9151477813720703, + "learning_rate": 2.5590597419122396e-05, + "loss": 1.8697, + "step": 4179 + }, + { + "epoch": 0.6744111003549532, + "grad_norm": 4.744083404541016, + "learning_rate": 2.5567798118518792e-05, + "loss": 1.6128, + "step": 4180 + }, + { + "epoch": 0.6745724427234592, + "grad_norm": 4.110485076904297, + "learning_rate": 2.5545005489429187e-05, + "loss": 1.8802, + "step": 4181 + }, + { + "epoch": 0.6747337850919651, + "grad_norm": 3.936558723449707, + "learning_rate": 2.552221953807734e-05, + "loss": 2.0994, + "step": 4182 + }, + { + "epoch": 0.6748951274604711, + "grad_norm": 4.597769737243652, + "learning_rate": 2.5499440270685277e-05, + "loss": 1.8867, + "step": 4183 + }, + { + "epoch": 0.675056469828977, + "grad_norm": 3.885920524597168, + "learning_rate": 2.547666769347312e-05, + "loss": 2.1563, + "step": 4184 + }, + { + "epoch": 0.6752178121974831, + "grad_norm": 4.100528717041016, + "learning_rate": 2.5453901812659242e-05, + "loss": 1.8185, + "step": 4185 + }, + { + "epoch": 0.6753791545659891, + "grad_norm": 5.49969482421875, + "learning_rate": 2.5431142634460115e-05, + "loss": 1.9517, + "step": 4186 + }, + { + "epoch": 0.675540496934495, + "grad_norm": 5.925865173339844, + "learning_rate": 2.5408390165090433e-05, + "loss": 1.8116, + "step": 4187 + }, + { + "epoch": 0.675701839303001, + "grad_norm": 4.8131256103515625, + "learning_rate": 2.538564441076302e-05, + "loss": 1.7581, + "step": 4188 + }, + { + "epoch": 0.6758631816715069, + "grad_norm": 4.608528137207031, + "learning_rate": 2.5362905377688912e-05, + "loss": 1.8789, + "step": 4189 + }, + { + "epoch": 0.6760245240400129, + "grad_norm": 4.1504340171813965, + "learning_rate": 2.5340173072077267e-05, + "loss": 1.8511, + "step": 4190 + }, + { + "epoch": 0.6761858664085189, + "grad_norm": 3.9584457874298096, + "learning_rate": 2.5317447500135406e-05, + "loss": 1.8161, + "step": 4191 + }, + { + "epoch": 0.6763472087770248, + "grad_norm": 5.7329254150390625, + "learning_rate": 2.529472866806885e-05, + "loss": 1.8829, + "step": 4192 + }, + { + "epoch": 0.6765085511455308, + "grad_norm": 4.299564361572266, + "learning_rate": 2.5272016582081236e-05, + "loss": 1.8923, + "step": 4193 + }, + { + "epoch": 0.6766698935140368, + "grad_norm": 4.187342166900635, + "learning_rate": 2.5249311248374406e-05, + "loss": 1.9306, + "step": 4194 + }, + { + "epoch": 0.6768312358825428, + "grad_norm": 4.859185695648193, + "learning_rate": 2.5226612673148314e-05, + "loss": 1.9833, + "step": 4195 + }, + { + "epoch": 0.6769925782510487, + "grad_norm": 4.390213966369629, + "learning_rate": 2.5203920862601073e-05, + "loss": 1.8682, + "step": 4196 + }, + { + "epoch": 0.6771539206195547, + "grad_norm": 3.902789354324341, + "learning_rate": 2.5181235822928996e-05, + "loss": 1.6186, + "step": 4197 + }, + { + "epoch": 0.6773152629880607, + "grad_norm": 4.4394450187683105, + "learning_rate": 2.5158557560326483e-05, + "loss": 1.8283, + "step": 4198 + }, + { + "epoch": 0.6774766053565666, + "grad_norm": 5.325255870819092, + "learning_rate": 2.5135886080986114e-05, + "loss": 1.812, + "step": 4199 + }, + { + "epoch": 0.6776379477250726, + "grad_norm": 5.705172538757324, + "learning_rate": 2.5113221391098642e-05, + "loss": 2.1034, + "step": 4200 + }, + { + "epoch": 0.6777992900935785, + "grad_norm": 4.679775238037109, + "learning_rate": 2.509056349685292e-05, + "loss": 1.8822, + "step": 4201 + }, + { + "epoch": 0.6779606324620845, + "grad_norm": 4.4186201095581055, + "learning_rate": 2.506791240443595e-05, + "loss": 2.0473, + "step": 4202 + }, + { + "epoch": 0.6781219748305906, + "grad_norm": 5.696232318878174, + "learning_rate": 2.5045268120032932e-05, + "loss": 1.8613, + "step": 4203 + }, + { + "epoch": 0.6782833171990965, + "grad_norm": 4.359367370605469, + "learning_rate": 2.5022630649827128e-05, + "loss": 2.0278, + "step": 4204 + }, + { + "epoch": 0.6784446595676025, + "grad_norm": 4.115344524383545, + "learning_rate": 2.500000000000001e-05, + "loss": 1.8859, + "step": 4205 + }, + { + "epoch": 0.6786060019361084, + "grad_norm": 3.563119649887085, + "learning_rate": 2.4977376176731127e-05, + "loss": 2.0457, + "step": 4206 + }, + { + "epoch": 0.6787673443046144, + "grad_norm": 5.598097324371338, + "learning_rate": 2.4954759186198223e-05, + "loss": 1.9112, + "step": 4207 + }, + { + "epoch": 0.6789286866731203, + "grad_norm": 3.9232401847839355, + "learning_rate": 2.4932149034577117e-05, + "loss": 1.7803, + "step": 4208 + }, + { + "epoch": 0.6790900290416263, + "grad_norm": 4.832250118255615, + "learning_rate": 2.4909545728041822e-05, + "loss": 1.8358, + "step": 4209 + }, + { + "epoch": 0.6792513714101323, + "grad_norm": 4.787143230438232, + "learning_rate": 2.488694927276441e-05, + "loss": 1.9455, + "step": 4210 + }, + { + "epoch": 0.6794127137786383, + "grad_norm": 5.10698127746582, + "learning_rate": 2.486435967491516e-05, + "loss": 2.0101, + "step": 4211 + }, + { + "epoch": 0.6795740561471443, + "grad_norm": 4.221131801605225, + "learning_rate": 2.4841776940662408e-05, + "loss": 2.0602, + "step": 4212 + }, + { + "epoch": 0.6797353985156502, + "grad_norm": 4.46644926071167, + "learning_rate": 2.481920107617268e-05, + "loss": 1.7403, + "step": 4213 + }, + { + "epoch": 0.6798967408841562, + "grad_norm": 4.517517566680908, + "learning_rate": 2.4796632087610583e-05, + "loss": 2.1973, + "step": 4214 + }, + { + "epoch": 0.6800580832526621, + "grad_norm": 4.196491241455078, + "learning_rate": 2.4774069981138848e-05, + "loss": 2.1414, + "step": 4215 + }, + { + "epoch": 0.6802194256211681, + "grad_norm": 5.910837650299072, + "learning_rate": 2.475151476291832e-05, + "loss": 2.0071, + "step": 4216 + }, + { + "epoch": 0.6803807679896741, + "grad_norm": 3.324673652648926, + "learning_rate": 2.472896643910802e-05, + "loss": 1.8987, + "step": 4217 + }, + { + "epoch": 0.68054211035818, + "grad_norm": 4.347282886505127, + "learning_rate": 2.4706425015865025e-05, + "loss": 1.7767, + "step": 4218 + }, + { + "epoch": 0.680703452726686, + "grad_norm": 4.210080146789551, + "learning_rate": 2.4683890499344532e-05, + "loss": 1.8631, + "step": 4219 + }, + { + "epoch": 0.680864795095192, + "grad_norm": 3.8832204341888428, + "learning_rate": 2.4661362895699903e-05, + "loss": 1.687, + "step": 4220 + }, + { + "epoch": 0.681026137463698, + "grad_norm": 4.468530654907227, + "learning_rate": 2.4638842211082542e-05, + "loss": 1.8627, + "step": 4221 + }, + { + "epoch": 0.681187479832204, + "grad_norm": 3.586127996444702, + "learning_rate": 2.461632845164204e-05, + "loss": 1.774, + "step": 4222 + }, + { + "epoch": 0.6813488222007099, + "grad_norm": 4.217072486877441, + "learning_rate": 2.4593821623526013e-05, + "loss": 1.7192, + "step": 4223 + }, + { + "epoch": 0.6815101645692159, + "grad_norm": 5.281252861022949, + "learning_rate": 2.457132173288027e-05, + "loss": 1.8081, + "step": 4224 + }, + { + "epoch": 0.6816715069377218, + "grad_norm": 4.885616302490234, + "learning_rate": 2.4548828785848645e-05, + "loss": 2.1488, + "step": 4225 + }, + { + "epoch": 0.6818328493062278, + "grad_norm": 5.50325345993042, + "learning_rate": 2.4526342788573146e-05, + "loss": 2.0298, + "step": 4226 + }, + { + "epoch": 0.6819941916747337, + "grad_norm": 5.605451583862305, + "learning_rate": 2.4503863747193844e-05, + "loss": 1.7473, + "step": 4227 + }, + { + "epoch": 0.6821555340432398, + "grad_norm": 6.766907691955566, + "learning_rate": 2.4481391667848895e-05, + "loss": 2.145, + "step": 4228 + }, + { + "epoch": 0.6823168764117458, + "grad_norm": 5.514566421508789, + "learning_rate": 2.4458926556674615e-05, + "loss": 1.8165, + "step": 4229 + }, + { + "epoch": 0.6824782187802517, + "grad_norm": 3.5636658668518066, + "learning_rate": 2.4436468419805336e-05, + "loss": 1.7146, + "step": 4230 + }, + { + "epoch": 0.6826395611487577, + "grad_norm": 3.876129150390625, + "learning_rate": 2.441401726337358e-05, + "loss": 1.8953, + "step": 4231 + }, + { + "epoch": 0.6828009035172636, + "grad_norm": 5.891449928283691, + "learning_rate": 2.439157309350986e-05, + "loss": 1.8175, + "step": 4232 + }, + { + "epoch": 0.6829622458857696, + "grad_norm": 5.565365791320801, + "learning_rate": 2.4369135916342884e-05, + "loss": 1.7192, + "step": 4233 + }, + { + "epoch": 0.6831235882542755, + "grad_norm": 4.477042198181152, + "learning_rate": 2.434670573799937e-05, + "loss": 1.8873, + "step": 4234 + }, + { + "epoch": 0.6832849306227815, + "grad_norm": 4.289474010467529, + "learning_rate": 2.4324282564604157e-05, + "loss": 1.7617, + "step": 4235 + }, + { + "epoch": 0.6834462729912875, + "grad_norm": 3.986325263977051, + "learning_rate": 2.4301866402280154e-05, + "loss": 1.6774, + "step": 4236 + }, + { + "epoch": 0.6836076153597935, + "grad_norm": 6.523401737213135, + "learning_rate": 2.4279457257148407e-05, + "loss": 1.9909, + "step": 4237 + }, + { + "epoch": 0.6837689577282995, + "grad_norm": 4.952888011932373, + "learning_rate": 2.4257055135327976e-05, + "loss": 1.8376, + "step": 4238 + }, + { + "epoch": 0.6839303000968054, + "grad_norm": 5.137017726898193, + "learning_rate": 2.4234660042936064e-05, + "loss": 2.0815, + "step": 4239 + }, + { + "epoch": 0.6840916424653114, + "grad_norm": 5.571824550628662, + "learning_rate": 2.421227198608792e-05, + "loss": 1.8191, + "step": 4240 + }, + { + "epoch": 0.6842529848338174, + "grad_norm": 4.201406955718994, + "learning_rate": 2.418989097089685e-05, + "loss": 1.8579, + "step": 4241 + }, + { + "epoch": 0.6844143272023233, + "grad_norm": 5.3459696769714355, + "learning_rate": 2.4167517003474304e-05, + "loss": 1.898, + "step": 4242 + }, + { + "epoch": 0.6845756695708293, + "grad_norm": 5.0349650382995605, + "learning_rate": 2.4145150089929743e-05, + "loss": 2.2807, + "step": 4243 + }, + { + "epoch": 0.6847370119393352, + "grad_norm": 4.771966934204102, + "learning_rate": 2.4122790236370756e-05, + "loss": 2.0599, + "step": 4244 + }, + { + "epoch": 0.6848983543078412, + "grad_norm": 4.0456624031066895, + "learning_rate": 2.410043744890294e-05, + "loss": 1.9737, + "step": 4245 + }, + { + "epoch": 0.6850596966763473, + "grad_norm": 5.042922496795654, + "learning_rate": 2.4078091733630043e-05, + "loss": 1.6734, + "step": 4246 + }, + { + "epoch": 0.6852210390448532, + "grad_norm": 7.21657133102417, + "learning_rate": 2.4055753096653794e-05, + "loss": 1.7746, + "step": 4247 + }, + { + "epoch": 0.6853823814133592, + "grad_norm": 3.572934627532959, + "learning_rate": 2.4033421544074073e-05, + "loss": 1.8811, + "step": 4248 + }, + { + "epoch": 0.6855437237818651, + "grad_norm": 3.880200147628784, + "learning_rate": 2.4011097081988747e-05, + "loss": 1.9709, + "step": 4249 + }, + { + "epoch": 0.6857050661503711, + "grad_norm": 5.2548723220825195, + "learning_rate": 2.3988779716493832e-05, + "loss": 2.0609, + "step": 4250 + }, + { + "epoch": 0.685866408518877, + "grad_norm": 5.024470329284668, + "learning_rate": 2.396646945368331e-05, + "loss": 1.7785, + "step": 4251 + }, + { + "epoch": 0.686027750887383, + "grad_norm": 3.6195952892303467, + "learning_rate": 2.3944166299649317e-05, + "loss": 1.7746, + "step": 4252 + }, + { + "epoch": 0.686189093255889, + "grad_norm": 3.8889803886413574, + "learning_rate": 2.392187026048198e-05, + "loss": 1.9318, + "step": 4253 + }, + { + "epoch": 0.686350435624395, + "grad_norm": 3.503079652786255, + "learning_rate": 2.3899581342269516e-05, + "loss": 1.8082, + "step": 4254 + }, + { + "epoch": 0.686511777992901, + "grad_norm": 4.067113876342773, + "learning_rate": 2.3877299551098185e-05, + "loss": 1.7902, + "step": 4255 + }, + { + "epoch": 0.6866731203614069, + "grad_norm": 3.907656669616699, + "learning_rate": 2.3855024893052285e-05, + "loss": 2.1521, + "step": 4256 + }, + { + "epoch": 0.6868344627299129, + "grad_norm": 4.6518425941467285, + "learning_rate": 2.3832757374214222e-05, + "loss": 1.8983, + "step": 4257 + }, + { + "epoch": 0.6869958050984188, + "grad_norm": 4.6036295890808105, + "learning_rate": 2.3810497000664382e-05, + "loss": 1.8195, + "step": 4258 + }, + { + "epoch": 0.6871571474669248, + "grad_norm": 4.095510959625244, + "learning_rate": 2.3788243778481275e-05, + "loss": 1.7026, + "step": 4259 + }, + { + "epoch": 0.6873184898354308, + "grad_norm": 5.651637554168701, + "learning_rate": 2.3765997713741374e-05, + "loss": 1.9468, + "step": 4260 + }, + { + "epoch": 0.6874798322039367, + "grad_norm": 4.282829284667969, + "learning_rate": 2.3743758812519278e-05, + "loss": 1.9029, + "step": 4261 + }, + { + "epoch": 0.6876411745724427, + "grad_norm": 4.165994644165039, + "learning_rate": 2.372152708088756e-05, + "loss": 1.8754, + "step": 4262 + }, + { + "epoch": 0.6878025169409487, + "grad_norm": 3.732685089111328, + "learning_rate": 2.369930252491691e-05, + "loss": 1.9601, + "step": 4263 + }, + { + "epoch": 0.6879638593094547, + "grad_norm": 5.241577625274658, + "learning_rate": 2.3677085150675994e-05, + "loss": 1.765, + "step": 4264 + }, + { + "epoch": 0.6881252016779607, + "grad_norm": 4.694557189941406, + "learning_rate": 2.3654874964231518e-05, + "loss": 1.7963, + "step": 4265 + }, + { + "epoch": 0.6882865440464666, + "grad_norm": 3.5138211250305176, + "learning_rate": 2.3632671971648277e-05, + "loss": 1.725, + "step": 4266 + }, + { + "epoch": 0.6884478864149726, + "grad_norm": 4.039546966552734, + "learning_rate": 2.3610476178989054e-05, + "loss": 1.633, + "step": 4267 + }, + { + "epoch": 0.6886092287834785, + "grad_norm": 4.485110759735107, + "learning_rate": 2.3588287592314717e-05, + "loss": 1.7488, + "step": 4268 + }, + { + "epoch": 0.6887705711519845, + "grad_norm": 3.9109413623809814, + "learning_rate": 2.356610621768408e-05, + "loss": 1.8953, + "step": 4269 + }, + { + "epoch": 0.6889319135204904, + "grad_norm": 4.28920841217041, + "learning_rate": 2.3543932061154096e-05, + "loss": 2.0305, + "step": 4270 + }, + { + "epoch": 0.6890932558889965, + "grad_norm": 6.500674724578857, + "learning_rate": 2.3521765128779643e-05, + "loss": 2.0727, + "step": 4271 + }, + { + "epoch": 0.6892545982575025, + "grad_norm": 4.590287685394287, + "learning_rate": 2.349960542661372e-05, + "loss": 1.904, + "step": 4272 + }, + { + "epoch": 0.6894159406260084, + "grad_norm": 3.8235952854156494, + "learning_rate": 2.3477452960707285e-05, + "loss": 1.6787, + "step": 4273 + }, + { + "epoch": 0.6895772829945144, + "grad_norm": 3.4124348163604736, + "learning_rate": 2.345530773710934e-05, + "loss": 1.8031, + "step": 4274 + }, + { + "epoch": 0.6897386253630203, + "grad_norm": 4.249532222747803, + "learning_rate": 2.3433169761866898e-05, + "loss": 1.6595, + "step": 4275 + }, + { + "epoch": 0.6898999677315263, + "grad_norm": 4.482729434967041, + "learning_rate": 2.341103904102504e-05, + "loss": 1.9365, + "step": 4276 + }, + { + "epoch": 0.6900613101000322, + "grad_norm": 4.421379566192627, + "learning_rate": 2.3388915580626808e-05, + "loss": 1.758, + "step": 4277 + }, + { + "epoch": 0.6902226524685382, + "grad_norm": 6.173230171203613, + "learning_rate": 2.3366799386713277e-05, + "loss": 1.932, + "step": 4278 + }, + { + "epoch": 0.6903839948370442, + "grad_norm": 5.280002117156982, + "learning_rate": 2.3344690465323583e-05, + "loss": 1.9702, + "step": 4279 + }, + { + "epoch": 0.6905453372055502, + "grad_norm": 4.441904067993164, + "learning_rate": 2.332258882249479e-05, + "loss": 2.1519, + "step": 4280 + }, + { + "epoch": 0.6907066795740562, + "grad_norm": 5.471134185791016, + "learning_rate": 2.330049446426208e-05, + "loss": 2.1167, + "step": 4281 + }, + { + "epoch": 0.6908680219425621, + "grad_norm": 3.910367250442505, + "learning_rate": 2.3278407396658536e-05, + "loss": 1.9564, + "step": 4282 + }, + { + "epoch": 0.6910293643110681, + "grad_norm": 4.43289852142334, + "learning_rate": 2.3256327625715347e-05, + "loss": 1.8651, + "step": 4283 + }, + { + "epoch": 0.691190706679574, + "grad_norm": 4.122693061828613, + "learning_rate": 2.323425515746164e-05, + "loss": 1.791, + "step": 4284 + }, + { + "epoch": 0.69135204904808, + "grad_norm": 4.674631118774414, + "learning_rate": 2.3212189997924594e-05, + "loss": 1.9087, + "step": 4285 + }, + { + "epoch": 0.691513391416586, + "grad_norm": 4.20643424987793, + "learning_rate": 2.3190132153129345e-05, + "loss": 1.7867, + "step": 4286 + }, + { + "epoch": 0.6916747337850919, + "grad_norm": 4.392679691314697, + "learning_rate": 2.31680816290991e-05, + "loss": 1.8619, + "step": 4287 + }, + { + "epoch": 0.691836076153598, + "grad_norm": 4.6754045486450195, + "learning_rate": 2.3146038431854977e-05, + "loss": 1.9409, + "step": 4288 + }, + { + "epoch": 0.691997418522104, + "grad_norm": 4.793563365936279, + "learning_rate": 2.3124002567416197e-05, + "loss": 1.7599, + "step": 4289 + }, + { + "epoch": 0.6921587608906099, + "grad_norm": 5.450015544891357, + "learning_rate": 2.310197404179989e-05, + "loss": 1.7792, + "step": 4290 + }, + { + "epoch": 0.6923201032591159, + "grad_norm": 4.160528659820557, + "learning_rate": 2.307995286102121e-05, + "loss": 2.0111, + "step": 4291 + }, + { + "epoch": 0.6924814456276218, + "grad_norm": 4.624261379241943, + "learning_rate": 2.3057939031093344e-05, + "loss": 2.035, + "step": 4292 + }, + { + "epoch": 0.6926427879961278, + "grad_norm": 4.306497097015381, + "learning_rate": 2.3035932558027418e-05, + "loss": 1.7682, + "step": 4293 + }, + { + "epoch": 0.6928041303646337, + "grad_norm": 4.892125129699707, + "learning_rate": 2.3013933447832574e-05, + "loss": 1.7313, + "step": 4294 + }, + { + "epoch": 0.6929654727331397, + "grad_norm": 4.463599681854248, + "learning_rate": 2.2991941706515922e-05, + "loss": 1.9946, + "step": 4295 + }, + { + "epoch": 0.6931268151016456, + "grad_norm": 3.9114396572113037, + "learning_rate": 2.296995734008262e-05, + "loss": 1.8424, + "step": 4296 + }, + { + "epoch": 0.6932881574701517, + "grad_norm": 3.474891424179077, + "learning_rate": 2.2947980354535726e-05, + "loss": 2.013, + "step": 4297 + }, + { + "epoch": 0.6934494998386577, + "grad_norm": 3.928380250930786, + "learning_rate": 2.2926010755876364e-05, + "loss": 1.6954, + "step": 4298 + }, + { + "epoch": 0.6936108422071636, + "grad_norm": 4.158672332763672, + "learning_rate": 2.290404855010357e-05, + "loss": 2.2009, + "step": 4299 + }, + { + "epoch": 0.6937721845756696, + "grad_norm": 4.239987850189209, + "learning_rate": 2.2882093743214426e-05, + "loss": 1.8984, + "step": 4300 + }, + { + "epoch": 0.6939335269441755, + "grad_norm": 4.503664016723633, + "learning_rate": 2.2860146341203937e-05, + "loss": 1.7688, + "step": 4301 + }, + { + "epoch": 0.6940948693126815, + "grad_norm": 4.7965087890625, + "learning_rate": 2.2838206350065145e-05, + "loss": 1.7498, + "step": 4302 + }, + { + "epoch": 0.6942562116811875, + "grad_norm": 5.290661811828613, + "learning_rate": 2.281627377578901e-05, + "loss": 2.0565, + "step": 4303 + }, + { + "epoch": 0.6944175540496934, + "grad_norm": 3.9973301887512207, + "learning_rate": 2.2794348624364476e-05, + "loss": 1.9461, + "step": 4304 + }, + { + "epoch": 0.6945788964181994, + "grad_norm": 5.286022663116455, + "learning_rate": 2.2772430901778514e-05, + "loss": 1.7823, + "step": 4305 + }, + { + "epoch": 0.6947402387867054, + "grad_norm": 4.26722526550293, + "learning_rate": 2.2750520614015993e-05, + "loss": 2.0004, + "step": 4306 + }, + { + "epoch": 0.6949015811552114, + "grad_norm": 5.046075820922852, + "learning_rate": 2.2728617767059824e-05, + "loss": 1.8403, + "step": 4307 + }, + { + "epoch": 0.6950629235237173, + "grad_norm": 4.135906219482422, + "learning_rate": 2.2706722366890807e-05, + "loss": 1.8142, + "step": 4308 + }, + { + "epoch": 0.6952242658922233, + "grad_norm": 9.872830390930176, + "learning_rate": 2.2684834419487798e-05, + "loss": 1.9971, + "step": 4309 + }, + { + "epoch": 0.6953856082607293, + "grad_norm": 4.156027317047119, + "learning_rate": 2.2662953930827546e-05, + "loss": 1.7378, + "step": 4310 + }, + { + "epoch": 0.6955469506292352, + "grad_norm": 4.600287914276123, + "learning_rate": 2.2641080906884764e-05, + "loss": 1.9674, + "step": 4311 + }, + { + "epoch": 0.6957082929977412, + "grad_norm": 4.224839210510254, + "learning_rate": 2.26192153536322e-05, + "loss": 1.6652, + "step": 4312 + }, + { + "epoch": 0.6958696353662471, + "grad_norm": 4.38108491897583, + "learning_rate": 2.2597357277040493e-05, + "loss": 1.733, + "step": 4313 + }, + { + "epoch": 0.6960309777347532, + "grad_norm": 3.771247625350952, + "learning_rate": 2.257550668307823e-05, + "loss": 1.9172, + "step": 4314 + }, + { + "epoch": 0.6961923201032592, + "grad_norm": 4.842396259307861, + "learning_rate": 2.255366357771203e-05, + "loss": 1.8974, + "step": 4315 + }, + { + "epoch": 0.6963536624717651, + "grad_norm": 4.061734199523926, + "learning_rate": 2.253182796690641e-05, + "loss": 2.0048, + "step": 4316 + }, + { + "epoch": 0.6965150048402711, + "grad_norm": 4.278356075286865, + "learning_rate": 2.250999985662382e-05, + "loss": 1.7021, + "step": 4317 + }, + { + "epoch": 0.696676347208777, + "grad_norm": 4.616026878356934, + "learning_rate": 2.2488179252824747e-05, + "loss": 1.7921, + "step": 4318 + }, + { + "epoch": 0.696837689577283, + "grad_norm": 4.216506481170654, + "learning_rate": 2.246636616146753e-05, + "loss": 1.9744, + "step": 4319 + }, + { + "epoch": 0.6969990319457889, + "grad_norm": 3.369396209716797, + "learning_rate": 2.2444560588508533e-05, + "loss": 1.65, + "step": 4320 + }, + { + "epoch": 0.6971603743142949, + "grad_norm": 4.403998374938965, + "learning_rate": 2.2422762539902013e-05, + "loss": 1.9481, + "step": 4321 + }, + { + "epoch": 0.6973217166828009, + "grad_norm": 4.213418483734131, + "learning_rate": 2.2400972021600226e-05, + "loss": 2.0417, + "step": 4322 + }, + { + "epoch": 0.6974830590513069, + "grad_norm": 3.916518211364746, + "learning_rate": 2.2379189039553305e-05, + "loss": 1.8633, + "step": 4323 + }, + { + "epoch": 0.6976444014198129, + "grad_norm": 3.631333827972412, + "learning_rate": 2.2357413599709402e-05, + "loss": 1.9357, + "step": 4324 + }, + { + "epoch": 0.6978057437883188, + "grad_norm": 4.107703685760498, + "learning_rate": 2.233564570801453e-05, + "loss": 2.1404, + "step": 4325 + }, + { + "epoch": 0.6979670861568248, + "grad_norm": 3.8593263626098633, + "learning_rate": 2.2313885370412718e-05, + "loss": 1.8382, + "step": 4326 + }, + { + "epoch": 0.6981284285253307, + "grad_norm": 4.466026306152344, + "learning_rate": 2.229213259284586e-05, + "loss": 2.0149, + "step": 4327 + }, + { + "epoch": 0.6982897708938367, + "grad_norm": 3.4434030055999756, + "learning_rate": 2.227038738125385e-05, + "loss": 2.0153, + "step": 4328 + }, + { + "epoch": 0.6984511132623427, + "grad_norm": 3.6990842819213867, + "learning_rate": 2.224864974157447e-05, + "loss": 2.0005, + "step": 4329 + }, + { + "epoch": 0.6986124556308486, + "grad_norm": 3.6912689208984375, + "learning_rate": 2.2226919679743453e-05, + "loss": 1.7826, + "step": 4330 + }, + { + "epoch": 0.6987737979993547, + "grad_norm": 4.0772199630737305, + "learning_rate": 2.2205197201694446e-05, + "loss": 1.9131, + "step": 4331 + }, + { + "epoch": 0.6989351403678606, + "grad_norm": 3.928609609603882, + "learning_rate": 2.2183482313359066e-05, + "loss": 2.0697, + "step": 4332 + }, + { + "epoch": 0.6990964827363666, + "grad_norm": 4.35366153717041, + "learning_rate": 2.2161775020666818e-05, + "loss": 1.8996, + "step": 4333 + }, + { + "epoch": 0.6992578251048726, + "grad_norm": 4.93165397644043, + "learning_rate": 2.214007532954513e-05, + "loss": 1.9748, + "step": 4334 + }, + { + "epoch": 0.6994191674733785, + "grad_norm": 3.9155633449554443, + "learning_rate": 2.2118383245919406e-05, + "loss": 1.7677, + "step": 4335 + }, + { + "epoch": 0.6995805098418845, + "grad_norm": 4.280554294586182, + "learning_rate": 2.2096698775712894e-05, + "loss": 1.6805, + "step": 4336 + }, + { + "epoch": 0.6997418522103904, + "grad_norm": 3.150489091873169, + "learning_rate": 2.207502192484685e-05, + "loss": 1.7984, + "step": 4337 + }, + { + "epoch": 0.6999031945788964, + "grad_norm": 4.621681213378906, + "learning_rate": 2.2053352699240365e-05, + "loss": 1.9041, + "step": 4338 + }, + { + "epoch": 0.7000645369474023, + "grad_norm": 4.467617034912109, + "learning_rate": 2.2031691104810525e-05, + "loss": 1.9356, + "step": 4339 + }, + { + "epoch": 0.7002258793159084, + "grad_norm": 4.062320709228516, + "learning_rate": 2.201003714747228e-05, + "loss": 1.6686, + "step": 4340 + }, + { + "epoch": 0.7003872216844144, + "grad_norm": 4.962801933288574, + "learning_rate": 2.198839083313849e-05, + "loss": 2.0948, + "step": 4341 + }, + { + "epoch": 0.7005485640529203, + "grad_norm": 5.018860340118408, + "learning_rate": 2.1966752167719984e-05, + "loss": 1.6878, + "step": 4342 + }, + { + "epoch": 0.7007099064214263, + "grad_norm": 4.773665904998779, + "learning_rate": 2.194512115712543e-05, + "loss": 1.6959, + "step": 4343 + }, + { + "epoch": 0.7008712487899322, + "grad_norm": 4.268318176269531, + "learning_rate": 2.1923497807261477e-05, + "loss": 1.9142, + "step": 4344 + }, + { + "epoch": 0.7010325911584382, + "grad_norm": 4.606492519378662, + "learning_rate": 2.190188212403262e-05, + "loss": 2.0696, + "step": 4345 + }, + { + "epoch": 0.7011939335269441, + "grad_norm": 4.331446647644043, + "learning_rate": 2.188027411334131e-05, + "loss": 1.8616, + "step": 4346 + }, + { + "epoch": 0.7013552758954501, + "grad_norm": 5.121551990509033, + "learning_rate": 2.1858673781087852e-05, + "loss": 1.8384, + "step": 4347 + }, + { + "epoch": 0.7015166182639561, + "grad_norm": 3.758315324783325, + "learning_rate": 2.1837081133170523e-05, + "loss": 1.7561, + "step": 4348 + }, + { + "epoch": 0.7016779606324621, + "grad_norm": 3.490201473236084, + "learning_rate": 2.1815496175485434e-05, + "loss": 1.7623, + "step": 4349 + }, + { + "epoch": 0.7018393030009681, + "grad_norm": 5.102977275848389, + "learning_rate": 2.1793918913926636e-05, + "loss": 1.9809, + "step": 4350 + }, + { + "epoch": 0.702000645369474, + "grad_norm": 3.9732844829559326, + "learning_rate": 2.1772349354386034e-05, + "loss": 1.6756, + "step": 4351 + }, + { + "epoch": 0.70216198773798, + "grad_norm": 6.685610771179199, + "learning_rate": 2.1750787502753512e-05, + "loss": 2.042, + "step": 4352 + }, + { + "epoch": 0.702323330106486, + "grad_norm": 4.101576328277588, + "learning_rate": 2.1729233364916775e-05, + "loss": 1.7987, + "step": 4353 + }, + { + "epoch": 0.7024846724749919, + "grad_norm": 4.826432228088379, + "learning_rate": 2.1707686946761418e-05, + "loss": 1.9823, + "step": 4354 + }, + { + "epoch": 0.7026460148434979, + "grad_norm": 4.119592666625977, + "learning_rate": 2.1686148254171013e-05, + "loss": 2.2493, + "step": 4355 + }, + { + "epoch": 0.7028073572120038, + "grad_norm": 4.75625467300415, + "learning_rate": 2.1664617293026917e-05, + "loss": 2.1533, + "step": 4356 + }, + { + "epoch": 0.7029686995805099, + "grad_norm": 5.245871543884277, + "learning_rate": 2.164309406920846e-05, + "loss": 1.8152, + "step": 4357 + }, + { + "epoch": 0.7031300419490158, + "grad_norm": 4.917736053466797, + "learning_rate": 2.1621578588592793e-05, + "loss": 1.7487, + "step": 4358 + }, + { + "epoch": 0.7032913843175218, + "grad_norm": 6.010506629943848, + "learning_rate": 2.1600070857055015e-05, + "loss": 1.8144, + "step": 4359 + }, + { + "epoch": 0.7034527266860278, + "grad_norm": 4.053597450256348, + "learning_rate": 2.157857088046804e-05, + "loss": 1.5957, + "step": 4360 + }, + { + "epoch": 0.7036140690545337, + "grad_norm": 5.857894420623779, + "learning_rate": 2.1557078664702746e-05, + "loss": 1.9174, + "step": 4361 + }, + { + "epoch": 0.7037754114230397, + "grad_norm": 5.7891035079956055, + "learning_rate": 2.1535594215627803e-05, + "loss": 1.8816, + "step": 4362 + }, + { + "epoch": 0.7039367537915456, + "grad_norm": 4.0410614013671875, + "learning_rate": 2.151411753910984e-05, + "loss": 1.7359, + "step": 4363 + }, + { + "epoch": 0.7040980961600516, + "grad_norm": 4.876594066619873, + "learning_rate": 2.14926486410133e-05, + "loss": 1.8103, + "step": 4364 + }, + { + "epoch": 0.7042594385285575, + "grad_norm": 4.136269569396973, + "learning_rate": 2.147118752720056e-05, + "loss": 1.7087, + "step": 4365 + }, + { + "epoch": 0.7044207808970636, + "grad_norm": 4.762213230133057, + "learning_rate": 2.1449734203531828e-05, + "loss": 1.9995, + "step": 4366 + }, + { + "epoch": 0.7045821232655696, + "grad_norm": 3.8271641731262207, + "learning_rate": 2.1428288675865176e-05, + "loss": 1.5648, + "step": 4367 + }, + { + "epoch": 0.7047434656340755, + "grad_norm": 4.143524169921875, + "learning_rate": 2.1406850950056612e-05, + "loss": 1.822, + "step": 4368 + }, + { + "epoch": 0.7049048080025815, + "grad_norm": 3.7212743759155273, + "learning_rate": 2.1385421031959947e-05, + "loss": 1.918, + "step": 4369 + }, + { + "epoch": 0.7050661503710874, + "grad_norm": 3.9864838123321533, + "learning_rate": 2.136399892742687e-05, + "loss": 1.7706, + "step": 4370 + }, + { + "epoch": 0.7052274927395934, + "grad_norm": 5.322200775146484, + "learning_rate": 2.1342584642306985e-05, + "loss": 1.8825, + "step": 4371 + }, + { + "epoch": 0.7053888351080994, + "grad_norm": 4.411141395568848, + "learning_rate": 2.132117818244771e-05, + "loss": 1.6506, + "step": 4372 + }, + { + "epoch": 0.7055501774766053, + "grad_norm": 3.4371304512023926, + "learning_rate": 2.1299779553694323e-05, + "loss": 1.9689, + "step": 4373 + }, + { + "epoch": 0.7057115198451114, + "grad_norm": 5.998043060302734, + "learning_rate": 2.1278388761890022e-05, + "loss": 1.9898, + "step": 4374 + }, + { + "epoch": 0.7058728622136173, + "grad_norm": 3.8267035484313965, + "learning_rate": 2.125700581287579e-05, + "loss": 2.1537, + "step": 4375 + }, + { + "epoch": 0.7060342045821233, + "grad_norm": 4.702099800109863, + "learning_rate": 2.1235630712490538e-05, + "loss": 1.7038, + "step": 4376 + }, + { + "epoch": 0.7061955469506292, + "grad_norm": 5.004813194274902, + "learning_rate": 2.1214263466570965e-05, + "loss": 1.8292, + "step": 4377 + }, + { + "epoch": 0.7063568893191352, + "grad_norm": 5.187638282775879, + "learning_rate": 2.1192904080951704e-05, + "loss": 1.9856, + "step": 4378 + }, + { + "epoch": 0.7065182316876412, + "grad_norm": 4.296599388122559, + "learning_rate": 2.117155256146517e-05, + "loss": 1.8487, + "step": 4379 + }, + { + "epoch": 0.7066795740561471, + "grad_norm": 3.953773260116577, + "learning_rate": 2.115020891394165e-05, + "loss": 1.8964, + "step": 4380 + }, + { + "epoch": 0.7068409164246531, + "grad_norm": 4.438882827758789, + "learning_rate": 2.1128873144209317e-05, + "loss": 1.8821, + "step": 4381 + }, + { + "epoch": 0.707002258793159, + "grad_norm": 5.1794023513793945, + "learning_rate": 2.1107545258094135e-05, + "loss": 2.0808, + "step": 4382 + }, + { + "epoch": 0.7071636011616651, + "grad_norm": 3.880484104156494, + "learning_rate": 2.108622526141999e-05, + "loss": 1.8642, + "step": 4383 + }, + { + "epoch": 0.7073249435301711, + "grad_norm": 4.025290489196777, + "learning_rate": 2.106491316000852e-05, + "loss": 2.0761, + "step": 4384 + }, + { + "epoch": 0.707486285898677, + "grad_norm": 3.9385993480682373, + "learning_rate": 2.10436089596793e-05, + "loss": 1.8177, + "step": 4385 + }, + { + "epoch": 0.707647628267183, + "grad_norm": 3.9777097702026367, + "learning_rate": 2.1022312666249665e-05, + "loss": 1.8149, + "step": 4386 + }, + { + "epoch": 0.7078089706356889, + "grad_norm": 3.2477285861968994, + "learning_rate": 2.1001024285534878e-05, + "loss": 1.9699, + "step": 4387 + }, + { + "epoch": 0.7079703130041949, + "grad_norm": 4.853583335876465, + "learning_rate": 2.0979743823347957e-05, + "loss": 1.7118, + "step": 4388 + }, + { + "epoch": 0.7081316553727008, + "grad_norm": 4.169306755065918, + "learning_rate": 2.095847128549981e-05, + "loss": 1.7005, + "step": 4389 + }, + { + "epoch": 0.7082929977412068, + "grad_norm": 4.811184883117676, + "learning_rate": 2.0937206677799142e-05, + "loss": 1.9682, + "step": 4390 + }, + { + "epoch": 0.7084543401097129, + "grad_norm": 4.938937664031982, + "learning_rate": 2.0915950006052553e-05, + "loss": 1.6893, + "step": 4391 + }, + { + "epoch": 0.7086156824782188, + "grad_norm": 5.426696300506592, + "learning_rate": 2.089470127606442e-05, + "loss": 2.0481, + "step": 4392 + }, + { + "epoch": 0.7087770248467248, + "grad_norm": 4.347787857055664, + "learning_rate": 2.087346049363696e-05, + "loss": 1.8751, + "step": 4393 + }, + { + "epoch": 0.7089383672152307, + "grad_norm": 5.035215377807617, + "learning_rate": 2.085222766457025e-05, + "loss": 1.7715, + "step": 4394 + }, + { + "epoch": 0.7090997095837367, + "grad_norm": 4.536472320556641, + "learning_rate": 2.0831002794662157e-05, + "loss": 2.015, + "step": 4395 + }, + { + "epoch": 0.7092610519522426, + "grad_norm": 5.405670642852783, + "learning_rate": 2.0809785889708423e-05, + "loss": 1.9514, + "step": 4396 + }, + { + "epoch": 0.7094223943207486, + "grad_norm": 4.638657569885254, + "learning_rate": 2.0788576955502547e-05, + "loss": 1.8223, + "step": 4397 + }, + { + "epoch": 0.7095837366892546, + "grad_norm": 5.7009453773498535, + "learning_rate": 2.076737599783593e-05, + "loss": 1.8343, + "step": 4398 + }, + { + "epoch": 0.7097450790577605, + "grad_norm": 4.524998664855957, + "learning_rate": 2.074618302249772e-05, + "loss": 2.0693, + "step": 4399 + }, + { + "epoch": 0.7099064214262666, + "grad_norm": 5.167919158935547, + "learning_rate": 2.0724998035274945e-05, + "loss": 1.9377, + "step": 4400 + }, + { + "epoch": 0.7100677637947725, + "grad_norm": 3.5350277423858643, + "learning_rate": 2.0703821041952404e-05, + "loss": 1.8358, + "step": 4401 + }, + { + "epoch": 0.7102291061632785, + "grad_norm": 4.2776055335998535, + "learning_rate": 2.0682652048312767e-05, + "loss": 2.0817, + "step": 4402 + }, + { + "epoch": 0.7103904485317845, + "grad_norm": 4.363068103790283, + "learning_rate": 2.0661491060136467e-05, + "loss": 1.6639, + "step": 4403 + }, + { + "epoch": 0.7105517909002904, + "grad_norm": 3.997136116027832, + "learning_rate": 2.0640338083201766e-05, + "loss": 1.7986, + "step": 4404 + }, + { + "epoch": 0.7107131332687964, + "grad_norm": 4.03304386138916, + "learning_rate": 2.061919312328477e-05, + "loss": 1.8539, + "step": 4405 + }, + { + "epoch": 0.7108744756373023, + "grad_norm": 4.127360820770264, + "learning_rate": 2.059805618615934e-05, + "loss": 1.8774, + "step": 4406 + }, + { + "epoch": 0.7110358180058083, + "grad_norm": 3.846283197402954, + "learning_rate": 2.0576927277597213e-05, + "loss": 1.8691, + "step": 4407 + }, + { + "epoch": 0.7111971603743142, + "grad_norm": 4.539346694946289, + "learning_rate": 2.0555806403367878e-05, + "loss": 2.0739, + "step": 4408 + }, + { + "epoch": 0.7113585027428203, + "grad_norm": 5.109747409820557, + "learning_rate": 2.053469356923865e-05, + "loss": 1.6983, + "step": 4409 + }, + { + "epoch": 0.7115198451113263, + "grad_norm": 4.709097385406494, + "learning_rate": 2.0513588780974637e-05, + "loss": 1.749, + "step": 4410 + }, + { + "epoch": 0.7116811874798322, + "grad_norm": 4.342752456665039, + "learning_rate": 2.049249204433879e-05, + "loss": 1.9668, + "step": 4411 + }, + { + "epoch": 0.7118425298483382, + "grad_norm": 4.634512424468994, + "learning_rate": 2.04714033650918e-05, + "loss": 1.6424, + "step": 4412 + }, + { + "epoch": 0.7120038722168441, + "grad_norm": 4.253017425537109, + "learning_rate": 2.0450322748992224e-05, + "loss": 1.7382, + "step": 4413 + }, + { + "epoch": 0.7121652145853501, + "grad_norm": 3.9589767456054688, + "learning_rate": 2.0429250201796358e-05, + "loss": 1.7899, + "step": 4414 + }, + { + "epoch": 0.712326556953856, + "grad_norm": 4.6450629234313965, + "learning_rate": 2.0408185729258343e-05, + "loss": 1.6952, + "step": 4415 + }, + { + "epoch": 0.712487899322362, + "grad_norm": 3.912813901901245, + "learning_rate": 2.0387129337130083e-05, + "loss": 1.7723, + "step": 4416 + }, + { + "epoch": 0.7126492416908681, + "grad_norm": 3.898334264755249, + "learning_rate": 2.0366081031161267e-05, + "loss": 1.674, + "step": 4417 + }, + { + "epoch": 0.712810584059374, + "grad_norm": 5.272930145263672, + "learning_rate": 2.034504081709943e-05, + "loss": 2.0234, + "step": 4418 + }, + { + "epoch": 0.71297192642788, + "grad_norm": 4.26560640335083, + "learning_rate": 2.0324008700689827e-05, + "loss": 1.8778, + "step": 4419 + }, + { + "epoch": 0.7131332687963859, + "grad_norm": 4.033371925354004, + "learning_rate": 2.030298468767557e-05, + "loss": 1.8857, + "step": 4420 + }, + { + "epoch": 0.7132946111648919, + "grad_norm": 5.359758377075195, + "learning_rate": 2.0281968783797488e-05, + "loss": 1.8295, + "step": 4421 + }, + { + "epoch": 0.7134559535333979, + "grad_norm": 4.577740669250488, + "learning_rate": 2.0260960994794276e-05, + "loss": 1.9281, + "step": 4422 + }, + { + "epoch": 0.7136172959019038, + "grad_norm": 4.2320685386657715, + "learning_rate": 2.0239961326402323e-05, + "loss": 1.9201, + "step": 4423 + }, + { + "epoch": 0.7137786382704098, + "grad_norm": 4.954527854919434, + "learning_rate": 2.021896978435589e-05, + "loss": 2.0447, + "step": 4424 + }, + { + "epoch": 0.7139399806389157, + "grad_norm": 4.132653713226318, + "learning_rate": 2.019798637438694e-05, + "loss": 1.8366, + "step": 4425 + }, + { + "epoch": 0.7141013230074218, + "grad_norm": 3.632702589035034, + "learning_rate": 2.017701110222529e-05, + "loss": 1.576, + "step": 4426 + }, + { + "epoch": 0.7142626653759278, + "grad_norm": 4.073683261871338, + "learning_rate": 2.0156043973598476e-05, + "loss": 1.7663, + "step": 4427 + }, + { + "epoch": 0.7144240077444337, + "grad_norm": 3.756171226501465, + "learning_rate": 2.013508499423183e-05, + "loss": 1.9047, + "step": 4428 + }, + { + "epoch": 0.7145853501129397, + "grad_norm": 5.1460442543029785, + "learning_rate": 2.011413416984846e-05, + "loss": 2.045, + "step": 4429 + }, + { + "epoch": 0.7147466924814456, + "grad_norm": 4.379173755645752, + "learning_rate": 2.009319150616923e-05, + "loss": 1.5792, + "step": 4430 + }, + { + "epoch": 0.7149080348499516, + "grad_norm": 3.989184856414795, + "learning_rate": 2.0072257008912826e-05, + "loss": 1.9315, + "step": 4431 + }, + { + "epoch": 0.7150693772184575, + "grad_norm": 4.4064507484436035, + "learning_rate": 2.005133068379564e-05, + "loss": 1.7277, + "step": 4432 + }, + { + "epoch": 0.7152307195869635, + "grad_norm": 3.6643855571746826, + "learning_rate": 2.0030412536531895e-05, + "loss": 1.8192, + "step": 4433 + }, + { + "epoch": 0.7153920619554696, + "grad_norm": 5.062724590301514, + "learning_rate": 2.000950257283351e-05, + "loss": 1.8298, + "step": 4434 + }, + { + "epoch": 0.7155534043239755, + "grad_norm": 3.734053134918213, + "learning_rate": 1.9988600798410258e-05, + "loss": 1.949, + "step": 4435 + }, + { + "epoch": 0.7157147466924815, + "grad_norm": 4.470686435699463, + "learning_rate": 1.996770721896957e-05, + "loss": 1.6318, + "step": 4436 + }, + { + "epoch": 0.7158760890609874, + "grad_norm": 4.909954071044922, + "learning_rate": 1.9946821840216752e-05, + "loss": 1.5582, + "step": 4437 + }, + { + "epoch": 0.7160374314294934, + "grad_norm": 5.325438499450684, + "learning_rate": 1.9925944667854757e-05, + "loss": 1.7757, + "step": 4438 + }, + { + "epoch": 0.7161987737979993, + "grad_norm": 4.973210334777832, + "learning_rate": 1.9905075707584407e-05, + "loss": 2.0065, + "step": 4439 + }, + { + "epoch": 0.7163601161665053, + "grad_norm": 4.6979594230651855, + "learning_rate": 1.9884214965104194e-05, + "loss": 1.7517, + "step": 4440 + }, + { + "epoch": 0.7165214585350113, + "grad_norm": 4.94175386428833, + "learning_rate": 1.9863362446110416e-05, + "loss": 1.7648, + "step": 4441 + }, + { + "epoch": 0.7166828009035172, + "grad_norm": 4.009958267211914, + "learning_rate": 1.984251815629712e-05, + "loss": 2.2666, + "step": 4442 + }, + { + "epoch": 0.7168441432720233, + "grad_norm": 3.5523719787597656, + "learning_rate": 1.982168210135606e-05, + "loss": 1.8394, + "step": 4443 + }, + { + "epoch": 0.7170054856405292, + "grad_norm": 3.77880859375, + "learning_rate": 1.9800854286976815e-05, + "loss": 2.2207, + "step": 4444 + }, + { + "epoch": 0.7171668280090352, + "grad_norm": 5.544381618499756, + "learning_rate": 1.978003471884665e-05, + "loss": 1.87, + "step": 4445 + }, + { + "epoch": 0.7173281703775412, + "grad_norm": 4.609133243560791, + "learning_rate": 1.9759223402650635e-05, + "loss": 1.7028, + "step": 4446 + }, + { + "epoch": 0.7174895127460471, + "grad_norm": 4.625114440917969, + "learning_rate": 1.973842034407154e-05, + "loss": 1.9978, + "step": 4447 + }, + { + "epoch": 0.7176508551145531, + "grad_norm": 5.1108269691467285, + "learning_rate": 1.9717625548789893e-05, + "loss": 2.0785, + "step": 4448 + }, + { + "epoch": 0.717812197483059, + "grad_norm": 4.0849480628967285, + "learning_rate": 1.969683902248395e-05, + "loss": 1.8791, + "step": 4449 + }, + { + "epoch": 0.717973539851565, + "grad_norm": 3.570565938949585, + "learning_rate": 1.9676060770829774e-05, + "loss": 1.7057, + "step": 4450 + }, + { + "epoch": 0.7181348822200709, + "grad_norm": 4.450013160705566, + "learning_rate": 1.9655290799501074e-05, + "loss": 2.0126, + "step": 4451 + }, + { + "epoch": 0.718296224588577, + "grad_norm": 5.133970260620117, + "learning_rate": 1.9634529114169398e-05, + "loss": 1.7061, + "step": 4452 + }, + { + "epoch": 0.718457566957083, + "grad_norm": 5.0176167488098145, + "learning_rate": 1.9613775720503928e-05, + "loss": 1.8928, + "step": 4453 + }, + { + "epoch": 0.7186189093255889, + "grad_norm": 3.944841146469116, + "learning_rate": 1.9593030624171683e-05, + "loss": 1.9382, + "step": 4454 + }, + { + "epoch": 0.7187802516940949, + "grad_norm": 5.478379249572754, + "learning_rate": 1.957229383083734e-05, + "loss": 1.979, + "step": 4455 + }, + { + "epoch": 0.7189415940626008, + "grad_norm": 4.344667434692383, + "learning_rate": 1.9551565346163326e-05, + "loss": 1.835, + "step": 4456 + }, + { + "epoch": 0.7191029364311068, + "grad_norm": 5.0222697257995605, + "learning_rate": 1.9530845175809836e-05, + "loss": 2.0196, + "step": 4457 + }, + { + "epoch": 0.7192642787996127, + "grad_norm": 3.5757687091827393, + "learning_rate": 1.9510133325434742e-05, + "loss": 1.7501, + "step": 4458 + }, + { + "epoch": 0.7194256211681187, + "grad_norm": 3.818265438079834, + "learning_rate": 1.94894298006937e-05, + "loss": 1.9179, + "step": 4459 + }, + { + "epoch": 0.7195869635366248, + "grad_norm": 4.43404483795166, + "learning_rate": 1.946873460724003e-05, + "loss": 1.8416, + "step": 4460 + }, + { + "epoch": 0.7197483059051307, + "grad_norm": 4.358422756195068, + "learning_rate": 1.944804775072484e-05, + "loss": 2.1449, + "step": 4461 + }, + { + "epoch": 0.7199096482736367, + "grad_norm": 4.856911659240723, + "learning_rate": 1.9427369236796905e-05, + "loss": 1.7152, + "step": 4462 + }, + { + "epoch": 0.7200709906421426, + "grad_norm": 4.715914249420166, + "learning_rate": 1.9406699071102774e-05, + "loss": 1.8595, + "step": 4463 + }, + { + "epoch": 0.7202323330106486, + "grad_norm": 4.349678993225098, + "learning_rate": 1.9386037259286677e-05, + "loss": 1.8718, + "step": 4464 + }, + { + "epoch": 0.7203936753791546, + "grad_norm": 4.57448148727417, + "learning_rate": 1.9365383806990562e-05, + "loss": 1.9621, + "step": 4465 + }, + { + "epoch": 0.7205550177476605, + "grad_norm": 4.83892297744751, + "learning_rate": 1.9344738719854137e-05, + "loss": 1.8222, + "step": 4466 + }, + { + "epoch": 0.7207163601161665, + "grad_norm": 4.104072570800781, + "learning_rate": 1.932410200351479e-05, + "loss": 1.8106, + "step": 4467 + }, + { + "epoch": 0.7208777024846724, + "grad_norm": 4.178228378295898, + "learning_rate": 1.930347366360762e-05, + "loss": 1.9169, + "step": 4468 + }, + { + "epoch": 0.7210390448531785, + "grad_norm": 3.7050023078918457, + "learning_rate": 1.9282853705765435e-05, + "loss": 1.7201, + "step": 4469 + }, + { + "epoch": 0.7212003872216844, + "grad_norm": 5.124715805053711, + "learning_rate": 1.926224213561881e-05, + "loss": 2.0132, + "step": 4470 + }, + { + "epoch": 0.7213617295901904, + "grad_norm": 3.972278594970703, + "learning_rate": 1.9241638958795942e-05, + "loss": 2.0047, + "step": 4471 + }, + { + "epoch": 0.7215230719586964, + "grad_norm": 5.051692485809326, + "learning_rate": 1.922104418092283e-05, + "loss": 1.9983, + "step": 4472 + }, + { + "epoch": 0.7216844143272023, + "grad_norm": 4.551050186157227, + "learning_rate": 1.920045780762309e-05, + "loss": 1.8696, + "step": 4473 + }, + { + "epoch": 0.7218457566957083, + "grad_norm": 4.42860746383667, + "learning_rate": 1.917987984451812e-05, + "loss": 1.8013, + "step": 4474 + }, + { + "epoch": 0.7220070990642142, + "grad_norm": 4.822419166564941, + "learning_rate": 1.9159310297226957e-05, + "loss": 1.8173, + "step": 4475 + }, + { + "epoch": 0.7221684414327202, + "grad_norm": 5.97450590133667, + "learning_rate": 1.9138749171366398e-05, + "loss": 1.7555, + "step": 4476 + }, + { + "epoch": 0.7223297838012263, + "grad_norm": 3.7013676166534424, + "learning_rate": 1.911819647255088e-05, + "loss": 1.5018, + "step": 4477 + }, + { + "epoch": 0.7224911261697322, + "grad_norm": 3.790663242340088, + "learning_rate": 1.909765220639261e-05, + "loss": 1.8122, + "step": 4478 + }, + { + "epoch": 0.7226524685382382, + "grad_norm": 4.289879322052002, + "learning_rate": 1.9077116378501424e-05, + "loss": 1.7903, + "step": 4479 + }, + { + "epoch": 0.7228138109067441, + "grad_norm": 6.576168537139893, + "learning_rate": 1.9056588994484877e-05, + "loss": 1.8691, + "step": 4480 + }, + { + "epoch": 0.7229751532752501, + "grad_norm": 3.9365086555480957, + "learning_rate": 1.9036070059948252e-05, + "loss": 2.0826, + "step": 4481 + }, + { + "epoch": 0.723136495643756, + "grad_norm": 4.5562310218811035, + "learning_rate": 1.901555958049447e-05, + "loss": 2.068, + "step": 4482 + }, + { + "epoch": 0.723297838012262, + "grad_norm": 4.068548202514648, + "learning_rate": 1.8995057561724193e-05, + "loss": 1.9986, + "step": 4483 + }, + { + "epoch": 0.723459180380768, + "grad_norm": 5.269343376159668, + "learning_rate": 1.897456400923574e-05, + "loss": 2.0491, + "step": 4484 + }, + { + "epoch": 0.7236205227492739, + "grad_norm": 4.068410396575928, + "learning_rate": 1.895407892862512e-05, + "loss": 1.762, + "step": 4485 + }, + { + "epoch": 0.72378186511778, + "grad_norm": 3.507288694381714, + "learning_rate": 1.893360232548605e-05, + "loss": 1.6844, + "step": 4486 + }, + { + "epoch": 0.7239432074862859, + "grad_norm": 4.270358562469482, + "learning_rate": 1.891313420540992e-05, + "loss": 1.6996, + "step": 4487 + }, + { + "epoch": 0.7241045498547919, + "grad_norm": 6.514777660369873, + "learning_rate": 1.889267457398578e-05, + "loss": 1.744, + "step": 4488 + }, + { + "epoch": 0.7242658922232978, + "grad_norm": 3.457181692123413, + "learning_rate": 1.887222343680041e-05, + "loss": 1.6918, + "step": 4489 + }, + { + "epoch": 0.7244272345918038, + "grad_norm": 3.9298248291015625, + "learning_rate": 1.885178079943823e-05, + "loss": 1.8281, + "step": 4490 + }, + { + "epoch": 0.7245885769603098, + "grad_norm": 3.867527723312378, + "learning_rate": 1.883134666748137e-05, + "loss": 1.8823, + "step": 4491 + }, + { + "epoch": 0.7247499193288157, + "grad_norm": 4.336234092712402, + "learning_rate": 1.8810921046509617e-05, + "loss": 1.8338, + "step": 4492 + }, + { + "epoch": 0.7249112616973217, + "grad_norm": 4.065129280090332, + "learning_rate": 1.8790503942100412e-05, + "loss": 1.9443, + "step": 4493 + }, + { + "epoch": 0.7250726040658277, + "grad_norm": 3.868798017501831, + "learning_rate": 1.877009535982894e-05, + "loss": 1.7561, + "step": 4494 + }, + { + "epoch": 0.7252339464343337, + "grad_norm": 4.448511123657227, + "learning_rate": 1.874969530526797e-05, + "loss": 1.7878, + "step": 4495 + }, + { + "epoch": 0.7253952888028397, + "grad_norm": 3.7267308235168457, + "learning_rate": 1.872930378398804e-05, + "loss": 1.8618, + "step": 4496 + }, + { + "epoch": 0.7255566311713456, + "grad_norm": 3.7738239765167236, + "learning_rate": 1.8708920801557257e-05, + "loss": 1.7849, + "step": 4497 + }, + { + "epoch": 0.7257179735398516, + "grad_norm": 5.435400485992432, + "learning_rate": 1.8688546363541487e-05, + "loss": 1.9727, + "step": 4498 + }, + { + "epoch": 0.7258793159083575, + "grad_norm": 5.157015323638916, + "learning_rate": 1.866818047550419e-05, + "loss": 1.8461, + "step": 4499 + }, + { + "epoch": 0.7260406582768635, + "grad_norm": 5.213715553283691, + "learning_rate": 1.8647823143006542e-05, + "loss": 2.0061, + "step": 4500 + }, + { + "epoch": 0.7262020006453694, + "grad_norm": 4.5512518882751465, + "learning_rate": 1.8627474371607347e-05, + "loss": 2.0263, + "step": 4501 + }, + { + "epoch": 0.7263633430138754, + "grad_norm": 5.680060386657715, + "learning_rate": 1.8607134166863112e-05, + "loss": 2.3281, + "step": 4502 + }, + { + "epoch": 0.7265246853823815, + "grad_norm": 4.809805870056152, + "learning_rate": 1.858680253432797e-05, + "loss": 1.9999, + "step": 4503 + }, + { + "epoch": 0.7266860277508874, + "grad_norm": 4.11499547958374, + "learning_rate": 1.8566479479553715e-05, + "loss": 1.7536, + "step": 4504 + }, + { + "epoch": 0.7268473701193934, + "grad_norm": 4.847894668579102, + "learning_rate": 1.8546165008089805e-05, + "loss": 2.0412, + "step": 4505 + }, + { + "epoch": 0.7270087124878993, + "grad_norm": 4.410421848297119, + "learning_rate": 1.852585912548338e-05, + "loss": 1.6655, + "step": 4506 + }, + { + "epoch": 0.7271700548564053, + "grad_norm": 4.558352470397949, + "learning_rate": 1.8505561837279195e-05, + "loss": 1.7886, + "step": 4507 + }, + { + "epoch": 0.7273313972249112, + "grad_norm": 3.6568939685821533, + "learning_rate": 1.8485273149019655e-05, + "loss": 1.6887, + "step": 4508 + }, + { + "epoch": 0.7274927395934172, + "grad_norm": 4.80131196975708, + "learning_rate": 1.8464993066244886e-05, + "loss": 1.7628, + "step": 4509 + }, + { + "epoch": 0.7276540819619232, + "grad_norm": 4.128368377685547, + "learning_rate": 1.8444721594492558e-05, + "loss": 1.886, + "step": 4510 + }, + { + "epoch": 0.7278154243304291, + "grad_norm": 5.377955436706543, + "learning_rate": 1.84244587392981e-05, + "loss": 1.9814, + "step": 4511 + }, + { + "epoch": 0.7279767666989352, + "grad_norm": 4.4759697914123535, + "learning_rate": 1.840420450619449e-05, + "loss": 1.7729, + "step": 4512 + }, + { + "epoch": 0.7281381090674411, + "grad_norm": 4.838116645812988, + "learning_rate": 1.8383958900712434e-05, + "loss": 1.9534, + "step": 4513 + }, + { + "epoch": 0.7282994514359471, + "grad_norm": 3.3044047355651855, + "learning_rate": 1.8363721928380205e-05, + "loss": 2.0678, + "step": 4514 + }, + { + "epoch": 0.728460793804453, + "grad_norm": 4.035040378570557, + "learning_rate": 1.8343493594723803e-05, + "loss": 1.8531, + "step": 4515 + }, + { + "epoch": 0.728622136172959, + "grad_norm": 4.368617534637451, + "learning_rate": 1.832327390526678e-05, + "loss": 1.5639, + "step": 4516 + }, + { + "epoch": 0.728783478541465, + "grad_norm": 3.922774076461792, + "learning_rate": 1.8303062865530406e-05, + "loss": 1.9464, + "step": 4517 + }, + { + "epoch": 0.7289448209099709, + "grad_norm": 3.48451828956604, + "learning_rate": 1.8282860481033543e-05, + "loss": 1.7091, + "step": 4518 + }, + { + "epoch": 0.7291061632784769, + "grad_norm": 3.918025016784668, + "learning_rate": 1.8262666757292674e-05, + "loss": 1.9751, + "step": 4519 + }, + { + "epoch": 0.729267505646983, + "grad_norm": 4.220199108123779, + "learning_rate": 1.824248169982199e-05, + "loss": 1.8982, + "step": 4520 + }, + { + "epoch": 0.7294288480154889, + "grad_norm": 5.191842555999756, + "learning_rate": 1.822230531413323e-05, + "loss": 2.1131, + "step": 4521 + }, + { + "epoch": 0.7295901903839949, + "grad_norm": 4.386340618133545, + "learning_rate": 1.820213760573584e-05, + "loss": 2.1085, + "step": 4522 + }, + { + "epoch": 0.7297515327525008, + "grad_norm": 4.766170501708984, + "learning_rate": 1.818197858013685e-05, + "loss": 1.882, + "step": 4523 + }, + { + "epoch": 0.7299128751210068, + "grad_norm": 4.039060115814209, + "learning_rate": 1.8161828242840923e-05, + "loss": 1.8015, + "step": 4524 + }, + { + "epoch": 0.7300742174895127, + "grad_norm": 4.161253929138184, + "learning_rate": 1.8141686599350337e-05, + "loss": 1.6455, + "step": 4525 + }, + { + "epoch": 0.7302355598580187, + "grad_norm": 5.185811996459961, + "learning_rate": 1.8121553655165057e-05, + "loss": 1.6829, + "step": 4526 + }, + { + "epoch": 0.7303969022265246, + "grad_norm": 4.229522228240967, + "learning_rate": 1.8101429415782594e-05, + "loss": 2.0075, + "step": 4527 + }, + { + "epoch": 0.7305582445950306, + "grad_norm": 3.822842597961426, + "learning_rate": 1.808131388669816e-05, + "loss": 1.7609, + "step": 4528 + }, + { + "epoch": 0.7307195869635367, + "grad_norm": 4.920767784118652, + "learning_rate": 1.8061207073404507e-05, + "loss": 1.9016, + "step": 4529 + }, + { + "epoch": 0.7308809293320426, + "grad_norm": 4.125351428985596, + "learning_rate": 1.8041108981392086e-05, + "loss": 1.9906, + "step": 4530 + }, + { + "epoch": 0.7310422717005486, + "grad_norm": 4.810054302215576, + "learning_rate": 1.802101961614891e-05, + "loss": 1.627, + "step": 4531 + }, + { + "epoch": 0.7312036140690545, + "grad_norm": 5.05076789855957, + "learning_rate": 1.8000938983160608e-05, + "loss": 2.0142, + "step": 4532 + }, + { + "epoch": 0.7313649564375605, + "grad_norm": 4.283308029174805, + "learning_rate": 1.7980867087910486e-05, + "loss": 1.7554, + "step": 4533 + }, + { + "epoch": 0.7315262988060665, + "grad_norm": 5.207695484161377, + "learning_rate": 1.796080393587939e-05, + "loss": 1.584, + "step": 4534 + }, + { + "epoch": 0.7316876411745724, + "grad_norm": 5.01399564743042, + "learning_rate": 1.794074953254583e-05, + "loss": 2.0779, + "step": 4535 + }, + { + "epoch": 0.7318489835430784, + "grad_norm": 3.69757080078125, + "learning_rate": 1.7920703883385888e-05, + "loss": 1.7488, + "step": 4536 + }, + { + "epoch": 0.7320103259115844, + "grad_norm": 6.600658416748047, + "learning_rate": 1.7900666993873305e-05, + "loss": 1.8841, + "step": 4537 + }, + { + "epoch": 0.7321716682800904, + "grad_norm": 4.896603107452393, + "learning_rate": 1.7880638869479365e-05, + "loss": 1.8702, + "step": 4538 + }, + { + "epoch": 0.7323330106485963, + "grad_norm": 3.632988214492798, + "learning_rate": 1.7860619515673033e-05, + "loss": 1.617, + "step": 4539 + }, + { + "epoch": 0.7324943530171023, + "grad_norm": 4.568796157836914, + "learning_rate": 1.7840608937920804e-05, + "loss": 1.8584, + "step": 4540 + }, + { + "epoch": 0.7326556953856083, + "grad_norm": 3.9885737895965576, + "learning_rate": 1.7820607141686846e-05, + "loss": 1.8715, + "step": 4541 + }, + { + "epoch": 0.7328170377541142, + "grad_norm": 5.049192428588867, + "learning_rate": 1.780061413243288e-05, + "loss": 2.1547, + "step": 4542 + }, + { + "epoch": 0.7329783801226202, + "grad_norm": 6.493274211883545, + "learning_rate": 1.778062991561824e-05, + "loss": 2.0009, + "step": 4543 + }, + { + "epoch": 0.7331397224911261, + "grad_norm": 5.213850975036621, + "learning_rate": 1.7760654496699875e-05, + "loss": 1.9638, + "step": 4544 + }, + { + "epoch": 0.7333010648596321, + "grad_norm": 4.086864948272705, + "learning_rate": 1.774068788113229e-05, + "loss": 1.8733, + "step": 4545 + }, + { + "epoch": 0.7334624072281382, + "grad_norm": 4.98647928237915, + "learning_rate": 1.7720730074367646e-05, + "loss": 1.7348, + "step": 4546 + }, + { + "epoch": 0.7336237495966441, + "grad_norm": 4.155590057373047, + "learning_rate": 1.770078108185565e-05, + "loss": 1.7009, + "step": 4547 + }, + { + "epoch": 0.7337850919651501, + "grad_norm": 5.692659378051758, + "learning_rate": 1.7680840909043644e-05, + "loss": 2.0436, + "step": 4548 + }, + { + "epoch": 0.733946434333656, + "grad_norm": 3.6536619663238525, + "learning_rate": 1.7660909561376504e-05, + "loss": 1.9367, + "step": 4549 + }, + { + "epoch": 0.734107776702162, + "grad_norm": 5.5632171630859375, + "learning_rate": 1.764098704429677e-05, + "loss": 1.6076, + "step": 4550 + }, + { + "epoch": 0.7342691190706679, + "grad_norm": 5.216808795928955, + "learning_rate": 1.7621073363244488e-05, + "loss": 1.8999, + "step": 4551 + }, + { + "epoch": 0.7344304614391739, + "grad_norm": 4.736422061920166, + "learning_rate": 1.760116852365738e-05, + "loss": 1.8347, + "step": 4552 + }, + { + "epoch": 0.7345918038076799, + "grad_norm": 4.6815409660339355, + "learning_rate": 1.7581272530970667e-05, + "loss": 1.7078, + "step": 4553 + }, + { + "epoch": 0.7347531461761858, + "grad_norm": 3.6447505950927734, + "learning_rate": 1.7561385390617226e-05, + "loss": 1.976, + "step": 4554 + }, + { + "epoch": 0.7349144885446919, + "grad_norm": 3.5308313369750977, + "learning_rate": 1.7541507108027466e-05, + "loss": 1.7106, + "step": 4555 + }, + { + "epoch": 0.7350758309131978, + "grad_norm": 9.241874694824219, + "learning_rate": 1.7521637688629393e-05, + "loss": 1.7049, + "step": 4556 + }, + { + "epoch": 0.7352371732817038, + "grad_norm": 4.630099296569824, + "learning_rate": 1.7501777137848625e-05, + "loss": 1.8567, + "step": 4557 + }, + { + "epoch": 0.7353985156502097, + "grad_norm": 4.028338432312012, + "learning_rate": 1.7481925461108295e-05, + "loss": 1.7505, + "step": 4558 + }, + { + "epoch": 0.7355598580187157, + "grad_norm": 7.373607158660889, + "learning_rate": 1.746208266382918e-05, + "loss": 2.1431, + "step": 4559 + }, + { + "epoch": 0.7357212003872217, + "grad_norm": 4.728418350219727, + "learning_rate": 1.7442248751429574e-05, + "loss": 1.8313, + "step": 4560 + }, + { + "epoch": 0.7358825427557276, + "grad_norm": 4.013697147369385, + "learning_rate": 1.7422423729325397e-05, + "loss": 1.7527, + "step": 4561 + }, + { + "epoch": 0.7360438851242336, + "grad_norm": 4.053626537322998, + "learning_rate": 1.7402607602930104e-05, + "loss": 1.8807, + "step": 4562 + }, + { + "epoch": 0.7362052274927396, + "grad_norm": 4.765527248382568, + "learning_rate": 1.7382800377654727e-05, + "loss": 1.9369, + "step": 4563 + }, + { + "epoch": 0.7363665698612456, + "grad_norm": 4.913549900054932, + "learning_rate": 1.7363002058907867e-05, + "loss": 1.7338, + "step": 4564 + }, + { + "epoch": 0.7365279122297516, + "grad_norm": 4.462325572967529, + "learning_rate": 1.734321265209572e-05, + "loss": 2.0695, + "step": 4565 + }, + { + "epoch": 0.7366892545982575, + "grad_norm": 4.088284969329834, + "learning_rate": 1.7323432162622006e-05, + "loss": 1.6925, + "step": 4566 + }, + { + "epoch": 0.7368505969667635, + "grad_norm": 3.6851367950439453, + "learning_rate": 1.730366059588805e-05, + "loss": 1.7553, + "step": 4567 + }, + { + "epoch": 0.7370119393352694, + "grad_norm": 4.4936418533325195, + "learning_rate": 1.728389795729272e-05, + "loss": 1.9055, + "step": 4568 + }, + { + "epoch": 0.7371732817037754, + "grad_norm": 3.878669261932373, + "learning_rate": 1.7264144252232422e-05, + "loss": 2.0015, + "step": 4569 + }, + { + "epoch": 0.7373346240722813, + "grad_norm": 4.058463096618652, + "learning_rate": 1.724439948610119e-05, + "loss": 1.846, + "step": 4570 + }, + { + "epoch": 0.7374959664407873, + "grad_norm": 4.332618713378906, + "learning_rate": 1.7224663664290536e-05, + "loss": 2.031, + "step": 4571 + }, + { + "epoch": 0.7376573088092934, + "grad_norm": 3.5738584995269775, + "learning_rate": 1.7204936792189607e-05, + "loss": 1.8451, + "step": 4572 + }, + { + "epoch": 0.7378186511777993, + "grad_norm": 5.251870632171631, + "learning_rate": 1.7185218875185035e-05, + "loss": 1.9784, + "step": 4573 + }, + { + "epoch": 0.7379799935463053, + "grad_norm": 4.949958324432373, + "learning_rate": 1.7165509918661067e-05, + "loss": 1.998, + "step": 4574 + }, + { + "epoch": 0.7381413359148112, + "grad_norm": 4.119620323181152, + "learning_rate": 1.7145809927999447e-05, + "loss": 1.8491, + "step": 4575 + }, + { + "epoch": 0.7383026782833172, + "grad_norm": 5.099935054779053, + "learning_rate": 1.7126118908579535e-05, + "loss": 1.8617, + "step": 4576 + }, + { + "epoch": 0.7384640206518231, + "grad_norm": 4.226795196533203, + "learning_rate": 1.710643686577818e-05, + "loss": 1.9372, + "step": 4577 + }, + { + "epoch": 0.7386253630203291, + "grad_norm": 4.3429107666015625, + "learning_rate": 1.708676380496983e-05, + "loss": 1.9013, + "step": 4578 + }, + { + "epoch": 0.7387867053888351, + "grad_norm": 4.148284435272217, + "learning_rate": 1.7067099731526444e-05, + "loss": 1.939, + "step": 4579 + }, + { + "epoch": 0.7389480477573411, + "grad_norm": 5.373454570770264, + "learning_rate": 1.7047444650817517e-05, + "loss": 1.8943, + "step": 4580 + }, + { + "epoch": 0.7391093901258471, + "grad_norm": 4.9623260498046875, + "learning_rate": 1.7027798568210156e-05, + "loss": 1.8009, + "step": 4581 + }, + { + "epoch": 0.739270732494353, + "grad_norm": 3.955017328262329, + "learning_rate": 1.700816148906894e-05, + "loss": 1.8899, + "step": 4582 + }, + { + "epoch": 0.739432074862859, + "grad_norm": 4.221274375915527, + "learning_rate": 1.698853341875602e-05, + "loss": 1.7815, + "step": 4583 + }, + { + "epoch": 0.739593417231365, + "grad_norm": 3.688870668411255, + "learning_rate": 1.6968914362631065e-05, + "loss": 1.7886, + "step": 4584 + }, + { + "epoch": 0.7397547595998709, + "grad_norm": 4.286644458770752, + "learning_rate": 1.6949304326051335e-05, + "loss": 1.8917, + "step": 4585 + }, + { + "epoch": 0.7399161019683769, + "grad_norm": 4.146304130554199, + "learning_rate": 1.692970331437155e-05, + "loss": 1.8755, + "step": 4586 + }, + { + "epoch": 0.7400774443368828, + "grad_norm": 3.87103533744812, + "learning_rate": 1.6910111332944056e-05, + "loss": 1.9856, + "step": 4587 + }, + { + "epoch": 0.7402387867053888, + "grad_norm": 4.180723190307617, + "learning_rate": 1.689052838711864e-05, + "loss": 1.9044, + "step": 4588 + }, + { + "epoch": 0.7404001290738949, + "grad_norm": 3.6202878952026367, + "learning_rate": 1.6870954482242707e-05, + "loss": 1.6294, + "step": 4589 + }, + { + "epoch": 0.7405614714424008, + "grad_norm": 4.394001483917236, + "learning_rate": 1.685138962366112e-05, + "loss": 2.2767, + "step": 4590 + }, + { + "epoch": 0.7407228138109068, + "grad_norm": 4.077112197875977, + "learning_rate": 1.683183381671633e-05, + "loss": 1.8177, + "step": 4591 + }, + { + "epoch": 0.7408841561794127, + "grad_norm": 4.718794345855713, + "learning_rate": 1.6812287066748262e-05, + "loss": 1.889, + "step": 4592 + }, + { + "epoch": 0.7410454985479187, + "grad_norm": 4.432754993438721, + "learning_rate": 1.6792749379094437e-05, + "loss": 1.9168, + "step": 4593 + }, + { + "epoch": 0.7412068409164246, + "grad_norm": 3.764059066772461, + "learning_rate": 1.6773220759089835e-05, + "loss": 1.9586, + "step": 4594 + }, + { + "epoch": 0.7413681832849306, + "grad_norm": 4.602880001068115, + "learning_rate": 1.6753701212066975e-05, + "loss": 1.8579, + "step": 4595 + }, + { + "epoch": 0.7415295256534365, + "grad_norm": 5.309647083282471, + "learning_rate": 1.6734190743355944e-05, + "loss": 1.8077, + "step": 4596 + }, + { + "epoch": 0.7416908680219426, + "grad_norm": 4.698686599731445, + "learning_rate": 1.671468935828428e-05, + "loss": 1.7992, + "step": 4597 + }, + { + "epoch": 0.7418522103904486, + "grad_norm": 3.6300902366638184, + "learning_rate": 1.6695197062177108e-05, + "loss": 1.8038, + "step": 4598 + }, + { + "epoch": 0.7420135527589545, + "grad_norm": 4.105565547943115, + "learning_rate": 1.6675713860357036e-05, + "loss": 1.7547, + "step": 4599 + }, + { + "epoch": 0.7421748951274605, + "grad_norm": 3.8115761280059814, + "learning_rate": 1.665623975814416e-05, + "loss": 1.9306, + "step": 4600 + }, + { + "epoch": 0.7423362374959664, + "grad_norm": 3.8606653213500977, + "learning_rate": 1.663677476085616e-05, + "loss": 1.61, + "step": 4601 + }, + { + "epoch": 0.7424975798644724, + "grad_norm": 4.54530143737793, + "learning_rate": 1.6617318873808184e-05, + "loss": 1.9141, + "step": 4602 + }, + { + "epoch": 0.7426589222329784, + "grad_norm": 4.4852681159973145, + "learning_rate": 1.6597872102312885e-05, + "loss": 1.7797, + "step": 4603 + }, + { + "epoch": 0.7428202646014843, + "grad_norm": 5.656156063079834, + "learning_rate": 1.6578434451680468e-05, + "loss": 1.9747, + "step": 4604 + }, + { + "epoch": 0.7429816069699903, + "grad_norm": 3.343182325363159, + "learning_rate": 1.6559005927218614e-05, + "loss": 1.6285, + "step": 4605 + }, + { + "epoch": 0.7431429493384963, + "grad_norm": 4.80542516708374, + "learning_rate": 1.6539586534232504e-05, + "loss": 2.0188, + "step": 4606 + }, + { + "epoch": 0.7433042917070023, + "grad_norm": 3.5260865688323975, + "learning_rate": 1.652017627802487e-05, + "loss": 1.6938, + "step": 4607 + }, + { + "epoch": 0.7434656340755083, + "grad_norm": 4.347525596618652, + "learning_rate": 1.6500775163895893e-05, + "loss": 1.7483, + "step": 4608 + }, + { + "epoch": 0.7436269764440142, + "grad_norm": 3.326824426651001, + "learning_rate": 1.6481383197143325e-05, + "loss": 1.8742, + "step": 4609 + }, + { + "epoch": 0.7437883188125202, + "grad_norm": 4.1977033615112305, + "learning_rate": 1.646200038306234e-05, + "loss": 1.7538, + "step": 4610 + }, + { + "epoch": 0.7439496611810261, + "grad_norm": 3.4120020866394043, + "learning_rate": 1.6442626726945687e-05, + "loss": 1.8996, + "step": 4611 + }, + { + "epoch": 0.7441110035495321, + "grad_norm": 4.192962646484375, + "learning_rate": 1.6423262234083557e-05, + "loss": 1.7158, + "step": 4612 + }, + { + "epoch": 0.744272345918038, + "grad_norm": 4.311139106750488, + "learning_rate": 1.640390690976369e-05, + "loss": 2.0214, + "step": 4613 + }, + { + "epoch": 0.744433688286544, + "grad_norm": 4.723755359649658, + "learning_rate": 1.6384560759271267e-05, + "loss": 1.837, + "step": 4614 + }, + { + "epoch": 0.7445950306550501, + "grad_norm": 3.802628755569458, + "learning_rate": 1.6365223787889017e-05, + "loss": 1.8493, + "step": 4615 + }, + { + "epoch": 0.744756373023556, + "grad_norm": 4.033158779144287, + "learning_rate": 1.634589600089712e-05, + "loss": 1.8956, + "step": 4616 + }, + { + "epoch": 0.744917715392062, + "grad_norm": 5.609979629516602, + "learning_rate": 1.6326577403573284e-05, + "loss": 1.9424, + "step": 4617 + }, + { + "epoch": 0.7450790577605679, + "grad_norm": 3.737351894378662, + "learning_rate": 1.6307268001192688e-05, + "loss": 1.8098, + "step": 4618 + }, + { + "epoch": 0.7452404001290739, + "grad_norm": 4.970017433166504, + "learning_rate": 1.6287967799027975e-05, + "loss": 1.7239, + "step": 4619 + }, + { + "epoch": 0.7454017424975798, + "grad_norm": 4.549147605895996, + "learning_rate": 1.626867680234934e-05, + "loss": 2.1699, + "step": 4620 + }, + { + "epoch": 0.7455630848660858, + "grad_norm": 5.983554363250732, + "learning_rate": 1.6249395016424416e-05, + "loss": 1.7461, + "step": 4621 + }, + { + "epoch": 0.7457244272345918, + "grad_norm": 4.125341892242432, + "learning_rate": 1.6230122446518327e-05, + "loss": 1.9705, + "step": 4622 + }, + { + "epoch": 0.7458857696030978, + "grad_norm": 4.720033645629883, + "learning_rate": 1.6210859097893667e-05, + "loss": 1.8833, + "step": 4623 + }, + { + "epoch": 0.7460471119716038, + "grad_norm": 3.916238784790039, + "learning_rate": 1.6191604975810565e-05, + "loss": 1.9698, + "step": 4624 + }, + { + "epoch": 0.7462084543401097, + "grad_norm": 6.627416133880615, + "learning_rate": 1.6172360085526565e-05, + "loss": 1.772, + "step": 4625 + }, + { + "epoch": 0.7463697967086157, + "grad_norm": 4.832714080810547, + "learning_rate": 1.615312443229676e-05, + "loss": 1.8113, + "step": 4626 + }, + { + "epoch": 0.7465311390771217, + "grad_norm": 4.001484394073486, + "learning_rate": 1.6133898021373646e-05, + "loss": 2.0775, + "step": 4627 + }, + { + "epoch": 0.7466924814456276, + "grad_norm": 3.636345148086548, + "learning_rate": 1.6114680858007257e-05, + "loss": 1.9637, + "step": 4628 + }, + { + "epoch": 0.7468538238141336, + "grad_norm": 3.999509572982788, + "learning_rate": 1.609547294744505e-05, + "loss": 1.8473, + "step": 4629 + }, + { + "epoch": 0.7470151661826395, + "grad_norm": 5.523262977600098, + "learning_rate": 1.6076274294932013e-05, + "loss": 1.828, + "step": 4630 + }, + { + "epoch": 0.7471765085511455, + "grad_norm": 3.6508684158325195, + "learning_rate": 1.605708490571056e-05, + "loss": 1.7955, + "step": 4631 + }, + { + "epoch": 0.7473378509196515, + "grad_norm": 4.2339701652526855, + "learning_rate": 1.603790478502058e-05, + "loss": 1.7679, + "step": 4632 + }, + { + "epoch": 0.7474991932881575, + "grad_norm": 4.072476387023926, + "learning_rate": 1.6018733938099462e-05, + "loss": 1.8788, + "step": 4633 + }, + { + "epoch": 0.7476605356566635, + "grad_norm": 3.755317449569702, + "learning_rate": 1.5999572370182016e-05, + "loss": 1.6835, + "step": 4634 + }, + { + "epoch": 0.7478218780251694, + "grad_norm": 4.9306535720825195, + "learning_rate": 1.5980420086500575e-05, + "loss": 1.844, + "step": 4635 + }, + { + "epoch": 0.7479832203936754, + "grad_norm": 3.9784042835235596, + "learning_rate": 1.5961277092284876e-05, + "loss": 1.9675, + "step": 4636 + }, + { + "epoch": 0.7481445627621813, + "grad_norm": 4.35151481628418, + "learning_rate": 1.5942143392762176e-05, + "loss": 1.8588, + "step": 4637 + }, + { + "epoch": 0.7483059051306873, + "grad_norm": 3.947786331176758, + "learning_rate": 1.592301899315716e-05, + "loss": 2.0314, + "step": 4638 + }, + { + "epoch": 0.7484672474991932, + "grad_norm": 3.782907724380493, + "learning_rate": 1.5903903898691962e-05, + "loss": 1.7101, + "step": 4639 + }, + { + "epoch": 0.7486285898676993, + "grad_norm": 3.448230266571045, + "learning_rate": 1.5884798114586226e-05, + "loss": 1.7833, + "step": 4640 + }, + { + "epoch": 0.7487899322362053, + "grad_norm": 4.434009552001953, + "learning_rate": 1.5865701646057002e-05, + "loss": 1.9052, + "step": 4641 + }, + { + "epoch": 0.7489512746047112, + "grad_norm": 3.897461414337158, + "learning_rate": 1.58466144983188e-05, + "loss": 1.7225, + "step": 4642 + }, + { + "epoch": 0.7491126169732172, + "grad_norm": 3.770840644836426, + "learning_rate": 1.5827536676583642e-05, + "loss": 1.7474, + "step": 4643 + }, + { + "epoch": 0.7492739593417231, + "grad_norm": 3.8453481197357178, + "learning_rate": 1.5808468186060936e-05, + "loss": 1.7936, + "step": 4644 + }, + { + "epoch": 0.7494353017102291, + "grad_norm": 4.0624680519104, + "learning_rate": 1.5789409031957563e-05, + "loss": 1.8487, + "step": 4645 + }, + { + "epoch": 0.749596644078735, + "grad_norm": 3.979088068008423, + "learning_rate": 1.5770359219477887e-05, + "loss": 1.6695, + "step": 4646 + }, + { + "epoch": 0.749757986447241, + "grad_norm": 4.707864284515381, + "learning_rate": 1.5751318753823658e-05, + "loss": 1.9013, + "step": 4647 + }, + { + "epoch": 0.749919328815747, + "grad_norm": 4.127801895141602, + "learning_rate": 1.573228764019415e-05, + "loss": 1.6308, + "step": 4648 + }, + { + "epoch": 0.750080671184253, + "grad_norm": 4.5204572677612305, + "learning_rate": 1.5713265883786e-05, + "loss": 1.6404, + "step": 4649 + }, + { + "epoch": 0.750242013552759, + "grad_norm": 5.12779426574707, + "learning_rate": 1.5694253489793374e-05, + "loss": 2.094, + "step": 4650 + }, + { + "epoch": 0.750403355921265, + "grad_norm": 3.957977056503296, + "learning_rate": 1.5675250463407807e-05, + "loss": 1.878, + "step": 4651 + }, + { + "epoch": 0.7505646982897709, + "grad_norm": 4.045324802398682, + "learning_rate": 1.5656256809818342e-05, + "loss": 1.8236, + "step": 4652 + }, + { + "epoch": 0.7507260406582769, + "grad_norm": 4.513662338256836, + "learning_rate": 1.56372725342114e-05, + "loss": 1.6848, + "step": 4653 + }, + { + "epoch": 0.7508873830267828, + "grad_norm": 4.850427627563477, + "learning_rate": 1.5618297641770895e-05, + "loss": 1.5516, + "step": 4654 + }, + { + "epoch": 0.7510487253952888, + "grad_norm": 3.8891425132751465, + "learning_rate": 1.5599332137678137e-05, + "loss": 1.8541, + "step": 4655 + }, + { + "epoch": 0.7512100677637947, + "grad_norm": 4.624578952789307, + "learning_rate": 1.558037602711191e-05, + "loss": 1.9117, + "step": 4656 + }, + { + "epoch": 0.7513714101323007, + "grad_norm": 4.53134822845459, + "learning_rate": 1.5561429315248406e-05, + "loss": 1.9477, + "step": 4657 + }, + { + "epoch": 0.7515327525008068, + "grad_norm": 5.0841264724731445, + "learning_rate": 1.554249200726125e-05, + "loss": 1.7465, + "step": 4658 + }, + { + "epoch": 0.7516940948693127, + "grad_norm": 4.755808353424072, + "learning_rate": 1.5523564108321497e-05, + "loss": 1.8322, + "step": 4659 + }, + { + "epoch": 0.7518554372378187, + "grad_norm": 5.515625, + "learning_rate": 1.550464562359768e-05, + "loss": 1.7347, + "step": 4660 + }, + { + "epoch": 0.7520167796063246, + "grad_norm": 3.8248002529144287, + "learning_rate": 1.5485736558255697e-05, + "loss": 1.8377, + "step": 4661 + }, + { + "epoch": 0.7521781219748306, + "grad_norm": 5.635315418243408, + "learning_rate": 1.5466836917458893e-05, + "loss": 1.8682, + "step": 4662 + }, + { + "epoch": 0.7523394643433365, + "grad_norm": 5.133936882019043, + "learning_rate": 1.5447946706368084e-05, + "loss": 1.8941, + "step": 4663 + }, + { + "epoch": 0.7525008067118425, + "grad_norm": 3.916292667388916, + "learning_rate": 1.5429065930141433e-05, + "loss": 1.8327, + "step": 4664 + }, + { + "epoch": 0.7526621490803485, + "grad_norm": 4.06002950668335, + "learning_rate": 1.5410194593934607e-05, + "loss": 1.9623, + "step": 4665 + }, + { + "epoch": 0.7528234914488545, + "grad_norm": 4.704767227172852, + "learning_rate": 1.5391332702900625e-05, + "loss": 1.8785, + "step": 4666 + }, + { + "epoch": 0.7529848338173605, + "grad_norm": 3.932387113571167, + "learning_rate": 1.5372480262189986e-05, + "loss": 1.924, + "step": 4667 + }, + { + "epoch": 0.7531461761858664, + "grad_norm": 5.115250587463379, + "learning_rate": 1.535363727695055e-05, + "loss": 1.9074, + "step": 4668 + }, + { + "epoch": 0.7533075185543724, + "grad_norm": 4.762061595916748, + "learning_rate": 1.5334803752327663e-05, + "loss": 1.7341, + "step": 4669 + }, + { + "epoch": 0.7534688609228783, + "grad_norm": 3.636070489883423, + "learning_rate": 1.5315979693464037e-05, + "loss": 1.466, + "step": 4670 + }, + { + "epoch": 0.7536302032913843, + "grad_norm": 6.28103494644165, + "learning_rate": 1.5297165105499794e-05, + "loss": 1.6176, + "step": 4671 + }, + { + "epoch": 0.7537915456598903, + "grad_norm": 5.284980773925781, + "learning_rate": 1.5278359993572517e-05, + "loss": 2.0502, + "step": 4672 + }, + { + "epoch": 0.7539528880283962, + "grad_norm": 4.022558689117432, + "learning_rate": 1.5259564362817148e-05, + "loss": 1.9754, + "step": 4673 + }, + { + "epoch": 0.7541142303969022, + "grad_norm": 4.9722185134887695, + "learning_rate": 1.5240778218366098e-05, + "loss": 1.813, + "step": 4674 + }, + { + "epoch": 0.7542755727654082, + "grad_norm": 4.032649517059326, + "learning_rate": 1.5222001565349114e-05, + "loss": 1.7048, + "step": 4675 + }, + { + "epoch": 0.7544369151339142, + "grad_norm": 4.553777694702148, + "learning_rate": 1.5203234408893436e-05, + "loss": 2.0578, + "step": 4676 + }, + { + "epoch": 0.7545982575024202, + "grad_norm": 3.5396218299865723, + "learning_rate": 1.5184476754123644e-05, + "loss": 1.9123, + "step": 4677 + }, + { + "epoch": 0.7547595998709261, + "grad_norm": 4.456936359405518, + "learning_rate": 1.516572860616175e-05, + "loss": 1.8527, + "step": 4678 + }, + { + "epoch": 0.7549209422394321, + "grad_norm": 4.161593437194824, + "learning_rate": 1.5146989970127157e-05, + "loss": 1.6447, + "step": 4679 + }, + { + "epoch": 0.755082284607938, + "grad_norm": 4.463810443878174, + "learning_rate": 1.51282608511367e-05, + "loss": 1.6882, + "step": 4680 + }, + { + "epoch": 0.755243626976444, + "grad_norm": 5.106133460998535, + "learning_rate": 1.5109541254304587e-05, + "loss": 1.8498, + "step": 4681 + }, + { + "epoch": 0.7554049693449499, + "grad_norm": 5.957087993621826, + "learning_rate": 1.5090831184742415e-05, + "loss": 1.9037, + "step": 4682 + }, + { + "epoch": 0.755566311713456, + "grad_norm": 3.988837480545044, + "learning_rate": 1.507213064755924e-05, + "loss": 1.8145, + "step": 4683 + }, + { + "epoch": 0.755727654081962, + "grad_norm": 3.7715859413146973, + "learning_rate": 1.5053439647861434e-05, + "loss": 1.606, + "step": 4684 + }, + { + "epoch": 0.7558889964504679, + "grad_norm": 5.431835651397705, + "learning_rate": 1.5034758190752835e-05, + "loss": 2.0316, + "step": 4685 + }, + { + "epoch": 0.7560503388189739, + "grad_norm": 4.677783012390137, + "learning_rate": 1.5016086281334624e-05, + "loss": 1.8509, + "step": 4686 + }, + { + "epoch": 0.7562116811874798, + "grad_norm": 4.996568202972412, + "learning_rate": 1.4997423924705417e-05, + "loss": 1.9228, + "step": 4687 + }, + { + "epoch": 0.7563730235559858, + "grad_norm": 5.09377908706665, + "learning_rate": 1.4978771125961177e-05, + "loss": 1.8016, + "step": 4688 + }, + { + "epoch": 0.7565343659244917, + "grad_norm": 5.763912677764893, + "learning_rate": 1.4960127890195308e-05, + "loss": 1.7141, + "step": 4689 + }, + { + "epoch": 0.7566957082929977, + "grad_norm": 4.399535179138184, + "learning_rate": 1.4941494222498543e-05, + "loss": 1.7796, + "step": 4690 + }, + { + "epoch": 0.7568570506615037, + "grad_norm": 3.825272798538208, + "learning_rate": 1.4922870127959065e-05, + "loss": 1.8581, + "step": 4691 + }, + { + "epoch": 0.7570183930300097, + "grad_norm": 4.161649227142334, + "learning_rate": 1.4904255611662387e-05, + "loss": 1.9555, + "step": 4692 + }, + { + "epoch": 0.7571797353985157, + "grad_norm": 4.9506707191467285, + "learning_rate": 1.488565067869146e-05, + "loss": 1.7525, + "step": 4693 + }, + { + "epoch": 0.7573410777670216, + "grad_norm": 3.6513702869415283, + "learning_rate": 1.4867055334126578e-05, + "loss": 1.7729, + "step": 4694 + }, + { + "epoch": 0.7575024201355276, + "grad_norm": 3.864866256713867, + "learning_rate": 1.4848469583045404e-05, + "loss": 1.4987, + "step": 4695 + }, + { + "epoch": 0.7576637625040336, + "grad_norm": 4.677921772003174, + "learning_rate": 1.4829893430523052e-05, + "loss": 1.9189, + "step": 4696 + }, + { + "epoch": 0.7578251048725395, + "grad_norm": 3.988673210144043, + "learning_rate": 1.4811326881631937e-05, + "loss": 2.0152, + "step": 4697 + }, + { + "epoch": 0.7579864472410455, + "grad_norm": 5.229036331176758, + "learning_rate": 1.4792769941441903e-05, + "loss": 1.8561, + "step": 4698 + }, + { + "epoch": 0.7581477896095514, + "grad_norm": 3.684943675994873, + "learning_rate": 1.4774222615020122e-05, + "loss": 1.7979, + "step": 4699 + }, + { + "epoch": 0.7583091319780574, + "grad_norm": 3.7034828662872314, + "learning_rate": 1.4755684907431205e-05, + "loss": 1.8502, + "step": 4700 + }, + { + "epoch": 0.7584704743465635, + "grad_norm": 4.172142028808594, + "learning_rate": 1.473715682373707e-05, + "loss": 1.8207, + "step": 4701 + }, + { + "epoch": 0.7586318167150694, + "grad_norm": 5.415553092956543, + "learning_rate": 1.4718638368997073e-05, + "loss": 1.7674, + "step": 4702 + }, + { + "epoch": 0.7587931590835754, + "grad_norm": 4.393552780151367, + "learning_rate": 1.4700129548267872e-05, + "loss": 1.844, + "step": 4703 + }, + { + "epoch": 0.7589545014520813, + "grad_norm": 11.484271049499512, + "learning_rate": 1.468163036660356e-05, + "loss": 1.9213, + "step": 4704 + }, + { + "epoch": 0.7591158438205873, + "grad_norm": 3.798029661178589, + "learning_rate": 1.4663140829055533e-05, + "loss": 1.8191, + "step": 4705 + }, + { + "epoch": 0.7592771861890932, + "grad_norm": 4.819342136383057, + "learning_rate": 1.4644660940672627e-05, + "loss": 1.8311, + "step": 4706 + }, + { + "epoch": 0.7594385285575992, + "grad_norm": 3.9851481914520264, + "learning_rate": 1.462619070650098e-05, + "loss": 1.7942, + "step": 4707 + }, + { + "epoch": 0.7595998709261051, + "grad_norm": 5.322359561920166, + "learning_rate": 1.4607730131584108e-05, + "loss": 1.6912, + "step": 4708 + }, + { + "epoch": 0.7597612132946112, + "grad_norm": 3.942171812057495, + "learning_rate": 1.458927922096292e-05, + "loss": 1.8307, + "step": 4709 + }, + { + "epoch": 0.7599225556631172, + "grad_norm": 5.3993730545043945, + "learning_rate": 1.4570837979675644e-05, + "loss": 1.9579, + "step": 4710 + }, + { + "epoch": 0.7600838980316231, + "grad_norm": 4.565049171447754, + "learning_rate": 1.4552406412757913e-05, + "loss": 1.7947, + "step": 4711 + }, + { + "epoch": 0.7602452404001291, + "grad_norm": 3.4901652336120605, + "learning_rate": 1.4533984525242667e-05, + "loss": 1.8477, + "step": 4712 + }, + { + "epoch": 0.760406582768635, + "grad_norm": 6.0585222244262695, + "learning_rate": 1.4515572322160254e-05, + "loss": 1.8443, + "step": 4713 + }, + { + "epoch": 0.760567925137141, + "grad_norm": 3.3346543312072754, + "learning_rate": 1.4497169808538325e-05, + "loss": 1.8413, + "step": 4714 + }, + { + "epoch": 0.760729267505647, + "grad_norm": 5.118963718414307, + "learning_rate": 1.4478776989401949e-05, + "loss": 1.8299, + "step": 4715 + }, + { + "epoch": 0.7608906098741529, + "grad_norm": 3.9776089191436768, + "learning_rate": 1.4460393869773492e-05, + "loss": 1.8031, + "step": 4716 + }, + { + "epoch": 0.7610519522426589, + "grad_norm": 4.204662799835205, + "learning_rate": 1.444202045467269e-05, + "loss": 1.8442, + "step": 4717 + }, + { + "epoch": 0.7612132946111649, + "grad_norm": 3.9457521438598633, + "learning_rate": 1.4423656749116621e-05, + "loss": 1.8611, + "step": 4718 + }, + { + "epoch": 0.7613746369796709, + "grad_norm": 3.5638539791107178, + "learning_rate": 1.4405302758119743e-05, + "loss": 1.8136, + "step": 4719 + }, + { + "epoch": 0.7615359793481769, + "grad_norm": 4.715579986572266, + "learning_rate": 1.4386958486693835e-05, + "loss": 1.907, + "step": 4720 + }, + { + "epoch": 0.7616973217166828, + "grad_norm": 4.191335678100586, + "learning_rate": 1.4368623939848003e-05, + "loss": 1.9415, + "step": 4721 + }, + { + "epoch": 0.7618586640851888, + "grad_norm": 4.509104251861572, + "learning_rate": 1.435029912258875e-05, + "loss": 1.9443, + "step": 4722 + }, + { + "epoch": 0.7620200064536947, + "grad_norm": 4.599697113037109, + "learning_rate": 1.4331984039919877e-05, + "loss": 1.9331, + "step": 4723 + }, + { + "epoch": 0.7621813488222007, + "grad_norm": 4.296796798706055, + "learning_rate": 1.4313678696842559e-05, + "loss": 1.9413, + "step": 4724 + }, + { + "epoch": 0.7623426911907066, + "grad_norm": 5.785429000854492, + "learning_rate": 1.4295383098355264e-05, + "loss": 2.0427, + "step": 4725 + }, + { + "epoch": 0.7625040335592127, + "grad_norm": 5.768642425537109, + "learning_rate": 1.4277097249453874e-05, + "loss": 1.974, + "step": 4726 + }, + { + "epoch": 0.7626653759277187, + "grad_norm": 4.165225982666016, + "learning_rate": 1.425882115513153e-05, + "loss": 1.6863, + "step": 4727 + }, + { + "epoch": 0.7628267182962246, + "grad_norm": 4.48101282119751, + "learning_rate": 1.4240554820378772e-05, + "loss": 1.8847, + "step": 4728 + }, + { + "epoch": 0.7629880606647306, + "grad_norm": 5.002747058868408, + "learning_rate": 1.4222298250183413e-05, + "loss": 1.7281, + "step": 4729 + }, + { + "epoch": 0.7631494030332365, + "grad_norm": 4.607141017913818, + "learning_rate": 1.4204051449530676e-05, + "loss": 1.7632, + "step": 4730 + }, + { + "epoch": 0.7633107454017425, + "grad_norm": 6.016432285308838, + "learning_rate": 1.4185814423403038e-05, + "loss": 2.0498, + "step": 4731 + }, + { + "epoch": 0.7634720877702484, + "grad_norm": 4.647368907928467, + "learning_rate": 1.4167587176780378e-05, + "loss": 1.7946, + "step": 4732 + }, + { + "epoch": 0.7636334301387544, + "grad_norm": 4.305753231048584, + "learning_rate": 1.4149369714639853e-05, + "loss": 1.8973, + "step": 4733 + }, + { + "epoch": 0.7637947725072604, + "grad_norm": 4.50528621673584, + "learning_rate": 1.4131162041955948e-05, + "loss": 1.9, + "step": 4734 + }, + { + "epoch": 0.7639561148757664, + "grad_norm": 5.1306562423706055, + "learning_rate": 1.4112964163700527e-05, + "loss": 2.0559, + "step": 4735 + }, + { + "epoch": 0.7641174572442724, + "grad_norm": 4.5340576171875, + "learning_rate": 1.4094776084842725e-05, + "loss": 1.6647, + "step": 4736 + }, + { + "epoch": 0.7642787996127783, + "grad_norm": 4.456101417541504, + "learning_rate": 1.407659781034903e-05, + "loss": 1.6651, + "step": 4737 + }, + { + "epoch": 0.7644401419812843, + "grad_norm": 3.8839051723480225, + "learning_rate": 1.405842934518322e-05, + "loss": 1.9903, + "step": 4738 + }, + { + "epoch": 0.7646014843497903, + "grad_norm": 4.24583101272583, + "learning_rate": 1.4040270694306457e-05, + "loss": 1.789, + "step": 4739 + }, + { + "epoch": 0.7647628267182962, + "grad_norm": 4.413949012756348, + "learning_rate": 1.402212186267714e-05, + "loss": 1.8537, + "step": 4740 + }, + { + "epoch": 0.7649241690868022, + "grad_norm": 5.955898761749268, + "learning_rate": 1.400398285525108e-05, + "loss": 1.8682, + "step": 4741 + }, + { + "epoch": 0.7650855114553081, + "grad_norm": 4.545259475708008, + "learning_rate": 1.3985853676981314e-05, + "loss": 1.8026, + "step": 4742 + }, + { + "epoch": 0.7652468538238142, + "grad_norm": 4.107945919036865, + "learning_rate": 1.3967734332818266e-05, + "loss": 1.6435, + "step": 4743 + }, + { + "epoch": 0.7654081961923201, + "grad_norm": 4.146108150482178, + "learning_rate": 1.394962482770964e-05, + "loss": 2.089, + "step": 4744 + }, + { + "epoch": 0.7655695385608261, + "grad_norm": 4.125722885131836, + "learning_rate": 1.3931525166600446e-05, + "loss": 1.9152, + "step": 4745 + }, + { + "epoch": 0.7657308809293321, + "grad_norm": 3.9361684322357178, + "learning_rate": 1.3913435354433036e-05, + "loss": 1.7265, + "step": 4746 + }, + { + "epoch": 0.765892223297838, + "grad_norm": 4.01522159576416, + "learning_rate": 1.3895355396147041e-05, + "loss": 2.022, + "step": 4747 + }, + { + "epoch": 0.766053565666344, + "grad_norm": 4.31265115737915, + "learning_rate": 1.3877285296679438e-05, + "loss": 1.9056, + "step": 4748 + }, + { + "epoch": 0.7662149080348499, + "grad_norm": 3.7925353050231934, + "learning_rate": 1.3859225060964459e-05, + "loss": 1.7249, + "step": 4749 + }, + { + "epoch": 0.7663762504033559, + "grad_norm": 3.631206512451172, + "learning_rate": 1.3841174693933712e-05, + "loss": 1.9949, + "step": 4750 + }, + { + "epoch": 0.7665375927718618, + "grad_norm": 4.69987154006958, + "learning_rate": 1.382313420051604e-05, + "loss": 1.7979, + "step": 4751 + }, + { + "epoch": 0.7666989351403679, + "grad_norm": 4.0977091789245605, + "learning_rate": 1.3805103585637647e-05, + "loss": 1.701, + "step": 4752 + }, + { + "epoch": 0.7668602775088739, + "grad_norm": 4.90214204788208, + "learning_rate": 1.3787082854222005e-05, + "loss": 1.676, + "step": 4753 + }, + { + "epoch": 0.7670216198773798, + "grad_norm": 4.234584331512451, + "learning_rate": 1.3769072011189876e-05, + "loss": 1.7801, + "step": 4754 + }, + { + "epoch": 0.7671829622458858, + "grad_norm": 4.4239912033081055, + "learning_rate": 1.3751071061459381e-05, + "loss": 2.016, + "step": 4755 + }, + { + "epoch": 0.7673443046143917, + "grad_norm": 4.1023454666137695, + "learning_rate": 1.373308000994588e-05, + "loss": 1.9956, + "step": 4756 + }, + { + "epoch": 0.7675056469828977, + "grad_norm": 4.720330238342285, + "learning_rate": 1.3715098861562059e-05, + "loss": 1.6989, + "step": 4757 + }, + { + "epoch": 0.7676669893514037, + "grad_norm": 4.460195064544678, + "learning_rate": 1.3697127621217865e-05, + "loss": 2.011, + "step": 4758 + }, + { + "epoch": 0.7678283317199096, + "grad_norm": 3.4943008422851562, + "learning_rate": 1.3679166293820606e-05, + "loss": 1.9777, + "step": 4759 + }, + { + "epoch": 0.7679896740884156, + "grad_norm": 4.435558319091797, + "learning_rate": 1.366121488427481e-05, + "loss": 1.7261, + "step": 4760 + }, + { + "epoch": 0.7681510164569216, + "grad_norm": 4.208644866943359, + "learning_rate": 1.3643273397482365e-05, + "loss": 1.7009, + "step": 4761 + }, + { + "epoch": 0.7683123588254276, + "grad_norm": 4.223933696746826, + "learning_rate": 1.3625341838342376e-05, + "loss": 1.7992, + "step": 4762 + }, + { + "epoch": 0.7684737011939335, + "grad_norm": 4.545698165893555, + "learning_rate": 1.3607420211751321e-05, + "loss": 1.8289, + "step": 4763 + }, + { + "epoch": 0.7686350435624395, + "grad_norm": 3.726844549179077, + "learning_rate": 1.3589508522602873e-05, + "loss": 1.8346, + "step": 4764 + }, + { + "epoch": 0.7687963859309455, + "grad_norm": 4.714771270751953, + "learning_rate": 1.3571606775788087e-05, + "loss": 1.5316, + "step": 4765 + }, + { + "epoch": 0.7689577282994514, + "grad_norm": 4.0585103034973145, + "learning_rate": 1.3553714976195214e-05, + "loss": 1.7228, + "step": 4766 + }, + { + "epoch": 0.7691190706679574, + "grad_norm": 4.517932415008545, + "learning_rate": 1.3535833128709869e-05, + "loss": 1.8709, + "step": 4767 + }, + { + "epoch": 0.7692804130364633, + "grad_norm": 5.770853519439697, + "learning_rate": 1.351796123821487e-05, + "loss": 1.9515, + "step": 4768 + }, + { + "epoch": 0.7694417554049694, + "grad_norm": 4.701601982116699, + "learning_rate": 1.3500099309590397e-05, + "loss": 1.92, + "step": 4769 + }, + { + "epoch": 0.7696030977734754, + "grad_norm": 4.671560287475586, + "learning_rate": 1.348224734771385e-05, + "loss": 1.8667, + "step": 4770 + }, + { + "epoch": 0.7697644401419813, + "grad_norm": 4.16002893447876, + "learning_rate": 1.346440535745992e-05, + "loss": 1.7221, + "step": 4771 + }, + { + "epoch": 0.7699257825104873, + "grad_norm": 4.713809013366699, + "learning_rate": 1.3446573343700597e-05, + "loss": 1.7392, + "step": 4772 + }, + { + "epoch": 0.7700871248789932, + "grad_norm": 4.215362071990967, + "learning_rate": 1.3428751311305132e-05, + "loss": 1.9662, + "step": 4773 + }, + { + "epoch": 0.7702484672474992, + "grad_norm": 4.4590253829956055, + "learning_rate": 1.3410939265140027e-05, + "loss": 1.7877, + "step": 4774 + }, + { + "epoch": 0.7704098096160051, + "grad_norm": 3.882617950439453, + "learning_rate": 1.3393137210069118e-05, + "loss": 1.6606, + "step": 4775 + }, + { + "epoch": 0.7705711519845111, + "grad_norm": 3.123530626296997, + "learning_rate": 1.337534515095345e-05, + "loss": 1.9039, + "step": 4776 + }, + { + "epoch": 0.770732494353017, + "grad_norm": 5.224033832550049, + "learning_rate": 1.335756309265136e-05, + "loss": 1.7497, + "step": 4777 + }, + { + "epoch": 0.7708938367215231, + "grad_norm": 4.352362155914307, + "learning_rate": 1.3339791040018479e-05, + "loss": 1.8906, + "step": 4778 + }, + { + "epoch": 0.7710551790900291, + "grad_norm": 4.679987907409668, + "learning_rate": 1.3322028997907666e-05, + "loss": 1.9451, + "step": 4779 + }, + { + "epoch": 0.771216521458535, + "grad_norm": 4.4323883056640625, + "learning_rate": 1.3304276971169088e-05, + "loss": 1.9463, + "step": 4780 + }, + { + "epoch": 0.771377863827041, + "grad_norm": 4.972047805786133, + "learning_rate": 1.3286534964650121e-05, + "loss": 1.8902, + "step": 4781 + }, + { + "epoch": 0.771539206195547, + "grad_norm": 4.959163665771484, + "learning_rate": 1.3268802983195484e-05, + "loss": 1.9566, + "step": 4782 + }, + { + "epoch": 0.7717005485640529, + "grad_norm": 4.445774078369141, + "learning_rate": 1.3251081031647078e-05, + "loss": 1.8361, + "step": 4783 + }, + { + "epoch": 0.7718618909325589, + "grad_norm": 5.686622142791748, + "learning_rate": 1.3233369114844101e-05, + "loss": 1.9, + "step": 4784 + }, + { + "epoch": 0.7720232333010648, + "grad_norm": 4.390459060668945, + "learning_rate": 1.3215667237623036e-05, + "loss": 1.6565, + "step": 4785 + }, + { + "epoch": 0.7721845756695709, + "grad_norm": 4.644404888153076, + "learning_rate": 1.3197975404817564e-05, + "loss": 1.7143, + "step": 4786 + }, + { + "epoch": 0.7723459180380768, + "grad_norm": 4.473944664001465, + "learning_rate": 1.3180293621258694e-05, + "loss": 1.9453, + "step": 4787 + }, + { + "epoch": 0.7725072604065828, + "grad_norm": 5.86933708190918, + "learning_rate": 1.3162621891774617e-05, + "loss": 1.7406, + "step": 4788 + }, + { + "epoch": 0.7726686027750888, + "grad_norm": 3.9441027641296387, + "learning_rate": 1.3144960221190861e-05, + "loss": 1.6576, + "step": 4789 + }, + { + "epoch": 0.7728299451435947, + "grad_norm": 4.592226028442383, + "learning_rate": 1.3127308614330119e-05, + "loss": 1.7783, + "step": 4790 + }, + { + "epoch": 0.7729912875121007, + "grad_norm": 3.8888795375823975, + "learning_rate": 1.3109667076012417e-05, + "loss": 1.8405, + "step": 4791 + }, + { + "epoch": 0.7731526298806066, + "grad_norm": 8.0985689163208, + "learning_rate": 1.3092035611054976e-05, + "loss": 2.0734, + "step": 4792 + }, + { + "epoch": 0.7733139722491126, + "grad_norm": 4.424942970275879, + "learning_rate": 1.3074414224272286e-05, + "loss": 1.8247, + "step": 4793 + }, + { + "epoch": 0.7734753146176185, + "grad_norm": 4.277080535888672, + "learning_rate": 1.3056802920476075e-05, + "loss": 1.8728, + "step": 4794 + }, + { + "epoch": 0.7736366569861246, + "grad_norm": 6.436392784118652, + "learning_rate": 1.3039201704475345e-05, + "loss": 1.9738, + "step": 4795 + }, + { + "epoch": 0.7737979993546306, + "grad_norm": 4.508710861206055, + "learning_rate": 1.3021610581076316e-05, + "loss": 2.0282, + "step": 4796 + }, + { + "epoch": 0.7739593417231365, + "grad_norm": 5.0215582847595215, + "learning_rate": 1.3004029555082453e-05, + "loss": 1.5631, + "step": 4797 + }, + { + "epoch": 0.7741206840916425, + "grad_norm": 4.453874588012695, + "learning_rate": 1.2986458631294491e-05, + "loss": 1.8881, + "step": 4798 + }, + { + "epoch": 0.7742820264601484, + "grad_norm": 4.145251750946045, + "learning_rate": 1.296889781451036e-05, + "loss": 1.8948, + "step": 4799 + }, + { + "epoch": 0.7744433688286544, + "grad_norm": 3.951270818710327, + "learning_rate": 1.2951347109525291e-05, + "loss": 1.8586, + "step": 4800 + }, + { + "epoch": 0.7746047111971603, + "grad_norm": 5.916871070861816, + "learning_rate": 1.2933806521131692e-05, + "loss": 2.2301, + "step": 4801 + }, + { + "epoch": 0.7747660535656663, + "grad_norm": 4.65308952331543, + "learning_rate": 1.2916276054119259e-05, + "loss": 1.8263, + "step": 4802 + }, + { + "epoch": 0.7749273959341723, + "grad_norm": 4.195945739746094, + "learning_rate": 1.2898755713274879e-05, + "loss": 1.8826, + "step": 4803 + }, + { + "epoch": 0.7750887383026783, + "grad_norm": 5.570291996002197, + "learning_rate": 1.2881245503382722e-05, + "loss": 1.7518, + "step": 4804 + }, + { + "epoch": 0.7752500806711843, + "grad_norm": 4.299945831298828, + "learning_rate": 1.2863745429224144e-05, + "loss": 1.8924, + "step": 4805 + }, + { + "epoch": 0.7754114230396902, + "grad_norm": 3.5652012825012207, + "learning_rate": 1.2846255495577774e-05, + "loss": 1.6966, + "step": 4806 + }, + { + "epoch": 0.7755727654081962, + "grad_norm": 4.118142127990723, + "learning_rate": 1.2828775707219442e-05, + "loss": 1.6356, + "step": 4807 + }, + { + "epoch": 0.7757341077767022, + "grad_norm": 3.89166522026062, + "learning_rate": 1.281130606892223e-05, + "loss": 1.9345, + "step": 4808 + }, + { + "epoch": 0.7758954501452081, + "grad_norm": 5.561777591705322, + "learning_rate": 1.2793846585456437e-05, + "loss": 2.08, + "step": 4809 + }, + { + "epoch": 0.7760567925137141, + "grad_norm": 3.5589771270751953, + "learning_rate": 1.2776397261589573e-05, + "loss": 1.6954, + "step": 4810 + }, + { + "epoch": 0.77621813488222, + "grad_norm": 4.224887847900391, + "learning_rate": 1.2758958102086416e-05, + "loss": 1.7887, + "step": 4811 + }, + { + "epoch": 0.7763794772507261, + "grad_norm": 4.228940010070801, + "learning_rate": 1.2741529111708934e-05, + "loss": 1.904, + "step": 4812 + }, + { + "epoch": 0.776540819619232, + "grad_norm": 5.163037300109863, + "learning_rate": 1.2724110295216301e-05, + "loss": 1.9206, + "step": 4813 + }, + { + "epoch": 0.776702161987738, + "grad_norm": 5.459940433502197, + "learning_rate": 1.2706701657364988e-05, + "loss": 1.79, + "step": 4814 + }, + { + "epoch": 0.776863504356244, + "grad_norm": 5.294158935546875, + "learning_rate": 1.2689303202908608e-05, + "loss": 1.907, + "step": 4815 + }, + { + "epoch": 0.7770248467247499, + "grad_norm": 4.8452348709106445, + "learning_rate": 1.2671914936598018e-05, + "loss": 1.9486, + "step": 4816 + }, + { + "epoch": 0.7771861890932559, + "grad_norm": 4.328494548797607, + "learning_rate": 1.2654536863181326e-05, + "loss": 1.8546, + "step": 4817 + }, + { + "epoch": 0.7773475314617618, + "grad_norm": 4.225079536437988, + "learning_rate": 1.2637168987403797e-05, + "loss": 1.8669, + "step": 4818 + }, + { + "epoch": 0.7775088738302678, + "grad_norm": 6.0057244300842285, + "learning_rate": 1.2619811314007974e-05, + "loss": 1.8974, + "step": 4819 + }, + { + "epoch": 0.7776702161987737, + "grad_norm": 6.11793851852417, + "learning_rate": 1.260246384773357e-05, + "loss": 1.9355, + "step": 4820 + }, + { + "epoch": 0.7778315585672798, + "grad_norm": 4.3321075439453125, + "learning_rate": 1.258512659331751e-05, + "loss": 1.9473, + "step": 4821 + }, + { + "epoch": 0.7779929009357858, + "grad_norm": 4.267263412475586, + "learning_rate": 1.256779955549397e-05, + "loss": 1.7181, + "step": 4822 + }, + { + "epoch": 0.7781542433042917, + "grad_norm": 5.465700149536133, + "learning_rate": 1.2550482738994285e-05, + "loss": 1.9786, + "step": 4823 + }, + { + "epoch": 0.7783155856727977, + "grad_norm": 4.487921714782715, + "learning_rate": 1.253317614854706e-05, + "loss": 1.9194, + "step": 4824 + }, + { + "epoch": 0.7784769280413036, + "grad_norm": 5.506918907165527, + "learning_rate": 1.2515879788878038e-05, + "loss": 2.0618, + "step": 4825 + }, + { + "epoch": 0.7786382704098096, + "grad_norm": 4.063748836517334, + "learning_rate": 1.2498593664710234e-05, + "loss": 1.7781, + "step": 4826 + }, + { + "epoch": 0.7787996127783156, + "grad_norm": 4.7127909660339355, + "learning_rate": 1.2481317780763802e-05, + "loss": 2.0239, + "step": 4827 + }, + { + "epoch": 0.7789609551468215, + "grad_norm": 4.559621334075928, + "learning_rate": 1.2464052141756177e-05, + "loss": 2.127, + "step": 4828 + }, + { + "epoch": 0.7791222975153276, + "grad_norm": 4.137547016143799, + "learning_rate": 1.2446796752401913e-05, + "loss": 1.9814, + "step": 4829 + }, + { + "epoch": 0.7792836398838335, + "grad_norm": 5.029731273651123, + "learning_rate": 1.2429551617412844e-05, + "loss": 1.7936, + "step": 4830 + }, + { + "epoch": 0.7794449822523395, + "grad_norm": 5.414242267608643, + "learning_rate": 1.2412316741497953e-05, + "loss": 2.129, + "step": 4831 + }, + { + "epoch": 0.7796063246208454, + "grad_norm": 3.9555225372314453, + "learning_rate": 1.2395092129363428e-05, + "loss": 2.1969, + "step": 4832 + }, + { + "epoch": 0.7797676669893514, + "grad_norm": 3.4828078746795654, + "learning_rate": 1.2377877785712649e-05, + "loss": 1.8161, + "step": 4833 + }, + { + "epoch": 0.7799290093578574, + "grad_norm": 3.443568706512451, + "learning_rate": 1.236067371524624e-05, + "loss": 1.5697, + "step": 4834 + }, + { + "epoch": 0.7800903517263633, + "grad_norm": 4.887693405151367, + "learning_rate": 1.2343479922661965e-05, + "loss": 1.835, + "step": 4835 + }, + { + "epoch": 0.7802516940948693, + "grad_norm": 4.772017478942871, + "learning_rate": 1.2326296412654787e-05, + "loss": 1.9179, + "step": 4836 + }, + { + "epoch": 0.7804130364633752, + "grad_norm": 5.858239650726318, + "learning_rate": 1.2309123189916904e-05, + "loss": 1.7846, + "step": 4837 + }, + { + "epoch": 0.7805743788318813, + "grad_norm": 4.29452657699585, + "learning_rate": 1.2291960259137647e-05, + "loss": 1.9617, + "step": 4838 + }, + { + "epoch": 0.7807357212003873, + "grad_norm": 4.060075283050537, + "learning_rate": 1.2274807625003598e-05, + "loss": 1.8724, + "step": 4839 + }, + { + "epoch": 0.7808970635688932, + "grad_norm": 4.429147720336914, + "learning_rate": 1.2257665292198461e-05, + "loss": 2.0031, + "step": 4840 + }, + { + "epoch": 0.7810584059373992, + "grad_norm": 3.9962515830993652, + "learning_rate": 1.2240533265403198e-05, + "loss": 1.7474, + "step": 4841 + }, + { + "epoch": 0.7812197483059051, + "grad_norm": 3.854008197784424, + "learning_rate": 1.2223411549295888e-05, + "loss": 1.8632, + "step": 4842 + }, + { + "epoch": 0.7813810906744111, + "grad_norm": 3.9905378818511963, + "learning_rate": 1.2206300148551848e-05, + "loss": 1.8312, + "step": 4843 + }, + { + "epoch": 0.781542433042917, + "grad_norm": 4.155596733093262, + "learning_rate": 1.2189199067843538e-05, + "loss": 2.0119, + "step": 4844 + }, + { + "epoch": 0.781703775411423, + "grad_norm": 3.753019332885742, + "learning_rate": 1.2172108311840641e-05, + "loss": 1.9925, + "step": 4845 + }, + { + "epoch": 0.7818651177799291, + "grad_norm": 4.133620262145996, + "learning_rate": 1.2155027885209991e-05, + "loss": 1.7772, + "step": 4846 + }, + { + "epoch": 0.782026460148435, + "grad_norm": 4.011654853820801, + "learning_rate": 1.213795779261559e-05, + "loss": 1.6683, + "step": 4847 + }, + { + "epoch": 0.782187802516941, + "grad_norm": 4.1891679763793945, + "learning_rate": 1.212089803871867e-05, + "loss": 1.8676, + "step": 4848 + }, + { + "epoch": 0.7823491448854469, + "grad_norm": 4.765373706817627, + "learning_rate": 1.2103848628177573e-05, + "loss": 1.952, + "step": 4849 + }, + { + "epoch": 0.7825104872539529, + "grad_norm": 4.962911128997803, + "learning_rate": 1.2086809565647878e-05, + "loss": 1.8573, + "step": 4850 + }, + { + "epoch": 0.7826718296224588, + "grad_norm": 3.755892276763916, + "learning_rate": 1.2069780855782304e-05, + "loss": 1.7528, + "step": 4851 + }, + { + "epoch": 0.7828331719909648, + "grad_norm": 5.934349060058594, + "learning_rate": 1.2052762503230746e-05, + "loss": 1.6815, + "step": 4852 + }, + { + "epoch": 0.7829945143594708, + "grad_norm": 4.053897857666016, + "learning_rate": 1.2035754512640262e-05, + "loss": 1.7697, + "step": 4853 + }, + { + "epoch": 0.7831558567279767, + "grad_norm": 4.091436386108398, + "learning_rate": 1.2018756888655125e-05, + "loss": 1.9912, + "step": 4854 + }, + { + "epoch": 0.7833171990964828, + "grad_norm": 4.620340347290039, + "learning_rate": 1.200176963591671e-05, + "loss": 1.8034, + "step": 4855 + }, + { + "epoch": 0.7834785414649887, + "grad_norm": 4.518535137176514, + "learning_rate": 1.198479275906363e-05, + "loss": 1.6941, + "step": 4856 + }, + { + "epoch": 0.7836398838334947, + "grad_norm": 6.105302810668945, + "learning_rate": 1.1967826262731602e-05, + "loss": 1.884, + "step": 4857 + }, + { + "epoch": 0.7838012262020007, + "grad_norm": 5.046043395996094, + "learning_rate": 1.1950870151553561e-05, + "loss": 2.0027, + "step": 4858 + }, + { + "epoch": 0.7839625685705066, + "grad_norm": 4.713179588317871, + "learning_rate": 1.1933924430159572e-05, + "loss": 1.767, + "step": 4859 + }, + { + "epoch": 0.7841239109390126, + "grad_norm": 5.794550895690918, + "learning_rate": 1.1916989103176856e-05, + "loss": 1.9028, + "step": 4860 + }, + { + "epoch": 0.7842852533075185, + "grad_norm": 4.4063825607299805, + "learning_rate": 1.1900064175229847e-05, + "loss": 1.7676, + "step": 4861 + }, + { + "epoch": 0.7844465956760245, + "grad_norm": 4.139499664306641, + "learning_rate": 1.1883149650940074e-05, + "loss": 1.9148, + "step": 4862 + }, + { + "epoch": 0.7846079380445304, + "grad_norm": 5.17821741104126, + "learning_rate": 1.186624553492628e-05, + "loss": 1.7091, + "step": 4863 + }, + { + "epoch": 0.7847692804130365, + "grad_norm": 3.906259059906006, + "learning_rate": 1.1849351831804318e-05, + "loss": 1.6162, + "step": 4864 + }, + { + "epoch": 0.7849306227815425, + "grad_norm": 4.134382724761963, + "learning_rate": 1.1832468546187247e-05, + "loss": 1.6765, + "step": 4865 + }, + { + "epoch": 0.7850919651500484, + "grad_norm": 4.796937465667725, + "learning_rate": 1.1815595682685237e-05, + "loss": 1.8752, + "step": 4866 + }, + { + "epoch": 0.7852533075185544, + "grad_norm": 5.201277732849121, + "learning_rate": 1.1798733245905651e-05, + "loss": 1.7716, + "step": 4867 + }, + { + "epoch": 0.7854146498870603, + "grad_norm": 4.218733310699463, + "learning_rate": 1.1781881240452958e-05, + "loss": 1.9781, + "step": 4868 + }, + { + "epoch": 0.7855759922555663, + "grad_norm": 4.711889266967773, + "learning_rate": 1.176503967092884e-05, + "loss": 1.8243, + "step": 4869 + }, + { + "epoch": 0.7857373346240722, + "grad_norm": 3.912487030029297, + "learning_rate": 1.1748208541932077e-05, + "loss": 1.7795, + "step": 4870 + }, + { + "epoch": 0.7858986769925782, + "grad_norm": 5.722700595855713, + "learning_rate": 1.1731387858058613e-05, + "loss": 1.7087, + "step": 4871 + }, + { + "epoch": 0.7860600193610843, + "grad_norm": 4.365345478057861, + "learning_rate": 1.1714577623901547e-05, + "loss": 1.8558, + "step": 4872 + }, + { + "epoch": 0.7862213617295902, + "grad_norm": 4.097417831420898, + "learning_rate": 1.1697777844051105e-05, + "loss": 1.823, + "step": 4873 + }, + { + "epoch": 0.7863827040980962, + "grad_norm": 5.112419605255127, + "learning_rate": 1.1680988523094705e-05, + "loss": 1.6119, + "step": 4874 + }, + { + "epoch": 0.7865440464666021, + "grad_norm": 5.568664073944092, + "learning_rate": 1.1664209665616849e-05, + "loss": 1.9551, + "step": 4875 + }, + { + "epoch": 0.7867053888351081, + "grad_norm": 3.6490912437438965, + "learning_rate": 1.1647441276199233e-05, + "loss": 1.8411, + "step": 4876 + }, + { + "epoch": 0.7868667312036141, + "grad_norm": 4.620687484741211, + "learning_rate": 1.1630683359420652e-05, + "loss": 1.8486, + "step": 4877 + }, + { + "epoch": 0.78702807357212, + "grad_norm": 3.4812748432159424, + "learning_rate": 1.1613935919857094e-05, + "loss": 1.9814, + "step": 4878 + }, + { + "epoch": 0.787189415940626, + "grad_norm": 4.480321407318115, + "learning_rate": 1.1597198962081612e-05, + "loss": 1.9138, + "step": 4879 + }, + { + "epoch": 0.7873507583091319, + "grad_norm": 4.20350456237793, + "learning_rate": 1.1580472490664474e-05, + "loss": 2.0126, + "step": 4880 + }, + { + "epoch": 0.787512100677638, + "grad_norm": 4.852173805236816, + "learning_rate": 1.1563756510173024e-05, + "loss": 1.748, + "step": 4881 + }, + { + "epoch": 0.787673443046144, + "grad_norm": 4.446038722991943, + "learning_rate": 1.154705102517179e-05, + "loss": 1.9694, + "step": 4882 + }, + { + "epoch": 0.7878347854146499, + "grad_norm": 7.211885452270508, + "learning_rate": 1.1530356040222402e-05, + "loss": 2.2675, + "step": 4883 + }, + { + "epoch": 0.7879961277831559, + "grad_norm": 4.495987415313721, + "learning_rate": 1.151367155988361e-05, + "loss": 1.9904, + "step": 4884 + }, + { + "epoch": 0.7881574701516618, + "grad_norm": 4.462154388427734, + "learning_rate": 1.149699758871135e-05, + "loss": 1.7895, + "step": 4885 + }, + { + "epoch": 0.7883188125201678, + "grad_norm": 4.5518622398376465, + "learning_rate": 1.1480334131258625e-05, + "loss": 1.7954, + "step": 4886 + }, + { + "epoch": 0.7884801548886737, + "grad_norm": 4.522104740142822, + "learning_rate": 1.1463681192075632e-05, + "loss": 1.914, + "step": 4887 + }, + { + "epoch": 0.7886414972571797, + "grad_norm": 6.0603251457214355, + "learning_rate": 1.1447038775709623e-05, + "loss": 1.6616, + "step": 4888 + }, + { + "epoch": 0.7888028396256858, + "grad_norm": 4.378868103027344, + "learning_rate": 1.1430406886705053e-05, + "loss": 2.0277, + "step": 4889 + }, + { + "epoch": 0.7889641819941917, + "grad_norm": 4.344919204711914, + "learning_rate": 1.1413785529603438e-05, + "loss": 1.9149, + "step": 4890 + }, + { + "epoch": 0.7891255243626977, + "grad_norm": 4.162896156311035, + "learning_rate": 1.1397174708943458e-05, + "loss": 1.6958, + "step": 4891 + }, + { + "epoch": 0.7892868667312036, + "grad_norm": 3.8431289196014404, + "learning_rate": 1.1380574429260881e-05, + "loss": 1.8051, + "step": 4892 + }, + { + "epoch": 0.7894482090997096, + "grad_norm": 4.48696231842041, + "learning_rate": 1.1363984695088653e-05, + "loss": 1.7665, + "step": 4893 + }, + { + "epoch": 0.7896095514682155, + "grad_norm": 5.2174811363220215, + "learning_rate": 1.1347405510956765e-05, + "loss": 1.773, + "step": 4894 + }, + { + "epoch": 0.7897708938367215, + "grad_norm": 4.319388389587402, + "learning_rate": 1.1330836881392404e-05, + "loss": 1.9333, + "step": 4895 + }, + { + "epoch": 0.7899322362052275, + "grad_norm": 4.041973114013672, + "learning_rate": 1.1314278810919826e-05, + "loss": 1.8259, + "step": 4896 + }, + { + "epoch": 0.7900935785737334, + "grad_norm": 4.26777982711792, + "learning_rate": 1.12977313040604e-05, + "loss": 1.8426, + "step": 4897 + }, + { + "epoch": 0.7902549209422395, + "grad_norm": 3.559138536453247, + "learning_rate": 1.1281194365332649e-05, + "loss": 1.8108, + "step": 4898 + }, + { + "epoch": 0.7904162633107454, + "grad_norm": 4.491950988769531, + "learning_rate": 1.1264667999252171e-05, + "loss": 2.0104, + "step": 4899 + }, + { + "epoch": 0.7905776056792514, + "grad_norm": 4.133874893188477, + "learning_rate": 1.1248152210331714e-05, + "loss": 1.7412, + "step": 4900 + }, + { + "epoch": 0.7907389480477574, + "grad_norm": 3.8438262939453125, + "learning_rate": 1.1231647003081092e-05, + "loss": 2.0872, + "step": 4901 + }, + { + "epoch": 0.7909002904162633, + "grad_norm": 4.686543941497803, + "learning_rate": 1.1215152382007283e-05, + "loss": 1.9076, + "step": 4902 + }, + { + "epoch": 0.7910616327847693, + "grad_norm": 4.718398094177246, + "learning_rate": 1.1198668351614323e-05, + "loss": 1.7459, + "step": 4903 + }, + { + "epoch": 0.7912229751532752, + "grad_norm": 4.179304599761963, + "learning_rate": 1.1182194916403399e-05, + "loss": 2.0392, + "step": 4904 + }, + { + "epoch": 0.7913843175217812, + "grad_norm": 4.1944899559021, + "learning_rate": 1.1165732080872766e-05, + "loss": 1.8769, + "step": 4905 + }, + { + "epoch": 0.7915456598902871, + "grad_norm": 4.269710063934326, + "learning_rate": 1.114927984951783e-05, + "loss": 1.5916, + "step": 4906 + }, + { + "epoch": 0.7917070022587932, + "grad_norm": 3.848909854888916, + "learning_rate": 1.1132838226831054e-05, + "loss": 1.916, + "step": 4907 + }, + { + "epoch": 0.7918683446272992, + "grad_norm": 4.461495399475098, + "learning_rate": 1.1116407217302027e-05, + "loss": 1.8363, + "step": 4908 + }, + { + "epoch": 0.7920296869958051, + "grad_norm": 3.3846654891967773, + "learning_rate": 1.1099986825417453e-05, + "loss": 1.9127, + "step": 4909 + }, + { + "epoch": 0.7921910293643111, + "grad_norm": 3.9858438968658447, + "learning_rate": 1.1083577055661116e-05, + "loss": 1.8888, + "step": 4910 + }, + { + "epoch": 0.792352371732817, + "grad_norm": 3.759572982788086, + "learning_rate": 1.1067177912513898e-05, + "loss": 2.0451, + "step": 4911 + }, + { + "epoch": 0.792513714101323, + "grad_norm": 4.379895210266113, + "learning_rate": 1.1050789400453782e-05, + "loss": 1.7487, + "step": 4912 + }, + { + "epoch": 0.7926750564698289, + "grad_norm": 9.255057334899902, + "learning_rate": 1.103441152395588e-05, + "loss": 1.7979, + "step": 4913 + }, + { + "epoch": 0.7928363988383349, + "grad_norm": 4.533437252044678, + "learning_rate": 1.1018044287492341e-05, + "loss": 1.9864, + "step": 4914 + }, + { + "epoch": 0.792997741206841, + "grad_norm": 3.9128003120422363, + "learning_rate": 1.100168769553247e-05, + "loss": 1.9298, + "step": 4915 + }, + { + "epoch": 0.7931590835753469, + "grad_norm": 4.127525806427002, + "learning_rate": 1.098534175254261e-05, + "loss": 1.7195, + "step": 4916 + }, + { + "epoch": 0.7933204259438529, + "grad_norm": 5.006974697113037, + "learning_rate": 1.0969006462986253e-05, + "loss": 1.998, + "step": 4917 + }, + { + "epoch": 0.7934817683123588, + "grad_norm": 4.356435775756836, + "learning_rate": 1.0952681831323914e-05, + "loss": 1.8028, + "step": 4918 + }, + { + "epoch": 0.7936431106808648, + "grad_norm": 4.146892070770264, + "learning_rate": 1.093636786201327e-05, + "loss": 1.8735, + "step": 4919 + }, + { + "epoch": 0.7938044530493708, + "grad_norm": 3.9896233081817627, + "learning_rate": 1.0920064559509025e-05, + "loss": 1.7779, + "step": 4920 + }, + { + "epoch": 0.7939657954178767, + "grad_norm": 5.546407222747803, + "learning_rate": 1.0903771928263018e-05, + "loss": 1.8785, + "step": 4921 + }, + { + "epoch": 0.7941271377863827, + "grad_norm": 4.734492301940918, + "learning_rate": 1.0887489972724141e-05, + "loss": 1.9074, + "step": 4922 + }, + { + "epoch": 0.7942884801548886, + "grad_norm": 5.046846866607666, + "learning_rate": 1.0871218697338376e-05, + "loss": 1.9277, + "step": 4923 + }, + { + "epoch": 0.7944498225233947, + "grad_norm": 5.657299518585205, + "learning_rate": 1.0854958106548812e-05, + "loss": 2.0755, + "step": 4924 + }, + { + "epoch": 0.7946111648919006, + "grad_norm": 6.1893630027771, + "learning_rate": 1.0838708204795584e-05, + "loss": 1.9284, + "step": 4925 + }, + { + "epoch": 0.7947725072604066, + "grad_norm": 5.536571025848389, + "learning_rate": 1.082246899651595e-05, + "loss": 2.0, + "step": 4926 + }, + { + "epoch": 0.7949338496289126, + "grad_norm": 4.260076522827148, + "learning_rate": 1.080624048614422e-05, + "loss": 1.7571, + "step": 4927 + }, + { + "epoch": 0.7950951919974185, + "grad_norm": 4.106614112854004, + "learning_rate": 1.0790022678111772e-05, + "loss": 1.7714, + "step": 4928 + }, + { + "epoch": 0.7952565343659245, + "grad_norm": 5.757980823516846, + "learning_rate": 1.0773815576847095e-05, + "loss": 1.6824, + "step": 4929 + }, + { + "epoch": 0.7954178767344304, + "grad_norm": 4.206118106842041, + "learning_rate": 1.075761918677574e-05, + "loss": 1.7748, + "step": 4930 + }, + { + "epoch": 0.7955792191029364, + "grad_norm": 4.439728260040283, + "learning_rate": 1.0741433512320315e-05, + "loss": 1.8324, + "step": 4931 + }, + { + "epoch": 0.7957405614714425, + "grad_norm": 3.572453022003174, + "learning_rate": 1.0725258557900537e-05, + "loss": 1.6651, + "step": 4932 + }, + { + "epoch": 0.7959019038399484, + "grad_norm": 4.678240776062012, + "learning_rate": 1.0709094327933155e-05, + "loss": 1.7703, + "step": 4933 + }, + { + "epoch": 0.7960632462084544, + "grad_norm": 5.155104637145996, + "learning_rate": 1.0692940826832038e-05, + "loss": 1.879, + "step": 4934 + }, + { + "epoch": 0.7962245885769603, + "grad_norm": 4.731137752532959, + "learning_rate": 1.0676798059008081e-05, + "loss": 1.6497, + "step": 4935 + }, + { + "epoch": 0.7963859309454663, + "grad_norm": 5.3584089279174805, + "learning_rate": 1.0660666028869254e-05, + "loss": 1.9967, + "step": 4936 + }, + { + "epoch": 0.7965472733139722, + "grad_norm": 4.699703216552734, + "learning_rate": 1.0644544740820638e-05, + "loss": 2.2507, + "step": 4937 + }, + { + "epoch": 0.7967086156824782, + "grad_norm": 4.3637776374816895, + "learning_rate": 1.062843419926432e-05, + "loss": 2.0056, + "step": 4938 + }, + { + "epoch": 0.7968699580509842, + "grad_norm": 4.8707594871521, + "learning_rate": 1.0612334408599512e-05, + "loss": 1.9799, + "step": 4939 + }, + { + "epoch": 0.7970313004194901, + "grad_norm": 5.19387674331665, + "learning_rate": 1.0596245373222424e-05, + "loss": 1.8095, + "step": 4940 + }, + { + "epoch": 0.7971926427879962, + "grad_norm": 5.282740116119385, + "learning_rate": 1.05801670975264e-05, + "loss": 1.7478, + "step": 4941 + }, + { + "epoch": 0.7973539851565021, + "grad_norm": 7.607383728027344, + "learning_rate": 1.0564099585901788e-05, + "loss": 1.9698, + "step": 4942 + }, + { + "epoch": 0.7975153275250081, + "grad_norm": 5.037977695465088, + "learning_rate": 1.0548042842736038e-05, + "loss": 1.8196, + "step": 4943 + }, + { + "epoch": 0.797676669893514, + "grad_norm": 4.223226547241211, + "learning_rate": 1.0531996872413618e-05, + "loss": 1.752, + "step": 4944 + }, + { + "epoch": 0.79783801226202, + "grad_norm": 6.1297221183776855, + "learning_rate": 1.0515961679316111e-05, + "loss": 2.0072, + "step": 4945 + }, + { + "epoch": 0.797999354630526, + "grad_norm": 4.56374454498291, + "learning_rate": 1.0499937267822101e-05, + "loss": 1.8715, + "step": 4946 + }, + { + "epoch": 0.7981606969990319, + "grad_norm": 4.783345699310303, + "learning_rate": 1.0483923642307258e-05, + "loss": 1.8331, + "step": 4947 + }, + { + "epoch": 0.7983220393675379, + "grad_norm": 4.3292765617370605, + "learning_rate": 1.0467920807144282e-05, + "loss": 1.7773, + "step": 4948 + }, + { + "epoch": 0.7984833817360439, + "grad_norm": 5.089447498321533, + "learning_rate": 1.0451928766702979e-05, + "loss": 1.9119, + "step": 4949 + }, + { + "epoch": 0.7986447241045499, + "grad_norm": 4.106072902679443, + "learning_rate": 1.0435947525350149e-05, + "loss": 1.7819, + "step": 4950 + }, + { + "epoch": 0.7988060664730559, + "grad_norm": 4.993464469909668, + "learning_rate": 1.0419977087449656e-05, + "loss": 1.9956, + "step": 4951 + }, + { + "epoch": 0.7989674088415618, + "grad_norm": 5.2668304443359375, + "learning_rate": 1.0404017457362459e-05, + "loss": 1.8632, + "step": 4952 + }, + { + "epoch": 0.7991287512100678, + "grad_norm": 5.383654594421387, + "learning_rate": 1.0388068639446502e-05, + "loss": 1.967, + "step": 4953 + }, + { + "epoch": 0.7992900935785737, + "grad_norm": 4.778904914855957, + "learning_rate": 1.0372130638056826e-05, + "loss": 1.6598, + "step": 4954 + }, + { + "epoch": 0.7994514359470797, + "grad_norm": 3.6620941162109375, + "learning_rate": 1.0356203457545483e-05, + "loss": 1.8743, + "step": 4955 + }, + { + "epoch": 0.7996127783155856, + "grad_norm": 4.586091041564941, + "learning_rate": 1.0340287102261603e-05, + "loss": 1.988, + "step": 4956 + }, + { + "epoch": 0.7997741206840916, + "grad_norm": 4.33573055267334, + "learning_rate": 1.0324381576551323e-05, + "loss": 1.7612, + "step": 4957 + }, + { + "epoch": 0.7999354630525977, + "grad_norm": 6.598884105682373, + "learning_rate": 1.0308486884757868e-05, + "loss": 1.9919, + "step": 4958 + }, + { + "epoch": 0.8000968054211036, + "grad_norm": 3.6663243770599365, + "learning_rate": 1.0292603031221465e-05, + "loss": 1.7532, + "step": 4959 + }, + { + "epoch": 0.8002581477896096, + "grad_norm": 4.019326210021973, + "learning_rate": 1.027673002027938e-05, + "loss": 1.7217, + "step": 4960 + }, + { + "epoch": 0.8004194901581155, + "grad_norm": 4.4263505935668945, + "learning_rate": 1.0260867856265966e-05, + "loss": 1.7163, + "step": 4961 + }, + { + "epoch": 0.8005808325266215, + "grad_norm": 3.918682813644409, + "learning_rate": 1.0245016543512553e-05, + "loss": 1.7715, + "step": 4962 + }, + { + "epoch": 0.8007421748951274, + "grad_norm": 5.8670148849487305, + "learning_rate": 1.022917608634757e-05, + "loss": 2.0226, + "step": 4963 + }, + { + "epoch": 0.8009035172636334, + "grad_norm": 4.7948455810546875, + "learning_rate": 1.0213346489096414e-05, + "loss": 1.841, + "step": 4964 + }, + { + "epoch": 0.8010648596321394, + "grad_norm": 4.6134209632873535, + "learning_rate": 1.0197527756081582e-05, + "loss": 1.7509, + "step": 4965 + }, + { + "epoch": 0.8012262020006453, + "grad_norm": 4.026442050933838, + "learning_rate": 1.018171989162256e-05, + "loss": 1.789, + "step": 4966 + }, + { + "epoch": 0.8013875443691514, + "grad_norm": 4.19586181640625, + "learning_rate": 1.0165922900035885e-05, + "loss": 1.7807, + "step": 4967 + }, + { + "epoch": 0.8015488867376573, + "grad_norm": 4.882951736450195, + "learning_rate": 1.0150136785635095e-05, + "loss": 1.9988, + "step": 4968 + }, + { + "epoch": 0.8017102291061633, + "grad_norm": 7.722875595092773, + "learning_rate": 1.0134361552730825e-05, + "loss": 1.7145, + "step": 4969 + }, + { + "epoch": 0.8018715714746693, + "grad_norm": 4.069356918334961, + "learning_rate": 1.0118597205630658e-05, + "loss": 1.761, + "step": 4970 + }, + { + "epoch": 0.8020329138431752, + "grad_norm": 4.985439300537109, + "learning_rate": 1.010284374863928e-05, + "loss": 2.0331, + "step": 4971 + }, + { + "epoch": 0.8021942562116812, + "grad_norm": 4.771203517913818, + "learning_rate": 1.0087101186058346e-05, + "loss": 2.0332, + "step": 4972 + }, + { + "epoch": 0.8023555985801871, + "grad_norm": 4.417211532592773, + "learning_rate": 1.0071369522186547e-05, + "loss": 1.7046, + "step": 4973 + }, + { + "epoch": 0.8025169409486931, + "grad_norm": 3.9871044158935547, + "learning_rate": 1.005564876131963e-05, + "loss": 1.9312, + "step": 4974 + }, + { + "epoch": 0.8026782833171991, + "grad_norm": 5.729042053222656, + "learning_rate": 1.0039938907750323e-05, + "loss": 1.8495, + "step": 4975 + }, + { + "epoch": 0.8028396256857051, + "grad_norm": 4.6541266441345215, + "learning_rate": 1.0024239965768418e-05, + "loss": 1.7877, + "step": 4976 + }, + { + "epoch": 0.8030009680542111, + "grad_norm": 4.373040199279785, + "learning_rate": 1.0008551939660676e-05, + "loss": 2.0013, + "step": 4977 + }, + { + "epoch": 0.803162310422717, + "grad_norm": 3.792407989501953, + "learning_rate": 9.992874833710936e-06, + "loss": 1.7861, + "step": 4978 + }, + { + "epoch": 0.803323652791223, + "grad_norm": 3.719904661178589, + "learning_rate": 9.9772086522e-06, + "loss": 1.6944, + "step": 4979 + }, + { + "epoch": 0.8034849951597289, + "grad_norm": 3.9930500984191895, + "learning_rate": 9.961553399405733e-06, + "loss": 1.4582, + "step": 4980 + }, + { + "epoch": 0.8036463375282349, + "grad_norm": 4.299803733825684, + "learning_rate": 9.945909079602966e-06, + "loss": 1.7848, + "step": 4981 + }, + { + "epoch": 0.8038076798967408, + "grad_norm": 4.049663066864014, + "learning_rate": 9.930275697063613e-06, + "loss": 1.7776, + "step": 4982 + }, + { + "epoch": 0.8039690222652468, + "grad_norm": 4.3470001220703125, + "learning_rate": 9.914653256056522e-06, + "loss": 2.0063, + "step": 4983 + }, + { + "epoch": 0.8041303646337529, + "grad_norm": 3.8651134967803955, + "learning_rate": 9.899041760847628e-06, + "loss": 1.8212, + "step": 4984 + }, + { + "epoch": 0.8042917070022588, + "grad_norm": 4.063155651092529, + "learning_rate": 9.883441215699823e-06, + "loss": 1.6792, + "step": 4985 + }, + { + "epoch": 0.8044530493707648, + "grad_norm": 5.242466449737549, + "learning_rate": 9.867851624873038e-06, + "loss": 1.7351, + "step": 4986 + }, + { + "epoch": 0.8046143917392707, + "grad_norm": 3.8467047214508057, + "learning_rate": 9.85227299262419e-06, + "loss": 1.8724, + "step": 4987 + }, + { + "epoch": 0.8047757341077767, + "grad_norm": 6.567809581756592, + "learning_rate": 9.836705323207207e-06, + "loss": 1.7028, + "step": 4988 + }, + { + "epoch": 0.8049370764762827, + "grad_norm": 5.676382064819336, + "learning_rate": 9.821148620873071e-06, + "loss": 1.9252, + "step": 4989 + }, + { + "epoch": 0.8050984188447886, + "grad_norm": 4.816730976104736, + "learning_rate": 9.805602889869692e-06, + "loss": 1.7533, + "step": 4990 + }, + { + "epoch": 0.8052597612132946, + "grad_norm": 4.336112022399902, + "learning_rate": 9.790068134442049e-06, + "loss": 1.957, + "step": 4991 + }, + { + "epoch": 0.8054211035818006, + "grad_norm": 3.976856231689453, + "learning_rate": 9.774544358832082e-06, + "loss": 1.8226, + "step": 4992 + }, + { + "epoch": 0.8055824459503066, + "grad_norm": 4.793515682220459, + "learning_rate": 9.75903156727877e-06, + "loss": 1.9546, + "step": 4993 + }, + { + "epoch": 0.8057437883188125, + "grad_norm": 6.302753448486328, + "learning_rate": 9.74352976401805e-06, + "loss": 2.0595, + "step": 4994 + }, + { + "epoch": 0.8059051306873185, + "grad_norm": 4.390596866607666, + "learning_rate": 9.728038953282903e-06, + "loss": 1.9161, + "step": 4995 + }, + { + "epoch": 0.8060664730558245, + "grad_norm": 4.317444324493408, + "learning_rate": 9.712559139303257e-06, + "loss": 1.7779, + "step": 4996 + }, + { + "epoch": 0.8062278154243304, + "grad_norm": 6.430931091308594, + "learning_rate": 9.697090326306097e-06, + "loss": 1.9661, + "step": 4997 + }, + { + "epoch": 0.8063891577928364, + "grad_norm": 4.240243911743164, + "learning_rate": 9.681632518515354e-06, + "loss": 1.7169, + "step": 4998 + }, + { + "epoch": 0.8065505001613423, + "grad_norm": 4.05634069442749, + "learning_rate": 9.666185720151965e-06, + "loss": 1.718, + "step": 4999 + }, + { + "epoch": 0.8067118425298483, + "grad_norm": 5.419493198394775, + "learning_rate": 9.6507499354339e-06, + "loss": 1.7795, + "step": 5000 + }, + { + "epoch": 0.8068731848983544, + "grad_norm": 3.757697820663452, + "learning_rate": 9.635325168576054e-06, + "loss": 1.6912, + "step": 5001 + }, + { + "epoch": 0.8070345272668603, + "grad_norm": 5.66853141784668, + "learning_rate": 9.619911423790378e-06, + "loss": 1.7352, + "step": 5002 + }, + { + "epoch": 0.8071958696353663, + "grad_norm": 5.9120306968688965, + "learning_rate": 9.604508705285764e-06, + "loss": 1.9152, + "step": 5003 + }, + { + "epoch": 0.8073572120038722, + "grad_norm": 4.561873912811279, + "learning_rate": 9.58911701726814e-06, + "loss": 2.0036, + "step": 5004 + }, + { + "epoch": 0.8075185543723782, + "grad_norm": 4.15148401260376, + "learning_rate": 9.573736363940377e-06, + "loss": 1.6839, + "step": 5005 + }, + { + "epoch": 0.8076798967408841, + "grad_norm": 3.8810439109802246, + "learning_rate": 9.558366749502357e-06, + "loss": 1.7462, + "step": 5006 + }, + { + "epoch": 0.8078412391093901, + "grad_norm": 4.477046489715576, + "learning_rate": 9.543008178150931e-06, + "loss": 1.7553, + "step": 5007 + }, + { + "epoch": 0.808002581477896, + "grad_norm": 5.181619167327881, + "learning_rate": 9.527660654079968e-06, + "loss": 1.8165, + "step": 5008 + }, + { + "epoch": 0.808163923846402, + "grad_norm": 4.163424015045166, + "learning_rate": 9.51232418148027e-06, + "loss": 1.8912, + "step": 5009 + }, + { + "epoch": 0.8083252662149081, + "grad_norm": 3.726431369781494, + "learning_rate": 9.496998764539684e-06, + "loss": 1.8877, + "step": 5010 + }, + { + "epoch": 0.808486608583414, + "grad_norm": 4.380525588989258, + "learning_rate": 9.481684407442987e-06, + "loss": 1.9208, + "step": 5011 + }, + { + "epoch": 0.80864795095192, + "grad_norm": 4.851559638977051, + "learning_rate": 9.466381114371941e-06, + "loss": 1.6984, + "step": 5012 + }, + { + "epoch": 0.808809293320426, + "grad_norm": 6.974335670471191, + "learning_rate": 9.451088889505321e-06, + "loss": 1.8413, + "step": 5013 + }, + { + "epoch": 0.8089706356889319, + "grad_norm": 3.3473620414733887, + "learning_rate": 9.435807737018842e-06, + "loss": 1.6117, + "step": 5014 + }, + { + "epoch": 0.8091319780574379, + "grad_norm": 5.172091007232666, + "learning_rate": 9.42053766108522e-06, + "loss": 1.9959, + "step": 5015 + }, + { + "epoch": 0.8092933204259438, + "grad_norm": 3.5614988803863525, + "learning_rate": 9.405278665874129e-06, + "loss": 1.7952, + "step": 5016 + }, + { + "epoch": 0.8094546627944498, + "grad_norm": 5.11036491394043, + "learning_rate": 9.390030755552242e-06, + "loss": 1.9466, + "step": 5017 + }, + { + "epoch": 0.8096160051629558, + "grad_norm": 4.175196170806885, + "learning_rate": 9.374793934283166e-06, + "loss": 1.9538, + "step": 5018 + }, + { + "epoch": 0.8097773475314618, + "grad_norm": 4.084559440612793, + "learning_rate": 9.359568206227525e-06, + "loss": 1.9314, + "step": 5019 + }, + { + "epoch": 0.8099386898999678, + "grad_norm": 4.597757816314697, + "learning_rate": 9.344353575542875e-06, + "loss": 1.8681, + "step": 5020 + }, + { + "epoch": 0.8101000322684737, + "grad_norm": 3.842285394668579, + "learning_rate": 9.329150046383772e-06, + "loss": 1.9994, + "step": 5021 + }, + { + "epoch": 0.8102613746369797, + "grad_norm": 4.087825298309326, + "learning_rate": 9.313957622901726e-06, + "loss": 1.9577, + "step": 5022 + }, + { + "epoch": 0.8104227170054856, + "grad_norm": 5.270120143890381, + "learning_rate": 9.298776309245194e-06, + "loss": 1.9583, + "step": 5023 + }, + { + "epoch": 0.8105840593739916, + "grad_norm": 4.861396789550781, + "learning_rate": 9.283606109559644e-06, + "loss": 2.0954, + "step": 5024 + }, + { + "epoch": 0.8107454017424975, + "grad_norm": 5.2622199058532715, + "learning_rate": 9.268447027987488e-06, + "loss": 1.9247, + "step": 5025 + }, + { + "epoch": 0.8109067441110035, + "grad_norm": 4.728799343109131, + "learning_rate": 9.253299068668086e-06, + "loss": 1.838, + "step": 5026 + }, + { + "epoch": 0.8110680864795096, + "grad_norm": 3.924114227294922, + "learning_rate": 9.238162235737768e-06, + "loss": 1.671, + "step": 5027 + }, + { + "epoch": 0.8112294288480155, + "grad_norm": 6.050514221191406, + "learning_rate": 9.22303653332986e-06, + "loss": 1.9685, + "step": 5028 + }, + { + "epoch": 0.8113907712165215, + "grad_norm": 4.385323524475098, + "learning_rate": 9.207921965574594e-06, + "loss": 1.8523, + "step": 5029 + }, + { + "epoch": 0.8115521135850274, + "grad_norm": 3.637460470199585, + "learning_rate": 9.192818536599213e-06, + "loss": 1.7259, + "step": 5030 + }, + { + "epoch": 0.8117134559535334, + "grad_norm": 4.110119819641113, + "learning_rate": 9.177726250527868e-06, + "loss": 1.8351, + "step": 5031 + }, + { + "epoch": 0.8118747983220393, + "grad_norm": 4.287790775299072, + "learning_rate": 9.162645111481727e-06, + "loss": 2.0917, + "step": 5032 + }, + { + "epoch": 0.8120361406905453, + "grad_norm": 4.925917625427246, + "learning_rate": 9.147575123578844e-06, + "loss": 1.7523, + "step": 5033 + }, + { + "epoch": 0.8121974830590513, + "grad_norm": 7.13510274887085, + "learning_rate": 9.132516290934301e-06, + "loss": 2.1304, + "step": 5034 + }, + { + "epoch": 0.8123588254275573, + "grad_norm": 5.606991291046143, + "learning_rate": 9.11746861766008e-06, + "loss": 1.7848, + "step": 5035 + }, + { + "epoch": 0.8125201677960633, + "grad_norm": 6.008810997009277, + "learning_rate": 9.102432107865121e-06, + "loss": 1.9205, + "step": 5036 + }, + { + "epoch": 0.8126815101645692, + "grad_norm": 4.687321186065674, + "learning_rate": 9.087406765655355e-06, + "loss": 1.7783, + "step": 5037 + }, + { + "epoch": 0.8128428525330752, + "grad_norm": 4.49308967590332, + "learning_rate": 9.07239259513361e-06, + "loss": 1.7734, + "step": 5038 + }, + { + "epoch": 0.8130041949015812, + "grad_norm": 5.762004852294922, + "learning_rate": 9.057389600399719e-06, + "loss": 1.9634, + "step": 5039 + }, + { + "epoch": 0.8131655372700871, + "grad_norm": 4.135660171508789, + "learning_rate": 9.042397785550405e-06, + "loss": 1.6049, + "step": 5040 + }, + { + "epoch": 0.8133268796385931, + "grad_norm": 4.049421310424805, + "learning_rate": 9.027417154679396e-06, + "loss": 1.7663, + "step": 5041 + }, + { + "epoch": 0.813488222007099, + "grad_norm": 3.8373448848724365, + "learning_rate": 9.012447711877332e-06, + "loss": 1.7997, + "step": 5042 + }, + { + "epoch": 0.813649564375605, + "grad_norm": 4.999689102172852, + "learning_rate": 8.997489461231772e-06, + "loss": 2.0029, + "step": 5043 + }, + { + "epoch": 0.813810906744111, + "grad_norm": 4.012880802154541, + "learning_rate": 8.9825424068273e-06, + "loss": 1.7545, + "step": 5044 + }, + { + "epoch": 0.813972249112617, + "grad_norm": 4.000550270080566, + "learning_rate": 8.967606552745361e-06, + "loss": 1.8365, + "step": 5045 + }, + { + "epoch": 0.814133591481123, + "grad_norm": 4.201355934143066, + "learning_rate": 8.952681903064374e-06, + "loss": 1.9151, + "step": 5046 + }, + { + "epoch": 0.8142949338496289, + "grad_norm": 7.912868499755859, + "learning_rate": 8.937768461859714e-06, + "loss": 1.6631, + "step": 5047 + }, + { + "epoch": 0.8144562762181349, + "grad_norm": 3.9873435497283936, + "learning_rate": 8.92286623320368e-06, + "loss": 1.8191, + "step": 5048 + }, + { + "epoch": 0.8146176185866408, + "grad_norm": 4.569661617279053, + "learning_rate": 8.907975221165481e-06, + "loss": 2.1193, + "step": 5049 + }, + { + "epoch": 0.8147789609551468, + "grad_norm": 4.338842868804932, + "learning_rate": 8.893095429811332e-06, + "loss": 1.9049, + "step": 5050 + }, + { + "epoch": 0.8149403033236527, + "grad_norm": 5.899425029754639, + "learning_rate": 8.878226863204309e-06, + "loss": 2.068, + "step": 5051 + }, + { + "epoch": 0.8151016456921588, + "grad_norm": 5.389956951141357, + "learning_rate": 8.863369525404485e-06, + "loss": 1.6494, + "step": 5052 + }, + { + "epoch": 0.8152629880606648, + "grad_norm": 5.630323886871338, + "learning_rate": 8.848523420468818e-06, + "loss": 1.9578, + "step": 5053 + }, + { + "epoch": 0.8154243304291707, + "grad_norm": 4.566960334777832, + "learning_rate": 8.833688552451236e-06, + "loss": 1.8953, + "step": 5054 + }, + { + "epoch": 0.8155856727976767, + "grad_norm": 5.83742094039917, + "learning_rate": 8.818864925402564e-06, + "loss": 1.7418, + "step": 5055 + }, + { + "epoch": 0.8157470151661826, + "grad_norm": 3.897963285446167, + "learning_rate": 8.8040525433706e-06, + "loss": 2.1279, + "step": 5056 + }, + { + "epoch": 0.8159083575346886, + "grad_norm": 4.441431999206543, + "learning_rate": 8.789251410400023e-06, + "loss": 1.7854, + "step": 5057 + }, + { + "epoch": 0.8160696999031946, + "grad_norm": 6.147578716278076, + "learning_rate": 8.77446153053249e-06, + "loss": 1.9488, + "step": 5058 + }, + { + "epoch": 0.8162310422717005, + "grad_norm": 4.624359130859375, + "learning_rate": 8.759682907806537e-06, + "loss": 1.6247, + "step": 5059 + }, + { + "epoch": 0.8163923846402065, + "grad_norm": 5.683760643005371, + "learning_rate": 8.744915546257671e-06, + "loss": 2.0023, + "step": 5060 + }, + { + "epoch": 0.8165537270087125, + "grad_norm": 4.435068130493164, + "learning_rate": 8.730159449918285e-06, + "loss": 2.0267, + "step": 5061 + }, + { + "epoch": 0.8167150693772185, + "grad_norm": 5.430092811584473, + "learning_rate": 8.715414622817708e-06, + "loss": 1.8278, + "step": 5062 + }, + { + "epoch": 0.8168764117457245, + "grad_norm": 3.4103713035583496, + "learning_rate": 8.700681068982225e-06, + "loss": 1.6731, + "step": 5063 + }, + { + "epoch": 0.8170377541142304, + "grad_norm": 4.3300042152404785, + "learning_rate": 8.685958792434989e-06, + "loss": 1.9492, + "step": 5064 + }, + { + "epoch": 0.8171990964827364, + "grad_norm": 4.567752361297607, + "learning_rate": 8.671247797196113e-06, + "loss": 1.8913, + "step": 5065 + }, + { + "epoch": 0.8173604388512423, + "grad_norm": 4.536086559295654, + "learning_rate": 8.65654808728259e-06, + "loss": 2.0211, + "step": 5066 + }, + { + "epoch": 0.8175217812197483, + "grad_norm": 4.3093438148498535, + "learning_rate": 8.641859666708397e-06, + "loss": 1.8431, + "step": 5067 + }, + { + "epoch": 0.8176831235882542, + "grad_norm": 4.09118127822876, + "learning_rate": 8.627182539484353e-06, + "loss": 1.7744, + "step": 5068 + }, + { + "epoch": 0.8178444659567602, + "grad_norm": 5.769215106964111, + "learning_rate": 8.612516709618252e-06, + "loss": 1.9448, + "step": 5069 + }, + { + "epoch": 0.8180058083252663, + "grad_norm": 4.4861297607421875, + "learning_rate": 8.597862181114764e-06, + "loss": 1.4927, + "step": 5070 + }, + { + "epoch": 0.8181671506937722, + "grad_norm": 4.781145095825195, + "learning_rate": 8.583218957975504e-06, + "loss": 1.849, + "step": 5071 + }, + { + "epoch": 0.8183284930622782, + "grad_norm": 5.037804126739502, + "learning_rate": 8.568587044198968e-06, + "loss": 1.7988, + "step": 5072 + }, + { + "epoch": 0.8184898354307841, + "grad_norm": 4.66380500793457, + "learning_rate": 8.553966443780599e-06, + "loss": 1.819, + "step": 5073 + }, + { + "epoch": 0.8186511777992901, + "grad_norm": 5.495998859405518, + "learning_rate": 8.539357160712718e-06, + "loss": 2.0507, + "step": 5074 + }, + { + "epoch": 0.818812520167796, + "grad_norm": 4.616162300109863, + "learning_rate": 8.524759198984566e-06, + "loss": 1.9237, + "step": 5075 + }, + { + "epoch": 0.818973862536302, + "grad_norm": 4.727950572967529, + "learning_rate": 8.51017256258232e-06, + "loss": 2.0016, + "step": 5076 + }, + { + "epoch": 0.819135204904808, + "grad_norm": 3.9767277240753174, + "learning_rate": 8.495597255489007e-06, + "loss": 2.0227, + "step": 5077 + }, + { + "epoch": 0.819296547273314, + "grad_norm": 3.8664677143096924, + "learning_rate": 8.481033281684631e-06, + "loss": 1.7761, + "step": 5078 + }, + { + "epoch": 0.81945788964182, + "grad_norm": 4.328319549560547, + "learning_rate": 8.46648064514603e-06, + "loss": 2.0187, + "step": 5079 + }, + { + "epoch": 0.8196192320103259, + "grad_norm": 7.903204917907715, + "learning_rate": 8.45193934984701e-06, + "loss": 2.0693, + "step": 5080 + }, + { + "epoch": 0.8197805743788319, + "grad_norm": 4.029341220855713, + "learning_rate": 8.437409399758234e-06, + "loss": 1.9574, + "step": 5081 + }, + { + "epoch": 0.8199419167473379, + "grad_norm": 5.163949966430664, + "learning_rate": 8.422890798847282e-06, + "loss": 1.5796, + "step": 5082 + }, + { + "epoch": 0.8201032591158438, + "grad_norm": 4.17691707611084, + "learning_rate": 8.408383551078652e-06, + "loss": 1.965, + "step": 5083 + }, + { + "epoch": 0.8202646014843498, + "grad_norm": 4.767606735229492, + "learning_rate": 8.393887660413719e-06, + "loss": 2.0412, + "step": 5084 + }, + { + "epoch": 0.8204259438528557, + "grad_norm": 4.345592498779297, + "learning_rate": 8.379403130810764e-06, + "loss": 1.6822, + "step": 5085 + }, + { + "epoch": 0.8205872862213617, + "grad_norm": 4.43311882019043, + "learning_rate": 8.364929966224955e-06, + "loss": 1.7265, + "step": 5086 + }, + { + "epoch": 0.8207486285898677, + "grad_norm": 3.951153516769409, + "learning_rate": 8.350468170608394e-06, + "loss": 1.7785, + "step": 5087 + }, + { + "epoch": 0.8209099709583737, + "grad_norm": 4.090831756591797, + "learning_rate": 8.336017747910019e-06, + "loss": 1.7555, + "step": 5088 + }, + { + "epoch": 0.8210713133268797, + "grad_norm": 4.850238800048828, + "learning_rate": 8.321578702075733e-06, + "loss": 1.5936, + "step": 5089 + }, + { + "epoch": 0.8212326556953856, + "grad_norm": 4.753424167633057, + "learning_rate": 8.30715103704826e-06, + "loss": 1.7142, + "step": 5090 + }, + { + "epoch": 0.8213939980638916, + "grad_norm": 4.926393508911133, + "learning_rate": 8.292734756767284e-06, + "loss": 1.8298, + "step": 5091 + }, + { + "epoch": 0.8215553404323975, + "grad_norm": 5.0380940437316895, + "learning_rate": 8.278329865169321e-06, + "loss": 2.0694, + "step": 5092 + }, + { + "epoch": 0.8217166828009035, + "grad_norm": 4.265560626983643, + "learning_rate": 8.263936366187824e-06, + "loss": 1.8107, + "step": 5093 + }, + { + "epoch": 0.8218780251694094, + "grad_norm": 4.015749454498291, + "learning_rate": 8.2495542637531e-06, + "loss": 1.8572, + "step": 5094 + }, + { + "epoch": 0.8220393675379155, + "grad_norm": 4.692224502563477, + "learning_rate": 8.235183561792382e-06, + "loss": 1.679, + "step": 5095 + }, + { + "epoch": 0.8222007099064215, + "grad_norm": 3.4330830574035645, + "learning_rate": 8.220824264229737e-06, + "loss": 1.7406, + "step": 5096 + }, + { + "epoch": 0.8223620522749274, + "grad_norm": 4.453279972076416, + "learning_rate": 8.206476374986178e-06, + "loss": 1.9318, + "step": 5097 + }, + { + "epoch": 0.8225233946434334, + "grad_norm": 4.732417106628418, + "learning_rate": 8.192139897979556e-06, + "loss": 1.7594, + "step": 5098 + }, + { + "epoch": 0.8226847370119393, + "grad_norm": 5.123412132263184, + "learning_rate": 8.17781483712462e-06, + "loss": 2.0592, + "step": 5099 + }, + { + "epoch": 0.8228460793804453, + "grad_norm": 4.737793445587158, + "learning_rate": 8.163501196333018e-06, + "loss": 1.9146, + "step": 5100 + }, + { + "epoch": 0.8230074217489513, + "grad_norm": 4.241943359375, + "learning_rate": 8.149198979513257e-06, + "loss": 1.8821, + "step": 5101 + }, + { + "epoch": 0.8231687641174572, + "grad_norm": 4.252610206604004, + "learning_rate": 8.134908190570723e-06, + "loss": 1.8205, + "step": 5102 + }, + { + "epoch": 0.8233301064859632, + "grad_norm": 4.118143081665039, + "learning_rate": 8.120628833407717e-06, + "loss": 1.6768, + "step": 5103 + }, + { + "epoch": 0.8234914488544692, + "grad_norm": 5.339510917663574, + "learning_rate": 8.106360911923382e-06, + "loss": 2.15, + "step": 5104 + }, + { + "epoch": 0.8236527912229752, + "grad_norm": 4.20247745513916, + "learning_rate": 8.092104430013736e-06, + "loss": 1.7785, + "step": 5105 + }, + { + "epoch": 0.8238141335914811, + "grad_norm": 3.4559247493743896, + "learning_rate": 8.077859391571712e-06, + "loss": 1.865, + "step": 5106 + }, + { + "epoch": 0.8239754759599871, + "grad_norm": 4.361405372619629, + "learning_rate": 8.063625800487067e-06, + "loss": 1.7921, + "step": 5107 + }, + { + "epoch": 0.8241368183284931, + "grad_norm": 3.9835152626037598, + "learning_rate": 8.049403660646487e-06, + "loss": 1.9069, + "step": 5108 + }, + { + "epoch": 0.824298160696999, + "grad_norm": 3.749183416366577, + "learning_rate": 8.035192975933476e-06, + "loss": 1.6726, + "step": 5109 + }, + { + "epoch": 0.824459503065505, + "grad_norm": 4.866585731506348, + "learning_rate": 8.020993750228461e-06, + "loss": 1.7821, + "step": 5110 + }, + { + "epoch": 0.8246208454340109, + "grad_norm": 5.902980804443359, + "learning_rate": 8.006805987408705e-06, + "loss": 1.8989, + "step": 5111 + }, + { + "epoch": 0.8247821878025169, + "grad_norm": 4.18698787689209, + "learning_rate": 7.992629691348335e-06, + "loss": 1.848, + "step": 5112 + }, + { + "epoch": 0.824943530171023, + "grad_norm": 4.467362880706787, + "learning_rate": 7.978464865918395e-06, + "loss": 1.9784, + "step": 5113 + }, + { + "epoch": 0.8251048725395289, + "grad_norm": 4.428437232971191, + "learning_rate": 7.964311514986733e-06, + "loss": 1.9523, + "step": 5114 + }, + { + "epoch": 0.8252662149080349, + "grad_norm": 5.125341415405273, + "learning_rate": 7.950169642418126e-06, + "loss": 1.8608, + "step": 5115 + }, + { + "epoch": 0.8254275572765408, + "grad_norm": 4.956939220428467, + "learning_rate": 7.936039252074157e-06, + "loss": 1.6212, + "step": 5116 + }, + { + "epoch": 0.8255888996450468, + "grad_norm": 5.354269504547119, + "learning_rate": 7.921920347813333e-06, + "loss": 1.9752, + "step": 5117 + }, + { + "epoch": 0.8257502420135527, + "grad_norm": 3.843909502029419, + "learning_rate": 7.907812933490971e-06, + "loss": 1.8323, + "step": 5118 + }, + { + "epoch": 0.8259115843820587, + "grad_norm": 4.785239219665527, + "learning_rate": 7.893717012959296e-06, + "loss": 1.8471, + "step": 5119 + }, + { + "epoch": 0.8260729267505647, + "grad_norm": 5.060910224914551, + "learning_rate": 7.879632590067353e-06, + "loss": 1.9037, + "step": 5120 + }, + { + "epoch": 0.8262342691190707, + "grad_norm": 3.5868513584136963, + "learning_rate": 7.865559668661088e-06, + "loss": 1.5919, + "step": 5121 + }, + { + "epoch": 0.8263956114875767, + "grad_norm": 3.619961738586426, + "learning_rate": 7.85149825258325e-06, + "loss": 1.7938, + "step": 5122 + }, + { + "epoch": 0.8265569538560826, + "grad_norm": 5.197341442108154, + "learning_rate": 7.837448345673526e-06, + "loss": 1.7578, + "step": 5123 + }, + { + "epoch": 0.8267182962245886, + "grad_norm": 4.56782341003418, + "learning_rate": 7.82340995176839e-06, + "loss": 1.9851, + "step": 5124 + }, + { + "epoch": 0.8268796385930945, + "grad_norm": 4.951657295227051, + "learning_rate": 7.809383074701193e-06, + "loss": 1.7909, + "step": 5125 + }, + { + "epoch": 0.8270409809616005, + "grad_norm": 4.474319934844971, + "learning_rate": 7.79536771830217e-06, + "loss": 1.8335, + "step": 5126 + }, + { + "epoch": 0.8272023233301065, + "grad_norm": 4.652566909790039, + "learning_rate": 7.781363886398363e-06, + "loss": 1.8465, + "step": 5127 + }, + { + "epoch": 0.8273636656986124, + "grad_norm": 3.9626662731170654, + "learning_rate": 7.767371582813715e-06, + "loss": 1.6353, + "step": 5128 + }, + { + "epoch": 0.8275250080671184, + "grad_norm": 5.283198356628418, + "learning_rate": 7.753390811368971e-06, + "loss": 1.5737, + "step": 5129 + }, + { + "epoch": 0.8276863504356244, + "grad_norm": 4.049074649810791, + "learning_rate": 7.739421575881783e-06, + "loss": 2.0153, + "step": 5130 + }, + { + "epoch": 0.8278476928041304, + "grad_norm": 5.280628204345703, + "learning_rate": 7.725463880166589e-06, + "loss": 2.0416, + "step": 5131 + }, + { + "epoch": 0.8280090351726364, + "grad_norm": 4.2782769203186035, + "learning_rate": 7.711517728034746e-06, + "loss": 1.7204, + "step": 5132 + }, + { + "epoch": 0.8281703775411423, + "grad_norm": 4.3173136711120605, + "learning_rate": 7.697583123294388e-06, + "loss": 1.8216, + "step": 5133 + }, + { + "epoch": 0.8283317199096483, + "grad_norm": 4.055956840515137, + "learning_rate": 7.683660069750559e-06, + "loss": 1.7376, + "step": 5134 + }, + { + "epoch": 0.8284930622781542, + "grad_norm": 4.467798709869385, + "learning_rate": 7.6697485712051e-06, + "loss": 1.9001, + "step": 5135 + }, + { + "epoch": 0.8286544046466602, + "grad_norm": 5.5613813400268555, + "learning_rate": 7.65584863145673e-06, + "loss": 1.9246, + "step": 5136 + }, + { + "epoch": 0.8288157470151661, + "grad_norm": 4.992427825927734, + "learning_rate": 7.641960254301e-06, + "loss": 1.8027, + "step": 5137 + }, + { + "epoch": 0.8289770893836722, + "grad_norm": 3.728839874267578, + "learning_rate": 7.6280834435302876e-06, + "loss": 1.8116, + "step": 5138 + }, + { + "epoch": 0.8291384317521782, + "grad_norm": 3.922977924346924, + "learning_rate": 7.6142182029338424e-06, + "loss": 1.7641, + "step": 5139 + }, + { + "epoch": 0.8292997741206841, + "grad_norm": 4.391849994659424, + "learning_rate": 7.600364536297738e-06, + "loss": 1.9565, + "step": 5140 + }, + { + "epoch": 0.8294611164891901, + "grad_norm": 4.4909515380859375, + "learning_rate": 7.586522447404882e-06, + "loss": 1.8532, + "step": 5141 + }, + { + "epoch": 0.829622458857696, + "grad_norm": 4.394071578979492, + "learning_rate": 7.57269194003502e-06, + "loss": 1.5828, + "step": 5142 + }, + { + "epoch": 0.829783801226202, + "grad_norm": 4.511841297149658, + "learning_rate": 7.5588730179647625e-06, + "loss": 1.8064, + "step": 5143 + }, + { + "epoch": 0.829945143594708, + "grad_norm": 8.176545143127441, + "learning_rate": 7.545065684967517e-06, + "loss": 2.1051, + "step": 5144 + }, + { + "epoch": 0.8301064859632139, + "grad_norm": 3.9797441959381104, + "learning_rate": 7.531269944813568e-06, + "loss": 1.9633, + "step": 5145 + }, + { + "epoch": 0.8302678283317199, + "grad_norm": 4.101449012756348, + "learning_rate": 7.517485801269986e-06, + "loss": 1.6769, + "step": 5146 + }, + { + "epoch": 0.8304291707002259, + "grad_norm": 3.9880740642547607, + "learning_rate": 7.503713258100725e-06, + "loss": 1.8013, + "step": 5147 + }, + { + "epoch": 0.8305905130687319, + "grad_norm": 4.547024250030518, + "learning_rate": 7.489952319066529e-06, + "loss": 1.9486, + "step": 5148 + }, + { + "epoch": 0.8307518554372378, + "grad_norm": 4.98875617980957, + "learning_rate": 7.476202987925013e-06, + "loss": 1.8601, + "step": 5149 + }, + { + "epoch": 0.8309131978057438, + "grad_norm": 4.726477146148682, + "learning_rate": 7.462465268430591e-06, + "loss": 1.6014, + "step": 5150 + }, + { + "epoch": 0.8310745401742498, + "grad_norm": 3.761723756790161, + "learning_rate": 7.448739164334501e-06, + "loss": 1.8683, + "step": 5151 + }, + { + "epoch": 0.8312358825427557, + "grad_norm": 4.3507080078125, + "learning_rate": 7.43502467938485e-06, + "loss": 1.938, + "step": 5152 + }, + { + "epoch": 0.8313972249112617, + "grad_norm": 4.776767730712891, + "learning_rate": 7.421321817326526e-06, + "loss": 1.711, + "step": 5153 + }, + { + "epoch": 0.8315585672797676, + "grad_norm": 4.126363277435303, + "learning_rate": 7.407630581901293e-06, + "loss": 1.6088, + "step": 5154 + }, + { + "epoch": 0.8317199096482737, + "grad_norm": 3.9798381328582764, + "learning_rate": 7.393950976847674e-06, + "loss": 1.6647, + "step": 5155 + }, + { + "epoch": 0.8318812520167796, + "grad_norm": 4.233016490936279, + "learning_rate": 7.380283005901084e-06, + "loss": 1.758, + "step": 5156 + }, + { + "epoch": 0.8320425943852856, + "grad_norm": 4.131165504455566, + "learning_rate": 7.366626672793714e-06, + "loss": 1.6231, + "step": 5157 + }, + { + "epoch": 0.8322039367537916, + "grad_norm": 4.439856052398682, + "learning_rate": 7.352981981254608e-06, + "loss": 1.8005, + "step": 5158 + }, + { + "epoch": 0.8323652791222975, + "grad_norm": 6.2160186767578125, + "learning_rate": 7.339348935009616e-06, + "loss": 1.7089, + "step": 5159 + }, + { + "epoch": 0.8325266214908035, + "grad_norm": 5.1408796310424805, + "learning_rate": 7.325727537781396e-06, + "loss": 1.622, + "step": 5160 + }, + { + "epoch": 0.8326879638593094, + "grad_norm": 3.8749024868011475, + "learning_rate": 7.312117793289447e-06, + "loss": 1.7168, + "step": 5161 + }, + { + "epoch": 0.8328493062278154, + "grad_norm": 5.038416862487793, + "learning_rate": 7.298519705250067e-06, + "loss": 2.0763, + "step": 5162 + }, + { + "epoch": 0.8330106485963213, + "grad_norm": 4.5244526863098145, + "learning_rate": 7.284933277376404e-06, + "loss": 1.8442, + "step": 5163 + }, + { + "epoch": 0.8331719909648274, + "grad_norm": 5.41386079788208, + "learning_rate": 7.271358513378368e-06, + "loss": 1.6265, + "step": 5164 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 4.516528129577637, + "learning_rate": 7.257795416962753e-06, + "loss": 1.7632, + "step": 5165 + }, + { + "epoch": 0.8334946757018393, + "grad_norm": 4.514156341552734, + "learning_rate": 7.244243991833094e-06, + "loss": 1.7597, + "step": 5166 + }, + { + "epoch": 0.8336560180703453, + "grad_norm": 5.281730651855469, + "learning_rate": 7.230704241689806e-06, + "loss": 2.0508, + "step": 5167 + }, + { + "epoch": 0.8338173604388512, + "grad_norm": 5.620670318603516, + "learning_rate": 7.217176170230056e-06, + "loss": 1.9524, + "step": 5168 + }, + { + "epoch": 0.8339787028073572, + "grad_norm": 4.294712543487549, + "learning_rate": 7.203659781147881e-06, + "loss": 2.0827, + "step": 5169 + }, + { + "epoch": 0.8341400451758632, + "grad_norm": 3.571934461593628, + "learning_rate": 7.190155078134064e-06, + "loss": 1.8611, + "step": 5170 + }, + { + "epoch": 0.8343013875443691, + "grad_norm": 4.268311023712158, + "learning_rate": 7.1766620648762665e-06, + "loss": 1.8018, + "step": 5171 + }, + { + "epoch": 0.8344627299128751, + "grad_norm": 5.438915252685547, + "learning_rate": 7.163180745058889e-06, + "loss": 1.8312, + "step": 5172 + }, + { + "epoch": 0.8346240722813811, + "grad_norm": 5.227361679077148, + "learning_rate": 7.149711122363201e-06, + "loss": 1.9746, + "step": 5173 + }, + { + "epoch": 0.8347854146498871, + "grad_norm": 4.696291446685791, + "learning_rate": 7.13625320046723e-06, + "loss": 1.9036, + "step": 5174 + }, + { + "epoch": 0.834946757018393, + "grad_norm": 4.491235256195068, + "learning_rate": 7.1228069830458264e-06, + "loss": 1.9708, + "step": 5175 + }, + { + "epoch": 0.835108099386899, + "grad_norm": 4.827406883239746, + "learning_rate": 7.109372473770659e-06, + "loss": 1.7651, + "step": 5176 + }, + { + "epoch": 0.835269441755405, + "grad_norm": 4.635233402252197, + "learning_rate": 7.095949676310171e-06, + "loss": 1.8846, + "step": 5177 + }, + { + "epoch": 0.8354307841239109, + "grad_norm": 5.021635055541992, + "learning_rate": 7.082538594329641e-06, + "loss": 2.1273, + "step": 5178 + }, + { + "epoch": 0.8355921264924169, + "grad_norm": 5.73133659362793, + "learning_rate": 7.069139231491118e-06, + "loss": 1.8384, + "step": 5179 + }, + { + "epoch": 0.8357534688609228, + "grad_norm": 6.3099188804626465, + "learning_rate": 7.055751591453469e-06, + "loss": 2.0464, + "step": 5180 + }, + { + "epoch": 0.8359148112294289, + "grad_norm": 4.243574142456055, + "learning_rate": 7.0423756778723375e-06, + "loss": 2.0335, + "step": 5181 + }, + { + "epoch": 0.8360761535979349, + "grad_norm": 3.7860779762268066, + "learning_rate": 7.0290114944002065e-06, + "loss": 1.9782, + "step": 5182 + }, + { + "epoch": 0.8362374959664408, + "grad_norm": 5.813986301422119, + "learning_rate": 7.015659044686307e-06, + "loss": 1.9711, + "step": 5183 + }, + { + "epoch": 0.8363988383349468, + "grad_norm": 4.148378849029541, + "learning_rate": 7.002318332376712e-06, + "loss": 1.925, + "step": 5184 + }, + { + "epoch": 0.8365601807034527, + "grad_norm": 3.4092843532562256, + "learning_rate": 6.988989361114251e-06, + "loss": 1.7628, + "step": 5185 + }, + { + "epoch": 0.8367215230719587, + "grad_norm": 4.043959617614746, + "learning_rate": 6.97567213453858e-06, + "loss": 1.7128, + "step": 5186 + }, + { + "epoch": 0.8368828654404646, + "grad_norm": 4.311399936676025, + "learning_rate": 6.962366656286118e-06, + "loss": 1.8537, + "step": 5187 + }, + { + "epoch": 0.8370442078089706, + "grad_norm": 4.286233901977539, + "learning_rate": 6.949072929990091e-06, + "loss": 1.8384, + "step": 5188 + }, + { + "epoch": 0.8372055501774766, + "grad_norm": 3.0191707611083984, + "learning_rate": 6.935790959280525e-06, + "loss": 1.804, + "step": 5189 + }, + { + "epoch": 0.8373668925459826, + "grad_norm": 4.2001729011535645, + "learning_rate": 6.922520747784206e-06, + "loss": 1.655, + "step": 5190 + }, + { + "epoch": 0.8375282349144886, + "grad_norm": 4.738022327423096, + "learning_rate": 6.9092622991247576e-06, + "loss": 1.5961, + "step": 5191 + }, + { + "epoch": 0.8376895772829945, + "grad_norm": 5.298644065856934, + "learning_rate": 6.896015616922535e-06, + "loss": 1.7559, + "step": 5192 + }, + { + "epoch": 0.8378509196515005, + "grad_norm": 4.534740924835205, + "learning_rate": 6.882780704794734e-06, + "loss": 1.5783, + "step": 5193 + }, + { + "epoch": 0.8380122620200064, + "grad_norm": 4.212889671325684, + "learning_rate": 6.869557566355284e-06, + "loss": 1.9214, + "step": 5194 + }, + { + "epoch": 0.8381736043885124, + "grad_norm": 4.8258256912231445, + "learning_rate": 6.856346205214947e-06, + "loss": 1.6304, + "step": 5195 + }, + { + "epoch": 0.8383349467570184, + "grad_norm": 4.971568584442139, + "learning_rate": 6.843146624981239e-06, + "loss": 2.0498, + "step": 5196 + }, + { + "epoch": 0.8384962891255243, + "grad_norm": 5.606351375579834, + "learning_rate": 6.829958829258465e-06, + "loss": 1.8753, + "step": 5197 + }, + { + "epoch": 0.8386576314940304, + "grad_norm": 4.643892288208008, + "learning_rate": 6.816782821647727e-06, + "loss": 2.143, + "step": 5198 + }, + { + "epoch": 0.8388189738625363, + "grad_norm": 4.603559494018555, + "learning_rate": 6.8036186057468866e-06, + "loss": 1.8819, + "step": 5199 + }, + { + "epoch": 0.8389803162310423, + "grad_norm": 3.6417465209960938, + "learning_rate": 6.790466185150596e-06, + "loss": 1.7776, + "step": 5200 + }, + { + "epoch": 0.8391416585995483, + "grad_norm": 3.8199212551116943, + "learning_rate": 6.777325563450282e-06, + "loss": 1.8418, + "step": 5201 + }, + { + "epoch": 0.8393030009680542, + "grad_norm": 4.013891696929932, + "learning_rate": 6.7641967442341635e-06, + "loss": 1.8983, + "step": 5202 + }, + { + "epoch": 0.8394643433365602, + "grad_norm": 3.892119884490967, + "learning_rate": 6.751079731087217e-06, + "loss": 1.7774, + "step": 5203 + }, + { + "epoch": 0.8396256857050661, + "grad_norm": 5.950384140014648, + "learning_rate": 6.737974527591212e-06, + "loss": 1.9238, + "step": 5204 + }, + { + "epoch": 0.8397870280735721, + "grad_norm": 4.6189398765563965, + "learning_rate": 6.724881137324679e-06, + "loss": 1.7152, + "step": 5205 + }, + { + "epoch": 0.839948370442078, + "grad_norm": 5.0324835777282715, + "learning_rate": 6.711799563862942e-06, + "loss": 1.9298, + "step": 5206 + }, + { + "epoch": 0.8401097128105841, + "grad_norm": 4.298720359802246, + "learning_rate": 6.698729810778065e-06, + "loss": 1.667, + "step": 5207 + }, + { + "epoch": 0.8402710551790901, + "grad_norm": 4.028954029083252, + "learning_rate": 6.685671881638933e-06, + "loss": 1.6205, + "step": 5208 + }, + { + "epoch": 0.840432397547596, + "grad_norm": 4.030002593994141, + "learning_rate": 6.672625780011144e-06, + "loss": 1.7649, + "step": 5209 + }, + { + "epoch": 0.840593739916102, + "grad_norm": 4.125081539154053, + "learning_rate": 6.659591509457125e-06, + "loss": 1.9293, + "step": 5210 + }, + { + "epoch": 0.8407550822846079, + "grad_norm": 3.785576105117798, + "learning_rate": 6.6465690735360244e-06, + "loss": 2.1153, + "step": 5211 + }, + { + "epoch": 0.8409164246531139, + "grad_norm": 4.0753493309021, + "learning_rate": 6.633558475803792e-06, + "loss": 1.6938, + "step": 5212 + }, + { + "epoch": 0.8410777670216198, + "grad_norm": 4.192071437835693, + "learning_rate": 6.6205597198131295e-06, + "loss": 1.8384, + "step": 5213 + }, + { + "epoch": 0.8412391093901258, + "grad_norm": 5.801394939422607, + "learning_rate": 6.607572809113488e-06, + "loss": 1.8592, + "step": 5214 + }, + { + "epoch": 0.8414004517586318, + "grad_norm": 4.262351036071777, + "learning_rate": 6.594597747251136e-06, + "loss": 1.713, + "step": 5215 + }, + { + "epoch": 0.8415617941271378, + "grad_norm": 4.121074199676514, + "learning_rate": 6.581634537769054e-06, + "loss": 1.7004, + "step": 5216 + }, + { + "epoch": 0.8417231364956438, + "grad_norm": 3.9050257205963135, + "learning_rate": 6.568683184206997e-06, + "loss": 1.6932, + "step": 5217 + }, + { + "epoch": 0.8418844788641497, + "grad_norm": 7.386196136474609, + "learning_rate": 6.555743690101523e-06, + "loss": 1.8373, + "step": 5218 + }, + { + "epoch": 0.8420458212326557, + "grad_norm": 4.8773627281188965, + "learning_rate": 6.542816058985895e-06, + "loss": 2.0369, + "step": 5219 + }, + { + "epoch": 0.8422071636011617, + "grad_norm": 4.547236919403076, + "learning_rate": 6.529900294390162e-06, + "loss": 1.6388, + "step": 5220 + }, + { + "epoch": 0.8423685059696676, + "grad_norm": 4.2048869132995605, + "learning_rate": 6.516996399841152e-06, + "loss": 1.7337, + "step": 5221 + }, + { + "epoch": 0.8425298483381736, + "grad_norm": 3.9539923667907715, + "learning_rate": 6.504104378862408e-06, + "loss": 2.0345, + "step": 5222 + }, + { + "epoch": 0.8426911907066795, + "grad_norm": 3.4182205200195312, + "learning_rate": 6.49122423497428e-06, + "loss": 1.7882, + "step": 5223 + }, + { + "epoch": 0.8428525330751856, + "grad_norm": 6.698798656463623, + "learning_rate": 6.478355971693834e-06, + "loss": 1.9628, + "step": 5224 + }, + { + "epoch": 0.8430138754436916, + "grad_norm": 4.609959602355957, + "learning_rate": 6.465499592534902e-06, + "loss": 1.651, + "step": 5225 + }, + { + "epoch": 0.8431752178121975, + "grad_norm": 4.455406665802002, + "learning_rate": 6.452655101008098e-06, + "loss": 1.8349, + "step": 5226 + }, + { + "epoch": 0.8433365601807035, + "grad_norm": 4.491783142089844, + "learning_rate": 6.439822500620751e-06, + "loss": 1.9411, + "step": 5227 + }, + { + "epoch": 0.8434979025492094, + "grad_norm": 4.449477672576904, + "learning_rate": 6.427001794876975e-06, + "loss": 1.7111, + "step": 5228 + }, + { + "epoch": 0.8436592449177154, + "grad_norm": 3.9713246822357178, + "learning_rate": 6.414192987277601e-06, + "loss": 1.789, + "step": 5229 + }, + { + "epoch": 0.8438205872862213, + "grad_norm": 4.128500938415527, + "learning_rate": 6.401396081320255e-06, + "loss": 1.8, + "step": 5230 + }, + { + "epoch": 0.8439819296547273, + "grad_norm": 4.807419300079346, + "learning_rate": 6.388611080499274e-06, + "loss": 1.7202, + "step": 5231 + }, + { + "epoch": 0.8441432720232332, + "grad_norm": 3.9238312244415283, + "learning_rate": 6.3758379883057714e-06, + "loss": 2.1235, + "step": 5232 + }, + { + "epoch": 0.8443046143917393, + "grad_norm": 3.4898500442504883, + "learning_rate": 6.363076808227586e-06, + "loss": 2.0414, + "step": 5233 + }, + { + "epoch": 0.8444659567602453, + "grad_norm": 3.7841458320617676, + "learning_rate": 6.350327543749329e-06, + "loss": 1.7985, + "step": 5234 + }, + { + "epoch": 0.8446272991287512, + "grad_norm": 5.285628318786621, + "learning_rate": 6.337590198352339e-06, + "loss": 1.9694, + "step": 5235 + }, + { + "epoch": 0.8447886414972572, + "grad_norm": 3.4902424812316895, + "learning_rate": 6.3248647755147e-06, + "loss": 1.9331, + "step": 5236 + }, + { + "epoch": 0.8449499838657631, + "grad_norm": 4.591045379638672, + "learning_rate": 6.312151278711237e-06, + "loss": 1.8746, + "step": 5237 + }, + { + "epoch": 0.8451113262342691, + "grad_norm": 9.962109565734863, + "learning_rate": 6.299449711413552e-06, + "loss": 2.3339, + "step": 5238 + }, + { + "epoch": 0.8452726686027751, + "grad_norm": 4.510113716125488, + "learning_rate": 6.286760077089954e-06, + "loss": 1.7606, + "step": 5239 + }, + { + "epoch": 0.845434010971281, + "grad_norm": 4.234711647033691, + "learning_rate": 6.274082379205487e-06, + "loss": 1.8549, + "step": 5240 + }, + { + "epoch": 0.8455953533397871, + "grad_norm": 6.012852191925049, + "learning_rate": 6.261416621221977e-06, + "loss": 2.0749, + "step": 5241 + }, + { + "epoch": 0.845756695708293, + "grad_norm": 4.803767681121826, + "learning_rate": 6.248762806597946e-06, + "loss": 1.774, + "step": 5242 + }, + { + "epoch": 0.845918038076799, + "grad_norm": 5.22224235534668, + "learning_rate": 6.236120938788692e-06, + "loss": 1.8564, + "step": 5243 + }, + { + "epoch": 0.846079380445305, + "grad_norm": 3.6898679733276367, + "learning_rate": 6.223491021246214e-06, + "loss": 1.5177, + "step": 5244 + }, + { + "epoch": 0.8462407228138109, + "grad_norm": 4.5900959968566895, + "learning_rate": 6.2108730574192865e-06, + "loss": 1.6623, + "step": 5245 + }, + { + "epoch": 0.8464020651823169, + "grad_norm": 3.6462502479553223, + "learning_rate": 6.198267050753387e-06, + "loss": 1.8379, + "step": 5246 + }, + { + "epoch": 0.8465634075508228, + "grad_norm": 4.66995906829834, + "learning_rate": 6.185673004690745e-06, + "loss": 1.8918, + "step": 5247 + }, + { + "epoch": 0.8467247499193288, + "grad_norm": 4.3259100914001465, + "learning_rate": 6.173090922670316e-06, + "loss": 1.978, + "step": 5248 + }, + { + "epoch": 0.8468860922878347, + "grad_norm": 3.9500341415405273, + "learning_rate": 6.160520808127807e-06, + "loss": 1.7887, + "step": 5249 + }, + { + "epoch": 0.8470474346563408, + "grad_norm": 5.067325592041016, + "learning_rate": 6.147962664495632e-06, + "loss": 1.7481, + "step": 5250 + }, + { + "epoch": 0.8472087770248468, + "grad_norm": 5.685298442840576, + "learning_rate": 6.135416495202934e-06, + "loss": 1.7633, + "step": 5251 + }, + { + "epoch": 0.8473701193933527, + "grad_norm": 4.695085525512695, + "learning_rate": 6.122882303675626e-06, + "loss": 1.7703, + "step": 5252 + }, + { + "epoch": 0.8475314617618587, + "grad_norm": 4.995589256286621, + "learning_rate": 6.110360093336292e-06, + "loss": 1.858, + "step": 5253 + }, + { + "epoch": 0.8476928041303646, + "grad_norm": 3.847963571548462, + "learning_rate": 6.097849867604311e-06, + "loss": 1.657, + "step": 5254 + }, + { + "epoch": 0.8478541464988706, + "grad_norm": 4.688514232635498, + "learning_rate": 6.085351629895736e-06, + "loss": 2.0851, + "step": 5255 + }, + { + "epoch": 0.8480154888673765, + "grad_norm": 4.997213363647461, + "learning_rate": 6.0728653836233555e-06, + "loss": 1.8903, + "step": 5256 + }, + { + "epoch": 0.8481768312358825, + "grad_norm": 4.4050726890563965, + "learning_rate": 6.060391132196713e-06, + "loss": 1.8936, + "step": 5257 + }, + { + "epoch": 0.8483381736043886, + "grad_norm": 6.244439601898193, + "learning_rate": 6.047928879022052e-06, + "loss": 1.9428, + "step": 5258 + }, + { + "epoch": 0.8484995159728945, + "grad_norm": 4.172328948974609, + "learning_rate": 6.0354786275023224e-06, + "loss": 1.958, + "step": 5259 + }, + { + "epoch": 0.8486608583414005, + "grad_norm": 5.372505187988281, + "learning_rate": 6.023040381037254e-06, + "loss": 1.7281, + "step": 5260 + }, + { + "epoch": 0.8488222007099064, + "grad_norm": 4.413226127624512, + "learning_rate": 6.01061414302323e-06, + "loss": 1.8378, + "step": 5261 + }, + { + "epoch": 0.8489835430784124, + "grad_norm": 5.235836029052734, + "learning_rate": 5.998199916853414e-06, + "loss": 2.0496, + "step": 5262 + }, + { + "epoch": 0.8491448854469184, + "grad_norm": 4.358976364135742, + "learning_rate": 5.985797705917651e-06, + "loss": 1.4936, + "step": 5263 + }, + { + "epoch": 0.8493062278154243, + "grad_norm": 4.531014919281006, + "learning_rate": 5.973407513602514e-06, + "loss": 1.7078, + "step": 5264 + }, + { + "epoch": 0.8494675701839303, + "grad_norm": 5.202716827392578, + "learning_rate": 5.961029343291308e-06, + "loss": 1.8907, + "step": 5265 + }, + { + "epoch": 0.8496289125524362, + "grad_norm": 4.391210556030273, + "learning_rate": 5.948663198364035e-06, + "loss": 1.917, + "step": 5266 + }, + { + "epoch": 0.8497902549209423, + "grad_norm": 4.849218845367432, + "learning_rate": 5.936309082197439e-06, + "loss": 2.2373, + "step": 5267 + }, + { + "epoch": 0.8499515972894482, + "grad_norm": 4.686940670013428, + "learning_rate": 5.923966998164937e-06, + "loss": 1.9442, + "step": 5268 + }, + { + "epoch": 0.8501129396579542, + "grad_norm": 4.684127330780029, + "learning_rate": 5.911636949636718e-06, + "loss": 1.9074, + "step": 5269 + }, + { + "epoch": 0.8502742820264602, + "grad_norm": 4.158158302307129, + "learning_rate": 5.8993189399796315e-06, + "loss": 1.888, + "step": 5270 + }, + { + "epoch": 0.8504356243949661, + "grad_norm": 4.792473793029785, + "learning_rate": 5.887012972557276e-06, + "loss": 1.8296, + "step": 5271 + }, + { + "epoch": 0.8505969667634721, + "grad_norm": 3.8275701999664307, + "learning_rate": 5.8747190507299375e-06, + "loss": 1.7749, + "step": 5272 + }, + { + "epoch": 0.850758309131978, + "grad_norm": 3.9070305824279785, + "learning_rate": 5.86243717785463e-06, + "loss": 1.8192, + "step": 5273 + }, + { + "epoch": 0.850919651500484, + "grad_norm": 5.525460720062256, + "learning_rate": 5.850167357285069e-06, + "loss": 1.8901, + "step": 5274 + }, + { + "epoch": 0.8510809938689899, + "grad_norm": 4.825296878814697, + "learning_rate": 5.837909592371682e-06, + "loss": 1.848, + "step": 5275 + }, + { + "epoch": 0.851242336237496, + "grad_norm": 3.576374053955078, + "learning_rate": 5.825663886461585e-06, + "loss": 1.6513, + "step": 5276 + }, + { + "epoch": 0.851403678606002, + "grad_norm": 4.482271671295166, + "learning_rate": 5.813430242898649e-06, + "loss": 1.8692, + "step": 5277 + }, + { + "epoch": 0.8515650209745079, + "grad_norm": 5.846675872802734, + "learning_rate": 5.8012086650234e-06, + "loss": 1.9382, + "step": 5278 + }, + { + "epoch": 0.8517263633430139, + "grad_norm": 5.3428544998168945, + "learning_rate": 5.788999156173086e-06, + "loss": 1.8543, + "step": 5279 + }, + { + "epoch": 0.8518877057115198, + "grad_norm": 4.3809895515441895, + "learning_rate": 5.776801719681691e-06, + "loss": 1.993, + "step": 5280 + }, + { + "epoch": 0.8520490480800258, + "grad_norm": 4.829232215881348, + "learning_rate": 5.764616358879838e-06, + "loss": 2.0357, + "step": 5281 + }, + { + "epoch": 0.8522103904485318, + "grad_norm": 4.304603099822998, + "learning_rate": 5.752443077094927e-06, + "loss": 1.7668, + "step": 5282 + }, + { + "epoch": 0.8523717328170377, + "grad_norm": 4.619054794311523, + "learning_rate": 5.740281877650994e-06, + "loss": 1.7929, + "step": 5283 + }, + { + "epoch": 0.8525330751855438, + "grad_norm": 5.700281143188477, + "learning_rate": 5.728132763868832e-06, + "loss": 1.8248, + "step": 5284 + }, + { + "epoch": 0.8526944175540497, + "grad_norm": 4.372552871704102, + "learning_rate": 5.715995739065877e-06, + "loss": 2.0041, + "step": 5285 + }, + { + "epoch": 0.8528557599225557, + "grad_norm": 4.835000038146973, + "learning_rate": 5.703870806556316e-06, + "loss": 1.9382, + "step": 5286 + }, + { + "epoch": 0.8530171022910616, + "grad_norm": 4.303082466125488, + "learning_rate": 5.691757969651001e-06, + "loss": 1.6725, + "step": 5287 + }, + { + "epoch": 0.8531784446595676, + "grad_norm": 5.479561805725098, + "learning_rate": 5.679657231657487e-06, + "loss": 1.8091, + "step": 5288 + }, + { + "epoch": 0.8533397870280736, + "grad_norm": 4.941107749938965, + "learning_rate": 5.667568595880046e-06, + "loss": 1.7673, + "step": 5289 + }, + { + "epoch": 0.8535011293965795, + "grad_norm": 4.362493515014648, + "learning_rate": 5.655492065619605e-06, + "loss": 1.8743, + "step": 5290 + }, + { + "epoch": 0.8536624717650855, + "grad_norm": 4.439315319061279, + "learning_rate": 5.643427644173837e-06, + "loss": 1.8494, + "step": 5291 + }, + { + "epoch": 0.8538238141335914, + "grad_norm": 4.728001117706299, + "learning_rate": 5.631375334837058e-06, + "loss": 2.1247, + "step": 5292 + }, + { + "epoch": 0.8539851565020975, + "grad_norm": 3.551706552505493, + "learning_rate": 5.619335140900317e-06, + "loss": 1.8912, + "step": 5293 + }, + { + "epoch": 0.8541464988706035, + "grad_norm": 4.582133769989014, + "learning_rate": 5.607307065651324e-06, + "loss": 1.9535, + "step": 5294 + }, + { + "epoch": 0.8543078412391094, + "grad_norm": 4.534191131591797, + "learning_rate": 5.595291112374507e-06, + "loss": 1.8993, + "step": 5295 + }, + { + "epoch": 0.8544691836076154, + "grad_norm": 3.634624481201172, + "learning_rate": 5.5832872843509465e-06, + "loss": 1.7037, + "step": 5296 + }, + { + "epoch": 0.8546305259761213, + "grad_norm": 3.549039840698242, + "learning_rate": 5.571295584858466e-06, + "loss": 1.9124, + "step": 5297 + }, + { + "epoch": 0.8547918683446273, + "grad_norm": 3.984860420227051, + "learning_rate": 5.559316017171518e-06, + "loss": 1.8104, + "step": 5298 + }, + { + "epoch": 0.8549532107131332, + "grad_norm": 4.351929187774658, + "learning_rate": 5.547348584561296e-06, + "loss": 1.6888, + "step": 5299 + }, + { + "epoch": 0.8551145530816392, + "grad_norm": 4.291843414306641, + "learning_rate": 5.535393290295643e-06, + "loss": 1.8186, + "step": 5300 + }, + { + "epoch": 0.8552758954501453, + "grad_norm": 4.084909439086914, + "learning_rate": 5.523450137639091e-06, + "loss": 1.5097, + "step": 5301 + }, + { + "epoch": 0.8554372378186512, + "grad_norm": 4.398079872131348, + "learning_rate": 5.5115191298528876e-06, + "loss": 1.9224, + "step": 5302 + }, + { + "epoch": 0.8555985801871572, + "grad_norm": 4.4744768142700195, + "learning_rate": 5.499600270194921e-06, + "loss": 1.9395, + "step": 5303 + }, + { + "epoch": 0.8557599225556631, + "grad_norm": 4.3137431144714355, + "learning_rate": 5.487693561919794e-06, + "loss": 1.8131, + "step": 5304 + }, + { + "epoch": 0.8559212649241691, + "grad_norm": 4.522165298461914, + "learning_rate": 5.47579900827877e-06, + "loss": 1.6565, + "step": 5305 + }, + { + "epoch": 0.856082607292675, + "grad_norm": 4.212401866912842, + "learning_rate": 5.463916612519821e-06, + "loss": 1.8043, + "step": 5306 + }, + { + "epoch": 0.856243949661181, + "grad_norm": 4.5480780601501465, + "learning_rate": 5.45204637788756e-06, + "loss": 1.9229, + "step": 5307 + }, + { + "epoch": 0.856405292029687, + "grad_norm": 4.158749103546143, + "learning_rate": 5.440188307623317e-06, + "loss": 1.8929, + "step": 5308 + }, + { + "epoch": 0.8565666343981929, + "grad_norm": 4.080484867095947, + "learning_rate": 5.428342404965076e-06, + "loss": 1.6823, + "step": 5309 + }, + { + "epoch": 0.856727976766699, + "grad_norm": 3.7655515670776367, + "learning_rate": 5.4165086731475186e-06, + "loss": 1.7646, + "step": 5310 + }, + { + "epoch": 0.8568893191352049, + "grad_norm": 4.867955207824707, + "learning_rate": 5.404687115401969e-06, + "loss": 2.0057, + "step": 5311 + }, + { + "epoch": 0.8570506615037109, + "grad_norm": 3.499534845352173, + "learning_rate": 5.392877734956475e-06, + "loss": 1.7056, + "step": 5312 + }, + { + "epoch": 0.8572120038722169, + "grad_norm": 3.72841477394104, + "learning_rate": 5.3810805350357205e-06, + "loss": 1.6767, + "step": 5313 + }, + { + "epoch": 0.8573733462407228, + "grad_norm": 4.146420001983643, + "learning_rate": 5.369295518861078e-06, + "loss": 2.0991, + "step": 5314 + }, + { + "epoch": 0.8575346886092288, + "grad_norm": 5.447978973388672, + "learning_rate": 5.35752268965059e-06, + "loss": 1.9444, + "step": 5315 + }, + { + "epoch": 0.8576960309777347, + "grad_norm": 4.583520889282227, + "learning_rate": 5.345762050618963e-06, + "loss": 1.7592, + "step": 5316 + }, + { + "epoch": 0.8578573733462407, + "grad_norm": 4.727061748504639, + "learning_rate": 5.3340136049776055e-06, + "loss": 1.9346, + "step": 5317 + }, + { + "epoch": 0.8580187157147466, + "grad_norm": 6.0795578956604, + "learning_rate": 5.322277355934558e-06, + "loss": 1.6686, + "step": 5318 + }, + { + "epoch": 0.8581800580832527, + "grad_norm": 5.296669006347656, + "learning_rate": 5.3105533066945605e-06, + "loss": 1.7791, + "step": 5319 + }, + { + "epoch": 0.8583414004517587, + "grad_norm": 4.125102996826172, + "learning_rate": 5.298841460458998e-06, + "loss": 2.0862, + "step": 5320 + }, + { + "epoch": 0.8585027428202646, + "grad_norm": 5.359317779541016, + "learning_rate": 5.287141820425945e-06, + "loss": 1.9058, + "step": 5321 + }, + { + "epoch": 0.8586640851887706, + "grad_norm": 3.726865291595459, + "learning_rate": 5.2754543897901184e-06, + "loss": 1.6196, + "step": 5322 + }, + { + "epoch": 0.8588254275572765, + "grad_norm": 6.377970218658447, + "learning_rate": 5.263779171742933e-06, + "loss": 1.981, + "step": 5323 + }, + { + "epoch": 0.8589867699257825, + "grad_norm": 4.332647323608398, + "learning_rate": 5.2521161694724375e-06, + "loss": 1.8543, + "step": 5324 + }, + { + "epoch": 0.8591481122942884, + "grad_norm": 3.9370486736297607, + "learning_rate": 5.240465386163368e-06, + "loss": 1.72, + "step": 5325 + }, + { + "epoch": 0.8593094546627944, + "grad_norm": 4.345332145690918, + "learning_rate": 5.2288268249971125e-06, + "loss": 1.8494, + "step": 5326 + }, + { + "epoch": 0.8594707970313005, + "grad_norm": 3.694758653640747, + "learning_rate": 5.217200489151713e-06, + "loss": 1.7929, + "step": 5327 + }, + { + "epoch": 0.8596321393998064, + "grad_norm": 4.486896514892578, + "learning_rate": 5.2055863818018965e-06, + "loss": 1.9654, + "step": 5328 + }, + { + "epoch": 0.8597934817683124, + "grad_norm": 4.784701347351074, + "learning_rate": 5.193984506119032e-06, + "loss": 1.9568, + "step": 5329 + }, + { + "epoch": 0.8599548241368183, + "grad_norm": 4.010159015655518, + "learning_rate": 5.1823948652711565e-06, + "loss": 1.668, + "step": 5330 + }, + { + "epoch": 0.8601161665053243, + "grad_norm": 4.728301048278809, + "learning_rate": 5.170817462422961e-06, + "loss": 1.9033, + "step": 5331 + }, + { + "epoch": 0.8602775088738303, + "grad_norm": 4.136148929595947, + "learning_rate": 5.159252300735812e-06, + "loss": 1.6008, + "step": 5332 + }, + { + "epoch": 0.8604388512423362, + "grad_norm": 3.8405919075012207, + "learning_rate": 5.1476993833677045e-06, + "loss": 2.0028, + "step": 5333 + }, + { + "epoch": 0.8606001936108422, + "grad_norm": 5.300532341003418, + "learning_rate": 5.13615871347331e-06, + "loss": 1.6452, + "step": 5334 + }, + { + "epoch": 0.8607615359793481, + "grad_norm": 4.644827842712402, + "learning_rate": 5.124630294203942e-06, + "loss": 1.5739, + "step": 5335 + }, + { + "epoch": 0.8609228783478542, + "grad_norm": 4.183739185333252, + "learning_rate": 5.113114128707591e-06, + "loss": 2.0056, + "step": 5336 + }, + { + "epoch": 0.8610842207163601, + "grad_norm": 5.444863796234131, + "learning_rate": 5.1016102201288776e-06, + "loss": 1.645, + "step": 5337 + }, + { + "epoch": 0.8612455630848661, + "grad_norm": 6.546682834625244, + "learning_rate": 5.090118571609098e-06, + "loss": 1.9507, + "step": 5338 + }, + { + "epoch": 0.8614069054533721, + "grad_norm": 4.524355888366699, + "learning_rate": 5.078639186286177e-06, + "loss": 2.0171, + "step": 5339 + }, + { + "epoch": 0.861568247821878, + "grad_norm": 4.184797286987305, + "learning_rate": 5.0671720672947064e-06, + "loss": 1.6688, + "step": 5340 + }, + { + "epoch": 0.861729590190384, + "grad_norm": 5.031257629394531, + "learning_rate": 5.055717217765926e-06, + "loss": 2.1551, + "step": 5341 + }, + { + "epoch": 0.8618909325588899, + "grad_norm": 3.807931423187256, + "learning_rate": 5.044274640827718e-06, + "loss": 1.82, + "step": 5342 + }, + { + "epoch": 0.8620522749273959, + "grad_norm": 4.028903007507324, + "learning_rate": 5.032844339604631e-06, + "loss": 1.5555, + "step": 5343 + }, + { + "epoch": 0.862213617295902, + "grad_norm": 7.599312782287598, + "learning_rate": 5.021426317217831e-06, + "loss": 1.9882, + "step": 5344 + }, + { + "epoch": 0.8623749596644079, + "grad_norm": 5.002162456512451, + "learning_rate": 5.010020576785174e-06, + "loss": 1.9644, + "step": 5345 + }, + { + "epoch": 0.8625363020329139, + "grad_norm": 5.8590545654296875, + "learning_rate": 4.998627121421112e-06, + "loss": 1.7531, + "step": 5346 + }, + { + "epoch": 0.8626976444014198, + "grad_norm": 4.679166316986084, + "learning_rate": 4.987245954236791e-06, + "loss": 1.6703, + "step": 5347 + }, + { + "epoch": 0.8628589867699258, + "grad_norm": 3.858346700668335, + "learning_rate": 4.975877078339964e-06, + "loss": 1.6805, + "step": 5348 + }, + { + "epoch": 0.8630203291384317, + "grad_norm": 3.771313190460205, + "learning_rate": 4.964520496835057e-06, + "loss": 2.1385, + "step": 5349 + }, + { + "epoch": 0.8631816715069377, + "grad_norm": 4.188822269439697, + "learning_rate": 4.953176212823113e-06, + "loss": 1.5778, + "step": 5350 + }, + { + "epoch": 0.8633430138754437, + "grad_norm": 6.0285325050354, + "learning_rate": 4.941844229401821e-06, + "loss": 1.7102, + "step": 5351 + }, + { + "epoch": 0.8635043562439496, + "grad_norm": 4.193412780761719, + "learning_rate": 4.930524549665538e-06, + "loss": 1.7374, + "step": 5352 + }, + { + "epoch": 0.8636656986124557, + "grad_norm": 5.323854446411133, + "learning_rate": 4.919217176705238e-06, + "loss": 1.6838, + "step": 5353 + }, + { + "epoch": 0.8638270409809616, + "grad_norm": 4.328566074371338, + "learning_rate": 4.9079221136085315e-06, + "loss": 1.7497, + "step": 5354 + }, + { + "epoch": 0.8639883833494676, + "grad_norm": 4.033364772796631, + "learning_rate": 4.896639363459671e-06, + "loss": 1.5371, + "step": 5355 + }, + { + "epoch": 0.8641497257179735, + "grad_norm": 4.1560444831848145, + "learning_rate": 4.885368929339562e-06, + "loss": 2.0093, + "step": 5356 + }, + { + "epoch": 0.8643110680864795, + "grad_norm": 3.9417884349823, + "learning_rate": 4.8741108143257215e-06, + "loss": 2.1247, + "step": 5357 + }, + { + "epoch": 0.8644724104549855, + "grad_norm": 4.554046630859375, + "learning_rate": 4.862865021492335e-06, + "loss": 2.0201, + "step": 5358 + }, + { + "epoch": 0.8646337528234914, + "grad_norm": 4.345934867858887, + "learning_rate": 4.851631553910185e-06, + "loss": 1.9753, + "step": 5359 + }, + { + "epoch": 0.8647950951919974, + "grad_norm": 4.904932498931885, + "learning_rate": 4.8404104146467284e-06, + "loss": 2.0068, + "step": 5360 + }, + { + "epoch": 0.8649564375605033, + "grad_norm": 4.793888092041016, + "learning_rate": 4.8292016067660206e-06, + "loss": 1.7089, + "step": 5361 + }, + { + "epoch": 0.8651177799290094, + "grad_norm": 4.269280433654785, + "learning_rate": 4.8180051333287735e-06, + "loss": 1.8964, + "step": 5362 + }, + { + "epoch": 0.8652791222975154, + "grad_norm": 3.4615707397460938, + "learning_rate": 4.8068209973923255e-06, + "loss": 1.5962, + "step": 5363 + }, + { + "epoch": 0.8654404646660213, + "grad_norm": 3.4652092456817627, + "learning_rate": 4.795649202010622e-06, + "loss": 1.7917, + "step": 5364 + }, + { + "epoch": 0.8656018070345273, + "grad_norm": 4.4619574546813965, + "learning_rate": 4.784489750234283e-06, + "loss": 1.859, + "step": 5365 + }, + { + "epoch": 0.8657631494030332, + "grad_norm": 5.999852657318115, + "learning_rate": 4.773342645110518e-06, + "loss": 1.8824, + "step": 5366 + }, + { + "epoch": 0.8659244917715392, + "grad_norm": 4.514030933380127, + "learning_rate": 4.762207889683196e-06, + "loss": 1.5664, + "step": 5367 + }, + { + "epoch": 0.8660858341400451, + "grad_norm": 4.133525371551514, + "learning_rate": 4.751085486992779e-06, + "loss": 1.9823, + "step": 5368 + }, + { + "epoch": 0.8662471765085511, + "grad_norm": 4.643326759338379, + "learning_rate": 4.739975440076405e-06, + "loss": 1.7787, + "step": 5369 + }, + { + "epoch": 0.8664085188770572, + "grad_norm": 3.7829456329345703, + "learning_rate": 4.728877751967786e-06, + "loss": 1.7204, + "step": 5370 + }, + { + "epoch": 0.8665698612455631, + "grad_norm": 3.434251546859741, + "learning_rate": 4.717792425697288e-06, + "loss": 1.8811, + "step": 5371 + }, + { + "epoch": 0.8667312036140691, + "grad_norm": 3.654670476913452, + "learning_rate": 4.706719464291903e-06, + "loss": 1.7796, + "step": 5372 + }, + { + "epoch": 0.866892545982575, + "grad_norm": 4.253548622131348, + "learning_rate": 4.695658870775232e-06, + "loss": 1.7287, + "step": 5373 + }, + { + "epoch": 0.867053888351081, + "grad_norm": 5.27730655670166, + "learning_rate": 4.684610648167503e-06, + "loss": 1.7725, + "step": 5374 + }, + { + "epoch": 0.867215230719587, + "grad_norm": 4.814943790435791, + "learning_rate": 4.673574799485586e-06, + "loss": 1.8509, + "step": 5375 + }, + { + "epoch": 0.8673765730880929, + "grad_norm": 4.000126838684082, + "learning_rate": 4.662551327742942e-06, + "loss": 1.7523, + "step": 5376 + }, + { + "epoch": 0.8675379154565989, + "grad_norm": 4.85685396194458, + "learning_rate": 4.651540235949658e-06, + "loss": 1.6614, + "step": 5377 + }, + { + "epoch": 0.8676992578251048, + "grad_norm": 5.462538719177246, + "learning_rate": 4.640541527112474e-06, + "loss": 1.6222, + "step": 5378 + }, + { + "epoch": 0.8678606001936109, + "grad_norm": 4.697770118713379, + "learning_rate": 4.629555204234693e-06, + "loss": 2.0384, + "step": 5379 + }, + { + "epoch": 0.8680219425621168, + "grad_norm": 4.688658714294434, + "learning_rate": 4.618581270316292e-06, + "loss": 1.6785, + "step": 5380 + }, + { + "epoch": 0.8681832849306228, + "grad_norm": 4.218990802764893, + "learning_rate": 4.607619728353818e-06, + "loss": 1.7295, + "step": 5381 + }, + { + "epoch": 0.8683446272991288, + "grad_norm": 4.786597728729248, + "learning_rate": 4.596670581340479e-06, + "loss": 1.7838, + "step": 5382 + }, + { + "epoch": 0.8685059696676347, + "grad_norm": 4.64976692199707, + "learning_rate": 4.585733832266048e-06, + "loss": 2.016, + "step": 5383 + }, + { + "epoch": 0.8686673120361407, + "grad_norm": 5.516035079956055, + "learning_rate": 4.57480948411696e-06, + "loss": 1.9975, + "step": 5384 + }, + { + "epoch": 0.8688286544046466, + "grad_norm": 4.395437240600586, + "learning_rate": 4.563897539876228e-06, + "loss": 1.8529, + "step": 5385 + }, + { + "epoch": 0.8689899967731526, + "grad_norm": 4.031848907470703, + "learning_rate": 4.552998002523512e-06, + "loss": 1.7504, + "step": 5386 + }, + { + "epoch": 0.8691513391416587, + "grad_norm": 4.178362846374512, + "learning_rate": 4.542110875035038e-06, + "loss": 2.1688, + "step": 5387 + }, + { + "epoch": 0.8693126815101646, + "grad_norm": 3.6858437061309814, + "learning_rate": 4.531236160383701e-06, + "loss": 1.3889, + "step": 5388 + }, + { + "epoch": 0.8694740238786706, + "grad_norm": 6.3474321365356445, + "learning_rate": 4.520373861538951e-06, + "loss": 1.845, + "step": 5389 + }, + { + "epoch": 0.8696353662471765, + "grad_norm": 4.003809452056885, + "learning_rate": 4.50952398146689e-06, + "loss": 2.0094, + "step": 5390 + }, + { + "epoch": 0.8697967086156825, + "grad_norm": 3.9384477138519287, + "learning_rate": 4.498686523130191e-06, + "loss": 1.9549, + "step": 5391 + }, + { + "epoch": 0.8699580509841884, + "grad_norm": 4.913197040557861, + "learning_rate": 4.487861489488177e-06, + "loss": 2.1316, + "step": 5392 + }, + { + "epoch": 0.8701193933526944, + "grad_norm": 4.362610816955566, + "learning_rate": 4.4770488834967485e-06, + "loss": 1.8528, + "step": 5393 + }, + { + "epoch": 0.8702807357212003, + "grad_norm": 4.632915019989014, + "learning_rate": 4.4662487081084115e-06, + "loss": 2.0092, + "step": 5394 + }, + { + "epoch": 0.8704420780897063, + "grad_norm": 4.530938625335693, + "learning_rate": 4.455460966272307e-06, + "loss": 2.0791, + "step": 5395 + }, + { + "epoch": 0.8706034204582124, + "grad_norm": 3.6736834049224854, + "learning_rate": 4.444685660934139e-06, + "loss": 2.2223, + "step": 5396 + }, + { + "epoch": 0.8707647628267183, + "grad_norm": 4.891629695892334, + "learning_rate": 4.433922795036255e-06, + "loss": 1.6743, + "step": 5397 + }, + { + "epoch": 0.8709261051952243, + "grad_norm": 4.351646900177002, + "learning_rate": 4.423172371517575e-06, + "loss": 1.8404, + "step": 5398 + }, + { + "epoch": 0.8710874475637302, + "grad_norm": 4.50725793838501, + "learning_rate": 4.412434393313652e-06, + "loss": 1.9123, + "step": 5399 + }, + { + "epoch": 0.8712487899322362, + "grad_norm": 4.609782695770264, + "learning_rate": 4.401708863356602e-06, + "loss": 1.558, + "step": 5400 + }, + { + "epoch": 0.8714101323007422, + "grad_norm": 4.794640064239502, + "learning_rate": 4.39099578457518e-06, + "loss": 1.5952, + "step": 5401 + }, + { + "epoch": 0.8715714746692481, + "grad_norm": 5.642186641693115, + "learning_rate": 4.38029515989472e-06, + "loss": 2.0321, + "step": 5402 + }, + { + "epoch": 0.8717328170377541, + "grad_norm": 4.8262152671813965, + "learning_rate": 4.369606992237146e-06, + "loss": 1.7114, + "step": 5403 + }, + { + "epoch": 0.8718941594062601, + "grad_norm": 4.665055751800537, + "learning_rate": 4.358931284521023e-06, + "loss": 1.6008, + "step": 5404 + }, + { + "epoch": 0.8720555017747661, + "grad_norm": 4.443180561065674, + "learning_rate": 4.3482680396614516e-06, + "loss": 1.7421, + "step": 5405 + }, + { + "epoch": 0.872216844143272, + "grad_norm": 5.374730110168457, + "learning_rate": 4.337617260570187e-06, + "loss": 1.8272, + "step": 5406 + }, + { + "epoch": 0.872378186511778, + "grad_norm": 5.673450469970703, + "learning_rate": 4.326978950155536e-06, + "loss": 1.9179, + "step": 5407 + }, + { + "epoch": 0.872539528880284, + "grad_norm": 4.725485801696777, + "learning_rate": 4.3163531113224465e-06, + "loss": 1.6315, + "step": 5408 + }, + { + "epoch": 0.8727008712487899, + "grad_norm": 4.904433250427246, + "learning_rate": 4.305739746972415e-06, + "loss": 2.2354, + "step": 5409 + }, + { + "epoch": 0.8728622136172959, + "grad_norm": 3.5843193531036377, + "learning_rate": 4.2951388600035555e-06, + "loss": 1.7607, + "step": 5410 + }, + { + "epoch": 0.8730235559858018, + "grad_norm": 4.858310699462891, + "learning_rate": 4.28455045331056e-06, + "loss": 2.0684, + "step": 5411 + }, + { + "epoch": 0.8731848983543078, + "grad_norm": 4.717955589294434, + "learning_rate": 4.273974529784747e-06, + "loss": 2.0185, + "step": 5412 + }, + { + "epoch": 0.8733462407228139, + "grad_norm": 4.557201862335205, + "learning_rate": 4.2634110923139796e-06, + "loss": 1.6191, + "step": 5413 + }, + { + "epoch": 0.8735075830913198, + "grad_norm": 3.5318820476531982, + "learning_rate": 4.252860143782761e-06, + "loss": 1.7523, + "step": 5414 + }, + { + "epoch": 0.8736689254598258, + "grad_norm": 8.056474685668945, + "learning_rate": 4.242321687072137e-06, + "loss": 1.8573, + "step": 5415 + }, + { + "epoch": 0.8738302678283317, + "grad_norm": 4.431918144226074, + "learning_rate": 4.231795725059756e-06, + "loss": 1.8516, + "step": 5416 + }, + { + "epoch": 0.8739916101968377, + "grad_norm": 4.269765377044678, + "learning_rate": 4.221282260619891e-06, + "loss": 1.7545, + "step": 5417 + }, + { + "epoch": 0.8741529525653436, + "grad_norm": 3.9823522567749023, + "learning_rate": 4.2107812966233395e-06, + "loss": 1.6845, + "step": 5418 + }, + { + "epoch": 0.8743142949338496, + "grad_norm": 5.325613498687744, + "learning_rate": 4.200292835937553e-06, + "loss": 1.9474, + "step": 5419 + }, + { + "epoch": 0.8744756373023556, + "grad_norm": 3.8651812076568604, + "learning_rate": 4.189816881426506e-06, + "loss": 1.6847, + "step": 5420 + }, + { + "epoch": 0.8746369796708615, + "grad_norm": 4.520545959472656, + "learning_rate": 4.179353435950805e-06, + "loss": 1.9354, + "step": 5421 + }, + { + "epoch": 0.8747983220393676, + "grad_norm": 5.559961318969727, + "learning_rate": 4.168902502367611e-06, + "loss": 1.8653, + "step": 5422 + }, + { + "epoch": 0.8749596644078735, + "grad_norm": 3.3139851093292236, + "learning_rate": 4.1584640835306944e-06, + "loss": 1.7976, + "step": 5423 + }, + { + "epoch": 0.8751210067763795, + "grad_norm": 3.545987606048584, + "learning_rate": 4.148038182290376e-06, + "loss": 1.713, + "step": 5424 + }, + { + "epoch": 0.8752823491448855, + "grad_norm": 3.64139986038208, + "learning_rate": 4.1376248014935945e-06, + "loss": 1.8402, + "step": 5425 + }, + { + "epoch": 0.8754436915133914, + "grad_norm": 4.494565010070801, + "learning_rate": 4.127223943983849e-06, + "loss": 1.9319, + "step": 5426 + }, + { + "epoch": 0.8756050338818974, + "grad_norm": 4.627253532409668, + "learning_rate": 4.1168356126012055e-06, + "loss": 1.6034, + "step": 5427 + }, + { + "epoch": 0.8757663762504033, + "grad_norm": 3.559372901916504, + "learning_rate": 4.106459810182345e-06, + "loss": 1.7515, + "step": 5428 + }, + { + "epoch": 0.8759277186189093, + "grad_norm": 5.645289421081543, + "learning_rate": 4.096096539560501e-06, + "loss": 1.7272, + "step": 5429 + }, + { + "epoch": 0.8760890609874153, + "grad_norm": 5.593602657318115, + "learning_rate": 4.0857458035654935e-06, + "loss": 1.8373, + "step": 5430 + }, + { + "epoch": 0.8762504033559213, + "grad_norm": 4.428093433380127, + "learning_rate": 4.075407605023706e-06, + "loss": 1.648, + "step": 5431 + }, + { + "epoch": 0.8764117457244273, + "grad_norm": 4.232276916503906, + "learning_rate": 4.0650819467581315e-06, + "loss": 1.7091, + "step": 5432 + }, + { + "epoch": 0.8765730880929332, + "grad_norm": 4.081578731536865, + "learning_rate": 4.0547688315883015e-06, + "loss": 1.6103, + "step": 5433 + }, + { + "epoch": 0.8767344304614392, + "grad_norm": 5.288313865661621, + "learning_rate": 4.044468262330353e-06, + "loss": 1.8681, + "step": 5434 + }, + { + "epoch": 0.8768957728299451, + "grad_norm": 4.759239196777344, + "learning_rate": 4.03418024179697e-06, + "loss": 1.819, + "step": 5435 + }, + { + "epoch": 0.8770571151984511, + "grad_norm": 4.259239196777344, + "learning_rate": 4.023904772797443e-06, + "loss": 1.736, + "step": 5436 + }, + { + "epoch": 0.877218457566957, + "grad_norm": 4.10651969909668, + "learning_rate": 4.01364185813759e-06, + "loss": 1.6239, + "step": 5437 + }, + { + "epoch": 0.877379799935463, + "grad_norm": 5.2628302574157715, + "learning_rate": 4.003391500619852e-06, + "loss": 1.7202, + "step": 5438 + }, + { + "epoch": 0.8775411423039691, + "grad_norm": 4.976608753204346, + "learning_rate": 3.993153703043196e-06, + "loss": 1.7757, + "step": 5439 + }, + { + "epoch": 0.877702484672475, + "grad_norm": 3.359093427658081, + "learning_rate": 3.9829284682031845e-06, + "loss": 1.9446, + "step": 5440 + }, + { + "epoch": 0.877863827040981, + "grad_norm": 4.429149627685547, + "learning_rate": 3.972715798891952e-06, + "loss": 1.6241, + "step": 5441 + }, + { + "epoch": 0.8780251694094869, + "grad_norm": 5.290810585021973, + "learning_rate": 3.962515697898173e-06, + "loss": 1.8892, + "step": 5442 + }, + { + "epoch": 0.8781865117779929, + "grad_norm": 3.9987807273864746, + "learning_rate": 3.952328168007141e-06, + "loss": 1.9139, + "step": 5443 + }, + { + "epoch": 0.8783478541464989, + "grad_norm": 4.468580722808838, + "learning_rate": 3.942153212000654e-06, + "loss": 1.7947, + "step": 5444 + }, + { + "epoch": 0.8785091965150048, + "grad_norm": 4.247889041900635, + "learning_rate": 3.93199083265714e-06, + "loss": 2.0586, + "step": 5445 + }, + { + "epoch": 0.8786705388835108, + "grad_norm": 4.7147746086120605, + "learning_rate": 3.9218410327515385e-06, + "loss": 1.9079, + "step": 5446 + }, + { + "epoch": 0.8788318812520168, + "grad_norm": 5.2043304443359375, + "learning_rate": 3.911703815055395e-06, + "loss": 1.8722, + "step": 5447 + }, + { + "epoch": 0.8789932236205228, + "grad_norm": 5.258590221405029, + "learning_rate": 3.901579182336796e-06, + "loss": 1.6139, + "step": 5448 + }, + { + "epoch": 0.8791545659890287, + "grad_norm": 5.239922523498535, + "learning_rate": 3.891467137360388e-06, + "loss": 1.8144, + "step": 5449 + }, + { + "epoch": 0.8793159083575347, + "grad_norm": 4.352474212646484, + "learning_rate": 3.881367682887393e-06, + "loss": 1.5627, + "step": 5450 + }, + { + "epoch": 0.8794772507260407, + "grad_norm": 4.475174903869629, + "learning_rate": 3.871280821675605e-06, + "loss": 1.6751, + "step": 5451 + }, + { + "epoch": 0.8796385930945466, + "grad_norm": 4.773714542388916, + "learning_rate": 3.861206556479352e-06, + "loss": 2.0421, + "step": 5452 + }, + { + "epoch": 0.8797999354630526, + "grad_norm": 4.278417587280273, + "learning_rate": 3.851144890049535e-06, + "loss": 1.8468, + "step": 5453 + }, + { + "epoch": 0.8799612778315585, + "grad_norm": 4.038722515106201, + "learning_rate": 3.841095825133623e-06, + "loss": 1.956, + "step": 5454 + }, + { + "epoch": 0.8801226202000645, + "grad_norm": 3.979750394821167, + "learning_rate": 3.831059364475631e-06, + "loss": 1.7624, + "step": 5455 + }, + { + "epoch": 0.8802839625685706, + "grad_norm": 6.502319812774658, + "learning_rate": 3.821035510816151e-06, + "loss": 2.0544, + "step": 5456 + }, + { + "epoch": 0.8804453049370765, + "grad_norm": 5.762221336364746, + "learning_rate": 3.8110242668923045e-06, + "loss": 1.8721, + "step": 5457 + }, + { + "epoch": 0.8806066473055825, + "grad_norm": 4.8464436531066895, + "learning_rate": 3.801025635437799e-06, + "loss": 1.7393, + "step": 5458 + }, + { + "epoch": 0.8807679896740884, + "grad_norm": 4.780818939208984, + "learning_rate": 3.7910396191828677e-06, + "loss": 2.0555, + "step": 5459 + }, + { + "epoch": 0.8809293320425944, + "grad_norm": 3.8620622158050537, + "learning_rate": 3.7810662208543348e-06, + "loss": 1.544, + "step": 5460 + }, + { + "epoch": 0.8810906744111003, + "grad_norm": 3.8314895629882812, + "learning_rate": 3.771105443175543e-06, + "loss": 1.7244, + "step": 5461 + }, + { + "epoch": 0.8812520167796063, + "grad_norm": 3.7106902599334717, + "learning_rate": 3.7611572888664183e-06, + "loss": 1.6506, + "step": 5462 + }, + { + "epoch": 0.8814133591481123, + "grad_norm": 3.627849578857422, + "learning_rate": 3.751221760643414e-06, + "loss": 1.7715, + "step": 5463 + }, + { + "epoch": 0.8815747015166182, + "grad_norm": 4.333226680755615, + "learning_rate": 3.741298861219561e-06, + "loss": 2.0436, + "step": 5464 + }, + { + "epoch": 0.8817360438851243, + "grad_norm": 5.117793083190918, + "learning_rate": 3.7313885933044245e-06, + "loss": 1.8617, + "step": 5465 + }, + { + "epoch": 0.8818973862536302, + "grad_norm": 4.468812465667725, + "learning_rate": 3.721490959604118e-06, + "loss": 1.795, + "step": 5466 + }, + { + "epoch": 0.8820587286221362, + "grad_norm": 4.807857990264893, + "learning_rate": 3.711605962821324e-06, + "loss": 1.6412, + "step": 5467 + }, + { + "epoch": 0.8822200709906421, + "grad_norm": 4.753337860107422, + "learning_rate": 3.7017336056552608e-06, + "loss": 1.7431, + "step": 5468 + }, + { + "epoch": 0.8823814133591481, + "grad_norm": 4.103842735290527, + "learning_rate": 3.6918738908016948e-06, + "loss": 1.8165, + "step": 5469 + }, + { + "epoch": 0.8825427557276541, + "grad_norm": 4.501655101776123, + "learning_rate": 3.6820268209529328e-06, + "loss": 1.9522, + "step": 5470 + }, + { + "epoch": 0.88270409809616, + "grad_norm": 3.9741098880767822, + "learning_rate": 3.672192398797858e-06, + "loss": 1.8075, + "step": 5471 + }, + { + "epoch": 0.882865440464666, + "grad_norm": 4.133610248565674, + "learning_rate": 3.662370627021855e-06, + "loss": 1.9183, + "step": 5472 + }, + { + "epoch": 0.883026782833172, + "grad_norm": 4.6065802574157715, + "learning_rate": 3.652561508306912e-06, + "loss": 1.9396, + "step": 5473 + }, + { + "epoch": 0.883188125201678, + "grad_norm": 4.168022155761719, + "learning_rate": 3.642765045331503e-06, + "loss": 1.5395, + "step": 5474 + }, + { + "epoch": 0.883349467570184, + "grad_norm": 3.8084218502044678, + "learning_rate": 3.6329812407706885e-06, + "loss": 1.7228, + "step": 5475 + }, + { + "epoch": 0.8835108099386899, + "grad_norm": 5.351351261138916, + "learning_rate": 3.6232100972960427e-06, + "loss": 1.6885, + "step": 5476 + }, + { + "epoch": 0.8836721523071959, + "grad_norm": 3.873673915863037, + "learning_rate": 3.6134516175757193e-06, + "loss": 1.7994, + "step": 5477 + }, + { + "epoch": 0.8838334946757018, + "grad_norm": 4.273514270782471, + "learning_rate": 3.603705804274371e-06, + "loss": 1.8824, + "step": 5478 + }, + { + "epoch": 0.8839948370442078, + "grad_norm": 4.087368965148926, + "learning_rate": 3.593972660053219e-06, + "loss": 2.0012, + "step": 5479 + }, + { + "epoch": 0.8841561794127137, + "grad_norm": 4.9953107833862305, + "learning_rate": 3.5842521875700197e-06, + "loss": 2.0025, + "step": 5480 + }, + { + "epoch": 0.8843175217812197, + "grad_norm": 4.712258338928223, + "learning_rate": 3.574544389479062e-06, + "loss": 1.9964, + "step": 5481 + }, + { + "epoch": 0.8844788641497258, + "grad_norm": 5.13985538482666, + "learning_rate": 3.564849268431192e-06, + "loss": 1.8126, + "step": 5482 + }, + { + "epoch": 0.8846402065182317, + "grad_norm": 7.525959491729736, + "learning_rate": 3.5551668270737638e-06, + "loss": 1.9318, + "step": 5483 + }, + { + "epoch": 0.8848015488867377, + "grad_norm": 4.160379886627197, + "learning_rate": 3.545497068050713e-06, + "loss": 1.952, + "step": 5484 + }, + { + "epoch": 0.8849628912552436, + "grad_norm": 3.685530662536621, + "learning_rate": 3.5358399940024544e-06, + "loss": 1.5698, + "step": 5485 + }, + { + "epoch": 0.8851242336237496, + "grad_norm": 4.459519863128662, + "learning_rate": 3.526195607566002e-06, + "loss": 1.9705, + "step": 5486 + }, + { + "epoch": 0.8852855759922555, + "grad_norm": 3.8491642475128174, + "learning_rate": 3.516563911374865e-06, + "loss": 1.824, + "step": 5487 + }, + { + "epoch": 0.8854469183607615, + "grad_norm": 4.472646236419678, + "learning_rate": 3.50694490805909e-06, + "loss": 1.7509, + "step": 5488 + }, + { + "epoch": 0.8856082607292675, + "grad_norm": 4.286990165710449, + "learning_rate": 3.4973386002452535e-06, + "loss": 1.8416, + "step": 5489 + }, + { + "epoch": 0.8857696030977735, + "grad_norm": 3.676454782485962, + "learning_rate": 3.487744990556502e-06, + "loss": 1.8689, + "step": 5490 + }, + { + "epoch": 0.8859309454662795, + "grad_norm": 4.483414173126221, + "learning_rate": 3.478164081612478e-06, + "loss": 1.7064, + "step": 5491 + }, + { + "epoch": 0.8860922878347854, + "grad_norm": 3.92512845993042, + "learning_rate": 3.468595876029357e-06, + "loss": 1.8683, + "step": 5492 + }, + { + "epoch": 0.8862536302032914, + "grad_norm": 3.9366745948791504, + "learning_rate": 3.4590403764198753e-06, + "loss": 1.6761, + "step": 5493 + }, + { + "epoch": 0.8864149725717974, + "grad_norm": 4.361815929412842, + "learning_rate": 3.4494975853932577e-06, + "loss": 2.0082, + "step": 5494 + }, + { + "epoch": 0.8865763149403033, + "grad_norm": 4.422248363494873, + "learning_rate": 3.4399675055552973e-06, + "loss": 1.8268, + "step": 5495 + }, + { + "epoch": 0.8867376573088093, + "grad_norm": 4.271640777587891, + "learning_rate": 3.43045013950829e-06, + "loss": 1.8297, + "step": 5496 + }, + { + "epoch": 0.8868989996773152, + "grad_norm": 3.809992551803589, + "learning_rate": 3.420945489851085e-06, + "loss": 2.0953, + "step": 5497 + }, + { + "epoch": 0.8870603420458212, + "grad_norm": 4.003145694732666, + "learning_rate": 3.411453559179023e-06, + "loss": 1.6517, + "step": 5498 + }, + { + "epoch": 0.8872216844143272, + "grad_norm": 4.99307918548584, + "learning_rate": 3.4019743500840084e-06, + "loss": 1.7525, + "step": 5499 + }, + { + "epoch": 0.8873830267828332, + "grad_norm": 4.213025093078613, + "learning_rate": 3.3925078651544486e-06, + "loss": 1.8423, + "step": 5500 + }, + { + "epoch": 0.8875443691513392, + "grad_norm": 4.226743698120117, + "learning_rate": 3.383054106975292e-06, + "loss": 1.7812, + "step": 5501 + }, + { + "epoch": 0.8877057115198451, + "grad_norm": 5.146629333496094, + "learning_rate": 3.373613078128002e-06, + "loss": 2.0277, + "step": 5502 + }, + { + "epoch": 0.8878670538883511, + "grad_norm": 5.330268859863281, + "learning_rate": 3.364184781190549e-06, + "loss": 1.8118, + "step": 5503 + }, + { + "epoch": 0.888028396256857, + "grad_norm": 4.649377822875977, + "learning_rate": 3.3547692187374747e-06, + "loss": 1.7216, + "step": 5504 + }, + { + "epoch": 0.888189738625363, + "grad_norm": 5.103134632110596, + "learning_rate": 3.3453663933397938e-06, + "loss": 1.893, + "step": 5505 + }, + { + "epoch": 0.888351080993869, + "grad_norm": 4.233748912811279, + "learning_rate": 3.335976307565075e-06, + "loss": 1.7237, + "step": 5506 + }, + { + "epoch": 0.888512423362375, + "grad_norm": 4.579054832458496, + "learning_rate": 3.3265989639773953e-06, + "loss": 1.8852, + "step": 5507 + }, + { + "epoch": 0.888673765730881, + "grad_norm": 3.687269926071167, + "learning_rate": 3.3172343651373504e-06, + "loss": 1.7031, + "step": 5508 + }, + { + "epoch": 0.8888351080993869, + "grad_norm": 4.788128852844238, + "learning_rate": 3.307882513602051e-06, + "loss": 2.1332, + "step": 5509 + }, + { + "epoch": 0.8889964504678929, + "grad_norm": 4.407412528991699, + "learning_rate": 3.298543411925159e-06, + "loss": 2.054, + "step": 5510 + }, + { + "epoch": 0.8891577928363988, + "grad_norm": 5.501307487487793, + "learning_rate": 3.289217062656802e-06, + "loss": 1.9332, + "step": 5511 + }, + { + "epoch": 0.8893191352049048, + "grad_norm": 4.539480686187744, + "learning_rate": 3.2799034683436815e-06, + "loss": 1.6599, + "step": 5512 + }, + { + "epoch": 0.8894804775734108, + "grad_norm": 3.469583511352539, + "learning_rate": 3.2706026315289682e-06, + "loss": 1.8929, + "step": 5513 + }, + { + "epoch": 0.8896418199419167, + "grad_norm": 4.4009528160095215, + "learning_rate": 3.2613145547523928e-06, + "loss": 1.8845, + "step": 5514 + }, + { + "epoch": 0.8898031623104227, + "grad_norm": 4.314871311187744, + "learning_rate": 3.2520392405501644e-06, + "loss": 1.7426, + "step": 5515 + }, + { + "epoch": 0.8899645046789287, + "grad_norm": 3.7380380630493164, + "learning_rate": 3.242776691455013e-06, + "loss": 1.9838, + "step": 5516 + }, + { + "epoch": 0.8901258470474347, + "grad_norm": 6.570891380310059, + "learning_rate": 3.2335269099962096e-06, + "loss": 1.7812, + "step": 5517 + }, + { + "epoch": 0.8902871894159406, + "grad_norm": 3.98508882522583, + "learning_rate": 3.2242898986995063e-06, + "loss": 1.9104, + "step": 5518 + }, + { + "epoch": 0.8904485317844466, + "grad_norm": 3.6875693798065186, + "learning_rate": 3.215065660087202e-06, + "loss": 1.7309, + "step": 5519 + }, + { + "epoch": 0.8906098741529526, + "grad_norm": 4.245000839233398, + "learning_rate": 3.205854196678071e-06, + "loss": 1.9901, + "step": 5520 + }, + { + "epoch": 0.8907712165214585, + "grad_norm": 4.279839992523193, + "learning_rate": 3.1966555109874287e-06, + "loss": 1.6667, + "step": 5521 + }, + { + "epoch": 0.8909325588899645, + "grad_norm": 3.916095495223999, + "learning_rate": 3.1874696055270715e-06, + "loss": 2.0707, + "step": 5522 + }, + { + "epoch": 0.8910939012584704, + "grad_norm": 4.966620445251465, + "learning_rate": 3.178296482805354e-06, + "loss": 1.7666, + "step": 5523 + }, + { + "epoch": 0.8912552436269764, + "grad_norm": 3.7445385456085205, + "learning_rate": 3.169136145327084e-06, + "loss": 2.0391, + "step": 5524 + }, + { + "epoch": 0.8914165859954825, + "grad_norm": 4.792840957641602, + "learning_rate": 3.159988595593616e-06, + "loss": 1.8792, + "step": 5525 + }, + { + "epoch": 0.8915779283639884, + "grad_norm": 5.726657867431641, + "learning_rate": 3.150853836102802e-06, + "loss": 1.8947, + "step": 5526 + }, + { + "epoch": 0.8917392707324944, + "grad_norm": 4.585726261138916, + "learning_rate": 3.141731869348996e-06, + "loss": 2.1522, + "step": 5527 + }, + { + "epoch": 0.8919006131010003, + "grad_norm": 5.255412578582764, + "learning_rate": 3.1326226978230678e-06, + "loss": 2.0892, + "step": 5528 + }, + { + "epoch": 0.8920619554695063, + "grad_norm": 4.796871662139893, + "learning_rate": 3.1235263240123824e-06, + "loss": 1.9708, + "step": 5529 + }, + { + "epoch": 0.8922232978380122, + "grad_norm": 4.137606143951416, + "learning_rate": 3.1144427504008254e-06, + "loss": 1.7527, + "step": 5530 + }, + { + "epoch": 0.8923846402065182, + "grad_norm": 4.873930931091309, + "learning_rate": 3.105371979468763e-06, + "loss": 1.7856, + "step": 5531 + }, + { + "epoch": 0.8925459825750242, + "grad_norm": 5.601640701293945, + "learning_rate": 3.096314013693108e-06, + "loss": 1.8228, + "step": 5532 + }, + { + "epoch": 0.8927073249435302, + "grad_norm": 5.193307399749756, + "learning_rate": 3.087268855547221e-06, + "loss": 1.9018, + "step": 5533 + }, + { + "epoch": 0.8928686673120362, + "grad_norm": 4.120820999145508, + "learning_rate": 3.0782365075010145e-06, + "loss": 1.7555, + "step": 5534 + }, + { + "epoch": 0.8930300096805421, + "grad_norm": 4.9706902503967285, + "learning_rate": 3.069216972020866e-06, + "loss": 2.0308, + "step": 5535 + }, + { + "epoch": 0.8931913520490481, + "grad_norm": 4.47065544128418, + "learning_rate": 3.0602102515696953e-06, + "loss": 1.6246, + "step": 5536 + }, + { + "epoch": 0.893352694417554, + "grad_norm": 3.629650354385376, + "learning_rate": 3.0512163486068666e-06, + "loss": 1.8044, + "step": 5537 + }, + { + "epoch": 0.89351403678606, + "grad_norm": 5.262669086456299, + "learning_rate": 3.0422352655883057e-06, + "loss": 2.0888, + "step": 5538 + }, + { + "epoch": 0.893675379154566, + "grad_norm": 4.949001312255859, + "learning_rate": 3.0332670049663837e-06, + "loss": 1.7839, + "step": 5539 + }, + { + "epoch": 0.8938367215230719, + "grad_norm": 4.028216361999512, + "learning_rate": 3.0243115691900136e-06, + "loss": 1.7119, + "step": 5540 + }, + { + "epoch": 0.8939980638915779, + "grad_norm": 4.334787845611572, + "learning_rate": 3.0153689607045845e-06, + "loss": 1.7027, + "step": 5541 + }, + { + "epoch": 0.8941594062600839, + "grad_norm": 4.003933906555176, + "learning_rate": 3.00643918195197e-06, + "loss": 1.7413, + "step": 5542 + }, + { + "epoch": 0.8943207486285899, + "grad_norm": 4.258213996887207, + "learning_rate": 2.9975222353705756e-06, + "loss": 1.6041, + "step": 5543 + }, + { + "epoch": 0.8944820909970959, + "grad_norm": 4.005317687988281, + "learning_rate": 2.988618123395276e-06, + "loss": 1.9746, + "step": 5544 + }, + { + "epoch": 0.8946434333656018, + "grad_norm": 5.412639617919922, + "learning_rate": 2.979726848457437e-06, + "loss": 1.7949, + "step": 5545 + }, + { + "epoch": 0.8948047757341078, + "grad_norm": 3.905019760131836, + "learning_rate": 2.9708484129849556e-06, + "loss": 1.715, + "step": 5546 + }, + { + "epoch": 0.8949661181026137, + "grad_norm": 4.405820846557617, + "learning_rate": 2.9619828194021816e-06, + "loss": 1.7162, + "step": 5547 + }, + { + "epoch": 0.8951274604711197, + "grad_norm": 3.744107484817505, + "learning_rate": 2.953130070129967e-06, + "loss": 1.6591, + "step": 5548 + }, + { + "epoch": 0.8952888028396256, + "grad_norm": 4.333800315856934, + "learning_rate": 2.944290167585684e-06, + "loss": 1.6746, + "step": 5549 + }, + { + "epoch": 0.8954501452081317, + "grad_norm": 4.936779975891113, + "learning_rate": 2.9354631141831623e-06, + "loss": 1.8484, + "step": 5550 + }, + { + "epoch": 0.8956114875766377, + "grad_norm": 3.6195812225341797, + "learning_rate": 2.9266489123327468e-06, + "loss": 1.9801, + "step": 5551 + }, + { + "epoch": 0.8957728299451436, + "grad_norm": 6.064610004425049, + "learning_rate": 2.9178475644412563e-06, + "loss": 2.0025, + "step": 5552 + }, + { + "epoch": 0.8959341723136496, + "grad_norm": 4.333761215209961, + "learning_rate": 2.909059072912018e-06, + "loss": 1.8259, + "step": 5553 + }, + { + "epoch": 0.8960955146821555, + "grad_norm": 6.1533894538879395, + "learning_rate": 2.9002834401448296e-06, + "loss": 1.6982, + "step": 5554 + }, + { + "epoch": 0.8962568570506615, + "grad_norm": 6.941343784332275, + "learning_rate": 2.8915206685359798e-06, + "loss": 1.8972, + "step": 5555 + }, + { + "epoch": 0.8964181994191674, + "grad_norm": 6.984273910522461, + "learning_rate": 2.8827707604782704e-06, + "loss": 1.7878, + "step": 5556 + }, + { + "epoch": 0.8965795417876734, + "grad_norm": 4.510810852050781, + "learning_rate": 2.8740337183609466e-06, + "loss": 1.9487, + "step": 5557 + }, + { + "epoch": 0.8967408841561794, + "grad_norm": 4.577576160430908, + "learning_rate": 2.865309544569794e-06, + "loss": 2.0792, + "step": 5558 + }, + { + "epoch": 0.8969022265246854, + "grad_norm": 3.6346867084503174, + "learning_rate": 2.8565982414870297e-06, + "loss": 1.8338, + "step": 5559 + }, + { + "epoch": 0.8970635688931914, + "grad_norm": 4.3393449783325195, + "learning_rate": 2.8478998114914004e-06, + "loss": 1.8608, + "step": 5560 + }, + { + "epoch": 0.8972249112616973, + "grad_norm": 4.22618293762207, + "learning_rate": 2.839214256958106e-06, + "loss": 1.7635, + "step": 5561 + }, + { + "epoch": 0.8973862536302033, + "grad_norm": 5.2830023765563965, + "learning_rate": 2.8305415802588608e-06, + "loss": 1.6503, + "step": 5562 + }, + { + "epoch": 0.8975475959987093, + "grad_norm": 4.610961437225342, + "learning_rate": 2.8218817837618317e-06, + "loss": 1.9631, + "step": 5563 + }, + { + "epoch": 0.8977089383672152, + "grad_norm": 4.6389570236206055, + "learning_rate": 2.8132348698316934e-06, + "loss": 1.8174, + "step": 5564 + }, + { + "epoch": 0.8978702807357212, + "grad_norm": 4.900999546051025, + "learning_rate": 2.804600840829574e-06, + "loss": 1.8821, + "step": 5565 + }, + { + "epoch": 0.8980316231042271, + "grad_norm": 4.001184463500977, + "learning_rate": 2.795979699113127e-06, + "loss": 1.5665, + "step": 5566 + }, + { + "epoch": 0.8981929654727331, + "grad_norm": 4.928239822387695, + "learning_rate": 2.7873714470364466e-06, + "loss": 1.8012, + "step": 5567 + }, + { + "epoch": 0.8983543078412392, + "grad_norm": 4.311793804168701, + "learning_rate": 2.7787760869501133e-06, + "loss": 1.7901, + "step": 5568 + }, + { + "epoch": 0.8985156502097451, + "grad_norm": 5.386144638061523, + "learning_rate": 2.770193621201217e-06, + "loss": 1.6103, + "step": 5569 + }, + { + "epoch": 0.8986769925782511, + "grad_norm": 8.556854248046875, + "learning_rate": 2.7616240521332882e-06, + "loss": 1.7432, + "step": 5570 + }, + { + "epoch": 0.898838334946757, + "grad_norm": 4.126765727996826, + "learning_rate": 2.7530673820863715e-06, + "loss": 2.3162, + "step": 5571 + }, + { + "epoch": 0.898999677315263, + "grad_norm": 4.71306037902832, + "learning_rate": 2.744523613396954e-06, + "loss": 1.8745, + "step": 5572 + }, + { + "epoch": 0.8991610196837689, + "grad_norm": 4.547961711883545, + "learning_rate": 2.735992748398025e-06, + "loss": 1.7934, + "step": 5573 + }, + { + "epoch": 0.8993223620522749, + "grad_norm": 4.691380023956299, + "learning_rate": 2.727474789419038e-06, + "loss": 1.7966, + "step": 5574 + }, + { + "epoch": 0.8994837044207808, + "grad_norm": 6.193465709686279, + "learning_rate": 2.718969738785937e-06, + "loss": 1.9246, + "step": 5575 + }, + { + "epoch": 0.8996450467892869, + "grad_norm": 4.564996242523193, + "learning_rate": 2.7104775988211205e-06, + "loss": 1.7162, + "step": 5576 + }, + { + "epoch": 0.8998063891577929, + "grad_norm": 4.405948162078857, + "learning_rate": 2.701998371843478e-06, + "loss": 1.8958, + "step": 5577 + }, + { + "epoch": 0.8999677315262988, + "grad_norm": 4.4796624183654785, + "learning_rate": 2.6935320601683634e-06, + "loss": 1.7776, + "step": 5578 + }, + { + "epoch": 0.9001290738948048, + "grad_norm": 4.329618453979492, + "learning_rate": 2.6850786661076044e-06, + "loss": 1.7769, + "step": 5579 + }, + { + "epoch": 0.9002904162633107, + "grad_norm": 3.8581998348236084, + "learning_rate": 2.676638191969516e-06, + "loss": 1.7699, + "step": 5580 + }, + { + "epoch": 0.9004517586318167, + "grad_norm": 4.222093105316162, + "learning_rate": 2.6682106400588546e-06, + "loss": 1.9077, + "step": 5581 + }, + { + "epoch": 0.9006131010003227, + "grad_norm": 5.26242733001709, + "learning_rate": 2.6597960126768906e-06, + "loss": 1.9529, + "step": 5582 + }, + { + "epoch": 0.9007744433688286, + "grad_norm": 4.077167510986328, + "learning_rate": 2.65139431212133e-06, + "loss": 1.9152, + "step": 5583 + }, + { + "epoch": 0.9009357857373346, + "grad_norm": 4.203107833862305, + "learning_rate": 2.6430055406863607e-06, + "loss": 1.5174, + "step": 5584 + }, + { + "epoch": 0.9010971281058406, + "grad_norm": 4.102605819702148, + "learning_rate": 2.6346297006626274e-06, + "loss": 1.9124, + "step": 5585 + }, + { + "epoch": 0.9012584704743466, + "grad_norm": 4.139077186584473, + "learning_rate": 2.6262667943372845e-06, + "loss": 1.8356, + "step": 5586 + }, + { + "epoch": 0.9014198128428526, + "grad_norm": 4.112833023071289, + "learning_rate": 2.617916823993899e-06, + "loss": 1.7165, + "step": 5587 + }, + { + "epoch": 0.9015811552113585, + "grad_norm": 4.986573696136475, + "learning_rate": 2.609579791912553e-06, + "loss": 2.0159, + "step": 5588 + }, + { + "epoch": 0.9017424975798645, + "grad_norm": 5.297725677490234, + "learning_rate": 2.601255700369765e-06, + "loss": 1.7165, + "step": 5589 + }, + { + "epoch": 0.9019038399483704, + "grad_norm": 5.899139404296875, + "learning_rate": 2.592944551638543e-06, + "loss": 2.0275, + "step": 5590 + }, + { + "epoch": 0.9020651823168764, + "grad_norm": 3.807053804397583, + "learning_rate": 2.5846463479883344e-06, + "loss": 1.9027, + "step": 5591 + }, + { + "epoch": 0.9022265246853823, + "grad_norm": 4.398458003997803, + "learning_rate": 2.57636109168507e-06, + "loss": 1.7302, + "step": 5592 + }, + { + "epoch": 0.9023878670538884, + "grad_norm": 4.744655609130859, + "learning_rate": 2.5680887849911463e-06, + "loss": 1.84, + "step": 5593 + }, + { + "epoch": 0.9025492094223944, + "grad_norm": 3.9637608528137207, + "learning_rate": 2.5598294301654114e-06, + "loss": 1.8567, + "step": 5594 + }, + { + "epoch": 0.9027105517909003, + "grad_norm": 4.358543395996094, + "learning_rate": 2.5515830294631894e-06, + "loss": 1.7194, + "step": 5595 + }, + { + "epoch": 0.9028718941594063, + "grad_norm": 5.891459941864014, + "learning_rate": 2.5433495851362567e-06, + "loss": 1.7255, + "step": 5596 + }, + { + "epoch": 0.9030332365279122, + "grad_norm": 4.939186096191406, + "learning_rate": 2.53512909943287e-06, + "loss": 1.8839, + "step": 5597 + }, + { + "epoch": 0.9031945788964182, + "grad_norm": 4.955162525177002, + "learning_rate": 2.5269215745977126e-06, + "loss": 1.8185, + "step": 5598 + }, + { + "epoch": 0.9033559212649241, + "grad_norm": 4.159775733947754, + "learning_rate": 2.518727012871974e-06, + "loss": 1.9379, + "step": 5599 + }, + { + "epoch": 0.9035172636334301, + "grad_norm": 5.024981498718262, + "learning_rate": 2.5105454164932594e-06, + "loss": 1.8584, + "step": 5600 + }, + { + "epoch": 0.9036786060019361, + "grad_norm": 3.6600778102874756, + "learning_rate": 2.5023767876956704e-06, + "loss": 1.907, + "step": 5601 + }, + { + "epoch": 0.9038399483704421, + "grad_norm": 5.974171161651611, + "learning_rate": 2.494221128709745e-06, + "loss": 1.736, + "step": 5602 + }, + { + "epoch": 0.9040012907389481, + "grad_norm": 5.048582553863525, + "learning_rate": 2.4860784417624904e-06, + "loss": 2.0173, + "step": 5603 + }, + { + "epoch": 0.904162633107454, + "grad_norm": 5.431088924407959, + "learning_rate": 2.4779487290773617e-06, + "loss": 1.8394, + "step": 5604 + }, + { + "epoch": 0.90432397547596, + "grad_norm": 4.85250997543335, + "learning_rate": 2.469831992874272e-06, + "loss": 1.8768, + "step": 5605 + }, + { + "epoch": 0.904485317844466, + "grad_norm": 3.9703378677368164, + "learning_rate": 2.4617282353696093e-06, + "loss": 1.8982, + "step": 5606 + }, + { + "epoch": 0.9046466602129719, + "grad_norm": 4.356378078460693, + "learning_rate": 2.4536374587761924e-06, + "loss": 1.6529, + "step": 5607 + }, + { + "epoch": 0.9048080025814779, + "grad_norm": 4.12722635269165, + "learning_rate": 2.445559665303321e-06, + "loss": 1.7909, + "step": 5608 + }, + { + "epoch": 0.9049693449499838, + "grad_norm": 4.801215648651123, + "learning_rate": 2.4374948571567246e-06, + "loss": 1.8611, + "step": 5609 + }, + { + "epoch": 0.9051306873184899, + "grad_norm": 3.9699883460998535, + "learning_rate": 2.429443036538609e-06, + "loss": 1.9078, + "step": 5610 + }, + { + "epoch": 0.9052920296869958, + "grad_norm": 4.241724491119385, + "learning_rate": 2.4214042056476093e-06, + "loss": 2.0308, + "step": 5611 + }, + { + "epoch": 0.9054533720555018, + "grad_norm": 3.783005714416504, + "learning_rate": 2.4133783666788424e-06, + "loss": 1.8767, + "step": 5612 + }, + { + "epoch": 0.9056147144240078, + "grad_norm": 5.081392288208008, + "learning_rate": 2.4053655218238493e-06, + "loss": 2.0387, + "step": 5613 + }, + { + "epoch": 0.9057760567925137, + "grad_norm": 5.225015640258789, + "learning_rate": 2.397365673270646e-06, + "loss": 1.7805, + "step": 5614 + }, + { + "epoch": 0.9059373991610197, + "grad_norm": 4.1674981117248535, + "learning_rate": 2.389378823203681e-06, + "loss": 1.8708, + "step": 5615 + }, + { + "epoch": 0.9060987415295256, + "grad_norm": 3.9809725284576416, + "learning_rate": 2.3814049738038744e-06, + "loss": 1.7738, + "step": 5616 + }, + { + "epoch": 0.9062600838980316, + "grad_norm": 3.365668296813965, + "learning_rate": 2.373444127248581e-06, + "loss": 1.8897, + "step": 5617 + }, + { + "epoch": 0.9064214262665375, + "grad_norm": 3.308582067489624, + "learning_rate": 2.3654962857115937e-06, + "loss": 1.8113, + "step": 5618 + }, + { + "epoch": 0.9065827686350436, + "grad_norm": 3.76804780960083, + "learning_rate": 2.3575614513631884e-06, + "loss": 1.7842, + "step": 5619 + }, + { + "epoch": 0.9067441110035496, + "grad_norm": 3.941070079803467, + "learning_rate": 2.3496396263700482e-06, + "loss": 1.7475, + "step": 5620 + }, + { + "epoch": 0.9069054533720555, + "grad_norm": 4.967513084411621, + "learning_rate": 2.3417308128953485e-06, + "loss": 1.9079, + "step": 5621 + }, + { + "epoch": 0.9070667957405615, + "grad_norm": 4.02778434753418, + "learning_rate": 2.333835013098673e-06, + "loss": 1.6699, + "step": 5622 + }, + { + "epoch": 0.9072281381090674, + "grad_norm": 3.6762659549713135, + "learning_rate": 2.3259522291360747e-06, + "loss": 1.8003, + "step": 5623 + }, + { + "epoch": 0.9073894804775734, + "grad_norm": 3.978619337081909, + "learning_rate": 2.318082463160032e-06, + "loss": 1.6767, + "step": 5624 + }, + { + "epoch": 0.9075508228460794, + "grad_norm": 5.992420196533203, + "learning_rate": 2.3102257173194974e-06, + "loss": 1.8987, + "step": 5625 + }, + { + "epoch": 0.9077121652145853, + "grad_norm": 4.104101657867432, + "learning_rate": 2.302381993759839e-06, + "loss": 1.7226, + "step": 5626 + }, + { + "epoch": 0.9078735075830913, + "grad_norm": 5.735623359680176, + "learning_rate": 2.2945512946228984e-06, + "loss": 2.1269, + "step": 5627 + }, + { + "epoch": 0.9080348499515973, + "grad_norm": 4.954807758331299, + "learning_rate": 2.286733622046927e-06, + "loss": 1.8843, + "step": 5628 + }, + { + "epoch": 0.9081961923201033, + "grad_norm": 4.75136661529541, + "learning_rate": 2.27892897816665e-06, + "loss": 1.5943, + "step": 5629 + }, + { + "epoch": 0.9083575346886092, + "grad_norm": 3.810737371444702, + "learning_rate": 2.271137365113213e-06, + "loss": 1.6648, + "step": 5630 + }, + { + "epoch": 0.9085188770571152, + "grad_norm": 3.4491994380950928, + "learning_rate": 2.2633587850142133e-06, + "loss": 1.7016, + "step": 5631 + }, + { + "epoch": 0.9086802194256212, + "grad_norm": 5.240142345428467, + "learning_rate": 2.2555932399936973e-06, + "loss": 1.8236, + "step": 5632 + }, + { + "epoch": 0.9088415617941271, + "grad_norm": 4.1295061111450195, + "learning_rate": 2.2478407321721296e-06, + "loss": 1.6504, + "step": 5633 + }, + { + "epoch": 0.9090029041626331, + "grad_norm": 8.141188621520996, + "learning_rate": 2.2401012636664387e-06, + "loss": 1.9291, + "step": 5634 + }, + { + "epoch": 0.909164246531139, + "grad_norm": 3.8959414958953857, + "learning_rate": 2.2323748365899675e-06, + "loss": 1.7571, + "step": 5635 + }, + { + "epoch": 0.9093255888996451, + "grad_norm": 4.677822589874268, + "learning_rate": 2.2246614530525346e-06, + "loss": 1.8032, + "step": 5636 + }, + { + "epoch": 0.909486931268151, + "grad_norm": 5.629193305969238, + "learning_rate": 2.216961115160354e-06, + "loss": 1.7274, + "step": 5637 + }, + { + "epoch": 0.909648273636657, + "grad_norm": 5.022723197937012, + "learning_rate": 2.2092738250161114e-06, + "loss": 1.7807, + "step": 5638 + }, + { + "epoch": 0.909809616005163, + "grad_norm": 8.506092071533203, + "learning_rate": 2.2015995847189107e-06, + "loss": 1.8648, + "step": 5639 + }, + { + "epoch": 0.9099709583736689, + "grad_norm": 4.457287788391113, + "learning_rate": 2.1939383963642867e-06, + "loss": 1.7594, + "step": 5640 + }, + { + "epoch": 0.9101323007421749, + "grad_norm": 4.704784393310547, + "learning_rate": 2.1862902620442437e-06, + "loss": 2.1164, + "step": 5641 + }, + { + "epoch": 0.9102936431106808, + "grad_norm": 5.191957950592041, + "learning_rate": 2.178655183847189e-06, + "loss": 1.8367, + "step": 5642 + }, + { + "epoch": 0.9104549854791868, + "grad_norm": 4.328970432281494, + "learning_rate": 2.1710331638579717e-06, + "loss": 1.8905, + "step": 5643 + }, + { + "epoch": 0.9106163278476928, + "grad_norm": 5.440768718719482, + "learning_rate": 2.1634242041578713e-06, + "loss": 1.5234, + "step": 5644 + }, + { + "epoch": 0.9107776702161988, + "grad_norm": 4.354081153869629, + "learning_rate": 2.1558283068246253e-06, + "loss": 2.1624, + "step": 5645 + }, + { + "epoch": 0.9109390125847048, + "grad_norm": 4.221670627593994, + "learning_rate": 2.1482454739323755e-06, + "loss": 1.7389, + "step": 5646 + }, + { + "epoch": 0.9111003549532107, + "grad_norm": 8.989388465881348, + "learning_rate": 2.1406757075517147e-06, + "loss": 1.8681, + "step": 5647 + }, + { + "epoch": 0.9112616973217167, + "grad_norm": 6.524087905883789, + "learning_rate": 2.133119009749651e-06, + "loss": 1.8335, + "step": 5648 + }, + { + "epoch": 0.9114230396902226, + "grad_norm": 4.2385711669921875, + "learning_rate": 2.1255753825896453e-06, + "loss": 1.5785, + "step": 5649 + }, + { + "epoch": 0.9115843820587286, + "grad_norm": 5.5846147537231445, + "learning_rate": 2.1180448281315657e-06, + "loss": 1.9489, + "step": 5650 + }, + { + "epoch": 0.9117457244272346, + "grad_norm": 5.0407304763793945, + "learning_rate": 2.11052734843174e-06, + "loss": 1.937, + "step": 5651 + }, + { + "epoch": 0.9119070667957405, + "grad_norm": 4.131408214569092, + "learning_rate": 2.1030229455428928e-06, + "loss": 1.8004, + "step": 5652 + }, + { + "epoch": 0.9120684091642466, + "grad_norm": 4.67261266708374, + "learning_rate": 2.0955316215142074e-06, + "loss": 2.0531, + "step": 5653 + }, + { + "epoch": 0.9122297515327525, + "grad_norm": 3.847583532333374, + "learning_rate": 2.088053378391269e-06, + "loss": 1.6934, + "step": 5654 + }, + { + "epoch": 0.9123910939012585, + "grad_norm": 3.829157829284668, + "learning_rate": 2.0805882182161063e-06, + "loss": 1.7926, + "step": 5655 + }, + { + "epoch": 0.9125524362697645, + "grad_norm": 4.377475261688232, + "learning_rate": 2.0731361430271877e-06, + "loss": 1.7305, + "step": 5656 + }, + { + "epoch": 0.9127137786382704, + "grad_norm": 4.612732887268066, + "learning_rate": 2.065697154859375e-06, + "loss": 2.0563, + "step": 5657 + }, + { + "epoch": 0.9128751210067764, + "grad_norm": 4.503220558166504, + "learning_rate": 2.0582712557439874e-06, + "loss": 1.7823, + "step": 5658 + }, + { + "epoch": 0.9130364633752823, + "grad_norm": 4.670096397399902, + "learning_rate": 2.050858447708759e-06, + "loss": 2.0674, + "step": 5659 + }, + { + "epoch": 0.9131978057437883, + "grad_norm": 6.374212265014648, + "learning_rate": 2.043458732777831e-06, + "loss": 1.7429, + "step": 5660 + }, + { + "epoch": 0.9133591481122942, + "grad_norm": 3.732630491256714, + "learning_rate": 2.0360721129718152e-06, + "loss": 1.9587, + "step": 5661 + }, + { + "epoch": 0.9135204904808003, + "grad_norm": 4.829896926879883, + "learning_rate": 2.028698590307698e-06, + "loss": 1.8403, + "step": 5662 + }, + { + "epoch": 0.9136818328493063, + "grad_norm": 3.5910189151763916, + "learning_rate": 2.021338166798914e-06, + "loss": 1.7132, + "step": 5663 + }, + { + "epoch": 0.9138431752178122, + "grad_norm": 5.4957733154296875, + "learning_rate": 2.0139908444553267e-06, + "loss": 1.9427, + "step": 5664 + }, + { + "epoch": 0.9140045175863182, + "grad_norm": 5.186479091644287, + "learning_rate": 2.0066566252831986e-06, + "loss": 2.1578, + "step": 5665 + }, + { + "epoch": 0.9141658599548241, + "grad_norm": 3.928703546524048, + "learning_rate": 1.999335511285244e-06, + "loss": 1.8647, + "step": 5666 + }, + { + "epoch": 0.9143272023233301, + "grad_norm": 4.604649066925049, + "learning_rate": 1.992027504460575e-06, + "loss": 1.753, + "step": 5667 + }, + { + "epoch": 0.914488544691836, + "grad_norm": 4.960216522216797, + "learning_rate": 1.984732606804729e-06, + "loss": 1.8151, + "step": 5668 + }, + { + "epoch": 0.914649887060342, + "grad_norm": 5.813380241394043, + "learning_rate": 1.977450820309684e-06, + "loss": 1.854, + "step": 5669 + }, + { + "epoch": 0.914811229428848, + "grad_norm": 4.861239433288574, + "learning_rate": 1.9701821469637948e-06, + "loss": 1.8343, + "step": 5670 + }, + { + "epoch": 0.914972571797354, + "grad_norm": 4.349196910858154, + "learning_rate": 1.96292658875189e-06, + "loss": 1.7532, + "step": 5671 + }, + { + "epoch": 0.91513391416586, + "grad_norm": 4.05011510848999, + "learning_rate": 1.9556841476551736e-06, + "loss": 2.1045, + "step": 5672 + }, + { + "epoch": 0.9152952565343659, + "grad_norm": 5.115226745605469, + "learning_rate": 1.9484548256512912e-06, + "loss": 2.2221, + "step": 5673 + }, + { + "epoch": 0.9154565989028719, + "grad_norm": 4.569157600402832, + "learning_rate": 1.9412386247142864e-06, + "loss": 2.0785, + "step": 5674 + }, + { + "epoch": 0.9156179412713779, + "grad_norm": 6.013679027557373, + "learning_rate": 1.934035546814644e-06, + "loss": 1.829, + "step": 5675 + }, + { + "epoch": 0.9157792836398838, + "grad_norm": 4.638603210449219, + "learning_rate": 1.9268455939192463e-06, + "loss": 1.7385, + "step": 5676 + }, + { + "epoch": 0.9159406260083898, + "grad_norm": 4.1503520011901855, + "learning_rate": 1.9196687679914062e-06, + "loss": 1.7304, + "step": 5677 + }, + { + "epoch": 0.9161019683768957, + "grad_norm": 5.443770885467529, + "learning_rate": 1.9125050709908387e-06, + "loss": 1.9824, + "step": 5678 + }, + { + "epoch": 0.9162633107454018, + "grad_norm": 4.665947437286377, + "learning_rate": 1.9053545048736744e-06, + "loss": 1.737, + "step": 5679 + }, + { + "epoch": 0.9164246531139078, + "grad_norm": 4.867141246795654, + "learning_rate": 1.8982170715924785e-06, + "loss": 1.9976, + "step": 5680 + }, + { + "epoch": 0.9165859954824137, + "grad_norm": 4.335331916809082, + "learning_rate": 1.8910927730962036e-06, + "loss": 1.8882, + "step": 5681 + }, + { + "epoch": 0.9167473378509197, + "grad_norm": 3.68391489982605, + "learning_rate": 1.8839816113302266e-06, + "loss": 1.574, + "step": 5682 + }, + { + "epoch": 0.9169086802194256, + "grad_norm": 4.229113578796387, + "learning_rate": 1.8768835882363389e-06, + "loss": 1.7778, + "step": 5683 + }, + { + "epoch": 0.9170700225879316, + "grad_norm": 4.200973987579346, + "learning_rate": 1.8697987057527566e-06, + "loss": 1.8908, + "step": 5684 + }, + { + "epoch": 0.9172313649564375, + "grad_norm": 4.999737739562988, + "learning_rate": 1.8627269658140711e-06, + "loss": 1.761, + "step": 5685 + }, + { + "epoch": 0.9173927073249435, + "grad_norm": 4.386737823486328, + "learning_rate": 1.8556683703513267e-06, + "loss": 1.8375, + "step": 5686 + }, + { + "epoch": 0.9175540496934494, + "grad_norm": 5.096742153167725, + "learning_rate": 1.8486229212919481e-06, + "loss": 1.6345, + "step": 5687 + }, + { + "epoch": 0.9177153920619555, + "grad_norm": 5.353132724761963, + "learning_rate": 1.841590620559791e-06, + "loss": 1.7384, + "step": 5688 + }, + { + "epoch": 0.9178767344304615, + "grad_norm": 5.973819732666016, + "learning_rate": 1.8345714700751026e-06, + "loss": 1.8727, + "step": 5689 + }, + { + "epoch": 0.9180380767989674, + "grad_norm": 3.748918056488037, + "learning_rate": 1.827565471754561e-06, + "loss": 1.8188, + "step": 5690 + }, + { + "epoch": 0.9181994191674734, + "grad_norm": 4.631107807159424, + "learning_rate": 1.82057262751123e-06, + "loss": 1.8604, + "step": 5691 + }, + { + "epoch": 0.9183607615359793, + "grad_norm": 5.493433475494385, + "learning_rate": 1.8135929392545993e-06, + "loss": 2.1599, + "step": 5692 + }, + { + "epoch": 0.9185221039044853, + "grad_norm": 3.2908809185028076, + "learning_rate": 1.8066264088905548e-06, + "loss": 1.9088, + "step": 5693 + }, + { + "epoch": 0.9186834462729913, + "grad_norm": 4.060011863708496, + "learning_rate": 1.7996730383213867e-06, + "loss": 1.814, + "step": 5694 + }, + { + "epoch": 0.9188447886414972, + "grad_norm": 3.9401357173919678, + "learning_rate": 1.7927328294458146e-06, + "loss": 1.7475, + "step": 5695 + }, + { + "epoch": 0.9190061310100033, + "grad_norm": 5.0063796043396, + "learning_rate": 1.785805784158928e-06, + "loss": 1.9751, + "step": 5696 + }, + { + "epoch": 0.9191674733785092, + "grad_norm": 4.859718322753906, + "learning_rate": 1.7788919043522646e-06, + "loss": 1.7047, + "step": 5697 + }, + { + "epoch": 0.9193288157470152, + "grad_norm": 3.6432406902313232, + "learning_rate": 1.77199119191373e-06, + "loss": 1.5968, + "step": 5698 + }, + { + "epoch": 0.9194901581155212, + "grad_norm": 4.3131022453308105, + "learning_rate": 1.765103648727645e-06, + "loss": 1.8461, + "step": 5699 + }, + { + "epoch": 0.9196515004840271, + "grad_norm": 4.97066593170166, + "learning_rate": 1.75822927667475e-06, + "loss": 2.1464, + "step": 5700 + }, + { + "epoch": 0.9198128428525331, + "grad_norm": 5.496631622314453, + "learning_rate": 1.751368077632176e-06, + "loss": 2.04, + "step": 5701 + }, + { + "epoch": 0.919974185221039, + "grad_norm": 4.846836566925049, + "learning_rate": 1.7445200534734474e-06, + "loss": 1.9715, + "step": 5702 + }, + { + "epoch": 0.920135527589545, + "grad_norm": 4.323795318603516, + "learning_rate": 1.7376852060685123e-06, + "loss": 1.8746, + "step": 5703 + }, + { + "epoch": 0.9202968699580509, + "grad_norm": 3.578923225402832, + "learning_rate": 1.7308635372837056e-06, + "loss": 1.7163, + "step": 5704 + }, + { + "epoch": 0.920458212326557, + "grad_norm": 3.9475107192993164, + "learning_rate": 1.7240550489817653e-06, + "loss": 1.9772, + "step": 5705 + }, + { + "epoch": 0.920619554695063, + "grad_norm": 4.6500325202941895, + "learning_rate": 1.717259743021843e-06, + "loss": 1.6877, + "step": 5706 + }, + { + "epoch": 0.9207808970635689, + "grad_norm": 3.917170763015747, + "learning_rate": 1.7104776212594653e-06, + "loss": 1.8975, + "step": 5707 + }, + { + "epoch": 0.9209422394320749, + "grad_norm": 4.595219612121582, + "learning_rate": 1.70370868554659e-06, + "loss": 1.7851, + "step": 5708 + }, + { + "epoch": 0.9211035818005808, + "grad_norm": 5.927668571472168, + "learning_rate": 1.6969529377315441e-06, + "loss": 1.9125, + "step": 5709 + }, + { + "epoch": 0.9212649241690868, + "grad_norm": 4.408805847167969, + "learning_rate": 1.6902103796590795e-06, + "loss": 1.7602, + "step": 5710 + }, + { + "epoch": 0.9214262665375927, + "grad_norm": 4.741026401519775, + "learning_rate": 1.6834810131703293e-06, + "loss": 1.557, + "step": 5711 + }, + { + "epoch": 0.9215876089060987, + "grad_norm": 3.8772976398468018, + "learning_rate": 1.6767648401028346e-06, + "loss": 2.0056, + "step": 5712 + }, + { + "epoch": 0.9217489512746048, + "grad_norm": 4.7736897468566895, + "learning_rate": 1.6700618622905228e-06, + "loss": 1.6526, + "step": 5713 + }, + { + "epoch": 0.9219102936431107, + "grad_norm": 3.9673078060150146, + "learning_rate": 1.66337208156373e-06, + "loss": 1.7841, + "step": 5714 + }, + { + "epoch": 0.9220716360116167, + "grad_norm": 3.6450281143188477, + "learning_rate": 1.6566954997491723e-06, + "loss": 1.6754, + "step": 5715 + }, + { + "epoch": 0.9222329783801226, + "grad_norm": 4.685565948486328, + "learning_rate": 1.6500321186699918e-06, + "loss": 1.7412, + "step": 5716 + }, + { + "epoch": 0.9223943207486286, + "grad_norm": 5.0415449142456055, + "learning_rate": 1.6433819401456996e-06, + "loss": 1.7575, + "step": 5717 + }, + { + "epoch": 0.9225556631171346, + "grad_norm": 4.050350189208984, + "learning_rate": 1.6367449659921986e-06, + "loss": 1.5767, + "step": 5718 + }, + { + "epoch": 0.9227170054856405, + "grad_norm": 7.504308223724365, + "learning_rate": 1.6301211980218e-06, + "loss": 1.6768, + "step": 5719 + }, + { + "epoch": 0.9228783478541465, + "grad_norm": 3.741177558898926, + "learning_rate": 1.6235106380432186e-06, + "loss": 1.944, + "step": 5720 + }, + { + "epoch": 0.9230396902226524, + "grad_norm": 4.127954483032227, + "learning_rate": 1.6169132878615322e-06, + "loss": 2.0029, + "step": 5721 + }, + { + "epoch": 0.9232010325911585, + "grad_norm": 5.195093154907227, + "learning_rate": 1.6103291492782391e-06, + "loss": 1.7876, + "step": 5722 + }, + { + "epoch": 0.9233623749596644, + "grad_norm": 6.398956775665283, + "learning_rate": 1.6037582240912175e-06, + "loss": 1.5098, + "step": 5723 + }, + { + "epoch": 0.9235237173281704, + "grad_norm": 6.398956775665283, + "learning_rate": 1.6037582240912175e-06, + "loss": 1.9152, + "step": 5724 + }, + { + "epoch": 0.9236850596966764, + "grad_norm": 4.0053815841674805, + "learning_rate": 1.597200514094732e-06, + "loss": 1.8956, + "step": 5725 + }, + { + "epoch": 0.9238464020651823, + "grad_norm": 4.242628574371338, + "learning_rate": 1.5906560210794562e-06, + "loss": 1.6887, + "step": 5726 + }, + { + "epoch": 0.9240077444336883, + "grad_norm": 4.242440700531006, + "learning_rate": 1.5841247468324383e-06, + "loss": 1.9101, + "step": 5727 + }, + { + "epoch": 0.9241690868021942, + "grad_norm": 5.061077117919922, + "learning_rate": 1.5776066931371348e-06, + "loss": 1.6569, + "step": 5728 + }, + { + "epoch": 0.9243304291707002, + "grad_norm": 4.0543413162231445, + "learning_rate": 1.5711018617733607e-06, + "loss": 1.7956, + "step": 5729 + }, + { + "epoch": 0.9244917715392061, + "grad_norm": 4.903491497039795, + "learning_rate": 1.5646102545173624e-06, + "loss": 1.7056, + "step": 5730 + }, + { + "epoch": 0.9246531139077122, + "grad_norm": 5.305523872375488, + "learning_rate": 1.5581318731417383e-06, + "loss": 1.6872, + "step": 5731 + }, + { + "epoch": 0.9248144562762182, + "grad_norm": 4.152594566345215, + "learning_rate": 1.55166671941549e-06, + "loss": 1.7782, + "step": 5732 + }, + { + "epoch": 0.9249757986447241, + "grad_norm": 5.141536712646484, + "learning_rate": 1.5452147951040163e-06, + "loss": 1.8871, + "step": 5733 + }, + { + "epoch": 0.9251371410132301, + "grad_norm": 3.96051287651062, + "learning_rate": 1.538776101969086e-06, + "loss": 1.8092, + "step": 5734 + }, + { + "epoch": 0.925298483381736, + "grad_norm": 4.150367259979248, + "learning_rate": 1.532350641768876e-06, + "loss": 1.8495, + "step": 5735 + }, + { + "epoch": 0.925459825750242, + "grad_norm": 5.4383416175842285, + "learning_rate": 1.5259384162579216e-06, + "loss": 2.0074, + "step": 5736 + }, + { + "epoch": 0.925621168118748, + "grad_norm": 4.244972229003906, + "learning_rate": 1.519539427187172e-06, + "loss": 1.5395, + "step": 5737 + }, + { + "epoch": 0.9257825104872539, + "grad_norm": 4.363376140594482, + "learning_rate": 1.5131536763039521e-06, + "loss": 1.6113, + "step": 5738 + }, + { + "epoch": 0.92594385285576, + "grad_norm": 4.801346302032471, + "learning_rate": 1.5067811653519558e-06, + "loss": 1.7692, + "step": 5739 + }, + { + "epoch": 0.9261051952242659, + "grad_norm": 6.123290061950684, + "learning_rate": 1.5004218960712802e-06, + "loss": 1.7549, + "step": 5740 + }, + { + "epoch": 0.9262665375927719, + "grad_norm": 5.035151958465576, + "learning_rate": 1.4940758701984136e-06, + "loss": 1.9065, + "step": 5741 + }, + { + "epoch": 0.9264278799612778, + "grad_norm": 4.790508270263672, + "learning_rate": 1.4877430894662036e-06, + "loss": 1.9566, + "step": 5742 + }, + { + "epoch": 0.9265892223297838, + "grad_norm": 4.044414043426514, + "learning_rate": 1.4814235556039003e-06, + "loss": 1.5918, + "step": 5743 + }, + { + "epoch": 0.9267505646982898, + "grad_norm": 4.563947677612305, + "learning_rate": 1.4751172703371342e-06, + "loss": 1.8414, + "step": 5744 + }, + { + "epoch": 0.9269119070667957, + "grad_norm": 4.41463565826416, + "learning_rate": 1.4688242353879e-06, + "loss": 1.8903, + "step": 5745 + }, + { + "epoch": 0.9270732494353017, + "grad_norm": 5.345877170562744, + "learning_rate": 1.4625444524746068e-06, + "loss": 1.9152, + "step": 5746 + }, + { + "epoch": 0.9272345918038076, + "grad_norm": 4.196035385131836, + "learning_rate": 1.4562779233120105e-06, + "loss": 1.6025, + "step": 5747 + }, + { + "epoch": 0.9273959341723137, + "grad_norm": 4.328422546386719, + "learning_rate": 1.4500246496112758e-06, + "loss": 1.7882, + "step": 5748 + }, + { + "epoch": 0.9275572765408197, + "grad_norm": 3.8723225593566895, + "learning_rate": 1.4437846330799255e-06, + "loss": 1.7593, + "step": 5749 + }, + { + "epoch": 0.9277186189093256, + "grad_norm": 4.046195030212402, + "learning_rate": 1.4375578754218855e-06, + "loss": 1.9627, + "step": 5750 + }, + { + "epoch": 0.9278799612778316, + "grad_norm": 5.210872650146484, + "learning_rate": 1.4313443783374404e-06, + "loss": 1.8299, + "step": 5751 + }, + { + "epoch": 0.9280413036463375, + "grad_norm": 4.25077486038208, + "learning_rate": 1.4251441435232659e-06, + "loss": 1.6938, + "step": 5752 + }, + { + "epoch": 0.9282026460148435, + "grad_norm": 3.886307716369629, + "learning_rate": 1.4189571726724082e-06, + "loss": 1.7097, + "step": 5753 + }, + { + "epoch": 0.9283639883833494, + "grad_norm": 4.030497074127197, + "learning_rate": 1.412783467474299e-06, + "loss": 1.8167, + "step": 5754 + }, + { + "epoch": 0.9285253307518554, + "grad_norm": 3.4684436321258545, + "learning_rate": 1.4066230296147454e-06, + "loss": 1.8662, + "step": 5755 + }, + { + "epoch": 0.9286866731203615, + "grad_norm": 4.377821922302246, + "learning_rate": 1.400475860775935e-06, + "loss": 1.9748, + "step": 5756 + }, + { + "epoch": 0.9288480154888674, + "grad_norm": 6.5896711349487305, + "learning_rate": 1.3943419626364196e-06, + "loss": 1.9087, + "step": 5757 + }, + { + "epoch": 0.9290093578573734, + "grad_norm": 3.9209225177764893, + "learning_rate": 1.3882213368711372e-06, + "loss": 1.9682, + "step": 5758 + }, + { + "epoch": 0.9291707002258793, + "grad_norm": 4.070976257324219, + "learning_rate": 1.3821139851514064e-06, + "loss": 2.043, + "step": 5759 + }, + { + "epoch": 0.9293320425943853, + "grad_norm": 4.065196514129639, + "learning_rate": 1.3760199091449044e-06, + "loss": 1.6531, + "step": 5760 + }, + { + "epoch": 0.9294933849628912, + "grad_norm": 5.823979377746582, + "learning_rate": 1.3699391105157056e-06, + "loss": 2.0396, + "step": 5761 + }, + { + "epoch": 0.9296547273313972, + "grad_norm": 4.389406681060791, + "learning_rate": 1.3638715909242316e-06, + "loss": 1.7936, + "step": 5762 + }, + { + "epoch": 0.9298160696999032, + "grad_norm": 4.885544776916504, + "learning_rate": 1.357817352027313e-06, + "loss": 1.8755, + "step": 5763 + }, + { + "epoch": 0.9299774120684091, + "grad_norm": 4.0908427238464355, + "learning_rate": 1.351776395478116e-06, + "loss": 1.8113, + "step": 5764 + }, + { + "epoch": 0.9301387544369152, + "grad_norm": 3.9577252864837646, + "learning_rate": 1.3457487229262155e-06, + "loss": 1.8165, + "step": 5765 + }, + { + "epoch": 0.9303000968054211, + "grad_norm": 5.4702067375183105, + "learning_rate": 1.3397343360175286e-06, + "loss": 1.6826, + "step": 5766 + }, + { + "epoch": 0.9304614391739271, + "grad_norm": 4.719128608703613, + "learning_rate": 1.3337332363943634e-06, + "loss": 1.9677, + "step": 5767 + }, + { + "epoch": 0.930622781542433, + "grad_norm": 5.333267688751221, + "learning_rate": 1.327745425695398e-06, + "loss": 2.0296, + "step": 5768 + }, + { + "epoch": 0.930784123910939, + "grad_norm": 4.493922233581543, + "learning_rate": 1.3217709055556638e-06, + "loss": 1.7187, + "step": 5769 + }, + { + "epoch": 0.930945466279445, + "grad_norm": 4.137994766235352, + "learning_rate": 1.3158096776065942e-06, + "loss": 1.7014, + "step": 5770 + }, + { + "epoch": 0.9311068086479509, + "grad_norm": 5.764809608459473, + "learning_rate": 1.3098617434759596e-06, + "loss": 1.9381, + "step": 5771 + }, + { + "epoch": 0.9312681510164569, + "grad_norm": 4.776947021484375, + "learning_rate": 1.3039271047879331e-06, + "loss": 1.92, + "step": 5772 + }, + { + "epoch": 0.9314294933849628, + "grad_norm": 4.990124225616455, + "learning_rate": 1.2980057631630294e-06, + "loss": 1.9291, + "step": 5773 + }, + { + "epoch": 0.9315908357534689, + "grad_norm": 5.358241081237793, + "learning_rate": 1.2920977202181494e-06, + "loss": 1.8355, + "step": 5774 + }, + { + "epoch": 0.9317521781219749, + "grad_norm": 5.3820295333862305, + "learning_rate": 1.286202977566553e-06, + "loss": 1.7486, + "step": 5775 + }, + { + "epoch": 0.9319135204904808, + "grad_norm": 4.732501029968262, + "learning_rate": 1.2803215368178745e-06, + "loss": 1.606, + "step": 5776 + }, + { + "epoch": 0.9320748628589868, + "grad_norm": 4.951090335845947, + "learning_rate": 1.2744533995781183e-06, + "loss": 1.4972, + "step": 5777 + }, + { + "epoch": 0.9322362052274927, + "grad_norm": 3.594369649887085, + "learning_rate": 1.268598567449647e-06, + "loss": 1.8737, + "step": 5778 + }, + { + "epoch": 0.9323975475959987, + "grad_norm": 4.86506462097168, + "learning_rate": 1.2627570420311929e-06, + "loss": 1.8318, + "step": 5779 + }, + { + "epoch": 0.9325588899645046, + "grad_norm": 5.794149398803711, + "learning_rate": 1.256928824917858e-06, + "loss": 1.8681, + "step": 5780 + }, + { + "epoch": 0.9327202323330106, + "grad_norm": 4.306437969207764, + "learning_rate": 1.2511139177011133e-06, + "loss": 1.7558, + "step": 5781 + }, + { + "epoch": 0.9328815747015167, + "grad_norm": 5.511157512664795, + "learning_rate": 1.2453123219687834e-06, + "loss": 1.728, + "step": 5782 + }, + { + "epoch": 0.9330429170700226, + "grad_norm": 5.119511127471924, + "learning_rate": 1.2395240393050733e-06, + "loss": 1.9274, + "step": 5783 + }, + { + "epoch": 0.9332042594385286, + "grad_norm": 4.189093112945557, + "learning_rate": 1.2337490712905352e-06, + "loss": 1.6383, + "step": 5784 + }, + { + "epoch": 0.9333656018070345, + "grad_norm": 5.592221260070801, + "learning_rate": 1.2279874195021024e-06, + "loss": 1.8266, + "step": 5785 + }, + { + "epoch": 0.9335269441755405, + "grad_norm": 4.038226127624512, + "learning_rate": 1.222239085513066e-06, + "loss": 1.7665, + "step": 5786 + }, + { + "epoch": 0.9336882865440465, + "grad_norm": 4.962045669555664, + "learning_rate": 1.2165040708930765e-06, + "loss": 1.6626, + "step": 5787 + }, + { + "epoch": 0.9338496289125524, + "grad_norm": 4.691133499145508, + "learning_rate": 1.2107823772081472e-06, + "loss": 1.848, + "step": 5788 + }, + { + "epoch": 0.9340109712810584, + "grad_norm": 4.325597286224365, + "learning_rate": 1.2050740060206679e-06, + "loss": 1.7769, + "step": 5789 + }, + { + "epoch": 0.9341723136495643, + "grad_norm": 3.8352766036987305, + "learning_rate": 1.1993789588893634e-06, + "loss": 1.9731, + "step": 5790 + }, + { + "epoch": 0.9343336560180704, + "grad_norm": 3.7644858360290527, + "learning_rate": 1.1936972373693567e-06, + "loss": 1.7886, + "step": 5791 + }, + { + "epoch": 0.9344949983865763, + "grad_norm": 3.7606143951416016, + "learning_rate": 1.1880288430120901e-06, + "loss": 1.8622, + "step": 5792 + }, + { + "epoch": 0.9346563407550823, + "grad_norm": 4.325725078582764, + "learning_rate": 1.1823737773654087e-06, + "loss": 1.7914, + "step": 5793 + }, + { + "epoch": 0.9348176831235883, + "grad_norm": 4.37510347366333, + "learning_rate": 1.1767320419734884e-06, + "loss": 1.8447, + "step": 5794 + }, + { + "epoch": 0.9349790254920942, + "grad_norm": 3.663802146911621, + "learning_rate": 1.1711036383768693e-06, + "loss": 1.8028, + "step": 5795 + }, + { + "epoch": 0.9351403678606002, + "grad_norm": 4.456315040588379, + "learning_rate": 1.165488568112466e-06, + "loss": 2.0647, + "step": 5796 + }, + { + "epoch": 0.9353017102291061, + "grad_norm": 6.146907806396484, + "learning_rate": 1.1598868327135359e-06, + "loss": 2.0919, + "step": 5797 + }, + { + "epoch": 0.9354630525976121, + "grad_norm": 3.3174848556518555, + "learning_rate": 1.1542984337097107e-06, + "loss": 1.8917, + "step": 5798 + }, + { + "epoch": 0.9356243949661182, + "grad_norm": 5.587307929992676, + "learning_rate": 1.1487233726269585e-06, + "loss": 1.7521, + "step": 5799 + }, + { + "epoch": 0.9357857373346241, + "grad_norm": 4.982137680053711, + "learning_rate": 1.1431616509876287e-06, + "loss": 1.8063, + "step": 5800 + }, + { + "epoch": 0.9359470797031301, + "grad_norm": 4.172671318054199, + "learning_rate": 1.1376132703104115e-06, + "loss": 1.6838, + "step": 5801 + }, + { + "epoch": 0.936108422071636, + "grad_norm": 4.366324424743652, + "learning_rate": 1.1320782321103673e-06, + "loss": 1.9022, + "step": 5802 + }, + { + "epoch": 0.936269764440142, + "grad_norm": 3.5740795135498047, + "learning_rate": 1.1265565378989041e-06, + "loss": 1.8462, + "step": 5803 + }, + { + "epoch": 0.9364311068086479, + "grad_norm": 3.788661003112793, + "learning_rate": 1.1210481891837877e-06, + "loss": 2.0685, + "step": 5804 + }, + { + "epoch": 0.9365924491771539, + "grad_norm": 4.398108005523682, + "learning_rate": 1.1155531874691371e-06, + "loss": 2.0232, + "step": 5805 + }, + { + "epoch": 0.9367537915456599, + "grad_norm": 3.890148401260376, + "learning_rate": 1.1100715342554357e-06, + "loss": 1.7261, + "step": 5806 + }, + { + "epoch": 0.9369151339141658, + "grad_norm": 4.149150848388672, + "learning_rate": 1.1046032310395193e-06, + "loss": 1.6467, + "step": 5807 + }, + { + "epoch": 0.9370764762826719, + "grad_norm": 4.093803405761719, + "learning_rate": 1.0991482793145657e-06, + "loss": 1.8167, + "step": 5808 + }, + { + "epoch": 0.9372378186511778, + "grad_norm": 4.130459308624268, + "learning_rate": 1.0937066805701223e-06, + "loss": 1.6255, + "step": 5809 + }, + { + "epoch": 0.9373991610196838, + "grad_norm": 3.8121321201324463, + "learning_rate": 1.088278436292084e-06, + "loss": 1.9435, + "step": 5810 + }, + { + "epoch": 0.9375605033881897, + "grad_norm": 5.062289714813232, + "learning_rate": 1.0828635479627036e-06, + "loss": 2.1098, + "step": 5811 + }, + { + "epoch": 0.9377218457566957, + "grad_norm": 4.468748092651367, + "learning_rate": 1.0774620170605764e-06, + "loss": 1.7855, + "step": 5812 + }, + { + "epoch": 0.9378831881252017, + "grad_norm": 4.119090557098389, + "learning_rate": 1.0720738450606615e-06, + "loss": 1.7658, + "step": 5813 + }, + { + "epoch": 0.9380445304937076, + "grad_norm": 5.087363243103027, + "learning_rate": 1.0666990334342707e-06, + "loss": 1.7402, + "step": 5814 + }, + { + "epoch": 0.9382058728622136, + "grad_norm": 4.254721164703369, + "learning_rate": 1.0613375836490468e-06, + "loss": 1.9288, + "step": 5815 + }, + { + "epoch": 0.9383672152307196, + "grad_norm": 4.193010330200195, + "learning_rate": 1.0559894971690132e-06, + "loss": 1.8224, + "step": 5816 + }, + { + "epoch": 0.9385285575992256, + "grad_norm": 4.436793804168701, + "learning_rate": 1.0506547754545292e-06, + "loss": 1.749, + "step": 5817 + }, + { + "epoch": 0.9386898999677316, + "grad_norm": 5.846003532409668, + "learning_rate": 1.0453334199623022e-06, + "loss": 1.7162, + "step": 5818 + }, + { + "epoch": 0.9388512423362375, + "grad_norm": 5.199612617492676, + "learning_rate": 1.0400254321453974e-06, + "loss": 1.8913, + "step": 5819 + }, + { + "epoch": 0.9390125847047435, + "grad_norm": 3.845651626586914, + "learning_rate": 1.0347308134532218e-06, + "loss": 1.8086, + "step": 5820 + }, + { + "epoch": 0.9391739270732494, + "grad_norm": 4.292616844177246, + "learning_rate": 1.0294495653315418e-06, + "loss": 1.8092, + "step": 5821 + }, + { + "epoch": 0.9393352694417554, + "grad_norm": 5.048312664031982, + "learning_rate": 1.0241816892224644e-06, + "loss": 1.8648, + "step": 5822 + }, + { + "epoch": 0.9394966118102613, + "grad_norm": 5.327896595001221, + "learning_rate": 1.0189271865644446e-06, + "loss": 1.9031, + "step": 5823 + }, + { + "epoch": 0.9396579541787673, + "grad_norm": 4.470217227935791, + "learning_rate": 1.0136860587923015e-06, + "loss": 1.6775, + "step": 5824 + }, + { + "epoch": 0.9398192965472734, + "grad_norm": 5.024284362792969, + "learning_rate": 1.0084583073371733e-06, + "loss": 1.6311, + "step": 5825 + }, + { + "epoch": 0.9399806389157793, + "grad_norm": 7.035519599914551, + "learning_rate": 1.0032439336265742e-06, + "loss": 1.5856, + "step": 5826 + }, + { + "epoch": 0.9401419812842853, + "grad_norm": 4.376884460449219, + "learning_rate": 9.980429390843427e-07, + "loss": 1.9039, + "step": 5827 + }, + { + "epoch": 0.9403033236527912, + "grad_norm": 5.319319248199463, + "learning_rate": 9.928553251306871e-07, + "loss": 1.7658, + "step": 5828 + }, + { + "epoch": 0.9404646660212972, + "grad_norm": 5.395619869232178, + "learning_rate": 9.87681093182141e-07, + "loss": 2.0921, + "step": 5829 + }, + { + "epoch": 0.9406260083898031, + "grad_norm": 4.178811073303223, + "learning_rate": 9.82520244651597e-07, + "loss": 1.9595, + "step": 5830 + }, + { + "epoch": 0.9407873507583091, + "grad_norm": 5.348474502563477, + "learning_rate": 9.773727809482825e-07, + "loss": 2.0595, + "step": 5831 + }, + { + "epoch": 0.9409486931268151, + "grad_norm": 4.0664448738098145, + "learning_rate": 9.722387034777847e-07, + "loss": 1.9365, + "step": 5832 + }, + { + "epoch": 0.941110035495321, + "grad_norm": 3.9825870990753174, + "learning_rate": 9.671180136420154e-07, + "loss": 1.6047, + "step": 5833 + }, + { + "epoch": 0.9412713778638271, + "grad_norm": 4.089111804962158, + "learning_rate": 9.620107128392563e-07, + "loss": 1.7241, + "step": 5834 + }, + { + "epoch": 0.941432720232333, + "grad_norm": 3.979795455932617, + "learning_rate": 9.569168024640973e-07, + "loss": 1.6304, + "step": 5835 + }, + { + "epoch": 0.941594062600839, + "grad_norm": 4.714263439178467, + "learning_rate": 9.518362839075145e-07, + "loss": 1.9056, + "step": 5836 + }, + { + "epoch": 0.941755404969345, + "grad_norm": 5.615983963012695, + "learning_rate": 9.467691585568039e-07, + "loss": 1.7513, + "step": 5837 + }, + { + "epoch": 0.9419167473378509, + "grad_norm": 4.188431262969971, + "learning_rate": 9.417154277955864e-07, + "loss": 1.9914, + "step": 5838 + }, + { + "epoch": 0.9420780897063569, + "grad_norm": 3.9508488178253174, + "learning_rate": 9.366750930038748e-07, + "loss": 1.7786, + "step": 5839 + }, + { + "epoch": 0.9422394320748628, + "grad_norm": 4.674958229064941, + "learning_rate": 9.316481555579681e-07, + "loss": 1.8749, + "step": 5840 + }, + { + "epoch": 0.9424007744433688, + "grad_norm": 4.925543785095215, + "learning_rate": 9.266346168305517e-07, + "loss": 1.7405, + "step": 5841 + }, + { + "epoch": 0.9425621168118749, + "grad_norm": 5.0826544761657715, + "learning_rate": 9.21634478190625e-07, + "loss": 1.9369, + "step": 5842 + }, + { + "epoch": 0.9427234591803808, + "grad_norm": 6.129978179931641, + "learning_rate": 9.166477410035401e-07, + "loss": 1.7705, + "step": 5843 + }, + { + "epoch": 0.9428848015488868, + "grad_norm": 3.634263277053833, + "learning_rate": 9.116744066309913e-07, + "loss": 1.9332, + "step": 5844 + }, + { + "epoch": 0.9430461439173927, + "grad_norm": 4.566090106964111, + "learning_rate": 9.067144764309976e-07, + "loss": 2.0883, + "step": 5845 + }, + { + "epoch": 0.9432074862858987, + "grad_norm": 3.382263660430908, + "learning_rate": 9.017679517579425e-07, + "loss": 1.9461, + "step": 5846 + }, + { + "epoch": 0.9433688286544046, + "grad_norm": 4.951333045959473, + "learning_rate": 8.968348339625287e-07, + "loss": 1.8161, + "step": 5847 + }, + { + "epoch": 0.9435301710229106, + "grad_norm": 5.0788960456848145, + "learning_rate": 8.919151243918067e-07, + "loss": 1.8715, + "step": 5848 + }, + { + "epoch": 0.9436915133914165, + "grad_norm": 4.868221759796143, + "learning_rate": 8.870088243891572e-07, + "loss": 1.9675, + "step": 5849 + }, + { + "epoch": 0.9438528557599225, + "grad_norm": 3.2534308433532715, + "learning_rate": 8.821159352943143e-07, + "loss": 1.639, + "step": 5850 + }, + { + "epoch": 0.9440141981284286, + "grad_norm": 3.6828863620758057, + "learning_rate": 8.772364584433368e-07, + "loss": 1.9083, + "step": 5851 + }, + { + "epoch": 0.9441755404969345, + "grad_norm": 4.399649620056152, + "learning_rate": 8.723703951686313e-07, + "loss": 1.9833, + "step": 5852 + }, + { + "epoch": 0.9443368828654405, + "grad_norm": 4.549874782562256, + "learning_rate": 8.675177467989349e-07, + "loss": 1.79, + "step": 5853 + }, + { + "epoch": 0.9444982252339464, + "grad_norm": 4.242159843444824, + "learning_rate": 8.626785146593208e-07, + "loss": 1.8302, + "step": 5854 + }, + { + "epoch": 0.9446595676024524, + "grad_norm": 6.748963356018066, + "learning_rate": 8.578527000711989e-07, + "loss": 1.8678, + "step": 5855 + }, + { + "epoch": 0.9448209099709584, + "grad_norm": 3.9145658016204834, + "learning_rate": 8.530403043523205e-07, + "loss": 1.7411, + "step": 5856 + }, + { + "epoch": 0.9449822523394643, + "grad_norm": 4.138341426849365, + "learning_rate": 8.482413288167734e-07, + "loss": 1.5438, + "step": 5857 + }, + { + "epoch": 0.9451435947079703, + "grad_norm": 4.497691631317139, + "learning_rate": 8.43455774774965e-07, + "loss": 1.751, + "step": 5858 + }, + { + "epoch": 0.9453049370764763, + "grad_norm": 3.659984588623047, + "learning_rate": 8.386836435336609e-07, + "loss": 1.9816, + "step": 5859 + }, + { + "epoch": 0.9454662794449823, + "grad_norm": 3.222717523574829, + "learning_rate": 8.339249363959411e-07, + "loss": 1.6494, + "step": 5860 + }, + { + "epoch": 0.9456276218134883, + "grad_norm": 3.8509652614593506, + "learning_rate": 8.29179654661244e-07, + "loss": 1.8249, + "step": 5861 + }, + { + "epoch": 0.9457889641819942, + "grad_norm": 5.055091381072998, + "learning_rate": 8.244477996253108e-07, + "loss": 2.0407, + "step": 5862 + }, + { + "epoch": 0.9459503065505002, + "grad_norm": 3.966942310333252, + "learning_rate": 8.197293725802469e-07, + "loss": 1.6762, + "step": 5863 + }, + { + "epoch": 0.9461116489190061, + "grad_norm": 3.865260601043701, + "learning_rate": 8.150243748144659e-07, + "loss": 1.9668, + "step": 5864 + }, + { + "epoch": 0.9462729912875121, + "grad_norm": 4.931168556213379, + "learning_rate": 8.103328076127347e-07, + "loss": 2.018, + "step": 5865 + }, + { + "epoch": 0.946434333656018, + "grad_norm": 4.939724922180176, + "learning_rate": 8.056546722561343e-07, + "loss": 1.7499, + "step": 5866 + }, + { + "epoch": 0.946595676024524, + "grad_norm": 4.040256500244141, + "learning_rate": 8.00989970022098e-07, + "loss": 1.9447, + "step": 5867 + }, + { + "epoch": 0.9467570183930301, + "grad_norm": 3.61901593208313, + "learning_rate": 7.963387021843683e-07, + "loss": 1.7853, + "step": 5868 + }, + { + "epoch": 0.946918360761536, + "grad_norm": 4.144516468048096, + "learning_rate": 7.917008700130401e-07, + "loss": 1.7975, + "step": 5869 + }, + { + "epoch": 0.947079703130042, + "grad_norm": 4.960970401763916, + "learning_rate": 7.870764747745285e-07, + "loss": 1.882, + "step": 5870 + }, + { + "epoch": 0.9472410454985479, + "grad_norm": 5.900622844696045, + "learning_rate": 7.824655177315787e-07, + "loss": 1.8373, + "step": 5871 + }, + { + "epoch": 0.9474023878670539, + "grad_norm": 6.7140212059021, + "learning_rate": 7.778680001432725e-07, + "loss": 1.7062, + "step": 5872 + }, + { + "epoch": 0.9475637302355598, + "grad_norm": 4.1263427734375, + "learning_rate": 7.732839232650224e-07, + "loss": 1.6811, + "step": 5873 + }, + { + "epoch": 0.9477250726040658, + "grad_norm": 4.739447116851807, + "learning_rate": 7.687132883485549e-07, + "loss": 1.7811, + "step": 5874 + }, + { + "epoch": 0.9478864149725718, + "grad_norm": 4.560645580291748, + "learning_rate": 7.641560966419492e-07, + "loss": 1.8688, + "step": 5875 + }, + { + "epoch": 0.9480477573410777, + "grad_norm": 4.571281433105469, + "learning_rate": 7.596123493895991e-07, + "loss": 1.9427, + "step": 5876 + }, + { + "epoch": 0.9482090997095838, + "grad_norm": 4.332674980163574, + "learning_rate": 7.550820478322285e-07, + "loss": 1.6717, + "step": 5877 + }, + { + "epoch": 0.9483704420780897, + "grad_norm": 5.199446678161621, + "learning_rate": 7.50565193206898e-07, + "loss": 1.8095, + "step": 5878 + }, + { + "epoch": 0.9485317844465957, + "grad_norm": 4.981563568115234, + "learning_rate": 7.460617867469822e-07, + "loss": 1.7219, + "step": 5879 + }, + { + "epoch": 0.9486931268151017, + "grad_norm": 4.1707353591918945, + "learning_rate": 7.415718296822028e-07, + "loss": 1.7694, + "step": 5880 + }, + { + "epoch": 0.9488544691836076, + "grad_norm": 5.002323627471924, + "learning_rate": 7.370953232385902e-07, + "loss": 1.6788, + "step": 5881 + }, + { + "epoch": 0.9490158115521136, + "grad_norm": 5.204863548278809, + "learning_rate": 7.326322686385112e-07, + "loss": 1.6622, + "step": 5882 + }, + { + "epoch": 0.9491771539206195, + "grad_norm": 4.420432090759277, + "learning_rate": 7.281826671006576e-07, + "loss": 1.7891, + "step": 5883 + }, + { + "epoch": 0.9493384962891255, + "grad_norm": 4.139347553253174, + "learning_rate": 7.237465198400461e-07, + "loss": 1.7936, + "step": 5884 + }, + { + "epoch": 0.9494998386576315, + "grad_norm": 4.050226211547852, + "learning_rate": 7.193238280680248e-07, + "loss": 1.7334, + "step": 5885 + }, + { + "epoch": 0.9496611810261375, + "grad_norm": 4.120753765106201, + "learning_rate": 7.149145929922607e-07, + "loss": 1.8455, + "step": 5886 + }, + { + "epoch": 0.9498225233946435, + "grad_norm": 5.404914379119873, + "learning_rate": 7.105188158167575e-07, + "loss": 1.7015, + "step": 5887 + }, + { + "epoch": 0.9499838657631494, + "grad_norm": 4.45375919342041, + "learning_rate": 7.061364977418217e-07, + "loss": 1.7473, + "step": 5888 + }, + { + "epoch": 0.9501452081316554, + "grad_norm": 5.266660213470459, + "learning_rate": 7.017676399641182e-07, + "loss": 2.0971, + "step": 5889 + }, + { + "epoch": 0.9503065505001613, + "grad_norm": 5.51798677444458, + "learning_rate": 6.974122436766039e-07, + "loss": 2.1716, + "step": 5890 + }, + { + "epoch": 0.9504678928686673, + "grad_norm": 4.540561199188232, + "learning_rate": 6.930703100685775e-07, + "loss": 1.6084, + "step": 5891 + }, + { + "epoch": 0.9506292352371732, + "grad_norm": 3.912099838256836, + "learning_rate": 6.887418403256574e-07, + "loss": 1.7189, + "step": 5892 + }, + { + "epoch": 0.9507905776056792, + "grad_norm": 4.840001583099365, + "learning_rate": 6.844268356297867e-07, + "loss": 1.677, + "step": 5893 + }, + { + "epoch": 0.9509519199741853, + "grad_norm": 3.7052505016326904, + "learning_rate": 6.801252971592287e-07, + "loss": 1.8608, + "step": 5894 + }, + { + "epoch": 0.9511132623426912, + "grad_norm": 5.620258808135986, + "learning_rate": 6.758372260885715e-07, + "loss": 1.7261, + "step": 5895 + }, + { + "epoch": 0.9512746047111972, + "grad_norm": 4.253757953643799, + "learning_rate": 6.715626235887341e-07, + "loss": 1.696, + "step": 5896 + }, + { + "epoch": 0.9514359470797031, + "grad_norm": 5.048199653625488, + "learning_rate": 6.673014908269326e-07, + "loss": 1.863, + "step": 5897 + }, + { + "epoch": 0.9515972894482091, + "grad_norm": 4.374826431274414, + "learning_rate": 6.630538289667366e-07, + "loss": 1.8576, + "step": 5898 + }, + { + "epoch": 0.951758631816715, + "grad_norm": 4.63386869430542, + "learning_rate": 6.588196391680124e-07, + "loss": 1.7793, + "step": 5899 + }, + { + "epoch": 0.951919974185221, + "grad_norm": 3.973889112472534, + "learning_rate": 6.545989225869631e-07, + "loss": 1.6409, + "step": 5900 + }, + { + "epoch": 0.952081316553727, + "grad_norm": 3.8073489665985107, + "learning_rate": 6.503916803761057e-07, + "loss": 1.4881, + "step": 5901 + }, + { + "epoch": 0.952242658922233, + "grad_norm": 4.754162311553955, + "learning_rate": 6.461979136842877e-07, + "loss": 1.6191, + "step": 5902 + }, + { + "epoch": 0.952404001290739, + "grad_norm": 4.1775221824646, + "learning_rate": 6.420176236566544e-07, + "loss": 1.7152, + "step": 5903 + }, + { + "epoch": 0.952565343659245, + "grad_norm": 4.4393510818481445, + "learning_rate": 6.378508114346982e-07, + "loss": 1.8372, + "step": 5904 + }, + { + "epoch": 0.9527266860277509, + "grad_norm": 4.985992431640625, + "learning_rate": 6.336974781562088e-07, + "loss": 1.928, + "step": 5905 + }, + { + "epoch": 0.9528880283962569, + "grad_norm": 5.360131740570068, + "learning_rate": 6.295576249553125e-07, + "loss": 2.0691, + "step": 5906 + }, + { + "epoch": 0.9530493707647628, + "grad_norm": 4.566859245300293, + "learning_rate": 6.25431252962444e-07, + "loss": 1.568, + "step": 5907 + }, + { + "epoch": 0.9532107131332688, + "grad_norm": 4.333277702331543, + "learning_rate": 6.213183633043574e-07, + "loss": 1.8062, + "step": 5908 + }, + { + "epoch": 0.9533720555017747, + "grad_norm": 4.303825378417969, + "learning_rate": 6.172189571041376e-07, + "loss": 1.9251, + "step": 5909 + }, + { + "epoch": 0.9535333978702807, + "grad_norm": 3.563807487487793, + "learning_rate": 6.131330354811615e-07, + "loss": 1.8189, + "step": 5910 + }, + { + "epoch": 0.9536947402387868, + "grad_norm": 3.7442002296447754, + "learning_rate": 6.090605995511589e-07, + "loss": 1.9291, + "step": 5911 + }, + { + "epoch": 0.9538560826072927, + "grad_norm": 5.451288223266602, + "learning_rate": 6.050016504261458e-07, + "loss": 2.0853, + "step": 5912 + }, + { + "epoch": 0.9540174249757987, + "grad_norm": 4.133009433746338, + "learning_rate": 6.009561892144744e-07, + "loss": 1.9121, + "step": 5913 + }, + { + "epoch": 0.9541787673443046, + "grad_norm": 5.506205081939697, + "learning_rate": 5.969242170208056e-07, + "loss": 1.8173, + "step": 5914 + }, + { + "epoch": 0.9543401097128106, + "grad_norm": 4.302342414855957, + "learning_rate": 5.929057349461198e-07, + "loss": 1.7966, + "step": 5915 + }, + { + "epoch": 0.9545014520813165, + "grad_norm": 4.712862491607666, + "learning_rate": 5.889007440877059e-07, + "loss": 1.7672, + "step": 5916 + }, + { + "epoch": 0.9546627944498225, + "grad_norm": 4.7828450202941895, + "learning_rate": 5.849092455391892e-07, + "loss": 1.8546, + "step": 5917 + }, + { + "epoch": 0.9548241368183285, + "grad_norm": 4.673771381378174, + "learning_rate": 5.809312403904921e-07, + "loss": 1.6937, + "step": 5918 + }, + { + "epoch": 0.9549854791868344, + "grad_norm": 6.275789737701416, + "learning_rate": 5.769667297278513e-07, + "loss": 1.6837, + "step": 5919 + }, + { + "epoch": 0.9551468215553405, + "grad_norm": 4.317024230957031, + "learning_rate": 5.730157146338399e-07, + "loss": 1.6549, + "step": 5920 + }, + { + "epoch": 0.9553081639238464, + "grad_norm": 4.117687702178955, + "learning_rate": 5.690781961873115e-07, + "loss": 1.9171, + "step": 5921 + }, + { + "epoch": 0.9554695062923524, + "grad_norm": 5.107153415679932, + "learning_rate": 5.651541754634726e-07, + "loss": 1.8191, + "step": 5922 + }, + { + "epoch": 0.9556308486608583, + "grad_norm": 4.189974784851074, + "learning_rate": 5.612436535338106e-07, + "loss": 1.7533, + "step": 5923 + }, + { + "epoch": 0.9557921910293643, + "grad_norm": 5.197915077209473, + "learning_rate": 5.573466314661546e-07, + "loss": 2.0262, + "step": 5924 + }, + { + "epoch": 0.9559535333978703, + "grad_norm": 4.776738166809082, + "learning_rate": 5.534631103246257e-07, + "loss": 1.9174, + "step": 5925 + }, + { + "epoch": 0.9561148757663762, + "grad_norm": 4.674618721008301, + "learning_rate": 5.495930911696757e-07, + "loss": 1.7167, + "step": 5926 + }, + { + "epoch": 0.9562762181348822, + "grad_norm": 4.881612300872803, + "learning_rate": 5.457365750580534e-07, + "loss": 1.9184, + "step": 5927 + }, + { + "epoch": 0.9564375605033882, + "grad_norm": 5.173159122467041, + "learning_rate": 5.418935630428279e-07, + "loss": 1.7982, + "step": 5928 + }, + { + "epoch": 0.9565989028718942, + "grad_norm": 4.905303001403809, + "learning_rate": 5.380640561733819e-07, + "loss": 1.8097, + "step": 5929 + }, + { + "epoch": 0.9567602452404002, + "grad_norm": 5.457943439483643, + "learning_rate": 5.342480554954177e-07, + "loss": 1.9987, + "step": 5930 + }, + { + "epoch": 0.9569215876089061, + "grad_norm": 5.061657905578613, + "learning_rate": 5.304455620509297e-07, + "loss": 1.7285, + "step": 5931 + }, + { + "epoch": 0.9570829299774121, + "grad_norm": 4.368500709533691, + "learning_rate": 5.266565768782427e-07, + "loss": 1.9036, + "step": 5932 + }, + { + "epoch": 0.957244272345918, + "grad_norm": 4.104846000671387, + "learning_rate": 5.228811010119849e-07, + "loss": 1.8644, + "step": 5933 + }, + { + "epoch": 0.957405614714424, + "grad_norm": 4.591108798980713, + "learning_rate": 5.191191354830926e-07, + "loss": 2.1388, + "step": 5934 + }, + { + "epoch": 0.9575669570829299, + "grad_norm": 4.90683650970459, + "learning_rate": 5.15370681318822e-07, + "loss": 2.1739, + "step": 5935 + }, + { + "epoch": 0.9577282994514359, + "grad_norm": 5.190983295440674, + "learning_rate": 5.116357395427262e-07, + "loss": 1.7365, + "step": 5936 + }, + { + "epoch": 0.957889641819942, + "grad_norm": 4.570216178894043, + "learning_rate": 5.079143111746898e-07, + "loss": 1.9492, + "step": 5937 + }, + { + "epoch": 0.9580509841884479, + "grad_norm": 5.459639549255371, + "learning_rate": 5.042063972308831e-07, + "loss": 1.7176, + "step": 5938 + }, + { + "epoch": 0.9582123265569539, + "grad_norm": 6.138484954833984, + "learning_rate": 5.005119987238071e-07, + "loss": 1.8889, + "step": 5939 + }, + { + "epoch": 0.9583736689254598, + "grad_norm": 5.507758617401123, + "learning_rate": 4.968311166622552e-07, + "loss": 1.7188, + "step": 5940 + }, + { + "epoch": 0.9585350112939658, + "grad_norm": 5.310621738433838, + "learning_rate": 4.931637520513455e-07, + "loss": 1.8531, + "step": 5941 + }, + { + "epoch": 0.9586963536624717, + "grad_norm": 4.495138168334961, + "learning_rate": 4.895099058924879e-07, + "loss": 1.768, + "step": 5942 + }, + { + "epoch": 0.9588576960309777, + "grad_norm": 6.398849964141846, + "learning_rate": 4.858695791834178e-07, + "loss": 1.8201, + "step": 5943 + }, + { + "epoch": 0.9590190383994837, + "grad_norm": 4.211677551269531, + "learning_rate": 4.822427729181678e-07, + "loss": 2.0938, + "step": 5944 + }, + { + "epoch": 0.9591803807679897, + "grad_norm": 4.224961757659912, + "learning_rate": 4.786294880870845e-07, + "loss": 1.8269, + "step": 5945 + }, + { + "epoch": 0.9593417231364957, + "grad_norm": 4.252756118774414, + "learning_rate": 4.750297256768177e-07, + "loss": 1.8621, + "step": 5946 + }, + { + "epoch": 0.9595030655050016, + "grad_norm": 3.9692041873931885, + "learning_rate": 4.7144348667032545e-07, + "loss": 1.8073, + "step": 5947 + }, + { + "epoch": 0.9596644078735076, + "grad_norm": 4.611053466796875, + "learning_rate": 4.6787077204687445e-07, + "loss": 1.7518, + "step": 5948 + }, + { + "epoch": 0.9598257502420136, + "grad_norm": 5.75453519821167, + "learning_rate": 4.643115827820399e-07, + "loss": 1.7066, + "step": 5949 + }, + { + "epoch": 0.9599870926105195, + "grad_norm": 5.713967323303223, + "learning_rate": 4.607659198477055e-07, + "loss": 1.899, + "step": 5950 + }, + { + "epoch": 0.9601484349790255, + "grad_norm": 3.9248242378234863, + "learning_rate": 4.5723378421205776e-07, + "loss": 1.8813, + "step": 5951 + }, + { + "epoch": 0.9603097773475314, + "grad_norm": 3.939185857772827, + "learning_rate": 4.537151768395864e-07, + "loss": 1.7456, + "step": 5952 + }, + { + "epoch": 0.9604711197160374, + "grad_norm": 3.6323013305664062, + "learning_rate": 4.5021009869108957e-07, + "loss": 1.8564, + "step": 5953 + }, + { + "epoch": 0.9606324620845434, + "grad_norm": 5.585293292999268, + "learning_rate": 4.4671855072367377e-07, + "loss": 1.6519, + "step": 5954 + }, + { + "epoch": 0.9607938044530494, + "grad_norm": 5.114597797393799, + "learning_rate": 4.432405338907486e-07, + "loss": 1.6841, + "step": 5955 + }, + { + "epoch": 0.9609551468215554, + "grad_norm": 4.141439914703369, + "learning_rate": 4.397760491420322e-07, + "loss": 2.0173, + "step": 5956 + }, + { + "epoch": 0.9611164891900613, + "grad_norm": 4.197299957275391, + "learning_rate": 4.3632509742354553e-07, + "loss": 1.8466, + "step": 5957 + }, + { + "epoch": 0.9612778315585673, + "grad_norm": 3.91989803314209, + "learning_rate": 4.3288767967760715e-07, + "loss": 1.979, + "step": 5958 + }, + { + "epoch": 0.9614391739270732, + "grad_norm": 3.3383967876434326, + "learning_rate": 4.29463796842855e-07, + "loss": 1.7008, + "step": 5959 + }, + { + "epoch": 0.9616005162955792, + "grad_norm": 4.163468360900879, + "learning_rate": 4.2605344985421346e-07, + "loss": 1.9048, + "step": 5960 + }, + { + "epoch": 0.9617618586640851, + "grad_norm": 3.660228967666626, + "learning_rate": 4.226566396429266e-07, + "loss": 1.7799, + "step": 5961 + }, + { + "epoch": 0.9619232010325912, + "grad_norm": 3.3542301654815674, + "learning_rate": 4.1927336713653007e-07, + "loss": 1.8733, + "step": 5962 + }, + { + "epoch": 0.9620845434010972, + "grad_norm": 4.129444599151611, + "learning_rate": 4.159036332588739e-07, + "loss": 1.7442, + "step": 5963 + }, + { + "epoch": 0.9622458857696031, + "grad_norm": 3.580047607421875, + "learning_rate": 4.125474389300998e-07, + "loss": 1.7935, + "step": 5964 + }, + { + "epoch": 0.9624072281381091, + "grad_norm": 4.628206729888916, + "learning_rate": 4.092047850666636e-07, + "loss": 2.0498, + "step": 5965 + }, + { + "epoch": 0.962568570506615, + "grad_norm": 4.669541835784912, + "learning_rate": 4.058756725813129e-07, + "loss": 1.9091, + "step": 5966 + }, + { + "epoch": 0.962729912875121, + "grad_norm": 5.048605918884277, + "learning_rate": 4.025601023831094e-07, + "loss": 1.9444, + "step": 5967 + }, + { + "epoch": 0.962891255243627, + "grad_norm": 3.759253740310669, + "learning_rate": 3.992580753774067e-07, + "loss": 1.9328, + "step": 5968 + }, + { + "epoch": 0.9630525976121329, + "grad_norm": 6.495770454406738, + "learning_rate": 3.9596959246585575e-07, + "loss": 1.6384, + "step": 5969 + }, + { + "epoch": 0.9632139399806389, + "grad_norm": 9.589229583740234, + "learning_rate": 3.926946545464327e-07, + "loss": 1.9898, + "step": 5970 + }, + { + "epoch": 0.9633752823491449, + "grad_norm": 5.594679832458496, + "learning_rate": 3.894332625133945e-07, + "loss": 1.6206, + "step": 5971 + }, + { + "epoch": 0.9635366247176509, + "grad_norm": 3.989413261413574, + "learning_rate": 3.861854172572954e-07, + "loss": 1.8372, + "step": 5972 + }, + { + "epoch": 0.9636979670861568, + "grad_norm": 4.727180004119873, + "learning_rate": 3.829511196650093e-07, + "loss": 1.9627, + "step": 5973 + }, + { + "epoch": 0.9638593094546628, + "grad_norm": 4.975203037261963, + "learning_rate": 3.797303706196964e-07, + "loss": 1.8582, + "step": 5974 + }, + { + "epoch": 0.9640206518231688, + "grad_norm": 3.692265272140503, + "learning_rate": 3.7652317100082543e-07, + "loss": 1.6858, + "step": 5975 + }, + { + "epoch": 0.9641819941916747, + "grad_norm": 5.113863945007324, + "learning_rate": 3.7332952168416257e-07, + "loss": 1.7237, + "step": 5976 + }, + { + "epoch": 0.9643433365601807, + "grad_norm": 6.761280536651611, + "learning_rate": 3.7014942354176575e-07, + "loss": 2.0606, + "step": 5977 + }, + { + "epoch": 0.9645046789286866, + "grad_norm": 5.555169582366943, + "learning_rate": 3.6698287744200697e-07, + "loss": 1.9713, + "step": 5978 + }, + { + "epoch": 0.9646660212971926, + "grad_norm": 5.829470634460449, + "learning_rate": 3.638298842495502e-07, + "loss": 1.7158, + "step": 5979 + }, + { + "epoch": 0.9648273636656987, + "grad_norm": 4.175316333770752, + "learning_rate": 3.6069044482535674e-07, + "loss": 1.6679, + "step": 5980 + }, + { + "epoch": 0.9649887060342046, + "grad_norm": 5.2124714851379395, + "learning_rate": 3.5756456002668525e-07, + "loss": 1.929, + "step": 5981 + }, + { + "epoch": 0.9651500484027106, + "grad_norm": 5.597807884216309, + "learning_rate": 3.5445223070710855e-07, + "loss": 2.0152, + "step": 5982 + }, + { + "epoch": 0.9653113907712165, + "grad_norm": 3.889457941055298, + "learning_rate": 3.513534577164801e-07, + "loss": 1.5768, + "step": 5983 + }, + { + "epoch": 0.9654727331397225, + "grad_norm": 5.3268141746521, + "learning_rate": 3.482682419009509e-07, + "loss": 1.8987, + "step": 5984 + }, + { + "epoch": 0.9656340755082284, + "grad_norm": 4.461936950683594, + "learning_rate": 3.4519658410299136e-07, + "loss": 1.7494, + "step": 5985 + }, + { + "epoch": 0.9657954178767344, + "grad_norm": 4.557859897613525, + "learning_rate": 3.4213848516134186e-07, + "loss": 2.0838, + "step": 5986 + }, + { + "epoch": 0.9659567602452404, + "grad_norm": 5.341065406799316, + "learning_rate": 3.390939459110676e-07, + "loss": 1.9941, + "step": 5987 + }, + { + "epoch": 0.9661181026137464, + "grad_norm": 5.358587265014648, + "learning_rate": 3.360629671835036e-07, + "loss": 1.699, + "step": 5988 + }, + { + "epoch": 0.9662794449822524, + "grad_norm": 3.979477882385254, + "learning_rate": 3.330455498063045e-07, + "loss": 1.7465, + "step": 5989 + }, + { + "epoch": 0.9664407873507583, + "grad_norm": 3.8328661918640137, + "learning_rate": 3.300416946034168e-07, + "loss": 1.7958, + "step": 5990 + }, + { + "epoch": 0.9666021297192643, + "grad_norm": 4.59087610244751, + "learning_rate": 3.270514023950733e-07, + "loss": 2.0609, + "step": 5991 + }, + { + "epoch": 0.9667634720877702, + "grad_norm": 4.539885520935059, + "learning_rate": 3.24074673997804e-07, + "loss": 1.7699, + "step": 5992 + }, + { + "epoch": 0.9669248144562762, + "grad_norm": 3.7460310459136963, + "learning_rate": 3.2111151022445883e-07, + "loss": 1.7859, + "step": 5993 + }, + { + "epoch": 0.9670861568247822, + "grad_norm": 5.758821964263916, + "learning_rate": 3.1816191188415166e-07, + "loss": 1.7002, + "step": 5994 + }, + { + "epoch": 0.9672474991932881, + "grad_norm": 3.9628663063049316, + "learning_rate": 3.1522587978231045e-07, + "loss": 1.7443, + "step": 5995 + }, + { + "epoch": 0.9674088415617941, + "grad_norm": 5.32503080368042, + "learning_rate": 3.123034147206605e-07, + "loss": 1.7886, + "step": 5996 + }, + { + "epoch": 0.9675701839303001, + "grad_norm": 5.373361587524414, + "learning_rate": 3.0939451749720794e-07, + "loss": 1.6785, + "step": 5997 + }, + { + "epoch": 0.9677315262988061, + "grad_norm": 4.285488605499268, + "learning_rate": 3.064991889062674e-07, + "loss": 1.8405, + "step": 5998 + }, + { + "epoch": 0.9678928686673121, + "grad_norm": 4.067709445953369, + "learning_rate": 3.036174297384453e-07, + "loss": 1.8125, + "step": 5999 + }, + { + "epoch": 0.968054211035818, + "grad_norm": 4.507450103759766, + "learning_rate": 3.007492407806456e-07, + "loss": 1.8592, + "step": 6000 + }, + { + "epoch": 0.968215553404324, + "grad_norm": 4.683064937591553, + "learning_rate": 2.9789462281605284e-07, + "loss": 1.4376, + "step": 6001 + }, + { + "epoch": 0.9683768957728299, + "grad_norm": 3.8618106842041016, + "learning_rate": 2.950535766241602e-07, + "loss": 1.7857, + "step": 6002 + }, + { + "epoch": 0.9685382381413359, + "grad_norm": 6.9576239585876465, + "learning_rate": 2.9222610298074717e-07, + "loss": 1.7491, + "step": 6003 + }, + { + "epoch": 0.9686995805098418, + "grad_norm": 4.038752555847168, + "learning_rate": 2.894122026579016e-07, + "loss": 1.5902, + "step": 6004 + }, + { + "epoch": 0.9688609228783479, + "grad_norm": 5.251195907592773, + "learning_rate": 2.866118764239756e-07, + "loss": 2.095, + "step": 6005 + }, + { + "epoch": 0.9690222652468539, + "grad_norm": 5.801555633544922, + "learning_rate": 2.8382512504365186e-07, + "loss": 1.5683, + "step": 6006 + }, + { + "epoch": 0.9691836076153598, + "grad_norm": 5.112753391265869, + "learning_rate": 2.810519492778774e-07, + "loss": 1.8896, + "step": 6007 + }, + { + "epoch": 0.9693449499838658, + "grad_norm": 4.228761672973633, + "learning_rate": 2.7829234988390184e-07, + "loss": 1.6108, + "step": 6008 + }, + { + "epoch": 0.9695062923523717, + "grad_norm": 4.433478355407715, + "learning_rate": 2.7554632761526146e-07, + "loss": 1.8418, + "step": 6009 + }, + { + "epoch": 0.9696676347208777, + "grad_norm": 5.7169365882873535, + "learning_rate": 2.7281388322180635e-07, + "loss": 1.93, + "step": 6010 + }, + { + "epoch": 0.9698289770893836, + "grad_norm": 5.642277240753174, + "learning_rate": 2.700950174496564e-07, + "loss": 1.7152, + "step": 6011 + }, + { + "epoch": 0.9699903194578896, + "grad_norm": 5.4277448654174805, + "learning_rate": 2.673897310412288e-07, + "loss": 1.8661, + "step": 6012 + }, + { + "epoch": 0.9701516618263956, + "grad_norm": 5.835224151611328, + "learning_rate": 2.646980247352437e-07, + "loss": 1.6669, + "step": 6013 + }, + { + "epoch": 0.9703130041949016, + "grad_norm": 5.225020885467529, + "learning_rate": 2.6201989926669115e-07, + "loss": 1.8734, + "step": 6014 + }, + { + "epoch": 0.9704743465634076, + "grad_norm": 4.355700492858887, + "learning_rate": 2.5935535536688036e-07, + "loss": 1.6864, + "step": 6015 + }, + { + "epoch": 0.9706356889319135, + "grad_norm": 4.854405403137207, + "learning_rate": 2.5670439376339063e-07, + "loss": 1.8816, + "step": 6016 + }, + { + "epoch": 0.9707970313004195, + "grad_norm": 5.25092887878418, + "learning_rate": 2.5406701518009834e-07, + "loss": 1.9376, + "step": 6017 + }, + { + "epoch": 0.9709583736689255, + "grad_norm": 5.707984447479248, + "learning_rate": 2.5144322033717747e-07, + "loss": 1.5456, + "step": 6018 + }, + { + "epoch": 0.9711197160374314, + "grad_norm": 4.779830455780029, + "learning_rate": 2.488330099510883e-07, + "loss": 1.7704, + "step": 6019 + }, + { + "epoch": 0.9712810584059374, + "grad_norm": 3.8003082275390625, + "learning_rate": 2.4623638473457167e-07, + "loss": 1.6952, + "step": 6020 + }, + { + "epoch": 0.9714424007744433, + "grad_norm": 5.577223300933838, + "learning_rate": 2.436533453966772e-07, + "loss": 1.8789, + "step": 6021 + }, + { + "epoch": 0.9716037431429493, + "grad_norm": 4.0358123779296875, + "learning_rate": 2.4108389264272947e-07, + "loss": 1.846, + "step": 6022 + }, + { + "epoch": 0.9717650855114554, + "grad_norm": 4.698176383972168, + "learning_rate": 2.3852802717435617e-07, + "loss": 1.8529, + "step": 6023 + }, + { + "epoch": 0.9719264278799613, + "grad_norm": 4.503703594207764, + "learning_rate": 2.359857496894602e-07, + "loss": 1.613, + "step": 6024 + }, + { + "epoch": 0.9720877702484673, + "grad_norm": 3.923738956451416, + "learning_rate": 2.3345706088224729e-07, + "loss": 1.9184, + "step": 6025 + }, + { + "epoch": 0.9722491126169732, + "grad_norm": 5.014230728149414, + "learning_rate": 2.3094196144320956e-07, + "loss": 1.8012, + "step": 6026 + }, + { + "epoch": 0.9724104549854792, + "grad_norm": 4.5727033615112305, + "learning_rate": 2.284404520591199e-07, + "loss": 2.0795, + "step": 6027 + }, + { + "epoch": 0.9725717973539851, + "grad_norm": 4.497374534606934, + "learning_rate": 2.25952533413043e-07, + "loss": 1.8711, + "step": 6028 + }, + { + "epoch": 0.9727331397224911, + "grad_norm": 3.5301945209503174, + "learning_rate": 2.2347820618434657e-07, + "loss": 1.7145, + "step": 6029 + }, + { + "epoch": 0.972894482090997, + "grad_norm": 4.955672264099121, + "learning_rate": 2.2101747104866788e-07, + "loss": 1.5397, + "step": 6030 + }, + { + "epoch": 0.9730558244595031, + "grad_norm": 3.6721248626708984, + "learning_rate": 2.185703286779417e-07, + "loss": 1.7076, + "step": 6031 + }, + { + "epoch": 0.9732171668280091, + "grad_norm": 4.408209323883057, + "learning_rate": 2.161367797403946e-07, + "loss": 2.1241, + "step": 6032 + }, + { + "epoch": 0.973378509196515, + "grad_norm": 3.461690902709961, + "learning_rate": 2.137168249005339e-07, + "loss": 1.9949, + "step": 6033 + }, + { + "epoch": 0.973539851565021, + "grad_norm": 4.539180755615234, + "learning_rate": 2.113104648191644e-07, + "loss": 1.6892, + "step": 6034 + }, + { + "epoch": 0.9737011939335269, + "grad_norm": 5.39754056930542, + "learning_rate": 2.0891770015336044e-07, + "loss": 1.7824, + "step": 6035 + }, + { + "epoch": 0.9738625363020329, + "grad_norm": 3.8471028804779053, + "learning_rate": 2.0653853155650492e-07, + "loss": 1.8523, + "step": 6036 + }, + { + "epoch": 0.9740238786705389, + "grad_norm": 3.98358416557312, + "learning_rate": 2.0417295967825602e-07, + "loss": 1.9759, + "step": 6037 + }, + { + "epoch": 0.9741852210390448, + "grad_norm": 4.597020626068115, + "learning_rate": 2.0182098516456362e-07, + "loss": 1.7696, + "step": 6038 + }, + { + "epoch": 0.9743465634075508, + "grad_norm": 4.354307174682617, + "learning_rate": 1.9948260865766398e-07, + "loss": 1.8683, + "step": 6039 + }, + { + "epoch": 0.9745079057760568, + "grad_norm": 4.493190288543701, + "learning_rate": 1.971578307960742e-07, + "loss": 2.0457, + "step": 6040 + }, + { + "epoch": 0.9746692481445628, + "grad_norm": 4.883968353271484, + "learning_rate": 1.9484665221460861e-07, + "loss": 1.9905, + "step": 6041 + }, + { + "epoch": 0.9748305905130688, + "grad_norm": 4.397480010986328, + "learning_rate": 1.9254907354436802e-07, + "loss": 1.6917, + "step": 6042 + }, + { + "epoch": 0.9749919328815747, + "grad_norm": 4.127442359924316, + "learning_rate": 1.9026509541272275e-07, + "loss": 1.7102, + "step": 6043 + }, + { + "epoch": 0.9751532752500807, + "grad_norm": 3.7167837619781494, + "learning_rate": 1.879947184433517e-07, + "loss": 1.8229, + "step": 6044 + }, + { + "epoch": 0.9753146176185866, + "grad_norm": 3.8155481815338135, + "learning_rate": 1.8573794325620343e-07, + "loss": 1.7235, + "step": 6045 + }, + { + "epoch": 0.9754759599870926, + "grad_norm": 4.336913585662842, + "learning_rate": 1.8349477046751828e-07, + "loss": 1.7814, + "step": 6046 + }, + { + "epoch": 0.9756373023555985, + "grad_norm": 4.289729595184326, + "learning_rate": 1.81265200689823e-07, + "loss": 1.972, + "step": 6047 + }, + { + "epoch": 0.9757986447241046, + "grad_norm": 5.120328426361084, + "learning_rate": 1.7904923453193056e-07, + "loss": 1.966, + "step": 6048 + }, + { + "epoch": 0.9759599870926106, + "grad_norm": 5.423091411590576, + "learning_rate": 1.7684687259893473e-07, + "loss": 1.7488, + "step": 6049 + }, + { + "epoch": 0.9761213294611165, + "grad_norm": 4.753484725952148, + "learning_rate": 1.7465811549222667e-07, + "loss": 1.7431, + "step": 6050 + }, + { + "epoch": 0.9762826718296225, + "grad_norm": 6.624080181121826, + "learning_rate": 1.724829638094616e-07, + "loss": 1.9048, + "step": 6051 + }, + { + "epoch": 0.9764440141981284, + "grad_norm": 4.905503749847412, + "learning_rate": 1.7032141814459778e-07, + "loss": 1.6309, + "step": 6052 + }, + { + "epoch": 0.9766053565666344, + "grad_norm": 4.667633056640625, + "learning_rate": 1.6817347908786863e-07, + "loss": 1.9415, + "step": 6053 + }, + { + "epoch": 0.9767666989351403, + "grad_norm": 3.977940559387207, + "learning_rate": 1.660391472257994e-07, + "loss": 2.0148, + "step": 6054 + }, + { + "epoch": 0.9769280413036463, + "grad_norm": 4.7679443359375, + "learning_rate": 1.6391842314119054e-07, + "loss": 1.6452, + "step": 6055 + }, + { + "epoch": 0.9770893836721523, + "grad_norm": 3.8190910816192627, + "learning_rate": 1.6181130741314e-07, + "loss": 1.6309, + "step": 6056 + }, + { + "epoch": 0.9772507260406583, + "grad_norm": 4.3463826179504395, + "learning_rate": 1.5971780061701524e-07, + "loss": 1.7745, + "step": 6057 + }, + { + "epoch": 0.9774120684091643, + "grad_norm": 5.691588878631592, + "learning_rate": 1.576379033244757e-07, + "loss": 1.8893, + "step": 6058 + }, + { + "epoch": 0.9775734107776702, + "grad_norm": 4.150592803955078, + "learning_rate": 1.555716161034615e-07, + "loss": 1.7366, + "step": 6059 + }, + { + "epoch": 0.9777347531461762, + "grad_norm": 3.8583078384399414, + "learning_rate": 1.5351893951819906e-07, + "loss": 1.7324, + "step": 6060 + }, + { + "epoch": 0.9778960955146822, + "grad_norm": 4.069317817687988, + "learning_rate": 1.5147987412920116e-07, + "loss": 1.7406, + "step": 6061 + }, + { + "epoch": 0.9780574378831881, + "grad_norm": 4.734239101409912, + "learning_rate": 1.494544204932502e-07, + "loss": 1.8533, + "step": 6062 + }, + { + "epoch": 0.9782187802516941, + "grad_norm": 4.880402565002441, + "learning_rate": 1.4744257916343153e-07, + "loss": 1.8194, + "step": 6063 + }, + { + "epoch": 0.9783801226202, + "grad_norm": 3.40069317817688, + "learning_rate": 1.4544435068909456e-07, + "loss": 1.7689, + "step": 6064 + }, + { + "epoch": 0.9785414649887061, + "grad_norm": 4.194741249084473, + "learning_rate": 1.434597356158862e-07, + "loss": 1.6788, + "step": 6065 + }, + { + "epoch": 0.978702807357212, + "grad_norm": 5.724354267120361, + "learning_rate": 1.4148873448573408e-07, + "loss": 2.2099, + "step": 6066 + }, + { + "epoch": 0.978864149725718, + "grad_norm": 4.644571304321289, + "learning_rate": 1.3953134783682987e-07, + "loss": 1.6492, + "step": 6067 + }, + { + "epoch": 0.979025492094224, + "grad_norm": 4.232357978820801, + "learning_rate": 1.375875762036738e-07, + "loss": 1.6987, + "step": 6068 + }, + { + "epoch": 0.9791868344627299, + "grad_norm": 7.880347728729248, + "learning_rate": 1.3565742011703576e-07, + "loss": 2.0461, + "step": 6069 + }, + { + "epoch": 0.9793481768312359, + "grad_norm": 3.9191014766693115, + "learning_rate": 1.3374088010396635e-07, + "loss": 1.6321, + "step": 6070 + }, + { + "epoch": 0.9795095191997418, + "grad_norm": 4.339367866516113, + "learning_rate": 1.3183795668779697e-07, + "loss": 1.8875, + "step": 6071 + }, + { + "epoch": 0.9796708615682478, + "grad_norm": 4.509636878967285, + "learning_rate": 1.2994865038815086e-07, + "loss": 1.8603, + "step": 6072 + }, + { + "epoch": 0.9798322039367537, + "grad_norm": 4.019570827484131, + "learning_rate": 1.2807296172092086e-07, + "loss": 1.9206, + "step": 6073 + }, + { + "epoch": 0.9799935463052598, + "grad_norm": 5.26365852355957, + "learning_rate": 1.2621089119829178e-07, + "loss": 1.8456, + "step": 6074 + }, + { + "epoch": 0.9801548886737658, + "grad_norm": 4.8168864250183105, + "learning_rate": 1.243624393287235e-07, + "loss": 1.7182, + "step": 6075 + }, + { + "epoch": 0.9803162310422717, + "grad_norm": 3.4046692848205566, + "learning_rate": 1.2252760661695672e-07, + "loss": 1.9333, + "step": 6076 + }, + { + "epoch": 0.9804775734107777, + "grad_norm": 4.598193645477295, + "learning_rate": 1.2070639356401292e-07, + "loss": 1.9252, + "step": 6077 + }, + { + "epoch": 0.9806389157792836, + "grad_norm": 4.1954240798950195, + "learning_rate": 1.1889880066720538e-07, + "loss": 1.8775, + "step": 6078 + }, + { + "epoch": 0.9808002581477896, + "grad_norm": 3.791823148727417, + "learning_rate": 1.1710482842011151e-07, + "loss": 1.7546, + "step": 6079 + }, + { + "epoch": 0.9809616005162956, + "grad_norm": 4.0534868240356445, + "learning_rate": 1.1532447731260054e-07, + "loss": 1.9442, + "step": 6080 + }, + { + "epoch": 0.9811229428848015, + "grad_norm": 4.838503360748291, + "learning_rate": 1.1355774783081696e-07, + "loss": 1.7617, + "step": 6081 + }, + { + "epoch": 0.9812842852533075, + "grad_norm": 5.009237289428711, + "learning_rate": 1.1180464045719708e-07, + "loss": 1.7312, + "step": 6082 + }, + { + "epoch": 0.9814456276218135, + "grad_norm": 4.203751087188721, + "learning_rate": 1.1006515567043574e-07, + "loss": 1.5578, + "step": 6083 + }, + { + "epoch": 0.9816069699903195, + "grad_norm": 3.759047269821167, + "learning_rate": 1.0833929394552522e-07, + "loss": 1.7059, + "step": 6084 + }, + { + "epoch": 0.9817683123588254, + "grad_norm": 3.4367129802703857, + "learning_rate": 1.0662705575373855e-07, + "loss": 1.3799, + "step": 6085 + }, + { + "epoch": 0.9819296547273314, + "grad_norm": 4.999564170837402, + "learning_rate": 1.0492844156262394e-07, + "loss": 1.6221, + "step": 6086 + }, + { + "epoch": 0.9820909970958374, + "grad_norm": 4.454932689666748, + "learning_rate": 1.0324345183599926e-07, + "loss": 1.9766, + "step": 6087 + }, + { + "epoch": 0.9822523394643433, + "grad_norm": 4.134593963623047, + "learning_rate": 1.0157208703397426e-07, + "loss": 1.9193, + "step": 6088 + }, + { + "epoch": 0.9824136818328493, + "grad_norm": 3.602125644683838, + "learning_rate": 9.991434761293938e-08, + "loss": 1.826, + "step": 6089 + }, + { + "epoch": 0.9825750242013552, + "grad_norm": 4.12139892578125, + "learning_rate": 9.827023402556035e-08, + "loss": 1.8143, + "step": 6090 + }, + { + "epoch": 0.9827363665698613, + "grad_norm": 4.671372890472412, + "learning_rate": 9.663974672078912e-08, + "loss": 1.6706, + "step": 6091 + }, + { + "epoch": 0.9828977089383673, + "grad_norm": 4.771209239959717, + "learning_rate": 9.502288614383625e-08, + "loss": 1.8342, + "step": 6092 + }, + { + "epoch": 0.9830590513068732, + "grad_norm": 4.081023216247559, + "learning_rate": 9.341965273621522e-08, + "loss": 1.7494, + "step": 6093 + }, + { + "epoch": 0.9832203936753792, + "grad_norm": 4.775580406188965, + "learning_rate": 9.183004693570363e-08, + "loss": 1.8544, + "step": 6094 + }, + { + "epoch": 0.9833817360438851, + "grad_norm": 5.823474884033203, + "learning_rate": 9.025406917636537e-08, + "loss": 2.0259, + "step": 6095 + }, + { + "epoch": 0.9835430784123911, + "grad_norm": 3.8634917736053467, + "learning_rate": 8.869171988854508e-08, + "loss": 2.1488, + "step": 6096 + }, + { + "epoch": 0.983704420780897, + "grad_norm": 5.826895236968994, + "learning_rate": 8.714299949885707e-08, + "loss": 1.7576, + "step": 6097 + }, + { + "epoch": 0.983865763149403, + "grad_norm": 4.188059329986572, + "learning_rate": 8.560790843019639e-08, + "loss": 1.76, + "step": 6098 + }, + { + "epoch": 0.984027105517909, + "grad_norm": 3.9793922901153564, + "learning_rate": 8.408644710173886e-08, + "loss": 1.8688, + "step": 6099 + }, + { + "epoch": 0.984188447886415, + "grad_norm": 3.9827539920806885, + "learning_rate": 8.2578615928941e-08, + "loss": 1.6499, + "step": 6100 + }, + { + "epoch": 0.984349790254921, + "grad_norm": 6.811621189117432, + "learning_rate": 8.108441532353461e-08, + "loss": 1.8214, + "step": 6101 + }, + { + "epoch": 0.9845111326234269, + "grad_norm": 3.6228678226470947, + "learning_rate": 7.960384569353219e-08, + "loss": 1.8004, + "step": 6102 + }, + { + "epoch": 0.9846724749919329, + "grad_norm": 3.770827531814575, + "learning_rate": 7.813690744321033e-08, + "loss": 1.9498, + "step": 6103 + }, + { + "epoch": 0.9848338173604388, + "grad_norm": 5.747048854827881, + "learning_rate": 7.668360097314864e-08, + "loss": 1.8909, + "step": 6104 + }, + { + "epoch": 0.9849951597289448, + "grad_norm": 4.43508243560791, + "learning_rate": 7.524392668018521e-08, + "loss": 1.5331, + "step": 6105 + }, + { + "epoch": 0.9851565020974508, + "grad_norm": 4.186741352081299, + "learning_rate": 7.381788495743891e-08, + "loss": 1.9215, + "step": 6106 + }, + { + "epoch": 0.9853178444659567, + "grad_norm": 3.6606838703155518, + "learning_rate": 7.240547619430382e-08, + "loss": 1.8836, + "step": 6107 + }, + { + "epoch": 0.9854791868344628, + "grad_norm": 4.097906589508057, + "learning_rate": 7.100670077646587e-08, + "loss": 1.6784, + "step": 6108 + }, + { + "epoch": 0.9856405292029687, + "grad_norm": 4.952674388885498, + "learning_rate": 6.962155908586954e-08, + "loss": 1.7973, + "step": 6109 + }, + { + "epoch": 0.9858018715714747, + "grad_norm": 4.816624641418457, + "learning_rate": 6.825005150075114e-08, + "loss": 1.7826, + "step": 6110 + }, + { + "epoch": 0.9859632139399807, + "grad_norm": 5.050424098968506, + "learning_rate": 6.689217839561113e-08, + "loss": 1.9938, + "step": 6111 + }, + { + "epoch": 0.9861245563084866, + "grad_norm": 4.853276252746582, + "learning_rate": 6.554794014124177e-08, + "loss": 1.8858, + "step": 6112 + }, + { + "epoch": 0.9862858986769926, + "grad_norm": 4.018308639526367, + "learning_rate": 6.421733710469391e-08, + "loss": 1.8462, + "step": 6113 + }, + { + "epoch": 0.9864472410454985, + "grad_norm": 4.396552562713623, + "learning_rate": 6.290036964931578e-08, + "loss": 1.8676, + "step": 6114 + }, + { + "epoch": 0.9866085834140045, + "grad_norm": 4.659566402435303, + "learning_rate": 6.159703813471418e-08, + "loss": 1.6639, + "step": 6115 + }, + { + "epoch": 0.9867699257825104, + "grad_norm": 4.091868877410889, + "learning_rate": 6.030734291677664e-08, + "loss": 1.6306, + "step": 6116 + }, + { + "epoch": 0.9869312681510165, + "grad_norm": 6.1804375648498535, + "learning_rate": 5.903128434768257e-08, + "loss": 2.1267, + "step": 6117 + }, + { + "epoch": 0.9870926105195225, + "grad_norm": 4.831748008728027, + "learning_rate": 5.7768862775864354e-08, + "loss": 1.6718, + "step": 6118 + }, + { + "epoch": 0.9872539528880284, + "grad_norm": 3.624600887298584, + "learning_rate": 5.652007854605179e-08, + "loss": 1.6628, + "step": 6119 + }, + { + "epoch": 0.9874152952565344, + "grad_norm": 4.931962490081787, + "learning_rate": 5.5284931999227685e-08, + "loss": 1.7394, + "step": 6120 + }, + { + "epoch": 0.9875766376250403, + "grad_norm": 5.466416835784912, + "learning_rate": 5.4063423472672236e-08, + "loss": 1.7392, + "step": 6121 + }, + { + "epoch": 0.9877379799935463, + "grad_norm": 5.067384243011475, + "learning_rate": 5.28555532999353e-08, + "loss": 1.8295, + "step": 6122 + }, + { + "epoch": 0.9878993223620522, + "grad_norm": 4.261351585388184, + "learning_rate": 5.1661321810836385e-08, + "loss": 2.1182, + "step": 6123 + }, + { + "epoch": 0.9880606647305582, + "grad_norm": 3.832385540008545, + "learning_rate": 5.048072933148129e-08, + "loss": 1.8679, + "step": 6124 + }, + { + "epoch": 0.9882220070990642, + "grad_norm": 4.070018768310547, + "learning_rate": 4.9313776184234386e-08, + "loss": 1.8629, + "step": 6125 + }, + { + "epoch": 0.9883833494675702, + "grad_norm": 3.701960325241089, + "learning_rate": 4.8160462687757425e-08, + "loss": 1.8328, + "step": 6126 + }, + { + "epoch": 0.9885446918360762, + "grad_norm": 5.63292121887207, + "learning_rate": 4.7020789156965175e-08, + "loss": 1.8521, + "step": 6127 + }, + { + "epoch": 0.9887060342045821, + "grad_norm": 3.8325681686401367, + "learning_rate": 4.5894755903075347e-08, + "loss": 1.7439, + "step": 6128 + }, + { + "epoch": 0.9888673765730881, + "grad_norm": 3.7844038009643555, + "learning_rate": 4.478236323355311e-08, + "loss": 1.6977, + "step": 6129 + }, + { + "epoch": 0.989028718941594, + "grad_norm": 3.7526204586029053, + "learning_rate": 4.368361145214994e-08, + "loss": 2.0068, + "step": 6130 + }, + { + "epoch": 0.9891900613101, + "grad_norm": 4.725560665130615, + "learning_rate": 4.25985008589036e-08, + "loss": 1.7959, + "step": 6131 + }, + { + "epoch": 0.989351403678606, + "grad_norm": 4.374151706695557, + "learning_rate": 4.152703175011041e-08, + "loss": 1.8819, + "step": 6132 + }, + { + "epoch": 0.9895127460471119, + "grad_norm": 3.925537347793579, + "learning_rate": 4.046920441834745e-08, + "loss": 1.9331, + "step": 6133 + }, + { + "epoch": 0.989674088415618, + "grad_norm": 4.154895305633545, + "learning_rate": 3.942501915247254e-08, + "loss": 1.7509, + "step": 6134 + }, + { + "epoch": 0.989835430784124, + "grad_norm": 4.7291998863220215, + "learning_rate": 3.839447623760761e-08, + "loss": 1.7866, + "step": 6135 + }, + { + "epoch": 0.9899967731526299, + "grad_norm": 3.555507183074951, + "learning_rate": 3.737757595515534e-08, + "loss": 1.7108, + "step": 6136 + }, + { + "epoch": 0.9901581155211359, + "grad_norm": 4.529835224151611, + "learning_rate": 3.637431858279916e-08, + "loss": 1.6214, + "step": 6137 + }, + { + "epoch": 0.9903194578896418, + "grad_norm": 4.146393775939941, + "learning_rate": 3.538470439448105e-08, + "loss": 1.73, + "step": 6138 + }, + { + "epoch": 0.9904808002581478, + "grad_norm": 5.236659526824951, + "learning_rate": 3.4408733660440395e-08, + "loss": 1.8729, + "step": 6139 + }, + { + "epoch": 0.9906421426266537, + "grad_norm": 4.751691818237305, + "learning_rate": 3.344640664716958e-08, + "loss": 1.6228, + "step": 6140 + }, + { + "epoch": 0.9908034849951597, + "grad_norm": 8.98833179473877, + "learning_rate": 3.249772361744175e-08, + "loss": 1.8316, + "step": 6141 + }, + { + "epoch": 0.9909648273636656, + "grad_norm": 5.137564182281494, + "learning_rate": 3.156268483031077e-08, + "loss": 1.8294, + "step": 6142 + }, + { + "epoch": 0.9911261697321717, + "grad_norm": 4.2446370124816895, + "learning_rate": 3.064129054110021e-08, + "loss": 1.7952, + "step": 6143 + }, + { + "epoch": 0.9912875121006777, + "grad_norm": 4.958536148071289, + "learning_rate": 2.9733541001408794e-08, + "loss": 1.7894, + "step": 6144 + }, + { + "epoch": 0.9914488544691836, + "grad_norm": 5.8110671043396, + "learning_rate": 2.8839436459104918e-08, + "loss": 2.0013, + "step": 6145 + }, + { + "epoch": 0.9916101968376896, + "grad_norm": 3.97647762298584, + "learning_rate": 2.795897715833773e-08, + "loss": 2.0651, + "step": 6146 + }, + { + "epoch": 0.9917715392061955, + "grad_norm": 3.5526373386383057, + "learning_rate": 2.709216333952602e-08, + "loss": 1.7351, + "step": 6147 + }, + { + "epoch": 0.9919328815747015, + "grad_norm": 4.236942768096924, + "learning_rate": 2.6238995239369346e-08, + "loss": 1.6112, + "step": 6148 + }, + { + "epoch": 0.9920942239432075, + "grad_norm": 3.8910460472106934, + "learning_rate": 2.5399473090825798e-08, + "loss": 1.5615, + "step": 6149 + }, + { + "epoch": 0.9922555663117134, + "grad_norm": 5.299407482147217, + "learning_rate": 2.4573597123145332e-08, + "loss": 1.8359, + "step": 6150 + }, + { + "epoch": 0.9924169086802195, + "grad_norm": 3.532557249069214, + "learning_rate": 2.3761367561841998e-08, + "loss": 1.9855, + "step": 6151 + }, + { + "epoch": 0.9925782510487254, + "grad_norm": 4.3103437423706055, + "learning_rate": 2.2962784628705046e-08, + "loss": 1.7814, + "step": 6152 + }, + { + "epoch": 0.9927395934172314, + "grad_norm": 4.5299973487854, + "learning_rate": 2.2177848541793388e-08, + "loss": 1.7005, + "step": 6153 + }, + { + "epoch": 0.9929009357857373, + "grad_norm": 3.790895462036133, + "learning_rate": 2.1406559515452228e-08, + "loss": 1.4501, + "step": 6154 + }, + { + "epoch": 0.9930622781542433, + "grad_norm": 4.620772361755371, + "learning_rate": 2.0648917760279775e-08, + "loss": 1.7938, + "step": 6155 + }, + { + "epoch": 0.9932236205227493, + "grad_norm": 4.014719009399414, + "learning_rate": 1.9904923483171635e-08, + "loss": 2.0391, + "step": 6156 + }, + { + "epoch": 0.9933849628912552, + "grad_norm": 4.058664321899414, + "learning_rate": 1.9174576887276417e-08, + "loss": 1.6082, + "step": 6157 + }, + { + "epoch": 0.9935463052597612, + "grad_norm": 4.010701656341553, + "learning_rate": 1.845787817202349e-08, + "loss": 1.6187, + "step": 6158 + }, + { + "epoch": 0.9937076476282671, + "grad_norm": 4.73102331161499, + "learning_rate": 1.7754827533122964e-08, + "loss": 1.9062, + "step": 6159 + }, + { + "epoch": 0.9938689899967732, + "grad_norm": 4.531217575073242, + "learning_rate": 1.7065425162549054e-08, + "loss": 1.8403, + "step": 6160 + }, + { + "epoch": 0.9940303323652792, + "grad_norm": 6.234742641448975, + "learning_rate": 1.6389671248545623e-08, + "loss": 1.7349, + "step": 6161 + }, + { + "epoch": 0.9941916747337851, + "grad_norm": 3.9504058361053467, + "learning_rate": 1.5727565975642844e-08, + "loss": 1.9756, + "step": 6162 + }, + { + "epoch": 0.9943530171022911, + "grad_norm": 5.519317150115967, + "learning_rate": 1.5079109524634983e-08, + "loss": 1.6333, + "step": 6163 + }, + { + "epoch": 0.994514359470797, + "grad_norm": 3.721313238143921, + "learning_rate": 1.4444302072591508e-08, + "loss": 1.7705, + "step": 6164 + }, + { + "epoch": 0.994675701839303, + "grad_norm": 4.5881757736206055, + "learning_rate": 1.3823143792851545e-08, + "loss": 1.7942, + "step": 6165 + }, + { + "epoch": 0.9948370442078089, + "grad_norm": 4.943873405456543, + "learning_rate": 1.3215634855029413e-08, + "loss": 1.8513, + "step": 6166 + }, + { + "epoch": 0.9949983865763149, + "grad_norm": 4.627598762512207, + "learning_rate": 1.2621775425020189e-08, + "loss": 1.6969, + "step": 6167 + }, + { + "epoch": 0.995159728944821, + "grad_norm": 4.433756351470947, + "learning_rate": 1.2041565664977494e-08, + "loss": 1.9785, + "step": 6168 + }, + { + "epoch": 0.9953210713133269, + "grad_norm": 3.788708448410034, + "learning_rate": 1.1475005733335708e-08, + "loss": 1.9499, + "step": 6169 + }, + { + "epoch": 0.9954824136818329, + "grad_norm": 4.006746768951416, + "learning_rate": 1.0922095784798858e-08, + "loss": 1.9891, + "step": 6170 + }, + { + "epoch": 0.9956437560503388, + "grad_norm": 4.220999240875244, + "learning_rate": 1.0382835970357275e-08, + "loss": 1.7248, + "step": 6171 + }, + { + "epoch": 0.9958050984188448, + "grad_norm": 3.901395320892334, + "learning_rate": 9.857226437248735e-09, + "loss": 1.5894, + "step": 6172 + }, + { + "epoch": 0.9959664407873507, + "grad_norm": 4.1000285148620605, + "learning_rate": 9.34526732900287e-09, + "loss": 1.8607, + "step": 6173 + }, + { + "epoch": 0.9961277831558567, + "grad_norm": 3.8186614513397217, + "learning_rate": 8.846958785418968e-09, + "loss": 1.7024, + "step": 6174 + }, + { + "epoch": 0.9962891255243627, + "grad_norm": 4.361697196960449, + "learning_rate": 8.362300942560409e-09, + "loss": 2.022, + "step": 6175 + }, + { + "epoch": 0.9964504678928686, + "grad_norm": 4.559261322021484, + "learning_rate": 7.891293932776878e-09, + "loss": 1.8452, + "step": 6176 + }, + { + "epoch": 0.9966118102613747, + "grad_norm": 3.8690664768218994, + "learning_rate": 7.433937884676611e-09, + "loss": 1.9764, + "step": 6177 + }, + { + "epoch": 0.9967731526298806, + "grad_norm": 5.568852424621582, + "learning_rate": 6.990232923148599e-09, + "loss": 1.8318, + "step": 6178 + }, + { + "epoch": 0.9969344949983866, + "grad_norm": 4.514222621917725, + "learning_rate": 6.5601791693514766e-09, + "loss": 1.6589, + "step": 6179 + }, + { + "epoch": 0.9970958373668926, + "grad_norm": 5.722458839416504, + "learning_rate": 6.143776740713536e-09, + "loss": 1.8228, + "step": 6180 + }, + { + "epoch": 0.9972571797353985, + "grad_norm": 4.524956703186035, + "learning_rate": 5.741025750943818e-09, + "loss": 1.8108, + "step": 6181 + }, + { + "epoch": 0.9974185221039045, + "grad_norm": 5.531214237213135, + "learning_rate": 5.351926310015465e-09, + "loss": 1.9062, + "step": 6182 + }, + { + "epoch": 0.9975798644724104, + "grad_norm": 5.001838684082031, + "learning_rate": 4.97647852417682e-09, + "loss": 1.7831, + "step": 6183 + }, + { + "epoch": 0.9977412068409164, + "grad_norm": 4.544003963470459, + "learning_rate": 4.614682495951428e-09, + "loss": 1.811, + "step": 6184 + }, + { + "epoch": 0.9979025492094223, + "grad_norm": 4.039700031280518, + "learning_rate": 4.266538324132485e-09, + "loss": 1.7955, + "step": 6185 + }, + { + "epoch": 0.9980638915779284, + "grad_norm": 5.618652820587158, + "learning_rate": 3.932046103777287e-09, + "loss": 1.7273, + "step": 6186 + }, + { + "epoch": 0.9982252339464344, + "grad_norm": 4.512746334075928, + "learning_rate": 3.6112059262294327e-09, + "loss": 1.7958, + "step": 6187 + }, + { + "epoch": 0.9983865763149403, + "grad_norm": 4.33228874206543, + "learning_rate": 3.3040178790966216e-09, + "loss": 1.6066, + "step": 6188 + }, + { + "epoch": 0.9985479186834463, + "grad_norm": 4.162201404571533, + "learning_rate": 3.0104820462673044e-09, + "loss": 1.8014, + "step": 6189 + }, + { + "epoch": 0.9987092610519522, + "grad_norm": 4.221867084503174, + "learning_rate": 2.730598507882931e-09, + "loss": 1.8571, + "step": 6190 + }, + { + "epoch": 0.9988706034204582, + "grad_norm": 4.4337053298950195, + "learning_rate": 2.464367340376805e-09, + "loss": 1.8666, + "step": 6191 + }, + { + "epoch": 0.9990319457889641, + "grad_norm": 4.931772232055664, + "learning_rate": 2.21178861644078e-09, + "loss": 1.5379, + "step": 6192 + }, + { + "epoch": 0.9991932881574701, + "grad_norm": 3.5758066177368164, + "learning_rate": 1.9728624050530107e-09, + "loss": 1.7382, + "step": 6193 + }, + { + "epoch": 0.9993546305259762, + "grad_norm": 4.1605000495910645, + "learning_rate": 1.7475887714502038e-09, + "loss": 1.7858, + "step": 6194 + }, + { + "epoch": 0.9995159728944821, + "grad_norm": 5.4196696281433105, + "learning_rate": 1.535967777149816e-09, + "loss": 1.801, + "step": 6195 + }, + { + "epoch": 0.9996773152629881, + "grad_norm": 4.785623550415039, + "learning_rate": 1.3379994799278538e-09, + "loss": 1.6988, + "step": 6196 + }, + { + "epoch": 0.999838657631494, + "grad_norm": 4.673151016235352, + "learning_rate": 1.1536839338521787e-09, + "loss": 1.6805, + "step": 6197 + }, + { + "epoch": 0.999838657631494, + "step": 6197, + "total_flos": 7.714147240515731e+18, + "train_loss": 0.05741394475061547, + "train_runtime": 1948.2586, + "train_samples_per_second": 101.798, + "train_steps_per_second": 3.181 + } + ], + "logging_steps": 1.0, + "max_steps": 6198, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.714147240515731e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}