diff --git "a/checkpoint-6337/trainer_state.json" "b/checkpoint-6337/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-6337/trainer_state.json" @@ -0,0 +1,44393 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 6337, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00015780337699226762, + "grad_norm": 3.4695091247558594, + "learning_rate": 5.0000000000000004e-08, + "loss": 0.9755, + "step": 1 + }, + { + "epoch": 0.00031560675398453525, + "grad_norm": 3.1075096130371094, + "learning_rate": 1.0000000000000001e-07, + "loss": 0.9021, + "step": 2 + }, + { + "epoch": 0.0004734101309768029, + "grad_norm": 3.351684331893921, + "learning_rate": 1.5000000000000002e-07, + "loss": 0.929, + "step": 3 + }, + { + "epoch": 0.0006312135079690705, + "grad_norm": 3.1467652320861816, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.9287, + "step": 4 + }, + { + "epoch": 0.0007890168849613382, + "grad_norm": 3.3596863746643066, + "learning_rate": 2.5000000000000004e-07, + "loss": 0.9073, + "step": 5 + }, + { + "epoch": 0.0009468202619536059, + "grad_norm": 3.2084641456604004, + "learning_rate": 3.0000000000000004e-07, + "loss": 0.897, + "step": 6 + }, + { + "epoch": 0.0011046236389458735, + "grad_norm": 3.1724932193756104, + "learning_rate": 3.5000000000000004e-07, + "loss": 0.9461, + "step": 7 + }, + { + "epoch": 0.001262427015938141, + "grad_norm": 3.4164812564849854, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.9553, + "step": 8 + }, + { + "epoch": 0.0014202303929304087, + "grad_norm": 3.167628765106201, + "learning_rate": 4.5000000000000003e-07, + "loss": 0.9082, + "step": 9 + }, + { + "epoch": 0.0015780337699226763, + "grad_norm": 3.0002071857452393, + "learning_rate": 5.000000000000001e-07, + "loss": 0.93, + "step": 10 + }, + { + "epoch": 0.001735837146914944, + "grad_norm": 2.6759836673736572, + "learning_rate": 5.5e-07, + "loss": 0.941, + "step": 11 + }, + { + "epoch": 0.0018936405239072117, + "grad_norm": 2.5295722484588623, + "learning_rate": 6.000000000000001e-07, + "loss": 0.9225, + "step": 12 + }, + { + "epoch": 0.002051443900899479, + "grad_norm": 2.5313825607299805, + "learning_rate": 6.5e-07, + "loss": 0.9303, + "step": 13 + }, + { + "epoch": 0.002209247277891747, + "grad_norm": 2.360046148300171, + "learning_rate": 7.000000000000001e-07, + "loss": 0.9316, + "step": 14 + }, + { + "epoch": 0.0023670506548840145, + "grad_norm": 2.2620716094970703, + "learning_rate": 7.5e-07, + "loss": 0.9204, + "step": 15 + }, + { + "epoch": 0.002524854031876282, + "grad_norm": 2.1830193996429443, + "learning_rate": 8.000000000000001e-07, + "loss": 0.9483, + "step": 16 + }, + { + "epoch": 0.00268265740886855, + "grad_norm": 1.7452276945114136, + "learning_rate": 8.500000000000001e-07, + "loss": 0.9222, + "step": 17 + }, + { + "epoch": 0.0028404607858608173, + "grad_norm": 1.6064575910568237, + "learning_rate": 9.000000000000001e-07, + "loss": 0.8648, + "step": 18 + }, + { + "epoch": 0.0029982641628530852, + "grad_norm": 1.5318560600280762, + "learning_rate": 9.500000000000001e-07, + "loss": 0.9208, + "step": 19 + }, + { + "epoch": 0.0031560675398453527, + "grad_norm": 1.4010709524154663, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.9075, + "step": 20 + }, + { + "epoch": 0.00331387091683762, + "grad_norm": 1.2859939336776733, + "learning_rate": 1.0500000000000001e-06, + "loss": 0.9119, + "step": 21 + }, + { + "epoch": 0.003471674293829888, + "grad_norm": 1.254467487335205, + "learning_rate": 1.1e-06, + "loss": 0.8984, + "step": 22 + }, + { + "epoch": 0.0036294776708221555, + "grad_norm": 1.2097504138946533, + "learning_rate": 1.1500000000000002e-06, + "loss": 0.8949, + "step": 23 + }, + { + "epoch": 0.0037872810478144234, + "grad_norm": 1.135109305381775, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.8751, + "step": 24 + }, + { + "epoch": 0.0039450844248066904, + "grad_norm": 1.0397167205810547, + "learning_rate": 1.25e-06, + "loss": 0.8591, + "step": 25 + }, + { + "epoch": 0.004102887801798958, + "grad_norm": 1.0846976041793823, + "learning_rate": 1.3e-06, + "loss": 0.8796, + "step": 26 + }, + { + "epoch": 0.004260691178791226, + "grad_norm": 1.0377200841903687, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.8704, + "step": 27 + }, + { + "epoch": 0.004418494555783494, + "grad_norm": 0.9901278018951416, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.8599, + "step": 28 + }, + { + "epoch": 0.004576297932775761, + "grad_norm": 0.997525691986084, + "learning_rate": 1.45e-06, + "loss": 0.8992, + "step": 29 + }, + { + "epoch": 0.004734101309768029, + "grad_norm": 0.9974583387374878, + "learning_rate": 1.5e-06, + "loss": 0.8722, + "step": 30 + }, + { + "epoch": 0.004891904686760297, + "grad_norm": 0.9311898350715637, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.8744, + "step": 31 + }, + { + "epoch": 0.005049708063752564, + "grad_norm": 0.8861135840415955, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.8615, + "step": 32 + }, + { + "epoch": 0.005207511440744832, + "grad_norm": 0.8253664970397949, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.8506, + "step": 33 + }, + { + "epoch": 0.0053653148177371, + "grad_norm": 0.8385401964187622, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.8711, + "step": 34 + }, + { + "epoch": 0.005523118194729367, + "grad_norm": 0.8381912112236023, + "learning_rate": 1.75e-06, + "loss": 0.8261, + "step": 35 + }, + { + "epoch": 0.005680921571721635, + "grad_norm": 0.8371213674545288, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.8619, + "step": 36 + }, + { + "epoch": 0.005838724948713903, + "grad_norm": 0.7881415486335754, + "learning_rate": 1.85e-06, + "loss": 0.8696, + "step": 37 + }, + { + "epoch": 0.0059965283257061705, + "grad_norm": 0.7667366862297058, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.8475, + "step": 38 + }, + { + "epoch": 0.0061543317026984375, + "grad_norm": 0.7892696857452393, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.7874, + "step": 39 + }, + { + "epoch": 0.006312135079690705, + "grad_norm": 0.7391462922096252, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.826, + "step": 40 + }, + { + "epoch": 0.006469938456682973, + "grad_norm": 0.7768069505691528, + "learning_rate": 2.05e-06, + "loss": 0.8174, + "step": 41 + }, + { + "epoch": 0.00662774183367524, + "grad_norm": 0.7187530994415283, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.84, + "step": 42 + }, + { + "epoch": 0.006785545210667508, + "grad_norm": 0.7308444976806641, + "learning_rate": 2.15e-06, + "loss": 0.777, + "step": 43 + }, + { + "epoch": 0.006943348587659776, + "grad_norm": 0.6875916123390198, + "learning_rate": 2.2e-06, + "loss": 0.8117, + "step": 44 + }, + { + "epoch": 0.007101151964652043, + "grad_norm": 0.6827040314674377, + "learning_rate": 2.25e-06, + "loss": 0.7676, + "step": 45 + }, + { + "epoch": 0.007258955341644311, + "grad_norm": 0.726186990737915, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.8373, + "step": 46 + }, + { + "epoch": 0.007416758718636579, + "grad_norm": 0.8154804110527039, + "learning_rate": 2.35e-06, + "loss": 0.8314, + "step": 47 + }, + { + "epoch": 0.007574562095628847, + "grad_norm": 0.681153416633606, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.7976, + "step": 48 + }, + { + "epoch": 0.007732365472621114, + "grad_norm": 0.6598945260047913, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.7631, + "step": 49 + }, + { + "epoch": 0.007890168849613381, + "grad_norm": 0.6514580249786377, + "learning_rate": 2.5e-06, + "loss": 0.7869, + "step": 50 + }, + { + "epoch": 0.00804797222660565, + "grad_norm": 0.6831402778625488, + "learning_rate": 2.55e-06, + "loss": 0.815, + "step": 51 + }, + { + "epoch": 0.008205775603597917, + "grad_norm": 0.6234767436981201, + "learning_rate": 2.6e-06, + "loss": 0.7642, + "step": 52 + }, + { + "epoch": 0.008363578980590185, + "grad_norm": 0.6619821786880493, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.823, + "step": 53 + }, + { + "epoch": 0.008521382357582452, + "grad_norm": 0.6569198369979858, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.8163, + "step": 54 + }, + { + "epoch": 0.00867918573457472, + "grad_norm": 0.6125742197036743, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.7659, + "step": 55 + }, + { + "epoch": 0.008836989111566988, + "grad_norm": 0.6333118677139282, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.7638, + "step": 56 + }, + { + "epoch": 0.008994792488559255, + "grad_norm": 0.605268120765686, + "learning_rate": 2.85e-06, + "loss": 0.7945, + "step": 57 + }, + { + "epoch": 0.009152595865551522, + "grad_norm": 0.6622551083564758, + "learning_rate": 2.9e-06, + "loss": 0.7867, + "step": 58 + }, + { + "epoch": 0.009310399242543791, + "grad_norm": 0.6064326167106628, + "learning_rate": 2.95e-06, + "loss": 0.7665, + "step": 59 + }, + { + "epoch": 0.009468202619536058, + "grad_norm": 0.5782569050788879, + "learning_rate": 3e-06, + "loss": 0.7511, + "step": 60 + }, + { + "epoch": 0.009626005996528325, + "grad_norm": 0.6393043398857117, + "learning_rate": 3.05e-06, + "loss": 0.7652, + "step": 61 + }, + { + "epoch": 0.009783809373520594, + "grad_norm": 0.6123020052909851, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.7713, + "step": 62 + }, + { + "epoch": 0.009941612750512861, + "grad_norm": 0.6291369795799255, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.7397, + "step": 63 + }, + { + "epoch": 0.010099416127505128, + "grad_norm": 0.6048417091369629, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.7419, + "step": 64 + }, + { + "epoch": 0.010257219504497397, + "grad_norm": 0.6030091643333435, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.7566, + "step": 65 + }, + { + "epoch": 0.010415022881489664, + "grad_norm": 0.6006994843482971, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.7675, + "step": 66 + }, + { + "epoch": 0.01057282625848193, + "grad_norm": 0.6062237620353699, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.7617, + "step": 67 + }, + { + "epoch": 0.0107306296354742, + "grad_norm": 0.5656557083129883, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.7557, + "step": 68 + }, + { + "epoch": 0.010888433012466467, + "grad_norm": 0.5785462260246277, + "learning_rate": 3.45e-06, + "loss": 0.7704, + "step": 69 + }, + { + "epoch": 0.011046236389458734, + "grad_norm": 0.643459677696228, + "learning_rate": 3.5e-06, + "loss": 0.7822, + "step": 70 + }, + { + "epoch": 0.011204039766451002, + "grad_norm": 0.575687825679779, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.7311, + "step": 71 + }, + { + "epoch": 0.01136184314344327, + "grad_norm": 0.5986055135726929, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.7363, + "step": 72 + }, + { + "epoch": 0.011519646520435538, + "grad_norm": 0.5744639039039612, + "learning_rate": 3.65e-06, + "loss": 0.7387, + "step": 73 + }, + { + "epoch": 0.011677449897427805, + "grad_norm": 0.659566342830658, + "learning_rate": 3.7e-06, + "loss": 0.7617, + "step": 74 + }, + { + "epoch": 0.011835253274420072, + "grad_norm": 0.5614746809005737, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.7421, + "step": 75 + }, + { + "epoch": 0.011993056651412341, + "grad_norm": 0.5739306211471558, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.7575, + "step": 76 + }, + { + "epoch": 0.012150860028404608, + "grad_norm": 0.5919414162635803, + "learning_rate": 3.85e-06, + "loss": 0.7604, + "step": 77 + }, + { + "epoch": 0.012308663405396875, + "grad_norm": 0.5688791275024414, + "learning_rate": 3.900000000000001e-06, + "loss": 0.7284, + "step": 78 + }, + { + "epoch": 0.012466466782389144, + "grad_norm": 0.5759897232055664, + "learning_rate": 3.95e-06, + "loss": 0.7271, + "step": 79 + }, + { + "epoch": 0.01262427015938141, + "grad_norm": 0.5806029438972473, + "learning_rate": 4.000000000000001e-06, + "loss": 0.721, + "step": 80 + }, + { + "epoch": 0.012782073536373678, + "grad_norm": 0.6108107566833496, + "learning_rate": 4.05e-06, + "loss": 0.7481, + "step": 81 + }, + { + "epoch": 0.012939876913365947, + "grad_norm": 0.5440589189529419, + "learning_rate": 4.1e-06, + "loss": 0.7141, + "step": 82 + }, + { + "epoch": 0.013097680290358214, + "grad_norm": 0.5595912933349609, + "learning_rate": 4.15e-06, + "loss": 0.7285, + "step": 83 + }, + { + "epoch": 0.01325548366735048, + "grad_norm": 0.5574108958244324, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.748, + "step": 84 + }, + { + "epoch": 0.01341328704434275, + "grad_norm": 0.566811740398407, + "learning_rate": 4.25e-06, + "loss": 0.7537, + "step": 85 + }, + { + "epoch": 0.013571090421335016, + "grad_norm": 0.5741453170776367, + "learning_rate": 4.3e-06, + "loss": 0.7675, + "step": 86 + }, + { + "epoch": 0.013728893798327283, + "grad_norm": 0.6193757653236389, + "learning_rate": 4.350000000000001e-06, + "loss": 0.7662, + "step": 87 + }, + { + "epoch": 0.013886697175319552, + "grad_norm": 0.5547411441802979, + "learning_rate": 4.4e-06, + "loss": 0.7295, + "step": 88 + }, + { + "epoch": 0.01404450055231182, + "grad_norm": 0.5620011687278748, + "learning_rate": 4.450000000000001e-06, + "loss": 0.6906, + "step": 89 + }, + { + "epoch": 0.014202303929304086, + "grad_norm": 0.5607304573059082, + "learning_rate": 4.5e-06, + "loss": 0.7593, + "step": 90 + }, + { + "epoch": 0.014360107306296355, + "grad_norm": 0.6090514659881592, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.7899, + "step": 91 + }, + { + "epoch": 0.014517910683288622, + "grad_norm": 0.5685172080993652, + "learning_rate": 4.600000000000001e-06, + "loss": 0.7666, + "step": 92 + }, + { + "epoch": 0.01467571406028089, + "grad_norm": 0.5406185984611511, + "learning_rate": 4.65e-06, + "loss": 0.7195, + "step": 93 + }, + { + "epoch": 0.014833517437273158, + "grad_norm": 0.6353302597999573, + "learning_rate": 4.7e-06, + "loss": 0.7712, + "step": 94 + }, + { + "epoch": 0.014991320814265425, + "grad_norm": 0.5817251205444336, + "learning_rate": 4.75e-06, + "loss": 0.7267, + "step": 95 + }, + { + "epoch": 0.015149124191257694, + "grad_norm": 0.5612877011299133, + "learning_rate": 4.800000000000001e-06, + "loss": 0.7517, + "step": 96 + }, + { + "epoch": 0.01530692756824996, + "grad_norm": 0.6283246278762817, + "learning_rate": 4.85e-06, + "loss": 0.7223, + "step": 97 + }, + { + "epoch": 0.015464730945242228, + "grad_norm": 0.6132266521453857, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.7431, + "step": 98 + }, + { + "epoch": 0.015622534322234496, + "grad_norm": 0.5491782426834106, + "learning_rate": 4.95e-06, + "loss": 0.7168, + "step": 99 + }, + { + "epoch": 0.015780337699226762, + "grad_norm": 0.58565354347229, + "learning_rate": 5e-06, + "loss": 0.7129, + "step": 100 + }, + { + "epoch": 0.01593814107621903, + "grad_norm": 0.5712704658508301, + "learning_rate": 4.999999965503045e-06, + "loss": 0.7102, + "step": 101 + }, + { + "epoch": 0.0160959444532113, + "grad_norm": 0.566000759601593, + "learning_rate": 4.99999986201218e-06, + "loss": 0.7354, + "step": 102 + }, + { + "epoch": 0.016253747830203568, + "grad_norm": 0.6005913615226746, + "learning_rate": 4.999999689527407e-06, + "loss": 0.678, + "step": 103 + }, + { + "epoch": 0.016411551207195833, + "grad_norm": 0.592139720916748, + "learning_rate": 4.999999448048732e-06, + "loss": 0.7168, + "step": 104 + }, + { + "epoch": 0.016569354584188102, + "grad_norm": 0.6043803095817566, + "learning_rate": 4.999999137576161e-06, + "loss": 0.7132, + "step": 105 + }, + { + "epoch": 0.01672715796118037, + "grad_norm": 0.6334400773048401, + "learning_rate": 4.999998758109703e-06, + "loss": 0.7301, + "step": 106 + }, + { + "epoch": 0.016884961338172636, + "grad_norm": 0.5717909336090088, + "learning_rate": 4.999998309649369e-06, + "loss": 0.7342, + "step": 107 + }, + { + "epoch": 0.017042764715164905, + "grad_norm": 0.5694952607154846, + "learning_rate": 4.99999779219517e-06, + "loss": 0.7111, + "step": 108 + }, + { + "epoch": 0.017200568092157174, + "grad_norm": 0.5668104887008667, + "learning_rate": 4.9999972057471215e-06, + "loss": 0.7361, + "step": 109 + }, + { + "epoch": 0.01735837146914944, + "grad_norm": 0.5958070158958435, + "learning_rate": 4.9999965503052385e-06, + "loss": 0.7257, + "step": 110 + }, + { + "epoch": 0.017516174846141708, + "grad_norm": 0.6269883513450623, + "learning_rate": 4.99999582586954e-06, + "loss": 0.7518, + "step": 111 + }, + { + "epoch": 0.017673978223133976, + "grad_norm": 0.5962989330291748, + "learning_rate": 4.999995032440046e-06, + "loss": 0.7045, + "step": 112 + }, + { + "epoch": 0.017831781600126242, + "grad_norm": 0.5502008199691772, + "learning_rate": 4.999994170016777e-06, + "loss": 0.7041, + "step": 113 + }, + { + "epoch": 0.01798958497711851, + "grad_norm": 0.5531915426254272, + "learning_rate": 4.999993238599759e-06, + "loss": 0.723, + "step": 114 + }, + { + "epoch": 0.01814738835411078, + "grad_norm": 0.5891967415809631, + "learning_rate": 4.9999922381890175e-06, + "loss": 0.7017, + "step": 115 + }, + { + "epoch": 0.018305191731103045, + "grad_norm": 0.5877284407615662, + "learning_rate": 4.999991168784578e-06, + "loss": 0.7416, + "step": 116 + }, + { + "epoch": 0.018462995108095313, + "grad_norm": 0.5425035357475281, + "learning_rate": 4.9999900303864715e-06, + "loss": 0.7111, + "step": 117 + }, + { + "epoch": 0.018620798485087582, + "grad_norm": 0.6062846183776855, + "learning_rate": 4.99998882299473e-06, + "loss": 0.7406, + "step": 118 + }, + { + "epoch": 0.018778601862079847, + "grad_norm": 0.5614964365959167, + "learning_rate": 4.999987546609385e-06, + "loss": 0.6745, + "step": 119 + }, + { + "epoch": 0.018936405239072116, + "grad_norm": 0.5659580826759338, + "learning_rate": 4.999986201230473e-06, + "loss": 0.7057, + "step": 120 + }, + { + "epoch": 0.019094208616064385, + "grad_norm": 0.5849928855895996, + "learning_rate": 4.999984786858031e-06, + "loss": 0.7273, + "step": 121 + }, + { + "epoch": 0.01925201199305665, + "grad_norm": 0.5829923152923584, + "learning_rate": 4.999983303492098e-06, + "loss": 0.6892, + "step": 122 + }, + { + "epoch": 0.01940981537004892, + "grad_norm": 0.5888005495071411, + "learning_rate": 4.999981751132714e-06, + "loss": 0.6985, + "step": 123 + }, + { + "epoch": 0.019567618747041188, + "grad_norm": 0.5881317257881165, + "learning_rate": 4.999980129779923e-06, + "loss": 0.6814, + "step": 124 + }, + { + "epoch": 0.019725422124033453, + "grad_norm": 0.5580435395240784, + "learning_rate": 4.9999784394337705e-06, + "loss": 0.6901, + "step": 125 + }, + { + "epoch": 0.019883225501025722, + "grad_norm": 0.5416051745414734, + "learning_rate": 4.9999766800943015e-06, + "loss": 0.7152, + "step": 126 + }, + { + "epoch": 0.02004102887801799, + "grad_norm": 0.5538280606269836, + "learning_rate": 4.999974851761564e-06, + "loss": 0.7008, + "step": 127 + }, + { + "epoch": 0.020198832255010256, + "grad_norm": 0.587465226650238, + "learning_rate": 4.999972954435612e-06, + "loss": 0.7346, + "step": 128 + }, + { + "epoch": 0.020356635632002525, + "grad_norm": 0.5665954947471619, + "learning_rate": 4.999970988116493e-06, + "loss": 0.694, + "step": 129 + }, + { + "epoch": 0.020514439008994793, + "grad_norm": 0.6517206430435181, + "learning_rate": 4.999968952804265e-06, + "loss": 0.7111, + "step": 130 + }, + { + "epoch": 0.02067224238598706, + "grad_norm": 0.5685126185417175, + "learning_rate": 4.9999668484989825e-06, + "loss": 0.7101, + "step": 131 + }, + { + "epoch": 0.020830045762979327, + "grad_norm": 0.56971275806427, + "learning_rate": 4.999964675200704e-06, + "loss": 0.6869, + "step": 132 + }, + { + "epoch": 0.020987849139971596, + "grad_norm": 0.6332544684410095, + "learning_rate": 4.9999624329094894e-06, + "loss": 0.7192, + "step": 133 + }, + { + "epoch": 0.02114565251696386, + "grad_norm": 0.5714592337608337, + "learning_rate": 4.9999601216254e-06, + "loss": 0.7004, + "step": 134 + }, + { + "epoch": 0.02130345589395613, + "grad_norm": 0.593350887298584, + "learning_rate": 4.999957741348501e-06, + "loss": 0.6944, + "step": 135 + }, + { + "epoch": 0.0214612592709484, + "grad_norm": 0.5451998710632324, + "learning_rate": 4.9999552920788575e-06, + "loss": 0.6778, + "step": 136 + }, + { + "epoch": 0.021619062647940664, + "grad_norm": 0.530829131603241, + "learning_rate": 4.9999527738165365e-06, + "loss": 0.7103, + "step": 137 + }, + { + "epoch": 0.021776866024932933, + "grad_norm": 0.63400799036026, + "learning_rate": 4.999950186561608e-06, + "loss": 0.7615, + "step": 138 + }, + { + "epoch": 0.021934669401925202, + "grad_norm": 0.5641283392906189, + "learning_rate": 4.999947530314143e-06, + "loss": 0.6941, + "step": 139 + }, + { + "epoch": 0.022092472778917467, + "grad_norm": 0.5932353734970093, + "learning_rate": 4.999944805074215e-06, + "loss": 0.7085, + "step": 140 + }, + { + "epoch": 0.022250276155909736, + "grad_norm": 0.5735512375831604, + "learning_rate": 4.9999420108419e-06, + "loss": 0.6922, + "step": 141 + }, + { + "epoch": 0.022408079532902005, + "grad_norm": 0.5803022384643555, + "learning_rate": 4.999939147617274e-06, + "loss": 0.6866, + "step": 142 + }, + { + "epoch": 0.022565882909894273, + "grad_norm": 0.5519378781318665, + "learning_rate": 4.999936215400416e-06, + "loss": 0.725, + "step": 143 + }, + { + "epoch": 0.02272368628688654, + "grad_norm": 0.5602562427520752, + "learning_rate": 4.999933214191408e-06, + "loss": 0.6688, + "step": 144 + }, + { + "epoch": 0.022881489663878807, + "grad_norm": 0.5791318416595459, + "learning_rate": 4.999930143990332e-06, + "loss": 0.736, + "step": 145 + }, + { + "epoch": 0.023039293040871076, + "grad_norm": 0.5677676796913147, + "learning_rate": 4.999927004797273e-06, + "loss": 0.7283, + "step": 146 + }, + { + "epoch": 0.02319709641786334, + "grad_norm": 0.5746163725852966, + "learning_rate": 4.999923796612317e-06, + "loss": 0.7203, + "step": 147 + }, + { + "epoch": 0.02335489979485561, + "grad_norm": 0.5557523369789124, + "learning_rate": 4.9999205194355534e-06, + "loss": 0.6742, + "step": 148 + }, + { + "epoch": 0.02351270317184788, + "grad_norm": 0.588647723197937, + "learning_rate": 4.9999171732670726e-06, + "loss": 0.659, + "step": 149 + }, + { + "epoch": 0.023670506548840144, + "grad_norm": 0.570029616355896, + "learning_rate": 4.999913758106967e-06, + "loss": 0.7109, + "step": 150 + }, + { + "epoch": 0.023828309925832413, + "grad_norm": 0.5667696595191956, + "learning_rate": 4.999910273955329e-06, + "loss": 0.68, + "step": 151 + }, + { + "epoch": 0.023986113302824682, + "grad_norm": 0.650233805179596, + "learning_rate": 4.999906720812257e-06, + "loss": 0.7234, + "step": 152 + }, + { + "epoch": 0.024143916679816947, + "grad_norm": 0.5987951159477234, + "learning_rate": 4.999903098677849e-06, + "loss": 0.6429, + "step": 153 + }, + { + "epoch": 0.024301720056809216, + "grad_norm": 0.57980877161026, + "learning_rate": 4.999899407552205e-06, + "loss": 0.7225, + "step": 154 + }, + { + "epoch": 0.024459523433801485, + "grad_norm": 0.5566393733024597, + "learning_rate": 4.999895647435424e-06, + "loss": 0.6982, + "step": 155 + }, + { + "epoch": 0.02461732681079375, + "grad_norm": 0.5524844527244568, + "learning_rate": 4.999891818327614e-06, + "loss": 0.7172, + "step": 156 + }, + { + "epoch": 0.02477513018778602, + "grad_norm": 0.5868800282478333, + "learning_rate": 4.999887920228877e-06, + "loss": 0.6808, + "step": 157 + }, + { + "epoch": 0.024932933564778288, + "grad_norm": 0.5716592669487, + "learning_rate": 4.999883953139323e-06, + "loss": 0.6895, + "step": 158 + }, + { + "epoch": 0.025090736941770553, + "grad_norm": 0.547123908996582, + "learning_rate": 4.99987991705906e-06, + "loss": 0.718, + "step": 159 + }, + { + "epoch": 0.02524854031876282, + "grad_norm": 0.5697892904281616, + "learning_rate": 4.999875811988201e-06, + "loss": 0.7039, + "step": 160 + }, + { + "epoch": 0.02540634369575509, + "grad_norm": 0.5757319331169128, + "learning_rate": 4.999871637926858e-06, + "loss": 0.7227, + "step": 161 + }, + { + "epoch": 0.025564147072747356, + "grad_norm": 0.550275444984436, + "learning_rate": 4.999867394875145e-06, + "loss": 0.7276, + "step": 162 + }, + { + "epoch": 0.025721950449739624, + "grad_norm": 0.5599570870399475, + "learning_rate": 4.999863082833183e-06, + "loss": 0.7396, + "step": 163 + }, + { + "epoch": 0.025879753826731893, + "grad_norm": 0.5693298578262329, + "learning_rate": 4.999858701801087e-06, + "loss": 0.6647, + "step": 164 + }, + { + "epoch": 0.02603755720372416, + "grad_norm": 0.5545943379402161, + "learning_rate": 4.99985425177898e-06, + "loss": 0.7082, + "step": 165 + }, + { + "epoch": 0.026195360580716427, + "grad_norm": 0.5741023421287537, + "learning_rate": 4.9998497327669845e-06, + "loss": 0.6952, + "step": 166 + }, + { + "epoch": 0.026353163957708696, + "grad_norm": 0.5653087496757507, + "learning_rate": 4.999845144765225e-06, + "loss": 0.695, + "step": 167 + }, + { + "epoch": 0.02651096733470096, + "grad_norm": 0.5562973618507385, + "learning_rate": 4.999840487773828e-06, + "loss": 0.6983, + "step": 168 + }, + { + "epoch": 0.02666877071169323, + "grad_norm": 0.5707293748855591, + "learning_rate": 4.999835761792922e-06, + "loss": 0.6754, + "step": 169 + }, + { + "epoch": 0.0268265740886855, + "grad_norm": 0.559601902961731, + "learning_rate": 4.999830966822639e-06, + "loss": 0.7154, + "step": 170 + }, + { + "epoch": 0.026984377465677764, + "grad_norm": 0.5796419382095337, + "learning_rate": 4.999826102863109e-06, + "loss": 0.7411, + "step": 171 + }, + { + "epoch": 0.027142180842670033, + "grad_norm": 0.574495255947113, + "learning_rate": 4.999821169914467e-06, + "loss": 0.6874, + "step": 172 + }, + { + "epoch": 0.0272999842196623, + "grad_norm": 0.6012911200523376, + "learning_rate": 4.99981616797685e-06, + "loss": 0.6802, + "step": 173 + }, + { + "epoch": 0.027457787596654567, + "grad_norm": 0.6089543700218201, + "learning_rate": 4.999811097050394e-06, + "loss": 0.6803, + "step": 174 + }, + { + "epoch": 0.027615590973646836, + "grad_norm": 0.6179903745651245, + "learning_rate": 4.999805957135242e-06, + "loss": 0.6748, + "step": 175 + }, + { + "epoch": 0.027773394350639104, + "grad_norm": 0.5262030363082886, + "learning_rate": 4.999800748231534e-06, + "loss": 0.7386, + "step": 176 + }, + { + "epoch": 0.02793119772763137, + "grad_norm": 0.5947113633155823, + "learning_rate": 4.999795470339413e-06, + "loss": 0.6844, + "step": 177 + }, + { + "epoch": 0.02808900110462364, + "grad_norm": 0.5661666989326477, + "learning_rate": 4.999790123459025e-06, + "loss": 0.7144, + "step": 178 + }, + { + "epoch": 0.028246804481615907, + "grad_norm": 0.6252312064170837, + "learning_rate": 4.999784707590519e-06, + "loss": 0.6923, + "step": 179 + }, + { + "epoch": 0.028404607858608173, + "grad_norm": 0.565272867679596, + "learning_rate": 4.999779222734044e-06, + "loss": 0.6635, + "step": 180 + }, + { + "epoch": 0.02856241123560044, + "grad_norm": 0.5912328362464905, + "learning_rate": 4.99977366888975e-06, + "loss": 0.6817, + "step": 181 + }, + { + "epoch": 0.02872021461259271, + "grad_norm": 0.572708785533905, + "learning_rate": 4.9997680460577925e-06, + "loss": 0.7259, + "step": 182 + }, + { + "epoch": 0.02887801798958498, + "grad_norm": 0.5593901872634888, + "learning_rate": 4.999762354238324e-06, + "loss": 0.7182, + "step": 183 + }, + { + "epoch": 0.029035821366577244, + "grad_norm": 0.5774214267730713, + "learning_rate": 4.999756593431504e-06, + "loss": 0.7148, + "step": 184 + }, + { + "epoch": 0.029193624743569513, + "grad_norm": 0.5614857077598572, + "learning_rate": 4.999750763637491e-06, + "loss": 0.7118, + "step": 185 + }, + { + "epoch": 0.02935142812056178, + "grad_norm": 0.5858423113822937, + "learning_rate": 4.999744864856444e-06, + "loss": 0.7101, + "step": 186 + }, + { + "epoch": 0.029509231497554047, + "grad_norm": 0.5738166570663452, + "learning_rate": 4.999738897088527e-06, + "loss": 0.6422, + "step": 187 + }, + { + "epoch": 0.029667034874546316, + "grad_norm": 0.5767882466316223, + "learning_rate": 4.999732860333907e-06, + "loss": 0.687, + "step": 188 + }, + { + "epoch": 0.029824838251538584, + "grad_norm": 0.5941026210784912, + "learning_rate": 4.999726754592746e-06, + "loss": 0.6935, + "step": 189 + }, + { + "epoch": 0.02998264162853085, + "grad_norm": 0.566830575466156, + "learning_rate": 4.999720579865216e-06, + "loss": 0.6847, + "step": 190 + }, + { + "epoch": 0.03014044500552312, + "grad_norm": 0.5870015025138855, + "learning_rate": 4.999714336151487e-06, + "loss": 0.6725, + "step": 191 + }, + { + "epoch": 0.030298248382515387, + "grad_norm": 0.5675283074378967, + "learning_rate": 4.99970802345173e-06, + "loss": 0.7113, + "step": 192 + }, + { + "epoch": 0.030456051759507653, + "grad_norm": 0.575271487236023, + "learning_rate": 4.99970164176612e-06, + "loss": 0.6972, + "step": 193 + }, + { + "epoch": 0.03061385513649992, + "grad_norm": 0.6220059394836426, + "learning_rate": 4.999695191094833e-06, + "loss": 0.6602, + "step": 194 + }, + { + "epoch": 0.03077165851349219, + "grad_norm": 0.5734169483184814, + "learning_rate": 4.999688671438046e-06, + "loss": 0.7019, + "step": 195 + }, + { + "epoch": 0.030929461890484455, + "grad_norm": 0.5798291563987732, + "learning_rate": 4.999682082795941e-06, + "loss": 0.6955, + "step": 196 + }, + { + "epoch": 0.031087265267476724, + "grad_norm": 0.5823208689689636, + "learning_rate": 4.9996754251687e-06, + "loss": 0.6816, + "step": 197 + }, + { + "epoch": 0.031245068644468993, + "grad_norm": 0.6037646532058716, + "learning_rate": 4.9996686985565035e-06, + "loss": 0.7274, + "step": 198 + }, + { + "epoch": 0.03140287202146126, + "grad_norm": 0.6083869338035583, + "learning_rate": 4.999661902959541e-06, + "loss": 0.7312, + "step": 199 + }, + { + "epoch": 0.031560675398453523, + "grad_norm": 0.557180643081665, + "learning_rate": 4.999655038377996e-06, + "loss": 0.7042, + "step": 200 + }, + { + "epoch": 0.03171847877544579, + "grad_norm": 0.5604941248893738, + "learning_rate": 4.999648104812062e-06, + "loss": 0.6735, + "step": 201 + }, + { + "epoch": 0.03187628215243806, + "grad_norm": 0.5835480093955994, + "learning_rate": 4.999641102261927e-06, + "loss": 0.6871, + "step": 202 + }, + { + "epoch": 0.03203408552943033, + "grad_norm": 0.5800153613090515, + "learning_rate": 4.999634030727787e-06, + "loss": 0.7135, + "step": 203 + }, + { + "epoch": 0.0321918889064226, + "grad_norm": 0.6008113622665405, + "learning_rate": 4.999626890209836e-06, + "loss": 0.7211, + "step": 204 + }, + { + "epoch": 0.03234969228341487, + "grad_norm": 0.5937141180038452, + "learning_rate": 4.99961968070827e-06, + "loss": 0.7025, + "step": 205 + }, + { + "epoch": 0.032507495660407136, + "grad_norm": 0.5836013555526733, + "learning_rate": 4.999612402223289e-06, + "loss": 0.7216, + "step": 206 + }, + { + "epoch": 0.0326652990373994, + "grad_norm": 0.5514218211174011, + "learning_rate": 4.999605054755095e-06, + "loss": 0.6878, + "step": 207 + }, + { + "epoch": 0.03282310241439167, + "grad_norm": 0.5864423513412476, + "learning_rate": 4.999597638303888e-06, + "loss": 0.6952, + "step": 208 + }, + { + "epoch": 0.032980905791383935, + "grad_norm": 0.5596604347229004, + "learning_rate": 4.999590152869875e-06, + "loss": 0.6981, + "step": 209 + }, + { + "epoch": 0.033138709168376204, + "grad_norm": 0.5714897513389587, + "learning_rate": 4.999582598453262e-06, + "loss": 0.7012, + "step": 210 + }, + { + "epoch": 0.03329651254536847, + "grad_norm": 0.6075905561447144, + "learning_rate": 4.9995749750542575e-06, + "loss": 0.6672, + "step": 211 + }, + { + "epoch": 0.03345431592236074, + "grad_norm": 0.602791965007782, + "learning_rate": 4.999567282673071e-06, + "loss": 0.7003, + "step": 212 + }, + { + "epoch": 0.033612119299353004, + "grad_norm": 0.6320512890815735, + "learning_rate": 4.999559521309916e-06, + "loss": 0.6542, + "step": 213 + }, + { + "epoch": 0.03376992267634527, + "grad_norm": 0.5556775331497192, + "learning_rate": 4.999551690965006e-06, + "loss": 0.7065, + "step": 214 + }, + { + "epoch": 0.03392772605333754, + "grad_norm": 0.6003708839416504, + "learning_rate": 4.999543791638557e-06, + "loss": 0.7164, + "step": 215 + }, + { + "epoch": 0.03408552943032981, + "grad_norm": 0.59291011095047, + "learning_rate": 4.999535823330789e-06, + "loss": 0.6725, + "step": 216 + }, + { + "epoch": 0.03424333280732208, + "grad_norm": 0.5823165774345398, + "learning_rate": 4.999527786041918e-06, + "loss": 0.7171, + "step": 217 + }, + { + "epoch": 0.03440113618431435, + "grad_norm": 0.5751739740371704, + "learning_rate": 4.999519679772169e-06, + "loss": 0.6997, + "step": 218 + }, + { + "epoch": 0.03455893956130661, + "grad_norm": 0.5675159096717834, + "learning_rate": 4.999511504521764e-06, + "loss": 0.6634, + "step": 219 + }, + { + "epoch": 0.03471674293829888, + "grad_norm": 0.5914757251739502, + "learning_rate": 4.99950326029093e-06, + "loss": 0.6627, + "step": 220 + }, + { + "epoch": 0.03487454631529115, + "grad_norm": 0.600142776966095, + "learning_rate": 4.999494947079894e-06, + "loss": 0.6456, + "step": 221 + }, + { + "epoch": 0.035032349692283415, + "grad_norm": 0.6334402561187744, + "learning_rate": 4.999486564888886e-06, + "loss": 0.6905, + "step": 222 + }, + { + "epoch": 0.035190153069275684, + "grad_norm": 0.553313136100769, + "learning_rate": 4.999478113718135e-06, + "loss": 0.6792, + "step": 223 + }, + { + "epoch": 0.03534795644626795, + "grad_norm": 0.6522662043571472, + "learning_rate": 4.999469593567877e-06, + "loss": 0.7042, + "step": 224 + }, + { + "epoch": 0.035505759823260215, + "grad_norm": 0.579100489616394, + "learning_rate": 4.999461004438345e-06, + "loss": 0.6703, + "step": 225 + }, + { + "epoch": 0.035663563200252484, + "grad_norm": 0.5745454430580139, + "learning_rate": 4.999452346329777e-06, + "loss": 0.7068, + "step": 226 + }, + { + "epoch": 0.03582136657724475, + "grad_norm": 0.5631722807884216, + "learning_rate": 4.999443619242413e-06, + "loss": 0.6415, + "step": 227 + }, + { + "epoch": 0.03597916995423702, + "grad_norm": 0.5802041888237, + "learning_rate": 4.999434823176493e-06, + "loss": 0.7148, + "step": 228 + }, + { + "epoch": 0.03613697333122929, + "grad_norm": 0.568011462688446, + "learning_rate": 4.999425958132258e-06, + "loss": 0.6942, + "step": 229 + }, + { + "epoch": 0.03629477670822156, + "grad_norm": 0.5698195695877075, + "learning_rate": 4.999417024109955e-06, + "loss": 0.7091, + "step": 230 + }, + { + "epoch": 0.03645258008521382, + "grad_norm": 0.5525780320167542, + "learning_rate": 4.99940802110983e-06, + "loss": 0.6997, + "step": 231 + }, + { + "epoch": 0.03661038346220609, + "grad_norm": 0.5847318768501282, + "learning_rate": 4.999398949132131e-06, + "loss": 0.6587, + "step": 232 + }, + { + "epoch": 0.03676818683919836, + "grad_norm": 0.5880563259124756, + "learning_rate": 4.999389808177109e-06, + "loss": 0.684, + "step": 233 + }, + { + "epoch": 0.03692599021619063, + "grad_norm": 0.6318623423576355, + "learning_rate": 4.999380598245015e-06, + "loss": 0.6902, + "step": 234 + }, + { + "epoch": 0.037083793593182895, + "grad_norm": 0.6510125994682312, + "learning_rate": 4.999371319336105e-06, + "loss": 0.7113, + "step": 235 + }, + { + "epoch": 0.037241596970175164, + "grad_norm": 0.5765509605407715, + "learning_rate": 4.9993619714506335e-06, + "loss": 0.6931, + "step": 236 + }, + { + "epoch": 0.037399400347167426, + "grad_norm": 0.5744260549545288, + "learning_rate": 4.9993525545888586e-06, + "loss": 0.727, + "step": 237 + }, + { + "epoch": 0.037557203724159695, + "grad_norm": 0.6622337698936462, + "learning_rate": 4.999343068751042e-06, + "loss": 0.6931, + "step": 238 + }, + { + "epoch": 0.037715007101151964, + "grad_norm": 0.5808623433113098, + "learning_rate": 4.999333513937443e-06, + "loss": 0.7032, + "step": 239 + }, + { + "epoch": 0.03787281047814423, + "grad_norm": 0.5975818634033203, + "learning_rate": 4.999323890148327e-06, + "loss": 0.6565, + "step": 240 + }, + { + "epoch": 0.0380306138551365, + "grad_norm": 0.57035231590271, + "learning_rate": 4.999314197383959e-06, + "loss": 0.6581, + "step": 241 + }, + { + "epoch": 0.03818841723212877, + "grad_norm": 0.5968270897865295, + "learning_rate": 4.999304435644606e-06, + "loss": 0.6607, + "step": 242 + }, + { + "epoch": 0.03834622060912104, + "grad_norm": 0.5562549233436584, + "learning_rate": 4.9992946049305394e-06, + "loss": 0.6938, + "step": 243 + }, + { + "epoch": 0.0385040239861133, + "grad_norm": 0.6161041855812073, + "learning_rate": 4.999284705242027e-06, + "loss": 0.7135, + "step": 244 + }, + { + "epoch": 0.03866182736310557, + "grad_norm": 0.6044813394546509, + "learning_rate": 4.9992747365793455e-06, + "loss": 0.673, + "step": 245 + }, + { + "epoch": 0.03881963074009784, + "grad_norm": 0.5869848132133484, + "learning_rate": 4.999264698942768e-06, + "loss": 0.6918, + "step": 246 + }, + { + "epoch": 0.03897743411709011, + "grad_norm": 0.5812231302261353, + "learning_rate": 4.999254592332572e-06, + "loss": 0.7159, + "step": 247 + }, + { + "epoch": 0.039135237494082376, + "grad_norm": 0.6004703640937805, + "learning_rate": 4.999244416749037e-06, + "loss": 0.6538, + "step": 248 + }, + { + "epoch": 0.039293040871074644, + "grad_norm": 0.5813343524932861, + "learning_rate": 4.999234172192443e-06, + "loss": 0.6667, + "step": 249 + }, + { + "epoch": 0.039450844248066906, + "grad_norm": 0.6214852929115295, + "learning_rate": 4.999223858663073e-06, + "loss": 0.6786, + "step": 250 + }, + { + "epoch": 0.039608647625059175, + "grad_norm": 0.5806785225868225, + "learning_rate": 4.999213476161212e-06, + "loss": 0.7021, + "step": 251 + }, + { + "epoch": 0.039766451002051444, + "grad_norm": 0.592923104763031, + "learning_rate": 4.999203024687146e-06, + "loss": 0.7163, + "step": 252 + }, + { + "epoch": 0.03992425437904371, + "grad_norm": 0.6129065155982971, + "learning_rate": 4.999192504241163e-06, + "loss": 0.6859, + "step": 253 + }, + { + "epoch": 0.04008205775603598, + "grad_norm": 0.5701097846031189, + "learning_rate": 4.999181914823555e-06, + "loss": 0.6539, + "step": 254 + }, + { + "epoch": 0.04023986113302825, + "grad_norm": 0.5989294648170471, + "learning_rate": 4.999171256434613e-06, + "loss": 0.6778, + "step": 255 + }, + { + "epoch": 0.04039766451002051, + "grad_norm": 0.5902093052864075, + "learning_rate": 4.99916052907463e-06, + "loss": 0.6295, + "step": 256 + }, + { + "epoch": 0.04055546788701278, + "grad_norm": 0.5864452123641968, + "learning_rate": 4.9991497327439055e-06, + "loss": 0.6687, + "step": 257 + }, + { + "epoch": 0.04071327126400505, + "grad_norm": 0.614521861076355, + "learning_rate": 4.999138867442734e-06, + "loss": 0.6884, + "step": 258 + }, + { + "epoch": 0.04087107464099732, + "grad_norm": 0.5736198425292969, + "learning_rate": 4.999127933171417e-06, + "loss": 0.7036, + "step": 259 + }, + { + "epoch": 0.04102887801798959, + "grad_norm": 0.6196522116661072, + "learning_rate": 4.999116929930256e-06, + "loss": 0.7009, + "step": 260 + }, + { + "epoch": 0.041186681394981856, + "grad_norm": 0.5769186019897461, + "learning_rate": 4.999105857719554e-06, + "loss": 0.677, + "step": 261 + }, + { + "epoch": 0.04134448477197412, + "grad_norm": 0.6262659430503845, + "learning_rate": 4.9990947165396185e-06, + "loss": 0.704, + "step": 262 + }, + { + "epoch": 0.041502288148966386, + "grad_norm": 0.5980515480041504, + "learning_rate": 4.999083506390755e-06, + "loss": 0.6613, + "step": 263 + }, + { + "epoch": 0.041660091525958655, + "grad_norm": 0.5899896025657654, + "learning_rate": 4.999072227273274e-06, + "loss": 0.6639, + "step": 264 + }, + { + "epoch": 0.041817894902950924, + "grad_norm": 0.6150217652320862, + "learning_rate": 4.999060879187485e-06, + "loss": 0.6744, + "step": 265 + }, + { + "epoch": 0.04197569827994319, + "grad_norm": 0.5907060503959656, + "learning_rate": 4.999049462133704e-06, + "loss": 0.6693, + "step": 266 + }, + { + "epoch": 0.04213350165693546, + "grad_norm": 0.5581928491592407, + "learning_rate": 4.999037976112243e-06, + "loss": 0.677, + "step": 267 + }, + { + "epoch": 0.04229130503392772, + "grad_norm": 0.6152248382568359, + "learning_rate": 4.999026421123423e-06, + "loss": 0.6877, + "step": 268 + }, + { + "epoch": 0.04244910841091999, + "grad_norm": 0.5779948234558105, + "learning_rate": 4.9990147971675585e-06, + "loss": 0.6759, + "step": 269 + }, + { + "epoch": 0.04260691178791226, + "grad_norm": 0.5667346715927124, + "learning_rate": 4.999003104244973e-06, + "loss": 0.6581, + "step": 270 + }, + { + "epoch": 0.04276471516490453, + "grad_norm": 0.6046004295349121, + "learning_rate": 4.998991342355988e-06, + "loss": 0.6608, + "step": 271 + }, + { + "epoch": 0.0429225185418968, + "grad_norm": 0.587218165397644, + "learning_rate": 4.998979511500929e-06, + "loss": 0.6777, + "step": 272 + }, + { + "epoch": 0.04308032191888907, + "grad_norm": 0.5900250673294067, + "learning_rate": 4.998967611680121e-06, + "loss": 0.7031, + "step": 273 + }, + { + "epoch": 0.04323812529588133, + "grad_norm": 0.5990288853645325, + "learning_rate": 4.998955642893895e-06, + "loss": 0.673, + "step": 274 + }, + { + "epoch": 0.0433959286728736, + "grad_norm": 0.6019039750099182, + "learning_rate": 4.998943605142579e-06, + "loss": 0.6787, + "step": 275 + }, + { + "epoch": 0.043553732049865866, + "grad_norm": 0.5937849283218384, + "learning_rate": 4.998931498426506e-06, + "loss": 0.6828, + "step": 276 + }, + { + "epoch": 0.043711535426858135, + "grad_norm": 0.6089298129081726, + "learning_rate": 4.99891932274601e-06, + "loss": 0.6931, + "step": 277 + }, + { + "epoch": 0.043869338803850404, + "grad_norm": 0.5610269904136658, + "learning_rate": 4.998907078101427e-06, + "loss": 0.6594, + "step": 278 + }, + { + "epoch": 0.04402714218084267, + "grad_norm": 0.6588159799575806, + "learning_rate": 4.998894764493095e-06, + "loss": 0.6988, + "step": 279 + }, + { + "epoch": 0.044184945557834934, + "grad_norm": 0.5775243043899536, + "learning_rate": 4.998882381921354e-06, + "loss": 0.6728, + "step": 280 + }, + { + "epoch": 0.0443427489348272, + "grad_norm": 0.5718449950218201, + "learning_rate": 4.998869930386546e-06, + "loss": 0.6766, + "step": 281 + }, + { + "epoch": 0.04450055231181947, + "grad_norm": 0.606595516204834, + "learning_rate": 4.998857409889013e-06, + "loss": 0.6492, + "step": 282 + }, + { + "epoch": 0.04465835568881174, + "grad_norm": 0.5423402786254883, + "learning_rate": 4.998844820429103e-06, + "loss": 0.6066, + "step": 283 + }, + { + "epoch": 0.04481615906580401, + "grad_norm": 0.6066984534263611, + "learning_rate": 4.998832162007162e-06, + "loss": 0.6481, + "step": 284 + }, + { + "epoch": 0.04497396244279628, + "grad_norm": 0.6280665993690491, + "learning_rate": 4.998819434623539e-06, + "loss": 0.6749, + "step": 285 + }, + { + "epoch": 0.04513176581978855, + "grad_norm": 0.5799391865730286, + "learning_rate": 4.9988066382785875e-06, + "loss": 0.6534, + "step": 286 + }, + { + "epoch": 0.04528956919678081, + "grad_norm": 0.6013739705085754, + "learning_rate": 4.998793772972657e-06, + "loss": 0.711, + "step": 287 + }, + { + "epoch": 0.04544737257377308, + "grad_norm": 0.5728563666343689, + "learning_rate": 4.998780838706105e-06, + "loss": 0.6631, + "step": 288 + }, + { + "epoch": 0.045605175950765346, + "grad_norm": 0.5979239344596863, + "learning_rate": 4.998767835479288e-06, + "loss": 0.6805, + "step": 289 + }, + { + "epoch": 0.045762979327757615, + "grad_norm": 0.5882554054260254, + "learning_rate": 4.998754763292565e-06, + "loss": 0.6794, + "step": 290 + }, + { + "epoch": 0.045920782704749884, + "grad_norm": 0.6107369661331177, + "learning_rate": 4.998741622146296e-06, + "loss": 0.6983, + "step": 291 + }, + { + "epoch": 0.04607858608174215, + "grad_norm": 0.5764024257659912, + "learning_rate": 4.998728412040845e-06, + "loss": 0.6407, + "step": 292 + }, + { + "epoch": 0.046236389458734414, + "grad_norm": 0.5824241638183594, + "learning_rate": 4.998715132976575e-06, + "loss": 0.6728, + "step": 293 + }, + { + "epoch": 0.04639419283572668, + "grad_norm": 0.5927640795707703, + "learning_rate": 4.998701784953853e-06, + "loss": 0.7068, + "step": 294 + }, + { + "epoch": 0.04655199621271895, + "grad_norm": 0.566312313079834, + "learning_rate": 4.998688367973049e-06, + "loss": 0.6789, + "step": 295 + }, + { + "epoch": 0.04670979958971122, + "grad_norm": 0.5688636302947998, + "learning_rate": 4.998674882034531e-06, + "loss": 0.7195, + "step": 296 + }, + { + "epoch": 0.04686760296670349, + "grad_norm": 0.5618046522140503, + "learning_rate": 4.998661327138672e-06, + "loss": 0.6644, + "step": 297 + }, + { + "epoch": 0.04702540634369576, + "grad_norm": 0.6005878448486328, + "learning_rate": 4.998647703285846e-06, + "loss": 0.6772, + "step": 298 + }, + { + "epoch": 0.04718320972068802, + "grad_norm": 0.5815431475639343, + "learning_rate": 4.99863401047643e-06, + "loss": 0.6907, + "step": 299 + }, + { + "epoch": 0.04734101309768029, + "grad_norm": 0.5753422975540161, + "learning_rate": 4.9986202487108e-06, + "loss": 0.6597, + "step": 300 + }, + { + "epoch": 0.04749881647467256, + "grad_norm": 0.6568279266357422, + "learning_rate": 4.998606417989338e-06, + "loss": 0.6874, + "step": 301 + }, + { + "epoch": 0.047656619851664826, + "grad_norm": 0.6008743047714233, + "learning_rate": 4.998592518312424e-06, + "loss": 0.6959, + "step": 302 + }, + { + "epoch": 0.047814423228657095, + "grad_norm": 0.5859663486480713, + "learning_rate": 4.998578549680442e-06, + "loss": 0.6753, + "step": 303 + }, + { + "epoch": 0.047972226605649364, + "grad_norm": 0.586944580078125, + "learning_rate": 4.9985645120937775e-06, + "loss": 0.6723, + "step": 304 + }, + { + "epoch": 0.048130029982641626, + "grad_norm": 0.5542764663696289, + "learning_rate": 4.998550405552818e-06, + "loss": 0.6755, + "step": 305 + }, + { + "epoch": 0.048287833359633894, + "grad_norm": 0.6082794070243835, + "learning_rate": 4.998536230057953e-06, + "loss": 0.6797, + "step": 306 + }, + { + "epoch": 0.04844563673662616, + "grad_norm": 0.5880076885223389, + "learning_rate": 4.998521985609574e-06, + "loss": 0.6881, + "step": 307 + }, + { + "epoch": 0.04860344011361843, + "grad_norm": 0.6128143668174744, + "learning_rate": 4.998507672208073e-06, + "loss": 0.6652, + "step": 308 + }, + { + "epoch": 0.0487612434906107, + "grad_norm": 0.6205839514732361, + "learning_rate": 4.998493289853847e-06, + "loss": 0.6398, + "step": 309 + }, + { + "epoch": 0.04891904686760297, + "grad_norm": 0.6115543246269226, + "learning_rate": 4.998478838547289e-06, + "loss": 0.6736, + "step": 310 + }, + { + "epoch": 0.04907685024459523, + "grad_norm": 0.6236895322799683, + "learning_rate": 4.998464318288803e-06, + "loss": 0.693, + "step": 311 + }, + { + "epoch": 0.0492346536215875, + "grad_norm": 0.6036615967750549, + "learning_rate": 4.998449729078786e-06, + "loss": 0.6722, + "step": 312 + }, + { + "epoch": 0.04939245699857977, + "grad_norm": 0.647555410861969, + "learning_rate": 4.9984350709176414e-06, + "loss": 0.7027, + "step": 313 + }, + { + "epoch": 0.04955026037557204, + "grad_norm": 0.6417546272277832, + "learning_rate": 4.998420343805774e-06, + "loss": 0.6853, + "step": 314 + }, + { + "epoch": 0.049708063752564306, + "grad_norm": 0.6082236170768738, + "learning_rate": 4.998405547743591e-06, + "loss": 0.649, + "step": 315 + }, + { + "epoch": 0.049865867129556575, + "grad_norm": 0.6165286302566528, + "learning_rate": 4.9983906827314995e-06, + "loss": 0.6515, + "step": 316 + }, + { + "epoch": 0.05002367050654884, + "grad_norm": 0.5871098041534424, + "learning_rate": 4.998375748769911e-06, + "loss": 0.6974, + "step": 317 + }, + { + "epoch": 0.050181473883541106, + "grad_norm": 0.6099493503570557, + "learning_rate": 4.998360745859237e-06, + "loss": 0.663, + "step": 318 + }, + { + "epoch": 0.050339277260533374, + "grad_norm": 0.5965157151222229, + "learning_rate": 4.99834567399989e-06, + "loss": 0.6902, + "step": 319 + }, + { + "epoch": 0.05049708063752564, + "grad_norm": 0.5805944204330444, + "learning_rate": 4.998330533192288e-06, + "loss": 0.6667, + "step": 320 + }, + { + "epoch": 0.05065488401451791, + "grad_norm": 0.5696669220924377, + "learning_rate": 4.9983153234368495e-06, + "loss": 0.7126, + "step": 321 + }, + { + "epoch": 0.05081268739151018, + "grad_norm": 0.6023860573768616, + "learning_rate": 4.998300044733992e-06, + "loss": 0.6872, + "step": 322 + }, + { + "epoch": 0.05097049076850245, + "grad_norm": 0.5759710669517517, + "learning_rate": 4.998284697084138e-06, + "loss": 0.7069, + "step": 323 + }, + { + "epoch": 0.05112829414549471, + "grad_norm": 0.5912811756134033, + "learning_rate": 4.998269280487712e-06, + "loss": 0.633, + "step": 324 + }, + { + "epoch": 0.05128609752248698, + "grad_norm": 0.6077597141265869, + "learning_rate": 4.998253794945138e-06, + "loss": 0.6995, + "step": 325 + }, + { + "epoch": 0.05144390089947925, + "grad_norm": 0.6074457168579102, + "learning_rate": 4.998238240456844e-06, + "loss": 0.6675, + "step": 326 + }, + { + "epoch": 0.05160170427647152, + "grad_norm": 0.5997494459152222, + "learning_rate": 4.99822261702326e-06, + "loss": 0.7325, + "step": 327 + }, + { + "epoch": 0.051759507653463786, + "grad_norm": 0.5813965201377869, + "learning_rate": 4.9982069246448165e-06, + "loss": 0.6552, + "step": 328 + }, + { + "epoch": 0.051917311030456055, + "grad_norm": 0.6059664487838745, + "learning_rate": 4.998191163321946e-06, + "loss": 0.6864, + "step": 329 + }, + { + "epoch": 0.05207511440744832, + "grad_norm": 0.5916436314582825, + "learning_rate": 4.998175333055084e-06, + "loss": 0.6239, + "step": 330 + }, + { + "epoch": 0.052232917784440586, + "grad_norm": 0.6300515532493591, + "learning_rate": 4.9981594338446684e-06, + "loss": 0.6503, + "step": 331 + }, + { + "epoch": 0.052390721161432854, + "grad_norm": 0.5750347971916199, + "learning_rate": 4.998143465691136e-06, + "loss": 0.6748, + "step": 332 + }, + { + "epoch": 0.05254852453842512, + "grad_norm": 0.5579541325569153, + "learning_rate": 4.9981274285949284e-06, + "loss": 0.6659, + "step": 333 + }, + { + "epoch": 0.05270632791541739, + "grad_norm": 0.5829914808273315, + "learning_rate": 4.9981113225564895e-06, + "loss": 0.6765, + "step": 334 + }, + { + "epoch": 0.05286413129240966, + "grad_norm": 0.5955128073692322, + "learning_rate": 4.998095147576261e-06, + "loss": 0.6792, + "step": 335 + }, + { + "epoch": 0.05302193466940192, + "grad_norm": 0.5873895883560181, + "learning_rate": 4.998078903654692e-06, + "loss": 0.679, + "step": 336 + }, + { + "epoch": 0.05317973804639419, + "grad_norm": 0.6097041368484497, + "learning_rate": 4.998062590792229e-06, + "loss": 0.6342, + "step": 337 + }, + { + "epoch": 0.05333754142338646, + "grad_norm": 0.6153303980827332, + "learning_rate": 4.998046208989323e-06, + "loss": 0.6964, + "step": 338 + }, + { + "epoch": 0.05349534480037873, + "grad_norm": 0.6509466767311096, + "learning_rate": 4.998029758246426e-06, + "loss": 0.6663, + "step": 339 + }, + { + "epoch": 0.053653148177371, + "grad_norm": 0.599280595779419, + "learning_rate": 4.998013238563991e-06, + "loss": 0.6407, + "step": 340 + }, + { + "epoch": 0.053810951554363266, + "grad_norm": 0.5892170071601868, + "learning_rate": 4.997996649942476e-06, + "loss": 0.6734, + "step": 341 + }, + { + "epoch": 0.05396875493135553, + "grad_norm": 0.602360188961029, + "learning_rate": 4.997979992382338e-06, + "loss": 0.6778, + "step": 342 + }, + { + "epoch": 0.0541265583083478, + "grad_norm": 0.6347168684005737, + "learning_rate": 4.997963265884035e-06, + "loss": 0.6237, + "step": 343 + }, + { + "epoch": 0.054284361685340066, + "grad_norm": 0.6106343269348145, + "learning_rate": 4.997946470448031e-06, + "loss": 0.6922, + "step": 344 + }, + { + "epoch": 0.054442165062332334, + "grad_norm": 0.6039180755615234, + "learning_rate": 4.997929606074788e-06, + "loss": 0.6753, + "step": 345 + }, + { + "epoch": 0.0545999684393246, + "grad_norm": 0.5909541249275208, + "learning_rate": 4.997912672764772e-06, + "loss": 0.6794, + "step": 346 + }, + { + "epoch": 0.05475777181631687, + "grad_norm": 0.5898770093917847, + "learning_rate": 4.99789567051845e-06, + "loss": 0.6652, + "step": 347 + }, + { + "epoch": 0.054915575193309134, + "grad_norm": 0.5652443766593933, + "learning_rate": 4.997878599336291e-06, + "loss": 0.6445, + "step": 348 + }, + { + "epoch": 0.0550733785703014, + "grad_norm": 0.6308527588844299, + "learning_rate": 4.9978614592187675e-06, + "loss": 0.6475, + "step": 349 + }, + { + "epoch": 0.05523118194729367, + "grad_norm": 0.5761753916740417, + "learning_rate": 4.997844250166351e-06, + "loss": 0.6084, + "step": 350 + }, + { + "epoch": 0.05538898532428594, + "grad_norm": 0.5787163376808167, + "learning_rate": 4.9978269721795175e-06, + "loss": 0.6673, + "step": 351 + }, + { + "epoch": 0.05554678870127821, + "grad_norm": 0.6460014581680298, + "learning_rate": 4.997809625258743e-06, + "loss": 0.6934, + "step": 352 + }, + { + "epoch": 0.05570459207827048, + "grad_norm": 0.5741932392120361, + "learning_rate": 4.997792209404506e-06, + "loss": 0.6309, + "step": 353 + }, + { + "epoch": 0.05586239545526274, + "grad_norm": 0.6123661398887634, + "learning_rate": 4.997774724617289e-06, + "loss": 0.6751, + "step": 354 + }, + { + "epoch": 0.05602019883225501, + "grad_norm": 0.5850964784622192, + "learning_rate": 4.997757170897572e-06, + "loss": 0.6589, + "step": 355 + }, + { + "epoch": 0.05617800220924728, + "grad_norm": 0.5619524121284485, + "learning_rate": 4.9977395482458415e-06, + "loss": 0.6667, + "step": 356 + }, + { + "epoch": 0.056335805586239546, + "grad_norm": 0.5878908038139343, + "learning_rate": 4.997721856662583e-06, + "loss": 0.6805, + "step": 357 + }, + { + "epoch": 0.056493608963231814, + "grad_norm": 0.6137134432792664, + "learning_rate": 4.997704096148283e-06, + "loss": 0.6244, + "step": 358 + }, + { + "epoch": 0.05665141234022408, + "grad_norm": 0.6053595542907715, + "learning_rate": 4.997686266703436e-06, + "loss": 0.7159, + "step": 359 + }, + { + "epoch": 0.056809215717216345, + "grad_norm": 0.6243941187858582, + "learning_rate": 4.99766836832853e-06, + "loss": 0.6802, + "step": 360 + }, + { + "epoch": 0.056967019094208614, + "grad_norm": 0.6301602721214294, + "learning_rate": 4.997650401024061e-06, + "loss": 0.685, + "step": 361 + }, + { + "epoch": 0.05712482247120088, + "grad_norm": 0.6077635288238525, + "learning_rate": 4.997632364790524e-06, + "loss": 0.669, + "step": 362 + }, + { + "epoch": 0.05728262584819315, + "grad_norm": 0.6059938669204712, + "learning_rate": 4.9976142596284175e-06, + "loss": 0.6309, + "step": 363 + }, + { + "epoch": 0.05744042922518542, + "grad_norm": 0.5727999806404114, + "learning_rate": 4.99759608553824e-06, + "loss": 0.6755, + "step": 364 + }, + { + "epoch": 0.05759823260217769, + "grad_norm": 0.6282404065132141, + "learning_rate": 4.997577842520493e-06, + "loss": 0.6549, + "step": 365 + }, + { + "epoch": 0.05775603597916996, + "grad_norm": 0.6303530931472778, + "learning_rate": 4.997559530575682e-06, + "loss": 0.6461, + "step": 366 + }, + { + "epoch": 0.05791383935616222, + "grad_norm": 0.5858336091041565, + "learning_rate": 4.9975411497043105e-06, + "loss": 0.6837, + "step": 367 + }, + { + "epoch": 0.05807164273315449, + "grad_norm": 0.5967845916748047, + "learning_rate": 4.997522699906886e-06, + "loss": 0.6654, + "step": 368 + }, + { + "epoch": 0.05822944611014676, + "grad_norm": 0.6188459396362305, + "learning_rate": 4.9975041811839195e-06, + "loss": 0.6758, + "step": 369 + }, + { + "epoch": 0.058387249487139026, + "grad_norm": 0.6252828240394592, + "learning_rate": 4.997485593535919e-06, + "loss": 0.672, + "step": 370 + }, + { + "epoch": 0.058545052864131295, + "grad_norm": 0.5647075772285461, + "learning_rate": 4.9974669369633995e-06, + "loss": 0.6729, + "step": 371 + }, + { + "epoch": 0.05870285624112356, + "grad_norm": 0.617946445941925, + "learning_rate": 4.997448211466876e-06, + "loss": 0.6731, + "step": 372 + }, + { + "epoch": 0.058860659618115825, + "grad_norm": 0.6188806891441345, + "learning_rate": 4.997429417046864e-06, + "loss": 0.639, + "step": 373 + }, + { + "epoch": 0.059018462995108094, + "grad_norm": 0.6063072681427002, + "learning_rate": 4.997410553703883e-06, + "loss": 0.6377, + "step": 374 + }, + { + "epoch": 0.05917626637210036, + "grad_norm": 0.5826771855354309, + "learning_rate": 4.997391621438454e-06, + "loss": 0.646, + "step": 375 + }, + { + "epoch": 0.05933406974909263, + "grad_norm": 0.594504177570343, + "learning_rate": 4.997372620251099e-06, + "loss": 0.6878, + "step": 376 + }, + { + "epoch": 0.0594918731260849, + "grad_norm": 0.5958885550498962, + "learning_rate": 4.9973535501423406e-06, + "loss": 0.63, + "step": 377 + }, + { + "epoch": 0.05964967650307717, + "grad_norm": 0.6117749810218811, + "learning_rate": 4.997334411112709e-06, + "loss": 0.6299, + "step": 378 + }, + { + "epoch": 0.05980747988006943, + "grad_norm": 0.5918693542480469, + "learning_rate": 4.997315203162729e-06, + "loss": 0.6556, + "step": 379 + }, + { + "epoch": 0.0599652832570617, + "grad_norm": 0.5816782116889954, + "learning_rate": 4.997295926292932e-06, + "loss": 0.644, + "step": 380 + }, + { + "epoch": 0.06012308663405397, + "grad_norm": 0.5884042978286743, + "learning_rate": 4.9972765805038505e-06, + "loss": 0.6607, + "step": 381 + }, + { + "epoch": 0.06028089001104624, + "grad_norm": 0.5957321524620056, + "learning_rate": 4.997257165796017e-06, + "loss": 0.6764, + "step": 382 + }, + { + "epoch": 0.060438693388038506, + "grad_norm": 0.5993992686271667, + "learning_rate": 4.9972376821699685e-06, + "loss": 0.6587, + "step": 383 + }, + { + "epoch": 0.060596496765030775, + "grad_norm": 0.5903099179267883, + "learning_rate": 4.997218129626242e-06, + "loss": 0.6846, + "step": 384 + }, + { + "epoch": 0.060754300142023036, + "grad_norm": 0.6106293797492981, + "learning_rate": 4.997198508165378e-06, + "loss": 0.6522, + "step": 385 + }, + { + "epoch": 0.060912103519015305, + "grad_norm": 0.5818246006965637, + "learning_rate": 4.997178817787918e-06, + "loss": 0.7107, + "step": 386 + }, + { + "epoch": 0.061069906896007574, + "grad_norm": 0.5836179852485657, + "learning_rate": 4.997159058494403e-06, + "loss": 0.6737, + "step": 387 + }, + { + "epoch": 0.06122771027299984, + "grad_norm": 0.5796902775764465, + "learning_rate": 4.997139230285382e-06, + "loss": 0.6348, + "step": 388 + }, + { + "epoch": 0.06138551364999211, + "grad_norm": 0.6561937928199768, + "learning_rate": 4.9971193331613984e-06, + "loss": 0.6994, + "step": 389 + }, + { + "epoch": 0.06154331702698438, + "grad_norm": 0.5921893119812012, + "learning_rate": 4.997099367123004e-06, + "loss": 0.6274, + "step": 390 + }, + { + "epoch": 0.06170112040397664, + "grad_norm": 0.6568378210067749, + "learning_rate": 4.99707933217075e-06, + "loss": 0.6946, + "step": 391 + }, + { + "epoch": 0.06185892378096891, + "grad_norm": 0.5814316272735596, + "learning_rate": 4.9970592283051876e-06, + "loss": 0.656, + "step": 392 + }, + { + "epoch": 0.06201672715796118, + "grad_norm": 0.5647265911102295, + "learning_rate": 4.997039055526872e-06, + "loss": 0.6494, + "step": 393 + }, + { + "epoch": 0.06217453053495345, + "grad_norm": 0.5883191823959351, + "learning_rate": 4.9970188138363605e-06, + "loss": 0.6327, + "step": 394 + }, + { + "epoch": 0.06233233391194572, + "grad_norm": 0.626193642616272, + "learning_rate": 4.996998503234212e-06, + "loss": 0.6692, + "step": 395 + }, + { + "epoch": 0.062490137288937986, + "grad_norm": 0.675872802734375, + "learning_rate": 4.996978123720987e-06, + "loss": 0.6957, + "step": 396 + }, + { + "epoch": 0.06264794066593025, + "grad_norm": 0.6106968522071838, + "learning_rate": 4.996957675297246e-06, + "loss": 0.6683, + "step": 397 + }, + { + "epoch": 0.06280574404292252, + "grad_norm": 0.6164631247520447, + "learning_rate": 4.996937157963556e-06, + "loss": 0.6014, + "step": 398 + }, + { + "epoch": 0.06296354741991479, + "grad_norm": 0.5857820510864258, + "learning_rate": 4.996916571720482e-06, + "loss": 0.6627, + "step": 399 + }, + { + "epoch": 0.06312135079690705, + "grad_norm": 0.892193615436554, + "learning_rate": 4.996895916568592e-06, + "loss": 0.6673, + "step": 400 + }, + { + "epoch": 0.06327915417389932, + "grad_norm": 0.6220782399177551, + "learning_rate": 4.9968751925084554e-06, + "loss": 0.6366, + "step": 401 + }, + { + "epoch": 0.06343695755089158, + "grad_norm": 0.6324949860572815, + "learning_rate": 4.996854399540646e-06, + "loss": 0.697, + "step": 402 + }, + { + "epoch": 0.06359476092788385, + "grad_norm": 0.620777428150177, + "learning_rate": 4.996833537665735e-06, + "loss": 0.701, + "step": 403 + }, + { + "epoch": 0.06375256430487612, + "grad_norm": 0.6128180027008057, + "learning_rate": 4.996812606884301e-06, + "loss": 0.6463, + "step": 404 + }, + { + "epoch": 0.06391036768186839, + "grad_norm": 0.6063960790634155, + "learning_rate": 4.996791607196921e-06, + "loss": 0.6234, + "step": 405 + }, + { + "epoch": 0.06406817105886066, + "grad_norm": 0.6094223260879517, + "learning_rate": 4.996770538604172e-06, + "loss": 0.6839, + "step": 406 + }, + { + "epoch": 0.06422597443585293, + "grad_norm": 0.5966141819953918, + "learning_rate": 4.996749401106639e-06, + "loss": 0.7023, + "step": 407 + }, + { + "epoch": 0.0643837778128452, + "grad_norm": 0.5935360789299011, + "learning_rate": 4.996728194704903e-06, + "loss": 0.6402, + "step": 408 + }, + { + "epoch": 0.06454158118983747, + "grad_norm": 0.6105814576148987, + "learning_rate": 4.99670691939955e-06, + "loss": 0.6262, + "step": 409 + }, + { + "epoch": 0.06469938456682973, + "grad_norm": 0.5639218091964722, + "learning_rate": 4.996685575191167e-06, + "loss": 0.6613, + "step": 410 + }, + { + "epoch": 0.064857187943822, + "grad_norm": 0.614898145198822, + "learning_rate": 4.996664162080342e-06, + "loss": 0.6992, + "step": 411 + }, + { + "epoch": 0.06501499132081427, + "grad_norm": 0.6033980846405029, + "learning_rate": 4.996642680067669e-06, + "loss": 0.6391, + "step": 412 + }, + { + "epoch": 0.06517279469780653, + "grad_norm": 0.6357479095458984, + "learning_rate": 4.996621129153738e-06, + "loss": 0.6696, + "step": 413 + }, + { + "epoch": 0.0653305980747988, + "grad_norm": 0.613133430480957, + "learning_rate": 4.996599509339144e-06, + "loss": 0.6933, + "step": 414 + }, + { + "epoch": 0.06548840145179106, + "grad_norm": 0.5979904532432556, + "learning_rate": 4.996577820624485e-06, + "loss": 0.6507, + "step": 415 + }, + { + "epoch": 0.06564620482878333, + "grad_norm": 0.5899348258972168, + "learning_rate": 4.996556063010359e-06, + "loss": 0.6633, + "step": 416 + }, + { + "epoch": 0.0658040082057756, + "grad_norm": 0.5964170098304749, + "learning_rate": 4.996534236497366e-06, + "loss": 0.6777, + "step": 417 + }, + { + "epoch": 0.06596181158276787, + "grad_norm": 0.5668293833732605, + "learning_rate": 4.9965123410861095e-06, + "loss": 0.6533, + "step": 418 + }, + { + "epoch": 0.06611961495976014, + "grad_norm": 0.5599425435066223, + "learning_rate": 4.996490376777192e-06, + "loss": 0.6675, + "step": 419 + }, + { + "epoch": 0.06627741833675241, + "grad_norm": 0.5843794345855713, + "learning_rate": 4.996468343571221e-06, + "loss": 0.6884, + "step": 420 + }, + { + "epoch": 0.06643522171374468, + "grad_norm": 0.6075505614280701, + "learning_rate": 4.996446241468804e-06, + "loss": 0.6352, + "step": 421 + }, + { + "epoch": 0.06659302509073695, + "grad_norm": 0.6102336049079895, + "learning_rate": 4.996424070470551e-06, + "loss": 0.6781, + "step": 422 + }, + { + "epoch": 0.06675082846772921, + "grad_norm": 0.5913850665092468, + "learning_rate": 4.9964018305770744e-06, + "loss": 0.6716, + "step": 423 + }, + { + "epoch": 0.06690863184472148, + "grad_norm": 0.6131360530853271, + "learning_rate": 4.996379521788987e-06, + "loss": 0.6787, + "step": 424 + }, + { + "epoch": 0.06706643522171374, + "grad_norm": 0.6347812414169312, + "learning_rate": 4.996357144106906e-06, + "loss": 0.6696, + "step": 425 + }, + { + "epoch": 0.06722423859870601, + "grad_norm": 0.6270948052406311, + "learning_rate": 4.996334697531448e-06, + "loss": 0.6501, + "step": 426 + }, + { + "epoch": 0.06738204197569828, + "grad_norm": 0.6059204936027527, + "learning_rate": 4.996312182063231e-06, + "loss": 0.6503, + "step": 427 + }, + { + "epoch": 0.06753984535269054, + "grad_norm": 0.5858277678489685, + "learning_rate": 4.996289597702879e-06, + "loss": 0.6309, + "step": 428 + }, + { + "epoch": 0.06769764872968281, + "grad_norm": 0.5978555083274841, + "learning_rate": 4.996266944451014e-06, + "loss": 0.6427, + "step": 429 + }, + { + "epoch": 0.06785545210667508, + "grad_norm": 0.6304132342338562, + "learning_rate": 4.996244222308261e-06, + "loss": 0.6557, + "step": 430 + }, + { + "epoch": 0.06801325548366735, + "grad_norm": 0.6685929298400879, + "learning_rate": 4.996221431275247e-06, + "loss": 0.7285, + "step": 431 + }, + { + "epoch": 0.06817105886065962, + "grad_norm": 0.6048439145088196, + "learning_rate": 4.996198571352602e-06, + "loss": 0.6583, + "step": 432 + }, + { + "epoch": 0.06832886223765189, + "grad_norm": 0.5936669111251831, + "learning_rate": 4.996175642540956e-06, + "loss": 0.6894, + "step": 433 + }, + { + "epoch": 0.06848666561464416, + "grad_norm": 0.63093101978302, + "learning_rate": 4.996152644840942e-06, + "loss": 0.6614, + "step": 434 + }, + { + "epoch": 0.06864446899163643, + "grad_norm": 0.5911127328872681, + "learning_rate": 4.996129578253195e-06, + "loss": 0.6843, + "step": 435 + }, + { + "epoch": 0.0688022723686287, + "grad_norm": 0.5780293345451355, + "learning_rate": 4.996106442778351e-06, + "loss": 0.6522, + "step": 436 + }, + { + "epoch": 0.06896007574562095, + "grad_norm": 0.6081932187080383, + "learning_rate": 4.996083238417048e-06, + "loss": 0.6413, + "step": 437 + }, + { + "epoch": 0.06911787912261322, + "grad_norm": 0.5765565037727356, + "learning_rate": 4.996059965169929e-06, + "loss": 0.6803, + "step": 438 + }, + { + "epoch": 0.06927568249960549, + "grad_norm": 0.616504967212677, + "learning_rate": 4.996036623037633e-06, + "loss": 0.6637, + "step": 439 + }, + { + "epoch": 0.06943348587659776, + "grad_norm": 0.6159474849700928, + "learning_rate": 4.996013212020806e-06, + "loss": 0.6661, + "step": 440 + }, + { + "epoch": 0.06959128925359002, + "grad_norm": 0.6100020408630371, + "learning_rate": 4.995989732120094e-06, + "loss": 0.6522, + "step": 441 + }, + { + "epoch": 0.0697490926305823, + "grad_norm": 0.5881067514419556, + "learning_rate": 4.995966183336144e-06, + "loss": 0.673, + "step": 442 + }, + { + "epoch": 0.06990689600757456, + "grad_norm": 0.6119846105575562, + "learning_rate": 4.995942565669607e-06, + "loss": 0.6254, + "step": 443 + }, + { + "epoch": 0.07006469938456683, + "grad_norm": 0.5941219925880432, + "learning_rate": 4.9959188791211345e-06, + "loss": 0.6601, + "step": 444 + }, + { + "epoch": 0.0702225027615591, + "grad_norm": 0.6651306748390198, + "learning_rate": 4.99589512369138e-06, + "loss": 0.664, + "step": 445 + }, + { + "epoch": 0.07038030613855137, + "grad_norm": 0.6148424744606018, + "learning_rate": 4.995871299380998e-06, + "loss": 0.6655, + "step": 446 + }, + { + "epoch": 0.07053810951554364, + "grad_norm": 0.5521381497383118, + "learning_rate": 4.995847406190648e-06, + "loss": 0.6368, + "step": 447 + }, + { + "epoch": 0.0706959128925359, + "grad_norm": 0.6173676252365112, + "learning_rate": 4.995823444120989e-06, + "loss": 0.6829, + "step": 448 + }, + { + "epoch": 0.07085371626952817, + "grad_norm": 0.5936547517776489, + "learning_rate": 4.995799413172681e-06, + "loss": 0.6738, + "step": 449 + }, + { + "epoch": 0.07101151964652043, + "grad_norm": 0.5979297757148743, + "learning_rate": 4.995775313346388e-06, + "loss": 0.6618, + "step": 450 + }, + { + "epoch": 0.0711693230235127, + "grad_norm": 0.605919361114502, + "learning_rate": 4.995751144642776e-06, + "loss": 0.6762, + "step": 451 + }, + { + "epoch": 0.07132712640050497, + "grad_norm": 0.6205594539642334, + "learning_rate": 4.995726907062511e-06, + "loss": 0.6749, + "step": 452 + }, + { + "epoch": 0.07148492977749724, + "grad_norm": 0.5889684557914734, + "learning_rate": 4.995702600606262e-06, + "loss": 0.6737, + "step": 453 + }, + { + "epoch": 0.0716427331544895, + "grad_norm": 0.5857123136520386, + "learning_rate": 4.995678225274699e-06, + "loss": 0.6272, + "step": 454 + }, + { + "epoch": 0.07180053653148177, + "grad_norm": 0.6458742022514343, + "learning_rate": 4.9956537810684956e-06, + "loss": 0.6553, + "step": 455 + }, + { + "epoch": 0.07195833990847404, + "grad_norm": 0.6379228234291077, + "learning_rate": 4.995629267988327e-06, + "loss": 0.638, + "step": 456 + }, + { + "epoch": 0.07211614328546631, + "grad_norm": 0.6206799149513245, + "learning_rate": 4.9956046860348685e-06, + "loss": 0.6464, + "step": 457 + }, + { + "epoch": 0.07227394666245858, + "grad_norm": 0.6076207756996155, + "learning_rate": 4.995580035208799e-06, + "loss": 0.6674, + "step": 458 + }, + { + "epoch": 0.07243175003945085, + "grad_norm": 0.6347712874412537, + "learning_rate": 4.9955553155108e-06, + "loss": 0.6713, + "step": 459 + }, + { + "epoch": 0.07258955341644312, + "grad_norm": 0.64301997423172, + "learning_rate": 4.995530526941551e-06, + "loss": 0.6305, + "step": 460 + }, + { + "epoch": 0.07274735679343539, + "grad_norm": 0.6340868473052979, + "learning_rate": 4.995505669501738e-06, + "loss": 0.6611, + "step": 461 + }, + { + "epoch": 0.07290516017042764, + "grad_norm": 0.6075769662857056, + "learning_rate": 4.995480743192047e-06, + "loss": 0.6712, + "step": 462 + }, + { + "epoch": 0.07306296354741991, + "grad_norm": 0.6020073294639587, + "learning_rate": 4.995455748013165e-06, + "loss": 0.67, + "step": 463 + }, + { + "epoch": 0.07322076692441218, + "grad_norm": 0.6439855098724365, + "learning_rate": 4.995430683965783e-06, + "loss": 0.6362, + "step": 464 + }, + { + "epoch": 0.07337857030140445, + "grad_norm": 0.5995173454284668, + "learning_rate": 4.9954055510505916e-06, + "loss": 0.6255, + "step": 465 + }, + { + "epoch": 0.07353637367839672, + "grad_norm": 0.6235276460647583, + "learning_rate": 4.995380349268285e-06, + "loss": 0.6611, + "step": 466 + }, + { + "epoch": 0.07369417705538898, + "grad_norm": 0.598096489906311, + "learning_rate": 4.995355078619558e-06, + "loss": 0.6209, + "step": 467 + }, + { + "epoch": 0.07385198043238125, + "grad_norm": 0.6308861374855042, + "learning_rate": 4.995329739105109e-06, + "loss": 0.7089, + "step": 468 + }, + { + "epoch": 0.07400978380937352, + "grad_norm": 0.6625574827194214, + "learning_rate": 4.9953043307256375e-06, + "loss": 0.6712, + "step": 469 + }, + { + "epoch": 0.07416758718636579, + "grad_norm": 0.5790125131607056, + "learning_rate": 4.995278853481844e-06, + "loss": 0.651, + "step": 470 + }, + { + "epoch": 0.07432539056335806, + "grad_norm": 0.5990249514579773, + "learning_rate": 4.9952533073744305e-06, + "loss": 0.6748, + "step": 471 + }, + { + "epoch": 0.07448319394035033, + "grad_norm": 0.6266292929649353, + "learning_rate": 4.995227692404104e-06, + "loss": 0.6626, + "step": 472 + }, + { + "epoch": 0.0746409973173426, + "grad_norm": 0.6056614518165588, + "learning_rate": 4.995202008571571e-06, + "loss": 0.6854, + "step": 473 + }, + { + "epoch": 0.07479880069433485, + "grad_norm": 0.6586396098136902, + "learning_rate": 4.99517625587754e-06, + "loss": 0.6735, + "step": 474 + }, + { + "epoch": 0.07495660407132712, + "grad_norm": 0.598272442817688, + "learning_rate": 4.995150434322721e-06, + "loss": 0.6511, + "step": 475 + }, + { + "epoch": 0.07511440744831939, + "grad_norm": 0.6018388867378235, + "learning_rate": 4.995124543907827e-06, + "loss": 0.6472, + "step": 476 + }, + { + "epoch": 0.07527221082531166, + "grad_norm": 0.6016151905059814, + "learning_rate": 4.995098584633573e-06, + "loss": 0.6535, + "step": 477 + }, + { + "epoch": 0.07543001420230393, + "grad_norm": 0.5974687337875366, + "learning_rate": 4.995072556500675e-06, + "loss": 0.6279, + "step": 478 + }, + { + "epoch": 0.0755878175792962, + "grad_norm": 0.5771424770355225, + "learning_rate": 4.9950464595098525e-06, + "loss": 0.6128, + "step": 479 + }, + { + "epoch": 0.07574562095628846, + "grad_norm": 0.5990319848060608, + "learning_rate": 4.995020293661824e-06, + "loss": 0.6631, + "step": 480 + }, + { + "epoch": 0.07590342433328073, + "grad_norm": 0.6154145002365112, + "learning_rate": 4.994994058957312e-06, + "loss": 0.674, + "step": 481 + }, + { + "epoch": 0.076061227710273, + "grad_norm": 0.5880782604217529, + "learning_rate": 4.994967755397041e-06, + "loss": 0.6806, + "step": 482 + }, + { + "epoch": 0.07621903108726527, + "grad_norm": 0.6260122656822205, + "learning_rate": 4.994941382981737e-06, + "loss": 0.6564, + "step": 483 + }, + { + "epoch": 0.07637683446425754, + "grad_norm": 0.614010214805603, + "learning_rate": 4.994914941712128e-06, + "loss": 0.6898, + "step": 484 + }, + { + "epoch": 0.07653463784124981, + "grad_norm": 0.6106681227684021, + "learning_rate": 4.994888431588942e-06, + "loss": 0.6649, + "step": 485 + }, + { + "epoch": 0.07669244121824208, + "grad_norm": 0.5772574543952942, + "learning_rate": 4.994861852612913e-06, + "loss": 0.6516, + "step": 486 + }, + { + "epoch": 0.07685024459523433, + "grad_norm": 0.623252272605896, + "learning_rate": 4.994835204784774e-06, + "loss": 0.6649, + "step": 487 + }, + { + "epoch": 0.0770080479722266, + "grad_norm": 0.6045888066291809, + "learning_rate": 4.9948084881052586e-06, + "loss": 0.6977, + "step": 488 + }, + { + "epoch": 0.07716585134921887, + "grad_norm": 0.5967352986335754, + "learning_rate": 4.994781702575106e-06, + "loss": 0.6522, + "step": 489 + }, + { + "epoch": 0.07732365472621114, + "grad_norm": 0.5931550860404968, + "learning_rate": 4.994754848195054e-06, + "loss": 0.6556, + "step": 490 + }, + { + "epoch": 0.07748145810320341, + "grad_norm": 0.6138692498207092, + "learning_rate": 4.9947279249658444e-06, + "loss": 0.6383, + "step": 491 + }, + { + "epoch": 0.07763926148019568, + "grad_norm": 0.5887081027030945, + "learning_rate": 4.994700932888221e-06, + "loss": 0.6373, + "step": 492 + }, + { + "epoch": 0.07779706485718794, + "grad_norm": 0.6237157583236694, + "learning_rate": 4.994673871962928e-06, + "loss": 0.6742, + "step": 493 + }, + { + "epoch": 0.07795486823418021, + "grad_norm": 0.592624306678772, + "learning_rate": 4.994646742190712e-06, + "loss": 0.6158, + "step": 494 + }, + { + "epoch": 0.07811267161117248, + "grad_norm": 0.640743613243103, + "learning_rate": 4.994619543572321e-06, + "loss": 0.616, + "step": 495 + }, + { + "epoch": 0.07827047498816475, + "grad_norm": 0.5915449261665344, + "learning_rate": 4.994592276108508e-06, + "loss": 0.6567, + "step": 496 + }, + { + "epoch": 0.07842827836515702, + "grad_norm": 0.6550456881523132, + "learning_rate": 4.994564939800023e-06, + "loss": 0.6255, + "step": 497 + }, + { + "epoch": 0.07858608174214929, + "grad_norm": 0.6263878345489502, + "learning_rate": 4.994537534647622e-06, + "loss": 0.6873, + "step": 498 + }, + { + "epoch": 0.07874388511914154, + "grad_norm": 0.5912036299705505, + "learning_rate": 4.994510060652061e-06, + "loss": 0.6312, + "step": 499 + }, + { + "epoch": 0.07890168849613381, + "grad_norm": 0.6086687445640564, + "learning_rate": 4.994482517814098e-06, + "loss": 0.6835, + "step": 500 + }, + { + "epoch": 0.07905949187312608, + "grad_norm": 0.5643777847290039, + "learning_rate": 4.994454906134493e-06, + "loss": 0.6533, + "step": 501 + }, + { + "epoch": 0.07921729525011835, + "grad_norm": 0.6123368740081787, + "learning_rate": 4.994427225614008e-06, + "loss": 0.5978, + "step": 502 + }, + { + "epoch": 0.07937509862711062, + "grad_norm": 0.5865204930305481, + "learning_rate": 4.994399476253407e-06, + "loss": 0.633, + "step": 503 + }, + { + "epoch": 0.07953290200410289, + "grad_norm": 0.6190997362136841, + "learning_rate": 4.9943716580534566e-06, + "loss": 0.6844, + "step": 504 + }, + { + "epoch": 0.07969070538109516, + "grad_norm": 0.6117483973503113, + "learning_rate": 4.9943437710149224e-06, + "loss": 0.6726, + "step": 505 + }, + { + "epoch": 0.07984850875808742, + "grad_norm": 0.6502527594566345, + "learning_rate": 4.994315815138576e-06, + "loss": 0.6658, + "step": 506 + }, + { + "epoch": 0.0800063121350797, + "grad_norm": 0.6002123951911926, + "learning_rate": 4.994287790425188e-06, + "loss": 0.6673, + "step": 507 + }, + { + "epoch": 0.08016411551207196, + "grad_norm": 0.6003100275993347, + "learning_rate": 4.994259696875533e-06, + "loss": 0.6778, + "step": 508 + }, + { + "epoch": 0.08032191888906423, + "grad_norm": 0.6198926568031311, + "learning_rate": 4.994231534490385e-06, + "loss": 0.6883, + "step": 509 + }, + { + "epoch": 0.0804797222660565, + "grad_norm": 0.5803676843643188, + "learning_rate": 4.994203303270521e-06, + "loss": 0.6787, + "step": 510 + }, + { + "epoch": 0.08063752564304875, + "grad_norm": 0.5819549560546875, + "learning_rate": 4.994175003216722e-06, + "loss": 0.6698, + "step": 511 + }, + { + "epoch": 0.08079532902004102, + "grad_norm": 0.6756862998008728, + "learning_rate": 4.9941466343297674e-06, + "loss": 0.6636, + "step": 512 + }, + { + "epoch": 0.08095313239703329, + "grad_norm": 0.6260047554969788, + "learning_rate": 4.99411819661044e-06, + "loss": 0.6704, + "step": 513 + }, + { + "epoch": 0.08111093577402556, + "grad_norm": 0.6111018061637878, + "learning_rate": 4.994089690059526e-06, + "loss": 0.6311, + "step": 514 + }, + { + "epoch": 0.08126873915101783, + "grad_norm": 0.606752336025238, + "learning_rate": 4.99406111467781e-06, + "loss": 0.6544, + "step": 515 + }, + { + "epoch": 0.0814265425280101, + "grad_norm": 0.6125901341438293, + "learning_rate": 4.994032470466083e-06, + "loss": 0.6338, + "step": 516 + }, + { + "epoch": 0.08158434590500237, + "grad_norm": 0.6132068634033203, + "learning_rate": 4.994003757425133e-06, + "loss": 0.6568, + "step": 517 + }, + { + "epoch": 0.08174214928199464, + "grad_norm": 0.5951746702194214, + "learning_rate": 4.993974975555755e-06, + "loss": 0.6806, + "step": 518 + }, + { + "epoch": 0.0818999526589869, + "grad_norm": 0.6201399564743042, + "learning_rate": 4.993946124858741e-06, + "loss": 0.6543, + "step": 519 + }, + { + "epoch": 0.08205775603597917, + "grad_norm": 0.6069450378417969, + "learning_rate": 4.993917205334889e-06, + "loss": 0.6732, + "step": 520 + }, + { + "epoch": 0.08221555941297144, + "grad_norm": 0.58624267578125, + "learning_rate": 4.993888216984996e-06, + "loss": 0.6707, + "step": 521 + }, + { + "epoch": 0.08237336278996371, + "grad_norm": 0.6508557200431824, + "learning_rate": 4.993859159809863e-06, + "loss": 0.6364, + "step": 522 + }, + { + "epoch": 0.08253116616695597, + "grad_norm": 0.6103181838989258, + "learning_rate": 4.993830033810291e-06, + "loss": 0.678, + "step": 523 + }, + { + "epoch": 0.08268896954394823, + "grad_norm": 0.6175912618637085, + "learning_rate": 4.993800838987083e-06, + "loss": 0.6455, + "step": 524 + }, + { + "epoch": 0.0828467729209405, + "grad_norm": 0.6240972876548767, + "learning_rate": 4.993771575341048e-06, + "loss": 0.6426, + "step": 525 + }, + { + "epoch": 0.08300457629793277, + "grad_norm": 0.5609580278396606, + "learning_rate": 4.99374224287299e-06, + "loss": 0.638, + "step": 526 + }, + { + "epoch": 0.08316237967492504, + "grad_norm": 0.657199501991272, + "learning_rate": 4.99371284158372e-06, + "loss": 0.6843, + "step": 527 + }, + { + "epoch": 0.08332018305191731, + "grad_norm": 0.6521002054214478, + "learning_rate": 4.993683371474049e-06, + "loss": 0.6767, + "step": 528 + }, + { + "epoch": 0.08347798642890958, + "grad_norm": 0.5803499817848206, + "learning_rate": 4.99365383254479e-06, + "loss": 0.6642, + "step": 529 + }, + { + "epoch": 0.08363578980590185, + "grad_norm": 0.5957757830619812, + "learning_rate": 4.99362422479676e-06, + "loss": 0.6726, + "step": 530 + }, + { + "epoch": 0.08379359318289412, + "grad_norm": 0.6229822039604187, + "learning_rate": 4.993594548230775e-06, + "loss": 0.6538, + "step": 531 + }, + { + "epoch": 0.08395139655988638, + "grad_norm": 0.5826054215431213, + "learning_rate": 4.993564802847654e-06, + "loss": 0.6061, + "step": 532 + }, + { + "epoch": 0.08410919993687865, + "grad_norm": 0.5999711751937866, + "learning_rate": 4.993534988648217e-06, + "loss": 0.6333, + "step": 533 + }, + { + "epoch": 0.08426700331387092, + "grad_norm": 0.6202274560928345, + "learning_rate": 4.993505105633288e-06, + "loss": 0.6342, + "step": 534 + }, + { + "epoch": 0.08442480669086319, + "grad_norm": 0.631309986114502, + "learning_rate": 4.993475153803692e-06, + "loss": 0.6764, + "step": 535 + }, + { + "epoch": 0.08458261006785545, + "grad_norm": 0.6453179717063904, + "learning_rate": 4.993445133160254e-06, + "loss": 0.6947, + "step": 536 + }, + { + "epoch": 0.08474041344484771, + "grad_norm": 0.6496890783309937, + "learning_rate": 4.993415043703804e-06, + "loss": 0.6588, + "step": 537 + }, + { + "epoch": 0.08489821682183998, + "grad_norm": 0.6241375207901001, + "learning_rate": 4.99338488543517e-06, + "loss": 0.635, + "step": 538 + }, + { + "epoch": 0.08505602019883225, + "grad_norm": 0.6059864163398743, + "learning_rate": 4.993354658355188e-06, + "loss": 0.6465, + "step": 539 + }, + { + "epoch": 0.08521382357582452, + "grad_norm": 0.5841369032859802, + "learning_rate": 4.993324362464691e-06, + "loss": 0.6184, + "step": 540 + }, + { + "epoch": 0.08537162695281679, + "grad_norm": 0.6138466596603394, + "learning_rate": 4.993293997764513e-06, + "loss": 0.6297, + "step": 541 + }, + { + "epoch": 0.08552943032980906, + "grad_norm": 0.6016539931297302, + "learning_rate": 4.993263564255494e-06, + "loss": 0.6384, + "step": 542 + }, + { + "epoch": 0.08568723370680133, + "grad_norm": 0.6354431509971619, + "learning_rate": 4.993233061938473e-06, + "loss": 0.5942, + "step": 543 + }, + { + "epoch": 0.0858450370837936, + "grad_norm": 0.6518524289131165, + "learning_rate": 4.993202490814292e-06, + "loss": 0.659, + "step": 544 + }, + { + "epoch": 0.08600284046078586, + "grad_norm": 0.5801978707313538, + "learning_rate": 4.993171850883794e-06, + "loss": 0.6077, + "step": 545 + }, + { + "epoch": 0.08616064383777813, + "grad_norm": 0.6058133840560913, + "learning_rate": 4.993141142147827e-06, + "loss": 0.6457, + "step": 546 + }, + { + "epoch": 0.0863184472147704, + "grad_norm": 0.593062698841095, + "learning_rate": 4.993110364607236e-06, + "loss": 0.6462, + "step": 547 + }, + { + "epoch": 0.08647625059176266, + "grad_norm": 0.5917820334434509, + "learning_rate": 4.993079518262871e-06, + "loss": 0.6299, + "step": 548 + }, + { + "epoch": 0.08663405396875493, + "grad_norm": 0.5956170558929443, + "learning_rate": 4.993048603115583e-06, + "loss": 0.6552, + "step": 549 + }, + { + "epoch": 0.0867918573457472, + "grad_norm": 0.6386120915412903, + "learning_rate": 4.993017619166226e-06, + "loss": 0.6858, + "step": 550 + }, + { + "epoch": 0.08694966072273946, + "grad_norm": 0.6389192342758179, + "learning_rate": 4.992986566415655e-06, + "loss": 0.6578, + "step": 551 + }, + { + "epoch": 0.08710746409973173, + "grad_norm": 0.5814597606658936, + "learning_rate": 4.9929554448647275e-06, + "loss": 0.6231, + "step": 552 + }, + { + "epoch": 0.087265267476724, + "grad_norm": 0.6372679471969604, + "learning_rate": 4.992924254514301e-06, + "loss": 0.6561, + "step": 553 + }, + { + "epoch": 0.08742307085371627, + "grad_norm": 0.6064439415931702, + "learning_rate": 4.992892995365236e-06, + "loss": 0.657, + "step": 554 + }, + { + "epoch": 0.08758087423070854, + "grad_norm": 0.6159340739250183, + "learning_rate": 4.992861667418397e-06, + "loss": 0.6557, + "step": 555 + }, + { + "epoch": 0.08773867760770081, + "grad_norm": 0.5878521203994751, + "learning_rate": 4.992830270674648e-06, + "loss": 0.6473, + "step": 556 + }, + { + "epoch": 0.08789648098469308, + "grad_norm": 0.6020849943161011, + "learning_rate": 4.992798805134854e-06, + "loss": 0.6658, + "step": 557 + }, + { + "epoch": 0.08805428436168534, + "grad_norm": 0.567807674407959, + "learning_rate": 4.992767270799885e-06, + "loss": 0.6576, + "step": 558 + }, + { + "epoch": 0.08821208773867761, + "grad_norm": 0.6479179263114929, + "learning_rate": 4.992735667670612e-06, + "loss": 0.6684, + "step": 559 + }, + { + "epoch": 0.08836989111566987, + "grad_norm": 0.5897206664085388, + "learning_rate": 4.992703995747904e-06, + "loss": 0.6474, + "step": 560 + }, + { + "epoch": 0.08852769449266214, + "grad_norm": 0.6179800033569336, + "learning_rate": 4.992672255032638e-06, + "loss": 0.6464, + "step": 561 + }, + { + "epoch": 0.0886854978696544, + "grad_norm": 0.5977960228919983, + "learning_rate": 4.9926404455256885e-06, + "loss": 0.6234, + "step": 562 + }, + { + "epoch": 0.08884330124664667, + "grad_norm": 0.600533127784729, + "learning_rate": 4.9926085672279346e-06, + "loss": 0.6538, + "step": 563 + }, + { + "epoch": 0.08900110462363894, + "grad_norm": 0.5709935426712036, + "learning_rate": 4.992576620140255e-06, + "loss": 0.6482, + "step": 564 + }, + { + "epoch": 0.08915890800063121, + "grad_norm": 0.5927051901817322, + "learning_rate": 4.992544604263531e-06, + "loss": 0.658, + "step": 565 + }, + { + "epoch": 0.08931671137762348, + "grad_norm": 0.6025221943855286, + "learning_rate": 4.9925125195986476e-06, + "loss": 0.6558, + "step": 566 + }, + { + "epoch": 0.08947451475461575, + "grad_norm": 0.5881064534187317, + "learning_rate": 4.9924803661464895e-06, + "loss": 0.659, + "step": 567 + }, + { + "epoch": 0.08963231813160802, + "grad_norm": 0.6026913523674011, + "learning_rate": 4.992448143907943e-06, + "loss": 0.6458, + "step": 568 + }, + { + "epoch": 0.08979012150860029, + "grad_norm": 0.6516624093055725, + "learning_rate": 4.992415852883899e-06, + "loss": 0.6537, + "step": 569 + }, + { + "epoch": 0.08994792488559256, + "grad_norm": 0.6379014253616333, + "learning_rate": 4.992383493075249e-06, + "loss": 0.578, + "step": 570 + }, + { + "epoch": 0.09010572826258482, + "grad_norm": 0.5896294116973877, + "learning_rate": 4.9923510644828835e-06, + "loss": 0.6759, + "step": 571 + }, + { + "epoch": 0.0902635316395771, + "grad_norm": 0.597492516040802, + "learning_rate": 4.9923185671077e-06, + "loss": 0.6856, + "step": 572 + }, + { + "epoch": 0.09042133501656935, + "grad_norm": 0.7058597207069397, + "learning_rate": 4.992286000950594e-06, + "loss": 0.6598, + "step": 573 + }, + { + "epoch": 0.09057913839356162, + "grad_norm": 0.5779749155044556, + "learning_rate": 4.992253366012465e-06, + "loss": 0.6634, + "step": 574 + }, + { + "epoch": 0.09073694177055389, + "grad_norm": 0.5696356296539307, + "learning_rate": 4.992220662294213e-06, + "loss": 0.6514, + "step": 575 + }, + { + "epoch": 0.09089474514754615, + "grad_norm": 0.5956026315689087, + "learning_rate": 4.992187889796741e-06, + "loss": 0.6575, + "step": 576 + }, + { + "epoch": 0.09105254852453842, + "grad_norm": 0.7209203243255615, + "learning_rate": 4.992155048520953e-06, + "loss": 0.698, + "step": 577 + }, + { + "epoch": 0.09121035190153069, + "grad_norm": 0.5873061418533325, + "learning_rate": 4.992122138467756e-06, + "loss": 0.6349, + "step": 578 + }, + { + "epoch": 0.09136815527852296, + "grad_norm": 0.5850968360900879, + "learning_rate": 4.992089159638057e-06, + "loss": 0.6306, + "step": 579 + }, + { + "epoch": 0.09152595865551523, + "grad_norm": 0.5865573883056641, + "learning_rate": 4.992056112032768e-06, + "loss": 0.6674, + "step": 580 + }, + { + "epoch": 0.0916837620325075, + "grad_norm": 0.6550090312957764, + "learning_rate": 4.9920229956528e-06, + "loss": 0.696, + "step": 581 + }, + { + "epoch": 0.09184156540949977, + "grad_norm": 0.6096286773681641, + "learning_rate": 4.991989810499066e-06, + "loss": 0.6042, + "step": 582 + }, + { + "epoch": 0.09199936878649204, + "grad_norm": 0.5901389718055725, + "learning_rate": 4.9919565565724835e-06, + "loss": 0.6445, + "step": 583 + }, + { + "epoch": 0.0921571721634843, + "grad_norm": 0.6151852011680603, + "learning_rate": 4.991923233873969e-06, + "loss": 0.6329, + "step": 584 + }, + { + "epoch": 0.09231497554047656, + "grad_norm": 0.632743775844574, + "learning_rate": 4.991889842404443e-06, + "loss": 0.6356, + "step": 585 + }, + { + "epoch": 0.09247277891746883, + "grad_norm": 0.6366567015647888, + "learning_rate": 4.991856382164827e-06, + "loss": 0.664, + "step": 586 + }, + { + "epoch": 0.0926305822944611, + "grad_norm": 0.6036646366119385, + "learning_rate": 4.991822853156043e-06, + "loss": 0.672, + "step": 587 + }, + { + "epoch": 0.09278838567145337, + "grad_norm": 0.6357161402702332, + "learning_rate": 4.991789255379018e-06, + "loss": 0.6646, + "step": 588 + }, + { + "epoch": 0.09294618904844563, + "grad_norm": 0.6097987294197083, + "learning_rate": 4.9917555888346794e-06, + "loss": 0.6449, + "step": 589 + }, + { + "epoch": 0.0931039924254379, + "grad_norm": 0.6390067934989929, + "learning_rate": 4.991721853523955e-06, + "loss": 0.6428, + "step": 590 + }, + { + "epoch": 0.09326179580243017, + "grad_norm": 0.6125891804695129, + "learning_rate": 4.991688049447776e-06, + "loss": 0.5934, + "step": 591 + }, + { + "epoch": 0.09341959917942244, + "grad_norm": 0.5703574419021606, + "learning_rate": 4.991654176607075e-06, + "loss": 0.6187, + "step": 592 + }, + { + "epoch": 0.09357740255641471, + "grad_norm": 0.5983945727348328, + "learning_rate": 4.991620235002789e-06, + "loss": 0.6538, + "step": 593 + }, + { + "epoch": 0.09373520593340698, + "grad_norm": 0.6186562776565552, + "learning_rate": 4.991586224635851e-06, + "loss": 0.6735, + "step": 594 + }, + { + "epoch": 0.09389300931039925, + "grad_norm": 0.621983528137207, + "learning_rate": 4.9915521455072035e-06, + "loss": 0.6637, + "step": 595 + }, + { + "epoch": 0.09405081268739152, + "grad_norm": 0.6209889650344849, + "learning_rate": 4.9915179976177855e-06, + "loss": 0.6458, + "step": 596 + }, + { + "epoch": 0.09420861606438377, + "grad_norm": 0.5986520051956177, + "learning_rate": 4.9914837809685386e-06, + "loss": 0.6866, + "step": 597 + }, + { + "epoch": 0.09436641944137604, + "grad_norm": 0.6145833730697632, + "learning_rate": 4.991449495560408e-06, + "loss": 0.6623, + "step": 598 + }, + { + "epoch": 0.09452422281836831, + "grad_norm": 0.6565549969673157, + "learning_rate": 4.991415141394339e-06, + "loss": 0.6591, + "step": 599 + }, + { + "epoch": 0.09468202619536058, + "grad_norm": 0.6176513433456421, + "learning_rate": 4.99138071847128e-06, + "loss": 0.6395, + "step": 600 + }, + { + "epoch": 0.09483982957235285, + "grad_norm": 0.6156229376792908, + "learning_rate": 4.991346226792182e-06, + "loss": 0.6337, + "step": 601 + }, + { + "epoch": 0.09499763294934511, + "grad_norm": 0.6132075190544128, + "learning_rate": 4.991311666357996e-06, + "loss": 0.6388, + "step": 602 + }, + { + "epoch": 0.09515543632633738, + "grad_norm": 0.6197117567062378, + "learning_rate": 4.991277037169675e-06, + "loss": 0.6856, + "step": 603 + }, + { + "epoch": 0.09531323970332965, + "grad_norm": 0.6919817924499512, + "learning_rate": 4.991242339228176e-06, + "loss": 0.6669, + "step": 604 + }, + { + "epoch": 0.09547104308032192, + "grad_norm": 0.6249364018440247, + "learning_rate": 4.991207572534456e-06, + "loss": 0.6856, + "step": 605 + }, + { + "epoch": 0.09562884645731419, + "grad_norm": 0.6257843971252441, + "learning_rate": 4.991172737089476e-06, + "loss": 0.6374, + "step": 606 + }, + { + "epoch": 0.09578664983430646, + "grad_norm": 0.5867553949356079, + "learning_rate": 4.9911378328941944e-06, + "loss": 0.6251, + "step": 607 + }, + { + "epoch": 0.09594445321129873, + "grad_norm": 0.6265820264816284, + "learning_rate": 4.991102859949577e-06, + "loss": 0.6892, + "step": 608 + }, + { + "epoch": 0.096102256588291, + "grad_norm": 0.5982639789581299, + "learning_rate": 4.991067818256588e-06, + "loss": 0.5834, + "step": 609 + }, + { + "epoch": 0.09626005996528325, + "grad_norm": 0.6259893178939819, + "learning_rate": 4.991032707816194e-06, + "loss": 0.6251, + "step": 610 + }, + { + "epoch": 0.09641786334227552, + "grad_norm": 0.6230077743530273, + "learning_rate": 4.990997528629365e-06, + "loss": 0.6343, + "step": 611 + }, + { + "epoch": 0.09657566671926779, + "grad_norm": 0.6253799796104431, + "learning_rate": 4.9909622806970705e-06, + "loss": 0.681, + "step": 612 + }, + { + "epoch": 0.09673347009626006, + "grad_norm": 0.6201931834220886, + "learning_rate": 4.9909269640202855e-06, + "loss": 0.6729, + "step": 613 + }, + { + "epoch": 0.09689127347325233, + "grad_norm": 0.6011062264442444, + "learning_rate": 4.990891578599982e-06, + "loss": 0.6549, + "step": 614 + }, + { + "epoch": 0.0970490768502446, + "grad_norm": 0.5866005420684814, + "learning_rate": 4.990856124437138e-06, + "loss": 0.6422, + "step": 615 + }, + { + "epoch": 0.09720688022723686, + "grad_norm": 0.6311702132225037, + "learning_rate": 4.990820601532732e-06, + "loss": 0.676, + "step": 616 + }, + { + "epoch": 0.09736468360422913, + "grad_norm": 0.6011964082717896, + "learning_rate": 4.990785009887744e-06, + "loss": 0.6525, + "step": 617 + }, + { + "epoch": 0.0975224869812214, + "grad_norm": 0.6164799332618713, + "learning_rate": 4.990749349503156e-06, + "loss": 0.6665, + "step": 618 + }, + { + "epoch": 0.09768029035821367, + "grad_norm": 0.6249286532402039, + "learning_rate": 4.990713620379953e-06, + "loss": 0.6661, + "step": 619 + }, + { + "epoch": 0.09783809373520594, + "grad_norm": 0.6010226011276245, + "learning_rate": 4.990677822519121e-06, + "loss": 0.6435, + "step": 620 + }, + { + "epoch": 0.09799589711219821, + "grad_norm": 0.5993772745132446, + "learning_rate": 4.990641955921646e-06, + "loss": 0.6791, + "step": 621 + }, + { + "epoch": 0.09815370048919046, + "grad_norm": 0.6331866979598999, + "learning_rate": 4.9906060205885206e-06, + "loss": 0.6245, + "step": 622 + }, + { + "epoch": 0.09831150386618273, + "grad_norm": 0.6250171661376953, + "learning_rate": 4.9905700165207345e-06, + "loss": 0.6509, + "step": 623 + }, + { + "epoch": 0.098469307243175, + "grad_norm": 0.6126798391342163, + "learning_rate": 4.9905339437192825e-06, + "loss": 0.658, + "step": 624 + }, + { + "epoch": 0.09862711062016727, + "grad_norm": 0.6103727221488953, + "learning_rate": 4.990497802185159e-06, + "loss": 0.6624, + "step": 625 + }, + { + "epoch": 0.09878491399715954, + "grad_norm": 0.6092430353164673, + "learning_rate": 4.990461591919362e-06, + "loss": 0.6584, + "step": 626 + }, + { + "epoch": 0.0989427173741518, + "grad_norm": 0.5674698948860168, + "learning_rate": 4.9904253129228916e-06, + "loss": 0.6299, + "step": 627 + }, + { + "epoch": 0.09910052075114407, + "grad_norm": 0.6033174395561218, + "learning_rate": 4.990388965196748e-06, + "loss": 0.652, + "step": 628 + }, + { + "epoch": 0.09925832412813634, + "grad_norm": 0.6227395534515381, + "learning_rate": 4.990352548741935e-06, + "loss": 0.6668, + "step": 629 + }, + { + "epoch": 0.09941612750512861, + "grad_norm": 0.6107274889945984, + "learning_rate": 4.990316063559457e-06, + "loss": 0.6719, + "step": 630 + }, + { + "epoch": 0.09957393088212088, + "grad_norm": 0.621626615524292, + "learning_rate": 4.990279509650321e-06, + "loss": 0.6364, + "step": 631 + }, + { + "epoch": 0.09973173425911315, + "grad_norm": 0.586185872554779, + "learning_rate": 4.990242887015536e-06, + "loss": 0.6421, + "step": 632 + }, + { + "epoch": 0.09988953763610542, + "grad_norm": 0.6320814490318298, + "learning_rate": 4.990206195656112e-06, + "loss": 0.6325, + "step": 633 + }, + { + "epoch": 0.10004734101309767, + "grad_norm": 0.592439591884613, + "learning_rate": 4.990169435573062e-06, + "loss": 0.6611, + "step": 634 + }, + { + "epoch": 0.10020514439008994, + "grad_norm": 0.578355073928833, + "learning_rate": 4.9901326067674015e-06, + "loss": 0.6722, + "step": 635 + }, + { + "epoch": 0.10036294776708221, + "grad_norm": 0.6121755242347717, + "learning_rate": 4.990095709240146e-06, + "loss": 0.6614, + "step": 636 + }, + { + "epoch": 0.10052075114407448, + "grad_norm": 0.6014378666877747, + "learning_rate": 4.990058742992314e-06, + "loss": 0.6505, + "step": 637 + }, + { + "epoch": 0.10067855452106675, + "grad_norm": 0.6020945310592651, + "learning_rate": 4.990021708024926e-06, + "loss": 0.5879, + "step": 638 + }, + { + "epoch": 0.10083635789805902, + "grad_norm": 0.638414740562439, + "learning_rate": 4.989984604339002e-06, + "loss": 0.6587, + "step": 639 + }, + { + "epoch": 0.10099416127505129, + "grad_norm": 0.5972415804862976, + "learning_rate": 4.989947431935569e-06, + "loss": 0.641, + "step": 640 + }, + { + "epoch": 0.10115196465204356, + "grad_norm": 0.6102055907249451, + "learning_rate": 4.989910190815651e-06, + "loss": 0.6409, + "step": 641 + }, + { + "epoch": 0.10130976802903582, + "grad_norm": 0.5795861482620239, + "learning_rate": 4.989872880980276e-06, + "loss": 0.5973, + "step": 642 + }, + { + "epoch": 0.10146757140602809, + "grad_norm": 0.6280496716499329, + "learning_rate": 4.989835502430474e-06, + "loss": 0.6171, + "step": 643 + }, + { + "epoch": 0.10162537478302036, + "grad_norm": 0.6210328936576843, + "learning_rate": 4.989798055167277e-06, + "loss": 0.658, + "step": 644 + }, + { + "epoch": 0.10178317816001263, + "grad_norm": 0.7666929364204407, + "learning_rate": 4.989760539191717e-06, + "loss": 0.6371, + "step": 645 + }, + { + "epoch": 0.1019409815370049, + "grad_norm": 0.5933274030685425, + "learning_rate": 4.989722954504831e-06, + "loss": 0.6449, + "step": 646 + }, + { + "epoch": 0.10209878491399715, + "grad_norm": 0.6040042638778687, + "learning_rate": 4.989685301107654e-06, + "loss": 0.615, + "step": 647 + }, + { + "epoch": 0.10225658829098942, + "grad_norm": 0.5947490334510803, + "learning_rate": 4.989647579001228e-06, + "loss": 0.624, + "step": 648 + }, + { + "epoch": 0.10241439166798169, + "grad_norm": 0.6093249320983887, + "learning_rate": 4.989609788186592e-06, + "loss": 0.6491, + "step": 649 + }, + { + "epoch": 0.10257219504497396, + "grad_norm": 0.6214513182640076, + "learning_rate": 4.989571928664791e-06, + "loss": 0.631, + "step": 650 + }, + { + "epoch": 0.10272999842196623, + "grad_norm": 0.6212220191955566, + "learning_rate": 4.9895340004368665e-06, + "loss": 0.6733, + "step": 651 + }, + { + "epoch": 0.1028878017989585, + "grad_norm": 0.6171040534973145, + "learning_rate": 4.989496003503868e-06, + "loss": 0.6738, + "step": 652 + }, + { + "epoch": 0.10304560517595077, + "grad_norm": 0.5968170762062073, + "learning_rate": 4.989457937866844e-06, + "loss": 0.6499, + "step": 653 + }, + { + "epoch": 0.10320340855294304, + "grad_norm": 0.6016853451728821, + "learning_rate": 4.989419803526843e-06, + "loss": 0.6249, + "step": 654 + }, + { + "epoch": 0.1033612119299353, + "grad_norm": 0.5786157250404358, + "learning_rate": 4.98938160048492e-06, + "loss": 0.5752, + "step": 655 + }, + { + "epoch": 0.10351901530692757, + "grad_norm": 0.6222103238105774, + "learning_rate": 4.9893433287421265e-06, + "loss": 0.6175, + "step": 656 + }, + { + "epoch": 0.10367681868391984, + "grad_norm": 0.5992321372032166, + "learning_rate": 4.989304988299521e-06, + "loss": 0.6662, + "step": 657 + }, + { + "epoch": 0.10383462206091211, + "grad_norm": 0.6051515340805054, + "learning_rate": 4.98926657915816e-06, + "loss": 0.6809, + "step": 658 + }, + { + "epoch": 0.10399242543790436, + "grad_norm": 0.6599189043045044, + "learning_rate": 4.989228101319106e-06, + "loss": 0.6598, + "step": 659 + }, + { + "epoch": 0.10415022881489663, + "grad_norm": 0.5894850492477417, + "learning_rate": 4.989189554783418e-06, + "loss": 0.6665, + "step": 660 + }, + { + "epoch": 0.1043080321918889, + "grad_norm": 0.5847735404968262, + "learning_rate": 4.989150939552162e-06, + "loss": 0.6223, + "step": 661 + }, + { + "epoch": 0.10446583556888117, + "grad_norm": 0.5913602709770203, + "learning_rate": 4.9891122556264015e-06, + "loss": 0.6556, + "step": 662 + }, + { + "epoch": 0.10462363894587344, + "grad_norm": 0.6041651368141174, + "learning_rate": 4.989073503007206e-06, + "loss": 0.6066, + "step": 663 + }, + { + "epoch": 0.10478144232286571, + "grad_norm": 0.5660608410835266, + "learning_rate": 4.989034681695644e-06, + "loss": 0.6268, + "step": 664 + }, + { + "epoch": 0.10493924569985798, + "grad_norm": 0.5800573825836182, + "learning_rate": 4.988995791692788e-06, + "loss": 0.6307, + "step": 665 + }, + { + "epoch": 0.10509704907685025, + "grad_norm": 0.5766053199768066, + "learning_rate": 4.988956832999709e-06, + "loss": 0.6537, + "step": 666 + }, + { + "epoch": 0.10525485245384252, + "grad_norm": 0.6069685220718384, + "learning_rate": 4.988917805617484e-06, + "loss": 0.6562, + "step": 667 + }, + { + "epoch": 0.10541265583083478, + "grad_norm": 0.6245861053466797, + "learning_rate": 4.9888787095471904e-06, + "loss": 0.6335, + "step": 668 + }, + { + "epoch": 0.10557045920782705, + "grad_norm": 0.6105977892875671, + "learning_rate": 4.9888395447899055e-06, + "loss": 0.6786, + "step": 669 + }, + { + "epoch": 0.10572826258481932, + "grad_norm": 0.5970708131790161, + "learning_rate": 4.988800311346711e-06, + "loss": 0.6587, + "step": 670 + }, + { + "epoch": 0.10588606596181158, + "grad_norm": 0.6274979710578918, + "learning_rate": 4.98876100921869e-06, + "loss": 0.6254, + "step": 671 + }, + { + "epoch": 0.10604386933880385, + "grad_norm": 0.6414305567741394, + "learning_rate": 4.988721638406927e-06, + "loss": 0.6586, + "step": 672 + }, + { + "epoch": 0.10620167271579611, + "grad_norm": 0.5663453936576843, + "learning_rate": 4.988682198912509e-06, + "loss": 0.6293, + "step": 673 + }, + { + "epoch": 0.10635947609278838, + "grad_norm": 0.6417929530143738, + "learning_rate": 4.988642690736523e-06, + "loss": 0.6669, + "step": 674 + }, + { + "epoch": 0.10651727946978065, + "grad_norm": 0.5961757898330688, + "learning_rate": 4.988603113880059e-06, + "loss": 0.6369, + "step": 675 + }, + { + "epoch": 0.10667508284677292, + "grad_norm": 0.6168935894966125, + "learning_rate": 4.988563468344212e-06, + "loss": 0.6507, + "step": 676 + }, + { + "epoch": 0.10683288622376519, + "grad_norm": 0.6196932792663574, + "learning_rate": 4.988523754130074e-06, + "loss": 0.6059, + "step": 677 + }, + { + "epoch": 0.10699068960075746, + "grad_norm": 0.6085158586502075, + "learning_rate": 4.9884839712387415e-06, + "loss": 0.6692, + "step": 678 + }, + { + "epoch": 0.10714849297774973, + "grad_norm": 0.6351853609085083, + "learning_rate": 4.988444119671313e-06, + "loss": 0.6392, + "step": 679 + }, + { + "epoch": 0.107306296354742, + "grad_norm": 0.6414273977279663, + "learning_rate": 4.988404199428887e-06, + "loss": 0.6768, + "step": 680 + }, + { + "epoch": 0.10746409973173426, + "grad_norm": 0.5864552855491638, + "learning_rate": 4.988364210512565e-06, + "loss": 0.6413, + "step": 681 + }, + { + "epoch": 0.10762190310872653, + "grad_norm": 0.6175075173377991, + "learning_rate": 4.988324152923453e-06, + "loss": 0.6539, + "step": 682 + }, + { + "epoch": 0.10777970648571879, + "grad_norm": 0.6025212407112122, + "learning_rate": 4.988284026662654e-06, + "loss": 0.6267, + "step": 683 + }, + { + "epoch": 0.10793750986271106, + "grad_norm": 0.6458533406257629, + "learning_rate": 4.988243831731277e-06, + "loss": 0.6654, + "step": 684 + }, + { + "epoch": 0.10809531323970333, + "grad_norm": 0.5975069999694824, + "learning_rate": 4.9882035681304306e-06, + "loss": 0.6355, + "step": 685 + }, + { + "epoch": 0.1082531166166956, + "grad_norm": 0.6233134269714355, + "learning_rate": 4.9881632358612255e-06, + "loss": 0.6206, + "step": 686 + }, + { + "epoch": 0.10841091999368786, + "grad_norm": 0.7895084023475647, + "learning_rate": 4.988122834924776e-06, + "loss": 0.6379, + "step": 687 + }, + { + "epoch": 0.10856872337068013, + "grad_norm": 0.6212348937988281, + "learning_rate": 4.988082365322197e-06, + "loss": 0.6349, + "step": 688 + }, + { + "epoch": 0.1087265267476724, + "grad_norm": 0.6340476274490356, + "learning_rate": 4.988041827054603e-06, + "loss": 0.666, + "step": 689 + }, + { + "epoch": 0.10888433012466467, + "grad_norm": 0.6250340938568115, + "learning_rate": 4.9880012201231155e-06, + "loss": 0.6257, + "step": 690 + }, + { + "epoch": 0.10904213350165694, + "grad_norm": 0.6242830753326416, + "learning_rate": 4.987960544528854e-06, + "loss": 0.6592, + "step": 691 + }, + { + "epoch": 0.1091999368786492, + "grad_norm": 0.6284206509590149, + "learning_rate": 4.987919800272941e-06, + "loss": 0.6585, + "step": 692 + }, + { + "epoch": 0.10935774025564148, + "grad_norm": 0.6215904355049133, + "learning_rate": 4.987878987356501e-06, + "loss": 0.6547, + "step": 693 + }, + { + "epoch": 0.10951554363263374, + "grad_norm": 0.6591547131538391, + "learning_rate": 4.9878381057806615e-06, + "loss": 0.6303, + "step": 694 + }, + { + "epoch": 0.10967334700962601, + "grad_norm": 0.5918931365013123, + "learning_rate": 4.987797155546549e-06, + "loss": 0.6411, + "step": 695 + }, + { + "epoch": 0.10983115038661827, + "grad_norm": 0.6448194980621338, + "learning_rate": 4.987756136655294e-06, + "loss": 0.5921, + "step": 696 + }, + { + "epoch": 0.10998895376361054, + "grad_norm": 0.629386842250824, + "learning_rate": 4.987715049108029e-06, + "loss": 0.6419, + "step": 697 + }, + { + "epoch": 0.1101467571406028, + "grad_norm": 0.5971304178237915, + "learning_rate": 4.9876738929058885e-06, + "loss": 0.6487, + "step": 698 + }, + { + "epoch": 0.11030456051759507, + "grad_norm": 0.652079164981842, + "learning_rate": 4.987632668050008e-06, + "loss": 0.6289, + "step": 699 + }, + { + "epoch": 0.11046236389458734, + "grad_norm": 0.6510581374168396, + "learning_rate": 4.987591374541523e-06, + "loss": 0.6657, + "step": 700 + }, + { + "epoch": 0.11062016727157961, + "grad_norm": 0.5817864537239075, + "learning_rate": 4.9875500123815755e-06, + "loss": 0.6226, + "step": 701 + }, + { + "epoch": 0.11077797064857188, + "grad_norm": 0.6237527132034302, + "learning_rate": 4.987508581571307e-06, + "loss": 0.6104, + "step": 702 + }, + { + "epoch": 0.11093577402556415, + "grad_norm": 0.5994796752929688, + "learning_rate": 4.98746708211186e-06, + "loss": 0.6507, + "step": 703 + }, + { + "epoch": 0.11109357740255642, + "grad_norm": 0.6146982908248901, + "learning_rate": 4.987425514004381e-06, + "loss": 0.6488, + "step": 704 + }, + { + "epoch": 0.11125138077954869, + "grad_norm": 0.6261718273162842, + "learning_rate": 4.987383877250014e-06, + "loss": 0.6333, + "step": 705 + }, + { + "epoch": 0.11140918415654096, + "grad_norm": 0.6297543048858643, + "learning_rate": 4.987342171849912e-06, + "loss": 0.6196, + "step": 706 + }, + { + "epoch": 0.11156698753353322, + "grad_norm": 0.591560959815979, + "learning_rate": 4.9873003978052236e-06, + "loss": 0.6223, + "step": 707 + }, + { + "epoch": 0.11172479091052548, + "grad_norm": 0.601751983165741, + "learning_rate": 4.9872585551171025e-06, + "loss": 0.654, + "step": 708 + }, + { + "epoch": 0.11188259428751775, + "grad_norm": 0.6266677975654602, + "learning_rate": 4.987216643786704e-06, + "loss": 0.6754, + "step": 709 + }, + { + "epoch": 0.11204039766451002, + "grad_norm": 0.5937751531600952, + "learning_rate": 4.987174663815183e-06, + "loss": 0.621, + "step": 710 + }, + { + "epoch": 0.11219820104150229, + "grad_norm": 0.6311497688293457, + "learning_rate": 4.9871326152037e-06, + "loss": 0.6635, + "step": 711 + }, + { + "epoch": 0.11235600441849455, + "grad_norm": 0.6197444796562195, + "learning_rate": 4.987090497953414e-06, + "loss": 0.6567, + "step": 712 + }, + { + "epoch": 0.11251380779548682, + "grad_norm": 0.611314058303833, + "learning_rate": 4.987048312065488e-06, + "loss": 0.6478, + "step": 713 + }, + { + "epoch": 0.11267161117247909, + "grad_norm": 0.6013267636299133, + "learning_rate": 4.9870060575410865e-06, + "loss": 0.6148, + "step": 714 + }, + { + "epoch": 0.11282941454947136, + "grad_norm": 0.6376367211341858, + "learning_rate": 4.986963734381374e-06, + "loss": 0.6088, + "step": 715 + }, + { + "epoch": 0.11298721792646363, + "grad_norm": 0.6046638488769531, + "learning_rate": 4.986921342587521e-06, + "loss": 0.6505, + "step": 716 + }, + { + "epoch": 0.1131450213034559, + "grad_norm": 0.5740271210670471, + "learning_rate": 4.986878882160695e-06, + "loss": 0.6493, + "step": 717 + }, + { + "epoch": 0.11330282468044817, + "grad_norm": 0.6046896576881409, + "learning_rate": 4.9868363531020695e-06, + "loss": 0.6308, + "step": 718 + }, + { + "epoch": 0.11346062805744044, + "grad_norm": 0.6252300143241882, + "learning_rate": 4.986793755412817e-06, + "loss": 0.6678, + "step": 719 + }, + { + "epoch": 0.11361843143443269, + "grad_norm": 0.5835577845573425, + "learning_rate": 4.986751089094115e-06, + "loss": 0.6427, + "step": 720 + }, + { + "epoch": 0.11377623481142496, + "grad_norm": 0.6209131479263306, + "learning_rate": 4.9867083541471394e-06, + "loss": 0.6369, + "step": 721 + }, + { + "epoch": 0.11393403818841723, + "grad_norm": 0.6183339953422546, + "learning_rate": 4.98666555057307e-06, + "loss": 0.6754, + "step": 722 + }, + { + "epoch": 0.1140918415654095, + "grad_norm": 0.5800102353096008, + "learning_rate": 4.986622678373088e-06, + "loss": 0.6659, + "step": 723 + }, + { + "epoch": 0.11424964494240177, + "grad_norm": 0.5979956388473511, + "learning_rate": 4.986579737548376e-06, + "loss": 0.6371, + "step": 724 + }, + { + "epoch": 0.11440744831939403, + "grad_norm": 0.6228020191192627, + "learning_rate": 4.98653672810012e-06, + "loss": 0.6246, + "step": 725 + }, + { + "epoch": 0.1145652516963863, + "grad_norm": 0.6395916938781738, + "learning_rate": 4.986493650029506e-06, + "loss": 0.6614, + "step": 726 + }, + { + "epoch": 0.11472305507337857, + "grad_norm": 0.6124863028526306, + "learning_rate": 4.986450503337723e-06, + "loss": 0.6217, + "step": 727 + }, + { + "epoch": 0.11488085845037084, + "grad_norm": 0.6137666702270508, + "learning_rate": 4.986407288025964e-06, + "loss": 0.6417, + "step": 728 + }, + { + "epoch": 0.11503866182736311, + "grad_norm": 0.6249995231628418, + "learning_rate": 4.986364004095419e-06, + "loss": 0.6481, + "step": 729 + }, + { + "epoch": 0.11519646520435538, + "grad_norm": 0.5854856371879578, + "learning_rate": 4.986320651547282e-06, + "loss": 0.6398, + "step": 730 + }, + { + "epoch": 0.11535426858134765, + "grad_norm": 0.6225841641426086, + "learning_rate": 4.986277230382752e-06, + "loss": 0.6464, + "step": 731 + }, + { + "epoch": 0.11551207195833992, + "grad_norm": 0.6337052583694458, + "learning_rate": 4.986233740603026e-06, + "loss": 0.661, + "step": 732 + }, + { + "epoch": 0.11566987533533217, + "grad_norm": 0.6871806383132935, + "learning_rate": 4.986190182209304e-06, + "loss": 0.6731, + "step": 733 + }, + { + "epoch": 0.11582767871232444, + "grad_norm": 0.6150521636009216, + "learning_rate": 4.986146555202789e-06, + "loss": 0.6081, + "step": 734 + }, + { + "epoch": 0.11598548208931671, + "grad_norm": 0.6298655271530151, + "learning_rate": 4.986102859584684e-06, + "loss": 0.6533, + "step": 735 + }, + { + "epoch": 0.11614328546630898, + "grad_norm": 0.6238766312599182, + "learning_rate": 4.986059095356195e-06, + "loss": 0.6322, + "step": 736 + }, + { + "epoch": 0.11630108884330125, + "grad_norm": 0.6006590723991394, + "learning_rate": 4.9860152625185306e-06, + "loss": 0.638, + "step": 737 + }, + { + "epoch": 0.11645889222029351, + "grad_norm": 0.6113336682319641, + "learning_rate": 4.985971361072899e-06, + "loss": 0.6604, + "step": 738 + }, + { + "epoch": 0.11661669559728578, + "grad_norm": 0.592668354511261, + "learning_rate": 4.985927391020514e-06, + "loss": 0.6268, + "step": 739 + }, + { + "epoch": 0.11677449897427805, + "grad_norm": 0.6109687685966492, + "learning_rate": 4.9858833523625874e-06, + "loss": 0.6367, + "step": 740 + }, + { + "epoch": 0.11693230235127032, + "grad_norm": 0.6443423628807068, + "learning_rate": 4.985839245100334e-06, + "loss": 0.6377, + "step": 741 + }, + { + "epoch": 0.11709010572826259, + "grad_norm": 0.5952412486076355, + "learning_rate": 4.985795069234973e-06, + "loss": 0.6512, + "step": 742 + }, + { + "epoch": 0.11724790910525486, + "grad_norm": 0.5993346571922302, + "learning_rate": 4.985750824767722e-06, + "loss": 0.6286, + "step": 743 + }, + { + "epoch": 0.11740571248224713, + "grad_norm": 0.6387029886245728, + "learning_rate": 4.9857065116998025e-06, + "loss": 0.6453, + "step": 744 + }, + { + "epoch": 0.11756351585923938, + "grad_norm": 0.5912245512008667, + "learning_rate": 4.985662130032438e-06, + "loss": 0.6767, + "step": 745 + }, + { + "epoch": 0.11772131923623165, + "grad_norm": 0.6701906323432922, + "learning_rate": 4.985617679766853e-06, + "loss": 0.6253, + "step": 746 + }, + { + "epoch": 0.11787912261322392, + "grad_norm": 0.6133910417556763, + "learning_rate": 4.985573160904273e-06, + "loss": 0.6581, + "step": 747 + }, + { + "epoch": 0.11803692599021619, + "grad_norm": 0.6203881502151489, + "learning_rate": 4.985528573445929e-06, + "loss": 0.6378, + "step": 748 + }, + { + "epoch": 0.11819472936720846, + "grad_norm": 0.6089757680892944, + "learning_rate": 4.9854839173930495e-06, + "loss": 0.6501, + "step": 749 + }, + { + "epoch": 0.11835253274420073, + "grad_norm": 0.6312099695205688, + "learning_rate": 4.985439192746868e-06, + "loss": 0.6124, + "step": 750 + }, + { + "epoch": 0.118510336121193, + "grad_norm": 0.625335156917572, + "learning_rate": 4.985394399508618e-06, + "loss": 0.6416, + "step": 751 + }, + { + "epoch": 0.11866813949818526, + "grad_norm": 0.6088402271270752, + "learning_rate": 4.9853495376795355e-06, + "loss": 0.6556, + "step": 752 + }, + { + "epoch": 0.11882594287517753, + "grad_norm": 0.6130397915840149, + "learning_rate": 4.9853046072608606e-06, + "loss": 0.63, + "step": 753 + }, + { + "epoch": 0.1189837462521698, + "grad_norm": 0.6408692598342896, + "learning_rate": 4.985259608253831e-06, + "loss": 0.6399, + "step": 754 + }, + { + "epoch": 0.11914154962916207, + "grad_norm": 0.5873361229896545, + "learning_rate": 4.985214540659689e-06, + "loss": 0.6325, + "step": 755 + }, + { + "epoch": 0.11929935300615434, + "grad_norm": 0.5983771085739136, + "learning_rate": 4.985169404479679e-06, + "loss": 0.648, + "step": 756 + }, + { + "epoch": 0.11945715638314659, + "grad_norm": 0.6090245842933655, + "learning_rate": 4.985124199715046e-06, + "loss": 0.6038, + "step": 757 + }, + { + "epoch": 0.11961495976013886, + "grad_norm": 0.6397486925125122, + "learning_rate": 4.985078926367039e-06, + "loss": 0.6319, + "step": 758 + }, + { + "epoch": 0.11977276313713113, + "grad_norm": 0.6090947389602661, + "learning_rate": 4.985033584436905e-06, + "loss": 0.6244, + "step": 759 + }, + { + "epoch": 0.1199305665141234, + "grad_norm": 0.6076495051383972, + "learning_rate": 4.9849881739258975e-06, + "loss": 0.6143, + "step": 760 + }, + { + "epoch": 0.12008836989111567, + "grad_norm": 0.6083663105964661, + "learning_rate": 4.984942694835269e-06, + "loss": 0.6343, + "step": 761 + }, + { + "epoch": 0.12024617326810794, + "grad_norm": 0.6146347522735596, + "learning_rate": 4.984897147166274e-06, + "loss": 0.6615, + "step": 762 + }, + { + "epoch": 0.1204039766451002, + "grad_norm": 0.6218137145042419, + "learning_rate": 4.98485153092017e-06, + "loss": 0.6159, + "step": 763 + }, + { + "epoch": 0.12056178002209247, + "grad_norm": 0.591456413269043, + "learning_rate": 4.984805846098216e-06, + "loss": 0.6427, + "step": 764 + }, + { + "epoch": 0.12071958339908474, + "grad_norm": 0.5927408337593079, + "learning_rate": 4.9847600927016725e-06, + "loss": 0.6306, + "step": 765 + }, + { + "epoch": 0.12087738677607701, + "grad_norm": 0.6461350321769714, + "learning_rate": 4.984714270731803e-06, + "loss": 0.6653, + "step": 766 + }, + { + "epoch": 0.12103519015306928, + "grad_norm": 0.6130207180976868, + "learning_rate": 4.984668380189871e-06, + "loss": 0.6197, + "step": 767 + }, + { + "epoch": 0.12119299353006155, + "grad_norm": 0.6011877059936523, + "learning_rate": 4.984622421077144e-06, + "loss": 0.6115, + "step": 768 + }, + { + "epoch": 0.12135079690705382, + "grad_norm": 0.5939370393753052, + "learning_rate": 4.984576393394888e-06, + "loss": 0.6089, + "step": 769 + }, + { + "epoch": 0.12150860028404607, + "grad_norm": 0.6029325723648071, + "learning_rate": 4.984530297144378e-06, + "loss": 0.6379, + "step": 770 + }, + { + "epoch": 0.12166640366103834, + "grad_norm": 0.5716822147369385, + "learning_rate": 4.984484132326881e-06, + "loss": 0.6231, + "step": 771 + }, + { + "epoch": 0.12182420703803061, + "grad_norm": 0.6190779209136963, + "learning_rate": 4.984437898943674e-06, + "loss": 0.6592, + "step": 772 + }, + { + "epoch": 0.12198201041502288, + "grad_norm": 0.6198763847351074, + "learning_rate": 4.984391596996031e-06, + "loss": 0.6424, + "step": 773 + }, + { + "epoch": 0.12213981379201515, + "grad_norm": 0.6004554033279419, + "learning_rate": 4.984345226485232e-06, + "loss": 0.6281, + "step": 774 + }, + { + "epoch": 0.12229761716900742, + "grad_norm": 0.5974472165107727, + "learning_rate": 4.984298787412556e-06, + "loss": 0.6369, + "step": 775 + }, + { + "epoch": 0.12245542054599969, + "grad_norm": 0.6040560007095337, + "learning_rate": 4.984252279779284e-06, + "loss": 0.6459, + "step": 776 + }, + { + "epoch": 0.12261322392299195, + "grad_norm": 0.6181743144989014, + "learning_rate": 4.984205703586699e-06, + "loss": 0.655, + "step": 777 + }, + { + "epoch": 0.12277102729998422, + "grad_norm": 0.6129052042961121, + "learning_rate": 4.984159058836088e-06, + "loss": 0.6243, + "step": 778 + }, + { + "epoch": 0.12292883067697649, + "grad_norm": 0.5938361287117004, + "learning_rate": 4.984112345528736e-06, + "loss": 0.6381, + "step": 779 + }, + { + "epoch": 0.12308663405396876, + "grad_norm": 0.6039736270904541, + "learning_rate": 4.984065563665936e-06, + "loss": 0.639, + "step": 780 + }, + { + "epoch": 0.12324443743096103, + "grad_norm": 0.6031100749969482, + "learning_rate": 4.984018713248975e-06, + "loss": 0.6584, + "step": 781 + }, + { + "epoch": 0.12340224080795328, + "grad_norm": 0.5818390846252441, + "learning_rate": 4.983971794279147e-06, + "loss": 0.6169, + "step": 782 + }, + { + "epoch": 0.12356004418494555, + "grad_norm": 0.636507511138916, + "learning_rate": 4.983924806757749e-06, + "loss": 0.6565, + "step": 783 + }, + { + "epoch": 0.12371784756193782, + "grad_norm": 0.61595219373703, + "learning_rate": 4.983877750686076e-06, + "loss": 0.6403, + "step": 784 + }, + { + "epoch": 0.12387565093893009, + "grad_norm": 0.590165376663208, + "learning_rate": 4.983830626065427e-06, + "loss": 0.6235, + "step": 785 + }, + { + "epoch": 0.12403345431592236, + "grad_norm": 0.7213973999023438, + "learning_rate": 4.983783432897102e-06, + "loss": 0.6364, + "step": 786 + }, + { + "epoch": 0.12419125769291463, + "grad_norm": 0.6123033165931702, + "learning_rate": 4.983736171182404e-06, + "loss": 0.6534, + "step": 787 + }, + { + "epoch": 0.1243490610699069, + "grad_norm": 0.6014653444290161, + "learning_rate": 4.983688840922637e-06, + "loss": 0.6522, + "step": 788 + }, + { + "epoch": 0.12450686444689917, + "grad_norm": 0.6012893915176392, + "learning_rate": 4.9836414421191074e-06, + "loss": 0.6098, + "step": 789 + }, + { + "epoch": 0.12466466782389143, + "grad_norm": 0.6142413020133972, + "learning_rate": 4.983593974773123e-06, + "loss": 0.6576, + "step": 790 + }, + { + "epoch": 0.1248224712008837, + "grad_norm": 0.6333629488945007, + "learning_rate": 4.983546438885995e-06, + "loss": 0.6708, + "step": 791 + }, + { + "epoch": 0.12498027457787597, + "grad_norm": 0.6965512633323669, + "learning_rate": 4.983498834459033e-06, + "loss": 0.6327, + "step": 792 + }, + { + "epoch": 0.12513807795486823, + "grad_norm": 0.6511685252189636, + "learning_rate": 4.9834511614935525e-06, + "loss": 0.6665, + "step": 793 + }, + { + "epoch": 0.1252958813318605, + "grad_norm": 0.6017680764198303, + "learning_rate": 4.983403419990868e-06, + "loss": 0.6417, + "step": 794 + }, + { + "epoch": 0.12545368470885276, + "grad_norm": 0.6191858649253845, + "learning_rate": 4.983355609952299e-06, + "loss": 0.6361, + "step": 795 + }, + { + "epoch": 0.12561148808584505, + "grad_norm": 0.6632404327392578, + "learning_rate": 4.983307731379162e-06, + "loss": 0.6229, + "step": 796 + }, + { + "epoch": 0.1257692914628373, + "grad_norm": 0.6223204135894775, + "learning_rate": 4.983259784272781e-06, + "loss": 0.6588, + "step": 797 + }, + { + "epoch": 0.12592709483982958, + "grad_norm": 0.6119071841239929, + "learning_rate": 4.983211768634479e-06, + "loss": 0.6279, + "step": 798 + }, + { + "epoch": 0.12608489821682184, + "grad_norm": 0.6045776009559631, + "learning_rate": 4.983163684465579e-06, + "loss": 0.6432, + "step": 799 + }, + { + "epoch": 0.1262427015938141, + "grad_norm": 0.5877479910850525, + "learning_rate": 4.9831155317674105e-06, + "loss": 0.6467, + "step": 800 + }, + { + "epoch": 0.12640050497080638, + "grad_norm": 0.6201174259185791, + "learning_rate": 4.983067310541301e-06, + "loss": 0.6801, + "step": 801 + }, + { + "epoch": 0.12655830834779863, + "grad_norm": 0.6278963685035706, + "learning_rate": 4.983019020788581e-06, + "loss": 0.6642, + "step": 802 + }, + { + "epoch": 0.12671611172479091, + "grad_norm": 0.6208709478378296, + "learning_rate": 4.982970662510583e-06, + "loss": 0.6791, + "step": 803 + }, + { + "epoch": 0.12687391510178317, + "grad_norm": 0.6258389353752136, + "learning_rate": 4.9829222357086435e-06, + "loss": 0.6762, + "step": 804 + }, + { + "epoch": 0.12703171847877545, + "grad_norm": 0.5867360234260559, + "learning_rate": 4.9828737403840975e-06, + "loss": 0.6388, + "step": 805 + }, + { + "epoch": 0.1271895218557677, + "grad_norm": 0.6117899417877197, + "learning_rate": 4.982825176538284e-06, + "loss": 0.6252, + "step": 806 + }, + { + "epoch": 0.12734732523276, + "grad_norm": 0.6776025891304016, + "learning_rate": 4.982776544172543e-06, + "loss": 0.6823, + "step": 807 + }, + { + "epoch": 0.12750512860975224, + "grad_norm": 0.6469349265098572, + "learning_rate": 4.982727843288216e-06, + "loss": 0.6159, + "step": 808 + }, + { + "epoch": 0.12766293198674453, + "grad_norm": 0.6127049326896667, + "learning_rate": 4.982679073886647e-06, + "loss": 0.6186, + "step": 809 + }, + { + "epoch": 0.12782073536373678, + "grad_norm": 0.5773444175720215, + "learning_rate": 4.982630235969183e-06, + "loss": 0.6369, + "step": 810 + }, + { + "epoch": 0.12797853874072906, + "grad_norm": 0.6018272042274475, + "learning_rate": 4.982581329537171e-06, + "loss": 0.6439, + "step": 811 + }, + { + "epoch": 0.12813634211772132, + "grad_norm": 0.6667247414588928, + "learning_rate": 4.982532354591961e-06, + "loss": 0.6074, + "step": 812 + }, + { + "epoch": 0.12829414549471357, + "grad_norm": 0.6107209324836731, + "learning_rate": 4.982483311134905e-06, + "loss": 0.6296, + "step": 813 + }, + { + "epoch": 0.12845194887170586, + "grad_norm": 0.5909243822097778, + "learning_rate": 4.982434199167355e-06, + "loss": 0.6155, + "step": 814 + }, + { + "epoch": 0.1286097522486981, + "grad_norm": 0.6183427572250366, + "learning_rate": 4.982385018690668e-06, + "loss": 0.619, + "step": 815 + }, + { + "epoch": 0.1287675556256904, + "grad_norm": 0.6045028567314148, + "learning_rate": 4.982335769706201e-06, + "loss": 0.6474, + "step": 816 + }, + { + "epoch": 0.12892535900268265, + "grad_norm": 0.6138720512390137, + "learning_rate": 4.9822864522153125e-06, + "loss": 0.6366, + "step": 817 + }, + { + "epoch": 0.12908316237967493, + "grad_norm": 0.609494686126709, + "learning_rate": 4.982237066219363e-06, + "loss": 0.6404, + "step": 818 + }, + { + "epoch": 0.1292409657566672, + "grad_norm": 0.6650531888008118, + "learning_rate": 4.9821876117197175e-06, + "loss": 0.6078, + "step": 819 + }, + { + "epoch": 0.12939876913365947, + "grad_norm": 0.6163921356201172, + "learning_rate": 4.982138088717739e-06, + "loss": 0.6299, + "step": 820 + }, + { + "epoch": 0.12955657251065172, + "grad_norm": 0.6029989719390869, + "learning_rate": 4.982088497214794e-06, + "loss": 0.608, + "step": 821 + }, + { + "epoch": 0.129714375887644, + "grad_norm": 0.6182960867881775, + "learning_rate": 4.982038837212253e-06, + "loss": 0.612, + "step": 822 + }, + { + "epoch": 0.12987217926463626, + "grad_norm": 0.6064902544021606, + "learning_rate": 4.9819891087114854e-06, + "loss": 0.6209, + "step": 823 + }, + { + "epoch": 0.13002998264162854, + "grad_norm": 0.6146350502967834, + "learning_rate": 4.981939311713864e-06, + "loss": 0.6197, + "step": 824 + }, + { + "epoch": 0.1301877860186208, + "grad_norm": 0.5913491249084473, + "learning_rate": 4.981889446220761e-06, + "loss": 0.6144, + "step": 825 + }, + { + "epoch": 0.13034558939561305, + "grad_norm": 0.6101393103599548, + "learning_rate": 4.9818395122335554e-06, + "loss": 0.6066, + "step": 826 + }, + { + "epoch": 0.13050339277260534, + "grad_norm": 0.6417921185493469, + "learning_rate": 4.981789509753624e-06, + "loss": 0.6436, + "step": 827 + }, + { + "epoch": 0.1306611961495976, + "grad_norm": 0.6686950922012329, + "learning_rate": 4.981739438782347e-06, + "loss": 0.5984, + "step": 828 + }, + { + "epoch": 0.13081899952658987, + "grad_norm": 0.5953874588012695, + "learning_rate": 4.9816892993211055e-06, + "loss": 0.6382, + "step": 829 + }, + { + "epoch": 0.13097680290358213, + "grad_norm": 0.6146031618118286, + "learning_rate": 4.9816390913712845e-06, + "loss": 0.6228, + "step": 830 + }, + { + "epoch": 0.1311346062805744, + "grad_norm": 0.5932271480560303, + "learning_rate": 4.981588814934268e-06, + "loss": 0.6604, + "step": 831 + }, + { + "epoch": 0.13129240965756667, + "grad_norm": 0.6322116255760193, + "learning_rate": 4.981538470011445e-06, + "loss": 0.663, + "step": 832 + }, + { + "epoch": 0.13145021303455895, + "grad_norm": 0.6449164152145386, + "learning_rate": 4.981488056604204e-06, + "loss": 0.5973, + "step": 833 + }, + { + "epoch": 0.1316080164115512, + "grad_norm": 0.6179631352424622, + "learning_rate": 4.981437574713937e-06, + "loss": 0.6162, + "step": 834 + }, + { + "epoch": 0.1317658197885435, + "grad_norm": 0.6393401622772217, + "learning_rate": 4.981387024342037e-06, + "loss": 0.6293, + "step": 835 + }, + { + "epoch": 0.13192362316553574, + "grad_norm": 0.6342131495475769, + "learning_rate": 4.981336405489898e-06, + "loss": 0.6105, + "step": 836 + }, + { + "epoch": 0.132081426542528, + "grad_norm": 0.6112942695617676, + "learning_rate": 4.981285718158918e-06, + "loss": 0.6288, + "step": 837 + }, + { + "epoch": 0.13223922991952028, + "grad_norm": 0.600506067276001, + "learning_rate": 4.9812349623504966e-06, + "loss": 0.6413, + "step": 838 + }, + { + "epoch": 0.13239703329651253, + "grad_norm": 0.6295110583305359, + "learning_rate": 4.981184138066033e-06, + "loss": 0.5992, + "step": 839 + }, + { + "epoch": 0.13255483667350482, + "grad_norm": 0.6273886561393738, + "learning_rate": 4.98113324530693e-06, + "loss": 0.6175, + "step": 840 + }, + { + "epoch": 0.13271264005049707, + "grad_norm": 0.592329204082489, + "learning_rate": 4.981082284074593e-06, + "loss": 0.6419, + "step": 841 + }, + { + "epoch": 0.13287044342748935, + "grad_norm": 0.6474609971046448, + "learning_rate": 4.981031254370427e-06, + "loss": 0.6436, + "step": 842 + }, + { + "epoch": 0.1330282468044816, + "grad_norm": 0.6497297883033752, + "learning_rate": 4.980980156195842e-06, + "loss": 0.6181, + "step": 843 + }, + { + "epoch": 0.1331860501814739, + "grad_norm": 0.6021966934204102, + "learning_rate": 4.980928989552249e-06, + "loss": 0.6463, + "step": 844 + }, + { + "epoch": 0.13334385355846615, + "grad_norm": 0.5931944847106934, + "learning_rate": 4.980877754441056e-06, + "loss": 0.6354, + "step": 845 + }, + { + "epoch": 0.13350165693545843, + "grad_norm": 0.6045413613319397, + "learning_rate": 4.98082645086368e-06, + "loss": 0.5986, + "step": 846 + }, + { + "epoch": 0.13365946031245068, + "grad_norm": 0.6363947987556458, + "learning_rate": 4.980775078821537e-06, + "loss": 0.6135, + "step": 847 + }, + { + "epoch": 0.13381726368944297, + "grad_norm": 0.5982140302658081, + "learning_rate": 4.980723638316044e-06, + "loss": 0.648, + "step": 848 + }, + { + "epoch": 0.13397506706643522, + "grad_norm": 0.7558128237724304, + "learning_rate": 4.980672129348621e-06, + "loss": 0.6354, + "step": 849 + }, + { + "epoch": 0.13413287044342748, + "grad_norm": 0.6188845038414001, + "learning_rate": 4.9806205519206875e-06, + "loss": 0.6356, + "step": 850 + }, + { + "epoch": 0.13429067382041976, + "grad_norm": 0.5920409560203552, + "learning_rate": 4.9805689060336694e-06, + "loss": 0.6235, + "step": 851 + }, + { + "epoch": 0.13444847719741201, + "grad_norm": 0.6182528734207153, + "learning_rate": 4.980517191688992e-06, + "loss": 0.643, + "step": 852 + }, + { + "epoch": 0.1346062805744043, + "grad_norm": 0.6087068915367126, + "learning_rate": 4.980465408888081e-06, + "loss": 0.6543, + "step": 853 + }, + { + "epoch": 0.13476408395139655, + "grad_norm": 0.5963150262832642, + "learning_rate": 4.9804135576323666e-06, + "loss": 0.6445, + "step": 854 + }, + { + "epoch": 0.13492188732838883, + "grad_norm": 0.6027868986129761, + "learning_rate": 4.98036163792328e-06, + "loss": 0.6526, + "step": 855 + }, + { + "epoch": 0.1350796907053811, + "grad_norm": 0.5980510115623474, + "learning_rate": 4.980309649762252e-06, + "loss": 0.6521, + "step": 856 + }, + { + "epoch": 0.13523749408237337, + "grad_norm": 0.6804466247558594, + "learning_rate": 4.980257593150719e-06, + "loss": 0.6626, + "step": 857 + }, + { + "epoch": 0.13539529745936563, + "grad_norm": 0.5910912156105042, + "learning_rate": 4.980205468090118e-06, + "loss": 0.6553, + "step": 858 + }, + { + "epoch": 0.1355531008363579, + "grad_norm": 0.6278927326202393, + "learning_rate": 4.980153274581887e-06, + "loss": 0.6174, + "step": 859 + }, + { + "epoch": 0.13571090421335016, + "grad_norm": 0.6183095574378967, + "learning_rate": 4.980101012627467e-06, + "loss": 0.6438, + "step": 860 + }, + { + "epoch": 0.13586870759034245, + "grad_norm": 0.5965843200683594, + "learning_rate": 4.980048682228298e-06, + "loss": 0.6414, + "step": 861 + }, + { + "epoch": 0.1360265109673347, + "grad_norm": 0.6035426259040833, + "learning_rate": 4.979996283385826e-06, + "loss": 0.6377, + "step": 862 + }, + { + "epoch": 0.13618431434432696, + "grad_norm": 0.5912325978279114, + "learning_rate": 4.9799438161014975e-06, + "loss": 0.6566, + "step": 863 + }, + { + "epoch": 0.13634211772131924, + "grad_norm": 0.5959058403968811, + "learning_rate": 4.9798912803767595e-06, + "loss": 0.6475, + "step": 864 + }, + { + "epoch": 0.1364999210983115, + "grad_norm": 0.5672042965888977, + "learning_rate": 4.979838676213063e-06, + "loss": 0.6397, + "step": 865 + }, + { + "epoch": 0.13665772447530378, + "grad_norm": 0.6103313565254211, + "learning_rate": 4.979786003611859e-06, + "loss": 0.6349, + "step": 866 + }, + { + "epoch": 0.13681552785229603, + "grad_norm": 0.5745828747749329, + "learning_rate": 4.9797332625746e-06, + "loss": 0.5901, + "step": 867 + }, + { + "epoch": 0.13697333122928831, + "grad_norm": 0.5945615768432617, + "learning_rate": 4.9796804531027435e-06, + "loss": 0.6534, + "step": 868 + }, + { + "epoch": 0.13713113460628057, + "grad_norm": 0.5940654873847961, + "learning_rate": 4.979627575197745e-06, + "loss": 0.6489, + "step": 869 + }, + { + "epoch": 0.13728893798327285, + "grad_norm": 0.5990129709243774, + "learning_rate": 4.979574628861066e-06, + "loss": 0.6214, + "step": 870 + }, + { + "epoch": 0.1374467413602651, + "grad_norm": 0.5948096513748169, + "learning_rate": 4.9795216140941665e-06, + "loss": 0.6124, + "step": 871 + }, + { + "epoch": 0.1376045447372574, + "grad_norm": 0.6223277449607849, + "learning_rate": 4.979468530898509e-06, + "loss": 0.6355, + "step": 872 + }, + { + "epoch": 0.13776234811424964, + "grad_norm": 0.6466497182846069, + "learning_rate": 4.979415379275559e-06, + "loss": 0.627, + "step": 873 + }, + { + "epoch": 0.1379201514912419, + "grad_norm": 0.5830071568489075, + "learning_rate": 4.979362159226785e-06, + "loss": 0.6254, + "step": 874 + }, + { + "epoch": 0.13807795486823418, + "grad_norm": 0.5977612137794495, + "learning_rate": 4.979308870753653e-06, + "loss": 0.6012, + "step": 875 + }, + { + "epoch": 0.13823575824522644, + "grad_norm": 0.5891567468643188, + "learning_rate": 4.979255513857635e-06, + "loss": 0.66, + "step": 876 + }, + { + "epoch": 0.13839356162221872, + "grad_norm": 0.5959478616714478, + "learning_rate": 4.979202088540203e-06, + "loss": 0.6353, + "step": 877 + }, + { + "epoch": 0.13855136499921097, + "grad_norm": 0.616981565952301, + "learning_rate": 4.979148594802832e-06, + "loss": 0.6426, + "step": 878 + }, + { + "epoch": 0.13870916837620326, + "grad_norm": 0.613396167755127, + "learning_rate": 4.979095032646998e-06, + "loss": 0.6538, + "step": 879 + }, + { + "epoch": 0.1388669717531955, + "grad_norm": 0.5930302739143372, + "learning_rate": 4.97904140207418e-06, + "loss": 0.6457, + "step": 880 + }, + { + "epoch": 0.1390247751301878, + "grad_norm": 0.6025069355964661, + "learning_rate": 4.978987703085856e-06, + "loss": 0.6443, + "step": 881 + }, + { + "epoch": 0.13918257850718005, + "grad_norm": 0.6064319014549255, + "learning_rate": 4.97893393568351e-06, + "loss": 0.6402, + "step": 882 + }, + { + "epoch": 0.13934038188417233, + "grad_norm": 0.5758137106895447, + "learning_rate": 4.9788800998686245e-06, + "loss": 0.637, + "step": 883 + }, + { + "epoch": 0.1394981852611646, + "grad_norm": 0.5761468410491943, + "learning_rate": 4.978826195642686e-06, + "loss": 0.6345, + "step": 884 + }, + { + "epoch": 0.13965598863815687, + "grad_norm": 0.6252012252807617, + "learning_rate": 4.978772223007183e-06, + "loss": 0.6451, + "step": 885 + }, + { + "epoch": 0.13981379201514912, + "grad_norm": 0.6034058928489685, + "learning_rate": 4.978718181963603e-06, + "loss": 0.6087, + "step": 886 + }, + { + "epoch": 0.13997159539214138, + "grad_norm": 0.6625187993049622, + "learning_rate": 4.97866407251344e-06, + "loss": 0.595, + "step": 887 + }, + { + "epoch": 0.14012939876913366, + "grad_norm": 0.651247501373291, + "learning_rate": 4.978609894658184e-06, + "loss": 0.6607, + "step": 888 + }, + { + "epoch": 0.14028720214612592, + "grad_norm": 0.6075007319450378, + "learning_rate": 4.978555648399332e-06, + "loss": 0.6356, + "step": 889 + }, + { + "epoch": 0.1404450055231182, + "grad_norm": 0.600036084651947, + "learning_rate": 4.978501333738381e-06, + "loss": 0.6128, + "step": 890 + }, + { + "epoch": 0.14060280890011045, + "grad_norm": 0.641432523727417, + "learning_rate": 4.97844695067683e-06, + "loss": 0.6251, + "step": 891 + }, + { + "epoch": 0.14076061227710274, + "grad_norm": 0.6460295915603638, + "learning_rate": 4.97839249921618e-06, + "loss": 0.6275, + "step": 892 + }, + { + "epoch": 0.140918415654095, + "grad_norm": 0.5987889170646667, + "learning_rate": 4.978337979357933e-06, + "loss": 0.6136, + "step": 893 + }, + { + "epoch": 0.14107621903108727, + "grad_norm": 0.6143597364425659, + "learning_rate": 4.9782833911035945e-06, + "loss": 0.6592, + "step": 894 + }, + { + "epoch": 0.14123402240807953, + "grad_norm": 0.6144183874130249, + "learning_rate": 4.978228734454671e-06, + "loss": 0.6436, + "step": 895 + }, + { + "epoch": 0.1413918257850718, + "grad_norm": 0.6333827972412109, + "learning_rate": 4.9781740094126695e-06, + "loss": 0.6528, + "step": 896 + }, + { + "epoch": 0.14154962916206407, + "grad_norm": 0.5945299863815308, + "learning_rate": 4.978119215979101e-06, + "loss": 0.5767, + "step": 897 + }, + { + "epoch": 0.14170743253905635, + "grad_norm": 0.6198012828826904, + "learning_rate": 4.978064354155479e-06, + "loss": 0.6593, + "step": 898 + }, + { + "epoch": 0.1418652359160486, + "grad_norm": 0.5852805376052856, + "learning_rate": 4.978009423943316e-06, + "loss": 0.6013, + "step": 899 + }, + { + "epoch": 0.14202303929304086, + "grad_norm": 0.6062827110290527, + "learning_rate": 4.977954425344128e-06, + "loss": 0.628, + "step": 900 + }, + { + "epoch": 0.14218084267003314, + "grad_norm": 0.6049870848655701, + "learning_rate": 4.977899358359434e-06, + "loss": 0.6225, + "step": 901 + }, + { + "epoch": 0.1423386460470254, + "grad_norm": 0.6394222974777222, + "learning_rate": 4.977844222990752e-06, + "loss": 0.6259, + "step": 902 + }, + { + "epoch": 0.14249644942401768, + "grad_norm": 0.5906399488449097, + "learning_rate": 4.977789019239605e-06, + "loss": 0.5934, + "step": 903 + }, + { + "epoch": 0.14265425280100993, + "grad_norm": 0.6267125606536865, + "learning_rate": 4.977733747107516e-06, + "loss": 0.6427, + "step": 904 + }, + { + "epoch": 0.14281205617800222, + "grad_norm": 0.8201143741607666, + "learning_rate": 4.97767840659601e-06, + "loss": 0.6408, + "step": 905 + }, + { + "epoch": 0.14296985955499447, + "grad_norm": 0.607143223285675, + "learning_rate": 4.977622997706615e-06, + "loss": 0.6034, + "step": 906 + }, + { + "epoch": 0.14312766293198675, + "grad_norm": 0.6133495569229126, + "learning_rate": 4.97756752044086e-06, + "loss": 0.6238, + "step": 907 + }, + { + "epoch": 0.143285466308979, + "grad_norm": 0.6592559218406677, + "learning_rate": 4.977511974800276e-06, + "loss": 0.6137, + "step": 908 + }, + { + "epoch": 0.1434432696859713, + "grad_norm": 0.6101793050765991, + "learning_rate": 4.977456360786396e-06, + "loss": 0.6482, + "step": 909 + }, + { + "epoch": 0.14360107306296355, + "grad_norm": 0.6255741119384766, + "learning_rate": 4.977400678400753e-06, + "loss": 0.6409, + "step": 910 + }, + { + "epoch": 0.1437588764399558, + "grad_norm": 0.5958518385887146, + "learning_rate": 4.9773449276448875e-06, + "loss": 0.5838, + "step": 911 + }, + { + "epoch": 0.14391667981694808, + "grad_norm": 0.6139114499092102, + "learning_rate": 4.9772891085203345e-06, + "loss": 0.652, + "step": 912 + }, + { + "epoch": 0.14407448319394034, + "grad_norm": 0.5984649062156677, + "learning_rate": 4.977233221028635e-06, + "loss": 0.6608, + "step": 913 + }, + { + "epoch": 0.14423228657093262, + "grad_norm": 0.6118693351745605, + "learning_rate": 4.977177265171335e-06, + "loss": 0.6444, + "step": 914 + }, + { + "epoch": 0.14439008994792488, + "grad_norm": 0.5934162735939026, + "learning_rate": 4.977121240949974e-06, + "loss": 0.6482, + "step": 915 + }, + { + "epoch": 0.14454789332491716, + "grad_norm": 0.6019640564918518, + "learning_rate": 4.9770651483661e-06, + "loss": 0.6435, + "step": 916 + }, + { + "epoch": 0.14470569670190941, + "grad_norm": 0.5906794667243958, + "learning_rate": 4.977008987421261e-06, + "loss": 0.6012, + "step": 917 + }, + { + "epoch": 0.1448635000789017, + "grad_norm": 0.6129385828971863, + "learning_rate": 4.976952758117007e-06, + "loss": 0.642, + "step": 918 + }, + { + "epoch": 0.14502130345589395, + "grad_norm": 0.6514279246330261, + "learning_rate": 4.976896460454891e-06, + "loss": 0.6405, + "step": 919 + }, + { + "epoch": 0.14517910683288623, + "grad_norm": 0.6453127861022949, + "learning_rate": 4.9768400944364645e-06, + "loss": 0.6186, + "step": 920 + }, + { + "epoch": 0.1453369102098785, + "grad_norm": 0.6324509382247925, + "learning_rate": 4.976783660063284e-06, + "loss": 0.6185, + "step": 921 + }, + { + "epoch": 0.14549471358687077, + "grad_norm": 0.6335612535476685, + "learning_rate": 4.976727157336908e-06, + "loss": 0.6612, + "step": 922 + }, + { + "epoch": 0.14565251696386303, + "grad_norm": 0.6552138924598694, + "learning_rate": 4.976670586258894e-06, + "loss": 0.645, + "step": 923 + }, + { + "epoch": 0.14581032034085528, + "grad_norm": 0.6237993836402893, + "learning_rate": 4.976613946830804e-06, + "loss": 0.6117, + "step": 924 + }, + { + "epoch": 0.14596812371784756, + "grad_norm": 0.6401041150093079, + "learning_rate": 4.9765572390542e-06, + "loss": 0.6496, + "step": 925 + }, + { + "epoch": 0.14612592709483982, + "grad_norm": 0.628280758857727, + "learning_rate": 4.97650046293065e-06, + "loss": 0.6394, + "step": 926 + }, + { + "epoch": 0.1462837304718321, + "grad_norm": 0.5976707935333252, + "learning_rate": 4.9764436184617185e-06, + "loss": 0.6204, + "step": 927 + }, + { + "epoch": 0.14644153384882436, + "grad_norm": 0.5820045471191406, + "learning_rate": 4.976386705648975e-06, + "loss": 0.6472, + "step": 928 + }, + { + "epoch": 0.14659933722581664, + "grad_norm": 0.600651741027832, + "learning_rate": 4.97632972449399e-06, + "loss": 0.6276, + "step": 929 + }, + { + "epoch": 0.1467571406028089, + "grad_norm": 0.6363376379013062, + "learning_rate": 4.976272674998335e-06, + "loss": 0.654, + "step": 930 + }, + { + "epoch": 0.14691494397980118, + "grad_norm": 0.6553608179092407, + "learning_rate": 4.976215557163587e-06, + "loss": 0.6343, + "step": 931 + }, + { + "epoch": 0.14707274735679343, + "grad_norm": 0.6423518061637878, + "learning_rate": 4.976158370991319e-06, + "loss": 0.6333, + "step": 932 + }, + { + "epoch": 0.14723055073378571, + "grad_norm": 0.602048933506012, + "learning_rate": 4.976101116483113e-06, + "loss": 0.6443, + "step": 933 + }, + { + "epoch": 0.14738835411077797, + "grad_norm": 0.6035428047180176, + "learning_rate": 4.976043793640545e-06, + "loss": 0.6408, + "step": 934 + }, + { + "epoch": 0.14754615748777025, + "grad_norm": 0.610040545463562, + "learning_rate": 4.9759864024652005e-06, + "loss": 0.6232, + "step": 935 + }, + { + "epoch": 0.1477039608647625, + "grad_norm": 0.6229634284973145, + "learning_rate": 4.975928942958661e-06, + "loss": 0.6459, + "step": 936 + }, + { + "epoch": 0.14786176424175476, + "grad_norm": 0.5759377479553223, + "learning_rate": 4.9758714151225126e-06, + "loss": 0.6336, + "step": 937 + }, + { + "epoch": 0.14801956761874704, + "grad_norm": 0.6177905797958374, + "learning_rate": 4.975813818958345e-06, + "loss": 0.6443, + "step": 938 + }, + { + "epoch": 0.1481773709957393, + "grad_norm": 0.6076806783676147, + "learning_rate": 4.975756154467745e-06, + "loss": 0.6223, + "step": 939 + }, + { + "epoch": 0.14833517437273158, + "grad_norm": 0.5796939730644226, + "learning_rate": 4.975698421652305e-06, + "loss": 0.66, + "step": 940 + }, + { + "epoch": 0.14849297774972384, + "grad_norm": 0.6356583833694458, + "learning_rate": 4.975640620513619e-06, + "loss": 0.6446, + "step": 941 + }, + { + "epoch": 0.14865078112671612, + "grad_norm": 0.6030867695808411, + "learning_rate": 4.975582751053282e-06, + "loss": 0.6497, + "step": 942 + }, + { + "epoch": 0.14880858450370837, + "grad_norm": 0.643054187297821, + "learning_rate": 4.9755248132728905e-06, + "loss": 0.6109, + "step": 943 + }, + { + "epoch": 0.14896638788070066, + "grad_norm": 0.6281759738922119, + "learning_rate": 4.975466807174043e-06, + "loss": 0.5939, + "step": 944 + }, + { + "epoch": 0.1491241912576929, + "grad_norm": 0.6281636953353882, + "learning_rate": 4.975408732758341e-06, + "loss": 0.6513, + "step": 945 + }, + { + "epoch": 0.1492819946346852, + "grad_norm": 0.6514776945114136, + "learning_rate": 4.975350590027387e-06, + "loss": 0.6068, + "step": 946 + }, + { + "epoch": 0.14943979801167745, + "grad_norm": 0.6355276107788086, + "learning_rate": 4.975292378982786e-06, + "loss": 0.6754, + "step": 947 + }, + { + "epoch": 0.1495976013886697, + "grad_norm": 0.6436530351638794, + "learning_rate": 4.975234099626144e-06, + "loss": 0.624, + "step": 948 + }, + { + "epoch": 0.149755404765662, + "grad_norm": 0.6209598183631897, + "learning_rate": 4.97517575195907e-06, + "loss": 0.6198, + "step": 949 + }, + { + "epoch": 0.14991320814265424, + "grad_norm": 0.6201461553573608, + "learning_rate": 4.975117335983174e-06, + "loss": 0.6411, + "step": 950 + }, + { + "epoch": 0.15007101151964652, + "grad_norm": 0.62816321849823, + "learning_rate": 4.975058851700068e-06, + "loss": 0.6163, + "step": 951 + }, + { + "epoch": 0.15022881489663878, + "grad_norm": 0.6164888143539429, + "learning_rate": 4.975000299111365e-06, + "loss": 0.6129, + "step": 952 + }, + { + "epoch": 0.15038661827363106, + "grad_norm": 0.6275198459625244, + "learning_rate": 4.974941678218682e-06, + "loss": 0.5749, + "step": 953 + }, + { + "epoch": 0.15054442165062332, + "grad_norm": 0.6391667127609253, + "learning_rate": 4.974882989023637e-06, + "loss": 0.6223, + "step": 954 + }, + { + "epoch": 0.1507022250276156, + "grad_norm": 0.6131268739700317, + "learning_rate": 4.97482423152785e-06, + "loss": 0.6697, + "step": 955 + }, + { + "epoch": 0.15086002840460785, + "grad_norm": 0.597704291343689, + "learning_rate": 4.974765405732942e-06, + "loss": 0.6313, + "step": 956 + }, + { + "epoch": 0.15101783178160014, + "grad_norm": 0.6171226501464844, + "learning_rate": 4.974706511640536e-06, + "loss": 0.6328, + "step": 957 + }, + { + "epoch": 0.1511756351585924, + "grad_norm": 0.6155167818069458, + "learning_rate": 4.974647549252258e-06, + "loss": 0.6412, + "step": 958 + }, + { + "epoch": 0.15133343853558467, + "grad_norm": 0.6604779958724976, + "learning_rate": 4.974588518569734e-06, + "loss": 0.6026, + "step": 959 + }, + { + "epoch": 0.15149124191257693, + "grad_norm": 0.5766696333885193, + "learning_rate": 4.974529419594595e-06, + "loss": 0.6166, + "step": 960 + }, + { + "epoch": 0.15164904528956918, + "grad_norm": 0.6586501002311707, + "learning_rate": 4.9744702523284705e-06, + "loss": 0.6359, + "step": 961 + }, + { + "epoch": 0.15180684866656147, + "grad_norm": 0.6245135068893433, + "learning_rate": 4.974411016772994e-06, + "loss": 0.6078, + "step": 962 + }, + { + "epoch": 0.15196465204355372, + "grad_norm": 0.5948320031166077, + "learning_rate": 4.9743517129298e-06, + "loss": 0.6428, + "step": 963 + }, + { + "epoch": 0.152122455420546, + "grad_norm": 0.6047842502593994, + "learning_rate": 4.9742923408005255e-06, + "loss": 0.6529, + "step": 964 + }, + { + "epoch": 0.15228025879753826, + "grad_norm": 0.601737916469574, + "learning_rate": 4.974232900386809e-06, + "loss": 0.6495, + "step": 965 + }, + { + "epoch": 0.15243806217453054, + "grad_norm": 0.6277657151222229, + "learning_rate": 4.974173391690291e-06, + "loss": 0.6332, + "step": 966 + }, + { + "epoch": 0.1525958655515228, + "grad_norm": 0.6098552346229553, + "learning_rate": 4.974113814712613e-06, + "loss": 0.6311, + "step": 967 + }, + { + "epoch": 0.15275366892851508, + "grad_norm": 0.6191047430038452, + "learning_rate": 4.97405416945542e-06, + "loss": 0.636, + "step": 968 + }, + { + "epoch": 0.15291147230550733, + "grad_norm": 0.6566717624664307, + "learning_rate": 4.973994455920357e-06, + "loss": 0.6795, + "step": 969 + }, + { + "epoch": 0.15306927568249962, + "grad_norm": 0.6293083429336548, + "learning_rate": 4.973934674109073e-06, + "loss": 0.6056, + "step": 970 + }, + { + "epoch": 0.15322707905949187, + "grad_norm": 0.6003572344779968, + "learning_rate": 4.973874824023219e-06, + "loss": 0.633, + "step": 971 + }, + { + "epoch": 0.15338488243648415, + "grad_norm": 0.5949231386184692, + "learning_rate": 4.973814905664444e-06, + "loss": 0.6272, + "step": 972 + }, + { + "epoch": 0.1535426858134764, + "grad_norm": 0.5774254202842712, + "learning_rate": 4.973754919034403e-06, + "loss": 0.5922, + "step": 973 + }, + { + "epoch": 0.15370048919046866, + "grad_norm": 0.5895035862922668, + "learning_rate": 4.9736948641347514e-06, + "loss": 0.6234, + "step": 974 + }, + { + "epoch": 0.15385829256746095, + "grad_norm": 0.6236995458602905, + "learning_rate": 4.973634740967147e-06, + "loss": 0.6375, + "step": 975 + }, + { + "epoch": 0.1540160959444532, + "grad_norm": 0.6114476919174194, + "learning_rate": 4.973574549533248e-06, + "loss": 0.6521, + "step": 976 + }, + { + "epoch": 0.15417389932144548, + "grad_norm": 0.615924596786499, + "learning_rate": 4.973514289834717e-06, + "loss": 0.6555, + "step": 977 + }, + { + "epoch": 0.15433170269843774, + "grad_norm": 0.6343036890029907, + "learning_rate": 4.973453961873215e-06, + "loss": 0.6467, + "step": 978 + }, + { + "epoch": 0.15448950607543002, + "grad_norm": 0.59037846326828, + "learning_rate": 4.973393565650409e-06, + "loss": 0.6285, + "step": 979 + }, + { + "epoch": 0.15464730945242228, + "grad_norm": 0.6005411744117737, + "learning_rate": 4.973333101167963e-06, + "loss": 0.6272, + "step": 980 + }, + { + "epoch": 0.15480511282941456, + "grad_norm": 0.6691623330116272, + "learning_rate": 4.97327256842755e-06, + "loss": 0.6342, + "step": 981 + }, + { + "epoch": 0.15496291620640681, + "grad_norm": 0.6190291047096252, + "learning_rate": 4.9732119674308365e-06, + "loss": 0.6374, + "step": 982 + }, + { + "epoch": 0.1551207195833991, + "grad_norm": 0.6148681044578552, + "learning_rate": 4.9731512981794975e-06, + "loss": 0.6665, + "step": 983 + }, + { + "epoch": 0.15527852296039135, + "grad_norm": 0.6379438638687134, + "learning_rate": 4.973090560675205e-06, + "loss": 0.6314, + "step": 984 + }, + { + "epoch": 0.1554363263373836, + "grad_norm": 0.6040390133857727, + "learning_rate": 4.973029754919637e-06, + "loss": 0.6069, + "step": 985 + }, + { + "epoch": 0.1555941297143759, + "grad_norm": 0.6074916124343872, + "learning_rate": 4.9729688809144714e-06, + "loss": 0.6381, + "step": 986 + }, + { + "epoch": 0.15575193309136814, + "grad_norm": 0.5984866619110107, + "learning_rate": 4.972907938661389e-06, + "loss": 0.6444, + "step": 987 + }, + { + "epoch": 0.15590973646836043, + "grad_norm": 0.6340929865837097, + "learning_rate": 4.972846928162069e-06, + "loss": 0.6584, + "step": 988 + }, + { + "epoch": 0.15606753984535268, + "grad_norm": 0.619951069355011, + "learning_rate": 4.972785849418197e-06, + "loss": 0.6261, + "step": 989 + }, + { + "epoch": 0.15622534322234496, + "grad_norm": 0.6158046722412109, + "learning_rate": 4.972724702431458e-06, + "loss": 0.6284, + "step": 990 + }, + { + "epoch": 0.15638314659933722, + "grad_norm": 0.5828391909599304, + "learning_rate": 4.97266348720354e-06, + "loss": 0.6168, + "step": 991 + }, + { + "epoch": 0.1565409499763295, + "grad_norm": 0.5683507323265076, + "learning_rate": 4.9726022037361336e-06, + "loss": 0.629, + "step": 992 + }, + { + "epoch": 0.15669875335332176, + "grad_norm": 0.6311944127082825, + "learning_rate": 4.9725408520309275e-06, + "loss": 0.6586, + "step": 993 + }, + { + "epoch": 0.15685655673031404, + "grad_norm": 0.6190201640129089, + "learning_rate": 4.972479432089616e-06, + "loss": 0.6223, + "step": 994 + }, + { + "epoch": 0.1570143601073063, + "grad_norm": 0.5934759974479675, + "learning_rate": 4.972417943913895e-06, + "loss": 0.6241, + "step": 995 + }, + { + "epoch": 0.15717216348429858, + "grad_norm": 0.6098936796188354, + "learning_rate": 4.97235638750546e-06, + "loss": 0.6091, + "step": 996 + }, + { + "epoch": 0.15732996686129083, + "grad_norm": 0.6114173531532288, + "learning_rate": 4.97229476286601e-06, + "loss": 0.616, + "step": 997 + }, + { + "epoch": 0.1574877702382831, + "grad_norm": 0.6004275679588318, + "learning_rate": 4.972233069997247e-06, + "loss": 0.6538, + "step": 998 + }, + { + "epoch": 0.15764557361527537, + "grad_norm": 0.605111837387085, + "learning_rate": 4.972171308900872e-06, + "loss": 0.6537, + "step": 999 + }, + { + "epoch": 0.15780337699226762, + "grad_norm": 0.6355132460594177, + "learning_rate": 4.972109479578591e-06, + "loss": 0.6583, + "step": 1000 + }, + { + "epoch": 0.1579611803692599, + "grad_norm": 0.6122344136238098, + "learning_rate": 4.9720475820321085e-06, + "loss": 0.5901, + "step": 1001 + }, + { + "epoch": 0.15811898374625216, + "grad_norm": 0.6062139272689819, + "learning_rate": 4.9719856162631345e-06, + "loss": 0.6393, + "step": 1002 + }, + { + "epoch": 0.15827678712324444, + "grad_norm": 0.5970062017440796, + "learning_rate": 4.971923582273377e-06, + "loss": 0.6158, + "step": 1003 + }, + { + "epoch": 0.1584345905002367, + "grad_norm": 0.5901026129722595, + "learning_rate": 4.9718614800645505e-06, + "loss": 0.6295, + "step": 1004 + }, + { + "epoch": 0.15859239387722898, + "grad_norm": 0.598991334438324, + "learning_rate": 4.9717993096383676e-06, + "loss": 0.5783, + "step": 1005 + }, + { + "epoch": 0.15875019725422124, + "grad_norm": 0.6183978915214539, + "learning_rate": 4.971737070996544e-06, + "loss": 0.6292, + "step": 1006 + }, + { + "epoch": 0.15890800063121352, + "grad_norm": 0.5984471440315247, + "learning_rate": 4.971674764140797e-06, + "loss": 0.6156, + "step": 1007 + }, + { + "epoch": 0.15906580400820577, + "grad_norm": 0.6243711709976196, + "learning_rate": 4.971612389072847e-06, + "loss": 0.6247, + "step": 1008 + }, + { + "epoch": 0.15922360738519803, + "grad_norm": 0.6154190301895142, + "learning_rate": 4.971549945794415e-06, + "loss": 0.6263, + "step": 1009 + }, + { + "epoch": 0.1593814107621903, + "grad_norm": 0.6181488633155823, + "learning_rate": 4.9714874343072245e-06, + "loss": 0.6453, + "step": 1010 + }, + { + "epoch": 0.15953921413918257, + "grad_norm": 0.619059681892395, + "learning_rate": 4.971424854613e-06, + "loss": 0.6543, + "step": 1011 + }, + { + "epoch": 0.15969701751617485, + "grad_norm": 0.5748116374015808, + "learning_rate": 4.971362206713469e-06, + "loss": 0.6165, + "step": 1012 + }, + { + "epoch": 0.1598548208931671, + "grad_norm": 0.5885348320007324, + "learning_rate": 4.971299490610361e-06, + "loss": 0.6403, + "step": 1013 + }, + { + "epoch": 0.1600126242701594, + "grad_norm": 0.6201492547988892, + "learning_rate": 4.971236706305405e-06, + "loss": 0.6606, + "step": 1014 + }, + { + "epoch": 0.16017042764715164, + "grad_norm": 0.5799436569213867, + "learning_rate": 4.971173853800336e-06, + "loss": 0.6375, + "step": 1015 + }, + { + "epoch": 0.16032823102414392, + "grad_norm": 0.6224368810653687, + "learning_rate": 4.9711109330968865e-06, + "loss": 0.6229, + "step": 1016 + }, + { + "epoch": 0.16048603440113618, + "grad_norm": 0.6631140112876892, + "learning_rate": 4.971047944196795e-06, + "loss": 0.6302, + "step": 1017 + }, + { + "epoch": 0.16064383777812846, + "grad_norm": 0.6409574747085571, + "learning_rate": 4.9709848871017975e-06, + "loss": 0.6334, + "step": 1018 + }, + { + "epoch": 0.16080164115512072, + "grad_norm": 0.6317877769470215, + "learning_rate": 4.970921761813637e-06, + "loss": 0.6105, + "step": 1019 + }, + { + "epoch": 0.160959444532113, + "grad_norm": 0.6144220232963562, + "learning_rate": 4.9708585683340535e-06, + "loss": 0.6691, + "step": 1020 + }, + { + "epoch": 0.16111724790910525, + "grad_norm": 0.6154007315635681, + "learning_rate": 4.970795306664791e-06, + "loss": 0.6169, + "step": 1021 + }, + { + "epoch": 0.1612750512860975, + "grad_norm": 0.5965768098831177, + "learning_rate": 4.9707319768075964e-06, + "loss": 0.5966, + "step": 1022 + }, + { + "epoch": 0.1614328546630898, + "grad_norm": 0.616767942905426, + "learning_rate": 4.9706685787642175e-06, + "loss": 0.6207, + "step": 1023 + }, + { + "epoch": 0.16159065804008205, + "grad_norm": 0.6292237043380737, + "learning_rate": 4.970605112536403e-06, + "loss": 0.6211, + "step": 1024 + }, + { + "epoch": 0.16174846141707433, + "grad_norm": 0.6417022943496704, + "learning_rate": 4.970541578125905e-06, + "loss": 0.6191, + "step": 1025 + }, + { + "epoch": 0.16190626479406658, + "grad_norm": 0.6100021004676819, + "learning_rate": 4.970477975534476e-06, + "loss": 0.6485, + "step": 1026 + }, + { + "epoch": 0.16206406817105887, + "grad_norm": 0.618932843208313, + "learning_rate": 4.970414304763872e-06, + "loss": 0.612, + "step": 1027 + }, + { + "epoch": 0.16222187154805112, + "grad_norm": 0.6187143921852112, + "learning_rate": 4.970350565815851e-06, + "loss": 0.6242, + "step": 1028 + }, + { + "epoch": 0.1623796749250434, + "grad_norm": 0.607555091381073, + "learning_rate": 4.970286758692171e-06, + "loss": 0.6458, + "step": 1029 + }, + { + "epoch": 0.16253747830203566, + "grad_norm": 0.6003580093383789, + "learning_rate": 4.970222883394593e-06, + "loss": 0.6288, + "step": 1030 + }, + { + "epoch": 0.16269528167902794, + "grad_norm": 0.6334800720214844, + "learning_rate": 4.970158939924881e-06, + "loss": 0.6187, + "step": 1031 + }, + { + "epoch": 0.1628530850560202, + "grad_norm": 0.6411580443382263, + "learning_rate": 4.970094928284797e-06, + "loss": 0.6269, + "step": 1032 + }, + { + "epoch": 0.16301088843301248, + "grad_norm": 0.5795535445213318, + "learning_rate": 4.97003084847611e-06, + "loss": 0.6381, + "step": 1033 + }, + { + "epoch": 0.16316869181000473, + "grad_norm": 0.6224144101142883, + "learning_rate": 4.969966700500588e-06, + "loss": 0.6302, + "step": 1034 + }, + { + "epoch": 0.163326495186997, + "grad_norm": 0.6337095499038696, + "learning_rate": 4.969902484360001e-06, + "loss": 0.6571, + "step": 1035 + }, + { + "epoch": 0.16348429856398927, + "grad_norm": 0.6199482679367065, + "learning_rate": 4.969838200056119e-06, + "loss": 0.6498, + "step": 1036 + }, + { + "epoch": 0.16364210194098153, + "grad_norm": 0.5945393443107605, + "learning_rate": 4.969773847590721e-06, + "loss": 0.6316, + "step": 1037 + }, + { + "epoch": 0.1637999053179738, + "grad_norm": 0.5756955146789551, + "learning_rate": 4.969709426965579e-06, + "loss": 0.6039, + "step": 1038 + }, + { + "epoch": 0.16395770869496606, + "grad_norm": 0.6047154068946838, + "learning_rate": 4.969644938182472e-06, + "loss": 0.6106, + "step": 1039 + }, + { + "epoch": 0.16411551207195835, + "grad_norm": 0.5850980877876282, + "learning_rate": 4.96958038124318e-06, + "loss": 0.5918, + "step": 1040 + }, + { + "epoch": 0.1642733154489506, + "grad_norm": 0.5983694195747375, + "learning_rate": 4.969515756149485e-06, + "loss": 0.6068, + "step": 1041 + }, + { + "epoch": 0.16443111882594288, + "grad_norm": 0.5937641859054565, + "learning_rate": 4.9694510629031685e-06, + "loss": 0.6428, + "step": 1042 + }, + { + "epoch": 0.16458892220293514, + "grad_norm": 0.6369004249572754, + "learning_rate": 4.969386301506019e-06, + "loss": 0.6536, + "step": 1043 + }, + { + "epoch": 0.16474672557992742, + "grad_norm": 0.6393812894821167, + "learning_rate": 4.969321471959822e-06, + "loss": 0.6118, + "step": 1044 + }, + { + "epoch": 0.16490452895691968, + "grad_norm": 0.612896203994751, + "learning_rate": 4.969256574266366e-06, + "loss": 0.6287, + "step": 1045 + }, + { + "epoch": 0.16506233233391193, + "grad_norm": 0.6343592405319214, + "learning_rate": 4.969191608427442e-06, + "loss": 0.6121, + "step": 1046 + }, + { + "epoch": 0.16522013571090421, + "grad_norm": 0.6181201934814453, + "learning_rate": 4.969126574444844e-06, + "loss": 0.6304, + "step": 1047 + }, + { + "epoch": 0.16537793908789647, + "grad_norm": 0.6072591543197632, + "learning_rate": 4.969061472320368e-06, + "loss": 0.6125, + "step": 1048 + }, + { + "epoch": 0.16553574246488875, + "grad_norm": 0.6324918270111084, + "learning_rate": 4.968996302055807e-06, + "loss": 0.5662, + "step": 1049 + }, + { + "epoch": 0.165693545841881, + "grad_norm": 0.6514537930488586, + "learning_rate": 4.968931063652963e-06, + "loss": 0.5973, + "step": 1050 + }, + { + "epoch": 0.1658513492188733, + "grad_norm": 0.604461669921875, + "learning_rate": 4.968865757113635e-06, + "loss": 0.6173, + "step": 1051 + }, + { + "epoch": 0.16600915259586554, + "grad_norm": 0.6158382296562195, + "learning_rate": 4.968800382439624e-06, + "loss": 0.6062, + "step": 1052 + }, + { + "epoch": 0.16616695597285783, + "grad_norm": 0.6335897445678711, + "learning_rate": 4.968734939632736e-06, + "loss": 0.6663, + "step": 1053 + }, + { + "epoch": 0.16632475934985008, + "grad_norm": 0.6048577427864075, + "learning_rate": 4.968669428694777e-06, + "loss": 0.6441, + "step": 1054 + }, + { + "epoch": 0.16648256272684236, + "grad_norm": 0.6199697852134705, + "learning_rate": 4.968603849627555e-06, + "loss": 0.5728, + "step": 1055 + }, + { + "epoch": 0.16664036610383462, + "grad_norm": 0.6430922150611877, + "learning_rate": 4.968538202432879e-06, + "loss": 0.6478, + "step": 1056 + }, + { + "epoch": 0.1667981694808269, + "grad_norm": 0.6558423042297363, + "learning_rate": 4.96847248711256e-06, + "loss": 0.5948, + "step": 1057 + }, + { + "epoch": 0.16695597285781916, + "grad_norm": 0.6181206703186035, + "learning_rate": 4.968406703668413e-06, + "loss": 0.6589, + "step": 1058 + }, + { + "epoch": 0.1671137762348114, + "grad_norm": 0.6207762360572815, + "learning_rate": 4.9683408521022546e-06, + "loss": 0.6157, + "step": 1059 + }, + { + "epoch": 0.1672715796118037, + "grad_norm": 0.6136142611503601, + "learning_rate": 4.968274932415899e-06, + "loss": 0.6543, + "step": 1060 + }, + { + "epoch": 0.16742938298879595, + "grad_norm": 0.6110532283782959, + "learning_rate": 4.968208944611168e-06, + "loss": 0.6372, + "step": 1061 + }, + { + "epoch": 0.16758718636578823, + "grad_norm": 0.5790903568267822, + "learning_rate": 4.968142888689881e-06, + "loss": 0.6049, + "step": 1062 + }, + { + "epoch": 0.1677449897427805, + "grad_norm": 0.6146594882011414, + "learning_rate": 4.968076764653862e-06, + "loss": 0.6582, + "step": 1063 + }, + { + "epoch": 0.16790279311977277, + "grad_norm": 0.603947639465332, + "learning_rate": 4.968010572504935e-06, + "loss": 0.6202, + "step": 1064 + }, + { + "epoch": 0.16806059649676502, + "grad_norm": 0.6598784923553467, + "learning_rate": 4.967944312244929e-06, + "loss": 0.6232, + "step": 1065 + }, + { + "epoch": 0.1682183998737573, + "grad_norm": 0.6231120824813843, + "learning_rate": 4.96787798387567e-06, + "loss": 0.6696, + "step": 1066 + }, + { + "epoch": 0.16837620325074956, + "grad_norm": 0.6665984392166138, + "learning_rate": 4.967811587398989e-06, + "loss": 0.6168, + "step": 1067 + }, + { + "epoch": 0.16853400662774184, + "grad_norm": 0.6319500207901001, + "learning_rate": 4.967745122816719e-06, + "loss": 0.6524, + "step": 1068 + }, + { + "epoch": 0.1686918100047341, + "grad_norm": 0.6346251964569092, + "learning_rate": 4.967678590130694e-06, + "loss": 0.6465, + "step": 1069 + }, + { + "epoch": 0.16884961338172638, + "grad_norm": 0.6036311388015747, + "learning_rate": 4.967611989342751e-06, + "loss": 0.6032, + "step": 1070 + }, + { + "epoch": 0.16900741675871864, + "grad_norm": 0.6098418235778809, + "learning_rate": 4.967545320454727e-06, + "loss": 0.62, + "step": 1071 + }, + { + "epoch": 0.1691652201357109, + "grad_norm": 0.6033768057823181, + "learning_rate": 4.967478583468462e-06, + "loss": 0.6275, + "step": 1072 + }, + { + "epoch": 0.16932302351270317, + "grad_norm": 0.6348797082901001, + "learning_rate": 4.9674117783857975e-06, + "loss": 0.6423, + "step": 1073 + }, + { + "epoch": 0.16948082688969543, + "grad_norm": 0.5918552875518799, + "learning_rate": 4.967344905208579e-06, + "loss": 0.6208, + "step": 1074 + }, + { + "epoch": 0.1696386302666877, + "grad_norm": 0.5937137007713318, + "learning_rate": 4.967277963938649e-06, + "loss": 0.6575, + "step": 1075 + }, + { + "epoch": 0.16979643364367997, + "grad_norm": 0.6271960735321045, + "learning_rate": 4.967210954577857e-06, + "loss": 0.6007, + "step": 1076 + }, + { + "epoch": 0.16995423702067225, + "grad_norm": 0.6580274701118469, + "learning_rate": 4.967143877128052e-06, + "loss": 0.6267, + "step": 1077 + }, + { + "epoch": 0.1701120403976645, + "grad_norm": 0.5992641448974609, + "learning_rate": 4.967076731591085e-06, + "loss": 0.6489, + "step": 1078 + }, + { + "epoch": 0.1702698437746568, + "grad_norm": 0.6319767236709595, + "learning_rate": 4.96700951796881e-06, + "loss": 0.6148, + "step": 1079 + }, + { + "epoch": 0.17042764715164904, + "grad_norm": 0.6085304021835327, + "learning_rate": 4.966942236263079e-06, + "loss": 0.6112, + "step": 1080 + }, + { + "epoch": 0.17058545052864132, + "grad_norm": 0.5905590653419495, + "learning_rate": 4.9668748864757526e-06, + "loss": 0.6596, + "step": 1081 + }, + { + "epoch": 0.17074325390563358, + "grad_norm": 0.6327752470970154, + "learning_rate": 4.966807468608687e-06, + "loss": 0.6236, + "step": 1082 + }, + { + "epoch": 0.17090105728262583, + "grad_norm": 0.5889350175857544, + "learning_rate": 4.966739982663743e-06, + "loss": 0.6261, + "step": 1083 + }, + { + "epoch": 0.17105886065961812, + "grad_norm": 0.650726318359375, + "learning_rate": 4.966672428642785e-06, + "loss": 0.629, + "step": 1084 + }, + { + "epoch": 0.17121666403661037, + "grad_norm": 0.6258044838905334, + "learning_rate": 4.9666048065476744e-06, + "loss": 0.5835, + "step": 1085 + }, + { + "epoch": 0.17137446741360265, + "grad_norm": 0.6128823161125183, + "learning_rate": 4.96653711638028e-06, + "loss": 0.6415, + "step": 1086 + }, + { + "epoch": 0.1715322707905949, + "grad_norm": 0.6273114681243896, + "learning_rate": 4.966469358142467e-06, + "loss": 0.6648, + "step": 1087 + }, + { + "epoch": 0.1716900741675872, + "grad_norm": 0.6097704172134399, + "learning_rate": 4.966401531836109e-06, + "loss": 0.638, + "step": 1088 + }, + { + "epoch": 0.17184787754457945, + "grad_norm": 0.6029524207115173, + "learning_rate": 4.966333637463075e-06, + "loss": 0.616, + "step": 1089 + }, + { + "epoch": 0.17200568092157173, + "grad_norm": 0.5991334319114685, + "learning_rate": 4.96626567502524e-06, + "loss": 0.633, + "step": 1090 + }, + { + "epoch": 0.17216348429856398, + "grad_norm": 0.6087436079978943, + "learning_rate": 4.966197644524479e-06, + "loss": 0.6238, + "step": 1091 + }, + { + "epoch": 0.17232128767555627, + "grad_norm": 0.6344772577285767, + "learning_rate": 4.96612954596267e-06, + "loss": 0.6543, + "step": 1092 + }, + { + "epoch": 0.17247909105254852, + "grad_norm": 0.6010157465934753, + "learning_rate": 4.966061379341692e-06, + "loss": 0.6747, + "step": 1093 + }, + { + "epoch": 0.1726368944295408, + "grad_norm": 0.6436886191368103, + "learning_rate": 4.965993144663426e-06, + "loss": 0.6359, + "step": 1094 + }, + { + "epoch": 0.17279469780653306, + "grad_norm": 0.6059756278991699, + "learning_rate": 4.965924841929756e-06, + "loss": 0.6113, + "step": 1095 + }, + { + "epoch": 0.17295250118352531, + "grad_norm": 0.6224124431610107, + "learning_rate": 4.965856471142565e-06, + "loss": 0.631, + "step": 1096 + }, + { + "epoch": 0.1731103045605176, + "grad_norm": 0.5911850929260254, + "learning_rate": 4.965788032303743e-06, + "loss": 0.6348, + "step": 1097 + }, + { + "epoch": 0.17326810793750985, + "grad_norm": 0.6185610890388489, + "learning_rate": 4.965719525415177e-06, + "loss": 0.6469, + "step": 1098 + }, + { + "epoch": 0.17342591131450213, + "grad_norm": 0.615892767906189, + "learning_rate": 4.9656509504787565e-06, + "loss": 0.6455, + "step": 1099 + }, + { + "epoch": 0.1735837146914944, + "grad_norm": 0.6445240378379822, + "learning_rate": 4.9655823074963755e-06, + "loss": 0.6283, + "step": 1100 + }, + { + "epoch": 0.17374151806848667, + "grad_norm": 0.5901970267295837, + "learning_rate": 4.965513596469929e-06, + "loss": 0.6408, + "step": 1101 + }, + { + "epoch": 0.17389932144547893, + "grad_norm": 0.6214928030967712, + "learning_rate": 4.965444817401311e-06, + "loss": 0.5749, + "step": 1102 + }, + { + "epoch": 0.1740571248224712, + "grad_norm": 0.6416847705841064, + "learning_rate": 4.965375970292421e-06, + "loss": 0.6442, + "step": 1103 + }, + { + "epoch": 0.17421492819946346, + "grad_norm": 0.6378587484359741, + "learning_rate": 4.96530705514516e-06, + "loss": 0.6051, + "step": 1104 + }, + { + "epoch": 0.17437273157645575, + "grad_norm": 0.5863860249519348, + "learning_rate": 4.965238071961428e-06, + "loss": 0.6379, + "step": 1105 + }, + { + "epoch": 0.174530534953448, + "grad_norm": 0.6125226616859436, + "learning_rate": 4.965169020743129e-06, + "loss": 0.6271, + "step": 1106 + }, + { + "epoch": 0.17468833833044028, + "grad_norm": 0.6395373940467834, + "learning_rate": 4.965099901492171e-06, + "loss": 0.6059, + "step": 1107 + }, + { + "epoch": 0.17484614170743254, + "grad_norm": 0.6199135780334473, + "learning_rate": 4.965030714210459e-06, + "loss": 0.6431, + "step": 1108 + }, + { + "epoch": 0.1750039450844248, + "grad_norm": 0.6075945496559143, + "learning_rate": 4.964961458899903e-06, + "loss": 0.5995, + "step": 1109 + }, + { + "epoch": 0.17516174846141708, + "grad_norm": 0.6155432462692261, + "learning_rate": 4.964892135562414e-06, + "loss": 0.5809, + "step": 1110 + }, + { + "epoch": 0.17531955183840933, + "grad_norm": 0.6291399002075195, + "learning_rate": 4.964822744199906e-06, + "loss": 0.5779, + "step": 1111 + }, + { + "epoch": 0.17547735521540161, + "grad_norm": 0.6273796558380127, + "learning_rate": 4.964753284814294e-06, + "loss": 0.6396, + "step": 1112 + }, + { + "epoch": 0.17563515859239387, + "grad_norm": 0.6320323348045349, + "learning_rate": 4.964683757407494e-06, + "loss": 0.6455, + "step": 1113 + }, + { + "epoch": 0.17579296196938615, + "grad_norm": 0.6115767955780029, + "learning_rate": 4.964614161981426e-06, + "loss": 0.6378, + "step": 1114 + }, + { + "epoch": 0.1759507653463784, + "grad_norm": 0.5976405739784241, + "learning_rate": 4.964544498538009e-06, + "loss": 0.6677, + "step": 1115 + }, + { + "epoch": 0.1761085687233707, + "grad_norm": 0.6185165047645569, + "learning_rate": 4.964474767079167e-06, + "loss": 0.6326, + "step": 1116 + }, + { + "epoch": 0.17626637210036294, + "grad_norm": 0.619864821434021, + "learning_rate": 4.964404967606825e-06, + "loss": 0.6074, + "step": 1117 + }, + { + "epoch": 0.17642417547735523, + "grad_norm": 0.6177020072937012, + "learning_rate": 4.964335100122907e-06, + "loss": 0.645, + "step": 1118 + }, + { + "epoch": 0.17658197885434748, + "grad_norm": 0.6797804236412048, + "learning_rate": 4.964265164629342e-06, + "loss": 0.586, + "step": 1119 + }, + { + "epoch": 0.17673978223133974, + "grad_norm": 0.664466917514801, + "learning_rate": 4.964195161128062e-06, + "loss": 0.6456, + "step": 1120 + }, + { + "epoch": 0.17689758560833202, + "grad_norm": 0.5959522128105164, + "learning_rate": 4.964125089620997e-06, + "loss": 0.6347, + "step": 1121 + }, + { + "epoch": 0.17705538898532427, + "grad_norm": 0.6140851378440857, + "learning_rate": 4.96405495011008e-06, + "loss": 0.6138, + "step": 1122 + }, + { + "epoch": 0.17721319236231656, + "grad_norm": 0.5971763730049133, + "learning_rate": 4.963984742597249e-06, + "loss": 0.5978, + "step": 1123 + }, + { + "epoch": 0.1773709957393088, + "grad_norm": 0.6033616662025452, + "learning_rate": 4.96391446708444e-06, + "loss": 0.625, + "step": 1124 + }, + { + "epoch": 0.1775287991163011, + "grad_norm": 0.6185871958732605, + "learning_rate": 4.963844123573593e-06, + "loss": 0.627, + "step": 1125 + }, + { + "epoch": 0.17768660249329335, + "grad_norm": 0.6057677865028381, + "learning_rate": 4.963773712066649e-06, + "loss": 0.5934, + "step": 1126 + }, + { + "epoch": 0.17784440587028563, + "grad_norm": 0.6567206978797913, + "learning_rate": 4.963703232565552e-06, + "loss": 0.6506, + "step": 1127 + }, + { + "epoch": 0.1780022092472779, + "grad_norm": 0.643224835395813, + "learning_rate": 4.963632685072246e-06, + "loss": 0.6649, + "step": 1128 + }, + { + "epoch": 0.17816001262427017, + "grad_norm": 0.6369585394859314, + "learning_rate": 4.9635620695886775e-06, + "loss": 0.6059, + "step": 1129 + }, + { + "epoch": 0.17831781600126242, + "grad_norm": 0.6393840909004211, + "learning_rate": 4.963491386116798e-06, + "loss": 0.63, + "step": 1130 + }, + { + "epoch": 0.1784756193782547, + "grad_norm": 0.6471207141876221, + "learning_rate": 4.963420634658555e-06, + "loss": 0.6227, + "step": 1131 + }, + { + "epoch": 0.17863342275524696, + "grad_norm": 0.649418294429779, + "learning_rate": 4.963349815215902e-06, + "loss": 0.6245, + "step": 1132 + }, + { + "epoch": 0.17879122613223922, + "grad_norm": 0.6067545413970947, + "learning_rate": 4.963278927790796e-06, + "loss": 0.6696, + "step": 1133 + }, + { + "epoch": 0.1789490295092315, + "grad_norm": 0.6424644589424133, + "learning_rate": 4.96320797238519e-06, + "loss": 0.5889, + "step": 1134 + }, + { + "epoch": 0.17910683288622375, + "grad_norm": 0.6386503577232361, + "learning_rate": 4.963136949001043e-06, + "loss": 0.6563, + "step": 1135 + }, + { + "epoch": 0.17926463626321604, + "grad_norm": 0.6506074070930481, + "learning_rate": 4.9630658576403155e-06, + "loss": 0.636, + "step": 1136 + }, + { + "epoch": 0.1794224396402083, + "grad_norm": 0.5798661708831787, + "learning_rate": 4.96299469830497e-06, + "loss": 0.6562, + "step": 1137 + }, + { + "epoch": 0.17958024301720057, + "grad_norm": 0.5868306159973145, + "learning_rate": 4.962923470996969e-06, + "loss": 0.6212, + "step": 1138 + }, + { + "epoch": 0.17973804639419283, + "grad_norm": 0.6010650992393494, + "learning_rate": 4.96285217571828e-06, + "loss": 0.5881, + "step": 1139 + }, + { + "epoch": 0.1798958497711851, + "grad_norm": 0.6282439231872559, + "learning_rate": 4.962780812470869e-06, + "loss": 0.6378, + "step": 1140 + }, + { + "epoch": 0.18005365314817737, + "grad_norm": 0.6219528913497925, + "learning_rate": 4.962709381256706e-06, + "loss": 0.6444, + "step": 1141 + }, + { + "epoch": 0.18021145652516965, + "grad_norm": 0.6287857294082642, + "learning_rate": 4.962637882077762e-06, + "loss": 0.6186, + "step": 1142 + }, + { + "epoch": 0.1803692599021619, + "grad_norm": 0.6366180181503296, + "learning_rate": 4.9625663149360105e-06, + "loss": 0.6301, + "step": 1143 + }, + { + "epoch": 0.1805270632791542, + "grad_norm": 0.6224188804626465, + "learning_rate": 4.962494679833427e-06, + "loss": 0.62, + "step": 1144 + }, + { + "epoch": 0.18068486665614644, + "grad_norm": 0.6093705892562866, + "learning_rate": 4.962422976771988e-06, + "loss": 0.6337, + "step": 1145 + }, + { + "epoch": 0.1808426700331387, + "grad_norm": 0.6762253046035767, + "learning_rate": 4.962351205753672e-06, + "loss": 0.6543, + "step": 1146 + }, + { + "epoch": 0.18100047341013098, + "grad_norm": 0.6202844381332397, + "learning_rate": 4.96227936678046e-06, + "loss": 0.6308, + "step": 1147 + }, + { + "epoch": 0.18115827678712323, + "grad_norm": 0.6001341342926025, + "learning_rate": 4.962207459854336e-06, + "loss": 0.5862, + "step": 1148 + }, + { + "epoch": 0.18131608016411552, + "grad_norm": 0.5914742946624756, + "learning_rate": 4.962135484977282e-06, + "loss": 0.603, + "step": 1149 + }, + { + "epoch": 0.18147388354110777, + "grad_norm": 0.6343789100646973, + "learning_rate": 4.962063442151285e-06, + "loss": 0.6388, + "step": 1150 + }, + { + "epoch": 0.18163168691810005, + "grad_norm": 0.6411586999893188, + "learning_rate": 4.961991331378334e-06, + "loss": 0.5939, + "step": 1151 + }, + { + "epoch": 0.1817894902950923, + "grad_norm": 0.5878909230232239, + "learning_rate": 4.96191915266042e-06, + "loss": 0.6598, + "step": 1152 + }, + { + "epoch": 0.1819472936720846, + "grad_norm": 0.5966161489486694, + "learning_rate": 4.961846905999532e-06, + "loss": 0.6192, + "step": 1153 + }, + { + "epoch": 0.18210509704907685, + "grad_norm": 0.6358116269111633, + "learning_rate": 4.961774591397665e-06, + "loss": 0.6694, + "step": 1154 + }, + { + "epoch": 0.18226290042606913, + "grad_norm": 0.6949954628944397, + "learning_rate": 4.961702208856817e-06, + "loss": 0.5986, + "step": 1155 + }, + { + "epoch": 0.18242070380306138, + "grad_norm": 0.6130051016807556, + "learning_rate": 4.961629758378983e-06, + "loss": 0.6409, + "step": 1156 + }, + { + "epoch": 0.18257850718005364, + "grad_norm": 0.5792812705039978, + "learning_rate": 4.9615572399661635e-06, + "loss": 0.5942, + "step": 1157 + }, + { + "epoch": 0.18273631055704592, + "grad_norm": 0.5982370972633362, + "learning_rate": 4.96148465362036e-06, + "loss": 0.6211, + "step": 1158 + }, + { + "epoch": 0.18289411393403818, + "grad_norm": 0.6109289526939392, + "learning_rate": 4.961411999343575e-06, + "loss": 0.6118, + "step": 1159 + }, + { + "epoch": 0.18305191731103046, + "grad_norm": 0.5954134464263916, + "learning_rate": 4.9613392771378134e-06, + "loss": 0.6093, + "step": 1160 + }, + { + "epoch": 0.18320972068802271, + "grad_norm": 0.6256135106086731, + "learning_rate": 4.961266487005083e-06, + "loss": 0.6399, + "step": 1161 + }, + { + "epoch": 0.183367524065015, + "grad_norm": 0.6300309896469116, + "learning_rate": 4.961193628947391e-06, + "loss": 0.6129, + "step": 1162 + }, + { + "epoch": 0.18352532744200725, + "grad_norm": 0.6159512996673584, + "learning_rate": 4.9611207029667515e-06, + "loss": 0.6083, + "step": 1163 + }, + { + "epoch": 0.18368313081899953, + "grad_norm": 0.6344775557518005, + "learning_rate": 4.961047709065175e-06, + "loss": 0.6327, + "step": 1164 + }, + { + "epoch": 0.1838409341959918, + "grad_norm": 0.6381154656410217, + "learning_rate": 4.960974647244674e-06, + "loss": 0.6296, + "step": 1165 + }, + { + "epoch": 0.18399873757298407, + "grad_norm": 0.656421422958374, + "learning_rate": 4.960901517507267e-06, + "loss": 0.6597, + "step": 1166 + }, + { + "epoch": 0.18415654094997633, + "grad_norm": 0.6158045530319214, + "learning_rate": 4.960828319854973e-06, + "loss": 0.6151, + "step": 1167 + }, + { + "epoch": 0.1843143443269686, + "grad_norm": 0.6138966679573059, + "learning_rate": 4.9607550542898104e-06, + "loss": 0.567, + "step": 1168 + }, + { + "epoch": 0.18447214770396086, + "grad_norm": 0.6174358129501343, + "learning_rate": 4.9606817208138025e-06, + "loss": 0.5786, + "step": 1169 + }, + { + "epoch": 0.18462995108095312, + "grad_norm": 0.6164169311523438, + "learning_rate": 4.960608319428972e-06, + "loss": 0.6315, + "step": 1170 + }, + { + "epoch": 0.1847877544579454, + "grad_norm": 0.6131998300552368, + "learning_rate": 4.960534850137345e-06, + "loss": 0.6294, + "step": 1171 + }, + { + "epoch": 0.18494555783493766, + "grad_norm": 0.6249794960021973, + "learning_rate": 4.960461312940949e-06, + "loss": 0.6662, + "step": 1172 + }, + { + "epoch": 0.18510336121192994, + "grad_norm": 0.6089789867401123, + "learning_rate": 4.960387707841814e-06, + "loss": 0.636, + "step": 1173 + }, + { + "epoch": 0.1852611645889222, + "grad_norm": 0.6098273396492004, + "learning_rate": 4.96031403484197e-06, + "loss": 0.6476, + "step": 1174 + }, + { + "epoch": 0.18541896796591448, + "grad_norm": 1.2749830484390259, + "learning_rate": 4.9602402939434515e-06, + "loss": 0.6937, + "step": 1175 + }, + { + "epoch": 0.18557677134290673, + "grad_norm": 0.5985050797462463, + "learning_rate": 4.9601664851482924e-06, + "loss": 0.6301, + "step": 1176 + }, + { + "epoch": 0.18573457471989901, + "grad_norm": 0.6285187005996704, + "learning_rate": 4.960092608458531e-06, + "loss": 0.6059, + "step": 1177 + }, + { + "epoch": 0.18589237809689127, + "grad_norm": 0.6492058634757996, + "learning_rate": 4.960018663876205e-06, + "loss": 0.63, + "step": 1178 + }, + { + "epoch": 0.18605018147388355, + "grad_norm": 0.6034079194068909, + "learning_rate": 4.959944651403356e-06, + "loss": 0.6229, + "step": 1179 + }, + { + "epoch": 0.1862079848508758, + "grad_norm": 0.578924834728241, + "learning_rate": 4.959870571042026e-06, + "loss": 0.6265, + "step": 1180 + }, + { + "epoch": 0.1863657882278681, + "grad_norm": 0.610810399055481, + "learning_rate": 4.95979642279426e-06, + "loss": 0.612, + "step": 1181 + }, + { + "epoch": 0.18652359160486034, + "grad_norm": 0.6076445579528809, + "learning_rate": 4.959722206662103e-06, + "loss": 0.5907, + "step": 1182 + }, + { + "epoch": 0.1866813949818526, + "grad_norm": 0.6255353689193726, + "learning_rate": 4.959647922647605e-06, + "loss": 0.603, + "step": 1183 + }, + { + "epoch": 0.18683919835884488, + "grad_norm": 0.614747166633606, + "learning_rate": 4.9595735707528145e-06, + "loss": 0.6692, + "step": 1184 + }, + { + "epoch": 0.18699700173583714, + "grad_norm": 0.6049399971961975, + "learning_rate": 4.959499150979784e-06, + "loss": 0.6272, + "step": 1185 + }, + { + "epoch": 0.18715480511282942, + "grad_norm": 0.5925064086914062, + "learning_rate": 4.959424663330568e-06, + "loss": 0.5846, + "step": 1186 + }, + { + "epoch": 0.18731260848982167, + "grad_norm": 0.6303737163543701, + "learning_rate": 4.9593501078072206e-06, + "loss": 0.6437, + "step": 1187 + }, + { + "epoch": 0.18747041186681396, + "grad_norm": 0.6220330595970154, + "learning_rate": 4.959275484411801e-06, + "loss": 0.6437, + "step": 1188 + }, + { + "epoch": 0.1876282152438062, + "grad_norm": 0.6106336712837219, + "learning_rate": 4.959200793146367e-06, + "loss": 0.6429, + "step": 1189 + }, + { + "epoch": 0.1877860186207985, + "grad_norm": 0.5695015788078308, + "learning_rate": 4.9591260340129815e-06, + "loss": 0.6164, + "step": 1190 + }, + { + "epoch": 0.18794382199779075, + "grad_norm": 0.6142430901527405, + "learning_rate": 4.959051207013706e-06, + "loss": 0.6269, + "step": 1191 + }, + { + "epoch": 0.18810162537478303, + "grad_norm": 0.597961962223053, + "learning_rate": 4.958976312150608e-06, + "loss": 0.6417, + "step": 1192 + }, + { + "epoch": 0.1882594287517753, + "grad_norm": 0.613878607749939, + "learning_rate": 4.958901349425752e-06, + "loss": 0.6022, + "step": 1193 + }, + { + "epoch": 0.18841723212876754, + "grad_norm": 0.6023911833763123, + "learning_rate": 4.9588263188412086e-06, + "loss": 0.5968, + "step": 1194 + }, + { + "epoch": 0.18857503550575982, + "grad_norm": 0.6347556710243225, + "learning_rate": 4.958751220399046e-06, + "loss": 0.6107, + "step": 1195 + }, + { + "epoch": 0.18873283888275208, + "grad_norm": 0.6021919250488281, + "learning_rate": 4.95867605410134e-06, + "loss": 0.6574, + "step": 1196 + }, + { + "epoch": 0.18889064225974436, + "grad_norm": 0.5864298343658447, + "learning_rate": 4.9586008199501625e-06, + "loss": 0.5959, + "step": 1197 + }, + { + "epoch": 0.18904844563673662, + "grad_norm": 0.6056833267211914, + "learning_rate": 4.95852551794759e-06, + "loss": 0.6022, + "step": 1198 + }, + { + "epoch": 0.1892062490137289, + "grad_norm": 0.622661292552948, + "learning_rate": 4.958450148095703e-06, + "loss": 0.6363, + "step": 1199 + }, + { + "epoch": 0.18936405239072115, + "grad_norm": 0.6321606040000916, + "learning_rate": 4.958374710396578e-06, + "loss": 0.6356, + "step": 1200 + }, + { + "epoch": 0.18952185576771344, + "grad_norm": 0.6022459864616394, + "learning_rate": 4.9582992048522995e-06, + "loss": 0.6421, + "step": 1201 + }, + { + "epoch": 0.1896796591447057, + "grad_norm": 0.5910664200782776, + "learning_rate": 4.95822363146495e-06, + "loss": 0.6284, + "step": 1202 + }, + { + "epoch": 0.18983746252169797, + "grad_norm": 0.6052101850509644, + "learning_rate": 4.958147990236616e-06, + "loss": 0.5819, + "step": 1203 + }, + { + "epoch": 0.18999526589869023, + "grad_norm": 0.600798487663269, + "learning_rate": 4.9580722811693835e-06, + "loss": 0.6361, + "step": 1204 + }, + { + "epoch": 0.1901530692756825, + "grad_norm": 0.5985654592514038, + "learning_rate": 4.957996504265345e-06, + "loss": 0.6165, + "step": 1205 + }, + { + "epoch": 0.19031087265267477, + "grad_norm": 0.6012553572654724, + "learning_rate": 4.957920659526588e-06, + "loss": 0.5915, + "step": 1206 + }, + { + "epoch": 0.19046867602966702, + "grad_norm": 0.566299319267273, + "learning_rate": 4.957844746955208e-06, + "loss": 0.6116, + "step": 1207 + }, + { + "epoch": 0.1906264794066593, + "grad_norm": 0.6332440972328186, + "learning_rate": 4.9577687665532995e-06, + "loss": 0.5958, + "step": 1208 + }, + { + "epoch": 0.19078428278365156, + "grad_norm": 0.5821355581283569, + "learning_rate": 4.9576927183229585e-06, + "loss": 0.5964, + "step": 1209 + }, + { + "epoch": 0.19094208616064384, + "grad_norm": 0.5974559783935547, + "learning_rate": 4.957616602266286e-06, + "loss": 0.6355, + "step": 1210 + }, + { + "epoch": 0.1910998895376361, + "grad_norm": 0.6454991698265076, + "learning_rate": 4.957540418385379e-06, + "loss": 0.6444, + "step": 1211 + }, + { + "epoch": 0.19125769291462838, + "grad_norm": 0.6019113659858704, + "learning_rate": 4.957464166682343e-06, + "loss": 0.5831, + "step": 1212 + }, + { + "epoch": 0.19141549629162063, + "grad_norm": 0.5876724123954773, + "learning_rate": 4.957387847159282e-06, + "loss": 0.6305, + "step": 1213 + }, + { + "epoch": 0.19157329966861292, + "grad_norm": 0.5758741497993469, + "learning_rate": 4.957311459818302e-06, + "loss": 0.6393, + "step": 1214 + }, + { + "epoch": 0.19173110304560517, + "grad_norm": 0.6195574402809143, + "learning_rate": 4.957235004661509e-06, + "loss": 0.6179, + "step": 1215 + }, + { + "epoch": 0.19188890642259746, + "grad_norm": 0.6200394630432129, + "learning_rate": 4.9571584816910165e-06, + "loss": 0.5988, + "step": 1216 + }, + { + "epoch": 0.1920467097995897, + "grad_norm": 0.6106705069541931, + "learning_rate": 4.957081890908934e-06, + "loss": 0.6531, + "step": 1217 + }, + { + "epoch": 0.192204513176582, + "grad_norm": 0.6030598878860474, + "learning_rate": 4.957005232317375e-06, + "loss": 0.6173, + "step": 1218 + }, + { + "epoch": 0.19236231655357425, + "grad_norm": 0.6379125714302063, + "learning_rate": 4.956928505918458e-06, + "loss": 0.6307, + "step": 1219 + }, + { + "epoch": 0.1925201199305665, + "grad_norm": 0.5803065299987793, + "learning_rate": 4.956851711714297e-06, + "loss": 0.651, + "step": 1220 + }, + { + "epoch": 0.19267792330755878, + "grad_norm": 0.642530620098114, + "learning_rate": 4.956774849707012e-06, + "loss": 0.622, + "step": 1221 + }, + { + "epoch": 0.19283572668455104, + "grad_norm": 0.6142280697822571, + "learning_rate": 4.956697919898727e-06, + "loss": 0.5937, + "step": 1222 + }, + { + "epoch": 0.19299353006154332, + "grad_norm": 0.5995895862579346, + "learning_rate": 4.956620922291561e-06, + "loss": 0.6254, + "step": 1223 + }, + { + "epoch": 0.19315133343853558, + "grad_norm": 0.6149164438247681, + "learning_rate": 4.956543856887642e-06, + "loss": 0.6049, + "step": 1224 + }, + { + "epoch": 0.19330913681552786, + "grad_norm": 0.5998221635818481, + "learning_rate": 4.956466723689095e-06, + "loss": 0.6064, + "step": 1225 + }, + { + "epoch": 0.19346694019252011, + "grad_norm": 0.6090437173843384, + "learning_rate": 4.95638952269805e-06, + "loss": 0.6172, + "step": 1226 + }, + { + "epoch": 0.1936247435695124, + "grad_norm": 0.6353545188903809, + "learning_rate": 4.956312253916637e-06, + "loss": 0.619, + "step": 1227 + }, + { + "epoch": 0.19378254694650465, + "grad_norm": 0.5963947772979736, + "learning_rate": 4.956234917346988e-06, + "loss": 0.6564, + "step": 1228 + }, + { + "epoch": 0.19394035032349694, + "grad_norm": 0.6125789880752563, + "learning_rate": 4.956157512991239e-06, + "loss": 0.6211, + "step": 1229 + }, + { + "epoch": 0.1940981537004892, + "grad_norm": 0.6337981224060059, + "learning_rate": 4.956080040851523e-06, + "loss": 0.6114, + "step": 1230 + }, + { + "epoch": 0.19425595707748144, + "grad_norm": 0.6248219609260559, + "learning_rate": 4.956002500929982e-06, + "loss": 0.6154, + "step": 1231 + }, + { + "epoch": 0.19441376045447373, + "grad_norm": 0.6480812430381775, + "learning_rate": 4.955924893228752e-06, + "loss": 0.6093, + "step": 1232 + }, + { + "epoch": 0.19457156383146598, + "grad_norm": 0.6215121150016785, + "learning_rate": 4.955847217749977e-06, + "loss": 0.6395, + "step": 1233 + }, + { + "epoch": 0.19472936720845826, + "grad_norm": 0.6307222843170166, + "learning_rate": 4.9557694744958015e-06, + "loss": 0.6093, + "step": 1234 + }, + { + "epoch": 0.19488717058545052, + "grad_norm": 0.5990995168685913, + "learning_rate": 4.955691663468369e-06, + "loss": 0.6381, + "step": 1235 + }, + { + "epoch": 0.1950449739624428, + "grad_norm": 0.5964449048042297, + "learning_rate": 4.955613784669828e-06, + "loss": 0.6047, + "step": 1236 + }, + { + "epoch": 0.19520277733943506, + "grad_norm": 0.600102424621582, + "learning_rate": 4.9555358381023275e-06, + "loss": 0.6042, + "step": 1237 + }, + { + "epoch": 0.19536058071642734, + "grad_norm": 0.5935695171356201, + "learning_rate": 4.9554578237680186e-06, + "loss": 0.6425, + "step": 1238 + }, + { + "epoch": 0.1955183840934196, + "grad_norm": 0.5709253549575806, + "learning_rate": 4.955379741669054e-06, + "loss": 0.6368, + "step": 1239 + }, + { + "epoch": 0.19567618747041188, + "grad_norm": 0.6210572719573975, + "learning_rate": 4.9553015918075885e-06, + "loss": 0.5928, + "step": 1240 + }, + { + "epoch": 0.19583399084740413, + "grad_norm": 0.608635663986206, + "learning_rate": 4.9552233741857805e-06, + "loss": 0.6189, + "step": 1241 + }, + { + "epoch": 0.19599179422439642, + "grad_norm": 0.6320560574531555, + "learning_rate": 4.955145088805786e-06, + "loss": 0.6307, + "step": 1242 + }, + { + "epoch": 0.19614959760138867, + "grad_norm": 0.632556676864624, + "learning_rate": 4.955066735669768e-06, + "loss": 0.617, + "step": 1243 + }, + { + "epoch": 0.19630740097838092, + "grad_norm": 0.5906263589859009, + "learning_rate": 4.9549883147798875e-06, + "loss": 0.629, + "step": 1244 + }, + { + "epoch": 0.1964652043553732, + "grad_norm": 0.5742282271385193, + "learning_rate": 4.954909826138308e-06, + "loss": 0.6146, + "step": 1245 + }, + { + "epoch": 0.19662300773236546, + "grad_norm": 0.6459119915962219, + "learning_rate": 4.954831269747198e-06, + "loss": 0.6379, + "step": 1246 + }, + { + "epoch": 0.19678081110935775, + "grad_norm": 0.5814904570579529, + "learning_rate": 4.954752645608722e-06, + "loss": 0.6671, + "step": 1247 + }, + { + "epoch": 0.19693861448635, + "grad_norm": 0.6074311137199402, + "learning_rate": 4.9546739537250535e-06, + "loss": 0.6747, + "step": 1248 + }, + { + "epoch": 0.19709641786334228, + "grad_norm": 0.5659074783325195, + "learning_rate": 4.954595194098362e-06, + "loss": 0.6034, + "step": 1249 + }, + { + "epoch": 0.19725422124033454, + "grad_norm": 0.6029398441314697, + "learning_rate": 4.954516366730822e-06, + "loss": 0.6208, + "step": 1250 + }, + { + "epoch": 0.19741202461732682, + "grad_norm": 0.6617392301559448, + "learning_rate": 4.954437471624608e-06, + "loss": 0.625, + "step": 1251 + }, + { + "epoch": 0.19756982799431907, + "grad_norm": 0.6332879662513733, + "learning_rate": 4.954358508781898e-06, + "loss": 0.6271, + "step": 1252 + }, + { + "epoch": 0.19772763137131136, + "grad_norm": 0.6084469556808472, + "learning_rate": 4.9542794782048715e-06, + "loss": 0.6156, + "step": 1253 + }, + { + "epoch": 0.1978854347483036, + "grad_norm": 0.6174158453941345, + "learning_rate": 4.954200379895708e-06, + "loss": 0.6496, + "step": 1254 + }, + { + "epoch": 0.1980432381252959, + "grad_norm": 0.6207244992256165, + "learning_rate": 4.954121213856593e-06, + "loss": 0.6262, + "step": 1255 + }, + { + "epoch": 0.19820104150228815, + "grad_norm": 0.5995996594429016, + "learning_rate": 4.954041980089709e-06, + "loss": 0.6078, + "step": 1256 + }, + { + "epoch": 0.1983588448792804, + "grad_norm": 0.6144423484802246, + "learning_rate": 4.953962678597244e-06, + "loss": 0.6498, + "step": 1257 + }, + { + "epoch": 0.1985166482562727, + "grad_norm": 0.5980164408683777, + "learning_rate": 4.953883309381385e-06, + "loss": 0.6393, + "step": 1258 + }, + { + "epoch": 0.19867445163326494, + "grad_norm": 0.5776756405830383, + "learning_rate": 4.953803872444324e-06, + "loss": 0.6148, + "step": 1259 + }, + { + "epoch": 0.19883225501025723, + "grad_norm": 0.6110414862632751, + "learning_rate": 4.953724367788253e-06, + "loss": 0.6105, + "step": 1260 + }, + { + "epoch": 0.19899005838724948, + "grad_norm": 0.6109161972999573, + "learning_rate": 4.953644795415364e-06, + "loss": 0.6672, + "step": 1261 + }, + { + "epoch": 0.19914786176424176, + "grad_norm": 0.6083004474639893, + "learning_rate": 4.953565155327857e-06, + "loss": 0.6589, + "step": 1262 + }, + { + "epoch": 0.19930566514123402, + "grad_norm": 0.5883878469467163, + "learning_rate": 4.953485447527927e-06, + "loss": 0.6034, + "step": 1263 + }, + { + "epoch": 0.1994634685182263, + "grad_norm": 0.584928572177887, + "learning_rate": 4.953405672017773e-06, + "loss": 0.6023, + "step": 1264 + }, + { + "epoch": 0.19962127189521855, + "grad_norm": 0.7274770140647888, + "learning_rate": 4.9533258287996e-06, + "loss": 0.6067, + "step": 1265 + }, + { + "epoch": 0.19977907527221084, + "grad_norm": 0.612489640712738, + "learning_rate": 4.953245917875608e-06, + "loss": 0.6337, + "step": 1266 + }, + { + "epoch": 0.1999368786492031, + "grad_norm": 0.6245325207710266, + "learning_rate": 4.953165939248005e-06, + "loss": 0.6444, + "step": 1267 + }, + { + "epoch": 0.20009468202619535, + "grad_norm": 0.6302911639213562, + "learning_rate": 4.953085892918997e-06, + "loss": 0.6451, + "step": 1268 + }, + { + "epoch": 0.20025248540318763, + "grad_norm": 0.6326889395713806, + "learning_rate": 4.953005778890793e-06, + "loss": 0.6146, + "step": 1269 + }, + { + "epoch": 0.20041028878017988, + "grad_norm": 0.6278961896896362, + "learning_rate": 4.9529255971656035e-06, + "loss": 0.6106, + "step": 1270 + }, + { + "epoch": 0.20056809215717217, + "grad_norm": 0.6280059814453125, + "learning_rate": 4.952845347745643e-06, + "loss": 0.6151, + "step": 1271 + }, + { + "epoch": 0.20072589553416442, + "grad_norm": 0.5914972424507141, + "learning_rate": 4.952765030633124e-06, + "loss": 0.5935, + "step": 1272 + }, + { + "epoch": 0.2008836989111567, + "grad_norm": 0.6422427892684937, + "learning_rate": 4.952684645830264e-06, + "loss": 0.6236, + "step": 1273 + }, + { + "epoch": 0.20104150228814896, + "grad_norm": 0.6101568341255188, + "learning_rate": 4.9526041933392825e-06, + "loss": 0.6062, + "step": 1274 + }, + { + "epoch": 0.20119930566514124, + "grad_norm": 0.5959360003471375, + "learning_rate": 4.9525236731623985e-06, + "loss": 0.6355, + "step": 1275 + }, + { + "epoch": 0.2013571090421335, + "grad_norm": 0.5921663641929626, + "learning_rate": 4.952443085301835e-06, + "loss": 0.6344, + "step": 1276 + }, + { + "epoch": 0.20151491241912578, + "grad_norm": 0.61577969789505, + "learning_rate": 4.952362429759815e-06, + "loss": 0.6496, + "step": 1277 + }, + { + "epoch": 0.20167271579611804, + "grad_norm": 0.5970199704170227, + "learning_rate": 4.952281706538565e-06, + "loss": 0.6273, + "step": 1278 + }, + { + "epoch": 0.20183051917311032, + "grad_norm": 0.6352749466896057, + "learning_rate": 4.952200915640313e-06, + "loss": 0.6408, + "step": 1279 + }, + { + "epoch": 0.20198832255010257, + "grad_norm": 0.6215179562568665, + "learning_rate": 4.952120057067288e-06, + "loss": 0.6517, + "step": 1280 + }, + { + "epoch": 0.20214612592709483, + "grad_norm": 0.5765315890312195, + "learning_rate": 4.9520391308217215e-06, + "loss": 0.6124, + "step": 1281 + }, + { + "epoch": 0.2023039293040871, + "grad_norm": 0.6181051135063171, + "learning_rate": 4.9519581369058475e-06, + "loss": 0.625, + "step": 1282 + }, + { + "epoch": 0.20246173268107936, + "grad_norm": 0.6351206302642822, + "learning_rate": 4.951877075321902e-06, + "loss": 0.6508, + "step": 1283 + }, + { + "epoch": 0.20261953605807165, + "grad_norm": 0.6243249773979187, + "learning_rate": 4.95179594607212e-06, + "loss": 0.6177, + "step": 1284 + }, + { + "epoch": 0.2027773394350639, + "grad_norm": 0.6431304812431335, + "learning_rate": 4.951714749158742e-06, + "loss": 0.6333, + "step": 1285 + }, + { + "epoch": 0.20293514281205619, + "grad_norm": 0.5860026478767395, + "learning_rate": 4.951633484584009e-06, + "loss": 0.6125, + "step": 1286 + }, + { + "epoch": 0.20309294618904844, + "grad_norm": 0.6053611636161804, + "learning_rate": 4.951552152350161e-06, + "loss": 0.6229, + "step": 1287 + }, + { + "epoch": 0.20325074956604072, + "grad_norm": 0.6226319670677185, + "learning_rate": 4.9514707524594465e-06, + "loss": 0.6483, + "step": 1288 + }, + { + "epoch": 0.20340855294303298, + "grad_norm": 0.6397374868392944, + "learning_rate": 4.95138928491411e-06, + "loss": 0.6115, + "step": 1289 + }, + { + "epoch": 0.20356635632002526, + "grad_norm": 0.6199672222137451, + "learning_rate": 4.951307749716399e-06, + "loss": 0.5984, + "step": 1290 + }, + { + "epoch": 0.20372415969701752, + "grad_norm": 0.6283205151557922, + "learning_rate": 4.951226146868565e-06, + "loss": 0.6458, + "step": 1291 + }, + { + "epoch": 0.2038819630740098, + "grad_norm": 0.5774952173233032, + "learning_rate": 4.9511444763728596e-06, + "loss": 0.5875, + "step": 1292 + }, + { + "epoch": 0.20403976645100205, + "grad_norm": 0.5847025513648987, + "learning_rate": 4.9510627382315365e-06, + "loss": 0.6138, + "step": 1293 + }, + { + "epoch": 0.2041975698279943, + "grad_norm": 0.6145697832107544, + "learning_rate": 4.950980932446852e-06, + "loss": 0.6329, + "step": 1294 + }, + { + "epoch": 0.2043553732049866, + "grad_norm": 0.5755574703216553, + "learning_rate": 4.9508990590210636e-06, + "loss": 0.5699, + "step": 1295 + }, + { + "epoch": 0.20451317658197884, + "grad_norm": 0.5798789262771606, + "learning_rate": 4.95081711795643e-06, + "loss": 0.6404, + "step": 1296 + }, + { + "epoch": 0.20467097995897113, + "grad_norm": 0.5823343396186829, + "learning_rate": 4.950735109255214e-06, + "loss": 0.6257, + "step": 1297 + }, + { + "epoch": 0.20482878333596338, + "grad_norm": 0.6193516254425049, + "learning_rate": 4.950653032919677e-06, + "loss": 0.6119, + "step": 1298 + }, + { + "epoch": 0.20498658671295567, + "grad_norm": 0.6196638941764832, + "learning_rate": 4.950570888952085e-06, + "loss": 0.6249, + "step": 1299 + }, + { + "epoch": 0.20514439008994792, + "grad_norm": 0.5759353637695312, + "learning_rate": 4.950488677354707e-06, + "loss": 0.6014, + "step": 1300 + }, + { + "epoch": 0.2053021934669402, + "grad_norm": 0.6046234369277954, + "learning_rate": 4.950406398129808e-06, + "loss": 0.6329, + "step": 1301 + }, + { + "epoch": 0.20545999684393246, + "grad_norm": 0.6002787351608276, + "learning_rate": 4.950324051279662e-06, + "loss": 0.6136, + "step": 1302 + }, + { + "epoch": 0.20561780022092474, + "grad_norm": 0.6149343848228455, + "learning_rate": 4.95024163680654e-06, + "loss": 0.5756, + "step": 1303 + }, + { + "epoch": 0.205775603597917, + "grad_norm": 0.6404880285263062, + "learning_rate": 4.950159154712716e-06, + "loss": 0.6168, + "step": 1304 + }, + { + "epoch": 0.20593340697490925, + "grad_norm": 0.5993021726608276, + "learning_rate": 4.9500766050004676e-06, + "loss": 0.638, + "step": 1305 + }, + { + "epoch": 0.20609121035190153, + "grad_norm": 0.6046354174613953, + "learning_rate": 4.949993987672072e-06, + "loss": 0.5959, + "step": 1306 + }, + { + "epoch": 0.2062490137288938, + "grad_norm": 0.6227611303329468, + "learning_rate": 4.94991130272981e-06, + "loss": 0.6058, + "step": 1307 + }, + { + "epoch": 0.20640681710588607, + "grad_norm": 0.6072728037834167, + "learning_rate": 4.949828550175963e-06, + "loss": 0.6305, + "step": 1308 + }, + { + "epoch": 0.20656462048287833, + "grad_norm": 0.574679434299469, + "learning_rate": 4.949745730012815e-06, + "loss": 0.6174, + "step": 1309 + }, + { + "epoch": 0.2067224238598706, + "grad_norm": 0.5697793364524841, + "learning_rate": 4.94966284224265e-06, + "loss": 0.6018, + "step": 1310 + }, + { + "epoch": 0.20688022723686286, + "grad_norm": 0.626074492931366, + "learning_rate": 4.949579886867759e-06, + "loss": 0.6193, + "step": 1311 + }, + { + "epoch": 0.20703803061385515, + "grad_norm": 0.6048347353935242, + "learning_rate": 4.949496863890428e-06, + "loss": 0.6192, + "step": 1312 + }, + { + "epoch": 0.2071958339908474, + "grad_norm": 0.608214795589447, + "learning_rate": 4.94941377331295e-06, + "loss": 0.62, + "step": 1313 + }, + { + "epoch": 0.20735363736783968, + "grad_norm": 0.6000281572341919, + "learning_rate": 4.949330615137618e-06, + "loss": 0.6381, + "step": 1314 + }, + { + "epoch": 0.20751144074483194, + "grad_norm": 0.6143856644630432, + "learning_rate": 4.949247389366727e-06, + "loss": 0.6039, + "step": 1315 + }, + { + "epoch": 0.20766924412182422, + "grad_norm": 0.5912512540817261, + "learning_rate": 4.9491640960025735e-06, + "loss": 0.5962, + "step": 1316 + }, + { + "epoch": 0.20782704749881648, + "grad_norm": 0.5831612348556519, + "learning_rate": 4.949080735047456e-06, + "loss": 0.6017, + "step": 1317 + }, + { + "epoch": 0.20798485087580873, + "grad_norm": 0.6538140177726746, + "learning_rate": 4.948997306503674e-06, + "loss": 0.6095, + "step": 1318 + }, + { + "epoch": 0.208142654252801, + "grad_norm": 0.6169470548629761, + "learning_rate": 4.948913810373532e-06, + "loss": 0.6216, + "step": 1319 + }, + { + "epoch": 0.20830045762979327, + "grad_norm": 0.5939083695411682, + "learning_rate": 4.9488302466593344e-06, + "loss": 0.5787, + "step": 1320 + }, + { + "epoch": 0.20845826100678555, + "grad_norm": 0.5887499451637268, + "learning_rate": 4.9487466153633856e-06, + "loss": 0.5999, + "step": 1321 + }, + { + "epoch": 0.2086160643837778, + "grad_norm": 0.6040934324264526, + "learning_rate": 4.948662916487994e-06, + "loss": 0.6213, + "step": 1322 + }, + { + "epoch": 0.2087738677607701, + "grad_norm": 0.5894956588745117, + "learning_rate": 4.948579150035471e-06, + "loss": 0.5903, + "step": 1323 + }, + { + "epoch": 0.20893167113776234, + "grad_norm": 0.5985233783721924, + "learning_rate": 4.948495316008126e-06, + "loss": 0.6451, + "step": 1324 + }, + { + "epoch": 0.20908947451475463, + "grad_norm": 0.635859489440918, + "learning_rate": 4.948411414408275e-06, + "loss": 0.5999, + "step": 1325 + }, + { + "epoch": 0.20924727789174688, + "grad_norm": 0.5762813687324524, + "learning_rate": 4.9483274452382315e-06, + "loss": 0.6002, + "step": 1326 + }, + { + "epoch": 0.20940508126873916, + "grad_norm": 0.6255648136138916, + "learning_rate": 4.948243408500315e-06, + "loss": 0.6231, + "step": 1327 + }, + { + "epoch": 0.20956288464573142, + "grad_norm": 0.6309247016906738, + "learning_rate": 4.948159304196842e-06, + "loss": 0.6196, + "step": 1328 + }, + { + "epoch": 0.20972068802272367, + "grad_norm": 0.6338143348693848, + "learning_rate": 4.9480751323301354e-06, + "loss": 0.6244, + "step": 1329 + }, + { + "epoch": 0.20987849139971596, + "grad_norm": 0.6019792556762695, + "learning_rate": 4.947990892902518e-06, + "loss": 0.6064, + "step": 1330 + }, + { + "epoch": 0.2100362947767082, + "grad_norm": 0.6262502074241638, + "learning_rate": 4.9479065859163145e-06, + "loss": 0.6046, + "step": 1331 + }, + { + "epoch": 0.2101940981537005, + "grad_norm": 0.582848310470581, + "learning_rate": 4.94782221137385e-06, + "loss": 0.6155, + "step": 1332 + }, + { + "epoch": 0.21035190153069275, + "grad_norm": 0.5635239481925964, + "learning_rate": 4.947737769277457e-06, + "loss": 0.606, + "step": 1333 + }, + { + "epoch": 0.21050970490768503, + "grad_norm": 0.610198438167572, + "learning_rate": 4.9476532596294616e-06, + "loss": 0.6623, + "step": 1334 + }, + { + "epoch": 0.21066750828467729, + "grad_norm": 0.598120391368866, + "learning_rate": 4.947568682432198e-06, + "loss": 0.6344, + "step": 1335 + }, + { + "epoch": 0.21082531166166957, + "grad_norm": 0.610343873500824, + "learning_rate": 4.947484037688e-06, + "loss": 0.6018, + "step": 1336 + }, + { + "epoch": 0.21098311503866182, + "grad_norm": 0.5946330428123474, + "learning_rate": 4.9473993253992035e-06, + "loss": 0.6474, + "step": 1337 + }, + { + "epoch": 0.2111409184156541, + "grad_norm": 0.6715523600578308, + "learning_rate": 4.947314545568147e-06, + "loss": 0.6114, + "step": 1338 + }, + { + "epoch": 0.21129872179264636, + "grad_norm": 0.6005040407180786, + "learning_rate": 4.94722969819717e-06, + "loss": 0.5941, + "step": 1339 + }, + { + "epoch": 0.21145652516963864, + "grad_norm": 0.6218941807746887, + "learning_rate": 4.947144783288613e-06, + "loss": 0.597, + "step": 1340 + }, + { + "epoch": 0.2116143285466309, + "grad_norm": 0.617173969745636, + "learning_rate": 4.947059800844821e-06, + "loss": 0.602, + "step": 1341 + }, + { + "epoch": 0.21177213192362315, + "grad_norm": 0.6159076690673828, + "learning_rate": 4.946974750868139e-06, + "loss": 0.6141, + "step": 1342 + }, + { + "epoch": 0.21192993530061544, + "grad_norm": 0.6272429823875427, + "learning_rate": 4.946889633360913e-06, + "loss": 0.6142, + "step": 1343 + }, + { + "epoch": 0.2120877386776077, + "grad_norm": 0.628171980381012, + "learning_rate": 4.946804448325493e-06, + "loss": 0.609, + "step": 1344 + }, + { + "epoch": 0.21224554205459997, + "grad_norm": 0.6452605128288269, + "learning_rate": 4.94671919576423e-06, + "loss": 0.6217, + "step": 1345 + }, + { + "epoch": 0.21240334543159223, + "grad_norm": 0.6007925868034363, + "learning_rate": 4.946633875679477e-06, + "loss": 0.5744, + "step": 1346 + }, + { + "epoch": 0.2125611488085845, + "grad_norm": 0.6354795098304749, + "learning_rate": 4.946548488073587e-06, + "loss": 0.6275, + "step": 1347 + }, + { + "epoch": 0.21271895218557677, + "grad_norm": 0.5729158520698547, + "learning_rate": 4.946463032948918e-06, + "loss": 0.6088, + "step": 1348 + }, + { + "epoch": 0.21287675556256905, + "grad_norm": 0.5987066626548767, + "learning_rate": 4.946377510307827e-06, + "loss": 0.6048, + "step": 1349 + }, + { + "epoch": 0.2130345589395613, + "grad_norm": 0.599997878074646, + "learning_rate": 4.946291920152677e-06, + "loss": 0.5896, + "step": 1350 + }, + { + "epoch": 0.21319236231655359, + "grad_norm": 0.5933699011802673, + "learning_rate": 4.946206262485826e-06, + "loss": 0.6061, + "step": 1351 + }, + { + "epoch": 0.21335016569354584, + "grad_norm": 0.6199212670326233, + "learning_rate": 4.9461205373096424e-06, + "loss": 0.6268, + "step": 1352 + }, + { + "epoch": 0.21350796907053812, + "grad_norm": 0.6157801747322083, + "learning_rate": 4.946034744626489e-06, + "loss": 0.61, + "step": 1353 + }, + { + "epoch": 0.21366577244753038, + "grad_norm": 0.6467691659927368, + "learning_rate": 4.945948884438734e-06, + "loss": 0.5989, + "step": 1354 + }, + { + "epoch": 0.21382357582452263, + "grad_norm": 0.583934485912323, + "learning_rate": 4.945862956748747e-06, + "loss": 0.5831, + "step": 1355 + }, + { + "epoch": 0.21398137920151492, + "grad_norm": 0.6313218474388123, + "learning_rate": 4.9457769615589e-06, + "loss": 0.6202, + "step": 1356 + }, + { + "epoch": 0.21413918257850717, + "grad_norm": 0.5938848853111267, + "learning_rate": 4.945690898871566e-06, + "loss": 0.6061, + "step": 1357 + }, + { + "epoch": 0.21429698595549945, + "grad_norm": 0.6672472357749939, + "learning_rate": 4.94560476868912e-06, + "loss": 0.5679, + "step": 1358 + }, + { + "epoch": 0.2144547893324917, + "grad_norm": 0.5886817574501038, + "learning_rate": 4.945518571013939e-06, + "loss": 0.5931, + "step": 1359 + }, + { + "epoch": 0.214612592709484, + "grad_norm": 0.6388434767723083, + "learning_rate": 4.945432305848401e-06, + "loss": 0.6278, + "step": 1360 + }, + { + "epoch": 0.21477039608647625, + "grad_norm": 0.605842649936676, + "learning_rate": 4.945345973194888e-06, + "loss": 0.663, + "step": 1361 + }, + { + "epoch": 0.21492819946346853, + "grad_norm": 0.5870705246925354, + "learning_rate": 4.945259573055782e-06, + "loss": 0.561, + "step": 1362 + }, + { + "epoch": 0.21508600284046078, + "grad_norm": 0.5989512205123901, + "learning_rate": 4.945173105433467e-06, + "loss": 0.5857, + "step": 1363 + }, + { + "epoch": 0.21524380621745307, + "grad_norm": 0.6118288040161133, + "learning_rate": 4.945086570330331e-06, + "loss": 0.6141, + "step": 1364 + }, + { + "epoch": 0.21540160959444532, + "grad_norm": 0.6088497042655945, + "learning_rate": 4.944999967748761e-06, + "loss": 0.6136, + "step": 1365 + }, + { + "epoch": 0.21555941297143758, + "grad_norm": 0.594270646572113, + "learning_rate": 4.944913297691146e-06, + "loss": 0.6194, + "step": 1366 + }, + { + "epoch": 0.21571721634842986, + "grad_norm": 0.6281502842903137, + "learning_rate": 4.944826560159879e-06, + "loss": 0.5934, + "step": 1367 + }, + { + "epoch": 0.2158750197254221, + "grad_norm": 0.5950168967247009, + "learning_rate": 4.944739755157354e-06, + "loss": 0.6048, + "step": 1368 + }, + { + "epoch": 0.2160328231024144, + "grad_norm": 0.6194471716880798, + "learning_rate": 4.944652882685966e-06, + "loss": 0.586, + "step": 1369 + }, + { + "epoch": 0.21619062647940665, + "grad_norm": 0.5938549041748047, + "learning_rate": 4.944565942748113e-06, + "loss": 0.6587, + "step": 1370 + }, + { + "epoch": 0.21634842985639893, + "grad_norm": 0.6099259853363037, + "learning_rate": 4.944478935346193e-06, + "loss": 0.6118, + "step": 1371 + }, + { + "epoch": 0.2165062332333912, + "grad_norm": 0.5927821397781372, + "learning_rate": 4.944391860482609e-06, + "loss": 0.5963, + "step": 1372 + }, + { + "epoch": 0.21666403661038347, + "grad_norm": 0.6165619492530823, + "learning_rate": 4.944304718159764e-06, + "loss": 0.6331, + "step": 1373 + }, + { + "epoch": 0.21682183998737573, + "grad_norm": 0.5867920517921448, + "learning_rate": 4.944217508380061e-06, + "loss": 0.6075, + "step": 1374 + }, + { + "epoch": 0.216979643364368, + "grad_norm": 0.6114839911460876, + "learning_rate": 4.944130231145908e-06, + "loss": 0.6188, + "step": 1375 + }, + { + "epoch": 0.21713744674136026, + "grad_norm": 0.6135778427124023, + "learning_rate": 4.944042886459714e-06, + "loss": 0.5763, + "step": 1376 + }, + { + "epoch": 0.21729525011835255, + "grad_norm": 0.5963658094406128, + "learning_rate": 4.943955474323889e-06, + "loss": 0.5993, + "step": 1377 + }, + { + "epoch": 0.2174530534953448, + "grad_norm": 0.6297968626022339, + "learning_rate": 4.9438679947408455e-06, + "loss": 0.6256, + "step": 1378 + }, + { + "epoch": 0.21761085687233706, + "grad_norm": 0.5774078369140625, + "learning_rate": 4.943780447712998e-06, + "loss": 0.6194, + "step": 1379 + }, + { + "epoch": 0.21776866024932934, + "grad_norm": 0.6339438557624817, + "learning_rate": 4.943692833242762e-06, + "loss": 0.6209, + "step": 1380 + }, + { + "epoch": 0.2179264636263216, + "grad_norm": 0.6465830206871033, + "learning_rate": 4.9436051513325554e-06, + "loss": 0.6338, + "step": 1381 + }, + { + "epoch": 0.21808426700331388, + "grad_norm": 0.635376513004303, + "learning_rate": 4.943517401984799e-06, + "loss": 0.6431, + "step": 1382 + }, + { + "epoch": 0.21824207038030613, + "grad_norm": 0.5817705988883972, + "learning_rate": 4.943429585201913e-06, + "loss": 0.6206, + "step": 1383 + }, + { + "epoch": 0.2183998737572984, + "grad_norm": 0.6396196484565735, + "learning_rate": 4.943341700986321e-06, + "loss": 0.6136, + "step": 1384 + }, + { + "epoch": 0.21855767713429067, + "grad_norm": 0.583951473236084, + "learning_rate": 4.943253749340451e-06, + "loss": 0.5974, + "step": 1385 + }, + { + "epoch": 0.21871548051128295, + "grad_norm": 0.5974057912826538, + "learning_rate": 4.943165730266728e-06, + "loss": 0.6021, + "step": 1386 + }, + { + "epoch": 0.2188732838882752, + "grad_norm": 0.618179976940155, + "learning_rate": 4.943077643767581e-06, + "loss": 0.6148, + "step": 1387 + }, + { + "epoch": 0.2190310872652675, + "grad_norm": 0.6256776452064514, + "learning_rate": 4.942989489845441e-06, + "loss": 0.6268, + "step": 1388 + }, + { + "epoch": 0.21918889064225974, + "grad_norm": 0.5905525088310242, + "learning_rate": 4.942901268502742e-06, + "loss": 0.6568, + "step": 1389 + }, + { + "epoch": 0.21934669401925203, + "grad_norm": 0.6189777851104736, + "learning_rate": 4.942812979741918e-06, + "loss": 0.5978, + "step": 1390 + }, + { + "epoch": 0.21950449739624428, + "grad_norm": 0.607720136642456, + "learning_rate": 4.942724623565405e-06, + "loss": 0.6142, + "step": 1391 + }, + { + "epoch": 0.21966230077323654, + "grad_norm": 0.6153449416160583, + "learning_rate": 4.942636199975643e-06, + "loss": 0.6115, + "step": 1392 + }, + { + "epoch": 0.21982010415022882, + "grad_norm": 0.6170888543128967, + "learning_rate": 4.942547708975069e-06, + "loss": 0.6087, + "step": 1393 + }, + { + "epoch": 0.21997790752722107, + "grad_norm": 0.6277616620063782, + "learning_rate": 4.942459150566129e-06, + "loss": 0.6038, + "step": 1394 + }, + { + "epoch": 0.22013571090421336, + "grad_norm": 0.6197009682655334, + "learning_rate": 4.942370524751265e-06, + "loss": 0.6432, + "step": 1395 + }, + { + "epoch": 0.2202935142812056, + "grad_norm": 0.6066128015518188, + "learning_rate": 4.9422818315329234e-06, + "loss": 0.6132, + "step": 1396 + }, + { + "epoch": 0.2204513176581979, + "grad_norm": 0.623389482498169, + "learning_rate": 4.942193070913552e-06, + "loss": 0.6507, + "step": 1397 + }, + { + "epoch": 0.22060912103519015, + "grad_norm": 0.5969215035438538, + "learning_rate": 4.942104242895599e-06, + "loss": 0.6368, + "step": 1398 + }, + { + "epoch": 0.22076692441218243, + "grad_norm": 0.6025614142417908, + "learning_rate": 4.942015347481518e-06, + "loss": 0.6237, + "step": 1399 + }, + { + "epoch": 0.22092472778917469, + "grad_norm": 0.5954381823539734, + "learning_rate": 4.9419263846737606e-06, + "loss": 0.6217, + "step": 1400 + }, + { + "epoch": 0.22108253116616697, + "grad_norm": 0.6012186408042908, + "learning_rate": 4.941837354474784e-06, + "loss": 0.6463, + "step": 1401 + }, + { + "epoch": 0.22124033454315922, + "grad_norm": 0.6036189794540405, + "learning_rate": 4.941748256887042e-06, + "loss": 0.6102, + "step": 1402 + }, + { + "epoch": 0.22139813792015148, + "grad_norm": 0.6032610535621643, + "learning_rate": 4.941659091912996e-06, + "loss": 0.6317, + "step": 1403 + }, + { + "epoch": 0.22155594129714376, + "grad_norm": 0.5965151190757751, + "learning_rate": 4.941569859555105e-06, + "loss": 0.6358, + "step": 1404 + }, + { + "epoch": 0.22171374467413602, + "grad_norm": 0.5931841731071472, + "learning_rate": 4.9414805598158345e-06, + "loss": 0.609, + "step": 1405 + }, + { + "epoch": 0.2218715480511283, + "grad_norm": 0.5748729705810547, + "learning_rate": 4.941391192697646e-06, + "loss": 0.5701, + "step": 1406 + }, + { + "epoch": 0.22202935142812055, + "grad_norm": 0.6166751980781555, + "learning_rate": 4.941301758203007e-06, + "loss": 0.6243, + "step": 1407 + }, + { + "epoch": 0.22218715480511284, + "grad_norm": 0.6560108065605164, + "learning_rate": 4.9412122563343855e-06, + "loss": 0.6524, + "step": 1408 + }, + { + "epoch": 0.2223449581821051, + "grad_norm": 0.5907800793647766, + "learning_rate": 4.941122687094252e-06, + "loss": 0.6399, + "step": 1409 + }, + { + "epoch": 0.22250276155909737, + "grad_norm": 0.6038073301315308, + "learning_rate": 4.941033050485078e-06, + "loss": 0.6191, + "step": 1410 + }, + { + "epoch": 0.22266056493608963, + "grad_norm": 0.6019279360771179, + "learning_rate": 4.940943346509337e-06, + "loss": 0.6342, + "step": 1411 + }, + { + "epoch": 0.2228183683130819, + "grad_norm": 0.6560201644897461, + "learning_rate": 4.940853575169504e-06, + "loss": 0.6234, + "step": 1412 + }, + { + "epoch": 0.22297617169007417, + "grad_norm": 0.6430417895317078, + "learning_rate": 4.9407637364680585e-06, + "loss": 0.651, + "step": 1413 + }, + { + "epoch": 0.22313397506706645, + "grad_norm": 0.632432758808136, + "learning_rate": 4.940673830407478e-06, + "loss": 0.6147, + "step": 1414 + }, + { + "epoch": 0.2232917784440587, + "grad_norm": 0.6040425896644592, + "learning_rate": 4.940583856990245e-06, + "loss": 0.6225, + "step": 1415 + }, + { + "epoch": 0.22344958182105096, + "grad_norm": 0.5877988338470459, + "learning_rate": 4.940493816218841e-06, + "loss": 0.6315, + "step": 1416 + }, + { + "epoch": 0.22360738519804324, + "grad_norm": 0.6608521342277527, + "learning_rate": 4.9404037080957525e-06, + "loss": 0.6394, + "step": 1417 + }, + { + "epoch": 0.2237651885750355, + "grad_norm": 0.6133540272712708, + "learning_rate": 4.940313532623465e-06, + "loss": 0.6102, + "step": 1418 + }, + { + "epoch": 0.22392299195202778, + "grad_norm": 0.597998321056366, + "learning_rate": 4.9402232898044685e-06, + "loss": 0.6194, + "step": 1419 + }, + { + "epoch": 0.22408079532902003, + "grad_norm": 0.5981501340866089, + "learning_rate": 4.940132979641252e-06, + "loss": 0.6333, + "step": 1420 + }, + { + "epoch": 0.22423859870601232, + "grad_norm": 0.6087415814399719, + "learning_rate": 4.940042602136308e-06, + "loss": 0.6011, + "step": 1421 + }, + { + "epoch": 0.22439640208300457, + "grad_norm": 0.6337860822677612, + "learning_rate": 4.9399521572921325e-06, + "loss": 0.5968, + "step": 1422 + }, + { + "epoch": 0.22455420545999685, + "grad_norm": 0.6131370067596436, + "learning_rate": 4.939861645111219e-06, + "loss": 0.6133, + "step": 1423 + }, + { + "epoch": 0.2247120088369891, + "grad_norm": 0.6026253700256348, + "learning_rate": 4.939771065596067e-06, + "loss": 0.6125, + "step": 1424 + }, + { + "epoch": 0.2248698122139814, + "grad_norm": 0.5986268520355225, + "learning_rate": 4.939680418749177e-06, + "loss": 0.631, + "step": 1425 + }, + { + "epoch": 0.22502761559097365, + "grad_norm": 0.6466418504714966, + "learning_rate": 4.939589704573049e-06, + "loss": 0.6261, + "step": 1426 + }, + { + "epoch": 0.22518541896796593, + "grad_norm": 0.6139489412307739, + "learning_rate": 4.939498923070187e-06, + "loss": 0.5883, + "step": 1427 + }, + { + "epoch": 0.22534322234495818, + "grad_norm": 0.6162557005882263, + "learning_rate": 4.939408074243096e-06, + "loss": 0.6331, + "step": 1428 + }, + { + "epoch": 0.22550102572195044, + "grad_norm": 0.5843167901039124, + "learning_rate": 4.939317158094284e-06, + "loss": 0.6461, + "step": 1429 + }, + { + "epoch": 0.22565882909894272, + "grad_norm": 0.641244649887085, + "learning_rate": 4.939226174626259e-06, + "loss": 0.5473, + "step": 1430 + }, + { + "epoch": 0.22581663247593498, + "grad_norm": 0.6180683970451355, + "learning_rate": 4.9391351238415335e-06, + "loss": 0.588, + "step": 1431 + }, + { + "epoch": 0.22597443585292726, + "grad_norm": 0.6078048944473267, + "learning_rate": 4.939044005742619e-06, + "loss": 0.6142, + "step": 1432 + }, + { + "epoch": 0.2261322392299195, + "grad_norm": 0.623877227306366, + "learning_rate": 4.938952820332031e-06, + "loss": 0.6371, + "step": 1433 + }, + { + "epoch": 0.2262900426069118, + "grad_norm": 0.6202167868614197, + "learning_rate": 4.938861567612286e-06, + "loss": 0.5893, + "step": 1434 + }, + { + "epoch": 0.22644784598390405, + "grad_norm": 0.6355668902397156, + "learning_rate": 4.9387702475859e-06, + "loss": 0.6101, + "step": 1435 + }, + { + "epoch": 0.22660564936089633, + "grad_norm": 0.5820677280426025, + "learning_rate": 4.9386788602553955e-06, + "loss": 0.6109, + "step": 1436 + }, + { + "epoch": 0.2267634527378886, + "grad_norm": 0.6361674070358276, + "learning_rate": 4.9385874056232955e-06, + "loss": 0.5839, + "step": 1437 + }, + { + "epoch": 0.22692125611488087, + "grad_norm": 0.6418603658676147, + "learning_rate": 4.938495883692122e-06, + "loss": 0.5944, + "step": 1438 + }, + { + "epoch": 0.22707905949187313, + "grad_norm": 0.618431031703949, + "learning_rate": 4.938404294464401e-06, + "loss": 0.6157, + "step": 1439 + }, + { + "epoch": 0.22723686286886538, + "grad_norm": 0.6675086617469788, + "learning_rate": 4.938312637942661e-06, + "loss": 0.6087, + "step": 1440 + }, + { + "epoch": 0.22739466624585766, + "grad_norm": 0.6240835785865784, + "learning_rate": 4.938220914129431e-06, + "loss": 0.5787, + "step": 1441 + }, + { + "epoch": 0.22755246962284992, + "grad_norm": 0.608292281627655, + "learning_rate": 4.9381291230272415e-06, + "loss": 0.6072, + "step": 1442 + }, + { + "epoch": 0.2277102729998422, + "grad_norm": 0.591194212436676, + "learning_rate": 4.938037264638628e-06, + "loss": 0.6182, + "step": 1443 + }, + { + "epoch": 0.22786807637683446, + "grad_norm": 0.6083149909973145, + "learning_rate": 4.937945338966122e-06, + "loss": 0.6131, + "step": 1444 + }, + { + "epoch": 0.22802587975382674, + "grad_norm": 0.6416597366333008, + "learning_rate": 4.937853346012265e-06, + "loss": 0.6684, + "step": 1445 + }, + { + "epoch": 0.228183683130819, + "grad_norm": 0.5609596371650696, + "learning_rate": 4.937761285779592e-06, + "loss": 0.5698, + "step": 1446 + }, + { + "epoch": 0.22834148650781128, + "grad_norm": 0.6055806875228882, + "learning_rate": 4.937669158270645e-06, + "loss": 0.626, + "step": 1447 + }, + { + "epoch": 0.22849928988480353, + "grad_norm": 0.602242648601532, + "learning_rate": 4.937576963487967e-06, + "loss": 0.6369, + "step": 1448 + }, + { + "epoch": 0.2286570932617958, + "grad_norm": 0.6343538165092468, + "learning_rate": 4.937484701434101e-06, + "loss": 0.5937, + "step": 1449 + }, + { + "epoch": 0.22881489663878807, + "grad_norm": 0.6002455949783325, + "learning_rate": 4.937392372111594e-06, + "loss": 0.5975, + "step": 1450 + }, + { + "epoch": 0.22897270001578035, + "grad_norm": 0.6105063557624817, + "learning_rate": 4.937299975522994e-06, + "loss": 0.6001, + "step": 1451 + }, + { + "epoch": 0.2291305033927726, + "grad_norm": 0.5967088341712952, + "learning_rate": 4.937207511670852e-06, + "loss": 0.5979, + "step": 1452 + }, + { + "epoch": 0.22928830676976486, + "grad_norm": 0.636893630027771, + "learning_rate": 4.937114980557719e-06, + "loss": 0.6181, + "step": 1453 + }, + { + "epoch": 0.22944611014675714, + "grad_norm": 0.5922220945358276, + "learning_rate": 4.937022382186148e-06, + "loss": 0.6133, + "step": 1454 + }, + { + "epoch": 0.2296039135237494, + "grad_norm": 0.6224690079689026, + "learning_rate": 4.9369297165586935e-06, + "loss": 0.6317, + "step": 1455 + }, + { + "epoch": 0.22976171690074168, + "grad_norm": 0.5961106419563293, + "learning_rate": 4.936836983677916e-06, + "loss": 0.6333, + "step": 1456 + }, + { + "epoch": 0.22991952027773394, + "grad_norm": 0.592511773109436, + "learning_rate": 4.936744183546372e-06, + "loss": 0.6148, + "step": 1457 + }, + { + "epoch": 0.23007732365472622, + "grad_norm": 0.6077726483345032, + "learning_rate": 4.936651316166625e-06, + "loss": 0.5764, + "step": 1458 + }, + { + "epoch": 0.23023512703171847, + "grad_norm": 0.5971184372901917, + "learning_rate": 4.9365583815412346e-06, + "loss": 0.625, + "step": 1459 + }, + { + "epoch": 0.23039293040871076, + "grad_norm": 0.611920177936554, + "learning_rate": 4.936465379672769e-06, + "loss": 0.6407, + "step": 1460 + }, + { + "epoch": 0.230550733785703, + "grad_norm": 0.6087852120399475, + "learning_rate": 4.9363723105637915e-06, + "loss": 0.6347, + "step": 1461 + }, + { + "epoch": 0.2307085371626953, + "grad_norm": 0.6183415651321411, + "learning_rate": 4.936279174216874e-06, + "loss": 0.6251, + "step": 1462 + }, + { + "epoch": 0.23086634053968755, + "grad_norm": 0.5839583277702332, + "learning_rate": 4.9361859706345835e-06, + "loss": 0.6292, + "step": 1463 + }, + { + "epoch": 0.23102414391667983, + "grad_norm": 0.6049478054046631, + "learning_rate": 4.936092699819495e-06, + "loss": 0.6202, + "step": 1464 + }, + { + "epoch": 0.23118194729367209, + "grad_norm": 0.5721340775489807, + "learning_rate": 4.935999361774181e-06, + "loss": 0.591, + "step": 1465 + }, + { + "epoch": 0.23133975067066434, + "grad_norm": 0.6207857728004456, + "learning_rate": 4.935905956501218e-06, + "loss": 0.6236, + "step": 1466 + }, + { + "epoch": 0.23149755404765662, + "grad_norm": 0.6481266617774963, + "learning_rate": 4.935812484003183e-06, + "loss": 0.6516, + "step": 1467 + }, + { + "epoch": 0.23165535742464888, + "grad_norm": 0.609841525554657, + "learning_rate": 4.935718944282657e-06, + "loss": 0.6459, + "step": 1468 + }, + { + "epoch": 0.23181316080164116, + "grad_norm": 0.6043233871459961, + "learning_rate": 4.9356253373422205e-06, + "loss": 0.6272, + "step": 1469 + }, + { + "epoch": 0.23197096417863342, + "grad_norm": 0.5899990797042847, + "learning_rate": 4.9355316631844564e-06, + "loss": 0.6125, + "step": 1470 + }, + { + "epoch": 0.2321287675556257, + "grad_norm": 0.594575047492981, + "learning_rate": 4.9354379218119506e-06, + "loss": 0.6315, + "step": 1471 + }, + { + "epoch": 0.23228657093261795, + "grad_norm": 0.5769400000572205, + "learning_rate": 4.93534411322729e-06, + "loss": 0.6113, + "step": 1472 + }, + { + "epoch": 0.23244437430961024, + "grad_norm": 0.6353433132171631, + "learning_rate": 4.935250237433063e-06, + "loss": 0.6257, + "step": 1473 + }, + { + "epoch": 0.2326021776866025, + "grad_norm": 0.5873032212257385, + "learning_rate": 4.935156294431862e-06, + "loss": 0.645, + "step": 1474 + }, + { + "epoch": 0.23275998106359477, + "grad_norm": 0.6120390892028809, + "learning_rate": 4.935062284226278e-06, + "loss": 0.616, + "step": 1475 + }, + { + "epoch": 0.23291778444058703, + "grad_norm": 0.5956204533576965, + "learning_rate": 4.934968206818905e-06, + "loss": 0.5912, + "step": 1476 + }, + { + "epoch": 0.23307558781757928, + "grad_norm": 0.6181733012199402, + "learning_rate": 4.93487406221234e-06, + "loss": 0.6589, + "step": 1477 + }, + { + "epoch": 0.23323339119457157, + "grad_norm": 0.5858859419822693, + "learning_rate": 4.934779850409182e-06, + "loss": 0.6267, + "step": 1478 + }, + { + "epoch": 0.23339119457156382, + "grad_norm": 0.5987781882286072, + "learning_rate": 4.934685571412031e-06, + "loss": 0.6208, + "step": 1479 + }, + { + "epoch": 0.2335489979485561, + "grad_norm": 0.5857381820678711, + "learning_rate": 4.934591225223487e-06, + "loss": 0.6005, + "step": 1480 + }, + { + "epoch": 0.23370680132554836, + "grad_norm": 0.6205964684486389, + "learning_rate": 4.934496811846156e-06, + "loss": 0.5804, + "step": 1481 + }, + { + "epoch": 0.23386460470254064, + "grad_norm": 0.5801072120666504, + "learning_rate": 4.934402331282642e-06, + "loss": 0.6244, + "step": 1482 + }, + { + "epoch": 0.2340224080795329, + "grad_norm": 0.6085876226425171, + "learning_rate": 4.934307783535552e-06, + "loss": 0.6127, + "step": 1483 + }, + { + "epoch": 0.23418021145652518, + "grad_norm": 0.5876258611679077, + "learning_rate": 4.9342131686074976e-06, + "loss": 0.5945, + "step": 1484 + }, + { + "epoch": 0.23433801483351743, + "grad_norm": 0.6047869324684143, + "learning_rate": 4.934118486501088e-06, + "loss": 0.6361, + "step": 1485 + }, + { + "epoch": 0.23449581821050972, + "grad_norm": 0.5996703505516052, + "learning_rate": 4.934023737218937e-06, + "loss": 0.5926, + "step": 1486 + }, + { + "epoch": 0.23465362158750197, + "grad_norm": 0.6068390607833862, + "learning_rate": 4.933928920763659e-06, + "loss": 0.5612, + "step": 1487 + }, + { + "epoch": 0.23481142496449425, + "grad_norm": 0.6007887125015259, + "learning_rate": 4.933834037137871e-06, + "loss": 0.6076, + "step": 1488 + }, + { + "epoch": 0.2349692283414865, + "grad_norm": 0.6340495944023132, + "learning_rate": 4.933739086344191e-06, + "loss": 0.6207, + "step": 1489 + }, + { + "epoch": 0.23512703171847876, + "grad_norm": 0.6082830429077148, + "learning_rate": 4.933644068385241e-06, + "loss": 0.6234, + "step": 1490 + }, + { + "epoch": 0.23528483509547105, + "grad_norm": 0.6191714406013489, + "learning_rate": 4.933548983263641e-06, + "loss": 0.5982, + "step": 1491 + }, + { + "epoch": 0.2354426384724633, + "grad_norm": 0.5704101920127869, + "learning_rate": 4.933453830982017e-06, + "loss": 0.5949, + "step": 1492 + }, + { + "epoch": 0.23560044184945558, + "grad_norm": 0.6148662567138672, + "learning_rate": 4.933358611542994e-06, + "loss": 0.6237, + "step": 1493 + }, + { + "epoch": 0.23575824522644784, + "grad_norm": 0.6315892338752747, + "learning_rate": 4.9332633249492e-06, + "loss": 0.5957, + "step": 1494 + }, + { + "epoch": 0.23591604860344012, + "grad_norm": 0.6323703527450562, + "learning_rate": 4.933167971203265e-06, + "loss": 0.5964, + "step": 1495 + }, + { + "epoch": 0.23607385198043238, + "grad_norm": 0.6344191431999207, + "learning_rate": 4.933072550307819e-06, + "loss": 0.6216, + "step": 1496 + }, + { + "epoch": 0.23623165535742466, + "grad_norm": 0.6072808504104614, + "learning_rate": 4.932977062265498e-06, + "loss": 0.6127, + "step": 1497 + }, + { + "epoch": 0.2363894587344169, + "grad_norm": 0.6118766069412231, + "learning_rate": 4.932881507078936e-06, + "loss": 0.6505, + "step": 1498 + }, + { + "epoch": 0.2365472621114092, + "grad_norm": 0.6086880564689636, + "learning_rate": 4.932785884750769e-06, + "loss": 0.5983, + "step": 1499 + }, + { + "epoch": 0.23670506548840145, + "grad_norm": 0.5931991338729858, + "learning_rate": 4.932690195283638e-06, + "loss": 0.5906, + "step": 1500 + }, + { + "epoch": 0.23686286886539373, + "grad_norm": 0.573767364025116, + "learning_rate": 4.932594438680182e-06, + "loss": 0.62, + "step": 1501 + }, + { + "epoch": 0.237020672242386, + "grad_norm": 0.6035709381103516, + "learning_rate": 4.932498614943045e-06, + "loss": 0.6368, + "step": 1502 + }, + { + "epoch": 0.23717847561937824, + "grad_norm": 0.5609701871871948, + "learning_rate": 4.93240272407487e-06, + "loss": 0.6044, + "step": 1503 + }, + { + "epoch": 0.23733627899637053, + "grad_norm": 0.6214385032653809, + "learning_rate": 4.932306766078304e-06, + "loss": 0.582, + "step": 1504 + }, + { + "epoch": 0.23749408237336278, + "grad_norm": 0.5750260949134827, + "learning_rate": 4.932210740955997e-06, + "loss": 0.6358, + "step": 1505 + }, + { + "epoch": 0.23765188575035506, + "grad_norm": 0.6199482679367065, + "learning_rate": 4.932114648710597e-06, + "loss": 0.5707, + "step": 1506 + }, + { + "epoch": 0.23780968912734732, + "grad_norm": 0.6272065043449402, + "learning_rate": 4.932018489344755e-06, + "loss": 0.6481, + "step": 1507 + }, + { + "epoch": 0.2379674925043396, + "grad_norm": 0.6680588722229004, + "learning_rate": 4.931922262861127e-06, + "loss": 0.6291, + "step": 1508 + }, + { + "epoch": 0.23812529588133186, + "grad_norm": 0.6249493360519409, + "learning_rate": 4.931825969262369e-06, + "loss": 0.6257, + "step": 1509 + }, + { + "epoch": 0.23828309925832414, + "grad_norm": 0.636210560798645, + "learning_rate": 4.931729608551136e-06, + "loss": 0.5934, + "step": 1510 + }, + { + "epoch": 0.2384409026353164, + "grad_norm": 0.5831694006919861, + "learning_rate": 4.931633180730089e-06, + "loss": 0.6176, + "step": 1511 + }, + { + "epoch": 0.23859870601230868, + "grad_norm": 0.6144446730613708, + "learning_rate": 4.931536685801888e-06, + "loss": 0.6595, + "step": 1512 + }, + { + "epoch": 0.23875650938930093, + "grad_norm": 0.5948143601417542, + "learning_rate": 4.931440123769198e-06, + "loss": 0.6391, + "step": 1513 + }, + { + "epoch": 0.23891431276629319, + "grad_norm": 0.6019822955131531, + "learning_rate": 4.931343494634682e-06, + "loss": 0.5932, + "step": 1514 + }, + { + "epoch": 0.23907211614328547, + "grad_norm": 0.5933119654655457, + "learning_rate": 4.9312467984010075e-06, + "loss": 0.6137, + "step": 1515 + }, + { + "epoch": 0.23922991952027772, + "grad_norm": 0.6075839996337891, + "learning_rate": 4.931150035070842e-06, + "loss": 0.6219, + "step": 1516 + }, + { + "epoch": 0.23938772289727, + "grad_norm": 0.6027563810348511, + "learning_rate": 4.931053204646858e-06, + "loss": 0.6327, + "step": 1517 + }, + { + "epoch": 0.23954552627426226, + "grad_norm": 0.5920613408088684, + "learning_rate": 4.930956307131726e-06, + "loss": 0.6189, + "step": 1518 + }, + { + "epoch": 0.23970332965125454, + "grad_norm": 0.5765639543533325, + "learning_rate": 4.9308593425281214e-06, + "loss": 0.6367, + "step": 1519 + }, + { + "epoch": 0.2398611330282468, + "grad_norm": 0.6040428876876831, + "learning_rate": 4.9307623108387195e-06, + "loss": 0.614, + "step": 1520 + }, + { + "epoch": 0.24001893640523908, + "grad_norm": 0.5974823832511902, + "learning_rate": 4.930665212066198e-06, + "loss": 0.5981, + "step": 1521 + }, + { + "epoch": 0.24017673978223134, + "grad_norm": 0.6213223934173584, + "learning_rate": 4.9305680462132365e-06, + "loss": 0.6382, + "step": 1522 + }, + { + "epoch": 0.24033454315922362, + "grad_norm": 0.6209188103675842, + "learning_rate": 4.930470813282517e-06, + "loss": 0.6267, + "step": 1523 + }, + { + "epoch": 0.24049234653621587, + "grad_norm": 0.6329535841941833, + "learning_rate": 4.930373513276724e-06, + "loss": 0.5993, + "step": 1524 + }, + { + "epoch": 0.24065014991320816, + "grad_norm": 0.5656059980392456, + "learning_rate": 4.930276146198541e-06, + "loss": 0.5975, + "step": 1525 + }, + { + "epoch": 0.2408079532902004, + "grad_norm": 0.5725824236869812, + "learning_rate": 4.930178712050655e-06, + "loss": 0.5882, + "step": 1526 + }, + { + "epoch": 0.24096575666719267, + "grad_norm": 0.6170440912246704, + "learning_rate": 4.930081210835755e-06, + "loss": 0.6443, + "step": 1527 + }, + { + "epoch": 0.24112356004418495, + "grad_norm": 0.6038187742233276, + "learning_rate": 4.929983642556533e-06, + "loss": 0.6098, + "step": 1528 + }, + { + "epoch": 0.2412813634211772, + "grad_norm": 0.580174446105957, + "learning_rate": 4.9298860072156815e-06, + "loss": 0.5875, + "step": 1529 + }, + { + "epoch": 0.24143916679816949, + "grad_norm": 0.581500232219696, + "learning_rate": 4.929788304815893e-06, + "loss": 0.5898, + "step": 1530 + }, + { + "epoch": 0.24159697017516174, + "grad_norm": 0.6288472414016724, + "learning_rate": 4.929690535359867e-06, + "loss": 0.5801, + "step": 1531 + }, + { + "epoch": 0.24175477355215402, + "grad_norm": 0.6218459606170654, + "learning_rate": 4.9295926988503e-06, + "loss": 0.5839, + "step": 1532 + }, + { + "epoch": 0.24191257692914628, + "grad_norm": 0.6569736003875732, + "learning_rate": 4.929494795289891e-06, + "loss": 0.6237, + "step": 1533 + }, + { + "epoch": 0.24207038030613856, + "grad_norm": 0.6220338940620422, + "learning_rate": 4.929396824681343e-06, + "loss": 0.5842, + "step": 1534 + }, + { + "epoch": 0.24222818368313082, + "grad_norm": 0.5882163643836975, + "learning_rate": 4.92929878702736e-06, + "loss": 0.5906, + "step": 1535 + }, + { + "epoch": 0.2423859870601231, + "grad_norm": 0.5989025831222534, + "learning_rate": 4.9292006823306485e-06, + "loss": 0.6278, + "step": 1536 + }, + { + "epoch": 0.24254379043711535, + "grad_norm": 0.6430116295814514, + "learning_rate": 4.929102510593913e-06, + "loss": 0.5782, + "step": 1537 + }, + { + "epoch": 0.24270159381410764, + "grad_norm": 0.5879490971565247, + "learning_rate": 4.929004271819866e-06, + "loss": 0.6035, + "step": 1538 + }, + { + "epoch": 0.2428593971910999, + "grad_norm": 0.6337499022483826, + "learning_rate": 4.928905966011216e-06, + "loss": 0.6182, + "step": 1539 + }, + { + "epoch": 0.24301720056809215, + "grad_norm": 0.5805697441101074, + "learning_rate": 4.928807593170678e-06, + "loss": 0.5915, + "step": 1540 + }, + { + "epoch": 0.24317500394508443, + "grad_norm": 0.6068822741508484, + "learning_rate": 4.928709153300966e-06, + "loss": 0.5942, + "step": 1541 + }, + { + "epoch": 0.24333280732207668, + "grad_norm": 0.57672518491745, + "learning_rate": 4.928610646404797e-06, + "loss": 0.6242, + "step": 1542 + }, + { + "epoch": 0.24349061069906897, + "grad_norm": 0.5783392786979675, + "learning_rate": 4.928512072484889e-06, + "loss": 0.5945, + "step": 1543 + }, + { + "epoch": 0.24364841407606122, + "grad_norm": 0.579450786113739, + "learning_rate": 4.928413431543962e-06, + "loss": 0.6132, + "step": 1544 + }, + { + "epoch": 0.2438062174530535, + "grad_norm": 0.6011279225349426, + "learning_rate": 4.92831472358474e-06, + "loss": 0.6249, + "step": 1545 + }, + { + "epoch": 0.24396402083004576, + "grad_norm": 0.6351925134658813, + "learning_rate": 4.928215948609945e-06, + "loss": 0.622, + "step": 1546 + }, + { + "epoch": 0.24412182420703804, + "grad_norm": 0.593647301197052, + "learning_rate": 4.928117106622305e-06, + "loss": 0.5874, + "step": 1547 + }, + { + "epoch": 0.2442796275840303, + "grad_norm": 0.6137877702713013, + "learning_rate": 4.928018197624548e-06, + "loss": 0.5934, + "step": 1548 + }, + { + "epoch": 0.24443743096102258, + "grad_norm": 0.611279308795929, + "learning_rate": 4.9279192216194005e-06, + "loss": 0.6131, + "step": 1549 + }, + { + "epoch": 0.24459523433801483, + "grad_norm": 0.5754262804985046, + "learning_rate": 4.927820178609597e-06, + "loss": 0.6034, + "step": 1550 + }, + { + "epoch": 0.2447530377150071, + "grad_norm": 0.5891356468200684, + "learning_rate": 4.92772106859787e-06, + "loss": 0.6479, + "step": 1551 + }, + { + "epoch": 0.24491084109199937, + "grad_norm": 0.5998702645301819, + "learning_rate": 4.927621891586953e-06, + "loss": 0.5921, + "step": 1552 + }, + { + "epoch": 0.24506864446899163, + "grad_norm": 0.6070131659507751, + "learning_rate": 4.927522647579586e-06, + "loss": 0.6011, + "step": 1553 + }, + { + "epoch": 0.2452264478459839, + "grad_norm": 0.639409601688385, + "learning_rate": 4.927423336578505e-06, + "loss": 0.5965, + "step": 1554 + }, + { + "epoch": 0.24538425122297616, + "grad_norm": 0.6008479595184326, + "learning_rate": 4.927323958586454e-06, + "loss": 0.633, + "step": 1555 + }, + { + "epoch": 0.24554205459996845, + "grad_norm": 0.6055286526679993, + "learning_rate": 4.927224513606173e-06, + "loss": 0.6215, + "step": 1556 + }, + { + "epoch": 0.2456998579769607, + "grad_norm": 0.6406872868537903, + "learning_rate": 4.927125001640407e-06, + "loss": 0.6067, + "step": 1557 + }, + { + "epoch": 0.24585766135395298, + "grad_norm": 0.6204764246940613, + "learning_rate": 4.927025422691902e-06, + "loss": 0.6485, + "step": 1558 + }, + { + "epoch": 0.24601546473094524, + "grad_norm": 0.6369426250457764, + "learning_rate": 4.9269257767634085e-06, + "loss": 0.6285, + "step": 1559 + }, + { + "epoch": 0.24617326810793752, + "grad_norm": 0.5984443426132202, + "learning_rate": 4.926826063857673e-06, + "loss": 0.6513, + "step": 1560 + }, + { + "epoch": 0.24633107148492978, + "grad_norm": 0.5965696573257446, + "learning_rate": 4.926726283977449e-06, + "loss": 0.5837, + "step": 1561 + }, + { + "epoch": 0.24648887486192206, + "grad_norm": 0.6105973124504089, + "learning_rate": 4.926626437125491e-06, + "loss": 0.6362, + "step": 1562 + }, + { + "epoch": 0.2466466782389143, + "grad_norm": 0.6111522912979126, + "learning_rate": 4.926526523304553e-06, + "loss": 0.6089, + "step": 1563 + }, + { + "epoch": 0.24680448161590657, + "grad_norm": 0.5827422142028809, + "learning_rate": 4.926426542517394e-06, + "loss": 0.6098, + "step": 1564 + }, + { + "epoch": 0.24696228499289885, + "grad_norm": 0.6102567315101624, + "learning_rate": 4.926326494766771e-06, + "loss": 0.5986, + "step": 1565 + }, + { + "epoch": 0.2471200883698911, + "grad_norm": 0.5955950617790222, + "learning_rate": 4.9262263800554475e-06, + "loss": 0.629, + "step": 1566 + }, + { + "epoch": 0.2472778917468834, + "grad_norm": 0.5608857870101929, + "learning_rate": 4.926126198386184e-06, + "loss": 0.5912, + "step": 1567 + }, + { + "epoch": 0.24743569512387564, + "grad_norm": 0.6075494885444641, + "learning_rate": 4.926025949761748e-06, + "loss": 0.6107, + "step": 1568 + }, + { + "epoch": 0.24759349850086793, + "grad_norm": 0.579353392124176, + "learning_rate": 4.925925634184904e-06, + "loss": 0.6211, + "step": 1569 + }, + { + "epoch": 0.24775130187786018, + "grad_norm": 0.6046282649040222, + "learning_rate": 4.9258252516584205e-06, + "loss": 0.6055, + "step": 1570 + }, + { + "epoch": 0.24790910525485246, + "grad_norm": 0.6060624718666077, + "learning_rate": 4.925724802185069e-06, + "loss": 0.596, + "step": 1571 + }, + { + "epoch": 0.24806690863184472, + "grad_norm": 0.5908854007720947, + "learning_rate": 4.925624285767622e-06, + "loss": 0.6083, + "step": 1572 + }, + { + "epoch": 0.248224712008837, + "grad_norm": 0.5902642011642456, + "learning_rate": 4.925523702408851e-06, + "loss": 0.6181, + "step": 1573 + }, + { + "epoch": 0.24838251538582926, + "grad_norm": 0.6237324476242065, + "learning_rate": 4.925423052111534e-06, + "loss": 0.5862, + "step": 1574 + }, + { + "epoch": 0.24854031876282154, + "grad_norm": 0.6011880040168762, + "learning_rate": 4.9253223348784486e-06, + "loss": 0.6214, + "step": 1575 + }, + { + "epoch": 0.2486981221398138, + "grad_norm": 0.6219435334205627, + "learning_rate": 4.925221550712374e-06, + "loss": 0.6245, + "step": 1576 + }, + { + "epoch": 0.24885592551680605, + "grad_norm": 0.5956230163574219, + "learning_rate": 4.925120699616091e-06, + "loss": 0.6217, + "step": 1577 + }, + { + "epoch": 0.24901372889379833, + "grad_norm": 0.6513617038726807, + "learning_rate": 4.925019781592384e-06, + "loss": 0.5974, + "step": 1578 + }, + { + "epoch": 0.24917153227079059, + "grad_norm": 0.6077998876571655, + "learning_rate": 4.924918796644037e-06, + "loss": 0.595, + "step": 1579 + }, + { + "epoch": 0.24932933564778287, + "grad_norm": 0.608784019947052, + "learning_rate": 4.924817744773837e-06, + "loss": 0.61, + "step": 1580 + }, + { + "epoch": 0.24948713902477512, + "grad_norm": 0.6229702234268188, + "learning_rate": 4.924716625984573e-06, + "loss": 0.6287, + "step": 1581 + }, + { + "epoch": 0.2496449424017674, + "grad_norm": 0.6116113066673279, + "learning_rate": 4.924615440279037e-06, + "loss": 0.6115, + "step": 1582 + }, + { + "epoch": 0.24980274577875966, + "grad_norm": 0.6535546779632568, + "learning_rate": 4.92451418766002e-06, + "loss": 0.597, + "step": 1583 + }, + { + "epoch": 0.24996054915575194, + "grad_norm": 0.6077874898910522, + "learning_rate": 4.924412868130316e-06, + "loss": 0.6054, + "step": 1584 + }, + { + "epoch": 0.2501183525327442, + "grad_norm": 0.6208374500274658, + "learning_rate": 4.924311481692723e-06, + "loss": 0.6246, + "step": 1585 + }, + { + "epoch": 0.25027615590973645, + "grad_norm": 0.6305823922157288, + "learning_rate": 4.924210028350037e-06, + "loss": 0.621, + "step": 1586 + }, + { + "epoch": 0.25043395928672874, + "grad_norm": 0.5978865623474121, + "learning_rate": 4.924108508105058e-06, + "loss": 0.6213, + "step": 1587 + }, + { + "epoch": 0.250591762663721, + "grad_norm": 0.6404411196708679, + "learning_rate": 4.924006920960589e-06, + "loss": 0.6112, + "step": 1588 + }, + { + "epoch": 0.25074956604071325, + "grad_norm": 0.6473909020423889, + "learning_rate": 4.923905266919433e-06, + "loss": 0.5819, + "step": 1589 + }, + { + "epoch": 0.25090736941770553, + "grad_norm": 0.5857617855072021, + "learning_rate": 4.923803545984396e-06, + "loss": 0.6131, + "step": 1590 + }, + { + "epoch": 0.2510651727946978, + "grad_norm": 0.5925673842430115, + "learning_rate": 4.923701758158283e-06, + "loss": 0.6098, + "step": 1591 + }, + { + "epoch": 0.2512229761716901, + "grad_norm": 0.6611897945404053, + "learning_rate": 4.923599903443906e-06, + "loss": 0.6281, + "step": 1592 + }, + { + "epoch": 0.2513807795486823, + "grad_norm": 0.6464869379997253, + "learning_rate": 4.923497981844074e-06, + "loss": 0.6318, + "step": 1593 + }, + { + "epoch": 0.2515385829256746, + "grad_norm": 0.6133266687393188, + "learning_rate": 4.9233959933616e-06, + "loss": 0.6574, + "step": 1594 + }, + { + "epoch": 0.2516963863026669, + "grad_norm": 0.6338121891021729, + "learning_rate": 4.923293937999301e-06, + "loss": 0.6218, + "step": 1595 + }, + { + "epoch": 0.25185418967965917, + "grad_norm": 0.6440802812576294, + "learning_rate": 4.92319181575999e-06, + "loss": 0.6155, + "step": 1596 + }, + { + "epoch": 0.2520119930566514, + "grad_norm": 0.6025655269622803, + "learning_rate": 4.923089626646487e-06, + "loss": 0.6314, + "step": 1597 + }, + { + "epoch": 0.2521697964336437, + "grad_norm": 0.5841513276100159, + "learning_rate": 4.922987370661612e-06, + "loss": 0.5777, + "step": 1598 + }, + { + "epoch": 0.25232759981063596, + "grad_norm": 0.609203577041626, + "learning_rate": 4.922885047808187e-06, + "loss": 0.58, + "step": 1599 + }, + { + "epoch": 0.2524854031876282, + "grad_norm": 0.5628113746643066, + "learning_rate": 4.922782658089037e-06, + "loss": 0.6306, + "step": 1600 + }, + { + "epoch": 0.25264320656462047, + "grad_norm": 0.5859259963035583, + "learning_rate": 4.922680201506986e-06, + "loss": 0.5875, + "step": 1601 + }, + { + "epoch": 0.25280100994161275, + "grad_norm": 0.6324567794799805, + "learning_rate": 4.922577678064861e-06, + "loss": 0.6054, + "step": 1602 + }, + { + "epoch": 0.25295881331860504, + "grad_norm": 0.5935527682304382, + "learning_rate": 4.922475087765494e-06, + "loss": 0.6367, + "step": 1603 + }, + { + "epoch": 0.25311661669559726, + "grad_norm": 0.592417299747467, + "learning_rate": 4.9223724306117135e-06, + "loss": 0.6318, + "step": 1604 + }, + { + "epoch": 0.25327442007258955, + "grad_norm": 0.623104453086853, + "learning_rate": 4.922269706606355e-06, + "loss": 0.5982, + "step": 1605 + }, + { + "epoch": 0.25343222344958183, + "grad_norm": 0.5884117484092712, + "learning_rate": 4.922166915752252e-06, + "loss": 0.6262, + "step": 1606 + }, + { + "epoch": 0.2535900268265741, + "grad_norm": 0.6042266488075256, + "learning_rate": 4.922064058052241e-06, + "loss": 0.6086, + "step": 1607 + }, + { + "epoch": 0.25374783020356634, + "grad_norm": 0.5927168726921082, + "learning_rate": 4.9219611335091625e-06, + "loss": 0.6462, + "step": 1608 + }, + { + "epoch": 0.2539056335805586, + "grad_norm": 0.5989639759063721, + "learning_rate": 4.921858142125855e-06, + "loss": 0.6132, + "step": 1609 + }, + { + "epoch": 0.2540634369575509, + "grad_norm": 0.6116414070129395, + "learning_rate": 4.921755083905161e-06, + "loss": 0.6392, + "step": 1610 + }, + { + "epoch": 0.2542212403345432, + "grad_norm": 0.589184045791626, + "learning_rate": 4.921651958849926e-06, + "loss": 0.5788, + "step": 1611 + }, + { + "epoch": 0.2543790437115354, + "grad_norm": 0.6054829955101013, + "learning_rate": 4.921548766962995e-06, + "loss": 0.6491, + "step": 1612 + }, + { + "epoch": 0.2545368470885277, + "grad_norm": 0.5884580612182617, + "learning_rate": 4.921445508247215e-06, + "loss": 0.6226, + "step": 1613 + }, + { + "epoch": 0.25469465046552, + "grad_norm": 0.5924122929573059, + "learning_rate": 4.921342182705438e-06, + "loss": 0.6048, + "step": 1614 + }, + { + "epoch": 0.2548524538425122, + "grad_norm": 0.6083982586860657, + "learning_rate": 4.921238790340515e-06, + "loss": 0.6213, + "step": 1615 + }, + { + "epoch": 0.2550102572195045, + "grad_norm": 0.6080753803253174, + "learning_rate": 4.9211353311552964e-06, + "loss": 0.6354, + "step": 1616 + }, + { + "epoch": 0.25516806059649677, + "grad_norm": 0.6434323191642761, + "learning_rate": 4.9210318051526405e-06, + "loss": 0.5778, + "step": 1617 + }, + { + "epoch": 0.25532586397348905, + "grad_norm": 0.6179301142692566, + "learning_rate": 4.920928212335404e-06, + "loss": 0.6175, + "step": 1618 + }, + { + "epoch": 0.2554836673504813, + "grad_norm": 0.5887150168418884, + "learning_rate": 4.920824552706445e-06, + "loss": 0.6427, + "step": 1619 + }, + { + "epoch": 0.25564147072747356, + "grad_norm": 0.6283835172653198, + "learning_rate": 4.920720826268625e-06, + "loss": 0.5976, + "step": 1620 + }, + { + "epoch": 0.25579927410446585, + "grad_norm": 0.6090170741081238, + "learning_rate": 4.920617033024805e-06, + "loss": 0.65, + "step": 1621 + }, + { + "epoch": 0.25595707748145813, + "grad_norm": 0.6104285717010498, + "learning_rate": 4.920513172977851e-06, + "loss": 0.6069, + "step": 1622 + }, + { + "epoch": 0.25611488085845036, + "grad_norm": 0.5787195563316345, + "learning_rate": 4.92040924613063e-06, + "loss": 0.6108, + "step": 1623 + }, + { + "epoch": 0.25627268423544264, + "grad_norm": 0.5814347863197327, + "learning_rate": 4.920305252486007e-06, + "loss": 0.6298, + "step": 1624 + }, + { + "epoch": 0.2564304876124349, + "grad_norm": 0.6076638102531433, + "learning_rate": 4.920201192046855e-06, + "loss": 0.6219, + "step": 1625 + }, + { + "epoch": 0.25658829098942715, + "grad_norm": 0.6313490271568298, + "learning_rate": 4.920097064816045e-06, + "loss": 0.5786, + "step": 1626 + }, + { + "epoch": 0.25674609436641943, + "grad_norm": 0.5937352180480957, + "learning_rate": 4.919992870796451e-06, + "loss": 0.5968, + "step": 1627 + }, + { + "epoch": 0.2569038977434117, + "grad_norm": 0.6051065325737, + "learning_rate": 4.919888609990947e-06, + "loss": 0.5669, + "step": 1628 + }, + { + "epoch": 0.257061701120404, + "grad_norm": 0.6060165166854858, + "learning_rate": 4.919784282402411e-06, + "loss": 0.6229, + "step": 1629 + }, + { + "epoch": 0.2572195044973962, + "grad_norm": 0.648403525352478, + "learning_rate": 4.919679888033724e-06, + "loss": 0.6358, + "step": 1630 + }, + { + "epoch": 0.2573773078743885, + "grad_norm": 0.6336390972137451, + "learning_rate": 4.919575426887764e-06, + "loss": 0.5983, + "step": 1631 + }, + { + "epoch": 0.2575351112513808, + "grad_norm": 0.6123771667480469, + "learning_rate": 4.919470898967416e-06, + "loss": 0.63, + "step": 1632 + }, + { + "epoch": 0.25769291462837307, + "grad_norm": 0.6285672783851624, + "learning_rate": 4.919366304275564e-06, + "loss": 0.6061, + "step": 1633 + }, + { + "epoch": 0.2578507180053653, + "grad_norm": 0.6183193922042847, + "learning_rate": 4.919261642815096e-06, + "loss": 0.6436, + "step": 1634 + }, + { + "epoch": 0.2580085213823576, + "grad_norm": 0.5999541282653809, + "learning_rate": 4.919156914588897e-06, + "loss": 0.6103, + "step": 1635 + }, + { + "epoch": 0.25816632475934986, + "grad_norm": 0.6471021771430969, + "learning_rate": 4.91905211959986e-06, + "loss": 0.6059, + "step": 1636 + }, + { + "epoch": 0.2583241281363421, + "grad_norm": 0.5892012715339661, + "learning_rate": 4.918947257850876e-06, + "loss": 0.5754, + "step": 1637 + }, + { + "epoch": 0.2584819315133344, + "grad_norm": 0.6440108418464661, + "learning_rate": 4.91884232934484e-06, + "loss": 0.5895, + "step": 1638 + }, + { + "epoch": 0.25863973489032666, + "grad_norm": 0.6636278033256531, + "learning_rate": 4.918737334084647e-06, + "loss": 0.6006, + "step": 1639 + }, + { + "epoch": 0.25879753826731894, + "grad_norm": 0.608349084854126, + "learning_rate": 4.918632272073195e-06, + "loss": 0.6015, + "step": 1640 + }, + { + "epoch": 0.25895534164431117, + "grad_norm": 0.6104540228843689, + "learning_rate": 4.918527143313382e-06, + "loss": 0.6114, + "step": 1641 + }, + { + "epoch": 0.25911314502130345, + "grad_norm": 0.6491744518280029, + "learning_rate": 4.918421947808112e-06, + "loss": 0.604, + "step": 1642 + }, + { + "epoch": 0.25927094839829573, + "grad_norm": 0.6084750294685364, + "learning_rate": 4.9183166855602855e-06, + "loss": 0.6414, + "step": 1643 + }, + { + "epoch": 0.259428751775288, + "grad_norm": 0.6286389827728271, + "learning_rate": 4.9182113565728085e-06, + "loss": 0.5753, + "step": 1644 + }, + { + "epoch": 0.25958655515228024, + "grad_norm": 0.6276134848594666, + "learning_rate": 4.918105960848589e-06, + "loss": 0.6279, + "step": 1645 + }, + { + "epoch": 0.2597443585292725, + "grad_norm": 0.6356462836265564, + "learning_rate": 4.918000498390534e-06, + "loss": 0.6049, + "step": 1646 + }, + { + "epoch": 0.2599021619062648, + "grad_norm": 0.6146996021270752, + "learning_rate": 4.917894969201555e-06, + "loss": 0.6237, + "step": 1647 + }, + { + "epoch": 0.2600599652832571, + "grad_norm": 0.5981345772743225, + "learning_rate": 4.9177893732845625e-06, + "loss": 0.6144, + "step": 1648 + }, + { + "epoch": 0.2602177686602493, + "grad_norm": 0.5547905564308167, + "learning_rate": 4.917683710642473e-06, + "loss": 0.5757, + "step": 1649 + }, + { + "epoch": 0.2603755720372416, + "grad_norm": 0.6139596104621887, + "learning_rate": 4.917577981278202e-06, + "loss": 0.6036, + "step": 1650 + }, + { + "epoch": 0.2605333754142339, + "grad_norm": 0.6075848340988159, + "learning_rate": 4.917472185194666e-06, + "loss": 0.6067, + "step": 1651 + }, + { + "epoch": 0.2606911787912261, + "grad_norm": 0.6007901430130005, + "learning_rate": 4.917366322394786e-06, + "loss": 0.6176, + "step": 1652 + }, + { + "epoch": 0.2608489821682184, + "grad_norm": 0.5811975598335266, + "learning_rate": 4.917260392881484e-06, + "loss": 0.6056, + "step": 1653 + }, + { + "epoch": 0.2610067855452107, + "grad_norm": 0.6658692359924316, + "learning_rate": 4.917154396657683e-06, + "loss": 0.6172, + "step": 1654 + }, + { + "epoch": 0.26116458892220296, + "grad_norm": 0.594546377658844, + "learning_rate": 4.917048333726307e-06, + "loss": 0.5903, + "step": 1655 + }, + { + "epoch": 0.2613223922991952, + "grad_norm": 0.6801757216453552, + "learning_rate": 4.916942204090284e-06, + "loss": 0.6077, + "step": 1656 + }, + { + "epoch": 0.26148019567618747, + "grad_norm": 0.591001570224762, + "learning_rate": 4.916836007752544e-06, + "loss": 0.5892, + "step": 1657 + }, + { + "epoch": 0.26163799905317975, + "grad_norm": 0.6046202778816223, + "learning_rate": 4.916729744716016e-06, + "loss": 0.6034, + "step": 1658 + }, + { + "epoch": 0.26179580243017203, + "grad_norm": 0.5677275657653809, + "learning_rate": 4.916623414983632e-06, + "loss": 0.6201, + "step": 1659 + }, + { + "epoch": 0.26195360580716426, + "grad_norm": 0.6119646430015564, + "learning_rate": 4.91651701855833e-06, + "loss": 0.5961, + "step": 1660 + }, + { + "epoch": 0.26211140918415654, + "grad_norm": 0.5533767938613892, + "learning_rate": 4.916410555443043e-06, + "loss": 0.6232, + "step": 1661 + }, + { + "epoch": 0.2622692125611488, + "grad_norm": 0.6152979135513306, + "learning_rate": 4.91630402564071e-06, + "loss": 0.6174, + "step": 1662 + }, + { + "epoch": 0.26242701593814105, + "grad_norm": 0.585414707660675, + "learning_rate": 4.9161974291542705e-06, + "loss": 0.5859, + "step": 1663 + }, + { + "epoch": 0.26258481931513333, + "grad_norm": 0.6196632981300354, + "learning_rate": 4.916090765986668e-06, + "loss": 0.5791, + "step": 1664 + }, + { + "epoch": 0.2627426226921256, + "grad_norm": 0.6013092994689941, + "learning_rate": 4.915984036140844e-06, + "loss": 0.5805, + "step": 1665 + }, + { + "epoch": 0.2629004260691179, + "grad_norm": 0.6330252289772034, + "learning_rate": 4.915877239619746e-06, + "loss": 0.6104, + "step": 1666 + }, + { + "epoch": 0.2630582294461101, + "grad_norm": 0.6712751388549805, + "learning_rate": 4.91577037642632e-06, + "loss": 0.5738, + "step": 1667 + }, + { + "epoch": 0.2632160328231024, + "grad_norm": 0.5798711776733398, + "learning_rate": 4.915663446563516e-06, + "loss": 0.6361, + "step": 1668 + }, + { + "epoch": 0.2633738362000947, + "grad_norm": 0.5703718066215515, + "learning_rate": 4.915556450034283e-06, + "loss": 0.6344, + "step": 1669 + }, + { + "epoch": 0.263531639577087, + "grad_norm": 0.5893207788467407, + "learning_rate": 4.915449386841576e-06, + "loss": 0.5953, + "step": 1670 + }, + { + "epoch": 0.2636894429540792, + "grad_norm": 0.5858250856399536, + "learning_rate": 4.91534225698835e-06, + "loss": 0.6061, + "step": 1671 + }, + { + "epoch": 0.2638472463310715, + "grad_norm": 0.5891169309616089, + "learning_rate": 4.91523506047756e-06, + "loss": 0.6101, + "step": 1672 + }, + { + "epoch": 0.26400504970806377, + "grad_norm": 0.586290180683136, + "learning_rate": 4.915127797312164e-06, + "loss": 0.6093, + "step": 1673 + }, + { + "epoch": 0.264162853085056, + "grad_norm": 0.6410964727401733, + "learning_rate": 4.915020467495125e-06, + "loss": 0.585, + "step": 1674 + }, + { + "epoch": 0.2643206564620483, + "grad_norm": 0.5775142908096313, + "learning_rate": 4.914913071029402e-06, + "loss": 0.5895, + "step": 1675 + }, + { + "epoch": 0.26447845983904056, + "grad_norm": 0.5964720249176025, + "learning_rate": 4.9148056079179606e-06, + "loss": 0.5911, + "step": 1676 + }, + { + "epoch": 0.26463626321603284, + "grad_norm": 0.6000779867172241, + "learning_rate": 4.914698078163765e-06, + "loss": 0.637, + "step": 1677 + }, + { + "epoch": 0.26479406659302507, + "grad_norm": 0.5873983502388, + "learning_rate": 4.914590481769784e-06, + "loss": 0.6162, + "step": 1678 + }, + { + "epoch": 0.26495186997001735, + "grad_norm": 0.6098331212997437, + "learning_rate": 4.914482818738988e-06, + "loss": 0.6291, + "step": 1679 + }, + { + "epoch": 0.26510967334700963, + "grad_norm": 0.6263851523399353, + "learning_rate": 4.914375089074345e-06, + "loss": 0.6378, + "step": 1680 + }, + { + "epoch": 0.2652674767240019, + "grad_norm": 0.6465455889701843, + "learning_rate": 4.9142672927788324e-06, + "loss": 0.5538, + "step": 1681 + }, + { + "epoch": 0.26542528010099414, + "grad_norm": 0.6048521995544434, + "learning_rate": 4.914159429855421e-06, + "loss": 0.6216, + "step": 1682 + }, + { + "epoch": 0.2655830834779864, + "grad_norm": 0.6646795272827148, + "learning_rate": 4.91405150030709e-06, + "loss": 0.6165, + "step": 1683 + }, + { + "epoch": 0.2657408868549787, + "grad_norm": 0.5944454669952393, + "learning_rate": 4.913943504136817e-06, + "loss": 0.6415, + "step": 1684 + }, + { + "epoch": 0.265898690231971, + "grad_norm": 0.6404935717582703, + "learning_rate": 4.913835441347583e-06, + "loss": 0.594, + "step": 1685 + }, + { + "epoch": 0.2660564936089632, + "grad_norm": 0.6243854761123657, + "learning_rate": 4.91372731194237e-06, + "loss": 0.626, + "step": 1686 + }, + { + "epoch": 0.2662142969859555, + "grad_norm": 0.5760720372200012, + "learning_rate": 4.913619115924161e-06, + "loss": 0.5723, + "step": 1687 + }, + { + "epoch": 0.2663721003629478, + "grad_norm": 0.6224719285964966, + "learning_rate": 4.913510853295944e-06, + "loss": 0.6047, + "step": 1688 + }, + { + "epoch": 0.26652990373994, + "grad_norm": 0.5958613157272339, + "learning_rate": 4.9134025240607065e-06, + "loss": 0.5968, + "step": 1689 + }, + { + "epoch": 0.2666877071169323, + "grad_norm": 0.6037695407867432, + "learning_rate": 4.913294128221436e-06, + "loss": 0.6037, + "step": 1690 + }, + { + "epoch": 0.2668455104939246, + "grad_norm": 0.6213704347610474, + "learning_rate": 4.913185665781127e-06, + "loss": 0.6007, + "step": 1691 + }, + { + "epoch": 0.26700331387091686, + "grad_norm": 0.6088175773620605, + "learning_rate": 4.9130771367427705e-06, + "loss": 0.6309, + "step": 1692 + }, + { + "epoch": 0.2671611172479091, + "grad_norm": 0.5936397910118103, + "learning_rate": 4.912968541109362e-06, + "loss": 0.596, + "step": 1693 + }, + { + "epoch": 0.26731892062490137, + "grad_norm": 0.6120638847351074, + "learning_rate": 4.9128598788838996e-06, + "loss": 0.6083, + "step": 1694 + }, + { + "epoch": 0.26747672400189365, + "grad_norm": 0.5825720429420471, + "learning_rate": 4.91275115006938e-06, + "loss": 0.623, + "step": 1695 + }, + { + "epoch": 0.26763452737888593, + "grad_norm": 0.6193100810050964, + "learning_rate": 4.9126423546688065e-06, + "loss": 0.6086, + "step": 1696 + }, + { + "epoch": 0.26779233075587816, + "grad_norm": 0.5694641470909119, + "learning_rate": 4.91253349268518e-06, + "loss": 0.6079, + "step": 1697 + }, + { + "epoch": 0.26795013413287044, + "grad_norm": 0.5722231268882751, + "learning_rate": 4.9124245641215055e-06, + "loss": 0.6322, + "step": 1698 + }, + { + "epoch": 0.2681079375098627, + "grad_norm": 0.5965193510055542, + "learning_rate": 4.912315568980788e-06, + "loss": 0.5991, + "step": 1699 + }, + { + "epoch": 0.26826574088685495, + "grad_norm": 0.5848649144172668, + "learning_rate": 4.912206507266036e-06, + "loss": 0.5955, + "step": 1700 + }, + { + "epoch": 0.26842354426384724, + "grad_norm": 0.5865596532821655, + "learning_rate": 4.912097378980261e-06, + "loss": 0.6305, + "step": 1701 + }, + { + "epoch": 0.2685813476408395, + "grad_norm": 0.5962886214256287, + "learning_rate": 4.911988184126473e-06, + "loss": 0.6424, + "step": 1702 + }, + { + "epoch": 0.2687391510178318, + "grad_norm": 0.6010408997535706, + "learning_rate": 4.911878922707685e-06, + "loss": 0.5922, + "step": 1703 + }, + { + "epoch": 0.26889695439482403, + "grad_norm": 0.6291582584381104, + "learning_rate": 4.911769594726913e-06, + "loss": 0.6496, + "step": 1704 + }, + { + "epoch": 0.2690547577718163, + "grad_norm": 0.61832195520401, + "learning_rate": 4.911660200187175e-06, + "loss": 0.569, + "step": 1705 + }, + { + "epoch": 0.2692125611488086, + "grad_norm": 0.6240119338035583, + "learning_rate": 4.9115507390914896e-06, + "loss": 0.6114, + "step": 1706 + }, + { + "epoch": 0.2693703645258009, + "grad_norm": 0.6425908207893372, + "learning_rate": 4.911441211442877e-06, + "loss": 0.6113, + "step": 1707 + }, + { + "epoch": 0.2695281679027931, + "grad_norm": 0.6279119849205017, + "learning_rate": 4.91133161724436e-06, + "loss": 0.5933, + "step": 1708 + }, + { + "epoch": 0.2696859712797854, + "grad_norm": 0.6095125079154968, + "learning_rate": 4.9112219564989635e-06, + "loss": 0.6105, + "step": 1709 + }, + { + "epoch": 0.26984377465677767, + "grad_norm": 0.6028263568878174, + "learning_rate": 4.911112229209715e-06, + "loss": 0.6336, + "step": 1710 + }, + { + "epoch": 0.2700015780337699, + "grad_norm": 0.5911049842834473, + "learning_rate": 4.911002435379641e-06, + "loss": 0.6128, + "step": 1711 + }, + { + "epoch": 0.2701593814107622, + "grad_norm": 0.6046620607376099, + "learning_rate": 4.910892575011771e-06, + "loss": 0.5885, + "step": 1712 + }, + { + "epoch": 0.27031718478775446, + "grad_norm": 0.594312310218811, + "learning_rate": 4.910782648109139e-06, + "loss": 0.597, + "step": 1713 + }, + { + "epoch": 0.27047498816474674, + "grad_norm": 0.6098408102989197, + "learning_rate": 4.9106726546747765e-06, + "loss": 0.6138, + "step": 1714 + }, + { + "epoch": 0.27063279154173897, + "grad_norm": 0.5872399210929871, + "learning_rate": 4.910562594711722e-06, + "loss": 0.6064, + "step": 1715 + }, + { + "epoch": 0.27079059491873125, + "grad_norm": 0.6334125399589539, + "learning_rate": 4.91045246822301e-06, + "loss": 0.6033, + "step": 1716 + }, + { + "epoch": 0.27094839829572354, + "grad_norm": 0.5709351897239685, + "learning_rate": 4.910342275211681e-06, + "loss": 0.5956, + "step": 1717 + }, + { + "epoch": 0.2711062016727158, + "grad_norm": 0.606876015663147, + "learning_rate": 4.910232015680776e-06, + "loss": 0.587, + "step": 1718 + }, + { + "epoch": 0.27126400504970805, + "grad_norm": 0.5946324467658997, + "learning_rate": 4.910121689633337e-06, + "loss": 0.6016, + "step": 1719 + }, + { + "epoch": 0.27142180842670033, + "grad_norm": 0.6051509380340576, + "learning_rate": 4.91001129707241e-06, + "loss": 0.5745, + "step": 1720 + }, + { + "epoch": 0.2715796118036926, + "grad_norm": 0.5876028537750244, + "learning_rate": 4.909900838001042e-06, + "loss": 0.595, + "step": 1721 + }, + { + "epoch": 0.2717374151806849, + "grad_norm": 0.5878472924232483, + "learning_rate": 4.909790312422279e-06, + "loss": 0.596, + "step": 1722 + }, + { + "epoch": 0.2718952185576771, + "grad_norm": 0.6139421463012695, + "learning_rate": 4.909679720339174e-06, + "loss": 0.6119, + "step": 1723 + }, + { + "epoch": 0.2720530219346694, + "grad_norm": 0.6382610201835632, + "learning_rate": 4.909569061754777e-06, + "loss": 0.6351, + "step": 1724 + }, + { + "epoch": 0.2722108253116617, + "grad_norm": 0.6206804513931274, + "learning_rate": 4.9094583366721425e-06, + "loss": 0.5912, + "step": 1725 + }, + { + "epoch": 0.2723686286886539, + "grad_norm": 0.6146532297134399, + "learning_rate": 4.909347545094327e-06, + "loss": 0.6222, + "step": 1726 + }, + { + "epoch": 0.2725264320656462, + "grad_norm": 0.5786424279212952, + "learning_rate": 4.909236687024387e-06, + "loss": 0.6377, + "step": 1727 + }, + { + "epoch": 0.2726842354426385, + "grad_norm": 0.5790310502052307, + "learning_rate": 4.909125762465383e-06, + "loss": 0.5998, + "step": 1728 + }, + { + "epoch": 0.27284203881963076, + "grad_norm": 0.5858691334724426, + "learning_rate": 4.909014771420376e-06, + "loss": 0.6141, + "step": 1729 + }, + { + "epoch": 0.272999842196623, + "grad_norm": 0.5534268021583557, + "learning_rate": 4.908903713892428e-06, + "loss": 0.5742, + "step": 1730 + }, + { + "epoch": 0.27315764557361527, + "grad_norm": 0.6140767931938171, + "learning_rate": 4.908792589884604e-06, + "loss": 0.604, + "step": 1731 + }, + { + "epoch": 0.27331544895060755, + "grad_norm": 0.6442809104919434, + "learning_rate": 4.908681399399973e-06, + "loss": 0.6277, + "step": 1732 + }, + { + "epoch": 0.27347325232759984, + "grad_norm": 0.5852057933807373, + "learning_rate": 4.908570142441601e-06, + "loss": 0.5919, + "step": 1733 + }, + { + "epoch": 0.27363105570459206, + "grad_norm": 0.5951572060585022, + "learning_rate": 4.90845881901256e-06, + "loss": 0.6295, + "step": 1734 + }, + { + "epoch": 0.27378885908158435, + "grad_norm": 0.5771151185035706, + "learning_rate": 4.908347429115922e-06, + "loss": 0.6321, + "step": 1735 + }, + { + "epoch": 0.27394666245857663, + "grad_norm": 0.6284305453300476, + "learning_rate": 4.90823597275476e-06, + "loss": 0.6272, + "step": 1736 + }, + { + "epoch": 0.27410446583556886, + "grad_norm": 0.5931362509727478, + "learning_rate": 4.908124449932151e-06, + "loss": 0.6158, + "step": 1737 + }, + { + "epoch": 0.27426226921256114, + "grad_norm": 0.5707323551177979, + "learning_rate": 4.908012860651173e-06, + "loss": 0.582, + "step": 1738 + }, + { + "epoch": 0.2744200725895534, + "grad_norm": 0.599815309047699, + "learning_rate": 4.907901204914903e-06, + "loss": 0.6005, + "step": 1739 + }, + { + "epoch": 0.2745778759665457, + "grad_norm": 0.5735132098197937, + "learning_rate": 4.907789482726426e-06, + "loss": 0.6304, + "step": 1740 + }, + { + "epoch": 0.27473567934353793, + "grad_norm": 0.5916606187820435, + "learning_rate": 4.9076776940888235e-06, + "loss": 0.6161, + "step": 1741 + }, + { + "epoch": 0.2748934827205302, + "grad_norm": 0.5941879749298096, + "learning_rate": 4.907565839005182e-06, + "loss": 0.5725, + "step": 1742 + }, + { + "epoch": 0.2750512860975225, + "grad_norm": 0.5938867926597595, + "learning_rate": 4.907453917478586e-06, + "loss": 0.6118, + "step": 1743 + }, + { + "epoch": 0.2752090894745148, + "grad_norm": 0.5779732465744019, + "learning_rate": 4.907341929512126e-06, + "loss": 0.621, + "step": 1744 + }, + { + "epoch": 0.275366892851507, + "grad_norm": 0.6124617457389832, + "learning_rate": 4.9072298751088914e-06, + "loss": 0.6084, + "step": 1745 + }, + { + "epoch": 0.2755246962284993, + "grad_norm": 0.6059616804122925, + "learning_rate": 4.907117754271976e-06, + "loss": 0.5952, + "step": 1746 + }, + { + "epoch": 0.27568249960549157, + "grad_norm": 0.757199764251709, + "learning_rate": 4.907005567004473e-06, + "loss": 0.5973, + "step": 1747 + }, + { + "epoch": 0.2758403029824838, + "grad_norm": 0.586572527885437, + "learning_rate": 4.906893313309479e-06, + "loss": 0.6202, + "step": 1748 + }, + { + "epoch": 0.2759981063594761, + "grad_norm": 0.6267151236534119, + "learning_rate": 4.906780993190092e-06, + "loss": 0.5929, + "step": 1749 + }, + { + "epoch": 0.27615590973646836, + "grad_norm": 0.6183449029922485, + "learning_rate": 4.9066686066494115e-06, + "loss": 0.585, + "step": 1750 + }, + { + "epoch": 0.27631371311346065, + "grad_norm": 0.6264848113059998, + "learning_rate": 4.90655615369054e-06, + "loss": 0.5879, + "step": 1751 + }, + { + "epoch": 0.2764715164904529, + "grad_norm": 0.6416293382644653, + "learning_rate": 4.906443634316579e-06, + "loss": 0.6315, + "step": 1752 + }, + { + "epoch": 0.27662931986744516, + "grad_norm": 0.5857084393501282, + "learning_rate": 4.906331048530635e-06, + "loss": 0.5975, + "step": 1753 + }, + { + "epoch": 0.27678712324443744, + "grad_norm": 0.5849494338035583, + "learning_rate": 4.9062183963358155e-06, + "loss": 0.6067, + "step": 1754 + }, + { + "epoch": 0.2769449266214297, + "grad_norm": 0.5505243539810181, + "learning_rate": 4.906105677735229e-06, + "loss": 0.6085, + "step": 1755 + }, + { + "epoch": 0.27710272999842195, + "grad_norm": 0.6183199882507324, + "learning_rate": 4.905992892731986e-06, + "loss": 0.6362, + "step": 1756 + }, + { + "epoch": 0.27726053337541423, + "grad_norm": 0.590331494808197, + "learning_rate": 4.905880041329198e-06, + "loss": 0.5937, + "step": 1757 + }, + { + "epoch": 0.2774183367524065, + "grad_norm": 0.6060701608657837, + "learning_rate": 4.905767123529982e-06, + "loss": 0.599, + "step": 1758 + }, + { + "epoch": 0.2775761401293988, + "grad_norm": 0.5884186029434204, + "learning_rate": 4.905654139337452e-06, + "loss": 0.6176, + "step": 1759 + }, + { + "epoch": 0.277733943506391, + "grad_norm": 0.5665607452392578, + "learning_rate": 4.905541088754728e-06, + "loss": 0.5907, + "step": 1760 + }, + { + "epoch": 0.2778917468833833, + "grad_norm": 0.6428797245025635, + "learning_rate": 4.905427971784928e-06, + "loss": 0.6015, + "step": 1761 + }, + { + "epoch": 0.2780495502603756, + "grad_norm": 0.6340263485908508, + "learning_rate": 4.9053147884311755e-06, + "loss": 0.5998, + "step": 1762 + }, + { + "epoch": 0.2782073536373678, + "grad_norm": 0.6063253879547119, + "learning_rate": 4.9052015386965935e-06, + "loss": 0.5957, + "step": 1763 + }, + { + "epoch": 0.2783651570143601, + "grad_norm": 0.5662021636962891, + "learning_rate": 4.905088222584307e-06, + "loss": 0.5885, + "step": 1764 + }, + { + "epoch": 0.2785229603913524, + "grad_norm": 0.5659974813461304, + "learning_rate": 4.9049748400974425e-06, + "loss": 0.5679, + "step": 1765 + }, + { + "epoch": 0.27868076376834466, + "grad_norm": 0.6277613639831543, + "learning_rate": 4.9048613912391314e-06, + "loss": 0.611, + "step": 1766 + }, + { + "epoch": 0.2788385671453369, + "grad_norm": 0.6073848605155945, + "learning_rate": 4.9047478760125025e-06, + "loss": 0.616, + "step": 1767 + }, + { + "epoch": 0.2789963705223292, + "grad_norm": 0.6125918626785278, + "learning_rate": 4.90463429442069e-06, + "loss": 0.6325, + "step": 1768 + }, + { + "epoch": 0.27915417389932146, + "grad_norm": 0.5755658745765686, + "learning_rate": 4.904520646466826e-06, + "loss": 0.5686, + "step": 1769 + }, + { + "epoch": 0.27931197727631374, + "grad_norm": 0.6522769331932068, + "learning_rate": 4.90440693215405e-06, + "loss": 0.6362, + "step": 1770 + }, + { + "epoch": 0.27946978065330597, + "grad_norm": 0.6008327007293701, + "learning_rate": 4.904293151485498e-06, + "loss": 0.569, + "step": 1771 + }, + { + "epoch": 0.27962758403029825, + "grad_norm": 0.5885869860649109, + "learning_rate": 4.904179304464312e-06, + "loss": 0.5693, + "step": 1772 + }, + { + "epoch": 0.27978538740729053, + "grad_norm": 0.5939190983772278, + "learning_rate": 4.904065391093632e-06, + "loss": 0.6292, + "step": 1773 + }, + { + "epoch": 0.27994319078428276, + "grad_norm": 0.5860068202018738, + "learning_rate": 4.903951411376604e-06, + "loss": 0.6253, + "step": 1774 + }, + { + "epoch": 0.28010099416127504, + "grad_norm": 0.5953362584114075, + "learning_rate": 4.9038373653163705e-06, + "loss": 0.6079, + "step": 1775 + }, + { + "epoch": 0.2802587975382673, + "grad_norm": 0.6474849581718445, + "learning_rate": 4.903723252916082e-06, + "loss": 0.6, + "step": 1776 + }, + { + "epoch": 0.2804166009152596, + "grad_norm": 0.6114062070846558, + "learning_rate": 4.903609074178885e-06, + "loss": 0.6042, + "step": 1777 + }, + { + "epoch": 0.28057440429225183, + "grad_norm": 0.6151652932167053, + "learning_rate": 4.903494829107932e-06, + "loss": 0.6396, + "step": 1778 + }, + { + "epoch": 0.2807322076692441, + "grad_norm": 0.5684833526611328, + "learning_rate": 4.903380517706376e-06, + "loss": 0.5958, + "step": 1779 + }, + { + "epoch": 0.2808900110462364, + "grad_norm": 0.5875791311264038, + "learning_rate": 4.903266139977372e-06, + "loss": 0.5995, + "step": 1780 + }, + { + "epoch": 0.2810478144232287, + "grad_norm": 0.6008281111717224, + "learning_rate": 4.903151695924075e-06, + "loss": 0.6317, + "step": 1781 + }, + { + "epoch": 0.2812056178002209, + "grad_norm": 0.6261932849884033, + "learning_rate": 4.903037185549645e-06, + "loss": 0.5936, + "step": 1782 + }, + { + "epoch": 0.2813634211772132, + "grad_norm": 0.5937637686729431, + "learning_rate": 4.902922608857242e-06, + "loss": 0.6263, + "step": 1783 + }, + { + "epoch": 0.2815212245542055, + "grad_norm": 0.5870650410652161, + "learning_rate": 4.902807965850027e-06, + "loss": 0.6427, + "step": 1784 + }, + { + "epoch": 0.2816790279311977, + "grad_norm": 0.5669851303100586, + "learning_rate": 4.902693256531165e-06, + "loss": 0.6205, + "step": 1785 + }, + { + "epoch": 0.28183683130819, + "grad_norm": 0.6080726981163025, + "learning_rate": 4.90257848090382e-06, + "loss": 0.6049, + "step": 1786 + }, + { + "epoch": 0.28199463468518227, + "grad_norm": 0.6237051486968994, + "learning_rate": 4.902463638971162e-06, + "loss": 0.5945, + "step": 1787 + }, + { + "epoch": 0.28215243806217455, + "grad_norm": 0.620781660079956, + "learning_rate": 4.902348730736358e-06, + "loss": 0.6201, + "step": 1788 + }, + { + "epoch": 0.2823102414391668, + "grad_norm": 0.6526220440864563, + "learning_rate": 4.902233756202581e-06, + "loss": 0.6101, + "step": 1789 + }, + { + "epoch": 0.28246804481615906, + "grad_norm": 0.6029321551322937, + "learning_rate": 4.902118715373002e-06, + "loss": 0.6486, + "step": 1790 + }, + { + "epoch": 0.28262584819315134, + "grad_norm": 0.6554720997810364, + "learning_rate": 4.902003608250798e-06, + "loss": 0.6392, + "step": 1791 + }, + { + "epoch": 0.2827836515701436, + "grad_norm": 0.6375058889389038, + "learning_rate": 4.901888434839145e-06, + "loss": 0.6099, + "step": 1792 + }, + { + "epoch": 0.28294145494713585, + "grad_norm": 0.5899685621261597, + "learning_rate": 4.901773195141222e-06, + "loss": 0.6617, + "step": 1793 + }, + { + "epoch": 0.28309925832412813, + "grad_norm": 0.5861541628837585, + "learning_rate": 4.901657889160207e-06, + "loss": 0.5814, + "step": 1794 + }, + { + "epoch": 0.2832570617011204, + "grad_norm": 0.6055976152420044, + "learning_rate": 4.901542516899285e-06, + "loss": 0.596, + "step": 1795 + }, + { + "epoch": 0.2834148650781127, + "grad_norm": 0.7259289622306824, + "learning_rate": 4.901427078361638e-06, + "loss": 0.6117, + "step": 1796 + }, + { + "epoch": 0.2835726684551049, + "grad_norm": 0.6020194292068481, + "learning_rate": 4.901311573550453e-06, + "loss": 0.5995, + "step": 1797 + }, + { + "epoch": 0.2837304718320972, + "grad_norm": 0.6011783480644226, + "learning_rate": 4.901196002468917e-06, + "loss": 0.6115, + "step": 1798 + }, + { + "epoch": 0.2838882752090895, + "grad_norm": 0.5963730812072754, + "learning_rate": 4.90108036512022e-06, + "loss": 0.6191, + "step": 1799 + }, + { + "epoch": 0.2840460785860817, + "grad_norm": 0.6527393460273743, + "learning_rate": 4.900964661507553e-06, + "loss": 0.6281, + "step": 1800 + }, + { + "epoch": 0.284203881963074, + "grad_norm": 0.5946130752563477, + "learning_rate": 4.900848891634109e-06, + "loss": 0.6172, + "step": 1801 + }, + { + "epoch": 0.2843616853400663, + "grad_norm": 0.5736398100852966, + "learning_rate": 4.900733055503083e-06, + "loss": 0.5863, + "step": 1802 + }, + { + "epoch": 0.28451948871705857, + "grad_norm": 0.5695441365242004, + "learning_rate": 4.900617153117672e-06, + "loss": 0.6158, + "step": 1803 + }, + { + "epoch": 0.2846772920940508, + "grad_norm": 0.6437745690345764, + "learning_rate": 4.900501184481076e-06, + "loss": 0.6385, + "step": 1804 + }, + { + "epoch": 0.2848350954710431, + "grad_norm": 0.610278308391571, + "learning_rate": 4.900385149596492e-06, + "loss": 0.6068, + "step": 1805 + }, + { + "epoch": 0.28499289884803536, + "grad_norm": 0.5847537517547607, + "learning_rate": 4.900269048467126e-06, + "loss": 0.5698, + "step": 1806 + }, + { + "epoch": 0.28515070222502764, + "grad_norm": 0.6079323887825012, + "learning_rate": 4.900152881096179e-06, + "loss": 0.5766, + "step": 1807 + }, + { + "epoch": 0.28530850560201987, + "grad_norm": 0.6403319239616394, + "learning_rate": 4.900036647486859e-06, + "loss": 0.6141, + "step": 1808 + }, + { + "epoch": 0.28546630897901215, + "grad_norm": 0.6233461499214172, + "learning_rate": 4.8999203476423725e-06, + "loss": 0.6166, + "step": 1809 + }, + { + "epoch": 0.28562411235600443, + "grad_norm": 0.5799030065536499, + "learning_rate": 4.899803981565931e-06, + "loss": 0.603, + "step": 1810 + }, + { + "epoch": 0.28578191573299666, + "grad_norm": 0.6314442157745361, + "learning_rate": 4.899687549260743e-06, + "loss": 0.6165, + "step": 1811 + }, + { + "epoch": 0.28593971910998894, + "grad_norm": 0.5959477424621582, + "learning_rate": 4.899571050730024e-06, + "loss": 0.6136, + "step": 1812 + }, + { + "epoch": 0.2860975224869812, + "grad_norm": 0.6240156292915344, + "learning_rate": 4.899454485976989e-06, + "loss": 0.5882, + "step": 1813 + }, + { + "epoch": 0.2862553258639735, + "grad_norm": 0.6302262544631958, + "learning_rate": 4.899337855004854e-06, + "loss": 0.6056, + "step": 1814 + }, + { + "epoch": 0.28641312924096574, + "grad_norm": 0.5752485990524292, + "learning_rate": 4.899221157816838e-06, + "loss": 0.5922, + "step": 1815 + }, + { + "epoch": 0.286570932617958, + "grad_norm": 0.5920911431312561, + "learning_rate": 4.8991043944161616e-06, + "loss": 0.63, + "step": 1816 + }, + { + "epoch": 0.2867287359949503, + "grad_norm": 0.586580216884613, + "learning_rate": 4.898987564806047e-06, + "loss": 0.5646, + "step": 1817 + }, + { + "epoch": 0.2868865393719426, + "grad_norm": 0.5881460309028625, + "learning_rate": 4.89887066898972e-06, + "loss": 0.6312, + "step": 1818 + }, + { + "epoch": 0.2870443427489348, + "grad_norm": 0.5742670297622681, + "learning_rate": 4.898753706970403e-06, + "loss": 0.5965, + "step": 1819 + }, + { + "epoch": 0.2872021461259271, + "grad_norm": 0.581668496131897, + "learning_rate": 4.898636678751327e-06, + "loss": 0.5926, + "step": 1820 + }, + { + "epoch": 0.2873599495029194, + "grad_norm": 0.5796857476234436, + "learning_rate": 4.898519584335721e-06, + "loss": 0.6074, + "step": 1821 + }, + { + "epoch": 0.2875177528799116, + "grad_norm": 0.5773339867591858, + "learning_rate": 4.8984024237268165e-06, + "loss": 0.5854, + "step": 1822 + }, + { + "epoch": 0.2876755562569039, + "grad_norm": 0.6126556396484375, + "learning_rate": 4.898285196927848e-06, + "loss": 0.6136, + "step": 1823 + }, + { + "epoch": 0.28783335963389617, + "grad_norm": 0.5799577236175537, + "learning_rate": 4.898167903942047e-06, + "loss": 0.585, + "step": 1824 + }, + { + "epoch": 0.28799116301088845, + "grad_norm": 0.6724323034286499, + "learning_rate": 4.898050544772655e-06, + "loss": 0.6575, + "step": 1825 + }, + { + "epoch": 0.2881489663878807, + "grad_norm": 0.65704345703125, + "learning_rate": 4.897933119422907e-06, + "loss": 0.6429, + "step": 1826 + }, + { + "epoch": 0.28830676976487296, + "grad_norm": 0.613616406917572, + "learning_rate": 4.8978156278960465e-06, + "loss": 0.5884, + "step": 1827 + }, + { + "epoch": 0.28846457314186524, + "grad_norm": 0.5959572792053223, + "learning_rate": 4.8976980701953145e-06, + "loss": 0.5988, + "step": 1828 + }, + { + "epoch": 0.2886223765188575, + "grad_norm": 0.5974178910255432, + "learning_rate": 4.897580446323955e-06, + "loss": 0.6029, + "step": 1829 + }, + { + "epoch": 0.28878017989584975, + "grad_norm": 0.5949099063873291, + "learning_rate": 4.897462756285216e-06, + "loss": 0.6247, + "step": 1830 + }, + { + "epoch": 0.28893798327284204, + "grad_norm": 0.6126395463943481, + "learning_rate": 4.897345000082343e-06, + "loss": 0.6272, + "step": 1831 + }, + { + "epoch": 0.2890957866498343, + "grad_norm": 0.5733036994934082, + "learning_rate": 4.897227177718587e-06, + "loss": 0.5993, + "step": 1832 + }, + { + "epoch": 0.2892535900268266, + "grad_norm": 0.5816693305969238, + "learning_rate": 4.897109289197201e-06, + "loss": 0.6185, + "step": 1833 + }, + { + "epoch": 0.28941139340381883, + "grad_norm": 0.6185307502746582, + "learning_rate": 4.896991334521436e-06, + "loss": 0.5912, + "step": 1834 + }, + { + "epoch": 0.2895691967808111, + "grad_norm": 0.6140174269676208, + "learning_rate": 4.896873313694549e-06, + "loss": 0.5791, + "step": 1835 + }, + { + "epoch": 0.2897270001578034, + "grad_norm": 0.6580426096916199, + "learning_rate": 4.896755226719796e-06, + "loss": 0.5981, + "step": 1836 + }, + { + "epoch": 0.2898848035347956, + "grad_norm": 0.6058812141418457, + "learning_rate": 4.896637073600436e-06, + "loss": 0.619, + "step": 1837 + }, + { + "epoch": 0.2900426069117879, + "grad_norm": 0.6076010465621948, + "learning_rate": 4.896518854339731e-06, + "loss": 0.6002, + "step": 1838 + }, + { + "epoch": 0.2902004102887802, + "grad_norm": 0.6262593865394592, + "learning_rate": 4.896400568940942e-06, + "loss": 0.6112, + "step": 1839 + }, + { + "epoch": 0.29035821366577247, + "grad_norm": 0.5996988415718079, + "learning_rate": 4.896282217407334e-06, + "loss": 0.6236, + "step": 1840 + }, + { + "epoch": 0.2905160170427647, + "grad_norm": 0.6274606585502625, + "learning_rate": 4.896163799742173e-06, + "loss": 0.5697, + "step": 1841 + }, + { + "epoch": 0.290673820419757, + "grad_norm": 0.5839720964431763, + "learning_rate": 4.896045315948727e-06, + "loss": 0.5819, + "step": 1842 + }, + { + "epoch": 0.29083162379674926, + "grad_norm": 0.5883240103721619, + "learning_rate": 4.895926766030268e-06, + "loss": 0.5701, + "step": 1843 + }, + { + "epoch": 0.29098942717374154, + "grad_norm": 0.6216784715652466, + "learning_rate": 4.895808149990064e-06, + "loss": 0.6022, + "step": 1844 + }, + { + "epoch": 0.29114723055073377, + "grad_norm": 0.5888087749481201, + "learning_rate": 4.895689467831391e-06, + "loss": 0.6173, + "step": 1845 + }, + { + "epoch": 0.29130503392772605, + "grad_norm": 0.616840660572052, + "learning_rate": 4.895570719557523e-06, + "loss": 0.6021, + "step": 1846 + }, + { + "epoch": 0.29146283730471834, + "grad_norm": 0.6195382475852966, + "learning_rate": 4.895451905171739e-06, + "loss": 0.6011, + "step": 1847 + }, + { + "epoch": 0.29162064068171056, + "grad_norm": 0.6104652881622314, + "learning_rate": 4.895333024677316e-06, + "loss": 0.639, + "step": 1848 + }, + { + "epoch": 0.29177844405870285, + "grad_norm": 0.6082359552383423, + "learning_rate": 4.895214078077536e-06, + "loss": 0.6068, + "step": 1849 + }, + { + "epoch": 0.29193624743569513, + "grad_norm": 0.5701900124549866, + "learning_rate": 4.895095065375681e-06, + "loss": 0.6078, + "step": 1850 + }, + { + "epoch": 0.2920940508126874, + "grad_norm": 0.6205955743789673, + "learning_rate": 4.894975986575036e-06, + "loss": 0.6395, + "step": 1851 + }, + { + "epoch": 0.29225185418967964, + "grad_norm": 0.5994669795036316, + "learning_rate": 4.8948568416788876e-06, + "loss": 0.5971, + "step": 1852 + }, + { + "epoch": 0.2924096575666719, + "grad_norm": 0.5753355622291565, + "learning_rate": 4.894737630690523e-06, + "loss": 0.604, + "step": 1853 + }, + { + "epoch": 0.2925674609436642, + "grad_norm": 0.6378225088119507, + "learning_rate": 4.894618353613231e-06, + "loss": 0.6033, + "step": 1854 + }, + { + "epoch": 0.2927252643206565, + "grad_norm": 0.5861126184463501, + "learning_rate": 4.894499010450306e-06, + "loss": 0.5874, + "step": 1855 + }, + { + "epoch": 0.2928830676976487, + "grad_norm": 0.5728844404220581, + "learning_rate": 4.8943796012050405e-06, + "loss": 0.5804, + "step": 1856 + }, + { + "epoch": 0.293040871074641, + "grad_norm": 0.583821713924408, + "learning_rate": 4.89426012588073e-06, + "loss": 0.5922, + "step": 1857 + }, + { + "epoch": 0.2931986744516333, + "grad_norm": 0.576522946357727, + "learning_rate": 4.894140584480671e-06, + "loss": 0.6093, + "step": 1858 + }, + { + "epoch": 0.2933564778286255, + "grad_norm": 0.6181520223617554, + "learning_rate": 4.894020977008164e-06, + "loss": 0.6109, + "step": 1859 + }, + { + "epoch": 0.2935142812056178, + "grad_norm": 0.5962374210357666, + "learning_rate": 4.893901303466508e-06, + "loss": 0.6259, + "step": 1860 + }, + { + "epoch": 0.29367208458261007, + "grad_norm": 0.6057572960853577, + "learning_rate": 4.893781563859007e-06, + "loss": 0.6225, + "step": 1861 + }, + { + "epoch": 0.29382988795960235, + "grad_norm": 0.5986577868461609, + "learning_rate": 4.893661758188964e-06, + "loss": 0.5771, + "step": 1862 + }, + { + "epoch": 0.2939876913365946, + "grad_norm": 0.5850417017936707, + "learning_rate": 4.8935418864596885e-06, + "loss": 0.5923, + "step": 1863 + }, + { + "epoch": 0.29414549471358686, + "grad_norm": 0.5963559746742249, + "learning_rate": 4.8934219486744864e-06, + "loss": 0.6211, + "step": 1864 + }, + { + "epoch": 0.29430329809057915, + "grad_norm": 0.5885325074195862, + "learning_rate": 4.893301944836667e-06, + "loss": 0.6101, + "step": 1865 + }, + { + "epoch": 0.29446110146757143, + "grad_norm": 0.5853924751281738, + "learning_rate": 4.893181874949543e-06, + "loss": 0.5856, + "step": 1866 + }, + { + "epoch": 0.29461890484456366, + "grad_norm": 0.6403554081916809, + "learning_rate": 4.893061739016429e-06, + "loss": 0.6092, + "step": 1867 + }, + { + "epoch": 0.29477670822155594, + "grad_norm": 0.623193085193634, + "learning_rate": 4.892941537040639e-06, + "loss": 0.5998, + "step": 1868 + }, + { + "epoch": 0.2949345115985482, + "grad_norm": 0.5751511454582214, + "learning_rate": 4.892821269025491e-06, + "loss": 0.6372, + "step": 1869 + }, + { + "epoch": 0.2950923149755405, + "grad_norm": 0.6220905780792236, + "learning_rate": 4.892700934974305e-06, + "loss": 0.6079, + "step": 1870 + }, + { + "epoch": 0.29525011835253273, + "grad_norm": 0.6021876931190491, + "learning_rate": 4.8925805348904e-06, + "loss": 0.6265, + "step": 1871 + }, + { + "epoch": 0.295407921729525, + "grad_norm": 0.6072553992271423, + "learning_rate": 4.892460068777099e-06, + "loss": 0.6158, + "step": 1872 + }, + { + "epoch": 0.2955657251065173, + "grad_norm": 0.5929582118988037, + "learning_rate": 4.892339536637728e-06, + "loss": 0.6051, + "step": 1873 + }, + { + "epoch": 0.2957235284835095, + "grad_norm": 0.6035999059677124, + "learning_rate": 4.892218938475613e-06, + "loss": 0.5866, + "step": 1874 + }, + { + "epoch": 0.2958813318605018, + "grad_norm": 0.6352666020393372, + "learning_rate": 4.89209827429408e-06, + "loss": 0.5863, + "step": 1875 + }, + { + "epoch": 0.2960391352374941, + "grad_norm": 0.6089082956314087, + "learning_rate": 4.891977544096463e-06, + "loss": 0.6368, + "step": 1876 + }, + { + "epoch": 0.29619693861448637, + "grad_norm": 0.6232772469520569, + "learning_rate": 4.891856747886091e-06, + "loss": 0.5797, + "step": 1877 + }, + { + "epoch": 0.2963547419914786, + "grad_norm": 0.6221973896026611, + "learning_rate": 4.891735885666299e-06, + "loss": 0.5915, + "step": 1878 + }, + { + "epoch": 0.2965125453684709, + "grad_norm": 0.6162261962890625, + "learning_rate": 4.8916149574404205e-06, + "loss": 0.5996, + "step": 1879 + }, + { + "epoch": 0.29667034874546316, + "grad_norm": 0.5794464945793152, + "learning_rate": 4.891493963211795e-06, + "loss": 0.6211, + "step": 1880 + }, + { + "epoch": 0.29682815212245545, + "grad_norm": 0.5904097557067871, + "learning_rate": 4.891372902983761e-06, + "loss": 0.617, + "step": 1881 + }, + { + "epoch": 0.2969859554994477, + "grad_norm": 0.6071037650108337, + "learning_rate": 4.891251776759659e-06, + "loss": 0.5795, + "step": 1882 + }, + { + "epoch": 0.29714375887643996, + "grad_norm": 0.6028374433517456, + "learning_rate": 4.8911305845428325e-06, + "loss": 0.6232, + "step": 1883 + }, + { + "epoch": 0.29730156225343224, + "grad_norm": 0.625228762626648, + "learning_rate": 4.891009326336626e-06, + "loss": 0.6506, + "step": 1884 + }, + { + "epoch": 0.29745936563042447, + "grad_norm": 0.5687570571899414, + "learning_rate": 4.890888002144385e-06, + "loss": 0.5653, + "step": 1885 + }, + { + "epoch": 0.29761716900741675, + "grad_norm": 0.5818720459938049, + "learning_rate": 4.890766611969459e-06, + "loss": 0.5998, + "step": 1886 + }, + { + "epoch": 0.29777497238440903, + "grad_norm": 0.5797486901283264, + "learning_rate": 4.890645155815197e-06, + "loss": 0.5749, + "step": 1887 + }, + { + "epoch": 0.2979327757614013, + "grad_norm": 0.6188523769378662, + "learning_rate": 4.890523633684952e-06, + "loss": 0.5973, + "step": 1888 + }, + { + "epoch": 0.29809057913839354, + "grad_norm": 0.6036829352378845, + "learning_rate": 4.890402045582076e-06, + "loss": 0.5911, + "step": 1889 + }, + { + "epoch": 0.2982483825153858, + "grad_norm": 0.6098964810371399, + "learning_rate": 4.8902803915099275e-06, + "loss": 0.6211, + "step": 1890 + }, + { + "epoch": 0.2984061858923781, + "grad_norm": 0.5823766589164734, + "learning_rate": 4.890158671471861e-06, + "loss": 0.6052, + "step": 1891 + }, + { + "epoch": 0.2985639892693704, + "grad_norm": 0.6010814905166626, + "learning_rate": 4.890036885471236e-06, + "loss": 0.6013, + "step": 1892 + }, + { + "epoch": 0.2987217926463626, + "grad_norm": 0.5968747138977051, + "learning_rate": 4.889915033511415e-06, + "loss": 0.6139, + "step": 1893 + }, + { + "epoch": 0.2988795960233549, + "grad_norm": 0.6250336170196533, + "learning_rate": 4.8897931155957605e-06, + "loss": 0.6072, + "step": 1894 + }, + { + "epoch": 0.2990373994003472, + "grad_norm": 0.6033928990364075, + "learning_rate": 4.8896711317276356e-06, + "loss": 0.6174, + "step": 1895 + }, + { + "epoch": 0.2991952027773394, + "grad_norm": 0.6136029362678528, + "learning_rate": 4.889549081910409e-06, + "loss": 0.578, + "step": 1896 + }, + { + "epoch": 0.2993530061543317, + "grad_norm": 0.5955790281295776, + "learning_rate": 4.889426966147446e-06, + "loss": 0.5904, + "step": 1897 + }, + { + "epoch": 0.299510809531324, + "grad_norm": 0.5983783602714539, + "learning_rate": 4.88930478444212e-06, + "loss": 0.5662, + "step": 1898 + }, + { + "epoch": 0.29966861290831626, + "grad_norm": 0.5919486880302429, + "learning_rate": 4.889182536797801e-06, + "loss": 0.5925, + "step": 1899 + }, + { + "epoch": 0.2998264162853085, + "grad_norm": 0.6204228401184082, + "learning_rate": 4.889060223217863e-06, + "loss": 0.6135, + "step": 1900 + }, + { + "epoch": 0.29998421966230077, + "grad_norm": 0.5936400294303894, + "learning_rate": 4.888937843705681e-06, + "loss": 0.6013, + "step": 1901 + }, + { + "epoch": 0.30014202303929305, + "grad_norm": 0.6020577549934387, + "learning_rate": 4.888815398264634e-06, + "loss": 0.6097, + "step": 1902 + }, + { + "epoch": 0.30029982641628533, + "grad_norm": 0.6300604343414307, + "learning_rate": 4.8886928868981e-06, + "loss": 0.5746, + "step": 1903 + }, + { + "epoch": 0.30045762979327756, + "grad_norm": 0.59354567527771, + "learning_rate": 4.88857030960946e-06, + "loss": 0.5747, + "step": 1904 + }, + { + "epoch": 0.30061543317026984, + "grad_norm": 0.5912811756134033, + "learning_rate": 4.888447666402097e-06, + "loss": 0.6087, + "step": 1905 + }, + { + "epoch": 0.3007732365472621, + "grad_norm": 0.6015482544898987, + "learning_rate": 4.8883249572793964e-06, + "loss": 0.6223, + "step": 1906 + }, + { + "epoch": 0.3009310399242544, + "grad_norm": 0.6204609274864197, + "learning_rate": 4.888202182244744e-06, + "loss": 0.6348, + "step": 1907 + }, + { + "epoch": 0.30108884330124663, + "grad_norm": 0.6325560808181763, + "learning_rate": 4.888079341301528e-06, + "loss": 0.6006, + "step": 1908 + }, + { + "epoch": 0.3012466466782389, + "grad_norm": 0.6040710210800171, + "learning_rate": 4.887956434453139e-06, + "loss": 0.6203, + "step": 1909 + }, + { + "epoch": 0.3014044500552312, + "grad_norm": 0.6065768599510193, + "learning_rate": 4.887833461702968e-06, + "loss": 0.5818, + "step": 1910 + }, + { + "epoch": 0.3015622534322234, + "grad_norm": 0.6171576380729675, + "learning_rate": 4.887710423054409e-06, + "loss": 0.5996, + "step": 1911 + }, + { + "epoch": 0.3017200568092157, + "grad_norm": 0.6071099042892456, + "learning_rate": 4.887587318510858e-06, + "loss": 0.5792, + "step": 1912 + }, + { + "epoch": 0.301877860186208, + "grad_norm": 0.5938000679016113, + "learning_rate": 4.887464148075713e-06, + "loss": 0.6125, + "step": 1913 + }, + { + "epoch": 0.3020356635632003, + "grad_norm": 0.59859699010849, + "learning_rate": 4.8873409117523716e-06, + "loss": 0.6151, + "step": 1914 + }, + { + "epoch": 0.3021934669401925, + "grad_norm": 0.6003421545028687, + "learning_rate": 4.887217609544237e-06, + "loss": 0.5964, + "step": 1915 + }, + { + "epoch": 0.3023512703171848, + "grad_norm": 0.6461100578308105, + "learning_rate": 4.887094241454709e-06, + "loss": 0.5638, + "step": 1916 + }, + { + "epoch": 0.30250907369417707, + "grad_norm": 0.6264464259147644, + "learning_rate": 4.886970807487194e-06, + "loss": 0.6354, + "step": 1917 + }, + { + "epoch": 0.30266687707116935, + "grad_norm": 0.6254862546920776, + "learning_rate": 4.8868473076451004e-06, + "loss": 0.6185, + "step": 1918 + }, + { + "epoch": 0.3028246804481616, + "grad_norm": 0.5863887667655945, + "learning_rate": 4.886723741931833e-06, + "loss": 0.5786, + "step": 1919 + }, + { + "epoch": 0.30298248382515386, + "grad_norm": 0.586346447467804, + "learning_rate": 4.886600110350804e-06, + "loss": 0.6092, + "step": 1920 + }, + { + "epoch": 0.30314028720214614, + "grad_norm": 0.6077525615692139, + "learning_rate": 4.886476412905425e-06, + "loss": 0.6099, + "step": 1921 + }, + { + "epoch": 0.30329809057913837, + "grad_norm": 0.5990687608718872, + "learning_rate": 4.886352649599109e-06, + "loss": 0.6003, + "step": 1922 + }, + { + "epoch": 0.30345589395613065, + "grad_norm": 0.5940207242965698, + "learning_rate": 4.886228820435272e-06, + "loss": 0.612, + "step": 1923 + }, + { + "epoch": 0.30361369733312293, + "grad_norm": 0.6125337481498718, + "learning_rate": 4.886104925417333e-06, + "loss": 0.6157, + "step": 1924 + }, + { + "epoch": 0.3037715007101152, + "grad_norm": 0.5987204313278198, + "learning_rate": 4.885980964548708e-06, + "loss": 0.6339, + "step": 1925 + }, + { + "epoch": 0.30392930408710744, + "grad_norm": 0.6211839318275452, + "learning_rate": 4.885856937832821e-06, + "loss": 0.587, + "step": 1926 + }, + { + "epoch": 0.3040871074640997, + "grad_norm": 0.5913103222846985, + "learning_rate": 4.885732845273092e-06, + "loss": 0.6203, + "step": 1927 + }, + { + "epoch": 0.304244910841092, + "grad_norm": 0.6010593771934509, + "learning_rate": 4.8856086868729494e-06, + "loss": 0.5607, + "step": 1928 + }, + { + "epoch": 0.3044027142180843, + "grad_norm": 0.5805857181549072, + "learning_rate": 4.885484462635816e-06, + "loss": 0.6295, + "step": 1929 + }, + { + "epoch": 0.3045605175950765, + "grad_norm": 0.5921640992164612, + "learning_rate": 4.8853601725651215e-06, + "loss": 0.5779, + "step": 1930 + }, + { + "epoch": 0.3047183209720688, + "grad_norm": 0.593693733215332, + "learning_rate": 4.885235816664296e-06, + "loss": 0.5689, + "step": 1931 + }, + { + "epoch": 0.3048761243490611, + "grad_norm": 0.5804865956306458, + "learning_rate": 4.885111394936772e-06, + "loss": 0.6283, + "step": 1932 + }, + { + "epoch": 0.3050339277260533, + "grad_norm": 0.6049553155899048, + "learning_rate": 4.884986907385983e-06, + "loss": 0.5847, + "step": 1933 + }, + { + "epoch": 0.3051917311030456, + "grad_norm": 0.6186820268630981, + "learning_rate": 4.8848623540153635e-06, + "loss": 0.6114, + "step": 1934 + }, + { + "epoch": 0.3053495344800379, + "grad_norm": 0.5767170786857605, + "learning_rate": 4.884737734828352e-06, + "loss": 0.6316, + "step": 1935 + }, + { + "epoch": 0.30550733785703016, + "grad_norm": 0.6083515286445618, + "learning_rate": 4.884613049828387e-06, + "loss": 0.6114, + "step": 1936 + }, + { + "epoch": 0.3056651412340224, + "grad_norm": 0.6578494310379028, + "learning_rate": 4.8844882990189104e-06, + "loss": 0.6017, + "step": 1937 + }, + { + "epoch": 0.30582294461101467, + "grad_norm": 0.6185199618339539, + "learning_rate": 4.884363482403364e-06, + "loss": 0.611, + "step": 1938 + }, + { + "epoch": 0.30598074798800695, + "grad_norm": 0.6176357269287109, + "learning_rate": 4.884238599985194e-06, + "loss": 0.5934, + "step": 1939 + }, + { + "epoch": 0.30613855136499923, + "grad_norm": 0.5845763683319092, + "learning_rate": 4.884113651767845e-06, + "loss": 0.5779, + "step": 1940 + }, + { + "epoch": 0.30629635474199146, + "grad_norm": 0.6027329564094543, + "learning_rate": 4.883988637754765e-06, + "loss": 0.6321, + "step": 1941 + }, + { + "epoch": 0.30645415811898374, + "grad_norm": 0.5837133526802063, + "learning_rate": 4.883863557949407e-06, + "loss": 0.5942, + "step": 1942 + }, + { + "epoch": 0.306611961495976, + "grad_norm": 0.7003186345100403, + "learning_rate": 4.883738412355219e-06, + "loss": 0.6012, + "step": 1943 + }, + { + "epoch": 0.3067697648729683, + "grad_norm": 0.5811489224433899, + "learning_rate": 4.883613200975658e-06, + "loss": 0.6453, + "step": 1944 + }, + { + "epoch": 0.30692756824996054, + "grad_norm": 0.5851175785064697, + "learning_rate": 4.883487923814178e-06, + "loss": 0.5769, + "step": 1945 + }, + { + "epoch": 0.3070853716269528, + "grad_norm": 0.5942931175231934, + "learning_rate": 4.883362580874236e-06, + "loss": 0.594, + "step": 1946 + }, + { + "epoch": 0.3072431750039451, + "grad_norm": 0.5911831259727478, + "learning_rate": 4.883237172159293e-06, + "loss": 0.6016, + "step": 1947 + }, + { + "epoch": 0.30740097838093733, + "grad_norm": 0.59279465675354, + "learning_rate": 4.883111697672808e-06, + "loss": 0.6188, + "step": 1948 + }, + { + "epoch": 0.3075587817579296, + "grad_norm": 0.5867100358009338, + "learning_rate": 4.882986157418244e-06, + "loss": 0.6022, + "step": 1949 + }, + { + "epoch": 0.3077165851349219, + "grad_norm": 0.5821495652198792, + "learning_rate": 4.882860551399066e-06, + "loss": 0.612, + "step": 1950 + }, + { + "epoch": 0.3078743885119142, + "grad_norm": 0.5812498927116394, + "learning_rate": 4.882734879618741e-06, + "loss": 0.5788, + "step": 1951 + }, + { + "epoch": 0.3080321918889064, + "grad_norm": 0.6025142669677734, + "learning_rate": 4.882609142080737e-06, + "loss": 0.623, + "step": 1952 + }, + { + "epoch": 0.3081899952658987, + "grad_norm": 0.6338112354278564, + "learning_rate": 4.882483338788524e-06, + "loss": 0.5831, + "step": 1953 + }, + { + "epoch": 0.30834779864289097, + "grad_norm": 0.6447449326515198, + "learning_rate": 4.8823574697455735e-06, + "loss": 0.5801, + "step": 1954 + }, + { + "epoch": 0.30850560201988325, + "grad_norm": 0.6123966574668884, + "learning_rate": 4.882231534955358e-06, + "loss": 0.6289, + "step": 1955 + }, + { + "epoch": 0.3086634053968755, + "grad_norm": 0.6176717281341553, + "learning_rate": 4.882105534421357e-06, + "loss": 0.5759, + "step": 1956 + }, + { + "epoch": 0.30882120877386776, + "grad_norm": 0.6101246476173401, + "learning_rate": 4.881979468147043e-06, + "loss": 0.6033, + "step": 1957 + }, + { + "epoch": 0.30897901215086004, + "grad_norm": 0.5859583616256714, + "learning_rate": 4.881853336135898e-06, + "loss": 0.6129, + "step": 1958 + }, + { + "epoch": 0.30913681552785227, + "grad_norm": 0.596494734287262, + "learning_rate": 4.881727138391402e-06, + "loss": 0.5836, + "step": 1959 + }, + { + "epoch": 0.30929461890484455, + "grad_norm": 0.5976899862289429, + "learning_rate": 4.881600874917038e-06, + "loss": 0.6081, + "step": 1960 + }, + { + "epoch": 0.30945242228183684, + "grad_norm": 0.6194748878479004, + "learning_rate": 4.88147454571629e-06, + "loss": 0.6276, + "step": 1961 + }, + { + "epoch": 0.3096102256588291, + "grad_norm": 0.6059582233428955, + "learning_rate": 4.881348150792646e-06, + "loss": 0.6388, + "step": 1962 + }, + { + "epoch": 0.30976802903582135, + "grad_norm": 0.5800589323043823, + "learning_rate": 4.881221690149593e-06, + "loss": 0.5801, + "step": 1963 + }, + { + "epoch": 0.30992583241281363, + "grad_norm": 0.5909252762794495, + "learning_rate": 4.881095163790621e-06, + "loss": 0.5869, + "step": 1964 + }, + { + "epoch": 0.3100836357898059, + "grad_norm": 0.5760681629180908, + "learning_rate": 4.8809685717192216e-06, + "loss": 0.6182, + "step": 1965 + }, + { + "epoch": 0.3102414391667982, + "grad_norm": 0.5886933207511902, + "learning_rate": 4.880841913938888e-06, + "loss": 0.6026, + "step": 1966 + }, + { + "epoch": 0.3103992425437904, + "grad_norm": 0.5987986922264099, + "learning_rate": 4.880715190453118e-06, + "loss": 0.5984, + "step": 1967 + }, + { + "epoch": 0.3105570459207827, + "grad_norm": 0.5862927436828613, + "learning_rate": 4.880588401265406e-06, + "loss": 0.5992, + "step": 1968 + }, + { + "epoch": 0.310714849297775, + "grad_norm": 0.5947364568710327, + "learning_rate": 4.880461546379253e-06, + "loss": 0.606, + "step": 1969 + }, + { + "epoch": 0.3108726526747672, + "grad_norm": 0.5779932141304016, + "learning_rate": 4.880334625798159e-06, + "loss": 0.5958, + "step": 1970 + }, + { + "epoch": 0.3110304560517595, + "grad_norm": 0.6047290563583374, + "learning_rate": 4.880207639525627e-06, + "loss": 0.6239, + "step": 1971 + }, + { + "epoch": 0.3111882594287518, + "grad_norm": 0.6070672869682312, + "learning_rate": 4.880080587565162e-06, + "loss": 0.6018, + "step": 1972 + }, + { + "epoch": 0.31134606280574406, + "grad_norm": 0.594688892364502, + "learning_rate": 4.879953469920269e-06, + "loss": 0.6199, + "step": 1973 + }, + { + "epoch": 0.3115038661827363, + "grad_norm": 0.6582217216491699, + "learning_rate": 4.879826286594457e-06, + "loss": 0.5647, + "step": 1974 + }, + { + "epoch": 0.31166166955972857, + "grad_norm": 0.5741918683052063, + "learning_rate": 4.879699037591236e-06, + "loss": 0.582, + "step": 1975 + }, + { + "epoch": 0.31181947293672085, + "grad_norm": 0.5886889696121216, + "learning_rate": 4.8795717229141175e-06, + "loss": 0.6178, + "step": 1976 + }, + { + "epoch": 0.31197727631371314, + "grad_norm": 0.5697699189186096, + "learning_rate": 4.879444342566615e-06, + "loss": 0.5814, + "step": 1977 + }, + { + "epoch": 0.31213507969070536, + "grad_norm": 0.5874729156494141, + "learning_rate": 4.879316896552244e-06, + "loss": 0.6318, + "step": 1978 + }, + { + "epoch": 0.31229288306769765, + "grad_norm": 0.6349831223487854, + "learning_rate": 4.879189384874522e-06, + "loss": 0.6124, + "step": 1979 + }, + { + "epoch": 0.31245068644468993, + "grad_norm": 0.5841462016105652, + "learning_rate": 4.879061807536968e-06, + "loss": 0.5781, + "step": 1980 + }, + { + "epoch": 0.3126084898216822, + "grad_norm": 0.6140736937522888, + "learning_rate": 4.878934164543103e-06, + "loss": 0.6277, + "step": 1981 + }, + { + "epoch": 0.31276629319867444, + "grad_norm": 0.5929992198944092, + "learning_rate": 4.878806455896448e-06, + "loss": 0.6193, + "step": 1982 + }, + { + "epoch": 0.3129240965756667, + "grad_norm": 0.6092530488967896, + "learning_rate": 4.878678681600529e-06, + "loss": 0.5936, + "step": 1983 + }, + { + "epoch": 0.313081899952659, + "grad_norm": 0.5689201354980469, + "learning_rate": 4.878550841658873e-06, + "loss": 0.6239, + "step": 1984 + }, + { + "epoch": 0.31323970332965123, + "grad_norm": 0.610710859298706, + "learning_rate": 4.878422936075006e-06, + "loss": 0.6037, + "step": 1985 + }, + { + "epoch": 0.3133975067066435, + "grad_norm": 0.6410678625106812, + "learning_rate": 4.878294964852459e-06, + "loss": 0.5957, + "step": 1986 + }, + { + "epoch": 0.3135553100836358, + "grad_norm": 0.5681567788124084, + "learning_rate": 4.878166927994764e-06, + "loss": 0.6255, + "step": 1987 + }, + { + "epoch": 0.3137131134606281, + "grad_norm": 0.6415820717811584, + "learning_rate": 4.8780388255054545e-06, + "loss": 0.5909, + "step": 1988 + }, + { + "epoch": 0.3138709168376203, + "grad_norm": 0.577948808670044, + "learning_rate": 4.877910657388064e-06, + "loss": 0.6103, + "step": 1989 + }, + { + "epoch": 0.3140287202146126, + "grad_norm": 0.5951072573661804, + "learning_rate": 4.877782423646132e-06, + "loss": 0.6267, + "step": 1990 + }, + { + "epoch": 0.31418652359160487, + "grad_norm": 0.6078763008117676, + "learning_rate": 4.877654124283197e-06, + "loss": 0.5725, + "step": 1991 + }, + { + "epoch": 0.31434432696859715, + "grad_norm": 0.5965746641159058, + "learning_rate": 4.8775257593027984e-06, + "loss": 0.5891, + "step": 1992 + }, + { + "epoch": 0.3145021303455894, + "grad_norm": 0.5947031378746033, + "learning_rate": 4.87739732870848e-06, + "loss": 0.5896, + "step": 1993 + }, + { + "epoch": 0.31465993372258166, + "grad_norm": 0.6249370574951172, + "learning_rate": 4.877268832503785e-06, + "loss": 0.591, + "step": 1994 + }, + { + "epoch": 0.31481773709957395, + "grad_norm": 0.6080582141876221, + "learning_rate": 4.8771402706922614e-06, + "loss": 0.6283, + "step": 1995 + }, + { + "epoch": 0.3149755404765662, + "grad_norm": 0.601013720035553, + "learning_rate": 4.877011643277456e-06, + "loss": 0.5777, + "step": 1996 + }, + { + "epoch": 0.31513334385355846, + "grad_norm": 0.5987601280212402, + "learning_rate": 4.876882950262918e-06, + "loss": 0.6149, + "step": 1997 + }, + { + "epoch": 0.31529114723055074, + "grad_norm": 0.5809248089790344, + "learning_rate": 4.8767541916522e-06, + "loss": 0.5872, + "step": 1998 + }, + { + "epoch": 0.315448950607543, + "grad_norm": 0.6326442956924438, + "learning_rate": 4.876625367448855e-06, + "loss": 0.5937, + "step": 1999 + }, + { + "epoch": 0.31560675398453525, + "grad_norm": 0.5759526491165161, + "learning_rate": 4.876496477656439e-06, + "loss": 0.5808, + "step": 2000 + }, + { + "epoch": 0.31576455736152753, + "grad_norm": 0.5727280974388123, + "learning_rate": 4.876367522278509e-06, + "loss": 0.5755, + "step": 2001 + }, + { + "epoch": 0.3159223607385198, + "grad_norm": 0.5714939832687378, + "learning_rate": 4.876238501318622e-06, + "loss": 0.5972, + "step": 2002 + }, + { + "epoch": 0.3160801641155121, + "grad_norm": 0.5918742418289185, + "learning_rate": 4.87610941478034e-06, + "loss": 0.6156, + "step": 2003 + }, + { + "epoch": 0.3162379674925043, + "grad_norm": 0.6008358001708984, + "learning_rate": 4.875980262667227e-06, + "loss": 0.6202, + "step": 2004 + }, + { + "epoch": 0.3163957708694966, + "grad_norm": 0.6123740673065186, + "learning_rate": 4.8758510449828446e-06, + "loss": 0.6242, + "step": 2005 + }, + { + "epoch": 0.3165535742464889, + "grad_norm": 0.6380306482315063, + "learning_rate": 4.875721761730759e-06, + "loss": 0.5927, + "step": 2006 + }, + { + "epoch": 0.3167113776234811, + "grad_norm": 0.5747419595718384, + "learning_rate": 4.875592412914541e-06, + "loss": 0.5944, + "step": 2007 + }, + { + "epoch": 0.3168691810004734, + "grad_norm": 0.5956222414970398, + "learning_rate": 4.8754629985377575e-06, + "loss": 0.5954, + "step": 2008 + }, + { + "epoch": 0.3170269843774657, + "grad_norm": 0.6113317608833313, + "learning_rate": 4.8753335186039815e-06, + "loss": 0.6095, + "step": 2009 + }, + { + "epoch": 0.31718478775445796, + "grad_norm": 0.6071817874908447, + "learning_rate": 4.875203973116786e-06, + "loss": 0.6033, + "step": 2010 + }, + { + "epoch": 0.3173425911314502, + "grad_norm": 0.6394937038421631, + "learning_rate": 4.875074362079745e-06, + "loss": 0.5911, + "step": 2011 + }, + { + "epoch": 0.3175003945084425, + "grad_norm": 0.6299561262130737, + "learning_rate": 4.874944685496437e-06, + "loss": 0.6127, + "step": 2012 + }, + { + "epoch": 0.31765819788543476, + "grad_norm": 0.5949356555938721, + "learning_rate": 4.87481494337044e-06, + "loss": 0.5769, + "step": 2013 + }, + { + "epoch": 0.31781600126242704, + "grad_norm": 0.6025078296661377, + "learning_rate": 4.8746851357053346e-06, + "loss": 0.5997, + "step": 2014 + }, + { + "epoch": 0.31797380463941927, + "grad_norm": 0.5971928238868713, + "learning_rate": 4.874555262504703e-06, + "loss": 0.5921, + "step": 2015 + }, + { + "epoch": 0.31813160801641155, + "grad_norm": 0.6041265726089478, + "learning_rate": 4.87442532377213e-06, + "loss": 0.6049, + "step": 2016 + }, + { + "epoch": 0.31828941139340383, + "grad_norm": 0.5940149426460266, + "learning_rate": 4.874295319511201e-06, + "loss": 0.596, + "step": 2017 + }, + { + "epoch": 0.31844721477039606, + "grad_norm": 0.5724262595176697, + "learning_rate": 4.874165249725504e-06, + "loss": 0.5889, + "step": 2018 + }, + { + "epoch": 0.31860501814738834, + "grad_norm": 0.6044447422027588, + "learning_rate": 4.8740351144186295e-06, + "loss": 0.6037, + "step": 2019 + }, + { + "epoch": 0.3187628215243806, + "grad_norm": 0.595079243183136, + "learning_rate": 4.873904913594168e-06, + "loss": 0.6175, + "step": 2020 + }, + { + "epoch": 0.3189206249013729, + "grad_norm": 0.6063107252120972, + "learning_rate": 4.873774647255712e-06, + "loss": 0.6134, + "step": 2021 + }, + { + "epoch": 0.31907842827836513, + "grad_norm": 0.5844847559928894, + "learning_rate": 4.873644315406858e-06, + "loss": 0.6179, + "step": 2022 + }, + { + "epoch": 0.3192362316553574, + "grad_norm": 0.6194273829460144, + "learning_rate": 4.873513918051202e-06, + "loss": 0.607, + "step": 2023 + }, + { + "epoch": 0.3193940350323497, + "grad_norm": 0.6228103637695312, + "learning_rate": 4.873383455192343e-06, + "loss": 0.5794, + "step": 2024 + }, + { + "epoch": 0.319551838409342, + "grad_norm": 0.5653970837593079, + "learning_rate": 4.873252926833881e-06, + "loss": 0.5811, + "step": 2025 + }, + { + "epoch": 0.3197096417863342, + "grad_norm": 0.5708668231964111, + "learning_rate": 4.873122332979418e-06, + "loss": 0.6097, + "step": 2026 + }, + { + "epoch": 0.3198674451633265, + "grad_norm": 0.6145603656768799, + "learning_rate": 4.87299167363256e-06, + "loss": 0.583, + "step": 2027 + }, + { + "epoch": 0.3200252485403188, + "grad_norm": 0.5928651094436646, + "learning_rate": 4.872860948796911e-06, + "loss": 0.5955, + "step": 2028 + }, + { + "epoch": 0.32018305191731106, + "grad_norm": 0.6226240396499634, + "learning_rate": 4.8727301584760785e-06, + "loss": 0.6212, + "step": 2029 + }, + { + "epoch": 0.3203408552943033, + "grad_norm": 0.5749849677085876, + "learning_rate": 4.872599302673674e-06, + "loss": 0.5946, + "step": 2030 + }, + { + "epoch": 0.32049865867129557, + "grad_norm": 0.5948768258094788, + "learning_rate": 4.872468381393307e-06, + "loss": 0.6149, + "step": 2031 + }, + { + "epoch": 0.32065646204828785, + "grad_norm": 0.607522189617157, + "learning_rate": 4.87233739463859e-06, + "loss": 0.6012, + "step": 2032 + }, + { + "epoch": 0.3208142654252801, + "grad_norm": 0.6284456253051758, + "learning_rate": 4.87220634241314e-06, + "loss": 0.5995, + "step": 2033 + }, + { + "epoch": 0.32097206880227236, + "grad_norm": 0.5976285934448242, + "learning_rate": 4.872075224720573e-06, + "loss": 0.6238, + "step": 2034 + }, + { + "epoch": 0.32112987217926464, + "grad_norm": 0.6016439199447632, + "learning_rate": 4.871944041564507e-06, + "loss": 0.6145, + "step": 2035 + }, + { + "epoch": 0.3212876755562569, + "grad_norm": 0.5943738222122192, + "learning_rate": 4.871812792948562e-06, + "loss": 0.6409, + "step": 2036 + }, + { + "epoch": 0.32144547893324915, + "grad_norm": 0.6407731771469116, + "learning_rate": 4.871681478876361e-06, + "loss": 0.6376, + "step": 2037 + }, + { + "epoch": 0.32160328231024143, + "grad_norm": 0.5997557640075684, + "learning_rate": 4.8715500993515284e-06, + "loss": 0.5945, + "step": 2038 + }, + { + "epoch": 0.3217610856872337, + "grad_norm": 0.5625950694084167, + "learning_rate": 4.871418654377688e-06, + "loss": 0.5639, + "step": 2039 + }, + { + "epoch": 0.321918889064226, + "grad_norm": 0.5838669538497925, + "learning_rate": 4.87128714395847e-06, + "loss": 0.6117, + "step": 2040 + }, + { + "epoch": 0.3220766924412182, + "grad_norm": 0.5692921280860901, + "learning_rate": 4.871155568097502e-06, + "loss": 0.5932, + "step": 2041 + }, + { + "epoch": 0.3222344958182105, + "grad_norm": 0.595718264579773, + "learning_rate": 4.871023926798415e-06, + "loss": 0.5874, + "step": 2042 + }, + { + "epoch": 0.3223922991952028, + "grad_norm": 0.6104974746704102, + "learning_rate": 4.870892220064843e-06, + "loss": 0.5957, + "step": 2043 + }, + { + "epoch": 0.322550102572195, + "grad_norm": 0.6342365741729736, + "learning_rate": 4.870760447900421e-06, + "loss": 0.6161, + "step": 2044 + }, + { + "epoch": 0.3227079059491873, + "grad_norm": 0.6532948613166809, + "learning_rate": 4.870628610308783e-06, + "loss": 0.6137, + "step": 2045 + }, + { + "epoch": 0.3228657093261796, + "grad_norm": 0.6025333404541016, + "learning_rate": 4.870496707293571e-06, + "loss": 0.5939, + "step": 2046 + }, + { + "epoch": 0.32302351270317187, + "grad_norm": 0.5997824668884277, + "learning_rate": 4.870364738858423e-06, + "loss": 0.6261, + "step": 2047 + }, + { + "epoch": 0.3231813160801641, + "grad_norm": 0.5952382683753967, + "learning_rate": 4.870232705006981e-06, + "loss": 0.6091, + "step": 2048 + }, + { + "epoch": 0.3233391194571564, + "grad_norm": 0.6018871068954468, + "learning_rate": 4.87010060574289e-06, + "loss": 0.6247, + "step": 2049 + }, + { + "epoch": 0.32349692283414866, + "grad_norm": 0.6247005462646484, + "learning_rate": 4.869968441069795e-06, + "loss": 0.587, + "step": 2050 + }, + { + "epoch": 0.32365472621114094, + "grad_norm": 0.6273247599601746, + "learning_rate": 4.869836210991343e-06, + "loss": 0.6124, + "step": 2051 + }, + { + "epoch": 0.32381252958813317, + "grad_norm": 0.5944460034370422, + "learning_rate": 4.869703915511184e-06, + "loss": 0.598, + "step": 2052 + }, + { + "epoch": 0.32397033296512545, + "grad_norm": 0.6016001105308533, + "learning_rate": 4.869571554632968e-06, + "loss": 0.6019, + "step": 2053 + }, + { + "epoch": 0.32412813634211773, + "grad_norm": 0.5847764015197754, + "learning_rate": 4.869439128360349e-06, + "loss": 0.5954, + "step": 2054 + }, + { + "epoch": 0.32428593971910996, + "grad_norm": 0.5702676177024841, + "learning_rate": 4.869306636696981e-06, + "loss": 0.5962, + "step": 2055 + }, + { + "epoch": 0.32444374309610224, + "grad_norm": 0.590527355670929, + "learning_rate": 4.86917407964652e-06, + "loss": 0.6169, + "step": 2056 + }, + { + "epoch": 0.3246015464730945, + "grad_norm": 0.61552494764328, + "learning_rate": 4.869041457212626e-06, + "loss": 0.5888, + "step": 2057 + }, + { + "epoch": 0.3247593498500868, + "grad_norm": 0.5740402340888977, + "learning_rate": 4.868908769398957e-06, + "loss": 0.6078, + "step": 2058 + }, + { + "epoch": 0.32491715322707904, + "grad_norm": 0.5936267375946045, + "learning_rate": 4.868776016209176e-06, + "loss": 0.5824, + "step": 2059 + }, + { + "epoch": 0.3250749566040713, + "grad_norm": 0.600146472454071, + "learning_rate": 4.868643197646946e-06, + "loss": 0.6354, + "step": 2060 + }, + { + "epoch": 0.3252327599810636, + "grad_norm": 0.5911927819252014, + "learning_rate": 4.868510313715933e-06, + "loss": 0.6312, + "step": 2061 + }, + { + "epoch": 0.3253905633580559, + "grad_norm": 0.6071283221244812, + "learning_rate": 4.868377364419804e-06, + "loss": 0.6207, + "step": 2062 + }, + { + "epoch": 0.3255483667350481, + "grad_norm": 0.5998826622962952, + "learning_rate": 4.868244349762229e-06, + "loss": 0.6032, + "step": 2063 + }, + { + "epoch": 0.3257061701120404, + "grad_norm": 0.6075683832168579, + "learning_rate": 4.868111269746878e-06, + "loss": 0.6093, + "step": 2064 + }, + { + "epoch": 0.3258639734890327, + "grad_norm": 0.6075190305709839, + "learning_rate": 4.867978124377423e-06, + "loss": 0.5742, + "step": 2065 + }, + { + "epoch": 0.32602177686602496, + "grad_norm": 0.591975212097168, + "learning_rate": 4.86784491365754e-06, + "loss": 0.5977, + "step": 2066 + }, + { + "epoch": 0.3261795802430172, + "grad_norm": 0.5865190625190735, + "learning_rate": 4.867711637590904e-06, + "loss": 0.5995, + "step": 2067 + }, + { + "epoch": 0.32633738362000947, + "grad_norm": 0.5895482897758484, + "learning_rate": 4.867578296181194e-06, + "loss": 0.6229, + "step": 2068 + }, + { + "epoch": 0.32649518699700175, + "grad_norm": 0.6138648986816406, + "learning_rate": 4.867444889432089e-06, + "loss": 0.6288, + "step": 2069 + }, + { + "epoch": 0.326652990373994, + "grad_norm": 0.6196238994598389, + "learning_rate": 4.867311417347272e-06, + "loss": 0.617, + "step": 2070 + }, + { + "epoch": 0.32681079375098626, + "grad_norm": 0.5827062129974365, + "learning_rate": 4.8671778799304255e-06, + "loss": 0.6478, + "step": 2071 + }, + { + "epoch": 0.32696859712797854, + "grad_norm": 0.6747530698776245, + "learning_rate": 4.867044277185235e-06, + "loss": 0.6317, + "step": 2072 + }, + { + "epoch": 0.3271264005049708, + "grad_norm": 0.580110490322113, + "learning_rate": 4.866910609115388e-06, + "loss": 0.5785, + "step": 2073 + }, + { + "epoch": 0.32728420388196305, + "grad_norm": 0.6330163478851318, + "learning_rate": 4.866776875724572e-06, + "loss": 0.5933, + "step": 2074 + }, + { + "epoch": 0.32744200725895534, + "grad_norm": 0.6008664965629578, + "learning_rate": 4.8666430770164795e-06, + "loss": 0.6257, + "step": 2075 + }, + { + "epoch": 0.3275998106359476, + "grad_norm": 0.6016117930412292, + "learning_rate": 4.866509212994802e-06, + "loss": 0.6199, + "step": 2076 + }, + { + "epoch": 0.3277576140129399, + "grad_norm": 0.6345735788345337, + "learning_rate": 4.866375283663235e-06, + "loss": 0.5895, + "step": 2077 + }, + { + "epoch": 0.32791541738993213, + "grad_norm": 0.5888423323631287, + "learning_rate": 4.866241289025473e-06, + "loss": 0.6058, + "step": 2078 + }, + { + "epoch": 0.3280732207669244, + "grad_norm": 0.5872825980186462, + "learning_rate": 4.8661072290852155e-06, + "loss": 0.5716, + "step": 2079 + }, + { + "epoch": 0.3282310241439167, + "grad_norm": 0.6232918500900269, + "learning_rate": 4.8659731038461605e-06, + "loss": 0.5926, + "step": 2080 + }, + { + "epoch": 0.3283888275209089, + "grad_norm": 0.5627777576446533, + "learning_rate": 4.865838913312011e-06, + "loss": 0.5801, + "step": 2081 + }, + { + "epoch": 0.3285466308979012, + "grad_norm": 0.6135475635528564, + "learning_rate": 4.86570465748647e-06, + "loss": 0.607, + "step": 2082 + }, + { + "epoch": 0.3287044342748935, + "grad_norm": 0.594552755355835, + "learning_rate": 4.865570336373241e-06, + "loss": 0.5854, + "step": 2083 + }, + { + "epoch": 0.32886223765188577, + "grad_norm": 0.5925341844558716, + "learning_rate": 4.865435949976034e-06, + "loss": 0.5924, + "step": 2084 + }, + { + "epoch": 0.329020041028878, + "grad_norm": 0.5932354927062988, + "learning_rate": 4.865301498298555e-06, + "loss": 0.6466, + "step": 2085 + }, + { + "epoch": 0.3291778444058703, + "grad_norm": 0.6120088696479797, + "learning_rate": 4.865166981344516e-06, + "loss": 0.6051, + "step": 2086 + }, + { + "epoch": 0.32933564778286256, + "grad_norm": 0.606268048286438, + "learning_rate": 4.8650323991176295e-06, + "loss": 0.5755, + "step": 2087 + }, + { + "epoch": 0.32949345115985484, + "grad_norm": 0.5931373238563538, + "learning_rate": 4.8648977516216085e-06, + "loss": 0.6327, + "step": 2088 + }, + { + "epoch": 0.32965125453684707, + "grad_norm": 0.6658146977424622, + "learning_rate": 4.86476303886017e-06, + "loss": 0.5894, + "step": 2089 + }, + { + "epoch": 0.32980905791383935, + "grad_norm": 0.5950360894203186, + "learning_rate": 4.864628260837031e-06, + "loss": 0.6343, + "step": 2090 + }, + { + "epoch": 0.32996686129083164, + "grad_norm": 0.6251366138458252, + "learning_rate": 4.864493417555911e-06, + "loss": 0.5699, + "step": 2091 + }, + { + "epoch": 0.33012466466782386, + "grad_norm": 0.5632107853889465, + "learning_rate": 4.864358509020532e-06, + "loss": 0.6149, + "step": 2092 + }, + { + "epoch": 0.33028246804481615, + "grad_norm": 0.6189080476760864, + "learning_rate": 4.8642235352346166e-06, + "loss": 0.5735, + "step": 2093 + }, + { + "epoch": 0.33044027142180843, + "grad_norm": 0.6103638410568237, + "learning_rate": 4.864088496201891e-06, + "loss": 0.6082, + "step": 2094 + }, + { + "epoch": 0.3305980747988007, + "grad_norm": 0.6351014375686646, + "learning_rate": 4.863953391926079e-06, + "loss": 0.6194, + "step": 2095 + }, + { + "epoch": 0.33075587817579294, + "grad_norm": 0.6111370325088501, + "learning_rate": 4.863818222410913e-06, + "loss": 0.6273, + "step": 2096 + }, + { + "epoch": 0.3309136815527852, + "grad_norm": 0.6077091097831726, + "learning_rate": 4.86368298766012e-06, + "loss": 0.5846, + "step": 2097 + }, + { + "epoch": 0.3310714849297775, + "grad_norm": 0.6041120290756226, + "learning_rate": 4.863547687677435e-06, + "loss": 0.5856, + "step": 2098 + }, + { + "epoch": 0.3312292883067698, + "grad_norm": 0.5822417140007019, + "learning_rate": 4.863412322466589e-06, + "loss": 0.5961, + "step": 2099 + }, + { + "epoch": 0.331387091683762, + "grad_norm": 0.6428790092468262, + "learning_rate": 4.86327689203132e-06, + "loss": 0.5844, + "step": 2100 + }, + { + "epoch": 0.3315448950607543, + "grad_norm": 0.6487017273902893, + "learning_rate": 4.863141396375365e-06, + "loss": 0.5975, + "step": 2101 + }, + { + "epoch": 0.3317026984377466, + "grad_norm": 0.6142215132713318, + "learning_rate": 4.863005835502463e-06, + "loss": 0.6121, + "step": 2102 + }, + { + "epoch": 0.33186050181473886, + "grad_norm": 0.6039011478424072, + "learning_rate": 4.862870209416355e-06, + "loss": 0.6029, + "step": 2103 + }, + { + "epoch": 0.3320183051917311, + "grad_norm": 0.6196528077125549, + "learning_rate": 4.862734518120785e-06, + "loss": 0.6217, + "step": 2104 + }, + { + "epoch": 0.33217610856872337, + "grad_norm": 0.580222487449646, + "learning_rate": 4.862598761619497e-06, + "loss": 0.5971, + "step": 2105 + }, + { + "epoch": 0.33233391194571565, + "grad_norm": 0.6053614616394043, + "learning_rate": 4.862462939916237e-06, + "loss": 0.6322, + "step": 2106 + }, + { + "epoch": 0.3324917153227079, + "grad_norm": 0.6038833260536194, + "learning_rate": 4.862327053014754e-06, + "loss": 0.6088, + "step": 2107 + }, + { + "epoch": 0.33264951869970016, + "grad_norm": 0.5700918436050415, + "learning_rate": 4.862191100918798e-06, + "loss": 0.5712, + "step": 2108 + }, + { + "epoch": 0.33280732207669245, + "grad_norm": 0.623579204082489, + "learning_rate": 4.8620550836321214e-06, + "loss": 0.5975, + "step": 2109 + }, + { + "epoch": 0.33296512545368473, + "grad_norm": 0.6417087316513062, + "learning_rate": 4.861919001158478e-06, + "loss": 0.5903, + "step": 2110 + }, + { + "epoch": 0.33312292883067696, + "grad_norm": 0.5967323780059814, + "learning_rate": 4.861782853501622e-06, + "loss": 0.5818, + "step": 2111 + }, + { + "epoch": 0.33328073220766924, + "grad_norm": 0.6167300343513489, + "learning_rate": 4.861646640665312e-06, + "loss": 0.618, + "step": 2112 + }, + { + "epoch": 0.3334385355846615, + "grad_norm": 0.6076642274856567, + "learning_rate": 4.861510362653306e-06, + "loss": 0.6212, + "step": 2113 + }, + { + "epoch": 0.3335963389616538, + "grad_norm": 0.5965882539749146, + "learning_rate": 4.861374019469367e-06, + "loss": 0.5723, + "step": 2114 + }, + { + "epoch": 0.33375414233864603, + "grad_norm": 0.6007471680641174, + "learning_rate": 4.861237611117257e-06, + "loss": 0.6158, + "step": 2115 + }, + { + "epoch": 0.3339119457156383, + "grad_norm": 0.6285093426704407, + "learning_rate": 4.861101137600738e-06, + "loss": 0.5715, + "step": 2116 + }, + { + "epoch": 0.3340697490926306, + "grad_norm": 0.5941210985183716, + "learning_rate": 4.8609645989235796e-06, + "loss": 0.6224, + "step": 2117 + }, + { + "epoch": 0.3342275524696228, + "grad_norm": 0.6018872261047363, + "learning_rate": 4.8608279950895485e-06, + "loss": 0.5901, + "step": 2118 + }, + { + "epoch": 0.3343853558466151, + "grad_norm": 0.6309861540794373, + "learning_rate": 4.860691326102415e-06, + "loss": 0.586, + "step": 2119 + }, + { + "epoch": 0.3345431592236074, + "grad_norm": 0.563592255115509, + "learning_rate": 4.86055459196595e-06, + "loss": 0.585, + "step": 2120 + }, + { + "epoch": 0.33470096260059967, + "grad_norm": 0.6347326636314392, + "learning_rate": 4.860417792683928e-06, + "loss": 0.5682, + "step": 2121 + }, + { + "epoch": 0.3348587659775919, + "grad_norm": 0.6047946214675903, + "learning_rate": 4.860280928260124e-06, + "loss": 0.6119, + "step": 2122 + }, + { + "epoch": 0.3350165693545842, + "grad_norm": 0.6168464422225952, + "learning_rate": 4.860143998698315e-06, + "loss": 0.5925, + "step": 2123 + }, + { + "epoch": 0.33517437273157646, + "grad_norm": 0.5894562005996704, + "learning_rate": 4.860007004002281e-06, + "loss": 0.61, + "step": 2124 + }, + { + "epoch": 0.33533217610856875, + "grad_norm": 0.5793420076370239, + "learning_rate": 4.8598699441758e-06, + "loss": 0.6211, + "step": 2125 + }, + { + "epoch": 0.335489979485561, + "grad_norm": 0.5870744585990906, + "learning_rate": 4.859732819222658e-06, + "loss": 0.5888, + "step": 2126 + }, + { + "epoch": 0.33564778286255326, + "grad_norm": 0.6015093922615051, + "learning_rate": 4.859595629146637e-06, + "loss": 0.6119, + "step": 2127 + }, + { + "epoch": 0.33580558623954554, + "grad_norm": 0.5820061564445496, + "learning_rate": 4.859458373951523e-06, + "loss": 0.63, + "step": 2128 + }, + { + "epoch": 0.33596338961653777, + "grad_norm": 0.5957794785499573, + "learning_rate": 4.859321053641106e-06, + "loss": 0.5637, + "step": 2129 + }, + { + "epoch": 0.33612119299353005, + "grad_norm": 0.6114950776100159, + "learning_rate": 4.859183668219173e-06, + "loss": 0.6147, + "step": 2130 + }, + { + "epoch": 0.33627899637052233, + "grad_norm": 0.5785254836082458, + "learning_rate": 4.859046217689518e-06, + "loss": 0.5646, + "step": 2131 + }, + { + "epoch": 0.3364367997475146, + "grad_norm": 0.5830047130584717, + "learning_rate": 4.858908702055932e-06, + "loss": 0.6126, + "step": 2132 + }, + { + "epoch": 0.33659460312450684, + "grad_norm": 0.5844760537147522, + "learning_rate": 4.858771121322212e-06, + "loss": 0.6181, + "step": 2133 + }, + { + "epoch": 0.3367524065014991, + "grad_norm": 0.634690523147583, + "learning_rate": 4.858633475492154e-06, + "loss": 0.5954, + "step": 2134 + }, + { + "epoch": 0.3369102098784914, + "grad_norm": 0.6157019734382629, + "learning_rate": 4.8584957645695564e-06, + "loss": 0.5871, + "step": 2135 + }, + { + "epoch": 0.3370680132554837, + "grad_norm": 0.6096652746200562, + "learning_rate": 4.8583579885582205e-06, + "loss": 0.6323, + "step": 2136 + }, + { + "epoch": 0.3372258166324759, + "grad_norm": 0.6086934804916382, + "learning_rate": 4.858220147461949e-06, + "loss": 0.6175, + "step": 2137 + }, + { + "epoch": 0.3373836200094682, + "grad_norm": 0.6275618672370911, + "learning_rate": 4.858082241284543e-06, + "loss": 0.6066, + "step": 2138 + }, + { + "epoch": 0.3375414233864605, + "grad_norm": 0.6551195979118347, + "learning_rate": 4.857944270029812e-06, + "loss": 0.5746, + "step": 2139 + }, + { + "epoch": 0.33769922676345276, + "grad_norm": 0.5928221344947815, + "learning_rate": 4.857806233701562e-06, + "loss": 0.5849, + "step": 2140 + }, + { + "epoch": 0.337857030140445, + "grad_norm": 0.6305047869682312, + "learning_rate": 4.857668132303603e-06, + "loss": 0.5983, + "step": 2141 + }, + { + "epoch": 0.3380148335174373, + "grad_norm": 0.6227213144302368, + "learning_rate": 4.857529965839746e-06, + "loss": 0.6027, + "step": 2142 + }, + { + "epoch": 0.33817263689442956, + "grad_norm": 0.5909115076065063, + "learning_rate": 4.857391734313803e-06, + "loss": 0.6007, + "step": 2143 + }, + { + "epoch": 0.3383304402714218, + "grad_norm": 0.5848812460899353, + "learning_rate": 4.857253437729591e-06, + "loss": 0.6215, + "step": 2144 + }, + { + "epoch": 0.33848824364841407, + "grad_norm": 0.5961494445800781, + "learning_rate": 4.8571150760909244e-06, + "loss": 0.5845, + "step": 2145 + }, + { + "epoch": 0.33864604702540635, + "grad_norm": 0.5873366594314575, + "learning_rate": 4.856976649401624e-06, + "loss": 0.6065, + "step": 2146 + }, + { + "epoch": 0.33880385040239863, + "grad_norm": 0.6005118489265442, + "learning_rate": 4.856838157665508e-06, + "loss": 0.6339, + "step": 2147 + }, + { + "epoch": 0.33896165377939086, + "grad_norm": 0.6131367683410645, + "learning_rate": 4.856699600886399e-06, + "loss": 0.5927, + "step": 2148 + }, + { + "epoch": 0.33911945715638314, + "grad_norm": 0.5762116312980652, + "learning_rate": 4.856560979068121e-06, + "loss": 0.6061, + "step": 2149 + }, + { + "epoch": 0.3392772605333754, + "grad_norm": 0.6148077249526978, + "learning_rate": 4.8564222922145e-06, + "loss": 0.613, + "step": 2150 + }, + { + "epoch": 0.3394350639103677, + "grad_norm": 0.5817411541938782, + "learning_rate": 4.856283540329363e-06, + "loss": 0.591, + "step": 2151 + }, + { + "epoch": 0.33959286728735993, + "grad_norm": 0.6266690492630005, + "learning_rate": 4.856144723416539e-06, + "loss": 0.6175, + "step": 2152 + }, + { + "epoch": 0.3397506706643522, + "grad_norm": 0.6083249449729919, + "learning_rate": 4.85600584147986e-06, + "loss": 0.5927, + "step": 2153 + }, + { + "epoch": 0.3399084740413445, + "grad_norm": 0.5886242389678955, + "learning_rate": 4.855866894523158e-06, + "loss": 0.6026, + "step": 2154 + }, + { + "epoch": 0.3400662774183367, + "grad_norm": 0.6127223372459412, + "learning_rate": 4.855727882550267e-06, + "loss": 0.5678, + "step": 2155 + }, + { + "epoch": 0.340224080795329, + "grad_norm": 0.5992236733436584, + "learning_rate": 4.855588805565025e-06, + "loss": 0.5905, + "step": 2156 + }, + { + "epoch": 0.3403818841723213, + "grad_norm": 0.6197926998138428, + "learning_rate": 4.855449663571269e-06, + "loss": 0.5743, + "step": 2157 + }, + { + "epoch": 0.3405396875493136, + "grad_norm": 0.6065779328346252, + "learning_rate": 4.85531045657284e-06, + "loss": 0.6007, + "step": 2158 + }, + { + "epoch": 0.3406974909263058, + "grad_norm": 0.5763601660728455, + "learning_rate": 4.855171184573577e-06, + "loss": 0.6418, + "step": 2159 + }, + { + "epoch": 0.3408552943032981, + "grad_norm": 0.5730987191200256, + "learning_rate": 4.855031847577328e-06, + "loss": 0.5991, + "step": 2160 + }, + { + "epoch": 0.34101309768029037, + "grad_norm": 0.6158075928688049, + "learning_rate": 4.854892445587934e-06, + "loss": 0.5964, + "step": 2161 + }, + { + "epoch": 0.34117090105728265, + "grad_norm": 0.6243632435798645, + "learning_rate": 4.854752978609245e-06, + "loss": 0.6604, + "step": 2162 + }, + { + "epoch": 0.3413287044342749, + "grad_norm": 0.6223141551017761, + "learning_rate": 4.854613446645109e-06, + "loss": 0.6214, + "step": 2163 + }, + { + "epoch": 0.34148650781126716, + "grad_norm": 0.5859208106994629, + "learning_rate": 4.854473849699377e-06, + "loss": 0.5988, + "step": 2164 + }, + { + "epoch": 0.34164431118825944, + "grad_norm": 0.5924085974693298, + "learning_rate": 4.854334187775901e-06, + "loss": 0.5877, + "step": 2165 + }, + { + "epoch": 0.34180211456525167, + "grad_norm": 0.5937061309814453, + "learning_rate": 4.854194460878536e-06, + "loss": 0.6344, + "step": 2166 + }, + { + "epoch": 0.34195991794224395, + "grad_norm": 0.5823507308959961, + "learning_rate": 4.854054669011137e-06, + "loss": 0.5934, + "step": 2167 + }, + { + "epoch": 0.34211772131923623, + "grad_norm": 0.6349167823791504, + "learning_rate": 4.853914812177564e-06, + "loss": 0.6186, + "step": 2168 + }, + { + "epoch": 0.3422755246962285, + "grad_norm": 0.6337264776229858, + "learning_rate": 4.853774890381674e-06, + "loss": 0.604, + "step": 2169 + }, + { + "epoch": 0.34243332807322074, + "grad_norm": 0.6230252981185913, + "learning_rate": 4.853634903627331e-06, + "loss": 0.643, + "step": 2170 + }, + { + "epoch": 0.342591131450213, + "grad_norm": 0.6103272438049316, + "learning_rate": 4.8534948519183974e-06, + "loss": 0.6367, + "step": 2171 + }, + { + "epoch": 0.3427489348272053, + "grad_norm": 0.6017941236495972, + "learning_rate": 4.853354735258737e-06, + "loss": 0.6208, + "step": 2172 + }, + { + "epoch": 0.3429067382041976, + "grad_norm": 0.6313395500183105, + "learning_rate": 4.853214553652219e-06, + "loss": 0.6342, + "step": 2173 + }, + { + "epoch": 0.3430645415811898, + "grad_norm": 0.5863996744155884, + "learning_rate": 4.85307430710271e-06, + "loss": 0.6148, + "step": 2174 + }, + { + "epoch": 0.3432223449581821, + "grad_norm": 0.6073969602584839, + "learning_rate": 4.852933995614082e-06, + "loss": 0.5944, + "step": 2175 + }, + { + "epoch": 0.3433801483351744, + "grad_norm": 0.628950297832489, + "learning_rate": 4.852793619190206e-06, + "loss": 0.5837, + "step": 2176 + }, + { + "epoch": 0.34353795171216667, + "grad_norm": 0.5901555418968201, + "learning_rate": 4.852653177834957e-06, + "loss": 0.5883, + "step": 2177 + }, + { + "epoch": 0.3436957550891589, + "grad_norm": 0.5760615468025208, + "learning_rate": 4.85251267155221e-06, + "loss": 0.6139, + "step": 2178 + }, + { + "epoch": 0.3438535584661512, + "grad_norm": 0.600027859210968, + "learning_rate": 4.852372100345844e-06, + "loss": 0.5997, + "step": 2179 + }, + { + "epoch": 0.34401136184314346, + "grad_norm": 0.5839065909385681, + "learning_rate": 4.852231464219737e-06, + "loss": 0.6484, + "step": 2180 + }, + { + "epoch": 0.3441691652201357, + "grad_norm": 0.6121768951416016, + "learning_rate": 4.85209076317777e-06, + "loss": 0.6041, + "step": 2181 + }, + { + "epoch": 0.34432696859712797, + "grad_norm": 0.6438751816749573, + "learning_rate": 4.851949997223827e-06, + "loss": 0.6102, + "step": 2182 + }, + { + "epoch": 0.34448477197412025, + "grad_norm": 0.5822332501411438, + "learning_rate": 4.8518091663617935e-06, + "loss": 0.5951, + "step": 2183 + }, + { + "epoch": 0.34464257535111253, + "grad_norm": 0.5920370817184448, + "learning_rate": 4.851668270595554e-06, + "loss": 0.6045, + "step": 2184 + }, + { + "epoch": 0.34480037872810476, + "grad_norm": 0.5921240448951721, + "learning_rate": 4.8515273099289985e-06, + "loss": 0.6366, + "step": 2185 + }, + { + "epoch": 0.34495818210509704, + "grad_norm": 0.5959035158157349, + "learning_rate": 4.851386284366017e-06, + "loss": 0.6167, + "step": 2186 + }, + { + "epoch": 0.3451159854820893, + "grad_norm": 0.5996573567390442, + "learning_rate": 4.851245193910501e-06, + "loss": 0.5902, + "step": 2187 + }, + { + "epoch": 0.3452737888590816, + "grad_norm": 0.6282497048377991, + "learning_rate": 4.8511040385663445e-06, + "loss": 0.62, + "step": 2188 + }, + { + "epoch": 0.34543159223607384, + "grad_norm": 0.5981736183166504, + "learning_rate": 4.850962818337442e-06, + "loss": 0.6346, + "step": 2189 + }, + { + "epoch": 0.3455893956130661, + "grad_norm": 0.5907652974128723, + "learning_rate": 4.850821533227693e-06, + "loss": 0.6115, + "step": 2190 + }, + { + "epoch": 0.3457471989900584, + "grad_norm": 0.6227137446403503, + "learning_rate": 4.850680183240994e-06, + "loss": 0.6099, + "step": 2191 + }, + { + "epoch": 0.34590500236705063, + "grad_norm": 0.5750992298126221, + "learning_rate": 4.850538768381249e-06, + "loss": 0.5745, + "step": 2192 + }, + { + "epoch": 0.3460628057440429, + "grad_norm": 0.6198127865791321, + "learning_rate": 4.850397288652357e-06, + "loss": 0.6135, + "step": 2193 + }, + { + "epoch": 0.3462206091210352, + "grad_norm": 0.5849228501319885, + "learning_rate": 4.850255744058226e-06, + "loss": 0.6247, + "step": 2194 + }, + { + "epoch": 0.3463784124980275, + "grad_norm": 0.5863015055656433, + "learning_rate": 4.8501141346027615e-06, + "loss": 0.5828, + "step": 2195 + }, + { + "epoch": 0.3465362158750197, + "grad_norm": 0.6065423488616943, + "learning_rate": 4.849972460289869e-06, + "loss": 0.5403, + "step": 2196 + }, + { + "epoch": 0.346694019252012, + "grad_norm": 0.566260576248169, + "learning_rate": 4.8498307211234615e-06, + "loss": 0.5973, + "step": 2197 + }, + { + "epoch": 0.34685182262900427, + "grad_norm": 0.627583384513855, + "learning_rate": 4.849688917107449e-06, + "loss": 0.6059, + "step": 2198 + }, + { + "epoch": 0.34700962600599655, + "grad_norm": 0.5987470746040344, + "learning_rate": 4.849547048245745e-06, + "loss": 0.5841, + "step": 2199 + }, + { + "epoch": 0.3471674293829888, + "grad_norm": 0.6044975519180298, + "learning_rate": 4.849405114542266e-06, + "loss": 0.6077, + "step": 2200 + }, + { + "epoch": 0.34732523275998106, + "grad_norm": 0.5877317786216736, + "learning_rate": 4.849263116000927e-06, + "loss": 0.6045, + "step": 2201 + }, + { + "epoch": 0.34748303613697334, + "grad_norm": 0.5828509330749512, + "learning_rate": 4.849121052625649e-06, + "loss": 0.6129, + "step": 2202 + }, + { + "epoch": 0.34764083951396557, + "grad_norm": 0.5674540996551514, + "learning_rate": 4.84897892442035e-06, + "loss": 0.6047, + "step": 2203 + }, + { + "epoch": 0.34779864289095785, + "grad_norm": 0.6086059212684631, + "learning_rate": 4.848836731388955e-06, + "loss": 0.5528, + "step": 2204 + }, + { + "epoch": 0.34795644626795014, + "grad_norm": 0.6066405773162842, + "learning_rate": 4.848694473535387e-06, + "loss": 0.6064, + "step": 2205 + }, + { + "epoch": 0.3481142496449424, + "grad_norm": 0.6265670657157898, + "learning_rate": 4.848552150863573e-06, + "loss": 0.6099, + "step": 2206 + }, + { + "epoch": 0.34827205302193465, + "grad_norm": 0.5906252264976501, + "learning_rate": 4.848409763377438e-06, + "loss": 0.618, + "step": 2207 + }, + { + "epoch": 0.34842985639892693, + "grad_norm": 0.6013564467430115, + "learning_rate": 4.848267311080914e-06, + "loss": 0.6027, + "step": 2208 + }, + { + "epoch": 0.3485876597759192, + "grad_norm": 0.5707611441612244, + "learning_rate": 4.8481247939779316e-06, + "loss": 0.5868, + "step": 2209 + }, + { + "epoch": 0.3487454631529115, + "grad_norm": 0.5880939364433289, + "learning_rate": 4.847982212072425e-06, + "loss": 0.5901, + "step": 2210 + }, + { + "epoch": 0.3489032665299037, + "grad_norm": 0.760917603969574, + "learning_rate": 4.847839565368327e-06, + "loss": 0.6374, + "step": 2211 + }, + { + "epoch": 0.349061069906896, + "grad_norm": 0.6168128848075867, + "learning_rate": 4.8476968538695756e-06, + "loss": 0.6156, + "step": 2212 + }, + { + "epoch": 0.3492188732838883, + "grad_norm": 0.5493120551109314, + "learning_rate": 4.84755407758011e-06, + "loss": 0.5966, + "step": 2213 + }, + { + "epoch": 0.34937667666088057, + "grad_norm": 0.5654345750808716, + "learning_rate": 4.847411236503869e-06, + "loss": 0.5946, + "step": 2214 + }, + { + "epoch": 0.3495344800378728, + "grad_norm": 0.5819793343544006, + "learning_rate": 4.847268330644795e-06, + "loss": 0.5949, + "step": 2215 + }, + { + "epoch": 0.3496922834148651, + "grad_norm": 0.5660437345504761, + "learning_rate": 4.847125360006832e-06, + "loss": 0.5942, + "step": 2216 + }, + { + "epoch": 0.34985008679185736, + "grad_norm": 0.5856075882911682, + "learning_rate": 4.846982324593926e-06, + "loss": 0.6247, + "step": 2217 + }, + { + "epoch": 0.3500078901688496, + "grad_norm": 0.5873866081237793, + "learning_rate": 4.846839224410025e-06, + "loss": 0.6073, + "step": 2218 + }, + { + "epoch": 0.35016569354584187, + "grad_norm": 0.6101171374320984, + "learning_rate": 4.8466960594590765e-06, + "loss": 0.5721, + "step": 2219 + }, + { + "epoch": 0.35032349692283415, + "grad_norm": 0.5972588658332825, + "learning_rate": 4.846552829745033e-06, + "loss": 0.6336, + "step": 2220 + }, + { + "epoch": 0.35048130029982644, + "grad_norm": 0.6062920689582825, + "learning_rate": 4.846409535271846e-06, + "loss": 0.5737, + "step": 2221 + }, + { + "epoch": 0.35063910367681866, + "grad_norm": 0.6070078611373901, + "learning_rate": 4.846266176043471e-06, + "loss": 0.5951, + "step": 2222 + }, + { + "epoch": 0.35079690705381095, + "grad_norm": 0.5912320017814636, + "learning_rate": 4.846122752063865e-06, + "loss": 0.5728, + "step": 2223 + }, + { + "epoch": 0.35095471043080323, + "grad_norm": 0.5996730923652649, + "learning_rate": 4.845979263336985e-06, + "loss": 0.6087, + "step": 2224 + }, + { + "epoch": 0.3511125138077955, + "grad_norm": 0.6622829437255859, + "learning_rate": 4.845835709866791e-06, + "loss": 0.6164, + "step": 2225 + }, + { + "epoch": 0.35127031718478774, + "grad_norm": 0.5930745005607605, + "learning_rate": 4.8456920916572445e-06, + "loss": 0.6093, + "step": 2226 + }, + { + "epoch": 0.35142812056178, + "grad_norm": 0.6239539980888367, + "learning_rate": 4.84554840871231e-06, + "loss": 0.5879, + "step": 2227 + }, + { + "epoch": 0.3515859239387723, + "grad_norm": 0.6174868941307068, + "learning_rate": 4.845404661035952e-06, + "loss": 0.625, + "step": 2228 + }, + { + "epoch": 0.35174372731576453, + "grad_norm": 0.6113665103912354, + "learning_rate": 4.8452608486321384e-06, + "loss": 0.6011, + "step": 2229 + }, + { + "epoch": 0.3519015306927568, + "grad_norm": 0.6096169948577881, + "learning_rate": 4.845116971504838e-06, + "loss": 0.595, + "step": 2230 + }, + { + "epoch": 0.3520593340697491, + "grad_norm": 0.612723171710968, + "learning_rate": 4.84497302965802e-06, + "loss": 0.5806, + "step": 2231 + }, + { + "epoch": 0.3522171374467414, + "grad_norm": 0.5926188826560974, + "learning_rate": 4.844829023095658e-06, + "loss": 0.5783, + "step": 2232 + }, + { + "epoch": 0.3523749408237336, + "grad_norm": 0.5926647782325745, + "learning_rate": 4.844684951821725e-06, + "loss": 0.5827, + "step": 2233 + }, + { + "epoch": 0.3525327442007259, + "grad_norm": 0.5831171870231628, + "learning_rate": 4.844540815840199e-06, + "loss": 0.6221, + "step": 2234 + }, + { + "epoch": 0.35269054757771817, + "grad_norm": 0.6169406771659851, + "learning_rate": 4.844396615155058e-06, + "loss": 0.6167, + "step": 2235 + }, + { + "epoch": 0.35284835095471045, + "grad_norm": 0.7267430424690247, + "learning_rate": 4.84425234977028e-06, + "loss": 0.5622, + "step": 2236 + }, + { + "epoch": 0.3530061543317027, + "grad_norm": 0.6278664469718933, + "learning_rate": 4.844108019689846e-06, + "loss": 0.6148, + "step": 2237 + }, + { + "epoch": 0.35316395770869496, + "grad_norm": 0.6176533699035645, + "learning_rate": 4.843963624917741e-06, + "loss": 0.6013, + "step": 2238 + }, + { + "epoch": 0.35332176108568725, + "grad_norm": 0.7206166982650757, + "learning_rate": 4.8438191654579484e-06, + "loss": 0.6041, + "step": 2239 + }, + { + "epoch": 0.3534795644626795, + "grad_norm": 0.596451997756958, + "learning_rate": 4.843674641314456e-06, + "loss": 0.6192, + "step": 2240 + }, + { + "epoch": 0.35363736783967176, + "grad_norm": 0.582636296749115, + "learning_rate": 4.843530052491251e-06, + "loss": 0.6137, + "step": 2241 + }, + { + "epoch": 0.35379517121666404, + "grad_norm": 0.6416839361190796, + "learning_rate": 4.843385398992324e-06, + "loss": 0.6211, + "step": 2242 + }, + { + "epoch": 0.3539529745936563, + "grad_norm": 0.6638273596763611, + "learning_rate": 4.843240680821668e-06, + "loss": 0.6393, + "step": 2243 + }, + { + "epoch": 0.35411077797064855, + "grad_norm": 0.5889508128166199, + "learning_rate": 4.8430958979832775e-06, + "loss": 0.6389, + "step": 2244 + }, + { + "epoch": 0.35426858134764083, + "grad_norm": 0.6399648785591125, + "learning_rate": 4.842951050481147e-06, + "loss": 0.6049, + "step": 2245 + }, + { + "epoch": 0.3544263847246331, + "grad_norm": 0.6129251718521118, + "learning_rate": 4.8428061383192735e-06, + "loss": 0.586, + "step": 2246 + }, + { + "epoch": 0.3545841881016254, + "grad_norm": 0.604430079460144, + "learning_rate": 4.842661161501656e-06, + "loss": 0.6093, + "step": 2247 + }, + { + "epoch": 0.3547419914786176, + "grad_norm": 0.6000022292137146, + "learning_rate": 4.842516120032298e-06, + "loss": 0.5781, + "step": 2248 + }, + { + "epoch": 0.3548997948556099, + "grad_norm": 0.580492377281189, + "learning_rate": 4.842371013915199e-06, + "loss": 0.5873, + "step": 2249 + }, + { + "epoch": 0.3550575982326022, + "grad_norm": 0.6162877082824707, + "learning_rate": 4.842225843154366e-06, + "loss": 0.5855, + "step": 2250 + }, + { + "epoch": 0.35521540160959447, + "grad_norm": 0.6141302585601807, + "learning_rate": 4.842080607753804e-06, + "loss": 0.5907, + "step": 2251 + }, + { + "epoch": 0.3553732049865867, + "grad_norm": 0.5794208645820618, + "learning_rate": 4.841935307717522e-06, + "loss": 0.5715, + "step": 2252 + }, + { + "epoch": 0.355531008363579, + "grad_norm": 0.6078240275382996, + "learning_rate": 4.84178994304953e-06, + "loss": 0.632, + "step": 2253 + }, + { + "epoch": 0.35568881174057126, + "grad_norm": 0.6091111302375793, + "learning_rate": 4.8416445137538385e-06, + "loss": 0.6085, + "step": 2254 + }, + { + "epoch": 0.3558466151175635, + "grad_norm": 0.6141535639762878, + "learning_rate": 4.841499019834462e-06, + "loss": 0.5867, + "step": 2255 + }, + { + "epoch": 0.3560044184945558, + "grad_norm": 0.6000842452049255, + "learning_rate": 4.841353461295416e-06, + "loss": 0.6147, + "step": 2256 + }, + { + "epoch": 0.35616222187154806, + "grad_norm": 0.5595958828926086, + "learning_rate": 4.841207838140717e-06, + "loss": 0.5846, + "step": 2257 + }, + { + "epoch": 0.35632002524854034, + "grad_norm": 0.5549435019493103, + "learning_rate": 4.841062150374383e-06, + "loss": 0.6033, + "step": 2258 + }, + { + "epoch": 0.35647782862553257, + "grad_norm": 0.5742657780647278, + "learning_rate": 4.840916398000437e-06, + "loss": 0.5925, + "step": 2259 + }, + { + "epoch": 0.35663563200252485, + "grad_norm": 0.6090025901794434, + "learning_rate": 4.840770581022899e-06, + "loss": 0.5972, + "step": 2260 + }, + { + "epoch": 0.35679343537951713, + "grad_norm": 0.6187394261360168, + "learning_rate": 4.840624699445795e-06, + "loss": 0.5806, + "step": 2261 + }, + { + "epoch": 0.3569512387565094, + "grad_norm": 0.5707067251205444, + "learning_rate": 4.840478753273149e-06, + "loss": 0.5478, + "step": 2262 + }, + { + "epoch": 0.35710904213350164, + "grad_norm": 0.6018473505973816, + "learning_rate": 4.840332742508991e-06, + "loss": 0.6001, + "step": 2263 + }, + { + "epoch": 0.3572668455104939, + "grad_norm": 0.6304538249969482, + "learning_rate": 4.840186667157349e-06, + "loss": 0.6115, + "step": 2264 + }, + { + "epoch": 0.3574246488874862, + "grad_norm": 0.5951569676399231, + "learning_rate": 4.840040527222255e-06, + "loss": 0.6407, + "step": 2265 + }, + { + "epoch": 0.35758245226447843, + "grad_norm": 0.5914053320884705, + "learning_rate": 4.839894322707741e-06, + "loss": 0.5857, + "step": 2266 + }, + { + "epoch": 0.3577402556414707, + "grad_norm": 0.6032325029373169, + "learning_rate": 4.839748053617845e-06, + "loss": 0.5575, + "step": 2267 + }, + { + "epoch": 0.357898059018463, + "grad_norm": 0.6005054712295532, + "learning_rate": 4.839601719956599e-06, + "loss": 0.5943, + "step": 2268 + }, + { + "epoch": 0.3580558623954553, + "grad_norm": 0.6402924060821533, + "learning_rate": 4.839455321728045e-06, + "loss": 0.6244, + "step": 2269 + }, + { + "epoch": 0.3582136657724475, + "grad_norm": 0.6004598140716553, + "learning_rate": 4.839308858936222e-06, + "loss": 0.6425, + "step": 2270 + }, + { + "epoch": 0.3583714691494398, + "grad_norm": 0.6033027768135071, + "learning_rate": 4.839162331585172e-06, + "loss": 0.6162, + "step": 2271 + }, + { + "epoch": 0.3585292725264321, + "grad_norm": 0.5731379985809326, + "learning_rate": 4.8390157396789395e-06, + "loss": 0.5779, + "step": 2272 + }, + { + "epoch": 0.35868707590342436, + "grad_norm": 0.5926746129989624, + "learning_rate": 4.838869083221569e-06, + "loss": 0.6012, + "step": 2273 + }, + { + "epoch": 0.3588448792804166, + "grad_norm": 0.6258995532989502, + "learning_rate": 4.8387223622171085e-06, + "loss": 0.6432, + "step": 2274 + }, + { + "epoch": 0.35900268265740887, + "grad_norm": 0.5564040541648865, + "learning_rate": 4.8385755766696076e-06, + "loss": 0.5983, + "step": 2275 + }, + { + "epoch": 0.35916048603440115, + "grad_norm": 0.6077659726142883, + "learning_rate": 4.8384287265831165e-06, + "loss": 0.6119, + "step": 2276 + }, + { + "epoch": 0.3593182894113934, + "grad_norm": 0.598429799079895, + "learning_rate": 4.8382818119616874e-06, + "loss": 0.6004, + "step": 2277 + }, + { + "epoch": 0.35947609278838566, + "grad_norm": 0.5872741341590881, + "learning_rate": 4.838134832809376e-06, + "loss": 0.6327, + "step": 2278 + }, + { + "epoch": 0.35963389616537794, + "grad_norm": 0.6042211651802063, + "learning_rate": 4.837987789130238e-06, + "loss": 0.6151, + "step": 2279 + }, + { + "epoch": 0.3597916995423702, + "grad_norm": 0.563129186630249, + "learning_rate": 4.837840680928331e-06, + "loss": 0.618, + "step": 2280 + }, + { + "epoch": 0.35994950291936245, + "grad_norm": 0.6169670224189758, + "learning_rate": 4.837693508207716e-06, + "loss": 0.6174, + "step": 2281 + }, + { + "epoch": 0.36010730629635473, + "grad_norm": 0.5991275310516357, + "learning_rate": 4.837546270972453e-06, + "loss": 0.6132, + "step": 2282 + }, + { + "epoch": 0.360265109673347, + "grad_norm": 0.5740308165550232, + "learning_rate": 4.837398969226608e-06, + "loss": 0.6028, + "step": 2283 + }, + { + "epoch": 0.3604229130503393, + "grad_norm": 0.6142768859863281, + "learning_rate": 4.837251602974243e-06, + "loss": 0.6035, + "step": 2284 + }, + { + "epoch": 0.3605807164273315, + "grad_norm": 0.5677372217178345, + "learning_rate": 4.8371041722194266e-06, + "loss": 0.5939, + "step": 2285 + }, + { + "epoch": 0.3607385198043238, + "grad_norm": 0.6013078093528748, + "learning_rate": 4.836956676966228e-06, + "loss": 0.5731, + "step": 2286 + }, + { + "epoch": 0.3608963231813161, + "grad_norm": 0.6038976907730103, + "learning_rate": 4.836809117218716e-06, + "loss": 0.5895, + "step": 2287 + }, + { + "epoch": 0.3610541265583084, + "grad_norm": 0.6013904809951782, + "learning_rate": 4.836661492980964e-06, + "loss": 0.6033, + "step": 2288 + }, + { + "epoch": 0.3612119299353006, + "grad_norm": 0.5825456380844116, + "learning_rate": 4.836513804257047e-06, + "loss": 0.5755, + "step": 2289 + }, + { + "epoch": 0.3613697333122929, + "grad_norm": 0.6723445057868958, + "learning_rate": 4.836366051051039e-06, + "loss": 0.5875, + "step": 2290 + }, + { + "epoch": 0.36152753668928517, + "grad_norm": 0.6232483386993408, + "learning_rate": 4.836218233367019e-06, + "loss": 0.5886, + "step": 2291 + }, + { + "epoch": 0.3616853400662774, + "grad_norm": 0.5633705854415894, + "learning_rate": 4.836070351209066e-06, + "loss": 0.6345, + "step": 2292 + }, + { + "epoch": 0.3618431434432697, + "grad_norm": 0.6312127113342285, + "learning_rate": 4.835922404581261e-06, + "loss": 0.5771, + "step": 2293 + }, + { + "epoch": 0.36200094682026196, + "grad_norm": 0.6017730832099915, + "learning_rate": 4.835774393487687e-06, + "loss": 0.5905, + "step": 2294 + }, + { + "epoch": 0.36215875019725424, + "grad_norm": 0.5816523432731628, + "learning_rate": 4.8356263179324295e-06, + "loss": 0.5846, + "step": 2295 + }, + { + "epoch": 0.36231655357424647, + "grad_norm": 0.607916533946991, + "learning_rate": 4.8354781779195725e-06, + "loss": 0.5901, + "step": 2296 + }, + { + "epoch": 0.36247435695123875, + "grad_norm": 0.5899295806884766, + "learning_rate": 4.835329973453208e-06, + "loss": 0.5966, + "step": 2297 + }, + { + "epoch": 0.36263216032823103, + "grad_norm": 0.6081169843673706, + "learning_rate": 4.8351817045374225e-06, + "loss": 0.6249, + "step": 2298 + }, + { + "epoch": 0.3627899637052233, + "grad_norm": 0.6016937494277954, + "learning_rate": 4.835033371176311e-06, + "loss": 0.5731, + "step": 2299 + }, + { + "epoch": 0.36294776708221554, + "grad_norm": 0.5724815726280212, + "learning_rate": 4.8348849733739656e-06, + "loss": 0.6227, + "step": 2300 + }, + { + "epoch": 0.3631055704592078, + "grad_norm": 0.6111145615577698, + "learning_rate": 4.834736511134481e-06, + "loss": 0.5955, + "step": 2301 + }, + { + "epoch": 0.3632633738362001, + "grad_norm": 0.5933822989463806, + "learning_rate": 4.8345879844619556e-06, + "loss": 0.6357, + "step": 2302 + }, + { + "epoch": 0.36342117721319234, + "grad_norm": 0.6320707201957703, + "learning_rate": 4.834439393360489e-06, + "loss": 0.6109, + "step": 2303 + }, + { + "epoch": 0.3635789805901846, + "grad_norm": 0.5791633129119873, + "learning_rate": 4.8342907378341794e-06, + "loss": 0.5984, + "step": 2304 + }, + { + "epoch": 0.3637367839671769, + "grad_norm": 0.6270887851715088, + "learning_rate": 4.834142017887131e-06, + "loss": 0.6161, + "step": 2305 + }, + { + "epoch": 0.3638945873441692, + "grad_norm": 0.5670351386070251, + "learning_rate": 4.833993233523449e-06, + "loss": 0.576, + "step": 2306 + }, + { + "epoch": 0.3640523907211614, + "grad_norm": 0.6345406770706177, + "learning_rate": 4.8338443847472375e-06, + "loss": 0.6177, + "step": 2307 + }, + { + "epoch": 0.3642101940981537, + "grad_norm": 0.5692490339279175, + "learning_rate": 4.833695471562606e-06, + "loss": 0.6241, + "step": 2308 + }, + { + "epoch": 0.364367997475146, + "grad_norm": 0.6045083403587341, + "learning_rate": 4.833546493973663e-06, + "loss": 0.6261, + "step": 2309 + }, + { + "epoch": 0.36452580085213826, + "grad_norm": 0.5967316031455994, + "learning_rate": 4.83339745198452e-06, + "loss": 0.5778, + "step": 2310 + }, + { + "epoch": 0.3646836042291305, + "grad_norm": 0.5835462212562561, + "learning_rate": 4.833248345599292e-06, + "loss": 0.5733, + "step": 2311 + }, + { + "epoch": 0.36484140760612277, + "grad_norm": 0.5842637419700623, + "learning_rate": 4.833099174822092e-06, + "loss": 0.6041, + "step": 2312 + }, + { + "epoch": 0.36499921098311505, + "grad_norm": 0.5836603045463562, + "learning_rate": 4.8329499396570374e-06, + "loss": 0.6426, + "step": 2313 + }, + { + "epoch": 0.3651570143601073, + "grad_norm": 0.6015911102294922, + "learning_rate": 4.832800640108246e-06, + "loss": 0.5693, + "step": 2314 + }, + { + "epoch": 0.36531481773709956, + "grad_norm": 0.5740453600883484, + "learning_rate": 4.83265127617984e-06, + "loss": 0.5965, + "step": 2315 + }, + { + "epoch": 0.36547262111409184, + "grad_norm": 0.6002984046936035, + "learning_rate": 4.832501847875939e-06, + "loss": 0.6051, + "step": 2316 + }, + { + "epoch": 0.3656304244910841, + "grad_norm": 0.5941546559333801, + "learning_rate": 4.832352355200669e-06, + "loss": 0.6112, + "step": 2317 + }, + { + "epoch": 0.36578822786807635, + "grad_norm": 0.6143139600753784, + "learning_rate": 4.832202798158153e-06, + "loss": 0.5957, + "step": 2318 + }, + { + "epoch": 0.36594603124506864, + "grad_norm": 0.585865318775177, + "learning_rate": 4.832053176752522e-06, + "loss": 0.5965, + "step": 2319 + }, + { + "epoch": 0.3661038346220609, + "grad_norm": 0.5932718515396118, + "learning_rate": 4.831903490987902e-06, + "loss": 0.5766, + "step": 2320 + }, + { + "epoch": 0.3662616379990532, + "grad_norm": 0.6044071316719055, + "learning_rate": 4.831753740868427e-06, + "loss": 0.5703, + "step": 2321 + }, + { + "epoch": 0.36641944137604543, + "grad_norm": 0.5810785889625549, + "learning_rate": 4.831603926398226e-06, + "loss": 0.6076, + "step": 2322 + }, + { + "epoch": 0.3665772447530377, + "grad_norm": 0.6058409214019775, + "learning_rate": 4.831454047581437e-06, + "loss": 0.5861, + "step": 2323 + }, + { + "epoch": 0.36673504813003, + "grad_norm": 0.5937615036964417, + "learning_rate": 4.8313041044221945e-06, + "loss": 0.6197, + "step": 2324 + }, + { + "epoch": 0.3668928515070223, + "grad_norm": 0.5725736618041992, + "learning_rate": 4.831154096924636e-06, + "loss": 0.5894, + "step": 2325 + }, + { + "epoch": 0.3670506548840145, + "grad_norm": 0.6259397268295288, + "learning_rate": 4.831004025092904e-06, + "loss": 0.6147, + "step": 2326 + }, + { + "epoch": 0.3672084582610068, + "grad_norm": 0.630378246307373, + "learning_rate": 4.830853888931137e-06, + "loss": 0.5427, + "step": 2327 + }, + { + "epoch": 0.36736626163799907, + "grad_norm": 0.5976507067680359, + "learning_rate": 4.83070368844348e-06, + "loss": 0.587, + "step": 2328 + }, + { + "epoch": 0.3675240650149913, + "grad_norm": 0.625155508518219, + "learning_rate": 4.830553423634078e-06, + "loss": 0.5972, + "step": 2329 + }, + { + "epoch": 0.3676818683919836, + "grad_norm": 0.5891312956809998, + "learning_rate": 4.830403094507078e-06, + "loss": 0.5841, + "step": 2330 + }, + { + "epoch": 0.36783967176897586, + "grad_norm": 0.6129378080368042, + "learning_rate": 4.830252701066628e-06, + "loss": 0.5855, + "step": 2331 + }, + { + "epoch": 0.36799747514596814, + "grad_norm": 0.5621052384376526, + "learning_rate": 4.83010224331688e-06, + "loss": 0.565, + "step": 2332 + }, + { + "epoch": 0.36815527852296037, + "grad_norm": 0.6037278771400452, + "learning_rate": 4.829951721261984e-06, + "loss": 0.6025, + "step": 2333 + }, + { + "epoch": 0.36831308189995265, + "grad_norm": 0.5985351204872131, + "learning_rate": 4.8298011349060965e-06, + "loss": 0.6324, + "step": 2334 + }, + { + "epoch": 0.36847088527694494, + "grad_norm": 0.5831551551818848, + "learning_rate": 4.829650484253372e-06, + "loss": 0.5975, + "step": 2335 + }, + { + "epoch": 0.3686286886539372, + "grad_norm": 0.5842099785804749, + "learning_rate": 4.829499769307968e-06, + "loss": 0.5877, + "step": 2336 + }, + { + "epoch": 0.36878649203092945, + "grad_norm": 0.5748575329780579, + "learning_rate": 4.829348990074044e-06, + "loss": 0.6026, + "step": 2337 + }, + { + "epoch": 0.36894429540792173, + "grad_norm": 0.5898144841194153, + "learning_rate": 4.829198146555761e-06, + "loss": 0.5895, + "step": 2338 + }, + { + "epoch": 0.369102098784914, + "grad_norm": 0.5996237993240356, + "learning_rate": 4.829047238757283e-06, + "loss": 0.6289, + "step": 2339 + }, + { + "epoch": 0.36925990216190624, + "grad_norm": 0.58537757396698, + "learning_rate": 4.828896266682774e-06, + "loss": 0.585, + "step": 2340 + }, + { + "epoch": 0.3694177055388985, + "grad_norm": 0.6263660788536072, + "learning_rate": 4.8287452303364e-06, + "loss": 0.5904, + "step": 2341 + }, + { + "epoch": 0.3695755089158908, + "grad_norm": 0.6106609106063843, + "learning_rate": 4.828594129722329e-06, + "loss": 0.5811, + "step": 2342 + }, + { + "epoch": 0.3697333122928831, + "grad_norm": 0.5685436725616455, + "learning_rate": 4.828442964844733e-06, + "loss": 0.6188, + "step": 2343 + }, + { + "epoch": 0.3698911156698753, + "grad_norm": 0.5815638899803162, + "learning_rate": 4.828291735707781e-06, + "loss": 0.6056, + "step": 2344 + }, + { + "epoch": 0.3700489190468676, + "grad_norm": 0.6224932670593262, + "learning_rate": 4.828140442315649e-06, + "loss": 0.59, + "step": 2345 + }, + { + "epoch": 0.3702067224238599, + "grad_norm": 0.5899309515953064, + "learning_rate": 4.82798908467251e-06, + "loss": 0.622, + "step": 2346 + }, + { + "epoch": 0.37036452580085216, + "grad_norm": 0.5968064665794373, + "learning_rate": 4.8278376627825435e-06, + "loss": 0.5688, + "step": 2347 + }, + { + "epoch": 0.3705223291778444, + "grad_norm": 0.6170229315757751, + "learning_rate": 4.827686176649927e-06, + "loss": 0.6503, + "step": 2348 + }, + { + "epoch": 0.37068013255483667, + "grad_norm": 0.6161218881607056, + "learning_rate": 4.827534626278841e-06, + "loss": 0.6081, + "step": 2349 + }, + { + "epoch": 0.37083793593182895, + "grad_norm": 0.5938693284988403, + "learning_rate": 4.827383011673469e-06, + "loss": 0.6126, + "step": 2350 + }, + { + "epoch": 0.3709957393088212, + "grad_norm": 0.5981703996658325, + "learning_rate": 4.827231332837994e-06, + "loss": 0.5679, + "step": 2351 + }, + { + "epoch": 0.37115354268581346, + "grad_norm": 0.6012349128723145, + "learning_rate": 4.8270795897766025e-06, + "loss": 0.5871, + "step": 2352 + }, + { + "epoch": 0.37131134606280575, + "grad_norm": 0.601826548576355, + "learning_rate": 4.826927782493481e-06, + "loss": 0.6466, + "step": 2353 + }, + { + "epoch": 0.37146914943979803, + "grad_norm": 0.5871109962463379, + "learning_rate": 4.826775910992823e-06, + "loss": 0.571, + "step": 2354 + }, + { + "epoch": 0.37162695281679026, + "grad_norm": 0.6839401125907898, + "learning_rate": 4.826623975278814e-06, + "loss": 0.5891, + "step": 2355 + }, + { + "epoch": 0.37178475619378254, + "grad_norm": 0.5959831476211548, + "learning_rate": 4.826471975355652e-06, + "loss": 0.5789, + "step": 2356 + }, + { + "epoch": 0.3719425595707748, + "grad_norm": 0.6303712725639343, + "learning_rate": 4.82631991122753e-06, + "loss": 0.6073, + "step": 2357 + }, + { + "epoch": 0.3721003629477671, + "grad_norm": 0.5654816627502441, + "learning_rate": 4.826167782898643e-06, + "loss": 0.6052, + "step": 2358 + }, + { + "epoch": 0.37225816632475933, + "grad_norm": 0.598612368106842, + "learning_rate": 4.826015590373192e-06, + "loss": 0.6234, + "step": 2359 + }, + { + "epoch": 0.3724159697017516, + "grad_norm": 0.5984699130058289, + "learning_rate": 4.825863333655375e-06, + "loss": 0.6023, + "step": 2360 + }, + { + "epoch": 0.3725737730787439, + "grad_norm": 0.6051937341690063, + "learning_rate": 4.825711012749396e-06, + "loss": 0.6029, + "step": 2361 + }, + { + "epoch": 0.3727315764557362, + "grad_norm": 0.599648654460907, + "learning_rate": 4.825558627659457e-06, + "loss": 0.6184, + "step": 2362 + }, + { + "epoch": 0.3728893798327284, + "grad_norm": 0.601737380027771, + "learning_rate": 4.8254061783897645e-06, + "loss": 0.5883, + "step": 2363 + }, + { + "epoch": 0.3730471832097207, + "grad_norm": 0.5855202078819275, + "learning_rate": 4.8252536649445246e-06, + "loss": 0.6094, + "step": 2364 + }, + { + "epoch": 0.373204986586713, + "grad_norm": 0.5996052026748657, + "learning_rate": 4.825101087327948e-06, + "loss": 0.6369, + "step": 2365 + }, + { + "epoch": 0.3733627899637052, + "grad_norm": 0.5814879536628723, + "learning_rate": 4.824948445544244e-06, + "loss": 0.6042, + "step": 2366 + }, + { + "epoch": 0.3735205933406975, + "grad_norm": 0.5991435050964355, + "learning_rate": 4.824795739597626e-06, + "loss": 0.5889, + "step": 2367 + }, + { + "epoch": 0.37367839671768976, + "grad_norm": 0.5952427387237549, + "learning_rate": 4.824642969492307e-06, + "loss": 0.6117, + "step": 2368 + }, + { + "epoch": 0.37383620009468205, + "grad_norm": 0.6297371983528137, + "learning_rate": 4.824490135232504e-06, + "loss": 0.6263, + "step": 2369 + }, + { + "epoch": 0.3739940034716743, + "grad_norm": 0.5962414145469666, + "learning_rate": 4.824337236822435e-06, + "loss": 0.593, + "step": 2370 + }, + { + "epoch": 0.37415180684866656, + "grad_norm": 0.6006728410720825, + "learning_rate": 4.82418427426632e-06, + "loss": 0.5914, + "step": 2371 + }, + { + "epoch": 0.37430961022565884, + "grad_norm": 0.602773129940033, + "learning_rate": 4.824031247568379e-06, + "loss": 0.6145, + "step": 2372 + }, + { + "epoch": 0.3744674136026511, + "grad_norm": 0.5957955121994019, + "learning_rate": 4.823878156732837e-06, + "loss": 0.6408, + "step": 2373 + }, + { + "epoch": 0.37462521697964335, + "grad_norm": 0.5875023603439331, + "learning_rate": 4.8237250017639174e-06, + "loss": 0.6208, + "step": 2374 + }, + { + "epoch": 0.37478302035663563, + "grad_norm": 0.6012594699859619, + "learning_rate": 4.823571782665848e-06, + "loss": 0.595, + "step": 2375 + }, + { + "epoch": 0.3749408237336279, + "grad_norm": 0.5848643779754639, + "learning_rate": 4.823418499442856e-06, + "loss": 0.6107, + "step": 2376 + }, + { + "epoch": 0.37509862711062014, + "grad_norm": 0.6326854228973389, + "learning_rate": 4.823265152099172e-06, + "loss": 0.5905, + "step": 2377 + }, + { + "epoch": 0.3752564304876124, + "grad_norm": 0.5898669362068176, + "learning_rate": 4.82311174063903e-06, + "loss": 0.5669, + "step": 2378 + }, + { + "epoch": 0.3754142338646047, + "grad_norm": 0.5926377773284912, + "learning_rate": 4.8229582650666614e-06, + "loss": 0.6067, + "step": 2379 + }, + { + "epoch": 0.375572037241597, + "grad_norm": 0.5925242900848389, + "learning_rate": 4.822804725386302e-06, + "loss": 0.6117, + "step": 2380 + }, + { + "epoch": 0.3757298406185892, + "grad_norm": 0.5364958047866821, + "learning_rate": 4.82265112160219e-06, + "loss": 0.6147, + "step": 2381 + }, + { + "epoch": 0.3758876439955815, + "grad_norm": 0.5765461325645447, + "learning_rate": 4.822497453718564e-06, + "loss": 0.6345, + "step": 2382 + }, + { + "epoch": 0.3760454473725738, + "grad_norm": 0.5579507946968079, + "learning_rate": 4.822343721739666e-06, + "loss": 0.5909, + "step": 2383 + }, + { + "epoch": 0.37620325074956606, + "grad_norm": 0.5855575203895569, + "learning_rate": 4.822189925669737e-06, + "loss": 0.6077, + "step": 2384 + }, + { + "epoch": 0.3763610541265583, + "grad_norm": 0.5735572576522827, + "learning_rate": 4.822036065513021e-06, + "loss": 0.5844, + "step": 2385 + }, + { + "epoch": 0.3765188575035506, + "grad_norm": 0.5907706618309021, + "learning_rate": 4.821882141273767e-06, + "loss": 0.5947, + "step": 2386 + }, + { + "epoch": 0.37667666088054286, + "grad_norm": 0.6000971794128418, + "learning_rate": 4.82172815295622e-06, + "loss": 0.5622, + "step": 2387 + }, + { + "epoch": 0.3768344642575351, + "grad_norm": 0.6029958724975586, + "learning_rate": 4.821574100564631e-06, + "loss": 0.6134, + "step": 2388 + }, + { + "epoch": 0.37699226763452737, + "grad_norm": 0.6340547204017639, + "learning_rate": 4.8214199841032505e-06, + "loss": 0.5996, + "step": 2389 + }, + { + "epoch": 0.37715007101151965, + "grad_norm": 0.5995404124259949, + "learning_rate": 4.8212658035763335e-06, + "loss": 0.6067, + "step": 2390 + }, + { + "epoch": 0.37730787438851193, + "grad_norm": 0.6176741719245911, + "learning_rate": 4.821111558988134e-06, + "loss": 0.617, + "step": 2391 + }, + { + "epoch": 0.37746567776550416, + "grad_norm": 0.6307854652404785, + "learning_rate": 4.820957250342909e-06, + "loss": 0.5942, + "step": 2392 + }, + { + "epoch": 0.37762348114249644, + "grad_norm": 0.592699408531189, + "learning_rate": 4.820802877644916e-06, + "loss": 0.6254, + "step": 2393 + }, + { + "epoch": 0.3777812845194887, + "grad_norm": 0.6207907795906067, + "learning_rate": 4.820648440898417e-06, + "loss": 0.5715, + "step": 2394 + }, + { + "epoch": 0.377939087896481, + "grad_norm": 0.5590139627456665, + "learning_rate": 4.8204939401076734e-06, + "loss": 0.6083, + "step": 2395 + }, + { + "epoch": 0.37809689127347323, + "grad_norm": 0.5694829821586609, + "learning_rate": 4.820339375276948e-06, + "loss": 0.5672, + "step": 2396 + }, + { + "epoch": 0.3782546946504655, + "grad_norm": 0.563846230506897, + "learning_rate": 4.820184746410508e-06, + "loss": 0.6172, + "step": 2397 + }, + { + "epoch": 0.3784124980274578, + "grad_norm": 0.5860932469367981, + "learning_rate": 4.82003005351262e-06, + "loss": 0.5602, + "step": 2398 + }, + { + "epoch": 0.3785703014044501, + "grad_norm": 0.6221300363540649, + "learning_rate": 4.819875296587552e-06, + "loss": 0.5865, + "step": 2399 + }, + { + "epoch": 0.3787281047814423, + "grad_norm": 0.5853946805000305, + "learning_rate": 4.819720475639579e-06, + "loss": 0.6041, + "step": 2400 + }, + { + "epoch": 0.3788859081584346, + "grad_norm": 0.6287828683853149, + "learning_rate": 4.819565590672969e-06, + "loss": 0.6103, + "step": 2401 + }, + { + "epoch": 0.3790437115354269, + "grad_norm": 0.5998898148536682, + "learning_rate": 4.819410641691999e-06, + "loss": 0.6217, + "step": 2402 + }, + { + "epoch": 0.3792015149124191, + "grad_norm": 0.633412778377533, + "learning_rate": 4.819255628700943e-06, + "loss": 0.599, + "step": 2403 + }, + { + "epoch": 0.3793593182894114, + "grad_norm": 0.5870721340179443, + "learning_rate": 4.819100551704081e-06, + "loss": 0.5954, + "step": 2404 + }, + { + "epoch": 0.37951712166640367, + "grad_norm": 0.6236311197280884, + "learning_rate": 4.8189454107056936e-06, + "loss": 0.648, + "step": 2405 + }, + { + "epoch": 0.37967492504339595, + "grad_norm": 0.5760555863380432, + "learning_rate": 4.818790205710059e-06, + "loss": 0.5878, + "step": 2406 + }, + { + "epoch": 0.3798327284203882, + "grad_norm": 0.5840359330177307, + "learning_rate": 4.818634936721463e-06, + "loss": 0.6072, + "step": 2407 + }, + { + "epoch": 0.37999053179738046, + "grad_norm": 0.5912415385246277, + "learning_rate": 4.818479603744191e-06, + "loss": 0.5978, + "step": 2408 + }, + { + "epoch": 0.38014833517437274, + "grad_norm": 0.585011899471283, + "learning_rate": 4.818324206782529e-06, + "loss": 0.6106, + "step": 2409 + }, + { + "epoch": 0.380306138551365, + "grad_norm": 0.6064879298210144, + "learning_rate": 4.8181687458407635e-06, + "loss": 0.6073, + "step": 2410 + }, + { + "epoch": 0.38046394192835725, + "grad_norm": 0.5954642295837402, + "learning_rate": 4.818013220923189e-06, + "loss": 0.5965, + "step": 2411 + }, + { + "epoch": 0.38062174530534953, + "grad_norm": 0.6293755769729614, + "learning_rate": 4.8178576320340945e-06, + "loss": 0.5395, + "step": 2412 + }, + { + "epoch": 0.3807795486823418, + "grad_norm": 0.636538565158844, + "learning_rate": 4.8177019791777746e-06, + "loss": 0.599, + "step": 2413 + }, + { + "epoch": 0.38093735205933404, + "grad_norm": 0.6008752584457397, + "learning_rate": 4.817546262358525e-06, + "loss": 0.6053, + "step": 2414 + }, + { + "epoch": 0.3810951554363263, + "grad_norm": 0.6151610612869263, + "learning_rate": 4.817390481580643e-06, + "loss": 0.5512, + "step": 2415 + }, + { + "epoch": 0.3812529588133186, + "grad_norm": 0.5894986391067505, + "learning_rate": 4.817234636848429e-06, + "loss": 0.5796, + "step": 2416 + }, + { + "epoch": 0.3814107621903109, + "grad_norm": 0.5894566774368286, + "learning_rate": 4.817078728166183e-06, + "loss": 0.5904, + "step": 2417 + }, + { + "epoch": 0.3815685655673031, + "grad_norm": 0.5925518870353699, + "learning_rate": 4.816922755538206e-06, + "loss": 0.6433, + "step": 2418 + }, + { + "epoch": 0.3817263689442954, + "grad_norm": 0.5816114544868469, + "learning_rate": 4.816766718968805e-06, + "loss": 0.566, + "step": 2419 + }, + { + "epoch": 0.3818841723212877, + "grad_norm": 0.5925312638282776, + "learning_rate": 4.816610618462286e-06, + "loss": 0.5704, + "step": 2420 + }, + { + "epoch": 0.38204197569827997, + "grad_norm": 0.6054251790046692, + "learning_rate": 4.816454454022955e-06, + "loss": 0.6184, + "step": 2421 + }, + { + "epoch": 0.3821997790752722, + "grad_norm": 0.5996747016906738, + "learning_rate": 4.816298225655124e-06, + "loss": 0.6151, + "step": 2422 + }, + { + "epoch": 0.3823575824522645, + "grad_norm": 0.5733508467674255, + "learning_rate": 4.8161419333631034e-06, + "loss": 0.5926, + "step": 2423 + }, + { + "epoch": 0.38251538582925676, + "grad_norm": 0.5808524489402771, + "learning_rate": 4.815985577151206e-06, + "loss": 0.5841, + "step": 2424 + }, + { + "epoch": 0.382673189206249, + "grad_norm": 0.6145884394645691, + "learning_rate": 4.8158291570237494e-06, + "loss": 0.6158, + "step": 2425 + }, + { + "epoch": 0.38283099258324127, + "grad_norm": 0.5921456217765808, + "learning_rate": 4.815672672985047e-06, + "loss": 0.5668, + "step": 2426 + }, + { + "epoch": 0.38298879596023355, + "grad_norm": 0.6288674473762512, + "learning_rate": 4.815516125039419e-06, + "loss": 0.5938, + "step": 2427 + }, + { + "epoch": 0.38314659933722583, + "grad_norm": 0.5845141410827637, + "learning_rate": 4.815359513191187e-06, + "loss": 0.5975, + "step": 2428 + }, + { + "epoch": 0.38330440271421806, + "grad_norm": 0.58511883020401, + "learning_rate": 4.81520283744467e-06, + "loss": 0.5986, + "step": 2429 + }, + { + "epoch": 0.38346220609121034, + "grad_norm": 0.5779051780700684, + "learning_rate": 4.8150460978041946e-06, + "loss": 0.5618, + "step": 2430 + }, + { + "epoch": 0.3836200094682026, + "grad_norm": 0.5510224103927612, + "learning_rate": 4.814889294274085e-06, + "loss": 0.5707, + "step": 2431 + }, + { + "epoch": 0.3837778128451949, + "grad_norm": 0.5641284584999084, + "learning_rate": 4.814732426858669e-06, + "loss": 0.5691, + "step": 2432 + }, + { + "epoch": 0.38393561622218714, + "grad_norm": 0.616722047328949, + "learning_rate": 4.814575495562277e-06, + "loss": 0.5865, + "step": 2433 + }, + { + "epoch": 0.3840934195991794, + "grad_norm": 0.6525416374206543, + "learning_rate": 4.814418500389238e-06, + "loss": 0.5679, + "step": 2434 + }, + { + "epoch": 0.3842512229761717, + "grad_norm": 0.5836542248725891, + "learning_rate": 4.814261441343885e-06, + "loss": 0.5681, + "step": 2435 + }, + { + "epoch": 0.384409026353164, + "grad_norm": 0.5938203930854797, + "learning_rate": 4.814104318430554e-06, + "loss": 0.5808, + "step": 2436 + }, + { + "epoch": 0.3845668297301562, + "grad_norm": 0.5869240164756775, + "learning_rate": 4.813947131653579e-06, + "loss": 0.5985, + "step": 2437 + }, + { + "epoch": 0.3847246331071485, + "grad_norm": 0.5868367552757263, + "learning_rate": 4.8137898810172985e-06, + "loss": 0.5936, + "step": 2438 + }, + { + "epoch": 0.3848824364841408, + "grad_norm": 0.5775062441825867, + "learning_rate": 4.813632566526054e-06, + "loss": 0.6017, + "step": 2439 + }, + { + "epoch": 0.385040239861133, + "grad_norm": 0.5740727782249451, + "learning_rate": 4.8134751881841856e-06, + "loss": 0.584, + "step": 2440 + }, + { + "epoch": 0.3851980432381253, + "grad_norm": 0.5721902847290039, + "learning_rate": 4.813317745996037e-06, + "loss": 0.5829, + "step": 2441 + }, + { + "epoch": 0.38535584661511757, + "grad_norm": 0.6385089159011841, + "learning_rate": 4.813160239965952e-06, + "loss": 0.6303, + "step": 2442 + }, + { + "epoch": 0.38551364999210985, + "grad_norm": 0.5972583889961243, + "learning_rate": 4.813002670098279e-06, + "loss": 0.5983, + "step": 2443 + }, + { + "epoch": 0.3856714533691021, + "grad_norm": 0.5977823734283447, + "learning_rate": 4.812845036397366e-06, + "loss": 0.6245, + "step": 2444 + }, + { + "epoch": 0.38582925674609436, + "grad_norm": 0.6214934587478638, + "learning_rate": 4.8126873388675635e-06, + "loss": 0.608, + "step": 2445 + }, + { + "epoch": 0.38598706012308664, + "grad_norm": 0.6200762987136841, + "learning_rate": 4.8125295775132225e-06, + "loss": 0.6302, + "step": 2446 + }, + { + "epoch": 0.3861448635000789, + "grad_norm": 0.6042806506156921, + "learning_rate": 4.812371752338698e-06, + "loss": 0.5735, + "step": 2447 + }, + { + "epoch": 0.38630266687707115, + "grad_norm": 0.6260852813720703, + "learning_rate": 4.812213863348345e-06, + "loss": 0.6187, + "step": 2448 + }, + { + "epoch": 0.38646047025406344, + "grad_norm": 0.6377527713775635, + "learning_rate": 4.812055910546521e-06, + "loss": 0.6039, + "step": 2449 + }, + { + "epoch": 0.3866182736310557, + "grad_norm": 0.6390556693077087, + "learning_rate": 4.811897893937586e-06, + "loss": 0.5833, + "step": 2450 + }, + { + "epoch": 0.38677607700804795, + "grad_norm": 0.5895956754684448, + "learning_rate": 4.811739813525899e-06, + "loss": 0.6102, + "step": 2451 + }, + { + "epoch": 0.38693388038504023, + "grad_norm": 0.5974439978599548, + "learning_rate": 4.811581669315825e-06, + "loss": 0.5887, + "step": 2452 + }, + { + "epoch": 0.3870916837620325, + "grad_norm": 0.5845072865486145, + "learning_rate": 4.811423461311725e-06, + "loss": 0.6236, + "step": 2453 + }, + { + "epoch": 0.3872494871390248, + "grad_norm": 0.6114129424095154, + "learning_rate": 4.811265189517968e-06, + "loss": 0.6369, + "step": 2454 + }, + { + "epoch": 0.387407290516017, + "grad_norm": 0.6216821670532227, + "learning_rate": 4.811106853938922e-06, + "loss": 0.6209, + "step": 2455 + }, + { + "epoch": 0.3875650938930093, + "grad_norm": 0.6260226964950562, + "learning_rate": 4.810948454578955e-06, + "loss": 0.6231, + "step": 2456 + }, + { + "epoch": 0.3877228972700016, + "grad_norm": 0.6018202304840088, + "learning_rate": 4.8107899914424404e-06, + "loss": 0.6027, + "step": 2457 + }, + { + "epoch": 0.38788070064699387, + "grad_norm": 0.562353789806366, + "learning_rate": 4.810631464533749e-06, + "loss": 0.5891, + "step": 2458 + }, + { + "epoch": 0.3880385040239861, + "grad_norm": 0.5701082944869995, + "learning_rate": 4.810472873857258e-06, + "loss": 0.5531, + "step": 2459 + }, + { + "epoch": 0.3881963074009784, + "grad_norm": 0.6098718643188477, + "learning_rate": 4.810314219417343e-06, + "loss": 0.5555, + "step": 2460 + }, + { + "epoch": 0.38835411077797066, + "grad_norm": 0.5965044498443604, + "learning_rate": 4.810155501218381e-06, + "loss": 0.5732, + "step": 2461 + }, + { + "epoch": 0.3885119141549629, + "grad_norm": 0.5907555818557739, + "learning_rate": 4.809996719264755e-06, + "loss": 0.571, + "step": 2462 + }, + { + "epoch": 0.3886697175319552, + "grad_norm": 0.6111880540847778, + "learning_rate": 4.809837873560846e-06, + "loss": 0.5949, + "step": 2463 + }, + { + "epoch": 0.38882752090894745, + "grad_norm": 0.60197514295578, + "learning_rate": 4.809678964111038e-06, + "loss": 0.5848, + "step": 2464 + }, + { + "epoch": 0.38898532428593974, + "grad_norm": 0.6233932971954346, + "learning_rate": 4.8095199909197145e-06, + "loss": 0.6085, + "step": 2465 + }, + { + "epoch": 0.38914312766293196, + "grad_norm": 0.5815563201904297, + "learning_rate": 4.809360953991266e-06, + "loss": 0.591, + "step": 2466 + }, + { + "epoch": 0.38930093103992425, + "grad_norm": 0.5835162401199341, + "learning_rate": 4.809201853330079e-06, + "loss": 0.6151, + "step": 2467 + }, + { + "epoch": 0.38945873441691653, + "grad_norm": 0.5874367952346802, + "learning_rate": 4.809042688940545e-06, + "loss": 0.6035, + "step": 2468 + }, + { + "epoch": 0.3896165377939088, + "grad_norm": 0.5784921646118164, + "learning_rate": 4.808883460827058e-06, + "loss": 0.6167, + "step": 2469 + }, + { + "epoch": 0.38977434117090104, + "grad_norm": 0.601662814617157, + "learning_rate": 4.808724168994009e-06, + "loss": 0.5912, + "step": 2470 + }, + { + "epoch": 0.3899321445478933, + "grad_norm": 0.5919822454452515, + "learning_rate": 4.808564813445797e-06, + "loss": 0.5973, + "step": 2471 + }, + { + "epoch": 0.3900899479248856, + "grad_norm": 0.5911552906036377, + "learning_rate": 4.80840539418682e-06, + "loss": 0.5946, + "step": 2472 + }, + { + "epoch": 0.3902477513018779, + "grad_norm": 0.6083328127861023, + "learning_rate": 4.808245911221476e-06, + "loss": 0.5946, + "step": 2473 + }, + { + "epoch": 0.3904055546788701, + "grad_norm": 0.5847287774085999, + "learning_rate": 4.808086364554166e-06, + "loss": 0.6203, + "step": 2474 + }, + { + "epoch": 0.3905633580558624, + "grad_norm": 0.606340229511261, + "learning_rate": 4.8079267541892945e-06, + "loss": 0.5913, + "step": 2475 + }, + { + "epoch": 0.3907211614328547, + "grad_norm": 0.5904209017753601, + "learning_rate": 4.8077670801312655e-06, + "loss": 0.5824, + "step": 2476 + }, + { + "epoch": 0.3908789648098469, + "grad_norm": 0.5732273459434509, + "learning_rate": 4.807607342384487e-06, + "loss": 0.6097, + "step": 2477 + }, + { + "epoch": 0.3910367681868392, + "grad_norm": 0.5972214341163635, + "learning_rate": 4.807447540953366e-06, + "loss": 0.5813, + "step": 2478 + }, + { + "epoch": 0.3911945715638315, + "grad_norm": 0.6098765730857849, + "learning_rate": 4.807287675842312e-06, + "loss": 0.6157, + "step": 2479 + }, + { + "epoch": 0.39135237494082376, + "grad_norm": 0.6171345710754395, + "learning_rate": 4.807127747055739e-06, + "loss": 0.6016, + "step": 2480 + }, + { + "epoch": 0.391510178317816, + "grad_norm": 0.6493605971336365, + "learning_rate": 4.806967754598059e-06, + "loss": 0.613, + "step": 2481 + }, + { + "epoch": 0.39166798169480826, + "grad_norm": 0.5891979336738586, + "learning_rate": 4.806807698473687e-06, + "loss": 0.5904, + "step": 2482 + }, + { + "epoch": 0.39182578507180055, + "grad_norm": 0.575095534324646, + "learning_rate": 4.806647578687042e-06, + "loss": 0.604, + "step": 2483 + }, + { + "epoch": 0.39198358844879283, + "grad_norm": 0.6207751631736755, + "learning_rate": 4.806487395242542e-06, + "loss": 0.6082, + "step": 2484 + }, + { + "epoch": 0.39214139182578506, + "grad_norm": 0.5944333076477051, + "learning_rate": 4.806327148144607e-06, + "loss": 0.6164, + "step": 2485 + }, + { + "epoch": 0.39229919520277734, + "grad_norm": 0.6253919005393982, + "learning_rate": 4.8061668373976605e-06, + "loss": 0.5661, + "step": 2486 + }, + { + "epoch": 0.3924569985797696, + "grad_norm": 0.6294531226158142, + "learning_rate": 4.806006463006126e-06, + "loss": 0.5932, + "step": 2487 + }, + { + "epoch": 0.39261480195676185, + "grad_norm": 0.5557107329368591, + "learning_rate": 4.80584602497443e-06, + "loss": 0.6023, + "step": 2488 + }, + { + "epoch": 0.39277260533375413, + "grad_norm": 0.5673990845680237, + "learning_rate": 4.805685523307e-06, + "loss": 0.6159, + "step": 2489 + }, + { + "epoch": 0.3929304087107464, + "grad_norm": 0.6033639907836914, + "learning_rate": 4.805524958008265e-06, + "loss": 0.5928, + "step": 2490 + }, + { + "epoch": 0.3930882120877387, + "grad_norm": 0.5991799831390381, + "learning_rate": 4.805364329082656e-06, + "loss": 0.5892, + "step": 2491 + }, + { + "epoch": 0.3932460154647309, + "grad_norm": 0.6065400242805481, + "learning_rate": 4.805203636534608e-06, + "loss": 0.5888, + "step": 2492 + }, + { + "epoch": 0.3934038188417232, + "grad_norm": 0.579412579536438, + "learning_rate": 4.805042880368552e-06, + "loss": 0.5768, + "step": 2493 + }, + { + "epoch": 0.3935616222187155, + "grad_norm": 0.592714786529541, + "learning_rate": 4.804882060588929e-06, + "loss": 0.5955, + "step": 2494 + }, + { + "epoch": 0.3937194255957078, + "grad_norm": 0.6019005179405212, + "learning_rate": 4.804721177200173e-06, + "loss": 0.6024, + "step": 2495 + }, + { + "epoch": 0.3938772289727, + "grad_norm": 0.5828425884246826, + "learning_rate": 4.804560230206727e-06, + "loss": 0.599, + "step": 2496 + }, + { + "epoch": 0.3940350323496923, + "grad_norm": 0.5741686820983887, + "learning_rate": 4.8043992196130316e-06, + "loss": 0.5837, + "step": 2497 + }, + { + "epoch": 0.39419283572668457, + "grad_norm": 0.5999045372009277, + "learning_rate": 4.804238145423531e-06, + "loss": 0.5789, + "step": 2498 + }, + { + "epoch": 0.3943506391036768, + "grad_norm": 0.5889496207237244, + "learning_rate": 4.804077007642669e-06, + "loss": 0.5748, + "step": 2499 + }, + { + "epoch": 0.3945084424806691, + "grad_norm": 0.5900644063949585, + "learning_rate": 4.803915806274893e-06, + "loss": 0.5883, + "step": 2500 + }, + { + "epoch": 0.39466624585766136, + "grad_norm": 0.5998064279556274, + "learning_rate": 4.803754541324653e-06, + "loss": 0.6072, + "step": 2501 + }, + { + "epoch": 0.39482404923465364, + "grad_norm": 0.6262025237083435, + "learning_rate": 4.8035932127963994e-06, + "loss": 0.5844, + "step": 2502 + }, + { + "epoch": 0.39498185261164587, + "grad_norm": 0.5822706818580627, + "learning_rate": 4.8034318206945825e-06, + "loss": 0.5886, + "step": 2503 + }, + { + "epoch": 0.39513965598863815, + "grad_norm": 0.5704922080039978, + "learning_rate": 4.803270365023659e-06, + "loss": 0.5672, + "step": 2504 + }, + { + "epoch": 0.39529745936563043, + "grad_norm": 0.6196812391281128, + "learning_rate": 4.803108845788082e-06, + "loss": 0.6194, + "step": 2505 + }, + { + "epoch": 0.3954552627426227, + "grad_norm": 0.6031139492988586, + "learning_rate": 4.802947262992311e-06, + "loss": 0.5746, + "step": 2506 + }, + { + "epoch": 0.39561306611961494, + "grad_norm": 0.5849658250808716, + "learning_rate": 4.802785616640804e-06, + "loss": 0.5686, + "step": 2507 + }, + { + "epoch": 0.3957708694966072, + "grad_norm": 0.6022820472717285, + "learning_rate": 4.802623906738023e-06, + "loss": 0.5783, + "step": 2508 + }, + { + "epoch": 0.3959286728735995, + "grad_norm": 0.6116674542427063, + "learning_rate": 4.802462133288431e-06, + "loss": 0.6366, + "step": 2509 + }, + { + "epoch": 0.3960864762505918, + "grad_norm": 0.5994503498077393, + "learning_rate": 4.802300296296491e-06, + "loss": 0.5687, + "step": 2510 + }, + { + "epoch": 0.396244279627584, + "grad_norm": 0.5740094780921936, + "learning_rate": 4.802138395766671e-06, + "loss": 0.5751, + "step": 2511 + }, + { + "epoch": 0.3964020830045763, + "grad_norm": 0.5929451584815979, + "learning_rate": 4.801976431703439e-06, + "loss": 0.5478, + "step": 2512 + }, + { + "epoch": 0.3965598863815686, + "grad_norm": 0.5582947731018066, + "learning_rate": 4.801814404111263e-06, + "loss": 0.6258, + "step": 2513 + }, + { + "epoch": 0.3967176897585608, + "grad_norm": 0.590717077255249, + "learning_rate": 4.801652312994617e-06, + "loss": 0.6117, + "step": 2514 + }, + { + "epoch": 0.3968754931355531, + "grad_norm": 0.6224145293235779, + "learning_rate": 4.801490158357973e-06, + "loss": 0.5817, + "step": 2515 + }, + { + "epoch": 0.3970332965125454, + "grad_norm": 0.5641626119613647, + "learning_rate": 4.8013279402058055e-06, + "loss": 0.5861, + "step": 2516 + }, + { + "epoch": 0.39719109988953766, + "grad_norm": 0.5951860547065735, + "learning_rate": 4.801165658542593e-06, + "loss": 0.6005, + "step": 2517 + }, + { + "epoch": 0.3973489032665299, + "grad_norm": 0.5955455899238586, + "learning_rate": 4.801003313372812e-06, + "loss": 0.578, + "step": 2518 + }, + { + "epoch": 0.39750670664352217, + "grad_norm": 0.6048750877380371, + "learning_rate": 4.800840904700944e-06, + "loss": 0.6119, + "step": 2519 + }, + { + "epoch": 0.39766451002051445, + "grad_norm": 0.5778781175613403, + "learning_rate": 4.800678432531472e-06, + "loss": 0.6172, + "step": 2520 + }, + { + "epoch": 0.39782231339750673, + "grad_norm": 0.5828583836555481, + "learning_rate": 4.800515896868877e-06, + "loss": 0.606, + "step": 2521 + }, + { + "epoch": 0.39798011677449896, + "grad_norm": 0.6169100999832153, + "learning_rate": 4.800353297717648e-06, + "loss": 0.5854, + "step": 2522 + }, + { + "epoch": 0.39813792015149124, + "grad_norm": 0.5884960293769836, + "learning_rate": 4.800190635082271e-06, + "loss": 0.6332, + "step": 2523 + }, + { + "epoch": 0.3982957235284835, + "grad_norm": 0.5803056359291077, + "learning_rate": 4.800027908967234e-06, + "loss": 0.596, + "step": 2524 + }, + { + "epoch": 0.39845352690547575, + "grad_norm": 0.6037631034851074, + "learning_rate": 4.7998651193770285e-06, + "loss": 0.5934, + "step": 2525 + }, + { + "epoch": 0.39861133028246803, + "grad_norm": 0.613514244556427, + "learning_rate": 4.799702266316148e-06, + "loss": 0.5835, + "step": 2526 + }, + { + "epoch": 0.3987691336594603, + "grad_norm": 0.677547037601471, + "learning_rate": 4.7995393497890864e-06, + "loss": 0.5805, + "step": 2527 + }, + { + "epoch": 0.3989269370364526, + "grad_norm": 0.60105961561203, + "learning_rate": 4.799376369800339e-06, + "loss": 0.6053, + "step": 2528 + }, + { + "epoch": 0.3990847404134448, + "grad_norm": 0.5907654762268066, + "learning_rate": 4.799213326354404e-06, + "loss": 0.6147, + "step": 2529 + }, + { + "epoch": 0.3992425437904371, + "grad_norm": 0.58949875831604, + "learning_rate": 4.799050219455781e-06, + "loss": 0.5707, + "step": 2530 + }, + { + "epoch": 0.3994003471674294, + "grad_norm": 0.606492280960083, + "learning_rate": 4.798887049108972e-06, + "loss": 0.5847, + "step": 2531 + }, + { + "epoch": 0.3995581505444217, + "grad_norm": 0.5859805345535278, + "learning_rate": 4.79872381531848e-06, + "loss": 0.624, + "step": 2532 + }, + { + "epoch": 0.3997159539214139, + "grad_norm": 0.5812191963195801, + "learning_rate": 4.798560518088809e-06, + "loss": 0.5863, + "step": 2533 + }, + { + "epoch": 0.3998737572984062, + "grad_norm": 0.6150538325309753, + "learning_rate": 4.798397157424466e-06, + "loss": 0.5904, + "step": 2534 + }, + { + "epoch": 0.40003156067539847, + "grad_norm": 0.5832411646842957, + "learning_rate": 4.79823373332996e-06, + "loss": 0.6145, + "step": 2535 + }, + { + "epoch": 0.4001893640523907, + "grad_norm": 0.5776795148849487, + "learning_rate": 4.7980702458098e-06, + "loss": 0.6335, + "step": 2536 + }, + { + "epoch": 0.400347167429383, + "grad_norm": 0.565015435218811, + "learning_rate": 4.7979066948684995e-06, + "loss": 0.5276, + "step": 2537 + }, + { + "epoch": 0.40050497080637526, + "grad_norm": 0.6051506400108337, + "learning_rate": 4.7977430805105705e-06, + "loss": 0.5942, + "step": 2538 + }, + { + "epoch": 0.40066277418336754, + "grad_norm": 0.573990523815155, + "learning_rate": 4.797579402740529e-06, + "loss": 0.5799, + "step": 2539 + }, + { + "epoch": 0.40082057756035977, + "grad_norm": 0.5994016528129578, + "learning_rate": 4.7974156615628915e-06, + "loss": 0.5634, + "step": 2540 + }, + { + "epoch": 0.40097838093735205, + "grad_norm": 0.569948136806488, + "learning_rate": 4.797251856982178e-06, + "loss": 0.5582, + "step": 2541 + }, + { + "epoch": 0.40113618431434434, + "grad_norm": 0.6044588685035706, + "learning_rate": 4.797087989002908e-06, + "loss": 0.6084, + "step": 2542 + }, + { + "epoch": 0.4012939876913366, + "grad_norm": 0.6095245480537415, + "learning_rate": 4.796924057629604e-06, + "loss": 0.5772, + "step": 2543 + }, + { + "epoch": 0.40145179106832884, + "grad_norm": 0.6128333806991577, + "learning_rate": 4.7967600628667906e-06, + "loss": 0.574, + "step": 2544 + }, + { + "epoch": 0.4016095944453211, + "grad_norm": 0.6016311645507812, + "learning_rate": 4.796596004718995e-06, + "loss": 0.6133, + "step": 2545 + }, + { + "epoch": 0.4017673978223134, + "grad_norm": 0.589280903339386, + "learning_rate": 4.796431883190742e-06, + "loss": 0.5971, + "step": 2546 + }, + { + "epoch": 0.4019252011993057, + "grad_norm": 0.5813164114952087, + "learning_rate": 4.796267698286562e-06, + "loss": 0.5769, + "step": 2547 + }, + { + "epoch": 0.4020830045762979, + "grad_norm": 0.5667118430137634, + "learning_rate": 4.796103450010987e-06, + "loss": 0.5832, + "step": 2548 + }, + { + "epoch": 0.4022408079532902, + "grad_norm": 0.5932059288024902, + "learning_rate": 4.795939138368549e-06, + "loss": 0.6167, + "step": 2549 + }, + { + "epoch": 0.4023986113302825, + "grad_norm": 0.6004566550254822, + "learning_rate": 4.795774763363783e-06, + "loss": 0.5838, + "step": 2550 + }, + { + "epoch": 0.4025564147072747, + "grad_norm": 0.6080920100212097, + "learning_rate": 4.795610325001224e-06, + "loss": 0.5654, + "step": 2551 + }, + { + "epoch": 0.402714218084267, + "grad_norm": 0.5754812955856323, + "learning_rate": 4.795445823285413e-06, + "loss": 0.6146, + "step": 2552 + }, + { + "epoch": 0.4028720214612593, + "grad_norm": 0.5764461755752563, + "learning_rate": 4.795281258220888e-06, + "loss": 0.5739, + "step": 2553 + }, + { + "epoch": 0.40302982483825156, + "grad_norm": 0.573250412940979, + "learning_rate": 4.7951166298121895e-06, + "loss": 0.6084, + "step": 2554 + }, + { + "epoch": 0.4031876282152438, + "grad_norm": 0.6204304099082947, + "learning_rate": 4.794951938063862e-06, + "loss": 0.6199, + "step": 2555 + }, + { + "epoch": 0.40334543159223607, + "grad_norm": 0.5996941328048706, + "learning_rate": 4.794787182980451e-06, + "loss": 0.5455, + "step": 2556 + }, + { + "epoch": 0.40350323496922835, + "grad_norm": 0.6071109771728516, + "learning_rate": 4.794622364566504e-06, + "loss": 0.5848, + "step": 2557 + }, + { + "epoch": 0.40366103834622064, + "grad_norm": 0.6239042282104492, + "learning_rate": 4.794457482826568e-06, + "loss": 0.6221, + "step": 2558 + }, + { + "epoch": 0.40381884172321286, + "grad_norm": 0.6024141907691956, + "learning_rate": 4.794292537765194e-06, + "loss": 0.6111, + "step": 2559 + }, + { + "epoch": 0.40397664510020515, + "grad_norm": 0.589627742767334, + "learning_rate": 4.794127529386933e-06, + "loss": 0.5907, + "step": 2560 + }, + { + "epoch": 0.4041344484771974, + "grad_norm": 0.558574914932251, + "learning_rate": 4.7939624576963405e-06, + "loss": 0.6093, + "step": 2561 + }, + { + "epoch": 0.40429225185418965, + "grad_norm": 0.5887386202812195, + "learning_rate": 4.793797322697972e-06, + "loss": 0.6015, + "step": 2562 + }, + { + "epoch": 0.40445005523118194, + "grad_norm": 0.6225807070732117, + "learning_rate": 4.793632124396384e-06, + "loss": 0.6005, + "step": 2563 + }, + { + "epoch": 0.4046078586081742, + "grad_norm": 0.6241739988327026, + "learning_rate": 4.793466862796134e-06, + "loss": 0.5842, + "step": 2564 + }, + { + "epoch": 0.4047656619851665, + "grad_norm": 0.5653369426727295, + "learning_rate": 4.793301537901786e-06, + "loss": 0.6154, + "step": 2565 + }, + { + "epoch": 0.40492346536215873, + "grad_norm": 0.549761950969696, + "learning_rate": 4.7931361497179016e-06, + "loss": 0.5756, + "step": 2566 + }, + { + "epoch": 0.405081268739151, + "grad_norm": 0.5907023549079895, + "learning_rate": 4.7929706982490445e-06, + "loss": 0.5873, + "step": 2567 + }, + { + "epoch": 0.4052390721161433, + "grad_norm": 0.6052724719047546, + "learning_rate": 4.792805183499781e-06, + "loss": 0.5771, + "step": 2568 + }, + { + "epoch": 0.4053968754931356, + "grad_norm": 0.5914238691329956, + "learning_rate": 4.792639605474678e-06, + "loss": 0.5875, + "step": 2569 + }, + { + "epoch": 0.4055546788701278, + "grad_norm": 0.6595968008041382, + "learning_rate": 4.792473964178307e-06, + "loss": 0.594, + "step": 2570 + }, + { + "epoch": 0.4057124822471201, + "grad_norm": 0.5953301787376404, + "learning_rate": 4.792308259615237e-06, + "loss": 0.5634, + "step": 2571 + }, + { + "epoch": 0.40587028562411237, + "grad_norm": 0.6078230142593384, + "learning_rate": 4.792142491790042e-06, + "loss": 0.577, + "step": 2572 + }, + { + "epoch": 0.4060280890011046, + "grad_norm": 0.5855866074562073, + "learning_rate": 4.791976660707299e-06, + "loss": 0.6146, + "step": 2573 + }, + { + "epoch": 0.4061858923780969, + "grad_norm": 0.5888255834579468, + "learning_rate": 4.791810766371582e-06, + "loss": 0.5961, + "step": 2574 + }, + { + "epoch": 0.40634369575508916, + "grad_norm": 0.5970906019210815, + "learning_rate": 4.791644808787469e-06, + "loss": 0.576, + "step": 2575 + }, + { + "epoch": 0.40650149913208145, + "grad_norm": 0.644451916217804, + "learning_rate": 4.79147878795954e-06, + "loss": 0.5755, + "step": 2576 + }, + { + "epoch": 0.4066593025090737, + "grad_norm": 0.5966418981552124, + "learning_rate": 4.79131270389238e-06, + "loss": 0.5928, + "step": 2577 + }, + { + "epoch": 0.40681710588606596, + "grad_norm": 0.6343523263931274, + "learning_rate": 4.791146556590569e-06, + "loss": 0.5989, + "step": 2578 + }, + { + "epoch": 0.40697490926305824, + "grad_norm": 0.5552530884742737, + "learning_rate": 4.790980346058693e-06, + "loss": 0.5729, + "step": 2579 + }, + { + "epoch": 0.4071327126400505, + "grad_norm": 0.5998293161392212, + "learning_rate": 4.79081407230134e-06, + "loss": 0.6129, + "step": 2580 + }, + { + "epoch": 0.40729051601704275, + "grad_norm": 0.5930588841438293, + "learning_rate": 4.790647735323098e-06, + "loss": 0.5873, + "step": 2581 + }, + { + "epoch": 0.40744831939403503, + "grad_norm": 0.5967591404914856, + "learning_rate": 4.790481335128557e-06, + "loss": 0.574, + "step": 2582 + }, + { + "epoch": 0.4076061227710273, + "grad_norm": 0.6195694208145142, + "learning_rate": 4.79031487172231e-06, + "loss": 0.604, + "step": 2583 + }, + { + "epoch": 0.4077639261480196, + "grad_norm": 0.6047678589820862, + "learning_rate": 4.790148345108952e-06, + "loss": 0.5395, + "step": 2584 + }, + { + "epoch": 0.4079217295250118, + "grad_norm": 0.5967791676521301, + "learning_rate": 4.789981755293076e-06, + "loss": 0.5898, + "step": 2585 + }, + { + "epoch": 0.4080795329020041, + "grad_norm": 0.6290505528450012, + "learning_rate": 4.789815102279282e-06, + "loss": 0.6104, + "step": 2586 + }, + { + "epoch": 0.4082373362789964, + "grad_norm": 0.6039069890975952, + "learning_rate": 4.789648386072169e-06, + "loss": 0.5798, + "step": 2587 + }, + { + "epoch": 0.4083951396559886, + "grad_norm": 0.6118987202644348, + "learning_rate": 4.789481606676336e-06, + "loss": 0.6015, + "step": 2588 + }, + { + "epoch": 0.4085529430329809, + "grad_norm": 0.6030541062355042, + "learning_rate": 4.789314764096388e-06, + "loss": 0.5752, + "step": 2589 + }, + { + "epoch": 0.4087107464099732, + "grad_norm": 0.5759915113449097, + "learning_rate": 4.789147858336927e-06, + "loss": 0.5802, + "step": 2590 + }, + { + "epoch": 0.40886854978696546, + "grad_norm": 0.5791226029396057, + "learning_rate": 4.7889808894025616e-06, + "loss": 0.5816, + "step": 2591 + }, + { + "epoch": 0.4090263531639577, + "grad_norm": 0.5580329298973083, + "learning_rate": 4.7888138572978985e-06, + "loss": 0.5601, + "step": 2592 + }, + { + "epoch": 0.40918415654095, + "grad_norm": 0.5899879336357117, + "learning_rate": 4.788646762027548e-06, + "loss": 0.5979, + "step": 2593 + }, + { + "epoch": 0.40934195991794226, + "grad_norm": 0.62022864818573, + "learning_rate": 4.788479603596121e-06, + "loss": 0.5826, + "step": 2594 + }, + { + "epoch": 0.40949976329493454, + "grad_norm": 0.5917497277259827, + "learning_rate": 4.788312382008231e-06, + "loss": 0.5732, + "step": 2595 + }, + { + "epoch": 0.40965756667192676, + "grad_norm": 0.5968687534332275, + "learning_rate": 4.788145097268492e-06, + "loss": 0.5576, + "step": 2596 + }, + { + "epoch": 0.40981537004891905, + "grad_norm": 0.5671234130859375, + "learning_rate": 4.787977749381522e-06, + "loss": 0.5774, + "step": 2597 + }, + { + "epoch": 0.40997317342591133, + "grad_norm": 0.6114801168441772, + "learning_rate": 4.787810338351939e-06, + "loss": 0.5979, + "step": 2598 + }, + { + "epoch": 0.41013097680290356, + "grad_norm": 0.5988049507141113, + "learning_rate": 4.787642864184362e-06, + "loss": 0.5988, + "step": 2599 + }, + { + "epoch": 0.41028878017989584, + "grad_norm": 0.5856754183769226, + "learning_rate": 4.787475326883414e-06, + "loss": 0.6144, + "step": 2600 + }, + { + "epoch": 0.4104465835568881, + "grad_norm": 0.5993143320083618, + "learning_rate": 4.7873077264537185e-06, + "loss": 0.5828, + "step": 2601 + }, + { + "epoch": 0.4106043869338804, + "grad_norm": 0.5663980841636658, + "learning_rate": 4.787140062899901e-06, + "loss": 0.579, + "step": 2602 + }, + { + "epoch": 0.41076219031087263, + "grad_norm": 0.5667562484741211, + "learning_rate": 4.786972336226589e-06, + "loss": 0.5686, + "step": 2603 + }, + { + "epoch": 0.4109199936878649, + "grad_norm": 0.5773575305938721, + "learning_rate": 4.78680454643841e-06, + "loss": 0.5923, + "step": 2604 + }, + { + "epoch": 0.4110777970648572, + "grad_norm": 0.5756294131278992, + "learning_rate": 4.7866366935399946e-06, + "loss": 0.6255, + "step": 2605 + }, + { + "epoch": 0.4112356004418495, + "grad_norm": 0.5999125242233276, + "learning_rate": 4.786468777535977e-06, + "loss": 0.6082, + "step": 2606 + }, + { + "epoch": 0.4113934038188417, + "grad_norm": 0.6037368774414062, + "learning_rate": 4.78630079843099e-06, + "loss": 0.5892, + "step": 2607 + }, + { + "epoch": 0.411551207195834, + "grad_norm": 0.5855896472930908, + "learning_rate": 4.786132756229669e-06, + "loss": 0.5663, + "step": 2608 + }, + { + "epoch": 0.4117090105728263, + "grad_norm": 0.6038429737091064, + "learning_rate": 4.785964650936652e-06, + "loss": 0.6121, + "step": 2609 + }, + { + "epoch": 0.4118668139498185, + "grad_norm": 0.6237514615058899, + "learning_rate": 4.785796482556578e-06, + "loss": 0.5906, + "step": 2610 + }, + { + "epoch": 0.4120246173268108, + "grad_norm": 0.6121618151664734, + "learning_rate": 4.78562825109409e-06, + "loss": 0.572, + "step": 2611 + }, + { + "epoch": 0.41218242070380307, + "grad_norm": 0.5839176177978516, + "learning_rate": 4.785459956553828e-06, + "loss": 0.6122, + "step": 2612 + }, + { + "epoch": 0.41234022408079535, + "grad_norm": 0.5965617299079895, + "learning_rate": 4.785291598940438e-06, + "loss": 0.5818, + "step": 2613 + }, + { + "epoch": 0.4124980274577876, + "grad_norm": 0.6554787755012512, + "learning_rate": 4.785123178258566e-06, + "loss": 0.5719, + "step": 2614 + }, + { + "epoch": 0.41265583083477986, + "grad_norm": 0.6517807245254517, + "learning_rate": 4.78495469451286e-06, + "loss": 0.612, + "step": 2615 + }, + { + "epoch": 0.41281363421177214, + "grad_norm": 0.6258900761604309, + "learning_rate": 4.784786147707969e-06, + "loss": 0.5998, + "step": 2616 + }, + { + "epoch": 0.4129714375887644, + "grad_norm": 0.6208417415618896, + "learning_rate": 4.784617537848546e-06, + "loss": 0.5864, + "step": 2617 + }, + { + "epoch": 0.41312924096575665, + "grad_norm": 0.5641432404518127, + "learning_rate": 4.784448864939243e-06, + "loss": 0.6022, + "step": 2618 + }, + { + "epoch": 0.41328704434274893, + "grad_norm": 0.6415213346481323, + "learning_rate": 4.7842801289847165e-06, + "loss": 0.5818, + "step": 2619 + }, + { + "epoch": 0.4134448477197412, + "grad_norm": 0.593088686466217, + "learning_rate": 4.784111329989621e-06, + "loss": 0.6104, + "step": 2620 + }, + { + "epoch": 0.41360265109673344, + "grad_norm": 0.5853867530822754, + "learning_rate": 4.783942467958616e-06, + "loss": 0.5756, + "step": 2621 + }, + { + "epoch": 0.4137604544737257, + "grad_norm": 0.5993637442588806, + "learning_rate": 4.783773542896361e-06, + "loss": 0.5794, + "step": 2622 + }, + { + "epoch": 0.413918257850718, + "grad_norm": 0.5726490616798401, + "learning_rate": 4.78360455480752e-06, + "loss": 0.555, + "step": 2623 + }, + { + "epoch": 0.4140760612277103, + "grad_norm": 0.5885564088821411, + "learning_rate": 4.783435503696755e-06, + "loss": 0.5463, + "step": 2624 + }, + { + "epoch": 0.4142338646047025, + "grad_norm": 0.5989733338356018, + "learning_rate": 4.783266389568731e-06, + "loss": 0.5998, + "step": 2625 + }, + { + "epoch": 0.4143916679816948, + "grad_norm": 0.5824238657951355, + "learning_rate": 4.783097212428117e-06, + "loss": 0.6121, + "step": 2626 + }, + { + "epoch": 0.4145494713586871, + "grad_norm": 0.5949556231498718, + "learning_rate": 4.78292797227958e-06, + "loss": 0.59, + "step": 2627 + }, + { + "epoch": 0.41470727473567937, + "grad_norm": 0.5692095756530762, + "learning_rate": 4.782758669127792e-06, + "loss": 0.6097, + "step": 2628 + }, + { + "epoch": 0.4148650781126716, + "grad_norm": 0.6068646907806396, + "learning_rate": 4.782589302977424e-06, + "loss": 0.6037, + "step": 2629 + }, + { + "epoch": 0.4150228814896639, + "grad_norm": 0.5921100378036499, + "learning_rate": 4.782419873833151e-06, + "loss": 0.6093, + "step": 2630 + }, + { + "epoch": 0.41518068486665616, + "grad_norm": 0.5902360677719116, + "learning_rate": 4.782250381699648e-06, + "loss": 0.5895, + "step": 2631 + }, + { + "epoch": 0.41533848824364844, + "grad_norm": 0.599510669708252, + "learning_rate": 4.782080826581593e-06, + "loss": 0.5915, + "step": 2632 + }, + { + "epoch": 0.41549629162064067, + "grad_norm": 0.616295576095581, + "learning_rate": 4.781911208483666e-06, + "loss": 0.605, + "step": 2633 + }, + { + "epoch": 0.41565409499763295, + "grad_norm": 0.5989536046981812, + "learning_rate": 4.781741527410547e-06, + "loss": 0.6079, + "step": 2634 + }, + { + "epoch": 0.41581189837462523, + "grad_norm": 0.606496274471283, + "learning_rate": 4.781571783366921e-06, + "loss": 0.5388, + "step": 2635 + }, + { + "epoch": 0.41596970175161746, + "grad_norm": 0.5721690058708191, + "learning_rate": 4.781401976357469e-06, + "loss": 0.6245, + "step": 2636 + }, + { + "epoch": 0.41612750512860974, + "grad_norm": 0.5814123153686523, + "learning_rate": 4.78123210638688e-06, + "loss": 0.6298, + "step": 2637 + }, + { + "epoch": 0.416285308505602, + "grad_norm": 0.5871867537498474, + "learning_rate": 4.78106217345984e-06, + "loss": 0.6102, + "step": 2638 + }, + { + "epoch": 0.4164431118825943, + "grad_norm": 0.5794053673744202, + "learning_rate": 4.780892177581041e-06, + "loss": 0.5582, + "step": 2639 + }, + { + "epoch": 0.41660091525958654, + "grad_norm": 0.5733655095100403, + "learning_rate": 4.780722118755174e-06, + "loss": 0.5779, + "step": 2640 + }, + { + "epoch": 0.4167587186365788, + "grad_norm": 0.5843023061752319, + "learning_rate": 4.780551996986929e-06, + "loss": 0.5818, + "step": 2641 + }, + { + "epoch": 0.4169165220135711, + "grad_norm": 0.5858648419380188, + "learning_rate": 4.780381812281005e-06, + "loss": 0.5942, + "step": 2642 + }, + { + "epoch": 0.4170743253905634, + "grad_norm": 0.5878325700759888, + "learning_rate": 4.780211564642098e-06, + "loss": 0.5894, + "step": 2643 + }, + { + "epoch": 0.4172321287675556, + "grad_norm": 0.5800529718399048, + "learning_rate": 4.780041254074904e-06, + "loss": 0.605, + "step": 2644 + }, + { + "epoch": 0.4173899321445479, + "grad_norm": 0.5722482800483704, + "learning_rate": 4.779870880584126e-06, + "loss": 0.5844, + "step": 2645 + }, + { + "epoch": 0.4175477355215402, + "grad_norm": 0.6268527507781982, + "learning_rate": 4.779700444174463e-06, + "loss": 0.5857, + "step": 2646 + }, + { + "epoch": 0.4177055388985324, + "grad_norm": 0.5933964848518372, + "learning_rate": 4.779529944850621e-06, + "loss": 0.5782, + "step": 2647 + }, + { + "epoch": 0.4178633422755247, + "grad_norm": 0.5938589572906494, + "learning_rate": 4.779359382617305e-06, + "loss": 0.5674, + "step": 2648 + }, + { + "epoch": 0.41802114565251697, + "grad_norm": 0.6071444153785706, + "learning_rate": 4.7791887574792204e-06, + "loss": 0.5692, + "step": 2649 + }, + { + "epoch": 0.41817894902950925, + "grad_norm": 0.5795953869819641, + "learning_rate": 4.779018069441079e-06, + "loss": 0.6024, + "step": 2650 + }, + { + "epoch": 0.4183367524065015, + "grad_norm": 0.6048374772071838, + "learning_rate": 4.778847318507589e-06, + "loss": 0.6106, + "step": 2651 + }, + { + "epoch": 0.41849455578349376, + "grad_norm": 0.5890195369720459, + "learning_rate": 4.778676504683463e-06, + "loss": 0.6135, + "step": 2652 + }, + { + "epoch": 0.41865235916048604, + "grad_norm": 0.5743721723556519, + "learning_rate": 4.778505627973416e-06, + "loss": 0.5635, + "step": 2653 + }, + { + "epoch": 0.4188101625374783, + "grad_norm": 0.5868130922317505, + "learning_rate": 4.778334688382164e-06, + "loss": 0.6028, + "step": 2654 + }, + { + "epoch": 0.41896796591447055, + "grad_norm": 0.5817955732345581, + "learning_rate": 4.778163685914422e-06, + "loss": 0.5849, + "step": 2655 + }, + { + "epoch": 0.41912576929146284, + "grad_norm": 0.5897790193557739, + "learning_rate": 4.7779926205749115e-06, + "loss": 0.6363, + "step": 2656 + }, + { + "epoch": 0.4192835726684551, + "grad_norm": 0.6244426965713501, + "learning_rate": 4.777821492368354e-06, + "loss": 0.5864, + "step": 2657 + }, + { + "epoch": 0.41944137604544734, + "grad_norm": 0.59083491563797, + "learning_rate": 4.77765030129947e-06, + "loss": 0.5449, + "step": 2658 + }, + { + "epoch": 0.4195991794224396, + "grad_norm": 0.5851543545722961, + "learning_rate": 4.777479047372986e-06, + "loss": 0.6098, + "step": 2659 + }, + { + "epoch": 0.4197569827994319, + "grad_norm": 0.5929679274559021, + "learning_rate": 4.777307730593627e-06, + "loss": 0.5744, + "step": 2660 + }, + { + "epoch": 0.4199147861764242, + "grad_norm": 0.5910808444023132, + "learning_rate": 4.777136350966122e-06, + "loss": 0.5638, + "step": 2661 + }, + { + "epoch": 0.4200725895534164, + "grad_norm": 0.6317501068115234, + "learning_rate": 4.7769649084951995e-06, + "loss": 0.5803, + "step": 2662 + }, + { + "epoch": 0.4202303929304087, + "grad_norm": 0.5818907022476196, + "learning_rate": 4.776793403185591e-06, + "loss": 0.5822, + "step": 2663 + }, + { + "epoch": 0.420388196307401, + "grad_norm": 0.6002300977706909, + "learning_rate": 4.7766218350420305e-06, + "loss": 0.5778, + "step": 2664 + }, + { + "epoch": 0.42054599968439327, + "grad_norm": 0.6090407371520996, + "learning_rate": 4.776450204069252e-06, + "loss": 0.5859, + "step": 2665 + }, + { + "epoch": 0.4207038030613855, + "grad_norm": 0.5740833282470703, + "learning_rate": 4.776278510271992e-06, + "loss": 0.55, + "step": 2666 + }, + { + "epoch": 0.4208616064383778, + "grad_norm": 0.6168435215950012, + "learning_rate": 4.7761067536549895e-06, + "loss": 0.5861, + "step": 2667 + }, + { + "epoch": 0.42101940981537006, + "grad_norm": 0.6139153838157654, + "learning_rate": 4.775934934222985e-06, + "loss": 0.549, + "step": 2668 + }, + { + "epoch": 0.42117721319236234, + "grad_norm": 0.6066169142723083, + "learning_rate": 4.775763051980719e-06, + "loss": 0.6257, + "step": 2669 + }, + { + "epoch": 0.42133501656935457, + "grad_norm": 0.5798470377922058, + "learning_rate": 4.775591106932937e-06, + "loss": 0.616, + "step": 2670 + }, + { + "epoch": 0.42149281994634685, + "grad_norm": 0.60130774974823, + "learning_rate": 4.7754190990843806e-06, + "loss": 0.6194, + "step": 2671 + }, + { + "epoch": 0.42165062332333914, + "grad_norm": 0.6167163252830505, + "learning_rate": 4.7752470284398e-06, + "loss": 0.5664, + "step": 2672 + }, + { + "epoch": 0.42180842670033136, + "grad_norm": 0.5973706245422363, + "learning_rate": 4.775074895003944e-06, + "loss": 0.5956, + "step": 2673 + }, + { + "epoch": 0.42196623007732365, + "grad_norm": 0.5851060152053833, + "learning_rate": 4.774902698781562e-06, + "loss": 0.6122, + "step": 2674 + }, + { + "epoch": 0.42212403345431593, + "grad_norm": 0.6060495376586914, + "learning_rate": 4.774730439777405e-06, + "loss": 0.6014, + "step": 2675 + }, + { + "epoch": 0.4222818368313082, + "grad_norm": 0.5805568099021912, + "learning_rate": 4.77455811799623e-06, + "loss": 0.6075, + "step": 2676 + }, + { + "epoch": 0.42243964020830044, + "grad_norm": 0.5999797582626343, + "learning_rate": 4.774385733442789e-06, + "loss": 0.5867, + "step": 2677 + }, + { + "epoch": 0.4225974435852927, + "grad_norm": 0.586336076259613, + "learning_rate": 4.774213286121842e-06, + "loss": 0.6187, + "step": 2678 + }, + { + "epoch": 0.422755246962285, + "grad_norm": 0.5581575036048889, + "learning_rate": 4.774040776038149e-06, + "loss": 0.602, + "step": 2679 + }, + { + "epoch": 0.4229130503392773, + "grad_norm": 0.6198563575744629, + "learning_rate": 4.773868203196468e-06, + "loss": 0.5902, + "step": 2680 + }, + { + "epoch": 0.4230708537162695, + "grad_norm": 0.5803166627883911, + "learning_rate": 4.773695567601562e-06, + "loss": 0.5747, + "step": 2681 + }, + { + "epoch": 0.4232286570932618, + "grad_norm": 0.5773801207542419, + "learning_rate": 4.773522869258197e-06, + "loss": 0.5848, + "step": 2682 + }, + { + "epoch": 0.4233864604702541, + "grad_norm": 0.6190651059150696, + "learning_rate": 4.7733501081711385e-06, + "loss": 0.5605, + "step": 2683 + }, + { + "epoch": 0.4235442638472463, + "grad_norm": 0.601433277130127, + "learning_rate": 4.773177284345154e-06, + "loss": 0.6278, + "step": 2684 + }, + { + "epoch": 0.4237020672242386, + "grad_norm": 0.6123886108398438, + "learning_rate": 4.773004397785013e-06, + "loss": 0.5971, + "step": 2685 + }, + { + "epoch": 0.42385987060123087, + "grad_norm": 0.6127108931541443, + "learning_rate": 4.772831448495486e-06, + "loss": 0.6105, + "step": 2686 + }, + { + "epoch": 0.42401767397822315, + "grad_norm": 0.5799890756607056, + "learning_rate": 4.772658436481348e-06, + "loss": 0.615, + "step": 2687 + }, + { + "epoch": 0.4241754773552154, + "grad_norm": 0.6000646948814392, + "learning_rate": 4.772485361747372e-06, + "loss": 0.6189, + "step": 2688 + }, + { + "epoch": 0.42433328073220766, + "grad_norm": 0.5580902099609375, + "learning_rate": 4.772312224298335e-06, + "loss": 0.5701, + "step": 2689 + }, + { + "epoch": 0.42449108410919995, + "grad_norm": 0.6168724894523621, + "learning_rate": 4.772139024139015e-06, + "loss": 0.6155, + "step": 2690 + }, + { + "epoch": 0.42464888748619223, + "grad_norm": 0.5708628296852112, + "learning_rate": 4.771965761274191e-06, + "loss": 0.5948, + "step": 2691 + }, + { + "epoch": 0.42480669086318446, + "grad_norm": 0.5846563577651978, + "learning_rate": 4.771792435708648e-06, + "loss": 0.6048, + "step": 2692 + }, + { + "epoch": 0.42496449424017674, + "grad_norm": 0.5897479057312012, + "learning_rate": 4.771619047447164e-06, + "loss": 0.5956, + "step": 2693 + }, + { + "epoch": 0.425122297617169, + "grad_norm": 0.5830829739570618, + "learning_rate": 4.77144559649453e-06, + "loss": 0.5655, + "step": 2694 + }, + { + "epoch": 0.42528010099416125, + "grad_norm": 0.5663753151893616, + "learning_rate": 4.771272082855528e-06, + "loss": 0.5737, + "step": 2695 + }, + { + "epoch": 0.42543790437115353, + "grad_norm": 0.5658997893333435, + "learning_rate": 4.771098506534949e-06, + "loss": 0.6079, + "step": 2696 + }, + { + "epoch": 0.4255957077481458, + "grad_norm": 0.6047332286834717, + "learning_rate": 4.770924867537583e-06, + "loss": 0.6108, + "step": 2697 + }, + { + "epoch": 0.4257535111251381, + "grad_norm": 0.584319531917572, + "learning_rate": 4.770751165868221e-06, + "loss": 0.5886, + "step": 2698 + }, + { + "epoch": 0.4259113145021303, + "grad_norm": 0.5729540586471558, + "learning_rate": 4.770577401531658e-06, + "loss": 0.6076, + "step": 2699 + }, + { + "epoch": 0.4260691178791226, + "grad_norm": 0.5921175479888916, + "learning_rate": 4.77040357453269e-06, + "loss": 0.5774, + "step": 2700 + }, + { + "epoch": 0.4262269212561149, + "grad_norm": 0.626929759979248, + "learning_rate": 4.770229684876112e-06, + "loss": 0.594, + "step": 2701 + }, + { + "epoch": 0.42638472463310717, + "grad_norm": 0.5999388098716736, + "learning_rate": 4.770055732566725e-06, + "loss": 0.5892, + "step": 2702 + }, + { + "epoch": 0.4265425280100994, + "grad_norm": 0.5724656581878662, + "learning_rate": 4.769881717609328e-06, + "loss": 0.6519, + "step": 2703 + }, + { + "epoch": 0.4267003313870917, + "grad_norm": 0.7296868562698364, + "learning_rate": 4.769707640008724e-06, + "loss": 0.5443, + "step": 2704 + }, + { + "epoch": 0.42685813476408396, + "grad_norm": 0.5868944525718689, + "learning_rate": 4.769533499769718e-06, + "loss": 0.6382, + "step": 2705 + }, + { + "epoch": 0.42701593814107625, + "grad_norm": 0.6306737661361694, + "learning_rate": 4.769359296897115e-06, + "loss": 0.6006, + "step": 2706 + }, + { + "epoch": 0.4271737415180685, + "grad_norm": 0.6254597902297974, + "learning_rate": 4.769185031395722e-06, + "loss": 0.593, + "step": 2707 + }, + { + "epoch": 0.42733154489506076, + "grad_norm": 0.7143978476524353, + "learning_rate": 4.76901070327035e-06, + "loss": 0.5825, + "step": 2708 + }, + { + "epoch": 0.42748934827205304, + "grad_norm": 0.5945603847503662, + "learning_rate": 4.768836312525809e-06, + "loss": 0.6012, + "step": 2709 + }, + { + "epoch": 0.42764715164904527, + "grad_norm": 0.6628995537757874, + "learning_rate": 4.768661859166911e-06, + "loss": 0.589, + "step": 2710 + }, + { + "epoch": 0.42780495502603755, + "grad_norm": 0.6585350632667542, + "learning_rate": 4.768487343198471e-06, + "loss": 0.586, + "step": 2711 + }, + { + "epoch": 0.42796275840302983, + "grad_norm": 0.5895000696182251, + "learning_rate": 4.768312764625306e-06, + "loss": 0.5865, + "step": 2712 + }, + { + "epoch": 0.4281205617800221, + "grad_norm": 0.596474289894104, + "learning_rate": 4.768138123452234e-06, + "loss": 0.6145, + "step": 2713 + }, + { + "epoch": 0.42827836515701434, + "grad_norm": 0.6027510762214661, + "learning_rate": 4.7679634196840734e-06, + "loss": 0.5972, + "step": 2714 + }, + { + "epoch": 0.4284361685340066, + "grad_norm": 0.5994906425476074, + "learning_rate": 4.767788653325647e-06, + "loss": 0.5776, + "step": 2715 + }, + { + "epoch": 0.4285939719109989, + "grad_norm": 0.626194953918457, + "learning_rate": 4.7676138243817764e-06, + "loss": 0.6334, + "step": 2716 + }, + { + "epoch": 0.4287517752879912, + "grad_norm": 0.6330685615539551, + "learning_rate": 4.767438932857288e-06, + "loss": 0.6064, + "step": 2717 + }, + { + "epoch": 0.4289095786649834, + "grad_norm": 0.5921592116355896, + "learning_rate": 4.767263978757007e-06, + "loss": 0.5858, + "step": 2718 + }, + { + "epoch": 0.4290673820419757, + "grad_norm": 0.602155864238739, + "learning_rate": 4.767088962085763e-06, + "loss": 0.5833, + "step": 2719 + }, + { + "epoch": 0.429225185418968, + "grad_norm": 0.5867778658866882, + "learning_rate": 4.766913882848385e-06, + "loss": 0.6082, + "step": 2720 + }, + { + "epoch": 0.4293829887959602, + "grad_norm": 0.605570912361145, + "learning_rate": 4.766738741049706e-06, + "loss": 0.5937, + "step": 2721 + }, + { + "epoch": 0.4295407921729525, + "grad_norm": 0.6041519641876221, + "learning_rate": 4.766563536694557e-06, + "loss": 0.5817, + "step": 2722 + }, + { + "epoch": 0.4296985955499448, + "grad_norm": 0.6029902100563049, + "learning_rate": 4.7663882697877766e-06, + "loss": 0.636, + "step": 2723 + }, + { + "epoch": 0.42985639892693706, + "grad_norm": 0.6410138607025146, + "learning_rate": 4.7662129403341995e-06, + "loss": 0.5967, + "step": 2724 + }, + { + "epoch": 0.4300142023039293, + "grad_norm": 0.6172596216201782, + "learning_rate": 4.766037548338664e-06, + "loss": 0.5994, + "step": 2725 + }, + { + "epoch": 0.43017200568092157, + "grad_norm": 0.5801158547401428, + "learning_rate": 4.765862093806013e-06, + "loss": 0.5705, + "step": 2726 + }, + { + "epoch": 0.43032980905791385, + "grad_norm": 0.5699033141136169, + "learning_rate": 4.765686576741086e-06, + "loss": 0.5763, + "step": 2727 + }, + { + "epoch": 0.43048761243490613, + "grad_norm": 0.6263460516929626, + "learning_rate": 4.7655109971487275e-06, + "loss": 0.6055, + "step": 2728 + }, + { + "epoch": 0.43064541581189836, + "grad_norm": 0.5779768228530884, + "learning_rate": 4.7653353550337834e-06, + "loss": 0.6013, + "step": 2729 + }, + { + "epoch": 0.43080321918889064, + "grad_norm": 0.6151825785636902, + "learning_rate": 4.765159650401102e-06, + "loss": 0.6186, + "step": 2730 + }, + { + "epoch": 0.4309610225658829, + "grad_norm": 0.6216652393341064, + "learning_rate": 4.7649838832555306e-06, + "loss": 0.5909, + "step": 2731 + }, + { + "epoch": 0.43111882594287515, + "grad_norm": 0.5430160164833069, + "learning_rate": 4.764808053601921e-06, + "loss": 0.5759, + "step": 2732 + }, + { + "epoch": 0.43127662931986743, + "grad_norm": 0.5804219245910645, + "learning_rate": 4.764632161445125e-06, + "loss": 0.6093, + "step": 2733 + }, + { + "epoch": 0.4314344326968597, + "grad_norm": 0.599999725818634, + "learning_rate": 4.764456206789998e-06, + "loss": 0.5894, + "step": 2734 + }, + { + "epoch": 0.431592236073852, + "grad_norm": 0.6165889501571655, + "learning_rate": 4.764280189641394e-06, + "loss": 0.6023, + "step": 2735 + }, + { + "epoch": 0.4317500394508442, + "grad_norm": 0.5754494667053223, + "learning_rate": 4.764104110004172e-06, + "loss": 0.5895, + "step": 2736 + }, + { + "epoch": 0.4319078428278365, + "grad_norm": 0.5685514211654663, + "learning_rate": 4.763927967883191e-06, + "loss": 0.5786, + "step": 2737 + }, + { + "epoch": 0.4320656462048288, + "grad_norm": 0.5621395707130432, + "learning_rate": 4.763751763283312e-06, + "loss": 0.5833, + "step": 2738 + }, + { + "epoch": 0.4322234495818211, + "grad_norm": 0.5865002870559692, + "learning_rate": 4.763575496209398e-06, + "loss": 0.6042, + "step": 2739 + }, + { + "epoch": 0.4323812529588133, + "grad_norm": 0.5723501443862915, + "learning_rate": 4.763399166666314e-06, + "loss": 0.5826, + "step": 2740 + }, + { + "epoch": 0.4325390563358056, + "grad_norm": 0.6359639763832092, + "learning_rate": 4.763222774658926e-06, + "loss": 0.62, + "step": 2741 + }, + { + "epoch": 0.43269685971279787, + "grad_norm": 0.6119403839111328, + "learning_rate": 4.763046320192101e-06, + "loss": 0.6207, + "step": 2742 + }, + { + "epoch": 0.43285466308979015, + "grad_norm": 0.5801235437393188, + "learning_rate": 4.76286980327071e-06, + "loss": 0.584, + "step": 2743 + }, + { + "epoch": 0.4330124664667824, + "grad_norm": 0.6053288578987122, + "learning_rate": 4.762693223899623e-06, + "loss": 0.599, + "step": 2744 + }, + { + "epoch": 0.43317026984377466, + "grad_norm": 0.5668724775314331, + "learning_rate": 4.762516582083715e-06, + "loss": 0.6202, + "step": 2745 + }, + { + "epoch": 0.43332807322076694, + "grad_norm": 0.565248429775238, + "learning_rate": 4.762339877827859e-06, + "loss": 0.5851, + "step": 2746 + }, + { + "epoch": 0.43348587659775917, + "grad_norm": 0.5973761677742004, + "learning_rate": 4.7621631111369335e-06, + "loss": 0.6003, + "step": 2747 + }, + { + "epoch": 0.43364367997475145, + "grad_norm": 0.5728095173835754, + "learning_rate": 4.761986282015816e-06, + "loss": 0.6223, + "step": 2748 + }, + { + "epoch": 0.43380148335174373, + "grad_norm": 0.6046932935714722, + "learning_rate": 4.761809390469386e-06, + "loss": 0.5626, + "step": 2749 + }, + { + "epoch": 0.433959286728736, + "grad_norm": 0.5883917808532715, + "learning_rate": 4.761632436502527e-06, + "loss": 0.5841, + "step": 2750 + }, + { + "epoch": 0.43411709010572824, + "grad_norm": 0.5916411876678467, + "learning_rate": 4.76145542012012e-06, + "loss": 0.6159, + "step": 2751 + }, + { + "epoch": 0.4342748934827205, + "grad_norm": 0.5967442393302917, + "learning_rate": 4.761278341327051e-06, + "loss": 0.5961, + "step": 2752 + }, + { + "epoch": 0.4344326968597128, + "grad_norm": 0.596850574016571, + "learning_rate": 4.761101200128208e-06, + "loss": 0.5634, + "step": 2753 + }, + { + "epoch": 0.4345905002367051, + "grad_norm": 0.6543436646461487, + "learning_rate": 4.760923996528479e-06, + "loss": 0.5993, + "step": 2754 + }, + { + "epoch": 0.4347483036136973, + "grad_norm": 0.6227535605430603, + "learning_rate": 4.760746730532755e-06, + "loss": 0.6101, + "step": 2755 + }, + { + "epoch": 0.4349061069906896, + "grad_norm": 0.5772423148155212, + "learning_rate": 4.760569402145927e-06, + "loss": 0.5813, + "step": 2756 + }, + { + "epoch": 0.4350639103676819, + "grad_norm": 0.5944473147392273, + "learning_rate": 4.7603920113728894e-06, + "loss": 0.5703, + "step": 2757 + }, + { + "epoch": 0.4352217137446741, + "grad_norm": 0.5950362086296082, + "learning_rate": 4.760214558218538e-06, + "loss": 0.5805, + "step": 2758 + }, + { + "epoch": 0.4353795171216664, + "grad_norm": 0.6190072298049927, + "learning_rate": 4.760037042687769e-06, + "loss": 0.6012, + "step": 2759 + }, + { + "epoch": 0.4355373204986587, + "grad_norm": 0.5829633474349976, + "learning_rate": 4.759859464785484e-06, + "loss": 0.5483, + "step": 2760 + }, + { + "epoch": 0.43569512387565096, + "grad_norm": 0.6005684733390808, + "learning_rate": 4.75968182451658e-06, + "loss": 0.5933, + "step": 2761 + }, + { + "epoch": 0.4358529272526432, + "grad_norm": 0.6383069157600403, + "learning_rate": 4.759504121885962e-06, + "loss": 0.5836, + "step": 2762 + }, + { + "epoch": 0.43601073062963547, + "grad_norm": 0.5883978009223938, + "learning_rate": 4.759326356898534e-06, + "loss": 0.611, + "step": 2763 + }, + { + "epoch": 0.43616853400662775, + "grad_norm": 0.6022762060165405, + "learning_rate": 4.759148529559201e-06, + "loss": 0.583, + "step": 2764 + }, + { + "epoch": 0.43632633738362003, + "grad_norm": 0.5723748207092285, + "learning_rate": 4.75897063987287e-06, + "loss": 0.5922, + "step": 2765 + }, + { + "epoch": 0.43648414076061226, + "grad_norm": 0.6035156846046448, + "learning_rate": 4.758792687844453e-06, + "loss": 0.6264, + "step": 2766 + }, + { + "epoch": 0.43664194413760454, + "grad_norm": 0.5855363011360168, + "learning_rate": 4.758614673478859e-06, + "loss": 0.6074, + "step": 2767 + }, + { + "epoch": 0.4367997475145968, + "grad_norm": 0.5681228637695312, + "learning_rate": 4.758436596781001e-06, + "loss": 0.5894, + "step": 2768 + }, + { + "epoch": 0.43695755089158905, + "grad_norm": 0.5668431520462036, + "learning_rate": 4.758258457755793e-06, + "loss": 0.6344, + "step": 2769 + }, + { + "epoch": 0.43711535426858134, + "grad_norm": 0.5751395225524902, + "learning_rate": 4.758080256408153e-06, + "loss": 0.6134, + "step": 2770 + }, + { + "epoch": 0.4372731576455736, + "grad_norm": 0.6039667129516602, + "learning_rate": 4.757901992742997e-06, + "loss": 0.5762, + "step": 2771 + }, + { + "epoch": 0.4374309610225659, + "grad_norm": 0.5899812579154968, + "learning_rate": 4.757723666765246e-06, + "loss": 0.5991, + "step": 2772 + }, + { + "epoch": 0.43758876439955813, + "grad_norm": 0.5811094045639038, + "learning_rate": 4.75754527847982e-06, + "loss": 0.5919, + "step": 2773 + }, + { + "epoch": 0.4377465677765504, + "grad_norm": 0.5980886220932007, + "learning_rate": 4.757366827891643e-06, + "loss": 0.5873, + "step": 2774 + }, + { + "epoch": 0.4379043711535427, + "grad_norm": 0.5916863679885864, + "learning_rate": 4.7571883150056396e-06, + "loss": 0.5697, + "step": 2775 + }, + { + "epoch": 0.438062174530535, + "grad_norm": 0.5886271595954895, + "learning_rate": 4.7570097398267366e-06, + "loss": 0.5863, + "step": 2776 + }, + { + "epoch": 0.4382199779075272, + "grad_norm": 0.5859681963920593, + "learning_rate": 4.756831102359862e-06, + "loss": 0.571, + "step": 2777 + }, + { + "epoch": 0.4383777812845195, + "grad_norm": 0.6106812953948975, + "learning_rate": 4.7566524026099455e-06, + "loss": 0.5675, + "step": 2778 + }, + { + "epoch": 0.43853558466151177, + "grad_norm": 0.5789639353752136, + "learning_rate": 4.75647364058192e-06, + "loss": 0.5927, + "step": 2779 + }, + { + "epoch": 0.43869338803850405, + "grad_norm": 0.5964926481246948, + "learning_rate": 4.756294816280717e-06, + "loss": 0.6023, + "step": 2780 + }, + { + "epoch": 0.4388511914154963, + "grad_norm": 0.578890323638916, + "learning_rate": 4.756115929711273e-06, + "loss": 0.5811, + "step": 2781 + }, + { + "epoch": 0.43900899479248856, + "grad_norm": 0.5742915272712708, + "learning_rate": 4.7559369808785235e-06, + "loss": 0.5851, + "step": 2782 + }, + { + "epoch": 0.43916679816948084, + "grad_norm": 0.5812144875526428, + "learning_rate": 4.755757969787409e-06, + "loss": 0.5656, + "step": 2783 + }, + { + "epoch": 0.43932460154647307, + "grad_norm": 0.583630383014679, + "learning_rate": 4.755578896442868e-06, + "loss": 0.5693, + "step": 2784 + }, + { + "epoch": 0.43948240492346535, + "grad_norm": 0.5791051983833313, + "learning_rate": 4.755399760849844e-06, + "loss": 0.5839, + "step": 2785 + }, + { + "epoch": 0.43964020830045764, + "grad_norm": 0.5932734608650208, + "learning_rate": 4.755220563013279e-06, + "loss": 0.6186, + "step": 2786 + }, + { + "epoch": 0.4397980116774499, + "grad_norm": 0.603436291217804, + "learning_rate": 4.755041302938119e-06, + "loss": 0.6105, + "step": 2787 + }, + { + "epoch": 0.43995581505444215, + "grad_norm": 0.6084696054458618, + "learning_rate": 4.754861980629313e-06, + "loss": 0.6311, + "step": 2788 + }, + { + "epoch": 0.44011361843143443, + "grad_norm": 0.6119621396064758, + "learning_rate": 4.754682596091808e-06, + "loss": 0.621, + "step": 2789 + }, + { + "epoch": 0.4402714218084267, + "grad_norm": 0.5859028100967407, + "learning_rate": 4.754503149330554e-06, + "loss": 0.5784, + "step": 2790 + }, + { + "epoch": 0.440429225185419, + "grad_norm": 0.5894346237182617, + "learning_rate": 4.7543236403505045e-06, + "loss": 0.5805, + "step": 2791 + }, + { + "epoch": 0.4405870285624112, + "grad_norm": 0.6027980446815491, + "learning_rate": 4.7541440691566135e-06, + "loss": 0.5597, + "step": 2792 + }, + { + "epoch": 0.4407448319394035, + "grad_norm": 0.5887139439582825, + "learning_rate": 4.753964435753836e-06, + "loss": 0.6016, + "step": 2793 + }, + { + "epoch": 0.4409026353163958, + "grad_norm": 0.5902952551841736, + "learning_rate": 4.753784740147129e-06, + "loss": 0.5782, + "step": 2794 + }, + { + "epoch": 0.441060438693388, + "grad_norm": 0.5814700126647949, + "learning_rate": 4.753604982341454e-06, + "loss": 0.5817, + "step": 2795 + }, + { + "epoch": 0.4412182420703803, + "grad_norm": 0.5833851099014282, + "learning_rate": 4.753425162341771e-06, + "loss": 0.596, + "step": 2796 + }, + { + "epoch": 0.4413760454473726, + "grad_norm": 0.8031631112098694, + "learning_rate": 4.753245280153041e-06, + "loss": 0.5993, + "step": 2797 + }, + { + "epoch": 0.44153384882436486, + "grad_norm": 0.6088582277297974, + "learning_rate": 4.753065335780229e-06, + "loss": 0.5851, + "step": 2798 + }, + { + "epoch": 0.4416916522013571, + "grad_norm": 0.643575131893158, + "learning_rate": 4.752885329228302e-06, + "loss": 0.6061, + "step": 2799 + }, + { + "epoch": 0.44184945557834937, + "grad_norm": 0.6004914045333862, + "learning_rate": 4.7527052605022264e-06, + "loss": 0.5805, + "step": 2800 + }, + { + "epoch": 0.44200725895534165, + "grad_norm": 0.5777478814125061, + "learning_rate": 4.752525129606973e-06, + "loss": 0.5829, + "step": 2801 + }, + { + "epoch": 0.44216506233233394, + "grad_norm": 0.5910287499427795, + "learning_rate": 4.752344936547512e-06, + "loss": 0.6119, + "step": 2802 + }, + { + "epoch": 0.44232286570932616, + "grad_norm": 0.612045168876648, + "learning_rate": 4.752164681328817e-06, + "loss": 0.5978, + "step": 2803 + }, + { + "epoch": 0.44248066908631845, + "grad_norm": 0.5773860812187195, + "learning_rate": 4.751984363955861e-06, + "loss": 0.5784, + "step": 2804 + }, + { + "epoch": 0.44263847246331073, + "grad_norm": 0.589431881904602, + "learning_rate": 4.7518039844336224e-06, + "loss": 0.6024, + "step": 2805 + }, + { + "epoch": 0.44279627584030296, + "grad_norm": 0.5788934230804443, + "learning_rate": 4.7516235427670786e-06, + "loss": 0.5832, + "step": 2806 + }, + { + "epoch": 0.44295407921729524, + "grad_norm": 0.5937870144844055, + "learning_rate": 4.751443038961208e-06, + "loss": 0.621, + "step": 2807 + }, + { + "epoch": 0.4431118825942875, + "grad_norm": 0.5824783444404602, + "learning_rate": 4.751262473020994e-06, + "loss": 0.5994, + "step": 2808 + }, + { + "epoch": 0.4432696859712798, + "grad_norm": 0.6123673915863037, + "learning_rate": 4.7510818449514195e-06, + "loss": 0.612, + "step": 2809 + }, + { + "epoch": 0.44342748934827203, + "grad_norm": 0.6018616557121277, + "learning_rate": 4.750901154757469e-06, + "loss": 0.5823, + "step": 2810 + }, + { + "epoch": 0.4435852927252643, + "grad_norm": 0.637744665145874, + "learning_rate": 4.750720402444128e-06, + "loss": 0.5569, + "step": 2811 + }, + { + "epoch": 0.4437430961022566, + "grad_norm": 0.5836570262908936, + "learning_rate": 4.750539588016386e-06, + "loss": 0.588, + "step": 2812 + }, + { + "epoch": 0.4439008994792489, + "grad_norm": 0.5925715565681458, + "learning_rate": 4.750358711479234e-06, + "loss": 0.6091, + "step": 2813 + }, + { + "epoch": 0.4440587028562411, + "grad_norm": 0.5948092937469482, + "learning_rate": 4.750177772837661e-06, + "loss": 0.5976, + "step": 2814 + }, + { + "epoch": 0.4442165062332334, + "grad_norm": 0.5890248417854309, + "learning_rate": 4.7499967720966625e-06, + "loss": 0.6207, + "step": 2815 + }, + { + "epoch": 0.44437430961022567, + "grad_norm": 0.5835292935371399, + "learning_rate": 4.749815709261234e-06, + "loss": 0.5776, + "step": 2816 + }, + { + "epoch": 0.44453211298721795, + "grad_norm": 0.5877990126609802, + "learning_rate": 4.749634584336371e-06, + "loss": 0.5765, + "step": 2817 + }, + { + "epoch": 0.4446899163642102, + "grad_norm": 0.5949344635009766, + "learning_rate": 4.749453397327073e-06, + "loss": 0.6096, + "step": 2818 + }, + { + "epoch": 0.44484771974120246, + "grad_norm": 0.6080163717269897, + "learning_rate": 4.749272148238339e-06, + "loss": 0.5701, + "step": 2819 + }, + { + "epoch": 0.44500552311819475, + "grad_norm": 0.5782211422920227, + "learning_rate": 4.749090837075173e-06, + "loss": 0.6029, + "step": 2820 + }, + { + "epoch": 0.445163326495187, + "grad_norm": 0.6040611267089844, + "learning_rate": 4.748909463842577e-06, + "loss": 0.6015, + "step": 2821 + }, + { + "epoch": 0.44532112987217926, + "grad_norm": 0.6019403338432312, + "learning_rate": 4.748728028545558e-06, + "loss": 0.5969, + "step": 2822 + }, + { + "epoch": 0.44547893324917154, + "grad_norm": 0.6153128743171692, + "learning_rate": 4.748546531189123e-06, + "loss": 0.5926, + "step": 2823 + }, + { + "epoch": 0.4456367366261638, + "grad_norm": 0.5930256843566895, + "learning_rate": 4.7483649717782795e-06, + "loss": 0.61, + "step": 2824 + }, + { + "epoch": 0.44579454000315605, + "grad_norm": 0.6106759905815125, + "learning_rate": 4.748183350318039e-06, + "loss": 0.6186, + "step": 2825 + }, + { + "epoch": 0.44595234338014833, + "grad_norm": 0.5888760685920715, + "learning_rate": 4.748001666813414e-06, + "loss": 0.5946, + "step": 2826 + }, + { + "epoch": 0.4461101467571406, + "grad_norm": 0.5950741171836853, + "learning_rate": 4.7478199212694186e-06, + "loss": 0.5853, + "step": 2827 + }, + { + "epoch": 0.4462679501341329, + "grad_norm": 0.6045480966567993, + "learning_rate": 4.7476381136910685e-06, + "loss": 0.6285, + "step": 2828 + }, + { + "epoch": 0.4464257535111251, + "grad_norm": 0.5780251622200012, + "learning_rate": 4.74745624408338e-06, + "loss": 0.5755, + "step": 2829 + }, + { + "epoch": 0.4465835568881174, + "grad_norm": 0.6459381580352783, + "learning_rate": 4.747274312451373e-06, + "loss": 0.6122, + "step": 2830 + }, + { + "epoch": 0.4467413602651097, + "grad_norm": 0.5933216214179993, + "learning_rate": 4.747092318800069e-06, + "loss": 0.6432, + "step": 2831 + }, + { + "epoch": 0.4468991636421019, + "grad_norm": 0.6148660182952881, + "learning_rate": 4.74691026313449e-06, + "loss": 0.6048, + "step": 2832 + }, + { + "epoch": 0.4470569670190942, + "grad_norm": 0.5895636677742004, + "learning_rate": 4.74672814545966e-06, + "loss": 0.5802, + "step": 2833 + }, + { + "epoch": 0.4472147703960865, + "grad_norm": 0.5891528129577637, + "learning_rate": 4.746545965780606e-06, + "loss": 0.605, + "step": 2834 + }, + { + "epoch": 0.44737257377307876, + "grad_norm": 0.5666994452476501, + "learning_rate": 4.746363724102354e-06, + "loss": 0.5731, + "step": 2835 + }, + { + "epoch": 0.447530377150071, + "grad_norm": 0.5816834568977356, + "learning_rate": 4.7461814204299365e-06, + "loss": 0.6147, + "step": 2836 + }, + { + "epoch": 0.4476881805270633, + "grad_norm": 0.5729712843894958, + "learning_rate": 4.745999054768381e-06, + "loss": 0.5973, + "step": 2837 + }, + { + "epoch": 0.44784598390405556, + "grad_norm": 0.628150224685669, + "learning_rate": 4.745816627122722e-06, + "loss": 0.5953, + "step": 2838 + }, + { + "epoch": 0.44800378728104784, + "grad_norm": 0.5706257820129395, + "learning_rate": 4.745634137497994e-06, + "loss": 0.5803, + "step": 2839 + }, + { + "epoch": 0.44816159065804007, + "grad_norm": 0.6167499423027039, + "learning_rate": 4.7454515858992345e-06, + "loss": 0.6162, + "step": 2840 + }, + { + "epoch": 0.44831939403503235, + "grad_norm": 0.6195171475410461, + "learning_rate": 4.745268972331479e-06, + "loss": 0.562, + "step": 2841 + }, + { + "epoch": 0.44847719741202463, + "grad_norm": 0.6242823600769043, + "learning_rate": 4.745086296799769e-06, + "loss": 0.6102, + "step": 2842 + }, + { + "epoch": 0.44863500078901686, + "grad_norm": 0.6065537929534912, + "learning_rate": 4.744903559309145e-06, + "loss": 0.5823, + "step": 2843 + }, + { + "epoch": 0.44879280416600914, + "grad_norm": 0.5818787813186646, + "learning_rate": 4.744720759864651e-06, + "loss": 0.5693, + "step": 2844 + }, + { + "epoch": 0.4489506075430014, + "grad_norm": 0.5979520082473755, + "learning_rate": 4.744537898471332e-06, + "loss": 0.6132, + "step": 2845 + }, + { + "epoch": 0.4491084109199937, + "grad_norm": 0.5996047854423523, + "learning_rate": 4.744354975134232e-06, + "loss": 0.5758, + "step": 2846 + }, + { + "epoch": 0.44926621429698593, + "grad_norm": 0.577583372592926, + "learning_rate": 4.744171989858403e-06, + "loss": 0.5656, + "step": 2847 + }, + { + "epoch": 0.4494240176739782, + "grad_norm": 0.5795374512672424, + "learning_rate": 4.743988942648891e-06, + "loss": 0.5743, + "step": 2848 + }, + { + "epoch": 0.4495818210509705, + "grad_norm": 0.6052619814872742, + "learning_rate": 4.743805833510752e-06, + "loss": 0.5758, + "step": 2849 + }, + { + "epoch": 0.4497396244279628, + "grad_norm": 0.6032705903053284, + "learning_rate": 4.743622662449036e-06, + "loss": 0.6119, + "step": 2850 + }, + { + "epoch": 0.449897427804955, + "grad_norm": 0.6007672548294067, + "learning_rate": 4.743439429468799e-06, + "loss": 0.6127, + "step": 2851 + }, + { + "epoch": 0.4500552311819473, + "grad_norm": 0.5924961566925049, + "learning_rate": 4.7432561345750976e-06, + "loss": 0.618, + "step": 2852 + }, + { + "epoch": 0.4502130345589396, + "grad_norm": 0.5857877135276794, + "learning_rate": 4.7430727777729915e-06, + "loss": 0.6175, + "step": 2853 + }, + { + "epoch": 0.45037083793593186, + "grad_norm": 0.58440762758255, + "learning_rate": 4.742889359067539e-06, + "loss": 0.5632, + "step": 2854 + }, + { + "epoch": 0.4505286413129241, + "grad_norm": 0.5713158249855042, + "learning_rate": 4.742705878463804e-06, + "loss": 0.5827, + "step": 2855 + }, + { + "epoch": 0.45068644468991637, + "grad_norm": 0.5995749831199646, + "learning_rate": 4.7425223359668484e-06, + "loss": 0.5884, + "step": 2856 + }, + { + "epoch": 0.45084424806690865, + "grad_norm": 0.5931146740913391, + "learning_rate": 4.742338731581738e-06, + "loss": 0.5772, + "step": 2857 + }, + { + "epoch": 0.4510020514439009, + "grad_norm": 0.5784067511558533, + "learning_rate": 4.74215506531354e-06, + "loss": 0.572, + "step": 2858 + }, + { + "epoch": 0.45115985482089316, + "grad_norm": 0.610621988773346, + "learning_rate": 4.741971337167325e-06, + "loss": 0.5961, + "step": 2859 + }, + { + "epoch": 0.45131765819788544, + "grad_norm": 0.6042687892913818, + "learning_rate": 4.74178754714816e-06, + "loss": 0.5877, + "step": 2860 + }, + { + "epoch": 0.4514754615748777, + "grad_norm": 0.6182512044906616, + "learning_rate": 4.741603695261119e-06, + "loss": 0.5949, + "step": 2861 + }, + { + "epoch": 0.45163326495186995, + "grad_norm": 0.6074263453483582, + "learning_rate": 4.741419781511276e-06, + "loss": 0.5745, + "step": 2862 + }, + { + "epoch": 0.45179106832886223, + "grad_norm": 0.609637439250946, + "learning_rate": 4.741235805903707e-06, + "loss": 0.5708, + "step": 2863 + }, + { + "epoch": 0.4519488717058545, + "grad_norm": 0.5899096727371216, + "learning_rate": 4.7410517684434876e-06, + "loss": 0.5881, + "step": 2864 + }, + { + "epoch": 0.4521066750828468, + "grad_norm": 0.620669424533844, + "learning_rate": 4.740867669135698e-06, + "loss": 0.5944, + "step": 2865 + }, + { + "epoch": 0.452264478459839, + "grad_norm": 0.6070924997329712, + "learning_rate": 4.740683507985418e-06, + "loss": 0.5984, + "step": 2866 + }, + { + "epoch": 0.4524222818368313, + "grad_norm": 0.6183027029037476, + "learning_rate": 4.740499284997732e-06, + "loss": 0.6124, + "step": 2867 + }, + { + "epoch": 0.4525800852138236, + "grad_norm": 0.5561773777008057, + "learning_rate": 4.740315000177722e-06, + "loss": 0.5996, + "step": 2868 + }, + { + "epoch": 0.4527378885908158, + "grad_norm": 0.59273362159729, + "learning_rate": 4.7401306535304746e-06, + "loss": 0.5789, + "step": 2869 + }, + { + "epoch": 0.4528956919678081, + "grad_norm": 0.5953887104988098, + "learning_rate": 4.739946245061077e-06, + "loss": 0.6114, + "step": 2870 + }, + { + "epoch": 0.4530534953448004, + "grad_norm": 0.5851513743400574, + "learning_rate": 4.73976177477462e-06, + "loss": 0.5741, + "step": 2871 + }, + { + "epoch": 0.45321129872179267, + "grad_norm": 0.5905422568321228, + "learning_rate": 4.739577242676191e-06, + "loss": 0.6029, + "step": 2872 + }, + { + "epoch": 0.4533691020987849, + "grad_norm": 0.6164819002151489, + "learning_rate": 4.739392648770887e-06, + "loss": 0.6263, + "step": 2873 + }, + { + "epoch": 0.4535269054757772, + "grad_norm": 0.5860670208930969, + "learning_rate": 4.739207993063799e-06, + "loss": 0.5932, + "step": 2874 + }, + { + "epoch": 0.45368470885276946, + "grad_norm": 0.5795205235481262, + "learning_rate": 4.739023275560024e-06, + "loss": 0.5951, + "step": 2875 + }, + { + "epoch": 0.45384251222976174, + "grad_norm": 0.6044898629188538, + "learning_rate": 4.738838496264661e-06, + "loss": 0.5922, + "step": 2876 + }, + { + "epoch": 0.45400031560675397, + "grad_norm": 0.5972192883491516, + "learning_rate": 4.738653655182808e-06, + "loss": 0.5962, + "step": 2877 + }, + { + "epoch": 0.45415811898374625, + "grad_norm": 0.8427751660346985, + "learning_rate": 4.738468752319567e-06, + "loss": 0.5781, + "step": 2878 + }, + { + "epoch": 0.45431592236073853, + "grad_norm": 0.5845064520835876, + "learning_rate": 4.738283787680039e-06, + "loss": 0.6191, + "step": 2879 + }, + { + "epoch": 0.45447372573773076, + "grad_norm": 0.5917274355888367, + "learning_rate": 4.7380987612693316e-06, + "loss": 0.5851, + "step": 2880 + }, + { + "epoch": 0.45463152911472304, + "grad_norm": 0.5773484110832214, + "learning_rate": 4.737913673092549e-06, + "loss": 0.6031, + "step": 2881 + }, + { + "epoch": 0.4547893324917153, + "grad_norm": 0.6146697402000427, + "learning_rate": 4.7377285231548e-06, + "loss": 0.5843, + "step": 2882 + }, + { + "epoch": 0.4549471358687076, + "grad_norm": 0.578318178653717, + "learning_rate": 4.737543311461194e-06, + "loss": 0.5967, + "step": 2883 + }, + { + "epoch": 0.45510493924569984, + "grad_norm": 0.594019889831543, + "learning_rate": 4.737358038016843e-06, + "loss": 0.5832, + "step": 2884 + }, + { + "epoch": 0.4552627426226921, + "grad_norm": 0.5727084279060364, + "learning_rate": 4.737172702826858e-06, + "loss": 0.6174, + "step": 2885 + }, + { + "epoch": 0.4554205459996844, + "grad_norm": 0.5958375334739685, + "learning_rate": 4.736987305896357e-06, + "loss": 0.5944, + "step": 2886 + }, + { + "epoch": 0.4555783493766767, + "grad_norm": 0.5860264301300049, + "learning_rate": 4.736801847230454e-06, + "loss": 0.5872, + "step": 2887 + }, + { + "epoch": 0.4557361527536689, + "grad_norm": 0.5950831174850464, + "learning_rate": 4.736616326834267e-06, + "loss": 0.6028, + "step": 2888 + }, + { + "epoch": 0.4558939561306612, + "grad_norm": 0.6298971176147461, + "learning_rate": 4.736430744712918e-06, + "loss": 0.6219, + "step": 2889 + }, + { + "epoch": 0.4560517595076535, + "grad_norm": 0.5919068455696106, + "learning_rate": 4.736245100871527e-06, + "loss": 0.6081, + "step": 2890 + }, + { + "epoch": 0.45620956288464576, + "grad_norm": 0.5966792106628418, + "learning_rate": 4.736059395315219e-06, + "loss": 0.6211, + "step": 2891 + }, + { + "epoch": 0.456367366261638, + "grad_norm": 0.5929974913597107, + "learning_rate": 4.735873628049117e-06, + "loss": 0.5635, + "step": 2892 + }, + { + "epoch": 0.45652516963863027, + "grad_norm": 0.5981597304344177, + "learning_rate": 4.7356877990783486e-06, + "loss": 0.5951, + "step": 2893 + }, + { + "epoch": 0.45668297301562255, + "grad_norm": 0.6108375787734985, + "learning_rate": 4.735501908408042e-06, + "loss": 0.6386, + "step": 2894 + }, + { + "epoch": 0.4568407763926148, + "grad_norm": 0.5993546843528748, + "learning_rate": 4.735315956043328e-06, + "loss": 0.5677, + "step": 2895 + }, + { + "epoch": 0.45699857976960706, + "grad_norm": 0.6177210807800293, + "learning_rate": 4.735129941989339e-06, + "loss": 0.5522, + "step": 2896 + }, + { + "epoch": 0.45715638314659934, + "grad_norm": 0.5678161382675171, + "learning_rate": 4.734943866251206e-06, + "loss": 0.5913, + "step": 2897 + }, + { + "epoch": 0.4573141865235916, + "grad_norm": 0.6016837358474731, + "learning_rate": 4.7347577288340665e-06, + "loss": 0.6057, + "step": 2898 + }, + { + "epoch": 0.45747198990058385, + "grad_norm": 0.591022789478302, + "learning_rate": 4.734571529743056e-06, + "loss": 0.5752, + "step": 2899 + }, + { + "epoch": 0.45762979327757614, + "grad_norm": 0.597064733505249, + "learning_rate": 4.734385268983315e-06, + "loss": 0.5992, + "step": 2900 + }, + { + "epoch": 0.4577875966545684, + "grad_norm": 0.6226060390472412, + "learning_rate": 4.734198946559982e-06, + "loss": 0.5824, + "step": 2901 + }, + { + "epoch": 0.4579454000315607, + "grad_norm": 0.609544575214386, + "learning_rate": 4.734012562478199e-06, + "loss": 0.5857, + "step": 2902 + }, + { + "epoch": 0.45810320340855293, + "grad_norm": 0.6841819286346436, + "learning_rate": 4.733826116743112e-06, + "loss": 0.5779, + "step": 2903 + }, + { + "epoch": 0.4582610067855452, + "grad_norm": 0.6322498321533203, + "learning_rate": 4.7336396093598635e-06, + "loss": 0.5911, + "step": 2904 + }, + { + "epoch": 0.4584188101625375, + "grad_norm": 0.5980339646339417, + "learning_rate": 4.7334530403336035e-06, + "loss": 0.581, + "step": 2905 + }, + { + "epoch": 0.4585766135395297, + "grad_norm": 0.5594825148582458, + "learning_rate": 4.733266409669478e-06, + "loss": 0.5773, + "step": 2906 + }, + { + "epoch": 0.458734416916522, + "grad_norm": 0.5995595455169678, + "learning_rate": 4.733079717372639e-06, + "loss": 0.622, + "step": 2907 + }, + { + "epoch": 0.4588922202935143, + "grad_norm": 0.5920237302780151, + "learning_rate": 4.73289296344824e-06, + "loss": 0.604, + "step": 2908 + }, + { + "epoch": 0.45905002367050657, + "grad_norm": 0.5810033082962036, + "learning_rate": 4.732706147901433e-06, + "loss": 0.5962, + "step": 2909 + }, + { + "epoch": 0.4592078270474988, + "grad_norm": 0.5674750804901123, + "learning_rate": 4.732519270737374e-06, + "loss": 0.6205, + "step": 2910 + }, + { + "epoch": 0.4593656304244911, + "grad_norm": 0.6202820539474487, + "learning_rate": 4.7323323319612215e-06, + "loss": 0.5928, + "step": 2911 + }, + { + "epoch": 0.45952343380148336, + "grad_norm": 0.5906978249549866, + "learning_rate": 4.732145331578133e-06, + "loss": 0.5839, + "step": 2912 + }, + { + "epoch": 0.45968123717847564, + "grad_norm": 0.5967821478843689, + "learning_rate": 4.731958269593271e-06, + "loss": 0.6303, + "step": 2913 + }, + { + "epoch": 0.45983904055546787, + "grad_norm": 0.5737401843070984, + "learning_rate": 4.731771146011797e-06, + "loss": 0.5807, + "step": 2914 + }, + { + "epoch": 0.45999684393246015, + "grad_norm": 0.604326069355011, + "learning_rate": 4.7315839608388735e-06, + "loss": 0.5837, + "step": 2915 + }, + { + "epoch": 0.46015464730945244, + "grad_norm": 0.5986136198043823, + "learning_rate": 4.731396714079669e-06, + "loss": 0.5461, + "step": 2916 + }, + { + "epoch": 0.46031245068644466, + "grad_norm": 0.5840728878974915, + "learning_rate": 4.7312094057393495e-06, + "loss": 0.5713, + "step": 2917 + }, + { + "epoch": 0.46047025406343695, + "grad_norm": 0.5816648006439209, + "learning_rate": 4.731022035823085e-06, + "loss": 0.5951, + "step": 2918 + }, + { + "epoch": 0.46062805744042923, + "grad_norm": 0.5871838331222534, + "learning_rate": 4.730834604336048e-06, + "loss": 0.5741, + "step": 2919 + }, + { + "epoch": 0.4607858608174215, + "grad_norm": 0.6083319783210754, + "learning_rate": 4.7306471112834075e-06, + "loss": 0.5856, + "step": 2920 + }, + { + "epoch": 0.46094366419441374, + "grad_norm": 0.6025320291519165, + "learning_rate": 4.7304595566703394e-06, + "loss": 0.6205, + "step": 2921 + }, + { + "epoch": 0.461101467571406, + "grad_norm": 0.5789783596992493, + "learning_rate": 4.730271940502022e-06, + "loss": 0.5873, + "step": 2922 + }, + { + "epoch": 0.4612592709483983, + "grad_norm": 0.547845721244812, + "learning_rate": 4.730084262783629e-06, + "loss": 0.5711, + "step": 2923 + }, + { + "epoch": 0.4614170743253906, + "grad_norm": 0.5932142734527588, + "learning_rate": 4.729896523520343e-06, + "loss": 0.5913, + "step": 2924 + }, + { + "epoch": 0.4615748777023828, + "grad_norm": 0.6596287488937378, + "learning_rate": 4.7297087227173445e-06, + "loss": 0.6234, + "step": 2925 + }, + { + "epoch": 0.4617326810793751, + "grad_norm": 0.616863489151001, + "learning_rate": 4.7295208603798156e-06, + "loss": 0.6013, + "step": 2926 + }, + { + "epoch": 0.4618904844563674, + "grad_norm": 0.5605264902114868, + "learning_rate": 4.72933293651294e-06, + "loss": 0.5919, + "step": 2927 + }, + { + "epoch": 0.46204828783335966, + "grad_norm": 0.6095606088638306, + "learning_rate": 4.729144951121907e-06, + "loss": 0.5834, + "step": 2928 + }, + { + "epoch": 0.4622060912103519, + "grad_norm": 0.5573198795318604, + "learning_rate": 4.728956904211902e-06, + "loss": 0.5563, + "step": 2929 + }, + { + "epoch": 0.46236389458734417, + "grad_norm": 0.5705845952033997, + "learning_rate": 4.728768795788116e-06, + "loss": 0.6007, + "step": 2930 + }, + { + "epoch": 0.46252169796433645, + "grad_norm": 0.5921619534492493, + "learning_rate": 4.728580625855739e-06, + "loss": 0.6009, + "step": 2931 + }, + { + "epoch": 0.4626795013413287, + "grad_norm": 0.5833213925361633, + "learning_rate": 4.728392394419964e-06, + "loss": 0.5806, + "step": 2932 + }, + { + "epoch": 0.46283730471832096, + "grad_norm": 0.5856598615646362, + "learning_rate": 4.728204101485987e-06, + "loss": 0.5827, + "step": 2933 + }, + { + "epoch": 0.46299510809531325, + "grad_norm": 0.5895663499832153, + "learning_rate": 4.728015747059005e-06, + "loss": 0.6017, + "step": 2934 + }, + { + "epoch": 0.46315291147230553, + "grad_norm": 0.6211767196655273, + "learning_rate": 4.727827331144213e-06, + "loss": 0.5695, + "step": 2935 + }, + { + "epoch": 0.46331071484929776, + "grad_norm": 0.5863941311836243, + "learning_rate": 4.727638853746814e-06, + "loss": 0.5747, + "step": 2936 + }, + { + "epoch": 0.46346851822629004, + "grad_norm": 0.6030191779136658, + "learning_rate": 4.727450314872009e-06, + "loss": 0.6132, + "step": 2937 + }, + { + "epoch": 0.4636263216032823, + "grad_norm": 0.5600023865699768, + "learning_rate": 4.727261714524999e-06, + "loss": 0.5813, + "step": 2938 + }, + { + "epoch": 0.4637841249802746, + "grad_norm": 0.5989962220191956, + "learning_rate": 4.727073052710991e-06, + "loss": 0.6104, + "step": 2939 + }, + { + "epoch": 0.46394192835726683, + "grad_norm": 0.574929416179657, + "learning_rate": 4.726884329435192e-06, + "loss": 0.5637, + "step": 2940 + }, + { + "epoch": 0.4640997317342591, + "grad_norm": 0.6025831699371338, + "learning_rate": 4.726695544702808e-06, + "loss": 0.6252, + "step": 2941 + }, + { + "epoch": 0.4642575351112514, + "grad_norm": 0.5847452878952026, + "learning_rate": 4.726506698519051e-06, + "loss": 0.5784, + "step": 2942 + }, + { + "epoch": 0.4644153384882436, + "grad_norm": 0.5764058828353882, + "learning_rate": 4.7263177908891325e-06, + "loss": 0.5933, + "step": 2943 + }, + { + "epoch": 0.4645731418652359, + "grad_norm": 0.5859397053718567, + "learning_rate": 4.726128821818264e-06, + "loss": 0.5737, + "step": 2944 + }, + { + "epoch": 0.4647309452422282, + "grad_norm": 0.6112645864486694, + "learning_rate": 4.725939791311663e-06, + "loss": 0.616, + "step": 2945 + }, + { + "epoch": 0.46488874861922047, + "grad_norm": 0.6161361336708069, + "learning_rate": 4.725750699374545e-06, + "loss": 0.615, + "step": 2946 + }, + { + "epoch": 0.4650465519962127, + "grad_norm": 0.6178465485572815, + "learning_rate": 4.72556154601213e-06, + "loss": 0.5477, + "step": 2947 + }, + { + "epoch": 0.465204355373205, + "grad_norm": 0.5982931852340698, + "learning_rate": 4.725372331229635e-06, + "loss": 0.5737, + "step": 2948 + }, + { + "epoch": 0.46536215875019726, + "grad_norm": 0.6100964546203613, + "learning_rate": 4.725183055032286e-06, + "loss": 0.5895, + "step": 2949 + }, + { + "epoch": 0.46551996212718955, + "grad_norm": 0.6155681014060974, + "learning_rate": 4.7249937174253025e-06, + "loss": 0.5672, + "step": 2950 + }, + { + "epoch": 0.4656777655041818, + "grad_norm": 0.6069581508636475, + "learning_rate": 4.724804318413912e-06, + "loss": 0.5803, + "step": 2951 + }, + { + "epoch": 0.46583556888117406, + "grad_norm": 0.6109415292739868, + "learning_rate": 4.7246148580033415e-06, + "loss": 0.5891, + "step": 2952 + }, + { + "epoch": 0.46599337225816634, + "grad_norm": 0.5722568035125732, + "learning_rate": 4.724425336198819e-06, + "loss": 0.5952, + "step": 2953 + }, + { + "epoch": 0.46615117563515857, + "grad_norm": 0.6079490184783936, + "learning_rate": 4.724235753005575e-06, + "loss": 0.5642, + "step": 2954 + }, + { + "epoch": 0.46630897901215085, + "grad_norm": 0.6075934767723083, + "learning_rate": 4.724046108428842e-06, + "loss": 0.612, + "step": 2955 + }, + { + "epoch": 0.46646678238914313, + "grad_norm": 0.6164495944976807, + "learning_rate": 4.723856402473853e-06, + "loss": 0.5676, + "step": 2956 + }, + { + "epoch": 0.4666245857661354, + "grad_norm": 0.5699167847633362, + "learning_rate": 4.723666635145843e-06, + "loss": 0.6133, + "step": 2957 + }, + { + "epoch": 0.46678238914312764, + "grad_norm": 0.5709851384162903, + "learning_rate": 4.723476806450051e-06, + "loss": 0.6019, + "step": 2958 + }, + { + "epoch": 0.4669401925201199, + "grad_norm": 0.6120640635490417, + "learning_rate": 4.723286916391715e-06, + "loss": 0.5899, + "step": 2959 + }, + { + "epoch": 0.4670979958971122, + "grad_norm": 0.6327582597732544, + "learning_rate": 4.723096964976075e-06, + "loss": 0.622, + "step": 2960 + }, + { + "epoch": 0.4672557992741045, + "grad_norm": 0.5528719425201416, + "learning_rate": 4.722906952208373e-06, + "loss": 0.6008, + "step": 2961 + }, + { + "epoch": 0.4674136026510967, + "grad_norm": 0.5869691371917725, + "learning_rate": 4.722716878093853e-06, + "loss": 0.6251, + "step": 2962 + }, + { + "epoch": 0.467571406028089, + "grad_norm": 0.5889107584953308, + "learning_rate": 4.7225267426377615e-06, + "loss": 0.5572, + "step": 2963 + }, + { + "epoch": 0.4677292094050813, + "grad_norm": 0.5657742619514465, + "learning_rate": 4.722336545845344e-06, + "loss": 0.588, + "step": 2964 + }, + { + "epoch": 0.46788701278207356, + "grad_norm": 0.5866580605506897, + "learning_rate": 4.7221462877218524e-06, + "loss": 0.629, + "step": 2965 + }, + { + "epoch": 0.4680448161590658, + "grad_norm": 0.5991636514663696, + "learning_rate": 4.721955968272534e-06, + "loss": 0.5906, + "step": 2966 + }, + { + "epoch": 0.4682026195360581, + "grad_norm": 0.61302649974823, + "learning_rate": 4.721765587502644e-06, + "loss": 0.5972, + "step": 2967 + }, + { + "epoch": 0.46836042291305036, + "grad_norm": 0.582893431186676, + "learning_rate": 4.721575145417434e-06, + "loss": 0.5894, + "step": 2968 + }, + { + "epoch": 0.4685182262900426, + "grad_norm": 0.6155790090560913, + "learning_rate": 4.721384642022161e-06, + "loss": 0.5958, + "step": 2969 + }, + { + "epoch": 0.46867602966703487, + "grad_norm": 0.6005142331123352, + "learning_rate": 4.721194077322084e-06, + "loss": 0.5641, + "step": 2970 + }, + { + "epoch": 0.46883383304402715, + "grad_norm": 0.5961930155754089, + "learning_rate": 4.721003451322458e-06, + "loss": 0.5814, + "step": 2971 + }, + { + "epoch": 0.46899163642101943, + "grad_norm": 0.6125468015670776, + "learning_rate": 4.7208127640285475e-06, + "loss": 0.6088, + "step": 2972 + }, + { + "epoch": 0.46914943979801166, + "grad_norm": 0.5798413753509521, + "learning_rate": 4.720622015445614e-06, + "loss": 0.577, + "step": 2973 + }, + { + "epoch": 0.46930724317500394, + "grad_norm": 0.6021834015846252, + "learning_rate": 4.720431205578922e-06, + "loss": 0.5842, + "step": 2974 + }, + { + "epoch": 0.4694650465519962, + "grad_norm": 0.575585126876831, + "learning_rate": 4.720240334433735e-06, + "loss": 0.6027, + "step": 2975 + }, + { + "epoch": 0.4696228499289885, + "grad_norm": 0.5662208199501038, + "learning_rate": 4.720049402015324e-06, + "loss": 0.5809, + "step": 2976 + }, + { + "epoch": 0.46978065330598073, + "grad_norm": 0.6301442980766296, + "learning_rate": 4.719858408328956e-06, + "loss": 0.6217, + "step": 2977 + }, + { + "epoch": 0.469938456682973, + "grad_norm": 0.6608861684799194, + "learning_rate": 4.719667353379902e-06, + "loss": 0.5831, + "step": 2978 + }, + { + "epoch": 0.4700962600599653, + "grad_norm": 0.5827069282531738, + "learning_rate": 4.719476237173436e-06, + "loss": 0.5797, + "step": 2979 + }, + { + "epoch": 0.4702540634369575, + "grad_norm": 0.6114890575408936, + "learning_rate": 4.719285059714832e-06, + "loss": 0.5761, + "step": 2980 + }, + { + "epoch": 0.4704118668139498, + "grad_norm": 0.5643553733825684, + "learning_rate": 4.719093821009366e-06, + "loss": 0.5892, + "step": 2981 + }, + { + "epoch": 0.4705696701909421, + "grad_norm": 0.5742526650428772, + "learning_rate": 4.718902521062315e-06, + "loss": 0.5765, + "step": 2982 + }, + { + "epoch": 0.4707274735679344, + "grad_norm": 0.5806805491447449, + "learning_rate": 4.718711159878958e-06, + "loss": 0.6334, + "step": 2983 + }, + { + "epoch": 0.4708852769449266, + "grad_norm": 0.6013113260269165, + "learning_rate": 4.718519737464579e-06, + "loss": 0.5941, + "step": 2984 + }, + { + "epoch": 0.4710430803219189, + "grad_norm": 0.5972934365272522, + "learning_rate": 4.718328253824457e-06, + "loss": 0.5821, + "step": 2985 + }, + { + "epoch": 0.47120088369891117, + "grad_norm": 0.5813754200935364, + "learning_rate": 4.718136708963878e-06, + "loss": 0.5991, + "step": 2986 + }, + { + "epoch": 0.47135868707590345, + "grad_norm": 0.6155495643615723, + "learning_rate": 4.717945102888129e-06, + "loss": 0.5745, + "step": 2987 + }, + { + "epoch": 0.4715164904528957, + "grad_norm": 0.601281464099884, + "learning_rate": 4.717753435602498e-06, + "loss": 0.5803, + "step": 2988 + }, + { + "epoch": 0.47167429382988796, + "grad_norm": 0.6119242310523987, + "learning_rate": 4.717561707112272e-06, + "loss": 0.5722, + "step": 2989 + }, + { + "epoch": 0.47183209720688024, + "grad_norm": 0.588280439376831, + "learning_rate": 4.717369917422745e-06, + "loss": 0.5762, + "step": 2990 + }, + { + "epoch": 0.47198990058387247, + "grad_norm": 0.564857542514801, + "learning_rate": 4.7171780665392095e-06, + "loss": 0.5875, + "step": 2991 + }, + { + "epoch": 0.47214770396086475, + "grad_norm": 0.6129816770553589, + "learning_rate": 4.7169861544669585e-06, + "loss": 0.5833, + "step": 2992 + }, + { + "epoch": 0.47230550733785703, + "grad_norm": 0.5612311959266663, + "learning_rate": 4.71679418121129e-06, + "loss": 0.5625, + "step": 2993 + }, + { + "epoch": 0.4724633107148493, + "grad_norm": 0.5624921321868896, + "learning_rate": 4.7166021467775015e-06, + "loss": 0.56, + "step": 2994 + }, + { + "epoch": 0.47262111409184154, + "grad_norm": 0.5691313147544861, + "learning_rate": 4.716410051170892e-06, + "loss": 0.6107, + "step": 2995 + }, + { + "epoch": 0.4727789174688338, + "grad_norm": 0.5877928137779236, + "learning_rate": 4.716217894396764e-06, + "loss": 0.614, + "step": 2996 + }, + { + "epoch": 0.4729367208458261, + "grad_norm": 0.5803868770599365, + "learning_rate": 4.71602567646042e-06, + "loss": 0.5943, + "step": 2997 + }, + { + "epoch": 0.4730945242228184, + "grad_norm": 0.5795320272445679, + "learning_rate": 4.715833397367164e-06, + "loss": 0.6027, + "step": 2998 + }, + { + "epoch": 0.4732523275998106, + "grad_norm": 0.5946059823036194, + "learning_rate": 4.7156410571223035e-06, + "loss": 0.6138, + "step": 2999 + }, + { + "epoch": 0.4734101309768029, + "grad_norm": 0.5927489995956421, + "learning_rate": 4.715448655731146e-06, + "loss": 0.623, + "step": 3000 + }, + { + "epoch": 0.4735679343537952, + "grad_norm": 0.5845480561256409, + "learning_rate": 4.715256193199003e-06, + "loss": 0.5828, + "step": 3001 + }, + { + "epoch": 0.47372573773078747, + "grad_norm": 0.6194727420806885, + "learning_rate": 4.715063669531183e-06, + "loss": 0.6055, + "step": 3002 + }, + { + "epoch": 0.4738835411077797, + "grad_norm": 0.5845394730567932, + "learning_rate": 4.714871084733001e-06, + "loss": 0.5638, + "step": 3003 + }, + { + "epoch": 0.474041344484772, + "grad_norm": 0.5682274699211121, + "learning_rate": 4.714678438809772e-06, + "loss": 0.5635, + "step": 3004 + }, + { + "epoch": 0.47419914786176426, + "grad_norm": 0.5745446681976318, + "learning_rate": 4.7144857317668125e-06, + "loss": 0.5887, + "step": 3005 + }, + { + "epoch": 0.4743569512387565, + "grad_norm": 0.5911332368850708, + "learning_rate": 4.71429296360944e-06, + "loss": 0.6039, + "step": 3006 + }, + { + "epoch": 0.47451475461574877, + "grad_norm": 0.5915188789367676, + "learning_rate": 4.714100134342975e-06, + "loss": 0.5808, + "step": 3007 + }, + { + "epoch": 0.47467255799274105, + "grad_norm": 0.5851894617080688, + "learning_rate": 4.713907243972739e-06, + "loss": 0.5885, + "step": 3008 + }, + { + "epoch": 0.47483036136973333, + "grad_norm": 0.6122081279754639, + "learning_rate": 4.713714292504056e-06, + "loss": 0.595, + "step": 3009 + }, + { + "epoch": 0.47498816474672556, + "grad_norm": 0.5786735415458679, + "learning_rate": 4.713521279942249e-06, + "loss": 0.6209, + "step": 3010 + }, + { + "epoch": 0.47514596812371784, + "grad_norm": 0.5710821151733398, + "learning_rate": 4.713328206292647e-06, + "loss": 0.5866, + "step": 3011 + }, + { + "epoch": 0.4753037715007101, + "grad_norm": 0.5784355998039246, + "learning_rate": 4.713135071560577e-06, + "loss": 0.556, + "step": 3012 + }, + { + "epoch": 0.4754615748777024, + "grad_norm": 0.6229528188705444, + "learning_rate": 4.712941875751369e-06, + "loss": 0.6104, + "step": 3013 + }, + { + "epoch": 0.47561937825469464, + "grad_norm": 0.5802075862884521, + "learning_rate": 4.712748618870355e-06, + "loss": 0.5931, + "step": 3014 + }, + { + "epoch": 0.4757771816316869, + "grad_norm": 0.6040281057357788, + "learning_rate": 4.71255530092287e-06, + "loss": 0.5693, + "step": 3015 + }, + { + "epoch": 0.4759349850086792, + "grad_norm": 0.6118032932281494, + "learning_rate": 4.712361921914246e-06, + "loss": 0.5937, + "step": 3016 + }, + { + "epoch": 0.47609278838567143, + "grad_norm": 0.7581448554992676, + "learning_rate": 4.712168481849822e-06, + "loss": 0.5706, + "step": 3017 + }, + { + "epoch": 0.4762505917626637, + "grad_norm": 0.5811131596565247, + "learning_rate": 4.711974980734936e-06, + "loss": 0.6157, + "step": 3018 + }, + { + "epoch": 0.476408395139656, + "grad_norm": 0.6181474924087524, + "learning_rate": 4.7117814185749275e-06, + "loss": 0.5729, + "step": 3019 + }, + { + "epoch": 0.4765661985166483, + "grad_norm": 0.5954312682151794, + "learning_rate": 4.71158779537514e-06, + "loss": 0.583, + "step": 3020 + }, + { + "epoch": 0.4767240018936405, + "grad_norm": 0.6045989990234375, + "learning_rate": 4.711394111140916e-06, + "loss": 0.5843, + "step": 3021 + }, + { + "epoch": 0.4768818052706328, + "grad_norm": 0.5926102995872498, + "learning_rate": 4.711200365877599e-06, + "loss": 0.5675, + "step": 3022 + }, + { + "epoch": 0.47703960864762507, + "grad_norm": 0.5635679364204407, + "learning_rate": 4.711006559590539e-06, + "loss": 0.6003, + "step": 3023 + }, + { + "epoch": 0.47719741202461735, + "grad_norm": 0.5878939628601074, + "learning_rate": 4.710812692285083e-06, + "loss": 0.6057, + "step": 3024 + }, + { + "epoch": 0.4773552154016096, + "grad_norm": 0.5919951796531677, + "learning_rate": 4.7106187639665805e-06, + "loss": 0.6198, + "step": 3025 + }, + { + "epoch": 0.47751301877860186, + "grad_norm": 0.6087184548377991, + "learning_rate": 4.710424774640385e-06, + "loss": 0.6237, + "step": 3026 + }, + { + "epoch": 0.47767082215559414, + "grad_norm": 0.6335252523422241, + "learning_rate": 4.71023072431185e-06, + "loss": 0.5553, + "step": 3027 + }, + { + "epoch": 0.47782862553258637, + "grad_norm": 0.6105009913444519, + "learning_rate": 4.710036612986329e-06, + "loss": 0.5974, + "step": 3028 + }, + { + "epoch": 0.47798642890957865, + "grad_norm": 0.5697445869445801, + "learning_rate": 4.7098424406691814e-06, + "loss": 0.5794, + "step": 3029 + }, + { + "epoch": 0.47814423228657094, + "grad_norm": 0.5782665014266968, + "learning_rate": 4.709648207365765e-06, + "loss": 0.6005, + "step": 3030 + }, + { + "epoch": 0.4783020356635632, + "grad_norm": 0.5699340105056763, + "learning_rate": 4.709453913081438e-06, + "loss": 0.595, + "step": 3031 + }, + { + "epoch": 0.47845983904055545, + "grad_norm": 0.6389720439910889, + "learning_rate": 4.709259557821566e-06, + "loss": 0.5968, + "step": 3032 + }, + { + "epoch": 0.47861764241754773, + "grad_norm": 0.5838049054145813, + "learning_rate": 4.709065141591511e-06, + "loss": 0.614, + "step": 3033 + }, + { + "epoch": 0.47877544579454, + "grad_norm": 0.6107947826385498, + "learning_rate": 4.708870664396639e-06, + "loss": 0.6014, + "step": 3034 + }, + { + "epoch": 0.4789332491715323, + "grad_norm": 0.604834258556366, + "learning_rate": 4.708676126242315e-06, + "loss": 0.5778, + "step": 3035 + }, + { + "epoch": 0.4790910525485245, + "grad_norm": 0.5965508222579956, + "learning_rate": 4.70848152713391e-06, + "loss": 0.5688, + "step": 3036 + }, + { + "epoch": 0.4792488559255168, + "grad_norm": 0.5740787982940674, + "learning_rate": 4.708286867076795e-06, + "loss": 0.5551, + "step": 3037 + }, + { + "epoch": 0.4794066593025091, + "grad_norm": 0.5963127017021179, + "learning_rate": 4.7080921460763405e-06, + "loss": 0.5752, + "step": 3038 + }, + { + "epoch": 0.47956446267950137, + "grad_norm": 0.5920292139053345, + "learning_rate": 4.70789736413792e-06, + "loss": 0.6153, + "step": 3039 + }, + { + "epoch": 0.4797222660564936, + "grad_norm": 0.5915069580078125, + "learning_rate": 4.707702521266911e-06, + "loss": 0.6222, + "step": 3040 + }, + { + "epoch": 0.4798800694334859, + "grad_norm": 0.599689245223999, + "learning_rate": 4.707507617468689e-06, + "loss": 0.5881, + "step": 3041 + }, + { + "epoch": 0.48003787281047816, + "grad_norm": 0.6121731996536255, + "learning_rate": 4.7073126527486335e-06, + "loss": 0.6006, + "step": 3042 + }, + { + "epoch": 0.4801956761874704, + "grad_norm": 0.6022850275039673, + "learning_rate": 4.707117627112125e-06, + "loss": 0.603, + "step": 3043 + }, + { + "epoch": 0.48035347956446267, + "grad_norm": 0.6178485155105591, + "learning_rate": 4.7069225405645456e-06, + "loss": 0.573, + "step": 3044 + }, + { + "epoch": 0.48051128294145495, + "grad_norm": 0.5798830986022949, + "learning_rate": 4.706727393111279e-06, + "loss": 0.609, + "step": 3045 + }, + { + "epoch": 0.48066908631844724, + "grad_norm": 0.6462085843086243, + "learning_rate": 4.706532184757712e-06, + "loss": 0.5749, + "step": 3046 + }, + { + "epoch": 0.48082688969543946, + "grad_norm": 0.6168819665908813, + "learning_rate": 4.70633691550923e-06, + "loss": 0.5737, + "step": 3047 + }, + { + "epoch": 0.48098469307243175, + "grad_norm": 0.6046976447105408, + "learning_rate": 4.706141585371223e-06, + "loss": 0.6269, + "step": 3048 + }, + { + "epoch": 0.48114249644942403, + "grad_norm": 0.6107098460197449, + "learning_rate": 4.705946194349082e-06, + "loss": 0.6005, + "step": 3049 + }, + { + "epoch": 0.4813002998264163, + "grad_norm": 0.5939887762069702, + "learning_rate": 4.705750742448199e-06, + "loss": 0.5682, + "step": 3050 + }, + { + "epoch": 0.48145810320340854, + "grad_norm": 0.5918442606925964, + "learning_rate": 4.705555229673968e-06, + "loss": 0.5931, + "step": 3051 + }, + { + "epoch": 0.4816159065804008, + "grad_norm": 0.5955492854118347, + "learning_rate": 4.705359656031784e-06, + "loss": 0.5635, + "step": 3052 + }, + { + "epoch": 0.4817737099573931, + "grad_norm": 0.6209884285926819, + "learning_rate": 4.705164021527045e-06, + "loss": 0.5735, + "step": 3053 + }, + { + "epoch": 0.48193151333438533, + "grad_norm": 0.5929357409477234, + "learning_rate": 4.704968326165151e-06, + "loss": 0.5985, + "step": 3054 + }, + { + "epoch": 0.4820893167113776, + "grad_norm": 0.5917227268218994, + "learning_rate": 4.7047725699515e-06, + "loss": 0.6287, + "step": 3055 + }, + { + "epoch": 0.4822471200883699, + "grad_norm": 0.6481499075889587, + "learning_rate": 4.7045767528914964e-06, + "loss": 0.6042, + "step": 3056 + }, + { + "epoch": 0.4824049234653622, + "grad_norm": 0.5895054340362549, + "learning_rate": 4.704380874990544e-06, + "loss": 0.6067, + "step": 3057 + }, + { + "epoch": 0.4825627268423544, + "grad_norm": 0.5863064527511597, + "learning_rate": 4.704184936254049e-06, + "loss": 0.5694, + "step": 3058 + }, + { + "epoch": 0.4827205302193467, + "grad_norm": 0.5561804175376892, + "learning_rate": 4.703988936687419e-06, + "loss": 0.5646, + "step": 3059 + }, + { + "epoch": 0.48287833359633897, + "grad_norm": 0.58956378698349, + "learning_rate": 4.7037928762960615e-06, + "loss": 0.5943, + "step": 3060 + }, + { + "epoch": 0.48303613697333125, + "grad_norm": 0.5777053236961365, + "learning_rate": 4.703596755085388e-06, + "loss": 0.5582, + "step": 3061 + }, + { + "epoch": 0.4831939403503235, + "grad_norm": 0.5734872817993164, + "learning_rate": 4.703400573060811e-06, + "loss": 0.5631, + "step": 3062 + }, + { + "epoch": 0.48335174372731576, + "grad_norm": 0.6208211183547974, + "learning_rate": 4.7032043302277455e-06, + "loss": 0.6289, + "step": 3063 + }, + { + "epoch": 0.48350954710430805, + "grad_norm": 0.6116321086883545, + "learning_rate": 4.7030080265916066e-06, + "loss": 0.577, + "step": 3064 + }, + { + "epoch": 0.4836673504813003, + "grad_norm": 0.6016972661018372, + "learning_rate": 4.702811662157812e-06, + "loss": 0.6192, + "step": 3065 + }, + { + "epoch": 0.48382515385829256, + "grad_norm": 0.6624463796615601, + "learning_rate": 4.70261523693178e-06, + "loss": 0.6196, + "step": 3066 + }, + { + "epoch": 0.48398295723528484, + "grad_norm": 0.5907204151153564, + "learning_rate": 4.702418750918933e-06, + "loss": 0.576, + "step": 3067 + }, + { + "epoch": 0.4841407606122771, + "grad_norm": 0.5800524950027466, + "learning_rate": 4.702222204124693e-06, + "loss": 0.5824, + "step": 3068 + }, + { + "epoch": 0.48429856398926935, + "grad_norm": 0.6021711826324463, + "learning_rate": 4.702025596554482e-06, + "loss": 0.5777, + "step": 3069 + }, + { + "epoch": 0.48445636736626163, + "grad_norm": 0.6027588248252869, + "learning_rate": 4.70182892821373e-06, + "loss": 0.6054, + "step": 3070 + }, + { + "epoch": 0.4846141707432539, + "grad_norm": 0.6037135124206543, + "learning_rate": 4.701632199107862e-06, + "loss": 0.5863, + "step": 3071 + }, + { + "epoch": 0.4847719741202462, + "grad_norm": 0.6141204237937927, + "learning_rate": 4.701435409242306e-06, + "loss": 0.602, + "step": 3072 + }, + { + "epoch": 0.4849297774972384, + "grad_norm": 0.5746604800224304, + "learning_rate": 4.701238558622496e-06, + "loss": 0.5804, + "step": 3073 + }, + { + "epoch": 0.4850875808742307, + "grad_norm": 0.5782328844070435, + "learning_rate": 4.701041647253864e-06, + "loss": 0.5757, + "step": 3074 + }, + { + "epoch": 0.485245384251223, + "grad_norm": 0.5975884199142456, + "learning_rate": 4.700844675141842e-06, + "loss": 0.5642, + "step": 3075 + }, + { + "epoch": 0.48540318762821527, + "grad_norm": 0.5864390134811401, + "learning_rate": 4.700647642291868e-06, + "loss": 0.604, + "step": 3076 + }, + { + "epoch": 0.4855609910052075, + "grad_norm": 0.590674638748169, + "learning_rate": 4.700450548709378e-06, + "loss": 0.5884, + "step": 3077 + }, + { + "epoch": 0.4857187943821998, + "grad_norm": 0.5809804797172546, + "learning_rate": 4.700253394399814e-06, + "loss": 0.5518, + "step": 3078 + }, + { + "epoch": 0.48587659775919206, + "grad_norm": 0.5847716927528381, + "learning_rate": 4.700056179368614e-06, + "loss": 0.5327, + "step": 3079 + }, + { + "epoch": 0.4860344011361843, + "grad_norm": 0.5957671403884888, + "learning_rate": 4.699858903621223e-06, + "loss": 0.5774, + "step": 3080 + }, + { + "epoch": 0.4861922045131766, + "grad_norm": 0.6369858384132385, + "learning_rate": 4.699661567163083e-06, + "loss": 0.5984, + "step": 3081 + }, + { + "epoch": 0.48635000789016886, + "grad_norm": 0.5806320309638977, + "learning_rate": 4.699464169999642e-06, + "loss": 0.5862, + "step": 3082 + }, + { + "epoch": 0.48650781126716114, + "grad_norm": 0.5939436554908752, + "learning_rate": 4.6992667121363465e-06, + "loss": 0.6023, + "step": 3083 + }, + { + "epoch": 0.48666561464415337, + "grad_norm": 0.6427435278892517, + "learning_rate": 4.699069193578647e-06, + "loss": 0.5759, + "step": 3084 + }, + { + "epoch": 0.48682341802114565, + "grad_norm": 0.5977331399917603, + "learning_rate": 4.6988716143319935e-06, + "loss": 0.5851, + "step": 3085 + }, + { + "epoch": 0.48698122139813793, + "grad_norm": 0.6046851873397827, + "learning_rate": 4.6986739744018395e-06, + "loss": 0.5825, + "step": 3086 + }, + { + "epoch": 0.4871390247751302, + "grad_norm": 0.641221284866333, + "learning_rate": 4.698476273793638e-06, + "loss": 0.5961, + "step": 3087 + }, + { + "epoch": 0.48729682815212244, + "grad_norm": 0.5893169641494751, + "learning_rate": 4.6982785125128475e-06, + "loss": 0.5562, + "step": 3088 + }, + { + "epoch": 0.4874546315291147, + "grad_norm": 0.6326732039451599, + "learning_rate": 4.698080690564923e-06, + "loss": 0.5805, + "step": 3089 + }, + { + "epoch": 0.487612434906107, + "grad_norm": 0.6113176941871643, + "learning_rate": 4.697882807955325e-06, + "loss": 0.5978, + "step": 3090 + }, + { + "epoch": 0.48777023828309923, + "grad_norm": 0.6093212962150574, + "learning_rate": 4.697684864689515e-06, + "loss": 0.6005, + "step": 3091 + }, + { + "epoch": 0.4879280416600915, + "grad_norm": 0.613431453704834, + "learning_rate": 4.697486860772956e-06, + "loss": 0.5914, + "step": 3092 + }, + { + "epoch": 0.4880858450370838, + "grad_norm": 0.6007431149482727, + "learning_rate": 4.6972887962111114e-06, + "loss": 0.556, + "step": 3093 + }, + { + "epoch": 0.4882436484140761, + "grad_norm": 0.5986007452011108, + "learning_rate": 4.697090671009448e-06, + "loss": 0.6151, + "step": 3094 + }, + { + "epoch": 0.4884014517910683, + "grad_norm": 0.5769327878952026, + "learning_rate": 4.696892485173433e-06, + "loss": 0.5883, + "step": 3095 + }, + { + "epoch": 0.4885592551680606, + "grad_norm": 0.597375214099884, + "learning_rate": 4.696694238708537e-06, + "loss": 0.6078, + "step": 3096 + }, + { + "epoch": 0.4887170585450529, + "grad_norm": 0.6027871370315552, + "learning_rate": 4.696495931620229e-06, + "loss": 0.5907, + "step": 3097 + }, + { + "epoch": 0.48887486192204516, + "grad_norm": 0.5727100968360901, + "learning_rate": 4.696297563913984e-06, + "loss": 0.5944, + "step": 3098 + }, + { + "epoch": 0.4890326652990374, + "grad_norm": 0.5940069556236267, + "learning_rate": 4.696099135595276e-06, + "loss": 0.5802, + "step": 3099 + }, + { + "epoch": 0.48919046867602967, + "grad_norm": 0.5794159770011902, + "learning_rate": 4.69590064666958e-06, + "loss": 0.6098, + "step": 3100 + }, + { + "epoch": 0.48934827205302195, + "grad_norm": 0.5688369274139404, + "learning_rate": 4.695702097142375e-06, + "loss": 0.5837, + "step": 3101 + }, + { + "epoch": 0.4895060754300142, + "grad_norm": 0.5906004905700684, + "learning_rate": 4.695503487019139e-06, + "loss": 0.5716, + "step": 3102 + }, + { + "epoch": 0.48966387880700646, + "grad_norm": 0.6226703524589539, + "learning_rate": 4.695304816305356e-06, + "loss": 0.5971, + "step": 3103 + }, + { + "epoch": 0.48982168218399874, + "grad_norm": 0.6135688424110413, + "learning_rate": 4.695106085006505e-06, + "loss": 0.6227, + "step": 3104 + }, + { + "epoch": 0.489979485560991, + "grad_norm": 0.578870415687561, + "learning_rate": 4.694907293128074e-06, + "loss": 0.5763, + "step": 3105 + }, + { + "epoch": 0.49013728893798325, + "grad_norm": 0.615856945514679, + "learning_rate": 4.694708440675547e-06, + "loss": 0.6135, + "step": 3106 + }, + { + "epoch": 0.49029509231497553, + "grad_norm": 0.6473188400268555, + "learning_rate": 4.694509527654413e-06, + "loss": 0.566, + "step": 3107 + }, + { + "epoch": 0.4904528956919678, + "grad_norm": 0.5914477705955505, + "learning_rate": 4.694310554070161e-06, + "loss": 0.5831, + "step": 3108 + }, + { + "epoch": 0.4906106990689601, + "grad_norm": 0.5841027498245239, + "learning_rate": 4.694111519928282e-06, + "loss": 0.6031, + "step": 3109 + }, + { + "epoch": 0.4907685024459523, + "grad_norm": 0.5695486664772034, + "learning_rate": 4.693912425234268e-06, + "loss": 0.5761, + "step": 3110 + }, + { + "epoch": 0.4909263058229446, + "grad_norm": 0.6154880523681641, + "learning_rate": 4.6937132699936165e-06, + "loss": 0.6422, + "step": 3111 + }, + { + "epoch": 0.4910841091999369, + "grad_norm": 0.5972318649291992, + "learning_rate": 4.6935140542118205e-06, + "loss": 0.602, + "step": 3112 + }, + { + "epoch": 0.4912419125769292, + "grad_norm": 0.6227360963821411, + "learning_rate": 4.693314777894379e-06, + "loss": 0.5989, + "step": 3113 + }, + { + "epoch": 0.4913997159539214, + "grad_norm": 0.6059128046035767, + "learning_rate": 4.693115441046793e-06, + "loss": 0.5959, + "step": 3114 + }, + { + "epoch": 0.4915575193309137, + "grad_norm": 0.613560140132904, + "learning_rate": 4.692916043674561e-06, + "loss": 0.5812, + "step": 3115 + }, + { + "epoch": 0.49171532270790597, + "grad_norm": 0.610429048538208, + "learning_rate": 4.692716585783188e-06, + "loss": 0.5835, + "step": 3116 + }, + { + "epoch": 0.4918731260848982, + "grad_norm": 0.5731315016746521, + "learning_rate": 4.692517067378178e-06, + "loss": 0.5685, + "step": 3117 + }, + { + "epoch": 0.4920309294618905, + "grad_norm": 0.6124387383460999, + "learning_rate": 4.692317488465037e-06, + "loss": 0.6347, + "step": 3118 + }, + { + "epoch": 0.49218873283888276, + "grad_norm": 0.6189008951187134, + "learning_rate": 4.692117849049272e-06, + "loss": 0.5076, + "step": 3119 + }, + { + "epoch": 0.49234653621587504, + "grad_norm": 0.5897554159164429, + "learning_rate": 4.691918149136396e-06, + "loss": 0.6034, + "step": 3120 + }, + { + "epoch": 0.49250433959286727, + "grad_norm": 0.6314239501953125, + "learning_rate": 4.691718388731915e-06, + "loss": 0.5781, + "step": 3121 + }, + { + "epoch": 0.49266214296985955, + "grad_norm": 0.5917624235153198, + "learning_rate": 4.691518567841346e-06, + "loss": 0.596, + "step": 3122 + }, + { + "epoch": 0.49281994634685183, + "grad_norm": 0.6129161715507507, + "learning_rate": 4.691318686470202e-06, + "loss": 0.5859, + "step": 3123 + }, + { + "epoch": 0.4929777497238441, + "grad_norm": 0.6120344400405884, + "learning_rate": 4.691118744624e-06, + "loss": 0.527, + "step": 3124 + }, + { + "epoch": 0.49313555310083634, + "grad_norm": 0.5831770896911621, + "learning_rate": 4.690918742308257e-06, + "loss": 0.6105, + "step": 3125 + }, + { + "epoch": 0.4932933564778286, + "grad_norm": 0.6271287798881531, + "learning_rate": 4.690718679528492e-06, + "loss": 0.6298, + "step": 3126 + }, + { + "epoch": 0.4934511598548209, + "grad_norm": 0.6003296375274658, + "learning_rate": 4.690518556290229e-06, + "loss": 0.577, + "step": 3127 + }, + { + "epoch": 0.49360896323181314, + "grad_norm": 0.5769280791282654, + "learning_rate": 4.690318372598988e-06, + "loss": 0.5415, + "step": 3128 + }, + { + "epoch": 0.4937667666088054, + "grad_norm": 0.6033050417900085, + "learning_rate": 4.6901181284602945e-06, + "loss": 0.5857, + "step": 3129 + }, + { + "epoch": 0.4939245699857977, + "grad_norm": 0.6032170653343201, + "learning_rate": 4.6899178238796765e-06, + "loss": 0.582, + "step": 3130 + }, + { + "epoch": 0.49408237336279, + "grad_norm": 0.6089653968811035, + "learning_rate": 4.689717458862659e-06, + "loss": 0.6043, + "step": 3131 + }, + { + "epoch": 0.4942401767397822, + "grad_norm": 0.6052002310752869, + "learning_rate": 4.689517033414773e-06, + "loss": 0.5951, + "step": 3132 + }, + { + "epoch": 0.4943979801167745, + "grad_norm": 0.6291135549545288, + "learning_rate": 4.68931654754155e-06, + "loss": 0.5671, + "step": 3133 + }, + { + "epoch": 0.4945557834937668, + "grad_norm": 0.5796050429344177, + "learning_rate": 4.6891160012485225e-06, + "loss": 0.5999, + "step": 3134 + }, + { + "epoch": 0.49471358687075906, + "grad_norm": 0.6120935678482056, + "learning_rate": 4.688915394541225e-06, + "loss": 0.5872, + "step": 3135 + }, + { + "epoch": 0.4948713902477513, + "grad_norm": 0.549192488193512, + "learning_rate": 4.688714727425194e-06, + "loss": 0.5808, + "step": 3136 + }, + { + "epoch": 0.49502919362474357, + "grad_norm": 0.5568338632583618, + "learning_rate": 4.688513999905968e-06, + "loss": 0.5515, + "step": 3137 + }, + { + "epoch": 0.49518699700173585, + "grad_norm": 0.5796235203742981, + "learning_rate": 4.688313211989086e-06, + "loss": 0.5904, + "step": 3138 + }, + { + "epoch": 0.4953448003787281, + "grad_norm": 0.610554575920105, + "learning_rate": 4.688112363680089e-06, + "loss": 0.5489, + "step": 3139 + }, + { + "epoch": 0.49550260375572036, + "grad_norm": 0.5701038241386414, + "learning_rate": 4.68791145498452e-06, + "loss": 0.5946, + "step": 3140 + }, + { + "epoch": 0.49566040713271264, + "grad_norm": 0.5921506285667419, + "learning_rate": 4.6877104859079234e-06, + "loss": 0.5758, + "step": 3141 + }, + { + "epoch": 0.4958182105097049, + "grad_norm": 0.574824869632721, + "learning_rate": 4.687509456455847e-06, + "loss": 0.5406, + "step": 3142 + }, + { + "epoch": 0.49597601388669715, + "grad_norm": 0.6252639293670654, + "learning_rate": 4.687308366633836e-06, + "loss": 0.5593, + "step": 3143 + }, + { + "epoch": 0.49613381726368944, + "grad_norm": 0.5728892087936401, + "learning_rate": 4.687107216447443e-06, + "loss": 0.5644, + "step": 3144 + }, + { + "epoch": 0.4962916206406817, + "grad_norm": 0.5814487934112549, + "learning_rate": 4.686906005902217e-06, + "loss": 0.6054, + "step": 3145 + }, + { + "epoch": 0.496449424017674, + "grad_norm": 0.6028937697410583, + "learning_rate": 4.686704735003711e-06, + "loss": 0.6052, + "step": 3146 + }, + { + "epoch": 0.49660722739466623, + "grad_norm": 0.5746954083442688, + "learning_rate": 4.686503403757482e-06, + "loss": 0.5934, + "step": 3147 + }, + { + "epoch": 0.4967650307716585, + "grad_norm": 0.5874417424201965, + "learning_rate": 4.6863020121690835e-06, + "loss": 0.5452, + "step": 3148 + }, + { + "epoch": 0.4969228341486508, + "grad_norm": 0.5627092719078064, + "learning_rate": 4.6861005602440745e-06, + "loss": 0.5566, + "step": 3149 + }, + { + "epoch": 0.4970806375256431, + "grad_norm": 0.582135796546936, + "learning_rate": 4.685899047988015e-06, + "loss": 0.5481, + "step": 3150 + }, + { + "epoch": 0.4972384409026353, + "grad_norm": 0.6293278932571411, + "learning_rate": 4.685697475406466e-06, + "loss": 0.6094, + "step": 3151 + }, + { + "epoch": 0.4973962442796276, + "grad_norm": 0.5999050736427307, + "learning_rate": 4.685495842504989e-06, + "loss": 0.5934, + "step": 3152 + }, + { + "epoch": 0.49755404765661987, + "grad_norm": 0.661517322063446, + "learning_rate": 4.6852941492891505e-06, + "loss": 0.5836, + "step": 3153 + }, + { + "epoch": 0.4977118510336121, + "grad_norm": 0.6171008944511414, + "learning_rate": 4.685092395764516e-06, + "loss": 0.5535, + "step": 3154 + }, + { + "epoch": 0.4978696544106044, + "grad_norm": 0.6160749793052673, + "learning_rate": 4.6848905819366535e-06, + "loss": 0.5877, + "step": 3155 + }, + { + "epoch": 0.49802745778759666, + "grad_norm": 0.5731388330459595, + "learning_rate": 4.684688707811132e-06, + "loss": 0.6198, + "step": 3156 + }, + { + "epoch": 0.49818526116458894, + "grad_norm": 0.5929586887359619, + "learning_rate": 4.684486773393524e-06, + "loss": 0.582, + "step": 3157 + }, + { + "epoch": 0.49834306454158117, + "grad_norm": 0.5854153633117676, + "learning_rate": 4.684284778689402e-06, + "loss": 0.6108, + "step": 3158 + }, + { + "epoch": 0.49850086791857345, + "grad_norm": 0.5761922597885132, + "learning_rate": 4.684082723704339e-06, + "loss": 0.5748, + "step": 3159 + }, + { + "epoch": 0.49865867129556574, + "grad_norm": 0.5856003761291504, + "learning_rate": 4.683880608443913e-06, + "loss": 0.5767, + "step": 3160 + }, + { + "epoch": 0.498816474672558, + "grad_norm": 0.5879014730453491, + "learning_rate": 4.6836784329137015e-06, + "loss": 0.5803, + "step": 3161 + }, + { + "epoch": 0.49897427804955025, + "grad_norm": 0.6256188154220581, + "learning_rate": 4.6834761971192835e-06, + "loss": 0.5804, + "step": 3162 + }, + { + "epoch": 0.49913208142654253, + "grad_norm": 0.6244272589683533, + "learning_rate": 4.683273901066241e-06, + "loss": 0.6002, + "step": 3163 + }, + { + "epoch": 0.4992898848035348, + "grad_norm": 0.5504565834999084, + "learning_rate": 4.683071544760156e-06, + "loss": 0.5844, + "step": 3164 + }, + { + "epoch": 0.49944768818052704, + "grad_norm": 0.6168366074562073, + "learning_rate": 4.682869128206614e-06, + "loss": 0.6118, + "step": 3165 + }, + { + "epoch": 0.4996054915575193, + "grad_norm": 0.5568456053733826, + "learning_rate": 4.6826666514112e-06, + "loss": 0.6, + "step": 3166 + }, + { + "epoch": 0.4997632949345116, + "grad_norm": 0.6004049777984619, + "learning_rate": 4.682464114379503e-06, + "loss": 0.5956, + "step": 3167 + }, + { + "epoch": 0.4999210983115039, + "grad_norm": 0.5828208923339844, + "learning_rate": 4.682261517117112e-06, + "loss": 0.5947, + "step": 3168 + }, + { + "epoch": 0.5000789016884961, + "grad_norm": 0.6076875329017639, + "learning_rate": 4.682058859629619e-06, + "loss": 0.5719, + "step": 3169 + }, + { + "epoch": 0.5002367050654885, + "grad_norm": 0.6039599776268005, + "learning_rate": 4.681856141922615e-06, + "loss": 0.6074, + "step": 3170 + }, + { + "epoch": 0.5003945084424807, + "grad_norm": 0.5961118936538696, + "learning_rate": 4.681653364001696e-06, + "loss": 0.5735, + "step": 3171 + }, + { + "epoch": 0.5005523118194729, + "grad_norm": 0.5743470788002014, + "learning_rate": 4.681450525872458e-06, + "loss": 0.6029, + "step": 3172 + }, + { + "epoch": 0.5007101151964652, + "grad_norm": 0.5831307768821716, + "learning_rate": 4.681247627540499e-06, + "loss": 0.6168, + "step": 3173 + }, + { + "epoch": 0.5008679185734575, + "grad_norm": 0.6102088093757629, + "learning_rate": 4.681044669011418e-06, + "loss": 0.5719, + "step": 3174 + }, + { + "epoch": 0.5010257219504497, + "grad_norm": 0.5736030340194702, + "learning_rate": 4.680841650290816e-06, + "loss": 0.5601, + "step": 3175 + }, + { + "epoch": 0.501183525327442, + "grad_norm": 0.6134971976280212, + "learning_rate": 4.680638571384296e-06, + "loss": 0.6358, + "step": 3176 + }, + { + "epoch": 0.5013413287044343, + "grad_norm": 0.5823049545288086, + "learning_rate": 4.6804354322974635e-06, + "loss": 0.5833, + "step": 3177 + }, + { + "epoch": 0.5014991320814265, + "grad_norm": 0.5976095199584961, + "learning_rate": 4.680232233035923e-06, + "loss": 0.5896, + "step": 3178 + }, + { + "epoch": 0.5016569354584188, + "grad_norm": 0.5997558832168579, + "learning_rate": 4.680028973605283e-06, + "loss": 0.5589, + "step": 3179 + }, + { + "epoch": 0.5018147388354111, + "grad_norm": 0.6460225582122803, + "learning_rate": 4.6798256540111535e-06, + "loss": 0.6358, + "step": 3180 + }, + { + "epoch": 0.5019725422124034, + "grad_norm": 0.5704441666603088, + "learning_rate": 4.679622274259145e-06, + "loss": 0.5605, + "step": 3181 + }, + { + "epoch": 0.5021303455893956, + "grad_norm": 0.6156453490257263, + "learning_rate": 4.67941883435487e-06, + "loss": 0.6153, + "step": 3182 + }, + { + "epoch": 0.5022881489663878, + "grad_norm": 0.6020931005477905, + "learning_rate": 4.679215334303945e-06, + "loss": 0.5564, + "step": 3183 + }, + { + "epoch": 0.5024459523433802, + "grad_norm": 0.6035091876983643, + "learning_rate": 4.679011774111983e-06, + "loss": 0.5489, + "step": 3184 + }, + { + "epoch": 0.5026037557203724, + "grad_norm": 0.5715100169181824, + "learning_rate": 4.6788081537846035e-06, + "loss": 0.5992, + "step": 3185 + }, + { + "epoch": 0.5027615590973646, + "grad_norm": 0.577251136302948, + "learning_rate": 4.678604473327426e-06, + "loss": 0.5975, + "step": 3186 + }, + { + "epoch": 0.502919362474357, + "grad_norm": 0.583337664604187, + "learning_rate": 4.678400732746071e-06, + "loss": 0.6207, + "step": 3187 + }, + { + "epoch": 0.5030771658513492, + "grad_norm": 0.6131563186645508, + "learning_rate": 4.6781969320461614e-06, + "loss": 0.5617, + "step": 3188 + }, + { + "epoch": 0.5032349692283414, + "grad_norm": 0.5803416967391968, + "learning_rate": 4.677993071233324e-06, + "loss": 0.5559, + "step": 3189 + }, + { + "epoch": 0.5033927726053338, + "grad_norm": 0.6191145777702332, + "learning_rate": 4.67778915031318e-06, + "loss": 0.5574, + "step": 3190 + }, + { + "epoch": 0.503550575982326, + "grad_norm": 0.5536466836929321, + "learning_rate": 4.67758516929136e-06, + "loss": 0.5687, + "step": 3191 + }, + { + "epoch": 0.5037083793593183, + "grad_norm": 0.5383136868476868, + "learning_rate": 4.677381128173494e-06, + "loss": 0.5937, + "step": 3192 + }, + { + "epoch": 0.5038661827363106, + "grad_norm": 0.6307808756828308, + "learning_rate": 4.677177026965211e-06, + "loss": 0.6139, + "step": 3193 + }, + { + "epoch": 0.5040239861133028, + "grad_norm": 0.5805954337120056, + "learning_rate": 4.676972865672146e-06, + "loss": 0.5655, + "step": 3194 + }, + { + "epoch": 0.5041817894902951, + "grad_norm": 0.5933268666267395, + "learning_rate": 4.676768644299932e-06, + "loss": 0.6127, + "step": 3195 + }, + { + "epoch": 0.5043395928672874, + "grad_norm": 0.5746484994888306, + "learning_rate": 4.676564362854204e-06, + "loss": 0.5734, + "step": 3196 + }, + { + "epoch": 0.5044973962442796, + "grad_norm": 0.5766634345054626, + "learning_rate": 4.676360021340602e-06, + "loss": 0.5967, + "step": 3197 + }, + { + "epoch": 0.5046551996212719, + "grad_norm": 0.594965934753418, + "learning_rate": 4.676155619764764e-06, + "loss": 0.5754, + "step": 3198 + }, + { + "epoch": 0.5048130029982641, + "grad_norm": 0.6059662699699402, + "learning_rate": 4.675951158132331e-06, + "loss": 0.568, + "step": 3199 + }, + { + "epoch": 0.5049708063752564, + "grad_norm": 0.5885428190231323, + "learning_rate": 4.675746636448945e-06, + "loss": 0.5996, + "step": 3200 + }, + { + "epoch": 0.5051286097522487, + "grad_norm": 0.6341910362243652, + "learning_rate": 4.675542054720253e-06, + "loss": 0.5529, + "step": 3201 + }, + { + "epoch": 0.5052864131292409, + "grad_norm": 0.5581211447715759, + "learning_rate": 4.675337412951898e-06, + "loss": 0.5869, + "step": 3202 + }, + { + "epoch": 0.5054442165062333, + "grad_norm": 0.6146372556686401, + "learning_rate": 4.675132711149529e-06, + "loss": 0.5937, + "step": 3203 + }, + { + "epoch": 0.5056020198832255, + "grad_norm": 0.6812949180603027, + "learning_rate": 4.674927949318795e-06, + "loss": 0.5661, + "step": 3204 + }, + { + "epoch": 0.5057598232602177, + "grad_norm": 0.5814793705940247, + "learning_rate": 4.674723127465347e-06, + "loss": 0.6018, + "step": 3205 + }, + { + "epoch": 0.5059176266372101, + "grad_norm": 0.5856676697731018, + "learning_rate": 4.674518245594836e-06, + "loss": 0.5861, + "step": 3206 + }, + { + "epoch": 0.5060754300142023, + "grad_norm": 0.602411150932312, + "learning_rate": 4.67431330371292e-06, + "loss": 0.5789, + "step": 3207 + }, + { + "epoch": 0.5062332333911945, + "grad_norm": 0.5801651477813721, + "learning_rate": 4.674108301825252e-06, + "loss": 0.6186, + "step": 3208 + }, + { + "epoch": 0.5063910367681869, + "grad_norm": 0.5799238085746765, + "learning_rate": 4.673903239937489e-06, + "loss": 0.6271, + "step": 3209 + }, + { + "epoch": 0.5065488401451791, + "grad_norm": 0.5863869786262512, + "learning_rate": 4.673698118055292e-06, + "loss": 0.565, + "step": 3210 + }, + { + "epoch": 0.5067066435221714, + "grad_norm": 0.5555136799812317, + "learning_rate": 4.673492936184322e-06, + "loss": 0.5738, + "step": 3211 + }, + { + "epoch": 0.5068644468991637, + "grad_norm": 0.5644059181213379, + "learning_rate": 4.67328769433024e-06, + "loss": 0.5882, + "step": 3212 + }, + { + "epoch": 0.5070222502761559, + "grad_norm": 0.5934536457061768, + "learning_rate": 4.673082392498711e-06, + "loss": 0.5596, + "step": 3213 + }, + { + "epoch": 0.5071800536531482, + "grad_norm": 0.5908259749412537, + "learning_rate": 4.672877030695401e-06, + "loss": 0.5837, + "step": 3214 + }, + { + "epoch": 0.5073378570301404, + "grad_norm": 0.5842491984367371, + "learning_rate": 4.672671608925977e-06, + "loss": 0.596, + "step": 3215 + }, + { + "epoch": 0.5074956604071327, + "grad_norm": 0.5834252238273621, + "learning_rate": 4.672466127196109e-06, + "loss": 0.5591, + "step": 3216 + }, + { + "epoch": 0.507653463784125, + "grad_norm": 0.5976048111915588, + "learning_rate": 4.672260585511467e-06, + "loss": 0.5946, + "step": 3217 + }, + { + "epoch": 0.5078112671611172, + "grad_norm": 0.6053875684738159, + "learning_rate": 4.672054983877723e-06, + "loss": 0.6272, + "step": 3218 + }, + { + "epoch": 0.5079690705381095, + "grad_norm": 0.5896010398864746, + "learning_rate": 4.671849322300552e-06, + "loss": 0.5958, + "step": 3219 + }, + { + "epoch": 0.5081268739151018, + "grad_norm": 0.5848332047462463, + "learning_rate": 4.671643600785629e-06, + "loss": 0.6323, + "step": 3220 + }, + { + "epoch": 0.508284677292094, + "grad_norm": 0.6020887494087219, + "learning_rate": 4.671437819338633e-06, + "loss": 0.5943, + "step": 3221 + }, + { + "epoch": 0.5084424806690864, + "grad_norm": 0.601967453956604, + "learning_rate": 4.671231977965241e-06, + "loss": 0.62, + "step": 3222 + }, + { + "epoch": 0.5086002840460786, + "grad_norm": 0.573516309261322, + "learning_rate": 4.6710260766711355e-06, + "loss": 0.6025, + "step": 3223 + }, + { + "epoch": 0.5087580874230708, + "grad_norm": 0.5639981627464294, + "learning_rate": 4.6708201154619985e-06, + "loss": 0.5844, + "step": 3224 + }, + { + "epoch": 0.5089158908000632, + "grad_norm": 0.611923336982727, + "learning_rate": 4.670614094343512e-06, + "loss": 0.6046, + "step": 3225 + }, + { + "epoch": 0.5090736941770554, + "grad_norm": 0.5900511145591736, + "learning_rate": 4.670408013321365e-06, + "loss": 0.6038, + "step": 3226 + }, + { + "epoch": 0.5092314975540476, + "grad_norm": 0.5907394289970398, + "learning_rate": 4.670201872401242e-06, + "loss": 0.5785, + "step": 3227 + }, + { + "epoch": 0.50938930093104, + "grad_norm": 0.5985817313194275, + "learning_rate": 4.669995671588833e-06, + "loss": 0.5873, + "step": 3228 + }, + { + "epoch": 0.5095471043080322, + "grad_norm": 0.6105371117591858, + "learning_rate": 4.669789410889829e-06, + "loss": 0.6138, + "step": 3229 + }, + { + "epoch": 0.5097049076850244, + "grad_norm": 0.5798530578613281, + "learning_rate": 4.669583090309923e-06, + "loss": 0.5664, + "step": 3230 + }, + { + "epoch": 0.5098627110620167, + "grad_norm": 0.6100640892982483, + "learning_rate": 4.669376709854807e-06, + "loss": 0.5646, + "step": 3231 + }, + { + "epoch": 0.510020514439009, + "grad_norm": 0.5886738300323486, + "learning_rate": 4.669170269530178e-06, + "loss": 0.5699, + "step": 3232 + }, + { + "epoch": 0.5101783178160013, + "grad_norm": 0.5959857702255249, + "learning_rate": 4.668963769341732e-06, + "loss": 0.6087, + "step": 3233 + }, + { + "epoch": 0.5103361211929935, + "grad_norm": 0.5795708298683167, + "learning_rate": 4.668757209295169e-06, + "loss": 0.5696, + "step": 3234 + }, + { + "epoch": 0.5104939245699858, + "grad_norm": 0.5899804830551147, + "learning_rate": 4.66855058939619e-06, + "loss": 0.5822, + "step": 3235 + }, + { + "epoch": 0.5106517279469781, + "grad_norm": 0.6663082242012024, + "learning_rate": 4.6683439096504965e-06, + "loss": 0.5805, + "step": 3236 + }, + { + "epoch": 0.5108095313239703, + "grad_norm": 0.576417088508606, + "learning_rate": 4.668137170063792e-06, + "loss": 0.5983, + "step": 3237 + }, + { + "epoch": 0.5109673347009626, + "grad_norm": 0.5552636384963989, + "learning_rate": 4.667930370641782e-06, + "loss": 0.5868, + "step": 3238 + }, + { + "epoch": 0.5111251380779549, + "grad_norm": 0.5853247046470642, + "learning_rate": 4.667723511390174e-06, + "loss": 0.5723, + "step": 3239 + }, + { + "epoch": 0.5112829414549471, + "grad_norm": 0.5633020401000977, + "learning_rate": 4.6675165923146785e-06, + "loss": 0.5508, + "step": 3240 + }, + { + "epoch": 0.5114407448319394, + "grad_norm": 0.590776264667511, + "learning_rate": 4.667309613421003e-06, + "loss": 0.5987, + "step": 3241 + }, + { + "epoch": 0.5115985482089317, + "grad_norm": 0.5572790503501892, + "learning_rate": 4.667102574714861e-06, + "loss": 0.5763, + "step": 3242 + }, + { + "epoch": 0.5117563515859239, + "grad_norm": 0.6209731698036194, + "learning_rate": 4.666895476201966e-06, + "loss": 0.6228, + "step": 3243 + }, + { + "epoch": 0.5119141549629163, + "grad_norm": 0.5922080278396606, + "learning_rate": 4.666688317888034e-06, + "loss": 0.6106, + "step": 3244 + }, + { + "epoch": 0.5120719583399085, + "grad_norm": 0.5866551995277405, + "learning_rate": 4.666481099778781e-06, + "loss": 0.5973, + "step": 3245 + }, + { + "epoch": 0.5122297617169007, + "grad_norm": 0.5602622032165527, + "learning_rate": 4.666273821879927e-06, + "loss": 0.5547, + "step": 3246 + }, + { + "epoch": 0.512387565093893, + "grad_norm": 0.6071511507034302, + "learning_rate": 4.666066484197191e-06, + "loss": 0.6066, + "step": 3247 + }, + { + "epoch": 0.5125453684708853, + "grad_norm": 0.5670208930969238, + "learning_rate": 4.665859086736297e-06, + "loss": 0.6188, + "step": 3248 + }, + { + "epoch": 0.5127031718478775, + "grad_norm": 0.5558754205703735, + "learning_rate": 4.665651629502966e-06, + "loss": 0.5902, + "step": 3249 + }, + { + "epoch": 0.5128609752248698, + "grad_norm": 0.6156905889511108, + "learning_rate": 4.665444112502926e-06, + "loss": 0.5438, + "step": 3250 + }, + { + "epoch": 0.5130187786018621, + "grad_norm": 0.5827562808990479, + "learning_rate": 4.665236535741902e-06, + "loss": 0.5857, + "step": 3251 + }, + { + "epoch": 0.5131765819788543, + "grad_norm": 0.5935811996459961, + "learning_rate": 4.6650288992256235e-06, + "loss": 0.5994, + "step": 3252 + }, + { + "epoch": 0.5133343853558466, + "grad_norm": 0.5882035493850708, + "learning_rate": 4.664821202959821e-06, + "loss": 0.5725, + "step": 3253 + }, + { + "epoch": 0.5134921887328389, + "grad_norm": 0.5821424722671509, + "learning_rate": 4.6646134469502245e-06, + "loss": 0.5989, + "step": 3254 + }, + { + "epoch": 0.5136499921098312, + "grad_norm": 0.6268448829650879, + "learning_rate": 4.66440563120257e-06, + "loss": 0.5757, + "step": 3255 + }, + { + "epoch": 0.5138077954868234, + "grad_norm": 0.5977441072463989, + "learning_rate": 4.664197755722592e-06, + "loss": 0.5712, + "step": 3256 + }, + { + "epoch": 0.5139655988638157, + "grad_norm": 0.6205559968948364, + "learning_rate": 4.663989820516027e-06, + "loss": 0.5697, + "step": 3257 + }, + { + "epoch": 0.514123402240808, + "grad_norm": 0.6017505526542664, + "learning_rate": 4.663781825588614e-06, + "loss": 0.591, + "step": 3258 + }, + { + "epoch": 0.5142812056178002, + "grad_norm": 0.5727201700210571, + "learning_rate": 4.663573770946092e-06, + "loss": 0.599, + "step": 3259 + }, + { + "epoch": 0.5144390089947924, + "grad_norm": 0.5879718661308289, + "learning_rate": 4.663365656594203e-06, + "loss": 0.6053, + "step": 3260 + }, + { + "epoch": 0.5145968123717848, + "grad_norm": 0.5983181595802307, + "learning_rate": 4.663157482538693e-06, + "loss": 0.5308, + "step": 3261 + }, + { + "epoch": 0.514754615748777, + "grad_norm": 0.5774259567260742, + "learning_rate": 4.6629492487853035e-06, + "loss": 0.5652, + "step": 3262 + }, + { + "epoch": 0.5149124191257692, + "grad_norm": 0.5817708373069763, + "learning_rate": 4.662740955339783e-06, + "loss": 0.6027, + "step": 3263 + }, + { + "epoch": 0.5150702225027616, + "grad_norm": 0.5643896460533142, + "learning_rate": 4.662532602207881e-06, + "loss": 0.577, + "step": 3264 + }, + { + "epoch": 0.5152280258797538, + "grad_norm": 0.6077775955200195, + "learning_rate": 4.662324189395345e-06, + "loss": 0.5783, + "step": 3265 + }, + { + "epoch": 0.5153858292567461, + "grad_norm": 0.6099270582199097, + "learning_rate": 4.6621157169079275e-06, + "loss": 0.5822, + "step": 3266 + }, + { + "epoch": 0.5155436326337384, + "grad_norm": 0.5897260308265686, + "learning_rate": 4.6619071847513835e-06, + "loss": 0.6001, + "step": 3267 + }, + { + "epoch": 0.5157014360107306, + "grad_norm": 0.5943058729171753, + "learning_rate": 4.661698592931466e-06, + "loss": 0.5984, + "step": 3268 + }, + { + "epoch": 0.5158592393877229, + "grad_norm": 0.5977657437324524, + "learning_rate": 4.6614899414539334e-06, + "loss": 0.5927, + "step": 3269 + }, + { + "epoch": 0.5160170427647152, + "grad_norm": 0.6262012124061584, + "learning_rate": 4.661281230324543e-06, + "loss": 0.6266, + "step": 3270 + }, + { + "epoch": 0.5161748461417074, + "grad_norm": 0.6257683038711548, + "learning_rate": 4.661072459549054e-06, + "loss": 0.6015, + "step": 3271 + }, + { + "epoch": 0.5163326495186997, + "grad_norm": 0.5772569179534912, + "learning_rate": 4.660863629133228e-06, + "loss": 0.5675, + "step": 3272 + }, + { + "epoch": 0.516490452895692, + "grad_norm": 0.5841168165206909, + "learning_rate": 4.66065473908283e-06, + "loss": 0.5818, + "step": 3273 + }, + { + "epoch": 0.5166482562726842, + "grad_norm": 0.6085084676742554, + "learning_rate": 4.660445789403623e-06, + "loss": 0.5958, + "step": 3274 + }, + { + "epoch": 0.5168060596496765, + "grad_norm": 0.610654890537262, + "learning_rate": 4.6602367801013745e-06, + "loss": 0.5864, + "step": 3275 + }, + { + "epoch": 0.5169638630266687, + "grad_norm": 0.5723106265068054, + "learning_rate": 4.660027711181853e-06, + "loss": 0.6159, + "step": 3276 + }, + { + "epoch": 0.5171216664036611, + "grad_norm": 0.607310950756073, + "learning_rate": 4.659818582650828e-06, + "loss": 0.5882, + "step": 3277 + }, + { + "epoch": 0.5172794697806533, + "grad_norm": 0.6269048452377319, + "learning_rate": 4.659609394514069e-06, + "loss": 0.6055, + "step": 3278 + }, + { + "epoch": 0.5174372731576455, + "grad_norm": 0.5563472509384155, + "learning_rate": 4.659400146777352e-06, + "loss": 0.5881, + "step": 3279 + }, + { + "epoch": 0.5175950765346379, + "grad_norm": 0.6014525890350342, + "learning_rate": 4.65919083944645e-06, + "loss": 0.5766, + "step": 3280 + }, + { + "epoch": 0.5177528799116301, + "grad_norm": 0.5874263048171997, + "learning_rate": 4.6589814725271394e-06, + "loss": 0.5627, + "step": 3281 + }, + { + "epoch": 0.5179106832886223, + "grad_norm": 0.5977831482887268, + "learning_rate": 4.6587720460252e-06, + "loss": 0.5709, + "step": 3282 + }, + { + "epoch": 0.5180684866656147, + "grad_norm": 0.576897382736206, + "learning_rate": 4.658562559946408e-06, + "loss": 0.5851, + "step": 3283 + }, + { + "epoch": 0.5182262900426069, + "grad_norm": 0.5840798020362854, + "learning_rate": 4.658353014296548e-06, + "loss": 0.5827, + "step": 3284 + }, + { + "epoch": 0.5183840934195992, + "grad_norm": 0.6184645295143127, + "learning_rate": 4.658143409081401e-06, + "loss": 0.6267, + "step": 3285 + }, + { + "epoch": 0.5185418967965915, + "grad_norm": 0.5848968625068665, + "learning_rate": 4.657933744306753e-06, + "loss": 0.5895, + "step": 3286 + }, + { + "epoch": 0.5186997001735837, + "grad_norm": 0.6293984055519104, + "learning_rate": 4.65772401997839e-06, + "loss": 0.5933, + "step": 3287 + }, + { + "epoch": 0.518857503550576, + "grad_norm": 0.6182519197463989, + "learning_rate": 4.657514236102099e-06, + "loss": 0.5909, + "step": 3288 + }, + { + "epoch": 0.5190153069275683, + "grad_norm": 0.588223397731781, + "learning_rate": 4.65730439268367e-06, + "loss": 0.5697, + "step": 3289 + }, + { + "epoch": 0.5191731103045605, + "grad_norm": 0.5836308002471924, + "learning_rate": 4.6570944897288935e-06, + "loss": 0.5922, + "step": 3290 + }, + { + "epoch": 0.5193309136815528, + "grad_norm": 0.5964441895484924, + "learning_rate": 4.656884527243564e-06, + "loss": 0.567, + "step": 3291 + }, + { + "epoch": 0.519488717058545, + "grad_norm": 0.5945777297019958, + "learning_rate": 4.6566745052334744e-06, + "loss": 0.6078, + "step": 3292 + }, + { + "epoch": 0.5196465204355373, + "grad_norm": 0.5944077968597412, + "learning_rate": 4.656464423704421e-06, + "loss": 0.5595, + "step": 3293 + }, + { + "epoch": 0.5198043238125296, + "grad_norm": 0.5977767705917358, + "learning_rate": 4.6562542826622025e-06, + "loss": 0.593, + "step": 3294 + }, + { + "epoch": 0.5199621271895218, + "grad_norm": 0.591889500617981, + "learning_rate": 4.656044082112618e-06, + "loss": 0.5554, + "step": 3295 + }, + { + "epoch": 0.5201199305665142, + "grad_norm": 0.5762770175933838, + "learning_rate": 4.655833822061466e-06, + "loss": 0.5595, + "step": 3296 + }, + { + "epoch": 0.5202777339435064, + "grad_norm": 0.5582716464996338, + "learning_rate": 4.655623502514554e-06, + "loss": 0.5792, + "step": 3297 + }, + { + "epoch": 0.5204355373204986, + "grad_norm": 0.5992917418479919, + "learning_rate": 4.6554131234776815e-06, + "loss": 0.565, + "step": 3298 + }, + { + "epoch": 0.520593340697491, + "grad_norm": 0.6034846305847168, + "learning_rate": 4.6552026849566576e-06, + "loss": 0.5629, + "step": 3299 + }, + { + "epoch": 0.5207511440744832, + "grad_norm": 0.5969157218933105, + "learning_rate": 4.654992186957288e-06, + "loss": 0.5858, + "step": 3300 + }, + { + "epoch": 0.5209089474514754, + "grad_norm": 0.5944775342941284, + "learning_rate": 4.654781629485385e-06, + "loss": 0.5945, + "step": 3301 + }, + { + "epoch": 0.5210667508284678, + "grad_norm": 0.5931000709533691, + "learning_rate": 4.6545710125467544e-06, + "loss": 0.6269, + "step": 3302 + }, + { + "epoch": 0.52122455420546, + "grad_norm": 0.581727147102356, + "learning_rate": 4.6543603361472134e-06, + "loss": 0.581, + "step": 3303 + }, + { + "epoch": 0.5213823575824522, + "grad_norm": 0.6009762287139893, + "learning_rate": 4.654149600292573e-06, + "loss": 0.6033, + "step": 3304 + }, + { + "epoch": 0.5215401609594446, + "grad_norm": 0.5780070424079895, + "learning_rate": 4.653938804988651e-06, + "loss": 0.5739, + "step": 3305 + }, + { + "epoch": 0.5216979643364368, + "grad_norm": 0.5989776849746704, + "learning_rate": 4.653727950241265e-06, + "loss": 0.6553, + "step": 3306 + }, + { + "epoch": 0.5218557677134291, + "grad_norm": 0.5840609073638916, + "learning_rate": 4.653517036056232e-06, + "loss": 0.5863, + "step": 3307 + }, + { + "epoch": 0.5220135710904213, + "grad_norm": 0.5956335067749023, + "learning_rate": 4.653306062439375e-06, + "loss": 0.5831, + "step": 3308 + }, + { + "epoch": 0.5221713744674136, + "grad_norm": 0.5751097798347473, + "learning_rate": 4.653095029396514e-06, + "loss": 0.6009, + "step": 3309 + }, + { + "epoch": 0.5223291778444059, + "grad_norm": 0.5760238766670227, + "learning_rate": 4.652883936933475e-06, + "loss": 0.5878, + "step": 3310 + }, + { + "epoch": 0.5224869812213981, + "grad_norm": 0.6286089420318604, + "learning_rate": 4.652672785056084e-06, + "loss": 0.5892, + "step": 3311 + }, + { + "epoch": 0.5226447845983904, + "grad_norm": 0.611566960811615, + "learning_rate": 4.652461573770166e-06, + "loss": 0.5854, + "step": 3312 + }, + { + "epoch": 0.5228025879753827, + "grad_norm": 0.5940570831298828, + "learning_rate": 4.652250303081552e-06, + "loss": 0.5692, + "step": 3313 + }, + { + "epoch": 0.5229603913523749, + "grad_norm": 0.5911929607391357, + "learning_rate": 4.65203897299607e-06, + "loss": 0.6312, + "step": 3314 + }, + { + "epoch": 0.5231181947293672, + "grad_norm": 0.5845211744308472, + "learning_rate": 4.6518275835195555e-06, + "loss": 0.6053, + "step": 3315 + }, + { + "epoch": 0.5232759981063595, + "grad_norm": 0.58905428647995, + "learning_rate": 4.65161613465784e-06, + "loss": 0.5808, + "step": 3316 + }, + { + "epoch": 0.5234338014833517, + "grad_norm": 0.5670469403266907, + "learning_rate": 4.65140462641676e-06, + "loss": 0.5916, + "step": 3317 + }, + { + "epoch": 0.5235916048603441, + "grad_norm": 0.5720929503440857, + "learning_rate": 4.651193058802152e-06, + "loss": 0.6028, + "step": 3318 + }, + { + "epoch": 0.5237494082373363, + "grad_norm": 0.5620148181915283, + "learning_rate": 4.650981431819855e-06, + "loss": 0.6066, + "step": 3319 + }, + { + "epoch": 0.5239072116143285, + "grad_norm": 0.6356255412101746, + "learning_rate": 4.65076974547571e-06, + "loss": 0.5806, + "step": 3320 + }, + { + "epoch": 0.5240650149913209, + "grad_norm": 0.6063912510871887, + "learning_rate": 4.650557999775558e-06, + "loss": 0.5833, + "step": 3321 + }, + { + "epoch": 0.5242228183683131, + "grad_norm": 0.5770809054374695, + "learning_rate": 4.650346194725244e-06, + "loss": 0.5859, + "step": 3322 + }, + { + "epoch": 0.5243806217453053, + "grad_norm": 0.57406085729599, + "learning_rate": 4.650134330330611e-06, + "loss": 0.569, + "step": 3323 + }, + { + "epoch": 0.5245384251222976, + "grad_norm": 0.5737090706825256, + "learning_rate": 4.649922406597509e-06, + "loss": 0.5722, + "step": 3324 + }, + { + "epoch": 0.5246962284992899, + "grad_norm": 0.6190186142921448, + "learning_rate": 4.649710423531784e-06, + "loss": 0.5918, + "step": 3325 + }, + { + "epoch": 0.5248540318762821, + "grad_norm": 0.5646519064903259, + "learning_rate": 4.649498381139287e-06, + "loss": 0.6108, + "step": 3326 + }, + { + "epoch": 0.5250118352532744, + "grad_norm": 0.5916792154312134, + "learning_rate": 4.649286279425871e-06, + "loss": 0.5596, + "step": 3327 + }, + { + "epoch": 0.5251696386302667, + "grad_norm": 0.607404887676239, + "learning_rate": 4.6490741183973885e-06, + "loss": 0.5872, + "step": 3328 + }, + { + "epoch": 0.525327442007259, + "grad_norm": 0.5764176249504089, + "learning_rate": 4.648861898059693e-06, + "loss": 0.5892, + "step": 3329 + }, + { + "epoch": 0.5254852453842512, + "grad_norm": 0.5738563537597656, + "learning_rate": 4.648649618418644e-06, + "loss": 0.5842, + "step": 3330 + }, + { + "epoch": 0.5256430487612435, + "grad_norm": 0.5839462280273438, + "learning_rate": 4.6484372794801e-06, + "loss": 0.6064, + "step": 3331 + }, + { + "epoch": 0.5258008521382358, + "grad_norm": 0.5823054313659668, + "learning_rate": 4.6482248812499185e-06, + "loss": 0.5681, + "step": 3332 + }, + { + "epoch": 0.525958655515228, + "grad_norm": 0.6182889938354492, + "learning_rate": 4.648012423733963e-06, + "loss": 0.5863, + "step": 3333 + }, + { + "epoch": 0.5261164588922203, + "grad_norm": 0.5769601464271545, + "learning_rate": 4.647799906938096e-06, + "loss": 0.5872, + "step": 3334 + }, + { + "epoch": 0.5262742622692126, + "grad_norm": 0.6135481595993042, + "learning_rate": 4.647587330868183e-06, + "loss": 0.5768, + "step": 3335 + }, + { + "epoch": 0.5264320656462048, + "grad_norm": 0.5730228424072266, + "learning_rate": 4.64737469553009e-06, + "loss": 0.604, + "step": 3336 + }, + { + "epoch": 0.526589869023197, + "grad_norm": 0.586538553237915, + "learning_rate": 4.647162000929686e-06, + "loss": 0.5696, + "step": 3337 + }, + { + "epoch": 0.5267476724001894, + "grad_norm": 0.5601921677589417, + "learning_rate": 4.6469492470728406e-06, + "loss": 0.5958, + "step": 3338 + }, + { + "epoch": 0.5269054757771816, + "grad_norm": 0.5967245697975159, + "learning_rate": 4.646736433965425e-06, + "loss": 0.5934, + "step": 3339 + }, + { + "epoch": 0.527063279154174, + "grad_norm": 0.6054359674453735, + "learning_rate": 4.646523561613312e-06, + "loss": 0.5918, + "step": 3340 + }, + { + "epoch": 0.5272210825311662, + "grad_norm": 0.6454851627349854, + "learning_rate": 4.6463106300223775e-06, + "loss": 0.5936, + "step": 3341 + }, + { + "epoch": 0.5273788859081584, + "grad_norm": 0.5630428791046143, + "learning_rate": 4.646097639198497e-06, + "loss": 0.5825, + "step": 3342 + }, + { + "epoch": 0.5275366892851507, + "grad_norm": 0.554992139339447, + "learning_rate": 4.645884589147549e-06, + "loss": 0.5592, + "step": 3343 + }, + { + "epoch": 0.527694492662143, + "grad_norm": 0.5666011571884155, + "learning_rate": 4.645671479875412e-06, + "loss": 0.5652, + "step": 3344 + }, + { + "epoch": 0.5278522960391352, + "grad_norm": 0.6095622181892395, + "learning_rate": 4.645458311387969e-06, + "loss": 0.5694, + "step": 3345 + }, + { + "epoch": 0.5280100994161275, + "grad_norm": 0.5986672639846802, + "learning_rate": 4.645245083691102e-06, + "loss": 0.5904, + "step": 3346 + }, + { + "epoch": 0.5281679027931198, + "grad_norm": 0.5555661916732788, + "learning_rate": 4.645031796790695e-06, + "loss": 0.6069, + "step": 3347 + }, + { + "epoch": 0.528325706170112, + "grad_norm": 0.6113168597221375, + "learning_rate": 4.6448184506926356e-06, + "loss": 0.5456, + "step": 3348 + }, + { + "epoch": 0.5284835095471043, + "grad_norm": 0.6834935545921326, + "learning_rate": 4.6446050454028116e-06, + "loss": 0.5813, + "step": 3349 + }, + { + "epoch": 0.5286413129240966, + "grad_norm": 0.559215247631073, + "learning_rate": 4.64439158092711e-06, + "loss": 0.5851, + "step": 3350 + }, + { + "epoch": 0.5287991163010889, + "grad_norm": 0.604269802570343, + "learning_rate": 4.644178057271425e-06, + "loss": 0.5826, + "step": 3351 + }, + { + "epoch": 0.5289569196780811, + "grad_norm": 0.589942991733551, + "learning_rate": 4.643964474441648e-06, + "loss": 0.5915, + "step": 3352 + }, + { + "epoch": 0.5291147230550733, + "grad_norm": 0.598373532295227, + "learning_rate": 4.643750832443673e-06, + "loss": 0.5595, + "step": 3353 + }, + { + "epoch": 0.5292725264320657, + "grad_norm": 0.569337010383606, + "learning_rate": 4.643537131283396e-06, + "loss": 0.6118, + "step": 3354 + }, + { + "epoch": 0.5294303298090579, + "grad_norm": 0.5731223821640015, + "learning_rate": 4.643323370966716e-06, + "loss": 0.5593, + "step": 3355 + }, + { + "epoch": 0.5295881331860501, + "grad_norm": 0.6361293792724609, + "learning_rate": 4.643109551499531e-06, + "loss": 0.597, + "step": 3356 + }, + { + "epoch": 0.5297459365630425, + "grad_norm": 0.6174453496932983, + "learning_rate": 4.642895672887742e-06, + "loss": 0.5776, + "step": 3357 + }, + { + "epoch": 0.5299037399400347, + "grad_norm": 0.5857539772987366, + "learning_rate": 4.6426817351372515e-06, + "loss": 0.6099, + "step": 3358 + }, + { + "epoch": 0.530061543317027, + "grad_norm": 0.6152344942092896, + "learning_rate": 4.642467738253964e-06, + "loss": 0.5985, + "step": 3359 + }, + { + "epoch": 0.5302193466940193, + "grad_norm": 0.593052327632904, + "learning_rate": 4.6422536822437845e-06, + "loss": 0.5983, + "step": 3360 + }, + { + "epoch": 0.5303771500710115, + "grad_norm": 0.5988661050796509, + "learning_rate": 4.642039567112621e-06, + "loss": 0.5609, + "step": 3361 + }, + { + "epoch": 0.5305349534480038, + "grad_norm": 0.5625761151313782, + "learning_rate": 4.641825392866385e-06, + "loss": 0.5948, + "step": 3362 + }, + { + "epoch": 0.5306927568249961, + "grad_norm": 0.5972638130187988, + "learning_rate": 4.641611159510983e-06, + "loss": 0.6257, + "step": 3363 + }, + { + "epoch": 0.5308505602019883, + "grad_norm": 0.6159436702728271, + "learning_rate": 4.641396867052329e-06, + "loss": 0.5953, + "step": 3364 + }, + { + "epoch": 0.5310083635789806, + "grad_norm": 0.5901108980178833, + "learning_rate": 4.6411825154963375e-06, + "loss": 0.5796, + "step": 3365 + }, + { + "epoch": 0.5311661669559729, + "grad_norm": 0.5964571833610535, + "learning_rate": 4.640968104848924e-06, + "loss": 0.59, + "step": 3366 + }, + { + "epoch": 0.5313239703329651, + "grad_norm": 0.6089024543762207, + "learning_rate": 4.640753635116005e-06, + "loss": 0.5993, + "step": 3367 + }, + { + "epoch": 0.5314817737099574, + "grad_norm": 0.5896623134613037, + "learning_rate": 4.6405391063035e-06, + "loss": 0.5711, + "step": 3368 + }, + { + "epoch": 0.5316395770869496, + "grad_norm": 0.6055247783660889, + "learning_rate": 4.640324518417329e-06, + "loss": 0.5619, + "step": 3369 + }, + { + "epoch": 0.531797380463942, + "grad_norm": 0.6139637231826782, + "learning_rate": 4.640109871463414e-06, + "loss": 0.5648, + "step": 3370 + }, + { + "epoch": 0.5319551838409342, + "grad_norm": 0.6267266273498535, + "learning_rate": 4.63989516544768e-06, + "loss": 0.5807, + "step": 3371 + }, + { + "epoch": 0.5321129872179264, + "grad_norm": 0.6017366051673889, + "learning_rate": 4.63968040037605e-06, + "loss": 0.6154, + "step": 3372 + }, + { + "epoch": 0.5322707905949188, + "grad_norm": 0.6036745309829712, + "learning_rate": 4.639465576254454e-06, + "loss": 0.566, + "step": 3373 + }, + { + "epoch": 0.532428593971911, + "grad_norm": 0.5921775698661804, + "learning_rate": 4.6392506930888186e-06, + "loss": 0.5818, + "step": 3374 + }, + { + "epoch": 0.5325863973489032, + "grad_norm": 0.6029923558235168, + "learning_rate": 4.6390357508850735e-06, + "loss": 0.6017, + "step": 3375 + }, + { + "epoch": 0.5327442007258956, + "grad_norm": 0.6055936217308044, + "learning_rate": 4.638820749649152e-06, + "loss": 0.6018, + "step": 3376 + }, + { + "epoch": 0.5329020041028878, + "grad_norm": 0.6160601377487183, + "learning_rate": 4.638605689386987e-06, + "loss": 0.6136, + "step": 3377 + }, + { + "epoch": 0.53305980747988, + "grad_norm": 0.5783179402351379, + "learning_rate": 4.638390570104514e-06, + "loss": 0.5822, + "step": 3378 + }, + { + "epoch": 0.5332176108568724, + "grad_norm": 0.5747641921043396, + "learning_rate": 4.63817539180767e-06, + "loss": 0.5708, + "step": 3379 + }, + { + "epoch": 0.5333754142338646, + "grad_norm": 0.5664625763893127, + "learning_rate": 4.637960154502393e-06, + "loss": 0.5756, + "step": 3380 + }, + { + "epoch": 0.5335332176108569, + "grad_norm": 0.6292325258255005, + "learning_rate": 4.637744858194622e-06, + "loss": 0.5557, + "step": 3381 + }, + { + "epoch": 0.5336910209878492, + "grad_norm": 0.5704752206802368, + "learning_rate": 4.637529502890301e-06, + "loss": 0.5424, + "step": 3382 + }, + { + "epoch": 0.5338488243648414, + "grad_norm": 0.5751920342445374, + "learning_rate": 4.63731408859537e-06, + "loss": 0.5878, + "step": 3383 + }, + { + "epoch": 0.5340066277418337, + "grad_norm": 0.5729493498802185, + "learning_rate": 4.637098615315778e-06, + "loss": 0.5671, + "step": 3384 + }, + { + "epoch": 0.5341644311188259, + "grad_norm": 0.5657866597175598, + "learning_rate": 4.636883083057469e-06, + "loss": 0.5941, + "step": 3385 + }, + { + "epoch": 0.5343222344958182, + "grad_norm": 0.5623937249183655, + "learning_rate": 4.636667491826391e-06, + "loss": 0.5943, + "step": 3386 + }, + { + "epoch": 0.5344800378728105, + "grad_norm": 0.5707919001579285, + "learning_rate": 4.636451841628494e-06, + "loss": 0.603, + "step": 3387 + }, + { + "epoch": 0.5346378412498027, + "grad_norm": 0.6065758466720581, + "learning_rate": 4.6362361324697306e-06, + "loss": 0.5677, + "step": 3388 + }, + { + "epoch": 0.534795644626795, + "grad_norm": 0.5988085269927979, + "learning_rate": 4.636020364356052e-06, + "loss": 0.5676, + "step": 3389 + }, + { + "epoch": 0.5349534480037873, + "grad_norm": 0.5859000086784363, + "learning_rate": 4.635804537293414e-06, + "loss": 0.5862, + "step": 3390 + }, + { + "epoch": 0.5351112513807795, + "grad_norm": 0.6151880025863647, + "learning_rate": 4.635588651287773e-06, + "loss": 0.5676, + "step": 3391 + }, + { + "epoch": 0.5352690547577719, + "grad_norm": 0.7467567324638367, + "learning_rate": 4.635372706345087e-06, + "loss": 0.5793, + "step": 3392 + }, + { + "epoch": 0.5354268581347641, + "grad_norm": 0.5942605137825012, + "learning_rate": 4.635156702471315e-06, + "loss": 0.6043, + "step": 3393 + }, + { + "epoch": 0.5355846615117563, + "grad_norm": 0.5737330913543701, + "learning_rate": 4.63494063967242e-06, + "loss": 0.5959, + "step": 3394 + }, + { + "epoch": 0.5357424648887487, + "grad_norm": 0.5791864991188049, + "learning_rate": 4.634724517954361e-06, + "loss": 0.589, + "step": 3395 + }, + { + "epoch": 0.5359002682657409, + "grad_norm": 0.5657325983047485, + "learning_rate": 4.634508337323105e-06, + "loss": 0.5681, + "step": 3396 + }, + { + "epoch": 0.5360580716427331, + "grad_norm": 0.5679145455360413, + "learning_rate": 4.634292097784619e-06, + "loss": 0.5991, + "step": 3397 + }, + { + "epoch": 0.5362158750197255, + "grad_norm": 0.611406683921814, + "learning_rate": 4.634075799344868e-06, + "loss": 0.5799, + "step": 3398 + }, + { + "epoch": 0.5363736783967177, + "grad_norm": 0.5770435333251953, + "learning_rate": 4.633859442009824e-06, + "loss": 0.5727, + "step": 3399 + }, + { + "epoch": 0.5365314817737099, + "grad_norm": 0.6048683524131775, + "learning_rate": 4.633643025785456e-06, + "loss": 0.5574, + "step": 3400 + }, + { + "epoch": 0.5366892851507022, + "grad_norm": 0.5995453000068665, + "learning_rate": 4.633426550677738e-06, + "loss": 0.6022, + "step": 3401 + }, + { + "epoch": 0.5368470885276945, + "grad_norm": 0.5937089920043945, + "learning_rate": 4.633210016692642e-06, + "loss": 0.6, + "step": 3402 + }, + { + "epoch": 0.5370048919046868, + "grad_norm": 0.6442747712135315, + "learning_rate": 4.632993423836146e-06, + "loss": 0.5935, + "step": 3403 + }, + { + "epoch": 0.537162695281679, + "grad_norm": 0.5938532948493958, + "learning_rate": 4.6327767721142266e-06, + "loss": 0.5924, + "step": 3404 + }, + { + "epoch": 0.5373204986586713, + "grad_norm": 0.5646975040435791, + "learning_rate": 4.632560061532863e-06, + "loss": 0.5911, + "step": 3405 + }, + { + "epoch": 0.5374783020356636, + "grad_norm": 0.614769697189331, + "learning_rate": 4.632343292098036e-06, + "loss": 0.5739, + "step": 3406 + }, + { + "epoch": 0.5376361054126558, + "grad_norm": 0.5992773175239563, + "learning_rate": 4.632126463815727e-06, + "loss": 0.549, + "step": 3407 + }, + { + "epoch": 0.5377939087896481, + "grad_norm": 0.6540670394897461, + "learning_rate": 4.631909576691922e-06, + "loss": 0.6127, + "step": 3408 + }, + { + "epoch": 0.5379517121666404, + "grad_norm": 0.6008824110031128, + "learning_rate": 4.631692630732604e-06, + "loss": 0.5963, + "step": 3409 + }, + { + "epoch": 0.5381095155436326, + "grad_norm": 0.5978127121925354, + "learning_rate": 4.631475625943761e-06, + "loss": 0.584, + "step": 3410 + }, + { + "epoch": 0.5382673189206248, + "grad_norm": 0.5917340517044067, + "learning_rate": 4.631258562331384e-06, + "loss": 0.6053, + "step": 3411 + }, + { + "epoch": 0.5384251222976172, + "grad_norm": 0.5790048837661743, + "learning_rate": 4.63104143990146e-06, + "loss": 0.5873, + "step": 3412 + }, + { + "epoch": 0.5385829256746094, + "grad_norm": 0.5661720633506775, + "learning_rate": 4.630824258659983e-06, + "loss": 0.5942, + "step": 3413 + }, + { + "epoch": 0.5387407290516018, + "grad_norm": 0.5909680724143982, + "learning_rate": 4.630607018612947e-06, + "loss": 0.5985, + "step": 3414 + }, + { + "epoch": 0.538898532428594, + "grad_norm": 0.5734390616416931, + "learning_rate": 4.630389719766346e-06, + "loss": 0.5928, + "step": 3415 + }, + { + "epoch": 0.5390563358055862, + "grad_norm": 0.5995622873306274, + "learning_rate": 4.630172362126178e-06, + "loss": 0.586, + "step": 3416 + }, + { + "epoch": 0.5392141391825785, + "grad_norm": 0.6074673533439636, + "learning_rate": 4.6299549456984406e-06, + "loss": 0.5947, + "step": 3417 + }, + { + "epoch": 0.5393719425595708, + "grad_norm": 0.5788962841033936, + "learning_rate": 4.629737470489135e-06, + "loss": 0.555, + "step": 3418 + }, + { + "epoch": 0.539529745936563, + "grad_norm": 0.5930356979370117, + "learning_rate": 4.629519936504262e-06, + "loss": 0.5817, + "step": 3419 + }, + { + "epoch": 0.5396875493135553, + "grad_norm": 0.5908414125442505, + "learning_rate": 4.629302343749826e-06, + "loss": 0.6055, + "step": 3420 + }, + { + "epoch": 0.5398453526905476, + "grad_norm": 0.5964193344116211, + "learning_rate": 4.6290846922318315e-06, + "loss": 0.5266, + "step": 3421 + }, + { + "epoch": 0.5400031560675398, + "grad_norm": 0.5961725115776062, + "learning_rate": 4.6288669819562835e-06, + "loss": 0.599, + "step": 3422 + }, + { + "epoch": 0.5401609594445321, + "grad_norm": 0.5952470898628235, + "learning_rate": 4.628649212929194e-06, + "loss": 0.5871, + "step": 3423 + }, + { + "epoch": 0.5403187628215244, + "grad_norm": 0.5898485779762268, + "learning_rate": 4.62843138515657e-06, + "loss": 0.5668, + "step": 3424 + }, + { + "epoch": 0.5404765661985167, + "grad_norm": 0.5841102600097656, + "learning_rate": 4.628213498644424e-06, + "loss": 0.5893, + "step": 3425 + }, + { + "epoch": 0.5406343695755089, + "grad_norm": 0.6151747107505798, + "learning_rate": 4.6279955533987686e-06, + "loss": 0.5459, + "step": 3426 + }, + { + "epoch": 0.5407921729525011, + "grad_norm": 0.6183765530586243, + "learning_rate": 4.6277775494256186e-06, + "loss": 0.615, + "step": 3427 + }, + { + "epoch": 0.5409499763294935, + "grad_norm": 0.5713645815849304, + "learning_rate": 4.627559486730991e-06, + "loss": 0.5924, + "step": 3428 + }, + { + "epoch": 0.5411077797064857, + "grad_norm": 0.5933154225349426, + "learning_rate": 4.627341365320904e-06, + "loss": 0.5895, + "step": 3429 + }, + { + "epoch": 0.5412655830834779, + "grad_norm": 0.5769613981246948, + "learning_rate": 4.627123185201377e-06, + "loss": 0.573, + "step": 3430 + }, + { + "epoch": 0.5414233864604703, + "grad_norm": 0.561396062374115, + "learning_rate": 4.62690494637843e-06, + "loss": 0.5861, + "step": 3431 + }, + { + "epoch": 0.5415811898374625, + "grad_norm": 0.5905653834342957, + "learning_rate": 4.626686648858087e-06, + "loss": 0.5765, + "step": 3432 + }, + { + "epoch": 0.5417389932144548, + "grad_norm": 0.6446574926376343, + "learning_rate": 4.626468292646372e-06, + "loss": 0.5494, + "step": 3433 + }, + { + "epoch": 0.5418967965914471, + "grad_norm": 0.5790210366249084, + "learning_rate": 4.626249877749312e-06, + "loss": 0.5678, + "step": 3434 + }, + { + "epoch": 0.5420545999684393, + "grad_norm": 0.5800632238388062, + "learning_rate": 4.626031404172934e-06, + "loss": 0.6095, + "step": 3435 + }, + { + "epoch": 0.5422124033454316, + "grad_norm": 0.584334671497345, + "learning_rate": 4.625812871923267e-06, + "loss": 0.6161, + "step": 3436 + }, + { + "epoch": 0.5423702067224239, + "grad_norm": 0.6537361145019531, + "learning_rate": 4.6255942810063436e-06, + "loss": 0.6052, + "step": 3437 + }, + { + "epoch": 0.5425280100994161, + "grad_norm": 0.5675768852233887, + "learning_rate": 4.625375631428194e-06, + "loss": 0.5956, + "step": 3438 + }, + { + "epoch": 0.5426858134764084, + "grad_norm": 0.6065869927406311, + "learning_rate": 4.625156923194854e-06, + "loss": 0.5757, + "step": 3439 + }, + { + "epoch": 0.5428436168534007, + "grad_norm": 0.5588509440422058, + "learning_rate": 4.624938156312359e-06, + "loss": 0.5866, + "step": 3440 + }, + { + "epoch": 0.5430014202303929, + "grad_norm": 0.5740209817886353, + "learning_rate": 4.624719330786747e-06, + "loss": 0.5909, + "step": 3441 + }, + { + "epoch": 0.5431592236073852, + "grad_norm": 0.5890543460845947, + "learning_rate": 4.624500446624055e-06, + "loss": 0.5813, + "step": 3442 + }, + { + "epoch": 0.5433170269843774, + "grad_norm": 0.5839389562606812, + "learning_rate": 4.624281503830326e-06, + "loss": 0.5866, + "step": 3443 + }, + { + "epoch": 0.5434748303613698, + "grad_norm": 0.6001906991004944, + "learning_rate": 4.624062502411602e-06, + "loss": 0.5576, + "step": 3444 + }, + { + "epoch": 0.543632633738362, + "grad_norm": 0.6053450107574463, + "learning_rate": 4.623843442373926e-06, + "loss": 0.5542, + "step": 3445 + }, + { + "epoch": 0.5437904371153542, + "grad_norm": 0.6184387803077698, + "learning_rate": 4.623624323723344e-06, + "loss": 0.6036, + "step": 3446 + }, + { + "epoch": 0.5439482404923466, + "grad_norm": 0.5830904841423035, + "learning_rate": 4.6234051464659015e-06, + "loss": 0.5977, + "step": 3447 + }, + { + "epoch": 0.5441060438693388, + "grad_norm": 0.6565960645675659, + "learning_rate": 4.62318591060765e-06, + "loss": 0.595, + "step": 3448 + }, + { + "epoch": 0.544263847246331, + "grad_norm": 0.6498481035232544, + "learning_rate": 4.622966616154639e-06, + "loss": 0.5735, + "step": 3449 + }, + { + "epoch": 0.5444216506233234, + "grad_norm": 0.5932039618492126, + "learning_rate": 4.62274726311292e-06, + "loss": 0.5665, + "step": 3450 + }, + { + "epoch": 0.5445794540003156, + "grad_norm": 0.5814363360404968, + "learning_rate": 4.622527851488546e-06, + "loss": 0.5934, + "step": 3451 + }, + { + "epoch": 0.5447372573773078, + "grad_norm": 0.6000339388847351, + "learning_rate": 4.622308381287573e-06, + "loss": 0.6024, + "step": 3452 + }, + { + "epoch": 0.5448950607543002, + "grad_norm": 0.5769118666648865, + "learning_rate": 4.622088852516059e-06, + "loss": 0.5553, + "step": 3453 + }, + { + "epoch": 0.5450528641312924, + "grad_norm": 0.5721055269241333, + "learning_rate": 4.621869265180061e-06, + "loss": 0.5715, + "step": 3454 + }, + { + "epoch": 0.5452106675082847, + "grad_norm": 0.5786567330360413, + "learning_rate": 4.621649619285639e-06, + "loss": 0.6007, + "step": 3455 + }, + { + "epoch": 0.545368470885277, + "grad_norm": 0.584460437297821, + "learning_rate": 4.621429914838854e-06, + "loss": 0.6022, + "step": 3456 + }, + { + "epoch": 0.5455262742622692, + "grad_norm": 0.5764777660369873, + "learning_rate": 4.621210151845772e-06, + "loss": 0.5876, + "step": 3457 + }, + { + "epoch": 0.5456840776392615, + "grad_norm": 0.5876366496086121, + "learning_rate": 4.620990330312455e-06, + "loss": 0.5833, + "step": 3458 + }, + { + "epoch": 0.5458418810162537, + "grad_norm": 0.6062032580375671, + "learning_rate": 4.620770450244972e-06, + "loss": 0.6058, + "step": 3459 + }, + { + "epoch": 0.545999684393246, + "grad_norm": 0.5830970406532288, + "learning_rate": 4.620550511649389e-06, + "loss": 0.5616, + "step": 3460 + }, + { + "epoch": 0.5461574877702383, + "grad_norm": 0.5959593653678894, + "learning_rate": 4.6203305145317774e-06, + "loss": 0.6044, + "step": 3461 + }, + { + "epoch": 0.5463152911472305, + "grad_norm": 0.5685886144638062, + "learning_rate": 4.620110458898207e-06, + "loss": 0.5933, + "step": 3462 + }, + { + "epoch": 0.5464730945242228, + "grad_norm": 0.6216263771057129, + "learning_rate": 4.619890344754753e-06, + "loss": 0.5735, + "step": 3463 + }, + { + "epoch": 0.5466308979012151, + "grad_norm": 0.6191626191139221, + "learning_rate": 4.6196701721074874e-06, + "loss": 0.5541, + "step": 3464 + }, + { + "epoch": 0.5467887012782073, + "grad_norm": 0.5866479277610779, + "learning_rate": 4.619449940962489e-06, + "loss": 0.5708, + "step": 3465 + }, + { + "epoch": 0.5469465046551997, + "grad_norm": 0.5865986943244934, + "learning_rate": 4.619229651325834e-06, + "loss": 0.5766, + "step": 3466 + }, + { + "epoch": 0.5471043080321919, + "grad_norm": 0.6375002264976501, + "learning_rate": 4.619009303203602e-06, + "loss": 0.5664, + "step": 3467 + }, + { + "epoch": 0.5472621114091841, + "grad_norm": 0.5627309679985046, + "learning_rate": 4.618788896601874e-06, + "loss": 0.5674, + "step": 3468 + }, + { + "epoch": 0.5474199147861765, + "grad_norm": 0.6007717847824097, + "learning_rate": 4.618568431526734e-06, + "loss": 0.5734, + "step": 3469 + }, + { + "epoch": 0.5475777181631687, + "grad_norm": 0.5551970601081848, + "learning_rate": 4.618347907984264e-06, + "loss": 0.5472, + "step": 3470 + }, + { + "epoch": 0.5477355215401609, + "grad_norm": 0.5836083889007568, + "learning_rate": 4.618127325980552e-06, + "loss": 0.5838, + "step": 3471 + }, + { + "epoch": 0.5478933249171533, + "grad_norm": 0.610476016998291, + "learning_rate": 4.617906685521685e-06, + "loss": 0.5724, + "step": 3472 + }, + { + "epoch": 0.5480511282941455, + "grad_norm": 0.6031302809715271, + "learning_rate": 4.617685986613752e-06, + "loss": 0.5917, + "step": 3473 + }, + { + "epoch": 0.5482089316711377, + "grad_norm": 0.6259922385215759, + "learning_rate": 4.617465229262843e-06, + "loss": 0.5667, + "step": 3474 + }, + { + "epoch": 0.54836673504813, + "grad_norm": 0.5809618234634399, + "learning_rate": 4.617244413475051e-06, + "loss": 0.5742, + "step": 3475 + }, + { + "epoch": 0.5485245384251223, + "grad_norm": 0.5682280659675598, + "learning_rate": 4.6170235392564705e-06, + "loss": 0.5484, + "step": 3476 + }, + { + "epoch": 0.5486823418021146, + "grad_norm": 0.6026318669319153, + "learning_rate": 4.616802606613197e-06, + "loss": 0.5937, + "step": 3477 + }, + { + "epoch": 0.5488401451791068, + "grad_norm": 0.6072329878807068, + "learning_rate": 4.616581615551327e-06, + "loss": 0.549, + "step": 3478 + }, + { + "epoch": 0.5489979485560991, + "grad_norm": 0.5855955481529236, + "learning_rate": 4.616360566076959e-06, + "loss": 0.595, + "step": 3479 + }, + { + "epoch": 0.5491557519330914, + "grad_norm": 0.5943660736083984, + "learning_rate": 4.6161394581961935e-06, + "loss": 0.5603, + "step": 3480 + }, + { + "epoch": 0.5493135553100836, + "grad_norm": 0.5518202185630798, + "learning_rate": 4.615918291915134e-06, + "loss": 0.5633, + "step": 3481 + }, + { + "epoch": 0.5494713586870759, + "grad_norm": 0.6085445880889893, + "learning_rate": 4.615697067239884e-06, + "loss": 0.5782, + "step": 3482 + }, + { + "epoch": 0.5496291620640682, + "grad_norm": 0.6198110580444336, + "learning_rate": 4.615475784176547e-06, + "loss": 0.5946, + "step": 3483 + }, + { + "epoch": 0.5497869654410604, + "grad_norm": 0.5933718085289001, + "learning_rate": 4.615254442731231e-06, + "loss": 0.5727, + "step": 3484 + }, + { + "epoch": 0.5499447688180527, + "grad_norm": 0.5775982141494751, + "learning_rate": 4.615033042910043e-06, + "loss": 0.5603, + "step": 3485 + }, + { + "epoch": 0.550102572195045, + "grad_norm": 0.5976567268371582, + "learning_rate": 4.614811584719097e-06, + "loss": 0.5606, + "step": 3486 + }, + { + "epoch": 0.5502603755720372, + "grad_norm": 0.5666478276252747, + "learning_rate": 4.6145900681645e-06, + "loss": 0.5595, + "step": 3487 + }, + { + "epoch": 0.5504181789490296, + "grad_norm": 0.574881374835968, + "learning_rate": 4.6143684932523686e-06, + "loss": 0.5867, + "step": 3488 + }, + { + "epoch": 0.5505759823260218, + "grad_norm": 0.5622259378433228, + "learning_rate": 4.614146859988816e-06, + "loss": 0.5893, + "step": 3489 + }, + { + "epoch": 0.550733785703014, + "grad_norm": 0.5630156993865967, + "learning_rate": 4.61392516837996e-06, + "loss": 0.5753, + "step": 3490 + }, + { + "epoch": 0.5508915890800063, + "grad_norm": 0.571217954158783, + "learning_rate": 4.613703418431917e-06, + "loss": 0.5609, + "step": 3491 + }, + { + "epoch": 0.5510493924569986, + "grad_norm": 0.5974429845809937, + "learning_rate": 4.613481610150809e-06, + "loss": 0.5841, + "step": 3492 + }, + { + "epoch": 0.5512071958339908, + "grad_norm": 0.5822421312332153, + "learning_rate": 4.6132597435427556e-06, + "loss": 0.5331, + "step": 3493 + }, + { + "epoch": 0.5513649992109831, + "grad_norm": 0.5635478496551514, + "learning_rate": 4.613037818613881e-06, + "loss": 0.5998, + "step": 3494 + }, + { + "epoch": 0.5515228025879754, + "grad_norm": 0.5892784595489502, + "learning_rate": 4.6128158353703085e-06, + "loss": 0.589, + "step": 3495 + }, + { + "epoch": 0.5516806059649676, + "grad_norm": 0.6020663380622864, + "learning_rate": 4.6125937938181656e-06, + "loss": 0.5688, + "step": 3496 + }, + { + "epoch": 0.5518384093419599, + "grad_norm": 0.5755585432052612, + "learning_rate": 4.6123716939635785e-06, + "loss": 0.5748, + "step": 3497 + }, + { + "epoch": 0.5519962127189522, + "grad_norm": 0.583733320236206, + "learning_rate": 4.612149535812677e-06, + "loss": 0.5982, + "step": 3498 + }, + { + "epoch": 0.5521540160959445, + "grad_norm": 0.6044594049453735, + "learning_rate": 4.6119273193715946e-06, + "loss": 0.6167, + "step": 3499 + }, + { + "epoch": 0.5523118194729367, + "grad_norm": 0.6160483956336975, + "learning_rate": 4.611705044646462e-06, + "loss": 0.5789, + "step": 3500 + }, + { + "epoch": 0.552469622849929, + "grad_norm": 0.5959696173667908, + "learning_rate": 4.6114827116434125e-06, + "loss": 0.5762, + "step": 3501 + }, + { + "epoch": 0.5526274262269213, + "grad_norm": 0.5711044669151306, + "learning_rate": 4.6112603203685824e-06, + "loss": 0.5777, + "step": 3502 + }, + { + "epoch": 0.5527852296039135, + "grad_norm": 0.5724138021469116, + "learning_rate": 4.611037870828111e-06, + "loss": 0.5666, + "step": 3503 + }, + { + "epoch": 0.5529430329809057, + "grad_norm": 0.5896289944648743, + "learning_rate": 4.610815363028135e-06, + "loss": 0.5418, + "step": 3504 + }, + { + "epoch": 0.5531008363578981, + "grad_norm": 0.5787512063980103, + "learning_rate": 4.610592796974797e-06, + "loss": 0.5955, + "step": 3505 + }, + { + "epoch": 0.5532586397348903, + "grad_norm": 0.5625726580619812, + "learning_rate": 4.610370172674238e-06, + "loss": 0.5766, + "step": 3506 + }, + { + "epoch": 0.5534164431118825, + "grad_norm": 0.6129895448684692, + "learning_rate": 4.610147490132602e-06, + "loss": 0.5516, + "step": 3507 + }, + { + "epoch": 0.5535742464888749, + "grad_norm": 0.5965337753295898, + "learning_rate": 4.6099247493560355e-06, + "loss": 0.5992, + "step": 3508 + }, + { + "epoch": 0.5537320498658671, + "grad_norm": 0.5890654921531677, + "learning_rate": 4.609701950350685e-06, + "loss": 0.5892, + "step": 3509 + }, + { + "epoch": 0.5538898532428594, + "grad_norm": 0.6097542643547058, + "learning_rate": 4.609479093122698e-06, + "loss": 0.5951, + "step": 3510 + }, + { + "epoch": 0.5540476566198517, + "grad_norm": 0.5507969856262207, + "learning_rate": 4.609256177678227e-06, + "loss": 0.5888, + "step": 3511 + }, + { + "epoch": 0.5542054599968439, + "grad_norm": 0.615475594997406, + "learning_rate": 4.609033204023422e-06, + "loss": 0.6104, + "step": 3512 + }, + { + "epoch": 0.5543632633738362, + "grad_norm": 0.6446845531463623, + "learning_rate": 4.608810172164438e-06, + "loss": 0.5386, + "step": 3513 + }, + { + "epoch": 0.5545210667508285, + "grad_norm": 0.5849553942680359, + "learning_rate": 4.608587082107429e-06, + "loss": 0.5973, + "step": 3514 + }, + { + "epoch": 0.5546788701278207, + "grad_norm": 0.5636834502220154, + "learning_rate": 4.608363933858552e-06, + "loss": 0.5482, + "step": 3515 + }, + { + "epoch": 0.554836673504813, + "grad_norm": 0.5873566269874573, + "learning_rate": 4.608140727423966e-06, + "loss": 0.5444, + "step": 3516 + }, + { + "epoch": 0.5549944768818053, + "grad_norm": 0.5607360601425171, + "learning_rate": 4.607917462809831e-06, + "loss": 0.6043, + "step": 3517 + }, + { + "epoch": 0.5551522802587976, + "grad_norm": 0.6019108295440674, + "learning_rate": 4.607694140022308e-06, + "loss": 0.6176, + "step": 3518 + }, + { + "epoch": 0.5553100836357898, + "grad_norm": 0.5860780477523804, + "learning_rate": 4.60747075906756e-06, + "loss": 0.5975, + "step": 3519 + }, + { + "epoch": 0.555467887012782, + "grad_norm": 0.5663607120513916, + "learning_rate": 4.607247319951752e-06, + "loss": 0.5769, + "step": 3520 + }, + { + "epoch": 0.5556256903897744, + "grad_norm": 0.575910747051239, + "learning_rate": 4.60702382268105e-06, + "loss": 0.5652, + "step": 3521 + }, + { + "epoch": 0.5557834937667666, + "grad_norm": 0.6011241674423218, + "learning_rate": 4.606800267261623e-06, + "loss": 0.5548, + "step": 3522 + }, + { + "epoch": 0.5559412971437588, + "grad_norm": 0.8058870434761047, + "learning_rate": 4.606576653699641e-06, + "loss": 0.5847, + "step": 3523 + }, + { + "epoch": 0.5560991005207512, + "grad_norm": 0.6073631644248962, + "learning_rate": 4.606352982001272e-06, + "loss": 0.5746, + "step": 3524 + }, + { + "epoch": 0.5562569038977434, + "grad_norm": 0.5794433951377869, + "learning_rate": 4.606129252172693e-06, + "loss": 0.5578, + "step": 3525 + }, + { + "epoch": 0.5564147072747356, + "grad_norm": 0.674240231513977, + "learning_rate": 4.605905464220075e-06, + "loss": 0.5471, + "step": 3526 + }, + { + "epoch": 0.556572510651728, + "grad_norm": 0.582699179649353, + "learning_rate": 4.605681618149595e-06, + "loss": 0.6179, + "step": 3527 + }, + { + "epoch": 0.5567303140287202, + "grad_norm": 0.6237499117851257, + "learning_rate": 4.605457713967433e-06, + "loss": 0.5322, + "step": 3528 + }, + { + "epoch": 0.5568881174057125, + "grad_norm": 0.5776008367538452, + "learning_rate": 4.605233751679765e-06, + "loss": 0.5995, + "step": 3529 + }, + { + "epoch": 0.5570459207827048, + "grad_norm": 0.5890386700630188, + "learning_rate": 4.6050097312927725e-06, + "loss": 0.59, + "step": 3530 + }, + { + "epoch": 0.557203724159697, + "grad_norm": 0.597798228263855, + "learning_rate": 4.604785652812639e-06, + "loss": 0.6039, + "step": 3531 + }, + { + "epoch": 0.5573615275366893, + "grad_norm": 0.5903854370117188, + "learning_rate": 4.604561516245548e-06, + "loss": 0.603, + "step": 3532 + }, + { + "epoch": 0.5575193309136816, + "grad_norm": 0.5857282280921936, + "learning_rate": 4.604337321597685e-06, + "loss": 0.5746, + "step": 3533 + }, + { + "epoch": 0.5576771342906738, + "grad_norm": 0.6053504943847656, + "learning_rate": 4.604113068875237e-06, + "loss": 0.5716, + "step": 3534 + }, + { + "epoch": 0.5578349376676661, + "grad_norm": 0.5921884775161743, + "learning_rate": 4.6038887580843935e-06, + "loss": 0.5665, + "step": 3535 + }, + { + "epoch": 0.5579927410446583, + "grad_norm": 0.6073192954063416, + "learning_rate": 4.603664389231344e-06, + "loss": 0.5615, + "step": 3536 + }, + { + "epoch": 0.5581505444216506, + "grad_norm": 0.5654953122138977, + "learning_rate": 4.603439962322281e-06, + "loss": 0.5674, + "step": 3537 + }, + { + "epoch": 0.5583083477986429, + "grad_norm": 0.5782532691955566, + "learning_rate": 4.6032154773633995e-06, + "loss": 0.5697, + "step": 3538 + }, + { + "epoch": 0.5584661511756351, + "grad_norm": 0.5987964272499084, + "learning_rate": 4.602990934360892e-06, + "loss": 0.5822, + "step": 3539 + }, + { + "epoch": 0.5586239545526275, + "grad_norm": 0.5911991596221924, + "learning_rate": 4.6027663333209585e-06, + "loss": 0.5588, + "step": 3540 + }, + { + "epoch": 0.5587817579296197, + "grad_norm": 0.5568180084228516, + "learning_rate": 4.602541674249794e-06, + "loss": 0.5768, + "step": 3541 + }, + { + "epoch": 0.5589395613066119, + "grad_norm": 0.5744420886039734, + "learning_rate": 4.602316957153601e-06, + "loss": 0.5902, + "step": 3542 + }, + { + "epoch": 0.5590973646836043, + "grad_norm": 0.6107819080352783, + "learning_rate": 4.602092182038581e-06, + "loss": 0.5979, + "step": 3543 + }, + { + "epoch": 0.5592551680605965, + "grad_norm": 0.581973671913147, + "learning_rate": 4.601867348910935e-06, + "loss": 0.554, + "step": 3544 + }, + { + "epoch": 0.5594129714375887, + "grad_norm": 0.5645467042922974, + "learning_rate": 4.601642457776872e-06, + "loss": 0.5928, + "step": 3545 + }, + { + "epoch": 0.5595707748145811, + "grad_norm": 0.6114698052406311, + "learning_rate": 4.601417508642594e-06, + "loss": 0.5873, + "step": 3546 + }, + { + "epoch": 0.5597285781915733, + "grad_norm": 0.5876627564430237, + "learning_rate": 4.6011925015143115e-06, + "loss": 0.5712, + "step": 3547 + }, + { + "epoch": 0.5598863815685655, + "grad_norm": 0.5812416076660156, + "learning_rate": 4.600967436398234e-06, + "loss": 0.5724, + "step": 3548 + }, + { + "epoch": 0.5600441849455579, + "grad_norm": 0.5708503127098083, + "learning_rate": 4.6007423133005725e-06, + "loss": 0.5741, + "step": 3549 + }, + { + "epoch": 0.5602019883225501, + "grad_norm": 0.5861811637878418, + "learning_rate": 4.600517132227539e-06, + "loss": 0.5606, + "step": 3550 + }, + { + "epoch": 0.5603597916995424, + "grad_norm": 0.6069638133049011, + "learning_rate": 4.600291893185349e-06, + "loss": 0.5953, + "step": 3551 + }, + { + "epoch": 0.5605175950765346, + "grad_norm": 0.5876982808113098, + "learning_rate": 4.600066596180219e-06, + "loss": 0.6185, + "step": 3552 + }, + { + "epoch": 0.5606753984535269, + "grad_norm": 0.5675768256187439, + "learning_rate": 4.5998412412183655e-06, + "loss": 0.5354, + "step": 3553 + }, + { + "epoch": 0.5608332018305192, + "grad_norm": 0.5776893496513367, + "learning_rate": 4.599615828306008e-06, + "loss": 0.5868, + "step": 3554 + }, + { + "epoch": 0.5609910052075114, + "grad_norm": 0.5875870585441589, + "learning_rate": 4.599390357449367e-06, + "loss": 0.583, + "step": 3555 + }, + { + "epoch": 0.5611488085845037, + "grad_norm": 0.5522879958152771, + "learning_rate": 4.5991648286546665e-06, + "loss": 0.5633, + "step": 3556 + }, + { + "epoch": 0.561306611961496, + "grad_norm": 0.5942817330360413, + "learning_rate": 4.598939241928128e-06, + "loss": 0.5949, + "step": 3557 + }, + { + "epoch": 0.5614644153384882, + "grad_norm": 0.6048849821090698, + "learning_rate": 4.59871359727598e-06, + "loss": 0.5784, + "step": 3558 + }, + { + "epoch": 0.5616222187154805, + "grad_norm": 0.5890201330184937, + "learning_rate": 4.598487894704449e-06, + "loss": 0.5949, + "step": 3559 + }, + { + "epoch": 0.5617800220924728, + "grad_norm": 0.6099526286125183, + "learning_rate": 4.598262134219762e-06, + "loss": 0.5552, + "step": 3560 + }, + { + "epoch": 0.561937825469465, + "grad_norm": 0.6305082440376282, + "learning_rate": 4.5980363158281504e-06, + "loss": 0.613, + "step": 3561 + }, + { + "epoch": 0.5620956288464574, + "grad_norm": 0.5561069250106812, + "learning_rate": 4.597810439535847e-06, + "loss": 0.5899, + "step": 3562 + }, + { + "epoch": 0.5622534322234496, + "grad_norm": 0.5757322311401367, + "learning_rate": 4.597584505349085e-06, + "loss": 0.6272, + "step": 3563 + }, + { + "epoch": 0.5624112356004418, + "grad_norm": 0.5916699767112732, + "learning_rate": 4.5973585132741e-06, + "loss": 0.5604, + "step": 3564 + }, + { + "epoch": 0.5625690389774342, + "grad_norm": 0.6050942540168762, + "learning_rate": 4.5971324633171276e-06, + "loss": 0.5633, + "step": 3565 + }, + { + "epoch": 0.5627268423544264, + "grad_norm": 0.580337643623352, + "learning_rate": 4.596906355484407e-06, + "loss": 0.5776, + "step": 3566 + }, + { + "epoch": 0.5628846457314186, + "grad_norm": 0.5655763149261475, + "learning_rate": 4.596680189782179e-06, + "loss": 0.6022, + "step": 3567 + }, + { + "epoch": 0.563042449108411, + "grad_norm": 0.6005344986915588, + "learning_rate": 4.596453966216684e-06, + "loss": 0.5786, + "step": 3568 + }, + { + "epoch": 0.5632002524854032, + "grad_norm": 0.6094874143600464, + "learning_rate": 4.596227684794166e-06, + "loss": 0.5747, + "step": 3569 + }, + { + "epoch": 0.5633580558623954, + "grad_norm": 0.567677915096283, + "learning_rate": 4.596001345520868e-06, + "loss": 0.6016, + "step": 3570 + }, + { + "epoch": 0.5635158592393877, + "grad_norm": 0.6194664239883423, + "learning_rate": 4.59577494840304e-06, + "loss": 0.6011, + "step": 3571 + }, + { + "epoch": 0.56367366261638, + "grad_norm": 0.5701452493667603, + "learning_rate": 4.5955484934469275e-06, + "loss": 0.577, + "step": 3572 + }, + { + "epoch": 0.5638314659933723, + "grad_norm": 0.5823641419410706, + "learning_rate": 4.59532198065878e-06, + "loss": 0.5864, + "step": 3573 + }, + { + "epoch": 0.5639892693703645, + "grad_norm": 0.5995796322822571, + "learning_rate": 4.59509541004485e-06, + "loss": 0.5949, + "step": 3574 + }, + { + "epoch": 0.5641470727473568, + "grad_norm": 0.6032016277313232, + "learning_rate": 4.594868781611388e-06, + "loss": 0.5993, + "step": 3575 + }, + { + "epoch": 0.5643048761243491, + "grad_norm": 0.6019800901412964, + "learning_rate": 4.594642095364652e-06, + "loss": 0.5801, + "step": 3576 + }, + { + "epoch": 0.5644626795013413, + "grad_norm": 0.5735248327255249, + "learning_rate": 4.594415351310895e-06, + "loss": 0.5695, + "step": 3577 + }, + { + "epoch": 0.5646204828783336, + "grad_norm": 0.573939323425293, + "learning_rate": 4.5941885494563744e-06, + "loss": 0.5932, + "step": 3578 + }, + { + "epoch": 0.5647782862553259, + "grad_norm": 0.5638232827186584, + "learning_rate": 4.593961689807351e-06, + "loss": 0.5782, + "step": 3579 + }, + { + "epoch": 0.5649360896323181, + "grad_norm": 0.5785738229751587, + "learning_rate": 4.5937347723700866e-06, + "loss": 0.5705, + "step": 3580 + }, + { + "epoch": 0.5650938930093103, + "grad_norm": 0.5962275266647339, + "learning_rate": 4.59350779715084e-06, + "loss": 0.5742, + "step": 3581 + }, + { + "epoch": 0.5652516963863027, + "grad_norm": 0.614898681640625, + "learning_rate": 4.5932807641558784e-06, + "loss": 0.5578, + "step": 3582 + }, + { + "epoch": 0.5654094997632949, + "grad_norm": 0.5971670746803284, + "learning_rate": 4.593053673391466e-06, + "loss": 0.581, + "step": 3583 + }, + { + "epoch": 0.5655673031402872, + "grad_norm": 0.60477614402771, + "learning_rate": 4.592826524863871e-06, + "loss": 0.6112, + "step": 3584 + }, + { + "epoch": 0.5657251065172795, + "grad_norm": 0.5579222440719604, + "learning_rate": 4.59259931857936e-06, + "loss": 0.5543, + "step": 3585 + }, + { + "epoch": 0.5658829098942717, + "grad_norm": 0.5710344910621643, + "learning_rate": 4.592372054544204e-06, + "loss": 0.6154, + "step": 3586 + }, + { + "epoch": 0.566040713271264, + "grad_norm": 0.5699630379676819, + "learning_rate": 4.5921447327646765e-06, + "loss": 0.5817, + "step": 3587 + }, + { + "epoch": 0.5661985166482563, + "grad_norm": 0.595397412776947, + "learning_rate": 4.59191735324705e-06, + "loss": 0.5839, + "step": 3588 + }, + { + "epoch": 0.5663563200252485, + "grad_norm": 0.6226270198822021, + "learning_rate": 4.591689915997599e-06, + "loss": 0.5906, + "step": 3589 + }, + { + "epoch": 0.5665141234022408, + "grad_norm": 0.5999671220779419, + "learning_rate": 4.591462421022601e-06, + "loss": 0.5768, + "step": 3590 + }, + { + "epoch": 0.5666719267792331, + "grad_norm": 0.5829411745071411, + "learning_rate": 4.591234868328335e-06, + "loss": 0.5929, + "step": 3591 + }, + { + "epoch": 0.5668297301562254, + "grad_norm": 0.5803195238113403, + "learning_rate": 4.59100725792108e-06, + "loss": 0.5675, + "step": 3592 + }, + { + "epoch": 0.5669875335332176, + "grad_norm": 0.6032700538635254, + "learning_rate": 4.590779589807117e-06, + "loss": 0.5562, + "step": 3593 + }, + { + "epoch": 0.5671453369102099, + "grad_norm": 0.6003465056419373, + "learning_rate": 4.59055186399273e-06, + "loss": 0.5902, + "step": 3594 + }, + { + "epoch": 0.5673031402872022, + "grad_norm": 0.5816687941551208, + "learning_rate": 4.590324080484204e-06, + "loss": 0.6008, + "step": 3595 + }, + { + "epoch": 0.5674609436641944, + "grad_norm": 0.5768327713012695, + "learning_rate": 4.590096239287823e-06, + "loss": 0.5945, + "step": 3596 + }, + { + "epoch": 0.5676187470411866, + "grad_norm": 0.6052454710006714, + "learning_rate": 4.589868340409877e-06, + "loss": 0.5712, + "step": 3597 + }, + { + "epoch": 0.567776550418179, + "grad_norm": 0.5699519515037537, + "learning_rate": 4.589640383856656e-06, + "loss": 0.5824, + "step": 3598 + }, + { + "epoch": 0.5679343537951712, + "grad_norm": 0.5635495781898499, + "learning_rate": 4.58941236963445e-06, + "loss": 0.5823, + "step": 3599 + }, + { + "epoch": 0.5680921571721634, + "grad_norm": 0.5801964998245239, + "learning_rate": 4.589184297749551e-06, + "loss": 0.5894, + "step": 3600 + }, + { + "epoch": 0.5682499605491558, + "grad_norm": 0.5521343350410461, + "learning_rate": 4.588956168208253e-06, + "loss": 0.5739, + "step": 3601 + }, + { + "epoch": 0.568407763926148, + "grad_norm": 0.6782090067863464, + "learning_rate": 4.588727981016854e-06, + "loss": 0.5886, + "step": 3602 + }, + { + "epoch": 0.5685655673031403, + "grad_norm": 0.6027691960334778, + "learning_rate": 4.588499736181649e-06, + "loss": 0.5768, + "step": 3603 + }, + { + "epoch": 0.5687233706801326, + "grad_norm": 0.5791019201278687, + "learning_rate": 4.588271433708938e-06, + "loss": 0.5995, + "step": 3604 + }, + { + "epoch": 0.5688811740571248, + "grad_norm": 0.5896698832511902, + "learning_rate": 4.588043073605023e-06, + "loss": 0.6053, + "step": 3605 + }, + { + "epoch": 0.5690389774341171, + "grad_norm": 0.5979366898536682, + "learning_rate": 4.587814655876204e-06, + "loss": 0.6082, + "step": 3606 + }, + { + "epoch": 0.5691967808111094, + "grad_norm": 0.5887660980224609, + "learning_rate": 4.587586180528786e-06, + "loss": 0.5876, + "step": 3607 + }, + { + "epoch": 0.5693545841881016, + "grad_norm": 0.5612056255340576, + "learning_rate": 4.587357647569074e-06, + "loss": 0.5514, + "step": 3608 + }, + { + "epoch": 0.5695123875650939, + "grad_norm": 0.5932966470718384, + "learning_rate": 4.587129057003374e-06, + "loss": 0.5755, + "step": 3609 + }, + { + "epoch": 0.5696701909420862, + "grad_norm": 0.5960654616355896, + "learning_rate": 4.586900408837997e-06, + "loss": 0.6114, + "step": 3610 + }, + { + "epoch": 0.5698279943190784, + "grad_norm": 0.5877953767776489, + "learning_rate": 4.586671703079249e-06, + "loss": 0.5482, + "step": 3611 + }, + { + "epoch": 0.5699857976960707, + "grad_norm": 0.5679347515106201, + "learning_rate": 4.586442939733447e-06, + "loss": 0.5973, + "step": 3612 + }, + { + "epoch": 0.5701436010730629, + "grad_norm": 0.5889701843261719, + "learning_rate": 4.5862141188069005e-06, + "loss": 0.5585, + "step": 3613 + }, + { + "epoch": 0.5703014044500553, + "grad_norm": 0.605929970741272, + "learning_rate": 4.585985240305926e-06, + "loss": 0.5736, + "step": 3614 + }, + { + "epoch": 0.5704592078270475, + "grad_norm": 0.6439168453216553, + "learning_rate": 4.585756304236839e-06, + "loss": 0.5528, + "step": 3615 + }, + { + "epoch": 0.5706170112040397, + "grad_norm": 0.6088292002677917, + "learning_rate": 4.585527310605959e-06, + "loss": 0.5934, + "step": 3616 + }, + { + "epoch": 0.5707748145810321, + "grad_norm": 0.5990124940872192, + "learning_rate": 4.585298259419604e-06, + "loss": 0.5761, + "step": 3617 + }, + { + "epoch": 0.5709326179580243, + "grad_norm": 0.5953870415687561, + "learning_rate": 4.5850691506840965e-06, + "loss": 0.5691, + "step": 3618 + }, + { + "epoch": 0.5710904213350165, + "grad_norm": 0.5827488899230957, + "learning_rate": 4.5848399844057575e-06, + "loss": 0.6074, + "step": 3619 + }, + { + "epoch": 0.5712482247120089, + "grad_norm": 0.5685185790061951, + "learning_rate": 4.584610760590915e-06, + "loss": 0.6111, + "step": 3620 + }, + { + "epoch": 0.5714060280890011, + "grad_norm": 0.591812252998352, + "learning_rate": 4.584381479245891e-06, + "loss": 0.5787, + "step": 3621 + }, + { + "epoch": 0.5715638314659933, + "grad_norm": 0.6064899563789368, + "learning_rate": 4.584152140377016e-06, + "loss": 0.5993, + "step": 3622 + }, + { + "epoch": 0.5717216348429857, + "grad_norm": 0.5736068487167358, + "learning_rate": 4.583922743990617e-06, + "loss": 0.5915, + "step": 3623 + }, + { + "epoch": 0.5718794382199779, + "grad_norm": 0.58894944190979, + "learning_rate": 4.583693290093028e-06, + "loss": 0.5677, + "step": 3624 + }, + { + "epoch": 0.5720372415969702, + "grad_norm": 0.5928305983543396, + "learning_rate": 4.583463778690577e-06, + "loss": 0.5993, + "step": 3625 + }, + { + "epoch": 0.5721950449739625, + "grad_norm": 0.5938430428504944, + "learning_rate": 4.583234209789602e-06, + "loss": 0.5545, + "step": 3626 + }, + { + "epoch": 0.5723528483509547, + "grad_norm": 0.5968641042709351, + "learning_rate": 4.583004583396436e-06, + "loss": 0.579, + "step": 3627 + }, + { + "epoch": 0.572510651727947, + "grad_norm": 0.5961958765983582, + "learning_rate": 4.582774899517417e-06, + "loss": 0.6232, + "step": 3628 + }, + { + "epoch": 0.5726684551049392, + "grad_norm": 0.616324245929718, + "learning_rate": 4.582545158158883e-06, + "loss": 0.5631, + "step": 3629 + }, + { + "epoch": 0.5728262584819315, + "grad_norm": 0.5693817138671875, + "learning_rate": 4.5823153593271754e-06, + "loss": 0.5856, + "step": 3630 + }, + { + "epoch": 0.5729840618589238, + "grad_norm": 0.5796594023704529, + "learning_rate": 4.582085503028636e-06, + "loss": 0.592, + "step": 3631 + }, + { + "epoch": 0.573141865235916, + "grad_norm": 0.5734019875526428, + "learning_rate": 4.581855589269608e-06, + "loss": 0.5876, + "step": 3632 + }, + { + "epoch": 0.5732996686129083, + "grad_norm": 0.5982965230941772, + "learning_rate": 4.581625618056436e-06, + "loss": 0.5979, + "step": 3633 + }, + { + "epoch": 0.5734574719899006, + "grad_norm": 0.6293798089027405, + "learning_rate": 4.581395589395467e-06, + "loss": 0.5752, + "step": 3634 + }, + { + "epoch": 0.5736152753668928, + "grad_norm": 0.5803194642066956, + "learning_rate": 4.581165503293049e-06, + "loss": 0.5701, + "step": 3635 + }, + { + "epoch": 0.5737730787438852, + "grad_norm": 0.6119219064712524, + "learning_rate": 4.580935359755532e-06, + "loss": 0.5717, + "step": 3636 + }, + { + "epoch": 0.5739308821208774, + "grad_norm": 0.5986905097961426, + "learning_rate": 4.580705158789267e-06, + "loss": 0.5867, + "step": 3637 + }, + { + "epoch": 0.5740886854978696, + "grad_norm": 0.588316798210144, + "learning_rate": 4.580474900400609e-06, + "loss": 0.5936, + "step": 3638 + }, + { + "epoch": 0.574246488874862, + "grad_norm": 0.5813804268836975, + "learning_rate": 4.5802445845959094e-06, + "loss": 0.5961, + "step": 3639 + }, + { + "epoch": 0.5744042922518542, + "grad_norm": 0.6168195009231567, + "learning_rate": 4.5800142113815275e-06, + "loss": 0.5957, + "step": 3640 + }, + { + "epoch": 0.5745620956288464, + "grad_norm": 0.5764904022216797, + "learning_rate": 4.579783780763818e-06, + "loss": 0.5934, + "step": 3641 + }, + { + "epoch": 0.5747198990058388, + "grad_norm": 0.5831480026245117, + "learning_rate": 4.579553292749143e-06, + "loss": 0.5639, + "step": 3642 + }, + { + "epoch": 0.574877702382831, + "grad_norm": 0.5893843770027161, + "learning_rate": 4.579322747343861e-06, + "loss": 0.5743, + "step": 3643 + }, + { + "epoch": 0.5750355057598232, + "grad_norm": 0.5667629837989807, + "learning_rate": 4.579092144554336e-06, + "loss": 0.599, + "step": 3644 + }, + { + "epoch": 0.5751933091368155, + "grad_norm": 0.5928376913070679, + "learning_rate": 4.578861484386931e-06, + "loss": 0.5911, + "step": 3645 + }, + { + "epoch": 0.5753511125138078, + "grad_norm": 0.614314079284668, + "learning_rate": 4.5786307668480135e-06, + "loss": 0.5497, + "step": 3646 + }, + { + "epoch": 0.5755089158908001, + "grad_norm": 0.5978164076805115, + "learning_rate": 4.578399991943949e-06, + "loss": 0.5595, + "step": 3647 + }, + { + "epoch": 0.5756667192677923, + "grad_norm": 0.574599027633667, + "learning_rate": 4.578169159681107e-06, + "loss": 0.5674, + "step": 3648 + }, + { + "epoch": 0.5758245226447846, + "grad_norm": 0.5603082180023193, + "learning_rate": 4.5779382700658585e-06, + "loss": 0.5518, + "step": 3649 + }, + { + "epoch": 0.5759823260217769, + "grad_norm": 0.5573793649673462, + "learning_rate": 4.5777073231045745e-06, + "loss": 0.5607, + "step": 3650 + }, + { + "epoch": 0.5761401293987691, + "grad_norm": 0.5826743245124817, + "learning_rate": 4.577476318803628e-06, + "loss": 0.5761, + "step": 3651 + }, + { + "epoch": 0.5762979327757614, + "grad_norm": 0.5720070600509644, + "learning_rate": 4.577245257169396e-06, + "loss": 0.5972, + "step": 3652 + }, + { + "epoch": 0.5764557361527537, + "grad_norm": 0.5812878012657166, + "learning_rate": 4.577014138208254e-06, + "loss": 0.5398, + "step": 3653 + }, + { + "epoch": 0.5766135395297459, + "grad_norm": 0.596768856048584, + "learning_rate": 4.57678296192658e-06, + "loss": 0.6126, + "step": 3654 + }, + { + "epoch": 0.5767713429067381, + "grad_norm": 0.5826660394668579, + "learning_rate": 4.576551728330755e-06, + "loss": 0.6333, + "step": 3655 + }, + { + "epoch": 0.5769291462837305, + "grad_norm": 0.5906676650047302, + "learning_rate": 4.5763204374271605e-06, + "loss": 0.6049, + "step": 3656 + }, + { + "epoch": 0.5770869496607227, + "grad_norm": 0.6233271360397339, + "learning_rate": 4.576089089222179e-06, + "loss": 0.5847, + "step": 3657 + }, + { + "epoch": 0.577244753037715, + "grad_norm": 0.5762131810188293, + "learning_rate": 4.575857683722194e-06, + "loss": 0.5768, + "step": 3658 + }, + { + "epoch": 0.5774025564147073, + "grad_norm": 0.5941563248634338, + "learning_rate": 4.575626220933594e-06, + "loss": 0.5557, + "step": 3659 + }, + { + "epoch": 0.5775603597916995, + "grad_norm": 0.5929994583129883, + "learning_rate": 4.575394700862766e-06, + "loss": 0.591, + "step": 3660 + }, + { + "epoch": 0.5777181631686918, + "grad_norm": 0.5700281262397766, + "learning_rate": 4.575163123516099e-06, + "loss": 0.5817, + "step": 3661 + }, + { + "epoch": 0.5778759665456841, + "grad_norm": 0.5929579138755798, + "learning_rate": 4.5749314888999826e-06, + "loss": 0.5426, + "step": 3662 + }, + { + "epoch": 0.5780337699226763, + "grad_norm": 0.5984724164009094, + "learning_rate": 4.574699797020813e-06, + "loss": 0.5473, + "step": 3663 + }, + { + "epoch": 0.5781915732996686, + "grad_norm": 0.5878458023071289, + "learning_rate": 4.574468047884981e-06, + "loss": 0.6126, + "step": 3664 + }, + { + "epoch": 0.5783493766766609, + "grad_norm": 0.5652299523353577, + "learning_rate": 4.574236241498884e-06, + "loss": 0.5932, + "step": 3665 + }, + { + "epoch": 0.5785071800536532, + "grad_norm": 0.5749137997627258, + "learning_rate": 4.5740043778689184e-06, + "loss": 0.5725, + "step": 3666 + }, + { + "epoch": 0.5786649834306454, + "grad_norm": 0.5619671940803528, + "learning_rate": 4.573772457001483e-06, + "loss": 0.5851, + "step": 3667 + }, + { + "epoch": 0.5788227868076377, + "grad_norm": 0.6126790642738342, + "learning_rate": 4.5735404789029795e-06, + "loss": 0.579, + "step": 3668 + }, + { + "epoch": 0.57898059018463, + "grad_norm": 0.60673588514328, + "learning_rate": 4.573308443579808e-06, + "loss": 0.5798, + "step": 3669 + }, + { + "epoch": 0.5791383935616222, + "grad_norm": 0.6042709350585938, + "learning_rate": 4.573076351038375e-06, + "loss": 0.559, + "step": 3670 + }, + { + "epoch": 0.5792961969386144, + "grad_norm": 0.6232263445854187, + "learning_rate": 4.572844201285083e-06, + "loss": 0.5347, + "step": 3671 + }, + { + "epoch": 0.5794540003156068, + "grad_norm": 0.6100486516952515, + "learning_rate": 4.572611994326339e-06, + "loss": 0.5683, + "step": 3672 + }, + { + "epoch": 0.579611803692599, + "grad_norm": 0.5802386999130249, + "learning_rate": 4.572379730168553e-06, + "loss": 0.5623, + "step": 3673 + }, + { + "epoch": 0.5797696070695912, + "grad_norm": 0.5928777456283569, + "learning_rate": 4.572147408818134e-06, + "loss": 0.5907, + "step": 3674 + }, + { + "epoch": 0.5799274104465836, + "grad_norm": 0.5862278938293457, + "learning_rate": 4.571915030281493e-06, + "loss": 0.5979, + "step": 3675 + }, + { + "epoch": 0.5800852138235758, + "grad_norm": 0.6007152199745178, + "learning_rate": 4.571682594565045e-06, + "loss": 0.562, + "step": 3676 + }, + { + "epoch": 0.5802430172005681, + "grad_norm": 0.5687905550003052, + "learning_rate": 4.571450101675201e-06, + "loss": 0.5545, + "step": 3677 + }, + { + "epoch": 0.5804008205775604, + "grad_norm": 0.5752583146095276, + "learning_rate": 4.571217551618381e-06, + "loss": 0.5761, + "step": 3678 + }, + { + "epoch": 0.5805586239545526, + "grad_norm": 0.6092543601989746, + "learning_rate": 4.570984944401e-06, + "loss": 0.5653, + "step": 3679 + }, + { + "epoch": 0.5807164273315449, + "grad_norm": 0.5830124020576477, + "learning_rate": 4.57075228002948e-06, + "loss": 0.5604, + "step": 3680 + }, + { + "epoch": 0.5808742307085372, + "grad_norm": 0.5970211625099182, + "learning_rate": 4.5705195585102405e-06, + "loss": 0.5801, + "step": 3681 + }, + { + "epoch": 0.5810320340855294, + "grad_norm": 0.5669752955436707, + "learning_rate": 4.570286779849703e-06, + "loss": 0.5291, + "step": 3682 + }, + { + "epoch": 0.5811898374625217, + "grad_norm": 0.59797602891922, + "learning_rate": 4.570053944054294e-06, + "loss": 0.5992, + "step": 3683 + }, + { + "epoch": 0.581347640839514, + "grad_norm": 0.5967462658882141, + "learning_rate": 4.569821051130437e-06, + "loss": 0.6048, + "step": 3684 + }, + { + "epoch": 0.5815054442165062, + "grad_norm": 0.6230027675628662, + "learning_rate": 4.569588101084562e-06, + "loss": 0.5898, + "step": 3685 + }, + { + "epoch": 0.5816632475934985, + "grad_norm": 0.5763297080993652, + "learning_rate": 4.569355093923094e-06, + "loss": 0.5809, + "step": 3686 + }, + { + "epoch": 0.5818210509704907, + "grad_norm": 0.5810040235519409, + "learning_rate": 4.569122029652467e-06, + "loss": 0.5795, + "step": 3687 + }, + { + "epoch": 0.5819788543474831, + "grad_norm": 0.6026908159255981, + "learning_rate": 4.568888908279111e-06, + "loss": 0.5582, + "step": 3688 + }, + { + "epoch": 0.5821366577244753, + "grad_norm": 0.5699719786643982, + "learning_rate": 4.568655729809461e-06, + "loss": 0.603, + "step": 3689 + }, + { + "epoch": 0.5822944611014675, + "grad_norm": 0.5733830332756042, + "learning_rate": 4.568422494249951e-06, + "loss": 0.5854, + "step": 3690 + }, + { + "epoch": 0.5824522644784599, + "grad_norm": 0.6089893579483032, + "learning_rate": 4.5681892016070175e-06, + "loss": 0.5617, + "step": 3691 + }, + { + "epoch": 0.5826100678554521, + "grad_norm": 0.6064369082450867, + "learning_rate": 4.5679558518870995e-06, + "loss": 0.5668, + "step": 3692 + }, + { + "epoch": 0.5827678712324443, + "grad_norm": 0.5844938158988953, + "learning_rate": 4.567722445096637e-06, + "loss": 0.5646, + "step": 3693 + }, + { + "epoch": 0.5829256746094367, + "grad_norm": 0.5992715954780579, + "learning_rate": 4.567488981242071e-06, + "loss": 0.5582, + "step": 3694 + }, + { + "epoch": 0.5830834779864289, + "grad_norm": 0.5837608575820923, + "learning_rate": 4.567255460329845e-06, + "loss": 0.594, + "step": 3695 + }, + { + "epoch": 0.5832412813634211, + "grad_norm": 0.5823114514350891, + "learning_rate": 4.567021882366403e-06, + "loss": 0.5887, + "step": 3696 + }, + { + "epoch": 0.5833990847404135, + "grad_norm": 0.594412088394165, + "learning_rate": 4.566788247358192e-06, + "loss": 0.5889, + "step": 3697 + }, + { + "epoch": 0.5835568881174057, + "grad_norm": 0.6130637526512146, + "learning_rate": 4.566554555311658e-06, + "loss": 0.5667, + "step": 3698 + }, + { + "epoch": 0.583714691494398, + "grad_norm": 0.5759096145629883, + "learning_rate": 4.5663208062332535e-06, + "loss": 0.5574, + "step": 3699 + }, + { + "epoch": 0.5838724948713903, + "grad_norm": 0.5816487669944763, + "learning_rate": 4.5660870001294254e-06, + "loss": 0.6009, + "step": 3700 + }, + { + "epoch": 0.5840302982483825, + "grad_norm": 0.6259716153144836, + "learning_rate": 4.56585313700663e-06, + "loss": 0.5982, + "step": 3701 + }, + { + "epoch": 0.5841881016253748, + "grad_norm": 0.6218380331993103, + "learning_rate": 4.5656192168713184e-06, + "loss": 0.581, + "step": 3702 + }, + { + "epoch": 0.584345905002367, + "grad_norm": 0.5941224694252014, + "learning_rate": 4.565385239729949e-06, + "loss": 0.5877, + "step": 3703 + }, + { + "epoch": 0.5845037083793593, + "grad_norm": 0.5733585953712463, + "learning_rate": 4.565151205588977e-06, + "loss": 0.5904, + "step": 3704 + }, + { + "epoch": 0.5846615117563516, + "grad_norm": 0.5898370742797852, + "learning_rate": 4.564917114454861e-06, + "loss": 0.5575, + "step": 3705 + }, + { + "epoch": 0.5848193151333438, + "grad_norm": 0.610420286655426, + "learning_rate": 4.564682966334062e-06, + "loss": 0.57, + "step": 3706 + }, + { + "epoch": 0.5849771185103361, + "grad_norm": 0.5888358354568481, + "learning_rate": 4.5644487612330425e-06, + "loss": 0.5895, + "step": 3707 + }, + { + "epoch": 0.5851349218873284, + "grad_norm": 0.5959503650665283, + "learning_rate": 4.564214499158265e-06, + "loss": 0.6011, + "step": 3708 + }, + { + "epoch": 0.5852927252643206, + "grad_norm": 0.5953145623207092, + "learning_rate": 4.5639801801161955e-06, + "loss": 0.5796, + "step": 3709 + }, + { + "epoch": 0.585450528641313, + "grad_norm": 0.5997985005378723, + "learning_rate": 4.5637458041133e-06, + "loss": 0.6146, + "step": 3710 + }, + { + "epoch": 0.5856083320183052, + "grad_norm": 0.6012387871742249, + "learning_rate": 4.563511371156047e-06, + "loss": 0.622, + "step": 3711 + }, + { + "epoch": 0.5857661353952974, + "grad_norm": 0.651524007320404, + "learning_rate": 4.563276881250905e-06, + "loss": 0.6272, + "step": 3712 + }, + { + "epoch": 0.5859239387722898, + "grad_norm": 0.5606479644775391, + "learning_rate": 4.563042334404347e-06, + "loss": 0.5841, + "step": 3713 + }, + { + "epoch": 0.586081742149282, + "grad_norm": 0.6158187985420227, + "learning_rate": 4.562807730622845e-06, + "loss": 0.5771, + "step": 3714 + }, + { + "epoch": 0.5862395455262742, + "grad_norm": 0.5742101669311523, + "learning_rate": 4.562573069912874e-06, + "loss": 0.5727, + "step": 3715 + }, + { + "epoch": 0.5863973489032666, + "grad_norm": 0.5960695147514343, + "learning_rate": 4.5623383522809106e-06, + "loss": 0.601, + "step": 3716 + }, + { + "epoch": 0.5865551522802588, + "grad_norm": 0.5894970297813416, + "learning_rate": 4.562103577733431e-06, + "loss": 0.5927, + "step": 3717 + }, + { + "epoch": 0.586712955657251, + "grad_norm": 0.5972762703895569, + "learning_rate": 4.5618687462769165e-06, + "loss": 0.5811, + "step": 3718 + }, + { + "epoch": 0.5868707590342433, + "grad_norm": 0.6081821322441101, + "learning_rate": 4.561633857917845e-06, + "loss": 0.6141, + "step": 3719 + }, + { + "epoch": 0.5870285624112356, + "grad_norm": 0.5570045709609985, + "learning_rate": 4.561398912662701e-06, + "loss": 0.5677, + "step": 3720 + }, + { + "epoch": 0.5871863657882279, + "grad_norm": 0.5940255522727966, + "learning_rate": 4.561163910517968e-06, + "loss": 0.5824, + "step": 3721 + }, + { + "epoch": 0.5873441691652201, + "grad_norm": 0.5730314254760742, + "learning_rate": 4.560928851490131e-06, + "loss": 0.5938, + "step": 3722 + }, + { + "epoch": 0.5875019725422124, + "grad_norm": 0.6021329760551453, + "learning_rate": 4.560693735585678e-06, + "loss": 0.5739, + "step": 3723 + }, + { + "epoch": 0.5876597759192047, + "grad_norm": 0.6110649108886719, + "learning_rate": 4.560458562811095e-06, + "loss": 0.6462, + "step": 3724 + }, + { + "epoch": 0.5878175792961969, + "grad_norm": 0.5576123595237732, + "learning_rate": 4.560223333172876e-06, + "loss": 0.544, + "step": 3725 + }, + { + "epoch": 0.5879753826731892, + "grad_norm": 0.5756410360336304, + "learning_rate": 4.559988046677511e-06, + "loss": 0.5671, + "step": 3726 + }, + { + "epoch": 0.5881331860501815, + "grad_norm": 0.5532965064048767, + "learning_rate": 4.5597527033314914e-06, + "loss": 0.6027, + "step": 3727 + }, + { + "epoch": 0.5882909894271737, + "grad_norm": 0.6108563542366028, + "learning_rate": 4.5595173031413155e-06, + "loss": 0.5868, + "step": 3728 + }, + { + "epoch": 0.588448792804166, + "grad_norm": 0.6474381685256958, + "learning_rate": 4.559281846113478e-06, + "loss": 0.5845, + "step": 3729 + }, + { + "epoch": 0.5886065961811583, + "grad_norm": 0.5918341875076294, + "learning_rate": 4.559046332254477e-06, + "loss": 0.5961, + "step": 3730 + }, + { + "epoch": 0.5887643995581505, + "grad_norm": 0.6095003485679626, + "learning_rate": 4.558810761570812e-06, + "loss": 0.6015, + "step": 3731 + }, + { + "epoch": 0.5889222029351429, + "grad_norm": 0.5721487998962402, + "learning_rate": 4.558575134068985e-06, + "loss": 0.5973, + "step": 3732 + }, + { + "epoch": 0.5890800063121351, + "grad_norm": 0.6275439262390137, + "learning_rate": 4.558339449755499e-06, + "loss": 0.5903, + "step": 3733 + }, + { + "epoch": 0.5892378096891273, + "grad_norm": 0.5890410542488098, + "learning_rate": 4.558103708636856e-06, + "loss": 0.5995, + "step": 3734 + }, + { + "epoch": 0.5893956130661197, + "grad_norm": 0.6170664429664612, + "learning_rate": 4.5578679107195645e-06, + "loss": 0.5998, + "step": 3735 + }, + { + "epoch": 0.5895534164431119, + "grad_norm": 0.6380425691604614, + "learning_rate": 4.557632056010131e-06, + "loss": 0.6225, + "step": 3736 + }, + { + "epoch": 0.5897112198201041, + "grad_norm": 0.5897258520126343, + "learning_rate": 4.557396144515063e-06, + "loss": 0.5938, + "step": 3737 + }, + { + "epoch": 0.5898690231970964, + "grad_norm": 0.5714899301528931, + "learning_rate": 4.557160176240874e-06, + "loss": 0.5998, + "step": 3738 + }, + { + "epoch": 0.5900268265740887, + "grad_norm": 0.5654438138008118, + "learning_rate": 4.556924151194073e-06, + "loss": 0.5734, + "step": 3739 + }, + { + "epoch": 0.590184629951081, + "grad_norm": 0.5701290965080261, + "learning_rate": 4.5566880693811764e-06, + "loss": 0.5755, + "step": 3740 + }, + { + "epoch": 0.5903424333280732, + "grad_norm": 0.5882145166397095, + "learning_rate": 4.556451930808697e-06, + "loss": 0.5695, + "step": 3741 + }, + { + "epoch": 0.5905002367050655, + "grad_norm": 0.5634150505065918, + "learning_rate": 4.556215735483155e-06, + "loss": 0.5671, + "step": 3742 + }, + { + "epoch": 0.5906580400820578, + "grad_norm": 0.5656006932258606, + "learning_rate": 4.555979483411066e-06, + "loss": 0.5582, + "step": 3743 + }, + { + "epoch": 0.59081584345905, + "grad_norm": 0.5715301632881165, + "learning_rate": 4.555743174598951e-06, + "loss": 0.601, + "step": 3744 + }, + { + "epoch": 0.5909736468360423, + "grad_norm": 0.6052255034446716, + "learning_rate": 4.55550680905333e-06, + "loss": 0.5959, + "step": 3745 + }, + { + "epoch": 0.5911314502130346, + "grad_norm": 0.5900365114212036, + "learning_rate": 4.55527038678073e-06, + "loss": 0.581, + "step": 3746 + }, + { + "epoch": 0.5912892535900268, + "grad_norm": 0.5819572806358337, + "learning_rate": 4.555033907787672e-06, + "loss": 0.5595, + "step": 3747 + }, + { + "epoch": 0.591447056967019, + "grad_norm": 0.5998021960258484, + "learning_rate": 4.554797372080683e-06, + "loss": 0.5454, + "step": 3748 + }, + { + "epoch": 0.5916048603440114, + "grad_norm": 0.5786018967628479, + "learning_rate": 4.554560779666292e-06, + "loss": 0.6024, + "step": 3749 + }, + { + "epoch": 0.5917626637210036, + "grad_norm": 0.5915243625640869, + "learning_rate": 4.554324130551027e-06, + "loss": 0.5753, + "step": 3750 + }, + { + "epoch": 0.591920467097996, + "grad_norm": 0.57907634973526, + "learning_rate": 4.554087424741419e-06, + "loss": 0.6013, + "step": 3751 + }, + { + "epoch": 0.5920782704749882, + "grad_norm": 0.5939270853996277, + "learning_rate": 4.553850662244002e-06, + "loss": 0.5711, + "step": 3752 + }, + { + "epoch": 0.5922360738519804, + "grad_norm": 0.5598222613334656, + "learning_rate": 4.553613843065309e-06, + "loss": 0.6082, + "step": 3753 + }, + { + "epoch": 0.5923938772289727, + "grad_norm": 0.5905656218528748, + "learning_rate": 4.553376967211876e-06, + "loss": 0.5857, + "step": 3754 + }, + { + "epoch": 0.592551680605965, + "grad_norm": 0.5718918442726135, + "learning_rate": 4.55314003469024e-06, + "loss": 0.5407, + "step": 3755 + }, + { + "epoch": 0.5927094839829572, + "grad_norm": 0.5700700879096985, + "learning_rate": 4.552903045506939e-06, + "loss": 0.5878, + "step": 3756 + }, + { + "epoch": 0.5928672873599495, + "grad_norm": 0.5946148633956909, + "learning_rate": 4.5526659996685145e-06, + "loss": 0.5853, + "step": 3757 + }, + { + "epoch": 0.5930250907369418, + "grad_norm": 0.6016959547996521, + "learning_rate": 4.552428897181508e-06, + "loss": 0.5883, + "step": 3758 + }, + { + "epoch": 0.593182894113934, + "grad_norm": 0.5670149922370911, + "learning_rate": 4.552191738052462e-06, + "loss": 0.5665, + "step": 3759 + }, + { + "epoch": 0.5933406974909263, + "grad_norm": 0.6051489114761353, + "learning_rate": 4.551954522287923e-06, + "loss": 0.5815, + "step": 3760 + }, + { + "epoch": 0.5934985008679186, + "grad_norm": 0.5746898055076599, + "learning_rate": 4.551717249894437e-06, + "loss": 0.5654, + "step": 3761 + }, + { + "epoch": 0.5936563042449109, + "grad_norm": 0.5906969308853149, + "learning_rate": 4.551479920878553e-06, + "loss": 0.5942, + "step": 3762 + }, + { + "epoch": 0.5938141076219031, + "grad_norm": 0.5809630751609802, + "learning_rate": 4.551242535246819e-06, + "loss": 0.5737, + "step": 3763 + }, + { + "epoch": 0.5939719109988953, + "grad_norm": 0.5660688877105713, + "learning_rate": 4.551005093005787e-06, + "loss": 0.5915, + "step": 3764 + }, + { + "epoch": 0.5941297143758877, + "grad_norm": 0.6081724762916565, + "learning_rate": 4.55076759416201e-06, + "loss": 0.5616, + "step": 3765 + }, + { + "epoch": 0.5942875177528799, + "grad_norm": 0.5844887495040894, + "learning_rate": 4.550530038722042e-06, + "loss": 0.6028, + "step": 3766 + }, + { + "epoch": 0.5944453211298721, + "grad_norm": 0.5975540280342102, + "learning_rate": 4.55029242669244e-06, + "loss": 0.5862, + "step": 3767 + }, + { + "epoch": 0.5946031245068645, + "grad_norm": 0.5809643864631653, + "learning_rate": 4.5500547580797594e-06, + "loss": 0.6031, + "step": 3768 + }, + { + "epoch": 0.5947609278838567, + "grad_norm": 0.5830026268959045, + "learning_rate": 4.549817032890562e-06, + "loss": 0.5831, + "step": 3769 + }, + { + "epoch": 0.5949187312608489, + "grad_norm": 0.5749611258506775, + "learning_rate": 4.549579251131406e-06, + "loss": 0.5808, + "step": 3770 + }, + { + "epoch": 0.5950765346378413, + "grad_norm": 0.5873735547065735, + "learning_rate": 4.549341412808856e-06, + "loss": 0.5589, + "step": 3771 + }, + { + "epoch": 0.5952343380148335, + "grad_norm": 0.6135457158088684, + "learning_rate": 4.549103517929473e-06, + "loss": 0.5463, + "step": 3772 + }, + { + "epoch": 0.5953921413918258, + "grad_norm": 0.5792906284332275, + "learning_rate": 4.548865566499825e-06, + "loss": 0.6288, + "step": 3773 + }, + { + "epoch": 0.5955499447688181, + "grad_norm": 0.6209877729415894, + "learning_rate": 4.548627558526477e-06, + "loss": 0.5828, + "step": 3774 + }, + { + "epoch": 0.5957077481458103, + "grad_norm": 0.6186349987983704, + "learning_rate": 4.548389494015998e-06, + "loss": 0.5705, + "step": 3775 + }, + { + "epoch": 0.5958655515228026, + "grad_norm": 0.6223840713500977, + "learning_rate": 4.548151372974958e-06, + "loss": 0.5705, + "step": 3776 + }, + { + "epoch": 0.5960233548997949, + "grad_norm": 0.586026668548584, + "learning_rate": 4.547913195409929e-06, + "loss": 0.5498, + "step": 3777 + }, + { + "epoch": 0.5961811582767871, + "grad_norm": 0.611493706703186, + "learning_rate": 4.547674961327483e-06, + "loss": 0.6034, + "step": 3778 + }, + { + "epoch": 0.5963389616537794, + "grad_norm": 0.5982754826545715, + "learning_rate": 4.547436670734196e-06, + "loss": 0.5915, + "step": 3779 + }, + { + "epoch": 0.5964967650307716, + "grad_norm": 0.5907241106033325, + "learning_rate": 4.547198323636644e-06, + "loss": 0.5668, + "step": 3780 + }, + { + "epoch": 0.5966545684077639, + "grad_norm": 0.5751412510871887, + "learning_rate": 4.546959920041404e-06, + "loss": 0.5846, + "step": 3781 + }, + { + "epoch": 0.5968123717847562, + "grad_norm": 0.6206004023551941, + "learning_rate": 4.546721459955056e-06, + "loss": 0.5727, + "step": 3782 + }, + { + "epoch": 0.5969701751617484, + "grad_norm": 0.5651943683624268, + "learning_rate": 4.54648294338418e-06, + "loss": 0.5794, + "step": 3783 + }, + { + "epoch": 0.5971279785387408, + "grad_norm": 0.5790079236030579, + "learning_rate": 4.546244370335361e-06, + "loss": 0.5769, + "step": 3784 + }, + { + "epoch": 0.597285781915733, + "grad_norm": 0.6060269474983215, + "learning_rate": 4.546005740815179e-06, + "loss": 0.5936, + "step": 3785 + }, + { + "epoch": 0.5974435852927252, + "grad_norm": 0.5759556889533997, + "learning_rate": 4.5457670548302235e-06, + "loss": 0.6035, + "step": 3786 + }, + { + "epoch": 0.5976013886697176, + "grad_norm": 0.5721073746681213, + "learning_rate": 4.54552831238708e-06, + "loss": 0.5916, + "step": 3787 + }, + { + "epoch": 0.5977591920467098, + "grad_norm": 0.5864093899726868, + "learning_rate": 4.545289513492336e-06, + "loss": 0.613, + "step": 3788 + }, + { + "epoch": 0.597916995423702, + "grad_norm": 0.5996425747871399, + "learning_rate": 4.545050658152584e-06, + "loss": 0.581, + "step": 3789 + }, + { + "epoch": 0.5980747988006944, + "grad_norm": 0.5839831829071045, + "learning_rate": 4.544811746374415e-06, + "loss": 0.5699, + "step": 3790 + }, + { + "epoch": 0.5982326021776866, + "grad_norm": 0.5715017914772034, + "learning_rate": 4.544572778164421e-06, + "loss": 0.5559, + "step": 3791 + }, + { + "epoch": 0.5983904055546788, + "grad_norm": 0.6608051061630249, + "learning_rate": 4.544333753529199e-06, + "loss": 0.5416, + "step": 3792 + }, + { + "epoch": 0.5985482089316712, + "grad_norm": 0.5617427825927734, + "learning_rate": 4.544094672475344e-06, + "loss": 0.5613, + "step": 3793 + }, + { + "epoch": 0.5987060123086634, + "grad_norm": 0.5874730944633484, + "learning_rate": 4.543855535009455e-06, + "loss": 0.5824, + "step": 3794 + }, + { + "epoch": 0.5988638156856557, + "grad_norm": 0.5724066495895386, + "learning_rate": 4.5436163411381315e-06, + "loss": 0.5889, + "step": 3795 + }, + { + "epoch": 0.599021619062648, + "grad_norm": 0.6732708811759949, + "learning_rate": 4.543377090867975e-06, + "loss": 0.5795, + "step": 3796 + }, + { + "epoch": 0.5991794224396402, + "grad_norm": 0.5693382024765015, + "learning_rate": 4.543137784205587e-06, + "loss": 0.5504, + "step": 3797 + }, + { + "epoch": 0.5993372258166325, + "grad_norm": 0.5859765410423279, + "learning_rate": 4.542898421157572e-06, + "loss": 0.5959, + "step": 3798 + }, + { + "epoch": 0.5994950291936247, + "grad_norm": 0.5812384486198425, + "learning_rate": 4.5426590017305375e-06, + "loss": 0.5826, + "step": 3799 + }, + { + "epoch": 0.599652832570617, + "grad_norm": 0.6072768568992615, + "learning_rate": 4.542419525931089e-06, + "loss": 0.5266, + "step": 3800 + }, + { + "epoch": 0.5998106359476093, + "grad_norm": 0.5891388058662415, + "learning_rate": 4.542179993765836e-06, + "loss": 0.5934, + "step": 3801 + }, + { + "epoch": 0.5999684393246015, + "grad_norm": 0.5587908625602722, + "learning_rate": 4.541940405241389e-06, + "loss": 0.5875, + "step": 3802 + }, + { + "epoch": 0.6001262427015938, + "grad_norm": 0.5747085213661194, + "learning_rate": 4.54170076036436e-06, + "loss": 0.5981, + "step": 3803 + }, + { + "epoch": 0.6002840460785861, + "grad_norm": 0.603237509727478, + "learning_rate": 4.541461059141364e-06, + "loss": 0.5497, + "step": 3804 + }, + { + "epoch": 0.6004418494555783, + "grad_norm": 0.5749722123146057, + "learning_rate": 4.541221301579014e-06, + "loss": 0.5427, + "step": 3805 + }, + { + "epoch": 0.6005996528325707, + "grad_norm": 0.5826087594032288, + "learning_rate": 4.540981487683928e-06, + "loss": 0.5581, + "step": 3806 + }, + { + "epoch": 0.6007574562095629, + "grad_norm": 0.6044433116912842, + "learning_rate": 4.540741617462724e-06, + "loss": 0.549, + "step": 3807 + }, + { + "epoch": 0.6009152595865551, + "grad_norm": 0.5901827812194824, + "learning_rate": 4.5405016909220224e-06, + "loss": 0.5822, + "step": 3808 + }, + { + "epoch": 0.6010730629635475, + "grad_norm": 0.5990499258041382, + "learning_rate": 4.540261708068443e-06, + "loss": 0.5726, + "step": 3809 + }, + { + "epoch": 0.6012308663405397, + "grad_norm": 0.5983535051345825, + "learning_rate": 4.5400216689086105e-06, + "loss": 0.5575, + "step": 3810 + }, + { + "epoch": 0.6013886697175319, + "grad_norm": 0.5719018578529358, + "learning_rate": 4.539781573449148e-06, + "loss": 0.5729, + "step": 3811 + }, + { + "epoch": 0.6015464730945242, + "grad_norm": 0.5705781579017639, + "learning_rate": 4.539541421696683e-06, + "loss": 0.5824, + "step": 3812 + }, + { + "epoch": 0.6017042764715165, + "grad_norm": 0.5729394555091858, + "learning_rate": 4.539301213657842e-06, + "loss": 0.5957, + "step": 3813 + }, + { + "epoch": 0.6018620798485088, + "grad_norm": 0.5897150635719299, + "learning_rate": 4.539060949339254e-06, + "loss": 0.5527, + "step": 3814 + }, + { + "epoch": 0.602019883225501, + "grad_norm": 0.5963519215583801, + "learning_rate": 4.538820628747551e-06, + "loss": 0.6056, + "step": 3815 + }, + { + "epoch": 0.6021776866024933, + "grad_norm": 0.5908628106117249, + "learning_rate": 4.538580251889364e-06, + "loss": 0.5881, + "step": 3816 + }, + { + "epoch": 0.6023354899794856, + "grad_norm": 0.5782555937767029, + "learning_rate": 4.5383398187713265e-06, + "loss": 0.5994, + "step": 3817 + }, + { + "epoch": 0.6024932933564778, + "grad_norm": 0.5607390999794006, + "learning_rate": 4.538099329400076e-06, + "loss": 0.5926, + "step": 3818 + }, + { + "epoch": 0.6026510967334701, + "grad_norm": 0.5702846050262451, + "learning_rate": 4.537858783782247e-06, + "loss": 0.6094, + "step": 3819 + }, + { + "epoch": 0.6028089001104624, + "grad_norm": 0.5882539749145508, + "learning_rate": 4.53761818192448e-06, + "loss": 0.5674, + "step": 3820 + }, + { + "epoch": 0.6029667034874546, + "grad_norm": 0.6101568341255188, + "learning_rate": 4.5373775238334125e-06, + "loss": 0.6012, + "step": 3821 + }, + { + "epoch": 0.6031245068644469, + "grad_norm": 0.8253366351127625, + "learning_rate": 4.537136809515689e-06, + "loss": 0.5851, + "step": 3822 + }, + { + "epoch": 0.6032823102414392, + "grad_norm": 0.5767901539802551, + "learning_rate": 4.536896038977951e-06, + "loss": 0.6018, + "step": 3823 + }, + { + "epoch": 0.6034401136184314, + "grad_norm": 0.5419023633003235, + "learning_rate": 4.536655212226844e-06, + "loss": 0.5331, + "step": 3824 + }, + { + "epoch": 0.6035979169954238, + "grad_norm": 0.6413732767105103, + "learning_rate": 4.536414329269012e-06, + "loss": 0.5802, + "step": 3825 + }, + { + "epoch": 0.603755720372416, + "grad_norm": 0.5540556907653809, + "learning_rate": 4.536173390111106e-06, + "loss": 0.5628, + "step": 3826 + }, + { + "epoch": 0.6039135237494082, + "grad_norm": 0.5671558380126953, + "learning_rate": 4.535932394759773e-06, + "loss": 0.5786, + "step": 3827 + }, + { + "epoch": 0.6040713271264005, + "grad_norm": 0.5915114283561707, + "learning_rate": 4.535691343221665e-06, + "loss": 0.5669, + "step": 3828 + }, + { + "epoch": 0.6042291305033928, + "grad_norm": 0.5893499851226807, + "learning_rate": 4.535450235503433e-06, + "loss": 0.6042, + "step": 3829 + }, + { + "epoch": 0.604386933880385, + "grad_norm": 0.5639637112617493, + "learning_rate": 4.535209071611734e-06, + "loss": 0.5948, + "step": 3830 + }, + { + "epoch": 0.6045447372573773, + "grad_norm": 0.587972104549408, + "learning_rate": 4.53496785155322e-06, + "loss": 0.5196, + "step": 3831 + }, + { + "epoch": 0.6047025406343696, + "grad_norm": 0.5692077279090881, + "learning_rate": 4.534726575334551e-06, + "loss": 0.5692, + "step": 3832 + }, + { + "epoch": 0.6048603440113618, + "grad_norm": 0.5673061609268188, + "learning_rate": 4.534485242962384e-06, + "loss": 0.5858, + "step": 3833 + }, + { + "epoch": 0.6050181473883541, + "grad_norm": 0.6041895747184753, + "learning_rate": 4.534243854443378e-06, + "loss": 0.5967, + "step": 3834 + }, + { + "epoch": 0.6051759507653464, + "grad_norm": 0.5995579361915588, + "learning_rate": 4.534002409784198e-06, + "loss": 0.5691, + "step": 3835 + }, + { + "epoch": 0.6053337541423387, + "grad_norm": 0.6340352892875671, + "learning_rate": 4.533760908991505e-06, + "loss": 0.6123, + "step": 3836 + }, + { + "epoch": 0.6054915575193309, + "grad_norm": 0.6117687821388245, + "learning_rate": 4.5335193520719645e-06, + "loss": 0.6204, + "step": 3837 + }, + { + "epoch": 0.6056493608963232, + "grad_norm": 0.6034123301506042, + "learning_rate": 4.533277739032243e-06, + "loss": 0.5893, + "step": 3838 + }, + { + "epoch": 0.6058071642733155, + "grad_norm": 0.5583383440971375, + "learning_rate": 4.5330360698790075e-06, + "loss": 0.5727, + "step": 3839 + }, + { + "epoch": 0.6059649676503077, + "grad_norm": 0.6224699020385742, + "learning_rate": 4.532794344618929e-06, + "loss": 0.6075, + "step": 3840 + }, + { + "epoch": 0.6061227710273, + "grad_norm": 0.6242337822914124, + "learning_rate": 4.5325525632586775e-06, + "loss": 0.6195, + "step": 3841 + }, + { + "epoch": 0.6062805744042923, + "grad_norm": 0.5629652142524719, + "learning_rate": 4.532310725804926e-06, + "loss": 0.5991, + "step": 3842 + }, + { + "epoch": 0.6064383777812845, + "grad_norm": 0.5778573751449585, + "learning_rate": 4.532068832264347e-06, + "loss": 0.6088, + "step": 3843 + }, + { + "epoch": 0.6065961811582767, + "grad_norm": 0.5585645437240601, + "learning_rate": 4.531826882643618e-06, + "loss": 0.5581, + "step": 3844 + }, + { + "epoch": 0.6067539845352691, + "grad_norm": 0.5850322246551514, + "learning_rate": 4.531584876949418e-06, + "loss": 0.615, + "step": 3845 + }, + { + "epoch": 0.6069117879122613, + "grad_norm": 0.5834108591079712, + "learning_rate": 4.531342815188421e-06, + "loss": 0.5871, + "step": 3846 + }, + { + "epoch": 0.6070695912892536, + "grad_norm": 0.6323981285095215, + "learning_rate": 4.531100697367311e-06, + "loss": 0.601, + "step": 3847 + }, + { + "epoch": 0.6072273946662459, + "grad_norm": 0.5882057547569275, + "learning_rate": 4.530858523492768e-06, + "loss": 0.5905, + "step": 3848 + }, + { + "epoch": 0.6073851980432381, + "grad_norm": 0.5795276761054993, + "learning_rate": 4.530616293571477e-06, + "loss": 0.586, + "step": 3849 + }, + { + "epoch": 0.6075430014202304, + "grad_norm": 0.5863098502159119, + "learning_rate": 4.530374007610122e-06, + "loss": 0.5622, + "step": 3850 + }, + { + "epoch": 0.6077008047972227, + "grad_norm": 0.5860450267791748, + "learning_rate": 4.530131665615389e-06, + "loss": 0.584, + "step": 3851 + }, + { + "epoch": 0.6078586081742149, + "grad_norm": 0.6053204536437988, + "learning_rate": 4.529889267593968e-06, + "loss": 0.5784, + "step": 3852 + }, + { + "epoch": 0.6080164115512072, + "grad_norm": 0.6261022090911865, + "learning_rate": 4.529646813552546e-06, + "loss": 0.596, + "step": 3853 + }, + { + "epoch": 0.6081742149281995, + "grad_norm": 0.6056270003318787, + "learning_rate": 4.529404303497815e-06, + "loss": 0.5909, + "step": 3854 + }, + { + "epoch": 0.6083320183051917, + "grad_norm": 0.5927842259407043, + "learning_rate": 4.5291617374364695e-06, + "loss": 0.5781, + "step": 3855 + }, + { + "epoch": 0.608489821682184, + "grad_norm": 0.603508472442627, + "learning_rate": 4.528919115375201e-06, + "loss": 0.5969, + "step": 3856 + }, + { + "epoch": 0.6086476250591762, + "grad_norm": 0.6065272688865662, + "learning_rate": 4.528676437320707e-06, + "loss": 0.586, + "step": 3857 + }, + { + "epoch": 0.6088054284361686, + "grad_norm": 0.554961621761322, + "learning_rate": 4.528433703279686e-06, + "loss": 0.5731, + "step": 3858 + }, + { + "epoch": 0.6089632318131608, + "grad_norm": 0.610816478729248, + "learning_rate": 4.5281909132588334e-06, + "loss": 0.5964, + "step": 3859 + }, + { + "epoch": 0.609121035190153, + "grad_norm": 0.5762917399406433, + "learning_rate": 4.527948067264851e-06, + "loss": 0.5859, + "step": 3860 + }, + { + "epoch": 0.6092788385671454, + "grad_norm": 0.5958735942840576, + "learning_rate": 4.527705165304443e-06, + "loss": 0.6088, + "step": 3861 + }, + { + "epoch": 0.6094366419441376, + "grad_norm": 0.6065572500228882, + "learning_rate": 4.52746220738431e-06, + "loss": 0.5401, + "step": 3862 + }, + { + "epoch": 0.6095944453211298, + "grad_norm": 0.5683240294456482, + "learning_rate": 4.527219193511159e-06, + "loss": 0.5845, + "step": 3863 + }, + { + "epoch": 0.6097522486981222, + "grad_norm": 0.5721459984779358, + "learning_rate": 4.5269761236916955e-06, + "loss": 0.5777, + "step": 3864 + }, + { + "epoch": 0.6099100520751144, + "grad_norm": 0.6192176938056946, + "learning_rate": 4.526732997932628e-06, + "loss": 0.5645, + "step": 3865 + }, + { + "epoch": 0.6100678554521066, + "grad_norm": 0.5652175545692444, + "learning_rate": 4.526489816240666e-06, + "loss": 0.5781, + "step": 3866 + }, + { + "epoch": 0.610225658829099, + "grad_norm": 0.5848795771598816, + "learning_rate": 4.526246578622521e-06, + "loss": 0.5645, + "step": 3867 + }, + { + "epoch": 0.6103834622060912, + "grad_norm": 0.5951361060142517, + "learning_rate": 4.526003285084905e-06, + "loss": 0.5575, + "step": 3868 + }, + { + "epoch": 0.6105412655830835, + "grad_norm": 0.5904032588005066, + "learning_rate": 4.5257599356345335e-06, + "loss": 0.5712, + "step": 3869 + }, + { + "epoch": 0.6106990689600758, + "grad_norm": 0.5817548036575317, + "learning_rate": 4.525516530278121e-06, + "loss": 0.5387, + "step": 3870 + }, + { + "epoch": 0.610856872337068, + "grad_norm": 0.6206053495407104, + "learning_rate": 4.525273069022386e-06, + "loss": 0.6189, + "step": 3871 + }, + { + "epoch": 0.6110146757140603, + "grad_norm": 0.6343191862106323, + "learning_rate": 4.525029551874047e-06, + "loss": 0.5784, + "step": 3872 + }, + { + "epoch": 0.6111724790910525, + "grad_norm": 0.5979846715927124, + "learning_rate": 4.5247859788398256e-06, + "loss": 0.55, + "step": 3873 + }, + { + "epoch": 0.6113302824680448, + "grad_norm": 0.5618782043457031, + "learning_rate": 4.524542349926442e-06, + "loss": 0.5482, + "step": 3874 + }, + { + "epoch": 0.6114880858450371, + "grad_norm": 0.5748959183692932, + "learning_rate": 4.524298665140621e-06, + "loss": 0.5739, + "step": 3875 + }, + { + "epoch": 0.6116458892220293, + "grad_norm": 0.5962971448898315, + "learning_rate": 4.524054924489087e-06, + "loss": 0.5302, + "step": 3876 + }, + { + "epoch": 0.6118036925990216, + "grad_norm": 0.6194692254066467, + "learning_rate": 4.523811127978567e-06, + "loss": 0.5459, + "step": 3877 + }, + { + "epoch": 0.6119614959760139, + "grad_norm": 0.5666699409484863, + "learning_rate": 4.523567275615789e-06, + "loss": 0.5995, + "step": 3878 + }, + { + "epoch": 0.6121192993530061, + "grad_norm": 0.58970707654953, + "learning_rate": 4.523323367407483e-06, + "loss": 0.569, + "step": 3879 + }, + { + "epoch": 0.6122771027299985, + "grad_norm": 0.5782902836799622, + "learning_rate": 4.523079403360379e-06, + "loss": 0.5479, + "step": 3880 + }, + { + "epoch": 0.6124349061069907, + "grad_norm": 0.6000682711601257, + "learning_rate": 4.522835383481212e-06, + "loss": 0.5758, + "step": 3881 + }, + { + "epoch": 0.6125927094839829, + "grad_norm": 0.5874297618865967, + "learning_rate": 4.522591307776715e-06, + "loss": 0.6031, + "step": 3882 + }, + { + "epoch": 0.6127505128609753, + "grad_norm": 0.5866447687149048, + "learning_rate": 4.5223471762536244e-06, + "loss": 0.5973, + "step": 3883 + }, + { + "epoch": 0.6129083162379675, + "grad_norm": 0.5779869556427002, + "learning_rate": 4.522102988918677e-06, + "loss": 0.5953, + "step": 3884 + }, + { + "epoch": 0.6130661196149597, + "grad_norm": 0.5469757318496704, + "learning_rate": 4.521858745778613e-06, + "loss": 0.6016, + "step": 3885 + }, + { + "epoch": 0.613223922991952, + "grad_norm": 0.6158739924430847, + "learning_rate": 4.521614446840171e-06, + "loss": 0.5916, + "step": 3886 + }, + { + "epoch": 0.6133817263689443, + "grad_norm": 0.5525325536727905, + "learning_rate": 4.521370092110095e-06, + "loss": 0.5836, + "step": 3887 + }, + { + "epoch": 0.6135395297459366, + "grad_norm": 0.5807229280471802, + "learning_rate": 4.521125681595128e-06, + "loss": 0.5938, + "step": 3888 + }, + { + "epoch": 0.6136973331229288, + "grad_norm": 0.5806782841682434, + "learning_rate": 4.520881215302013e-06, + "loss": 0.5796, + "step": 3889 + }, + { + "epoch": 0.6138551364999211, + "grad_norm": 0.5649817585945129, + "learning_rate": 4.520636693237501e-06, + "loss": 0.5505, + "step": 3890 + }, + { + "epoch": 0.6140129398769134, + "grad_norm": 0.6377394199371338, + "learning_rate": 4.520392115408336e-06, + "loss": 0.5939, + "step": 3891 + }, + { + "epoch": 0.6141707432539056, + "grad_norm": 0.6430115103721619, + "learning_rate": 4.52014748182127e-06, + "loss": 0.5984, + "step": 3892 + }, + { + "epoch": 0.6143285466308979, + "grad_norm": 0.6018170714378357, + "learning_rate": 4.519902792483054e-06, + "loss": 0.5676, + "step": 3893 + }, + { + "epoch": 0.6144863500078902, + "grad_norm": 0.5636605620384216, + "learning_rate": 4.519658047400442e-06, + "loss": 0.5812, + "step": 3894 + }, + { + "epoch": 0.6146441533848824, + "grad_norm": 0.562825620174408, + "learning_rate": 4.519413246580184e-06, + "loss": 0.5775, + "step": 3895 + }, + { + "epoch": 0.6148019567618747, + "grad_norm": 0.5877458453178406, + "learning_rate": 4.519168390029041e-06, + "loss": 0.5872, + "step": 3896 + }, + { + "epoch": 0.614959760138867, + "grad_norm": 0.5718860030174255, + "learning_rate": 4.518923477753768e-06, + "loss": 0.6097, + "step": 3897 + }, + { + "epoch": 0.6151175635158592, + "grad_norm": 0.5817747116088867, + "learning_rate": 4.518678509761125e-06, + "loss": 0.5895, + "step": 3898 + }, + { + "epoch": 0.6152753668928516, + "grad_norm": 0.5810716152191162, + "learning_rate": 4.5184334860578705e-06, + "loss": 0.5698, + "step": 3899 + }, + { + "epoch": 0.6154331702698438, + "grad_norm": 0.574908435344696, + "learning_rate": 4.5181884066507685e-06, + "loss": 0.5566, + "step": 3900 + }, + { + "epoch": 0.615590973646836, + "grad_norm": 0.5756170153617859, + "learning_rate": 4.517943271546582e-06, + "loss": 0.5814, + "step": 3901 + }, + { + "epoch": 0.6157487770238284, + "grad_norm": 0.5710357427597046, + "learning_rate": 4.517698080752076e-06, + "loss": 0.5463, + "step": 3902 + }, + { + "epoch": 0.6159065804008206, + "grad_norm": 0.5851268172264099, + "learning_rate": 4.517452834274018e-06, + "loss": 0.5915, + "step": 3903 + }, + { + "epoch": 0.6160643837778128, + "grad_norm": 0.5825790166854858, + "learning_rate": 4.517207532119175e-06, + "loss": 0.5723, + "step": 3904 + }, + { + "epoch": 0.6162221871548051, + "grad_norm": 0.6184925436973572, + "learning_rate": 4.516962174294317e-06, + "loss": 0.5857, + "step": 3905 + }, + { + "epoch": 0.6163799905317974, + "grad_norm": 0.5490251779556274, + "learning_rate": 4.516716760806217e-06, + "loss": 0.529, + "step": 3906 + }, + { + "epoch": 0.6165377939087896, + "grad_norm": 0.5837032794952393, + "learning_rate": 4.5164712916616446e-06, + "loss": 0.5823, + "step": 3907 + }, + { + "epoch": 0.6166955972857819, + "grad_norm": 0.603337287902832, + "learning_rate": 4.516225766867377e-06, + "loss": 0.5707, + "step": 3908 + }, + { + "epoch": 0.6168534006627742, + "grad_norm": 0.5881329774856567, + "learning_rate": 4.515980186430188e-06, + "loss": 0.593, + "step": 3909 + }, + { + "epoch": 0.6170112040397665, + "grad_norm": 0.5764191150665283, + "learning_rate": 4.5157345503568575e-06, + "loss": 0.5523, + "step": 3910 + }, + { + "epoch": 0.6171690074167587, + "grad_norm": 0.5967995524406433, + "learning_rate": 4.515488858654161e-06, + "loss": 0.5351, + "step": 3911 + }, + { + "epoch": 0.617326810793751, + "grad_norm": 0.5998240113258362, + "learning_rate": 4.515243111328882e-06, + "loss": 0.5931, + "step": 3912 + }, + { + "epoch": 0.6174846141707433, + "grad_norm": 0.6281684041023254, + "learning_rate": 4.514997308387802e-06, + "loss": 0.5833, + "step": 3913 + }, + { + "epoch": 0.6176424175477355, + "grad_norm": 0.5735514760017395, + "learning_rate": 4.514751449837703e-06, + "loss": 0.5701, + "step": 3914 + }, + { + "epoch": 0.6178002209247277, + "grad_norm": 0.5900400876998901, + "learning_rate": 4.5145055356853725e-06, + "loss": 0.5894, + "step": 3915 + }, + { + "epoch": 0.6179580243017201, + "grad_norm": 0.5977720022201538, + "learning_rate": 4.514259565937595e-06, + "loss": 0.5906, + "step": 3916 + }, + { + "epoch": 0.6181158276787123, + "grad_norm": 0.5765628814697266, + "learning_rate": 4.5140135406011595e-06, + "loss": 0.5917, + "step": 3917 + }, + { + "epoch": 0.6182736310557045, + "grad_norm": 0.5786934494972229, + "learning_rate": 4.513767459682855e-06, + "loss": 0.5757, + "step": 3918 + }, + { + "epoch": 0.6184314344326969, + "grad_norm": 0.594260036945343, + "learning_rate": 4.513521323189475e-06, + "loss": 0.5562, + "step": 3919 + }, + { + "epoch": 0.6185892378096891, + "grad_norm": 0.676770806312561, + "learning_rate": 4.513275131127809e-06, + "loss": 0.6081, + "step": 3920 + }, + { + "epoch": 0.6187470411866814, + "grad_norm": 0.6044073700904846, + "learning_rate": 4.5130288835046534e-06, + "loss": 0.5484, + "step": 3921 + }, + { + "epoch": 0.6189048445636737, + "grad_norm": 0.6074613332748413, + "learning_rate": 4.512782580326804e-06, + "loss": 0.535, + "step": 3922 + }, + { + "epoch": 0.6190626479406659, + "grad_norm": 0.570656955242157, + "learning_rate": 4.5125362216010575e-06, + "loss": 0.5542, + "step": 3923 + }, + { + "epoch": 0.6192204513176582, + "grad_norm": 0.5897341370582581, + "learning_rate": 4.512289807334214e-06, + "loss": 0.5983, + "step": 3924 + }, + { + "epoch": 0.6193782546946505, + "grad_norm": 0.6149777770042419, + "learning_rate": 4.512043337533071e-06, + "loss": 0.5633, + "step": 3925 + }, + { + "epoch": 0.6195360580716427, + "grad_norm": 0.5933530330657959, + "learning_rate": 4.511796812204434e-06, + "loss": 0.5882, + "step": 3926 + }, + { + "epoch": 0.619693861448635, + "grad_norm": 0.5908656120300293, + "learning_rate": 4.511550231355105e-06, + "loss": 0.5571, + "step": 3927 + }, + { + "epoch": 0.6198516648256273, + "grad_norm": 0.5792783498764038, + "learning_rate": 4.511303594991888e-06, + "loss": 0.5683, + "step": 3928 + }, + { + "epoch": 0.6200094682026195, + "grad_norm": 0.5455393195152283, + "learning_rate": 4.5110569031215915e-06, + "loss": 0.5297, + "step": 3929 + }, + { + "epoch": 0.6201672715796118, + "grad_norm": 0.6043230295181274, + "learning_rate": 4.510810155751022e-06, + "loss": 0.5899, + "step": 3930 + }, + { + "epoch": 0.620325074956604, + "grad_norm": 0.5804182291030884, + "learning_rate": 4.51056335288699e-06, + "loss": 0.5653, + "step": 3931 + }, + { + "epoch": 0.6204828783335964, + "grad_norm": 0.6038508415222168, + "learning_rate": 4.510316494536306e-06, + "loss": 0.5521, + "step": 3932 + }, + { + "epoch": 0.6206406817105886, + "grad_norm": 0.5582035779953003, + "learning_rate": 4.510069580705784e-06, + "loss": 0.5753, + "step": 3933 + }, + { + "epoch": 0.6207984850875808, + "grad_norm": 0.586758553981781, + "learning_rate": 4.509822611402237e-06, + "loss": 0.5732, + "step": 3934 + }, + { + "epoch": 0.6209562884645732, + "grad_norm": 0.5991913676261902, + "learning_rate": 4.5095755866324805e-06, + "loss": 0.5307, + "step": 3935 + }, + { + "epoch": 0.6211140918415654, + "grad_norm": 0.586877703666687, + "learning_rate": 4.509328506403333e-06, + "loss": 0.582, + "step": 3936 + }, + { + "epoch": 0.6212718952185576, + "grad_norm": 0.5894119739532471, + "learning_rate": 4.509081370721613e-06, + "loss": 0.5602, + "step": 3937 + }, + { + "epoch": 0.62142969859555, + "grad_norm": 0.578465461730957, + "learning_rate": 4.50883417959414e-06, + "loss": 0.5772, + "step": 3938 + }, + { + "epoch": 0.6215875019725422, + "grad_norm": 0.5656845569610596, + "learning_rate": 4.508586933027736e-06, + "loss": 0.5976, + "step": 3939 + }, + { + "epoch": 0.6217453053495344, + "grad_norm": 0.5880796313285828, + "learning_rate": 4.508339631029225e-06, + "loss": 0.5768, + "step": 3940 + }, + { + "epoch": 0.6219031087265268, + "grad_norm": 0.6470693349838257, + "learning_rate": 4.508092273605432e-06, + "loss": 0.6015, + "step": 3941 + }, + { + "epoch": 0.622060912103519, + "grad_norm": 0.5935566425323486, + "learning_rate": 4.507844860763184e-06, + "loss": 0.5942, + "step": 3942 + }, + { + "epoch": 0.6222187154805113, + "grad_norm": 0.5749370455741882, + "learning_rate": 4.507597392509308e-06, + "loss": 0.5784, + "step": 3943 + }, + { + "epoch": 0.6223765188575036, + "grad_norm": 0.5671842098236084, + "learning_rate": 4.507349868850632e-06, + "loss": 0.5873, + "step": 3944 + }, + { + "epoch": 0.6225343222344958, + "grad_norm": 0.6052433848381042, + "learning_rate": 4.507102289793991e-06, + "loss": 0.6036, + "step": 3945 + }, + { + "epoch": 0.6226921256114881, + "grad_norm": 0.586776614189148, + "learning_rate": 4.506854655346213e-06, + "loss": 0.5819, + "step": 3946 + }, + { + "epoch": 0.6228499289884804, + "grad_norm": 0.5895147323608398, + "learning_rate": 4.506606965514136e-06, + "loss": 0.5694, + "step": 3947 + }, + { + "epoch": 0.6230077323654726, + "grad_norm": 0.6094140410423279, + "learning_rate": 4.5063592203045935e-06, + "loss": 0.5845, + "step": 3948 + }, + { + "epoch": 0.6231655357424649, + "grad_norm": 0.5996906757354736, + "learning_rate": 4.506111419724424e-06, + "loss": 0.585, + "step": 3949 + }, + { + "epoch": 0.6233233391194571, + "grad_norm": 0.5750243663787842, + "learning_rate": 4.505863563780465e-06, + "loss": 0.5845, + "step": 3950 + }, + { + "epoch": 0.6234811424964494, + "grad_norm": 0.5964981317520142, + "learning_rate": 4.505615652479557e-06, + "loss": 0.5652, + "step": 3951 + }, + { + "epoch": 0.6236389458734417, + "grad_norm": 0.578950047492981, + "learning_rate": 4.505367685828542e-06, + "loss": 0.6, + "step": 3952 + }, + { + "epoch": 0.6237967492504339, + "grad_norm": 0.5799104571342468, + "learning_rate": 4.505119663834263e-06, + "loss": 0.6006, + "step": 3953 + }, + { + "epoch": 0.6239545526274263, + "grad_norm": 0.5987715721130371, + "learning_rate": 4.504871586503565e-06, + "loss": 0.5456, + "step": 3954 + }, + { + "epoch": 0.6241123560044185, + "grad_norm": 0.5925467610359192, + "learning_rate": 4.504623453843295e-06, + "loss": 0.5539, + "step": 3955 + }, + { + "epoch": 0.6242701593814107, + "grad_norm": 0.5816943049430847, + "learning_rate": 4.5043752658603e-06, + "loss": 0.5593, + "step": 3956 + }, + { + "epoch": 0.6244279627584031, + "grad_norm": 0.678719162940979, + "learning_rate": 4.5041270225614284e-06, + "loss": 0.5947, + "step": 3957 + }, + { + "epoch": 0.6245857661353953, + "grad_norm": 0.5910063982009888, + "learning_rate": 4.503878723953534e-06, + "loss": 0.5759, + "step": 3958 + }, + { + "epoch": 0.6247435695123875, + "grad_norm": 0.6327548027038574, + "learning_rate": 4.5036303700434666e-06, + "loss": 0.5984, + "step": 3959 + }, + { + "epoch": 0.6249013728893799, + "grad_norm": 0.6180802583694458, + "learning_rate": 4.50338196083808e-06, + "loss": 0.6, + "step": 3960 + }, + { + "epoch": 0.6250591762663721, + "grad_norm": 0.587555468082428, + "learning_rate": 4.503133496344233e-06, + "loss": 0.5649, + "step": 3961 + }, + { + "epoch": 0.6252169796433644, + "grad_norm": 0.5819083452224731, + "learning_rate": 4.502884976568779e-06, + "loss": 0.5705, + "step": 3962 + }, + { + "epoch": 0.6253747830203567, + "grad_norm": 0.5756600499153137, + "learning_rate": 4.502636401518578e-06, + "loss": 0.5759, + "step": 3963 + }, + { + "epoch": 0.6255325863973489, + "grad_norm": 0.590205192565918, + "learning_rate": 4.50238777120049e-06, + "loss": 0.602, + "step": 3964 + }, + { + "epoch": 0.6256903897743412, + "grad_norm": 0.5977252125740051, + "learning_rate": 4.502139085621377e-06, + "loss": 0.582, + "step": 3965 + }, + { + "epoch": 0.6258481931513334, + "grad_norm": 0.5997149348258972, + "learning_rate": 4.501890344788101e-06, + "loss": 0.5957, + "step": 3966 + }, + { + "epoch": 0.6260059965283257, + "grad_norm": 0.5758454203605652, + "learning_rate": 4.501641548707528e-06, + "loss": 0.5534, + "step": 3967 + }, + { + "epoch": 0.626163799905318, + "grad_norm": 0.581106424331665, + "learning_rate": 4.501392697386524e-06, + "loss": 0.6143, + "step": 3968 + }, + { + "epoch": 0.6263216032823102, + "grad_norm": 0.5984770655632019, + "learning_rate": 4.501143790831955e-06, + "loss": 0.6007, + "step": 3969 + }, + { + "epoch": 0.6264794066593025, + "grad_norm": 0.5334557294845581, + "learning_rate": 4.500894829050692e-06, + "loss": 0.5752, + "step": 3970 + }, + { + "epoch": 0.6266372100362948, + "grad_norm": 0.6062655448913574, + "learning_rate": 4.500645812049604e-06, + "loss": 0.6066, + "step": 3971 + }, + { + "epoch": 0.626795013413287, + "grad_norm": 0.5791186094284058, + "learning_rate": 4.500396739835566e-06, + "loss": 0.5813, + "step": 3972 + }, + { + "epoch": 0.6269528167902794, + "grad_norm": 0.5750792026519775, + "learning_rate": 4.50014761241545e-06, + "loss": 0.6061, + "step": 3973 + }, + { + "epoch": 0.6271106201672716, + "grad_norm": 0.6060407161712646, + "learning_rate": 4.4998984297961305e-06, + "loss": 0.5489, + "step": 3974 + }, + { + "epoch": 0.6272684235442638, + "grad_norm": 0.6258114576339722, + "learning_rate": 4.4996491919844855e-06, + "loss": 0.5872, + "step": 3975 + }, + { + "epoch": 0.6274262269212562, + "grad_norm": 0.5915290117263794, + "learning_rate": 4.499399898987395e-06, + "loss": 0.5806, + "step": 3976 + }, + { + "epoch": 0.6275840302982484, + "grad_norm": 0.5992693305015564, + "learning_rate": 4.499150550811735e-06, + "loss": 0.5867, + "step": 3977 + }, + { + "epoch": 0.6277418336752406, + "grad_norm": 0.597409725189209, + "learning_rate": 4.4989011474643895e-06, + "loss": 0.5947, + "step": 3978 + }, + { + "epoch": 0.627899637052233, + "grad_norm": 0.6004375219345093, + "learning_rate": 4.498651688952241e-06, + "loss": 0.5958, + "step": 3979 + }, + { + "epoch": 0.6280574404292252, + "grad_norm": 0.5855794548988342, + "learning_rate": 4.498402175282174e-06, + "loss": 0.6049, + "step": 3980 + }, + { + "epoch": 0.6282152438062174, + "grad_norm": 0.5393959879875183, + "learning_rate": 4.498152606461074e-06, + "loss": 0.5651, + "step": 3981 + }, + { + "epoch": 0.6283730471832097, + "grad_norm": 0.5663466453552246, + "learning_rate": 4.497902982495828e-06, + "loss": 0.5997, + "step": 3982 + }, + { + "epoch": 0.628530850560202, + "grad_norm": 0.608877420425415, + "learning_rate": 4.497653303393328e-06, + "loss": 0.599, + "step": 3983 + }, + { + "epoch": 0.6286886539371943, + "grad_norm": 0.5884467959403992, + "learning_rate": 4.49740356916046e-06, + "loss": 0.5775, + "step": 3984 + }, + { + "epoch": 0.6288464573141865, + "grad_norm": 0.6324931979179382, + "learning_rate": 4.497153779804121e-06, + "loss": 0.5985, + "step": 3985 + }, + { + "epoch": 0.6290042606911788, + "grad_norm": 0.5879462361335754, + "learning_rate": 4.4969039353312e-06, + "loss": 0.5826, + "step": 3986 + }, + { + "epoch": 0.6291620640681711, + "grad_norm": 0.5871471166610718, + "learning_rate": 4.496654035748594e-06, + "loss": 0.5446, + "step": 3987 + }, + { + "epoch": 0.6293198674451633, + "grad_norm": 0.6065788865089417, + "learning_rate": 4.4964040810632e-06, + "loss": 0.5641, + "step": 3988 + }, + { + "epoch": 0.6294776708221556, + "grad_norm": 0.5874865055084229, + "learning_rate": 4.4961540712819165e-06, + "loss": 0.582, + "step": 3989 + }, + { + "epoch": 0.6296354741991479, + "grad_norm": 0.6311086416244507, + "learning_rate": 4.495904006411643e-06, + "loss": 0.5581, + "step": 3990 + }, + { + "epoch": 0.6297932775761401, + "grad_norm": 0.5907282829284668, + "learning_rate": 4.495653886459279e-06, + "loss": 0.5879, + "step": 3991 + }, + { + "epoch": 0.6299510809531323, + "grad_norm": 0.5983965396881104, + "learning_rate": 4.495403711431728e-06, + "loss": 0.5972, + "step": 3992 + }, + { + "epoch": 0.6301088843301247, + "grad_norm": 0.5863428115844727, + "learning_rate": 4.495153481335896e-06, + "loss": 0.567, + "step": 3993 + }, + { + "epoch": 0.6302666877071169, + "grad_norm": 0.5876850485801697, + "learning_rate": 4.494903196178687e-06, + "loss": 0.5819, + "step": 3994 + }, + { + "epoch": 0.6304244910841093, + "grad_norm": 0.5923919677734375, + "learning_rate": 4.4946528559670085e-06, + "loss": 0.6025, + "step": 3995 + }, + { + "epoch": 0.6305822944611015, + "grad_norm": 0.5484175682067871, + "learning_rate": 4.494402460707769e-06, + "loss": 0.5722, + "step": 3996 + }, + { + "epoch": 0.6307400978380937, + "grad_norm": 0.5572158694267273, + "learning_rate": 4.49415201040788e-06, + "loss": 0.5894, + "step": 3997 + }, + { + "epoch": 0.630897901215086, + "grad_norm": 0.575578510761261, + "learning_rate": 4.493901505074251e-06, + "loss": 0.5453, + "step": 3998 + }, + { + "epoch": 0.6310557045920783, + "grad_norm": 0.584559440612793, + "learning_rate": 4.493650944713799e-06, + "loss": 0.5704, + "step": 3999 + }, + { + "epoch": 0.6312135079690705, + "grad_norm": 0.5904666781425476, + "learning_rate": 4.493400329333435e-06, + "loss": 0.5708, + "step": 4000 + }, + { + "epoch": 0.6313713113460628, + "grad_norm": 0.5758212804794312, + "learning_rate": 4.493149658940077e-06, + "loss": 0.6023, + "step": 4001 + }, + { + "epoch": 0.6315291147230551, + "grad_norm": 0.5924627184867859, + "learning_rate": 4.492898933540643e-06, + "loss": 0.6082, + "step": 4002 + }, + { + "epoch": 0.6316869181000473, + "grad_norm": 0.5897166728973389, + "learning_rate": 4.492648153142054e-06, + "loss": 0.5501, + "step": 4003 + }, + { + "epoch": 0.6318447214770396, + "grad_norm": 0.5767874717712402, + "learning_rate": 4.492397317751228e-06, + "loss": 0.6017, + "step": 4004 + }, + { + "epoch": 0.6320025248540319, + "grad_norm": 0.6398530602455139, + "learning_rate": 4.492146427375089e-06, + "loss": 0.5263, + "step": 4005 + }, + { + "epoch": 0.6321603282310242, + "grad_norm": 0.5814235806465149, + "learning_rate": 4.4918954820205605e-06, + "loss": 0.5285, + "step": 4006 + }, + { + "epoch": 0.6323181316080164, + "grad_norm": 0.6111078262329102, + "learning_rate": 4.491644481694568e-06, + "loss": 0.5866, + "step": 4007 + }, + { + "epoch": 0.6324759349850086, + "grad_norm": 0.5867370367050171, + "learning_rate": 4.491393426404039e-06, + "loss": 0.5834, + "step": 4008 + }, + { + "epoch": 0.632633738362001, + "grad_norm": 0.5911517143249512, + "learning_rate": 4.491142316155902e-06, + "loss": 0.5746, + "step": 4009 + }, + { + "epoch": 0.6327915417389932, + "grad_norm": 0.5796091556549072, + "learning_rate": 4.490891150957087e-06, + "loss": 0.5723, + "step": 4010 + }, + { + "epoch": 0.6329493451159854, + "grad_norm": 0.582331120967865, + "learning_rate": 4.490639930814524e-06, + "loss": 0.5802, + "step": 4011 + }, + { + "epoch": 0.6331071484929778, + "grad_norm": 0.5877366065979004, + "learning_rate": 4.490388655735149e-06, + "loss": 0.5484, + "step": 4012 + }, + { + "epoch": 0.63326495186997, + "grad_norm": 0.5628044009208679, + "learning_rate": 4.490137325725894e-06, + "loss": 0.6045, + "step": 4013 + }, + { + "epoch": 0.6334227552469622, + "grad_norm": 0.6144285202026367, + "learning_rate": 4.489885940793696e-06, + "loss": 0.6166, + "step": 4014 + }, + { + "epoch": 0.6335805586239546, + "grad_norm": 0.5710435509681702, + "learning_rate": 4.489634500945493e-06, + "loss": 0.5643, + "step": 4015 + }, + { + "epoch": 0.6337383620009468, + "grad_norm": 0.6521360278129578, + "learning_rate": 4.4893830061882236e-06, + "loss": 0.5764, + "step": 4016 + }, + { + "epoch": 0.6338961653779391, + "grad_norm": 0.5787423849105835, + "learning_rate": 4.4891314565288295e-06, + "loss": 0.6055, + "step": 4017 + }, + { + "epoch": 0.6340539687549314, + "grad_norm": 0.6124407052993774, + "learning_rate": 4.488879851974251e-06, + "loss": 0.5601, + "step": 4018 + }, + { + "epoch": 0.6342117721319236, + "grad_norm": 0.5833797454833984, + "learning_rate": 4.488628192531432e-06, + "loss": 0.5708, + "step": 4019 + }, + { + "epoch": 0.6343695755089159, + "grad_norm": 0.5790371298789978, + "learning_rate": 4.4883764782073195e-06, + "loss": 0.5564, + "step": 4020 + }, + { + "epoch": 0.6345273788859082, + "grad_norm": 0.5717059969902039, + "learning_rate": 4.488124709008858e-06, + "loss": 0.5852, + "step": 4021 + }, + { + "epoch": 0.6346851822629004, + "grad_norm": 0.589855432510376, + "learning_rate": 4.487872884942998e-06, + "loss": 0.6189, + "step": 4022 + }, + { + "epoch": 0.6348429856398927, + "grad_norm": 0.585867166519165, + "learning_rate": 4.4876210060166885e-06, + "loss": 0.5578, + "step": 4023 + }, + { + "epoch": 0.635000789016885, + "grad_norm": 0.5840084552764893, + "learning_rate": 4.487369072236879e-06, + "loss": 0.5458, + "step": 4024 + }, + { + "epoch": 0.6351585923938772, + "grad_norm": 0.6216853857040405, + "learning_rate": 4.4871170836105244e-06, + "loss": 0.5754, + "step": 4025 + }, + { + "epoch": 0.6353163957708695, + "grad_norm": 0.6215411424636841, + "learning_rate": 4.486865040144578e-06, + "loss": 0.6041, + "step": 4026 + }, + { + "epoch": 0.6354741991478617, + "grad_norm": 0.5632824301719666, + "learning_rate": 4.486612941845996e-06, + "loss": 0.5751, + "step": 4027 + }, + { + "epoch": 0.6356320025248541, + "grad_norm": 0.583678662776947, + "learning_rate": 4.486360788721734e-06, + "loss": 0.5488, + "step": 4028 + }, + { + "epoch": 0.6357898059018463, + "grad_norm": 0.5808286070823669, + "learning_rate": 4.486108580778754e-06, + "loss": 0.6036, + "step": 4029 + }, + { + "epoch": 0.6359476092788385, + "grad_norm": 0.5724223256111145, + "learning_rate": 4.485856318024014e-06, + "loss": 0.5725, + "step": 4030 + }, + { + "epoch": 0.6361054126558309, + "grad_norm": 0.5990440249443054, + "learning_rate": 4.4856040004644764e-06, + "loss": 0.6206, + "step": 4031 + }, + { + "epoch": 0.6362632160328231, + "grad_norm": 0.5731401443481445, + "learning_rate": 4.4853516281071045e-06, + "loss": 0.614, + "step": 4032 + }, + { + "epoch": 0.6364210194098153, + "grad_norm": 0.5939109921455383, + "learning_rate": 4.4850992009588634e-06, + "loss": 0.5938, + "step": 4033 + }, + { + "epoch": 0.6365788227868077, + "grad_norm": 0.5732733607292175, + "learning_rate": 4.484846719026719e-06, + "loss": 0.6, + "step": 4034 + }, + { + "epoch": 0.6367366261637999, + "grad_norm": 0.590112566947937, + "learning_rate": 4.4845941823176396e-06, + "loss": 0.5571, + "step": 4035 + }, + { + "epoch": 0.6368944295407921, + "grad_norm": 0.6062913537025452, + "learning_rate": 4.484341590838595e-06, + "loss": 0.5626, + "step": 4036 + }, + { + "epoch": 0.6370522329177845, + "grad_norm": 0.6499122977256775, + "learning_rate": 4.484088944596555e-06, + "loss": 0.5538, + "step": 4037 + }, + { + "epoch": 0.6372100362947767, + "grad_norm": 0.5795737504959106, + "learning_rate": 4.483836243598493e-06, + "loss": 0.566, + "step": 4038 + }, + { + "epoch": 0.637367839671769, + "grad_norm": 0.5564965605735779, + "learning_rate": 4.483583487851382e-06, + "loss": 0.5947, + "step": 4039 + }, + { + "epoch": 0.6375256430487612, + "grad_norm": 0.5979741215705872, + "learning_rate": 4.483330677362198e-06, + "loss": 0.5553, + "step": 4040 + }, + { + "epoch": 0.6376834464257535, + "grad_norm": 0.5981985926628113, + "learning_rate": 4.483077812137919e-06, + "loss": 0.5773, + "step": 4041 + }, + { + "epoch": 0.6378412498027458, + "grad_norm": 0.5595219135284424, + "learning_rate": 4.482824892185521e-06, + "loss": 0.5849, + "step": 4042 + }, + { + "epoch": 0.637999053179738, + "grad_norm": 0.5959151387214661, + "learning_rate": 4.482571917511986e-06, + "loss": 0.5698, + "step": 4043 + }, + { + "epoch": 0.6381568565567303, + "grad_norm": 0.615001916885376, + "learning_rate": 4.4823188881242955e-06, + "loss": 0.6161, + "step": 4044 + }, + { + "epoch": 0.6383146599337226, + "grad_norm": 0.5952706336975098, + "learning_rate": 4.482065804029431e-06, + "loss": 0.608, + "step": 4045 + }, + { + "epoch": 0.6384724633107148, + "grad_norm": 0.5925858020782471, + "learning_rate": 4.481812665234379e-06, + "loss": 0.5839, + "step": 4046 + }, + { + "epoch": 0.6386302666877072, + "grad_norm": 0.642510175704956, + "learning_rate": 4.481559471746123e-06, + "loss": 0.5944, + "step": 4047 + }, + { + "epoch": 0.6387880700646994, + "grad_norm": 0.5885897278785706, + "learning_rate": 4.481306223571652e-06, + "loss": 0.5738, + "step": 4048 + }, + { + "epoch": 0.6389458734416916, + "grad_norm": 0.6241337060928345, + "learning_rate": 4.481052920717956e-06, + "loss": 0.5564, + "step": 4049 + }, + { + "epoch": 0.639103676818684, + "grad_norm": 0.5705674886703491, + "learning_rate": 4.480799563192024e-06, + "loss": 0.5497, + "step": 4050 + }, + { + "epoch": 0.6392614801956762, + "grad_norm": 0.6242807507514954, + "learning_rate": 4.480546151000848e-06, + "loss": 0.605, + "step": 4051 + }, + { + "epoch": 0.6394192835726684, + "grad_norm": 0.5836288332939148, + "learning_rate": 4.480292684151423e-06, + "loss": 0.5757, + "step": 4052 + }, + { + "epoch": 0.6395770869496608, + "grad_norm": 0.5963650941848755, + "learning_rate": 4.480039162650742e-06, + "loss": 0.6004, + "step": 4053 + }, + { + "epoch": 0.639734890326653, + "grad_norm": 0.5810999274253845, + "learning_rate": 4.479785586505804e-06, + "loss": 0.602, + "step": 4054 + }, + { + "epoch": 0.6398926937036452, + "grad_norm": 0.5787190794944763, + "learning_rate": 4.479531955723605e-06, + "loss": 0.5952, + "step": 4055 + }, + { + "epoch": 0.6400504970806375, + "grad_norm": 0.5597828030586243, + "learning_rate": 4.4792782703111455e-06, + "loss": 0.5297, + "step": 4056 + }, + { + "epoch": 0.6402083004576298, + "grad_norm": 0.5949410796165466, + "learning_rate": 4.479024530275427e-06, + "loss": 0.5747, + "step": 4057 + }, + { + "epoch": 0.6403661038346221, + "grad_norm": 0.5602224469184875, + "learning_rate": 4.4787707356234514e-06, + "loss": 0.5731, + "step": 4058 + }, + { + "epoch": 0.6405239072116143, + "grad_norm": 0.5896263718605042, + "learning_rate": 4.478516886362223e-06, + "loss": 0.5986, + "step": 4059 + }, + { + "epoch": 0.6406817105886066, + "grad_norm": 0.5729141235351562, + "learning_rate": 4.478262982498746e-06, + "loss": 0.5762, + "step": 4060 + }, + { + "epoch": 0.6408395139655989, + "grad_norm": 0.5840842723846436, + "learning_rate": 4.478009024040031e-06, + "loss": 0.5853, + "step": 4061 + }, + { + "epoch": 0.6409973173425911, + "grad_norm": 0.6013943552970886, + "learning_rate": 4.477755010993084e-06, + "loss": 0.5954, + "step": 4062 + }, + { + "epoch": 0.6411551207195834, + "grad_norm": 0.5696476101875305, + "learning_rate": 4.477500943364916e-06, + "loss": 0.5928, + "step": 4063 + }, + { + "epoch": 0.6413129240965757, + "grad_norm": 0.5889409184455872, + "learning_rate": 4.477246821162538e-06, + "loss": 0.578, + "step": 4064 + }, + { + "epoch": 0.6414707274735679, + "grad_norm": 0.5869168639183044, + "learning_rate": 4.476992644392964e-06, + "loss": 0.5754, + "step": 4065 + }, + { + "epoch": 0.6416285308505602, + "grad_norm": 0.5865516066551208, + "learning_rate": 4.4767384130632075e-06, + "loss": 0.5857, + "step": 4066 + }, + { + "epoch": 0.6417863342275525, + "grad_norm": 0.5737395286560059, + "learning_rate": 4.476484127180286e-06, + "loss": 0.5932, + "step": 4067 + }, + { + "epoch": 0.6419441376045447, + "grad_norm": 0.5816107988357544, + "learning_rate": 4.4762297867512164e-06, + "loss": 0.5519, + "step": 4068 + }, + { + "epoch": 0.6421019409815371, + "grad_norm": 0.5815700888633728, + "learning_rate": 4.475975391783017e-06, + "loss": 0.5854, + "step": 4069 + }, + { + "epoch": 0.6422597443585293, + "grad_norm": 0.5854873657226562, + "learning_rate": 4.475720942282711e-06, + "loss": 0.5641, + "step": 4070 + }, + { + "epoch": 0.6424175477355215, + "grad_norm": 0.6016779541969299, + "learning_rate": 4.475466438257319e-06, + "loss": 0.5666, + "step": 4071 + }, + { + "epoch": 0.6425753511125138, + "grad_norm": 0.5986239314079285, + "learning_rate": 4.4752118797138645e-06, + "loss": 0.5735, + "step": 4072 + }, + { + "epoch": 0.6427331544895061, + "grad_norm": 0.5691763758659363, + "learning_rate": 4.474957266659372e-06, + "loss": 0.5248, + "step": 4073 + }, + { + "epoch": 0.6428909578664983, + "grad_norm": 0.5906026363372803, + "learning_rate": 4.474702599100871e-06, + "loss": 0.557, + "step": 4074 + }, + { + "epoch": 0.6430487612434906, + "grad_norm": 0.5996724367141724, + "learning_rate": 4.474447877045386e-06, + "loss": 0.5735, + "step": 4075 + }, + { + "epoch": 0.6432065646204829, + "grad_norm": 0.5900353193283081, + "learning_rate": 4.47419310049995e-06, + "loss": 0.5433, + "step": 4076 + }, + { + "epoch": 0.6433643679974751, + "grad_norm": 0.6003993153572083, + "learning_rate": 4.473938269471592e-06, + "loss": 0.5437, + "step": 4077 + }, + { + "epoch": 0.6435221713744674, + "grad_norm": 0.6350484490394592, + "learning_rate": 4.473683383967346e-06, + "loss": 0.5655, + "step": 4078 + }, + { + "epoch": 0.6436799747514597, + "grad_norm": 0.5923448204994202, + "learning_rate": 4.473428443994246e-06, + "loss": 0.5469, + "step": 4079 + }, + { + "epoch": 0.643837778128452, + "grad_norm": 0.5922819375991821, + "learning_rate": 4.473173449559327e-06, + "loss": 0.5509, + "step": 4080 + }, + { + "epoch": 0.6439955815054442, + "grad_norm": 0.5891865491867065, + "learning_rate": 4.472918400669626e-06, + "loss": 0.5789, + "step": 4081 + }, + { + "epoch": 0.6441533848824365, + "grad_norm": 0.5453842878341675, + "learning_rate": 4.472663297332184e-06, + "loss": 0.56, + "step": 4082 + }, + { + "epoch": 0.6443111882594288, + "grad_norm": 0.5925694704055786, + "learning_rate": 4.472408139554039e-06, + "loss": 0.5793, + "step": 4083 + }, + { + "epoch": 0.644468991636421, + "grad_norm": 0.5662216544151306, + "learning_rate": 4.472152927342233e-06, + "loss": 0.569, + "step": 4084 + }, + { + "epoch": 0.6446267950134132, + "grad_norm": 0.5505645871162415, + "learning_rate": 4.47189766070381e-06, + "loss": 0.5439, + "step": 4085 + }, + { + "epoch": 0.6447845983904056, + "grad_norm": 0.578457236289978, + "learning_rate": 4.471642339645814e-06, + "loss": 0.6045, + "step": 4086 + }, + { + "epoch": 0.6449424017673978, + "grad_norm": 0.5849851369857788, + "learning_rate": 4.471386964175292e-06, + "loss": 0.582, + "step": 4087 + }, + { + "epoch": 0.64510020514439, + "grad_norm": 0.5966413617134094, + "learning_rate": 4.471131534299291e-06, + "loss": 0.5855, + "step": 4088 + }, + { + "epoch": 0.6452580085213824, + "grad_norm": 0.5615219473838806, + "learning_rate": 4.47087605002486e-06, + "loss": 0.5928, + "step": 4089 + }, + { + "epoch": 0.6454158118983746, + "grad_norm": 0.588409960269928, + "learning_rate": 4.470620511359052e-06, + "loss": 0.5782, + "step": 4090 + }, + { + "epoch": 0.6455736152753669, + "grad_norm": 0.6040182113647461, + "learning_rate": 4.470364918308917e-06, + "loss": 0.5973, + "step": 4091 + }, + { + "epoch": 0.6457314186523592, + "grad_norm": 0.5947341918945312, + "learning_rate": 4.470109270881509e-06, + "loss": 0.5605, + "step": 4092 + }, + { + "epoch": 0.6458892220293514, + "grad_norm": 0.5882638096809387, + "learning_rate": 4.469853569083883e-06, + "loss": 0.623, + "step": 4093 + }, + { + "epoch": 0.6460470254063437, + "grad_norm": 0.5606505274772644, + "learning_rate": 4.469597812923097e-06, + "loss": 0.5377, + "step": 4094 + }, + { + "epoch": 0.646204828783336, + "grad_norm": 0.6003230810165405, + "learning_rate": 4.4693420024062084e-06, + "loss": 0.5696, + "step": 4095 + }, + { + "epoch": 0.6463626321603282, + "grad_norm": 0.5985450148582458, + "learning_rate": 4.4690861375402776e-06, + "loss": 0.5737, + "step": 4096 + }, + { + "epoch": 0.6465204355373205, + "grad_norm": 0.5587963461875916, + "learning_rate": 4.468830218332364e-06, + "loss": 0.5614, + "step": 4097 + }, + { + "epoch": 0.6466782389143128, + "grad_norm": 0.5965261459350586, + "learning_rate": 4.468574244789533e-06, + "loss": 0.5856, + "step": 4098 + }, + { + "epoch": 0.646836042291305, + "grad_norm": 0.5937463045120239, + "learning_rate": 4.468318216918847e-06, + "loss": 0.5415, + "step": 4099 + }, + { + "epoch": 0.6469938456682973, + "grad_norm": 0.6073063611984253, + "learning_rate": 4.468062134727372e-06, + "loss": 0.5511, + "step": 4100 + }, + { + "epoch": 0.6471516490452895, + "grad_norm": 0.5911657214164734, + "learning_rate": 4.4678059982221765e-06, + "loss": 0.5731, + "step": 4101 + }, + { + "epoch": 0.6473094524222819, + "grad_norm": 0.6292532086372375, + "learning_rate": 4.467549807410327e-06, + "loss": 0.5765, + "step": 4102 + }, + { + "epoch": 0.6474672557992741, + "grad_norm": 0.5853274464607239, + "learning_rate": 4.4672935622988965e-06, + "loss": 0.6114, + "step": 4103 + }, + { + "epoch": 0.6476250591762663, + "grad_norm": 0.5851520895957947, + "learning_rate": 4.467037262894954e-06, + "loss": 0.5737, + "step": 4104 + }, + { + "epoch": 0.6477828625532587, + "grad_norm": 0.6056844592094421, + "learning_rate": 4.466780909205574e-06, + "loss": 0.556, + "step": 4105 + }, + { + "epoch": 0.6479406659302509, + "grad_norm": 0.6278177499771118, + "learning_rate": 4.466524501237832e-06, + "loss": 0.562, + "step": 4106 + }, + { + "epoch": 0.6480984693072431, + "grad_norm": 0.5781410336494446, + "learning_rate": 4.466268038998804e-06, + "loss": 0.5599, + "step": 4107 + }, + { + "epoch": 0.6482562726842355, + "grad_norm": 0.6023623943328857, + "learning_rate": 4.466011522495566e-06, + "loss": 0.5405, + "step": 4108 + }, + { + "epoch": 0.6484140760612277, + "grad_norm": 0.569427490234375, + "learning_rate": 4.465754951735199e-06, + "loss": 0.583, + "step": 4109 + }, + { + "epoch": 0.6485718794382199, + "grad_norm": 0.602401852607727, + "learning_rate": 4.465498326724783e-06, + "loss": 0.5644, + "step": 4110 + }, + { + "epoch": 0.6487296828152123, + "grad_norm": 0.5793846845626831, + "learning_rate": 4.4652416474714e-06, + "loss": 0.5607, + "step": 4111 + }, + { + "epoch": 0.6488874861922045, + "grad_norm": 0.5903560519218445, + "learning_rate": 4.464984913982135e-06, + "loss": 0.574, + "step": 4112 + }, + { + "epoch": 0.6490452895691968, + "grad_norm": 0.5925446152687073, + "learning_rate": 4.464728126264072e-06, + "loss": 0.5857, + "step": 4113 + }, + { + "epoch": 0.649203092946189, + "grad_norm": 0.6088014841079712, + "learning_rate": 4.464471284324297e-06, + "loss": 0.5641, + "step": 4114 + }, + { + "epoch": 0.6493608963231813, + "grad_norm": 0.6111425757408142, + "learning_rate": 4.464214388169901e-06, + "loss": 0.5885, + "step": 4115 + }, + { + "epoch": 0.6495186997001736, + "grad_norm": 0.6093299388885498, + "learning_rate": 4.463957437807971e-06, + "loss": 0.579, + "step": 4116 + }, + { + "epoch": 0.6496765030771658, + "grad_norm": 0.6127672791481018, + "learning_rate": 4.463700433245599e-06, + "loss": 0.5569, + "step": 4117 + }, + { + "epoch": 0.6498343064541581, + "grad_norm": 0.579891562461853, + "learning_rate": 4.4634433744898784e-06, + "loss": 0.5934, + "step": 4118 + }, + { + "epoch": 0.6499921098311504, + "grad_norm": 0.587102472782135, + "learning_rate": 4.463186261547902e-06, + "loss": 0.585, + "step": 4119 + }, + { + "epoch": 0.6501499132081426, + "grad_norm": 0.5901090502738953, + "learning_rate": 4.462929094426768e-06, + "loss": 0.5684, + "step": 4120 + }, + { + "epoch": 0.650307716585135, + "grad_norm": 0.5766416788101196, + "learning_rate": 4.462671873133571e-06, + "loss": 0.5647, + "step": 4121 + }, + { + "epoch": 0.6504655199621272, + "grad_norm": 0.5794193148612976, + "learning_rate": 4.46241459767541e-06, + "loss": 0.5697, + "step": 4122 + }, + { + "epoch": 0.6506233233391194, + "grad_norm": 0.6083148121833801, + "learning_rate": 4.462157268059386e-06, + "loss": 0.587, + "step": 4123 + }, + { + "epoch": 0.6507811267161118, + "grad_norm": 0.5873644948005676, + "learning_rate": 4.461899884292601e-06, + "loss": 0.5654, + "step": 4124 + }, + { + "epoch": 0.650938930093104, + "grad_norm": 0.5748244524002075, + "learning_rate": 4.461642446382158e-06, + "loss": 0.5807, + "step": 4125 + }, + { + "epoch": 0.6510967334700962, + "grad_norm": 0.6248295307159424, + "learning_rate": 4.46138495433516e-06, + "loss": 0.5793, + "step": 4126 + }, + { + "epoch": 0.6512545368470886, + "grad_norm": 0.6251404285430908, + "learning_rate": 4.461127408158716e-06, + "loss": 0.5643, + "step": 4127 + }, + { + "epoch": 0.6514123402240808, + "grad_norm": 0.5921316146850586, + "learning_rate": 4.460869807859931e-06, + "loss": 0.5595, + "step": 4128 + }, + { + "epoch": 0.651570143601073, + "grad_norm": 0.6070018410682678, + "learning_rate": 4.460612153445916e-06, + "loss": 0.593, + "step": 4129 + }, + { + "epoch": 0.6517279469780654, + "grad_norm": 0.609850287437439, + "learning_rate": 4.46035444492378e-06, + "loss": 0.5569, + "step": 4130 + }, + { + "epoch": 0.6518857503550576, + "grad_norm": 0.5769793391227722, + "learning_rate": 4.460096682300636e-06, + "loss": 0.5895, + "step": 4131 + }, + { + "epoch": 0.6520435537320499, + "grad_norm": 0.6065996885299683, + "learning_rate": 4.459838865583599e-06, + "loss": 0.5988, + "step": 4132 + }, + { + "epoch": 0.6522013571090421, + "grad_norm": 0.6230129599571228, + "learning_rate": 4.459580994779782e-06, + "loss": 0.5799, + "step": 4133 + }, + { + "epoch": 0.6523591604860344, + "grad_norm": 0.5970026850700378, + "learning_rate": 4.459323069896302e-06, + "loss": 0.5827, + "step": 4134 + }, + { + "epoch": 0.6525169638630267, + "grad_norm": 0.6045116186141968, + "learning_rate": 4.459065090940277e-06, + "loss": 0.5161, + "step": 4135 + }, + { + "epoch": 0.6526747672400189, + "grad_norm": 0.5869005918502808, + "learning_rate": 4.458807057918828e-06, + "loss": 0.5663, + "step": 4136 + }, + { + "epoch": 0.6528325706170112, + "grad_norm": 0.6143922805786133, + "learning_rate": 4.458548970839074e-06, + "loss": 0.6161, + "step": 4137 + }, + { + "epoch": 0.6529903739940035, + "grad_norm": 0.5562669634819031, + "learning_rate": 4.45829082970814e-06, + "loss": 0.5474, + "step": 4138 + }, + { + "epoch": 0.6531481773709957, + "grad_norm": 0.6015672087669373, + "learning_rate": 4.458032634533148e-06, + "loss": 0.5778, + "step": 4139 + }, + { + "epoch": 0.653305980747988, + "grad_norm": 0.5788236260414124, + "learning_rate": 4.457774385321224e-06, + "loss": 0.5943, + "step": 4140 + }, + { + "epoch": 0.6534637841249803, + "grad_norm": 0.584974467754364, + "learning_rate": 4.457516082079496e-06, + "loss": 0.5732, + "step": 4141 + }, + { + "epoch": 0.6536215875019725, + "grad_norm": 0.6087425947189331, + "learning_rate": 4.457257724815092e-06, + "loss": 0.5372, + "step": 4142 + }, + { + "epoch": 0.6537793908789649, + "grad_norm": 0.581270158290863, + "learning_rate": 4.456999313535142e-06, + "loss": 0.5488, + "step": 4143 + }, + { + "epoch": 0.6539371942559571, + "grad_norm": 0.591917872428894, + "learning_rate": 4.456740848246776e-06, + "loss": 0.5585, + "step": 4144 + }, + { + "epoch": 0.6540949976329493, + "grad_norm": 0.5828263163566589, + "learning_rate": 4.456482328957131e-06, + "loss": 0.5685, + "step": 4145 + }, + { + "epoch": 0.6542528010099417, + "grad_norm": 0.7523752450942993, + "learning_rate": 4.4562237556733375e-06, + "loss": 0.6021, + "step": 4146 + }, + { + "epoch": 0.6544106043869339, + "grad_norm": 0.5800158381462097, + "learning_rate": 4.455965128402533e-06, + "loss": 0.5837, + "step": 4147 + }, + { + "epoch": 0.6545684077639261, + "grad_norm": 0.6082311272621155, + "learning_rate": 4.455706447151855e-06, + "loss": 0.5971, + "step": 4148 + }, + { + "epoch": 0.6547262111409184, + "grad_norm": 0.5882702469825745, + "learning_rate": 4.455447711928443e-06, + "loss": 0.5621, + "step": 4149 + }, + { + "epoch": 0.6548840145179107, + "grad_norm": 0.5768996477127075, + "learning_rate": 4.455188922739435e-06, + "loss": 0.5864, + "step": 4150 + }, + { + "epoch": 0.6550418178949029, + "grad_norm": 0.5574616193771362, + "learning_rate": 4.454930079591977e-06, + "loss": 0.5959, + "step": 4151 + }, + { + "epoch": 0.6551996212718952, + "grad_norm": 0.6270413994789124, + "learning_rate": 4.454671182493208e-06, + "loss": 0.5851, + "step": 4152 + }, + { + "epoch": 0.6553574246488875, + "grad_norm": 0.6024866104125977, + "learning_rate": 4.454412231450278e-06, + "loss": 0.5984, + "step": 4153 + }, + { + "epoch": 0.6555152280258798, + "grad_norm": 0.6911225318908691, + "learning_rate": 4.454153226470329e-06, + "loss": 0.5362, + "step": 4154 + }, + { + "epoch": 0.655673031402872, + "grad_norm": 0.5668469071388245, + "learning_rate": 4.453894167560511e-06, + "loss": 0.5918, + "step": 4155 + }, + { + "epoch": 0.6558308347798643, + "grad_norm": 0.5538781881332397, + "learning_rate": 4.453635054727972e-06, + "loss": 0.5761, + "step": 4156 + }, + { + "epoch": 0.6559886381568566, + "grad_norm": 0.56329745054245, + "learning_rate": 4.4533758879798645e-06, + "loss": 0.5582, + "step": 4157 + }, + { + "epoch": 0.6561464415338488, + "grad_norm": 0.575645923614502, + "learning_rate": 4.45311666732334e-06, + "loss": 0.5923, + "step": 4158 + }, + { + "epoch": 0.656304244910841, + "grad_norm": 0.5865269899368286, + "learning_rate": 4.452857392765552e-06, + "loss": 0.5753, + "step": 4159 + }, + { + "epoch": 0.6564620482878334, + "grad_norm": 0.6035284399986267, + "learning_rate": 4.452598064313658e-06, + "loss": 0.6029, + "step": 4160 + }, + { + "epoch": 0.6566198516648256, + "grad_norm": 0.6213036179542542, + "learning_rate": 4.452338681974812e-06, + "loss": 0.5853, + "step": 4161 + }, + { + "epoch": 0.6567776550418178, + "grad_norm": 0.6504574418067932, + "learning_rate": 4.4520792457561735e-06, + "loss": 0.5678, + "step": 4162 + }, + { + "epoch": 0.6569354584188102, + "grad_norm": 0.5956661105155945, + "learning_rate": 4.451819755664903e-06, + "loss": 0.5715, + "step": 4163 + }, + { + "epoch": 0.6570932617958024, + "grad_norm": 0.5787553191184998, + "learning_rate": 4.451560211708161e-06, + "loss": 0.5707, + "step": 4164 + }, + { + "epoch": 0.6572510651727947, + "grad_norm": 0.5884168744087219, + "learning_rate": 4.45130061389311e-06, + "loss": 0.5768, + "step": 4165 + }, + { + "epoch": 0.657408868549787, + "grad_norm": 0.6098105311393738, + "learning_rate": 4.451040962226915e-06, + "loss": 0.5714, + "step": 4166 + }, + { + "epoch": 0.6575666719267792, + "grad_norm": 0.5688106417655945, + "learning_rate": 4.450781256716741e-06, + "loss": 0.6052, + "step": 4167 + }, + { + "epoch": 0.6577244753037715, + "grad_norm": 0.6133478879928589, + "learning_rate": 4.4505214973697565e-06, + "loss": 0.585, + "step": 4168 + }, + { + "epoch": 0.6578822786807638, + "grad_norm": 0.6285780072212219, + "learning_rate": 4.4502616841931285e-06, + "loss": 0.5742, + "step": 4169 + }, + { + "epoch": 0.658040082057756, + "grad_norm": 0.5809615850448608, + "learning_rate": 4.450001817194029e-06, + "loss": 0.5995, + "step": 4170 + }, + { + "epoch": 0.6581978854347483, + "grad_norm": 0.6053906679153442, + "learning_rate": 4.449741896379628e-06, + "loss": 0.5693, + "step": 4171 + }, + { + "epoch": 0.6583556888117406, + "grad_norm": 0.5819376707077026, + "learning_rate": 4.4494819217570995e-06, + "loss": 0.5814, + "step": 4172 + }, + { + "epoch": 0.6585134921887328, + "grad_norm": 0.6118245720863342, + "learning_rate": 4.4492218933336185e-06, + "loss": 0.578, + "step": 4173 + }, + { + "epoch": 0.6586712955657251, + "grad_norm": 0.5901440978050232, + "learning_rate": 4.448961811116361e-06, + "loss": 0.5526, + "step": 4174 + }, + { + "epoch": 0.6588290989427174, + "grad_norm": 0.5932331085205078, + "learning_rate": 4.448701675112504e-06, + "loss": 0.5764, + "step": 4175 + }, + { + "epoch": 0.6589869023197097, + "grad_norm": 0.5819812417030334, + "learning_rate": 4.448441485329227e-06, + "loss": 0.605, + "step": 4176 + }, + { + "epoch": 0.6591447056967019, + "grad_norm": 0.6146141290664673, + "learning_rate": 4.448181241773711e-06, + "loss": 0.586, + "step": 4177 + }, + { + "epoch": 0.6593025090736941, + "grad_norm": 0.5795128345489502, + "learning_rate": 4.447920944453138e-06, + "loss": 0.5556, + "step": 4178 + }, + { + "epoch": 0.6594603124506865, + "grad_norm": 0.5819427967071533, + "learning_rate": 4.44766059337469e-06, + "loss": 0.5926, + "step": 4179 + }, + { + "epoch": 0.6596181158276787, + "grad_norm": 0.5878691077232361, + "learning_rate": 4.4474001885455555e-06, + "loss": 0.5625, + "step": 4180 + }, + { + "epoch": 0.6597759192046709, + "grad_norm": 0.6003605723381042, + "learning_rate": 4.447139729972918e-06, + "loss": 0.6125, + "step": 4181 + }, + { + "epoch": 0.6599337225816633, + "grad_norm": 0.5938643217086792, + "learning_rate": 4.446879217663966e-06, + "loss": 0.5487, + "step": 4182 + }, + { + "epoch": 0.6600915259586555, + "grad_norm": 0.5974464416503906, + "learning_rate": 4.44661865162589e-06, + "loss": 0.6012, + "step": 4183 + }, + { + "epoch": 0.6602493293356477, + "grad_norm": 0.5832759141921997, + "learning_rate": 4.4463580318658805e-06, + "loss": 0.5814, + "step": 4184 + }, + { + "epoch": 0.6604071327126401, + "grad_norm": 0.6086528897285461, + "learning_rate": 4.446097358391129e-06, + "loss": 0.5613, + "step": 4185 + }, + { + "epoch": 0.6605649360896323, + "grad_norm": 0.5960040092468262, + "learning_rate": 4.445836631208831e-06, + "loss": 0.6028, + "step": 4186 + }, + { + "epoch": 0.6607227394666246, + "grad_norm": 0.5981992483139038, + "learning_rate": 4.445575850326181e-06, + "loss": 0.6006, + "step": 4187 + }, + { + "epoch": 0.6608805428436169, + "grad_norm": 0.5935457348823547, + "learning_rate": 4.4453150157503776e-06, + "loss": 0.5462, + "step": 4188 + }, + { + "epoch": 0.6610383462206091, + "grad_norm": 0.572258710861206, + "learning_rate": 4.445054127488616e-06, + "loss": 0.5649, + "step": 4189 + }, + { + "epoch": 0.6611961495976014, + "grad_norm": 0.6169089674949646, + "learning_rate": 4.4447931855480984e-06, + "loss": 0.5743, + "step": 4190 + }, + { + "epoch": 0.6613539529745937, + "grad_norm": 0.5846678614616394, + "learning_rate": 4.444532189936026e-06, + "loss": 0.628, + "step": 4191 + }, + { + "epoch": 0.6615117563515859, + "grad_norm": 0.5746752619743347, + "learning_rate": 4.444271140659601e-06, + "loss": 0.5825, + "step": 4192 + }, + { + "epoch": 0.6616695597285782, + "grad_norm": 0.5852310061454773, + "learning_rate": 4.444010037726028e-06, + "loss": 0.5884, + "step": 4193 + }, + { + "epoch": 0.6618273631055704, + "grad_norm": 0.5756063461303711, + "learning_rate": 4.443748881142513e-06, + "loss": 0.5527, + "step": 4194 + }, + { + "epoch": 0.6619851664825628, + "grad_norm": 0.5805996060371399, + "learning_rate": 4.443487670916263e-06, + "loss": 0.609, + "step": 4195 + }, + { + "epoch": 0.662142969859555, + "grad_norm": 0.6129705309867859, + "learning_rate": 4.443226407054488e-06, + "loss": 0.5926, + "step": 4196 + }, + { + "epoch": 0.6623007732365472, + "grad_norm": 0.5795760750770569, + "learning_rate": 4.442965089564396e-06, + "loss": 0.5582, + "step": 4197 + }, + { + "epoch": 0.6624585766135396, + "grad_norm": 0.6188676953315735, + "learning_rate": 4.4427037184532e-06, + "loss": 0.5997, + "step": 4198 + }, + { + "epoch": 0.6626163799905318, + "grad_norm": 0.5785398483276367, + "learning_rate": 4.442442293728113e-06, + "loss": 0.5436, + "step": 4199 + }, + { + "epoch": 0.662774183367524, + "grad_norm": 0.5835587382316589, + "learning_rate": 4.4421808153963496e-06, + "loss": 0.5875, + "step": 4200 + }, + { + "epoch": 0.6629319867445164, + "grad_norm": 0.5893164873123169, + "learning_rate": 4.4419192834651265e-06, + "loss": 0.6048, + "step": 4201 + }, + { + "epoch": 0.6630897901215086, + "grad_norm": 0.6093225479125977, + "learning_rate": 4.4416576979416604e-06, + "loss": 0.5713, + "step": 4202 + }, + { + "epoch": 0.6632475934985008, + "grad_norm": 0.585813045501709, + "learning_rate": 4.4413960588331715e-06, + "loss": 0.5623, + "step": 4203 + }, + { + "epoch": 0.6634053968754932, + "grad_norm": 0.6243320107460022, + "learning_rate": 4.44113436614688e-06, + "loss": 0.521, + "step": 4204 + }, + { + "epoch": 0.6635632002524854, + "grad_norm": 0.6185029149055481, + "learning_rate": 4.440872619890008e-06, + "loss": 0.568, + "step": 4205 + }, + { + "epoch": 0.6637210036294777, + "grad_norm": 0.5818284749984741, + "learning_rate": 4.440610820069779e-06, + "loss": 0.5466, + "step": 4206 + }, + { + "epoch": 0.66387880700647, + "grad_norm": 0.608137309551239, + "learning_rate": 4.440348966693416e-06, + "loss": 0.6046, + "step": 4207 + }, + { + "epoch": 0.6640366103834622, + "grad_norm": 0.5528738498687744, + "learning_rate": 4.4400870597681494e-06, + "loss": 0.5423, + "step": 4208 + }, + { + "epoch": 0.6641944137604545, + "grad_norm": 0.6027886271476746, + "learning_rate": 4.439825099301205e-06, + "loss": 0.5587, + "step": 4209 + }, + { + "epoch": 0.6643522171374467, + "grad_norm": 0.5856368541717529, + "learning_rate": 4.439563085299812e-06, + "loss": 0.5822, + "step": 4210 + }, + { + "epoch": 0.664510020514439, + "grad_norm": 0.5636337995529175, + "learning_rate": 4.439301017771203e-06, + "loss": 0.5414, + "step": 4211 + }, + { + "epoch": 0.6646678238914313, + "grad_norm": 0.5495761036872864, + "learning_rate": 4.439038896722609e-06, + "loss": 0.5532, + "step": 4212 + }, + { + "epoch": 0.6648256272684235, + "grad_norm": 0.6084924340248108, + "learning_rate": 4.438776722161263e-06, + "loss": 0.5892, + "step": 4213 + }, + { + "epoch": 0.6649834306454158, + "grad_norm": 0.5770279765129089, + "learning_rate": 4.438514494094403e-06, + "loss": 0.5533, + "step": 4214 + }, + { + "epoch": 0.6651412340224081, + "grad_norm": 0.6091262102127075, + "learning_rate": 4.438252212529263e-06, + "loss": 0.5571, + "step": 4215 + }, + { + "epoch": 0.6652990373994003, + "grad_norm": 0.566726803779602, + "learning_rate": 4.437989877473083e-06, + "loss": 0.5508, + "step": 4216 + }, + { + "epoch": 0.6654568407763927, + "grad_norm": 0.5826348662376404, + "learning_rate": 4.437727488933104e-06, + "loss": 0.5435, + "step": 4217 + }, + { + "epoch": 0.6656146441533849, + "grad_norm": 0.5808410048484802, + "learning_rate": 4.437465046916565e-06, + "loss": 0.5486, + "step": 4218 + }, + { + "epoch": 0.6657724475303771, + "grad_norm": 0.6073369383811951, + "learning_rate": 4.4372025514307096e-06, + "loss": 0.5636, + "step": 4219 + }, + { + "epoch": 0.6659302509073695, + "grad_norm": 0.5566904544830322, + "learning_rate": 4.436940002482782e-06, + "loss": 0.5883, + "step": 4220 + }, + { + "epoch": 0.6660880542843617, + "grad_norm": 0.5517571568489075, + "learning_rate": 4.436677400080028e-06, + "loss": 0.599, + "step": 4221 + }, + { + "epoch": 0.6662458576613539, + "grad_norm": 0.5833410620689392, + "learning_rate": 4.436414744229695e-06, + "loss": 0.5998, + "step": 4222 + }, + { + "epoch": 0.6664036610383463, + "grad_norm": 0.5762567520141602, + "learning_rate": 4.436152034939031e-06, + "loss": 0.5995, + "step": 4223 + }, + { + "epoch": 0.6665614644153385, + "grad_norm": 0.6283738017082214, + "learning_rate": 4.435889272215287e-06, + "loss": 0.6103, + "step": 4224 + }, + { + "epoch": 0.6667192677923307, + "grad_norm": 0.5847880840301514, + "learning_rate": 4.435626456065715e-06, + "loss": 0.564, + "step": 4225 + }, + { + "epoch": 0.666877071169323, + "grad_norm": 0.5859417915344238, + "learning_rate": 4.4353635864975655e-06, + "loss": 0.5591, + "step": 4226 + }, + { + "epoch": 0.6670348745463153, + "grad_norm": 0.5774564743041992, + "learning_rate": 4.435100663518096e-06, + "loss": 0.5786, + "step": 4227 + }, + { + "epoch": 0.6671926779233076, + "grad_norm": 0.5654213428497314, + "learning_rate": 4.434837687134561e-06, + "loss": 0.5754, + "step": 4228 + }, + { + "epoch": 0.6673504813002998, + "grad_norm": 0.5807563066482544, + "learning_rate": 4.434574657354218e-06, + "loss": 0.5678, + "step": 4229 + }, + { + "epoch": 0.6675082846772921, + "grad_norm": 0.5766006708145142, + "learning_rate": 4.434311574184327e-06, + "loss": 0.6078, + "step": 4230 + }, + { + "epoch": 0.6676660880542844, + "grad_norm": 0.6221767067909241, + "learning_rate": 4.434048437632148e-06, + "loss": 0.5779, + "step": 4231 + }, + { + "epoch": 0.6678238914312766, + "grad_norm": 0.5615194439888, + "learning_rate": 4.433785247704942e-06, + "loss": 0.5455, + "step": 4232 + }, + { + "epoch": 0.6679816948082689, + "grad_norm": 0.5913371443748474, + "learning_rate": 4.433522004409974e-06, + "loss": 0.6069, + "step": 4233 + }, + { + "epoch": 0.6681394981852612, + "grad_norm": 0.5908042192459106, + "learning_rate": 4.433258707754507e-06, + "loss": 0.5497, + "step": 4234 + }, + { + "epoch": 0.6682973015622534, + "grad_norm": 0.5594935417175293, + "learning_rate": 4.432995357745809e-06, + "loss": 0.5814, + "step": 4235 + }, + { + "epoch": 0.6684551049392456, + "grad_norm": 0.5621492266654968, + "learning_rate": 4.432731954391147e-06, + "loss": 0.5857, + "step": 4236 + }, + { + "epoch": 0.668612908316238, + "grad_norm": 0.6132891774177551, + "learning_rate": 4.432468497697791e-06, + "loss": 0.5931, + "step": 4237 + }, + { + "epoch": 0.6687707116932302, + "grad_norm": 0.5982575416564941, + "learning_rate": 4.43220498767301e-06, + "loss": 0.5699, + "step": 4238 + }, + { + "epoch": 0.6689285150702226, + "grad_norm": 0.54840487241745, + "learning_rate": 4.4319414243240776e-06, + "loss": 0.5906, + "step": 4239 + }, + { + "epoch": 0.6690863184472148, + "grad_norm": 0.6006852984428406, + "learning_rate": 4.431677807658268e-06, + "loss": 0.5648, + "step": 4240 + }, + { + "epoch": 0.669244121824207, + "grad_norm": 0.5846478939056396, + "learning_rate": 4.4314141376828555e-06, + "loss": 0.5972, + "step": 4241 + }, + { + "epoch": 0.6694019252011993, + "grad_norm": 0.6041056513786316, + "learning_rate": 4.431150414405118e-06, + "loss": 0.5775, + "step": 4242 + }, + { + "epoch": 0.6695597285781916, + "grad_norm": 0.5962579846382141, + "learning_rate": 4.430886637832331e-06, + "loss": 0.5543, + "step": 4243 + }, + { + "epoch": 0.6697175319551838, + "grad_norm": 0.6035997271537781, + "learning_rate": 4.430622807971776e-06, + "loss": 0.5987, + "step": 4244 + }, + { + "epoch": 0.6698753353321761, + "grad_norm": 0.5711162686347961, + "learning_rate": 4.430358924830734e-06, + "loss": 0.5791, + "step": 4245 + }, + { + "epoch": 0.6700331387091684, + "grad_norm": 0.588038980960846, + "learning_rate": 4.430094988416488e-06, + "loss": 0.5712, + "step": 4246 + }, + { + "epoch": 0.6701909420861606, + "grad_norm": 0.6201766729354858, + "learning_rate": 4.429830998736321e-06, + "loss": 0.5973, + "step": 4247 + }, + { + "epoch": 0.6703487454631529, + "grad_norm": 0.6280675530433655, + "learning_rate": 4.429566955797518e-06, + "loss": 0.5541, + "step": 4248 + }, + { + "epoch": 0.6705065488401452, + "grad_norm": 0.6264338493347168, + "learning_rate": 4.429302859607368e-06, + "loss": 0.6096, + "step": 4249 + }, + { + "epoch": 0.6706643522171375, + "grad_norm": 0.5605957508087158, + "learning_rate": 4.429038710173157e-06, + "loss": 0.5493, + "step": 4250 + }, + { + "epoch": 0.6708221555941297, + "grad_norm": 0.5856061577796936, + "learning_rate": 4.428774507502177e-06, + "loss": 0.5899, + "step": 4251 + }, + { + "epoch": 0.670979958971122, + "grad_norm": 0.5699422359466553, + "learning_rate": 4.428510251601718e-06, + "loss": 0.5719, + "step": 4252 + }, + { + "epoch": 0.6711377623481143, + "grad_norm": 0.5865249633789062, + "learning_rate": 4.428245942479073e-06, + "loss": 0.5737, + "step": 4253 + }, + { + "epoch": 0.6712955657251065, + "grad_norm": 0.5539838671684265, + "learning_rate": 4.427981580141535e-06, + "loss": 0.588, + "step": 4254 + }, + { + "epoch": 0.6714533691020987, + "grad_norm": 0.5956207513809204, + "learning_rate": 4.4277171645964035e-06, + "loss": 0.5686, + "step": 4255 + }, + { + "epoch": 0.6716111724790911, + "grad_norm": 0.5873770713806152, + "learning_rate": 4.427452695850972e-06, + "loss": 0.5843, + "step": 4256 + }, + { + "epoch": 0.6717689758560833, + "grad_norm": 0.5606580972671509, + "learning_rate": 4.427188173912541e-06, + "loss": 0.5741, + "step": 4257 + }, + { + "epoch": 0.6719267792330755, + "grad_norm": 0.6167193651199341, + "learning_rate": 4.426923598788411e-06, + "loss": 0.5823, + "step": 4258 + }, + { + "epoch": 0.6720845826100679, + "grad_norm": 0.5801055431365967, + "learning_rate": 4.426658970485882e-06, + "loss": 0.5743, + "step": 4259 + }, + { + "epoch": 0.6722423859870601, + "grad_norm": 0.6191515922546387, + "learning_rate": 4.426394289012259e-06, + "loss": 0.5323, + "step": 4260 + }, + { + "epoch": 0.6724001893640524, + "grad_norm": 0.6125592589378357, + "learning_rate": 4.426129554374845e-06, + "loss": 0.5672, + "step": 4261 + }, + { + "epoch": 0.6725579927410447, + "grad_norm": 0.5994083285331726, + "learning_rate": 4.425864766580947e-06, + "loss": 0.5784, + "step": 4262 + }, + { + "epoch": 0.6727157961180369, + "grad_norm": 0.5951423645019531, + "learning_rate": 4.425599925637871e-06, + "loss": 0.5945, + "step": 4263 + }, + { + "epoch": 0.6728735994950292, + "grad_norm": 0.5982719659805298, + "learning_rate": 4.425335031552928e-06, + "loss": 0.5533, + "step": 4264 + }, + { + "epoch": 0.6730314028720215, + "grad_norm": 0.5740861296653748, + "learning_rate": 4.4250700843334274e-06, + "loss": 0.5745, + "step": 4265 + }, + { + "epoch": 0.6731892062490137, + "grad_norm": 0.6102584004402161, + "learning_rate": 4.424805083986682e-06, + "loss": 0.5629, + "step": 4266 + }, + { + "epoch": 0.673347009626006, + "grad_norm": 0.632335364818573, + "learning_rate": 4.424540030520002e-06, + "loss": 0.542, + "step": 4267 + }, + { + "epoch": 0.6735048130029982, + "grad_norm": 0.5755340456962585, + "learning_rate": 4.424274923940707e-06, + "loss": 0.5859, + "step": 4268 + }, + { + "epoch": 0.6736626163799906, + "grad_norm": 0.5911643505096436, + "learning_rate": 4.42400976425611e-06, + "loss": 0.554, + "step": 4269 + }, + { + "epoch": 0.6738204197569828, + "grad_norm": 0.5866098999977112, + "learning_rate": 4.42374455147353e-06, + "loss": 0.5569, + "step": 4270 + }, + { + "epoch": 0.673978223133975, + "grad_norm": 0.5783259868621826, + "learning_rate": 4.423479285600286e-06, + "loss": 0.5395, + "step": 4271 + }, + { + "epoch": 0.6741360265109674, + "grad_norm": 0.6019337177276611, + "learning_rate": 4.423213966643698e-06, + "loss": 0.5821, + "step": 4272 + }, + { + "epoch": 0.6742938298879596, + "grad_norm": 0.5997615456581116, + "learning_rate": 4.422948594611088e-06, + "loss": 0.5636, + "step": 4273 + }, + { + "epoch": 0.6744516332649518, + "grad_norm": 0.632562518119812, + "learning_rate": 4.422683169509782e-06, + "loss": 0.5972, + "step": 4274 + }, + { + "epoch": 0.6746094366419442, + "grad_norm": 0.5996300578117371, + "learning_rate": 4.422417691347103e-06, + "loss": 0.5958, + "step": 4275 + }, + { + "epoch": 0.6747672400189364, + "grad_norm": 0.5706713795661926, + "learning_rate": 4.422152160130378e-06, + "loss": 0.5537, + "step": 4276 + }, + { + "epoch": 0.6749250433959286, + "grad_norm": 0.6103735566139221, + "learning_rate": 4.421886575866934e-06, + "loss": 0.5647, + "step": 4277 + }, + { + "epoch": 0.675082846772921, + "grad_norm": 0.5945660471916199, + "learning_rate": 4.421620938564103e-06, + "loss": 0.5636, + "step": 4278 + }, + { + "epoch": 0.6752406501499132, + "grad_norm": 0.6086753606796265, + "learning_rate": 4.421355248229213e-06, + "loss": 0.5609, + "step": 4279 + }, + { + "epoch": 0.6753984535269055, + "grad_norm": 0.5821974873542786, + "learning_rate": 4.421089504869599e-06, + "loss": 0.5823, + "step": 4280 + }, + { + "epoch": 0.6755562569038978, + "grad_norm": 0.6035138964653015, + "learning_rate": 4.420823708492593e-06, + "loss": 0.5818, + "step": 4281 + }, + { + "epoch": 0.67571406028089, + "grad_norm": 0.6008432507514954, + "learning_rate": 4.420557859105531e-06, + "loss": 0.6113, + "step": 4282 + }, + { + "epoch": 0.6758718636578823, + "grad_norm": 0.5781940221786499, + "learning_rate": 4.420291956715751e-06, + "loss": 0.5344, + "step": 4283 + }, + { + "epoch": 0.6760296670348745, + "grad_norm": 0.5710122585296631, + "learning_rate": 4.420026001330589e-06, + "loss": 0.5726, + "step": 4284 + }, + { + "epoch": 0.6761874704118668, + "grad_norm": 0.5833037495613098, + "learning_rate": 4.419759992957386e-06, + "loss": 0.5769, + "step": 4285 + }, + { + "epoch": 0.6763452737888591, + "grad_norm": 0.5762399435043335, + "learning_rate": 4.419493931603484e-06, + "loss": 0.5675, + "step": 4286 + }, + { + "epoch": 0.6765030771658513, + "grad_norm": 0.5732768774032593, + "learning_rate": 4.419227817276223e-06, + "loss": 0.579, + "step": 4287 + }, + { + "epoch": 0.6766608805428436, + "grad_norm": 0.5637930631637573, + "learning_rate": 4.418961649982949e-06, + "loss": 0.567, + "step": 4288 + }, + { + "epoch": 0.6768186839198359, + "grad_norm": 0.6113193035125732, + "learning_rate": 4.418695429731008e-06, + "loss": 0.5885, + "step": 4289 + }, + { + "epoch": 0.6769764872968281, + "grad_norm": 0.6465269923210144, + "learning_rate": 4.418429156527746e-06, + "loss": 0.593, + "step": 4290 + }, + { + "epoch": 0.6771342906738205, + "grad_norm": 0.5973665118217468, + "learning_rate": 4.418162830380512e-06, + "loss": 0.5789, + "step": 4291 + }, + { + "epoch": 0.6772920940508127, + "grad_norm": 0.5633241534233093, + "learning_rate": 4.417896451296656e-06, + "loss": 0.5751, + "step": 4292 + }, + { + "epoch": 0.6774498974278049, + "grad_norm": 0.5921565294265747, + "learning_rate": 4.4176300192835295e-06, + "loss": 0.55, + "step": 4293 + }, + { + "epoch": 0.6776077008047973, + "grad_norm": 0.5725169777870178, + "learning_rate": 4.417363534348484e-06, + "loss": 0.5813, + "step": 4294 + }, + { + "epoch": 0.6777655041817895, + "grad_norm": 0.6377445459365845, + "learning_rate": 4.4170969964988754e-06, + "loss": 0.5802, + "step": 4295 + }, + { + "epoch": 0.6779233075587817, + "grad_norm": 0.6183264851570129, + "learning_rate": 4.41683040574206e-06, + "loss": 0.5622, + "step": 4296 + }, + { + "epoch": 0.6780811109357741, + "grad_norm": 0.5739991664886475, + "learning_rate": 4.416563762085393e-06, + "loss": 0.5688, + "step": 4297 + }, + { + "epoch": 0.6782389143127663, + "grad_norm": 0.5728615522384644, + "learning_rate": 4.416297065536234e-06, + "loss": 0.5839, + "step": 4298 + }, + { + "epoch": 0.6783967176897585, + "grad_norm": 0.6093677878379822, + "learning_rate": 4.416030316101944e-06, + "loss": 0.5493, + "step": 4299 + }, + { + "epoch": 0.6785545210667508, + "grad_norm": 0.5569461584091187, + "learning_rate": 4.415763513789884e-06, + "loss": 0.5671, + "step": 4300 + }, + { + "epoch": 0.6787123244437431, + "grad_norm": 0.5700674057006836, + "learning_rate": 4.4154966586074165e-06, + "loss": 0.5719, + "step": 4301 + }, + { + "epoch": 0.6788701278207354, + "grad_norm": 0.5624406933784485, + "learning_rate": 4.415229750561907e-06, + "loss": 0.5054, + "step": 4302 + }, + { + "epoch": 0.6790279311977276, + "grad_norm": 0.5535711050033569, + "learning_rate": 4.414962789660722e-06, + "loss": 0.5964, + "step": 4303 + }, + { + "epoch": 0.6791857345747199, + "grad_norm": 0.5853193402290344, + "learning_rate": 4.414695775911227e-06, + "loss": 0.5727, + "step": 4304 + }, + { + "epoch": 0.6793435379517122, + "grad_norm": 0.5704105496406555, + "learning_rate": 4.414428709320792e-06, + "loss": 0.5811, + "step": 4305 + }, + { + "epoch": 0.6795013413287044, + "grad_norm": 0.5830715298652649, + "learning_rate": 4.414161589896788e-06, + "loss": 0.5809, + "step": 4306 + }, + { + "epoch": 0.6796591447056967, + "grad_norm": 0.5924286246299744, + "learning_rate": 4.413894417646586e-06, + "loss": 0.5805, + "step": 4307 + }, + { + "epoch": 0.679816948082689, + "grad_norm": 0.5976144671440125, + "learning_rate": 4.4136271925775595e-06, + "loss": 0.5368, + "step": 4308 + }, + { + "epoch": 0.6799747514596812, + "grad_norm": 0.5654700398445129, + "learning_rate": 4.413359914697084e-06, + "loss": 0.6212, + "step": 4309 + }, + { + "epoch": 0.6801325548366735, + "grad_norm": 0.6022637486457825, + "learning_rate": 4.413092584012534e-06, + "loss": 0.5587, + "step": 4310 + }, + { + "epoch": 0.6802903582136658, + "grad_norm": 0.6001619100570679, + "learning_rate": 4.412825200531289e-06, + "loss": 0.5457, + "step": 4311 + }, + { + "epoch": 0.680448161590658, + "grad_norm": 0.5953906178474426, + "learning_rate": 4.412557764260727e-06, + "loss": 0.5459, + "step": 4312 + }, + { + "epoch": 0.6806059649676504, + "grad_norm": 0.5767130255699158, + "learning_rate": 4.41229027520823e-06, + "loss": 0.5717, + "step": 4313 + }, + { + "epoch": 0.6807637683446426, + "grad_norm": 0.5865201950073242, + "learning_rate": 4.412022733381178e-06, + "loss": 0.6053, + "step": 4314 + }, + { + "epoch": 0.6809215717216348, + "grad_norm": 0.5997714996337891, + "learning_rate": 4.411755138786956e-06, + "loss": 0.5705, + "step": 4315 + }, + { + "epoch": 0.6810793750986271, + "grad_norm": 0.5944494605064392, + "learning_rate": 4.411487491432948e-06, + "loss": 0.6102, + "step": 4316 + }, + { + "epoch": 0.6812371784756194, + "grad_norm": 0.5570665597915649, + "learning_rate": 4.411219791326541e-06, + "loss": 0.573, + "step": 4317 + }, + { + "epoch": 0.6813949818526116, + "grad_norm": 0.5959364771842957, + "learning_rate": 4.4109520384751225e-06, + "loss": 0.6133, + "step": 4318 + }, + { + "epoch": 0.6815527852296039, + "grad_norm": 0.5829921960830688, + "learning_rate": 4.410684232886082e-06, + "loss": 0.5729, + "step": 4319 + }, + { + "epoch": 0.6817105886065962, + "grad_norm": 0.5899233818054199, + "learning_rate": 4.410416374566811e-06, + "loss": 0.5777, + "step": 4320 + }, + { + "epoch": 0.6818683919835884, + "grad_norm": 0.6074293851852417, + "learning_rate": 4.410148463524702e-06, + "loss": 0.6029, + "step": 4321 + }, + { + "epoch": 0.6820261953605807, + "grad_norm": 0.5729297399520874, + "learning_rate": 4.409880499767146e-06, + "loss": 0.5962, + "step": 4322 + }, + { + "epoch": 0.682183998737573, + "grad_norm": 0.6113345623016357, + "learning_rate": 4.409612483301541e-06, + "loss": 0.596, + "step": 4323 + }, + { + "epoch": 0.6823418021145653, + "grad_norm": 0.5911570191383362, + "learning_rate": 4.409344414135283e-06, + "loss": 0.6032, + "step": 4324 + }, + { + "epoch": 0.6824996054915575, + "grad_norm": 0.5950618982315063, + "learning_rate": 4.409076292275768e-06, + "loss": 0.5763, + "step": 4325 + }, + { + "epoch": 0.6826574088685498, + "grad_norm": 0.5650209784507751, + "learning_rate": 4.408808117730399e-06, + "loss": 0.5944, + "step": 4326 + }, + { + "epoch": 0.6828152122455421, + "grad_norm": 0.5584108233451843, + "learning_rate": 4.408539890506574e-06, + "loss": 0.597, + "step": 4327 + }, + { + "epoch": 0.6829730156225343, + "grad_norm": 0.5898020267486572, + "learning_rate": 4.408271610611697e-06, + "loss": 0.6082, + "step": 4328 + }, + { + "epoch": 0.6831308189995265, + "grad_norm": 0.6129211783409119, + "learning_rate": 4.40800327805317e-06, + "loss": 0.5665, + "step": 4329 + }, + { + "epoch": 0.6832886223765189, + "grad_norm": 0.5767385363578796, + "learning_rate": 4.407734892838401e-06, + "loss": 0.5799, + "step": 4330 + }, + { + "epoch": 0.6834464257535111, + "grad_norm": 0.5559157133102417, + "learning_rate": 4.4074664549747945e-06, + "loss": 0.5159, + "step": 4331 + }, + { + "epoch": 0.6836042291305033, + "grad_norm": 0.5858905911445618, + "learning_rate": 4.407197964469761e-06, + "loss": 0.5601, + "step": 4332 + }, + { + "epoch": 0.6837620325074957, + "grad_norm": 0.5782088041305542, + "learning_rate": 4.406929421330709e-06, + "loss": 0.553, + "step": 4333 + }, + { + "epoch": 0.6839198358844879, + "grad_norm": 0.598780632019043, + "learning_rate": 4.406660825565048e-06, + "loss": 0.5668, + "step": 4334 + }, + { + "epoch": 0.6840776392614802, + "grad_norm": 0.5998284816741943, + "learning_rate": 4.406392177180193e-06, + "loss": 0.5918, + "step": 4335 + }, + { + "epoch": 0.6842354426384725, + "grad_norm": 0.5816047191619873, + "learning_rate": 4.406123476183557e-06, + "loss": 0.5979, + "step": 4336 + }, + { + "epoch": 0.6843932460154647, + "grad_norm": 0.6174497604370117, + "learning_rate": 4.405854722582556e-06, + "loss": 0.5658, + "step": 4337 + }, + { + "epoch": 0.684551049392457, + "grad_norm": 0.5658040046691895, + "learning_rate": 4.405585916384606e-06, + "loss": 0.5713, + "step": 4338 + }, + { + "epoch": 0.6847088527694493, + "grad_norm": 0.6332594156265259, + "learning_rate": 4.405317057597126e-06, + "loss": 0.5717, + "step": 4339 + }, + { + "epoch": 0.6848666561464415, + "grad_norm": 0.5876244902610779, + "learning_rate": 4.405048146227536e-06, + "loss": 0.5677, + "step": 4340 + }, + { + "epoch": 0.6850244595234338, + "grad_norm": 0.5869860053062439, + "learning_rate": 4.404779182283258e-06, + "loss": 0.5957, + "step": 4341 + }, + { + "epoch": 0.685182262900426, + "grad_norm": 0.6197652220726013, + "learning_rate": 4.4045101657717125e-06, + "loss": 0.5802, + "step": 4342 + }, + { + "epoch": 0.6853400662774184, + "grad_norm": 0.601742684841156, + "learning_rate": 4.404241096700326e-06, + "loss": 0.5632, + "step": 4343 + }, + { + "epoch": 0.6854978696544106, + "grad_norm": 0.6514992117881775, + "learning_rate": 4.403971975076523e-06, + "loss": 0.5509, + "step": 4344 + }, + { + "epoch": 0.6856556730314028, + "grad_norm": 0.6143274903297424, + "learning_rate": 4.403702800907731e-06, + "loss": 0.5462, + "step": 4345 + }, + { + "epoch": 0.6858134764083952, + "grad_norm": 0.6270232200622559, + "learning_rate": 4.403433574201379e-06, + "loss": 0.5867, + "step": 4346 + }, + { + "epoch": 0.6859712797853874, + "grad_norm": 0.6095283031463623, + "learning_rate": 4.403164294964894e-06, + "loss": 0.5546, + "step": 4347 + }, + { + "epoch": 0.6861290831623796, + "grad_norm": 0.614713191986084, + "learning_rate": 4.402894963205712e-06, + "loss": 0.5563, + "step": 4348 + }, + { + "epoch": 0.686286886539372, + "grad_norm": 0.6185585856437683, + "learning_rate": 4.402625578931263e-06, + "loss": 0.5504, + "step": 4349 + }, + { + "epoch": 0.6864446899163642, + "grad_norm": 0.5809570550918579, + "learning_rate": 4.402356142148983e-06, + "loss": 0.5685, + "step": 4350 + }, + { + "epoch": 0.6866024932933564, + "grad_norm": 0.606918454170227, + "learning_rate": 4.402086652866306e-06, + "loss": 0.5863, + "step": 4351 + }, + { + "epoch": 0.6867602966703488, + "grad_norm": 0.5761882066726685, + "learning_rate": 4.4018171110906705e-06, + "loss": 0.5644, + "step": 4352 + }, + { + "epoch": 0.686918100047341, + "grad_norm": 0.5909034609794617, + "learning_rate": 4.401547516829515e-06, + "loss": 0.584, + "step": 4353 + }, + { + "epoch": 0.6870759034243333, + "grad_norm": 0.6079111695289612, + "learning_rate": 4.401277870090279e-06, + "loss": 0.5574, + "step": 4354 + }, + { + "epoch": 0.6872337068013256, + "grad_norm": 0.602526843547821, + "learning_rate": 4.401008170880405e-06, + "loss": 0.6142, + "step": 4355 + }, + { + "epoch": 0.6873915101783178, + "grad_norm": 0.5992262363433838, + "learning_rate": 4.400738419207335e-06, + "loss": 0.5675, + "step": 4356 + }, + { + "epoch": 0.6875493135553101, + "grad_norm": 0.6046633720397949, + "learning_rate": 4.400468615078515e-06, + "loss": 0.5553, + "step": 4357 + }, + { + "epoch": 0.6877071169323024, + "grad_norm": 0.6089279055595398, + "learning_rate": 4.400198758501389e-06, + "loss": 0.565, + "step": 4358 + }, + { + "epoch": 0.6878649203092946, + "grad_norm": 0.5798147320747375, + "learning_rate": 4.399928849483406e-06, + "loss": 0.5878, + "step": 4359 + }, + { + "epoch": 0.6880227236862869, + "grad_norm": 0.5918225646018982, + "learning_rate": 4.399658888032015e-06, + "loss": 0.578, + "step": 4360 + }, + { + "epoch": 0.6881805270632791, + "grad_norm": 0.5608169436454773, + "learning_rate": 4.399388874154664e-06, + "loss": 0.5915, + "step": 4361 + }, + { + "epoch": 0.6883383304402714, + "grad_norm": 0.5871500372886658, + "learning_rate": 4.399118807858807e-06, + "loss": 0.5718, + "step": 4362 + }, + { + "epoch": 0.6884961338172637, + "grad_norm": 0.6035255789756775, + "learning_rate": 4.398848689151897e-06, + "loss": 0.5871, + "step": 4363 + }, + { + "epoch": 0.6886539371942559, + "grad_norm": 0.6134766340255737, + "learning_rate": 4.398578518041387e-06, + "loss": 0.5673, + "step": 4364 + }, + { + "epoch": 0.6888117405712483, + "grad_norm": 0.5913233757019043, + "learning_rate": 4.398308294534734e-06, + "loss": 0.5772, + "step": 4365 + }, + { + "epoch": 0.6889695439482405, + "grad_norm": 0.6037238836288452, + "learning_rate": 4.398038018639397e-06, + "loss": 0.5636, + "step": 4366 + }, + { + "epoch": 0.6891273473252327, + "grad_norm": 0.5991219878196716, + "learning_rate": 4.397767690362833e-06, + "loss": 0.5668, + "step": 4367 + }, + { + "epoch": 0.6892851507022251, + "grad_norm": 0.566135823726654, + "learning_rate": 4.397497309712503e-06, + "loss": 0.5609, + "step": 4368 + }, + { + "epoch": 0.6894429540792173, + "grad_norm": 0.6087411642074585, + "learning_rate": 4.397226876695868e-06, + "loss": 0.6129, + "step": 4369 + }, + { + "epoch": 0.6896007574562095, + "grad_norm": 0.6405757069587708, + "learning_rate": 4.396956391320393e-06, + "loss": 0.5829, + "step": 4370 + }, + { + "epoch": 0.6897585608332019, + "grad_norm": 0.5787115693092346, + "learning_rate": 4.396685853593542e-06, + "loss": 0.5698, + "step": 4371 + }, + { + "epoch": 0.6899163642101941, + "grad_norm": 0.5921686887741089, + "learning_rate": 4.396415263522781e-06, + "loss": 0.5625, + "step": 4372 + }, + { + "epoch": 0.6900741675871863, + "grad_norm": 0.5783559083938599, + "learning_rate": 4.396144621115577e-06, + "loss": 0.5709, + "step": 4373 + }, + { + "epoch": 0.6902319709641787, + "grad_norm": 0.6004166007041931, + "learning_rate": 4.395873926379401e-06, + "loss": 0.5764, + "step": 4374 + }, + { + "epoch": 0.6903897743411709, + "grad_norm": 0.6061501502990723, + "learning_rate": 4.3956031793217206e-06, + "loss": 0.6171, + "step": 4375 + }, + { + "epoch": 0.6905475777181632, + "grad_norm": 0.6111525893211365, + "learning_rate": 4.39533237995001e-06, + "loss": 0.5938, + "step": 4376 + }, + { + "epoch": 0.6907053810951554, + "grad_norm": 0.5935155749320984, + "learning_rate": 4.395061528271742e-06, + "loss": 0.5738, + "step": 4377 + }, + { + "epoch": 0.6908631844721477, + "grad_norm": 0.6295458674430847, + "learning_rate": 4.394790624294392e-06, + "loss": 0.5618, + "step": 4378 + }, + { + "epoch": 0.69102098784914, + "grad_norm": 0.6192464828491211, + "learning_rate": 4.394519668025436e-06, + "loss": 0.543, + "step": 4379 + }, + { + "epoch": 0.6911787912261322, + "grad_norm": 0.5786528587341309, + "learning_rate": 4.394248659472351e-06, + "loss": 0.567, + "step": 4380 + }, + { + "epoch": 0.6913365946031245, + "grad_norm": 0.6241592764854431, + "learning_rate": 4.393977598642617e-06, + "loss": 0.5764, + "step": 4381 + }, + { + "epoch": 0.6914943979801168, + "grad_norm": 0.6125854253768921, + "learning_rate": 4.3937064855437135e-06, + "loss": 0.5811, + "step": 4382 + }, + { + "epoch": 0.691652201357109, + "grad_norm": 0.609441339969635, + "learning_rate": 4.393435320183124e-06, + "loss": 0.5595, + "step": 4383 + }, + { + "epoch": 0.6918100047341013, + "grad_norm": 0.5732532143592834, + "learning_rate": 4.3931641025683305e-06, + "loss": 0.5687, + "step": 4384 + }, + { + "epoch": 0.6919678081110936, + "grad_norm": 0.6036242246627808, + "learning_rate": 4.3928928327068195e-06, + "loss": 0.5872, + "step": 4385 + }, + { + "epoch": 0.6921256114880858, + "grad_norm": 0.5897682309150696, + "learning_rate": 4.392621510606076e-06, + "loss": 0.5348, + "step": 4386 + }, + { + "epoch": 0.6922834148650782, + "grad_norm": 0.5677073001861572, + "learning_rate": 4.39235013627359e-06, + "loss": 0.6035, + "step": 4387 + }, + { + "epoch": 0.6924412182420704, + "grad_norm": 0.6022121906280518, + "learning_rate": 4.392078709716849e-06, + "loss": 0.5864, + "step": 4388 + }, + { + "epoch": 0.6925990216190626, + "grad_norm": 0.5715184807777405, + "learning_rate": 4.391807230943343e-06, + "loss": 0.5717, + "step": 4389 + }, + { + "epoch": 0.692756824996055, + "grad_norm": 0.5581968426704407, + "learning_rate": 4.391535699960565e-06, + "loss": 0.5748, + "step": 4390 + }, + { + "epoch": 0.6929146283730472, + "grad_norm": 0.5707449913024902, + "learning_rate": 4.39126411677601e-06, + "loss": 0.5746, + "step": 4391 + }, + { + "epoch": 0.6930724317500394, + "grad_norm": 0.6275012493133545, + "learning_rate": 4.390992481397171e-06, + "loss": 0.5639, + "step": 4392 + }, + { + "epoch": 0.6932302351270317, + "grad_norm": 0.5920242667198181, + "learning_rate": 4.3907207938315455e-06, + "loss": 0.5588, + "step": 4393 + }, + { + "epoch": 0.693388038504024, + "grad_norm": 0.5975415110588074, + "learning_rate": 4.390449054086631e-06, + "loss": 0.5664, + "step": 4394 + }, + { + "epoch": 0.6935458418810162, + "grad_norm": 0.5544762015342712, + "learning_rate": 4.3901772621699265e-06, + "loss": 0.5891, + "step": 4395 + }, + { + "epoch": 0.6937036452580085, + "grad_norm": 0.5992293357849121, + "learning_rate": 4.389905418088934e-06, + "loss": 0.5661, + "step": 4396 + }, + { + "epoch": 0.6938614486350008, + "grad_norm": 0.5749224424362183, + "learning_rate": 4.389633521851156e-06, + "loss": 0.5563, + "step": 4397 + }, + { + "epoch": 0.6940192520119931, + "grad_norm": 0.5971843004226685, + "learning_rate": 4.389361573464093e-06, + "loss": 0.5842, + "step": 4398 + }, + { + "epoch": 0.6941770553889853, + "grad_norm": 0.5732383131980896, + "learning_rate": 4.389089572935254e-06, + "loss": 0.5652, + "step": 4399 + }, + { + "epoch": 0.6943348587659776, + "grad_norm": 0.5987657308578491, + "learning_rate": 4.3888175202721435e-06, + "loss": 0.5663, + "step": 4400 + }, + { + "epoch": 0.6944926621429699, + "grad_norm": 0.5871782302856445, + "learning_rate": 4.388545415482269e-06, + "loss": 0.5805, + "step": 4401 + }, + { + "epoch": 0.6946504655199621, + "grad_norm": 0.600609540939331, + "learning_rate": 4.3882732585731425e-06, + "loss": 0.551, + "step": 4402 + }, + { + "epoch": 0.6948082688969544, + "grad_norm": 0.619032621383667, + "learning_rate": 4.3880010495522715e-06, + "loss": 0.5761, + "step": 4403 + }, + { + "epoch": 0.6949660722739467, + "grad_norm": 0.5758190155029297, + "learning_rate": 4.387728788427172e-06, + "loss": 0.596, + "step": 4404 + }, + { + "epoch": 0.6951238756509389, + "grad_norm": 0.5944839715957642, + "learning_rate": 4.3874564752053535e-06, + "loss": 0.5749, + "step": 4405 + }, + { + "epoch": 0.6952816790279311, + "grad_norm": 0.5735629200935364, + "learning_rate": 4.387184109894335e-06, + "loss": 0.5803, + "step": 4406 + }, + { + "epoch": 0.6954394824049235, + "grad_norm": 0.5784513354301453, + "learning_rate": 4.386911692501631e-06, + "loss": 0.5716, + "step": 4407 + }, + { + "epoch": 0.6955972857819157, + "grad_norm": 0.5879859924316406, + "learning_rate": 4.38663922303476e-06, + "loss": 0.5614, + "step": 4408 + }, + { + "epoch": 0.695755089158908, + "grad_norm": 0.5861514210700989, + "learning_rate": 4.386366701501241e-06, + "loss": 0.5841, + "step": 4409 + }, + { + "epoch": 0.6959128925359003, + "grad_norm": 0.5946604013442993, + "learning_rate": 4.386094127908597e-06, + "loss": 0.5863, + "step": 4410 + }, + { + "epoch": 0.6960706959128925, + "grad_norm": 0.5602289438247681, + "learning_rate": 4.385821502264348e-06, + "loss": 0.5728, + "step": 4411 + }, + { + "epoch": 0.6962284992898848, + "grad_norm": 0.612958550453186, + "learning_rate": 4.385548824576018e-06, + "loss": 0.5387, + "step": 4412 + }, + { + "epoch": 0.6963863026668771, + "grad_norm": 0.6030941009521484, + "learning_rate": 4.385276094851134e-06, + "loss": 0.607, + "step": 4413 + }, + { + "epoch": 0.6965441060438693, + "grad_norm": 0.5975567102432251, + "learning_rate": 4.385003313097222e-06, + "loss": 0.5931, + "step": 4414 + }, + { + "epoch": 0.6967019094208616, + "grad_norm": 0.6078968644142151, + "learning_rate": 4.384730479321808e-06, + "loss": 0.5759, + "step": 4415 + }, + { + "epoch": 0.6968597127978539, + "grad_norm": 0.5639880895614624, + "learning_rate": 4.384457593532424e-06, + "loss": 0.5486, + "step": 4416 + }, + { + "epoch": 0.6970175161748462, + "grad_norm": 0.6298397779464722, + "learning_rate": 4.3841846557366e-06, + "loss": 0.5636, + "step": 4417 + }, + { + "epoch": 0.6971753195518384, + "grad_norm": 0.6010026335716248, + "learning_rate": 4.383911665941869e-06, + "loss": 0.5593, + "step": 4418 + }, + { + "epoch": 0.6973331229288307, + "grad_norm": 0.5859806537628174, + "learning_rate": 4.383638624155765e-06, + "loss": 0.5708, + "step": 4419 + }, + { + "epoch": 0.697490926305823, + "grad_norm": 0.5871888399124146, + "learning_rate": 4.383365530385822e-06, + "loss": 0.5916, + "step": 4420 + }, + { + "epoch": 0.6976487296828152, + "grad_norm": 0.5828037858009338, + "learning_rate": 4.383092384639579e-06, + "loss": 0.5638, + "step": 4421 + }, + { + "epoch": 0.6978065330598074, + "grad_norm": 0.5881223678588867, + "learning_rate": 4.382819186924571e-06, + "loss": 0.6072, + "step": 4422 + }, + { + "epoch": 0.6979643364367998, + "grad_norm": 0.5961249470710754, + "learning_rate": 4.38254593724834e-06, + "loss": 0.5553, + "step": 4423 + }, + { + "epoch": 0.698122139813792, + "grad_norm": 0.6089386940002441, + "learning_rate": 4.382272635618427e-06, + "loss": 0.5547, + "step": 4424 + }, + { + "epoch": 0.6982799431907842, + "grad_norm": 0.5637214183807373, + "learning_rate": 4.381999282042372e-06, + "loss": 0.5921, + "step": 4425 + }, + { + "epoch": 0.6984377465677766, + "grad_norm": 0.589231550693512, + "learning_rate": 4.381725876527721e-06, + "loss": 0.5535, + "step": 4426 + }, + { + "epoch": 0.6985955499447688, + "grad_norm": 0.549055278301239, + "learning_rate": 4.3814524190820205e-06, + "loss": 0.5836, + "step": 4427 + }, + { + "epoch": 0.6987533533217611, + "grad_norm": 0.5993034243583679, + "learning_rate": 4.381178909712814e-06, + "loss": 0.5644, + "step": 4428 + }, + { + "epoch": 0.6989111566987534, + "grad_norm": 0.5659014582633972, + "learning_rate": 4.380905348427653e-06, + "loss": 0.5301, + "step": 4429 + }, + { + "epoch": 0.6990689600757456, + "grad_norm": 0.6020833849906921, + "learning_rate": 4.380631735234085e-06, + "loss": 0.5869, + "step": 4430 + }, + { + "epoch": 0.6992267634527379, + "grad_norm": 0.594631552696228, + "learning_rate": 4.3803580701396615e-06, + "loss": 0.5628, + "step": 4431 + }, + { + "epoch": 0.6993845668297302, + "grad_norm": 0.5729334354400635, + "learning_rate": 4.380084353151934e-06, + "loss": 0.5692, + "step": 4432 + }, + { + "epoch": 0.6995423702067224, + "grad_norm": 0.6207988262176514, + "learning_rate": 4.37981058427846e-06, + "loss": 0.57, + "step": 4433 + }, + { + "epoch": 0.6997001735837147, + "grad_norm": 0.5510656833648682, + "learning_rate": 4.379536763526791e-06, + "loss": 0.5337, + "step": 4434 + }, + { + "epoch": 0.699857976960707, + "grad_norm": 0.6021786332130432, + "learning_rate": 4.379262890904484e-06, + "loss": 0.5743, + "step": 4435 + }, + { + "epoch": 0.7000157803376992, + "grad_norm": 0.6206737160682678, + "learning_rate": 4.3789889664191e-06, + "loss": 0.5739, + "step": 4436 + }, + { + "epoch": 0.7001735837146915, + "grad_norm": 0.5796599984169006, + "learning_rate": 4.378714990078197e-06, + "loss": 0.5468, + "step": 4437 + }, + { + "epoch": 0.7003313870916837, + "grad_norm": 0.6014835834503174, + "learning_rate": 4.378440961889336e-06, + "loss": 0.5845, + "step": 4438 + }, + { + "epoch": 0.7004891904686761, + "grad_norm": 0.5827728509902954, + "learning_rate": 4.378166881860078e-06, + "loss": 0.556, + "step": 4439 + }, + { + "epoch": 0.7006469938456683, + "grad_norm": 0.6073852181434631, + "learning_rate": 4.377892749997989e-06, + "loss": 0.5718, + "step": 4440 + }, + { + "epoch": 0.7008047972226605, + "grad_norm": 0.5391860008239746, + "learning_rate": 4.377618566310636e-06, + "loss": 0.5867, + "step": 4441 + }, + { + "epoch": 0.7009626005996529, + "grad_norm": 0.5927045941352844, + "learning_rate": 4.377344330805583e-06, + "loss": 0.5959, + "step": 4442 + }, + { + "epoch": 0.7011204039766451, + "grad_norm": 0.6027908325195312, + "learning_rate": 4.377070043490398e-06, + "loss": 0.545, + "step": 4443 + }, + { + "epoch": 0.7012782073536373, + "grad_norm": 0.568679928779602, + "learning_rate": 4.376795704372652e-06, + "loss": 0.5791, + "step": 4444 + }, + { + "epoch": 0.7014360107306297, + "grad_norm": 0.5902780890464783, + "learning_rate": 4.376521313459916e-06, + "loss": 0.5423, + "step": 4445 + }, + { + "epoch": 0.7015938141076219, + "grad_norm": 0.6063007116317749, + "learning_rate": 4.376246870759762e-06, + "loss": 0.5489, + "step": 4446 + }, + { + "epoch": 0.7017516174846141, + "grad_norm": 0.571492612361908, + "learning_rate": 4.375972376279764e-06, + "loss": 0.6115, + "step": 4447 + }, + { + "epoch": 0.7019094208616065, + "grad_norm": 0.5868462920188904, + "learning_rate": 4.3756978300274975e-06, + "loss": 0.5513, + "step": 4448 + }, + { + "epoch": 0.7020672242385987, + "grad_norm": 0.6026703715324402, + "learning_rate": 4.37542323201054e-06, + "loss": 0.5652, + "step": 4449 + }, + { + "epoch": 0.702225027615591, + "grad_norm": 0.5982826352119446, + "learning_rate": 4.375148582236469e-06, + "loss": 0.5688, + "step": 4450 + }, + { + "epoch": 0.7023828309925833, + "grad_norm": 0.6109884977340698, + "learning_rate": 4.374873880712864e-06, + "loss": 0.5981, + "step": 4451 + }, + { + "epoch": 0.7025406343695755, + "grad_norm": 0.585956871509552, + "learning_rate": 4.374599127447307e-06, + "loss": 0.5803, + "step": 4452 + }, + { + "epoch": 0.7026984377465678, + "grad_norm": 0.576079785823822, + "learning_rate": 4.374324322447379e-06, + "loss": 0.5494, + "step": 4453 + }, + { + "epoch": 0.70285624112356, + "grad_norm": 0.564740777015686, + "learning_rate": 4.374049465720665e-06, + "loss": 0.5753, + "step": 4454 + }, + { + "epoch": 0.7030140445005523, + "grad_norm": 0.5882961750030518, + "learning_rate": 4.373774557274751e-06, + "loss": 0.5925, + "step": 4455 + }, + { + "epoch": 0.7031718478775446, + "grad_norm": 0.5817175507545471, + "learning_rate": 4.373499597117222e-06, + "loss": 0.5735, + "step": 4456 + }, + { + "epoch": 0.7033296512545368, + "grad_norm": 0.5965505838394165, + "learning_rate": 4.373224585255668e-06, + "loss": 0.5718, + "step": 4457 + }, + { + "epoch": 0.7034874546315291, + "grad_norm": 0.6102747321128845, + "learning_rate": 4.372949521697677e-06, + "loss": 0.5679, + "step": 4458 + }, + { + "epoch": 0.7036452580085214, + "grad_norm": 0.6282181739807129, + "learning_rate": 4.372674406450842e-06, + "loss": 0.5709, + "step": 4459 + }, + { + "epoch": 0.7038030613855136, + "grad_norm": 0.6089268326759338, + "learning_rate": 4.372399239522754e-06, + "loss": 0.558, + "step": 4460 + }, + { + "epoch": 0.703960864762506, + "grad_norm": 0.5930137038230896, + "learning_rate": 4.372124020921007e-06, + "loss": 0.5586, + "step": 4461 + }, + { + "epoch": 0.7041186681394982, + "grad_norm": 0.611998438835144, + "learning_rate": 4.371848750653198e-06, + "loss": 0.5401, + "step": 4462 + }, + { + "epoch": 0.7042764715164904, + "grad_norm": 0.5559819936752319, + "learning_rate": 4.3715734287269215e-06, + "loss": 0.5696, + "step": 4463 + }, + { + "epoch": 0.7044342748934828, + "grad_norm": 0.5912227034568787, + "learning_rate": 4.371298055149778e-06, + "loss": 0.6003, + "step": 4464 + }, + { + "epoch": 0.704592078270475, + "grad_norm": 0.5939894914627075, + "learning_rate": 4.3710226299293645e-06, + "loss": 0.57, + "step": 4465 + }, + { + "epoch": 0.7047498816474672, + "grad_norm": 0.6458761692047119, + "learning_rate": 4.370747153073285e-06, + "loss": 0.57, + "step": 4466 + }, + { + "epoch": 0.7049076850244596, + "grad_norm": 0.6786116361618042, + "learning_rate": 4.37047162458914e-06, + "loss": 0.5891, + "step": 4467 + }, + { + "epoch": 0.7050654884014518, + "grad_norm": 0.5822232365608215, + "learning_rate": 4.370196044484532e-06, + "loss": 0.605, + "step": 4468 + }, + { + "epoch": 0.705223291778444, + "grad_norm": 0.5892406105995178, + "learning_rate": 4.3699204127670716e-06, + "loss": 0.5752, + "step": 4469 + }, + { + "epoch": 0.7053810951554363, + "grad_norm": 0.6072476506233215, + "learning_rate": 4.36964472944436e-06, + "loss": 0.5705, + "step": 4470 + }, + { + "epoch": 0.7055388985324286, + "grad_norm": 0.5533705949783325, + "learning_rate": 4.3693689945240085e-06, + "loss": 0.5628, + "step": 4471 + }, + { + "epoch": 0.7056967019094209, + "grad_norm": 0.5753107070922852, + "learning_rate": 4.369093208013625e-06, + "loss": 0.5968, + "step": 4472 + }, + { + "epoch": 0.7058545052864131, + "grad_norm": 0.5836228728294373, + "learning_rate": 4.368817369920822e-06, + "loss": 0.569, + "step": 4473 + }, + { + "epoch": 0.7060123086634054, + "grad_norm": 0.6165241599082947, + "learning_rate": 4.3685414802532115e-06, + "loss": 0.5698, + "step": 4474 + }, + { + "epoch": 0.7061701120403977, + "grad_norm": 0.6008642911911011, + "learning_rate": 4.368265539018407e-06, + "loss": 0.5619, + "step": 4475 + }, + { + "epoch": 0.7063279154173899, + "grad_norm": 0.6580753922462463, + "learning_rate": 4.367989546224024e-06, + "loss": 0.5708, + "step": 4476 + }, + { + "epoch": 0.7064857187943822, + "grad_norm": 0.6333863139152527, + "learning_rate": 4.367713501877678e-06, + "loss": 0.5575, + "step": 4477 + }, + { + "epoch": 0.7066435221713745, + "grad_norm": 0.5858168601989746, + "learning_rate": 4.36743740598699e-06, + "loss": 0.5613, + "step": 4478 + }, + { + "epoch": 0.7068013255483667, + "grad_norm": 0.6147475242614746, + "learning_rate": 4.367161258559578e-06, + "loss": 0.5474, + "step": 4479 + }, + { + "epoch": 0.706959128925359, + "grad_norm": 0.5847420692443848, + "learning_rate": 4.366885059603062e-06, + "loss": 0.5927, + "step": 4480 + }, + { + "epoch": 0.7071169323023513, + "grad_norm": 0.6013373732566833, + "learning_rate": 4.366608809125066e-06, + "loss": 0.5604, + "step": 4481 + }, + { + "epoch": 0.7072747356793435, + "grad_norm": 0.5924468040466309, + "learning_rate": 4.366332507133213e-06, + "loss": 0.5458, + "step": 4482 + }, + { + "epoch": 0.7074325390563359, + "grad_norm": 0.6082171201705933, + "learning_rate": 4.366056153635129e-06, + "loss": 0.5452, + "step": 4483 + }, + { + "epoch": 0.7075903424333281, + "grad_norm": 0.6315733194351196, + "learning_rate": 4.365779748638439e-06, + "loss": 0.5526, + "step": 4484 + }, + { + "epoch": 0.7077481458103203, + "grad_norm": 0.6318082213401794, + "learning_rate": 4.365503292150773e-06, + "loss": 0.5648, + "step": 4485 + }, + { + "epoch": 0.7079059491873126, + "grad_norm": 0.5949364900588989, + "learning_rate": 4.365226784179761e-06, + "loss": 0.5854, + "step": 4486 + }, + { + "epoch": 0.7080637525643049, + "grad_norm": 0.5816611647605896, + "learning_rate": 4.36495022473303e-06, + "loss": 0.5815, + "step": 4487 + }, + { + "epoch": 0.7082215559412971, + "grad_norm": 0.638355553150177, + "learning_rate": 4.364673613818217e-06, + "loss": 0.5883, + "step": 4488 + }, + { + "epoch": 0.7083793593182894, + "grad_norm": 0.5768083333969116, + "learning_rate": 4.364396951442954e-06, + "loss": 0.584, + "step": 4489 + }, + { + "epoch": 0.7085371626952817, + "grad_norm": 0.5782355666160583, + "learning_rate": 4.364120237614875e-06, + "loss": 0.5619, + "step": 4490 + }, + { + "epoch": 0.708694966072274, + "grad_norm": 0.6069448590278625, + "learning_rate": 4.363843472341618e-06, + "loss": 0.5751, + "step": 4491 + }, + { + "epoch": 0.7088527694492662, + "grad_norm": 0.57327800989151, + "learning_rate": 4.363566655630822e-06, + "loss": 0.5751, + "step": 4492 + }, + { + "epoch": 0.7090105728262585, + "grad_norm": 0.5767524242401123, + "learning_rate": 4.3632897874901235e-06, + "loss": 0.5725, + "step": 4493 + }, + { + "epoch": 0.7091683762032508, + "grad_norm": 0.5810478925704956, + "learning_rate": 4.363012867927166e-06, + "loss": 0.5621, + "step": 4494 + }, + { + "epoch": 0.709326179580243, + "grad_norm": 0.5645002126693726, + "learning_rate": 4.362735896949591e-06, + "loss": 0.575, + "step": 4495 + }, + { + "epoch": 0.7094839829572352, + "grad_norm": 0.5632327795028687, + "learning_rate": 4.362458874565043e-06, + "loss": 0.5756, + "step": 4496 + }, + { + "epoch": 0.7096417863342276, + "grad_norm": 0.5907319784164429, + "learning_rate": 4.362181800781165e-06, + "loss": 0.5802, + "step": 4497 + }, + { + "epoch": 0.7097995897112198, + "grad_norm": 0.6091585755348206, + "learning_rate": 4.361904675605605e-06, + "loss": 0.5963, + "step": 4498 + }, + { + "epoch": 0.709957393088212, + "grad_norm": 0.5909236073493958, + "learning_rate": 4.3616274990460115e-06, + "loss": 0.5211, + "step": 4499 + }, + { + "epoch": 0.7101151964652044, + "grad_norm": 0.564990758895874, + "learning_rate": 4.361350271110033e-06, + "loss": 0.5523, + "step": 4500 + }, + { + "epoch": 0.7102729998421966, + "grad_norm": 0.604404628276825, + "learning_rate": 4.36107299180532e-06, + "loss": 0.568, + "step": 4501 + }, + { + "epoch": 0.7104308032191889, + "grad_norm": 0.5765331983566284, + "learning_rate": 4.360795661139526e-06, + "loss": 0.5794, + "step": 4502 + }, + { + "epoch": 0.7105886065961812, + "grad_norm": 0.5734560489654541, + "learning_rate": 4.360518279120304e-06, + "loss": 0.5895, + "step": 4503 + }, + { + "epoch": 0.7107464099731734, + "grad_norm": 0.5856908559799194, + "learning_rate": 4.360240845755308e-06, + "loss": 0.5877, + "step": 4504 + }, + { + "epoch": 0.7109042133501657, + "grad_norm": 0.5742722749710083, + "learning_rate": 4.359963361052196e-06, + "loss": 0.5486, + "step": 4505 + }, + { + "epoch": 0.711062016727158, + "grad_norm": 0.6072536110877991, + "learning_rate": 4.359685825018626e-06, + "loss": 0.5909, + "step": 4506 + }, + { + "epoch": 0.7112198201041502, + "grad_norm": 0.6077057123184204, + "learning_rate": 4.359408237662256e-06, + "loss": 0.5705, + "step": 4507 + }, + { + "epoch": 0.7113776234811425, + "grad_norm": 0.5771909952163696, + "learning_rate": 4.359130598990749e-06, + "loss": 0.5666, + "step": 4508 + }, + { + "epoch": 0.7115354268581348, + "grad_norm": 0.5662687420845032, + "learning_rate": 4.358852909011764e-06, + "loss": 0.5808, + "step": 4509 + }, + { + "epoch": 0.711693230235127, + "grad_norm": 0.5979421138763428, + "learning_rate": 4.358575167732966e-06, + "loss": 0.5633, + "step": 4510 + }, + { + "epoch": 0.7118510336121193, + "grad_norm": 0.5767070055007935, + "learning_rate": 4.35829737516202e-06, + "loss": 0.5698, + "step": 4511 + }, + { + "epoch": 0.7120088369891115, + "grad_norm": 0.5999796390533447, + "learning_rate": 4.358019531306594e-06, + "loss": 0.5265, + "step": 4512 + }, + { + "epoch": 0.7121666403661039, + "grad_norm": 0.6006462574005127, + "learning_rate": 4.357741636174354e-06, + "loss": 0.582, + "step": 4513 + }, + { + "epoch": 0.7123244437430961, + "grad_norm": 0.5427632331848145, + "learning_rate": 4.3574636897729685e-06, + "loss": 0.5134, + "step": 4514 + }, + { + "epoch": 0.7124822471200883, + "grad_norm": 0.5825721025466919, + "learning_rate": 4.357185692110111e-06, + "loss": 0.5556, + "step": 4515 + }, + { + "epoch": 0.7126400504970807, + "grad_norm": 0.6041699647903442, + "learning_rate": 4.35690764319345e-06, + "loss": 0.5796, + "step": 4516 + }, + { + "epoch": 0.7127978538740729, + "grad_norm": 0.5983824729919434, + "learning_rate": 4.356629543030662e-06, + "loss": 0.57, + "step": 4517 + }, + { + "epoch": 0.7129556572510651, + "grad_norm": 0.611838698387146, + "learning_rate": 4.35635139162942e-06, + "loss": 0.5752, + "step": 4518 + }, + { + "epoch": 0.7131134606280575, + "grad_norm": 0.5835747718811035, + "learning_rate": 4.356073188997401e-06, + "loss": 0.5648, + "step": 4519 + }, + { + "epoch": 0.7132712640050497, + "grad_norm": 0.5758323669433594, + "learning_rate": 4.355794935142283e-06, + "loss": 0.5611, + "step": 4520 + }, + { + "epoch": 0.7134290673820419, + "grad_norm": 0.5671647787094116, + "learning_rate": 4.355516630071744e-06, + "loss": 0.5687, + "step": 4521 + }, + { + "epoch": 0.7135868707590343, + "grad_norm": 0.5632389783859253, + "learning_rate": 4.355238273793466e-06, + "loss": 0.573, + "step": 4522 + }, + { + "epoch": 0.7137446741360265, + "grad_norm": 0.5897337794303894, + "learning_rate": 4.354959866315131e-06, + "loss": 0.5934, + "step": 4523 + }, + { + "epoch": 0.7139024775130188, + "grad_norm": 0.6147218346595764, + "learning_rate": 4.35468140764442e-06, + "loss": 0.5892, + "step": 4524 + }, + { + "epoch": 0.7140602808900111, + "grad_norm": 0.5696017146110535, + "learning_rate": 4.3544028977890205e-06, + "loss": 0.5645, + "step": 4525 + }, + { + "epoch": 0.7142180842670033, + "grad_norm": 0.6310059428215027, + "learning_rate": 4.354124336756618e-06, + "loss": 0.5798, + "step": 4526 + }, + { + "epoch": 0.7143758876439956, + "grad_norm": 0.5989736914634705, + "learning_rate": 4.353845724554899e-06, + "loss": 0.5919, + "step": 4527 + }, + { + "epoch": 0.7145336910209878, + "grad_norm": 0.5889274477958679, + "learning_rate": 4.353567061191554e-06, + "loss": 0.5563, + "step": 4528 + }, + { + "epoch": 0.7146914943979801, + "grad_norm": 0.5933420062065125, + "learning_rate": 4.353288346674272e-06, + "loss": 0.5708, + "step": 4529 + }, + { + "epoch": 0.7148492977749724, + "grad_norm": 0.5892472267150879, + "learning_rate": 4.353009581010746e-06, + "loss": 0.5967, + "step": 4530 + }, + { + "epoch": 0.7150071011519646, + "grad_norm": 0.5711491703987122, + "learning_rate": 4.352730764208668e-06, + "loss": 0.5642, + "step": 4531 + }, + { + "epoch": 0.7151649045289569, + "grad_norm": 0.607391357421875, + "learning_rate": 4.352451896275735e-06, + "loss": 0.5424, + "step": 4532 + }, + { + "epoch": 0.7153227079059492, + "grad_norm": 0.5901855826377869, + "learning_rate": 4.35217297721964e-06, + "loss": 0.5957, + "step": 4533 + }, + { + "epoch": 0.7154805112829414, + "grad_norm": 0.5942557454109192, + "learning_rate": 4.3518940070480824e-06, + "loss": 0.5918, + "step": 4534 + }, + { + "epoch": 0.7156383146599338, + "grad_norm": 0.6047675013542175, + "learning_rate": 4.351614985768761e-06, + "loss": 0.5769, + "step": 4535 + }, + { + "epoch": 0.715796118036926, + "grad_norm": 0.578047513961792, + "learning_rate": 4.3513359133893754e-06, + "loss": 0.6164, + "step": 4536 + }, + { + "epoch": 0.7159539214139182, + "grad_norm": 0.7828572988510132, + "learning_rate": 4.3510567899176285e-06, + "loss": 0.5722, + "step": 4537 + }, + { + "epoch": 0.7161117247909106, + "grad_norm": 0.6303528547286987, + "learning_rate": 4.350777615361223e-06, + "loss": 0.509, + "step": 4538 + }, + { + "epoch": 0.7162695281679028, + "grad_norm": 0.5988674759864807, + "learning_rate": 4.350498389727862e-06, + "loss": 0.555, + "step": 4539 + }, + { + "epoch": 0.716427331544895, + "grad_norm": 0.5977000594139099, + "learning_rate": 4.350219113025252e-06, + "loss": 0.612, + "step": 4540 + }, + { + "epoch": 0.7165851349218874, + "grad_norm": 0.5786583423614502, + "learning_rate": 4.349939785261102e-06, + "loss": 0.5949, + "step": 4541 + }, + { + "epoch": 0.7167429382988796, + "grad_norm": 0.579673171043396, + "learning_rate": 4.34966040644312e-06, + "loss": 0.5347, + "step": 4542 + }, + { + "epoch": 0.7169007416758718, + "grad_norm": 0.5794836282730103, + "learning_rate": 4.3493809765790155e-06, + "loss": 0.5901, + "step": 4543 + }, + { + "epoch": 0.7170585450528641, + "grad_norm": 0.6026933193206787, + "learning_rate": 4.3491014956765e-06, + "loss": 0.5421, + "step": 4544 + }, + { + "epoch": 0.7172163484298564, + "grad_norm": 0.5761644244194031, + "learning_rate": 4.3488219637432874e-06, + "loss": 0.5943, + "step": 4545 + }, + { + "epoch": 0.7173741518068487, + "grad_norm": 0.5832656025886536, + "learning_rate": 4.348542380787092e-06, + "loss": 0.5772, + "step": 4546 + }, + { + "epoch": 0.7175319551838409, + "grad_norm": 0.6071460247039795, + "learning_rate": 4.348262746815628e-06, + "loss": 0.5784, + "step": 4547 + }, + { + "epoch": 0.7176897585608332, + "grad_norm": 0.5987973213195801, + "learning_rate": 4.3479830618366144e-06, + "loss": 0.5198, + "step": 4548 + }, + { + "epoch": 0.7178475619378255, + "grad_norm": 0.5878121256828308, + "learning_rate": 4.347703325857769e-06, + "loss": 0.556, + "step": 4549 + }, + { + "epoch": 0.7180053653148177, + "grad_norm": 0.5625726580619812, + "learning_rate": 4.347423538886813e-06, + "loss": 0.5578, + "step": 4550 + }, + { + "epoch": 0.71816316869181, + "grad_norm": 0.5814471244812012, + "learning_rate": 4.347143700931467e-06, + "loss": 0.5493, + "step": 4551 + }, + { + "epoch": 0.7183209720688023, + "grad_norm": 0.5833550691604614, + "learning_rate": 4.346863811999453e-06, + "loss": 0.5868, + "step": 4552 + }, + { + "epoch": 0.7184787754457945, + "grad_norm": 0.5904821157455444, + "learning_rate": 4.346583872098496e-06, + "loss": 0.5633, + "step": 4553 + }, + { + "epoch": 0.7186365788227868, + "grad_norm": 0.5840136408805847, + "learning_rate": 4.346303881236322e-06, + "loss": 0.5736, + "step": 4554 + }, + { + "epoch": 0.7187943821997791, + "grad_norm": 0.5876258015632629, + "learning_rate": 4.346023839420658e-06, + "loss": 0.5823, + "step": 4555 + }, + { + "epoch": 0.7189521855767713, + "grad_norm": 0.5831348299980164, + "learning_rate": 4.345743746659232e-06, + "loss": 0.5417, + "step": 4556 + }, + { + "epoch": 0.7191099889537637, + "grad_norm": 0.6344568133354187, + "learning_rate": 4.345463602959774e-06, + "loss": 0.5646, + "step": 4557 + }, + { + "epoch": 0.7192677923307559, + "grad_norm": 0.6036021113395691, + "learning_rate": 4.345183408330016e-06, + "loss": 0.5355, + "step": 4558 + }, + { + "epoch": 0.7194255957077481, + "grad_norm": 0.5883510112762451, + "learning_rate": 4.34490316277769e-06, + "loss": 0.5918, + "step": 4559 + }, + { + "epoch": 0.7195833990847404, + "grad_norm": 0.5952073931694031, + "learning_rate": 4.3446228663105295e-06, + "loss": 0.581, + "step": 4560 + }, + { + "epoch": 0.7197412024617327, + "grad_norm": 0.6015408635139465, + "learning_rate": 4.344342518936271e-06, + "loss": 0.597, + "step": 4561 + }, + { + "epoch": 0.7198990058387249, + "grad_norm": 0.5647245049476624, + "learning_rate": 4.344062120662651e-06, + "loss": 0.5716, + "step": 4562 + }, + { + "epoch": 0.7200568092157172, + "grad_norm": 0.6046690344810486, + "learning_rate": 4.343781671497408e-06, + "loss": 0.5482, + "step": 4563 + }, + { + "epoch": 0.7202146125927095, + "grad_norm": 0.5657467246055603, + "learning_rate": 4.343501171448282e-06, + "loss": 0.577, + "step": 4564 + }, + { + "epoch": 0.7203724159697018, + "grad_norm": 0.5693766474723816, + "learning_rate": 4.343220620523013e-06, + "loss": 0.6007, + "step": 4565 + }, + { + "epoch": 0.720530219346694, + "grad_norm": 0.5807675719261169, + "learning_rate": 4.342940018729345e-06, + "loss": 0.5839, + "step": 4566 + }, + { + "epoch": 0.7206880227236863, + "grad_norm": 0.6178652048110962, + "learning_rate": 4.342659366075021e-06, + "loss": 0.6047, + "step": 4567 + }, + { + "epoch": 0.7208458261006786, + "grad_norm": 0.5864707827568054, + "learning_rate": 4.3423786625677865e-06, + "loss": 0.5927, + "step": 4568 + }, + { + "epoch": 0.7210036294776708, + "grad_norm": 0.6115816235542297, + "learning_rate": 4.342097908215388e-06, + "loss": 0.5753, + "step": 4569 + }, + { + "epoch": 0.721161432854663, + "grad_norm": 0.6126270890235901, + "learning_rate": 4.341817103025574e-06, + "loss": 0.6207, + "step": 4570 + }, + { + "epoch": 0.7213192362316554, + "grad_norm": 0.5791292190551758, + "learning_rate": 4.341536247006094e-06, + "loss": 0.5704, + "step": 4571 + }, + { + "epoch": 0.7214770396086476, + "grad_norm": 0.6110989451408386, + "learning_rate": 4.341255340164699e-06, + "loss": 0.5772, + "step": 4572 + }, + { + "epoch": 0.7216348429856398, + "grad_norm": 0.625149667263031, + "learning_rate": 4.340974382509141e-06, + "loss": 0.5533, + "step": 4573 + }, + { + "epoch": 0.7217926463626322, + "grad_norm": 0.5999442934989929, + "learning_rate": 4.340693374047174e-06, + "loss": 0.595, + "step": 4574 + }, + { + "epoch": 0.7219504497396244, + "grad_norm": 0.6020916700363159, + "learning_rate": 4.340412314786553e-06, + "loss": 0.5825, + "step": 4575 + }, + { + "epoch": 0.7221082531166167, + "grad_norm": 0.5951747298240662, + "learning_rate": 4.340131204735035e-06, + "loss": 0.5334, + "step": 4576 + }, + { + "epoch": 0.722266056493609, + "grad_norm": 0.5901049375534058, + "learning_rate": 4.339850043900377e-06, + "loss": 0.582, + "step": 4577 + }, + { + "epoch": 0.7224238598706012, + "grad_norm": 0.5814238786697388, + "learning_rate": 4.3395688322903395e-06, + "loss": 0.5824, + "step": 4578 + }, + { + "epoch": 0.7225816632475935, + "grad_norm": 0.5829307436943054, + "learning_rate": 4.339287569912683e-06, + "loss": 0.5508, + "step": 4579 + }, + { + "epoch": 0.7227394666245858, + "grad_norm": 0.5935042500495911, + "learning_rate": 4.339006256775169e-06, + "loss": 0.5343, + "step": 4580 + }, + { + "epoch": 0.722897270001578, + "grad_norm": 0.6170029640197754, + "learning_rate": 4.338724892885561e-06, + "loss": 0.5766, + "step": 4581 + }, + { + "epoch": 0.7230550733785703, + "grad_norm": 0.5771246552467346, + "learning_rate": 4.338443478251625e-06, + "loss": 0.5711, + "step": 4582 + }, + { + "epoch": 0.7232128767555626, + "grad_norm": 0.5858423709869385, + "learning_rate": 4.338162012881126e-06, + "loss": 0.5709, + "step": 4583 + }, + { + "epoch": 0.7233706801325548, + "grad_norm": 0.585137665271759, + "learning_rate": 4.337880496781833e-06, + "loss": 0.5882, + "step": 4584 + }, + { + "epoch": 0.7235284835095471, + "grad_norm": 0.595021665096283, + "learning_rate": 4.337598929961515e-06, + "loss": 0.5889, + "step": 4585 + }, + { + "epoch": 0.7236862868865394, + "grad_norm": 0.5877375602722168, + "learning_rate": 4.337317312427942e-06, + "loss": 0.5768, + "step": 4586 + }, + { + "epoch": 0.7238440902635317, + "grad_norm": 0.6068095564842224, + "learning_rate": 4.337035644188886e-06, + "loss": 0.5901, + "step": 4587 + }, + { + "epoch": 0.7240018936405239, + "grad_norm": 0.5646708011627197, + "learning_rate": 4.33675392525212e-06, + "loss": 0.5936, + "step": 4588 + }, + { + "epoch": 0.7241596970175161, + "grad_norm": 0.5709677338600159, + "learning_rate": 4.336472155625421e-06, + "loss": 0.5974, + "step": 4589 + }, + { + "epoch": 0.7243175003945085, + "grad_norm": 0.6009724736213684, + "learning_rate": 4.336190335316563e-06, + "loss": 0.5793, + "step": 4590 + }, + { + "epoch": 0.7244753037715007, + "grad_norm": 0.594992995262146, + "learning_rate": 4.335908464333322e-06, + "loss": 0.5799, + "step": 4591 + }, + { + "epoch": 0.7246331071484929, + "grad_norm": 0.6005798578262329, + "learning_rate": 4.335626542683481e-06, + "loss": 0.5747, + "step": 4592 + }, + { + "epoch": 0.7247909105254853, + "grad_norm": 0.564662516117096, + "learning_rate": 4.3353445703748175e-06, + "loss": 0.5732, + "step": 4593 + }, + { + "epoch": 0.7249487139024775, + "grad_norm": 0.582941472530365, + "learning_rate": 4.335062547415114e-06, + "loss": 0.5888, + "step": 4594 + }, + { + "epoch": 0.7251065172794697, + "grad_norm": 0.6065099239349365, + "learning_rate": 4.334780473812155e-06, + "loss": 0.5538, + "step": 4595 + }, + { + "epoch": 0.7252643206564621, + "grad_norm": 0.5862442255020142, + "learning_rate": 4.334498349573722e-06, + "loss": 0.549, + "step": 4596 + }, + { + "epoch": 0.7254221240334543, + "grad_norm": 0.586529016494751, + "learning_rate": 4.334216174707603e-06, + "loss": 0.5446, + "step": 4597 + }, + { + "epoch": 0.7255799274104466, + "grad_norm": 0.5999895334243774, + "learning_rate": 4.333933949221586e-06, + "loss": 0.592, + "step": 4598 + }, + { + "epoch": 0.7257377307874389, + "grad_norm": 0.5645446181297302, + "learning_rate": 4.333651673123458e-06, + "loss": 0.5715, + "step": 4599 + }, + { + "epoch": 0.7258955341644311, + "grad_norm": 0.5971331596374512, + "learning_rate": 4.33336934642101e-06, + "loss": 0.5522, + "step": 4600 + }, + { + "epoch": 0.7260533375414234, + "grad_norm": 0.5830726027488708, + "learning_rate": 4.3330869691220334e-06, + "loss": 0.593, + "step": 4601 + }, + { + "epoch": 0.7262111409184157, + "grad_norm": 0.5649273991584778, + "learning_rate": 4.332804541234322e-06, + "loss": 0.5731, + "step": 4602 + }, + { + "epoch": 0.7263689442954079, + "grad_norm": 0.5621567368507385, + "learning_rate": 4.332522062765668e-06, + "loss": 0.5707, + "step": 4603 + }, + { + "epoch": 0.7265267476724002, + "grad_norm": 0.5644450187683105, + "learning_rate": 4.33223953372387e-06, + "loss": 0.5878, + "step": 4604 + }, + { + "epoch": 0.7266845510493924, + "grad_norm": 0.5942800641059875, + "learning_rate": 4.331956954116722e-06, + "loss": 0.6028, + "step": 4605 + }, + { + "epoch": 0.7268423544263847, + "grad_norm": 0.5910548567771912, + "learning_rate": 4.331674323952024e-06, + "loss": 0.5938, + "step": 4606 + }, + { + "epoch": 0.727000157803377, + "grad_norm": 0.5984309315681458, + "learning_rate": 4.331391643237577e-06, + "loss": 0.5858, + "step": 4607 + }, + { + "epoch": 0.7271579611803692, + "grad_norm": 0.5775097012519836, + "learning_rate": 4.331108911981181e-06, + "loss": 0.5976, + "step": 4608 + }, + { + "epoch": 0.7273157645573616, + "grad_norm": 0.5781646370887756, + "learning_rate": 4.330826130190638e-06, + "loss": 0.5385, + "step": 4609 + }, + { + "epoch": 0.7274735679343538, + "grad_norm": 0.6186045408248901, + "learning_rate": 4.3305432978737536e-06, + "loss": 0.584, + "step": 4610 + }, + { + "epoch": 0.727631371311346, + "grad_norm": 0.5745105147361755, + "learning_rate": 4.330260415038332e-06, + "loss": 0.5313, + "step": 4611 + }, + { + "epoch": 0.7277891746883384, + "grad_norm": 0.593842625617981, + "learning_rate": 4.329977481692183e-06, + "loss": 0.5611, + "step": 4612 + }, + { + "epoch": 0.7279469780653306, + "grad_norm": 0.5839192867279053, + "learning_rate": 4.32969449784311e-06, + "loss": 0.5216, + "step": 4613 + }, + { + "epoch": 0.7281047814423228, + "grad_norm": 0.5773666501045227, + "learning_rate": 4.329411463498926e-06, + "loss": 0.5846, + "step": 4614 + }, + { + "epoch": 0.7282625848193152, + "grad_norm": 0.6266316771507263, + "learning_rate": 4.3291283786674425e-06, + "loss": 0.5885, + "step": 4615 + }, + { + "epoch": 0.7284203881963074, + "grad_norm": 0.5931236743927002, + "learning_rate": 4.3288452433564694e-06, + "loss": 0.5544, + "step": 4616 + }, + { + "epoch": 0.7285781915732996, + "grad_norm": 0.5826952457427979, + "learning_rate": 4.328562057573823e-06, + "loss": 0.5723, + "step": 4617 + }, + { + "epoch": 0.728735994950292, + "grad_norm": 0.614520788192749, + "learning_rate": 4.328278821327317e-06, + "loss": 0.5852, + "step": 4618 + }, + { + "epoch": 0.7288937983272842, + "grad_norm": 0.5831856727600098, + "learning_rate": 4.327995534624769e-06, + "loss": 0.546, + "step": 4619 + }, + { + "epoch": 0.7290516017042765, + "grad_norm": 0.5868145227432251, + "learning_rate": 4.327712197473996e-06, + "loss": 0.5382, + "step": 4620 + }, + { + "epoch": 0.7292094050812687, + "grad_norm": 0.6080484390258789, + "learning_rate": 4.3274288098828185e-06, + "loss": 0.5522, + "step": 4621 + }, + { + "epoch": 0.729367208458261, + "grad_norm": 0.557891845703125, + "learning_rate": 4.3271453718590566e-06, + "loss": 0.5607, + "step": 4622 + }, + { + "epoch": 0.7295250118352533, + "grad_norm": 0.6239949464797974, + "learning_rate": 4.326861883410532e-06, + "loss": 0.5524, + "step": 4623 + }, + { + "epoch": 0.7296828152122455, + "grad_norm": 0.5886620283126831, + "learning_rate": 4.3265783445450695e-06, + "loss": 0.5997, + "step": 4624 + }, + { + "epoch": 0.7298406185892378, + "grad_norm": 0.5707836151123047, + "learning_rate": 4.326294755270493e-06, + "loss": 0.5737, + "step": 4625 + }, + { + "epoch": 0.7299984219662301, + "grad_norm": 0.5713995099067688, + "learning_rate": 4.32601111559463e-06, + "loss": 0.6066, + "step": 4626 + }, + { + "epoch": 0.7301562253432223, + "grad_norm": 0.5661311745643616, + "learning_rate": 4.325727425525308e-06, + "loss": 0.5746, + "step": 4627 + }, + { + "epoch": 0.7303140287202146, + "grad_norm": 0.5743265748023987, + "learning_rate": 4.3254436850703555e-06, + "loss": 0.6079, + "step": 4628 + }, + { + "epoch": 0.7304718320972069, + "grad_norm": 0.5657708644866943, + "learning_rate": 4.325159894237603e-06, + "loss": 0.585, + "step": 4629 + }, + { + "epoch": 0.7306296354741991, + "grad_norm": 0.5608369708061218, + "learning_rate": 4.3248760530348835e-06, + "loss": 0.557, + "step": 4630 + }, + { + "epoch": 0.7307874388511915, + "grad_norm": 0.6024423241615295, + "learning_rate": 4.3245921614700295e-06, + "loss": 0.5857, + "step": 4631 + }, + { + "epoch": 0.7309452422281837, + "grad_norm": 0.6360228657722473, + "learning_rate": 4.324308219550875e-06, + "loss": 0.5669, + "step": 4632 + }, + { + "epoch": 0.7311030456051759, + "grad_norm": 0.5722049474716187, + "learning_rate": 4.324024227285258e-06, + "loss": 0.5566, + "step": 4633 + }, + { + "epoch": 0.7312608489821683, + "grad_norm": 0.5637965798377991, + "learning_rate": 4.323740184681014e-06, + "loss": 0.5426, + "step": 4634 + }, + { + "epoch": 0.7314186523591605, + "grad_norm": 0.5931618213653564, + "learning_rate": 4.323456091745983e-06, + "loss": 0.5763, + "step": 4635 + }, + { + "epoch": 0.7315764557361527, + "grad_norm": 0.582893431186676, + "learning_rate": 4.323171948488006e-06, + "loss": 0.555, + "step": 4636 + }, + { + "epoch": 0.731734259113145, + "grad_norm": 0.5666576623916626, + "learning_rate": 4.322887754914923e-06, + "loss": 0.5703, + "step": 4637 + }, + { + "epoch": 0.7318920624901373, + "grad_norm": 0.6012505888938904, + "learning_rate": 4.322603511034579e-06, + "loss": 0.5512, + "step": 4638 + }, + { + "epoch": 0.7320498658671295, + "grad_norm": 0.6096908450126648, + "learning_rate": 4.322319216854816e-06, + "loss": 0.542, + "step": 4639 + }, + { + "epoch": 0.7322076692441218, + "grad_norm": 0.588094174861908, + "learning_rate": 4.322034872383481e-06, + "loss": 0.602, + "step": 4640 + }, + { + "epoch": 0.7323654726211141, + "grad_norm": 0.5838096141815186, + "learning_rate": 4.321750477628422e-06, + "loss": 0.5698, + "step": 4641 + }, + { + "epoch": 0.7325232759981064, + "grad_norm": 0.590000331401825, + "learning_rate": 4.3214660325974865e-06, + "loss": 0.5725, + "step": 4642 + }, + { + "epoch": 0.7326810793750986, + "grad_norm": 0.5895346403121948, + "learning_rate": 4.321181537298525e-06, + "loss": 0.5636, + "step": 4643 + }, + { + "epoch": 0.7328388827520909, + "grad_norm": 0.5997029542922974, + "learning_rate": 4.32089699173939e-06, + "loss": 0.6084, + "step": 4644 + }, + { + "epoch": 0.7329966861290832, + "grad_norm": 0.5855224132537842, + "learning_rate": 4.320612395927931e-06, + "loss": 0.6039, + "step": 4645 + }, + { + "epoch": 0.7331544895060754, + "grad_norm": 0.5582797527313232, + "learning_rate": 4.320327749872006e-06, + "loss": 0.562, + "step": 4646 + }, + { + "epoch": 0.7333122928830677, + "grad_norm": 0.6059653759002686, + "learning_rate": 4.320043053579467e-06, + "loss": 0.5532, + "step": 4647 + }, + { + "epoch": 0.73347009626006, + "grad_norm": 0.598552942276001, + "learning_rate": 4.3197583070581736e-06, + "loss": 0.5582, + "step": 4648 + }, + { + "epoch": 0.7336278996370522, + "grad_norm": 0.5717710852622986, + "learning_rate": 4.319473510315983e-06, + "loss": 0.5392, + "step": 4649 + }, + { + "epoch": 0.7337857030140446, + "grad_norm": 0.561909556388855, + "learning_rate": 4.319188663360755e-06, + "loss": 0.5981, + "step": 4650 + }, + { + "epoch": 0.7339435063910368, + "grad_norm": 0.580460250377655, + "learning_rate": 4.318903766200351e-06, + "loss": 0.5915, + "step": 4651 + }, + { + "epoch": 0.734101309768029, + "grad_norm": 0.5921778082847595, + "learning_rate": 4.3186188188426325e-06, + "loss": 0.5641, + "step": 4652 + }, + { + "epoch": 0.7342591131450213, + "grad_norm": 0.6100038290023804, + "learning_rate": 4.318333821295465e-06, + "loss": 0.5478, + "step": 4653 + }, + { + "epoch": 0.7344169165220136, + "grad_norm": 0.6063573360443115, + "learning_rate": 4.318048773566713e-06, + "loss": 0.5693, + "step": 4654 + }, + { + "epoch": 0.7345747198990058, + "grad_norm": 0.5701659321784973, + "learning_rate": 4.317763675664242e-06, + "loss": 0.5915, + "step": 4655 + }, + { + "epoch": 0.7347325232759981, + "grad_norm": 0.6035231351852417, + "learning_rate": 4.317478527595921e-06, + "loss": 0.5802, + "step": 4656 + }, + { + "epoch": 0.7348903266529904, + "grad_norm": 0.5838550925254822, + "learning_rate": 4.31719332936962e-06, + "loss": 0.5558, + "step": 4657 + }, + { + "epoch": 0.7350481300299826, + "grad_norm": 0.5916812419891357, + "learning_rate": 4.316908080993209e-06, + "loss": 0.5416, + "step": 4658 + }, + { + "epoch": 0.7352059334069749, + "grad_norm": 0.5653001666069031, + "learning_rate": 4.31662278247456e-06, + "loss": 0.5944, + "step": 4659 + }, + { + "epoch": 0.7353637367839672, + "grad_norm": 0.5975980758666992, + "learning_rate": 4.316337433821546e-06, + "loss": 0.5715, + "step": 4660 + }, + { + "epoch": 0.7355215401609595, + "grad_norm": 0.6131269931793213, + "learning_rate": 4.316052035042044e-06, + "loss": 0.5875, + "step": 4661 + }, + { + "epoch": 0.7356793435379517, + "grad_norm": 0.5955291986465454, + "learning_rate": 4.3157665861439295e-06, + "loss": 0.5698, + "step": 4662 + }, + { + "epoch": 0.735837146914944, + "grad_norm": 0.5673967599868774, + "learning_rate": 4.315481087135078e-06, + "loss": 0.5746, + "step": 4663 + }, + { + "epoch": 0.7359949502919363, + "grad_norm": 0.5437091588973999, + "learning_rate": 4.315195538023371e-06, + "loss": 0.5878, + "step": 4664 + }, + { + "epoch": 0.7361527536689285, + "grad_norm": 0.5904999375343323, + "learning_rate": 4.314909938816688e-06, + "loss": 0.5843, + "step": 4665 + }, + { + "epoch": 0.7363105570459207, + "grad_norm": 0.5967497229576111, + "learning_rate": 4.314624289522912e-06, + "loss": 0.606, + "step": 4666 + }, + { + "epoch": 0.7364683604229131, + "grad_norm": 0.626684308052063, + "learning_rate": 4.314338590149925e-06, + "loss": 0.5822, + "step": 4667 + }, + { + "epoch": 0.7366261637999053, + "grad_norm": 0.5845433473587036, + "learning_rate": 4.314052840705612e-06, + "loss": 0.5789, + "step": 4668 + }, + { + "epoch": 0.7367839671768975, + "grad_norm": 0.5773909091949463, + "learning_rate": 4.313767041197858e-06, + "loss": 0.5359, + "step": 4669 + }, + { + "epoch": 0.7369417705538899, + "grad_norm": 0.5855492353439331, + "learning_rate": 4.313481191634553e-06, + "loss": 0.5883, + "step": 4670 + }, + { + "epoch": 0.7370995739308821, + "grad_norm": 0.5524555444717407, + "learning_rate": 4.313195292023583e-06, + "loss": 0.5873, + "step": 4671 + }, + { + "epoch": 0.7372573773078744, + "grad_norm": 0.5961403846740723, + "learning_rate": 4.312909342372839e-06, + "loss": 0.5791, + "step": 4672 + }, + { + "epoch": 0.7374151806848667, + "grad_norm": 0.6212993264198303, + "learning_rate": 4.3126233426902135e-06, + "loss": 0.5646, + "step": 4673 + }, + { + "epoch": 0.7375729840618589, + "grad_norm": 0.5699817538261414, + "learning_rate": 4.312337292983598e-06, + "loss": 0.5515, + "step": 4674 + }, + { + "epoch": 0.7377307874388512, + "grad_norm": 0.6082561016082764, + "learning_rate": 4.312051193260888e-06, + "loss": 0.5638, + "step": 4675 + }, + { + "epoch": 0.7378885908158435, + "grad_norm": 0.6165137887001038, + "learning_rate": 4.311765043529978e-06, + "loss": 0.5682, + "step": 4676 + }, + { + "epoch": 0.7380463941928357, + "grad_norm": 0.5770621299743652, + "learning_rate": 4.311478843798765e-06, + "loss": 0.581, + "step": 4677 + }, + { + "epoch": 0.738204197569828, + "grad_norm": 0.5917844772338867, + "learning_rate": 4.311192594075149e-06, + "loss": 0.6005, + "step": 4678 + }, + { + "epoch": 0.7383620009468203, + "grad_norm": 0.6067792773246765, + "learning_rate": 4.3109062943670285e-06, + "loss": 0.5967, + "step": 4679 + }, + { + "epoch": 0.7385198043238125, + "grad_norm": 0.5798567533493042, + "learning_rate": 4.310619944682305e-06, + "loss": 0.5908, + "step": 4680 + }, + { + "epoch": 0.7386776077008048, + "grad_norm": 0.5888270735740662, + "learning_rate": 4.3103335450288805e-06, + "loss": 0.6157, + "step": 4681 + }, + { + "epoch": 0.738835411077797, + "grad_norm": 0.6011677980422974, + "learning_rate": 4.31004709541466e-06, + "loss": 0.5657, + "step": 4682 + }, + { + "epoch": 0.7389932144547894, + "grad_norm": 0.5925710201263428, + "learning_rate": 4.309760595847549e-06, + "loss": 0.5831, + "step": 4683 + }, + { + "epoch": 0.7391510178317816, + "grad_norm": 0.5856873393058777, + "learning_rate": 4.309474046335452e-06, + "loss": 0.5736, + "step": 4684 + }, + { + "epoch": 0.7393088212087738, + "grad_norm": 0.6001694798469543, + "learning_rate": 4.309187446886279e-06, + "loss": 0.5813, + "step": 4685 + }, + { + "epoch": 0.7394666245857662, + "grad_norm": 0.5750067234039307, + "learning_rate": 4.308900797507939e-06, + "loss": 0.5815, + "step": 4686 + }, + { + "epoch": 0.7396244279627584, + "grad_norm": 0.5714820623397827, + "learning_rate": 4.308614098208343e-06, + "loss": 0.5799, + "step": 4687 + }, + { + "epoch": 0.7397822313397506, + "grad_norm": 0.6213783025741577, + "learning_rate": 4.308327348995402e-06, + "loss": 0.5504, + "step": 4688 + }, + { + "epoch": 0.739940034716743, + "grad_norm": 0.5626628398895264, + "learning_rate": 4.308040549877032e-06, + "loss": 0.5662, + "step": 4689 + }, + { + "epoch": 0.7400978380937352, + "grad_norm": 0.5579544901847839, + "learning_rate": 4.307753700861146e-06, + "loss": 0.5858, + "step": 4690 + }, + { + "epoch": 0.7402556414707274, + "grad_norm": 0.5987794995307922, + "learning_rate": 4.307466801955661e-06, + "loss": 0.5774, + "step": 4691 + }, + { + "epoch": 0.7404134448477198, + "grad_norm": 0.5997050404548645, + "learning_rate": 4.307179853168494e-06, + "loss": 0.5846, + "step": 4692 + }, + { + "epoch": 0.740571248224712, + "grad_norm": 0.599543035030365, + "learning_rate": 4.306892854507565e-06, + "loss": 0.5551, + "step": 4693 + }, + { + "epoch": 0.7407290516017043, + "grad_norm": 0.5768078565597534, + "learning_rate": 4.306605805980794e-06, + "loss": 0.5784, + "step": 4694 + }, + { + "epoch": 0.7408868549786966, + "grad_norm": 0.6081961989402771, + "learning_rate": 4.306318707596104e-06, + "loss": 0.5396, + "step": 4695 + }, + { + "epoch": 0.7410446583556888, + "grad_norm": 0.60221266746521, + "learning_rate": 4.306031559361415e-06, + "loss": 0.5256, + "step": 4696 + }, + { + "epoch": 0.7412024617326811, + "grad_norm": 0.5917801260948181, + "learning_rate": 4.305744361284655e-06, + "loss": 0.5175, + "step": 4697 + }, + { + "epoch": 0.7413602651096733, + "grad_norm": 0.6179635524749756, + "learning_rate": 4.3054571133737485e-06, + "loss": 0.5903, + "step": 4698 + }, + { + "epoch": 0.7415180684866656, + "grad_norm": 0.5986982583999634, + "learning_rate": 4.305169815636624e-06, + "loss": 0.5869, + "step": 4699 + }, + { + "epoch": 0.7416758718636579, + "grad_norm": 0.560623824596405, + "learning_rate": 4.304882468081209e-06, + "loss": 0.5803, + "step": 4700 + }, + { + "epoch": 0.7418336752406501, + "grad_norm": 0.5997405052185059, + "learning_rate": 4.304595070715433e-06, + "loss": 0.5611, + "step": 4701 + }, + { + "epoch": 0.7419914786176424, + "grad_norm": 0.549085259437561, + "learning_rate": 4.304307623547229e-06, + "loss": 0.5642, + "step": 4702 + }, + { + "epoch": 0.7421492819946347, + "grad_norm": 0.5968870520591736, + "learning_rate": 4.304020126584529e-06, + "loss": 0.5755, + "step": 4703 + }, + { + "epoch": 0.7423070853716269, + "grad_norm": 0.549527645111084, + "learning_rate": 4.303732579835268e-06, + "loss": 0.5851, + "step": 4704 + }, + { + "epoch": 0.7424648887486193, + "grad_norm": 0.5608077049255371, + "learning_rate": 4.30344498330738e-06, + "loss": 0.5484, + "step": 4705 + }, + { + "epoch": 0.7426226921256115, + "grad_norm": 0.570746898651123, + "learning_rate": 4.303157337008804e-06, + "loss": 0.5429, + "step": 4706 + }, + { + "epoch": 0.7427804955026037, + "grad_norm": 0.5939521789550781, + "learning_rate": 4.3028696409474766e-06, + "loss": 0.5836, + "step": 4707 + }, + { + "epoch": 0.7429382988795961, + "grad_norm": 0.5804494619369507, + "learning_rate": 4.302581895131338e-06, + "loss": 0.571, + "step": 4708 + }, + { + "epoch": 0.7430961022565883, + "grad_norm": 0.5872808694839478, + "learning_rate": 4.3022940995683305e-06, + "loss": 0.5579, + "step": 4709 + }, + { + "epoch": 0.7432539056335805, + "grad_norm": 0.5834758877754211, + "learning_rate": 4.302006254266395e-06, + "loss": 0.5824, + "step": 4710 + }, + { + "epoch": 0.7434117090105729, + "grad_norm": 0.5765332579612732, + "learning_rate": 4.301718359233475e-06, + "loss": 0.5766, + "step": 4711 + }, + { + "epoch": 0.7435695123875651, + "grad_norm": 0.5960057973861694, + "learning_rate": 4.301430414477518e-06, + "loss": 0.5705, + "step": 4712 + }, + { + "epoch": 0.7437273157645573, + "grad_norm": 0.5836743116378784, + "learning_rate": 4.301142420006469e-06, + "loss": 0.571, + "step": 4713 + }, + { + "epoch": 0.7438851191415496, + "grad_norm": 0.567283570766449, + "learning_rate": 4.300854375828275e-06, + "loss": 0.6024, + "step": 4714 + }, + { + "epoch": 0.7440429225185419, + "grad_norm": 0.5765351057052612, + "learning_rate": 4.300566281950888e-06, + "loss": 0.5451, + "step": 4715 + }, + { + "epoch": 0.7442007258955342, + "grad_norm": 0.558664858341217, + "learning_rate": 4.300278138382255e-06, + "loss": 0.5915, + "step": 4716 + }, + { + "epoch": 0.7443585292725264, + "grad_norm": 0.5714871287345886, + "learning_rate": 4.299989945130332e-06, + "loss": 0.5574, + "step": 4717 + }, + { + "epoch": 0.7445163326495187, + "grad_norm": 0.5896415710449219, + "learning_rate": 4.29970170220307e-06, + "loss": 0.5305, + "step": 4718 + }, + { + "epoch": 0.744674136026511, + "grad_norm": 0.5768646597862244, + "learning_rate": 4.299413409608424e-06, + "loss": 0.5934, + "step": 4719 + }, + { + "epoch": 0.7448319394035032, + "grad_norm": 0.6301082968711853, + "learning_rate": 4.299125067354352e-06, + "loss": 0.5524, + "step": 4720 + }, + { + "epoch": 0.7449897427804955, + "grad_norm": 0.5882108211517334, + "learning_rate": 4.2988366754488084e-06, + "loss": 0.5429, + "step": 4721 + }, + { + "epoch": 0.7451475461574878, + "grad_norm": 0.5844355821609497, + "learning_rate": 4.298548233899755e-06, + "loss": 0.568, + "step": 4722 + }, + { + "epoch": 0.74530534953448, + "grad_norm": 0.5874985456466675, + "learning_rate": 4.298259742715151e-06, + "loss": 0.5894, + "step": 4723 + }, + { + "epoch": 0.7454631529114724, + "grad_norm": 0.5738307237625122, + "learning_rate": 4.297971201902957e-06, + "loss": 0.559, + "step": 4724 + }, + { + "epoch": 0.7456209562884646, + "grad_norm": 0.5693222880363464, + "learning_rate": 4.297682611471138e-06, + "loss": 0.5884, + "step": 4725 + }, + { + "epoch": 0.7457787596654568, + "grad_norm": 0.5539371371269226, + "learning_rate": 4.297393971427657e-06, + "loss": 0.5534, + "step": 4726 + }, + { + "epoch": 0.7459365630424492, + "grad_norm": 0.5890965461730957, + "learning_rate": 4.297105281780479e-06, + "loss": 0.5812, + "step": 4727 + }, + { + "epoch": 0.7460943664194414, + "grad_norm": 0.5726961493492126, + "learning_rate": 4.296816542537574e-06, + "loss": 0.5626, + "step": 4728 + }, + { + "epoch": 0.7462521697964336, + "grad_norm": 0.5554796457290649, + "learning_rate": 4.296527753706908e-06, + "loss": 0.5609, + "step": 4729 + }, + { + "epoch": 0.746409973173426, + "grad_norm": 0.5782054662704468, + "learning_rate": 4.296238915296452e-06, + "loss": 0.5781, + "step": 4730 + }, + { + "epoch": 0.7465677765504182, + "grad_norm": 0.5933186411857605, + "learning_rate": 4.2959500273141765e-06, + "loss": 0.5906, + "step": 4731 + }, + { + "epoch": 0.7467255799274104, + "grad_norm": 0.6166303157806396, + "learning_rate": 4.295661089768054e-06, + "loss": 0.5513, + "step": 4732 + }, + { + "epoch": 0.7468833833044027, + "grad_norm": 0.6354818940162659, + "learning_rate": 4.29537210266606e-06, + "loss": 0.574, + "step": 4733 + }, + { + "epoch": 0.747041186681395, + "grad_norm": 0.5675528645515442, + "learning_rate": 4.295083066016168e-06, + "loss": 0.5813, + "step": 4734 + }, + { + "epoch": 0.7471989900583873, + "grad_norm": 0.5772945284843445, + "learning_rate": 4.294793979826356e-06, + "loss": 0.5461, + "step": 4735 + }, + { + "epoch": 0.7473567934353795, + "grad_norm": 0.587824821472168, + "learning_rate": 4.294504844104602e-06, + "loss": 0.585, + "step": 4736 + }, + { + "epoch": 0.7475145968123718, + "grad_norm": 0.5827080607414246, + "learning_rate": 4.294215658858885e-06, + "loss": 0.586, + "step": 4737 + }, + { + "epoch": 0.7476724001893641, + "grad_norm": 0.5781714916229248, + "learning_rate": 4.293926424097186e-06, + "loss": 0.5765, + "step": 4738 + }, + { + "epoch": 0.7478302035663563, + "grad_norm": 0.5686776638031006, + "learning_rate": 4.293637139827486e-06, + "loss": 0.5584, + "step": 4739 + }, + { + "epoch": 0.7479880069433485, + "grad_norm": 0.6387318968772888, + "learning_rate": 4.29334780605777e-06, + "loss": 0.5835, + "step": 4740 + }, + { + "epoch": 0.7481458103203409, + "grad_norm": 0.6026250720024109, + "learning_rate": 4.293058422796023e-06, + "loss": 0.6043, + "step": 4741 + }, + { + "epoch": 0.7483036136973331, + "grad_norm": 0.574633777141571, + "learning_rate": 4.292768990050229e-06, + "loss": 0.5644, + "step": 4742 + }, + { + "epoch": 0.7484614170743253, + "grad_norm": 0.6169712543487549, + "learning_rate": 4.292479507828379e-06, + "loss": 0.5788, + "step": 4743 + }, + { + "epoch": 0.7486192204513177, + "grad_norm": 0.5948607921600342, + "learning_rate": 4.29218997613846e-06, + "loss": 0.5851, + "step": 4744 + }, + { + "epoch": 0.7487770238283099, + "grad_norm": 0.5519044399261475, + "learning_rate": 4.291900394988463e-06, + "loss": 0.5828, + "step": 4745 + }, + { + "epoch": 0.7489348272053022, + "grad_norm": 0.6006577014923096, + "learning_rate": 4.291610764386379e-06, + "loss": 0.5851, + "step": 4746 + }, + { + "epoch": 0.7490926305822945, + "grad_norm": 0.581404447555542, + "learning_rate": 4.291321084340202e-06, + "loss": 0.5666, + "step": 4747 + }, + { + "epoch": 0.7492504339592867, + "grad_norm": 0.621860682964325, + "learning_rate": 4.291031354857927e-06, + "loss": 0.6003, + "step": 4748 + }, + { + "epoch": 0.749408237336279, + "grad_norm": 0.5859420299530029, + "learning_rate": 4.290741575947547e-06, + "loss": 0.5495, + "step": 4749 + }, + { + "epoch": 0.7495660407132713, + "grad_norm": 0.5745954513549805, + "learning_rate": 4.290451747617061e-06, + "loss": 0.5712, + "step": 4750 + }, + { + "epoch": 0.7497238440902635, + "grad_norm": 0.5657274723052979, + "learning_rate": 4.29016186987447e-06, + "loss": 0.5575, + "step": 4751 + }, + { + "epoch": 0.7498816474672558, + "grad_norm": 0.5778735876083374, + "learning_rate": 4.289871942727769e-06, + "loss": 0.5546, + "step": 4752 + }, + { + "epoch": 0.7500394508442481, + "grad_norm": 0.5799294710159302, + "learning_rate": 4.289581966184963e-06, + "loss": 0.5963, + "step": 4753 + }, + { + "epoch": 0.7501972542212403, + "grad_norm": 0.6147696375846863, + "learning_rate": 4.289291940254054e-06, + "loss": 0.548, + "step": 4754 + }, + { + "epoch": 0.7503550575982326, + "grad_norm": 0.6075230836868286, + "learning_rate": 4.289001864943044e-06, + "loss": 0.5568, + "step": 4755 + }, + { + "epoch": 0.7505128609752248, + "grad_norm": 0.5721176862716675, + "learning_rate": 4.288711740259941e-06, + "loss": 0.5495, + "step": 4756 + }, + { + "epoch": 0.7506706643522172, + "grad_norm": 0.5643274188041687, + "learning_rate": 4.28842156621275e-06, + "loss": 0.5589, + "step": 4757 + }, + { + "epoch": 0.7508284677292094, + "grad_norm": 0.588547945022583, + "learning_rate": 4.288131342809481e-06, + "loss": 0.5934, + "step": 4758 + }, + { + "epoch": 0.7509862711062016, + "grad_norm": 0.5833842754364014, + "learning_rate": 4.287841070058141e-06, + "loss": 0.5715, + "step": 4759 + }, + { + "epoch": 0.751144074483194, + "grad_norm": 0.5940499901771545, + "learning_rate": 4.287550747966741e-06, + "loss": 0.5303, + "step": 4760 + }, + { + "epoch": 0.7513018778601862, + "grad_norm": 0.5511233806610107, + "learning_rate": 4.287260376543296e-06, + "loss": 0.5599, + "step": 4761 + }, + { + "epoch": 0.7514596812371784, + "grad_norm": 0.6104879379272461, + "learning_rate": 4.286969955795816e-06, + "loss": 0.5522, + "step": 4762 + }, + { + "epoch": 0.7516174846141708, + "grad_norm": 0.5806803107261658, + "learning_rate": 4.286679485732319e-06, + "loss": 0.5838, + "step": 4763 + }, + { + "epoch": 0.751775287991163, + "grad_norm": 0.5872542262077332, + "learning_rate": 4.286388966360819e-06, + "loss": 0.553, + "step": 4764 + }, + { + "epoch": 0.7519330913681552, + "grad_norm": 0.6149010062217712, + "learning_rate": 4.286098397689335e-06, + "loss": 0.5965, + "step": 4765 + }, + { + "epoch": 0.7520908947451476, + "grad_norm": 0.5844247937202454, + "learning_rate": 4.285807779725885e-06, + "loss": 0.5662, + "step": 4766 + }, + { + "epoch": 0.7522486981221398, + "grad_norm": 0.5702762603759766, + "learning_rate": 4.285517112478491e-06, + "loss": 0.5812, + "step": 4767 + }, + { + "epoch": 0.7524065014991321, + "grad_norm": 0.5812419056892395, + "learning_rate": 4.285226395955173e-06, + "loss": 0.5794, + "step": 4768 + }, + { + "epoch": 0.7525643048761244, + "grad_norm": 0.5516024231910706, + "learning_rate": 4.284935630163954e-06, + "loss": 0.5649, + "step": 4769 + }, + { + "epoch": 0.7527221082531166, + "grad_norm": 0.6130374073982239, + "learning_rate": 4.28464481511286e-06, + "loss": 0.5875, + "step": 4770 + }, + { + "epoch": 0.7528799116301089, + "grad_norm": 0.6260847449302673, + "learning_rate": 4.284353950809914e-06, + "loss": 0.5875, + "step": 4771 + }, + { + "epoch": 0.7530377150071011, + "grad_norm": 0.5897089242935181, + "learning_rate": 4.284063037263147e-06, + "loss": 0.5796, + "step": 4772 + }, + { + "epoch": 0.7531955183840934, + "grad_norm": 0.6068273782730103, + "learning_rate": 4.283772074480584e-06, + "loss": 0.56, + "step": 4773 + }, + { + "epoch": 0.7533533217610857, + "grad_norm": 0.5919108986854553, + "learning_rate": 4.283481062470257e-06, + "loss": 0.571, + "step": 4774 + }, + { + "epoch": 0.7535111251380779, + "grad_norm": 0.6154858469963074, + "learning_rate": 4.283190001240197e-06, + "loss": 0.5713, + "step": 4775 + }, + { + "epoch": 0.7536689285150702, + "grad_norm": 0.5599690079689026, + "learning_rate": 4.282898890798436e-06, + "loss": 0.5504, + "step": 4776 + }, + { + "epoch": 0.7538267318920625, + "grad_norm": 1.0128604173660278, + "learning_rate": 4.282607731153008e-06, + "loss": 0.5857, + "step": 4777 + }, + { + "epoch": 0.7539845352690547, + "grad_norm": 0.5890240669250488, + "learning_rate": 4.282316522311948e-06, + "loss": 0.5905, + "step": 4778 + }, + { + "epoch": 0.7541423386460471, + "grad_norm": 0.598624050617218, + "learning_rate": 4.282025264283293e-06, + "loss": 0.579, + "step": 4779 + }, + { + "epoch": 0.7543001420230393, + "grad_norm": 0.5885270237922668, + "learning_rate": 4.281733957075081e-06, + "loss": 0.574, + "step": 4780 + }, + { + "epoch": 0.7544579454000315, + "grad_norm": 0.5543823838233948, + "learning_rate": 4.281442600695352e-06, + "loss": 0.6066, + "step": 4781 + }, + { + "epoch": 0.7546157487770239, + "grad_norm": 0.5635919570922852, + "learning_rate": 4.281151195152146e-06, + "loss": 0.577, + "step": 4782 + }, + { + "epoch": 0.7547735521540161, + "grad_norm": 0.5401657819747925, + "learning_rate": 4.280859740453505e-06, + "loss": 0.5634, + "step": 4783 + }, + { + "epoch": 0.7549313555310083, + "grad_norm": 0.5743206739425659, + "learning_rate": 4.280568236607472e-06, + "loss": 0.5625, + "step": 4784 + }, + { + "epoch": 0.7550891589080007, + "grad_norm": 0.5843325853347778, + "learning_rate": 4.280276683622093e-06, + "loss": 0.5811, + "step": 4785 + }, + { + "epoch": 0.7552469622849929, + "grad_norm": 0.6075741648674011, + "learning_rate": 4.2799850815054145e-06, + "loss": 0.5729, + "step": 4786 + }, + { + "epoch": 0.7554047656619851, + "grad_norm": 0.5794164538383484, + "learning_rate": 4.279693430265482e-06, + "loss": 0.5666, + "step": 4787 + }, + { + "epoch": 0.7555625690389774, + "grad_norm": 0.5817893147468567, + "learning_rate": 4.279401729910346e-06, + "loss": 0.5511, + "step": 4788 + }, + { + "epoch": 0.7557203724159697, + "grad_norm": 0.610696017742157, + "learning_rate": 4.2791099804480555e-06, + "loss": 0.5923, + "step": 4789 + }, + { + "epoch": 0.755878175792962, + "grad_norm": 0.5906046032905579, + "learning_rate": 4.278818181886663e-06, + "loss": 0.5622, + "step": 4790 + }, + { + "epoch": 0.7560359791699542, + "grad_norm": 0.5712579488754272, + "learning_rate": 4.278526334234222e-06, + "loss": 0.56, + "step": 4791 + }, + { + "epoch": 0.7561937825469465, + "grad_norm": 0.5607795715332031, + "learning_rate": 4.278234437498786e-06, + "loss": 0.5652, + "step": 4792 + }, + { + "epoch": 0.7563515859239388, + "grad_norm": 0.6245068311691284, + "learning_rate": 4.27794249168841e-06, + "loss": 0.546, + "step": 4793 + }, + { + "epoch": 0.756509389300931, + "grad_norm": 0.5745338201522827, + "learning_rate": 4.2776504968111515e-06, + "loss": 0.6161, + "step": 4794 + }, + { + "epoch": 0.7566671926779233, + "grad_norm": 0.5558388233184814, + "learning_rate": 4.277358452875069e-06, + "loss": 0.5907, + "step": 4795 + }, + { + "epoch": 0.7568249960549156, + "grad_norm": 0.6648277640342712, + "learning_rate": 4.277066359888223e-06, + "loss": 0.5842, + "step": 4796 + }, + { + "epoch": 0.7569827994319078, + "grad_norm": 0.6738482117652893, + "learning_rate": 4.276774217858673e-06, + "loss": 0.5529, + "step": 4797 + }, + { + "epoch": 0.7571406028089002, + "grad_norm": 0.5779871940612793, + "learning_rate": 4.276482026794482e-06, + "loss": 0.573, + "step": 4798 + }, + { + "epoch": 0.7572984061858924, + "grad_norm": 0.5949130058288574, + "learning_rate": 4.276189786703715e-06, + "loss": 0.5446, + "step": 4799 + }, + { + "epoch": 0.7574562095628846, + "grad_norm": 0.5954297184944153, + "learning_rate": 4.275897497594435e-06, + "loss": 0.5421, + "step": 4800 + }, + { + "epoch": 0.757614012939877, + "grad_norm": 0.6127998232841492, + "learning_rate": 4.275605159474709e-06, + "loss": 0.5499, + "step": 4801 + }, + { + "epoch": 0.7577718163168692, + "grad_norm": 0.60308438539505, + "learning_rate": 4.275312772352606e-06, + "loss": 0.6052, + "step": 4802 + }, + { + "epoch": 0.7579296196938614, + "grad_norm": 0.5941794514656067, + "learning_rate": 4.275020336236194e-06, + "loss": 0.5823, + "step": 4803 + }, + { + "epoch": 0.7580874230708537, + "grad_norm": 0.5925840139389038, + "learning_rate": 4.274727851133545e-06, + "loss": 0.5621, + "step": 4804 + }, + { + "epoch": 0.758245226447846, + "grad_norm": 0.603640079498291, + "learning_rate": 4.274435317052729e-06, + "loss": 0.5458, + "step": 4805 + }, + { + "epoch": 0.7584030298248382, + "grad_norm": 0.5888864994049072, + "learning_rate": 4.274142734001821e-06, + "loss": 0.5507, + "step": 4806 + }, + { + "epoch": 0.7585608332018305, + "grad_norm": 0.5951517820358276, + "learning_rate": 4.273850101988893e-06, + "loss": 0.5637, + "step": 4807 + }, + { + "epoch": 0.7587186365788228, + "grad_norm": 0.6009926199913025, + "learning_rate": 4.273557421022024e-06, + "loss": 0.5668, + "step": 4808 + }, + { + "epoch": 0.7588764399558151, + "grad_norm": 0.5836387872695923, + "learning_rate": 4.27326469110929e-06, + "loss": 0.5607, + "step": 4809 + }, + { + "epoch": 0.7590342433328073, + "grad_norm": 0.5723253488540649, + "learning_rate": 4.27297191225877e-06, + "loss": 0.5768, + "step": 4810 + }, + { + "epoch": 0.7591920467097996, + "grad_norm": 0.5694248676300049, + "learning_rate": 4.272679084478541e-06, + "loss": 0.5537, + "step": 4811 + }, + { + "epoch": 0.7593498500867919, + "grad_norm": 0.5998250246047974, + "learning_rate": 4.272386207776689e-06, + "loss": 0.5541, + "step": 4812 + }, + { + "epoch": 0.7595076534637841, + "grad_norm": 0.5645626187324524, + "learning_rate": 4.272093282161295e-06, + "loss": 0.5771, + "step": 4813 + }, + { + "epoch": 0.7596654568407764, + "grad_norm": 0.581935465335846, + "learning_rate": 4.271800307640441e-06, + "loss": 0.5544, + "step": 4814 + }, + { + "epoch": 0.7598232602177687, + "grad_norm": 0.5956999063491821, + "learning_rate": 4.271507284222214e-06, + "loss": 0.5509, + "step": 4815 + }, + { + "epoch": 0.7599810635947609, + "grad_norm": 0.590842604637146, + "learning_rate": 4.271214211914701e-06, + "loss": 0.5392, + "step": 4816 + }, + { + "epoch": 0.7601388669717531, + "grad_norm": 0.5604117512702942, + "learning_rate": 4.27092109072599e-06, + "loss": 0.5742, + "step": 4817 + }, + { + "epoch": 0.7602966703487455, + "grad_norm": 0.5920671224594116, + "learning_rate": 4.270627920664171e-06, + "loss": 0.5891, + "step": 4818 + }, + { + "epoch": 0.7604544737257377, + "grad_norm": 0.5893786549568176, + "learning_rate": 4.270334701737333e-06, + "loss": 0.568, + "step": 4819 + }, + { + "epoch": 0.76061227710273, + "grad_norm": 0.5779990553855896, + "learning_rate": 4.270041433953569e-06, + "loss": 0.5603, + "step": 4820 + }, + { + "epoch": 0.7607700804797223, + "grad_norm": 0.6398794054985046, + "learning_rate": 4.269748117320973e-06, + "loss": 0.5302, + "step": 4821 + }, + { + "epoch": 0.7609278838567145, + "grad_norm": 0.5672206282615662, + "learning_rate": 4.269454751847639e-06, + "loss": 0.5511, + "step": 4822 + }, + { + "epoch": 0.7610856872337068, + "grad_norm": 0.5569623708724976, + "learning_rate": 4.2691613375416636e-06, + "loss": 0.5646, + "step": 4823 + }, + { + "epoch": 0.7612434906106991, + "grad_norm": 0.566322922706604, + "learning_rate": 4.268867874411145e-06, + "loss": 0.5768, + "step": 4824 + }, + { + "epoch": 0.7614012939876913, + "grad_norm": 0.5846782922744751, + "learning_rate": 4.26857436246418e-06, + "loss": 0.4927, + "step": 4825 + }, + { + "epoch": 0.7615590973646836, + "grad_norm": 0.579892098903656, + "learning_rate": 4.268280801708871e-06, + "loss": 0.5842, + "step": 4826 + }, + { + "epoch": 0.7617169007416759, + "grad_norm": 0.5937774181365967, + "learning_rate": 4.267987192153319e-06, + "loss": 0.5827, + "step": 4827 + }, + { + "epoch": 0.7618747041186681, + "grad_norm": 0.6169293522834778, + "learning_rate": 4.267693533805626e-06, + "loss": 0.5502, + "step": 4828 + }, + { + "epoch": 0.7620325074956604, + "grad_norm": 0.5877321362495422, + "learning_rate": 4.267399826673897e-06, + "loss": 0.5604, + "step": 4829 + }, + { + "epoch": 0.7621903108726527, + "grad_norm": 0.5878764986991882, + "learning_rate": 4.267106070766238e-06, + "loss": 0.5852, + "step": 4830 + }, + { + "epoch": 0.762348114249645, + "grad_norm": 0.5917280912399292, + "learning_rate": 4.266812266090755e-06, + "loss": 0.5374, + "step": 4831 + }, + { + "epoch": 0.7625059176266372, + "grad_norm": 0.5589550137519836, + "learning_rate": 4.266518412655557e-06, + "loss": 0.5527, + "step": 4832 + }, + { + "epoch": 0.7626637210036294, + "grad_norm": 0.568813681602478, + "learning_rate": 4.266224510468753e-06, + "loss": 0.5831, + "step": 4833 + }, + { + "epoch": 0.7628215243806218, + "grad_norm": 0.5756451487541199, + "learning_rate": 4.265930559538455e-06, + "loss": 0.5838, + "step": 4834 + }, + { + "epoch": 0.762979327757614, + "grad_norm": 0.5885137915611267, + "learning_rate": 4.265636559872775e-06, + "loss": 0.5753, + "step": 4835 + }, + { + "epoch": 0.7631371311346062, + "grad_norm": 0.605230450630188, + "learning_rate": 4.265342511479825e-06, + "loss": 0.5458, + "step": 4836 + }, + { + "epoch": 0.7632949345115986, + "grad_norm": 0.6016939878463745, + "learning_rate": 4.265048414367723e-06, + "loss": 0.5875, + "step": 4837 + }, + { + "epoch": 0.7634527378885908, + "grad_norm": 0.5898129940032959, + "learning_rate": 4.264754268544582e-06, + "loss": 0.5735, + "step": 4838 + }, + { + "epoch": 0.763610541265583, + "grad_norm": 0.5514022707939148, + "learning_rate": 4.264460074018522e-06, + "loss": 0.5875, + "step": 4839 + }, + { + "epoch": 0.7637683446425754, + "grad_norm": 0.5934242010116577, + "learning_rate": 4.264165830797662e-06, + "loss": 0.6039, + "step": 4840 + }, + { + "epoch": 0.7639261480195676, + "grad_norm": 0.5665915012359619, + "learning_rate": 4.263871538890122e-06, + "loss": 0.5559, + "step": 4841 + }, + { + "epoch": 0.7640839513965599, + "grad_norm": 0.599092960357666, + "learning_rate": 4.263577198304023e-06, + "loss": 0.5506, + "step": 4842 + }, + { + "epoch": 0.7642417547735522, + "grad_norm": 0.6090904474258423, + "learning_rate": 4.26328280904749e-06, + "loss": 0.5424, + "step": 4843 + }, + { + "epoch": 0.7643995581505444, + "grad_norm": 0.5988256335258484, + "learning_rate": 4.262988371128645e-06, + "loss": 0.5709, + "step": 4844 + }, + { + "epoch": 0.7645573615275367, + "grad_norm": 0.6014102697372437, + "learning_rate": 4.262693884555616e-06, + "loss": 0.5818, + "step": 4845 + }, + { + "epoch": 0.764715164904529, + "grad_norm": 0.583564043045044, + "learning_rate": 4.262399349336528e-06, + "loss": 0.5413, + "step": 4846 + }, + { + "epoch": 0.7648729682815212, + "grad_norm": 0.5765478610992432, + "learning_rate": 4.26210476547951e-06, + "loss": 0.5679, + "step": 4847 + }, + { + "epoch": 0.7650307716585135, + "grad_norm": 0.5870807766914368, + "learning_rate": 4.261810132992693e-06, + "loss": 0.5749, + "step": 4848 + }, + { + "epoch": 0.7651885750355057, + "grad_norm": 0.5790922045707703, + "learning_rate": 4.2615154518842076e-06, + "loss": 0.5394, + "step": 4849 + }, + { + "epoch": 0.765346378412498, + "grad_norm": 0.5779646635055542, + "learning_rate": 4.261220722162186e-06, + "loss": 0.6222, + "step": 4850 + }, + { + "epoch": 0.7655041817894903, + "grad_norm": 0.5814025402069092, + "learning_rate": 4.260925943834762e-06, + "loss": 0.5641, + "step": 4851 + }, + { + "epoch": 0.7656619851664825, + "grad_norm": 0.5920023918151855, + "learning_rate": 4.260631116910071e-06, + "loss": 0.5703, + "step": 4852 + }, + { + "epoch": 0.7658197885434749, + "grad_norm": 0.5945155620574951, + "learning_rate": 4.260336241396249e-06, + "loss": 0.57, + "step": 4853 + }, + { + "epoch": 0.7659775919204671, + "grad_norm": 0.5798459649085999, + "learning_rate": 4.260041317301435e-06, + "loss": 0.5481, + "step": 4854 + }, + { + "epoch": 0.7661353952974593, + "grad_norm": 0.6040289402008057, + "learning_rate": 4.2597463446337675e-06, + "loss": 0.5765, + "step": 4855 + }, + { + "epoch": 0.7662931986744517, + "grad_norm": 0.5833120346069336, + "learning_rate": 4.259451323401387e-06, + "loss": 0.5489, + "step": 4856 + }, + { + "epoch": 0.7664510020514439, + "grad_norm": 0.5544666051864624, + "learning_rate": 4.259156253612434e-06, + "loss": 0.5214, + "step": 4857 + }, + { + "epoch": 0.7666088054284361, + "grad_norm": 0.5888547897338867, + "learning_rate": 4.258861135275054e-06, + "loss": 0.5664, + "step": 4858 + }, + { + "epoch": 0.7667666088054285, + "grad_norm": 0.5829806923866272, + "learning_rate": 4.258565968397391e-06, + "loss": 0.5821, + "step": 4859 + }, + { + "epoch": 0.7669244121824207, + "grad_norm": 0.6014844179153442, + "learning_rate": 4.25827075298759e-06, + "loss": 0.5801, + "step": 4860 + }, + { + "epoch": 0.7670822155594129, + "grad_norm": 0.5897009968757629, + "learning_rate": 4.2579754890537985e-06, + "loss": 0.5741, + "step": 4861 + }, + { + "epoch": 0.7672400189364053, + "grad_norm": 0.5930191278457642, + "learning_rate": 4.257680176604166e-06, + "loss": 0.5496, + "step": 4862 + }, + { + "epoch": 0.7673978223133975, + "grad_norm": 0.5703510642051697, + "learning_rate": 4.257384815646841e-06, + "loss": 0.5276, + "step": 4863 + }, + { + "epoch": 0.7675556256903898, + "grad_norm": 0.57047438621521, + "learning_rate": 4.257089406189975e-06, + "loss": 0.5435, + "step": 4864 + }, + { + "epoch": 0.767713429067382, + "grad_norm": 0.595981776714325, + "learning_rate": 4.256793948241722e-06, + "loss": 0.5444, + "step": 4865 + }, + { + "epoch": 0.7678712324443743, + "grad_norm": 0.5787847638130188, + "learning_rate": 4.256498441810234e-06, + "loss": 0.5845, + "step": 4866 + }, + { + "epoch": 0.7680290358213666, + "grad_norm": 0.5979446172714233, + "learning_rate": 4.256202886903668e-06, + "loss": 0.5763, + "step": 4867 + }, + { + "epoch": 0.7681868391983588, + "grad_norm": 0.5958724021911621, + "learning_rate": 4.255907283530179e-06, + "loss": 0.588, + "step": 4868 + }, + { + "epoch": 0.7683446425753511, + "grad_norm": 0.5670300722122192, + "learning_rate": 4.255611631697925e-06, + "loss": 0.5916, + "step": 4869 + }, + { + "epoch": 0.7685024459523434, + "grad_norm": 0.579244077205658, + "learning_rate": 4.255315931415068e-06, + "loss": 0.6012, + "step": 4870 + }, + { + "epoch": 0.7686602493293356, + "grad_norm": 0.5880943536758423, + "learning_rate": 4.255020182689766e-06, + "loss": 0.5554, + "step": 4871 + }, + { + "epoch": 0.768818052706328, + "grad_norm": 0.6109189987182617, + "learning_rate": 4.254724385530182e-06, + "loss": 0.5684, + "step": 4872 + }, + { + "epoch": 0.7689758560833202, + "grad_norm": 0.6011088490486145, + "learning_rate": 4.254428539944477e-06, + "loss": 0.5907, + "step": 4873 + }, + { + "epoch": 0.7691336594603124, + "grad_norm": 0.5870009660720825, + "learning_rate": 4.254132645940818e-06, + "loss": 0.5632, + "step": 4874 + }, + { + "epoch": 0.7692914628373048, + "grad_norm": 0.6152269840240479, + "learning_rate": 4.253836703527371e-06, + "loss": 0.5726, + "step": 4875 + }, + { + "epoch": 0.769449266214297, + "grad_norm": 0.6104797124862671, + "learning_rate": 4.2535407127123025e-06, + "loss": 0.5576, + "step": 4876 + }, + { + "epoch": 0.7696070695912892, + "grad_norm": 0.5409712195396423, + "learning_rate": 4.253244673503781e-06, + "loss": 0.5824, + "step": 4877 + }, + { + "epoch": 0.7697648729682816, + "grad_norm": 0.5902339816093445, + "learning_rate": 4.252948585909978e-06, + "loss": 0.5754, + "step": 4878 + }, + { + "epoch": 0.7699226763452738, + "grad_norm": 0.592129647731781, + "learning_rate": 4.252652449939062e-06, + "loss": 0.589, + "step": 4879 + }, + { + "epoch": 0.770080479722266, + "grad_norm": 0.647301197052002, + "learning_rate": 4.252356265599208e-06, + "loss": 0.5772, + "step": 4880 + }, + { + "epoch": 0.7702382830992583, + "grad_norm": 0.5741134285926819, + "learning_rate": 4.252060032898589e-06, + "loss": 0.5943, + "step": 4881 + }, + { + "epoch": 0.7703960864762506, + "grad_norm": 0.5691123604774475, + "learning_rate": 4.2517637518453795e-06, + "loss": 0.5934, + "step": 4882 + }, + { + "epoch": 0.7705538898532429, + "grad_norm": 0.591885507106781, + "learning_rate": 4.251467422447758e-06, + "loss": 0.5846, + "step": 4883 + }, + { + "epoch": 0.7707116932302351, + "grad_norm": 0.5705270767211914, + "learning_rate": 4.251171044713901e-06, + "loss": 0.5829, + "step": 4884 + }, + { + "epoch": 0.7708694966072274, + "grad_norm": 0.5553304553031921, + "learning_rate": 4.250874618651989e-06, + "loss": 0.5836, + "step": 4885 + }, + { + "epoch": 0.7710272999842197, + "grad_norm": 0.5596164464950562, + "learning_rate": 4.250578144270202e-06, + "loss": 0.5595, + "step": 4886 + }, + { + "epoch": 0.7711851033612119, + "grad_norm": 0.5691215991973877, + "learning_rate": 4.250281621576722e-06, + "loss": 0.556, + "step": 4887 + }, + { + "epoch": 0.7713429067382042, + "grad_norm": 0.5560022592544556, + "learning_rate": 4.249985050579731e-06, + "loss": 0.5659, + "step": 4888 + }, + { + "epoch": 0.7715007101151965, + "grad_norm": 0.5703991055488586, + "learning_rate": 4.249688431287416e-06, + "loss": 0.5501, + "step": 4889 + }, + { + "epoch": 0.7716585134921887, + "grad_norm": 0.5688493251800537, + "learning_rate": 4.249391763707962e-06, + "loss": 0.558, + "step": 4890 + }, + { + "epoch": 0.771816316869181, + "grad_norm": 0.5813100934028625, + "learning_rate": 4.249095047849556e-06, + "loss": 0.5913, + "step": 4891 + }, + { + "epoch": 0.7719741202461733, + "grad_norm": 0.6057785153388977, + "learning_rate": 4.248798283720387e-06, + "loss": 0.5958, + "step": 4892 + }, + { + "epoch": 0.7721319236231655, + "grad_norm": 0.6255874633789062, + "learning_rate": 4.2485014713286446e-06, + "loss": 0.5311, + "step": 4893 + }, + { + "epoch": 0.7722897270001579, + "grad_norm": 0.574737548828125, + "learning_rate": 4.2482046106825195e-06, + "loss": 0.579, + "step": 4894 + }, + { + "epoch": 0.7724475303771501, + "grad_norm": 0.5842307806015015, + "learning_rate": 4.247907701790206e-06, + "loss": 0.5747, + "step": 4895 + }, + { + "epoch": 0.7726053337541423, + "grad_norm": 0.6319389343261719, + "learning_rate": 4.247610744659898e-06, + "loss": 0.5564, + "step": 4896 + }, + { + "epoch": 0.7727631371311346, + "grad_norm": 0.5821824073791504, + "learning_rate": 4.247313739299788e-06, + "loss": 0.5814, + "step": 4897 + }, + { + "epoch": 0.7729209405081269, + "grad_norm": 0.5900145769119263, + "learning_rate": 4.247016685718076e-06, + "loss": 0.5838, + "step": 4898 + }, + { + "epoch": 0.7730787438851191, + "grad_norm": 0.6334973573684692, + "learning_rate": 4.246719583922958e-06, + "loss": 0.5663, + "step": 4899 + }, + { + "epoch": 0.7732365472621114, + "grad_norm": 0.582389771938324, + "learning_rate": 4.246422433922633e-06, + "loss": 0.5601, + "step": 4900 + }, + { + "epoch": 0.7733943506391037, + "grad_norm": 0.5896672606468201, + "learning_rate": 4.246125235725304e-06, + "loss": 0.5564, + "step": 4901 + }, + { + "epoch": 0.7735521540160959, + "grad_norm": 0.599013090133667, + "learning_rate": 4.2458279893391705e-06, + "loss": 0.5623, + "step": 4902 + }, + { + "epoch": 0.7737099573930882, + "grad_norm": 0.5855673551559448, + "learning_rate": 4.245530694772437e-06, + "loss": 0.5823, + "step": 4903 + }, + { + "epoch": 0.7738677607700805, + "grad_norm": 0.5919306874275208, + "learning_rate": 4.245233352033308e-06, + "loss": 0.5849, + "step": 4904 + }, + { + "epoch": 0.7740255641470728, + "grad_norm": 0.5789796710014343, + "learning_rate": 4.244935961129989e-06, + "loss": 0.6009, + "step": 4905 + }, + { + "epoch": 0.774183367524065, + "grad_norm": 0.5775147080421448, + "learning_rate": 4.244638522070687e-06, + "loss": 0.595, + "step": 4906 + }, + { + "epoch": 0.7743411709010573, + "grad_norm": 0.571544885635376, + "learning_rate": 4.244341034863612e-06, + "loss": 0.6163, + "step": 4907 + }, + { + "epoch": 0.7744989742780496, + "grad_norm": 0.5844818353652954, + "learning_rate": 4.244043499516972e-06, + "loss": 0.5495, + "step": 4908 + }, + { + "epoch": 0.7746567776550418, + "grad_norm": 0.5796818733215332, + "learning_rate": 4.24374591603898e-06, + "loss": 0.6126, + "step": 4909 + }, + { + "epoch": 0.774814581032034, + "grad_norm": 0.6223645210266113, + "learning_rate": 4.2434482844378476e-06, + "loss": 0.5792, + "step": 4910 + }, + { + "epoch": 0.7749723844090264, + "grad_norm": 0.6092032790184021, + "learning_rate": 4.243150604721788e-06, + "loss": 0.5678, + "step": 4911 + }, + { + "epoch": 0.7751301877860186, + "grad_norm": 0.6095507144927979, + "learning_rate": 4.242852876899018e-06, + "loss": 0.5444, + "step": 4912 + }, + { + "epoch": 0.7752879911630108, + "grad_norm": 0.6185475587844849, + "learning_rate": 4.242555100977755e-06, + "loss": 0.5965, + "step": 4913 + }, + { + "epoch": 0.7754457945400032, + "grad_norm": 0.6126696467399597, + "learning_rate": 4.242257276966214e-06, + "loss": 0.5895, + "step": 4914 + }, + { + "epoch": 0.7756035979169954, + "grad_norm": 0.5585147738456726, + "learning_rate": 4.241959404872616e-06, + "loss": 0.5855, + "step": 4915 + }, + { + "epoch": 0.7757614012939877, + "grad_norm": 0.5668065547943115, + "learning_rate": 4.24166148470518e-06, + "loss": 0.5498, + "step": 4916 + }, + { + "epoch": 0.77591920467098, + "grad_norm": 0.5878072381019592, + "learning_rate": 4.241363516472131e-06, + "loss": 0.5731, + "step": 4917 + }, + { + "epoch": 0.7760770080479722, + "grad_norm": 0.5761723518371582, + "learning_rate": 4.241065500181688e-06, + "loss": 0.571, + "step": 4918 + }, + { + "epoch": 0.7762348114249645, + "grad_norm": 0.5989324450492859, + "learning_rate": 4.24076743584208e-06, + "loss": 0.5686, + "step": 4919 + }, + { + "epoch": 0.7763926148019568, + "grad_norm": 0.6006140112876892, + "learning_rate": 4.240469323461529e-06, + "loss": 0.5704, + "step": 4920 + }, + { + "epoch": 0.776550418178949, + "grad_norm": 0.629928708076477, + "learning_rate": 4.240171163048264e-06, + "loss": 0.5828, + "step": 4921 + }, + { + "epoch": 0.7767082215559413, + "grad_norm": 0.601655900478363, + "learning_rate": 4.239872954610514e-06, + "loss": 0.5258, + "step": 4922 + }, + { + "epoch": 0.7768660249329336, + "grad_norm": 0.5942676663398743, + "learning_rate": 4.239574698156508e-06, + "loss": 0.5606, + "step": 4923 + }, + { + "epoch": 0.7770238283099258, + "grad_norm": 0.5552466511726379, + "learning_rate": 4.239276393694478e-06, + "loss": 0.5174, + "step": 4924 + }, + { + "epoch": 0.7771816316869181, + "grad_norm": 0.614001452922821, + "learning_rate": 4.238978041232654e-06, + "loss": 0.5765, + "step": 4925 + }, + { + "epoch": 0.7773394350639103, + "grad_norm": 0.5968329310417175, + "learning_rate": 4.238679640779273e-06, + "loss": 0.56, + "step": 4926 + }, + { + "epoch": 0.7774972384409027, + "grad_norm": 0.6195690631866455, + "learning_rate": 4.238381192342568e-06, + "loss": 0.5709, + "step": 4927 + }, + { + "epoch": 0.7776550418178949, + "grad_norm": 0.5669845938682556, + "learning_rate": 4.238082695930776e-06, + "loss": 0.5694, + "step": 4928 + }, + { + "epoch": 0.7778128451948871, + "grad_norm": 0.5930643081665039, + "learning_rate": 4.237784151552136e-06, + "loss": 0.5729, + "step": 4929 + }, + { + "epoch": 0.7779706485718795, + "grad_norm": 0.5876446962356567, + "learning_rate": 4.237485559214884e-06, + "loss": 0.5866, + "step": 4930 + }, + { + "epoch": 0.7781284519488717, + "grad_norm": 0.5786778926849365, + "learning_rate": 4.237186918927264e-06, + "loss": 0.567, + "step": 4931 + }, + { + "epoch": 0.7782862553258639, + "grad_norm": 0.6108455657958984, + "learning_rate": 4.236888230697516e-06, + "loss": 0.5726, + "step": 4932 + }, + { + "epoch": 0.7784440587028563, + "grad_norm": 0.5533221960067749, + "learning_rate": 4.236589494533883e-06, + "loss": 0.5393, + "step": 4933 + }, + { + "epoch": 0.7786018620798485, + "grad_norm": 0.5726948976516724, + "learning_rate": 4.236290710444609e-06, + "loss": 0.5692, + "step": 4934 + }, + { + "epoch": 0.7787596654568407, + "grad_norm": 0.55413419008255, + "learning_rate": 4.235991878437942e-06, + "loss": 0.5472, + "step": 4935 + }, + { + "epoch": 0.7789174688338331, + "grad_norm": 0.580422580242157, + "learning_rate": 4.2356929985221265e-06, + "loss": 0.5894, + "step": 4936 + }, + { + "epoch": 0.7790752722108253, + "grad_norm": 0.5921812057495117, + "learning_rate": 4.235394070705411e-06, + "loss": 0.6034, + "step": 4937 + }, + { + "epoch": 0.7792330755878176, + "grad_norm": 0.5824918150901794, + "learning_rate": 4.235095094996047e-06, + "loss": 0.5197, + "step": 4938 + }, + { + "epoch": 0.7793908789648099, + "grad_norm": 0.5974700450897217, + "learning_rate": 4.2347960714022826e-06, + "loss": 0.5828, + "step": 4939 + }, + { + "epoch": 0.7795486823418021, + "grad_norm": 0.6036062836647034, + "learning_rate": 4.234496999932374e-06, + "loss": 0.6187, + "step": 4940 + }, + { + "epoch": 0.7797064857187944, + "grad_norm": 0.5754619836807251, + "learning_rate": 4.234197880594571e-06, + "loss": 0.5465, + "step": 4941 + }, + { + "epoch": 0.7798642890957866, + "grad_norm": 0.5878472924232483, + "learning_rate": 4.233898713397132e-06, + "loss": 0.5241, + "step": 4942 + }, + { + "epoch": 0.7800220924727789, + "grad_norm": 0.6233806014060974, + "learning_rate": 4.233599498348311e-06, + "loss": 0.5763, + "step": 4943 + }, + { + "epoch": 0.7801798958497712, + "grad_norm": 0.5804175734519958, + "learning_rate": 4.233300235456366e-06, + "loss": 0.5683, + "step": 4944 + }, + { + "epoch": 0.7803376992267634, + "grad_norm": 0.613243818283081, + "learning_rate": 4.233000924729557e-06, + "loss": 0.5832, + "step": 4945 + }, + { + "epoch": 0.7804955026037558, + "grad_norm": 0.6031191349029541, + "learning_rate": 4.232701566176143e-06, + "loss": 0.583, + "step": 4946 + }, + { + "epoch": 0.780653305980748, + "grad_norm": 0.5861493349075317, + "learning_rate": 4.232402159804385e-06, + "loss": 0.5839, + "step": 4947 + }, + { + "epoch": 0.7808111093577402, + "grad_norm": 0.6196667551994324, + "learning_rate": 4.2321027056225485e-06, + "loss": 0.584, + "step": 4948 + }, + { + "epoch": 0.7809689127347326, + "grad_norm": 0.5623810887336731, + "learning_rate": 4.231803203638895e-06, + "loss": 0.5812, + "step": 4949 + }, + { + "epoch": 0.7811267161117248, + "grad_norm": 0.5912924408912659, + "learning_rate": 4.2315036538616905e-06, + "loss": 0.5501, + "step": 4950 + }, + { + "epoch": 0.781284519488717, + "grad_norm": 0.5935512781143188, + "learning_rate": 4.231204056299204e-06, + "loss": 0.5638, + "step": 4951 + }, + { + "epoch": 0.7814423228657094, + "grad_norm": 0.597786009311676, + "learning_rate": 4.230904410959701e-06, + "loss": 0.5976, + "step": 4952 + }, + { + "epoch": 0.7816001262427016, + "grad_norm": 0.5537287592887878, + "learning_rate": 4.230604717851453e-06, + "loss": 0.5653, + "step": 4953 + }, + { + "epoch": 0.7817579296196938, + "grad_norm": 0.6143960952758789, + "learning_rate": 4.230304976982729e-06, + "loss": 0.5662, + "step": 4954 + }, + { + "epoch": 0.7819157329966862, + "grad_norm": 0.6030535101890564, + "learning_rate": 4.230005188361802e-06, + "loss": 0.535, + "step": 4955 + }, + { + "epoch": 0.7820735363736784, + "grad_norm": 0.5799479484558105, + "learning_rate": 4.229705351996946e-06, + "loss": 0.5771, + "step": 4956 + }, + { + "epoch": 0.7822313397506707, + "grad_norm": 0.5522257089614868, + "learning_rate": 4.229405467896435e-06, + "loss": 0.5491, + "step": 4957 + }, + { + "epoch": 0.782389143127663, + "grad_norm": 0.6067962646484375, + "learning_rate": 4.229105536068545e-06, + "loss": 0.5626, + "step": 4958 + }, + { + "epoch": 0.7825469465046552, + "grad_norm": 0.6627532243728638, + "learning_rate": 4.228805556521553e-06, + "loss": 0.5565, + "step": 4959 + }, + { + "epoch": 0.7827047498816475, + "grad_norm": 0.5919294357299805, + "learning_rate": 4.22850552926374e-06, + "loss": 0.5791, + "step": 4960 + }, + { + "epoch": 0.7828625532586397, + "grad_norm": 0.5931767225265503, + "learning_rate": 4.228205454303384e-06, + "loss": 0.5666, + "step": 4961 + }, + { + "epoch": 0.783020356635632, + "grad_norm": 0.5809425115585327, + "learning_rate": 4.227905331648766e-06, + "loss": 0.5452, + "step": 4962 + }, + { + "epoch": 0.7831781600126243, + "grad_norm": 0.5777758359909058, + "learning_rate": 4.22760516130817e-06, + "loss": 0.5767, + "step": 4963 + }, + { + "epoch": 0.7833359633896165, + "grad_norm": 0.5812849402427673, + "learning_rate": 4.227304943289878e-06, + "loss": 0.5552, + "step": 4964 + }, + { + "epoch": 0.7834937667666088, + "grad_norm": 0.5918716192245483, + "learning_rate": 4.227004677602178e-06, + "loss": 0.5953, + "step": 4965 + }, + { + "epoch": 0.7836515701436011, + "grad_norm": 0.586764931678772, + "learning_rate": 4.226704364253355e-06, + "loss": 0.588, + "step": 4966 + }, + { + "epoch": 0.7838093735205933, + "grad_norm": 0.5948788523674011, + "learning_rate": 4.226404003251698e-06, + "loss": 0.541, + "step": 4967 + }, + { + "epoch": 0.7839671768975857, + "grad_norm": 0.5692586898803711, + "learning_rate": 4.226103594605493e-06, + "loss": 0.5636, + "step": 4968 + }, + { + "epoch": 0.7841249802745779, + "grad_norm": 0.5730661749839783, + "learning_rate": 4.225803138323035e-06, + "loss": 0.5769, + "step": 4969 + }, + { + "epoch": 0.7842827836515701, + "grad_norm": 0.5992167592048645, + "learning_rate": 4.225502634412612e-06, + "loss": 0.6067, + "step": 4970 + }, + { + "epoch": 0.7844405870285625, + "grad_norm": 0.5915597081184387, + "learning_rate": 4.225202082882521e-06, + "loss": 0.5562, + "step": 4971 + }, + { + "epoch": 0.7845983904055547, + "grad_norm": 0.6250491738319397, + "learning_rate": 4.2249014837410526e-06, + "loss": 0.5755, + "step": 4972 + }, + { + "epoch": 0.7847561937825469, + "grad_norm": 0.5652005672454834, + "learning_rate": 4.224600836996506e-06, + "loss": 0.5664, + "step": 4973 + }, + { + "epoch": 0.7849139971595392, + "grad_norm": 0.5839101672172546, + "learning_rate": 4.224300142657176e-06, + "loss": 0.584, + "step": 4974 + }, + { + "epoch": 0.7850718005365315, + "grad_norm": 0.6259384751319885, + "learning_rate": 4.223999400731362e-06, + "loss": 0.5587, + "step": 4975 + }, + { + "epoch": 0.7852296039135237, + "grad_norm": 0.5831225514411926, + "learning_rate": 4.223698611227363e-06, + "loss": 0.5582, + "step": 4976 + }, + { + "epoch": 0.785387407290516, + "grad_norm": 0.5866628289222717, + "learning_rate": 4.223397774153481e-06, + "loss": 0.5836, + "step": 4977 + }, + { + "epoch": 0.7855452106675083, + "grad_norm": 0.550635576248169, + "learning_rate": 4.223096889518018e-06, + "loss": 0.5343, + "step": 4978 + }, + { + "epoch": 0.7857030140445006, + "grad_norm": 0.5480340719223022, + "learning_rate": 4.222795957329278e-06, + "loss": 0.5597, + "step": 4979 + }, + { + "epoch": 0.7858608174214928, + "grad_norm": 0.581298828125, + "learning_rate": 4.222494977595566e-06, + "loss": 0.6025, + "step": 4980 + }, + { + "epoch": 0.7860186207984851, + "grad_norm": 0.5858574509620667, + "learning_rate": 4.222193950325187e-06, + "loss": 0.5878, + "step": 4981 + }, + { + "epoch": 0.7861764241754774, + "grad_norm": 0.5857285261154175, + "learning_rate": 4.2218928755264495e-06, + "loss": 0.5633, + "step": 4982 + }, + { + "epoch": 0.7863342275524696, + "grad_norm": 0.5641104578971863, + "learning_rate": 4.221591753207663e-06, + "loss": 0.5377, + "step": 4983 + }, + { + "epoch": 0.7864920309294618, + "grad_norm": 0.6092286109924316, + "learning_rate": 4.221290583377137e-06, + "loss": 0.5598, + "step": 4984 + }, + { + "epoch": 0.7866498343064542, + "grad_norm": 0.5903373956680298, + "learning_rate": 4.220989366043184e-06, + "loss": 0.5989, + "step": 4985 + }, + { + "epoch": 0.7868076376834464, + "grad_norm": 0.5767266154289246, + "learning_rate": 4.2206881012141156e-06, + "loss": 0.5984, + "step": 4986 + }, + { + "epoch": 0.7869654410604386, + "grad_norm": 0.5907927751541138, + "learning_rate": 4.2203867888982465e-06, + "loss": 0.5777, + "step": 4987 + }, + { + "epoch": 0.787123244437431, + "grad_norm": 0.611310601234436, + "learning_rate": 4.220085429103893e-06, + "loss": 0.6101, + "step": 4988 + }, + { + "epoch": 0.7872810478144232, + "grad_norm": 0.5645692348480225, + "learning_rate": 4.21978402183937e-06, + "loss": 0.5866, + "step": 4989 + }, + { + "epoch": 0.7874388511914155, + "grad_norm": 0.560700535774231, + "learning_rate": 4.2194825671129976e-06, + "loss": 0.6018, + "step": 4990 + }, + { + "epoch": 0.7875966545684078, + "grad_norm": 0.5829430222511292, + "learning_rate": 4.219181064933093e-06, + "loss": 0.5682, + "step": 4991 + }, + { + "epoch": 0.7877544579454, + "grad_norm": 0.5773991942405701, + "learning_rate": 4.21887951530798e-06, + "loss": 0.544, + "step": 4992 + }, + { + "epoch": 0.7879122613223923, + "grad_norm": 0.5699395537376404, + "learning_rate": 4.218577918245979e-06, + "loss": 0.5832, + "step": 4993 + }, + { + "epoch": 0.7880700646993846, + "grad_norm": 0.6173931360244751, + "learning_rate": 4.218276273755412e-06, + "loss": 0.6043, + "step": 4994 + }, + { + "epoch": 0.7882278680763768, + "grad_norm": 0.5900959372520447, + "learning_rate": 4.217974581844605e-06, + "loss": 0.5909, + "step": 4995 + }, + { + "epoch": 0.7883856714533691, + "grad_norm": 0.565159797668457, + "learning_rate": 4.217672842521884e-06, + "loss": 0.5629, + "step": 4996 + }, + { + "epoch": 0.7885434748303614, + "grad_norm": 0.5694440007209778, + "learning_rate": 4.217371055795576e-06, + "loss": 0.5805, + "step": 4997 + }, + { + "epoch": 0.7887012782073536, + "grad_norm": 0.5794970393180847, + "learning_rate": 4.21706922167401e-06, + "loss": 0.5891, + "step": 4998 + }, + { + "epoch": 0.7888590815843459, + "grad_norm": 0.5908728837966919, + "learning_rate": 4.216767340165516e-06, + "loss": 0.5673, + "step": 4999 + }, + { + "epoch": 0.7890168849613381, + "grad_norm": 0.6038711071014404, + "learning_rate": 4.216465411278425e-06, + "loss": 0.5728, + "step": 5000 + }, + { + "epoch": 0.7891746883383305, + "grad_norm": 0.5933931469917297, + "learning_rate": 4.216163435021069e-06, + "loss": 0.5677, + "step": 5001 + }, + { + "epoch": 0.7893324917153227, + "grad_norm": 0.5467187762260437, + "learning_rate": 4.215861411401781e-06, + "loss": 0.5809, + "step": 5002 + }, + { + "epoch": 0.7894902950923149, + "grad_norm": 0.5760513544082642, + "learning_rate": 4.215559340428899e-06, + "loss": 0.5924, + "step": 5003 + }, + { + "epoch": 0.7896480984693073, + "grad_norm": 0.5983109474182129, + "learning_rate": 4.2152572221107565e-06, + "loss": 0.5502, + "step": 5004 + }, + { + "epoch": 0.7898059018462995, + "grad_norm": 0.5928536057472229, + "learning_rate": 4.214955056455693e-06, + "loss": 0.5847, + "step": 5005 + }, + { + "epoch": 0.7899637052232917, + "grad_norm": 0.5710873007774353, + "learning_rate": 4.214652843472047e-06, + "loss": 0.5569, + "step": 5006 + }, + { + "epoch": 0.7901215086002841, + "grad_norm": 0.5956944227218628, + "learning_rate": 4.214350583168158e-06, + "loss": 0.5648, + "step": 5007 + }, + { + "epoch": 0.7902793119772763, + "grad_norm": 0.5721518993377686, + "learning_rate": 4.2140482755523685e-06, + "loss": 0.5816, + "step": 5008 + }, + { + "epoch": 0.7904371153542685, + "grad_norm": 0.5741674900054932, + "learning_rate": 4.213745920633022e-06, + "loss": 0.5962, + "step": 5009 + }, + { + "epoch": 0.7905949187312609, + "grad_norm": 0.6046661734580994, + "learning_rate": 4.213443518418462e-06, + "loss": 0.6001, + "step": 5010 + }, + { + "epoch": 0.7907527221082531, + "grad_norm": 0.577896237373352, + "learning_rate": 4.213141068917034e-06, + "loss": 0.5707, + "step": 5011 + }, + { + "epoch": 0.7909105254852454, + "grad_norm": 0.5992209911346436, + "learning_rate": 4.212838572137085e-06, + "loss": 0.5513, + "step": 5012 + }, + { + "epoch": 0.7910683288622377, + "grad_norm": 0.5775250792503357, + "learning_rate": 4.212536028086963e-06, + "loss": 0.5682, + "step": 5013 + }, + { + "epoch": 0.7912261322392299, + "grad_norm": 0.6063390374183655, + "learning_rate": 4.212233436775018e-06, + "loss": 0.5392, + "step": 5014 + }, + { + "epoch": 0.7913839356162222, + "grad_norm": 0.5701256394386292, + "learning_rate": 4.2119307982096e-06, + "loss": 0.5465, + "step": 5015 + }, + { + "epoch": 0.7915417389932144, + "grad_norm": 0.6105396151542664, + "learning_rate": 4.2116281123990634e-06, + "loss": 0.5347, + "step": 5016 + }, + { + "epoch": 0.7916995423702067, + "grad_norm": 0.6233378052711487, + "learning_rate": 4.211325379351758e-06, + "loss": 0.5759, + "step": 5017 + }, + { + "epoch": 0.791857345747199, + "grad_norm": 0.6449397206306458, + "learning_rate": 4.21102259907604e-06, + "loss": 0.5474, + "step": 5018 + }, + { + "epoch": 0.7920151491241912, + "grad_norm": 0.5955415368080139, + "learning_rate": 4.2107197715802676e-06, + "loss": 0.552, + "step": 5019 + }, + { + "epoch": 0.7921729525011836, + "grad_norm": 0.5665522813796997, + "learning_rate": 4.2104168968727944e-06, + "loss": 0.5848, + "step": 5020 + }, + { + "epoch": 0.7923307558781758, + "grad_norm": 0.5877891778945923, + "learning_rate": 4.210113974961981e-06, + "loss": 0.5599, + "step": 5021 + }, + { + "epoch": 0.792488559255168, + "grad_norm": 0.5933160185813904, + "learning_rate": 4.2098110058561884e-06, + "loss": 0.5509, + "step": 5022 + }, + { + "epoch": 0.7926463626321604, + "grad_norm": 0.6250284314155579, + "learning_rate": 4.209507989563774e-06, + "loss": 0.5536, + "step": 5023 + }, + { + "epoch": 0.7928041660091526, + "grad_norm": 0.5874499082565308, + "learning_rate": 4.209204926093106e-06, + "loss": 0.5715, + "step": 5024 + }, + { + "epoch": 0.7929619693861448, + "grad_norm": 0.583080530166626, + "learning_rate": 4.2089018154525426e-06, + "loss": 0.5989, + "step": 5025 + }, + { + "epoch": 0.7931197727631372, + "grad_norm": 0.5925818085670471, + "learning_rate": 4.208598657650452e-06, + "loss": 0.5978, + "step": 5026 + }, + { + "epoch": 0.7932775761401294, + "grad_norm": 0.582291305065155, + "learning_rate": 4.208295452695201e-06, + "loss": 0.539, + "step": 5027 + }, + { + "epoch": 0.7934353795171216, + "grad_norm": 0.6108399629592896, + "learning_rate": 4.207992200595155e-06, + "loss": 0.5845, + "step": 5028 + }, + { + "epoch": 0.793593182894114, + "grad_norm": 0.5698696374893188, + "learning_rate": 4.207688901358684e-06, + "loss": 0.5605, + "step": 5029 + }, + { + "epoch": 0.7937509862711062, + "grad_norm": 0.5867390036582947, + "learning_rate": 4.207385554994159e-06, + "loss": 0.5482, + "step": 5030 + }, + { + "epoch": 0.7939087896480985, + "grad_norm": 0.5759236812591553, + "learning_rate": 4.207082161509951e-06, + "loss": 0.5302, + "step": 5031 + }, + { + "epoch": 0.7940665930250908, + "grad_norm": 0.580293595790863, + "learning_rate": 4.206778720914434e-06, + "loss": 0.6037, + "step": 5032 + }, + { + "epoch": 0.794224396402083, + "grad_norm": 0.5911422371864319, + "learning_rate": 4.206475233215981e-06, + "loss": 0.5637, + "step": 5033 + }, + { + "epoch": 0.7943821997790753, + "grad_norm": 0.5769272446632385, + "learning_rate": 4.206171698422968e-06, + "loss": 0.5436, + "step": 5034 + }, + { + "epoch": 0.7945400031560675, + "grad_norm": 0.5769444704055786, + "learning_rate": 4.205868116543771e-06, + "loss": 0.5569, + "step": 5035 + }, + { + "epoch": 0.7946978065330598, + "grad_norm": 0.553941011428833, + "learning_rate": 4.20556448758677e-06, + "loss": 0.5564, + "step": 5036 + }, + { + "epoch": 0.7948556099100521, + "grad_norm": 0.5440439581871033, + "learning_rate": 4.205260811560342e-06, + "loss": 0.5754, + "step": 5037 + }, + { + "epoch": 0.7950134132870443, + "grad_norm": 0.6087167263031006, + "learning_rate": 4.2049570884728694e-06, + "loss": 0.5424, + "step": 5038 + }, + { + "epoch": 0.7951712166640366, + "grad_norm": 0.5996675491333008, + "learning_rate": 4.204653318332734e-06, + "loss": 0.5959, + "step": 5039 + }, + { + "epoch": 0.7953290200410289, + "grad_norm": 0.6096466183662415, + "learning_rate": 4.204349501148319e-06, + "loss": 0.5683, + "step": 5040 + }, + { + "epoch": 0.7954868234180211, + "grad_norm": 0.6344251036643982, + "learning_rate": 4.204045636928009e-06, + "loss": 0.5657, + "step": 5041 + }, + { + "epoch": 0.7956446267950135, + "grad_norm": 0.5815714597702026, + "learning_rate": 4.203741725680189e-06, + "loss": 0.5779, + "step": 5042 + }, + { + "epoch": 0.7958024301720057, + "grad_norm": 0.6112968921661377, + "learning_rate": 4.2034377674132485e-06, + "loss": 0.5721, + "step": 5043 + }, + { + "epoch": 0.7959602335489979, + "grad_norm": 0.5725970268249512, + "learning_rate": 4.2031337621355735e-06, + "loss": 0.5439, + "step": 5044 + }, + { + "epoch": 0.7961180369259903, + "grad_norm": 0.6191085577011108, + "learning_rate": 4.202829709855555e-06, + "loss": 0.5759, + "step": 5045 + }, + { + "epoch": 0.7962758403029825, + "grad_norm": 0.604290783405304, + "learning_rate": 4.202525610581584e-06, + "loss": 0.5554, + "step": 5046 + }, + { + "epoch": 0.7964336436799747, + "grad_norm": 0.6251159310340881, + "learning_rate": 4.202221464322053e-06, + "loss": 0.541, + "step": 5047 + }, + { + "epoch": 0.796591447056967, + "grad_norm": 0.6107056736946106, + "learning_rate": 4.201917271085355e-06, + "loss": 0.578, + "step": 5048 + }, + { + "epoch": 0.7967492504339593, + "grad_norm": 0.6085303425788879, + "learning_rate": 4.201613030879887e-06, + "loss": 0.5788, + "step": 5049 + }, + { + "epoch": 0.7969070538109515, + "grad_norm": 0.601262092590332, + "learning_rate": 4.201308743714042e-06, + "loss": 0.572, + "step": 5050 + }, + { + "epoch": 0.7970648571879438, + "grad_norm": 0.5988460183143616, + "learning_rate": 4.201004409596221e-06, + "loss": 0.5249, + "step": 5051 + }, + { + "epoch": 0.7972226605649361, + "grad_norm": 0.5918310880661011, + "learning_rate": 4.20070002853482e-06, + "loss": 0.5556, + "step": 5052 + }, + { + "epoch": 0.7973804639419284, + "grad_norm": 0.5546604990959167, + "learning_rate": 4.200395600538241e-06, + "loss": 0.5349, + "step": 5053 + }, + { + "epoch": 0.7975382673189206, + "grad_norm": 0.5955120921134949, + "learning_rate": 4.200091125614885e-06, + "loss": 0.5461, + "step": 5054 + }, + { + "epoch": 0.7976960706959129, + "grad_norm": 0.5772300958633423, + "learning_rate": 4.1997866037731545e-06, + "loss": 0.589, + "step": 5055 + }, + { + "epoch": 0.7978538740729052, + "grad_norm": 0.5698021054267883, + "learning_rate": 4.199482035021454e-06, + "loss": 0.5423, + "step": 5056 + }, + { + "epoch": 0.7980116774498974, + "grad_norm": 0.6039370894432068, + "learning_rate": 4.199177419368188e-06, + "loss": 0.5829, + "step": 5057 + }, + { + "epoch": 0.7981694808268897, + "grad_norm": 0.5726944804191589, + "learning_rate": 4.198872756821765e-06, + "loss": 0.5595, + "step": 5058 + }, + { + "epoch": 0.798327284203882, + "grad_norm": 0.6009248495101929, + "learning_rate": 4.198568047390591e-06, + "loss": 0.5815, + "step": 5059 + }, + { + "epoch": 0.7984850875808742, + "grad_norm": 0.5747242569923401, + "learning_rate": 4.198263291083075e-06, + "loss": 0.5271, + "step": 5060 + }, + { + "epoch": 0.7986428909578664, + "grad_norm": 0.5567346811294556, + "learning_rate": 4.19795848790763e-06, + "loss": 0.5877, + "step": 5061 + }, + { + "epoch": 0.7988006943348588, + "grad_norm": 0.5757299065589905, + "learning_rate": 4.197653637872667e-06, + "loss": 0.5698, + "step": 5062 + }, + { + "epoch": 0.798958497711851, + "grad_norm": 0.6236106753349304, + "learning_rate": 4.197348740986597e-06, + "loss": 0.5631, + "step": 5063 + }, + { + "epoch": 0.7991163010888434, + "grad_norm": 0.618693470954895, + "learning_rate": 4.197043797257835e-06, + "loss": 0.5451, + "step": 5064 + }, + { + "epoch": 0.7992741044658356, + "grad_norm": 0.5904911160469055, + "learning_rate": 4.196738806694799e-06, + "loss": 0.5402, + "step": 5065 + }, + { + "epoch": 0.7994319078428278, + "grad_norm": 0.6026930809020996, + "learning_rate": 4.1964337693059044e-06, + "loss": 0.538, + "step": 5066 + }, + { + "epoch": 0.7995897112198201, + "grad_norm": 0.5748620629310608, + "learning_rate": 4.196128685099569e-06, + "loss": 0.5657, + "step": 5067 + }, + { + "epoch": 0.7997475145968124, + "grad_norm": 0.5963892936706543, + "learning_rate": 4.195823554084213e-06, + "loss": 0.5572, + "step": 5068 + }, + { + "epoch": 0.7999053179738046, + "grad_norm": 0.7505912184715271, + "learning_rate": 4.195518376268258e-06, + "loss": 0.5773, + "step": 5069 + }, + { + "epoch": 0.8000631213507969, + "grad_norm": 0.6201586723327637, + "learning_rate": 4.195213151660126e-06, + "loss": 0.5753, + "step": 5070 + }, + { + "epoch": 0.8002209247277892, + "grad_norm": 0.5922003984451294, + "learning_rate": 4.1949078802682385e-06, + "loss": 0.527, + "step": 5071 + }, + { + "epoch": 0.8003787281047814, + "grad_norm": 0.5967820882797241, + "learning_rate": 4.1946025621010225e-06, + "loss": 0.5998, + "step": 5072 + }, + { + "epoch": 0.8005365314817737, + "grad_norm": 0.5649139881134033, + "learning_rate": 4.1942971971669025e-06, + "loss": 0.5454, + "step": 5073 + }, + { + "epoch": 0.800694334858766, + "grad_norm": 0.5513618588447571, + "learning_rate": 4.193991785474307e-06, + "loss": 0.5305, + "step": 5074 + }, + { + "epoch": 0.8008521382357583, + "grad_norm": 0.5991997718811035, + "learning_rate": 4.193686327031664e-06, + "loss": 0.5719, + "step": 5075 + }, + { + "epoch": 0.8010099416127505, + "grad_norm": 0.5987465977668762, + "learning_rate": 4.193380821847404e-06, + "loss": 0.5797, + "step": 5076 + }, + { + "epoch": 0.8011677449897427, + "grad_norm": 0.566568911075592, + "learning_rate": 4.193075269929958e-06, + "loss": 0.5974, + "step": 5077 + }, + { + "epoch": 0.8013255483667351, + "grad_norm": 0.5947042107582092, + "learning_rate": 4.192769671287757e-06, + "loss": 0.5779, + "step": 5078 + }, + { + "epoch": 0.8014833517437273, + "grad_norm": 0.5941854119300842, + "learning_rate": 4.192464025929237e-06, + "loss": 0.5916, + "step": 5079 + }, + { + "epoch": 0.8016411551207195, + "grad_norm": 0.5819000601768494, + "learning_rate": 4.192158333862832e-06, + "loss": 0.5659, + "step": 5080 + }, + { + "epoch": 0.8017989584977119, + "grad_norm": 0.5815088152885437, + "learning_rate": 4.191852595096978e-06, + "loss": 0.531, + "step": 5081 + }, + { + "epoch": 0.8019567618747041, + "grad_norm": 0.579521119594574, + "learning_rate": 4.191546809640113e-06, + "loss": 0.5952, + "step": 5082 + }, + { + "epoch": 0.8021145652516963, + "grad_norm": 0.6125339865684509, + "learning_rate": 4.191240977500677e-06, + "loss": 0.5931, + "step": 5083 + }, + { + "epoch": 0.8022723686286887, + "grad_norm": 0.5795685648918152, + "learning_rate": 4.190935098687108e-06, + "loss": 0.575, + "step": 5084 + }, + { + "epoch": 0.8024301720056809, + "grad_norm": 0.5888353586196899, + "learning_rate": 4.19062917320785e-06, + "loss": 0.578, + "step": 5085 + }, + { + "epoch": 0.8025879753826732, + "grad_norm": 0.5778366327285767, + "learning_rate": 4.190323201071343e-06, + "loss": 0.55, + "step": 5086 + }, + { + "epoch": 0.8027457787596655, + "grad_norm": 0.6165342330932617, + "learning_rate": 4.190017182286033e-06, + "loss": 0.5512, + "step": 5087 + }, + { + "epoch": 0.8029035821366577, + "grad_norm": 0.5895323753356934, + "learning_rate": 4.189711116860367e-06, + "loss": 0.5595, + "step": 5088 + }, + { + "epoch": 0.80306138551365, + "grad_norm": 0.6076581478118896, + "learning_rate": 4.1894050048027865e-06, + "loss": 0.5842, + "step": 5089 + }, + { + "epoch": 0.8032191888906423, + "grad_norm": 0.5775537490844727, + "learning_rate": 4.189098846121745e-06, + "loss": 0.5872, + "step": 5090 + }, + { + "epoch": 0.8033769922676345, + "grad_norm": 0.5869188904762268, + "learning_rate": 4.1887926408256886e-06, + "loss": 0.5777, + "step": 5091 + }, + { + "epoch": 0.8035347956446268, + "grad_norm": 0.5549836754798889, + "learning_rate": 4.1884863889230685e-06, + "loss": 0.559, + "step": 5092 + }, + { + "epoch": 0.803692599021619, + "grad_norm": 0.5979322195053101, + "learning_rate": 4.188180090422337e-06, + "loss": 0.5874, + "step": 5093 + }, + { + "epoch": 0.8038504023986114, + "grad_norm": 0.5774111151695251, + "learning_rate": 4.187873745331946e-06, + "loss": 0.5867, + "step": 5094 + }, + { + "epoch": 0.8040082057756036, + "grad_norm": 0.5907607078552246, + "learning_rate": 4.187567353660352e-06, + "loss": 0.599, + "step": 5095 + }, + { + "epoch": 0.8041660091525958, + "grad_norm": 0.5928268432617188, + "learning_rate": 4.187260915416008e-06, + "loss": 0.5422, + "step": 5096 + }, + { + "epoch": 0.8043238125295882, + "grad_norm": 0.5656542778015137, + "learning_rate": 4.1869544306073726e-06, + "loss": 0.5676, + "step": 5097 + }, + { + "epoch": 0.8044816159065804, + "grad_norm": 0.5951868295669556, + "learning_rate": 4.186647899242904e-06, + "loss": 0.5816, + "step": 5098 + }, + { + "epoch": 0.8046394192835726, + "grad_norm": 0.5742824673652649, + "learning_rate": 4.1863413213310615e-06, + "loss": 0.5577, + "step": 5099 + }, + { + "epoch": 0.804797222660565, + "grad_norm": 0.6118476986885071, + "learning_rate": 4.186034696880305e-06, + "loss": 0.6083, + "step": 5100 + }, + { + "epoch": 0.8049550260375572, + "grad_norm": 0.6030288934707642, + "learning_rate": 4.185728025899098e-06, + "loss": 0.5822, + "step": 5101 + }, + { + "epoch": 0.8051128294145494, + "grad_norm": 0.5788543224334717, + "learning_rate": 4.185421308395903e-06, + "loss": 0.5731, + "step": 5102 + }, + { + "epoch": 0.8052706327915418, + "grad_norm": 0.6168650388717651, + "learning_rate": 4.1851145443791855e-06, + "loss": 0.5611, + "step": 5103 + }, + { + "epoch": 0.805428436168534, + "grad_norm": 0.59797203540802, + "learning_rate": 4.184807733857411e-06, + "loss": 0.5708, + "step": 5104 + }, + { + "epoch": 0.8055862395455263, + "grad_norm": 0.6146509647369385, + "learning_rate": 4.184500876839046e-06, + "loss": 0.5342, + "step": 5105 + }, + { + "epoch": 0.8057440429225186, + "grad_norm": 0.5964401960372925, + "learning_rate": 4.184193973332559e-06, + "loss": 0.5975, + "step": 5106 + }, + { + "epoch": 0.8059018462995108, + "grad_norm": 0.568360447883606, + "learning_rate": 4.183887023346421e-06, + "loss": 0.6056, + "step": 5107 + }, + { + "epoch": 0.8060596496765031, + "grad_norm": 0.6098117828369141, + "learning_rate": 4.183580026889103e-06, + "loss": 0.5723, + "step": 5108 + }, + { + "epoch": 0.8062174530534953, + "grad_norm": 0.6029707789421082, + "learning_rate": 4.183272983969077e-06, + "loss": 0.5522, + "step": 5109 + }, + { + "epoch": 0.8063752564304876, + "grad_norm": 0.5569913983345032, + "learning_rate": 4.182965894594815e-06, + "loss": 0.5629, + "step": 5110 + }, + { + "epoch": 0.8065330598074799, + "grad_norm": 0.6313096284866333, + "learning_rate": 4.182658758774795e-06, + "loss": 0.6198, + "step": 5111 + }, + { + "epoch": 0.8066908631844721, + "grad_norm": 0.6167113184928894, + "learning_rate": 4.18235157651749e-06, + "loss": 0.5769, + "step": 5112 + }, + { + "epoch": 0.8068486665614644, + "grad_norm": 0.5768113732337952, + "learning_rate": 4.18204434783138e-06, + "loss": 0.5827, + "step": 5113 + }, + { + "epoch": 0.8070064699384567, + "grad_norm": 0.5707524418830872, + "learning_rate": 4.181737072724943e-06, + "loss": 0.5735, + "step": 5114 + }, + { + "epoch": 0.8071642733154489, + "grad_norm": 0.5693761706352234, + "learning_rate": 4.181429751206658e-06, + "loss": 0.5726, + "step": 5115 + }, + { + "epoch": 0.8073220766924413, + "grad_norm": 0.5878226161003113, + "learning_rate": 4.181122383285008e-06, + "loss": 0.5781, + "step": 5116 + }, + { + "epoch": 0.8074798800694335, + "grad_norm": 0.6207099556922913, + "learning_rate": 4.180814968968475e-06, + "loss": 0.5759, + "step": 5117 + }, + { + "epoch": 0.8076376834464257, + "grad_norm": 0.6057984232902527, + "learning_rate": 4.180507508265542e-06, + "loss": 0.549, + "step": 5118 + }, + { + "epoch": 0.8077954868234181, + "grad_norm": 0.6211122274398804, + "learning_rate": 4.180200001184695e-06, + "loss": 0.5839, + "step": 5119 + }, + { + "epoch": 0.8079532902004103, + "grad_norm": 0.5843925476074219, + "learning_rate": 4.1798924477344206e-06, + "loss": 0.5721, + "step": 5120 + }, + { + "epoch": 0.8081110935774025, + "grad_norm": 0.5705032348632812, + "learning_rate": 4.179584847923206e-06, + "loss": 0.5574, + "step": 5121 + }, + { + "epoch": 0.8082688969543949, + "grad_norm": 0.579209566116333, + "learning_rate": 4.179277201759541e-06, + "loss": 0.5556, + "step": 5122 + }, + { + "epoch": 0.8084267003313871, + "grad_norm": 0.6185855865478516, + "learning_rate": 4.178969509251915e-06, + "loss": 0.5937, + "step": 5123 + }, + { + "epoch": 0.8085845037083793, + "grad_norm": 0.6045500636100769, + "learning_rate": 4.178661770408819e-06, + "loss": 0.6429, + "step": 5124 + }, + { + "epoch": 0.8087423070853716, + "grad_norm": 0.615962564945221, + "learning_rate": 4.178353985238747e-06, + "loss": 0.5509, + "step": 5125 + }, + { + "epoch": 0.8089001104623639, + "grad_norm": 0.5749959945678711, + "learning_rate": 4.178046153750194e-06, + "loss": 0.5549, + "step": 5126 + }, + { + "epoch": 0.8090579138393562, + "grad_norm": 0.6165571212768555, + "learning_rate": 4.177738275951652e-06, + "loss": 0.542, + "step": 5127 + }, + { + "epoch": 0.8092157172163484, + "grad_norm": 0.5645392537117004, + "learning_rate": 4.177430351851623e-06, + "loss": 0.5386, + "step": 5128 + }, + { + "epoch": 0.8093735205933407, + "grad_norm": 0.6100903153419495, + "learning_rate": 4.177122381458599e-06, + "loss": 0.5766, + "step": 5129 + }, + { + "epoch": 0.809531323970333, + "grad_norm": 0.5915371775627136, + "learning_rate": 4.176814364781084e-06, + "loss": 0.5768, + "step": 5130 + }, + { + "epoch": 0.8096891273473252, + "grad_norm": 0.5757115483283997, + "learning_rate": 4.176506301827576e-06, + "loss": 0.5629, + "step": 5131 + }, + { + "epoch": 0.8098469307243175, + "grad_norm": 0.5952584147453308, + "learning_rate": 4.176198192606577e-06, + "loss": 0.5949, + "step": 5132 + }, + { + "epoch": 0.8100047341013098, + "grad_norm": 0.5969513654708862, + "learning_rate": 4.175890037126591e-06, + "loss": 0.5608, + "step": 5133 + }, + { + "epoch": 0.810162537478302, + "grad_norm": 0.5948978066444397, + "learning_rate": 4.175581835396122e-06, + "loss": 0.5907, + "step": 5134 + }, + { + "epoch": 0.8103203408552943, + "grad_norm": 0.5988296270370483, + "learning_rate": 4.1752735874236745e-06, + "loss": 0.565, + "step": 5135 + }, + { + "epoch": 0.8104781442322866, + "grad_norm": 0.6162803173065186, + "learning_rate": 4.174965293217757e-06, + "loss": 0.5819, + "step": 5136 + }, + { + "epoch": 0.8106359476092788, + "grad_norm": 0.5726349949836731, + "learning_rate": 4.174656952786877e-06, + "loss": 0.5468, + "step": 5137 + }, + { + "epoch": 0.8107937509862712, + "grad_norm": 0.5835108757019043, + "learning_rate": 4.174348566139544e-06, + "loss": 0.5864, + "step": 5138 + }, + { + "epoch": 0.8109515543632634, + "grad_norm": 0.5961035490036011, + "learning_rate": 4.1740401332842685e-06, + "loss": 0.5769, + "step": 5139 + }, + { + "epoch": 0.8111093577402556, + "grad_norm": 0.5864094495773315, + "learning_rate": 4.173731654229562e-06, + "loss": 0.5466, + "step": 5140 + }, + { + "epoch": 0.811267161117248, + "grad_norm": 0.5887922644615173, + "learning_rate": 4.173423128983939e-06, + "loss": 0.5355, + "step": 5141 + }, + { + "epoch": 0.8114249644942402, + "grad_norm": 0.5719665288925171, + "learning_rate": 4.173114557555914e-06, + "loss": 0.5635, + "step": 5142 + }, + { + "epoch": 0.8115827678712324, + "grad_norm": 0.5980278849601746, + "learning_rate": 4.172805939954002e-06, + "loss": 0.5905, + "step": 5143 + }, + { + "epoch": 0.8117405712482247, + "grad_norm": 0.5861755013465881, + "learning_rate": 4.17249727618672e-06, + "loss": 0.5621, + "step": 5144 + }, + { + "epoch": 0.811898374625217, + "grad_norm": 0.5940035581588745, + "learning_rate": 4.172188566262587e-06, + "loss": 0.5783, + "step": 5145 + }, + { + "epoch": 0.8120561780022092, + "grad_norm": 0.6159669160842896, + "learning_rate": 4.171879810190122e-06, + "loss": 0.5701, + "step": 5146 + }, + { + "epoch": 0.8122139813792015, + "grad_norm": 0.5991894006729126, + "learning_rate": 4.171571007977847e-06, + "loss": 0.5621, + "step": 5147 + }, + { + "epoch": 0.8123717847561938, + "grad_norm": 0.5843861699104309, + "learning_rate": 4.171262159634283e-06, + "loss": 0.5706, + "step": 5148 + }, + { + "epoch": 0.8125295881331861, + "grad_norm": 0.6173120737075806, + "learning_rate": 4.170953265167954e-06, + "loss": 0.5691, + "step": 5149 + }, + { + "epoch": 0.8126873915101783, + "grad_norm": 0.5769199132919312, + "learning_rate": 4.170644324587384e-06, + "loss": 0.5418, + "step": 5150 + }, + { + "epoch": 0.8128451948871706, + "grad_norm": 0.5659658908843994, + "learning_rate": 4.170335337901101e-06, + "loss": 0.5729, + "step": 5151 + }, + { + "epoch": 0.8130029982641629, + "grad_norm": 0.6152730584144592, + "learning_rate": 4.170026305117631e-06, + "loss": 0.516, + "step": 5152 + }, + { + "epoch": 0.8131608016411551, + "grad_norm": 0.5868449807167053, + "learning_rate": 4.169717226245501e-06, + "loss": 0.6099, + "step": 5153 + }, + { + "epoch": 0.8133186050181473, + "grad_norm": 0.6017915606498718, + "learning_rate": 4.1694081012932436e-06, + "loss": 0.5897, + "step": 5154 + }, + { + "epoch": 0.8134764083951397, + "grad_norm": 0.5541865229606628, + "learning_rate": 4.169098930269388e-06, + "loss": 0.5538, + "step": 5155 + }, + { + "epoch": 0.8136342117721319, + "grad_norm": 0.5925980806350708, + "learning_rate": 4.168789713182468e-06, + "loss": 0.5578, + "step": 5156 + }, + { + "epoch": 0.8137920151491241, + "grad_norm": 0.5703408718109131, + "learning_rate": 4.168480450041016e-06, + "loss": 0.5341, + "step": 5157 + }, + { + "epoch": 0.8139498185261165, + "grad_norm": 0.578650176525116, + "learning_rate": 4.168171140853567e-06, + "loss": 0.5361, + "step": 5158 + }, + { + "epoch": 0.8141076219031087, + "grad_norm": 0.5746230483055115, + "learning_rate": 4.167861785628658e-06, + "loss": 0.5609, + "step": 5159 + }, + { + "epoch": 0.814265425280101, + "grad_norm": 0.5742037296295166, + "learning_rate": 4.167552384374826e-06, + "loss": 0.5397, + "step": 5160 + }, + { + "epoch": 0.8144232286570933, + "grad_norm": 0.5586718320846558, + "learning_rate": 4.16724293710061e-06, + "loss": 0.5823, + "step": 5161 + }, + { + "epoch": 0.8145810320340855, + "grad_norm": 0.5615296363830566, + "learning_rate": 4.166933443814549e-06, + "loss": 0.5747, + "step": 5162 + }, + { + "epoch": 0.8147388354110778, + "grad_norm": 0.5989576578140259, + "learning_rate": 4.166623904525185e-06, + "loss": 0.5633, + "step": 5163 + }, + { + "epoch": 0.8148966387880701, + "grad_norm": 0.5993561148643494, + "learning_rate": 4.166314319241061e-06, + "loss": 0.5747, + "step": 5164 + }, + { + "epoch": 0.8150544421650623, + "grad_norm": 0.5663978457450867, + "learning_rate": 4.16600468797072e-06, + "loss": 0.5822, + "step": 5165 + }, + { + "epoch": 0.8152122455420546, + "grad_norm": 0.5634768009185791, + "learning_rate": 4.165695010722708e-06, + "loss": 0.545, + "step": 5166 + }, + { + "epoch": 0.8153700489190469, + "grad_norm": 0.5771138072013855, + "learning_rate": 4.16538528750557e-06, + "loss": 0.5447, + "step": 5167 + }, + { + "epoch": 0.8155278522960392, + "grad_norm": 0.5983452200889587, + "learning_rate": 4.165075518327855e-06, + "loss": 0.5579, + "step": 5168 + }, + { + "epoch": 0.8156856556730314, + "grad_norm": 0.5711935758590698, + "learning_rate": 4.16476570319811e-06, + "loss": 0.5815, + "step": 5169 + }, + { + "epoch": 0.8158434590500236, + "grad_norm": 0.5631974339485168, + "learning_rate": 4.164455842124887e-06, + "loss": 0.5426, + "step": 5170 + }, + { + "epoch": 0.816001262427016, + "grad_norm": 0.5957891345024109, + "learning_rate": 4.164145935116737e-06, + "loss": 0.587, + "step": 5171 + }, + { + "epoch": 0.8161590658040082, + "grad_norm": 0.588132917881012, + "learning_rate": 4.163835982182211e-06, + "loss": 0.5815, + "step": 5172 + }, + { + "epoch": 0.8163168691810004, + "grad_norm": 0.5753324031829834, + "learning_rate": 4.163525983329867e-06, + "loss": 0.551, + "step": 5173 + }, + { + "epoch": 0.8164746725579928, + "grad_norm": 0.5895517468452454, + "learning_rate": 4.163215938568256e-06, + "loss": 0.5368, + "step": 5174 + }, + { + "epoch": 0.816632475934985, + "grad_norm": 0.5795072317123413, + "learning_rate": 4.162905847905936e-06, + "loss": 0.5592, + "step": 5175 + }, + { + "epoch": 0.8167902793119772, + "grad_norm": 0.5526486039161682, + "learning_rate": 4.1625957113514644e-06, + "loss": 0.5637, + "step": 5176 + }, + { + "epoch": 0.8169480826889696, + "grad_norm": 0.6028631329536438, + "learning_rate": 4.162285528913401e-06, + "loss": 0.5908, + "step": 5177 + }, + { + "epoch": 0.8171058860659618, + "grad_norm": 0.5674046277999878, + "learning_rate": 4.161975300600306e-06, + "loss": 0.5571, + "step": 5178 + }, + { + "epoch": 0.8172636894429541, + "grad_norm": 0.5871782898902893, + "learning_rate": 4.161665026420741e-06, + "loss": 0.5679, + "step": 5179 + }, + { + "epoch": 0.8174214928199464, + "grad_norm": 0.6006730198860168, + "learning_rate": 4.161354706383268e-06, + "loss": 0.5958, + "step": 5180 + }, + { + "epoch": 0.8175792961969386, + "grad_norm": 0.5682645440101624, + "learning_rate": 4.161044340496452e-06, + "loss": 0.5859, + "step": 5181 + }, + { + "epoch": 0.8177370995739309, + "grad_norm": 0.6471999287605286, + "learning_rate": 4.160733928768857e-06, + "loss": 0.5338, + "step": 5182 + }, + { + "epoch": 0.8178949029509232, + "grad_norm": 0.5902939438819885, + "learning_rate": 4.16042347120905e-06, + "loss": 0.5822, + "step": 5183 + }, + { + "epoch": 0.8180527063279154, + "grad_norm": 0.636600136756897, + "learning_rate": 4.160112967825602e-06, + "loss": 0.6225, + "step": 5184 + }, + { + "epoch": 0.8182105097049077, + "grad_norm": 0.5808992385864258, + "learning_rate": 4.159802418627077e-06, + "loss": 0.5731, + "step": 5185 + }, + { + "epoch": 0.8183683130819, + "grad_norm": 0.5817760229110718, + "learning_rate": 4.1594918236220485e-06, + "loss": 0.5856, + "step": 5186 + }, + { + "epoch": 0.8185261164588922, + "grad_norm": 0.5686932802200317, + "learning_rate": 4.159181182819087e-06, + "loss": 0.5851, + "step": 5187 + }, + { + "epoch": 0.8186839198358845, + "grad_norm": 0.5998406410217285, + "learning_rate": 4.1588704962267675e-06, + "loss": 0.571, + "step": 5188 + }, + { + "epoch": 0.8188417232128767, + "grad_norm": 0.5951674580574036, + "learning_rate": 4.158559763853662e-06, + "loss": 0.5791, + "step": 5189 + }, + { + "epoch": 0.8189995265898691, + "grad_norm": 0.5799797177314758, + "learning_rate": 4.158248985708346e-06, + "loss": 0.5659, + "step": 5190 + }, + { + "epoch": 0.8191573299668613, + "grad_norm": 0.5874243378639221, + "learning_rate": 4.157938161799398e-06, + "loss": 0.5756, + "step": 5191 + }, + { + "epoch": 0.8193151333438535, + "grad_norm": 0.594194769859314, + "learning_rate": 4.157627292135394e-06, + "loss": 0.547, + "step": 5192 + }, + { + "epoch": 0.8194729367208459, + "grad_norm": 0.583528995513916, + "learning_rate": 4.157316376724915e-06, + "loss": 0.5888, + "step": 5193 + }, + { + "epoch": 0.8196307400978381, + "grad_norm": 0.5927237272262573, + "learning_rate": 4.15700541557654e-06, + "loss": 0.5698, + "step": 5194 + }, + { + "epoch": 0.8197885434748303, + "grad_norm": 0.6180377006530762, + "learning_rate": 4.15669440869885e-06, + "loss": 0.5359, + "step": 5195 + }, + { + "epoch": 0.8199463468518227, + "grad_norm": 0.5686683058738708, + "learning_rate": 4.156383356100431e-06, + "loss": 0.566, + "step": 5196 + }, + { + "epoch": 0.8201041502288149, + "grad_norm": 0.5919315218925476, + "learning_rate": 4.156072257789866e-06, + "loss": 0.5988, + "step": 5197 + }, + { + "epoch": 0.8202619536058071, + "grad_norm": 0.624677300453186, + "learning_rate": 4.155761113775739e-06, + "loss": 0.576, + "step": 5198 + }, + { + "epoch": 0.8204197569827995, + "grad_norm": 0.549700915813446, + "learning_rate": 4.155449924066639e-06, + "loss": 0.5614, + "step": 5199 + }, + { + "epoch": 0.8205775603597917, + "grad_norm": 0.6061583757400513, + "learning_rate": 4.155138688671153e-06, + "loss": 0.569, + "step": 5200 + }, + { + "epoch": 0.820735363736784, + "grad_norm": 0.5630302429199219, + "learning_rate": 4.15482740759787e-06, + "loss": 0.5664, + "step": 5201 + }, + { + "epoch": 0.8208931671137762, + "grad_norm": 0.6100384593009949, + "learning_rate": 4.154516080855382e-06, + "loss": 0.5666, + "step": 5202 + }, + { + "epoch": 0.8210509704907685, + "grad_norm": 0.6106812357902527, + "learning_rate": 4.154204708452279e-06, + "loss": 0.5386, + "step": 5203 + }, + { + "epoch": 0.8212087738677608, + "grad_norm": 0.5883539319038391, + "learning_rate": 4.153893290397155e-06, + "loss": 0.5346, + "step": 5204 + }, + { + "epoch": 0.821366577244753, + "grad_norm": 0.5555075407028198, + "learning_rate": 4.153581826698605e-06, + "loss": 0.5864, + "step": 5205 + }, + { + "epoch": 0.8215243806217453, + "grad_norm": 0.6131466627120972, + "learning_rate": 4.153270317365224e-06, + "loss": 0.5623, + "step": 5206 + }, + { + "epoch": 0.8216821839987376, + "grad_norm": 0.5789414048194885, + "learning_rate": 4.1529587624056085e-06, + "loss": 0.5679, + "step": 5207 + }, + { + "epoch": 0.8218399873757298, + "grad_norm": 0.5808063745498657, + "learning_rate": 4.1526471618283586e-06, + "loss": 0.5642, + "step": 5208 + }, + { + "epoch": 0.8219977907527221, + "grad_norm": 0.6149489879608154, + "learning_rate": 4.152335515642071e-06, + "loss": 0.5818, + "step": 5209 + }, + { + "epoch": 0.8221555941297144, + "grad_norm": 0.5525789260864258, + "learning_rate": 4.152023823855348e-06, + "loss": 0.56, + "step": 5210 + }, + { + "epoch": 0.8223133975067066, + "grad_norm": 0.5972962379455566, + "learning_rate": 4.15171208647679e-06, + "loss": 0.5624, + "step": 5211 + }, + { + "epoch": 0.822471200883699, + "grad_norm": 0.5654304027557373, + "learning_rate": 4.151400303515003e-06, + "loss": 0.5954, + "step": 5212 + }, + { + "epoch": 0.8226290042606912, + "grad_norm": 0.5958938002586365, + "learning_rate": 4.151088474978589e-06, + "loss": 0.5748, + "step": 5213 + }, + { + "epoch": 0.8227868076376834, + "grad_norm": 0.6042104363441467, + "learning_rate": 4.150776600876155e-06, + "loss": 0.5688, + "step": 5214 + }, + { + "epoch": 0.8229446110146758, + "grad_norm": 0.586826741695404, + "learning_rate": 4.150464681216307e-06, + "loss": 0.5652, + "step": 5215 + }, + { + "epoch": 0.823102414391668, + "grad_norm": 0.5875826478004456, + "learning_rate": 4.1501527160076535e-06, + "loss": 0.5378, + "step": 5216 + }, + { + "epoch": 0.8232602177686602, + "grad_norm": 0.5649951696395874, + "learning_rate": 4.149840705258804e-06, + "loss": 0.5993, + "step": 5217 + }, + { + "epoch": 0.8234180211456525, + "grad_norm": 0.5563952922821045, + "learning_rate": 4.149528648978371e-06, + "loss": 0.5775, + "step": 5218 + }, + { + "epoch": 0.8235758245226448, + "grad_norm": 0.5822829604148865, + "learning_rate": 4.149216547174963e-06, + "loss": 0.5454, + "step": 5219 + }, + { + "epoch": 0.823733627899637, + "grad_norm": 0.5899060368537903, + "learning_rate": 4.148904399857196e-06, + "loss": 0.5853, + "step": 5220 + }, + { + "epoch": 0.8238914312766293, + "grad_norm": 0.5801687836647034, + "learning_rate": 4.1485922070336846e-06, + "loss": 0.5692, + "step": 5221 + }, + { + "epoch": 0.8240492346536216, + "grad_norm": 0.5604512691497803, + "learning_rate": 4.148279968713042e-06, + "loss": 0.5865, + "step": 5222 + }, + { + "epoch": 0.8242070380306139, + "grad_norm": 0.5684986710548401, + "learning_rate": 4.147967684903888e-06, + "loss": 0.5399, + "step": 5223 + }, + { + "epoch": 0.8243648414076061, + "grad_norm": 0.5964588522911072, + "learning_rate": 4.147655355614839e-06, + "loss": 0.58, + "step": 5224 + }, + { + "epoch": 0.8245226447845984, + "grad_norm": 0.5843433737754822, + "learning_rate": 4.147342980854516e-06, + "loss": 0.5735, + "step": 5225 + }, + { + "epoch": 0.8246804481615907, + "grad_norm": 0.6098415851593018, + "learning_rate": 4.147030560631539e-06, + "loss": 0.5784, + "step": 5226 + }, + { + "epoch": 0.8248382515385829, + "grad_norm": 0.6341264843940735, + "learning_rate": 4.146718094954529e-06, + "loss": 0.5439, + "step": 5227 + }, + { + "epoch": 0.8249960549155751, + "grad_norm": 0.6028715968132019, + "learning_rate": 4.146405583832112e-06, + "loss": 0.5834, + "step": 5228 + }, + { + "epoch": 0.8251538582925675, + "grad_norm": 0.603470504283905, + "learning_rate": 4.146093027272909e-06, + "loss": 0.5663, + "step": 5229 + }, + { + "epoch": 0.8253116616695597, + "grad_norm": 0.6264022588729858, + "learning_rate": 4.145780425285549e-06, + "loss": 0.5766, + "step": 5230 + }, + { + "epoch": 0.8254694650465519, + "grad_norm": 0.570817232131958, + "learning_rate": 4.145467777878658e-06, + "loss": 0.5616, + "step": 5231 + }, + { + "epoch": 0.8256272684235443, + "grad_norm": 0.586862325668335, + "learning_rate": 4.145155085060863e-06, + "loss": 0.5886, + "step": 5232 + }, + { + "epoch": 0.8257850718005365, + "grad_norm": 0.6168842315673828, + "learning_rate": 4.144842346840795e-06, + "loss": 0.5739, + "step": 5233 + }, + { + "epoch": 0.8259428751775288, + "grad_norm": 0.5530939698219299, + "learning_rate": 4.144529563227084e-06, + "loss": 0.5946, + "step": 5234 + }, + { + "epoch": 0.8261006785545211, + "grad_norm": 0.6278512477874756, + "learning_rate": 4.1442167342283635e-06, + "loss": 0.5932, + "step": 5235 + }, + { + "epoch": 0.8262584819315133, + "grad_norm": 0.5609435439109802, + "learning_rate": 4.143903859853265e-06, + "loss": 0.5613, + "step": 5236 + }, + { + "epoch": 0.8264162853085056, + "grad_norm": 0.625562846660614, + "learning_rate": 4.1435909401104245e-06, + "loss": 0.5555, + "step": 5237 + }, + { + "epoch": 0.8265740886854979, + "grad_norm": 0.5909993648529053, + "learning_rate": 4.143277975008477e-06, + "loss": 0.546, + "step": 5238 + }, + { + "epoch": 0.8267318920624901, + "grad_norm": 0.551255464553833, + "learning_rate": 4.14296496455606e-06, + "loss": 0.5993, + "step": 5239 + }, + { + "epoch": 0.8268896954394824, + "grad_norm": 0.5510579347610474, + "learning_rate": 4.142651908761811e-06, + "loss": 0.5365, + "step": 5240 + }, + { + "epoch": 0.8270474988164747, + "grad_norm": 0.5898503661155701, + "learning_rate": 4.142338807634371e-06, + "loss": 0.5671, + "step": 5241 + }, + { + "epoch": 0.8272053021934669, + "grad_norm": 0.6199826598167419, + "learning_rate": 4.14202566118238e-06, + "loss": 0.5503, + "step": 5242 + }, + { + "epoch": 0.8273631055704592, + "grad_norm": 0.5912417769432068, + "learning_rate": 4.141712469414479e-06, + "loss": 0.5899, + "step": 5243 + }, + { + "epoch": 0.8275209089474515, + "grad_norm": 0.566356897354126, + "learning_rate": 4.141399232339313e-06, + "loss": 0.5448, + "step": 5244 + }, + { + "epoch": 0.8276787123244438, + "grad_norm": 0.5870254039764404, + "learning_rate": 4.141085949965527e-06, + "loss": 0.5802, + "step": 5245 + }, + { + "epoch": 0.827836515701436, + "grad_norm": 0.5999539494514465, + "learning_rate": 4.140772622301765e-06, + "loss": 0.5378, + "step": 5246 + }, + { + "epoch": 0.8279943190784282, + "grad_norm": 0.59329754114151, + "learning_rate": 4.140459249356675e-06, + "loss": 0.5429, + "step": 5247 + }, + { + "epoch": 0.8281521224554206, + "grad_norm": 0.5891883969306946, + "learning_rate": 4.140145831138904e-06, + "loss": 0.5859, + "step": 5248 + }, + { + "epoch": 0.8283099258324128, + "grad_norm": 0.5812464952468872, + "learning_rate": 4.139832367657105e-06, + "loss": 0.6067, + "step": 5249 + }, + { + "epoch": 0.828467729209405, + "grad_norm": 0.5879971981048584, + "learning_rate": 4.1395188589199256e-06, + "loss": 0.5691, + "step": 5250 + }, + { + "epoch": 0.8286255325863974, + "grad_norm": 0.6200550198554993, + "learning_rate": 4.1392053049360196e-06, + "loss": 0.5642, + "step": 5251 + }, + { + "epoch": 0.8287833359633896, + "grad_norm": 0.6029238700866699, + "learning_rate": 4.138891705714039e-06, + "loss": 0.5369, + "step": 5252 + }, + { + "epoch": 0.8289411393403819, + "grad_norm": 0.5869299173355103, + "learning_rate": 4.138578061262638e-06, + "loss": 0.5742, + "step": 5253 + }, + { + "epoch": 0.8290989427173742, + "grad_norm": 0.5549250245094299, + "learning_rate": 4.138264371590475e-06, + "loss": 0.5485, + "step": 5254 + }, + { + "epoch": 0.8292567460943664, + "grad_norm": 0.5618124604225159, + "learning_rate": 4.137950636706205e-06, + "loss": 0.5725, + "step": 5255 + }, + { + "epoch": 0.8294145494713587, + "grad_norm": 0.5836042165756226, + "learning_rate": 4.137636856618487e-06, + "loss": 0.5415, + "step": 5256 + }, + { + "epoch": 0.829572352848351, + "grad_norm": 0.576647937297821, + "learning_rate": 4.137323031335981e-06, + "loss": 0.5839, + "step": 5257 + }, + { + "epoch": 0.8297301562253432, + "grad_norm": 0.5617443323135376, + "learning_rate": 4.137009160867346e-06, + "loss": 0.5665, + "step": 5258 + }, + { + "epoch": 0.8298879596023355, + "grad_norm": 0.5986824631690979, + "learning_rate": 4.136695245221246e-06, + "loss": 0.5744, + "step": 5259 + }, + { + "epoch": 0.8300457629793278, + "grad_norm": 0.6154429316520691, + "learning_rate": 4.136381284406344e-06, + "loss": 0.5255, + "step": 5260 + }, + { + "epoch": 0.83020356635632, + "grad_norm": 0.5776639580726624, + "learning_rate": 4.1360672784313034e-06, + "loss": 0.5769, + "step": 5261 + }, + { + "epoch": 0.8303613697333123, + "grad_norm": 0.6028028726577759, + "learning_rate": 4.135753227304791e-06, + "loss": 0.5516, + "step": 5262 + }, + { + "epoch": 0.8305191731103045, + "grad_norm": 0.6005135774612427, + "learning_rate": 4.135439131035474e-06, + "loss": 0.589, + "step": 5263 + }, + { + "epoch": 0.8306769764872969, + "grad_norm": 0.5938376188278198, + "learning_rate": 4.13512498963202e-06, + "loss": 0.5436, + "step": 5264 + }, + { + "epoch": 0.8308347798642891, + "grad_norm": 0.6119574308395386, + "learning_rate": 4.134810803103098e-06, + "loss": 0.5626, + "step": 5265 + }, + { + "epoch": 0.8309925832412813, + "grad_norm": 0.590932309627533, + "learning_rate": 4.134496571457381e-06, + "loss": 0.5701, + "step": 5266 + }, + { + "epoch": 0.8311503866182737, + "grad_norm": 0.5935007929801941, + "learning_rate": 4.1341822947035385e-06, + "loss": 0.5577, + "step": 5267 + }, + { + "epoch": 0.8313081899952659, + "grad_norm": 0.616824746131897, + "learning_rate": 4.133867972850246e-06, + "loss": 0.5422, + "step": 5268 + }, + { + "epoch": 0.8314659933722581, + "grad_norm": 0.5905821323394775, + "learning_rate": 4.133553605906177e-06, + "loss": 0.569, + "step": 5269 + }, + { + "epoch": 0.8316237967492505, + "grad_norm": 0.5837295055389404, + "learning_rate": 4.133239193880006e-06, + "loss": 0.5381, + "step": 5270 + }, + { + "epoch": 0.8317816001262427, + "grad_norm": 0.5521880388259888, + "learning_rate": 4.132924736780412e-06, + "loss": 0.594, + "step": 5271 + }, + { + "epoch": 0.8319394035032349, + "grad_norm": 0.5867182016372681, + "learning_rate": 4.132610234616072e-06, + "loss": 0.5485, + "step": 5272 + }, + { + "epoch": 0.8320972068802273, + "grad_norm": 0.6310956478118896, + "learning_rate": 4.132295687395666e-06, + "loss": 0.5541, + "step": 5273 + }, + { + "epoch": 0.8322550102572195, + "grad_norm": 0.5785060524940491, + "learning_rate": 4.131981095127875e-06, + "loss": 0.5638, + "step": 5274 + }, + { + "epoch": 0.8324128136342118, + "grad_norm": 0.6132269501686096, + "learning_rate": 4.131666457821381e-06, + "loss": 0.5918, + "step": 5275 + }, + { + "epoch": 0.832570617011204, + "grad_norm": 0.5730753540992737, + "learning_rate": 4.131351775484866e-06, + "loss": 0.5556, + "step": 5276 + }, + { + "epoch": 0.8327284203881963, + "grad_norm": 0.5920183658599854, + "learning_rate": 4.131037048127016e-06, + "loss": 0.5876, + "step": 5277 + }, + { + "epoch": 0.8328862237651886, + "grad_norm": 0.6174941658973694, + "learning_rate": 4.130722275756515e-06, + "loss": 0.6113, + "step": 5278 + }, + { + "epoch": 0.8330440271421808, + "grad_norm": 0.5884217023849487, + "learning_rate": 4.130407458382052e-06, + "loss": 0.5696, + "step": 5279 + }, + { + "epoch": 0.8332018305191731, + "grad_norm": 0.6166089773178101, + "learning_rate": 4.130092596012314e-06, + "loss": 0.5893, + "step": 5280 + }, + { + "epoch": 0.8333596338961654, + "grad_norm": 0.6015021800994873, + "learning_rate": 4.129777688655991e-06, + "loss": 0.582, + "step": 5281 + }, + { + "epoch": 0.8335174372731576, + "grad_norm": 0.5758314728736877, + "learning_rate": 4.129462736321773e-06, + "loss": 0.5655, + "step": 5282 + }, + { + "epoch": 0.8336752406501499, + "grad_norm": 0.5877017378807068, + "learning_rate": 4.129147739018352e-06, + "loss": 0.5699, + "step": 5283 + }, + { + "epoch": 0.8338330440271422, + "grad_norm": 0.5919161438941956, + "learning_rate": 4.128832696754422e-06, + "loss": 0.5434, + "step": 5284 + }, + { + "epoch": 0.8339908474041344, + "grad_norm": 0.6019015908241272, + "learning_rate": 4.128517609538676e-06, + "loss": 0.5784, + "step": 5285 + }, + { + "epoch": 0.8341486507811268, + "grad_norm": 0.6089200377464294, + "learning_rate": 4.128202477379811e-06, + "loss": 0.5581, + "step": 5286 + }, + { + "epoch": 0.834306454158119, + "grad_norm": 0.5751751065254211, + "learning_rate": 4.1278873002865224e-06, + "loss": 0.593, + "step": 5287 + }, + { + "epoch": 0.8344642575351112, + "grad_norm": 0.5976378917694092, + "learning_rate": 4.12757207826751e-06, + "loss": 0.5324, + "step": 5288 + }, + { + "epoch": 0.8346220609121036, + "grad_norm": 0.5791654586791992, + "learning_rate": 4.127256811331472e-06, + "loss": 0.5482, + "step": 5289 + }, + { + "epoch": 0.8347798642890958, + "grad_norm": 0.5727169513702393, + "learning_rate": 4.12694149948711e-06, + "loss": 0.5878, + "step": 5290 + }, + { + "epoch": 0.834937667666088, + "grad_norm": 0.5760839581489563, + "learning_rate": 4.1266261427431255e-06, + "loss": 0.5904, + "step": 5291 + }, + { + "epoch": 0.8350954710430804, + "grad_norm": 0.6201783418655396, + "learning_rate": 4.126310741108221e-06, + "loss": 0.5537, + "step": 5292 + }, + { + "epoch": 0.8352532744200726, + "grad_norm": 0.5835009813308716, + "learning_rate": 4.125995294591101e-06, + "loss": 0.6109, + "step": 5293 + }, + { + "epoch": 0.8354110777970648, + "grad_norm": 0.5822149515151978, + "learning_rate": 4.1256798032004705e-06, + "loss": 0.5969, + "step": 5294 + }, + { + "epoch": 0.8355688811740571, + "grad_norm": 0.6074647307395935, + "learning_rate": 4.125364266945038e-06, + "loss": 0.5968, + "step": 5295 + }, + { + "epoch": 0.8357266845510494, + "grad_norm": 0.6084513664245605, + "learning_rate": 4.125048685833509e-06, + "loss": 0.5727, + "step": 5296 + }, + { + "epoch": 0.8358844879280417, + "grad_norm": 0.5892218947410583, + "learning_rate": 4.124733059874596e-06, + "loss": 0.5488, + "step": 5297 + }, + { + "epoch": 0.8360422913050339, + "grad_norm": 0.6003716588020325, + "learning_rate": 4.1244173890770064e-06, + "loss": 0.5484, + "step": 5298 + }, + { + "epoch": 0.8362000946820262, + "grad_norm": 0.5811029672622681, + "learning_rate": 4.124101673449455e-06, + "loss": 0.5503, + "step": 5299 + }, + { + "epoch": 0.8363578980590185, + "grad_norm": 0.5818963646888733, + "learning_rate": 4.123785913000652e-06, + "loss": 0.5804, + "step": 5300 + }, + { + "epoch": 0.8365157014360107, + "grad_norm": 0.571980357170105, + "learning_rate": 4.123470107739312e-06, + "loss": 0.5732, + "step": 5301 + }, + { + "epoch": 0.836673504813003, + "grad_norm": 0.5705887675285339, + "learning_rate": 4.123154257674153e-06, + "loss": 0.5829, + "step": 5302 + }, + { + "epoch": 0.8368313081899953, + "grad_norm": 0.7122018337249756, + "learning_rate": 4.122838362813888e-06, + "loss": 0.5616, + "step": 5303 + }, + { + "epoch": 0.8369891115669875, + "grad_norm": 0.5764129161834717, + "learning_rate": 4.122522423167239e-06, + "loss": 0.5663, + "step": 5304 + }, + { + "epoch": 0.8371469149439797, + "grad_norm": 0.5806353092193604, + "learning_rate": 4.1222064387429215e-06, + "loss": 0.5583, + "step": 5305 + }, + { + "epoch": 0.8373047183209721, + "grad_norm": 0.5781369209289551, + "learning_rate": 4.121890409549657e-06, + "loss": 0.5578, + "step": 5306 + }, + { + "epoch": 0.8374625216979643, + "grad_norm": 0.573264479637146, + "learning_rate": 4.121574335596168e-06, + "loss": 0.5501, + "step": 5307 + }, + { + "epoch": 0.8376203250749567, + "grad_norm": 0.5645738244056702, + "learning_rate": 4.121258216891177e-06, + "loss": 0.568, + "step": 5308 + }, + { + "epoch": 0.8377781284519489, + "grad_norm": 0.5874782800674438, + "learning_rate": 4.120942053443408e-06, + "loss": 0.5891, + "step": 5309 + }, + { + "epoch": 0.8379359318289411, + "grad_norm": 0.6271598935127258, + "learning_rate": 4.120625845261586e-06, + "loss": 0.5452, + "step": 5310 + }, + { + "epoch": 0.8380937352059334, + "grad_norm": 0.5849231481552124, + "learning_rate": 4.120309592354439e-06, + "loss": 0.5939, + "step": 5311 + }, + { + "epoch": 0.8382515385829257, + "grad_norm": 0.5968565940856934, + "learning_rate": 4.119993294730693e-06, + "loss": 0.5712, + "step": 5312 + }, + { + "epoch": 0.8384093419599179, + "grad_norm": 0.5760632157325745, + "learning_rate": 4.119676952399077e-06, + "loss": 0.5498, + "step": 5313 + }, + { + "epoch": 0.8385671453369102, + "grad_norm": 0.5823690891265869, + "learning_rate": 4.119360565368322e-06, + "loss": 0.5961, + "step": 5314 + }, + { + "epoch": 0.8387249487139025, + "grad_norm": 0.5963276028633118, + "learning_rate": 4.119044133647161e-06, + "loss": 0.5441, + "step": 5315 + }, + { + "epoch": 0.8388827520908947, + "grad_norm": 0.5566579103469849, + "learning_rate": 4.118727657244324e-06, + "loss": 0.5609, + "step": 5316 + }, + { + "epoch": 0.839040555467887, + "grad_norm": 0.635799765586853, + "learning_rate": 4.118411136168548e-06, + "loss": 0.5911, + "step": 5317 + }, + { + "epoch": 0.8391983588448793, + "grad_norm": 0.5788347125053406, + "learning_rate": 4.118094570428565e-06, + "loss": 0.6, + "step": 5318 + }, + { + "epoch": 0.8393561622218716, + "grad_norm": 0.5943009257316589, + "learning_rate": 4.1177779600331125e-06, + "loss": 0.5878, + "step": 5319 + }, + { + "epoch": 0.8395139655988638, + "grad_norm": 0.5776690244674683, + "learning_rate": 4.11746130499093e-06, + "loss": 0.5612, + "step": 5320 + }, + { + "epoch": 0.839671768975856, + "grad_norm": 0.579946756362915, + "learning_rate": 4.117144605310753e-06, + "loss": 0.5718, + "step": 5321 + }, + { + "epoch": 0.8398295723528484, + "grad_norm": 0.6250376105308533, + "learning_rate": 4.116827861001325e-06, + "loss": 0.5773, + "step": 5322 + }, + { + "epoch": 0.8399873757298406, + "grad_norm": 0.6167253851890564, + "learning_rate": 4.116511072071386e-06, + "loss": 0.5522, + "step": 5323 + }, + { + "epoch": 0.8401451791068328, + "grad_norm": 0.5902647376060486, + "learning_rate": 4.116194238529679e-06, + "loss": 0.5706, + "step": 5324 + }, + { + "epoch": 0.8403029824838252, + "grad_norm": 0.5826700329780579, + "learning_rate": 4.115877360384947e-06, + "loss": 0.5271, + "step": 5325 + }, + { + "epoch": 0.8404607858608174, + "grad_norm": 0.5593875646591187, + "learning_rate": 4.115560437645936e-06, + "loss": 0.5673, + "step": 5326 + }, + { + "epoch": 0.8406185892378097, + "grad_norm": 0.5780690908432007, + "learning_rate": 4.115243470321391e-06, + "loss": 0.5532, + "step": 5327 + }, + { + "epoch": 0.840776392614802, + "grad_norm": 0.5858946442604065, + "learning_rate": 4.114926458420061e-06, + "loss": 0.613, + "step": 5328 + }, + { + "epoch": 0.8409341959917942, + "grad_norm": 0.5556921362876892, + "learning_rate": 4.114609401950693e-06, + "loss": 0.5679, + "step": 5329 + }, + { + "epoch": 0.8410919993687865, + "grad_norm": 0.5665792226791382, + "learning_rate": 4.114292300922039e-06, + "loss": 0.5774, + "step": 5330 + }, + { + "epoch": 0.8412498027457788, + "grad_norm": 0.6038845777511597, + "learning_rate": 4.113975155342849e-06, + "loss": 0.5545, + "step": 5331 + }, + { + "epoch": 0.841407606122771, + "grad_norm": 0.5761243104934692, + "learning_rate": 4.113657965221875e-06, + "loss": 0.5804, + "step": 5332 + }, + { + "epoch": 0.8415654094997633, + "grad_norm": 0.5702511072158813, + "learning_rate": 4.113340730567873e-06, + "loss": 0.5897, + "step": 5333 + }, + { + "epoch": 0.8417232128767556, + "grad_norm": 0.5787937641143799, + "learning_rate": 4.113023451389596e-06, + "loss": 0.5776, + "step": 5334 + }, + { + "epoch": 0.8418810162537478, + "grad_norm": 0.6070559620857239, + "learning_rate": 4.1127061276958e-06, + "loss": 0.5626, + "step": 5335 + }, + { + "epoch": 0.8420388196307401, + "grad_norm": 0.5971863269805908, + "learning_rate": 4.112388759495243e-06, + "loss": 0.5711, + "step": 5336 + }, + { + "epoch": 0.8421966230077323, + "grad_norm": 0.5852024555206299, + "learning_rate": 4.112071346796684e-06, + "loss": 0.5802, + "step": 5337 + }, + { + "epoch": 0.8423544263847247, + "grad_norm": 0.6057943105697632, + "learning_rate": 4.11175388960888e-06, + "loss": 0.5468, + "step": 5338 + }, + { + "epoch": 0.8425122297617169, + "grad_norm": 0.6173790097236633, + "learning_rate": 4.111436387940596e-06, + "loss": 0.5852, + "step": 5339 + }, + { + "epoch": 0.8426700331387091, + "grad_norm": 0.5496485233306885, + "learning_rate": 4.111118841800592e-06, + "loss": 0.553, + "step": 5340 + }, + { + "epoch": 0.8428278365157015, + "grad_norm": 0.5896860361099243, + "learning_rate": 4.110801251197634e-06, + "loss": 0.5829, + "step": 5341 + }, + { + "epoch": 0.8429856398926937, + "grad_norm": 0.5827255249023438, + "learning_rate": 4.110483616140483e-06, + "loss": 0.5712, + "step": 5342 + }, + { + "epoch": 0.8431434432696859, + "grad_norm": 0.6383054852485657, + "learning_rate": 4.110165936637906e-06, + "loss": 0.5854, + "step": 5343 + }, + { + "epoch": 0.8433012466466783, + "grad_norm": 0.5638916492462158, + "learning_rate": 4.109848212698673e-06, + "loss": 0.5891, + "step": 5344 + }, + { + "epoch": 0.8434590500236705, + "grad_norm": 0.5581481456756592, + "learning_rate": 4.1095304443315474e-06, + "loss": 0.5421, + "step": 5345 + }, + { + "epoch": 0.8436168534006627, + "grad_norm": 0.5707232356071472, + "learning_rate": 4.109212631545304e-06, + "loss": 0.5678, + "step": 5346 + }, + { + "epoch": 0.8437746567776551, + "grad_norm": 0.5783288478851318, + "learning_rate": 4.10889477434871e-06, + "loss": 0.5716, + "step": 5347 + }, + { + "epoch": 0.8439324601546473, + "grad_norm": 0.58155357837677, + "learning_rate": 4.1085768727505405e-06, + "loss": 0.5639, + "step": 5348 + }, + { + "epoch": 0.8440902635316396, + "grad_norm": 0.6067901253700256, + "learning_rate": 4.108258926759566e-06, + "loss": 0.5682, + "step": 5349 + }, + { + "epoch": 0.8442480669086319, + "grad_norm": 0.5948206186294556, + "learning_rate": 4.107940936384563e-06, + "loss": 0.5945, + "step": 5350 + }, + { + "epoch": 0.8444058702856241, + "grad_norm": 0.5962873697280884, + "learning_rate": 4.107622901634306e-06, + "loss": 0.5718, + "step": 5351 + }, + { + "epoch": 0.8445636736626164, + "grad_norm": 0.63800448179245, + "learning_rate": 4.107304822517573e-06, + "loss": 0.5341, + "step": 5352 + }, + { + "epoch": 0.8447214770396086, + "grad_norm": 0.5957997441291809, + "learning_rate": 4.106986699043141e-06, + "loss": 0.5816, + "step": 5353 + }, + { + "epoch": 0.8448792804166009, + "grad_norm": 0.5869271755218506, + "learning_rate": 4.106668531219791e-06, + "loss": 0.5721, + "step": 5354 + }, + { + "epoch": 0.8450370837935932, + "grad_norm": 0.565713107585907, + "learning_rate": 4.1063503190563015e-06, + "loss": 0.5587, + "step": 5355 + }, + { + "epoch": 0.8451948871705854, + "grad_norm": 0.5936175584793091, + "learning_rate": 4.106032062561456e-06, + "loss": 0.6083, + "step": 5356 + }, + { + "epoch": 0.8453526905475777, + "grad_norm": 0.6031959652900696, + "learning_rate": 4.105713761744038e-06, + "loss": 0.608, + "step": 5357 + }, + { + "epoch": 0.84551049392457, + "grad_norm": 0.5708346962928772, + "learning_rate": 4.105395416612831e-06, + "loss": 0.5643, + "step": 5358 + }, + { + "epoch": 0.8456682973015622, + "grad_norm": 0.5814884305000305, + "learning_rate": 4.1050770271766204e-06, + "loss": 0.562, + "step": 5359 + }, + { + "epoch": 0.8458261006785546, + "grad_norm": 0.592950165271759, + "learning_rate": 4.104758593444193e-06, + "loss": 0.5646, + "step": 5360 + }, + { + "epoch": 0.8459839040555468, + "grad_norm": 0.5503792762756348, + "learning_rate": 4.104440115424337e-06, + "loss": 0.5593, + "step": 5361 + }, + { + "epoch": 0.846141707432539, + "grad_norm": 0.6137787699699402, + "learning_rate": 4.104121593125841e-06, + "loss": 0.5935, + "step": 5362 + }, + { + "epoch": 0.8462995108095314, + "grad_norm": 0.5745882391929626, + "learning_rate": 4.103803026557498e-06, + "loss": 0.5208, + "step": 5363 + }, + { + "epoch": 0.8464573141865236, + "grad_norm": 0.5677182078361511, + "learning_rate": 4.1034844157280965e-06, + "loss": 0.5641, + "step": 5364 + }, + { + "epoch": 0.8466151175635158, + "grad_norm": 0.5839641690254211, + "learning_rate": 4.10316576064643e-06, + "loss": 0.5732, + "step": 5365 + }, + { + "epoch": 0.8467729209405082, + "grad_norm": 0.5836748480796814, + "learning_rate": 4.102847061321294e-06, + "loss": 0.5919, + "step": 5366 + }, + { + "epoch": 0.8469307243175004, + "grad_norm": 0.5981671214103699, + "learning_rate": 4.102528317761483e-06, + "loss": 0.5604, + "step": 5367 + }, + { + "epoch": 0.8470885276944926, + "grad_norm": 0.5825080275535583, + "learning_rate": 4.102209529975793e-06, + "loss": 0.576, + "step": 5368 + }, + { + "epoch": 0.847246331071485, + "grad_norm": 0.5746813416481018, + "learning_rate": 4.101890697973023e-06, + "loss": 0.5616, + "step": 5369 + }, + { + "epoch": 0.8474041344484772, + "grad_norm": 0.6295794248580933, + "learning_rate": 4.101571821761971e-06, + "loss": 0.5857, + "step": 5370 + }, + { + "epoch": 0.8475619378254695, + "grad_norm": 0.5834059715270996, + "learning_rate": 4.101252901351438e-06, + "loss": 0.5874, + "step": 5371 + }, + { + "epoch": 0.8477197412024617, + "grad_norm": 0.6398015022277832, + "learning_rate": 4.100933936750225e-06, + "loss": 0.5842, + "step": 5372 + }, + { + "epoch": 0.847877544579454, + "grad_norm": 0.6102773547172546, + "learning_rate": 4.100614927967134e-06, + "loss": 0.5547, + "step": 5373 + }, + { + "epoch": 0.8480353479564463, + "grad_norm": 0.6149964332580566, + "learning_rate": 4.10029587501097e-06, + "loss": 0.5685, + "step": 5374 + }, + { + "epoch": 0.8481931513334385, + "grad_norm": 0.5820277333259583, + "learning_rate": 4.099976777890538e-06, + "loss": 0.543, + "step": 5375 + }, + { + "epoch": 0.8483509547104308, + "grad_norm": 0.5649425387382507, + "learning_rate": 4.099657636614643e-06, + "loss": 0.5656, + "step": 5376 + }, + { + "epoch": 0.8485087580874231, + "grad_norm": 0.5580365061759949, + "learning_rate": 4.099338451192093e-06, + "loss": 0.5399, + "step": 5377 + }, + { + "epoch": 0.8486665614644153, + "grad_norm": 0.6058158278465271, + "learning_rate": 4.099019221631698e-06, + "loss": 0.5861, + "step": 5378 + }, + { + "epoch": 0.8488243648414076, + "grad_norm": 0.5799540281295776, + "learning_rate": 4.098699947942266e-06, + "loss": 0.586, + "step": 5379 + }, + { + "epoch": 0.8489821682183999, + "grad_norm": 0.5959810018539429, + "learning_rate": 4.0983806301326105e-06, + "loss": 0.5956, + "step": 5380 + }, + { + "epoch": 0.8491399715953921, + "grad_norm": 0.5925883650779724, + "learning_rate": 4.0980612682115426e-06, + "loss": 0.5608, + "step": 5381 + }, + { + "epoch": 0.8492977749723845, + "grad_norm": 0.6070907711982727, + "learning_rate": 4.097741862187875e-06, + "loss": 0.5469, + "step": 5382 + }, + { + "epoch": 0.8494555783493767, + "grad_norm": 0.6026992201805115, + "learning_rate": 4.097422412070424e-06, + "loss": 0.579, + "step": 5383 + }, + { + "epoch": 0.8496133817263689, + "grad_norm": 0.6230846047401428, + "learning_rate": 4.097102917868006e-06, + "loss": 0.548, + "step": 5384 + }, + { + "epoch": 0.8497711851033612, + "grad_norm": 0.6416939496994019, + "learning_rate": 4.096783379589436e-06, + "loss": 0.5565, + "step": 5385 + }, + { + "epoch": 0.8499289884803535, + "grad_norm": 0.6317764520645142, + "learning_rate": 4.096463797243535e-06, + "loss": 0.5413, + "step": 5386 + }, + { + "epoch": 0.8500867918573457, + "grad_norm": 0.5501769781112671, + "learning_rate": 4.09614417083912e-06, + "loss": 0.544, + "step": 5387 + }, + { + "epoch": 0.850244595234338, + "grad_norm": 0.5958439707756042, + "learning_rate": 4.095824500385015e-06, + "loss": 0.591, + "step": 5388 + }, + { + "epoch": 0.8504023986113303, + "grad_norm": 0.6702543497085571, + "learning_rate": 4.09550478589004e-06, + "loss": 0.5492, + "step": 5389 + }, + { + "epoch": 0.8505602019883225, + "grad_norm": 0.5964361429214478, + "learning_rate": 4.09518502736302e-06, + "loss": 0.582, + "step": 5390 + }, + { + "epoch": 0.8507180053653148, + "grad_norm": 0.6177905201911926, + "learning_rate": 4.094865224812778e-06, + "loss": 0.5719, + "step": 5391 + }, + { + "epoch": 0.8508758087423071, + "grad_norm": 0.6007071137428284, + "learning_rate": 4.0945453782481396e-06, + "loss": 0.5425, + "step": 5392 + }, + { + "epoch": 0.8510336121192994, + "grad_norm": 0.5945065021514893, + "learning_rate": 4.094225487677933e-06, + "loss": 0.5926, + "step": 5393 + }, + { + "epoch": 0.8511914154962916, + "grad_norm": 0.5728323459625244, + "learning_rate": 4.093905553110985e-06, + "loss": 0.5877, + "step": 5394 + }, + { + "epoch": 0.8513492188732839, + "grad_norm": 0.5894752144813538, + "learning_rate": 4.093585574556127e-06, + "loss": 0.5722, + "step": 5395 + }, + { + "epoch": 0.8515070222502762, + "grad_norm": 0.6121612191200256, + "learning_rate": 4.093265552022188e-06, + "loss": 0.6032, + "step": 5396 + }, + { + "epoch": 0.8516648256272684, + "grad_norm": 0.6066976189613342, + "learning_rate": 4.092945485518e-06, + "loss": 0.5589, + "step": 5397 + }, + { + "epoch": 0.8518226290042606, + "grad_norm": 0.5868589282035828, + "learning_rate": 4.092625375052397e-06, + "loss": 0.5678, + "step": 5398 + }, + { + "epoch": 0.851980432381253, + "grad_norm": 0.5675803422927856, + "learning_rate": 4.092305220634213e-06, + "loss": 0.5585, + "step": 5399 + }, + { + "epoch": 0.8521382357582452, + "grad_norm": 0.5889614820480347, + "learning_rate": 4.091985022272282e-06, + "loss": 0.5179, + "step": 5400 + }, + { + "epoch": 0.8522960391352375, + "grad_norm": 0.617104709148407, + "learning_rate": 4.091664779975443e-06, + "loss": 0.5637, + "step": 5401 + }, + { + "epoch": 0.8524538425122298, + "grad_norm": 0.5921313166618347, + "learning_rate": 4.091344493752532e-06, + "loss": 0.5706, + "step": 5402 + }, + { + "epoch": 0.852611645889222, + "grad_norm": 0.548073410987854, + "learning_rate": 4.0910241636123895e-06, + "loss": 0.5511, + "step": 5403 + }, + { + "epoch": 0.8527694492662143, + "grad_norm": 0.5863253474235535, + "learning_rate": 4.090703789563854e-06, + "loss": 0.5605, + "step": 5404 + }, + { + "epoch": 0.8529272526432066, + "grad_norm": 0.5877960920333862, + "learning_rate": 4.09038337161577e-06, + "loss": 0.5538, + "step": 5405 + }, + { + "epoch": 0.8530850560201988, + "grad_norm": 0.5760671496391296, + "learning_rate": 4.0900629097769774e-06, + "loss": 0.5977, + "step": 5406 + }, + { + "epoch": 0.8532428593971911, + "grad_norm": 0.5955833792686462, + "learning_rate": 4.089742404056322e-06, + "loss": 0.5913, + "step": 5407 + }, + { + "epoch": 0.8534006627741834, + "grad_norm": 0.6018348932266235, + "learning_rate": 4.089421854462648e-06, + "loss": 0.5971, + "step": 5408 + }, + { + "epoch": 0.8535584661511756, + "grad_norm": 0.5808199644088745, + "learning_rate": 4.089101261004801e-06, + "loss": 0.575, + "step": 5409 + }, + { + "epoch": 0.8537162695281679, + "grad_norm": 0.5760694146156311, + "learning_rate": 4.088780623691631e-06, + "loss": 0.531, + "step": 5410 + }, + { + "epoch": 0.8538740729051602, + "grad_norm": 0.5836241245269775, + "learning_rate": 4.088459942531985e-06, + "loss": 0.5666, + "step": 5411 + }, + { + "epoch": 0.8540318762821525, + "grad_norm": 0.5527470111846924, + "learning_rate": 4.088139217534713e-06, + "loss": 0.5578, + "step": 5412 + }, + { + "epoch": 0.8541896796591447, + "grad_norm": 0.586094081401825, + "learning_rate": 4.087818448708667e-06, + "loss": 0.5721, + "step": 5413 + }, + { + "epoch": 0.854347483036137, + "grad_norm": 0.624292254447937, + "learning_rate": 4.0874976360627e-06, + "loss": 0.5699, + "step": 5414 + }, + { + "epoch": 0.8545052864131293, + "grad_norm": 0.6210857629776001, + "learning_rate": 4.0871767796056635e-06, + "loss": 0.5494, + "step": 5415 + }, + { + "epoch": 0.8546630897901215, + "grad_norm": 0.5788665413856506, + "learning_rate": 4.086855879346414e-06, + "loss": 0.5924, + "step": 5416 + }, + { + "epoch": 0.8548208931671137, + "grad_norm": 0.5784815549850464, + "learning_rate": 4.086534935293808e-06, + "loss": 0.6015, + "step": 5417 + }, + { + "epoch": 0.8549786965441061, + "grad_norm": 0.6114882826805115, + "learning_rate": 4.0862139474567005e-06, + "loss": 0.5775, + "step": 5418 + }, + { + "epoch": 0.8551364999210983, + "grad_norm": 0.5869871377944946, + "learning_rate": 4.085892915843953e-06, + "loss": 0.5687, + "step": 5419 + }, + { + "epoch": 0.8552943032980905, + "grad_norm": 0.5883960127830505, + "learning_rate": 4.085571840464423e-06, + "loss": 0.5786, + "step": 5420 + }, + { + "epoch": 0.8554521066750829, + "grad_norm": 0.6134746670722961, + "learning_rate": 4.0852507213269725e-06, + "loss": 0.5251, + "step": 5421 + }, + { + "epoch": 0.8556099100520751, + "grad_norm": 0.5796648263931274, + "learning_rate": 4.0849295584404626e-06, + "loss": 0.5873, + "step": 5422 + }, + { + "epoch": 0.8557677134290674, + "grad_norm": 0.5916813611984253, + "learning_rate": 4.0846083518137566e-06, + "loss": 0.5911, + "step": 5423 + }, + { + "epoch": 0.8559255168060597, + "grad_norm": 0.559459924697876, + "learning_rate": 4.08428710145572e-06, + "loss": 0.5528, + "step": 5424 + }, + { + "epoch": 0.8560833201830519, + "grad_norm": 0.5860116481781006, + "learning_rate": 4.083965807375219e-06, + "loss": 0.5351, + "step": 5425 + }, + { + "epoch": 0.8562411235600442, + "grad_norm": 0.5807007551193237, + "learning_rate": 4.083644469581119e-06, + "loss": 0.6017, + "step": 5426 + }, + { + "epoch": 0.8563989269370365, + "grad_norm": 0.5855349898338318, + "learning_rate": 4.0833230880822895e-06, + "loss": 0.5736, + "step": 5427 + }, + { + "epoch": 0.8565567303140287, + "grad_norm": 0.586982786655426, + "learning_rate": 4.083001662887598e-06, + "loss": 0.5743, + "step": 5428 + }, + { + "epoch": 0.856714533691021, + "grad_norm": 0.5562223792076111, + "learning_rate": 4.082680194005917e-06, + "loss": 0.5431, + "step": 5429 + }, + { + "epoch": 0.8568723370680132, + "grad_norm": 0.5742196440696716, + "learning_rate": 4.082358681446117e-06, + "loss": 0.5753, + "step": 5430 + }, + { + "epoch": 0.8570301404450055, + "grad_norm": 0.5657389760017395, + "learning_rate": 4.0820371252170725e-06, + "loss": 0.586, + "step": 5431 + }, + { + "epoch": 0.8571879438219978, + "grad_norm": 0.5904938578605652, + "learning_rate": 4.081715525327655e-06, + "loss": 0.5523, + "step": 5432 + }, + { + "epoch": 0.85734574719899, + "grad_norm": 0.5661501884460449, + "learning_rate": 4.081393881786742e-06, + "loss": 0.5498, + "step": 5433 + }, + { + "epoch": 0.8575035505759824, + "grad_norm": 0.6440916061401367, + "learning_rate": 4.08107219460321e-06, + "loss": 0.5739, + "step": 5434 + }, + { + "epoch": 0.8576613539529746, + "grad_norm": 0.6023861169815063, + "learning_rate": 4.080750463785936e-06, + "loss": 0.5509, + "step": 5435 + }, + { + "epoch": 0.8578191573299668, + "grad_norm": 0.6105749607086182, + "learning_rate": 4.080428689343799e-06, + "loss": 0.5225, + "step": 5436 + }, + { + "epoch": 0.8579769607069592, + "grad_norm": 0.5836808085441589, + "learning_rate": 4.08010687128568e-06, + "loss": 0.5709, + "step": 5437 + }, + { + "epoch": 0.8581347640839514, + "grad_norm": 0.6299588084220886, + "learning_rate": 4.07978500962046e-06, + "loss": 0.5748, + "step": 5438 + }, + { + "epoch": 0.8582925674609436, + "grad_norm": 0.5622112154960632, + "learning_rate": 4.079463104357021e-06, + "loss": 0.5839, + "step": 5439 + }, + { + "epoch": 0.858450370837936, + "grad_norm": 0.6183809041976929, + "learning_rate": 4.079141155504247e-06, + "loss": 0.5473, + "step": 5440 + }, + { + "epoch": 0.8586081742149282, + "grad_norm": 0.5871029496192932, + "learning_rate": 4.078819163071024e-06, + "loss": 0.5842, + "step": 5441 + }, + { + "epoch": 0.8587659775919204, + "grad_norm": 0.6157363057136536, + "learning_rate": 4.078497127066237e-06, + "loss": 0.5507, + "step": 5442 + }, + { + "epoch": 0.8589237809689128, + "grad_norm": 0.5825474262237549, + "learning_rate": 4.0781750474987745e-06, + "loss": 0.5692, + "step": 5443 + }, + { + "epoch": 0.859081584345905, + "grad_norm": 0.6121276617050171, + "learning_rate": 4.0778529243775246e-06, + "loss": 0.571, + "step": 5444 + }, + { + "epoch": 0.8592393877228973, + "grad_norm": 0.6058412194252014, + "learning_rate": 4.0775307577113755e-06, + "loss": 0.5882, + "step": 5445 + }, + { + "epoch": 0.8593971910998895, + "grad_norm": 0.6041327714920044, + "learning_rate": 4.077208547509221e-06, + "loss": 0.5833, + "step": 5446 + }, + { + "epoch": 0.8595549944768818, + "grad_norm": 0.5953667759895325, + "learning_rate": 4.076886293779952e-06, + "loss": 0.5761, + "step": 5447 + }, + { + "epoch": 0.8597127978538741, + "grad_norm": 0.5652876496315002, + "learning_rate": 4.076563996532463e-06, + "loss": 0.5414, + "step": 5448 + }, + { + "epoch": 0.8598706012308663, + "grad_norm": 0.6747309565544128, + "learning_rate": 4.076241655775646e-06, + "loss": 0.5583, + "step": 5449 + }, + { + "epoch": 0.8600284046078586, + "grad_norm": 0.6095392107963562, + "learning_rate": 4.075919271518399e-06, + "loss": 0.5651, + "step": 5450 + }, + { + "epoch": 0.8601862079848509, + "grad_norm": 0.6148313283920288, + "learning_rate": 4.075596843769619e-06, + "loss": 0.5657, + "step": 5451 + }, + { + "epoch": 0.8603440113618431, + "grad_norm": 0.5873019099235535, + "learning_rate": 4.075274372538203e-06, + "loss": 0.5572, + "step": 5452 + }, + { + "epoch": 0.8605018147388354, + "grad_norm": 0.5519018173217773, + "learning_rate": 4.074951857833051e-06, + "loss": 0.5486, + "step": 5453 + }, + { + "epoch": 0.8606596181158277, + "grad_norm": 0.6015499830245972, + "learning_rate": 4.074629299663065e-06, + "loss": 0.5852, + "step": 5454 + }, + { + "epoch": 0.8608174214928199, + "grad_norm": 0.6098511815071106, + "learning_rate": 4.074306698037144e-06, + "loss": 0.5905, + "step": 5455 + }, + { + "epoch": 0.8609752248698123, + "grad_norm": 0.6126755475997925, + "learning_rate": 4.0739840529641925e-06, + "loss": 0.5311, + "step": 5456 + }, + { + "epoch": 0.8611330282468045, + "grad_norm": 0.6098795533180237, + "learning_rate": 4.073661364453116e-06, + "loss": 0.5737, + "step": 5457 + }, + { + "epoch": 0.8612908316237967, + "grad_norm": 0.6255962252616882, + "learning_rate": 4.0733386325128174e-06, + "loss": 0.5702, + "step": 5458 + }, + { + "epoch": 0.861448635000789, + "grad_norm": 0.6047935485839844, + "learning_rate": 4.073015857152205e-06, + "loss": 0.5812, + "step": 5459 + }, + { + "epoch": 0.8616064383777813, + "grad_norm": 0.549104630947113, + "learning_rate": 4.072693038380187e-06, + "loss": 0.5969, + "step": 5460 + }, + { + "epoch": 0.8617642417547735, + "grad_norm": 0.5811651945114136, + "learning_rate": 4.07237017620567e-06, + "loss": 0.5712, + "step": 5461 + }, + { + "epoch": 0.8619220451317658, + "grad_norm": 0.5739196538925171, + "learning_rate": 4.0720472706375675e-06, + "loss": 0.5357, + "step": 5462 + }, + { + "epoch": 0.8620798485087581, + "grad_norm": 0.5866954922676086, + "learning_rate": 4.071724321684789e-06, + "loss": 0.5687, + "step": 5463 + }, + { + "epoch": 0.8622376518857503, + "grad_norm": 0.5829424262046814, + "learning_rate": 4.0714013293562465e-06, + "loss": 0.5785, + "step": 5464 + }, + { + "epoch": 0.8623954552627426, + "grad_norm": 0.5896797180175781, + "learning_rate": 4.071078293660855e-06, + "loss": 0.5304, + "step": 5465 + }, + { + "epoch": 0.8625532586397349, + "grad_norm": 0.5873622298240662, + "learning_rate": 4.070755214607529e-06, + "loss": 0.5676, + "step": 5466 + }, + { + "epoch": 0.8627110620167272, + "grad_norm": 0.5950850248336792, + "learning_rate": 4.070432092205185e-06, + "loss": 0.5629, + "step": 5467 + }, + { + "epoch": 0.8628688653937194, + "grad_norm": 0.5960932970046997, + "learning_rate": 4.070108926462741e-06, + "loss": 0.5519, + "step": 5468 + }, + { + "epoch": 0.8630266687707117, + "grad_norm": 0.5770342350006104, + "learning_rate": 4.069785717389114e-06, + "loss": 0.5933, + "step": 5469 + }, + { + "epoch": 0.863184472147704, + "grad_norm": 0.6080717444419861, + "learning_rate": 4.069462464993225e-06, + "loss": 0.6179, + "step": 5470 + }, + { + "epoch": 0.8633422755246962, + "grad_norm": 0.566741943359375, + "learning_rate": 4.069139169283993e-06, + "loss": 0.5827, + "step": 5471 + }, + { + "epoch": 0.8635000789016885, + "grad_norm": 0.6134375333786011, + "learning_rate": 4.068815830270344e-06, + "loss": 0.548, + "step": 5472 + }, + { + "epoch": 0.8636578822786808, + "grad_norm": 0.6018542051315308, + "learning_rate": 4.068492447961197e-06, + "loss": 0.5742, + "step": 5473 + }, + { + "epoch": 0.863815685655673, + "grad_norm": 0.5610185861587524, + "learning_rate": 4.06816902236548e-06, + "loss": 0.551, + "step": 5474 + }, + { + "epoch": 0.8639734890326654, + "grad_norm": 0.6139633059501648, + "learning_rate": 4.067845553492118e-06, + "loss": 0.5907, + "step": 5475 + }, + { + "epoch": 0.8641312924096576, + "grad_norm": 0.5859309434890747, + "learning_rate": 4.0675220413500375e-06, + "loss": 0.5494, + "step": 5476 + }, + { + "epoch": 0.8642890957866498, + "grad_norm": 0.6034127473831177, + "learning_rate": 4.067198485948165e-06, + "loss": 0.5465, + "step": 5477 + }, + { + "epoch": 0.8644468991636421, + "grad_norm": 0.5837084054946899, + "learning_rate": 4.0668748872954324e-06, + "loss": 0.584, + "step": 5478 + }, + { + "epoch": 0.8646047025406344, + "grad_norm": 0.6205910444259644, + "learning_rate": 4.066551245400769e-06, + "loss": 0.5475, + "step": 5479 + }, + { + "epoch": 0.8647625059176266, + "grad_norm": 0.5876616835594177, + "learning_rate": 4.0662275602731064e-06, + "loss": 0.6016, + "step": 5480 + }, + { + "epoch": 0.8649203092946189, + "grad_norm": 0.5818520784378052, + "learning_rate": 4.065903831921378e-06, + "loss": 0.5481, + "step": 5481 + }, + { + "epoch": 0.8650781126716112, + "grad_norm": 0.5774075388908386, + "learning_rate": 4.065580060354518e-06, + "loss": 0.547, + "step": 5482 + }, + { + "epoch": 0.8652359160486034, + "grad_norm": 0.5822579860687256, + "learning_rate": 4.065256245581461e-06, + "loss": 0.5731, + "step": 5483 + }, + { + "epoch": 0.8653937194255957, + "grad_norm": 0.5760470032691956, + "learning_rate": 4.064932387611145e-06, + "loss": 0.5516, + "step": 5484 + }, + { + "epoch": 0.865551522802588, + "grad_norm": 0.6258273124694824, + "learning_rate": 4.064608486452507e-06, + "loss": 0.5728, + "step": 5485 + }, + { + "epoch": 0.8657093261795803, + "grad_norm": 0.5795298218727112, + "learning_rate": 4.0642845421144835e-06, + "loss": 0.5744, + "step": 5486 + }, + { + "epoch": 0.8658671295565725, + "grad_norm": 0.6368348598480225, + "learning_rate": 4.063960554606018e-06, + "loss": 0.5581, + "step": 5487 + }, + { + "epoch": 0.8660249329335648, + "grad_norm": 0.5725372433662415, + "learning_rate": 4.063636523936051e-06, + "loss": 0.5465, + "step": 5488 + }, + { + "epoch": 0.8661827363105571, + "grad_norm": 0.5944398045539856, + "learning_rate": 4.063312450113523e-06, + "loss": 0.516, + "step": 5489 + }, + { + "epoch": 0.8663405396875493, + "grad_norm": 0.5972734093666077, + "learning_rate": 4.062988333147381e-06, + "loss": 0.5476, + "step": 5490 + }, + { + "epoch": 0.8664983430645415, + "grad_norm": 0.6147674322128296, + "learning_rate": 4.062664173046565e-06, + "loss": 0.5376, + "step": 5491 + }, + { + "epoch": 0.8666561464415339, + "grad_norm": 0.5808929800987244, + "learning_rate": 4.062339969820026e-06, + "loss": 0.5779, + "step": 5492 + }, + { + "epoch": 0.8668139498185261, + "grad_norm": 0.5810816884040833, + "learning_rate": 4.062015723476709e-06, + "loss": 0.5563, + "step": 5493 + }, + { + "epoch": 0.8669717531955183, + "grad_norm": 0.598601758480072, + "learning_rate": 4.061691434025561e-06, + "loss": 0.5837, + "step": 5494 + }, + { + "epoch": 0.8671295565725107, + "grad_norm": 0.6049678921699524, + "learning_rate": 4.061367101475534e-06, + "loss": 0.571, + "step": 5495 + }, + { + "epoch": 0.8672873599495029, + "grad_norm": 0.5533498525619507, + "learning_rate": 4.061042725835579e-06, + "loss": 0.5352, + "step": 5496 + }, + { + "epoch": 0.8674451633264952, + "grad_norm": 0.5788007378578186, + "learning_rate": 4.0607183071146454e-06, + "loss": 0.5449, + "step": 5497 + }, + { + "epoch": 0.8676029667034875, + "grad_norm": 0.5934793949127197, + "learning_rate": 4.0603938453216884e-06, + "loss": 0.5785, + "step": 5498 + }, + { + "epoch": 0.8677607700804797, + "grad_norm": 0.5913354158401489, + "learning_rate": 4.060069340465662e-06, + "loss": 0.5842, + "step": 5499 + }, + { + "epoch": 0.867918573457472, + "grad_norm": 0.5838361382484436, + "learning_rate": 4.059744792555521e-06, + "loss": 0.5368, + "step": 5500 + }, + { + "epoch": 0.8680763768344643, + "grad_norm": 0.5970306992530823, + "learning_rate": 4.059420201600223e-06, + "loss": 0.5569, + "step": 5501 + }, + { + "epoch": 0.8682341802114565, + "grad_norm": 0.5742454528808594, + "learning_rate": 4.059095567608725e-06, + "loss": 0.5539, + "step": 5502 + }, + { + "epoch": 0.8683919835884488, + "grad_norm": 0.5875817537307739, + "learning_rate": 4.058770890589988e-06, + "loss": 0.5735, + "step": 5503 + }, + { + "epoch": 0.868549786965441, + "grad_norm": 0.6025663614273071, + "learning_rate": 4.05844617055297e-06, + "loss": 0.5818, + "step": 5504 + }, + { + "epoch": 0.8687075903424333, + "grad_norm": 0.5752469301223755, + "learning_rate": 4.058121407506634e-06, + "loss": 0.5638, + "step": 5505 + }, + { + "epoch": 0.8688653937194256, + "grad_norm": 0.5852840542793274, + "learning_rate": 4.057796601459942e-06, + "loss": 0.6063, + "step": 5506 + }, + { + "epoch": 0.8690231970964178, + "grad_norm": 0.5718138217926025, + "learning_rate": 4.057471752421858e-06, + "loss": 0.5643, + "step": 5507 + }, + { + "epoch": 0.8691810004734102, + "grad_norm": 0.6066796779632568, + "learning_rate": 4.057146860401346e-06, + "loss": 0.5889, + "step": 5508 + }, + { + "epoch": 0.8693388038504024, + "grad_norm": 0.6658530235290527, + "learning_rate": 4.056821925407375e-06, + "loss": 0.5316, + "step": 5509 + }, + { + "epoch": 0.8694966072273946, + "grad_norm": 0.5676940083503723, + "learning_rate": 4.056496947448909e-06, + "loss": 0.5486, + "step": 5510 + }, + { + "epoch": 0.869654410604387, + "grad_norm": 0.5968173742294312, + "learning_rate": 4.056171926534919e-06, + "loss": 0.5926, + "step": 5511 + }, + { + "epoch": 0.8698122139813792, + "grad_norm": 0.6264181137084961, + "learning_rate": 4.055846862674375e-06, + "loss": 0.579, + "step": 5512 + }, + { + "epoch": 0.8699700173583714, + "grad_norm": 0.617668867111206, + "learning_rate": 4.055521755876246e-06, + "loss": 0.5662, + "step": 5513 + }, + { + "epoch": 0.8701278207353638, + "grad_norm": 0.5680651664733887, + "learning_rate": 4.055196606149506e-06, + "loss": 0.5517, + "step": 5514 + }, + { + "epoch": 0.870285624112356, + "grad_norm": 0.6027637124061584, + "learning_rate": 4.054871413503126e-06, + "loss": 0.5602, + "step": 5515 + }, + { + "epoch": 0.8704434274893482, + "grad_norm": 0.5669808387756348, + "learning_rate": 4.054546177946083e-06, + "loss": 0.5606, + "step": 5516 + }, + { + "epoch": 0.8706012308663406, + "grad_norm": 0.6180499196052551, + "learning_rate": 4.054220899487352e-06, + "loss": 0.551, + "step": 5517 + }, + { + "epoch": 0.8707590342433328, + "grad_norm": 0.5888007879257202, + "learning_rate": 4.05389557813591e-06, + "loss": 0.5549, + "step": 5518 + }, + { + "epoch": 0.8709168376203251, + "grad_norm": 0.6000747084617615, + "learning_rate": 4.053570213900735e-06, + "loss": 0.5714, + "step": 5519 + }, + { + "epoch": 0.8710746409973174, + "grad_norm": 0.6110967993736267, + "learning_rate": 4.053244806790804e-06, + "loss": 0.6042, + "step": 5520 + }, + { + "epoch": 0.8712324443743096, + "grad_norm": 0.5834705829620361, + "learning_rate": 4.052919356815101e-06, + "loss": 0.5495, + "step": 5521 + }, + { + "epoch": 0.8713902477513019, + "grad_norm": 0.609600305557251, + "learning_rate": 4.052593863982606e-06, + "loss": 0.5449, + "step": 5522 + }, + { + "epoch": 0.8715480511282941, + "grad_norm": 0.5946098566055298, + "learning_rate": 4.0522683283023014e-06, + "loss": 0.5452, + "step": 5523 + }, + { + "epoch": 0.8717058545052864, + "grad_norm": 0.6081040501594543, + "learning_rate": 4.051942749783172e-06, + "loss": 0.5919, + "step": 5524 + }, + { + "epoch": 0.8718636578822787, + "grad_norm": 0.5970323085784912, + "learning_rate": 4.051617128434202e-06, + "loss": 0.5455, + "step": 5525 + }, + { + "epoch": 0.8720214612592709, + "grad_norm": 0.5859227776527405, + "learning_rate": 4.051291464264379e-06, + "loss": 0.5612, + "step": 5526 + }, + { + "epoch": 0.8721792646362632, + "grad_norm": 0.5842001438140869, + "learning_rate": 4.050965757282691e-06, + "loss": 0.5642, + "step": 5527 + }, + { + "epoch": 0.8723370680132555, + "grad_norm": 0.6087620854377747, + "learning_rate": 4.050640007498125e-06, + "loss": 0.5815, + "step": 5528 + }, + { + "epoch": 0.8724948713902477, + "grad_norm": 0.5525529980659485, + "learning_rate": 4.05031421491967e-06, + "loss": 0.5467, + "step": 5529 + }, + { + "epoch": 0.8726526747672401, + "grad_norm": 0.5989533066749573, + "learning_rate": 4.04998837955632e-06, + "loss": 0.5671, + "step": 5530 + }, + { + "epoch": 0.8728104781442323, + "grad_norm": 0.5760453939437866, + "learning_rate": 4.049662501417066e-06, + "loss": 0.5803, + "step": 5531 + }, + { + "epoch": 0.8729682815212245, + "grad_norm": 0.5904245376586914, + "learning_rate": 4.049336580510901e-06, + "loss": 0.5395, + "step": 5532 + }, + { + "epoch": 0.8731260848982169, + "grad_norm": 0.581352174282074, + "learning_rate": 4.0490106168468204e-06, + "loss": 0.565, + "step": 5533 + }, + { + "epoch": 0.8732838882752091, + "grad_norm": 0.5817164778709412, + "learning_rate": 4.048684610433819e-06, + "loss": 0.5505, + "step": 5534 + }, + { + "epoch": 0.8734416916522013, + "grad_norm": 0.5667105317115784, + "learning_rate": 4.048358561280895e-06, + "loss": 0.5505, + "step": 5535 + }, + { + "epoch": 0.8735994950291937, + "grad_norm": 0.590747058391571, + "learning_rate": 4.048032469397045e-06, + "loss": 0.5303, + "step": 5536 + }, + { + "epoch": 0.8737572984061859, + "grad_norm": 0.5826817154884338, + "learning_rate": 4.047706334791269e-06, + "loss": 0.5877, + "step": 5537 + }, + { + "epoch": 0.8739151017831781, + "grad_norm": 0.5637806057929993, + "learning_rate": 4.047380157472569e-06, + "loss": 0.5494, + "step": 5538 + }, + { + "epoch": 0.8740729051601704, + "grad_norm": 0.5981470942497253, + "learning_rate": 4.0470539374499454e-06, + "loss": 0.5093, + "step": 5539 + }, + { + "epoch": 0.8742307085371627, + "grad_norm": 0.6397484540939331, + "learning_rate": 4.0467276747324005e-06, + "loss": 0.5626, + "step": 5540 + }, + { + "epoch": 0.874388511914155, + "grad_norm": 0.5875550508499146, + "learning_rate": 4.0464013693289394e-06, + "loss": 0.5756, + "step": 5541 + }, + { + "epoch": 0.8745463152911472, + "grad_norm": 0.5924099087715149, + "learning_rate": 4.046075021248567e-06, + "loss": 0.5582, + "step": 5542 + }, + { + "epoch": 0.8747041186681395, + "grad_norm": 0.5821860432624817, + "learning_rate": 4.0457486305002895e-06, + "loss": 0.5415, + "step": 5543 + }, + { + "epoch": 0.8748619220451318, + "grad_norm": 0.6041911244392395, + "learning_rate": 4.045422197093115e-06, + "loss": 0.5591, + "step": 5544 + }, + { + "epoch": 0.875019725422124, + "grad_norm": 0.5988723039627075, + "learning_rate": 4.045095721036052e-06, + "loss": 0.5531, + "step": 5545 + }, + { + "epoch": 0.8751775287991163, + "grad_norm": 0.5665779113769531, + "learning_rate": 4.044769202338109e-06, + "loss": 0.564, + "step": 5546 + }, + { + "epoch": 0.8753353321761086, + "grad_norm": 0.589051365852356, + "learning_rate": 4.0444426410083e-06, + "loss": 0.575, + "step": 5547 + }, + { + "epoch": 0.8754931355531008, + "grad_norm": 0.5862237215042114, + "learning_rate": 4.044116037055635e-06, + "loss": 0.606, + "step": 5548 + }, + { + "epoch": 0.8756509389300932, + "grad_norm": 0.5625123381614685, + "learning_rate": 4.043789390489129e-06, + "loss": 0.5753, + "step": 5549 + }, + { + "epoch": 0.8758087423070854, + "grad_norm": 0.594330370426178, + "learning_rate": 4.043462701317795e-06, + "loss": 0.534, + "step": 5550 + }, + { + "epoch": 0.8759665456840776, + "grad_norm": 0.5803782343864441, + "learning_rate": 4.043135969550651e-06, + "loss": 0.5664, + "step": 5551 + }, + { + "epoch": 0.87612434906107, + "grad_norm": 0.6012929081916809, + "learning_rate": 4.042809195196711e-06, + "loss": 0.5897, + "step": 5552 + }, + { + "epoch": 0.8762821524380622, + "grad_norm": 0.5958556532859802, + "learning_rate": 4.042482378264996e-06, + "loss": 0.5557, + "step": 5553 + }, + { + "epoch": 0.8764399558150544, + "grad_norm": 0.6155980825424194, + "learning_rate": 4.042155518764524e-06, + "loss": 0.6137, + "step": 5554 + }, + { + "epoch": 0.8765977591920467, + "grad_norm": 0.5679322481155396, + "learning_rate": 4.041828616704316e-06, + "loss": 0.5607, + "step": 5555 + }, + { + "epoch": 0.876755562569039, + "grad_norm": 0.6408922672271729, + "learning_rate": 4.041501672093393e-06, + "loss": 0.5164, + "step": 5556 + }, + { + "epoch": 0.8769133659460312, + "grad_norm": 0.5858416557312012, + "learning_rate": 4.041174684940779e-06, + "loss": 0.5465, + "step": 5557 + }, + { + "epoch": 0.8770711693230235, + "grad_norm": 0.5687867999076843, + "learning_rate": 4.040847655255497e-06, + "loss": 0.5107, + "step": 5558 + }, + { + "epoch": 0.8772289727000158, + "grad_norm": 0.5676657557487488, + "learning_rate": 4.040520583046574e-06, + "loss": 0.5947, + "step": 5559 + }, + { + "epoch": 0.8773867760770081, + "grad_norm": 0.5605484247207642, + "learning_rate": 4.0401934683230324e-06, + "loss": 0.5383, + "step": 5560 + }, + { + "epoch": 0.8775445794540003, + "grad_norm": 0.5755753517150879, + "learning_rate": 4.039866311093904e-06, + "loss": 0.5742, + "step": 5561 + }, + { + "epoch": 0.8777023828309926, + "grad_norm": 0.5871253609657288, + "learning_rate": 4.0395391113682155e-06, + "loss": 0.5691, + "step": 5562 + }, + { + "epoch": 0.8778601862079849, + "grad_norm": 0.586805522441864, + "learning_rate": 4.039211869154998e-06, + "loss": 0.6027, + "step": 5563 + }, + { + "epoch": 0.8780179895849771, + "grad_norm": 0.5923757553100586, + "learning_rate": 4.038884584463282e-06, + "loss": 0.5659, + "step": 5564 + }, + { + "epoch": 0.8781757929619693, + "grad_norm": 0.6110261082649231, + "learning_rate": 4.038557257302098e-06, + "loss": 0.5304, + "step": 5565 + }, + { + "epoch": 0.8783335963389617, + "grad_norm": 0.5970476269721985, + "learning_rate": 4.038229887680483e-06, + "loss": 0.55, + "step": 5566 + }, + { + "epoch": 0.8784913997159539, + "grad_norm": 0.5613078474998474, + "learning_rate": 4.037902475607468e-06, + "loss": 0.5608, + "step": 5567 + }, + { + "epoch": 0.8786492030929461, + "grad_norm": 0.5822815895080566, + "learning_rate": 4.03757502109209e-06, + "loss": 0.6027, + "step": 5568 + }, + { + "epoch": 0.8788070064699385, + "grad_norm": 0.5932059288024902, + "learning_rate": 4.037247524143388e-06, + "loss": 0.5547, + "step": 5569 + }, + { + "epoch": 0.8789648098469307, + "grad_norm": 0.5752841830253601, + "learning_rate": 4.036919984770398e-06, + "loss": 0.5409, + "step": 5570 + }, + { + "epoch": 0.879122613223923, + "grad_norm": 0.5873956680297852, + "learning_rate": 4.03659240298216e-06, + "loss": 0.5726, + "step": 5571 + }, + { + "epoch": 0.8792804166009153, + "grad_norm": 0.5648183822631836, + "learning_rate": 4.036264778787714e-06, + "loss": 0.586, + "step": 5572 + }, + { + "epoch": 0.8794382199779075, + "grad_norm": 0.5909918546676636, + "learning_rate": 4.035937112196101e-06, + "loss": 0.572, + "step": 5573 + }, + { + "epoch": 0.8795960233548998, + "grad_norm": 0.5959304571151733, + "learning_rate": 4.035609403216366e-06, + "loss": 0.5798, + "step": 5574 + }, + { + "epoch": 0.8797538267318921, + "grad_norm": 0.5891522765159607, + "learning_rate": 4.035281651857551e-06, + "loss": 0.5572, + "step": 5575 + }, + { + "epoch": 0.8799116301088843, + "grad_norm": 0.5807265043258667, + "learning_rate": 4.034953858128702e-06, + "loss": 0.5788, + "step": 5576 + }, + { + "epoch": 0.8800694334858766, + "grad_norm": 0.6300869584083557, + "learning_rate": 4.034626022038865e-06, + "loss": 0.5669, + "step": 5577 + }, + { + "epoch": 0.8802272368628689, + "grad_norm": 0.58791184425354, + "learning_rate": 4.034298143597087e-06, + "loss": 0.558, + "step": 5578 + }, + { + "epoch": 0.8803850402398611, + "grad_norm": 0.5902625918388367, + "learning_rate": 4.0339702228124185e-06, + "loss": 0.5749, + "step": 5579 + }, + { + "epoch": 0.8805428436168534, + "grad_norm": 0.5725929141044617, + "learning_rate": 4.033642259693907e-06, + "loss": 0.5261, + "step": 5580 + }, + { + "epoch": 0.8807006469938456, + "grad_norm": 0.5768524408340454, + "learning_rate": 4.033314254250605e-06, + "loss": 0.5676, + "step": 5581 + }, + { + "epoch": 0.880858450370838, + "grad_norm": 0.6006642580032349, + "learning_rate": 4.032986206491564e-06, + "loss": 0.5434, + "step": 5582 + }, + { + "epoch": 0.8810162537478302, + "grad_norm": 0.5944661498069763, + "learning_rate": 4.032658116425837e-06, + "loss": 0.5578, + "step": 5583 + }, + { + "epoch": 0.8811740571248224, + "grad_norm": 0.5957821011543274, + "learning_rate": 4.0323299840624795e-06, + "loss": 0.5788, + "step": 5584 + }, + { + "epoch": 0.8813318605018148, + "grad_norm": 0.6033128499984741, + "learning_rate": 4.0320018094105465e-06, + "loss": 0.6274, + "step": 5585 + }, + { + "epoch": 0.881489663878807, + "grad_norm": 0.5932202339172363, + "learning_rate": 4.031673592479095e-06, + "loss": 0.5767, + "step": 5586 + }, + { + "epoch": 0.8816474672557992, + "grad_norm": 0.5713233351707458, + "learning_rate": 4.031345333277182e-06, + "loss": 0.5243, + "step": 5587 + }, + { + "epoch": 0.8818052706327916, + "grad_norm": 0.5747987627983093, + "learning_rate": 4.031017031813869e-06, + "loss": 0.5558, + "step": 5588 + }, + { + "epoch": 0.8819630740097838, + "grad_norm": 0.5653942227363586, + "learning_rate": 4.030688688098214e-06, + "loss": 0.555, + "step": 5589 + }, + { + "epoch": 0.882120877386776, + "grad_norm": 0.5952503681182861, + "learning_rate": 4.03036030213928e-06, + "loss": 0.5464, + "step": 5590 + }, + { + "epoch": 0.8822786807637684, + "grad_norm": 0.5684595108032227, + "learning_rate": 4.0300318739461285e-06, + "loss": 0.5753, + "step": 5591 + }, + { + "epoch": 0.8824364841407606, + "grad_norm": 0.5991286635398865, + "learning_rate": 4.029703403527824e-06, + "loss": 0.5605, + "step": 5592 + }, + { + "epoch": 0.8825942875177529, + "grad_norm": 0.5863239169120789, + "learning_rate": 4.029374890893432e-06, + "loss": 0.5952, + "step": 5593 + }, + { + "epoch": 0.8827520908947452, + "grad_norm": 0.6047069430351257, + "learning_rate": 4.029046336052017e-06, + "loss": 0.5949, + "step": 5594 + }, + { + "epoch": 0.8829098942717374, + "grad_norm": 0.5626243948936462, + "learning_rate": 4.028717739012647e-06, + "loss": 0.6065, + "step": 5595 + }, + { + "epoch": 0.8830676976487297, + "grad_norm": 0.5704299807548523, + "learning_rate": 4.028389099784392e-06, + "loss": 0.599, + "step": 5596 + }, + { + "epoch": 0.883225501025722, + "grad_norm": 0.5737431049346924, + "learning_rate": 4.02806041837632e-06, + "loss": 0.5745, + "step": 5597 + }, + { + "epoch": 0.8833833044027142, + "grad_norm": 0.5735402703285217, + "learning_rate": 4.027731694797502e-06, + "loss": 0.5158, + "step": 5598 + }, + { + "epoch": 0.8835411077797065, + "grad_norm": 0.5685769319534302, + "learning_rate": 4.02740292905701e-06, + "loss": 0.5216, + "step": 5599 + }, + { + "epoch": 0.8836989111566987, + "grad_norm": 0.5995299220085144, + "learning_rate": 4.027074121163918e-06, + "loss": 0.6022, + "step": 5600 + }, + { + "epoch": 0.883856714533691, + "grad_norm": 0.5806185603141785, + "learning_rate": 4.0267452711273e-06, + "loss": 0.5402, + "step": 5601 + }, + { + "epoch": 0.8840145179106833, + "grad_norm": 0.5772287249565125, + "learning_rate": 4.026416378956229e-06, + "loss": 0.5557, + "step": 5602 + }, + { + "epoch": 0.8841723212876755, + "grad_norm": 0.5798015594482422, + "learning_rate": 4.026087444659786e-06, + "loss": 0.5522, + "step": 5603 + }, + { + "epoch": 0.8843301246646679, + "grad_norm": 0.5765863656997681, + "learning_rate": 4.025758468247045e-06, + "loss": 0.5684, + "step": 5604 + }, + { + "epoch": 0.8844879280416601, + "grad_norm": 0.5694183111190796, + "learning_rate": 4.025429449727087e-06, + "loss": 0.5659, + "step": 5605 + }, + { + "epoch": 0.8846457314186523, + "grad_norm": 0.6030685305595398, + "learning_rate": 4.025100389108992e-06, + "loss": 0.5387, + "step": 5606 + }, + { + "epoch": 0.8848035347956447, + "grad_norm": 0.5819602012634277, + "learning_rate": 4.02477128640184e-06, + "loss": 0.5666, + "step": 5607 + }, + { + "epoch": 0.8849613381726369, + "grad_norm": 0.5837131142616272, + "learning_rate": 4.024442141614715e-06, + "loss": 0.5592, + "step": 5608 + }, + { + "epoch": 0.8851191415496291, + "grad_norm": 0.5907478332519531, + "learning_rate": 4.0241129547567e-06, + "loss": 0.5573, + "step": 5609 + }, + { + "epoch": 0.8852769449266215, + "grad_norm": 0.5781798958778381, + "learning_rate": 4.023783725836879e-06, + "loss": 0.5519, + "step": 5610 + }, + { + "epoch": 0.8854347483036137, + "grad_norm": 0.5699521899223328, + "learning_rate": 4.0234544548643385e-06, + "loss": 0.5593, + "step": 5611 + }, + { + "epoch": 0.8855925516806059, + "grad_norm": 0.5809233784675598, + "learning_rate": 4.0231251418481655e-06, + "loss": 0.5798, + "step": 5612 + }, + { + "epoch": 0.8857503550575982, + "grad_norm": 0.5909817814826965, + "learning_rate": 4.022795786797449e-06, + "loss": 0.5396, + "step": 5613 + }, + { + "epoch": 0.8859081584345905, + "grad_norm": 0.5520825386047363, + "learning_rate": 4.022466389721277e-06, + "loss": 0.5895, + "step": 5614 + }, + { + "epoch": 0.8860659618115828, + "grad_norm": 0.576181173324585, + "learning_rate": 4.022136950628741e-06, + "loss": 0.5603, + "step": 5615 + }, + { + "epoch": 0.886223765188575, + "grad_norm": 0.5893725752830505, + "learning_rate": 4.021807469528933e-06, + "loss": 0.5439, + "step": 5616 + }, + { + "epoch": 0.8863815685655673, + "grad_norm": 0.7159174084663391, + "learning_rate": 4.021477946430945e-06, + "loss": 0.5624, + "step": 5617 + }, + { + "epoch": 0.8865393719425596, + "grad_norm": 0.5615558624267578, + "learning_rate": 4.021148381343871e-06, + "loss": 0.5474, + "step": 5618 + }, + { + "epoch": 0.8866971753195518, + "grad_norm": 0.609403133392334, + "learning_rate": 4.020818774276808e-06, + "loss": 0.5596, + "step": 5619 + }, + { + "epoch": 0.8868549786965441, + "grad_norm": 0.6063580513000488, + "learning_rate": 4.02048912523885e-06, + "loss": 0.5382, + "step": 5620 + }, + { + "epoch": 0.8870127820735364, + "grad_norm": 0.5830378532409668, + "learning_rate": 4.0201594342390955e-06, + "loss": 0.5781, + "step": 5621 + }, + { + "epoch": 0.8871705854505286, + "grad_norm": 0.5779776573181152, + "learning_rate": 4.019829701286643e-06, + "loss": 0.5288, + "step": 5622 + }, + { + "epoch": 0.887328388827521, + "grad_norm": 0.6026723980903625, + "learning_rate": 4.0194999263905925e-06, + "loss": 0.5607, + "step": 5623 + }, + { + "epoch": 0.8874861922045132, + "grad_norm": 0.5841547250747681, + "learning_rate": 4.019170109560046e-06, + "loss": 0.5802, + "step": 5624 + }, + { + "epoch": 0.8876439955815054, + "grad_norm": 0.558131992816925, + "learning_rate": 4.018840250804103e-06, + "loss": 0.5465, + "step": 5625 + }, + { + "epoch": 0.8878017989584978, + "grad_norm": 0.6212999820709229, + "learning_rate": 4.018510350131869e-06, + "loss": 0.5446, + "step": 5626 + }, + { + "epoch": 0.88795960233549, + "grad_norm": 0.5781027674674988, + "learning_rate": 4.018180407552448e-06, + "loss": 0.5716, + "step": 5627 + }, + { + "epoch": 0.8881174057124822, + "grad_norm": 0.6017976999282837, + "learning_rate": 4.017850423074946e-06, + "loss": 0.5789, + "step": 5628 + }, + { + "epoch": 0.8882752090894745, + "grad_norm": 0.5797131061553955, + "learning_rate": 4.0175203967084685e-06, + "loss": 0.5832, + "step": 5629 + }, + { + "epoch": 0.8884330124664668, + "grad_norm": 0.575202465057373, + "learning_rate": 4.017190328462124e-06, + "loss": 0.5597, + "step": 5630 + }, + { + "epoch": 0.888590815843459, + "grad_norm": 0.5552891492843628, + "learning_rate": 4.016860218345022e-06, + "loss": 0.5817, + "step": 5631 + }, + { + "epoch": 0.8887486192204513, + "grad_norm": 0.5747065544128418, + "learning_rate": 4.016530066366272e-06, + "loss": 0.5455, + "step": 5632 + }, + { + "epoch": 0.8889064225974436, + "grad_norm": 0.5989831686019897, + "learning_rate": 4.016199872534987e-06, + "loss": 0.5598, + "step": 5633 + }, + { + "epoch": 0.8890642259744359, + "grad_norm": 0.5778947472572327, + "learning_rate": 4.0158696368602775e-06, + "loss": 0.6022, + "step": 5634 + }, + { + "epoch": 0.8892220293514281, + "grad_norm": 0.5863929986953735, + "learning_rate": 4.015539359351259e-06, + "loss": 0.5853, + "step": 5635 + }, + { + "epoch": 0.8893798327284204, + "grad_norm": 0.6042550206184387, + "learning_rate": 4.015209040017044e-06, + "loss": 0.5626, + "step": 5636 + }, + { + "epoch": 0.8895376361054127, + "grad_norm": 0.6644716858863831, + "learning_rate": 4.014878678866751e-06, + "loss": 0.5249, + "step": 5637 + }, + { + "epoch": 0.8896954394824049, + "grad_norm": 0.6195807456970215, + "learning_rate": 4.014548275909496e-06, + "loss": 0.5214, + "step": 5638 + }, + { + "epoch": 0.8898532428593972, + "grad_norm": 0.6331720352172852, + "learning_rate": 4.014217831154398e-06, + "loss": 0.5661, + "step": 5639 + }, + { + "epoch": 0.8900110462363895, + "grad_norm": 0.5999831557273865, + "learning_rate": 4.013887344610575e-06, + "loss": 0.5696, + "step": 5640 + }, + { + "epoch": 0.8901688496133817, + "grad_norm": 0.5597672462463379, + "learning_rate": 4.0135568162871484e-06, + "loss": 0.5809, + "step": 5641 + }, + { + "epoch": 0.890326652990374, + "grad_norm": 0.5898681282997131, + "learning_rate": 4.013226246193242e-06, + "loss": 0.5486, + "step": 5642 + }, + { + "epoch": 0.8904844563673663, + "grad_norm": 0.588788628578186, + "learning_rate": 4.012895634337974e-06, + "loss": 0.5766, + "step": 5643 + }, + { + "epoch": 0.8906422597443585, + "grad_norm": 0.6107369661331177, + "learning_rate": 4.0125649807304735e-06, + "loss": 0.5692, + "step": 5644 + }, + { + "epoch": 0.8908000631213508, + "grad_norm": 0.5704821944236755, + "learning_rate": 4.012234285379862e-06, + "loss": 0.5948, + "step": 5645 + }, + { + "epoch": 0.8909578664983431, + "grad_norm": 0.6037918925285339, + "learning_rate": 4.011903548295268e-06, + "loss": 0.5611, + "step": 5646 + }, + { + "epoch": 0.8911156698753353, + "grad_norm": 0.6080042719841003, + "learning_rate": 4.01157276948582e-06, + "loss": 0.5428, + "step": 5647 + }, + { + "epoch": 0.8912734732523276, + "grad_norm": 0.6002389192581177, + "learning_rate": 4.011241948960644e-06, + "loss": 0.5717, + "step": 5648 + }, + { + "epoch": 0.8914312766293199, + "grad_norm": 0.5571288466453552, + "learning_rate": 4.01091108672887e-06, + "loss": 0.5551, + "step": 5649 + }, + { + "epoch": 0.8915890800063121, + "grad_norm": 0.5939801931381226, + "learning_rate": 4.010580182799632e-06, + "loss": 0.5581, + "step": 5650 + }, + { + "epoch": 0.8917468833833044, + "grad_norm": 0.5857510566711426, + "learning_rate": 4.010249237182059e-06, + "loss": 0.572, + "step": 5651 + }, + { + "epoch": 0.8919046867602967, + "grad_norm": 0.5710021257400513, + "learning_rate": 4.009918249885285e-06, + "loss": 0.5663, + "step": 5652 + }, + { + "epoch": 0.8920624901372889, + "grad_norm": 0.5478670001029968, + "learning_rate": 4.009587220918446e-06, + "loss": 0.5577, + "step": 5653 + }, + { + "epoch": 0.8922202935142812, + "grad_norm": 0.5785094499588013, + "learning_rate": 4.009256150290676e-06, + "loss": 0.541, + "step": 5654 + }, + { + "epoch": 0.8923780968912735, + "grad_norm": 0.5591427087783813, + "learning_rate": 4.0089250380111125e-06, + "loss": 0.5279, + "step": 5655 + }, + { + "epoch": 0.8925359002682658, + "grad_norm": 0.5948924422264099, + "learning_rate": 4.008593884088893e-06, + "loss": 0.5828, + "step": 5656 + }, + { + "epoch": 0.892693703645258, + "grad_norm": 0.5635426044464111, + "learning_rate": 4.008262688533155e-06, + "loss": 0.5456, + "step": 5657 + }, + { + "epoch": 0.8928515070222502, + "grad_norm": 0.5819272994995117, + "learning_rate": 4.0079314513530425e-06, + "loss": 0.5983, + "step": 5658 + }, + { + "epoch": 0.8930093103992426, + "grad_norm": 0.5854888558387756, + "learning_rate": 4.007600172557694e-06, + "loss": 0.5597, + "step": 5659 + }, + { + "epoch": 0.8931671137762348, + "grad_norm": 0.6060248613357544, + "learning_rate": 4.007268852156253e-06, + "loss": 0.5319, + "step": 5660 + }, + { + "epoch": 0.893324917153227, + "grad_norm": 0.6185106039047241, + "learning_rate": 4.006937490157863e-06, + "loss": 0.6044, + "step": 5661 + }, + { + "epoch": 0.8934827205302194, + "grad_norm": 0.5811423063278198, + "learning_rate": 4.006606086571667e-06, + "loss": 0.587, + "step": 5662 + }, + { + "epoch": 0.8936405239072116, + "grad_norm": 0.5949921011924744, + "learning_rate": 4.0062746414068146e-06, + "loss": 0.5197, + "step": 5663 + }, + { + "epoch": 0.8937983272842038, + "grad_norm": 0.5681066513061523, + "learning_rate": 4.005943154672449e-06, + "loss": 0.5922, + "step": 5664 + }, + { + "epoch": 0.8939561306611962, + "grad_norm": 0.5606364011764526, + "learning_rate": 4.0056116263777215e-06, + "loss": 0.5662, + "step": 5665 + }, + { + "epoch": 0.8941139340381884, + "grad_norm": 0.5794999599456787, + "learning_rate": 4.00528005653178e-06, + "loss": 0.5338, + "step": 5666 + }, + { + "epoch": 0.8942717374151807, + "grad_norm": 0.5812636017799377, + "learning_rate": 4.004948445143774e-06, + "loss": 0.561, + "step": 5667 + }, + { + "epoch": 0.894429540792173, + "grad_norm": 0.595929741859436, + "learning_rate": 4.004616792222858e-06, + "loss": 0.5614, + "step": 5668 + }, + { + "epoch": 0.8945873441691652, + "grad_norm": 0.6212317943572998, + "learning_rate": 4.004285097778183e-06, + "loss": 0.5267, + "step": 5669 + }, + { + "epoch": 0.8947451475461575, + "grad_norm": 0.6025255918502808, + "learning_rate": 4.003953361818903e-06, + "loss": 0.5568, + "step": 5670 + }, + { + "epoch": 0.8949029509231498, + "grad_norm": 0.5998236536979675, + "learning_rate": 4.003621584354173e-06, + "loss": 0.5847, + "step": 5671 + }, + { + "epoch": 0.895060754300142, + "grad_norm": 0.5700008273124695, + "learning_rate": 4.00328976539315e-06, + "loss": 0.5788, + "step": 5672 + }, + { + "epoch": 0.8952185576771343, + "grad_norm": 0.590638279914856, + "learning_rate": 4.00295790494499e-06, + "loss": 0.5712, + "step": 5673 + }, + { + "epoch": 0.8953763610541265, + "grad_norm": 0.6008650064468384, + "learning_rate": 4.002626003018853e-06, + "loss": 0.5803, + "step": 5674 + }, + { + "epoch": 0.8955341644311188, + "grad_norm": 0.5650309920310974, + "learning_rate": 4.002294059623898e-06, + "loss": 0.5511, + "step": 5675 + }, + { + "epoch": 0.8956919678081111, + "grad_norm": 0.6218993067741394, + "learning_rate": 4.001962074769287e-06, + "loss": 0.4881, + "step": 5676 + }, + { + "epoch": 0.8958497711851033, + "grad_norm": 0.5905233025550842, + "learning_rate": 4.00163004846418e-06, + "loss": 0.5762, + "step": 5677 + }, + { + "epoch": 0.8960075745620957, + "grad_norm": 0.6009418964385986, + "learning_rate": 4.001297980717741e-06, + "loss": 0.5898, + "step": 5678 + }, + { + "epoch": 0.8961653779390879, + "grad_norm": 0.5739084482192993, + "learning_rate": 4.000965871539134e-06, + "loss": 0.57, + "step": 5679 + }, + { + "epoch": 0.8963231813160801, + "grad_norm": 0.5703338384628296, + "learning_rate": 4.0006337209375255e-06, + "loss": 0.5413, + "step": 5680 + }, + { + "epoch": 0.8964809846930725, + "grad_norm": 0.5877112746238708, + "learning_rate": 4.00030152892208e-06, + "loss": 0.5679, + "step": 5681 + }, + { + "epoch": 0.8966387880700647, + "grad_norm": 0.5712066292762756, + "learning_rate": 3.999969295501969e-06, + "loss": 0.5942, + "step": 5682 + }, + { + "epoch": 0.8967965914470569, + "grad_norm": 0.5816266536712646, + "learning_rate": 3.9996370206863566e-06, + "loss": 0.5891, + "step": 5683 + }, + { + "epoch": 0.8969543948240493, + "grad_norm": 0.5866891145706177, + "learning_rate": 3.999304704484416e-06, + "loss": 0.5699, + "step": 5684 + }, + { + "epoch": 0.8971121982010415, + "grad_norm": 0.5562590956687927, + "learning_rate": 3.998972346905317e-06, + "loss": 0.5713, + "step": 5685 + }, + { + "epoch": 0.8972700015780337, + "grad_norm": 0.5830860733985901, + "learning_rate": 3.998639947958232e-06, + "loss": 0.5621, + "step": 5686 + }, + { + "epoch": 0.897427804955026, + "grad_norm": 0.6026306748390198, + "learning_rate": 3.9983075076523345e-06, + "loss": 0.5595, + "step": 5687 + }, + { + "epoch": 0.8975856083320183, + "grad_norm": 0.5615721344947815, + "learning_rate": 3.9979750259967986e-06, + "loss": 0.5737, + "step": 5688 + }, + { + "epoch": 0.8977434117090106, + "grad_norm": 0.5925778150558472, + "learning_rate": 3.9976425030008015e-06, + "loss": 0.5657, + "step": 5689 + }, + { + "epoch": 0.8979012150860028, + "grad_norm": 0.5739099979400635, + "learning_rate": 3.997309938673518e-06, + "loss": 0.5681, + "step": 5690 + }, + { + "epoch": 0.8980590184629951, + "grad_norm": 0.587955117225647, + "learning_rate": 3.996977333024128e-06, + "loss": 0.5625, + "step": 5691 + }, + { + "epoch": 0.8982168218399874, + "grad_norm": 0.6159525513648987, + "learning_rate": 3.996644686061809e-06, + "loss": 0.5947, + "step": 5692 + }, + { + "epoch": 0.8983746252169796, + "grad_norm": 0.5753665566444397, + "learning_rate": 3.996311997795743e-06, + "loss": 0.549, + "step": 5693 + }, + { + "epoch": 0.8985324285939719, + "grad_norm": 0.6028928756713867, + "learning_rate": 3.995979268235109e-06, + "loss": 0.5624, + "step": 5694 + }, + { + "epoch": 0.8986902319709642, + "grad_norm": 0.6065410375595093, + "learning_rate": 3.995646497389092e-06, + "loss": 0.5585, + "step": 5695 + }, + { + "epoch": 0.8988480353479564, + "grad_norm": 0.6417369842529297, + "learning_rate": 3.995313685266874e-06, + "loss": 0.5232, + "step": 5696 + }, + { + "epoch": 0.8990058387249488, + "grad_norm": 0.6338962316513062, + "learning_rate": 3.99498083187764e-06, + "loss": 0.5641, + "step": 5697 + }, + { + "epoch": 0.899163642101941, + "grad_norm": 0.5895280838012695, + "learning_rate": 3.994647937230577e-06, + "loss": 0.5665, + "step": 5698 + }, + { + "epoch": 0.8993214454789332, + "grad_norm": 0.595419704914093, + "learning_rate": 3.99431500133487e-06, + "loss": 0.5808, + "step": 5699 + }, + { + "epoch": 0.8994792488559256, + "grad_norm": 0.6135740280151367, + "learning_rate": 3.993982024199709e-06, + "loss": 0.5726, + "step": 5700 + }, + { + "epoch": 0.8996370522329178, + "grad_norm": 0.6238461136817932, + "learning_rate": 3.993649005834283e-06, + "loss": 0.5631, + "step": 5701 + }, + { + "epoch": 0.89979485560991, + "grad_norm": 0.5779857635498047, + "learning_rate": 3.993315946247783e-06, + "loss": 0.5801, + "step": 5702 + }, + { + "epoch": 0.8999526589869024, + "grad_norm": 0.5969676375389099, + "learning_rate": 3.9929828454494e-06, + "loss": 0.561, + "step": 5703 + }, + { + "epoch": 0.9001104623638946, + "grad_norm": 0.5867944359779358, + "learning_rate": 3.992649703448327e-06, + "loss": 0.5582, + "step": 5704 + }, + { + "epoch": 0.9002682657408868, + "grad_norm": 0.5639466047286987, + "learning_rate": 3.992316520253757e-06, + "loss": 0.6095, + "step": 5705 + }, + { + "epoch": 0.9004260691178791, + "grad_norm": 0.5843953490257263, + "learning_rate": 3.991983295874886e-06, + "loss": 0.5932, + "step": 5706 + }, + { + "epoch": 0.9005838724948714, + "grad_norm": 0.5596357583999634, + "learning_rate": 3.991650030320909e-06, + "loss": 0.573, + "step": 5707 + }, + { + "epoch": 0.9007416758718637, + "grad_norm": 0.6051388382911682, + "learning_rate": 3.991316723601027e-06, + "loss": 0.5632, + "step": 5708 + }, + { + "epoch": 0.9008994792488559, + "grad_norm": 0.5983260869979858, + "learning_rate": 3.990983375724433e-06, + "loss": 0.5238, + "step": 5709 + }, + { + "epoch": 0.9010572826258482, + "grad_norm": 0.5659254789352417, + "learning_rate": 3.9906499867003305e-06, + "loss": 0.5628, + "step": 5710 + }, + { + "epoch": 0.9012150860028405, + "grad_norm": 0.5988951325416565, + "learning_rate": 3.9903165565379185e-06, + "loss": 0.5209, + "step": 5711 + }, + { + "epoch": 0.9013728893798327, + "grad_norm": 0.6200394034385681, + "learning_rate": 3.9899830852464e-06, + "loss": 0.5459, + "step": 5712 + }, + { + "epoch": 0.901530692756825, + "grad_norm": 0.6202958226203918, + "learning_rate": 3.989649572834977e-06, + "loss": 0.6, + "step": 5713 + }, + { + "epoch": 0.9016884961338173, + "grad_norm": 0.6156406402587891, + "learning_rate": 3.989316019312853e-06, + "loss": 0.5579, + "step": 5714 + }, + { + "epoch": 0.9018462995108095, + "grad_norm": 0.5885875225067139, + "learning_rate": 3.988982424689236e-06, + "loss": 0.5743, + "step": 5715 + }, + { + "epoch": 0.9020041028878018, + "grad_norm": 0.60193932056427, + "learning_rate": 3.9886487889733296e-06, + "loss": 0.572, + "step": 5716 + }, + { + "epoch": 0.9021619062647941, + "grad_norm": 0.5557190775871277, + "learning_rate": 3.9883151121743424e-06, + "loss": 0.5731, + "step": 5717 + }, + { + "epoch": 0.9023197096417863, + "grad_norm": 0.577587366104126, + "learning_rate": 3.987981394301484e-06, + "loss": 0.6011, + "step": 5718 + }, + { + "epoch": 0.9024775130187787, + "grad_norm": 0.5888763070106506, + "learning_rate": 3.987647635363962e-06, + "loss": 0.5533, + "step": 5719 + }, + { + "epoch": 0.9026353163957709, + "grad_norm": 0.6021528244018555, + "learning_rate": 3.98731383537099e-06, + "loss": 0.5495, + "step": 5720 + }, + { + "epoch": 0.9027931197727631, + "grad_norm": 0.5757094025611877, + "learning_rate": 3.986979994331778e-06, + "loss": 0.5711, + "step": 5721 + }, + { + "epoch": 0.9029509231497554, + "grad_norm": 0.5928071737289429, + "learning_rate": 3.98664611225554e-06, + "loss": 0.5404, + "step": 5722 + }, + { + "epoch": 0.9031087265267477, + "grad_norm": 0.5899053812026978, + "learning_rate": 3.98631218915149e-06, + "loss": 0.5548, + "step": 5723 + }, + { + "epoch": 0.9032665299037399, + "grad_norm": 0.5725361704826355, + "learning_rate": 3.985978225028844e-06, + "loss": 0.5082, + "step": 5724 + }, + { + "epoch": 0.9034243332807322, + "grad_norm": 0.5912917256355286, + "learning_rate": 3.985644219896817e-06, + "loss": 0.5508, + "step": 5725 + }, + { + "epoch": 0.9035821366577245, + "grad_norm": 0.6147830486297607, + "learning_rate": 3.9853101737646295e-06, + "loss": 0.5699, + "step": 5726 + }, + { + "epoch": 0.9037399400347167, + "grad_norm": 0.5878756046295166, + "learning_rate": 3.9849760866414985e-06, + "loss": 0.5638, + "step": 5727 + }, + { + "epoch": 0.903897743411709, + "grad_norm": 0.6259045004844666, + "learning_rate": 3.9846419585366435e-06, + "loss": 0.5635, + "step": 5728 + }, + { + "epoch": 0.9040555467887013, + "grad_norm": 0.5870434045791626, + "learning_rate": 3.984307789459288e-06, + "loss": 0.5509, + "step": 5729 + }, + { + "epoch": 0.9042133501656936, + "grad_norm": 0.5841573476791382, + "learning_rate": 3.983973579418651e-06, + "loss": 0.5756, + "step": 5730 + }, + { + "epoch": 0.9043711535426858, + "grad_norm": 0.5838530659675598, + "learning_rate": 3.983639328423959e-06, + "loss": 0.5775, + "step": 5731 + }, + { + "epoch": 0.904528956919678, + "grad_norm": 0.5853005051612854, + "learning_rate": 3.983305036484435e-06, + "loss": 0.557, + "step": 5732 + }, + { + "epoch": 0.9046867602966704, + "grad_norm": 0.5814350843429565, + "learning_rate": 3.982970703609304e-06, + "loss": 0.5809, + "step": 5733 + }, + { + "epoch": 0.9048445636736626, + "grad_norm": 0.5612041354179382, + "learning_rate": 3.982636329807794e-06, + "loss": 0.54, + "step": 5734 + }, + { + "epoch": 0.9050023670506548, + "grad_norm": 0.6068333983421326, + "learning_rate": 3.982301915089133e-06, + "loss": 0.5635, + "step": 5735 + }, + { + "epoch": 0.9051601704276472, + "grad_norm": 0.6136669516563416, + "learning_rate": 3.981967459462548e-06, + "loss": 0.5418, + "step": 5736 + }, + { + "epoch": 0.9053179738046394, + "grad_norm": 0.5634037256240845, + "learning_rate": 3.981632962937272e-06, + "loss": 0.556, + "step": 5737 + }, + { + "epoch": 0.9054757771816316, + "grad_norm": 0.5844433903694153, + "learning_rate": 3.981298425522534e-06, + "loss": 0.5609, + "step": 5738 + }, + { + "epoch": 0.905633580558624, + "grad_norm": 0.5859827399253845, + "learning_rate": 3.980963847227568e-06, + "loss": 0.5627, + "step": 5739 + }, + { + "epoch": 0.9057913839356162, + "grad_norm": 0.5512230396270752, + "learning_rate": 3.980629228061607e-06, + "loss": 0.5247, + "step": 5740 + }, + { + "epoch": 0.9059491873126085, + "grad_norm": 0.5475929975509644, + "learning_rate": 3.980294568033884e-06, + "loss": 0.5762, + "step": 5741 + }, + { + "epoch": 0.9061069906896008, + "grad_norm": 0.5847753286361694, + "learning_rate": 3.9799598671536384e-06, + "loss": 0.5894, + "step": 5742 + }, + { + "epoch": 0.906264794066593, + "grad_norm": 0.6030057072639465, + "learning_rate": 3.979625125430104e-06, + "loss": 0.5192, + "step": 5743 + }, + { + "epoch": 0.9064225974435853, + "grad_norm": 0.5581523776054382, + "learning_rate": 3.97929034287252e-06, + "loss": 0.5709, + "step": 5744 + }, + { + "epoch": 0.9065804008205776, + "grad_norm": 0.5635527968406677, + "learning_rate": 3.978955519490126e-06, + "loss": 0.5622, + "step": 5745 + }, + { + "epoch": 0.9067382041975698, + "grad_norm": 0.5919292569160461, + "learning_rate": 3.9786206552921615e-06, + "loss": 0.5611, + "step": 5746 + }, + { + "epoch": 0.9068960075745621, + "grad_norm": 0.5729972124099731, + "learning_rate": 3.978285750287868e-06, + "loss": 0.5853, + "step": 5747 + }, + { + "epoch": 0.9070538109515544, + "grad_norm": 0.5804927349090576, + "learning_rate": 3.977950804486489e-06, + "loss": 0.5506, + "step": 5748 + }, + { + "epoch": 0.9072116143285466, + "grad_norm": 0.5547699928283691, + "learning_rate": 3.977615817897267e-06, + "loss": 0.57, + "step": 5749 + }, + { + "epoch": 0.9073694177055389, + "grad_norm": 0.5410853028297424, + "learning_rate": 3.977280790529447e-06, + "loss": 0.531, + "step": 5750 + }, + { + "epoch": 0.9075272210825311, + "grad_norm": 0.603255569934845, + "learning_rate": 3.976945722392276e-06, + "loss": 0.5766, + "step": 5751 + }, + { + "epoch": 0.9076850244595235, + "grad_norm": 0.5439370274543762, + "learning_rate": 3.976610613495001e-06, + "loss": 0.5163, + "step": 5752 + }, + { + "epoch": 0.9078428278365157, + "grad_norm": 0.5958177447319031, + "learning_rate": 3.976275463846868e-06, + "loss": 0.5549, + "step": 5753 + }, + { + "epoch": 0.9080006312135079, + "grad_norm": 0.5929274559020996, + "learning_rate": 3.975940273457129e-06, + "loss": 0.5497, + "step": 5754 + }, + { + "epoch": 0.9081584345905003, + "grad_norm": 0.6018033027648926, + "learning_rate": 3.975605042335032e-06, + "loss": 0.5697, + "step": 5755 + }, + { + "epoch": 0.9083162379674925, + "grad_norm": 0.5595354437828064, + "learning_rate": 3.975269770489831e-06, + "loss": 0.5514, + "step": 5756 + }, + { + "epoch": 0.9084740413444847, + "grad_norm": 0.6109285950660706, + "learning_rate": 3.974934457930777e-06, + "loss": 0.5775, + "step": 5757 + }, + { + "epoch": 0.9086318447214771, + "grad_norm": 0.618346095085144, + "learning_rate": 3.974599104667124e-06, + "loss": 0.5307, + "step": 5758 + }, + { + "epoch": 0.9087896480984693, + "grad_norm": 0.6056841015815735, + "learning_rate": 3.974263710708128e-06, + "loss": 0.5389, + "step": 5759 + }, + { + "epoch": 0.9089474514754615, + "grad_norm": 0.5807531476020813, + "learning_rate": 3.9739282760630445e-06, + "loss": 0.5879, + "step": 5760 + }, + { + "epoch": 0.9091052548524539, + "grad_norm": 0.5923309326171875, + "learning_rate": 3.97359280074113e-06, + "loss": 0.5651, + "step": 5761 + }, + { + "epoch": 0.9092630582294461, + "grad_norm": 0.5568239092826843, + "learning_rate": 3.973257284751643e-06, + "loss": 0.5524, + "step": 5762 + }, + { + "epoch": 0.9094208616064384, + "grad_norm": 0.5584681034088135, + "learning_rate": 3.972921728103844e-06, + "loss": 0.5397, + "step": 5763 + }, + { + "epoch": 0.9095786649834307, + "grad_norm": 0.5921055674552917, + "learning_rate": 3.972586130806992e-06, + "loss": 0.582, + "step": 5764 + }, + { + "epoch": 0.9097364683604229, + "grad_norm": 0.6020343899726868, + "learning_rate": 3.9722504928703505e-06, + "loss": 0.5455, + "step": 5765 + }, + { + "epoch": 0.9098942717374152, + "grad_norm": 0.631663978099823, + "learning_rate": 3.971914814303181e-06, + "loss": 0.5647, + "step": 5766 + }, + { + "epoch": 0.9100520751144074, + "grad_norm": 0.5893954038619995, + "learning_rate": 3.971579095114747e-06, + "loss": 0.5785, + "step": 5767 + }, + { + "epoch": 0.9102098784913997, + "grad_norm": 0.6311924457550049, + "learning_rate": 3.971243335314314e-06, + "loss": 0.5705, + "step": 5768 + }, + { + "epoch": 0.910367681868392, + "grad_norm": 0.6038501858711243, + "learning_rate": 3.970907534911149e-06, + "loss": 0.5688, + "step": 5769 + }, + { + "epoch": 0.9105254852453842, + "grad_norm": 0.5981193780899048, + "learning_rate": 3.970571693914518e-06, + "loss": 0.5495, + "step": 5770 + }, + { + "epoch": 0.9106832886223765, + "grad_norm": 0.6011204123497009, + "learning_rate": 3.970235812333691e-06, + "loss": 0.5283, + "step": 5771 + }, + { + "epoch": 0.9108410919993688, + "grad_norm": 0.6347639560699463, + "learning_rate": 3.9698998901779355e-06, + "loss": 0.5851, + "step": 5772 + }, + { + "epoch": 0.910998895376361, + "grad_norm": 0.5971699357032776, + "learning_rate": 3.969563927456524e-06, + "loss": 0.6053, + "step": 5773 + }, + { + "epoch": 0.9111566987533534, + "grad_norm": 0.6111212968826294, + "learning_rate": 3.969227924178727e-06, + "loss": 0.5687, + "step": 5774 + }, + { + "epoch": 0.9113145021303456, + "grad_norm": 0.59257972240448, + "learning_rate": 3.968891880353817e-06, + "loss": 0.6002, + "step": 5775 + }, + { + "epoch": 0.9114723055073378, + "grad_norm": 0.5842499136924744, + "learning_rate": 3.96855579599107e-06, + "loss": 0.5726, + "step": 5776 + }, + { + "epoch": 0.9116301088843302, + "grad_norm": 0.577385663986206, + "learning_rate": 3.96821967109976e-06, + "loss": 0.5309, + "step": 5777 + }, + { + "epoch": 0.9117879122613224, + "grad_norm": 0.615770161151886, + "learning_rate": 3.967883505689162e-06, + "loss": 0.5822, + "step": 5778 + }, + { + "epoch": 0.9119457156383146, + "grad_norm": 0.5667274594306946, + "learning_rate": 3.967547299768555e-06, + "loss": 0.5581, + "step": 5779 + }, + { + "epoch": 0.912103519015307, + "grad_norm": 0.5740767121315002, + "learning_rate": 3.967211053347217e-06, + "loss": 0.5255, + "step": 5780 + }, + { + "epoch": 0.9122613223922992, + "grad_norm": 0.580782413482666, + "learning_rate": 3.966874766434427e-06, + "loss": 0.5618, + "step": 5781 + }, + { + "epoch": 0.9124191257692915, + "grad_norm": 0.5992459058761597, + "learning_rate": 3.966538439039467e-06, + "loss": 0.5849, + "step": 5782 + }, + { + "epoch": 0.9125769291462837, + "grad_norm": 0.6006315350532532, + "learning_rate": 3.966202071171617e-06, + "loss": 0.5674, + "step": 5783 + }, + { + "epoch": 0.912734732523276, + "grad_norm": 0.5770111083984375, + "learning_rate": 3.9658656628401616e-06, + "loss": 0.59, + "step": 5784 + }, + { + "epoch": 0.9128925359002683, + "grad_norm": 0.6002102494239807, + "learning_rate": 3.9655292140543845e-06, + "loss": 0.5769, + "step": 5785 + }, + { + "epoch": 0.9130503392772605, + "grad_norm": 0.5749130845069885, + "learning_rate": 3.96519272482357e-06, + "loss": 0.5705, + "step": 5786 + }, + { + "epoch": 0.9132081426542528, + "grad_norm": 0.5619394779205322, + "learning_rate": 3.9648561951570056e-06, + "loss": 0.5519, + "step": 5787 + }, + { + "epoch": 0.9133659460312451, + "grad_norm": 0.6139616370201111, + "learning_rate": 3.9645196250639775e-06, + "loss": 0.5558, + "step": 5788 + }, + { + "epoch": 0.9135237494082373, + "grad_norm": 0.6073816418647766, + "learning_rate": 3.964183014553775e-06, + "loss": 0.5678, + "step": 5789 + }, + { + "epoch": 0.9136815527852296, + "grad_norm": 0.5666418671607971, + "learning_rate": 3.963846363635687e-06, + "loss": 0.5689, + "step": 5790 + }, + { + "epoch": 0.9138393561622219, + "grad_norm": 0.6093915104866028, + "learning_rate": 3.963509672319005e-06, + "loss": 0.5673, + "step": 5791 + }, + { + "epoch": 0.9139971595392141, + "grad_norm": 0.6267327666282654, + "learning_rate": 3.963172940613021e-06, + "loss": 0.5748, + "step": 5792 + }, + { + "epoch": 0.9141549629162065, + "grad_norm": 0.6033337116241455, + "learning_rate": 3.962836168527027e-06, + "loss": 0.5517, + "step": 5793 + }, + { + "epoch": 0.9143127662931987, + "grad_norm": 0.6096192598342896, + "learning_rate": 3.9624993560703185e-06, + "loss": 0.579, + "step": 5794 + }, + { + "epoch": 0.9144705696701909, + "grad_norm": 0.6102752089500427, + "learning_rate": 3.962162503252189e-06, + "loss": 0.5924, + "step": 5795 + }, + { + "epoch": 0.9146283730471833, + "grad_norm": 0.5991487503051758, + "learning_rate": 3.9618256100819355e-06, + "loss": 0.5566, + "step": 5796 + }, + { + "epoch": 0.9147861764241755, + "grad_norm": 0.5979308485984802, + "learning_rate": 3.961488676568856e-06, + "loss": 0.5479, + "step": 5797 + }, + { + "epoch": 0.9149439798011677, + "grad_norm": 0.6254630088806152, + "learning_rate": 3.961151702722249e-06, + "loss": 0.5569, + "step": 5798 + }, + { + "epoch": 0.91510178317816, + "grad_norm": 0.6199724078178406, + "learning_rate": 3.960814688551414e-06, + "loss": 0.5954, + "step": 5799 + }, + { + "epoch": 0.9152595865551523, + "grad_norm": 0.5964557528495789, + "learning_rate": 3.9604776340656506e-06, + "loss": 0.5717, + "step": 5800 + }, + { + "epoch": 0.9154173899321445, + "grad_norm": 0.5954697728157043, + "learning_rate": 3.960140539274262e-06, + "loss": 0.5544, + "step": 5801 + }, + { + "epoch": 0.9155751933091368, + "grad_norm": 0.5851207375526428, + "learning_rate": 3.959803404186552e-06, + "loss": 0.5313, + "step": 5802 + }, + { + "epoch": 0.9157329966861291, + "grad_norm": 0.5867379903793335, + "learning_rate": 3.9594662288118225e-06, + "loss": 0.5561, + "step": 5803 + }, + { + "epoch": 0.9158908000631214, + "grad_norm": 0.5893707871437073, + "learning_rate": 3.9591290131593794e-06, + "loss": 0.5735, + "step": 5804 + }, + { + "epoch": 0.9160486034401136, + "grad_norm": 0.5582067370414734, + "learning_rate": 3.95879175723853e-06, + "loss": 0.5586, + "step": 5805 + }, + { + "epoch": 0.9162064068171059, + "grad_norm": 0.5787672400474548, + "learning_rate": 3.958454461058581e-06, + "loss": 0.5808, + "step": 5806 + }, + { + "epoch": 0.9163642101940982, + "grad_norm": 0.5827297568321228, + "learning_rate": 3.958117124628842e-06, + "loss": 0.5836, + "step": 5807 + }, + { + "epoch": 0.9165220135710904, + "grad_norm": 0.6069881319999695, + "learning_rate": 3.95777974795862e-06, + "loss": 0.5917, + "step": 5808 + }, + { + "epoch": 0.9166798169480826, + "grad_norm": 0.6226795315742493, + "learning_rate": 3.95744233105723e-06, + "loss": 0.6005, + "step": 5809 + }, + { + "epoch": 0.916837620325075, + "grad_norm": 0.5652250051498413, + "learning_rate": 3.957104873933979e-06, + "loss": 0.5853, + "step": 5810 + }, + { + "epoch": 0.9169954237020672, + "grad_norm": 0.6012566685676575, + "learning_rate": 3.956767376598184e-06, + "loss": 0.5536, + "step": 5811 + }, + { + "epoch": 0.9171532270790594, + "grad_norm": 0.606814980506897, + "learning_rate": 3.956429839059156e-06, + "loss": 0.574, + "step": 5812 + }, + { + "epoch": 0.9173110304560518, + "grad_norm": 0.6221573352813721, + "learning_rate": 3.9560922613262135e-06, + "loss": 0.5842, + "step": 5813 + }, + { + "epoch": 0.917468833833044, + "grad_norm": 0.5867617726325989, + "learning_rate": 3.95575464340867e-06, + "loss": 0.5749, + "step": 5814 + }, + { + "epoch": 0.9176266372100363, + "grad_norm": 0.5912563800811768, + "learning_rate": 3.955416985315845e-06, + "loss": 0.5497, + "step": 5815 + }, + { + "epoch": 0.9177844405870286, + "grad_norm": 0.6053569316864014, + "learning_rate": 3.955079287057056e-06, + "loss": 0.5243, + "step": 5816 + }, + { + "epoch": 0.9179422439640208, + "grad_norm": 0.5906084179878235, + "learning_rate": 3.9547415486416215e-06, + "loss": 0.5885, + "step": 5817 + }, + { + "epoch": 0.9181000473410131, + "grad_norm": 0.6050287485122681, + "learning_rate": 3.954403770078864e-06, + "loss": 0.5114, + "step": 5818 + }, + { + "epoch": 0.9182578507180054, + "grad_norm": 0.5606579780578613, + "learning_rate": 3.954065951378104e-06, + "loss": 0.5685, + "step": 5819 + }, + { + "epoch": 0.9184156540949976, + "grad_norm": 0.586421549320221, + "learning_rate": 3.953728092548667e-06, + "loss": 0.5789, + "step": 5820 + }, + { + "epoch": 0.9185734574719899, + "grad_norm": 0.5756598114967346, + "learning_rate": 3.953390193599874e-06, + "loss": 0.5627, + "step": 5821 + }, + { + "epoch": 0.9187312608489822, + "grad_norm": 0.5589200854301453, + "learning_rate": 3.953052254541052e-06, + "loss": 0.5778, + "step": 5822 + }, + { + "epoch": 0.9188890642259744, + "grad_norm": 0.5769863724708557, + "learning_rate": 3.952714275381527e-06, + "loss": 0.5497, + "step": 5823 + }, + { + "epoch": 0.9190468676029667, + "grad_norm": 0.5976510643959045, + "learning_rate": 3.952376256130627e-06, + "loss": 0.5655, + "step": 5824 + }, + { + "epoch": 0.919204670979959, + "grad_norm": 0.6074737310409546, + "learning_rate": 3.9520381967976785e-06, + "loss": 0.5712, + "step": 5825 + }, + { + "epoch": 0.9193624743569513, + "grad_norm": 0.6081387996673584, + "learning_rate": 3.951700097392012e-06, + "loss": 0.5647, + "step": 5826 + }, + { + "epoch": 0.9195202777339435, + "grad_norm": 0.5721692442893982, + "learning_rate": 3.95136195792296e-06, + "loss": 0.5462, + "step": 5827 + }, + { + "epoch": 0.9196780811109357, + "grad_norm": 0.5948020815849304, + "learning_rate": 3.9510237783998525e-06, + "loss": 0.5839, + "step": 5828 + }, + { + "epoch": 0.9198358844879281, + "grad_norm": 0.6479912996292114, + "learning_rate": 3.950685558832023e-06, + "loss": 0.5496, + "step": 5829 + }, + { + "epoch": 0.9199936878649203, + "grad_norm": 0.5943413376808167, + "learning_rate": 3.950347299228805e-06, + "loss": 0.5355, + "step": 5830 + }, + { + "epoch": 0.9201514912419125, + "grad_norm": 0.5782319903373718, + "learning_rate": 3.950008999599535e-06, + "loss": 0.5422, + "step": 5831 + }, + { + "epoch": 0.9203092946189049, + "grad_norm": 0.572468638420105, + "learning_rate": 3.949670659953547e-06, + "loss": 0.5756, + "step": 5832 + }, + { + "epoch": 0.9204670979958971, + "grad_norm": 0.5996676683425903, + "learning_rate": 3.94933228030018e-06, + "loss": 0.5746, + "step": 5833 + }, + { + "epoch": 0.9206249013728893, + "grad_norm": 0.6653697490692139, + "learning_rate": 3.948993860648772e-06, + "loss": 0.5431, + "step": 5834 + }, + { + "epoch": 0.9207827047498817, + "grad_norm": 0.6241094470024109, + "learning_rate": 3.948655401008663e-06, + "loss": 0.5737, + "step": 5835 + }, + { + "epoch": 0.9209405081268739, + "grad_norm": 0.5957937240600586, + "learning_rate": 3.948316901389194e-06, + "loss": 0.5535, + "step": 5836 + }, + { + "epoch": 0.9210983115038662, + "grad_norm": 0.5559210181236267, + "learning_rate": 3.947978361799705e-06, + "loss": 0.5268, + "step": 5837 + }, + { + "epoch": 0.9212561148808585, + "grad_norm": 0.5869495272636414, + "learning_rate": 3.94763978224954e-06, + "loss": 0.5692, + "step": 5838 + }, + { + "epoch": 0.9214139182578507, + "grad_norm": 0.6450270414352417, + "learning_rate": 3.947301162748043e-06, + "loss": 0.5411, + "step": 5839 + }, + { + "epoch": 0.921571721634843, + "grad_norm": 0.609942615032196, + "learning_rate": 3.946962503304559e-06, + "loss": 0.5469, + "step": 5840 + }, + { + "epoch": 0.9217295250118352, + "grad_norm": 0.6087762117385864, + "learning_rate": 3.946623803928434e-06, + "loss": 0.5902, + "step": 5841 + }, + { + "epoch": 0.9218873283888275, + "grad_norm": 0.6195084452629089, + "learning_rate": 3.946285064629017e-06, + "loss": 0.5756, + "step": 5842 + }, + { + "epoch": 0.9220451317658198, + "grad_norm": 0.5972710251808167, + "learning_rate": 3.9459462854156525e-06, + "loss": 0.5365, + "step": 5843 + }, + { + "epoch": 0.922202935142812, + "grad_norm": 0.5894469022750854, + "learning_rate": 3.945607466297694e-06, + "loss": 0.5792, + "step": 5844 + }, + { + "epoch": 0.9223607385198043, + "grad_norm": 0.5857532620429993, + "learning_rate": 3.945268607284489e-06, + "loss": 0.5687, + "step": 5845 + }, + { + "epoch": 0.9225185418967966, + "grad_norm": 0.5825405716896057, + "learning_rate": 3.944929708385391e-06, + "loss": 0.5631, + "step": 5846 + }, + { + "epoch": 0.9226763452737888, + "grad_norm": 0.5630156993865967, + "learning_rate": 3.944590769609753e-06, + "loss": 0.5511, + "step": 5847 + }, + { + "epoch": 0.9228341486507812, + "grad_norm": 0.5932248830795288, + "learning_rate": 3.944251790966927e-06, + "loss": 0.556, + "step": 5848 + }, + { + "epoch": 0.9229919520277734, + "grad_norm": 0.6274057030677795, + "learning_rate": 3.943912772466271e-06, + "loss": 0.583, + "step": 5849 + }, + { + "epoch": 0.9231497554047656, + "grad_norm": 0.6252473592758179, + "learning_rate": 3.943573714117138e-06, + "loss": 0.524, + "step": 5850 + }, + { + "epoch": 0.923307558781758, + "grad_norm": 0.5872845649719238, + "learning_rate": 3.943234615928887e-06, + "loss": 0.5429, + "step": 5851 + }, + { + "epoch": 0.9234653621587502, + "grad_norm": 0.5467646718025208, + "learning_rate": 3.942895477910875e-06, + "loss": 0.5782, + "step": 5852 + }, + { + "epoch": 0.9236231655357424, + "grad_norm": 0.5648953914642334, + "learning_rate": 3.942556300072463e-06, + "loss": 0.5666, + "step": 5853 + }, + { + "epoch": 0.9237809689127348, + "grad_norm": 0.5695544481277466, + "learning_rate": 3.942217082423011e-06, + "loss": 0.5359, + "step": 5854 + }, + { + "epoch": 0.923938772289727, + "grad_norm": 0.5628653764724731, + "learning_rate": 3.941877824971879e-06, + "loss": 0.5619, + "step": 5855 + }, + { + "epoch": 0.9240965756667193, + "grad_norm": 0.5953062176704407, + "learning_rate": 3.941538527728432e-06, + "loss": 0.5721, + "step": 5856 + }, + { + "epoch": 0.9242543790437115, + "grad_norm": 0.6302608251571655, + "learning_rate": 3.941199190702032e-06, + "loss": 0.5839, + "step": 5857 + }, + { + "epoch": 0.9244121824207038, + "grad_norm": 0.6086653470993042, + "learning_rate": 3.940859813902045e-06, + "loss": 0.6016, + "step": 5858 + }, + { + "epoch": 0.9245699857976961, + "grad_norm": 0.6023575067520142, + "learning_rate": 3.940520397337836e-06, + "loss": 0.5832, + "step": 5859 + }, + { + "epoch": 0.9247277891746883, + "grad_norm": 0.5820319056510925, + "learning_rate": 3.940180941018773e-06, + "loss": 0.5696, + "step": 5860 + }, + { + "epoch": 0.9248855925516806, + "grad_norm": 0.5995185375213623, + "learning_rate": 3.939841444954223e-06, + "loss": 0.5797, + "step": 5861 + }, + { + "epoch": 0.9250433959286729, + "grad_norm": 0.5732178092002869, + "learning_rate": 3.939501909153557e-06, + "loss": 0.5546, + "step": 5862 + }, + { + "epoch": 0.9252011993056651, + "grad_norm": 0.5962733030319214, + "learning_rate": 3.9391623336261445e-06, + "loss": 0.5809, + "step": 5863 + }, + { + "epoch": 0.9253590026826574, + "grad_norm": 0.6002850532531738, + "learning_rate": 3.938822718381356e-06, + "loss": 0.5544, + "step": 5864 + }, + { + "epoch": 0.9255168060596497, + "grad_norm": 0.5865505337715149, + "learning_rate": 3.938483063428565e-06, + "loss": 0.5619, + "step": 5865 + }, + { + "epoch": 0.9256746094366419, + "grad_norm": 0.613116979598999, + "learning_rate": 3.938143368777145e-06, + "loss": 0.5716, + "step": 5866 + }, + { + "epoch": 0.9258324128136343, + "grad_norm": 0.5986266136169434, + "learning_rate": 3.937803634436472e-06, + "loss": 0.5605, + "step": 5867 + }, + { + "epoch": 0.9259902161906265, + "grad_norm": 0.5777679085731506, + "learning_rate": 3.93746386041592e-06, + "loss": 0.5601, + "step": 5868 + }, + { + "epoch": 0.9261480195676187, + "grad_norm": 0.6007958054542542, + "learning_rate": 3.937124046724866e-06, + "loss": 0.5304, + "step": 5869 + }, + { + "epoch": 0.9263058229446111, + "grad_norm": 0.5743346214294434, + "learning_rate": 3.936784193372689e-06, + "loss": 0.5717, + "step": 5870 + }, + { + "epoch": 0.9264636263216033, + "grad_norm": 0.6178016662597656, + "learning_rate": 3.936444300368767e-06, + "loss": 0.5742, + "step": 5871 + }, + { + "epoch": 0.9266214296985955, + "grad_norm": 0.6061806082725525, + "learning_rate": 3.936104367722482e-06, + "loss": 0.5494, + "step": 5872 + }, + { + "epoch": 0.9267792330755878, + "grad_norm": 0.6029344201087952, + "learning_rate": 3.9357643954432144e-06, + "loss": 0.5522, + "step": 5873 + }, + { + "epoch": 0.9269370364525801, + "grad_norm": 0.6106342673301697, + "learning_rate": 3.935424383540346e-06, + "loss": 0.5564, + "step": 5874 + }, + { + "epoch": 0.9270948398295723, + "grad_norm": 0.6098002195358276, + "learning_rate": 3.9350843320232605e-06, + "loss": 0.5692, + "step": 5875 + }, + { + "epoch": 0.9272526432065646, + "grad_norm": 0.567571222782135, + "learning_rate": 3.934744240901342e-06, + "loss": 0.5546, + "step": 5876 + }, + { + "epoch": 0.9274104465835569, + "grad_norm": 0.6011646389961243, + "learning_rate": 3.934404110183979e-06, + "loss": 0.5663, + "step": 5877 + }, + { + "epoch": 0.9275682499605492, + "grad_norm": 0.5702484250068665, + "learning_rate": 3.934063939880555e-06, + "loss": 0.5437, + "step": 5878 + }, + { + "epoch": 0.9277260533375414, + "grad_norm": 0.5746182203292847, + "learning_rate": 3.9337237300004595e-06, + "loss": 0.5147, + "step": 5879 + }, + { + "epoch": 0.9278838567145337, + "grad_norm": 0.5631524324417114, + "learning_rate": 3.93338348055308e-06, + "loss": 0.5503, + "step": 5880 + }, + { + "epoch": 0.928041660091526, + "grad_norm": 0.5738162994384766, + "learning_rate": 3.933043191547809e-06, + "loss": 0.5474, + "step": 5881 + }, + { + "epoch": 0.9281994634685182, + "grad_norm": 0.5600164532661438, + "learning_rate": 3.932702862994036e-06, + "loss": 0.5306, + "step": 5882 + }, + { + "epoch": 0.9283572668455105, + "grad_norm": 0.5950276851654053, + "learning_rate": 3.932362494901153e-06, + "loss": 0.576, + "step": 5883 + }, + { + "epoch": 0.9285150702225028, + "grad_norm": 0.5834509134292603, + "learning_rate": 3.932022087278553e-06, + "loss": 0.5764, + "step": 5884 + }, + { + "epoch": 0.928672873599495, + "grad_norm": 0.5856939554214478, + "learning_rate": 3.931681640135633e-06, + "loss": 0.5798, + "step": 5885 + }, + { + "epoch": 0.9288306769764872, + "grad_norm": 0.5816890001296997, + "learning_rate": 3.931341153481786e-06, + "loss": 0.5468, + "step": 5886 + }, + { + "epoch": 0.9289884803534796, + "grad_norm": 0.5955291986465454, + "learning_rate": 3.931000627326409e-06, + "loss": 0.5733, + "step": 5887 + }, + { + "epoch": 0.9291462837304718, + "grad_norm": 0.5848254561424255, + "learning_rate": 3.9306600616789005e-06, + "loss": 0.5775, + "step": 5888 + }, + { + "epoch": 0.9293040871074641, + "grad_norm": 0.5788034200668335, + "learning_rate": 3.930319456548659e-06, + "loss": 0.5558, + "step": 5889 + }, + { + "epoch": 0.9294618904844564, + "grad_norm": 0.6219549775123596, + "learning_rate": 3.929978811945084e-06, + "loss": 0.5673, + "step": 5890 + }, + { + "epoch": 0.9296196938614486, + "grad_norm": 0.6318100094795227, + "learning_rate": 3.9296381278775765e-06, + "loss": 0.5507, + "step": 5891 + }, + { + "epoch": 0.9297774972384409, + "grad_norm": 0.5790640115737915, + "learning_rate": 3.929297404355539e-06, + "loss": 0.5781, + "step": 5892 + }, + { + "epoch": 0.9299353006154332, + "grad_norm": 0.5834240317344666, + "learning_rate": 3.928956641388374e-06, + "loss": 0.5628, + "step": 5893 + }, + { + "epoch": 0.9300931039924254, + "grad_norm": 0.5949267745018005, + "learning_rate": 3.9286158389854864e-06, + "loss": 0.5713, + "step": 5894 + }, + { + "epoch": 0.9302509073694177, + "grad_norm": 0.5736263990402222, + "learning_rate": 3.928274997156282e-06, + "loss": 0.5704, + "step": 5895 + }, + { + "epoch": 0.93040871074641, + "grad_norm": 0.5680755972862244, + "learning_rate": 3.927934115910165e-06, + "loss": 0.5985, + "step": 5896 + }, + { + "epoch": 0.9305665141234022, + "grad_norm": 0.5766461491584778, + "learning_rate": 3.9275931952565446e-06, + "loss": 0.5787, + "step": 5897 + }, + { + "epoch": 0.9307243175003945, + "grad_norm": 0.611995279788971, + "learning_rate": 3.92725223520483e-06, + "loss": 0.5519, + "step": 5898 + }, + { + "epoch": 0.9308821208773868, + "grad_norm": 0.589647650718689, + "learning_rate": 3.926911235764429e-06, + "loss": 0.5507, + "step": 5899 + }, + { + "epoch": 0.9310399242543791, + "grad_norm": 0.5745087265968323, + "learning_rate": 3.926570196944753e-06, + "loss": 0.5519, + "step": 5900 + }, + { + "epoch": 0.9311977276313713, + "grad_norm": 0.5666797757148743, + "learning_rate": 3.926229118755215e-06, + "loss": 0.5346, + "step": 5901 + }, + { + "epoch": 0.9313555310083635, + "grad_norm": 0.6123591065406799, + "learning_rate": 3.9258880012052276e-06, + "loss": 0.5592, + "step": 5902 + }, + { + "epoch": 0.9315133343853559, + "grad_norm": 0.5820143818855286, + "learning_rate": 3.9255468443042034e-06, + "loss": 0.5454, + "step": 5903 + }, + { + "epoch": 0.9316711377623481, + "grad_norm": 0.596149206161499, + "learning_rate": 3.925205648061558e-06, + "loss": 0.5824, + "step": 5904 + }, + { + "epoch": 0.9318289411393403, + "grad_norm": 0.612570583820343, + "learning_rate": 3.924864412486709e-06, + "loss": 0.5762, + "step": 5905 + }, + { + "epoch": 0.9319867445163327, + "grad_norm": 0.5613797307014465, + "learning_rate": 3.924523137589072e-06, + "loss": 0.5297, + "step": 5906 + }, + { + "epoch": 0.9321445478933249, + "grad_norm": 0.562220573425293, + "learning_rate": 3.924181823378067e-06, + "loss": 0.5655, + "step": 5907 + }, + { + "epoch": 0.9323023512703171, + "grad_norm": 0.550152063369751, + "learning_rate": 3.92384046986311e-06, + "loss": 0.5447, + "step": 5908 + }, + { + "epoch": 0.9324601546473095, + "grad_norm": 0.5625676512718201, + "learning_rate": 3.923499077053626e-06, + "loss": 0.5791, + "step": 5909 + }, + { + "epoch": 0.9326179580243017, + "grad_norm": 0.6025519967079163, + "learning_rate": 3.9231576449590345e-06, + "loss": 0.5496, + "step": 5910 + }, + { + "epoch": 0.932775761401294, + "grad_norm": 0.5806004405021667, + "learning_rate": 3.922816173588758e-06, + "loss": 0.5457, + "step": 5911 + }, + { + "epoch": 0.9329335647782863, + "grad_norm": 0.6194379925727844, + "learning_rate": 3.92247466295222e-06, + "loss": 0.5895, + "step": 5912 + }, + { + "epoch": 0.9330913681552785, + "grad_norm": 0.6093848943710327, + "learning_rate": 3.922133113058847e-06, + "loss": 0.5847, + "step": 5913 + }, + { + "epoch": 0.9332491715322708, + "grad_norm": 0.5812230706214905, + "learning_rate": 3.921791523918064e-06, + "loss": 0.5906, + "step": 5914 + }, + { + "epoch": 0.933406974909263, + "grad_norm": 0.5613006949424744, + "learning_rate": 3.921449895539296e-06, + "loss": 0.5467, + "step": 5915 + }, + { + "epoch": 0.9335647782862553, + "grad_norm": 0.5844448208808899, + "learning_rate": 3.921108227931974e-06, + "loss": 0.5467, + "step": 5916 + }, + { + "epoch": 0.9337225816632476, + "grad_norm": 0.5994458794593811, + "learning_rate": 3.9207665211055265e-06, + "loss": 0.5793, + "step": 5917 + }, + { + "epoch": 0.9338803850402398, + "grad_norm": 0.5732895135879517, + "learning_rate": 3.920424775069383e-06, + "loss": 0.5593, + "step": 5918 + }, + { + "epoch": 0.9340381884172321, + "grad_norm": 0.579052746295929, + "learning_rate": 3.920082989832976e-06, + "loss": 0.5913, + "step": 5919 + }, + { + "epoch": 0.9341959917942244, + "grad_norm": 0.5723357200622559, + "learning_rate": 3.919741165405737e-06, + "loss": 0.5823, + "step": 5920 + }, + { + "epoch": 0.9343537951712166, + "grad_norm": 0.636254608631134, + "learning_rate": 3.9193993017970995e-06, + "loss": 0.5525, + "step": 5921 + }, + { + "epoch": 0.934511598548209, + "grad_norm": 0.5547880530357361, + "learning_rate": 3.919057399016499e-06, + "loss": 0.5415, + "step": 5922 + }, + { + "epoch": 0.9346694019252012, + "grad_norm": 0.5660202503204346, + "learning_rate": 3.91871545707337e-06, + "loss": 0.5314, + "step": 5923 + }, + { + "epoch": 0.9348272053021934, + "grad_norm": 0.5589534640312195, + "learning_rate": 3.91837347597715e-06, + "loss": 0.5834, + "step": 5924 + }, + { + "epoch": 0.9349850086791858, + "grad_norm": 0.5774582624435425, + "learning_rate": 3.918031455737277e-06, + "loss": 0.5608, + "step": 5925 + }, + { + "epoch": 0.935142812056178, + "grad_norm": 0.5575392246246338, + "learning_rate": 3.917689396363189e-06, + "loss": 0.5495, + "step": 5926 + }, + { + "epoch": 0.9353006154331702, + "grad_norm": 0.5962892174720764, + "learning_rate": 3.9173472978643265e-06, + "loss": 0.5678, + "step": 5927 + }, + { + "epoch": 0.9354584188101626, + "grad_norm": 0.5810796618461609, + "learning_rate": 3.917005160250131e-06, + "loss": 0.562, + "step": 5928 + }, + { + "epoch": 0.9356162221871548, + "grad_norm": 0.5808817744255066, + "learning_rate": 3.916662983530045e-06, + "loss": 0.5564, + "step": 5929 + }, + { + "epoch": 0.9357740255641471, + "grad_norm": 0.590499997138977, + "learning_rate": 3.916320767713511e-06, + "loss": 0.5873, + "step": 5930 + }, + { + "epoch": 0.9359318289411394, + "grad_norm": 0.6047029495239258, + "learning_rate": 3.915978512809972e-06, + "loss": 0.5955, + "step": 5931 + }, + { + "epoch": 0.9360896323181316, + "grad_norm": 0.5486422181129456, + "learning_rate": 3.915636218828876e-06, + "loss": 0.5216, + "step": 5932 + }, + { + "epoch": 0.9362474356951239, + "grad_norm": 0.5708988904953003, + "learning_rate": 3.915293885779669e-06, + "loss": 0.5546, + "step": 5933 + }, + { + "epoch": 0.9364052390721161, + "grad_norm": 0.5957843661308289, + "learning_rate": 3.914951513671797e-06, + "loss": 0.5225, + "step": 5934 + }, + { + "epoch": 0.9365630424491084, + "grad_norm": 0.5980768203735352, + "learning_rate": 3.914609102514708e-06, + "loss": 0.5909, + "step": 5935 + }, + { + "epoch": 0.9367208458261007, + "grad_norm": 0.6168948411941528, + "learning_rate": 3.9142666523178556e-06, + "loss": 0.5745, + "step": 5936 + }, + { + "epoch": 0.9368786492030929, + "grad_norm": 0.6030712723731995, + "learning_rate": 3.913924163090686e-06, + "loss": 0.5694, + "step": 5937 + }, + { + "epoch": 0.9370364525800852, + "grad_norm": 0.6362779140472412, + "learning_rate": 3.913581634842656e-06, + "loss": 0.5574, + "step": 5938 + }, + { + "epoch": 0.9371942559570775, + "grad_norm": 0.5919708609580994, + "learning_rate": 3.913239067583214e-06, + "loss": 0.5571, + "step": 5939 + }, + { + "epoch": 0.9373520593340697, + "grad_norm": 0.5710604786872864, + "learning_rate": 3.912896461321817e-06, + "loss": 0.5742, + "step": 5940 + }, + { + "epoch": 0.9375098627110621, + "grad_norm": 0.5908259749412537, + "learning_rate": 3.9125538160679174e-06, + "loss": 0.5854, + "step": 5941 + }, + { + "epoch": 0.9376676660880543, + "grad_norm": 0.6195177435874939, + "learning_rate": 3.912211131830974e-06, + "loss": 0.5601, + "step": 5942 + }, + { + "epoch": 0.9378254694650465, + "grad_norm": 0.5580980181694031, + "learning_rate": 3.911868408620443e-06, + "loss": 0.5672, + "step": 5943 + }, + { + "epoch": 0.9379832728420389, + "grad_norm": 0.601684033870697, + "learning_rate": 3.911525646445782e-06, + "loss": 0.5422, + "step": 5944 + }, + { + "epoch": 0.9381410762190311, + "grad_norm": 0.6012312769889832, + "learning_rate": 3.911182845316451e-06, + "loss": 0.5438, + "step": 5945 + }, + { + "epoch": 0.9382988795960233, + "grad_norm": 0.5709059834480286, + "learning_rate": 3.910840005241912e-06, + "loss": 0.5556, + "step": 5946 + }, + { + "epoch": 0.9384566829730157, + "grad_norm": 0.6013923287391663, + "learning_rate": 3.910497126231625e-06, + "loss": 0.5535, + "step": 5947 + }, + { + "epoch": 0.9386144863500079, + "grad_norm": 0.5873863101005554, + "learning_rate": 3.910154208295051e-06, + "loss": 0.5869, + "step": 5948 + }, + { + "epoch": 0.9387722897270001, + "grad_norm": 0.5949934720993042, + "learning_rate": 3.909811251441656e-06, + "loss": 0.5805, + "step": 5949 + }, + { + "epoch": 0.9389300931039924, + "grad_norm": 0.579654335975647, + "learning_rate": 3.9094682556809045e-06, + "loss": 0.5883, + "step": 5950 + }, + { + "epoch": 0.9390878964809847, + "grad_norm": 0.5819809436798096, + "learning_rate": 3.909125221022263e-06, + "loss": 0.5637, + "step": 5951 + }, + { + "epoch": 0.939245699857977, + "grad_norm": 0.6028886437416077, + "learning_rate": 3.908782147475197e-06, + "loss": 0.5713, + "step": 5952 + }, + { + "epoch": 0.9394035032349692, + "grad_norm": 0.6105336546897888, + "learning_rate": 3.9084390350491745e-06, + "loss": 0.5356, + "step": 5953 + }, + { + "epoch": 0.9395613066119615, + "grad_norm": 0.6177793145179749, + "learning_rate": 3.908095883753666e-06, + "loss": 0.5423, + "step": 5954 + }, + { + "epoch": 0.9397191099889538, + "grad_norm": 0.5797035098075867, + "learning_rate": 3.907752693598139e-06, + "loss": 0.5492, + "step": 5955 + }, + { + "epoch": 0.939876913365946, + "grad_norm": 0.6167610883712769, + "learning_rate": 3.907409464592067e-06, + "loss": 0.5626, + "step": 5956 + }, + { + "epoch": 0.9400347167429383, + "grad_norm": 0.5602192282676697, + "learning_rate": 3.907066196744923e-06, + "loss": 0.5789, + "step": 5957 + }, + { + "epoch": 0.9401925201199306, + "grad_norm": 0.5659434795379639, + "learning_rate": 3.906722890066178e-06, + "loss": 0.5454, + "step": 5958 + }, + { + "epoch": 0.9403503234969228, + "grad_norm": 0.5671905875205994, + "learning_rate": 3.906379544565307e-06, + "loss": 0.5935, + "step": 5959 + }, + { + "epoch": 0.940508126873915, + "grad_norm": 0.604344367980957, + "learning_rate": 3.906036160251787e-06, + "loss": 0.5449, + "step": 5960 + }, + { + "epoch": 0.9406659302509074, + "grad_norm": 0.5951666235923767, + "learning_rate": 3.905692737135092e-06, + "loss": 0.5299, + "step": 5961 + }, + { + "epoch": 0.9408237336278996, + "grad_norm": 0.582451581954956, + "learning_rate": 3.905349275224703e-06, + "loss": 0.5575, + "step": 5962 + }, + { + "epoch": 0.940981537004892, + "grad_norm": 0.6189226508140564, + "learning_rate": 3.905005774530096e-06, + "loss": 0.552, + "step": 5963 + }, + { + "epoch": 0.9411393403818842, + "grad_norm": 0.5928817987442017, + "learning_rate": 3.904662235060752e-06, + "loss": 0.5801, + "step": 5964 + }, + { + "epoch": 0.9412971437588764, + "grad_norm": 0.6020141839981079, + "learning_rate": 3.904318656826152e-06, + "loss": 0.5731, + "step": 5965 + }, + { + "epoch": 0.9414549471358687, + "grad_norm": 0.5680637955665588, + "learning_rate": 3.903975039835777e-06, + "loss": 0.5673, + "step": 5966 + }, + { + "epoch": 0.941612750512861, + "grad_norm": 0.569769561290741, + "learning_rate": 3.90363138409911e-06, + "loss": 0.56, + "step": 5967 + }, + { + "epoch": 0.9417705538898532, + "grad_norm": 0.6057375073432922, + "learning_rate": 3.903287689625636e-06, + "loss": 0.5374, + "step": 5968 + }, + { + "epoch": 0.9419283572668455, + "grad_norm": 0.5732100009918213, + "learning_rate": 3.90294395642484e-06, + "loss": 0.5686, + "step": 5969 + }, + { + "epoch": 0.9420861606438378, + "grad_norm": 0.576665997505188, + "learning_rate": 3.902600184506208e-06, + "loss": 0.5714, + "step": 5970 + }, + { + "epoch": 0.94224396402083, + "grad_norm": 0.5665032863616943, + "learning_rate": 3.902256373879225e-06, + "loss": 0.609, + "step": 5971 + }, + { + "epoch": 0.9424017673978223, + "grad_norm": 0.5892398357391357, + "learning_rate": 3.901912524553384e-06, + "loss": 0.5395, + "step": 5972 + }, + { + "epoch": 0.9425595707748146, + "grad_norm": 0.569190502166748, + "learning_rate": 3.90156863653817e-06, + "loss": 0.5365, + "step": 5973 + }, + { + "epoch": 0.9427173741518069, + "grad_norm": 0.564771831035614, + "learning_rate": 3.901224709843076e-06, + "loss": 0.5788, + "step": 5974 + }, + { + "epoch": 0.9428751775287991, + "grad_norm": 0.5754950642585754, + "learning_rate": 3.900880744477593e-06, + "loss": 0.5882, + "step": 5975 + }, + { + "epoch": 0.9430329809057914, + "grad_norm": 0.6030820608139038, + "learning_rate": 3.900536740451214e-06, + "loss": 0.5808, + "step": 5976 + }, + { + "epoch": 0.9431907842827837, + "grad_norm": 0.5896283984184265, + "learning_rate": 3.900192697773431e-06, + "loss": 0.5394, + "step": 5977 + }, + { + "epoch": 0.9433485876597759, + "grad_norm": 0.5877249836921692, + "learning_rate": 3.899848616453741e-06, + "loss": 0.5506, + "step": 5978 + }, + { + "epoch": 0.9435063910367681, + "grad_norm": 0.6146100759506226, + "learning_rate": 3.899504496501637e-06, + "loss": 0.5681, + "step": 5979 + }, + { + "epoch": 0.9436641944137605, + "grad_norm": 0.6317652463912964, + "learning_rate": 3.89916033792662e-06, + "loss": 0.5942, + "step": 5980 + }, + { + "epoch": 0.9438219977907527, + "grad_norm": 0.6184008121490479, + "learning_rate": 3.8988161407381845e-06, + "loss": 0.5456, + "step": 5981 + }, + { + "epoch": 0.9439798011677449, + "grad_norm": 0.6264567375183105, + "learning_rate": 3.898471904945829e-06, + "loss": 0.5484, + "step": 5982 + }, + { + "epoch": 0.9441376045447373, + "grad_norm": 0.5995611548423767, + "learning_rate": 3.898127630559057e-06, + "loss": 0.5745, + "step": 5983 + }, + { + "epoch": 0.9442954079217295, + "grad_norm": 0.6015669107437134, + "learning_rate": 3.897783317587366e-06, + "loss": 0.5351, + "step": 5984 + }, + { + "epoch": 0.9444532112987218, + "grad_norm": 0.5968014001846313, + "learning_rate": 3.8974389660402614e-06, + "loss": 0.5745, + "step": 5985 + }, + { + "epoch": 0.9446110146757141, + "grad_norm": 0.5832816958427429, + "learning_rate": 3.897094575927245e-06, + "loss": 0.5609, + "step": 5986 + }, + { + "epoch": 0.9447688180527063, + "grad_norm": 0.5777313113212585, + "learning_rate": 3.896750147257819e-06, + "loss": 0.5378, + "step": 5987 + }, + { + "epoch": 0.9449266214296986, + "grad_norm": 0.5967499017715454, + "learning_rate": 3.8964056800414935e-06, + "loss": 0.5219, + "step": 5988 + }, + { + "epoch": 0.9450844248066909, + "grad_norm": 0.5799875259399414, + "learning_rate": 3.896061174287772e-06, + "loss": 0.5469, + "step": 5989 + }, + { + "epoch": 0.9452422281836831, + "grad_norm": 0.5560483932495117, + "learning_rate": 3.895716630006161e-06, + "loss": 0.5462, + "step": 5990 + }, + { + "epoch": 0.9454000315606754, + "grad_norm": 0.5914400219917297, + "learning_rate": 3.89537204720617e-06, + "loss": 0.5196, + "step": 5991 + }, + { + "epoch": 0.9455578349376677, + "grad_norm": 0.5974520444869995, + "learning_rate": 3.895027425897311e-06, + "loss": 0.5302, + "step": 5992 + }, + { + "epoch": 0.9457156383146599, + "grad_norm": 0.5720310807228088, + "learning_rate": 3.8946827660890915e-06, + "loss": 0.5662, + "step": 5993 + }, + { + "epoch": 0.9458734416916522, + "grad_norm": 0.6085065603256226, + "learning_rate": 3.894338067791026e-06, + "loss": 0.5893, + "step": 5994 + }, + { + "epoch": 0.9460312450686444, + "grad_norm": 0.5680716037750244, + "learning_rate": 3.893993331012624e-06, + "loss": 0.5475, + "step": 5995 + }, + { + "epoch": 0.9461890484456368, + "grad_norm": 0.6192404627799988, + "learning_rate": 3.893648555763403e-06, + "loss": 0.5332, + "step": 5996 + }, + { + "epoch": 0.946346851822629, + "grad_norm": 0.6154406070709229, + "learning_rate": 3.893303742052875e-06, + "loss": 0.527, + "step": 5997 + }, + { + "epoch": 0.9465046551996212, + "grad_norm": 0.586444616317749, + "learning_rate": 3.892958889890558e-06, + "loss": 0.5492, + "step": 5998 + }, + { + "epoch": 0.9466624585766136, + "grad_norm": 0.5874044895172119, + "learning_rate": 3.892613999285969e-06, + "loss": 0.614, + "step": 5999 + }, + { + "epoch": 0.9468202619536058, + "grad_norm": 0.5975766181945801, + "learning_rate": 3.8922690702486235e-06, + "loss": 0.5313, + "step": 6000 + }, + { + "epoch": 0.946978065330598, + "grad_norm": 0.5906485915184021, + "learning_rate": 3.891924102788044e-06, + "loss": 0.5293, + "step": 6001 + }, + { + "epoch": 0.9471358687075904, + "grad_norm": 0.5656383037567139, + "learning_rate": 3.89157909691375e-06, + "loss": 0.5546, + "step": 6002 + }, + { + "epoch": 0.9472936720845826, + "grad_norm": 0.6006038188934326, + "learning_rate": 3.891234052635261e-06, + "loss": 0.5618, + "step": 6003 + }, + { + "epoch": 0.9474514754615749, + "grad_norm": 0.5660796165466309, + "learning_rate": 3.890888969962102e-06, + "loss": 0.5555, + "step": 6004 + }, + { + "epoch": 0.9476092788385672, + "grad_norm": 0.5826799273490906, + "learning_rate": 3.8905438489037936e-06, + "loss": 0.5584, + "step": 6005 + }, + { + "epoch": 0.9477670822155594, + "grad_norm": 0.5717424750328064, + "learning_rate": 3.890198689469863e-06, + "loss": 0.5556, + "step": 6006 + }, + { + "epoch": 0.9479248855925517, + "grad_norm": 0.5649457573890686, + "learning_rate": 3.889853491669833e-06, + "loss": 0.5506, + "step": 6007 + }, + { + "epoch": 0.948082688969544, + "grad_norm": 0.590857744216919, + "learning_rate": 3.889508255513233e-06, + "loss": 0.5681, + "step": 6008 + }, + { + "epoch": 0.9482404923465362, + "grad_norm": 0.6298149824142456, + "learning_rate": 3.88916298100959e-06, + "loss": 0.5544, + "step": 6009 + }, + { + "epoch": 0.9483982957235285, + "grad_norm": 0.5844553709030151, + "learning_rate": 3.888817668168432e-06, + "loss": 0.5552, + "step": 6010 + }, + { + "epoch": 0.9485560991005207, + "grad_norm": 0.6119729280471802, + "learning_rate": 3.888472316999289e-06, + "loss": 0.567, + "step": 6011 + }, + { + "epoch": 0.948713902477513, + "grad_norm": 0.5534839630126953, + "learning_rate": 3.888126927511691e-06, + "loss": 0.5469, + "step": 6012 + }, + { + "epoch": 0.9488717058545053, + "grad_norm": 0.596163809299469, + "learning_rate": 3.8877814997151704e-06, + "loss": 0.5696, + "step": 6013 + }, + { + "epoch": 0.9490295092314975, + "grad_norm": 0.593036413192749, + "learning_rate": 3.887436033619262e-06, + "loss": 0.5333, + "step": 6014 + }, + { + "epoch": 0.9491873126084899, + "grad_norm": 0.5944859385490417, + "learning_rate": 3.8870905292334985e-06, + "loss": 0.5559, + "step": 6015 + }, + { + "epoch": 0.9493451159854821, + "grad_norm": 0.6165981888771057, + "learning_rate": 3.886744986567414e-06, + "loss": 0.5521, + "step": 6016 + }, + { + "epoch": 0.9495029193624743, + "grad_norm": 0.5622664093971252, + "learning_rate": 3.886399405630545e-06, + "loss": 0.5731, + "step": 6017 + }, + { + "epoch": 0.9496607227394667, + "grad_norm": 0.5630747079849243, + "learning_rate": 3.88605378643243e-06, + "loss": 0.5415, + "step": 6018 + }, + { + "epoch": 0.9498185261164589, + "grad_norm": 0.5621052980422974, + "learning_rate": 3.8857081289826046e-06, + "loss": 0.5797, + "step": 6019 + }, + { + "epoch": 0.9499763294934511, + "grad_norm": 0.5876898765563965, + "learning_rate": 3.8853624332906124e-06, + "loss": 0.5447, + "step": 6020 + }, + { + "epoch": 0.9501341328704435, + "grad_norm": 0.5835167765617371, + "learning_rate": 3.88501669936599e-06, + "loss": 0.5623, + "step": 6021 + }, + { + "epoch": 0.9502919362474357, + "grad_norm": 0.5945889949798584, + "learning_rate": 3.88467092721828e-06, + "loss": 0.5595, + "step": 6022 + }, + { + "epoch": 0.9504497396244279, + "grad_norm": 0.5898677706718445, + "learning_rate": 3.884325116857025e-06, + "loss": 0.5363, + "step": 6023 + }, + { + "epoch": 0.9506075430014203, + "grad_norm": 0.6127086877822876, + "learning_rate": 3.883979268291769e-06, + "loss": 0.565, + "step": 6024 + }, + { + "epoch": 0.9507653463784125, + "grad_norm": 0.6297963857650757, + "learning_rate": 3.8836333815320555e-06, + "loss": 0.563, + "step": 6025 + }, + { + "epoch": 0.9509231497554048, + "grad_norm": 0.5962697863578796, + "learning_rate": 3.883287456587431e-06, + "loss": 0.5361, + "step": 6026 + }, + { + "epoch": 0.951080953132397, + "grad_norm": 0.596268892288208, + "learning_rate": 3.882941493467441e-06, + "loss": 0.5499, + "step": 6027 + }, + { + "epoch": 0.9512387565093893, + "grad_norm": 0.5801824331283569, + "learning_rate": 3.882595492181635e-06, + "loss": 0.5645, + "step": 6028 + }, + { + "epoch": 0.9513965598863816, + "grad_norm": 0.599352240562439, + "learning_rate": 3.882249452739561e-06, + "loss": 0.5745, + "step": 6029 + }, + { + "epoch": 0.9515543632633738, + "grad_norm": 0.5893464684486389, + "learning_rate": 3.881903375150769e-06, + "loss": 0.5835, + "step": 6030 + }, + { + "epoch": 0.9517121666403661, + "grad_norm": 0.6187283992767334, + "learning_rate": 3.8815572594248085e-06, + "loss": 0.5697, + "step": 6031 + }, + { + "epoch": 0.9518699700173584, + "grad_norm": 0.5831384658813477, + "learning_rate": 3.881211105571233e-06, + "loss": 0.5832, + "step": 6032 + }, + { + "epoch": 0.9520277733943506, + "grad_norm": 0.5571225881576538, + "learning_rate": 3.880864913599596e-06, + "loss": 0.5572, + "step": 6033 + }, + { + "epoch": 0.9521855767713429, + "grad_norm": 0.5907643437385559, + "learning_rate": 3.8805186835194485e-06, + "loss": 0.5514, + "step": 6034 + }, + { + "epoch": 0.9523433801483352, + "grad_norm": 0.5805118680000305, + "learning_rate": 3.88017241534035e-06, + "loss": 0.5376, + "step": 6035 + }, + { + "epoch": 0.9525011835253274, + "grad_norm": 0.5756129026412964, + "learning_rate": 3.879826109071853e-06, + "loss": 0.5619, + "step": 6036 + }, + { + "epoch": 0.9526589869023198, + "grad_norm": 0.5762574672698975, + "learning_rate": 3.8794797647235175e-06, + "loss": 0.5852, + "step": 6037 + }, + { + "epoch": 0.952816790279312, + "grad_norm": 0.5666444301605225, + "learning_rate": 3.8791333823049e-06, + "loss": 0.5552, + "step": 6038 + }, + { + "epoch": 0.9529745936563042, + "grad_norm": 0.6011244058609009, + "learning_rate": 3.8787869618255595e-06, + "loss": 0.5539, + "step": 6039 + }, + { + "epoch": 0.9531323970332966, + "grad_norm": 0.6051278114318848, + "learning_rate": 3.878440503295056e-06, + "loss": 0.5699, + "step": 6040 + }, + { + "epoch": 0.9532902004102888, + "grad_norm": 0.5943481922149658, + "learning_rate": 3.878094006722954e-06, + "loss": 0.6171, + "step": 6041 + }, + { + "epoch": 0.953448003787281, + "grad_norm": 0.571600615978241, + "learning_rate": 3.877747472118813e-06, + "loss": 0.5898, + "step": 6042 + }, + { + "epoch": 0.9536058071642733, + "grad_norm": 0.5923064351081848, + "learning_rate": 3.877400899492198e-06, + "loss": 0.554, + "step": 6043 + }, + { + "epoch": 0.9537636105412656, + "grad_norm": 0.5867122411727905, + "learning_rate": 3.877054288852673e-06, + "loss": 0.577, + "step": 6044 + }, + { + "epoch": 0.9539214139182578, + "grad_norm": 0.6834588646888733, + "learning_rate": 3.876707640209803e-06, + "loss": 0.5641, + "step": 6045 + }, + { + "epoch": 0.9540792172952501, + "grad_norm": 0.593445897102356, + "learning_rate": 3.876360953573156e-06, + "loss": 0.5886, + "step": 6046 + }, + { + "epoch": 0.9542370206722424, + "grad_norm": 0.5847892165184021, + "learning_rate": 3.876014228952298e-06, + "loss": 0.5755, + "step": 6047 + }, + { + "epoch": 0.9543948240492347, + "grad_norm": 0.610998809337616, + "learning_rate": 3.8756674663567994e-06, + "loss": 0.5691, + "step": 6048 + }, + { + "epoch": 0.9545526274262269, + "grad_norm": 0.6103476881980896, + "learning_rate": 3.8753206657962295e-06, + "loss": 0.5527, + "step": 6049 + }, + { + "epoch": 0.9547104308032192, + "grad_norm": 0.5860583186149597, + "learning_rate": 3.874973827280158e-06, + "loss": 0.5488, + "step": 6050 + }, + { + "epoch": 0.9548682341802115, + "grad_norm": 0.5762362480163574, + "learning_rate": 3.874626950818159e-06, + "loss": 0.5619, + "step": 6051 + }, + { + "epoch": 0.9550260375572037, + "grad_norm": 0.581291139125824, + "learning_rate": 3.874280036419803e-06, + "loss": 0.5577, + "step": 6052 + }, + { + "epoch": 0.955183840934196, + "grad_norm": 0.5894258618354797, + "learning_rate": 3.873933084094666e-06, + "loss": 0.5686, + "step": 6053 + }, + { + "epoch": 0.9553416443111883, + "grad_norm": 0.6159752011299133, + "learning_rate": 3.873586093852323e-06, + "loss": 0.5591, + "step": 6054 + }, + { + "epoch": 0.9554994476881805, + "grad_norm": 0.5922669172286987, + "learning_rate": 3.873239065702348e-06, + "loss": 0.5595, + "step": 6055 + }, + { + "epoch": 0.9556572510651727, + "grad_norm": 0.6033235788345337, + "learning_rate": 3.87289199965432e-06, + "loss": 0.59, + "step": 6056 + }, + { + "epoch": 0.9558150544421651, + "grad_norm": 0.6809817552566528, + "learning_rate": 3.872544895717817e-06, + "loss": 0.5115, + "step": 6057 + }, + { + "epoch": 0.9559728578191573, + "grad_norm": 0.5803025960922241, + "learning_rate": 3.872197753902417e-06, + "loss": 0.5666, + "step": 6058 + }, + { + "epoch": 0.9561306611961496, + "grad_norm": 0.6301056742668152, + "learning_rate": 3.871850574217702e-06, + "loss": 0.5612, + "step": 6059 + }, + { + "epoch": 0.9562884645731419, + "grad_norm": 0.5847240686416626, + "learning_rate": 3.8715033566732514e-06, + "loss": 0.54, + "step": 6060 + }, + { + "epoch": 0.9564462679501341, + "grad_norm": 0.5815793871879578, + "learning_rate": 3.87115610127865e-06, + "loss": 0.5703, + "step": 6061 + }, + { + "epoch": 0.9566040713271264, + "grad_norm": 0.6038494110107422, + "learning_rate": 3.870808808043479e-06, + "loss": 0.585, + "step": 6062 + }, + { + "epoch": 0.9567618747041187, + "grad_norm": 0.5873663425445557, + "learning_rate": 3.870461476977324e-06, + "loss": 0.5521, + "step": 6063 + }, + { + "epoch": 0.9569196780811109, + "grad_norm": 0.5865188837051392, + "learning_rate": 3.87011410808977e-06, + "loss": 0.5254, + "step": 6064 + }, + { + "epoch": 0.9570774814581032, + "grad_norm": 0.5708987712860107, + "learning_rate": 3.869766701390403e-06, + "loss": 0.5535, + "step": 6065 + }, + { + "epoch": 0.9572352848350955, + "grad_norm": 0.5580615997314453, + "learning_rate": 3.869419256888812e-06, + "loss": 0.5668, + "step": 6066 + }, + { + "epoch": 0.9573930882120877, + "grad_norm": 0.5717204213142395, + "learning_rate": 3.869071774594585e-06, + "loss": 0.5584, + "step": 6067 + }, + { + "epoch": 0.95755089158908, + "grad_norm": 0.6083300113677979, + "learning_rate": 3.868724254517312e-06, + "loss": 0.5672, + "step": 6068 + }, + { + "epoch": 0.9577086949660722, + "grad_norm": 0.6164615154266357, + "learning_rate": 3.868376696666582e-06, + "loss": 0.544, + "step": 6069 + }, + { + "epoch": 0.9578664983430646, + "grad_norm": 0.5802131295204163, + "learning_rate": 3.868029101051989e-06, + "loss": 0.5383, + "step": 6070 + }, + { + "epoch": 0.9580243017200568, + "grad_norm": 0.5869053602218628, + "learning_rate": 3.8676814676831245e-06, + "loss": 0.5434, + "step": 6071 + }, + { + "epoch": 0.958182105097049, + "grad_norm": 0.5647076964378357, + "learning_rate": 3.867333796569583e-06, + "loss": 0.5609, + "step": 6072 + }, + { + "epoch": 0.9583399084740414, + "grad_norm": 0.5884628891944885, + "learning_rate": 3.8669860877209585e-06, + "loss": 0.5893, + "step": 6073 + }, + { + "epoch": 0.9584977118510336, + "grad_norm": 0.6045898795127869, + "learning_rate": 3.866638341146847e-06, + "loss": 0.5442, + "step": 6074 + }, + { + "epoch": 0.9586555152280258, + "grad_norm": 0.5760453939437866, + "learning_rate": 3.8662905568568464e-06, + "loss": 0.5626, + "step": 6075 + }, + { + "epoch": 0.9588133186050182, + "grad_norm": 0.6224693655967712, + "learning_rate": 3.865942734860555e-06, + "loss": 0.5772, + "step": 6076 + }, + { + "epoch": 0.9589711219820104, + "grad_norm": 0.580598771572113, + "learning_rate": 3.86559487516757e-06, + "loss": 0.5476, + "step": 6077 + }, + { + "epoch": 0.9591289253590027, + "grad_norm": 0.5802890658378601, + "learning_rate": 3.865246977787493e-06, + "loss": 0.5602, + "step": 6078 + }, + { + "epoch": 0.959286728735995, + "grad_norm": 0.5437613129615784, + "learning_rate": 3.8648990427299245e-06, + "loss": 0.5444, + "step": 6079 + }, + { + "epoch": 0.9594445321129872, + "grad_norm": 0.5726863741874695, + "learning_rate": 3.864551070004466e-06, + "loss": 0.5412, + "step": 6080 + }, + { + "epoch": 0.9596023354899795, + "grad_norm": 0.5886802077293396, + "learning_rate": 3.864203059620723e-06, + "loss": 0.5471, + "step": 6081 + }, + { + "epoch": 0.9597601388669718, + "grad_norm": 0.6195653080940247, + "learning_rate": 3.863855011588298e-06, + "loss": 0.5463, + "step": 6082 + }, + { + "epoch": 0.959917942243964, + "grad_norm": 0.5926525592803955, + "learning_rate": 3.863506925916795e-06, + "loss": 0.5556, + "step": 6083 + }, + { + "epoch": 0.9600757456209563, + "grad_norm": 0.5454012155532837, + "learning_rate": 3.863158802615823e-06, + "loss": 0.5411, + "step": 6084 + }, + { + "epoch": 0.9602335489979485, + "grad_norm": 0.5907604098320007, + "learning_rate": 3.8628106416949885e-06, + "loss": 0.5682, + "step": 6085 + }, + { + "epoch": 0.9603913523749408, + "grad_norm": 0.5716055631637573, + "learning_rate": 3.862462443163899e-06, + "loss": 0.5492, + "step": 6086 + }, + { + "epoch": 0.9605491557519331, + "grad_norm": 0.5915720462799072, + "learning_rate": 3.862114207032165e-06, + "loss": 0.5531, + "step": 6087 + }, + { + "epoch": 0.9607069591289253, + "grad_norm": 0.5674353837966919, + "learning_rate": 3.861765933309396e-06, + "loss": 0.5534, + "step": 6088 + }, + { + "epoch": 0.9608647625059177, + "grad_norm": 0.5908213257789612, + "learning_rate": 3.861417622005204e-06, + "loss": 0.5822, + "step": 6089 + }, + { + "epoch": 0.9610225658829099, + "grad_norm": 0.6432904601097107, + "learning_rate": 3.861069273129202e-06, + "loss": 0.5364, + "step": 6090 + }, + { + "epoch": 0.9611803692599021, + "grad_norm": 0.6166094541549683, + "learning_rate": 3.860720886691003e-06, + "loss": 0.5666, + "step": 6091 + }, + { + "epoch": 0.9613381726368945, + "grad_norm": 0.6106444597244263, + "learning_rate": 3.860372462700221e-06, + "loss": 0.5614, + "step": 6092 + }, + { + "epoch": 0.9614959760138867, + "grad_norm": 0.6063681840896606, + "learning_rate": 3.860024001166473e-06, + "loss": 0.541, + "step": 6093 + }, + { + "epoch": 0.9616537793908789, + "grad_norm": 0.6361616253852844, + "learning_rate": 3.859675502099375e-06, + "loss": 0.5545, + "step": 6094 + }, + { + "epoch": 0.9618115827678713, + "grad_norm": 0.6299073696136475, + "learning_rate": 3.859326965508545e-06, + "loss": 0.5837, + "step": 6095 + }, + { + "epoch": 0.9619693861448635, + "grad_norm": 0.6028454303741455, + "learning_rate": 3.858978391403601e-06, + "loss": 0.5699, + "step": 6096 + }, + { + "epoch": 0.9621271895218557, + "grad_norm": 0.5781006813049316, + "learning_rate": 3.858629779794163e-06, + "loss": 0.547, + "step": 6097 + }, + { + "epoch": 0.9622849928988481, + "grad_norm": 0.5806746482849121, + "learning_rate": 3.858281130689854e-06, + "loss": 0.584, + "step": 6098 + }, + { + "epoch": 0.9624427962758403, + "grad_norm": 0.5856460928916931, + "learning_rate": 3.8579324441002916e-06, + "loss": 0.5748, + "step": 6099 + }, + { + "epoch": 0.9626005996528326, + "grad_norm": 0.6004132628440857, + "learning_rate": 3.857583720035103e-06, + "loss": 0.5691, + "step": 6100 + }, + { + "epoch": 0.9627584030298248, + "grad_norm": 0.6111620664596558, + "learning_rate": 3.85723495850391e-06, + "loss": 0.5602, + "step": 6101 + }, + { + "epoch": 0.9629162064068171, + "grad_norm": 0.5857158899307251, + "learning_rate": 3.8568861595163375e-06, + "loss": 0.579, + "step": 6102 + }, + { + "epoch": 0.9630740097838094, + "grad_norm": 0.5779225826263428, + "learning_rate": 3.856537323082012e-06, + "loss": 0.5639, + "step": 6103 + }, + { + "epoch": 0.9632318131608016, + "grad_norm": 0.591309666633606, + "learning_rate": 3.85618844921056e-06, + "loss": 0.5429, + "step": 6104 + }, + { + "epoch": 0.9633896165377939, + "grad_norm": 0.5712352991104126, + "learning_rate": 3.85583953791161e-06, + "loss": 0.5606, + "step": 6105 + }, + { + "epoch": 0.9635474199147862, + "grad_norm": 0.5759222507476807, + "learning_rate": 3.8554905891947916e-06, + "loss": 0.5453, + "step": 6106 + }, + { + "epoch": 0.9637052232917784, + "grad_norm": 0.5773808360099792, + "learning_rate": 3.8551416030697345e-06, + "loss": 0.5597, + "step": 6107 + }, + { + "epoch": 0.9638630266687707, + "grad_norm": 0.6166070699691772, + "learning_rate": 3.854792579546069e-06, + "loss": 0.5701, + "step": 6108 + }, + { + "epoch": 0.964020830045763, + "grad_norm": 0.583755373954773, + "learning_rate": 3.854443518633429e-06, + "loss": 0.5345, + "step": 6109 + }, + { + "epoch": 0.9641786334227552, + "grad_norm": 0.6040862202644348, + "learning_rate": 3.854094420341446e-06, + "loss": 0.5529, + "step": 6110 + }, + { + "epoch": 0.9643364367997476, + "grad_norm": 0.602363646030426, + "learning_rate": 3.853745284679755e-06, + "loss": 0.5197, + "step": 6111 + }, + { + "epoch": 0.9644942401767398, + "grad_norm": 0.5896916389465332, + "learning_rate": 3.853396111657992e-06, + "loss": 0.5543, + "step": 6112 + }, + { + "epoch": 0.964652043553732, + "grad_norm": 0.5685459971427917, + "learning_rate": 3.8530469012857914e-06, + "loss": 0.5958, + "step": 6113 + }, + { + "epoch": 0.9648098469307244, + "grad_norm": 0.5771709084510803, + "learning_rate": 3.8526976535727936e-06, + "loss": 0.5732, + "step": 6114 + }, + { + "epoch": 0.9649676503077166, + "grad_norm": 0.5600669384002686, + "learning_rate": 3.852348368528635e-06, + "loss": 0.5682, + "step": 6115 + }, + { + "epoch": 0.9651254536847088, + "grad_norm": 0.590048611164093, + "learning_rate": 3.851999046162955e-06, + "loss": 0.5708, + "step": 6116 + }, + { + "epoch": 0.9652832570617011, + "grad_norm": 0.5904655456542969, + "learning_rate": 3.851649686485394e-06, + "loss": 0.5443, + "step": 6117 + }, + { + "epoch": 0.9654410604386934, + "grad_norm": 0.5495908260345459, + "learning_rate": 3.851300289505593e-06, + "loss": 0.5516, + "step": 6118 + }, + { + "epoch": 0.9655988638156856, + "grad_norm": 0.5741522908210754, + "learning_rate": 3.850950855233197e-06, + "loss": 0.53, + "step": 6119 + }, + { + "epoch": 0.9657566671926779, + "grad_norm": 0.5599603652954102, + "learning_rate": 3.850601383677846e-06, + "loss": 0.5848, + "step": 6120 + }, + { + "epoch": 0.9659144705696702, + "grad_norm": 0.5730763077735901, + "learning_rate": 3.850251874849187e-06, + "loss": 0.5449, + "step": 6121 + }, + { + "epoch": 0.9660722739466625, + "grad_norm": 0.5989363193511963, + "learning_rate": 3.849902328756865e-06, + "loss": 0.5664, + "step": 6122 + }, + { + "epoch": 0.9662300773236547, + "grad_norm": 0.6020540595054626, + "learning_rate": 3.849552745410527e-06, + "loss": 0.5565, + "step": 6123 + }, + { + "epoch": 0.966387880700647, + "grad_norm": 0.5828388333320618, + "learning_rate": 3.84920312481982e-06, + "loss": 0.5803, + "step": 6124 + }, + { + "epoch": 0.9665456840776393, + "grad_norm": 0.5999464392662048, + "learning_rate": 3.848853466994392e-06, + "loss": 0.5804, + "step": 6125 + }, + { + "epoch": 0.9667034874546315, + "grad_norm": 0.5890252590179443, + "learning_rate": 3.8485037719438944e-06, + "loss": 0.5618, + "step": 6126 + }, + { + "epoch": 0.9668612908316238, + "grad_norm": 0.5738828778266907, + "learning_rate": 3.8481540396779776e-06, + "loss": 0.554, + "step": 6127 + }, + { + "epoch": 0.9670190942086161, + "grad_norm": 0.6026695966720581, + "learning_rate": 3.847804270206293e-06, + "loss": 0.5861, + "step": 6128 + }, + { + "epoch": 0.9671768975856083, + "grad_norm": 0.5926797986030579, + "learning_rate": 3.8474544635384916e-06, + "loss": 0.55, + "step": 6129 + }, + { + "epoch": 0.9673347009626005, + "grad_norm": 0.5906752943992615, + "learning_rate": 3.8471046196842306e-06, + "loss": 0.5242, + "step": 6130 + }, + { + "epoch": 0.9674925043395929, + "grad_norm": 0.5848662853240967, + "learning_rate": 3.8467547386531624e-06, + "loss": 0.6161, + "step": 6131 + }, + { + "epoch": 0.9676503077165851, + "grad_norm": 0.5815879106521606, + "learning_rate": 3.846404820454944e-06, + "loss": 0.5688, + "step": 6132 + }, + { + "epoch": 0.9678081110935775, + "grad_norm": 0.5739962458610535, + "learning_rate": 3.846054865099231e-06, + "loss": 0.5524, + "step": 6133 + }, + { + "epoch": 0.9679659144705697, + "grad_norm": 0.5951026082038879, + "learning_rate": 3.845704872595683e-06, + "loss": 0.5765, + "step": 6134 + }, + { + "epoch": 0.9681237178475619, + "grad_norm": 0.5798717737197876, + "learning_rate": 3.845354842953958e-06, + "loss": 0.5676, + "step": 6135 + }, + { + "epoch": 0.9682815212245542, + "grad_norm": 0.5692702531814575, + "learning_rate": 3.845004776183716e-06, + "loss": 0.5609, + "step": 6136 + }, + { + "epoch": 0.9684393246015465, + "grad_norm": 0.5881890058517456, + "learning_rate": 3.844654672294619e-06, + "loss": 0.5198, + "step": 6137 + }, + { + "epoch": 0.9685971279785387, + "grad_norm": 0.5906083583831787, + "learning_rate": 3.844304531296327e-06, + "loss": 0.5753, + "step": 6138 + }, + { + "epoch": 0.968754931355531, + "grad_norm": 0.5870009660720825, + "learning_rate": 3.843954353198504e-06, + "loss": 0.6, + "step": 6139 + }, + { + "epoch": 0.9689127347325233, + "grad_norm": 0.5921463966369629, + "learning_rate": 3.843604138010815e-06, + "loss": 0.5562, + "step": 6140 + }, + { + "epoch": 0.9690705381095155, + "grad_norm": 0.6044179797172546, + "learning_rate": 3.843253885742924e-06, + "loss": 0.5651, + "step": 6141 + }, + { + "epoch": 0.9692283414865078, + "grad_norm": 0.5834926962852478, + "learning_rate": 3.842903596404497e-06, + "loss": 0.5582, + "step": 6142 + }, + { + "epoch": 0.9693861448635, + "grad_norm": 0.5605986714363098, + "learning_rate": 3.842553270005203e-06, + "loss": 0.5467, + "step": 6143 + }, + { + "epoch": 0.9695439482404924, + "grad_norm": 0.5699493885040283, + "learning_rate": 3.842202906554706e-06, + "loss": 0.5584, + "step": 6144 + }, + { + "epoch": 0.9697017516174846, + "grad_norm": 0.6078549027442932, + "learning_rate": 3.84185250606268e-06, + "loss": 0.5632, + "step": 6145 + }, + { + "epoch": 0.9698595549944768, + "grad_norm": 0.5870882868766785, + "learning_rate": 3.841502068538793e-06, + "loss": 0.5197, + "step": 6146 + }, + { + "epoch": 0.9700173583714692, + "grad_norm": 0.6239046454429626, + "learning_rate": 3.841151593992716e-06, + "loss": 0.5744, + "step": 6147 + }, + { + "epoch": 0.9701751617484614, + "grad_norm": 0.5899212956428528, + "learning_rate": 3.840801082434121e-06, + "loss": 0.5913, + "step": 6148 + }, + { + "epoch": 0.9703329651254536, + "grad_norm": 0.5781940817832947, + "learning_rate": 3.840450533872683e-06, + "loss": 0.5863, + "step": 6149 + }, + { + "epoch": 0.970490768502446, + "grad_norm": 0.6505781412124634, + "learning_rate": 3.840099948318074e-06, + "loss": 0.5725, + "step": 6150 + }, + { + "epoch": 0.9706485718794382, + "grad_norm": 0.6046651601791382, + "learning_rate": 3.839749325779971e-06, + "loss": 0.5384, + "step": 6151 + }, + { + "epoch": 0.9708063752564305, + "grad_norm": 0.6258700489997864, + "learning_rate": 3.839398666268049e-06, + "loss": 0.5874, + "step": 6152 + }, + { + "epoch": 0.9709641786334228, + "grad_norm": 0.619891881942749, + "learning_rate": 3.839047969791987e-06, + "loss": 0.5535, + "step": 6153 + }, + { + "epoch": 0.971121982010415, + "grad_norm": 0.5551032423973083, + "learning_rate": 3.838697236361462e-06, + "loss": 0.5766, + "step": 6154 + }, + { + "epoch": 0.9712797853874073, + "grad_norm": 0.5931451916694641, + "learning_rate": 3.838346465986153e-06, + "loss": 0.5539, + "step": 6155 + }, + { + "epoch": 0.9714375887643996, + "grad_norm": 0.5901145935058594, + "learning_rate": 3.837995658675743e-06, + "loss": 0.5887, + "step": 6156 + }, + { + "epoch": 0.9715953921413918, + "grad_norm": 0.6098012924194336, + "learning_rate": 3.8376448144399096e-06, + "loss": 0.5606, + "step": 6157 + }, + { + "epoch": 0.9717531955183841, + "grad_norm": 0.5891311764717102, + "learning_rate": 3.837293933288339e-06, + "loss": 0.5363, + "step": 6158 + }, + { + "epoch": 0.9719109988953764, + "grad_norm": 0.5716434717178345, + "learning_rate": 3.836943015230713e-06, + "loss": 0.5222, + "step": 6159 + }, + { + "epoch": 0.9720688022723686, + "grad_norm": 0.6075356602668762, + "learning_rate": 3.836592060276715e-06, + "loss": 0.5677, + "step": 6160 + }, + { + "epoch": 0.9722266056493609, + "grad_norm": 0.5731530785560608, + "learning_rate": 3.8362410684360315e-06, + "loss": 0.5518, + "step": 6161 + }, + { + "epoch": 0.9723844090263531, + "grad_norm": 0.5774710774421692, + "learning_rate": 3.83589003971835e-06, + "loss": 0.5608, + "step": 6162 + }, + { + "epoch": 0.9725422124033455, + "grad_norm": 0.5895016193389893, + "learning_rate": 3.835538974133356e-06, + "loss": 0.5489, + "step": 6163 + }, + { + "epoch": 0.9727000157803377, + "grad_norm": 0.5945772528648376, + "learning_rate": 3.835187871690741e-06, + "loss": 0.549, + "step": 6164 + }, + { + "epoch": 0.9728578191573299, + "grad_norm": 0.5837388634681702, + "learning_rate": 3.834836732400191e-06, + "loss": 0.5816, + "step": 6165 + }, + { + "epoch": 0.9730156225343223, + "grad_norm": 0.6129700541496277, + "learning_rate": 3.8344855562714e-06, + "loss": 0.5801, + "step": 6166 + }, + { + "epoch": 0.9731734259113145, + "grad_norm": 0.5862283110618591, + "learning_rate": 3.834134343314056e-06, + "loss": 0.5783, + "step": 6167 + }, + { + "epoch": 0.9733312292883067, + "grad_norm": 0.5909649133682251, + "learning_rate": 3.833783093537855e-06, + "loss": 0.58, + "step": 6168 + }, + { + "epoch": 0.9734890326652991, + "grad_norm": 0.5980948209762573, + "learning_rate": 3.833431806952489e-06, + "loss": 0.5701, + "step": 6169 + }, + { + "epoch": 0.9736468360422913, + "grad_norm": 0.6161919236183167, + "learning_rate": 3.833080483567653e-06, + "loss": 0.5668, + "step": 6170 + }, + { + "epoch": 0.9738046394192835, + "grad_norm": 0.5842013359069824, + "learning_rate": 3.8327291233930415e-06, + "loss": 0.5554, + "step": 6171 + }, + { + "epoch": 0.9739624427962759, + "grad_norm": 0.5750848054885864, + "learning_rate": 3.832377726438353e-06, + "loss": 0.5696, + "step": 6172 + }, + { + "epoch": 0.9741202461732681, + "grad_norm": 0.5863490700721741, + "learning_rate": 3.832026292713285e-06, + "loss": 0.5717, + "step": 6173 + }, + { + "epoch": 0.9742780495502604, + "grad_norm": 0.6002809405326843, + "learning_rate": 3.831674822227535e-06, + "loss": 0.5816, + "step": 6174 + }, + { + "epoch": 0.9744358529272527, + "grad_norm": 0.565726101398468, + "learning_rate": 3.831323314990803e-06, + "loss": 0.5626, + "step": 6175 + }, + { + "epoch": 0.9745936563042449, + "grad_norm": 0.5678207874298096, + "learning_rate": 3.83097177101279e-06, + "loss": 0.5718, + "step": 6176 + }, + { + "epoch": 0.9747514596812372, + "grad_norm": 0.5790578722953796, + "learning_rate": 3.830620190303199e-06, + "loss": 0.5735, + "step": 6177 + }, + { + "epoch": 0.9749092630582294, + "grad_norm": 0.5633588433265686, + "learning_rate": 3.83026857287173e-06, + "loss": 0.5556, + "step": 6178 + }, + { + "epoch": 0.9750670664352217, + "grad_norm": 0.65196293592453, + "learning_rate": 3.82991691872809e-06, + "loss": 0.5273, + "step": 6179 + }, + { + "epoch": 0.975224869812214, + "grad_norm": 0.6000386476516724, + "learning_rate": 3.8295652278819814e-06, + "loss": 0.5806, + "step": 6180 + }, + { + "epoch": 0.9753826731892062, + "grad_norm": 0.5956845879554749, + "learning_rate": 3.829213500343111e-06, + "loss": 0.595, + "step": 6181 + }, + { + "epoch": 0.9755404765661985, + "grad_norm": 0.5937519073486328, + "learning_rate": 3.828861736121186e-06, + "loss": 0.5483, + "step": 6182 + }, + { + "epoch": 0.9756982799431908, + "grad_norm": 0.586082398891449, + "learning_rate": 3.828509935225912e-06, + "loss": 0.5416, + "step": 6183 + }, + { + "epoch": 0.975856083320183, + "grad_norm": 0.5677700638771057, + "learning_rate": 3.828158097667001e-06, + "loss": 0.564, + "step": 6184 + }, + { + "epoch": 0.9760138866971754, + "grad_norm": 0.5906929969787598, + "learning_rate": 3.8278062234541615e-06, + "loss": 0.5592, + "step": 6185 + }, + { + "epoch": 0.9761716900741676, + "grad_norm": 0.5439387559890747, + "learning_rate": 3.827454312597103e-06, + "loss": 0.5528, + "step": 6186 + }, + { + "epoch": 0.9763294934511598, + "grad_norm": 0.5708004832267761, + "learning_rate": 3.82710236510554e-06, + "loss": 0.5561, + "step": 6187 + }, + { + "epoch": 0.9764872968281522, + "grad_norm": 0.579961895942688, + "learning_rate": 3.826750380989184e-06, + "loss": 0.5465, + "step": 6188 + }, + { + "epoch": 0.9766451002051444, + "grad_norm": 0.6025300025939941, + "learning_rate": 3.826398360257747e-06, + "loss": 0.5721, + "step": 6189 + }, + { + "epoch": 0.9768029035821366, + "grad_norm": 0.5713192224502563, + "learning_rate": 3.826046302920948e-06, + "loss": 0.5551, + "step": 6190 + }, + { + "epoch": 0.976960706959129, + "grad_norm": 0.588024914264679, + "learning_rate": 3.8256942089885e-06, + "loss": 0.5799, + "step": 6191 + }, + { + "epoch": 0.9771185103361212, + "grad_norm": 0.5871561765670776, + "learning_rate": 3.825342078470121e-06, + "loss": 0.5535, + "step": 6192 + }, + { + "epoch": 0.9772763137131134, + "grad_norm": 0.5925978422164917, + "learning_rate": 3.824989911375529e-06, + "loss": 0.5745, + "step": 6193 + }, + { + "epoch": 0.9774341170901057, + "grad_norm": 0.5697079300880432, + "learning_rate": 3.824637707714441e-06, + "loss": 0.5535, + "step": 6194 + }, + { + "epoch": 0.977591920467098, + "grad_norm": 0.5730113387107849, + "learning_rate": 3.8242854674965806e-06, + "loss": 0.5637, + "step": 6195 + }, + { + "epoch": 0.9777497238440903, + "grad_norm": 0.553878903388977, + "learning_rate": 3.823933190731666e-06, + "loss": 0.563, + "step": 6196 + }, + { + "epoch": 0.9779075272210825, + "grad_norm": 0.6174440979957581, + "learning_rate": 3.823580877429419e-06, + "loss": 0.5823, + "step": 6197 + }, + { + "epoch": 0.9780653305980748, + "grad_norm": 0.5768445730209351, + "learning_rate": 3.8232285275995646e-06, + "loss": 0.5578, + "step": 6198 + }, + { + "epoch": 0.9782231339750671, + "grad_norm": 0.6500750184059143, + "learning_rate": 3.8228761412518255e-06, + "loss": 0.5751, + "step": 6199 + }, + { + "epoch": 0.9783809373520593, + "grad_norm": 0.5853930115699768, + "learning_rate": 3.822523718395926e-06, + "loss": 0.5227, + "step": 6200 + }, + { + "epoch": 0.9785387407290516, + "grad_norm": 0.5774327516555786, + "learning_rate": 3.822171259041595e-06, + "loss": 0.5661, + "step": 6201 + }, + { + "epoch": 0.9786965441060439, + "grad_norm": 0.5721741318702698, + "learning_rate": 3.8218187631985555e-06, + "loss": 0.5444, + "step": 6202 + }, + { + "epoch": 0.9788543474830361, + "grad_norm": 0.5643377900123596, + "learning_rate": 3.82146623087654e-06, + "loss": 0.5386, + "step": 6203 + }, + { + "epoch": 0.9790121508600284, + "grad_norm": 0.5830936431884766, + "learning_rate": 3.821113662085273e-06, + "loss": 0.5727, + "step": 6204 + }, + { + "epoch": 0.9791699542370207, + "grad_norm": 0.5873678922653198, + "learning_rate": 3.820761056834487e-06, + "loss": 0.5431, + "step": 6205 + }, + { + "epoch": 0.9793277576140129, + "grad_norm": 0.5893053412437439, + "learning_rate": 3.820408415133913e-06, + "loss": 0.5473, + "step": 6206 + }, + { + "epoch": 0.9794855609910053, + "grad_norm": 0.5744673013687134, + "learning_rate": 3.820055736993282e-06, + "loss": 0.549, + "step": 6207 + }, + { + "epoch": 0.9796433643679975, + "grad_norm": 0.601440966129303, + "learning_rate": 3.819703022422329e-06, + "loss": 0.5878, + "step": 6208 + }, + { + "epoch": 0.9798011677449897, + "grad_norm": 0.5841817259788513, + "learning_rate": 3.8193502714307865e-06, + "loss": 0.5718, + "step": 6209 + }, + { + "epoch": 0.979958971121982, + "grad_norm": 0.5566737651824951, + "learning_rate": 3.818997484028389e-06, + "loss": 0.5531, + "step": 6210 + }, + { + "epoch": 0.9801167744989743, + "grad_norm": 0.6075260043144226, + "learning_rate": 3.818644660224875e-06, + "loss": 0.5901, + "step": 6211 + }, + { + "epoch": 0.9802745778759665, + "grad_norm": 0.5814286470413208, + "learning_rate": 3.81829180002998e-06, + "loss": 0.5464, + "step": 6212 + }, + { + "epoch": 0.9804323812529588, + "grad_norm": 0.6025444269180298, + "learning_rate": 3.817938903453442e-06, + "loss": 0.5607, + "step": 6213 + }, + { + "epoch": 0.9805901846299511, + "grad_norm": 0.6389800906181335, + "learning_rate": 3.8175859705050004e-06, + "loss": 0.5837, + "step": 6214 + }, + { + "epoch": 0.9807479880069433, + "grad_norm": 0.5741757154464722, + "learning_rate": 3.8172330011943934e-06, + "loss": 0.5507, + "step": 6215 + }, + { + "epoch": 0.9809057913839356, + "grad_norm": 0.5794395804405212, + "learning_rate": 3.816879995531366e-06, + "loss": 0.5596, + "step": 6216 + }, + { + "epoch": 0.9810635947609279, + "grad_norm": 0.5884330868721008, + "learning_rate": 3.816526953525656e-06, + "loss": 0.5674, + "step": 6217 + }, + { + "epoch": 0.9812213981379202, + "grad_norm": 0.589704155921936, + "learning_rate": 3.816173875187011e-06, + "loss": 0.531, + "step": 6218 + }, + { + "epoch": 0.9813792015149124, + "grad_norm": 0.5805954337120056, + "learning_rate": 3.815820760525172e-06, + "loss": 0.569, + "step": 6219 + }, + { + "epoch": 0.9815370048919047, + "grad_norm": 0.5916928052902222, + "learning_rate": 3.815467609549884e-06, + "loss": 0.5972, + "step": 6220 + }, + { + "epoch": 0.981694808268897, + "grad_norm": 0.6479397416114807, + "learning_rate": 3.815114422270895e-06, + "loss": 0.5619, + "step": 6221 + }, + { + "epoch": 0.9818526116458892, + "grad_norm": 0.5857071876525879, + "learning_rate": 3.8147611986979504e-06, + "loss": 0.5572, + "step": 6222 + }, + { + "epoch": 0.9820104150228814, + "grad_norm": 0.5999612808227539, + "learning_rate": 3.8144079388407984e-06, + "loss": 0.5792, + "step": 6223 + }, + { + "epoch": 0.9821682183998738, + "grad_norm": 0.5840067267417908, + "learning_rate": 3.8140546427091894e-06, + "loss": 0.5405, + "step": 6224 + }, + { + "epoch": 0.982326021776866, + "grad_norm": 0.6306602954864502, + "learning_rate": 3.8137013103128735e-06, + "loss": 0.5415, + "step": 6225 + }, + { + "epoch": 0.9824838251538583, + "grad_norm": 0.5911625027656555, + "learning_rate": 3.8133479416616e-06, + "loss": 0.5579, + "step": 6226 + }, + { + "epoch": 0.9826416285308506, + "grad_norm": 0.5657343864440918, + "learning_rate": 3.8129945367651226e-06, + "loss": 0.5319, + "step": 6227 + }, + { + "epoch": 0.9827994319078428, + "grad_norm": 0.5849944353103638, + "learning_rate": 3.812641095633194e-06, + "loss": 0.5418, + "step": 6228 + }, + { + "epoch": 0.9829572352848351, + "grad_norm": 0.5739319324493408, + "learning_rate": 3.8122876182755687e-06, + "loss": 0.5516, + "step": 6229 + }, + { + "epoch": 0.9831150386618274, + "grad_norm": 0.5908547639846802, + "learning_rate": 3.811934104702001e-06, + "loss": 0.5637, + "step": 6230 + }, + { + "epoch": 0.9832728420388196, + "grad_norm": 0.5828875303268433, + "learning_rate": 3.811580554922246e-06, + "loss": 0.5823, + "step": 6231 + }, + { + "epoch": 0.9834306454158119, + "grad_norm": 0.6082329154014587, + "learning_rate": 3.811226968946064e-06, + "loss": 0.5779, + "step": 6232 + }, + { + "epoch": 0.9835884487928042, + "grad_norm": 0.6154103875160217, + "learning_rate": 3.810873346783211e-06, + "loss": 0.591, + "step": 6233 + }, + { + "epoch": 0.9837462521697964, + "grad_norm": 0.5782674551010132, + "learning_rate": 3.810519688443446e-06, + "loss": 0.5866, + "step": 6234 + }, + { + "epoch": 0.9839040555467887, + "grad_norm": 0.5817796587944031, + "learning_rate": 3.8101659939365298e-06, + "loss": 0.5871, + "step": 6235 + }, + { + "epoch": 0.984061858923781, + "grad_norm": 0.583256721496582, + "learning_rate": 3.8098122632722225e-06, + "loss": 0.5497, + "step": 6236 + }, + { + "epoch": 0.9842196623007733, + "grad_norm": 0.5606623291969299, + "learning_rate": 3.8094584964602878e-06, + "loss": 0.581, + "step": 6237 + }, + { + "epoch": 0.9843774656777655, + "grad_norm": 0.5694546103477478, + "learning_rate": 3.809104693510488e-06, + "loss": 0.571, + "step": 6238 + }, + { + "epoch": 0.9845352690547577, + "grad_norm": 0.6002447009086609, + "learning_rate": 3.808750854432586e-06, + "loss": 0.5364, + "step": 6239 + }, + { + "epoch": 0.9846930724317501, + "grad_norm": 0.6080799102783203, + "learning_rate": 3.8083969792363497e-06, + "loss": 0.5901, + "step": 6240 + }, + { + "epoch": 0.9848508758087423, + "grad_norm": 0.577477216720581, + "learning_rate": 3.808043067931543e-06, + "loss": 0.5812, + "step": 6241 + }, + { + "epoch": 0.9850086791857345, + "grad_norm": 0.6124005913734436, + "learning_rate": 3.807689120527933e-06, + "loss": 0.5487, + "step": 6242 + }, + { + "epoch": 0.9851664825627269, + "grad_norm": 0.5744560956954956, + "learning_rate": 3.8073351370352885e-06, + "loss": 0.5422, + "step": 6243 + }, + { + "epoch": 0.9853242859397191, + "grad_norm": 0.5868034362792969, + "learning_rate": 3.8069811174633776e-06, + "loss": 0.5703, + "step": 6244 + }, + { + "epoch": 0.9854820893167113, + "grad_norm": 0.5929223895072937, + "learning_rate": 3.8066270618219725e-06, + "loss": 0.5427, + "step": 6245 + }, + { + "epoch": 0.9856398926937037, + "grad_norm": 0.5863749384880066, + "learning_rate": 3.8062729701208422e-06, + "loss": 0.5444, + "step": 6246 + }, + { + "epoch": 0.9857976960706959, + "grad_norm": 0.571281373500824, + "learning_rate": 3.80591884236976e-06, + "loss": 0.5489, + "step": 6247 + }, + { + "epoch": 0.9859554994476882, + "grad_norm": 0.5601460933685303, + "learning_rate": 3.805564678578498e-06, + "loss": 0.5706, + "step": 6248 + }, + { + "epoch": 0.9861133028246805, + "grad_norm": 0.6095417141914368, + "learning_rate": 3.8052104787568312e-06, + "loss": 0.555, + "step": 6249 + }, + { + "epoch": 0.9862711062016727, + "grad_norm": 0.5957404971122742, + "learning_rate": 3.8048562429145343e-06, + "loss": 0.5539, + "step": 6250 + }, + { + "epoch": 0.986428909578665, + "grad_norm": 0.5793875455856323, + "learning_rate": 3.804501971061383e-06, + "loss": 0.585, + "step": 6251 + }, + { + "epoch": 0.9865867129556573, + "grad_norm": 0.5992040038108826, + "learning_rate": 3.8041476632071537e-06, + "loss": 0.5369, + "step": 6252 + }, + { + "epoch": 0.9867445163326495, + "grad_norm": 0.590179979801178, + "learning_rate": 3.8037933193616262e-06, + "loss": 0.5861, + "step": 6253 + }, + { + "epoch": 0.9869023197096418, + "grad_norm": 0.5643383264541626, + "learning_rate": 3.8034389395345784e-06, + "loss": 0.5672, + "step": 6254 + }, + { + "epoch": 0.987060123086634, + "grad_norm": 0.5804921984672546, + "learning_rate": 3.8030845237357903e-06, + "loss": 0.5627, + "step": 6255 + }, + { + "epoch": 0.9872179264636263, + "grad_norm": 0.5788939595222473, + "learning_rate": 3.8027300719750437e-06, + "loss": 0.5341, + "step": 6256 + }, + { + "epoch": 0.9873757298406186, + "grad_norm": 0.587116539478302, + "learning_rate": 3.80237558426212e-06, + "loss": 0.5851, + "step": 6257 + }, + { + "epoch": 0.9875335332176108, + "grad_norm": 0.5570642352104187, + "learning_rate": 3.802021060606802e-06, + "loss": 0.5794, + "step": 6258 + }, + { + "epoch": 0.9876913365946032, + "grad_norm": 0.590650737285614, + "learning_rate": 3.8016665010188747e-06, + "loss": 0.572, + "step": 6259 + }, + { + "epoch": 0.9878491399715954, + "grad_norm": 0.5666354298591614, + "learning_rate": 3.801311905508121e-06, + "loss": 0.6003, + "step": 6260 + }, + { + "epoch": 0.9880069433485876, + "grad_norm": 0.6111732125282288, + "learning_rate": 3.8009572740843293e-06, + "loss": 0.5391, + "step": 6261 + }, + { + "epoch": 0.98816474672558, + "grad_norm": 0.5711501240730286, + "learning_rate": 3.8006026067572856e-06, + "loss": 0.565, + "step": 6262 + }, + { + "epoch": 0.9883225501025722, + "grad_norm": 0.5845191478729248, + "learning_rate": 3.800247903536777e-06, + "loss": 0.5221, + "step": 6263 + }, + { + "epoch": 0.9884803534795644, + "grad_norm": 0.578768253326416, + "learning_rate": 3.7998931644325933e-06, + "loss": 0.5329, + "step": 6264 + }, + { + "epoch": 0.9886381568565568, + "grad_norm": 0.6101123094558716, + "learning_rate": 3.799538389454524e-06, + "loss": 0.5316, + "step": 6265 + }, + { + "epoch": 0.988795960233549, + "grad_norm": 0.5677991509437561, + "learning_rate": 3.799183578612361e-06, + "loss": 0.556, + "step": 6266 + }, + { + "epoch": 0.9889537636105412, + "grad_norm": 0.6080459952354431, + "learning_rate": 3.7988287319158955e-06, + "loss": 0.5824, + "step": 6267 + }, + { + "epoch": 0.9891115669875336, + "grad_norm": 0.5780219435691833, + "learning_rate": 3.79847384937492e-06, + "loss": 0.5815, + "step": 6268 + }, + { + "epoch": 0.9892693703645258, + "grad_norm": 0.5811241269111633, + "learning_rate": 3.7981189309992295e-06, + "loss": 0.5858, + "step": 6269 + }, + { + "epoch": 0.9894271737415181, + "grad_norm": 0.5727960467338562, + "learning_rate": 3.7977639767986175e-06, + "loss": 0.5289, + "step": 6270 + }, + { + "epoch": 0.9895849771185103, + "grad_norm": 0.6014429330825806, + "learning_rate": 3.7974089867828813e-06, + "loss": 0.5823, + "step": 6271 + }, + { + "epoch": 0.9897427804955026, + "grad_norm": 0.5615416765213013, + "learning_rate": 3.7970539609618164e-06, + "loss": 0.5437, + "step": 6272 + }, + { + "epoch": 0.9899005838724949, + "grad_norm": 0.5648170113563538, + "learning_rate": 3.7966988993452213e-06, + "loss": 0.5932, + "step": 6273 + }, + { + "epoch": 0.9900583872494871, + "grad_norm": 0.5969148874282837, + "learning_rate": 3.7963438019428954e-06, + "loss": 0.5607, + "step": 6274 + }, + { + "epoch": 0.9902161906264794, + "grad_norm": 0.5615748763084412, + "learning_rate": 3.795988668764638e-06, + "loss": 0.5454, + "step": 6275 + }, + { + "epoch": 0.9903739940034717, + "grad_norm": 0.5959685444831848, + "learning_rate": 3.795633499820249e-06, + "loss": 0.54, + "step": 6276 + }, + { + "epoch": 0.9905317973804639, + "grad_norm": 0.561375081539154, + "learning_rate": 3.795278295119532e-06, + "loss": 0.5389, + "step": 6277 + }, + { + "epoch": 0.9906896007574562, + "grad_norm": 0.5620150566101074, + "learning_rate": 3.7949230546722886e-06, + "loss": 0.5365, + "step": 6278 + }, + { + "epoch": 0.9908474041344485, + "grad_norm": 0.5867237448692322, + "learning_rate": 3.794567778488323e-06, + "loss": 0.5406, + "step": 6279 + }, + { + "epoch": 0.9910052075114407, + "grad_norm": 0.5855279564857483, + "learning_rate": 3.7942124665774394e-06, + "loss": 0.5563, + "step": 6280 + }, + { + "epoch": 0.9911630108884331, + "grad_norm": 0.6031869649887085, + "learning_rate": 3.7938571189494446e-06, + "loss": 0.5716, + "step": 6281 + }, + { + "epoch": 0.9913208142654253, + "grad_norm": 0.579580545425415, + "learning_rate": 3.7935017356141436e-06, + "loss": 0.5475, + "step": 6282 + }, + { + "epoch": 0.9914786176424175, + "grad_norm": 0.5748009085655212, + "learning_rate": 3.793146316581346e-06, + "loss": 0.5716, + "step": 6283 + }, + { + "epoch": 0.9916364210194099, + "grad_norm": 0.5814832448959351, + "learning_rate": 3.7927908618608588e-06, + "loss": 0.5399, + "step": 6284 + }, + { + "epoch": 0.9917942243964021, + "grad_norm": 0.5607571005821228, + "learning_rate": 3.792435371462494e-06, + "loss": 0.5464, + "step": 6285 + }, + { + "epoch": 0.9919520277733943, + "grad_norm": 0.5912891626358032, + "learning_rate": 3.7920798453960596e-06, + "loss": 0.5686, + "step": 6286 + }, + { + "epoch": 0.9921098311503866, + "grad_norm": 0.5949938893318176, + "learning_rate": 3.7917242836713686e-06, + "loss": 0.5874, + "step": 6287 + }, + { + "epoch": 0.9922676345273789, + "grad_norm": 0.5872530937194824, + "learning_rate": 3.791368686298234e-06, + "loss": 0.5422, + "step": 6288 + }, + { + "epoch": 0.9924254379043711, + "grad_norm": 0.786261260509491, + "learning_rate": 3.7910130532864687e-06, + "loss": 0.5936, + "step": 6289 + }, + { + "epoch": 0.9925832412813634, + "grad_norm": 0.615193784236908, + "learning_rate": 3.7906573846458876e-06, + "loss": 0.548, + "step": 6290 + }, + { + "epoch": 0.9927410446583557, + "grad_norm": 0.5664339065551758, + "learning_rate": 3.7903016803863064e-06, + "loss": 0.5261, + "step": 6291 + }, + { + "epoch": 0.992898848035348, + "grad_norm": 0.5788701772689819, + "learning_rate": 3.789945940517541e-06, + "loss": 0.5819, + "step": 6292 + }, + { + "epoch": 0.9930566514123402, + "grad_norm": 0.5788673758506775, + "learning_rate": 3.7895901650494103e-06, + "loss": 0.5283, + "step": 6293 + }, + { + "epoch": 0.9932144547893325, + "grad_norm": 0.5590288639068604, + "learning_rate": 3.789234353991731e-06, + "loss": 0.59, + "step": 6294 + }, + { + "epoch": 0.9933722581663248, + "grad_norm": 0.5754006505012512, + "learning_rate": 3.7888785073543244e-06, + "loss": 0.5893, + "step": 6295 + }, + { + "epoch": 0.993530061543317, + "grad_norm": 0.5751763582229614, + "learning_rate": 3.78852262514701e-06, + "loss": 0.5462, + "step": 6296 + }, + { + "epoch": 0.9936878649203092, + "grad_norm": 0.5537508726119995, + "learning_rate": 3.7881667073796093e-06, + "loss": 0.5579, + "step": 6297 + }, + { + "epoch": 0.9938456682973016, + "grad_norm": 0.5604907274246216, + "learning_rate": 3.787810754061945e-06, + "loss": 0.6016, + "step": 6298 + }, + { + "epoch": 0.9940034716742938, + "grad_norm": 0.6025168895721436, + "learning_rate": 3.787454765203841e-06, + "loss": 0.5482, + "step": 6299 + }, + { + "epoch": 0.9941612750512862, + "grad_norm": 0.5634483695030212, + "learning_rate": 3.7870987408151204e-06, + "loss": 0.5493, + "step": 6300 + }, + { + "epoch": 0.9943190784282784, + "grad_norm": 0.6093750596046448, + "learning_rate": 3.7867426809056103e-06, + "loss": 0.5577, + "step": 6301 + }, + { + "epoch": 0.9944768818052706, + "grad_norm": 0.6087208986282349, + "learning_rate": 3.786386585485136e-06, + "loss": 0.5614, + "step": 6302 + }, + { + "epoch": 0.994634685182263, + "grad_norm": 0.5513201951980591, + "learning_rate": 3.7860304545635246e-06, + "loss": 0.5841, + "step": 6303 + }, + { + "epoch": 0.9947924885592552, + "grad_norm": 0.5768284797668457, + "learning_rate": 3.785674288150605e-06, + "loss": 0.5268, + "step": 6304 + }, + { + "epoch": 0.9949502919362474, + "grad_norm": 0.566495954990387, + "learning_rate": 3.7853180862562065e-06, + "loss": 0.5357, + "step": 6305 + }, + { + "epoch": 0.9951080953132397, + "grad_norm": 0.5882160663604736, + "learning_rate": 3.7849618488901595e-06, + "loss": 0.596, + "step": 6306 + }, + { + "epoch": 0.995265898690232, + "grad_norm": 0.5823222994804382, + "learning_rate": 3.784605576062296e-06, + "loss": 0.5543, + "step": 6307 + }, + { + "epoch": 0.9954237020672242, + "grad_norm": 0.6286187767982483, + "learning_rate": 3.784249267782446e-06, + "loss": 0.5954, + "step": 6308 + }, + { + "epoch": 0.9955815054442165, + "grad_norm": 0.6140681505203247, + "learning_rate": 3.7838929240604448e-06, + "loss": 0.5915, + "step": 6309 + }, + { + "epoch": 0.9957393088212088, + "grad_norm": 0.5861386060714722, + "learning_rate": 3.783536544906126e-06, + "loss": 0.5473, + "step": 6310 + }, + { + "epoch": 0.9958971121982011, + "grad_norm": 0.5731537938117981, + "learning_rate": 3.783180130329325e-06, + "loss": 0.5395, + "step": 6311 + }, + { + "epoch": 0.9960549155751933, + "grad_norm": 0.6016662120819092, + "learning_rate": 3.782823680339878e-06, + "loss": 0.5607, + "step": 6312 + }, + { + "epoch": 0.9962127189521855, + "grad_norm": 0.5676655173301697, + "learning_rate": 3.782467194947621e-06, + "loss": 0.5436, + "step": 6313 + }, + { + "epoch": 0.9963705223291779, + "grad_norm": 0.5615426301956177, + "learning_rate": 3.782110674162393e-06, + "loss": 0.5437, + "step": 6314 + }, + { + "epoch": 0.9965283257061701, + "grad_norm": 0.6012169718742371, + "learning_rate": 3.7817541179940332e-06, + "loss": 0.5676, + "step": 6315 + }, + { + "epoch": 0.9966861290831623, + "grad_norm": 0.5735711455345154, + "learning_rate": 3.781397526452382e-06, + "loss": 0.5539, + "step": 6316 + }, + { + "epoch": 0.9968439324601547, + "grad_norm": 0.5517914891242981, + "learning_rate": 3.7810408995472804e-06, + "loss": 0.5949, + "step": 6317 + }, + { + "epoch": 0.9970017358371469, + "grad_norm": 0.5662906765937805, + "learning_rate": 3.78068423728857e-06, + "loss": 0.5593, + "step": 6318 + }, + { + "epoch": 0.9971595392141391, + "grad_norm": 0.5893527269363403, + "learning_rate": 3.780327539686093e-06, + "loss": 0.5597, + "step": 6319 + }, + { + "epoch": 0.9973173425911315, + "grad_norm": 0.5612245202064514, + "learning_rate": 3.7799708067496953e-06, + "loss": 0.5816, + "step": 6320 + }, + { + "epoch": 0.9974751459681237, + "grad_norm": 0.5737277865409851, + "learning_rate": 3.77961403848922e-06, + "loss": 0.5575, + "step": 6321 + }, + { + "epoch": 0.997632949345116, + "grad_norm": 0.5657596588134766, + "learning_rate": 3.7792572349145144e-06, + "loss": 0.559, + "step": 6322 + }, + { + "epoch": 0.9977907527221083, + "grad_norm": 0.6005147695541382, + "learning_rate": 3.7789003960354256e-06, + "loss": 0.5763, + "step": 6323 + }, + { + "epoch": 0.9979485560991005, + "grad_norm": 0.5619044899940491, + "learning_rate": 3.7785435218617994e-06, + "loss": 0.5167, + "step": 6324 + }, + { + "epoch": 0.9981063594760928, + "grad_norm": 0.58468097448349, + "learning_rate": 3.7781866124034865e-06, + "loss": 0.5737, + "step": 6325 + }, + { + "epoch": 0.9982641628530851, + "grad_norm": 0.592735230922699, + "learning_rate": 3.777829667670336e-06, + "loss": 0.5309, + "step": 6326 + }, + { + "epoch": 0.9984219662300773, + "grad_norm": 0.5836113095283508, + "learning_rate": 3.7774726876722002e-06, + "loss": 0.5761, + "step": 6327 + }, + { + "epoch": 0.9985797696070696, + "grad_norm": 0.5579439997673035, + "learning_rate": 3.7771156724189284e-06, + "loss": 0.5615, + "step": 6328 + }, + { + "epoch": 0.9987375729840618, + "grad_norm": 0.5972313284873962, + "learning_rate": 3.7767586219203755e-06, + "loss": 0.5742, + "step": 6329 + }, + { + "epoch": 0.9988953763610541, + "grad_norm": 0.6044876575469971, + "learning_rate": 3.776401536186394e-06, + "loss": 0.5749, + "step": 6330 + }, + { + "epoch": 0.9990531797380464, + "grad_norm": 0.5998092293739319, + "learning_rate": 3.776044415226839e-06, + "loss": 0.5511, + "step": 6331 + }, + { + "epoch": 0.9992109831150386, + "grad_norm": 0.5713402032852173, + "learning_rate": 3.775687259051566e-06, + "loss": 0.5702, + "step": 6332 + }, + { + "epoch": 0.999368786492031, + "grad_norm": 0.5544038414955139, + "learning_rate": 3.7753300676704317e-06, + "loss": 0.5564, + "step": 6333 + }, + { + "epoch": 0.9995265898690232, + "grad_norm": 0.5650018453598022, + "learning_rate": 3.7749728410932934e-06, + "loss": 0.5664, + "step": 6334 + }, + { + "epoch": 0.9996843932460154, + "grad_norm": 0.57365882396698, + "learning_rate": 3.7746155793300105e-06, + "loss": 0.5441, + "step": 6335 + }, + { + "epoch": 0.9998421966230078, + "grad_norm": 0.5974447727203369, + "learning_rate": 3.7742582823904416e-06, + "loss": 0.5502, + "step": 6336 + }, + { + "epoch": 1.0, + "grad_norm": 0.5870632529258728, + "learning_rate": 3.7739009502844487e-06, + "loss": 0.5821, + "step": 6337 + } + ], + "logging_steps": 1, + "max_steps": 19011, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 6337, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.2014989094740296e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}