{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 816, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004920049200492005, "grad_norm": 483.4133605957031, "learning_rate": 0.0, "loss": 0.95, "step": 1 }, { "epoch": 0.00984009840098401, "grad_norm": 414.1131286621094, "learning_rate": 8.000000000000001e-06, "loss": 0.7919, "step": 2 }, { "epoch": 0.014760147601476014, "grad_norm": 20.664552688598633, "learning_rate": 1.6000000000000003e-05, "loss": 0.2639, "step": 3 }, { "epoch": 0.01968019680196802, "grad_norm": 19.735389709472656, "learning_rate": 2.4e-05, "loss": 0.3098, "step": 4 }, { "epoch": 0.024600246002460024, "grad_norm": 630.93115234375, "learning_rate": 3.2000000000000005e-05, "loss": 1.3057, "step": 5 }, { "epoch": 0.02952029520295203, "grad_norm": 108.70830535888672, "learning_rate": 4e-05, "loss": 0.784, "step": 6 }, { "epoch": 0.03444034440344403, "grad_norm": 25.684120178222656, "learning_rate": 4.8e-05, "loss": 0.6977, "step": 7 }, { "epoch": 0.03936039360393604, "grad_norm": 28.066545486450195, "learning_rate": 5.6000000000000006e-05, "loss": 0.6476, "step": 8 }, { "epoch": 0.04428044280442804, "grad_norm": 76.08965301513672, "learning_rate": 6.400000000000001e-05, "loss": 0.5767, "step": 9 }, { "epoch": 0.04920049200492005, "grad_norm": 14.550333023071289, "learning_rate": 7.2e-05, "loss": 0.4199, "step": 10 }, { "epoch": 0.05412054120541206, "grad_norm": 66.70437622070312, "learning_rate": 8e-05, "loss": 0.6942, "step": 11 }, { "epoch": 0.05904059040590406, "grad_norm": 73.81010437011719, "learning_rate": 8.800000000000001e-05, "loss": 1.517, "step": 12 }, { "epoch": 0.06396063960639606, "grad_norm": 6.138183116912842, "learning_rate": 9.6e-05, "loss": 0.3904, "step": 13 }, { "epoch": 0.06888068880688807, "grad_norm": 11.106021881103516, "learning_rate": 0.00010400000000000001, "loss": 0.4124, "step": 14 }, { "epoch": 0.07380073800738007, "grad_norm": 12.445630073547363, "learning_rate": 0.00011200000000000001, "loss": 0.4418, "step": 15 }, { "epoch": 0.07872078720787208, "grad_norm": 7.307021141052246, "learning_rate": 0.00012, "loss": 0.4751, "step": 16 }, { "epoch": 0.08364083640836409, "grad_norm": 19.08457374572754, "learning_rate": 0.00012800000000000002, "loss": 0.5671, "step": 17 }, { "epoch": 0.08856088560885608, "grad_norm": 11.03348159790039, "learning_rate": 0.00013600000000000003, "loss": 0.4441, "step": 18 }, { "epoch": 0.09348093480934809, "grad_norm": 155.23716735839844, "learning_rate": 0.000144, "loss": 0.5707, "step": 19 }, { "epoch": 0.0984009840098401, "grad_norm": 7.583343029022217, "learning_rate": 0.000152, "loss": 0.4662, "step": 20 }, { "epoch": 0.1033210332103321, "grad_norm": 6.289183139801025, "learning_rate": 0.00016, "loss": 0.4997, "step": 21 }, { "epoch": 0.10824108241082411, "grad_norm": 24.76793098449707, "learning_rate": 0.000168, "loss": 0.491, "step": 22 }, { "epoch": 0.11316113161131611, "grad_norm": 17.512184143066406, "learning_rate": 0.00017600000000000002, "loss": 0.4472, "step": 23 }, { "epoch": 0.11808118081180811, "grad_norm": 16.47793960571289, "learning_rate": 0.00018400000000000003, "loss": 0.5235, "step": 24 }, { "epoch": 0.12300123001230012, "grad_norm": 8.312434196472168, "learning_rate": 0.000192, "loss": 0.4818, "step": 25 }, { "epoch": 0.12792127921279212, "grad_norm": 83.81122589111328, "learning_rate": 0.0002, "loss": 0.5259, "step": 26 }, { "epoch": 0.13284132841328414, "grad_norm": 8.704654693603516, "learning_rate": 0.000199999211292062, "loss": 0.4744, "step": 27 }, { "epoch": 0.13776137761377613, "grad_norm": 2.8565006256103516, "learning_rate": 0.00019999684518068916, "loss": 0.4066, "step": 28 }, { "epoch": 0.14268142681426815, "grad_norm": 5.916236877441406, "learning_rate": 0.00019999290170320485, "loss": 0.4154, "step": 29 }, { "epoch": 0.14760147601476015, "grad_norm": 5.697567462921143, "learning_rate": 0.00019998738092181421, "loss": 0.4639, "step": 30 }, { "epoch": 0.15252152521525214, "grad_norm": 1.615671157836914, "learning_rate": 0.00019998028292360286, "loss": 0.4108, "step": 31 }, { "epoch": 0.15744157441574416, "grad_norm": 11.121039390563965, "learning_rate": 0.00019997160782053578, "loss": 0.449, "step": 32 }, { "epoch": 0.16236162361623616, "grad_norm": 7.386440277099609, "learning_rate": 0.00019996135574945544, "loss": 0.4216, "step": 33 }, { "epoch": 0.16728167281672818, "grad_norm": 84.96491241455078, "learning_rate": 0.00019994952687207954, "loss": 0.5282, "step": 34 }, { "epoch": 0.17220172201722017, "grad_norm": 6.692220211029053, "learning_rate": 0.00019993612137499876, "loss": 0.5036, "step": 35 }, { "epoch": 0.17712177121771217, "grad_norm": 5.1402363777160645, "learning_rate": 0.00019992113946967353, "loss": 0.4041, "step": 36 }, { "epoch": 0.1820418204182042, "grad_norm": 3.2179603576660156, "learning_rate": 0.00019990458139243077, "loss": 0.398, "step": 37 }, { "epoch": 0.18696186961869618, "grad_norm": 5.34651517868042, "learning_rate": 0.00019988644740446022, "loss": 0.4233, "step": 38 }, { "epoch": 0.1918819188191882, "grad_norm": 4.085568428039551, "learning_rate": 0.00019986673779181033, "loss": 0.3935, "step": 39 }, { "epoch": 0.1968019680196802, "grad_norm": 1.318534255027771, "learning_rate": 0.0001998454528653836, "loss": 0.3458, "step": 40 }, { "epoch": 0.2017220172201722, "grad_norm": 3.834606409072876, "learning_rate": 0.0001998225929609319, "loss": 0.3819, "step": 41 }, { "epoch": 0.2066420664206642, "grad_norm": 4.840269088745117, "learning_rate": 0.00019979815843905097, "loss": 0.473, "step": 42 }, { "epoch": 0.2115621156211562, "grad_norm": 498.64990234375, "learning_rate": 0.0001997721496851748, "loss": 3.6745, "step": 43 }, { "epoch": 0.21648216482164823, "grad_norm": 4.956181526184082, "learning_rate": 0.00019974456710956964, "loss": 0.3385, "step": 44 }, { "epoch": 0.22140221402214022, "grad_norm": 6.580547332763672, "learning_rate": 0.00019971541114732741, "loss": 0.4277, "step": 45 }, { "epoch": 0.22632226322263221, "grad_norm": 37.05827713012695, "learning_rate": 0.0001996846822583589, "loss": 0.8045, "step": 46 }, { "epoch": 0.23124231242312424, "grad_norm": 5.152987480163574, "learning_rate": 0.00019965238092738643, "loss": 0.4173, "step": 47 }, { "epoch": 0.23616236162361623, "grad_norm": 277.78857421875, "learning_rate": 0.0001996185076639364, "loss": 0.8626, "step": 48 }, { "epoch": 0.24108241082410825, "grad_norm": 2.9399607181549072, "learning_rate": 0.00019958306300233098, "loss": 0.3167, "step": 49 }, { "epoch": 0.24600246002460024, "grad_norm": 3.668168306350708, "learning_rate": 0.00019954604750167993, "loss": 0.4422, "step": 50 }, { "epoch": 0.25092250922509224, "grad_norm": 4.103700637817383, "learning_rate": 0.00019950746174587163, "loss": 0.3683, "step": 51 }, { "epoch": 0.25584255842558423, "grad_norm": 0.7908763885498047, "learning_rate": 0.0001994673063435639, "loss": 0.3834, "step": 52 }, { "epoch": 0.2607626076260763, "grad_norm": 1.0205233097076416, "learning_rate": 0.0001994255819281744, "loss": 0.375, "step": 53 }, { "epoch": 0.2656826568265683, "grad_norm": 3.6355137825012207, "learning_rate": 0.0001993822891578708, "loss": 0.4557, "step": 54 }, { "epoch": 0.27060270602706027, "grad_norm": 6.3725409507751465, "learning_rate": 0.00019933742871556, "loss": 0.4183, "step": 55 }, { "epoch": 0.27552275522755226, "grad_norm": 6.519746780395508, "learning_rate": 0.00019929100130887782, "loss": 0.4416, "step": 56 }, { "epoch": 0.28044280442804426, "grad_norm": 3.950495719909668, "learning_rate": 0.0001992430076701775, "loss": 0.4255, "step": 57 }, { "epoch": 0.2853628536285363, "grad_norm": 2.0773677825927734, "learning_rate": 0.00019919344855651833, "loss": 0.3393, "step": 58 }, { "epoch": 0.2902829028290283, "grad_norm": 8.755096435546875, "learning_rate": 0.00019914232474965365, "loss": 0.5312, "step": 59 }, { "epoch": 0.2952029520295203, "grad_norm": 4.103138446807861, "learning_rate": 0.00019908963705601846, "loss": 0.4104, "step": 60 }, { "epoch": 0.3001230012300123, "grad_norm": 3.1862292289733887, "learning_rate": 0.0001990353863067169, "loss": 0.4211, "step": 61 }, { "epoch": 0.3050430504305043, "grad_norm": 3.5777299404144287, "learning_rate": 0.00019897957335750878, "loss": 0.38, "step": 62 }, { "epoch": 0.30996309963099633, "grad_norm": 3.0276339054107666, "learning_rate": 0.00019892219908879653, "loss": 0.4561, "step": 63 }, { "epoch": 0.3148831488314883, "grad_norm": 1.853022575378418, "learning_rate": 0.00019886326440561093, "loss": 0.3874, "step": 64 }, { "epoch": 0.3198031980319803, "grad_norm": 5.521330833435059, "learning_rate": 0.00019880277023759702, "loss": 0.459, "step": 65 }, { "epoch": 0.3247232472324723, "grad_norm": 8.374741554260254, "learning_rate": 0.0001987407175389994, "loss": 0.4025, "step": 66 }, { "epoch": 0.3296432964329643, "grad_norm": 3.8265085220336914, "learning_rate": 0.0001986771072886472, "loss": 0.4654, "step": 67 }, { "epoch": 0.33456334563345635, "grad_norm": 2.002042770385742, "learning_rate": 0.00019861194048993863, "loss": 0.312, "step": 68 }, { "epoch": 0.33948339483394835, "grad_norm": 6.2479634284973145, "learning_rate": 0.0001985452181708251, "loss": 0.4739, "step": 69 }, { "epoch": 0.34440344403444034, "grad_norm": 3.072579860687256, "learning_rate": 0.00019847694138379506, "loss": 0.4282, "step": 70 }, { "epoch": 0.34932349323493234, "grad_norm": 1.4464001655578613, "learning_rate": 0.0001984071112058574, "loss": 0.4165, "step": 71 }, { "epoch": 0.35424354243542433, "grad_norm": 1.2664532661437988, "learning_rate": 0.00019833572873852444, "loss": 0.4575, "step": 72 }, { "epoch": 0.3591635916359164, "grad_norm": 9.184704780578613, "learning_rate": 0.00019826279510779454, "loss": 0.4957, "step": 73 }, { "epoch": 0.3640836408364084, "grad_norm": 8.479774475097656, "learning_rate": 0.00019818831146413434, "loss": 0.5062, "step": 74 }, { "epoch": 0.36900369003690037, "grad_norm": 6.585694789886475, "learning_rate": 0.0001981122789824607, "loss": 0.4461, "step": 75 }, { "epoch": 0.37392373923739236, "grad_norm": 2.46947979927063, "learning_rate": 0.0001980346988621221, "loss": 0.4175, "step": 76 }, { "epoch": 0.37884378843788435, "grad_norm": 5.467379570007324, "learning_rate": 0.00019795557232687956, "loss": 0.4634, "step": 77 }, { "epoch": 0.3837638376383764, "grad_norm": 3.7511723041534424, "learning_rate": 0.0001978749006248877, "loss": 0.466, "step": 78 }, { "epoch": 0.3886838868388684, "grad_norm": 4.01120138168335, "learning_rate": 0.00019779268502867473, "loss": 0.5087, "step": 79 }, { "epoch": 0.3936039360393604, "grad_norm": 3.0289227962493896, "learning_rate": 0.0001977089268351225, "loss": 0.4315, "step": 80 }, { "epoch": 0.3985239852398524, "grad_norm": 2.6868069171905518, "learning_rate": 0.00019762362736544607, "loss": 0.3795, "step": 81 }, { "epoch": 0.4034440344403444, "grad_norm": 3.252519130706787, "learning_rate": 0.00019753678796517282, "loss": 0.3436, "step": 82 }, { "epoch": 0.40836408364083643, "grad_norm": 3.5098648071289062, "learning_rate": 0.00019744841000412123, "loss": 0.3921, "step": 83 }, { "epoch": 0.4132841328413284, "grad_norm": 4.654256820678711, "learning_rate": 0.00019735849487637929, "loss": 0.4444, "step": 84 }, { "epoch": 0.4182041820418204, "grad_norm": 1.7671858072280884, "learning_rate": 0.0001972670440002825, "loss": 0.3749, "step": 85 }, { "epoch": 0.4231242312423124, "grad_norm": 2.725391387939453, "learning_rate": 0.00019717405881839145, "loss": 0.4183, "step": 86 }, { "epoch": 0.4280442804428044, "grad_norm": 1.985857605934143, "learning_rate": 0.00019707954079746927, "loss": 0.3992, "step": 87 }, { "epoch": 0.43296432964329645, "grad_norm": 4.41717004776001, "learning_rate": 0.00019698349142845814, "loss": 0.4746, "step": 88 }, { "epoch": 0.43788437884378845, "grad_norm": 4.98541784286499, "learning_rate": 0.00019688591222645607, "loss": 0.3828, "step": 89 }, { "epoch": 0.44280442804428044, "grad_norm": 5.29671573638916, "learning_rate": 0.00019678680473069293, "loss": 0.3513, "step": 90 }, { "epoch": 0.44772447724477243, "grad_norm": 2.3669795989990234, "learning_rate": 0.00019668617050450603, "loss": 0.3433, "step": 91 }, { "epoch": 0.45264452644526443, "grad_norm": 3.3942222595214844, "learning_rate": 0.00019658401113531565, "loss": 0.4033, "step": 92 }, { "epoch": 0.4575645756457565, "grad_norm": 2.1776537895202637, "learning_rate": 0.00019648032823459994, "loss": 0.2924, "step": 93 }, { "epoch": 0.46248462484624847, "grad_norm": 3.5817902088165283, "learning_rate": 0.00019637512343786937, "loss": 0.3886, "step": 94 }, { "epoch": 0.46740467404674046, "grad_norm": 5.881927490234375, "learning_rate": 0.00019626839840464119, "loss": 0.5516, "step": 95 }, { "epoch": 0.47232472324723246, "grad_norm": 3.89084529876709, "learning_rate": 0.0001961601548184129, "loss": 0.5291, "step": 96 }, { "epoch": 0.47724477244772445, "grad_norm": 1.7908971309661865, "learning_rate": 0.00019605039438663614, "loss": 0.4671, "step": 97 }, { "epoch": 0.4821648216482165, "grad_norm": 3.8980045318603516, "learning_rate": 0.0001959391188406893, "loss": 0.4105, "step": 98 }, { "epoch": 0.4870848708487085, "grad_norm": 5.375885486602783, "learning_rate": 0.00019582632993585052, "loss": 0.4652, "step": 99 }, { "epoch": 0.4920049200492005, "grad_norm": 4.329046249389648, "learning_rate": 0.00019571202945126994, "loss": 0.4507, "step": 100 }, { "epoch": 0.4969249692496925, "grad_norm": 3.3222126960754395, "learning_rate": 0.0001955962191899415, "loss": 0.43, "step": 101 }, { "epoch": 0.5018450184501845, "grad_norm": 5.076950550079346, "learning_rate": 0.00019547890097867468, "loss": 0.3756, "step": 102 }, { "epoch": 0.5067650676506765, "grad_norm": 4.1895294189453125, "learning_rate": 0.00019536007666806556, "loss": 0.4545, "step": 103 }, { "epoch": 0.5116851168511685, "grad_norm": 2.738429069519043, "learning_rate": 0.00019523974813246767, "loss": 0.3971, "step": 104 }, { "epoch": 0.5166051660516605, "grad_norm": 1.7775121927261353, "learning_rate": 0.00019511791726996243, "loss": 0.3815, "step": 105 }, { "epoch": 0.5215252152521526, "grad_norm": 0.5655261874198914, "learning_rate": 0.0001949945860023292, "loss": 0.308, "step": 106 }, { "epoch": 0.5264452644526445, "grad_norm": 2.861567258834839, "learning_rate": 0.00019486975627501502, "loss": 0.349, "step": 107 }, { "epoch": 0.5313653136531366, "grad_norm": 0.9508899450302124, "learning_rate": 0.0001947434300571038, "loss": 0.3095, "step": 108 }, { "epoch": 0.5362853628536285, "grad_norm": 4.669578552246094, "learning_rate": 0.00019461560934128533, "loss": 0.5094, "step": 109 }, { "epoch": 0.5412054120541205, "grad_norm": 0.9468059539794922, "learning_rate": 0.0001944862961438239, "loss": 0.2996, "step": 110 }, { "epoch": 0.5461254612546126, "grad_norm": 3.7661190032958984, "learning_rate": 0.00019435549250452645, "loss": 0.3556, "step": 111 }, { "epoch": 0.5510455104551045, "grad_norm": 3.7507622241973877, "learning_rate": 0.0001942232004867103, "loss": 0.4196, "step": 112 }, { "epoch": 0.5559655596555966, "grad_norm": 1.455446481704712, "learning_rate": 0.0001940894221771708, "loss": 0.3751, "step": 113 }, { "epoch": 0.5608856088560885, "grad_norm": 5.634677886962891, "learning_rate": 0.00019395415968614813, "loss": 0.4659, "step": 114 }, { "epoch": 0.5658056580565806, "grad_norm": 1.9559741020202637, "learning_rate": 0.00019381741514729443, "loss": 0.4113, "step": 115 }, { "epoch": 0.5707257072570726, "grad_norm": 2.967988967895508, "learning_rate": 0.0001936791907176397, "loss": 0.4565, "step": 116 }, { "epoch": 0.5756457564575646, "grad_norm": 6.420986175537109, "learning_rate": 0.00019353948857755803, "loss": 0.4465, "step": 117 }, { "epoch": 0.5805658056580566, "grad_norm": 5.503588676452637, "learning_rate": 0.00019339831093073318, "loss": 0.4705, "step": 118 }, { "epoch": 0.5854858548585485, "grad_norm": 5.966702461242676, "learning_rate": 0.00019325566000412376, "loss": 0.4205, "step": 119 }, { "epoch": 0.5904059040590406, "grad_norm": 2.865349054336548, "learning_rate": 0.0001931115380479281, "loss": 0.3988, "step": 120 }, { "epoch": 0.5953259532595326, "grad_norm": 1.7353636026382446, "learning_rate": 0.00019296594733554892, "loss": 0.4364, "step": 121 }, { "epoch": 0.6002460024600246, "grad_norm": 3.236239194869995, "learning_rate": 0.0001928188901635571, "loss": 0.4553, "step": 122 }, { "epoch": 0.6051660516605166, "grad_norm": 2.1501266956329346, "learning_rate": 0.00019267036885165588, "loss": 0.4121, "step": 123 }, { "epoch": 0.6100861008610086, "grad_norm": 2.416868209838867, "learning_rate": 0.00019252038574264405, "loss": 0.3616, "step": 124 }, { "epoch": 0.6150061500615006, "grad_norm": 2.7004334926605225, "learning_rate": 0.00019236894320237894, "loss": 0.3443, "step": 125 }, { "epoch": 0.6199261992619927, "grad_norm": 1.992761492729187, "learning_rate": 0.00019221604361973919, "loss": 0.4068, "step": 126 }, { "epoch": 0.6248462484624846, "grad_norm": 0.9761249423027039, "learning_rate": 0.00019206168940658712, "loss": 0.3952, "step": 127 }, { "epoch": 0.6297662976629766, "grad_norm": 1.5893077850341797, "learning_rate": 0.00019190588299773062, "loss": 0.383, "step": 128 }, { "epoch": 0.6346863468634686, "grad_norm": 1.1404695510864258, "learning_rate": 0.00019174862685088472, "loss": 0.4274, "step": 129 }, { "epoch": 0.6396063960639606, "grad_norm": 1.63871431350708, "learning_rate": 0.0001915899234466328, "loss": 0.3883, "step": 130 }, { "epoch": 0.6445264452644527, "grad_norm": 1.9504516124725342, "learning_rate": 0.00019142977528838762, "loss": 0.3932, "step": 131 }, { "epoch": 0.6494464944649446, "grad_norm": 1.6097129583358765, "learning_rate": 0.0001912681849023516, "loss": 0.4028, "step": 132 }, { "epoch": 0.6543665436654367, "grad_norm": 2.071103572845459, "learning_rate": 0.00019110515483747716, "loss": 0.4016, "step": 133 }, { "epoch": 0.6592865928659286, "grad_norm": 2.177647352218628, "learning_rate": 0.0001909406876654264, "loss": 0.3975, "step": 134 }, { "epoch": 0.6642066420664207, "grad_norm": 1.2018887996673584, "learning_rate": 0.00019077478598053063, "loss": 0.3846, "step": 135 }, { "epoch": 0.6691266912669127, "grad_norm": 3.688076972961426, "learning_rate": 0.00019060745239974936, "loss": 0.4432, "step": 136 }, { "epoch": 0.6740467404674046, "grad_norm": 2.5613861083984375, "learning_rate": 0.0001904386895626291, "loss": 0.3704, "step": 137 }, { "epoch": 0.6789667896678967, "grad_norm": 2.255295753479004, "learning_rate": 0.00019026850013126157, "loss": 0.3267, "step": 138 }, { "epoch": 0.6838868388683886, "grad_norm": 3.4777557849884033, "learning_rate": 0.0001900968867902419, "loss": 0.5534, "step": 139 }, { "epoch": 0.6888068880688807, "grad_norm": 2.0409767627716064, "learning_rate": 0.00018992385224662623, "loss": 0.4607, "step": 140 }, { "epoch": 0.6937269372693727, "grad_norm": 2.408515691757202, "learning_rate": 0.00018974939922988883, "loss": 0.3969, "step": 141 }, { "epoch": 0.6986469864698647, "grad_norm": 1.8554408550262451, "learning_rate": 0.00018957353049187936, "loss": 0.3385, "step": 142 }, { "epoch": 0.7035670356703567, "grad_norm": 3.487424612045288, "learning_rate": 0.00018939624880677918, "loss": 0.4132, "step": 143 }, { "epoch": 0.7084870848708487, "grad_norm": 3.606100559234619, "learning_rate": 0.0001892175569710577, "loss": 0.4644, "step": 144 }, { "epoch": 0.7134071340713407, "grad_norm": 3.1930618286132812, "learning_rate": 0.00018903745780342839, "loss": 0.4235, "step": 145 }, { "epoch": 0.7183271832718328, "grad_norm": 3.3350257873535156, "learning_rate": 0.00018885595414480405, "loss": 0.4837, "step": 146 }, { "epoch": 0.7232472324723247, "grad_norm": 2.0124611854553223, "learning_rate": 0.0001886730488582522, "loss": 0.4149, "step": 147 }, { "epoch": 0.7281672816728167, "grad_norm": 3.699632167816162, "learning_rate": 0.00018848874482894993, "loss": 0.376, "step": 148 }, { "epoch": 0.7330873308733087, "grad_norm": 2.4049108028411865, "learning_rate": 0.00018830304496413822, "loss": 0.4215, "step": 149 }, { "epoch": 0.7380073800738007, "grad_norm": 2.440385341644287, "learning_rate": 0.00018811595219307622, "loss": 0.4041, "step": 150 }, { "epoch": 0.7429274292742928, "grad_norm": 2.6796436309814453, "learning_rate": 0.000187927469466995, "loss": 0.3949, "step": 151 }, { "epoch": 0.7478474784747847, "grad_norm": 1.940114974975586, "learning_rate": 0.00018773759975905098, "loss": 0.4411, "step": 152 }, { "epoch": 0.7527675276752768, "grad_norm": 3.338021755218506, "learning_rate": 0.00018754634606427914, "loss": 0.4607, "step": 153 }, { "epoch": 0.7576875768757687, "grad_norm": 2.3407375812530518, "learning_rate": 0.00018735371139954558, "loss": 0.416, "step": 154 }, { "epoch": 0.7626076260762608, "grad_norm": 1.1078053712844849, "learning_rate": 0.0001871596988035001, "loss": 0.34, "step": 155 }, { "epoch": 0.7675276752767528, "grad_norm": 1.750227928161621, "learning_rate": 0.00018696431133652817, "loss": 0.3084, "step": 156 }, { "epoch": 0.7724477244772447, "grad_norm": 2.9180145263671875, "learning_rate": 0.00018676755208070275, "loss": 0.4109, "step": 157 }, { "epoch": 0.7773677736777368, "grad_norm": 2.675165891647339, "learning_rate": 0.00018656942413973555, "loss": 0.4438, "step": 158 }, { "epoch": 0.7822878228782287, "grad_norm": 3.3854095935821533, "learning_rate": 0.0001863699306389282, "loss": 0.4418, "step": 159 }, { "epoch": 0.7872078720787208, "grad_norm": 0.5620162487030029, "learning_rate": 0.0001861690747251228, "loss": 0.3806, "step": 160 }, { "epoch": 0.7921279212792128, "grad_norm": 1.223493218421936, "learning_rate": 0.00018596685956665245, "loss": 0.3758, "step": 161 }, { "epoch": 0.7970479704797048, "grad_norm": 1.9586799144744873, "learning_rate": 0.00018576328835329117, "loss": 0.3354, "step": 162 }, { "epoch": 0.8019680196801968, "grad_norm": 3.2813546657562256, "learning_rate": 0.00018555836429620358, "loss": 0.4063, "step": 163 }, { "epoch": 0.8068880688806888, "grad_norm": 2.182837963104248, "learning_rate": 0.00018535209062789433, "loss": 0.3697, "step": 164 }, { "epoch": 0.8118081180811808, "grad_norm": 1.3659495115280151, "learning_rate": 0.00018514447060215698, "loss": 0.3351, "step": 165 }, { "epoch": 0.8167281672816729, "grad_norm": 1.170257329940796, "learning_rate": 0.00018493550749402278, "loss": 0.3225, "step": 166 }, { "epoch": 0.8216482164821648, "grad_norm": 4.230517387390137, "learning_rate": 0.00018472520459970898, "loss": 0.4448, "step": 167 }, { "epoch": 0.8265682656826568, "grad_norm": 2.8111300468444824, "learning_rate": 0.0001845135652365668, "loss": 0.3761, "step": 168 }, { "epoch": 0.8314883148831488, "grad_norm": 5.860655307769775, "learning_rate": 0.00018430059274302917, "loss": 0.4974, "step": 169 }, { "epoch": 0.8364083640836408, "grad_norm": 3.6116364002227783, "learning_rate": 0.00018408629047855804, "loss": 0.4327, "step": 170 }, { "epoch": 0.8413284132841329, "grad_norm": 2.6450071334838867, "learning_rate": 0.00018387066182359133, "loss": 0.3813, "step": 171 }, { "epoch": 0.8462484624846248, "grad_norm": 2.9791674613952637, "learning_rate": 0.00018365371017948964, "loss": 0.4184, "step": 172 }, { "epoch": 0.8511685116851169, "grad_norm": 1.7529772520065308, "learning_rate": 0.00018343543896848273, "loss": 0.3489, "step": 173 }, { "epoch": 0.8560885608856088, "grad_norm": 3.5216493606567383, "learning_rate": 0.00018321585163361527, "loss": 0.3988, "step": 174 }, { "epoch": 0.8610086100861009, "grad_norm": 2.470106840133667, "learning_rate": 0.00018299495163869275, "loss": 0.3919, "step": 175 }, { "epoch": 0.8659286592865929, "grad_norm": 3.1759798526763916, "learning_rate": 0.0001827727424682268, "loss": 0.3853, "step": 176 }, { "epoch": 0.8708487084870848, "grad_norm": 3.581413745880127, "learning_rate": 0.00018254922762738008, "loss": 0.4041, "step": 177 }, { "epoch": 0.8757687576875769, "grad_norm": 1.35221266746521, "learning_rate": 0.00018232441064191125, "loss": 0.3564, "step": 178 }, { "epoch": 0.8806888068880688, "grad_norm": 2.2829418182373047, "learning_rate": 0.0001820982950581191, "loss": 0.443, "step": 179 }, { "epoch": 0.8856088560885609, "grad_norm": 2.931074619293213, "learning_rate": 0.00018187088444278674, "loss": 0.4088, "step": 180 }, { "epoch": 0.8905289052890529, "grad_norm": 3.7436723709106445, "learning_rate": 0.00018164218238312535, "loss": 0.4888, "step": 181 }, { "epoch": 0.8954489544895449, "grad_norm": 2.7169697284698486, "learning_rate": 0.00018141219248671745, "loss": 0.4432, "step": 182 }, { "epoch": 0.9003690036900369, "grad_norm": 1.1118288040161133, "learning_rate": 0.00018118091838146029, "loss": 0.3677, "step": 183 }, { "epoch": 0.9052890528905289, "grad_norm": 3.0052273273468018, "learning_rate": 0.00018094836371550824, "loss": 0.3169, "step": 184 }, { "epoch": 0.9102091020910209, "grad_norm": 2.911255121231079, "learning_rate": 0.00018071453215721554, "loss": 0.4721, "step": 185 }, { "epoch": 0.915129151291513, "grad_norm": 2.483900547027588, "learning_rate": 0.00018047942739507836, "loss": 0.3812, "step": 186 }, { "epoch": 0.9200492004920049, "grad_norm": 1.8842488527297974, "learning_rate": 0.00018024305313767646, "loss": 0.436, "step": 187 }, { "epoch": 0.9249692496924969, "grad_norm": 1.5961415767669678, "learning_rate": 0.000180005413113615, "loss": 0.3808, "step": 188 }, { "epoch": 0.9298892988929889, "grad_norm": 2.0334715843200684, "learning_rate": 0.00017976651107146533, "loss": 0.4548, "step": 189 }, { "epoch": 0.9348093480934809, "grad_norm": 1.6422673463821411, "learning_rate": 0.0001795263507797063, "loss": 0.406, "step": 190 }, { "epoch": 0.939729397293973, "grad_norm": 2.5241055488586426, "learning_rate": 0.00017928493602666445, "loss": 0.3661, "step": 191 }, { "epoch": 0.9446494464944649, "grad_norm": 2.3822920322418213, "learning_rate": 0.00017904227062045437, "loss": 0.4581, "step": 192 }, { "epoch": 0.949569495694957, "grad_norm": 3.649919271469116, "learning_rate": 0.00017879835838891875, "loss": 0.4743, "step": 193 }, { "epoch": 0.9544895448954489, "grad_norm": 1.9197454452514648, "learning_rate": 0.00017855320317956784, "loss": 0.3857, "step": 194 }, { "epoch": 0.959409594095941, "grad_norm": 1.4304083585739136, "learning_rate": 0.00017830680885951887, "loss": 0.3935, "step": 195 }, { "epoch": 0.964329643296433, "grad_norm": 0.4576971232891083, "learning_rate": 0.00017805917931543492, "loss": 0.4147, "step": 196 }, { "epoch": 0.9692496924969249, "grad_norm": 1.4386779069900513, "learning_rate": 0.00017781031845346375, "loss": 0.3927, "step": 197 }, { "epoch": 0.974169741697417, "grad_norm": 1.496974229812622, "learning_rate": 0.00017756023019917607, "loss": 0.3666, "step": 198 }, { "epoch": 0.9790897908979089, "grad_norm": 1.221921682357788, "learning_rate": 0.00017730891849750377, "loss": 0.3938, "step": 199 }, { "epoch": 0.984009840098401, "grad_norm": 1.7949525117874146, "learning_rate": 0.0001770563873126775, "loss": 0.4118, "step": 200 }, { "epoch": 0.988929889298893, "grad_norm": 1.1061089038848877, "learning_rate": 0.0001768026406281642, "loss": 0.4086, "step": 201 }, { "epoch": 0.993849938499385, "grad_norm": 3.217977523803711, "learning_rate": 0.00017654768244660448, "loss": 0.4018, "step": 202 }, { "epoch": 0.998769987699877, "grad_norm": 0.9173564314842224, "learning_rate": 0.00017629151678974907, "loss": 0.3952, "step": 203 }, { "epoch": 1.0, "grad_norm": 5.783997058868408, "learning_rate": 0.00017603414769839577, "loss": 0.375, "step": 204 }, { "epoch": 1.004920049200492, "grad_norm": 3.309582471847534, "learning_rate": 0.00017577557923232546, "loss": 0.4257, "step": 205 }, { "epoch": 1.009840098400984, "grad_norm": 1.2689415216445923, "learning_rate": 0.00017551581547023819, "loss": 0.4078, "step": 206 }, { "epoch": 1.014760147601476, "grad_norm": 1.5618160963058472, "learning_rate": 0.00017525486050968875, "loss": 0.3948, "step": 207 }, { "epoch": 1.019680196801968, "grad_norm": 2.377791166305542, "learning_rate": 0.00017499271846702213, "loss": 0.3407, "step": 208 }, { "epoch": 1.0246002460024601, "grad_norm": 1.7102715969085693, "learning_rate": 0.00017472939347730856, "loss": 0.3997, "step": 209 }, { "epoch": 1.029520295202952, "grad_norm": 1.2720469236373901, "learning_rate": 0.0001744648896942782, "loss": 0.339, "step": 210 }, { "epoch": 1.034440344403444, "grad_norm": 2.7468247413635254, "learning_rate": 0.00017419921129025576, "loss": 0.3818, "step": 211 }, { "epoch": 1.039360393603936, "grad_norm": 3.501011371612549, "learning_rate": 0.0001739323624560945, "loss": 0.451, "step": 212 }, { "epoch": 1.044280442804428, "grad_norm": 1.6358418464660645, "learning_rate": 0.00017366434740111037, "loss": 0.3493, "step": 213 }, { "epoch": 1.04920049200492, "grad_norm": 3.540642023086548, "learning_rate": 0.00017339517035301532, "loss": 0.455, "step": 214 }, { "epoch": 1.054120541205412, "grad_norm": 2.093965530395508, "learning_rate": 0.00017312483555785086, "loss": 0.3673, "step": 215 }, { "epoch": 1.0590405904059041, "grad_norm": 4.747845649719238, "learning_rate": 0.000172853347279921, "loss": 0.5216, "step": 216 }, { "epoch": 1.063960639606396, "grad_norm": 2.5414655208587646, "learning_rate": 0.00017258070980172494, "loss": 0.4571, "step": 217 }, { "epoch": 1.068880688806888, "grad_norm": 1.3232766389846802, "learning_rate": 0.0001723069274238895, "loss": 0.4014, "step": 218 }, { "epoch": 1.07380073800738, "grad_norm": 2.045196771621704, "learning_rate": 0.0001720320044651014, "loss": 0.4119, "step": 219 }, { "epoch": 1.0787207872078721, "grad_norm": 5.2392096519470215, "learning_rate": 0.00017175594526203905, "loss": 0.3691, "step": 220 }, { "epoch": 1.083640836408364, "grad_norm": 2.589878797531128, "learning_rate": 0.00017147875416930416, "loss": 0.4317, "step": 221 }, { "epoch": 1.088560885608856, "grad_norm": 1.5000386238098145, "learning_rate": 0.00017120043555935298, "loss": 0.4135, "step": 222 }, { "epoch": 1.0934809348093482, "grad_norm": 0.8919417262077332, "learning_rate": 0.00017092099382242748, "loss": 0.4183, "step": 223 }, { "epoch": 1.09840098400984, "grad_norm": 1.059650182723999, "learning_rate": 0.00017064043336648599, "loss": 0.3791, "step": 224 }, { "epoch": 1.103321033210332, "grad_norm": 1.8085955381393433, "learning_rate": 0.0001703587586171337, "loss": 0.3893, "step": 225 }, { "epoch": 1.1082410824108242, "grad_norm": 2.2094881534576416, "learning_rate": 0.00017007597401755276, "loss": 0.3871, "step": 226 }, { "epoch": 1.1131611316113161, "grad_norm": 3.2818965911865234, "learning_rate": 0.00016979208402843237, "loss": 0.4412, "step": 227 }, { "epoch": 1.118081180811808, "grad_norm": 1.4197732210159302, "learning_rate": 0.00016950709312789833, "loss": 0.3248, "step": 228 }, { "epoch": 1.1230012300123002, "grad_norm": 3.690911054611206, "learning_rate": 0.00016922100581144228, "loss": 0.4552, "step": 229 }, { "epoch": 1.1279212792127922, "grad_norm": 4.1127424240112305, "learning_rate": 0.00016893382659185105, "loss": 0.4887, "step": 230 }, { "epoch": 1.132841328413284, "grad_norm": 2.658750295639038, "learning_rate": 0.00016864555999913518, "loss": 0.4037, "step": 231 }, { "epoch": 1.137761377613776, "grad_norm": 1.4189069271087646, "learning_rate": 0.0001683562105804577, "loss": 0.3705, "step": 232 }, { "epoch": 1.1426814268142682, "grad_norm": 2.599860191345215, "learning_rate": 0.00016806578290006225, "loss": 0.3975, "step": 233 }, { "epoch": 1.1476014760147601, "grad_norm": 3.787053108215332, "learning_rate": 0.0001677742815392012, "loss": 0.4294, "step": 234 }, { "epoch": 1.152521525215252, "grad_norm": 3.4738941192626953, "learning_rate": 0.00016748171109606328, "loss": 0.3847, "step": 235 }, { "epoch": 1.1574415744157442, "grad_norm": 2.0890064239501953, "learning_rate": 0.00016718807618570106, "loss": 0.4156, "step": 236 }, { "epoch": 1.1623616236162362, "grad_norm": 2.275296211242676, "learning_rate": 0.00016689338143995833, "loss": 0.4598, "step": 237 }, { "epoch": 1.1672816728167281, "grad_norm": 0.8225153088569641, "learning_rate": 0.00016659763150739677, "loss": 0.3495, "step": 238 }, { "epoch": 1.17220172201722, "grad_norm": 0.9762566685676575, "learning_rate": 0.00016630083105322266, "loss": 0.3705, "step": 239 }, { "epoch": 1.1771217712177122, "grad_norm": 3.3081791400909424, "learning_rate": 0.00016600298475921365, "loss": 0.4167, "step": 240 }, { "epoch": 1.1820418204182042, "grad_norm": 4.026612281799316, "learning_rate": 0.00016570409732364437, "loss": 0.4859, "step": 241 }, { "epoch": 1.186961869618696, "grad_norm": 2.193952798843384, "learning_rate": 0.0001654041734612127, "loss": 0.4207, "step": 242 }, { "epoch": 1.1918819188191883, "grad_norm": 2.2682714462280273, "learning_rate": 0.00016510321790296525, "loss": 0.4344, "step": 243 }, { "epoch": 1.1968019680196802, "grad_norm": 3.3938522338867188, "learning_rate": 0.00016480123539622281, "loss": 0.4628, "step": 244 }, { "epoch": 1.2017220172201721, "grad_norm": 4.911561489105225, "learning_rate": 0.00016449823070450531, "loss": 0.3449, "step": 245 }, { "epoch": 1.2066420664206643, "grad_norm": 2.2653610706329346, "learning_rate": 0.00016419420860745699, "loss": 0.3965, "step": 246 }, { "epoch": 1.2115621156211562, "grad_norm": 1.626495361328125, "learning_rate": 0.00016388917390077054, "loss": 0.3818, "step": 247 }, { "epoch": 1.2164821648216482, "grad_norm": 2.9067697525024414, "learning_rate": 0.00016358313139611195, "loss": 0.4184, "step": 248 }, { "epoch": 1.2214022140221403, "grad_norm": 1.6488162279129028, "learning_rate": 0.0001632760859210442, "loss": 0.3561, "step": 249 }, { "epoch": 1.2263222632226323, "grad_norm": 1.5693081617355347, "learning_rate": 0.00016296804231895142, "loss": 0.404, "step": 250 }, { "epoch": 1.2312423124231242, "grad_norm": 2.674132823944092, "learning_rate": 0.00016265900544896225, "loss": 0.4402, "step": 251 }, { "epoch": 1.2361623616236161, "grad_norm": 1.432892918586731, "learning_rate": 0.00016234898018587337, "loss": 0.3073, "step": 252 }, { "epoch": 1.2410824108241083, "grad_norm": 1.0736567974090576, "learning_rate": 0.0001620379714200725, "loss": 0.3551, "step": 253 }, { "epoch": 1.2460024600246002, "grad_norm": 2.692246675491333, "learning_rate": 0.00016172598405746124, "loss": 0.4585, "step": 254 }, { "epoch": 1.2509225092250922, "grad_norm": 1.1363232135772705, "learning_rate": 0.00016141302301937786, "loss": 0.3566, "step": 255 }, { "epoch": 1.2558425584255843, "grad_norm": 2.9427497386932373, "learning_rate": 0.0001610990932425194, "loss": 0.4541, "step": 256 }, { "epoch": 1.2607626076260763, "grad_norm": 1.8412046432495117, "learning_rate": 0.00016078419967886402, "loss": 0.4018, "step": 257 }, { "epoch": 1.2656826568265682, "grad_norm": 1.6179234981536865, "learning_rate": 0.0001604683472955928, "loss": 0.4115, "step": 258 }, { "epoch": 1.2706027060270602, "grad_norm": 1.2234046459197998, "learning_rate": 0.00016015154107501133, "loss": 0.4339, "step": 259 }, { "epoch": 1.2755227552275523, "grad_norm": 1.4952470064163208, "learning_rate": 0.00015983378601447127, "loss": 0.4079, "step": 260 }, { "epoch": 1.2804428044280443, "grad_norm": 0.6399968266487122, "learning_rate": 0.0001595150871262914, "loss": 0.4262, "step": 261 }, { "epoch": 1.2853628536285364, "grad_norm": 2.328315258026123, "learning_rate": 0.00015919544943767856, "loss": 0.4236, "step": 262 }, { "epoch": 1.2902829028290284, "grad_norm": 2.3608176708221436, "learning_rate": 0.00015887487799064838, "loss": 0.3888, "step": 263 }, { "epoch": 1.2952029520295203, "grad_norm": 1.258406639099121, "learning_rate": 0.00015855337784194577, "loss": 0.405, "step": 264 }, { "epoch": 1.3001230012300122, "grad_norm": 1.4067128896713257, "learning_rate": 0.00015823095406296514, "loss": 0.426, "step": 265 }, { "epoch": 1.3050430504305042, "grad_norm": 2.433593988418579, "learning_rate": 0.00015790761173967036, "loss": 0.404, "step": 266 }, { "epoch": 1.3099630996309963, "grad_norm": 1.216810703277588, "learning_rate": 0.00015758335597251458, "loss": 0.3607, "step": 267 }, { "epoch": 1.3148831488314883, "grad_norm": 3.8141870498657227, "learning_rate": 0.00015725819187635968, "loss": 0.487, "step": 268 }, { "epoch": 1.3198031980319804, "grad_norm": 0.3127140700817108, "learning_rate": 0.00015693212458039584, "loss": 0.3555, "step": 269 }, { "epoch": 1.3247232472324724, "grad_norm": 3.0095064640045166, "learning_rate": 0.00015660515922806027, "loss": 0.4525, "step": 270 }, { "epoch": 1.3296432964329643, "grad_norm": 1.490798830986023, "learning_rate": 0.00015627730097695638, "loss": 0.3951, "step": 271 }, { "epoch": 1.3345633456334562, "grad_norm": 2.4473958015441895, "learning_rate": 0.0001559485549987723, "loss": 0.3184, "step": 272 }, { "epoch": 1.3394833948339484, "grad_norm": 1.3399827480316162, "learning_rate": 0.0001556189264791992, "loss": 0.401, "step": 273 }, { "epoch": 1.3444034440344403, "grad_norm": 1.9885616302490234, "learning_rate": 0.0001552884206178498, "loss": 0.4482, "step": 274 }, { "epoch": 1.3493234932349323, "grad_norm": 1.2298444509506226, "learning_rate": 0.00015495704262817597, "loss": 0.4295, "step": 275 }, { "epoch": 1.3542435424354244, "grad_norm": 1.3658753633499146, "learning_rate": 0.0001546247977373867, "loss": 0.3741, "step": 276 }, { "epoch": 1.3591635916359164, "grad_norm": 3.8412437438964844, "learning_rate": 0.00015429169118636566, "loss": 0.356, "step": 277 }, { "epoch": 1.3640836408364083, "grad_norm": 2.24770188331604, "learning_rate": 0.00015395772822958845, "loss": 0.3911, "step": 278 }, { "epoch": 1.3690036900369003, "grad_norm": 1.0389429330825806, "learning_rate": 0.00015362291413503984, "loss": 0.4239, "step": 279 }, { "epoch": 1.3739237392373924, "grad_norm": 2.6337220668792725, "learning_rate": 0.00015328725418413045, "loss": 0.3546, "step": 280 }, { "epoch": 1.3788437884378844, "grad_norm": 1.609165906906128, "learning_rate": 0.00015295075367161367, "loss": 0.4083, "step": 281 }, { "epoch": 1.3837638376383765, "grad_norm": 2.580286741256714, "learning_rate": 0.00015261341790550196, "loss": 0.3493, "step": 282 }, { "epoch": 1.3886838868388685, "grad_norm": 2.396101474761963, "learning_rate": 0.0001522752522069833, "loss": 0.4164, "step": 283 }, { "epoch": 1.3936039360393604, "grad_norm": 1.4685685634613037, "learning_rate": 0.00015193626191033712, "loss": 0.3765, "step": 284 }, { "epoch": 1.3985239852398523, "grad_norm": 2.472041368484497, "learning_rate": 0.0001515964523628501, "loss": 0.4015, "step": 285 }, { "epoch": 1.4034440344403443, "grad_norm": 2.8179895877838135, "learning_rate": 0.00015125582892473204, "loss": 0.4108, "step": 286 }, { "epoch": 1.4083640836408364, "grad_norm": 2.4579968452453613, "learning_rate": 0.00015091439696903115, "loss": 0.4333, "step": 287 }, { "epoch": 1.4132841328413284, "grad_norm": 2.46209716796875, "learning_rate": 0.00015057216188154928, "loss": 0.468, "step": 288 }, { "epoch": 1.4182041820418205, "grad_norm": 1.7040590047836304, "learning_rate": 0.00015022912906075702, "loss": 0.421, "step": 289 }, { "epoch": 1.4231242312423125, "grad_norm": 4.77639102935791, "learning_rate": 0.00014988530391770856, "loss": 0.4105, "step": 290 }, { "epoch": 1.4280442804428044, "grad_norm": 1.7009060382843018, "learning_rate": 0.00014954069187595633, "loss": 0.4034, "step": 291 }, { "epoch": 1.4329643296432963, "grad_norm": 1.7801786661148071, "learning_rate": 0.00014919529837146528, "loss": 0.3962, "step": 292 }, { "epoch": 1.4378843788437885, "grad_norm": 0.9307105541229248, "learning_rate": 0.0001488491288525275, "loss": 0.3895, "step": 293 }, { "epoch": 1.4428044280442804, "grad_norm": 1.3841267824172974, "learning_rate": 0.0001485021887796759, "loss": 0.3871, "step": 294 }, { "epoch": 1.4477244772447724, "grad_norm": 1.3614524602890015, "learning_rate": 0.00014815448362559826, "loss": 0.4055, "step": 295 }, { "epoch": 1.4526445264452645, "grad_norm": 3.51263165473938, "learning_rate": 0.00014780601887505088, "loss": 0.2708, "step": 296 }, { "epoch": 1.4575645756457565, "grad_norm": 2.4436159133911133, "learning_rate": 0.00014745680002477203, "loss": 0.388, "step": 297 }, { "epoch": 1.4624846248462484, "grad_norm": 0.775227427482605, "learning_rate": 0.00014710683258339536, "loss": 0.3506, "step": 298 }, { "epoch": 1.4674046740467404, "grad_norm": 1.1680070161819458, "learning_rate": 0.0001467561220713628, "loss": 0.3227, "step": 299 }, { "epoch": 1.4723247232472325, "grad_norm": 2.8542237281799316, "learning_rate": 0.0001464046740208377, "loss": 0.3588, "step": 300 }, { "epoch": 1.4772447724477245, "grad_norm": 2.2465827465057373, "learning_rate": 0.00014605249397561736, "loss": 0.4161, "step": 301 }, { "epoch": 1.4821648216482166, "grad_norm": 3.5913736820220947, "learning_rate": 0.00014569958749104575, "loss": 0.4758, "step": 302 }, { "epoch": 1.4870848708487086, "grad_norm": 0.5437675714492798, "learning_rate": 0.00014534596013392575, "loss": 0.3388, "step": 303 }, { "epoch": 1.4920049200492005, "grad_norm": 2.386204242706299, "learning_rate": 0.00014499161748243147, "loss": 0.4425, "step": 304 }, { "epoch": 1.4969249692496924, "grad_norm": 1.160514235496521, "learning_rate": 0.0001446365651260201, "loss": 0.3747, "step": 305 }, { "epoch": 1.5018450184501844, "grad_norm": 0.49445146322250366, "learning_rate": 0.00014428080866534396, "loss": 0.3707, "step": 306 }, { "epoch": 1.5067650676506765, "grad_norm": 1.3350694179534912, "learning_rate": 0.00014392435371216185, "loss": 0.3455, "step": 307 }, { "epoch": 1.5116851168511685, "grad_norm": 1.1016676425933838, "learning_rate": 0.0001435672058892509, "loss": 0.4095, "step": 308 }, { "epoch": 1.5166051660516606, "grad_norm": 2.0227558612823486, "learning_rate": 0.00014320937083031748, "loss": 0.3706, "step": 309 }, { "epoch": 1.5215252152521526, "grad_norm": 3.2734158039093018, "learning_rate": 0.0001428508541799086, "loss": 0.3384, "step": 310 }, { "epoch": 1.5264452644526445, "grad_norm": 1.967950701713562, "learning_rate": 0.0001424916615933229, "loss": 0.4321, "step": 311 }, { "epoch": 1.5313653136531364, "grad_norm": 1.8679777383804321, "learning_rate": 0.00014213179873652127, "loss": 0.3597, "step": 312 }, { "epoch": 1.5362853628536284, "grad_norm": 1.111864447593689, "learning_rate": 0.00014177127128603745, "loss": 0.3639, "step": 313 }, { "epoch": 1.5412054120541205, "grad_norm": 1.1539496183395386, "learning_rate": 0.0001414100849288888, "loss": 0.3735, "step": 314 }, { "epoch": 1.5461254612546127, "grad_norm": 2.5453989505767822, "learning_rate": 0.00014104824536248614, "loss": 0.4241, "step": 315 }, { "epoch": 1.5510455104551046, "grad_norm": 1.5490731000900269, "learning_rate": 0.00014068575829454436, "loss": 0.38, "step": 316 }, { "epoch": 1.5559655596555966, "grad_norm": 2.0369129180908203, "learning_rate": 0.00014032262944299194, "loss": 0.432, "step": 317 }, { "epoch": 1.5608856088560885, "grad_norm": 1.938671588897705, "learning_rate": 0.00013995886453588104, "loss": 0.4407, "step": 318 }, { "epoch": 1.5658056580565805, "grad_norm": 1.5802247524261475, "learning_rate": 0.00013959446931129704, "loss": 0.4174, "step": 319 }, { "epoch": 1.5707257072570726, "grad_norm": 1.7823857069015503, "learning_rate": 0.0001392294495172681, "loss": 0.3608, "step": 320 }, { "epoch": 1.5756457564575646, "grad_norm": 1.5793462991714478, "learning_rate": 0.0001388638109116744, "loss": 0.4049, "step": 321 }, { "epoch": 1.5805658056580567, "grad_norm": 2.478447437286377, "learning_rate": 0.00013849755926215735, "loss": 0.3822, "step": 322 }, { "epoch": 1.5854858548585486, "grad_norm": 3.0512235164642334, "learning_rate": 0.00013813070034602863, "loss": 0.3729, "step": 323 }, { "epoch": 1.5904059040590406, "grad_norm": 2.298110008239746, "learning_rate": 0.00013776323995017898, "loss": 0.3757, "step": 324 }, { "epoch": 1.5953259532595325, "grad_norm": 1.1335664987564087, "learning_rate": 0.00013739518387098705, "loss": 0.3436, "step": 325 }, { "epoch": 1.6002460024600245, "grad_norm": 1.9639568328857422, "learning_rate": 0.0001370265379142279, "loss": 0.4321, "step": 326 }, { "epoch": 1.6051660516605166, "grad_norm": 2.0375776290893555, "learning_rate": 0.0001366573078949813, "loss": 0.3924, "step": 327 }, { "epoch": 1.6100861008610086, "grad_norm": 2.925692558288574, "learning_rate": 0.00013628749963754026, "loss": 0.447, "step": 328 }, { "epoch": 1.6150061500615007, "grad_norm": 2.7469842433929443, "learning_rate": 0.0001359171189753189, "loss": 0.4045, "step": 329 }, { "epoch": 1.6199261992619927, "grad_norm": 1.8784551620483398, "learning_rate": 0.00013554617175076062, "loss": 0.398, "step": 330 }, { "epoch": 1.6248462484624846, "grad_norm": 3.961890459060669, "learning_rate": 0.0001351746638152458, "loss": 0.4837, "step": 331 }, { "epoch": 1.6297662976629765, "grad_norm": 1.2118688821792603, "learning_rate": 0.00013480260102899966, "loss": 0.3792, "step": 332 }, { "epoch": 1.6346863468634685, "grad_norm": 0.8447842597961426, "learning_rate": 0.0001344299892609996, "loss": 0.3939, "step": 333 }, { "epoch": 1.6396063960639606, "grad_norm": 0.8459701538085938, "learning_rate": 0.00013405683438888282, "loss": 0.4246, "step": 334 }, { "epoch": 1.6445264452644528, "grad_norm": 2.4549758434295654, "learning_rate": 0.00013368314229885347, "loss": 0.3642, "step": 335 }, { "epoch": 1.6494464944649447, "grad_norm": 3.813248872756958, "learning_rate": 0.00013330891888559002, "loss": 0.4009, "step": 336 }, { "epoch": 1.6543665436654367, "grad_norm": 3.946821689605713, "learning_rate": 0.00013293417005215188, "loss": 0.3961, "step": 337 }, { "epoch": 1.6592865928659286, "grad_norm": 1.7004120349884033, "learning_rate": 0.0001325589017098867, "loss": 0.4145, "step": 338 }, { "epoch": 1.6642066420664205, "grad_norm": 2.824493169784546, "learning_rate": 0.00013218311977833687, "loss": 0.403, "step": 339 }, { "epoch": 1.6691266912669127, "grad_norm": 2.5144972801208496, "learning_rate": 0.0001318068301851463, "loss": 0.4236, "step": 340 }, { "epoch": 1.6740467404674046, "grad_norm": 2.8602144718170166, "learning_rate": 0.00013143003886596669, "loss": 0.4267, "step": 341 }, { "epoch": 1.6789667896678968, "grad_norm": 1.5796253681182861, "learning_rate": 0.0001310527517643642, "loss": 0.4181, "step": 342 }, { "epoch": 1.6838868388683887, "grad_norm": 1.748310923576355, "learning_rate": 0.00013067497483172538, "loss": 0.3817, "step": 343 }, { "epoch": 1.6888068880688807, "grad_norm": 1.1796998977661133, "learning_rate": 0.00013029671402716366, "loss": 0.3891, "step": 344 }, { "epoch": 1.6937269372693726, "grad_norm": 0.8031755089759827, "learning_rate": 0.00012991797531742492, "loss": 0.3746, "step": 345 }, { "epoch": 1.6986469864698646, "grad_norm": 2.449571132659912, "learning_rate": 0.00012953876467679373, "loss": 0.3759, "step": 346 }, { "epoch": 1.7035670356703567, "grad_norm": 2.167459726333618, "learning_rate": 0.00012915908808699893, "loss": 0.4026, "step": 347 }, { "epoch": 1.7084870848708487, "grad_norm": 0.8750459551811218, "learning_rate": 0.00012877895153711935, "loss": 0.3612, "step": 348 }, { "epoch": 1.7134071340713408, "grad_norm": 0.9922705292701721, "learning_rate": 0.00012839836102348926, "loss": 0.3894, "step": 349 }, { "epoch": 1.7183271832718328, "grad_norm": 2.778425693511963, "learning_rate": 0.00012801732254960388, "loss": 0.5034, "step": 350 }, { "epoch": 1.7232472324723247, "grad_norm": 2.2978157997131348, "learning_rate": 0.00012763584212602453, "loss": 0.3919, "step": 351 }, { "epoch": 1.7281672816728166, "grad_norm": 0.7636315822601318, "learning_rate": 0.00012725392577028402, "loss": 0.3465, "step": 352 }, { "epoch": 1.7330873308733086, "grad_norm": 0.9501156210899353, "learning_rate": 0.0001268715795067916, "loss": 0.353, "step": 353 }, { "epoch": 1.7380073800738007, "grad_norm": 1.9191248416900635, "learning_rate": 0.00012648880936673787, "loss": 0.4535, "step": 354 }, { "epoch": 1.742927429274293, "grad_norm": 0.5128054618835449, "learning_rate": 0.00012610562138799978, "loss": 0.3761, "step": 355 }, { "epoch": 1.7478474784747848, "grad_norm": 1.427462100982666, "learning_rate": 0.00012572202161504543, "loss": 0.3871, "step": 356 }, { "epoch": 1.7527675276752768, "grad_norm": 0.44678959250450134, "learning_rate": 0.00012533801609883842, "loss": 0.3858, "step": 357 }, { "epoch": 1.7576875768757687, "grad_norm": 1.5311493873596191, "learning_rate": 0.00012495361089674285, "loss": 0.3638, "step": 358 }, { "epoch": 1.7626076260762606, "grad_norm": 1.7714836597442627, "learning_rate": 0.00012456881207242732, "loss": 0.324, "step": 359 }, { "epoch": 1.7675276752767528, "grad_norm": 0.6259622573852539, "learning_rate": 0.00012418362569576965, "loss": 0.3832, "step": 360 }, { "epoch": 1.7724477244772447, "grad_norm": 1.4297990798950195, "learning_rate": 0.00012379805784276082, "loss": 0.3464, "step": 361 }, { "epoch": 1.777367773677737, "grad_norm": 0.9419127702713013, "learning_rate": 0.0001234121145954094, "loss": 0.3605, "step": 362 }, { "epoch": 1.7822878228782288, "grad_norm": 1.7905269861221313, "learning_rate": 0.00012302580204164541, "loss": 0.4042, "step": 363 }, { "epoch": 1.7872078720787208, "grad_norm": 2.3646910190582275, "learning_rate": 0.0001226391262752245, "loss": 0.4208, "step": 364 }, { "epoch": 1.7921279212792127, "grad_norm": 1.26406991481781, "learning_rate": 0.00012225209339563145, "loss": 0.3653, "step": 365 }, { "epoch": 1.7970479704797047, "grad_norm": 2.371533155441284, "learning_rate": 0.00012186470950798445, "loss": 0.4039, "step": 366 }, { "epoch": 1.8019680196801968, "grad_norm": 3.1603784561157227, "learning_rate": 0.00012147698072293842, "loss": 0.4911, "step": 367 }, { "epoch": 1.8068880688806888, "grad_norm": 2.687168836593628, "learning_rate": 0.00012108891315658879, "loss": 0.4356, "step": 368 }, { "epoch": 1.811808118081181, "grad_norm": 3.9243521690368652, "learning_rate": 0.00012070051293037492, "loss": 0.434, "step": 369 }, { "epoch": 1.8167281672816729, "grad_norm": 2.8489391803741455, "learning_rate": 0.00012031178617098371, "loss": 0.3572, "step": 370 }, { "epoch": 1.8216482164821648, "grad_norm": 2.8946075439453125, "learning_rate": 0.00011992273901025269, "loss": 0.3993, "step": 371 }, { "epoch": 1.8265682656826567, "grad_norm": 1.3082534074783325, "learning_rate": 0.0001195333775850736, "loss": 0.4137, "step": 372 }, { "epoch": 1.8314883148831487, "grad_norm": 1.9355298280715942, "learning_rate": 0.00011914370803729533, "loss": 0.3746, "step": 373 }, { "epoch": 1.8364083640836408, "grad_norm": 2.1702141761779785, "learning_rate": 0.00011875373651362727, "loss": 0.3622, "step": 374 }, { "epoch": 1.841328413284133, "grad_norm": 1.4988595247268677, "learning_rate": 0.00011836346916554205, "loss": 0.3619, "step": 375 }, { "epoch": 1.846248462484625, "grad_norm": 1.761991262435913, "learning_rate": 0.00011797291214917881, "loss": 0.4358, "step": 376 }, { "epoch": 1.8511685116851169, "grad_norm": 0.745695173740387, "learning_rate": 0.00011758207162524598, "loss": 0.3995, "step": 377 }, { "epoch": 1.8560885608856088, "grad_norm": 1.9512763023376465, "learning_rate": 0.00011719095375892396, "loss": 0.4432, "step": 378 }, { "epoch": 1.8610086100861007, "grad_norm": 2.6219289302825928, "learning_rate": 0.00011679956471976814, "loss": 0.4677, "step": 379 }, { "epoch": 1.865928659286593, "grad_norm": 3.995495080947876, "learning_rate": 0.0001164079106816113, "loss": 0.2968, "step": 380 }, { "epoch": 1.8708487084870848, "grad_norm": 0.8375853300094604, "learning_rate": 0.00011601599782246646, "loss": 0.4035, "step": 381 }, { "epoch": 1.875768757687577, "grad_norm": 1.5365298986434937, "learning_rate": 0.00011562383232442926, "loss": 0.417, "step": 382 }, { "epoch": 1.880688806888069, "grad_norm": 0.9506827592849731, "learning_rate": 0.0001152314203735805, "loss": 0.3772, "step": 383 }, { "epoch": 1.8856088560885609, "grad_norm": 0.900571346282959, "learning_rate": 0.00011483876815988867, "loss": 0.3805, "step": 384 }, { "epoch": 1.8905289052890528, "grad_norm": 1.0722607374191284, "learning_rate": 0.00011444588187711205, "loss": 0.4088, "step": 385 }, { "epoch": 1.8954489544895448, "grad_norm": 1.7054160833358765, "learning_rate": 0.00011405276772270126, "loss": 0.3956, "step": 386 }, { "epoch": 1.900369003690037, "grad_norm": 0.986569881439209, "learning_rate": 0.0001136594318977014, "loss": 0.4079, "step": 387 }, { "epoch": 1.9052890528905289, "grad_norm": 0.8886733651161194, "learning_rate": 0.0001132658806066542, "loss": 0.3842, "step": 388 }, { "epoch": 1.910209102091021, "grad_norm": 0.8961542248725891, "learning_rate": 0.00011287212005750024, "loss": 0.3881, "step": 389 }, { "epoch": 1.915129151291513, "grad_norm": 2.585698127746582, "learning_rate": 0.00011247815646148087, "loss": 0.4671, "step": 390 }, { "epoch": 1.920049200492005, "grad_norm": 1.6980706453323364, "learning_rate": 0.00011208399603304047, "loss": 0.3499, "step": 391 }, { "epoch": 1.9249692496924968, "grad_norm": 0.8087127804756165, "learning_rate": 0.00011168964498972818, "loss": 0.3786, "step": 392 }, { "epoch": 1.9298892988929888, "grad_norm": 1.2001378536224365, "learning_rate": 0.00011129510955209996, "loss": 0.3502, "step": 393 }, { "epoch": 1.934809348093481, "grad_norm": 1.2342605590820312, "learning_rate": 0.00011090039594362045, "loss": 0.3924, "step": 394 }, { "epoch": 1.939729397293973, "grad_norm": 1.3440324068069458, "learning_rate": 0.00011050551039056479, "loss": 0.4143, "step": 395 }, { "epoch": 1.944649446494465, "grad_norm": 0.7715713977813721, "learning_rate": 0.00011011045912192035, "loss": 0.3371, "step": 396 }, { "epoch": 1.949569495694957, "grad_norm": 0.9439634084701538, "learning_rate": 0.0001097152483692886, "loss": 0.3916, "step": 397 }, { "epoch": 1.954489544895449, "grad_norm": 1.3055254220962524, "learning_rate": 0.00010931988436678666, "loss": 0.4016, "step": 398 }, { "epoch": 1.9594095940959408, "grad_norm": 0.8916832208633423, "learning_rate": 0.00010892437335094912, "loss": 0.3525, "step": 399 }, { "epoch": 1.964329643296433, "grad_norm": 1.3914546966552734, "learning_rate": 0.00010852872156062946, "loss": 0.3771, "step": 400 }, { "epoch": 1.969249692496925, "grad_norm": 0.9857692122459412, "learning_rate": 0.00010813293523690191, "loss": 0.3393, "step": 401 }, { "epoch": 1.974169741697417, "grad_norm": 2.0917956829071045, "learning_rate": 0.00010773702062296273, "loss": 0.4354, "step": 402 }, { "epoch": 1.979089790897909, "grad_norm": 1.275038719177246, "learning_rate": 0.00010734098396403192, "loss": 0.3398, "step": 403 }, { "epoch": 1.984009840098401, "grad_norm": 2.0591840744018555, "learning_rate": 0.00010694483150725458, "loss": 0.4068, "step": 404 }, { "epoch": 1.988929889298893, "grad_norm": 4.258203506469727, "learning_rate": 0.00010654856950160253, "loss": 0.5179, "step": 405 }, { "epoch": 1.9938499384993849, "grad_norm": 1.638200283050537, "learning_rate": 0.00010615220419777548, "loss": 0.3814, "step": 406 }, { "epoch": 1.998769987699877, "grad_norm": 1.5387883186340332, "learning_rate": 0.00010575574184810269, "loss": 0.3818, "step": 407 }, { "epoch": 2.0, "grad_norm": 4.10122013092041, "learning_rate": 0.0001053591887064442, "loss": 0.3577, "step": 408 }, { "epoch": 2.004920049200492, "grad_norm": 1.2864503860473633, "learning_rate": 0.00010496255102809223, "loss": 0.3394, "step": 409 }, { "epoch": 2.009840098400984, "grad_norm": 0.9785577654838562, "learning_rate": 0.00010456583506967248, "loss": 0.3734, "step": 410 }, { "epoch": 2.014760147601476, "grad_norm": 0.43325719237327576, "learning_rate": 0.00010416904708904548, "loss": 0.3805, "step": 411 }, { "epoch": 2.019680196801968, "grad_norm": 3.109909772872925, "learning_rate": 0.00010377219334520783, "loss": 0.4594, "step": 412 }, { "epoch": 2.02460024600246, "grad_norm": 1.8757784366607666, "learning_rate": 0.00010337528009819344, "loss": 0.4087, "step": 413 }, { "epoch": 2.029520295202952, "grad_norm": 3.7887425422668457, "learning_rate": 0.00010297831360897492, "loss": 0.436, "step": 414 }, { "epoch": 2.034440344403444, "grad_norm": 0.3496626019477844, "learning_rate": 0.00010258130013936474, "loss": 0.3854, "step": 415 }, { "epoch": 2.039360393603936, "grad_norm": 1.3036730289459229, "learning_rate": 0.00010218424595191631, "loss": 0.3803, "step": 416 }, { "epoch": 2.044280442804428, "grad_norm": 4.245285987854004, "learning_rate": 0.00010178715730982549, "loss": 0.4952, "step": 417 }, { "epoch": 2.0492004920049203, "grad_norm": 3.09157133102417, "learning_rate": 0.00010139004047683151, "loss": 0.3944, "step": 418 }, { "epoch": 2.054120541205412, "grad_norm": 1.5304118394851685, "learning_rate": 0.00010099290171711841, "loss": 0.4174, "step": 419 }, { "epoch": 2.059040590405904, "grad_norm": 2.2359232902526855, "learning_rate": 0.00010059574729521595, "loss": 0.3358, "step": 420 }, { "epoch": 2.063960639606396, "grad_norm": 0.6024315357208252, "learning_rate": 0.0001001985834759011, "loss": 0.3981, "step": 421 }, { "epoch": 2.068880688806888, "grad_norm": 1.2679041624069214, "learning_rate": 9.980141652409895e-05, "loss": 0.4225, "step": 422 }, { "epoch": 2.07380073800738, "grad_norm": 0.895416796207428, "learning_rate": 9.940425270478407e-05, "loss": 0.388, "step": 423 }, { "epoch": 2.078720787207872, "grad_norm": 1.567826747894287, "learning_rate": 9.900709828288164e-05, "loss": 0.3704, "step": 424 }, { "epoch": 2.0836408364083643, "grad_norm": 1.9329123497009277, "learning_rate": 9.860995952316851e-05, "loss": 0.4234, "step": 425 }, { "epoch": 2.088560885608856, "grad_norm": 0.44675880670547485, "learning_rate": 9.821284269017455e-05, "loss": 0.3876, "step": 426 }, { "epoch": 2.093480934809348, "grad_norm": 1.7258495092391968, "learning_rate": 9.781575404808371e-05, "loss": 0.4297, "step": 427 }, { "epoch": 2.09840098400984, "grad_norm": 1.092556118965149, "learning_rate": 9.741869986063526e-05, "loss": 0.4026, "step": 428 }, { "epoch": 2.103321033210332, "grad_norm": 1.4725236892700195, "learning_rate": 9.702168639102509e-05, "loss": 0.4385, "step": 429 }, { "epoch": 2.108241082410824, "grad_norm": 2.7385778427124023, "learning_rate": 9.662471990180657e-05, "loss": 0.4424, "step": 430 }, { "epoch": 2.113161131611316, "grad_norm": 2.480210542678833, "learning_rate": 9.622780665479222e-05, "loss": 0.4206, "step": 431 }, { "epoch": 2.1180811808118083, "grad_norm": 0.8362523913383484, "learning_rate": 9.583095291095453e-05, "loss": 0.3986, "step": 432 }, { "epoch": 2.1230012300123002, "grad_norm": 2.0622987747192383, "learning_rate": 9.543416493032757e-05, "loss": 0.3485, "step": 433 }, { "epoch": 2.127921279212792, "grad_norm": 2.7538793087005615, "learning_rate": 9.503744897190778e-05, "loss": 0.3756, "step": 434 }, { "epoch": 2.132841328413284, "grad_norm": 0.8746367692947388, "learning_rate": 9.464081129355586e-05, "loss": 0.3751, "step": 435 }, { "epoch": 2.137761377613776, "grad_norm": 1.8478419780731201, "learning_rate": 9.424425815189733e-05, "loss": 0.4292, "step": 436 }, { "epoch": 2.142681426814268, "grad_norm": 1.6647083759307861, "learning_rate": 9.384779580222453e-05, "loss": 0.339, "step": 437 }, { "epoch": 2.14760147601476, "grad_norm": 4.109962463378906, "learning_rate": 9.345143049839749e-05, "loss": 0.4544, "step": 438 }, { "epoch": 2.1525215252152523, "grad_norm": 1.978119969367981, "learning_rate": 9.305516849274541e-05, "loss": 0.3702, "step": 439 }, { "epoch": 2.1574415744157442, "grad_norm": 1.917183518409729, "learning_rate": 9.265901603596811e-05, "loss": 0.4077, "step": 440 }, { "epoch": 2.162361623616236, "grad_norm": 0.47567835450172424, "learning_rate": 9.226297937703728e-05, "loss": 0.3356, "step": 441 }, { "epoch": 2.167281672816728, "grad_norm": 2.546321392059326, "learning_rate": 9.186706476309812e-05, "loss": 0.4337, "step": 442 }, { "epoch": 2.17220172201722, "grad_norm": 2.111480236053467, "learning_rate": 9.147127843937055e-05, "loss": 0.4024, "step": 443 }, { "epoch": 2.177121771217712, "grad_norm": 1.1858526468276978, "learning_rate": 9.107562664905093e-05, "loss": 0.3637, "step": 444 }, { "epoch": 2.1820418204182044, "grad_norm": 1.404078722000122, "learning_rate": 9.068011563321336e-05, "loss": 0.4173, "step": 445 }, { "epoch": 2.1869618696186963, "grad_norm": 1.1295206546783447, "learning_rate": 9.028475163071141e-05, "loss": 0.3856, "step": 446 }, { "epoch": 2.1918819188191883, "grad_norm": 1.2605645656585693, "learning_rate": 8.988954087807968e-05, "loss": 0.4193, "step": 447 }, { "epoch": 2.19680196801968, "grad_norm": 1.1261564493179321, "learning_rate": 8.949448960943524e-05, "loss": 0.407, "step": 448 }, { "epoch": 2.201722017220172, "grad_norm": 2.366487979888916, "learning_rate": 8.909960405637958e-05, "loss": 0.3946, "step": 449 }, { "epoch": 2.206642066420664, "grad_norm": 2.1479427814483643, "learning_rate": 8.870489044790006e-05, "loss": 0.3728, "step": 450 }, { "epoch": 2.211562115621156, "grad_norm": 2.990525245666504, "learning_rate": 8.831035501027186e-05, "loss": 0.3367, "step": 451 }, { "epoch": 2.2164821648216484, "grad_norm": 1.812566876411438, "learning_rate": 8.791600396695954e-05, "loss": 0.3689, "step": 452 }, { "epoch": 2.2214022140221403, "grad_norm": 0.5948531031608582, "learning_rate": 8.752184353851916e-05, "loss": 0.4018, "step": 453 }, { "epoch": 2.2263222632226323, "grad_norm": 1.8020761013031006, "learning_rate": 8.712787994249979e-05, "loss": 0.3965, "step": 454 }, { "epoch": 2.231242312423124, "grad_norm": 1.5464495420455933, "learning_rate": 8.673411939334581e-05, "loss": 0.3353, "step": 455 }, { "epoch": 2.236162361623616, "grad_norm": 1.8382320404052734, "learning_rate": 8.634056810229862e-05, "loss": 0.3916, "step": 456 }, { "epoch": 2.241082410824108, "grad_norm": 1.5499740839004517, "learning_rate": 8.594723227729875e-05, "loss": 0.3895, "step": 457 }, { "epoch": 2.2460024600246005, "grad_norm": 4.030876636505127, "learning_rate": 8.555411812288798e-05, "loss": 0.4616, "step": 458 }, { "epoch": 2.2509225092250924, "grad_norm": 1.1098424196243286, "learning_rate": 8.516123184011135e-05, "loss": 0.2977, "step": 459 }, { "epoch": 2.2558425584255843, "grad_norm": 1.2961804866790771, "learning_rate": 8.47685796264195e-05, "loss": 0.36, "step": 460 }, { "epoch": 2.2607626076260763, "grad_norm": 1.140372633934021, "learning_rate": 8.437616767557077e-05, "loss": 0.351, "step": 461 }, { "epoch": 2.265682656826568, "grad_norm": 3.4962611198425293, "learning_rate": 8.398400217753357e-05, "loss": 0.4724, "step": 462 }, { "epoch": 2.27060270602706, "grad_norm": 2.6000497341156006, "learning_rate": 8.359208931838871e-05, "loss": 0.4618, "step": 463 }, { "epoch": 2.275522755227552, "grad_norm": 2.955470323562622, "learning_rate": 8.320043528023188e-05, "loss": 0.461, "step": 464 }, { "epoch": 2.280442804428044, "grad_norm": 0.9662995934486389, "learning_rate": 8.280904624107606e-05, "loss": 0.3457, "step": 465 }, { "epoch": 2.2853628536285364, "grad_norm": 0.8392460346221924, "learning_rate": 8.241792837475405e-05, "loss": 0.354, "step": 466 }, { "epoch": 2.2902829028290284, "grad_norm": 2.8896520137786865, "learning_rate": 8.202708785082121e-05, "loss": 0.416, "step": 467 }, { "epoch": 2.2952029520295203, "grad_norm": 1.6311709880828857, "learning_rate": 8.163653083445799e-05, "loss": 0.399, "step": 468 }, { "epoch": 2.3001230012300122, "grad_norm": 0.4800054132938385, "learning_rate": 8.124626348637279e-05, "loss": 0.3758, "step": 469 }, { "epoch": 2.305043050430504, "grad_norm": 1.0817440748214722, "learning_rate": 8.085629196270469e-05, "loss": 0.3764, "step": 470 }, { "epoch": 2.3099630996309966, "grad_norm": 1.6088804006576538, "learning_rate": 8.046662241492645e-05, "loss": 0.4473, "step": 471 }, { "epoch": 2.3148831488314885, "grad_norm": 1.0749715566635132, "learning_rate": 8.007726098974734e-05, "loss": 0.3703, "step": 472 }, { "epoch": 2.3198031980319804, "grad_norm": 1.5354204177856445, "learning_rate": 7.96882138290163e-05, "loss": 0.4164, "step": 473 }, { "epoch": 2.3247232472324724, "grad_norm": 1.117240309715271, "learning_rate": 7.929948706962508e-05, "loss": 0.4144, "step": 474 }, { "epoch": 2.3296432964329643, "grad_norm": 0.9542057514190674, "learning_rate": 7.891108684341121e-05, "loss": 0.4162, "step": 475 }, { "epoch": 2.3345633456334562, "grad_norm": 1.411424994468689, "learning_rate": 7.852301927706159e-05, "loss": 0.4402, "step": 476 }, { "epoch": 2.339483394833948, "grad_norm": 1.8303946256637573, "learning_rate": 7.813529049201556e-05, "loss": 0.423, "step": 477 }, { "epoch": 2.34440344403444, "grad_norm": 1.6640418767929077, "learning_rate": 7.774790660436858e-05, "loss": 0.3943, "step": 478 }, { "epoch": 2.3493234932349325, "grad_norm": 0.5208873152732849, "learning_rate": 7.736087372477554e-05, "loss": 0.4215, "step": 479 }, { "epoch": 2.3542435424354244, "grad_norm": 2.1671223640441895, "learning_rate": 7.69741979583546e-05, "loss": 0.3839, "step": 480 }, { "epoch": 2.3591635916359164, "grad_norm": 2.075159788131714, "learning_rate": 7.658788540459062e-05, "loss": 0.3851, "step": 481 }, { "epoch": 2.3640836408364083, "grad_norm": 1.8642665147781372, "learning_rate": 7.620194215723919e-05, "loss": 0.3669, "step": 482 }, { "epoch": 2.3690036900369003, "grad_norm": 2.8715755939483643, "learning_rate": 7.581637430423037e-05, "loss": 0.4352, "step": 483 }, { "epoch": 2.373923739237392, "grad_norm": 0.8020451664924622, "learning_rate": 7.543118792757266e-05, "loss": 0.3657, "step": 484 }, { "epoch": 2.3788437884378846, "grad_norm": 2.100980758666992, "learning_rate": 7.504638910325717e-05, "loss": 0.3141, "step": 485 }, { "epoch": 2.3837638376383765, "grad_norm": 3.8309755325317383, "learning_rate": 7.466198390116158e-05, "loss": 0.494, "step": 486 }, { "epoch": 2.3886838868388685, "grad_norm": 1.7863093614578247, "learning_rate": 7.427797838495463e-05, "loss": 0.3792, "step": 487 }, { "epoch": 2.3936039360393604, "grad_norm": 1.1884002685546875, "learning_rate": 7.389437861200024e-05, "loss": 0.3928, "step": 488 }, { "epoch": 2.3985239852398523, "grad_norm": 1.9756462574005127, "learning_rate": 7.35111906332622e-05, "loss": 0.4218, "step": 489 }, { "epoch": 2.4034440344403443, "grad_norm": 3.6889054775238037, "learning_rate": 7.312842049320844e-05, "loss": 0.4441, "step": 490 }, { "epoch": 2.408364083640836, "grad_norm": 2.106717109680176, "learning_rate": 7.2746074229716e-05, "loss": 0.3783, "step": 491 }, { "epoch": 2.4132841328413286, "grad_norm": 1.312242865562439, "learning_rate": 7.236415787397548e-05, "loss": 0.3342, "step": 492 }, { "epoch": 2.4182041820418205, "grad_norm": 0.7120693325996399, "learning_rate": 7.198267745039612e-05, "loss": 0.3846, "step": 493 }, { "epoch": 2.4231242312423125, "grad_norm": 1.6067770719528198, "learning_rate": 7.160163897651075e-05, "loss": 0.4396, "step": 494 }, { "epoch": 2.4280442804428044, "grad_norm": 4.197781562805176, "learning_rate": 7.122104846288064e-05, "loss": 0.2713, "step": 495 }, { "epoch": 2.4329643296432963, "grad_norm": 1.1666693687438965, "learning_rate": 7.08409119130011e-05, "loss": 0.3647, "step": 496 }, { "epoch": 2.4378843788437883, "grad_norm": 1.3826804161071777, "learning_rate": 7.04612353232063e-05, "loss": 0.3739, "step": 497 }, { "epoch": 2.4428044280442807, "grad_norm": 1.140659213066101, "learning_rate": 7.008202468257514e-05, "loss": 0.4207, "step": 498 }, { "epoch": 2.4477244772447726, "grad_norm": 2.2047266960144043, "learning_rate": 6.970328597283637e-05, "loss": 0.3767, "step": 499 }, { "epoch": 2.4526445264452645, "grad_norm": 2.385573148727417, "learning_rate": 6.932502516827461e-05, "loss": 0.3369, "step": 500 }, { "epoch": 2.4575645756457565, "grad_norm": 1.869011402130127, "learning_rate": 6.894724823563583e-05, "loss": 0.3521, "step": 501 }, { "epoch": 2.4624846248462484, "grad_norm": 1.2904314994812012, "learning_rate": 6.85699611340333e-05, "loss": 0.3519, "step": 502 }, { "epoch": 2.4674046740467404, "grad_norm": 1.8398619890213013, "learning_rate": 6.819316981485372e-05, "loss": 0.3123, "step": 503 }, { "epoch": 2.4723247232472323, "grad_norm": 2.567601442337036, "learning_rate": 6.781688022166311e-05, "loss": 0.4435, "step": 504 }, { "epoch": 2.4772447724477242, "grad_norm": 1.0559594631195068, "learning_rate": 6.744109829011332e-05, "loss": 0.3921, "step": 505 }, { "epoch": 2.4821648216482166, "grad_norm": 1.4271594285964966, "learning_rate": 6.706582994784814e-05, "loss": 0.382, "step": 506 }, { "epoch": 2.4870848708487086, "grad_norm": 2.100080728530884, "learning_rate": 6.669108111441003e-05, "loss": 0.4241, "step": 507 }, { "epoch": 2.4920049200492005, "grad_norm": 2.3189799785614014, "learning_rate": 6.631685770114654e-05, "loss": 0.4492, "step": 508 }, { "epoch": 2.4969249692496924, "grad_norm": 1.2089158296585083, "learning_rate": 6.594316561111724e-05, "loss": 0.3763, "step": 509 }, { "epoch": 2.5018450184501844, "grad_norm": 2.086798906326294, "learning_rate": 6.557001073900044e-05, "loss": 0.4291, "step": 510 }, { "epoch": 2.5067650676506767, "grad_norm": 1.3246550559997559, "learning_rate": 6.519739897100034e-05, "loss": 0.4328, "step": 511 }, { "epoch": 2.5116851168511687, "grad_norm": 3.522636890411377, "learning_rate": 6.482533618475422e-05, "loss": 0.3572, "step": 512 }, { "epoch": 2.5166051660516606, "grad_norm": 2.3924167156219482, "learning_rate": 6.445382824923938e-05, "loss": 0.4262, "step": 513 }, { "epoch": 2.5215252152521526, "grad_norm": 3.661113739013672, "learning_rate": 6.408288102468113e-05, "loss": 0.3796, "step": 514 }, { "epoch": 2.5264452644526445, "grad_norm": 1.2376595735549927, "learning_rate": 6.371250036245976e-05, "loss": 0.3796, "step": 515 }, { "epoch": 2.5313653136531364, "grad_norm": 1.8098406791687012, "learning_rate": 6.334269210501875e-05, "loss": 0.3707, "step": 516 }, { "epoch": 2.5362853628536284, "grad_norm": 1.7512861490249634, "learning_rate": 6.297346208577213e-05, "loss": 0.3753, "step": 517 }, { "epoch": 2.5412054120541203, "grad_norm": 2.3691437244415283, "learning_rate": 6.260481612901299e-05, "loss": 0.3678, "step": 518 }, { "epoch": 2.5461254612546127, "grad_norm": 2.599379539489746, "learning_rate": 6.223676004982105e-05, "loss": 0.4462, "step": 519 }, { "epoch": 2.5510455104551046, "grad_norm": 1.492092251777649, "learning_rate": 6.18692996539714e-05, "loss": 0.4379, "step": 520 }, { "epoch": 2.5559655596555966, "grad_norm": 1.2708606719970703, "learning_rate": 6.150244073784266e-05, "loss": 0.3573, "step": 521 }, { "epoch": 2.5608856088560885, "grad_norm": 2.399810552597046, "learning_rate": 6.113618908832561e-05, "loss": 0.4584, "step": 522 }, { "epoch": 2.5658056580565805, "grad_norm": 3.6795196533203125, "learning_rate": 6.0770550482731924e-05, "loss": 0.247, "step": 523 }, { "epoch": 2.570725707257073, "grad_norm": 1.5731709003448486, "learning_rate": 6.0405530688702986e-05, "loss": 0.4207, "step": 524 }, { "epoch": 2.5756457564575648, "grad_norm": 2.2327213287353516, "learning_rate": 6.0041135464119024e-05, "loss": 0.389, "step": 525 }, { "epoch": 2.5805658056580567, "grad_norm": 2.482600688934326, "learning_rate": 5.9677370557008104e-05, "loss": 0.4297, "step": 526 }, { "epoch": 2.5854858548585486, "grad_norm": 0.5008729696273804, "learning_rate": 5.9314241705455674e-05, "loss": 0.37, "step": 527 }, { "epoch": 2.5904059040590406, "grad_norm": 1.346571683883667, "learning_rate": 5.895175463751385e-05, "loss": 0.398, "step": 528 }, { "epoch": 2.5953259532595325, "grad_norm": 1.3295096158981323, "learning_rate": 5.858991507111122e-05, "loss": 0.4046, "step": 529 }, { "epoch": 2.6002460024600245, "grad_norm": 2.531033515930176, "learning_rate": 5.8228728713962543e-05, "loss": 0.441, "step": 530 }, { "epoch": 2.6051660516605164, "grad_norm": 1.4059702157974243, "learning_rate": 5.786820126347876e-05, "loss": 0.3887, "step": 531 }, { "epoch": 2.6100861008610083, "grad_norm": 0.8365688323974609, "learning_rate": 5.750833840667711e-05, "loss": 0.3926, "step": 532 }, { "epoch": 2.6150061500615007, "grad_norm": 1.1072005033493042, "learning_rate": 5.7149145820091385e-05, "loss": 0.4331, "step": 533 }, { "epoch": 2.6199261992619927, "grad_norm": 4.232044219970703, "learning_rate": 5.6790629169682564e-05, "loss": 0.313, "step": 534 }, { "epoch": 2.6248462484624846, "grad_norm": 1.9496935606002808, "learning_rate": 5.6432794110749134e-05, "loss": 0.388, "step": 535 }, { "epoch": 2.6297662976629765, "grad_norm": 2.281867265701294, "learning_rate": 5.607564628783817e-05, "loss": 0.3739, "step": 536 }, { "epoch": 2.6346863468634685, "grad_norm": 1.5007566213607788, "learning_rate": 5.571919133465605e-05, "loss": 0.4018, "step": 537 }, { "epoch": 2.639606396063961, "grad_norm": 1.5338659286499023, "learning_rate": 5.5363434873979903e-05, "loss": 0.3782, "step": 538 }, { "epoch": 2.644526445264453, "grad_norm": 1.8886133432388306, "learning_rate": 5.500838251756857e-05, "loss": 0.4441, "step": 539 }, { "epoch": 2.6494464944649447, "grad_norm": 3.30102801322937, "learning_rate": 5.465403986607426e-05, "loss": 0.3137, "step": 540 }, { "epoch": 2.6543665436654367, "grad_norm": 1.8262077569961548, "learning_rate": 5.430041250895428e-05, "loss": 0.4104, "step": 541 }, { "epoch": 2.6592865928659286, "grad_norm": 1.551676869392395, "learning_rate": 5.3947506024382665e-05, "loss": 0.3337, "step": 542 }, { "epoch": 2.6642066420664205, "grad_norm": 2.0609912872314453, "learning_rate": 5.359532597916233e-05, "loss": 0.3059, "step": 543 }, { "epoch": 2.6691266912669125, "grad_norm": 2.948434829711914, "learning_rate": 5.324387792863719e-05, "loss": 0.4629, "step": 544 }, { "epoch": 2.6740467404674044, "grad_norm": 1.7256718873977661, "learning_rate": 5.289316741660466e-05, "loss": 0.3752, "step": 545 }, { "epoch": 2.678966789667897, "grad_norm": 3.3157119750976562, "learning_rate": 5.254319997522796e-05, "loss": 0.4715, "step": 546 }, { "epoch": 2.6838868388683887, "grad_norm": 2.951591730117798, "learning_rate": 5.21939811249492e-05, "loss": 0.4372, "step": 547 }, { "epoch": 2.6888068880688807, "grad_norm": 1.9655730724334717, "learning_rate": 5.1845516374401784e-05, "loss": 0.3728, "step": 548 }, { "epoch": 2.6937269372693726, "grad_norm": 2.9351847171783447, "learning_rate": 5.14978112203241e-05, "loss": 0.4404, "step": 549 }, { "epoch": 2.6986469864698646, "grad_norm": 1.8943357467651367, "learning_rate": 5.11508711474725e-05, "loss": 0.2844, "step": 550 }, { "epoch": 2.703567035670357, "grad_norm": 0.9512324333190918, "learning_rate": 5.080470162853472e-05, "loss": 0.3566, "step": 551 }, { "epoch": 2.708487084870849, "grad_norm": 1.0358315706253052, "learning_rate": 5.0459308124043715e-05, "loss": 0.3267, "step": 552 }, { "epoch": 2.713407134071341, "grad_norm": 0.828611433506012, "learning_rate": 5.0114696082291425e-05, "loss": 0.3766, "step": 553 }, { "epoch": 2.7183271832718328, "grad_norm": 1.0593851804733276, "learning_rate": 4.9770870939242986e-05, "loss": 0.3895, "step": 554 }, { "epoch": 2.7232472324723247, "grad_norm": 1.6970057487487793, "learning_rate": 4.942783811845074e-05, "loss": 0.3349, "step": 555 }, { "epoch": 2.7281672816728166, "grad_norm": 0.8009957671165466, "learning_rate": 4.908560303096887e-05, "loss": 0.3741, "step": 556 }, { "epoch": 2.7330873308733086, "grad_norm": 1.5965189933776855, "learning_rate": 4.874417107526795e-05, "loss": 0.326, "step": 557 }, { "epoch": 2.7380073800738005, "grad_norm": 1.6400642395019531, "learning_rate": 4.840354763714991e-05, "loss": 0.3416, "step": 558 }, { "epoch": 2.742927429274293, "grad_norm": 2.1281847953796387, "learning_rate": 4.8063738089662926e-05, "loss": 0.3142, "step": 559 }, { "epoch": 2.747847478474785, "grad_norm": 1.0202291011810303, "learning_rate": 4.772474779301669e-05, "loss": 0.389, "step": 560 }, { "epoch": 2.7527675276752768, "grad_norm": 1.6404527425765991, "learning_rate": 4.738658209449805e-05, "loss": 0.3891, "step": 561 }, { "epoch": 2.7576875768757687, "grad_norm": 0.9586972594261169, "learning_rate": 4.704924632838636e-05, "loss": 0.3888, "step": 562 }, { "epoch": 2.7626076260762606, "grad_norm": 3.16829776763916, "learning_rate": 4.671274581586958e-05, "loss": 0.4402, "step": 563 }, { "epoch": 2.767527675276753, "grad_norm": 1.2215882539749146, "learning_rate": 4.637708586496018e-05, "loss": 0.3508, "step": 564 }, { "epoch": 2.772447724477245, "grad_norm": 1.4616819620132446, "learning_rate": 4.604227177041156e-05, "loss": 0.4277, "step": 565 }, { "epoch": 2.777367773677737, "grad_norm": 1.0973330736160278, "learning_rate": 4.570830881363439e-05, "loss": 0.4127, "step": 566 }, { "epoch": 2.782287822878229, "grad_norm": 1.676638126373291, "learning_rate": 4.537520226261333e-05, "loss": 0.4243, "step": 567 }, { "epoch": 2.787207872078721, "grad_norm": 2.636601448059082, "learning_rate": 4.5042957371824057e-05, "loss": 0.3116, "step": 568 }, { "epoch": 2.7921279212792127, "grad_norm": 1.0604605674743652, "learning_rate": 4.471157938215017e-05, "loss": 0.4186, "step": 569 }, { "epoch": 2.7970479704797047, "grad_norm": 1.1565591096878052, "learning_rate": 4.438107352080076e-05, "loss": 0.4068, "step": 570 }, { "epoch": 2.8019680196801966, "grad_norm": 0.5449007153511047, "learning_rate": 4.405144500122772e-05, "loss": 0.3739, "step": 571 }, { "epoch": 2.8068880688806885, "grad_norm": 1.7176798582077026, "learning_rate": 4.372269902304363e-05, "loss": 0.4036, "step": 572 }, { "epoch": 2.811808118081181, "grad_norm": 1.9100306034088135, "learning_rate": 4.339484077193974e-05, "loss": 0.3436, "step": 573 }, { "epoch": 2.816728167281673, "grad_norm": 0.950062096118927, "learning_rate": 4.3067875419604184e-05, "loss": 0.4206, "step": 574 }, { "epoch": 2.821648216482165, "grad_norm": 1.950170636177063, "learning_rate": 4.2741808123640335e-05, "loss": 0.4187, "step": 575 }, { "epoch": 2.8265682656826567, "grad_norm": 1.7998218536376953, "learning_rate": 4.241664402748544e-05, "loss": 0.3643, "step": 576 }, { "epoch": 2.8314883148831487, "grad_norm": 0.5359982252120972, "learning_rate": 4.209238826032965e-05, "loss": 0.4071, "step": 577 }, { "epoch": 2.836408364083641, "grad_norm": 2.177288055419922, "learning_rate": 4.1769045937034876e-05, "loss": 0.4656, "step": 578 }, { "epoch": 2.841328413284133, "grad_norm": 1.8691096305847168, "learning_rate": 4.144662215805426e-05, "loss": 0.4428, "step": 579 }, { "epoch": 2.846248462484625, "grad_norm": 1.3971619606018066, "learning_rate": 4.1125122009351634e-05, "loss": 0.3774, "step": 580 }, { "epoch": 2.851168511685117, "grad_norm": 1.863781213760376, "learning_rate": 4.080455056232147e-05, "loss": 0.3686, "step": 581 }, { "epoch": 2.856088560885609, "grad_norm": 2.2776503562927246, "learning_rate": 4.048491287370863e-05, "loss": 0.4009, "step": 582 }, { "epoch": 2.8610086100861007, "grad_norm": 0.3682532012462616, "learning_rate": 4.016621398552877e-05, "loss": 0.4106, "step": 583 }, { "epoch": 2.8659286592865927, "grad_norm": 1.6590131521224976, "learning_rate": 3.9848458924988684e-05, "loss": 0.4113, "step": 584 }, { "epoch": 2.8708487084870846, "grad_norm": 0.8245828747749329, "learning_rate": 3.953165270440721e-05, "loss": 0.3874, "step": 585 }, { "epoch": 2.875768757687577, "grad_norm": 1.1494457721710205, "learning_rate": 3.921580032113602e-05, "loss": 0.4041, "step": 586 }, { "epoch": 2.880688806888069, "grad_norm": 0.3291958272457123, "learning_rate": 3.8900906757480614e-05, "loss": 0.4009, "step": 587 }, { "epoch": 2.885608856088561, "grad_norm": 3.5113492012023926, "learning_rate": 3.858697698062217e-05, "loss": 0.4783, "step": 588 }, { "epoch": 2.890528905289053, "grad_norm": 0.7835597991943359, "learning_rate": 3.8274015942538745e-05, "loss": 0.3928, "step": 589 }, { "epoch": 2.8954489544895448, "grad_norm": 1.4036983251571655, "learning_rate": 3.7962028579927555e-05, "loss": 0.3694, "step": 590 }, { "epoch": 2.900369003690037, "grad_norm": 1.1807712316513062, "learning_rate": 3.7651019814126654e-05, "loss": 0.385, "step": 591 }, { "epoch": 2.905289052890529, "grad_norm": 2.0742995738983154, "learning_rate": 3.734099455103779e-05, "loss": 0.4164, "step": 592 }, { "epoch": 2.910209102091021, "grad_norm": 2.6549105644226074, "learning_rate": 3.7031957681048604e-05, "loss": 0.347, "step": 593 }, { "epoch": 2.915129151291513, "grad_norm": 1.3094247579574585, "learning_rate": 3.6723914078955825e-05, "loss": 0.4112, "step": 594 }, { "epoch": 2.920049200492005, "grad_norm": 0.5627428293228149, "learning_rate": 3.64168686038881e-05, "loss": 0.3947, "step": 595 }, { "epoch": 2.924969249692497, "grad_norm": 1.4705300331115723, "learning_rate": 3.6110826099229453e-05, "loss": 0.3828, "step": 596 }, { "epoch": 2.9298892988929888, "grad_norm": 0.9498153924942017, "learning_rate": 3.580579139254303e-05, "loss": 0.3829, "step": 597 }, { "epoch": 2.9348093480934807, "grad_norm": 2.052823781967163, "learning_rate": 3.550176929549468e-05, "loss": 0.3334, "step": 598 }, { "epoch": 2.939729397293973, "grad_norm": 0.9632225036621094, "learning_rate": 3.5198764603777235e-05, "loss": 0.3681, "step": 599 }, { "epoch": 2.944649446494465, "grad_norm": 1.2577297687530518, "learning_rate": 3.489678209703475e-05, "loss": 0.3469, "step": 600 }, { "epoch": 2.949569495694957, "grad_norm": 1.42790949344635, "learning_rate": 3.459582653878731e-05, "loss": 0.4072, "step": 601 }, { "epoch": 2.954489544895449, "grad_norm": 2.504870653152466, "learning_rate": 3.429590267635565e-05, "loss": 0.4232, "step": 602 }, { "epoch": 2.959409594095941, "grad_norm": 2.3047032356262207, "learning_rate": 3.399701524078635e-05, "loss": 0.3763, "step": 603 }, { "epoch": 2.9643296432964332, "grad_norm": 1.7464078664779663, "learning_rate": 3.369916894677733e-05, "loss": 0.3354, "step": 604 }, { "epoch": 2.969249692496925, "grad_norm": 1.6479971408843994, "learning_rate": 3.340236849260324e-05, "loss": 0.3798, "step": 605 }, { "epoch": 2.974169741697417, "grad_norm": 1.558695673942566, "learning_rate": 3.31066185600417e-05, "loss": 0.3488, "step": 606 }, { "epoch": 2.979089790897909, "grad_norm": 3.189610719680786, "learning_rate": 3.281192381429894e-05, "loss": 0.441, "step": 607 }, { "epoch": 2.984009840098401, "grad_norm": 0.9114331603050232, "learning_rate": 3.251828890393677e-05, "loss": 0.3922, "step": 608 }, { "epoch": 2.988929889298893, "grad_norm": 0.84954833984375, "learning_rate": 3.222571846079881e-05, "loss": 0.3682, "step": 609 }, { "epoch": 2.993849938499385, "grad_norm": 2.6202147006988525, "learning_rate": 3.193421709993779e-05, "loss": 0.453, "step": 610 }, { "epoch": 2.998769987699877, "grad_norm": 1.6845208406448364, "learning_rate": 3.1643789419542324e-05, "loss": 0.3606, "step": 611 }, { "epoch": 3.0, "grad_norm": 4.893674850463867, "learning_rate": 3.135444000086485e-05, "loss": 0.5199, "step": 612 }, { "epoch": 3.004920049200492, "grad_norm": 1.350771427154541, "learning_rate": 3.1066173408148955e-05, "loss": 0.3319, "step": 613 }, { "epoch": 3.009840098400984, "grad_norm": 2.239192247390747, "learning_rate": 3.077899418855772e-05, "loss": 0.4358, "step": 614 }, { "epoch": 3.014760147601476, "grad_norm": 2.0310704708099365, "learning_rate": 3.04929068721017e-05, "loss": 0.4024, "step": 615 }, { "epoch": 3.019680196801968, "grad_norm": 0.5520709156990051, "learning_rate": 3.0207915971567624e-05, "loss": 0.3869, "step": 616 }, { "epoch": 3.02460024600246, "grad_norm": 1.409179925918579, "learning_rate": 2.992402598244727e-05, "loss": 0.343, "step": 617 }, { "epoch": 3.029520295202952, "grad_norm": 3.2636709213256836, "learning_rate": 2.9641241382866348e-05, "loss": 0.3208, "step": 618 }, { "epoch": 3.034440344403444, "grad_norm": 1.3331984281539917, "learning_rate": 2.9359566633514037e-05, "loss": 0.4065, "step": 619 }, { "epoch": 3.039360393603936, "grad_norm": 1.49379563331604, "learning_rate": 2.907900617757252e-05, "loss": 0.3844, "step": 620 }, { "epoch": 3.044280442804428, "grad_norm": 1.0063300132751465, "learning_rate": 2.879956444064703e-05, "loss": 0.4103, "step": 621 }, { "epoch": 3.0492004920049203, "grad_norm": 1.5763076543807983, "learning_rate": 2.8521245830695864e-05, "loss": 0.4199, "step": 622 }, { "epoch": 3.054120541205412, "grad_norm": 1.9557186365127563, "learning_rate": 2.8244054737960935e-05, "loss": 0.3928, "step": 623 }, { "epoch": 3.059040590405904, "grad_norm": 1.7936758995056152, "learning_rate": 2.7967995534898596e-05, "loss": 0.3503, "step": 624 }, { "epoch": 3.063960639606396, "grad_norm": 2.0918500423431396, "learning_rate": 2.7693072576110514e-05, "loss": 0.3772, "step": 625 }, { "epoch": 3.068880688806888, "grad_norm": 1.531785249710083, "learning_rate": 2.7419290198275095e-05, "loss": 0.413, "step": 626 }, { "epoch": 3.07380073800738, "grad_norm": 0.7834340929985046, "learning_rate": 2.7146652720079003e-05, "loss": 0.3919, "step": 627 }, { "epoch": 3.078720787207872, "grad_norm": 1.8467501401901245, "learning_rate": 2.6875164442149147e-05, "loss": 0.368, "step": 628 }, { "epoch": 3.0836408364083643, "grad_norm": 1.6197096109390259, "learning_rate": 2.6604829646984686e-05, "loss": 0.3476, "step": 629 }, { "epoch": 3.088560885608856, "grad_norm": 2.2266929149627686, "learning_rate": 2.6335652598889683e-05, "loss": 0.3692, "step": 630 }, { "epoch": 3.093480934809348, "grad_norm": 1.0801973342895508, "learning_rate": 2.60676375439055e-05, "loss": 0.4145, "step": 631 }, { "epoch": 3.09840098400984, "grad_norm": 0.6759971976280212, "learning_rate": 2.5800788709744227e-05, "loss": 0.3621, "step": 632 }, { "epoch": 3.103321033210332, "grad_norm": 1.5428274869918823, "learning_rate": 2.5535110305721776e-05, "loss": 0.3946, "step": 633 }, { "epoch": 3.108241082410824, "grad_norm": 0.4800112843513489, "learning_rate": 2.5270606522691443e-05, "loss": 0.3695, "step": 634 }, { "epoch": 3.113161131611316, "grad_norm": 1.418677568435669, "learning_rate": 2.500728153297788e-05, "loss": 0.3413, "step": 635 }, { "epoch": 3.1180811808118083, "grad_norm": 1.384252667427063, "learning_rate": 2.4745139490311254e-05, "loss": 0.3376, "step": 636 }, { "epoch": 3.1230012300123002, "grad_norm": 0.7807061672210693, "learning_rate": 2.4484184529761834e-05, "loss": 0.4, "step": 637 }, { "epoch": 3.127921279212792, "grad_norm": 1.9366016387939453, "learning_rate": 2.4224420767674562e-05, "loss": 0.3731, "step": 638 }, { "epoch": 3.132841328413284, "grad_norm": 2.5923564434051514, "learning_rate": 2.3965852301604254e-05, "loss": 0.4395, "step": 639 }, { "epoch": 3.137761377613776, "grad_norm": 0.9284645318984985, "learning_rate": 2.370848321025093e-05, "loss": 0.3901, "step": 640 }, { "epoch": 3.142681426814268, "grad_norm": 1.9988764524459839, "learning_rate": 2.345231755339554e-05, "loss": 0.4379, "step": 641 }, { "epoch": 3.14760147601476, "grad_norm": 1.626031517982483, "learning_rate": 2.3197359371835802e-05, "loss": 0.4256, "step": 642 }, { "epoch": 3.1525215252152523, "grad_norm": 2.1211905479431152, "learning_rate": 2.2943612687322525e-05, "loss": 0.3934, "step": 643 }, { "epoch": 3.1574415744157442, "grad_norm": 1.0140880346298218, "learning_rate": 2.2691081502496246e-05, "loss": 0.3604, "step": 644 }, { "epoch": 3.162361623616236, "grad_norm": 2.3775453567504883, "learning_rate": 2.243976980082394e-05, "loss": 0.4068, "step": 645 }, { "epoch": 3.167281672816728, "grad_norm": 2.1912922859191895, "learning_rate": 2.218968154653629e-05, "loss": 0.3614, "step": 646 }, { "epoch": 3.17220172201722, "grad_norm": 1.8802082538604736, "learning_rate": 2.194082068456509e-05, "loss": 0.3843, "step": 647 }, { "epoch": 3.177121771217712, "grad_norm": 1.67764151096344, "learning_rate": 2.169319114048114e-05, "loss": 0.3707, "step": 648 }, { "epoch": 3.1820418204182044, "grad_norm": 1.9697654247283936, "learning_rate": 2.1446796820432167e-05, "loss": 0.3357, "step": 649 }, { "epoch": 3.1869618696186963, "grad_norm": 1.7767447233200073, "learning_rate": 2.1201641611081246e-05, "loss": 0.3937, "step": 650 }, { "epoch": 3.1918819188191883, "grad_norm": 1.3625164031982422, "learning_rate": 2.0957729379545655e-05, "loss": 0.3593, "step": 651 }, { "epoch": 3.19680196801968, "grad_norm": 1.0841906070709229, "learning_rate": 2.0715063973335568e-05, "loss": 0.393, "step": 652 }, { "epoch": 3.201722017220172, "grad_norm": 3.0648295879364014, "learning_rate": 2.04736492202937e-05, "loss": 0.3615, "step": 653 }, { "epoch": 3.206642066420664, "grad_norm": 1.1780354976654053, "learning_rate": 2.0233488928534673e-05, "loss": 0.3733, "step": 654 }, { "epoch": 3.211562115621156, "grad_norm": 2.0348012447357178, "learning_rate": 1.9994586886385046e-05, "loss": 0.3895, "step": 655 }, { "epoch": 3.2164821648216484, "grad_norm": 0.7234269380569458, "learning_rate": 1.9756946862323535e-05, "loss": 0.3621, "step": 656 }, { "epoch": 3.2214022140221403, "grad_norm": 2.2290384769439697, "learning_rate": 1.9520572604921672e-05, "loss": 0.4369, "step": 657 }, { "epoch": 3.2263222632226323, "grad_norm": 0.7513899803161621, "learning_rate": 1.9285467842784467e-05, "loss": 0.3614, "step": 658 }, { "epoch": 3.231242312423124, "grad_norm": 2.5259876251220703, "learning_rate": 1.9051636284491757e-05, "loss": 0.3877, "step": 659 }, { "epoch": 3.236162361623616, "grad_norm": 2.885737180709839, "learning_rate": 1.8819081618539723e-05, "loss": 0.4691, "step": 660 }, { "epoch": 3.241082410824108, "grad_norm": 1.888336181640625, "learning_rate": 1.858780751328255e-05, "loss": 0.433, "step": 661 }, { "epoch": 3.2460024600246005, "grad_norm": 0.801278293132782, "learning_rate": 1.8357817616874694e-05, "loss": 0.3704, "step": 662 }, { "epoch": 3.2509225092250924, "grad_norm": 1.2432537078857422, "learning_rate": 1.8129115557213262e-05, "loss": 0.3552, "step": 663 }, { "epoch": 3.2558425584255843, "grad_norm": 1.9892895221710205, "learning_rate": 1.7901704941880914e-05, "loss": 0.3551, "step": 664 }, { "epoch": 3.2607626076260763, "grad_norm": 1.448431372642517, "learning_rate": 1.7675589358088763e-05, "loss": 0.4053, "step": 665 }, { "epoch": 3.265682656826568, "grad_norm": 2.4297046661376953, "learning_rate": 1.745077237261994e-05, "loss": 0.4334, "step": 666 }, { "epoch": 3.27060270602706, "grad_norm": 1.624751329421997, "learning_rate": 1.7227257531773223e-05, "loss": 0.4296, "step": 667 }, { "epoch": 3.275522755227552, "grad_norm": 1.1023207902908325, "learning_rate": 1.7005048361307262e-05, "loss": 0.375, "step": 668 }, { "epoch": 3.280442804428044, "grad_norm": 1.6138256788253784, "learning_rate": 1.6784148366384754e-05, "loss": 0.3394, "step": 669 }, { "epoch": 3.2853628536285364, "grad_norm": 0.9887522459030151, "learning_rate": 1.656456103151728e-05, "loss": 0.3597, "step": 670 }, { "epoch": 3.2902829028290284, "grad_norm": 1.7043898105621338, "learning_rate": 1.6346289820510363e-05, "loss": 0.3417, "step": 671 }, { "epoch": 3.2952029520295203, "grad_norm": 1.6882188320159912, "learning_rate": 1.612933817640868e-05, "loss": 0.436, "step": 672 }, { "epoch": 3.3001230012300122, "grad_norm": 0.7217171788215637, "learning_rate": 1.5913709521441988e-05, "loss": 0.3997, "step": 673 }, { "epoch": 3.305043050430504, "grad_norm": 2.6820271015167236, "learning_rate": 1.5699407256970833e-05, "loss": 0.3115, "step": 674 }, { "epoch": 3.3099630996309966, "grad_norm": 1.4860421419143677, "learning_rate": 1.5486434763433222e-05, "loss": 0.3516, "step": 675 }, { "epoch": 3.3148831488314885, "grad_norm": 1.136051893234253, "learning_rate": 1.527479540029104e-05, "loss": 0.4023, "step": 676 }, { "epoch": 3.3198031980319804, "grad_norm": 2.500821828842163, "learning_rate": 1.5064492505977234e-05, "loss": 0.4225, "step": 677 }, { "epoch": 3.3247232472324724, "grad_norm": 0.5306374430656433, "learning_rate": 1.4855529397843038e-05, "loss": 0.3675, "step": 678 }, { "epoch": 3.3296432964329643, "grad_norm": 1.5522453784942627, "learning_rate": 1.4647909372105672e-05, "loss": 0.3182, "step": 679 }, { "epoch": 3.3345633456334562, "grad_norm": 1.6273597478866577, "learning_rate": 1.4441635703796408e-05, "loss": 0.3548, "step": 680 }, { "epoch": 3.339483394833948, "grad_norm": 1.7513864040374756, "learning_rate": 1.4236711646708844e-05, "loss": 0.3177, "step": 681 }, { "epoch": 3.34440344403444, "grad_norm": 1.033565878868103, "learning_rate": 1.4033140433347569e-05, "loss": 0.3639, "step": 682 }, { "epoch": 3.3493234932349325, "grad_norm": 1.3103158473968506, "learning_rate": 1.3830925274877216e-05, "loss": 0.4256, "step": 683 }, { "epoch": 3.3542435424354244, "grad_norm": 2.1008458137512207, "learning_rate": 1.363006936107183e-05, "loss": 0.4194, "step": 684 }, { "epoch": 3.3591635916359164, "grad_norm": 1.350831151008606, "learning_rate": 1.343057586026446e-05, "loss": 0.3792, "step": 685 }, { "epoch": 3.3640836408364083, "grad_norm": 3.0984957218170166, "learning_rate": 1.3232447919297274e-05, "loss": 0.4341, "step": 686 }, { "epoch": 3.3690036900369003, "grad_norm": 0.47078070044517517, "learning_rate": 1.3035688663471834e-05, "loss": 0.3664, "step": 687 }, { "epoch": 3.373923739237392, "grad_norm": 1.277298927307129, "learning_rate": 1.2840301196499893e-05, "loss": 0.3714, "step": 688 }, { "epoch": 3.3788437884378846, "grad_norm": 2.4945287704467773, "learning_rate": 1.2646288600454448e-05, "loss": 0.3517, "step": 689 }, { "epoch": 3.3837638376383765, "grad_norm": 0.9373493194580078, "learning_rate": 1.2453653935720867e-05, "loss": 0.3881, "step": 690 }, { "epoch": 3.3886838868388685, "grad_norm": 4.251840591430664, "learning_rate": 1.2262400240949023e-05, "loss": 0.305, "step": 691 }, { "epoch": 3.3936039360393604, "grad_norm": 2.382617950439453, "learning_rate": 1.2072530533005012e-05, "loss": 0.4376, "step": 692 }, { "epoch": 3.3985239852398523, "grad_norm": 1.3531382083892822, "learning_rate": 1.1884047806923815e-05, "loss": 0.4127, "step": 693 }, { "epoch": 3.4034440344403443, "grad_norm": 0.8284920454025269, "learning_rate": 1.169695503586179e-05, "loss": 0.406, "step": 694 }, { "epoch": 3.408364083640836, "grad_norm": 0.6216104030609131, "learning_rate": 1.1511255171050084e-05, "loss": 0.3963, "step": 695 }, { "epoch": 3.4132841328413286, "grad_norm": 2.1421051025390625, "learning_rate": 1.1326951141747788e-05, "loss": 0.449, "step": 696 }, { "epoch": 3.4182041820418205, "grad_norm": 1.2773298025131226, "learning_rate": 1.1144045855195973e-05, "loss": 0.3583, "step": 697 }, { "epoch": 3.4231242312423125, "grad_norm": 1.9336838722229004, "learning_rate": 1.0962542196571634e-05, "loss": 0.363, "step": 698 }, { "epoch": 3.4280442804428044, "grad_norm": 2.467573881149292, "learning_rate": 1.078244302894229e-05, "loss": 0.4245, "step": 699 }, { "epoch": 3.4329643296432963, "grad_norm": 2.337416648864746, "learning_rate": 1.0603751193220846e-05, "loss": 0.4083, "step": 700 }, { "epoch": 3.4378843788437883, "grad_norm": 2.5366225242614746, "learning_rate": 1.0426469508120662e-05, "loss": 0.353, "step": 701 }, { "epoch": 3.4428044280442807, "grad_norm": 1.9000239372253418, "learning_rate": 1.0250600770111185e-05, "loss": 0.4028, "step": 702 }, { "epoch": 3.4477244772447726, "grad_norm": 1.7372283935546875, "learning_rate": 1.0076147753373789e-05, "loss": 0.4029, "step": 703 }, { "epoch": 3.4526445264452645, "grad_norm": 1.1029900312423706, "learning_rate": 9.903113209758096e-06, "loss": 0.3817, "step": 704 }, { "epoch": 3.4575645756457565, "grad_norm": 1.5212130546569824, "learning_rate": 9.731499868738447e-06, "loss": 0.3745, "step": 705 }, { "epoch": 3.4624846248462484, "grad_norm": 1.2530347108840942, "learning_rate": 9.561310437370907e-06, "loss": 0.4198, "step": 706 }, { "epoch": 3.4674046740467404, "grad_norm": 1.090973138809204, "learning_rate": 9.392547600250634e-06, "loss": 0.3743, "step": 707 }, { "epoch": 3.4723247232472323, "grad_norm": 0.8587853312492371, "learning_rate": 9.225214019469385e-06, "loss": 0.3928, "step": 708 }, { "epoch": 3.4772447724477242, "grad_norm": 1.6450562477111816, "learning_rate": 9.059312334573633e-06, "loss": 0.3529, "step": 709 }, { "epoch": 3.4821648216482166, "grad_norm": 1.3053218126296997, "learning_rate": 8.89484516252287e-06, "loss": 0.3634, "step": 710 }, { "epoch": 3.4870848708487086, "grad_norm": 2.639911413192749, "learning_rate": 8.731815097648433e-06, "loss": 0.4159, "step": 711 }, { "epoch": 3.4920049200492005, "grad_norm": 0.9935341477394104, "learning_rate": 8.570224711612385e-06, "loss": 0.3803, "step": 712 }, { "epoch": 3.4969249692496924, "grad_norm": 1.752165675163269, "learning_rate": 8.410076553367208e-06, "loss": 0.4104, "step": 713 }, { "epoch": 3.5018450184501844, "grad_norm": 1.270850419998169, "learning_rate": 8.251373149115293e-06, "loss": 0.4122, "step": 714 }, { "epoch": 3.5067650676506767, "grad_norm": 2.370002508163452, "learning_rate": 8.094117002269363e-06, "loss": 0.4529, "step": 715 }, { "epoch": 3.5116851168511687, "grad_norm": 2.229987382888794, "learning_rate": 7.938310593412879e-06, "loss": 0.4117, "step": 716 }, { "epoch": 3.5166051660516606, "grad_norm": 1.700907588005066, "learning_rate": 7.783956380260837e-06, "loss": 0.3801, "step": 717 }, { "epoch": 3.5215252152521526, "grad_norm": 1.5140172243118286, "learning_rate": 7.631056797621106e-06, "loss": 0.3708, "step": 718 }, { "epoch": 3.5264452644526445, "grad_norm": 1.4080220460891724, "learning_rate": 7.479614257355971e-06, "loss": 0.3763, "step": 719 }, { "epoch": 3.5313653136531364, "grad_norm": 1.585070252418518, "learning_rate": 7.329631148344118e-06, "loss": 0.358, "step": 720 }, { "epoch": 3.5362853628536284, "grad_norm": 2.044015645980835, "learning_rate": 7.181109836442912e-06, "loss": 0.3774, "step": 721 }, { "epoch": 3.5412054120541203, "grad_norm": 0.8359534740447998, "learning_rate": 7.034052664451118e-06, "loss": 0.3663, "step": 722 }, { "epoch": 3.5461254612546127, "grad_norm": 2.3022444248199463, "learning_rate": 6.88846195207189e-06, "loss": 0.3065, "step": 723 }, { "epoch": 3.5510455104551046, "grad_norm": 1.8175033330917358, "learning_rate": 6.7443399958762584e-06, "loss": 0.4242, "step": 724 }, { "epoch": 3.5559655596555966, "grad_norm": 1.7454516887664795, "learning_rate": 6.6016890692668364e-06, "loss": 0.3996, "step": 725 }, { "epoch": 3.5608856088560885, "grad_norm": 2.403921604156494, "learning_rate": 6.460511422441984e-06, "loss": 0.4444, "step": 726 }, { "epoch": 3.5658056580565805, "grad_norm": 1.0997297763824463, "learning_rate": 6.320809282360319e-06, "loss": 0.4124, "step": 727 }, { "epoch": 3.570725707257073, "grad_norm": 3.04303240776062, "learning_rate": 6.1825848527055865e-06, "loss": 0.4291, "step": 728 }, { "epoch": 3.5756457564575648, "grad_norm": 0.9251189827919006, "learning_rate": 6.04584031385188e-06, "loss": 0.3733, "step": 729 }, { "epoch": 3.5805658056580567, "grad_norm": 1.9034310579299927, "learning_rate": 5.910577822829233e-06, "loss": 0.3884, "step": 730 }, { "epoch": 3.5854858548585486, "grad_norm": 1.187487244606018, "learning_rate": 5.77679951328971e-06, "loss": 0.4108, "step": 731 }, { "epoch": 3.5904059040590406, "grad_norm": 1.513329267501831, "learning_rate": 5.644507495473572e-06, "loss": 0.4008, "step": 732 }, { "epoch": 3.5953259532595325, "grad_norm": 2.4123191833496094, "learning_rate": 5.5137038561761115e-06, "loss": 0.4162, "step": 733 }, { "epoch": 3.6002460024600245, "grad_norm": 1.3358474969863892, "learning_rate": 5.3843906587146886e-06, "loss": 0.4287, "step": 734 }, { "epoch": 3.6051660516605164, "grad_norm": 1.746752142906189, "learning_rate": 5.256569942896217e-06, "loss": 0.341, "step": 735 }, { "epoch": 3.6100861008610083, "grad_norm": 1.716902732849121, "learning_rate": 5.130243724984995e-06, "loss": 0.4344, "step": 736 }, { "epoch": 3.6150061500615007, "grad_norm": 0.44636377692222595, "learning_rate": 5.005413997670816e-06, "loss": 0.3995, "step": 737 }, { "epoch": 3.6199261992619927, "grad_norm": 0.6673928499221802, "learning_rate": 4.8820827300376075e-06, "loss": 0.3771, "step": 738 }, { "epoch": 3.6248462484624846, "grad_norm": 1.8165249824523926, "learning_rate": 4.760251867532362e-06, "loss": 0.4214, "step": 739 }, { "epoch": 3.6297662976629765, "grad_norm": 1.8206608295440674, "learning_rate": 4.639923331934471e-06, "loss": 0.3361, "step": 740 }, { "epoch": 3.6346863468634685, "grad_norm": 1.2049740552902222, "learning_rate": 4.521099021325336e-06, "loss": 0.4241, "step": 741 }, { "epoch": 3.639606396063961, "grad_norm": 2.151357650756836, "learning_rate": 4.403780810058511e-06, "loss": 0.3934, "step": 742 }, { "epoch": 3.644526445264453, "grad_norm": 2.024153470993042, "learning_rate": 4.287970548730069e-06, "loss": 0.4109, "step": 743 }, { "epoch": 3.6494464944649447, "grad_norm": 0.612326979637146, "learning_rate": 4.173670064149482e-06, "loss": 0.4119, "step": 744 }, { "epoch": 3.6543665436654367, "grad_norm": 1.2650341987609863, "learning_rate": 4.060881159310725e-06, "loss": 0.4048, "step": 745 }, { "epoch": 3.6592865928659286, "grad_norm": 1.5588371753692627, "learning_rate": 3.949605613363882e-06, "loss": 0.3616, "step": 746 }, { "epoch": 3.6642066420664205, "grad_norm": 0.8163132667541504, "learning_rate": 3.839845181587098e-06, "loss": 0.4051, "step": 747 }, { "epoch": 3.6691266912669125, "grad_norm": 2.6811370849609375, "learning_rate": 3.7316015953588467e-06, "loss": 0.4446, "step": 748 }, { "epoch": 3.6740467404674044, "grad_norm": 3.0077154636383057, "learning_rate": 3.6248765621306414e-06, "loss": 0.3562, "step": 749 }, { "epoch": 3.678966789667897, "grad_norm": 1.8142826557159424, "learning_rate": 3.519671765400079e-06, "loss": 0.3967, "step": 750 }, { "epoch": 3.6838868388683887, "grad_norm": 4.520020008087158, "learning_rate": 3.4159888646843495e-06, "loss": 0.4737, "step": 751 }, { "epoch": 3.6888068880688807, "grad_norm": 2.5950474739074707, "learning_rate": 3.313829495493992e-06, "loss": 0.3269, "step": 752 }, { "epoch": 3.6937269372693726, "grad_norm": 0.9162222146987915, "learning_rate": 3.2131952693070898e-06, "loss": 0.4284, "step": 753 }, { "epoch": 3.6986469864698646, "grad_norm": 2.3598175048828125, "learning_rate": 3.1140877735439387e-06, "loss": 0.4268, "step": 754 }, { "epoch": 3.703567035670357, "grad_norm": 2.1901378631591797, "learning_rate": 3.0165085715418763e-06, "loss": 0.3514, "step": 755 }, { "epoch": 3.708487084870849, "grad_norm": 1.2730752229690552, "learning_rate": 2.9204592025307566e-06, "loss": 0.3697, "step": 756 }, { "epoch": 3.713407134071341, "grad_norm": 1.7523503303527832, "learning_rate": 2.8259411816085492e-06, "loss": 0.3626, "step": 757 }, { "epoch": 3.7183271832718328, "grad_norm": 0.7201489806175232, "learning_rate": 2.732955999717546e-06, "loss": 0.4082, "step": 758 }, { "epoch": 3.7232472324723247, "grad_norm": 2.6464169025421143, "learning_rate": 2.6415051236207355e-06, "loss": 0.3311, "step": 759 }, { "epoch": 3.7281672816728166, "grad_norm": 1.9799178838729858, "learning_rate": 2.551589995878789e-06, "loss": 0.392, "step": 760 }, { "epoch": 3.7330873308733086, "grad_norm": 1.5155545473098755, "learning_rate": 2.4632120348272003e-06, "loss": 0.3762, "step": 761 }, { "epoch": 3.7380073800738005, "grad_norm": 1.5089105367660522, "learning_rate": 2.376372634553936e-06, "loss": 0.3995, "step": 762 }, { "epoch": 3.742927429274293, "grad_norm": 1.772503137588501, "learning_rate": 2.291073164877511e-06, "loss": 0.3853, "step": 763 }, { "epoch": 3.747847478474785, "grad_norm": 2.189436435699463, "learning_rate": 2.207314971325292e-06, "loss": 0.3494, "step": 764 }, { "epoch": 3.7527675276752768, "grad_norm": 1.9785796403884888, "learning_rate": 2.125099375112316e-06, "loss": 0.3675, "step": 765 }, { "epoch": 3.7576875768757687, "grad_norm": 2.732494831085205, "learning_rate": 2.0444276731204415e-06, "loss": 0.4188, "step": 766 }, { "epoch": 3.7626076260762606, "grad_norm": 1.5634301900863647, "learning_rate": 1.9653011378779283e-06, "loss": 0.4186, "step": 767 }, { "epoch": 3.767527675276753, "grad_norm": 0.6259942650794983, "learning_rate": 1.88772101753929e-06, "loss": 0.3834, "step": 768 }, { "epoch": 3.772447724477245, "grad_norm": 1.3457146883010864, "learning_rate": 1.8116885358656744e-06, "loss": 0.3696, "step": 769 }, { "epoch": 3.777367773677737, "grad_norm": 1.3714008331298828, "learning_rate": 1.7372048922054906e-06, "loss": 0.3921, "step": 770 }, { "epoch": 3.782287822878229, "grad_norm": 1.4138679504394531, "learning_rate": 1.6642712614755695e-06, "loss": 0.4379, "step": 771 }, { "epoch": 3.787207872078721, "grad_norm": 0.921842634677887, "learning_rate": 1.5928887941426107e-06, "loss": 0.3714, "step": 772 }, { "epoch": 3.7921279212792127, "grad_norm": 2.7711589336395264, "learning_rate": 1.523058616204942e-06, "loss": 0.3689, "step": 773 }, { "epoch": 3.7970479704797047, "grad_norm": 2.5462987422943115, "learning_rate": 1.4547818291749115e-06, "loss": 0.4578, "step": 774 }, { "epoch": 3.8019680196801966, "grad_norm": 2.8806490898132324, "learning_rate": 1.3880595100613792e-06, "loss": 0.3297, "step": 775 }, { "epoch": 3.8068880688806885, "grad_norm": 1.5188145637512207, "learning_rate": 1.3228927113528189e-06, "loss": 0.3871, "step": 776 }, { "epoch": 3.811808118081181, "grad_norm": 0.9707936644554138, "learning_rate": 1.2592824610006215e-06, "loss": 0.3656, "step": 777 }, { "epoch": 3.816728167281673, "grad_norm": 1.8770543336868286, "learning_rate": 1.1972297624030072e-06, "loss": 0.3981, "step": 778 }, { "epoch": 3.821648216482165, "grad_norm": 2.3081560134887695, "learning_rate": 1.1367355943890823e-06, "loss": 0.341, "step": 779 }, { "epoch": 3.8265682656826567, "grad_norm": 1.113144040107727, "learning_rate": 1.0778009112034748e-06, "loss": 0.3586, "step": 780 }, { "epoch": 3.8314883148831487, "grad_norm": 0.5980240702629089, "learning_rate": 1.0204266424912123e-06, "loss": 0.376, "step": 781 }, { "epoch": 3.836408364083641, "grad_norm": 0.6723970174789429, "learning_rate": 9.64613693283123e-07, "loss": 0.4038, "step": 782 }, { "epoch": 3.841328413284133, "grad_norm": 2.4948697090148926, "learning_rate": 9.103629439815354e-07, "loss": 0.3738, "step": 783 }, { "epoch": 3.846248462484625, "grad_norm": 1.11293625831604, "learning_rate": 8.57675250346368e-07, "loss": 0.3866, "step": 784 }, { "epoch": 3.851168511685117, "grad_norm": 2.0996763706207275, "learning_rate": 8.065514434816845e-07, "loss": 0.4064, "step": 785 }, { "epoch": 3.856088560885609, "grad_norm": 1.6557263135910034, "learning_rate": 7.569923298225146e-07, "loss": 0.3567, "step": 786 }, { "epoch": 3.8610086100861007, "grad_norm": 1.717772364616394, "learning_rate": 7.08998691122198e-07, "loss": 0.3856, "step": 787 }, { "epoch": 3.8659286592865927, "grad_norm": 1.4299819469451904, "learning_rate": 6.625712844400056e-07, "loss": 0.3652, "step": 788 }, { "epoch": 3.8708487084870846, "grad_norm": 2.8910887241363525, "learning_rate": 6.177108421292266e-07, "loss": 0.4677, "step": 789 }, { "epoch": 3.875768757687577, "grad_norm": 1.175137996673584, "learning_rate": 5.744180718255776e-07, "loss": 0.4193, "step": 790 }, { "epoch": 3.880688806888069, "grad_norm": 1.1175763607025146, "learning_rate": 5.326936564361118e-07, "loss": 0.3875, "step": 791 }, { "epoch": 3.885608856088561, "grad_norm": 0.9984952211380005, "learning_rate": 4.92538254128383e-07, "loss": 0.3799, "step": 792 }, { "epoch": 3.890528905289053, "grad_norm": 1.142543077468872, "learning_rate": 4.5395249832007604e-07, "loss": 0.4194, "step": 793 }, { "epoch": 3.8954489544895448, "grad_norm": 1.1013692617416382, "learning_rate": 4.1693699766902626e-07, "loss": 0.3853, "step": 794 }, { "epoch": 3.900369003690037, "grad_norm": 1.5713825225830078, "learning_rate": 3.814923360636158e-07, "loss": 0.4418, "step": 795 }, { "epoch": 3.905289052890529, "grad_norm": 3.3740017414093018, "learning_rate": 3.4761907261356976e-07, "loss": 0.3226, "step": 796 }, { "epoch": 3.910209102091021, "grad_norm": 2.347411870956421, "learning_rate": 3.1531774164111903e-07, "loss": 0.4269, "step": 797 }, { "epoch": 3.915129151291513, "grad_norm": 0.46610283851623535, "learning_rate": 2.8458885267260705e-07, "loss": 0.3861, "step": 798 }, { "epoch": 3.920049200492005, "grad_norm": 2.183335304260254, "learning_rate": 2.554328904303738e-07, "loss": 0.4076, "step": 799 }, { "epoch": 3.924969249692497, "grad_norm": 0.9739826321601868, "learning_rate": 2.2785031482521758e-07, "loss": 0.366, "step": 800 }, { "epoch": 3.9298892988929888, "grad_norm": 1.9975255727767944, "learning_rate": 2.0184156094905648e-07, "loss": 0.4491, "step": 801 }, { "epoch": 3.9348093480934807, "grad_norm": 2.732900619506836, "learning_rate": 1.7740703906810042e-07, "loss": 0.3248, "step": 802 }, { "epoch": 3.939729397293973, "grad_norm": 0.8809100389480591, "learning_rate": 1.545471346164007e-07, "loss": 0.3633, "step": 803 }, { "epoch": 3.944649446494465, "grad_norm": 0.5867434740066528, "learning_rate": 1.3326220818968838e-07, "loss": 0.3881, "step": 804 }, { "epoch": 3.949569495694957, "grad_norm": 0.8650780320167542, "learning_rate": 1.1355259553978981e-07, "loss": 0.3669, "step": 805 }, { "epoch": 3.954489544895449, "grad_norm": 1.4509629011154175, "learning_rate": 9.541860756925314e-08, "loss": 0.3649, "step": 806 }, { "epoch": 3.959409594095941, "grad_norm": 2.9854180812835693, "learning_rate": 7.886053032649665e-08, "loss": 0.3379, "step": 807 }, { "epoch": 3.9643296432964332, "grad_norm": 3.3452847003936768, "learning_rate": 6.387862500125685e-08, "loss": 0.3104, "step": 808 }, { "epoch": 3.969249692496925, "grad_norm": 1.342034935951233, "learning_rate": 5.047312792046954e-08, "loss": 0.3895, "step": 809 }, { "epoch": 3.974169741697417, "grad_norm": 1.3684653043746948, "learning_rate": 3.8644250544594975e-08, "loss": 0.3729, "step": 810 }, { "epoch": 3.979089790897909, "grad_norm": 2.351048231124878, "learning_rate": 2.839217946422057e-08, "loss": 0.4621, "step": 811 }, { "epoch": 3.984009840098401, "grad_norm": 0.49089106917381287, "learning_rate": 1.971707639712994e-08, "loss": 0.3819, "step": 812 }, { "epoch": 3.988929889298893, "grad_norm": 1.8144298791885376, "learning_rate": 1.2619078185793776e-08, "loss": 0.4157, "step": 813 }, { "epoch": 3.993849938499385, "grad_norm": 1.8721059560775757, "learning_rate": 7.098296795138293e-09, "loss": 0.3468, "step": 814 }, { "epoch": 3.998769987699877, "grad_norm": 1.0250661373138428, "learning_rate": 3.154819310868806e-09, "loss": 0.401, "step": 815 }, { "epoch": 4.0, "grad_norm": 1.8312103748321533, "learning_rate": 7.887079380153317e-10, "loss": 0.3332, "step": 816 }, { "epoch": 4.0, "step": 816, "total_flos": 1.3456927249947034e+17, "train_loss": 0.40810306406780783, "train_runtime": 2344.5136, "train_samples_per_second": 11.093, "train_steps_per_second": 0.348 } ], "logging_steps": 1, "max_steps": 816, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 2400000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3456927249947034e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }