{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 315, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003177124702144559, "grad_norm": 0.9960207343101501, "learning_rate": 0.0, "loss": 2.9881, "step": 1 }, { "epoch": 0.006354249404289118, "grad_norm": 0.9263616800308228, "learning_rate": 4e-05, "loss": 2.9644, "step": 2 }, { "epoch": 0.009531374106433678, "grad_norm": 0.9059100151062012, "learning_rate": 8e-05, "loss": 2.7584, "step": 3 }, { "epoch": 0.012708498808578236, "grad_norm": 0.8163802027702332, "learning_rate": 0.00012, "loss": 2.7029, "step": 4 }, { "epoch": 0.015885623510722795, "grad_norm": 0.7039979696273804, "learning_rate": 0.00016, "loss": 2.5991, "step": 5 }, { "epoch": 0.019062748212867357, "grad_norm": 0.5652405023574829, "learning_rate": 0.0002, "loss": 2.4187, "step": 6 }, { "epoch": 0.022239872915011914, "grad_norm": 0.7419441342353821, "learning_rate": 0.00019968, "loss": 2.2885, "step": 7 }, { "epoch": 0.025416997617156472, "grad_norm": 0.6665301322937012, "learning_rate": 0.00019936000000000002, "loss": 2.3176, "step": 8 }, { "epoch": 0.028594122319301033, "grad_norm": 0.5366690158843994, "learning_rate": 0.00019904, "loss": 2.2728, "step": 9 }, { "epoch": 0.03177124702144559, "grad_norm": 0.5066737532615662, "learning_rate": 0.00019872000000000002, "loss": 2.068, "step": 10 }, { "epoch": 0.03494837172359015, "grad_norm": 0.5810503959655762, "learning_rate": 0.0001984, "loss": 2.2197, "step": 11 }, { "epoch": 0.03812549642573471, "grad_norm": 0.47433704137802124, "learning_rate": 0.00019808, "loss": 1.9871, "step": 12 }, { "epoch": 0.04130262112787927, "grad_norm": 0.49652695655822754, "learning_rate": 0.00019776, "loss": 1.9139, "step": 13 }, { "epoch": 0.04447974583002383, "grad_norm": 0.4280414879322052, "learning_rate": 0.00019744, "loss": 1.8852, "step": 14 }, { "epoch": 0.04765687053216839, "grad_norm": 0.596341609954834, "learning_rate": 0.00019712, "loss": 1.9809, "step": 15 }, { "epoch": 0.050833995234312944, "grad_norm": 0.5067018866539001, "learning_rate": 0.0001968, "loss": 1.9549, "step": 16 }, { "epoch": 0.054011119936457505, "grad_norm": 0.4348883032798767, "learning_rate": 0.00019648000000000002, "loss": 1.9078, "step": 17 }, { "epoch": 0.057188244638602066, "grad_norm": 0.4295920133590698, "learning_rate": 0.00019616000000000002, "loss": 1.8234, "step": 18 }, { "epoch": 0.06036536934074663, "grad_norm": 0.43549808859825134, "learning_rate": 0.00019584, "loss": 1.9143, "step": 19 }, { "epoch": 0.06354249404289118, "grad_norm": 0.4168950319290161, "learning_rate": 0.00019552000000000003, "loss": 1.7874, "step": 20 }, { "epoch": 0.06671961874503574, "grad_norm": 0.4715218245983124, "learning_rate": 0.0001952, "loss": 1.8926, "step": 21 }, { "epoch": 0.0698967434471803, "grad_norm": 0.38382261991500854, "learning_rate": 0.00019488000000000003, "loss": 1.7832, "step": 22 }, { "epoch": 0.07307386814932486, "grad_norm": 0.43702301383018494, "learning_rate": 0.00019456, "loss": 1.7447, "step": 23 }, { "epoch": 0.07625099285146943, "grad_norm": 0.4813467562198639, "learning_rate": 0.00019424, "loss": 1.8851, "step": 24 }, { "epoch": 0.07942811755361398, "grad_norm": 0.4026224911212921, "learning_rate": 0.00019392000000000001, "loss": 1.8036, "step": 25 }, { "epoch": 0.08260524225575853, "grad_norm": 0.39529579877853394, "learning_rate": 0.00019360000000000002, "loss": 1.7207, "step": 26 }, { "epoch": 0.0857823669579031, "grad_norm": 0.4045431911945343, "learning_rate": 0.00019328000000000002, "loss": 1.7962, "step": 27 }, { "epoch": 0.08895949166004766, "grad_norm": 0.3818039894104004, "learning_rate": 0.00019296, "loss": 1.68, "step": 28 }, { "epoch": 0.09213661636219221, "grad_norm": 0.3767971098423004, "learning_rate": 0.00019264, "loss": 1.7949, "step": 29 }, { "epoch": 0.09531374106433678, "grad_norm": 0.38719820976257324, "learning_rate": 0.00019232, "loss": 1.7069, "step": 30 }, { "epoch": 0.09849086576648133, "grad_norm": 0.416172057390213, "learning_rate": 0.000192, "loss": 1.6897, "step": 31 }, { "epoch": 0.10166799046862589, "grad_norm": 0.3721797466278076, "learning_rate": 0.00019168, "loss": 1.6996, "step": 32 }, { "epoch": 0.10484511517077046, "grad_norm": 0.4110720753669739, "learning_rate": 0.00019136, "loss": 1.5606, "step": 33 }, { "epoch": 0.10802223987291501, "grad_norm": 0.3850787580013275, "learning_rate": 0.00019104000000000001, "loss": 1.7615, "step": 34 }, { "epoch": 0.11119936457505956, "grad_norm": 0.33883488178253174, "learning_rate": 0.00019072000000000002, "loss": 1.5231, "step": 35 }, { "epoch": 0.11437648927720413, "grad_norm": 0.37157201766967773, "learning_rate": 0.0001904, "loss": 1.6246, "step": 36 }, { "epoch": 0.11755361397934869, "grad_norm": 0.3907526433467865, "learning_rate": 0.00019008000000000002, "loss": 1.6537, "step": 37 }, { "epoch": 0.12073073868149325, "grad_norm": 0.39647847414016724, "learning_rate": 0.00018976, "loss": 1.6193, "step": 38 }, { "epoch": 0.12390786338363781, "grad_norm": 0.34513983130455017, "learning_rate": 0.00018944000000000003, "loss": 1.4992, "step": 39 }, { "epoch": 0.12708498808578236, "grad_norm": 0.4174259305000305, "learning_rate": 0.00018912, "loss": 1.6547, "step": 40 }, { "epoch": 0.13026211278792693, "grad_norm": 0.42140164971351624, "learning_rate": 0.0001888, "loss": 1.6165, "step": 41 }, { "epoch": 0.13343923749007147, "grad_norm": 0.40260136127471924, "learning_rate": 0.00018848, "loss": 1.5583, "step": 42 }, { "epoch": 0.13661636219221604, "grad_norm": 0.42584484815597534, "learning_rate": 0.00018816000000000001, "loss": 1.5742, "step": 43 }, { "epoch": 0.1397934868943606, "grad_norm": 0.3613159954547882, "learning_rate": 0.00018784000000000002, "loss": 1.6282, "step": 44 }, { "epoch": 0.14297061159650518, "grad_norm": 0.45315852761268616, "learning_rate": 0.00018752, "loss": 1.6274, "step": 45 }, { "epoch": 0.14614773629864972, "grad_norm": 0.3528841733932495, "learning_rate": 0.00018720000000000002, "loss": 1.5076, "step": 46 }, { "epoch": 0.14932486100079428, "grad_norm": 0.4335366487503052, "learning_rate": 0.00018688, "loss": 1.5561, "step": 47 }, { "epoch": 0.15250198570293885, "grad_norm": 0.4157211184501648, "learning_rate": 0.00018656, "loss": 1.4963, "step": 48 }, { "epoch": 0.1556791104050834, "grad_norm": 0.3932294249534607, "learning_rate": 0.00018624, "loss": 1.5381, "step": 49 }, { "epoch": 0.15885623510722796, "grad_norm": 0.47142326831817627, "learning_rate": 0.00018592, "loss": 1.7029, "step": 50 }, { "epoch": 0.16203335980937253, "grad_norm": 0.46922552585601807, "learning_rate": 0.0001856, "loss": 1.5722, "step": 51 }, { "epoch": 0.16521048451151707, "grad_norm": 0.40435677766799927, "learning_rate": 0.00018528000000000001, "loss": 1.5885, "step": 52 }, { "epoch": 0.16838760921366164, "grad_norm": 0.4449491500854492, "learning_rate": 0.00018496, "loss": 1.5691, "step": 53 }, { "epoch": 0.1715647339158062, "grad_norm": 0.46489715576171875, "learning_rate": 0.00018464000000000002, "loss": 1.5736, "step": 54 }, { "epoch": 0.17474185861795075, "grad_norm": 0.4461865723133087, "learning_rate": 0.00018432, "loss": 1.5359, "step": 55 }, { "epoch": 0.17791898332009531, "grad_norm": 0.4674195349216461, "learning_rate": 0.00018400000000000003, "loss": 1.4617, "step": 56 }, { "epoch": 0.18109610802223988, "grad_norm": 0.3901899755001068, "learning_rate": 0.00018368, "loss": 1.5141, "step": 57 }, { "epoch": 0.18427323272438442, "grad_norm": 0.46142131090164185, "learning_rate": 0.00018336, "loss": 1.5436, "step": 58 }, { "epoch": 0.187450357426529, "grad_norm": 0.4317268431186676, "learning_rate": 0.00018304, "loss": 1.56, "step": 59 }, { "epoch": 0.19062748212867356, "grad_norm": 0.44914504885673523, "learning_rate": 0.00018272, "loss": 1.5477, "step": 60 }, { "epoch": 0.1938046068308181, "grad_norm": 0.43380966782569885, "learning_rate": 0.00018240000000000002, "loss": 1.4989, "step": 61 }, { "epoch": 0.19698173153296267, "grad_norm": 0.41798654198646545, "learning_rate": 0.00018208000000000002, "loss": 1.3978, "step": 62 }, { "epoch": 0.20015885623510724, "grad_norm": 0.4322330355644226, "learning_rate": 0.00018176000000000002, "loss": 1.5094, "step": 63 }, { "epoch": 0.20333598093725178, "grad_norm": 0.4732660949230194, "learning_rate": 0.00018144, "loss": 1.4756, "step": 64 }, { "epoch": 0.20651310563939634, "grad_norm": 0.41877272725105286, "learning_rate": 0.00018112, "loss": 1.4598, "step": 65 }, { "epoch": 0.2096902303415409, "grad_norm": 0.46112221479415894, "learning_rate": 0.0001808, "loss": 1.5562, "step": 66 }, { "epoch": 0.21286735504368545, "grad_norm": 0.3946124017238617, "learning_rate": 0.00018048, "loss": 1.4517, "step": 67 }, { "epoch": 0.21604447974583002, "grad_norm": 0.452828586101532, "learning_rate": 0.00018016, "loss": 1.5298, "step": 68 }, { "epoch": 0.2192216044479746, "grad_norm": 0.45543792843818665, "learning_rate": 0.00017984, "loss": 1.5945, "step": 69 }, { "epoch": 0.22239872915011913, "grad_norm": 0.4937468469142914, "learning_rate": 0.00017952, "loss": 1.5111, "step": 70 }, { "epoch": 0.2255758538522637, "grad_norm": 0.43769434094429016, "learning_rate": 0.00017920000000000002, "loss": 1.5139, "step": 71 }, { "epoch": 0.22875297855440826, "grad_norm": 0.42178353667259216, "learning_rate": 0.00017888, "loss": 1.3964, "step": 72 }, { "epoch": 0.23193010325655283, "grad_norm": 0.4274325668811798, "learning_rate": 0.00017856000000000003, "loss": 1.4176, "step": 73 }, { "epoch": 0.23510722795869737, "grad_norm": 0.4603947401046753, "learning_rate": 0.00017824, "loss": 1.549, "step": 74 }, { "epoch": 0.23828435266084194, "grad_norm": 0.4948660731315613, "learning_rate": 0.00017792, "loss": 1.4564, "step": 75 }, { "epoch": 0.2414614773629865, "grad_norm": 0.4219314455986023, "learning_rate": 0.0001776, "loss": 1.4946, "step": 76 }, { "epoch": 0.24463860206513105, "grad_norm": 0.49445462226867676, "learning_rate": 0.00017728, "loss": 1.5655, "step": 77 }, { "epoch": 0.24781572676727562, "grad_norm": 0.4661003053188324, "learning_rate": 0.00017696, "loss": 1.5347, "step": 78 }, { "epoch": 0.2509928514694202, "grad_norm": 0.49738094210624695, "learning_rate": 0.00017664000000000002, "loss": 1.5218, "step": 79 }, { "epoch": 0.2541699761715647, "grad_norm": 0.44844523072242737, "learning_rate": 0.00017632000000000002, "loss": 1.4657, "step": 80 }, { "epoch": 0.25734710087370927, "grad_norm": 0.5222679972648621, "learning_rate": 0.00017600000000000002, "loss": 1.4799, "step": 81 }, { "epoch": 0.26052422557585386, "grad_norm": 0.5003090500831604, "learning_rate": 0.00017568, "loss": 1.4998, "step": 82 }, { "epoch": 0.2637013502779984, "grad_norm": 0.4072366952896118, "learning_rate": 0.00017536, "loss": 1.5213, "step": 83 }, { "epoch": 0.26687847498014294, "grad_norm": 0.42663538455963135, "learning_rate": 0.00017504, "loss": 1.5446, "step": 84 }, { "epoch": 0.27005559968228754, "grad_norm": 0.45552435517311096, "learning_rate": 0.00017472, "loss": 1.5624, "step": 85 }, { "epoch": 0.2732327243844321, "grad_norm": 0.463173508644104, "learning_rate": 0.0001744, "loss": 1.4161, "step": 86 }, { "epoch": 0.2764098490865767, "grad_norm": 0.4052661955356598, "learning_rate": 0.00017408, "loss": 1.5228, "step": 87 }, { "epoch": 0.2795869737887212, "grad_norm": 0.3988233506679535, "learning_rate": 0.00017376000000000002, "loss": 1.3896, "step": 88 }, { "epoch": 0.28276409849086576, "grad_norm": 0.3923889398574829, "learning_rate": 0.00017344, "loss": 1.4572, "step": 89 }, { "epoch": 0.28594122319301035, "grad_norm": 0.46868669986724854, "learning_rate": 0.00017312000000000002, "loss": 1.5586, "step": 90 }, { "epoch": 0.2891183478951549, "grad_norm": 0.43891963362693787, "learning_rate": 0.0001728, "loss": 1.5185, "step": 91 }, { "epoch": 0.29229547259729943, "grad_norm": 0.4684846103191376, "learning_rate": 0.00017248000000000003, "loss": 1.4992, "step": 92 }, { "epoch": 0.29547259729944403, "grad_norm": 0.4795592725276947, "learning_rate": 0.00017216, "loss": 1.4987, "step": 93 }, { "epoch": 0.29864972200158857, "grad_norm": 0.4201822578907013, "learning_rate": 0.00017184, "loss": 1.4367, "step": 94 }, { "epoch": 0.3018268467037331, "grad_norm": 0.443697065114975, "learning_rate": 0.00017152, "loss": 1.4436, "step": 95 }, { "epoch": 0.3050039714058777, "grad_norm": 0.4432813823223114, "learning_rate": 0.00017120000000000001, "loss": 1.523, "step": 96 }, { "epoch": 0.30818109610802225, "grad_norm": 0.43522974848747253, "learning_rate": 0.00017088000000000002, "loss": 1.3767, "step": 97 }, { "epoch": 0.3113582208101668, "grad_norm": 0.396990031003952, "learning_rate": 0.00017056000000000002, "loss": 1.3021, "step": 98 }, { "epoch": 0.3145353455123114, "grad_norm": 0.462819904088974, "learning_rate": 0.00017024, "loss": 1.455, "step": 99 }, { "epoch": 0.3177124702144559, "grad_norm": 0.41220882534980774, "learning_rate": 0.00016992, "loss": 1.4471, "step": 100 }, { "epoch": 0.32088959491660046, "grad_norm": 0.47001487016677856, "learning_rate": 0.0001696, "loss": 1.4668, "step": 101 }, { "epoch": 0.32406671961874506, "grad_norm": 0.4349619150161743, "learning_rate": 0.00016928, "loss": 1.4413, "step": 102 }, { "epoch": 0.3272438443208896, "grad_norm": 0.47175517678260803, "learning_rate": 0.00016896, "loss": 1.4546, "step": 103 }, { "epoch": 0.33042096902303414, "grad_norm": 0.4192788600921631, "learning_rate": 0.00016863999999999998, "loss": 1.4569, "step": 104 }, { "epoch": 0.33359809372517873, "grad_norm": 0.4177974462509155, "learning_rate": 0.00016832000000000001, "loss": 1.3423, "step": 105 }, { "epoch": 0.3367752184273233, "grad_norm": 0.4190915524959564, "learning_rate": 0.000168, "loss": 1.3857, "step": 106 }, { "epoch": 0.3399523431294678, "grad_norm": 0.42924079298973083, "learning_rate": 0.00016768000000000002, "loss": 1.4038, "step": 107 }, { "epoch": 0.3431294678316124, "grad_norm": 0.425611287355423, "learning_rate": 0.00016736, "loss": 1.395, "step": 108 }, { "epoch": 0.34630659253375695, "grad_norm": 0.4815029799938202, "learning_rate": 0.00016704000000000003, "loss": 1.4251, "step": 109 }, { "epoch": 0.3494837172359015, "grad_norm": 0.45862439274787903, "learning_rate": 0.00016672, "loss": 1.3488, "step": 110 }, { "epoch": 0.3526608419380461, "grad_norm": 0.46242061257362366, "learning_rate": 0.0001664, "loss": 1.393, "step": 111 }, { "epoch": 0.35583796664019063, "grad_norm": 0.46360430121421814, "learning_rate": 0.00016608, "loss": 1.4168, "step": 112 }, { "epoch": 0.35901509134233517, "grad_norm": 0.45501938462257385, "learning_rate": 0.00016576, "loss": 1.4718, "step": 113 }, { "epoch": 0.36219221604447976, "grad_norm": 0.44886520504951477, "learning_rate": 0.00016544000000000002, "loss": 1.4206, "step": 114 }, { "epoch": 0.3653693407466243, "grad_norm": 0.5432794690132141, "learning_rate": 0.00016512000000000002, "loss": 1.369, "step": 115 }, { "epoch": 0.36854646544876885, "grad_norm": 0.40831825137138367, "learning_rate": 0.0001648, "loss": 1.3482, "step": 116 }, { "epoch": 0.37172359015091344, "grad_norm": 0.4690685272216797, "learning_rate": 0.00016448000000000002, "loss": 1.4799, "step": 117 }, { "epoch": 0.374900714853058, "grad_norm": 0.4517834782600403, "learning_rate": 0.00016416, "loss": 1.3597, "step": 118 }, { "epoch": 0.3780778395552025, "grad_norm": 0.45939838886260986, "learning_rate": 0.00016384, "loss": 1.4188, "step": 119 }, { "epoch": 0.3812549642573471, "grad_norm": 0.4444164037704468, "learning_rate": 0.00016352, "loss": 1.4501, "step": 120 }, { "epoch": 0.38443208895949166, "grad_norm": 0.46045759320259094, "learning_rate": 0.0001632, "loss": 1.4315, "step": 121 }, { "epoch": 0.3876092136616362, "grad_norm": 0.46573230624198914, "learning_rate": 0.00016288, "loss": 1.3214, "step": 122 }, { "epoch": 0.3907863383637808, "grad_norm": 0.46668779850006104, "learning_rate": 0.00016256, "loss": 1.3008, "step": 123 }, { "epoch": 0.39396346306592533, "grad_norm": 0.45954373478889465, "learning_rate": 0.00016224000000000002, "loss": 1.4209, "step": 124 }, { "epoch": 0.3971405877680699, "grad_norm": 0.433923602104187, "learning_rate": 0.00016192, "loss": 1.4436, "step": 125 }, { "epoch": 0.40031771247021447, "grad_norm": 0.49414584040641785, "learning_rate": 0.00016160000000000002, "loss": 1.3914, "step": 126 }, { "epoch": 0.403494837172359, "grad_norm": 0.4280381202697754, "learning_rate": 0.00016128, "loss": 1.4302, "step": 127 }, { "epoch": 0.40667196187450355, "grad_norm": 0.5049663782119751, "learning_rate": 0.00016096, "loss": 1.5463, "step": 128 }, { "epoch": 0.40984908657664815, "grad_norm": 0.3671952486038208, "learning_rate": 0.00016064, "loss": 1.3469, "step": 129 }, { "epoch": 0.4130262112787927, "grad_norm": 0.4638643264770508, "learning_rate": 0.00016032, "loss": 1.3772, "step": 130 }, { "epoch": 0.41620333598093723, "grad_norm": 0.4278906583786011, "learning_rate": 0.00016, "loss": 1.4239, "step": 131 }, { "epoch": 0.4193804606830818, "grad_norm": 0.45057350397109985, "learning_rate": 0.00015968000000000002, "loss": 1.3649, "step": 132 }, { "epoch": 0.42255758538522636, "grad_norm": 0.4940052330493927, "learning_rate": 0.00015936, "loss": 1.4542, "step": 133 }, { "epoch": 0.4257347100873709, "grad_norm": 0.48272138833999634, "learning_rate": 0.00015904000000000002, "loss": 1.4525, "step": 134 }, { "epoch": 0.4289118347895155, "grad_norm": 0.4591176211833954, "learning_rate": 0.00015872, "loss": 1.372, "step": 135 }, { "epoch": 0.43208895949166004, "grad_norm": 0.39564651250839233, "learning_rate": 0.00015840000000000003, "loss": 1.2387, "step": 136 }, { "epoch": 0.4352660841938046, "grad_norm": 0.4640074670314789, "learning_rate": 0.00015808, "loss": 1.4381, "step": 137 }, { "epoch": 0.4384432088959492, "grad_norm": 0.4523836374282837, "learning_rate": 0.00015776, "loss": 1.3681, "step": 138 }, { "epoch": 0.4416203335980937, "grad_norm": 0.4463924765586853, "learning_rate": 0.00015744, "loss": 1.4002, "step": 139 }, { "epoch": 0.44479745830023826, "grad_norm": 0.4263816177845001, "learning_rate": 0.00015712000000000001, "loss": 1.3452, "step": 140 }, { "epoch": 0.44797458300238285, "grad_norm": 0.4039861857891083, "learning_rate": 0.00015680000000000002, "loss": 1.3888, "step": 141 }, { "epoch": 0.4511517077045274, "grad_norm": 0.44540414214134216, "learning_rate": 0.00015648, "loss": 1.3827, "step": 142 }, { "epoch": 0.45432883240667193, "grad_norm": 0.4521636664867401, "learning_rate": 0.00015616000000000002, "loss": 1.4448, "step": 143 }, { "epoch": 0.45750595710881653, "grad_norm": 0.46087294816970825, "learning_rate": 0.00015584, "loss": 1.374, "step": 144 }, { "epoch": 0.46068308181096107, "grad_norm": 0.43480321764945984, "learning_rate": 0.00015552, "loss": 1.4021, "step": 145 }, { "epoch": 0.46386020651310567, "grad_norm": 0.48551246523857117, "learning_rate": 0.0001552, "loss": 1.3729, "step": 146 }, { "epoch": 0.4670373312152502, "grad_norm": 0.44551774859428406, "learning_rate": 0.00015488, "loss": 1.3628, "step": 147 }, { "epoch": 0.47021445591739475, "grad_norm": 0.43176624178886414, "learning_rate": 0.00015456, "loss": 1.4927, "step": 148 }, { "epoch": 0.47339158061953934, "grad_norm": 0.47492435574531555, "learning_rate": 0.00015424000000000001, "loss": 1.4361, "step": 149 }, { "epoch": 0.4765687053216839, "grad_norm": 0.46715089678764343, "learning_rate": 0.00015392, "loss": 1.382, "step": 150 }, { "epoch": 0.4797458300238284, "grad_norm": 0.44686493277549744, "learning_rate": 0.00015360000000000002, "loss": 1.4603, "step": 151 }, { "epoch": 0.482922954725973, "grad_norm": 0.429262638092041, "learning_rate": 0.00015328, "loss": 1.3358, "step": 152 }, { "epoch": 0.48610007942811756, "grad_norm": 0.4371509850025177, "learning_rate": 0.00015296000000000003, "loss": 1.4007, "step": 153 }, { "epoch": 0.4892772041302621, "grad_norm": 0.44418609142303467, "learning_rate": 0.00015264, "loss": 1.2738, "step": 154 }, { "epoch": 0.4924543288324067, "grad_norm": 0.44011855125427246, "learning_rate": 0.00015232, "loss": 1.3724, "step": 155 }, { "epoch": 0.49563145353455124, "grad_norm": 0.4057789742946625, "learning_rate": 0.000152, "loss": 1.3708, "step": 156 }, { "epoch": 0.4988085782366958, "grad_norm": 0.462645024061203, "learning_rate": 0.00015168, "loss": 1.3377, "step": 157 }, { "epoch": 0.5019857029388404, "grad_norm": 0.48834553360939026, "learning_rate": 0.00015136000000000001, "loss": 1.393, "step": 158 }, { "epoch": 0.5051628276409849, "grad_norm": 0.4350643754005432, "learning_rate": 0.00015104, "loss": 1.3838, "step": 159 }, { "epoch": 0.5083399523431295, "grad_norm": 0.529528796672821, "learning_rate": 0.00015072000000000002, "loss": 1.4754, "step": 160 }, { "epoch": 0.511517077045274, "grad_norm": 0.43042004108428955, "learning_rate": 0.0001504, "loss": 1.2977, "step": 161 }, { "epoch": 0.5146942017474185, "grad_norm": 0.4385746717453003, "learning_rate": 0.00015008, "loss": 1.3612, "step": 162 }, { "epoch": 0.5178713264495631, "grad_norm": 0.4804169535636902, "learning_rate": 0.00014976, "loss": 1.3587, "step": 163 }, { "epoch": 0.5210484511517077, "grad_norm": 0.43496274948120117, "learning_rate": 0.00014944, "loss": 1.3683, "step": 164 }, { "epoch": 0.5242255758538522, "grad_norm": 0.4485699534416199, "learning_rate": 0.00014912, "loss": 1.3987, "step": 165 }, { "epoch": 0.5274027005559968, "grad_norm": 0.4077622592449188, "learning_rate": 0.0001488, "loss": 1.3115, "step": 166 }, { "epoch": 0.5305798252581414, "grad_norm": 0.480747789144516, "learning_rate": 0.00014848, "loss": 1.4257, "step": 167 }, { "epoch": 0.5337569499602859, "grad_norm": 0.4339999854564667, "learning_rate": 0.00014816000000000002, "loss": 1.3689, "step": 168 }, { "epoch": 0.5369340746624305, "grad_norm": 0.4264179766178131, "learning_rate": 0.00014784, "loss": 1.3877, "step": 169 }, { "epoch": 0.5401111993645751, "grad_norm": 0.4252622127532959, "learning_rate": 0.00014752000000000002, "loss": 1.3887, "step": 170 }, { "epoch": 0.5432883240667196, "grad_norm": 0.42078137397766113, "learning_rate": 0.0001472, "loss": 1.388, "step": 171 }, { "epoch": 0.5464654487688642, "grad_norm": 0.4306480586528778, "learning_rate": 0.00014688000000000003, "loss": 1.3366, "step": 172 }, { "epoch": 0.5496425734710088, "grad_norm": 0.4413485825061798, "learning_rate": 0.00014656, "loss": 1.4593, "step": 173 }, { "epoch": 0.5528196981731534, "grad_norm": 0.5051458477973938, "learning_rate": 0.00014624, "loss": 1.3824, "step": 174 }, { "epoch": 0.5559968228752978, "grad_norm": 0.42062053084373474, "learning_rate": 0.00014592, "loss": 1.2272, "step": 175 }, { "epoch": 0.5591739475774424, "grad_norm": 0.46674421429634094, "learning_rate": 0.00014560000000000002, "loss": 1.4548, "step": 176 }, { "epoch": 0.562351072279587, "grad_norm": 0.44679704308509827, "learning_rate": 0.00014528000000000002, "loss": 1.4101, "step": 177 }, { "epoch": 0.5655281969817315, "grad_norm": 0.4165225327014923, "learning_rate": 0.00014496, "loss": 1.3625, "step": 178 }, { "epoch": 0.5687053216838761, "grad_norm": 0.4735226631164551, "learning_rate": 0.00014464, "loss": 1.4569, "step": 179 }, { "epoch": 0.5718824463860207, "grad_norm": 0.4978485405445099, "learning_rate": 0.00014432, "loss": 1.3913, "step": 180 }, { "epoch": 0.5750595710881652, "grad_norm": 0.5753241181373596, "learning_rate": 0.000144, "loss": 1.3885, "step": 181 }, { "epoch": 0.5782366957903098, "grad_norm": 0.4427070617675781, "learning_rate": 0.00014368, "loss": 1.4068, "step": 182 }, { "epoch": 0.5814138204924544, "grad_norm": 0.4505138099193573, "learning_rate": 0.00014336, "loss": 1.3579, "step": 183 }, { "epoch": 0.5845909451945989, "grad_norm": 0.5041322708129883, "learning_rate": 0.00014303999999999999, "loss": 1.4652, "step": 184 }, { "epoch": 0.5877680698967435, "grad_norm": 0.4617048501968384, "learning_rate": 0.00014272000000000002, "loss": 1.3565, "step": 185 }, { "epoch": 0.5909451945988881, "grad_norm": 0.4603610038757324, "learning_rate": 0.0001424, "loss": 1.3436, "step": 186 }, { "epoch": 0.5941223193010325, "grad_norm": 0.5165044069290161, "learning_rate": 0.00014208000000000002, "loss": 1.4443, "step": 187 }, { "epoch": 0.5972994440031771, "grad_norm": 0.3984765410423279, "learning_rate": 0.00014176, "loss": 1.2166, "step": 188 }, { "epoch": 0.6004765687053217, "grad_norm": 0.5299299955368042, "learning_rate": 0.00014144000000000003, "loss": 1.446, "step": 189 }, { "epoch": 0.6036536934074662, "grad_norm": 0.49046239256858826, "learning_rate": 0.00014112, "loss": 1.3844, "step": 190 }, { "epoch": 0.6068308181096108, "grad_norm": 0.3969656825065613, "learning_rate": 0.0001408, "loss": 1.2135, "step": 191 }, { "epoch": 0.6100079428117554, "grad_norm": 0.4312625527381897, "learning_rate": 0.00014048, "loss": 1.3431, "step": 192 }, { "epoch": 0.6131850675138999, "grad_norm": 0.4357926547527313, "learning_rate": 0.00014016, "loss": 1.31, "step": 193 }, { "epoch": 0.6163621922160445, "grad_norm": 0.4309421181678772, "learning_rate": 0.00013984000000000002, "loss": 1.3617, "step": 194 }, { "epoch": 0.6195393169181891, "grad_norm": 0.4217104911804199, "learning_rate": 0.00013952000000000002, "loss": 1.3756, "step": 195 }, { "epoch": 0.6227164416203336, "grad_norm": 0.5252110958099365, "learning_rate": 0.0001392, "loss": 1.3824, "step": 196 }, { "epoch": 0.6258935663224782, "grad_norm": 0.4495287537574768, "learning_rate": 0.00013888, "loss": 1.3279, "step": 197 }, { "epoch": 0.6290706910246228, "grad_norm": 0.4457398056983948, "learning_rate": 0.00013856, "loss": 1.385, "step": 198 }, { "epoch": 0.6322478157267672, "grad_norm": 0.4607657790184021, "learning_rate": 0.00013824, "loss": 1.458, "step": 199 }, { "epoch": 0.6354249404289118, "grad_norm": 0.43265438079833984, "learning_rate": 0.00013792, "loss": 1.3233, "step": 200 }, { "epoch": 0.6386020651310564, "grad_norm": 0.4238455295562744, "learning_rate": 0.00013759999999999998, "loss": 1.3658, "step": 201 }, { "epoch": 0.6417791898332009, "grad_norm": 0.4150598645210266, "learning_rate": 0.00013728000000000001, "loss": 1.406, "step": 202 }, { "epoch": 0.6449563145353455, "grad_norm": 0.44659295678138733, "learning_rate": 0.00013696, "loss": 1.4752, "step": 203 }, { "epoch": 0.6481334392374901, "grad_norm": 0.4836420714855194, "learning_rate": 0.00013664000000000002, "loss": 1.3478, "step": 204 }, { "epoch": 0.6513105639396346, "grad_norm": 0.40945902466773987, "learning_rate": 0.00013632, "loss": 1.2042, "step": 205 }, { "epoch": 0.6544876886417792, "grad_norm": 0.4980110228061676, "learning_rate": 0.00013600000000000003, "loss": 1.309, "step": 206 }, { "epoch": 0.6576648133439238, "grad_norm": 0.4770593047142029, "learning_rate": 0.00013568, "loss": 1.4164, "step": 207 }, { "epoch": 0.6608419380460683, "grad_norm": 0.4662317633628845, "learning_rate": 0.00013536, "loss": 1.4044, "step": 208 }, { "epoch": 0.6640190627482129, "grad_norm": 0.4472275674343109, "learning_rate": 0.00013504, "loss": 1.3541, "step": 209 }, { "epoch": 0.6671961874503575, "grad_norm": 0.45574310421943665, "learning_rate": 0.00013472, "loss": 1.406, "step": 210 }, { "epoch": 0.670373312152502, "grad_norm": 0.4748678207397461, "learning_rate": 0.00013440000000000001, "loss": 1.2955, "step": 211 }, { "epoch": 0.6735504368546466, "grad_norm": 0.4513389766216278, "learning_rate": 0.00013408000000000002, "loss": 1.3326, "step": 212 }, { "epoch": 0.6767275615567911, "grad_norm": 0.4360558092594147, "learning_rate": 0.00013376, "loss": 1.2359, "step": 213 }, { "epoch": 0.6799046862589356, "grad_norm": 0.41032615303993225, "learning_rate": 0.00013344, "loss": 1.3916, "step": 214 }, { "epoch": 0.6830818109610802, "grad_norm": 0.46569857001304626, "learning_rate": 0.00013312, "loss": 1.3152, "step": 215 }, { "epoch": 0.6862589356632248, "grad_norm": 0.4858649969100952, "learning_rate": 0.0001328, "loss": 1.3445, "step": 216 }, { "epoch": 0.6894360603653693, "grad_norm": 0.4437476098537445, "learning_rate": 0.00013248, "loss": 1.3565, "step": 217 }, { "epoch": 0.6926131850675139, "grad_norm": 0.47393283247947693, "learning_rate": 0.00013216, "loss": 1.3308, "step": 218 }, { "epoch": 0.6957903097696585, "grad_norm": 0.446773499250412, "learning_rate": 0.00013184, "loss": 1.3424, "step": 219 }, { "epoch": 0.698967434471803, "grad_norm": 0.4282335042953491, "learning_rate": 0.00013152, "loss": 1.3799, "step": 220 }, { "epoch": 0.7021445591739476, "grad_norm": 0.36902791261672974, "learning_rate": 0.00013120000000000002, "loss": 1.2638, "step": 221 }, { "epoch": 0.7053216838760922, "grad_norm": 0.4036352336406708, "learning_rate": 0.00013088, "loss": 1.2865, "step": 222 }, { "epoch": 0.7084988085782367, "grad_norm": 0.4829830825328827, "learning_rate": 0.00013056000000000002, "loss": 1.3685, "step": 223 }, { "epoch": 0.7116759332803813, "grad_norm": 0.425111323595047, "learning_rate": 0.00013024, "loss": 1.3151, "step": 224 }, { "epoch": 0.7148530579825259, "grad_norm": 0.4299517869949341, "learning_rate": 0.00012992, "loss": 1.3412, "step": 225 }, { "epoch": 0.7180301826846703, "grad_norm": 0.4297490417957306, "learning_rate": 0.0001296, "loss": 1.3244, "step": 226 }, { "epoch": 0.7212073073868149, "grad_norm": 0.48203548789024353, "learning_rate": 0.00012928, "loss": 1.3428, "step": 227 }, { "epoch": 0.7243844320889595, "grad_norm": 0.43935510516166687, "learning_rate": 0.00012896, "loss": 1.3323, "step": 228 }, { "epoch": 0.727561556791104, "grad_norm": 0.4296364188194275, "learning_rate": 0.00012864000000000002, "loss": 1.3068, "step": 229 }, { "epoch": 0.7307386814932486, "grad_norm": 0.44215404987335205, "learning_rate": 0.00012832, "loss": 1.3646, "step": 230 }, { "epoch": 0.7339158061953932, "grad_norm": 0.4621836245059967, "learning_rate": 0.00012800000000000002, "loss": 1.3799, "step": 231 }, { "epoch": 0.7370929308975377, "grad_norm": 0.4484768211841583, "learning_rate": 0.00012768, "loss": 1.4356, "step": 232 }, { "epoch": 0.7402700555996823, "grad_norm": 0.4553694427013397, "learning_rate": 0.00012736, "loss": 1.2484, "step": 233 }, { "epoch": 0.7434471803018269, "grad_norm": 0.40847110748291016, "learning_rate": 0.00012704, "loss": 1.2732, "step": 234 }, { "epoch": 0.7466243050039714, "grad_norm": 0.4255897104740143, "learning_rate": 0.00012672, "loss": 1.2625, "step": 235 }, { "epoch": 0.749801429706116, "grad_norm": 0.468524307012558, "learning_rate": 0.0001264, "loss": 1.4405, "step": 236 }, { "epoch": 0.7529785544082606, "grad_norm": 0.48269885778427124, "learning_rate": 0.00012607999999999999, "loss": 1.3345, "step": 237 }, { "epoch": 0.756155679110405, "grad_norm": 0.4441956877708435, "learning_rate": 0.00012576000000000002, "loss": 1.2967, "step": 238 }, { "epoch": 0.7593328038125496, "grad_norm": 0.44516798853874207, "learning_rate": 0.00012544, "loss": 1.2502, "step": 239 }, { "epoch": 0.7625099285146942, "grad_norm": 0.47978633642196655, "learning_rate": 0.00012512000000000002, "loss": 1.3275, "step": 240 }, { "epoch": 0.7656870532168387, "grad_norm": 0.4489109218120575, "learning_rate": 0.0001248, "loss": 1.3594, "step": 241 }, { "epoch": 0.7688641779189833, "grad_norm": 0.46123188734054565, "learning_rate": 0.00012448, "loss": 1.3014, "step": 242 }, { "epoch": 0.7720413026211279, "grad_norm": 0.4514189064502716, "learning_rate": 0.00012416, "loss": 1.2148, "step": 243 }, { "epoch": 0.7752184273232724, "grad_norm": 0.47698548436164856, "learning_rate": 0.00012384, "loss": 1.3562, "step": 244 }, { "epoch": 0.778395552025417, "grad_norm": 0.4442936182022095, "learning_rate": 0.00012352, "loss": 1.3092, "step": 245 }, { "epoch": 0.7815726767275616, "grad_norm": 0.48598411679267883, "learning_rate": 0.0001232, "loss": 1.418, "step": 246 }, { "epoch": 0.7847498014297061, "grad_norm": 0.46551617980003357, "learning_rate": 0.00012288, "loss": 1.261, "step": 247 }, { "epoch": 0.7879269261318507, "grad_norm": 0.4166944921016693, "learning_rate": 0.00012256000000000002, "loss": 1.213, "step": 248 }, { "epoch": 0.7911040508339953, "grad_norm": 0.48919105529785156, "learning_rate": 0.00012224, "loss": 1.3328, "step": 249 }, { "epoch": 0.7942811755361397, "grad_norm": 0.44160059094429016, "learning_rate": 0.00012192000000000001, "loss": 1.2758, "step": 250 }, { "epoch": 0.7974583002382843, "grad_norm": 0.45591601729393005, "learning_rate": 0.0001216, "loss": 1.3771, "step": 251 }, { "epoch": 0.8006354249404289, "grad_norm": 0.4620215892791748, "learning_rate": 0.00012128000000000002, "loss": 1.3787, "step": 252 }, { "epoch": 0.8038125496425734, "grad_norm": 0.4968376159667969, "learning_rate": 0.00012096000000000001, "loss": 1.3798, "step": 253 }, { "epoch": 0.806989674344718, "grad_norm": 0.4507087767124176, "learning_rate": 0.00012064, "loss": 1.3349, "step": 254 }, { "epoch": 0.8101667990468626, "grad_norm": 0.46431511640548706, "learning_rate": 0.00012032000000000001, "loss": 1.359, "step": 255 }, { "epoch": 0.8133439237490071, "grad_norm": 0.46496230363845825, "learning_rate": 0.00012, "loss": 1.2429, "step": 256 }, { "epoch": 0.8165210484511517, "grad_norm": 0.46748438477516174, "learning_rate": 0.00011968000000000002, "loss": 1.3088, "step": 257 }, { "epoch": 0.8196981731532963, "grad_norm": 0.45148542523384094, "learning_rate": 0.00011936000000000001, "loss": 1.2191, "step": 258 }, { "epoch": 0.8228752978554408, "grad_norm": 0.4253683388233185, "learning_rate": 0.00011904, "loss": 1.2915, "step": 259 }, { "epoch": 0.8260524225575854, "grad_norm": 0.506744384765625, "learning_rate": 0.00011872000000000002, "loss": 1.3271, "step": 260 }, { "epoch": 0.82922954725973, "grad_norm": 0.4920015335083008, "learning_rate": 0.0001184, "loss": 1.3933, "step": 261 }, { "epoch": 0.8324066719618745, "grad_norm": 0.4514538645744324, "learning_rate": 0.00011808000000000001, "loss": 1.356, "step": 262 }, { "epoch": 0.835583796664019, "grad_norm": 0.5036830306053162, "learning_rate": 0.00011776, "loss": 1.2622, "step": 263 }, { "epoch": 0.8387609213661636, "grad_norm": 0.5152455568313599, "learning_rate": 0.00011744000000000001, "loss": 1.3674, "step": 264 }, { "epoch": 0.8419380460683081, "grad_norm": 0.4376108944416046, "learning_rate": 0.00011712, "loss": 1.3758, "step": 265 }, { "epoch": 0.8451151707704527, "grad_norm": 0.4190007746219635, "learning_rate": 0.00011679999999999999, "loss": 1.2389, "step": 266 }, { "epoch": 0.8482922954725973, "grad_norm": 0.5019193291664124, "learning_rate": 0.00011648000000000001, "loss": 1.3014, "step": 267 }, { "epoch": 0.8514694201747418, "grad_norm": 0.47944578528404236, "learning_rate": 0.00011616, "loss": 1.3507, "step": 268 }, { "epoch": 0.8546465448768864, "grad_norm": 0.4307346045970917, "learning_rate": 0.00011584000000000002, "loss": 1.2211, "step": 269 }, { "epoch": 0.857823669579031, "grad_norm": 0.5099300742149353, "learning_rate": 0.00011552, "loss": 1.3172, "step": 270 }, { "epoch": 0.8610007942811755, "grad_norm": 0.41971608996391296, "learning_rate": 0.0001152, "loss": 1.2691, "step": 271 }, { "epoch": 0.8641779189833201, "grad_norm": 0.4612553119659424, "learning_rate": 0.00011488000000000001, "loss": 1.3128, "step": 272 }, { "epoch": 0.8673550436854647, "grad_norm": 0.4589272141456604, "learning_rate": 0.00011456, "loss": 1.3275, "step": 273 }, { "epoch": 0.8705321683876092, "grad_norm": 0.47001925110816956, "learning_rate": 0.00011424000000000002, "loss": 1.2607, "step": 274 }, { "epoch": 0.8737092930897538, "grad_norm": 0.4315769672393799, "learning_rate": 0.00011392000000000001, "loss": 1.3215, "step": 275 }, { "epoch": 0.8768864177918984, "grad_norm": 0.45138058066368103, "learning_rate": 0.0001136, "loss": 1.332, "step": 276 }, { "epoch": 0.8800635424940428, "grad_norm": 0.4450497329235077, "learning_rate": 0.00011328000000000001, "loss": 1.3474, "step": 277 }, { "epoch": 0.8832406671961874, "grad_norm": 0.4595153033733368, "learning_rate": 0.00011296, "loss": 1.27, "step": 278 }, { "epoch": 0.886417791898332, "grad_norm": 0.42433419823646545, "learning_rate": 0.00011264, "loss": 1.3352, "step": 279 }, { "epoch": 0.8895949166004765, "grad_norm": 0.44947418570518494, "learning_rate": 0.00011232000000000001, "loss": 1.2532, "step": 280 }, { "epoch": 0.8927720413026211, "grad_norm": 0.4503403604030609, "learning_rate": 0.00011200000000000001, "loss": 1.3267, "step": 281 }, { "epoch": 0.8959491660047657, "grad_norm": 0.418992280960083, "learning_rate": 0.00011168, "loss": 1.2699, "step": 282 }, { "epoch": 0.8991262907069102, "grad_norm": 0.4266560971736908, "learning_rate": 0.00011135999999999999, "loss": 1.169, "step": 283 }, { "epoch": 0.9023034154090548, "grad_norm": 0.5053189396858215, "learning_rate": 0.00011104000000000001, "loss": 1.4047, "step": 284 }, { "epoch": 0.9054805401111994, "grad_norm": 0.5122870206832886, "learning_rate": 0.00011072, "loss": 1.3834, "step": 285 }, { "epoch": 0.9086576648133439, "grad_norm": 0.43556493520736694, "learning_rate": 0.00011040000000000001, "loss": 1.2163, "step": 286 }, { "epoch": 0.9118347895154885, "grad_norm": 0.4655609130859375, "learning_rate": 0.00011008, "loss": 1.2967, "step": 287 }, { "epoch": 0.9150119142176331, "grad_norm": 0.4987747371196747, "learning_rate": 0.00010975999999999999, "loss": 1.2905, "step": 288 }, { "epoch": 0.9181890389197777, "grad_norm": 0.4585645794868469, "learning_rate": 0.00010944000000000001, "loss": 1.2875, "step": 289 }, { "epoch": 0.9213661636219221, "grad_norm": 0.5033825039863586, "learning_rate": 0.00010912, "loss": 1.2772, "step": 290 }, { "epoch": 0.9245432883240667, "grad_norm": 0.4755001962184906, "learning_rate": 0.00010880000000000002, "loss": 1.4525, "step": 291 }, { "epoch": 0.9277204130262113, "grad_norm": 0.43799713253974915, "learning_rate": 0.00010848, "loss": 1.3104, "step": 292 }, { "epoch": 0.9308975377283558, "grad_norm": 0.43732205033302307, "learning_rate": 0.00010816, "loss": 1.2373, "step": 293 }, { "epoch": 0.9340746624305004, "grad_norm": 0.45804721117019653, "learning_rate": 0.00010784000000000001, "loss": 1.3313, "step": 294 }, { "epoch": 0.937251787132645, "grad_norm": 0.49885255098342896, "learning_rate": 0.00010752, "loss": 1.3134, "step": 295 }, { "epoch": 0.9404289118347895, "grad_norm": 0.4742017090320587, "learning_rate": 0.00010720000000000002, "loss": 1.323, "step": 296 }, { "epoch": 0.9436060365369341, "grad_norm": 0.4221518039703369, "learning_rate": 0.00010688, "loss": 1.3479, "step": 297 }, { "epoch": 0.9467831612390787, "grad_norm": 0.4776606261730194, "learning_rate": 0.00010656000000000001, "loss": 1.265, "step": 298 }, { "epoch": 0.9499602859412232, "grad_norm": 0.49409452080726624, "learning_rate": 0.00010624000000000001, "loss": 1.2697, "step": 299 }, { "epoch": 0.9531374106433678, "grad_norm": 0.4598381817340851, "learning_rate": 0.00010592, "loss": 1.2518, "step": 300 }, { "epoch": 0.9563145353455124, "grad_norm": 0.43075883388519287, "learning_rate": 0.0001056, "loss": 1.2483, "step": 301 }, { "epoch": 0.9594916600476568, "grad_norm": 0.5096505880355835, "learning_rate": 0.00010528, "loss": 1.4291, "step": 302 }, { "epoch": 0.9626687847498014, "grad_norm": 0.4315980076789856, "learning_rate": 0.00010496000000000001, "loss": 1.2332, "step": 303 }, { "epoch": 0.965845909451946, "grad_norm": 0.47984281182289124, "learning_rate": 0.00010464, "loss": 1.3108, "step": 304 }, { "epoch": 0.9690230341540905, "grad_norm": 0.4698749780654907, "learning_rate": 0.00010431999999999999, "loss": 1.3103, "step": 305 }, { "epoch": 0.9722001588562351, "grad_norm": 0.465999960899353, "learning_rate": 0.00010400000000000001, "loss": 1.3343, "step": 306 }, { "epoch": 0.9753772835583797, "grad_norm": 0.43465176224708557, "learning_rate": 0.00010368, "loss": 1.1649, "step": 307 }, { "epoch": 0.9785544082605242, "grad_norm": 0.4245821237564087, "learning_rate": 0.00010336000000000001, "loss": 1.273, "step": 308 }, { "epoch": 0.9817315329626688, "grad_norm": 0.43245622515678406, "learning_rate": 0.00010304, "loss": 1.2443, "step": 309 }, { "epoch": 0.9849086576648134, "grad_norm": 0.4845837950706482, "learning_rate": 0.00010271999999999999, "loss": 1.2649, "step": 310 }, { "epoch": 0.9880857823669579, "grad_norm": 0.424667090177536, "learning_rate": 0.00010240000000000001, "loss": 1.2708, "step": 311 }, { "epoch": 0.9912629070691025, "grad_norm": 0.43120723962783813, "learning_rate": 0.00010208, "loss": 1.2844, "step": 312 }, { "epoch": 0.9944400317712471, "grad_norm": 0.4800574481487274, "learning_rate": 0.00010176000000000002, "loss": 1.379, "step": 313 }, { "epoch": 0.9976171564733916, "grad_norm": 0.5008915066719055, "learning_rate": 0.00010144, "loss": 1.3679, "step": 314 }, { "epoch": 1.0, "grad_norm": 0.6201555132865906, "learning_rate": 0.00010112000000000002, "loss": 1.2757, "step": 315 } ], "logging_steps": 1, "max_steps": 630, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.203279169783808e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }