| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 315, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.003177124702144559, | |
| "grad_norm": 0.9960207343101501, | |
| "learning_rate": 0.0, | |
| "loss": 2.9881, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.006354249404289118, | |
| "grad_norm": 0.9263616800308228, | |
| "learning_rate": 4e-05, | |
| "loss": 2.9644, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.009531374106433678, | |
| "grad_norm": 0.9059100151062012, | |
| "learning_rate": 8e-05, | |
| "loss": 2.7584, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.012708498808578236, | |
| "grad_norm": 0.8163802027702332, | |
| "learning_rate": 0.00012, | |
| "loss": 2.7029, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.015885623510722795, | |
| "grad_norm": 0.7039979696273804, | |
| "learning_rate": 0.00016, | |
| "loss": 2.5991, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.019062748212867357, | |
| "grad_norm": 0.5652405023574829, | |
| "learning_rate": 0.0002, | |
| "loss": 2.4187, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.022239872915011914, | |
| "grad_norm": 0.7419441342353821, | |
| "learning_rate": 0.00019968, | |
| "loss": 2.2885, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.025416997617156472, | |
| "grad_norm": 0.6665301322937012, | |
| "learning_rate": 0.00019936000000000002, | |
| "loss": 2.3176, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.028594122319301033, | |
| "grad_norm": 0.5366690158843994, | |
| "learning_rate": 0.00019904, | |
| "loss": 2.2728, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.03177124702144559, | |
| "grad_norm": 0.5066737532615662, | |
| "learning_rate": 0.00019872000000000002, | |
| "loss": 2.068, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.03494837172359015, | |
| "grad_norm": 0.5810503959655762, | |
| "learning_rate": 0.0001984, | |
| "loss": 2.2197, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.03812549642573471, | |
| "grad_norm": 0.47433704137802124, | |
| "learning_rate": 0.00019808, | |
| "loss": 1.9871, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.04130262112787927, | |
| "grad_norm": 0.49652695655822754, | |
| "learning_rate": 0.00019776, | |
| "loss": 1.9139, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.04447974583002383, | |
| "grad_norm": 0.4280414879322052, | |
| "learning_rate": 0.00019744, | |
| "loss": 1.8852, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.04765687053216839, | |
| "grad_norm": 0.596341609954834, | |
| "learning_rate": 0.00019712, | |
| "loss": 1.9809, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.050833995234312944, | |
| "grad_norm": 0.5067018866539001, | |
| "learning_rate": 0.0001968, | |
| "loss": 1.9549, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.054011119936457505, | |
| "grad_norm": 0.4348883032798767, | |
| "learning_rate": 0.00019648000000000002, | |
| "loss": 1.9078, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.057188244638602066, | |
| "grad_norm": 0.4295920133590698, | |
| "learning_rate": 0.00019616000000000002, | |
| "loss": 1.8234, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.06036536934074663, | |
| "grad_norm": 0.43549808859825134, | |
| "learning_rate": 0.00019584, | |
| "loss": 1.9143, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.06354249404289118, | |
| "grad_norm": 0.4168950319290161, | |
| "learning_rate": 0.00019552000000000003, | |
| "loss": 1.7874, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.06671961874503574, | |
| "grad_norm": 0.4715218245983124, | |
| "learning_rate": 0.0001952, | |
| "loss": 1.8926, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.0698967434471803, | |
| "grad_norm": 0.38382261991500854, | |
| "learning_rate": 0.00019488000000000003, | |
| "loss": 1.7832, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.07307386814932486, | |
| "grad_norm": 0.43702301383018494, | |
| "learning_rate": 0.00019456, | |
| "loss": 1.7447, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.07625099285146943, | |
| "grad_norm": 0.4813467562198639, | |
| "learning_rate": 0.00019424, | |
| "loss": 1.8851, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.07942811755361398, | |
| "grad_norm": 0.4026224911212921, | |
| "learning_rate": 0.00019392000000000001, | |
| "loss": 1.8036, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.08260524225575853, | |
| "grad_norm": 0.39529579877853394, | |
| "learning_rate": 0.00019360000000000002, | |
| "loss": 1.7207, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.0857823669579031, | |
| "grad_norm": 0.4045431911945343, | |
| "learning_rate": 0.00019328000000000002, | |
| "loss": 1.7962, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.08895949166004766, | |
| "grad_norm": 0.3818039894104004, | |
| "learning_rate": 0.00019296, | |
| "loss": 1.68, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.09213661636219221, | |
| "grad_norm": 0.3767971098423004, | |
| "learning_rate": 0.00019264, | |
| "loss": 1.7949, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.09531374106433678, | |
| "grad_norm": 0.38719820976257324, | |
| "learning_rate": 0.00019232, | |
| "loss": 1.7069, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.09849086576648133, | |
| "grad_norm": 0.416172057390213, | |
| "learning_rate": 0.000192, | |
| "loss": 1.6897, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.10166799046862589, | |
| "grad_norm": 0.3721797466278076, | |
| "learning_rate": 0.00019168, | |
| "loss": 1.6996, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.10484511517077046, | |
| "grad_norm": 0.4110720753669739, | |
| "learning_rate": 0.00019136, | |
| "loss": 1.5606, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.10802223987291501, | |
| "grad_norm": 0.3850787580013275, | |
| "learning_rate": 0.00019104000000000001, | |
| "loss": 1.7615, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.11119936457505956, | |
| "grad_norm": 0.33883488178253174, | |
| "learning_rate": 0.00019072000000000002, | |
| "loss": 1.5231, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.11437648927720413, | |
| "grad_norm": 0.37157201766967773, | |
| "learning_rate": 0.0001904, | |
| "loss": 1.6246, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.11755361397934869, | |
| "grad_norm": 0.3907526433467865, | |
| "learning_rate": 0.00019008000000000002, | |
| "loss": 1.6537, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.12073073868149325, | |
| "grad_norm": 0.39647847414016724, | |
| "learning_rate": 0.00018976, | |
| "loss": 1.6193, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.12390786338363781, | |
| "grad_norm": 0.34513983130455017, | |
| "learning_rate": 0.00018944000000000003, | |
| "loss": 1.4992, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.12708498808578236, | |
| "grad_norm": 0.4174259305000305, | |
| "learning_rate": 0.00018912, | |
| "loss": 1.6547, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.13026211278792693, | |
| "grad_norm": 0.42140164971351624, | |
| "learning_rate": 0.0001888, | |
| "loss": 1.6165, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.13343923749007147, | |
| "grad_norm": 0.40260136127471924, | |
| "learning_rate": 0.00018848, | |
| "loss": 1.5583, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.13661636219221604, | |
| "grad_norm": 0.42584484815597534, | |
| "learning_rate": 0.00018816000000000001, | |
| "loss": 1.5742, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.1397934868943606, | |
| "grad_norm": 0.3613159954547882, | |
| "learning_rate": 0.00018784000000000002, | |
| "loss": 1.6282, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.14297061159650518, | |
| "grad_norm": 0.45315852761268616, | |
| "learning_rate": 0.00018752, | |
| "loss": 1.6274, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.14614773629864972, | |
| "grad_norm": 0.3528841733932495, | |
| "learning_rate": 0.00018720000000000002, | |
| "loss": 1.5076, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.14932486100079428, | |
| "grad_norm": 0.4335366487503052, | |
| "learning_rate": 0.00018688, | |
| "loss": 1.5561, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.15250198570293885, | |
| "grad_norm": 0.4157211184501648, | |
| "learning_rate": 0.00018656, | |
| "loss": 1.4963, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.1556791104050834, | |
| "grad_norm": 0.3932294249534607, | |
| "learning_rate": 0.00018624, | |
| "loss": 1.5381, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.15885623510722796, | |
| "grad_norm": 0.47142326831817627, | |
| "learning_rate": 0.00018592, | |
| "loss": 1.7029, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.16203335980937253, | |
| "grad_norm": 0.46922552585601807, | |
| "learning_rate": 0.0001856, | |
| "loss": 1.5722, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.16521048451151707, | |
| "grad_norm": 0.40435677766799927, | |
| "learning_rate": 0.00018528000000000001, | |
| "loss": 1.5885, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.16838760921366164, | |
| "grad_norm": 0.4449491500854492, | |
| "learning_rate": 0.00018496, | |
| "loss": 1.5691, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.1715647339158062, | |
| "grad_norm": 0.46489715576171875, | |
| "learning_rate": 0.00018464000000000002, | |
| "loss": 1.5736, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.17474185861795075, | |
| "grad_norm": 0.4461865723133087, | |
| "learning_rate": 0.00018432, | |
| "loss": 1.5359, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.17791898332009531, | |
| "grad_norm": 0.4674195349216461, | |
| "learning_rate": 0.00018400000000000003, | |
| "loss": 1.4617, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.18109610802223988, | |
| "grad_norm": 0.3901899755001068, | |
| "learning_rate": 0.00018368, | |
| "loss": 1.5141, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.18427323272438442, | |
| "grad_norm": 0.46142131090164185, | |
| "learning_rate": 0.00018336, | |
| "loss": 1.5436, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.187450357426529, | |
| "grad_norm": 0.4317268431186676, | |
| "learning_rate": 0.00018304, | |
| "loss": 1.56, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.19062748212867356, | |
| "grad_norm": 0.44914504885673523, | |
| "learning_rate": 0.00018272, | |
| "loss": 1.5477, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.1938046068308181, | |
| "grad_norm": 0.43380966782569885, | |
| "learning_rate": 0.00018240000000000002, | |
| "loss": 1.4989, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.19698173153296267, | |
| "grad_norm": 0.41798654198646545, | |
| "learning_rate": 0.00018208000000000002, | |
| "loss": 1.3978, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.20015885623510724, | |
| "grad_norm": 0.4322330355644226, | |
| "learning_rate": 0.00018176000000000002, | |
| "loss": 1.5094, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.20333598093725178, | |
| "grad_norm": 0.4732660949230194, | |
| "learning_rate": 0.00018144, | |
| "loss": 1.4756, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.20651310563939634, | |
| "grad_norm": 0.41877272725105286, | |
| "learning_rate": 0.00018112, | |
| "loss": 1.4598, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.2096902303415409, | |
| "grad_norm": 0.46112221479415894, | |
| "learning_rate": 0.0001808, | |
| "loss": 1.5562, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.21286735504368545, | |
| "grad_norm": 0.3946124017238617, | |
| "learning_rate": 0.00018048, | |
| "loss": 1.4517, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.21604447974583002, | |
| "grad_norm": 0.452828586101532, | |
| "learning_rate": 0.00018016, | |
| "loss": 1.5298, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.2192216044479746, | |
| "grad_norm": 0.45543792843818665, | |
| "learning_rate": 0.00017984, | |
| "loss": 1.5945, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.22239872915011913, | |
| "grad_norm": 0.4937468469142914, | |
| "learning_rate": 0.00017952, | |
| "loss": 1.5111, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.2255758538522637, | |
| "grad_norm": 0.43769434094429016, | |
| "learning_rate": 0.00017920000000000002, | |
| "loss": 1.5139, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.22875297855440826, | |
| "grad_norm": 0.42178353667259216, | |
| "learning_rate": 0.00017888, | |
| "loss": 1.3964, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.23193010325655283, | |
| "grad_norm": 0.4274325668811798, | |
| "learning_rate": 0.00017856000000000003, | |
| "loss": 1.4176, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.23510722795869737, | |
| "grad_norm": 0.4603947401046753, | |
| "learning_rate": 0.00017824, | |
| "loss": 1.549, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.23828435266084194, | |
| "grad_norm": 0.4948660731315613, | |
| "learning_rate": 0.00017792, | |
| "loss": 1.4564, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.2414614773629865, | |
| "grad_norm": 0.4219314455986023, | |
| "learning_rate": 0.0001776, | |
| "loss": 1.4946, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.24463860206513105, | |
| "grad_norm": 0.49445462226867676, | |
| "learning_rate": 0.00017728, | |
| "loss": 1.5655, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.24781572676727562, | |
| "grad_norm": 0.4661003053188324, | |
| "learning_rate": 0.00017696, | |
| "loss": 1.5347, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.2509928514694202, | |
| "grad_norm": 0.49738094210624695, | |
| "learning_rate": 0.00017664000000000002, | |
| "loss": 1.5218, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.2541699761715647, | |
| "grad_norm": 0.44844523072242737, | |
| "learning_rate": 0.00017632000000000002, | |
| "loss": 1.4657, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.25734710087370927, | |
| "grad_norm": 0.5222679972648621, | |
| "learning_rate": 0.00017600000000000002, | |
| "loss": 1.4799, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.26052422557585386, | |
| "grad_norm": 0.5003090500831604, | |
| "learning_rate": 0.00017568, | |
| "loss": 1.4998, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.2637013502779984, | |
| "grad_norm": 0.4072366952896118, | |
| "learning_rate": 0.00017536, | |
| "loss": 1.5213, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.26687847498014294, | |
| "grad_norm": 0.42663538455963135, | |
| "learning_rate": 0.00017504, | |
| "loss": 1.5446, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.27005559968228754, | |
| "grad_norm": 0.45552435517311096, | |
| "learning_rate": 0.00017472, | |
| "loss": 1.5624, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.2732327243844321, | |
| "grad_norm": 0.463173508644104, | |
| "learning_rate": 0.0001744, | |
| "loss": 1.4161, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.2764098490865767, | |
| "grad_norm": 0.4052661955356598, | |
| "learning_rate": 0.00017408, | |
| "loss": 1.5228, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.2795869737887212, | |
| "grad_norm": 0.3988233506679535, | |
| "learning_rate": 0.00017376000000000002, | |
| "loss": 1.3896, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.28276409849086576, | |
| "grad_norm": 0.3923889398574829, | |
| "learning_rate": 0.00017344, | |
| "loss": 1.4572, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.28594122319301035, | |
| "grad_norm": 0.46868669986724854, | |
| "learning_rate": 0.00017312000000000002, | |
| "loss": 1.5586, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.2891183478951549, | |
| "grad_norm": 0.43891963362693787, | |
| "learning_rate": 0.0001728, | |
| "loss": 1.5185, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.29229547259729943, | |
| "grad_norm": 0.4684846103191376, | |
| "learning_rate": 0.00017248000000000003, | |
| "loss": 1.4992, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.29547259729944403, | |
| "grad_norm": 0.4795592725276947, | |
| "learning_rate": 0.00017216, | |
| "loss": 1.4987, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.29864972200158857, | |
| "grad_norm": 0.4201822578907013, | |
| "learning_rate": 0.00017184, | |
| "loss": 1.4367, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.3018268467037331, | |
| "grad_norm": 0.443697065114975, | |
| "learning_rate": 0.00017152, | |
| "loss": 1.4436, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.3050039714058777, | |
| "grad_norm": 0.4432813823223114, | |
| "learning_rate": 0.00017120000000000001, | |
| "loss": 1.523, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.30818109610802225, | |
| "grad_norm": 0.43522974848747253, | |
| "learning_rate": 0.00017088000000000002, | |
| "loss": 1.3767, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.3113582208101668, | |
| "grad_norm": 0.396990031003952, | |
| "learning_rate": 0.00017056000000000002, | |
| "loss": 1.3021, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.3145353455123114, | |
| "grad_norm": 0.462819904088974, | |
| "learning_rate": 0.00017024, | |
| "loss": 1.455, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.3177124702144559, | |
| "grad_norm": 0.41220882534980774, | |
| "learning_rate": 0.00016992, | |
| "loss": 1.4471, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.32088959491660046, | |
| "grad_norm": 0.47001487016677856, | |
| "learning_rate": 0.0001696, | |
| "loss": 1.4668, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.32406671961874506, | |
| "grad_norm": 0.4349619150161743, | |
| "learning_rate": 0.00016928, | |
| "loss": 1.4413, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.3272438443208896, | |
| "grad_norm": 0.47175517678260803, | |
| "learning_rate": 0.00016896, | |
| "loss": 1.4546, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.33042096902303414, | |
| "grad_norm": 0.4192788600921631, | |
| "learning_rate": 0.00016863999999999998, | |
| "loss": 1.4569, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.33359809372517873, | |
| "grad_norm": 0.4177974462509155, | |
| "learning_rate": 0.00016832000000000001, | |
| "loss": 1.3423, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.3367752184273233, | |
| "grad_norm": 0.4190915524959564, | |
| "learning_rate": 0.000168, | |
| "loss": 1.3857, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.3399523431294678, | |
| "grad_norm": 0.42924079298973083, | |
| "learning_rate": 0.00016768000000000002, | |
| "loss": 1.4038, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.3431294678316124, | |
| "grad_norm": 0.425611287355423, | |
| "learning_rate": 0.00016736, | |
| "loss": 1.395, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.34630659253375695, | |
| "grad_norm": 0.4815029799938202, | |
| "learning_rate": 0.00016704000000000003, | |
| "loss": 1.4251, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.3494837172359015, | |
| "grad_norm": 0.45862439274787903, | |
| "learning_rate": 0.00016672, | |
| "loss": 1.3488, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.3526608419380461, | |
| "grad_norm": 0.46242061257362366, | |
| "learning_rate": 0.0001664, | |
| "loss": 1.393, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.35583796664019063, | |
| "grad_norm": 0.46360430121421814, | |
| "learning_rate": 0.00016608, | |
| "loss": 1.4168, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.35901509134233517, | |
| "grad_norm": 0.45501938462257385, | |
| "learning_rate": 0.00016576, | |
| "loss": 1.4718, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.36219221604447976, | |
| "grad_norm": 0.44886520504951477, | |
| "learning_rate": 0.00016544000000000002, | |
| "loss": 1.4206, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.3653693407466243, | |
| "grad_norm": 0.5432794690132141, | |
| "learning_rate": 0.00016512000000000002, | |
| "loss": 1.369, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.36854646544876885, | |
| "grad_norm": 0.40831825137138367, | |
| "learning_rate": 0.0001648, | |
| "loss": 1.3482, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.37172359015091344, | |
| "grad_norm": 0.4690685272216797, | |
| "learning_rate": 0.00016448000000000002, | |
| "loss": 1.4799, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.374900714853058, | |
| "grad_norm": 0.4517834782600403, | |
| "learning_rate": 0.00016416, | |
| "loss": 1.3597, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.3780778395552025, | |
| "grad_norm": 0.45939838886260986, | |
| "learning_rate": 0.00016384, | |
| "loss": 1.4188, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.3812549642573471, | |
| "grad_norm": 0.4444164037704468, | |
| "learning_rate": 0.00016352, | |
| "loss": 1.4501, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.38443208895949166, | |
| "grad_norm": 0.46045759320259094, | |
| "learning_rate": 0.0001632, | |
| "loss": 1.4315, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.3876092136616362, | |
| "grad_norm": 0.46573230624198914, | |
| "learning_rate": 0.00016288, | |
| "loss": 1.3214, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.3907863383637808, | |
| "grad_norm": 0.46668779850006104, | |
| "learning_rate": 0.00016256, | |
| "loss": 1.3008, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.39396346306592533, | |
| "grad_norm": 0.45954373478889465, | |
| "learning_rate": 0.00016224000000000002, | |
| "loss": 1.4209, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.3971405877680699, | |
| "grad_norm": 0.433923602104187, | |
| "learning_rate": 0.00016192, | |
| "loss": 1.4436, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.40031771247021447, | |
| "grad_norm": 0.49414584040641785, | |
| "learning_rate": 0.00016160000000000002, | |
| "loss": 1.3914, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.403494837172359, | |
| "grad_norm": 0.4280381202697754, | |
| "learning_rate": 0.00016128, | |
| "loss": 1.4302, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.40667196187450355, | |
| "grad_norm": 0.5049663782119751, | |
| "learning_rate": 0.00016096, | |
| "loss": 1.5463, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.40984908657664815, | |
| "grad_norm": 0.3671952486038208, | |
| "learning_rate": 0.00016064, | |
| "loss": 1.3469, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.4130262112787927, | |
| "grad_norm": 0.4638643264770508, | |
| "learning_rate": 0.00016032, | |
| "loss": 1.3772, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.41620333598093723, | |
| "grad_norm": 0.4278906583786011, | |
| "learning_rate": 0.00016, | |
| "loss": 1.4239, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.4193804606830818, | |
| "grad_norm": 0.45057350397109985, | |
| "learning_rate": 0.00015968000000000002, | |
| "loss": 1.3649, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.42255758538522636, | |
| "grad_norm": 0.4940052330493927, | |
| "learning_rate": 0.00015936, | |
| "loss": 1.4542, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.4257347100873709, | |
| "grad_norm": 0.48272138833999634, | |
| "learning_rate": 0.00015904000000000002, | |
| "loss": 1.4525, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.4289118347895155, | |
| "grad_norm": 0.4591176211833954, | |
| "learning_rate": 0.00015872, | |
| "loss": 1.372, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.43208895949166004, | |
| "grad_norm": 0.39564651250839233, | |
| "learning_rate": 0.00015840000000000003, | |
| "loss": 1.2387, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.4352660841938046, | |
| "grad_norm": 0.4640074670314789, | |
| "learning_rate": 0.00015808, | |
| "loss": 1.4381, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.4384432088959492, | |
| "grad_norm": 0.4523836374282837, | |
| "learning_rate": 0.00015776, | |
| "loss": 1.3681, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.4416203335980937, | |
| "grad_norm": 0.4463924765586853, | |
| "learning_rate": 0.00015744, | |
| "loss": 1.4002, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.44479745830023826, | |
| "grad_norm": 0.4263816177845001, | |
| "learning_rate": 0.00015712000000000001, | |
| "loss": 1.3452, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.44797458300238285, | |
| "grad_norm": 0.4039861857891083, | |
| "learning_rate": 0.00015680000000000002, | |
| "loss": 1.3888, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.4511517077045274, | |
| "grad_norm": 0.44540414214134216, | |
| "learning_rate": 0.00015648, | |
| "loss": 1.3827, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.45432883240667193, | |
| "grad_norm": 0.4521636664867401, | |
| "learning_rate": 0.00015616000000000002, | |
| "loss": 1.4448, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.45750595710881653, | |
| "grad_norm": 0.46087294816970825, | |
| "learning_rate": 0.00015584, | |
| "loss": 1.374, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.46068308181096107, | |
| "grad_norm": 0.43480321764945984, | |
| "learning_rate": 0.00015552, | |
| "loss": 1.4021, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.46386020651310567, | |
| "grad_norm": 0.48551246523857117, | |
| "learning_rate": 0.0001552, | |
| "loss": 1.3729, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.4670373312152502, | |
| "grad_norm": 0.44551774859428406, | |
| "learning_rate": 0.00015488, | |
| "loss": 1.3628, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.47021445591739475, | |
| "grad_norm": 0.43176624178886414, | |
| "learning_rate": 0.00015456, | |
| "loss": 1.4927, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.47339158061953934, | |
| "grad_norm": 0.47492435574531555, | |
| "learning_rate": 0.00015424000000000001, | |
| "loss": 1.4361, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.4765687053216839, | |
| "grad_norm": 0.46715089678764343, | |
| "learning_rate": 0.00015392, | |
| "loss": 1.382, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.4797458300238284, | |
| "grad_norm": 0.44686493277549744, | |
| "learning_rate": 0.00015360000000000002, | |
| "loss": 1.4603, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.482922954725973, | |
| "grad_norm": 0.429262638092041, | |
| "learning_rate": 0.00015328, | |
| "loss": 1.3358, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.48610007942811756, | |
| "grad_norm": 0.4371509850025177, | |
| "learning_rate": 0.00015296000000000003, | |
| "loss": 1.4007, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.4892772041302621, | |
| "grad_norm": 0.44418609142303467, | |
| "learning_rate": 0.00015264, | |
| "loss": 1.2738, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.4924543288324067, | |
| "grad_norm": 0.44011855125427246, | |
| "learning_rate": 0.00015232, | |
| "loss": 1.3724, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.49563145353455124, | |
| "grad_norm": 0.4057789742946625, | |
| "learning_rate": 0.000152, | |
| "loss": 1.3708, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.4988085782366958, | |
| "grad_norm": 0.462645024061203, | |
| "learning_rate": 0.00015168, | |
| "loss": 1.3377, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.5019857029388404, | |
| "grad_norm": 0.48834553360939026, | |
| "learning_rate": 0.00015136000000000001, | |
| "loss": 1.393, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.5051628276409849, | |
| "grad_norm": 0.4350643754005432, | |
| "learning_rate": 0.00015104, | |
| "loss": 1.3838, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.5083399523431295, | |
| "grad_norm": 0.529528796672821, | |
| "learning_rate": 0.00015072000000000002, | |
| "loss": 1.4754, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.511517077045274, | |
| "grad_norm": 0.43042004108428955, | |
| "learning_rate": 0.0001504, | |
| "loss": 1.2977, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.5146942017474185, | |
| "grad_norm": 0.4385746717453003, | |
| "learning_rate": 0.00015008, | |
| "loss": 1.3612, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.5178713264495631, | |
| "grad_norm": 0.4804169535636902, | |
| "learning_rate": 0.00014976, | |
| "loss": 1.3587, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.5210484511517077, | |
| "grad_norm": 0.43496274948120117, | |
| "learning_rate": 0.00014944, | |
| "loss": 1.3683, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.5242255758538522, | |
| "grad_norm": 0.4485699534416199, | |
| "learning_rate": 0.00014912, | |
| "loss": 1.3987, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.5274027005559968, | |
| "grad_norm": 0.4077622592449188, | |
| "learning_rate": 0.0001488, | |
| "loss": 1.3115, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.5305798252581414, | |
| "grad_norm": 0.480747789144516, | |
| "learning_rate": 0.00014848, | |
| "loss": 1.4257, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.5337569499602859, | |
| "grad_norm": 0.4339999854564667, | |
| "learning_rate": 0.00014816000000000002, | |
| "loss": 1.3689, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.5369340746624305, | |
| "grad_norm": 0.4264179766178131, | |
| "learning_rate": 0.00014784, | |
| "loss": 1.3877, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.5401111993645751, | |
| "grad_norm": 0.4252622127532959, | |
| "learning_rate": 0.00014752000000000002, | |
| "loss": 1.3887, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.5432883240667196, | |
| "grad_norm": 0.42078137397766113, | |
| "learning_rate": 0.0001472, | |
| "loss": 1.388, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.5464654487688642, | |
| "grad_norm": 0.4306480586528778, | |
| "learning_rate": 0.00014688000000000003, | |
| "loss": 1.3366, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.5496425734710088, | |
| "grad_norm": 0.4413485825061798, | |
| "learning_rate": 0.00014656, | |
| "loss": 1.4593, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.5528196981731534, | |
| "grad_norm": 0.5051458477973938, | |
| "learning_rate": 0.00014624, | |
| "loss": 1.3824, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.5559968228752978, | |
| "grad_norm": 0.42062053084373474, | |
| "learning_rate": 0.00014592, | |
| "loss": 1.2272, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.5591739475774424, | |
| "grad_norm": 0.46674421429634094, | |
| "learning_rate": 0.00014560000000000002, | |
| "loss": 1.4548, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.562351072279587, | |
| "grad_norm": 0.44679704308509827, | |
| "learning_rate": 0.00014528000000000002, | |
| "loss": 1.4101, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.5655281969817315, | |
| "grad_norm": 0.4165225327014923, | |
| "learning_rate": 0.00014496, | |
| "loss": 1.3625, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.5687053216838761, | |
| "grad_norm": 0.4735226631164551, | |
| "learning_rate": 0.00014464, | |
| "loss": 1.4569, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.5718824463860207, | |
| "grad_norm": 0.4978485405445099, | |
| "learning_rate": 0.00014432, | |
| "loss": 1.3913, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.5750595710881652, | |
| "grad_norm": 0.5753241181373596, | |
| "learning_rate": 0.000144, | |
| "loss": 1.3885, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.5782366957903098, | |
| "grad_norm": 0.4427070617675781, | |
| "learning_rate": 0.00014368, | |
| "loss": 1.4068, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.5814138204924544, | |
| "grad_norm": 0.4505138099193573, | |
| "learning_rate": 0.00014336, | |
| "loss": 1.3579, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.5845909451945989, | |
| "grad_norm": 0.5041322708129883, | |
| "learning_rate": 0.00014303999999999999, | |
| "loss": 1.4652, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.5877680698967435, | |
| "grad_norm": 0.4617048501968384, | |
| "learning_rate": 0.00014272000000000002, | |
| "loss": 1.3565, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.5909451945988881, | |
| "grad_norm": 0.4603610038757324, | |
| "learning_rate": 0.0001424, | |
| "loss": 1.3436, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.5941223193010325, | |
| "grad_norm": 0.5165044069290161, | |
| "learning_rate": 0.00014208000000000002, | |
| "loss": 1.4443, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.5972994440031771, | |
| "grad_norm": 0.3984765410423279, | |
| "learning_rate": 0.00014176, | |
| "loss": 1.2166, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.6004765687053217, | |
| "grad_norm": 0.5299299955368042, | |
| "learning_rate": 0.00014144000000000003, | |
| "loss": 1.446, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.6036536934074662, | |
| "grad_norm": 0.49046239256858826, | |
| "learning_rate": 0.00014112, | |
| "loss": 1.3844, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.6068308181096108, | |
| "grad_norm": 0.3969656825065613, | |
| "learning_rate": 0.0001408, | |
| "loss": 1.2135, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.6100079428117554, | |
| "grad_norm": 0.4312625527381897, | |
| "learning_rate": 0.00014048, | |
| "loss": 1.3431, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.6131850675138999, | |
| "grad_norm": 0.4357926547527313, | |
| "learning_rate": 0.00014016, | |
| "loss": 1.31, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.6163621922160445, | |
| "grad_norm": 0.4309421181678772, | |
| "learning_rate": 0.00013984000000000002, | |
| "loss": 1.3617, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.6195393169181891, | |
| "grad_norm": 0.4217104911804199, | |
| "learning_rate": 0.00013952000000000002, | |
| "loss": 1.3756, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.6227164416203336, | |
| "grad_norm": 0.5252110958099365, | |
| "learning_rate": 0.0001392, | |
| "loss": 1.3824, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.6258935663224782, | |
| "grad_norm": 0.4495287537574768, | |
| "learning_rate": 0.00013888, | |
| "loss": 1.3279, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.6290706910246228, | |
| "grad_norm": 0.4457398056983948, | |
| "learning_rate": 0.00013856, | |
| "loss": 1.385, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.6322478157267672, | |
| "grad_norm": 0.4607657790184021, | |
| "learning_rate": 0.00013824, | |
| "loss": 1.458, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.6354249404289118, | |
| "grad_norm": 0.43265438079833984, | |
| "learning_rate": 0.00013792, | |
| "loss": 1.3233, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.6386020651310564, | |
| "grad_norm": 0.4238455295562744, | |
| "learning_rate": 0.00013759999999999998, | |
| "loss": 1.3658, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.6417791898332009, | |
| "grad_norm": 0.4150598645210266, | |
| "learning_rate": 0.00013728000000000001, | |
| "loss": 1.406, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.6449563145353455, | |
| "grad_norm": 0.44659295678138733, | |
| "learning_rate": 0.00013696, | |
| "loss": 1.4752, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.6481334392374901, | |
| "grad_norm": 0.4836420714855194, | |
| "learning_rate": 0.00013664000000000002, | |
| "loss": 1.3478, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.6513105639396346, | |
| "grad_norm": 0.40945902466773987, | |
| "learning_rate": 0.00013632, | |
| "loss": 1.2042, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.6544876886417792, | |
| "grad_norm": 0.4980110228061676, | |
| "learning_rate": 0.00013600000000000003, | |
| "loss": 1.309, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.6576648133439238, | |
| "grad_norm": 0.4770593047142029, | |
| "learning_rate": 0.00013568, | |
| "loss": 1.4164, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.6608419380460683, | |
| "grad_norm": 0.4662317633628845, | |
| "learning_rate": 0.00013536, | |
| "loss": 1.4044, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.6640190627482129, | |
| "grad_norm": 0.4472275674343109, | |
| "learning_rate": 0.00013504, | |
| "loss": 1.3541, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.6671961874503575, | |
| "grad_norm": 0.45574310421943665, | |
| "learning_rate": 0.00013472, | |
| "loss": 1.406, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.670373312152502, | |
| "grad_norm": 0.4748678207397461, | |
| "learning_rate": 0.00013440000000000001, | |
| "loss": 1.2955, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.6735504368546466, | |
| "grad_norm": 0.4513389766216278, | |
| "learning_rate": 0.00013408000000000002, | |
| "loss": 1.3326, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.6767275615567911, | |
| "grad_norm": 0.4360558092594147, | |
| "learning_rate": 0.00013376, | |
| "loss": 1.2359, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.6799046862589356, | |
| "grad_norm": 0.41032615303993225, | |
| "learning_rate": 0.00013344, | |
| "loss": 1.3916, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.6830818109610802, | |
| "grad_norm": 0.46569857001304626, | |
| "learning_rate": 0.00013312, | |
| "loss": 1.3152, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.6862589356632248, | |
| "grad_norm": 0.4858649969100952, | |
| "learning_rate": 0.0001328, | |
| "loss": 1.3445, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.6894360603653693, | |
| "grad_norm": 0.4437476098537445, | |
| "learning_rate": 0.00013248, | |
| "loss": 1.3565, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.6926131850675139, | |
| "grad_norm": 0.47393283247947693, | |
| "learning_rate": 0.00013216, | |
| "loss": 1.3308, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.6957903097696585, | |
| "grad_norm": 0.446773499250412, | |
| "learning_rate": 0.00013184, | |
| "loss": 1.3424, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.698967434471803, | |
| "grad_norm": 0.4282335042953491, | |
| "learning_rate": 0.00013152, | |
| "loss": 1.3799, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.7021445591739476, | |
| "grad_norm": 0.36902791261672974, | |
| "learning_rate": 0.00013120000000000002, | |
| "loss": 1.2638, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.7053216838760922, | |
| "grad_norm": 0.4036352336406708, | |
| "learning_rate": 0.00013088, | |
| "loss": 1.2865, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.7084988085782367, | |
| "grad_norm": 0.4829830825328827, | |
| "learning_rate": 0.00013056000000000002, | |
| "loss": 1.3685, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.7116759332803813, | |
| "grad_norm": 0.425111323595047, | |
| "learning_rate": 0.00013024, | |
| "loss": 1.3151, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.7148530579825259, | |
| "grad_norm": 0.4299517869949341, | |
| "learning_rate": 0.00012992, | |
| "loss": 1.3412, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.7180301826846703, | |
| "grad_norm": 0.4297490417957306, | |
| "learning_rate": 0.0001296, | |
| "loss": 1.3244, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.7212073073868149, | |
| "grad_norm": 0.48203548789024353, | |
| "learning_rate": 0.00012928, | |
| "loss": 1.3428, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.7243844320889595, | |
| "grad_norm": 0.43935510516166687, | |
| "learning_rate": 0.00012896, | |
| "loss": 1.3323, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.727561556791104, | |
| "grad_norm": 0.4296364188194275, | |
| "learning_rate": 0.00012864000000000002, | |
| "loss": 1.3068, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.7307386814932486, | |
| "grad_norm": 0.44215404987335205, | |
| "learning_rate": 0.00012832, | |
| "loss": 1.3646, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.7339158061953932, | |
| "grad_norm": 0.4621836245059967, | |
| "learning_rate": 0.00012800000000000002, | |
| "loss": 1.3799, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.7370929308975377, | |
| "grad_norm": 0.4484768211841583, | |
| "learning_rate": 0.00012768, | |
| "loss": 1.4356, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.7402700555996823, | |
| "grad_norm": 0.4553694427013397, | |
| "learning_rate": 0.00012736, | |
| "loss": 1.2484, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.7434471803018269, | |
| "grad_norm": 0.40847110748291016, | |
| "learning_rate": 0.00012704, | |
| "loss": 1.2732, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.7466243050039714, | |
| "grad_norm": 0.4255897104740143, | |
| "learning_rate": 0.00012672, | |
| "loss": 1.2625, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.749801429706116, | |
| "grad_norm": 0.468524307012558, | |
| "learning_rate": 0.0001264, | |
| "loss": 1.4405, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.7529785544082606, | |
| "grad_norm": 0.48269885778427124, | |
| "learning_rate": 0.00012607999999999999, | |
| "loss": 1.3345, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.756155679110405, | |
| "grad_norm": 0.4441956877708435, | |
| "learning_rate": 0.00012576000000000002, | |
| "loss": 1.2967, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.7593328038125496, | |
| "grad_norm": 0.44516798853874207, | |
| "learning_rate": 0.00012544, | |
| "loss": 1.2502, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.7625099285146942, | |
| "grad_norm": 0.47978633642196655, | |
| "learning_rate": 0.00012512000000000002, | |
| "loss": 1.3275, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.7656870532168387, | |
| "grad_norm": 0.4489109218120575, | |
| "learning_rate": 0.0001248, | |
| "loss": 1.3594, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.7688641779189833, | |
| "grad_norm": 0.46123188734054565, | |
| "learning_rate": 0.00012448, | |
| "loss": 1.3014, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.7720413026211279, | |
| "grad_norm": 0.4514189064502716, | |
| "learning_rate": 0.00012416, | |
| "loss": 1.2148, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.7752184273232724, | |
| "grad_norm": 0.47698548436164856, | |
| "learning_rate": 0.00012384, | |
| "loss": 1.3562, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.778395552025417, | |
| "grad_norm": 0.4442936182022095, | |
| "learning_rate": 0.00012352, | |
| "loss": 1.3092, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.7815726767275616, | |
| "grad_norm": 0.48598411679267883, | |
| "learning_rate": 0.0001232, | |
| "loss": 1.418, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.7847498014297061, | |
| "grad_norm": 0.46551617980003357, | |
| "learning_rate": 0.00012288, | |
| "loss": 1.261, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.7879269261318507, | |
| "grad_norm": 0.4166944921016693, | |
| "learning_rate": 0.00012256000000000002, | |
| "loss": 1.213, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.7911040508339953, | |
| "grad_norm": 0.48919105529785156, | |
| "learning_rate": 0.00012224, | |
| "loss": 1.3328, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.7942811755361397, | |
| "grad_norm": 0.44160059094429016, | |
| "learning_rate": 0.00012192000000000001, | |
| "loss": 1.2758, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.7974583002382843, | |
| "grad_norm": 0.45591601729393005, | |
| "learning_rate": 0.0001216, | |
| "loss": 1.3771, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.8006354249404289, | |
| "grad_norm": 0.4620215892791748, | |
| "learning_rate": 0.00012128000000000002, | |
| "loss": 1.3787, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.8038125496425734, | |
| "grad_norm": 0.4968376159667969, | |
| "learning_rate": 0.00012096000000000001, | |
| "loss": 1.3798, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.806989674344718, | |
| "grad_norm": 0.4507087767124176, | |
| "learning_rate": 0.00012064, | |
| "loss": 1.3349, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.8101667990468626, | |
| "grad_norm": 0.46431511640548706, | |
| "learning_rate": 0.00012032000000000001, | |
| "loss": 1.359, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.8133439237490071, | |
| "grad_norm": 0.46496230363845825, | |
| "learning_rate": 0.00012, | |
| "loss": 1.2429, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.8165210484511517, | |
| "grad_norm": 0.46748438477516174, | |
| "learning_rate": 0.00011968000000000002, | |
| "loss": 1.3088, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.8196981731532963, | |
| "grad_norm": 0.45148542523384094, | |
| "learning_rate": 0.00011936000000000001, | |
| "loss": 1.2191, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.8228752978554408, | |
| "grad_norm": 0.4253683388233185, | |
| "learning_rate": 0.00011904, | |
| "loss": 1.2915, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.8260524225575854, | |
| "grad_norm": 0.506744384765625, | |
| "learning_rate": 0.00011872000000000002, | |
| "loss": 1.3271, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.82922954725973, | |
| "grad_norm": 0.4920015335083008, | |
| "learning_rate": 0.0001184, | |
| "loss": 1.3933, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.8324066719618745, | |
| "grad_norm": 0.4514538645744324, | |
| "learning_rate": 0.00011808000000000001, | |
| "loss": 1.356, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.835583796664019, | |
| "grad_norm": 0.5036830306053162, | |
| "learning_rate": 0.00011776, | |
| "loss": 1.2622, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.8387609213661636, | |
| "grad_norm": 0.5152455568313599, | |
| "learning_rate": 0.00011744000000000001, | |
| "loss": 1.3674, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.8419380460683081, | |
| "grad_norm": 0.4376108944416046, | |
| "learning_rate": 0.00011712, | |
| "loss": 1.3758, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.8451151707704527, | |
| "grad_norm": 0.4190007746219635, | |
| "learning_rate": 0.00011679999999999999, | |
| "loss": 1.2389, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.8482922954725973, | |
| "grad_norm": 0.5019193291664124, | |
| "learning_rate": 0.00011648000000000001, | |
| "loss": 1.3014, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.8514694201747418, | |
| "grad_norm": 0.47944578528404236, | |
| "learning_rate": 0.00011616, | |
| "loss": 1.3507, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.8546465448768864, | |
| "grad_norm": 0.4307346045970917, | |
| "learning_rate": 0.00011584000000000002, | |
| "loss": 1.2211, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.857823669579031, | |
| "grad_norm": 0.5099300742149353, | |
| "learning_rate": 0.00011552, | |
| "loss": 1.3172, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.8610007942811755, | |
| "grad_norm": 0.41971608996391296, | |
| "learning_rate": 0.0001152, | |
| "loss": 1.2691, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.8641779189833201, | |
| "grad_norm": 0.4612553119659424, | |
| "learning_rate": 0.00011488000000000001, | |
| "loss": 1.3128, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.8673550436854647, | |
| "grad_norm": 0.4589272141456604, | |
| "learning_rate": 0.00011456, | |
| "loss": 1.3275, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.8705321683876092, | |
| "grad_norm": 0.47001925110816956, | |
| "learning_rate": 0.00011424000000000002, | |
| "loss": 1.2607, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.8737092930897538, | |
| "grad_norm": 0.4315769672393799, | |
| "learning_rate": 0.00011392000000000001, | |
| "loss": 1.3215, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.8768864177918984, | |
| "grad_norm": 0.45138058066368103, | |
| "learning_rate": 0.0001136, | |
| "loss": 1.332, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.8800635424940428, | |
| "grad_norm": 0.4450497329235077, | |
| "learning_rate": 0.00011328000000000001, | |
| "loss": 1.3474, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.8832406671961874, | |
| "grad_norm": 0.4595153033733368, | |
| "learning_rate": 0.00011296, | |
| "loss": 1.27, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.886417791898332, | |
| "grad_norm": 0.42433419823646545, | |
| "learning_rate": 0.00011264, | |
| "loss": 1.3352, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.8895949166004765, | |
| "grad_norm": 0.44947418570518494, | |
| "learning_rate": 0.00011232000000000001, | |
| "loss": 1.2532, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.8927720413026211, | |
| "grad_norm": 0.4503403604030609, | |
| "learning_rate": 0.00011200000000000001, | |
| "loss": 1.3267, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.8959491660047657, | |
| "grad_norm": 0.418992280960083, | |
| "learning_rate": 0.00011168, | |
| "loss": 1.2699, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.8991262907069102, | |
| "grad_norm": 0.4266560971736908, | |
| "learning_rate": 0.00011135999999999999, | |
| "loss": 1.169, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.9023034154090548, | |
| "grad_norm": 0.5053189396858215, | |
| "learning_rate": 0.00011104000000000001, | |
| "loss": 1.4047, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.9054805401111994, | |
| "grad_norm": 0.5122870206832886, | |
| "learning_rate": 0.00011072, | |
| "loss": 1.3834, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.9086576648133439, | |
| "grad_norm": 0.43556493520736694, | |
| "learning_rate": 0.00011040000000000001, | |
| "loss": 1.2163, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.9118347895154885, | |
| "grad_norm": 0.4655609130859375, | |
| "learning_rate": 0.00011008, | |
| "loss": 1.2967, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.9150119142176331, | |
| "grad_norm": 0.4987747371196747, | |
| "learning_rate": 0.00010975999999999999, | |
| "loss": 1.2905, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.9181890389197777, | |
| "grad_norm": 0.4585645794868469, | |
| "learning_rate": 0.00010944000000000001, | |
| "loss": 1.2875, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.9213661636219221, | |
| "grad_norm": 0.5033825039863586, | |
| "learning_rate": 0.00010912, | |
| "loss": 1.2772, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.9245432883240667, | |
| "grad_norm": 0.4755001962184906, | |
| "learning_rate": 0.00010880000000000002, | |
| "loss": 1.4525, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.9277204130262113, | |
| "grad_norm": 0.43799713253974915, | |
| "learning_rate": 0.00010848, | |
| "loss": 1.3104, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.9308975377283558, | |
| "grad_norm": 0.43732205033302307, | |
| "learning_rate": 0.00010816, | |
| "loss": 1.2373, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.9340746624305004, | |
| "grad_norm": 0.45804721117019653, | |
| "learning_rate": 0.00010784000000000001, | |
| "loss": 1.3313, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.937251787132645, | |
| "grad_norm": 0.49885255098342896, | |
| "learning_rate": 0.00010752, | |
| "loss": 1.3134, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.9404289118347895, | |
| "grad_norm": 0.4742017090320587, | |
| "learning_rate": 0.00010720000000000002, | |
| "loss": 1.323, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.9436060365369341, | |
| "grad_norm": 0.4221518039703369, | |
| "learning_rate": 0.00010688, | |
| "loss": 1.3479, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.9467831612390787, | |
| "grad_norm": 0.4776606261730194, | |
| "learning_rate": 0.00010656000000000001, | |
| "loss": 1.265, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.9499602859412232, | |
| "grad_norm": 0.49409452080726624, | |
| "learning_rate": 0.00010624000000000001, | |
| "loss": 1.2697, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.9531374106433678, | |
| "grad_norm": 0.4598381817340851, | |
| "learning_rate": 0.00010592, | |
| "loss": 1.2518, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.9563145353455124, | |
| "grad_norm": 0.43075883388519287, | |
| "learning_rate": 0.0001056, | |
| "loss": 1.2483, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.9594916600476568, | |
| "grad_norm": 0.5096505880355835, | |
| "learning_rate": 0.00010528, | |
| "loss": 1.4291, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.9626687847498014, | |
| "grad_norm": 0.4315980076789856, | |
| "learning_rate": 0.00010496000000000001, | |
| "loss": 1.2332, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.965845909451946, | |
| "grad_norm": 0.47984281182289124, | |
| "learning_rate": 0.00010464, | |
| "loss": 1.3108, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.9690230341540905, | |
| "grad_norm": 0.4698749780654907, | |
| "learning_rate": 0.00010431999999999999, | |
| "loss": 1.3103, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.9722001588562351, | |
| "grad_norm": 0.465999960899353, | |
| "learning_rate": 0.00010400000000000001, | |
| "loss": 1.3343, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.9753772835583797, | |
| "grad_norm": 0.43465176224708557, | |
| "learning_rate": 0.00010368, | |
| "loss": 1.1649, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.9785544082605242, | |
| "grad_norm": 0.4245821237564087, | |
| "learning_rate": 0.00010336000000000001, | |
| "loss": 1.273, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.9817315329626688, | |
| "grad_norm": 0.43245622515678406, | |
| "learning_rate": 0.00010304, | |
| "loss": 1.2443, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.9849086576648134, | |
| "grad_norm": 0.4845837950706482, | |
| "learning_rate": 0.00010271999999999999, | |
| "loss": 1.2649, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.9880857823669579, | |
| "grad_norm": 0.424667090177536, | |
| "learning_rate": 0.00010240000000000001, | |
| "loss": 1.2708, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.9912629070691025, | |
| "grad_norm": 0.43120723962783813, | |
| "learning_rate": 0.00010208, | |
| "loss": 1.2844, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.9944400317712471, | |
| "grad_norm": 0.4800574481487274, | |
| "learning_rate": 0.00010176000000000002, | |
| "loss": 1.379, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.9976171564733916, | |
| "grad_norm": 0.5008915066719055, | |
| "learning_rate": 0.00010144, | |
| "loss": 1.3679, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.6201555132865906, | |
| "learning_rate": 0.00010112000000000002, | |
| "loss": 1.2757, | |
| "step": 315 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 630, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 8.203279169783808e+16, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |