{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.05003027134129561, "eval_steps": 500, "global_step": 909, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 5.503880235566074e-05, "grad_norm": 459.8753356933594, "learning_rate": 1.0000000000000001e-07, "loss": 3.303, "step": 1 }, { "epoch": 0.00011007760471132149, "grad_norm": 314.2561950683594, "learning_rate": 2.0000000000000002e-07, "loss": 2.8226, "step": 2 }, { "epoch": 0.0001651164070669822, "grad_norm": 314.1292419433594, "learning_rate": 3.0000000000000004e-07, "loss": 2.8517, "step": 3 }, { "epoch": 0.00022015520942264297, "grad_norm": 312.4049072265625, "learning_rate": 4.0000000000000003e-07, "loss": 2.6248, "step": 4 }, { "epoch": 0.0002751940117783037, "grad_norm": 353.7213134765625, "learning_rate": 5.000000000000001e-07, "loss": 2.7883, "step": 5 }, { "epoch": 0.0003302328141339644, "grad_norm": 278.41668701171875, "learning_rate": 6.000000000000001e-07, "loss": 2.5468, "step": 6 }, { "epoch": 0.0003852716164896252, "grad_norm": 336.14532470703125, "learning_rate": 7.000000000000001e-07, "loss": 2.7721, "step": 7 }, { "epoch": 0.00044031041884528595, "grad_norm": 201.19374084472656, "learning_rate": 8.000000000000001e-07, "loss": 2.4873, "step": 8 }, { "epoch": 0.0004953492212009466, "grad_norm": 184.7027587890625, "learning_rate": 9.000000000000001e-07, "loss": 2.6647, "step": 9 }, { "epoch": 0.0005503880235566074, "grad_norm": 154.597412109375, "learning_rate": 1.0000000000000002e-06, "loss": 2.602, "step": 10 }, { "epoch": 0.0006054268259122681, "grad_norm": 40.47785568237305, "learning_rate": 1.1e-06, "loss": 2.6716, "step": 11 }, { "epoch": 0.0006604656282679288, "grad_norm": 25.338607788085938, "learning_rate": 1.2000000000000002e-06, "loss": 2.2631, "step": 12 }, { "epoch": 0.0007155044306235897, "grad_norm": 24.976919174194336, "learning_rate": 1.3e-06, "loss": 2.3564, "step": 13 }, { "epoch": 0.0007705432329792504, "grad_norm": 15.239912033081055, "learning_rate": 1.4000000000000001e-06, "loss": 2.3295, "step": 14 }, { "epoch": 0.0008255820353349112, "grad_norm": 14.125042915344238, "learning_rate": 1.5e-06, "loss": 2.307, "step": 15 }, { "epoch": 0.0008806208376905719, "grad_norm": 13.163726806640625, "learning_rate": 1.6000000000000001e-06, "loss": 2.1493, "step": 16 }, { "epoch": 0.0009356596400462326, "grad_norm": 8.726515769958496, "learning_rate": 1.7000000000000002e-06, "loss": 2.0333, "step": 17 }, { "epoch": 0.0009906984424018933, "grad_norm": 9.072502136230469, "learning_rate": 1.8000000000000001e-06, "loss": 2.2046, "step": 18 }, { "epoch": 0.001045737244757554, "grad_norm": 9.412588119506836, "learning_rate": 1.9000000000000002e-06, "loss": 2.2001, "step": 19 }, { "epoch": 0.0011007760471132147, "grad_norm": 8.67534065246582, "learning_rate": 2.0000000000000003e-06, "loss": 1.7679, "step": 20 }, { "epoch": 0.0011558148494688755, "grad_norm": 14.015918731689453, "learning_rate": 2.1000000000000002e-06, "loss": 1.9566, "step": 21 }, { "epoch": 0.0012108536518245362, "grad_norm": 7.9474687576293945, "learning_rate": 2.2e-06, "loss": 1.9085, "step": 22 }, { "epoch": 0.001265892454180197, "grad_norm": 6.806368350982666, "learning_rate": 2.3000000000000004e-06, "loss": 1.7918, "step": 23 }, { "epoch": 0.0013209312565358577, "grad_norm": 5.3452582359313965, "learning_rate": 2.4000000000000003e-06, "loss": 1.8321, "step": 24 }, { "epoch": 0.0013759700588915184, "grad_norm": 8.744244575500488, "learning_rate": 2.5e-06, "loss": 1.6317, "step": 25 }, { "epoch": 0.0014310088612471794, "grad_norm": 5.304683685302734, "learning_rate": 2.6e-06, "loss": 1.6846, "step": 26 }, { "epoch": 0.00148604766360284, "grad_norm": 5.650127410888672, "learning_rate": 2.7000000000000004e-06, "loss": 1.7449, "step": 27 }, { "epoch": 0.0015410864659585008, "grad_norm": 5.479269504547119, "learning_rate": 2.8000000000000003e-06, "loss": 1.8158, "step": 28 }, { "epoch": 0.0015961252683141616, "grad_norm": 4.873537063598633, "learning_rate": 2.9e-06, "loss": 1.8015, "step": 29 }, { "epoch": 0.0016511640706698223, "grad_norm": 4.971101760864258, "learning_rate": 3e-06, "loss": 1.9034, "step": 30 }, { "epoch": 0.001706202873025483, "grad_norm": 4.407571315765381, "learning_rate": 3.1000000000000004e-06, "loss": 1.9037, "step": 31 }, { "epoch": 0.0017612416753811438, "grad_norm": 4.429073810577393, "learning_rate": 3.2000000000000003e-06, "loss": 1.6812, "step": 32 }, { "epoch": 0.0018162804777368045, "grad_norm": 5.16085147857666, "learning_rate": 3.3000000000000006e-06, "loss": 1.7627, "step": 33 }, { "epoch": 0.0018713192800924653, "grad_norm": 4.0805768966674805, "learning_rate": 3.4000000000000005e-06, "loss": 1.6799, "step": 34 }, { "epoch": 0.001926358082448126, "grad_norm": 4.548702239990234, "learning_rate": 3.5e-06, "loss": 1.7799, "step": 35 }, { "epoch": 0.0019813968848037865, "grad_norm": 5.181888580322266, "learning_rate": 3.6000000000000003e-06, "loss": 1.8235, "step": 36 }, { "epoch": 0.0020364356871594475, "grad_norm": 3.9876129627227783, "learning_rate": 3.7e-06, "loss": 1.5999, "step": 37 }, { "epoch": 0.002091474489515108, "grad_norm": 6.325051307678223, "learning_rate": 3.8000000000000005e-06, "loss": 1.7499, "step": 38 }, { "epoch": 0.002146513291870769, "grad_norm": 6.199049949645996, "learning_rate": 3.900000000000001e-06, "loss": 1.784, "step": 39 }, { "epoch": 0.0022015520942264295, "grad_norm": 4.83912992477417, "learning_rate": 4.000000000000001e-06, "loss": 1.8895, "step": 40 }, { "epoch": 0.0022565908965820904, "grad_norm": 4.515626907348633, "learning_rate": 4.1e-06, "loss": 1.4887, "step": 41 }, { "epoch": 0.002311629698937751, "grad_norm": 5.032265663146973, "learning_rate": 4.2000000000000004e-06, "loss": 1.7324, "step": 42 }, { "epoch": 0.002366668501293412, "grad_norm": 4.1879048347473145, "learning_rate": 4.3e-06, "loss": 1.4912, "step": 43 }, { "epoch": 0.0024217073036490724, "grad_norm": 4.128026485443115, "learning_rate": 4.4e-06, "loss": 1.554, "step": 44 }, { "epoch": 0.0024767461060047334, "grad_norm": 4.527958393096924, "learning_rate": 4.5e-06, "loss": 1.652, "step": 45 }, { "epoch": 0.002531784908360394, "grad_norm": 4.8388190269470215, "learning_rate": 4.600000000000001e-06, "loss": 1.6696, "step": 46 }, { "epoch": 0.002586823710716055, "grad_norm": 4.2088541984558105, "learning_rate": 4.7e-06, "loss": 1.568, "step": 47 }, { "epoch": 0.0026418625130717154, "grad_norm": 4.789997577667236, "learning_rate": 4.800000000000001e-06, "loss": 1.642, "step": 48 }, { "epoch": 0.0026969013154273763, "grad_norm": 4.408346652984619, "learning_rate": 4.9000000000000005e-06, "loss": 1.5181, "step": 49 }, { "epoch": 0.002751940117783037, "grad_norm": 4.572340488433838, "learning_rate": 5e-06, "loss": 1.6698, "step": 50 }, { "epoch": 0.0028069789201386978, "grad_norm": 4.728564739227295, "learning_rate": 5.1e-06, "loss": 1.5785, "step": 51 }, { "epoch": 0.0028620177224943587, "grad_norm": 4.449855327606201, "learning_rate": 5.2e-06, "loss": 1.4624, "step": 52 }, { "epoch": 0.0029170565248500193, "grad_norm": 4.127189636230469, "learning_rate": 5.300000000000001e-06, "loss": 1.6061, "step": 53 }, { "epoch": 0.00297209532720568, "grad_norm": 4.244532108306885, "learning_rate": 5.400000000000001e-06, "loss": 1.491, "step": 54 }, { "epoch": 0.0030271341295613407, "grad_norm": 3.437682628631592, "learning_rate": 5.500000000000001e-06, "loss": 1.1967, "step": 55 }, { "epoch": 0.0030821729319170017, "grad_norm": 3.83516788482666, "learning_rate": 5.600000000000001e-06, "loss": 1.4731, "step": 56 }, { "epoch": 0.003137211734272662, "grad_norm": 3.9108972549438477, "learning_rate": 5.7e-06, "loss": 1.4393, "step": 57 }, { "epoch": 0.003192250536628323, "grad_norm": 3.5258419513702393, "learning_rate": 5.8e-06, "loss": 1.4206, "step": 58 }, { "epoch": 0.0032472893389839837, "grad_norm": 4.124903678894043, "learning_rate": 5.9e-06, "loss": 1.4747, "step": 59 }, { "epoch": 0.0033023281413396446, "grad_norm": 4.055769920349121, "learning_rate": 6e-06, "loss": 1.4655, "step": 60 }, { "epoch": 0.003357366943695305, "grad_norm": 3.904837131500244, "learning_rate": 6.1e-06, "loss": 1.5125, "step": 61 }, { "epoch": 0.003412405746050966, "grad_norm": 3.2904794216156006, "learning_rate": 6.200000000000001e-06, "loss": 1.4596, "step": 62 }, { "epoch": 0.0034674445484066266, "grad_norm": 3.24053692817688, "learning_rate": 6.300000000000001e-06, "loss": 1.3851, "step": 63 }, { "epoch": 0.0035224833507622876, "grad_norm": 3.457639217376709, "learning_rate": 6.4000000000000006e-06, "loss": 1.4019, "step": 64 }, { "epoch": 0.003577522153117948, "grad_norm": 3.073054790496826, "learning_rate": 6.5000000000000004e-06, "loss": 1.2872, "step": 65 }, { "epoch": 0.003632560955473609, "grad_norm": 2.6726694107055664, "learning_rate": 6.600000000000001e-06, "loss": 1.2361, "step": 66 }, { "epoch": 0.0036875997578292696, "grad_norm": 2.9378459453582764, "learning_rate": 6.700000000000001e-06, "loss": 1.4452, "step": 67 }, { "epoch": 0.0037426385601849305, "grad_norm": 2.81107234954834, "learning_rate": 6.800000000000001e-06, "loss": 1.4804, "step": 68 }, { "epoch": 0.003797677362540591, "grad_norm": 2.60062313079834, "learning_rate": 6.9e-06, "loss": 1.3263, "step": 69 }, { "epoch": 0.003852716164896252, "grad_norm": 2.5642921924591064, "learning_rate": 7e-06, "loss": 1.2751, "step": 70 }, { "epoch": 0.0039077549672519125, "grad_norm": 2.3608031272888184, "learning_rate": 7.100000000000001e-06, "loss": 1.2614, "step": 71 }, { "epoch": 0.003962793769607573, "grad_norm": 2.7201738357543945, "learning_rate": 7.2000000000000005e-06, "loss": 1.5018, "step": 72 }, { "epoch": 0.004017832571963234, "grad_norm": 2.584726095199585, "learning_rate": 7.3e-06, "loss": 1.3519, "step": 73 }, { "epoch": 0.004072871374318895, "grad_norm": 1.9693044424057007, "learning_rate": 7.4e-06, "loss": 1.0934, "step": 74 }, { "epoch": 0.0041279101766745555, "grad_norm": 2.220736503601074, "learning_rate": 7.500000000000001e-06, "loss": 1.4687, "step": 75 }, { "epoch": 0.004182948979030216, "grad_norm": 2.2629456520080566, "learning_rate": 7.600000000000001e-06, "loss": 1.3328, "step": 76 }, { "epoch": 0.004237987781385877, "grad_norm": 2.051820993423462, "learning_rate": 7.7e-06, "loss": 1.3058, "step": 77 }, { "epoch": 0.004293026583741538, "grad_norm": 2.2451820373535156, "learning_rate": 7.800000000000002e-06, "loss": 1.3556, "step": 78 }, { "epoch": 0.004348065386097198, "grad_norm": 3.13584303855896, "learning_rate": 7.9e-06, "loss": 1.3262, "step": 79 }, { "epoch": 0.004403104188452859, "grad_norm": 5.024479866027832, "learning_rate": 8.000000000000001e-06, "loss": 1.2103, "step": 80 }, { "epoch": 0.00445814299080852, "grad_norm": 2.070889711380005, "learning_rate": 8.1e-06, "loss": 1.1994, "step": 81 }, { "epoch": 0.004513181793164181, "grad_norm": 2.797286033630371, "learning_rate": 8.2e-06, "loss": 1.3075, "step": 82 }, { "epoch": 0.004568220595519841, "grad_norm": 2.11370849609375, "learning_rate": 8.3e-06, "loss": 1.36, "step": 83 }, { "epoch": 0.004623259397875502, "grad_norm": 2.5416152477264404, "learning_rate": 8.400000000000001e-06, "loss": 1.3484, "step": 84 }, { "epoch": 0.004678298200231163, "grad_norm": 2.4702343940734863, "learning_rate": 8.5e-06, "loss": 1.3677, "step": 85 }, { "epoch": 0.004733337002586824, "grad_norm": 3.670365333557129, "learning_rate": 8.6e-06, "loss": 1.2192, "step": 86 }, { "epoch": 0.004788375804942484, "grad_norm": 2.282954692840576, "learning_rate": 8.700000000000001e-06, "loss": 1.2982, "step": 87 }, { "epoch": 0.004843414607298145, "grad_norm": 2.3659238815307617, "learning_rate": 8.8e-06, "loss": 1.3206, "step": 88 }, { "epoch": 0.004898453409653806, "grad_norm": 4.939981460571289, "learning_rate": 8.900000000000001e-06, "loss": 1.4328, "step": 89 }, { "epoch": 0.004953492212009467, "grad_norm": 2.335858106613159, "learning_rate": 9e-06, "loss": 1.2603, "step": 90 }, { "epoch": 0.005008531014365127, "grad_norm": 2.2165043354034424, "learning_rate": 9.100000000000001e-06, "loss": 1.3141, "step": 91 }, { "epoch": 0.005063569816720788, "grad_norm": 2.7872185707092285, "learning_rate": 9.200000000000002e-06, "loss": 1.3314, "step": 92 }, { "epoch": 0.005118608619076449, "grad_norm": 2.6353912353515625, "learning_rate": 9.3e-06, "loss": 1.2027, "step": 93 }, { "epoch": 0.00517364742143211, "grad_norm": 3.2509102821350098, "learning_rate": 9.4e-06, "loss": 1.2316, "step": 94 }, { "epoch": 0.00522868622378777, "grad_norm": 2.4560611248016357, "learning_rate": 9.5e-06, "loss": 1.1848, "step": 95 }, { "epoch": 0.005283725026143431, "grad_norm": 2.338151216506958, "learning_rate": 9.600000000000001e-06, "loss": 1.2392, "step": 96 }, { "epoch": 0.005338763828499092, "grad_norm": 2.231065034866333, "learning_rate": 9.7e-06, "loss": 1.2089, "step": 97 }, { "epoch": 0.005393802630854753, "grad_norm": 2.278428077697754, "learning_rate": 9.800000000000001e-06, "loss": 1.2267, "step": 98 }, { "epoch": 0.005448841433210413, "grad_norm": 2.4422810077667236, "learning_rate": 9.9e-06, "loss": 1.2041, "step": 99 }, { "epoch": 0.005503880235566074, "grad_norm": 2.216248035430908, "learning_rate": 1e-05, "loss": 1.0798, "step": 100 }, { "epoch": 0.005558919037921735, "grad_norm": 2.3301615715026855, "learning_rate": 9.99999998121067e-06, "loss": 1.3069, "step": 101 }, { "epoch": 0.0056139578402773956, "grad_norm": 2.315436363220215, "learning_rate": 9.999999924842678e-06, "loss": 1.1589, "step": 102 }, { "epoch": 0.005668996642633056, "grad_norm": 2.3522140979766846, "learning_rate": 9.999999830896024e-06, "loss": 1.0978, "step": 103 }, { "epoch": 0.0057240354449887175, "grad_norm": 2.5798308849334717, "learning_rate": 9.99999969937071e-06, "loss": 1.0599, "step": 104 }, { "epoch": 0.005779074247344378, "grad_norm": 2.456644058227539, "learning_rate": 9.999999530266738e-06, "loss": 1.1682, "step": 105 }, { "epoch": 0.0058341130497000385, "grad_norm": 2.1559031009674072, "learning_rate": 9.999999323584106e-06, "loss": 1.0631, "step": 106 }, { "epoch": 0.005889151852055699, "grad_norm": 2.2985048294067383, "learning_rate": 9.99999907932282e-06, "loss": 1.1455, "step": 107 }, { "epoch": 0.00594419065441136, "grad_norm": 2.596167802810669, "learning_rate": 9.999998797482877e-06, "loss": 1.1686, "step": 108 }, { "epoch": 0.005999229456767021, "grad_norm": 2.378618001937866, "learning_rate": 9.999998478064283e-06, "loss": 1.2226, "step": 109 }, { "epoch": 0.0060542682591226814, "grad_norm": 2.228116750717163, "learning_rate": 9.999998121067038e-06, "loss": 1.1396, "step": 110 }, { "epoch": 0.006109307061478342, "grad_norm": 2.4419472217559814, "learning_rate": 9.999997726491146e-06, "loss": 1.1401, "step": 111 }, { "epoch": 0.006164345863834003, "grad_norm": 2.0695526599884033, "learning_rate": 9.999997294336608e-06, "loss": 1.1868, "step": 112 }, { "epoch": 0.006219384666189664, "grad_norm": 2.3170363903045654, "learning_rate": 9.99999682460343e-06, "loss": 1.1172, "step": 113 }, { "epoch": 0.006274423468545324, "grad_norm": 2.670466184616089, "learning_rate": 9.999996317291615e-06, "loss": 1.2481, "step": 114 }, { "epoch": 0.006329462270900985, "grad_norm": 2.1214540004730225, "learning_rate": 9.999995772401166e-06, "loss": 0.9994, "step": 115 }, { "epoch": 0.006384501073256646, "grad_norm": 1.9283969402313232, "learning_rate": 9.999995189932085e-06, "loss": 1.0692, "step": 116 }, { "epoch": 0.006439539875612307, "grad_norm": 2.2620882987976074, "learning_rate": 9.99999456988438e-06, "loss": 1.0725, "step": 117 }, { "epoch": 0.006494578677967967, "grad_norm": 2.2121341228485107, "learning_rate": 9.999993912258055e-06, "loss": 1.1328, "step": 118 }, { "epoch": 0.006549617480323628, "grad_norm": 2.298126220703125, "learning_rate": 9.999993217053113e-06, "loss": 1.1272, "step": 119 }, { "epoch": 0.006604656282679289, "grad_norm": 1.81593656539917, "learning_rate": 9.99999248426956e-06, "loss": 1.017, "step": 120 }, { "epoch": 0.00665969508503495, "grad_norm": 2.1174378395080566, "learning_rate": 9.999991713907403e-06, "loss": 1.0557, "step": 121 }, { "epoch": 0.00671473388739061, "grad_norm": 1.9061017036437988, "learning_rate": 9.999990905966647e-06, "loss": 1.0379, "step": 122 }, { "epoch": 0.006769772689746271, "grad_norm": 1.912500023841858, "learning_rate": 9.999990060447297e-06, "loss": 1.104, "step": 123 }, { "epoch": 0.006824811492101932, "grad_norm": 1.9249529838562012, "learning_rate": 9.99998917734936e-06, "loss": 1.0136, "step": 124 }, { "epoch": 0.006879850294457593, "grad_norm": 1.8504948616027832, "learning_rate": 9.999988256672843e-06, "loss": 0.99, "step": 125 }, { "epoch": 0.006934889096813253, "grad_norm": 1.720042109489441, "learning_rate": 9.999987298417753e-06, "loss": 1.0666, "step": 126 }, { "epoch": 0.006989927899168914, "grad_norm": 1.778251051902771, "learning_rate": 9.999986302584097e-06, "loss": 1.0424, "step": 127 }, { "epoch": 0.007044966701524575, "grad_norm": 1.9485961198806763, "learning_rate": 9.999985269171881e-06, "loss": 1.105, "step": 128 }, { "epoch": 0.007100005503880236, "grad_norm": 3.0802104473114014, "learning_rate": 9.999984198181114e-06, "loss": 1.1081, "step": 129 }, { "epoch": 0.007155044306235896, "grad_norm": 1.7476954460144043, "learning_rate": 9.999983089611806e-06, "loss": 0.9677, "step": 130 }, { "epoch": 0.007210083108591557, "grad_norm": 1.6127299070358276, "learning_rate": 9.999981943463963e-06, "loss": 0.9937, "step": 131 }, { "epoch": 0.007265121910947218, "grad_norm": 2.1477208137512207, "learning_rate": 9.999980759737594e-06, "loss": 1.0319, "step": 132 }, { "epoch": 0.007320160713302879, "grad_norm": 1.531163215637207, "learning_rate": 9.999979538432707e-06, "loss": 0.8696, "step": 133 }, { "epoch": 0.007375199515658539, "grad_norm": 1.8226820230484009, "learning_rate": 9.999978279549313e-06, "loss": 1.2061, "step": 134 }, { "epoch": 0.0074302383180142, "grad_norm": 1.481895923614502, "learning_rate": 9.99997698308742e-06, "loss": 0.949, "step": 135 }, { "epoch": 0.007485277120369861, "grad_norm": 1.6715927124023438, "learning_rate": 9.99997564904704e-06, "loss": 1.1579, "step": 136 }, { "epoch": 0.0075403159227255215, "grad_norm": 1.4235272407531738, "learning_rate": 9.999974277428179e-06, "loss": 1.064, "step": 137 }, { "epoch": 0.007595354725081182, "grad_norm": 1.3524872064590454, "learning_rate": 9.999972868230852e-06, "loss": 0.9141, "step": 138 }, { "epoch": 0.007650393527436843, "grad_norm": 1.3741765022277832, "learning_rate": 9.999971421455066e-06, "loss": 1.0256, "step": 139 }, { "epoch": 0.007705432329792504, "grad_norm": 1.9869598150253296, "learning_rate": 9.999969937100835e-06, "loss": 0.9489, "step": 140 }, { "epoch": 0.0077604711321481645, "grad_norm": 1.4785465002059937, "learning_rate": 9.999968415168166e-06, "loss": 0.9243, "step": 141 }, { "epoch": 0.007815509934503825, "grad_norm": 1.5476176738739014, "learning_rate": 9.999966855657074e-06, "loss": 1.178, "step": 142 }, { "epoch": 0.007870548736859486, "grad_norm": 1.500401258468628, "learning_rate": 9.99996525856757e-06, "loss": 0.9837, "step": 143 }, { "epoch": 0.007925587539215146, "grad_norm": 1.3777157068252563, "learning_rate": 9.999963623899664e-06, "loss": 1.0732, "step": 144 }, { "epoch": 0.007980626341570807, "grad_norm": 1.4466841220855713, "learning_rate": 9.99996195165337e-06, "loss": 0.9779, "step": 145 }, { "epoch": 0.008035665143926469, "grad_norm": 1.5304051637649536, "learning_rate": 9.9999602418287e-06, "loss": 1.196, "step": 146 }, { "epoch": 0.008090703946282128, "grad_norm": 1.9012362957000732, "learning_rate": 9.99995849442567e-06, "loss": 0.9797, "step": 147 }, { "epoch": 0.00814574274863779, "grad_norm": 1.430679202079773, "learning_rate": 9.999956709444289e-06, "loss": 0.9869, "step": 148 }, { "epoch": 0.00820078155099345, "grad_norm": 1.3489817380905151, "learning_rate": 9.99995488688457e-06, "loss": 1.0137, "step": 149 }, { "epoch": 0.008255820353349111, "grad_norm": 1.1878125667572021, "learning_rate": 9.999953026746531e-06, "loss": 0.9355, "step": 150 }, { "epoch": 0.008310859155704772, "grad_norm": 1.3481942415237427, "learning_rate": 9.999951129030182e-06, "loss": 1.1235, "step": 151 }, { "epoch": 0.008365897958060432, "grad_norm": 1.7335314750671387, "learning_rate": 9.999949193735539e-06, "loss": 0.9382, "step": 152 }, { "epoch": 0.008420936760416093, "grad_norm": 1.2029480934143066, "learning_rate": 9.999947220862615e-06, "loss": 0.9419, "step": 153 }, { "epoch": 0.008475975562771755, "grad_norm": 1.2104203701019287, "learning_rate": 9.999945210411428e-06, "loss": 0.9196, "step": 154 }, { "epoch": 0.008531014365127414, "grad_norm": 1.1857126951217651, "learning_rate": 9.999943162381991e-06, "loss": 0.9421, "step": 155 }, { "epoch": 0.008586053167483076, "grad_norm": 1.115027904510498, "learning_rate": 9.999941076774319e-06, "loss": 0.9634, "step": 156 }, { "epoch": 0.008641091969838737, "grad_norm": 1.4227553606033325, "learning_rate": 9.999938953588428e-06, "loss": 1.0036, "step": 157 }, { "epoch": 0.008696130772194397, "grad_norm": 1.2913776636123657, "learning_rate": 9.999936792824334e-06, "loss": 0.9232, "step": 158 }, { "epoch": 0.008751169574550058, "grad_norm": 1.2817318439483643, "learning_rate": 9.999934594482055e-06, "loss": 0.9691, "step": 159 }, { "epoch": 0.008806208376905718, "grad_norm": 1.5647841691970825, "learning_rate": 9.999932358561604e-06, "loss": 1.1842, "step": 160 }, { "epoch": 0.00886124717926138, "grad_norm": 1.368135929107666, "learning_rate": 9.999930085063002e-06, "loss": 1.0873, "step": 161 }, { "epoch": 0.00891628598161704, "grad_norm": 1.2297240495681763, "learning_rate": 9.999927773986262e-06, "loss": 1.0778, "step": 162 }, { "epoch": 0.0089713247839727, "grad_norm": 1.0658279657363892, "learning_rate": 9.999925425331405e-06, "loss": 0.9008, "step": 163 }, { "epoch": 0.009026363586328362, "grad_norm": 1.3484326601028442, "learning_rate": 9.999923039098445e-06, "loss": 1.0664, "step": 164 }, { "epoch": 0.009081402388684023, "grad_norm": 1.1839075088500977, "learning_rate": 9.999920615287401e-06, "loss": 0.9257, "step": 165 }, { "epoch": 0.009136441191039683, "grad_norm": 1.2757254838943481, "learning_rate": 9.999918153898295e-06, "loss": 0.9473, "step": 166 }, { "epoch": 0.009191479993395344, "grad_norm": 1.2414579391479492, "learning_rate": 9.99991565493114e-06, "loss": 1.1091, "step": 167 }, { "epoch": 0.009246518795751004, "grad_norm": 1.2802611589431763, "learning_rate": 9.999913118385959e-06, "loss": 1.063, "step": 168 }, { "epoch": 0.009301557598106665, "grad_norm": 1.2055327892303467, "learning_rate": 9.99991054426277e-06, "loss": 0.8, "step": 169 }, { "epoch": 0.009356596400462327, "grad_norm": 1.0391098260879517, "learning_rate": 9.99990793256159e-06, "loss": 0.8672, "step": 170 }, { "epoch": 0.009411635202817986, "grad_norm": 1.131536602973938, "learning_rate": 9.99990528328244e-06, "loss": 0.9569, "step": 171 }, { "epoch": 0.009466674005173648, "grad_norm": 1.164307951927185, "learning_rate": 9.999902596425342e-06, "loss": 0.9999, "step": 172 }, { "epoch": 0.009521712807529309, "grad_norm": 1.2099504470825195, "learning_rate": 9.999899871990313e-06, "loss": 0.9994, "step": 173 }, { "epoch": 0.009576751609884969, "grad_norm": 1.7294539213180542, "learning_rate": 9.999897109977376e-06, "loss": 1.0265, "step": 174 }, { "epoch": 0.00963179041224063, "grad_norm": 1.3009883165359497, "learning_rate": 9.99989431038655e-06, "loss": 0.9022, "step": 175 }, { "epoch": 0.00968682921459629, "grad_norm": 1.1014611721038818, "learning_rate": 9.999891473217857e-06, "loss": 0.8476, "step": 176 }, { "epoch": 0.009741868016951951, "grad_norm": 1.2410900592803955, "learning_rate": 9.99988859847132e-06, "loss": 1.0272, "step": 177 }, { "epoch": 0.009796906819307612, "grad_norm": 1.336348295211792, "learning_rate": 9.999885686146957e-06, "loss": 0.9456, "step": 178 }, { "epoch": 0.009851945621663272, "grad_norm": 1.2931095361709595, "learning_rate": 9.99988273624479e-06, "loss": 0.9554, "step": 179 }, { "epoch": 0.009906984424018933, "grad_norm": 1.2647838592529297, "learning_rate": 9.999879748764845e-06, "loss": 1.0394, "step": 180 }, { "epoch": 0.009962023226374595, "grad_norm": 1.3485127687454224, "learning_rate": 9.99987672370714e-06, "loss": 1.1016, "step": 181 }, { "epoch": 0.010017062028730254, "grad_norm": 1.110187292098999, "learning_rate": 9.999873661071702e-06, "loss": 0.946, "step": 182 }, { "epoch": 0.010072100831085916, "grad_norm": 1.0991623401641846, "learning_rate": 9.999870560858551e-06, "loss": 1.0084, "step": 183 }, { "epoch": 0.010127139633441576, "grad_norm": 1.049804449081421, "learning_rate": 9.999867423067713e-06, "loss": 0.8264, "step": 184 }, { "epoch": 0.010182178435797237, "grad_norm": 1.0947058200836182, "learning_rate": 9.999864247699207e-06, "loss": 0.8884, "step": 185 }, { "epoch": 0.010237217238152898, "grad_norm": 1.1147902011871338, "learning_rate": 9.999861034753061e-06, "loss": 0.9657, "step": 186 }, { "epoch": 0.010292256040508558, "grad_norm": 1.260027527809143, "learning_rate": 9.999857784229298e-06, "loss": 1.0102, "step": 187 }, { "epoch": 0.01034729484286422, "grad_norm": 1.1275582313537598, "learning_rate": 9.999854496127942e-06, "loss": 1.028, "step": 188 }, { "epoch": 0.01040233364521988, "grad_norm": 1.1377174854278564, "learning_rate": 9.999851170449018e-06, "loss": 1.032, "step": 189 }, { "epoch": 0.01045737244757554, "grad_norm": 1.1734225749969482, "learning_rate": 9.999847807192552e-06, "loss": 1.0009, "step": 190 }, { "epoch": 0.010512411249931202, "grad_norm": 1.1934596300125122, "learning_rate": 9.999844406358565e-06, "loss": 1.0432, "step": 191 }, { "epoch": 0.010567450052286861, "grad_norm": 1.0638024806976318, "learning_rate": 9.99984096794709e-06, "loss": 0.8651, "step": 192 }, { "epoch": 0.010622488854642523, "grad_norm": 1.2381829023361206, "learning_rate": 9.999837491958147e-06, "loss": 1.0088, "step": 193 }, { "epoch": 0.010677527656998184, "grad_norm": 1.030246615409851, "learning_rate": 9.999833978391763e-06, "loss": 0.9488, "step": 194 }, { "epoch": 0.010732566459353844, "grad_norm": 1.1640657186508179, "learning_rate": 9.999830427247965e-06, "loss": 1.0588, "step": 195 }, { "epoch": 0.010787605261709505, "grad_norm": 1.0431616306304932, "learning_rate": 9.99982683852678e-06, "loss": 0.8728, "step": 196 }, { "epoch": 0.010842644064065167, "grad_norm": 1.032263159751892, "learning_rate": 9.999823212228235e-06, "loss": 0.9498, "step": 197 }, { "epoch": 0.010897682866420826, "grad_norm": 1.1383745670318604, "learning_rate": 9.999819548352358e-06, "loss": 0.9498, "step": 198 }, { "epoch": 0.010952721668776488, "grad_norm": 1.1324639320373535, "learning_rate": 9.999815846899175e-06, "loss": 1.0432, "step": 199 }, { "epoch": 0.011007760471132147, "grad_norm": 1.188672661781311, "learning_rate": 9.999812107868714e-06, "loss": 0.982, "step": 200 }, { "epoch": 0.011062799273487809, "grad_norm": 1.1011098623275757, "learning_rate": 9.999808331261005e-06, "loss": 0.9587, "step": 201 }, { "epoch": 0.01111783807584347, "grad_norm": 1.1782938241958618, "learning_rate": 9.999804517076073e-06, "loss": 1.0659, "step": 202 }, { "epoch": 0.01117287687819913, "grad_norm": 1.0520117282867432, "learning_rate": 9.99980066531395e-06, "loss": 1.0056, "step": 203 }, { "epoch": 0.011227915680554791, "grad_norm": 1.1584919691085815, "learning_rate": 9.999796775974663e-06, "loss": 0.9435, "step": 204 }, { "epoch": 0.011282954482910452, "grad_norm": 1.2201849222183228, "learning_rate": 9.999792849058242e-06, "loss": 1.0562, "step": 205 }, { "epoch": 0.011337993285266112, "grad_norm": 1.2985976934432983, "learning_rate": 9.999788884564715e-06, "loss": 1.0126, "step": 206 }, { "epoch": 0.011393032087621774, "grad_norm": 0.9926307201385498, "learning_rate": 9.999784882494115e-06, "loss": 0.7875, "step": 207 }, { "epoch": 0.011448070889977435, "grad_norm": 1.103365182876587, "learning_rate": 9.99978084284647e-06, "loss": 0.9833, "step": 208 }, { "epoch": 0.011503109692333095, "grad_norm": 1.1798462867736816, "learning_rate": 9.99977676562181e-06, "loss": 0.8479, "step": 209 }, { "epoch": 0.011558148494688756, "grad_norm": 1.2887194156646729, "learning_rate": 9.999772650820168e-06, "loss": 0.9606, "step": 210 }, { "epoch": 0.011613187297044416, "grad_norm": 1.1120634078979492, "learning_rate": 9.99976849844157e-06, "loss": 0.9604, "step": 211 }, { "epoch": 0.011668226099400077, "grad_norm": 1.1248979568481445, "learning_rate": 9.999764308486052e-06, "loss": 0.9428, "step": 212 }, { "epoch": 0.011723264901755738, "grad_norm": 1.274610161781311, "learning_rate": 9.999760080953643e-06, "loss": 0.9044, "step": 213 }, { "epoch": 0.011778303704111398, "grad_norm": 1.1746865510940552, "learning_rate": 9.999755815844377e-06, "loss": 0.9114, "step": 214 }, { "epoch": 0.01183334250646706, "grad_norm": 1.2531086206436157, "learning_rate": 9.999751513158282e-06, "loss": 1.0785, "step": 215 }, { "epoch": 0.01188838130882272, "grad_norm": 1.0789539813995361, "learning_rate": 9.999747172895395e-06, "loss": 0.9794, "step": 216 }, { "epoch": 0.01194342011117838, "grad_norm": 1.1805329322814941, "learning_rate": 9.999742795055746e-06, "loss": 0.9602, "step": 217 }, { "epoch": 0.011998458913534042, "grad_norm": 2.309329032897949, "learning_rate": 9.99973837963937e-06, "loss": 0.9482, "step": 218 }, { "epoch": 0.012053497715889702, "grad_norm": 1.2379088401794434, "learning_rate": 9.999733926646296e-06, "loss": 1.0237, "step": 219 }, { "epoch": 0.012108536518245363, "grad_norm": 1.1581377983093262, "learning_rate": 9.999729436076562e-06, "loss": 1.0583, "step": 220 }, { "epoch": 0.012163575320601024, "grad_norm": 1.3006727695465088, "learning_rate": 9.999724907930199e-06, "loss": 0.9581, "step": 221 }, { "epoch": 0.012218614122956684, "grad_norm": 1.3215982913970947, "learning_rate": 9.999720342207243e-06, "loss": 0.9438, "step": 222 }, { "epoch": 0.012273652925312345, "grad_norm": 1.1107337474822998, "learning_rate": 9.999715738907727e-06, "loss": 0.9987, "step": 223 }, { "epoch": 0.012328691727668007, "grad_norm": 1.0745457410812378, "learning_rate": 9.999711098031685e-06, "loss": 0.9637, "step": 224 }, { "epoch": 0.012383730530023666, "grad_norm": 1.110861897468567, "learning_rate": 9.999706419579154e-06, "loss": 1.0225, "step": 225 }, { "epoch": 0.012438769332379328, "grad_norm": 1.0755527019500732, "learning_rate": 9.999701703550167e-06, "loss": 1.0204, "step": 226 }, { "epoch": 0.012493808134734987, "grad_norm": 1.1694976091384888, "learning_rate": 9.99969694994476e-06, "loss": 1.0566, "step": 227 }, { "epoch": 0.012548846937090649, "grad_norm": 1.455856442451477, "learning_rate": 9.99969215876297e-06, "loss": 0.9397, "step": 228 }, { "epoch": 0.01260388573944631, "grad_norm": 1.0707073211669922, "learning_rate": 9.99968733000483e-06, "loss": 0.8286, "step": 229 }, { "epoch": 0.01265892454180197, "grad_norm": 1.189548134803772, "learning_rate": 9.99968246367038e-06, "loss": 0.8762, "step": 230 }, { "epoch": 0.012713963344157631, "grad_norm": 1.1439214944839478, "learning_rate": 9.999677559759655e-06, "loss": 0.9187, "step": 231 }, { "epoch": 0.012769002146513293, "grad_norm": 1.2329761981964111, "learning_rate": 9.999672618272691e-06, "loss": 1.0374, "step": 232 }, { "epoch": 0.012824040948868952, "grad_norm": 1.1545134782791138, "learning_rate": 9.999667639209527e-06, "loss": 0.9343, "step": 233 }, { "epoch": 0.012879079751224614, "grad_norm": 1.0946775674819946, "learning_rate": 9.999662622570198e-06, "loss": 0.9568, "step": 234 }, { "epoch": 0.012934118553580273, "grad_norm": 1.2099589109420776, "learning_rate": 9.999657568354743e-06, "loss": 1.0364, "step": 235 }, { "epoch": 0.012989157355935935, "grad_norm": 1.09062922000885, "learning_rate": 9.999652476563202e-06, "loss": 1.0289, "step": 236 }, { "epoch": 0.013044196158291596, "grad_norm": 1.154557228088379, "learning_rate": 9.999647347195612e-06, "loss": 0.9925, "step": 237 }, { "epoch": 0.013099234960647256, "grad_norm": 1.025374174118042, "learning_rate": 9.999642180252008e-06, "loss": 0.9346, "step": 238 }, { "epoch": 0.013154273763002917, "grad_norm": 1.1473641395568848, "learning_rate": 9.999636975732433e-06, "loss": 1.0244, "step": 239 }, { "epoch": 0.013209312565358578, "grad_norm": 1.0421240329742432, "learning_rate": 9.999631733636923e-06, "loss": 0.9368, "step": 240 }, { "epoch": 0.013264351367714238, "grad_norm": 1.1076610088348389, "learning_rate": 9.99962645396552e-06, "loss": 1.0276, "step": 241 }, { "epoch": 0.0133193901700699, "grad_norm": 1.143559455871582, "learning_rate": 9.999621136718266e-06, "loss": 0.9626, "step": 242 }, { "epoch": 0.01337442897242556, "grad_norm": 1.0958378314971924, "learning_rate": 9.999615781895195e-06, "loss": 1.0254, "step": 243 }, { "epoch": 0.01342946777478122, "grad_norm": 1.117688536643982, "learning_rate": 9.99961038949635e-06, "loss": 0.9685, "step": 244 }, { "epoch": 0.013484506577136882, "grad_norm": 1.1645647287368774, "learning_rate": 9.999604959521771e-06, "loss": 1.0666, "step": 245 }, { "epoch": 0.013539545379492542, "grad_norm": 1.1238516569137573, "learning_rate": 9.999599491971502e-06, "loss": 1.0252, "step": 246 }, { "epoch": 0.013594584181848203, "grad_norm": 1.0196914672851562, "learning_rate": 9.999593986845579e-06, "loss": 0.9389, "step": 247 }, { "epoch": 0.013649622984203864, "grad_norm": 1.0231372117996216, "learning_rate": 9.999588444144049e-06, "loss": 0.8786, "step": 248 }, { "epoch": 0.013704661786559524, "grad_norm": 1.2504147291183472, "learning_rate": 9.999582863866947e-06, "loss": 1.0969, "step": 249 }, { "epoch": 0.013759700588915185, "grad_norm": 1.1123549938201904, "learning_rate": 9.99957724601432e-06, "loss": 0.8833, "step": 250 }, { "epoch": 0.013814739391270847, "grad_norm": 1.1068202257156372, "learning_rate": 9.999571590586208e-06, "loss": 0.9709, "step": 251 }, { "epoch": 0.013869778193626506, "grad_norm": 0.9891651272773743, "learning_rate": 9.999565897582655e-06, "loss": 0.8598, "step": 252 }, { "epoch": 0.013924816995982168, "grad_norm": 0.9866491556167603, "learning_rate": 9.999560167003703e-06, "loss": 0.8101, "step": 253 }, { "epoch": 0.013979855798337828, "grad_norm": 1.0862594842910767, "learning_rate": 9.999554398849396e-06, "loss": 0.9411, "step": 254 }, { "epoch": 0.014034894600693489, "grad_norm": 1.1898949146270752, "learning_rate": 9.999548593119774e-06, "loss": 0.9548, "step": 255 }, { "epoch": 0.01408993340304915, "grad_norm": 1.2167880535125732, "learning_rate": 9.999542749814886e-06, "loss": 1.0302, "step": 256 }, { "epoch": 0.01414497220540481, "grad_norm": 1.0784146785736084, "learning_rate": 9.999536868934771e-06, "loss": 0.8875, "step": 257 }, { "epoch": 0.014200011007760471, "grad_norm": 1.1128027439117432, "learning_rate": 9.999530950479475e-06, "loss": 0.9498, "step": 258 }, { "epoch": 0.014255049810116133, "grad_norm": 1.1311595439910889, "learning_rate": 9.999524994449044e-06, "loss": 0.9035, "step": 259 }, { "epoch": 0.014310088612471792, "grad_norm": 1.225615382194519, "learning_rate": 9.999519000843521e-06, "loss": 1.0104, "step": 260 }, { "epoch": 0.014365127414827454, "grad_norm": 1.2347793579101562, "learning_rate": 9.99951296966295e-06, "loss": 1.0288, "step": 261 }, { "epoch": 0.014420166217183113, "grad_norm": 1.1837103366851807, "learning_rate": 9.99950690090738e-06, "loss": 0.9553, "step": 262 }, { "epoch": 0.014475205019538775, "grad_norm": 1.1985397338867188, "learning_rate": 9.999500794576852e-06, "loss": 0.9561, "step": 263 }, { "epoch": 0.014530243821894436, "grad_norm": 1.036928415298462, "learning_rate": 9.999494650671418e-06, "loss": 0.8906, "step": 264 }, { "epoch": 0.014585282624250096, "grad_norm": 1.0797842741012573, "learning_rate": 9.999488469191116e-06, "loss": 0.8975, "step": 265 }, { "epoch": 0.014640321426605757, "grad_norm": 1.0571156740188599, "learning_rate": 9.999482250136e-06, "loss": 0.9334, "step": 266 }, { "epoch": 0.014695360228961419, "grad_norm": 1.2065023183822632, "learning_rate": 9.999475993506114e-06, "loss": 0.8986, "step": 267 }, { "epoch": 0.014750399031317078, "grad_norm": 1.201586127281189, "learning_rate": 9.999469699301502e-06, "loss": 0.9192, "step": 268 }, { "epoch": 0.01480543783367274, "grad_norm": 1.0470168590545654, "learning_rate": 9.999463367522216e-06, "loss": 0.8604, "step": 269 }, { "epoch": 0.0148604766360284, "grad_norm": 1.1142147779464722, "learning_rate": 9.9994569981683e-06, "loss": 0.9847, "step": 270 }, { "epoch": 0.01491551543838406, "grad_norm": 1.0352061986923218, "learning_rate": 9.999450591239805e-06, "loss": 0.8927, "step": 271 }, { "epoch": 0.014970554240739722, "grad_norm": 1.0353184938430786, "learning_rate": 9.999444146736779e-06, "loss": 0.8435, "step": 272 }, { "epoch": 0.015025593043095382, "grad_norm": 1.2091951370239258, "learning_rate": 9.999437664659267e-06, "loss": 0.8959, "step": 273 }, { "epoch": 0.015080631845451043, "grad_norm": 1.006361722946167, "learning_rate": 9.999431145007319e-06, "loss": 0.8579, "step": 274 }, { "epoch": 0.015135670647806704, "grad_norm": 1.1265509128570557, "learning_rate": 9.999424587780985e-06, "loss": 0.8808, "step": 275 }, { "epoch": 0.015190709450162364, "grad_norm": 1.060882568359375, "learning_rate": 9.999417992980317e-06, "loss": 1.044, "step": 276 }, { "epoch": 0.015245748252518026, "grad_norm": 1.0216747522354126, "learning_rate": 9.999411360605358e-06, "loss": 0.7773, "step": 277 }, { "epoch": 0.015300787054873685, "grad_norm": 1.1382462978363037, "learning_rate": 9.999404690656163e-06, "loss": 0.8954, "step": 278 }, { "epoch": 0.015355825857229347, "grad_norm": 1.113815188407898, "learning_rate": 9.99939798313278e-06, "loss": 0.8143, "step": 279 }, { "epoch": 0.015410864659585008, "grad_norm": 1.123530387878418, "learning_rate": 9.99939123803526e-06, "loss": 0.8872, "step": 280 }, { "epoch": 0.015465903461940668, "grad_norm": 1.0873669385910034, "learning_rate": 9.999384455363656e-06, "loss": 1.008, "step": 281 }, { "epoch": 0.015520942264296329, "grad_norm": 1.5956637859344482, "learning_rate": 9.999377635118014e-06, "loss": 0.9456, "step": 282 }, { "epoch": 0.01557598106665199, "grad_norm": 1.1471425294876099, "learning_rate": 9.999370777298389e-06, "loss": 0.9897, "step": 283 }, { "epoch": 0.01563101986900765, "grad_norm": 0.9960193634033203, "learning_rate": 9.999363881904831e-06, "loss": 0.8196, "step": 284 }, { "epoch": 0.01568605867136331, "grad_norm": 1.1033951044082642, "learning_rate": 9.999356948937393e-06, "loss": 0.879, "step": 285 }, { "epoch": 0.015741097473718973, "grad_norm": 1.157765507698059, "learning_rate": 9.999349978396126e-06, "loss": 1.0116, "step": 286 }, { "epoch": 0.015796136276074634, "grad_norm": 1.0472352504730225, "learning_rate": 9.999342970281084e-06, "loss": 0.8657, "step": 287 }, { "epoch": 0.015851175078430292, "grad_norm": 1.1346659660339355, "learning_rate": 9.999335924592315e-06, "loss": 0.8482, "step": 288 }, { "epoch": 0.015906213880785953, "grad_norm": 1.1164487600326538, "learning_rate": 9.999328841329879e-06, "loss": 1.0542, "step": 289 }, { "epoch": 0.015961252683141615, "grad_norm": 1.1890591382980347, "learning_rate": 9.999321720493825e-06, "loss": 0.9598, "step": 290 }, { "epoch": 0.016016291485497276, "grad_norm": 1.0419867038726807, "learning_rate": 9.999314562084205e-06, "loss": 0.9548, "step": 291 }, { "epoch": 0.016071330287852938, "grad_norm": 1.0652042627334595, "learning_rate": 9.999307366101077e-06, "loss": 0.9359, "step": 292 }, { "epoch": 0.016126369090208596, "grad_norm": 1.0166404247283936, "learning_rate": 9.999300132544492e-06, "loss": 0.9276, "step": 293 }, { "epoch": 0.016181407892564257, "grad_norm": 1.1638866662979126, "learning_rate": 9.999292861414507e-06, "loss": 0.957, "step": 294 }, { "epoch": 0.01623644669491992, "grad_norm": 1.5505993366241455, "learning_rate": 9.999285552711173e-06, "loss": 0.9878, "step": 295 }, { "epoch": 0.01629148549727558, "grad_norm": 1.177262783050537, "learning_rate": 9.999278206434549e-06, "loss": 0.8631, "step": 296 }, { "epoch": 0.01634652429963124, "grad_norm": 1.8578168153762817, "learning_rate": 9.999270822584687e-06, "loss": 0.9684, "step": 297 }, { "epoch": 0.0164015631019869, "grad_norm": 1.2617360353469849, "learning_rate": 9.999263401161643e-06, "loss": 1.014, "step": 298 }, { "epoch": 0.01645660190434256, "grad_norm": 0.9740132689476013, "learning_rate": 9.999255942165475e-06, "loss": 0.8606, "step": 299 }, { "epoch": 0.016511640706698222, "grad_norm": 0.9821745753288269, "learning_rate": 9.999248445596238e-06, "loss": 0.8241, "step": 300 }, { "epoch": 0.016566679509053883, "grad_norm": 1.0200445652008057, "learning_rate": 9.999240911453986e-06, "loss": 0.8256, "step": 301 }, { "epoch": 0.016621718311409545, "grad_norm": 1.4100390672683716, "learning_rate": 9.999233339738779e-06, "loss": 0.9057, "step": 302 }, { "epoch": 0.016676757113765206, "grad_norm": 1.056544303894043, "learning_rate": 9.99922573045067e-06, "loss": 1.0808, "step": 303 }, { "epoch": 0.016731795916120864, "grad_norm": 0.9271026253700256, "learning_rate": 9.99921808358972e-06, "loss": 0.878, "step": 304 }, { "epoch": 0.016786834718476525, "grad_norm": 0.9864157438278198, "learning_rate": 9.999210399155987e-06, "loss": 0.9198, "step": 305 }, { "epoch": 0.016841873520832187, "grad_norm": 1.093995451927185, "learning_rate": 9.999202677149525e-06, "loss": 0.9794, "step": 306 }, { "epoch": 0.016896912323187848, "grad_norm": 0.9717912077903748, "learning_rate": 9.999194917570395e-06, "loss": 0.8764, "step": 307 }, { "epoch": 0.01695195112554351, "grad_norm": 1.0026428699493408, "learning_rate": 9.999187120418653e-06, "loss": 0.8526, "step": 308 }, { "epoch": 0.017006989927899167, "grad_norm": 1.122870922088623, "learning_rate": 9.999179285694359e-06, "loss": 0.9773, "step": 309 }, { "epoch": 0.01706202873025483, "grad_norm": 1.0522836446762085, "learning_rate": 9.999171413397572e-06, "loss": 1.0183, "step": 310 }, { "epoch": 0.01711706753261049, "grad_norm": 0.9303658604621887, "learning_rate": 9.99916350352835e-06, "loss": 0.8402, "step": 311 }, { "epoch": 0.01717210633496615, "grad_norm": 0.9606096148490906, "learning_rate": 9.999155556086755e-06, "loss": 0.9692, "step": 312 }, { "epoch": 0.017227145137321813, "grad_norm": 1.176992416381836, "learning_rate": 9.999147571072844e-06, "loss": 0.8172, "step": 313 }, { "epoch": 0.017282183939677474, "grad_norm": 1.1948801279067993, "learning_rate": 9.999139548486678e-06, "loss": 1.0205, "step": 314 }, { "epoch": 0.017337222742033132, "grad_norm": 1.0064897537231445, "learning_rate": 9.999131488328318e-06, "loss": 0.9479, "step": 315 }, { "epoch": 0.017392261544388794, "grad_norm": 1.048242449760437, "learning_rate": 9.999123390597822e-06, "loss": 0.9862, "step": 316 }, { "epoch": 0.017447300346744455, "grad_norm": 1.12875497341156, "learning_rate": 9.999115255295256e-06, "loss": 0.9743, "step": 317 }, { "epoch": 0.017502339149100116, "grad_norm": 1.0607460737228394, "learning_rate": 9.999107082420674e-06, "loss": 0.8878, "step": 318 }, { "epoch": 0.017557377951455778, "grad_norm": 1.1480191946029663, "learning_rate": 9.999098871974144e-06, "loss": 0.8769, "step": 319 }, { "epoch": 0.017612416753811436, "grad_norm": 1.1150004863739014, "learning_rate": 9.999090623955724e-06, "loss": 0.8615, "step": 320 }, { "epoch": 0.017667455556167097, "grad_norm": 1.137839913368225, "learning_rate": 9.999082338365478e-06, "loss": 0.9703, "step": 321 }, { "epoch": 0.01772249435852276, "grad_norm": 1.0883489847183228, "learning_rate": 9.999074015203467e-06, "loss": 0.9273, "step": 322 }, { "epoch": 0.01777753316087842, "grad_norm": 1.0999557971954346, "learning_rate": 9.999065654469752e-06, "loss": 0.9605, "step": 323 }, { "epoch": 0.01783257196323408, "grad_norm": 0.9911689758300781, "learning_rate": 9.999057256164401e-06, "loss": 0.9117, "step": 324 }, { "epoch": 0.01788761076558974, "grad_norm": 1.040933609008789, "learning_rate": 9.999048820287472e-06, "loss": 0.9229, "step": 325 }, { "epoch": 0.0179426495679454, "grad_norm": 1.4341392517089844, "learning_rate": 9.999040346839031e-06, "loss": 1.0718, "step": 326 }, { "epoch": 0.017997688370301062, "grad_norm": 1.0246332883834839, "learning_rate": 9.99903183581914e-06, "loss": 0.9617, "step": 327 }, { "epoch": 0.018052727172656723, "grad_norm": 10.162322998046875, "learning_rate": 9.999023287227863e-06, "loss": 1.0391, "step": 328 }, { "epoch": 0.018107765975012385, "grad_norm": 1.3370027542114258, "learning_rate": 9.999014701065266e-06, "loss": 1.0211, "step": 329 }, { "epoch": 0.018162804777368046, "grad_norm": 1.0146219730377197, "learning_rate": 9.999006077331413e-06, "loss": 0.8611, "step": 330 }, { "epoch": 0.018217843579723704, "grad_norm": 1.0899269580841064, "learning_rate": 9.998997416026368e-06, "loss": 0.9209, "step": 331 }, { "epoch": 0.018272882382079365, "grad_norm": 1.1343204975128174, "learning_rate": 9.998988717150198e-06, "loss": 0.9405, "step": 332 }, { "epoch": 0.018327921184435027, "grad_norm": 1.2308380603790283, "learning_rate": 9.998979980702965e-06, "loss": 0.9579, "step": 333 }, { "epoch": 0.018382959986790688, "grad_norm": 1.1433519124984741, "learning_rate": 9.998971206684737e-06, "loss": 1.0045, "step": 334 }, { "epoch": 0.01843799878914635, "grad_norm": 1.0585781335830688, "learning_rate": 9.99896239509558e-06, "loss": 0.9171, "step": 335 }, { "epoch": 0.018493037591502007, "grad_norm": 1.2735164165496826, "learning_rate": 9.99895354593556e-06, "loss": 1.1001, "step": 336 }, { "epoch": 0.01854807639385767, "grad_norm": 1.2905755043029785, "learning_rate": 9.998944659204744e-06, "loss": 1.0294, "step": 337 }, { "epoch": 0.01860311519621333, "grad_norm": 1.1442075967788696, "learning_rate": 9.998935734903198e-06, "loss": 0.9385, "step": 338 }, { "epoch": 0.01865815399856899, "grad_norm": 1.1005232334136963, "learning_rate": 9.998926773030987e-06, "loss": 1.026, "step": 339 }, { "epoch": 0.018713192800924653, "grad_norm": 1.2770785093307495, "learning_rate": 9.998917773588182e-06, "loss": 1.0015, "step": 340 }, { "epoch": 0.01876823160328031, "grad_norm": 1.0963070392608643, "learning_rate": 9.998908736574849e-06, "loss": 0.9347, "step": 341 }, { "epoch": 0.018823270405635972, "grad_norm": 1.10364830493927, "learning_rate": 9.998899661991055e-06, "loss": 0.869, "step": 342 }, { "epoch": 0.018878309207991634, "grad_norm": 1.0364975929260254, "learning_rate": 9.99889054983687e-06, "loss": 0.9855, "step": 343 }, { "epoch": 0.018933348010347295, "grad_norm": 1.104702115058899, "learning_rate": 9.998881400112362e-06, "loss": 0.9555, "step": 344 }, { "epoch": 0.018988386812702956, "grad_norm": 0.9957441687583923, "learning_rate": 9.998872212817599e-06, "loss": 0.9634, "step": 345 }, { "epoch": 0.019043425615058618, "grad_norm": 1.262271523475647, "learning_rate": 9.998862987952651e-06, "loss": 1.0133, "step": 346 }, { "epoch": 0.019098464417414276, "grad_norm": 1.2075226306915283, "learning_rate": 9.998853725517587e-06, "loss": 1.0588, "step": 347 }, { "epoch": 0.019153503219769937, "grad_norm": 1.0609898567199707, "learning_rate": 9.998844425512477e-06, "loss": 0.9952, "step": 348 }, { "epoch": 0.0192085420221256, "grad_norm": 1.1930195093154907, "learning_rate": 9.998835087937389e-06, "loss": 0.9617, "step": 349 }, { "epoch": 0.01926358082448126, "grad_norm": 1.2359932661056519, "learning_rate": 9.998825712792396e-06, "loss": 0.8768, "step": 350 }, { "epoch": 0.01931861962683692, "grad_norm": 0.9984115362167358, "learning_rate": 9.998816300077566e-06, "loss": 0.8205, "step": 351 }, { "epoch": 0.01937365842919258, "grad_norm": 1.6853677034378052, "learning_rate": 9.998806849792972e-06, "loss": 0.9066, "step": 352 }, { "epoch": 0.01942869723154824, "grad_norm": 1.2869856357574463, "learning_rate": 9.998797361938683e-06, "loss": 1.0054, "step": 353 }, { "epoch": 0.019483736033903902, "grad_norm": 1.2791584730148315, "learning_rate": 9.99878783651477e-06, "loss": 0.7627, "step": 354 }, { "epoch": 0.019538774836259563, "grad_norm": 1.0795867443084717, "learning_rate": 9.998778273521307e-06, "loss": 0.9343, "step": 355 }, { "epoch": 0.019593813638615225, "grad_norm": 1.0926088094711304, "learning_rate": 9.998768672958365e-06, "loss": 0.943, "step": 356 }, { "epoch": 0.019648852440970886, "grad_norm": 1.0530847311019897, "learning_rate": 9.998759034826015e-06, "loss": 0.9656, "step": 357 }, { "epoch": 0.019703891243326544, "grad_norm": 1.1793400049209595, "learning_rate": 9.99874935912433e-06, "loss": 0.9799, "step": 358 }, { "epoch": 0.019758930045682205, "grad_norm": 1.0726191997528076, "learning_rate": 9.998739645853383e-06, "loss": 0.8739, "step": 359 }, { "epoch": 0.019813968848037867, "grad_norm": 1.0488981008529663, "learning_rate": 9.998729895013246e-06, "loss": 0.8986, "step": 360 }, { "epoch": 0.019869007650393528, "grad_norm": 1.8267477750778198, "learning_rate": 9.998720106603993e-06, "loss": 0.9175, "step": 361 }, { "epoch": 0.01992404645274919, "grad_norm": 0.9868306517601013, "learning_rate": 9.9987102806257e-06, "loss": 0.9609, "step": 362 }, { "epoch": 0.019979085255104848, "grad_norm": 1.0171183347702026, "learning_rate": 9.998700417078438e-06, "loss": 0.8904, "step": 363 }, { "epoch": 0.02003412405746051, "grad_norm": 0.9800812602043152, "learning_rate": 9.998690515962282e-06, "loss": 0.8344, "step": 364 }, { "epoch": 0.02008916285981617, "grad_norm": 1.024707317352295, "learning_rate": 9.998680577277304e-06, "loss": 0.9026, "step": 365 }, { "epoch": 0.02014420166217183, "grad_norm": 1.1056619882583618, "learning_rate": 9.998670601023584e-06, "loss": 1.017, "step": 366 }, { "epoch": 0.020199240464527493, "grad_norm": 1.0555908679962158, "learning_rate": 9.998660587201191e-06, "loss": 0.9627, "step": 367 }, { "epoch": 0.02025427926688315, "grad_norm": 0.9502031803131104, "learning_rate": 9.998650535810204e-06, "loss": 0.935, "step": 368 }, { "epoch": 0.020309318069238812, "grad_norm": 1.0355613231658936, "learning_rate": 9.998640446850699e-06, "loss": 0.9946, "step": 369 }, { "epoch": 0.020364356871594474, "grad_norm": 0.9906355142593384, "learning_rate": 9.99863032032275e-06, "loss": 0.9389, "step": 370 }, { "epoch": 0.020419395673950135, "grad_norm": 0.9483911395072937, "learning_rate": 9.99862015622643e-06, "loss": 0.979, "step": 371 }, { "epoch": 0.020474434476305797, "grad_norm": 0.9769986271858215, "learning_rate": 9.998609954561822e-06, "loss": 0.8972, "step": 372 }, { "epoch": 0.020529473278661458, "grad_norm": 1.1682699918746948, "learning_rate": 9.998599715329e-06, "loss": 0.943, "step": 373 }, { "epoch": 0.020584512081017116, "grad_norm": 1.007912516593933, "learning_rate": 9.99858943852804e-06, "loss": 0.8825, "step": 374 }, { "epoch": 0.020639550883372777, "grad_norm": 0.9788785576820374, "learning_rate": 9.99857912415902e-06, "loss": 0.9667, "step": 375 }, { "epoch": 0.02069458968572844, "grad_norm": 1.0804275274276733, "learning_rate": 9.998568772222017e-06, "loss": 1.0026, "step": 376 }, { "epoch": 0.0207496284880841, "grad_norm": 1.0859237909317017, "learning_rate": 9.998558382717109e-06, "loss": 0.9592, "step": 377 }, { "epoch": 0.02080466729043976, "grad_norm": 1.2925337553024292, "learning_rate": 9.998547955644373e-06, "loss": 0.9067, "step": 378 }, { "epoch": 0.02085970609279542, "grad_norm": 0.9853373765945435, "learning_rate": 9.99853749100389e-06, "loss": 0.9538, "step": 379 }, { "epoch": 0.02091474489515108, "grad_norm": 1.0461076498031616, "learning_rate": 9.998526988795738e-06, "loss": 0.9261, "step": 380 }, { "epoch": 0.020969783697506742, "grad_norm": 1.024559497833252, "learning_rate": 9.998516449019995e-06, "loss": 0.9117, "step": 381 }, { "epoch": 0.021024822499862404, "grad_norm": 1.1474825143814087, "learning_rate": 9.998505871676739e-06, "loss": 1.0177, "step": 382 }, { "epoch": 0.021079861302218065, "grad_norm": 0.9587596654891968, "learning_rate": 9.998495256766051e-06, "loss": 0.8809, "step": 383 }, { "epoch": 0.021134900104573723, "grad_norm": 0.9505122303962708, "learning_rate": 9.998484604288013e-06, "loss": 0.9266, "step": 384 }, { "epoch": 0.021189938906929384, "grad_norm": 0.9625647664070129, "learning_rate": 9.9984739142427e-06, "loss": 0.9073, "step": 385 }, { "epoch": 0.021244977709285046, "grad_norm": 0.9650934338569641, "learning_rate": 9.998463186630196e-06, "loss": 0.9042, "step": 386 }, { "epoch": 0.021300016511640707, "grad_norm": 1.0289491415023804, "learning_rate": 9.99845242145058e-06, "loss": 0.929, "step": 387 }, { "epoch": 0.02135505531399637, "grad_norm": 0.9543869495391846, "learning_rate": 9.998441618703935e-06, "loss": 0.9406, "step": 388 }, { "epoch": 0.02141009411635203, "grad_norm": 0.9276942610740662, "learning_rate": 9.99843077839034e-06, "loss": 0.8982, "step": 389 }, { "epoch": 0.021465132918707688, "grad_norm": 0.9264664053916931, "learning_rate": 9.998419900509877e-06, "loss": 0.7255, "step": 390 }, { "epoch": 0.02152017172106335, "grad_norm": 0.9961187243461609, "learning_rate": 9.998408985062628e-06, "loss": 0.9826, "step": 391 }, { "epoch": 0.02157521052341901, "grad_norm": 0.966596245765686, "learning_rate": 9.998398032048676e-06, "loss": 0.8159, "step": 392 }, { "epoch": 0.021630249325774672, "grad_norm": 1.1336095333099365, "learning_rate": 9.998387041468102e-06, "loss": 0.9289, "step": 393 }, { "epoch": 0.021685288128130333, "grad_norm": 1.0453619956970215, "learning_rate": 9.998376013320989e-06, "loss": 0.8816, "step": 394 }, { "epoch": 0.02174032693048599, "grad_norm": 0.8961821794509888, "learning_rate": 9.998364947607419e-06, "loss": 0.871, "step": 395 }, { "epoch": 0.021795365732841653, "grad_norm": 1.3420332670211792, "learning_rate": 9.998353844327477e-06, "loss": 0.9338, "step": 396 }, { "epoch": 0.021850404535197314, "grad_norm": 0.9635335206985474, "learning_rate": 9.998342703481246e-06, "loss": 0.9592, "step": 397 }, { "epoch": 0.021905443337552975, "grad_norm": 1.3322341442108154, "learning_rate": 9.998331525068807e-06, "loss": 1.0974, "step": 398 }, { "epoch": 0.021960482139908637, "grad_norm": 1.017220377922058, "learning_rate": 9.998320309090247e-06, "loss": 0.9827, "step": 399 }, { "epoch": 0.022015520942264295, "grad_norm": 1.0080329179763794, "learning_rate": 9.99830905554565e-06, "loss": 0.877, "step": 400 }, { "epoch": 0.022070559744619956, "grad_norm": 0.9883211255073547, "learning_rate": 9.998297764435101e-06, "loss": 0.9625, "step": 401 }, { "epoch": 0.022125598546975617, "grad_norm": 1.0948412418365479, "learning_rate": 9.998286435758684e-06, "loss": 0.9058, "step": 402 }, { "epoch": 0.02218063734933128, "grad_norm": 0.9402000308036804, "learning_rate": 9.998275069516482e-06, "loss": 0.8882, "step": 403 }, { "epoch": 0.02223567615168694, "grad_norm": 0.9858806133270264, "learning_rate": 9.998263665708583e-06, "loss": 0.9086, "step": 404 }, { "epoch": 0.0222907149540426, "grad_norm": 1.0556131601333618, "learning_rate": 9.998252224335073e-06, "loss": 0.9583, "step": 405 }, { "epoch": 0.02234575375639826, "grad_norm": 1.092766284942627, "learning_rate": 9.998240745396037e-06, "loss": 0.9124, "step": 406 }, { "epoch": 0.02240079255875392, "grad_norm": 1.1902250051498413, "learning_rate": 9.998229228891563e-06, "loss": 1.0566, "step": 407 }, { "epoch": 0.022455831361109582, "grad_norm": 1.067906141281128, "learning_rate": 9.998217674821734e-06, "loss": 0.9823, "step": 408 }, { "epoch": 0.022510870163465244, "grad_norm": 1.0051710605621338, "learning_rate": 9.998206083186638e-06, "loss": 0.9141, "step": 409 }, { "epoch": 0.022565908965820905, "grad_norm": 1.046412467956543, "learning_rate": 9.998194453986367e-06, "loss": 0.9439, "step": 410 }, { "epoch": 0.022620947768176563, "grad_norm": 1.1103553771972656, "learning_rate": 9.998182787221e-06, "loss": 0.9494, "step": 411 }, { "epoch": 0.022675986570532224, "grad_norm": 1.0508466958999634, "learning_rate": 9.998171082890632e-06, "loss": 0.9202, "step": 412 }, { "epoch": 0.022731025372887886, "grad_norm": 1.1364226341247559, "learning_rate": 9.998159340995347e-06, "loss": 0.9859, "step": 413 }, { "epoch": 0.022786064175243547, "grad_norm": 1.2073607444763184, "learning_rate": 9.998147561535234e-06, "loss": 0.8883, "step": 414 }, { "epoch": 0.02284110297759921, "grad_norm": 1.0657012462615967, "learning_rate": 9.998135744510384e-06, "loss": 0.8321, "step": 415 }, { "epoch": 0.02289614177995487, "grad_norm": 1.0101548433303833, "learning_rate": 9.998123889920881e-06, "loss": 0.9374, "step": 416 }, { "epoch": 0.022951180582310528, "grad_norm": 1.057455062866211, "learning_rate": 9.998111997766817e-06, "loss": 0.8831, "step": 417 }, { "epoch": 0.02300621938466619, "grad_norm": 1.206092357635498, "learning_rate": 9.998100068048282e-06, "loss": 0.8812, "step": 418 }, { "epoch": 0.02306125818702185, "grad_norm": 1.0709773302078247, "learning_rate": 9.998088100765366e-06, "loss": 0.9486, "step": 419 }, { "epoch": 0.023116296989377512, "grad_norm": 1.066469669342041, "learning_rate": 9.998076095918156e-06, "loss": 1.0229, "step": 420 }, { "epoch": 0.023171335791733173, "grad_norm": 1.0443583726882935, "learning_rate": 9.998064053506744e-06, "loss": 0.8615, "step": 421 }, { "epoch": 0.02322637459408883, "grad_norm": 1.103096842765808, "learning_rate": 9.99805197353122e-06, "loss": 0.9909, "step": 422 }, { "epoch": 0.023281413396444493, "grad_norm": 0.9804643392562866, "learning_rate": 9.998039855991677e-06, "loss": 0.9214, "step": 423 }, { "epoch": 0.023336452198800154, "grad_norm": 0.9880676865577698, "learning_rate": 9.998027700888202e-06, "loss": 0.9345, "step": 424 }, { "epoch": 0.023391491001155815, "grad_norm": 0.9633826017379761, "learning_rate": 9.99801550822089e-06, "loss": 0.9897, "step": 425 }, { "epoch": 0.023446529803511477, "grad_norm": 1.0159331560134888, "learning_rate": 9.998003277989831e-06, "loss": 0.9385, "step": 426 }, { "epoch": 0.023501568605867135, "grad_norm": 1.009667158126831, "learning_rate": 9.99799101019512e-06, "loss": 0.9013, "step": 427 }, { "epoch": 0.023556607408222796, "grad_norm": 0.9478578567504883, "learning_rate": 9.997978704836842e-06, "loss": 0.8775, "step": 428 }, { "epoch": 0.023611646210578457, "grad_norm": 1.013181447982788, "learning_rate": 9.997966361915096e-06, "loss": 0.8797, "step": 429 }, { "epoch": 0.02366668501293412, "grad_norm": 1.0337481498718262, "learning_rate": 9.997953981429974e-06, "loss": 1.0047, "step": 430 }, { "epoch": 0.02372172381528978, "grad_norm": 0.9423721432685852, "learning_rate": 9.997941563381566e-06, "loss": 0.8639, "step": 431 }, { "epoch": 0.02377676261764544, "grad_norm": 1.100492000579834, "learning_rate": 9.997929107769968e-06, "loss": 1.0022, "step": 432 }, { "epoch": 0.0238318014200011, "grad_norm": 1.1232364177703857, "learning_rate": 9.997916614595272e-06, "loss": 0.9145, "step": 433 }, { "epoch": 0.02388684022235676, "grad_norm": 0.9466833472251892, "learning_rate": 9.997904083857572e-06, "loss": 0.9397, "step": 434 }, { "epoch": 0.023941879024712422, "grad_norm": 0.9514566659927368, "learning_rate": 9.997891515556963e-06, "loss": 0.8025, "step": 435 }, { "epoch": 0.023996917827068084, "grad_norm": 0.9292222261428833, "learning_rate": 9.997878909693539e-06, "loss": 0.7739, "step": 436 }, { "epoch": 0.024051956629423745, "grad_norm": 1.1049963235855103, "learning_rate": 9.997866266267397e-06, "loss": 0.9439, "step": 437 }, { "epoch": 0.024106995431779403, "grad_norm": 1.0938019752502441, "learning_rate": 9.997853585278627e-06, "loss": 0.9479, "step": 438 }, { "epoch": 0.024162034234135064, "grad_norm": 1.0423611402511597, "learning_rate": 9.997840866727331e-06, "loss": 0.9309, "step": 439 }, { "epoch": 0.024217073036490726, "grad_norm": 1.0584756135940552, "learning_rate": 9.997828110613598e-06, "loss": 1.0218, "step": 440 }, { "epoch": 0.024272111838846387, "grad_norm": 0.9986408948898315, "learning_rate": 9.997815316937527e-06, "loss": 0.9734, "step": 441 }, { "epoch": 0.02432715064120205, "grad_norm": 0.9680983424186707, "learning_rate": 9.997802485699215e-06, "loss": 0.9286, "step": 442 }, { "epoch": 0.024382189443557706, "grad_norm": 1.2231700420379639, "learning_rate": 9.997789616898757e-06, "loss": 0.8083, "step": 443 }, { "epoch": 0.024437228245913368, "grad_norm": 1.0064021348953247, "learning_rate": 9.99777671053625e-06, "loss": 0.9161, "step": 444 }, { "epoch": 0.02449226704826903, "grad_norm": 0.9658541679382324, "learning_rate": 9.99776376661179e-06, "loss": 0.8027, "step": 445 }, { "epoch": 0.02454730585062469, "grad_norm": 0.9440343379974365, "learning_rate": 9.997750785125477e-06, "loss": 0.9124, "step": 446 }, { "epoch": 0.024602344652980352, "grad_norm": 0.998792827129364, "learning_rate": 9.997737766077404e-06, "loss": 0.8699, "step": 447 }, { "epoch": 0.024657383455336013, "grad_norm": 1.430880069732666, "learning_rate": 9.997724709467676e-06, "loss": 0.9158, "step": 448 }, { "epoch": 0.02471242225769167, "grad_norm": 0.9737820029258728, "learning_rate": 9.997711615296384e-06, "loss": 0.9496, "step": 449 }, { "epoch": 0.024767461060047333, "grad_norm": 0.9710075855255127, "learning_rate": 9.997698483563629e-06, "loss": 0.8714, "step": 450 }, { "epoch": 0.024822499862402994, "grad_norm": 1.5286253690719604, "learning_rate": 9.997685314269511e-06, "loss": 0.8421, "step": 451 }, { "epoch": 0.024877538664758655, "grad_norm": 1.0269445180892944, "learning_rate": 9.99767210741413e-06, "loss": 1.0131, "step": 452 }, { "epoch": 0.024932577467114317, "grad_norm": 0.9780508279800415, "learning_rate": 9.99765886299758e-06, "loss": 0.9897, "step": 453 }, { "epoch": 0.024987616269469975, "grad_norm": 0.998332679271698, "learning_rate": 9.997645581019965e-06, "loss": 0.9647, "step": 454 }, { "epoch": 0.025042655071825636, "grad_norm": 1.7062602043151855, "learning_rate": 9.997632261481383e-06, "loss": 1.0729, "step": 455 }, { "epoch": 0.025097693874181298, "grad_norm": 0.9793694615364075, "learning_rate": 9.997618904381936e-06, "loss": 0.9556, "step": 456 }, { "epoch": 0.02515273267653696, "grad_norm": 1.0183895826339722, "learning_rate": 9.997605509721721e-06, "loss": 0.9194, "step": 457 }, { "epoch": 0.02520777147889262, "grad_norm": 1.0288400650024414, "learning_rate": 9.997592077500844e-06, "loss": 0.955, "step": 458 }, { "epoch": 0.025262810281248282, "grad_norm": 0.9551253914833069, "learning_rate": 9.997578607719401e-06, "loss": 0.8498, "step": 459 }, { "epoch": 0.02531784908360394, "grad_norm": 0.9648008942604065, "learning_rate": 9.997565100377494e-06, "loss": 0.9306, "step": 460 }, { "epoch": 0.0253728878859596, "grad_norm": 0.9206677675247192, "learning_rate": 9.997551555475225e-06, "loss": 0.7874, "step": 461 }, { "epoch": 0.025427926688315262, "grad_norm": 1.0479545593261719, "learning_rate": 9.997537973012698e-06, "loss": 0.9201, "step": 462 }, { "epoch": 0.025482965490670924, "grad_norm": 1.0329946279525757, "learning_rate": 9.997524352990013e-06, "loss": 0.9577, "step": 463 }, { "epoch": 0.025538004293026585, "grad_norm": 1.1177828311920166, "learning_rate": 9.997510695407273e-06, "loss": 1.0041, "step": 464 }, { "epoch": 0.025593043095382243, "grad_norm": 1.0351577997207642, "learning_rate": 9.99749700026458e-06, "loss": 0.9952, "step": 465 }, { "epoch": 0.025648081897737905, "grad_norm": 0.905274510383606, "learning_rate": 9.997483267562035e-06, "loss": 0.8185, "step": 466 }, { "epoch": 0.025703120700093566, "grad_norm": 1.0749776363372803, "learning_rate": 9.997469497299747e-06, "loss": 1.0611, "step": 467 }, { "epoch": 0.025758159502449227, "grad_norm": 0.8972223401069641, "learning_rate": 9.997455689477815e-06, "loss": 0.8994, "step": 468 }, { "epoch": 0.02581319830480489, "grad_norm": 1.0669914484024048, "learning_rate": 9.997441844096342e-06, "loss": 1.06, "step": 469 }, { "epoch": 0.025868237107160547, "grad_norm": 1.0431914329528809, "learning_rate": 9.997427961155435e-06, "loss": 0.8657, "step": 470 }, { "epoch": 0.025923275909516208, "grad_norm": 0.9609962701797485, "learning_rate": 9.997414040655198e-06, "loss": 0.8864, "step": 471 }, { "epoch": 0.02597831471187187, "grad_norm": 1.0829721689224243, "learning_rate": 9.997400082595735e-06, "loss": 0.9221, "step": 472 }, { "epoch": 0.02603335351422753, "grad_norm": 0.992082953453064, "learning_rate": 9.99738608697715e-06, "loss": 0.8455, "step": 473 }, { "epoch": 0.026088392316583192, "grad_norm": 1.0486301183700562, "learning_rate": 9.997372053799547e-06, "loss": 0.8729, "step": 474 }, { "epoch": 0.026143431118938854, "grad_norm": 1.0328491926193237, "learning_rate": 9.997357983063036e-06, "loss": 0.8788, "step": 475 }, { "epoch": 0.02619846992129451, "grad_norm": 0.963333249092102, "learning_rate": 9.997343874767719e-06, "loss": 0.892, "step": 476 }, { "epoch": 0.026253508723650173, "grad_norm": 1.1606497764587402, "learning_rate": 9.997329728913704e-06, "loss": 0.9984, "step": 477 }, { "epoch": 0.026308547526005834, "grad_norm": 1.241650104522705, "learning_rate": 9.997315545501096e-06, "loss": 0.946, "step": 478 }, { "epoch": 0.026363586328361496, "grad_norm": 1.008004069328308, "learning_rate": 9.99730132453e-06, "loss": 0.849, "step": 479 }, { "epoch": 0.026418625130717157, "grad_norm": 0.9883478879928589, "learning_rate": 9.997287066000527e-06, "loss": 0.9478, "step": 480 }, { "epoch": 0.026473663933072815, "grad_norm": 1.0224446058273315, "learning_rate": 9.997272769912783e-06, "loss": 1.0318, "step": 481 }, { "epoch": 0.026528702735428476, "grad_norm": 0.9412569403648376, "learning_rate": 9.997258436266874e-06, "loss": 0.9119, "step": 482 }, { "epoch": 0.026583741537784138, "grad_norm": 0.9214537739753723, "learning_rate": 9.997244065062906e-06, "loss": 0.8785, "step": 483 }, { "epoch": 0.0266387803401398, "grad_norm": 1.0015628337860107, "learning_rate": 9.997229656300991e-06, "loss": 0.8869, "step": 484 }, { "epoch": 0.02669381914249546, "grad_norm": 0.8965190052986145, "learning_rate": 9.997215209981237e-06, "loss": 0.7009, "step": 485 }, { "epoch": 0.02674885794485112, "grad_norm": 1.1976135969161987, "learning_rate": 9.997200726103749e-06, "loss": 0.9795, "step": 486 }, { "epoch": 0.02680389674720678, "grad_norm": 0.864780843257904, "learning_rate": 9.997186204668639e-06, "loss": 0.7687, "step": 487 }, { "epoch": 0.02685893554956244, "grad_norm": 0.9946566820144653, "learning_rate": 9.997171645676013e-06, "loss": 0.9672, "step": 488 }, { "epoch": 0.026913974351918103, "grad_norm": 1.043835997581482, "learning_rate": 9.997157049125985e-06, "loss": 0.862, "step": 489 }, { "epoch": 0.026969013154273764, "grad_norm": 0.9697456955909729, "learning_rate": 9.99714241501866e-06, "loss": 0.8368, "step": 490 }, { "epoch": 0.027024051956629425, "grad_norm": 0.9975618124008179, "learning_rate": 9.997127743354153e-06, "loss": 0.8739, "step": 491 }, { "epoch": 0.027079090758985083, "grad_norm": 1.0055313110351562, "learning_rate": 9.99711303413257e-06, "loss": 0.9227, "step": 492 }, { "epoch": 0.027134129561340745, "grad_norm": 1.0418384075164795, "learning_rate": 9.997098287354024e-06, "loss": 0.9978, "step": 493 }, { "epoch": 0.027189168363696406, "grad_norm": 0.8648970723152161, "learning_rate": 9.997083503018625e-06, "loss": 0.8363, "step": 494 }, { "epoch": 0.027244207166052067, "grad_norm": 1.13506019115448, "learning_rate": 9.997068681126483e-06, "loss": 0.8851, "step": 495 }, { "epoch": 0.02729924596840773, "grad_norm": 0.974400520324707, "learning_rate": 9.997053821677712e-06, "loss": 0.8533, "step": 496 }, { "epoch": 0.027354284770763387, "grad_norm": 1.226507544517517, "learning_rate": 9.997038924672419e-06, "loss": 0.8586, "step": 497 }, { "epoch": 0.027409323573119048, "grad_norm": 1.004753589630127, "learning_rate": 9.997023990110721e-06, "loss": 0.8974, "step": 498 }, { "epoch": 0.02746436237547471, "grad_norm": 1.0492571592330933, "learning_rate": 9.997009017992729e-06, "loss": 0.8457, "step": 499 }, { "epoch": 0.02751940117783037, "grad_norm": 1.0068167448043823, "learning_rate": 9.996994008318554e-06, "loss": 0.9608, "step": 500 }, { "epoch": 0.027574439980186032, "grad_norm": 0.9686044454574585, "learning_rate": 9.996978961088311e-06, "loss": 0.9041, "step": 501 }, { "epoch": 0.027629478782541694, "grad_norm": 1.281728744506836, "learning_rate": 9.99696387630211e-06, "loss": 0.9739, "step": 502 }, { "epoch": 0.02768451758489735, "grad_norm": 0.9069758653640747, "learning_rate": 9.996948753960065e-06, "loss": 0.8467, "step": 503 }, { "epoch": 0.027739556387253013, "grad_norm": 1.0337222814559937, "learning_rate": 9.996933594062293e-06, "loss": 0.9638, "step": 504 }, { "epoch": 0.027794595189608674, "grad_norm": 0.9695359468460083, "learning_rate": 9.996918396608905e-06, "loss": 0.8986, "step": 505 }, { "epoch": 0.027849633991964336, "grad_norm": 0.9120615124702454, "learning_rate": 9.996903161600016e-06, "loss": 0.9103, "step": 506 }, { "epoch": 0.027904672794319997, "grad_norm": 0.9736546874046326, "learning_rate": 9.996887889035741e-06, "loss": 0.9308, "step": 507 }, { "epoch": 0.027959711596675655, "grad_norm": 1.0184897184371948, "learning_rate": 9.996872578916192e-06, "loss": 0.8978, "step": 508 }, { "epoch": 0.028014750399031316, "grad_norm": 0.9791838526725769, "learning_rate": 9.996857231241489e-06, "loss": 0.8639, "step": 509 }, { "epoch": 0.028069789201386978, "grad_norm": 1.2985681295394897, "learning_rate": 9.996841846011742e-06, "loss": 0.9581, "step": 510 }, { "epoch": 0.02812482800374264, "grad_norm": 1.0647368431091309, "learning_rate": 9.996826423227071e-06, "loss": 1.0565, "step": 511 }, { "epoch": 0.0281798668060983, "grad_norm": 1.0336421728134155, "learning_rate": 9.996810962887591e-06, "loss": 1.008, "step": 512 }, { "epoch": 0.02823490560845396, "grad_norm": 1.1838933229446411, "learning_rate": 9.996795464993416e-06, "loss": 0.8359, "step": 513 }, { "epoch": 0.02828994441080962, "grad_norm": 0.9898360371589661, "learning_rate": 9.996779929544663e-06, "loss": 0.8501, "step": 514 }, { "epoch": 0.02834498321316528, "grad_norm": 0.9836066365242004, "learning_rate": 9.99676435654145e-06, "loss": 0.8795, "step": 515 }, { "epoch": 0.028400022015520943, "grad_norm": 1.0621601343154907, "learning_rate": 9.996748745983895e-06, "loss": 0.8746, "step": 516 }, { "epoch": 0.028455060817876604, "grad_norm": 1.0082437992095947, "learning_rate": 9.996733097872113e-06, "loss": 0.9278, "step": 517 }, { "epoch": 0.028510099620232265, "grad_norm": 0.9903931617736816, "learning_rate": 9.996717412206222e-06, "loss": 0.8264, "step": 518 }, { "epoch": 0.028565138422587923, "grad_norm": 1.0797243118286133, "learning_rate": 9.996701688986342e-06, "loss": 1.0077, "step": 519 }, { "epoch": 0.028620177224943585, "grad_norm": 1.147133231163025, "learning_rate": 9.99668592821259e-06, "loss": 0.9374, "step": 520 }, { "epoch": 0.028675216027299246, "grad_norm": 0.9993947744369507, "learning_rate": 9.996670129885082e-06, "loss": 0.9562, "step": 521 }, { "epoch": 0.028730254829654907, "grad_norm": 0.8580895066261292, "learning_rate": 9.99665429400394e-06, "loss": 0.7985, "step": 522 }, { "epoch": 0.02878529363201057, "grad_norm": 0.9251388907432556, "learning_rate": 9.996638420569281e-06, "loss": 0.7323, "step": 523 }, { "epoch": 0.028840332434366227, "grad_norm": 1.0010193586349487, "learning_rate": 9.996622509581227e-06, "loss": 0.9316, "step": 524 }, { "epoch": 0.028895371236721888, "grad_norm": 0.9822579026222229, "learning_rate": 9.996606561039894e-06, "loss": 0.8978, "step": 525 }, { "epoch": 0.02895041003907755, "grad_norm": 1.0760595798492432, "learning_rate": 9.996590574945403e-06, "loss": 0.9125, "step": 526 }, { "epoch": 0.02900544884143321, "grad_norm": 1.138869285583496, "learning_rate": 9.996574551297876e-06, "loss": 0.8185, "step": 527 }, { "epoch": 0.029060487643788872, "grad_norm": 1.002994179725647, "learning_rate": 9.996558490097433e-06, "loss": 0.9404, "step": 528 }, { "epoch": 0.02911552644614453, "grad_norm": 0.9550611972808838, "learning_rate": 9.996542391344194e-06, "loss": 0.859, "step": 529 }, { "epoch": 0.02917056524850019, "grad_norm": 0.9236055612564087, "learning_rate": 9.996526255038277e-06, "loss": 0.7758, "step": 530 }, { "epoch": 0.029225604050855853, "grad_norm": 1.103966474533081, "learning_rate": 9.996510081179808e-06, "loss": 1.0147, "step": 531 }, { "epoch": 0.029280642853211514, "grad_norm": 0.9884665012359619, "learning_rate": 9.996493869768906e-06, "loss": 0.8784, "step": 532 }, { "epoch": 0.029335681655567176, "grad_norm": 0.9173223376274109, "learning_rate": 9.996477620805694e-06, "loss": 0.8741, "step": 533 }, { "epoch": 0.029390720457922837, "grad_norm": 0.965548574924469, "learning_rate": 9.996461334290294e-06, "loss": 0.8989, "step": 534 }, { "epoch": 0.029445759260278495, "grad_norm": 0.9939296245574951, "learning_rate": 9.996445010222828e-06, "loss": 0.8552, "step": 535 }, { "epoch": 0.029500798062634156, "grad_norm": 1.0081578493118286, "learning_rate": 9.996428648603417e-06, "loss": 0.9138, "step": 536 }, { "epoch": 0.029555836864989818, "grad_norm": 1.0139487981796265, "learning_rate": 9.996412249432188e-06, "loss": 0.9452, "step": 537 }, { "epoch": 0.02961087566734548, "grad_norm": 0.9463647603988647, "learning_rate": 9.996395812709262e-06, "loss": 0.8721, "step": 538 }, { "epoch": 0.02966591446970114, "grad_norm": 0.9981473684310913, "learning_rate": 9.99637933843476e-06, "loss": 0.7791, "step": 539 }, { "epoch": 0.0297209532720568, "grad_norm": 1.1637190580368042, "learning_rate": 9.996362826608812e-06, "loss": 0.8798, "step": 540 }, { "epoch": 0.02977599207441246, "grad_norm": 2.2887051105499268, "learning_rate": 9.996346277231536e-06, "loss": 0.9303, "step": 541 }, { "epoch": 0.02983103087676812, "grad_norm": 0.9173391461372375, "learning_rate": 9.99632969030306e-06, "loss": 0.8627, "step": 542 }, { "epoch": 0.029886069679123783, "grad_norm": 1.033355474472046, "learning_rate": 9.996313065823506e-06, "loss": 0.9906, "step": 543 }, { "epoch": 0.029941108481479444, "grad_norm": 0.9286639094352722, "learning_rate": 9.996296403793002e-06, "loss": 0.7043, "step": 544 }, { "epoch": 0.029996147283835102, "grad_norm": 0.963238000869751, "learning_rate": 9.996279704211671e-06, "loss": 1.0236, "step": 545 }, { "epoch": 0.030051186086190763, "grad_norm": 1.0275089740753174, "learning_rate": 9.99626296707964e-06, "loss": 0.976, "step": 546 }, { "epoch": 0.030106224888546425, "grad_norm": 1.0944674015045166, "learning_rate": 9.996246192397032e-06, "loss": 0.9209, "step": 547 }, { "epoch": 0.030161263690902086, "grad_norm": 0.9620945453643799, "learning_rate": 9.996229380163976e-06, "loss": 0.8973, "step": 548 }, { "epoch": 0.030216302493257748, "grad_norm": 1.032549500465393, "learning_rate": 9.996212530380597e-06, "loss": 0.892, "step": 549 }, { "epoch": 0.03027134129561341, "grad_norm": 1.0433719158172607, "learning_rate": 9.996195643047023e-06, "loss": 0.8428, "step": 550 }, { "epoch": 0.030326380097969067, "grad_norm": 1.1541085243225098, "learning_rate": 9.996178718163378e-06, "loss": 0.9084, "step": 551 }, { "epoch": 0.03038141890032473, "grad_norm": 0.9386873245239258, "learning_rate": 9.996161755729793e-06, "loss": 0.9246, "step": 552 }, { "epoch": 0.03043645770268039, "grad_norm": 1.092236042022705, "learning_rate": 9.996144755746393e-06, "loss": 0.8419, "step": 553 }, { "epoch": 0.03049149650503605, "grad_norm": 0.9517606496810913, "learning_rate": 9.996127718213306e-06, "loss": 0.9002, "step": 554 }, { "epoch": 0.030546535307391712, "grad_norm": 0.965972900390625, "learning_rate": 9.996110643130661e-06, "loss": 0.9197, "step": 555 }, { "epoch": 0.03060157410974737, "grad_norm": 0.9396095275878906, "learning_rate": 9.996093530498586e-06, "loss": 0.8686, "step": 556 }, { "epoch": 0.030656612912103032, "grad_norm": 1.0154120922088623, "learning_rate": 9.99607638031721e-06, "loss": 0.9773, "step": 557 }, { "epoch": 0.030711651714458693, "grad_norm": 1.3572301864624023, "learning_rate": 9.99605919258666e-06, "loss": 0.911, "step": 558 }, { "epoch": 0.030766690516814355, "grad_norm": 0.968278169631958, "learning_rate": 9.996041967307066e-06, "loss": 0.7704, "step": 559 }, { "epoch": 0.030821729319170016, "grad_norm": 0.9867869019508362, "learning_rate": 9.99602470447856e-06, "loss": 0.873, "step": 560 }, { "epoch": 0.030876768121525677, "grad_norm": 1.056450605392456, "learning_rate": 9.996007404101269e-06, "loss": 0.941, "step": 561 }, { "epoch": 0.030931806923881335, "grad_norm": 1.0419799089431763, "learning_rate": 9.995990066175321e-06, "loss": 0.957, "step": 562 }, { "epoch": 0.030986845726236997, "grad_norm": 0.9789314866065979, "learning_rate": 9.995972690700852e-06, "loss": 0.9229, "step": 563 }, { "epoch": 0.031041884528592658, "grad_norm": 0.917783796787262, "learning_rate": 9.995955277677989e-06, "loss": 0.8186, "step": 564 }, { "epoch": 0.03109692333094832, "grad_norm": 1.0231432914733887, "learning_rate": 9.995937827106863e-06, "loss": 0.8624, "step": 565 }, { "epoch": 0.03115196213330398, "grad_norm": 0.9552083015441895, "learning_rate": 9.995920338987605e-06, "loss": 0.7967, "step": 566 }, { "epoch": 0.03120700093565964, "grad_norm": 0.9441083669662476, "learning_rate": 9.995902813320349e-06, "loss": 0.8471, "step": 567 }, { "epoch": 0.0312620397380153, "grad_norm": 1.0025299787521362, "learning_rate": 9.995885250105223e-06, "loss": 0.8646, "step": 568 }, { "epoch": 0.03131707854037096, "grad_norm": 0.8997280597686768, "learning_rate": 9.99586764934236e-06, "loss": 0.8736, "step": 569 }, { "epoch": 0.03137211734272662, "grad_norm": 0.9090663194656372, "learning_rate": 9.995850011031896e-06, "loss": 0.8548, "step": 570 }, { "epoch": 0.031427156145082284, "grad_norm": 0.9641294479370117, "learning_rate": 9.995832335173959e-06, "loss": 0.8667, "step": 571 }, { "epoch": 0.031482194947437946, "grad_norm": 0.9165804982185364, "learning_rate": 9.995814621768682e-06, "loss": 0.803, "step": 572 }, { "epoch": 0.03153723374979361, "grad_norm": 0.9672492742538452, "learning_rate": 9.995796870816202e-06, "loss": 0.8335, "step": 573 }, { "epoch": 0.03159227255214927, "grad_norm": 0.9359404444694519, "learning_rate": 9.995779082316648e-06, "loss": 0.8294, "step": 574 }, { "epoch": 0.03164731135450492, "grad_norm": 0.926925003528595, "learning_rate": 9.995761256270157e-06, "loss": 0.7714, "step": 575 }, { "epoch": 0.031702350156860584, "grad_norm": 1.1848629713058472, "learning_rate": 9.995743392676862e-06, "loss": 0.8925, "step": 576 }, { "epoch": 0.031757388959216246, "grad_norm": 0.9624786972999573, "learning_rate": 9.995725491536897e-06, "loss": 0.9292, "step": 577 }, { "epoch": 0.03181242776157191, "grad_norm": 0.9479736089706421, "learning_rate": 9.995707552850396e-06, "loss": 0.8797, "step": 578 }, { "epoch": 0.03186746656392757, "grad_norm": 0.9551546573638916, "learning_rate": 9.995689576617494e-06, "loss": 0.8793, "step": 579 }, { "epoch": 0.03192250536628323, "grad_norm": 0.9210056662559509, "learning_rate": 9.995671562838325e-06, "loss": 0.9714, "step": 580 }, { "epoch": 0.03197754416863889, "grad_norm": 1.063117504119873, "learning_rate": 9.995653511513029e-06, "loss": 0.9608, "step": 581 }, { "epoch": 0.03203258297099455, "grad_norm": 0.9426459670066833, "learning_rate": 9.995635422641736e-06, "loss": 0.9102, "step": 582 }, { "epoch": 0.032087621773350214, "grad_norm": 1.0176693201065063, "learning_rate": 9.995617296224584e-06, "loss": 0.9109, "step": 583 }, { "epoch": 0.032142660575705875, "grad_norm": 0.9457042217254639, "learning_rate": 9.995599132261711e-06, "loss": 0.9017, "step": 584 }, { "epoch": 0.03219769937806154, "grad_norm": 1.5851638317108154, "learning_rate": 9.995580930753252e-06, "loss": 0.967, "step": 585 }, { "epoch": 0.03225273818041719, "grad_norm": 0.9961487054824829, "learning_rate": 9.995562691699345e-06, "loss": 0.9396, "step": 586 }, { "epoch": 0.03230777698277285, "grad_norm": 0.9892112016677856, "learning_rate": 9.995544415100125e-06, "loss": 0.9058, "step": 587 }, { "epoch": 0.032362815785128514, "grad_norm": 0.9052272439002991, "learning_rate": 9.99552610095573e-06, "loss": 0.9194, "step": 588 }, { "epoch": 0.032417854587484175, "grad_norm": 0.8381399512290955, "learning_rate": 9.995507749266297e-06, "loss": 0.7465, "step": 589 }, { "epoch": 0.03247289338983984, "grad_norm": 1.018964171409607, "learning_rate": 9.995489360031969e-06, "loss": 0.841, "step": 590 }, { "epoch": 0.0325279321921955, "grad_norm": 0.908311128616333, "learning_rate": 9.995470933252876e-06, "loss": 0.8592, "step": 591 }, { "epoch": 0.03258297099455116, "grad_norm": 1.2986040115356445, "learning_rate": 9.995452468929162e-06, "loss": 0.8341, "step": 592 }, { "epoch": 0.03263800979690682, "grad_norm": 1.6565190553665161, "learning_rate": 9.995433967060966e-06, "loss": 0.8681, "step": 593 }, { "epoch": 0.03269304859926248, "grad_norm": 0.9725674390792847, "learning_rate": 9.995415427648423e-06, "loss": 0.8449, "step": 594 }, { "epoch": 0.032748087401618144, "grad_norm": 0.8683852553367615, "learning_rate": 9.995396850691677e-06, "loss": 0.8478, "step": 595 }, { "epoch": 0.0328031262039738, "grad_norm": 0.9912856817245483, "learning_rate": 9.995378236190862e-06, "loss": 0.8912, "step": 596 }, { "epoch": 0.03285816500632946, "grad_norm": 0.9396800398826599, "learning_rate": 9.995359584146125e-06, "loss": 0.856, "step": 597 }, { "epoch": 0.03291320380868512, "grad_norm": 1.385006308555603, "learning_rate": 9.995340894557601e-06, "loss": 0.9633, "step": 598 }, { "epoch": 0.03296824261104078, "grad_norm": 0.8982875943183899, "learning_rate": 9.995322167425433e-06, "loss": 0.9244, "step": 599 }, { "epoch": 0.033023281413396444, "grad_norm": 0.8981022834777832, "learning_rate": 9.995303402749759e-06, "loss": 0.8854, "step": 600 }, { "epoch": 0.033078320215752105, "grad_norm": 0.9917197227478027, "learning_rate": 9.995284600530724e-06, "loss": 1.0086, "step": 601 }, { "epoch": 0.033133359018107766, "grad_norm": 1.0540626049041748, "learning_rate": 9.995265760768464e-06, "loss": 1.0022, "step": 602 }, { "epoch": 0.03318839782046343, "grad_norm": 0.9523479342460632, "learning_rate": 9.995246883463126e-06, "loss": 0.9893, "step": 603 }, { "epoch": 0.03324343662281909, "grad_norm": 0.9824770092964172, "learning_rate": 9.99522796861485e-06, "loss": 0.8385, "step": 604 }, { "epoch": 0.03329847542517475, "grad_norm": 1.0968893766403198, "learning_rate": 9.995209016223776e-06, "loss": 1.0109, "step": 605 }, { "epoch": 0.03335351422753041, "grad_norm": 0.9115625023841858, "learning_rate": 9.995190026290049e-06, "loss": 0.8656, "step": 606 }, { "epoch": 0.033408553029886066, "grad_norm": 0.9795814156532288, "learning_rate": 9.99517099881381e-06, "loss": 0.8941, "step": 607 }, { "epoch": 0.03346359183224173, "grad_norm": 0.9317291378974915, "learning_rate": 9.995151933795204e-06, "loss": 0.7819, "step": 608 }, { "epoch": 0.03351863063459739, "grad_norm": 0.9936283230781555, "learning_rate": 9.995132831234373e-06, "loss": 0.8674, "step": 609 }, { "epoch": 0.03357366943695305, "grad_norm": 0.9872812032699585, "learning_rate": 9.995113691131462e-06, "loss": 0.9038, "step": 610 }, { "epoch": 0.03362870823930871, "grad_norm": 0.9516895413398743, "learning_rate": 9.995094513486611e-06, "loss": 0.9038, "step": 611 }, { "epoch": 0.03368374704166437, "grad_norm": 1.090579867362976, "learning_rate": 9.995075298299968e-06, "loss": 0.9587, "step": 612 }, { "epoch": 0.033738785844020035, "grad_norm": 1.021398663520813, "learning_rate": 9.995056045571677e-06, "loss": 0.9569, "step": 613 }, { "epoch": 0.033793824646375696, "grad_norm": 1.009657382965088, "learning_rate": 9.99503675530188e-06, "loss": 0.8346, "step": 614 }, { "epoch": 0.03384886344873136, "grad_norm": 1.0478712320327759, "learning_rate": 9.995017427490725e-06, "loss": 1.0566, "step": 615 }, { "epoch": 0.03390390225108702, "grad_norm": 1.1391830444335938, "learning_rate": 9.994998062138355e-06, "loss": 1.0727, "step": 616 }, { "epoch": 0.03395894105344268, "grad_norm": 1.0172302722930908, "learning_rate": 9.994978659244918e-06, "loss": 0.7869, "step": 617 }, { "epoch": 0.034013979855798335, "grad_norm": 1.0532630681991577, "learning_rate": 9.994959218810558e-06, "loss": 0.8626, "step": 618 }, { "epoch": 0.034069018658153996, "grad_norm": 0.8300478458404541, "learning_rate": 9.99493974083542e-06, "loss": 0.8166, "step": 619 }, { "epoch": 0.03412405746050966, "grad_norm": 1.0613664388656616, "learning_rate": 9.994920225319656e-06, "loss": 0.8899, "step": 620 }, { "epoch": 0.03417909626286532, "grad_norm": 0.9827042818069458, "learning_rate": 9.994900672263406e-06, "loss": 0.8243, "step": 621 }, { "epoch": 0.03423413506522098, "grad_norm": 0.8790082931518555, "learning_rate": 9.994881081666818e-06, "loss": 0.8153, "step": 622 }, { "epoch": 0.03428917386757664, "grad_norm": 1.033378005027771, "learning_rate": 9.994861453530044e-06, "loss": 0.8916, "step": 623 }, { "epoch": 0.0343442126699323, "grad_norm": 0.9547238349914551, "learning_rate": 9.994841787853227e-06, "loss": 0.9141, "step": 624 }, { "epoch": 0.034399251472287964, "grad_norm": 0.9606438279151917, "learning_rate": 9.994822084636514e-06, "loss": 0.9435, "step": 625 }, { "epoch": 0.034454290274643626, "grad_norm": 0.8461503982543945, "learning_rate": 9.994802343880059e-06, "loss": 0.7914, "step": 626 }, { "epoch": 0.03450932907699929, "grad_norm": 1.144538402557373, "learning_rate": 9.994782565584004e-06, "loss": 0.8025, "step": 627 }, { "epoch": 0.03456436787935495, "grad_norm": 1.0099962949752808, "learning_rate": 9.994762749748502e-06, "loss": 0.9607, "step": 628 }, { "epoch": 0.0346194066817106, "grad_norm": 0.9822041988372803, "learning_rate": 9.9947428963737e-06, "loss": 0.9216, "step": 629 }, { "epoch": 0.034674445484066264, "grad_norm": 0.9056866765022278, "learning_rate": 9.994723005459746e-06, "loss": 0.7913, "step": 630 }, { "epoch": 0.034729484286421926, "grad_norm": 1.0099287033081055, "learning_rate": 9.994703077006792e-06, "loss": 0.9937, "step": 631 }, { "epoch": 0.03478452308877759, "grad_norm": 0.9559167623519897, "learning_rate": 9.994683111014984e-06, "loss": 0.9774, "step": 632 }, { "epoch": 0.03483956189113325, "grad_norm": 1.0359059572219849, "learning_rate": 9.994663107484478e-06, "loss": 0.9062, "step": 633 }, { "epoch": 0.03489460069348891, "grad_norm": 0.8803057074546814, "learning_rate": 9.99464306641542e-06, "loss": 0.9638, "step": 634 }, { "epoch": 0.03494963949584457, "grad_norm": 1.0926579236984253, "learning_rate": 9.994622987807962e-06, "loss": 1.0467, "step": 635 }, { "epoch": 0.03500467829820023, "grad_norm": 1.0051401853561401, "learning_rate": 9.994602871662253e-06, "loss": 0.8717, "step": 636 }, { "epoch": 0.035059717100555894, "grad_norm": 1.2007508277893066, "learning_rate": 9.994582717978448e-06, "loss": 0.8004, "step": 637 }, { "epoch": 0.035114755902911556, "grad_norm": 0.8826266527175903, "learning_rate": 9.994562526756695e-06, "loss": 0.8888, "step": 638 }, { "epoch": 0.03516979470526721, "grad_norm": 0.9953717589378357, "learning_rate": 9.994542297997147e-06, "loss": 0.8999, "step": 639 }, { "epoch": 0.03522483350762287, "grad_norm": 1.0203614234924316, "learning_rate": 9.994522031699958e-06, "loss": 0.8241, "step": 640 }, { "epoch": 0.03527987230997853, "grad_norm": 0.8760203719139099, "learning_rate": 9.994501727865276e-06, "loss": 0.7893, "step": 641 }, { "epoch": 0.035334911112334194, "grad_norm": 1.024888277053833, "learning_rate": 9.994481386493257e-06, "loss": 0.9865, "step": 642 }, { "epoch": 0.035389949914689856, "grad_norm": 0.907454788684845, "learning_rate": 9.994461007584052e-06, "loss": 0.891, "step": 643 }, { "epoch": 0.03544498871704552, "grad_norm": 1.0400965213775635, "learning_rate": 9.994440591137816e-06, "loss": 0.9345, "step": 644 }, { "epoch": 0.03550002751940118, "grad_norm": 0.9816616177558899, "learning_rate": 9.9944201371547e-06, "loss": 0.91, "step": 645 }, { "epoch": 0.03555506632175684, "grad_norm": 1.0528117418289185, "learning_rate": 9.99439964563486e-06, "loss": 0.952, "step": 646 }, { "epoch": 0.0356101051241125, "grad_norm": 0.9802080988883972, "learning_rate": 9.99437911657845e-06, "loss": 0.9392, "step": 647 }, { "epoch": 0.03566514392646816, "grad_norm": 0.9580393433570862, "learning_rate": 9.994358549985623e-06, "loss": 0.874, "step": 648 }, { "epoch": 0.035720182728823824, "grad_norm": 0.8935576677322388, "learning_rate": 9.994337945856533e-06, "loss": 0.8435, "step": 649 }, { "epoch": 0.03577522153117948, "grad_norm": 1.009699821472168, "learning_rate": 9.994317304191337e-06, "loss": 0.9436, "step": 650 }, { "epoch": 0.03583026033353514, "grad_norm": 0.9126121401786804, "learning_rate": 9.994296624990188e-06, "loss": 0.8424, "step": 651 }, { "epoch": 0.0358852991358908, "grad_norm": 0.9555553197860718, "learning_rate": 9.994275908253243e-06, "loss": 0.93, "step": 652 }, { "epoch": 0.03594033793824646, "grad_norm": 0.8359857797622681, "learning_rate": 9.994255153980658e-06, "loss": 0.6326, "step": 653 }, { "epoch": 0.035995376740602124, "grad_norm": 0.8918783664703369, "learning_rate": 9.994234362172587e-06, "loss": 0.8287, "step": 654 }, { "epoch": 0.036050415542957785, "grad_norm": 0.9878549575805664, "learning_rate": 9.994213532829188e-06, "loss": 0.8841, "step": 655 }, { "epoch": 0.03610545434531345, "grad_norm": 0.9504040479660034, "learning_rate": 9.994192665950617e-06, "loss": 1.0182, "step": 656 }, { "epoch": 0.03616049314766911, "grad_norm": 0.9531422257423401, "learning_rate": 9.99417176153703e-06, "loss": 0.8504, "step": 657 }, { "epoch": 0.03621553195002477, "grad_norm": 0.9580292105674744, "learning_rate": 9.994150819588587e-06, "loss": 0.8048, "step": 658 }, { "epoch": 0.03627057075238043, "grad_norm": 0.9786819815635681, "learning_rate": 9.99412984010544e-06, "loss": 0.9124, "step": 659 }, { "epoch": 0.03632560955473609, "grad_norm": 0.9733422994613647, "learning_rate": 9.994108823087751e-06, "loss": 0.8868, "step": 660 }, { "epoch": 0.03638064835709175, "grad_norm": 1.093173623085022, "learning_rate": 9.994087768535679e-06, "loss": 0.9428, "step": 661 }, { "epoch": 0.03643568715944741, "grad_norm": 0.9067148566246033, "learning_rate": 9.994066676449378e-06, "loss": 0.8838, "step": 662 }, { "epoch": 0.03649072596180307, "grad_norm": 0.9509521722793579, "learning_rate": 9.99404554682901e-06, "loss": 0.9034, "step": 663 }, { "epoch": 0.03654576476415873, "grad_norm": 0.9523824453353882, "learning_rate": 9.994024379674731e-06, "loss": 0.9623, "step": 664 }, { "epoch": 0.03660080356651439, "grad_norm": 0.987276554107666, "learning_rate": 9.994003174986703e-06, "loss": 0.8817, "step": 665 }, { "epoch": 0.036655842368870054, "grad_norm": 0.9500744342803955, "learning_rate": 9.993981932765083e-06, "loss": 0.9742, "step": 666 }, { "epoch": 0.036710881171225715, "grad_norm": 0.9420705437660217, "learning_rate": 9.993960653010034e-06, "loss": 0.9657, "step": 667 }, { "epoch": 0.036765919973581376, "grad_norm": 0.9443248510360718, "learning_rate": 9.99393933572171e-06, "loss": 0.8468, "step": 668 }, { "epoch": 0.03682095877593704, "grad_norm": 0.9666558504104614, "learning_rate": 9.993917980900276e-06, "loss": 0.9871, "step": 669 }, { "epoch": 0.0368759975782927, "grad_norm": 1.0236201286315918, "learning_rate": 9.993896588545892e-06, "loss": 0.9814, "step": 670 }, { "epoch": 0.03693103638064836, "grad_norm": 1.016190528869629, "learning_rate": 9.993875158658716e-06, "loss": 1.0156, "step": 671 }, { "epoch": 0.036986075183004015, "grad_norm": 0.9296661019325256, "learning_rate": 9.993853691238913e-06, "loss": 0.7956, "step": 672 }, { "epoch": 0.037041113985359676, "grad_norm": 0.9276684522628784, "learning_rate": 9.993832186286643e-06, "loss": 0.9253, "step": 673 }, { "epoch": 0.03709615278771534, "grad_norm": 0.8588787913322449, "learning_rate": 9.993810643802065e-06, "loss": 0.7878, "step": 674 }, { "epoch": 0.037151191590071, "grad_norm": 0.9955212473869324, "learning_rate": 9.993789063785344e-06, "loss": 0.8711, "step": 675 }, { "epoch": 0.03720623039242666, "grad_norm": 0.925578236579895, "learning_rate": 9.993767446236642e-06, "loss": 0.9431, "step": 676 }, { "epoch": 0.03726126919478232, "grad_norm": 0.9610552787780762, "learning_rate": 9.99374579115612e-06, "loss": 0.887, "step": 677 }, { "epoch": 0.03731630799713798, "grad_norm": 1.0052428245544434, "learning_rate": 9.99372409854394e-06, "loss": 0.8751, "step": 678 }, { "epoch": 0.037371346799493645, "grad_norm": 0.9503066539764404, "learning_rate": 9.99370236840027e-06, "loss": 0.8556, "step": 679 }, { "epoch": 0.037426385601849306, "grad_norm": 2.426232099533081, "learning_rate": 9.993680600725266e-06, "loss": 0.9077, "step": 680 }, { "epoch": 0.03748142440420497, "grad_norm": 0.9119723439216614, "learning_rate": 9.993658795519096e-06, "loss": 0.8575, "step": 681 }, { "epoch": 0.03753646320656062, "grad_norm": 0.9688286781311035, "learning_rate": 9.993636952781923e-06, "loss": 0.8921, "step": 682 }, { "epoch": 0.03759150200891628, "grad_norm": 1.030013084411621, "learning_rate": 9.993615072513913e-06, "loss": 0.8622, "step": 683 }, { "epoch": 0.037646540811271945, "grad_norm": 1.055187463760376, "learning_rate": 9.993593154715228e-06, "loss": 0.9251, "step": 684 }, { "epoch": 0.037701579613627606, "grad_norm": 1.0518591403961182, "learning_rate": 9.993571199386032e-06, "loss": 0.9575, "step": 685 }, { "epoch": 0.03775661841598327, "grad_norm": 0.9232666492462158, "learning_rate": 9.993549206526495e-06, "loss": 0.8522, "step": 686 }, { "epoch": 0.03781165721833893, "grad_norm": 1.0212332010269165, "learning_rate": 9.993527176136775e-06, "loss": 0.9358, "step": 687 }, { "epoch": 0.03786669602069459, "grad_norm": 0.9137141108512878, "learning_rate": 9.993505108217045e-06, "loss": 0.8561, "step": 688 }, { "epoch": 0.03792173482305025, "grad_norm": 1.0069375038146973, "learning_rate": 9.993483002767465e-06, "loss": 0.8274, "step": 689 }, { "epoch": 0.03797677362540591, "grad_norm": 0.9820672869682312, "learning_rate": 9.993460859788204e-06, "loss": 0.907, "step": 690 }, { "epoch": 0.038031812427761574, "grad_norm": 1.0042002201080322, "learning_rate": 9.993438679279428e-06, "loss": 0.9263, "step": 691 }, { "epoch": 0.038086851230117236, "grad_norm": 0.9733695983886719, "learning_rate": 9.993416461241304e-06, "loss": 0.8455, "step": 692 }, { "epoch": 0.03814189003247289, "grad_norm": 0.9106015563011169, "learning_rate": 9.993394205673996e-06, "loss": 0.8469, "step": 693 }, { "epoch": 0.03819692883482855, "grad_norm": 0.9802660346031189, "learning_rate": 9.993371912577677e-06, "loss": 0.8662, "step": 694 }, { "epoch": 0.03825196763718421, "grad_norm": 0.9183964729309082, "learning_rate": 9.99334958195251e-06, "loss": 0.8968, "step": 695 }, { "epoch": 0.038307006439539874, "grad_norm": 0.9572185277938843, "learning_rate": 9.993327213798663e-06, "loss": 0.953, "step": 696 }, { "epoch": 0.038362045241895536, "grad_norm": 1.4480071067810059, "learning_rate": 9.993304808116307e-06, "loss": 1.1131, "step": 697 }, { "epoch": 0.0384170840442512, "grad_norm": 0.9297361969947815, "learning_rate": 9.993282364905607e-06, "loss": 0.884, "step": 698 }, { "epoch": 0.03847212284660686, "grad_norm": 0.9400073885917664, "learning_rate": 9.993259884166735e-06, "loss": 0.932, "step": 699 }, { "epoch": 0.03852716164896252, "grad_norm": 0.9231798052787781, "learning_rate": 9.993237365899858e-06, "loss": 0.8981, "step": 700 }, { "epoch": 0.03858220045131818, "grad_norm": 0.8233712911605835, "learning_rate": 9.993214810105144e-06, "loss": 0.8218, "step": 701 }, { "epoch": 0.03863723925367384, "grad_norm": 1.0997854471206665, "learning_rate": 9.993192216782768e-06, "loss": 0.9298, "step": 702 }, { "epoch": 0.038692278056029504, "grad_norm": 0.9570802450180054, "learning_rate": 9.993169585932893e-06, "loss": 0.7815, "step": 703 }, { "epoch": 0.03874731685838516, "grad_norm": 0.9913730025291443, "learning_rate": 9.993146917555692e-06, "loss": 0.9621, "step": 704 }, { "epoch": 0.03880235566074082, "grad_norm": 1.088767409324646, "learning_rate": 9.993124211651334e-06, "loss": 0.9295, "step": 705 }, { "epoch": 0.03885739446309648, "grad_norm": 0.8199124336242676, "learning_rate": 9.993101468219995e-06, "loss": 0.7613, "step": 706 }, { "epoch": 0.03891243326545214, "grad_norm": 1.112566351890564, "learning_rate": 9.99307868726184e-06, "loss": 0.791, "step": 707 }, { "epoch": 0.038967472067807804, "grad_norm": 0.9372578859329224, "learning_rate": 9.99305586877704e-06, "loss": 0.8567, "step": 708 }, { "epoch": 0.039022510870163465, "grad_norm": 1.0167721509933472, "learning_rate": 9.99303301276577e-06, "loss": 0.9787, "step": 709 }, { "epoch": 0.03907754967251913, "grad_norm": 1.3526856899261475, "learning_rate": 9.993010119228202e-06, "loss": 1.2215, "step": 710 }, { "epoch": 0.03913258847487479, "grad_norm": 0.8819016814231873, "learning_rate": 9.992987188164505e-06, "loss": 0.7736, "step": 711 }, { "epoch": 0.03918762727723045, "grad_norm": 1.0033677816390991, "learning_rate": 9.992964219574852e-06, "loss": 0.9919, "step": 712 }, { "epoch": 0.03924266607958611, "grad_norm": 0.894926130771637, "learning_rate": 9.992941213459417e-06, "loss": 0.9058, "step": 713 }, { "epoch": 0.03929770488194177, "grad_norm": 0.9481377005577087, "learning_rate": 9.992918169818373e-06, "loss": 0.8436, "step": 714 }, { "epoch": 0.03935274368429743, "grad_norm": 0.9312933087348938, "learning_rate": 9.992895088651893e-06, "loss": 0.8869, "step": 715 }, { "epoch": 0.03940778248665309, "grad_norm": 0.9765705466270447, "learning_rate": 9.99287196996015e-06, "loss": 0.9512, "step": 716 }, { "epoch": 0.03946282128900875, "grad_norm": 0.9610235691070557, "learning_rate": 9.992848813743317e-06, "loss": 0.8005, "step": 717 }, { "epoch": 0.03951786009136441, "grad_norm": 1.102995753288269, "learning_rate": 9.99282562000157e-06, "loss": 0.8017, "step": 718 }, { "epoch": 0.03957289889372007, "grad_norm": 1.023317575454712, "learning_rate": 9.99280238873508e-06, "loss": 0.911, "step": 719 }, { "epoch": 0.039627937696075734, "grad_norm": 1.0531049966812134, "learning_rate": 9.992779119944025e-06, "loss": 0.8562, "step": 720 }, { "epoch": 0.039682976498431395, "grad_norm": 0.918250322341919, "learning_rate": 9.992755813628579e-06, "loss": 0.92, "step": 721 }, { "epoch": 0.039738015300787057, "grad_norm": 0.8508251309394836, "learning_rate": 9.992732469788915e-06, "loss": 0.7347, "step": 722 }, { "epoch": 0.03979305410314272, "grad_norm": 0.9184926152229309, "learning_rate": 9.992709088425211e-06, "loss": 0.8732, "step": 723 }, { "epoch": 0.03984809290549838, "grad_norm": 1.1613929271697998, "learning_rate": 9.992685669537643e-06, "loss": 0.9522, "step": 724 }, { "epoch": 0.039903131707854034, "grad_norm": 1.091513752937317, "learning_rate": 9.992662213126386e-06, "loss": 0.9646, "step": 725 }, { "epoch": 0.039958170510209695, "grad_norm": 1.057803750038147, "learning_rate": 9.992638719191615e-06, "loss": 0.7032, "step": 726 }, { "epoch": 0.040013209312565357, "grad_norm": 0.8771823644638062, "learning_rate": 9.992615187733508e-06, "loss": 0.8577, "step": 727 }, { "epoch": 0.04006824811492102, "grad_norm": 0.9471028447151184, "learning_rate": 9.992591618752244e-06, "loss": 0.9057, "step": 728 }, { "epoch": 0.04012328691727668, "grad_norm": 0.9547705054283142, "learning_rate": 9.992568012247995e-06, "loss": 0.9549, "step": 729 }, { "epoch": 0.04017832571963234, "grad_norm": 0.8862974047660828, "learning_rate": 9.992544368220941e-06, "loss": 0.8593, "step": 730 }, { "epoch": 0.040233364521988, "grad_norm": 0.906334400177002, "learning_rate": 9.992520686671261e-06, "loss": 0.8832, "step": 731 }, { "epoch": 0.04028840332434366, "grad_norm": 1.07270085811615, "learning_rate": 9.992496967599133e-06, "loss": 0.9409, "step": 732 }, { "epoch": 0.040343442126699325, "grad_norm": 0.9026005268096924, "learning_rate": 9.992473211004734e-06, "loss": 0.8326, "step": 733 }, { "epoch": 0.040398480929054986, "grad_norm": 0.9762942790985107, "learning_rate": 9.992449416888241e-06, "loss": 0.9048, "step": 734 }, { "epoch": 0.04045351973141065, "grad_norm": 0.9658033847808838, "learning_rate": 9.992425585249837e-06, "loss": 0.9219, "step": 735 }, { "epoch": 0.0405085585337663, "grad_norm": 0.8909044861793518, "learning_rate": 9.992401716089698e-06, "loss": 0.8564, "step": 736 }, { "epoch": 0.04056359733612196, "grad_norm": 1.0387929677963257, "learning_rate": 9.992377809408001e-06, "loss": 0.9533, "step": 737 }, { "epoch": 0.040618636138477625, "grad_norm": 0.9044275879859924, "learning_rate": 9.99235386520493e-06, "loss": 0.8508, "step": 738 }, { "epoch": 0.040673674940833286, "grad_norm": 1.019377589225769, "learning_rate": 9.992329883480667e-06, "loss": 0.8684, "step": 739 }, { "epoch": 0.04072871374318895, "grad_norm": 0.9394627213478088, "learning_rate": 9.992305864235385e-06, "loss": 0.7665, "step": 740 }, { "epoch": 0.04078375254554461, "grad_norm": 0.8652323484420776, "learning_rate": 9.99228180746927e-06, "loss": 0.8576, "step": 741 }, { "epoch": 0.04083879134790027, "grad_norm": 0.9347619414329529, "learning_rate": 9.992257713182502e-06, "loss": 0.9586, "step": 742 }, { "epoch": 0.04089383015025593, "grad_norm": 0.9510203003883362, "learning_rate": 9.99223358137526e-06, "loss": 0.9092, "step": 743 }, { "epoch": 0.04094886895261159, "grad_norm": 0.8242866396903992, "learning_rate": 9.992209412047729e-06, "loss": 0.6997, "step": 744 }, { "epoch": 0.041003907754967255, "grad_norm": 0.8842730522155762, "learning_rate": 9.992185205200087e-06, "loss": 0.8873, "step": 745 }, { "epoch": 0.041058946557322916, "grad_norm": 1.0813730955123901, "learning_rate": 9.992160960832518e-06, "loss": 1.0162, "step": 746 }, { "epoch": 0.04111398535967857, "grad_norm": 1.1276283264160156, "learning_rate": 9.9921366789452e-06, "loss": 1.0004, "step": 747 }, { "epoch": 0.04116902416203423, "grad_norm": 0.8810326457023621, "learning_rate": 9.992112359538323e-06, "loss": 0.7823, "step": 748 }, { "epoch": 0.04122406296438989, "grad_norm": 0.9939407110214233, "learning_rate": 9.992088002612066e-06, "loss": 1.0016, "step": 749 }, { "epoch": 0.041279101766745555, "grad_norm": 1.0963523387908936, "learning_rate": 9.99206360816661e-06, "loss": 0.9252, "step": 750 }, { "epoch": 0.041334140569101216, "grad_norm": 1.1346478462219238, "learning_rate": 9.99203917620214e-06, "loss": 0.9608, "step": 751 }, { "epoch": 0.04138917937145688, "grad_norm": 1.0108580589294434, "learning_rate": 9.992014706718841e-06, "loss": 0.9179, "step": 752 }, { "epoch": 0.04144421817381254, "grad_norm": 0.897293210029602, "learning_rate": 9.991990199716894e-06, "loss": 0.9295, "step": 753 }, { "epoch": 0.0414992569761682, "grad_norm": 1.0152363777160645, "learning_rate": 9.991965655196488e-06, "loss": 0.8467, "step": 754 }, { "epoch": 0.04155429577852386, "grad_norm": 0.8655388355255127, "learning_rate": 9.9919410731578e-06, "loss": 0.796, "step": 755 }, { "epoch": 0.04160933458087952, "grad_norm": 1.0140331983566284, "learning_rate": 9.991916453601023e-06, "loss": 0.8444, "step": 756 }, { "epoch": 0.041664373383235184, "grad_norm": 0.9387341141700745, "learning_rate": 9.991891796526338e-06, "loss": 0.8669, "step": 757 }, { "epoch": 0.04171941218559084, "grad_norm": 0.9395696520805359, "learning_rate": 9.991867101933928e-06, "loss": 0.8376, "step": 758 }, { "epoch": 0.0417744509879465, "grad_norm": 1.0856634378433228, "learning_rate": 9.991842369823983e-06, "loss": 0.9271, "step": 759 }, { "epoch": 0.04182948979030216, "grad_norm": 0.8777190446853638, "learning_rate": 9.991817600196687e-06, "loss": 0.9197, "step": 760 }, { "epoch": 0.04188452859265782, "grad_norm": 0.9639917016029358, "learning_rate": 9.991792793052225e-06, "loss": 0.8835, "step": 761 }, { "epoch": 0.041939567395013484, "grad_norm": 0.9384773969650269, "learning_rate": 9.991767948390785e-06, "loss": 0.8403, "step": 762 }, { "epoch": 0.041994606197369146, "grad_norm": 0.8987650275230408, "learning_rate": 9.991743066212554e-06, "loss": 0.7948, "step": 763 }, { "epoch": 0.04204964499972481, "grad_norm": 1.0545049905776978, "learning_rate": 9.991718146517717e-06, "loss": 0.9359, "step": 764 }, { "epoch": 0.04210468380208047, "grad_norm": 0.9840022325515747, "learning_rate": 9.991693189306463e-06, "loss": 0.9188, "step": 765 }, { "epoch": 0.04215972260443613, "grad_norm": 0.8769927620887756, "learning_rate": 9.991668194578981e-06, "loss": 0.8647, "step": 766 }, { "epoch": 0.04221476140679179, "grad_norm": 0.9268791675567627, "learning_rate": 9.991643162335455e-06, "loss": 0.897, "step": 767 }, { "epoch": 0.042269800209147446, "grad_norm": 0.9316747784614563, "learning_rate": 9.991618092576075e-06, "loss": 0.9341, "step": 768 }, { "epoch": 0.04232483901150311, "grad_norm": 0.8348364233970642, "learning_rate": 9.991592985301031e-06, "loss": 0.7528, "step": 769 }, { "epoch": 0.04237987781385877, "grad_norm": 0.9139068126678467, "learning_rate": 9.99156784051051e-06, "loss": 0.8596, "step": 770 }, { "epoch": 0.04243491661621443, "grad_norm": 0.9403928518295288, "learning_rate": 9.991542658204701e-06, "loss": 0.974, "step": 771 }, { "epoch": 0.04248995541857009, "grad_norm": 0.993549108505249, "learning_rate": 9.991517438383793e-06, "loss": 0.9479, "step": 772 }, { "epoch": 0.04254499422092575, "grad_norm": 0.8494916558265686, "learning_rate": 9.991492181047975e-06, "loss": 0.9149, "step": 773 }, { "epoch": 0.042600033023281414, "grad_norm": 1.0351910591125488, "learning_rate": 9.991466886197441e-06, "loss": 0.9552, "step": 774 }, { "epoch": 0.042655071825637075, "grad_norm": 0.916829526424408, "learning_rate": 9.991441553832375e-06, "loss": 0.8781, "step": 775 }, { "epoch": 0.04271011062799274, "grad_norm": 1.113476276397705, "learning_rate": 9.991416183952972e-06, "loss": 0.8137, "step": 776 }, { "epoch": 0.0427651494303484, "grad_norm": 1.1608171463012695, "learning_rate": 9.991390776559421e-06, "loss": 1.0045, "step": 777 }, { "epoch": 0.04282018823270406, "grad_norm": 1.0045493841171265, "learning_rate": 9.991365331651913e-06, "loss": 0.8813, "step": 778 }, { "epoch": 0.042875227035059714, "grad_norm": 0.918820858001709, "learning_rate": 9.991339849230639e-06, "loss": 0.9198, "step": 779 }, { "epoch": 0.042930265837415375, "grad_norm": 0.9875735640525818, "learning_rate": 9.991314329295792e-06, "loss": 0.8665, "step": 780 }, { "epoch": 0.04298530463977104, "grad_norm": 0.873768150806427, "learning_rate": 9.991288771847561e-06, "loss": 0.8606, "step": 781 }, { "epoch": 0.0430403434421267, "grad_norm": 0.8892746567726135, "learning_rate": 9.991263176886139e-06, "loss": 0.9011, "step": 782 }, { "epoch": 0.04309538224448236, "grad_norm": 1.097734808921814, "learning_rate": 9.99123754441172e-06, "loss": 1.009, "step": 783 }, { "epoch": 0.04315042104683802, "grad_norm": 1.0065964460372925, "learning_rate": 9.991211874424497e-06, "loss": 0.9492, "step": 784 }, { "epoch": 0.04320545984919368, "grad_norm": 1.0791678428649902, "learning_rate": 9.99118616692466e-06, "loss": 1.0142, "step": 785 }, { "epoch": 0.043260498651549344, "grad_norm": 0.9454777836799622, "learning_rate": 9.991160421912404e-06, "loss": 0.8058, "step": 786 }, { "epoch": 0.043315537453905005, "grad_norm": 0.9448156952857971, "learning_rate": 9.991134639387922e-06, "loss": 0.8184, "step": 787 }, { "epoch": 0.043370576256260666, "grad_norm": 0.9636550545692444, "learning_rate": 9.99110881935141e-06, "loss": 0.8606, "step": 788 }, { "epoch": 0.04342561505861633, "grad_norm": 0.9933613538742065, "learning_rate": 9.991082961803058e-06, "loss": 0.9449, "step": 789 }, { "epoch": 0.04348065386097198, "grad_norm": 0.8906797170639038, "learning_rate": 9.991057066743065e-06, "loss": 0.8053, "step": 790 }, { "epoch": 0.043535692663327644, "grad_norm": 1.0393906831741333, "learning_rate": 9.991031134171621e-06, "loss": 0.8487, "step": 791 }, { "epoch": 0.043590731465683305, "grad_norm": 1.0618231296539307, "learning_rate": 9.991005164088923e-06, "loss": 0.9847, "step": 792 }, { "epoch": 0.043645770268038966, "grad_norm": 0.9525149464607239, "learning_rate": 9.990979156495167e-06, "loss": 0.9318, "step": 793 }, { "epoch": 0.04370080907039463, "grad_norm": 0.9430851936340332, "learning_rate": 9.990953111390546e-06, "loss": 0.8483, "step": 794 }, { "epoch": 0.04375584787275029, "grad_norm": 0.9259672164916992, "learning_rate": 9.99092702877526e-06, "loss": 0.9365, "step": 795 }, { "epoch": 0.04381088667510595, "grad_norm": 0.942609965801239, "learning_rate": 9.9909009086495e-06, "loss": 0.8408, "step": 796 }, { "epoch": 0.04386592547746161, "grad_norm": 0.939255952835083, "learning_rate": 9.990874751013467e-06, "loss": 0.8749, "step": 797 }, { "epoch": 0.04392096427981727, "grad_norm": 1.1701711416244507, "learning_rate": 9.990848555867353e-06, "loss": 0.9312, "step": 798 }, { "epoch": 0.043976003082172935, "grad_norm": 1.0441124439239502, "learning_rate": 9.990822323211358e-06, "loss": 0.8618, "step": 799 }, { "epoch": 0.04403104188452859, "grad_norm": 0.9601489305496216, "learning_rate": 9.990796053045679e-06, "loss": 0.9569, "step": 800 }, { "epoch": 0.04408608068688425, "grad_norm": 0.9394032955169678, "learning_rate": 9.990769745370513e-06, "loss": 0.846, "step": 801 }, { "epoch": 0.04414111948923991, "grad_norm": 0.9631348252296448, "learning_rate": 9.990743400186056e-06, "loss": 0.8754, "step": 802 }, { "epoch": 0.04419615829159557, "grad_norm": 0.9234963059425354, "learning_rate": 9.990717017492508e-06, "loss": 0.8613, "step": 803 }, { "epoch": 0.044251197093951235, "grad_norm": 0.9169090390205383, "learning_rate": 9.990690597290069e-06, "loss": 0.8867, "step": 804 }, { "epoch": 0.044306235896306896, "grad_norm": 1.0194867849349976, "learning_rate": 9.990664139578933e-06, "loss": 0.8675, "step": 805 }, { "epoch": 0.04436127469866256, "grad_norm": 1.3226114511489868, "learning_rate": 9.990637644359302e-06, "loss": 0.997, "step": 806 }, { "epoch": 0.04441631350101822, "grad_norm": 0.8904317617416382, "learning_rate": 9.990611111631374e-06, "loss": 0.7274, "step": 807 }, { "epoch": 0.04447135230337388, "grad_norm": 0.8909007906913757, "learning_rate": 9.99058454139535e-06, "loss": 0.8141, "step": 808 }, { "epoch": 0.04452639110572954, "grad_norm": 1.004015564918518, "learning_rate": 9.990557933651429e-06, "loss": 0.9883, "step": 809 }, { "epoch": 0.0445814299080852, "grad_norm": 1.1215732097625732, "learning_rate": 9.990531288399807e-06, "loss": 0.9355, "step": 810 }, { "epoch": 0.04463646871044086, "grad_norm": 1.0545012950897217, "learning_rate": 9.99050460564069e-06, "loss": 0.9532, "step": 811 }, { "epoch": 0.04469150751279652, "grad_norm": 0.9608867168426514, "learning_rate": 9.990477885374277e-06, "loss": 0.9363, "step": 812 }, { "epoch": 0.04474654631515218, "grad_norm": 0.8750461935997009, "learning_rate": 9.990451127600766e-06, "loss": 0.7343, "step": 813 }, { "epoch": 0.04480158511750784, "grad_norm": 0.891740620136261, "learning_rate": 9.99042433232036e-06, "loss": 0.8541, "step": 814 }, { "epoch": 0.0448566239198635, "grad_norm": 1.1520029306411743, "learning_rate": 9.990397499533264e-06, "loss": 0.7696, "step": 815 }, { "epoch": 0.044911662722219164, "grad_norm": 0.9526278972625732, "learning_rate": 9.990370629239673e-06, "loss": 0.8953, "step": 816 }, { "epoch": 0.044966701524574826, "grad_norm": 0.9218434691429138, "learning_rate": 9.990343721439795e-06, "loss": 0.8198, "step": 817 }, { "epoch": 0.04502174032693049, "grad_norm": 0.8502745628356934, "learning_rate": 9.990316776133827e-06, "loss": 0.8035, "step": 818 }, { "epoch": 0.04507677912928615, "grad_norm": 0.8861565589904785, "learning_rate": 9.990289793321975e-06, "loss": 0.8626, "step": 819 }, { "epoch": 0.04513181793164181, "grad_norm": 1.1113256216049194, "learning_rate": 9.99026277300444e-06, "loss": 0.9363, "step": 820 }, { "epoch": 0.04518685673399747, "grad_norm": 0.9984708428382874, "learning_rate": 9.990235715181426e-06, "loss": 1.0376, "step": 821 }, { "epoch": 0.045241895536353126, "grad_norm": 0.9026711583137512, "learning_rate": 9.990208619853137e-06, "loss": 0.9079, "step": 822 }, { "epoch": 0.04529693433870879, "grad_norm": 0.8724965453147888, "learning_rate": 9.990181487019775e-06, "loss": 0.8665, "step": 823 }, { "epoch": 0.04535197314106445, "grad_norm": 0.8923047780990601, "learning_rate": 9.990154316681543e-06, "loss": 0.7779, "step": 824 }, { "epoch": 0.04540701194342011, "grad_norm": 0.9024640321731567, "learning_rate": 9.99012710883865e-06, "loss": 0.8859, "step": 825 }, { "epoch": 0.04546205074577577, "grad_norm": 0.9245888590812683, "learning_rate": 9.990099863491296e-06, "loss": 0.8501, "step": 826 }, { "epoch": 0.04551708954813143, "grad_norm": 0.9257050156593323, "learning_rate": 9.990072580639687e-06, "loss": 0.9561, "step": 827 }, { "epoch": 0.045572128350487094, "grad_norm": 0.995610773563385, "learning_rate": 9.99004526028403e-06, "loss": 0.917, "step": 828 }, { "epoch": 0.045627167152842756, "grad_norm": 0.9524009823799133, "learning_rate": 9.990017902424525e-06, "loss": 0.9184, "step": 829 }, { "epoch": 0.04568220595519842, "grad_norm": 0.9264503121376038, "learning_rate": 9.989990507061385e-06, "loss": 0.8615, "step": 830 }, { "epoch": 0.04573724475755408, "grad_norm": 1.0068570375442505, "learning_rate": 9.989963074194809e-06, "loss": 0.8331, "step": 831 }, { "epoch": 0.04579228355990974, "grad_norm": 0.9295952320098877, "learning_rate": 9.989935603825009e-06, "loss": 0.8387, "step": 832 }, { "epoch": 0.045847322362265394, "grad_norm": 1.0408827066421509, "learning_rate": 9.989908095952186e-06, "loss": 0.9686, "step": 833 }, { "epoch": 0.045902361164621056, "grad_norm": 0.8874136209487915, "learning_rate": 9.989880550576551e-06, "loss": 0.815, "step": 834 }, { "epoch": 0.04595739996697672, "grad_norm": 0.9898836016654968, "learning_rate": 9.989852967698311e-06, "loss": 0.9458, "step": 835 }, { "epoch": 0.04601243876933238, "grad_norm": 0.9828970432281494, "learning_rate": 9.989825347317668e-06, "loss": 0.7922, "step": 836 }, { "epoch": 0.04606747757168804, "grad_norm": 1.025447964668274, "learning_rate": 9.989797689434836e-06, "loss": 0.9349, "step": 837 }, { "epoch": 0.0461225163740437, "grad_norm": 0.8623831272125244, "learning_rate": 9.98976999405002e-06, "loss": 0.8786, "step": 838 }, { "epoch": 0.04617755517639936, "grad_norm": 0.9614997506141663, "learning_rate": 9.98974226116343e-06, "loss": 0.7885, "step": 839 }, { "epoch": 0.046232593978755024, "grad_norm": 1.0207616090774536, "learning_rate": 9.989714490775269e-06, "loss": 0.9786, "step": 840 }, { "epoch": 0.046287632781110685, "grad_norm": 0.8509595990180969, "learning_rate": 9.98968668288575e-06, "loss": 0.7312, "step": 841 }, { "epoch": 0.04634267158346635, "grad_norm": 0.9822607040405273, "learning_rate": 9.989658837495084e-06, "loss": 0.952, "step": 842 }, { "epoch": 0.046397710385822, "grad_norm": 1.0058252811431885, "learning_rate": 9.989630954603477e-06, "loss": 0.8811, "step": 843 }, { "epoch": 0.04645274918817766, "grad_norm": 1.0146985054016113, "learning_rate": 9.989603034211139e-06, "loss": 0.9051, "step": 844 }, { "epoch": 0.046507787990533324, "grad_norm": 0.8976503610610962, "learning_rate": 9.98957507631828e-06, "loss": 0.879, "step": 845 }, { "epoch": 0.046562826792888985, "grad_norm": 0.8791939616203308, "learning_rate": 9.989547080925111e-06, "loss": 0.8944, "step": 846 }, { "epoch": 0.04661786559524465, "grad_norm": 0.8530884981155396, "learning_rate": 9.989519048031842e-06, "loss": 0.9029, "step": 847 }, { "epoch": 0.04667290439760031, "grad_norm": 0.9621617197990417, "learning_rate": 9.989490977638683e-06, "loss": 0.8374, "step": 848 }, { "epoch": 0.04672794319995597, "grad_norm": 0.9629075527191162, "learning_rate": 9.989462869745845e-06, "loss": 0.9032, "step": 849 }, { "epoch": 0.04678298200231163, "grad_norm": 1.3256126642227173, "learning_rate": 9.989434724353541e-06, "loss": 0.9748, "step": 850 }, { "epoch": 0.04683802080466729, "grad_norm": 1.0230494737625122, "learning_rate": 9.989406541461979e-06, "loss": 0.9752, "step": 851 }, { "epoch": 0.046893059607022954, "grad_norm": 0.8454533219337463, "learning_rate": 9.989378321071375e-06, "loss": 0.8426, "step": 852 }, { "epoch": 0.046948098409378615, "grad_norm": 0.9995863437652588, "learning_rate": 9.989350063181939e-06, "loss": 0.9955, "step": 853 }, { "epoch": 0.04700313721173427, "grad_norm": 0.8956604599952698, "learning_rate": 9.989321767793883e-06, "loss": 0.9024, "step": 854 }, { "epoch": 0.04705817601408993, "grad_norm": 1.0123292207717896, "learning_rate": 9.989293434907419e-06, "loss": 0.7856, "step": 855 }, { "epoch": 0.04711321481644559, "grad_norm": 0.814577043056488, "learning_rate": 9.989265064522762e-06, "loss": 0.8377, "step": 856 }, { "epoch": 0.047168253618801254, "grad_norm": 1.1571552753448486, "learning_rate": 9.989236656640125e-06, "loss": 0.8562, "step": 857 }, { "epoch": 0.047223292421156915, "grad_norm": 0.9681577682495117, "learning_rate": 9.98920821125972e-06, "loss": 0.8473, "step": 858 }, { "epoch": 0.047278331223512576, "grad_norm": 0.9680121541023254, "learning_rate": 9.989179728381761e-06, "loss": 0.9811, "step": 859 }, { "epoch": 0.04733337002586824, "grad_norm": 0.985477089881897, "learning_rate": 9.989151208006464e-06, "loss": 0.6994, "step": 860 }, { "epoch": 0.0473884088282239, "grad_norm": 0.8612962365150452, "learning_rate": 9.98912265013404e-06, "loss": 0.7667, "step": 861 }, { "epoch": 0.04744344763057956, "grad_norm": 0.8884604573249817, "learning_rate": 9.989094054764708e-06, "loss": 0.8382, "step": 862 }, { "epoch": 0.04749848643293522, "grad_norm": 1.036881923675537, "learning_rate": 9.989065421898681e-06, "loss": 0.8748, "step": 863 }, { "epoch": 0.04755352523529088, "grad_norm": 0.9954493045806885, "learning_rate": 9.989036751536171e-06, "loss": 0.9174, "step": 864 }, { "epoch": 0.04760856403764654, "grad_norm": 0.9984694123268127, "learning_rate": 9.989008043677399e-06, "loss": 0.7636, "step": 865 }, { "epoch": 0.0476636028400022, "grad_norm": 1.0412588119506836, "learning_rate": 9.988979298322576e-06, "loss": 0.773, "step": 866 }, { "epoch": 0.04771864164235786, "grad_norm": 0.8034874796867371, "learning_rate": 9.98895051547192e-06, "loss": 0.7914, "step": 867 }, { "epoch": 0.04777368044471352, "grad_norm": 0.8983979225158691, "learning_rate": 9.988921695125648e-06, "loss": 0.7292, "step": 868 }, { "epoch": 0.04782871924706918, "grad_norm": 0.9445077776908875, "learning_rate": 9.988892837283976e-06, "loss": 0.8263, "step": 869 }, { "epoch": 0.047883758049424845, "grad_norm": 1.0753306150436401, "learning_rate": 9.988863941947121e-06, "loss": 1.1122, "step": 870 }, { "epoch": 0.047938796851780506, "grad_norm": 1.0091484785079956, "learning_rate": 9.9888350091153e-06, "loss": 0.9276, "step": 871 }, { "epoch": 0.04799383565413617, "grad_norm": 1.0977306365966797, "learning_rate": 9.988806038788732e-06, "loss": 0.854, "step": 872 }, { "epoch": 0.04804887445649183, "grad_norm": 1.0285007953643799, "learning_rate": 9.988777030967632e-06, "loss": 0.9441, "step": 873 }, { "epoch": 0.04810391325884749, "grad_norm": 0.8973976373672485, "learning_rate": 9.988747985652218e-06, "loss": 0.786, "step": 874 }, { "epoch": 0.04815895206120315, "grad_norm": 0.9809553623199463, "learning_rate": 9.98871890284271e-06, "loss": 0.9042, "step": 875 }, { "epoch": 0.048213990863558806, "grad_norm": 0.8514279723167419, "learning_rate": 9.988689782539326e-06, "loss": 0.7874, "step": 876 }, { "epoch": 0.04826902966591447, "grad_norm": 0.8299674391746521, "learning_rate": 9.988660624742286e-06, "loss": 0.8704, "step": 877 }, { "epoch": 0.04832406846827013, "grad_norm": 0.9862462282180786, "learning_rate": 9.988631429451809e-06, "loss": 0.9963, "step": 878 }, { "epoch": 0.04837910727062579, "grad_norm": 0.9041131734848022, "learning_rate": 9.988602196668111e-06, "loss": 0.9207, "step": 879 }, { "epoch": 0.04843414607298145, "grad_norm": 0.8597276210784912, "learning_rate": 9.988572926391416e-06, "loss": 0.8226, "step": 880 }, { "epoch": 0.04848918487533711, "grad_norm": 0.9494329690933228, "learning_rate": 9.988543618621941e-06, "loss": 0.8834, "step": 881 }, { "epoch": 0.048544223677692774, "grad_norm": 0.9129118323326111, "learning_rate": 9.98851427335991e-06, "loss": 0.7819, "step": 882 }, { "epoch": 0.048599262480048436, "grad_norm": 0.9145999550819397, "learning_rate": 9.988484890605539e-06, "loss": 0.885, "step": 883 }, { "epoch": 0.0486543012824041, "grad_norm": 1.0115307569503784, "learning_rate": 9.98845547035905e-06, "loss": 0.8347, "step": 884 }, { "epoch": 0.04870934008475976, "grad_norm": 1.1372706890106201, "learning_rate": 9.988426012620667e-06, "loss": 0.944, "step": 885 }, { "epoch": 0.04876437888711541, "grad_norm": 0.9502811431884766, "learning_rate": 9.98839651739061e-06, "loss": 0.9054, "step": 886 }, { "epoch": 0.048819417689471074, "grad_norm": 0.9612823128700256, "learning_rate": 9.988366984669097e-06, "loss": 0.8796, "step": 887 }, { "epoch": 0.048874456491826736, "grad_norm": 0.9551461935043335, "learning_rate": 9.988337414456355e-06, "loss": 0.8769, "step": 888 }, { "epoch": 0.0489294952941824, "grad_norm": 0.8554086089134216, "learning_rate": 9.988307806752603e-06, "loss": 0.892, "step": 889 }, { "epoch": 0.04898453409653806, "grad_norm": 0.8418886661529541, "learning_rate": 9.988278161558067e-06, "loss": 0.7568, "step": 890 }, { "epoch": 0.04903957289889372, "grad_norm": 1.4780360460281372, "learning_rate": 9.988248478872967e-06, "loss": 0.9126, "step": 891 }, { "epoch": 0.04909461170124938, "grad_norm": 0.8236714005470276, "learning_rate": 9.988218758697526e-06, "loss": 0.7317, "step": 892 }, { "epoch": 0.04914965050360504, "grad_norm": 0.8777141571044922, "learning_rate": 9.988189001031968e-06, "loss": 0.7989, "step": 893 }, { "epoch": 0.049204689305960704, "grad_norm": 1.0235031843185425, "learning_rate": 9.988159205876516e-06, "loss": 0.8335, "step": 894 }, { "epoch": 0.049259728108316365, "grad_norm": 0.9340357184410095, "learning_rate": 9.988129373231395e-06, "loss": 0.8129, "step": 895 }, { "epoch": 0.04931476691067203, "grad_norm": 1.7686667442321777, "learning_rate": 9.98809950309683e-06, "loss": 0.9792, "step": 896 }, { "epoch": 0.04936980571302768, "grad_norm": 0.9252369403839111, "learning_rate": 9.988069595473044e-06, "loss": 0.8671, "step": 897 }, { "epoch": 0.04942484451538334, "grad_norm": 0.9989960789680481, "learning_rate": 9.988039650360262e-06, "loss": 0.9245, "step": 898 }, { "epoch": 0.049479883317739004, "grad_norm": 1.062912106513977, "learning_rate": 9.98800966775871e-06, "loss": 0.9146, "step": 899 }, { "epoch": 0.049534922120094665, "grad_norm": 0.8698169589042664, "learning_rate": 9.98797964766861e-06, "loss": 0.8606, "step": 900 }, { "epoch": 0.04958996092245033, "grad_norm": 1.6754224300384521, "learning_rate": 9.98794959009019e-06, "loss": 0.9236, "step": 901 }, { "epoch": 0.04964499972480599, "grad_norm": 1.084174394607544, "learning_rate": 9.98791949502368e-06, "loss": 0.9252, "step": 902 }, { "epoch": 0.04970003852716165, "grad_norm": 0.9866724610328674, "learning_rate": 9.987889362469301e-06, "loss": 0.9096, "step": 903 }, { "epoch": 0.04975507732951731, "grad_norm": 0.8814040422439575, "learning_rate": 9.987859192427279e-06, "loss": 0.8475, "step": 904 }, { "epoch": 0.04981011613187297, "grad_norm": 0.8796457052230835, "learning_rate": 9.987828984897843e-06, "loss": 0.8478, "step": 905 }, { "epoch": 0.049865154934228634, "grad_norm": 1.0541884899139404, "learning_rate": 9.98779873988122e-06, "loss": 0.9799, "step": 906 }, { "epoch": 0.049920193736584295, "grad_norm": 0.91409832239151, "learning_rate": 9.987768457377636e-06, "loss": 0.8701, "step": 907 }, { "epoch": 0.04997523253893995, "grad_norm": 1.0120370388031006, "learning_rate": 9.98773813738732e-06, "loss": 0.8417, "step": 908 }, { "epoch": 0.05003027134129561, "grad_norm": 1.7744206190109253, "learning_rate": 9.987707779910499e-06, "loss": 0.9263, "step": 909 } ], "logging_steps": 1, "max_steps": 36338, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 909, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.682514714121994e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }