{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.992771084337349, "eval_steps": 500, "global_step": 775, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00642570281124498, "grad_norm": 0.8317294716835022, "learning_rate": 8.333333333333334e-06, "loss": 0.1978, "step": 1 }, { "epoch": 0.01285140562248996, "grad_norm": 0.94012850522995, "learning_rate": 1.6666666666666667e-05, "loss": 0.2672, "step": 2 }, { "epoch": 0.01927710843373494, "grad_norm": 1.2400332689285278, "learning_rate": 2.5e-05, "loss": 0.2964, "step": 3 }, { "epoch": 0.02570281124497992, "grad_norm": 2.0498180389404297, "learning_rate": 3.3333333333333335e-05, "loss": 0.3573, "step": 4 }, { "epoch": 0.0321285140562249, "grad_norm": 1.034347653388977, "learning_rate": 4.166666666666667e-05, "loss": 0.2913, "step": 5 }, { "epoch": 0.03855421686746988, "grad_norm": 0.6081845164299011, "learning_rate": 5e-05, "loss": 0.2519, "step": 6 }, { "epoch": 0.04497991967871486, "grad_norm": 0.4193064272403717, "learning_rate": 5.833333333333334e-05, "loss": 0.2426, "step": 7 }, { "epoch": 0.05140562248995984, "grad_norm": 0.50704026222229, "learning_rate": 6.666666666666667e-05, "loss": 0.2293, "step": 8 }, { "epoch": 0.05783132530120482, "grad_norm": 0.3707605302333832, "learning_rate": 7.500000000000001e-05, "loss": 0.2189, "step": 9 }, { "epoch": 0.0642570281124498, "grad_norm": 0.34638485312461853, "learning_rate": 8.333333333333334e-05, "loss": 0.1798, "step": 10 }, { "epoch": 0.07068273092369477, "grad_norm": 0.4543774425983429, "learning_rate": 9.166666666666667e-05, "loss": 0.1842, "step": 11 }, { "epoch": 0.07710843373493977, "grad_norm": 0.3193999230861664, "learning_rate": 0.0001, "loss": 0.2133, "step": 12 }, { "epoch": 0.08353413654618475, "grad_norm": 0.3274695575237274, "learning_rate": 0.00010833333333333333, "loss": 0.2086, "step": 13 }, { "epoch": 0.08995983935742972, "grad_norm": 0.32100680470466614, "learning_rate": 0.00011666666666666668, "loss": 0.1988, "step": 14 }, { "epoch": 0.0963855421686747, "grad_norm": 0.3277706205844879, "learning_rate": 0.000125, "loss": 0.1865, "step": 15 }, { "epoch": 0.10281124497991968, "grad_norm": 0.2264498621225357, "learning_rate": 0.00013333333333333334, "loss": 0.1605, "step": 16 }, { "epoch": 0.10923694779116466, "grad_norm": 0.3071700930595398, "learning_rate": 0.00014166666666666668, "loss": 0.1535, "step": 17 }, { "epoch": 0.11566265060240964, "grad_norm": 0.3147311508655548, "learning_rate": 0.00015000000000000001, "loss": 0.1637, "step": 18 }, { "epoch": 0.12208835341365462, "grad_norm": 0.32233041524887085, "learning_rate": 0.00015833333333333332, "loss": 0.17, "step": 19 }, { "epoch": 0.1285140562248996, "grad_norm": 0.2847141921520233, "learning_rate": 0.0001666666666666667, "loss": 0.1432, "step": 20 }, { "epoch": 0.13493975903614458, "grad_norm": 0.45303934812545776, "learning_rate": 0.000175, "loss": 0.1595, "step": 21 }, { "epoch": 0.14136546184738955, "grad_norm": 0.32803109288215637, "learning_rate": 0.00018333333333333334, "loss": 0.1263, "step": 22 }, { "epoch": 0.14779116465863454, "grad_norm": 0.7632677555084229, "learning_rate": 0.00019166666666666667, "loss": 0.162, "step": 23 }, { "epoch": 0.15421686746987953, "grad_norm": 7.622311115264893, "learning_rate": 0.0002, "loss": 0.1743, "step": 24 }, { "epoch": 0.1606425702811245, "grad_norm": 0.41840752959251404, "learning_rate": 0.00019999912503789813, "loss": 0.1967, "step": 25 }, { "epoch": 0.1670682730923695, "grad_norm": 0.29047325253486633, "learning_rate": 0.00019999650016690364, "loss": 0.1196, "step": 26 }, { "epoch": 0.17349397590361446, "grad_norm": 0.2337496131658554, "learning_rate": 0.0001999921254329498, "loss": 0.1488, "step": 27 }, { "epoch": 0.17991967871485945, "grad_norm": 0.2089911699295044, "learning_rate": 0.00019998600091259113, "loss": 0.138, "step": 28 }, { "epoch": 0.18634538152610441, "grad_norm": 0.268136590719223, "learning_rate": 0.00019997812671300214, "loss": 0.1845, "step": 29 }, { "epoch": 0.1927710843373494, "grad_norm": 0.2347370982170105, "learning_rate": 0.0001999685029719753, "loss": 0.1257, "step": 30 }, { "epoch": 0.19919678714859437, "grad_norm": 0.21996308863162994, "learning_rate": 0.0001999571298579188, "loss": 0.171, "step": 31 }, { "epoch": 0.20562248995983937, "grad_norm": 0.1974944919347763, "learning_rate": 0.0001999440075698535, "loss": 0.1095, "step": 32 }, { "epoch": 0.21204819277108433, "grad_norm": 0.15095502138137817, "learning_rate": 0.00019992913633740957, "loss": 0.1663, "step": 33 }, { "epoch": 0.21847389558232932, "grad_norm": 0.2080863118171692, "learning_rate": 0.0001999125164208222, "loss": 0.141, "step": 34 }, { "epoch": 0.2248995983935743, "grad_norm": 0.2128421813249588, "learning_rate": 0.0001998941481109274, "loss": 0.2076, "step": 35 }, { "epoch": 0.23132530120481928, "grad_norm": 0.2011524736881256, "learning_rate": 0.00019987403172915666, "loss": 0.1419, "step": 36 }, { "epoch": 0.23775100401606425, "grad_norm": 0.16084055602550507, "learning_rate": 0.00019985216762753139, "loss": 0.1357, "step": 37 }, { "epoch": 0.24417670682730924, "grad_norm": 0.1661912202835083, "learning_rate": 0.0001998285561886568, "loss": 0.1471, "step": 38 }, { "epoch": 0.25060240963855424, "grad_norm": 0.2506616413593292, "learning_rate": 0.00019980319782571523, "loss": 0.1555, "step": 39 }, { "epoch": 0.2570281124497992, "grad_norm": 0.17315006256103516, "learning_rate": 0.00019977609298245873, "loss": 0.1468, "step": 40 }, { "epoch": 0.26345381526104417, "grad_norm": 0.1772138923406601, "learning_rate": 0.00019974724213320157, "loss": 0.1447, "step": 41 }, { "epoch": 0.26987951807228916, "grad_norm": 0.2274760603904724, "learning_rate": 0.00019971664578281173, "loss": 0.1707, "step": 42 }, { "epoch": 0.27630522088353415, "grad_norm": 0.16768603026866913, "learning_rate": 0.00019968430446670212, "loss": 0.147, "step": 43 }, { "epoch": 0.2827309236947791, "grad_norm": 0.1941104382276535, "learning_rate": 0.0001996502187508213, "loss": 0.1415, "step": 44 }, { "epoch": 0.2891566265060241, "grad_norm": 0.17718106508255005, "learning_rate": 0.00019961438923164345, "loss": 0.1297, "step": 45 }, { "epoch": 0.2955823293172691, "grad_norm": 0.17948807775974274, "learning_rate": 0.00019957681653615797, "loss": 0.1349, "step": 46 }, { "epoch": 0.30200803212851407, "grad_norm": 0.19423460960388184, "learning_rate": 0.0001995375013218586, "loss": 0.1277, "step": 47 }, { "epoch": 0.30843373493975906, "grad_norm": 0.2893676161766052, "learning_rate": 0.00019949644427673177, "loss": 0.1485, "step": 48 }, { "epoch": 0.314859437751004, "grad_norm": 0.4430103600025177, "learning_rate": 0.00019945364611924463, "loss": 0.1306, "step": 49 }, { "epoch": 0.321285140562249, "grad_norm": 0.28305917978286743, "learning_rate": 0.0001994091075983325, "loss": 0.1646, "step": 50 }, { "epoch": 0.327710843373494, "grad_norm": 0.10388786345720291, "learning_rate": 0.00019936282949338578, "loss": 0.097, "step": 51 }, { "epoch": 0.334136546184739, "grad_norm": 0.13306961953639984, "learning_rate": 0.00019931481261423618, "loss": 0.1222, "step": 52 }, { "epoch": 0.3405622489959839, "grad_norm": 0.1919727623462677, "learning_rate": 0.00019926505780114276, "loss": 0.1566, "step": 53 }, { "epoch": 0.3469879518072289, "grad_norm": 0.21380549669265747, "learning_rate": 0.0001992135659247769, "loss": 0.1404, "step": 54 }, { "epoch": 0.3534136546184739, "grad_norm": 0.1510586440563202, "learning_rate": 0.00019916033788620755, "loss": 0.1453, "step": 55 }, { "epoch": 0.3598393574297189, "grad_norm": 0.11387544125318527, "learning_rate": 0.000199105374616885, "loss": 0.1261, "step": 56 }, { "epoch": 0.36626506024096384, "grad_norm": 0.15470993518829346, "learning_rate": 0.00019904867707862476, "loss": 0.163, "step": 57 }, { "epoch": 0.37269076305220883, "grad_norm": 0.18749335408210754, "learning_rate": 0.0001989902462635908, "loss": 0.1452, "step": 58 }, { "epoch": 0.3791164658634538, "grad_norm": 0.13796862959861755, "learning_rate": 0.00019893008319427812, "loss": 0.1257, "step": 59 }, { "epoch": 0.3855421686746988, "grad_norm": 0.18501056730747223, "learning_rate": 0.00019886818892349482, "loss": 0.1143, "step": 60 }, { "epoch": 0.39196787148594375, "grad_norm": 0.18443071842193604, "learning_rate": 0.00019880456453434369, "loss": 0.1395, "step": 61 }, { "epoch": 0.39839357429718875, "grad_norm": 0.1594623327255249, "learning_rate": 0.00019873921114020333, "loss": 0.1505, "step": 62 }, { "epoch": 0.40481927710843374, "grad_norm": 0.1715545505285263, "learning_rate": 0.00019867212988470864, "loss": 0.115, "step": 63 }, { "epoch": 0.41124497991967873, "grad_norm": 0.24589896202087402, "learning_rate": 0.0001986033219417307, "loss": 0.1549, "step": 64 }, { "epoch": 0.41767068273092367, "grad_norm": 0.1842864602804184, "learning_rate": 0.00019853278851535638, "loss": 0.1511, "step": 65 }, { "epoch": 0.42409638554216866, "grad_norm": 0.20570969581604004, "learning_rate": 0.00019846053083986717, "loss": 0.168, "step": 66 }, { "epoch": 0.43052208835341366, "grad_norm": 0.1519116312265396, "learning_rate": 0.00019838655017971767, "loss": 0.1142, "step": 67 }, { "epoch": 0.43694779116465865, "grad_norm": 0.2530803978443146, "learning_rate": 0.00019831084782951326, "loss": 0.1359, "step": 68 }, { "epoch": 0.4433734939759036, "grad_norm": 0.2451787292957306, "learning_rate": 0.00019823342511398776, "loss": 0.1257, "step": 69 }, { "epoch": 0.4497991967871486, "grad_norm": 0.43833062052726746, "learning_rate": 0.00019815428338798002, "loss": 0.1275, "step": 70 }, { "epoch": 0.4562248995983936, "grad_norm": 0.1903715431690216, "learning_rate": 0.0001980734240364102, "loss": 0.1357, "step": 71 }, { "epoch": 0.46265060240963857, "grad_norm": 0.28559988737106323, "learning_rate": 0.00019799084847425572, "loss": 0.1312, "step": 72 }, { "epoch": 0.46907630522088356, "grad_norm": 0.2496558576822281, "learning_rate": 0.0001979065581465263, "loss": 0.1633, "step": 73 }, { "epoch": 0.4755020080321285, "grad_norm": 0.2327835112810135, "learning_rate": 0.00019782055452823878, "loss": 0.1442, "step": 74 }, { "epoch": 0.4819277108433735, "grad_norm": 0.3205198645591736, "learning_rate": 0.00019773283912439133, "loss": 0.1511, "step": 75 }, { "epoch": 0.4883534136546185, "grad_norm": 0.1274174302816391, "learning_rate": 0.00019764341346993698, "loss": 0.0996, "step": 76 }, { "epoch": 0.4947791164658635, "grad_norm": 0.2272380292415619, "learning_rate": 0.00019755227912975697, "loss": 0.1176, "step": 77 }, { "epoch": 0.5012048192771085, "grad_norm": 0.13094107806682587, "learning_rate": 0.0001974594376986331, "loss": 0.1184, "step": 78 }, { "epoch": 0.5076305220883535, "grad_norm": 0.13428834080696106, "learning_rate": 0.00019736489080122006, "loss": 0.1309, "step": 79 }, { "epoch": 0.5140562248995983, "grad_norm": 0.1373153030872345, "learning_rate": 0.00019726864009201694, "loss": 0.1376, "step": 80 }, { "epoch": 0.5204819277108433, "grad_norm": 0.1309944987297058, "learning_rate": 0.00019717068725533818, "loss": 0.1403, "step": 81 }, { "epoch": 0.5269076305220883, "grad_norm": 0.14153365790843964, "learning_rate": 0.00019707103400528415, "loss": 0.1399, "step": 82 }, { "epoch": 0.5333333333333333, "grad_norm": 0.1331794112920761, "learning_rate": 0.0001969696820857112, "loss": 0.1455, "step": 83 }, { "epoch": 0.5397590361445783, "grad_norm": 0.12772125005722046, "learning_rate": 0.0001968666332702011, "loss": 0.1409, "step": 84 }, { "epoch": 0.5461847389558233, "grad_norm": 0.13976705074310303, "learning_rate": 0.00019676188936203006, "loss": 0.1144, "step": 85 }, { "epoch": 0.5526104417670683, "grad_norm": 0.1431870311498642, "learning_rate": 0.00019665545219413701, "loss": 0.148, "step": 86 }, { "epoch": 0.5590361445783133, "grad_norm": 0.1257038563489914, "learning_rate": 0.00019654732362909177, "loss": 0.1197, "step": 87 }, { "epoch": 0.5654618473895582, "grad_norm": 0.16741393506526947, "learning_rate": 0.00019643750555906224, "loss": 0.1563, "step": 88 }, { "epoch": 0.5718875502008032, "grad_norm": 0.16155458986759186, "learning_rate": 0.00019632599990578143, "loss": 0.1333, "step": 89 }, { "epoch": 0.5783132530120482, "grad_norm": 0.17337974905967712, "learning_rate": 0.00019621280862051373, "loss": 0.1669, "step": 90 }, { "epoch": 0.5847389558232932, "grad_norm": 0.17315532267093658, "learning_rate": 0.00019609793368402086, "loss": 0.1488, "step": 91 }, { "epoch": 0.5911646586345382, "grad_norm": 0.15965646505355835, "learning_rate": 0.0001959813771065271, "loss": 0.1207, "step": 92 }, { "epoch": 0.5975903614457831, "grad_norm": 0.15546628832817078, "learning_rate": 0.00019586314092768424, "loss": 0.1147, "step": 93 }, { "epoch": 0.6040160642570281, "grad_norm": 0.16691721975803375, "learning_rate": 0.00019574322721653583, "loss": 0.1172, "step": 94 }, { "epoch": 0.6104417670682731, "grad_norm": 0.1729833036661148, "learning_rate": 0.00019562163807148084, "loss": 0.12, "step": 95 }, { "epoch": 0.6168674698795181, "grad_norm": 0.17864178121089935, "learning_rate": 0.0001954983756202372, "loss": 0.1266, "step": 96 }, { "epoch": 0.623293172690763, "grad_norm": 0.20421630144119263, "learning_rate": 0.0001953734420198044, "loss": 0.1531, "step": 97 }, { "epoch": 0.629718875502008, "grad_norm": 0.19765256345272064, "learning_rate": 0.0001952468394564257, "loss": 0.1134, "step": 98 }, { "epoch": 0.636144578313253, "grad_norm": 0.197422593832016, "learning_rate": 0.00019511857014555, "loss": 0.1292, "step": 99 }, { "epoch": 0.642570281124498, "grad_norm": 0.2465924769639969, "learning_rate": 0.00019498863633179308, "loss": 0.1426, "step": 100 }, { "epoch": 0.648995983935743, "grad_norm": 0.136220321059227, "learning_rate": 0.00019485704028889813, "loss": 0.0881, "step": 101 }, { "epoch": 0.655421686746988, "grad_norm": 0.14770826697349548, "learning_rate": 0.0001947237843196962, "loss": 0.125, "step": 102 }, { "epoch": 0.661847389558233, "grad_norm": 0.13019175827503204, "learning_rate": 0.0001945888707560657, "loss": 0.1271, "step": 103 }, { "epoch": 0.668273092369478, "grad_norm": 0.10536068677902222, "learning_rate": 0.0001944523019588918, "loss": 0.107, "step": 104 }, { "epoch": 0.6746987951807228, "grad_norm": 0.10180668532848358, "learning_rate": 0.00019431408031802486, "loss": 0.1145, "step": 105 }, { "epoch": 0.6811244979919678, "grad_norm": 0.14559617638587952, "learning_rate": 0.00019417420825223891, "loss": 0.1395, "step": 106 }, { "epoch": 0.6875502008032128, "grad_norm": 0.13509546220302582, "learning_rate": 0.000194032688209189, "loss": 0.1478, "step": 107 }, { "epoch": 0.6939759036144578, "grad_norm": 0.13227735459804535, "learning_rate": 0.00019388952266536868, "loss": 0.1445, "step": 108 }, { "epoch": 0.7004016064257028, "grad_norm": 0.16162370145320892, "learning_rate": 0.00019374471412606642, "loss": 0.1246, "step": 109 }, { "epoch": 0.7068273092369478, "grad_norm": 0.1407587081193924, "learning_rate": 0.00019359826512532194, "loss": 0.1421, "step": 110 }, { "epoch": 0.7132530120481928, "grad_norm": 0.15528196096420288, "learning_rate": 0.00019345017822588168, "loss": 0.1629, "step": 111 }, { "epoch": 0.7196787148594378, "grad_norm": 0.1608172059059143, "learning_rate": 0.0001933004560191542, "loss": 0.1538, "step": 112 }, { "epoch": 0.7261044176706827, "grad_norm": 0.15321175754070282, "learning_rate": 0.00019314910112516463, "loss": 0.1251, "step": 113 }, { "epoch": 0.7325301204819277, "grad_norm": 0.17383867502212524, "learning_rate": 0.00019299611619250881, "loss": 0.1531, "step": 114 }, { "epoch": 0.7389558232931727, "grad_norm": 0.18434979021549225, "learning_rate": 0.00019284150389830721, "loss": 0.1847, "step": 115 }, { "epoch": 0.7453815261044177, "grad_norm": 0.16240356862545013, "learning_rate": 0.00019268526694815773, "loss": 0.1712, "step": 116 }, { "epoch": 0.7518072289156627, "grad_norm": 0.17521648108959198, "learning_rate": 0.0001925274080760886, "loss": 0.1222, "step": 117 }, { "epoch": 0.7582329317269076, "grad_norm": 0.16100138425827026, "learning_rate": 0.00019236793004451044, "loss": 0.1238, "step": 118 }, { "epoch": 0.7646586345381526, "grad_norm": 0.16682398319244385, "learning_rate": 0.00019220683564416787, "loss": 0.0914, "step": 119 }, { "epoch": 0.7710843373493976, "grad_norm": 0.15211397409439087, "learning_rate": 0.00019204412769409086, "loss": 0.1051, "step": 120 }, { "epoch": 0.7775100401606426, "grad_norm": 0.19107018411159515, "learning_rate": 0.00019187980904154515, "loss": 0.1532, "step": 121 }, { "epoch": 0.7839357429718875, "grad_norm": 0.18667763471603394, "learning_rate": 0.00019171388256198268, "loss": 0.1435, "step": 122 }, { "epoch": 0.7903614457831325, "grad_norm": 0.1942739635705948, "learning_rate": 0.000191546351158991, "loss": 0.1137, "step": 123 }, { "epoch": 0.7967871485943775, "grad_norm": 0.23028349876403809, "learning_rate": 0.00019137721776424274, "loss": 0.1293, "step": 124 }, { "epoch": 0.8032128514056225, "grad_norm": 0.25495702028274536, "learning_rate": 0.0001912064853374441, "loss": 0.1441, "step": 125 }, { "epoch": 0.8096385542168675, "grad_norm": 0.10432910919189453, "learning_rate": 0.0001910341568662831, "loss": 0.0831, "step": 126 }, { "epoch": 0.8160642570281125, "grad_norm": 0.11154992133378983, "learning_rate": 0.00019086023536637737, "loss": 0.1183, "step": 127 }, { "epoch": 0.8224899598393575, "grad_norm": 0.13584573566913605, "learning_rate": 0.0001906847238812214, "loss": 0.1441, "step": 128 }, { "epoch": 0.8289156626506025, "grad_norm": 0.35521605610847473, "learning_rate": 0.0001905076254821331, "loss": 0.1368, "step": 129 }, { "epoch": 0.8353413654618473, "grad_norm": 0.14569194614887238, "learning_rate": 0.00019032894326820023, "loss": 0.1285, "step": 130 }, { "epoch": 0.8417670682730923, "grad_norm": 0.16236892342567444, "learning_rate": 0.0001901486803662261, "loss": 0.1578, "step": 131 }, { "epoch": 0.8481927710843373, "grad_norm": 0.11677072197198868, "learning_rate": 0.00018996683993067483, "loss": 0.1183, "step": 132 }, { "epoch": 0.8546184738955823, "grad_norm": 0.1277882605791092, "learning_rate": 0.00018978342514361626, "loss": 0.1196, "step": 133 }, { "epoch": 0.8610441767068273, "grad_norm": 0.12904071807861328, "learning_rate": 0.00018959843921467014, "loss": 0.1281, "step": 134 }, { "epoch": 0.8674698795180723, "grad_norm": 0.13108272850513458, "learning_rate": 0.00018941188538094999, "loss": 0.1187, "step": 135 }, { "epoch": 0.8738955823293173, "grad_norm": 0.15289278328418732, "learning_rate": 0.0001892237669070065, "loss": 0.1524, "step": 136 }, { "epoch": 0.8803212851405623, "grad_norm": 0.13197748363018036, "learning_rate": 0.0001890340870847704, "loss": 0.1104, "step": 137 }, { "epoch": 0.8867469879518072, "grad_norm": 0.15141014754772186, "learning_rate": 0.00018884284923349477, "loss": 0.154, "step": 138 }, { "epoch": 0.8931726907630522, "grad_norm": 0.1232500970363617, "learning_rate": 0.00018865005669969708, "loss": 0.102, "step": 139 }, { "epoch": 0.8995983935742972, "grad_norm": 0.16395455598831177, "learning_rate": 0.00018845571285710058, "loss": 0.145, "step": 140 }, { "epoch": 0.9060240963855422, "grad_norm": 0.15895424783229828, "learning_rate": 0.00018825982110657515, "loss": 0.1268, "step": 141 }, { "epoch": 0.9124497991967871, "grad_norm": 0.15639305114746094, "learning_rate": 0.00018806238487607794, "loss": 0.126, "step": 142 }, { "epoch": 0.9188755020080321, "grad_norm": 0.14362278580665588, "learning_rate": 0.0001878634076205934, "loss": 0.0981, "step": 143 }, { "epoch": 0.9253012048192771, "grad_norm": 0.1624501496553421, "learning_rate": 0.00018766289282207263, "loss": 0.1208, "step": 144 }, { "epoch": 0.9317269076305221, "grad_norm": 0.1643369346857071, "learning_rate": 0.00018746084398937266, "loss": 0.1088, "step": 145 }, { "epoch": 0.9381526104417671, "grad_norm": 0.19890688359737396, "learning_rate": 0.00018725726465819488, "loss": 0.1476, "step": 146 }, { "epoch": 0.944578313253012, "grad_norm": 0.16708028316497803, "learning_rate": 0.00018705215839102328, "loss": 0.1175, "step": 147 }, { "epoch": 0.951004016064257, "grad_norm": 0.20685526728630066, "learning_rate": 0.0001868455287770621, "loss": 0.1573, "step": 148 }, { "epoch": 0.957429718875502, "grad_norm": 0.19720108807086945, "learning_rate": 0.00018663737943217296, "loss": 0.137, "step": 149 }, { "epoch": 0.963855421686747, "grad_norm": 0.2381121814250946, "learning_rate": 0.00018642771399881162, "loss": 0.156, "step": 150 }, { "epoch": 0.970281124497992, "grad_norm": 0.13865579664707184, "learning_rate": 0.00018621653614596425, "loss": 0.1229, "step": 151 }, { "epoch": 0.976706827309237, "grad_norm": 0.10851379483938217, "learning_rate": 0.00018600384956908323, "loss": 0.1088, "step": 152 }, { "epoch": 0.983132530120482, "grad_norm": 0.1621655523777008, "learning_rate": 0.00018578965799002236, "loss": 0.1479, "step": 153 }, { "epoch": 0.989558232931727, "grad_norm": 0.18607285618782043, "learning_rate": 0.00018557396515697202, "loss": 0.1489, "step": 154 }, { "epoch": 0.9959839357429718, "grad_norm": 0.19177676737308502, "learning_rate": 0.0001853567748443933, "loss": 0.1163, "step": 155 }, { "epoch": 1.0056224899598394, "grad_norm": 0.6790018677711487, "learning_rate": 0.000185138090852952, "loss": 0.2256, "step": 156 }, { "epoch": 1.0120481927710843, "grad_norm": 0.10716850310564041, "learning_rate": 0.0001849179170094522, "loss": 0.094, "step": 157 }, { "epoch": 1.0184738955823294, "grad_norm": 0.11798243969678879, "learning_rate": 0.00018469625716676933, "loss": 0.1108, "step": 158 }, { "epoch": 1.0248995983935743, "grad_norm": 0.13069161772727966, "learning_rate": 0.00018447311520378262, "loss": 0.1041, "step": 159 }, { "epoch": 1.0313253012048194, "grad_norm": 0.19986286759376526, "learning_rate": 0.0001842484950253073, "loss": 0.125, "step": 160 }, { "epoch": 1.0377510040160642, "grad_norm": 0.16085122525691986, "learning_rate": 0.00018402240056202614, "loss": 0.1025, "step": 161 }, { "epoch": 1.0441767068273093, "grad_norm": 0.20288337767124176, "learning_rate": 0.00018379483577042103, "loss": 0.1328, "step": 162 }, { "epoch": 1.0506024096385542, "grad_norm": 0.15977489948272705, "learning_rate": 0.00018356580463270322, "loss": 0.0985, "step": 163 }, { "epoch": 1.057028112449799, "grad_norm": 0.2264052927494049, "learning_rate": 0.00018333531115674408, "loss": 0.0931, "step": 164 }, { "epoch": 1.0634538152610442, "grad_norm": 0.18668119609355927, "learning_rate": 0.0001831033593760047, "loss": 0.0777, "step": 165 }, { "epoch": 1.069879518072289, "grad_norm": 0.2187710851430893, "learning_rate": 0.00018286995334946545, "loss": 0.1076, "step": 166 }, { "epoch": 1.0763052208835342, "grad_norm": 0.19872407615184784, "learning_rate": 0.0001826350971615549, "loss": 0.1008, "step": 167 }, { "epoch": 1.082730923694779, "grad_norm": 0.23164619505405426, "learning_rate": 0.00018239879492207831, "loss": 0.1104, "step": 168 }, { "epoch": 1.0891566265060242, "grad_norm": 0.20669420063495636, "learning_rate": 0.00018216105076614576, "loss": 0.1042, "step": 169 }, { "epoch": 1.095582329317269, "grad_norm": 0.23208123445510864, "learning_rate": 0.00018192186885409973, "loss": 0.1156, "step": 170 }, { "epoch": 1.1020080321285142, "grad_norm": 0.25448471307754517, "learning_rate": 0.0001816812533714425, "loss": 0.1322, "step": 171 }, { "epoch": 1.108433734939759, "grad_norm": 0.18578830361366272, "learning_rate": 0.00018143920852876257, "loss": 0.078, "step": 172 }, { "epoch": 1.114859437751004, "grad_norm": 0.21140921115875244, "learning_rate": 0.0001811957385616612, "loss": 0.1078, "step": 173 }, { "epoch": 1.121285140562249, "grad_norm": 0.24159879982471466, "learning_rate": 0.0001809508477306783, "loss": 0.098, "step": 174 }, { "epoch": 1.127710843373494, "grad_norm": 0.2108435034751892, "learning_rate": 0.00018070454032121787, "loss": 0.085, "step": 175 }, { "epoch": 1.134136546184739, "grad_norm": 0.25270572304725647, "learning_rate": 0.00018045682064347275, "loss": 0.0984, "step": 176 }, { "epoch": 1.140562248995984, "grad_norm": 0.2605237066745758, "learning_rate": 0.00018020769303234962, "loss": 0.1125, "step": 177 }, { "epoch": 1.146987951807229, "grad_norm": 0.2768741846084595, "learning_rate": 0.00017995716184739284, "loss": 0.0868, "step": 178 }, { "epoch": 1.153413654618474, "grad_norm": 0.3283689320087433, "learning_rate": 0.00017970523147270822, "loss": 0.0932, "step": 179 }, { "epoch": 1.159839357429719, "grad_norm": 0.2760496735572815, "learning_rate": 0.0001794519063168864, "loss": 0.0702, "step": 180 }, { "epoch": 1.1662650602409639, "grad_norm": 0.1508658230304718, "learning_rate": 0.0001791971908129256, "loss": 0.1086, "step": 181 }, { "epoch": 1.1726907630522088, "grad_norm": 0.1741812825202942, "learning_rate": 0.000178941089418154, "loss": 0.1102, "step": 182 }, { "epoch": 1.1791164658634539, "grad_norm": 0.18406537175178528, "learning_rate": 0.000178683606614152, "loss": 0.1185, "step": 183 }, { "epoch": 1.1855421686746987, "grad_norm": 0.18714162707328796, "learning_rate": 0.00017842474690667344, "loss": 0.1078, "step": 184 }, { "epoch": 1.1919678714859439, "grad_norm": 0.15225981175899506, "learning_rate": 0.00017816451482556702, "loss": 0.0808, "step": 185 }, { "epoch": 1.1983935742971887, "grad_norm": 0.1622186154127121, "learning_rate": 0.0001779029149246969, "loss": 0.097, "step": 186 }, { "epoch": 1.2048192771084336, "grad_norm": 0.17352862656116486, "learning_rate": 0.00017763995178186307, "loss": 0.1094, "step": 187 }, { "epoch": 1.2112449799196787, "grad_norm": 0.14557699859142303, "learning_rate": 0.00017737562999872118, "loss": 0.1031, "step": 188 }, { "epoch": 1.2176706827309236, "grad_norm": 0.1729564219713211, "learning_rate": 0.00017710995420070215, "loss": 0.1109, "step": 189 }, { "epoch": 1.2240963855421687, "grad_norm": 0.17331069707870483, "learning_rate": 0.00017684292903693102, "loss": 0.1163, "step": 190 }, { "epoch": 1.2305220883534136, "grad_norm": 0.1967068910598755, "learning_rate": 0.0001765745591801458, "loss": 0.1137, "step": 191 }, { "epoch": 1.2369477911646587, "grad_norm": 0.20813412964344025, "learning_rate": 0.00017630484932661559, "loss": 0.0865, "step": 192 }, { "epoch": 1.2433734939759036, "grad_norm": 0.17115503549575806, "learning_rate": 0.0001760338041960583, "loss": 0.0954, "step": 193 }, { "epoch": 1.2497991967871487, "grad_norm": 0.2135663777589798, "learning_rate": 0.00017576142853155838, "loss": 0.099, "step": 194 }, { "epoch": 1.2562248995983936, "grad_norm": 0.2796885669231415, "learning_rate": 0.00017548772709948343, "loss": 0.1166, "step": 195 }, { "epoch": 1.2626506024096384, "grad_norm": 0.2290525585412979, "learning_rate": 0.0001752127046894011, "loss": 0.1018, "step": 196 }, { "epoch": 1.2690763052208835, "grad_norm": 0.23698222637176514, "learning_rate": 0.0001749363661139951, "loss": 0.0871, "step": 197 }, { "epoch": 1.2755020080321284, "grad_norm": 0.2161116749048233, "learning_rate": 0.00017465871620898102, "loss": 0.0819, "step": 198 }, { "epoch": 1.2819277108433735, "grad_norm": 0.2709653377532959, "learning_rate": 0.00017437975983302178, "loss": 0.082, "step": 199 }, { "epoch": 1.2883534136546184, "grad_norm": 0.2437043935060501, "learning_rate": 0.0001740995018676425, "loss": 0.07, "step": 200 }, { "epoch": 1.2947791164658635, "grad_norm": 0.24623267352581024, "learning_rate": 0.0001738179472171452, "loss": 0.0868, "step": 201 }, { "epoch": 1.3012048192771084, "grad_norm": 0.27310508489608765, "learning_rate": 0.00017353510080852282, "loss": 0.0857, "step": 202 }, { "epoch": 1.3076305220883535, "grad_norm": 0.2532103657722473, "learning_rate": 0.0001732509675913731, "loss": 0.0885, "step": 203 }, { "epoch": 1.3140562248995984, "grad_norm": 0.2618705928325653, "learning_rate": 0.000172965552537812, "loss": 0.0903, "step": 204 }, { "epoch": 1.3204819277108433, "grad_norm": 0.3039279282093048, "learning_rate": 0.00017267886064238662, "loss": 0.0963, "step": 205 }, { "epoch": 1.3269076305220884, "grad_norm": 0.16821685433387756, "learning_rate": 0.00017239089692198785, "loss": 0.0837, "step": 206 }, { "epoch": 1.3333333333333333, "grad_norm": 0.275329053401947, "learning_rate": 0.0001721016664157625, "loss": 0.1015, "step": 207 }, { "epoch": 1.3397590361445784, "grad_norm": 0.1983174830675125, "learning_rate": 0.00017181117418502525, "loss": 0.1156, "step": 208 }, { "epoch": 1.3461847389558232, "grad_norm": 0.19256579875946045, "learning_rate": 0.00017151942531316988, "loss": 0.1055, "step": 209 }, { "epoch": 1.3526104417670683, "grad_norm": 0.19830577075481415, "learning_rate": 0.00017122642490558055, "loss": 0.1142, "step": 210 }, { "epoch": 1.3590361445783132, "grad_norm": 0.17073017358779907, "learning_rate": 0.00017093217808954232, "loss": 0.1305, "step": 211 }, { "epoch": 1.3654618473895583, "grad_norm": 0.18915396928787231, "learning_rate": 0.00017063669001415145, "loss": 0.1147, "step": 212 }, { "epoch": 1.3718875502008032, "grad_norm": 0.13788312673568726, "learning_rate": 0.00017033996585022528, "loss": 0.1056, "step": 213 }, { "epoch": 1.378313253012048, "grad_norm": 0.203065887093544, "learning_rate": 0.00017004201079021176, "loss": 0.1355, "step": 214 }, { "epoch": 1.3847389558232932, "grad_norm": 0.2112981230020523, "learning_rate": 0.00016974283004809858, "loss": 0.1215, "step": 215 }, { "epoch": 1.391164658634538, "grad_norm": 0.19515666365623474, "learning_rate": 0.00016944242885932206, "loss": 0.135, "step": 216 }, { "epoch": 1.3975903614457832, "grad_norm": 0.19761696457862854, "learning_rate": 0.0001691408124806752, "loss": 0.125, "step": 217 }, { "epoch": 1.404016064257028, "grad_norm": 0.18920212984085083, "learning_rate": 0.00016883798619021608, "loss": 0.0967, "step": 218 }, { "epoch": 1.410441767068273, "grad_norm": 0.1732681393623352, "learning_rate": 0.0001685339552871752, "loss": 0.0984, "step": 219 }, { "epoch": 1.416867469879518, "grad_norm": 0.20118467509746552, "learning_rate": 0.00016822872509186297, "loss": 0.0871, "step": 220 }, { "epoch": 1.4232931726907632, "grad_norm": 0.24401867389678955, "learning_rate": 0.0001679223009455764, "loss": 0.0971, "step": 221 }, { "epoch": 1.429718875502008, "grad_norm": 0.22608117759227753, "learning_rate": 0.00016761468821050585, "loss": 0.0996, "step": 222 }, { "epoch": 1.436144578313253, "grad_norm": 0.19186720252037048, "learning_rate": 0.00016730589226964098, "loss": 0.0757, "step": 223 }, { "epoch": 1.442570281124498, "grad_norm": 0.24773664772510529, "learning_rate": 0.00016699591852667673, "loss": 0.0819, "step": 224 }, { "epoch": 1.448995983935743, "grad_norm": 0.2296506017446518, "learning_rate": 0.00016668477240591864, "loss": 0.0967, "step": 225 }, { "epoch": 1.455421686746988, "grad_norm": 0.23210635781288147, "learning_rate": 0.00016637245935218799, "loss": 0.089, "step": 226 }, { "epoch": 1.461847389558233, "grad_norm": 0.22376962006092072, "learning_rate": 0.00016605898483072648, "loss": 0.0839, "step": 227 }, { "epoch": 1.4682730923694778, "grad_norm": 0.23127049207687378, "learning_rate": 0.00016574435432710068, "loss": 0.0827, "step": 228 }, { "epoch": 1.4746987951807229, "grad_norm": 0.31701013445854187, "learning_rate": 0.0001654285733471059, "loss": 0.1, "step": 229 }, { "epoch": 1.481124497991968, "grad_norm": 0.3070242702960968, "learning_rate": 0.0001651116474166699, "loss": 0.1036, "step": 230 }, { "epoch": 1.4875502008032129, "grad_norm": 0.18072882294654846, "learning_rate": 0.00016479358208175627, "loss": 0.1061, "step": 231 }, { "epoch": 1.4939759036144578, "grad_norm": 0.14309802651405334, "learning_rate": 0.00016447438290826733, "loss": 0.092, "step": 232 }, { "epoch": 1.5004016064257029, "grad_norm": 0.18977715075016022, "learning_rate": 0.00016415405548194663, "loss": 0.1152, "step": 233 }, { "epoch": 1.5068273092369477, "grad_norm": 0.22865934669971466, "learning_rate": 0.00016383260540828135, "loss": 0.116, "step": 234 }, { "epoch": 1.5132530120481928, "grad_norm": 0.2020760327577591, "learning_rate": 0.00016351003831240415, "loss": 0.112, "step": 235 }, { "epoch": 1.5196787148594377, "grad_norm": 0.25912925601005554, "learning_rate": 0.00016318635983899465, "loss": 0.1282, "step": 236 }, { "epoch": 1.5261044176706826, "grad_norm": 0.1735217273235321, "learning_rate": 0.0001628615756521809, "loss": 0.1034, "step": 237 }, { "epoch": 1.5325301204819277, "grad_norm": 0.19721132516860962, "learning_rate": 0.0001625356914354399, "loss": 0.1338, "step": 238 }, { "epoch": 1.5389558232931728, "grad_norm": 0.19484449923038483, "learning_rate": 0.0001622087128914985, "loss": 0.1214, "step": 239 }, { "epoch": 1.5453815261044177, "grad_norm": 0.16421280801296234, "learning_rate": 0.00016188064574223335, "loss": 0.0866, "step": 240 }, { "epoch": 1.5518072289156626, "grad_norm": 0.1922108381986618, "learning_rate": 0.0001615514957285709, "loss": 0.1298, "step": 241 }, { "epoch": 1.5582329317269075, "grad_norm": 0.16495804488658905, "learning_rate": 0.00016122126861038688, "loss": 0.1056, "step": 242 }, { "epoch": 1.5646586345381526, "grad_norm": 0.2061115801334381, "learning_rate": 0.00016088997016640562, "loss": 0.1008, "step": 243 }, { "epoch": 1.5710843373493977, "grad_norm": 0.21605950593948364, "learning_rate": 0.00016055760619409877, "loss": 0.099, "step": 244 }, { "epoch": 1.5775100401606426, "grad_norm": 0.21308393776416779, "learning_rate": 0.00016022418250958385, "loss": 0.1041, "step": 245 }, { "epoch": 1.5839357429718874, "grad_norm": 0.30087393522262573, "learning_rate": 0.00015988970494752272, "loss": 0.1192, "step": 246 }, { "epoch": 1.5903614457831325, "grad_norm": 0.22396108508110046, "learning_rate": 0.00015955417936101913, "loss": 0.0985, "step": 247 }, { "epoch": 1.5967871485943776, "grad_norm": 0.27335667610168457, "learning_rate": 0.00015921761162151653, "loss": 0.0809, "step": 248 }, { "epoch": 1.6032128514056225, "grad_norm": 0.19635237753391266, "learning_rate": 0.00015888000761869528, "loss": 0.074, "step": 249 }, { "epoch": 1.6096385542168674, "grad_norm": 0.24129214882850647, "learning_rate": 0.0001585413732603695, "loss": 0.0948, "step": 250 }, { "epoch": 1.6160642570281123, "grad_norm": 0.26676061749458313, "learning_rate": 0.00015820171447238383, "loss": 0.1169, "step": 251 }, { "epoch": 1.6224899598393574, "grad_norm": 0.23809854686260223, "learning_rate": 0.0001578610371985096, "loss": 0.0916, "step": 252 }, { "epoch": 1.6289156626506025, "grad_norm": 0.21806567907333374, "learning_rate": 0.00015751934740034092, "loss": 0.0897, "step": 253 }, { "epoch": 1.6353413654618474, "grad_norm": 0.2490801066160202, "learning_rate": 0.00015717665105719015, "loss": 0.1021, "step": 254 }, { "epoch": 1.6417670682730923, "grad_norm": 0.32763025164604187, "learning_rate": 0.00015683295416598367, "loss": 0.0981, "step": 255 }, { "epoch": 1.6481927710843374, "grad_norm": 0.15386274456977844, "learning_rate": 0.00015648826274115653, "loss": 0.0735, "step": 256 }, { "epoch": 1.6546184738955825, "grad_norm": 0.16144217550754547, "learning_rate": 0.00015614258281454734, "loss": 0.1047, "step": 257 }, { "epoch": 1.6610441767068274, "grad_norm": 0.16441850364208221, "learning_rate": 0.00015579592043529292, "loss": 0.1014, "step": 258 }, { "epoch": 1.6674698795180722, "grad_norm": 0.2219092845916748, "learning_rate": 0.00015544828166972203, "loss": 0.1492, "step": 259 }, { "epoch": 1.6738955823293171, "grad_norm": 0.2408316433429718, "learning_rate": 0.00015509967260124964, "loss": 0.1373, "step": 260 }, { "epoch": 1.6803212851405622, "grad_norm": 0.16629627346992493, "learning_rate": 0.0001547500993302702, "loss": 0.1024, "step": 261 }, { "epoch": 1.6867469879518073, "grad_norm": 0.17428399622440338, "learning_rate": 0.000154399567974051, "loss": 0.1071, "step": 262 }, { "epoch": 1.6931726907630522, "grad_norm": 0.20689523220062256, "learning_rate": 0.00015404808466662508, "loss": 0.1164, "step": 263 }, { "epoch": 1.699598393574297, "grad_norm": 0.19431588053703308, "learning_rate": 0.0001536956555586839, "loss": 0.1095, "step": 264 }, { "epoch": 1.7060240963855422, "grad_norm": 0.28836753964424133, "learning_rate": 0.0001533422868174697, "loss": 0.0958, "step": 265 }, { "epoch": 1.7124497991967873, "grad_norm": 0.16699343919754028, "learning_rate": 0.00015298798462666765, "loss": 0.1017, "step": 266 }, { "epoch": 1.7188755020080322, "grad_norm": 0.20274591445922852, "learning_rate": 0.00015263275518629754, "loss": 0.1082, "step": 267 }, { "epoch": 1.725301204819277, "grad_norm": 0.20032569766044617, "learning_rate": 0.00015227660471260528, "loss": 0.1201, "step": 268 }, { "epoch": 1.731726907630522, "grad_norm": 0.19980573654174805, "learning_rate": 0.00015191953943795427, "loss": 0.1072, "step": 269 }, { "epoch": 1.738152610441767, "grad_norm": 0.2036619335412979, "learning_rate": 0.00015156156561071612, "loss": 0.1083, "step": 270 }, { "epoch": 1.7445783132530122, "grad_norm": 0.17177674174308777, "learning_rate": 0.0001512026894951615, "loss": 0.0981, "step": 271 }, { "epoch": 1.751004016064257, "grad_norm": 0.2405836135149002, "learning_rate": 0.00015084291737135048, "loss": 0.1005, "step": 272 }, { "epoch": 1.757429718875502, "grad_norm": 0.20927219092845917, "learning_rate": 0.00015048225553502244, "loss": 0.0895, "step": 273 }, { "epoch": 1.763855421686747, "grad_norm": 0.22314170002937317, "learning_rate": 0.00015012071029748614, "loss": 0.0874, "step": 274 }, { "epoch": 1.7702811244979921, "grad_norm": 0.21546539664268494, "learning_rate": 0.00014975828798550933, "loss": 0.0765, "step": 275 }, { "epoch": 1.776706827309237, "grad_norm": 0.27791547775268555, "learning_rate": 0.00014939499494120761, "loss": 0.0851, "step": 276 }, { "epoch": 1.783132530120482, "grad_norm": 0.23379187285900116, "learning_rate": 0.00014903083752193397, "loss": 0.1173, "step": 277 }, { "epoch": 1.7895582329317268, "grad_norm": 0.30948150157928467, "learning_rate": 0.0001486658221001672, "loss": 0.0994, "step": 278 }, { "epoch": 1.7959839357429719, "grad_norm": 0.31466349959373474, "learning_rate": 0.0001482999550634006, "loss": 0.1006, "step": 279 }, { "epoch": 1.802409638554217, "grad_norm": 0.29642632603645325, "learning_rate": 0.0001479332428140299, "loss": 0.0992, "step": 280 }, { "epoch": 1.8088353413654619, "grad_norm": 0.155403733253479, "learning_rate": 0.00014756569176924153, "loss": 0.081, "step": 281 }, { "epoch": 1.8152610441767068, "grad_norm": 0.15768149495124817, "learning_rate": 0.0001471973083609002, "loss": 0.0994, "step": 282 }, { "epoch": 1.8216867469879519, "grad_norm": 0.21333357691764832, "learning_rate": 0.00014682809903543632, "loss": 0.0975, "step": 283 }, { "epoch": 1.8281124497991967, "grad_norm": 0.1593853384256363, "learning_rate": 0.00014645807025373328, "loss": 0.1053, "step": 284 }, { "epoch": 1.8345381526104418, "grad_norm": 0.2197875678539276, "learning_rate": 0.0001460872284910143, "loss": 0.1231, "step": 285 }, { "epoch": 1.8409638554216867, "grad_norm": 0.1849106401205063, "learning_rate": 0.000145715580236729, "loss": 0.1369, "step": 286 }, { "epoch": 1.8473895582329316, "grad_norm": 0.19566693902015686, "learning_rate": 0.00014534313199444031, "loss": 0.1229, "step": 287 }, { "epoch": 1.8538152610441767, "grad_norm": 0.1796487420797348, "learning_rate": 0.00014496989028171012, "loss": 0.1046, "step": 288 }, { "epoch": 1.8602409638554218, "grad_norm": 0.18316736817359924, "learning_rate": 0.00014459586162998545, "loss": 0.1128, "step": 289 }, { "epoch": 1.8666666666666667, "grad_norm": 0.11896166950464249, "learning_rate": 0.00014422105258448425, "loss": 0.0722, "step": 290 }, { "epoch": 1.8730923694779116, "grad_norm": 0.17587630450725555, "learning_rate": 0.00014384546970408067, "loss": 0.1201, "step": 291 }, { "epoch": 1.8795180722891565, "grad_norm": 0.22650708258152008, "learning_rate": 0.0001434691195611905, "loss": 0.1196, "step": 292 }, { "epoch": 1.8859437751004016, "grad_norm": 0.17509253323078156, "learning_rate": 0.000143092008741656, "loss": 0.1129, "step": 293 }, { "epoch": 1.8923694779116467, "grad_norm": 0.29918667674064636, "learning_rate": 0.00014271414384463063, "loss": 0.1159, "step": 294 }, { "epoch": 1.8987951807228916, "grad_norm": 0.20377112925052643, "learning_rate": 0.00014233553148246364, "loss": 0.1046, "step": 295 }, { "epoch": 1.9052208835341364, "grad_norm": 0.20923396944999695, "learning_rate": 0.00014195617828058446, "loss": 0.1094, "step": 296 }, { "epoch": 1.9116465863453815, "grad_norm": 0.20258182287216187, "learning_rate": 0.00014157609087738656, "loss": 0.095, "step": 297 }, { "epoch": 1.9180722891566266, "grad_norm": 0.17875652015209198, "learning_rate": 0.00014119527592411146, "loss": 0.0937, "step": 298 }, { "epoch": 1.9244979919678715, "grad_norm": 0.21565286815166473, "learning_rate": 0.00014081374008473213, "loss": 0.0915, "step": 299 }, { "epoch": 1.9309236947791164, "grad_norm": 0.2412794530391693, "learning_rate": 0.0001404314900358366, "loss": 0.087, "step": 300 }, { "epoch": 1.9373493975903613, "grad_norm": 0.20525000989437103, "learning_rate": 0.00014004853246651092, "loss": 0.1062, "step": 301 }, { "epoch": 1.9437751004016064, "grad_norm": 0.20360495150089264, "learning_rate": 0.0001396648740782224, "loss": 0.0909, "step": 302 }, { "epoch": 1.9502008032128515, "grad_norm": 0.2990632653236389, "learning_rate": 0.000139280521584702, "loss": 0.0891, "step": 303 }, { "epoch": 1.9566265060240964, "grad_norm": 0.2693440020084381, "learning_rate": 0.00013889548171182702, "loss": 0.0964, "step": 304 }, { "epoch": 1.9630522088353413, "grad_norm": 0.35366424918174744, "learning_rate": 0.0001385097611975034, "loss": 0.1023, "step": 305 }, { "epoch": 1.9694779116465864, "grad_norm": 0.189253568649292, "learning_rate": 0.00013812336679154777, "loss": 0.0928, "step": 306 }, { "epoch": 1.9759036144578315, "grad_norm": 0.20141035318374634, "learning_rate": 0.0001377363052555693, "loss": 0.1159, "step": 307 }, { "epoch": 1.9823293172690764, "grad_norm": 0.29336804151535034, "learning_rate": 0.00013734858336285162, "loss": 0.1123, "step": 308 }, { "epoch": 1.9887550200803212, "grad_norm": 0.2803572714328766, "learning_rate": 0.00013696020789823388, "loss": 0.1217, "step": 309 }, { "epoch": 1.9951807228915661, "grad_norm": 0.2023596316576004, "learning_rate": 0.00013657118565799236, "loss": 0.081, "step": 310 }, { "epoch": 2.004819277108434, "grad_norm": 0.6364520192146301, "learning_rate": 0.00013618152344972142, "loss": 0.2296, "step": 311 }, { "epoch": 2.0112449799196788, "grad_norm": 0.15013103187084198, "learning_rate": 0.00013579122809221432, "loss": 0.0985, "step": 312 }, { "epoch": 2.0176706827309236, "grad_norm": 0.16968320310115814, "learning_rate": 0.00013540030641534404, "loss": 0.1061, "step": 313 }, { "epoch": 2.0240963855421685, "grad_norm": 0.11979459226131439, "learning_rate": 0.00013500876525994354, "loss": 0.0778, "step": 314 }, { "epoch": 2.030522088353414, "grad_norm": 0.1250924915075302, "learning_rate": 0.00013461661147768633, "loss": 0.076, "step": 315 }, { "epoch": 2.0369477911646587, "grad_norm": 0.14744716882705688, "learning_rate": 0.00013422385193096636, "loss": 0.1088, "step": 316 }, { "epoch": 2.0433734939759036, "grad_norm": 0.14878836274147034, "learning_rate": 0.000133830493492778, "loss": 0.0886, "step": 317 }, { "epoch": 2.0497991967871485, "grad_norm": 0.14586399495601654, "learning_rate": 0.00013343654304659574, "loss": 0.0737, "step": 318 }, { "epoch": 2.0562248995983934, "grad_norm": 0.26601502299308777, "learning_rate": 0.00013304200748625377, "loss": 0.1376, "step": 319 }, { "epoch": 2.0626506024096387, "grad_norm": 0.13429085910320282, "learning_rate": 0.0001326468937158254, "loss": 0.0653, "step": 320 }, { "epoch": 2.0690763052208836, "grad_norm": 0.15061058104038239, "learning_rate": 0.00013225120864950217, "loss": 0.0832, "step": 321 }, { "epoch": 2.0755020080321285, "grad_norm": 0.19543641805648804, "learning_rate": 0.00013185495921147272, "loss": 0.0904, "step": 322 }, { "epoch": 2.0819277108433734, "grad_norm": 0.19916664063930511, "learning_rate": 0.00013145815233580192, "loss": 0.1002, "step": 323 }, { "epoch": 2.0883534136546187, "grad_norm": 0.18353912234306335, "learning_rate": 0.00013106079496630937, "loss": 0.065, "step": 324 }, { "epoch": 2.0947791164658636, "grad_norm": 0.22987468540668488, "learning_rate": 0.00013066289405644778, "loss": 0.0889, "step": 325 }, { "epoch": 2.1012048192771084, "grad_norm": 0.18574562668800354, "learning_rate": 0.00013026445656918155, "loss": 0.0771, "step": 326 }, { "epoch": 2.1076305220883533, "grad_norm": 0.2045065313577652, "learning_rate": 0.00012986548947686467, "loss": 0.0761, "step": 327 }, { "epoch": 2.114056224899598, "grad_norm": 0.20475462079048157, "learning_rate": 0.00012946599976111883, "loss": 0.0603, "step": 328 }, { "epoch": 2.1204819277108435, "grad_norm": 0.2551652491092682, "learning_rate": 0.0001290659944127113, "loss": 0.0619, "step": 329 }, { "epoch": 2.1269076305220884, "grad_norm": 0.24741911888122559, "learning_rate": 0.0001286654804314325, "loss": 0.0685, "step": 330 }, { "epoch": 2.1333333333333333, "grad_norm": 0.3083299696445465, "learning_rate": 0.0001282644648259735, "loss": 0.073, "step": 331 }, { "epoch": 2.139759036144578, "grad_norm": 0.26823994517326355, "learning_rate": 0.00012786295461380344, "loss": 0.0743, "step": 332 }, { "epoch": 2.1461847389558235, "grad_norm": 0.267733633518219, "learning_rate": 0.00012746095682104669, "loss": 0.0914, "step": 333 }, { "epoch": 2.1526104417670684, "grad_norm": 0.5392053723335266, "learning_rate": 0.00012705847848235995, "loss": 0.0627, "step": 334 }, { "epoch": 2.1590361445783133, "grad_norm": 0.2896229922771454, "learning_rate": 0.00012665552664080907, "loss": 0.0777, "step": 335 }, { "epoch": 2.165461847389558, "grad_norm": 0.145288348197937, "learning_rate": 0.00012625210834774585, "loss": 0.0673, "step": 336 }, { "epoch": 2.171887550200803, "grad_norm": 0.20815478265285492, "learning_rate": 0.00012584823066268466, "loss": 0.0887, "step": 337 }, { "epoch": 2.1783132530120484, "grad_norm": 0.15457971394062042, "learning_rate": 0.00012544390065317887, "loss": 0.0806, "step": 338 }, { "epoch": 2.1847389558232932, "grad_norm": 0.1739414930343628, "learning_rate": 0.00012503912539469714, "loss": 0.0904, "step": 339 }, { "epoch": 2.191164658634538, "grad_norm": 0.19280481338500977, "learning_rate": 0.00012463391197049977, "loss": 0.0936, "step": 340 }, { "epoch": 2.197590361445783, "grad_norm": 0.19487957656383514, "learning_rate": 0.00012422826747151444, "loss": 0.0711, "step": 341 }, { "epoch": 2.2040160642570283, "grad_norm": 0.1922217458486557, "learning_rate": 0.00012382219899621246, "loss": 0.0937, "step": 342 }, { "epoch": 2.2104417670682732, "grad_norm": 0.19978083670139313, "learning_rate": 0.00012341571365048442, "loss": 0.09, "step": 343 }, { "epoch": 2.216867469879518, "grad_norm": 0.17337800562381744, "learning_rate": 0.00012300881854751568, "loss": 0.0849, "step": 344 }, { "epoch": 2.223293172690763, "grad_norm": 0.16611528396606445, "learning_rate": 0.0001226015208076622, "loss": 0.0695, "step": 345 }, { "epoch": 2.229718875502008, "grad_norm": 0.19705650210380554, "learning_rate": 0.0001221938275583257, "loss": 0.0924, "step": 346 }, { "epoch": 2.236144578313253, "grad_norm": 0.2103363275527954, "learning_rate": 0.00012178574593382899, "loss": 0.0971, "step": 347 }, { "epoch": 2.242570281124498, "grad_norm": 0.17318083345890045, "learning_rate": 0.0001213772830752912, "loss": 0.0764, "step": 348 }, { "epoch": 2.248995983935743, "grad_norm": 0.18497149646282196, "learning_rate": 0.0001209684461305028, "loss": 0.0775, "step": 349 }, { "epoch": 2.255421686746988, "grad_norm": 0.16112066805362701, "learning_rate": 0.00012055924225380038, "loss": 0.066, "step": 350 }, { "epoch": 2.261847389558233, "grad_norm": 0.14638184010982513, "learning_rate": 0.00012014967860594164, "loss": 0.0667, "step": 351 }, { "epoch": 2.268273092369478, "grad_norm": 0.23443520069122314, "learning_rate": 0.00011973976235398, "loss": 0.0746, "step": 352 }, { "epoch": 2.274698795180723, "grad_norm": 0.15776869654655457, "learning_rate": 0.0001193295006711392, "loss": 0.0576, "step": 353 }, { "epoch": 2.281124497991968, "grad_norm": 0.1940746307373047, "learning_rate": 0.00011891890073668763, "loss": 0.0614, "step": 354 }, { "epoch": 2.2875502008032127, "grad_norm": 0.14611606299877167, "learning_rate": 0.00011850796973581302, "loss": 0.057, "step": 355 }, { "epoch": 2.293975903614458, "grad_norm": 0.2916417419910431, "learning_rate": 0.00011809671485949636, "loss": 0.0677, "step": 356 }, { "epoch": 2.300401606425703, "grad_norm": 0.2484523206949234, "learning_rate": 0.00011768514330438627, "loss": 0.0846, "step": 357 }, { "epoch": 2.306827309236948, "grad_norm": 0.18685579299926758, "learning_rate": 0.00011727326227267308, "loss": 0.0682, "step": 358 }, { "epoch": 2.3132530120481927, "grad_norm": 0.23231656849384308, "learning_rate": 0.00011686107897196255, "loss": 0.0782, "step": 359 }, { "epoch": 2.319678714859438, "grad_norm": 0.2503097653388977, "learning_rate": 0.00011644860061515008, "loss": 0.0745, "step": 360 }, { "epoch": 2.326104417670683, "grad_norm": 0.15438750386238098, "learning_rate": 0.00011603583442029426, "loss": 0.0599, "step": 361 }, { "epoch": 2.3325301204819278, "grad_norm": 0.11933010071516037, "learning_rate": 0.00011562278761049066, "loss": 0.0705, "step": 362 }, { "epoch": 2.3389558232931726, "grad_norm": 0.2344401627779007, "learning_rate": 0.00011520946741374534, "loss": 0.1086, "step": 363 }, { "epoch": 2.3453815261044175, "grad_norm": 0.1429268717765808, "learning_rate": 0.00011479588106284848, "loss": 0.0793, "step": 364 }, { "epoch": 2.3518072289156624, "grad_norm": 0.170423224568367, "learning_rate": 0.00011438203579524778, "loss": 0.0876, "step": 365 }, { "epoch": 2.3582329317269077, "grad_norm": 0.21951356530189514, "learning_rate": 0.00011396793885292165, "loss": 0.1001, "step": 366 }, { "epoch": 2.3646586345381526, "grad_norm": 0.18042345345020294, "learning_rate": 0.00011355359748225279, "loss": 0.096, "step": 367 }, { "epoch": 2.3710843373493975, "grad_norm": 0.19519764184951782, "learning_rate": 0.00011313901893390113, "loss": 0.0842, "step": 368 }, { "epoch": 2.3775100401606424, "grad_norm": 0.1564580202102661, "learning_rate": 0.00011272421046267696, "loss": 0.0849, "step": 369 }, { "epoch": 2.3839357429718877, "grad_norm": 0.21376530826091766, "learning_rate": 0.00011230917932741418, "loss": 0.0848, "step": 370 }, { "epoch": 2.3903614457831326, "grad_norm": 0.16924604773521423, "learning_rate": 0.00011189393279084308, "loss": 0.0986, "step": 371 }, { "epoch": 2.3967871485943775, "grad_norm": 0.142182394862175, "learning_rate": 0.00011147847811946328, "loss": 0.0753, "step": 372 }, { "epoch": 2.4032128514056224, "grad_norm": 0.2380589097738266, "learning_rate": 0.00011106282258341665, "loss": 0.0873, "step": 373 }, { "epoch": 2.4096385542168672, "grad_norm": 0.17887993156909943, "learning_rate": 0.00011064697345636002, "loss": 0.0721, "step": 374 }, { "epoch": 2.4160642570281126, "grad_norm": 0.16593624651432037, "learning_rate": 0.00011023093801533785, "loss": 0.0673, "step": 375 }, { "epoch": 2.4224899598393574, "grad_norm": 0.19967305660247803, "learning_rate": 0.00010981472354065514, "loss": 0.0839, "step": 376 }, { "epoch": 2.4289156626506023, "grad_norm": 0.1671656221151352, "learning_rate": 0.00010939833731574967, "loss": 0.0692, "step": 377 }, { "epoch": 2.435341365461847, "grad_norm": 0.2287617176771164, "learning_rate": 0.00010898178662706471, "loss": 0.0841, "step": 378 }, { "epoch": 2.4417670682730925, "grad_norm": 0.1811789572238922, "learning_rate": 0.00010856507876392166, "loss": 0.0549, "step": 379 }, { "epoch": 2.4481927710843374, "grad_norm": 0.1921742856502533, "learning_rate": 0.00010814822101839224, "loss": 0.0723, "step": 380 }, { "epoch": 2.4546184738955823, "grad_norm": 0.24973797798156738, "learning_rate": 0.00010773122068517103, "loss": 0.0746, "step": 381 }, { "epoch": 2.461044176706827, "grad_norm": 0.22716690599918365, "learning_rate": 0.00010731408506144782, "loss": 0.0837, "step": 382 }, { "epoch": 2.467469879518072, "grad_norm": 0.19027042388916016, "learning_rate": 0.00010689682144677983, "loss": 0.0575, "step": 383 }, { "epoch": 2.4738955823293174, "grad_norm": 0.26334255933761597, "learning_rate": 0.00010647943714296405, "loss": 0.0688, "step": 384 }, { "epoch": 2.4803212851405623, "grad_norm": 0.26130348443984985, "learning_rate": 0.00010606193945390943, "loss": 0.0704, "step": 385 }, { "epoch": 2.486746987951807, "grad_norm": 0.16175444424152374, "learning_rate": 0.00010564433568550909, "loss": 0.0739, "step": 386 }, { "epoch": 2.493172690763052, "grad_norm": 0.34543898701667786, "learning_rate": 0.00010522663314551247, "loss": 0.0883, "step": 387 }, { "epoch": 2.4995983935742974, "grad_norm": 0.16623060405254364, "learning_rate": 0.00010480883914339736, "loss": 0.0916, "step": 388 }, { "epoch": 2.5060240963855422, "grad_norm": 0.15224424004554749, "learning_rate": 0.0001043909609902422, "loss": 0.1017, "step": 389 }, { "epoch": 2.512449799196787, "grad_norm": 0.2146722823381424, "learning_rate": 0.00010397300599859785, "loss": 0.0699, "step": 390 }, { "epoch": 2.518875502008032, "grad_norm": 0.18753445148468018, "learning_rate": 0.00010355498148235996, "loss": 0.1012, "step": 391 }, { "epoch": 2.525301204819277, "grad_norm": 0.19780860841274261, "learning_rate": 0.00010313689475664063, "loss": 0.0876, "step": 392 }, { "epoch": 2.531726907630522, "grad_norm": 0.15880204737186432, "learning_rate": 0.0001027187531376407, "loss": 0.0978, "step": 393 }, { "epoch": 2.538152610441767, "grad_norm": 0.20312370359897614, "learning_rate": 0.00010230056394252161, "loss": 0.0978, "step": 394 }, { "epoch": 2.544578313253012, "grad_norm": 0.17712520062923431, "learning_rate": 0.00010188233448927724, "loss": 0.0632, "step": 395 }, { "epoch": 2.551004016064257, "grad_norm": 0.1579594612121582, "learning_rate": 0.00010146407209660607, "loss": 0.0868, "step": 396 }, { "epoch": 2.557429718875502, "grad_norm": 0.18369610607624054, "learning_rate": 0.00010104578408378289, "loss": 0.0721, "step": 397 }, { "epoch": 2.563855421686747, "grad_norm": 0.1889200061559677, "learning_rate": 0.00010062747777053094, "loss": 0.079, "step": 398 }, { "epoch": 2.570281124497992, "grad_norm": 0.1784103661775589, "learning_rate": 0.00010020916047689358, "loss": 0.0703, "step": 399 }, { "epoch": 2.576706827309237, "grad_norm": 0.1942092776298523, "learning_rate": 9.979083952310643e-05, "loss": 0.0851, "step": 400 }, { "epoch": 2.5831325301204817, "grad_norm": 0.17258763313293457, "learning_rate": 9.937252222946908e-05, "loss": 0.0632, "step": 401 }, { "epoch": 2.589558232931727, "grad_norm": 0.1579107642173767, "learning_rate": 9.895421591621712e-05, "loss": 0.0568, "step": 402 }, { "epoch": 2.595983935742972, "grad_norm": 0.16073255240917206, "learning_rate": 9.853592790339396e-05, "loss": 0.0571, "step": 403 }, { "epoch": 2.602409638554217, "grad_norm": 0.19859455525875092, "learning_rate": 9.811766551072278e-05, "loss": 0.0695, "step": 404 }, { "epoch": 2.6088353413654617, "grad_norm": 0.20779484510421753, "learning_rate": 9.769943605747844e-05, "loss": 0.0679, "step": 405 }, { "epoch": 2.615261044176707, "grad_norm": 0.1808435469865799, "learning_rate": 9.72812468623593e-05, "loss": 0.0663, "step": 406 }, { "epoch": 2.621686746987952, "grad_norm": 0.18272174894809723, "learning_rate": 9.686310524335938e-05, "loss": 0.0617, "step": 407 }, { "epoch": 2.628112449799197, "grad_norm": 0.21186350286006927, "learning_rate": 9.644501851764007e-05, "loss": 0.0597, "step": 408 }, { "epoch": 2.6345381526104417, "grad_norm": 0.1978769302368164, "learning_rate": 9.602699400140218e-05, "loss": 0.0573, "step": 409 }, { "epoch": 2.6409638554216865, "grad_norm": 0.2839438319206238, "learning_rate": 9.560903900975785e-05, "loss": 0.0705, "step": 410 }, { "epoch": 2.647389558232932, "grad_norm": 0.19903969764709473, "learning_rate": 9.519116085660267e-05, "loss": 0.0696, "step": 411 }, { "epoch": 2.6538152610441768, "grad_norm": 0.2470594197511673, "learning_rate": 9.477336685448754e-05, "loss": 0.0781, "step": 412 }, { "epoch": 2.6602409638554216, "grad_norm": 0.16970917582511902, "learning_rate": 9.435566431449092e-05, "loss": 0.0799, "step": 413 }, { "epoch": 2.6666666666666665, "grad_norm": 0.20240218937397003, "learning_rate": 9.39380605460906e-05, "loss": 0.0751, "step": 414 }, { "epoch": 2.673092369477912, "grad_norm": 0.15831498801708221, "learning_rate": 9.352056285703599e-05, "loss": 0.0764, "step": 415 }, { "epoch": 2.6795180722891567, "grad_norm": 0.2208964228630066, "learning_rate": 9.31031785532202e-05, "loss": 0.089, "step": 416 }, { "epoch": 2.6859437751004016, "grad_norm": 0.23068203032016754, "learning_rate": 9.268591493855222e-05, "loss": 0.0822, "step": 417 }, { "epoch": 2.6923694779116465, "grad_norm": 0.18787510693073273, "learning_rate": 9.226877931482898e-05, "loss": 0.0762, "step": 418 }, { "epoch": 2.6987951807228914, "grad_norm": 0.22357405722141266, "learning_rate": 9.18517789816078e-05, "loss": 0.0889, "step": 419 }, { "epoch": 2.7052208835341367, "grad_norm": 0.15312552452087402, "learning_rate": 9.143492123607838e-05, "loss": 0.0579, "step": 420 }, { "epoch": 2.7116465863453816, "grad_norm": 0.2166433036327362, "learning_rate": 9.101821337293532e-05, "loss": 0.1423, "step": 421 }, { "epoch": 2.7180722891566265, "grad_norm": 0.1675548553466797, "learning_rate": 9.060166268425038e-05, "loss": 0.09, "step": 422 }, { "epoch": 2.7244979919678713, "grad_norm": 0.16221830248832703, "learning_rate": 9.018527645934488e-05, "loss": 0.0606, "step": 423 }, { "epoch": 2.7309236947791167, "grad_norm": 0.20156528055667877, "learning_rate": 8.976906198466213e-05, "loss": 0.0855, "step": 424 }, { "epoch": 2.7373493975903616, "grad_norm": 0.2165391594171524, "learning_rate": 8.935302654364e-05, "loss": 0.0935, "step": 425 }, { "epoch": 2.7437751004016064, "grad_norm": 0.16395288705825806, "learning_rate": 8.893717741658336e-05, "loss": 0.092, "step": 426 }, { "epoch": 2.7502008032128513, "grad_norm": 0.24706722795963287, "learning_rate": 8.852152188053674e-05, "loss": 0.0717, "step": 427 }, { "epoch": 2.756626506024096, "grad_norm": 0.19659721851348877, "learning_rate": 8.810606720915697e-05, "loss": 0.0703, "step": 428 }, { "epoch": 2.7630522088353415, "grad_norm": 0.2377336174249649, "learning_rate": 8.769082067258585e-05, "loss": 0.0711, "step": 429 }, { "epoch": 2.7694779116465864, "grad_norm": 0.1496395319700241, "learning_rate": 8.727578953732303e-05, "loss": 0.0565, "step": 430 }, { "epoch": 2.7759036144578313, "grad_norm": 0.230519101023674, "learning_rate": 8.686098106609889e-05, "loss": 0.0676, "step": 431 }, { "epoch": 2.782329317269076, "grad_norm": 0.1836637556552887, "learning_rate": 8.644640251774722e-05, "loss": 0.0613, "step": 432 }, { "epoch": 2.7887550200803215, "grad_norm": 0.17303813993930817, "learning_rate": 8.603206114707837e-05, "loss": 0.066, "step": 433 }, { "epoch": 2.7951807228915664, "grad_norm": 0.18832409381866455, "learning_rate": 8.561796420475227e-05, "loss": 0.0539, "step": 434 }, { "epoch": 2.8016064257028113, "grad_norm": 0.23530371487140656, "learning_rate": 8.52041189371515e-05, "loss": 0.0669, "step": 435 }, { "epoch": 2.808032128514056, "grad_norm": 0.12783204019069672, "learning_rate": 8.479053258625467e-05, "loss": 0.0686, "step": 436 }, { "epoch": 2.814457831325301, "grad_norm": 0.16126012802124023, "learning_rate": 8.437721238950938e-05, "loss": 0.0783, "step": 437 }, { "epoch": 2.820883534136546, "grad_norm": 0.16663892567157745, "learning_rate": 8.396416557970576e-05, "loss": 0.0899, "step": 438 }, { "epoch": 2.8273092369477912, "grad_norm": 0.12750643491744995, "learning_rate": 8.355139938484995e-05, "loss": 0.087, "step": 439 }, { "epoch": 2.833734939759036, "grad_norm": 0.1801062375307083, "learning_rate": 8.313892102803749e-05, "loss": 0.079, "step": 440 }, { "epoch": 2.840160642570281, "grad_norm": 0.2397543489933014, "learning_rate": 8.272673772732695e-05, "loss": 0.0982, "step": 441 }, { "epoch": 2.8465863453815263, "grad_norm": 0.16935226321220398, "learning_rate": 8.231485669561371e-05, "loss": 0.0675, "step": 442 }, { "epoch": 2.853012048192771, "grad_norm": 0.14082035422325134, "learning_rate": 8.190328514050365e-05, "loss": 0.0748, "step": 443 }, { "epoch": 2.859437751004016, "grad_norm": 0.1471889168024063, "learning_rate": 8.1492030264187e-05, "loss": 0.0622, "step": 444 }, { "epoch": 2.865863453815261, "grad_norm": 0.19526907801628113, "learning_rate": 8.108109926331238e-05, "loss": 0.0737, "step": 445 }, { "epoch": 2.872289156626506, "grad_norm": 0.15070655941963196, "learning_rate": 8.067049932886084e-05, "loss": 0.0751, "step": 446 }, { "epoch": 2.8787148594377507, "grad_norm": 0.1857602894306183, "learning_rate": 8.026023764601999e-05, "loss": 0.0717, "step": 447 }, { "epoch": 2.885140562248996, "grad_norm": 0.20066916942596436, "learning_rate": 7.985032139405836e-05, "loss": 0.0792, "step": 448 }, { "epoch": 2.891566265060241, "grad_norm": 0.19384227693080902, "learning_rate": 7.944075774619963e-05, "loss": 0.0575, "step": 449 }, { "epoch": 2.897991967871486, "grad_norm": 0.21878117322921753, "learning_rate": 7.903155386949723e-05, "loss": 0.0799, "step": 450 }, { "epoch": 2.904417670682731, "grad_norm": 0.2109462022781372, "learning_rate": 7.862271692470884e-05, "loss": 0.081, "step": 451 }, { "epoch": 2.910843373493976, "grad_norm": 0.2199956178665161, "learning_rate": 7.821425406617106e-05, "loss": 0.0749, "step": 452 }, { "epoch": 2.917269076305221, "grad_norm": 0.2037278711795807, "learning_rate": 7.780617244167432e-05, "loss": 0.0529, "step": 453 }, { "epoch": 2.923694779116466, "grad_norm": 0.19817106425762177, "learning_rate": 7.739847919233781e-05, "loss": 0.0482, "step": 454 }, { "epoch": 2.9301204819277107, "grad_norm": 0.16905316710472107, "learning_rate": 7.699118145248434e-05, "loss": 0.0518, "step": 455 }, { "epoch": 2.9365461847389556, "grad_norm": 0.18011616170406342, "learning_rate": 7.658428634951562e-05, "loss": 0.0619, "step": 456 }, { "epoch": 2.942971887550201, "grad_norm": 0.18177881836891174, "learning_rate": 7.617780100378756e-05, "loss": 0.057, "step": 457 }, { "epoch": 2.9493975903614458, "grad_norm": 0.1976725161075592, "learning_rate": 7.57717325284856e-05, "loss": 0.0512, "step": 458 }, { "epoch": 2.9558232931726907, "grad_norm": 0.16665002703666687, "learning_rate": 7.536608802950027e-05, "loss": 0.0603, "step": 459 }, { "epoch": 2.962248995983936, "grad_norm": 0.2631849944591522, "learning_rate": 7.496087460530285e-05, "loss": 0.0644, "step": 460 }, { "epoch": 2.968674698795181, "grad_norm": 0.14662465453147888, "learning_rate": 7.455609934682116e-05, "loss": 0.1023, "step": 461 }, { "epoch": 2.9751004016064257, "grad_norm": 0.15676066279411316, "learning_rate": 7.415176933731536e-05, "loss": 0.078, "step": 462 }, { "epoch": 2.9815261044176706, "grad_norm": 0.15064530074596405, "learning_rate": 7.374789165225416e-05, "loss": 0.0697, "step": 463 }, { "epoch": 2.9879518072289155, "grad_norm": 0.21094205975532532, "learning_rate": 7.334447335919096e-05, "loss": 0.065, "step": 464 }, { "epoch": 2.9943775100401604, "grad_norm": 0.19423390924930573, "learning_rate": 7.294152151764006e-05, "loss": 0.0587, "step": 465 }, { "epoch": 3.004016064257028, "grad_norm": 0.5364864468574524, "learning_rate": 7.253904317895332e-05, "loss": 0.0888, "step": 466 }, { "epoch": 3.010441767068273, "grad_norm": 0.10208001732826233, "learning_rate": 7.21370453861966e-05, "loss": 0.0712, "step": 467 }, { "epoch": 3.016867469879518, "grad_norm": 0.13294284045696259, "learning_rate": 7.173553517402652e-05, "loss": 0.0869, "step": 468 }, { "epoch": 3.0232931726907633, "grad_norm": 0.11979340761899948, "learning_rate": 7.133451956856751e-05, "loss": 0.0719, "step": 469 }, { "epoch": 3.029718875502008, "grad_norm": 0.1777201145887375, "learning_rate": 7.093400558728871e-05, "loss": 0.068, "step": 470 }, { "epoch": 3.036144578313253, "grad_norm": 0.14670364558696747, "learning_rate": 7.053400023888115e-05, "loss": 0.0693, "step": 471 }, { "epoch": 3.042570281124498, "grad_norm": 0.15252293646335602, "learning_rate": 7.013451052313534e-05, "loss": 0.0649, "step": 472 }, { "epoch": 3.048995983935743, "grad_norm": 0.14201124012470245, "learning_rate": 6.973554343081846e-05, "loss": 0.0515, "step": 473 }, { "epoch": 3.055421686746988, "grad_norm": 0.15977801382541656, "learning_rate": 6.933710594355225e-05, "loss": 0.0593, "step": 474 }, { "epoch": 3.061847389558233, "grad_norm": 0.11069828271865845, "learning_rate": 6.893920503369068e-05, "loss": 0.0407, "step": 475 }, { "epoch": 3.068273092369478, "grad_norm": 0.1887078583240509, "learning_rate": 6.854184766419812e-05, "loss": 0.0619, "step": 476 }, { "epoch": 3.0746987951807228, "grad_norm": 0.17938368022441864, "learning_rate": 6.814504078852729e-05, "loss": 0.0634, "step": 477 }, { "epoch": 3.081124497991968, "grad_norm": 0.1669437140226364, "learning_rate": 6.774879135049787e-05, "loss": 0.0518, "step": 478 }, { "epoch": 3.087550200803213, "grad_norm": 0.14382565021514893, "learning_rate": 6.735310628417461e-05, "loss": 0.0472, "step": 479 }, { "epoch": 3.093975903614458, "grad_norm": 0.176390141248703, "learning_rate": 6.695799251374625e-05, "loss": 0.0519, "step": 480 }, { "epoch": 3.1004016064257027, "grad_norm": 0.19478528201580048, "learning_rate": 6.656345695340431e-05, "loss": 0.0631, "step": 481 }, { "epoch": 3.1068273092369476, "grad_norm": 0.24837417900562286, "learning_rate": 6.616950650722205e-05, "loss": 0.0646, "step": 482 }, { "epoch": 3.113253012048193, "grad_norm": 0.20292866230010986, "learning_rate": 6.577614806903365e-05, "loss": 0.048, "step": 483 }, { "epoch": 3.119678714859438, "grad_norm": 0.14314322173595428, "learning_rate": 6.538338852231367e-05, "loss": 0.0387, "step": 484 }, { "epoch": 3.1261044176706827, "grad_norm": 0.623166561126709, "learning_rate": 6.499123474005647e-05, "loss": 0.056, "step": 485 }, { "epoch": 3.1325301204819276, "grad_norm": 0.18953579664230347, "learning_rate": 6.4599693584656e-05, "loss": 0.052, "step": 486 }, { "epoch": 3.1389558232931725, "grad_norm": 0.30603814125061035, "learning_rate": 6.420877190778569e-05, "loss": 0.0628, "step": 487 }, { "epoch": 3.145381526104418, "grad_norm": 0.22006677091121674, "learning_rate": 6.381847655027864e-05, "loss": 0.0492, "step": 488 }, { "epoch": 3.1518072289156627, "grad_norm": 0.2588195204734802, "learning_rate": 6.342881434200765e-05, "loss": 0.0466, "step": 489 }, { "epoch": 3.1582329317269076, "grad_norm": 0.22890865802764893, "learning_rate": 6.303979210176614e-05, "loss": 0.0621, "step": 490 }, { "epoch": 3.1646586345381524, "grad_norm": 0.15218952298164368, "learning_rate": 6.26514166371484e-05, "loss": 0.0451, "step": 491 }, { "epoch": 3.1710843373493978, "grad_norm": 0.11059743911027908, "learning_rate": 6.226369474443072e-05, "loss": 0.0769, "step": 492 }, { "epoch": 3.1775100401606426, "grad_norm": 0.1349543184041977, "learning_rate": 6.18766332084523e-05, "loss": 0.0576, "step": 493 }, { "epoch": 3.1839357429718875, "grad_norm": 0.14686280488967896, "learning_rate": 6.149023880249665e-05, "loss": 0.0839, "step": 494 }, { "epoch": 3.1903614457831324, "grad_norm": 0.14011719822883606, "learning_rate": 6.110451828817298e-05, "loss": 0.0591, "step": 495 }, { "epoch": 3.1967871485943773, "grad_norm": 0.18035003542900085, "learning_rate": 6.071947841529801e-05, "loss": 0.069, "step": 496 }, { "epoch": 3.2032128514056226, "grad_norm": 0.1427018791437149, "learning_rate": 6.03351259217776e-05, "loss": 0.0647, "step": 497 }, { "epoch": 3.2096385542168675, "grad_norm": 0.21494245529174805, "learning_rate": 5.995146753348909e-05, "loss": 0.0764, "step": 498 }, { "epoch": 3.2160642570281124, "grad_norm": 0.18623854219913483, "learning_rate": 5.9568509964163464e-05, "loss": 0.0558, "step": 499 }, { "epoch": 3.2224899598393573, "grad_norm": 0.13678805530071259, "learning_rate": 5.9186259915267916e-05, "loss": 0.048, "step": 500 }, { "epoch": 3.2289156626506026, "grad_norm": 0.14847120642662048, "learning_rate": 5.880472407588857e-05, "loss": 0.0668, "step": 501 }, { "epoch": 3.2353413654618475, "grad_norm": 0.14581717550754547, "learning_rate": 5.842390912261344e-05, "loss": 0.0424, "step": 502 }, { "epoch": 3.2417670682730924, "grad_norm": 0.14835858345031738, "learning_rate": 5.8043821719415534e-05, "loss": 0.0565, "step": 503 }, { "epoch": 3.2481927710843372, "grad_norm": 0.1661667823791504, "learning_rate": 5.7664468517536395e-05, "loss": 0.0432, "step": 504 }, { "epoch": 3.254618473895582, "grad_norm": 0.13604165613651276, "learning_rate": 5.728585615536946e-05, "loss": 0.0417, "step": 505 }, { "epoch": 3.2610441767068274, "grad_norm": 0.18449333310127258, "learning_rate": 5.6907991258344e-05, "loss": 0.0662, "step": 506 }, { "epoch": 3.2674698795180723, "grad_norm": 0.14538291096687317, "learning_rate": 5.6530880438809494e-05, "loss": 0.0378, "step": 507 }, { "epoch": 3.273895582329317, "grad_norm": 0.19400519132614136, "learning_rate": 5.615453029591935e-05, "loss": 0.0494, "step": 508 }, { "epoch": 3.280321285140562, "grad_norm": 0.2027265727519989, "learning_rate": 5.5778947415515784e-05, "loss": 0.0472, "step": 509 }, { "epoch": 3.2867469879518074, "grad_norm": 0.18922823667526245, "learning_rate": 5.540413837001459e-05, "loss": 0.0393, "step": 510 }, { "epoch": 3.2931726907630523, "grad_norm": 0.20464153587818146, "learning_rate": 5.50301097182899e-05, "loss": 0.0369, "step": 511 }, { "epoch": 3.299598393574297, "grad_norm": 0.23268243670463562, "learning_rate": 5.465686800555967e-05, "loss": 0.0362, "step": 512 }, { "epoch": 3.306024096385542, "grad_norm": 0.16447444260120392, "learning_rate": 5.4284419763271e-05, "loss": 0.0305, "step": 513 }, { "epoch": 3.312449799196787, "grad_norm": 0.20870855450630188, "learning_rate": 5.391277150898575e-05, "loss": 0.0539, "step": 514 }, { "epoch": 3.3188755020080323, "grad_norm": 0.19453927874565125, "learning_rate": 5.354192974626674e-05, "loss": 0.0462, "step": 515 }, { "epoch": 3.325301204819277, "grad_norm": 0.12926504015922546, "learning_rate": 5.317190096456368e-05, "loss": 0.047, "step": 516 }, { "epoch": 3.331726907630522, "grad_norm": 0.13683585822582245, "learning_rate": 5.2802691639099834e-05, "loss": 0.0655, "step": 517 }, { "epoch": 3.338152610441767, "grad_norm": 0.13933661580085754, "learning_rate": 5.24343082307585e-05, "loss": 0.0713, "step": 518 }, { "epoch": 3.3445783132530122, "grad_norm": 0.15325239300727844, "learning_rate": 5.206675718597012e-05, "loss": 0.0471, "step": 519 }, { "epoch": 3.351004016064257, "grad_norm": 0.21659892797470093, "learning_rate": 5.1700044936599434e-05, "loss": 0.1054, "step": 520 }, { "epoch": 3.357429718875502, "grad_norm": 0.1403968334197998, "learning_rate": 5.133417789983277e-05, "loss": 0.0668, "step": 521 }, { "epoch": 3.363855421686747, "grad_norm": 0.16732285916805267, "learning_rate": 5.0969162478066055e-05, "loss": 0.0587, "step": 522 }, { "epoch": 3.3702811244979918, "grad_norm": 0.12107214331626892, "learning_rate": 5.060500505879244e-05, "loss": 0.0534, "step": 523 }, { "epoch": 3.376706827309237, "grad_norm": 0.1527867466211319, "learning_rate": 5.0241712014490684e-05, "loss": 0.0497, "step": 524 }, { "epoch": 3.383132530120482, "grad_norm": 0.15944266319274902, "learning_rate": 4.9879289702513845e-05, "loss": 0.0653, "step": 525 }, { "epoch": 3.389558232931727, "grad_norm": 0.15926086902618408, "learning_rate": 4.95177444649776e-05, "loss": 0.049, "step": 526 }, { "epoch": 3.3959839357429717, "grad_norm": 0.13858525454998016, "learning_rate": 4.9157082628649545e-05, "loss": 0.0544, "step": 527 }, { "epoch": 3.402409638554217, "grad_norm": 0.14041852951049805, "learning_rate": 4.87973105048385e-05, "loss": 0.0481, "step": 528 }, { "epoch": 3.408835341365462, "grad_norm": 0.12636280059814453, "learning_rate": 4.8438434389283895e-05, "loss": 0.0533, "step": 529 }, { "epoch": 3.415261044176707, "grad_norm": 0.21128305792808533, "learning_rate": 4.8080460562045736e-05, "loss": 0.0532, "step": 530 }, { "epoch": 3.4216867469879517, "grad_norm": 0.13390301167964935, "learning_rate": 4.7723395287394746e-05, "loss": 0.0334, "step": 531 }, { "epoch": 3.4281124497991966, "grad_norm": 0.17304478585720062, "learning_rate": 4.736724481370248e-05, "loss": 0.0537, "step": 532 }, { "epoch": 3.434538152610442, "grad_norm": 0.17624051868915558, "learning_rate": 4.701201537333237e-05, "loss": 0.045, "step": 533 }, { "epoch": 3.440963855421687, "grad_norm": 0.18076051771640778, "learning_rate": 4.6657713182530316e-05, "loss": 0.0458, "step": 534 }, { "epoch": 3.4473895582329317, "grad_norm": 0.15712404251098633, "learning_rate": 4.630434444131615e-05, "loss": 0.0403, "step": 535 }, { "epoch": 3.4538152610441766, "grad_norm": 0.1949484795331955, "learning_rate": 4.595191533337494e-05, "loss": 0.0455, "step": 536 }, { "epoch": 3.460240963855422, "grad_norm": 0.18234334886074066, "learning_rate": 4.560043202594899e-05, "loss": 0.0347, "step": 537 }, { "epoch": 3.466666666666667, "grad_norm": 0.23936811089515686, "learning_rate": 4.524990066972982e-05, "loss": 0.0567, "step": 538 }, { "epoch": 3.4730923694779117, "grad_norm": 0.21601049602031708, "learning_rate": 4.4900327398750363e-05, "loss": 0.0487, "step": 539 }, { "epoch": 3.4795180722891565, "grad_norm": 0.20177985727787018, "learning_rate": 4.4551718330278006e-05, "loss": 0.0401, "step": 540 }, { "epoch": 3.4859437751004014, "grad_norm": 0.14613081514835358, "learning_rate": 4.4204079564707144e-05, "loss": 0.0383, "step": 541 }, { "epoch": 3.4923694779116468, "grad_norm": 0.11608117818832397, "learning_rate": 4.3857417185452644e-05, "loss": 0.0638, "step": 542 }, { "epoch": 3.4987951807228916, "grad_norm": 0.13324810564517975, "learning_rate": 4.351173725884351e-05, "loss": 0.0705, "step": 543 }, { "epoch": 3.5052208835341365, "grad_norm": 0.16427621245384216, "learning_rate": 4.3167045834016326e-05, "loss": 0.0861, "step": 544 }, { "epoch": 3.5116465863453814, "grad_norm": 0.12879817187786102, "learning_rate": 4.282334894280986e-05, "loss": 0.0532, "step": 545 }, { "epoch": 3.5180722891566267, "grad_norm": 0.13565368950366974, "learning_rate": 4.2480652599659154e-05, "loss": 0.064, "step": 546 }, { "epoch": 3.5244979919678716, "grad_norm": 0.1572374403476715, "learning_rate": 4.213896280149041e-05, "loss": 0.0627, "step": 547 }, { "epoch": 3.5309236947791165, "grad_norm": 0.1345859169960022, "learning_rate": 4.179828552761617e-05, "loss": 0.0654, "step": 548 }, { "epoch": 3.5373493975903614, "grad_norm": 0.18093015253543854, "learning_rate": 4.1458626739630526e-05, "loss": 0.0593, "step": 549 }, { "epoch": 3.5437751004016063, "grad_norm": 0.16655421257019043, "learning_rate": 4.1119992381304754e-05, "loss": 0.0524, "step": 550 }, { "epoch": 3.550200803212851, "grad_norm": 0.1454760730266571, "learning_rate": 4.078238837848352e-05, "loss": 0.0478, "step": 551 }, { "epoch": 3.5566265060240965, "grad_norm": 0.16041091084480286, "learning_rate": 4.04458206389809e-05, "loss": 0.0586, "step": 552 }, { "epoch": 3.5630522088353414, "grad_norm": 0.1313060075044632, "learning_rate": 4.011029505247732e-05, "loss": 0.0441, "step": 553 }, { "epoch": 3.5694779116465862, "grad_norm": 0.22944222390651703, "learning_rate": 3.977581749041616e-05, "loss": 0.0543, "step": 554 }, { "epoch": 3.5759036144578316, "grad_norm": 0.20055855810642242, "learning_rate": 3.9442393805901245e-05, "loss": 0.0542, "step": 555 }, { "epoch": 3.5823293172690764, "grad_norm": 0.20784057676792145, "learning_rate": 3.91100298335944e-05, "loss": 0.0474, "step": 556 }, { "epoch": 3.5887550200803213, "grad_norm": 0.1995190680027008, "learning_rate": 3.877873138961311e-05, "loss": 0.0494, "step": 557 }, { "epoch": 3.595180722891566, "grad_norm": 0.195552259683609, "learning_rate": 3.844850427142914e-05, "loss": 0.0433, "step": 558 }, { "epoch": 3.601606425702811, "grad_norm": 0.18116550147533417, "learning_rate": 3.811935425776667e-05, "loss": 0.0454, "step": 559 }, { "epoch": 3.608032128514056, "grad_norm": 0.19290290772914886, "learning_rate": 3.779128710850151e-05, "loss": 0.0428, "step": 560 }, { "epoch": 3.6144578313253013, "grad_norm": 0.21416397392749786, "learning_rate": 3.7464308564560106e-05, "loss": 0.051, "step": 561 }, { "epoch": 3.620883534136546, "grad_norm": 0.16343973577022552, "learning_rate": 3.71384243478191e-05, "loss": 0.0348, "step": 562 }, { "epoch": 3.627309236947791, "grad_norm": 0.23021504282951355, "learning_rate": 3.681364016100535e-05, "loss": 0.0395, "step": 563 }, { "epoch": 3.6337349397590364, "grad_norm": 0.17765948176383972, "learning_rate": 3.64899616875959e-05, "loss": 0.03, "step": 564 }, { "epoch": 3.6401606425702813, "grad_norm": 0.19902700185775757, "learning_rate": 3.616739459171866e-05, "loss": 0.0375, "step": 565 }, { "epoch": 3.646586345381526, "grad_norm": 0.12952236831188202, "learning_rate": 3.5845944518053376e-05, "loss": 0.0586, "step": 566 }, { "epoch": 3.653012048192771, "grad_norm": 0.1288733333349228, "learning_rate": 3.552561709173266e-05, "loss": 0.0611, "step": 567 }, { "epoch": 3.659437751004016, "grad_norm": 0.13545019924640656, "learning_rate": 3.520641791824374e-05, "loss": 0.0742, "step": 568 }, { "epoch": 3.665863453815261, "grad_norm": 0.18385440111160278, "learning_rate": 3.488835258333014e-05, "loss": 0.0468, "step": 569 }, { "epoch": 3.672289156626506, "grad_norm": 0.15589255094528198, "learning_rate": 3.4571426652894144e-05, "loss": 0.0691, "step": 570 }, { "epoch": 3.678714859437751, "grad_norm": 0.16703784465789795, "learning_rate": 3.4255645672899325e-05, "loss": 0.0707, "step": 571 }, { "epoch": 3.685140562248996, "grad_norm": 0.1708272397518158, "learning_rate": 3.3941015169273524e-05, "loss": 0.0699, "step": 572 }, { "epoch": 3.691566265060241, "grad_norm": 0.1434539556503296, "learning_rate": 3.362754064781202e-05, "loss": 0.058, "step": 573 }, { "epoch": 3.697991967871486, "grad_norm": 0.12717658281326294, "learning_rate": 3.331522759408138e-05, "loss": 0.045, "step": 574 }, { "epoch": 3.704417670682731, "grad_norm": 0.1311410516500473, "learning_rate": 3.300408147332327e-05, "loss": 0.0369, "step": 575 }, { "epoch": 3.710843373493976, "grad_norm": 0.13452477753162384, "learning_rate": 3.269410773035903e-05, "loss": 0.0382, "step": 576 }, { "epoch": 3.7172690763052207, "grad_norm": 1.2888418436050415, "learning_rate": 3.238531178949417e-05, "loss": 0.0785, "step": 577 }, { "epoch": 3.7236947791164656, "grad_norm": 0.15146106481552124, "learning_rate": 3.207769905442359e-05, "loss": 0.0726, "step": 578 }, { "epoch": 3.730120481927711, "grad_norm": 0.14550191164016724, "learning_rate": 3.177127490813706e-05, "loss": 0.0504, "step": 579 }, { "epoch": 3.736546184738956, "grad_norm": 0.1444544643163681, "learning_rate": 3.1466044712824805e-05, "loss": 0.0384, "step": 580 }, { "epoch": 3.7429718875502007, "grad_norm": 0.160190150141716, "learning_rate": 3.1162013809783955e-05, "loss": 0.0491, "step": 581 }, { "epoch": 3.749397590361446, "grad_norm": 0.1942124217748642, "learning_rate": 3.0859187519324806e-05, "loss": 0.0536, "step": 582 }, { "epoch": 3.755823293172691, "grad_norm": 0.1264030784368515, "learning_rate": 3.055757114067794e-05, "loss": 0.0319, "step": 583 }, { "epoch": 3.762248995983936, "grad_norm": 0.19579629600048065, "learning_rate": 3.025716995190141e-05, "loss": 0.0477, "step": 584 }, { "epoch": 3.7686746987951807, "grad_norm": 0.16901759803295135, "learning_rate": 2.995798920978825e-05, "loss": 0.0306, "step": 585 }, { "epoch": 3.7751004016064256, "grad_norm": 0.16561923921108246, "learning_rate": 2.966003414977475e-05, "loss": 0.0492, "step": 586 }, { "epoch": 3.7815261044176705, "grad_norm": 0.2909844219684601, "learning_rate": 2.9363309985848585e-05, "loss": 0.0502, "step": 587 }, { "epoch": 3.787951807228916, "grad_norm": 0.15100587904453278, "learning_rate": 2.9067821910457704e-05, "loss": 0.0343, "step": 588 }, { "epoch": 3.7943775100401607, "grad_norm": 0.26900508999824524, "learning_rate": 2.877357509441947e-05, "loss": 0.0544, "step": 589 }, { "epoch": 3.8008032128514055, "grad_norm": 0.23953552544116974, "learning_rate": 2.8480574686830142e-05, "loss": 0.0438, "step": 590 }, { "epoch": 3.807228915662651, "grad_norm": 0.14778892695903778, "learning_rate": 2.8188825814974795e-05, "loss": 0.0455, "step": 591 }, { "epoch": 3.8136546184738958, "grad_norm": 0.11664500087499619, "learning_rate": 2.7898333584237534e-05, "loss": 0.0634, "step": 592 }, { "epoch": 3.8200803212851406, "grad_norm": 0.10635870695114136, "learning_rate": 2.7609103078012166e-05, "loss": 0.0603, "step": 593 }, { "epoch": 3.8265060240963855, "grad_norm": 0.11662769317626953, "learning_rate": 2.7321139357613412e-05, "loss": 0.0466, "step": 594 }, { "epoch": 3.8329317269076304, "grad_norm": 0.12710097432136536, "learning_rate": 2.703444746218802e-05, "loss": 0.063, "step": 595 }, { "epoch": 3.8393574297188753, "grad_norm": 0.16935308277606964, "learning_rate": 2.6749032408626907e-05, "loss": 0.0519, "step": 596 }, { "epoch": 3.8457831325301206, "grad_norm": 0.15195302665233612, "learning_rate": 2.646489919147721e-05, "loss": 0.076, "step": 597 }, { "epoch": 3.8522088353413655, "grad_norm": 0.21082603931427002, "learning_rate": 2.6182052782854806e-05, "loss": 0.0771, "step": 598 }, { "epoch": 3.8586345381526104, "grad_norm": 0.1390170454978943, "learning_rate": 2.59004981323575e-05, "loss": 0.0673, "step": 599 }, { "epoch": 3.8650602409638557, "grad_norm": 0.11533928662538528, "learning_rate": 2.5620240166978226e-05, "loss": 0.0392, "step": 600 }, { "epoch": 3.8714859437751006, "grad_norm": 0.1865217238664627, "learning_rate": 2.5341283791018988e-05, "loss": 0.0505, "step": 601 }, { "epoch": 3.8779116465863455, "grad_norm": 0.15040706098079681, "learning_rate": 2.5063633886004935e-05, "loss": 0.0589, "step": 602 }, { "epoch": 3.8843373493975903, "grad_norm": 0.15011747181415558, "learning_rate": 2.4787295310598913e-05, "loss": 0.0607, "step": 603 }, { "epoch": 3.8907630522088352, "grad_norm": 0.15416325628757477, "learning_rate": 2.45122729005166e-05, "loss": 0.0613, "step": 604 }, { "epoch": 3.89718875502008, "grad_norm": 0.20439183712005615, "learning_rate": 2.423857146844164e-05, "loss": 0.0688, "step": 605 }, { "epoch": 3.9036144578313254, "grad_norm": 0.16020047664642334, "learning_rate": 2.3966195803941715e-05, "loss": 0.0513, "step": 606 }, { "epoch": 3.9100401606425703, "grad_norm": 0.14457987248897552, "learning_rate": 2.3695150673384437e-05, "loss": 0.0374, "step": 607 }, { "epoch": 3.916465863453815, "grad_norm": 0.12410798668861389, "learning_rate": 2.3425440819854185e-05, "loss": 0.0335, "step": 608 }, { "epoch": 3.92289156626506, "grad_norm": 0.1418788731098175, "learning_rate": 2.3157070963068984e-05, "loss": 0.0286, "step": 609 }, { "epoch": 3.9293172690763054, "grad_norm": 0.1596471220254898, "learning_rate": 2.2890045799297876e-05, "loss": 0.0321, "step": 610 }, { "epoch": 3.9357429718875503, "grad_norm": 0.19366958737373352, "learning_rate": 2.2624370001278838e-05, "loss": 0.0398, "step": 611 }, { "epoch": 3.942168674698795, "grad_norm": 0.1489342749118805, "learning_rate": 2.2360048218136985e-05, "loss": 0.0313, "step": 612 }, { "epoch": 3.94859437751004, "grad_norm": 0.2047041803598404, "learning_rate": 2.2097085075303138e-05, "loss": 0.0517, "step": 613 }, { "epoch": 3.955020080321285, "grad_norm": 0.24907703697681427, "learning_rate": 2.1835485174433002e-05, "loss": 0.051, "step": 614 }, { "epoch": 3.9614457831325303, "grad_norm": 0.21545164287090302, "learning_rate": 2.1575253093326586e-05, "loss": 0.0327, "step": 615 }, { "epoch": 3.967871485943775, "grad_norm": 0.11793698370456696, "learning_rate": 2.131639338584801e-05, "loss": 0.0516, "step": 616 }, { "epoch": 3.97429718875502, "grad_norm": 0.14671315252780914, "learning_rate": 2.1058910581846013e-05, "loss": 0.0528, "step": 617 }, { "epoch": 3.980722891566265, "grad_norm": 0.12005927413702011, "learning_rate": 2.0802809187074434e-05, "loss": 0.0404, "step": 618 }, { "epoch": 3.9871485943775102, "grad_norm": 0.16592156887054443, "learning_rate": 2.05480936831136e-05, "loss": 0.0456, "step": 619 }, { "epoch": 3.993574297188755, "grad_norm": 0.14437763392925262, "learning_rate": 2.0294768527291796e-05, "loss": 0.0292, "step": 620 }, { "epoch": 4.003212851405623, "grad_norm": 0.4867587685585022, "learning_rate": 2.004283815260717e-05, "loss": 0.0602, "step": 621 }, { "epoch": 4.009638554216868, "grad_norm": 0.11754161864519119, "learning_rate": 1.9792306967650398e-05, "loss": 0.0577, "step": 622 }, { "epoch": 4.016064257028113, "grad_norm": 0.12216556817293167, "learning_rate": 1.9543179356527252e-05, "loss": 0.0782, "step": 623 }, { "epoch": 4.0224899598393575, "grad_norm": 0.10597134381532669, "learning_rate": 1.9295459678782168e-05, "loss": 0.0623, "step": 624 }, { "epoch": 4.028915662650602, "grad_norm": 0.13430526852607727, "learning_rate": 1.904915226932169e-05, "loss": 0.0573, "step": 625 }, { "epoch": 4.035341365461847, "grad_norm": 0.14790372550487518, "learning_rate": 1.88042614383388e-05, "loss": 0.0494, "step": 626 }, { "epoch": 4.041767068273092, "grad_norm": 0.1289214938879013, "learning_rate": 1.856079147123746e-05, "loss": 0.0353, "step": 627 }, { "epoch": 4.048192771084337, "grad_norm": 0.12450610101222992, "learning_rate": 1.8318746628557526e-05, "loss": 0.0379, "step": 628 }, { "epoch": 4.054618473895582, "grad_norm": 0.17245958745479584, "learning_rate": 1.8078131145900267e-05, "loss": 0.0583, "step": 629 }, { "epoch": 4.061044176706828, "grad_norm": 0.14711426198482513, "learning_rate": 1.7838949233854284e-05, "loss": 0.0612, "step": 630 }, { "epoch": 4.067469879518073, "grad_norm": 0.16549347341060638, "learning_rate": 1.760120507792169e-05, "loss": 0.049, "step": 631 }, { "epoch": 4.0738955823293175, "grad_norm": 0.10457637906074524, "learning_rate": 1.7364902838445106e-05, "loss": 0.0337, "step": 632 }, { "epoch": 4.080321285140562, "grad_norm": 0.19436487555503845, "learning_rate": 1.713004665053457e-05, "loss": 0.0455, "step": 633 }, { "epoch": 4.086746987951807, "grad_norm": 0.14365814626216888, "learning_rate": 1.6896640623995318e-05, "loss": 0.0363, "step": 634 }, { "epoch": 4.093172690763052, "grad_norm": 0.13444365561008453, "learning_rate": 1.666468884325596e-05, "loss": 0.0382, "step": 635 }, { "epoch": 4.099598393574297, "grad_norm": 0.11268598586320877, "learning_rate": 1.6434195367296802e-05, "loss": 0.0323, "step": 636 }, { "epoch": 4.106024096385542, "grad_norm": 0.12947283685207367, "learning_rate": 1.6205164229578994e-05, "loss": 0.0281, "step": 637 }, { "epoch": 4.112449799196787, "grad_norm": 0.12646523118019104, "learning_rate": 1.5977599437973867e-05, "loss": 0.0264, "step": 638 }, { "epoch": 4.1188755020080325, "grad_norm": 0.1570567935705185, "learning_rate": 1.5751504974692733e-05, "loss": 0.0356, "step": 639 }, { "epoch": 4.125301204819277, "grad_norm": 0.1824863851070404, "learning_rate": 1.55268847962174e-05, "loss": 0.0368, "step": 640 }, { "epoch": 4.131726907630522, "grad_norm": 0.17944642901420593, "learning_rate": 1.5303742833230673e-05, "loss": 0.0275, "step": 641 }, { "epoch": 4.138152610441767, "grad_norm": 0.16530513763427734, "learning_rate": 1.5082082990547796e-05, "loss": 0.0308, "step": 642 }, { "epoch": 4.144578313253012, "grad_norm": 0.17503738403320312, "learning_rate": 1.4861909147048025e-05, "loss": 0.0232, "step": 643 }, { "epoch": 4.151004016064257, "grad_norm": 0.18777841329574585, "learning_rate": 1.464322515560671e-05, "loss": 0.032, "step": 644 }, { "epoch": 4.157429718875502, "grad_norm": 0.2501942217350006, "learning_rate": 1.4426034843027969e-05, "loss": 0.0328, "step": 645 }, { "epoch": 4.163855421686747, "grad_norm": 0.13108326494693756, "learning_rate": 1.4210342009977628e-05, "loss": 0.0393, "step": 646 }, { "epoch": 4.170281124497992, "grad_norm": 0.1045239120721817, "learning_rate": 1.3996150430916799e-05, "loss": 0.0491, "step": 647 }, { "epoch": 4.176706827309237, "grad_norm": 0.14830029010772705, "learning_rate": 1.378346385403575e-05, "loss": 0.0488, "step": 648 }, { "epoch": 4.183132530120482, "grad_norm": 0.12067140638828278, "learning_rate": 1.357228600118836e-05, "loss": 0.0431, "step": 649 }, { "epoch": 4.189558232931727, "grad_norm": 0.12366708368062973, "learning_rate": 1.3362620567827033e-05, "loss": 0.0488, "step": 650 }, { "epoch": 4.195983935742972, "grad_norm": 0.14555774629116058, "learning_rate": 1.3154471222937903e-05, "loss": 0.05, "step": 651 }, { "epoch": 4.202409638554217, "grad_norm": 0.16288283467292786, "learning_rate": 1.2947841608976718e-05, "loss": 0.0583, "step": 652 }, { "epoch": 4.208835341365462, "grad_norm": 0.14024418592453003, "learning_rate": 1.2742735341805145e-05, "loss": 0.0404, "step": 653 }, { "epoch": 4.215261044176707, "grad_norm": 0.13420836627483368, "learning_rate": 1.253915601062734e-05, "loss": 0.0587, "step": 654 }, { "epoch": 4.2216867469879515, "grad_norm": 0.1517859846353531, "learning_rate": 1.2337107177927365e-05, "loss": 0.041, "step": 655 }, { "epoch": 4.228112449799196, "grad_norm": 0.14088109135627747, "learning_rate": 1.213659237940662e-05, "loss": 0.0419, "step": 656 }, { "epoch": 4.234538152610442, "grad_norm": 0.1399402618408203, "learning_rate": 1.1937615123922052e-05, "loss": 0.0443, "step": 657 }, { "epoch": 4.240963855421687, "grad_norm": 0.15205906331539154, "learning_rate": 1.174017889342489e-05, "loss": 0.0429, "step": 658 }, { "epoch": 4.247389558232932, "grad_norm": 0.3410640358924866, "learning_rate": 1.1544287142899446e-05, "loss": 0.0367, "step": 659 }, { "epoch": 4.253815261044177, "grad_norm": 0.1357734352350235, "learning_rate": 1.1349943300302913e-05, "loss": 0.0294, "step": 660 }, { "epoch": 4.260240963855422, "grad_norm": 0.14988286793231964, "learning_rate": 1.1157150766505253e-05, "loss": 0.0384, "step": 661 }, { "epoch": 4.266666666666667, "grad_norm": 0.1004982739686966, "learning_rate": 1.0965912915229625e-05, "loss": 0.0263, "step": 662 }, { "epoch": 4.2730923694779115, "grad_norm": 0.1396017074584961, "learning_rate": 1.0776233092993527e-05, "loss": 0.028, "step": 663 }, { "epoch": 4.279518072289156, "grad_norm": 0.11266002804040909, "learning_rate": 1.0588114619050028e-05, "loss": 0.0205, "step": 664 }, { "epoch": 4.285943775100401, "grad_norm": 0.15433090925216675, "learning_rate": 1.040156078532989e-05, "loss": 0.0316, "step": 665 }, { "epoch": 4.292369477911647, "grad_norm": 0.15806183218955994, "learning_rate": 1.0216574856383742e-05, "loss": 0.0319, "step": 666 }, { "epoch": 4.298795180722892, "grad_norm": 0.184244304895401, "learning_rate": 1.0033160069325166e-05, "loss": 0.0203, "step": 667 }, { "epoch": 4.305220883534137, "grad_norm": 0.14676746726036072, "learning_rate": 9.851319633773926e-06, "loss": 0.0222, "step": 668 }, { "epoch": 4.311646586345382, "grad_norm": 0.22139282524585724, "learning_rate": 9.671056731799777e-06, "loss": 0.0349, "step": 669 }, { "epoch": 4.3180722891566266, "grad_norm": 0.16691678762435913, "learning_rate": 9.49237451786692e-06, "loss": 0.0227, "step": 670 }, { "epoch": 4.324497991967871, "grad_norm": 0.16708509624004364, "learning_rate": 9.315276118778627e-06, "loss": 0.0374, "step": 671 }, { "epoch": 4.330923694779116, "grad_norm": 0.09550745040178299, "learning_rate": 9.139764633622617e-06, "loss": 0.0511, "step": 672 }, { "epoch": 4.337349397590361, "grad_norm": 0.1416180431842804, "learning_rate": 8.965843133716933e-06, "loss": 0.0739, "step": 673 }, { "epoch": 4.343775100401606, "grad_norm": 0.15338513255119324, "learning_rate": 8.793514662555946e-06, "loss": 0.0452, "step": 674 }, { "epoch": 4.350200803212852, "grad_norm": 0.19460880756378174, "learning_rate": 8.622782235757276e-06, "loss": 0.0617, "step": 675 }, { "epoch": 4.356626506024097, "grad_norm": 0.14072959125041962, "learning_rate": 8.453648841009021e-06, "loss": 0.0435, "step": 676 }, { "epoch": 4.363052208835342, "grad_norm": 0.25675803422927856, "learning_rate": 8.286117438017337e-06, "loss": 0.0503, "step": 677 }, { "epoch": 4.3694779116465865, "grad_norm": 0.13942734897136688, "learning_rate": 8.120190958454843e-06, "loss": 0.062, "step": 678 }, { "epoch": 4.375903614457831, "grad_norm": 0.13195668160915375, "learning_rate": 7.955872305909152e-06, "loss": 0.0479, "step": 679 }, { "epoch": 4.382329317269076, "grad_norm": 0.13269154727458954, "learning_rate": 7.793164355832127e-06, "loss": 0.0414, "step": 680 }, { "epoch": 4.388755020080321, "grad_norm": 0.1273636668920517, "learning_rate": 7.632069955489585e-06, "loss": 0.0359, "step": 681 }, { "epoch": 4.395180722891566, "grad_norm": 0.14960840344429016, "learning_rate": 7.472591923911398e-06, "loss": 0.0374, "step": 682 }, { "epoch": 4.401606425702811, "grad_norm": 0.14672979712486267, "learning_rate": 7.314733051842282e-06, "loss": 0.0323, "step": 683 }, { "epoch": 4.408032128514057, "grad_norm": 0.13686661422252655, "learning_rate": 7.158496101692802e-06, "loss": 0.0314, "step": 684 }, { "epoch": 4.414457831325302, "grad_norm": 0.15936917066574097, "learning_rate": 7.003883807491185e-06, "loss": 0.0493, "step": 685 }, { "epoch": 4.4208835341365464, "grad_norm": 0.16186754405498505, "learning_rate": 6.85089887483541e-06, "loss": 0.0337, "step": 686 }, { "epoch": 4.427309236947791, "grad_norm": 0.1702742725610733, "learning_rate": 6.699543980845801e-06, "loss": 0.0306, "step": 687 }, { "epoch": 4.433734939759036, "grad_norm": 0.16083656251430511, "learning_rate": 6.549821774118325e-06, "loss": 0.0341, "step": 688 }, { "epoch": 4.440160642570281, "grad_norm": 0.11489646136760712, "learning_rate": 6.401734874678089e-06, "loss": 0.0221, "step": 689 }, { "epoch": 4.446586345381526, "grad_norm": 0.1623634397983551, "learning_rate": 6.255285873933569e-06, "loss": 0.0234, "step": 690 }, { "epoch": 4.453012048192771, "grad_norm": 0.1647336781024933, "learning_rate": 6.110477334631326e-06, "loss": 0.0305, "step": 691 }, { "epoch": 4.459437751004016, "grad_norm": 0.13188236951828003, "learning_rate": 5.967311790811014e-06, "loss": 0.0194, "step": 692 }, { "epoch": 4.4658634538152615, "grad_norm": 0.17995339632034302, "learning_rate": 5.825791747761123e-06, "loss": 0.0368, "step": 693 }, { "epoch": 4.472289156626506, "grad_norm": 0.18333207070827484, "learning_rate": 5.685919681975149e-06, "loss": 0.0303, "step": 694 }, { "epoch": 4.478714859437751, "grad_norm": 0.13925676047801971, "learning_rate": 5.547698041108229e-06, "loss": 0.0314, "step": 695 }, { "epoch": 4.485140562248996, "grad_norm": 0.10495959222316742, "learning_rate": 5.4111292439342986e-06, "loss": 0.027, "step": 696 }, { "epoch": 4.491566265060241, "grad_norm": 0.1200391873717308, "learning_rate": 5.276215680303831e-06, "loss": 0.0475, "step": 697 }, { "epoch": 4.497991967871486, "grad_norm": 0.12512274086475372, "learning_rate": 5.14295971110188e-06, "loss": 0.0513, "step": 698 }, { "epoch": 4.504417670682731, "grad_norm": 0.11158251017332077, "learning_rate": 5.011363668206948e-06, "loss": 0.049, "step": 699 }, { "epoch": 4.510843373493976, "grad_norm": 0.12815481424331665, "learning_rate": 4.881429854450004e-06, "loss": 0.0465, "step": 700 }, { "epoch": 4.517269076305221, "grad_norm": 0.1294533908367157, "learning_rate": 4.753160543574331e-06, "loss": 0.0445, "step": 701 }, { "epoch": 4.523694779116466, "grad_norm": 0.1252312958240509, "learning_rate": 4.626557980195623e-06, "loss": 0.0383, "step": 702 }, { "epoch": 4.530120481927711, "grad_norm": 0.14612750709056854, "learning_rate": 4.501624379762803e-06, "loss": 0.0567, "step": 703 }, { "epoch": 4.536546184738956, "grad_norm": 0.11176297813653946, "learning_rate": 4.3783619285191705e-06, "loss": 0.0418, "step": 704 }, { "epoch": 4.542971887550201, "grad_norm": 0.13328702747821808, "learning_rate": 4.2567727834641915e-06, "loss": 0.0316, "step": 705 }, { "epoch": 4.549397590361446, "grad_norm": 0.11925917118787766, "learning_rate": 4.136859072315758e-06, "loss": 0.0403, "step": 706 }, { "epoch": 4.555823293172691, "grad_norm": 0.14322586357593536, "learning_rate": 4.018622893472912e-06, "loss": 0.0486, "step": 707 }, { "epoch": 4.562248995983936, "grad_norm": 0.135334774851799, "learning_rate": 3.902066315979158e-06, "loss": 0.0358, "step": 708 }, { "epoch": 4.5686746987951805, "grad_norm": 0.14582079648971558, "learning_rate": 3.787191379486288e-06, "loss": 0.0322, "step": 709 }, { "epoch": 4.575100401606425, "grad_norm": 0.16357247531414032, "learning_rate": 3.674000094218577e-06, "loss": 0.0331, "step": 710 }, { "epoch": 4.581526104417671, "grad_norm": 0.1673492193222046, "learning_rate": 3.562494440937769e-06, "loss": 0.0299, "step": 711 }, { "epoch": 4.587951807228916, "grad_norm": 0.14151403307914734, "learning_rate": 3.4526763709082476e-06, "loss": 0.0256, "step": 712 }, { "epoch": 4.594377510040161, "grad_norm": 0.20980997383594513, "learning_rate": 3.344547805862985e-06, "loss": 0.0342, "step": 713 }, { "epoch": 4.600803212851406, "grad_norm": 0.12801873683929443, "learning_rate": 3.2381106379699488e-06, "loss": 0.022, "step": 714 }, { "epoch": 4.607228915662651, "grad_norm": 0.15073615312576294, "learning_rate": 3.1333667297989035e-06, "loss": 0.0179, "step": 715 }, { "epoch": 4.613654618473896, "grad_norm": 0.14964726567268372, "learning_rate": 3.030317914288816e-06, "loss": 0.022, "step": 716 }, { "epoch": 4.6200803212851405, "grad_norm": 0.11907092481851578, "learning_rate": 2.928965994715882e-06, "loss": 0.0199, "step": 717 }, { "epoch": 4.626506024096385, "grad_norm": 0.1797790676355362, "learning_rate": 2.8293127446618383e-06, "loss": 0.0241, "step": 718 }, { "epoch": 4.63293172690763, "grad_norm": 0.2542262673377991, "learning_rate": 2.7313599079830666e-06, "loss": 0.0415, "step": 719 }, { "epoch": 4.639357429718876, "grad_norm": 0.16320320963859558, "learning_rate": 2.63510919877995e-06, "loss": 0.0283, "step": 720 }, { "epoch": 4.64578313253012, "grad_norm": 0.13019272685050964, "learning_rate": 2.540562301366922e-06, "loss": 0.0467, "step": 721 }, { "epoch": 4.652208835341366, "grad_norm": 0.09419631212949753, "learning_rate": 2.447720870243064e-06, "loss": 0.0517, "step": 722 }, { "epoch": 4.658634538152611, "grad_norm": 0.2463325709104538, "learning_rate": 2.3565865300630206e-06, "loss": 0.0399, "step": 723 }, { "epoch": 4.6650602409638555, "grad_norm": 0.14761923253536224, "learning_rate": 2.267160875608687e-06, "loss": 0.0643, "step": 724 }, { "epoch": 4.6714859437751, "grad_norm": 0.15416769683361053, "learning_rate": 2.179445471761221e-06, "loss": 0.0506, "step": 725 }, { "epoch": 4.677911646586345, "grad_norm": 0.1494145393371582, "learning_rate": 2.0934418534737098e-06, "loss": 0.0553, "step": 726 }, { "epoch": 4.68433734939759, "grad_norm": 0.17155231535434723, "learning_rate": 2.0091515257442904e-06, "loss": 0.0434, "step": 727 }, { "epoch": 4.690763052208835, "grad_norm": 0.12443775683641434, "learning_rate": 1.926575963589805e-06, "loss": 0.0337, "step": 728 }, { "epoch": 4.697188755020081, "grad_norm": 0.11996244639158249, "learning_rate": 1.8457166120199987e-06, "loss": 0.0304, "step": 729 }, { "epoch": 4.703614457831325, "grad_norm": 0.17615464329719543, "learning_rate": 1.7665748860122512e-06, "loss": 0.054, "step": 730 }, { "epoch": 4.710040160642571, "grad_norm": 0.16451974213123322, "learning_rate": 1.689152170486752e-06, "loss": 0.0435, "step": 731 }, { "epoch": 4.7164658634538155, "grad_norm": 0.223603755235672, "learning_rate": 1.6134498202823645e-06, "loss": 0.0439, "step": 732 }, { "epoch": 4.72289156626506, "grad_norm": 0.14352591335773468, "learning_rate": 1.5394691601328338e-06, "loss": 0.0347, "step": 733 }, { "epoch": 4.729317269076305, "grad_norm": 0.17710843682289124, "learning_rate": 1.467211484643627e-06, "loss": 0.0383, "step": 734 }, { "epoch": 4.73574297188755, "grad_norm": 0.20051458477973938, "learning_rate": 1.3966780582693185e-06, "loss": 0.056, "step": 735 }, { "epoch": 4.742168674698795, "grad_norm": 0.1505594104528427, "learning_rate": 1.3278701152913742e-06, "loss": 0.036, "step": 736 }, { "epoch": 4.74859437751004, "grad_norm": 0.20499320328235626, "learning_rate": 1.2607888597966688e-06, "loss": 0.0438, "step": 737 }, { "epoch": 4.755020080321285, "grad_norm": 0.1615976244211197, "learning_rate": 1.195435465656325e-06, "loss": 0.0258, "step": 738 }, { "epoch": 4.76144578313253, "grad_norm": 0.13965967297554016, "learning_rate": 1.131811076505196e-06, "loss": 0.0195, "step": 739 }, { "epoch": 4.767871485943775, "grad_norm": 0.18875765800476074, "learning_rate": 1.0699168057218823e-06, "loss": 0.0339, "step": 740 }, { "epoch": 4.77429718875502, "grad_norm": 0.12984806299209595, "learning_rate": 1.0097537364091914e-06, "loss": 0.0274, "step": 741 }, { "epoch": 4.780722891566265, "grad_norm": 0.11120978742837906, "learning_rate": 9.513229213752417e-07, "loss": 0.0233, "step": 742 }, { "epoch": 4.78714859437751, "grad_norm": 0.1367001086473465, "learning_rate": 8.946253831150109e-07, "loss": 0.0301, "step": 743 }, { "epoch": 4.793574297188755, "grad_norm": 0.11858794093132019, "learning_rate": 8.396621137924388e-07, "loss": 0.0244, "step": 744 }, { "epoch": 4.8, "grad_norm": 0.1457306295633316, "learning_rate": 7.864340752230859e-07, "loss": 0.0297, "step": 745 }, { "epoch": 4.806425702811245, "grad_norm": 0.133670836687088, "learning_rate": 7.349421988572691e-07, "loss": 0.0458, "step": 746 }, { "epoch": 4.81285140562249, "grad_norm": 0.12656152248382568, "learning_rate": 6.851873857638192e-07, "loss": 0.0772, "step": 747 }, { "epoch": 4.8192771084337345, "grad_norm": 0.09214270859956741, "learning_rate": 6.371705066142264e-07, "loss": 0.0415, "step": 748 }, { "epoch": 4.82570281124498, "grad_norm": 0.1306275725364685, "learning_rate": 5.908924016674977e-07, "loss": 0.0608, "step": 749 }, { "epoch": 4.832128514056225, "grad_norm": 0.1301136314868927, "learning_rate": 5.463538807553903e-07, "loss": 0.0429, "step": 750 }, { "epoch": 4.83855421686747, "grad_norm": 0.1389213353395462, "learning_rate": 5.035557232682564e-07, "loss": 0.0525, "step": 751 }, { "epoch": 4.844979919678715, "grad_norm": 0.15371650457382202, "learning_rate": 4.624986781414098e-07, "loss": 0.0382, "step": 752 }, { "epoch": 4.85140562248996, "grad_norm": 0.13239037990570068, "learning_rate": 4.231834638420362e-07, "loss": 0.0378, "step": 753 }, { "epoch": 4.857831325301205, "grad_norm": 0.13462603092193604, "learning_rate": 3.8561076835657017e-07, "loss": 0.0574, "step": 754 }, { "epoch": 4.8642570281124495, "grad_norm": 0.13441871106624603, "learning_rate": 3.4978124917871556e-07, "loss": 0.0383, "step": 755 }, { "epoch": 4.870682730923694, "grad_norm": 0.12943735718727112, "learning_rate": 3.1569553329788836e-07, "loss": 0.0372, "step": 756 }, { "epoch": 4.877108433734939, "grad_norm": 0.13817009329795837, "learning_rate": 2.8335421718829193e-07, "loss": 0.037, "step": 757 }, { "epoch": 4.883534136546185, "grad_norm": 0.15584351122379303, "learning_rate": 2.527578667984365e-07, "loss": 0.057, "step": 758 }, { "epoch": 4.88995983935743, "grad_norm": 0.13548874855041504, "learning_rate": 2.239070175412694e-07, "loss": 0.0312, "step": 759 }, { "epoch": 4.896385542168675, "grad_norm": 0.1511905938386917, "learning_rate": 1.9680217428479364e-07, "loss": 0.0422, "step": 760 }, { "epoch": 4.90281124497992, "grad_norm": 0.1757678985595703, "learning_rate": 1.714438113431971e-07, "loss": 0.0431, "step": 761 }, { "epoch": 4.909236947791165, "grad_norm": 0.17349182069301605, "learning_rate": 1.4783237246862592e-07, "loss": 0.0352, "step": 762 }, { "epoch": 4.9156626506024095, "grad_norm": 0.14009258151054382, "learning_rate": 1.259682708433574e-07, "loss": 0.0405, "step": 763 }, { "epoch": 4.922088353413654, "grad_norm": 0.1775335967540741, "learning_rate": 1.0585188907260569e-07, "loss": 0.04, "step": 764 }, { "epoch": 4.928514056224899, "grad_norm": 0.15646418929100037, "learning_rate": 8.748357917780503e-08, "loss": 0.034, "step": 765 }, { "epoch": 4.934939759036144, "grad_norm": 0.1268489807844162, "learning_rate": 7.086366259044796e-08, "loss": 0.0199, "step": 766 }, { "epoch": 4.94136546184739, "grad_norm": 0.17802543938159943, "learning_rate": 5.59924301464898e-08, "loss": 0.0365, "step": 767 }, { "epoch": 4.947791164658635, "grad_norm": 0.18624529242515564, "learning_rate": 4.287014208120832e-08, "loss": 0.0319, "step": 768 }, { "epoch": 4.95421686746988, "grad_norm": 0.1787281632423401, "learning_rate": 3.149702802470733e-08, "loss": 0.0216, "step": 769 }, { "epoch": 4.9606425702811245, "grad_norm": 0.19038884341716766, "learning_rate": 2.1873286997875498e-08, "loss": 0.0341, "step": 770 }, { "epoch": 4.967068273092369, "grad_norm": 0.15438446402549744, "learning_rate": 1.3999087408866906e-08, "loss": 0.0378, "step": 771 }, { "epoch": 4.973493975903614, "grad_norm": 0.14491575956344604, "learning_rate": 7.874567050214499e-09, "loss": 0.0398, "step": 772 }, { "epoch": 4.979919678714859, "grad_norm": 0.18205036222934723, "learning_rate": 3.4998330963764705e-09, "loss": 0.0426, "step": 773 }, { "epoch": 4.986345381526104, "grad_norm": 0.11920995265245438, "learning_rate": 8.749621018822041e-10, "loss": 0.027, "step": 774 }, { "epoch": 4.992771084337349, "grad_norm": 0.15572018921375275, "learning_rate": 0.0, "loss": 0.0339, "step": 775 } ], "logging_steps": 1, "max_steps": 775, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3110452392841708e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }