| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.7316017316017316, | |
| "eval_steps": 500, | |
| "global_step": 600, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.01443001443001443, | |
| "grad_norm": 56.83119360154837, | |
| "learning_rate": 4.9997137491585e-05, | |
| "loss": 1.3624, | |
| "num_input_tokens_seen": 359024, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.02886002886002886, | |
| "grad_norm": 3.369976030864084, | |
| "learning_rate": 4.9988550621856334e-05, | |
| "loss": 0.4676, | |
| "num_input_tokens_seen": 704936, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.04329004329004329, | |
| "grad_norm": 4.096562689130303, | |
| "learning_rate": 4.997424135721297e-05, | |
| "loss": 0.2693, | |
| "num_input_tokens_seen": 1054072, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.05772005772005772, | |
| "grad_norm": 3.267048245468216, | |
| "learning_rate": 4.9954212974486133e-05, | |
| "loss": 0.1972, | |
| "num_input_tokens_seen": 1407008, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.07215007215007214, | |
| "grad_norm": 1.4062210868604832, | |
| "learning_rate": 4.9928470060188954e-05, | |
| "loss": 0.1583, | |
| "num_input_tokens_seen": 1758688, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.08658008658008658, | |
| "grad_norm": 1.435503762940731, | |
| "learning_rate": 4.989701850946613e-05, | |
| "loss": 0.3325, | |
| "num_input_tokens_seen": 2115360, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.10101010101010101, | |
| "grad_norm": 1.4056756129974017, | |
| "learning_rate": 4.985986552474396e-05, | |
| "loss": 0.2568, | |
| "num_input_tokens_seen": 2465168, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.11544011544011544, | |
| "grad_norm": 2.4297584032149038, | |
| "learning_rate": 4.9817019614080956e-05, | |
| "loss": 0.2166, | |
| "num_input_tokens_seen": 2824680, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.12987012987012986, | |
| "grad_norm": 2.079558907831912, | |
| "learning_rate": 4.97684905892195e-05, | |
| "loss": 0.1564, | |
| "num_input_tokens_seen": 3186688, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.1443001443001443, | |
| "grad_norm": 4.289559037743566, | |
| "learning_rate": 4.9714289563338956e-05, | |
| "loss": 0.2399, | |
| "num_input_tokens_seen": 3539368, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.15873015873015872, | |
| "grad_norm": 1.709391455489484, | |
| "learning_rate": 4.9654428948510733e-05, | |
| "loss": 0.1786, | |
| "num_input_tokens_seen": 3892272, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.17316017316017315, | |
| "grad_norm": 4.105516922389285, | |
| "learning_rate": 4.9588922452855935e-05, | |
| "loss": 0.1634, | |
| "num_input_tokens_seen": 4247888, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.18759018759018758, | |
| "grad_norm": 14.337965174289707, | |
| "learning_rate": 4.9517785077406154e-05, | |
| "loss": 0.2301, | |
| "num_input_tokens_seen": 4600504, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.20202020202020202, | |
| "grad_norm": 3.4495197481454194, | |
| "learning_rate": 4.9441033112668264e-05, | |
| "loss": 0.1836, | |
| "num_input_tokens_seen": 4954360, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.21645021645021645, | |
| "grad_norm": 2.9433687548388106, | |
| "learning_rate": 4.9358684134893875e-05, | |
| "loss": 0.2348, | |
| "num_input_tokens_seen": 5307224, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.23088023088023088, | |
| "grad_norm": 1.9991837308587015, | |
| "learning_rate": 4.927075700205431e-05, | |
| "loss": 0.1776, | |
| "num_input_tokens_seen": 5665880, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.2453102453102453, | |
| "grad_norm": 1.0758975822927606, | |
| "learning_rate": 4.917727184952219e-05, | |
| "loss": 0.153, | |
| "num_input_tokens_seen": 6013968, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.2597402597402597, | |
| "grad_norm": 0.9976424589406766, | |
| "learning_rate": 4.9078250085460384e-05, | |
| "loss": 0.1538, | |
| "num_input_tokens_seen": 6362696, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.2741702741702742, | |
| "grad_norm": 1.554715189619398, | |
| "learning_rate": 4.897371438591952e-05, | |
| "loss": 0.1166, | |
| "num_input_tokens_seen": 6707576, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.2886002886002886, | |
| "grad_norm": 2.122029208713052, | |
| "learning_rate": 4.8863688689645164e-05, | |
| "loss": 0.1719, | |
| "num_input_tokens_seen": 7056720, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.30303030303030304, | |
| "grad_norm": 1.214256916538219, | |
| "learning_rate": 4.874819819259584e-05, | |
| "loss": 0.1858, | |
| "num_input_tokens_seen": 7412576, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.31746031746031744, | |
| "grad_norm": 1.3100116612480939, | |
| "learning_rate": 4.862726934217311e-05, | |
| "loss": 0.1949, | |
| "num_input_tokens_seen": 7772560, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.3318903318903319, | |
| "grad_norm": 1.1161659945835543, | |
| "learning_rate": 4.850092983116514e-05, | |
| "loss": 0.1788, | |
| "num_input_tokens_seen": 8131176, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.3463203463203463, | |
| "grad_norm": 1.0722413071734969, | |
| "learning_rate": 4.8369208591404997e-05, | |
| "loss": 0.1625, | |
| "num_input_tokens_seen": 8485328, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.36075036075036077, | |
| "grad_norm": 1.2139674135231018, | |
| "learning_rate": 4.823213578714526e-05, | |
| "loss": 0.1156, | |
| "num_input_tokens_seen": 8833696, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.37518037518037517, | |
| "grad_norm": 1.6739499644681717, | |
| "learning_rate": 4.8089742808150384e-05, | |
| "loss": 0.172, | |
| "num_input_tokens_seen": 9184616, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.38961038961038963, | |
| "grad_norm": 1.483447317449199, | |
| "learning_rate": 4.7942062262508425e-05, | |
| "loss": 0.1966, | |
| "num_input_tokens_seen": 9539992, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.40404040404040403, | |
| "grad_norm": 1.1509455037627738, | |
| "learning_rate": 4.778912796916374e-05, | |
| "loss": 0.1628, | |
| "num_input_tokens_seen": 9887200, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.4184704184704185, | |
| "grad_norm": 1.420110660393153, | |
| "learning_rate": 4.763097495017247e-05, | |
| "loss": 0.1336, | |
| "num_input_tokens_seen": 10242808, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.4329004329004329, | |
| "grad_norm": 1.4519100138720278, | |
| "learning_rate": 4.746763942268243e-05, | |
| "loss": 0.1703, | |
| "num_input_tokens_seen": 10594344, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.44733044733044736, | |
| "grad_norm": 1.303306860048612, | |
| "learning_rate": 4.7299158790639365e-05, | |
| "loss": 0.1553, | |
| "num_input_tokens_seen": 10948808, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.46176046176046176, | |
| "grad_norm": 0.834125896322133, | |
| "learning_rate": 4.712557163622145e-05, | |
| "loss": 0.1514, | |
| "num_input_tokens_seen": 11307176, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.47619047619047616, | |
| "grad_norm": 1.090377119591504, | |
| "learning_rate": 4.694691771100389e-05, | |
| "loss": 0.1689, | |
| "num_input_tokens_seen": 11664048, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.4906204906204906, | |
| "grad_norm": 1.1504944334378613, | |
| "learning_rate": 4.676323792685584e-05, | |
| "loss": 0.1943, | |
| "num_input_tokens_seen": 12024008, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.5050505050505051, | |
| "grad_norm": 1.5052046184655268, | |
| "learning_rate": 4.657457434657152e-05, | |
| "loss": 0.1416, | |
| "num_input_tokens_seen": 12374176, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.5194805194805194, | |
| "grad_norm": 1.250782472648046, | |
| "learning_rate": 4.638097017423783e-05, | |
| "loss": 0.1572, | |
| "num_input_tokens_seen": 12726528, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.5339105339105339, | |
| "grad_norm": 1.4846786443672924, | |
| "learning_rate": 4.618246974534055e-05, | |
| "loss": 0.1752, | |
| "num_input_tokens_seen": 13092552, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.5483405483405484, | |
| "grad_norm": 1.209336870267204, | |
| "learning_rate": 4.597911851661155e-05, | |
| "loss": 0.2137, | |
| "num_input_tokens_seen": 13450656, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.5627705627705628, | |
| "grad_norm": 0.900006892425402, | |
| "learning_rate": 4.5770963055619095e-05, | |
| "loss": 0.1534, | |
| "num_input_tokens_seen": 13801680, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.5772005772005772, | |
| "grad_norm": 1.7634935350790797, | |
| "learning_rate": 4.5558051030103876e-05, | |
| "loss": 0.1604, | |
| "num_input_tokens_seen": 14153496, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.5916305916305916, | |
| "grad_norm": 1.3464012143723911, | |
| "learning_rate": 4.5340431197063084e-05, | |
| "loss": 0.1793, | |
| "num_input_tokens_seen": 14510352, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.6060606060606061, | |
| "grad_norm": 0.8869022258852858, | |
| "learning_rate": 4.5118153391584974e-05, | |
| "loss": 0.1541, | |
| "num_input_tokens_seen": 14859280, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.6204906204906205, | |
| "grad_norm": 1.0128792509826028, | |
| "learning_rate": 4.489126851543664e-05, | |
| "loss": 0.1612, | |
| "num_input_tokens_seen": 15220952, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.6349206349206349, | |
| "grad_norm": 1.7855902267859547, | |
| "learning_rate": 4.465982852540747e-05, | |
| "loss": 0.2029, | |
| "num_input_tokens_seen": 15585584, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.6493506493506493, | |
| "grad_norm": 1.1762565216888077, | |
| "learning_rate": 4.442388642141097e-05, | |
| "loss": 0.1213, | |
| "num_input_tokens_seen": 15932344, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.6637806637806638, | |
| "grad_norm": 1.5774565711682704, | |
| "learning_rate": 4.4183496234347796e-05, | |
| "loss": 0.1808, | |
| "num_input_tokens_seen": 16288200, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.6782106782106783, | |
| "grad_norm": 1.4243380964648475, | |
| "learning_rate": 4.393871301373262e-05, | |
| "loss": 0.1502, | |
| "num_input_tokens_seen": 16637448, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.6926406926406926, | |
| "grad_norm": 0.9512374605634504, | |
| "learning_rate": 4.3689592815087764e-05, | |
| "loss": 0.1557, | |
| "num_input_tokens_seen": 16992200, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.7070707070707071, | |
| "grad_norm": 1.3279436403523264, | |
| "learning_rate": 4.3436192687106406e-05, | |
| "loss": 0.1607, | |
| "num_input_tokens_seen": 17347112, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.7215007215007215, | |
| "grad_norm": 1.750549734106104, | |
| "learning_rate": 4.317857065858844e-05, | |
| "loss": 0.2099, | |
| "num_input_tokens_seen": 17699392, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.7359307359307359, | |
| "grad_norm": 1.1251441881988402, | |
| "learning_rate": 4.291678572515184e-05, | |
| "loss": 0.1543, | |
| "num_input_tokens_seen": 18056608, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.7503607503607503, | |
| "grad_norm": 1.0416765811260265, | |
| "learning_rate": 4.26508978357226e-05, | |
| "loss": 0.1784, | |
| "num_input_tokens_seen": 18411256, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.7647907647907648, | |
| "grad_norm": 1.201198812934987, | |
| "learning_rate": 4.238096787880638e-05, | |
| "loss": 0.1857, | |
| "num_input_tokens_seen": 18767664, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.7792207792207793, | |
| "grad_norm": 1.4819563873601835, | |
| "learning_rate": 4.2107057668545044e-05, | |
| "loss": 0.136, | |
| "num_input_tokens_seen": 19132320, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.7936507936507936, | |
| "grad_norm": 1.2547051865192014, | |
| "learning_rate": 4.182922993056113e-05, | |
| "loss": 0.1058, | |
| "num_input_tokens_seen": 19488160, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.8080808080808081, | |
| "grad_norm": 1.5166739134010474, | |
| "learning_rate": 4.154754828759368e-05, | |
| "loss": 0.1823, | |
| "num_input_tokens_seen": 19844064, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.8225108225108225, | |
| "grad_norm": 1.1491639114248267, | |
| "learning_rate": 4.126207724492855e-05, | |
| "loss": 0.1587, | |
| "num_input_tokens_seen": 20200488, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.836940836940837, | |
| "grad_norm": 1.797485180499581, | |
| "learning_rate": 4.097288217562669e-05, | |
| "loss": 0.203, | |
| "num_input_tokens_seen": 20557248, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.8513708513708513, | |
| "grad_norm": 1.929792036515502, | |
| "learning_rate": 4.0680029305553674e-05, | |
| "loss": 0.2322, | |
| "num_input_tokens_seen": 20921800, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.8658008658008658, | |
| "grad_norm": 0.7667283264695735, | |
| "learning_rate": 4.0383585698213876e-05, | |
| "loss": 0.1355, | |
| "num_input_tokens_seen": 21269448, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.8802308802308803, | |
| "grad_norm": 0.729775915381155, | |
| "learning_rate": 4.008361923939295e-05, | |
| "loss": 0.1873, | |
| "num_input_tokens_seen": 21625040, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.8946608946608947, | |
| "grad_norm": 1.2721263119411592, | |
| "learning_rate": 3.978019862161191e-05, | |
| "loss": 0.2325, | |
| "num_input_tokens_seen": 21973600, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.9090909090909091, | |
| "grad_norm": 1.40284206796357, | |
| "learning_rate": 3.9473393328396484e-05, | |
| "loss": 0.1754, | |
| "num_input_tokens_seen": 22327832, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.9235209235209235, | |
| "grad_norm": 1.4456006541134594, | |
| "learning_rate": 3.916327361836536e-05, | |
| "loss": 0.1967, | |
| "num_input_tokens_seen": 22686432, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.937950937950938, | |
| "grad_norm": 0.5527227312593487, | |
| "learning_rate": 3.884991050914091e-05, | |
| "loss": 0.1457, | |
| "num_input_tokens_seen": 23043784, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.9523809523809523, | |
| "grad_norm": 1.3930212264797546, | |
| "learning_rate": 3.85333757610861e-05, | |
| "loss": 0.2194, | |
| "num_input_tokens_seen": 23411560, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.9668109668109668, | |
| "grad_norm": 1.4476303074289294, | |
| "learning_rate": 3.821374186087133e-05, | |
| "loss": 0.1148, | |
| "num_input_tokens_seen": 23765000, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.9812409812409812, | |
| "grad_norm": 3.292955863226407, | |
| "learning_rate": 3.789108200487493e-05, | |
| "loss": 0.1348, | |
| "num_input_tokens_seen": 24119024, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.9956709956709957, | |
| "grad_norm": 1.1327523117828926, | |
| "learning_rate": 3.756547008242112e-05, | |
| "loss": 0.1762, | |
| "num_input_tokens_seen": 24475120, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 1.0101010101010102, | |
| "grad_norm": 0.6731553914954855, | |
| "learning_rate": 3.723698065885936e-05, | |
| "loss": 0.0941, | |
| "num_input_tokens_seen": 24834408, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.0245310245310246, | |
| "grad_norm": 0.9750510929970303, | |
| "learning_rate": 3.690568895848879e-05, | |
| "loss": 0.0694, | |
| "num_input_tokens_seen": 25195312, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 1.0389610389610389, | |
| "grad_norm": 0.6125336557821428, | |
| "learning_rate": 3.65716708473318e-05, | |
| "loss": 0.0736, | |
| "num_input_tokens_seen": 25555472, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.0533910533910533, | |
| "grad_norm": 1.1303634424790558, | |
| "learning_rate": 3.623500281576073e-05, | |
| "loss": 0.054, | |
| "num_input_tokens_seen": 25907632, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 1.0678210678210678, | |
| "grad_norm": 0.8264622226623303, | |
| "learning_rate": 3.589576196098142e-05, | |
| "loss": 0.0555, | |
| "num_input_tokens_seen": 26255856, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.0822510822510822, | |
| "grad_norm": 0.7804657972204446, | |
| "learning_rate": 3.5554025969378034e-05, | |
| "loss": 0.0781, | |
| "num_input_tokens_seen": 26614912, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.0966810966810967, | |
| "grad_norm": 0.6498854003200126, | |
| "learning_rate": 3.520987309872269e-05, | |
| "loss": 0.0633, | |
| "num_input_tokens_seen": 26973272, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.1111111111111112, | |
| "grad_norm": 1.3530620649043212, | |
| "learning_rate": 3.486338216025444e-05, | |
| "loss": 0.0626, | |
| "num_input_tokens_seen": 27333584, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 1.1255411255411256, | |
| "grad_norm": 0.8465897427898971, | |
| "learning_rate": 3.451463250063146e-05, | |
| "loss": 0.0583, | |
| "num_input_tokens_seen": 27686384, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.13997113997114, | |
| "grad_norm": 0.9339277337141088, | |
| "learning_rate": 3.416370398376057e-05, | |
| "loss": 0.0902, | |
| "num_input_tokens_seen": 28042656, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 1.1544011544011543, | |
| "grad_norm": 0.6813215436255746, | |
| "learning_rate": 3.38106769725084e-05, | |
| "loss": 0.0629, | |
| "num_input_tokens_seen": 28395936, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.1688311688311688, | |
| "grad_norm": 0.6152635426287013, | |
| "learning_rate": 3.345563231029818e-05, | |
| "loss": 0.0792, | |
| "num_input_tokens_seen": 28752264, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 1.1832611832611832, | |
| "grad_norm": 0.5791814399404469, | |
| "learning_rate": 3.309865130259656e-05, | |
| "loss": 0.0538, | |
| "num_input_tokens_seen": 29104512, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.1976911976911977, | |
| "grad_norm": 1.227354622086928, | |
| "learning_rate": 3.2739815698294635e-05, | |
| "loss": 0.0806, | |
| "num_input_tokens_seen": 29460048, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 1.2121212121212122, | |
| "grad_norm": 1.014705815120655, | |
| "learning_rate": 3.237920767098735e-05, | |
| "loss": 0.0654, | |
| "num_input_tokens_seen": 29815240, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.2265512265512266, | |
| "grad_norm": 0.6935986036942643, | |
| "learning_rate": 3.201690980015572e-05, | |
| "loss": 0.0631, | |
| "num_input_tokens_seen": 30168648, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 1.240981240981241, | |
| "grad_norm": 0.5742221282988151, | |
| "learning_rate": 3.165300505225608e-05, | |
| "loss": 0.0454, | |
| "num_input_tokens_seen": 30515984, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.2554112554112553, | |
| "grad_norm": 0.8521717779753476, | |
| "learning_rate": 3.128757676172065e-05, | |
| "loss": 0.0435, | |
| "num_input_tokens_seen": 30856848, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 1.2698412698412698, | |
| "grad_norm": 0.6676462028746246, | |
| "learning_rate": 3.092070861187401e-05, | |
| "loss": 0.079, | |
| "num_input_tokens_seen": 31210856, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.2842712842712842, | |
| "grad_norm": 0.4953272050872759, | |
| "learning_rate": 3.0552484615769404e-05, | |
| "loss": 0.0551, | |
| "num_input_tokens_seen": 31565760, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 1.2987012987012987, | |
| "grad_norm": 0.8296764277086711, | |
| "learning_rate": 3.018298909694986e-05, | |
| "loss": 0.0607, | |
| "num_input_tokens_seen": 31920664, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.3131313131313131, | |
| "grad_norm": 0.7341929187486326, | |
| "learning_rate": 2.9812306670137928e-05, | |
| "loss": 0.0683, | |
| "num_input_tokens_seen": 32277696, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 1.3275613275613276, | |
| "grad_norm": 0.5799627106422043, | |
| "learning_rate": 2.9440522221858885e-05, | |
| "loss": 0.0672, | |
| "num_input_tokens_seen": 32629688, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.341991341991342, | |
| "grad_norm": 0.892667216375801, | |
| "learning_rate": 2.9067720891001676e-05, | |
| "loss": 0.0675, | |
| "num_input_tokens_seen": 32979664, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 1.3564213564213565, | |
| "grad_norm": 0.3708623827189489, | |
| "learning_rate": 2.869398804932204e-05, | |
| "loss": 0.0673, | |
| "num_input_tokens_seen": 33336624, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.370851370851371, | |
| "grad_norm": 0.7639296039850831, | |
| "learning_rate": 2.8319409281892307e-05, | |
| "loss": 0.0843, | |
| "num_input_tokens_seen": 33698032, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 1.3852813852813852, | |
| "grad_norm": 0.659221228832128, | |
| "learning_rate": 2.7944070367502402e-05, | |
| "loss": 0.0438, | |
| "num_input_tokens_seen": 34043384, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.3997113997113997, | |
| "grad_norm": 0.6103194296481118, | |
| "learning_rate": 2.7568057259016384e-05, | |
| "loss": 0.0568, | |
| "num_input_tokens_seen": 34400944, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 1.4141414141414141, | |
| "grad_norm": 0.5955688127258445, | |
| "learning_rate": 2.7191456063689236e-05, | |
| "loss": 0.0673, | |
| "num_input_tokens_seen": 34763888, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.4285714285714286, | |
| "grad_norm": 0.7048448509220415, | |
| "learning_rate": 2.6814353023448213e-05, | |
| "loss": 0.0712, | |
| "num_input_tokens_seen": 35122880, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 1.443001443001443, | |
| "grad_norm": 0.8954659143802416, | |
| "learning_rate": 2.6436834495143396e-05, | |
| "loss": 0.0672, | |
| "num_input_tokens_seen": 35476128, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.4574314574314573, | |
| "grad_norm": 0.5357540884810665, | |
| "learning_rate": 2.6058986930771923e-05, | |
| "loss": 0.0697, | |
| "num_input_tokens_seen": 35826824, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 1.4718614718614718, | |
| "grad_norm": 0.6403871525105113, | |
| "learning_rate": 2.568089685768038e-05, | |
| "loss": 0.075, | |
| "num_input_tokens_seen": 36176528, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.4862914862914862, | |
| "grad_norm": 0.6086257743807054, | |
| "learning_rate": 2.530265085875005e-05, | |
| "loss": 0.0583, | |
| "num_input_tokens_seen": 36531584, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 1.5007215007215007, | |
| "grad_norm": 0.7284156072158536, | |
| "learning_rate": 2.492433555256933e-05, | |
| "loss": 0.0887, | |
| "num_input_tokens_seen": 36887632, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.5151515151515151, | |
| "grad_norm": 0.5833690078341504, | |
| "learning_rate": 2.4546037573598003e-05, | |
| "loss": 0.0697, | |
| "num_input_tokens_seen": 37237360, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 1.5295815295815296, | |
| "grad_norm": 1.068934721386313, | |
| "learning_rate": 2.4167843552327932e-05, | |
| "loss": 0.0633, | |
| "num_input_tokens_seen": 37594456, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.544011544011544, | |
| "grad_norm": 0.6914421570316827, | |
| "learning_rate": 2.3789840095444584e-05, | |
| "loss": 0.0831, | |
| "num_input_tokens_seen": 37943432, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 1.5584415584415585, | |
| "grad_norm": 0.5411649106235956, | |
| "learning_rate": 2.341211376599406e-05, | |
| "loss": 0.0896, | |
| "num_input_tokens_seen": 38309480, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.572871572871573, | |
| "grad_norm": 0.7808054192274716, | |
| "learning_rate": 2.303475106356009e-05, | |
| "loss": 0.075, | |
| "num_input_tokens_seen": 38670552, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 1.5873015873015874, | |
| "grad_norm": 0.5377374336741765, | |
| "learning_rate": 2.265783840445557e-05, | |
| "loss": 0.0661, | |
| "num_input_tokens_seen": 39022944, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.601731601731602, | |
| "grad_norm": 0.37966039726527356, | |
| "learning_rate": 2.2281462101933174e-05, | |
| "loss": 0.0525, | |
| "num_input_tokens_seen": 39370928, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 1.6161616161616161, | |
| "grad_norm": 1.6803346686839633, | |
| "learning_rate": 2.1905708346419553e-05, | |
| "loss": 0.0755, | |
| "num_input_tokens_seen": 39717904, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.6305916305916306, | |
| "grad_norm": 0.5133393164983202, | |
| "learning_rate": 2.1530663185777686e-05, | |
| "loss": 0.0522, | |
| "num_input_tokens_seen": 40067856, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 1.645021645021645, | |
| "grad_norm": 0.7107107176574299, | |
| "learning_rate": 2.115641250560183e-05, | |
| "loss": 0.063, | |
| "num_input_tokens_seen": 40420928, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.6594516594516593, | |
| "grad_norm": 0.37375269780433457, | |
| "learning_rate": 2.0783042009549696e-05, | |
| "loss": 0.0572, | |
| "num_input_tokens_seen": 40775672, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 1.6738816738816737, | |
| "grad_norm": 0.4542968746133499, | |
| "learning_rate": 2.0410637199716236e-05, | |
| "loss": 0.0664, | |
| "num_input_tokens_seen": 41132536, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.6883116883116882, | |
| "grad_norm": 1.6546823865399398, | |
| "learning_rate": 2.00392833570536e-05, | |
| "loss": 0.0563, | |
| "num_input_tokens_seen": 41492840, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 1.7027417027417027, | |
| "grad_norm": 0.7762350084544962, | |
| "learning_rate": 1.9669065521841758e-05, | |
| "loss": 0.0754, | |
| "num_input_tokens_seen": 41849832, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.7171717171717171, | |
| "grad_norm": 0.5851162333943368, | |
| "learning_rate": 1.9300068474214195e-05, | |
| "loss": 0.0677, | |
| "num_input_tokens_seen": 42201136, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 1.7316017316017316, | |
| "grad_norm": 0.9931889138260699, | |
| "learning_rate": 1.8932376714743236e-05, | |
| "loss": 0.0818, | |
| "num_input_tokens_seen": 42558776, | |
| "step": 600 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1038, | |
| "num_input_tokens_seen": 42558776, | |
| "num_train_epochs": 3, | |
| "save_steps": 300, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 67969436221440.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |