| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.070711128967457, |
| "eval_steps": 500, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0021427614838623278, |
| "grad_norm": 64.89540762075667, |
| "learning_rate": 5e-06, |
| "loss": 3.2563, |
| "num_input_tokens_seen": 1048576, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.0042855229677246556, |
| "grad_norm": 64.53587667772443, |
| "learning_rate": 1e-05, |
| "loss": 3.2442, |
| "num_input_tokens_seen": 2097152, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.0064282844515869825, |
| "grad_norm": 45.91705534890451, |
| "learning_rate": 1.5e-05, |
| "loss": 2.7435, |
| "num_input_tokens_seen": 3145728, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.008571045935449311, |
| "grad_norm": 9.616577532098649, |
| "learning_rate": 2e-05, |
| "loss": 2.0932, |
| "num_input_tokens_seen": 4194304, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.010713807419311638, |
| "grad_norm": 22.677650894260427, |
| "learning_rate": 2.5e-05, |
| "loss": 2.1313, |
| "num_input_tokens_seen": 5242880, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.012856568903173965, |
| "grad_norm": 16.200400277863025, |
| "learning_rate": 3e-05, |
| "loss": 2.1563, |
| "num_input_tokens_seen": 6291456, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.014999330387036294, |
| "grad_norm": 7.7723602177379725, |
| "learning_rate": 3.5e-05, |
| "loss": 1.9378, |
| "num_input_tokens_seen": 7340032, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.017142091870898622, |
| "grad_norm": 8.349008010722175, |
| "learning_rate": 4e-05, |
| "loss": 1.8095, |
| "num_input_tokens_seen": 8388608, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.019284853354760947, |
| "grad_norm": 4.24057283338546, |
| "learning_rate": 4.5e-05, |
| "loss": 1.6948, |
| "num_input_tokens_seen": 9437184, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.021427614838623276, |
| "grad_norm": 9.738414333035731, |
| "learning_rate": 5e-05, |
| "loss": 1.7145, |
| "num_input_tokens_seen": 10485760, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.023570376322485605, |
| "grad_norm": 9.427464720180852, |
| "learning_rate": 4.999999429436697e-05, |
| "loss": 1.7124, |
| "num_input_tokens_seen": 11534336, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.02571313780634793, |
| "grad_norm": 3.2252035671130743, |
| "learning_rate": 4.9999977177470465e-05, |
| "loss": 1.6181, |
| "num_input_tokens_seen": 12582912, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.02785589929021026, |
| "grad_norm": 5.389002593456943, |
| "learning_rate": 4.999994864931831e-05, |
| "loss": 1.5381, |
| "num_input_tokens_seen": 13631488, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.029998660774072587, |
| "grad_norm": 2.93969979997987, |
| "learning_rate": 4.999990870992352e-05, |
| "loss": 1.532, |
| "num_input_tokens_seen": 14680064, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.03214142225793491, |
| "grad_norm": 3.0591292630760933, |
| "learning_rate": 4.999985735930432e-05, |
| "loss": 1.4952, |
| "num_input_tokens_seen": 15728640, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.034284183741797244, |
| "grad_norm": 3.324378449482722, |
| "learning_rate": 4.9999794597484165e-05, |
| "loss": 1.4567, |
| "num_input_tokens_seen": 16777216, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.03642694522565957, |
| "grad_norm": 4.561130115369689, |
| "learning_rate": 4.999972042449169e-05, |
| "loss": 1.4686, |
| "num_input_tokens_seen": 17825792, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.038569706709521895, |
| "grad_norm": 1.780527582253664, |
| "learning_rate": 4.9999634840360755e-05, |
| "loss": 1.4052, |
| "num_input_tokens_seen": 18874368, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.04071246819338423, |
| "grad_norm": 3.117995934114996, |
| "learning_rate": 4.9999537845130426e-05, |
| "loss": 1.4083, |
| "num_input_tokens_seen": 19922944, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.04285522967724655, |
| "grad_norm": 2.848287146164459, |
| "learning_rate": 4.999942943884498e-05, |
| "loss": 1.3887, |
| "num_input_tokens_seen": 20971520, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.04499799116110888, |
| "grad_norm": 1.69625375895056, |
| "learning_rate": 4.9999309621553894e-05, |
| "loss": 1.349, |
| "num_input_tokens_seen": 22020096, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.04714075264497121, |
| "grad_norm": 2.567244377686529, |
| "learning_rate": 4.9999178393311855e-05, |
| "loss": 1.3423, |
| "num_input_tokens_seen": 23068672, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.049283514128833535, |
| "grad_norm": 1.7526016889237623, |
| "learning_rate": 4.999903575417877e-05, |
| "loss": 1.3301, |
| "num_input_tokens_seen": 24117248, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.05142627561269586, |
| "grad_norm": 2.1556250824756282, |
| "learning_rate": 4.9998881704219745e-05, |
| "loss": 1.3152, |
| "num_input_tokens_seen": 25165824, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.05356903709655819, |
| "grad_norm": 1.8871936642830933, |
| "learning_rate": 4.9998716243505096e-05, |
| "loss": 1.304, |
| "num_input_tokens_seen": 26214400, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.05571179858042052, |
| "grad_norm": 1.674338621481819, |
| "learning_rate": 4.999853937211034e-05, |
| "loss": 1.2796, |
| "num_input_tokens_seen": 27262976, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.05785456006428284, |
| "grad_norm": 1.761831320598704, |
| "learning_rate": 4.9998351090116226e-05, |
| "loss": 1.2732, |
| "num_input_tokens_seen": 28311552, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.059997321548145174, |
| "grad_norm": 1.7061574034058262, |
| "learning_rate": 4.9998151397608674e-05, |
| "loss": 1.2686, |
| "num_input_tokens_seen": 29360128, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.0621400830320075, |
| "grad_norm": 1.5863747354870246, |
| "learning_rate": 4.999794029467886e-05, |
| "loss": 1.2613, |
| "num_input_tokens_seen": 30408704, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.06428284451586982, |
| "grad_norm": 1.7274454226000222, |
| "learning_rate": 4.9997717781423114e-05, |
| "loss": 1.2526, |
| "num_input_tokens_seen": 31457280, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.06642560599973216, |
| "grad_norm": 1.4317285831126387, |
| "learning_rate": 4.999748385794302e-05, |
| "loss": 1.2329, |
| "num_input_tokens_seen": 32505856, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.06856836748359449, |
| "grad_norm": 1.8999621450491984, |
| "learning_rate": 4.999723852434535e-05, |
| "loss": 1.2436, |
| "num_input_tokens_seen": 33554432, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.07071112896745681, |
| "grad_norm": 1.4448128803947724, |
| "learning_rate": 4.999698178074209e-05, |
| "loss": 1.2355, |
| "num_input_tokens_seen": 34603008, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.07285389045131914, |
| "grad_norm": 2.144552654239913, |
| "learning_rate": 4.9996713627250426e-05, |
| "loss": 1.2217, |
| "num_input_tokens_seen": 35651584, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.07499665193518147, |
| "grad_norm": 1.1224127832608906, |
| "learning_rate": 4.999643406399275e-05, |
| "loss": 1.2163, |
| "num_input_tokens_seen": 36700160, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.07713941341904379, |
| "grad_norm": 2.0366823883396057, |
| "learning_rate": 4.9996143091096684e-05, |
| "loss": 1.2142, |
| "num_input_tokens_seen": 37748736, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.07928217490290612, |
| "grad_norm": 1.296430607752612, |
| "learning_rate": 4.999584070869502e-05, |
| "loss": 1.2073, |
| "num_input_tokens_seen": 38797312, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.08142493638676845, |
| "grad_norm": 1.4801029998241608, |
| "learning_rate": 4.999552691692581e-05, |
| "loss": 1.2124, |
| "num_input_tokens_seen": 39845888, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.08356769787063077, |
| "grad_norm": 1.4660757543282248, |
| "learning_rate": 4.999520171593226e-05, |
| "loss": 1.1989, |
| "num_input_tokens_seen": 40894464, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.0857104593544931, |
| "grad_norm": 1.7036809143512879, |
| "learning_rate": 4.999486510586282e-05, |
| "loss": 1.1902, |
| "num_input_tokens_seen": 41943040, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.08785322083835544, |
| "grad_norm": 1.5061981122893944, |
| "learning_rate": 4.999451708687114e-05, |
| "loss": 1.1964, |
| "num_input_tokens_seen": 42991616, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.08999598232221775, |
| "grad_norm": 1.050371458696268, |
| "learning_rate": 4.999415765911606e-05, |
| "loss": 1.1799, |
| "num_input_tokens_seen": 44040192, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.09213874380608009, |
| "grad_norm": 1.6332624514974972, |
| "learning_rate": 4.9993786822761656e-05, |
| "loss": 1.1769, |
| "num_input_tokens_seen": 45088768, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.09428150528994242, |
| "grad_norm": 1.351155620545513, |
| "learning_rate": 4.999340457797718e-05, |
| "loss": 1.1779, |
| "num_input_tokens_seen": 46137344, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.09642426677380474, |
| "grad_norm": 1.2370952346467414, |
| "learning_rate": 4.999301092493712e-05, |
| "loss": 1.183, |
| "num_input_tokens_seen": 47185920, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.09856702825766707, |
| "grad_norm": 1.4038096900765242, |
| "learning_rate": 4.999260586382116e-05, |
| "loss": 1.1645, |
| "num_input_tokens_seen": 48234496, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.1007097897415294, |
| "grad_norm": 1.1452882430899725, |
| "learning_rate": 4.999218939481418e-05, |
| "loss": 1.1727, |
| "num_input_tokens_seen": 49283072, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.10285255122539172, |
| "grad_norm": 1.3160375257186312, |
| "learning_rate": 4.999176151810629e-05, |
| "loss": 1.1574, |
| "num_input_tokens_seen": 50331648, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.10499531270925405, |
| "grad_norm": 1.1507076301290393, |
| "learning_rate": 4.9991322233892784e-05, |
| "loss": 1.1581, |
| "num_input_tokens_seen": 51380224, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.10713807419311638, |
| "grad_norm": 1.6090478698286774, |
| "learning_rate": 4.999087154237418e-05, |
| "loss": 1.1568, |
| "num_input_tokens_seen": 52428800, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.1092808356769787, |
| "grad_norm": 1.2451517727795873, |
| "learning_rate": 4.999040944375619e-05, |
| "loss": 1.1469, |
| "num_input_tokens_seen": 53477376, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.11142359716084103, |
| "grad_norm": 1.3185344813227535, |
| "learning_rate": 4.998993593824975e-05, |
| "loss": 1.1446, |
| "num_input_tokens_seen": 54525952, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.11356635864470337, |
| "grad_norm": 1.3295965754074688, |
| "learning_rate": 4.9989451026070975e-05, |
| "loss": 1.1575, |
| "num_input_tokens_seen": 55574528, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.11570912012856568, |
| "grad_norm": 1.3620844847038756, |
| "learning_rate": 4.9988954707441226e-05, |
| "loss": 1.137, |
| "num_input_tokens_seen": 56623104, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.11785188161242802, |
| "grad_norm": 1.14332778163853, |
| "learning_rate": 4.9988446982587035e-05, |
| "loss": 1.1377, |
| "num_input_tokens_seen": 57671680, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.11999464309629035, |
| "grad_norm": 1.6078355451981008, |
| "learning_rate": 4.998792785174014e-05, |
| "loss": 1.1424, |
| "num_input_tokens_seen": 58720256, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.12213740458015267, |
| "grad_norm": 1.092036338689917, |
| "learning_rate": 4.998739731513753e-05, |
| "loss": 1.1428, |
| "num_input_tokens_seen": 59768832, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.124280166064015, |
| "grad_norm": 1.2636480648876944, |
| "learning_rate": 4.998685537302135e-05, |
| "loss": 1.1343, |
| "num_input_tokens_seen": 60817408, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.12642292754787732, |
| "grad_norm": 1.107538929643645, |
| "learning_rate": 4.998630202563896e-05, |
| "loss": 1.1321, |
| "num_input_tokens_seen": 61865984, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.12856568903173965, |
| "grad_norm": 1.4992869675310534, |
| "learning_rate": 4.998573727324295e-05, |
| "loss": 1.1337, |
| "num_input_tokens_seen": 62914560, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.13070845051560198, |
| "grad_norm": 1.2531055693275621, |
| "learning_rate": 4.998516111609111e-05, |
| "loss": 1.127, |
| "num_input_tokens_seen": 63963136, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.1328512119994643, |
| "grad_norm": 1.2048006235711455, |
| "learning_rate": 4.9984573554446404e-05, |
| "loss": 1.1165, |
| "num_input_tokens_seen": 65011712, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.13499397348332665, |
| "grad_norm": 1.4888034799511158, |
| "learning_rate": 4.998397458857704e-05, |
| "loss": 1.1274, |
| "num_input_tokens_seen": 66060288, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.13713673496718898, |
| "grad_norm": 0.8078512840634009, |
| "learning_rate": 4.998336421875641e-05, |
| "loss": 1.1263, |
| "num_input_tokens_seen": 67108864, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.13927949645105128, |
| "grad_norm": 1.0550151301129909, |
| "learning_rate": 4.998274244526313e-05, |
| "loss": 1.1194, |
| "num_input_tokens_seen": 68157440, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.14142225793491361, |
| "grad_norm": 1.7338596462962559, |
| "learning_rate": 4.9982109268380995e-05, |
| "loss": 1.13, |
| "num_input_tokens_seen": 69206016, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.14356501941877595, |
| "grad_norm": 0.9679831352642563, |
| "learning_rate": 4.998146468839903e-05, |
| "loss": 1.1263, |
| "num_input_tokens_seen": 70254592, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.14570778090263828, |
| "grad_norm": 1.4660673328866483, |
| "learning_rate": 4.9980808705611435e-05, |
| "loss": 1.1121, |
| "num_input_tokens_seen": 71303168, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.1478505423865006, |
| "grad_norm": 1.2072777943721469, |
| "learning_rate": 4.998014132031766e-05, |
| "loss": 1.1041, |
| "num_input_tokens_seen": 72351744, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.14999330387036294, |
| "grad_norm": 1.572431062612095, |
| "learning_rate": 4.997946253282231e-05, |
| "loss": 1.1131, |
| "num_input_tokens_seen": 73400320, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.15213606535422525, |
| "grad_norm": 1.0201292692682673, |
| "learning_rate": 4.9978772343435234e-05, |
| "loss": 1.1053, |
| "num_input_tokens_seen": 74448896, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.15427882683808758, |
| "grad_norm": 1.8688137247551255, |
| "learning_rate": 4.997807075247146e-05, |
| "loss": 1.1142, |
| "num_input_tokens_seen": 75497472, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.1564215883219499, |
| "grad_norm": 1.121453660135061, |
| "learning_rate": 4.997735776025124e-05, |
| "loss": 1.1163, |
| "num_input_tokens_seen": 76546048, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.15856434980581224, |
| "grad_norm": 1.3729465860631151, |
| "learning_rate": 4.99766333671e-05, |
| "loss": 1.1063, |
| "num_input_tokens_seen": 77594624, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.16070711128967458, |
| "grad_norm": 1.1291336752784253, |
| "learning_rate": 4.997589757334842e-05, |
| "loss": 1.1002, |
| "num_input_tokens_seen": 78643200, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.1628498727735369, |
| "grad_norm": 1.0444442505925857, |
| "learning_rate": 4.997515037933232e-05, |
| "loss": 1.1045, |
| "num_input_tokens_seen": 79691776, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.1649926342573992, |
| "grad_norm": 1.1429743430825556, |
| "learning_rate": 4.997439178539278e-05, |
| "loss": 1.0939, |
| "num_input_tokens_seen": 80740352, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.16713539574126154, |
| "grad_norm": 1.3216220734620914, |
| "learning_rate": 4.9973621791876055e-05, |
| "loss": 1.1102, |
| "num_input_tokens_seen": 81788928, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.16927815722512388, |
| "grad_norm": 0.9852096617413347, |
| "learning_rate": 4.99728403991336e-05, |
| "loss": 1.0977, |
| "num_input_tokens_seen": 82837504, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.1714209187089862, |
| "grad_norm": 1.2499776417522643, |
| "learning_rate": 4.99720476075221e-05, |
| "loss": 1.0953, |
| "num_input_tokens_seen": 83886080, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.17356368019284854, |
| "grad_norm": 1.0988616217785656, |
| "learning_rate": 4.9971243417403414e-05, |
| "loss": 1.0947, |
| "num_input_tokens_seen": 84934656, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.17570644167671087, |
| "grad_norm": 1.0199454529298497, |
| "learning_rate": 4.997042782914462e-05, |
| "loss": 1.0728, |
| "num_input_tokens_seen": 85983232, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.17784920316057318, |
| "grad_norm": 0.8316506162348385, |
| "learning_rate": 4.996960084311798e-05, |
| "loss": 1.0929, |
| "num_input_tokens_seen": 87031808, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.1799919646444355, |
| "grad_norm": 1.284554753733248, |
| "learning_rate": 4.9968762459700994e-05, |
| "loss": 1.0885, |
| "num_input_tokens_seen": 88080384, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.18213472612829784, |
| "grad_norm": 1.2390646541659713, |
| "learning_rate": 4.9967912679276316e-05, |
| "loss": 1.0849, |
| "num_input_tokens_seen": 89128960, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.18427748761216017, |
| "grad_norm": 1.123613767329084, |
| "learning_rate": 4.996705150223186e-05, |
| "loss": 1.0875, |
| "num_input_tokens_seen": 90177536, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.1864202490960225, |
| "grad_norm": 1.0399206213607557, |
| "learning_rate": 4.996617892896069e-05, |
| "loss": 1.0828, |
| "num_input_tokens_seen": 91226112, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.18856301057988484, |
| "grad_norm": 1.3129154484684602, |
| "learning_rate": 4.9965294959861095e-05, |
| "loss": 1.0904, |
| "num_input_tokens_seen": 92274688, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.19070577206374714, |
| "grad_norm": 1.1647296416332669, |
| "learning_rate": 4.996439959533656e-05, |
| "loss": 1.0906, |
| "num_input_tokens_seen": 93323264, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.19284853354760947, |
| "grad_norm": 1.3475745928522973, |
| "learning_rate": 4.9963492835795797e-05, |
| "loss": 1.0856, |
| "num_input_tokens_seen": 94371840, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.1949912950314718, |
| "grad_norm": 1.1041626171352936, |
| "learning_rate": 4.9962574681652675e-05, |
| "loss": 1.0827, |
| "num_input_tokens_seen": 95420416, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.19713405651533414, |
| "grad_norm": 0.924276147277848, |
| "learning_rate": 4.996164513332628e-05, |
| "loss": 1.0815, |
| "num_input_tokens_seen": 96468992, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.19927681799919647, |
| "grad_norm": 1.3177961658200323, |
| "learning_rate": 4.9960704191240926e-05, |
| "loss": 1.0792, |
| "num_input_tokens_seen": 97517568, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.2014195794830588, |
| "grad_norm": 0.9334752844114854, |
| "learning_rate": 4.99597518558261e-05, |
| "loss": 1.0729, |
| "num_input_tokens_seen": 98566144, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.2035623409669211, |
| "grad_norm": 1.1860646160155437, |
| "learning_rate": 4.995878812751649e-05, |
| "loss": 1.0659, |
| "num_input_tokens_seen": 99614720, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.20570510245078344, |
| "grad_norm": 1.1073970834673053, |
| "learning_rate": 4.995781300675199e-05, |
| "loss": 1.0738, |
| "num_input_tokens_seen": 100663296, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.20784786393464577, |
| "grad_norm": 1.0410882369872347, |
| "learning_rate": 4.99568264939777e-05, |
| "loss": 1.0654, |
| "num_input_tokens_seen": 101711872, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.2099906254185081, |
| "grad_norm": 0.9473960462530998, |
| "learning_rate": 4.995582858964392e-05, |
| "loss": 1.0739, |
| "num_input_tokens_seen": 102760448, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.21213338690237044, |
| "grad_norm": 1.0886939381352059, |
| "learning_rate": 4.9954819294206124e-05, |
| "loss": 1.0662, |
| "num_input_tokens_seen": 103809024, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.21427614838623277, |
| "grad_norm": 1.383048793026184, |
| "learning_rate": 4.9953798608125025e-05, |
| "loss": 1.078, |
| "num_input_tokens_seen": 104857600, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.21641890987009507, |
| "grad_norm": 0.8614390068736321, |
| "learning_rate": 4.995276653186651e-05, |
| "loss": 1.0661, |
| "num_input_tokens_seen": 105906176, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.2185616713539574, |
| "grad_norm": 1.1754449020307258, |
| "learning_rate": 4.9951723065901665e-05, |
| "loss": 1.0797, |
| "num_input_tokens_seen": 106954752, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.22070443283781974, |
| "grad_norm": 1.1156873692946436, |
| "learning_rate": 4.995066821070679e-05, |
| "loss": 1.0656, |
| "num_input_tokens_seen": 108003328, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.22284719432168207, |
| "grad_norm": 0.9815049988878293, |
| "learning_rate": 4.994960196676337e-05, |
| "loss": 1.0615, |
| "num_input_tokens_seen": 109051904, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.2249899558055444, |
| "grad_norm": 1.1594613245286667, |
| "learning_rate": 4.994852433455809e-05, |
| "loss": 1.0727, |
| "num_input_tokens_seen": 110100480, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.22713271728940673, |
| "grad_norm": 1.1219733488329482, |
| "learning_rate": 4.9947435314582844e-05, |
| "loss": 1.0661, |
| "num_input_tokens_seen": 111149056, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.22927547877326904, |
| "grad_norm": 1.2406645521219986, |
| "learning_rate": 4.99463349073347e-05, |
| "loss": 1.0665, |
| "num_input_tokens_seen": 112197632, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.23141824025713137, |
| "grad_norm": 0.9176808882852722, |
| "learning_rate": 4.9945223113315966e-05, |
| "loss": 1.0696, |
| "num_input_tokens_seen": 113246208, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.2335610017409937, |
| "grad_norm": 0.9318195936282073, |
| "learning_rate": 4.994409993303409e-05, |
| "loss": 1.0605, |
| "num_input_tokens_seen": 114294784, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.23570376322485603, |
| "grad_norm": 1.0593738678656253, |
| "learning_rate": 4.994296536700177e-05, |
| "loss": 1.0649, |
| "num_input_tokens_seen": 115343360, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.23784652470871837, |
| "grad_norm": 1.2171400848983855, |
| "learning_rate": 4.994181941573687e-05, |
| "loss": 1.0502, |
| "num_input_tokens_seen": 116391936, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.2399892861925807, |
| "grad_norm": 1.0177436664305382, |
| "learning_rate": 4.994066207976247e-05, |
| "loss": 1.052, |
| "num_input_tokens_seen": 117440512, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.24213204767644303, |
| "grad_norm": 1.0732635557180488, |
| "learning_rate": 4.993949335960683e-05, |
| "loss": 1.0566, |
| "num_input_tokens_seen": 118489088, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.24427480916030533, |
| "grad_norm": 1.229545019453543, |
| "learning_rate": 4.9938313255803406e-05, |
| "loss": 1.0538, |
| "num_input_tokens_seen": 119537664, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.24641757064416767, |
| "grad_norm": 0.8920833241411221, |
| "learning_rate": 4.993712176889086e-05, |
| "loss": 1.0422, |
| "num_input_tokens_seen": 120586240, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.24856033212803, |
| "grad_norm": 0.8904929780704225, |
| "learning_rate": 4.993591889941306e-05, |
| "loss": 1.0576, |
| "num_input_tokens_seen": 121634816, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.2507030936118923, |
| "grad_norm": 1.1237317297804248, |
| "learning_rate": 4.993470464791904e-05, |
| "loss": 1.0465, |
| "num_input_tokens_seen": 122683392, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.25284585509575463, |
| "grad_norm": 0.8695137661037583, |
| "learning_rate": 4.9933479014963055e-05, |
| "loss": 1.0615, |
| "num_input_tokens_seen": 123731968, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.25498861657961697, |
| "grad_norm": 0.8752675990810166, |
| "learning_rate": 4.9932242001104556e-05, |
| "loss": 1.0427, |
| "num_input_tokens_seen": 124780544, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.2571313780634793, |
| "grad_norm": 1.1263946339650905, |
| "learning_rate": 4.9930993606908154e-05, |
| "loss": 1.043, |
| "num_input_tokens_seen": 125829120, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.25927413954734163, |
| "grad_norm": 1.1587989669007857, |
| "learning_rate": 4.99297338329437e-05, |
| "loss": 1.0558, |
| "num_input_tokens_seen": 126877696, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.26141690103120396, |
| "grad_norm": 1.100292756182761, |
| "learning_rate": 4.992846267978621e-05, |
| "loss": 1.0595, |
| "num_input_tokens_seen": 127926272, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.2635596625150663, |
| "grad_norm": 0.9764047931891696, |
| "learning_rate": 4.99271801480159e-05, |
| "loss": 1.0567, |
| "num_input_tokens_seen": 128974848, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.2657024239989286, |
| "grad_norm": 1.0337777172886875, |
| "learning_rate": 4.992588623821819e-05, |
| "loss": 1.0402, |
| "num_input_tokens_seen": 130023424, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.26784518548279096, |
| "grad_norm": 1.0931510943833824, |
| "learning_rate": 4.992458095098368e-05, |
| "loss": 1.0518, |
| "num_input_tokens_seen": 131072000, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.2699879469666533, |
| "grad_norm": 0.965385066043555, |
| "learning_rate": 4.9923264286908164e-05, |
| "loss": 1.0443, |
| "num_input_tokens_seen": 132120576, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.2721307084505156, |
| "grad_norm": 0.9207891788376037, |
| "learning_rate": 4.9921936246592656e-05, |
| "loss": 1.0335, |
| "num_input_tokens_seen": 133169152, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.27427346993437796, |
| "grad_norm": 0.9453747401932903, |
| "learning_rate": 4.992059683064332e-05, |
| "loss": 1.0345, |
| "num_input_tokens_seen": 134217728, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.27641623141824023, |
| "grad_norm": 0.8736473057042122, |
| "learning_rate": 4.991924603967154e-05, |
| "loss": 1.036, |
| "num_input_tokens_seen": 135266304, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.27855899290210256, |
| "grad_norm": 0.9112732853382438, |
| "learning_rate": 4.991788387429388e-05, |
| "loss": 1.0608, |
| "num_input_tokens_seen": 136314880, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.2807017543859649, |
| "grad_norm": 0.9148465256720132, |
| "learning_rate": 4.991651033513212e-05, |
| "loss": 1.0245, |
| "num_input_tokens_seen": 137363456, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.28284451586982723, |
| "grad_norm": 0.9883419785308301, |
| "learning_rate": 4.9915125422813187e-05, |
| "loss": 1.0629, |
| "num_input_tokens_seen": 138412032, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.28498727735368956, |
| "grad_norm": 1.270307696820798, |
| "learning_rate": 4.991372913796924e-05, |
| "loss": 1.0389, |
| "num_input_tokens_seen": 139460608, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.2871300388375519, |
| "grad_norm": 0.9097618800977756, |
| "learning_rate": 4.991232148123761e-05, |
| "loss": 1.0436, |
| "num_input_tokens_seen": 140509184, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.2892728003214142, |
| "grad_norm": 1.0267713390462572, |
| "learning_rate": 4.9910902453260824e-05, |
| "loss": 1.0266, |
| "num_input_tokens_seen": 141557760, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.29141556180527656, |
| "grad_norm": 1.005063306500544, |
| "learning_rate": 4.99094720546866e-05, |
| "loss": 1.0378, |
| "num_input_tokens_seen": 142606336, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.2935583232891389, |
| "grad_norm": 1.1299519867382342, |
| "learning_rate": 4.990803028616785e-05, |
| "loss": 1.0403, |
| "num_input_tokens_seen": 143654912, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.2957010847730012, |
| "grad_norm": 0.9592070371054032, |
| "learning_rate": 4.990657714836266e-05, |
| "loss": 1.0371, |
| "num_input_tokens_seen": 144703488, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.29784384625686355, |
| "grad_norm": 1.141468525308474, |
| "learning_rate": 4.990511264193431e-05, |
| "loss": 1.0365, |
| "num_input_tokens_seen": 145752064, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.2999866077407259, |
| "grad_norm": 1.0214002809180978, |
| "learning_rate": 4.9903636767551285e-05, |
| "loss": 1.0309, |
| "num_input_tokens_seen": 146800640, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.30212936922458816, |
| "grad_norm": 0.9708959913413399, |
| "learning_rate": 4.9902149525887255e-05, |
| "loss": 1.0362, |
| "num_input_tokens_seen": 147849216, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.3042721307084505, |
| "grad_norm": 0.9885069162667705, |
| "learning_rate": 4.990065091762106e-05, |
| "loss": 1.0336, |
| "num_input_tokens_seen": 148897792, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.3064148921923128, |
| "grad_norm": 0.9783204712664121, |
| "learning_rate": 4.989914094343675e-05, |
| "loss": 1.0346, |
| "num_input_tokens_seen": 149946368, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.30855765367617516, |
| "grad_norm": 1.1202679861078162, |
| "learning_rate": 4.9897619604023545e-05, |
| "loss": 1.0246, |
| "num_input_tokens_seen": 150994944, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.3107004151600375, |
| "grad_norm": 0.9657646058287815, |
| "learning_rate": 4.9896086900075865e-05, |
| "loss": 1.0289, |
| "num_input_tokens_seen": 152043520, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.3128431766438998, |
| "grad_norm": 0.7765273512748966, |
| "learning_rate": 4.989454283229331e-05, |
| "loss": 1.0316, |
| "num_input_tokens_seen": 153092096, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.31498593812776216, |
| "grad_norm": 0.9990709966822854, |
| "learning_rate": 4.9892987401380686e-05, |
| "loss": 1.0403, |
| "num_input_tokens_seen": 154140672, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.3171286996116245, |
| "grad_norm": 0.9112558770615148, |
| "learning_rate": 4.989142060804796e-05, |
| "loss": 1.0207, |
| "num_input_tokens_seen": 155189248, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.3192714610954868, |
| "grad_norm": 0.8239604360975109, |
| "learning_rate": 4.988984245301028e-05, |
| "loss": 1.0335, |
| "num_input_tokens_seen": 156237824, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.32141422257934915, |
| "grad_norm": 0.7965066882079385, |
| "learning_rate": 4.988825293698802e-05, |
| "loss": 1.0262, |
| "num_input_tokens_seen": 157286400, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.3235569840632115, |
| "grad_norm": 0.806696091204924, |
| "learning_rate": 4.988665206070671e-05, |
| "loss": 1.0243, |
| "num_input_tokens_seen": 158334976, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.3256997455470738, |
| "grad_norm": 0.7247103495924784, |
| "learning_rate": 4.988503982489707e-05, |
| "loss": 1.0182, |
| "num_input_tokens_seen": 159383552, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.3278425070309361, |
| "grad_norm": 0.7394277153793506, |
| "learning_rate": 4.988341623029499e-05, |
| "loss": 1.0367, |
| "num_input_tokens_seen": 160432128, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.3299852685147984, |
| "grad_norm": 0.7909085688130246, |
| "learning_rate": 4.9881781277641586e-05, |
| "loss": 1.0315, |
| "num_input_tokens_seen": 161480704, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.33212802999866076, |
| "grad_norm": 0.9199353950108281, |
| "learning_rate": 4.9880134967683124e-05, |
| "loss": 1.0177, |
| "num_input_tokens_seen": 162529280, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.3342707914825231, |
| "grad_norm": 1.0018645701488917, |
| "learning_rate": 4.987847730117106e-05, |
| "loss": 1.0339, |
| "num_input_tokens_seen": 163577856, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.3364135529663854, |
| "grad_norm": 1.0554032523781822, |
| "learning_rate": 4.987680827886203e-05, |
| "loss": 1.0157, |
| "num_input_tokens_seen": 164626432, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.33855631445024775, |
| "grad_norm": 0.9124413476479185, |
| "learning_rate": 4.987512790151787e-05, |
| "loss": 1.0247, |
| "num_input_tokens_seen": 165675008, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.3406990759341101, |
| "grad_norm": 0.9524677697798024, |
| "learning_rate": 4.987343616990559e-05, |
| "loss": 1.0222, |
| "num_input_tokens_seen": 166723584, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.3428418374179724, |
| "grad_norm": 1.0777046064102398, |
| "learning_rate": 4.987173308479738e-05, |
| "loss": 1.0243, |
| "num_input_tokens_seen": 167772160, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.34498459890183475, |
| "grad_norm": 1.0500416155465637, |
| "learning_rate": 4.987001864697062e-05, |
| "loss": 1.0264, |
| "num_input_tokens_seen": 168820736, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.3471273603856971, |
| "grad_norm": 1.0818693332363452, |
| "learning_rate": 4.986829285720785e-05, |
| "loss": 1.0247, |
| "num_input_tokens_seen": 169869312, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.3492701218695594, |
| "grad_norm": 0.9559100827296363, |
| "learning_rate": 4.986655571629682e-05, |
| "loss": 1.0242, |
| "num_input_tokens_seen": 170917888, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.35141288335342175, |
| "grad_norm": 0.9840195580062531, |
| "learning_rate": 4.9864807225030454e-05, |
| "loss": 1.0181, |
| "num_input_tokens_seen": 171966464, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.353555644837284, |
| "grad_norm": 0.9455632571654006, |
| "learning_rate": 4.9863047384206835e-05, |
| "loss": 1.0107, |
| "num_input_tokens_seen": 173015040, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.35569840632114635, |
| "grad_norm": 0.9705144544240498, |
| "learning_rate": 4.9861276194629256e-05, |
| "loss": 1.0256, |
| "num_input_tokens_seen": 174063616, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.3578411678050087, |
| "grad_norm": 0.8866463242874902, |
| "learning_rate": 4.9859493657106185e-05, |
| "loss": 1.0141, |
| "num_input_tokens_seen": 175112192, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.359983929288871, |
| "grad_norm": 0.6883759012573423, |
| "learning_rate": 4.985769977245124e-05, |
| "loss": 1.0207, |
| "num_input_tokens_seen": 176160768, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.36212669077273335, |
| "grad_norm": 0.7831417666898026, |
| "learning_rate": 4.985589454148326e-05, |
| "loss": 1.0171, |
| "num_input_tokens_seen": 177209344, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.3642694522565957, |
| "grad_norm": 0.8541836155121969, |
| "learning_rate": 4.9854077965026234e-05, |
| "loss": 1.0224, |
| "num_input_tokens_seen": 178257920, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.366412213740458, |
| "grad_norm": 0.7420919461973196, |
| "learning_rate": 4.985225004390934e-05, |
| "loss": 1.0244, |
| "num_input_tokens_seen": 179306496, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.36855497522432035, |
| "grad_norm": 0.7079902130346447, |
| "learning_rate": 4.985041077896695e-05, |
| "loss": 1.0208, |
| "num_input_tokens_seen": 180355072, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.3706977367081827, |
| "grad_norm": 1.0041856975865897, |
| "learning_rate": 4.984856017103857e-05, |
| "loss": 1.0274, |
| "num_input_tokens_seen": 181403648, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.372840498192045, |
| "grad_norm": 1.0259481061387932, |
| "learning_rate": 4.9846698220968934e-05, |
| "loss": 1.0056, |
| "num_input_tokens_seen": 182452224, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.37498325967590734, |
| "grad_norm": 1.0380142050192422, |
| "learning_rate": 4.984482492960791e-05, |
| "loss": 1.0111, |
| "num_input_tokens_seen": 183500800, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.3771260211597697, |
| "grad_norm": 0.8459728811497111, |
| "learning_rate": 4.984294029781059e-05, |
| "loss": 1.0112, |
| "num_input_tokens_seen": 184549376, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.379268782643632, |
| "grad_norm": 0.7095378896130021, |
| "learning_rate": 4.9841044326437194e-05, |
| "loss": 1.0178, |
| "num_input_tokens_seen": 185597952, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.3814115441274943, |
| "grad_norm": 0.8696432175517164, |
| "learning_rate": 4.9839137016353147e-05, |
| "loss": 1.0017, |
| "num_input_tokens_seen": 186646528, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.3835543056113566, |
| "grad_norm": 1.0512720531438484, |
| "learning_rate": 4.983721836842903e-05, |
| "loss": 1.0235, |
| "num_input_tokens_seen": 187695104, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.38569706709521895, |
| "grad_norm": 0.9561870070751214, |
| "learning_rate": 4.9835288383540626e-05, |
| "loss": 1.0073, |
| "num_input_tokens_seen": 188743680, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.3878398285790813, |
| "grad_norm": 0.9055977049493014, |
| "learning_rate": 4.983334706256888e-05, |
| "loss": 1.0075, |
| "num_input_tokens_seen": 189792256, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.3899825900629436, |
| "grad_norm": 0.9275644252118139, |
| "learning_rate": 4.98313944063999e-05, |
| "loss": 1.0127, |
| "num_input_tokens_seen": 190840832, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.39212535154680594, |
| "grad_norm": 0.8560179313292782, |
| "learning_rate": 4.9829430415924974e-05, |
| "loss": 1.0125, |
| "num_input_tokens_seen": 191889408, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.3942681130306683, |
| "grad_norm": 1.648732944069561, |
| "learning_rate": 4.982745509204058e-05, |
| "loss": 1.0099, |
| "num_input_tokens_seen": 192937984, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.3964108745145306, |
| "grad_norm": 0.9197658476204091, |
| "learning_rate": 4.982546843564834e-05, |
| "loss": 1.0086, |
| "num_input_tokens_seen": 193986560, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.39855363599839294, |
| "grad_norm": 1.2107999479278293, |
| "learning_rate": 4.982347044765508e-05, |
| "loss": 1.0284, |
| "num_input_tokens_seen": 195035136, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.4006963974822553, |
| "grad_norm": 1.181894489618923, |
| "learning_rate": 4.982146112897277e-05, |
| "loss": 1.019, |
| "num_input_tokens_seen": 196083712, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.4028391589661176, |
| "grad_norm": 1.1976480443902457, |
| "learning_rate": 4.9819440480518574e-05, |
| "loss": 1.0188, |
| "num_input_tokens_seen": 197132288, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.40498192044997994, |
| "grad_norm": 0.8228565634507706, |
| "learning_rate": 4.981740850321481e-05, |
| "loss": 1.0066, |
| "num_input_tokens_seen": 198180864, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.4071246819338422, |
| "grad_norm": 1.3694193286801866, |
| "learning_rate": 4.9815365197988986e-05, |
| "loss": 1.0199, |
| "num_input_tokens_seen": 199229440, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.40926744341770455, |
| "grad_norm": 0.9081276834937746, |
| "learning_rate": 4.981331056577376e-05, |
| "loss": 1.0128, |
| "num_input_tokens_seen": 200278016, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.4114102049015669, |
| "grad_norm": 0.8864947540491567, |
| "learning_rate": 4.981124460750698e-05, |
| "loss": 1.0082, |
| "num_input_tokens_seen": 201326592, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.4135529663854292, |
| "grad_norm": 0.8828235461765532, |
| "learning_rate": 4.9809167324131645e-05, |
| "loss": 1.0016, |
| "num_input_tokens_seen": 202375168, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.41569572786929154, |
| "grad_norm": 0.9051114336975766, |
| "learning_rate": 4.980707871659593e-05, |
| "loss": 1.0042, |
| "num_input_tokens_seen": 203423744, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.4178384893531539, |
| "grad_norm": 0.9036581277126695, |
| "learning_rate": 4.9804978785853196e-05, |
| "loss": 0.9922, |
| "num_input_tokens_seen": 204472320, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.4199812508370162, |
| "grad_norm": 0.8304193637983237, |
| "learning_rate": 4.980286753286195e-05, |
| "loss": 1.0079, |
| "num_input_tokens_seen": 205520896, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.42212401232087854, |
| "grad_norm": 0.9472951756101595, |
| "learning_rate": 4.9800744958585864e-05, |
| "loss": 1.001, |
| "num_input_tokens_seen": 206569472, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.42426677380474087, |
| "grad_norm": 1.1322195801321533, |
| "learning_rate": 4.9798611063993805e-05, |
| "loss": 1.0036, |
| "num_input_tokens_seen": 207618048, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.4264095352886032, |
| "grad_norm": 0.8872065766562148, |
| "learning_rate": 4.979646585005978e-05, |
| "loss": 0.9966, |
| "num_input_tokens_seen": 208666624, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.42855229677246554, |
| "grad_norm": 1.4965045521917073, |
| "learning_rate": 4.979430931776298e-05, |
| "loss": 1.0088, |
| "num_input_tokens_seen": 209715200, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.43069505825632787, |
| "grad_norm": 0.8377677047446743, |
| "learning_rate": 4.9792141468087746e-05, |
| "loss": 1.0005, |
| "num_input_tokens_seen": 210763776, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.43283781974019014, |
| "grad_norm": 1.2081647491986152, |
| "learning_rate": 4.97899623020236e-05, |
| "loss": 0.9931, |
| "num_input_tokens_seen": 211812352, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.4349805812240525, |
| "grad_norm": 1.096772412526795, |
| "learning_rate": 4.978777182056523e-05, |
| "loss": 1.0001, |
| "num_input_tokens_seen": 212860928, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.4371233427079148, |
| "grad_norm": 0.7671831408885996, |
| "learning_rate": 4.9785570024712475e-05, |
| "loss": 1.0049, |
| "num_input_tokens_seen": 213909504, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.43926610419177714, |
| "grad_norm": 1.2438496225084703, |
| "learning_rate": 4.9783356915470344e-05, |
| "loss": 1.0171, |
| "num_input_tokens_seen": 214958080, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.4414088656756395, |
| "grad_norm": 0.9608971236273456, |
| "learning_rate": 4.9781132493849025e-05, |
| "loss": 0.9959, |
| "num_input_tokens_seen": 216006656, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.4435516271595018, |
| "grad_norm": 0.9297760035172808, |
| "learning_rate": 4.977889676086383e-05, |
| "loss": 0.991, |
| "num_input_tokens_seen": 217055232, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.44569438864336414, |
| "grad_norm": 0.9414486663179142, |
| "learning_rate": 4.97766497175353e-05, |
| "loss": 0.9944, |
| "num_input_tokens_seen": 218103808, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.44783715012722647, |
| "grad_norm": 0.8000456781814055, |
| "learning_rate": 4.977439136488907e-05, |
| "loss": 1.0104, |
| "num_input_tokens_seen": 219152384, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.4499799116110888, |
| "grad_norm": 0.6756812079262554, |
| "learning_rate": 4.977212170395598e-05, |
| "loss": 0.9981, |
| "num_input_tokens_seen": 220200960, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.45212267309495113, |
| "grad_norm": 0.6807847019333788, |
| "learning_rate": 4.9769840735772e-05, |
| "loss": 1.0012, |
| "num_input_tokens_seen": 221249536, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.45426543457881347, |
| "grad_norm": 0.8214478039872608, |
| "learning_rate": 4.9767548461378296e-05, |
| "loss": 1.0019, |
| "num_input_tokens_seen": 222298112, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.4564081960626758, |
| "grad_norm": 0.9135429305296144, |
| "learning_rate": 4.976524488182118e-05, |
| "loss": 0.9853, |
| "num_input_tokens_seen": 223346688, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.4585509575465381, |
| "grad_norm": 0.6387595131943387, |
| "learning_rate": 4.976292999815211e-05, |
| "loss": 0.9887, |
| "num_input_tokens_seen": 224395264, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.4606937190304004, |
| "grad_norm": 0.7940521466204665, |
| "learning_rate": 4.976060381142773e-05, |
| "loss": 0.993, |
| "num_input_tokens_seen": 225443840, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.46283648051426274, |
| "grad_norm": 0.9880642635060383, |
| "learning_rate": 4.975826632270982e-05, |
| "loss": 0.9938, |
| "num_input_tokens_seen": 226492416, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.46497924199812507, |
| "grad_norm": 0.8686661622915615, |
| "learning_rate": 4.975591753306533e-05, |
| "loss": 0.9997, |
| "num_input_tokens_seen": 227540992, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.4671220034819874, |
| "grad_norm": 0.8041033834848365, |
| "learning_rate": 4.975355744356637e-05, |
| "loss": 0.9894, |
| "num_input_tokens_seen": 228589568, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.46926476496584973, |
| "grad_norm": 0.8132377305765438, |
| "learning_rate": 4.975118605529019e-05, |
| "loss": 0.9915, |
| "num_input_tokens_seen": 229638144, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.47140752644971207, |
| "grad_norm": 0.9285629824414583, |
| "learning_rate": 4.974880336931923e-05, |
| "loss": 0.9985, |
| "num_input_tokens_seen": 230686720, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.4735502879335744, |
| "grad_norm": 1.0134617586666492, |
| "learning_rate": 4.974640938674107e-05, |
| "loss": 1.0019, |
| "num_input_tokens_seen": 231735296, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.47569304941743673, |
| "grad_norm": 0.6308767616473976, |
| "learning_rate": 4.974400410864842e-05, |
| "loss": 0.9842, |
| "num_input_tokens_seen": 232783872, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.47783581090129906, |
| "grad_norm": 0.793272468348317, |
| "learning_rate": 4.9741587536139204e-05, |
| "loss": 0.9973, |
| "num_input_tokens_seen": 233832448, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.4799785723851614, |
| "grad_norm": 0.8278454064136296, |
| "learning_rate": 4.973915967031644e-05, |
| "loss": 0.993, |
| "num_input_tokens_seen": 234881024, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.4821213338690237, |
| "grad_norm": 0.7549982264523979, |
| "learning_rate": 4.9736720512288334e-05, |
| "loss": 0.9956, |
| "num_input_tokens_seen": 235929600, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.48426409535288606, |
| "grad_norm": 0.9478859353787976, |
| "learning_rate": 4.973427006316826e-05, |
| "loss": 0.9834, |
| "num_input_tokens_seen": 236978176, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.48640685683674834, |
| "grad_norm": 1.2636972768723895, |
| "learning_rate": 4.9731808324074717e-05, |
| "loss": 1.0092, |
| "num_input_tokens_seen": 238026752, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.48854961832061067, |
| "grad_norm": 0.7293159014596692, |
| "learning_rate": 4.972933529613135e-05, |
| "loss": 0.9904, |
| "num_input_tokens_seen": 239075328, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.490692379804473, |
| "grad_norm": 7.0328945417058595, |
| "learning_rate": 4.9726850980467e-05, |
| "loss": 1.0093, |
| "num_input_tokens_seen": 240123904, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.49283514128833533, |
| "grad_norm": 1.6482931792205036, |
| "learning_rate": 4.972435537821562e-05, |
| "loss": 1.015, |
| "num_input_tokens_seen": 241172480, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.49497790277219766, |
| "grad_norm": 0.866041440885184, |
| "learning_rate": 4.972184849051633e-05, |
| "loss": 0.9884, |
| "num_input_tokens_seen": 242221056, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.49712066425606, |
| "grad_norm": 1.1433705805979748, |
| "learning_rate": 4.971933031851341e-05, |
| "loss": 0.9992, |
| "num_input_tokens_seen": 243269632, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.49926342573992233, |
| "grad_norm": 1.1749688400490832, |
| "learning_rate": 4.971680086335627e-05, |
| "loss": 1.0002, |
| "num_input_tokens_seen": 244318208, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.5014061872237846, |
| "grad_norm": 1.1383165304481435, |
| "learning_rate": 4.971426012619949e-05, |
| "loss": 1.0272, |
| "num_input_tokens_seen": 245366784, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.503548948707647, |
| "grad_norm": 0.9272257838508966, |
| "learning_rate": 4.971170810820279e-05, |
| "loss": 0.9856, |
| "num_input_tokens_seen": 246415360, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.5056917101915093, |
| "grad_norm": 1.049285697170834, |
| "learning_rate": 4.9709144810531026e-05, |
| "loss": 1.0075, |
| "num_input_tokens_seen": 247463936, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.5078344716753717, |
| "grad_norm": 1.1031939359682563, |
| "learning_rate": 4.970657023435424e-05, |
| "loss": 0.9938, |
| "num_input_tokens_seen": 248512512, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.5099772331592339, |
| "grad_norm": 0.8559721116198098, |
| "learning_rate": 4.970398438084758e-05, |
| "loss": 1.0073, |
| "num_input_tokens_seen": 249561088, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.5121199946430963, |
| "grad_norm": 0.8693796569575785, |
| "learning_rate": 4.9701387251191364e-05, |
| "loss": 0.9939, |
| "num_input_tokens_seen": 250609664, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.5142627561269586, |
| "grad_norm": 0.7154096347287712, |
| "learning_rate": 4.969877884657107e-05, |
| "loss": 0.9923, |
| "num_input_tokens_seen": 251658240, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.516405517610821, |
| "grad_norm": 0.7434887839314398, |
| "learning_rate": 4.969615916817728e-05, |
| "loss": 0.9953, |
| "num_input_tokens_seen": 252706816, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.5185482790946833, |
| "grad_norm": 0.8259812014838418, |
| "learning_rate": 4.969352821720577e-05, |
| "loss": 0.9751, |
| "num_input_tokens_seen": 253755392, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.5206910405785456, |
| "grad_norm": 0.8349207689087657, |
| "learning_rate": 4.969088599485743e-05, |
| "loss": 0.9772, |
| "num_input_tokens_seen": 254803968, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.5228338020624079, |
| "grad_norm": 0.9703418596197566, |
| "learning_rate": 4.96882325023383e-05, |
| "loss": 0.9855, |
| "num_input_tokens_seen": 255852544, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.5249765635462702, |
| "grad_norm": 0.780048511806156, |
| "learning_rate": 4.968556774085957e-05, |
| "loss": 0.9938, |
| "num_input_tokens_seen": 256901120, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.5271193250301326, |
| "grad_norm": 0.7459697569119867, |
| "learning_rate": 4.968289171163758e-05, |
| "loss": 0.9774, |
| "num_input_tokens_seen": 257949696, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.5292620865139949, |
| "grad_norm": 0.6952628328824583, |
| "learning_rate": 4.9680204415893804e-05, |
| "loss": 0.9858, |
| "num_input_tokens_seen": 258998272, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.5314048479978573, |
| "grad_norm": 0.6326440824353008, |
| "learning_rate": 4.967750585485484e-05, |
| "loss": 0.9878, |
| "num_input_tokens_seen": 260046848, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.5335476094817195, |
| "grad_norm": 0.6886573402560195, |
| "learning_rate": 4.967479602975248e-05, |
| "loss": 0.9858, |
| "num_input_tokens_seen": 261095424, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.5356903709655819, |
| "grad_norm": 0.7358477832059849, |
| "learning_rate": 4.967207494182361e-05, |
| "loss": 0.968, |
| "num_input_tokens_seen": 262144000, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.5378331324494442, |
| "grad_norm": 0.719311517365177, |
| "learning_rate": 4.966934259231026e-05, |
| "loss": 0.9732, |
| "num_input_tokens_seen": 263192576, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.5399758939333066, |
| "grad_norm": 0.6469259741219804, |
| "learning_rate": 4.9666598982459635e-05, |
| "loss": 0.9804, |
| "num_input_tokens_seen": 264241152, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.5421186554171689, |
| "grad_norm": 0.6295830105068848, |
| "learning_rate": 4.9663844113524035e-05, |
| "loss": 0.9849, |
| "num_input_tokens_seen": 265289728, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.5442614169010312, |
| "grad_norm": 0.6470841192271415, |
| "learning_rate": 4.966107798676095e-05, |
| "loss": 0.998, |
| "num_input_tokens_seen": 266338304, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.5464041783848935, |
| "grad_norm": 0.745769037672019, |
| "learning_rate": 4.965830060343295e-05, |
| "loss": 0.9786, |
| "num_input_tokens_seen": 267386880, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.5485469398687559, |
| "grad_norm": 0.8367677663996089, |
| "learning_rate": 4.9655511964807785e-05, |
| "loss": 0.966, |
| "num_input_tokens_seen": 268435456, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.5506897013526182, |
| "grad_norm": 0.7767484325908519, |
| "learning_rate": 4.965271207215835e-05, |
| "loss": 0.9812, |
| "num_input_tokens_seen": 269484032, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.5528324628364805, |
| "grad_norm": 0.8001991506895153, |
| "learning_rate": 4.964990092676263e-05, |
| "loss": 0.978, |
| "num_input_tokens_seen": 270532608, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.5549752243203429, |
| "grad_norm": 0.8240750872880366, |
| "learning_rate": 4.964707852990378e-05, |
| "loss": 0.9774, |
| "num_input_tokens_seen": 271581184, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.5571179858042051, |
| "grad_norm": 0.8910873941590336, |
| "learning_rate": 4.964424488287009e-05, |
| "loss": 0.9748, |
| "num_input_tokens_seen": 272629760, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.5592607472880675, |
| "grad_norm": 0.8594739011298812, |
| "learning_rate": 4.9641399986955e-05, |
| "loss": 0.9774, |
| "num_input_tokens_seen": 273678336, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.5614035087719298, |
| "grad_norm": 0.7932194322462572, |
| "learning_rate": 4.963854384345702e-05, |
| "loss": 0.977, |
| "num_input_tokens_seen": 274726912, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.5635462702557922, |
| "grad_norm": 0.7206472566566292, |
| "learning_rate": 4.963567645367988e-05, |
| "loss": 0.9787, |
| "num_input_tokens_seen": 275775488, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.5656890317396545, |
| "grad_norm": 0.67771883626031, |
| "learning_rate": 4.9632797818932374e-05, |
| "loss": 0.974, |
| "num_input_tokens_seen": 276824064, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.5678317932235168, |
| "grad_norm": 0.6254390758836766, |
| "learning_rate": 4.962990794052847e-05, |
| "loss": 0.982, |
| "num_input_tokens_seen": 277872640, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.5699745547073791, |
| "grad_norm": 0.6719281453617003, |
| "learning_rate": 4.962700681978725e-05, |
| "loss": 0.9784, |
| "num_input_tokens_seen": 278921216, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.5721173161912415, |
| "grad_norm": 0.7065800364114421, |
| "learning_rate": 4.9624094458032946e-05, |
| "loss": 0.9645, |
| "num_input_tokens_seen": 279969792, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.5742600776751038, |
| "grad_norm": 0.8137810359656845, |
| "learning_rate": 4.962117085659489e-05, |
| "loss": 0.976, |
| "num_input_tokens_seen": 281018368, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.5764028391589661, |
| "grad_norm": 0.8384457143591761, |
| "learning_rate": 4.9618236016807564e-05, |
| "loss": 0.9745, |
| "num_input_tokens_seen": 282066944, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.5785456006428285, |
| "grad_norm": 0.8204715252519866, |
| "learning_rate": 4.9615289940010584e-05, |
| "loss": 0.9593, |
| "num_input_tokens_seen": 283115520, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.5806883621266907, |
| "grad_norm": 0.7458887228441663, |
| "learning_rate": 4.9612332627548686e-05, |
| "loss": 0.9629, |
| "num_input_tokens_seen": 284164096, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.5828311236105531, |
| "grad_norm": 0.7023067986639293, |
| "learning_rate": 4.9609364080771735e-05, |
| "loss": 0.9601, |
| "num_input_tokens_seen": 285212672, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.5849738850944154, |
| "grad_norm": 0.6929007805184092, |
| "learning_rate": 4.960638430103473e-05, |
| "loss": 0.9699, |
| "num_input_tokens_seen": 286261248, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.5871166465782778, |
| "grad_norm": 0.6061285658814772, |
| "learning_rate": 4.96033932896978e-05, |
| "loss": 0.9748, |
| "num_input_tokens_seen": 287309824, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.58925940806214, |
| "grad_norm": 0.6819628224136882, |
| "learning_rate": 4.960039104812618e-05, |
| "loss": 0.967, |
| "num_input_tokens_seen": 288358400, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.5914021695460024, |
| "grad_norm": 0.7578327655133753, |
| "learning_rate": 4.959737757769025e-05, |
| "loss": 0.9697, |
| "num_input_tokens_seen": 289406976, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.5935449310298647, |
| "grad_norm": 0.7930136215053085, |
| "learning_rate": 4.959435287976551e-05, |
| "loss": 0.9798, |
| "num_input_tokens_seen": 290455552, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.5956876925137271, |
| "grad_norm": 0.6902790672018838, |
| "learning_rate": 4.9591316955732595e-05, |
| "loss": 0.9683, |
| "num_input_tokens_seen": 291504128, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.5978304539975894, |
| "grad_norm": 0.7366948695256316, |
| "learning_rate": 4.9588269806977236e-05, |
| "loss": 0.981, |
| "num_input_tokens_seen": 292552704, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.5999732154814518, |
| "grad_norm": 0.7412565614434039, |
| "learning_rate": 4.958521143489032e-05, |
| "loss": 0.9655, |
| "num_input_tokens_seen": 293601280, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.602115976965314, |
| "grad_norm": 0.7638328455546902, |
| "learning_rate": 4.9582141840867835e-05, |
| "loss": 0.9768, |
| "num_input_tokens_seen": 294649856, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.6042587384491763, |
| "grad_norm": 0.7859880911062314, |
| "learning_rate": 4.957906102631091e-05, |
| "loss": 0.9655, |
| "num_input_tokens_seen": 295698432, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.6064014999330387, |
| "grad_norm": 0.7021334869310707, |
| "learning_rate": 4.9575968992625775e-05, |
| "loss": 0.9714, |
| "num_input_tokens_seen": 296747008, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.608544261416901, |
| "grad_norm": 0.7151227917175634, |
| "learning_rate": 4.957286574122379e-05, |
| "loss": 0.9805, |
| "num_input_tokens_seen": 297795584, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.6106870229007634, |
| "grad_norm": 0.8089921958905767, |
| "learning_rate": 4.9569751273521454e-05, |
| "loss": 0.9749, |
| "num_input_tokens_seen": 298844160, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.6128297843846257, |
| "grad_norm": 0.8163799273123393, |
| "learning_rate": 4.956662559094034e-05, |
| "loss": 0.9628, |
| "num_input_tokens_seen": 299892736, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.614972545868488, |
| "grad_norm": 0.7583938434718084, |
| "learning_rate": 4.9563488694907186e-05, |
| "loss": 0.9855, |
| "num_input_tokens_seen": 300941312, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.6171153073523503, |
| "grad_norm": 0.7961064465249695, |
| "learning_rate": 4.9560340586853825e-05, |
| "loss": 0.9812, |
| "num_input_tokens_seen": 301989888, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.6192580688362127, |
| "grad_norm": 0.8537459633314501, |
| "learning_rate": 4.9557181268217227e-05, |
| "loss": 0.9788, |
| "num_input_tokens_seen": 303038464, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.621400830320075, |
| "grad_norm": 0.8608551087009711, |
| "learning_rate": 4.9554010740439435e-05, |
| "loss": 0.9649, |
| "num_input_tokens_seen": 304087040, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.6235435918039374, |
| "grad_norm": 0.8426716239141546, |
| "learning_rate": 4.955082900496766e-05, |
| "loss": 0.9652, |
| "num_input_tokens_seen": 305135616, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.6256863532877996, |
| "grad_norm": 0.8378716840701593, |
| "learning_rate": 4.9547636063254196e-05, |
| "loss": 0.9772, |
| "num_input_tokens_seen": 306184192, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.627829114771662, |
| "grad_norm": 0.7383256729006417, |
| "learning_rate": 4.954443191675648e-05, |
| "loss": 0.968, |
| "num_input_tokens_seen": 307232768, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.6299718762555243, |
| "grad_norm": 0.8321449404267852, |
| "learning_rate": 4.954121656693703e-05, |
| "loss": 0.9608, |
| "num_input_tokens_seen": 308281344, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.6321146377393866, |
| "grad_norm": 0.9058689828945011, |
| "learning_rate": 4.9537990015263505e-05, |
| "loss": 0.9624, |
| "num_input_tokens_seen": 309329920, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.634257399223249, |
| "grad_norm": 0.8558551476463713, |
| "learning_rate": 4.953475226320866e-05, |
| "loss": 0.9759, |
| "num_input_tokens_seen": 310378496, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.6364001607071113, |
| "grad_norm": 0.8994060904273229, |
| "learning_rate": 4.9531503312250375e-05, |
| "loss": 0.9698, |
| "num_input_tokens_seen": 311427072, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.6385429221909736, |
| "grad_norm": 0.9092829036891147, |
| "learning_rate": 4.952824316387163e-05, |
| "loss": 0.9588, |
| "num_input_tokens_seen": 312475648, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.6406856836748359, |
| "grad_norm": 0.7764855569244933, |
| "learning_rate": 4.952497181956053e-05, |
| "loss": 0.9622, |
| "num_input_tokens_seen": 313524224, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.6428284451586983, |
| "grad_norm": 0.7428489302275753, |
| "learning_rate": 4.952168928081027e-05, |
| "loss": 0.9663, |
| "num_input_tokens_seen": 314572800, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.6449712066425606, |
| "grad_norm": 0.6931852532792968, |
| "learning_rate": 4.951839554911917e-05, |
| "loss": 0.9599, |
| "num_input_tokens_seen": 315621376, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.647113968126423, |
| "grad_norm": 0.6380122510191174, |
| "learning_rate": 4.951509062599066e-05, |
| "loss": 0.9696, |
| "num_input_tokens_seen": 316669952, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.6492567296102852, |
| "grad_norm": 0.627484942520017, |
| "learning_rate": 4.951177451293328e-05, |
| "loss": 0.9649, |
| "num_input_tokens_seen": 317718528, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.6513994910941476, |
| "grad_norm": 0.669276814072384, |
| "learning_rate": 4.950844721146066e-05, |
| "loss": 0.9617, |
| "num_input_tokens_seen": 318767104, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.6535422525780099, |
| "grad_norm": 0.6472064020866986, |
| "learning_rate": 4.950510872309155e-05, |
| "loss": 0.9593, |
| "num_input_tokens_seen": 319815680, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.6556850140618722, |
| "grad_norm": 0.6282593277453888, |
| "learning_rate": 4.950175904934982e-05, |
| "loss": 0.9682, |
| "num_input_tokens_seen": 320864256, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.6578277755457346, |
| "grad_norm": 0.6233384774128375, |
| "learning_rate": 4.949839819176442e-05, |
| "loss": 0.9672, |
| "num_input_tokens_seen": 321912832, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.6599705370295968, |
| "grad_norm": 0.6406545710871328, |
| "learning_rate": 4.949502615186941e-05, |
| "loss": 0.9667, |
| "num_input_tokens_seen": 322961408, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.6621132985134592, |
| "grad_norm": 0.66860629438086, |
| "learning_rate": 4.949164293120397e-05, |
| "loss": 0.9672, |
| "num_input_tokens_seen": 324009984, |
| "step": 309 |
| }, |
| { |
| "epoch": 0.6642560599973215, |
| "grad_norm": 0.7221412776198233, |
| "learning_rate": 4.948824853131236e-05, |
| "loss": 0.9768, |
| "num_input_tokens_seen": 325058560, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.6663988214811839, |
| "grad_norm": 0.7520232960695742, |
| "learning_rate": 4.948484295374397e-05, |
| "loss": 0.9636, |
| "num_input_tokens_seen": 326107136, |
| "step": 311 |
| }, |
| { |
| "epoch": 0.6685415829650462, |
| "grad_norm": 0.6404508643255505, |
| "learning_rate": 4.948142620005328e-05, |
| "loss": 0.956, |
| "num_input_tokens_seen": 327155712, |
| "step": 312 |
| }, |
| { |
| "epoch": 0.6706843444489086, |
| "grad_norm": 0.7309479445003378, |
| "learning_rate": 4.947799827179986e-05, |
| "loss": 0.9562, |
| "num_input_tokens_seen": 328204288, |
| "step": 313 |
| }, |
| { |
| "epoch": 0.6728271059327708, |
| "grad_norm": 0.6802683062309518, |
| "learning_rate": 4.9474559170548387e-05, |
| "loss": 0.9746, |
| "num_input_tokens_seen": 329252864, |
| "step": 314 |
| }, |
| { |
| "epoch": 0.6749698674166332, |
| "grad_norm": 0.6778007963541558, |
| "learning_rate": 4.947110889786864e-05, |
| "loss": 0.9578, |
| "num_input_tokens_seen": 330301440, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.6771126289004955, |
| "grad_norm": 0.7269233162497253, |
| "learning_rate": 4.946764745533552e-05, |
| "loss": 0.955, |
| "num_input_tokens_seen": 331350016, |
| "step": 316 |
| }, |
| { |
| "epoch": 0.6792553903843579, |
| "grad_norm": 0.8004146317091602, |
| "learning_rate": 4.9464174844528984e-05, |
| "loss": 0.9601, |
| "num_input_tokens_seen": 332398592, |
| "step": 317 |
| }, |
| { |
| "epoch": 0.6813981518682202, |
| "grad_norm": 0.7753178744802244, |
| "learning_rate": 4.946069106703411e-05, |
| "loss": 0.9594, |
| "num_input_tokens_seen": 333447168, |
| "step": 318 |
| }, |
| { |
| "epoch": 0.6835409133520824, |
| "grad_norm": 0.6815178840652059, |
| "learning_rate": 4.9457196124441073e-05, |
| "loss": 0.9578, |
| "num_input_tokens_seen": 334495744, |
| "step": 319 |
| }, |
| { |
| "epoch": 0.6856836748359448, |
| "grad_norm": 0.686119514860132, |
| "learning_rate": 4.9453690018345144e-05, |
| "loss": 0.9605, |
| "num_input_tokens_seen": 335544320, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.6878264363198071, |
| "grad_norm": 0.6182467278075696, |
| "learning_rate": 4.9450172750346684e-05, |
| "loss": 0.9531, |
| "num_input_tokens_seen": 336592896, |
| "step": 321 |
| }, |
| { |
| "epoch": 0.6899691978036695, |
| "grad_norm": 0.7082354365899891, |
| "learning_rate": 4.944664432205115e-05, |
| "loss": 0.9652, |
| "num_input_tokens_seen": 337641472, |
| "step": 322 |
| }, |
| { |
| "epoch": 0.6921119592875318, |
| "grad_norm": 0.6792138640035731, |
| "learning_rate": 4.944310473506911e-05, |
| "loss": 0.9535, |
| "num_input_tokens_seen": 338690048, |
| "step": 323 |
| }, |
| { |
| "epoch": 0.6942547207713942, |
| "grad_norm": 0.6443309953592322, |
| "learning_rate": 4.9439553991016187e-05, |
| "loss": 0.9659, |
| "num_input_tokens_seen": 339738624, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.6963974822552564, |
| "grad_norm": 0.6884219897088883, |
| "learning_rate": 4.943599209151314e-05, |
| "loss": 0.9626, |
| "num_input_tokens_seen": 340787200, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.6985402437391188, |
| "grad_norm": 0.7474123707956007, |
| "learning_rate": 4.9432419038185794e-05, |
| "loss": 0.9579, |
| "num_input_tokens_seen": 341835776, |
| "step": 326 |
| }, |
| { |
| "epoch": 0.7006830052229811, |
| "grad_norm": 0.8001439442798239, |
| "learning_rate": 4.942883483266507e-05, |
| "loss": 0.9585, |
| "num_input_tokens_seen": 342884352, |
| "step": 327 |
| }, |
| { |
| "epoch": 0.7028257667068435, |
| "grad_norm": 0.6776134193455201, |
| "learning_rate": 4.942523947658698e-05, |
| "loss": 0.9584, |
| "num_input_tokens_seen": 343932928, |
| "step": 328 |
| }, |
| { |
| "epoch": 0.7049685281907058, |
| "grad_norm": 0.6444901731086969, |
| "learning_rate": 4.942163297159263e-05, |
| "loss": 0.9438, |
| "num_input_tokens_seen": 344981504, |
| "step": 329 |
| }, |
| { |
| "epoch": 0.707111289674568, |
| "grad_norm": 0.6635883359485919, |
| "learning_rate": 4.9418015319328204e-05, |
| "loss": 0.9524, |
| "num_input_tokens_seen": 346030080, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.7092540511584304, |
| "grad_norm": 0.6682935448210751, |
| "learning_rate": 4.9414386521445e-05, |
| "loss": 0.956, |
| "num_input_tokens_seen": 347078656, |
| "step": 331 |
| }, |
| { |
| "epoch": 0.7113968126422927, |
| "grad_norm": 0.5451374703215448, |
| "learning_rate": 4.941074657959937e-05, |
| "loss": 0.9568, |
| "num_input_tokens_seen": 348127232, |
| "step": 332 |
| }, |
| { |
| "epoch": 0.7135395741261551, |
| "grad_norm": 0.6617882607114642, |
| "learning_rate": 4.940709549545276e-05, |
| "loss": 0.9788, |
| "num_input_tokens_seen": 349175808, |
| "step": 333 |
| }, |
| { |
| "epoch": 0.7156823356100174, |
| "grad_norm": 0.7663700039234058, |
| "learning_rate": 4.940343327067172e-05, |
| "loss": 0.9611, |
| "num_input_tokens_seen": 350224384, |
| "step": 334 |
| }, |
| { |
| "epoch": 0.7178250970938798, |
| "grad_norm": 0.7106293208476724, |
| "learning_rate": 4.939975990692789e-05, |
| "loss": 0.9433, |
| "num_input_tokens_seen": 351272960, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.719967858577742, |
| "grad_norm": 0.8812609616643794, |
| "learning_rate": 4.939607540589795e-05, |
| "loss": 0.9522, |
| "num_input_tokens_seen": 352321536, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.7221106200616044, |
| "grad_norm": 1.180776889607567, |
| "learning_rate": 4.9392379769263716e-05, |
| "loss": 0.9644, |
| "num_input_tokens_seen": 353370112, |
| "step": 337 |
| }, |
| { |
| "epoch": 0.7242533815454667, |
| "grad_norm": 0.9180392680374633, |
| "learning_rate": 4.9388672998712046e-05, |
| "loss": 0.9498, |
| "num_input_tokens_seen": 354418688, |
| "step": 338 |
| }, |
| { |
| "epoch": 0.7263961430293291, |
| "grad_norm": 0.686049375541203, |
| "learning_rate": 4.938495509593492e-05, |
| "loss": 0.9603, |
| "num_input_tokens_seen": 355467264, |
| "step": 339 |
| }, |
| { |
| "epoch": 0.7285389045131914, |
| "grad_norm": 0.6408315854104981, |
| "learning_rate": 4.938122606262936e-05, |
| "loss": 0.951, |
| "num_input_tokens_seen": 356515840, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.7306816659970538, |
| "grad_norm": 0.7454673929992747, |
| "learning_rate": 4.9377485900497476e-05, |
| "loss": 0.946, |
| "num_input_tokens_seen": 357564416, |
| "step": 341 |
| }, |
| { |
| "epoch": 0.732824427480916, |
| "grad_norm": 0.807025821794171, |
| "learning_rate": 4.937373461124649e-05, |
| "loss": 0.9694, |
| "num_input_tokens_seen": 358612992, |
| "step": 342 |
| }, |
| { |
| "epoch": 0.7349671889647783, |
| "grad_norm": 1.0392775338900357, |
| "learning_rate": 4.9369972196588676e-05, |
| "loss": 0.9606, |
| "num_input_tokens_seen": 359661568, |
| "step": 343 |
| }, |
| { |
| "epoch": 0.7371099504486407, |
| "grad_norm": 0.8056437211626823, |
| "learning_rate": 4.936619865824138e-05, |
| "loss": 0.9494, |
| "num_input_tokens_seen": 360710144, |
| "step": 344 |
| }, |
| { |
| "epoch": 0.739252711932503, |
| "grad_norm": 0.6692576799616317, |
| "learning_rate": 4.936241399792705e-05, |
| "loss": 0.9471, |
| "num_input_tokens_seen": 361758720, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.7413954734163654, |
| "grad_norm": 0.6439662667669066, |
| "learning_rate": 4.935861821737318e-05, |
| "loss": 0.9518, |
| "num_input_tokens_seen": 362807296, |
| "step": 346 |
| }, |
| { |
| "epoch": 0.7435382349002276, |
| "grad_norm": 0.6971274641706622, |
| "learning_rate": 4.9354811318312367e-05, |
| "loss": 0.958, |
| "num_input_tokens_seen": 363855872, |
| "step": 347 |
| }, |
| { |
| "epoch": 0.74568099638409, |
| "grad_norm": 0.8246209023489939, |
| "learning_rate": 4.935099330248227e-05, |
| "loss": 0.9575, |
| "num_input_tokens_seen": 364904448, |
| "step": 348 |
| }, |
| { |
| "epoch": 0.7478237578679523, |
| "grad_norm": 0.6454804197784619, |
| "learning_rate": 4.934716417162563e-05, |
| "loss": 0.9527, |
| "num_input_tokens_seen": 365953024, |
| "step": 349 |
| }, |
| { |
| "epoch": 0.7499665193518147, |
| "grad_norm": 0.7907007746191874, |
| "learning_rate": 4.934332392749025e-05, |
| "loss": 0.9534, |
| "num_input_tokens_seen": 367001600, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.752109280835677, |
| "grad_norm": 0.8537019071397235, |
| "learning_rate": 4.933947257182901e-05, |
| "loss": 0.9618, |
| "num_input_tokens_seen": 368050176, |
| "step": 351 |
| }, |
| { |
| "epoch": 0.7542520423195394, |
| "grad_norm": 0.8452849854896484, |
| "learning_rate": 4.9335610106399864e-05, |
| "loss": 0.9563, |
| "num_input_tokens_seen": 369098752, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.7563948038034016, |
| "grad_norm": 0.8365818755601035, |
| "learning_rate": 4.933173653296585e-05, |
| "loss": 0.9433, |
| "num_input_tokens_seen": 370147328, |
| "step": 353 |
| }, |
| { |
| "epoch": 0.758537565287264, |
| "grad_norm": 0.8843871389931269, |
| "learning_rate": 4.932785185329505e-05, |
| "loss": 0.9634, |
| "num_input_tokens_seen": 371195904, |
| "step": 354 |
| }, |
| { |
| "epoch": 0.7606803267711263, |
| "grad_norm": 0.7011261167297714, |
| "learning_rate": 4.932395606916062e-05, |
| "loss": 0.9546, |
| "num_input_tokens_seen": 372244480, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.7628230882549886, |
| "grad_norm": 0.7236664874887737, |
| "learning_rate": 4.932004918234082e-05, |
| "loss": 0.9405, |
| "num_input_tokens_seen": 373293056, |
| "step": 356 |
| }, |
| { |
| "epoch": 0.764965849738851, |
| "grad_norm": 0.6365713819464895, |
| "learning_rate": 4.931613119461893e-05, |
| "loss": 0.9456, |
| "num_input_tokens_seen": 374341632, |
| "step": 357 |
| }, |
| { |
| "epoch": 0.7671086112227132, |
| "grad_norm": 0.7016893089903377, |
| "learning_rate": 4.931220210778332e-05, |
| "loss": 0.9578, |
| "num_input_tokens_seen": 375390208, |
| "step": 358 |
| }, |
| { |
| "epoch": 0.7692513727065756, |
| "grad_norm": 0.604362477542579, |
| "learning_rate": 4.930826192362744e-05, |
| "loss": 0.9397, |
| "num_input_tokens_seen": 376438784, |
| "step": 359 |
| }, |
| { |
| "epoch": 0.7713941341904379, |
| "grad_norm": 0.5848016990696451, |
| "learning_rate": 4.930431064394977e-05, |
| "loss": 0.9595, |
| "num_input_tokens_seen": 377487360, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.7735368956743003, |
| "grad_norm": 0.5429284750696663, |
| "learning_rate": 4.930034827055388e-05, |
| "loss": 0.9411, |
| "num_input_tokens_seen": 378535936, |
| "step": 361 |
| }, |
| { |
| "epoch": 0.7756796571581626, |
| "grad_norm": 0.7012823730956075, |
| "learning_rate": 4.92963748052484e-05, |
| "loss": 0.946, |
| "num_input_tokens_seen": 379584512, |
| "step": 362 |
| }, |
| { |
| "epoch": 0.777822418642025, |
| "grad_norm": 0.8647844578965358, |
| "learning_rate": 4.929239024984702e-05, |
| "loss": 0.9537, |
| "num_input_tokens_seen": 380633088, |
| "step": 363 |
| }, |
| { |
| "epoch": 0.7799651801258872, |
| "grad_norm": 0.9492005933550962, |
| "learning_rate": 4.9288394606168494e-05, |
| "loss": 0.9538, |
| "num_input_tokens_seen": 381681664, |
| "step": 364 |
| }, |
| { |
| "epoch": 0.7821079416097496, |
| "grad_norm": 0.9833441664810633, |
| "learning_rate": 4.928438787603664e-05, |
| "loss": 0.9551, |
| "num_input_tokens_seen": 382730240, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.7842507030936119, |
| "grad_norm": 0.874577604103094, |
| "learning_rate": 4.928037006128032e-05, |
| "loss": 0.9536, |
| "num_input_tokens_seen": 383778816, |
| "step": 366 |
| }, |
| { |
| "epoch": 0.7863934645774742, |
| "grad_norm": 0.6387335904967489, |
| "learning_rate": 4.927634116373349e-05, |
| "loss": 0.9408, |
| "num_input_tokens_seen": 384827392, |
| "step": 367 |
| }, |
| { |
| "epoch": 0.7885362260613366, |
| "grad_norm": 0.6243565041547219, |
| "learning_rate": 4.9272301185235116e-05, |
| "loss": 0.9435, |
| "num_input_tokens_seen": 385875968, |
| "step": 368 |
| }, |
| { |
| "epoch": 0.7906789875451988, |
| "grad_norm": 0.669927371235254, |
| "learning_rate": 4.9268250127629265e-05, |
| "loss": 0.95, |
| "num_input_tokens_seen": 386924544, |
| "step": 369 |
| }, |
| { |
| "epoch": 0.7928217490290612, |
| "grad_norm": 0.6159925821864788, |
| "learning_rate": 4.926418799276504e-05, |
| "loss": 0.9403, |
| "num_input_tokens_seen": 387973120, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.7949645105129235, |
| "grad_norm": 0.5805674830639145, |
| "learning_rate": 4.926011478249661e-05, |
| "loss": 0.9489, |
| "num_input_tokens_seen": 389021696, |
| "step": 371 |
| }, |
| { |
| "epoch": 0.7971072719967859, |
| "grad_norm": 0.5990935108596377, |
| "learning_rate": 4.925603049868319e-05, |
| "loss": 0.9333, |
| "num_input_tokens_seen": 390070272, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.7992500334806482, |
| "grad_norm": 0.56780569716724, |
| "learning_rate": 4.925193514318906e-05, |
| "loss": 0.9524, |
| "num_input_tokens_seen": 391118848, |
| "step": 373 |
| }, |
| { |
| "epoch": 0.8013927949645105, |
| "grad_norm": 0.5555146256374626, |
| "learning_rate": 4.924782871788354e-05, |
| "loss": 0.9455, |
| "num_input_tokens_seen": 392167424, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.8035355564483728, |
| "grad_norm": 0.6025921472581821, |
| "learning_rate": 4.924371122464101e-05, |
| "loss": 0.9502, |
| "num_input_tokens_seen": 393216000, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.8056783179322352, |
| "grad_norm": 0.6412503310742468, |
| "learning_rate": 4.923958266534091e-05, |
| "loss": 0.9553, |
| "num_input_tokens_seen": 394264576, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.8078210794160975, |
| "grad_norm": 0.6035262643666147, |
| "learning_rate": 4.923544304186771e-05, |
| "loss": 0.9462, |
| "num_input_tokens_seen": 395313152, |
| "step": 377 |
| }, |
| { |
| "epoch": 0.8099638408999599, |
| "grad_norm": 0.6773878542789623, |
| "learning_rate": 4.923129235611096e-05, |
| "loss": 0.9484, |
| "num_input_tokens_seen": 396361728, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.8121066023838222, |
| "grad_norm": 0.7181038641332558, |
| "learning_rate": 4.922713060996524e-05, |
| "loss": 0.9452, |
| "num_input_tokens_seen": 397410304, |
| "step": 379 |
| }, |
| { |
| "epoch": 0.8142493638676844, |
| "grad_norm": 0.719797180992769, |
| "learning_rate": 4.922295780533017e-05, |
| "loss": 0.9433, |
| "num_input_tokens_seen": 398458880, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.8163921253515468, |
| "grad_norm": 0.6697588054636289, |
| "learning_rate": 4.921877394411045e-05, |
| "loss": 0.9538, |
| "num_input_tokens_seen": 399507456, |
| "step": 381 |
| }, |
| { |
| "epoch": 0.8185348868354091, |
| "grad_norm": 0.6623009304845271, |
| "learning_rate": 4.9214579028215776e-05, |
| "loss": 0.9482, |
| "num_input_tokens_seen": 400556032, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.8206776483192715, |
| "grad_norm": 0.6039497437674582, |
| "learning_rate": 4.921037305956095e-05, |
| "loss": 0.9536, |
| "num_input_tokens_seen": 401604608, |
| "step": 383 |
| }, |
| { |
| "epoch": 0.8228204098031338, |
| "grad_norm": 0.7653003468169798, |
| "learning_rate": 4.920615604006578e-05, |
| "loss": 0.9423, |
| "num_input_tokens_seen": 402653184, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.8249631712869961, |
| "grad_norm": 0.7904738063345222, |
| "learning_rate": 4.920192797165511e-05, |
| "loss": 0.9347, |
| "num_input_tokens_seen": 403701760, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.8271059327708584, |
| "grad_norm": 0.7924189341019402, |
| "learning_rate": 4.919768885625887e-05, |
| "loss": 0.9454, |
| "num_input_tokens_seen": 404750336, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.8292486942547208, |
| "grad_norm": 0.8312139658612682, |
| "learning_rate": 4.9193438695811985e-05, |
| "loss": 0.9386, |
| "num_input_tokens_seen": 405798912, |
| "step": 387 |
| }, |
| { |
| "epoch": 0.8313914557385831, |
| "grad_norm": 0.8646683775446922, |
| "learning_rate": 4.9189177492254455e-05, |
| "loss": 0.9392, |
| "num_input_tokens_seen": 406847488, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.8335342172224455, |
| "grad_norm": 1.0294254039782134, |
| "learning_rate": 4.9184905247531316e-05, |
| "loss": 0.9483, |
| "num_input_tokens_seen": 407896064, |
| "step": 389 |
| }, |
| { |
| "epoch": 0.8356769787063077, |
| "grad_norm": 0.8744877993208896, |
| "learning_rate": 4.918062196359263e-05, |
| "loss": 0.945, |
| "num_input_tokens_seen": 408944640, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.8378197401901701, |
| "grad_norm": 0.7205016171684795, |
| "learning_rate": 4.917632764239349e-05, |
| "loss": 0.9406, |
| "num_input_tokens_seen": 409993216, |
| "step": 391 |
| }, |
| { |
| "epoch": 0.8399625016740324, |
| "grad_norm": 0.8646230649226543, |
| "learning_rate": 4.9172022285894074e-05, |
| "loss": 0.9425, |
| "num_input_tokens_seen": 411041792, |
| "step": 392 |
| }, |
| { |
| "epoch": 0.8421052631578947, |
| "grad_norm": 0.72954927450801, |
| "learning_rate": 4.9167705896059527e-05, |
| "loss": 0.9375, |
| "num_input_tokens_seen": 412090368, |
| "step": 393 |
| }, |
| { |
| "epoch": 0.8442480246417571, |
| "grad_norm": 0.6048658838544064, |
| "learning_rate": 4.91633784748601e-05, |
| "loss": 0.9411, |
| "num_input_tokens_seen": 413138944, |
| "step": 394 |
| }, |
| { |
| "epoch": 0.8463907861256194, |
| "grad_norm": 0.6584812604484274, |
| "learning_rate": 4.915904002427103e-05, |
| "loss": 0.9346, |
| "num_input_tokens_seen": 414187520, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.8485335476094817, |
| "grad_norm": 0.771775334405825, |
| "learning_rate": 4.9154690546272606e-05, |
| "loss": 0.9435, |
| "num_input_tokens_seen": 415236096, |
| "step": 396 |
| }, |
| { |
| "epoch": 0.850676309093344, |
| "grad_norm": 0.7257743982464717, |
| "learning_rate": 4.9150330042850155e-05, |
| "loss": 0.9411, |
| "num_input_tokens_seen": 416284672, |
| "step": 397 |
| }, |
| { |
| "epoch": 0.8528190705772064, |
| "grad_norm": 0.6471296513959546, |
| "learning_rate": 4.9145958515994025e-05, |
| "loss": 0.9423, |
| "num_input_tokens_seen": 417333248, |
| "step": 398 |
| }, |
| { |
| "epoch": 0.8549618320610687, |
| "grad_norm": 0.6905884924419747, |
| "learning_rate": 4.914157596769962e-05, |
| "loss": 0.9478, |
| "num_input_tokens_seen": 418381824, |
| "step": 399 |
| }, |
| { |
| "epoch": 0.8571045935449311, |
| "grad_norm": 0.8067538886359611, |
| "learning_rate": 4.9137182399967343e-05, |
| "loss": 0.9418, |
| "num_input_tokens_seen": 419430400, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.8592473550287933, |
| "grad_norm": 0.6249988394159846, |
| "learning_rate": 4.9132777814802634e-05, |
| "loss": 0.9351, |
| "num_input_tokens_seen": 420478976, |
| "step": 401 |
| }, |
| { |
| "epoch": 0.8613901165126557, |
| "grad_norm": 0.6847816284919354, |
| "learning_rate": 4.9128362214215986e-05, |
| "loss": 0.9385, |
| "num_input_tokens_seen": 421527552, |
| "step": 402 |
| }, |
| { |
| "epoch": 0.863532877996518, |
| "grad_norm": 0.832176639221433, |
| "learning_rate": 4.912393560022288e-05, |
| "loss": 0.9468, |
| "num_input_tokens_seen": 422576128, |
| "step": 403 |
| }, |
| { |
| "epoch": 0.8656756394803803, |
| "grad_norm": 0.6997745179543038, |
| "learning_rate": 4.911949797484388e-05, |
| "loss": 0.9327, |
| "num_input_tokens_seen": 423624704, |
| "step": 404 |
| }, |
| { |
| "epoch": 0.8678184009642427, |
| "grad_norm": 0.6321940710919781, |
| "learning_rate": 4.9115049340104505e-05, |
| "loss": 0.9351, |
| "num_input_tokens_seen": 424673280, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.869961162448105, |
| "grad_norm": 0.7060643613594743, |
| "learning_rate": 4.911058969803536e-05, |
| "loss": 0.945, |
| "num_input_tokens_seen": 425721856, |
| "step": 406 |
| }, |
| { |
| "epoch": 0.8721039239319673, |
| "grad_norm": 0.6065745860189082, |
| "learning_rate": 4.910611905067205e-05, |
| "loss": 0.9407, |
| "num_input_tokens_seen": 426770432, |
| "step": 407 |
| }, |
| { |
| "epoch": 0.8742466854158296, |
| "grad_norm": 0.5015704205138852, |
| "learning_rate": 4.91016374000552e-05, |
| "loss": 0.9305, |
| "num_input_tokens_seen": 427819008, |
| "step": 408 |
| }, |
| { |
| "epoch": 0.876389446899692, |
| "grad_norm": 0.6567251162870986, |
| "learning_rate": 4.909714474823047e-05, |
| "loss": 0.951, |
| "num_input_tokens_seen": 428867584, |
| "step": 409 |
| }, |
| { |
| "epoch": 0.8785322083835543, |
| "grad_norm": 0.5920933662760173, |
| "learning_rate": 4.909264109724853e-05, |
| "loss": 0.9315, |
| "num_input_tokens_seen": 429916160, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.8806749698674167, |
| "grad_norm": 0.660025661812582, |
| "learning_rate": 4.9088126449165065e-05, |
| "loss": 0.9308, |
| "num_input_tokens_seen": 430964736, |
| "step": 411 |
| }, |
| { |
| "epoch": 0.882817731351279, |
| "grad_norm": 0.7662454510330629, |
| "learning_rate": 4.90836008060408e-05, |
| "loss": 0.9428, |
| "num_input_tokens_seen": 432013312, |
| "step": 412 |
| }, |
| { |
| "epoch": 0.8849604928351413, |
| "grad_norm": 0.6707054047645358, |
| "learning_rate": 4.907906416994146e-05, |
| "loss": 0.9405, |
| "num_input_tokens_seen": 433061888, |
| "step": 413 |
| }, |
| { |
| "epoch": 0.8871032543190036, |
| "grad_norm": 0.7475341888198394, |
| "learning_rate": 4.9074516542937795e-05, |
| "loss": 0.9479, |
| "num_input_tokens_seen": 434110464, |
| "step": 414 |
| }, |
| { |
| "epoch": 0.889246015802866, |
| "grad_norm": 0.7247188513031333, |
| "learning_rate": 4.9069957927105586e-05, |
| "loss": 0.9348, |
| "num_input_tokens_seen": 435159040, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.8913887772867283, |
| "grad_norm": 0.5567338151611357, |
| "learning_rate": 4.906538832452561e-05, |
| "loss": 0.9292, |
| "num_input_tokens_seen": 436207616, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.8935315387705905, |
| "grad_norm": 0.6675238484825794, |
| "learning_rate": 4.9060807737283656e-05, |
| "loss": 0.9354, |
| "num_input_tokens_seen": 437256192, |
| "step": 417 |
| }, |
| { |
| "epoch": 0.8956743002544529, |
| "grad_norm": 0.6283836886644354, |
| "learning_rate": 4.905621616747054e-05, |
| "loss": 0.9351, |
| "num_input_tokens_seen": 438304768, |
| "step": 418 |
| }, |
| { |
| "epoch": 0.8978170617383152, |
| "grad_norm": 0.6058437294929521, |
| "learning_rate": 4.905161361718209e-05, |
| "loss": 0.9309, |
| "num_input_tokens_seen": 439353344, |
| "step": 419 |
| }, |
| { |
| "epoch": 0.8999598232221776, |
| "grad_norm": 0.6772602115603277, |
| "learning_rate": 4.9047000088519144e-05, |
| "loss": 0.9384, |
| "num_input_tokens_seen": 440401920, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.9021025847060399, |
| "grad_norm": 0.7034846488508698, |
| "learning_rate": 4.9042375583587555e-05, |
| "loss": 0.9519, |
| "num_input_tokens_seen": 441450496, |
| "step": 421 |
| }, |
| { |
| "epoch": 0.9042453461899023, |
| "grad_norm": 0.6874699025267311, |
| "learning_rate": 4.9037740104498166e-05, |
| "loss": 0.9281, |
| "num_input_tokens_seen": 442499072, |
| "step": 422 |
| }, |
| { |
| "epoch": 0.9063881076737645, |
| "grad_norm": 0.6359007946203304, |
| "learning_rate": 4.903309365336686e-05, |
| "loss": 0.939, |
| "num_input_tokens_seen": 443547648, |
| "step": 423 |
| }, |
| { |
| "epoch": 0.9085308691576269, |
| "grad_norm": 0.7006661263512417, |
| "learning_rate": 4.90284362323145e-05, |
| "loss": 0.9345, |
| "num_input_tokens_seen": 444596224, |
| "step": 424 |
| }, |
| { |
| "epoch": 0.9106736306414892, |
| "grad_norm": 0.5187269941294566, |
| "learning_rate": 4.902376784346697e-05, |
| "loss": 0.9414, |
| "num_input_tokens_seen": 445644800, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.9128163921253516, |
| "grad_norm": 0.6305715854234033, |
| "learning_rate": 4.901908848895517e-05, |
| "loss": 0.938, |
| "num_input_tokens_seen": 446693376, |
| "step": 426 |
| }, |
| { |
| "epoch": 0.9149591536092139, |
| "grad_norm": 0.657768269603032, |
| "learning_rate": 4.901439817091499e-05, |
| "loss": 0.9359, |
| "num_input_tokens_seen": 447741952, |
| "step": 427 |
| }, |
| { |
| "epoch": 0.9171019150930761, |
| "grad_norm": 0.7053589096005869, |
| "learning_rate": 4.9009696891487325e-05, |
| "loss": 0.9418, |
| "num_input_tokens_seen": 448790528, |
| "step": 428 |
| }, |
| { |
| "epoch": 0.9192446765769385, |
| "grad_norm": 0.7187736478194787, |
| "learning_rate": 4.9004984652818076e-05, |
| "loss": 0.9338, |
| "num_input_tokens_seen": 449839104, |
| "step": 429 |
| }, |
| { |
| "epoch": 0.9213874380608008, |
| "grad_norm": 0.5897812937476063, |
| "learning_rate": 4.900026145705815e-05, |
| "loss": 0.9425, |
| "num_input_tokens_seen": 450887680, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.9235301995446632, |
| "grad_norm": 0.5467959396908199, |
| "learning_rate": 4.899552730636345e-05, |
| "loss": 0.9276, |
| "num_input_tokens_seen": 451936256, |
| "step": 431 |
| }, |
| { |
| "epoch": 0.9256729610285255, |
| "grad_norm": 0.6600294586377221, |
| "learning_rate": 4.899078220289489e-05, |
| "loss": 0.938, |
| "num_input_tokens_seen": 452984832, |
| "step": 432 |
| }, |
| { |
| "epoch": 0.9278157225123879, |
| "grad_norm": 0.7533489644860528, |
| "learning_rate": 4.898602614881836e-05, |
| "loss": 0.9408, |
| "num_input_tokens_seen": 454033408, |
| "step": 433 |
| }, |
| { |
| "epoch": 0.9299584839962501, |
| "grad_norm": 0.7688826889229265, |
| "learning_rate": 4.898125914630479e-05, |
| "loss": 0.9416, |
| "num_input_tokens_seen": 455081984, |
| "step": 434 |
| }, |
| { |
| "epoch": 0.9321012454801125, |
| "grad_norm": 0.8178732789471899, |
| "learning_rate": 4.897648119753006e-05, |
| "loss": 0.9349, |
| "num_input_tokens_seen": 456130560, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.9342440069639748, |
| "grad_norm": 0.8805822989196023, |
| "learning_rate": 4.897169230467506e-05, |
| "loss": 0.9398, |
| "num_input_tokens_seen": 457179136, |
| "step": 436 |
| }, |
| { |
| "epoch": 0.9363867684478372, |
| "grad_norm": 0.7148664306953607, |
| "learning_rate": 4.896689246992572e-05, |
| "loss": 0.9288, |
| "num_input_tokens_seen": 458227712, |
| "step": 437 |
| }, |
| { |
| "epoch": 0.9385295299316995, |
| "grad_norm": 0.620777716602536, |
| "learning_rate": 4.8962081695472886e-05, |
| "loss": 0.937, |
| "num_input_tokens_seen": 459276288, |
| "step": 438 |
| }, |
| { |
| "epoch": 0.9406722914155619, |
| "grad_norm": 0.780179843380168, |
| "learning_rate": 4.895725998351246e-05, |
| "loss": 0.9282, |
| "num_input_tokens_seen": 460324864, |
| "step": 439 |
| }, |
| { |
| "epoch": 0.9428150528994241, |
| "grad_norm": 0.7098688144002365, |
| "learning_rate": 4.8952427336245324e-05, |
| "loss": 0.9205, |
| "num_input_tokens_seen": 461373440, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.9449578143832864, |
| "grad_norm": 0.8388395028850127, |
| "learning_rate": 4.894758375587733e-05, |
| "loss": 0.9298, |
| "num_input_tokens_seen": 462422016, |
| "step": 441 |
| }, |
| { |
| "epoch": 0.9471005758671488, |
| "grad_norm": 1.0011263471423495, |
| "learning_rate": 4.894272924461932e-05, |
| "loss": 0.9339, |
| "num_input_tokens_seen": 463470592, |
| "step": 442 |
| }, |
| { |
| "epoch": 0.9492433373510111, |
| "grad_norm": 0.9116591107624027, |
| "learning_rate": 4.8937863804687165e-05, |
| "loss": 0.9286, |
| "num_input_tokens_seen": 464519168, |
| "step": 443 |
| }, |
| { |
| "epoch": 0.9513860988348735, |
| "grad_norm": 0.6734428440804721, |
| "learning_rate": 4.893298743830168e-05, |
| "loss": 0.945, |
| "num_input_tokens_seen": 465567744, |
| "step": 444 |
| }, |
| { |
| "epoch": 0.9535288603187357, |
| "grad_norm": 0.7235503104955481, |
| "learning_rate": 4.89281001476887e-05, |
| "loss": 0.9435, |
| "num_input_tokens_seen": 466616320, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.9556716218025981, |
| "grad_norm": 0.6810287431117885, |
| "learning_rate": 4.892320193507902e-05, |
| "loss": 0.9329, |
| "num_input_tokens_seen": 467664896, |
| "step": 446 |
| }, |
| { |
| "epoch": 0.9578143832864604, |
| "grad_norm": 0.8494818139907088, |
| "learning_rate": 4.8918292802708445e-05, |
| "loss": 0.9434, |
| "num_input_tokens_seen": 468713472, |
| "step": 447 |
| }, |
| { |
| "epoch": 0.9599571447703228, |
| "grad_norm": 0.7794460661595467, |
| "learning_rate": 4.891337275281774e-05, |
| "loss": 0.9313, |
| "num_input_tokens_seen": 469762048, |
| "step": 448 |
| }, |
| { |
| "epoch": 0.9620999062541851, |
| "grad_norm": 0.6458068716390041, |
| "learning_rate": 4.890844178765267e-05, |
| "loss": 0.9339, |
| "num_input_tokens_seen": 470810624, |
| "step": 449 |
| }, |
| { |
| "epoch": 0.9642426677380475, |
| "grad_norm": 0.7530678772967354, |
| "learning_rate": 4.8903499909463966e-05, |
| "loss": 0.9381, |
| "num_input_tokens_seen": 471859200, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.9663854292219097, |
| "grad_norm": 0.8522366384371276, |
| "learning_rate": 4.889854712050737e-05, |
| "loss": 0.9326, |
| "num_input_tokens_seen": 472907776, |
| "step": 451 |
| }, |
| { |
| "epoch": 0.9685281907057721, |
| "grad_norm": 0.6897177044990395, |
| "learning_rate": 4.8893583423043574e-05, |
| "loss": 0.939, |
| "num_input_tokens_seen": 473956352, |
| "step": 452 |
| }, |
| { |
| "epoch": 0.9706709521896344, |
| "grad_norm": 0.5452623417030903, |
| "learning_rate": 4.888860881933826e-05, |
| "loss": 0.932, |
| "num_input_tokens_seen": 475004928, |
| "step": 453 |
| }, |
| { |
| "epoch": 0.9728137136734967, |
| "grad_norm": 0.7408247505392889, |
| "learning_rate": 4.888362331166211e-05, |
| "loss": 0.9306, |
| "num_input_tokens_seen": 476053504, |
| "step": 454 |
| }, |
| { |
| "epoch": 0.9749564751573591, |
| "grad_norm": 0.8159380899572539, |
| "learning_rate": 4.887862690229073e-05, |
| "loss": 0.9338, |
| "num_input_tokens_seen": 477102080, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.9770992366412213, |
| "grad_norm": 0.6775152425008051, |
| "learning_rate": 4.887361959350475e-05, |
| "loss": 0.9313, |
| "num_input_tokens_seen": 478150656, |
| "step": 456 |
| }, |
| { |
| "epoch": 0.9792419981250837, |
| "grad_norm": 0.7643871285334946, |
| "learning_rate": 4.8868601387589765e-05, |
| "loss": 0.9292, |
| "num_input_tokens_seen": 479199232, |
| "step": 457 |
| }, |
| { |
| "epoch": 0.981384759608946, |
| "grad_norm": 0.7359268386748409, |
| "learning_rate": 4.8863572286836324e-05, |
| "loss": 0.9371, |
| "num_input_tokens_seen": 480247808, |
| "step": 458 |
| }, |
| { |
| "epoch": 0.9835275210928084, |
| "grad_norm": 0.7367735699896256, |
| "learning_rate": 4.885853229353998e-05, |
| "loss": 0.9165, |
| "num_input_tokens_seen": 481296384, |
| "step": 459 |
| }, |
| { |
| "epoch": 0.9856702825766707, |
| "grad_norm": 0.7300756128487073, |
| "learning_rate": 4.885348141000122e-05, |
| "loss": 0.935, |
| "num_input_tokens_seen": 482344960, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.987813044060533, |
| "grad_norm": 0.6381716292470745, |
| "learning_rate": 4.8848419638525545e-05, |
| "loss": 0.9207, |
| "num_input_tokens_seen": 483393536, |
| "step": 461 |
| }, |
| { |
| "epoch": 0.9899558055443953, |
| "grad_norm": 0.6121363602754881, |
| "learning_rate": 4.884334698142339e-05, |
| "loss": 0.9297, |
| "num_input_tokens_seen": 484442112, |
| "step": 462 |
| }, |
| { |
| "epoch": 0.9920985670282577, |
| "grad_norm": 0.5846344306782465, |
| "learning_rate": 4.8838263441010186e-05, |
| "loss": 0.9317, |
| "num_input_tokens_seen": 485490688, |
| "step": 463 |
| }, |
| { |
| "epoch": 0.99424132851212, |
| "grad_norm": 0.5734102323548795, |
| "learning_rate": 4.88331690196063e-05, |
| "loss": 0.9265, |
| "num_input_tokens_seen": 486539264, |
| "step": 464 |
| }, |
| { |
| "epoch": 0.9963840899959823, |
| "grad_norm": 0.6569092711755317, |
| "learning_rate": 4.88280637195371e-05, |
| "loss": 0.9315, |
| "num_input_tokens_seen": 487587840, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.9985268514798447, |
| "grad_norm": 0.5544483498666563, |
| "learning_rate": 4.882294754313289e-05, |
| "loss": 0.9337, |
| "num_input_tokens_seen": 488636416, |
| "step": 466 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.6728502195291717, |
| "learning_rate": 4.881782049272896e-05, |
| "loss": 0.9227, |
| "num_input_tokens_seen": 489357312, |
| "step": 467 |
| }, |
| { |
| "epoch": 1.0021427614838623, |
| "grad_norm": 1.1178061816772684, |
| "learning_rate": 4.8812682570665556e-05, |
| "loss": 0.7839, |
| "num_input_tokens_seen": 490405888, |
| "step": 468 |
| }, |
| { |
| "epoch": 1.0042855229677246, |
| "grad_norm": 1.0921873795233088, |
| "learning_rate": 4.880753377928788e-05, |
| "loss": 0.7834, |
| "num_input_tokens_seen": 491454464, |
| "step": 469 |
| }, |
| { |
| "epoch": 1.006428284451587, |
| "grad_norm": 1.1287773013748974, |
| "learning_rate": 4.880237412094611e-05, |
| "loss": 0.7761, |
| "num_input_tokens_seen": 492503040, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.0085710459354493, |
| "grad_norm": 1.0631852319209654, |
| "learning_rate": 4.879720359799537e-05, |
| "loss": 0.7684, |
| "num_input_tokens_seen": 493551616, |
| "step": 471 |
| }, |
| { |
| "epoch": 1.0107138074193116, |
| "grad_norm": 1.0776720184821373, |
| "learning_rate": 4.879202221279575e-05, |
| "loss": 0.7701, |
| "num_input_tokens_seen": 494600192, |
| "step": 472 |
| }, |
| { |
| "epoch": 1.0128565689031739, |
| "grad_norm": 0.9227986460902488, |
| "learning_rate": 4.878682996771229e-05, |
| "loss": 0.7791, |
| "num_input_tokens_seen": 495648768, |
| "step": 473 |
| }, |
| { |
| "epoch": 1.0149993303870364, |
| "grad_norm": 1.075443769168512, |
| "learning_rate": 4.8781626865115005e-05, |
| "loss": 0.7674, |
| "num_input_tokens_seen": 496697344, |
| "step": 474 |
| }, |
| { |
| "epoch": 1.0171420918708987, |
| "grad_norm": 1.2527807821550248, |
| "learning_rate": 4.877641290737884e-05, |
| "loss": 0.773, |
| "num_input_tokens_seen": 497745920, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.019284853354761, |
| "grad_norm": 0.8580407158397938, |
| "learning_rate": 4.877118809688372e-05, |
| "loss": 0.771, |
| "num_input_tokens_seen": 498794496, |
| "step": 476 |
| }, |
| { |
| "epoch": 1.0214276148386232, |
| "grad_norm": 0.6853196367256191, |
| "learning_rate": 4.8765952436014515e-05, |
| "loss": 0.7725, |
| "num_input_tokens_seen": 499843072, |
| "step": 477 |
| }, |
| { |
| "epoch": 1.0235703763224857, |
| "grad_norm": 0.8301110353120499, |
| "learning_rate": 4.876070592716105e-05, |
| "loss": 0.7783, |
| "num_input_tokens_seen": 500891648, |
| "step": 478 |
| }, |
| { |
| "epoch": 1.025713137806348, |
| "grad_norm": 0.8756175808406557, |
| "learning_rate": 4.875544857271808e-05, |
| "loss": 0.7487, |
| "num_input_tokens_seen": 501940224, |
| "step": 479 |
| }, |
| { |
| "epoch": 1.0278558992902103, |
| "grad_norm": 0.9757446146451294, |
| "learning_rate": 4.8750180375085344e-05, |
| "loss": 0.766, |
| "num_input_tokens_seen": 502988800, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.0299986607740725, |
| "grad_norm": 0.8610400304752691, |
| "learning_rate": 4.874490133666749e-05, |
| "loss": 0.7663, |
| "num_input_tokens_seen": 504037376, |
| "step": 481 |
| }, |
| { |
| "epoch": 1.0321414222579348, |
| "grad_norm": 0.8464462121552816, |
| "learning_rate": 4.873961145987417e-05, |
| "loss": 0.7689, |
| "num_input_tokens_seen": 505085952, |
| "step": 482 |
| }, |
| { |
| "epoch": 1.0342841837417973, |
| "grad_norm": 0.7205290839211237, |
| "learning_rate": 4.8734310747119935e-05, |
| "loss": 0.7598, |
| "num_input_tokens_seen": 506134528, |
| "step": 483 |
| }, |
| { |
| "epoch": 1.0364269452256596, |
| "grad_norm": 0.7972996029812055, |
| "learning_rate": 4.87289992008243e-05, |
| "loss": 0.7677, |
| "num_input_tokens_seen": 507183104, |
| "step": 484 |
| }, |
| { |
| "epoch": 1.0385697067095219, |
| "grad_norm": 0.8060439581250051, |
| "learning_rate": 4.872367682341173e-05, |
| "loss": 0.7657, |
| "num_input_tokens_seen": 508231680, |
| "step": 485 |
| }, |
| { |
| "epoch": 1.0407124681933841, |
| "grad_norm": 0.625204599036251, |
| "learning_rate": 4.871834361731162e-05, |
| "loss": 0.7537, |
| "num_input_tokens_seen": 509280256, |
| "step": 486 |
| }, |
| { |
| "epoch": 1.0428552296772466, |
| "grad_norm": 0.7605314454169393, |
| "learning_rate": 4.8712999584958314e-05, |
| "loss": 0.7719, |
| "num_input_tokens_seen": 510328832, |
| "step": 487 |
| }, |
| { |
| "epoch": 1.044997991161109, |
| "grad_norm": 0.8346367886178117, |
| "learning_rate": 4.87076447287911e-05, |
| "loss": 0.7662, |
| "num_input_tokens_seen": 511377408, |
| "step": 488 |
| }, |
| { |
| "epoch": 1.0471407526449712, |
| "grad_norm": 0.6285971829943235, |
| "learning_rate": 4.870227905125422e-05, |
| "loss": 0.765, |
| "num_input_tokens_seen": 512425984, |
| "step": 489 |
| }, |
| { |
| "epoch": 1.0492835141288335, |
| "grad_norm": 0.6964181738913828, |
| "learning_rate": 4.869690255479682e-05, |
| "loss": 0.7506, |
| "num_input_tokens_seen": 513474560, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.051426275612696, |
| "grad_norm": 0.7957806090570286, |
| "learning_rate": 4.8691515241873023e-05, |
| "loss": 0.7714, |
| "num_input_tokens_seen": 514523136, |
| "step": 491 |
| }, |
| { |
| "epoch": 1.0535690370965582, |
| "grad_norm": 0.8340425161918896, |
| "learning_rate": 4.868611711494186e-05, |
| "loss": 0.7606, |
| "num_input_tokens_seen": 515571712, |
| "step": 492 |
| }, |
| { |
| "epoch": 1.0557117985804205, |
| "grad_norm": 0.7847167960403033, |
| "learning_rate": 4.8680708176467305e-05, |
| "loss": 0.7512, |
| "num_input_tokens_seen": 516620288, |
| "step": 493 |
| }, |
| { |
| "epoch": 1.0578545600642828, |
| "grad_norm": 0.825125615426757, |
| "learning_rate": 4.867528842891828e-05, |
| "loss": 0.7711, |
| "num_input_tokens_seen": 517668864, |
| "step": 494 |
| }, |
| { |
| "epoch": 1.059997321548145, |
| "grad_norm": 0.7783907475532756, |
| "learning_rate": 4.866985787476863e-05, |
| "loss": 0.7734, |
| "num_input_tokens_seen": 518717440, |
| "step": 495 |
| }, |
| { |
| "epoch": 1.0621400830320076, |
| "grad_norm": 0.7189975833814624, |
| "learning_rate": 4.866441651649715e-05, |
| "loss": 0.7748, |
| "num_input_tokens_seen": 519766016, |
| "step": 496 |
| }, |
| { |
| "epoch": 1.0642828445158699, |
| "grad_norm": 0.6408978236973415, |
| "learning_rate": 4.865896435658752e-05, |
| "loss": 0.7632, |
| "num_input_tokens_seen": 520814592, |
| "step": 497 |
| }, |
| { |
| "epoch": 1.0664256059997321, |
| "grad_norm": 0.8641304391887605, |
| "learning_rate": 4.865350139752841e-05, |
| "loss": 0.7602, |
| "num_input_tokens_seen": 521863168, |
| "step": 498 |
| }, |
| { |
| "epoch": 1.0685683674835944, |
| "grad_norm": 0.7514805150967576, |
| "learning_rate": 4.8648027641813384e-05, |
| "loss": 0.7536, |
| "num_input_tokens_seen": 522911744, |
| "step": 499 |
| }, |
| { |
| "epoch": 1.070711128967457, |
| "grad_norm": 0.6149193256755058, |
| "learning_rate": 4.864254309194093e-05, |
| "loss": 0.764, |
| "num_input_tokens_seen": 523960320, |
| "step": 500 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 4660, |
| "num_input_tokens_seen": 523960320, |
| "num_train_epochs": 10, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 836890483752960.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|