| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 4.953191489361702, | |
| "eval_steps": 500, | |
| "global_step": 440, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.011347517730496455, | |
| "grad_norm": 42.11255056967552, | |
| "learning_rate": 4.999936276068748e-05, | |
| "loss": 2.7084, | |
| "num_input_tokens_seen": 262144, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.02269503546099291, | |
| "grad_norm": 54.92874275677881, | |
| "learning_rate": 4.9997451075235834e-05, | |
| "loss": 3.9726, | |
| "num_input_tokens_seen": 524288, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.03404255319148936, | |
| "grad_norm": 42.37953862248386, | |
| "learning_rate": 4.999426504110115e-05, | |
| "loss": 3.6425, | |
| "num_input_tokens_seen": 786432, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.04539007092198582, | |
| "grad_norm": 91.1622917243015, | |
| "learning_rate": 4.9989804820704735e-05, | |
| "loss": 3.9759, | |
| "num_input_tokens_seen": 1048576, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.05673758865248227, | |
| "grad_norm": 18.918639143868496, | |
| "learning_rate": 4.99840706414248e-05, | |
| "loss": 2.7491, | |
| "num_input_tokens_seen": 1310720, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.06808510638297872, | |
| "grad_norm": 1015.4776658382041, | |
| "learning_rate": 4.9977062795584893e-05, | |
| "loss": 6.0462, | |
| "num_input_tokens_seen": 1572864, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.07943262411347518, | |
| "grad_norm": 72.92199611683951, | |
| "learning_rate": 4.9968781640439026e-05, | |
| "loss": 4.8547, | |
| "num_input_tokens_seen": 1835008, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.09078014184397164, | |
| "grad_norm": 2053.7092461560514, | |
| "learning_rate": 4.995922759815339e-05, | |
| "loss": 3.4759, | |
| "num_input_tokens_seen": 2097152, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.10212765957446808, | |
| "grad_norm": 301.76868717065724, | |
| "learning_rate": 4.9948401155784904e-05, | |
| "loss": 5.716, | |
| "num_input_tokens_seen": 2359296, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.11347517730496454, | |
| "grad_norm": 22.772625297554043, | |
| "learning_rate": 4.993630286525634e-05, | |
| "loss": 3.0399, | |
| "num_input_tokens_seen": 2621440, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.12482269503546099, | |
| "grad_norm": 30.392870542185943, | |
| "learning_rate": 4.99229333433282e-05, | |
| "loss": 2.7072, | |
| "num_input_tokens_seen": 2883584, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.13617021276595745, | |
| "grad_norm": 7.488897181026025, | |
| "learning_rate": 4.9908293271567286e-05, | |
| "loss": 2.3501, | |
| "num_input_tokens_seen": 3145728, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.1475177304964539, | |
| "grad_norm": 7.085681299970245, | |
| "learning_rate": 4.9892383396311934e-05, | |
| "loss": 2.157, | |
| "num_input_tokens_seen": 3407872, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.15886524822695036, | |
| "grad_norm": 8.190856615122739, | |
| "learning_rate": 4.987520452863399e-05, | |
| "loss": 2.0567, | |
| "num_input_tokens_seen": 3670016, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.1702127659574468, | |
| "grad_norm": 10.596165600490249, | |
| "learning_rate": 4.985675754429744e-05, | |
| "loss": 2.0914, | |
| "num_input_tokens_seen": 3932160, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.18156028368794327, | |
| "grad_norm": 4.228212579415264, | |
| "learning_rate": 4.9837043383713753e-05, | |
| "loss": 1.9227, | |
| "num_input_tokens_seen": 4194304, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.19290780141843972, | |
| "grad_norm": 7.545954107986025, | |
| "learning_rate": 4.981606305189401e-05, | |
| "loss": 1.9327, | |
| "num_input_tokens_seen": 4456448, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.20425531914893616, | |
| "grad_norm": 5.46892073376645, | |
| "learning_rate": 4.979381761839757e-05, | |
| "loss": 1.9021, | |
| "num_input_tokens_seen": 4718592, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.21560283687943263, | |
| "grad_norm": 4.169344521056397, | |
| "learning_rate": 4.9770308217277614e-05, | |
| "loss": 1.8432, | |
| "num_input_tokens_seen": 4980736, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.22695035460992907, | |
| "grad_norm": 4.236687724660673, | |
| "learning_rate": 4.9745536047023324e-05, | |
| "loss": 1.8072, | |
| "num_input_tokens_seen": 5242880, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.23829787234042554, | |
| "grad_norm": 2.060568655336032, | |
| "learning_rate": 4.971950237049874e-05, | |
| "loss": 1.7635, | |
| "num_input_tokens_seen": 5505024, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.24964539007092199, | |
| "grad_norm": 4.122485050689507, | |
| "learning_rate": 4.9692208514878444e-05, | |
| "loss": 1.7203, | |
| "num_input_tokens_seen": 5767168, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.26099290780141843, | |
| "grad_norm": 2.219869083818665, | |
| "learning_rate": 4.966365587157986e-05, | |
| "loss": 1.697, | |
| "num_input_tokens_seen": 6029312, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.2723404255319149, | |
| "grad_norm": 2.978772859366142, | |
| "learning_rate": 4.963384589619233e-05, | |
| "loss": 1.6882, | |
| "num_input_tokens_seen": 6291456, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.28368794326241137, | |
| "grad_norm": 2.3166879594611562, | |
| "learning_rate": 4.96027801084029e-05, | |
| "loss": 1.6619, | |
| "num_input_tokens_seen": 6553600, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.2950354609929078, | |
| "grad_norm": 2.660252399042677, | |
| "learning_rate": 4.957046009191889e-05, | |
| "loss": 1.6709, | |
| "num_input_tokens_seen": 6815744, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.30638297872340425, | |
| "grad_norm": 1.9436043867392558, | |
| "learning_rate": 4.95368874943871e-05, | |
| "loss": 1.6295, | |
| "num_input_tokens_seen": 7077888, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.3177304964539007, | |
| "grad_norm": 3.2367808954229056, | |
| "learning_rate": 4.9502064027309836e-05, | |
| "loss": 1.6475, | |
| "num_input_tokens_seen": 7340032, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.32907801418439714, | |
| "grad_norm": 2.9573963451533163, | |
| "learning_rate": 4.946599146595769e-05, | |
| "loss": 1.6421, | |
| "num_input_tokens_seen": 7602176, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.3404255319148936, | |
| "grad_norm": 2.0096985811179584, | |
| "learning_rate": 4.942867164927899e-05, | |
| "loss": 1.6335, | |
| "num_input_tokens_seen": 7864320, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.3517730496453901, | |
| "grad_norm": 3.2248151762473904, | |
| "learning_rate": 4.9390106479806085e-05, | |
| "loss": 1.6085, | |
| "num_input_tokens_seen": 8126464, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.36312056737588655, | |
| "grad_norm": 2.03961317414281, | |
| "learning_rate": 4.935029792355834e-05, | |
| "loss": 1.5966, | |
| "num_input_tokens_seen": 8388608, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.37446808510638296, | |
| "grad_norm": 3.106580267812995, | |
| "learning_rate": 4.9309248009941914e-05, | |
| "loss": 1.5939, | |
| "num_input_tokens_seen": 8650752, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.38581560283687943, | |
| "grad_norm": 2.356069611331115, | |
| "learning_rate": 4.9266958831646315e-05, | |
| "loss": 1.5713, | |
| "num_input_tokens_seen": 8912896, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.3971631205673759, | |
| "grad_norm": 2.555281051112305, | |
| "learning_rate": 4.922343254453768e-05, | |
| "loss": 1.5476, | |
| "num_input_tokens_seen": 9175040, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.4085106382978723, | |
| "grad_norm": 2.1382267614522377, | |
| "learning_rate": 4.917867136754893e-05, | |
| "loss": 1.5501, | |
| "num_input_tokens_seen": 9437184, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.4198581560283688, | |
| "grad_norm": 2.182480841186189, | |
| "learning_rate": 4.913267758256658e-05, | |
| "loss": 1.5678, | |
| "num_input_tokens_seen": 9699328, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.43120567375886526, | |
| "grad_norm": 1.9946631054615584, | |
| "learning_rate": 4.9085453534314476e-05, | |
| "loss": 1.5604, | |
| "num_input_tokens_seen": 9961472, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.4425531914893617, | |
| "grad_norm": 2.1201916556543505, | |
| "learning_rate": 4.9037001630234215e-05, | |
| "loss": 1.5265, | |
| "num_input_tokens_seen": 10223616, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.45390070921985815, | |
| "grad_norm": 1.9422378506921851, | |
| "learning_rate": 4.898732434036244e-05, | |
| "loss": 1.5269, | |
| "num_input_tokens_seen": 10485760, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.4652482269503546, | |
| "grad_norm": 1.7767185160628844, | |
| "learning_rate": 4.893642419720491e-05, | |
| "loss": 1.4965, | |
| "num_input_tokens_seen": 10747904, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.4765957446808511, | |
| "grad_norm": 1.6802879544286047, | |
| "learning_rate": 4.888430379560742e-05, | |
| "loss": 1.5254, | |
| "num_input_tokens_seen": 11010048, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.4879432624113475, | |
| "grad_norm": 1.6860871708585086, | |
| "learning_rate": 4.883096579262346e-05, | |
| "loss": 1.4975, | |
| "num_input_tokens_seen": 11272192, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.49929078014184397, | |
| "grad_norm": 1.7303274117706944, | |
| "learning_rate": 4.877641290737884e-05, | |
| "loss": 1.5209, | |
| "num_input_tokens_seen": 11534336, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.5106382978723404, | |
| "grad_norm": 1.4626721944583252, | |
| "learning_rate": 4.872064792093299e-05, | |
| "loss": 1.51, | |
| "num_input_tokens_seen": 11796480, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.5219858156028369, | |
| "grad_norm": 1.6681840945027697, | |
| "learning_rate": 4.866367367613725e-05, | |
| "loss": 1.5086, | |
| "num_input_tokens_seen": 12058624, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.5333333333333333, | |
| "grad_norm": 1.5207849294270255, | |
| "learning_rate": 4.86054930774899e-05, | |
| "loss": 1.4588, | |
| "num_input_tokens_seen": 12320768, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.5446808510638298, | |
| "grad_norm": 1.616710324729056, | |
| "learning_rate": 4.854610909098812e-05, | |
| "loss": 1.4805, | |
| "num_input_tokens_seen": 12582912, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.5560283687943263, | |
| "grad_norm": 1.60340831300244, | |
| "learning_rate": 4.848552474397676e-05, | |
| "loss": 1.4696, | |
| "num_input_tokens_seen": 12845056, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.5673758865248227, | |
| "grad_norm": 1.7880113956949981, | |
| "learning_rate": 4.842374312499405e-05, | |
| "loss": 1.4821, | |
| "num_input_tokens_seen": 13107200, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.5787234042553191, | |
| "grad_norm": 1.3568656846238325, | |
| "learning_rate": 4.836076738361408e-05, | |
| "loss": 1.4884, | |
| "num_input_tokens_seen": 13369344, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.5900709219858156, | |
| "grad_norm": 1.5552837054733266, | |
| "learning_rate": 4.829660073028631e-05, | |
| "loss": 1.4686, | |
| "num_input_tokens_seen": 13631488, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.601418439716312, | |
| "grad_norm": 1.4041461982644978, | |
| "learning_rate": 4.823124643617187e-05, | |
| "loss": 1.4522, | |
| "num_input_tokens_seen": 13893632, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.6127659574468085, | |
| "grad_norm": 1.475780206398884, | |
| "learning_rate": 4.8164707832976783e-05, | |
| "loss": 1.464, | |
| "num_input_tokens_seen": 14155776, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.624113475177305, | |
| "grad_norm": 1.4112950034248575, | |
| "learning_rate": 4.8096988312782174e-05, | |
| "loss": 1.445, | |
| "num_input_tokens_seen": 14417920, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.6354609929078014, | |
| "grad_norm": 1.6206014969548581, | |
| "learning_rate": 4.802809132787125e-05, | |
| "loss": 1.45, | |
| "num_input_tokens_seen": 14680064, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.6468085106382979, | |
| "grad_norm": 1.2999206377427606, | |
| "learning_rate": 4.7958020390553426e-05, | |
| "loss": 1.4923, | |
| "num_input_tokens_seen": 14942208, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.6581560283687943, | |
| "grad_norm": 1.2629239809770683, | |
| "learning_rate": 4.7886779072985156e-05, | |
| "loss": 1.4459, | |
| "num_input_tokens_seen": 15204352, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.6695035460992907, | |
| "grad_norm": 1.6115720276042818, | |
| "learning_rate": 4.78143710069879e-05, | |
| "loss": 1.4211, | |
| "num_input_tokens_seen": 15466496, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.6808510638297872, | |
| "grad_norm": 1.3564661495186316, | |
| "learning_rate": 4.774079988386296e-05, | |
| "loss": 1.4354, | |
| "num_input_tokens_seen": 15728640, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.6921985815602837, | |
| "grad_norm": 1.4029089027053228, | |
| "learning_rate": 4.766606945420329e-05, | |
| "loss": 1.4552, | |
| "num_input_tokens_seen": 15990784, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.7035460992907802, | |
| "grad_norm": 1.609508929492919, | |
| "learning_rate": 4.759018352770229e-05, | |
| "loss": 1.4401, | |
| "num_input_tokens_seen": 16252928, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.7148936170212766, | |
| "grad_norm": 1.5314498833493546, | |
| "learning_rate": 4.751314597295963e-05, | |
| "loss": 1.4717, | |
| "num_input_tokens_seen": 16515072, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.7262411347517731, | |
| "grad_norm": 1.4584944515244869, | |
| "learning_rate": 4.743496071728396e-05, | |
| "loss": 1.4372, | |
| "num_input_tokens_seen": 16777216, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.7375886524822695, | |
| "grad_norm": 1.3848187202296613, | |
| "learning_rate": 4.735563174649278e-05, | |
| "loss": 1.4222, | |
| "num_input_tokens_seen": 17039360, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.7489361702127659, | |
| "grad_norm": 1.322399845197604, | |
| "learning_rate": 4.72751631047092e-05, | |
| "loss": 1.4328, | |
| "num_input_tokens_seen": 17301504, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.7602836879432624, | |
| "grad_norm": 1.2041955676624019, | |
| "learning_rate": 4.719355889415576e-05, | |
| "loss": 1.4133, | |
| "num_input_tokens_seen": 17563648, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.7716312056737589, | |
| "grad_norm": 1.4025507710172729, | |
| "learning_rate": 4.711082327494536e-05, | |
| "loss": 1.4239, | |
| "num_input_tokens_seen": 17825792, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.7829787234042553, | |
| "grad_norm": 1.5777455822322792, | |
| "learning_rate": 4.7026960464869116e-05, | |
| "loss": 1.437, | |
| "num_input_tokens_seen": 18087936, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.7943262411347518, | |
| "grad_norm": 1.3436386092926063, | |
| "learning_rate": 4.6941974739181395e-05, | |
| "loss": 1.4243, | |
| "num_input_tokens_seen": 18350080, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.8056737588652483, | |
| "grad_norm": 1.7031338750612581, | |
| "learning_rate": 4.6855870430381816e-05, | |
| "loss": 1.4272, | |
| "num_input_tokens_seen": 18612224, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.8170212765957446, | |
| "grad_norm": 1.3269888785525612, | |
| "learning_rate": 4.6768651927994434e-05, | |
| "loss": 1.4054, | |
| "num_input_tokens_seen": 18874368, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.8283687943262411, | |
| "grad_norm": 1.4476048219302555, | |
| "learning_rate": 4.668032367834392e-05, | |
| "loss": 1.413, | |
| "num_input_tokens_seen": 19136512, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.8397163120567376, | |
| "grad_norm": 1.4535317388744813, | |
| "learning_rate": 4.6590890184328925e-05, | |
| "loss": 1.4054, | |
| "num_input_tokens_seen": 19398656, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.851063829787234, | |
| "grad_norm": 1.2269074929733113, | |
| "learning_rate": 4.6500356005192514e-05, | |
| "loss": 1.3898, | |
| "num_input_tokens_seen": 19660800, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.8624113475177305, | |
| "grad_norm": 1.3008326960072658, | |
| "learning_rate": 4.640872575628973e-05, | |
| "loss": 1.4042, | |
| "num_input_tokens_seen": 19922944, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.873758865248227, | |
| "grad_norm": 1.2343345399913839, | |
| "learning_rate": 4.6316004108852305e-05, | |
| "loss": 1.4181, | |
| "num_input_tokens_seen": 20185088, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.8851063829787233, | |
| "grad_norm": 1.40051458320969, | |
| "learning_rate": 4.622219578975057e-05, | |
| "loss": 1.3986, | |
| "num_input_tokens_seen": 20447232, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.8964539007092198, | |
| "grad_norm": 1.2497430360048092, | |
| "learning_rate": 4.6127305581252414e-05, | |
| "loss": 1.3769, | |
| "num_input_tokens_seen": 20709376, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.9078014184397163, | |
| "grad_norm": 1.2431538833258515, | |
| "learning_rate": 4.6031338320779534e-05, | |
| "loss": 1.4002, | |
| "num_input_tokens_seen": 20971520, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.9191489361702128, | |
| "grad_norm": 1.0814373534222335, | |
| "learning_rate": 4.593429890066082e-05, | |
| "loss": 1.4156, | |
| "num_input_tokens_seen": 21233664, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.9304964539007092, | |
| "grad_norm": 1.115346242762015, | |
| "learning_rate": 4.583619226788294e-05, | |
| "loss": 1.3867, | |
| "num_input_tokens_seen": 21495808, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.9418439716312057, | |
| "grad_norm": 1.2127601722053167, | |
| "learning_rate": 4.573702342383816e-05, | |
| "loss": 1.3751, | |
| "num_input_tokens_seen": 21757952, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.9531914893617022, | |
| "grad_norm": 1.1748214200248, | |
| "learning_rate": 4.563679742406935e-05, | |
| "loss": 1.3834, | |
| "num_input_tokens_seen": 22020096, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.9645390070921985, | |
| "grad_norm": 1.1331339281580772, | |
| "learning_rate": 4.5535519378012295e-05, | |
| "loss": 1.3791, | |
| "num_input_tokens_seen": 22282240, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.975886524822695, | |
| "grad_norm": 1.4004411805319554, | |
| "learning_rate": 4.543319444873517e-05, | |
| "loss": 1.3785, | |
| "num_input_tokens_seen": 22544384, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.9872340425531915, | |
| "grad_norm": 1.2118340365527955, | |
| "learning_rate": 4.532982785267541e-05, | |
| "loss": 1.3635, | |
| "num_input_tokens_seen": 22806528, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.9985815602836879, | |
| "grad_norm": 1.5899548683498645, | |
| "learning_rate": 4.522542485937369e-05, | |
| "loss": 1.3879, | |
| "num_input_tokens_seen": 23068672, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.5899548683498645, | |
| "learning_rate": 4.511999079120534e-05, | |
| "loss": 1.2364, | |
| "num_input_tokens_seen": 23101440, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 1.0113475177304965, | |
| "grad_norm": 3.1471576605311427, | |
| "learning_rate": 4.5013531023109014e-05, | |
| "loss": 1.0204, | |
| "num_input_tokens_seen": 23363584, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.022695035460993, | |
| "grad_norm": 1.9813456886904501, | |
| "learning_rate": 4.4906050982312664e-05, | |
| "loss": 1.0799, | |
| "num_input_tokens_seen": 23625728, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 1.0340425531914894, | |
| "grad_norm": 1.7175351501466867, | |
| "learning_rate": 4.479755614805688e-05, | |
| "loss": 1.0052, | |
| "num_input_tokens_seen": 23887872, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 1.0453900709219859, | |
| "grad_norm": 1.970342186079611, | |
| "learning_rate": 4.4688052051315545e-05, | |
| "loss": 1.0531, | |
| "num_input_tokens_seen": 24150016, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 1.0567375886524824, | |
| "grad_norm": 1.7872662354821134, | |
| "learning_rate": 4.457754427451389e-05, | |
| "loss": 1.0159, | |
| "num_input_tokens_seen": 24412160, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 1.0680851063829788, | |
| "grad_norm": 1.8666514589785563, | |
| "learning_rate": 4.446603845124388e-05, | |
| "loss": 0.9947, | |
| "num_input_tokens_seen": 24674304, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 1.0794326241134753, | |
| "grad_norm": 1.5777077629921268, | |
| "learning_rate": 4.4353540265977064e-05, | |
| "loss": 0.9969, | |
| "num_input_tokens_seen": 24936448, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 1.0907801418439715, | |
| "grad_norm": 1.4442562002627628, | |
| "learning_rate": 4.4240055453774734e-05, | |
| "loss": 1.02, | |
| "num_input_tokens_seen": 25198592, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 1.102127659574468, | |
| "grad_norm": 1.6801959052168647, | |
| "learning_rate": 4.412558979999558e-05, | |
| "loss": 0.9789, | |
| "num_input_tokens_seen": 25460736, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 1.1134751773049645, | |
| "grad_norm": 1.3983076558719154, | |
| "learning_rate": 4.401014914000078e-05, | |
| "loss": 1.0035, | |
| "num_input_tokens_seen": 25722880, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 1.124822695035461, | |
| "grad_norm": 1.4068549353820048, | |
| "learning_rate": 4.389373935885646e-05, | |
| "loss": 0.9751, | |
| "num_input_tokens_seen": 25985024, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.1361702127659574, | |
| "grad_norm": 1.4162325676250134, | |
| "learning_rate": 4.3776366391033746e-05, | |
| "loss": 0.9464, | |
| "num_input_tokens_seen": 26247168, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 1.147517730496454, | |
| "grad_norm": 1.3675197337363096, | |
| "learning_rate": 4.365803622010618e-05, | |
| "loss": 0.9592, | |
| "num_input_tokens_seen": 26509312, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 1.1588652482269504, | |
| "grad_norm": 1.5007646821184917, | |
| "learning_rate": 4.35387548784447e-05, | |
| "loss": 0.9709, | |
| "num_input_tokens_seen": 26771456, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 1.1702127659574468, | |
| "grad_norm": 1.4773281381482295, | |
| "learning_rate": 4.341852844691012e-05, | |
| "loss": 0.9834, | |
| "num_input_tokens_seen": 27033600, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 1.1815602836879433, | |
| "grad_norm": 1.933895535557616, | |
| "learning_rate": 4.329736305454314e-05, | |
| "loss": 0.9999, | |
| "num_input_tokens_seen": 27295744, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 1.1929078014184398, | |
| "grad_norm": 1.6902638904380827, | |
| "learning_rate": 4.3175264878251845e-05, | |
| "loss": 0.9294, | |
| "num_input_tokens_seen": 27557888, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 1.2042553191489362, | |
| "grad_norm": 1.696391072445269, | |
| "learning_rate": 4.305224014249688e-05, | |
| "loss": 0.9906, | |
| "num_input_tokens_seen": 27820032, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 1.2156028368794327, | |
| "grad_norm": 1.7162350808896498, | |
| "learning_rate": 4.292829511897409e-05, | |
| "loss": 0.9731, | |
| "num_input_tokens_seen": 28082176, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 1.226950354609929, | |
| "grad_norm": 1.736567992601448, | |
| "learning_rate": 4.280343612629479e-05, | |
| "loss": 0.978, | |
| "num_input_tokens_seen": 28344320, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 1.2382978723404254, | |
| "grad_norm": 1.3602272146940024, | |
| "learning_rate": 4.267766952966369e-05, | |
| "loss": 0.9325, | |
| "num_input_tokens_seen": 28606464, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.249645390070922, | |
| "grad_norm": 1.5745245166339177, | |
| "learning_rate": 4.255100174055434e-05, | |
| "loss": 0.9657, | |
| "num_input_tokens_seen": 28868608, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 1.2609929078014184, | |
| "grad_norm": 1.3239840597482868, | |
| "learning_rate": 4.242343921638234e-05, | |
| "loss": 0.9451, | |
| "num_input_tokens_seen": 29130752, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 1.2723404255319148, | |
| "grad_norm": 1.392428377159933, | |
| "learning_rate": 4.22949884601761e-05, | |
| "loss": 0.9632, | |
| "num_input_tokens_seen": 29392896, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 1.2836879432624113, | |
| "grad_norm": 1.382581854515865, | |
| "learning_rate": 4.2165656020245336e-05, | |
| "loss": 0.9483, | |
| "num_input_tokens_seen": 29655040, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 1.2950354609929078, | |
| "grad_norm": 1.5462756056089444, | |
| "learning_rate": 4.2035448489847284e-05, | |
| "loss": 0.9899, | |
| "num_input_tokens_seen": 29917184, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 1.3063829787234043, | |
| "grad_norm": 1.4147640794435994, | |
| "learning_rate": 4.1904372506850484e-05, | |
| "loss": 0.9092, | |
| "num_input_tokens_seen": 30179328, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 1.3177304964539007, | |
| "grad_norm": 1.2432801814071204, | |
| "learning_rate": 4.1772434753396504e-05, | |
| "loss": 0.9501, | |
| "num_input_tokens_seen": 30441472, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 1.3290780141843972, | |
| "grad_norm": 1.4932728466362941, | |
| "learning_rate": 4.1639641955559205e-05, | |
| "loss": 1.0093, | |
| "num_input_tokens_seen": 30703616, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 1.3404255319148937, | |
| "grad_norm": 1.146394031379382, | |
| "learning_rate": 4.1506000883001875e-05, | |
| "loss": 0.95, | |
| "num_input_tokens_seen": 30965760, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 1.3517730496453901, | |
| "grad_norm": 1.314570904504416, | |
| "learning_rate": 4.137151834863213e-05, | |
| "loss": 0.9733, | |
| "num_input_tokens_seen": 31227904, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.3631205673758866, | |
| "grad_norm": 1.144532383386255, | |
| "learning_rate": 4.123620120825459e-05, | |
| "loss": 0.9503, | |
| "num_input_tokens_seen": 31490048, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 1.374468085106383, | |
| "grad_norm": 1.407081639684963, | |
| "learning_rate": 4.1100056360221384e-05, | |
| "loss": 0.9787, | |
| "num_input_tokens_seen": 31752192, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 1.3858156028368795, | |
| "grad_norm": 1.1478616889934938, | |
| "learning_rate": 4.096309074508046e-05, | |
| "loss": 0.9697, | |
| "num_input_tokens_seen": 32014336, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 1.397163120567376, | |
| "grad_norm": 1.1798657684361409, | |
| "learning_rate": 4.082531134522176e-05, | |
| "loss": 0.9397, | |
| "num_input_tokens_seen": 32276480, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 1.4085106382978723, | |
| "grad_norm": 1.1170685676293548, | |
| "learning_rate": 4.06867251845213e-05, | |
| "loss": 0.9466, | |
| "num_input_tokens_seen": 32538624, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 1.4198581560283687, | |
| "grad_norm": 1.158583040981931, | |
| "learning_rate": 4.054733932798306e-05, | |
| "loss": 0.9517, | |
| "num_input_tokens_seen": 32800768, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 1.4312056737588652, | |
| "grad_norm": 1.1206965982137238, | |
| "learning_rate": 4.0407160881378824e-05, | |
| "loss": 0.9299, | |
| "num_input_tokens_seen": 33062912, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 1.4425531914893617, | |
| "grad_norm": 1.2166341082803098, | |
| "learning_rate": 4.0266196990885955e-05, | |
| "loss": 0.9674, | |
| "num_input_tokens_seen": 33325056, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 1.4539007092198581, | |
| "grad_norm": 1.253313105155196, | |
| "learning_rate": 4.012445484272307e-05, | |
| "loss": 0.935, | |
| "num_input_tokens_seen": 33587200, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 1.4652482269503546, | |
| "grad_norm": 1.1241425426902512, | |
| "learning_rate": 3.9981941662783674e-05, | |
| "loss": 0.9856, | |
| "num_input_tokens_seen": 33849344, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.476595744680851, | |
| "grad_norm": 1.2496739496083311, | |
| "learning_rate": 3.9838664716267855e-05, | |
| "loss": 0.95, | |
| "num_input_tokens_seen": 34111488, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 1.4879432624113476, | |
| "grad_norm": 1.2413295026999056, | |
| "learning_rate": 3.969463130731183e-05, | |
| "loss": 1.0095, | |
| "num_input_tokens_seen": 34373632, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 1.499290780141844, | |
| "grad_norm": 1.0994149236511626, | |
| "learning_rate": 3.954984877861565e-05, | |
| "loss": 0.9711, | |
| "num_input_tokens_seen": 34635776, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 1.5106382978723403, | |
| "grad_norm": 1.2204241140248873, | |
| "learning_rate": 3.9404324511068825e-05, | |
| "loss": 1.0004, | |
| "num_input_tokens_seen": 34897920, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 1.5219858156028367, | |
| "grad_norm": 1.0639734593664278, | |
| "learning_rate": 3.92580659233741e-05, | |
| "loss": 0.9683, | |
| "num_input_tokens_seen": 35160064, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 1.5333333333333332, | |
| "grad_norm": 1.286928460200864, | |
| "learning_rate": 3.911108047166924e-05, | |
| "loss": 0.9735, | |
| "num_input_tokens_seen": 35422208, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 1.5446808510638297, | |
| "grad_norm": 1.1776256536499852, | |
| "learning_rate": 3.8963375649146866e-05, | |
| "loss": 0.9917, | |
| "num_input_tokens_seen": 35684352, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 1.5560283687943262, | |
| "grad_norm": 1.1927233327599214, | |
| "learning_rate": 3.881495898567257e-05, | |
| "loss": 0.9585, | |
| "num_input_tokens_seen": 35946496, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 1.5673758865248226, | |
| "grad_norm": 1.1616205054974948, | |
| "learning_rate": 3.866583804740095e-05, | |
| "loss": 0.9305, | |
| "num_input_tokens_seen": 36208640, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 1.578723404255319, | |
| "grad_norm": 1.1608542297205653, | |
| "learning_rate": 3.851602043638994e-05, | |
| "loss": 0.9738, | |
| "num_input_tokens_seen": 36470784, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.5900709219858156, | |
| "grad_norm": 1.1754828045051902, | |
| "learning_rate": 3.8365513790213265e-05, | |
| "loss": 0.9627, | |
| "num_input_tokens_seen": 36732928, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 1.601418439716312, | |
| "grad_norm": 1.2412851531549465, | |
| "learning_rate": 3.821432578157105e-05, | |
| "loss": 0.9673, | |
| "num_input_tokens_seen": 36995072, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 1.6127659574468085, | |
| "grad_norm": 1.1681668630687394, | |
| "learning_rate": 3.8062464117898724e-05, | |
| "loss": 0.9908, | |
| "num_input_tokens_seen": 37257216, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 1.624113475177305, | |
| "grad_norm": 1.1771959095169333, | |
| "learning_rate": 3.790993654097405e-05, | |
| "loss": 0.9913, | |
| "num_input_tokens_seen": 37519360, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 1.6354609929078014, | |
| "grad_norm": 1.2334490161088576, | |
| "learning_rate": 3.77567508265225e-05, | |
| "loss": 0.9911, | |
| "num_input_tokens_seen": 37781504, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 1.646808510638298, | |
| "grad_norm": 1.132776925570845, | |
| "learning_rate": 3.76029147838208e-05, | |
| "loss": 0.9801, | |
| "num_input_tokens_seen": 38043648, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 1.6581560283687944, | |
| "grad_norm": 1.2852017625106646, | |
| "learning_rate": 3.74484362552989e-05, | |
| "loss": 0.9725, | |
| "num_input_tokens_seen": 38305792, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 1.6695035460992909, | |
| "grad_norm": 1.1423632361464884, | |
| "learning_rate": 3.72933231161401e-05, | |
| "loss": 0.961, | |
| "num_input_tokens_seen": 38567936, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 1.6808510638297873, | |
| "grad_norm": 1.1887510715530398, | |
| "learning_rate": 3.713758327387961e-05, | |
| "loss": 0.944, | |
| "num_input_tokens_seen": 38830080, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 1.6921985815602838, | |
| "grad_norm": 1.2005973709528561, | |
| "learning_rate": 3.6981224668001424e-05, | |
| "loss": 0.9855, | |
| "num_input_tokens_seen": 39092224, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.7035460992907803, | |
| "grad_norm": 1.1796966769071517, | |
| "learning_rate": 3.682425526953359e-05, | |
| "loss": 0.9785, | |
| "num_input_tokens_seen": 39354368, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 1.7148936170212767, | |
| "grad_norm": 1.1401769784499336, | |
| "learning_rate": 3.6666683080641846e-05, | |
| "loss": 0.9509, | |
| "num_input_tokens_seen": 39616512, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 1.7262411347517732, | |
| "grad_norm": 1.2720626628857692, | |
| "learning_rate": 3.6508516134221635e-05, | |
| "loss": 0.965, | |
| "num_input_tokens_seen": 39878656, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 1.7375886524822695, | |
| "grad_norm": 1.1887836841186321, | |
| "learning_rate": 3.634976249348867e-05, | |
| "loss": 0.9737, | |
| "num_input_tokens_seen": 40140800, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 1.748936170212766, | |
| "grad_norm": 1.2318081552834408, | |
| "learning_rate": 3.619043025156782e-05, | |
| "loss": 0.947, | |
| "num_input_tokens_seen": 40402944, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 1.7602836879432624, | |
| "grad_norm": 1.0371027068087095, | |
| "learning_rate": 3.603052753108053e-05, | |
| "loss": 0.9782, | |
| "num_input_tokens_seen": 40665088, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 1.7716312056737589, | |
| "grad_norm": 1.2582450073527114, | |
| "learning_rate": 3.58700624837308e-05, | |
| "loss": 0.9852, | |
| "num_input_tokens_seen": 40927232, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 1.7829787234042553, | |
| "grad_norm": 1.159400337663857, | |
| "learning_rate": 3.5709043289889536e-05, | |
| "loss": 0.9779, | |
| "num_input_tokens_seen": 41189376, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 1.7943262411347518, | |
| "grad_norm": 1.1094376112354303, | |
| "learning_rate": 3.554747815817756e-05, | |
| "loss": 0.9958, | |
| "num_input_tokens_seen": 41451520, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 1.8056737588652483, | |
| "grad_norm": 1.2198757703909164, | |
| "learning_rate": 3.5385375325047166e-05, | |
| "loss": 0.9784, | |
| "num_input_tokens_seen": 41713664, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.8170212765957445, | |
| "grad_norm": 1.0480308860957073, | |
| "learning_rate": 3.522274305436217e-05, | |
| "loss": 0.9644, | |
| "num_input_tokens_seen": 41975808, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 1.828368794326241, | |
| "grad_norm": 1.1883804521566375, | |
| "learning_rate": 3.50595896369767e-05, | |
| "loss": 0.9513, | |
| "num_input_tokens_seen": 42237952, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 1.8397163120567375, | |
| "grad_norm": 1.1041573287946038, | |
| "learning_rate": 3.4895923390312466e-05, | |
| "loss": 0.9935, | |
| "num_input_tokens_seen": 42500096, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 1.851063829787234, | |
| "grad_norm": 1.0322154463171043, | |
| "learning_rate": 3.4731752657934794e-05, | |
| "loss": 1.0127, | |
| "num_input_tokens_seen": 42762240, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 1.8624113475177304, | |
| "grad_norm": 1.210272229526448, | |
| "learning_rate": 3.456708580912725e-05, | |
| "loss": 0.94, | |
| "num_input_tokens_seen": 43024384, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 1.8737588652482269, | |
| "grad_norm": 1.1397057549276046, | |
| "learning_rate": 3.4401931238464994e-05, | |
| "loss": 1.0017, | |
| "num_input_tokens_seen": 43286528, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 1.8851063829787233, | |
| "grad_norm": 1.052990158347976, | |
| "learning_rate": 3.423629736538685e-05, | |
| "loss": 0.9705, | |
| "num_input_tokens_seen": 43548672, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 1.8964539007092198, | |
| "grad_norm": 1.247482706002507, | |
| "learning_rate": 3.4070192633766025e-05, | |
| "loss": 0.9841, | |
| "num_input_tokens_seen": 43810816, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 1.9078014184397163, | |
| "grad_norm": 1.1048483913646374, | |
| "learning_rate": 3.390362551147974e-05, | |
| "loss": 0.9737, | |
| "num_input_tokens_seen": 44072960, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 1.9191489361702128, | |
| "grad_norm": 1.0770863251176332, | |
| "learning_rate": 3.3736604489977466e-05, | |
| "loss": 1.0008, | |
| "num_input_tokens_seen": 44335104, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.9304964539007092, | |
| "grad_norm": 1.130827655449932, | |
| "learning_rate": 3.356913808384807e-05, | |
| "loss": 0.9726, | |
| "num_input_tokens_seen": 44597248, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 1.9418439716312057, | |
| "grad_norm": 1.0722010665917157, | |
| "learning_rate": 3.3401234830385756e-05, | |
| "loss": 0.9711, | |
| "num_input_tokens_seen": 44859392, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 1.9531914893617022, | |
| "grad_norm": 1.1584145820517182, | |
| "learning_rate": 3.323290328915483e-05, | |
| "loss": 0.9989, | |
| "num_input_tokens_seen": 45121536, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 1.9645390070921986, | |
| "grad_norm": 1.0846738851387514, | |
| "learning_rate": 3.306415204155335e-05, | |
| "loss": 0.9588, | |
| "num_input_tokens_seen": 45383680, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 1.9758865248226951, | |
| "grad_norm": 1.102142271317201, | |
| "learning_rate": 3.2894989690375626e-05, | |
| "loss": 0.9476, | |
| "num_input_tokens_seen": 45645824, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 1.9872340425531916, | |
| "grad_norm": 1.2026725696126155, | |
| "learning_rate": 3.272542485937369e-05, | |
| "loss": 0.9606, | |
| "num_input_tokens_seen": 45907968, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 1.998581560283688, | |
| "grad_norm": 1.2427515943863843, | |
| "learning_rate": 3.255546619281765e-05, | |
| "loss": 1.0062, | |
| "num_input_tokens_seen": 46170112, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 1.2427515943863843, | |
| "learning_rate": 3.2385122355055005e-05, | |
| "loss": 0.7789, | |
| "num_input_tokens_seen": 46202880, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 2.0113475177304965, | |
| "grad_norm": 3.9851861322995004, | |
| "learning_rate": 3.221440203006897e-05, | |
| "loss": 0.5235, | |
| "num_input_tokens_seen": 46465024, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 2.022695035460993, | |
| "grad_norm": 2.782182214592954, | |
| "learning_rate": 3.2043313921035743e-05, | |
| "loss": 0.5097, | |
| "num_input_tokens_seen": 46727168, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 2.0340425531914894, | |
| "grad_norm": 1.8270642765269718, | |
| "learning_rate": 3.1871866749880846e-05, | |
| "loss": 0.4839, | |
| "num_input_tokens_seen": 46989312, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 2.045390070921986, | |
| "grad_norm": 3.7339811640972473, | |
| "learning_rate": 3.170006925683448e-05, | |
| "loss": 0.4825, | |
| "num_input_tokens_seen": 47251456, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 2.0567375886524824, | |
| "grad_norm": 2.9941865165214754, | |
| "learning_rate": 3.152793019998594e-05, | |
| "loss": 0.4497, | |
| "num_input_tokens_seen": 47513600, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 2.068085106382979, | |
| "grad_norm": 1.8964492803313993, | |
| "learning_rate": 3.135545835483718e-05, | |
| "loss": 0.4443, | |
| "num_input_tokens_seen": 47775744, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 2.0794326241134753, | |
| "grad_norm": 1.6098003889750838, | |
| "learning_rate": 3.118266251385539e-05, | |
| "loss": 0.4355, | |
| "num_input_tokens_seen": 48037888, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 2.0907801418439718, | |
| "grad_norm": 1.523263157923423, | |
| "learning_rate": 3.100955148602481e-05, | |
| "loss": 0.4265, | |
| "num_input_tokens_seen": 48300032, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 2.1021276595744682, | |
| "grad_norm": 1.3639153937211606, | |
| "learning_rate": 3.083613409639764e-05, | |
| "loss": 0.4058, | |
| "num_input_tokens_seen": 48562176, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 2.1134751773049647, | |
| "grad_norm": 1.370922535424333, | |
| "learning_rate": 3.0662419185644115e-05, | |
| "loss": 0.4004, | |
| "num_input_tokens_seen": 48824320, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 2.124822695035461, | |
| "grad_norm": 1.440676408033895, | |
| "learning_rate": 3.0488415609601862e-05, | |
| "loss": 0.389, | |
| "num_input_tokens_seen": 49086464, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 2.1361702127659576, | |
| "grad_norm": 1.310285355956054, | |
| "learning_rate": 3.0314132238824415e-05, | |
| "loss": 0.404, | |
| "num_input_tokens_seen": 49348608, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 2.147517730496454, | |
| "grad_norm": 1.4941784665076816, | |
| "learning_rate": 3.013957795812902e-05, | |
| "loss": 0.382, | |
| "num_input_tokens_seen": 49610752, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 2.1588652482269506, | |
| "grad_norm": 1.4030391388098873, | |
| "learning_rate": 2.996476166614364e-05, | |
| "loss": 0.3788, | |
| "num_input_tokens_seen": 49872896, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 2.1702127659574466, | |
| "grad_norm": 1.256129131429154, | |
| "learning_rate": 2.9789692274853388e-05, | |
| "loss": 0.3925, | |
| "num_input_tokens_seen": 50135040, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 2.181560283687943, | |
| "grad_norm": 1.4558080899884718, | |
| "learning_rate": 2.9614378709146133e-05, | |
| "loss": 0.3868, | |
| "num_input_tokens_seen": 50397184, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 2.1929078014184396, | |
| "grad_norm": 1.250507734486578, | |
| "learning_rate": 2.943882990635759e-05, | |
| "loss": 0.3775, | |
| "num_input_tokens_seen": 50659328, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 2.204255319148936, | |
| "grad_norm": 1.2741202231518605, | |
| "learning_rate": 2.92630548158156e-05, | |
| "loss": 0.3897, | |
| "num_input_tokens_seen": 50921472, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 2.2156028368794325, | |
| "grad_norm": 1.2133688441052128, | |
| "learning_rate": 2.9087062398384e-05, | |
| "loss": 0.3644, | |
| "num_input_tokens_seen": 51183616, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 2.226950354609929, | |
| "grad_norm": 1.1909730406616328, | |
| "learning_rate": 2.8910861626005776e-05, | |
| "loss": 0.3933, | |
| "num_input_tokens_seen": 51445760, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 2.2382978723404254, | |
| "grad_norm": 1.1826769961907178, | |
| "learning_rate": 2.873446148124563e-05, | |
| "loss": 0.4031, | |
| "num_input_tokens_seen": 51707904, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 2.249645390070922, | |
| "grad_norm": 1.2503503019173514, | |
| "learning_rate": 2.8557870956832132e-05, | |
| "loss": 0.3774, | |
| "num_input_tokens_seen": 51970048, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 2.2609929078014184, | |
| "grad_norm": 1.1228352195116507, | |
| "learning_rate": 2.8381099055199222e-05, | |
| "loss": 0.396, | |
| "num_input_tokens_seen": 52232192, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 2.272340425531915, | |
| "grad_norm": 1.2833412130464927, | |
| "learning_rate": 2.8204154788027325e-05, | |
| "loss": 0.3589, | |
| "num_input_tokens_seen": 52494336, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 2.2836879432624113, | |
| "grad_norm": 1.1493157288933067, | |
| "learning_rate": 2.8027047175783873e-05, | |
| "loss": 0.36, | |
| "num_input_tokens_seen": 52756480, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 2.295035460992908, | |
| "grad_norm": 1.2057679652659834, | |
| "learning_rate": 2.7849785247263515e-05, | |
| "loss": 0.3938, | |
| "num_input_tokens_seen": 53018624, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 2.3063829787234043, | |
| "grad_norm": 1.2859581898852654, | |
| "learning_rate": 2.767237803912783e-05, | |
| "loss": 0.4006, | |
| "num_input_tokens_seen": 53280768, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 2.3177304964539007, | |
| "grad_norm": 1.2255059859165218, | |
| "learning_rate": 2.7494834595444568e-05, | |
| "loss": 0.3798, | |
| "num_input_tokens_seen": 53542912, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 2.329078014184397, | |
| "grad_norm": 1.1213649523858145, | |
| "learning_rate": 2.731716396722672e-05, | |
| "loss": 0.3806, | |
| "num_input_tokens_seen": 53805056, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 2.3404255319148937, | |
| "grad_norm": 1.215408045504973, | |
| "learning_rate": 2.7139375211970996e-05, | |
| "loss": 0.3794, | |
| "num_input_tokens_seen": 54067200, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 2.35177304964539, | |
| "grad_norm": 1.1823599565086842, | |
| "learning_rate": 2.6961477393196126e-05, | |
| "loss": 0.3836, | |
| "num_input_tokens_seen": 54329344, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 2.3631205673758866, | |
| "grad_norm": 1.1340883173812746, | |
| "learning_rate": 2.6783479579980807e-05, | |
| "loss": 0.3769, | |
| "num_input_tokens_seen": 54591488, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 2.374468085106383, | |
| "grad_norm": 1.2066922550248658, | |
| "learning_rate": 2.6605390846501377e-05, | |
| "loss": 0.3854, | |
| "num_input_tokens_seen": 54853632, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 2.3858156028368795, | |
| "grad_norm": 1.2171750570169015, | |
| "learning_rate": 2.6427220271569203e-05, | |
| "loss": 0.3872, | |
| "num_input_tokens_seen": 55115776, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 2.397163120567376, | |
| "grad_norm": 1.154268662146649, | |
| "learning_rate": 2.624897693816785e-05, | |
| "loss": 0.3877, | |
| "num_input_tokens_seen": 55377920, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 2.4085106382978725, | |
| "grad_norm": 1.1755024157603309, | |
| "learning_rate": 2.6070669932990067e-05, | |
| "loss": 0.3839, | |
| "num_input_tokens_seen": 55640064, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 2.419858156028369, | |
| "grad_norm": 1.2230171275187467, | |
| "learning_rate": 2.5892308345974515e-05, | |
| "loss": 0.3765, | |
| "num_input_tokens_seen": 55902208, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 2.4312056737588654, | |
| "grad_norm": 1.1601360552600042, | |
| "learning_rate": 2.5713901269842404e-05, | |
| "loss": 0.3788, | |
| "num_input_tokens_seen": 56164352, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 2.4425531914893615, | |
| "grad_norm": 1.2731492044445878, | |
| "learning_rate": 2.5535457799633955e-05, | |
| "loss": 0.3773, | |
| "num_input_tokens_seen": 56426496, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 2.453900709219858, | |
| "grad_norm": 1.1913197047744846, | |
| "learning_rate": 2.5356987032244683e-05, | |
| "loss": 0.3741, | |
| "num_input_tokens_seen": 56688640, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 2.4652482269503544, | |
| "grad_norm": 1.3734178315666306, | |
| "learning_rate": 2.5178498065961736e-05, | |
| "loss": 0.3959, | |
| "num_input_tokens_seen": 56950784, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 2.476595744680851, | |
| "grad_norm": 1.2130341884336506, | |
| "learning_rate": 2.5e-05, | |
| "loss": 0.3795, | |
| "num_input_tokens_seen": 57212928, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 2.4879432624113473, | |
| "grad_norm": 1.2201255253281913, | |
| "learning_rate": 2.4821501934038266e-05, | |
| "loss": 0.3643, | |
| "num_input_tokens_seen": 57475072, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 2.499290780141844, | |
| "grad_norm": 1.215550219664517, | |
| "learning_rate": 2.4643012967755326e-05, | |
| "loss": 0.3926, | |
| "num_input_tokens_seen": 57737216, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 2.5106382978723403, | |
| "grad_norm": 1.390476571364045, | |
| "learning_rate": 2.446454220036605e-05, | |
| "loss": 0.3904, | |
| "num_input_tokens_seen": 57999360, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 2.5219858156028367, | |
| "grad_norm": 1.117871728218981, | |
| "learning_rate": 2.42860987301576e-05, | |
| "loss": 0.373, | |
| "num_input_tokens_seen": 58261504, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 2.533333333333333, | |
| "grad_norm": 1.3698199224662784, | |
| "learning_rate": 2.410769165402549e-05, | |
| "loss": 0.4068, | |
| "num_input_tokens_seen": 58523648, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 2.5446808510638297, | |
| "grad_norm": 1.3551110495536183, | |
| "learning_rate": 2.3929330067009942e-05, | |
| "loss": 0.3859, | |
| "num_input_tokens_seen": 58785792, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 2.556028368794326, | |
| "grad_norm": 1.294001666579498, | |
| "learning_rate": 2.3751023061832158e-05, | |
| "loss": 0.3738, | |
| "num_input_tokens_seen": 59047936, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 2.5673758865248226, | |
| "grad_norm": 1.2033518812521018, | |
| "learning_rate": 2.35727797284308e-05, | |
| "loss": 0.4054, | |
| "num_input_tokens_seen": 59310080, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 2.578723404255319, | |
| "grad_norm": 1.3809415688054603, | |
| "learning_rate": 2.339460915349862e-05, | |
| "loss": 0.3948, | |
| "num_input_tokens_seen": 59572224, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 2.5900709219858156, | |
| "grad_norm": 1.144779349719294, | |
| "learning_rate": 2.3216520420019195e-05, | |
| "loss": 0.3976, | |
| "num_input_tokens_seen": 59834368, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 2.601418439716312, | |
| "grad_norm": 1.2132146253753062, | |
| "learning_rate": 2.303852260680388e-05, | |
| "loss": 0.3899, | |
| "num_input_tokens_seen": 60096512, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 2.6127659574468085, | |
| "grad_norm": 1.1499252028219595, | |
| "learning_rate": 2.2860624788029013e-05, | |
| "loss": 0.4025, | |
| "num_input_tokens_seen": 60358656, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 2.624113475177305, | |
| "grad_norm": 1.1403014147417394, | |
| "learning_rate": 2.268283603277328e-05, | |
| "loss": 0.3858, | |
| "num_input_tokens_seen": 60620800, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 2.6354609929078014, | |
| "grad_norm": 1.13239820788593, | |
| "learning_rate": 2.250516540455543e-05, | |
| "loss": 0.3676, | |
| "num_input_tokens_seen": 60882944, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 2.646808510638298, | |
| "grad_norm": 1.1982201887250508, | |
| "learning_rate": 2.2327621960872187e-05, | |
| "loss": 0.3744, | |
| "num_input_tokens_seen": 61145088, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 2.6581560283687944, | |
| "grad_norm": 1.091653134615552, | |
| "learning_rate": 2.2150214752736488e-05, | |
| "loss": 0.3804, | |
| "num_input_tokens_seen": 61407232, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 2.669503546099291, | |
| "grad_norm": 1.0757322285847275, | |
| "learning_rate": 2.197295282421613e-05, | |
| "loss": 0.3636, | |
| "num_input_tokens_seen": 61669376, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 2.6808510638297873, | |
| "grad_norm": 1.1907618432720997, | |
| "learning_rate": 2.179584521197268e-05, | |
| "loss": 0.3836, | |
| "num_input_tokens_seen": 61931520, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 2.692198581560284, | |
| "grad_norm": 1.154673483311636, | |
| "learning_rate": 2.1618900944800777e-05, | |
| "loss": 0.3797, | |
| "num_input_tokens_seen": 62193664, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 2.7035460992907803, | |
| "grad_norm": 1.155175360207987, | |
| "learning_rate": 2.1442129043167874e-05, | |
| "loss": 0.3783, | |
| "num_input_tokens_seen": 62455808, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 2.7148936170212767, | |
| "grad_norm": 1.2332827172317569, | |
| "learning_rate": 2.1265538518754374e-05, | |
| "loss": 0.3778, | |
| "num_input_tokens_seen": 62717952, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 2.726241134751773, | |
| "grad_norm": 1.1881992350835984, | |
| "learning_rate": 2.1089138373994223e-05, | |
| "loss": 0.4034, | |
| "num_input_tokens_seen": 62980096, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 2.7375886524822697, | |
| "grad_norm": 1.2274329880141919, | |
| "learning_rate": 2.0912937601616005e-05, | |
| "loss": 0.3808, | |
| "num_input_tokens_seen": 63242240, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 2.748936170212766, | |
| "grad_norm": 1.1187546128405046, | |
| "learning_rate": 2.0736945184184405e-05, | |
| "loss": 0.3743, | |
| "num_input_tokens_seen": 63504384, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 2.7602836879432626, | |
| "grad_norm": 1.2604037547622573, | |
| "learning_rate": 2.0561170093642423e-05, | |
| "loss": 0.4019, | |
| "num_input_tokens_seen": 63766528, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 2.771631205673759, | |
| "grad_norm": 1.1558782623081498, | |
| "learning_rate": 2.038562129085387e-05, | |
| "loss": 0.3791, | |
| "num_input_tokens_seen": 64028672, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 2.7829787234042556, | |
| "grad_norm": 1.1080497501880997, | |
| "learning_rate": 2.0210307725146615e-05, | |
| "loss": 0.3696, | |
| "num_input_tokens_seen": 64290816, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 2.794326241134752, | |
| "grad_norm": 1.3333275378619078, | |
| "learning_rate": 2.003523833385637e-05, | |
| "loss": 0.3945, | |
| "num_input_tokens_seen": 64552960, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 2.8056737588652485, | |
| "grad_norm": 1.1083885717571962, | |
| "learning_rate": 1.9860422041870987e-05, | |
| "loss": 0.3922, | |
| "num_input_tokens_seen": 64815104, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 2.8170212765957445, | |
| "grad_norm": 1.2103467593229682, | |
| "learning_rate": 1.9685867761175584e-05, | |
| "loss": 0.3921, | |
| "num_input_tokens_seen": 65077248, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.828368794326241, | |
| "grad_norm": 1.0852218255349009, | |
| "learning_rate": 1.9511584390398147e-05, | |
| "loss": 0.3846, | |
| "num_input_tokens_seen": 65339392, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 2.8397163120567375, | |
| "grad_norm": 1.2112711184032858, | |
| "learning_rate": 1.9337580814355888e-05, | |
| "loss": 0.3741, | |
| "num_input_tokens_seen": 65601536, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 2.851063829787234, | |
| "grad_norm": 1.1548719932949865, | |
| "learning_rate": 1.9163865903602374e-05, | |
| "loss": 0.3688, | |
| "num_input_tokens_seen": 65863680, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 2.8624113475177304, | |
| "grad_norm": 1.0632108357505439, | |
| "learning_rate": 1.899044851397519e-05, | |
| "loss": 0.3809, | |
| "num_input_tokens_seen": 66125824, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 2.873758865248227, | |
| "grad_norm": 1.2738307250224112, | |
| "learning_rate": 1.881733748614461e-05, | |
| "loss": 0.3861, | |
| "num_input_tokens_seen": 66387968, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 2.8851063829787233, | |
| "grad_norm": 1.2665120384000739, | |
| "learning_rate": 1.8644541645162834e-05, | |
| "loss": 0.3905, | |
| "num_input_tokens_seen": 66650112, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 2.89645390070922, | |
| "grad_norm": 1.1562998336132535, | |
| "learning_rate": 1.8472069800014068e-05, | |
| "loss": 0.3614, | |
| "num_input_tokens_seen": 66912256, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 2.9078014184397163, | |
| "grad_norm": 1.1877334097829628, | |
| "learning_rate": 1.8299930743165535e-05, | |
| "loss": 0.383, | |
| "num_input_tokens_seen": 67174400, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 2.9191489361702128, | |
| "grad_norm": 1.077396638449169, | |
| "learning_rate": 1.8128133250119157e-05, | |
| "loss": 0.3735, | |
| "num_input_tokens_seen": 67436544, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 2.9304964539007092, | |
| "grad_norm": 1.1563881856425553, | |
| "learning_rate": 1.795668607896426e-05, | |
| "loss": 0.3978, | |
| "num_input_tokens_seen": 67698688, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.9418439716312057, | |
| "grad_norm": 1.0938743557105097, | |
| "learning_rate": 1.778559796993104e-05, | |
| "loss": 0.3877, | |
| "num_input_tokens_seen": 67960832, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 2.953191489361702, | |
| "grad_norm": 1.1062018663854845, | |
| "learning_rate": 1.7614877644945e-05, | |
| "loss": 0.3862, | |
| "num_input_tokens_seen": 68222976, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 2.9645390070921986, | |
| "grad_norm": 1.0442669458478842, | |
| "learning_rate": 1.7444533807182357e-05, | |
| "loss": 0.3506, | |
| "num_input_tokens_seen": 68485120, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 2.975886524822695, | |
| "grad_norm": 1.0755649234512432, | |
| "learning_rate": 1.7274575140626318e-05, | |
| "loss": 0.371, | |
| "num_input_tokens_seen": 68747264, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 2.9872340425531916, | |
| "grad_norm": 1.0856119742333918, | |
| "learning_rate": 1.710501030962438e-05, | |
| "loss": 0.3866, | |
| "num_input_tokens_seen": 69009408, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 2.998581560283688, | |
| "grad_norm": 1.0445462962582197, | |
| "learning_rate": 1.6935847958446657e-05, | |
| "loss": 0.3701, | |
| "num_input_tokens_seen": 69271552, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 1.0445462962582197, | |
| "learning_rate": 1.6767096710845174e-05, | |
| "loss": 0.2901, | |
| "num_input_tokens_seen": 69304320, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 3.0113475177304965, | |
| "grad_norm": 2.74466686658697, | |
| "learning_rate": 1.6598765169614243e-05, | |
| "loss": 0.1312, | |
| "num_input_tokens_seen": 69566464, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 3.022695035460993, | |
| "grad_norm": 1.5834322773598914, | |
| "learning_rate": 1.643086191615194e-05, | |
| "loss": 0.1237, | |
| "num_input_tokens_seen": 69828608, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 3.0340425531914894, | |
| "grad_norm": 1.303912889019024, | |
| "learning_rate": 1.6263395510022543e-05, | |
| "loss": 0.1163, | |
| "num_input_tokens_seen": 70090752, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 3.045390070921986, | |
| "grad_norm": 0.9262307477869844, | |
| "learning_rate": 1.6096374488520265e-05, | |
| "loss": 0.101, | |
| "num_input_tokens_seen": 70352896, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 3.0567375886524824, | |
| "grad_norm": 0.9926400294079983, | |
| "learning_rate": 1.5929807366233977e-05, | |
| "loss": 0.1097, | |
| "num_input_tokens_seen": 70615040, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 3.068085106382979, | |
| "grad_norm": 1.275115327394668, | |
| "learning_rate": 1.5763702634613152e-05, | |
| "loss": 0.1049, | |
| "num_input_tokens_seen": 70877184, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 3.0794326241134753, | |
| "grad_norm": 1.5930282703149587, | |
| "learning_rate": 1.559806876153501e-05, | |
| "loss": 0.1168, | |
| "num_input_tokens_seen": 71139328, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 3.0907801418439718, | |
| "grad_norm": 1.4943109059271082, | |
| "learning_rate": 1.5432914190872757e-05, | |
| "loss": 0.1063, | |
| "num_input_tokens_seen": 71401472, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 3.1021276595744682, | |
| "grad_norm": 1.3224758273342347, | |
| "learning_rate": 1.5268247342065215e-05, | |
| "loss": 0.0996, | |
| "num_input_tokens_seen": 71663616, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 3.1134751773049647, | |
| "grad_norm": 1.0401363088355635, | |
| "learning_rate": 1.5104076609687545e-05, | |
| "loss": 0.099, | |
| "num_input_tokens_seen": 71925760, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 3.124822695035461, | |
| "grad_norm": 0.9168634985756315, | |
| "learning_rate": 1.4940410363023306e-05, | |
| "loss": 0.0904, | |
| "num_input_tokens_seen": 72187904, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 3.1361702127659576, | |
| "grad_norm": 0.8236578810199936, | |
| "learning_rate": 1.4777256945637834e-05, | |
| "loss": 0.0986, | |
| "num_input_tokens_seen": 72450048, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 3.147517730496454, | |
| "grad_norm": 0.7819456617953726, | |
| "learning_rate": 1.4614624674952842e-05, | |
| "loss": 0.0922, | |
| "num_input_tokens_seen": 72712192, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 3.1588652482269506, | |
| "grad_norm": 0.811675121286178, | |
| "learning_rate": 1.4452521841822436e-05, | |
| "loss": 0.0981, | |
| "num_input_tokens_seen": 72974336, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 3.1702127659574466, | |
| "grad_norm": 0.7976686044001465, | |
| "learning_rate": 1.4290956710110475e-05, | |
| "loss": 0.0924, | |
| "num_input_tokens_seen": 73236480, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 3.181560283687943, | |
| "grad_norm": 0.751816526314869, | |
| "learning_rate": 1.4129937516269203e-05, | |
| "loss": 0.0947, | |
| "num_input_tokens_seen": 73498624, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 3.1929078014184396, | |
| "grad_norm": 0.8177754717456022, | |
| "learning_rate": 1.3969472468919461e-05, | |
| "loss": 0.0921, | |
| "num_input_tokens_seen": 73760768, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 3.204255319148936, | |
| "grad_norm": 0.7571785449155486, | |
| "learning_rate": 1.3809569748432189e-05, | |
| "loss": 0.0906, | |
| "num_input_tokens_seen": 74022912, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 3.2156028368794325, | |
| "grad_norm": 0.747961359332696, | |
| "learning_rate": 1.3650237506511331e-05, | |
| "loss": 0.0872, | |
| "num_input_tokens_seen": 74285056, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 3.226950354609929, | |
| "grad_norm": 0.792288904094197, | |
| "learning_rate": 1.3491483865778365e-05, | |
| "loss": 0.0884, | |
| "num_input_tokens_seen": 74547200, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 3.2382978723404254, | |
| "grad_norm": 0.8907746700880744, | |
| "learning_rate": 1.3333316919358157e-05, | |
| "loss": 0.0967, | |
| "num_input_tokens_seen": 74809344, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 3.249645390070922, | |
| "grad_norm": 0.7775009871286768, | |
| "learning_rate": 1.3175744730466408e-05, | |
| "loss": 0.0818, | |
| "num_input_tokens_seen": 75071488, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 3.2609929078014184, | |
| "grad_norm": 0.7695449735932754, | |
| "learning_rate": 1.301877533199859e-05, | |
| "loss": 0.0888, | |
| "num_input_tokens_seen": 75333632, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 3.272340425531915, | |
| "grad_norm": 0.8110072874482183, | |
| "learning_rate": 1.2862416726120396e-05, | |
| "loss": 0.0906, | |
| "num_input_tokens_seen": 75595776, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 3.2836879432624113, | |
| "grad_norm": 0.7766644179424415, | |
| "learning_rate": 1.2706676883859903e-05, | |
| "loss": 0.0872, | |
| "num_input_tokens_seen": 75857920, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 3.295035460992908, | |
| "grad_norm": 0.7189275332544894, | |
| "learning_rate": 1.2551563744701109e-05, | |
| "loss": 0.087, | |
| "num_input_tokens_seen": 76120064, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 3.3063829787234043, | |
| "grad_norm": 0.711512593628328, | |
| "learning_rate": 1.2397085216179208e-05, | |
| "loss": 0.0852, | |
| "num_input_tokens_seen": 76382208, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 3.3177304964539007, | |
| "grad_norm": 0.7338724262748597, | |
| "learning_rate": 1.2243249173477513e-05, | |
| "loss": 0.0877, | |
| "num_input_tokens_seen": 76644352, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 3.329078014184397, | |
| "grad_norm": 0.7269268914811858, | |
| "learning_rate": 1.2090063459025955e-05, | |
| "loss": 0.086, | |
| "num_input_tokens_seen": 76906496, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 3.3404255319148937, | |
| "grad_norm": 0.7772637508360701, | |
| "learning_rate": 1.1937535882101281e-05, | |
| "loss": 0.0898, | |
| "num_input_tokens_seen": 77168640, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 3.35177304964539, | |
| "grad_norm": 0.7412964391797219, | |
| "learning_rate": 1.1785674218428952e-05, | |
| "loss": 0.0936, | |
| "num_input_tokens_seen": 77430784, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 3.3631205673758866, | |
| "grad_norm": 0.717507466030445, | |
| "learning_rate": 1.163448620978674e-05, | |
| "loss": 0.0837, | |
| "num_input_tokens_seen": 77692928, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 3.374468085106383, | |
| "grad_norm": 0.7491045281676182, | |
| "learning_rate": 1.148397956361007e-05, | |
| "loss": 0.0902, | |
| "num_input_tokens_seen": 77955072, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 3.3858156028368795, | |
| "grad_norm": 0.7816973300591532, | |
| "learning_rate": 1.1334161952599054e-05, | |
| "loss": 0.085, | |
| "num_input_tokens_seen": 78217216, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 3.397163120567376, | |
| "grad_norm": 0.7754872404477443, | |
| "learning_rate": 1.1185041014327433e-05, | |
| "loss": 0.0967, | |
| "num_input_tokens_seen": 78479360, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 3.4085106382978725, | |
| "grad_norm": 0.7562220941293399, | |
| "learning_rate": 1.1036624350853145e-05, | |
| "loss": 0.0861, | |
| "num_input_tokens_seen": 78741504, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 3.419858156028369, | |
| "grad_norm": 0.7446834922212843, | |
| "learning_rate": 1.0888919528330777e-05, | |
| "loss": 0.0866, | |
| "num_input_tokens_seen": 79003648, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 3.4312056737588654, | |
| "grad_norm": 0.7368714630650928, | |
| "learning_rate": 1.0741934076625895e-05, | |
| "loss": 0.0909, | |
| "num_input_tokens_seen": 79265792, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 3.4425531914893615, | |
| "grad_norm": 0.7586993943731002, | |
| "learning_rate": 1.059567548893118e-05, | |
| "loss": 0.0864, | |
| "num_input_tokens_seen": 79527936, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 3.453900709219858, | |
| "grad_norm": 0.7026395743924377, | |
| "learning_rate": 1.0450151221384358e-05, | |
| "loss": 0.0907, | |
| "num_input_tokens_seen": 79790080, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 3.4652482269503544, | |
| "grad_norm": 0.7698549183761029, | |
| "learning_rate": 1.0305368692688174e-05, | |
| "loss": 0.0875, | |
| "num_input_tokens_seen": 80052224, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 3.476595744680851, | |
| "grad_norm": 0.7275021764735461, | |
| "learning_rate": 1.016133528373215e-05, | |
| "loss": 0.0876, | |
| "num_input_tokens_seen": 80314368, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 3.4879432624113473, | |
| "grad_norm": 0.6801616141816872, | |
| "learning_rate": 1.0018058337216327e-05, | |
| "loss": 0.0757, | |
| "num_input_tokens_seen": 80576512, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 3.499290780141844, | |
| "grad_norm": 0.7242637394130332, | |
| "learning_rate": 9.875545157276939e-06, | |
| "loss": 0.0805, | |
| "num_input_tokens_seen": 80838656, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 3.5106382978723403, | |
| "grad_norm": 0.695464243410679, | |
| "learning_rate": 9.733803009114045e-06, | |
| "loss": 0.0853, | |
| "num_input_tokens_seen": 81100800, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 3.5219858156028367, | |
| "grad_norm": 0.7091878866235037, | |
| "learning_rate": 9.592839118621187e-06, | |
| "loss": 0.0871, | |
| "num_input_tokens_seen": 81362944, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 3.533333333333333, | |
| "grad_norm": 0.7211061205609051, | |
| "learning_rate": 9.452660672016949e-06, | |
| "loss": 0.0787, | |
| "num_input_tokens_seen": 81625088, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 3.5446808510638297, | |
| "grad_norm": 0.7222949726436061, | |
| "learning_rate": 9.313274815478698e-06, | |
| "loss": 0.0862, | |
| "num_input_tokens_seen": 81887232, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 3.556028368794326, | |
| "grad_norm": 0.7838259636406755, | |
| "learning_rate": 9.174688654778243e-06, | |
| "loss": 0.0936, | |
| "num_input_tokens_seen": 82149376, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 3.5673758865248226, | |
| "grad_norm": 0.7463416558404256, | |
| "learning_rate": 9.036909254919549e-06, | |
| "loss": 0.0802, | |
| "num_input_tokens_seen": 82411520, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 3.578723404255319, | |
| "grad_norm": 0.7081806282392477, | |
| "learning_rate": 8.899943639778619e-06, | |
| "loss": 0.0889, | |
| "num_input_tokens_seen": 82673664, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 3.5900709219858156, | |
| "grad_norm": 0.725305882992691, | |
| "learning_rate": 8.763798791745411e-06, | |
| "loss": 0.0787, | |
| "num_input_tokens_seen": 82935808, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 3.601418439716312, | |
| "grad_norm": 0.6909525153125249, | |
| "learning_rate": 8.628481651367876e-06, | |
| "loss": 0.084, | |
| "num_input_tokens_seen": 83197952, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 3.6127659574468085, | |
| "grad_norm": 0.8073887823074956, | |
| "learning_rate": 8.49399911699814e-06, | |
| "loss": 0.0897, | |
| "num_input_tokens_seen": 83460096, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 3.624113475177305, | |
| "grad_norm": 0.7270215654465839, | |
| "learning_rate": 8.360358044440797e-06, | |
| "loss": 0.0876, | |
| "num_input_tokens_seen": 83722240, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 3.6354609929078014, | |
| "grad_norm": 0.7664259700341645, | |
| "learning_rate": 8.227565246603493e-06, | |
| "loss": 0.0815, | |
| "num_input_tokens_seen": 83984384, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 3.646808510638298, | |
| "grad_norm": 0.6912127705169636, | |
| "learning_rate": 8.09562749314952e-06, | |
| "loss": 0.0855, | |
| "num_input_tokens_seen": 84246528, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 3.6581560283687944, | |
| "grad_norm": 0.7391846111163392, | |
| "learning_rate": 7.96455151015272e-06, | |
| "loss": 0.086, | |
| "num_input_tokens_seen": 84508672, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 3.669503546099291, | |
| "grad_norm": 0.7040432780957079, | |
| "learning_rate": 7.83434397975466e-06, | |
| "loss": 0.0869, | |
| "num_input_tokens_seen": 84770816, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 3.6808510638297873, | |
| "grad_norm": 0.7049978471337497, | |
| "learning_rate": 7.705011539823911e-06, | |
| "loss": 0.0791, | |
| "num_input_tokens_seen": 85032960, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 3.692198581560284, | |
| "grad_norm": 0.6773004765686385, | |
| "learning_rate": 7.576560783617668e-06, | |
| "loss": 0.0807, | |
| "num_input_tokens_seen": 85295104, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 3.7035460992907803, | |
| "grad_norm": 0.7205733650917361, | |
| "learning_rate": 7.448998259445664e-06, | |
| "loss": 0.0846, | |
| "num_input_tokens_seen": 85557248, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 3.7148936170212767, | |
| "grad_norm": 0.7090150829436532, | |
| "learning_rate": 7.3223304703363135e-06, | |
| "loss": 0.0755, | |
| "num_input_tokens_seen": 85819392, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 3.726241134751773, | |
| "grad_norm": 0.7317047711785631, | |
| "learning_rate": 7.196563873705209e-06, | |
| "loss": 0.0897, | |
| "num_input_tokens_seen": 86081536, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 3.7375886524822697, | |
| "grad_norm": 0.7593361249228872, | |
| "learning_rate": 7.071704881025915e-06, | |
| "loss": 0.0807, | |
| "num_input_tokens_seen": 86343680, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 3.748936170212766, | |
| "grad_norm": 0.7235635799774105, | |
| "learning_rate": 6.947759857503119e-06, | |
| "loss": 0.0839, | |
| "num_input_tokens_seen": 86605824, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 3.7602836879432626, | |
| "grad_norm": 0.7456576507189918, | |
| "learning_rate": 6.824735121748163e-06, | |
| "loss": 0.0811, | |
| "num_input_tokens_seen": 86867968, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 3.771631205673759, | |
| "grad_norm": 0.693061575666079, | |
| "learning_rate": 6.70263694545687e-06, | |
| "loss": 0.0759, | |
| "num_input_tokens_seen": 87130112, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 3.7829787234042556, | |
| "grad_norm": 0.6702503675155104, | |
| "learning_rate": 6.5814715530898745e-06, | |
| "loss": 0.0764, | |
| "num_input_tokens_seen": 87392256, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 3.794326241134752, | |
| "grad_norm": 0.7321643097852675, | |
| "learning_rate": 6.461245121555307e-06, | |
| "loss": 0.0855, | |
| "num_input_tokens_seen": 87654400, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 3.8056737588652485, | |
| "grad_norm": 0.7231929761648604, | |
| "learning_rate": 6.341963779893828e-06, | |
| "loss": 0.0789, | |
| "num_input_tokens_seen": 87916544, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 3.8170212765957445, | |
| "grad_norm": 0.6959508065425039, | |
| "learning_rate": 6.223633608966254e-06, | |
| "loss": 0.0804, | |
| "num_input_tokens_seen": 88178688, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 3.828368794326241, | |
| "grad_norm": 0.6689396318017862, | |
| "learning_rate": 6.106260641143546e-06, | |
| "loss": 0.0846, | |
| "num_input_tokens_seen": 88440832, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 3.8397163120567375, | |
| "grad_norm": 0.7000335984216709, | |
| "learning_rate": 5.989850859999227e-06, | |
| "loss": 0.0768, | |
| "num_input_tokens_seen": 88702976, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 3.851063829787234, | |
| "grad_norm": 0.6819366248376687, | |
| "learning_rate": 5.874410200004421e-06, | |
| "loss": 0.073, | |
| "num_input_tokens_seen": 88965120, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 3.8624113475177304, | |
| "grad_norm": 0.6970883337622755, | |
| "learning_rate": 5.759944546225271e-06, | |
| "loss": 0.0826, | |
| "num_input_tokens_seen": 89227264, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 3.873758865248227, | |
| "grad_norm": 0.6874281419558664, | |
| "learning_rate": 5.646459734022938e-06, | |
| "loss": 0.0697, | |
| "num_input_tokens_seen": 89489408, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 3.8851063829787233, | |
| "grad_norm": 0.7106888911769208, | |
| "learning_rate": 5.533961548756128e-06, | |
| "loss": 0.0812, | |
| "num_input_tokens_seen": 89751552, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 3.89645390070922, | |
| "grad_norm": 0.7331620079344322, | |
| "learning_rate": 5.422455725486114e-06, | |
| "loss": 0.0836, | |
| "num_input_tokens_seen": 90013696, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 3.9078014184397163, | |
| "grad_norm": 0.7035598390060888, | |
| "learning_rate": 5.311947948684457e-06, | |
| "loss": 0.0815, | |
| "num_input_tokens_seen": 90275840, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 3.9191489361702128, | |
| "grad_norm": 0.7060132556751886, | |
| "learning_rate": 5.202443851943126e-06, | |
| "loss": 0.0806, | |
| "num_input_tokens_seen": 90537984, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 3.9304964539007092, | |
| "grad_norm": 0.6788669783816418, | |
| "learning_rate": 5.093949017687341e-06, | |
| "loss": 0.0794, | |
| "num_input_tokens_seen": 90800128, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 3.9418439716312057, | |
| "grad_norm": 0.6687951756948881, | |
| "learning_rate": 4.986468976890993e-06, | |
| "loss": 0.0734, | |
| "num_input_tokens_seen": 91062272, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 3.953191489361702, | |
| "grad_norm": 0.691944078100738, | |
| "learning_rate": 4.880009208794667e-06, | |
| "loss": 0.0814, | |
| "num_input_tokens_seen": 91324416, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 3.9645390070921986, | |
| "grad_norm": 0.6500781762399683, | |
| "learning_rate": 4.7745751406263165e-06, | |
| "loss": 0.0726, | |
| "num_input_tokens_seen": 91586560, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 3.975886524822695, | |
| "grad_norm": 0.671905484958583, | |
| "learning_rate": 4.670172147324592e-06, | |
| "loss": 0.077, | |
| "num_input_tokens_seen": 91848704, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 3.9872340425531916, | |
| "grad_norm": 0.6796917288094408, | |
| "learning_rate": 4.566805551264827e-06, | |
| "loss": 0.0796, | |
| "num_input_tokens_seen": 92110848, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 3.998581560283688, | |
| "grad_norm": 0.6833947734522067, | |
| "learning_rate": 4.4644806219877184e-06, | |
| "loss": 0.0723, | |
| "num_input_tokens_seen": 92372992, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.6833947734522067, | |
| "learning_rate": 4.36320257593065e-06, | |
| "loss": 0.045, | |
| "num_input_tokens_seen": 92405760, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 4.0113475177304965, | |
| "grad_norm": 1.477142079845877, | |
| "learning_rate": 4.262976576161842e-06, | |
| "loss": 0.0244, | |
| "num_input_tokens_seen": 92667904, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 4.022695035460993, | |
| "grad_norm": 0.5476807803856774, | |
| "learning_rate": 4.1638077321170646e-06, | |
| "loss": 0.0227, | |
| "num_input_tokens_seen": 92930048, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 4.034042553191489, | |
| "grad_norm": 0.48697982739940027, | |
| "learning_rate": 4.0657010993391865e-06, | |
| "loss": 0.0213, | |
| "num_input_tokens_seen": 93192192, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 4.045390070921986, | |
| "grad_norm": 0.44199476911249697, | |
| "learning_rate": 3.968661679220468e-06, | |
| "loss": 0.0196, | |
| "num_input_tokens_seen": 93454336, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 4.056737588652482, | |
| "grad_norm": 0.40308653677545786, | |
| "learning_rate": 3.872694418747594e-06, | |
| "loss": 0.018, | |
| "num_input_tokens_seen": 93716480, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 4.068085106382979, | |
| "grad_norm": 0.37702562689636665, | |
| "learning_rate": 3.777804210249436e-06, | |
| "loss": 0.0176, | |
| "num_input_tokens_seen": 93978624, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 4.079432624113475, | |
| "grad_norm": 0.36421639955022134, | |
| "learning_rate": 3.6839958911476957e-06, | |
| "loss": 0.0193, | |
| "num_input_tokens_seen": 94240768, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 4.090780141843972, | |
| "grad_norm": 0.3805365338343004, | |
| "learning_rate": 3.591274243710277e-06, | |
| "loss": 0.0176, | |
| "num_input_tokens_seen": 94502912, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 4.102127659574468, | |
| "grad_norm": 0.3382957037488846, | |
| "learning_rate": 3.499643994807486e-06, | |
| "loss": 0.0163, | |
| "num_input_tokens_seen": 94765056, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 4.113475177304965, | |
| "grad_norm": 0.32506441384380935, | |
| "learning_rate": 3.4091098156710744e-06, | |
| "loss": 0.0159, | |
| "num_input_tokens_seen": 95027200, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 4.124822695035461, | |
| "grad_norm": 0.3165720865193881, | |
| "learning_rate": 3.319676321656082e-06, | |
| "loss": 0.0164, | |
| "num_input_tokens_seen": 95289344, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 4.136170212765958, | |
| "grad_norm": 0.35920729686783553, | |
| "learning_rate": 3.2313480720055745e-06, | |
| "loss": 0.0159, | |
| "num_input_tokens_seen": 95551488, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 4.147517730496454, | |
| "grad_norm": 0.35575880232588847, | |
| "learning_rate": 3.1441295696181897e-06, | |
| "loss": 0.0161, | |
| "num_input_tokens_seen": 95813632, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 4.158865248226951, | |
| "grad_norm": 0.35269152277096033, | |
| "learning_rate": 3.058025260818609e-06, | |
| "loss": 0.0149, | |
| "num_input_tokens_seen": 96075776, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 4.170212765957447, | |
| "grad_norm": 0.38681949998462883, | |
| "learning_rate": 2.9730395351308866e-06, | |
| "loss": 0.0147, | |
| "num_input_tokens_seen": 96337920, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 4.1815602836879435, | |
| "grad_norm": 0.34874854085333057, | |
| "learning_rate": 2.889176725054643e-06, | |
| "loss": 0.0164, | |
| "num_input_tokens_seen": 96600064, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 4.19290780141844, | |
| "grad_norm": 0.42993916416930195, | |
| "learning_rate": 2.80644110584424e-06, | |
| "loss": 0.0149, | |
| "num_input_tokens_seen": 96862208, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 4.2042553191489365, | |
| "grad_norm": 0.3819874015881562, | |
| "learning_rate": 2.7248368952908053e-06, | |
| "loss": 0.0162, | |
| "num_input_tokens_seen": 97124352, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 4.215602836879433, | |
| "grad_norm": 0.39897276144640115, | |
| "learning_rate": 2.6443682535072177e-06, | |
| "loss": 0.0144, | |
| "num_input_tokens_seen": 97386496, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 4.226950354609929, | |
| "grad_norm": 0.3570885820962534, | |
| "learning_rate": 2.565039282716045e-06, | |
| "loss": 0.0143, | |
| "num_input_tokens_seen": 97648640, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 4.238297872340426, | |
| "grad_norm": 0.36959919713875067, | |
| "learning_rate": 2.486854027040375e-06, | |
| "loss": 0.0147, | |
| "num_input_tokens_seen": 97910784, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 4.249645390070922, | |
| "grad_norm": 0.3387567156073276, | |
| "learning_rate": 2.4098164722977073e-06, | |
| "loss": 0.0142, | |
| "num_input_tokens_seen": 98172928, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 4.260992907801419, | |
| "grad_norm": 0.35273580349206224, | |
| "learning_rate": 2.333930545796717e-06, | |
| "loss": 0.0162, | |
| "num_input_tokens_seen": 98435072, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 4.272340425531915, | |
| "grad_norm": 0.41537674406100333, | |
| "learning_rate": 2.2592001161370392e-06, | |
| "loss": 0.0163, | |
| "num_input_tokens_seen": 98697216, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 4.283687943262412, | |
| "grad_norm": 0.3722991986523798, | |
| "learning_rate": 2.185628993012101e-06, | |
| "loss": 0.0146, | |
| "num_input_tokens_seen": 98959360, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 4.295035460992908, | |
| "grad_norm": 0.34823793557474586, | |
| "learning_rate": 2.11322092701485e-06, | |
| "loss": 0.0155, | |
| "num_input_tokens_seen": 99221504, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 4.306382978723404, | |
| "grad_norm": 0.36284584139761145, | |
| "learning_rate": 2.0419796094465788e-06, | |
| "loss": 0.014, | |
| "num_input_tokens_seen": 99483648, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 4.317730496453901, | |
| "grad_norm": 0.33330741537946007, | |
| "learning_rate": 1.97190867212875e-06, | |
| "loss": 0.0158, | |
| "num_input_tokens_seen": 99745792, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 4.329078014184397, | |
| "grad_norm": 0.3678153199936105, | |
| "learning_rate": 1.9030116872178316e-06, | |
| "loss": 0.0145, | |
| "num_input_tokens_seen": 100007936, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 4.340425531914893, | |
| "grad_norm": 0.3179402575720803, | |
| "learning_rate": 1.8352921670232143e-06, | |
| "loss": 0.0142, | |
| "num_input_tokens_seen": 100270080, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 4.35177304964539, | |
| "grad_norm": 0.3139410780519724, | |
| "learning_rate": 1.768753563828135e-06, | |
| "loss": 0.0145, | |
| "num_input_tokens_seen": 100532224, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 4.363120567375886, | |
| "grad_norm": 0.3543527103722898, | |
| "learning_rate": 1.703399269713693e-06, | |
| "loss": 0.0139, | |
| "num_input_tokens_seen": 100794368, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 4.374468085106383, | |
| "grad_norm": 0.335580074515755, | |
| "learning_rate": 1.6392326163859273e-06, | |
| "loss": 0.0165, | |
| "num_input_tokens_seen": 101056512, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 4.385815602836879, | |
| "grad_norm": 0.34683584517312804, | |
| "learning_rate": 1.5762568750059604e-06, | |
| "loss": 0.0152, | |
| "num_input_tokens_seen": 101318656, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 4.397163120567376, | |
| "grad_norm": 0.3626865573265347, | |
| "learning_rate": 1.5144752560232372e-06, | |
| "loss": 0.016, | |
| "num_input_tokens_seen": 101580800, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 4.408510638297872, | |
| "grad_norm": 0.3038959732006494, | |
| "learning_rate": 1.4538909090118846e-06, | |
| "loss": 0.0135, | |
| "num_input_tokens_seen": 101842944, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 4.4198581560283685, | |
| "grad_norm": 0.33499076540774964, | |
| "learning_rate": 1.3945069225101026e-06, | |
| "loss": 0.0152, | |
| "num_input_tokens_seen": 102105088, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 4.431205673758865, | |
| "grad_norm": 0.3607028863228608, | |
| "learning_rate": 1.3363263238627493e-06, | |
| "loss": 0.0161, | |
| "num_input_tokens_seen": 102367232, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 4.4425531914893615, | |
| "grad_norm": 0.33049646901291296, | |
| "learning_rate": 1.2793520790670116e-06, | |
| "loss": 0.0153, | |
| "num_input_tokens_seen": 102629376, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 4.453900709219858, | |
| "grad_norm": 0.3409835608306395, | |
| "learning_rate": 1.2235870926211619e-06, | |
| "loss": 0.0147, | |
| "num_input_tokens_seen": 102891520, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 4.465248226950354, | |
| "grad_norm": 0.3426495926722642, | |
| "learning_rate": 1.1690342073765375e-06, | |
| "loss": 0.0148, | |
| "num_input_tokens_seen": 103153664, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 4.476595744680851, | |
| "grad_norm": 0.3385258820673031, | |
| "learning_rate": 1.1156962043925828e-06, | |
| "loss": 0.0158, | |
| "num_input_tokens_seen": 103415808, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 4.487943262411347, | |
| "grad_norm": 0.3366041472428315, | |
| "learning_rate": 1.0635758027950888e-06, | |
| "loss": 0.0135, | |
| "num_input_tokens_seen": 103677952, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 4.499290780141844, | |
| "grad_norm": 0.3580533391262604, | |
| "learning_rate": 1.0126756596375686e-06, | |
| "loss": 0.0153, | |
| "num_input_tokens_seen": 103940096, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 4.51063829787234, | |
| "grad_norm": 0.32986746041795145, | |
| "learning_rate": 9.629983697657886e-07, | |
| "loss": 0.0148, | |
| "num_input_tokens_seen": 104202240, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 4.521985815602837, | |
| "grad_norm": 0.32413763896921793, | |
| "learning_rate": 9.145464656855257e-07, | |
| "loss": 0.0137, | |
| "num_input_tokens_seen": 104464384, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 4.533333333333333, | |
| "grad_norm": 0.2817637969270016, | |
| "learning_rate": 8.673224174334221e-07, | |
| "loss": 0.0134, | |
| "num_input_tokens_seen": 104726528, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 4.54468085106383, | |
| "grad_norm": 0.348191153365174, | |
| "learning_rate": 8.213286324510738e-07, | |
| "loss": 0.0147, | |
| "num_input_tokens_seen": 104988672, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 4.556028368794326, | |
| "grad_norm": 0.3602776848534372, | |
| "learning_rate": 7.765674554623181e-07, | |
| "loss": 0.0168, | |
| "num_input_tokens_seen": 105250816, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 4.567375886524823, | |
| "grad_norm": 0.31058750754194875, | |
| "learning_rate": 7.330411683536876e-07, | |
| "loss": 0.0147, | |
| "num_input_tokens_seen": 105512960, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 4.578723404255319, | |
| "grad_norm": 0.3075632948113939, | |
| "learning_rate": 6.907519900580861e-07, | |
| "loss": 0.0155, | |
| "num_input_tokens_seen": 105775104, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 4.590070921985816, | |
| "grad_norm": 0.3413483582876884, | |
| "learning_rate": 6.497020764416633e-07, | |
| "loss": 0.0153, | |
| "num_input_tokens_seen": 106037248, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 4.601418439716312, | |
| "grad_norm": 0.3299810205766221, | |
| "learning_rate": 6.098935201939187e-07, | |
| "loss": 0.0139, | |
| "num_input_tokens_seen": 106299392, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 4.6127659574468085, | |
| "grad_norm": 0.3045488270246865, | |
| "learning_rate": 5.713283507210148e-07, | |
| "loss": 0.0137, | |
| "num_input_tokens_seen": 106561536, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 4.624113475177305, | |
| "grad_norm": 0.30919394459980154, | |
| "learning_rate": 5.340085340423129e-07, | |
| "loss": 0.0136, | |
| "num_input_tokens_seen": 106823680, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 4.6354609929078014, | |
| "grad_norm": 0.3317176085043783, | |
| "learning_rate": 4.979359726901639e-07, | |
| "loss": 0.0145, | |
| "num_input_tokens_seen": 107085824, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 4.646808510638298, | |
| "grad_norm": 0.3157192290221517, | |
| "learning_rate": 4.63112505612906e-07, | |
| "loss": 0.0137, | |
| "num_input_tokens_seen": 107347968, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 4.658156028368794, | |
| "grad_norm": 0.304631562860152, | |
| "learning_rate": 4.2953990808111135e-07, | |
| "loss": 0.015, | |
| "num_input_tokens_seen": 107610112, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 4.669503546099291, | |
| "grad_norm": 0.3526905412516224, | |
| "learning_rate": 3.972198915970976e-07, | |
| "loss": 0.0152, | |
| "num_input_tokens_seen": 107872256, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 4.680851063829787, | |
| "grad_norm": 0.33679224274071523, | |
| "learning_rate": 3.6615410380767544e-07, | |
| "loss": 0.0147, | |
| "num_input_tokens_seen": 108134400, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 4.692198581560284, | |
| "grad_norm": 0.32343102481646735, | |
| "learning_rate": 3.3634412842014353e-07, | |
| "loss": 0.0139, | |
| "num_input_tokens_seen": 108396544, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 4.70354609929078, | |
| "grad_norm": 0.2946484230167598, | |
| "learning_rate": 3.077914851215585e-07, | |
| "loss": 0.0142, | |
| "num_input_tokens_seen": 108658688, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 4.714893617021277, | |
| "grad_norm": 0.30378846194550047, | |
| "learning_rate": 2.804976295012612e-07, | |
| "loss": 0.0139, | |
| "num_input_tokens_seen": 108920832, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 4.726241134751773, | |
| "grad_norm": 0.33144732440436586, | |
| "learning_rate": 2.544639529766829e-07, | |
| "loss": 0.0136, | |
| "num_input_tokens_seen": 109182976, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 4.73758865248227, | |
| "grad_norm": 0.3360054980469322, | |
| "learning_rate": 2.2969178272238545e-07, | |
| "loss": 0.0143, | |
| "num_input_tokens_seen": 109445120, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 4.748936170212766, | |
| "grad_norm": 0.3119829984354626, | |
| "learning_rate": 2.061823816024322e-07, | |
| "loss": 0.0132, | |
| "num_input_tokens_seen": 109707264, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 4.760283687943263, | |
| "grad_norm": 0.3182968779470437, | |
| "learning_rate": 1.8393694810599493e-07, | |
| "loss": 0.0153, | |
| "num_input_tokens_seen": 109969408, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 4.771631205673759, | |
| "grad_norm": 0.2823020931727126, | |
| "learning_rate": 1.6295661628624447e-07, | |
| "loss": 0.0139, | |
| "num_input_tokens_seen": 110231552, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 4.782978723404256, | |
| "grad_norm": 0.33810487516657384, | |
| "learning_rate": 1.4324245570256633e-07, | |
| "loss": 0.0167, | |
| "num_input_tokens_seen": 110493696, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 4.794326241134752, | |
| "grad_norm": 0.33893844406644097, | |
| "learning_rate": 1.2479547136600989e-07, | |
| "loss": 0.0135, | |
| "num_input_tokens_seen": 110755840, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 4.8056737588652485, | |
| "grad_norm": 0.3356203471174735, | |
| "learning_rate": 1.0761660368806548e-07, | |
| "loss": 0.0151, | |
| "num_input_tokens_seen": 111017984, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 4.817021276595745, | |
| "grad_norm": 0.3334523590081176, | |
| "learning_rate": 9.170672843271666e-08, | |
| "loss": 0.0143, | |
| "num_input_tokens_seen": 111280128, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 4.828368794326241, | |
| "grad_norm": 0.32060335064770346, | |
| "learning_rate": 7.706665667180091e-08, | |
| "loss": 0.013, | |
| "num_input_tokens_seen": 111542272, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 4.839716312056738, | |
| "grad_norm": 0.30209084334682806, | |
| "learning_rate": 6.369713474366212e-08, | |
| "loss": 0.0141, | |
| "num_input_tokens_seen": 111804416, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 4.851063829787234, | |
| "grad_norm": 0.35524870224084554, | |
| "learning_rate": 5.159884421509498e-08, | |
| "loss": 0.0144, | |
| "num_input_tokens_seen": 112066560, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 4.862411347517731, | |
| "grad_norm": 0.3137882300233803, | |
| "learning_rate": 4.07724018466088e-08, | |
| "loss": 0.0162, | |
| "num_input_tokens_seen": 112328704, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 4.873758865248227, | |
| "grad_norm": 0.31600464447069954, | |
| "learning_rate": 3.1218359560974966e-08, | |
| "loss": 0.0145, | |
| "num_input_tokens_seen": 112590848, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 4.885106382978723, | |
| "grad_norm": 0.36130384891727935, | |
| "learning_rate": 2.2937204415107717e-08, | |
| "loss": 0.0149, | |
| "num_input_tokens_seen": 112852992, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 4.89645390070922, | |
| "grad_norm": 0.28210880961362844, | |
| "learning_rate": 1.5929358575206275e-08, | |
| "loss": 0.014, | |
| "num_input_tokens_seen": 113115136, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 4.907801418439716, | |
| "grad_norm": 0.3370034082905995, | |
| "learning_rate": 1.0195179295269252e-08, | |
| "loss": 0.0145, | |
| "num_input_tokens_seen": 113377280, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 4.919148936170213, | |
| "grad_norm": 0.34659397478326986, | |
| "learning_rate": 5.7349588988481194e-09, | |
| "loss": 0.0131, | |
| "num_input_tokens_seen": 113639424, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 4.930496453900709, | |
| "grad_norm": 0.3388609291955605, | |
| "learning_rate": 2.5489247641674596e-09, | |
| "loss": 0.0133, | |
| "num_input_tokens_seen": 113901568, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 4.941843971631206, | |
| "grad_norm": 0.32782899822451794, | |
| "learning_rate": 6.372393125203546e-10, | |
| "loss": 0.0147, | |
| "num_input_tokens_seen": 114163712, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 4.953191489361702, | |
| "grad_norm": 0.340643949367228, | |
| "learning_rate": 0.0, | |
| "loss": 0.0166, | |
| "num_input_tokens_seen": 114425856, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 4.953191489361702, | |
| "num_input_tokens_seen": 114425856, | |
| "step": 440, | |
| "total_flos": 182736094494720.0, | |
| "train_loss": 0.6679792516632006, | |
| "train_runtime": 10942.5374, | |
| "train_samples_per_second": 10.298, | |
| "train_steps_per_second": 0.04 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 440, | |
| "num_input_tokens_seen": 114425856, | |
| "num_train_epochs": 5, | |
| "save_steps": 5000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 182736094494720.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |