| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 1073, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.004659832246039142, |
| "grad_norm": 2.0020346486972196, |
| "learning_rate": 4.6296296296296296e-06, |
| "loss": 0.8721, |
| "num_tokens": 2621440.0, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.009319664492078284, |
| "grad_norm": 1.253158998404554, |
| "learning_rate": 9.259259259259259e-06, |
| "loss": 0.8069, |
| "num_tokens": 5242880.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.013979496738117428, |
| "grad_norm": 1.0989413519013813, |
| "learning_rate": 1.388888888888889e-05, |
| "loss": 0.7748, |
| "num_tokens": 7864320.0, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.01863932898415657, |
| "grad_norm": 0.6255940599509165, |
| "learning_rate": 1.8518518518518518e-05, |
| "loss": 0.7143, |
| "num_tokens": 10465392.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.023299161230195712, |
| "grad_norm": 0.5953735542875533, |
| "learning_rate": 2.314814814814815e-05, |
| "loss": 0.7005, |
| "num_tokens": 13086832.0, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.027958993476234855, |
| "grad_norm": 0.45079290502204816, |
| "learning_rate": 2.777777777777778e-05, |
| "loss": 0.6884, |
| "num_tokens": 15708272.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.032618825722273995, |
| "grad_norm": 0.49062930054604054, |
| "learning_rate": 3.240740740740741e-05, |
| "loss": 0.652, |
| "num_tokens": 18329712.0, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.03727865796831314, |
| "grad_norm": 0.449672124324896, |
| "learning_rate": 3.7037037037037037e-05, |
| "loss": 0.6507, |
| "num_tokens": 20951152.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.04193849021435228, |
| "grad_norm": 0.44608433780722856, |
| "learning_rate": 4.166666666666667e-05, |
| "loss": 0.6394, |
| "num_tokens": 23572592.0, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.046598322460391424, |
| "grad_norm": 0.42309130403831824, |
| "learning_rate": 4.62962962962963e-05, |
| "loss": 0.6405, |
| "num_tokens": 26194032.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.05125815470643057, |
| "grad_norm": 0.49262634105345865, |
| "learning_rate": 4.999989306901785e-05, |
| "loss": 0.6221, |
| "num_tokens": 28815472.0, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.05591798695246971, |
| "grad_norm": 0.436012264727026, |
| "learning_rate": 4.999615059136074e-05, |
| "loss": 0.6334, |
| "num_tokens": 31436912.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.06057781919850885, |
| "grad_norm": 0.45481218499286563, |
| "learning_rate": 4.9987062580928735e-05, |
| "loss": 0.6148, |
| "num_tokens": 34058352.0, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.06523765144454799, |
| "grad_norm": 0.4359195847354566, |
| "learning_rate": 4.997263119721384e-05, |
| "loss": 0.6099, |
| "num_tokens": 36666389.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.06989748369058714, |
| "grad_norm": 0.4730817542130775, |
| "learning_rate": 4.995285986939984e-05, |
| "loss": 0.6135, |
| "num_tokens": 39287829.0, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.07455731593662628, |
| "grad_norm": 0.49470493408718064, |
| "learning_rate": 4.992775329554741e-05, |
| "loss": 0.6077, |
| "num_tokens": 41909269.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.07921714818266543, |
| "grad_norm": 0.4463089290230757, |
| "learning_rate": 4.989731744147782e-05, |
| "loss": 0.6215, |
| "num_tokens": 44530709.0, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.08387698042870456, |
| "grad_norm": 0.4985100734116643, |
| "learning_rate": 4.98615595393553e-05, |
| "loss": 0.6078, |
| "num_tokens": 47152149.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.08853681267474371, |
| "grad_norm": 0.4655941806945866, |
| "learning_rate": 4.9820488085968514e-05, |
| "loss": 0.6021, |
| "num_tokens": 49773589.0, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.09319664492078285, |
| "grad_norm": 0.5071630878858543, |
| "learning_rate": 4.9774112840711616e-05, |
| "loss": 0.5872, |
| "num_tokens": 52353719.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.097856477166822, |
| "grad_norm": 0.4202565227732614, |
| "learning_rate": 4.972244482326516e-05, |
| "loss": 0.6016, |
| "num_tokens": 54952917.0, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.10251630941286113, |
| "grad_norm": 0.4082361621515674, |
| "learning_rate": 4.9665496310977647e-05, |
| "loss": 0.5976, |
| "num_tokens": 57574357.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.10717614165890028, |
| "grad_norm": 0.45210016965034117, |
| "learning_rate": 4.960328083594817e-05, |
| "loss": 0.5947, |
| "num_tokens": 60191231.0, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.11183597390493942, |
| "grad_norm": 0.40117661777550084, |
| "learning_rate": 4.953581318181091e-05, |
| "loss": 0.5964, |
| "num_tokens": 62792124.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.11649580615097857, |
| "grad_norm": 0.41729490339689734, |
| "learning_rate": 4.946310938022225e-05, |
| "loss": 0.5811, |
| "num_tokens": 65405812.0, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.1211556383970177, |
| "grad_norm": 0.41449186514184744, |
| "learning_rate": 4.9385186707051365e-05, |
| "loss": 0.6091, |
| "num_tokens": 68027252.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.12581547064305684, |
| "grad_norm": 0.3886748654306215, |
| "learning_rate": 4.93020636782751e-05, |
| "loss": 0.5882, |
| "num_tokens": 70648692.0, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.13047530288909598, |
| "grad_norm": 0.43026058357535235, |
| "learning_rate": 4.921376004557819e-05, |
| "loss": 0.5731, |
| "num_tokens": 73260442.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.13513513513513514, |
| "grad_norm": 0.36793157290965955, |
| "learning_rate": 4.9120296791659924e-05, |
| "loss": 0.5794, |
| "num_tokens": 75881882.0, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.13979496738117428, |
| "grad_norm": 0.4379752352372389, |
| "learning_rate": 4.902169612524819e-05, |
| "loss": 0.5851, |
| "num_tokens": 78503322.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.14445479962721341, |
| "grad_norm": 0.41772008334919314, |
| "learning_rate": 4.891798147582226e-05, |
| "loss": 0.5623, |
| "num_tokens": 81124762.0, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.14911463187325255, |
| "grad_norm": 0.4579730206401132, |
| "learning_rate": 4.880917748804547e-05, |
| "loss": 0.5784, |
| "num_tokens": 83727556.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.15377446411929171, |
| "grad_norm": 0.4417774748778227, |
| "learning_rate": 4.8695310015909164e-05, |
| "loss": 0.5785, |
| "num_tokens": 86348996.0, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.15843429636533085, |
| "grad_norm": 0.4031021659730179, |
| "learning_rate": 4.8576406116589275e-05, |
| "loss": 0.5831, |
| "num_tokens": 88959058.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.16309412861137, |
| "grad_norm": 0.3679441023930586, |
| "learning_rate": 4.845249404401699e-05, |
| "loss": 0.567, |
| "num_tokens": 91580498.0, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.16775396085740912, |
| "grad_norm": 0.36804291519324855, |
| "learning_rate": 4.8323603242165074e-05, |
| "loss": 0.5801, |
| "num_tokens": 94201938.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.1724137931034483, |
| "grad_norm": 0.3596667241510591, |
| "learning_rate": 4.818976433805137e-05, |
| "loss": 0.5683, |
| "num_tokens": 96817927.0, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.17707362534948742, |
| "grad_norm": 0.45073480216981227, |
| "learning_rate": 4.8051009134461225e-05, |
| "loss": 0.5766, |
| "num_tokens": 99432200.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.18173345759552656, |
| "grad_norm": 0.5148894373872221, |
| "learning_rate": 4.7907370602390526e-05, |
| "loss": 0.5679, |
| "num_tokens": 102049300.0, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.1863932898415657, |
| "grad_norm": 0.4449761153781537, |
| "learning_rate": 4.775888287321112e-05, |
| "loss": 0.5765, |
| "num_tokens": 104670740.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.19105312208760486, |
| "grad_norm": 0.4680422060256896, |
| "learning_rate": 4.760558123056053e-05, |
| "loss": 0.5686, |
| "num_tokens": 107287613.0, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.195712954333644, |
| "grad_norm": 0.38081865109846375, |
| "learning_rate": 4.744750210195784e-05, |
| "loss": 0.5829, |
| "num_tokens": 109909053.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.20037278657968313, |
| "grad_norm": 0.4421980564810565, |
| "learning_rate": 4.728468305014781e-05, |
| "loss": 0.5787, |
| "num_tokens": 112530493.0, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.20503261882572227, |
| "grad_norm": 0.3907838961510381, |
| "learning_rate": 4.7117162764175255e-05, |
| "loss": 0.5777, |
| "num_tokens": 115151933.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.2096924510717614, |
| "grad_norm": 0.3766316500324192, |
| "learning_rate": 4.6944981050191676e-05, |
| "loss": 0.5743, |
| "num_tokens": 117765441.0, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.21435228331780057, |
| "grad_norm": 0.41073517011439986, |
| "learning_rate": 4.6768178821996615e-05, |
| "loss": 0.5741, |
| "num_tokens": 120376968.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.2190121155638397, |
| "grad_norm": 0.4871170386460364, |
| "learning_rate": 4.6586798091315654e-05, |
| "loss": 0.5618, |
| "num_tokens": 122996669.0, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.22367194780987884, |
| "grad_norm": 0.41194183107478055, |
| "learning_rate": 4.6400881957817626e-05, |
| "loss": 0.5646, |
| "num_tokens": 125607410.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.22833178005591798, |
| "grad_norm": 0.38673212830732145, |
| "learning_rate": 4.621047459887328e-05, |
| "loss": 0.568, |
| "num_tokens": 128215486.0, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.23299161230195714, |
| "grad_norm": 0.33622036212857603, |
| "learning_rate": 4.601562125905785e-05, |
| "loss": 0.563, |
| "num_tokens": 130831913.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.23765144454799628, |
| "grad_norm": 0.3575373025028897, |
| "learning_rate": 4.581636823940004e-05, |
| "loss": 0.5568, |
| "num_tokens": 133447076.0, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.2423112767940354, |
| "grad_norm": 0.34420106632880754, |
| "learning_rate": 4.561276288637997e-05, |
| "loss": 0.576, |
| "num_tokens": 136036466.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.24697110904007455, |
| "grad_norm": 0.34530949205825096, |
| "learning_rate": 4.5404853580678756e-05, |
| "loss": 0.5665, |
| "num_tokens": 138657906.0, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.2516309412861137, |
| "grad_norm": 0.3208643127615705, |
| "learning_rate": 4.5192689725682245e-05, |
| "loss": 0.5635, |
| "num_tokens": 141273912.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.25629077353215285, |
| "grad_norm": 0.3615064308276192, |
| "learning_rate": 4.497632173574181e-05, |
| "loss": 0.5579, |
| "num_tokens": 143880811.0, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.26095060577819196, |
| "grad_norm": 0.3575665967206342, |
| "learning_rate": 4.475580102419491e-05, |
| "loss": 0.5655, |
| "num_tokens": 146502251.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.2656104380242311, |
| "grad_norm": 0.4400933029907661, |
| "learning_rate": 4.453117999114822e-05, |
| "loss": 0.5579, |
| "num_tokens": 149113471.0, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.2702702702702703, |
| "grad_norm": 0.35086248287423627, |
| "learning_rate": 4.4302512011026374e-05, |
| "loss": 0.5614, |
| "num_tokens": 151734911.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.2749301025163094, |
| "grad_norm": 0.3528868192665579, |
| "learning_rate": 4.406985141988911e-05, |
| "loss": 0.5688, |
| "num_tokens": 154356351.0, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.27958993476234856, |
| "grad_norm": 0.3256469784094283, |
| "learning_rate": 4.383325350251993e-05, |
| "loss": 0.5549, |
| "num_tokens": 156977308.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.2842497670083877, |
| "grad_norm": 0.33879498170022027, |
| "learning_rate": 4.359277447928938e-05, |
| "loss": 0.5535, |
| "num_tokens": 159598748.0, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.28890959925442683, |
| "grad_norm": 0.3419150060665249, |
| "learning_rate": 4.3348471492795916e-05, |
| "loss": 0.5477, |
| "num_tokens": 162220188.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.293569431500466, |
| "grad_norm": 0.30870547736615334, |
| "learning_rate": 4.310040259428774e-05, |
| "loss": 0.5602, |
| "num_tokens": 164828478.0, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.2982292637465051, |
| "grad_norm": 0.35321251487422134, |
| "learning_rate": 4.2848626729868625e-05, |
| "loss": 0.5579, |
| "num_tokens": 167423794.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.30288909599254427, |
| "grad_norm": 0.34298327601950934, |
| "learning_rate": 4.259320372649117e-05, |
| "loss": 0.5484, |
| "num_tokens": 170045234.0, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.30754892823858343, |
| "grad_norm": 0.3164239205249639, |
| "learning_rate": 4.233419427774074e-05, |
| "loss": 0.5438, |
| "num_tokens": 172657586.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.31220876048462254, |
| "grad_norm": 0.31090603672917905, |
| "learning_rate": 4.207165992941342e-05, |
| "loss": 0.5487, |
| "num_tokens": 175279026.0, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.3168685927306617, |
| "grad_norm": 0.3175339185979445, |
| "learning_rate": 4.1805663064891564e-05, |
| "loss": 0.549, |
| "num_tokens": 177900466.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.32152842497670087, |
| "grad_norm": 0.383267722315599, |
| "learning_rate": 4.153626689032021e-05, |
| "loss": 0.5584, |
| "num_tokens": 180509493.0, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.32618825722274, |
| "grad_norm": 0.3379752484923232, |
| "learning_rate": 4.1263535419588095e-05, |
| "loss": 0.5486, |
| "num_tokens": 183130933.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.33084808946877914, |
| "grad_norm": 0.3207936223866377, |
| "learning_rate": 4.0987533459116615e-05, |
| "loss": 0.5504, |
| "num_tokens": 185752373.0, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.33550792171481825, |
| "grad_norm": 0.354222277147523, |
| "learning_rate": 4.0708326592460606e-05, |
| "loss": 0.5472, |
| "num_tokens": 188373813.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.3401677539608574, |
| "grad_norm": 0.3326492486974685, |
| "learning_rate": 4.0425981164724344e-05, |
| "loss": 0.5591, |
| "num_tokens": 190995062.0, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.3448275862068966, |
| "grad_norm": 0.3581181953154264, |
| "learning_rate": 4.014056426679663e-05, |
| "loss": 0.5515, |
| "num_tokens": 193616502.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.3494874184529357, |
| "grad_norm": 0.3601496352956064, |
| "learning_rate": 3.9852143719408665e-05, |
| "loss": 0.5525, |
| "num_tokens": 196237942.0, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.35414725069897485, |
| "grad_norm": 0.34698465506060644, |
| "learning_rate": 3.956078805701849e-05, |
| "loss": 0.5363, |
| "num_tokens": 198859382.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.35880708294501396, |
| "grad_norm": 0.32273960033714727, |
| "learning_rate": 3.926656651152588e-05, |
| "loss": 0.5413, |
| "num_tokens": 201480822.0, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.3634669151910531, |
| "grad_norm": 0.3274895692294926, |
| "learning_rate": 3.896954899582137e-05, |
| "loss": 0.5457, |
| "num_tokens": 204102262.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.3681267474370923, |
| "grad_norm": 0.3081307619689516, |
| "learning_rate": 3.8669806087173693e-05, |
| "loss": 0.5498, |
| "num_tokens": 206723702.0, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.3727865796831314, |
| "grad_norm": 0.3080730806469092, |
| "learning_rate": 3.8367409010459074e-05, |
| "loss": 0.5492, |
| "num_tokens": 209345142.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.37744641192917056, |
| "grad_norm": 0.3225594494940693, |
| "learning_rate": 3.806242962123692e-05, |
| "loss": 0.5505, |
| "num_tokens": 211966582.0, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.3821062441752097, |
| "grad_norm": 0.327562243989163, |
| "learning_rate": 3.77549403886754e-05, |
| "loss": 0.542, |
| "num_tokens": 214586037.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.38676607642124883, |
| "grad_norm": 0.33567644427961824, |
| "learning_rate": 3.74450143783314e-05, |
| "loss": 0.5339, |
| "num_tokens": 217207477.0, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.391425908667288, |
| "grad_norm": 0.3195877544134351, |
| "learning_rate": 3.713272523478867e-05, |
| "loss": 0.5559, |
| "num_tokens": 219820636.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.3960857409133271, |
| "grad_norm": 0.3894354150867297, |
| "learning_rate": 3.6818147164158374e-05, |
| "loss": 0.5359, |
| "num_tokens": 222442076.0, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.40074557315936626, |
| "grad_norm": 0.2931641258940042, |
| "learning_rate": 3.650135491644632e-05, |
| "loss": 0.5343, |
| "num_tokens": 225063516.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.40540540540540543, |
| "grad_norm": 0.3416218197533556, |
| "learning_rate": 3.618242376779078e-05, |
| "loss": 0.541, |
| "num_tokens": 227684956.0, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.41006523765144454, |
| "grad_norm": 0.3597428641005261, |
| "learning_rate": 3.5861429502575474e-05, |
| "loss": 0.5395, |
| "num_tokens": 230306396.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.4147250698974837, |
| "grad_norm": 0.3279123054545153, |
| "learning_rate": 3.55384483954216e-05, |
| "loss": 0.5421, |
| "num_tokens": 232927836.0, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.4193849021435228, |
| "grad_norm": 0.2913125395664931, |
| "learning_rate": 3.521355719306354e-05, |
| "loss": 0.5356, |
| "num_tokens": 235549276.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.424044734389562, |
| "grad_norm": 0.30642195214154083, |
| "learning_rate": 3.488683309611229e-05, |
| "loss": 0.5368, |
| "num_tokens": 238170716.0, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.42870456663560114, |
| "grad_norm": 0.2971057874789325, |
| "learning_rate": 3.455835374071104e-05, |
| "loss": 0.5481, |
| "num_tokens": 240792156.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.43336439888164024, |
| "grad_norm": 0.289893538706147, |
| "learning_rate": 3.422819718008729e-05, |
| "loss": 0.5409, |
| "num_tokens": 243413596.0, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.4380242311276794, |
| "grad_norm": 0.3208026547455586, |
| "learning_rate": 3.389644186600587e-05, |
| "loss": 0.5334, |
| "num_tokens": 246029886.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.4426840633737186, |
| "grad_norm": 0.31098073797577613, |
| "learning_rate": 3.356316663012723e-05, |
| "loss": 0.5459, |
| "num_tokens": 248637925.0, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.4473438956197577, |
| "grad_norm": 0.32556807411218974, |
| "learning_rate": 3.3228450665275444e-05, |
| "loss": 0.5354, |
| "num_tokens": 251259365.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.45200372786579684, |
| "grad_norm": 0.32862455947964564, |
| "learning_rate": 3.289237350662047e-05, |
| "loss": 0.5393, |
| "num_tokens": 253880805.0, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.45666356011183595, |
| "grad_norm": 0.30836926820730953, |
| "learning_rate": 3.255501501277896e-05, |
| "loss": 0.5546, |
| "num_tokens": 256502245.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.4613233923578751, |
| "grad_norm": 0.33290837078293123, |
| "learning_rate": 3.221645534683832e-05, |
| "loss": 0.5396, |
| "num_tokens": 259123685.0, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.4659832246039143, |
| "grad_norm": 0.32742182695539307, |
| "learning_rate": 3.187677495730829e-05, |
| "loss": 0.5354, |
| "num_tokens": 261736444.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.4706430568499534, |
| "grad_norm": 0.3538484759611728, |
| "learning_rate": 3.153605455900487e-05, |
| "loss": 0.5437, |
| "num_tokens": 264341546.0, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.47530288909599255, |
| "grad_norm": 0.3118404288646792, |
| "learning_rate": 3.11943751138708e-05, |
| "loss": 0.5459, |
| "num_tokens": 266962986.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.47996272134203166, |
| "grad_norm": 0.3293869499725537, |
| "learning_rate": 3.0851817811737425e-05, |
| "loss": 0.5389, |
| "num_tokens": 269584426.0, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.4846225535880708, |
| "grad_norm": 0.34414868375288504, |
| "learning_rate": 3.0508464051032376e-05, |
| "loss": 0.5352, |
| "num_tokens": 272205866.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.48928238583411, |
| "grad_norm": 0.3079023720606716, |
| "learning_rate": 3.016439541943768e-05, |
| "loss": 0.5417, |
| "num_tokens": 274825267.0, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.4939422180801491, |
| "grad_norm": 0.3000794780271502, |
| "learning_rate": 2.9819693674502906e-05, |
| "loss": 0.5368, |
| "num_tokens": 277446221.0, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.49860205032618826, |
| "grad_norm": 0.2773419734072065, |
| "learning_rate": 2.9474440724217926e-05, |
| "loss": 0.5273, |
| "num_tokens": 280061887.0, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.5032618825722274, |
| "grad_norm": 0.29295257972225236, |
| "learning_rate": 2.9128718607549994e-05, |
| "loss": 0.5378, |
| "num_tokens": 282683327.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.5079217148182665, |
| "grad_norm": 0.31096154211972027, |
| "learning_rate": 2.878260947494961e-05, |
| "loss": 0.5464, |
| "num_tokens": 285291439.0, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.5125815470643057, |
| "grad_norm": 0.3054879958391873, |
| "learning_rate": 2.8436195568829943e-05, |
| "loss": 0.5333, |
| "num_tokens": 287912879.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.5172413793103449, |
| "grad_norm": 0.310332716175625, |
| "learning_rate": 2.8089559204024457e-05, |
| "loss": 0.5462, |
| "num_tokens": 290525783.0, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.5219012115563839, |
| "grad_norm": 0.29652511581484403, |
| "learning_rate": 2.7742782748227176e-05, |
| "loss": 0.5268, |
| "num_tokens": 293122806.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.5265610438024231, |
| "grad_norm": 0.28939793439128153, |
| "learning_rate": 2.739594860242054e-05, |
| "loss": 0.5442, |
| "num_tokens": 295734627.0, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.5312208760484622, |
| "grad_norm": 0.29312372251486524, |
| "learning_rate": 2.7049139181295312e-05, |
| "loss": 0.5362, |
| "num_tokens": 298356067.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.5358807082945014, |
| "grad_norm": 0.29358566256036717, |
| "learning_rate": 2.6702436893667138e-05, |
| "loss": 0.5334, |
| "num_tokens": 300977507.0, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.5405405405405406, |
| "grad_norm": 0.3033549784648095, |
| "learning_rate": 2.6355924122894643e-05, |
| "loss": 0.5352, |
| "num_tokens": 303598947.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.5452003727865797, |
| "grad_norm": 0.2951407014316869, |
| "learning_rate": 2.6009683207303477e-05, |
| "loss": 0.5375, |
| "num_tokens": 306213321.0, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.5498602050326188, |
| "grad_norm": 0.2882327876076673, |
| "learning_rate": 2.5663796420621073e-05, |
| "loss": 0.5421, |
| "num_tokens": 308834761.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.554520037278658, |
| "grad_norm": 0.26633057282800016, |
| "learning_rate": 2.5318345952426808e-05, |
| "loss": 0.5236, |
| "num_tokens": 311456201.0, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.5591798695246971, |
| "grad_norm": 0.27919494565083547, |
| "learning_rate": 2.4973413888622105e-05, |
| "loss": 0.5321, |
| "num_tokens": 314077641.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.5638397017707363, |
| "grad_norm": 0.2765667208716287, |
| "learning_rate": 2.4629082191925196e-05, |
| "loss": 0.5277, |
| "num_tokens": 316699081.0, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.5684995340167754, |
| "grad_norm": 0.3050264720388227, |
| "learning_rate": 2.4285432682395144e-05, |
| "loss": 0.5247, |
| "num_tokens": 319309553.0, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.5731593662628145, |
| "grad_norm": 0.3053981872479748, |
| "learning_rate": 2.3942547017989775e-05, |
| "loss": 0.521, |
| "num_tokens": 321923946.0, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.5778191985088537, |
| "grad_norm": 0.2863029139711317, |
| "learning_rate": 2.3600506675162127e-05, |
| "loss": 0.5256, |
| "num_tokens": 324545386.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.5824790307548928, |
| "grad_norm": 0.2893658578022934, |
| "learning_rate": 2.3259392929500012e-05, |
| "loss": 0.5274, |
| "num_tokens": 327166826.0, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.587138863000932, |
| "grad_norm": 0.28200232187730584, |
| "learning_rate": 2.291928683641334e-05, |
| "loss": 0.5455, |
| "num_tokens": 329788266.0, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.5917986952469712, |
| "grad_norm": 0.29555844325064967, |
| "learning_rate": 2.2580269211873718e-05, |
| "loss": 0.5473, |
| "num_tokens": 332409706.0, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.5964585274930102, |
| "grad_norm": 0.29324870477181914, |
| "learning_rate": 2.2242420613210983e-05, |
| "loss": 0.525, |
| "num_tokens": 335031146.0, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.6011183597390494, |
| "grad_norm": 0.3107371690433492, |
| "learning_rate": 2.1905821319971172e-05, |
| "loss": 0.515, |
| "num_tokens": 337649451.0, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.6057781919850885, |
| "grad_norm": 0.2879448837149661, |
| "learning_rate": 2.1570551314840516e-05, |
| "loss": 0.5195, |
| "num_tokens": 340270891.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.6104380242311277, |
| "grad_norm": 0.31763792445460876, |
| "learning_rate": 2.1236690264639942e-05, |
| "loss": 0.5237, |
| "num_tokens": 342889310.0, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.6150978564771669, |
| "grad_norm": 0.2647629638798241, |
| "learning_rate": 2.0904317501394677e-05, |
| "loss": 0.5084, |
| "num_tokens": 345504904.0, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.6197576887232059, |
| "grad_norm": 0.29481131039641423, |
| "learning_rate": 2.0573512003483364e-05, |
| "loss": 0.5253, |
| "num_tokens": 348126344.0, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.6244175209692451, |
| "grad_norm": 0.34164718488231277, |
| "learning_rate": 2.0244352376871227e-05, |
| "loss": 0.5184, |
| "num_tokens": 350734169.0, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.6290773532152842, |
| "grad_norm": 0.2821570510287332, |
| "learning_rate": 1.9916916836431753e-05, |
| "loss": 0.5383, |
| "num_tokens": 353342019.0, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.6337371854613234, |
| "grad_norm": 0.28996200795726057, |
| "learning_rate": 1.9591283187361252e-05, |
| "loss": 0.5207, |
| "num_tokens": 355956520.0, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.6383970177073626, |
| "grad_norm": 0.26569239968020403, |
| "learning_rate": 1.9267528806690816e-05, |
| "loss": 0.5238, |
| "num_tokens": 358570028.0, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.6430568499534017, |
| "grad_norm": 0.2829964600484454, |
| "learning_rate": 1.8945730624899976e-05, |
| "loss": 0.5348, |
| "num_tokens": 361191468.0, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.6477166821994408, |
| "grad_norm": 0.2556942982280325, |
| "learning_rate": 1.8625965107636544e-05, |
| "loss": 0.5162, |
| "num_tokens": 363812908.0, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.65237651444548, |
| "grad_norm": 0.2648594970355323, |
| "learning_rate": 1.8308308237546762e-05, |
| "loss": 0.5294, |
| "num_tokens": 366434348.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.6570363466915191, |
| "grad_norm": 0.2683905079567185, |
| "learning_rate": 1.7992835496220416e-05, |
| "loss": 0.5285, |
| "num_tokens": 369055788.0, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.6616961789375583, |
| "grad_norm": 0.2596484218260175, |
| "learning_rate": 1.7679621846254836e-05, |
| "loss": 0.5339, |
| "num_tokens": 371677228.0, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.6663560111835974, |
| "grad_norm": 0.26447702062398853, |
| "learning_rate": 1.736874171344232e-05, |
| "loss": 0.5185, |
| "num_tokens": 374298668.0, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.6710158434296365, |
| "grad_norm": 0.25723321996437576, |
| "learning_rate": 1.706026896908507e-05, |
| "loss": 0.5334, |
| "num_tokens": 376908695.0, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.6756756756756757, |
| "grad_norm": 0.26344020729428064, |
| "learning_rate": 1.6754276912441947e-05, |
| "loss": 0.5388, |
| "num_tokens": 379526026.0, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.6803355079217148, |
| "grad_norm": 0.28222081145619576, |
| "learning_rate": 1.645083825331104e-05, |
| "loss": 0.5334, |
| "num_tokens": 382146337.0, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.684995340167754, |
| "grad_norm": 0.263215360381916, |
| "learning_rate": 1.6150025094752447e-05, |
| "loss": 0.5228, |
| "num_tokens": 384767777.0, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.6896551724137931, |
| "grad_norm": 0.2797337087632092, |
| "learning_rate": 1.585190891595507e-05, |
| "loss": 0.5283, |
| "num_tokens": 387384301.0, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.6943150046598322, |
| "grad_norm": 0.2484800890330122, |
| "learning_rate": 1.555656055525181e-05, |
| "loss": 0.5164, |
| "num_tokens": 390005741.0, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.6989748369058714, |
| "grad_norm": 0.26971982009234485, |
| "learning_rate": 1.5264050193286954e-05, |
| "loss": 0.5171, |
| "num_tokens": 392611427.0, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.7036346691519105, |
| "grad_norm": 0.2748099600779281, |
| "learning_rate": 1.497444733633982e-05, |
| "loss": 0.5331, |
| "num_tokens": 395213883.0, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.7082945013979497, |
| "grad_norm": 0.27565837023589923, |
| "learning_rate": 1.4687820799808719e-05, |
| "loss": 0.5195, |
| "num_tokens": 397835323.0, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.7129543336439889, |
| "grad_norm": 0.26645545801620185, |
| "learning_rate": 1.4404238691859062e-05, |
| "loss": 0.5155, |
| "num_tokens": 400443842.0, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.7176141658900279, |
| "grad_norm": 0.24409048673286582, |
| "learning_rate": 1.4123768397239456e-05, |
| "loss": 0.5221, |
| "num_tokens": 403065282.0, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.7222739981360671, |
| "grad_norm": 0.25300035754374856, |
| "learning_rate": 1.3846476561269767e-05, |
| "loss": 0.5187, |
| "num_tokens": 405686722.0, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.7269338303821062, |
| "grad_norm": 0.2412044572257938, |
| "learning_rate": 1.357242907400492e-05, |
| "loss": 0.5237, |
| "num_tokens": 408298322.0, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.7315936626281454, |
| "grad_norm": 0.24702257426941307, |
| "learning_rate": 1.330169105457802e-05, |
| "loss": 0.5356, |
| "num_tokens": 410917246.0, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.7362534948741846, |
| "grad_norm": 0.24167827665963923, |
| "learning_rate": 1.3034326835726802e-05, |
| "loss": 0.5238, |
| "num_tokens": 413538686.0, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.7409133271202236, |
| "grad_norm": 0.24897412715760983, |
| "learning_rate": 1.2770399948506906e-05, |
| "loss": 0.5134, |
| "num_tokens": 416160126.0, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.7455731593662628, |
| "grad_norm": 0.26229665036809996, |
| "learning_rate": 1.2509973107195624e-05, |
| "loss": 0.5301, |
| "num_tokens": 418781566.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.750232991612302, |
| "grad_norm": 0.2562216827588944, |
| "learning_rate": 1.2253108194389685e-05, |
| "loss": 0.5172, |
| "num_tokens": 421401116.0, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.7548928238583411, |
| "grad_norm": 0.2516582157966662, |
| "learning_rate": 1.1999866246300815e-05, |
| "loss": 0.5099, |
| "num_tokens": 424003342.0, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.7595526561043803, |
| "grad_norm": 0.24248755796816507, |
| "learning_rate": 1.175030743825226e-05, |
| "loss": 0.5258, |
| "num_tokens": 426624782.0, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.7642124883504194, |
| "grad_norm": 0.2621063803828783, |
| "learning_rate": 1.1504491070379925e-05, |
| "loss": 0.5315, |
| "num_tokens": 429246222.0, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.7688723205964585, |
| "grad_norm": 0.24174822969342202, |
| "learning_rate": 1.1262475553541516e-05, |
| "loss": 0.5221, |
| "num_tokens": 431867662.0, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.7735321528424977, |
| "grad_norm": 0.24763585139037442, |
| "learning_rate": 1.1024318395436947e-05, |
| "loss": 0.5111, |
| "num_tokens": 434474764.0, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.7781919850885368, |
| "grad_norm": 0.24044824353281324, |
| "learning_rate": 1.0790076186943354e-05, |
| "loss": 0.5127, |
| "num_tokens": 437096204.0, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.782851817334576, |
| "grad_norm": 0.2372983684841008, |
| "learning_rate": 1.0559804588667984e-05, |
| "loss": 0.5147, |
| "num_tokens": 439717644.0, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.7875116495806151, |
| "grad_norm": 0.24144775560753629, |
| "learning_rate": 1.0333558317722149e-05, |
| "loss": 0.5182, |
| "num_tokens": 442322749.0, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.7921714818266542, |
| "grad_norm": 0.23521231933346765, |
| "learning_rate": 1.0111391134719311e-05, |
| "loss": 0.5261, |
| "num_tokens": 444944189.0, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.7968313140726934, |
| "grad_norm": 0.25839814116091386, |
| "learning_rate": 9.89335583100052e-06, |
| "loss": 0.5329, |
| "num_tokens": 447549445.0, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.8014911463187325, |
| "grad_norm": 0.24802007531704112, |
| "learning_rate": 9.679504216090155e-06, |
| "loss": 0.5195, |
| "num_tokens": 450170885.0, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.8061509785647717, |
| "grad_norm": 0.23878085144461889, |
| "learning_rate": 9.469887105384895e-06, |
| "loss": 0.5232, |
| "num_tokens": 452792325.0, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.8108108108108109, |
| "grad_norm": 0.2402322264197709, |
| "learning_rate": 9.264554308079026e-06, |
| "loss": 0.5176, |
| "num_tokens": 455413765.0, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.8154706430568499, |
| "grad_norm": 0.24820173245610386, |
| "learning_rate": 9.06355461532878e-06, |
| "loss": 0.5135, |
| "num_tokens": 458035205.0, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.8201304753028891, |
| "grad_norm": 0.2434059936450184, |
| "learning_rate": 8.866935788658539e-06, |
| "loss": 0.5255, |
| "num_tokens": 460656645.0, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.8247903075489282, |
| "grad_norm": 0.23024924252511744, |
| "learning_rate": 8.674744548611804e-06, |
| "loss": 0.5217, |
| "num_tokens": 463278085.0, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.8294501397949674, |
| "grad_norm": 0.24039525346980659, |
| "learning_rate": 8.487026563649433e-06, |
| "loss": 0.5152, |
| "num_tokens": 465899525.0, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.8341099720410066, |
| "grad_norm": 0.23872455856549998, |
| "learning_rate": 8.303826439297903e-06, |
| "loss": 0.5065, |
| "num_tokens": 468520965.0, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.8387698042870456, |
| "grad_norm": 0.23556903067718313, |
| "learning_rate": 8.125187707550183e-06, |
| "loss": 0.5049, |
| "num_tokens": 471142405.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.8434296365330848, |
| "grad_norm": 0.25336561442972605, |
| "learning_rate": 7.95115281652163e-06, |
| "loss": 0.5207, |
| "num_tokens": 473763845.0, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.848089468779124, |
| "grad_norm": 0.23499180503350842, |
| "learning_rate": 7.781763120363474e-06, |
| "loss": 0.5139, |
| "num_tokens": 476385285.0, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.8527493010251631, |
| "grad_norm": 0.2529364537392306, |
| "learning_rate": 7.6170588694362915e-06, |
| "loss": 0.517, |
| "num_tokens": 478994969.0, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.8574091332712023, |
| "grad_norm": 0.2384519421315903, |
| "learning_rate": 7.4570792007456745e-06, |
| "loss": 0.5125, |
| "num_tokens": 481616409.0, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.8620689655172413, |
| "grad_norm": 0.23538313273325193, |
| "learning_rate": 7.3018621286425035e-06, |
| "loss": 0.5144, |
| "num_tokens": 484237849.0, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.8667287977632805, |
| "grad_norm": 0.2550125804528335, |
| "learning_rate": 7.151444535790017e-06, |
| "loss": 0.5132, |
| "num_tokens": 486854794.0, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.8713886300093197, |
| "grad_norm": 0.24543512806948412, |
| "learning_rate": 7.005862164399716e-06, |
| "loss": 0.5141, |
| "num_tokens": 489476234.0, |
| "step": 935 |
| }, |
| { |
| "epoch": 0.8760484622553588, |
| "grad_norm": 0.23402783407943364, |
| "learning_rate": 6.865149607738324e-06, |
| "loss": 0.5186, |
| "num_tokens": 492097674.0, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.880708294501398, |
| "grad_norm": 0.24038673988791295, |
| "learning_rate": 6.7293403019077394e-06, |
| "loss": 0.5199, |
| "num_tokens": 494719114.0, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.8853681267474371, |
| "grad_norm": 0.24876547075652353, |
| "learning_rate": 6.598466517899961e-06, |
| "loss": 0.5169, |
| "num_tokens": 497312162.0, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.8900279589934762, |
| "grad_norm": 0.23348989886508098, |
| "learning_rate": 6.472559353928814e-06, |
| "loss": 0.5016, |
| "num_tokens": 499933602.0, |
| "step": 955 |
| }, |
| { |
| "epoch": 0.8946877912395154, |
| "grad_norm": 0.24054715291127265, |
| "learning_rate": 6.35164872804046e-06, |
| "loss": 0.5119, |
| "num_tokens": 502555042.0, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.8993476234855545, |
| "grad_norm": 0.2560254493666138, |
| "learning_rate": 6.235763371004234e-06, |
| "loss": 0.5054, |
| "num_tokens": 505176482.0, |
| "step": 965 |
| }, |
| { |
| "epoch": 0.9040074557315937, |
| "grad_norm": 0.24407123816751405, |
| "learning_rate": 6.124930819485644e-06, |
| "loss": 0.5201, |
| "num_tokens": 507789906.0, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.9086672879776329, |
| "grad_norm": 0.2358195713693558, |
| "learning_rate": 6.0191774095031244e-06, |
| "loss": 0.5186, |
| "num_tokens": 510411346.0, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.9133271202236719, |
| "grad_norm": 0.23407573111022562, |
| "learning_rate": 5.9185282701700745e-06, |
| "loss": 0.4987, |
| "num_tokens": 513031521.0, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.9179869524697111, |
| "grad_norm": 0.24014654644427658, |
| "learning_rate": 5.823007317723664e-06, |
| "loss": 0.5211, |
| "num_tokens": 515652961.0, |
| "step": 985 |
| }, |
| { |
| "epoch": 0.9226467847157502, |
| "grad_norm": 0.22727371457693266, |
| "learning_rate": 5.732637249841873e-06, |
| "loss": 0.5156, |
| "num_tokens": 518274401.0, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.9273066169617894, |
| "grad_norm": 0.2337261404165014, |
| "learning_rate": 5.647439540250082e-06, |
| "loss": 0.5163, |
| "num_tokens": 520895841.0, |
| "step": 995 |
| }, |
| { |
| "epoch": 0.9319664492078286, |
| "grad_norm": 0.23362828808889882, |
| "learning_rate": 5.567434433618465e-06, |
| "loss": 0.5217, |
| "num_tokens": 523517281.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.9366262814538676, |
| "grad_norm": 0.23796854375911086, |
| "learning_rate": 5.492640940751462e-06, |
| "loss": 0.5091, |
| "num_tokens": 526138721.0, |
| "step": 1005 |
| }, |
| { |
| "epoch": 0.9412861136999068, |
| "grad_norm": 0.23622782337632642, |
| "learning_rate": 5.423076834070447e-06, |
| "loss": 0.5281, |
| "num_tokens": 528760161.0, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.9459459459459459, |
| "grad_norm": 0.24322297652322292, |
| "learning_rate": 5.358758643390628e-06, |
| "loss": 0.5113, |
| "num_tokens": 531376706.0, |
| "step": 1015 |
| }, |
| { |
| "epoch": 0.9506057781919851, |
| "grad_norm": 0.22483847680941124, |
| "learning_rate": 5.299701651993246e-06, |
| "loss": 0.512, |
| "num_tokens": 533998146.0, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.9552656104380243, |
| "grad_norm": 0.2368346722624514, |
| "learning_rate": 5.245919892993957e-06, |
| "loss": 0.5124, |
| "num_tokens": 536619586.0, |
| "step": 1025 |
| }, |
| { |
| "epoch": 0.9599254426840633, |
| "grad_norm": 0.22210260171548793, |
| "learning_rate": 5.197426146008291e-06, |
| "loss": 0.5111, |
| "num_tokens": 539227681.0, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.9645852749301025, |
| "grad_norm": 0.22629882586985997, |
| "learning_rate": 5.1542319341149565e-06, |
| "loss": 0.5149, |
| "num_tokens": 541849121.0, |
| "step": 1035 |
| }, |
| { |
| "epoch": 0.9692451071761417, |
| "grad_norm": 0.22896206453144197, |
| "learning_rate": 5.1163475211177235e-06, |
| "loss": 0.5062, |
| "num_tokens": 544470561.0, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.9739049394221808, |
| "grad_norm": 0.23605828228072606, |
| "learning_rate": 5.083781909106557e-06, |
| "loss": 0.5048, |
| "num_tokens": 547092001.0, |
| "step": 1045 |
| }, |
| { |
| "epoch": 0.97856477166822, |
| "grad_norm": 0.24271666524954047, |
| "learning_rate": 5.056542836318518e-06, |
| "loss": 0.5122, |
| "num_tokens": 549691697.0, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.983224603914259, |
| "grad_norm": 0.2485044947333615, |
| "learning_rate": 5.034636775299023e-06, |
| "loss": 0.5055, |
| "num_tokens": 552313137.0, |
| "step": 1055 |
| }, |
| { |
| "epoch": 0.9878844361602982, |
| "grad_norm": 0.2439169394590551, |
| "learning_rate": 5.018068931363828e-06, |
| "loss": 0.5069, |
| "num_tokens": 554934577.0, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.9925442684063374, |
| "grad_norm": 0.22894700173958088, |
| "learning_rate": 5.006843241362149e-06, |
| "loss": 0.5092, |
| "num_tokens": 557556017.0, |
| "step": 1065 |
| }, |
| { |
| "epoch": 0.9972041006523765, |
| "grad_norm": 0.22935016975423794, |
| "learning_rate": 5.000962372741178e-06, |
| "loss": 0.5066, |
| "num_tokens": 560177457.0, |
| "step": 1070 |
| }, |
| { |
| "epoch": 1.0, |
| "num_tokens": 561488177.0, |
| "step": 1073, |
| "total_flos": 489077052801024.0, |
| "train_loss": 0.5497076121425362, |
| "train_runtime": 14506.5548, |
| "train_samples_per_second": 2.365, |
| "train_steps_per_second": 0.074 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 1073, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 489077052801024.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|