| { |
| "best_global_step": null, |
| "best_metric": 0.5373095273971558, |
| "best_model_checkpoint": null, |
| "epoch": 2.9276734210915545, |
| "eval_steps": 50, |
| "global_step": 4950, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0029581422866439876, |
| "grad_norm": 0.5572423338890076, |
| "learning_rate": 1.6589451880122303e-05, |
| "loss": 1.4063, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.005916284573287975, |
| "grad_norm": 0.4338622987270355, |
| "learning_rate": 3.7326266730275184e-05, |
| "loss": 1.3321, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.008874426859931962, |
| "grad_norm": 0.3512386381626129, |
| "learning_rate": 5.8063081580428065e-05, |
| "loss": 1.2302, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.01183256914657595, |
| "grad_norm": 0.2740453779697418, |
| "learning_rate": 7.879989643058095e-05, |
| "loss": 1.1511, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.014790711433219937, |
| "grad_norm": 0.20490019023418427, |
| "learning_rate": 9.953671128073382e-05, |
| "loss": 1.1108, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.017748853719863924, |
| "grad_norm": 0.18535283207893372, |
| "learning_rate": 0.00012027352613088669, |
| "loss": 1.0932, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.020706996006507914, |
| "grad_norm": 0.17043855786323547, |
| "learning_rate": 0.00014101034098103958, |
| "loss": 1.0726, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.0236651382931519, |
| "grad_norm": 0.16548407077789307, |
| "learning_rate": 0.00016174715583119247, |
| "loss": 1.055, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.026623280579795888, |
| "grad_norm": 0.17369449138641357, |
| "learning_rate": 0.00018248397068134533, |
| "loss": 1.0256, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.029581422866439874, |
| "grad_norm": 0.17916908860206604, |
| "learning_rate": 0.00020322078553149822, |
| "loss": 1.0189, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.029581422866439874, |
| "eval_loss": 1.0180954933166504, |
| "eval_runtime": 15.5481, |
| "eval_samples_per_second": 417.416, |
| "eval_steps_per_second": 13.056, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.032539565153083864, |
| "grad_norm": 0.16111324727535248, |
| "learning_rate": 0.0002239576003816511, |
| "loss": 1.0049, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.03549770743972785, |
| "grad_norm": 0.1549229621887207, |
| "learning_rate": 0.00024469441523180396, |
| "loss": 1.0133, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.03845584972637184, |
| "grad_norm": 0.16132739186286926, |
| "learning_rate": 0.00026543123008195685, |
| "loss": 1.0043, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.04141399201301583, |
| "grad_norm": 0.15359684824943542, |
| "learning_rate": 0.00028616804493210974, |
| "loss": 0.9838, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.04437213429965981, |
| "grad_norm": 0.1586950719356537, |
| "learning_rate": 0.0003069048597822626, |
| "loss": 0.9858, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.0473302765863038, |
| "grad_norm": 0.15954945981502533, |
| "learning_rate": 0.0003276416746324155, |
| "loss": 0.9659, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.05028841887294779, |
| "grad_norm": 0.15601466596126556, |
| "learning_rate": 0.00034837848948256835, |
| "loss": 0.9703, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.053246561159591775, |
| "grad_norm": 0.16445401310920715, |
| "learning_rate": 0.00036911530433272123, |
| "loss": 0.9421, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.056204703446235765, |
| "grad_norm": 0.15353924036026, |
| "learning_rate": 0.0003898521191828741, |
| "loss": 0.9504, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.05916284573287975, |
| "grad_norm": 0.15212711691856384, |
| "learning_rate": 0.000410588934033027, |
| "loss": 0.9703, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.05916284573287975, |
| "eval_loss": 0.9503689408302307, |
| "eval_runtime": 15.2084, |
| "eval_samples_per_second": 426.739, |
| "eval_steps_per_second": 13.348, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.06212098801952374, |
| "grad_norm": 0.15998579561710358, |
| "learning_rate": 0.00041473561507370503, |
| "loss": 0.9576, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.06507913030616773, |
| "grad_norm": 0.17387987673282623, |
| "learning_rate": 0.000414732844743397, |
| "loss": 0.9515, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.06803727259281171, |
| "grad_norm": 0.15958620607852936, |
| "learning_rate": 0.00041472794341999657, |
| "loss": 0.9326, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.0709954148794557, |
| "grad_norm": 0.15257656574249268, |
| "learning_rate": 0.00041472091115387234, |
| "loss": 0.9414, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.07395355716609969, |
| "grad_norm": 0.16304738819599152, |
| "learning_rate": 0.0004147117480172918, |
| "loss": 0.9594, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.07691169945274368, |
| "grad_norm": 0.150679811835289, |
| "learning_rate": 0.00041470045410442024, |
| "loss": 0.9134, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.07986984173938766, |
| "grad_norm": 0.1541481912136078, |
| "learning_rate": 0.00041468702953132027, |
| "loss": 0.9232, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.08282798402603166, |
| "grad_norm": 0.16420885920524597, |
| "learning_rate": 0.00041467147443595, |
| "loss": 0.9352, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.08578612631267564, |
| "grad_norm": 0.15783484280109406, |
| "learning_rate": 0.00041465378897816206, |
| "loss": 0.9439, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.08874426859931962, |
| "grad_norm": 0.14839908480644226, |
| "learning_rate": 0.0004146339733397021, |
| "loss": 0.9149, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.08874426859931962, |
| "eval_loss": 0.9155654311180115, |
| "eval_runtime": 15.0718, |
| "eval_samples_per_second": 430.606, |
| "eval_steps_per_second": 13.469, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.09170241088596362, |
| "grad_norm": 0.16119568049907684, |
| "learning_rate": 0.00041461202772420625, |
| "loss": 0.9055, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.0946605531726076, |
| "grad_norm": 0.14887715876102448, |
| "learning_rate": 0.0004145879523571998, |
| "loss": 0.8985, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.09761869545925159, |
| "grad_norm": 0.15016992390155792, |
| "learning_rate": 0.0004145617474860943, |
| "loss": 0.8983, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.10057683774589558, |
| "grad_norm": 0.15168191492557526, |
| "learning_rate": 0.00041453341338018547, |
| "loss": 0.9227, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.10353498003253957, |
| "grad_norm": 0.1442921757698059, |
| "learning_rate": 0.00041450295033064997, |
| "loss": 0.9184, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.10649312231918355, |
| "grad_norm": 0.15555252134799957, |
| "learning_rate": 0.00041447035865054287, |
| "loss": 0.8999, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.10945126460582753, |
| "grad_norm": 0.15253983438014984, |
| "learning_rate": 0.000414435638674794, |
| "loss": 0.923, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.11240940689247153, |
| "grad_norm": 0.15464100241661072, |
| "learning_rate": 0.00041439879076020483, |
| "loss": 0.9159, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.11536754917911551, |
| "grad_norm": 0.15457701683044434, |
| "learning_rate": 0.0004143598152854448, |
| "loss": 0.9146, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.1183256914657595, |
| "grad_norm": 0.16058233380317688, |
| "learning_rate": 0.00041431871265104717, |
| "loss": 0.9076, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.1183256914657595, |
| "eval_loss": 0.8913146257400513, |
| "eval_runtime": 15.1605, |
| "eval_samples_per_second": 428.087, |
| "eval_steps_per_second": 13.39, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.1212838337524035, |
| "grad_norm": 0.15900303423404694, |
| "learning_rate": 0.0004142754832794051, |
| "loss": 0.8976, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.12424197603904748, |
| "grad_norm": 0.15453267097473145, |
| "learning_rate": 0.0004142301276147672, |
| "loss": 0.911, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.12720011832569147, |
| "grad_norm": 0.15236064791679382, |
| "learning_rate": 0.0004141826461232332, |
| "loss": 0.8884, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.13015826061233546, |
| "grad_norm": 0.15213336050510406, |
| "learning_rate": 0.0004141330392927488, |
| "loss": 0.9041, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.13311640289897944, |
| "grad_norm": 0.15676744282245636, |
| "learning_rate": 0.00041408130763310113, |
| "loss": 0.9003, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.13607454518562342, |
| "grad_norm": 0.14520438015460968, |
| "learning_rate": 0.0004140274516759128, |
| "loss": 0.8936, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.1390326874722674, |
| "grad_norm": 0.147927924990654, |
| "learning_rate": 0.00041397147197463717, |
| "loss": 0.9004, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.1419908297589114, |
| "grad_norm": 0.14898143708705902, |
| "learning_rate": 0.0004139133691045523, |
| "loss": 0.899, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.1449489720455554, |
| "grad_norm": 0.13882724940776825, |
| "learning_rate": 0.00041385314366275514, |
| "loss": 0.8864, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.14790711433219939, |
| "grad_norm": 0.15192769467830658, |
| "learning_rate": 0.0004137907962681552, |
| "loss": 0.8892, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.14790711433219939, |
| "eval_loss": 0.8746127486228943, |
| "eval_runtime": 15.0604, |
| "eval_samples_per_second": 430.931, |
| "eval_steps_per_second": 13.479, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.15086525661884337, |
| "grad_norm": 0.14568747580051422, |
| "learning_rate": 0.0004137263275614684, |
| "loss": 0.8886, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.15382339890548735, |
| "grad_norm": 0.15014733374118805, |
| "learning_rate": 0.00041365973820521053, |
| "loss": 0.8922, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.15678154119213134, |
| "grad_norm": 0.15133148431777954, |
| "learning_rate": 0.00041359102888369024, |
| "loss": 0.8966, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.15973968347877532, |
| "grad_norm": 0.15641948580741882, |
| "learning_rate": 0.00041352020030300206, |
| "loss": 0.8962, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.16269782576541933, |
| "grad_norm": 0.146999329328537, |
| "learning_rate": 0.0004134472531910193, |
| "loss": 0.8973, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.1656559680520633, |
| "grad_norm": 0.15431025624275208, |
| "learning_rate": 0.0004133721882973865, |
| "loss": 0.871, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.1686141103387073, |
| "grad_norm": 0.15721946954727173, |
| "learning_rate": 0.00041329500639351136, |
| "loss": 0.8848, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.17157225262535128, |
| "grad_norm": 0.15370066463947296, |
| "learning_rate": 0.0004132157082725574, |
| "loss": 0.8834, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.17453039491199526, |
| "grad_norm": 0.15350687503814697, |
| "learning_rate": 0.00041313429474943564, |
| "loss": 0.8739, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.17748853719863925, |
| "grad_norm": 0.1491025686264038, |
| "learning_rate": 0.000413050766660796, |
| "loss": 0.872, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.17748853719863925, |
| "eval_loss": 0.8620766997337341, |
| "eval_runtime": 15.2152, |
| "eval_samples_per_second": 426.548, |
| "eval_steps_per_second": 13.342, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.18044667948528323, |
| "grad_norm": 0.15806402266025543, |
| "learning_rate": 0.00041296512486501866, |
| "loss": 0.8851, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.18340482177192724, |
| "grad_norm": 0.15328341722488403, |
| "learning_rate": 0.0004128773702422057, |
| "loss": 0.8724, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.18636296405857122, |
| "grad_norm": 0.15714845061302185, |
| "learning_rate": 0.00041278750369417157, |
| "loss": 0.8892, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.1893211063452152, |
| "grad_norm": 0.15943311154842377, |
| "learning_rate": 0.0004126955261444342, |
| "loss": 0.8683, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.1922792486318592, |
| "grad_norm": 0.1572256088256836, |
| "learning_rate": 0.00041260143853820517, |
| "loss": 0.8645, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.19523739091850317, |
| "grad_norm": 0.158464714884758, |
| "learning_rate": 0.0004125052418423802, |
| "loss": 0.8634, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.19819553320514716, |
| "grad_norm": 0.15009784698486328, |
| "learning_rate": 0.0004124069370455292, |
| "loss": 0.8579, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.20115367549179117, |
| "grad_norm": 0.15192705392837524, |
| "learning_rate": 0.00041230652515788596, |
| "loss": 0.8696, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.20411181777843515, |
| "grad_norm": 0.1550079882144928, |
| "learning_rate": 0.0004122040072113381, |
| "loss": 0.8438, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.20706996006507913, |
| "grad_norm": 0.14868152141571045, |
| "learning_rate": 0.00041209938425941614, |
| "loss": 0.8533, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.20706996006507913, |
| "eval_loss": 0.8512822389602661, |
| "eval_runtime": 15.4431, |
| "eval_samples_per_second": 420.252, |
| "eval_steps_per_second": 13.145, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.21002810235172312, |
| "grad_norm": 0.16069160401821136, |
| "learning_rate": 0.0004119926573772827, |
| "loss": 0.8731, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.2129862446383671, |
| "grad_norm": 0.15190179646015167, |
| "learning_rate": 0.00041188382766172164, |
| "loss": 0.8707, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.21594438692501108, |
| "grad_norm": 0.15935535728931427, |
| "learning_rate": 0.0004117728962311268, |
| "loss": 0.8414, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.21890252921165507, |
| "grad_norm": 0.14721055328845978, |
| "learning_rate": 0.00041165986422549004, |
| "loss": 0.853, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.22186067149829908, |
| "grad_norm": 0.15762685239315033, |
| "learning_rate": 0.0004115447328063903, |
| "loss": 0.8732, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.22481881378494306, |
| "grad_norm": 0.15402108430862427, |
| "learning_rate": 0.00041142750315698094, |
| "loss": 0.8595, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.22777695607158704, |
| "grad_norm": 0.15774597227573395, |
| "learning_rate": 0.000411308176481978, |
| "loss": 0.8621, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.23073509835823103, |
| "grad_norm": 0.1595357209444046, |
| "learning_rate": 0.00041118675400764773, |
| "loss": 0.8694, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.233693240644875, |
| "grad_norm": 0.14985965192317963, |
| "learning_rate": 0.000411063236981794, |
| "loss": 0.8526, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.236651382931519, |
| "grad_norm": 0.15028232336044312, |
| "learning_rate": 0.0004109376266737452, |
| "loss": 0.8737, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.236651382931519, |
| "eval_loss": 0.8394450545310974, |
| "eval_runtime": 15.1118, |
| "eval_samples_per_second": 429.467, |
| "eval_steps_per_second": 13.433, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.239609525218163, |
| "grad_norm": 0.1572600156068802, |
| "learning_rate": 0.00041080992437434155, |
| "loss": 0.8643, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.242567667504807, |
| "grad_norm": 0.15349853038787842, |
| "learning_rate": 0.00041068013139592194, |
| "loss": 0.8768, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.24552580979145097, |
| "grad_norm": 0.15303458273410797, |
| "learning_rate": 0.00041054824907231, |
| "loss": 0.8491, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.24848395207809496, |
| "grad_norm": 0.15635432302951813, |
| "learning_rate": 0.0004104142787588005, |
| "loss": 0.8489, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.25144209436473897, |
| "grad_norm": 0.1505730003118515, |
| "learning_rate": 0.00041027822183214595, |
| "loss": 0.8538, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.25440023665138295, |
| "grad_norm": 0.1578625589609146, |
| "learning_rate": 0.0004101400796905416, |
| "loss": 0.8533, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.25735837893802693, |
| "grad_norm": 0.15344196557998657, |
| "learning_rate": 0.0004099998537536117, |
| "loss": 0.8577, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.2603165212246709, |
| "grad_norm": 0.15440982580184937, |
| "learning_rate": 0.0004098575454623947, |
| "loss": 0.8494, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.2632746635113149, |
| "grad_norm": 0.15573708713054657, |
| "learning_rate": 0.0004097131562793286, |
| "loss": 0.8471, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.2662328057979589, |
| "grad_norm": 0.1527046263217926, |
| "learning_rate": 0.0004095666876882355, |
| "loss": 0.8486, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.2662328057979589, |
| "eval_loss": 0.8319525718688965, |
| "eval_runtime": 15.1071, |
| "eval_samples_per_second": 429.599, |
| "eval_steps_per_second": 13.437, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.26919094808460287, |
| "grad_norm": 0.1534292846918106, |
| "learning_rate": 0.00040941814119430694, |
| "loss": 0.8488, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.27214909037124685, |
| "grad_norm": 0.15478292107582092, |
| "learning_rate": 0.0004092675183240879, |
| "loss": 0.8442, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.27510723265789083, |
| "grad_norm": 0.16151192784309387, |
| "learning_rate": 0.00040911482062546144, |
| "loss": 0.8569, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.2780653749445348, |
| "grad_norm": 0.15042878687381744, |
| "learning_rate": 0.0004089600496676326, |
| "loss": 0.841, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.2810235172311788, |
| "grad_norm": 0.1557908058166504, |
| "learning_rate": 0.0004088032070411125, |
| "loss": 0.8455, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.2839816595178228, |
| "grad_norm": 0.15937361121177673, |
| "learning_rate": 0.00040864429435770184, |
| "loss": 0.8363, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.2869398018044668, |
| "grad_norm": 0.15416064858436584, |
| "learning_rate": 0.0004084833132504743, |
| "loss": 0.851, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.2898979440911108, |
| "grad_norm": 0.14673134684562683, |
| "learning_rate": 0.00040832026537375974, |
| "loss": 0.8293, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.2928560863777548, |
| "grad_norm": 0.159574493765831, |
| "learning_rate": 0.0004081551524031274, |
| "loss": 0.8419, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.29581422866439877, |
| "grad_norm": 0.15907645225524902, |
| "learning_rate": 0.0004079879760353685, |
| "loss": 0.8525, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.29581422866439877, |
| "eval_loss": 0.8241714835166931, |
| "eval_runtime": 15.1548, |
| "eval_samples_per_second": 428.247, |
| "eval_steps_per_second": 13.395, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.29877237095104275, |
| "grad_norm": 0.15710967779159546, |
| "learning_rate": 0.000407818737988479, |
| "loss": 0.829, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.30173051323768674, |
| "grad_norm": 0.1684395968914032, |
| "learning_rate": 0.00040764744000164154, |
| "loss": 0.8652, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.3046886555243307, |
| "grad_norm": 0.15995532274246216, |
| "learning_rate": 0.00040747408383520804, |
| "loss": 0.8387, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.3076467978109747, |
| "grad_norm": 0.161375030875206, |
| "learning_rate": 0.00040729867127068135, |
| "loss": 0.8483, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.3106049400976187, |
| "grad_norm": 0.15752190351486206, |
| "learning_rate": 0.0004071212041106969, |
| "loss": 0.8366, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.31356308238426267, |
| "grad_norm": 0.15867403149604797, |
| "learning_rate": 0.00040694168417900443, |
| "loss": 0.8411, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.31652122467090665, |
| "grad_norm": 0.1642945110797882, |
| "learning_rate": 0.0004067601133204489, |
| "loss": 0.8401, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.31947936695755064, |
| "grad_norm": 0.15721701085567474, |
| "learning_rate": 0.0004065764934009518, |
| "loss": 0.8392, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.3224375092441946, |
| "grad_norm": 0.1723637729883194, |
| "learning_rate": 0.000406390826307492, |
| "loss": 0.8339, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.32539565153083866, |
| "grad_norm": 0.16064327955245972, |
| "learning_rate": 0.00040620311394808616, |
| "loss": 0.833, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.32539565153083866, |
| "eval_loss": 0.8170909881591797, |
| "eval_runtime": 15.1491, |
| "eval_samples_per_second": 428.409, |
| "eval_steps_per_second": 13.4, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.32835379381748264, |
| "grad_norm": 0.15267042815685272, |
| "learning_rate": 0.0004060133582517691, |
| "loss": 0.8436, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.3313119361041266, |
| "grad_norm": 0.16683924198150635, |
| "learning_rate": 0.00040582156116857423, |
| "loss": 0.8494, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.3342700783907706, |
| "grad_norm": 0.16277329623699188, |
| "learning_rate": 0.0004056277246695134, |
| "loss": 0.853, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.3372282206774146, |
| "grad_norm": 0.16534163057804108, |
| "learning_rate": 0.00040543185074655647, |
| "loss": 0.8282, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.3401863629640586, |
| "grad_norm": 0.1668074131011963, |
| "learning_rate": 0.00040523394141261113, |
| "loss": 0.8288, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.34314450525070256, |
| "grad_norm": 0.1653686761856079, |
| "learning_rate": 0.0004050339987015018, |
| "loss": 0.842, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.34610264753734654, |
| "grad_norm": 0.15694311261177063, |
| "learning_rate": 0.00040483202466794953, |
| "loss": 0.8321, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.3490607898239905, |
| "grad_norm": 0.16013135015964508, |
| "learning_rate": 0.00040462802138754975, |
| "loss": 0.83, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.3520189321106345, |
| "grad_norm": 0.1565493494272232, |
| "learning_rate": 0.00040442199095675185, |
| "loss": 0.829, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.3549770743972785, |
| "grad_norm": 0.15936292707920074, |
| "learning_rate": 0.00040421393549283733, |
| "loss": 0.8292, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.3549770743972785, |
| "eval_loss": 0.8113046288490295, |
| "eval_runtime": 15.0797, |
| "eval_samples_per_second": 430.381, |
| "eval_steps_per_second": 13.462, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.3579352166839225, |
| "grad_norm": 0.1653825342655182, |
| "learning_rate": 0.00040400385713389793, |
| "loss": 0.8255, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.36089335897056646, |
| "grad_norm": 0.1677619218826294, |
| "learning_rate": 0.00040379175803881387, |
| "loss": 0.834, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.3638515012572105, |
| "grad_norm": 0.17027856409549713, |
| "learning_rate": 0.0004035776403872316, |
| "loss": 0.8374, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.3668096435438545, |
| "grad_norm": 0.17102032899856567, |
| "learning_rate": 0.0004033615063795411, |
| "loss": 0.8299, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.36976778583049846, |
| "grad_norm": 0.16256418824195862, |
| "learning_rate": 0.00040314335823685377, |
| "loss": 0.8367, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.37272592811714245, |
| "grad_norm": 0.15656936168670654, |
| "learning_rate": 0.00040292319820097936, |
| "loss": 0.8276, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.37568407040378643, |
| "grad_norm": 0.16188517212867737, |
| "learning_rate": 0.0004027010285344028, |
| "loss": 0.8303, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.3786422126904304, |
| "grad_norm": 0.15468132495880127, |
| "learning_rate": 0.00040247685152026123, |
| "loss": 0.8257, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.3816003549770744, |
| "grad_norm": 0.16564877331256866, |
| "learning_rate": 0.0004022506694623202, |
| "loss": 0.8388, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.3845584972637184, |
| "grad_norm": 0.15713313221931458, |
| "learning_rate": 0.0004020224846849505, |
| "loss": 0.8214, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.3845584972637184, |
| "eval_loss": 0.8043718934059143, |
| "eval_runtime": 15.1261, |
| "eval_samples_per_second": 429.061, |
| "eval_steps_per_second": 13.421, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.38751663955036236, |
| "grad_norm": 0.15846404433250427, |
| "learning_rate": 0.0004017922995331036, |
| "loss": 0.8492, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.39047478183700635, |
| "grad_norm": 0.16053859889507294, |
| "learning_rate": 0.0004015601163722882, |
| "loss": 0.8208, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.39343292412365033, |
| "grad_norm": 0.16480772197246552, |
| "learning_rate": 0.00040132593758854544, |
| "loss": 0.8314, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.3963910664102943, |
| "grad_norm": 0.16306859254837036, |
| "learning_rate": 0.00040108976558842467, |
| "loss": 0.8336, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.3993492086969383, |
| "grad_norm": 0.15954424440860748, |
| "learning_rate": 0.00040085160279895856, |
| "loss": 0.8325, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.40230735098358233, |
| "grad_norm": 0.16945534944534302, |
| "learning_rate": 0.0004006114516676383, |
| "loss": 0.8283, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.4052654932702263, |
| "grad_norm": 0.16391772031784058, |
| "learning_rate": 0.00040036931466238835, |
| "loss": 0.8321, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.4082236355568703, |
| "grad_norm": 0.16165830194950104, |
| "learning_rate": 0.0004001251942715411, |
| "loss": 0.8477, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.4111817778435143, |
| "grad_norm": 0.15972331166267395, |
| "learning_rate": 0.00039987909300381115, |
| "loss": 0.8187, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.41413992013015827, |
| "grad_norm": 0.16436176002025604, |
| "learning_rate": 0.00039963101338826994, |
| "loss": 0.8277, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.41413992013015827, |
| "eval_loss": 0.7969197630882263, |
| "eval_runtime": 15.1288, |
| "eval_samples_per_second": 428.984, |
| "eval_steps_per_second": 13.418, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.41709806241680225, |
| "grad_norm": 0.16101345419883728, |
| "learning_rate": 0.0003993809579743193, |
| "loss": 0.8236, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.42005620470344623, |
| "grad_norm": 0.16163453459739685, |
| "learning_rate": 0.00039912892933166545, |
| "loss": 0.8234, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.4230143469900902, |
| "grad_norm": 0.1685408502817154, |
| "learning_rate": 0.00039887493005029266, |
| "loss": 0.8164, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.4259724892767342, |
| "grad_norm": 0.16631852090358734, |
| "learning_rate": 0.00039861896274043663, |
| "loss": 0.8306, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.4289306315633782, |
| "grad_norm": 0.15881124138832092, |
| "learning_rate": 0.0003983610300325574, |
| "loss": 0.8279, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.43188877385002217, |
| "grad_norm": 0.1685400754213333, |
| "learning_rate": 0.0003981011345773126, |
| "loss": 0.8261, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.43484691613666615, |
| "grad_norm": 0.16085389256477356, |
| "learning_rate": 0.0003978392790455303, |
| "loss": 0.8192, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.43780505842331013, |
| "grad_norm": 0.15905866026878357, |
| "learning_rate": 0.0003975754661281811, |
| "loss": 0.8139, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.4407632007099542, |
| "grad_norm": 0.16905803978443146, |
| "learning_rate": 0.00039730969853635093, |
| "loss": 0.8241, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.44372134299659816, |
| "grad_norm": 0.1739426702260971, |
| "learning_rate": 0.000397041979001213, |
| "loss": 0.8001, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.44372134299659816, |
| "eval_loss": 0.791688859462738, |
| "eval_runtime": 15.0867, |
| "eval_samples_per_second": 430.18, |
| "eval_steps_per_second": 13.456, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.44667948528324214, |
| "grad_norm": 0.16287444531917572, |
| "learning_rate": 0.0003967723102739998, |
| "loss": 0.8244, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.4496376275698861, |
| "grad_norm": 0.17151105403900146, |
| "learning_rate": 0.00039650069512597473, |
| "loss": 0.8309, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.4525957698565301, |
| "grad_norm": 0.16484029591083527, |
| "learning_rate": 0.0003962271363484036, |
| "loss": 0.8193, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.4555539121431741, |
| "grad_norm": 0.16619396209716797, |
| "learning_rate": 0.0003959516367525262, |
| "loss": 0.7922, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.4585120544298181, |
| "grad_norm": 0.1678115576505661, |
| "learning_rate": 0.00039567419916952706, |
| "loss": 0.8085, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.46147019671646206, |
| "grad_norm": 0.1662498414516449, |
| "learning_rate": 0.00039539482645050664, |
| "loss": 0.8095, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.46442833900310604, |
| "grad_norm": 0.17038890719413757, |
| "learning_rate": 0.0003951135214664519, |
| "loss": 0.8373, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.46738648128975, |
| "grad_norm": 0.16738362610340118, |
| "learning_rate": 0.0003948302871082067, |
| "loss": 0.8164, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.470344623576394, |
| "grad_norm": 0.16601060330867767, |
| "learning_rate": 0.0003945451262864425, |
| "loss": 0.8161, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.473302765863038, |
| "grad_norm": 0.16971147060394287, |
| "learning_rate": 0.00039425804193162774, |
| "loss": 0.7995, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.473302765863038, |
| "eval_loss": 0.7893310785293579, |
| "eval_runtime": 15.0955, |
| "eval_samples_per_second": 429.929, |
| "eval_steps_per_second": 13.448, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.476260908149682, |
| "grad_norm": 0.17346754670143127, |
| "learning_rate": 0.00039396903699399845, |
| "loss": 0.82, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.479219050436326, |
| "grad_norm": 0.16586238145828247, |
| "learning_rate": 0.00039367811444352747, |
| "loss": 0.8057, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.48217719272297, |
| "grad_norm": 0.16300995647907257, |
| "learning_rate": 0.0003933852772698941, |
| "loss": 0.8095, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.485135335009614, |
| "grad_norm": 0.17610788345336914, |
| "learning_rate": 0.00039309052848245346, |
| "loss": 0.8277, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.48809347729625796, |
| "grad_norm": 0.1696024090051651, |
| "learning_rate": 0.0003927938711102054, |
| "loss": 0.8172, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.49105161958290194, |
| "grad_norm": 0.16828782856464386, |
| "learning_rate": 0.00039249530820176335, |
| "loss": 0.8249, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.4940097618695459, |
| "grad_norm": 0.17049913108348846, |
| "learning_rate": 0.00039219484282532316, |
| "loss": 0.8145, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.4969679041561899, |
| "grad_norm": 0.16307583451271057, |
| "learning_rate": 0.00039189247806863136, |
| "loss": 0.8137, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.4999260464428339, |
| "grad_norm": 0.17322470247745514, |
| "learning_rate": 0.00039158821703895387, |
| "loss": 0.8133, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.5028841887294779, |
| "grad_norm": 0.16524559259414673, |
| "learning_rate": 0.0003912820628630433, |
| "loss": 0.8153, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.5028841887294779, |
| "eval_loss": 0.7836877703666687, |
| "eval_runtime": 15.0882, |
| "eval_samples_per_second": 430.136, |
| "eval_steps_per_second": 13.454, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.5058423310161219, |
| "grad_norm": 0.16517767310142517, |
| "learning_rate": 0.0003909740186871077, |
| "loss": 0.7993, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.5088004733027659, |
| "grad_norm": 0.16940750181674957, |
| "learning_rate": 0.0003906640876767774, |
| "loss": 0.8145, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.5117586155894098, |
| "grad_norm": 0.17082969844341278, |
| "learning_rate": 0.00039035227301707315, |
| "loss": 0.8084, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.5147167578760539, |
| "grad_norm": 0.16588161885738373, |
| "learning_rate": 0.000390038577912373, |
| "loss": 0.7991, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.5176749001626978, |
| "grad_norm": 0.1763608604669571, |
| "learning_rate": 0.0003897230055863795, |
| "loss": 0.8118, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.5206330424493418, |
| "grad_norm": 0.16826897859573364, |
| "learning_rate": 0.00038940555928208674, |
| "loss": 0.8041, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.5235911847359858, |
| "grad_norm": 0.17069588601589203, |
| "learning_rate": 0.00038908624226174633, |
| "loss": 0.8186, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.5265493270226298, |
| "grad_norm": 0.16954410076141357, |
| "learning_rate": 0.000388765057806835, |
| "loss": 0.8085, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.5295074693092737, |
| "grad_norm": 0.17242303490638733, |
| "learning_rate": 0.00038844200921801976, |
| "loss": 0.8158, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.5324656115959178, |
| "grad_norm": 0.1635177582502365, |
| "learning_rate": 0.0003881170998151248, |
| "loss": 0.7943, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.5324656115959178, |
| "eval_loss": 0.7797777652740479, |
| "eval_runtime": 15.1642, |
| "eval_samples_per_second": 427.983, |
| "eval_steps_per_second": 13.387, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.5354237538825618, |
| "grad_norm": 0.16979654133319855, |
| "learning_rate": 0.00038779033293709694, |
| "loss": 0.82, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.5383818961692057, |
| "grad_norm": 0.1702311635017395, |
| "learning_rate": 0.0003874617119419714, |
| "loss": 0.8196, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.5413400384558498, |
| "grad_norm": 0.16838914155960083, |
| "learning_rate": 0.00038713124020683736, |
| "loss": 0.8044, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.5442981807424937, |
| "grad_norm": 0.16786698997020721, |
| "learning_rate": 0.00038679892112780315, |
| "loss": 0.8109, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.5472563230291377, |
| "grad_norm": 0.16810354590415955, |
| "learning_rate": 0.0003864647581199616, |
| "loss": 0.7934, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.5502144653157817, |
| "grad_norm": 0.18060894310474396, |
| "learning_rate": 0.00038612875461735457, |
| "loss": 0.8181, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.5531726076024257, |
| "grad_norm": 0.17606692016124725, |
| "learning_rate": 0.00038579091407293784, |
| "loss": 0.82, |
| "step": 935 |
| }, |
| { |
| "epoch": 0.5561307498890696, |
| "grad_norm": 0.16881082952022552, |
| "learning_rate": 0.0003854512399585459, |
| "loss": 0.8165, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.5590888921757137, |
| "grad_norm": 0.1732224076986313, |
| "learning_rate": 0.0003851097357648557, |
| "loss": 0.8104, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.5620470344623576, |
| "grad_norm": 0.1755875200033188, |
| "learning_rate": 0.0003847664050013512, |
| "loss": 0.8037, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.5620470344623576, |
| "eval_loss": 0.7749656438827515, |
| "eval_runtime": 15.1327, |
| "eval_samples_per_second": 428.871, |
| "eval_steps_per_second": 13.415, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.5650051767490016, |
| "grad_norm": 0.1778503805398941, |
| "learning_rate": 0.00038442125119628727, |
| "loss": 0.8009, |
| "step": 955 |
| }, |
| { |
| "epoch": 0.5679633190356456, |
| "grad_norm": 0.1720447540283203, |
| "learning_rate": 0.0003840742778966532, |
| "loss": 0.8054, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.5709214613222896, |
| "grad_norm": 0.1675298660993576, |
| "learning_rate": 0.0003837254886681367, |
| "loss": 0.8059, |
| "step": 965 |
| }, |
| { |
| "epoch": 0.5738796036089336, |
| "grad_norm": 0.16319267451763153, |
| "learning_rate": 0.0003833748870950865, |
| "loss": 0.8017, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.5768377458955776, |
| "grad_norm": 0.1670883148908615, |
| "learning_rate": 0.0003830224767804762, |
| "loss": 0.7935, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.5797958881822216, |
| "grad_norm": 0.17353768646717072, |
| "learning_rate": 0.00038266826134586727, |
| "loss": 0.8116, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.5827540304688655, |
| "grad_norm": 0.180403470993042, |
| "learning_rate": 0.0003823122444313713, |
| "loss": 0.8087, |
| "step": 985 |
| }, |
| { |
| "epoch": 0.5857121727555096, |
| "grad_norm": 0.17455299198627472, |
| "learning_rate": 0.00038195442969561275, |
| "loss": 0.8041, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.5886703150421535, |
| "grad_norm": 0.1729428619146347, |
| "learning_rate": 0.0003815948208156917, |
| "loss": 0.7989, |
| "step": 995 |
| }, |
| { |
| "epoch": 0.5916284573287975, |
| "grad_norm": 0.16548192501068115, |
| "learning_rate": 0.00038123342148714594, |
| "loss": 0.8095, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.5916284573287975, |
| "eval_loss": 0.7690043449401855, |
| "eval_runtime": 15.0799, |
| "eval_samples_per_second": 430.376, |
| "eval_steps_per_second": 13.462, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.5945865996154415, |
| "grad_norm": 0.1718030571937561, |
| "learning_rate": 0.0003808702354239126, |
| "loss": 0.8028, |
| "step": 1005 |
| }, |
| { |
| "epoch": 0.5975447419020855, |
| "grad_norm": 0.1787232607603073, |
| "learning_rate": 0.00038050526635829035, |
| "loss": 0.806, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.6005028841887294, |
| "grad_norm": 0.16556113958358765, |
| "learning_rate": 0.0003801385180409012, |
| "loss": 0.8054, |
| "step": 1015 |
| }, |
| { |
| "epoch": 0.6034610264753735, |
| "grad_norm": 0.19446386396884918, |
| "learning_rate": 0.00037976999424065147, |
| "loss": 0.8107, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.6064191687620174, |
| "grad_norm": 0.17038469016551971, |
| "learning_rate": 0.0003793996987446934, |
| "loss": 0.7748, |
| "step": 1025 |
| }, |
| { |
| "epoch": 0.6093773110486614, |
| "grad_norm": 0.1731182187795639, |
| "learning_rate": 0.00037902763535838606, |
| "loss": 0.8054, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.6123354533353055, |
| "grad_norm": 0.1793605238199234, |
| "learning_rate": 0.0003786538079052565, |
| "loss": 0.8161, |
| "step": 1035 |
| }, |
| { |
| "epoch": 0.6152935956219494, |
| "grad_norm": 0.174575075507164, |
| "learning_rate": 0.0003782782202269602, |
| "loss": 0.8056, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.6182517379085934, |
| "grad_norm": 0.17301982641220093, |
| "learning_rate": 0.0003779008761832416, |
| "loss": 0.8057, |
| "step": 1045 |
| }, |
| { |
| "epoch": 0.6212098801952374, |
| "grad_norm": 0.1758825033903122, |
| "learning_rate": 0.0003775217796518946, |
| "loss": 0.7916, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.6212098801952374, |
| "eval_loss": 0.7642711997032166, |
| "eval_runtime": 15.0843, |
| "eval_samples_per_second": 430.249, |
| "eval_steps_per_second": 13.458, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.6241680224818814, |
| "grad_norm": 0.1691800355911255, |
| "learning_rate": 0.0003771409345287227, |
| "loss": 0.7948, |
| "step": 1055 |
| }, |
| { |
| "epoch": 0.6271261647685253, |
| "grad_norm": 0.167150616645813, |
| "learning_rate": 0.0003767583447274987, |
| "loss": 0.812, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.6300843070551694, |
| "grad_norm": 0.18501031398773193, |
| "learning_rate": 0.00037637401417992477, |
| "loss": 0.8015, |
| "step": 1065 |
| }, |
| { |
| "epoch": 0.6330424493418133, |
| "grad_norm": 0.17512960731983185, |
| "learning_rate": 0.0003759879468355919, |
| "loss": 0.8057, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.6360005916284573, |
| "grad_norm": 0.1902162879705429, |
| "learning_rate": 0.0003756001466619395, |
| "loss": 0.8044, |
| "step": 1075 |
| }, |
| { |
| "epoch": 0.6389587339151013, |
| "grad_norm": 0.17907238006591797, |
| "learning_rate": 0.0003752106176442142, |
| "loss": 0.8126, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.6419168762017453, |
| "grad_norm": 0.1738317906856537, |
| "learning_rate": 0.00037481936378542944, |
| "loss": 0.805, |
| "step": 1085 |
| }, |
| { |
| "epoch": 0.6448750184883892, |
| "grad_norm": 0.18125277757644653, |
| "learning_rate": 0.00037442638910632406, |
| "loss": 0.8205, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.6478331607750333, |
| "grad_norm": 0.16496701538562775, |
| "learning_rate": 0.00037403169764532073, |
| "loss": 0.8009, |
| "step": 1095 |
| }, |
| { |
| "epoch": 0.6507913030616773, |
| "grad_norm": 0.17556807398796082, |
| "learning_rate": 0.000373635293458485, |
| "loss": 0.814, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.6507913030616773, |
| "eval_loss": 0.7593667507171631, |
| "eval_runtime": 15.0495, |
| "eval_samples_per_second": 431.243, |
| "eval_steps_per_second": 13.489, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.6537494453483212, |
| "grad_norm": 0.17472174763679504, |
| "learning_rate": 0.00037323718061948313, |
| "loss": 0.8207, |
| "step": 1105 |
| }, |
| { |
| "epoch": 0.6567075876349653, |
| "grad_norm": 0.17784008383750916, |
| "learning_rate": 0.0003728373632195406, |
| "loss": 0.7911, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.6596657299216092, |
| "grad_norm": 0.17336086928844452, |
| "learning_rate": 0.00037243584536739973, |
| "loss": 0.8007, |
| "step": 1115 |
| }, |
| { |
| "epoch": 0.6626238722082533, |
| "grad_norm": 0.17311853170394897, |
| "learning_rate": 0.00037203263118927777, |
| "loss": 0.8016, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.6655820144948972, |
| "grad_norm": 0.1790124475955963, |
| "learning_rate": 0.00037162772482882416, |
| "loss": 0.8022, |
| "step": 1125 |
| }, |
| { |
| "epoch": 0.6685401567815412, |
| "grad_norm": 0.175547793507576, |
| "learning_rate": 0.0003712211304470783, |
| "loss": 0.8097, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.6714982990681851, |
| "grad_norm": 0.18130792677402496, |
| "learning_rate": 0.00037081285222242646, |
| "loss": 0.8033, |
| "step": 1135 |
| }, |
| { |
| "epoch": 0.6744564413548292, |
| "grad_norm": 0.1801801323890686, |
| "learning_rate": 0.0003704028943505593, |
| "loss": 0.8023, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.6774145836414731, |
| "grad_norm": 0.18148784339427948, |
| "learning_rate": 0.000369991261044428, |
| "loss": 0.7823, |
| "step": 1145 |
| }, |
| { |
| "epoch": 0.6803727259281172, |
| "grad_norm": 0.18274278938770294, |
| "learning_rate": 0.0003695779565342018, |
| "loss": 0.796, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.6803727259281172, |
| "eval_loss": 0.7549857497215271, |
| "eval_runtime": 15.0502, |
| "eval_samples_per_second": 431.224, |
| "eval_steps_per_second": 13.488, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.6833308682147611, |
| "grad_norm": 0.1774689257144928, |
| "learning_rate": 0.000369162985067224, |
| "loss": 0.7964, |
| "step": 1155 |
| }, |
| { |
| "epoch": 0.6862890105014051, |
| "grad_norm": 0.17815682291984558, |
| "learning_rate": 0.00036874635090796846, |
| "loss": 0.7863, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.6892471527880492, |
| "grad_norm": 0.1769675314426422, |
| "learning_rate": 0.00036832805833799585, |
| "loss": 0.8083, |
| "step": 1165 |
| }, |
| { |
| "epoch": 0.6922052950746931, |
| "grad_norm": 0.1735963374376297, |
| "learning_rate": 0.0003679081116559094, |
| "loss": 0.7853, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.6951634373613371, |
| "grad_norm": 0.17244546115398407, |
| "learning_rate": 0.0003674865151773111, |
| "loss": 0.784, |
| "step": 1175 |
| }, |
| { |
| "epoch": 0.698121579647981, |
| "grad_norm": 0.1718629151582718, |
| "learning_rate": 0.00036706327323475713, |
| "loss": 0.7971, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.7010797219346251, |
| "grad_norm": 0.18205584585666656, |
| "learning_rate": 0.0003666383901777132, |
| "loss": 0.771, |
| "step": 1185 |
| }, |
| { |
| "epoch": 0.704037864221269, |
| "grad_norm": 0.18081381916999817, |
| "learning_rate": 0.00036621187037251003, |
| "loss": 0.7801, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.706996006507913, |
| "grad_norm": 0.18197500705718994, |
| "learning_rate": 0.00036578371820229874, |
| "loss": 0.805, |
| "step": 1195 |
| }, |
| { |
| "epoch": 0.709954148794557, |
| "grad_norm": 0.17939776182174683, |
| "learning_rate": 0.0003653539380670052, |
| "loss": 0.7792, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.709954148794557, |
| "eval_loss": 0.7517691254615784, |
| "eval_runtime": 15.0963, |
| "eval_samples_per_second": 429.906, |
| "eval_steps_per_second": 13.447, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.712912291081201, |
| "grad_norm": 0.19871175289154053, |
| "learning_rate": 0.0003649225343832853, |
| "loss": 0.8001, |
| "step": 1205 |
| }, |
| { |
| "epoch": 0.715870433367845, |
| "grad_norm": 0.1700190007686615, |
| "learning_rate": 0.0003644895115844793, |
| "loss": 0.7888, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.718828575654489, |
| "grad_norm": 0.1822744756937027, |
| "learning_rate": 0.0003640548741205665, |
| "loss": 0.7745, |
| "step": 1215 |
| }, |
| { |
| "epoch": 0.7217867179411329, |
| "grad_norm": 0.1826736479997635, |
| "learning_rate": 0.00036361862645811933, |
| "loss": 0.7822, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.724744860227777, |
| "grad_norm": 0.18002553284168243, |
| "learning_rate": 0.0003631807730802575, |
| "loss": 0.7809, |
| "step": 1225 |
| }, |
| { |
| "epoch": 0.727703002514421, |
| "grad_norm": 0.1772574633359909, |
| "learning_rate": 0.0003627413184866018, |
| "loss": 0.7874, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.7306611448010649, |
| "grad_norm": 0.2032460868358612, |
| "learning_rate": 0.00036230026719322834, |
| "loss": 0.7983, |
| "step": 1235 |
| }, |
| { |
| "epoch": 0.733619287087709, |
| "grad_norm": 0.17761647701263428, |
| "learning_rate": 0.0003618576237326213, |
| "loss": 0.7991, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.7365774293743529, |
| "grad_norm": 0.17456114292144775, |
| "learning_rate": 0.0003614133926536273, |
| "loss": 0.7882, |
| "step": 1245 |
| }, |
| { |
| "epoch": 0.7395355716609969, |
| "grad_norm": 0.17203833162784576, |
| "learning_rate": 0.00036096757852140804, |
| "loss": 0.7704, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.7395355716609969, |
| "eval_loss": 0.7466259002685547, |
| "eval_runtime": 15.0614, |
| "eval_samples_per_second": 430.902, |
| "eval_steps_per_second": 13.478, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.7424937139476409, |
| "grad_norm": 0.1762530505657196, |
| "learning_rate": 0.00036052018591739327, |
| "loss": 0.7914, |
| "step": 1255 |
| }, |
| { |
| "epoch": 0.7454518562342849, |
| "grad_norm": 0.18494775891304016, |
| "learning_rate": 0.00036007121943923436, |
| "loss": 0.7836, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.7484099985209288, |
| "grad_norm": 0.1850501149892807, |
| "learning_rate": 0.0003596206837007565, |
| "loss": 0.7903, |
| "step": 1265 |
| }, |
| { |
| "epoch": 0.7513681408075729, |
| "grad_norm": 0.18491573631763458, |
| "learning_rate": 0.0003591685833319115, |
| "loss": 0.7793, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.7543262830942168, |
| "grad_norm": 0.17476260662078857, |
| "learning_rate": 0.0003587149229787301, |
| "loss": 0.7846, |
| "step": 1275 |
| }, |
| { |
| "epoch": 0.7572844253808608, |
| "grad_norm": 0.1844286024570465, |
| "learning_rate": 0.00035825970730327437, |
| "loss": 0.7933, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.7602425676675048, |
| "grad_norm": 0.17650367319583893, |
| "learning_rate": 0.00035780294098358966, |
| "loss": 0.7769, |
| "step": 1285 |
| }, |
| { |
| "epoch": 0.7632007099541488, |
| "grad_norm": 0.18384352326393127, |
| "learning_rate": 0.0003573446287136567, |
| "loss": 0.794, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.7661588522407928, |
| "grad_norm": 0.16338300704956055, |
| "learning_rate": 0.0003568847752033431, |
| "loss": 0.7857, |
| "step": 1295 |
| }, |
| { |
| "epoch": 0.7691169945274368, |
| "grad_norm": 0.17690972983837128, |
| "learning_rate": 0.0003564233851783553, |
| "loss": 0.7754, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.7691169945274368, |
| "eval_loss": 0.7431380152702332, |
| "eval_runtime": 15.132, |
| "eval_samples_per_second": 428.891, |
| "eval_steps_per_second": 13.415, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.7720751368140808, |
| "grad_norm": 0.1823299676179886, |
| "learning_rate": 0.0003559604633801894, |
| "loss": 0.7862, |
| "step": 1305 |
| }, |
| { |
| "epoch": 0.7750332791007247, |
| "grad_norm": 0.1779462695121765, |
| "learning_rate": 0.00035549601456608343, |
| "loss": 0.7896, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.7779914213873688, |
| "grad_norm": 0.18377184867858887, |
| "learning_rate": 0.00035503004350896736, |
| "loss": 0.7925, |
| "step": 1315 |
| }, |
| { |
| "epoch": 0.7809495636740127, |
| "grad_norm": 0.1728079617023468, |
| "learning_rate": 0.00035456255499741483, |
| "loss": 0.7695, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.7839077059606567, |
| "grad_norm": 0.18434765934944153, |
| "learning_rate": 0.0003540935538355937, |
| "loss": 0.7758, |
| "step": 1325 |
| }, |
| { |
| "epoch": 0.7868658482473007, |
| "grad_norm": 0.17789992690086365, |
| "learning_rate": 0.00035362304484321634, |
| "loss": 0.7922, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.7898239905339447, |
| "grad_norm": 0.1814350187778473, |
| "learning_rate": 0.0003531510328554907, |
| "loss": 0.7799, |
| "step": 1335 |
| }, |
| { |
| "epoch": 0.7927821328205886, |
| "grad_norm": 0.17939285933971405, |
| "learning_rate": 0.00035267752272307037, |
| "loss": 0.7789, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.7957402751072327, |
| "grad_norm": 0.17399096488952637, |
| "learning_rate": 0.0003522025193120045, |
| "loss": 0.7764, |
| "step": 1345 |
| }, |
| { |
| "epoch": 0.7986984173938766, |
| "grad_norm": 0.1716017723083496, |
| "learning_rate": 0.0003517260275036881, |
| "loss": 0.7668, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.7986984173938766, |
| "eval_loss": 0.7402477860450745, |
| "eval_runtime": 15.0911, |
| "eval_samples_per_second": 430.055, |
| "eval_steps_per_second": 13.452, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.8016565596805206, |
| "grad_norm": 0.1866738647222519, |
| "learning_rate": 0.0003512480521948117, |
| "loss": 0.779, |
| "step": 1355 |
| }, |
| { |
| "epoch": 0.8046147019671647, |
| "grad_norm": 0.1841634213924408, |
| "learning_rate": 0.00035076859829731116, |
| "loss": 0.7792, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.8075728442538086, |
| "grad_norm": 0.18017052114009857, |
| "learning_rate": 0.0003502876707383171, |
| "loss": 0.7752, |
| "step": 1365 |
| }, |
| { |
| "epoch": 0.8105309865404526, |
| "grad_norm": 0.1775410771369934, |
| "learning_rate": 0.00034980527446010435, |
| "loss": 0.7654, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.8134891288270966, |
| "grad_norm": 0.17534510791301727, |
| "learning_rate": 0.00034932141442004086, |
| "loss": 0.7778, |
| "step": 1375 |
| }, |
| { |
| "epoch": 0.8164472711137406, |
| "grad_norm": 0.17751650512218475, |
| "learning_rate": 0.0003488360955905374, |
| "loss": 0.7792, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.8194054134003845, |
| "grad_norm": 0.18025827407836914, |
| "learning_rate": 0.0003483493229589956, |
| "loss": 0.777, |
| "step": 1385 |
| }, |
| { |
| "epoch": 0.8223635556870286, |
| "grad_norm": 0.18627671897411346, |
| "learning_rate": 0.0003478611015277576, |
| "loss": 0.7616, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.8253216979736725, |
| "grad_norm": 0.18397071957588196, |
| "learning_rate": 0.0003473714363140539, |
| "loss": 0.7775, |
| "step": 1395 |
| }, |
| { |
| "epoch": 0.8282798402603165, |
| "grad_norm": 0.17657072842121124, |
| "learning_rate": 0.0003468803323499522, |
| "loss": 0.7777, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.8282798402603165, |
| "eval_loss": 0.7356473803520203, |
| "eval_runtime": 15.1704, |
| "eval_samples_per_second": 427.807, |
| "eval_steps_per_second": 13.381, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.8312379825469605, |
| "grad_norm": 0.18247635662555695, |
| "learning_rate": 0.00034638779468230556, |
| "loss": 0.7741, |
| "step": 1405 |
| }, |
| { |
| "epoch": 0.8341961248336045, |
| "grad_norm": 0.18058709800243378, |
| "learning_rate": 0.0003458938283727006, |
| "loss": 0.7856, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.8371542671202484, |
| "grad_norm": 0.18204925954341888, |
| "learning_rate": 0.0003453984384974055, |
| "loss": 0.7684, |
| "step": 1415 |
| }, |
| { |
| "epoch": 0.8401124094068925, |
| "grad_norm": 0.17773495614528656, |
| "learning_rate": 0.0003449016301473176, |
| "loss": 0.7772, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.8430705516935365, |
| "grad_norm": 0.18899886310100555, |
| "learning_rate": 0.00034440340842791135, |
| "loss": 0.7828, |
| "step": 1425 |
| }, |
| { |
| "epoch": 0.8460286939801804, |
| "grad_norm": 0.18704986572265625, |
| "learning_rate": 0.00034390377845918584, |
| "loss": 0.8089, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.8489868362668245, |
| "grad_norm": 0.17881925404071808, |
| "learning_rate": 0.000343402745375612, |
| "loss": 0.7797, |
| "step": 1435 |
| }, |
| { |
| "epoch": 0.8519449785534684, |
| "grad_norm": 0.1758805811405182, |
| "learning_rate": 0.00034290031432608007, |
| "loss": 0.7846, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.8549031208401124, |
| "grad_norm": 0.17261724174022675, |
| "learning_rate": 0.0003423964904738463, |
| "loss": 0.7786, |
| "step": 1445 |
| }, |
| { |
| "epoch": 0.8578612631267564, |
| "grad_norm": 0.18959110975265503, |
| "learning_rate": 0.0003418912789964804, |
| "loss": 0.7595, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.8578612631267564, |
| "eval_loss": 0.7294892072677612, |
| "eval_runtime": 15.0942, |
| "eval_samples_per_second": 429.966, |
| "eval_steps_per_second": 13.449, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.8608194054134004, |
| "grad_norm": 0.1834767460823059, |
| "learning_rate": 0.0003413846850858119, |
| "loss": 0.7906, |
| "step": 1455 |
| }, |
| { |
| "epoch": 0.8637775477000443, |
| "grad_norm": 0.18648895621299744, |
| "learning_rate": 0.00034087671394787716, |
| "loss": 0.7774, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.8667356899866884, |
| "grad_norm": 0.17823517322540283, |
| "learning_rate": 0.0003403673708028654, |
| "loss": 0.7572, |
| "step": 1465 |
| }, |
| { |
| "epoch": 0.8696938322733323, |
| "grad_norm": 0.17990326881408691, |
| "learning_rate": 0.0003398566608850657, |
| "loss": 0.7847, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.8726519745599763, |
| "grad_norm": 0.178693026304245, |
| "learning_rate": 0.0003393445894428125, |
| "loss": 0.7963, |
| "step": 1475 |
| }, |
| { |
| "epoch": 0.8756101168466203, |
| "grad_norm": 0.19564184546470642, |
| "learning_rate": 0.00033883116173843216, |
| "loss": 0.7752, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.8785682591332643, |
| "grad_norm": 0.1759771704673767, |
| "learning_rate": 0.0003383163830481888, |
| "loss": 0.7896, |
| "step": 1485 |
| }, |
| { |
| "epoch": 0.8815264014199083, |
| "grad_norm": 0.1835668683052063, |
| "learning_rate": 0.0003378002586622298, |
| "loss": 0.7933, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.8844845437065523, |
| "grad_norm": 0.17599444091320038, |
| "learning_rate": 0.000337282793884532, |
| "loss": 0.7527, |
| "step": 1495 |
| }, |
| { |
| "epoch": 0.8874426859931963, |
| "grad_norm": 0.1790158897638321, |
| "learning_rate": 0.00033676399403284645, |
| "loss": 0.7843, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.8874426859931963, |
| "eval_loss": 0.725857675075531, |
| "eval_runtime": 15.0739, |
| "eval_samples_per_second": 430.544, |
| "eval_steps_per_second": 13.467, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.8904008282798402, |
| "grad_norm": 0.1715373992919922, |
| "learning_rate": 0.0003362438644386444, |
| "loss": 0.7578, |
| "step": 1505 |
| }, |
| { |
| "epoch": 0.8933589705664843, |
| "grad_norm": 0.17853514850139618, |
| "learning_rate": 0.0003357224104470622, |
| "loss": 0.775, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.8963171128531282, |
| "grad_norm": 0.18070384860038757, |
| "learning_rate": 0.00033519963741684625, |
| "loss": 0.7762, |
| "step": 1515 |
| }, |
| { |
| "epoch": 0.8992752551397722, |
| "grad_norm": 0.1907287836074829, |
| "learning_rate": 0.0003346755507202985, |
| "loss": 0.7899, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.9022333974264162, |
| "grad_norm": 0.18904021382331848, |
| "learning_rate": 0.00033415015574322053, |
| "loss": 0.7556, |
| "step": 1525 |
| }, |
| { |
| "epoch": 0.9051915397130602, |
| "grad_norm": 0.1869528889656067, |
| "learning_rate": 0.0003336234578848587, |
| "loss": 0.7789, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.9081496819997041, |
| "grad_norm": 0.17394182085990906, |
| "learning_rate": 0.0003330954625578482, |
| "loss": 0.7585, |
| "step": 1535 |
| }, |
| { |
| "epoch": 0.9111078242863482, |
| "grad_norm": 0.1805906444787979, |
| "learning_rate": 0.0003325661751881582, |
| "loss": 0.7622, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.9140659665729921, |
| "grad_norm": 0.16848941147327423, |
| "learning_rate": 0.00033203560121503533, |
| "loss": 0.7532, |
| "step": 1545 |
| }, |
| { |
| "epoch": 0.9170241088596361, |
| "grad_norm": 0.17714829742908478, |
| "learning_rate": 0.00033150374609094795, |
| "loss": 0.7629, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.9170241088596361, |
| "eval_loss": 0.7223963141441345, |
| "eval_runtime": 15.1068, |
| "eval_samples_per_second": 429.607, |
| "eval_steps_per_second": 13.438, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.9199822511462802, |
| "grad_norm": 0.18904811143875122, |
| "learning_rate": 0.00033097061528153035, |
| "loss": 0.7632, |
| "step": 1555 |
| }, |
| { |
| "epoch": 0.9229403934329241, |
| "grad_norm": 0.1821032464504242, |
| "learning_rate": 0.0003304362142655266, |
| "loss": 0.7677, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.9258985357195681, |
| "grad_norm": 0.187586709856987, |
| "learning_rate": 0.0003299005485347338, |
| "loss": 0.7531, |
| "step": 1565 |
| }, |
| { |
| "epoch": 0.9288566780062121, |
| "grad_norm": 0.1836872398853302, |
| "learning_rate": 0.0003293636235939463, |
| "loss": 0.7557, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.9318148202928561, |
| "grad_norm": 0.19305482506752014, |
| "learning_rate": 0.0003288254449608985, |
| "loss": 0.7681, |
| "step": 1575 |
| }, |
| { |
| "epoch": 0.9347729625795, |
| "grad_norm": 0.19273991882801056, |
| "learning_rate": 0.00032828601816620856, |
| "loss": 0.7542, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.9377311048661441, |
| "grad_norm": 0.1794893592596054, |
| "learning_rate": 0.0003277453487533214, |
| "loss": 0.7647, |
| "step": 1585 |
| }, |
| { |
| "epoch": 0.940689247152788, |
| "grad_norm": 0.18218044936656952, |
| "learning_rate": 0.00032720344227845185, |
| "loss": 0.7698, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.943647389439432, |
| "grad_norm": 0.18507668375968933, |
| "learning_rate": 0.00032666030431052724, |
| "loss": 0.7766, |
| "step": 1595 |
| }, |
| { |
| "epoch": 0.946605531726076, |
| "grad_norm": 0.17978817224502563, |
| "learning_rate": 0.0003261159404311306, |
| "loss": 0.7462, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.946605531726076, |
| "eval_loss": 0.7182918190956116, |
| "eval_runtime": 15.1761, |
| "eval_samples_per_second": 427.645, |
| "eval_steps_per_second": 13.376, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.94956367401272, |
| "grad_norm": 0.17872752249240875, |
| "learning_rate": 0.00032557035623444316, |
| "loss": 0.746, |
| "step": 1605 |
| }, |
| { |
| "epoch": 0.952521816299364, |
| "grad_norm": 0.1889464557170868, |
| "learning_rate": 0.0003250235573271866, |
| "loss": 0.7755, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.955479958586008, |
| "grad_norm": 0.22346559166908264, |
| "learning_rate": 0.0003244755493285656, |
| "loss": 0.7584, |
| "step": 1615 |
| }, |
| { |
| "epoch": 0.958438100872652, |
| "grad_norm": 0.18715400993824005, |
| "learning_rate": 0.0003239263378702103, |
| "loss": 0.7626, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.961396243159296, |
| "grad_norm": 0.1809689849615097, |
| "learning_rate": 0.0003233759285961183, |
| "loss": 0.7626, |
| "step": 1625 |
| }, |
| { |
| "epoch": 0.96435438544594, |
| "grad_norm": 0.1916404366493225, |
| "learning_rate": 0.00032282432716259637, |
| "loss": 0.7633, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.9673125277325839, |
| "grad_norm": 0.18187373876571655, |
| "learning_rate": 0.00032227153923820276, |
| "loss": 0.7777, |
| "step": 1635 |
| }, |
| { |
| "epoch": 0.970270670019228, |
| "grad_norm": 0.18343503773212433, |
| "learning_rate": 0.00032171757050368857, |
| "loss": 0.7632, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.9732288123058719, |
| "grad_norm": 0.1849653571844101, |
| "learning_rate": 0.0003211624266519398, |
| "loss": 0.7727, |
| "step": 1645 |
| }, |
| { |
| "epoch": 0.9761869545925159, |
| "grad_norm": 0.1886414885520935, |
| "learning_rate": 0.00032060611338791833, |
| "loss": 0.7832, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.9761869545925159, |
| "eval_loss": 0.7173508405685425, |
| "eval_runtime": 15.0955, |
| "eval_samples_per_second": 429.929, |
| "eval_steps_per_second": 13.448, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.9791450968791598, |
| "grad_norm": 0.18768879771232605, |
| "learning_rate": 0.0003200486364286038, |
| "loss": 0.7744, |
| "step": 1655 |
| }, |
| { |
| "epoch": 0.9821032391658039, |
| "grad_norm": 0.17705638706684113, |
| "learning_rate": 0.0003194900015029344, |
| "loss": 0.752, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.9850613814524478, |
| "grad_norm": 0.1893269568681717, |
| "learning_rate": 0.0003189302143517484, |
| "loss": 0.7673, |
| "step": 1665 |
| }, |
| { |
| "epoch": 0.9880195237390919, |
| "grad_norm": 0.18785062432289124, |
| "learning_rate": 0.0003183692807277248, |
| "loss": 0.7712, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.9909776660257359, |
| "grad_norm": 0.1836850345134735, |
| "learning_rate": 0.0003178072063953245, |
| "loss": 0.7714, |
| "step": 1675 |
| }, |
| { |
| "epoch": 0.9939358083123798, |
| "grad_norm": 0.19294902682304382, |
| "learning_rate": 0.00031724399713073116, |
| "loss": 0.7482, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.9968939505990239, |
| "grad_norm": 0.17442071437835693, |
| "learning_rate": 0.00031667965872179103, |
| "loss": 0.7504, |
| "step": 1685 |
| }, |
| { |
| "epoch": 0.9998520928856678, |
| "grad_norm": 0.18082062900066376, |
| "learning_rate": 0.0003161141969679545, |
| "loss": 0.7581, |
| "step": 1690 |
| }, |
| { |
| "epoch": 1.0023665138293152, |
| "grad_norm": 0.1936347484588623, |
| "learning_rate": 0.0003155476176802161, |
| "loss": 0.7118, |
| "step": 1695 |
| }, |
| { |
| "epoch": 1.0053246561159592, |
| "grad_norm": 0.18891729414463043, |
| "learning_rate": 0.00031497992668105465, |
| "loss": 0.7042, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.0053246561159592, |
| "eval_loss": 0.7123794555664062, |
| "eval_runtime": 15.1307, |
| "eval_samples_per_second": 428.928, |
| "eval_steps_per_second": 13.416, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.0082827984026033, |
| "grad_norm": 0.18976199626922607, |
| "learning_rate": 0.0003144111298043734, |
| "loss": 0.7102, |
| "step": 1705 |
| }, |
| { |
| "epoch": 1.011240940689247, |
| "grad_norm": 0.18827883899211884, |
| "learning_rate": 0.00031384123289544027, |
| "loss": 0.7105, |
| "step": 1710 |
| }, |
| { |
| "epoch": 1.0141990829758911, |
| "grad_norm": 0.18360069394111633, |
| "learning_rate": 0.0003132702418108279, |
| "loss": 0.7162, |
| "step": 1715 |
| }, |
| { |
| "epoch": 1.0171572252625352, |
| "grad_norm": 0.1873406022787094, |
| "learning_rate": 0.00031269816241835305, |
| "loss": 0.6896, |
| "step": 1720 |
| }, |
| { |
| "epoch": 1.0201153675491792, |
| "grad_norm": 0.1977914422750473, |
| "learning_rate": 0.00031212500059701664, |
| "loss": 0.7108, |
| "step": 1725 |
| }, |
| { |
| "epoch": 1.023073509835823, |
| "grad_norm": 0.1831110417842865, |
| "learning_rate": 0.0003115507622369431, |
| "loss": 0.7104, |
| "step": 1730 |
| }, |
| { |
| "epoch": 1.026031652122467, |
| "grad_norm": 0.1879042387008667, |
| "learning_rate": 0.0003109754532393202, |
| "loss": 0.7231, |
| "step": 1735 |
| }, |
| { |
| "epoch": 1.028989794409111, |
| "grad_norm": 0.1994623839855194, |
| "learning_rate": 0.00031039907951633795, |
| "loss": 0.7083, |
| "step": 1740 |
| }, |
| { |
| "epoch": 1.0319479366957551, |
| "grad_norm": 0.19423262774944305, |
| "learning_rate": 0.0003098216469911281, |
| "loss": 0.701, |
| "step": 1745 |
| }, |
| { |
| "epoch": 1.034906078982399, |
| "grad_norm": 0.18887244164943695, |
| "learning_rate": 0.0003092431615977033, |
| "loss": 0.717, |
| "step": 1750 |
| }, |
| { |
| "epoch": 1.034906078982399, |
| "eval_loss": 0.7047431468963623, |
| "eval_runtime": 15.2085, |
| "eval_samples_per_second": 426.735, |
| "eval_steps_per_second": 13.348, |
| "step": 1750 |
| }, |
| { |
| "epoch": 1.037864221269043, |
| "grad_norm": 0.1960499882698059, |
| "learning_rate": 0.000308663629280896, |
| "loss": 0.7184, |
| "step": 1755 |
| }, |
| { |
| "epoch": 1.040822363555687, |
| "grad_norm": 0.18444953858852386, |
| "learning_rate": 0.0003080830559962974, |
| "loss": 0.7302, |
| "step": 1760 |
| }, |
| { |
| "epoch": 1.043780505842331, |
| "grad_norm": 0.19335564970970154, |
| "learning_rate": 0.00030750144771019635, |
| "loss": 0.7142, |
| "step": 1765 |
| }, |
| { |
| "epoch": 1.046738648128975, |
| "grad_norm": 0.199940025806427, |
| "learning_rate": 0.0003069188103995177, |
| "loss": 0.7266, |
| "step": 1770 |
| }, |
| { |
| "epoch": 1.049696790415619, |
| "grad_norm": 0.20113906264305115, |
| "learning_rate": 0.0003063351500517615, |
| "loss": 0.7256, |
| "step": 1775 |
| }, |
| { |
| "epoch": 1.052654932702263, |
| "grad_norm": 0.20398704707622528, |
| "learning_rate": 0.0003057504726649407, |
| "loss": 0.7133, |
| "step": 1780 |
| }, |
| { |
| "epoch": 1.055613074988907, |
| "grad_norm": 0.18939799070358276, |
| "learning_rate": 0.00030516478424752014, |
| "loss": 0.6997, |
| "step": 1785 |
| }, |
| { |
| "epoch": 1.058571217275551, |
| "grad_norm": 0.19960810244083405, |
| "learning_rate": 0.0003045780908183545, |
| "loss": 0.7186, |
| "step": 1790 |
| }, |
| { |
| "epoch": 1.0615293595621949, |
| "grad_norm": 0.19348271191120148, |
| "learning_rate": 0.00030399039840662645, |
| "loss": 0.7243, |
| "step": 1795 |
| }, |
| { |
| "epoch": 1.064487501848839, |
| "grad_norm": 0.191221222281456, |
| "learning_rate": 0.0003034017130517849, |
| "loss": 0.7194, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.064487501848839, |
| "eval_loss": 0.7023242115974426, |
| "eval_runtime": 15.108, |
| "eval_samples_per_second": 429.573, |
| "eval_steps_per_second": 13.437, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.067445644135483, |
| "grad_norm": 0.2013147622346878, |
| "learning_rate": 0.0003028120408034827, |
| "loss": 0.7141, |
| "step": 1805 |
| }, |
| { |
| "epoch": 1.070403786422127, |
| "grad_norm": 0.19821201264858246, |
| "learning_rate": 0.00030222138772151443, |
| "loss": 0.7151, |
| "step": 1810 |
| }, |
| { |
| "epoch": 1.073361928708771, |
| "grad_norm": 0.1936139017343521, |
| "learning_rate": 0.00030162975987575453, |
| "loss": 0.716, |
| "step": 1815 |
| }, |
| { |
| "epoch": 1.0763200709954148, |
| "grad_norm": 0.20006032288074493, |
| "learning_rate": 0.0003010371633460944, |
| "loss": 0.7168, |
| "step": 1820 |
| }, |
| { |
| "epoch": 1.0792782132820589, |
| "grad_norm": 0.1984449028968811, |
| "learning_rate": 0.0003004436042223803, |
| "loss": 0.7084, |
| "step": 1825 |
| }, |
| { |
| "epoch": 1.082236355568703, |
| "grad_norm": 0.18980693817138672, |
| "learning_rate": 0.0002998490886043505, |
| "loss": 0.7093, |
| "step": 1830 |
| }, |
| { |
| "epoch": 1.085194497855347, |
| "grad_norm": 0.19591650366783142, |
| "learning_rate": 0.000299253622601573, |
| "loss": 0.7083, |
| "step": 1835 |
| }, |
| { |
| "epoch": 1.0881526401419908, |
| "grad_norm": 0.1950058490037918, |
| "learning_rate": 0.00029865721233338213, |
| "loss": 0.7129, |
| "step": 1840 |
| }, |
| { |
| "epoch": 1.0911107824286348, |
| "grad_norm": 0.1933528035879135, |
| "learning_rate": 0.00029805986392881617, |
| "loss": 0.7183, |
| "step": 1845 |
| }, |
| { |
| "epoch": 1.0940689247152788, |
| "grad_norm": 0.18293854594230652, |
| "learning_rate": 0.00029746158352655434, |
| "loss": 0.7124, |
| "step": 1850 |
| }, |
| { |
| "epoch": 1.0940689247152788, |
| "eval_loss": 0.6965740323066711, |
| "eval_runtime": 15.1266, |
| "eval_samples_per_second": 429.045, |
| "eval_steps_per_second": 13.42, |
| "step": 1850 |
| }, |
| { |
| "epoch": 1.0970270670019229, |
| "grad_norm": 0.19370752573013306, |
| "learning_rate": 0.00029686237727485334, |
| "loss": 0.7028, |
| "step": 1855 |
| }, |
| { |
| "epoch": 1.0999852092885667, |
| "grad_norm": 0.196714848279953, |
| "learning_rate": 0.0002962622513314845, |
| "loss": 0.727, |
| "step": 1860 |
| }, |
| { |
| "epoch": 1.1029433515752107, |
| "grad_norm": 0.1929130107164383, |
| "learning_rate": 0.0002956612118636705, |
| "loss": 0.7109, |
| "step": 1865 |
| }, |
| { |
| "epoch": 1.1059014938618548, |
| "grad_norm": 0.2017488032579422, |
| "learning_rate": 0.00029505926504802175, |
| "loss": 0.7134, |
| "step": 1870 |
| }, |
| { |
| "epoch": 1.1088596361484988, |
| "grad_norm": 0.18689104914665222, |
| "learning_rate": 0.00029445641707047317, |
| "loss": 0.7072, |
| "step": 1875 |
| }, |
| { |
| "epoch": 1.1118177784351428, |
| "grad_norm": 0.21098212897777557, |
| "learning_rate": 0.0002938526741262204, |
| "loss": 0.7387, |
| "step": 1880 |
| }, |
| { |
| "epoch": 1.1147759207217867, |
| "grad_norm": 0.1986856907606125, |
| "learning_rate": 0.00029324804241965635, |
| "loss": 0.7228, |
| "step": 1885 |
| }, |
| { |
| "epoch": 1.1177340630084307, |
| "grad_norm": 0.194554403424263, |
| "learning_rate": 0.00029264252816430734, |
| "loss": 0.7213, |
| "step": 1890 |
| }, |
| { |
| "epoch": 1.1206922052950747, |
| "grad_norm": 0.1904379278421402, |
| "learning_rate": 0.00029203613758276915, |
| "loss": 0.6987, |
| "step": 1895 |
| }, |
| { |
| "epoch": 1.1236503475817188, |
| "grad_norm": 0.19453752040863037, |
| "learning_rate": 0.0002914288769066432, |
| "loss": 0.7011, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.1236503475817188, |
| "eval_loss": 0.6925638318061829, |
| "eval_runtime": 15.1122, |
| "eval_samples_per_second": 429.455, |
| "eval_steps_per_second": 13.433, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.1266084898683626, |
| "grad_norm": 0.1970696598291397, |
| "learning_rate": 0.00029082075237647266, |
| "loss": 0.7354, |
| "step": 1905 |
| }, |
| { |
| "epoch": 1.1295666321550066, |
| "grad_norm": 0.19833780825138092, |
| "learning_rate": 0.00029021177024167775, |
| "loss": 0.7154, |
| "step": 1910 |
| }, |
| { |
| "epoch": 1.1325247744416507, |
| "grad_norm": 0.1916065663099289, |
| "learning_rate": 0.00028960193676049226, |
| "loss": 0.7172, |
| "step": 1915 |
| }, |
| { |
| "epoch": 1.1354829167282947, |
| "grad_norm": 0.1968769133090973, |
| "learning_rate": 0.00028899125819989874, |
| "loss": 0.723, |
| "step": 1920 |
| }, |
| { |
| "epoch": 1.1384410590149385, |
| "grad_norm": 0.1897817701101303, |
| "learning_rate": 0.0002883797408355643, |
| "loss": 0.7071, |
| "step": 1925 |
| }, |
| { |
| "epoch": 1.1413992013015826, |
| "grad_norm": 0.19858099520206451, |
| "learning_rate": 0.00028776739095177597, |
| "loss": 0.7097, |
| "step": 1930 |
| }, |
| { |
| "epoch": 1.1443573435882266, |
| "grad_norm": 0.19754937291145325, |
| "learning_rate": 0.0002871542148413762, |
| "loss": 0.7043, |
| "step": 1935 |
| }, |
| { |
| "epoch": 1.1473154858748706, |
| "grad_norm": 0.20758198201656342, |
| "learning_rate": 0.00028654021880569834, |
| "loss": 0.7158, |
| "step": 1940 |
| }, |
| { |
| "epoch": 1.1502736281615147, |
| "grad_norm": 0.19415895640850067, |
| "learning_rate": 0.00028592540915450154, |
| "loss": 0.7079, |
| "step": 1945 |
| }, |
| { |
| "epoch": 1.1532317704481585, |
| "grad_norm": 0.19232836365699768, |
| "learning_rate": 0.0002853097922059063, |
| "loss": 0.7033, |
| "step": 1950 |
| }, |
| { |
| "epoch": 1.1532317704481585, |
| "eval_loss": 0.6914330124855042, |
| "eval_runtime": 15.1002, |
| "eval_samples_per_second": 429.796, |
| "eval_steps_per_second": 13.444, |
| "step": 1950 |
| }, |
| { |
| "epoch": 1.1561899127348025, |
| "grad_norm": 0.19998547434806824, |
| "learning_rate": 0.0002846933742863292, |
| "loss": 0.7044, |
| "step": 1955 |
| }, |
| { |
| "epoch": 1.1591480550214466, |
| "grad_norm": 0.19473986327648163, |
| "learning_rate": 0.0002840761617304181, |
| "loss": 0.6919, |
| "step": 1960 |
| }, |
| { |
| "epoch": 1.1621061973080906, |
| "grad_norm": 0.19846412539482117, |
| "learning_rate": 0.00028345816088098697, |
| "loss": 0.7021, |
| "step": 1965 |
| }, |
| { |
| "epoch": 1.1650643395947344, |
| "grad_norm": 0.19736585021018982, |
| "learning_rate": 0.0002828393780889508, |
| "loss": 0.7163, |
| "step": 1970 |
| }, |
| { |
| "epoch": 1.1680224818813785, |
| "grad_norm": 0.19072949886322021, |
| "learning_rate": 0.00028221981971326005, |
| "loss": 0.7155, |
| "step": 1975 |
| }, |
| { |
| "epoch": 1.1709806241680225, |
| "grad_norm": 0.20604072511196136, |
| "learning_rate": 0.0002815994921208358, |
| "loss": 0.7092, |
| "step": 1980 |
| }, |
| { |
| "epoch": 1.1739387664546665, |
| "grad_norm": 0.19358354806900024, |
| "learning_rate": 0.0002809784016865036, |
| "loss": 0.7027, |
| "step": 1985 |
| }, |
| { |
| "epoch": 1.1768969087413104, |
| "grad_norm": 0.1930014044046402, |
| "learning_rate": 0.00028035655479292877, |
| "loss": 0.7109, |
| "step": 1990 |
| }, |
| { |
| "epoch": 1.1798550510279544, |
| "grad_norm": 0.1993882954120636, |
| "learning_rate": 0.0002797339578305503, |
| "loss": 0.7035, |
| "step": 1995 |
| }, |
| { |
| "epoch": 1.1828131933145984, |
| "grad_norm": 0.20162735879421234, |
| "learning_rate": 0.00027911061719751516, |
| "loss": 0.7044, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.1828131933145984, |
| "eval_loss": 0.6860821843147278, |
| "eval_runtime": 15.1933, |
| "eval_samples_per_second": 427.162, |
| "eval_steps_per_second": 13.361, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.1857713356012425, |
| "grad_norm": 0.19949831068515778, |
| "learning_rate": 0.00027848653929961293, |
| "loss": 0.7081, |
| "step": 2005 |
| }, |
| { |
| "epoch": 1.1887294778878865, |
| "grad_norm": 0.1930517703294754, |
| "learning_rate": 0.0002778617305502096, |
| "loss": 0.7038, |
| "step": 2010 |
| }, |
| { |
| "epoch": 1.1916876201745303, |
| "grad_norm": 0.2007024586200714, |
| "learning_rate": 0.0002772361973701816, |
| "loss": 0.7016, |
| "step": 2015 |
| }, |
| { |
| "epoch": 1.1946457624611744, |
| "grad_norm": 0.2045419067144394, |
| "learning_rate": 0.00027660994618785044, |
| "loss": 0.7079, |
| "step": 2020 |
| }, |
| { |
| "epoch": 1.1976039047478184, |
| "grad_norm": 0.2049221694469452, |
| "learning_rate": 0.0002759829834389157, |
| "loss": 0.7031, |
| "step": 2025 |
| }, |
| { |
| "epoch": 1.2005620470344625, |
| "grad_norm": 0.19344255328178406, |
| "learning_rate": 0.00027535531556638994, |
| "loss": 0.7207, |
| "step": 2030 |
| }, |
| { |
| "epoch": 1.2035201893211063, |
| "grad_norm": 0.19495131075382233, |
| "learning_rate": 0.0002747269490205315, |
| "loss": 0.7128, |
| "step": 2035 |
| }, |
| { |
| "epoch": 1.2064783316077503, |
| "grad_norm": 0.1935647428035736, |
| "learning_rate": 0.00027409789025877897, |
| "loss": 0.701, |
| "step": 2040 |
| }, |
| { |
| "epoch": 1.2094364738943943, |
| "grad_norm": 0.2149600386619568, |
| "learning_rate": 0.0002734681457456843, |
| "loss": 0.7072, |
| "step": 2045 |
| }, |
| { |
| "epoch": 1.2123946161810384, |
| "grad_norm": 0.1978955715894699, |
| "learning_rate": 0.0002728377219528468, |
| "loss": 0.7042, |
| "step": 2050 |
| }, |
| { |
| "epoch": 1.2123946161810384, |
| "eval_loss": 0.6831551790237427, |
| "eval_runtime": 15.2782, |
| "eval_samples_per_second": 424.787, |
| "eval_steps_per_second": 13.287, |
| "step": 2050 |
| }, |
| { |
| "epoch": 1.2153527584676822, |
| "grad_norm": 0.1999792903661728, |
| "learning_rate": 0.00027220662535884635, |
| "loss": 0.7239, |
| "step": 2055 |
| }, |
| { |
| "epoch": 1.2183109007543262, |
| "grad_norm": 0.19362211227416992, |
| "learning_rate": 0.00027157486244917687, |
| "loss": 0.7149, |
| "step": 2060 |
| }, |
| { |
| "epoch": 1.2212690430409703, |
| "grad_norm": 0.200283482670784, |
| "learning_rate": 0.0002709424397161798, |
| "loss": 0.7277, |
| "step": 2065 |
| }, |
| { |
| "epoch": 1.2242271853276143, |
| "grad_norm": 0.2076682150363922, |
| "learning_rate": 0.00027030936365897705, |
| "loss": 0.714, |
| "step": 2070 |
| }, |
| { |
| "epoch": 1.2271853276142584, |
| "grad_norm": 0.20363134145736694, |
| "learning_rate": 0.00026967564078340483, |
| "loss": 0.7328, |
| "step": 2075 |
| }, |
| { |
| "epoch": 1.2301434699009022, |
| "grad_norm": 0.19987626373767853, |
| "learning_rate": 0.000269041277601946, |
| "loss": 0.7092, |
| "step": 2080 |
| }, |
| { |
| "epoch": 1.2331016121875462, |
| "grad_norm": 0.20434942841529846, |
| "learning_rate": 0.0002684062806336639, |
| "loss": 0.7149, |
| "step": 2085 |
| }, |
| { |
| "epoch": 1.2360597544741903, |
| "grad_norm": 0.21225950121879578, |
| "learning_rate": 0.0002677706564041348, |
| "loss": 0.6893, |
| "step": 2090 |
| }, |
| { |
| "epoch": 1.2390178967608343, |
| "grad_norm": 0.20148411393165588, |
| "learning_rate": 0.00026713441144538106, |
| "loss": 0.703, |
| "step": 2095 |
| }, |
| { |
| "epoch": 1.241976039047478, |
| "grad_norm": 0.20638014376163483, |
| "learning_rate": 0.0002664975522958041, |
| "loss": 0.6961, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.241976039047478, |
| "eval_loss": 0.6792827248573303, |
| "eval_runtime": 15.2265, |
| "eval_samples_per_second": 426.23, |
| "eval_steps_per_second": 13.332, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.2449341813341221, |
| "grad_norm": 0.20307990908622742, |
| "learning_rate": 0.000265860085500117, |
| "loss": 0.7301, |
| "step": 2105 |
| }, |
| { |
| "epoch": 1.2478923236207662, |
| "grad_norm": 0.19271095097064972, |
| "learning_rate": 0.0002652220176092775, |
| "loss": 0.7224, |
| "step": 2110 |
| }, |
| { |
| "epoch": 1.2508504659074102, |
| "grad_norm": 0.19397003948688507, |
| "learning_rate": 0.0002645833551804202, |
| "loss": 0.7044, |
| "step": 2115 |
| }, |
| { |
| "epoch": 1.253808608194054, |
| "grad_norm": 0.1992734968662262, |
| "learning_rate": 0.0002639441047767899, |
| "loss": 0.7267, |
| "step": 2120 |
| }, |
| { |
| "epoch": 1.256766750480698, |
| "grad_norm": 0.19389532506465912, |
| "learning_rate": 0.0002633042729676735, |
| "loss": 0.7022, |
| "step": 2125 |
| }, |
| { |
| "epoch": 1.2597248927673421, |
| "grad_norm": 0.21384213864803314, |
| "learning_rate": 0.00026266386632833275, |
| "loss": 0.689, |
| "step": 2130 |
| }, |
| { |
| "epoch": 1.2626830350539862, |
| "grad_norm": 0.20540039241313934, |
| "learning_rate": 0.0002620228914399368, |
| "loss": 0.6929, |
| "step": 2135 |
| }, |
| { |
| "epoch": 1.2656411773406302, |
| "grad_norm": 0.19188985228538513, |
| "learning_rate": 0.0002613813548894943, |
| "loss": 0.7023, |
| "step": 2140 |
| }, |
| { |
| "epoch": 1.268599319627274, |
| "grad_norm": 0.21049626171588898, |
| "learning_rate": 0.00026073926326978587, |
| "loss": 0.6864, |
| "step": 2145 |
| }, |
| { |
| "epoch": 1.271557461913918, |
| "grad_norm": 0.20232687890529633, |
| "learning_rate": 0.0002600966231792964, |
| "loss": 0.7041, |
| "step": 2150 |
| }, |
| { |
| "epoch": 1.271557461913918, |
| "eval_loss": 0.6756435632705688, |
| "eval_runtime": 15.2077, |
| "eval_samples_per_second": 426.757, |
| "eval_steps_per_second": 13.348, |
| "step": 2150 |
| }, |
| { |
| "epoch": 1.274515604200562, |
| "grad_norm": 0.20849832892417908, |
| "learning_rate": 0.0002594534412221472, |
| "loss": 0.7097, |
| "step": 2155 |
| }, |
| { |
| "epoch": 1.277473746487206, |
| "grad_norm": 0.2087436467409134, |
| "learning_rate": 0.0002588097240080279, |
| "loss": 0.708, |
| "step": 2160 |
| }, |
| { |
| "epoch": 1.28043188877385, |
| "grad_norm": 0.2046281397342682, |
| "learning_rate": 0.00025816547815212887, |
| "loss": 0.6973, |
| "step": 2165 |
| }, |
| { |
| "epoch": 1.283390031060494, |
| "grad_norm": 0.19384223222732544, |
| "learning_rate": 0.00025752071027507315, |
| "loss": 0.7067, |
| "step": 2170 |
| }, |
| { |
| "epoch": 1.286348173347138, |
| "grad_norm": 0.20464631915092468, |
| "learning_rate": 0.00025687542700284817, |
| "loss": 0.7178, |
| "step": 2175 |
| }, |
| { |
| "epoch": 1.289306315633782, |
| "grad_norm": 0.20493179559707642, |
| "learning_rate": 0.000256229634966738, |
| "loss": 0.6897, |
| "step": 2180 |
| }, |
| { |
| "epoch": 1.2922644579204259, |
| "grad_norm": 0.20845326781272888, |
| "learning_rate": 0.000255583340803255, |
| "loss": 0.717, |
| "step": 2185 |
| }, |
| { |
| "epoch": 1.29522260020707, |
| "grad_norm": 0.20893819630146027, |
| "learning_rate": 0.00025493655115407164, |
| "loss": 0.7003, |
| "step": 2190 |
| }, |
| { |
| "epoch": 1.298180742493714, |
| "grad_norm": 0.2201833724975586, |
| "learning_rate": 0.0002542892726659523, |
| "loss": 0.7106, |
| "step": 2195 |
| }, |
| { |
| "epoch": 1.301138884780358, |
| "grad_norm": 0.19646216928958893, |
| "learning_rate": 0.000253641511990685, |
| "loss": 0.688, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.301138884780358, |
| "eval_loss": 0.6724188923835754, |
| "eval_runtime": 15.2418, |
| "eval_samples_per_second": 425.804, |
| "eval_steps_per_second": 13.319, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.304097027067002, |
| "grad_norm": 0.2002546191215515, |
| "learning_rate": 0.00025299327578501274, |
| "loss": 0.6972, |
| "step": 2205 |
| }, |
| { |
| "epoch": 1.3070551693536459, |
| "grad_norm": 0.1986691802740097, |
| "learning_rate": 0.0002523445707105656, |
| "loss": 0.726, |
| "step": 2210 |
| }, |
| { |
| "epoch": 1.31001331164029, |
| "grad_norm": 0.20118148624897003, |
| "learning_rate": 0.00025169540343379193, |
| "loss": 0.696, |
| "step": 2215 |
| }, |
| { |
| "epoch": 1.312971453926934, |
| "grad_norm": 0.20996816456317902, |
| "learning_rate": 0.0002510457806258898, |
| "loss": 0.7079, |
| "step": 2220 |
| }, |
| { |
| "epoch": 1.3159295962135777, |
| "grad_norm": 0.20342901349067688, |
| "learning_rate": 0.0002503957089627388, |
| "loss": 0.708, |
| "step": 2225 |
| }, |
| { |
| "epoch": 1.3188877385002218, |
| "grad_norm": 0.20094621181488037, |
| "learning_rate": 0.000249745195124831, |
| "loss": 0.7225, |
| "step": 2230 |
| }, |
| { |
| "epoch": 1.3218458807868658, |
| "grad_norm": 0.20358270406723022, |
| "learning_rate": 0.0002490942457972025, |
| "loss": 0.7048, |
| "step": 2235 |
| }, |
| { |
| "epoch": 1.3248040230735099, |
| "grad_norm": 0.19839729368686676, |
| "learning_rate": 0.00024844286766936504, |
| "loss": 0.6953, |
| "step": 2240 |
| }, |
| { |
| "epoch": 1.327762165360154, |
| "grad_norm": 0.19929233193397522, |
| "learning_rate": 0.00024779106743523646, |
| "loss": 0.7127, |
| "step": 2245 |
| }, |
| { |
| "epoch": 1.3307203076467977, |
| "grad_norm": 0.19684284925460815, |
| "learning_rate": 0.0002471388517930727, |
| "loss": 0.7118, |
| "step": 2250 |
| }, |
| { |
| "epoch": 1.3307203076467977, |
| "eval_loss": 0.6681681871414185, |
| "eval_runtime": 15.1564, |
| "eval_samples_per_second": 428.202, |
| "eval_steps_per_second": 13.394, |
| "step": 2250 |
| }, |
| { |
| "epoch": 1.3336784499334418, |
| "grad_norm": 0.20598828792572021, |
| "learning_rate": 0.00024648622744539864, |
| "loss": 0.7046, |
| "step": 2255 |
| }, |
| { |
| "epoch": 1.3366365922200858, |
| "grad_norm": 0.20147842168807983, |
| "learning_rate": 0.0002458332010989393, |
| "loss": 0.7151, |
| "step": 2260 |
| }, |
| { |
| "epoch": 1.3395947345067298, |
| "grad_norm": 0.20049843192100525, |
| "learning_rate": 0.00024517977946455057, |
| "loss": 0.7203, |
| "step": 2265 |
| }, |
| { |
| "epoch": 1.3425528767933739, |
| "grad_norm": 0.19043414294719696, |
| "learning_rate": 0.00024452596925715093, |
| "loss": 0.7122, |
| "step": 2270 |
| }, |
| { |
| "epoch": 1.3455110190800177, |
| "grad_norm": 0.20228472352027893, |
| "learning_rate": 0.00024387177719565164, |
| "loss": 0.7079, |
| "step": 2275 |
| }, |
| { |
| "epoch": 1.3484691613666617, |
| "grad_norm": 0.2019595056772232, |
| "learning_rate": 0.00024321721000288845, |
| "loss": 0.6854, |
| "step": 2280 |
| }, |
| { |
| "epoch": 1.3514273036533058, |
| "grad_norm": 0.19533614814281464, |
| "learning_rate": 0.000242562274405552, |
| "loss": 0.7039, |
| "step": 2285 |
| }, |
| { |
| "epoch": 1.3543854459399496, |
| "grad_norm": 0.20636282861232758, |
| "learning_rate": 0.00024190697713411885, |
| "loss": 0.6876, |
| "step": 2290 |
| }, |
| { |
| "epoch": 1.3573435882265936, |
| "grad_norm": 0.1991133838891983, |
| "learning_rate": 0.00024125132492278244, |
| "loss": 0.6944, |
| "step": 2295 |
| }, |
| { |
| "epoch": 1.3603017305132377, |
| "grad_norm": 0.2027919590473175, |
| "learning_rate": 0.00024059532450938358, |
| "loss": 0.7037, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.3603017305132377, |
| "eval_loss": 0.664495587348938, |
| "eval_runtime": 15.2359, |
| "eval_samples_per_second": 425.967, |
| "eval_steps_per_second": 13.324, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.3632598727998817, |
| "grad_norm": 0.2081235647201538, |
| "learning_rate": 0.0002399389826353415, |
| "loss": 0.7097, |
| "step": 2305 |
| }, |
| { |
| "epoch": 1.3662180150865257, |
| "grad_norm": 0.20397891104221344, |
| "learning_rate": 0.0002392823060455845, |
| "loss": 0.6983, |
| "step": 2310 |
| }, |
| { |
| "epoch": 1.3691761573731696, |
| "grad_norm": 0.19654420018196106, |
| "learning_rate": 0.00023862530148848052, |
| "loss": 0.7147, |
| "step": 2315 |
| }, |
| { |
| "epoch": 1.3721342996598136, |
| "grad_norm": 0.21511806547641754, |
| "learning_rate": 0.000237967975715768, |
| "loss": 0.7028, |
| "step": 2320 |
| }, |
| { |
| "epoch": 1.3750924419464576, |
| "grad_norm": 0.19977550208568573, |
| "learning_rate": 0.00023731033548248618, |
| "loss": 0.7037, |
| "step": 2325 |
| }, |
| { |
| "epoch": 1.3780505842331017, |
| "grad_norm": 0.20144343376159668, |
| "learning_rate": 0.00023665238754690604, |
| "loss": 0.6902, |
| "step": 2330 |
| }, |
| { |
| "epoch": 1.3810087265197457, |
| "grad_norm": 0.20750263333320618, |
| "learning_rate": 0.00023599413867046056, |
| "loss": 0.6967, |
| "step": 2335 |
| }, |
| { |
| "epoch": 1.3839668688063895, |
| "grad_norm": 0.20630352199077606, |
| "learning_rate": 0.0002353355956176755, |
| "loss": 0.7054, |
| "step": 2340 |
| }, |
| { |
| "epoch": 1.3869250110930336, |
| "grad_norm": 0.19819048047065735, |
| "learning_rate": 0.0002346767651560995, |
| "loss": 0.6915, |
| "step": 2345 |
| }, |
| { |
| "epoch": 1.3898831533796776, |
| "grad_norm": 0.19468337297439575, |
| "learning_rate": 0.00023401765405623495, |
| "loss": 0.687, |
| "step": 2350 |
| }, |
| { |
| "epoch": 1.3898831533796776, |
| "eval_loss": 0.6600573658943176, |
| "eval_runtime": 15.1998, |
| "eval_samples_per_second": 426.98, |
| "eval_steps_per_second": 13.355, |
| "step": 2350 |
| }, |
| { |
| "epoch": 1.3928412956663214, |
| "grad_norm": 0.2120439112186432, |
| "learning_rate": 0.00023335826909146824, |
| "loss": 0.6855, |
| "step": 2355 |
| }, |
| { |
| "epoch": 1.3957994379529655, |
| "grad_norm": 0.19401684403419495, |
| "learning_rate": 0.0002326986170380001, |
| "loss": 0.6898, |
| "step": 2360 |
| }, |
| { |
| "epoch": 1.3987575802396095, |
| "grad_norm": 0.20173484086990356, |
| "learning_rate": 0.0002320387046747759, |
| "loss": 0.7154, |
| "step": 2365 |
| }, |
| { |
| "epoch": 1.4017157225262535, |
| "grad_norm": 0.19973625242710114, |
| "learning_rate": 0.00023137853878341628, |
| "loss": 0.7032, |
| "step": 2370 |
| }, |
| { |
| "epoch": 1.4046738648128976, |
| "grad_norm": 0.19550538063049316, |
| "learning_rate": 0.00023071812614814722, |
| "loss": 0.7068, |
| "step": 2375 |
| }, |
| { |
| "epoch": 1.4076320070995414, |
| "grad_norm": 0.19463753700256348, |
| "learning_rate": 0.00023005747355573026, |
| "loss": 0.6961, |
| "step": 2380 |
| }, |
| { |
| "epoch": 1.4105901493861854, |
| "grad_norm": 0.19839531183242798, |
| "learning_rate": 0.00022939658779539304, |
| "loss": 0.6912, |
| "step": 2385 |
| }, |
| { |
| "epoch": 1.4135482916728295, |
| "grad_norm": 0.20887915790081024, |
| "learning_rate": 0.00022873547565875927, |
| "loss": 0.7069, |
| "step": 2390 |
| }, |
| { |
| "epoch": 1.4165064339594735, |
| "grad_norm": 0.20123013854026794, |
| "learning_rate": 0.00022807414393977905, |
| "loss": 0.6805, |
| "step": 2395 |
| }, |
| { |
| "epoch": 1.4194645762461175, |
| "grad_norm": 0.2022971361875534, |
| "learning_rate": 0.00022741259943465894, |
| "loss": 0.6999, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.4194645762461175, |
| "eval_loss": 0.6564731001853943, |
| "eval_runtime": 15.2607, |
| "eval_samples_per_second": 425.276, |
| "eval_steps_per_second": 13.302, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.4224227185327614, |
| "grad_norm": 0.2049475908279419, |
| "learning_rate": 0.00022675084894179244, |
| "loss": 0.6881, |
| "step": 2405 |
| }, |
| { |
| "epoch": 1.4253808608194054, |
| "grad_norm": 0.20038989186286926, |
| "learning_rate": 0.00022608889926168958, |
| "loss": 0.709, |
| "step": 2410 |
| }, |
| { |
| "epoch": 1.4283390031060494, |
| "grad_norm": 0.20032629370689392, |
| "learning_rate": 0.00022542675719690753, |
| "loss": 0.7014, |
| "step": 2415 |
| }, |
| { |
| "epoch": 1.4312971453926933, |
| "grad_norm": 0.19591467082500458, |
| "learning_rate": 0.00022476442955198057, |
| "loss": 0.6995, |
| "step": 2420 |
| }, |
| { |
| "epoch": 1.4342552876793373, |
| "grad_norm": 0.2085343301296234, |
| "learning_rate": 0.0002241019231333499, |
| "loss": 0.6962, |
| "step": 2425 |
| }, |
| { |
| "epoch": 1.4372134299659813, |
| "grad_norm": 0.20096167922019958, |
| "learning_rate": 0.00022343924474929415, |
| "loss": 0.6961, |
| "step": 2430 |
| }, |
| { |
| "epoch": 1.4401715722526254, |
| "grad_norm": 0.20498406887054443, |
| "learning_rate": 0.0002227764012098589, |
| "loss": 0.6975, |
| "step": 2435 |
| }, |
| { |
| "epoch": 1.4431297145392694, |
| "grad_norm": 0.20352709293365479, |
| "learning_rate": 0.00022211339932678715, |
| "loss": 0.7016, |
| "step": 2440 |
| }, |
| { |
| "epoch": 1.4460878568259132, |
| "grad_norm": 0.200238898396492, |
| "learning_rate": 0.00022145024591344904, |
| "loss": 0.6754, |
| "step": 2445 |
| }, |
| { |
| "epoch": 1.4490459991125573, |
| "grad_norm": 0.20030836760997772, |
| "learning_rate": 0.0002207869477847719, |
| "loss": 0.6945, |
| "step": 2450 |
| }, |
| { |
| "epoch": 1.4490459991125573, |
| "eval_loss": 0.6526739001274109, |
| "eval_runtime": 15.2028, |
| "eval_samples_per_second": 426.896, |
| "eval_steps_per_second": 13.353, |
| "step": 2450 |
| }, |
| { |
| "epoch": 1.4520041413992013, |
| "grad_norm": 0.20762254297733307, |
| "learning_rate": 0.00022012351175717035, |
| "loss": 0.6892, |
| "step": 2455 |
| }, |
| { |
| "epoch": 1.4549622836858453, |
| "grad_norm": 0.19851386547088623, |
| "learning_rate": 0.000219459944648476, |
| "loss": 0.6946, |
| "step": 2460 |
| }, |
| { |
| "epoch": 1.4579204259724894, |
| "grad_norm": 0.20084446668624878, |
| "learning_rate": 0.0002187962532778676, |
| "loss": 0.6855, |
| "step": 2465 |
| }, |
| { |
| "epoch": 1.4608785682591332, |
| "grad_norm": 0.19501332938671112, |
| "learning_rate": 0.0002181324444658008, |
| "loss": 0.7057, |
| "step": 2470 |
| }, |
| { |
| "epoch": 1.4638367105457772, |
| "grad_norm": 0.19630266726016998, |
| "learning_rate": 0.0002174685250339383, |
| "loss": 0.6885, |
| "step": 2475 |
| }, |
| { |
| "epoch": 1.4667948528324213, |
| "grad_norm": 0.19841867685317993, |
| "learning_rate": 0.0002168045018050794, |
| "loss": 0.6906, |
| "step": 2480 |
| }, |
| { |
| "epoch": 1.469752995119065, |
| "grad_norm": 0.2073049694299698, |
| "learning_rate": 0.0002161403816030902, |
| "loss": 0.686, |
| "step": 2485 |
| }, |
| { |
| "epoch": 1.4727111374057091, |
| "grad_norm": 0.20695935189723969, |
| "learning_rate": 0.00021547617125283332, |
| "loss": 0.6919, |
| "step": 2490 |
| }, |
| { |
| "epoch": 1.4756692796923532, |
| "grad_norm": 0.21272872388362885, |
| "learning_rate": 0.00021481187758009784, |
| "loss": 0.6954, |
| "step": 2495 |
| }, |
| { |
| "epoch": 1.4786274219789972, |
| "grad_norm": 0.20142289996147156, |
| "learning_rate": 0.00021414750741152895, |
| "loss": 0.6728, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.4786274219789972, |
| "eval_loss": 0.650132417678833, |
| "eval_runtime": 15.1925, |
| "eval_samples_per_second": 427.184, |
| "eval_steps_per_second": 13.362, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.4815855642656413, |
| "grad_norm": 0.19570617377758026, |
| "learning_rate": 0.0002134830675745581, |
| "loss": 0.6891, |
| "step": 2505 |
| }, |
| { |
| "epoch": 1.484543706552285, |
| "grad_norm": 0.22009368240833282, |
| "learning_rate": 0.00021281856489733261, |
| "loss": 0.6923, |
| "step": 2510 |
| }, |
| { |
| "epoch": 1.487501848838929, |
| "grad_norm": 0.20262496173381805, |
| "learning_rate": 0.00021215400620864575, |
| "loss": 0.6902, |
| "step": 2515 |
| }, |
| { |
| "epoch": 1.4904599911255731, |
| "grad_norm": 0.20191776752471924, |
| "learning_rate": 0.00021148939833786617, |
| "loss": 0.6916, |
| "step": 2520 |
| }, |
| { |
| "epoch": 1.4934181334122172, |
| "grad_norm": 0.20352528989315033, |
| "learning_rate": 0.00021082474811486804, |
| "loss": 0.6995, |
| "step": 2525 |
| }, |
| { |
| "epoch": 1.4963762756988612, |
| "grad_norm": 0.19871221482753754, |
| "learning_rate": 0.00021016006236996074, |
| "loss": 0.706, |
| "step": 2530 |
| }, |
| { |
| "epoch": 1.499334417985505, |
| "grad_norm": 0.2028975784778595, |
| "learning_rate": 0.00020949534793381877, |
| "loss": 0.6801, |
| "step": 2535 |
| }, |
| { |
| "epoch": 1.502292560272149, |
| "grad_norm": 0.20480629801750183, |
| "learning_rate": 0.00020883061163741142, |
| "loss": 0.7002, |
| "step": 2540 |
| }, |
| { |
| "epoch": 1.5052507025587931, |
| "grad_norm": 0.20334535837173462, |
| "learning_rate": 0.00020816586031193254, |
| "loss": 0.6992, |
| "step": 2545 |
| }, |
| { |
| "epoch": 1.508208844845437, |
| "grad_norm": 0.1922510415315628, |
| "learning_rate": 0.00020750110078873057, |
| "loss": 0.69, |
| "step": 2550 |
| }, |
| { |
| "epoch": 1.508208844845437, |
| "eval_loss": 0.6458378434181213, |
| "eval_runtime": 15.156, |
| "eval_samples_per_second": 428.213, |
| "eval_steps_per_second": 13.394, |
| "step": 2550 |
| }, |
| { |
| "epoch": 1.5111669871320812, |
| "grad_norm": 0.18962906301021576, |
| "learning_rate": 0.0002068363398992382, |
| "loss": 0.6879, |
| "step": 2555 |
| }, |
| { |
| "epoch": 1.514125129418725, |
| "grad_norm": 0.19932591915130615, |
| "learning_rate": 0.000206171584474902, |
| "loss": 0.6799, |
| "step": 2560 |
| }, |
| { |
| "epoch": 1.517083271705369, |
| "grad_norm": 0.20362286269664764, |
| "learning_rate": 0.00020550684134711252, |
| "loss": 0.689, |
| "step": 2565 |
| }, |
| { |
| "epoch": 1.520041413992013, |
| "grad_norm": 0.20676745474338531, |
| "learning_rate": 0.00020484211734713388, |
| "loss": 0.7185, |
| "step": 2570 |
| }, |
| { |
| "epoch": 1.522999556278657, |
| "grad_norm": 0.21269002556800842, |
| "learning_rate": 0.00020417741930603376, |
| "loss": 0.6852, |
| "step": 2575 |
| }, |
| { |
| "epoch": 1.525957698565301, |
| "grad_norm": 0.2073538452386856, |
| "learning_rate": 0.00020351275405461282, |
| "loss": 0.7067, |
| "step": 2580 |
| }, |
| { |
| "epoch": 1.528915840851945, |
| "grad_norm": 0.1996561586856842, |
| "learning_rate": 0.00020284812842333495, |
| "loss": 0.6647, |
| "step": 2585 |
| }, |
| { |
| "epoch": 1.5318739831385888, |
| "grad_norm": 0.20402218401432037, |
| "learning_rate": 0.00020218354924225683, |
| "loss": 0.6934, |
| "step": 2590 |
| }, |
| { |
| "epoch": 1.534832125425233, |
| "grad_norm": 0.21136872470378876, |
| "learning_rate": 0.00020151902334095785, |
| "loss": 0.6849, |
| "step": 2595 |
| }, |
| { |
| "epoch": 1.5377902677118769, |
| "grad_norm": 0.20921598374843597, |
| "learning_rate": 0.00020085455754846975, |
| "loss": 0.6916, |
| "step": 2600 |
| }, |
| { |
| "epoch": 1.5377902677118769, |
| "eval_loss": 0.6419690847396851, |
| "eval_runtime": 15.1809, |
| "eval_samples_per_second": 427.511, |
| "eval_steps_per_second": 13.372, |
| "step": 2600 |
| }, |
| { |
| "epoch": 1.540748409998521, |
| "grad_norm": 0.1944100558757782, |
| "learning_rate": 0.00020019015869320663, |
| "loss": 0.7003, |
| "step": 2605 |
| }, |
| { |
| "epoch": 1.543706552285165, |
| "grad_norm": 0.19190169870853424, |
| "learning_rate": 0.00019952583360289473, |
| "loss": 0.7068, |
| "step": 2610 |
| }, |
| { |
| "epoch": 1.5466646945718088, |
| "grad_norm": 0.20087464153766632, |
| "learning_rate": 0.00019886158910450218, |
| "loss": 0.667, |
| "step": 2615 |
| }, |
| { |
| "epoch": 1.549622836858453, |
| "grad_norm": 0.19939038157463074, |
| "learning_rate": 0.00019819743202416904, |
| "loss": 0.6686, |
| "step": 2620 |
| }, |
| { |
| "epoch": 1.5525809791450969, |
| "grad_norm": 0.201206237077713, |
| "learning_rate": 0.00019753336918713668, |
| "loss": 0.6882, |
| "step": 2625 |
| }, |
| { |
| "epoch": 1.555539121431741, |
| "grad_norm": 0.21987488865852356, |
| "learning_rate": 0.00019686940741767839, |
| "loss": 0.6855, |
| "step": 2630 |
| }, |
| { |
| "epoch": 1.558497263718385, |
| "grad_norm": 0.20101770758628845, |
| "learning_rate": 0.00019620555353902855, |
| "loss": 0.69, |
| "step": 2635 |
| }, |
| { |
| "epoch": 1.5614554060050287, |
| "grad_norm": 0.20172643661499023, |
| "learning_rate": 0.00019554181437331296, |
| "loss": 0.6666, |
| "step": 2640 |
| }, |
| { |
| "epoch": 1.5644135482916728, |
| "grad_norm": 0.19294790923595428, |
| "learning_rate": 0.00019487819674147844, |
| "loss": 0.6694, |
| "step": 2645 |
| }, |
| { |
| "epoch": 1.5673716905783168, |
| "grad_norm": 0.20478790998458862, |
| "learning_rate": 0.00019421470746322294, |
| "loss": 0.6969, |
| "step": 2650 |
| }, |
| { |
| "epoch": 1.5673716905783168, |
| "eval_loss": 0.6383815407752991, |
| "eval_runtime": 15.3432, |
| "eval_samples_per_second": 422.988, |
| "eval_steps_per_second": 13.231, |
| "step": 2650 |
| }, |
| { |
| "epoch": 1.5703298328649606, |
| "grad_norm": 0.20216462016105652, |
| "learning_rate": 0.00019355135335692538, |
| "loss": 0.6687, |
| "step": 2655 |
| }, |
| { |
| "epoch": 1.573287975151605, |
| "grad_norm": 0.2018350064754486, |
| "learning_rate": 0.00019288814123957554, |
| "loss": 0.6973, |
| "step": 2660 |
| }, |
| { |
| "epoch": 1.5762461174382487, |
| "grad_norm": 0.20364220440387726, |
| "learning_rate": 0.00019222507792670412, |
| "loss": 0.6894, |
| "step": 2665 |
| }, |
| { |
| "epoch": 1.5792042597248928, |
| "grad_norm": 0.19478543102741241, |
| "learning_rate": 0.00019156217023231245, |
| "loss": 0.6845, |
| "step": 2670 |
| }, |
| { |
| "epoch": 1.5821624020115368, |
| "grad_norm": 0.19766514003276825, |
| "learning_rate": 0.00019089942496880276, |
| "loss": 0.6791, |
| "step": 2675 |
| }, |
| { |
| "epoch": 1.5851205442981806, |
| "grad_norm": 0.20217527449131012, |
| "learning_rate": 0.00019023684894690812, |
| "loss": 0.6914, |
| "step": 2680 |
| }, |
| { |
| "epoch": 1.5880786865848249, |
| "grad_norm": 0.19573134183883667, |
| "learning_rate": 0.00018957444897562225, |
| "loss": 0.6899, |
| "step": 2685 |
| }, |
| { |
| "epoch": 1.5910368288714687, |
| "grad_norm": 0.19171777367591858, |
| "learning_rate": 0.00018891223186212974, |
| "loss": 0.6763, |
| "step": 2690 |
| }, |
| { |
| "epoch": 1.5939949711581127, |
| "grad_norm": 0.20240968465805054, |
| "learning_rate": 0.00018825020441173607, |
| "loss": 0.6881, |
| "step": 2695 |
| }, |
| { |
| "epoch": 1.5969531134447568, |
| "grad_norm": 0.1968025267124176, |
| "learning_rate": 0.0001875883734277976, |
| "loss": 0.6817, |
| "step": 2700 |
| }, |
| { |
| "epoch": 1.5969531134447568, |
| "eval_loss": 0.6345797181129456, |
| "eval_runtime": 15.1784, |
| "eval_samples_per_second": 427.581, |
| "eval_steps_per_second": 13.374, |
| "step": 2700 |
| }, |
| { |
| "epoch": 1.5999112557314006, |
| "grad_norm": 0.2064265012741089, |
| "learning_rate": 0.00018692674571165157, |
| "loss": 0.6783, |
| "step": 2705 |
| }, |
| { |
| "epoch": 1.6028693980180446, |
| "grad_norm": 0.20387953519821167, |
| "learning_rate": 0.00018626532806254666, |
| "loss": 0.6935, |
| "step": 2710 |
| }, |
| { |
| "epoch": 1.6058275403046887, |
| "grad_norm": 0.20735909044742584, |
| "learning_rate": 0.00018560412727757235, |
| "loss": 0.6804, |
| "step": 2715 |
| }, |
| { |
| "epoch": 1.6087856825913325, |
| "grad_norm": 0.21424676477909088, |
| "learning_rate": 0.0001849431501515898, |
| "loss": 0.6906, |
| "step": 2720 |
| }, |
| { |
| "epoch": 1.6117438248779767, |
| "grad_norm": 0.195107102394104, |
| "learning_rate": 0.00018428240347716172, |
| "loss": 0.6796, |
| "step": 2725 |
| }, |
| { |
| "epoch": 1.6147019671646206, |
| "grad_norm": 0.20345866680145264, |
| "learning_rate": 0.00018362189404448243, |
| "loss": 0.6991, |
| "step": 2730 |
| }, |
| { |
| "epoch": 1.6176601094512646, |
| "grad_norm": 0.19848254323005676, |
| "learning_rate": 0.00018296162864130837, |
| "loss": 0.6921, |
| "step": 2735 |
| }, |
| { |
| "epoch": 1.6206182517379086, |
| "grad_norm": 0.20381172001361847, |
| "learning_rate": 0.00018230161405288807, |
| "loss": 0.6599, |
| "step": 2740 |
| }, |
| { |
| "epoch": 1.6235763940245524, |
| "grad_norm": 0.19607172906398773, |
| "learning_rate": 0.00018164185706189267, |
| "loss": 0.6746, |
| "step": 2745 |
| }, |
| { |
| "epoch": 1.6265345363111967, |
| "grad_norm": 0.21484482288360596, |
| "learning_rate": 0.000180982364448346, |
| "loss": 0.67, |
| "step": 2750 |
| }, |
| { |
| "epoch": 1.6265345363111967, |
| "eval_loss": 0.6297933459281921, |
| "eval_runtime": 15.2129, |
| "eval_samples_per_second": 426.612, |
| "eval_steps_per_second": 13.344, |
| "step": 2750 |
| }, |
| { |
| "epoch": 1.6294926785978405, |
| "grad_norm": 0.19467367231845856, |
| "learning_rate": 0.00018032314298955507, |
| "loss": 0.6884, |
| "step": 2755 |
| }, |
| { |
| "epoch": 1.6324508208844846, |
| "grad_norm": 0.21421058475971222, |
| "learning_rate": 0.00017966419946004034, |
| "loss": 0.6708, |
| "step": 2760 |
| }, |
| { |
| "epoch": 1.6354089631711286, |
| "grad_norm": 0.2013017237186432, |
| "learning_rate": 0.00017900554063146607, |
| "loss": 0.6792, |
| "step": 2765 |
| }, |
| { |
| "epoch": 1.6383671054577724, |
| "grad_norm": 0.20477554202079773, |
| "learning_rate": 0.0001783471732725708, |
| "loss": 0.6787, |
| "step": 2770 |
| }, |
| { |
| "epoch": 1.6413252477444165, |
| "grad_norm": 0.20022639632225037, |
| "learning_rate": 0.00017768910414909782, |
| "loss": 0.6918, |
| "step": 2775 |
| }, |
| { |
| "epoch": 1.6442833900310605, |
| "grad_norm": 0.2140152007341385, |
| "learning_rate": 0.00017703134002372553, |
| "loss": 0.6768, |
| "step": 2780 |
| }, |
| { |
| "epoch": 1.6472415323177043, |
| "grad_norm": 0.20383507013320923, |
| "learning_rate": 0.00017637388765599804, |
| "loss": 0.6848, |
| "step": 2785 |
| }, |
| { |
| "epoch": 1.6501996746043486, |
| "grad_norm": 0.19069762527942657, |
| "learning_rate": 0.0001757167538022556, |
| "loss": 0.6739, |
| "step": 2790 |
| }, |
| { |
| "epoch": 1.6531578168909924, |
| "grad_norm": 0.2004157304763794, |
| "learning_rate": 0.00017505994521556538, |
| "loss": 0.7016, |
| "step": 2795 |
| }, |
| { |
| "epoch": 1.6561159591776364, |
| "grad_norm": 0.20051540434360504, |
| "learning_rate": 0.00017440346864565178, |
| "loss": 0.6731, |
| "step": 2800 |
| }, |
| { |
| "epoch": 1.6561159591776364, |
| "eval_loss": 0.626323401927948, |
| "eval_runtime": 15.1818, |
| "eval_samples_per_second": 427.485, |
| "eval_steps_per_second": 13.371, |
| "step": 2800 |
| }, |
| { |
| "epoch": 1.6590741014642805, |
| "grad_norm": 0.2020529806613922, |
| "learning_rate": 0.00017374733083882736, |
| "loss": 0.6824, |
| "step": 2805 |
| }, |
| { |
| "epoch": 1.6620322437509243, |
| "grad_norm": 0.19849729537963867, |
| "learning_rate": 0.00017309153853792305, |
| "loss": 0.6818, |
| "step": 2810 |
| }, |
| { |
| "epoch": 1.6649903860375685, |
| "grad_norm": 0.19597412645816803, |
| "learning_rate": 0.0001724360984822196, |
| "loss": 0.6711, |
| "step": 2815 |
| }, |
| { |
| "epoch": 1.6679485283242124, |
| "grad_norm": 0.21375198662281036, |
| "learning_rate": 0.00017178101740737757, |
| "loss": 0.6683, |
| "step": 2820 |
| }, |
| { |
| "epoch": 1.6709066706108564, |
| "grad_norm": 0.20787851512432098, |
| "learning_rate": 0.00017112630204536866, |
| "loss": 0.6776, |
| "step": 2825 |
| }, |
| { |
| "epoch": 1.6738648128975004, |
| "grad_norm": 0.20456843078136444, |
| "learning_rate": 0.00017047195912440612, |
| "loss": 0.6639, |
| "step": 2830 |
| }, |
| { |
| "epoch": 1.6768229551841443, |
| "grad_norm": 0.20255456864833832, |
| "learning_rate": 0.0001698179953688759, |
| "loss": 0.6766, |
| "step": 2835 |
| }, |
| { |
| "epoch": 1.6797810974707883, |
| "grad_norm": 0.2101144939661026, |
| "learning_rate": 0.00016916441749926738, |
| "loss": 0.6827, |
| "step": 2840 |
| }, |
| { |
| "epoch": 1.6827392397574323, |
| "grad_norm": 0.20345039665699005, |
| "learning_rate": 0.00016851123223210452, |
| "loss": 0.6615, |
| "step": 2845 |
| }, |
| { |
| "epoch": 1.6856973820440762, |
| "grad_norm": 0.20403869450092316, |
| "learning_rate": 0.00016785844627987656, |
| "loss": 0.682, |
| "step": 2850 |
| }, |
| { |
| "epoch": 1.6856973820440762, |
| "eval_loss": 0.6230462193489075, |
| "eval_runtime": 15.1867, |
| "eval_samples_per_second": 427.347, |
| "eval_steps_per_second": 13.367, |
| "step": 2850 |
| }, |
| { |
| "epoch": 1.6886555243307204, |
| "grad_norm": 0.20366299152374268, |
| "learning_rate": 0.00016720606635096897, |
| "loss": 0.6793, |
| "step": 2855 |
| }, |
| { |
| "epoch": 1.6916136666173642, |
| "grad_norm": 0.21023479104042053, |
| "learning_rate": 0.00016655409914959505, |
| "loss": 0.672, |
| "step": 2860 |
| }, |
| { |
| "epoch": 1.6945718089040083, |
| "grad_norm": 0.19778937101364136, |
| "learning_rate": 0.00016590255137572643, |
| "loss": 0.6758, |
| "step": 2865 |
| }, |
| { |
| "epoch": 1.6975299511906523, |
| "grad_norm": 0.19528599083423615, |
| "learning_rate": 0.00016525142972502466, |
| "loss": 0.6751, |
| "step": 2870 |
| }, |
| { |
| "epoch": 1.7004880934772961, |
| "grad_norm": 0.2070263922214508, |
| "learning_rate": 0.00016460074088877212, |
| "loss": 0.6921, |
| "step": 2875 |
| }, |
| { |
| "epoch": 1.7034462357639404, |
| "grad_norm": 0.2026599794626236, |
| "learning_rate": 0.00016395049155380328, |
| "loss": 0.6843, |
| "step": 2880 |
| }, |
| { |
| "epoch": 1.7064043780505842, |
| "grad_norm": 0.19992013275623322, |
| "learning_rate": 0.00016330068840243625, |
| "loss": 0.6571, |
| "step": 2885 |
| }, |
| { |
| "epoch": 1.7093625203372282, |
| "grad_norm": 0.21355992555618286, |
| "learning_rate": 0.00016265133811240373, |
| "loss": 0.6607, |
| "step": 2890 |
| }, |
| { |
| "epoch": 1.7123206626238723, |
| "grad_norm": 0.19565469026565552, |
| "learning_rate": 0.00016200244735678466, |
| "loss": 0.6737, |
| "step": 2895 |
| }, |
| { |
| "epoch": 1.715278804910516, |
| "grad_norm": 0.20738384127616882, |
| "learning_rate": 0.00016135402280393553, |
| "loss": 0.6762, |
| "step": 2900 |
| }, |
| { |
| "epoch": 1.715278804910516, |
| "eval_loss": 0.6193926334381104, |
| "eval_runtime": 15.232, |
| "eval_samples_per_second": 426.075, |
| "eval_steps_per_second": 13.327, |
| "step": 2900 |
| }, |
| { |
| "epoch": 1.7182369471971601, |
| "grad_norm": 0.20475776493549347, |
| "learning_rate": 0.0001607060711174218, |
| "loss": 0.6728, |
| "step": 2905 |
| }, |
| { |
| "epoch": 1.7211950894838042, |
| "grad_norm": 0.197879359126091, |
| "learning_rate": 0.00016005859895594968, |
| "loss": 0.6728, |
| "step": 2910 |
| }, |
| { |
| "epoch": 1.7241532317704482, |
| "grad_norm": 0.19947008788585663, |
| "learning_rate": 0.00015941161297329737, |
| "loss": 0.6636, |
| "step": 2915 |
| }, |
| { |
| "epoch": 1.7271113740570923, |
| "grad_norm": 0.20044192671775818, |
| "learning_rate": 0.00015876511981824685, |
| "loss": 0.6697, |
| "step": 2920 |
| }, |
| { |
| "epoch": 1.730069516343736, |
| "grad_norm": 0.1967218816280365, |
| "learning_rate": 0.00015811912613451556, |
| "loss": 0.6734, |
| "step": 2925 |
| }, |
| { |
| "epoch": 1.73302765863038, |
| "grad_norm": 0.19173486530780792, |
| "learning_rate": 0.00015747363856068812, |
| "loss": 0.6703, |
| "step": 2930 |
| }, |
| { |
| "epoch": 1.7359858009170241, |
| "grad_norm": 0.20418646931648254, |
| "learning_rate": 0.0001568286637301481, |
| "loss": 0.6751, |
| "step": 2935 |
| }, |
| { |
| "epoch": 1.738943943203668, |
| "grad_norm": 0.19849319756031036, |
| "learning_rate": 0.00015618420827100975, |
| "loss": 0.6572, |
| "step": 2940 |
| }, |
| { |
| "epoch": 1.7419020854903122, |
| "grad_norm": 0.20207248628139496, |
| "learning_rate": 0.00015554027880605, |
| "loss": 0.6763, |
| "step": 2945 |
| }, |
| { |
| "epoch": 1.744860227776956, |
| "grad_norm": 0.1988462209701538, |
| "learning_rate": 0.00015489688195264038, |
| "loss": 0.6638, |
| "step": 2950 |
| }, |
| { |
| "epoch": 1.744860227776956, |
| "eval_loss": 0.6163128614425659, |
| "eval_runtime": 15.1693, |
| "eval_samples_per_second": 427.836, |
| "eval_steps_per_second": 13.382, |
| "step": 2950 |
| }, |
| { |
| "epoch": 1.7478183700636, |
| "grad_norm": 0.20099283754825592, |
| "learning_rate": 0.00015425402432267906, |
| "loss": 0.6579, |
| "step": 2955 |
| }, |
| { |
| "epoch": 1.7507765123502441, |
| "grad_norm": 0.19520577788352966, |
| "learning_rate": 0.0001536117125225229, |
| "loss": 0.6725, |
| "step": 2960 |
| }, |
| { |
| "epoch": 1.753734654636888, |
| "grad_norm": 0.2104516476392746, |
| "learning_rate": 0.0001529699531529194, |
| "loss": 0.6619, |
| "step": 2965 |
| }, |
| { |
| "epoch": 1.756692796923532, |
| "grad_norm": 0.20790335536003113, |
| "learning_rate": 0.000152328752808939, |
| "loss": 0.6771, |
| "step": 2970 |
| }, |
| { |
| "epoch": 1.759650939210176, |
| "grad_norm": 0.20284044742584229, |
| "learning_rate": 0.00015168811807990732, |
| "loss": 0.6688, |
| "step": 2975 |
| }, |
| { |
| "epoch": 1.76260908149682, |
| "grad_norm": 0.21280840039253235, |
| "learning_rate": 0.00015104805554933744, |
| "loss": 0.6924, |
| "step": 2980 |
| }, |
| { |
| "epoch": 1.765567223783464, |
| "grad_norm": 0.19089211523532867, |
| "learning_rate": 0.0001504085717948622, |
| "loss": 0.6801, |
| "step": 2985 |
| }, |
| { |
| "epoch": 1.768525366070108, |
| "grad_norm": 0.20048676431179047, |
| "learning_rate": 0.00014976967338816653, |
| "loss": 0.6843, |
| "step": 2990 |
| }, |
| { |
| "epoch": 1.771483508356752, |
| "grad_norm": 0.2061685174703598, |
| "learning_rate": 0.00014913136689492004, |
| "loss": 0.6674, |
| "step": 2995 |
| }, |
| { |
| "epoch": 1.774441650643396, |
| "grad_norm": 0.19444270431995392, |
| "learning_rate": 0.00014849365887470962, |
| "loss": 0.6786, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.774441650643396, |
| "eval_loss": 0.6131055951118469, |
| "eval_runtime": 15.1868, |
| "eval_samples_per_second": 427.344, |
| "eval_steps_per_second": 13.367, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.7773997929300398, |
| "grad_norm": 0.20577003061771393, |
| "learning_rate": 0.00014785655588097182, |
| "loss": 0.6652, |
| "step": 3005 |
| }, |
| { |
| "epoch": 1.780357935216684, |
| "grad_norm": 0.201372891664505, |
| "learning_rate": 0.00014722006446092568, |
| "loss": 0.6783, |
| "step": 3010 |
| }, |
| { |
| "epoch": 1.7833160775033279, |
| "grad_norm": 0.19634784758090973, |
| "learning_rate": 0.0001465841911555053, |
| "loss": 0.6781, |
| "step": 3015 |
| }, |
| { |
| "epoch": 1.786274219789972, |
| "grad_norm": 0.20469844341278076, |
| "learning_rate": 0.00014594894249929271, |
| "loss": 0.6726, |
| "step": 3020 |
| }, |
| { |
| "epoch": 1.789232362076616, |
| "grad_norm": 0.2057006061077118, |
| "learning_rate": 0.0001453143250204508, |
| "loss": 0.6631, |
| "step": 3025 |
| }, |
| { |
| "epoch": 1.7921905043632598, |
| "grad_norm": 0.20145894587039948, |
| "learning_rate": 0.000144680345240656, |
| "loss": 0.6701, |
| "step": 3030 |
| }, |
| { |
| "epoch": 1.7951486466499038, |
| "grad_norm": 0.19412663578987122, |
| "learning_rate": 0.00014404700967503143, |
| "loss": 0.6779, |
| "step": 3035 |
| }, |
| { |
| "epoch": 1.7981067889365479, |
| "grad_norm": 0.2155628502368927, |
| "learning_rate": 0.00014341432483207993, |
| "loss": 0.6687, |
| "step": 3040 |
| }, |
| { |
| "epoch": 1.801064931223192, |
| "grad_norm": 0.19732846319675446, |
| "learning_rate": 0.0001427822972136172, |
| "loss": 0.6673, |
| "step": 3045 |
| }, |
| { |
| "epoch": 1.804023073509836, |
| "grad_norm": 0.19896374642848969, |
| "learning_rate": 0.00014215093331470494, |
| "loss": 0.6712, |
| "step": 3050 |
| }, |
| { |
| "epoch": 1.804023073509836, |
| "eval_loss": 0.6097522974014282, |
| "eval_runtime": 15.1894, |
| "eval_samples_per_second": 427.271, |
| "eval_steps_per_second": 13.365, |
| "step": 3050 |
| }, |
| { |
| "epoch": 1.8069812157964797, |
| "grad_norm": 0.19442537426948547, |
| "learning_rate": 0.00014152023962358398, |
| "loss": 0.6645, |
| "step": 3055 |
| }, |
| { |
| "epoch": 1.8099393580831238, |
| "grad_norm": 0.20382952690124512, |
| "learning_rate": 0.00014089022262160788, |
| "loss": 0.6701, |
| "step": 3060 |
| }, |
| { |
| "epoch": 1.8128975003697678, |
| "grad_norm": 0.19677461683750153, |
| "learning_rate": 0.00014026088878317611, |
| "loss": 0.6733, |
| "step": 3065 |
| }, |
| { |
| "epoch": 1.8158556426564116, |
| "grad_norm": 0.20080624520778656, |
| "learning_rate": 0.00013963224457566755, |
| "loss": 0.6716, |
| "step": 3070 |
| }, |
| { |
| "epoch": 1.818813784943056, |
| "grad_norm": 0.1959795206785202, |
| "learning_rate": 0.00013900429645937417, |
| "loss": 0.6786, |
| "step": 3075 |
| }, |
| { |
| "epoch": 1.8217719272296997, |
| "grad_norm": 0.20665378868579865, |
| "learning_rate": 0.00013837705088743426, |
| "loss": 0.6837, |
| "step": 3080 |
| }, |
| { |
| "epoch": 1.8247300695163438, |
| "grad_norm": 0.20092874765396118, |
| "learning_rate": 0.0001377505143057667, |
| "loss": 0.673, |
| "step": 3085 |
| }, |
| { |
| "epoch": 1.8276882118029878, |
| "grad_norm": 0.20177334547042847, |
| "learning_rate": 0.0001371246931530042, |
| "loss": 0.6676, |
| "step": 3090 |
| }, |
| { |
| "epoch": 1.8306463540896316, |
| "grad_norm": 0.1979428380727768, |
| "learning_rate": 0.0001364995938604274, |
| "loss": 0.672, |
| "step": 3095 |
| }, |
| { |
| "epoch": 1.8336044963762756, |
| "grad_norm": 0.20506584644317627, |
| "learning_rate": 0.00013587522285189873, |
| "loss": 0.6657, |
| "step": 3100 |
| }, |
| { |
| "epoch": 1.8336044963762756, |
| "eval_loss": 0.6049384474754333, |
| "eval_runtime": 15.0954, |
| "eval_samples_per_second": 429.931, |
| "eval_steps_per_second": 13.448, |
| "step": 3100 |
| }, |
| { |
| "epoch": 1.8365626386629197, |
| "grad_norm": 0.20680159330368042, |
| "learning_rate": 0.00013525158654379628, |
| "loss": 0.6698, |
| "step": 3105 |
| }, |
| { |
| "epoch": 1.8395207809495637, |
| "grad_norm": 0.1984373927116394, |
| "learning_rate": 0.00013462869134494806, |
| "loss": 0.6603, |
| "step": 3110 |
| }, |
| { |
| "epoch": 1.8424789232362078, |
| "grad_norm": 0.202684223651886, |
| "learning_rate": 0.000134006543656566, |
| "loss": 0.6553, |
| "step": 3115 |
| }, |
| { |
| "epoch": 1.8454370655228516, |
| "grad_norm": 0.21238334476947784, |
| "learning_rate": 0.0001333851498721802, |
| "loss": 0.6663, |
| "step": 3120 |
| }, |
| { |
| "epoch": 1.8483952078094956, |
| "grad_norm": 0.20673152804374695, |
| "learning_rate": 0.0001327645163775732, |
| "loss": 0.6701, |
| "step": 3125 |
| }, |
| { |
| "epoch": 1.8513533500961397, |
| "grad_norm": 0.2024824321269989, |
| "learning_rate": 0.00013214464955071438, |
| "loss": 0.6555, |
| "step": 3130 |
| }, |
| { |
| "epoch": 1.8543114923827835, |
| "grad_norm": 0.2038116157054901, |
| "learning_rate": 0.00013152555576169446, |
| "loss": 0.6693, |
| "step": 3135 |
| }, |
| { |
| "epoch": 1.8572696346694277, |
| "grad_norm": 0.19714276492595673, |
| "learning_rate": 0.00013090724137266007, |
| "loss": 0.6619, |
| "step": 3140 |
| }, |
| { |
| "epoch": 1.8602277769560716, |
| "grad_norm": 0.20710057020187378, |
| "learning_rate": 0.00013028971273774817, |
| "loss": 0.6764, |
| "step": 3145 |
| }, |
| { |
| "epoch": 1.8631859192427156, |
| "grad_norm": 0.19880205392837524, |
| "learning_rate": 0.00012967297620302095, |
| "loss": 0.6702, |
| "step": 3150 |
| }, |
| { |
| "epoch": 1.8631859192427156, |
| "eval_loss": 0.6030129194259644, |
| "eval_runtime": 15.2766, |
| "eval_samples_per_second": 424.834, |
| "eval_steps_per_second": 13.288, |
| "step": 3150 |
| }, |
| { |
| "epoch": 1.8661440615293596, |
| "grad_norm": 0.21328890323638916, |
| "learning_rate": 0.00012905703810640054, |
| "loss": 0.6627, |
| "step": 3155 |
| }, |
| { |
| "epoch": 1.8691022038160034, |
| "grad_norm": 0.20017056167125702, |
| "learning_rate": 0.00012844190477760388, |
| "loss": 0.6653, |
| "step": 3160 |
| }, |
| { |
| "epoch": 1.8720603461026475, |
| "grad_norm": 0.204426571726799, |
| "learning_rate": 0.00012782758253807765, |
| "loss": 0.6725, |
| "step": 3165 |
| }, |
| { |
| "epoch": 1.8750184883892915, |
| "grad_norm": 0.19793303310871124, |
| "learning_rate": 0.00012721407770093334, |
| "loss": 0.6578, |
| "step": 3170 |
| }, |
| { |
| "epoch": 1.8779766306759356, |
| "grad_norm": 0.20140038430690765, |
| "learning_rate": 0.00012660139657088242, |
| "loss": 0.6706, |
| "step": 3175 |
| }, |
| { |
| "epoch": 1.8809347729625796, |
| "grad_norm": 0.20214778184890747, |
| "learning_rate": 0.0001259895454441714, |
| "loss": 0.6767, |
| "step": 3180 |
| }, |
| { |
| "epoch": 1.8838929152492234, |
| "grad_norm": 0.21046917140483856, |
| "learning_rate": 0.0001253785306085173, |
| "loss": 0.6768, |
| "step": 3185 |
| }, |
| { |
| "epoch": 1.8868510575358675, |
| "grad_norm": 0.20898491144180298, |
| "learning_rate": 0.00012476835834304294, |
| "loss": 0.6654, |
| "step": 3190 |
| }, |
| { |
| "epoch": 1.8898091998225115, |
| "grad_norm": 0.20452511310577393, |
| "learning_rate": 0.0001241590349182124, |
| "loss": 0.6599, |
| "step": 3195 |
| }, |
| { |
| "epoch": 1.8927673421091553, |
| "grad_norm": 0.21516427397727966, |
| "learning_rate": 0.00012355056659576664, |
| "loss": 0.6646, |
| "step": 3200 |
| }, |
| { |
| "epoch": 1.8927673421091553, |
| "eval_loss": 0.59881591796875, |
| "eval_runtime": 15.1719, |
| "eval_samples_per_second": 427.765, |
| "eval_steps_per_second": 13.38, |
| "step": 3200 |
| }, |
| { |
| "epoch": 1.8957254843957996, |
| "grad_norm": 0.19969050586223602, |
| "learning_rate": 0.00012294295962865908, |
| "loss": 0.6641, |
| "step": 3205 |
| }, |
| { |
| "epoch": 1.8986836266824434, |
| "grad_norm": 0.2035350650548935, |
| "learning_rate": 0.0001223362202609915, |
| "loss": 0.6679, |
| "step": 3210 |
| }, |
| { |
| "epoch": 1.9016417689690874, |
| "grad_norm": 0.20504876971244812, |
| "learning_rate": 0.00012173035472794956, |
| "loss": 0.664, |
| "step": 3215 |
| }, |
| { |
| "epoch": 1.9045999112557315, |
| "grad_norm": 0.19677531719207764, |
| "learning_rate": 0.00012112536925573904, |
| "loss": 0.6605, |
| "step": 3220 |
| }, |
| { |
| "epoch": 1.9075580535423753, |
| "grad_norm": 0.19949232041835785, |
| "learning_rate": 0.00012052127006152172, |
| "loss": 0.6718, |
| "step": 3225 |
| }, |
| { |
| "epoch": 1.9105161958290193, |
| "grad_norm": 0.19459928572177887, |
| "learning_rate": 0.00011991806335335154, |
| "loss": 0.6639, |
| "step": 3230 |
| }, |
| { |
| "epoch": 1.9134743381156634, |
| "grad_norm": 0.203868106007576, |
| "learning_rate": 0.00011931575533011058, |
| "loss": 0.6664, |
| "step": 3235 |
| }, |
| { |
| "epoch": 1.9164324804023074, |
| "grad_norm": 0.1938386857509613, |
| "learning_rate": 0.00011871435218144587, |
| "loss": 0.6619, |
| "step": 3240 |
| }, |
| { |
| "epoch": 1.9193906226889514, |
| "grad_norm": 0.20791150629520416, |
| "learning_rate": 0.00011811386008770509, |
| "loss": 0.6547, |
| "step": 3245 |
| }, |
| { |
| "epoch": 1.9223487649755953, |
| "grad_norm": 0.20660457015037537, |
| "learning_rate": 0.00011751428521987375, |
| "loss": 0.6793, |
| "step": 3250 |
| }, |
| { |
| "epoch": 1.9223487649755953, |
| "eval_loss": 0.5979748368263245, |
| "eval_runtime": 15.2507, |
| "eval_samples_per_second": 425.554, |
| "eval_steps_per_second": 13.311, |
| "step": 3250 |
| }, |
| { |
| "epoch": 1.9253069072622393, |
| "grad_norm": 0.20144982635974884, |
| "learning_rate": 0.00011691563373951126, |
| "loss": 0.6696, |
| "step": 3255 |
| }, |
| { |
| "epoch": 1.9282650495488833, |
| "grad_norm": 0.20913758873939514, |
| "learning_rate": 0.00011631791179868765, |
| "loss": 0.6535, |
| "step": 3260 |
| }, |
| { |
| "epoch": 1.9312231918355272, |
| "grad_norm": 0.19052323698997498, |
| "learning_rate": 0.0001157211255399209, |
| "loss": 0.6608, |
| "step": 3265 |
| }, |
| { |
| "epoch": 1.9341813341221714, |
| "grad_norm": 0.20822355151176453, |
| "learning_rate": 0.000115125281096113, |
| "loss": 0.6643, |
| "step": 3270 |
| }, |
| { |
| "epoch": 1.9371394764088152, |
| "grad_norm": 0.21328042447566986, |
| "learning_rate": 0.00011453038459048767, |
| "loss": 0.6634, |
| "step": 3275 |
| }, |
| { |
| "epoch": 1.9400976186954593, |
| "grad_norm": 0.19759127497673035, |
| "learning_rate": 0.00011393644213652677, |
| "loss": 0.6496, |
| "step": 3280 |
| }, |
| { |
| "epoch": 1.9430557609821033, |
| "grad_norm": 0.20105671882629395, |
| "learning_rate": 0.00011334345983790816, |
| "loss": 0.6537, |
| "step": 3285 |
| }, |
| { |
| "epoch": 1.9460139032687471, |
| "grad_norm": 0.2096051126718521, |
| "learning_rate": 0.00011275144378844229, |
| "loss": 0.6494, |
| "step": 3290 |
| }, |
| { |
| "epoch": 1.9489720455553912, |
| "grad_norm": 0.20718033611774445, |
| "learning_rate": 0.00011216040007201014, |
| "loss": 0.6488, |
| "step": 3295 |
| }, |
| { |
| "epoch": 1.9519301878420352, |
| "grad_norm": 0.2058909386396408, |
| "learning_rate": 0.0001115703347625003, |
| "loss": 0.665, |
| "step": 3300 |
| }, |
| { |
| "epoch": 1.9519301878420352, |
| "eval_loss": 0.5932101011276245, |
| "eval_runtime": 15.197, |
| "eval_samples_per_second": 427.057, |
| "eval_steps_per_second": 13.358, |
| "step": 3300 |
| }, |
| { |
| "epoch": 1.9548883301286792, |
| "grad_norm": 0.2102581113576889, |
| "learning_rate": 0.00011098125392374676, |
| "loss": 0.6345, |
| "step": 3305 |
| }, |
| { |
| "epoch": 1.9578464724153233, |
| "grad_norm": 0.20526902377605438, |
| "learning_rate": 0.00011039316360946673, |
| "loss": 0.6647, |
| "step": 3310 |
| }, |
| { |
| "epoch": 1.960804614701967, |
| "grad_norm": 0.203394815325737, |
| "learning_rate": 0.00010980606986319787, |
| "loss": 0.662, |
| "step": 3315 |
| }, |
| { |
| "epoch": 1.9637627569886111, |
| "grad_norm": 0.21088528633117676, |
| "learning_rate": 0.00010921997871823699, |
| "loss": 0.6572, |
| "step": 3320 |
| }, |
| { |
| "epoch": 1.9667208992752552, |
| "grad_norm": 0.20182575285434723, |
| "learning_rate": 0.00010863489619757724, |
| "loss": 0.6625, |
| "step": 3325 |
| }, |
| { |
| "epoch": 1.969679041561899, |
| "grad_norm": 0.20655593276023865, |
| "learning_rate": 0.00010805082831384698, |
| "loss": 0.6421, |
| "step": 3330 |
| }, |
| { |
| "epoch": 1.9726371838485433, |
| "grad_norm": 0.2001488208770752, |
| "learning_rate": 0.00010746778106924716, |
| "loss": 0.6604, |
| "step": 3335 |
| }, |
| { |
| "epoch": 1.975595326135187, |
| "grad_norm": 0.2096022516489029, |
| "learning_rate": 0.00010688576045549053, |
| "loss": 0.6479, |
| "step": 3340 |
| }, |
| { |
| "epoch": 1.978553468421831, |
| "grad_norm": 0.20127557218074799, |
| "learning_rate": 0.0001063047724537393, |
| "loss": 0.6628, |
| "step": 3345 |
| }, |
| { |
| "epoch": 1.9815116107084751, |
| "grad_norm": 0.20845440030097961, |
| "learning_rate": 0.00010572482303454416, |
| "loss": 0.6577, |
| "step": 3350 |
| }, |
| { |
| "epoch": 1.9815116107084751, |
| "eval_loss": 0.5896474719047546, |
| "eval_runtime": 15.2103, |
| "eval_samples_per_second": 426.685, |
| "eval_steps_per_second": 13.346, |
| "step": 3350 |
| }, |
| { |
| "epoch": 1.984469752995119, |
| "grad_norm": 0.20478816330432892, |
| "learning_rate": 0.00010514591815778253, |
| "loss": 0.6398, |
| "step": 3355 |
| }, |
| { |
| "epoch": 1.987427895281763, |
| "grad_norm": 0.20369143784046173, |
| "learning_rate": 0.00010456806377259795, |
| "loss": 0.671, |
| "step": 3360 |
| }, |
| { |
| "epoch": 1.990386037568407, |
| "grad_norm": 0.2043718546628952, |
| "learning_rate": 0.0001039912658173381, |
| "loss": 0.6423, |
| "step": 3365 |
| }, |
| { |
| "epoch": 1.993344179855051, |
| "grad_norm": 0.20425325632095337, |
| "learning_rate": 0.00010341553021949456, |
| "loss": 0.6566, |
| "step": 3370 |
| }, |
| { |
| "epoch": 1.9963023221416951, |
| "grad_norm": 0.19125695526599884, |
| "learning_rate": 0.00010284086289564125, |
| "loss": 0.6491, |
| "step": 3375 |
| }, |
| { |
| "epoch": 1.999260464428339, |
| "grad_norm": 0.20721665024757385, |
| "learning_rate": 0.00010226726975137421, |
| "loss": 0.6697, |
| "step": 3380 |
| }, |
| { |
| "epoch": 2.0017748853719866, |
| "grad_norm": 0.1947908252477646, |
| "learning_rate": 0.0001016947566812503, |
| "loss": 0.6123, |
| "step": 3385 |
| }, |
| { |
| "epoch": 2.0047330276586304, |
| "grad_norm": 0.22768820822238922, |
| "learning_rate": 0.0001011233295687272, |
| "loss": 0.598, |
| "step": 3390 |
| }, |
| { |
| "epoch": 2.007691169945274, |
| "grad_norm": 0.20400209724903107, |
| "learning_rate": 0.00010055299428610279, |
| "loss": 0.5928, |
| "step": 3395 |
| }, |
| { |
| "epoch": 2.0106493122319185, |
| "grad_norm": 0.2241700440645218, |
| "learning_rate": 9.998375669445419e-05, |
| "loss": 0.5821, |
| "step": 3400 |
| }, |
| { |
| "epoch": 2.0106493122319185, |
| "eval_loss": 0.5831112861633301, |
| "eval_runtime": 15.2237, |
| "eval_samples_per_second": 426.309, |
| "eval_steps_per_second": 13.334, |
| "step": 3400 |
| }, |
| { |
| "epoch": 2.0136074545185623, |
| "grad_norm": 0.20261913537979126, |
| "learning_rate": 9.941562264357865e-05, |
| "loss": 0.5866, |
| "step": 3405 |
| }, |
| { |
| "epoch": 2.0165655968052065, |
| "grad_norm": 0.20536375045776367, |
| "learning_rate": 9.884859797193239e-05, |
| "loss": 0.5946, |
| "step": 3410 |
| }, |
| { |
| "epoch": 2.0195237390918503, |
| "grad_norm": 0.21416200697422028, |
| "learning_rate": 9.828268850657138e-05, |
| "loss": 0.5856, |
| "step": 3415 |
| }, |
| { |
| "epoch": 2.022481881378494, |
| "grad_norm": 0.2201823741197586, |
| "learning_rate": 9.771790006309084e-05, |
| "loss": 0.6029, |
| "step": 3420 |
| }, |
| { |
| "epoch": 2.0254400236651384, |
| "grad_norm": 0.2173227071762085, |
| "learning_rate": 9.715423844556602e-05, |
| "loss": 0.5871, |
| "step": 3425 |
| }, |
| { |
| "epoch": 2.0283981659517822, |
| "grad_norm": 0.21699583530426025, |
| "learning_rate": 9.659170944649196e-05, |
| "loss": 0.5773, |
| "step": 3430 |
| }, |
| { |
| "epoch": 2.031356308238426, |
| "grad_norm": 0.2157624065876007, |
| "learning_rate": 9.603031884672467e-05, |
| "loss": 0.5979, |
| "step": 3435 |
| }, |
| { |
| "epoch": 2.0343144505250703, |
| "grad_norm": 0.21308545768260956, |
| "learning_rate": 9.547007241542108e-05, |
| "loss": 0.5749, |
| "step": 3440 |
| }, |
| { |
| "epoch": 2.037272592811714, |
| "grad_norm": 0.21203207969665527, |
| "learning_rate": 9.491097590998e-05, |
| "loss": 0.5985, |
| "step": 3445 |
| }, |
| { |
| "epoch": 2.0402307350983584, |
| "grad_norm": 0.21418456733226776, |
| "learning_rate": 9.435303507598322e-05, |
| "loss": 0.5917, |
| "step": 3450 |
| }, |
| { |
| "epoch": 2.0402307350983584, |
| "eval_loss": 0.5792038440704346, |
| "eval_runtime": 15.2302, |
| "eval_samples_per_second": 426.128, |
| "eval_steps_per_second": 13.329, |
| "step": 3450 |
| }, |
| { |
| "epoch": 2.043188877385002, |
| "grad_norm": 0.21770378947257996, |
| "learning_rate": 9.379625564713593e-05, |
| "loss": 0.5706, |
| "step": 3455 |
| }, |
| { |
| "epoch": 2.046147019671646, |
| "grad_norm": 0.21893031895160675, |
| "learning_rate": 9.324064334520837e-05, |
| "loss": 0.5926, |
| "step": 3460 |
| }, |
| { |
| "epoch": 2.0491051619582903, |
| "grad_norm": 0.20557957887649536, |
| "learning_rate": 9.268620387997643e-05, |
| "loss": 0.5886, |
| "step": 3465 |
| }, |
| { |
| "epoch": 2.052063304244934, |
| "grad_norm": 0.22509662806987762, |
| "learning_rate": 9.213294294916363e-05, |
| "loss": 0.5848, |
| "step": 3470 |
| }, |
| { |
| "epoch": 2.0550214465315784, |
| "grad_norm": 0.2100546807050705, |
| "learning_rate": 9.158086623838189e-05, |
| "loss": 0.5863, |
| "step": 3475 |
| }, |
| { |
| "epoch": 2.057979588818222, |
| "grad_norm": 0.20398281514644623, |
| "learning_rate": 9.102997942107373e-05, |
| "loss": 0.603, |
| "step": 3480 |
| }, |
| { |
| "epoch": 2.060937731104866, |
| "grad_norm": 0.20686747133731842, |
| "learning_rate": 9.04802881584535e-05, |
| "loss": 0.5747, |
| "step": 3485 |
| }, |
| { |
| "epoch": 2.0638958733915103, |
| "grad_norm": 0.2102808654308319, |
| "learning_rate": 8.993179809944937e-05, |
| "loss": 0.5916, |
| "step": 3490 |
| }, |
| { |
| "epoch": 2.066854015678154, |
| "grad_norm": 0.21308141946792603, |
| "learning_rate": 8.938451488064526e-05, |
| "loss": 0.5668, |
| "step": 3495 |
| }, |
| { |
| "epoch": 2.069812157964798, |
| "grad_norm": 0.21371279656887054, |
| "learning_rate": 8.883844412622322e-05, |
| "loss": 0.5813, |
| "step": 3500 |
| }, |
| { |
| "epoch": 2.069812157964798, |
| "eval_loss": 0.575473964214325, |
| "eval_runtime": 15.2042, |
| "eval_samples_per_second": 426.855, |
| "eval_steps_per_second": 13.352, |
| "step": 3500 |
| }, |
| { |
| "epoch": 2.072770300251442, |
| "grad_norm": 0.22001944482326508, |
| "learning_rate": 8.829359144790494e-05, |
| "loss": 0.5913, |
| "step": 3505 |
| }, |
| { |
| "epoch": 2.075728442538086, |
| "grad_norm": 0.22368231415748596, |
| "learning_rate": 8.774996244489475e-05, |
| "loss": 0.5877, |
| "step": 3510 |
| }, |
| { |
| "epoch": 2.0786865848247302, |
| "grad_norm": 0.22365407645702362, |
| "learning_rate": 8.72075627038219e-05, |
| "loss": 0.6037, |
| "step": 3515 |
| }, |
| { |
| "epoch": 2.081644727111374, |
| "grad_norm": 0.20973624289035797, |
| "learning_rate": 8.666639779868279e-05, |
| "loss": 0.5844, |
| "step": 3520 |
| }, |
| { |
| "epoch": 2.084602869398018, |
| "grad_norm": 0.21420718729496002, |
| "learning_rate": 8.612647329078422e-05, |
| "loss": 0.5921, |
| "step": 3525 |
| }, |
| { |
| "epoch": 2.087561011684662, |
| "grad_norm": 0.22668996453285217, |
| "learning_rate": 8.558779472868585e-05, |
| "loss": 0.5886, |
| "step": 3530 |
| }, |
| { |
| "epoch": 2.090519153971306, |
| "grad_norm": 0.21416419744491577, |
| "learning_rate": 8.505036764814334e-05, |
| "loss": 0.5981, |
| "step": 3535 |
| }, |
| { |
| "epoch": 2.09347729625795, |
| "grad_norm": 0.21493327617645264, |
| "learning_rate": 8.451419757205141e-05, |
| "loss": 0.5813, |
| "step": 3540 |
| }, |
| { |
| "epoch": 2.096435438544594, |
| "grad_norm": 0.20993159711360931, |
| "learning_rate": 8.397929001038732e-05, |
| "loss": 0.5791, |
| "step": 3545 |
| }, |
| { |
| "epoch": 2.099393580831238, |
| "grad_norm": 0.21955925226211548, |
| "learning_rate": 8.344565046015369e-05, |
| "loss": 0.5934, |
| "step": 3550 |
| }, |
| { |
| "epoch": 2.099393580831238, |
| "eval_loss": 0.5726394057273865, |
| "eval_runtime": 15.2125, |
| "eval_samples_per_second": 426.624, |
| "eval_steps_per_second": 13.344, |
| "step": 3550 |
| }, |
| { |
| "epoch": 2.102351723117882, |
| "grad_norm": 0.21743761003017426, |
| "learning_rate": 8.291328440532275e-05, |
| "loss": 0.5923, |
| "step": 3555 |
| }, |
| { |
| "epoch": 2.105309865404526, |
| "grad_norm": 0.2203519195318222, |
| "learning_rate": 8.23821973167792e-05, |
| "loss": 0.5911, |
| "step": 3560 |
| }, |
| { |
| "epoch": 2.1082680076911697, |
| "grad_norm": 0.21465690433979034, |
| "learning_rate": 8.185239465226481e-05, |
| "loss": 0.5821, |
| "step": 3565 |
| }, |
| { |
| "epoch": 2.111226149977814, |
| "grad_norm": 0.21976549923419952, |
| "learning_rate": 8.132388185632145e-05, |
| "loss": 0.5812, |
| "step": 3570 |
| }, |
| { |
| "epoch": 2.114184292264458, |
| "grad_norm": 0.21722052991390228, |
| "learning_rate": 8.079666436023603e-05, |
| "loss": 0.5763, |
| "step": 3575 |
| }, |
| { |
| "epoch": 2.117142434551102, |
| "grad_norm": 0.21873724460601807, |
| "learning_rate": 8.027074758198394e-05, |
| "loss": 0.6005, |
| "step": 3580 |
| }, |
| { |
| "epoch": 2.120100576837746, |
| "grad_norm": 0.21776770055294037, |
| "learning_rate": 7.974613692617372e-05, |
| "loss": 0.5781, |
| "step": 3585 |
| }, |
| { |
| "epoch": 2.1230587191243897, |
| "grad_norm": 0.22048649191856384, |
| "learning_rate": 7.922283778399167e-05, |
| "loss": 0.5792, |
| "step": 3590 |
| }, |
| { |
| "epoch": 2.126016861411034, |
| "grad_norm": 0.21600739657878876, |
| "learning_rate": 7.870085553314602e-05, |
| "loss": 0.6024, |
| "step": 3595 |
| }, |
| { |
| "epoch": 2.128975003697678, |
| "grad_norm": 0.224426731467247, |
| "learning_rate": 7.818019553781215e-05, |
| "loss": 0.5959, |
| "step": 3600 |
| }, |
| { |
| "epoch": 2.128975003697678, |
| "eval_loss": 0.5697709321975708, |
| "eval_runtime": 15.28, |
| "eval_samples_per_second": 424.739, |
| "eval_steps_per_second": 13.285, |
| "step": 3600 |
| }, |
| { |
| "epoch": 2.131933145984322, |
| "grad_norm": 0.20624466240406036, |
| "learning_rate": 7.766086314857693e-05, |
| "loss": 0.578, |
| "step": 3605 |
| }, |
| { |
| "epoch": 2.134891288270966, |
| "grad_norm": 0.21350590884685516, |
| "learning_rate": 7.714286370238435e-05, |
| "loss": 0.5791, |
| "step": 3610 |
| }, |
| { |
| "epoch": 2.1378494305576097, |
| "grad_norm": 0.2191067934036255, |
| "learning_rate": 7.662620252248002e-05, |
| "loss": 0.577, |
| "step": 3615 |
| }, |
| { |
| "epoch": 2.140807572844254, |
| "grad_norm": 0.21235939860343933, |
| "learning_rate": 7.611088491835717e-05, |
| "loss": 0.5812, |
| "step": 3620 |
| }, |
| { |
| "epoch": 2.1437657151308978, |
| "grad_norm": 0.20971763134002686, |
| "learning_rate": 7.559691618570121e-05, |
| "loss": 0.5837, |
| "step": 3625 |
| }, |
| { |
| "epoch": 2.146723857417542, |
| "grad_norm": 0.22103586792945862, |
| "learning_rate": 7.508430160633623e-05, |
| "loss": 0.6064, |
| "step": 3630 |
| }, |
| { |
| "epoch": 2.149681999704186, |
| "grad_norm": 0.20907030999660492, |
| "learning_rate": 7.457304644817021e-05, |
| "loss": 0.5821, |
| "step": 3635 |
| }, |
| { |
| "epoch": 2.1526401419908296, |
| "grad_norm": 0.21533241868019104, |
| "learning_rate": 7.406315596514083e-05, |
| "loss": 0.5904, |
| "step": 3640 |
| }, |
| { |
| "epoch": 2.155598284277474, |
| "grad_norm": 0.2256341576576233, |
| "learning_rate": 7.355463539716179e-05, |
| "loss": 0.5935, |
| "step": 3645 |
| }, |
| { |
| "epoch": 2.1585564265641177, |
| "grad_norm": 0.21132247149944305, |
| "learning_rate": 7.304748997006862e-05, |
| "loss": 0.5842, |
| "step": 3650 |
| }, |
| { |
| "epoch": 2.1585564265641177, |
| "eval_loss": 0.567834198474884, |
| "eval_runtime": 15.3306, |
| "eval_samples_per_second": 423.336, |
| "eval_steps_per_second": 13.241, |
| "step": 3650 |
| }, |
| { |
| "epoch": 2.1615145688507615, |
| "grad_norm": 0.22597523033618927, |
| "learning_rate": 7.254172489556542e-05, |
| "loss": 0.5977, |
| "step": 3655 |
| }, |
| { |
| "epoch": 2.164472711137406, |
| "grad_norm": 0.22739310562610626, |
| "learning_rate": 7.203734537117064e-05, |
| "loss": 0.594, |
| "step": 3660 |
| }, |
| { |
| "epoch": 2.1674308534240496, |
| "grad_norm": 0.21618716418743134, |
| "learning_rate": 7.153435658016453e-05, |
| "loss": 0.5776, |
| "step": 3665 |
| }, |
| { |
| "epoch": 2.170388995710694, |
| "grad_norm": 0.22005634009838104, |
| "learning_rate": 7.10327636915349e-05, |
| "loss": 0.5861, |
| "step": 3670 |
| }, |
| { |
| "epoch": 2.1733471379973377, |
| "grad_norm": 0.2195035070180893, |
| "learning_rate": 7.053257185992494e-05, |
| "loss": 0.5941, |
| "step": 3675 |
| }, |
| { |
| "epoch": 2.1763052802839815, |
| "grad_norm": 0.20980341732501984, |
| "learning_rate": 7.003378622557946e-05, |
| "loss": 0.5724, |
| "step": 3680 |
| }, |
| { |
| "epoch": 2.1792634225706258, |
| "grad_norm": 0.22259102761745453, |
| "learning_rate": 6.953641191429277e-05, |
| "loss": 0.573, |
| "step": 3685 |
| }, |
| { |
| "epoch": 2.1822215648572696, |
| "grad_norm": 0.22640399634838104, |
| "learning_rate": 6.904045403735528e-05, |
| "loss": 0.583, |
| "step": 3690 |
| }, |
| { |
| "epoch": 2.185179707143914, |
| "grad_norm": 0.21845850348472595, |
| "learning_rate": 6.85459176915017e-05, |
| "loss": 0.5812, |
| "step": 3695 |
| }, |
| { |
| "epoch": 2.1881378494305577, |
| "grad_norm": 0.22164185345172882, |
| "learning_rate": 6.8052807958858e-05, |
| "loss": 0.5848, |
| "step": 3700 |
| }, |
| { |
| "epoch": 2.1881378494305577, |
| "eval_loss": 0.5647861361503601, |
| "eval_runtime": 15.5273, |
| "eval_samples_per_second": 417.975, |
| "eval_steps_per_second": 13.074, |
| "step": 3700 |
| }, |
| { |
| "epoch": 2.1910959917172015, |
| "grad_norm": 0.22857585549354553, |
| "learning_rate": 6.756112990688974e-05, |
| "loss": 0.5896, |
| "step": 3705 |
| }, |
| { |
| "epoch": 2.1940541340038457, |
| "grad_norm": 0.22805197536945343, |
| "learning_rate": 6.707088858834962e-05, |
| "loss": 0.583, |
| "step": 3710 |
| }, |
| { |
| "epoch": 2.1970122762904896, |
| "grad_norm": 0.21297574043273926, |
| "learning_rate": 6.658208904122559e-05, |
| "loss": 0.5707, |
| "step": 3715 |
| }, |
| { |
| "epoch": 2.1999704185771334, |
| "grad_norm": 0.21938017010688782, |
| "learning_rate": 6.609473628868942e-05, |
| "loss": 0.5826, |
| "step": 3720 |
| }, |
| { |
| "epoch": 2.2029285608637776, |
| "grad_norm": 0.22642748057842255, |
| "learning_rate": 6.560883533904459e-05, |
| "loss": 0.5791, |
| "step": 3725 |
| }, |
| { |
| "epoch": 2.2058867031504215, |
| "grad_norm": 0.22415103018283844, |
| "learning_rate": 6.512439118567521e-05, |
| "loss": 0.5906, |
| "step": 3730 |
| }, |
| { |
| "epoch": 2.2088448454370657, |
| "grad_norm": 0.22948449850082397, |
| "learning_rate": 6.46414088069944e-05, |
| "loss": 0.5946, |
| "step": 3735 |
| }, |
| { |
| "epoch": 2.2118029877237095, |
| "grad_norm": 0.2156379073858261, |
| "learning_rate": 6.415989316639354e-05, |
| "loss": 0.58, |
| "step": 3740 |
| }, |
| { |
| "epoch": 2.2147611300103534, |
| "grad_norm": 0.21520055830478668, |
| "learning_rate": 6.367984921219066e-05, |
| "loss": 0.5611, |
| "step": 3745 |
| }, |
| { |
| "epoch": 2.2177192722969976, |
| "grad_norm": 0.23094354569911957, |
| "learning_rate": 6.320128187758033e-05, |
| "loss": 0.5992, |
| "step": 3750 |
| }, |
| { |
| "epoch": 2.2177192722969976, |
| "eval_loss": 0.5629158020019531, |
| "eval_runtime": 15.3089, |
| "eval_samples_per_second": 423.936, |
| "eval_steps_per_second": 13.26, |
| "step": 3750 |
| }, |
| { |
| "epoch": 2.2206774145836414, |
| "grad_norm": 0.2340443730354309, |
| "learning_rate": 6.272419608058222e-05, |
| "loss": 0.6019, |
| "step": 3755 |
| }, |
| { |
| "epoch": 2.2236355568702857, |
| "grad_norm": 0.22871170938014984, |
| "learning_rate": 6.224859672399101e-05, |
| "loss": 0.6042, |
| "step": 3760 |
| }, |
| { |
| "epoch": 2.2265936991569295, |
| "grad_norm": 0.22238165140151978, |
| "learning_rate": 6.17744886953261e-05, |
| "loss": 0.5862, |
| "step": 3765 |
| }, |
| { |
| "epoch": 2.2295518414435733, |
| "grad_norm": 0.23319736123085022, |
| "learning_rate": 6.130187686678089e-05, |
| "loss": 0.5803, |
| "step": 3770 |
| }, |
| { |
| "epoch": 2.2325099837302176, |
| "grad_norm": 0.22640018165111542, |
| "learning_rate": 6.0830766095173266e-05, |
| "loss": 0.5948, |
| "step": 3775 |
| }, |
| { |
| "epoch": 2.2354681260168614, |
| "grad_norm": 0.2193189114332199, |
| "learning_rate": 6.03611612218952e-05, |
| "loss": 0.5903, |
| "step": 3780 |
| }, |
| { |
| "epoch": 2.238426268303505, |
| "grad_norm": 0.22366072237491608, |
| "learning_rate": 5.989306707286349e-05, |
| "loss": 0.5957, |
| "step": 3785 |
| }, |
| { |
| "epoch": 2.2413844105901495, |
| "grad_norm": 0.21540489792823792, |
| "learning_rate": 5.942648845846961e-05, |
| "loss": 0.5798, |
| "step": 3790 |
| }, |
| { |
| "epoch": 2.2443425528767933, |
| "grad_norm": 0.21111348271369934, |
| "learning_rate": 5.896143017353086e-05, |
| "loss": 0.5841, |
| "step": 3795 |
| }, |
| { |
| "epoch": 2.2473006951634376, |
| "grad_norm": 0.21304626762866974, |
| "learning_rate": 5.849789699724059e-05, |
| "loss": 0.5901, |
| "step": 3800 |
| }, |
| { |
| "epoch": 2.2473006951634376, |
| "eval_loss": 0.5599969625473022, |
| "eval_runtime": 15.2714, |
| "eval_samples_per_second": 424.977, |
| "eval_steps_per_second": 13.293, |
| "step": 3800 |
| }, |
| { |
| "epoch": 2.2502588374500814, |
| "grad_norm": 0.2219642996788025, |
| "learning_rate": 5.803589369311938e-05, |
| "loss": 0.5921, |
| "step": 3805 |
| }, |
| { |
| "epoch": 2.253216979736725, |
| "grad_norm": 0.21248315274715424, |
| "learning_rate": 5.757542500896596e-05, |
| "loss": 0.5721, |
| "step": 3810 |
| }, |
| { |
| "epoch": 2.2561751220233695, |
| "grad_norm": 0.22615401446819305, |
| "learning_rate": 5.711649567680859e-05, |
| "loss": 0.5929, |
| "step": 3815 |
| }, |
| { |
| "epoch": 2.2591332643100133, |
| "grad_norm": 0.22166916728019714, |
| "learning_rate": 5.665911041285612e-05, |
| "loss": 0.5707, |
| "step": 3820 |
| }, |
| { |
| "epoch": 2.2620914065966575, |
| "grad_norm": 0.2249995470046997, |
| "learning_rate": 5.620327391744995e-05, |
| "loss": 0.5878, |
| "step": 3825 |
| }, |
| { |
| "epoch": 2.2650495488833013, |
| "grad_norm": 0.20594008266925812, |
| "learning_rate": 5.57489908750152e-05, |
| "loss": 0.5796, |
| "step": 3830 |
| }, |
| { |
| "epoch": 2.268007691169945, |
| "grad_norm": 0.2205434888601303, |
| "learning_rate": 5.5296265954013146e-05, |
| "loss": 0.5674, |
| "step": 3835 |
| }, |
| { |
| "epoch": 2.2709658334565894, |
| "grad_norm": 0.22647491097450256, |
| "learning_rate": 5.484510380689269e-05, |
| "loss": 0.5821, |
| "step": 3840 |
| }, |
| { |
| "epoch": 2.2739239757432332, |
| "grad_norm": 0.22615395486354828, |
| "learning_rate": 5.439550907004304e-05, |
| "loss": 0.5899, |
| "step": 3845 |
| }, |
| { |
| "epoch": 2.276882118029877, |
| "grad_norm": 0.2161322832107544, |
| "learning_rate": 5.394748636374572e-05, |
| "loss": 0.5791, |
| "step": 3850 |
| }, |
| { |
| "epoch": 2.276882118029877, |
| "eval_loss": 0.5578413009643555, |
| "eval_runtime": 15.1815, |
| "eval_samples_per_second": 427.494, |
| "eval_steps_per_second": 13.372, |
| "step": 3850 |
| }, |
| { |
| "epoch": 2.2798402603165213, |
| "grad_norm": 0.22290246188640594, |
| "learning_rate": 5.3501040292127126e-05, |
| "loss": 0.592, |
| "step": 3855 |
| }, |
| { |
| "epoch": 2.282798402603165, |
| "grad_norm": 0.22553138434886932, |
| "learning_rate": 5.305617544311153e-05, |
| "loss": 0.5808, |
| "step": 3860 |
| }, |
| { |
| "epoch": 2.2857565448898094, |
| "grad_norm": 0.2151021510362625, |
| "learning_rate": 5.2612896388373444e-05, |
| "loss": 0.5892, |
| "step": 3865 |
| }, |
| { |
| "epoch": 2.288714687176453, |
| "grad_norm": 0.22751715779304504, |
| "learning_rate": 5.217120768329112e-05, |
| "loss": 0.5763, |
| "step": 3870 |
| }, |
| { |
| "epoch": 2.291672829463097, |
| "grad_norm": 0.21743617951869965, |
| "learning_rate": 5.1731113866899264e-05, |
| "loss": 0.5873, |
| "step": 3875 |
| }, |
| { |
| "epoch": 2.2946309717497413, |
| "grad_norm": 0.21894121170043945, |
| "learning_rate": 5.12926194618429e-05, |
| "loss": 0.571, |
| "step": 3880 |
| }, |
| { |
| "epoch": 2.297589114036385, |
| "grad_norm": 0.21481330692768097, |
| "learning_rate": 5.085572897433036e-05, |
| "loss": 0.5832, |
| "step": 3885 |
| }, |
| { |
| "epoch": 2.3005472563230294, |
| "grad_norm": 0.2185722440481186, |
| "learning_rate": 5.042044689408748e-05, |
| "loss": 0.5796, |
| "step": 3890 |
| }, |
| { |
| "epoch": 2.303505398609673, |
| "grad_norm": 0.21149496734142303, |
| "learning_rate": 4.998677769431105e-05, |
| "loss": 0.5774, |
| "step": 3895 |
| }, |
| { |
| "epoch": 2.306463540896317, |
| "grad_norm": 0.2339731901884079, |
| "learning_rate": 4.9554725831623036e-05, |
| "loss": 0.5697, |
| "step": 3900 |
| }, |
| { |
| "epoch": 2.306463540896317, |
| "eval_loss": 0.5554924607276917, |
| "eval_runtime": 15.2313, |
| "eval_samples_per_second": 426.096, |
| "eval_steps_per_second": 13.328, |
| "step": 3900 |
| }, |
| { |
| "epoch": 2.3094216831829613, |
| "grad_norm": 0.21912769973278046, |
| "learning_rate": 4.9124295746024905e-05, |
| "loss": 0.5794, |
| "step": 3905 |
| }, |
| { |
| "epoch": 2.312379825469605, |
| "grad_norm": 0.21787823736667633, |
| "learning_rate": 4.869549186085165e-05, |
| "loss": 0.562, |
| "step": 3910 |
| }, |
| { |
| "epoch": 2.315337967756249, |
| "grad_norm": 0.2172250598669052, |
| "learning_rate": 4.8268318582726754e-05, |
| "loss": 0.5806, |
| "step": 3915 |
| }, |
| { |
| "epoch": 2.318296110042893, |
| "grad_norm": 0.21160916984081268, |
| "learning_rate": 4.784278030151647e-05, |
| "loss": 0.5779, |
| "step": 3920 |
| }, |
| { |
| "epoch": 2.321254252329537, |
| "grad_norm": 0.21556870639324188, |
| "learning_rate": 4.7418881390285164e-05, |
| "loss": 0.5773, |
| "step": 3925 |
| }, |
| { |
| "epoch": 2.3242123946161812, |
| "grad_norm": 0.2316085398197174, |
| "learning_rate": 4.699662620524988e-05, |
| "loss": 0.575, |
| "step": 3930 |
| }, |
| { |
| "epoch": 2.327170536902825, |
| "grad_norm": 0.2333250194787979, |
| "learning_rate": 4.657601908573614e-05, |
| "loss": 0.597, |
| "step": 3935 |
| }, |
| { |
| "epoch": 2.330128679189469, |
| "grad_norm": 0.21552544832229614, |
| "learning_rate": 4.6157064354132644e-05, |
| "loss": 0.5801, |
| "step": 3940 |
| }, |
| { |
| "epoch": 2.333086821476113, |
| "grad_norm": 0.2206254005432129, |
| "learning_rate": 4.573976631584764e-05, |
| "loss": 0.5875, |
| "step": 3945 |
| }, |
| { |
| "epoch": 2.336044963762757, |
| "grad_norm": 0.23494865000247955, |
| "learning_rate": 4.532412925926401e-05, |
| "loss": 0.5908, |
| "step": 3950 |
| }, |
| { |
| "epoch": 2.336044963762757, |
| "eval_loss": 0.5537571907043457, |
| "eval_runtime": 15.188, |
| "eval_samples_per_second": 427.311, |
| "eval_steps_per_second": 13.366, |
| "step": 3950 |
| }, |
| { |
| "epoch": 2.339003106049401, |
| "grad_norm": 0.2163960486650467, |
| "learning_rate": 4.491015745569572e-05, |
| "loss": 0.5735, |
| "step": 3955 |
| }, |
| { |
| "epoch": 2.341961248336045, |
| "grad_norm": 0.2299627661705017, |
| "learning_rate": 4.4497855159343435e-05, |
| "loss": 0.5574, |
| "step": 3960 |
| }, |
| { |
| "epoch": 2.344919390622689, |
| "grad_norm": 0.22644494473934174, |
| "learning_rate": 4.408722660725121e-05, |
| "loss": 0.5687, |
| "step": 3965 |
| }, |
| { |
| "epoch": 2.347877532909333, |
| "grad_norm": 0.21835680305957794, |
| "learning_rate": 4.3678276019262836e-05, |
| "loss": 0.5675, |
| "step": 3970 |
| }, |
| { |
| "epoch": 2.350835675195977, |
| "grad_norm": 0.22462695837020874, |
| "learning_rate": 4.32710075979782e-05, |
| "loss": 0.5843, |
| "step": 3975 |
| }, |
| { |
| "epoch": 2.3537938174826207, |
| "grad_norm": 0.22448210418224335, |
| "learning_rate": 4.28654255287106e-05, |
| "loss": 0.5863, |
| "step": 3980 |
| }, |
| { |
| "epoch": 2.356751959769265, |
| "grad_norm": 0.21931299567222595, |
| "learning_rate": 4.2461533979443276e-05, |
| "loss": 0.5733, |
| "step": 3985 |
| }, |
| { |
| "epoch": 2.359710102055909, |
| "grad_norm": 0.2160944640636444, |
| "learning_rate": 4.2059337100786736e-05, |
| "loss": 0.5796, |
| "step": 3990 |
| }, |
| { |
| "epoch": 2.362668244342553, |
| "grad_norm": 0.22542470693588257, |
| "learning_rate": 4.165883902593623e-05, |
| "loss": 0.5832, |
| "step": 3995 |
| }, |
| { |
| "epoch": 2.365626386629197, |
| "grad_norm": 0.2219017744064331, |
| "learning_rate": 4.12600438706292e-05, |
| "loss": 0.5875, |
| "step": 4000 |
| }, |
| { |
| "epoch": 2.365626386629197, |
| "eval_loss": 0.5519596338272095, |
| "eval_runtime": 15.192, |
| "eval_samples_per_second": 427.199, |
| "eval_steps_per_second": 13.362, |
| "step": 4000 |
| }, |
| { |
| "epoch": 2.3685845289158407, |
| "grad_norm": 0.21874652802944183, |
| "learning_rate": 4.086295573310277e-05, |
| "loss": 0.579, |
| "step": 4005 |
| }, |
| { |
| "epoch": 2.371542671202485, |
| "grad_norm": 0.21773286163806915, |
| "learning_rate": 4.0467578694052067e-05, |
| "loss": 0.6015, |
| "step": 4010 |
| }, |
| { |
| "epoch": 2.374500813489129, |
| "grad_norm": 0.22749686241149902, |
| "learning_rate": 4.007391681658778e-05, |
| "loss": 0.5737, |
| "step": 4015 |
| }, |
| { |
| "epoch": 2.377458955775773, |
| "grad_norm": 0.22802948951721191, |
| "learning_rate": 3.968197414619491e-05, |
| "loss": 0.5694, |
| "step": 4020 |
| }, |
| { |
| "epoch": 2.380417098062417, |
| "grad_norm": 0.21437138319015503, |
| "learning_rate": 3.929175471069067e-05, |
| "loss": 0.5741, |
| "step": 4025 |
| }, |
| { |
| "epoch": 2.3833752403490607, |
| "grad_norm": 0.22747749090194702, |
| "learning_rate": 3.8903262520183675e-05, |
| "loss": 0.5907, |
| "step": 4030 |
| }, |
| { |
| "epoch": 2.386333382635705, |
| "grad_norm": 0.21637846529483795, |
| "learning_rate": 3.851650156703215e-05, |
| "loss": 0.5689, |
| "step": 4035 |
| }, |
| { |
| "epoch": 2.3892915249223488, |
| "grad_norm": 0.21709904074668884, |
| "learning_rate": 3.81314758258033e-05, |
| "loss": 0.5813, |
| "step": 4040 |
| }, |
| { |
| "epoch": 2.3922496672089926, |
| "grad_norm": 0.22659163177013397, |
| "learning_rate": 3.7748189253232394e-05, |
| "loss": 0.5856, |
| "step": 4045 |
| }, |
| { |
| "epoch": 2.395207809495637, |
| "grad_norm": 0.21906381845474243, |
| "learning_rate": 3.736664578818191e-05, |
| "loss": 0.5832, |
| "step": 4050 |
| }, |
| { |
| "epoch": 2.395207809495637, |
| "eval_loss": 0.550140917301178, |
| "eval_runtime": 15.1756, |
| "eval_samples_per_second": 427.662, |
| "eval_steps_per_second": 13.377, |
| "step": 4050 |
| }, |
| { |
| "epoch": 2.3981659517822806, |
| "grad_norm": 0.22354574501514435, |
| "learning_rate": 3.6986849351601395e-05, |
| "loss": 0.5782, |
| "step": 4055 |
| }, |
| { |
| "epoch": 2.401124094068925, |
| "grad_norm": 0.2217273861169815, |
| "learning_rate": 3.660880384648673e-05, |
| "loss": 0.591, |
| "step": 4060 |
| }, |
| { |
| "epoch": 2.4040822363555687, |
| "grad_norm": 0.21969923377037048, |
| "learning_rate": 3.623251315784055e-05, |
| "loss": 0.5749, |
| "step": 4065 |
| }, |
| { |
| "epoch": 2.4070403786422125, |
| "grad_norm": 0.22072407603263855, |
| "learning_rate": 3.5857981152631714e-05, |
| "loss": 0.5727, |
| "step": 4070 |
| }, |
| { |
| "epoch": 2.409998520928857, |
| "grad_norm": 0.21851158142089844, |
| "learning_rate": 3.5485211679756226e-05, |
| "loss": 0.5737, |
| "step": 4075 |
| }, |
| { |
| "epoch": 2.4129566632155006, |
| "grad_norm": 0.22839130461215973, |
| "learning_rate": 3.51142085699971e-05, |
| "loss": 0.5833, |
| "step": 4080 |
| }, |
| { |
| "epoch": 2.415914805502145, |
| "grad_norm": 0.2259005606174469, |
| "learning_rate": 3.474497563598524e-05, |
| "loss": 0.5942, |
| "step": 4085 |
| }, |
| { |
| "epoch": 2.4188729477887887, |
| "grad_norm": 0.22297006845474243, |
| "learning_rate": 3.437751667216045e-05, |
| "loss": 0.5809, |
| "step": 4090 |
| }, |
| { |
| "epoch": 2.4218310900754325, |
| "grad_norm": 0.2227243185043335, |
| "learning_rate": 3.401183545473203e-05, |
| "loss": 0.5713, |
| "step": 4095 |
| }, |
| { |
| "epoch": 2.4247892323620768, |
| "grad_norm": 0.22505450248718262, |
| "learning_rate": 3.364793574164036e-05, |
| "loss": 0.5814, |
| "step": 4100 |
| }, |
| { |
| "epoch": 2.4247892323620768, |
| "eval_loss": 0.5483865737915039, |
| "eval_runtime": 15.2502, |
| "eval_samples_per_second": 425.567, |
| "eval_steps_per_second": 13.311, |
| "step": 4100 |
| }, |
| { |
| "epoch": 2.4277473746487206, |
| "grad_norm": 0.2335953265428543, |
| "learning_rate": 3.328582127251795e-05, |
| "loss": 0.5903, |
| "step": 4105 |
| }, |
| { |
| "epoch": 2.4307055169353644, |
| "grad_norm": 0.22900566458702087, |
| "learning_rate": 3.29254957686513e-05, |
| "loss": 0.5795, |
| "step": 4110 |
| }, |
| { |
| "epoch": 2.4336636592220087, |
| "grad_norm": 0.2234223335981369, |
| "learning_rate": 3.256696293294239e-05, |
| "loss": 0.5797, |
| "step": 4115 |
| }, |
| { |
| "epoch": 2.4366218015086525, |
| "grad_norm": 0.22404076159000397, |
| "learning_rate": 3.2210226449870985e-05, |
| "loss": 0.5707, |
| "step": 4120 |
| }, |
| { |
| "epoch": 2.4395799437952967, |
| "grad_norm": 0.2215253710746765, |
| "learning_rate": 3.185528998545622e-05, |
| "loss": 0.5675, |
| "step": 4125 |
| }, |
| { |
| "epoch": 2.4425380860819406, |
| "grad_norm": 0.22823339700698853, |
| "learning_rate": 3.150215718721953e-05, |
| "loss": 0.5782, |
| "step": 4130 |
| }, |
| { |
| "epoch": 2.4454962283685844, |
| "grad_norm": 0.21130341291427612, |
| "learning_rate": 3.1150831684146714e-05, |
| "loss": 0.5719, |
| "step": 4135 |
| }, |
| { |
| "epoch": 2.4484543706552286, |
| "grad_norm": 0.22080035507678986, |
| "learning_rate": 3.0801317086651016e-05, |
| "loss": 0.5729, |
| "step": 4140 |
| }, |
| { |
| "epoch": 2.4514125129418725, |
| "grad_norm": 0.21857194602489471, |
| "learning_rate": 3.0453616986535577e-05, |
| "loss": 0.5751, |
| "step": 4145 |
| }, |
| { |
| "epoch": 2.4543706552285167, |
| "grad_norm": 0.2172456979751587, |
| "learning_rate": 3.010773495695699e-05, |
| "loss": 0.5829, |
| "step": 4150 |
| }, |
| { |
| "epoch": 2.4543706552285167, |
| "eval_loss": 0.5469695925712585, |
| "eval_runtime": 15.3408, |
| "eval_samples_per_second": 423.054, |
| "eval_steps_per_second": 13.233, |
| "step": 4150 |
| }, |
| { |
| "epoch": 2.4573287975151605, |
| "grad_norm": 0.23181508481502533, |
| "learning_rate": 2.9763674552388183e-05, |
| "loss": 0.5826, |
| "step": 4155 |
| }, |
| { |
| "epoch": 2.4602869398018044, |
| "grad_norm": 0.21691949665546417, |
| "learning_rate": 2.9421439308582223e-05, |
| "loss": 0.5757, |
| "step": 4160 |
| }, |
| { |
| "epoch": 2.4632450820884486, |
| "grad_norm": 0.2233293354511261, |
| "learning_rate": 2.908103274253573e-05, |
| "loss": 0.586, |
| "step": 4165 |
| }, |
| { |
| "epoch": 2.4662032243750924, |
| "grad_norm": 0.22276481986045837, |
| "learning_rate": 2.87424583524528e-05, |
| "loss": 0.5745, |
| "step": 4170 |
| }, |
| { |
| "epoch": 2.4691613666617362, |
| "grad_norm": 0.22253195941448212, |
| "learning_rate": 2.8405719617709216e-05, |
| "loss": 0.5874, |
| "step": 4175 |
| }, |
| { |
| "epoch": 2.4721195089483805, |
| "grad_norm": 0.22543801367282867, |
| "learning_rate": 2.8070819998816428e-05, |
| "loss": 0.5726, |
| "step": 4180 |
| }, |
| { |
| "epoch": 2.4750776512350243, |
| "grad_norm": 0.22294208407402039, |
| "learning_rate": 2.7737762937386233e-05, |
| "loss": 0.579, |
| "step": 4185 |
| }, |
| { |
| "epoch": 2.4780357935216686, |
| "grad_norm": 0.22648243606090546, |
| "learning_rate": 2.7406551856095202e-05, |
| "loss": 0.5707, |
| "step": 4190 |
| }, |
| { |
| "epoch": 2.4809939358083124, |
| "grad_norm": 0.22971773147583008, |
| "learning_rate": 2.7077190158649696e-05, |
| "loss": 0.5892, |
| "step": 4195 |
| }, |
| { |
| "epoch": 2.483952078094956, |
| "grad_norm": 0.2182358354330063, |
| "learning_rate": 2.6749681229750704e-05, |
| "loss": 0.5656, |
| "step": 4200 |
| }, |
| { |
| "epoch": 2.483952078094956, |
| "eval_loss": 0.5456834435462952, |
| "eval_runtime": 15.2341, |
| "eval_samples_per_second": 426.017, |
| "eval_steps_per_second": 13.325, |
| "step": 4200 |
| }, |
| { |
| "epoch": 2.4869102203816005, |
| "grad_norm": 0.22291642427444458, |
| "learning_rate": 2.6424028435059256e-05, |
| "loss": 0.5852, |
| "step": 4205 |
| }, |
| { |
| "epoch": 2.4898683626682443, |
| "grad_norm": 0.21934635937213898, |
| "learning_rate": 2.6100235121161643e-05, |
| "loss": 0.5811, |
| "step": 4210 |
| }, |
| { |
| "epoch": 2.4928265049548886, |
| "grad_norm": 0.22124481201171875, |
| "learning_rate": 2.5778304615535083e-05, |
| "loss": 0.5646, |
| "step": 4215 |
| }, |
| { |
| "epoch": 2.4957846472415324, |
| "grad_norm": 0.2168821543455124, |
| "learning_rate": 2.5458240226513753e-05, |
| "loss": 0.5701, |
| "step": 4220 |
| }, |
| { |
| "epoch": 2.498742789528176, |
| "grad_norm": 0.22023184597492218, |
| "learning_rate": 2.5140045243254303e-05, |
| "loss": 0.5771, |
| "step": 4225 |
| }, |
| { |
| "epoch": 2.5017009318148204, |
| "grad_norm": 0.22466090321540833, |
| "learning_rate": 2.4823722935702658e-05, |
| "loss": 0.5746, |
| "step": 4230 |
| }, |
| { |
| "epoch": 2.5046590741014643, |
| "grad_norm": 0.21793463826179504, |
| "learning_rate": 2.4509276554559827e-05, |
| "loss": 0.5911, |
| "step": 4235 |
| }, |
| { |
| "epoch": 2.507617216388108, |
| "grad_norm": 0.22143520414829254, |
| "learning_rate": 2.4196709331248968e-05, |
| "loss": 0.5827, |
| "step": 4240 |
| }, |
| { |
| "epoch": 2.5105753586747523, |
| "grad_norm": 0.22911550104618073, |
| "learning_rate": 2.3886024477881854e-05, |
| "loss": 0.5873, |
| "step": 4245 |
| }, |
| { |
| "epoch": 2.513533500961396, |
| "grad_norm": 0.22519181668758392, |
| "learning_rate": 2.3577225187226116e-05, |
| "loss": 0.5883, |
| "step": 4250 |
| }, |
| { |
| "epoch": 2.513533500961396, |
| "eval_loss": 0.5445500612258911, |
| "eval_runtime": 15.2095, |
| "eval_samples_per_second": 426.706, |
| "eval_steps_per_second": 13.347, |
| "step": 4250 |
| }, |
| { |
| "epoch": 2.51649164324804, |
| "grad_norm": 0.22342784702777863, |
| "learning_rate": 2.3270314632672217e-05, |
| "loss": 0.5666, |
| "step": 4255 |
| }, |
| { |
| "epoch": 2.5194497855346842, |
| "grad_norm": 0.2204166203737259, |
| "learning_rate": 2.2965295968200944e-05, |
| "loss": 0.5656, |
| "step": 4260 |
| }, |
| { |
| "epoch": 2.522407927821328, |
| "grad_norm": 0.2240799218416214, |
| "learning_rate": 2.2662172328350975e-05, |
| "loss": 0.5842, |
| "step": 4265 |
| }, |
| { |
| "epoch": 2.5253660701079723, |
| "grad_norm": 0.22049422562122345, |
| "learning_rate": 2.2360946828186807e-05, |
| "loss": 0.5602, |
| "step": 4270 |
| }, |
| { |
| "epoch": 2.528324212394616, |
| "grad_norm": 0.2233700156211853, |
| "learning_rate": 2.20616225632664e-05, |
| "loss": 0.583, |
| "step": 4275 |
| }, |
| { |
| "epoch": 2.5312823546812604, |
| "grad_norm": 0.21494439244270325, |
| "learning_rate": 2.176420260960981e-05, |
| "loss": 0.5665, |
| "step": 4280 |
| }, |
| { |
| "epoch": 2.534240496967904, |
| "grad_norm": 0.22276660799980164, |
| "learning_rate": 2.146869002366714e-05, |
| "loss": 0.5828, |
| "step": 4285 |
| }, |
| { |
| "epoch": 2.537198639254548, |
| "grad_norm": 0.23662561178207397, |
| "learning_rate": 2.1175087842287453e-05, |
| "loss": 0.5875, |
| "step": 4290 |
| }, |
| { |
| "epoch": 2.5401567815411923, |
| "grad_norm": 0.2167738676071167, |
| "learning_rate": 2.0883399082687503e-05, |
| "loss": 0.579, |
| "step": 4295 |
| }, |
| { |
| "epoch": 2.543114923827836, |
| "grad_norm": 0.2297111451625824, |
| "learning_rate": 2.0593626742420543e-05, |
| "loss": 0.5786, |
| "step": 4300 |
| }, |
| { |
| "epoch": 2.543114923827836, |
| "eval_loss": 0.5430043935775757, |
| "eval_runtime": 15.2067, |
| "eval_samples_per_second": 426.786, |
| "eval_steps_per_second": 13.349, |
| "step": 4300 |
| }, |
| { |
| "epoch": 2.54607306611448, |
| "grad_norm": 0.2167220413684845, |
| "learning_rate": 2.0305773799345715e-05, |
| "loss": 0.5746, |
| "step": 4305 |
| }, |
| { |
| "epoch": 2.549031208401124, |
| "grad_norm": 0.21370890736579895, |
| "learning_rate": 2.0019843211597343e-05, |
| "loss": 0.5808, |
| "step": 4310 |
| }, |
| { |
| "epoch": 2.551989350687768, |
| "grad_norm": 0.21757473051548004, |
| "learning_rate": 1.9735837917554708e-05, |
| "loss": 0.5833, |
| "step": 4315 |
| }, |
| { |
| "epoch": 2.554947492974412, |
| "grad_norm": 0.22231775522232056, |
| "learning_rate": 1.9453760835811493e-05, |
| "loss": 0.5669, |
| "step": 4320 |
| }, |
| { |
| "epoch": 2.557905635261056, |
| "grad_norm": 0.23005709052085876, |
| "learning_rate": 1.9173614865146273e-05, |
| "loss": 0.5765, |
| "step": 4325 |
| }, |
| { |
| "epoch": 2.5608637775477, |
| "grad_norm": 0.22354349493980408, |
| "learning_rate": 1.889540288449228e-05, |
| "loss": 0.5687, |
| "step": 4330 |
| }, |
| { |
| "epoch": 2.563821919834344, |
| "grad_norm": 0.21966968476772308, |
| "learning_rate": 1.8619127752908098e-05, |
| "loss": 0.5756, |
| "step": 4335 |
| }, |
| { |
| "epoch": 2.566780062120988, |
| "grad_norm": 0.21885375678539276, |
| "learning_rate": 1.8344792309548108e-05, |
| "loss": 0.5739, |
| "step": 4340 |
| }, |
| { |
| "epoch": 2.5697382044076322, |
| "grad_norm": 0.22621390223503113, |
| "learning_rate": 1.8072399373633515e-05, |
| "loss": 0.5851, |
| "step": 4345 |
| }, |
| { |
| "epoch": 2.572696346694276, |
| "grad_norm": 0.2224331647157669, |
| "learning_rate": 1.7801951744423186e-05, |
| "loss": 0.5824, |
| "step": 4350 |
| }, |
| { |
| "epoch": 2.572696346694276, |
| "eval_loss": 0.5423793792724609, |
| "eval_runtime": 15.1763, |
| "eval_samples_per_second": 427.641, |
| "eval_steps_per_second": 13.376, |
| "step": 4350 |
| }, |
| { |
| "epoch": 2.57565448898092, |
| "grad_norm": 0.23509664833545685, |
| "learning_rate": 1.7533452201184873e-05, |
| "loss": 0.5856, |
| "step": 4355 |
| }, |
| { |
| "epoch": 2.578612631267564, |
| "grad_norm": 0.22954770922660828, |
| "learning_rate": 1.7266903503166882e-05, |
| "loss": 0.5799, |
| "step": 4360 |
| }, |
| { |
| "epoch": 2.581570773554208, |
| "grad_norm": 0.2257954627275467, |
| "learning_rate": 1.7002308389569457e-05, |
| "loss": 0.5725, |
| "step": 4365 |
| }, |
| { |
| "epoch": 2.5845289158408518, |
| "grad_norm": 0.23301996290683746, |
| "learning_rate": 1.673966957951685e-05, |
| "loss": 0.5738, |
| "step": 4370 |
| }, |
| { |
| "epoch": 2.587487058127496, |
| "grad_norm": 0.22801564633846283, |
| "learning_rate": 1.6478989772029073e-05, |
| "loss": 0.5701, |
| "step": 4375 |
| }, |
| { |
| "epoch": 2.59044520041414, |
| "grad_norm": 0.20772784948349, |
| "learning_rate": 1.622027164599458e-05, |
| "loss": 0.5921, |
| "step": 4380 |
| }, |
| { |
| "epoch": 2.5934033427007837, |
| "grad_norm": 0.22121798992156982, |
| "learning_rate": 1.5963517860142358e-05, |
| "loss": 0.5858, |
| "step": 4385 |
| }, |
| { |
| "epoch": 2.596361484987428, |
| "grad_norm": 0.21859820187091827, |
| "learning_rate": 1.5708731053014873e-05, |
| "loss": 0.5699, |
| "step": 4390 |
| }, |
| { |
| "epoch": 2.5993196272740717, |
| "grad_norm": 0.2150518298149109, |
| "learning_rate": 1.5455913842940675e-05, |
| "loss": 0.566, |
| "step": 4395 |
| }, |
| { |
| "epoch": 2.602277769560716, |
| "grad_norm": 0.22459477186203003, |
| "learning_rate": 1.5205068828007849e-05, |
| "loss": 0.5802, |
| "step": 4400 |
| }, |
| { |
| "epoch": 2.602277769560716, |
| "eval_loss": 0.54107266664505, |
| "eval_runtime": 15.154, |
| "eval_samples_per_second": 428.269, |
| "eval_steps_per_second": 13.396, |
| "step": 4400 |
| }, |
| { |
| "epoch": 2.60523591184736, |
| "grad_norm": 0.22681817412376404, |
| "learning_rate": 1.4956198586036965e-05, |
| "loss": 0.5926, |
| "step": 4405 |
| }, |
| { |
| "epoch": 2.608194054134004, |
| "grad_norm": 0.2272169440984726, |
| "learning_rate": 1.4709305674554852e-05, |
| "loss": 0.5857, |
| "step": 4410 |
| }, |
| { |
| "epoch": 2.611152196420648, |
| "grad_norm": 0.22858625650405884, |
| "learning_rate": 1.4464392630768207e-05, |
| "loss": 0.5673, |
| "step": 4415 |
| }, |
| { |
| "epoch": 2.6141103387072917, |
| "grad_norm": 0.22836680710315704, |
| "learning_rate": 1.4221461971537435e-05, |
| "loss": 0.5648, |
| "step": 4420 |
| }, |
| { |
| "epoch": 2.617068480993936, |
| "grad_norm": 0.22064611315727234, |
| "learning_rate": 1.3980516193350969e-05, |
| "loss": 0.578, |
| "step": 4425 |
| }, |
| { |
| "epoch": 2.62002662328058, |
| "grad_norm": 0.21425370872020721, |
| "learning_rate": 1.3741557772299449e-05, |
| "loss": 0.5629, |
| "step": 4430 |
| }, |
| { |
| "epoch": 2.6229847655672236, |
| "grad_norm": 0.23086489737033844, |
| "learning_rate": 1.3504589164050405e-05, |
| "loss": 0.5916, |
| "step": 4435 |
| }, |
| { |
| "epoch": 2.625942907853868, |
| "grad_norm": 0.21682678163051605, |
| "learning_rate": 1.3269612803822861e-05, |
| "loss": 0.5628, |
| "step": 4440 |
| }, |
| { |
| "epoch": 2.6289010501405117, |
| "grad_norm": 0.2231551855802536, |
| "learning_rate": 1.3036631106362562e-05, |
| "loss": 0.5793, |
| "step": 4445 |
| }, |
| { |
| "epoch": 2.6318591924271555, |
| "grad_norm": 0.2130296528339386, |
| "learning_rate": 1.2805646465916838e-05, |
| "loss": 0.563, |
| "step": 4450 |
| }, |
| { |
| "epoch": 2.6318591924271555, |
| "eval_loss": 0.5402519106864929, |
| "eval_runtime": 15.1991, |
| "eval_samples_per_second": 426.999, |
| "eval_steps_per_second": 13.356, |
| "step": 4450 |
| }, |
| { |
| "epoch": 2.6348173347137998, |
| "grad_norm": 0.225514218211174, |
| "learning_rate": 1.257666125621033e-05, |
| "loss": 0.5828, |
| "step": 4455 |
| }, |
| { |
| "epoch": 2.6377754770004436, |
| "grad_norm": 0.21802076697349548, |
| "learning_rate": 1.2349677830420293e-05, |
| "loss": 0.5674, |
| "step": 4460 |
| }, |
| { |
| "epoch": 2.640733619287088, |
| "grad_norm": 0.22092710435390472, |
| "learning_rate": 1.2124698521152674e-05, |
| "loss": 0.5715, |
| "step": 4465 |
| }, |
| { |
| "epoch": 2.6436917615737316, |
| "grad_norm": 0.2172520011663437, |
| "learning_rate": 1.1901725640417918e-05, |
| "loss": 0.5695, |
| "step": 4470 |
| }, |
| { |
| "epoch": 2.646649903860376, |
| "grad_norm": 0.2219080626964569, |
| "learning_rate": 1.1680761479607432e-05, |
| "loss": 0.58, |
| "step": 4475 |
| }, |
| { |
| "epoch": 2.6496080461470197, |
| "grad_norm": 0.22562995553016663, |
| "learning_rate": 1.1461808309469787e-05, |
| "loss": 0.5764, |
| "step": 4480 |
| }, |
| { |
| "epoch": 2.6525661884336635, |
| "grad_norm": 0.22420786321163177, |
| "learning_rate": 1.1244868380087579e-05, |
| "loss": 0.576, |
| "step": 4485 |
| }, |
| { |
| "epoch": 2.655524330720308, |
| "grad_norm": 0.2163666933774948, |
| "learning_rate": 1.1029943920854286e-05, |
| "loss": 0.5697, |
| "step": 4490 |
| }, |
| { |
| "epoch": 2.6584824730069516, |
| "grad_norm": 0.22043688595294952, |
| "learning_rate": 1.0817037140451184e-05, |
| "loss": 0.5934, |
| "step": 4495 |
| }, |
| { |
| "epoch": 2.6614406152935954, |
| "grad_norm": 0.22016723453998566, |
| "learning_rate": 1.0606150226824918e-05, |
| "loss": 0.584, |
| "step": 4500 |
| }, |
| { |
| "epoch": 2.6614406152935954, |
| "eval_loss": 0.539762556552887, |
| "eval_runtime": 15.2106, |
| "eval_samples_per_second": 426.675, |
| "eval_steps_per_second": 13.346, |
| "step": 4500 |
| }, |
| { |
| "epoch": 2.6643987575802397, |
| "grad_norm": 0.22158069908618927, |
| "learning_rate": 1.039728534716478e-05, |
| "loss": 0.5703, |
| "step": 4505 |
| }, |
| { |
| "epoch": 2.6673568998668835, |
| "grad_norm": 0.22302477061748505, |
| "learning_rate": 1.0190444647880609e-05, |
| "loss": 0.5894, |
| "step": 4510 |
| }, |
| { |
| "epoch": 2.6703150421535273, |
| "grad_norm": 0.22093521058559418, |
| "learning_rate": 9.98563025458055e-06, |
| "loss": 0.5745, |
| "step": 4515 |
| }, |
| { |
| "epoch": 2.6732731844401716, |
| "grad_norm": 0.2225552648305893, |
| "learning_rate": 9.78284427204948e-06, |
| "loss": 0.5729, |
| "step": 4520 |
| }, |
| { |
| "epoch": 2.6762313267268154, |
| "grad_norm": 0.227900430560112, |
| "learning_rate": 9.582088784227052e-06, |
| "loss": 0.5939, |
| "step": 4525 |
| }, |
| { |
| "epoch": 2.6791894690134597, |
| "grad_norm": 0.21942593157291412, |
| "learning_rate": 9.3833658541865e-06, |
| "loss": 0.5775, |
| "step": 4530 |
| }, |
| { |
| "epoch": 2.6821476113001035, |
| "grad_norm": 0.22814010083675385, |
| "learning_rate": 9.186677524113473e-06, |
| "loss": 0.5763, |
| "step": 4535 |
| }, |
| { |
| "epoch": 2.6851057535867477, |
| "grad_norm": 0.2286267876625061, |
| "learning_rate": 8.992025815284826e-06, |
| "loss": 0.5765, |
| "step": 4540 |
| }, |
| { |
| "epoch": 2.6880638958733916, |
| "grad_norm": 0.22250616550445557, |
| "learning_rate": 8.799412728048058e-06, |
| "loss": 0.5762, |
| "step": 4545 |
| }, |
| { |
| "epoch": 2.6910220381600354, |
| "grad_norm": 0.22371500730514526, |
| "learning_rate": 8.608840241800641e-06, |
| "loss": 0.5744, |
| "step": 4550 |
| }, |
| { |
| "epoch": 2.6910220381600354, |
| "eval_loss": 0.5388516187667847, |
| "eval_runtime": 15.2087, |
| "eval_samples_per_second": 426.73, |
| "eval_steps_per_second": 13.348, |
| "step": 4550 |
| }, |
| { |
| "epoch": 2.6939801804466796, |
| "grad_norm": 0.22178302705287933, |
| "learning_rate": 8.420310314969735e-06, |
| "loss": 0.5766, |
| "step": 4555 |
| }, |
| { |
| "epoch": 2.6969383227333235, |
| "grad_norm": 0.21156929433345795, |
| "learning_rate": 8.23382488499205e-06, |
| "loss": 0.5777, |
| "step": 4560 |
| }, |
| { |
| "epoch": 2.6998964650199673, |
| "grad_norm": 0.22891399264335632, |
| "learning_rate": 8.049385868293896e-06, |
| "loss": 0.5634, |
| "step": 4565 |
| }, |
| { |
| "epoch": 2.7028546073066115, |
| "grad_norm": 0.22337764501571655, |
| "learning_rate": 7.866995160271555e-06, |
| "loss": 0.5713, |
| "step": 4570 |
| }, |
| { |
| "epoch": 2.7058127495932554, |
| "grad_norm": 0.22055917978286743, |
| "learning_rate": 7.686654635271734e-06, |
| "loss": 0.5663, |
| "step": 4575 |
| }, |
| { |
| "epoch": 2.708770891879899, |
| "grad_norm": 0.2214854657649994, |
| "learning_rate": 7.508366146572334e-06, |
| "loss": 0.5647, |
| "step": 4580 |
| }, |
| { |
| "epoch": 2.7117290341665434, |
| "grad_norm": 0.22195084393024445, |
| "learning_rate": 7.3321315263634685e-06, |
| "loss": 0.5661, |
| "step": 4585 |
| }, |
| { |
| "epoch": 2.7146871764531872, |
| "grad_norm": 0.22451895475387573, |
| "learning_rate": 7.157952585728481e-06, |
| "loss": 0.5856, |
| "step": 4590 |
| }, |
| { |
| "epoch": 2.7176453187398315, |
| "grad_norm": 0.22339747846126556, |
| "learning_rate": 6.985831114625555e-06, |
| "loss": 0.5811, |
| "step": 4595 |
| }, |
| { |
| "epoch": 2.7206034610264753, |
| "grad_norm": 0.216370090842247, |
| "learning_rate": 6.815768881869047e-06, |
| "loss": 0.575, |
| "step": 4600 |
| }, |
| { |
| "epoch": 2.7206034610264753, |
| "eval_loss": 0.5387815833091736, |
| "eval_runtime": 15.2306, |
| "eval_samples_per_second": 426.115, |
| "eval_steps_per_second": 13.328, |
| "step": 4600 |
| }, |
| { |
| "epoch": 2.7235616033131196, |
| "grad_norm": 0.2195269763469696, |
| "learning_rate": 6.647767635111566e-06, |
| "loss": 0.5778, |
| "step": 4605 |
| }, |
| { |
| "epoch": 2.7265197455997634, |
| "grad_norm": 0.21644777059555054, |
| "learning_rate": 6.481829100825816e-06, |
| "loss": 0.5563, |
| "step": 4610 |
| }, |
| { |
| "epoch": 2.729477887886407, |
| "grad_norm": 0.21869517862796783, |
| "learning_rate": 6.317954984287005e-06, |
| "loss": 0.569, |
| "step": 4615 |
| }, |
| { |
| "epoch": 2.7324360301730515, |
| "grad_norm": 0.22483299672603607, |
| "learning_rate": 6.156146969555277e-06, |
| "loss": 0.5699, |
| "step": 4620 |
| }, |
| { |
| "epoch": 2.7353941724596953, |
| "grad_norm": 0.2190885692834854, |
| "learning_rate": 5.996406719458241e-06, |
| "loss": 0.5756, |
| "step": 4625 |
| }, |
| { |
| "epoch": 2.738352314746339, |
| "grad_norm": 0.21631799638271332, |
| "learning_rate": 5.838735875574182e-06, |
| "loss": 0.5848, |
| "step": 4630 |
| }, |
| { |
| "epoch": 2.7413104570329834, |
| "grad_norm": 0.22044958174228668, |
| "learning_rate": 5.6831360582149405e-06, |
| "loss": 0.5806, |
| "step": 4635 |
| }, |
| { |
| "epoch": 2.744268599319627, |
| "grad_norm": 0.2257104516029358, |
| "learning_rate": 5.529608866409443e-06, |
| "loss": 0.5553, |
| "step": 4640 |
| }, |
| { |
| "epoch": 2.747226741606271, |
| "grad_norm": 0.21883299946784973, |
| "learning_rate": 5.378155877887042e-06, |
| "loss": 0.5758, |
| "step": 4645 |
| }, |
| { |
| "epoch": 2.7501848838929153, |
| "grad_norm": 0.21661600470542908, |
| "learning_rate": 5.2287786490616e-06, |
| "loss": 0.5644, |
| "step": 4650 |
| }, |
| { |
| "epoch": 2.7501848838929153, |
| "eval_loss": 0.5379989743232727, |
| "eval_runtime": 15.2107, |
| "eval_samples_per_second": 426.673, |
| "eval_steps_per_second": 13.346, |
| "step": 4650 |
| }, |
| { |
| "epoch": 2.753143026179559, |
| "grad_norm": 0.22663559019565582, |
| "learning_rate": 5.081478715015193e-06, |
| "loss": 0.5832, |
| "step": 4655 |
| }, |
| { |
| "epoch": 2.7561011684662033, |
| "grad_norm": 0.2192746102809906, |
| "learning_rate": 4.93625758948264e-06, |
| "loss": 0.572, |
| "step": 4660 |
| }, |
| { |
| "epoch": 2.759059310752847, |
| "grad_norm": 0.21773546934127808, |
| "learning_rate": 4.793116764835617e-06, |
| "loss": 0.5719, |
| "step": 4665 |
| }, |
| { |
| "epoch": 2.7620174530394914, |
| "grad_norm": 0.22200337052345276, |
| "learning_rate": 4.652057712067575e-06, |
| "loss": 0.5696, |
| "step": 4670 |
| }, |
| { |
| "epoch": 2.7649755953261352, |
| "grad_norm": 0.2322062999010086, |
| "learning_rate": 4.513081880778574e-06, |
| "loss": 0.5866, |
| "step": 4675 |
| }, |
| { |
| "epoch": 2.767933737612779, |
| "grad_norm": 0.22154580056667328, |
| "learning_rate": 4.376190699160239e-06, |
| "loss": 0.5669, |
| "step": 4680 |
| }, |
| { |
| "epoch": 2.7708918798994233, |
| "grad_norm": 0.21697700023651123, |
| "learning_rate": 4.241385573981337e-06, |
| "loss": 0.5948, |
| "step": 4685 |
| }, |
| { |
| "epoch": 2.773850022186067, |
| "grad_norm": 0.22418950498104095, |
| "learning_rate": 4.108667890573057e-06, |
| "loss": 0.5685, |
| "step": 4690 |
| }, |
| { |
| "epoch": 2.776808164472711, |
| "grad_norm": 0.2229933887720108, |
| "learning_rate": 3.978039012814971e-06, |
| "loss": 0.5821, |
| "step": 4695 |
| }, |
| { |
| "epoch": 2.779766306759355, |
| "grad_norm": 0.23993420600891113, |
| "learning_rate": 3.84950028312085e-06, |
| "loss": 0.5829, |
| "step": 4700 |
| }, |
| { |
| "epoch": 2.779766306759355, |
| "eval_loss": 0.5377324223518372, |
| "eval_runtime": 15.191, |
| "eval_samples_per_second": 427.228, |
| "eval_steps_per_second": 13.363, |
| "step": 4700 |
| }, |
| { |
| "epoch": 2.782724449045999, |
| "grad_norm": 0.2284260392189026, |
| "learning_rate": 3.7230530224251017e-06, |
| "loss": 0.5808, |
| "step": 4705 |
| }, |
| { |
| "epoch": 2.785682591332643, |
| "grad_norm": 0.21950730681419373, |
| "learning_rate": 3.5986985301689156e-06, |
| "loss": 0.5776, |
| "step": 4710 |
| }, |
| { |
| "epoch": 2.788640733619287, |
| "grad_norm": 0.22390055656433105, |
| "learning_rate": 3.4764380842871153e-06, |
| "loss": 0.5694, |
| "step": 4715 |
| }, |
| { |
| "epoch": 2.791598875905931, |
| "grad_norm": 0.22625946998596191, |
| "learning_rate": 3.356272941194918e-06, |
| "loss": 0.5732, |
| "step": 4720 |
| }, |
| { |
| "epoch": 2.794557018192575, |
| "grad_norm": 0.23500940203666687, |
| "learning_rate": 3.2382043357751384e-06, |
| "loss": 0.5641, |
| "step": 4725 |
| }, |
| { |
| "epoch": 2.797515160479219, |
| "grad_norm": 0.22036881744861603, |
| "learning_rate": 3.122233481365339e-06, |
| "loss": 0.5807, |
| "step": 4730 |
| }, |
| { |
| "epoch": 2.8004733027658633, |
| "grad_norm": 0.23122946918010712, |
| "learning_rate": 3.008361569745513e-06, |
| "loss": 0.5762, |
| "step": 4735 |
| }, |
| { |
| "epoch": 2.803431445052507, |
| "grad_norm": 0.23130850493907928, |
| "learning_rate": 2.8965897711257245e-06, |
| "loss": 0.567, |
| "step": 4740 |
| }, |
| { |
| "epoch": 2.806389587339151, |
| "grad_norm": 0.22971408069133759, |
| "learning_rate": 2.7869192341341095e-06, |
| "loss": 0.5662, |
| "step": 4745 |
| }, |
| { |
| "epoch": 2.809347729625795, |
| "grad_norm": 0.21526266634464264, |
| "learning_rate": 2.6793510858051828e-06, |
| "loss": 0.5702, |
| "step": 4750 |
| }, |
| { |
| "epoch": 2.809347729625795, |
| "eval_loss": 0.5374576449394226, |
| "eval_runtime": 15.2056, |
| "eval_samples_per_second": 426.815, |
| "eval_steps_per_second": 13.35, |
| "step": 4750 |
| }, |
| { |
| "epoch": 2.812305871912439, |
| "grad_norm": 0.22544841468334198, |
| "learning_rate": 2.5738864315680513e-06, |
| "loss": 0.5679, |
| "step": 4755 |
| }, |
| { |
| "epoch": 2.815264014199083, |
| "grad_norm": 0.22757330536842346, |
| "learning_rate": 2.470526355235246e-06, |
| "loss": 0.5632, |
| "step": 4760 |
| }, |
| { |
| "epoch": 2.818222156485727, |
| "grad_norm": 0.22844068706035614, |
| "learning_rate": 2.3692719189914185e-06, |
| "loss": 0.5752, |
| "step": 4765 |
| }, |
| { |
| "epoch": 2.821180298772371, |
| "grad_norm": 0.22197550535202026, |
| "learning_rate": 2.270124163382614e-06, |
| "loss": 0.5501, |
| "step": 4770 |
| }, |
| { |
| "epoch": 2.8241384410590147, |
| "grad_norm": 0.21858546137809753, |
| "learning_rate": 2.173084107305403e-06, |
| "loss": 0.5726, |
| "step": 4775 |
| }, |
| { |
| "epoch": 2.827096583345659, |
| "grad_norm": 0.22059139609336853, |
| "learning_rate": 2.0781527479965216e-06, |
| "loss": 0.5783, |
| "step": 4780 |
| }, |
| { |
| "epoch": 2.8300547256323028, |
| "grad_norm": 0.2304689586162567, |
| "learning_rate": 1.9853310610225355e-06, |
| "loss": 0.5821, |
| "step": 4785 |
| }, |
| { |
| "epoch": 2.833012867918947, |
| "grad_norm": 0.22385703027248383, |
| "learning_rate": 1.8946200002699386e-06, |
| "loss": 0.5908, |
| "step": 4790 |
| }, |
| { |
| "epoch": 2.835971010205591, |
| "grad_norm": 0.2215282917022705, |
| "learning_rate": 1.806020497935185e-06, |
| "loss": 0.5644, |
| "step": 4795 |
| }, |
| { |
| "epoch": 2.838929152492235, |
| "grad_norm": 0.21840998530387878, |
| "learning_rate": 1.7195334645152737e-06, |
| "loss": 0.5814, |
| "step": 4800 |
| }, |
| { |
| "epoch": 2.838929152492235, |
| "eval_loss": 0.5373325943946838, |
| "eval_runtime": 15.2351, |
| "eval_samples_per_second": 425.989, |
| "eval_steps_per_second": 13.324, |
| "step": 4800 |
| }, |
| { |
| "epoch": 2.841887294778879, |
| "grad_norm": 0.22534750401973724, |
| "learning_rate": 1.6351597887982846e-06, |
| "loss": 0.581, |
| "step": 4805 |
| }, |
| { |
| "epoch": 2.8448454370655227, |
| "grad_norm": 0.23928098380565643, |
| "learning_rate": 1.5529003378542404e-06, |
| "loss": 0.5837, |
| "step": 4810 |
| }, |
| { |
| "epoch": 2.847803579352167, |
| "grad_norm": 0.21647833287715912, |
| "learning_rate": 1.4727559570263333e-06, |
| "loss": 0.5701, |
| "step": 4815 |
| }, |
| { |
| "epoch": 2.850761721638811, |
| "grad_norm": 0.2176506221294403, |
| "learning_rate": 1.3947274699220398e-06, |
| "loss": 0.5626, |
| "step": 4820 |
| }, |
| { |
| "epoch": 2.8537198639254546, |
| "grad_norm": 0.21065934002399445, |
| "learning_rate": 1.3188156784048088e-06, |
| "loss": 0.5686, |
| "step": 4825 |
| }, |
| { |
| "epoch": 2.856678006212099, |
| "grad_norm": 0.22182585299015045, |
| "learning_rate": 1.2450213625857274e-06, |
| "loss": 0.5761, |
| "step": 4830 |
| }, |
| { |
| "epoch": 2.8596361484987427, |
| "grad_norm": 0.21298271417617798, |
| "learning_rate": 1.1733452808156017e-06, |
| "loss": 0.5867, |
| "step": 4835 |
| }, |
| { |
| "epoch": 2.8625942907853865, |
| "grad_norm": 0.229048490524292, |
| "learning_rate": 1.103788169677036e-06, |
| "loss": 0.589, |
| "step": 4840 |
| }, |
| { |
| "epoch": 2.865552433072031, |
| "grad_norm": 0.2213655412197113, |
| "learning_rate": 1.0363507439769986e-06, |
| "loss": 0.5597, |
| "step": 4845 |
| }, |
| { |
| "epoch": 2.8685105753586746, |
| "grad_norm": 0.21822868287563324, |
| "learning_rate": 9.7103369673936e-07, |
| "loss": 0.5712, |
| "step": 4850 |
| }, |
| { |
| "epoch": 2.8685105753586746, |
| "eval_loss": 0.5373578667640686, |
| "eval_runtime": 15.1783, |
| "eval_samples_per_second": 427.584, |
| "eval_steps_per_second": 13.374, |
| "step": 4850 |
| }, |
| { |
| "epoch": 2.871468717645319, |
| "grad_norm": 0.22016650438308716, |
| "learning_rate": 9.078376991978266e-07, |
| "loss": 0.5587, |
| "step": 4855 |
| }, |
| { |
| "epoch": 2.8744268599319627, |
| "grad_norm": 0.23947712779045105, |
| "learning_rate": 8.467634007890796e-07, |
| "loss": 0.5841, |
| "step": 4860 |
| }, |
| { |
| "epoch": 2.877385002218607, |
| "grad_norm": 0.2243824005126953, |
| "learning_rate": 7.878114291460063e-07, |
| "loss": 0.5736, |
| "step": 4865 |
| }, |
| { |
| "epoch": 2.8803431445052508, |
| "grad_norm": 0.22133906185626984, |
| "learning_rate": 7.309823900913461e-07, |
| "loss": 0.5764, |
| "step": 4870 |
| }, |
| { |
| "epoch": 2.8833012867918946, |
| "grad_norm": 0.21976634860038757, |
| "learning_rate": 6.76276867631405e-07, |
| "loss": 0.5699, |
| "step": 4875 |
| }, |
| { |
| "epoch": 2.886259429078539, |
| "grad_norm": 0.22008314728736877, |
| "learning_rate": 6.236954239500471e-07, |
| "loss": 0.5527, |
| "step": 4880 |
| }, |
| { |
| "epoch": 2.8892175713651826, |
| "grad_norm": 0.22807146608829498, |
| "learning_rate": 5.732385994029618e-07, |
| "loss": 0.5943, |
| "step": 4885 |
| }, |
| { |
| "epoch": 2.8921757136518265, |
| "grad_norm": 0.22938776016235352, |
| "learning_rate": 5.249069125121154e-07, |
| "loss": 0.5825, |
| "step": 4890 |
| }, |
| { |
| "epoch": 2.8951338559384707, |
| "grad_norm": 0.20941923558712006, |
| "learning_rate": 4.787008599603642e-07, |
| "loss": 0.5685, |
| "step": 4895 |
| }, |
| { |
| "epoch": 2.8980919982251145, |
| "grad_norm": 0.22085338830947876, |
| "learning_rate": 4.346209165863655e-07, |
| "loss": 0.5588, |
| "step": 4900 |
| }, |
| { |
| "epoch": 2.8980919982251145, |
| "eval_loss": 0.5373329520225525, |
| "eval_runtime": 15.2559, |
| "eval_samples_per_second": 425.409, |
| "eval_steps_per_second": 13.306, |
| "step": 4900 |
| }, |
| { |
| "epoch": 2.9010501405117584, |
| "grad_norm": 0.22424866259098053, |
| "learning_rate": 3.926675353797443e-07, |
| "loss": 0.5725, |
| "step": 4905 |
| }, |
| { |
| "epoch": 2.9040082827984026, |
| "grad_norm": 0.2182874232530594, |
| "learning_rate": 3.5284114747641856e-07, |
| "loss": 0.5582, |
| "step": 4910 |
| }, |
| { |
| "epoch": 2.9069664250850464, |
| "grad_norm": 0.21973784267902374, |
| "learning_rate": 3.151421621541335e-07, |
| "loss": 0.5684, |
| "step": 4915 |
| }, |
| { |
| "epoch": 2.9099245673716907, |
| "grad_norm": 0.2083846479654312, |
| "learning_rate": 2.795709668283172e-07, |
| "loss": 0.578, |
| "step": 4920 |
| }, |
| { |
| "epoch": 2.9128827096583345, |
| "grad_norm": 0.2196836769580841, |
| "learning_rate": 2.4612792704798287e-07, |
| "loss": 0.5603, |
| "step": 4925 |
| }, |
| { |
| "epoch": 2.9158408519449788, |
| "grad_norm": 0.22254040837287903, |
| "learning_rate": 2.1481338649216013e-07, |
| "loss": 0.5526, |
| "step": 4930 |
| }, |
| { |
| "epoch": 2.9187989942316226, |
| "grad_norm": 0.2200893610715866, |
| "learning_rate": 1.8562766696618855e-07, |
| "loss": 0.5661, |
| "step": 4935 |
| }, |
| { |
| "epoch": 2.9217571365182664, |
| "grad_norm": 0.22102928161621094, |
| "learning_rate": 1.5857106839847136e-07, |
| "loss": 0.5905, |
| "step": 4940 |
| }, |
| { |
| "epoch": 2.9247152788049107, |
| "grad_norm": 0.2244081199169159, |
| "learning_rate": 1.3364386883745962e-07, |
| "loss": 0.5743, |
| "step": 4945 |
| }, |
| { |
| "epoch": 2.9276734210915545, |
| "grad_norm": 0.23028399050235748, |
| "learning_rate": 1.1084632444868224e-07, |
| "loss": 0.5852, |
| "step": 4950 |
| }, |
| { |
| "epoch": 2.9276734210915545, |
| "eval_loss": 0.5373095273971558, |
| "eval_runtime": 15.2077, |
| "eval_samples_per_second": 426.758, |
| "eval_steps_per_second": 13.349, |
| "step": 4950 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 5000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 3, |
| "early_stopping_threshold": 0.0001 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 3 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.873445665417724e+18, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|