| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 500, |
| "global_step": 778, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.012853470437017995, |
| "grad_norm": 1.0161515474319458, |
| "learning_rate": 1.2244897959183673e-06, |
| "loss": 1.2722, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.02570694087403599, |
| "grad_norm": 0.878677487373352, |
| "learning_rate": 2.7551020408163266e-06, |
| "loss": 1.3141, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.038560411311053984, |
| "grad_norm": 0.7686222195625305, |
| "learning_rate": 4.2857142857142855e-06, |
| "loss": 1.2799, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.05141388174807198, |
| "grad_norm": 0.545415997505188, |
| "learning_rate": 5.816326530612245e-06, |
| "loss": 1.2905, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.06426735218508997, |
| "grad_norm": 0.5554194450378418, |
| "learning_rate": 7.346938775510204e-06, |
| "loss": 1.232, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.07712082262210797, |
| "grad_norm": 0.5022542476654053, |
| "learning_rate": 8.877551020408163e-06, |
| "loss": 1.1671, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.08997429305912596, |
| "grad_norm": 0.4759712815284729, |
| "learning_rate": 1.0408163265306123e-05, |
| "loss": 1.1927, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.10282776349614396, |
| "grad_norm": 0.5466979146003723, |
| "learning_rate": 1.1938775510204082e-05, |
| "loss": 1.1454, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.11568123393316196, |
| "grad_norm": 0.42459335923194885, |
| "learning_rate": 1.3469387755102042e-05, |
| "loss": 1.1662, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.12853470437017994, |
| "grad_norm": 0.45232370495796204, |
| "learning_rate": 1.5e-05, |
| "loss": 1.1538, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.14138817480719795, |
| "grad_norm": 0.4375338852405548, |
| "learning_rate": 1.6530612244897957e-05, |
| "loss": 1.1508, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.15424164524421594, |
| "grad_norm": 0.40976741909980774, |
| "learning_rate": 1.806122448979592e-05, |
| "loss": 1.1557, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.16709511568123395, |
| "grad_norm": 0.4517289996147156, |
| "learning_rate": 1.9591836734693877e-05, |
| "loss": 1.1779, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.17994858611825193, |
| "grad_norm": 0.43108654022216797, |
| "learning_rate": 2.1122448979591836e-05, |
| "loss": 1.1197, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.1928020565552699, |
| "grad_norm": 0.43453311920166016, |
| "learning_rate": 2.2653061224489794e-05, |
| "loss": 1.1141, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.20565552699228792, |
| "grad_norm": 0.5921583771705627, |
| "learning_rate": 2.4183673469387756e-05, |
| "loss": 1.1139, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.2185089974293059, |
| "grad_norm": 0.8495065569877625, |
| "learning_rate": 2.5714285714285714e-05, |
| "loss": 1.1039, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.23136246786632392, |
| "grad_norm": 0.5331577658653259, |
| "learning_rate": 2.7244897959183673e-05, |
| "loss": 1.0815, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.2442159383033419, |
| "grad_norm": 0.5275247097015381, |
| "learning_rate": 2.877551020408163e-05, |
| "loss": 1.0436, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.2570694087403599, |
| "grad_norm": 0.498080849647522, |
| "learning_rate": 2.9999978301629866e-05, |
| "loss": 1.0805, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.2699228791773779, |
| "grad_norm": 0.5699777603149414, |
| "learning_rate": 2.999921886526661e-05, |
| "loss": 1.0579, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.2827763496143959, |
| "grad_norm": 0.5747466087341309, |
| "learning_rate": 2.999737457317172e-05, |
| "loss": 1.1137, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.29562982005141386, |
| "grad_norm": 0.4869830906391144, |
| "learning_rate": 2.9994445558738194e-05, |
| "loss": 1.0894, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.30848329048843187, |
| "grad_norm": 0.5086082816123962, |
| "learning_rate": 2.999043203381427e-05, |
| "loss": 1.0055, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.3213367609254499, |
| "grad_norm": 0.5111905932426453, |
| "learning_rate": 2.9985334288688106e-05, |
| "loss": 1.0172, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.3341902313624679, |
| "grad_norm": 0.5968515276908875, |
| "learning_rate": 2.997915269206677e-05, |
| "loss": 1.0152, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.34704370179948585, |
| "grad_norm": 0.6744562387466431, |
| "learning_rate": 2.9971887691049578e-05, |
| "loss": 0.9944, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.35989717223650386, |
| "grad_norm": 0.5791626572608948, |
| "learning_rate": 2.9963539811095754e-05, |
| "loss": 0.9451, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.37275064267352187, |
| "grad_norm": 0.6787604689598083, |
| "learning_rate": 2.9954109655986444e-05, |
| "loss": 0.9964, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.3856041131105398, |
| "grad_norm": 0.5651743412017822, |
| "learning_rate": 2.9943597907781013e-05, |
| "loss": 0.9361, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.39845758354755784, |
| "grad_norm": 0.7102354764938354, |
| "learning_rate": 2.9932005326767748e-05, |
| "loss": 0.9206, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.41131105398457585, |
| "grad_norm": 0.6241850256919861, |
| "learning_rate": 2.9919332751408837e-05, |
| "loss": 0.9521, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.4241645244215938, |
| "grad_norm": 0.6255318522453308, |
| "learning_rate": 2.9905581098279747e-05, |
| "loss": 0.946, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.4370179948586118, |
| "grad_norm": 0.6290681958198547, |
| "learning_rate": 2.9890751362002923e-05, |
| "loss": 0.9135, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.4498714652956298, |
| "grad_norm": 0.7111775875091553, |
| "learning_rate": 2.9874844615175846e-05, |
| "loss": 0.9554, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.46272493573264784, |
| "grad_norm": 0.6591962575912476, |
| "learning_rate": 2.985786200829346e-05, |
| "loss": 0.9218, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.4755784061696658, |
| "grad_norm": 0.7715244889259338, |
| "learning_rate": 2.9839804769664957e-05, |
| "loss": 0.8853, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.4884318766066838, |
| "grad_norm": 0.6468695402145386, |
| "learning_rate": 2.982067420532494e-05, |
| "loss": 0.8898, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.5012853470437018, |
| "grad_norm": 0.7891976833343506, |
| "learning_rate": 2.980047169893895e-05, |
| "loss": 0.8674, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.5141388174807198, |
| "grad_norm": 0.7604276537895203, |
| "learning_rate": 2.9779198711703414e-05, |
| "loss": 0.9642, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.5269922879177378, |
| "grad_norm": 0.8270877003669739, |
| "learning_rate": 2.9756856782239924e-05, |
| "loss": 0.8366, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.5398457583547558, |
| "grad_norm": 0.7906156778335571, |
| "learning_rate": 2.973344752648398e-05, |
| "loss": 0.8761, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.5526992287917738, |
| "grad_norm": 0.7102853655815125, |
| "learning_rate": 2.9708972637568106e-05, |
| "loss": 0.8519, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.5655526992287918, |
| "grad_norm": 0.7394425272941589, |
| "learning_rate": 2.9683433885699393e-05, |
| "loss": 0.8693, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.5784061696658098, |
| "grad_norm": 0.7369321584701538, |
| "learning_rate": 2.965683311803144e-05, |
| "loss": 0.8805, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.5912596401028277, |
| "grad_norm": 0.7407816648483276, |
| "learning_rate": 2.962917225853081e-05, |
| "loss": 0.8342, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.6041131105398457, |
| "grad_norm": 0.7909078001976013, |
| "learning_rate": 2.960045330783781e-05, |
| "loss": 0.8429, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.6169665809768637, |
| "grad_norm": 0.7501896619796753, |
| "learning_rate": 2.957067834312183e-05, |
| "loss": 0.812, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.6298200514138818, |
| "grad_norm": 0.9143732786178589, |
| "learning_rate": 2.9539849517931084e-05, |
| "loss": 0.8153, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.6426735218508998, |
| "grad_norm": 0.8319126963615417, |
| "learning_rate": 2.9507969062036884e-05, |
| "loss": 0.831, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.6555269922879178, |
| "grad_norm": 0.9196388125419617, |
| "learning_rate": 2.9475039281272315e-05, |
| "loss": 0.8021, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.6683804627249358, |
| "grad_norm": 0.785527765750885, |
| "learning_rate": 2.9441062557365505e-05, |
| "loss": 0.7962, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.6812339331619537, |
| "grad_norm": 0.7972485423088074, |
| "learning_rate": 2.9406041347767342e-05, |
| "loss": 0.8106, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.6940874035989717, |
| "grad_norm": 0.8886427879333496, |
| "learning_rate": 2.9369978185473732e-05, |
| "loss": 0.7575, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.7069408740359897, |
| "grad_norm": 0.8090516924858093, |
| "learning_rate": 2.9332875678842385e-05, |
| "loss": 0.8337, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.7197943444730077, |
| "grad_norm": 0.8744608759880066, |
| "learning_rate": 2.929473651140419e-05, |
| "loss": 0.8028, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.7326478149100257, |
| "grad_norm": 0.9550356268882751, |
| "learning_rate": 2.9255563441669085e-05, |
| "loss": 0.7823, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.7455012853470437, |
| "grad_norm": 0.9044423699378967, |
| "learning_rate": 2.9215359302926564e-05, |
| "loss": 0.7508, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.7583547557840618, |
| "grad_norm": 0.874662458896637, |
| "learning_rate": 2.917412700304075e-05, |
| "loss": 0.7513, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.7712082262210797, |
| "grad_norm": 0.9646016955375671, |
| "learning_rate": 2.913186952424007e-05, |
| "loss": 0.7954, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.7840616966580977, |
| "grad_norm": 0.9356961846351624, |
| "learning_rate": 2.9088589922901544e-05, |
| "loss": 0.7316, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.7969151670951157, |
| "grad_norm": 1.0034047365188599, |
| "learning_rate": 2.9044291329329772e-05, |
| "loss": 0.7385, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.8097686375321337, |
| "grad_norm": 1.038320779800415, |
| "learning_rate": 2.8998976947530478e-05, |
| "loss": 0.7038, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.8226221079691517, |
| "grad_norm": 0.9056432843208313, |
| "learning_rate": 2.8952650054978792e-05, |
| "loss": 0.7287, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.8354755784061697, |
| "grad_norm": 0.8862286806106567, |
| "learning_rate": 2.8905314002382196e-05, |
| "loss": 0.7359, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.8483290488431876, |
| "grad_norm": 0.924501895904541, |
| "learning_rate": 2.8856972213438183e-05, |
| "loss": 0.6987, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.8611825192802056, |
| "grad_norm": 0.9231320023536682, |
| "learning_rate": 2.8807628184586618e-05, |
| "loss": 0.7152, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.8740359897172236, |
| "grad_norm": 0.9292797446250916, |
| "learning_rate": 2.8757285484756853e-05, |
| "loss": 0.6684, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.8868894601542416, |
| "grad_norm": 0.8607897758483887, |
| "learning_rate": 2.870594775510961e-05, |
| "loss": 0.6443, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.8997429305912596, |
| "grad_norm": 0.918314516544342, |
| "learning_rate": 2.8653618708773598e-05, |
| "loss": 0.6427, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.9125964010282777, |
| "grad_norm": 0.9614541530609131, |
| "learning_rate": 2.8600302130576966e-05, |
| "loss": 0.6409, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.9254498714652957, |
| "grad_norm": 0.9149335622787476, |
| "learning_rate": 2.854600187677357e-05, |
| "loss": 0.6544, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.9383033419023136, |
| "grad_norm": 1.0157501697540283, |
| "learning_rate": 2.849072187476403e-05, |
| "loss": 0.6663, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.9511568123393316, |
| "grad_norm": 0.8862146139144897, |
| "learning_rate": 2.8434466122811694e-05, |
| "loss": 0.6654, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.9640102827763496, |
| "grad_norm": 0.9366165399551392, |
| "learning_rate": 2.8377238689753448e-05, |
| "loss": 0.6497, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.9768637532133676, |
| "grad_norm": 0.9608516097068787, |
| "learning_rate": 2.831904371470542e-05, |
| "loss": 0.6262, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.9897172236503856, |
| "grad_norm": 0.9198379516601562, |
| "learning_rate": 2.825988540676362e-05, |
| "loss": 0.6893, |
| "step": 385 |
| }, |
| { |
| "epoch": 1.0025706940874035, |
| "grad_norm": 1.1038013696670532, |
| "learning_rate": 2.81997680446995e-05, |
| "loss": 0.5883, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.0154241645244215, |
| "grad_norm": 1.0748287439346313, |
| "learning_rate": 2.8138695976650474e-05, |
| "loss": 0.5292, |
| "step": 395 |
| }, |
| { |
| "epoch": 1.0282776349614395, |
| "grad_norm": 1.0702522993087769, |
| "learning_rate": 2.807667361980544e-05, |
| "loss": 0.5584, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.0411311053984575, |
| "grad_norm": 1.0195493698120117, |
| "learning_rate": 2.8013705460085298e-05, |
| "loss": 0.5249, |
| "step": 405 |
| }, |
| { |
| "epoch": 1.0539845758354756, |
| "grad_norm": 1.0030614137649536, |
| "learning_rate": 2.7949796051818478e-05, |
| "loss": 0.5383, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.0668380462724936, |
| "grad_norm": 1.0707740783691406, |
| "learning_rate": 2.7884950017411556e-05, |
| "loss": 0.578, |
| "step": 415 |
| }, |
| { |
| "epoch": 1.0796915167095116, |
| "grad_norm": 1.021653652191162, |
| "learning_rate": 2.7819172047014916e-05, |
| "loss": 0.5773, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.0925449871465296, |
| "grad_norm": 1.0462572574615479, |
| "learning_rate": 2.7752466898183518e-05, |
| "loss": 0.5325, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.1053984575835476, |
| "grad_norm": 0.8722683191299438, |
| "learning_rate": 2.7684839395532815e-05, |
| "loss": 0.5503, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.1182519280205656, |
| "grad_norm": 1.03123939037323, |
| "learning_rate": 2.761629443038978e-05, |
| "loss": 0.5297, |
| "step": 435 |
| }, |
| { |
| "epoch": 1.1311053984575836, |
| "grad_norm": 1.130732774734497, |
| "learning_rate": 2.7546836960439146e-05, |
| "loss": 0.5413, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.1439588688946016, |
| "grad_norm": 0.9612518548965454, |
| "learning_rate": 2.7476472009364814e-05, |
| "loss": 0.5987, |
| "step": 445 |
| }, |
| { |
| "epoch": 1.1568123393316196, |
| "grad_norm": 1.2290369272232056, |
| "learning_rate": 2.7405204666486513e-05, |
| "loss": 0.5066, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.1696658097686377, |
| "grad_norm": 1.1223726272583008, |
| "learning_rate": 2.7333040086391692e-05, |
| "loss": 0.4859, |
| "step": 455 |
| }, |
| { |
| "epoch": 1.1825192802056554, |
| "grad_norm": 1.047003984451294, |
| "learning_rate": 2.7259983488562726e-05, |
| "loss": 0.5298, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.1953727506426735, |
| "grad_norm": 1.047174096107483, |
| "learning_rate": 2.718604015699937e-05, |
| "loss": 0.4896, |
| "step": 465 |
| }, |
| { |
| "epoch": 1.2082262210796915, |
| "grad_norm": 1.0913561582565308, |
| "learning_rate": 2.7111215439836596e-05, |
| "loss": 0.5099, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.2210796915167095, |
| "grad_norm": 0.9646836519241333, |
| "learning_rate": 2.7035514748957798e-05, |
| "loss": 0.5123, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.2339331619537275, |
| "grad_norm": 0.9636846780776978, |
| "learning_rate": 2.6958943559603316e-05, |
| "loss": 0.535, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.2467866323907455, |
| "grad_norm": 1.0172802209854126, |
| "learning_rate": 2.6881507409974473e-05, |
| "loss": 0.4792, |
| "step": 485 |
| }, |
| { |
| "epoch": 1.2596401028277635, |
| "grad_norm": 1.0088897943496704, |
| "learning_rate": 2.6803211900832975e-05, |
| "loss": 0.4895, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.2724935732647815, |
| "grad_norm": 1.17917001247406, |
| "learning_rate": 2.6724062695095853e-05, |
| "loss": 0.4796, |
| "step": 495 |
| }, |
| { |
| "epoch": 1.2853470437017995, |
| "grad_norm": 1.1440588235855103, |
| "learning_rate": 2.6644065517425857e-05, |
| "loss": 0.5509, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.2982005141388175, |
| "grad_norm": 1.267622470855713, |
| "learning_rate": 2.65632261538174e-05, |
| "loss": 0.5004, |
| "step": 505 |
| }, |
| { |
| "epoch": 1.3110539845758356, |
| "grad_norm": 0.9338383078575134, |
| "learning_rate": 2.64815504511781e-05, |
| "loss": 0.4886, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.3239074550128533, |
| "grad_norm": 1.0977288484573364, |
| "learning_rate": 2.639904431690587e-05, |
| "loss": 0.4851, |
| "step": 515 |
| }, |
| { |
| "epoch": 1.3367609254498714, |
| "grad_norm": 1.1947989463806152, |
| "learning_rate": 2.631571371846164e-05, |
| "loss": 0.4573, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.3496143958868894, |
| "grad_norm": 1.1320568323135376, |
| "learning_rate": 2.6231564682937762e-05, |
| "loss": 0.4805, |
| "step": 525 |
| }, |
| { |
| "epoch": 1.3624678663239074, |
| "grad_norm": 1.0743120908737183, |
| "learning_rate": 2.614660329662209e-05, |
| "loss": 0.4867, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.3753213367609254, |
| "grad_norm": 1.1182608604431152, |
| "learning_rate": 2.606083570455776e-05, |
| "loss": 0.4444, |
| "step": 535 |
| }, |
| { |
| "epoch": 1.3881748071979434, |
| "grad_norm": 1.0361340045928955, |
| "learning_rate": 2.5974268110098727e-05, |
| "loss": 0.4507, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.4010282776349614, |
| "grad_norm": 1.0230952501296997, |
| "learning_rate": 2.588690677446113e-05, |
| "loss": 0.4262, |
| "step": 545 |
| }, |
| { |
| "epoch": 1.4138817480719794, |
| "grad_norm": 1.0415043830871582, |
| "learning_rate": 2.5798758016270384e-05, |
| "loss": 0.4946, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.4267352185089974, |
| "grad_norm": 1.0534740686416626, |
| "learning_rate": 2.570982821110421e-05, |
| "loss": 0.4764, |
| "step": 555 |
| }, |
| { |
| "epoch": 1.4395886889460154, |
| "grad_norm": 1.078011155128479, |
| "learning_rate": 2.5620123791031488e-05, |
| "loss": 0.4319, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.4524421593830334, |
| "grad_norm": 0.988162100315094, |
| "learning_rate": 2.5529651244147035e-05, |
| "loss": 0.4761, |
| "step": 565 |
| }, |
| { |
| "epoch": 1.4652956298200515, |
| "grad_norm": 1.1337324380874634, |
| "learning_rate": 2.5438417114102358e-05, |
| "loss": 0.4563, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.4781491002570695, |
| "grad_norm": 1.0532126426696777, |
| "learning_rate": 2.5346427999632342e-05, |
| "loss": 0.4486, |
| "step": 575 |
| }, |
| { |
| "epoch": 1.4910025706940875, |
| "grad_norm": 1.0289413928985596, |
| "learning_rate": 2.5253690554078018e-05, |
| "loss": 0.4767, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.5038560411311055, |
| "grad_norm": 1.106880784034729, |
| "learning_rate": 2.5160211484905285e-05, |
| "loss": 0.4757, |
| "step": 585 |
| }, |
| { |
| "epoch": 1.5167095115681235, |
| "grad_norm": 0.9928240180015564, |
| "learning_rate": 2.5065997553219846e-05, |
| "loss": 0.46, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.5295629820051415, |
| "grad_norm": 1.1204873323440552, |
| "learning_rate": 2.4971055573278135e-05, |
| "loss": 0.3968, |
| "step": 595 |
| }, |
| { |
| "epoch": 1.5424164524421595, |
| "grad_norm": 0.9707500338554382, |
| "learning_rate": 2.48753924119945e-05, |
| "loss": 0.4138, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.5552699228791775, |
| "grad_norm": 1.307215690612793, |
| "learning_rate": 2.47790149884445e-05, |
| "loss": 0.4653, |
| "step": 605 |
| }, |
| { |
| "epoch": 1.5681233933161953, |
| "grad_norm": 1.1242326498031616, |
| "learning_rate": 2.468193027336451e-05, |
| "loss": 0.4385, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.5809768637532133, |
| "grad_norm": 1.0686546564102173, |
| "learning_rate": 2.4584145288647497e-05, |
| "loss": 0.4359, |
| "step": 615 |
| }, |
| { |
| "epoch": 1.5938303341902313, |
| "grad_norm": 0.9722070693969727, |
| "learning_rate": 2.448566710683518e-05, |
| "loss": 0.4189, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.6066838046272494, |
| "grad_norm": 1.043075680732727, |
| "learning_rate": 2.4386502850606477e-05, |
| "loss": 0.4478, |
| "step": 625 |
| }, |
| { |
| "epoch": 1.6195372750642674, |
| "grad_norm": 1.1963540315628052, |
| "learning_rate": 2.4286659692262342e-05, |
| "loss": 0.4276, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.6323907455012854, |
| "grad_norm": 0.9894089102745056, |
| "learning_rate": 2.4186144853206997e-05, |
| "loss": 0.3736, |
| "step": 635 |
| }, |
| { |
| "epoch": 1.6452442159383034, |
| "grad_norm": 1.1344518661499023, |
| "learning_rate": 2.4084965603425663e-05, |
| "loss": 0.3955, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.6580976863753212, |
| "grad_norm": 1.0184675455093384, |
| "learning_rate": 2.398312926095869e-05, |
| "loss": 0.3938, |
| "step": 645 |
| }, |
| { |
| "epoch": 1.6709511568123392, |
| "grad_norm": 0.9903520941734314, |
| "learning_rate": 2.3880643191372306e-05, |
| "loss": 0.4075, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.6838046272493572, |
| "grad_norm": 1.0676472187042236, |
| "learning_rate": 2.3777514807225857e-05, |
| "loss": 0.404, |
| "step": 655 |
| }, |
| { |
| "epoch": 1.6966580976863752, |
| "grad_norm": 1.0501329898834229, |
| "learning_rate": 2.3673751567535683e-05, |
| "loss": 0.4091, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.7095115681233932, |
| "grad_norm": 0.9608376622200012, |
| "learning_rate": 2.3569360977235625e-05, |
| "loss": 0.4083, |
| "step": 665 |
| }, |
| { |
| "epoch": 1.7223650385604112, |
| "grad_norm": 1.2020519971847534, |
| "learning_rate": 2.346435058663423e-05, |
| "loss": 0.3767, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.7352185089974292, |
| "grad_norm": 1.165675401687622, |
| "learning_rate": 2.335872799086862e-05, |
| "loss": 0.4, |
| "step": 675 |
| }, |
| { |
| "epoch": 1.7480719794344473, |
| "grad_norm": 1.3436106443405151, |
| "learning_rate": 2.325250082935518e-05, |
| "loss": 0.3921, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.7609254498714653, |
| "grad_norm": 1.2254986763000488, |
| "learning_rate": 2.314567678523703e-05, |
| "loss": 0.363, |
| "step": 685 |
| }, |
| { |
| "epoch": 1.7737789203084833, |
| "grad_norm": 1.0373125076293945, |
| "learning_rate": 2.3038263584828272e-05, |
| "loss": 0.3791, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.7866323907455013, |
| "grad_norm": 1.060325026512146, |
| "learning_rate": 2.2930268997055234e-05, |
| "loss": 0.3559, |
| "step": 695 |
| }, |
| { |
| "epoch": 1.7994858611825193, |
| "grad_norm": 1.0755804777145386, |
| "learning_rate": 2.282170083289451e-05, |
| "loss": 0.367, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.8123393316195373, |
| "grad_norm": 0.9357439875602722, |
| "learning_rate": 2.271256694480803e-05, |
| "loss": 0.3539, |
| "step": 705 |
| }, |
| { |
| "epoch": 1.8251928020565553, |
| "grad_norm": 1.1248786449432373, |
| "learning_rate": 2.2602875226175115e-05, |
| "loss": 0.3601, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.8380462724935733, |
| "grad_norm": 1.1184437274932861, |
| "learning_rate": 2.2492633610721562e-05, |
| "loss": 0.3506, |
| "step": 715 |
| }, |
| { |
| "epoch": 1.8508997429305913, |
| "grad_norm": 1.078311562538147, |
| "learning_rate": 2.2381850071945826e-05, |
| "loss": 0.355, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.8637532133676094, |
| "grad_norm": 1.202223539352417, |
| "learning_rate": 2.2270532622542308e-05, |
| "loss": 0.3526, |
| "step": 725 |
| }, |
| { |
| "epoch": 1.8766066838046274, |
| "grad_norm": 1.1870999336242676, |
| "learning_rate": 2.2158689313821812e-05, |
| "loss": 0.3556, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.8894601542416454, |
| "grad_norm": 1.0799825191497803, |
| "learning_rate": 2.2046328235129237e-05, |
| "loss": 0.3354, |
| "step": 735 |
| }, |
| { |
| "epoch": 1.9023136246786634, |
| "grad_norm": 0.9738938212394714, |
| "learning_rate": 2.193345751325847e-05, |
| "loss": 0.3546, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.9151670951156814, |
| "grad_norm": 0.9771959781646729, |
| "learning_rate": 2.1820085311864616e-05, |
| "loss": 0.3732, |
| "step": 745 |
| }, |
| { |
| "epoch": 1.9280205655526992, |
| "grad_norm": 1.138285517692566, |
| "learning_rate": 2.170621983087351e-05, |
| "loss": 0.332, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.9408740359897172, |
| "grad_norm": 1.048834204673767, |
| "learning_rate": 2.1591869305888694e-05, |
| "loss": 0.3499, |
| "step": 755 |
| }, |
| { |
| "epoch": 1.9537275064267352, |
| "grad_norm": 1.0362058877944946, |
| "learning_rate": 2.1477042007595676e-05, |
| "loss": 0.3614, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.9665809768637532, |
| "grad_norm": 1.0137721300125122, |
| "learning_rate": 2.1361746241163807e-05, |
| "loss": 0.3326, |
| "step": 765 |
| }, |
| { |
| "epoch": 1.9794344473007712, |
| "grad_norm": 1.0517054796218872, |
| "learning_rate": 2.1245990345645562e-05, |
| "loss": 0.3399, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.9922879177377892, |
| "grad_norm": 1.1292747259140015, |
| "learning_rate": 2.1129782693373374e-05, |
| "loss": 0.3438, |
| "step": 775 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 1945, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 2000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.2389199453544776e+18, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|