| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 100, |
| "global_step": 1386, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.021645021645021644, |
| "grad_norm": 10.67443561553955, |
| "learning_rate": 4.5e-06, |
| "loss": 3.1621, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.04329004329004329, |
| "grad_norm": 5.979658126831055, |
| "learning_rate": 9.5e-06, |
| "loss": 2.9832, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.06493506493506493, |
| "grad_norm": 6.758372783660889, |
| "learning_rate": 1.45e-05, |
| "loss": 2.6111, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.08658008658008658, |
| "grad_norm": 6.213019371032715, |
| "learning_rate": 1.9500000000000003e-05, |
| "loss": 2.0887, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.10822510822510822, |
| "grad_norm": 4.623664855957031, |
| "learning_rate": 2.45e-05, |
| "loss": 1.6826, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.12987012987012986, |
| "grad_norm": 3.8101072311401367, |
| "learning_rate": 2.95e-05, |
| "loss": 1.1627, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.15151515151515152, |
| "grad_norm": 3.790235996246338, |
| "learning_rate": 3.45e-05, |
| "loss": 0.9186, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.17316017316017315, |
| "grad_norm": 3.3711209297180176, |
| "learning_rate": 3.9500000000000005e-05, |
| "loss": 0.7483, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.19480519480519481, |
| "grad_norm": 2.9479563236236572, |
| "learning_rate": 4.4500000000000004e-05, |
| "loss": 0.64, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.21645021645021645, |
| "grad_norm": 2.083592414855957, |
| "learning_rate": 4.9500000000000004e-05, |
| "loss": 0.5307, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.21645021645021645, |
| "eval_loss": 0.18818749487400055, |
| "eval_runtime": 26.3426, |
| "eval_samples_per_second": 18.981, |
| "eval_steps_per_second": 0.607, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.23809523809523808, |
| "grad_norm": 2.0266427993774414, |
| "learning_rate": 4.9650077760497674e-05, |
| "loss": 0.4357, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.2597402597402597, |
| "grad_norm": 2.4006917476654053, |
| "learning_rate": 4.926127527216174e-05, |
| "loss": 0.3998, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.2813852813852814, |
| "grad_norm": 1.9328948259353638, |
| "learning_rate": 4.887247278382582e-05, |
| "loss": 0.3816, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.30303030303030304, |
| "grad_norm": 2.675459146499634, |
| "learning_rate": 4.848367029548989e-05, |
| "loss": 0.3783, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.3246753246753247, |
| "grad_norm": 1.5593161582946777, |
| "learning_rate": 4.809486780715397e-05, |
| "loss": 0.3344, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.3463203463203463, |
| "grad_norm": 2.0941596031188965, |
| "learning_rate": 4.770606531881804e-05, |
| "loss": 0.3389, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.36796536796536794, |
| "grad_norm": 1.4612257480621338, |
| "learning_rate": 4.731726283048212e-05, |
| "loss": 0.3662, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.38961038961038963, |
| "grad_norm": 1.9398542642593384, |
| "learning_rate": 4.692846034214619e-05, |
| "loss": 0.3258, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.41125541125541126, |
| "grad_norm": 1.4659396409988403, |
| "learning_rate": 4.653965785381027e-05, |
| "loss": 0.3255, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.4329004329004329, |
| "grad_norm": 1.8461475372314453, |
| "learning_rate": 4.615085536547434e-05, |
| "loss": 0.3288, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.4329004329004329, |
| "eval_loss": 0.13621099293231964, |
| "eval_runtime": 25.3133, |
| "eval_samples_per_second": 19.752, |
| "eval_steps_per_second": 0.632, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.45454545454545453, |
| "grad_norm": 1.8327516317367554, |
| "learning_rate": 4.576205287713842e-05, |
| "loss": 0.3385, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.47619047619047616, |
| "grad_norm": 1.3256858587265015, |
| "learning_rate": 4.537325038880249e-05, |
| "loss": 0.3087, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.49783549783549785, |
| "grad_norm": 1.5844945907592773, |
| "learning_rate": 4.498444790046656e-05, |
| "loss": 0.3099, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.5194805194805194, |
| "grad_norm": 2.342363119125366, |
| "learning_rate": 4.4595645412130637e-05, |
| "loss": 0.2831, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.5411255411255411, |
| "grad_norm": 2.106011152267456, |
| "learning_rate": 4.420684292379472e-05, |
| "loss": 0.3231, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.5627705627705628, |
| "grad_norm": 1.503414511680603, |
| "learning_rate": 4.3818040435458794e-05, |
| "loss": 0.2756, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.5844155844155844, |
| "grad_norm": 1.6984680891036987, |
| "learning_rate": 4.342923794712286e-05, |
| "loss": 0.251, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.6060606060606061, |
| "grad_norm": 1.6489607095718384, |
| "learning_rate": 4.304043545878694e-05, |
| "loss": 0.272, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.6277056277056277, |
| "grad_norm": 1.6416125297546387, |
| "learning_rate": 4.265163297045101e-05, |
| "loss": 0.2917, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.6493506493506493, |
| "grad_norm": 1.477407693862915, |
| "learning_rate": 4.226283048211509e-05, |
| "loss": 0.2849, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.6493506493506493, |
| "eval_loss": 0.12747418880462646, |
| "eval_runtime": 23.694, |
| "eval_samples_per_second": 21.102, |
| "eval_steps_per_second": 0.675, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.670995670995671, |
| "grad_norm": 1.2621872425079346, |
| "learning_rate": 4.187402799377916e-05, |
| "loss": 0.2728, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.6926406926406926, |
| "grad_norm": 1.1352860927581787, |
| "learning_rate": 4.148522550544324e-05, |
| "loss": 0.2831, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.7142857142857143, |
| "grad_norm": 3.7155425548553467, |
| "learning_rate": 4.109642301710731e-05, |
| "loss": 0.2691, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.7359307359307359, |
| "grad_norm": 1.287180781364441, |
| "learning_rate": 4.070762052877139e-05, |
| "loss": 0.2711, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.7575757575757576, |
| "grad_norm": 1.4427192211151123, |
| "learning_rate": 4.031881804043546e-05, |
| "loss": 0.2923, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.7792207792207793, |
| "grad_norm": 1.3908069133758545, |
| "learning_rate": 3.993001555209954e-05, |
| "loss": 0.2636, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.8008658008658008, |
| "grad_norm": 1.7817895412445068, |
| "learning_rate": 3.954121306376361e-05, |
| "loss": 0.2533, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.8225108225108225, |
| "grad_norm": 1.4778376817703247, |
| "learning_rate": 3.915241057542768e-05, |
| "loss": 0.2829, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.8441558441558441, |
| "grad_norm": 1.4331694841384888, |
| "learning_rate": 3.8763608087091756e-05, |
| "loss": 0.2788, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.8658008658008658, |
| "grad_norm": 1.4717748165130615, |
| "learning_rate": 3.837480559875584e-05, |
| "loss": 0.318, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.8658008658008658, |
| "eval_loss": 0.11318539083003998, |
| "eval_runtime": 24.6731, |
| "eval_samples_per_second": 20.265, |
| "eval_steps_per_second": 0.648, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.8874458874458875, |
| "grad_norm": 1.3594976663589478, |
| "learning_rate": 3.798600311041991e-05, |
| "loss": 0.2735, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.9090909090909091, |
| "grad_norm": 1.303183674812317, |
| "learning_rate": 3.759720062208398e-05, |
| "loss": 0.249, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.9307359307359307, |
| "grad_norm": 1.1007519960403442, |
| "learning_rate": 3.7208398133748056e-05, |
| "loss": 0.2749, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.9523809523809523, |
| "grad_norm": 1.306304931640625, |
| "learning_rate": 3.681959564541213e-05, |
| "loss": 0.2643, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.974025974025974, |
| "grad_norm": 1.3814477920532227, |
| "learning_rate": 3.6430793157076207e-05, |
| "loss": 0.2601, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.9956709956709957, |
| "grad_norm": 1.2352524995803833, |
| "learning_rate": 3.604199066874028e-05, |
| "loss": 0.2669, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.0173160173160174, |
| "grad_norm": 1.3061290979385376, |
| "learning_rate": 3.565318818040436e-05, |
| "loss": 0.2395, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.0389610389610389, |
| "grad_norm": 1.3227717876434326, |
| "learning_rate": 3.526438569206843e-05, |
| "loss": 0.2439, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.0606060606060606, |
| "grad_norm": 1.2254371643066406, |
| "learning_rate": 3.487558320373251e-05, |
| "loss": 0.2421, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.0822510822510822, |
| "grad_norm": 1.0040128231048584, |
| "learning_rate": 3.448678071539658e-05, |
| "loss": 0.2691, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.0822510822510822, |
| "eval_loss": 0.12204229831695557, |
| "eval_runtime": 24.5509, |
| "eval_samples_per_second": 20.366, |
| "eval_steps_per_second": 0.652, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.103896103896104, |
| "grad_norm": 1.2335394620895386, |
| "learning_rate": 3.409797822706066e-05, |
| "loss": 0.2047, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.1255411255411256, |
| "grad_norm": 1.1710796356201172, |
| "learning_rate": 3.370917573872473e-05, |
| "loss": 0.23, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.1471861471861473, |
| "grad_norm": 1.1290990114212036, |
| "learning_rate": 3.33203732503888e-05, |
| "loss": 0.2323, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.1688311688311688, |
| "grad_norm": 1.1899147033691406, |
| "learning_rate": 3.2931570762052876e-05, |
| "loss": 0.2414, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.1904761904761905, |
| "grad_norm": 1.129758596420288, |
| "learning_rate": 3.254276827371696e-05, |
| "loss": 0.2549, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.2121212121212122, |
| "grad_norm": 1.4995540380477905, |
| "learning_rate": 3.215396578538103e-05, |
| "loss": 0.2094, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.2337662337662338, |
| "grad_norm": 1.4586797952651978, |
| "learning_rate": 3.17651632970451e-05, |
| "loss": 0.24, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.2554112554112553, |
| "grad_norm": 1.256712555885315, |
| "learning_rate": 3.1376360808709176e-05, |
| "loss": 0.2069, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.277056277056277, |
| "grad_norm": 1.3815618753433228, |
| "learning_rate": 3.098755832037325e-05, |
| "loss": 0.2282, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.2987012987012987, |
| "grad_norm": 1.2242530584335327, |
| "learning_rate": 3.0598755832037326e-05, |
| "loss": 0.2318, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.2987012987012987, |
| "eval_loss": 0.12561704218387604, |
| "eval_runtime": 23.9001, |
| "eval_samples_per_second": 20.92, |
| "eval_steps_per_second": 0.669, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.3203463203463204, |
| "grad_norm": 1.352359414100647, |
| "learning_rate": 3.02099533437014e-05, |
| "loss": 0.2032, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.341991341991342, |
| "grad_norm": 1.3257685899734497, |
| "learning_rate": 2.9821150855365476e-05, |
| "loss": 0.243, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.3636363636363638, |
| "grad_norm": 1.1021738052368164, |
| "learning_rate": 2.943234836702955e-05, |
| "loss": 0.2093, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.3852813852813852, |
| "grad_norm": 1.2614463567733765, |
| "learning_rate": 2.9043545878693623e-05, |
| "loss": 0.2268, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.406926406926407, |
| "grad_norm": 1.1275025606155396, |
| "learning_rate": 2.86547433903577e-05, |
| "loss": 0.2457, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.4285714285714286, |
| "grad_norm": 1.2248728275299072, |
| "learning_rate": 2.8265940902021777e-05, |
| "loss": 0.2235, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.4502164502164503, |
| "grad_norm": 1.260707974433899, |
| "learning_rate": 2.7877138413685848e-05, |
| "loss": 0.216, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.4718614718614718, |
| "grad_norm": 1.3132363557815552, |
| "learning_rate": 2.7488335925349923e-05, |
| "loss": 0.2197, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.4935064935064934, |
| "grad_norm": 1.1333873271942139, |
| "learning_rate": 2.7099533437013995e-05, |
| "loss": 0.2319, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.5151515151515151, |
| "grad_norm": 1.1966497898101807, |
| "learning_rate": 2.6710730948678077e-05, |
| "loss": 0.225, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.5151515151515151, |
| "eval_loss": 0.12175622582435608, |
| "eval_runtime": 24.8733, |
| "eval_samples_per_second": 20.102, |
| "eval_steps_per_second": 0.643, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.5367965367965368, |
| "grad_norm": 1.1184797286987305, |
| "learning_rate": 2.632192846034215e-05, |
| "loss": 0.218, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.5584415584415585, |
| "grad_norm": 1.0531848669052124, |
| "learning_rate": 2.5933125972006224e-05, |
| "loss": 0.1902, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.5800865800865802, |
| "grad_norm": 1.3125649690628052, |
| "learning_rate": 2.5544323483670295e-05, |
| "loss": 0.2248, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.601731601731602, |
| "grad_norm": 1.2626018524169922, |
| "learning_rate": 2.515552099533437e-05, |
| "loss": 0.2111, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.6233766233766234, |
| "grad_norm": 1.2592939138412476, |
| "learning_rate": 2.4766718506998446e-05, |
| "loss": 0.2251, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.645021645021645, |
| "grad_norm": 1.3215529918670654, |
| "learning_rate": 2.437791601866252e-05, |
| "loss": 0.2089, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.6666666666666665, |
| "grad_norm": 1.471236228942871, |
| "learning_rate": 2.3989113530326596e-05, |
| "loss": 0.2352, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.6883116883116882, |
| "grad_norm": 1.332468867301941, |
| "learning_rate": 2.360031104199067e-05, |
| "loss": 0.2075, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.70995670995671, |
| "grad_norm": 1.2207419872283936, |
| "learning_rate": 2.3211508553654746e-05, |
| "loss": 0.2329, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.7316017316017316, |
| "grad_norm": 1.2343429327011108, |
| "learning_rate": 2.2822706065318818e-05, |
| "loss": 0.2029, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.7316017316017316, |
| "eval_loss": 0.112111896276474, |
| "eval_runtime": 24.4556, |
| "eval_samples_per_second": 20.445, |
| "eval_steps_per_second": 0.654, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.7532467532467533, |
| "grad_norm": 1.0487161874771118, |
| "learning_rate": 2.2433903576982893e-05, |
| "loss": 0.2268, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.774891774891775, |
| "grad_norm": 1.0433166027069092, |
| "learning_rate": 2.2045101088646968e-05, |
| "loss": 0.2176, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.7965367965367967, |
| "grad_norm": 1.1807918548583984, |
| "learning_rate": 2.1656298600311043e-05, |
| "loss": 0.225, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.8181818181818183, |
| "grad_norm": 1.3814514875411987, |
| "learning_rate": 2.1267496111975118e-05, |
| "loss": 0.239, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.8398268398268398, |
| "grad_norm": 1.3248578310012817, |
| "learning_rate": 2.0878693623639193e-05, |
| "loss": 0.1726, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.8614718614718615, |
| "grad_norm": 1.1932059526443481, |
| "learning_rate": 2.0489891135303265e-05, |
| "loss": 0.2201, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.883116883116883, |
| "grad_norm": 2.034952402114868, |
| "learning_rate": 2.0101088646967343e-05, |
| "loss": 0.2238, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.9047619047619047, |
| "grad_norm": 1.3041192293167114, |
| "learning_rate": 1.9712286158631415e-05, |
| "loss": 0.2018, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.9264069264069263, |
| "grad_norm": 1.185820460319519, |
| "learning_rate": 1.9323483670295493e-05, |
| "loss": 0.2236, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.948051948051948, |
| "grad_norm": 1.0928832292556763, |
| "learning_rate": 1.8934681181959565e-05, |
| "loss": 0.2138, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.948051948051948, |
| "eval_loss": 0.10634125024080276, |
| "eval_runtime": 24.1737, |
| "eval_samples_per_second": 20.684, |
| "eval_steps_per_second": 0.662, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.9696969696969697, |
| "grad_norm": 1.2616679668426514, |
| "learning_rate": 1.854587869362364e-05, |
| "loss": 0.2109, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.9913419913419914, |
| "grad_norm": 1.5013710260391235, |
| "learning_rate": 1.8157076205287715e-05, |
| "loss": 0.196, |
| "step": 920 |
| }, |
| { |
| "epoch": 2.012987012987013, |
| "grad_norm": 1.1472351551055908, |
| "learning_rate": 1.776827371695179e-05, |
| "loss": 0.1839, |
| "step": 930 |
| }, |
| { |
| "epoch": 2.034632034632035, |
| "grad_norm": 1.5114259719848633, |
| "learning_rate": 1.7379471228615865e-05, |
| "loss": 0.2003, |
| "step": 940 |
| }, |
| { |
| "epoch": 2.0562770562770565, |
| "grad_norm": 1.0763689279556274, |
| "learning_rate": 1.6990668740279937e-05, |
| "loss": 0.2029, |
| "step": 950 |
| }, |
| { |
| "epoch": 2.0779220779220777, |
| "grad_norm": 1.5134332180023193, |
| "learning_rate": 1.6601866251944012e-05, |
| "loss": 0.2028, |
| "step": 960 |
| }, |
| { |
| "epoch": 2.0995670995670994, |
| "grad_norm": 1.4813941717147827, |
| "learning_rate": 1.6213063763608087e-05, |
| "loss": 0.1657, |
| "step": 970 |
| }, |
| { |
| "epoch": 2.121212121212121, |
| "grad_norm": 1.3589473962783813, |
| "learning_rate": 1.5824261275272162e-05, |
| "loss": 0.2153, |
| "step": 980 |
| }, |
| { |
| "epoch": 2.142857142857143, |
| "grad_norm": 1.2208274602890015, |
| "learning_rate": 1.5435458786936237e-05, |
| "loss": 0.1889, |
| "step": 990 |
| }, |
| { |
| "epoch": 2.1645021645021645, |
| "grad_norm": 1.466425895690918, |
| "learning_rate": 1.5046656298600313e-05, |
| "loss": 0.1786, |
| "step": 1000 |
| }, |
| { |
| "epoch": 2.1645021645021645, |
| "eval_loss": 0.11710453778505325, |
| "eval_runtime": 23.4898, |
| "eval_samples_per_second": 21.286, |
| "eval_steps_per_second": 0.681, |
| "step": 1000 |
| }, |
| { |
| "epoch": 2.186147186147186, |
| "grad_norm": 1.1942178010940552, |
| "learning_rate": 1.4657853810264386e-05, |
| "loss": 0.1597, |
| "step": 1010 |
| }, |
| { |
| "epoch": 2.207792207792208, |
| "grad_norm": 1.6706466674804688, |
| "learning_rate": 1.4269051321928461e-05, |
| "loss": 0.1746, |
| "step": 1020 |
| }, |
| { |
| "epoch": 2.2294372294372296, |
| "grad_norm": 1.4485812187194824, |
| "learning_rate": 1.3880248833592534e-05, |
| "loss": 0.1874, |
| "step": 1030 |
| }, |
| { |
| "epoch": 2.2510822510822512, |
| "grad_norm": 1.2479631900787354, |
| "learning_rate": 1.3491446345256611e-05, |
| "loss": 0.1755, |
| "step": 1040 |
| }, |
| { |
| "epoch": 2.2727272727272725, |
| "grad_norm": 1.4938236474990845, |
| "learning_rate": 1.3102643856920685e-05, |
| "loss": 0.1765, |
| "step": 1050 |
| }, |
| { |
| "epoch": 2.2943722943722946, |
| "grad_norm": 1.1002484560012817, |
| "learning_rate": 1.2713841368584758e-05, |
| "loss": 0.1823, |
| "step": 1060 |
| }, |
| { |
| "epoch": 2.316017316017316, |
| "grad_norm": 1.3851666450500488, |
| "learning_rate": 1.2325038880248835e-05, |
| "loss": 0.199, |
| "step": 1070 |
| }, |
| { |
| "epoch": 2.3376623376623376, |
| "grad_norm": 1.0471409559249878, |
| "learning_rate": 1.1936236391912908e-05, |
| "loss": 0.1707, |
| "step": 1080 |
| }, |
| { |
| "epoch": 2.3593073593073592, |
| "grad_norm": 1.4738085269927979, |
| "learning_rate": 1.1547433903576983e-05, |
| "loss": 0.1716, |
| "step": 1090 |
| }, |
| { |
| "epoch": 2.380952380952381, |
| "grad_norm": 1.499088168144226, |
| "learning_rate": 1.1158631415241058e-05, |
| "loss": 0.1773, |
| "step": 1100 |
| }, |
| { |
| "epoch": 2.380952380952381, |
| "eval_loss": 0.11423339694738388, |
| "eval_runtime": 25.1041, |
| "eval_samples_per_second": 19.917, |
| "eval_steps_per_second": 0.637, |
| "step": 1100 |
| }, |
| { |
| "epoch": 2.4025974025974026, |
| "grad_norm": 1.1052640676498413, |
| "learning_rate": 1.0769828926905133e-05, |
| "loss": 0.164, |
| "step": 1110 |
| }, |
| { |
| "epoch": 2.4242424242424243, |
| "grad_norm": 1.096550464630127, |
| "learning_rate": 1.0381026438569208e-05, |
| "loss": 0.1904, |
| "step": 1120 |
| }, |
| { |
| "epoch": 2.445887445887446, |
| "grad_norm": 1.2570807933807373, |
| "learning_rate": 9.992223950233282e-06, |
| "loss": 0.172, |
| "step": 1130 |
| }, |
| { |
| "epoch": 2.4675324675324677, |
| "grad_norm": 1.1647895574569702, |
| "learning_rate": 9.603421461897357e-06, |
| "loss": 0.1816, |
| "step": 1140 |
| }, |
| { |
| "epoch": 2.4891774891774894, |
| "grad_norm": 1.2219709157943726, |
| "learning_rate": 9.214618973561432e-06, |
| "loss": 0.1883, |
| "step": 1150 |
| }, |
| { |
| "epoch": 2.5108225108225106, |
| "grad_norm": 1.1570717096328735, |
| "learning_rate": 8.825816485225505e-06, |
| "loss": 0.1864, |
| "step": 1160 |
| }, |
| { |
| "epoch": 2.5324675324675323, |
| "grad_norm": 1.3424004316329956, |
| "learning_rate": 8.43701399688958e-06, |
| "loss": 0.1662, |
| "step": 1170 |
| }, |
| { |
| "epoch": 2.554112554112554, |
| "grad_norm": 1.32485032081604, |
| "learning_rate": 8.048211508553654e-06, |
| "loss": 0.1593, |
| "step": 1180 |
| }, |
| { |
| "epoch": 2.5757575757575757, |
| "grad_norm": 1.309839129447937, |
| "learning_rate": 7.659409020217729e-06, |
| "loss": 0.1811, |
| "step": 1190 |
| }, |
| { |
| "epoch": 2.5974025974025974, |
| "grad_norm": 1.4496508836746216, |
| "learning_rate": 7.270606531881805e-06, |
| "loss": 0.1953, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.5974025974025974, |
| "eval_loss": 0.11794831603765488, |
| "eval_runtime": 24.811, |
| "eval_samples_per_second": 20.152, |
| "eval_steps_per_second": 0.645, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.619047619047619, |
| "grad_norm": 1.12656569480896, |
| "learning_rate": 6.881804043545879e-06, |
| "loss": 0.1845, |
| "step": 1210 |
| }, |
| { |
| "epoch": 2.6406926406926408, |
| "grad_norm": 1.2158961296081543, |
| "learning_rate": 6.493001555209954e-06, |
| "loss": 0.1755, |
| "step": 1220 |
| }, |
| { |
| "epoch": 2.6623376623376624, |
| "grad_norm": 0.9884421229362488, |
| "learning_rate": 6.1041990668740285e-06, |
| "loss": 0.1611, |
| "step": 1230 |
| }, |
| { |
| "epoch": 2.683982683982684, |
| "grad_norm": 1.1807091236114502, |
| "learning_rate": 5.715396578538103e-06, |
| "loss": 0.1955, |
| "step": 1240 |
| }, |
| { |
| "epoch": 2.7056277056277054, |
| "grad_norm": 1.504586935043335, |
| "learning_rate": 5.326594090202178e-06, |
| "loss": 0.1999, |
| "step": 1250 |
| }, |
| { |
| "epoch": 2.7272727272727275, |
| "grad_norm": 1.1582438945770264, |
| "learning_rate": 4.937791601866252e-06, |
| "loss": 0.1815, |
| "step": 1260 |
| }, |
| { |
| "epoch": 2.7489177489177488, |
| "grad_norm": 1.0870862007141113, |
| "learning_rate": 4.548989113530327e-06, |
| "loss": 0.1563, |
| "step": 1270 |
| }, |
| { |
| "epoch": 2.7705627705627704, |
| "grad_norm": 1.4519983530044556, |
| "learning_rate": 4.160186625194401e-06, |
| "loss": 0.171, |
| "step": 1280 |
| }, |
| { |
| "epoch": 2.792207792207792, |
| "grad_norm": 1.3172404766082764, |
| "learning_rate": 3.771384136858476e-06, |
| "loss": 0.1807, |
| "step": 1290 |
| }, |
| { |
| "epoch": 2.813852813852814, |
| "grad_norm": 1.5675866603851318, |
| "learning_rate": 3.382581648522551e-06, |
| "loss": 0.1732, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.813852813852814, |
| "eval_loss": 0.11541545391082764, |
| "eval_runtime": 25.6413, |
| "eval_samples_per_second": 19.5, |
| "eval_steps_per_second": 0.624, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.8354978354978355, |
| "grad_norm": 1.2589536905288696, |
| "learning_rate": 2.9937791601866253e-06, |
| "loss": 0.1823, |
| "step": 1310 |
| }, |
| { |
| "epoch": 2.857142857142857, |
| "grad_norm": 0.9841740727424622, |
| "learning_rate": 2.6049766718507e-06, |
| "loss": 0.1649, |
| "step": 1320 |
| }, |
| { |
| "epoch": 2.878787878787879, |
| "grad_norm": 1.3231135606765747, |
| "learning_rate": 2.2161741835147746e-06, |
| "loss": 0.1784, |
| "step": 1330 |
| }, |
| { |
| "epoch": 2.9004329004329006, |
| "grad_norm": 1.3610490560531616, |
| "learning_rate": 1.827371695178849e-06, |
| "loss": 0.1759, |
| "step": 1340 |
| }, |
| { |
| "epoch": 2.9220779220779223, |
| "grad_norm": 1.0712759494781494, |
| "learning_rate": 1.438569206842924e-06, |
| "loss": 0.1848, |
| "step": 1350 |
| }, |
| { |
| "epoch": 2.9437229437229435, |
| "grad_norm": 1.3472720384597778, |
| "learning_rate": 1.0497667185069984e-06, |
| "loss": 0.2005, |
| "step": 1360 |
| }, |
| { |
| "epoch": 2.965367965367965, |
| "grad_norm": 1.5114219188690186, |
| "learning_rate": 6.609642301710731e-07, |
| "loss": 0.1808, |
| "step": 1370 |
| }, |
| { |
| "epoch": 2.987012987012987, |
| "grad_norm": 1.4632176160812378, |
| "learning_rate": 2.721617418351478e-07, |
| "loss": 0.1878, |
| "step": 1380 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1386, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.3171098953286943e+18, |
| "train_batch_size": 64, |
| "trial_name": null, |
| "trial_params": null |
| } |