| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.8201261132418904, | |
| "eval_steps": 500, | |
| "global_step": 3500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00520054605733602, | |
| "grad_norm": 16.05304718017578, | |
| "learning_rate": 1.5570934256055363e-06, | |
| "loss": 1.3826, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.01040109211467204, | |
| "grad_norm": 13.638071060180664, | |
| "learning_rate": 3.2871972318339097e-06, | |
| "loss": 1.2516, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.015601638172008062, | |
| "grad_norm": 17.292724609375, | |
| "learning_rate": 5.017301038062284e-06, | |
| "loss": 1.1593, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.02080218422934408, | |
| "grad_norm": 2.1085638999938965, | |
| "learning_rate": 6.747404844290659e-06, | |
| "loss": 0.5496, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.0260027302866801, | |
| "grad_norm": 1.403855562210083, | |
| "learning_rate": 8.477508650519032e-06, | |
| "loss": 0.369, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.031203276344016123, | |
| "grad_norm": 2.6436541080474854, | |
| "learning_rate": 1.0207612456747406e-05, | |
| "loss": 0.2841, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.03640382240135214, | |
| "grad_norm": 1.9844225645065308, | |
| "learning_rate": 1.193771626297578e-05, | |
| "loss": 0.4041, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.04160436845868816, | |
| "grad_norm": 2.442768096923828, | |
| "learning_rate": 1.3667820069204155e-05, | |
| "loss": 0.2672, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.046804914516024185, | |
| "grad_norm": 2.2531392574310303, | |
| "learning_rate": 1.5397923875432525e-05, | |
| "loss": 0.3032, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.0520054605733602, | |
| "grad_norm": 1.9605236053466797, | |
| "learning_rate": 1.7128027681660898e-05, | |
| "loss": 0.2662, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.05720600663069622, | |
| "grad_norm": 2.8307950496673584, | |
| "learning_rate": 1.8858131487889273e-05, | |
| "loss": 0.2989, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.062406552688032246, | |
| "grad_norm": 2.2501089572906494, | |
| "learning_rate": 2.058823529411765e-05, | |
| "loss": 0.2533, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.06760709874536826, | |
| "grad_norm": 3.275035858154297, | |
| "learning_rate": 2.231833910034602e-05, | |
| "loss": 0.2911, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.07280764480270428, | |
| "grad_norm": 3.080817699432373, | |
| "learning_rate": 2.4048442906574396e-05, | |
| "loss": 0.3063, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.07800819086004031, | |
| "grad_norm": 5.425448894500732, | |
| "learning_rate": 2.5778546712802772e-05, | |
| "loss": 0.3093, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.08320873691737632, | |
| "grad_norm": 1.7119687795639038, | |
| "learning_rate": 2.7508650519031144e-05, | |
| "loss": 0.2612, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.08840928297471234, | |
| "grad_norm": 4.50128173828125, | |
| "learning_rate": 2.9238754325259516e-05, | |
| "loss": 0.2895, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.09360982903204837, | |
| "grad_norm": 1.802933692932129, | |
| "learning_rate": 3.096885813148789e-05, | |
| "loss": 0.2085, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.09881037508938438, | |
| "grad_norm": 2.629002571105957, | |
| "learning_rate": 3.269896193771627e-05, | |
| "loss": 0.2971, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.1040109211467204, | |
| "grad_norm": 1.411960244178772, | |
| "learning_rate": 3.4429065743944636e-05, | |
| "loss": 0.2985, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.10921146720405643, | |
| "grad_norm": 1.9607653617858887, | |
| "learning_rate": 3.615916955017301e-05, | |
| "loss": 0.2282, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.11441201326139244, | |
| "grad_norm": 3.4696173667907715, | |
| "learning_rate": 3.788927335640138e-05, | |
| "loss": 0.259, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.11961255931872847, | |
| "grad_norm": 3.1414554119110107, | |
| "learning_rate": 3.961937716262976e-05, | |
| "loss": 0.2353, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.12481310537606449, | |
| "grad_norm": 1.7068389654159546, | |
| "learning_rate": 4.134948096885813e-05, | |
| "loss": 0.2279, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.13001365143340052, | |
| "grad_norm": 2.7408318519592285, | |
| "learning_rate": 4.307958477508651e-05, | |
| "loss": 0.2809, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.13521419749073652, | |
| "grad_norm": 3.036931276321411, | |
| "learning_rate": 4.480968858131488e-05, | |
| "loss": 0.253, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.14041474354807254, | |
| "grad_norm": 2.8552465438842773, | |
| "learning_rate": 4.653979238754326e-05, | |
| "loss": 0.2454, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.14561528960540857, | |
| "grad_norm": 3.6416499614715576, | |
| "learning_rate": 4.826989619377163e-05, | |
| "loss": 0.244, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.1508158356627446, | |
| "grad_norm": 3.5004782676696777, | |
| "learning_rate": 5e-05, | |
| "loss": 0.2909, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.15601638172008062, | |
| "grad_norm": 1.2734322547912598, | |
| "learning_rate": 4.999958918390321e-05, | |
| "loss": 0.33, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.16121692777741664, | |
| "grad_norm": 5.017611026763916, | |
| "learning_rate": 4.999835674911443e-05, | |
| "loss": 0.2723, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.16641747383475264, | |
| "grad_norm": 2.2255094051361084, | |
| "learning_rate": 4.999630273613799e-05, | |
| "loss": 0.2993, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.17161801989208866, | |
| "grad_norm": 1.4226183891296387, | |
| "learning_rate": 4.9993427212479606e-05, | |
| "loss": 0.2749, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.1768185659494247, | |
| "grad_norm": 1.295336127281189, | |
| "learning_rate": 4.998973027264419e-05, | |
| "loss": 0.2618, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.1820191120067607, | |
| "grad_norm": 1.9380894899368286, | |
| "learning_rate": 4.998521203813274e-05, | |
| "loss": 0.2595, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.18721965806409674, | |
| "grad_norm": 2.218477964401245, | |
| "learning_rate": 4.997987265743834e-05, | |
| "loss": 0.2305, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.19242020412143276, | |
| "grad_norm": 2.5676722526550293, | |
| "learning_rate": 4.9973712306041256e-05, | |
| "loss": 0.2259, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.19762075017876876, | |
| "grad_norm": 1.3287098407745361, | |
| "learning_rate": 4.996673118640323e-05, | |
| "loss": 0.2082, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.2028212962361048, | |
| "grad_norm": 1.6181299686431885, | |
| "learning_rate": 4.995892952796074e-05, | |
| "loss": 0.2422, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.2080218422934408, | |
| "grad_norm": 2.0212459564208984, | |
| "learning_rate": 4.995030758711756e-05, | |
| "loss": 0.296, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.21322238835077684, | |
| "grad_norm": 1.0081758499145508, | |
| "learning_rate": 4.994086564723626e-05, | |
| "loss": 0.2289, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.21842293440811286, | |
| "grad_norm": 1.7539795637130737, | |
| "learning_rate": 4.993060401862888e-05, | |
| "loss": 0.2118, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.22362348046544886, | |
| "grad_norm": 1.8362935781478882, | |
| "learning_rate": 4.991952303854682e-05, | |
| "loss": 0.2198, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.22882402652278488, | |
| "grad_norm": 3.820204734802246, | |
| "learning_rate": 4.9907623071169686e-05, | |
| "loss": 0.2744, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.2340245725801209, | |
| "grad_norm": 2.739043951034546, | |
| "learning_rate": 4.9894904507593316e-05, | |
| "loss": 0.1887, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.23922511863745693, | |
| "grad_norm": 1.1746188402175903, | |
| "learning_rate": 4.988136776581696e-05, | |
| "loss": 0.2105, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.24442566469479296, | |
| "grad_norm": 4.921947956085205, | |
| "learning_rate": 4.9867013290729535e-05, | |
| "loss": 0.2306, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.24962621075212899, | |
| "grad_norm": 4.337555408477783, | |
| "learning_rate": 4.9851841554095e-05, | |
| "loss": 0.2564, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.254826756809465, | |
| "grad_norm": 3.977388620376587, | |
| "learning_rate": 4.9835853054536846e-05, | |
| "loss": 0.2106, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.26002730286680104, | |
| "grad_norm": 1.2592873573303223, | |
| "learning_rate": 4.981904831752171e-05, | |
| "loss": 0.3106, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.26002730286680104, | |
| "eval_loss": 0.2771838307380676, | |
| "eval_runtime": 136.7574, | |
| "eval_samples_per_second": 14.054, | |
| "eval_steps_per_second": 14.054, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.26522784892413703, | |
| "grad_norm": 1.807676911354065, | |
| "learning_rate": 4.98014278953421e-05, | |
| "loss": 0.2341, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.27042839498147303, | |
| "grad_norm": 2.764136791229248, | |
| "learning_rate": 4.978299236709826e-05, | |
| "loss": 0.339, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.2756289410388091, | |
| "grad_norm": 1.2402966022491455, | |
| "learning_rate": 4.9763742338679145e-05, | |
| "loss": 0.2754, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.2808294870961451, | |
| "grad_norm": 1.5016759634017944, | |
| "learning_rate": 4.974367844274248e-05, | |
| "loss": 0.2544, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.28603003315348113, | |
| "grad_norm": 2.2027359008789062, | |
| "learning_rate": 4.972280133869396e-05, | |
| "loss": 0.232, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.29123057921081713, | |
| "grad_norm": 0.8741855621337891, | |
| "learning_rate": 4.9701111712665625e-05, | |
| "loss": 0.2665, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.29643112526815313, | |
| "grad_norm": 2.105534315109253, | |
| "learning_rate": 4.9678610277493275e-05, | |
| "loss": 0.2719, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.3016316713254892, | |
| "grad_norm": 2.820169687271118, | |
| "learning_rate": 4.965529777269306e-05, | |
| "loss": 0.2875, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.3068322173828252, | |
| "grad_norm": 1.968910813331604, | |
| "learning_rate": 4.963117496443715e-05, | |
| "loss": 0.2547, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.31203276344016123, | |
| "grad_norm": 1.4258973598480225, | |
| "learning_rate": 4.960624264552858e-05, | |
| "loss": 0.3096, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.31723330949749723, | |
| "grad_norm": 0.6942580342292786, | |
| "learning_rate": 4.958050163537519e-05, | |
| "loss": 0.2271, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.3224338555548333, | |
| "grad_norm": 2.4023945331573486, | |
| "learning_rate": 4.955395277996268e-05, | |
| "loss": 0.2973, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.3276344016121693, | |
| "grad_norm": 0.890560507774353, | |
| "learning_rate": 4.9526596951826824e-05, | |
| "loss": 0.2368, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.3328349476695053, | |
| "grad_norm": 1.4097232818603516, | |
| "learning_rate": 4.949843505002477e-05, | |
| "loss": 0.1829, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.33803549372684133, | |
| "grad_norm": 1.28754723072052, | |
| "learning_rate": 4.946946800010556e-05, | |
| "loss": 0.3505, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.3432360397841773, | |
| "grad_norm": 0.8762970566749573, | |
| "learning_rate": 4.9439696754079595e-05, | |
| "loss": 0.2356, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.3484365858415134, | |
| "grad_norm": 2.1406095027923584, | |
| "learning_rate": 4.940912229038745e-05, | |
| "loss": 0.2232, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.3536371318988494, | |
| "grad_norm": 1.4764164686203003, | |
| "learning_rate": 4.937774561386768e-05, | |
| "loss": 0.2281, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.3588376779561854, | |
| "grad_norm": 1.5396536588668823, | |
| "learning_rate": 4.934556775572377e-05, | |
| "loss": 0.2875, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.3640382240135214, | |
| "grad_norm": 1.0842628479003906, | |
| "learning_rate": 4.9312589773490304e-05, | |
| "loss": 0.2562, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.3692387700708574, | |
| "grad_norm": 1.8963087797164917, | |
| "learning_rate": 4.927881275099815e-05, | |
| "loss": 0.2413, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.3744393161281935, | |
| "grad_norm": 1.5899958610534668, | |
| "learning_rate": 4.9244237798338866e-05, | |
| "loss": 0.2979, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.3796398621855295, | |
| "grad_norm": 0.8220577836036682, | |
| "learning_rate": 4.920886605182823e-05, | |
| "loss": 0.2868, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.38484040824286553, | |
| "grad_norm": 1.0545523166656494, | |
| "learning_rate": 4.917269867396886e-05, | |
| "loss": 0.194, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.3900409543002015, | |
| "grad_norm": 1.3721591234207153, | |
| "learning_rate": 4.913573685341205e-05, | |
| "loss": 0.2109, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.3952415003575375, | |
| "grad_norm": 0.9382643699645996, | |
| "learning_rate": 4.909798180491865e-05, | |
| "loss": 0.2194, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.4004420464148736, | |
| "grad_norm": 0.6716025471687317, | |
| "learning_rate": 4.9059434769319205e-05, | |
| "loss": 0.2021, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.4056425924722096, | |
| "grad_norm": 2.405698537826538, | |
| "learning_rate": 4.902009701347313e-05, | |
| "loss": 0.2933, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.4108431385295456, | |
| "grad_norm": 1.7277915477752686, | |
| "learning_rate": 4.8979969830227086e-05, | |
| "loss": 0.2376, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.4160436845868816, | |
| "grad_norm": 1.790748119354248, | |
| "learning_rate": 4.8939054538372496e-05, | |
| "loss": 0.2227, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.4212442306442176, | |
| "grad_norm": 1.2813634872436523, | |
| "learning_rate": 4.889735248260221e-05, | |
| "loss": 0.2544, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.4264447767015537, | |
| "grad_norm": 0.9295778870582581, | |
| "learning_rate": 4.8854865033466275e-05, | |
| "loss": 0.1625, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.43164532275888967, | |
| "grad_norm": 1.9681141376495361, | |
| "learning_rate": 4.881159358732694e-05, | |
| "loss": 0.2262, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.4368458688162257, | |
| "grad_norm": 1.1844898462295532, | |
| "learning_rate": 4.8767539566312734e-05, | |
| "loss": 0.2683, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.4420464148735617, | |
| "grad_norm": 1.1099355220794678, | |
| "learning_rate": 4.8722704418271745e-05, | |
| "loss": 0.2281, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.4472469609308977, | |
| "grad_norm": 1.4917421340942383, | |
| "learning_rate": 4.867708961672399e-05, | |
| "loss": 0.3092, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.45244750698823377, | |
| "grad_norm": 1.1806445121765137, | |
| "learning_rate": 4.863069666081307e-05, | |
| "loss": 0.2272, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.45764805304556977, | |
| "grad_norm": 1.3496099710464478, | |
| "learning_rate": 4.8583527075256804e-05, | |
| "loss": 0.2299, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.4628485991029058, | |
| "grad_norm": 2.9580721855163574, | |
| "learning_rate": 4.853558241029723e-05, | |
| "loss": 0.2648, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.4680491451602418, | |
| "grad_norm": 0.47517985105514526, | |
| "learning_rate": 4.848686424164953e-05, | |
| "loss": 0.2166, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.4732496912175779, | |
| "grad_norm": 1.1966201066970825, | |
| "learning_rate": 4.8437374170450344e-05, | |
| "loss": 0.2499, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.47845023727491387, | |
| "grad_norm": 1.4806653261184692, | |
| "learning_rate": 4.8387113823205096e-05, | |
| "loss": 0.2532, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.48365078333224987, | |
| "grad_norm": 1.9070792198181152, | |
| "learning_rate": 4.833608485173457e-05, | |
| "loss": 0.2721, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.4888513293895859, | |
| "grad_norm": 1.1496449708938599, | |
| "learning_rate": 4.8284288933120594e-05, | |
| "loss": 0.2181, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.4940518754469219, | |
| "grad_norm": 1.1686209440231323, | |
| "learning_rate": 4.823172776965094e-05, | |
| "loss": 0.2084, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.49925242150425797, | |
| "grad_norm": 1.7963812351226807, | |
| "learning_rate": 4.8178403088763355e-05, | |
| "loss": 0.2436, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.504452967561594, | |
| "grad_norm": 1.3361034393310547, | |
| "learning_rate": 4.812431664298883e-05, | |
| "loss": 0.1777, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.50965351361893, | |
| "grad_norm": 0.7462561726570129, | |
| "learning_rate": 4.8069470209893974e-05, | |
| "loss": 0.2749, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.514854059676266, | |
| "grad_norm": 1.4435970783233643, | |
| "learning_rate": 4.801386559202259e-05, | |
| "loss": 0.2099, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.5200546057336021, | |
| "grad_norm": 1.6081739664077759, | |
| "learning_rate": 4.795750461683644e-05, | |
| "loss": 0.2594, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.5200546057336021, | |
| "eval_loss": 0.24056576192378998, | |
| "eval_runtime": 134.5423, | |
| "eval_samples_per_second": 14.285, | |
| "eval_steps_per_second": 14.285, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.5252551517909381, | |
| "grad_norm": 0.9048750996589661, | |
| "learning_rate": 4.790038913665519e-05, | |
| "loss": 0.2459, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.5304556978482741, | |
| "grad_norm": 1.2910796403884888, | |
| "learning_rate": 4.7842521028595526e-05, | |
| "loss": 0.2357, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.5356562439056101, | |
| "grad_norm": 1.6829766035079956, | |
| "learning_rate": 4.778390219450949e-05, | |
| "loss": 0.2348, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.5408567899629461, | |
| "grad_norm": 2.526048421859741, | |
| "learning_rate": 4.772453456092191e-05, | |
| "loss": 0.2503, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.5460573360202822, | |
| "grad_norm": 0.8338559865951538, | |
| "learning_rate": 4.766442007896715e-05, | |
| "loss": 0.1851, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.5512578820776182, | |
| "grad_norm": 2.0072736740112305, | |
| "learning_rate": 4.760356072432498e-05, | |
| "loss": 0.3063, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.5564584281349542, | |
| "grad_norm": 2.7068746089935303, | |
| "learning_rate": 4.754195849715557e-05, | |
| "loss": 0.2264, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.5616589741922902, | |
| "grad_norm": 1.7025487422943115, | |
| "learning_rate": 4.747961542203386e-05, | |
| "loss": 0.1975, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.5668595202496262, | |
| "grad_norm": 1.6216896772384644, | |
| "learning_rate": 4.741653354788295e-05, | |
| "loss": 0.232, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.5720600663069623, | |
| "grad_norm": 1.5931206941604614, | |
| "learning_rate": 4.735271494790678e-05, | |
| "loss": 0.2607, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.5772606123642983, | |
| "grad_norm": 1.2996855974197388, | |
| "learning_rate": 4.7288161719522016e-05, | |
| "loss": 0.2148, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.5824611584216343, | |
| "grad_norm": 1.3389666080474854, | |
| "learning_rate": 4.722287598428907e-05, | |
| "loss": 0.2831, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.5876617044789703, | |
| "grad_norm": 2.0776829719543457, | |
| "learning_rate": 4.7156859887842416e-05, | |
| "loss": 0.3034, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.5928622505363063, | |
| "grad_norm": 0.8629754781723022, | |
| "learning_rate": 4.709011559982006e-05, | |
| "loss": 0.2287, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.5980627965936424, | |
| "grad_norm": 1.2654669284820557, | |
| "learning_rate": 4.7022645313792235e-05, | |
| "loss": 0.2223, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.6032633426509784, | |
| "grad_norm": 1.1408824920654297, | |
| "learning_rate": 4.695445124718931e-05, | |
| "loss": 0.1832, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.6084638887083144, | |
| "grad_norm": 1.0831233263015747, | |
| "learning_rate": 4.6885535641228904e-05, | |
| "loss": 0.2787, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.6136644347656504, | |
| "grad_norm": 1.243690848350525, | |
| "learning_rate": 4.6815900760842236e-05, | |
| "loss": 0.2505, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.6188649808229865, | |
| "grad_norm": 2.173030138015747, | |
| "learning_rate": 4.674554889459968e-05, | |
| "loss": 0.2526, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.6240655268803225, | |
| "grad_norm": 1.0949965715408325, | |
| "learning_rate": 4.667448235463557e-05, | |
| "loss": 0.233, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.6292660729376585, | |
| "grad_norm": 2.3284902572631836, | |
| "learning_rate": 4.660270347657219e-05, | |
| "loss": 0.2447, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.6344666189949945, | |
| "grad_norm": 1.0869665145874023, | |
| "learning_rate": 4.6530214619443037e-05, | |
| "loss": 0.2217, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.6396671650523305, | |
| "grad_norm": 1.639493465423584, | |
| "learning_rate": 4.645701816561523e-05, | |
| "loss": 0.2322, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.6448677111096666, | |
| "grad_norm": 1.2198299169540405, | |
| "learning_rate": 4.63831165207113e-05, | |
| "loss": 0.1883, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.6500682571670026, | |
| "grad_norm": 1.4124974012374878, | |
| "learning_rate": 4.630851211353007e-05, | |
| "loss": 0.2559, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.6552688032243386, | |
| "grad_norm": 1.7080676555633545, | |
| "learning_rate": 4.623320739596685e-05, | |
| "loss": 0.2219, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.6604693492816746, | |
| "grad_norm": 2.443284511566162, | |
| "learning_rate": 4.615720484293286e-05, | |
| "loss": 0.2324, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.6656698953390106, | |
| "grad_norm": 0.6745538115501404, | |
| "learning_rate": 4.608050695227385e-05, | |
| "loss": 0.2877, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.6708704413963467, | |
| "grad_norm": 1.1423040628433228, | |
| "learning_rate": 4.60031162446881e-05, | |
| "loss": 0.2469, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.6760709874536827, | |
| "grad_norm": 1.5825380086898804, | |
| "learning_rate": 4.5925035263643444e-05, | |
| "loss": 0.2699, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.6812715335110187, | |
| "grad_norm": 1.138910174369812, | |
| "learning_rate": 4.5846266575293816e-05, | |
| "loss": 0.2457, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.6864720795683547, | |
| "grad_norm": 1.3718457221984863, | |
| "learning_rate": 4.576681276839483e-05, | |
| "loss": 0.2485, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.6916726256256907, | |
| "grad_norm": 1.4293012619018555, | |
| "learning_rate": 4.56866764542187e-05, | |
| "loss": 0.2458, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.6968731716830268, | |
| "grad_norm": 1.009885311126709, | |
| "learning_rate": 4.560586026646845e-05, | |
| "loss": 0.2077, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.7020737177403628, | |
| "grad_norm": 0.6243613362312317, | |
| "learning_rate": 4.552436686119134e-05, | |
| "loss": 0.2204, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.7072742637976988, | |
| "grad_norm": 1.6868172883987427, | |
| "learning_rate": 4.54421989166916e-05, | |
| "loss": 0.2372, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.7124748098550348, | |
| "grad_norm": 1.7123680114746094, | |
| "learning_rate": 4.5359359133442356e-05, | |
| "loss": 0.2613, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.7176753559123707, | |
| "grad_norm": 0.856176495552063, | |
| "learning_rate": 4.5275850233996925e-05, | |
| "loss": 0.2438, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.7228759019697069, | |
| "grad_norm": 1.1216453313827515, | |
| "learning_rate": 4.5191674962899314e-05, | |
| "loss": 0.2029, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.7280764480270429, | |
| "grad_norm": 1.8667545318603516, | |
| "learning_rate": 4.510683608659403e-05, | |
| "loss": 0.1938, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.7332769940843789, | |
| "grad_norm": 1.677372932434082, | |
| "learning_rate": 4.502133639333516e-05, | |
| "loss": 0.2053, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.7384775401417148, | |
| "grad_norm": 1.217119574546814, | |
| "learning_rate": 4.4935178693094714e-05, | |
| "loss": 0.1992, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.7436780861990508, | |
| "grad_norm": 2.1485345363616943, | |
| "learning_rate": 4.484836581747032e-05, | |
| "loss": 0.2454, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.748878632256387, | |
| "grad_norm": 1.3972569704055786, | |
| "learning_rate": 4.4760900619592085e-05, | |
| "loss": 0.1673, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.754079178313723, | |
| "grad_norm": 1.4621198177337646, | |
| "learning_rate": 4.467278597402894e-05, | |
| "loss": 0.2137, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.759279724371059, | |
| "grad_norm": 1.6665892601013184, | |
| "learning_rate": 4.4584024776694035e-05, | |
| "loss": 0.1556, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.764480270428395, | |
| "grad_norm": 1.4974132776260376, | |
| "learning_rate": 4.449461994474968e-05, | |
| "loss": 0.278, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.7696808164857311, | |
| "grad_norm": 0.9022512435913086, | |
| "learning_rate": 4.440457441651139e-05, | |
| "loss": 0.1929, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.774881362543067, | |
| "grad_norm": 1.8019062280654907, | |
| "learning_rate": 4.4313891151351375e-05, | |
| "loss": 0.2594, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.780081908600403, | |
| "grad_norm": 1.0030608177185059, | |
| "learning_rate": 4.422257312960123e-05, | |
| "loss": 0.1938, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.780081908600403, | |
| "eval_loss": 0.2387997955083847, | |
| "eval_runtime": 136.4254, | |
| "eval_samples_per_second": 14.088, | |
| "eval_steps_per_second": 14.088, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.785282454657739, | |
| "grad_norm": 1.8986437320709229, | |
| "learning_rate": 4.413062335245402e-05, | |
| "loss": 0.2154, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.790483000715075, | |
| "grad_norm": 1.5987744331359863, | |
| "learning_rate": 4.4038044841865614e-05, | |
| "loss": 0.2624, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.7956835467724112, | |
| "grad_norm": 1.032251000404358, | |
| "learning_rate": 4.394484064045542e-05, | |
| "loss": 0.2311, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.8008840928297472, | |
| "grad_norm": 1.9166332483291626, | |
| "learning_rate": 4.385101381140633e-05, | |
| "loss": 0.2384, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.8060846388870831, | |
| "grad_norm": 0.6986478567123413, | |
| "learning_rate": 4.375656743836407e-05, | |
| "loss": 0.1841, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.8112851849444191, | |
| "grad_norm": 0.631565511226654, | |
| "learning_rate": 4.366150462533588e-05, | |
| "loss": 0.2398, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.8164857310017551, | |
| "grad_norm": 1.0940667390823364, | |
| "learning_rate": 4.356582849658845e-05, | |
| "loss": 0.1876, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.8216862770590913, | |
| "grad_norm": 0.7327963709831238, | |
| "learning_rate": 4.34695421965453e-05, | |
| "loss": 0.2551, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.8268868231164272, | |
| "grad_norm": 1.5531721115112305, | |
| "learning_rate": 4.3372648889683364e-05, | |
| "loss": 0.1719, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.8320873691737632, | |
| "grad_norm": 0.8876403570175171, | |
| "learning_rate": 4.3275151760429075e-05, | |
| "loss": 0.2152, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.8372879152310992, | |
| "grad_norm": 2.079756259918213, | |
| "learning_rate": 4.317705401305362e-05, | |
| "loss": 0.2369, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.8424884612884352, | |
| "grad_norm": 1.2363635301589966, | |
| "learning_rate": 4.3078358871567706e-05, | |
| "loss": 0.2718, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.8476890073457714, | |
| "grad_norm": 1.3667513132095337, | |
| "learning_rate": 4.2979069579615564e-05, | |
| "loss": 0.2221, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.8528895534031073, | |
| "grad_norm": 1.1651591062545776, | |
| "learning_rate": 4.2879189400368314e-05, | |
| "loss": 0.2858, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.8580900994604433, | |
| "grad_norm": 0.9213271141052246, | |
| "learning_rate": 4.277872161641682e-05, | |
| "loss": 0.2187, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.8632906455177793, | |
| "grad_norm": 0.8052433133125305, | |
| "learning_rate": 4.267766952966369e-05, | |
| "loss": 0.2695, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.8684911915751153, | |
| "grad_norm": 1.9036948680877686, | |
| "learning_rate": 4.257603646121484e-05, | |
| "loss": 0.2253, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.8736917376324514, | |
| "grad_norm": 0.8116464018821716, | |
| "learning_rate": 4.247382575127031e-05, | |
| "loss": 0.2417, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.8788922836897874, | |
| "grad_norm": 1.7750636339187622, | |
| "learning_rate": 4.237104075901449e-05, | |
| "loss": 0.2438, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.8840928297471234, | |
| "grad_norm": 0.9960026144981384, | |
| "learning_rate": 4.226768486250572e-05, | |
| "loss": 0.2928, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.8892933758044594, | |
| "grad_norm": 1.5663594007492065, | |
| "learning_rate": 4.216376145856529e-05, | |
| "loss": 0.249, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.8944939218617954, | |
| "grad_norm": 2.8207902908325195, | |
| "learning_rate": 4.205927396266577e-05, | |
| "loss": 0.233, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.8996944679191315, | |
| "grad_norm": 0.683710515499115, | |
| "learning_rate": 4.195422580881878e-05, | |
| "loss": 0.1886, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.9048950139764675, | |
| "grad_norm": 1.2048577070236206, | |
| "learning_rate": 4.1848620449462115e-05, | |
| "loss": 0.205, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.9100955600338035, | |
| "grad_norm": 1.833343505859375, | |
| "learning_rate": 4.17424613553463e-05, | |
| "loss": 0.2846, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.9152961060911395, | |
| "grad_norm": 1.2163664102554321, | |
| "learning_rate": 4.163575201542052e-05, | |
| "loss": 0.2269, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.9204966521484755, | |
| "grad_norm": 0.7797666788101196, | |
| "learning_rate": 4.152849593671793e-05, | |
| "loss": 0.1856, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.9256971982058116, | |
| "grad_norm": 1.4620978832244873, | |
| "learning_rate": 4.142069664424041e-05, | |
| "loss": 0.2599, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.9308977442631476, | |
| "grad_norm": 0.480034202337265, | |
| "learning_rate": 4.1312357680842735e-05, | |
| "loss": 0.2485, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.9360982903204836, | |
| "grad_norm": 1.0644006729125977, | |
| "learning_rate": 4.120348260711611e-05, | |
| "loss": 0.2576, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.9412988363778196, | |
| "grad_norm": 1.8595833778381348, | |
| "learning_rate": 4.109407500127116e-05, | |
| "loss": 0.2438, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.9464993824351557, | |
| "grad_norm": 0.9909834861755371, | |
| "learning_rate": 4.098413845902033e-05, | |
| "loss": 0.241, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.9516999284924917, | |
| "grad_norm": 1.157691478729248, | |
| "learning_rate": 4.0873676593459725e-05, | |
| "loss": 0.2383, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.9569004745498277, | |
| "grad_norm": 1.2096604108810425, | |
| "learning_rate": 4.076269303495033e-05, | |
| "loss": 0.2554, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.9621010206071637, | |
| "grad_norm": 0.8286678194999695, | |
| "learning_rate": 4.065119143099874e-05, | |
| "loss": 0.1894, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.9673015666644997, | |
| "grad_norm": 0.9873716235160828, | |
| "learning_rate": 4.053917544613723e-05, | |
| "loss": 0.2311, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.9725021127218358, | |
| "grad_norm": 0.9408676028251648, | |
| "learning_rate": 4.042664876180341e-05, | |
| "loss": 0.2386, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.9777026587791718, | |
| "grad_norm": 0.6958754062652588, | |
| "learning_rate": 4.031361507621911e-05, | |
| "loss": 0.2468, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.9829032048365078, | |
| "grad_norm": 0.8920957446098328, | |
| "learning_rate": 4.0200078104268944e-05, | |
| "loss": 0.2584, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.9881037508938438, | |
| "grad_norm": 1.3254570960998535, | |
| "learning_rate": 4.0086041577378166e-05, | |
| "loss": 0.2755, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.9933042969511798, | |
| "grad_norm": 1.2101293802261353, | |
| "learning_rate": 3.9971509243390025e-05, | |
| "loss": 0.2417, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.9985048430085159, | |
| "grad_norm": 0.42130109667778015, | |
| "learning_rate": 3.985648486644267e-05, | |
| "loss": 0.1982, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 1.0036403822401352, | |
| "grad_norm": 2.4333481788635254, | |
| "learning_rate": 3.974097222684532e-05, | |
| "loss": 0.2277, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 1.0088409282974713, | |
| "grad_norm": 1.6568609476089478, | |
| "learning_rate": 3.962497512095412e-05, | |
| "loss": 0.1901, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 1.0140414743548072, | |
| "grad_norm": 1.0351656675338745, | |
| "learning_rate": 3.9508497361047334e-05, | |
| "loss": 0.2923, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.0192420204121433, | |
| "grad_norm": 0.8283625245094299, | |
| "learning_rate": 3.939154277520006e-05, | |
| "loss": 0.2245, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 1.0244425664694794, | |
| "grad_norm": 0.6887472867965698, | |
| "learning_rate": 3.92741152071584e-05, | |
| "loss": 0.1447, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 1.0296431125268153, | |
| "grad_norm": 2.1077232360839844, | |
| "learning_rate": 3.915621851621318e-05, | |
| "loss": 0.2368, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 1.0348436585841514, | |
| "grad_norm": 0.7262524366378784, | |
| "learning_rate": 3.903785657707307e-05, | |
| "loss": 0.2153, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 1.0400442046414873, | |
| "grad_norm": 0.6093840003013611, | |
| "learning_rate": 3.8919033279737274e-05, | |
| "loss": 0.1695, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.0400442046414873, | |
| "eval_loss": 0.24628731608390808, | |
| "eval_runtime": 134.8334, | |
| "eval_samples_per_second": 14.255, | |
| "eval_steps_per_second": 14.255, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.0452447506988234, | |
| "grad_norm": 1.6017835140228271, | |
| "learning_rate": 3.879975252936761e-05, | |
| "loss": 0.202, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 1.0504452967561595, | |
| "grad_norm": 1.7225841283798218, | |
| "learning_rate": 3.8680018246160295e-05, | |
| "loss": 0.1952, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 1.0556458428134954, | |
| "grad_norm": 2.1085808277130127, | |
| "learning_rate": 3.855983436521699e-05, | |
| "loss": 0.2721, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 1.0608463888708315, | |
| "grad_norm": 0.8755818605422974, | |
| "learning_rate": 3.843920483641551e-05, | |
| "loss": 0.2199, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 1.0660469349281674, | |
| "grad_norm": 0.6190668344497681, | |
| "learning_rate": 3.831813362428005e-05, | |
| "loss": 0.1944, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.0712474809855035, | |
| "grad_norm": 0.6328080296516418, | |
| "learning_rate": 3.819662470785082e-05, | |
| "loss": 0.2687, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 1.0764480270428396, | |
| "grad_norm": 1.3243086338043213, | |
| "learning_rate": 3.8074682080553335e-05, | |
| "loss": 0.1866, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 1.0816485731001755, | |
| "grad_norm": 1.4289870262145996, | |
| "learning_rate": 3.795230975006712e-05, | |
| "loss": 0.1979, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 1.0868491191575116, | |
| "grad_norm": 1.1440227031707764, | |
| "learning_rate": 3.782951173819403e-05, | |
| "loss": 0.2097, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 1.0920496652148475, | |
| "grad_norm": 0.7256899476051331, | |
| "learning_rate": 3.7706292080726055e-05, | |
| "loss": 0.2522, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.0972502112721836, | |
| "grad_norm": 1.0164716243743896, | |
| "learning_rate": 3.75826548273127e-05, | |
| "loss": 0.2312, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 1.1024507573295197, | |
| "grad_norm": 1.053582787513733, | |
| "learning_rate": 3.7458604041327874e-05, | |
| "loss": 0.1406, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 1.1076513033868556, | |
| "grad_norm": 1.578212022781372, | |
| "learning_rate": 3.733414379973635e-05, | |
| "loss": 0.1913, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 1.1128518494441917, | |
| "grad_norm": 1.1891608238220215, | |
| "learning_rate": 3.720927819295979e-05, | |
| "loss": 0.2298, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 1.1180523955015276, | |
| "grad_norm": 0.4603135585784912, | |
| "learning_rate": 3.708401132474228e-05, | |
| "loss": 0.2261, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.1232529415588637, | |
| "grad_norm": 2.1462292671203613, | |
| "learning_rate": 3.695834731201548e-05, | |
| "loss": 0.2354, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 1.1284534876161998, | |
| "grad_norm": 1.139315128326416, | |
| "learning_rate": 3.683229028476334e-05, | |
| "loss": 0.1615, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 1.1336540336735357, | |
| "grad_norm": 1.1548924446105957, | |
| "learning_rate": 3.6705844385886334e-05, | |
| "loss": 0.1705, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 1.1388545797308718, | |
| "grad_norm": 1.0922483205795288, | |
| "learning_rate": 3.6579013771065305e-05, | |
| "loss": 0.1906, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 1.1440551257882077, | |
| "grad_norm": 0.8926368951797485, | |
| "learning_rate": 3.645180260862492e-05, | |
| "loss": 0.1744, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.1492556718455438, | |
| "grad_norm": 1.1546534299850464, | |
| "learning_rate": 3.632421507939661e-05, | |
| "loss": 0.2112, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 1.1544562179028799, | |
| "grad_norm": 1.9052295684814453, | |
| "learning_rate": 3.6196255376581254e-05, | |
| "loss": 0.2351, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 1.1596567639602158, | |
| "grad_norm": 0.9189292788505554, | |
| "learning_rate": 3.6067927705611304e-05, | |
| "loss": 0.2165, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 1.1648573100175519, | |
| "grad_norm": 0.5956322550773621, | |
| "learning_rate": 3.593923628401259e-05, | |
| "loss": 0.2127, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 1.1700578560748878, | |
| "grad_norm": 2.0540506839752197, | |
| "learning_rate": 3.581018534126571e-05, | |
| "loss": 0.2175, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.1752584021322239, | |
| "grad_norm": 0.8053009510040283, | |
| "learning_rate": 3.568077911866703e-05, | |
| "loss": 0.2046, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 1.18045894818956, | |
| "grad_norm": 1.437412142753601, | |
| "learning_rate": 3.5551021869189286e-05, | |
| "loss": 0.2297, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 1.1856594942468959, | |
| "grad_norm": 0.7657543420791626, | |
| "learning_rate": 3.542091785734184e-05, | |
| "loss": 0.1784, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 1.190860040304232, | |
| "grad_norm": 1.170629620552063, | |
| "learning_rate": 3.529047135903045e-05, | |
| "loss": 0.1824, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 1.1960605863615679, | |
| "grad_norm": 1.3208539485931396, | |
| "learning_rate": 3.5159686661416834e-05, | |
| "loss": 0.1682, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.201261132418904, | |
| "grad_norm": 0.5824002027511597, | |
| "learning_rate": 3.502856806277773e-05, | |
| "loss": 0.1631, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 1.20646167847624, | |
| "grad_norm": 2.711642265319824, | |
| "learning_rate": 3.489711987236357e-05, | |
| "loss": 0.1973, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 1.211662224533576, | |
| "grad_norm": 0.9232580661773682, | |
| "learning_rate": 3.476534641025698e-05, | |
| "loss": 0.246, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 1.216862770590912, | |
| "grad_norm": 1.4809739589691162, | |
| "learning_rate": 3.463325200723071e-05, | |
| "loss": 0.2476, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 1.222063316648248, | |
| "grad_norm": 1.0022258758544922, | |
| "learning_rate": 3.4500841004605324e-05, | |
| "loss": 0.1629, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 1.227263862705584, | |
| "grad_norm": 0.6187863945960999, | |
| "learning_rate": 3.436811775410651e-05, | |
| "loss": 0.2049, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 1.2324644087629202, | |
| "grad_norm": 1.0579588413238525, | |
| "learning_rate": 3.42350866177221e-05, | |
| "loss": 0.1923, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 1.237664954820256, | |
| "grad_norm": 0.8715612888336182, | |
| "learning_rate": 3.410175196755866e-05, | |
| "loss": 0.1777, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 1.2428655008775922, | |
| "grad_norm": 1.0652248859405518, | |
| "learning_rate": 3.396811818569785e-05, | |
| "loss": 0.258, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 1.248066046934928, | |
| "grad_norm": 1.5773491859436035, | |
| "learning_rate": 3.383418966405234e-05, | |
| "loss": 0.2021, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.2532665929922642, | |
| "grad_norm": 1.5874974727630615, | |
| "learning_rate": 3.369997080422155e-05, | |
| "loss": 0.2206, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 1.2584671390496003, | |
| "grad_norm": 1.1131178140640259, | |
| "learning_rate": 3.356546601734692e-05, | |
| "loss": 0.2099, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 1.2636676851069362, | |
| "grad_norm": 1.019285798072815, | |
| "learning_rate": 3.3430679723966976e-05, | |
| "loss": 0.2599, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 1.2688682311642723, | |
| "grad_norm": 1.3517482280731201, | |
| "learning_rate": 3.3295616353872026e-05, | |
| "loss": 0.1706, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 1.2740687772216082, | |
| "grad_norm": 1.2477843761444092, | |
| "learning_rate": 3.3160280345958614e-05, | |
| "loss": 0.2172, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 1.2792693232789443, | |
| "grad_norm": 0.7591115236282349, | |
| "learning_rate": 3.3024676148083555e-05, | |
| "loss": 0.2201, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 1.2844698693362804, | |
| "grad_norm": 1.461832046508789, | |
| "learning_rate": 3.288880821691785e-05, | |
| "loss": 0.1695, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 1.2896704153936163, | |
| "grad_norm": 1.8396881818771362, | |
| "learning_rate": 3.2752681017800144e-05, | |
| "loss": 0.175, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 1.2948709614509524, | |
| "grad_norm": 1.3018438816070557, | |
| "learning_rate": 3.261629902459e-05, | |
| "loss": 0.2071, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 1.3000715075082883, | |
| "grad_norm": 1.120477557182312, | |
| "learning_rate": 3.2479666719520886e-05, | |
| "loss": 0.1841, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.3000715075082883, | |
| "eval_loss": 0.23911671340465546, | |
| "eval_runtime": 135.2893, | |
| "eval_samples_per_second": 14.207, | |
| "eval_steps_per_second": 14.207, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.3052720535656244, | |
| "grad_norm": 2.349160671234131, | |
| "learning_rate": 3.23427885930528e-05, | |
| "loss": 0.1993, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 1.3104725996229605, | |
| "grad_norm": 0.9985238313674927, | |
| "learning_rate": 3.220566914372477e-05, | |
| "loss": 0.1448, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 1.3156731456802964, | |
| "grad_norm": 1.038683295249939, | |
| "learning_rate": 3.2068312878006955e-05, | |
| "loss": 0.1793, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 1.3208736917376325, | |
| "grad_norm": 1.3996448516845703, | |
| "learning_rate": 3.193072431015254e-05, | |
| "loss": 0.1495, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 1.3260742377949684, | |
| "grad_norm": 1.8597303628921509, | |
| "learning_rate": 3.17929079620494e-05, | |
| "loss": 0.1746, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 1.3312747838523045, | |
| "grad_norm": 0.5454281568527222, | |
| "learning_rate": 3.1654868363071484e-05, | |
| "loss": 0.1633, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 1.3364753299096406, | |
| "grad_norm": 2.386983871459961, | |
| "learning_rate": 3.151661004992992e-05, | |
| "loss": 0.2391, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 1.3416758759669765, | |
| "grad_norm": 1.90854811668396, | |
| "learning_rate": 3.137813756652395e-05, | |
| "loss": 0.1816, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 1.3468764220243126, | |
| "grad_norm": 0.8159545063972473, | |
| "learning_rate": 3.12394554637916e-05, | |
| "loss": 0.235, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 1.3520769680816485, | |
| "grad_norm": 1.6975359916687012, | |
| "learning_rate": 3.110056829956006e-05, | |
| "loss": 0.1799, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.3572775141389846, | |
| "grad_norm": 1.2948479652404785, | |
| "learning_rate": 3.096148063839596e-05, | |
| "loss": 0.1747, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 1.3624780601963207, | |
| "grad_norm": 1.0926662683486938, | |
| "learning_rate": 3.08221970514553e-05, | |
| "loss": 0.1946, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 1.3676786062536566, | |
| "grad_norm": 2.317523956298828, | |
| "learning_rate": 3.068272211633326e-05, | |
| "loss": 0.2677, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 1.3728791523109927, | |
| "grad_norm": 1.379921555519104, | |
| "learning_rate": 3.0543060416913696e-05, | |
| "loss": 0.2897, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 1.3780796983683286, | |
| "grad_norm": 1.2815351486206055, | |
| "learning_rate": 3.0403216543218547e-05, | |
| "loss": 0.205, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 1.3832802444256647, | |
| "grad_norm": 1.7982994318008423, | |
| "learning_rate": 3.026319509125697e-05, | |
| "loss": 0.1774, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 1.3884807904830008, | |
| "grad_norm": 2.2039549350738525, | |
| "learning_rate": 3.0123000662874272e-05, | |
| "loss": 0.1811, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 1.3936813365403367, | |
| "grad_norm": 1.7380796670913696, | |
| "learning_rate": 2.9982637865600683e-05, | |
| "loss": 0.2688, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 1.3988818825976728, | |
| "grad_norm": 0.9833778738975525, | |
| "learning_rate": 2.9842111312499914e-05, | |
| "loss": 0.1609, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 1.4040824286550087, | |
| "grad_norm": 2.575516939163208, | |
| "learning_rate": 2.9701425622017583e-05, | |
| "loss": 0.1734, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.4092829747123448, | |
| "grad_norm": 3.007417678833008, | |
| "learning_rate": 2.9560585417829368e-05, | |
| "loss": 0.2598, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 1.4144835207696809, | |
| "grad_norm": 1.1851876974105835, | |
| "learning_rate": 2.9419595328689138e-05, | |
| "loss": 0.1271, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 1.4196840668270168, | |
| "grad_norm": 2.1141178607940674, | |
| "learning_rate": 2.9278459988276703e-05, | |
| "loss": 0.1752, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 1.4248846128843529, | |
| "grad_norm": 1.5198488235473633, | |
| "learning_rate": 2.913718403504567e-05, | |
| "loss": 0.2225, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 1.4300851589416887, | |
| "grad_norm": 0.9600934386253357, | |
| "learning_rate": 2.899577211207087e-05, | |
| "loss": 0.2169, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 1.4352857049990249, | |
| "grad_norm": 1.3893183469772339, | |
| "learning_rate": 2.8854228866895855e-05, | |
| "loss": 0.2257, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 1.440486251056361, | |
| "grad_norm": 1.2468478679656982, | |
| "learning_rate": 2.8712558951380097e-05, | |
| "loss": 0.221, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 1.445686797113697, | |
| "grad_norm": 0.7069809436798096, | |
| "learning_rate": 2.857076702154614e-05, | |
| "loss": 0.1912, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 1.450887343171033, | |
| "grad_norm": 1.5114367008209229, | |
| "learning_rate": 2.8428857737426556e-05, | |
| "loss": 0.2006, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 1.4560878892283688, | |
| "grad_norm": 0.9951623678207397, | |
| "learning_rate": 2.8286835762910803e-05, | |
| "loss": 0.1765, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.461288435285705, | |
| "grad_norm": 0.7911898493766785, | |
| "learning_rate": 2.8144705765591938e-05, | |
| "loss": 0.1737, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 1.466488981343041, | |
| "grad_norm": 0.7575000524520874, | |
| "learning_rate": 2.800247241661321e-05, | |
| "loss": 0.2185, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 1.4716895274003772, | |
| "grad_norm": 1.342424988746643, | |
| "learning_rate": 2.7860140390514583e-05, | |
| "loss": 0.2083, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 1.476890073457713, | |
| "grad_norm": 2.5245749950408936, | |
| "learning_rate": 2.771771436507903e-05, | |
| "loss": 0.1811, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 1.482090619515049, | |
| "grad_norm": 2.4802660942077637, | |
| "learning_rate": 2.757519902117886e-05, | |
| "loss": 0.1575, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 1.487291165572385, | |
| "grad_norm": 1.177516222000122, | |
| "learning_rate": 2.743259904262187e-05, | |
| "loss": 0.2133, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 1.4924917116297212, | |
| "grad_norm": 1.1934640407562256, | |
| "learning_rate": 2.7289919115997374e-05, | |
| "loss": 0.23, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 1.4976922576870573, | |
| "grad_norm": 1.5221962928771973, | |
| "learning_rate": 2.714716393052223e-05, | |
| "loss": 0.2154, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 1.5028928037443932, | |
| "grad_norm": 2.0732405185699463, | |
| "learning_rate": 2.7004338177886672e-05, | |
| "loss": 0.1759, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 1.508093349801729, | |
| "grad_norm": 0.8759207129478455, | |
| "learning_rate": 2.686144655210016e-05, | |
| "loss": 0.2008, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.5132938958590652, | |
| "grad_norm": 0.9305397868156433, | |
| "learning_rate": 2.6718493749337105e-05, | |
| "loss": 0.1785, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 1.5184944419164013, | |
| "grad_norm": 0.9819073677062988, | |
| "learning_rate": 2.6575484467782486e-05, | |
| "loss": 0.2719, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 1.5236949879737374, | |
| "grad_norm": 2.144178628921509, | |
| "learning_rate": 2.6432423407477496e-05, | |
| "loss": 0.1598, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 1.5288955340310733, | |
| "grad_norm": 2.3962485790252686, | |
| "learning_rate": 2.6289315270165062e-05, | |
| "loss": 0.2127, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 1.5340960800884091, | |
| "grad_norm": 1.1640074253082275, | |
| "learning_rate": 2.6146164759135266e-05, | |
| "loss": 0.1784, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 1.5392966261457453, | |
| "grad_norm": 1.0884958505630493, | |
| "learning_rate": 2.6002976579070872e-05, | |
| "loss": 0.1717, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 1.5444971722030814, | |
| "grad_norm": 1.471543312072754, | |
| "learning_rate": 2.5859755435892597e-05, | |
| "loss": 0.1892, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 1.5496977182604175, | |
| "grad_norm": 1.1566507816314697, | |
| "learning_rate": 2.5716506036604542e-05, | |
| "loss": 0.2027, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 1.5548982643177534, | |
| "grad_norm": 1.8999615907669067, | |
| "learning_rate": 2.557323308913942e-05, | |
| "loss": 0.2162, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 1.5600988103750892, | |
| "grad_norm": 1.2542750835418701, | |
| "learning_rate": 2.542994130220388e-05, | |
| "loss": 0.1548, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.5600988103750892, | |
| "eval_loss": 0.24241599440574646, | |
| "eval_runtime": 135.9654, | |
| "eval_samples_per_second": 14.136, | |
| "eval_steps_per_second": 14.136, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.5652993564324253, | |
| "grad_norm": 2.8087780475616455, | |
| "learning_rate": 2.5286635385123725e-05, | |
| "loss": 0.24, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 1.5704999024897615, | |
| "grad_norm": 1.2270337343215942, | |
| "learning_rate": 2.5143320047689173e-05, | |
| "loss": 0.1968, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 1.5757004485470976, | |
| "grad_norm": 1.490675926208496, | |
| "learning_rate": 2.5e-05, | |
| "loss": 0.167, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 1.5809009946044335, | |
| "grad_norm": 0.7937414646148682, | |
| "learning_rate": 2.485667995231084e-05, | |
| "loss": 0.1436, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 1.5861015406617693, | |
| "grad_norm": 1.8276423215866089, | |
| "learning_rate": 2.4713364614876274e-05, | |
| "loss": 0.2169, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 1.5913020867191054, | |
| "grad_norm": 2.1891725063323975, | |
| "learning_rate": 2.4570058697796125e-05, | |
| "loss": 0.2003, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 1.5965026327764416, | |
| "grad_norm": 1.920414686203003, | |
| "learning_rate": 2.4426766910860585e-05, | |
| "loss": 0.224, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 1.6017031788337777, | |
| "grad_norm": 1.974658727645874, | |
| "learning_rate": 2.428349396339547e-05, | |
| "loss": 0.1934, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 1.6069037248911135, | |
| "grad_norm": 2.3854596614837646, | |
| "learning_rate": 2.4140244564107402e-05, | |
| "loss": 0.2128, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 1.6121042709484494, | |
| "grad_norm": 1.476598858833313, | |
| "learning_rate": 2.3997023420929137e-05, | |
| "loss": 0.1819, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.6173048170057855, | |
| "grad_norm": 1.3164430856704712, | |
| "learning_rate": 2.3853835240864743e-05, | |
| "loss": 0.222, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 1.6225053630631217, | |
| "grad_norm": 1.467546820640564, | |
| "learning_rate": 2.3710684729834954e-05, | |
| "loss": 0.173, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 1.6277059091204578, | |
| "grad_norm": 0.9425441026687622, | |
| "learning_rate": 2.3567576592522507e-05, | |
| "loss": 0.2174, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 1.6329064551777936, | |
| "grad_norm": 1.062456488609314, | |
| "learning_rate": 2.342451553221752e-05, | |
| "loss": 0.1934, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 1.6381070012351295, | |
| "grad_norm": 1.7149615287780762, | |
| "learning_rate": 2.32815062506629e-05, | |
| "loss": 0.2479, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 1.6433075472924656, | |
| "grad_norm": 0.6858556866645813, | |
| "learning_rate": 2.3138553447899835e-05, | |
| "loss": 0.1825, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 1.6485080933498018, | |
| "grad_norm": 0.9924718737602234, | |
| "learning_rate": 2.299566182211333e-05, | |
| "loss": 0.155, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 1.6537086394071379, | |
| "grad_norm": 2.138089656829834, | |
| "learning_rate": 2.2852836069477773e-05, | |
| "loss": 0.2105, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 1.6589091854644737, | |
| "grad_norm": 1.5541861057281494, | |
| "learning_rate": 2.2710080884002632e-05, | |
| "loss": 0.2087, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 1.6641097315218096, | |
| "grad_norm": 1.846656084060669, | |
| "learning_rate": 2.2567400957378132e-05, | |
| "loss": 0.1669, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.6693102775791457, | |
| "grad_norm": 2.2019214630126953, | |
| "learning_rate": 2.2424800978821146e-05, | |
| "loss": 0.1955, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 1.6745108236364818, | |
| "grad_norm": 0.8931058645248413, | |
| "learning_rate": 2.228228563492098e-05, | |
| "loss": 0.1679, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 1.679711369693818, | |
| "grad_norm": 1.5306602716445923, | |
| "learning_rate": 2.2139859609485426e-05, | |
| "loss": 0.1887, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 1.6849119157511538, | |
| "grad_norm": 0.7173328399658203, | |
| "learning_rate": 2.199752758338679e-05, | |
| "loss": 0.1744, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 1.6901124618084897, | |
| "grad_norm": 2.8038320541381836, | |
| "learning_rate": 2.1855294234408068e-05, | |
| "loss": 0.2108, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 1.6953130078658258, | |
| "grad_norm": 1.2980599403381348, | |
| "learning_rate": 2.1713164237089203e-05, | |
| "loss": 0.1721, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 1.700513553923162, | |
| "grad_norm": 1.4280049800872803, | |
| "learning_rate": 2.1571142262573457e-05, | |
| "loss": 0.1959, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 1.705714099980498, | |
| "grad_norm": 2.656005382537842, | |
| "learning_rate": 2.1429232978453862e-05, | |
| "loss": 0.2284, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 1.710914646037834, | |
| "grad_norm": 0.8656441569328308, | |
| "learning_rate": 2.128744104861991e-05, | |
| "loss": 0.2159, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 1.7161151920951698, | |
| "grad_norm": 1.6419271230697632, | |
| "learning_rate": 2.1145771133104157e-05, | |
| "loss": 0.1671, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.721315738152506, | |
| "grad_norm": 1.286908507347107, | |
| "learning_rate": 2.1004227887929133e-05, | |
| "loss": 0.1683, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 1.726516284209842, | |
| "grad_norm": 3.205409288406372, | |
| "learning_rate": 2.086281596495434e-05, | |
| "loss": 0.1585, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 1.7317168302671782, | |
| "grad_norm": 0.6113395094871521, | |
| "learning_rate": 2.07215400117233e-05, | |
| "loss": 0.1593, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 1.736917376324514, | |
| "grad_norm": 1.3752492666244507, | |
| "learning_rate": 2.0580404671310878e-05, | |
| "loss": 0.2058, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 1.7421179223818501, | |
| "grad_norm": 0.68391352891922, | |
| "learning_rate": 2.0439414582170628e-05, | |
| "loss": 0.1796, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 1.747318468439186, | |
| "grad_norm": 1.9185495376586914, | |
| "learning_rate": 2.0298574377982427e-05, | |
| "loss": 0.2212, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 1.7525190144965221, | |
| "grad_norm": 1.3910088539123535, | |
| "learning_rate": 2.015788868750009e-05, | |
| "loss": 0.1488, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 1.7577195605538583, | |
| "grad_norm": 0.8257030248641968, | |
| "learning_rate": 2.001736213439933e-05, | |
| "loss": 0.1957, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 1.7629201066111941, | |
| "grad_norm": 0.8184394240379333, | |
| "learning_rate": 1.987699933712573e-05, | |
| "loss": 0.2042, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 1.7681206526685302, | |
| "grad_norm": 0.9625434875488281, | |
| "learning_rate": 1.9736804908743033e-05, | |
| "loss": 0.1953, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.7733211987258661, | |
| "grad_norm": 2.588742256164551, | |
| "learning_rate": 1.959678345678146e-05, | |
| "loss": 0.2007, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 1.7785217447832022, | |
| "grad_norm": 1.6495355367660522, | |
| "learning_rate": 1.9456939583086303e-05, | |
| "loss": 0.1823, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 1.7837222908405383, | |
| "grad_norm": 1.325899600982666, | |
| "learning_rate": 1.9317277883666745e-05, | |
| "loss": 0.2144, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 1.7889228368978742, | |
| "grad_norm": 1.2811932563781738, | |
| "learning_rate": 1.91778029485447e-05, | |
| "loss": 0.2244, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 1.7941233829552103, | |
| "grad_norm": 1.6615418195724487, | |
| "learning_rate": 1.9038519361604046e-05, | |
| "loss": 0.1965, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.7993239290125462, | |
| "grad_norm": 1.7860767841339111, | |
| "learning_rate": 1.8899431700439946e-05, | |
| "loss": 0.206, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 1.8045244750698823, | |
| "grad_norm": 1.323864221572876, | |
| "learning_rate": 1.876054453620841e-05, | |
| "loss": 0.1507, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 1.8097250211272184, | |
| "grad_norm": 1.264664649963379, | |
| "learning_rate": 1.8621862433476054e-05, | |
| "loss": 0.1847, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 1.8149255671845543, | |
| "grad_norm": 2.377115249633789, | |
| "learning_rate": 1.8483389950070097e-05, | |
| "loss": 0.2117, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 1.8201261132418904, | |
| "grad_norm": 1.387811541557312, | |
| "learning_rate": 1.8345131636928518e-05, | |
| "loss": 0.2048, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.8201261132418904, | |
| "eval_loss": 0.2365516871213913, | |
| "eval_runtime": 134.9588, | |
| "eval_samples_per_second": 14.241, | |
| "eval_steps_per_second": 14.241, | |
| "step": 3500 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 5769, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.2327670832608051e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |