| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 1383, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.021691973969631236, |
| "grad_norm": 65.0, |
| "learning_rate": 6.474820143884893e-07, |
| "loss": 2.3602, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.04338394793926247, |
| "grad_norm": 52.75, |
| "learning_rate": 1.366906474820144e-06, |
| "loss": 2.1594, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.0650759219088937, |
| "grad_norm": 61.25, |
| "learning_rate": 2.0863309352517987e-06, |
| "loss": 2.0571, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.08676789587852494, |
| "grad_norm": 18.875, |
| "learning_rate": 2.805755395683453e-06, |
| "loss": 1.5085, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.10845986984815618, |
| "grad_norm": 10.0, |
| "learning_rate": 3.525179856115108e-06, |
| "loss": 1.1625, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.1301518438177874, |
| "grad_norm": 5.78125, |
| "learning_rate": 4.244604316546763e-06, |
| "loss": 0.9246, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.15184381778741865, |
| "grad_norm": 5.9375, |
| "learning_rate": 4.9640287769784175e-06, |
| "loss": 0.8639, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.1735357917570499, |
| "grad_norm": 5.15625, |
| "learning_rate": 5.683453237410073e-06, |
| "loss": 0.8425, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.19522776572668113, |
| "grad_norm": 5.21875, |
| "learning_rate": 6.402877697841727e-06, |
| "loss": 0.8051, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.21691973969631237, |
| "grad_norm": 5.71875, |
| "learning_rate": 7.122302158273382e-06, |
| "loss": 0.811, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.2386117136659436, |
| "grad_norm": 6.3125, |
| "learning_rate": 7.841726618705036e-06, |
| "loss": 0.7728, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.2603036876355748, |
| "grad_norm": 5.40625, |
| "learning_rate": 8.561151079136692e-06, |
| "loss": 0.782, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.28199566160520606, |
| "grad_norm": 4.78125, |
| "learning_rate": 9.280575539568346e-06, |
| "loss": 0.7653, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.3036876355748373, |
| "grad_norm": 5.28125, |
| "learning_rate": 1e-05, |
| "loss": 0.7803, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.32537960954446854, |
| "grad_norm": 5.40625, |
| "learning_rate": 9.998405678466673e-06, |
| "loss": 0.7212, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.3470715835140998, |
| "grad_norm": 5.1875, |
| "learning_rate": 9.993623730611148e-06, |
| "loss": 0.731, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.368763557483731, |
| "grad_norm": 5.125, |
| "learning_rate": 9.985657206018403e-06, |
| "loss": 0.7328, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.39045553145336226, |
| "grad_norm": 4.84375, |
| "learning_rate": 9.97451118516912e-06, |
| "loss": 0.725, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.4121475054229935, |
| "grad_norm": 5.21875, |
| "learning_rate": 9.960192776199717e-06, |
| "loss": 0.714, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.43383947939262474, |
| "grad_norm": 5.03125, |
| "learning_rate": 9.942711110369292e-06, |
| "loss": 0.7096, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.455531453362256, |
| "grad_norm": 4.71875, |
| "learning_rate": 9.922077336236354e-06, |
| "loss": 0.6876, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.4772234273318872, |
| "grad_norm": 4.75, |
| "learning_rate": 9.898304612549068e-06, |
| "loss": 0.696, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.49891540130151846, |
| "grad_norm": 5.15625, |
| "learning_rate": 9.871408099853548e-06, |
| "loss": 0.715, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.5206073752711496, |
| "grad_norm": 4.875, |
| "learning_rate": 9.841404950825537e-06, |
| "loss": 0.723, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.5422993492407809, |
| "grad_norm": 5.09375, |
| "learning_rate": 9.808314299331661e-06, |
| "loss": 0.6996, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.5639913232104121, |
| "grad_norm": 5.59375, |
| "learning_rate": 9.772157248227212e-06, |
| "loss": 0.6812, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.5856832971800434, |
| "grad_norm": 4.59375, |
| "learning_rate": 9.732956855898251e-06, |
| "loss": 0.6851, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.6073752711496746, |
| "grad_norm": 4.65625, |
| "learning_rate": 9.690738121556622e-06, |
| "loss": 0.6719, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.6290672451193059, |
| "grad_norm": 5.21875, |
| "learning_rate": 9.645527969297232e-06, |
| "loss": 0.6816, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.6507592190889371, |
| "grad_norm": 4.6875, |
| "learning_rate": 9.59735523092779e-06, |
| "loss": 0.6893, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.6724511930585684, |
| "grad_norm": 4.65625, |
| "learning_rate": 9.546250627581937e-06, |
| "loss": 0.659, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.6941431670281996, |
| "grad_norm": 5.1875, |
| "learning_rate": 9.4922467501275e-06, |
| "loss": 0.6796, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.7158351409978309, |
| "grad_norm": 5.09375, |
| "learning_rate": 9.435378038382364e-06, |
| "loss": 0.6585, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.737527114967462, |
| "grad_norm": 5.65625, |
| "learning_rate": 9.375680759151206e-06, |
| "loss": 0.6772, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.7592190889370932, |
| "grad_norm": 4.90625, |
| "learning_rate": 9.313192983097137e-06, |
| "loss": 0.6659, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.7809110629067245, |
| "grad_norm": 5.03125, |
| "learning_rate": 9.247954560462929e-06, |
| "loss": 0.6428, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.8026030368763557, |
| "grad_norm": 5.21875, |
| "learning_rate": 9.18000709565738e-06, |
| "loss": 0.6718, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.824295010845987, |
| "grad_norm": 4.78125, |
| "learning_rate": 9.109393920723001e-06, |
| "loss": 0.6658, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.8459869848156182, |
| "grad_norm": 5.21875, |
| "learning_rate": 9.036160067701931e-06, |
| "loss": 0.6673, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.8676789587852495, |
| "grad_norm": 5.34375, |
| "learning_rate": 8.9603522399177e-06, |
| "loss": 0.6866, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.8893709327548807, |
| "grad_norm": 4.375, |
| "learning_rate": 8.882018782191205e-06, |
| "loss": 0.6683, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.911062906724512, |
| "grad_norm": 4.8125, |
| "learning_rate": 8.801209650009813e-06, |
| "loss": 0.6814, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.9327548806941431, |
| "grad_norm": 4.71875, |
| "learning_rate": 8.717976377669344e-06, |
| "loss": 0.6564, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.9544468546637744, |
| "grad_norm": 4.84375, |
| "learning_rate": 8.632372045409142e-06, |
| "loss": 0.6489, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.9761388286334056, |
| "grad_norm": 5.375, |
| "learning_rate": 8.544451245561318e-06, |
| "loss": 0.6598, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.9978308026030369, |
| "grad_norm": 4.84375, |
| "learning_rate": 8.454270047735644e-06, |
| "loss": 0.6783, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.0195227765726682, |
| "grad_norm": 5.0, |
| "learning_rate": 8.361885963062352e-06, |
| "loss": 0.6311, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.0412147505422993, |
| "grad_norm": 4.71875, |
| "learning_rate": 8.267357907515662e-06, |
| "loss": 0.6408, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.0629067245119306, |
| "grad_norm": 4.9375, |
| "learning_rate": 8.170746164341351e-06, |
| "loss": 0.6211, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.0845986984815619, |
| "grad_norm": 5.53125, |
| "learning_rate": 8.072112345612434e-06, |
| "loss": 0.5889, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.106290672451193, |
| "grad_norm": 5.09375, |
| "learning_rate": 7.971519352937348e-06, |
| "loss": 0.5941, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.1279826464208242, |
| "grad_norm": 4.90625, |
| "learning_rate": 7.869031337345828e-06, |
| "loss": 0.6015, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.1496746203904555, |
| "grad_norm": 5.28125, |
| "learning_rate": 7.764713658377938e-06, |
| "loss": 0.621, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.1713665943600868, |
| "grad_norm": 5.15625, |
| "learning_rate": 7.658632842402432e-06, |
| "loss": 0.5997, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.1930585683297181, |
| "grad_norm": 5.21875, |
| "learning_rate": 7.550856540190985e-06, |
| "loss": 0.6111, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.2147505422993492, |
| "grad_norm": 5.5625, |
| "learning_rate": 7.441453483775354e-06, |
| "loss": 0.5997, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.2364425162689805, |
| "grad_norm": 4.71875, |
| "learning_rate": 7.330493442615001e-06, |
| "loss": 0.6216, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.2581344902386118, |
| "grad_norm": 4.53125, |
| "learning_rate": 7.218047179103112e-06, |
| "loss": 0.6137, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.2798264642082429, |
| "grad_norm": 5.65625, |
| "learning_rate": 7.104186403439391e-06, |
| "loss": 0.6086, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.3015184381778742, |
| "grad_norm": 5.0625, |
| "learning_rate": 6.988983727898414e-06, |
| "loss": 0.5941, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.3232104121475055, |
| "grad_norm": 5.4375, |
| "learning_rate": 6.872512620522707e-06, |
| "loss": 0.6141, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.3449023861171367, |
| "grad_norm": 4.75, |
| "learning_rate": 6.754847358270067e-06, |
| "loss": 0.6065, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.3665943600867678, |
| "grad_norm": 5.0625, |
| "learning_rate": 6.6360629796450295e-06, |
| "loss": 0.6192, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.3882863340563991, |
| "grad_norm": 4.9375, |
| "learning_rate": 6.516235236844661e-06, |
| "loss": 0.6157, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.4099783080260304, |
| "grad_norm": 4.65625, |
| "learning_rate": 6.395440547449214e-06, |
| "loss": 0.6003, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.4316702819956615, |
| "grad_norm": 4.875, |
| "learning_rate": 6.273755945688458e-06, |
| "loss": 0.6219, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.4533622559652928, |
| "grad_norm": 4.90625, |
| "learning_rate": 6.1512590333147335e-06, |
| "loss": 0.6114, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.475054229934924, |
| "grad_norm": 5.53125, |
| "learning_rate": 6.02802793011411e-06, |
| "loss": 0.6161, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.4967462039045554, |
| "grad_norm": 4.6875, |
| "learning_rate": 5.904141224087147e-06, |
| "loss": 0.6124, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.5184381778741867, |
| "grad_norm": 4.71875, |
| "learning_rate": 5.779677921331094e-06, |
| "loss": 0.6083, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.5401301518438177, |
| "grad_norm": 5.03125, |
| "learning_rate": 5.654717395655424e-06, |
| "loss": 0.6023, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.561822125813449, |
| "grad_norm": 5.03125, |
| "learning_rate": 5.529339337962898e-06, |
| "loss": 0.6114, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.58351409978308, |
| "grad_norm": 5.25, |
| "learning_rate": 5.403623705428391e-06, |
| "loss": 0.6036, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.6052060737527114, |
| "grad_norm": 4.625, |
| "learning_rate": 5.277650670507916e-06, |
| "loss": 0.6071, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.6268980477223427, |
| "grad_norm": 5.5625, |
| "learning_rate": 5.151500569810345e-06, |
| "loss": 0.5949, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.648590021691974, |
| "grad_norm": 4.875, |
| "learning_rate": 5.0252538528644715e-06, |
| "loss": 0.6097, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.6702819956616053, |
| "grad_norm": 5.375, |
| "learning_rate": 4.898991030814028e-06, |
| "loss": 0.6147, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.6919739696312366, |
| "grad_norm": 4.84375, |
| "learning_rate": 4.7727926250734396e-06, |
| "loss": 0.5828, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.7136659436008677, |
| "grad_norm": 4.65625, |
| "learning_rate": 4.646739115977e-06, |
| "loss": 0.5769, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.735357917570499, |
| "grad_norm": 4.8125, |
| "learning_rate": 4.520910891454272e-06, |
| "loss": 0.6084, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.75704989154013, |
| "grad_norm": 4.71875, |
| "learning_rate": 4.3953881957644014e-06, |
| "loss": 0.602, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.7787418655097613, |
| "grad_norm": 4.75, |
| "learning_rate": 4.270251078322048e-06, |
| "loss": 0.6035, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.8004338394793926, |
| "grad_norm": 5.90625, |
| "learning_rate": 4.145579342647595e-06, |
| "loss": 0.5894, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.822125813449024, |
| "grad_norm": 4.625, |
| "learning_rate": 4.021452495474159e-06, |
| "loss": 0.5918, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.8438177874186552, |
| "grad_norm": 5.0, |
| "learning_rate": 3.897949696043864e-06, |
| "loss": 0.5936, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.8655097613882863, |
| "grad_norm": 5.0, |
| "learning_rate": 3.7751497056257306e-06, |
| "loss": 0.621, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.8872017353579176, |
| "grad_norm": 5.15625, |
| "learning_rate": 3.6531308372873663e-06, |
| "loss": 0.6158, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.9088937093275486, |
| "grad_norm": 5.40625, |
| "learning_rate": 3.531970905952478e-06, |
| "loss": 0.5837, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.93058568329718, |
| "grad_norm": 5.0625, |
| "learning_rate": 3.4117471787760682e-06, |
| "loss": 0.6134, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.9522776572668112, |
| "grad_norm": 4.84375, |
| "learning_rate": 3.2925363258689556e-06, |
| "loss": 0.5865, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.9739696312364425, |
| "grad_norm": 4.8125, |
| "learning_rate": 3.174414371403061e-06, |
| "loss": 0.596, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.9956616052060738, |
| "grad_norm": 5.40625, |
| "learning_rate": 3.0574566451286094e-06, |
| "loss": 0.6074, |
| "step": 920 |
| }, |
| { |
| "epoch": 2.017353579175705, |
| "grad_norm": 4.96875, |
| "learning_rate": 2.9417377343341935e-06, |
| "loss": 0.583, |
| "step": 930 |
| }, |
| { |
| "epoch": 2.0390455531453364, |
| "grad_norm": 4.6875, |
| "learning_rate": 2.8273314362803337e-06, |
| "loss": 0.584, |
| "step": 940 |
| }, |
| { |
| "epoch": 2.0607375271149673, |
| "grad_norm": 4.90625, |
| "learning_rate": 2.7143107111368437e-06, |
| "loss": 0.6191, |
| "step": 950 |
| }, |
| { |
| "epoch": 2.0824295010845986, |
| "grad_norm": 4.875, |
| "learning_rate": 2.602747635454047e-06, |
| "loss": 0.5873, |
| "step": 960 |
| }, |
| { |
| "epoch": 2.10412147505423, |
| "grad_norm": 5.75, |
| "learning_rate": 2.492713356197497e-06, |
| "loss": 0.5804, |
| "step": 970 |
| }, |
| { |
| "epoch": 2.125813449023861, |
| "grad_norm": 4.6875, |
| "learning_rate": 2.3842780453755232e-06, |
| "loss": 0.5855, |
| "step": 980 |
| }, |
| { |
| "epoch": 2.1475054229934925, |
| "grad_norm": 4.8125, |
| "learning_rate": 2.277510855288534e-06, |
| "loss": 0.5958, |
| "step": 990 |
| }, |
| { |
| "epoch": 2.1691973969631237, |
| "grad_norm": 5.3125, |
| "learning_rate": 2.1724798744286076e-06, |
| "loss": 0.5661, |
| "step": 1000 |
| }, |
| { |
| "epoch": 2.190889370932755, |
| "grad_norm": 4.625, |
| "learning_rate": 2.0692520840575297e-06, |
| "loss": 0.5603, |
| "step": 1010 |
| }, |
| { |
| "epoch": 2.212581344902386, |
| "grad_norm": 5.125, |
| "learning_rate": 1.9678933154909096e-06, |
| "loss": 0.5669, |
| "step": 1020 |
| }, |
| { |
| "epoch": 2.234273318872017, |
| "grad_norm": 5.25, |
| "learning_rate": 1.8684682081156764e-06, |
| "loss": 0.5851, |
| "step": 1030 |
| }, |
| { |
| "epoch": 2.2559652928416485, |
| "grad_norm": 4.9375, |
| "learning_rate": 1.7710401681676802e-06, |
| "loss": 0.5444, |
| "step": 1040 |
| }, |
| { |
| "epoch": 2.27765726681128, |
| "grad_norm": 5.53125, |
| "learning_rate": 1.6756713282957427e-06, |
| "loss": 0.553, |
| "step": 1050 |
| }, |
| { |
| "epoch": 2.299349240780911, |
| "grad_norm": 4.90625, |
| "learning_rate": 1.5824225079378686e-06, |
| "loss": 0.5522, |
| "step": 1060 |
| }, |
| { |
| "epoch": 2.3210412147505424, |
| "grad_norm": 4.90625, |
| "learning_rate": 1.4913531745349612e-06, |
| "loss": 0.5668, |
| "step": 1070 |
| }, |
| { |
| "epoch": 2.3427331887201737, |
| "grad_norm": 5.78125, |
| "learning_rate": 1.4025214056067237e-06, |
| "loss": 0.6002, |
| "step": 1080 |
| }, |
| { |
| "epoch": 2.364425162689805, |
| "grad_norm": 5.40625, |
| "learning_rate": 1.3159838517139795e-06, |
| "loss": 0.5798, |
| "step": 1090 |
| }, |
| { |
| "epoch": 2.3861171366594363, |
| "grad_norm": 5.65625, |
| "learning_rate": 1.2317957003309727e-06, |
| "loss": 0.6012, |
| "step": 1100 |
| }, |
| { |
| "epoch": 2.407809110629067, |
| "grad_norm": 5.21875, |
| "learning_rate": 1.1500106406507416e-06, |
| "loss": 0.5759, |
| "step": 1110 |
| }, |
| { |
| "epoch": 2.4295010845986984, |
| "grad_norm": 4.9375, |
| "learning_rate": 1.0706808293459875e-06, |
| "loss": 0.598, |
| "step": 1120 |
| }, |
| { |
| "epoch": 2.4511930585683297, |
| "grad_norm": 4.84375, |
| "learning_rate": 9.938568573072716e-07, |
| "loss": 0.5542, |
| "step": 1130 |
| }, |
| { |
| "epoch": 2.472885032537961, |
| "grad_norm": 5.3125, |
| "learning_rate": 9.195877173797535e-07, |
| "loss": 0.5699, |
| "step": 1140 |
| }, |
| { |
| "epoch": 2.4945770065075923, |
| "grad_norm": 5.09375, |
| "learning_rate": 8.479207731190491e-07, |
| "loss": 0.5716, |
| "step": 1150 |
| }, |
| { |
| "epoch": 2.5162689804772236, |
| "grad_norm": 5.03125, |
| "learning_rate": 7.789017285861439e-07, |
| "loss": 0.5833, |
| "step": 1160 |
| }, |
| { |
| "epoch": 2.537960954446855, |
| "grad_norm": 5.34375, |
| "learning_rate": 7.125745992006044e-07, |
| "loss": 0.6223, |
| "step": 1170 |
| }, |
| { |
| "epoch": 2.5596529284164857, |
| "grad_norm": 4.65625, |
| "learning_rate": 6.489816836706786e-07, |
| "loss": 0.5936, |
| "step": 1180 |
| }, |
| { |
| "epoch": 2.581344902386117, |
| "grad_norm": 5.8125, |
| "learning_rate": 5.881635370182037e-07, |
| "loss": 0.5811, |
| "step": 1190 |
| }, |
| { |
| "epoch": 2.6030368763557483, |
| "grad_norm": 5.25, |
| "learning_rate": 5.301589447155092e-07, |
| "loss": 0.6056, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.6247288503253796, |
| "grad_norm": 5.28125, |
| "learning_rate": 4.7500489795081485e-07, |
| "loss": 0.5783, |
| "step": 1210 |
| }, |
| { |
| "epoch": 2.646420824295011, |
| "grad_norm": 5.09375, |
| "learning_rate": 4.2273657003787993e-07, |
| "loss": 0.5733, |
| "step": 1220 |
| }, |
| { |
| "epoch": 2.668112798264642, |
| "grad_norm": 4.65625, |
| "learning_rate": 3.733872939849875e-07, |
| "loss": 0.5613, |
| "step": 1230 |
| }, |
| { |
| "epoch": 2.6898047722342735, |
| "grad_norm": 4.625, |
| "learning_rate": 3.269885412375223e-07, |
| "loss": 0.5819, |
| "step": 1240 |
| }, |
| { |
| "epoch": 2.7114967462039044, |
| "grad_norm": 5.21875, |
| "learning_rate": 2.8356990160773534e-07, |
| "loss": 0.5772, |
| "step": 1250 |
| }, |
| { |
| "epoch": 2.7331887201735356, |
| "grad_norm": 5.03125, |
| "learning_rate": 2.4315906440446957e-07, |
| "loss": 0.5633, |
| "step": 1260 |
| }, |
| { |
| "epoch": 2.754880694143167, |
| "grad_norm": 5.75, |
| "learning_rate": 2.0578180077489906e-07, |
| "loss": 0.5772, |
| "step": 1270 |
| }, |
| { |
| "epoch": 2.7765726681127982, |
| "grad_norm": 5.3125, |
| "learning_rate": 1.714619472695278e-07, |
| "loss": 0.5639, |
| "step": 1280 |
| }, |
| { |
| "epoch": 2.7982646420824295, |
| "grad_norm": 5.21875, |
| "learning_rate": 1.4022139064094163e-07, |
| "loss": 0.5599, |
| "step": 1290 |
| }, |
| { |
| "epoch": 2.819956616052061, |
| "grad_norm": 5.1875, |
| "learning_rate": 1.1208005388599952e-07, |
| "loss": 0.5976, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.841648590021692, |
| "grad_norm": 5.15625, |
| "learning_rate": 8.705588354036675e-08, |
| "loss": 0.5988, |
| "step": 1310 |
| }, |
| { |
| "epoch": 2.863340563991323, |
| "grad_norm": 4.96875, |
| "learning_rate": 6.516483823349796e-08, |
| "loss": 0.5771, |
| "step": 1320 |
| }, |
| { |
| "epoch": 2.8850325379609547, |
| "grad_norm": 4.875, |
| "learning_rate": 4.642087851136123e-08, |
| "loss": 0.5546, |
| "step": 1330 |
| }, |
| { |
| "epoch": 2.9067245119305856, |
| "grad_norm": 5.71875, |
| "learning_rate": 3.083595793339778e-08, |
| "loss": 0.5834, |
| "step": 1340 |
| }, |
| { |
| "epoch": 2.928416485900217, |
| "grad_norm": 5.25, |
| "learning_rate": 1.842001544939742e-08, |
| "loss": 0.6076, |
| "step": 1350 |
| }, |
| { |
| "epoch": 2.950108459869848, |
| "grad_norm": 5.25, |
| "learning_rate": 9.180969061143852e-09, |
| "loss": 0.5899, |
| "step": 1360 |
| }, |
| { |
| "epoch": 2.9718004338394794, |
| "grad_norm": 4.75, |
| "learning_rate": 3.1247107728776815e-09, |
| "loss": 0.566, |
| "step": 1370 |
| }, |
| { |
| "epoch": 2.9934924078091107, |
| "grad_norm": 5.34375, |
| "learning_rate": 2.5510283379992507e-10, |
| "loss": 0.5987, |
| "step": 1380 |
| }, |
| { |
| "epoch": 3.0, |
| "step": 1383, |
| "total_flos": 4.094077492231373e+17, |
| "train_loss": 0.674382138752196, |
| "train_runtime": 4290.9934, |
| "train_samples_per_second": 1.289, |
| "train_steps_per_second": 0.322 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1383, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 5000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.094077492231373e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|