{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9968,
  "eval_steps": 500,
  "global_step": 156,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0128,
      "grad_norm": 3.4233698136439568,
      "learning_rate": 1.25e-07,
      "loss": 0.0246,
      "step": 1
    },
    {
      "epoch": 0.0256,
      "grad_norm": 3.728998587328461,
      "learning_rate": 2.5e-07,
      "loss": 0.0252,
      "step": 2
    },
    {
      "epoch": 0.0384,
      "grad_norm": 3.0616613124293135,
      "learning_rate": 3.75e-07,
      "loss": 0.0227,
      "step": 3
    },
    {
      "epoch": 0.0512,
      "grad_norm": 2.3123184172566016,
      "learning_rate": 5e-07,
      "loss": 0.0168,
      "step": 4
    },
    {
      "epoch": 0.064,
      "grad_norm": 3.360264502123766,
      "learning_rate": 6.249999999999999e-07,
      "loss": 0.021,
      "step": 5
    },
    {
      "epoch": 0.0768,
      "grad_norm": 3.3460855292395757,
      "learning_rate": 7.5e-07,
      "loss": 0.0229,
      "step": 6
    },
    {
      "epoch": 0.0896,
      "grad_norm": 3.1072974219219085,
      "learning_rate": 8.75e-07,
      "loss": 0.0226,
      "step": 7
    },
    {
      "epoch": 0.1024,
      "grad_norm": 4.56578073058385,
      "learning_rate": 1e-06,
      "loss": 0.0296,
      "step": 8
    },
    {
      "epoch": 0.1152,
      "grad_norm": 3.4123791670336443,
      "learning_rate": 1.125e-06,
      "loss": 0.0234,
      "step": 9
    },
    {
      "epoch": 0.128,
      "grad_norm": 2.7894681328326816,
      "learning_rate": 1.2499999999999999e-06,
      "loss": 0.0203,
      "step": 10
    },
    {
      "epoch": 0.1408,
      "grad_norm": 4.6455794479831685,
      "learning_rate": 1.375e-06,
      "loss": 0.0299,
      "step": 11
    },
    {
      "epoch": 0.1536,
      "grad_norm": 3.8109983639167577,
      "learning_rate": 1.5e-06,
      "loss": 0.025,
      "step": 12
    },
    {
      "epoch": 0.1664,
      "grad_norm": 4.183418083336812,
      "learning_rate": 1.625e-06,
      "loss": 0.0311,
      "step": 13
    },
    {
      "epoch": 0.1792,
      "grad_norm": 4.024058901580512,
      "learning_rate": 1.75e-06,
      "loss": 0.024,
      "step": 14
    },
    {
      "epoch": 0.192,
      "grad_norm": 2.9842133290060593,
      "learning_rate": 1.8749999999999998e-06,
      "loss": 0.0198,
      "step": 15
    },
    {
      "epoch": 0.2048,
      "grad_norm": 4.060055578632782,
      "learning_rate": 2e-06,
      "loss": 0.0272,
      "step": 16
    },
    {
      "epoch": 0.2176,
      "grad_norm": 4.646550393359002,
      "learning_rate": 1.9997482349425066e-06,
      "loss": 0.0216,
      "step": 17
    },
    {
      "epoch": 0.2304,
      "grad_norm": 3.9839364783795164,
      "learning_rate": 1.9989930665413145e-06,
      "loss": 0.0211,
      "step": 18
    },
    {
      "epoch": 0.2432,
      "grad_norm": 4.2706791731528435,
      "learning_rate": 1.997734875046456e-06,
      "loss": 0.0275,
      "step": 19
    },
    {
      "epoch": 0.256,
      "grad_norm": 4.46746121804618,
      "learning_rate": 1.995974293995239e-06,
      "loss": 0.0258,
      "step": 20
    },
    {
      "epoch": 0.2688,
      "grad_norm": 4.703579744776647,
      "learning_rate": 1.9937122098932426e-06,
      "loss": 0.0273,
      "step": 21
    },
    {
      "epoch": 0.2816,
      "grad_norm": 5.162187031521371,
      "learning_rate": 1.9909497617679347e-06,
      "loss": 0.0297,
      "step": 22
    },
    {
      "epoch": 0.2944,
      "grad_norm": 4.08147747350908,
      "learning_rate": 1.9876883405951377e-06,
      "loss": 0.0241,
      "step": 23
    },
    {
      "epoch": 0.3072,
      "grad_norm": 5.338597185310122,
      "learning_rate": 1.9839295885986295e-06,
      "loss": 0.0313,
      "step": 24
    },
    {
      "epoch": 0.32,
      "grad_norm": 4.004331157501513,
      "learning_rate": 1.9796753984232355e-06,
      "loss": 0.0233,
      "step": 25
    },
    {
      "epoch": 0.3328,
      "grad_norm": 5.79846158935698,
      "learning_rate": 1.9749279121818236e-06,
      "loss": 0.0328,
      "step": 26
    },
    {
      "epoch": 0.3456,
      "grad_norm": 6.927108105836598,
      "learning_rate": 1.9696895203766866e-06,
      "loss": 0.0381,
      "step": 27
    },
    {
      "epoch": 0.3584,
      "grad_norm": 5.251309102448725,
      "learning_rate": 1.9639628606958534e-06,
      "loss": 0.0282,
      "step": 28
    },
    {
      "epoch": 0.3712,
      "grad_norm": 5.050171176886125,
      "learning_rate": 1.9577508166849303e-06,
      "loss": 0.0246,
      "step": 29
    },
    {
      "epoch": 0.384,
      "grad_norm": 5.512551765259008,
      "learning_rate": 1.9510565162951534e-06,
      "loss": 0.03,
      "step": 30
    },
    {
      "epoch": 0.3968,
      "grad_norm": 5.7365196994206,
      "learning_rate": 1.9438833303083674e-06,
      "loss": 0.0314,
      "step": 31
    },
    {
      "epoch": 0.4096,
      "grad_norm": 5.220122317364698,
      "learning_rate": 1.936234870639737e-06,
      "loss": 0.0311,
      "step": 32
    },
    {
      "epoch": 0.4224,
      "grad_norm": 5.090432583516014,
      "learning_rate": 1.928114988519039e-06,
      "loss": 0.0289,
      "step": 33
    },
    {
      "epoch": 0.4352,
      "grad_norm": 5.145680195739282,
      "learning_rate": 1.9195277725514506e-06,
      "loss": 0.0272,
      "step": 34
    },
    {
      "epoch": 0.448,
      "grad_norm": 5.308627403286571,
      "learning_rate": 1.9104775466588157e-06,
      "loss": 0.0324,
      "step": 35
    },
    {
      "epoch": 0.4608,
      "grad_norm": 6.22201023848965,
      "learning_rate": 1.9009688679024189e-06,
      "loss": 0.0344,
      "step": 36
    },
    {
      "epoch": 0.4736,
      "grad_norm": 5.003597737244695,
      "learning_rate": 1.8910065241883678e-06,
      "loss": 0.0333,
      "step": 37
    },
    {
      "epoch": 0.4864,
      "grad_norm": 5.6592651331248,
      "learning_rate": 1.8805955318567379e-06,
      "loss": 0.0315,
      "step": 38
    },
    {
      "epoch": 0.4992,
      "grad_norm": 5.975923701038477,
      "learning_rate": 1.8697411331556953e-06,
      "loss": 0.0241,
      "step": 39
    },
    {
      "epoch": 0.512,
      "grad_norm": 5.750552226599778,
      "learning_rate": 1.858448793601866e-06,
      "loss": 0.0329,
      "step": 40
    },
    {
      "epoch": 0.5248,
      "grad_norm": 5.816663494605659,
      "learning_rate": 1.8467241992282841e-06,
      "loss": 0.0337,
      "step": 41
    },
    {
      "epoch": 0.5376,
      "grad_norm": 5.060423336355904,
      "learning_rate": 1.8345732537213026e-06,
      "loss": 0.0289,
      "step": 42
    },
    {
      "epoch": 0.5504,
      "grad_norm": 5.111705069882343,
      "learning_rate": 1.82200207544791e-06,
      "loss": 0.0253,
      "step": 43
    },
    {
      "epoch": 0.5632,
      "grad_norm": 4.76340608246537,
      "learning_rate": 1.8090169943749474e-06,
      "loss": 0.0242,
      "step": 44
    },
    {
      "epoch": 0.576,
      "grad_norm": 5.232632523840601,
      "learning_rate": 1.795624548881781e-06,
      "loss": 0.0332,
      "step": 45
    },
    {
      "epoch": 0.5888,
      "grad_norm": 5.063499559835732,
      "learning_rate": 1.7818314824680298e-06,
      "loss": 0.0331,
      "step": 46
    },
    {
      "epoch": 0.6016,
      "grad_norm": 5.081572984551496,
      "learning_rate": 1.767644740358011e-06,
      "loss": 0.0353,
      "step": 47
    },
    {
      "epoch": 0.6144,
      "grad_norm": 4.256457044525209,
      "learning_rate": 1.753071466003611e-06,
      "loss": 0.0275,
      "step": 48
    },
    {
      "epoch": 0.6272,
      "grad_norm": 5.0457530324965925,
      "learning_rate": 1.7381189974873407e-06,
      "loss": 0.0345,
      "step": 49
    },
    {
      "epoch": 0.64,
      "grad_norm": 4.222996253822678,
      "learning_rate": 1.7227948638273915e-06,
      "loss": 0.0258,
      "step": 50
    },
    {
      "epoch": 0.6528,
      "grad_norm": 5.105453296008258,
      "learning_rate": 1.7071067811865474e-06,
      "loss": 0.0361,
      "step": 51
    },
    {
      "epoch": 0.6656,
      "grad_norm": 4.7238042861158505,
      "learning_rate": 1.6910626489868648e-06,
      "loss": 0.03,
      "step": 52
    },
    {
      "epoch": 0.6784,
      "grad_norm": 4.467100915377624,
      "learning_rate": 1.6746705459320744e-06,
      "loss": 0.0301,
      "step": 53
    },
    {
      "epoch": 0.6912,
      "grad_norm": 4.199798840654239,
      "learning_rate": 1.6579387259397126e-06,
      "loss": 0.0272,
      "step": 54
    },
    {
      "epoch": 0.704,
      "grad_norm": 4.253879642031892,
      "learning_rate": 1.640875613985024e-06,
      "loss": 0.0263,
      "step": 55
    },
    {
      "epoch": 0.7168,
      "grad_norm": 5.478200127323976,
      "learning_rate": 1.6234898018587336e-06,
      "loss": 0.0369,
      "step": 56
    },
    {
      "epoch": 0.7296,
      "grad_norm": 5.3136786533226426,
      "learning_rate": 1.6057900438408199e-06,
      "loss": 0.0337,
      "step": 57
    },
    {
      "epoch": 0.7424,
      "grad_norm": 5.166894330495721,
      "learning_rate": 1.587785252292473e-06,
      "loss": 0.0355,
      "step": 58
    },
    {
      "epoch": 0.7552,
      "grad_norm": 4.375886424848137,
      "learning_rate": 1.569484493168452e-06,
      "loss": 0.0281,
      "step": 59
    },
    {
      "epoch": 0.768,
      "grad_norm": 6.886035234115627,
      "learning_rate": 1.5508969814521024e-06,
      "loss": 0.0388,
      "step": 60
    },
    {
      "epoch": 0.7808,
      "grad_norm": 5.791644877375871,
      "learning_rate": 1.5320320765153365e-06,
      "loss": 0.0373,
      "step": 61
    },
    {
      "epoch": 0.7936,
      "grad_norm": 5.096680042384012,
      "learning_rate": 1.5128992774059062e-06,
      "loss": 0.0344,
      "step": 62
    },
    {
      "epoch": 0.8064,
      "grad_norm": 5.046290484669824,
      "learning_rate": 1.4935082180643467e-06,
      "loss": 0.0411,
      "step": 63
    },
    {
      "epoch": 0.8192,
      "grad_norm": 5.210730266900293,
      "learning_rate": 1.4738686624729987e-06,
      "loss": 0.0353,
      "step": 64
    },
    {
      "epoch": 0.832,
      "grad_norm": 4.780860746415674,
      "learning_rate": 1.4539904997395467e-06,
      "loss": 0.0285,
      "step": 65
    },
    {
      "epoch": 0.8448,
      "grad_norm": 4.9827723432345765,
      "learning_rate": 1.433883739117558e-06,
      "loss": 0.0355,
      "step": 66
    },
    {
      "epoch": 0.8576,
      "grad_norm": 4.566523361618775,
      "learning_rate": 1.4135585049665206e-06,
      "loss": 0.0229,
      "step": 67
    },
    {
      "epoch": 0.8704,
      "grad_norm": 3.9442441573671534,
      "learning_rate": 1.3930250316539235e-06,
      "loss": 0.0251,
      "step": 68
    },
    {
      "epoch": 0.8832,
      "grad_norm": 4.996881415714366,
      "learning_rate": 1.3722936584019451e-06,
      "loss": 0.0361,
      "step": 69
    },
    {
      "epoch": 0.896,
      "grad_norm": 5.769732263820899,
      "learning_rate": 1.3513748240813427e-06,
      "loss": 0.0366,
      "step": 70
    },
    {
      "epoch": 0.9088,
      "grad_norm": 4.573503361877803,
      "learning_rate": 1.3302790619551672e-06,
      "loss": 0.0272,
      "step": 71
    },
    {
      "epoch": 0.9216,
      "grad_norm": 3.296575123346509,
      "learning_rate": 1.3090169943749473e-06,
      "loss": 0.023,
      "step": 72
    },
    {
      "epoch": 0.9344,
      "grad_norm": 4.908588421936803,
      "learning_rate": 1.2875993274320173e-06,
      "loss": 0.0278,
      "step": 73
    },
    {
      "epoch": 0.9472,
      "grad_norm": 3.846287378603072,
      "learning_rate": 1.266036845566675e-06,
      "loss": 0.0255,
      "step": 74
    },
    {
      "epoch": 0.96,
      "grad_norm": 4.087209232767521,
      "learning_rate": 1.244340406137894e-06,
      "loss": 0.0295,
      "step": 75
    },
    {
      "epoch": 0.9728,
      "grad_norm": 4.64654246748357,
      "learning_rate": 1.2225209339563143e-06,
      "loss": 0.0278,
      "step": 76
    },
    {
      "epoch": 0.9856,
      "grad_norm": 4.507522376329423,
      "learning_rate": 1.2005894157832728e-06,
      "loss": 0.0319,
      "step": 77
    },
    {
      "epoch": 0.9984,
      "grad_norm": 5.762678744609664,
      "learning_rate": 1.1785568947986366e-06,
      "loss": 0.0352,
      "step": 78
    },
    {
      "epoch": 1.0112,
      "grad_norm": 3.1644394788507513,
      "learning_rate": 1.156434465040231e-06,
      "loss": 0.0161,
      "step": 79
    },
    {
      "epoch": 1.024,
      "grad_norm": 3.7447721631178,
      "learning_rate": 1.1342332658176555e-06,
      "loss": 0.0191,
      "step": 80
    },
    {
      "epoch": 1.0368,
      "grad_norm": 3.2322255457531637,
      "learning_rate": 1.1119644761033077e-06,
      "loss": 0.0124,
      "step": 81
    },
    {
      "epoch": 1.0496,
      "grad_norm": 2.4196830627748285,
      "learning_rate": 1.0896393089034335e-06,
      "loss": 0.0105,
      "step": 82
    },
    {
      "epoch": 1.0624,
      "grad_norm": 2.5920868298208872,
      "learning_rate": 1.0672690056120398e-06,
      "loss": 0.0144,
      "step": 83
    },
    {
      "epoch": 1.0752,
      "grad_norm": 2.5821101862637175,
      "learning_rate": 1.044864830350515e-06,
      "loss": 0.0139,
      "step": 84
    },
    {
      "epoch": 1.088,
      "grad_norm": 3.02231827873139,
      "learning_rate": 1.022438064295805e-06,
      "loss": 0.0135,
      "step": 85
    },
    {
      "epoch": 1.1008,
      "grad_norm": 2.2923086363844845,
      "learning_rate": 1e-06,
      "loss": 0.0117,
      "step": 86
    },
    {
      "epoch": 1.1136,
      "grad_norm": 2.8842975225041623,
      "learning_rate": 9.77561935704195e-07,
      "loss": 0.0115,
      "step": 87
    },
    {
      "epoch": 1.1264,
      "grad_norm": 5.879985429015499,
      "learning_rate": 9.551351696494853e-07,
      "loss": 0.0119,
      "step": 88
    },
    {
      "epoch": 1.1392,
      "grad_norm": 2.360056985536234,
      "learning_rate": 9.327309943879603e-07,
      "loss": 0.0096,
      "step": 89
    },
    {
      "epoch": 1.152,
      "grad_norm": 1.9869435676758012,
      "learning_rate": 9.103606910965665e-07,
      "loss": 0.0079,
      "step": 90
    },
    {
      "epoch": 1.1648,
      "grad_norm": 2.5601097067547856,
      "learning_rate": 8.880355238966921e-07,
      "loss": 0.0104,
      "step": 91
    },
    {
      "epoch": 1.1776,
      "grad_norm": 2.7730596832345564,
      "learning_rate": 8.657667341823448e-07,
      "loss": 0.0118,
      "step": 92
    },
    {
      "epoch": 1.1904,
      "grad_norm": 2.18163461710527,
      "learning_rate": 8.435655349597689e-07,
      "loss": 0.0105,
      "step": 93
    },
    {
      "epoch": 1.2032,
      "grad_norm": 2.251464092159168,
      "learning_rate": 8.214431052013634e-07,
      "loss": 0.0148,
      "step": 94
    },
    {
      "epoch": 1.216,
      "grad_norm": 2.7530295169182333,
      "learning_rate": 7.994105842167272e-07,
      "loss": 0.01,
      "step": 95
    },
    {
      "epoch": 1.2288000000000001,
      "grad_norm": 2.526225960487079,
      "learning_rate": 7.774790660436857e-07,
      "loss": 0.0089,
      "step": 96
    },
    {
      "epoch": 1.2416,
      "grad_norm": 2.9558648711495414,
      "learning_rate": 7.556595938621058e-07,
      "loss": 0.0121,
      "step": 97
    },
    {
      "epoch": 1.2544,
      "grad_norm": 4.047664680116945,
      "learning_rate": 7.33963154433325e-07,
      "loss": 0.0122,
      "step": 98
    },
    {
      "epoch": 1.2671999999999999,
      "grad_norm": 4.128501309390267,
      "learning_rate": 7.124006725679828e-07,
      "loss": 0.0132,
      "step": 99
    },
    {
      "epoch": 1.28,
      "grad_norm": 3.158070174858742,
      "learning_rate": 6.909830056250526e-07,
      "loss": 0.0105,
      "step": 100
    },
    {
      "epoch": 1.2928,
      "grad_norm": 2.330265912872272,
      "learning_rate": 6.697209380448332e-07,
      "loss": 0.0101,
      "step": 101
    },
    {
      "epoch": 1.3056,
      "grad_norm": 4.9757360072405445,
      "learning_rate": 6.486251759186572e-07,
      "loss": 0.0179,
      "step": 102
    },
    {
      "epoch": 1.3184,
      "grad_norm": 3.8743181605977743,
      "learning_rate": 6.277063415980548e-07,
      "loss": 0.0129,
      "step": 103
    },
    {
      "epoch": 1.3312,
      "grad_norm": 3.263723675313523,
      "learning_rate": 6.069749683460764e-07,
      "loss": 0.0111,
      "step": 104
    },
    {
      "epoch": 1.3439999999999999,
      "grad_norm": 3.1218175870205584,
      "learning_rate": 5.864414950334795e-07,
      "loss": 0.0119,
      "step": 105
    },
    {
      "epoch": 1.3568,
      "grad_norm": 3.717962785205817,
      "learning_rate": 5.661162608824419e-07,
      "loss": 0.0115,
      "step": 106
    },
    {
      "epoch": 1.3696,
      "grad_norm": 3.650556269715187,
      "learning_rate": 5.460095002604532e-07,
      "loss": 0.0123,
      "step": 107
    },
    {
      "epoch": 1.3824,
      "grad_norm": 3.2197493950580296,
      "learning_rate": 5.261313375270013e-07,
      "loss": 0.0137,
      "step": 108
    },
    {
      "epoch": 1.3952,
      "grad_norm": 3.365111064634147,
      "learning_rate": 5.064917819356531e-07,
      "loss": 0.0111,
      "step": 109
    },
    {
      "epoch": 1.408,
      "grad_norm": 4.710390460257863,
      "learning_rate": 4.871007225940939e-07,
      "loss": 0.0129,
      "step": 110
    },
    {
      "epoch": 1.4208,
      "grad_norm": 2.76927802183368,
      "learning_rate": 4.6796792348466353e-07,
      "loss": 0.013,
      "step": 111
    },
    {
      "epoch": 1.4336,
      "grad_norm": 3.2171761582689915,
      "learning_rate": 4.4910301854789755e-07,
      "loss": 0.0114,
      "step": 112
    },
    {
      "epoch": 1.4464000000000001,
      "grad_norm": 2.7947744875678096,
      "learning_rate": 4.3051550683154804e-07,
      "loss": 0.0113,
      "step": 113
    },
    {
      "epoch": 1.4592,
      "grad_norm": 2.4585140708787043,
      "learning_rate": 4.1221474770752696e-07,
      "loss": 0.0103,
      "step": 114
    },
    {
      "epoch": 1.472,
      "grad_norm": 2.8113536653109965,
      "learning_rate": 3.942099561591802e-07,
      "loss": 0.0106,
      "step": 115
    },
    {
      "epoch": 1.4848,
      "grad_norm": 3.6398945452240055,
      "learning_rate": 3.765101981412665e-07,
      "loss": 0.0127,
      "step": 116
    },
    {
      "epoch": 1.4976,
      "grad_norm": 2.9485443643029607,
      "learning_rate": 3.5912438601497584e-07,
      "loss": 0.009,
      "step": 117
    },
    {
      "epoch": 1.5104,
      "grad_norm": 2.9984190637681096,
      "learning_rate": 3.420612740602874e-07,
      "loss": 0.0093,
      "step": 118
    },
    {
      "epoch": 1.5232,
      "grad_norm": 2.8046736646132744,
      "learning_rate": 3.253294540679257e-07,
      "loss": 0.0094,
      "step": 119
    },
    {
      "epoch": 1.536,
      "grad_norm": 2.9182942187963605,
      "learning_rate": 3.0893735101313535e-07,
      "loss": 0.0101,
      "step": 120
    },
    {
      "epoch": 1.5488,
      "grad_norm": 4.061738080588852,
      "learning_rate": 2.9289321881345254e-07,
      "loss": 0.0148,
      "step": 121
    },
    {
      "epoch": 1.5615999999999999,
      "grad_norm": 2.6181262527250064,
      "learning_rate": 2.7720513617260855e-07,
      "loss": 0.0108,
      "step": 122
    },
    {
      "epoch": 1.5744,
      "grad_norm": 2.814162128761838,
      "learning_rate": 2.6188100251265943e-07,
      "loss": 0.0085,
      "step": 123
    },
    {
      "epoch": 1.5872000000000002,
      "grad_norm": 2.978469400791401,
      "learning_rate": 2.4692853399638913e-07,
      "loss": 0.0123,
      "step": 124
    },
    {
      "epoch": 1.6,
      "grad_norm": 1.9752826330171183,
      "learning_rate": 2.3235525964198888e-07,
      "loss": 0.0091,
      "step": 125
    },
    {
      "epoch": 1.6128,
      "grad_norm": 2.6883643677418623,
      "learning_rate": 2.181685175319702e-07,
      "loss": 0.0097,
      "step": 126
    },
    {
      "epoch": 1.6256,
      "grad_norm": 2.9489506229688884,
      "learning_rate": 2.043754511182191e-07,
      "loss": 0.0079,
      "step": 127
    },
    {
      "epoch": 1.6383999999999999,
      "grad_norm": 1.9593044592383062,
      "learning_rate": 1.9098300562505264e-07,
      "loss": 0.0081,
      "step": 128
    },
    {
      "epoch": 1.6512,
      "grad_norm": 1.952957689869861,
      "learning_rate": 1.7799792455209016e-07,
      "loss": 0.0082,
      "step": 129
    },
    {
      "epoch": 1.6640000000000001,
      "grad_norm": 2.38161666441156,
      "learning_rate": 1.6542674627869734e-07,
      "loss": 0.0094,
      "step": 130
    },
    {
      "epoch": 1.6768,
      "grad_norm": 2.164377402243927,
      "learning_rate": 1.5327580077171588e-07,
      "loss": 0.0097,
      "step": 131
    },
    {
      "epoch": 1.6896,
      "grad_norm": 3.518297727779028,
      "learning_rate": 1.415512063981339e-07,
      "loss": 0.0089,
      "step": 132
    },
    {
      "epoch": 1.7024,
      "grad_norm": 3.697074996731219,
      "learning_rate": 1.3025886684430465e-07,
      "loss": 0.0078,
      "step": 133
    },
    {
      "epoch": 1.7151999999999998,
      "grad_norm": 4.314207988612093,
      "learning_rate": 1.19404468143262e-07,
      "loss": 0.0119,
      "step": 134
    },
    {
      "epoch": 1.728,
      "grad_norm": 2.9026476752647414,
      "learning_rate": 1.089934758116322e-07,
      "loss": 0.0138,
      "step": 135
    },
    {
      "epoch": 1.7408000000000001,
      "grad_norm": 2.502384977722473,
      "learning_rate": 9.903113209758096e-08,
      "loss": 0.0112,
      "step": 136
    },
    {
      "epoch": 1.7536,
      "grad_norm": 1.7807221172577514,
      "learning_rate": 8.952245334118413e-08,
      "loss": 0.0077,
      "step": 137
    },
    {
      "epoch": 1.7664,
      "grad_norm": 2.727436513377534,
      "learning_rate": 8.047222744854942e-08,
      "loss": 0.0096,
      "step": 138
    },
    {
      "epoch": 1.7792,
      "grad_norm": 1.6157555666433816,
      "learning_rate": 7.188501148096116e-08,
      "loss": 0.007,
      "step": 139
    },
    {
      "epoch": 1.792,
      "grad_norm": 1.5949515973033697,
      "learning_rate": 6.376512936026279e-08,
      "loss": 0.0062,
      "step": 140
    },
    {
      "epoch": 1.8048,
      "grad_norm": 2.6399637194756718,
      "learning_rate": 5.611666969163242e-08,
      "loss": 0.0095,
      "step": 141
    },
    {
      "epoch": 1.8176,
      "grad_norm": 3.0621545226623907,
      "learning_rate": 4.8943483704846465e-08,
      "loss": 0.0132,
      "step": 142
    },
    {
      "epoch": 1.8304,
      "grad_norm": 1.9600404784878551,
      "learning_rate": 4.224918331506955e-08,
      "loss": 0.0095,
      "step": 143
    },
    {
      "epoch": 1.8432,
      "grad_norm": 1.963746557604649,
      "learning_rate": 3.6037139304146756e-08,
      "loss": 0.0099,
      "step": 144
    },
    {
      "epoch": 1.8559999999999999,
      "grad_norm": 4.527295989762448,
      "learning_rate": 3.0310479623313125e-08,
      "loss": 0.0144,
      "step": 145
    },
    {
      "epoch": 1.8688,
      "grad_norm": 2.6834067438548166,
      "learning_rate": 2.507208781817638e-08,
      "loss": 0.012,
      "step": 146
    },
    {
      "epoch": 1.8816000000000002,
      "grad_norm": 1.8417342752412211,
      "learning_rate": 2.032460157676452e-08,
      "loss": 0.0077,
      "step": 147
    },
    {
      "epoch": 1.8944,
      "grad_norm": 2.99373797619176,
      "learning_rate": 1.607041140137033e-08,
      "loss": 0.0115,
      "step": 148
    },
    {
      "epoch": 1.9072,
      "grad_norm": 1.1214911188004222,
      "learning_rate": 1.231165940486234e-08,
      "loss": 0.0062,
      "step": 149
    },
    {
      "epoch": 1.92,
      "grad_norm": 2.116183152950272,
      "learning_rate": 9.050238232065299e-09,
      "loss": 0.0097,
      "step": 150
    },
    {
      "epoch": 1.9327999999999999,
      "grad_norm": 1.8359832453089462,
      "learning_rate": 6.2877901067573955e-09,
      "loss": 0.0069,
      "step": 151
    },
    {
      "epoch": 1.9456,
      "grad_norm": 2.0445129237685773,
      "learning_rate": 4.025706004760931e-09,
      "loss": 0.0086,
      "step": 152
    },
    {
      "epoch": 1.9584000000000001,
      "grad_norm": 1.4095794512624829,
      "learning_rate": 2.2651249535439177e-09,
      "loss": 0.0053,
      "step": 153
    },
    {
      "epoch": 1.9712,
      "grad_norm": 1.4980650303315703,
      "learning_rate": 1.0069334586854105e-09,
      "loss": 0.0068,
      "step": 154
    },
    {
      "epoch": 1.984,
      "grad_norm": 1.6817715164958336,
      "learning_rate": 2.517650574934693e-10,
      "loss": 0.0087,
      "step": 155
    },
    {
      "epoch": 1.9968,
      "grad_norm": 5.327422033689792,
      "learning_rate": 0.0,
      "loss": 0.0163,
      "step": 156
    },
    {
      "epoch": 1.9968,
      "step": 156,
      "total_flos": 138561371045888.0,
      "train_loss": 0.019949143110678937,
      "train_runtime": 6111.638,
      "train_samples_per_second": 6.545,
      "train_steps_per_second": 0.026
    }
  ],
  "logging_steps": 1,
  "max_steps": 156,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 138561371045888.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}