mangopy
/

Qwen7_warmup500_em3

Safetensors

qwen2

Model card Files Files and versions

xet

Community

mangopy commited on Apr 25, 2025

Commit

5b871e0

verified ·

1 Parent(s): df9fce3

Upload trainer_state.json with huggingface_hub

Browse files

Files changed (1) hide show

trainer_state.json +1134 -0

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1134 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.9968,
+  "eval_steps": 500,
+  "global_step": 156,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0128,
+      "grad_norm": 3.4233698136439568,
+      "learning_rate": 1.25e-07,
+      "loss": 0.0246,
+      "step": 1
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 3.728998587328461,
+      "learning_rate": 2.5e-07,
+      "loss": 0.0252,
+      "step": 2
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 3.0616613124293135,
+      "learning_rate": 3.75e-07,
+      "loss": 0.0227,
+      "step": 3
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 2.3123184172566016,
+      "learning_rate": 5e-07,
+      "loss": 0.0168,
+      "step": 4
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 3.360264502123766,
+      "learning_rate": 6.249999999999999e-07,
+      "loss": 0.021,
+      "step": 5
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 3.3460855292395757,
+      "learning_rate": 7.5e-07,
+      "loss": 0.0229,
+      "step": 6
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 3.1072974219219085,
+      "learning_rate": 8.75e-07,
+      "loss": 0.0226,
+      "step": 7
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 4.56578073058385,
+      "learning_rate": 1e-06,
+      "loss": 0.0296,
+      "step": 8
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 3.4123791670336443,
+      "learning_rate": 1.125e-06,
+      "loss": 0.0234,
+      "step": 9
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 2.7894681328326816,
+      "learning_rate": 1.2499999999999999e-06,
+      "loss": 0.0203,
+      "step": 10
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 4.6455794479831685,
+      "learning_rate": 1.375e-06,
+      "loss": 0.0299,
+      "step": 11
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 3.8109983639167577,
+      "learning_rate": 1.5e-06,
+      "loss": 0.025,
+      "step": 12
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 4.183418083336812,
+      "learning_rate": 1.625e-06,
+      "loss": 0.0311,
+      "step": 13
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 4.024058901580512,
+      "learning_rate": 1.75e-06,
+      "loss": 0.024,
+      "step": 14
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 2.9842133290060593,
+      "learning_rate": 1.8749999999999998e-06,
+      "loss": 0.0198,
+      "step": 15
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 4.060055578632782,
+      "learning_rate": 2e-06,
+      "loss": 0.0272,
+      "step": 16
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 4.646550393359002,
+      "learning_rate": 1.9997482349425066e-06,
+      "loss": 0.0216,
+      "step": 17
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 3.9839364783795164,
+      "learning_rate": 1.9989930665413145e-06,
+      "loss": 0.0211,
+      "step": 18
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 4.2706791731528435,
+      "learning_rate": 1.997734875046456e-06,
+      "loss": 0.0275,
+      "step": 19
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 4.46746121804618,
+      "learning_rate": 1.995974293995239e-06,
+      "loss": 0.0258,
+      "step": 20
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 4.703579744776647,
+      "learning_rate": 1.9937122098932426e-06,
+      "loss": 0.0273,
+      "step": 21
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 5.162187031521371,
+      "learning_rate": 1.9909497617679347e-06,
+      "loss": 0.0297,
+      "step": 22
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 4.08147747350908,
+      "learning_rate": 1.9876883405951377e-06,
+      "loss": 0.0241,
+      "step": 23
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 5.338597185310122,
+      "learning_rate": 1.9839295885986295e-06,
+      "loss": 0.0313,
+      "step": 24
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 4.004331157501513,
+      "learning_rate": 1.9796753984232355e-06,
+      "loss": 0.0233,
+      "step": 25
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 5.79846158935698,
+      "learning_rate": 1.9749279121818236e-06,
+      "loss": 0.0328,
+      "step": 26
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 6.927108105836598,
+      "learning_rate": 1.9696895203766866e-06,
+      "loss": 0.0381,
+      "step": 27
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 5.251309102448725,
+      "learning_rate": 1.9639628606958534e-06,
+      "loss": 0.0282,
+      "step": 28
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 5.050171176886125,
+      "learning_rate": 1.9577508166849303e-06,
+      "loss": 0.0246,
+      "step": 29
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 5.512551765259008,
+      "learning_rate": 1.9510565162951534e-06,
+      "loss": 0.03,
+      "step": 30
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 5.7365196994206,
+      "learning_rate": 1.9438833303083674e-06,
+      "loss": 0.0314,
+      "step": 31
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 5.220122317364698,
+      "learning_rate": 1.936234870639737e-06,
+      "loss": 0.0311,
+      "step": 32
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 5.090432583516014,
+      "learning_rate": 1.928114988519039e-06,
+      "loss": 0.0289,
+      "step": 33
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 5.145680195739282,
+      "learning_rate": 1.9195277725514506e-06,
+      "loss": 0.0272,
+      "step": 34
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 5.308627403286571,
+      "learning_rate": 1.9104775466588157e-06,
+      "loss": 0.0324,
+      "step": 35
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 6.22201023848965,
+      "learning_rate": 1.9009688679024189e-06,
+      "loss": 0.0344,
+      "step": 36
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 5.003597737244695,
+      "learning_rate": 1.8910065241883678e-06,
+      "loss": 0.0333,
+      "step": 37
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 5.6592651331248,
+      "learning_rate": 1.8805955318567379e-06,
+      "loss": 0.0315,
+      "step": 38
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 5.975923701038477,
+      "learning_rate": 1.8697411331556953e-06,
+      "loss": 0.0241,
+      "step": 39
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 5.750552226599778,
+      "learning_rate": 1.858448793601866e-06,
+      "loss": 0.0329,
+      "step": 40
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 5.816663494605659,
+      "learning_rate": 1.8467241992282841e-06,
+      "loss": 0.0337,
+      "step": 41
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 5.060423336355904,
+      "learning_rate": 1.8345732537213026e-06,
+      "loss": 0.0289,
+      "step": 42
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 5.111705069882343,
+      "learning_rate": 1.82200207544791e-06,
+      "loss": 0.0253,
+      "step": 43
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 4.76340608246537,
+      "learning_rate": 1.8090169943749474e-06,
+      "loss": 0.0242,
+      "step": 44
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 5.232632523840601,
+      "learning_rate": 1.795624548881781e-06,
+      "loss": 0.0332,
+      "step": 45
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 5.063499559835732,
+      "learning_rate": 1.7818314824680298e-06,
+      "loss": 0.0331,
+      "step": 46
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 5.081572984551496,
+      "learning_rate": 1.767644740358011e-06,
+      "loss": 0.0353,
+      "step": 47
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 4.256457044525209,
+      "learning_rate": 1.753071466003611e-06,
+      "loss": 0.0275,
+      "step": 48
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 5.0457530324965925,
+      "learning_rate": 1.7381189974873407e-06,
+      "loss": 0.0345,
+      "step": 49
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 4.222996253822678,
+      "learning_rate": 1.7227948638273915e-06,
+      "loss": 0.0258,
+      "step": 50
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 5.105453296008258,
+      "learning_rate": 1.7071067811865474e-06,
+      "loss": 0.0361,
+      "step": 51
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 4.7238042861158505,
+      "learning_rate": 1.6910626489868648e-06,
+      "loss": 0.03,
+      "step": 52
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 4.467100915377624,
+      "learning_rate": 1.6746705459320744e-06,
+      "loss": 0.0301,
+      "step": 53
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 4.199798840654239,
+      "learning_rate": 1.6579387259397126e-06,
+      "loss": 0.0272,
+      "step": 54
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 4.253879642031892,
+      "learning_rate": 1.640875613985024e-06,
+      "loss": 0.0263,
+      "step": 55
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 5.478200127323976,
+      "learning_rate": 1.6234898018587336e-06,
+      "loss": 0.0369,
+      "step": 56
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 5.3136786533226426,
+      "learning_rate": 1.6057900438408199e-06,
+      "loss": 0.0337,
+      "step": 57
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 5.166894330495721,
+      "learning_rate": 1.587785252292473e-06,
+      "loss": 0.0355,
+      "step": 58
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 4.375886424848137,
+      "learning_rate": 1.569484493168452e-06,
+      "loss": 0.0281,
+      "step": 59
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 6.886035234115627,
+      "learning_rate": 1.5508969814521024e-06,
+      "loss": 0.0388,
+      "step": 60
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 5.791644877375871,
+      "learning_rate": 1.5320320765153365e-06,
+      "loss": 0.0373,
+      "step": 61
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 5.096680042384012,
+      "learning_rate": 1.5128992774059062e-06,
+      "loss": 0.0344,
+      "step": 62
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 5.046290484669824,
+      "learning_rate": 1.4935082180643467e-06,
+      "loss": 0.0411,
+      "step": 63
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 5.210730266900293,
+      "learning_rate": 1.4738686624729987e-06,
+      "loss": 0.0353,
+      "step": 64
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 4.780860746415674,
+      "learning_rate": 1.4539904997395467e-06,
+      "loss": 0.0285,
+      "step": 65
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 4.9827723432345765,
+      "learning_rate": 1.433883739117558e-06,
+      "loss": 0.0355,
+      "step": 66
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 4.566523361618775,
+      "learning_rate": 1.4135585049665206e-06,
+      "loss": 0.0229,
+      "step": 67
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 3.9442441573671534,
+      "learning_rate": 1.3930250316539235e-06,
+      "loss": 0.0251,
+      "step": 68
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 4.996881415714366,
+      "learning_rate": 1.3722936584019451e-06,
+      "loss": 0.0361,
+      "step": 69
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 5.769732263820899,
+      "learning_rate": 1.3513748240813427e-06,
+      "loss": 0.0366,
+      "step": 70
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 4.573503361877803,
+      "learning_rate": 1.3302790619551672e-06,
+      "loss": 0.0272,
+      "step": 71
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 3.296575123346509,
+      "learning_rate": 1.3090169943749473e-06,
+      "loss": 0.023,
+      "step": 72
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 4.908588421936803,
+      "learning_rate": 1.2875993274320173e-06,
+      "loss": 0.0278,
+      "step": 73
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 3.846287378603072,
+      "learning_rate": 1.266036845566675e-06,
+      "loss": 0.0255,
+      "step": 74
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 4.087209232767521,
+      "learning_rate": 1.244340406137894e-06,
+      "loss": 0.0295,
+      "step": 75
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 4.64654246748357,
+      "learning_rate": 1.2225209339563143e-06,
+      "loss": 0.0278,
+      "step": 76
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 4.507522376329423,
+      "learning_rate": 1.2005894157832728e-06,
+      "loss": 0.0319,
+      "step": 77
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 5.762678744609664,
+      "learning_rate": 1.1785568947986366e-06,
+      "loss": 0.0352,
+      "step": 78
+    },
+    {
+      "epoch": 1.0112,
+      "grad_norm": 3.1644394788507513,
+      "learning_rate": 1.156434465040231e-06,
+      "loss": 0.0161,
+      "step": 79
+    },
+    {
+      "epoch": 1.024,
+      "grad_norm": 3.7447721631178,
+      "learning_rate": 1.1342332658176555e-06,
+      "loss": 0.0191,
+      "step": 80
+    },
+    {
+      "epoch": 1.0368,
+      "grad_norm": 3.2322255457531637,
+      "learning_rate": 1.1119644761033077e-06,
+      "loss": 0.0124,
+      "step": 81
+    },
+    {
+      "epoch": 1.0496,
+      "grad_norm": 2.4196830627748285,
+      "learning_rate": 1.0896393089034335e-06,
+      "loss": 0.0105,
+      "step": 82
+    },
+    {
+      "epoch": 1.0624,
+      "grad_norm": 2.5920868298208872,
+      "learning_rate": 1.0672690056120398e-06,
+      "loss": 0.0144,
+      "step": 83
+    },
+    {
+      "epoch": 1.0752,
+      "grad_norm": 2.5821101862637175,
+      "learning_rate": 1.044864830350515e-06,
+      "loss": 0.0139,
+      "step": 84
+    },
+    {
+      "epoch": 1.088,
+      "grad_norm": 3.02231827873139,
+      "learning_rate": 1.022438064295805e-06,
+      "loss": 0.0135,
+      "step": 85
+    },
+    {
+      "epoch": 1.1008,
+      "grad_norm": 2.2923086363844845,
+      "learning_rate": 1e-06,
+      "loss": 0.0117,
+      "step": 86
+    },
+    {
+      "epoch": 1.1136,
+      "grad_norm": 2.8842975225041623,
+      "learning_rate": 9.77561935704195e-07,
+      "loss": 0.0115,
+      "step": 87
+    },
+    {
+      "epoch": 1.1264,
+      "grad_norm": 5.879985429015499,
+      "learning_rate": 9.551351696494853e-07,
+      "loss": 0.0119,
+      "step": 88
+    },
+    {
+      "epoch": 1.1392,
+      "grad_norm": 2.360056985536234,
+      "learning_rate": 9.327309943879603e-07,
+      "loss": 0.0096,
+      "step": 89
+    },
+    {
+      "epoch": 1.152,
+      "grad_norm": 1.9869435676758012,
+      "learning_rate": 9.103606910965665e-07,
+      "loss": 0.0079,
+      "step": 90
+    },
+    {
+      "epoch": 1.1648,
+      "grad_norm": 2.5601097067547856,
+      "learning_rate": 8.880355238966921e-07,
+      "loss": 0.0104,
+      "step": 91
+    },
+    {
+      "epoch": 1.1776,
+      "grad_norm": 2.7730596832345564,
+      "learning_rate": 8.657667341823448e-07,
+      "loss": 0.0118,
+      "step": 92
+    },
+    {
+      "epoch": 1.1904,
+      "grad_norm": 2.18163461710527,
+      "learning_rate": 8.435655349597689e-07,
+      "loss": 0.0105,
+      "step": 93
+    },
+    {
+      "epoch": 1.2032,
+      "grad_norm": 2.251464092159168,
+      "learning_rate": 8.214431052013634e-07,
+      "loss": 0.0148,
+      "step": 94
+    },
+    {
+      "epoch": 1.216,
+      "grad_norm": 2.7530295169182333,
+      "learning_rate": 7.994105842167272e-07,
+      "loss": 0.01,
+      "step": 95
+    },
+    {
+      "epoch": 1.2288000000000001,
+      "grad_norm": 2.526225960487079,
+      "learning_rate": 7.774790660436857e-07,
+      "loss": 0.0089,
+      "step": 96
+    },
+    {
+      "epoch": 1.2416,
+      "grad_norm": 2.9558648711495414,
+      "learning_rate": 7.556595938621058e-07,
+      "loss": 0.0121,
+      "step": 97
+    },
+    {
+      "epoch": 1.2544,
+      "grad_norm": 4.047664680116945,
+      "learning_rate": 7.33963154433325e-07,
+      "loss": 0.0122,
+      "step": 98
+    },
+    {
+      "epoch": 1.2671999999999999,
+      "grad_norm": 4.128501309390267,
+      "learning_rate": 7.124006725679828e-07,
+      "loss": 0.0132,
+      "step": 99
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 3.158070174858742,
+      "learning_rate": 6.909830056250526e-07,
+      "loss": 0.0105,
+      "step": 100
+    },
+    {
+      "epoch": 1.2928,
+      "grad_norm": 2.330265912872272,
+      "learning_rate": 6.697209380448332e-07,
+      "loss": 0.0101,
+      "step": 101
+    },
+    {
+      "epoch": 1.3056,
+      "grad_norm": 4.9757360072405445,
+      "learning_rate": 6.486251759186572e-07,
+      "loss": 0.0179,
+      "step": 102
+    },
+    {
+      "epoch": 1.3184,
+      "grad_norm": 3.8743181605977743,
+      "learning_rate": 6.277063415980548e-07,
+      "loss": 0.0129,
+      "step": 103
+    },
+    {
+      "epoch": 1.3312,
+      "grad_norm": 3.263723675313523,
+      "learning_rate": 6.069749683460764e-07,
+      "loss": 0.0111,
+      "step": 104
+    },
+    {
+      "epoch": 1.3439999999999999,
+      "grad_norm": 3.1218175870205584,
+      "learning_rate": 5.864414950334795e-07,
+      "loss": 0.0119,
+      "step": 105
+    },
+    {
+      "epoch": 1.3568,
+      "grad_norm": 3.717962785205817,
+      "learning_rate": 5.661162608824419e-07,
+      "loss": 0.0115,
+      "step": 106
+    },
+    {
+      "epoch": 1.3696,
+      "grad_norm": 3.650556269715187,
+      "learning_rate": 5.460095002604532e-07,
+      "loss": 0.0123,
+      "step": 107
+    },
+    {
+      "epoch": 1.3824,
+      "grad_norm": 3.2197493950580296,
+      "learning_rate": 5.261313375270013e-07,
+      "loss": 0.0137,
+      "step": 108
+    },
+    {
+      "epoch": 1.3952,
+      "grad_norm": 3.365111064634147,
+      "learning_rate": 5.064917819356531e-07,
+      "loss": 0.0111,
+      "step": 109
+    },
+    {
+      "epoch": 1.408,
+      "grad_norm": 4.710390460257863,
+      "learning_rate": 4.871007225940939e-07,
+      "loss": 0.0129,
+      "step": 110
+    },
+    {
+      "epoch": 1.4208,
+      "grad_norm": 2.76927802183368,
+      "learning_rate": 4.6796792348466353e-07,
+      "loss": 0.013,
+      "step": 111
+    },
+    {
+      "epoch": 1.4336,
+      "grad_norm": 3.2171761582689915,
+      "learning_rate": 4.4910301854789755e-07,
+      "loss": 0.0114,
+      "step": 112
+    },
+    {
+      "epoch": 1.4464000000000001,
+      "grad_norm": 2.7947744875678096,
+      "learning_rate": 4.3051550683154804e-07,
+      "loss": 0.0113,
+      "step": 113
+    },
+    {
+      "epoch": 1.4592,
+      "grad_norm": 2.4585140708787043,
+      "learning_rate": 4.1221474770752696e-07,
+      "loss": 0.0103,
+      "step": 114
+    },
+    {
+      "epoch": 1.472,
+      "grad_norm": 2.8113536653109965,
+      "learning_rate": 3.942099561591802e-07,
+      "loss": 0.0106,
+      "step": 115
+    },
+    {
+      "epoch": 1.4848,
+      "grad_norm": 3.6398945452240055,
+      "learning_rate": 3.765101981412665e-07,
+      "loss": 0.0127,
+      "step": 116
+    },
+    {
+      "epoch": 1.4976,
+      "grad_norm": 2.9485443643029607,
+      "learning_rate": 3.5912438601497584e-07,
+      "loss": 0.009,
+      "step": 117
+    },
+    {
+      "epoch": 1.5104,
+      "grad_norm": 2.9984190637681096,
+      "learning_rate": 3.420612740602874e-07,
+      "loss": 0.0093,
+      "step": 118
+    },
+    {
+      "epoch": 1.5232,
+      "grad_norm": 2.8046736646132744,
+      "learning_rate": 3.253294540679257e-07,
+      "loss": 0.0094,
+      "step": 119
+    },
+    {
+      "epoch": 1.536,
+      "grad_norm": 2.9182942187963605,
+      "learning_rate": 3.0893735101313535e-07,
+      "loss": 0.0101,
+      "step": 120
+    },
+    {
+      "epoch": 1.5488,
+      "grad_norm": 4.061738080588852,
+      "learning_rate": 2.9289321881345254e-07,
+      "loss": 0.0148,
+      "step": 121
+    },
+    {
+      "epoch": 1.5615999999999999,
+      "grad_norm": 2.6181262527250064,
+      "learning_rate": 2.7720513617260855e-07,
+      "loss": 0.0108,
+      "step": 122
+    },
+    {
+      "epoch": 1.5744,
+      "grad_norm": 2.814162128761838,
+      "learning_rate": 2.6188100251265943e-07,
+      "loss": 0.0085,
+      "step": 123
+    },
+    {
+      "epoch": 1.5872000000000002,
+      "grad_norm": 2.978469400791401,
+      "learning_rate": 2.4692853399638913e-07,
+      "loss": 0.0123,
+      "step": 124
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 1.9752826330171183,
+      "learning_rate": 2.3235525964198888e-07,
+      "loss": 0.0091,
+      "step": 125
+    },
+    {
+      "epoch": 1.6128,
+      "grad_norm": 2.6883643677418623,
+      "learning_rate": 2.181685175319702e-07,
+      "loss": 0.0097,
+      "step": 126
+    },
+    {
+      "epoch": 1.6256,
+      "grad_norm": 2.9489506229688884,
+      "learning_rate": 2.043754511182191e-07,
+      "loss": 0.0079,
+      "step": 127
+    },
+    {
+      "epoch": 1.6383999999999999,
+      "grad_norm": 1.9593044592383062,
+      "learning_rate": 1.9098300562505264e-07,
+      "loss": 0.0081,
+      "step": 128
+    },
+    {
+      "epoch": 1.6512,
+      "grad_norm": 1.952957689869861,
+      "learning_rate": 1.7799792455209016e-07,
+      "loss": 0.0082,
+      "step": 129
+    },
+    {
+      "epoch": 1.6640000000000001,
+      "grad_norm": 2.38161666441156,
+      "learning_rate": 1.6542674627869734e-07,
+      "loss": 0.0094,
+      "step": 130
+    },
+    {
+      "epoch": 1.6768,
+      "grad_norm": 2.164377402243927,
+      "learning_rate": 1.5327580077171588e-07,
+      "loss": 0.0097,
+      "step": 131
+    },
+    {
+      "epoch": 1.6896,
+      "grad_norm": 3.518297727779028,
+      "learning_rate": 1.415512063981339e-07,
+      "loss": 0.0089,
+      "step": 132
+    },
+    {
+      "epoch": 1.7024,
+      "grad_norm": 3.697074996731219,
+      "learning_rate": 1.3025886684430465e-07,
+      "loss": 0.0078,
+      "step": 133
+    },
+    {
+      "epoch": 1.7151999999999998,
+      "grad_norm": 4.314207988612093,
+      "learning_rate": 1.19404468143262e-07,
+      "loss": 0.0119,
+      "step": 134
+    },
+    {
+      "epoch": 1.728,
+      "grad_norm": 2.9026476752647414,
+      "learning_rate": 1.089934758116322e-07,
+      "loss": 0.0138,
+      "step": 135
+    },
+    {
+      "epoch": 1.7408000000000001,
+      "grad_norm": 2.502384977722473,
+      "learning_rate": 9.903113209758096e-08,
+      "loss": 0.0112,
+      "step": 136
+    },
+    {
+      "epoch": 1.7536,
+      "grad_norm": 1.7807221172577514,
+      "learning_rate": 8.952245334118413e-08,
+      "loss": 0.0077,
+      "step": 137
+    },
+    {
+      "epoch": 1.7664,
+      "grad_norm": 2.727436513377534,
+      "learning_rate": 8.047222744854942e-08,
+      "loss": 0.0096,
+      "step": 138
+    },
+    {
+      "epoch": 1.7792,
+      "grad_norm": 1.6157555666433816,
+      "learning_rate": 7.188501148096116e-08,
+      "loss": 0.007,
+      "step": 139
+    },
+    {
+      "epoch": 1.792,
+      "grad_norm": 1.5949515973033697,
+      "learning_rate": 6.376512936026279e-08,
+      "loss": 0.0062,
+      "step": 140
+    },
+    {
+      "epoch": 1.8048,
+      "grad_norm": 2.6399637194756718,
+      "learning_rate": 5.611666969163242e-08,
+      "loss": 0.0095,
+      "step": 141
+    },
+    {
+      "epoch": 1.8176,
+      "grad_norm": 3.0621545226623907,
+      "learning_rate": 4.8943483704846465e-08,
+      "loss": 0.0132,
+      "step": 142
+    },
+    {
+      "epoch": 1.8304,
+      "grad_norm": 1.9600404784878551,
+      "learning_rate": 4.224918331506955e-08,
+      "loss": 0.0095,
+      "step": 143
+    },
+    {
+      "epoch": 1.8432,
+      "grad_norm": 1.963746557604649,
+      "learning_rate": 3.6037139304146756e-08,
+      "loss": 0.0099,
+      "step": 144
+    },
+    {
+      "epoch": 1.8559999999999999,
+      "grad_norm": 4.527295989762448,
+      "learning_rate": 3.0310479623313125e-08,
+      "loss": 0.0144,
+      "step": 145
+    },
+    {
+      "epoch": 1.8688,
+      "grad_norm": 2.6834067438548166,
+      "learning_rate": 2.507208781817638e-08,
+      "loss": 0.012,
+      "step": 146
+    },
+    {
+      "epoch": 1.8816000000000002,
+      "grad_norm": 1.8417342752412211,
+      "learning_rate": 2.032460157676452e-08,
+      "loss": 0.0077,
+      "step": 147
+    },
+    {
+      "epoch": 1.8944,
+      "grad_norm": 2.99373797619176,
+      "learning_rate": 1.607041140137033e-08,
+      "loss": 0.0115,
+      "step": 148
+    },
+    {
+      "epoch": 1.9072,
+      "grad_norm": 1.1214911188004222,
+      "learning_rate": 1.231165940486234e-08,
+      "loss": 0.0062,
+      "step": 149
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 2.116183152950272,
+      "learning_rate": 9.050238232065299e-09,
+      "loss": 0.0097,
+      "step": 150
+    },
+    {
+      "epoch": 1.9327999999999999,
+      "grad_norm": 1.8359832453089462,
+      "learning_rate": 6.2877901067573955e-09,
+      "loss": 0.0069,
+      "step": 151
+    },
+    {
+      "epoch": 1.9456,
+      "grad_norm": 2.0445129237685773,
+      "learning_rate": 4.025706004760931e-09,
+      "loss": 0.0086,
+      "step": 152
+    },
+    {
+      "epoch": 1.9584000000000001,
+      "grad_norm": 1.4095794512624829,
+      "learning_rate": 2.2651249535439177e-09,
+      "loss": 0.0053,
+      "step": 153
+    },
+    {
+      "epoch": 1.9712,
+      "grad_norm": 1.4980650303315703,
+      "learning_rate": 1.0069334586854105e-09,
+      "loss": 0.0068,
+      "step": 154
+    },
+    {
+      "epoch": 1.984,
+      "grad_norm": 1.6817715164958336,
+      "learning_rate": 2.517650574934693e-10,
+      "loss": 0.0087,
+      "step": 155
+    },
+    {
+      "epoch": 1.9968,
+      "grad_norm": 5.327422033689792,
+      "learning_rate": 0.0,
+      "loss": 0.0163,
+      "step": 156
+    },
+    {
+      "epoch": 1.9968,
+      "step": 156,
+      "total_flos": 138561371045888.0,
+      "train_loss": 0.019949143110678937,
+      "train_runtime": 6111.638,
+      "train_samples_per_second": 6.545,
+      "train_steps_per_second": 0.026
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 156,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 138561371045888.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}