qwen7_2wiki1 / trainer_state.json
mangopy's picture
Upload trainer_state.json with huggingface_hub
e36347c verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9975990396158463,
"eval_steps": 500,
"global_step": 312,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006402561024409764,
"grad_norm": 13.680483919438638,
"learning_rate": 6.25e-08,
"loss": 0.7419,
"step": 1
},
{
"epoch": 0.012805122048819529,
"grad_norm": 12.895832423301675,
"learning_rate": 1.25e-07,
"loss": 0.6763,
"step": 2
},
{
"epoch": 0.01920768307322929,
"grad_norm": 12.171002116468559,
"learning_rate": 1.875e-07,
"loss": 0.6351,
"step": 3
},
{
"epoch": 0.025610244097639057,
"grad_norm": 12.464785861868467,
"learning_rate": 2.5e-07,
"loss": 0.7288,
"step": 4
},
{
"epoch": 0.03201280512204882,
"grad_norm": 10.388017670547901,
"learning_rate": 3.1249999999999997e-07,
"loss": 0.6076,
"step": 5
},
{
"epoch": 0.03841536614645858,
"grad_norm": 12.079460028084922,
"learning_rate": 3.75e-07,
"loss": 0.6902,
"step": 6
},
{
"epoch": 0.04481792717086835,
"grad_norm": 12.19876157935689,
"learning_rate": 4.375e-07,
"loss": 0.6948,
"step": 7
},
{
"epoch": 0.051220488195278115,
"grad_norm": 9.756977949266757,
"learning_rate": 5e-07,
"loss": 0.7175,
"step": 8
},
{
"epoch": 0.057623049219687875,
"grad_norm": 10.46968505857669,
"learning_rate": 5.625e-07,
"loss": 0.6654,
"step": 9
},
{
"epoch": 0.06402561024409764,
"grad_norm": 8.571146829037712,
"learning_rate": 6.249999999999999e-07,
"loss": 0.6365,
"step": 10
},
{
"epoch": 0.07042817126850741,
"grad_norm": 7.915474952782504,
"learning_rate": 6.875e-07,
"loss": 0.6584,
"step": 11
},
{
"epoch": 0.07683073229291716,
"grad_norm": 8.017393974111991,
"learning_rate": 7.5e-07,
"loss": 0.6789,
"step": 12
},
{
"epoch": 0.08323329331732693,
"grad_norm": 7.146421473324447,
"learning_rate": 8.125e-07,
"loss": 0.6551,
"step": 13
},
{
"epoch": 0.0896358543417367,
"grad_norm": 6.490398797125461,
"learning_rate": 8.75e-07,
"loss": 0.5721,
"step": 14
},
{
"epoch": 0.09603841536614646,
"grad_norm": 6.813176303739041,
"learning_rate": 9.374999999999999e-07,
"loss": 0.5933,
"step": 15
},
{
"epoch": 0.10244097639055623,
"grad_norm": 6.452956439451391,
"learning_rate": 1e-06,
"loss": 0.5332,
"step": 16
},
{
"epoch": 0.10884353741496598,
"grad_norm": 6.639180709933743,
"learning_rate": 1.0625e-06,
"loss": 0.5836,
"step": 17
},
{
"epoch": 0.11524609843937575,
"grad_norm": 5.461791194778153,
"learning_rate": 1.125e-06,
"loss": 0.5577,
"step": 18
},
{
"epoch": 0.12164865946378552,
"grad_norm": 5.930832649263125,
"learning_rate": 1.1874999999999999e-06,
"loss": 0.5337,
"step": 19
},
{
"epoch": 0.12805122048819528,
"grad_norm": 4.956904837601893,
"learning_rate": 1.2499999999999999e-06,
"loss": 0.4669,
"step": 20
},
{
"epoch": 0.13445378151260504,
"grad_norm": 6.256208371824237,
"learning_rate": 1.3125e-06,
"loss": 0.4811,
"step": 21
},
{
"epoch": 0.14085634253701482,
"grad_norm": 5.246513837804897,
"learning_rate": 1.375e-06,
"loss": 0.4414,
"step": 22
},
{
"epoch": 0.14725890356142457,
"grad_norm": 5.528561343115844,
"learning_rate": 1.4375e-06,
"loss": 0.4948,
"step": 23
},
{
"epoch": 0.15366146458583432,
"grad_norm": 5.333224339149711,
"learning_rate": 1.5e-06,
"loss": 0.4281,
"step": 24
},
{
"epoch": 0.1600640256102441,
"grad_norm": 4.9864846595798324,
"learning_rate": 1.5624999999999999e-06,
"loss": 0.4362,
"step": 25
},
{
"epoch": 0.16646658663465386,
"grad_norm": 5.269516171255013,
"learning_rate": 1.625e-06,
"loss": 0.4036,
"step": 26
},
{
"epoch": 0.17286914765906364,
"grad_norm": 4.932131322685013,
"learning_rate": 1.6875e-06,
"loss": 0.4038,
"step": 27
},
{
"epoch": 0.1792717086834734,
"grad_norm": 5.318953799775683,
"learning_rate": 1.75e-06,
"loss": 0.4283,
"step": 28
},
{
"epoch": 0.18567426970788314,
"grad_norm": 5.190854642160747,
"learning_rate": 1.8125e-06,
"loss": 0.4044,
"step": 29
},
{
"epoch": 0.19207683073229292,
"grad_norm": 4.807806644055786,
"learning_rate": 1.8749999999999998e-06,
"loss": 0.4117,
"step": 30
},
{
"epoch": 0.19847939175670268,
"grad_norm": 4.292846106324363,
"learning_rate": 1.9375e-06,
"loss": 0.3416,
"step": 31
},
{
"epoch": 0.20488195278111246,
"grad_norm": 4.03903415559866,
"learning_rate": 2e-06,
"loss": 0.3794,
"step": 32
},
{
"epoch": 0.2112845138055222,
"grad_norm": 4.493745802649108,
"learning_rate": 1.9999370567547003e-06,
"loss": 0.3952,
"step": 33
},
{
"epoch": 0.21768707482993196,
"grad_norm": 4.645817807816507,
"learning_rate": 1.9997482349425066e-06,
"loss": 0.3743,
"step": 34
},
{
"epoch": 0.22408963585434175,
"grad_norm": 5.0435551125438,
"learning_rate": 1.9994335583335335e-06,
"loss": 0.3525,
"step": 35
},
{
"epoch": 0.2304921968787515,
"grad_norm": 4.788576656798642,
"learning_rate": 1.9989930665413145e-06,
"loss": 0.3865,
"step": 36
},
{
"epoch": 0.23689475790316125,
"grad_norm": 4.390755031941915,
"learning_rate": 1.9984268150178167e-06,
"loss": 0.3295,
"step": 37
},
{
"epoch": 0.24329731892757103,
"grad_norm": 4.772214941084568,
"learning_rate": 1.997734875046456e-06,
"loss": 0.3334,
"step": 38
},
{
"epoch": 0.24969987995198079,
"grad_norm": 5.273946498985277,
"learning_rate": 1.996917333733128e-06,
"loss": 0.3726,
"step": 39
},
{
"epoch": 0.25610244097639057,
"grad_norm": 5.369010852231794,
"learning_rate": 1.995974293995239e-06,
"loss": 0.3707,
"step": 40
},
{
"epoch": 0.26250500200080035,
"grad_norm": 5.767047861871442,
"learning_rate": 1.994905874548752e-06,
"loss": 0.3604,
"step": 41
},
{
"epoch": 0.2689075630252101,
"grad_norm": 5.503870643763186,
"learning_rate": 1.9937122098932426e-06,
"loss": 0.3746,
"step": 42
},
{
"epoch": 0.27531012404961985,
"grad_norm": 6.3642035735044935,
"learning_rate": 1.9923934502949643e-06,
"loss": 0.3524,
"step": 43
},
{
"epoch": 0.28171268507402963,
"grad_norm": 5.870101299422478,
"learning_rate": 1.9909497617679347e-06,
"loss": 0.3672,
"step": 44
},
{
"epoch": 0.28811524609843936,
"grad_norm": 4.907548990312214,
"learning_rate": 1.9893813260530367e-06,
"loss": 0.3748,
"step": 45
},
{
"epoch": 0.29451780712284914,
"grad_norm": 4.106972565193302,
"learning_rate": 1.9876883405951377e-06,
"loss": 0.3265,
"step": 46
},
{
"epoch": 0.3009203681472589,
"grad_norm": 4.649037094549095,
"learning_rate": 1.9858710185182355e-06,
"loss": 0.2802,
"step": 47
},
{
"epoch": 0.30732292917166865,
"grad_norm": 4.578190605072973,
"learning_rate": 1.9839295885986295e-06,
"loss": 0.3414,
"step": 48
},
{
"epoch": 0.3137254901960784,
"grad_norm": 5.338755744928473,
"learning_rate": 1.9818642952361183e-06,
"loss": 0.3513,
"step": 49
},
{
"epoch": 0.3201280512204882,
"grad_norm": 5.16732801301707,
"learning_rate": 1.9796753984232355e-06,
"loss": 0.3385,
"step": 50
},
{
"epoch": 0.32653061224489793,
"grad_norm": 4.308178011035292,
"learning_rate": 1.977363173712519e-06,
"loss": 0.3337,
"step": 51
},
{
"epoch": 0.3329331732693077,
"grad_norm": 4.047304578891064,
"learning_rate": 1.9749279121818236e-06,
"loss": 0.3536,
"step": 52
},
{
"epoch": 0.3393357342937175,
"grad_norm": 4.189108068341681,
"learning_rate": 1.9723699203976766e-06,
"loss": 0.3007,
"step": 53
},
{
"epoch": 0.3457382953181273,
"grad_norm": 4.150519885409928,
"learning_rate": 1.9696895203766866e-06,
"loss": 0.2981,
"step": 54
},
{
"epoch": 0.352140856342537,
"grad_norm": 4.59545667540724,
"learning_rate": 1.966887049545006e-06,
"loss": 0.3121,
"step": 55
},
{
"epoch": 0.3585434173669468,
"grad_norm": 3.9736439160316683,
"learning_rate": 1.9639628606958534e-06,
"loss": 0.3049,
"step": 56
},
{
"epoch": 0.36494597839135656,
"grad_norm": 4.3054703674323,
"learning_rate": 1.9609173219450997e-06,
"loss": 0.3124,
"step": 57
},
{
"epoch": 0.3713485394157663,
"grad_norm": 3.4948815561353537,
"learning_rate": 1.9577508166849303e-06,
"loss": 0.3013,
"step": 58
},
{
"epoch": 0.37775110044017607,
"grad_norm": 3.741965125562426,
"learning_rate": 1.9544637435355806e-06,
"loss": 0.2894,
"step": 59
},
{
"epoch": 0.38415366146458585,
"grad_norm": 4.280653888850941,
"learning_rate": 1.9510565162951534e-06,
"loss": 0.3134,
"step": 60
},
{
"epoch": 0.3905562224889956,
"grad_norm": 4.405700671682634,
"learning_rate": 1.947529563887529e-06,
"loss": 0.3177,
"step": 61
},
{
"epoch": 0.39695878351340536,
"grad_norm": 3.9394485247143636,
"learning_rate": 1.9438833303083674e-06,
"loss": 0.2966,
"step": 62
},
{
"epoch": 0.40336134453781514,
"grad_norm": 4.389702795617689,
"learning_rate": 1.9401182745692187e-06,
"loss": 0.2903,
"step": 63
},
{
"epoch": 0.4097639055622249,
"grad_norm": 3.5979658724911077,
"learning_rate": 1.936234870639737e-06,
"loss": 0.2953,
"step": 64
},
{
"epoch": 0.41616646658663464,
"grad_norm": 4.47441718112865,
"learning_rate": 1.9322336073880143e-06,
"loss": 0.3248,
"step": 65
},
{
"epoch": 0.4225690276110444,
"grad_norm": 3.9355652185440637,
"learning_rate": 1.928114988519039e-06,
"loss": 0.3093,
"step": 66
},
{
"epoch": 0.4289715886354542,
"grad_norm": 4.580138491679913,
"learning_rate": 1.9238795325112867e-06,
"loss": 0.3471,
"step": 67
},
{
"epoch": 0.43537414965986393,
"grad_norm": 4.264136088165448,
"learning_rate": 1.9195277725514506e-06,
"loss": 0.3202,
"step": 68
},
{
"epoch": 0.4417767106842737,
"grad_norm": 3.955882298206603,
"learning_rate": 1.91506025646732e-06,
"loss": 0.3297,
"step": 69
},
{
"epoch": 0.4481792717086835,
"grad_norm": 3.5995979170148513,
"learning_rate": 1.9104775466588157e-06,
"loss": 0.2737,
"step": 70
},
{
"epoch": 0.4545818327330932,
"grad_norm": 5.25272090784733,
"learning_rate": 1.905780220027194e-06,
"loss": 0.2706,
"step": 71
},
{
"epoch": 0.460984393757503,
"grad_norm": 4.341561414420844,
"learning_rate": 1.9009688679024189e-06,
"loss": 0.3079,
"step": 72
},
{
"epoch": 0.4673869547819128,
"grad_norm": 3.6758492701618226,
"learning_rate": 1.8960440959687252e-06,
"loss": 0.2924,
"step": 73
},
{
"epoch": 0.4737895158063225,
"grad_norm": 3.9451817967595386,
"learning_rate": 1.8910065241883678e-06,
"loss": 0.2971,
"step": 74
},
{
"epoch": 0.4801920768307323,
"grad_norm": 4.96315413342626,
"learning_rate": 1.8858567867235798e-06,
"loss": 0.2673,
"step": 75
},
{
"epoch": 0.48659463785514206,
"grad_norm": 4.139530587876671,
"learning_rate": 1.8805955318567379e-06,
"loss": 0.27,
"step": 76
},
{
"epoch": 0.49299719887955185,
"grad_norm": 4.036729827911165,
"learning_rate": 1.8752234219087537e-06,
"loss": 0.3079,
"step": 77
},
{
"epoch": 0.49939975990396157,
"grad_norm": 4.267304261790974,
"learning_rate": 1.8697411331556953e-06,
"loss": 0.2777,
"step": 78
},
{
"epoch": 0.5058023209283713,
"grad_norm": 4.140171639449446,
"learning_rate": 1.8641493557436548e-06,
"loss": 0.2965,
"step": 79
},
{
"epoch": 0.5122048819527811,
"grad_norm": 4.42365589599126,
"learning_rate": 1.858448793601866e-06,
"loss": 0.3452,
"step": 80
},
{
"epoch": 0.5186074429771909,
"grad_norm": 4.2607097203243125,
"learning_rate": 1.852640164354092e-06,
"loss": 0.2703,
"step": 81
},
{
"epoch": 0.5250100040016007,
"grad_norm": 3.9797822473229814,
"learning_rate": 1.8467241992282841e-06,
"loss": 0.2994,
"step": 82
},
{
"epoch": 0.5314125650260104,
"grad_norm": 3.7553052637845035,
"learning_rate": 1.8407016429645302e-06,
"loss": 0.296,
"step": 83
},
{
"epoch": 0.5378151260504201,
"grad_norm": 3.7344036316964893,
"learning_rate": 1.8345732537213026e-06,
"loss": 0.3052,
"step": 84
},
{
"epoch": 0.54421768707483,
"grad_norm": 4.104658693180337,
"learning_rate": 1.8283398029800164e-06,
"loss": 0.2794,
"step": 85
},
{
"epoch": 0.5506202480992397,
"grad_norm": 3.86258232609451,
"learning_rate": 1.82200207544791e-06,
"loss": 0.2729,
"step": 86
},
{
"epoch": 0.5570228091236494,
"grad_norm": 3.5583118098950064,
"learning_rate": 1.8155608689592601e-06,
"loss": 0.2882,
"step": 87
},
{
"epoch": 0.5634253701480593,
"grad_norm": 4.521351652684801,
"learning_rate": 1.8090169943749474e-06,
"loss": 0.3041,
"step": 88
},
{
"epoch": 0.569827931172469,
"grad_norm": 3.403629696215942,
"learning_rate": 1.802371275480378e-06,
"loss": 0.2579,
"step": 89
},
{
"epoch": 0.5762304921968787,
"grad_norm": 3.6982154812345396,
"learning_rate": 1.795624548881781e-06,
"loss": 0.2464,
"step": 90
},
{
"epoch": 0.5826330532212886,
"grad_norm": 4.352958033159996,
"learning_rate": 1.7887776639008912e-06,
"loss": 0.2641,
"step": 91
},
{
"epoch": 0.5890356142456983,
"grad_norm": 4.54700815993712,
"learning_rate": 1.7818314824680298e-06,
"loss": 0.3019,
"step": 92
},
{
"epoch": 0.595438175270108,
"grad_norm": 3.789128443890756,
"learning_rate": 1.774786879013601e-06,
"loss": 0.2883,
"step": 93
},
{
"epoch": 0.6018407362945178,
"grad_norm": 4.340181858310212,
"learning_rate": 1.767644740358011e-06,
"loss": 0.3271,
"step": 94
},
{
"epoch": 0.6082432973189276,
"grad_norm": 4.181233142586761,
"learning_rate": 1.760405965600031e-06,
"loss": 0.3036,
"step": 95
},
{
"epoch": 0.6146458583433373,
"grad_norm": 4.468139724866329,
"learning_rate": 1.753071466003611e-06,
"loss": 0.2808,
"step": 96
},
{
"epoch": 0.6210484193677471,
"grad_norm": 4.510323988503172,
"learning_rate": 1.7456421648831654e-06,
"loss": 0.26,
"step": 97
},
{
"epoch": 0.6274509803921569,
"grad_norm": 3.8787104679210507,
"learning_rate": 1.7381189974873407e-06,
"loss": 0.2888,
"step": 98
},
{
"epoch": 0.6338535414165666,
"grad_norm": 4.459830521867393,
"learning_rate": 1.7305029108812774e-06,
"loss": 0.2464,
"step": 99
},
{
"epoch": 0.6402561024409764,
"grad_norm": 4.709563601825434,
"learning_rate": 1.7227948638273915e-06,
"loss": 0.3026,
"step": 100
},
{
"epoch": 0.6466586634653861,
"grad_norm": 3.7651337955677255,
"learning_rate": 1.7149958266646754e-06,
"loss": 0.2786,
"step": 101
},
{
"epoch": 0.6530612244897959,
"grad_norm": 5.5973077659411405,
"learning_rate": 1.7071067811865474e-06,
"loss": 0.288,
"step": 102
},
{
"epoch": 0.6594637855142057,
"grad_norm": 3.8873700424843127,
"learning_rate": 1.6991287205172574e-06,
"loss": 0.2722,
"step": 103
},
{
"epoch": 0.6658663465386154,
"grad_norm": 4.2130449731998425,
"learning_rate": 1.6910626489868648e-06,
"loss": 0.2955,
"step": 104
},
{
"epoch": 0.6722689075630253,
"grad_norm": 4.14817042922031,
"learning_rate": 1.682909582004807e-06,
"loss": 0.3185,
"step": 105
},
{
"epoch": 0.678671468587435,
"grad_norm": 4.776345876515052,
"learning_rate": 1.6746705459320744e-06,
"loss": 0.3116,
"step": 106
},
{
"epoch": 0.6850740296118447,
"grad_norm": 3.857374024007739,
"learning_rate": 1.6663465779520037e-06,
"loss": 0.3189,
"step": 107
},
{
"epoch": 0.6914765906362546,
"grad_norm": 4.133483454926766,
"learning_rate": 1.6579387259397126e-06,
"loss": 0.3082,
"step": 108
},
{
"epoch": 0.6978791516606643,
"grad_norm": 4.020769377918411,
"learning_rate": 1.6494480483301835e-06,
"loss": 0.2825,
"step": 109
},
{
"epoch": 0.704281712685074,
"grad_norm": 4.261022360359079,
"learning_rate": 1.640875613985024e-06,
"loss": 0.2907,
"step": 110
},
{
"epoch": 0.7106842737094838,
"grad_norm": 3.712351214158889,
"learning_rate": 1.6322225020579096e-06,
"loss": 0.2822,
"step": 111
},
{
"epoch": 0.7170868347338936,
"grad_norm": 3.440740014931854,
"learning_rate": 1.6234898018587336e-06,
"loss": 0.2381,
"step": 112
},
{
"epoch": 0.7234893957583033,
"grad_norm": 3.775626420708748,
"learning_rate": 1.6146786127164771e-06,
"loss": 0.2937,
"step": 113
},
{
"epoch": 0.7298919567827131,
"grad_norm": 3.8211915298740666,
"learning_rate": 1.6057900438408199e-06,
"loss": 0.2843,
"step": 114
},
{
"epoch": 0.7362945178071229,
"grad_norm": 3.752355539307679,
"learning_rate": 1.5968252141825035e-06,
"loss": 0.2648,
"step": 115
},
{
"epoch": 0.7426970788315326,
"grad_norm": 3.842629249054385,
"learning_rate": 1.587785252292473e-06,
"loss": 0.27,
"step": 116
},
{
"epoch": 0.7490996398559424,
"grad_norm": 4.166939902099283,
"learning_rate": 1.578671296179806e-06,
"loss": 0.2521,
"step": 117
},
{
"epoch": 0.7555022008803521,
"grad_norm": 3.8837453592291844,
"learning_rate": 1.569484493168452e-06,
"loss": 0.2926,
"step": 118
},
{
"epoch": 0.7619047619047619,
"grad_norm": 3.9849389096374837,
"learning_rate": 1.5602259997528027e-06,
"loss": 0.2415,
"step": 119
},
{
"epoch": 0.7683073229291717,
"grad_norm": 3.8214588045054407,
"learning_rate": 1.5508969814521024e-06,
"loss": 0.2896,
"step": 120
},
{
"epoch": 0.7747098839535814,
"grad_norm": 3.596004790144505,
"learning_rate": 1.5414986126637257e-06,
"loss": 0.25,
"step": 121
},
{
"epoch": 0.7811124449779911,
"grad_norm": 3.7868659894236734,
"learning_rate": 1.5320320765153365e-06,
"loss": 0.2742,
"step": 122
},
{
"epoch": 0.787515006002401,
"grad_norm": 3.605103510806505,
"learning_rate": 1.5224985647159488e-06,
"loss": 0.2288,
"step": 123
},
{
"epoch": 0.7939175670268107,
"grad_norm": 4.164526361851514,
"learning_rate": 1.5128992774059062e-06,
"loss": 0.257,
"step": 124
},
{
"epoch": 0.8003201280512204,
"grad_norm": 4.2601493772240095,
"learning_rate": 1.5032354230058002e-06,
"loss": 0.2703,
"step": 125
},
{
"epoch": 0.8067226890756303,
"grad_norm": 4.230695549396448,
"learning_rate": 1.4935082180643467e-06,
"loss": 0.315,
"step": 126
},
{
"epoch": 0.81312525010004,
"grad_norm": 4.029734026287984,
"learning_rate": 1.4837188871052397e-06,
"loss": 0.2587,
"step": 127
},
{
"epoch": 0.8195278111244498,
"grad_norm": 4.418674847039931,
"learning_rate": 1.4738686624729987e-06,
"loss": 0.3162,
"step": 128
},
{
"epoch": 0.8259303721488596,
"grad_norm": 4.0453350041861205,
"learning_rate": 1.463958784177834e-06,
"loss": 0.2534,
"step": 129
},
{
"epoch": 0.8323329331732693,
"grad_norm": 4.6877576825765335,
"learning_rate": 1.4539904997395467e-06,
"loss": 0.2631,
"step": 130
},
{
"epoch": 0.8387354941976791,
"grad_norm": 3.780029409396397,
"learning_rate": 1.4439650640304821e-06,
"loss": 0.271,
"step": 131
},
{
"epoch": 0.8451380552220888,
"grad_norm": 3.6785546417231756,
"learning_rate": 1.433883739117558e-06,
"loss": 0.2418,
"step": 132
},
{
"epoch": 0.8515406162464986,
"grad_norm": 3.396354715623399,
"learning_rate": 1.4237477941033886e-06,
"loss": 0.2499,
"step": 133
},
{
"epoch": 0.8579431772709084,
"grad_norm": 3.7280690326153265,
"learning_rate": 1.4135585049665206e-06,
"loss": 0.2846,
"step": 134
},
{
"epoch": 0.8643457382953181,
"grad_norm": 4.004231717394044,
"learning_rate": 1.4033171544008051e-06,
"loss": 0.2938,
"step": 135
},
{
"epoch": 0.8707482993197279,
"grad_norm": 3.342930768359313,
"learning_rate": 1.3930250316539235e-06,
"loss": 0.2693,
"step": 136
},
{
"epoch": 0.8771508603441377,
"grad_norm": 3.9327560661895244,
"learning_rate": 1.3826834323650898e-06,
"loss": 0.3064,
"step": 137
},
{
"epoch": 0.8835534213685474,
"grad_norm": 3.8159537564044954,
"learning_rate": 1.3722936584019451e-06,
"loss": 0.2529,
"step": 138
},
{
"epoch": 0.8899559823929571,
"grad_norm": 3.453933647883322,
"learning_rate": 1.3618570176966722e-06,
"loss": 0.2837,
"step": 139
},
{
"epoch": 0.896358543417367,
"grad_norm": 3.7776942360972035,
"learning_rate": 1.3513748240813427e-06,
"loss": 0.3414,
"step": 140
},
{
"epoch": 0.9027611044417767,
"grad_norm": 3.701115924801193,
"learning_rate": 1.3408483971225249e-06,
"loss": 0.2845,
"step": 141
},
{
"epoch": 0.9091636654661864,
"grad_norm": 3.642984026753104,
"learning_rate": 1.3302790619551672e-06,
"loss": 0.2472,
"step": 142
},
{
"epoch": 0.9155662264905963,
"grad_norm": 3.875796995615745,
"learning_rate": 1.3196681491157816e-06,
"loss": 0.2481,
"step": 143
},
{
"epoch": 0.921968787515006,
"grad_norm": 3.8424117933387256,
"learning_rate": 1.3090169943749473e-06,
"loss": 0.2819,
"step": 144
},
{
"epoch": 0.9283713485394157,
"grad_norm": 4.037093475071882,
"learning_rate": 1.298326938569156e-06,
"loss": 0.324,
"step": 145
},
{
"epoch": 0.9347739095638256,
"grad_norm": 3.5202146156430447,
"learning_rate": 1.2875993274320173e-06,
"loss": 0.2459,
"step": 146
},
{
"epoch": 0.9411764705882353,
"grad_norm": 3.740795759086208,
"learning_rate": 1.2768355114248492e-06,
"loss": 0.2443,
"step": 147
},
{
"epoch": 0.947579031612645,
"grad_norm": 4.3061782207302866,
"learning_rate": 1.266036845566675e-06,
"loss": 0.3083,
"step": 148
},
{
"epoch": 0.9539815926370548,
"grad_norm": 3.9442811474090775,
"learning_rate": 1.2552046892636426e-06,
"loss": 0.244,
"step": 149
},
{
"epoch": 0.9603841536614646,
"grad_norm": 3.9865071324515027,
"learning_rate": 1.244340406137894e-06,
"loss": 0.2277,
"step": 150
},
{
"epoch": 0.9667867146858744,
"grad_norm": 4.016047319687184,
"learning_rate": 1.2334453638559054e-06,
"loss": 0.2541,
"step": 151
},
{
"epoch": 0.9731892757102841,
"grad_norm": 4.032894518578795,
"learning_rate": 1.2225209339563143e-06,
"loss": 0.2575,
"step": 152
},
{
"epoch": 0.9795918367346939,
"grad_norm": 4.065384007721365,
"learning_rate": 1.211568491677263e-06,
"loss": 0.2638,
"step": 153
},
{
"epoch": 0.9859943977591037,
"grad_norm": 4.082382918746455,
"learning_rate": 1.2005894157832728e-06,
"loss": 0.3124,
"step": 154
},
{
"epoch": 0.9923969587835134,
"grad_norm": 4.690622898413357,
"learning_rate": 1.1895850883916785e-06,
"loss": 0.2434,
"step": 155
},
{
"epoch": 0.9987995198079231,
"grad_norm": 4.05650193960916,
"learning_rate": 1.1785568947986366e-06,
"loss": 0.3047,
"step": 156
},
{
"epoch": 1.005202080832333,
"grad_norm": 3.2193287198369993,
"learning_rate": 1.1675062233047363e-06,
"loss": 0.22,
"step": 157
},
{
"epoch": 1.0116046418567426,
"grad_norm": 2.8522266075564024,
"learning_rate": 1.156434465040231e-06,
"loss": 0.2104,
"step": 158
},
{
"epoch": 1.0180072028811524,
"grad_norm": 2.960293382016092,
"learning_rate": 1.1453430137899128e-06,
"loss": 0.224,
"step": 159
},
{
"epoch": 1.0244097639055623,
"grad_norm": 3.1247052252152434,
"learning_rate": 1.1342332658176555e-06,
"loss": 0.2318,
"step": 160
},
{
"epoch": 1.0308123249299719,
"grad_norm": 2.7511164775931163,
"learning_rate": 1.123106619690643e-06,
"loss": 0.2045,
"step": 161
},
{
"epoch": 1.0372148859543817,
"grad_norm": 2.7289267777021027,
"learning_rate": 1.1119644761033077e-06,
"loss": 0.2038,
"step": 162
},
{
"epoch": 1.0436174469787916,
"grad_norm": 2.713995483150959,
"learning_rate": 1.1008082377010045e-06,
"loss": 0.2129,
"step": 163
},
{
"epoch": 1.0500200080032012,
"grad_norm": 2.78973076664929,
"learning_rate": 1.0896393089034335e-06,
"loss": 0.2182,
"step": 164
},
{
"epoch": 1.056422569027611,
"grad_norm": 2.9643061675150038,
"learning_rate": 1.078459095727845e-06,
"loss": 0.2232,
"step": 165
},
{
"epoch": 1.0628251300520208,
"grad_norm": 2.8380963314510113,
"learning_rate": 1.0672690056120398e-06,
"loss": 0.2227,
"step": 166
},
{
"epoch": 1.0692276910764307,
"grad_norm": 2.687954696836688,
"learning_rate": 1.0560704472371917e-06,
"loss": 0.2114,
"step": 167
},
{
"epoch": 1.0756302521008403,
"grad_norm": 2.9271042366476996,
"learning_rate": 1.044864830350515e-06,
"loss": 0.2059,
"step": 168
},
{
"epoch": 1.0820328131252501,
"grad_norm": 3.0180594470719897,
"learning_rate": 1.033653565587794e-06,
"loss": 0.2217,
"step": 169
},
{
"epoch": 1.08843537414966,
"grad_norm": 3.1673109290531154,
"learning_rate": 1.022438064295805e-06,
"loss": 0.2155,
"step": 170
},
{
"epoch": 1.0948379351740696,
"grad_norm": 3.0766952480651164,
"learning_rate": 1.0112197383546459e-06,
"loss": 0.217,
"step": 171
},
{
"epoch": 1.1012404961984794,
"grad_norm": 2.719327293458082,
"learning_rate": 1e-06,
"loss": 0.1822,
"step": 172
},
{
"epoch": 1.1076430572228892,
"grad_norm": 2.770227862802081,
"learning_rate": 9.88780261645354e-07,
"loss": 0.1926,
"step": 173
},
{
"epoch": 1.1140456182472989,
"grad_norm": 3.0009223093685704,
"learning_rate": 9.77561935704195e-07,
"loss": 0.1911,
"step": 174
},
{
"epoch": 1.1204481792717087,
"grad_norm": 3.023185799043678,
"learning_rate": 9.663464344122063e-07,
"loss": 0.1903,
"step": 175
},
{
"epoch": 1.1268507402961185,
"grad_norm": 3.457998950657755,
"learning_rate": 9.551351696494853e-07,
"loss": 0.251,
"step": 176
},
{
"epoch": 1.1332533013205282,
"grad_norm": 2.9312099897929036,
"learning_rate": 9.43929552762808e-07,
"loss": 0.2176,
"step": 177
},
{
"epoch": 1.139655862344938,
"grad_norm": 3.3046823347401175,
"learning_rate": 9.327309943879603e-07,
"loss": 0.2109,
"step": 178
},
{
"epoch": 1.1460584233693478,
"grad_norm": 3.0921904822263078,
"learning_rate": 9.215409042721551e-07,
"loss": 0.22,
"step": 179
},
{
"epoch": 1.1524609843937574,
"grad_norm": 2.991127231725485,
"learning_rate": 9.103606910965665e-07,
"loss": 0.188,
"step": 180
},
{
"epoch": 1.1588635454181673,
"grad_norm": 3.281303486352065,
"learning_rate": 8.991917622989955e-07,
"loss": 0.2051,
"step": 181
},
{
"epoch": 1.165266106442577,
"grad_norm": 3.2779897667339393,
"learning_rate": 8.880355238966921e-07,
"loss": 0.2128,
"step": 182
},
{
"epoch": 1.1716686674669867,
"grad_norm": 2.908646079818215,
"learning_rate": 8.768933803093572e-07,
"loss": 0.211,
"step": 183
},
{
"epoch": 1.1780712284913966,
"grad_norm": 3.511327611579083,
"learning_rate": 8.657667341823448e-07,
"loss": 0.2067,
"step": 184
},
{
"epoch": 1.1844737895158064,
"grad_norm": 3.446334131319975,
"learning_rate": 8.546569862100875e-07,
"loss": 0.2126,
"step": 185
},
{
"epoch": 1.190876350540216,
"grad_norm": 3.086417796365709,
"learning_rate": 8.435655349597689e-07,
"loss": 0.1961,
"step": 186
},
{
"epoch": 1.1972789115646258,
"grad_norm": 3.486051214472594,
"learning_rate": 8.324937766952636e-07,
"loss": 0.1954,
"step": 187
},
{
"epoch": 1.2036814725890357,
"grad_norm": 3.4725632158887625,
"learning_rate": 8.214431052013634e-07,
"loss": 0.2207,
"step": 188
},
{
"epoch": 1.2100840336134453,
"grad_norm": 3.102428873715714,
"learning_rate": 8.104149116083216e-07,
"loss": 0.2167,
"step": 189
},
{
"epoch": 1.2164865946378551,
"grad_norm": 3.2423731064102457,
"learning_rate": 7.994105842167272e-07,
"loss": 0.1924,
"step": 190
},
{
"epoch": 1.222889155662265,
"grad_norm": 3.2545985252021152,
"learning_rate": 7.884315083227372e-07,
"loss": 0.2029,
"step": 191
},
{
"epoch": 1.2292917166866746,
"grad_norm": 3.0971604028938517,
"learning_rate": 7.774790660436857e-07,
"loss": 0.1741,
"step": 192
},
{
"epoch": 1.2356942777110844,
"grad_norm": 3.443986175823104,
"learning_rate": 7.665546361440949e-07,
"loss": 0.2451,
"step": 193
},
{
"epoch": 1.2420968387354943,
"grad_norm": 3.3511781482482847,
"learning_rate": 7.556595938621058e-07,
"loss": 0.1843,
"step": 194
},
{
"epoch": 1.2484993997599039,
"grad_norm": 3.4059727380620686,
"learning_rate": 7.447953107363574e-07,
"loss": 0.2354,
"step": 195
},
{
"epoch": 1.2549019607843137,
"grad_norm": 3.2416330111232243,
"learning_rate": 7.33963154433325e-07,
"loss": 0.1955,
"step": 196
},
{
"epoch": 1.2613045218087235,
"grad_norm": 4.2347778935869345,
"learning_rate": 7.231644885751507e-07,
"loss": 0.217,
"step": 197
},
{
"epoch": 1.2677070828331334,
"grad_norm": 3.567306963297163,
"learning_rate": 7.124006725679828e-07,
"loss": 0.2212,
"step": 198
},
{
"epoch": 1.274109643857543,
"grad_norm": 3.00005386027494,
"learning_rate": 7.016730614308439e-07,
"loss": 0.1926,
"step": 199
},
{
"epoch": 1.2805122048819528,
"grad_norm": 3.414969994137627,
"learning_rate": 6.909830056250526e-07,
"loss": 0.2044,
"step": 200
},
{
"epoch": 1.2869147659063627,
"grad_norm": 3.5101423542049868,
"learning_rate": 6.803318508842186e-07,
"loss": 0.1958,
"step": 201
},
{
"epoch": 1.2933173269307723,
"grad_norm": 3.08086263335831,
"learning_rate": 6.697209380448332e-07,
"loss": 0.2239,
"step": 202
},
{
"epoch": 1.2997198879551821,
"grad_norm": 3.6991398316157835,
"learning_rate": 6.59151602877475e-07,
"loss": 0.2019,
"step": 203
},
{
"epoch": 1.306122448979592,
"grad_norm": 3.1898949896135953,
"learning_rate": 6.486251759186572e-07,
"loss": 0.2276,
"step": 204
},
{
"epoch": 1.3125250100040016,
"grad_norm": 3.15209044029482,
"learning_rate": 6.381429823033279e-07,
"loss": 0.1788,
"step": 205
},
{
"epoch": 1.3189275710284114,
"grad_norm": 3.2074734897831756,
"learning_rate": 6.277063415980548e-07,
"loss": 0.2434,
"step": 206
},
{
"epoch": 1.3253301320528212,
"grad_norm": 2.875848443017429,
"learning_rate": 6.173165676349102e-07,
"loss": 0.1809,
"step": 207
},
{
"epoch": 1.3317326930772309,
"grad_norm": 3.273739860976,
"learning_rate": 6.069749683460764e-07,
"loss": 0.2,
"step": 208
},
{
"epoch": 1.3381352541016407,
"grad_norm": 3.4156670470458494,
"learning_rate": 5.96682845599195e-07,
"loss": 0.2044,
"step": 209
},
{
"epoch": 1.3445378151260505,
"grad_norm": 4.24358157090511,
"learning_rate": 5.864414950334795e-07,
"loss": 0.194,
"step": 210
},
{
"epoch": 1.3509403761504601,
"grad_norm": 3.116102100075619,
"learning_rate": 5.762522058966113e-07,
"loss": 0.2089,
"step": 211
},
{
"epoch": 1.35734293717487,
"grad_norm": 2.8625746825319034,
"learning_rate": 5.661162608824419e-07,
"loss": 0.2083,
"step": 212
},
{
"epoch": 1.3637454981992798,
"grad_norm": 3.2057903093169555,
"learning_rate": 5.56034935969518e-07,
"loss": 0.1952,
"step": 213
},
{
"epoch": 1.3701480592236894,
"grad_norm": 2.669239875130929,
"learning_rate": 5.460095002604532e-07,
"loss": 0.2056,
"step": 214
},
{
"epoch": 1.3765506202480993,
"grad_norm": 3.2441222068201605,
"learning_rate": 5.36041215822166e-07,
"loss": 0.209,
"step": 215
},
{
"epoch": 1.382953181272509,
"grad_norm": 3.0334649559489724,
"learning_rate": 5.261313375270013e-07,
"loss": 0.1821,
"step": 216
},
{
"epoch": 1.3893557422969187,
"grad_norm": 3.005021545247239,
"learning_rate": 5.162811128947602e-07,
"loss": 0.1919,
"step": 217
},
{
"epoch": 1.3957583033213286,
"grad_norm": 3.0185970628882184,
"learning_rate": 5.064917819356531e-07,
"loss": 0.2124,
"step": 218
},
{
"epoch": 1.4021608643457384,
"grad_norm": 2.9841034532353645,
"learning_rate": 4.967645769941999e-07,
"loss": 0.1751,
"step": 219
},
{
"epoch": 1.408563425370148,
"grad_norm": 3.1214597510428783,
"learning_rate": 4.871007225940939e-07,
"loss": 0.2245,
"step": 220
},
{
"epoch": 1.4149659863945578,
"grad_norm": 3.118933296947675,
"learning_rate": 4.775014352840512e-07,
"loss": 0.2171,
"step": 221
},
{
"epoch": 1.4213685474189677,
"grad_norm": 3.1742771966166567,
"learning_rate": 4.6796792348466353e-07,
"loss": 0.2132,
"step": 222
},
{
"epoch": 1.4277711084433773,
"grad_norm": 2.602593575781098,
"learning_rate": 4.585013873362743e-07,
"loss": 0.1799,
"step": 223
},
{
"epoch": 1.4341736694677871,
"grad_norm": 2.94290229867875,
"learning_rate": 4.4910301854789755e-07,
"loss": 0.1815,
"step": 224
},
{
"epoch": 1.440576230492197,
"grad_norm": 3.2300005719414204,
"learning_rate": 4.397740002471972e-07,
"loss": 0.2143,
"step": 225
},
{
"epoch": 1.4469787915166066,
"grad_norm": 2.8425860620393943,
"learning_rate": 4.3051550683154804e-07,
"loss": 0.2251,
"step": 226
},
{
"epoch": 1.4533813525410164,
"grad_norm": 2.7744967195870136,
"learning_rate": 4.2132870382019427e-07,
"loss": 0.2023,
"step": 227
},
{
"epoch": 1.4597839135654262,
"grad_norm": 2.8299937805822384,
"learning_rate": 4.1221474770752696e-07,
"loss": 0.1825,
"step": 228
},
{
"epoch": 1.4661864745898359,
"grad_norm": 2.842358054849413,
"learning_rate": 4.031747858174964e-07,
"loss": 0.1815,
"step": 229
},
{
"epoch": 1.4725890356142457,
"grad_norm": 3.0046695988860064,
"learning_rate": 3.942099561591802e-07,
"loss": 0.1876,
"step": 230
},
{
"epoch": 1.4789915966386555,
"grad_norm": 3.049335410632264,
"learning_rate": 3.853213872835228e-07,
"loss": 0.1965,
"step": 231
},
{
"epoch": 1.4853941576630652,
"grad_norm": 2.9908629914418663,
"learning_rate": 3.765101981412665e-07,
"loss": 0.206,
"step": 232
},
{
"epoch": 1.491796718687475,
"grad_norm": 3.6142447406768,
"learning_rate": 3.677774979420903e-07,
"loss": 0.1962,
"step": 233
},
{
"epoch": 1.4981992797118848,
"grad_norm": 3.1083828790336265,
"learning_rate": 3.5912438601497584e-07,
"loss": 0.1907,
"step": 234
},
{
"epoch": 1.5046018407362944,
"grad_norm": 3.8490048708212017,
"learning_rate": 3.5055195166981646e-07,
"loss": 0.2483,
"step": 235
},
{
"epoch": 1.5110044017607043,
"grad_norm": 3.5135746675092574,
"learning_rate": 3.420612740602874e-07,
"loss": 0.2197,
"step": 236
},
{
"epoch": 1.517406962785114,
"grad_norm": 2.9605758005174083,
"learning_rate": 3.3365342204799606e-07,
"loss": 0.2048,
"step": 237
},
{
"epoch": 1.5238095238095237,
"grad_norm": 3.102724567495504,
"learning_rate": 3.253294540679257e-07,
"loss": 0.2163,
"step": 238
},
{
"epoch": 1.5302120848339336,
"grad_norm": 3.338826460184625,
"learning_rate": 3.170904179951931e-07,
"loss": 0.1892,
"step": 239
},
{
"epoch": 1.5366146458583434,
"grad_norm": 3.234763671406949,
"learning_rate": 3.0893735101313535e-07,
"loss": 0.2762,
"step": 240
},
{
"epoch": 1.543017206882753,
"grad_norm": 2.812979283705818,
"learning_rate": 3.008712794827426e-07,
"loss": 0.1717,
"step": 241
},
{
"epoch": 1.5494197679071628,
"grad_norm": 2.934584165405633,
"learning_rate": 2.9289321881345254e-07,
"loss": 0.2,
"step": 242
},
{
"epoch": 1.5558223289315727,
"grad_norm": 3.017927185569003,
"learning_rate": 2.850041733353247e-07,
"loss": 0.1887,
"step": 243
},
{
"epoch": 1.5622248899559823,
"grad_norm": 2.821836163783163,
"learning_rate": 2.7720513617260855e-07,
"loss": 0.1685,
"step": 244
},
{
"epoch": 1.5686274509803921,
"grad_norm": 2.9956132520307692,
"learning_rate": 2.6949708911872247e-07,
"loss": 0.1944,
"step": 245
},
{
"epoch": 1.575030012004802,
"grad_norm": 3.012268102569264,
"learning_rate": 2.6188100251265943e-07,
"loss": 0.2168,
"step": 246
},
{
"epoch": 1.5814325730292116,
"grad_norm": 2.6551981273651646,
"learning_rate": 2.543578351168344e-07,
"loss": 0.1587,
"step": 247
},
{
"epoch": 1.5878351340536214,
"grad_norm": 2.6439156715272913,
"learning_rate": 2.4692853399638913e-07,
"loss": 0.1889,
"step": 248
},
{
"epoch": 1.5942376950780313,
"grad_norm": 2.9350723750826266,
"learning_rate": 2.395940343999691e-07,
"loss": 0.2073,
"step": 249
},
{
"epoch": 1.6006402561024409,
"grad_norm": 3.0827120888341013,
"learning_rate": 2.3235525964198888e-07,
"loss": 0.1895,
"step": 250
},
{
"epoch": 1.6070428171268507,
"grad_norm": 3.1686080831269483,
"learning_rate": 2.252131209863991e-07,
"loss": 0.2029,
"step": 251
},
{
"epoch": 1.6134453781512605,
"grad_norm": 2.851390750231077,
"learning_rate": 2.181685175319702e-07,
"loss": 0.2061,
"step": 252
},
{
"epoch": 1.6198479391756702,
"grad_norm": 3.094325341444059,
"learning_rate": 2.11222336099109e-07,
"loss": 0.192,
"step": 253
},
{
"epoch": 1.62625050020008,
"grad_norm": 3.1464174144330026,
"learning_rate": 2.043754511182191e-07,
"loss": 0.2103,
"step": 254
},
{
"epoch": 1.6326530612244898,
"grad_norm": 3.120376340739763,
"learning_rate": 1.9762872451962208e-07,
"loss": 0.1705,
"step": 255
},
{
"epoch": 1.6390556222488994,
"grad_norm": 3.2179511708389037,
"learning_rate": 1.9098300562505264e-07,
"loss": 0.1948,
"step": 256
},
{
"epoch": 1.6454581832733093,
"grad_norm": 2.8898679416961692,
"learning_rate": 1.8443913104073982e-07,
"loss": 0.1823,
"step": 257
},
{
"epoch": 1.6518607442977191,
"grad_norm": 2.993303933297039,
"learning_rate": 1.7799792455209016e-07,
"loss": 0.2126,
"step": 258
},
{
"epoch": 1.6582633053221287,
"grad_norm": 3.2103383308153925,
"learning_rate": 1.716601970199836e-07,
"loss": 0.1961,
"step": 259
},
{
"epoch": 1.6646658663465386,
"grad_norm": 3.235948737487849,
"learning_rate": 1.6542674627869734e-07,
"loss": 0.2057,
"step": 260
},
{
"epoch": 1.6710684273709484,
"grad_norm": 3.103836779762489,
"learning_rate": 1.592983570354699e-07,
"loss": 0.2186,
"step": 261
},
{
"epoch": 1.677470988395358,
"grad_norm": 3.014947796395499,
"learning_rate": 1.5327580077171588e-07,
"loss": 0.1912,
"step": 262
},
{
"epoch": 1.6838735494197679,
"grad_norm": 3.2700341051751245,
"learning_rate": 1.473598356459078e-07,
"loss": 0.2456,
"step": 263
},
{
"epoch": 1.6902761104441777,
"grad_norm": 3.2091848048221308,
"learning_rate": 1.415512063981339e-07,
"loss": 0.2099,
"step": 264
},
{
"epoch": 1.6966786714685873,
"grad_norm": 2.750474336934699,
"learning_rate": 1.358506442563454e-07,
"loss": 0.2159,
"step": 265
},
{
"epoch": 1.7030812324929971,
"grad_norm": 4.614673424699257,
"learning_rate": 1.3025886684430465e-07,
"loss": 0.2067,
"step": 266
},
{
"epoch": 1.709483793517407,
"grad_norm": 2.9058686083863514,
"learning_rate": 1.2477657809124632e-07,
"loss": 0.1992,
"step": 267
},
{
"epoch": 1.7158863545418166,
"grad_norm": 2.862146073630623,
"learning_rate": 1.19404468143262e-07,
"loss": 0.2142,
"step": 268
},
{
"epoch": 1.7222889155662267,
"grad_norm": 3.287005616915961,
"learning_rate": 1.1414321327642019e-07,
"loss": 0.2051,
"step": 269
},
{
"epoch": 1.7286914765906363,
"grad_norm": 2.9357679152099383,
"learning_rate": 1.089934758116322e-07,
"loss": 0.2113,
"step": 270
},
{
"epoch": 1.7350940376150459,
"grad_norm": 2.9564341258962683,
"learning_rate": 1.0395590403127486e-07,
"loss": 0.1711,
"step": 271
},
{
"epoch": 1.741496598639456,
"grad_norm": 3.1982558699085484,
"learning_rate": 9.903113209758096e-08,
"loss": 0.2086,
"step": 272
},
{
"epoch": 1.7478991596638656,
"grad_norm": 2.9531309947941566,
"learning_rate": 9.421977997280594e-08,
"loss": 0.1959,
"step": 273
},
{
"epoch": 1.7543017206882752,
"grad_norm": 3.070946141827486,
"learning_rate": 8.952245334118413e-08,
"loss": 0.1975,
"step": 274
},
{
"epoch": 1.7607042817126852,
"grad_norm": 2.758150815701359,
"learning_rate": 8.493974353268019e-08,
"loss": 0.1922,
"step": 275
},
{
"epoch": 1.7671068427370948,
"grad_norm": 2.9185225875350858,
"learning_rate": 8.047222744854942e-08,
"loss": 0.1825,
"step": 276
},
{
"epoch": 1.7735094037615045,
"grad_norm": 3.110148299633103,
"learning_rate": 7.612046748871326e-08,
"loss": 0.2096,
"step": 277
},
{
"epoch": 1.7799119647859145,
"grad_norm": 3.1130566100683246,
"learning_rate": 7.188501148096116e-08,
"loss": 0.1733,
"step": 278
},
{
"epoch": 1.7863145258103241,
"grad_norm": 3.1415180042361444,
"learning_rate": 6.77663926119858e-08,
"loss": 0.2043,
"step": 279
},
{
"epoch": 1.7927170868347337,
"grad_norm": 3.3828521625639554,
"learning_rate": 6.376512936026279e-08,
"loss": 0.2248,
"step": 280
},
{
"epoch": 1.7991196478591438,
"grad_norm": 3.0831187019400916,
"learning_rate": 5.988172543078096e-08,
"loss": 0.2238,
"step": 281
},
{
"epoch": 1.8055222088835534,
"grad_norm": 3.583225680392213,
"learning_rate": 5.611666969163242e-08,
"loss": 0.2025,
"step": 282
},
{
"epoch": 1.811924769907963,
"grad_norm": 2.7749576641359055,
"learning_rate": 5.2470436112471264e-08,
"loss": 0.1918,
"step": 283
},
{
"epoch": 1.818327330932373,
"grad_norm": 3.491814950661689,
"learning_rate": 4.8943483704846465e-08,
"loss": 0.2017,
"step": 284
},
{
"epoch": 1.8247298919567827,
"grad_norm": 3.3544823516715656,
"learning_rate": 4.553625646441928e-08,
"loss": 0.2182,
"step": 285
},
{
"epoch": 1.8311324529811923,
"grad_norm": 3.03556242043361,
"learning_rate": 4.224918331506955e-08,
"loss": 0.2172,
"step": 286
},
{
"epoch": 1.8375350140056024,
"grad_norm": 3.1512946254390815,
"learning_rate": 3.908267805490051e-08,
"loss": 0.2375,
"step": 287
},
{
"epoch": 1.843937575030012,
"grad_norm": 3.1583349292530687,
"learning_rate": 3.6037139304146756e-08,
"loss": 0.2164,
"step": 288
},
{
"epoch": 1.8503401360544216,
"grad_norm": 2.6885025755750007,
"learning_rate": 3.3112950454993625e-08,
"loss": 0.2124,
"step": 289
},
{
"epoch": 1.8567426970788317,
"grad_norm": 2.899693820698532,
"learning_rate": 3.0310479623313125e-08,
"loss": 0.2094,
"step": 290
},
{
"epoch": 1.8631452581032413,
"grad_norm": 2.9718624867551746,
"learning_rate": 2.7630079602323443e-08,
"loss": 0.1863,
"step": 291
},
{
"epoch": 1.8695478191276511,
"grad_norm": 3.1641777461916396,
"learning_rate": 2.507208781817638e-08,
"loss": 0.253,
"step": 292
},
{
"epoch": 1.875950380152061,
"grad_norm": 2.979215203416192,
"learning_rate": 2.263682628748087e-08,
"loss": 0.218,
"step": 293
},
{
"epoch": 1.8823529411764706,
"grad_norm": 2.935051364990218,
"learning_rate": 2.032460157676452e-08,
"loss": 0.1981,
"step": 294
},
{
"epoch": 1.8887555022008804,
"grad_norm": 2.818283326905153,
"learning_rate": 1.8135704763881598e-08,
"loss": 0.2028,
"step": 295
},
{
"epoch": 1.8951580632252902,
"grad_norm": 3.2868954384023823,
"learning_rate": 1.607041140137033e-08,
"loss": 0.2051,
"step": 296
},
{
"epoch": 1.9015606242496998,
"grad_norm": 2.948811609921979,
"learning_rate": 1.4128981481764113e-08,
"loss": 0.2096,
"step": 297
},
{
"epoch": 1.9079631852741097,
"grad_norm": 2.6648158990098314,
"learning_rate": 1.231165940486234e-08,
"loss": 0.1701,
"step": 298
},
{
"epoch": 1.9143657462985195,
"grad_norm": 2.828105773518058,
"learning_rate": 1.0618673946963364e-08,
"loss": 0.2012,
"step": 299
},
{
"epoch": 1.9207683073229291,
"grad_norm": 3.20360356272725,
"learning_rate": 9.050238232065299e-09,
"loss": 0.1798,
"step": 300
},
{
"epoch": 1.927170868347339,
"grad_norm": 3.1211562291161457,
"learning_rate": 7.606549705035935e-09,
"loss": 0.2063,
"step": 301
},
{
"epoch": 1.9335734293717488,
"grad_norm": 3.514934313160573,
"learning_rate": 6.2877901067573955e-09,
"loss": 0.1871,
"step": 302
},
{
"epoch": 1.9399759903961584,
"grad_norm": 3.080937668829297,
"learning_rate": 5.094125451247655e-09,
"loss": 0.2097,
"step": 303
},
{
"epoch": 1.9463785514205683,
"grad_norm": 3.035817857990733,
"learning_rate": 4.025706004760931e-09,
"loss": 0.2057,
"step": 304
},
{
"epoch": 1.952781112444978,
"grad_norm": 3.3455920299884734,
"learning_rate": 3.082666266872036e-09,
"loss": 0.2198,
"step": 305
},
{
"epoch": 1.9591836734693877,
"grad_norm": 3.2826612214440845,
"learning_rate": 2.2651249535439177e-09,
"loss": 0.2028,
"step": 306
},
{
"epoch": 1.9655862344937975,
"grad_norm": 3.3305034767663546,
"learning_rate": 1.5731849821833953e-09,
"loss": 0.2134,
"step": 307
},
{
"epoch": 1.9719887955182074,
"grad_norm": 3.5471720967981573,
"learning_rate": 1.0069334586854105e-09,
"loss": 0.1908,
"step": 308
},
{
"epoch": 1.978391356542617,
"grad_norm": 3.0004004418384542,
"learning_rate": 5.664416664666882e-10,
"loss": 0.2138,
"step": 309
},
{
"epoch": 1.9847939175670268,
"grad_norm": 2.8720482666550136,
"learning_rate": 2.517650574934693e-10,
"loss": 0.2277,
"step": 310
},
{
"epoch": 1.9911964785914367,
"grad_norm": 2.7009139828058832,
"learning_rate": 6.29432452994294e-11,
"loss": 0.215,
"step": 311
},
{
"epoch": 1.9975990396158463,
"grad_norm": 3.2699084579180306,
"learning_rate": 0.0,
"loss": 0.2042,
"step": 312
},
{
"epoch": 1.9975990396158463,
"step": 312,
"total_flos": 212071482654720.0,
"train_loss": 0.2767890989780426,
"train_runtime": 7611.443,
"train_samples_per_second": 10.506,
"train_steps_per_second": 0.041
}
],
"logging_steps": 1,
"max_steps": 312,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 212071482654720.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}