{
"best_global_step": 415,
"best_metric": 0.2885131265635832,
"best_model_checkpoint": "trocr\\checkpoint-415",
"epoch": 0.603112840466926,
"eval_steps": 5,
"global_step": 465,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0025940337224383916,
"grad_norm": null,
"learning_rate": 0.0,
"loss": 8.3361,
"step": 2
},
{
"epoch": 0.005188067444876783,
"grad_norm": 42.44731521606445,
"learning_rate": 0.0,
"loss": 9.0712,
"step": 4
},
{
"epoch": 0.00648508430609598,
"eval_cer": 0.9740181451005214,
"eval_loss": 10.673304557800293,
"eval_runtime": 121.4924,
"eval_samples_per_second": 21.763,
"eval_steps_per_second": 2.724,
"step": 5
},
{
"epoch": 0.007782101167315175,
"grad_norm": 50.13338088989258,
"learning_rate": 1.2000000000000002e-07,
"loss": 9.0523,
"step": 6
},
{
"epoch": 0.010376134889753566,
"grad_norm": 54.91879653930664,
"learning_rate": 2.4000000000000003e-07,
"loss": 8.6874,
"step": 8
},
{
"epoch": 0.01297016861219196,
"grad_norm": 57.007408142089844,
"learning_rate": 3.6e-07,
"loss": 9.1085,
"step": 10
},
{
"epoch": 0.01297016861219196,
"eval_cer": 0.9735660246556349,
"eval_loss": 10.574353218078613,
"eval_runtime": 92.0698,
"eval_samples_per_second": 28.717,
"eval_steps_per_second": 3.595,
"step": 10
},
{
"epoch": 0.01556420233463035,
"grad_norm": 65.91108703613281,
"learning_rate": 4.800000000000001e-07,
"loss": 8.1886,
"step": 12
},
{
"epoch": 0.018158236057068743,
"grad_norm": 34.232662200927734,
"learning_rate": 6.000000000000001e-07,
"loss": 8.3056,
"step": 14
},
{
"epoch": 0.019455252918287938,
"eval_cer": 0.9708231606233234,
"eval_loss": 10.372063636779785,
"eval_runtime": 93.5905,
"eval_samples_per_second": 28.251,
"eval_steps_per_second": 3.537,
"step": 15
},
{
"epoch": 0.020752269779507133,
"grad_norm": 42.87216567993164,
"learning_rate": 7.2e-07,
"loss": 8.7293,
"step": 16
},
{
"epoch": 0.023346303501945526,
"grad_norm": 39.21005630493164,
"learning_rate": 8.4e-07,
"loss": 8.6748,
"step": 18
},
{
"epoch": 0.02594033722438392,
"grad_norm": 94.66182708740234,
"learning_rate": 9.600000000000001e-07,
"loss": 8.2991,
"step": 20
},
{
"epoch": 0.02594033722438392,
"eval_cer": 0.9659101184555565,
"eval_loss": 10.076202392578125,
"eval_runtime": 88.5272,
"eval_samples_per_second": 29.867,
"eval_steps_per_second": 3.739,
"step": 20
},
{
"epoch": 0.028534370946822308,
"grad_norm": 34.152923583984375,
"learning_rate": 1.08e-06,
"loss": 8.3962,
"step": 22
},
{
"epoch": 0.0311284046692607,
"grad_norm": null,
"learning_rate": 1.2000000000000002e-06,
"loss": 8.2708,
"step": 24
},
{
"epoch": 0.0324254215304799,
"eval_cer": 0.9612683485547217,
"eval_loss": 9.797987937927246,
"eval_runtime": 96.7288,
"eval_samples_per_second": 27.334,
"eval_steps_per_second": 3.422,
"step": 25
},
{
"epoch": 0.03372243839169909,
"grad_norm": 45.9756965637207,
"learning_rate": 1.26e-06,
"loss": 7.8454,
"step": 26
},
{
"epoch": 0.03631647211413749,
"grad_norm": 32.557674407958984,
"learning_rate": 1.38e-06,
"loss": 8.2102,
"step": 28
},
{
"epoch": 0.038910505836575876,
"grad_norm": 32.354164123535156,
"learning_rate": 1.5e-06,
"loss": 8.6184,
"step": 30
},
{
"epoch": 0.038910505836575876,
"eval_cer": 0.9502366096994906,
"eval_loss": 9.3967866897583,
"eval_runtime": 98.634,
"eval_samples_per_second": 26.806,
"eval_steps_per_second": 3.356,
"step": 30
},
{
"epoch": 0.041504539559014265,
"grad_norm": 33.6341667175293,
"learning_rate": 1.62e-06,
"loss": 8.1516,
"step": 32
},
{
"epoch": 0.04409857328145266,
"grad_norm": 39.92112350463867,
"learning_rate": 1.74e-06,
"loss": 8.0678,
"step": 34
},
{
"epoch": 0.04539559014267185,
"eval_cer": 0.9342315459505078,
"eval_loss": 9.007755279541016,
"eval_runtime": 81.7686,
"eval_samples_per_second": 32.335,
"eval_steps_per_second": 4.048,
"step": 35
},
{
"epoch": 0.04669260700389105,
"grad_norm": 29.05873680114746,
"learning_rate": 1.86e-06,
"loss": 7.6011,
"step": 36
},
{
"epoch": 0.04928664072632944,
"grad_norm": 25.371105194091797,
"learning_rate": 1.98e-06,
"loss": 7.2863,
"step": 38
},
{
"epoch": 0.05188067444876784,
"grad_norm": 26.220144271850586,
"learning_rate": 2.1000000000000002e-06,
"loss": 7.699,
"step": 40
},
{
"epoch": 0.05188067444876784,
"eval_cer": 0.9170509690448202,
"eval_loss": 8.67233657836914,
"eval_runtime": 84.2938,
"eval_samples_per_second": 31.366,
"eval_steps_per_second": 3.927,
"step": 40
},
{
"epoch": 0.054474708171206226,
"grad_norm": 29.047372817993164,
"learning_rate": 2.22e-06,
"loss": 7.8821,
"step": 42
},
{
"epoch": 0.057068741893644616,
"grad_norm": 27.84252166748047,
"learning_rate": 2.34e-06,
"loss": 7.091,
"step": 44
},
{
"epoch": 0.058365758754863814,
"eval_cer": 0.9047231515809145,
"eval_loss": 8.38408088684082,
"eval_runtime": 92.3132,
"eval_samples_per_second": 28.642,
"eval_steps_per_second": 3.586,
"step": 45
},
{
"epoch": 0.05966277561608301,
"grad_norm": 22.684850692749023,
"learning_rate": 2.46e-06,
"loss": 7.4182,
"step": 46
},
{
"epoch": 0.0622568093385214,
"grad_norm": 33.87125778198242,
"learning_rate": 2.58e-06,
"loss": 7.2075,
"step": 48
},
{
"epoch": 0.0648508430609598,
"grad_norm": 21.509159088134766,
"learning_rate": 2.7e-06,
"loss": 7.1052,
"step": 50
},
{
"epoch": 0.0648508430609598,
"eval_cer": 0.899629261235193,
"eval_loss": 8.13282585144043,
"eval_runtime": 101.5177,
"eval_samples_per_second": 26.045,
"eval_steps_per_second": 3.261,
"step": 50
},
{
"epoch": 0.06744487678339818,
"grad_norm": null,
"learning_rate": 2.82e-06,
"loss": 6.7926,
"step": 52
},
{
"epoch": 0.07003891050583658,
"grad_norm": 21.155956268310547,
"learning_rate": 2.88e-06,
"loss": 6.8734,
"step": 54
},
{
"epoch": 0.07133592736705577,
"eval_cer": 0.8984236067154957,
"eval_loss": 7.93735408782959,
"eval_runtime": 106.5732,
"eval_samples_per_second": 24.809,
"eval_steps_per_second": 3.106,
"step": 55
},
{
"epoch": 0.07263294422827497,
"grad_norm": 19.74744415283203,
"learning_rate": 3e-06,
"loss": 6.8722,
"step": 56
},
{
"epoch": 0.07522697795071336,
"grad_norm": 319.0416564941406,
"learning_rate": 3.1199999999999998e-06,
"loss": 6.4253,
"step": 58
},
{
"epoch": 0.07782101167315175,
"grad_norm": 23.68337631225586,
"learning_rate": 3.24e-06,
"loss": 6.8842,
"step": 60
},
{
"epoch": 0.07782101167315175,
"eval_cer": 0.8882056846610604,
"eval_loss": 7.703160285949707,
"eval_runtime": 110.1461,
"eval_samples_per_second": 24.004,
"eval_steps_per_second": 3.005,
"step": 60
},
{
"epoch": 0.08041504539559015,
"grad_norm": 20.635147094726562,
"learning_rate": 3.36e-06,
"loss": 6.2162,
"step": 62
},
{
"epoch": 0.08300907911802853,
"grad_norm": 19.55179786682129,
"learning_rate": 3.48e-06,
"loss": 6.7139,
"step": 64
},
{
"epoch": 0.08430609597924774,
"eval_cer": 0.8779576212436326,
"eval_loss": 7.470834255218506,
"eval_runtime": 111.6785,
"eval_samples_per_second": 23.675,
"eval_steps_per_second": 2.964,
"step": 65
},
{
"epoch": 0.08560311284046693,
"grad_norm": 17.086580276489258,
"learning_rate": 3.6e-06,
"loss": 6.6201,
"step": 66
},
{
"epoch": 0.08819714656290532,
"grad_norm": 15.556456565856934,
"learning_rate": 3.72e-06,
"loss": 6.093,
"step": 68
},
{
"epoch": 0.0907911802853437,
"grad_norm": 49.20164489746094,
"learning_rate": 3.8400000000000005e-06,
"loss": 5.9421,
"step": 70
},
{
"epoch": 0.0907911802853437,
"eval_cer": 0.8584561593875275,
"eval_loss": 7.236043930053711,
"eval_runtime": 126.8066,
"eval_samples_per_second": 20.851,
"eval_steps_per_second": 2.61,
"step": 70
},
{
"epoch": 0.0933852140077821,
"grad_norm": 17.137556076049805,
"learning_rate": 3.96e-06,
"loss": 6.7995,
"step": 72
},
{
"epoch": 0.0959792477302205,
"grad_norm": 24.586021423339844,
"learning_rate": 4.080000000000001e-06,
"loss": 6.043,
"step": 74
},
{
"epoch": 0.09727626459143969,
"eval_cer": 0.8255719323627815,
"eval_loss": 7.014294624328613,
"eval_runtime": 128.9469,
"eval_samples_per_second": 20.505,
"eval_steps_per_second": 2.567,
"step": 75
},
{
"epoch": 0.09857328145265888,
"grad_norm": 17.460948944091797,
"learning_rate": 4.2000000000000004e-06,
"loss": 6.0706,
"step": 76
},
{
"epoch": 0.10116731517509728,
"grad_norm": 26.46303939819336,
"learning_rate": 4.32e-06,
"loss": 6.7666,
"step": 78
},
{
"epoch": 0.10376134889753567,
"grad_norm": 32.067054748535156,
"learning_rate": 4.44e-06,
"loss": 6.0295,
"step": 80
},
{
"epoch": 0.10376134889753567,
"eval_cer": 0.782530066009585,
"eval_loss": 6.801568984985352,
"eval_runtime": 114.6152,
"eval_samples_per_second": 23.069,
"eval_steps_per_second": 2.888,
"step": 80
},
{
"epoch": 0.10635538261997406,
"grad_norm": 14.94288158416748,
"learning_rate": 4.56e-06,
"loss": 5.7654,
"step": 82
},
{
"epoch": 0.10894941634241245,
"grad_norm": 21.89992904663086,
"learning_rate": 4.68e-06,
"loss": 5.7672,
"step": 84
},
{
"epoch": 0.11024643320363164,
"eval_cer": 0.7390059378485095,
"eval_loss": 6.629735469818115,
"eval_runtime": 116.8302,
"eval_samples_per_second": 22.631,
"eval_steps_per_second": 2.833,
"step": 85
},
{
"epoch": 0.11154345006485085,
"grad_norm": 23.829565048217773,
"learning_rate": 4.800000000000001e-06,
"loss": 5.4628,
"step": 86
},
{
"epoch": 0.11413748378728923,
"grad_norm": 18.05320930480957,
"learning_rate": 4.92e-06,
"loss": 5.1534,
"step": 88
},
{
"epoch": 0.11673151750972763,
"grad_norm": 25.233701705932617,
"learning_rate": 5.04e-06,
"loss": 5.3042,
"step": 90
},
{
"epoch": 0.11673151750972763,
"eval_cer": 0.7084124544111885,
"eval_loss": 6.476639747619629,
"eval_runtime": 118.7446,
"eval_samples_per_second": 22.266,
"eval_steps_per_second": 2.787,
"step": 90
},
{
"epoch": 0.11932555123216602,
"grad_norm": 35.21253967285156,
"learning_rate": 5.16e-06,
"loss": 5.4296,
"step": 92
},
{
"epoch": 0.1219195849546044,
"grad_norm": 17.080265045166016,
"learning_rate": 5.279999999999999e-06,
"loss": 5.3031,
"step": 94
},
{
"epoch": 0.12321660181582361,
"eval_cer": 0.6901467884377731,
"eval_loss": 6.346051216125488,
"eval_runtime": 119.9281,
"eval_samples_per_second": 22.047,
"eval_steps_per_second": 2.76,
"step": 95
},
{
"epoch": 0.1245136186770428,
"grad_norm": 14.830784797668457,
"learning_rate": 5.4e-06,
"loss": 5.4933,
"step": 96
},
{
"epoch": 0.12710765239948119,
"grad_norm": 14.9316987991333,
"learning_rate": 5.52e-06,
"loss": 5.3361,
"step": 98
},
{
"epoch": 0.1297016861219196,
"grad_norm": 42.79384994506836,
"learning_rate": 5.64e-06,
"loss": 4.6843,
"step": 100
},
{
"epoch": 0.1297016861219196,
"eval_cer": 0.6729059288061006,
"eval_loss": 6.210666179656982,
"eval_runtime": 120.5118,
"eval_samples_per_second": 21.94,
"eval_steps_per_second": 2.747,
"step": 100
},
{
"epoch": 0.13229571984435798,
"grad_norm": 69.97132110595703,
"learning_rate": 5.76e-06,
"loss": 5.278,
"step": 102
},
{
"epoch": 0.13488975356679636,
"grad_norm": 68.64286041259766,
"learning_rate": 5.8800000000000005e-06,
"loss": 5.3985,
"step": 104
},
{
"epoch": 0.13618677042801555,
"eval_cer": 0.6567501582421557,
"eval_loss": 6.105401039123535,
"eval_runtime": 120.3425,
"eval_samples_per_second": 21.971,
"eval_steps_per_second": 2.75,
"step": 105
},
{
"epoch": 0.13748378728923477,
"grad_norm": 16.881147384643555,
"learning_rate": 6e-06,
"loss": 5.5177,
"step": 106
},
{
"epoch": 0.14007782101167315,
"grad_norm": 13.392465591430664,
"learning_rate": 6.12e-06,
"loss": 5.6112,
"step": 108
},
{
"epoch": 0.14267185473411154,
"grad_norm": 12.735469818115234,
"learning_rate": 6.2399999999999995e-06,
"loss": 4.3825,
"step": 110
},
{
"epoch": 0.14267185473411154,
"eval_cer": 0.6333303191970341,
"eval_loss": 5.990973472595215,
"eval_runtime": 122.2904,
"eval_samples_per_second": 21.621,
"eval_steps_per_second": 2.707,
"step": 110
},
{
"epoch": 0.14526588845654995,
"grad_norm": 13.771244049072266,
"learning_rate": 6.36e-06,
"loss": 4.9807,
"step": 112
},
{
"epoch": 0.14785992217898833,
"grad_norm": 16.60836410522461,
"learning_rate": 6.48e-06,
"loss": 5.0463,
"step": 114
},
{
"epoch": 0.14915693904020752,
"eval_cer": 0.6150345118606263,
"eval_loss": 5.89418363571167,
"eval_runtime": 115.3341,
"eval_samples_per_second": 22.925,
"eval_steps_per_second": 2.87,
"step": 115
},
{
"epoch": 0.1504539559014267,
"grad_norm": 11.2271146774292,
"learning_rate": 6.6e-06,
"loss": 5.1669,
"step": 116
},
{
"epoch": 0.15304798962386512,
"grad_norm": 14.511568069458008,
"learning_rate": 6.72e-06,
"loss": 4.883,
"step": 118
},
{
"epoch": 0.1556420233463035,
"grad_norm": 12.640761375427246,
"learning_rate": 6.840000000000001e-06,
"loss": 5.3346,
"step": 120
},
{
"epoch": 0.1556420233463035,
"eval_cer": 0.6130451819031256,
"eval_loss": 5.80028772354126,
"eval_runtime": 113.846,
"eval_samples_per_second": 23.224,
"eval_steps_per_second": 2.907,
"step": 120
},
{
"epoch": 0.1582360570687419,
"grad_norm": 10.243730545043945,
"learning_rate": 6.96e-06,
"loss": 4.669,
"step": 122
},
{
"epoch": 0.1608300907911803,
"grad_norm": 19.194486618041992,
"learning_rate": 7.08e-06,
"loss": 5.2001,
"step": 124
},
{
"epoch": 0.1621271076523995,
"eval_cer": 0.6162703077433161,
"eval_loss": 5.716529369354248,
"eval_runtime": 113.3314,
"eval_samples_per_second": 23.33,
"eval_steps_per_second": 2.921,
"step": 125
},
{
"epoch": 0.16342412451361868,
"grad_norm": 25.41460418701172,
"learning_rate": 7.2e-06,
"loss": 4.8738,
"step": 126
},
{
"epoch": 0.16601815823605706,
"grad_norm": 11.016427040100098,
"learning_rate": 7.32e-06,
"loss": 5.5428,
"step": 128
},
{
"epoch": 0.16861219195849547,
"grad_norm": 19.337942123413086,
"learning_rate": 7.44e-06,
"loss": 5.097,
"step": 130
},
{
"epoch": 0.16861219195849547,
"eval_cer": 0.6253127166410465,
"eval_loss": 5.637584686279297,
"eval_runtime": 111.6419,
"eval_samples_per_second": 23.683,
"eval_steps_per_second": 2.965,
"step": 130
},
{
"epoch": 0.17120622568093385,
"grad_norm": 14.668910026550293,
"learning_rate": 7.5600000000000005e-06,
"loss": 5.1702,
"step": 132
},
{
"epoch": 0.17380025940337224,
"grad_norm": 14.700507164001465,
"learning_rate": 7.680000000000001e-06,
"loss": 5.1842,
"step": 134
},
{
"epoch": 0.17509727626459143,
"eval_cer": 0.6279952979473732,
"eval_loss": 5.5578293800354,
"eval_runtime": 115.7363,
"eval_samples_per_second": 22.845,
"eval_steps_per_second": 2.86,
"step": 135
},
{
"epoch": 0.17639429312581065,
"grad_norm": 11.149736404418945,
"learning_rate": 7.8e-06,
"loss": 5.1527,
"step": 136
},
{
"epoch": 0.17898832684824903,
"grad_norm": 12.17773723602295,
"learning_rate": 7.92e-06,
"loss": 5.3414,
"step": 138
},
{
"epoch": 0.1815823605706874,
"grad_norm": 15.777327537536621,
"learning_rate": 8.040000000000001e-06,
"loss": 5.1606,
"step": 140
},
{
"epoch": 0.1815823605706874,
"eval_cer": 0.611960092835398,
"eval_loss": 5.470022201538086,
"eval_runtime": 115.9413,
"eval_samples_per_second": 22.805,
"eval_steps_per_second": 2.855,
"step": 140
},
{
"epoch": 0.18417639429312582,
"grad_norm": 22.217771530151367,
"learning_rate": 8.160000000000001e-06,
"loss": 5.0028,
"step": 142
},
{
"epoch": 0.1867704280155642,
"grad_norm": 13.488722801208496,
"learning_rate": 8.28e-06,
"loss": 4.7495,
"step": 144
},
{
"epoch": 0.1880674448767834,
"eval_cer": 0.6009886367061519,
"eval_loss": 5.381906509399414,
"eval_runtime": 109.493,
"eval_samples_per_second": 24.148,
"eval_steps_per_second": 3.023,
"step": 145
},
{
"epoch": 0.1893644617380026,
"grad_norm": 12.898096084594727,
"learning_rate": 8.400000000000001e-06,
"loss": 4.847,
"step": 146
},
{
"epoch": 0.191958495460441,
"grad_norm": 13.653580665588379,
"learning_rate": 8.52e-06,
"loss": 5.2004,
"step": 148
},
{
"epoch": 0.19455252918287938,
"grad_norm": 10.915148735046387,
"learning_rate": 8.64e-06,
"loss": 4.9847,
"step": 150
},
{
"epoch": 0.19455252918287938,
"eval_cer": 0.594960364107665,
"eval_loss": 5.297786712646484,
"eval_runtime": 108.0527,
"eval_samples_per_second": 24.47,
"eval_steps_per_second": 3.063,
"step": 150
},
{
"epoch": 0.19714656290531776,
"grad_norm": 23.160659790039062,
"learning_rate": 8.759999999999999e-06,
"loss": 4.7818,
"step": 152
},
{
"epoch": 0.19974059662775617,
"grad_norm": 14.993002891540527,
"learning_rate": 8.88e-06,
"loss": 4.7727,
"step": 154
},
{
"epoch": 0.20103761348897536,
"eval_cer": 0.5589715766946981,
"eval_loss": 5.1952223777771,
"eval_runtime": 111.9206,
"eval_samples_per_second": 23.624,
"eval_steps_per_second": 2.957,
"step": 155
},
{
"epoch": 0.20233463035019456,
"grad_norm": 13.507874488830566,
"learning_rate": 9e-06,
"loss": 4.4196,
"step": 156
},
{
"epoch": 0.20492866407263294,
"grad_norm": 12.171915054321289,
"learning_rate": 9.12e-06,
"loss": 4.088,
"step": 158
},
{
"epoch": 0.20752269779507135,
"grad_norm": 10.198915481567383,
"learning_rate": 9.24e-06,
"loss": 4.0656,
"step": 160
},
{
"epoch": 0.20752269779507135,
"eval_cer": 0.5371492298881756,
"eval_loss": 5.124251365661621,
"eval_runtime": 115.3755,
"eval_samples_per_second": 22.916,
"eval_steps_per_second": 2.869,
"step": 160
},
{
"epoch": 0.21011673151750973,
"grad_norm": 10.800135612487793,
"learning_rate": 9.36e-06,
"loss": 4.2534,
"step": 162
},
{
"epoch": 0.2127107652399481,
"grad_norm": 28.48563575744629,
"learning_rate": 9.48e-06,
"loss": 4.0234,
"step": 164
},
{
"epoch": 0.2140077821011673,
"eval_cer": 0.530246857762908,
"eval_loss": 5.063826084136963,
"eval_runtime": 114.849,
"eval_samples_per_second": 23.022,
"eval_steps_per_second": 2.882,
"step": 165
},
{
"epoch": 0.21530479896238652,
"grad_norm": 20.067386627197266,
"learning_rate": 9.600000000000001e-06,
"loss": 4.3631,
"step": 166
},
{
"epoch": 0.2178988326848249,
"grad_norm": 16.29481315612793,
"learning_rate": 9.72e-06,
"loss": 4.4636,
"step": 168
},
{
"epoch": 0.2204928664072633,
"grad_norm": 101.64680480957031,
"learning_rate": 9.84e-06,
"loss": 5.033,
"step": 170
},
{
"epoch": 0.2204928664072633,
"eval_cer": 0.5299755854959761,
"eval_loss": 4.967648029327393,
"eval_runtime": 111.3501,
"eval_samples_per_second": 23.745,
"eval_steps_per_second": 2.973,
"step": 170
},
{
"epoch": 0.2230869001297017,
"grad_norm": 22.31951904296875,
"learning_rate": 9.960000000000001e-06,
"loss": 4.0891,
"step": 172
},
{
"epoch": 0.22568093385214008,
"grad_norm": 56.18452072143555,
"learning_rate": 1.008e-05,
"loss": 4.284,
"step": 174
},
{
"epoch": 0.22697795071335927,
"eval_cer": 0.520179642523435,
"eval_loss": 4.895947456359863,
"eval_runtime": 112.6531,
"eval_samples_per_second": 23.47,
"eval_steps_per_second": 2.938,
"step": 175
},
{
"epoch": 0.22827496757457846,
"grad_norm": 13.42766284942627,
"learning_rate": 1.02e-05,
"loss": 4.4092,
"step": 176
},
{
"epoch": 0.23086900129701687,
"grad_norm": 21.44829559326172,
"learning_rate": 1.032e-05,
"loss": 4.1597,
"step": 178
},
{
"epoch": 0.23346303501945526,
"grad_norm": 14.215667724609375,
"learning_rate": 1.044e-05,
"loss": 3.8574,
"step": 180
},
{
"epoch": 0.23346303501945526,
"eval_cer": 0.5017632697350575,
"eval_loss": 4.84341287612915,
"eval_runtime": 115.4414,
"eval_samples_per_second": 22.903,
"eval_steps_per_second": 2.867,
"step": 180
},
{
"epoch": 0.23605706874189364,
"grad_norm": 13.1805419921875,
"learning_rate": 1.0559999999999999e-05,
"loss": 4.0776,
"step": 182
},
{
"epoch": 0.23865110246433205,
"grad_norm": 98.30623626708984,
"learning_rate": 1.068e-05,
"loss": 4.2507,
"step": 184
},
{
"epoch": 0.23994811932555124,
"eval_cer": 0.48886276637429543,
"eval_loss": 4.78084135055542,
"eval_runtime": 123.8484,
"eval_samples_per_second": 21.349,
"eval_steps_per_second": 2.673,
"step": 185
},
{
"epoch": 0.24124513618677043,
"grad_norm": 35.59674072265625,
"learning_rate": 1.08e-05,
"loss": 4.1455,
"step": 186
},
{
"epoch": 0.2438391699092088,
"grad_norm": 21.6286563873291,
"learning_rate": 1.092e-05,
"loss": 5.0675,
"step": 188
},
{
"epoch": 0.24643320363164722,
"grad_norm": 17.0408878326416,
"learning_rate": 1.104e-05,
"loss": 3.953,
"step": 190
},
{
"epoch": 0.24643320363164722,
"eval_cer": 0.47979021611357264,
"eval_loss": 4.711887836456299,
"eval_runtime": 135.169,
"eval_samples_per_second": 19.561,
"eval_steps_per_second": 2.449,
"step": 190
},
{
"epoch": 0.2490272373540856,
"grad_norm": 24.359798431396484,
"learning_rate": 1.116e-05,
"loss": 4.4154,
"step": 192
},
{
"epoch": 0.251621271076524,
"grad_norm": 13.193626403808594,
"learning_rate": 1.128e-05,
"loss": 4.3269,
"step": 194
},
{
"epoch": 0.2529182879377432,
"eval_cer": 0.4659251891370528,
"eval_loss": 4.614772319793701,
"eval_runtime": 132.7599,
"eval_samples_per_second": 19.916,
"eval_steps_per_second": 2.493,
"step": 195
},
{
"epoch": 0.25421530479896237,
"grad_norm": 14.497838973999023,
"learning_rate": 1.1400000000000001e-05,
"loss": 4.0266,
"step": 196
},
{
"epoch": 0.25680933852140075,
"grad_norm": 12.457406997680664,
"learning_rate": 1.152e-05,
"loss": 4.3069,
"step": 198
},
{
"epoch": 0.2594033722438392,
"grad_norm": 18.889881134033203,
"learning_rate": 1.164e-05,
"loss": 4.1068,
"step": 200
},
{
"epoch": 0.2594033722438392,
"eval_cer": 0.4684872049914097,
"eval_loss": 4.578884601593018,
"eval_runtime": 129.8465,
"eval_samples_per_second": 20.363,
"eval_steps_per_second": 2.549,
"step": 200
},
{
"epoch": 0.2619974059662776,
"grad_norm": 11.648727416992188,
"learning_rate": 1.1760000000000001e-05,
"loss": 3.7185,
"step": 202
},
{
"epoch": 0.26459143968871596,
"grad_norm": 13.08809757232666,
"learning_rate": 1.1880000000000001e-05,
"loss": 4.1442,
"step": 204
},
{
"epoch": 0.26588845654993515,
"eval_cer": 0.46004762335352806,
"eval_loss": 4.4956889152526855,
"eval_runtime": 114.7341,
"eval_samples_per_second": 23.045,
"eval_steps_per_second": 2.885,
"step": 205
},
{
"epoch": 0.26718547341115434,
"grad_norm": 12.64474105834961,
"learning_rate": 1.2e-05,
"loss": 3.7967,
"step": 206
},
{
"epoch": 0.2697795071335927,
"grad_norm": 12.794676780700684,
"learning_rate": 1.2120000000000001e-05,
"loss": 3.8475,
"step": 208
},
{
"epoch": 0.2723735408560311,
"grad_norm": 13.091010093688965,
"learning_rate": 1.224e-05,
"loss": 3.5213,
"step": 210
},
{
"epoch": 0.2723735408560311,
"eval_cer": 0.43729089429424,
"eval_loss": 4.389532089233398,
"eval_runtime": 111.3543,
"eval_samples_per_second": 23.744,
"eval_steps_per_second": 2.972,
"step": 210
},
{
"epoch": 0.27496757457846954,
"grad_norm": 13.716208457946777,
"learning_rate": 1.236e-05,
"loss": 4.0241,
"step": 212
},
{
"epoch": 0.2775616083009079,
"grad_norm": 14.271407127380371,
"learning_rate": 1.2479999999999999e-05,
"loss": 4.1152,
"step": 214
},
{
"epoch": 0.2788586251621271,
"eval_cer": 0.449709135847123,
"eval_loss": 4.357320308685303,
"eval_runtime": 113.2044,
"eval_samples_per_second": 23.356,
"eval_steps_per_second": 2.924,
"step": 215
},
{
"epoch": 0.2801556420233463,
"grad_norm": 12.267643928527832,
"learning_rate": 1.26e-05,
"loss": 3.2922,
"step": 216
},
{
"epoch": 0.2827496757457847,
"grad_norm": 13.182437896728516,
"learning_rate": 1.272e-05,
"loss": 3.8861,
"step": 218
},
{
"epoch": 0.2853437094682231,
"grad_norm": 12.376786231994629,
"learning_rate": 1.284e-05,
"loss": 3.5171,
"step": 220
},
{
"epoch": 0.2853437094682231,
"eval_cer": 0.45802815203303493,
"eval_loss": 4.331967830657959,
"eval_runtime": 114.1848,
"eval_samples_per_second": 23.155,
"eval_steps_per_second": 2.899,
"step": 220
},
{
"epoch": 0.28793774319066145,
"grad_norm": 11.612021446228027,
"learning_rate": 1.296e-05,
"loss": 4.1036,
"step": 222
},
{
"epoch": 0.2905317769130999,
"grad_norm": 12.48078727722168,
"learning_rate": 1.308e-05,
"loss": 3.1498,
"step": 224
},
{
"epoch": 0.2918287937743191,
"eval_cer": 0.4402146065045061,
"eval_loss": 4.2296319007873535,
"eval_runtime": 118.3956,
"eval_samples_per_second": 22.332,
"eval_steps_per_second": 2.796,
"step": 225
},
{
"epoch": 0.2931258106355383,
"grad_norm": 28.98529815673828,
"learning_rate": 1.32e-05,
"loss": 3.7617,
"step": 226
},
{
"epoch": 0.29571984435797666,
"grad_norm": 16.19705581665039,
"learning_rate": 1.3320000000000001e-05,
"loss": 4.0489,
"step": 228
},
{
"epoch": 0.29831387808041504,
"grad_norm": 12.226841926574707,
"learning_rate": 1.344e-05,
"loss": 3.6797,
"step": 230
},
{
"epoch": 0.29831387808041504,
"eval_cer": 0.43195587304457905,
"eval_loss": 4.158808708190918,
"eval_runtime": 112.6694,
"eval_samples_per_second": 23.467,
"eval_steps_per_second": 2.938,
"step": 230
},
{
"epoch": 0.3009079118028534,
"grad_norm": 20.419113159179688,
"learning_rate": 1.356e-05,
"loss": 3.6434,
"step": 232
},
{
"epoch": 0.3035019455252918,
"grad_norm": 18.819181442260742,
"learning_rate": 1.3680000000000001e-05,
"loss": 3.2154,
"step": 234
},
{
"epoch": 0.30479896238651105,
"eval_cer": 0.4050697772553275,
"eval_loss": 4.025953769683838,
"eval_runtime": 113.1476,
"eval_samples_per_second": 23.368,
"eval_steps_per_second": 2.925,
"step": 235
},
{
"epoch": 0.30609597924773024,
"grad_norm": 12.357544898986816,
"learning_rate": 1.3800000000000002e-05,
"loss": 3.9309,
"step": 236
},
{
"epoch": 0.3086900129701686,
"grad_norm": 14.584222793579102,
"learning_rate": 1.392e-05,
"loss": 3.5647,
"step": 238
},
{
"epoch": 0.311284046692607,
"grad_norm": 11.948848724365234,
"learning_rate": 1.4040000000000001e-05,
"loss": 3.439,
"step": 240
},
{
"epoch": 0.311284046692607,
"eval_cer": 0.3937064834071797,
"eval_loss": 3.905667304992676,
"eval_runtime": 113.4696,
"eval_samples_per_second": 23.301,
"eval_steps_per_second": 2.917,
"step": 240
},
{
"epoch": 0.3138780804150454,
"grad_norm": 13.00888729095459,
"learning_rate": 1.416e-05,
"loss": 3.2106,
"step": 242
},
{
"epoch": 0.3164721141374838,
"grad_norm": 12.678916931152344,
"learning_rate": 1.428e-05,
"loss": 3.4027,
"step": 244
},
{
"epoch": 0.31776913099870296,
"eval_cer": 0.3925912529764596,
"eval_loss": 3.8874895572662354,
"eval_runtime": 189.1601,
"eval_samples_per_second": 13.978,
"eval_steps_per_second": 1.75,
"step": 245
},
{
"epoch": 0.31906614785992216,
"grad_norm": 16.414127349853516,
"learning_rate": 1.44e-05,
"loss": 2.5058,
"step": 246
},
{
"epoch": 0.3216601815823606,
"grad_norm": 16.07786750793457,
"learning_rate": 1.452e-05,
"loss": 3.3537,
"step": 248
},
{
"epoch": 0.324254215304799,
"grad_norm": 15.625645637512207,
"learning_rate": 1.464e-05,
"loss": 3.6318,
"step": 250
},
{
"epoch": 0.324254215304799,
"eval_cer": 0.3870151008228592,
"eval_loss": 3.824657917022705,
"eval_runtime": 233.078,
"eval_samples_per_second": 11.344,
"eval_steps_per_second": 1.42,
"step": 250
},
{
"epoch": 0.32684824902723736,
"grad_norm": 16.048980712890625,
"learning_rate": 1.4760000000000001e-05,
"loss": 3.6032,
"step": 252
},
{
"epoch": 0.32944228274967574,
"grad_norm": 12.714573860168457,
"learning_rate": 1.488e-05,
"loss": 3.8301,
"step": 254
},
{
"epoch": 0.33073929961089493,
"eval_cer": 0.374446152455014,
"eval_loss": 3.690356969833374,
"eval_runtime": 139.2198,
"eval_samples_per_second": 18.992,
"eval_steps_per_second": 2.378,
"step": 255
},
{
"epoch": 0.3320363164721141,
"grad_norm": 12.425498962402344,
"learning_rate": 1.5e-05,
"loss": 3.0105,
"step": 256
},
{
"epoch": 0.3346303501945525,
"grad_norm": 13.164816856384277,
"learning_rate": 1.5120000000000001e-05,
"loss": 3.6224,
"step": 258
},
{
"epoch": 0.33722438391699094,
"grad_norm": 14.977278709411621,
"learning_rate": 1.524e-05,
"loss": 3.2017,
"step": 260
},
{
"epoch": 0.33722438391699094,
"eval_cer": 0.3727582361274377,
"eval_loss": 3.6364212036132812,
"eval_runtime": 123.239,
"eval_samples_per_second": 21.454,
"eval_steps_per_second": 2.686,
"step": 260
},
{
"epoch": 0.3398184176394293,
"grad_norm": 16.265350341796875,
"learning_rate": 1.5360000000000002e-05,
"loss": 3.9344,
"step": 262
},
{
"epoch": 0.3424124513618677,
"grad_norm": 13.141109466552734,
"learning_rate": 1.548e-05,
"loss": 2.757,
"step": 264
},
{
"epoch": 0.3437094682230869,
"eval_cer": 0.3830062995448654,
"eval_loss": 3.622190475463867,
"eval_runtime": 117.2686,
"eval_samples_per_second": 22.547,
"eval_steps_per_second": 2.823,
"step": 265
},
{
"epoch": 0.3450064850843061,
"grad_norm": 12.868675231933594,
"learning_rate": 1.56e-05,
"loss": 3.4981,
"step": 266
},
{
"epoch": 0.3476005188067445,
"grad_norm": 12.726391792297363,
"learning_rate": 1.5720000000000002e-05,
"loss": 2.9291,
"step": 268
},
{
"epoch": 0.35019455252918286,
"grad_norm": 31.09647560119629,
"learning_rate": 1.584e-05,
"loss": 3.1786,
"step": 270
},
{
"epoch": 0.35019455252918286,
"eval_cer": 0.3782741055550532,
"eval_loss": 3.598266124725342,
"eval_runtime": 115.8076,
"eval_samples_per_second": 22.831,
"eval_steps_per_second": 2.858,
"step": 270
},
{
"epoch": 0.3527885862516213,
"grad_norm": 12.694645881652832,
"learning_rate": 1.596e-05,
"loss": 3.2305,
"step": 272
},
{
"epoch": 0.3553826199740597,
"grad_norm": 20.454267501831055,
"learning_rate": 1.6080000000000002e-05,
"loss": 2.9721,
"step": 274
},
{
"epoch": 0.35667963683527887,
"eval_cer": 0.3755613828857341,
"eval_loss": 3.512030839920044,
"eval_runtime": 133.3206,
"eval_samples_per_second": 19.832,
"eval_steps_per_second": 2.483,
"step": 275
},
{
"epoch": 0.35797665369649806,
"grad_norm": 20.09004783630371,
"learning_rate": 1.62e-05,
"loss": 3.0882,
"step": 276
},
{
"epoch": 0.36057068741893644,
"grad_norm": 16.710346221923828,
"learning_rate": 1.6320000000000003e-05,
"loss": 2.762,
"step": 278
},
{
"epoch": 0.3631647211413748,
"grad_norm": 19.519004821777344,
"learning_rate": 1.6440000000000002e-05,
"loss": 3.0841,
"step": 280
},
{
"epoch": 0.3631647211413748,
"eval_cer": 0.36691081170690537,
"eval_loss": 3.4976441860198975,
"eval_runtime": 135.0965,
"eval_samples_per_second": 19.571,
"eval_steps_per_second": 2.45,
"step": 280
},
{
"epoch": 0.3657587548638132,
"grad_norm": 13.175090789794922,
"learning_rate": 1.656e-05,
"loss": 3.5364,
"step": 282
},
{
"epoch": 0.36835278858625164,
"grad_norm": 16.523889541625977,
"learning_rate": 1.6680000000000003e-05,
"loss": 2.6431,
"step": 284
},
{
"epoch": 0.36964980544747084,
"eval_cer": 0.3738734665581578,
"eval_loss": 3.458583116531372,
"eval_runtime": 229.2061,
"eval_samples_per_second": 11.535,
"eval_steps_per_second": 1.444,
"step": 285
},
{
"epoch": 0.37094682230869,
"grad_norm": 14.984639167785645,
"learning_rate": 1.6800000000000002e-05,
"loss": 4.2237,
"step": 286
},
{
"epoch": 0.3735408560311284,
"grad_norm": 13.299590110778809,
"learning_rate": 1.6919999999999997e-05,
"loss": 2.6747,
"step": 288
},
{
"epoch": 0.3761348897535668,
"grad_norm": 12.275932312011719,
"learning_rate": 1.704e-05,
"loss": 2.3366,
"step": 290
},
{
"epoch": 0.3761348897535668,
"eval_cer": 0.3661271362691021,
"eval_loss": 3.3806421756744385,
"eval_runtime": 245.1909,
"eval_samples_per_second": 10.783,
"eval_steps_per_second": 1.35,
"step": 290
},
{
"epoch": 0.3787289234760052,
"grad_norm": 14.83483600616455,
"learning_rate": 1.716e-05,
"loss": 3.1826,
"step": 292
},
{
"epoch": 0.38132295719844356,
"grad_norm": 14.161396026611328,
"learning_rate": 1.728e-05,
"loss": 3.5359,
"step": 294
},
{
"epoch": 0.38261997405966275,
"eval_cer": 0.3429182867649275,
"eval_loss": 3.350353956222534,
"eval_runtime": 219.4263,
"eval_samples_per_second": 12.05,
"eval_steps_per_second": 1.508,
"step": 295
},
{
"epoch": 0.383916990920882,
"grad_norm": 12.273178100585938,
"learning_rate": 1.74e-05,
"loss": 3.6439,
"step": 296
},
{
"epoch": 0.3865110246433204,
"grad_norm": 14.881448745727539,
"learning_rate": 1.7519999999999998e-05,
"loss": 3.6943,
"step": 298
},
{
"epoch": 0.38910505836575876,
"grad_norm": 14.406302452087402,
"learning_rate": 1.764e-05,
"loss": 3.4593,
"step": 300
},
{
"epoch": 0.38910505836575876,
"eval_cer": 0.3484040148295506,
"eval_loss": 3.2906293869018555,
"eval_runtime": 141.4541,
"eval_samples_per_second": 18.692,
"eval_steps_per_second": 2.34,
"step": 300
},
{
"epoch": 0.39169909208819714,
"grad_norm": 15.321798324584961,
"learning_rate": 1.776e-05,
"loss": 3.1981,
"step": 302
},
{
"epoch": 0.3942931258106355,
"grad_norm": 12.990147590637207,
"learning_rate": 1.7879999999999998e-05,
"loss": 3.1501,
"step": 304
},
{
"epoch": 0.3955901426718547,
"eval_cer": 0.34671609850197427,
"eval_loss": 3.222804069519043,
"eval_runtime": 189.6181,
"eval_samples_per_second": 13.944,
"eval_steps_per_second": 1.746,
"step": 305
},
{
"epoch": 0.3968871595330739,
"grad_norm": 11.798747062683105,
"learning_rate": 1.8e-05,
"loss": 3.6844,
"step": 306
},
{
"epoch": 0.39948119325551235,
"grad_norm": 15.285426139831543,
"learning_rate": 1.812e-05,
"loss": 2.115,
"step": 308
},
{
"epoch": 0.40207522697795073,
"grad_norm": 14.921792984008789,
"learning_rate": 1.824e-05,
"loss": 2.8101,
"step": 310
},
{
"epoch": 0.40207522697795073,
"eval_cer": 0.342466166320041,
"eval_loss": 3.1945455074310303,
"eval_runtime": 145.2418,
"eval_samples_per_second": 18.204,
"eval_steps_per_second": 2.279,
"step": 310
},
{
"epoch": 0.4046692607003891,
"grad_norm": 12.135457992553711,
"learning_rate": 1.836e-05,
"loss": 3.4391,
"step": 312
},
{
"epoch": 0.4072632944228275,
"grad_norm": 14.905659675598145,
"learning_rate": 1.848e-05,
"loss": 2.8493,
"step": 314
},
{
"epoch": 0.4085603112840467,
"eval_cer": 0.35328691563432496,
"eval_loss": 3.176107883453369,
"eval_runtime": 245.7678,
"eval_samples_per_second": 10.758,
"eval_steps_per_second": 1.347,
"step": 315
},
{
"epoch": 0.4098573281452659,
"grad_norm": 13.051637649536133,
"learning_rate": 1.86e-05,
"loss": 2.8454,
"step": 316
},
{
"epoch": 0.41245136186770426,
"grad_norm": 14.108623504638672,
"learning_rate": 1.872e-05,
"loss": 3.4802,
"step": 318
},
{
"epoch": 0.4150453955901427,
"grad_norm": 19.467906951904297,
"learning_rate": 1.884e-05,
"loss": 2.8067,
"step": 320
},
{
"epoch": 0.4150453955901427,
"eval_cer": 0.3708593302589143,
"eval_loss": 3.1531643867492676,
"eval_runtime": 211.2143,
"eval_samples_per_second": 12.518,
"eval_steps_per_second": 1.567,
"step": 320
},
{
"epoch": 0.4176394293125811,
"grad_norm": 15.744620323181152,
"learning_rate": 1.896e-05,
"loss": 2.2496,
"step": 322
},
{
"epoch": 0.42023346303501946,
"grad_norm": 14.49579906463623,
"learning_rate": 1.908e-05,
"loss": 2.7236,
"step": 324
},
{
"epoch": 0.42153047989623865,
"eval_cer": 0.35376917744220393,
"eval_loss": 3.1204159259796143,
"eval_runtime": 194.3013,
"eval_samples_per_second": 13.608,
"eval_steps_per_second": 1.704,
"step": 325
},
{
"epoch": 0.42282749675745784,
"grad_norm": 16.304920196533203,
"learning_rate": 1.9200000000000003e-05,
"loss": 2.785,
"step": 326
},
{
"epoch": 0.4254215304798962,
"grad_norm": 12.900490760803223,
"learning_rate": 1.932e-05,
"loss": 2.5259,
"step": 328
},
{
"epoch": 0.4280155642023346,
"grad_norm": 15.345794677734375,
"learning_rate": 1.944e-05,
"loss": 3.023,
"step": 330
},
{
"epoch": 0.4280155642023346,
"eval_cer": 0.3658558640021702,
"eval_loss": 3.054361581802368,
"eval_runtime": 234.2094,
"eval_samples_per_second": 11.289,
"eval_steps_per_second": 1.413,
"step": 330
},
{
"epoch": 0.43060959792477305,
"grad_norm": 17.006378173828125,
"learning_rate": 1.9560000000000002e-05,
"loss": 2.5932,
"step": 332
},
{
"epoch": 0.43320363164721143,
"grad_norm": 12.188159942626953,
"learning_rate": 1.968e-05,
"loss": 3.1202,
"step": 334
},
{
"epoch": 0.4345006485084306,
"eval_cer": 0.35904391596588,
"eval_loss": 3.008192300796509,
"eval_runtime": 161.9322,
"eval_samples_per_second": 16.328,
"eval_steps_per_second": 2.044,
"step": 335
},
{
"epoch": 0.4357976653696498,
"grad_norm": 13.4000883102417,
"learning_rate": 1.98e-05,
"loss": 2.9631,
"step": 336
},
{
"epoch": 0.4383916990920882,
"grad_norm": 12.92082691192627,
"learning_rate": 1.9920000000000002e-05,
"loss": 2.6806,
"step": 338
},
{
"epoch": 0.4409857328145266,
"grad_norm": 19.443449020385742,
"learning_rate": 2.004e-05,
"loss": 2.6839,
"step": 340
},
{
"epoch": 0.4409857328145266,
"eval_cer": 0.34593242306417094,
"eval_loss": 2.986955404281616,
"eval_runtime": 209.5827,
"eval_samples_per_second": 12.616,
"eval_steps_per_second": 1.579,
"step": 340
},
{
"epoch": 0.44357976653696496,
"grad_norm": 13.657390594482422,
"learning_rate": 2.016e-05,
"loss": 3.3844,
"step": 342
},
{
"epoch": 0.4461738002594034,
"grad_norm": 12.142219543457031,
"learning_rate": 2.0280000000000002e-05,
"loss": 2.5051,
"step": 344
},
{
"epoch": 0.4474708171206226,
"eval_cer": 0.34991108297917234,
"eval_loss": 2.9238357543945312,
"eval_runtime": 172.4959,
"eval_samples_per_second": 15.328,
"eval_steps_per_second": 1.919,
"step": 345
},
{
"epoch": 0.4487678339818418,
"grad_norm": 11.67302417755127,
"learning_rate": 2.04e-05,
"loss": 2.6669,
"step": 346
},
{
"epoch": 0.45136186770428016,
"grad_norm": 9.952072143554688,
"learning_rate": 2.0520000000000003e-05,
"loss": 3.1192,
"step": 348
},
{
"epoch": 0.45395590142671854,
"grad_norm": 13.724016189575195,
"learning_rate": 2.064e-05,
"loss": 3.026,
"step": 350
},
{
"epoch": 0.45395590142671854,
"eval_cer": 0.35256352292250653,
"eval_loss": 2.923769235610962,
"eval_runtime": 178.4978,
"eval_samples_per_second": 14.813,
"eval_steps_per_second": 1.854,
"step": 350
},
{
"epoch": 0.4565499351491569,
"grad_norm": 12.271801948547363,
"learning_rate": 2.0759999999999998e-05,
"loss": 2.0673,
"step": 352
},
{
"epoch": 0.4591439688715953,
"grad_norm": 11.006880760192871,
"learning_rate": 2.088e-05,
"loss": 2.1873,
"step": 354
},
{
"epoch": 0.4604409857328145,
"eval_cer": 0.3467763812279591,
"eval_loss": 2.878007173538208,
"eval_runtime": 239.5277,
"eval_samples_per_second": 11.038,
"eval_steps_per_second": 1.382,
"step": 355
},
{
"epoch": 0.46173800259403375,
"grad_norm": 20.289621353149414,
"learning_rate": 2.1e-05,
"loss": 2.8391,
"step": 356
},
{
"epoch": 0.46433203631647213,
"grad_norm": 12.577980995178223,
"learning_rate": 2.1119999999999998e-05,
"loss": 2.2868,
"step": 358
},
{
"epoch": 0.4669260700389105,
"grad_norm": 12.996379852294922,
"learning_rate": 2.124e-05,
"loss": 2.8355,
"step": 360
},
{
"epoch": 0.4669260700389105,
"eval_cer": 0.32480332760647435,
"eval_loss": 2.820495843887329,
"eval_runtime": 183.724,
"eval_samples_per_second": 14.391,
"eval_steps_per_second": 1.802,
"step": 360
},
{
"epoch": 0.4695201037613489,
"grad_norm": 20.477155685424805,
"learning_rate": 2.136e-05,
"loss": 2.8462,
"step": 362
},
{
"epoch": 0.4721141374837873,
"grad_norm": 32.31524658203125,
"learning_rate": 2.148e-05,
"loss": 2.0954,
"step": 364
},
{
"epoch": 0.47341115434500647,
"eval_cer": 0.3083160020496127,
"eval_loss": 2.8296797275543213,
"eval_runtime": 276.3938,
"eval_samples_per_second": 9.566,
"eval_steps_per_second": 1.198,
"step": 365
},
{
"epoch": 0.47470817120622566,
"grad_norm": 14.220911026000977,
"learning_rate": 2.16e-05,
"loss": 2.4953,
"step": 366
},
{
"epoch": 0.4773022049286641,
"grad_norm": 16.390596389770508,
"learning_rate": 2.172e-05,
"loss": 2.624,
"step": 368
},
{
"epoch": 0.4798962386511025,
"grad_norm": 14.07410717010498,
"learning_rate": 2.184e-05,
"loss": 2.9978,
"step": 370
},
{
"epoch": 0.4798962386511025,
"eval_cer": 0.3032522530668837,
"eval_loss": 2.793341875076294,
"eval_runtime": 279.1859,
"eval_samples_per_second": 9.47,
"eval_steps_per_second": 1.186,
"step": 370
},
{
"epoch": 0.48249027237354086,
"grad_norm": 24.049970626831055,
"learning_rate": 2.196e-05,
"loss": 2.9861,
"step": 372
},
{
"epoch": 0.48508430609597925,
"grad_norm": 10.874021530151367,
"learning_rate": 2.208e-05,
"loss": 2.1597,
"step": 374
},
{
"epoch": 0.48638132295719844,
"eval_cer": 0.3145251228260542,
"eval_loss": 2.7285666465759277,
"eval_runtime": 309.9236,
"eval_samples_per_second": 8.531,
"eval_steps_per_second": 1.068,
"step": 375
},
{
"epoch": 0.4876783398184176,
"grad_norm": 12.51282787322998,
"learning_rate": 2.22e-05,
"loss": 2.2976,
"step": 376
},
{
"epoch": 0.490272373540856,
"grad_norm": 11.6898775100708,
"learning_rate": 2.232e-05,
"loss": 2.703,
"step": 378
},
{
"epoch": 0.49286640726329445,
"grad_norm": 14.729179382324219,
"learning_rate": 2.2440000000000002e-05,
"loss": 2.0108,
"step": 380
},
{
"epoch": 0.49286640726329445,
"eval_cer": 0.31328932694336437,
"eval_loss": 2.689061403274536,
"eval_runtime": 188.7169,
"eval_samples_per_second": 14.01,
"eval_steps_per_second": 1.754,
"step": 380
},
{
"epoch": 0.49546044098573283,
"grad_norm": 11.870194435119629,
"learning_rate": 2.256e-05,
"loss": 2.2467,
"step": 382
},
{
"epoch": 0.4980544747081712,
"grad_norm": 12.500712394714355,
"learning_rate": 2.268e-05,
"loss": 2.0565,
"step": 384
},
{
"epoch": 0.4993514915693904,
"eval_cer": 0.3065075202700666,
"eval_loss": 2.683504104614258,
"eval_runtime": 139.7652,
"eval_samples_per_second": 18.917,
"eval_steps_per_second": 2.368,
"step": 385
},
{
"epoch": 0.5006485084306096,
"grad_norm": 17.805166244506836,
"learning_rate": 2.2800000000000002e-05,
"loss": 2.0304,
"step": 386
},
{
"epoch": 0.503242542153048,
"grad_norm": 10.874719619750977,
"learning_rate": 2.292e-05,
"loss": 2.2385,
"step": 388
},
{
"epoch": 0.5058365758754864,
"grad_norm": 19.65207862854004,
"learning_rate": 2.304e-05,
"loss": 2.2806,
"step": 390
},
{
"epoch": 0.5058365758754864,
"eval_cer": 0.310697169726015,
"eval_loss": 2.64746356010437,
"eval_runtime": 158.3003,
"eval_samples_per_second": 16.702,
"eval_steps_per_second": 2.091,
"step": 390
},
{
"epoch": 0.5084306095979247,
"grad_norm": 11.749553680419922,
"learning_rate": 2.3160000000000002e-05,
"loss": 2.0118,
"step": 392
},
{
"epoch": 0.5110246433203631,
"grad_norm": 12.608861923217773,
"learning_rate": 2.328e-05,
"loss": 1.8522,
"step": 394
},
{
"epoch": 0.5123216601815823,
"eval_cer": 0.32664195074901287,
"eval_loss": 2.6235055923461914,
"eval_runtime": 186.0047,
"eval_samples_per_second": 14.215,
"eval_steps_per_second": 1.78,
"step": 395
},
{
"epoch": 0.5136186770428015,
"grad_norm": 15.640968322753906,
"learning_rate": 2.3400000000000003e-05,
"loss": 2.5232,
"step": 396
},
{
"epoch": 0.51621271076524,
"grad_norm": 12.88823127746582,
"learning_rate": 2.3520000000000002e-05,
"loss": 2.196,
"step": 398
},
{
"epoch": 0.5188067444876784,
"grad_norm": 12.094499588012695,
"learning_rate": 2.364e-05,
"loss": 2.3092,
"step": 400
},
{
"epoch": 0.5188067444876784,
"eval_cer": 0.3185640654670404,
"eval_loss": 2.6280529499053955,
"eval_runtime": 155.4528,
"eval_samples_per_second": 17.008,
"eval_steps_per_second": 2.129,
"step": 400
},
{
"epoch": 0.5214007782101168,
"grad_norm": 27.93305015563965,
"learning_rate": 2.3760000000000003e-05,
"loss": 2.1069,
"step": 402
},
{
"epoch": 0.5239948119325551,
"grad_norm": 14.44329833984375,
"learning_rate": 2.3880000000000002e-05,
"loss": 2.513,
"step": 404
},
{
"epoch": 0.5252918287937743,
"eval_cer": 0.2939988546282063,
"eval_loss": 2.567127227783203,
"eval_runtime": 158.9316,
"eval_samples_per_second": 16.636,
"eval_steps_per_second": 2.083,
"step": 405
},
{
"epoch": 0.5265888456549935,
"grad_norm": 12.40060806274414,
"learning_rate": 2.4e-05,
"loss": 2.412,
"step": 406
},
{
"epoch": 0.5291828793774319,
"grad_norm": 12.270583152770996,
"learning_rate": 2.4120000000000003e-05,
"loss": 2.0339,
"step": 408
},
{
"epoch": 0.5317769130998703,
"grad_norm": 17.22001838684082,
"learning_rate": 2.4240000000000002e-05,
"loss": 2.0117,
"step": 410
},
{
"epoch": 0.5317769130998703,
"eval_cer": 0.2961388914006691,
"eval_loss": 2.5344009399414062,
"eval_runtime": 164.3351,
"eval_samples_per_second": 16.089,
"eval_steps_per_second": 2.014,
"step": 410
},
{
"epoch": 0.5343709468223087,
"grad_norm": 10.44601058959961,
"learning_rate": 2.4360000000000004e-05,
"loss": 1.6976,
"step": 412
},
{
"epoch": 0.5369649805447471,
"grad_norm": 16.720975875854492,
"learning_rate": 2.448e-05,
"loss": 3.0921,
"step": 414
},
{
"epoch": 0.5382619974059663,
"eval_cer": 0.2885131265635832,
"eval_loss": 2.51837158203125,
"eval_runtime": 169.9004,
"eval_samples_per_second": 15.562,
"eval_steps_per_second": 1.948,
"step": 415
},
{
"epoch": 0.5395590142671854,
"grad_norm": 11.013751983642578,
"learning_rate": 2.4599999999999998e-05,
"loss": 1.8176,
"step": 416
},
{
"epoch": 0.5421530479896238,
"grad_norm": 13.207280158996582,
"learning_rate": 2.472e-05,
"loss": 2.0911,
"step": 418
},
{
"epoch": 0.5447470817120622,
"grad_norm": 13.864497184753418,
"learning_rate": 2.484e-05,
"loss": 2.1863,
"step": 420
},
{
"epoch": 0.5447470817120622,
"eval_cer": 0.3223920185670796,
"eval_loss": 2.489891767501831,
"eval_runtime": 198.7131,
"eval_samples_per_second": 13.306,
"eval_steps_per_second": 1.666,
"step": 420
},
{
"epoch": 0.5473411154345007,
"grad_norm": 13.306368827819824,
"learning_rate": 2.4959999999999998e-05,
"loss": 2.1374,
"step": 422
},
{
"epoch": 0.5499351491569391,
"grad_norm": 14.835345268249512,
"learning_rate": 2.508e-05,
"loss": 2.0687,
"step": 424
},
{
"epoch": 0.5512321660181583,
"eval_cer": 0.3371612864333725,
"eval_loss": 2.540318250656128,
"eval_runtime": 177.6188,
"eval_samples_per_second": 14.886,
"eval_steps_per_second": 1.864,
"step": 425
},
{
"epoch": 0.5525291828793775,
"grad_norm": 15.374982833862305,
"learning_rate": 2.52e-05,
"loss": 2.0655,
"step": 426
},
{
"epoch": 0.5551232166018158,
"grad_norm": 14.928581237792969,
"learning_rate": 2.5319999999999998e-05,
"loss": 2.7183,
"step": 428
},
{
"epoch": 0.5577172503242542,
"grad_norm": 14.489096641540527,
"learning_rate": 2.544e-05,
"loss": 2.0051,
"step": 430
},
{
"epoch": 0.5577172503242542,
"eval_cer": 0.31229466196461403,
"eval_loss": 2.4285073280334473,
"eval_runtime": 181.8863,
"eval_samples_per_second": 14.537,
"eval_steps_per_second": 1.82,
"step": 430
},
{
"epoch": 0.5603112840466926,
"grad_norm": 11.531155586242676,
"learning_rate": 2.556e-05,
"loss": 2.2619,
"step": 432
},
{
"epoch": 0.562905317769131,
"grad_norm": 17.837749481201172,
"learning_rate": 2.568e-05,
"loss": 2.5056,
"step": 434
},
{
"epoch": 0.5642023346303502,
"eval_cer": 0.3202519817946168,
"eval_loss": 2.4498050212860107,
"eval_runtime": 230.2751,
"eval_samples_per_second": 11.482,
"eval_steps_per_second": 1.437,
"step": 435
},
{
"epoch": 0.5654993514915694,
"grad_norm": 13.783636093139648,
"learning_rate": 2.58e-05,
"loss": 2.0943,
"step": 436
},
{
"epoch": 0.5680933852140078,
"grad_norm": 17.753210067749023,
"learning_rate": 2.592e-05,
"loss": 2.12,
"step": 438
},
{
"epoch": 0.5706874189364461,
"grad_norm": 15.3496732711792,
"learning_rate": 2.604e-05,
"loss": 2.2611,
"step": 440
},
{
"epoch": 0.5706874189364461,
"eval_cer": 0.33478011875697017,
"eval_loss": 2.4362807273864746,
"eval_runtime": 233.7715,
"eval_samples_per_second": 11.31,
"eval_steps_per_second": 1.416,
"step": 440
},
{
"epoch": 0.5732814526588845,
"grad_norm": 12.754862785339355,
"learning_rate": 2.616e-05,
"loss": 1.7839,
"step": 442
},
{
"epoch": 0.5758754863813229,
"grad_norm": 15.386824607849121,
"learning_rate": 2.628e-05,
"loss": 2.4994,
"step": 444
},
{
"epoch": 0.5771725032425421,
"eval_cer": 0.310908159266962,
"eval_loss": 2.458259105682373,
"eval_runtime": 167.458,
"eval_samples_per_second": 15.789,
"eval_steps_per_second": 1.977,
"step": 445
},
{
"epoch": 0.5784695201037614,
"grad_norm": 14.832752227783203,
"learning_rate": 2.64e-05,
"loss": 2.0773,
"step": 446
},
{
"epoch": 0.5810635538261998,
"grad_norm": 15.057633399963379,
"learning_rate": 2.652e-05,
"loss": 2.0135,
"step": 448
},
{
"epoch": 0.5836575875486382,
"grad_norm": 17.804443359375,
"learning_rate": 2.6640000000000002e-05,
"loss": 2.4173,
"step": 450
},
{
"epoch": 0.5836575875486382,
"eval_cer": 0.3094312324803328,
"eval_loss": 2.376800060272217,
"eval_runtime": 172.2385,
"eval_samples_per_second": 15.351,
"eval_steps_per_second": 1.922,
"step": 450
},
{
"epoch": 0.5862516212710766,
"grad_norm": 22.79265022277832,
"learning_rate": 2.676e-05,
"loss": 1.9889,
"step": 452
},
{
"epoch": 0.5888456549935149,
"grad_norm": 11.24325942993164,
"learning_rate": 2.688e-05,
"loss": 2.9177,
"step": 454
},
{
"epoch": 0.5901426718547341,
"eval_cer": 0.31289748922446275,
"eval_loss": 2.4268851280212402,
"eval_runtime": 150.7156,
"eval_samples_per_second": 17.543,
"eval_steps_per_second": 2.196,
"step": 455
},
{
"epoch": 0.5914396887159533,
"grad_norm": 14.807707786560059,
"learning_rate": 2.7000000000000002e-05,
"loss": 1.6092,
"step": 456
},
{
"epoch": 0.5940337224383917,
"grad_norm": 16.166181564331055,
"learning_rate": 2.712e-05,
"loss": 2.2921,
"step": 458
},
{
"epoch": 0.5966277561608301,
"grad_norm": 18.733001708984375,
"learning_rate": 2.724e-05,
"loss": 2.4549,
"step": 460
},
{
"epoch": 0.5966277561608301,
"eval_cer": 0.3229647044639359,
"eval_loss": 2.331587076187134,
"eval_runtime": 176.3172,
"eval_samples_per_second": 14.996,
"eval_steps_per_second": 1.877,
"step": 460
},
{
"epoch": 0.5992217898832685,
"grad_norm": 13.820377349853516,
"learning_rate": 2.7360000000000002e-05,
"loss": 2.3386,
"step": 462
},
{
"epoch": 0.6018158236057068,
"grad_norm": 11.139546394348145,
"learning_rate": 2.748e-05,
"loss": 2.3171,
"step": 464
},
{
"epoch": 0.603112840466926,
"eval_cer": 0.3494589625342858,
"eval_loss": 2.3250718116760254,
"eval_runtime": 227.8398,
"eval_samples_per_second": 11.605,
"eval_steps_per_second": 1.453,
"step": 465
}
],
"logging_steps": 2,
"max_steps": 77100,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 5,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 10,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 10
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.783620910505001e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}