FinBERT_Test / trainer_state.json
PurplelinkPL's picture
Upload 10 files
563567f verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.062,
"eval_steps": 1000,
"global_step": 91000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 2e-06,
"grad_norm": 29.506126403808594,
"learning_rate": 0.0,
"loss": 1.5091,
"step": 1
},
{
"epoch": 0.0002,
"grad_norm": 7.35781717300415,
"learning_rate": 9.9e-07,
"loss": 1.6562,
"step": 100
},
{
"epoch": 0.0004,
"grad_norm": 4.9180989265441895,
"learning_rate": 1.99e-06,
"loss": 1.6176,
"step": 200
},
{
"epoch": 0.0006,
"grad_norm": 1.8868086338043213,
"learning_rate": 2.99e-06,
"loss": 1.548,
"step": 300
},
{
"epoch": 0.0008,
"grad_norm": 7.365355491638184,
"learning_rate": 3.99e-06,
"loss": 1.4958,
"step": 400
},
{
"epoch": 0.001,
"grad_norm": 8.965476989746094,
"learning_rate": 4.9900000000000005e-06,
"loss": 1.4918,
"step": 500
},
{
"epoch": 0.0012,
"grad_norm": 2.2186834812164307,
"learning_rate": 5.99e-06,
"loss": 1.4807,
"step": 600
},
{
"epoch": 0.0014,
"grad_norm": 1.970430850982666,
"learning_rate": 6.990000000000001e-06,
"loss": 1.4312,
"step": 700
},
{
"epoch": 0.0016,
"grad_norm": 1.5914119482040405,
"learning_rate": 7.99e-06,
"loss": 1.3848,
"step": 800
},
{
"epoch": 0.0018,
"grad_norm": 1.7615679502487183,
"learning_rate": 8.99e-06,
"loss": 1.4126,
"step": 900
},
{
"epoch": 0.002,
"grad_norm": 1.5981565713882446,
"learning_rate": 9.990000000000001e-06,
"loss": 1.3768,
"step": 1000
},
{
"epoch": 0.002,
"eval_loss": 1.1488478183746338,
"eval_runtime": 84.3931,
"eval_samples_per_second": 182.989,
"eval_steps_per_second": 2.868,
"step": 1000
},
{
"epoch": 0.0022,
"grad_norm": 1.9463247060775757,
"learning_rate": 1.099e-05,
"loss": 1.4649,
"step": 1100
},
{
"epoch": 0.0024,
"grad_norm": 1.997353434562683,
"learning_rate": 1.199e-05,
"loss": 1.422,
"step": 1200
},
{
"epoch": 0.0026,
"grad_norm": 2.028587818145752,
"learning_rate": 1.299e-05,
"loss": 1.4101,
"step": 1300
},
{
"epoch": 0.0028,
"grad_norm": 1.8055784702301025,
"learning_rate": 1.399e-05,
"loss": 1.379,
"step": 1400
},
{
"epoch": 0.003,
"grad_norm": 2.630389451980591,
"learning_rate": 1.499e-05,
"loss": 1.3915,
"step": 1500
},
{
"epoch": 0.0032,
"grad_norm": 1.4471231698989868,
"learning_rate": 1.599e-05,
"loss": 1.3651,
"step": 1600
},
{
"epoch": 0.0034,
"grad_norm": 1.4115934371948242,
"learning_rate": 1.699e-05,
"loss": 1.3327,
"step": 1700
},
{
"epoch": 0.0036,
"grad_norm": 1.1099858283996582,
"learning_rate": 1.7990000000000002e-05,
"loss": 1.304,
"step": 1800
},
{
"epoch": 0.0038,
"grad_norm": 1.5767651796340942,
"learning_rate": 1.8990000000000003e-05,
"loss": 1.3375,
"step": 1900
},
{
"epoch": 0.004,
"grad_norm": 1.3484268188476562,
"learning_rate": 1.999e-05,
"loss": 1.3746,
"step": 2000
},
{
"epoch": 0.004,
"eval_loss": 1.1486531496047974,
"eval_runtime": 76.1223,
"eval_samples_per_second": 202.871,
"eval_steps_per_second": 3.179,
"step": 2000
},
{
"epoch": 0.0042,
"grad_norm": 1.6412079334259033,
"learning_rate": 2.099e-05,
"loss": 1.3931,
"step": 2100
},
{
"epoch": 0.0044,
"grad_norm": 1.17317533493042,
"learning_rate": 2.199e-05,
"loss": 1.3512,
"step": 2200
},
{
"epoch": 0.0046,
"grad_norm": 0.8342074751853943,
"learning_rate": 2.2990000000000002e-05,
"loss": 1.3805,
"step": 2300
},
{
"epoch": 0.0048,
"grad_norm": 1.5843234062194824,
"learning_rate": 2.3990000000000002e-05,
"loss": 1.377,
"step": 2400
},
{
"epoch": 0.005,
"grad_norm": 1.915511131286621,
"learning_rate": 2.4990000000000003e-05,
"loss": 1.3659,
"step": 2500
},
{
"epoch": 0.0052,
"grad_norm": 1.6507076025009155,
"learning_rate": 2.5990000000000004e-05,
"loss": 1.2875,
"step": 2600
},
{
"epoch": 0.0054,
"grad_norm": 1.5680265426635742,
"learning_rate": 2.6989999999999997e-05,
"loss": 1.3402,
"step": 2700
},
{
"epoch": 0.0056,
"grad_norm": 0.8005309700965881,
"learning_rate": 2.7989999999999998e-05,
"loss": 1.3565,
"step": 2800
},
{
"epoch": 0.0058,
"grad_norm": 1.664014220237732,
"learning_rate": 2.8990000000000002e-05,
"loss": 1.3118,
"step": 2900
},
{
"epoch": 0.006,
"grad_norm": 1.1597651243209839,
"learning_rate": 2.9990000000000003e-05,
"loss": 1.3207,
"step": 3000
},
{
"epoch": 0.006,
"eval_loss": 1.1344993114471436,
"eval_runtime": 76.5771,
"eval_samples_per_second": 201.666,
"eval_steps_per_second": 3.16,
"step": 3000
},
{
"epoch": 0.0062,
"grad_norm": 1.6559661626815796,
"learning_rate": 3.099e-05,
"loss": 1.3103,
"step": 3100
},
{
"epoch": 0.0064,
"grad_norm": 1.390712857246399,
"learning_rate": 3.1990000000000004e-05,
"loss": 1.3855,
"step": 3200
},
{
"epoch": 0.0066,
"grad_norm": 1.9980418682098389,
"learning_rate": 3.299e-05,
"loss": 1.3109,
"step": 3300
},
{
"epoch": 0.0068,
"grad_norm": 1.2899682521820068,
"learning_rate": 3.399e-05,
"loss": 1.3219,
"step": 3400
},
{
"epoch": 0.007,
"grad_norm": 1.44901704788208,
"learning_rate": 3.499e-05,
"loss": 1.3089,
"step": 3500
},
{
"epoch": 0.0072,
"grad_norm": 1.3377976417541504,
"learning_rate": 3.599e-05,
"loss": 1.2995,
"step": 3600
},
{
"epoch": 0.0074,
"grad_norm": 1.5043129920959473,
"learning_rate": 3.699e-05,
"loss": 1.3421,
"step": 3700
},
{
"epoch": 0.0076,
"grad_norm": 1.4387165307998657,
"learning_rate": 3.799e-05,
"loss": 1.3337,
"step": 3800
},
{
"epoch": 0.0078,
"grad_norm": 1.1607294082641602,
"learning_rate": 3.8990000000000004e-05,
"loss": 1.2852,
"step": 3900
},
{
"epoch": 0.008,
"grad_norm": 1.0189259052276611,
"learning_rate": 3.999e-05,
"loss": 1.3277,
"step": 4000
},
{
"epoch": 0.008,
"eval_loss": 1.1298929452896118,
"eval_runtime": 76.4952,
"eval_samples_per_second": 201.882,
"eval_steps_per_second": 3.164,
"step": 4000
},
{
"epoch": 0.0082,
"grad_norm": 1.6229581832885742,
"learning_rate": 4.099e-05,
"loss": 1.2878,
"step": 4100
},
{
"epoch": 0.0084,
"grad_norm": 1.693702220916748,
"learning_rate": 4.199e-05,
"loss": 1.313,
"step": 4200
},
{
"epoch": 0.0086,
"grad_norm": 1.169730544090271,
"learning_rate": 4.299e-05,
"loss": 1.2915,
"step": 4300
},
{
"epoch": 0.0088,
"grad_norm": 1.3561712503433228,
"learning_rate": 4.3990000000000004e-05,
"loss": 1.3337,
"step": 4400
},
{
"epoch": 0.009,
"grad_norm": 1.4713114500045776,
"learning_rate": 4.499e-05,
"loss": 1.309,
"step": 4500
},
{
"epoch": 0.0092,
"grad_norm": 1.0679044723510742,
"learning_rate": 4.599e-05,
"loss": 1.3464,
"step": 4600
},
{
"epoch": 0.0094,
"grad_norm": 1.4595869779586792,
"learning_rate": 4.699e-05,
"loss": 1.3385,
"step": 4700
},
{
"epoch": 0.0096,
"grad_norm": 1.6443949937820435,
"learning_rate": 4.799e-05,
"loss": 1.3287,
"step": 4800
},
{
"epoch": 0.0098,
"grad_norm": 1.3524634838104248,
"learning_rate": 4.8990000000000004e-05,
"loss": 1.3224,
"step": 4900
},
{
"epoch": 0.01,
"grad_norm": 1.552986979484558,
"learning_rate": 4.999e-05,
"loss": 1.3256,
"step": 5000
},
{
"epoch": 0.01,
"eval_loss": 1.1314986944198608,
"eval_runtime": 76.3433,
"eval_samples_per_second": 202.284,
"eval_steps_per_second": 3.17,
"step": 5000
},
{
"epoch": 0.0102,
"grad_norm": 1.1126846075057983,
"learning_rate": 4.9999995065197964e-05,
"loss": 1.3184,
"step": 5100
},
{
"epoch": 0.0104,
"grad_norm": 0.8533400893211365,
"learning_rate": 4.999998006090441e-05,
"loss": 1.3145,
"step": 5200
},
{
"epoch": 0.0106,
"grad_norm": 1.6032077074050903,
"learning_rate": 4.9999954986621866e-05,
"loss": 1.2894,
"step": 5300
},
{
"epoch": 0.0108,
"grad_norm": 1.2594430446624756,
"learning_rate": 4.999991984236044e-05,
"loss": 1.2515,
"step": 5400
},
{
"epoch": 0.011,
"grad_norm": 1.2169750928878784,
"learning_rate": 4.99998746281343e-05,
"loss": 1.2603,
"step": 5500
},
{
"epoch": 0.0112,
"grad_norm": 1.2038013935089111,
"learning_rate": 4.999981934396165e-05,
"loss": 1.3063,
"step": 5600
},
{
"epoch": 0.0114,
"grad_norm": 1.1477010250091553,
"learning_rate": 4.999975398986476e-05,
"loss": 1.3057,
"step": 5700
},
{
"epoch": 0.0116,
"grad_norm": 0.6725754141807556,
"learning_rate": 4.9999678565869944e-05,
"loss": 1.3211,
"step": 5800
},
{
"epoch": 0.0118,
"grad_norm": 1.5470402240753174,
"learning_rate": 4.99995930720076e-05,
"loss": 1.2794,
"step": 5900
},
{
"epoch": 0.012,
"grad_norm": 1.8079277276992798,
"learning_rate": 4.999949750831215e-05,
"loss": 1.2736,
"step": 6000
},
{
"epoch": 0.012,
"eval_loss": 1.1335862874984741,
"eval_runtime": 76.3508,
"eval_samples_per_second": 202.264,
"eval_steps_per_second": 3.17,
"step": 6000
},
{
"epoch": 0.0122,
"grad_norm": 1.4117431640625,
"learning_rate": 4.99993918748221e-05,
"loss": 1.3142,
"step": 6100
},
{
"epoch": 0.0124,
"grad_norm": 1.2657192945480347,
"learning_rate": 4.999927617157998e-05,
"loss": 1.3216,
"step": 6200
},
{
"epoch": 0.0126,
"grad_norm": 1.0358809232711792,
"learning_rate": 4.9999150398632425e-05,
"loss": 1.329,
"step": 6300
},
{
"epoch": 0.0128,
"grad_norm": 1.6824450492858887,
"learning_rate": 4.999901455603007e-05,
"loss": 1.2911,
"step": 6400
},
{
"epoch": 0.013,
"grad_norm": 1.5632168054580688,
"learning_rate": 4.9998868643827635e-05,
"loss": 1.3004,
"step": 6500
},
{
"epoch": 0.0132,
"grad_norm": 1.254310131072998,
"learning_rate": 4.99987126620839e-05,
"loss": 1.2981,
"step": 6600
},
{
"epoch": 0.0134,
"grad_norm": 1.4540060758590698,
"learning_rate": 4.999854661086171e-05,
"loss": 1.3184,
"step": 6700
},
{
"epoch": 0.0136,
"grad_norm": 1.3684179782867432,
"learning_rate": 4.999837049022792e-05,
"loss": 1.2914,
"step": 6800
},
{
"epoch": 0.0138,
"grad_norm": 1.474075436592102,
"learning_rate": 4.999818430025349e-05,
"loss": 1.2702,
"step": 6900
},
{
"epoch": 0.014,
"grad_norm": 1.3687875270843506,
"learning_rate": 4.999798804101341e-05,
"loss": 1.2388,
"step": 7000
},
{
"epoch": 0.014,
"eval_loss": 1.1258224248886108,
"eval_runtime": 76.3516,
"eval_samples_per_second": 202.262,
"eval_steps_per_second": 3.17,
"step": 7000
},
{
"epoch": 0.0142,
"grad_norm": 0.6668384075164795,
"learning_rate": 4.999778171258675e-05,
"loss": 1.2768,
"step": 7100
},
{
"epoch": 0.0144,
"grad_norm": 1.1303478479385376,
"learning_rate": 4.9997565315056596e-05,
"loss": 1.2639,
"step": 7200
},
{
"epoch": 0.0146,
"grad_norm": 1.516221046447754,
"learning_rate": 4.999733884851012e-05,
"loss": 1.2805,
"step": 7300
},
{
"epoch": 0.0148,
"grad_norm": 1.3124428987503052,
"learning_rate": 4.9997102313038544e-05,
"loss": 1.2811,
"step": 7400
},
{
"epoch": 0.015,
"grad_norm": 1.390687346458435,
"learning_rate": 4.999685570873715e-05,
"loss": 1.2481,
"step": 7500
},
{
"epoch": 0.0152,
"grad_norm": 0.8783305883407593,
"learning_rate": 4.999659903570526e-05,
"loss": 1.2986,
"step": 7600
},
{
"epoch": 0.0154,
"grad_norm": 1.0741727352142334,
"learning_rate": 4.999633229404628e-05,
"loss": 1.2784,
"step": 7700
},
{
"epoch": 0.0156,
"grad_norm": 1.022088885307312,
"learning_rate": 4.999605548386763e-05,
"loss": 1.2869,
"step": 7800
},
{
"epoch": 0.0158,
"grad_norm": 1.0997594594955444,
"learning_rate": 4.9995768605280826e-05,
"loss": 1.2736,
"step": 7900
},
{
"epoch": 0.016,
"grad_norm": 1.191188931465149,
"learning_rate": 4.9995471658401414e-05,
"loss": 1.256,
"step": 8000
},
{
"epoch": 0.016,
"eval_loss": 1.1234357357025146,
"eval_runtime": 76.115,
"eval_samples_per_second": 202.89,
"eval_steps_per_second": 3.179,
"step": 8000
},
{
"epoch": 0.0162,
"grad_norm": 0.7304887175559998,
"learning_rate": 4.9995164643349015e-05,
"loss": 1.2717,
"step": 8100
},
{
"epoch": 0.0164,
"grad_norm": 1.2335166931152344,
"learning_rate": 4.9994847560247276e-05,
"loss": 1.2657,
"step": 8200
},
{
"epoch": 0.0166,
"grad_norm": 1.424973487854004,
"learning_rate": 4.999452040922393e-05,
"loss": 1.3235,
"step": 8300
},
{
"epoch": 0.0168,
"grad_norm": 1.1544169187545776,
"learning_rate": 4.999418319041076e-05,
"loss": 1.2455,
"step": 8400
},
{
"epoch": 0.017,
"grad_norm": 1.1393338441848755,
"learning_rate": 4.9993835903943585e-05,
"loss": 1.233,
"step": 8500
},
{
"epoch": 0.0172,
"grad_norm": 1.1183439493179321,
"learning_rate": 4.99934785499623e-05,
"loss": 1.2282,
"step": 8600
},
{
"epoch": 0.0174,
"grad_norm": 1.275148868560791,
"learning_rate": 4.999311112861084e-05,
"loss": 1.2665,
"step": 8700
},
{
"epoch": 0.0176,
"grad_norm": 1.4136372804641724,
"learning_rate": 4.99927336400372e-05,
"loss": 1.2617,
"step": 8800
},
{
"epoch": 0.0178,
"grad_norm": 1.392327904701233,
"learning_rate": 4.999234608439345e-05,
"loss": 1.292,
"step": 8900
},
{
"epoch": 0.018,
"grad_norm": 1.367475152015686,
"learning_rate": 4.9991948461835685e-05,
"loss": 1.2153,
"step": 9000
},
{
"epoch": 0.018,
"eval_loss": 1.1127148866653442,
"eval_runtime": 76.2524,
"eval_samples_per_second": 202.525,
"eval_steps_per_second": 3.174,
"step": 9000
},
{
"epoch": 0.0182,
"grad_norm": 0.8793131709098816,
"learning_rate": 4.999154077252407e-05,
"loss": 1.2734,
"step": 9100
},
{
"epoch": 0.0184,
"grad_norm": 0.6496739387512207,
"learning_rate": 4.999112301662281e-05,
"loss": 1.2498,
"step": 9200
},
{
"epoch": 0.0186,
"grad_norm": 1.1462939977645874,
"learning_rate": 4.99906951943002e-05,
"loss": 1.2549,
"step": 9300
},
{
"epoch": 0.0188,
"grad_norm": 1.520691156387329,
"learning_rate": 4.999025730572854e-05,
"loss": 1.2437,
"step": 9400
},
{
"epoch": 0.019,
"grad_norm": 1.3555136919021606,
"learning_rate": 4.998980935108424e-05,
"loss": 1.2326,
"step": 9500
},
{
"epoch": 0.0192,
"grad_norm": 1.467217206954956,
"learning_rate": 4.9989351330547715e-05,
"loss": 1.2768,
"step": 9600
},
{
"epoch": 0.0194,
"grad_norm": 1.3842765092849731,
"learning_rate": 4.998888324430346e-05,
"loss": 1.2675,
"step": 9700
},
{
"epoch": 0.0196,
"grad_norm": 1.344078540802002,
"learning_rate": 4.998840509254003e-05,
"loss": 1.2619,
"step": 9800
},
{
"epoch": 0.0198,
"grad_norm": 0.7567517757415771,
"learning_rate": 4.998791687545001e-05,
"loss": 1.2794,
"step": 9900
},
{
"epoch": 0.02,
"grad_norm": 0.9987697601318359,
"learning_rate": 4.998741859323006e-05,
"loss": 1.2778,
"step": 10000
},
{
"epoch": 0.02,
"eval_loss": 1.1275579929351807,
"eval_runtime": 76.2888,
"eval_samples_per_second": 202.428,
"eval_steps_per_second": 3.172,
"step": 10000
},
{
"epoch": 0.0202,
"grad_norm": 1.5212323665618896,
"learning_rate": 4.9986910246080894e-05,
"loss": 1.2884,
"step": 10100
},
{
"epoch": 0.0204,
"grad_norm": 1.5730245113372803,
"learning_rate": 4.998639183420727e-05,
"loss": 1.282,
"step": 10200
},
{
"epoch": 0.0206,
"grad_norm": 0.8342368602752686,
"learning_rate": 4.9985863357818e-05,
"loss": 1.2408,
"step": 10300
},
{
"epoch": 0.0208,
"grad_norm": 1.3672316074371338,
"learning_rate": 4.998532481712596e-05,
"loss": 1.2205,
"step": 10400
},
{
"epoch": 0.021,
"grad_norm": 1.1164605617523193,
"learning_rate": 4.998477621234806e-05,
"loss": 1.2817,
"step": 10500
},
{
"epoch": 0.0212,
"grad_norm": 1.2867449522018433,
"learning_rate": 4.99842175437053e-05,
"loss": 1.2598,
"step": 10600
},
{
"epoch": 0.0214,
"grad_norm": 1.6646244525909424,
"learning_rate": 4.99836488114227e-05,
"loss": 1.2163,
"step": 10700
},
{
"epoch": 0.0216,
"grad_norm": 1.3233399391174316,
"learning_rate": 4.998307001572935e-05,
"loss": 1.2744,
"step": 10800
},
{
"epoch": 0.0218,
"grad_norm": 1.1658077239990234,
"learning_rate": 4.9982481156858385e-05,
"loss": 1.274,
"step": 10900
},
{
"epoch": 0.022,
"grad_norm": 1.4505467414855957,
"learning_rate": 4.9981882235046995e-05,
"loss": 1.2645,
"step": 11000
},
{
"epoch": 0.022,
"eval_loss": 1.1138958930969238,
"eval_runtime": 76.7643,
"eval_samples_per_second": 201.174,
"eval_steps_per_second": 3.153,
"step": 11000
},
{
"epoch": 0.0222,
"grad_norm": 0.8515588641166687,
"learning_rate": 4.998127325053642e-05,
"loss": 1.2359,
"step": 11100
},
{
"epoch": 0.0224,
"grad_norm": 1.4022259712219238,
"learning_rate": 4.9980654203571983e-05,
"loss": 1.2515,
"step": 11200
},
{
"epoch": 0.0226,
"grad_norm": 1.5902676582336426,
"learning_rate": 4.998002509440301e-05,
"loss": 1.2305,
"step": 11300
},
{
"epoch": 0.0228,
"grad_norm": 0.763087809085846,
"learning_rate": 4.997938592328292e-05,
"loss": 1.2312,
"step": 11400
},
{
"epoch": 0.023,
"grad_norm": 1.4949332475662231,
"learning_rate": 4.997873669046916e-05,
"loss": 1.2768,
"step": 11500
},
{
"epoch": 0.0232,
"grad_norm": 1.0390666723251343,
"learning_rate": 4.9978077396223255e-05,
"loss": 1.2355,
"step": 11600
},
{
"epoch": 0.0234,
"grad_norm": 0.6799549460411072,
"learning_rate": 4.997740804081076e-05,
"loss": 1.264,
"step": 11700
},
{
"epoch": 0.0236,
"grad_norm": 1.4702496528625488,
"learning_rate": 4.99767286245013e-05,
"loss": 1.3092,
"step": 11800
},
{
"epoch": 0.0238,
"grad_norm": 1.3574661016464233,
"learning_rate": 4.997603914756853e-05,
"loss": 1.2654,
"step": 11900
},
{
"epoch": 0.024,
"grad_norm": 1.1170625686645508,
"learning_rate": 4.9975339610290175e-05,
"loss": 1.2343,
"step": 12000
},
{
"epoch": 0.024,
"eval_loss": 1.1109821796417236,
"eval_runtime": 76.4587,
"eval_samples_per_second": 201.978,
"eval_steps_per_second": 3.165,
"step": 12000
},
{
"epoch": 0.0242,
"grad_norm": 1.2707583904266357,
"learning_rate": 4.997463001294802e-05,
"loss": 1.2525,
"step": 12100
},
{
"epoch": 0.0244,
"grad_norm": 1.2613739967346191,
"learning_rate": 4.997391035582788e-05,
"loss": 1.2698,
"step": 12200
},
{
"epoch": 0.0246,
"grad_norm": 1.1995183229446411,
"learning_rate": 4.997318063921963e-05,
"loss": 1.237,
"step": 12300
},
{
"epoch": 0.0248,
"grad_norm": 0.729535698890686,
"learning_rate": 4.997244086341721e-05,
"loss": 1.2248,
"step": 12400
},
{
"epoch": 0.025,
"grad_norm": 1.3250787258148193,
"learning_rate": 4.9971691028718594e-05,
"loss": 1.2617,
"step": 12500
},
{
"epoch": 0.0252,
"grad_norm": 1.421278476715088,
"learning_rate": 4.997093113542582e-05,
"loss": 1.2321,
"step": 12600
},
{
"epoch": 0.0254,
"grad_norm": 1.5168310403823853,
"learning_rate": 4.997016118384497e-05,
"loss": 1.2268,
"step": 12700
},
{
"epoch": 0.0256,
"grad_norm": 1.045483946800232,
"learning_rate": 4.996938117428618e-05,
"loss": 1.2714,
"step": 12800
},
{
"epoch": 0.0258,
"grad_norm": 0.8379656076431274,
"learning_rate": 4.9968591107063647e-05,
"loss": 1.2792,
"step": 12900
},
{
"epoch": 0.026,
"grad_norm": 1.620133638381958,
"learning_rate": 4.996779098249559e-05,
"loss": 1.2456,
"step": 13000
},
{
"epoch": 0.026,
"eval_loss": 1.1081608533859253,
"eval_runtime": 76.4734,
"eval_samples_per_second": 201.939,
"eval_steps_per_second": 3.164,
"step": 13000
},
{
"epoch": 0.0262,
"grad_norm": 1.2181329727172852,
"learning_rate": 4.9966980800904315e-05,
"loss": 1.2187,
"step": 13100
},
{
"epoch": 0.0264,
"grad_norm": 1.4935636520385742,
"learning_rate": 4.996616056261616e-05,
"loss": 1.2405,
"step": 13200
},
{
"epoch": 0.0266,
"grad_norm": 1.3096436262130737,
"learning_rate": 4.996533026796152e-05,
"loss": 1.2599,
"step": 13300
},
{
"epoch": 0.0268,
"grad_norm": 1.5392045974731445,
"learning_rate": 4.996448991727483e-05,
"loss": 1.2491,
"step": 13400
},
{
"epoch": 0.027,
"grad_norm": 1.3175737857818604,
"learning_rate": 4.996363951089459e-05,
"loss": 1.2383,
"step": 13500
},
{
"epoch": 0.0272,
"grad_norm": 1.3839282989501953,
"learning_rate": 4.9962779049163335e-05,
"loss": 1.2739,
"step": 13600
},
{
"epoch": 0.0274,
"grad_norm": 0.8403354287147522,
"learning_rate": 4.996190853242767e-05,
"loss": 1.2378,
"step": 13700
},
{
"epoch": 0.0276,
"grad_norm": 1.2463191747665405,
"learning_rate": 4.996102796103823e-05,
"loss": 1.2248,
"step": 13800
},
{
"epoch": 0.0278,
"grad_norm": 1.466070294380188,
"learning_rate": 4.996013733534971e-05,
"loss": 1.2567,
"step": 13900
},
{
"epoch": 0.028,
"grad_norm": 0.8661775588989258,
"learning_rate": 4.995923665572085e-05,
"loss": 1.2372,
"step": 14000
},
{
"epoch": 0.028,
"eval_loss": 1.113655686378479,
"eval_runtime": 76.3727,
"eval_samples_per_second": 202.206,
"eval_steps_per_second": 3.169,
"step": 14000
},
{
"epoch": 0.0282,
"grad_norm": 0.9262897968292236,
"learning_rate": 4.9958325922514466e-05,
"loss": 1.2082,
"step": 14100
},
{
"epoch": 0.0284,
"grad_norm": 1.406928539276123,
"learning_rate": 4.995740513609738e-05,
"loss": 1.2576,
"step": 14200
},
{
"epoch": 0.0286,
"grad_norm": 0.9858616590499878,
"learning_rate": 4.9956474296840485e-05,
"loss": 1.2173,
"step": 14300
},
{
"epoch": 0.0288,
"grad_norm": 0.6425116062164307,
"learning_rate": 4.9955533405118725e-05,
"loss": 1.237,
"step": 14400
},
{
"epoch": 0.029,
"grad_norm": 0.7704317569732666,
"learning_rate": 4.9954582461311106e-05,
"loss": 1.286,
"step": 14500
},
{
"epoch": 0.0292,
"grad_norm": 1.2745368480682373,
"learning_rate": 4.995362146580065e-05,
"loss": 1.2553,
"step": 14600
},
{
"epoch": 0.0294,
"grad_norm": 1.1889222860336304,
"learning_rate": 4.995265041897444e-05,
"loss": 1.2783,
"step": 14700
},
{
"epoch": 0.0296,
"grad_norm": 1.4223252534866333,
"learning_rate": 4.9951669321223645e-05,
"loss": 1.27,
"step": 14800
},
{
"epoch": 0.0298,
"grad_norm": 1.0991147756576538,
"learning_rate": 4.995067817294342e-05,
"loss": 1.2373,
"step": 14900
},
{
"epoch": 0.03,
"grad_norm": 1.2834559679031372,
"learning_rate": 4.994967697453301e-05,
"loss": 1.2725,
"step": 15000
},
{
"epoch": 0.03,
"eval_loss": 1.1147979497909546,
"eval_runtime": 77.4863,
"eval_samples_per_second": 199.3,
"eval_steps_per_second": 3.123,
"step": 15000
},
{
"epoch": 0.0302,
"grad_norm": 1.3690969944000244,
"learning_rate": 4.9948665726395705e-05,
"loss": 1.2631,
"step": 15100
},
{
"epoch": 0.0304,
"grad_norm": 1.0501981973648071,
"learning_rate": 4.994764442893882e-05,
"loss": 1.2614,
"step": 15200
},
{
"epoch": 0.0306,
"grad_norm": 1.2085719108581543,
"learning_rate": 4.994661308257375e-05,
"loss": 1.1982,
"step": 15300
},
{
"epoch": 0.0308,
"grad_norm": 1.1436259746551514,
"learning_rate": 4.994557168771591e-05,
"loss": 1.2079,
"step": 15400
},
{
"epoch": 0.031,
"grad_norm": 0.8355712890625,
"learning_rate": 4.994452024478478e-05,
"loss": 1.2537,
"step": 15500
},
{
"epoch": 0.0312,
"grad_norm": 0.9547547698020935,
"learning_rate": 4.9943458754203875e-05,
"loss": 1.2399,
"step": 15600
},
{
"epoch": 0.0314,
"grad_norm": 1.090165138244629,
"learning_rate": 4.994238721640077e-05,
"loss": 1.2324,
"step": 15700
},
{
"epoch": 0.0316,
"grad_norm": 0.9351906180381775,
"learning_rate": 4.9941305631807076e-05,
"loss": 1.2431,
"step": 15800
},
{
"epoch": 0.0318,
"grad_norm": 1.3740676641464233,
"learning_rate": 4.9940214000858456e-05,
"loss": 1.2487,
"step": 15900
},
{
"epoch": 0.032,
"grad_norm": 0.656019926071167,
"learning_rate": 4.993911232399462e-05,
"loss": 1.2371,
"step": 16000
},
{
"epoch": 0.032,
"eval_loss": 1.1028244495391846,
"eval_runtime": 76.4629,
"eval_samples_per_second": 201.967,
"eval_steps_per_second": 3.165,
"step": 16000
},
{
"epoch": 0.0322,
"grad_norm": 1.20018470287323,
"learning_rate": 4.9938000601659315e-05,
"loss": 1.2547,
"step": 16100
},
{
"epoch": 0.0324,
"grad_norm": 1.2216906547546387,
"learning_rate": 4.993687883430036e-05,
"loss": 1.2327,
"step": 16200
},
{
"epoch": 0.0326,
"grad_norm": 1.0969616174697876,
"learning_rate": 4.99357470223696e-05,
"loss": 1.2513,
"step": 16300
},
{
"epoch": 0.0328,
"grad_norm": 1.026194453239441,
"learning_rate": 4.99346051663229e-05,
"loss": 1.2508,
"step": 16400
},
{
"epoch": 0.033,
"grad_norm": 1.1246017217636108,
"learning_rate": 4.993345326662023e-05,
"loss": 1.2538,
"step": 16500
},
{
"epoch": 0.0332,
"grad_norm": 1.293093204498291,
"learning_rate": 4.993229132372557e-05,
"loss": 1.2236,
"step": 16600
},
{
"epoch": 0.0334,
"grad_norm": 1.208122730255127,
"learning_rate": 4.993111933810695e-05,
"loss": 1.2753,
"step": 16700
},
{
"epoch": 0.0336,
"grad_norm": 1.073480248451233,
"learning_rate": 4.992993731023643e-05,
"loss": 1.2665,
"step": 16800
},
{
"epoch": 0.0338,
"grad_norm": 1.4211028814315796,
"learning_rate": 4.9928745240590146e-05,
"loss": 1.2388,
"step": 16900
},
{
"epoch": 0.034,
"grad_norm": 1.1787285804748535,
"learning_rate": 4.992754312964827e-05,
"loss": 1.2118,
"step": 17000
},
{
"epoch": 0.034,
"eval_loss": 1.104814887046814,
"eval_runtime": 76.4454,
"eval_samples_per_second": 202.013,
"eval_steps_per_second": 3.166,
"step": 17000
},
{
"epoch": 0.0342,
"grad_norm": 0.9049177765846252,
"learning_rate": 4.992633097789499e-05,
"loss": 1.1995,
"step": 17100
},
{
"epoch": 0.0344,
"grad_norm": 1.2447205781936646,
"learning_rate": 4.992510878581858e-05,
"loss": 1.2174,
"step": 17200
},
{
"epoch": 0.0346,
"grad_norm": 1.0060733556747437,
"learning_rate": 4.9923876553911334e-05,
"loss": 1.2098,
"step": 17300
},
{
"epoch": 0.0348,
"grad_norm": 1.3275829553604126,
"learning_rate": 4.992263428266958e-05,
"loss": 1.2256,
"step": 17400
},
{
"epoch": 0.035,
"grad_norm": 1.3165931701660156,
"learning_rate": 4.992138197259373e-05,
"loss": 1.2276,
"step": 17500
},
{
"epoch": 0.0352,
"grad_norm": 1.2749327421188354,
"learning_rate": 4.9920119624188196e-05,
"loss": 1.2758,
"step": 17600
},
{
"epoch": 0.0354,
"grad_norm": 1.0836033821105957,
"learning_rate": 4.991884723796146e-05,
"loss": 1.2407,
"step": 17700
},
{
"epoch": 0.0356,
"grad_norm": 1.343475103378296,
"learning_rate": 4.9917564814426034e-05,
"loss": 1.2466,
"step": 17800
},
{
"epoch": 0.0358,
"grad_norm": 1.3868790864944458,
"learning_rate": 4.991627235409848e-05,
"loss": 1.2402,
"step": 17900
},
{
"epoch": 0.036,
"grad_norm": 1.5200074911117554,
"learning_rate": 4.99149698574994e-05,
"loss": 1.2183,
"step": 18000
},
{
"epoch": 0.036,
"eval_loss": 1.0960842370986938,
"eval_runtime": 76.481,
"eval_samples_per_second": 201.92,
"eval_steps_per_second": 3.164,
"step": 18000
},
{
"epoch": 0.0362,
"grad_norm": 1.4647791385650635,
"learning_rate": 4.991365732515345e-05,
"loss": 1.2386,
"step": 18100
},
{
"epoch": 0.0364,
"grad_norm": 0.9076351523399353,
"learning_rate": 4.991233475758931e-05,
"loss": 1.2011,
"step": 18200
},
{
"epoch": 0.0366,
"grad_norm": 0.9813222289085388,
"learning_rate": 4.99110021553397e-05,
"loss": 1.214,
"step": 18300
},
{
"epoch": 0.0368,
"grad_norm": 1.5431565046310425,
"learning_rate": 4.99096595189414e-05,
"loss": 1.2206,
"step": 18400
},
{
"epoch": 0.037,
"grad_norm": 0.9991932511329651,
"learning_rate": 4.990830684893523e-05,
"loss": 1.2334,
"step": 18500
},
{
"epoch": 0.0372,
"grad_norm": 0.6322658658027649,
"learning_rate": 4.9906944145866035e-05,
"loss": 1.2354,
"step": 18600
},
{
"epoch": 0.0374,
"grad_norm": 0.9555477499961853,
"learning_rate": 4.990557141028272e-05,
"loss": 1.2017,
"step": 18700
},
{
"epoch": 0.0376,
"grad_norm": 1.171019196510315,
"learning_rate": 4.990418864273822e-05,
"loss": 1.286,
"step": 18800
},
{
"epoch": 0.0378,
"grad_norm": 1.2275811433792114,
"learning_rate": 4.990279584378951e-05,
"loss": 1.2345,
"step": 18900
},
{
"epoch": 0.038,
"grad_norm": 1.6589407920837402,
"learning_rate": 4.9901393013997616e-05,
"loss": 1.2376,
"step": 19000
},
{
"epoch": 0.038,
"eval_loss": 1.107132077217102,
"eval_runtime": 76.3932,
"eval_samples_per_second": 202.152,
"eval_steps_per_second": 3.168,
"step": 19000
},
{
"epoch": 0.0382,
"grad_norm": 0.7907335758209229,
"learning_rate": 4.9899980153927596e-05,
"loss": 1.2554,
"step": 19100
},
{
"epoch": 0.0384,
"grad_norm": 1.4444235563278198,
"learning_rate": 4.989855726414854e-05,
"loss": 1.2618,
"step": 19200
},
{
"epoch": 0.0386,
"grad_norm": 1.1591296195983887,
"learning_rate": 4.98971243452336e-05,
"loss": 1.2028,
"step": 19300
},
{
"epoch": 0.0388,
"grad_norm": 0.9183579087257385,
"learning_rate": 4.989568139775995e-05,
"loss": 1.2259,
"step": 19400
},
{
"epoch": 0.039,
"grad_norm": 1.0866785049438477,
"learning_rate": 4.9894228422308805e-05,
"loss": 1.2307,
"step": 19500
},
{
"epoch": 0.0392,
"grad_norm": 1.5889687538146973,
"learning_rate": 4.9892765419465436e-05,
"loss": 1.2346,
"step": 19600
},
{
"epoch": 0.0394,
"grad_norm": 1.300850510597229,
"learning_rate": 4.989129238981913e-05,
"loss": 1.2748,
"step": 19700
},
{
"epoch": 0.0396,
"grad_norm": 1.2363704442977905,
"learning_rate": 4.988980933396323e-05,
"loss": 1.2536,
"step": 19800
},
{
"epoch": 0.0398,
"grad_norm": 0.8141745328903198,
"learning_rate": 4.9888316252495106e-05,
"loss": 1.2198,
"step": 19900
},
{
"epoch": 0.04,
"grad_norm": 1.0759721994400024,
"learning_rate": 4.988681314601617e-05,
"loss": 1.2225,
"step": 20000
},
{
"epoch": 0.04,
"eval_loss": 1.0962127447128296,
"eval_runtime": 76.7102,
"eval_samples_per_second": 201.316,
"eval_steps_per_second": 3.155,
"step": 20000
},
{
"epoch": 0.0402,
"grad_norm": 1.2748645544052124,
"learning_rate": 4.988530001513187e-05,
"loss": 1.2245,
"step": 20100
},
{
"epoch": 0.0404,
"grad_norm": 1.3192243576049805,
"learning_rate": 4.9883776860451704e-05,
"loss": 1.2292,
"step": 20200
},
{
"epoch": 0.0406,
"grad_norm": 1.329868197441101,
"learning_rate": 4.98822436825892e-05,
"loss": 1.2243,
"step": 20300
},
{
"epoch": 0.0408,
"grad_norm": 1.3394356966018677,
"learning_rate": 4.988070048216191e-05,
"loss": 1.216,
"step": 20400
},
{
"epoch": 0.041,
"grad_norm": 1.3035671710968018,
"learning_rate": 4.987914725979144e-05,
"loss": 1.2335,
"step": 20500
},
{
"epoch": 0.0412,
"grad_norm": 1.2765480279922485,
"learning_rate": 4.987758401610343e-05,
"loss": 1.261,
"step": 20600
},
{
"epoch": 0.0414,
"grad_norm": 1.0472270250320435,
"learning_rate": 4.9876010751727553e-05,
"loss": 1.2173,
"step": 20700
},
{
"epoch": 0.0416,
"grad_norm": 1.163237452507019,
"learning_rate": 4.9874427467297525e-05,
"loss": 1.2316,
"step": 20800
},
{
"epoch": 0.0418,
"grad_norm": 1.3546457290649414,
"learning_rate": 4.987283416345109e-05,
"loss": 1.2268,
"step": 20900
},
{
"epoch": 0.042,
"grad_norm": 1.0812748670578003,
"learning_rate": 4.9871230840830016e-05,
"loss": 1.2267,
"step": 21000
},
{
"epoch": 0.042,
"eval_loss": 1.1046785116195679,
"eval_runtime": 76.3631,
"eval_samples_per_second": 202.231,
"eval_steps_per_second": 3.169,
"step": 21000
},
{
"epoch": 0.0422,
"grad_norm": 0.7458230257034302,
"learning_rate": 4.986961750008014e-05,
"loss": 1.1918,
"step": 21100
},
{
"epoch": 0.0424,
"grad_norm": 1.2837951183319092,
"learning_rate": 4.986799414185131e-05,
"loss": 1.2206,
"step": 21200
},
{
"epoch": 0.0426,
"grad_norm": 1.4213489294052124,
"learning_rate": 4.986636076679742e-05,
"loss": 1.2552,
"step": 21300
},
{
"epoch": 0.0428,
"grad_norm": 1.297608733177185,
"learning_rate": 4.986471737557638e-05,
"loss": 1.2234,
"step": 21400
},
{
"epoch": 0.043,
"grad_norm": 1.3617885112762451,
"learning_rate": 4.986306396885015e-05,
"loss": 1.2381,
"step": 21500
},
{
"epoch": 0.0432,
"grad_norm": 1.500025749206543,
"learning_rate": 4.986140054728473e-05,
"loss": 1.1957,
"step": 21600
},
{
"epoch": 0.0434,
"grad_norm": 0.6222732663154602,
"learning_rate": 4.9859727111550147e-05,
"loss": 1.2579,
"step": 21700
},
{
"epoch": 0.0436,
"grad_norm": 1.4154349565505981,
"learning_rate": 4.985804366232045e-05,
"loss": 1.2073,
"step": 21800
},
{
"epoch": 0.0438,
"grad_norm": 1.334390640258789,
"learning_rate": 4.9856350200273746e-05,
"loss": 1.2317,
"step": 21900
},
{
"epoch": 0.044,
"grad_norm": 0.8164774179458618,
"learning_rate": 4.985464672609215e-05,
"loss": 1.2248,
"step": 22000
},
{
"epoch": 0.044,
"eval_loss": 1.1025385856628418,
"eval_runtime": 76.8498,
"eval_samples_per_second": 200.951,
"eval_steps_per_second": 3.149,
"step": 22000
},
{
"epoch": 0.0442,
"grad_norm": 1.1641725301742554,
"learning_rate": 4.985293324046182e-05,
"loss": 1.1928,
"step": 22100
},
{
"epoch": 0.0444,
"grad_norm": 1.2185006141662598,
"learning_rate": 4.9851209744072954e-05,
"loss": 1.2435,
"step": 22200
},
{
"epoch": 0.0446,
"grad_norm": 1.0973742008209229,
"learning_rate": 4.9849476237619784e-05,
"loss": 1.2515,
"step": 22300
},
{
"epoch": 0.0448,
"grad_norm": 1.0242998600006104,
"learning_rate": 4.984773272180056e-05,
"loss": 1.2511,
"step": 22400
},
{
"epoch": 0.045,
"grad_norm": 0.598416805267334,
"learning_rate": 4.984597919731755e-05,
"loss": 1.215,
"step": 22500
},
{
"epoch": 0.0452,
"grad_norm": 0.9391146302223206,
"learning_rate": 4.98442156648771e-05,
"loss": 1.2303,
"step": 22600
},
{
"epoch": 0.0454,
"grad_norm": 0.9301611185073853,
"learning_rate": 4.9842442125189556e-05,
"loss": 1.2621,
"step": 22700
},
{
"epoch": 0.0456,
"grad_norm": 1.3423951864242554,
"learning_rate": 4.984065857896928e-05,
"loss": 1.2251,
"step": 22800
},
{
"epoch": 0.0458,
"grad_norm": 1.3373651504516602,
"learning_rate": 4.983886502693471e-05,
"loss": 1.2738,
"step": 22900
},
{
"epoch": 0.046,
"grad_norm": 1.007158637046814,
"learning_rate": 4.983706146980828e-05,
"loss": 1.1923,
"step": 23000
},
{
"epoch": 0.046,
"eval_loss": 1.1094993352890015,
"eval_runtime": 76.6473,
"eval_samples_per_second": 201.481,
"eval_steps_per_second": 3.157,
"step": 23000
},
{
"epoch": 0.0462,
"grad_norm": 0.7804542779922485,
"learning_rate": 4.9835247908316454e-05,
"loss": 1.2098,
"step": 23100
},
{
"epoch": 0.0464,
"grad_norm": 1.377008318901062,
"learning_rate": 4.983342434318975e-05,
"loss": 1.2202,
"step": 23200
},
{
"epoch": 0.0466,
"grad_norm": 1.1037031412124634,
"learning_rate": 4.983159077516268e-05,
"loss": 1.1977,
"step": 23300
},
{
"epoch": 0.0468,
"grad_norm": 0.7141278386116028,
"learning_rate": 4.982974720497382e-05,
"loss": 1.2054,
"step": 23400
},
{
"epoch": 0.047,
"grad_norm": 0.570811927318573,
"learning_rate": 4.9827893633365754e-05,
"loss": 1.2163,
"step": 23500
},
{
"epoch": 0.0472,
"grad_norm": 0.7255613803863525,
"learning_rate": 4.98260300610851e-05,
"loss": 1.2212,
"step": 23600
},
{
"epoch": 0.0474,
"grad_norm": 0.8988520503044128,
"learning_rate": 4.982415648888251e-05,
"loss": 1.2332,
"step": 23700
},
{
"epoch": 0.0476,
"grad_norm": 1.2191438674926758,
"learning_rate": 4.9822272917512644e-05,
"loss": 1.1974,
"step": 23800
},
{
"epoch": 0.0478,
"grad_norm": 1.2043516635894775,
"learning_rate": 4.982037934773423e-05,
"loss": 1.2229,
"step": 23900
},
{
"epoch": 0.048,
"grad_norm": 1.3503689765930176,
"learning_rate": 4.981847578030998e-05,
"loss": 1.2307,
"step": 24000
},
{
"epoch": 0.048,
"eval_loss": 1.0969973802566528,
"eval_runtime": 76.7433,
"eval_samples_per_second": 201.229,
"eval_steps_per_second": 3.153,
"step": 24000
},
{
"epoch": 0.0482,
"grad_norm": 1.3795185089111328,
"learning_rate": 4.9816562216006645e-05,
"loss": 1.1894,
"step": 24100
},
{
"epoch": 0.0484,
"grad_norm": 1.1966140270233154,
"learning_rate": 4.9814638655595024e-05,
"loss": 1.2011,
"step": 24200
},
{
"epoch": 0.0486,
"grad_norm": 1.179077386856079,
"learning_rate": 4.981270509984992e-05,
"loss": 1.2596,
"step": 24300
},
{
"epoch": 0.0488,
"grad_norm": 1.24593186378479,
"learning_rate": 4.9810761549550166e-05,
"loss": 1.2219,
"step": 24400
},
{
"epoch": 0.049,
"grad_norm": 1.2809820175170898,
"learning_rate": 4.9808808005478635e-05,
"loss": 1.2033,
"step": 24500
},
{
"epoch": 0.0492,
"grad_norm": 0.9016757011413574,
"learning_rate": 4.9806844468422196e-05,
"loss": 1.2394,
"step": 24600
},
{
"epoch": 0.0494,
"grad_norm": 0.7064381837844849,
"learning_rate": 4.9804870939171774e-05,
"loss": 1.2154,
"step": 24700
},
{
"epoch": 0.0496,
"grad_norm": 0.626646101474762,
"learning_rate": 4.980288741852231e-05,
"loss": 1.2021,
"step": 24800
},
{
"epoch": 0.0498,
"grad_norm": 1.049187421798706,
"learning_rate": 4.980089390727275e-05,
"loss": 1.1839,
"step": 24900
},
{
"epoch": 0.05,
"grad_norm": 1.2987581491470337,
"learning_rate": 4.97988904062261e-05,
"loss": 1.1969,
"step": 25000
},
{
"epoch": 0.05,
"eval_loss": 1.090114951133728,
"eval_runtime": 77.5992,
"eval_samples_per_second": 199.01,
"eval_steps_per_second": 3.119,
"step": 25000
},
{
"epoch": 0.0502,
"grad_norm": 1.105361819267273,
"learning_rate": 4.979687691618936e-05,
"loss": 1.1784,
"step": 25100
},
{
"epoch": 0.0504,
"grad_norm": 0.7138956189155579,
"learning_rate": 4.9794853437973555e-05,
"loss": 1.2016,
"step": 25200
},
{
"epoch": 0.0506,
"grad_norm": 1.250241756439209,
"learning_rate": 4.9792819972393756e-05,
"loss": 1.2032,
"step": 25300
},
{
"epoch": 0.0508,
"grad_norm": 0.5875529050827026,
"learning_rate": 4.9790776520269034e-05,
"loss": 1.2034,
"step": 25400
},
{
"epoch": 0.051,
"grad_norm": 1.2880475521087646,
"learning_rate": 4.9788723082422495e-05,
"loss": 1.2172,
"step": 25500
},
{
"epoch": 0.0512,
"grad_norm": 0.8775302767753601,
"learning_rate": 4.978665965968127e-05,
"loss": 1.2264,
"step": 25600
},
{
"epoch": 0.0514,
"grad_norm": 0.7336851954460144,
"learning_rate": 4.978458625287649e-05,
"loss": 1.2248,
"step": 25700
},
{
"epoch": 0.0516,
"grad_norm": 1.431084156036377,
"learning_rate": 4.978250286284333e-05,
"loss": 1.2353,
"step": 25800
},
{
"epoch": 0.0518,
"grad_norm": 1.6342276334762573,
"learning_rate": 4.978040949042099e-05,
"loss": 1.1984,
"step": 25900
},
{
"epoch": 0.052,
"grad_norm": 1.5883526802062988,
"learning_rate": 4.977830613645266e-05,
"loss": 1.2251,
"step": 26000
},
{
"epoch": 0.052,
"eval_loss": 1.0901614427566528,
"eval_runtime": 76.7254,
"eval_samples_per_second": 201.276,
"eval_steps_per_second": 3.154,
"step": 26000
},
{
"epoch": 0.0522,
"grad_norm": 1.1527795791625977,
"learning_rate": 4.977619280178558e-05,
"loss": 1.2043,
"step": 26100
},
{
"epoch": 0.0524,
"grad_norm": 1.5160431861877441,
"learning_rate": 4.9774069487271014e-05,
"loss": 1.1931,
"step": 26200
},
{
"epoch": 0.0526,
"grad_norm": 1.2551748752593994,
"learning_rate": 4.977193619376421e-05,
"loss": 1.2397,
"step": 26300
},
{
"epoch": 0.0528,
"grad_norm": 1.2745076417922974,
"learning_rate": 4.976979292212448e-05,
"loss": 1.2336,
"step": 26400
},
{
"epoch": 0.053,
"grad_norm": 1.4893673658370972,
"learning_rate": 4.976763967321511e-05,
"loss": 1.1827,
"step": 26500
},
{
"epoch": 0.0532,
"grad_norm": 0.857379138469696,
"learning_rate": 4.976547644790346e-05,
"loss": 1.2441,
"step": 26600
},
{
"epoch": 0.0534,
"grad_norm": 1.167006492614746,
"learning_rate": 4.976330324706084e-05,
"loss": 1.2779,
"step": 26700
},
{
"epoch": 0.0536,
"grad_norm": 0.634842574596405,
"learning_rate": 4.976112007156265e-05,
"loss": 1.2828,
"step": 26800
},
{
"epoch": 0.0538,
"grad_norm": 0.9239290952682495,
"learning_rate": 4.975892692228825e-05,
"loss": 1.2094,
"step": 26900
},
{
"epoch": 0.054,
"grad_norm": 1.2031028270721436,
"learning_rate": 4.9756723800121044e-05,
"loss": 1.222,
"step": 27000
},
{
"epoch": 0.054,
"eval_loss": 1.0867078304290771,
"eval_runtime": 76.6606,
"eval_samples_per_second": 201.446,
"eval_steps_per_second": 3.157,
"step": 27000
},
{
"epoch": 0.0542,
"grad_norm": 1.3575947284698486,
"learning_rate": 4.9754510705948456e-05,
"loss": 1.1622,
"step": 27100
},
{
"epoch": 0.0544,
"grad_norm": 1.142074465751648,
"learning_rate": 4.975228764066191e-05,
"loss": 1.2703,
"step": 27200
},
{
"epoch": 0.0546,
"grad_norm": 0.8273721933364868,
"learning_rate": 4.975005460515686e-05,
"loss": 1.1921,
"step": 27300
},
{
"epoch": 0.0548,
"grad_norm": 1.3859556913375854,
"learning_rate": 4.974781160033278e-05,
"loss": 1.2195,
"step": 27400
},
{
"epoch": 0.055,
"grad_norm": 1.2232416868209839,
"learning_rate": 4.974555862709315e-05,
"loss": 1.1851,
"step": 27500
},
{
"epoch": 0.0552,
"grad_norm": 0.7069573998451233,
"learning_rate": 4.974329568634546e-05,
"loss": 1.2098,
"step": 27600
},
{
"epoch": 0.0554,
"grad_norm": 1.2497153282165527,
"learning_rate": 4.974102277900122e-05,
"loss": 1.206,
"step": 27700
},
{
"epoch": 0.0556,
"grad_norm": 1.206449031829834,
"learning_rate": 4.9738739905975976e-05,
"loss": 1.2352,
"step": 27800
},
{
"epoch": 0.0558,
"grad_norm": 1.3927749395370483,
"learning_rate": 4.973644706818925e-05,
"loss": 1.1952,
"step": 27900
},
{
"epoch": 0.056,
"grad_norm": 1.3856321573257446,
"learning_rate": 4.973414426656461e-05,
"loss": 1.2499,
"step": 28000
},
{
"epoch": 0.056,
"eval_loss": 1.0941141843795776,
"eval_runtime": 76.7063,
"eval_samples_per_second": 201.326,
"eval_steps_per_second": 3.155,
"step": 28000
},
{
"epoch": 0.0562,
"grad_norm": 0.6676329970359802,
"learning_rate": 4.9731831502029606e-05,
"loss": 1.2333,
"step": 28100
},
{
"epoch": 0.0564,
"grad_norm": 1.2670732736587524,
"learning_rate": 4.972950877551584e-05,
"loss": 1.183,
"step": 28200
},
{
"epoch": 0.0566,
"grad_norm": 1.2089595794677734,
"learning_rate": 4.972717608795889e-05,
"loss": 1.2445,
"step": 28300
},
{
"epoch": 0.0568,
"grad_norm": 1.1897366046905518,
"learning_rate": 4.972483344029838e-05,
"loss": 1.2217,
"step": 28400
},
{
"epoch": 0.057,
"grad_norm": 1.4963501691818237,
"learning_rate": 4.97224808334779e-05,
"loss": 1.2079,
"step": 28500
},
{
"epoch": 0.0572,
"grad_norm": 1.594019889831543,
"learning_rate": 4.972011826844511e-05,
"loss": 1.1822,
"step": 28600
},
{
"epoch": 0.0574,
"grad_norm": 1.3324779272079468,
"learning_rate": 4.971774574615163e-05,
"loss": 1.2562,
"step": 28700
},
{
"epoch": 0.0576,
"grad_norm": 1.3334344625473022,
"learning_rate": 4.971536326755313e-05,
"loss": 1.2509,
"step": 28800
},
{
"epoch": 0.0578,
"grad_norm": 0.9475389719009399,
"learning_rate": 4.971297083360925e-05,
"loss": 1.1826,
"step": 28900
},
{
"epoch": 0.058,
"grad_norm": 0.8067657947540283,
"learning_rate": 4.971056844528368e-05,
"loss": 1.1895,
"step": 29000
},
{
"epoch": 0.058,
"eval_loss": 1.0870901346206665,
"eval_runtime": 76.6141,
"eval_samples_per_second": 201.569,
"eval_steps_per_second": 3.159,
"step": 29000
},
{
"epoch": 0.0582,
"grad_norm": 0.7364763617515564,
"learning_rate": 4.970815610354409e-05,
"loss": 1.1821,
"step": 29100
},
{
"epoch": 0.0584,
"grad_norm": 1.494878888130188,
"learning_rate": 4.970573380936218e-05,
"loss": 1.1592,
"step": 29200
},
{
"epoch": 0.0586,
"grad_norm": 0.7247675061225891,
"learning_rate": 4.9703301563713645e-05,
"loss": 1.2347,
"step": 29300
},
{
"epoch": 0.0588,
"grad_norm": 1.0013625621795654,
"learning_rate": 4.970085936757819e-05,
"loss": 1.2536,
"step": 29400
},
{
"epoch": 0.059,
"grad_norm": 1.012537956237793,
"learning_rate": 4.969840722193955e-05,
"loss": 1.2461,
"step": 29500
},
{
"epoch": 0.0592,
"grad_norm": 0.8702846169471741,
"learning_rate": 4.969594512778541e-05,
"loss": 1.2005,
"step": 29600
},
{
"epoch": 0.0594,
"grad_norm": 1.1068499088287354,
"learning_rate": 4.969347308610755e-05,
"loss": 1.1942,
"step": 29700
},
{
"epoch": 0.0596,
"grad_norm": 1.6333682537078857,
"learning_rate": 4.969099109790167e-05,
"loss": 1.2372,
"step": 29800
},
{
"epoch": 0.0598,
"grad_norm": 1.0337685346603394,
"learning_rate": 4.9688499164167536e-05,
"loss": 1.2435,
"step": 29900
},
{
"epoch": 0.06,
"grad_norm": 0.8429011702537537,
"learning_rate": 4.9685997285908894e-05,
"loss": 1.2023,
"step": 30000
},
{
"epoch": 0.06,
"eval_loss": 1.086748480796814,
"eval_runtime": 76.8684,
"eval_samples_per_second": 200.902,
"eval_steps_per_second": 3.148,
"step": 30000
},
{
"epoch": 0.0002,
"grad_norm": 0.8381020426750183,
"learning_rate": 4.9683485464133484e-05,
"loss": 1.2362,
"step": 30100
},
{
"epoch": 0.0004,
"grad_norm": 0.6860467791557312,
"learning_rate": 4.968096369985309e-05,
"loss": 1.2125,
"step": 30200
},
{
"epoch": 0.0006,
"grad_norm": 0.9316505193710327,
"learning_rate": 4.967843199408347e-05,
"loss": 1.1904,
"step": 30300
},
{
"epoch": 0.0008,
"grad_norm": 1.3389461040496826,
"learning_rate": 4.967589034784439e-05,
"loss": 1.2689,
"step": 30400
},
{
"epoch": 0.001,
"grad_norm": 0.9387079477310181,
"learning_rate": 4.967333876215963e-05,
"loss": 1.2205,
"step": 30500
},
{
"epoch": 0.0012,
"grad_norm": 0.7549923062324524,
"learning_rate": 4.967077723805697e-05,
"loss": 1.21,
"step": 30600
},
{
"epoch": 0.0014,
"grad_norm": 1.1242858171463013,
"learning_rate": 4.966820577656819e-05,
"loss": 1.203,
"step": 30700
},
{
"epoch": 0.0016,
"grad_norm": 1.5065937042236328,
"learning_rate": 4.966562437872907e-05,
"loss": 1.2233,
"step": 30800
},
{
"epoch": 0.0018,
"grad_norm": 1.1448508501052856,
"learning_rate": 4.96630330455794e-05,
"loss": 1.2242,
"step": 30900
},
{
"epoch": 0.002,
"grad_norm": 0.7356053590774536,
"learning_rate": 4.966043177816296e-05,
"loss": 1.2541,
"step": 31000
},
{
"epoch": 0.002,
"eval_loss": 1.0892270803451538,
"eval_runtime": 78.1396,
"eval_samples_per_second": 197.633,
"eval_steps_per_second": 3.097,
"step": 31000
},
{
"epoch": 0.0022,
"grad_norm": 1.290472149848938,
"learning_rate": 4.965782057752757e-05,
"loss": 1.2005,
"step": 31100
},
{
"epoch": 0.0024,
"grad_norm": 0.7970076203346252,
"learning_rate": 4.965519944472498e-05,
"loss": 1.2718,
"step": 31200
},
{
"epoch": 0.0026,
"grad_norm": 1.3415039777755737,
"learning_rate": 4.9652568380811016e-05,
"loss": 1.2673,
"step": 31300
},
{
"epoch": 0.0028,
"grad_norm": 1.3146836757659912,
"learning_rate": 4.9649927386845444e-05,
"loss": 1.2717,
"step": 31400
},
{
"epoch": 0.003,
"grad_norm": 0.9725894927978516,
"learning_rate": 4.964727646389208e-05,
"loss": 1.2418,
"step": 31500
},
{
"epoch": 0.0032,
"grad_norm": 0.9590099453926086,
"learning_rate": 4.96446156130187e-05,
"loss": 1.2389,
"step": 31600
},
{
"epoch": 0.0034,
"grad_norm": 1.5478194952011108,
"learning_rate": 4.964194483529709e-05,
"loss": 1.2693,
"step": 31700
},
{
"epoch": 0.0036,
"grad_norm": 0.7029865384101868,
"learning_rate": 4.9639264131803056e-05,
"loss": 1.25,
"step": 31800
},
{
"epoch": 0.0038,
"grad_norm": 0.7784998416900635,
"learning_rate": 4.963657350361637e-05,
"loss": 1.2339,
"step": 31900
},
{
"epoch": 0.004,
"grad_norm": 0.6479517817497253,
"learning_rate": 4.963387295182083e-05,
"loss": 1.2538,
"step": 32000
},
{
"epoch": 0.004,
"eval_loss": 1.0948545932769775,
"eval_runtime": 77.4713,
"eval_samples_per_second": 199.338,
"eval_steps_per_second": 3.124,
"step": 32000
},
{
"epoch": 0.0042,
"grad_norm": 1.4759093523025513,
"learning_rate": 4.963116247750421e-05,
"loss": 1.2646,
"step": 32100
},
{
"epoch": 0.0044,
"grad_norm": 0.7561829686164856,
"learning_rate": 4.9628442081758285e-05,
"loss": 1.2083,
"step": 32200
},
{
"epoch": 0.0046,
"grad_norm": 0.6289774775505066,
"learning_rate": 4.962571176567884e-05,
"loss": 1.2492,
"step": 32300
},
{
"epoch": 0.0048,
"grad_norm": 0.8146848678588867,
"learning_rate": 4.962297153036564e-05,
"loss": 1.2693,
"step": 32400
},
{
"epoch": 0.005,
"grad_norm": 1.1135525703430176,
"learning_rate": 4.962022137692245e-05,
"loss": 1.2218,
"step": 32500
},
{
"epoch": 0.0052,
"grad_norm": 1.1507619619369507,
"learning_rate": 4.961746130645703e-05,
"loss": 1.2118,
"step": 32600
},
{
"epoch": 0.0054,
"grad_norm": 0.8586376905441284,
"learning_rate": 4.961469132008114e-05,
"loss": 1.2115,
"step": 32700
},
{
"epoch": 0.0056,
"grad_norm": 1.5335224866867065,
"learning_rate": 4.961191141891054e-05,
"loss": 1.2239,
"step": 32800
},
{
"epoch": 0.0058,
"grad_norm": 1.2822892665863037,
"learning_rate": 4.960912160406496e-05,
"loss": 1.2443,
"step": 32900
},
{
"epoch": 0.006,
"grad_norm": 0.9584761261940002,
"learning_rate": 4.960632187666814e-05,
"loss": 1.243,
"step": 33000
},
{
"epoch": 0.006,
"eval_loss": 1.0964241027832031,
"eval_runtime": 76.2571,
"eval_samples_per_second": 202.512,
"eval_steps_per_second": 3.173,
"step": 33000
},
{
"epoch": 0.0062,
"grad_norm": 0.7512497305870056,
"learning_rate": 4.960351223784781e-05,
"loss": 1.1821,
"step": 33100
},
{
"epoch": 0.0064,
"grad_norm": 1.3305505514144897,
"learning_rate": 4.960069268873568e-05,
"loss": 1.2393,
"step": 33200
},
{
"epoch": 0.0066,
"grad_norm": 1.5360506772994995,
"learning_rate": 4.959786323046749e-05,
"loss": 1.2475,
"step": 33300
},
{
"epoch": 0.0068,
"grad_norm": 0.7005806565284729,
"learning_rate": 4.959502386418293e-05,
"loss": 1.2122,
"step": 33400
},
{
"epoch": 0.007,
"grad_norm": 1.381052017211914,
"learning_rate": 4.95921745910257e-05,
"loss": 1.2336,
"step": 33500
},
{
"epoch": 0.0072,
"grad_norm": 1.074300765991211,
"learning_rate": 4.958931541214349e-05,
"loss": 1.2661,
"step": 33600
},
{
"epoch": 0.0074,
"grad_norm": 1.1441256999969482,
"learning_rate": 4.9586446328687967e-05,
"loss": 1.2296,
"step": 33700
},
{
"epoch": 0.0076,
"grad_norm": 0.8737586140632629,
"learning_rate": 4.958356734181481e-05,
"loss": 1.2067,
"step": 33800
},
{
"epoch": 0.0078,
"grad_norm": 1.1493791341781616,
"learning_rate": 4.958067845268366e-05,
"loss": 1.2643,
"step": 33900
},
{
"epoch": 0.008,
"grad_norm": 1.3028621673583984,
"learning_rate": 4.957777966245817e-05,
"loss": 1.2427,
"step": 34000
},
{
"epoch": 0.008,
"eval_loss": 1.096444845199585,
"eval_runtime": 76.4253,
"eval_samples_per_second": 202.067,
"eval_steps_per_second": 3.166,
"step": 34000
},
{
"epoch": 0.0082,
"grad_norm": 1.31423819065094,
"learning_rate": 4.957487097230597e-05,
"loss": 1.2137,
"step": 34100
},
{
"epoch": 0.0084,
"grad_norm": 1.1846545934677124,
"learning_rate": 4.957195238339868e-05,
"loss": 1.2141,
"step": 34200
},
{
"epoch": 0.0086,
"grad_norm": 0.9421952366828918,
"learning_rate": 4.9569023896911914e-05,
"loss": 1.219,
"step": 34300
},
{
"epoch": 0.0088,
"grad_norm": 1.4107282161712646,
"learning_rate": 4.9566085514025256e-05,
"loss": 1.2141,
"step": 34400
},
{
"epoch": 0.009,
"grad_norm": 0.7364057302474976,
"learning_rate": 4.95631372359223e-05,
"loss": 1.246,
"step": 34500
},
{
"epoch": 0.0092,
"grad_norm": 0.8100732564926147,
"learning_rate": 4.956017906379059e-05,
"loss": 1.1891,
"step": 34600
},
{
"epoch": 0.0094,
"grad_norm": 1.2455086708068848,
"learning_rate": 4.955721099882169e-05,
"loss": 1.2458,
"step": 34700
},
{
"epoch": 0.0096,
"grad_norm": 0.676437497138977,
"learning_rate": 4.9554233042211146e-05,
"loss": 1.2058,
"step": 34800
},
{
"epoch": 0.0098,
"grad_norm": 1.3339647054672241,
"learning_rate": 4.955124519515847e-05,
"loss": 1.2407,
"step": 34900
},
{
"epoch": 0.01,
"grad_norm": 0.9411395192146301,
"learning_rate": 4.954824745886716e-05,
"loss": 1.1974,
"step": 35000
},
{
"epoch": 0.01,
"eval_loss": 1.0945005416870117,
"eval_runtime": 76.9422,
"eval_samples_per_second": 200.709,
"eval_steps_per_second": 3.145,
"step": 35000
},
{
"epoch": 0.0102,
"grad_norm": 0.6638602018356323,
"learning_rate": 4.95452398345447e-05,
"loss": 1.2259,
"step": 35100
},
{
"epoch": 0.0104,
"grad_norm": 0.6337453722953796,
"learning_rate": 4.954222232340259e-05,
"loss": 1.1686,
"step": 35200
},
{
"epoch": 0.0106,
"grad_norm": 0.809762179851532,
"learning_rate": 4.953919492665625e-05,
"loss": 1.2174,
"step": 35300
},
{
"epoch": 0.0108,
"grad_norm": 0.9431924819946289,
"learning_rate": 4.953615764552513e-05,
"loss": 1.2128,
"step": 35400
},
{
"epoch": 0.011,
"grad_norm": 0.7606577277183533,
"learning_rate": 4.953311048123265e-05,
"loss": 1.2473,
"step": 35500
},
{
"epoch": 0.0112,
"grad_norm": 1.1843669414520264,
"learning_rate": 4.953005343500619e-05,
"loss": 1.2194,
"step": 35600
},
{
"epoch": 0.0114,
"grad_norm": 0.9086577296257019,
"learning_rate": 4.952698650807715e-05,
"loss": 1.2572,
"step": 35700
},
{
"epoch": 0.0116,
"grad_norm": 1.36215078830719,
"learning_rate": 4.9523909701680874e-05,
"loss": 1.2263,
"step": 35800
},
{
"epoch": 0.0118,
"grad_norm": 0.8537183403968811,
"learning_rate": 4.952082301705671e-05,
"loss": 1.2297,
"step": 35900
},
{
"epoch": 0.012,
"grad_norm": 0.6182298064231873,
"learning_rate": 4.9517726455447955e-05,
"loss": 1.2101,
"step": 36000
},
{
"epoch": 0.012,
"eval_loss": 1.0894391536712646,
"eval_runtime": 76.3033,
"eval_samples_per_second": 202.39,
"eval_steps_per_second": 3.172,
"step": 36000
},
{
"epoch": 0.0122,
"grad_norm": 1.1102640628814697,
"learning_rate": 4.951462001810192e-05,
"loss": 1.2086,
"step": 36100
},
{
"epoch": 0.0124,
"grad_norm": 0.9391844868659973,
"learning_rate": 4.951150370626988e-05,
"loss": 1.2595,
"step": 36200
},
{
"epoch": 0.0126,
"grad_norm": 1.3386393785476685,
"learning_rate": 4.950837752120707e-05,
"loss": 1.1953,
"step": 36300
},
{
"epoch": 0.0128,
"grad_norm": 1.0943065881729126,
"learning_rate": 4.950524146417273e-05,
"loss": 1.2759,
"step": 36400
},
{
"epoch": 0.013,
"grad_norm": 0.9743318557739258,
"learning_rate": 4.950209553643006e-05,
"loss": 1.2421,
"step": 36500
},
{
"epoch": 0.0132,
"grad_norm": 1.2555447816848755,
"learning_rate": 4.949893973924623e-05,
"loss": 1.242,
"step": 36600
},
{
"epoch": 0.0134,
"grad_norm": 1.3289902210235596,
"learning_rate": 4.949577407389241e-05,
"loss": 1.2337,
"step": 36700
},
{
"epoch": 0.0136,
"grad_norm": 0.8806101679801941,
"learning_rate": 4.949259854164372e-05,
"loss": 1.244,
"step": 36800
},
{
"epoch": 0.0138,
"grad_norm": 1.211584448814392,
"learning_rate": 4.948941314377927e-05,
"loss": 1.2344,
"step": 36900
},
{
"epoch": 0.014,
"grad_norm": 1.6472032070159912,
"learning_rate": 4.9486217881582134e-05,
"loss": 1.1866,
"step": 37000
},
{
"epoch": 0.014,
"eval_loss": 1.0940065383911133,
"eval_runtime": 76.3383,
"eval_samples_per_second": 202.297,
"eval_steps_per_second": 3.17,
"step": 37000
},
{
"epoch": 0.0142,
"grad_norm": 1.5010918378829956,
"learning_rate": 4.948301275633936e-05,
"loss": 1.2057,
"step": 37100
},
{
"epoch": 0.0144,
"grad_norm": 0.6793572306632996,
"learning_rate": 4.947979776934197e-05,
"loss": 1.2104,
"step": 37200
},
{
"epoch": 0.0146,
"grad_norm": 0.7654362916946411,
"learning_rate": 4.947657292188498e-05,
"loss": 1.2266,
"step": 37300
},
{
"epoch": 0.0148,
"grad_norm": 1.0618220567703247,
"learning_rate": 4.947333821526734e-05,
"loss": 1.2509,
"step": 37400
},
{
"epoch": 0.015,
"grad_norm": 1.2712790966033936,
"learning_rate": 4.947009365079199e-05,
"loss": 1.2179,
"step": 37500
},
{
"epoch": 0.0152,
"grad_norm": 1.3342602252960205,
"learning_rate": 4.946683922976584e-05,
"loss": 1.2224,
"step": 37600
},
{
"epoch": 0.0154,
"grad_norm": 0.8218332529067993,
"learning_rate": 4.946357495349978e-05,
"loss": 1.2402,
"step": 37700
},
{
"epoch": 0.0156,
"grad_norm": 2.0291969776153564,
"learning_rate": 4.946030082330865e-05,
"loss": 1.1599,
"step": 37800
},
{
"epoch": 0.0158,
"grad_norm": 1.5702838897705078,
"learning_rate": 4.945701684051128e-05,
"loss": 1.1784,
"step": 37900
},
{
"epoch": 0.016,
"grad_norm": 1.268508791923523,
"learning_rate": 4.9453723006430444e-05,
"loss": 1.2172,
"step": 38000
},
{
"epoch": 0.016,
"eval_loss": 1.088572382926941,
"eval_runtime": 76.533,
"eval_samples_per_second": 201.782,
"eval_steps_per_second": 3.162,
"step": 38000
},
{
"epoch": 0.0162,
"grad_norm": 1.3127037286758423,
"learning_rate": 4.945041932239292e-05,
"loss": 1.2299,
"step": 38100
},
{
"epoch": 0.0164,
"grad_norm": 0.7277888655662537,
"learning_rate": 4.9447105789729396e-05,
"loss": 1.2655,
"step": 38200
},
{
"epoch": 0.0166,
"grad_norm": 1.031909704208374,
"learning_rate": 4.94437824097746e-05,
"loss": 1.2179,
"step": 38300
},
{
"epoch": 0.0168,
"grad_norm": 1.2462060451507568,
"learning_rate": 4.9440449183867166e-05,
"loss": 1.2311,
"step": 38400
},
{
"epoch": 0.017,
"grad_norm": 0.5426816344261169,
"learning_rate": 4.9437106113349716e-05,
"loss": 1.1637,
"step": 38500
},
{
"epoch": 0.0172,
"grad_norm": 1.2320595979690552,
"learning_rate": 4.9433753199568856e-05,
"loss": 1.2282,
"step": 38600
},
{
"epoch": 0.0174,
"grad_norm": 0.928945779800415,
"learning_rate": 4.943039044387513e-05,
"loss": 1.1936,
"step": 38700
},
{
"epoch": 0.0176,
"grad_norm": 1.4080160856246948,
"learning_rate": 4.9427017847623044e-05,
"loss": 1.251,
"step": 38800
},
{
"epoch": 0.0178,
"grad_norm": 1.3436859846115112,
"learning_rate": 4.9423635412171106e-05,
"loss": 1.287,
"step": 38900
},
{
"epoch": 0.018,
"grad_norm": 0.9334709048271179,
"learning_rate": 4.9420243138881734e-05,
"loss": 1.1766,
"step": 39000
},
{
"epoch": 0.018,
"eval_loss": 1.092005968093872,
"eval_runtime": 76.2687,
"eval_samples_per_second": 202.482,
"eval_steps_per_second": 3.173,
"step": 39000
},
{
"epoch": 0.0182,
"grad_norm": 0.9674895405769348,
"learning_rate": 4.9416841029121355e-05,
"loss": 1.2388,
"step": 39100
},
{
"epoch": 0.0184,
"grad_norm": 1.3673955202102661,
"learning_rate": 4.941342908426032e-05,
"loss": 1.183,
"step": 39200
},
{
"epoch": 0.0186,
"grad_norm": 0.8423133492469788,
"learning_rate": 4.941000730567297e-05,
"loss": 1.1847,
"step": 39300
},
{
"epoch": 0.0188,
"grad_norm": 0.9814749360084534,
"learning_rate": 4.94065756947376e-05,
"loss": 1.2022,
"step": 39400
},
{
"epoch": 0.019,
"grad_norm": 1.125647783279419,
"learning_rate": 4.9403134252836456e-05,
"loss": 1.1966,
"step": 39500
},
{
"epoch": 0.0192,
"grad_norm": 0.8501796722412109,
"learning_rate": 4.9399682981355755e-05,
"loss": 1.2347,
"step": 39600
},
{
"epoch": 0.0194,
"grad_norm": 0.8226144909858704,
"learning_rate": 4.9396221881685665e-05,
"loss": 1.2129,
"step": 39700
},
{
"epoch": 0.0196,
"grad_norm": 0.9265516400337219,
"learning_rate": 4.939275095522032e-05,
"loss": 1.1917,
"step": 39800
},
{
"epoch": 0.0198,
"grad_norm": 0.8538194298744202,
"learning_rate": 4.938927020335781e-05,
"loss": 1.2548,
"step": 39900
},
{
"epoch": 0.02,
"grad_norm": 1.2129065990447998,
"learning_rate": 4.9385779627500174e-05,
"loss": 1.2219,
"step": 40000
},
{
"epoch": 0.02,
"eval_loss": 1.087021827697754,
"eval_runtime": 76.3535,
"eval_samples_per_second": 202.257,
"eval_steps_per_second": 3.169,
"step": 40000
},
{
"epoch": 0.0202,
"grad_norm": 1.2157970666885376,
"learning_rate": 4.938227922905342e-05,
"loss": 1.1623,
"step": 40100
},
{
"epoch": 0.0204,
"grad_norm": 0.6873258948326111,
"learning_rate": 4.9378769009427515e-05,
"loss": 1.2088,
"step": 40200
},
{
"epoch": 0.0206,
"grad_norm": 1.139224886894226,
"learning_rate": 4.937524897003637e-05,
"loss": 1.2158,
"step": 40300
},
{
"epoch": 0.0208,
"grad_norm": 1.2190488576889038,
"learning_rate": 4.9371719112297845e-05,
"loss": 1.19,
"step": 40400
},
{
"epoch": 0.021,
"grad_norm": 1.2439500093460083,
"learning_rate": 4.936817943763378e-05,
"loss": 1.173,
"step": 40500
},
{
"epoch": 0.0212,
"grad_norm": 1.030110478401184,
"learning_rate": 4.936462994746995e-05,
"loss": 1.1995,
"step": 40600
},
{
"epoch": 0.0214,
"grad_norm": 0.666333794593811,
"learning_rate": 4.93610706432361e-05,
"loss": 1.2476,
"step": 40700
},
{
"epoch": 0.0216,
"grad_norm": 0.8477672934532166,
"learning_rate": 4.93575015263659e-05,
"loss": 1.2225,
"step": 40800
},
{
"epoch": 0.0218,
"grad_norm": 1.087173342704773,
"learning_rate": 4.9353922598296995e-05,
"loss": 1.1758,
"step": 40900
},
{
"epoch": 0.022,
"grad_norm": 1.2760623693466187,
"learning_rate": 4.935033386047099e-05,
"loss": 1.2811,
"step": 41000
},
{
"epoch": 0.022,
"eval_loss": 1.082631230354309,
"eval_runtime": 75.9811,
"eval_samples_per_second": 203.248,
"eval_steps_per_second": 3.185,
"step": 41000
},
{
"epoch": 0.0222,
"grad_norm": 1.0236754417419434,
"learning_rate": 4.934673531433341e-05,
"loss": 1.2283,
"step": 41100
},
{
"epoch": 0.0224,
"grad_norm": 1.509448766708374,
"learning_rate": 4.934312696133376e-05,
"loss": 1.1989,
"step": 41200
},
{
"epoch": 0.0226,
"grad_norm": 1.2022035121917725,
"learning_rate": 4.9339508802925475e-05,
"loss": 1.2247,
"step": 41300
},
{
"epoch": 0.0228,
"grad_norm": 1.4019054174423218,
"learning_rate": 4.933588084056596e-05,
"loss": 1.2201,
"step": 41400
},
{
"epoch": 0.023,
"grad_norm": 1.06856107711792,
"learning_rate": 4.933224307571655e-05,
"loss": 1.1789,
"step": 41500
},
{
"epoch": 0.0232,
"grad_norm": 1.0807596445083618,
"learning_rate": 4.932859550984255e-05,
"loss": 1.2361,
"step": 41600
},
{
"epoch": 0.0234,
"grad_norm": 1.20824134349823,
"learning_rate": 4.932493814441318e-05,
"loss": 1.2167,
"step": 41700
},
{
"epoch": 0.0236,
"grad_norm": 0.7066964507102966,
"learning_rate": 4.9321270980901635e-05,
"loss": 1.1941,
"step": 41800
},
{
"epoch": 0.0238,
"grad_norm": 0.7342857122421265,
"learning_rate": 4.9317594020785044e-05,
"loss": 1.1709,
"step": 41900
},
{
"epoch": 0.024,
"grad_norm": 1.239176630973816,
"learning_rate": 4.931390726554449e-05,
"loss": 1.2238,
"step": 42000
},
{
"epoch": 0.024,
"eval_loss": 1.0859261751174927,
"eval_runtime": 76.6051,
"eval_samples_per_second": 201.592,
"eval_steps_per_second": 3.159,
"step": 42000
},
{
"epoch": 0.0242,
"grad_norm": 0.9031541347503662,
"learning_rate": 4.9310210716665003e-05,
"loss": 1.1621,
"step": 42100
},
{
"epoch": 0.0244,
"grad_norm": 0.744767963886261,
"learning_rate": 4.930650437563554e-05,
"loss": 1.21,
"step": 42200
},
{
"epoch": 0.0246,
"grad_norm": 1.2594637870788574,
"learning_rate": 4.9302788243949025e-05,
"loss": 1.21,
"step": 42300
},
{
"epoch": 0.0248,
"grad_norm": 0.67472243309021,
"learning_rate": 4.929906232310231e-05,
"loss": 1.1785,
"step": 42400
},
{
"epoch": 0.025,
"grad_norm": 1.3947267532348633,
"learning_rate": 4.92953266145962e-05,
"loss": 1.1598,
"step": 42500
},
{
"epoch": 0.0252,
"grad_norm": 0.7739892601966858,
"learning_rate": 4.929158111993543e-05,
"loss": 1.1492,
"step": 42600
},
{
"epoch": 0.0254,
"grad_norm": 0.8620167970657349,
"learning_rate": 4.9287825840628695e-05,
"loss": 1.1863,
"step": 42700
},
{
"epoch": 0.0256,
"grad_norm": 0.7649038434028625,
"learning_rate": 4.928406077818861e-05,
"loss": 1.1782,
"step": 42800
},
{
"epoch": 0.0258,
"grad_norm": 1.2743923664093018,
"learning_rate": 4.9280285934131755e-05,
"loss": 1.2254,
"step": 42900
},
{
"epoch": 0.026,
"grad_norm": 0.6955134272575378,
"learning_rate": 4.927650130997862e-05,
"loss": 1.2254,
"step": 43000
},
{
"epoch": 0.026,
"eval_loss": 1.0833112001419067,
"eval_runtime": 77.475,
"eval_samples_per_second": 199.329,
"eval_steps_per_second": 3.124,
"step": 43000
},
{
"epoch": 0.0262,
"grad_norm": 0.8997926115989685,
"learning_rate": 4.927270690725367e-05,
"loss": 1.1989,
"step": 43100
},
{
"epoch": 0.0264,
"grad_norm": 1.3762701749801636,
"learning_rate": 4.9268902727485276e-05,
"loss": 1.1928,
"step": 43200
},
{
"epoch": 0.0266,
"grad_norm": 0.7553657293319702,
"learning_rate": 4.926508877220577e-05,
"loss": 1.2266,
"step": 43300
},
{
"epoch": 0.0268,
"grad_norm": 0.6331331133842468,
"learning_rate": 4.92612650429514e-05,
"loss": 1.2034,
"step": 43400
},
{
"epoch": 0.027,
"grad_norm": 0.6229783892631531,
"learning_rate": 4.925743154126238e-05,
"loss": 1.2123,
"step": 43500
},
{
"epoch": 0.0272,
"grad_norm": 1.2101593017578125,
"learning_rate": 4.9253588268682835e-05,
"loss": 1.2473,
"step": 43600
},
{
"epoch": 0.0274,
"grad_norm": 1.2178127765655518,
"learning_rate": 4.924973522676083e-05,
"loss": 1.2391,
"step": 43700
},
{
"epoch": 0.0276,
"grad_norm": 1.4870595932006836,
"learning_rate": 4.924587241704838e-05,
"loss": 1.2358,
"step": 43800
},
{
"epoch": 0.0278,
"grad_norm": 1.2042150497436523,
"learning_rate": 4.924199984110142e-05,
"loss": 1.1996,
"step": 43900
},
{
"epoch": 0.028,
"grad_norm": 1.3220444917678833,
"learning_rate": 4.923811750047982e-05,
"loss": 1.2052,
"step": 44000
},
{
"epoch": 0.028,
"eval_loss": 1.0859400033950806,
"eval_runtime": 76.6882,
"eval_samples_per_second": 201.374,
"eval_steps_per_second": 3.156,
"step": 44000
},
{
"epoch": 0.0282,
"grad_norm": 1.464141607284546,
"learning_rate": 4.923422539674739e-05,
"loss": 1.2326,
"step": 44100
},
{
"epoch": 0.0284,
"grad_norm": 1.2406100034713745,
"learning_rate": 4.923032353147187e-05,
"loss": 1.2092,
"step": 44200
},
{
"epoch": 0.0286,
"grad_norm": 0.9459540247917175,
"learning_rate": 4.9226411906224935e-05,
"loss": 1.2023,
"step": 44300
},
{
"epoch": 0.0288,
"grad_norm": 1.2143398523330688,
"learning_rate": 4.922249052258217e-05,
"loss": 1.2348,
"step": 44400
},
{
"epoch": 0.029,
"grad_norm": 1.1002607345581055,
"learning_rate": 4.921855938212312e-05,
"loss": 1.1912,
"step": 44500
},
{
"epoch": 0.0292,
"grad_norm": 1.169640302658081,
"learning_rate": 4.921461848643126e-05,
"loss": 1.1797,
"step": 44600
},
{
"epoch": 0.0294,
"grad_norm": 1.2756543159484863,
"learning_rate": 4.921066783709396e-05,
"loss": 1.1691,
"step": 44700
},
{
"epoch": 0.0296,
"grad_norm": 0.5525041222572327,
"learning_rate": 4.920670743570255e-05,
"loss": 1.2011,
"step": 44800
},
{
"epoch": 0.0298,
"grad_norm": 0.7082927823066711,
"learning_rate": 4.9202737283852284e-05,
"loss": 1.1831,
"step": 44900
},
{
"epoch": 0.03,
"grad_norm": 0.7773894667625427,
"learning_rate": 4.919875738314233e-05,
"loss": 1.1947,
"step": 45000
},
{
"epoch": 0.03,
"eval_loss": 1.0890144109725952,
"eval_runtime": 76.6594,
"eval_samples_per_second": 201.45,
"eval_steps_per_second": 3.157,
"step": 45000
},
{
"epoch": 0.0302,
"grad_norm": 0.7057791352272034,
"learning_rate": 4.91947677351758e-05,
"loss": 1.2717,
"step": 45100
},
{
"epoch": 0.0304,
"grad_norm": 0.9837706685066223,
"learning_rate": 4.919076834155971e-05,
"loss": 1.206,
"step": 45200
},
{
"epoch": 0.0306,
"grad_norm": 0.5716899633407593,
"learning_rate": 4.918675920390504e-05,
"loss": 1.2071,
"step": 45300
},
{
"epoch": 0.0308,
"grad_norm": 0.6972540020942688,
"learning_rate": 4.918274032382665e-05,
"loss": 1.1761,
"step": 45400
},
{
"epoch": 0.031,
"grad_norm": 1.4802424907684326,
"learning_rate": 4.917871170294334e-05,
"loss": 1.2109,
"step": 45500
},
{
"epoch": 0.0312,
"grad_norm": 0.7575565576553345,
"learning_rate": 4.9174673342877854e-05,
"loss": 1.2169,
"step": 45600
},
{
"epoch": 0.0314,
"grad_norm": 2.227360963821411,
"learning_rate": 4.917062524525684e-05,
"loss": 1.1657,
"step": 45700
},
{
"epoch": 0.0316,
"grad_norm": 0.8020743727684021,
"learning_rate": 4.916656741171086e-05,
"loss": 1.2073,
"step": 45800
},
{
"epoch": 0.0318,
"grad_norm": 1.1863917112350464,
"learning_rate": 4.916249984387443e-05,
"loss": 1.211,
"step": 45900
},
{
"epoch": 0.032,
"grad_norm": 0.5976528525352478,
"learning_rate": 4.915842254338594e-05,
"loss": 1.2468,
"step": 46000
},
{
"epoch": 0.032,
"eval_loss": 1.0842978954315186,
"eval_runtime": 76.5369,
"eval_samples_per_second": 201.772,
"eval_steps_per_second": 3.162,
"step": 46000
},
{
"epoch": 0.0322,
"grad_norm": 1.4908519983291626,
"learning_rate": 4.915433551188774e-05,
"loss": 1.1695,
"step": 46100
},
{
"epoch": 0.0324,
"grad_norm": 1.1190279722213745,
"learning_rate": 4.915023875102609e-05,
"loss": 1.2017,
"step": 46200
},
{
"epoch": 0.0326,
"grad_norm": 1.1334049701690674,
"learning_rate": 4.914613226245115e-05,
"loss": 1.2083,
"step": 46300
},
{
"epoch": 0.0328,
"grad_norm": 0.6902172565460205,
"learning_rate": 4.914201604781703e-05,
"loss": 1.233,
"step": 46400
},
{
"epoch": 0.033,
"grad_norm": 0.7509928941726685,
"learning_rate": 4.913789010878174e-05,
"loss": 1.2437,
"step": 46500
},
{
"epoch": 0.0332,
"grad_norm": 1.4217336177825928,
"learning_rate": 4.9133754447007185e-05,
"loss": 1.1909,
"step": 46600
},
{
"epoch": 0.0334,
"grad_norm": 1.212930679321289,
"learning_rate": 4.912960906415923e-05,
"loss": 1.1828,
"step": 46700
},
{
"epoch": 0.0336,
"grad_norm": 1.1408753395080566,
"learning_rate": 4.912545396190763e-05,
"loss": 1.2118,
"step": 46800
},
{
"epoch": 0.0338,
"grad_norm": 0.649695634841919,
"learning_rate": 4.9121289141926066e-05,
"loss": 1.1877,
"step": 46900
},
{
"epoch": 0.034,
"grad_norm": 1.4613287448883057,
"learning_rate": 4.911711460589211e-05,
"loss": 1.1977,
"step": 47000
},
{
"epoch": 0.034,
"eval_loss": 1.0870256423950195,
"eval_runtime": 76.7051,
"eval_samples_per_second": 201.33,
"eval_steps_per_second": 3.155,
"step": 47000
},
{
"epoch": 0.0342,
"grad_norm": 1.1586204767227173,
"learning_rate": 4.9112930355487284e-05,
"loss": 1.2222,
"step": 47100
},
{
"epoch": 0.0344,
"grad_norm": 1.220306158065796,
"learning_rate": 4.910873639239699e-05,
"loss": 1.1909,
"step": 47200
},
{
"epoch": 0.0346,
"grad_norm": 0.589338481426239,
"learning_rate": 4.910453271831056e-05,
"loss": 1.2034,
"step": 47300
},
{
"epoch": 0.0348,
"grad_norm": 1.4743396043777466,
"learning_rate": 4.910031933492123e-05,
"loss": 1.2019,
"step": 47400
},
{
"epoch": 0.035,
"grad_norm": 0.6481319069862366,
"learning_rate": 4.909609624392616e-05,
"loss": 1.2107,
"step": 47500
},
{
"epoch": 0.0352,
"grad_norm": 1.1668992042541504,
"learning_rate": 4.9091863447026404e-05,
"loss": 1.2498,
"step": 47600
},
{
"epoch": 0.0354,
"grad_norm": 1.115519404411316,
"learning_rate": 4.908762094592693e-05,
"loss": 1.206,
"step": 47700
},
{
"epoch": 0.0356,
"grad_norm": 1.3867928981781006,
"learning_rate": 4.908336874233662e-05,
"loss": 1.2082,
"step": 47800
},
{
"epoch": 0.0358,
"grad_norm": 0.6380243301391602,
"learning_rate": 4.9079106837968264e-05,
"loss": 1.1693,
"step": 47900
},
{
"epoch": 0.036,
"grad_norm": 1.8375539779663086,
"learning_rate": 4.907483523453855e-05,
"loss": 1.1531,
"step": 48000
},
{
"epoch": 0.036,
"eval_loss": 1.0780328512191772,
"eval_runtime": 76.5805,
"eval_samples_per_second": 201.657,
"eval_steps_per_second": 3.16,
"step": 48000
},
{
"epoch": 0.0362,
"grad_norm": 1.231332778930664,
"learning_rate": 4.907055393376808e-05,
"loss": 1.1618,
"step": 48100
},
{
"epoch": 0.0364,
"grad_norm": 1.2306678295135498,
"learning_rate": 4.906626293738137e-05,
"loss": 1.2365,
"step": 48200
},
{
"epoch": 0.0366,
"grad_norm": 1.057521104812622,
"learning_rate": 4.906196224710683e-05,
"loss": 1.1775,
"step": 48300
},
{
"epoch": 0.0368,
"grad_norm": 0.9679245352745056,
"learning_rate": 4.905765186467677e-05,
"loss": 1.2175,
"step": 48400
},
{
"epoch": 0.037,
"grad_norm": 1.325900912284851,
"learning_rate": 4.9053331791827404e-05,
"loss": 1.1848,
"step": 48500
},
{
"epoch": 0.0372,
"grad_norm": 1.3124104738235474,
"learning_rate": 4.9049002030298887e-05,
"loss": 1.1779,
"step": 48600
},
{
"epoch": 0.0374,
"grad_norm": 1.7284040451049805,
"learning_rate": 4.904466258183522e-05,
"loss": 1.2144,
"step": 48700
},
{
"epoch": 0.0376,
"grad_norm": 0.9314505457878113,
"learning_rate": 4.904031344818434e-05,
"loss": 1.219,
"step": 48800
},
{
"epoch": 0.0378,
"grad_norm": 1.1688934564590454,
"learning_rate": 4.903595463109808e-05,
"loss": 1.2268,
"step": 48900
},
{
"epoch": 0.038,
"grad_norm": 1.0910236835479736,
"learning_rate": 4.903158613233216e-05,
"loss": 1.2213,
"step": 49000
},
{
"epoch": 0.038,
"eval_loss": 1.0866200923919678,
"eval_runtime": 76.736,
"eval_samples_per_second": 201.248,
"eval_steps_per_second": 3.154,
"step": 49000
},
{
"epoch": 0.0382,
"grad_norm": 1.0715341567993164,
"learning_rate": 4.902720795364623e-05,
"loss": 1.2007,
"step": 49100
},
{
"epoch": 0.0384,
"grad_norm": 0.6578232645988464,
"learning_rate": 4.902282009680381e-05,
"loss": 1.2078,
"step": 49200
},
{
"epoch": 0.0386,
"grad_norm": 1.34630286693573,
"learning_rate": 4.9018422563572326e-05,
"loss": 1.1894,
"step": 49300
},
{
"epoch": 0.0388,
"grad_norm": 1.1832722425460815,
"learning_rate": 4.9014015355723104e-05,
"loss": 1.1846,
"step": 49400
},
{
"epoch": 0.039,
"grad_norm": 0.9175591468811035,
"learning_rate": 4.900959847503137e-05,
"loss": 1.1984,
"step": 49500
},
{
"epoch": 0.0392,
"grad_norm": 1.077879548072815,
"learning_rate": 4.9005171923276236e-05,
"loss": 1.1868,
"step": 49600
},
{
"epoch": 0.0394,
"grad_norm": 0.5999984741210938,
"learning_rate": 4.900073570224073e-05,
"loss": 1.1816,
"step": 49700
},
{
"epoch": 0.0396,
"grad_norm": 1.24228835105896,
"learning_rate": 4.899628981371175e-05,
"loss": 1.191,
"step": 49800
},
{
"epoch": 0.0398,
"grad_norm": 0.7666544318199158,
"learning_rate": 4.899183425948011e-05,
"loss": 1.1813,
"step": 49900
},
{
"epoch": 0.04,
"grad_norm": 1.2996748685836792,
"learning_rate": 4.8987369041340486e-05,
"loss": 1.184,
"step": 50000
},
{
"epoch": 0.04,
"eval_loss": 1.0817583799362183,
"eval_runtime": 77.0972,
"eval_samples_per_second": 200.306,
"eval_steps_per_second": 3.139,
"step": 50000
},
{
"epoch": 0.0402,
"grad_norm": 1.1717365980148315,
"learning_rate": 4.898289416109149e-05,
"loss": 1.1936,
"step": 50100
},
{
"epoch": 0.0404,
"grad_norm": 1.3680170774459839,
"learning_rate": 4.8978409620535595e-05,
"loss": 1.2138,
"step": 50200
},
{
"epoch": 0.0406,
"grad_norm": 1.6390254497528076,
"learning_rate": 4.897391542147916e-05,
"loss": 1.1883,
"step": 50300
},
{
"epoch": 0.0408,
"grad_norm": 1.2523001432418823,
"learning_rate": 4.896941156573247e-05,
"loss": 1.2157,
"step": 50400
},
{
"epoch": 0.041,
"grad_norm": 1.4317930936813354,
"learning_rate": 4.896489805510966e-05,
"loss": 1.1721,
"step": 50500
},
{
"epoch": 0.0412,
"grad_norm": 0.9794881939888,
"learning_rate": 4.896037489142879e-05,
"loss": 1.2073,
"step": 50600
},
{
"epoch": 0.0414,
"grad_norm": 0.8774665594100952,
"learning_rate": 4.895584207651178e-05,
"loss": 1.1934,
"step": 50700
},
{
"epoch": 0.0416,
"grad_norm": 1.421742558479309,
"learning_rate": 4.895129961218444e-05,
"loss": 1.2078,
"step": 50800
},
{
"epoch": 0.0418,
"grad_norm": 1.0715827941894531,
"learning_rate": 4.894674750027648e-05,
"loss": 1.1713,
"step": 50900
},
{
"epoch": 0.042,
"grad_norm": 0.7623746991157532,
"learning_rate": 4.894218574262149e-05,
"loss": 1.1779,
"step": 51000
},
{
"epoch": 0.042,
"eval_loss": 1.0817545652389526,
"eval_runtime": 76.5318,
"eval_samples_per_second": 201.785,
"eval_steps_per_second": 3.162,
"step": 51000
},
{
"epoch": 0.0422,
"grad_norm": 0.710477888584137,
"learning_rate": 4.893761434105695e-05,
"loss": 1.1876,
"step": 51100
},
{
"epoch": 0.0424,
"grad_norm": 1.244310736656189,
"learning_rate": 4.893303329742421e-05,
"loss": 1.2077,
"step": 51200
},
{
"epoch": 0.0426,
"grad_norm": 1.6161651611328125,
"learning_rate": 4.8928442613568535e-05,
"loss": 1.1896,
"step": 51300
},
{
"epoch": 0.0428,
"grad_norm": 1.0831233263015747,
"learning_rate": 4.892384229133902e-05,
"loss": 1.1904,
"step": 51400
},
{
"epoch": 0.043,
"grad_norm": 0.8258353471755981,
"learning_rate": 4.89192323325887e-05,
"loss": 1.1906,
"step": 51500
},
{
"epoch": 0.0432,
"grad_norm": 0.7877621054649353,
"learning_rate": 4.8914612739174456e-05,
"loss": 1.1416,
"step": 51600
},
{
"epoch": 0.0434,
"grad_norm": 1.2102254629135132,
"learning_rate": 4.890998351295706e-05,
"loss": 1.1782,
"step": 51700
},
{
"epoch": 0.0436,
"grad_norm": 1.139289140701294,
"learning_rate": 4.890534465580115e-05,
"loss": 1.1471,
"step": 51800
},
{
"epoch": 0.0438,
"grad_norm": 1.2521135807037354,
"learning_rate": 4.890069616957529e-05,
"loss": 1.206,
"step": 51900
},
{
"epoch": 0.044,
"grad_norm": 1.3690674304962158,
"learning_rate": 4.889603805615187e-05,
"loss": 1.2328,
"step": 52000
},
{
"epoch": 0.044,
"eval_loss": 1.0797057151794434,
"eval_runtime": 76.4385,
"eval_samples_per_second": 202.032,
"eval_steps_per_second": 3.166,
"step": 52000
},
{
"epoch": 0.0442,
"grad_norm": 1.2689367532730103,
"learning_rate": 4.889137031740717e-05,
"loss": 1.2189,
"step": 52100
},
{
"epoch": 0.0444,
"grad_norm": 1.0029367208480835,
"learning_rate": 4.888669295522137e-05,
"loss": 1.1754,
"step": 52200
},
{
"epoch": 0.0446,
"grad_norm": 0.6958720684051514,
"learning_rate": 4.8882005971478504e-05,
"loss": 1.1601,
"step": 52300
},
{
"epoch": 0.0448,
"grad_norm": 1.2337570190429688,
"learning_rate": 4.887730936806648e-05,
"loss": 1.2244,
"step": 52400
},
{
"epoch": 0.045,
"grad_norm": 1.2311972379684448,
"learning_rate": 4.8872603146877104e-05,
"loss": 1.2031,
"step": 52500
},
{
"epoch": 0.0452,
"grad_norm": 1.145331859588623,
"learning_rate": 4.886788730980604e-05,
"loss": 1.1947,
"step": 52600
},
{
"epoch": 0.0454,
"grad_norm": 1.1688799858093262,
"learning_rate": 4.886316185875282e-05,
"loss": 1.1655,
"step": 52700
},
{
"epoch": 0.0456,
"grad_norm": 1.2751972675323486,
"learning_rate": 4.885842679562085e-05,
"loss": 1.2038,
"step": 52800
},
{
"epoch": 0.0458,
"grad_norm": 0.6860191822052002,
"learning_rate": 4.8853682122317426e-05,
"loss": 1.1922,
"step": 52900
},
{
"epoch": 0.046,
"grad_norm": 1.4772953987121582,
"learning_rate": 4.8848927840753695e-05,
"loss": 1.1856,
"step": 53000
},
{
"epoch": 0.046,
"eval_loss": 1.0836056470870972,
"eval_runtime": 76.3679,
"eval_samples_per_second": 202.218,
"eval_steps_per_second": 3.169,
"step": 53000
},
{
"epoch": 0.0462,
"grad_norm": 1.2491508722305298,
"learning_rate": 4.884416395284468e-05,
"loss": 1.1924,
"step": 53100
},
{
"epoch": 0.0464,
"grad_norm": 1.1689327955245972,
"learning_rate": 4.883939046050928e-05,
"loss": 1.1675,
"step": 53200
},
{
"epoch": 0.0466,
"grad_norm": 1.0528528690338135,
"learning_rate": 4.883460736567025e-05,
"loss": 1.1879,
"step": 53300
},
{
"epoch": 0.0468,
"grad_norm": 1.141653060913086,
"learning_rate": 4.8829814670254226e-05,
"loss": 1.1637,
"step": 53400
},
{
"epoch": 0.047,
"grad_norm": 0.8094840049743652,
"learning_rate": 4.88250123761917e-05,
"loss": 1.1924,
"step": 53500
},
{
"epoch": 0.0472,
"grad_norm": 1.4988161325454712,
"learning_rate": 4.8820200485417036e-05,
"loss": 1.1962,
"step": 53600
},
{
"epoch": 0.0474,
"grad_norm": 0.8497682809829712,
"learning_rate": 4.881537899986847e-05,
"loss": 1.1987,
"step": 53700
},
{
"epoch": 0.0476,
"grad_norm": 1.0132189989089966,
"learning_rate": 4.8810547921488083e-05,
"loss": 1.1666,
"step": 53800
},
{
"epoch": 0.0478,
"grad_norm": 1.275478720664978,
"learning_rate": 4.8805707252221846e-05,
"loss": 1.2072,
"step": 53900
},
{
"epoch": 0.048,
"grad_norm": 1.1257511377334595,
"learning_rate": 4.880085699401958e-05,
"loss": 1.2128,
"step": 54000
},
{
"epoch": 0.048,
"eval_loss": 1.081576943397522,
"eval_runtime": 76.6431,
"eval_samples_per_second": 201.492,
"eval_steps_per_second": 3.157,
"step": 54000
},
{
"epoch": 0.0482,
"grad_norm": 1.132750153541565,
"learning_rate": 4.879599714883496e-05,
"loss": 1.2239,
"step": 54100
},
{
"epoch": 0.0484,
"grad_norm": 1.3854628801345825,
"learning_rate": 4.8791127718625526e-05,
"loss": 1.1447,
"step": 54200
},
{
"epoch": 0.0486,
"grad_norm": 1.32233464717865,
"learning_rate": 4.87862487053527e-05,
"loss": 1.1765,
"step": 54300
},
{
"epoch": 0.0488,
"grad_norm": 1.1571578979492188,
"learning_rate": 4.8781360110981744e-05,
"loss": 1.1844,
"step": 54400
},
{
"epoch": 0.049,
"grad_norm": 1.552740216255188,
"learning_rate": 4.877646193748177e-05,
"loss": 1.1336,
"step": 54500
},
{
"epoch": 0.0492,
"grad_norm": 1.3447420597076416,
"learning_rate": 4.8771554186825774e-05,
"loss": 1.2401,
"step": 54600
},
{
"epoch": 0.0494,
"grad_norm": 1.0012767314910889,
"learning_rate": 4.87666368609906e-05,
"loss": 1.2236,
"step": 54700
},
{
"epoch": 0.0496,
"grad_norm": 1.1246662139892578,
"learning_rate": 4.876170996195693e-05,
"loss": 1.2452,
"step": 54800
},
{
"epoch": 0.0498,
"grad_norm": 0.7534450888633728,
"learning_rate": 4.875677349170934e-05,
"loss": 1.2333,
"step": 54900
},
{
"epoch": 0.05,
"grad_norm": 1.2943884134292603,
"learning_rate": 4.875182745223622e-05,
"loss": 1.1986,
"step": 55000
},
{
"epoch": 0.05,
"eval_loss": 1.0774849653244019,
"eval_runtime": 76.6003,
"eval_samples_per_second": 201.605,
"eval_steps_per_second": 3.159,
"step": 55000
},
{
"epoch": 0.0502,
"grad_norm": 1.0771546363830566,
"learning_rate": 4.874687184552984e-05,
"loss": 1.2022,
"step": 55100
},
{
"epoch": 0.0504,
"grad_norm": 1.1722393035888672,
"learning_rate": 4.8741906673586334e-05,
"loss": 1.1856,
"step": 55200
},
{
"epoch": 0.0506,
"grad_norm": 0.7547242045402527,
"learning_rate": 4.873693193840565e-05,
"loss": 1.153,
"step": 55300
},
{
"epoch": 0.0508,
"grad_norm": 0.9694270491600037,
"learning_rate": 4.873194764199162e-05,
"loss": 1.2389,
"step": 55400
},
{
"epoch": 0.051,
"grad_norm": 0.6288232803344727,
"learning_rate": 4.872695378635192e-05,
"loss": 1.195,
"step": 55500
},
{
"epoch": 0.0512,
"grad_norm": 1.1400961875915527,
"learning_rate": 4.872195037349807e-05,
"loss": 1.1903,
"step": 55600
},
{
"epoch": 0.0514,
"grad_norm": 1.0738123655319214,
"learning_rate": 4.871693740544545e-05,
"loss": 1.1764,
"step": 55700
},
{
"epoch": 0.0516,
"grad_norm": 1.2298240661621094,
"learning_rate": 4.871191488421327e-05,
"loss": 1.1701,
"step": 55800
},
{
"epoch": 0.0518,
"grad_norm": 1.3240865468978882,
"learning_rate": 4.8706882811824624e-05,
"loss": 1.1828,
"step": 55900
},
{
"epoch": 0.052,
"grad_norm": 1.4167003631591797,
"learning_rate": 4.870184119030641e-05,
"loss": 1.204,
"step": 56000
},
{
"epoch": 0.052,
"eval_loss": 1.0775164365768433,
"eval_runtime": 76.8889,
"eval_samples_per_second": 200.848,
"eval_steps_per_second": 3.147,
"step": 56000
},
{
"epoch": 0.0522,
"grad_norm": 0.6648851037025452,
"learning_rate": 4.86967900216894e-05,
"loss": 1.174,
"step": 56100
},
{
"epoch": 0.0524,
"grad_norm": 1.29317307472229,
"learning_rate": 4.8691729308008196e-05,
"loss": 1.1695,
"step": 56200
},
{
"epoch": 0.0526,
"grad_norm": 1.3121986389160156,
"learning_rate": 4.868665905130127e-05,
"loss": 1.1941,
"step": 56300
},
{
"epoch": 0.0528,
"grad_norm": 0.6604340672492981,
"learning_rate": 4.868157925361091e-05,
"loss": 1.1875,
"step": 56400
},
{
"epoch": 0.053,
"grad_norm": 1.0366885662078857,
"learning_rate": 4.867648991698325e-05,
"loss": 1.2265,
"step": 56500
},
{
"epoch": 0.0532,
"grad_norm": 1.382543683052063,
"learning_rate": 4.867139104346829e-05,
"loss": 1.2122,
"step": 56600
},
{
"epoch": 0.0534,
"grad_norm": 1.0773979425430298,
"learning_rate": 4.866628263511985e-05,
"loss": 1.2375,
"step": 56700
},
{
"epoch": 0.0536,
"grad_norm": 1.178758978843689,
"learning_rate": 4.8661164693995584e-05,
"loss": 1.1959,
"step": 56800
},
{
"epoch": 0.0538,
"grad_norm": 0.7048764228820801,
"learning_rate": 4.865603722215702e-05,
"loss": 1.1841,
"step": 56900
},
{
"epoch": 0.054,
"grad_norm": 1.3390711545944214,
"learning_rate": 4.865090022166949e-05,
"loss": 1.2033,
"step": 57000
},
{
"epoch": 0.054,
"eval_loss": 1.0746017694473267,
"eval_runtime": 77.1768,
"eval_samples_per_second": 200.099,
"eval_steps_per_second": 3.136,
"step": 57000
},
{
"epoch": 0.0542,
"grad_norm": 1.0639598369598389,
"learning_rate": 4.864575369460218e-05,
"loss": 1.1948,
"step": 57100
},
{
"epoch": 0.0544,
"grad_norm": 1.1349152326583862,
"learning_rate": 4.86405976430281e-05,
"loss": 1.1666,
"step": 57200
},
{
"epoch": 0.0546,
"grad_norm": 1.0187245607376099,
"learning_rate": 4.8635432069024125e-05,
"loss": 1.1614,
"step": 57300
},
{
"epoch": 0.0548,
"grad_norm": 0.6468742489814758,
"learning_rate": 4.863025697467094e-05,
"loss": 1.2043,
"step": 57400
},
{
"epoch": 0.055,
"grad_norm": 1.1298869848251343,
"learning_rate": 4.862507236205307e-05,
"loss": 1.1884,
"step": 57500
},
{
"epoch": 0.0552,
"grad_norm": 0.7240111827850342,
"learning_rate": 4.861987823325887e-05,
"loss": 1.186,
"step": 57600
},
{
"epoch": 0.0554,
"grad_norm": 0.8047366142272949,
"learning_rate": 4.861467459038056e-05,
"loss": 1.2029,
"step": 57700
},
{
"epoch": 0.0556,
"grad_norm": 0.8840340375900269,
"learning_rate": 4.860946143551413e-05,
"loss": 1.19,
"step": 57800
},
{
"epoch": 0.0558,
"grad_norm": 1.1575409173965454,
"learning_rate": 4.860423877075947e-05,
"loss": 1.1637,
"step": 57900
},
{
"epoch": 0.056,
"grad_norm": 0.6591224074363708,
"learning_rate": 4.859900659822025e-05,
"loss": 1.2203,
"step": 58000
},
{
"epoch": 0.056,
"eval_loss": 1.0788133144378662,
"eval_runtime": 76.7654,
"eval_samples_per_second": 201.171,
"eval_steps_per_second": 3.152,
"step": 58000
},
{
"epoch": 0.0562,
"grad_norm": 1.3405015468597412,
"learning_rate": 4.859376492000399e-05,
"loss": 1.19,
"step": 58100
},
{
"epoch": 0.0564,
"grad_norm": 1.1912270784378052,
"learning_rate": 4.858851373822205e-05,
"loss": 1.1521,
"step": 58200
},
{
"epoch": 0.0566,
"grad_norm": 1.0169751644134521,
"learning_rate": 4.85832530549896e-05,
"loss": 1.2054,
"step": 58300
},
{
"epoch": 0.0568,
"grad_norm": 0.6713104248046875,
"learning_rate": 4.857798287242563e-05,
"loss": 1.2033,
"step": 58400
},
{
"epoch": 0.057,
"grad_norm": 1.2116252183914185,
"learning_rate": 4.857270319265298e-05,
"loss": 1.1919,
"step": 58500
},
{
"epoch": 0.0572,
"grad_norm": 0.9526674151420593,
"learning_rate": 4.856741401779831e-05,
"loss": 1.1724,
"step": 58600
},
{
"epoch": 0.0574,
"grad_norm": 1.458253264427185,
"learning_rate": 4.856211534999209e-05,
"loss": 1.1479,
"step": 58700
},
{
"epoch": 0.0576,
"grad_norm": 1.173437237739563,
"learning_rate": 4.855680719136862e-05,
"loss": 1.2005,
"step": 58800
},
{
"epoch": 0.0578,
"grad_norm": 0.7292013168334961,
"learning_rate": 4.8551489544066034e-05,
"loss": 1.1292,
"step": 58900
},
{
"epoch": 0.058,
"grad_norm": 0.6017533540725708,
"learning_rate": 4.854616241022627e-05,
"loss": 1.1527,
"step": 59000
},
{
"epoch": 0.058,
"eval_loss": 1.0688042640686035,
"eval_runtime": 76.596,
"eval_samples_per_second": 201.616,
"eval_steps_per_second": 3.159,
"step": 59000
},
{
"epoch": 0.0582,
"grad_norm": 0.8270254731178284,
"learning_rate": 4.8540825791995114e-05,
"loss": 1.1517,
"step": 59100
},
{
"epoch": 0.0584,
"grad_norm": 1.1182663440704346,
"learning_rate": 4.8535479691522136e-05,
"loss": 1.2282,
"step": 59200
},
{
"epoch": 0.0586,
"grad_norm": 1.1249291896820068,
"learning_rate": 4.853012411096075e-05,
"loss": 1.1314,
"step": 59300
},
{
"epoch": 0.0588,
"grad_norm": 0.6025962233543396,
"learning_rate": 4.85247590524682e-05,
"loss": 1.1879,
"step": 59400
},
{
"epoch": 0.059,
"grad_norm": 1.2914466857910156,
"learning_rate": 4.85193845182055e-05,
"loss": 1.1926,
"step": 59500
},
{
"epoch": 0.0592,
"grad_norm": 0.7965525388717651,
"learning_rate": 4.8514000510337544e-05,
"loss": 1.2344,
"step": 59600
},
{
"epoch": 0.0594,
"grad_norm": 0.6595709323883057,
"learning_rate": 4.850860703103298e-05,
"loss": 1.2056,
"step": 59700
},
{
"epoch": 0.0596,
"grad_norm": 0.783892035484314,
"learning_rate": 4.850320408246433e-05,
"loss": 1.1343,
"step": 59800
},
{
"epoch": 0.0598,
"grad_norm": 0.948952853679657,
"learning_rate": 4.849779166680788e-05,
"loss": 1.1607,
"step": 59900
},
{
"epoch": 0.06,
"grad_norm": 0.725027322769165,
"learning_rate": 4.849236978624375e-05,
"loss": 1.2125,
"step": 60000
},
{
"epoch": 0.06,
"eval_loss": 1.0838971138000488,
"eval_runtime": 76.8451,
"eval_samples_per_second": 200.963,
"eval_steps_per_second": 3.149,
"step": 60000
},
{
"epoch": 0.0002,
"grad_norm": 1.1788556575775146,
"learning_rate": 4.848693844295589e-05,
"loss": 1.1917,
"step": 60100
},
{
"epoch": 0.0004,
"grad_norm": 1.3381775617599487,
"learning_rate": 4.848149763913202e-05,
"loss": 1.2108,
"step": 60200
},
{
"epoch": 0.0006,
"grad_norm": 0.9748820066452026,
"learning_rate": 4.847604737696372e-05,
"loss": 1.2054,
"step": 60300
},
{
"epoch": 0.0008,
"grad_norm": 1.3528317213058472,
"learning_rate": 4.847058765864634e-05,
"loss": 1.1582,
"step": 60400
},
{
"epoch": 0.001,
"grad_norm": 1.0475611686706543,
"learning_rate": 4.8465118486379065e-05,
"loss": 1.1409,
"step": 60500
},
{
"epoch": 0.0012,
"grad_norm": 0.667515754699707,
"learning_rate": 4.8459639862364866e-05,
"loss": 1.1548,
"step": 60600
},
{
"epoch": 0.0014,
"grad_norm": 1.3529212474822998,
"learning_rate": 4.845415178881053e-05,
"loss": 1.1741,
"step": 60700
},
{
"epoch": 0.0016,
"grad_norm": 1.2415704727172852,
"learning_rate": 4.844865426792666e-05,
"loss": 1.1689,
"step": 60800
},
{
"epoch": 0.0018,
"grad_norm": 0.9598329663276672,
"learning_rate": 4.844314730192766e-05,
"loss": 1.2138,
"step": 60900
},
{
"epoch": 0.002,
"grad_norm": 0.660463273525238,
"learning_rate": 4.843763089303173e-05,
"loss": 1.1897,
"step": 61000
},
{
"epoch": 0.002,
"eval_loss": 1.0804229974746704,
"eval_runtime": 77.9042,
"eval_samples_per_second": 198.231,
"eval_steps_per_second": 3.106,
"step": 61000
},
{
"epoch": 0.0022,
"grad_norm": 1.3137476444244385,
"learning_rate": 4.843210504346088e-05,
"loss": 1.2149,
"step": 61100
},
{
"epoch": 0.0024,
"grad_norm": 2.466374158859253,
"learning_rate": 4.842656975544092e-05,
"loss": 1.2294,
"step": 61200
},
{
"epoch": 0.0026,
"grad_norm": 0.9236732721328735,
"learning_rate": 4.842102503120146e-05,
"loss": 1.2316,
"step": 61300
},
{
"epoch": 0.0028,
"grad_norm": 0.9453101754188538,
"learning_rate": 4.841547087297592e-05,
"loss": 1.1903,
"step": 61400
},
{
"epoch": 0.003,
"grad_norm": 1.0694693326950073,
"learning_rate": 4.840990728300151e-05,
"loss": 1.2027,
"step": 61500
},
{
"epoch": 0.0032,
"grad_norm": 1.0661156177520752,
"learning_rate": 4.8404334263519254e-05,
"loss": 1.2268,
"step": 61600
},
{
"epoch": 0.0034,
"grad_norm": 1.3803828954696655,
"learning_rate": 4.839875181677394e-05,
"loss": 1.2084,
"step": 61700
},
{
"epoch": 0.0036,
"grad_norm": 0.896979033946991,
"learning_rate": 4.839315994501421e-05,
"loss": 1.1818,
"step": 61800
},
{
"epoch": 0.0038,
"grad_norm": 1.1509560346603394,
"learning_rate": 4.8387558650492446e-05,
"loss": 1.226,
"step": 61900
},
{
"epoch": 0.004,
"grad_norm": 1.2490339279174805,
"learning_rate": 4.8381947935464854e-05,
"loss": 1.2283,
"step": 62000
},
{
"epoch": 0.004,
"eval_loss": 1.086965560913086,
"eval_runtime": 75.4991,
"eval_samples_per_second": 204.545,
"eval_steps_per_second": 3.205,
"step": 62000
},
{
"epoch": 0.0042,
"grad_norm": 1.0047966241836548,
"learning_rate": 4.837632780219142e-05,
"loss": 1.2006,
"step": 62100
},
{
"epoch": 0.0044,
"grad_norm": 1.3791793584823608,
"learning_rate": 4.837069825293596e-05,
"loss": 1.2191,
"step": 62200
},
{
"epoch": 0.0046,
"grad_norm": 1.4083282947540283,
"learning_rate": 4.836505928996603e-05,
"loss": 1.2232,
"step": 62300
},
{
"epoch": 0.0048,
"grad_norm": 1.5420063734054565,
"learning_rate": 4.835941091555301e-05,
"loss": 1.2281,
"step": 62400
},
{
"epoch": 0.005,
"grad_norm": 0.7661809921264648,
"learning_rate": 4.8353753131972066e-05,
"loss": 1.2262,
"step": 62500
},
{
"epoch": 0.0052,
"grad_norm": 0.5983784198760986,
"learning_rate": 4.8348085941502164e-05,
"loss": 1.2203,
"step": 62600
},
{
"epoch": 0.0054,
"grad_norm": 0.8108716011047363,
"learning_rate": 4.8342409346426024e-05,
"loss": 1.1536,
"step": 62700
},
{
"epoch": 0.0056,
"grad_norm": 0.9011421203613281,
"learning_rate": 4.83367233490302e-05,
"loss": 1.2214,
"step": 62800
},
{
"epoch": 0.0058,
"grad_norm": 0.6667259335517883,
"learning_rate": 4.8331027951604995e-05,
"loss": 1.1932,
"step": 62900
},
{
"epoch": 0.006,
"grad_norm": 1.2265853881835938,
"learning_rate": 4.8325323156444525e-05,
"loss": 1.235,
"step": 63000
},
{
"epoch": 0.006,
"eval_loss": 1.0849037170410156,
"eval_runtime": 76.5492,
"eval_samples_per_second": 201.74,
"eval_steps_per_second": 3.161,
"step": 63000
},
{
"epoch": 0.0062,
"grad_norm": 1.468518614768982,
"learning_rate": 4.831960896584667e-05,
"loss": 1.1886,
"step": 63100
},
{
"epoch": 0.0064,
"grad_norm": 1.2378790378570557,
"learning_rate": 4.831388538211312e-05,
"loss": 1.1983,
"step": 63200
},
{
"epoch": 0.0066,
"grad_norm": 1.2989089488983154,
"learning_rate": 4.830815240754933e-05,
"loss": 1.1894,
"step": 63300
},
{
"epoch": 0.0068,
"grad_norm": 1.3696600198745728,
"learning_rate": 4.830241004446453e-05,
"loss": 1.1798,
"step": 63400
},
{
"epoch": 0.007,
"grad_norm": 1.3715136051177979,
"learning_rate": 4.829665829517175e-05,
"loss": 1.2323,
"step": 63500
},
{
"epoch": 0.0072,
"grad_norm": 0.7888614535331726,
"learning_rate": 4.82908971619878e-05,
"loss": 1.2098,
"step": 63600
},
{
"epoch": 0.0074,
"grad_norm": 1.0456979274749756,
"learning_rate": 4.828512664723326e-05,
"loss": 1.21,
"step": 63700
},
{
"epoch": 0.0076,
"grad_norm": 1.4525970220565796,
"learning_rate": 4.827934675323248e-05,
"loss": 1.191,
"step": 63800
},
{
"epoch": 0.0078,
"grad_norm": 1.6751583814620972,
"learning_rate": 4.8273557482313625e-05,
"loss": 1.2084,
"step": 63900
},
{
"epoch": 0.008,
"grad_norm": 0.7282904982566833,
"learning_rate": 4.826775883680859e-05,
"loss": 1.2376,
"step": 64000
},
{
"epoch": 0.008,
"eval_loss": 1.0806148052215576,
"eval_runtime": 75.7629,
"eval_samples_per_second": 203.833,
"eval_steps_per_second": 3.194,
"step": 64000
},
{
"epoch": 0.0082,
"grad_norm": 1.0859407186508179,
"learning_rate": 4.826195081905308e-05,
"loss": 1.1807,
"step": 64100
},
{
"epoch": 0.0084,
"grad_norm": 1.3917006254196167,
"learning_rate": 4.8256133431386566e-05,
"loss": 1.2012,
"step": 64200
},
{
"epoch": 0.0086,
"grad_norm": 1.4448059797286987,
"learning_rate": 4.825030667615228e-05,
"loss": 1.2305,
"step": 64300
},
{
"epoch": 0.0088,
"grad_norm": 1.0721293687820435,
"learning_rate": 4.824447055569725e-05,
"loss": 1.2332,
"step": 64400
},
{
"epoch": 0.009,
"grad_norm": 0.9940403699874878,
"learning_rate": 4.823862507237226e-05,
"loss": 1.2096,
"step": 64500
},
{
"epoch": 0.0092,
"grad_norm": 1.5473828315734863,
"learning_rate": 4.823277022853187e-05,
"loss": 1.1706,
"step": 64600
},
{
"epoch": 0.0094,
"grad_norm": 1.3127409219741821,
"learning_rate": 4.822690602653441e-05,
"loss": 1.2051,
"step": 64700
},
{
"epoch": 0.0096,
"grad_norm": 1.7532451152801514,
"learning_rate": 4.822103246874198e-05,
"loss": 1.196,
"step": 64800
},
{
"epoch": 0.0098,
"grad_norm": 0.8706884980201721,
"learning_rate": 4.8215149557520446e-05,
"loss": 1.1862,
"step": 64900
},
{
"epoch": 0.01,
"grad_norm": 1.5764431953430176,
"learning_rate": 4.8209257295239455e-05,
"loss": 1.2257,
"step": 65000
},
{
"epoch": 0.01,
"eval_loss": 1.0817573070526123,
"eval_runtime": 75.771,
"eval_samples_per_second": 203.811,
"eval_steps_per_second": 3.194,
"step": 65000
},
{
"epoch": 0.0102,
"grad_norm": 1.467939019203186,
"learning_rate": 4.820335568427239e-05,
"loss": 1.2317,
"step": 65100
},
{
"epoch": 0.0104,
"grad_norm": 1.270477533340454,
"learning_rate": 4.819744472699643e-05,
"loss": 1.2308,
"step": 65200
},
{
"epoch": 0.0106,
"grad_norm": 1.073867917060852,
"learning_rate": 4.8191524425792526e-05,
"loss": 1.1991,
"step": 65300
},
{
"epoch": 0.0108,
"grad_norm": 1.0844908952713013,
"learning_rate": 4.818559478304534e-05,
"loss": 1.1914,
"step": 65400
},
{
"epoch": 0.011,
"grad_norm": 1.282365322113037,
"learning_rate": 4.817965580114335e-05,
"loss": 1.2035,
"step": 65500
},
{
"epoch": 0.0112,
"grad_norm": 1.3751475811004639,
"learning_rate": 4.817370748247878e-05,
"loss": 1.215,
"step": 65600
},
{
"epoch": 0.0114,
"grad_norm": 1.484107255935669,
"learning_rate": 4.81677498294476e-05,
"loss": 1.2298,
"step": 65700
},
{
"epoch": 0.0116,
"grad_norm": 1.326803207397461,
"learning_rate": 4.8161782844449566e-05,
"loss": 1.1794,
"step": 65800
},
{
"epoch": 0.0118,
"grad_norm": 1.6823039054870605,
"learning_rate": 4.815580652988817e-05,
"loss": 1.1896,
"step": 65900
},
{
"epoch": 0.012,
"grad_norm": 1.1735076904296875,
"learning_rate": 4.8149820888170673e-05,
"loss": 1.2089,
"step": 66000
},
{
"epoch": 0.012,
"eval_loss": 1.081894874572754,
"eval_runtime": 75.5115,
"eval_samples_per_second": 204.512,
"eval_steps_per_second": 3.205,
"step": 66000
},
{
"epoch": 0.0122,
"grad_norm": 1.0032376050949097,
"learning_rate": 4.814382592170808e-05,
"loss": 1.2197,
"step": 66100
},
{
"epoch": 0.0124,
"grad_norm": 1.2638306617736816,
"learning_rate": 4.813782163291519e-05,
"loss": 1.2009,
"step": 66200
},
{
"epoch": 0.0126,
"grad_norm": 1.2233041524887085,
"learning_rate": 4.813180802421051e-05,
"loss": 1.2069,
"step": 66300
},
{
"epoch": 0.0128,
"grad_norm": 0.857825756072998,
"learning_rate": 4.812578509801632e-05,
"loss": 1.1942,
"step": 66400
},
{
"epoch": 0.013,
"grad_norm": 0.8879494667053223,
"learning_rate": 4.811975285675866e-05,
"loss": 1.1689,
"step": 66500
},
{
"epoch": 0.0132,
"grad_norm": 1.3842177391052246,
"learning_rate": 4.811371130286731e-05,
"loss": 1.1941,
"step": 66600
},
{
"epoch": 0.0134,
"grad_norm": 1.303063988685608,
"learning_rate": 4.810766043877582e-05,
"loss": 1.194,
"step": 66700
},
{
"epoch": 0.0136,
"grad_norm": 1.3135032653808594,
"learning_rate": 4.810160026692147e-05,
"loss": 1.1536,
"step": 66800
},
{
"epoch": 0.0138,
"grad_norm": 0.8059789538383484,
"learning_rate": 4.809553078974528e-05,
"loss": 1.2083,
"step": 66900
},
{
"epoch": 0.014,
"grad_norm": 1.493458867073059,
"learning_rate": 4.808945200969206e-05,
"loss": 1.2031,
"step": 67000
},
{
"epoch": 0.014,
"eval_loss": 1.0807029008865356,
"eval_runtime": 76.4746,
"eval_samples_per_second": 201.936,
"eval_steps_per_second": 3.164,
"step": 67000
},
{
"epoch": 0.0142,
"grad_norm": 0.9932582378387451,
"learning_rate": 4.808336392921033e-05,
"loss": 1.1932,
"step": 67100
},
{
"epoch": 0.0144,
"grad_norm": 1.1588648557662964,
"learning_rate": 4.807726655075237e-05,
"loss": 1.2004,
"step": 67200
},
{
"epoch": 0.0146,
"grad_norm": 0.713295042514801,
"learning_rate": 4.80711598767742e-05,
"loss": 1.1336,
"step": 67300
},
{
"epoch": 0.0148,
"grad_norm": 1.474853277206421,
"learning_rate": 4.80650439097356e-05,
"loss": 1.1909,
"step": 67400
},
{
"epoch": 0.015,
"grad_norm": 1.0433249473571777,
"learning_rate": 4.805891865210006e-05,
"loss": 1.1868,
"step": 67500
},
{
"epoch": 0.0152,
"grad_norm": 0.9942545294761658,
"learning_rate": 4.8052784106334854e-05,
"loss": 1.1896,
"step": 67600
},
{
"epoch": 0.0154,
"grad_norm": 0.9021309018135071,
"learning_rate": 4.804664027491096e-05,
"loss": 1.2265,
"step": 67700
},
{
"epoch": 0.0156,
"grad_norm": 1.4818402528762817,
"learning_rate": 4.8040487160303126e-05,
"loss": 1.2149,
"step": 67800
},
{
"epoch": 0.0158,
"grad_norm": 0.74870365858078,
"learning_rate": 4.803432476498981e-05,
"loss": 1.1928,
"step": 67900
},
{
"epoch": 0.016,
"grad_norm": 0.7827754020690918,
"learning_rate": 4.8028153091453246e-05,
"loss": 1.2062,
"step": 68000
},
{
"epoch": 0.016,
"eval_loss": 1.0748348236083984,
"eval_runtime": 75.9274,
"eval_samples_per_second": 203.392,
"eval_steps_per_second": 3.187,
"step": 68000
},
{
"epoch": 0.0162,
"grad_norm": 1.2250913381576538,
"learning_rate": 4.802197214217936e-05,
"loss": 1.1412,
"step": 68100
},
{
"epoch": 0.0164,
"grad_norm": 1.4763202667236328,
"learning_rate": 4.801578191965785e-05,
"loss": 1.173,
"step": 68200
},
{
"epoch": 0.0166,
"grad_norm": 0.8980317115783691,
"learning_rate": 4.800958242638214e-05,
"loss": 1.1801,
"step": 68300
},
{
"epoch": 0.0168,
"grad_norm": 1.2781926393508911,
"learning_rate": 4.800337366484937e-05,
"loss": 1.2012,
"step": 68400
},
{
"epoch": 0.017,
"grad_norm": 0.8269230723381042,
"learning_rate": 4.799715563756045e-05,
"loss": 1.2319,
"step": 68500
},
{
"epoch": 0.0172,
"grad_norm": 0.633537232875824,
"learning_rate": 4.7990928347019984e-05,
"loss": 1.2058,
"step": 68600
},
{
"epoch": 0.0174,
"grad_norm": 1.39164400100708,
"learning_rate": 4.7984691795736324e-05,
"loss": 1.2066,
"step": 68700
},
{
"epoch": 0.0176,
"grad_norm": 1.5555399656295776,
"learning_rate": 4.7978445986221566e-05,
"loss": 1.2088,
"step": 68800
},
{
"epoch": 0.0178,
"grad_norm": 1.2505526542663574,
"learning_rate": 4.7972190920991514e-05,
"loss": 1.203,
"step": 68900
},
{
"epoch": 0.018,
"grad_norm": 1.5910965204238892,
"learning_rate": 4.7965926602565705e-05,
"loss": 1.1877,
"step": 69000
},
{
"epoch": 0.018,
"eval_loss": 1.0717748403549194,
"eval_runtime": 75.7519,
"eval_samples_per_second": 203.863,
"eval_steps_per_second": 3.195,
"step": 69000
},
{
"epoch": 0.0182,
"grad_norm": 0.7755507826805115,
"learning_rate": 4.79596530334674e-05,
"loss": 1.1864,
"step": 69100
},
{
"epoch": 0.0184,
"grad_norm": 1.2141857147216797,
"learning_rate": 4.79533702162236e-05,
"loss": 1.1849,
"step": 69200
},
{
"epoch": 0.0186,
"grad_norm": 1.399149775505066,
"learning_rate": 4.794707815336503e-05,
"loss": 1.1702,
"step": 69300
},
{
"epoch": 0.0188,
"grad_norm": 1.3381379842758179,
"learning_rate": 4.7940776847426114e-05,
"loss": 1.2052,
"step": 69400
},
{
"epoch": 0.019,
"grad_norm": 1.347264051437378,
"learning_rate": 4.793446630094503e-05,
"loss": 1.1998,
"step": 69500
},
{
"epoch": 0.0192,
"grad_norm": 1.2072675228118896,
"learning_rate": 4.792814651646367e-05,
"loss": 1.2127,
"step": 69600
},
{
"epoch": 0.0194,
"grad_norm": 0.7959086894989014,
"learning_rate": 4.792181749652763e-05,
"loss": 1.1474,
"step": 69700
},
{
"epoch": 0.0196,
"grad_norm": 1.0272786617279053,
"learning_rate": 4.7915479243686244e-05,
"loss": 1.2033,
"step": 69800
},
{
"epoch": 0.0198,
"grad_norm": 0.8985835909843445,
"learning_rate": 4.790913176049256e-05,
"loss": 1.1942,
"step": 69900
},
{
"epoch": 0.02,
"grad_norm": 0.676177442073822,
"learning_rate": 4.7902775049503346e-05,
"loss": 1.1883,
"step": 70000
},
{
"epoch": 0.02,
"eval_loss": 1.0733578205108643,
"eval_runtime": 75.8186,
"eval_samples_per_second": 203.684,
"eval_steps_per_second": 3.192,
"step": 70000
},
{
"epoch": 0.0202,
"grad_norm": 0.7747570872306824,
"learning_rate": 4.789640911327907e-05,
"loss": 1.1883,
"step": 70100
},
{
"epoch": 0.0204,
"grad_norm": 1.1808815002441406,
"learning_rate": 4.789003395438395e-05,
"loss": 1.1932,
"step": 70200
},
{
"epoch": 0.0206,
"grad_norm": 1.29102623462677,
"learning_rate": 4.7883649575385894e-05,
"loss": 1.1654,
"step": 70300
},
{
"epoch": 0.0208,
"grad_norm": 0.8418052792549133,
"learning_rate": 4.7877255978856516e-05,
"loss": 1.1702,
"step": 70400
},
{
"epoch": 0.021,
"grad_norm": 1.1825124025344849,
"learning_rate": 4.787085316737116e-05,
"loss": 1.1654,
"step": 70500
},
{
"epoch": 0.0212,
"grad_norm": 1.301255702972412,
"learning_rate": 4.78644411435089e-05,
"loss": 1.2505,
"step": 70600
},
{
"epoch": 0.0214,
"grad_norm": 1.2461885213851929,
"learning_rate": 4.785801990985247e-05,
"loss": 1.1907,
"step": 70700
},
{
"epoch": 0.0216,
"grad_norm": 1.2957687377929688,
"learning_rate": 4.7851589468988364e-05,
"loss": 1.2244,
"step": 70800
},
{
"epoch": 0.0218,
"grad_norm": 1.9566733837127686,
"learning_rate": 4.7845149823506744e-05,
"loss": 1.1688,
"step": 70900
},
{
"epoch": 0.022,
"grad_norm": 0.9749345779418945,
"learning_rate": 4.783870097600151e-05,
"loss": 1.2178,
"step": 71000
},
{
"epoch": 0.022,
"eval_loss": 1.076163649559021,
"eval_runtime": 75.78,
"eval_samples_per_second": 203.787,
"eval_steps_per_second": 3.193,
"step": 71000
},
{
"epoch": 0.0222,
"grad_norm": 1.1278064250946045,
"learning_rate": 4.783224292907025e-05,
"loss": 1.1899,
"step": 71100
},
{
"epoch": 0.0224,
"grad_norm": 1.023586392402649,
"learning_rate": 4.7825775685314277e-05,
"loss": 1.1967,
"step": 71200
},
{
"epoch": 0.0226,
"grad_norm": 1.2925764322280884,
"learning_rate": 4.781929924733858e-05,
"loss": 1.2154,
"step": 71300
},
{
"epoch": 0.0228,
"grad_norm": 0.8185212016105652,
"learning_rate": 4.781281361775188e-05,
"loss": 1.195,
"step": 71400
},
{
"epoch": 0.023,
"grad_norm": 0.8742319941520691,
"learning_rate": 4.7806318799166586e-05,
"loss": 1.1746,
"step": 71500
},
{
"epoch": 0.0232,
"grad_norm": 1.2598085403442383,
"learning_rate": 4.77998147941988e-05,
"loss": 1.1781,
"step": 71600
},
{
"epoch": 0.0234,
"grad_norm": 1.2358424663543701,
"learning_rate": 4.7793301605468344e-05,
"loss": 1.2345,
"step": 71700
},
{
"epoch": 0.0236,
"grad_norm": 1.2528828382492065,
"learning_rate": 4.778677923559872e-05,
"loss": 1.2109,
"step": 71800
},
{
"epoch": 0.0238,
"grad_norm": 0.5741105675697327,
"learning_rate": 4.778024768721716e-05,
"loss": 1.2076,
"step": 71900
},
{
"epoch": 0.024,
"grad_norm": 1.3200185298919678,
"learning_rate": 4.7773706962954545e-05,
"loss": 1.2124,
"step": 72000
},
{
"epoch": 0.024,
"eval_loss": 1.0720120668411255,
"eval_runtime": 76.4471,
"eval_samples_per_second": 202.009,
"eval_steps_per_second": 3.166,
"step": 72000
},
{
"epoch": 0.0242,
"grad_norm": 1.4096635580062866,
"learning_rate": 4.776715706544549e-05,
"loss": 1.2283,
"step": 72100
},
{
"epoch": 0.0244,
"grad_norm": 1.5862853527069092,
"learning_rate": 4.7760597997328295e-05,
"loss": 1.1927,
"step": 72200
},
{
"epoch": 0.0246,
"grad_norm": 1.3406593799591064,
"learning_rate": 4.7754029761244945e-05,
"loss": 1.1709,
"step": 72300
},
{
"epoch": 0.0248,
"grad_norm": 1.189676284790039,
"learning_rate": 4.774745235984113e-05,
"loss": 1.2176,
"step": 72400
},
{
"epoch": 0.025,
"grad_norm": 1.4424960613250732,
"learning_rate": 4.7740865795766224e-05,
"loss": 1.2212,
"step": 72500
},
{
"epoch": 0.0252,
"grad_norm": 0.7654275298118591,
"learning_rate": 4.77342700716733e-05,
"loss": 1.2196,
"step": 72600
},
{
"epoch": 0.0254,
"grad_norm": 1.1389504671096802,
"learning_rate": 4.772766519021911e-05,
"loss": 1.1937,
"step": 72700
},
{
"epoch": 0.0256,
"grad_norm": 1.1204986572265625,
"learning_rate": 4.772105115406409e-05,
"loss": 1.1623,
"step": 72800
},
{
"epoch": 0.0258,
"grad_norm": 1.2594044208526611,
"learning_rate": 4.771442796587239e-05,
"loss": 1.2127,
"step": 72900
},
{
"epoch": 0.026,
"grad_norm": 1.3245586156845093,
"learning_rate": 4.770779562831181e-05,
"loss": 1.1919,
"step": 73000
},
{
"epoch": 0.026,
"eval_loss": 1.0672369003295898,
"eval_runtime": 76.1554,
"eval_samples_per_second": 202.783,
"eval_steps_per_second": 3.178,
"step": 73000
},
{
"epoch": 0.0262,
"grad_norm": 0.813410222530365,
"learning_rate": 4.770115414405388e-05,
"loss": 1.224,
"step": 73100
},
{
"epoch": 0.0264,
"grad_norm": 1.3278921842575073,
"learning_rate": 4.769450351577377e-05,
"loss": 1.2304,
"step": 73200
},
{
"epoch": 0.0266,
"grad_norm": 1.1676868200302124,
"learning_rate": 4.768784374615036e-05,
"loss": 1.2144,
"step": 73300
},
{
"epoch": 0.0268,
"grad_norm": 1.2270694971084595,
"learning_rate": 4.7681174837866196e-05,
"loss": 1.2584,
"step": 73400
},
{
"epoch": 0.027,
"grad_norm": 1.5095762014389038,
"learning_rate": 4.7674496793607525e-05,
"loss": 1.1892,
"step": 73500
},
{
"epoch": 0.0272,
"grad_norm": 1.0437262058258057,
"learning_rate": 4.766780961606426e-05,
"loss": 1.2003,
"step": 73600
},
{
"epoch": 0.0274,
"grad_norm": 0.6719204187393188,
"learning_rate": 4.766111330793e-05,
"loss": 1.2145,
"step": 73700
},
{
"epoch": 0.0276,
"grad_norm": 0.7166513204574585,
"learning_rate": 4.765440787190199e-05,
"loss": 1.2463,
"step": 73800
},
{
"epoch": 0.0278,
"grad_norm": 0.9765319228172302,
"learning_rate": 4.7647693310681204e-05,
"loss": 1.2095,
"step": 73900
},
{
"epoch": 0.028,
"grad_norm": 1.298134446144104,
"learning_rate": 4.7640969626972265e-05,
"loss": 1.2089,
"step": 74000
},
{
"epoch": 0.028,
"eval_loss": 1.0727263689041138,
"eval_runtime": 76.0016,
"eval_samples_per_second": 203.193,
"eval_steps_per_second": 3.184,
"step": 74000
},
{
"epoch": 0.0282,
"grad_norm": 1.1968761682510376,
"learning_rate": 4.763423682348347e-05,
"loss": 1.1719,
"step": 74100
},
{
"epoch": 0.0284,
"grad_norm": 1.1887174844741821,
"learning_rate": 4.762749490292678e-05,
"loss": 1.1961,
"step": 74200
},
{
"epoch": 0.0286,
"grad_norm": 1.4029371738433838,
"learning_rate": 4.762074386801786e-05,
"loss": 1.1609,
"step": 74300
},
{
"epoch": 0.0288,
"grad_norm": 1.3785122632980347,
"learning_rate": 4.761398372147601e-05,
"loss": 1.1931,
"step": 74400
},
{
"epoch": 0.029,
"grad_norm": 1.1329565048217773,
"learning_rate": 4.760721446602422e-05,
"loss": 1.2107,
"step": 74500
},
{
"epoch": 0.0292,
"grad_norm": 1.2266113758087158,
"learning_rate": 4.760043610438915e-05,
"loss": 1.1708,
"step": 74600
},
{
"epoch": 0.0294,
"grad_norm": 1.2526196241378784,
"learning_rate": 4.759364863930112e-05,
"loss": 1.2073,
"step": 74700
},
{
"epoch": 0.0296,
"grad_norm": 1.3959336280822754,
"learning_rate": 4.7586852073494113e-05,
"loss": 1.1995,
"step": 74800
},
{
"epoch": 0.0298,
"grad_norm": 1.2470852136611938,
"learning_rate": 4.7580046409705806e-05,
"loss": 1.2227,
"step": 74900
},
{
"epoch": 0.03,
"grad_norm": 1.0915220975875854,
"learning_rate": 4.7573231650677495e-05,
"loss": 1.1955,
"step": 75000
},
{
"epoch": 0.03,
"eval_loss": 1.0732471942901611,
"eval_runtime": 75.8455,
"eval_samples_per_second": 203.611,
"eval_steps_per_second": 3.191,
"step": 75000
},
{
"epoch": 0.0302,
"grad_norm": 1.4608689546585083,
"learning_rate": 4.756640779915418e-05,
"loss": 1.1588,
"step": 75100
},
{
"epoch": 0.0304,
"grad_norm": 1.2811450958251953,
"learning_rate": 4.755957485788449e-05,
"loss": 1.1722,
"step": 75200
},
{
"epoch": 0.0306,
"grad_norm": 1.3260635137557983,
"learning_rate": 4.755273282962075e-05,
"loss": 1.2238,
"step": 75300
},
{
"epoch": 0.0308,
"grad_norm": 1.219567060470581,
"learning_rate": 4.754588171711893e-05,
"loss": 1.2718,
"step": 75400
},
{
"epoch": 0.031,
"grad_norm": 1.368947982788086,
"learning_rate": 4.753902152313865e-05,
"loss": 1.1998,
"step": 75500
},
{
"epoch": 0.0312,
"grad_norm": 1.3364487886428833,
"learning_rate": 4.7532152250443194e-05,
"loss": 1.2043,
"step": 75600
},
{
"epoch": 0.0314,
"grad_norm": 1.348130702972412,
"learning_rate": 4.7525273901799506e-05,
"loss": 1.1834,
"step": 75700
},
{
"epoch": 0.0316,
"grad_norm": 1.1862202882766724,
"learning_rate": 4.751838647997818e-05,
"loss": 1.2061,
"step": 75800
},
{
"epoch": 0.0318,
"grad_norm": 0.7471460103988647,
"learning_rate": 4.7511489987753476e-05,
"loss": 1.1866,
"step": 75900
},
{
"epoch": 0.032,
"grad_norm": 1.4090434312820435,
"learning_rate": 4.75045844279033e-05,
"loss": 1.1878,
"step": 76000
},
{
"epoch": 0.032,
"eval_loss": 1.0745600461959839,
"eval_runtime": 76.306,
"eval_samples_per_second": 202.382,
"eval_steps_per_second": 3.171,
"step": 76000
},
{
"epoch": 0.0322,
"grad_norm": 1.591199278831482,
"learning_rate": 4.7497669803209204e-05,
"loss": 1.1824,
"step": 76100
},
{
"epoch": 0.0324,
"grad_norm": 0.8325656652450562,
"learning_rate": 4.749074611645641e-05,
"loss": 1.1723,
"step": 76200
},
{
"epoch": 0.0326,
"grad_norm": 0.8313425779342651,
"learning_rate": 4.748381337043376e-05,
"loss": 1.2033,
"step": 76300
},
{
"epoch": 0.0328,
"grad_norm": 1.4721826314926147,
"learning_rate": 4.7476871567933775e-05,
"loss": 1.1988,
"step": 76400
},
{
"epoch": 0.033,
"grad_norm": 0.9206506013870239,
"learning_rate": 4.746992071175261e-05,
"loss": 1.1844,
"step": 76500
},
{
"epoch": 0.0332,
"grad_norm": 1.0820422172546387,
"learning_rate": 4.746296080469007e-05,
"loss": 1.1902,
"step": 76600
},
{
"epoch": 0.0334,
"grad_norm": 0.9319769144058228,
"learning_rate": 4.745599184954961e-05,
"loss": 1.2031,
"step": 76700
},
{
"epoch": 0.0336,
"grad_norm": 1.1914819478988647,
"learning_rate": 4.744901384913831e-05,
"loss": 1.166,
"step": 76800
},
{
"epoch": 0.0338,
"grad_norm": 0.8440219163894653,
"learning_rate": 4.7442026806266914e-05,
"loss": 1.1493,
"step": 76900
},
{
"epoch": 0.034,
"grad_norm": 1.001518726348877,
"learning_rate": 4.7435030723749813e-05,
"loss": 1.1835,
"step": 77000
},
{
"epoch": 0.034,
"eval_loss": 1.0681182146072388,
"eval_runtime": 76.1301,
"eval_samples_per_second": 202.85,
"eval_steps_per_second": 3.179,
"step": 77000
},
{
"epoch": 0.0342,
"grad_norm": 1.347307562828064,
"learning_rate": 4.742802560440501e-05,
"loss": 1.2213,
"step": 77100
},
{
"epoch": 0.0344,
"grad_norm": 1.1187894344329834,
"learning_rate": 4.742101145105419e-05,
"loss": 1.1949,
"step": 77200
},
{
"epoch": 0.0346,
"grad_norm": 0.8066337704658508,
"learning_rate": 4.741398826652262e-05,
"loss": 1.2008,
"step": 77300
},
{
"epoch": 0.0348,
"grad_norm": 1.0704104900360107,
"learning_rate": 4.740695605363927e-05,
"loss": 1.1804,
"step": 77400
},
{
"epoch": 0.035,
"grad_norm": 1.104546070098877,
"learning_rate": 4.7399914815236704e-05,
"loss": 1.2232,
"step": 77500
},
{
"epoch": 0.0352,
"grad_norm": 1.1818023920059204,
"learning_rate": 4.7392864554151126e-05,
"loss": 1.2062,
"step": 77600
},
{
"epoch": 0.0354,
"grad_norm": 1.3036936521530151,
"learning_rate": 4.738580527322238e-05,
"loss": 1.1905,
"step": 77700
},
{
"epoch": 0.0356,
"grad_norm": 1.1169214248657227,
"learning_rate": 4.737873697529395e-05,
"loss": 1.1759,
"step": 77800
},
{
"epoch": 0.0358,
"grad_norm": 0.8993995189666748,
"learning_rate": 4.7371659663212934e-05,
"loss": 1.1677,
"step": 77900
},
{
"epoch": 0.036,
"grad_norm": 1.258747935295105,
"learning_rate": 4.736457333983009e-05,
"loss": 1.2166,
"step": 78000
},
{
"epoch": 0.036,
"eval_loss": 1.0701075792312622,
"eval_runtime": 75.9209,
"eval_samples_per_second": 203.409,
"eval_steps_per_second": 3.188,
"step": 78000
},
{
"epoch": 0.0362,
"grad_norm": 1.269551396369934,
"learning_rate": 4.735747800799978e-05,
"loss": 1.2185,
"step": 78100
},
{
"epoch": 0.0364,
"grad_norm": 1.3016119003295898,
"learning_rate": 4.735037367057999e-05,
"loss": 1.182,
"step": 78200
},
{
"epoch": 0.0366,
"grad_norm": 1.1407994031906128,
"learning_rate": 4.734326033043238e-05,
"loss": 1.2102,
"step": 78300
},
{
"epoch": 0.0368,
"grad_norm": 1.1673243045806885,
"learning_rate": 4.7336137990422164e-05,
"loss": 1.1902,
"step": 78400
},
{
"epoch": 0.037,
"grad_norm": 0.9958565831184387,
"learning_rate": 4.732900665341824e-05,
"loss": 1.2112,
"step": 78500
},
{
"epoch": 0.0372,
"grad_norm": 0.6769017577171326,
"learning_rate": 4.732186632229311e-05,
"loss": 1.1933,
"step": 78600
},
{
"epoch": 0.0374,
"grad_norm": 0.6430754661560059,
"learning_rate": 4.7314716999922896e-05,
"loss": 1.1851,
"step": 78700
},
{
"epoch": 0.0376,
"grad_norm": 1.103901982307434,
"learning_rate": 4.7307558689187334e-05,
"loss": 1.2234,
"step": 78800
},
{
"epoch": 0.0378,
"grad_norm": 1.143268346786499,
"learning_rate": 4.73003913929698e-05,
"loss": 1.1609,
"step": 78900
},
{
"epoch": 0.038,
"grad_norm": 1.2543673515319824,
"learning_rate": 4.7293215114157284e-05,
"loss": 1.1862,
"step": 79000
},
{
"epoch": 0.038,
"eval_loss": 1.075058937072754,
"eval_runtime": 77.0151,
"eval_samples_per_second": 200.519,
"eval_steps_per_second": 3.142,
"step": 79000
},
{
"epoch": 0.0382,
"grad_norm": 1.0687370300292969,
"learning_rate": 4.728602985564039e-05,
"loss": 1.1878,
"step": 79100
},
{
"epoch": 0.0384,
"grad_norm": 1.230892539024353,
"learning_rate": 4.727883562031333e-05,
"loss": 1.1561,
"step": 79200
},
{
"epoch": 0.0386,
"grad_norm": 1.0465742349624634,
"learning_rate": 4.727163241107395e-05,
"loss": 1.1677,
"step": 79300
},
{
"epoch": 0.0388,
"grad_norm": 0.6553373336791992,
"learning_rate": 4.726442023082369e-05,
"loss": 1.2035,
"step": 79400
},
{
"epoch": 0.039,
"grad_norm": 0.9347487688064575,
"learning_rate": 4.725719908246763e-05,
"loss": 1.2116,
"step": 79500
},
{
"epoch": 0.0392,
"grad_norm": 1.0414602756500244,
"learning_rate": 4.724996896891445e-05,
"loss": 1.2237,
"step": 79600
},
{
"epoch": 0.0394,
"grad_norm": 1.1857577562332153,
"learning_rate": 4.724272989307642e-05,
"loss": 1.1653,
"step": 79700
},
{
"epoch": 0.0396,
"grad_norm": 1.3574703931808472,
"learning_rate": 4.7235481857869446e-05,
"loss": 1.2176,
"step": 79800
},
{
"epoch": 0.0398,
"grad_norm": 1.3188464641571045,
"learning_rate": 4.722822486621304e-05,
"loss": 1.1872,
"step": 79900
},
{
"epoch": 0.04,
"grad_norm": 1.1241661310195923,
"learning_rate": 4.722095892103032e-05,
"loss": 1.1926,
"step": 80000
},
{
"epoch": 0.04,
"eval_loss": 1.0716365575790405,
"eval_runtime": 76.5906,
"eval_samples_per_second": 201.63,
"eval_steps_per_second": 3.16,
"step": 80000
},
{
"epoch": 0.0402,
"grad_norm": 0.9855309724807739,
"learning_rate": 4.721368402524801e-05,
"loss": 1.1427,
"step": 80100
},
{
"epoch": 0.0404,
"grad_norm": 0.6458451151847839,
"learning_rate": 4.720640018179642e-05,
"loss": 1.2032,
"step": 80200
},
{
"epoch": 0.0406,
"grad_norm": 1.1878600120544434,
"learning_rate": 4.71991073936095e-05,
"loss": 1.1879,
"step": 80300
},
{
"epoch": 0.0408,
"grad_norm": 0.8349748253822327,
"learning_rate": 4.719180566362477e-05,
"loss": 1.1355,
"step": 80400
},
{
"epoch": 0.041,
"grad_norm": 1.1937662363052368,
"learning_rate": 4.7184494994783376e-05,
"loss": 1.2018,
"step": 80500
},
{
"epoch": 0.0412,
"grad_norm": 1.3011997938156128,
"learning_rate": 4.7177175390030054e-05,
"loss": 1.1697,
"step": 80600
},
{
"epoch": 0.0414,
"grad_norm": 1.1909871101379395,
"learning_rate": 4.7169846852313137e-05,
"loss": 1.2126,
"step": 80700
},
{
"epoch": 0.0416,
"grad_norm": 1.5078299045562744,
"learning_rate": 4.7162509384584555e-05,
"loss": 1.1983,
"step": 80800
},
{
"epoch": 0.0418,
"grad_norm": 1.3141160011291504,
"learning_rate": 4.715516298979984e-05,
"loss": 1.2118,
"step": 80900
},
{
"epoch": 0.042,
"grad_norm": 1.3565207719802856,
"learning_rate": 4.714780767091813e-05,
"loss": 1.2054,
"step": 81000
},
{
"epoch": 0.042,
"eval_loss": 1.0669591426849365,
"eval_runtime": 75.959,
"eval_samples_per_second": 203.307,
"eval_steps_per_second": 3.186,
"step": 81000
},
{
"epoch": 0.0422,
"grad_norm": 1.3890715837478638,
"learning_rate": 4.714044343090214e-05,
"loss": 1.1917,
"step": 81100
},
{
"epoch": 0.0424,
"grad_norm": 0.9992968440055847,
"learning_rate": 4.713307027271817e-05,
"loss": 1.1869,
"step": 81200
},
{
"epoch": 0.0426,
"grad_norm": 0.8716880679130554,
"learning_rate": 4.712568819933615e-05,
"loss": 1.1941,
"step": 81300
},
{
"epoch": 0.0428,
"grad_norm": 1.243594765663147,
"learning_rate": 4.711829721372957e-05,
"loss": 1.1667,
"step": 81400
},
{
"epoch": 0.043,
"grad_norm": 0.6567416191101074,
"learning_rate": 4.7110897318875516e-05,
"loss": 1.2105,
"step": 81500
},
{
"epoch": 0.0432,
"grad_norm": 0.5886017084121704,
"learning_rate": 4.710348851775467e-05,
"loss": 1.1867,
"step": 81600
},
{
"epoch": 0.0434,
"grad_norm": 0.6296970248222351,
"learning_rate": 4.709607081335129e-05,
"loss": 1.1702,
"step": 81700
},
{
"epoch": 0.0436,
"grad_norm": 0.9896938800811768,
"learning_rate": 4.7088644208653226e-05,
"loss": 1.1628,
"step": 81800
},
{
"epoch": 0.0438,
"grad_norm": 0.7199723720550537,
"learning_rate": 4.708120870665192e-05,
"loss": 1.1792,
"step": 81900
},
{
"epoch": 0.044,
"grad_norm": 1.3148512840270996,
"learning_rate": 4.707376431034238e-05,
"loss": 1.185,
"step": 82000
},
{
"epoch": 0.044,
"eval_loss": 1.0709099769592285,
"eval_runtime": 75.8635,
"eval_samples_per_second": 203.563,
"eval_steps_per_second": 3.19,
"step": 82000
},
{
"epoch": 0.0442,
"grad_norm": 0.6634069681167603,
"learning_rate": 4.706631102272323e-05,
"loss": 1.1633,
"step": 82100
},
{
"epoch": 0.0444,
"grad_norm": 1.3700015544891357,
"learning_rate": 4.705884884679663e-05,
"loss": 1.1712,
"step": 82200
},
{
"epoch": 0.0446,
"grad_norm": 1.1697111129760742,
"learning_rate": 4.705137778556835e-05,
"loss": 1.1902,
"step": 82300
},
{
"epoch": 0.0448,
"grad_norm": 1.4012552499771118,
"learning_rate": 4.7043897842047735e-05,
"loss": 1.216,
"step": 82400
},
{
"epoch": 0.045,
"grad_norm": 1.2128801345825195,
"learning_rate": 4.7036409019247706e-05,
"loss": 1.2169,
"step": 82500
},
{
"epoch": 0.0452,
"grad_norm": 1.435831904411316,
"learning_rate": 4.7028911320184766e-05,
"loss": 1.1839,
"step": 82600
},
{
"epoch": 0.0454,
"grad_norm": 0.8126788139343262,
"learning_rate": 4.702140474787898e-05,
"loss": 1.1652,
"step": 82700
},
{
"epoch": 0.0456,
"grad_norm": 1.1946730613708496,
"learning_rate": 4.7013889305353985e-05,
"loss": 1.2277,
"step": 82800
},
{
"epoch": 0.0458,
"grad_norm": 0.6007882952690125,
"learning_rate": 4.700636499563702e-05,
"loss": 1.1586,
"step": 82900
},
{
"epoch": 0.046,
"grad_norm": 0.6656979322433472,
"learning_rate": 4.699883182175886e-05,
"loss": 1.1902,
"step": 83000
},
{
"epoch": 0.046,
"eval_loss": 1.072899580001831,
"eval_runtime": 77.2342,
"eval_samples_per_second": 199.95,
"eval_steps_per_second": 3.133,
"step": 83000
},
{
"epoch": 0.0462,
"grad_norm": 1.5463351011276245,
"learning_rate": 4.6991289786753876e-05,
"loss": 1.1988,
"step": 83100
},
{
"epoch": 0.0464,
"grad_norm": 1.202536940574646,
"learning_rate": 4.698373889366e-05,
"loss": 1.1983,
"step": 83200
},
{
"epoch": 0.0466,
"grad_norm": 0.7186087369918823,
"learning_rate": 4.6976179145518724e-05,
"loss": 1.15,
"step": 83300
},
{
"epoch": 0.0468,
"grad_norm": 1.3059759140014648,
"learning_rate": 4.6968610545375116e-05,
"loss": 1.1896,
"step": 83400
},
{
"epoch": 0.047,
"grad_norm": 0.8425590991973877,
"learning_rate": 4.696103309627781e-05,
"loss": 1.1747,
"step": 83500
},
{
"epoch": 0.0472,
"grad_norm": 1.1745330095291138,
"learning_rate": 4.695344680127899e-05,
"loss": 1.1591,
"step": 83600
},
{
"epoch": 0.0474,
"grad_norm": 0.6429449915885925,
"learning_rate": 4.694585166343443e-05,
"loss": 1.1893,
"step": 83700
},
{
"epoch": 0.0476,
"grad_norm": 1.5323892831802368,
"learning_rate": 4.693824768580344e-05,
"loss": 1.2037,
"step": 83800
},
{
"epoch": 0.0478,
"grad_norm": 1.2719688415527344,
"learning_rate": 4.693063487144891e-05,
"loss": 1.191,
"step": 83900
},
{
"epoch": 0.048,
"grad_norm": 1.1735507249832153,
"learning_rate": 4.6923013223437276e-05,
"loss": 1.1904,
"step": 84000
},
{
"epoch": 0.048,
"eval_loss": 1.0721956491470337,
"eval_runtime": 76.3531,
"eval_samples_per_second": 202.258,
"eval_steps_per_second": 3.169,
"step": 84000
},
{
"epoch": 0.0482,
"grad_norm": 1.1949045658111572,
"learning_rate": 4.6915382744838536e-05,
"loss": 1.1507,
"step": 84100
},
{
"epoch": 0.0484,
"grad_norm": 1.074385404586792,
"learning_rate": 4.690774343872625e-05,
"loss": 1.1504,
"step": 84200
},
{
"epoch": 0.0486,
"grad_norm": 1.0720473527908325,
"learning_rate": 4.690009530817753e-05,
"loss": 1.1758,
"step": 84300
},
{
"epoch": 0.0488,
"grad_norm": 1.0596733093261719,
"learning_rate": 4.6892438356273024e-05,
"loss": 1.1778,
"step": 84400
},
{
"epoch": 0.049,
"grad_norm": 1.2753647565841675,
"learning_rate": 4.688477258609698e-05,
"loss": 1.1827,
"step": 84500
},
{
"epoch": 0.0492,
"grad_norm": 1.2803727388381958,
"learning_rate": 4.687709800073715e-05,
"loss": 1.164,
"step": 84600
},
{
"epoch": 0.0494,
"grad_norm": 1.4797301292419434,
"learning_rate": 4.6869414603284865e-05,
"loss": 1.1748,
"step": 84700
},
{
"epoch": 0.0496,
"grad_norm": 1.1455540657043457,
"learning_rate": 4.6861722396834996e-05,
"loss": 1.1918,
"step": 84800
},
{
"epoch": 0.0498,
"grad_norm": 1.1636658906936646,
"learning_rate": 4.6854021384485954e-05,
"loss": 1.208,
"step": 84900
},
{
"epoch": 0.05,
"grad_norm": 1.267817735671997,
"learning_rate": 4.684631156933971e-05,
"loss": 1.1679,
"step": 85000
},
{
"epoch": 0.05,
"eval_loss": 1.0709259510040283,
"eval_runtime": 76.3729,
"eval_samples_per_second": 202.205,
"eval_steps_per_second": 3.169,
"step": 85000
},
{
"epoch": 0.0502,
"grad_norm": 1.5029271841049194,
"learning_rate": 4.683859295450178e-05,
"loss": 1.1459,
"step": 85100
},
{
"epoch": 0.0504,
"grad_norm": 0.7328454256057739,
"learning_rate": 4.683086554308123e-05,
"loss": 1.1861,
"step": 85200
},
{
"epoch": 0.0506,
"grad_norm": 1.114625334739685,
"learning_rate": 4.682312933819063e-05,
"loss": 1.1609,
"step": 85300
},
{
"epoch": 0.0508,
"grad_norm": 1.4052484035491943,
"learning_rate": 4.681538434294615e-05,
"loss": 1.1534,
"step": 85400
},
{
"epoch": 0.051,
"grad_norm": 0.7364799976348877,
"learning_rate": 4.6807630560467475e-05,
"loss": 1.1973,
"step": 85500
},
{
"epoch": 0.0512,
"grad_norm": 0.701787531375885,
"learning_rate": 4.679986799387781e-05,
"loss": 1.1743,
"step": 85600
},
{
"epoch": 0.0514,
"grad_norm": 1.331763744354248,
"learning_rate": 4.679209664630393e-05,
"loss": 1.1516,
"step": 85700
},
{
"epoch": 0.0516,
"grad_norm": 0.9733197689056396,
"learning_rate": 4.6784316520876124e-05,
"loss": 1.1646,
"step": 85800
},
{
"epoch": 0.0518,
"grad_norm": 0.7415294051170349,
"learning_rate": 4.677652762072823e-05,
"loss": 1.2005,
"step": 85900
},
{
"epoch": 0.052,
"grad_norm": 1.1027395725250244,
"learning_rate": 4.6768729948997606e-05,
"loss": 1.1601,
"step": 86000
},
{
"epoch": 0.052,
"eval_loss": 1.0681675672531128,
"eval_runtime": 76.2441,
"eval_samples_per_second": 202.547,
"eval_steps_per_second": 3.174,
"step": 86000
},
{
"epoch": 0.0522,
"grad_norm": 0.7156331539154053,
"learning_rate": 4.676092350882517e-05,
"loss": 1.1854,
"step": 86100
},
{
"epoch": 0.0524,
"grad_norm": 1.3423713445663452,
"learning_rate": 4.675310830335534e-05,
"loss": 1.2135,
"step": 86200
},
{
"epoch": 0.0526,
"grad_norm": 1.1925442218780518,
"learning_rate": 4.6745284335736095e-05,
"loss": 1.1414,
"step": 86300
},
{
"epoch": 0.0528,
"grad_norm": 1.1717417240142822,
"learning_rate": 4.673745160911891e-05,
"loss": 1.184,
"step": 86400
},
{
"epoch": 0.053,
"grad_norm": 1.0722715854644775,
"learning_rate": 4.672961012665883e-05,
"loss": 1.1685,
"step": 86500
},
{
"epoch": 0.0532,
"grad_norm": 1.293058156967163,
"learning_rate": 4.6721759891514386e-05,
"loss": 1.1639,
"step": 86600
},
{
"epoch": 0.0534,
"grad_norm": 1.1121761798858643,
"learning_rate": 4.671390090684765e-05,
"loss": 1.1693,
"step": 86700
},
{
"epoch": 0.0536,
"grad_norm": 1.1979039907455444,
"learning_rate": 4.6706033175824226e-05,
"loss": 1.2123,
"step": 86800
},
{
"epoch": 0.0538,
"grad_norm": 1.3003602027893066,
"learning_rate": 4.669815670161324e-05,
"loss": 1.1529,
"step": 86900
},
{
"epoch": 0.054,
"grad_norm": 0.627068817615509,
"learning_rate": 4.669027148738732e-05,
"loss": 1.1901,
"step": 87000
},
{
"epoch": 0.054,
"eval_loss": 1.0730499029159546,
"eval_runtime": 76.271,
"eval_samples_per_second": 202.475,
"eval_steps_per_second": 3.173,
"step": 87000
},
{
"epoch": 0.0542,
"grad_norm": 1.0153006315231323,
"learning_rate": 4.6682377536322644e-05,
"loss": 1.1295,
"step": 87100
},
{
"epoch": 0.0544,
"grad_norm": 1.3619033098220825,
"learning_rate": 4.667447485159889e-05,
"loss": 1.1759,
"step": 87200
},
{
"epoch": 0.0546,
"grad_norm": 0.8665468692779541,
"learning_rate": 4.666656343639926e-05,
"loss": 1.1602,
"step": 87300
},
{
"epoch": 0.0548,
"grad_norm": 0.7338219285011292,
"learning_rate": 4.665864329391046e-05,
"loss": 1.1856,
"step": 87400
},
{
"epoch": 0.055,
"grad_norm": 0.7363407611846924,
"learning_rate": 4.665071442732274e-05,
"loss": 1.1629,
"step": 87500
},
{
"epoch": 0.0552,
"grad_norm": 0.9836055636405945,
"learning_rate": 4.664277683982984e-05,
"loss": 1.1755,
"step": 87600
},
{
"epoch": 0.0554,
"grad_norm": 1.0638995170593262,
"learning_rate": 4.663483053462901e-05,
"loss": 1.186,
"step": 87700
},
{
"epoch": 0.0556,
"grad_norm": 0.9050219058990479,
"learning_rate": 4.662687551492103e-05,
"loss": 1.2357,
"step": 87800
},
{
"epoch": 0.0558,
"grad_norm": 0.917178213596344,
"learning_rate": 4.661891178391018e-05,
"loss": 1.1573,
"step": 87900
},
{
"epoch": 0.056,
"grad_norm": 1.2023630142211914,
"learning_rate": 4.661093934480425e-05,
"loss": 1.1795,
"step": 88000
},
{
"epoch": 0.056,
"eval_loss": 1.0689297914505005,
"eval_runtime": 77.0471,
"eval_samples_per_second": 200.436,
"eval_steps_per_second": 3.141,
"step": 88000
},
{
"epoch": 0.0562,
"grad_norm": 1.2633955478668213,
"learning_rate": 4.660295820081453e-05,
"loss": 1.1501,
"step": 88100
},
{
"epoch": 0.0564,
"grad_norm": 0.5867215991020203,
"learning_rate": 4.6594968355155835e-05,
"loss": 1.2096,
"step": 88200
},
{
"epoch": 0.0566,
"grad_norm": 1.3425019979476929,
"learning_rate": 4.658696981104646e-05,
"loss": 1.2016,
"step": 88300
},
{
"epoch": 0.0568,
"grad_norm": 0.8101886510848999,
"learning_rate": 4.657896257170825e-05,
"loss": 1.1512,
"step": 88400
},
{
"epoch": 0.057,
"grad_norm": 1.43784761428833,
"learning_rate": 4.6570946640366474e-05,
"loss": 1.1536,
"step": 88500
},
{
"epoch": 0.0572,
"grad_norm": 0.766494870185852,
"learning_rate": 4.6562922020249984e-05,
"loss": 1.1521,
"step": 88600
},
{
"epoch": 0.0574,
"grad_norm": 1.5485390424728394,
"learning_rate": 4.6554888714591076e-05,
"loss": 1.176,
"step": 88700
},
{
"epoch": 0.0576,
"grad_norm": 0.8266467452049255,
"learning_rate": 4.654684672662557e-05,
"loss": 1.1514,
"step": 88800
},
{
"epoch": 0.0578,
"grad_norm": 1.2086583375930786,
"learning_rate": 4.6538796059592784e-05,
"loss": 1.177,
"step": 88900
},
{
"epoch": 0.058,
"grad_norm": 1.4609780311584473,
"learning_rate": 4.6530736716735526e-05,
"loss": 1.1447,
"step": 89000
},
{
"epoch": 0.058,
"eval_loss": 1.0664150714874268,
"eval_runtime": 76.3509,
"eval_samples_per_second": 202.264,
"eval_steps_per_second": 3.17,
"step": 89000
},
{
"epoch": 0.0582,
"grad_norm": 1.0640435218811035,
"learning_rate": 4.652266870130008e-05,
"loss": 1.1392,
"step": 89100
},
{
"epoch": 0.0584,
"grad_norm": 1.3286436796188354,
"learning_rate": 4.651459201653626e-05,
"loss": 1.222,
"step": 89200
},
{
"epoch": 0.0586,
"grad_norm": 0.7577000260353088,
"learning_rate": 4.650650666569736e-05,
"loss": 1.1842,
"step": 89300
},
{
"epoch": 0.0588,
"grad_norm": 1.0623698234558105,
"learning_rate": 4.6498412652040137e-05,
"loss": 1.2071,
"step": 89400
},
{
"epoch": 0.059,
"grad_norm": 0.9597827792167664,
"learning_rate": 4.6490309978824866e-05,
"loss": 1.1781,
"step": 89500
},
{
"epoch": 0.0592,
"grad_norm": 1.126639485359192,
"learning_rate": 4.6482198649315306e-05,
"loss": 1.1897,
"step": 89600
},
{
"epoch": 0.0594,
"grad_norm": 1.1724388599395752,
"learning_rate": 4.64740786667787e-05,
"loss": 1.1567,
"step": 89700
},
{
"epoch": 0.0596,
"grad_norm": 1.14126718044281,
"learning_rate": 4.6465950034485776e-05,
"loss": 1.1819,
"step": 89800
},
{
"epoch": 0.0598,
"grad_norm": 0.8016234040260315,
"learning_rate": 4.645781275571075e-05,
"loss": 1.1906,
"step": 89900
},
{
"epoch": 0.06,
"grad_norm": 1.3095015287399292,
"learning_rate": 4.644966683373131e-05,
"loss": 1.1976,
"step": 90000
},
{
"epoch": 0.06,
"eval_loss": 1.0730445384979248,
"eval_runtime": 76.1401,
"eval_samples_per_second": 202.823,
"eval_steps_per_second": 3.178,
"step": 90000
},
{
"epoch": 0.0602,
"grad_norm": 0.5794508457183838,
"learning_rate": 4.6441512271828626e-05,
"loss": 1.1478,
"step": 90100
},
{
"epoch": 0.0604,
"grad_norm": 0.9965047240257263,
"learning_rate": 4.6433349073287366e-05,
"loss": 1.201,
"step": 90200
},
{
"epoch": 0.0606,
"grad_norm": 1.280166506767273,
"learning_rate": 4.642517724139567e-05,
"loss": 1.1542,
"step": 90300
},
{
"epoch": 0.0608,
"grad_norm": 0.7828945517539978,
"learning_rate": 4.641699677944514e-05,
"loss": 1.186,
"step": 90400
},
{
"epoch": 0.061,
"grad_norm": 1.096155047416687,
"learning_rate": 4.640880769073087e-05,
"loss": 1.1969,
"step": 90500
},
{
"epoch": 0.0612,
"grad_norm": 0.7447170615196228,
"learning_rate": 4.6400609978551416e-05,
"loss": 1.1482,
"step": 90600
},
{
"epoch": 0.0614,
"grad_norm": 0.8162779808044434,
"learning_rate": 4.639240364620882e-05,
"loss": 1.2072,
"step": 90700
},
{
"epoch": 0.0616,
"grad_norm": 1.2612018585205078,
"learning_rate": 4.638418869700861e-05,
"loss": 1.1402,
"step": 90800
},
{
"epoch": 0.0618,
"grad_norm": 0.8543398380279541,
"learning_rate": 4.637596513425974e-05,
"loss": 1.1718,
"step": 90900
},
{
"epoch": 0.062,
"grad_norm": 1.2375905513763428,
"learning_rate": 4.636773296127467e-05,
"loss": 1.1587,
"step": 91000
},
{
"epoch": 0.062,
"eval_loss": 1.0713858604431152,
"eval_runtime": 76.3385,
"eval_samples_per_second": 202.296,
"eval_steps_per_second": 3.17,
"step": 91000
}
],
"logging_steps": 100,
"max_steps": 500000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.970894657486848e+18,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}