aysinghal's picture
Checkpoint at step 8000
3d6706f verified
{
"best_global_step": 8000,
"best_metric": 0.11738622933626175,
"best_model_checkpoint": "./output/run_20260415_164722_truncate_hard/checkpoint-8000",
"epoch": 0.4371584699453552,
"eval_steps": 1600,
"global_step": 8000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0,
"eval_loss": 1.416967749595642,
"eval_runtime": 2260.5945,
"eval_samples_per_second": 54.537,
"eval_steps_per_second": 3.409,
"step": 0
},
{
"epoch": 0.00273224043715847,
"grad_norm": 38.5,
"learning_rate": 1.2250000000000001e-06,
"loss": 2.8207272338867186,
"step": 50
},
{
"epoch": 0.00546448087431694,
"grad_norm": 32.25,
"learning_rate": 2.475e-06,
"loss": 2.6559625244140626,
"step": 100
},
{
"epoch": 0.00819672131147541,
"grad_norm": 18.875,
"learning_rate": 3.7250000000000003e-06,
"loss": 2.1567892456054687,
"step": 150
},
{
"epoch": 0.01092896174863388,
"grad_norm": 18.125,
"learning_rate": 4.975000000000001e-06,
"loss": 1.8427839660644532,
"step": 200
},
{
"epoch": 0.01366120218579235,
"grad_norm": 18.625,
"learning_rate": 6.225000000000001e-06,
"loss": 1.6992501831054687,
"step": 250
},
{
"epoch": 0.01639344262295082,
"grad_norm": 20.5,
"learning_rate": 7.475000000000001e-06,
"loss": 1.576735382080078,
"step": 300
},
{
"epoch": 0.01912568306010929,
"grad_norm": 19.625,
"learning_rate": 8.725000000000002e-06,
"loss": 1.5175244140625,
"step": 350
},
{
"epoch": 0.02185792349726776,
"grad_norm": 22.375,
"learning_rate": 9.975000000000002e-06,
"loss": 1.461768798828125,
"step": 400
},
{
"epoch": 0.02459016393442623,
"grad_norm": 26.375,
"learning_rate": 1.1225000000000002e-05,
"loss": 1.3915757751464843,
"step": 450
},
{
"epoch": 0.0273224043715847,
"grad_norm": 20.5,
"learning_rate": 1.2475000000000002e-05,
"loss": 1.3361874389648438,
"step": 500
},
{
"epoch": 0.030054644808743168,
"grad_norm": 20.0,
"learning_rate": 1.3725000000000002e-05,
"loss": 1.2622003936767578,
"step": 550
},
{
"epoch": 0.03278688524590164,
"grad_norm": 22.125,
"learning_rate": 1.4975000000000001e-05,
"loss": 1.2655531311035155,
"step": 600
},
{
"epoch": 0.03551912568306011,
"grad_norm": 20.375,
"learning_rate": 1.6225e-05,
"loss": 1.182469024658203,
"step": 650
},
{
"epoch": 0.03825136612021858,
"grad_norm": 20.875,
"learning_rate": 1.7475e-05,
"loss": 1.1610452270507812,
"step": 700
},
{
"epoch": 0.040983606557377046,
"grad_norm": 28.625,
"learning_rate": 1.8725e-05,
"loss": 1.1653611755371094,
"step": 750
},
{
"epoch": 0.04371584699453552,
"grad_norm": 20.375,
"learning_rate": 1.9975e-05,
"loss": 1.1153118896484375,
"step": 800
},
{
"epoch": 0.04644808743169399,
"grad_norm": 19.25,
"learning_rate": 1.9863888888888892e-05,
"loss": 1.09112060546875,
"step": 850
},
{
"epoch": 0.04918032786885246,
"grad_norm": 18.375,
"learning_rate": 1.9725000000000002e-05,
"loss": 1.0499742126464844,
"step": 900
},
{
"epoch": 0.05191256830601093,
"grad_norm": 18.75,
"learning_rate": 1.958611111111111e-05,
"loss": 1.0255325317382813,
"step": 950
},
{
"epoch": 0.0546448087431694,
"grad_norm": 17.625,
"learning_rate": 1.9447222222222224e-05,
"loss": 0.9900017547607421,
"step": 1000
},
{
"epoch": 0.05737704918032787,
"grad_norm": 17.375,
"learning_rate": 1.9308333333333336e-05,
"loss": 0.9526313018798828,
"step": 1050
},
{
"epoch": 0.060109289617486336,
"grad_norm": 23.875,
"learning_rate": 1.9169444444444445e-05,
"loss": 0.9093154907226563,
"step": 1100
},
{
"epoch": 0.06284153005464481,
"grad_norm": 18.375,
"learning_rate": 1.9030555555555558e-05,
"loss": 0.90221923828125,
"step": 1150
},
{
"epoch": 0.06557377049180328,
"grad_norm": 18.875,
"learning_rate": 1.8891666666666667e-05,
"loss": 0.8817276000976563,
"step": 1200
},
{
"epoch": 0.06830601092896176,
"grad_norm": 18.125,
"learning_rate": 1.875277777777778e-05,
"loss": 0.8606462097167968,
"step": 1250
},
{
"epoch": 0.07103825136612021,
"grad_norm": 25.125,
"learning_rate": 1.8613888888888893e-05,
"loss": 0.8436074829101563,
"step": 1300
},
{
"epoch": 0.07377049180327869,
"grad_norm": 14.75,
"learning_rate": 1.8475000000000002e-05,
"loss": 0.8204904174804688,
"step": 1350
},
{
"epoch": 0.07650273224043716,
"grad_norm": 18.75,
"learning_rate": 1.833611111111111e-05,
"loss": 0.8097339630126953,
"step": 1400
},
{
"epoch": 0.07923497267759563,
"grad_norm": 18.0,
"learning_rate": 1.8197222222222224e-05,
"loss": 0.8396243286132813,
"step": 1450
},
{
"epoch": 0.08196721311475409,
"grad_norm": 20.75,
"learning_rate": 1.8058333333333336e-05,
"loss": 0.8059647369384766,
"step": 1500
},
{
"epoch": 0.08469945355191257,
"grad_norm": 20.25,
"learning_rate": 1.7919444444444446e-05,
"loss": 0.7802545166015625,
"step": 1550
},
{
"epoch": 0.08743169398907104,
"grad_norm": 17.625,
"learning_rate": 1.7780555555555555e-05,
"loss": 0.8032011413574218,
"step": 1600
},
{
"epoch": 0.08743169398907104,
"eval_loss": 0.20313987135887146,
"eval_runtime": 1993.2601,
"eval_samples_per_second": 61.851,
"eval_steps_per_second": 3.866,
"step": 1600
},
{
"epoch": 0.09016393442622951,
"grad_norm": 22.625,
"learning_rate": 1.7641666666666667e-05,
"loss": 0.7802546691894531,
"step": 1650
},
{
"epoch": 0.09289617486338798,
"grad_norm": 19.125,
"learning_rate": 1.750277777777778e-05,
"loss": 0.7605763244628906,
"step": 1700
},
{
"epoch": 0.09562841530054644,
"grad_norm": 17.125,
"learning_rate": 1.7363888888888893e-05,
"loss": 0.7257496643066407,
"step": 1750
},
{
"epoch": 0.09836065573770492,
"grad_norm": 19.875,
"learning_rate": 1.7225000000000002e-05,
"loss": 0.7296395874023438,
"step": 1800
},
{
"epoch": 0.10109289617486339,
"grad_norm": 21.5,
"learning_rate": 1.708611111111111e-05,
"loss": 0.7457513427734375,
"step": 1850
},
{
"epoch": 0.10382513661202186,
"grad_norm": 21.25,
"learning_rate": 1.6947222222222224e-05,
"loss": 0.7263921356201172,
"step": 1900
},
{
"epoch": 0.10655737704918032,
"grad_norm": 23.625,
"learning_rate": 1.6808333333333336e-05,
"loss": 0.7204135131835937,
"step": 1950
},
{
"epoch": 0.1092896174863388,
"grad_norm": 21.0,
"learning_rate": 1.6669444444444446e-05,
"loss": 0.7349383544921875,
"step": 2000
},
{
"epoch": 0.11202185792349727,
"grad_norm": 19.875,
"learning_rate": 1.6530555555555555e-05,
"loss": 0.7568646240234375,
"step": 2050
},
{
"epoch": 0.11475409836065574,
"grad_norm": 23.125,
"learning_rate": 1.6391666666666668e-05,
"loss": 0.7186477661132813,
"step": 2100
},
{
"epoch": 0.11748633879781421,
"grad_norm": 16.75,
"learning_rate": 1.625277777777778e-05,
"loss": 0.6933038330078125,
"step": 2150
},
{
"epoch": 0.12021857923497267,
"grad_norm": 25.125,
"learning_rate": 1.6113888888888893e-05,
"loss": 0.7022312927246094,
"step": 2200
},
{
"epoch": 0.12295081967213115,
"grad_norm": 19.875,
"learning_rate": 1.5975000000000002e-05,
"loss": 0.6835511016845703,
"step": 2250
},
{
"epoch": 0.12568306010928962,
"grad_norm": 20.125,
"learning_rate": 1.583611111111111e-05,
"loss": 0.7262260437011718,
"step": 2300
},
{
"epoch": 0.1284153005464481,
"grad_norm": 18.5,
"learning_rate": 1.5697222222222224e-05,
"loss": 0.6783106994628906,
"step": 2350
},
{
"epoch": 0.13114754098360656,
"grad_norm": 22.0,
"learning_rate": 1.5558333333333337e-05,
"loss": 0.6687103271484375,
"step": 2400
},
{
"epoch": 0.13387978142076504,
"grad_norm": 21.5,
"learning_rate": 1.5419444444444446e-05,
"loss": 0.6666416931152344,
"step": 2450
},
{
"epoch": 0.1366120218579235,
"grad_norm": 18.0,
"learning_rate": 1.5280555555555555e-05,
"loss": 0.6612174987792969,
"step": 2500
},
{
"epoch": 0.13934426229508196,
"grad_norm": 23.125,
"learning_rate": 1.5141666666666668e-05,
"loss": 0.6176729202270508,
"step": 2550
},
{
"epoch": 0.14207650273224043,
"grad_norm": 20.5,
"learning_rate": 1.5002777777777779e-05,
"loss": 0.6544680786132813,
"step": 2600
},
{
"epoch": 0.1448087431693989,
"grad_norm": 16.75,
"learning_rate": 1.4863888888888891e-05,
"loss": 0.6495597839355469,
"step": 2650
},
{
"epoch": 0.14754098360655737,
"grad_norm": 20.625,
"learning_rate": 1.4725e-05,
"loss": 0.6269410705566406,
"step": 2700
},
{
"epoch": 0.15027322404371585,
"grad_norm": 19.25,
"learning_rate": 1.4586111111111111e-05,
"loss": 0.6502537536621094,
"step": 2750
},
{
"epoch": 0.15300546448087432,
"grad_norm": 25.75,
"learning_rate": 1.4447222222222224e-05,
"loss": 0.6356121826171875,
"step": 2800
},
{
"epoch": 0.1557377049180328,
"grad_norm": 21.125,
"learning_rate": 1.4308333333333335e-05,
"loss": 0.6310220336914063,
"step": 2850
},
{
"epoch": 0.15846994535519127,
"grad_norm": 18.75,
"learning_rate": 1.4169444444444444e-05,
"loss": 0.6256195449829102,
"step": 2900
},
{
"epoch": 0.16120218579234974,
"grad_norm": 19.5,
"learning_rate": 1.4030555555555557e-05,
"loss": 0.6188963317871093,
"step": 2950
},
{
"epoch": 0.16393442622950818,
"grad_norm": 20.125,
"learning_rate": 1.3891666666666668e-05,
"loss": 0.6223039245605468,
"step": 3000
},
{
"epoch": 0.16666666666666666,
"grad_norm": 25.25,
"learning_rate": 1.3752777777777779e-05,
"loss": 0.6119123840332031,
"step": 3050
},
{
"epoch": 0.16939890710382513,
"grad_norm": 18.375,
"learning_rate": 1.3613888888888891e-05,
"loss": 0.6270243835449218,
"step": 3100
},
{
"epoch": 0.1721311475409836,
"grad_norm": 21.125,
"learning_rate": 1.3475e-05,
"loss": 0.6205867004394531,
"step": 3150
},
{
"epoch": 0.17486338797814208,
"grad_norm": 19.375,
"learning_rate": 1.3336111111111112e-05,
"loss": 0.6450627136230469,
"step": 3200
},
{
"epoch": 0.17486338797814208,
"eval_loss": 0.140077143907547,
"eval_runtime": 1996.0971,
"eval_samples_per_second": 61.763,
"eval_steps_per_second": 3.861,
"step": 3200
},
{
"epoch": 0.17759562841530055,
"grad_norm": 26.625,
"learning_rate": 1.3197222222222224e-05,
"loss": 0.6031318664550781,
"step": 3250
},
{
"epoch": 0.18032786885245902,
"grad_norm": 23.5,
"learning_rate": 1.3058333333333335e-05,
"loss": 0.5786302947998047,
"step": 3300
},
{
"epoch": 0.1830601092896175,
"grad_norm": 20.625,
"learning_rate": 1.2919444444444444e-05,
"loss": 0.6212103271484375,
"step": 3350
},
{
"epoch": 0.18579234972677597,
"grad_norm": 23.375,
"learning_rate": 1.2780555555555555e-05,
"loss": 0.6024067687988282,
"step": 3400
},
{
"epoch": 0.1885245901639344,
"grad_norm": 18.625,
"learning_rate": 1.2641666666666668e-05,
"loss": 0.581977767944336,
"step": 3450
},
{
"epoch": 0.1912568306010929,
"grad_norm": 21.125,
"learning_rate": 1.2502777777777779e-05,
"loss": 0.5875739288330079,
"step": 3500
},
{
"epoch": 0.19398907103825136,
"grad_norm": 23.0,
"learning_rate": 1.2363888888888891e-05,
"loss": 0.6084017944335938,
"step": 3550
},
{
"epoch": 0.19672131147540983,
"grad_norm": 23.25,
"learning_rate": 1.2225e-05,
"loss": 0.599505615234375,
"step": 3600
},
{
"epoch": 0.1994535519125683,
"grad_norm": 24.625,
"learning_rate": 1.2086111111111112e-05,
"loss": 0.6103274917602539,
"step": 3650
},
{
"epoch": 0.20218579234972678,
"grad_norm": 21.75,
"learning_rate": 1.1947222222222223e-05,
"loss": 0.5922727966308594,
"step": 3700
},
{
"epoch": 0.20491803278688525,
"grad_norm": 19.75,
"learning_rate": 1.1808333333333335e-05,
"loss": 0.6008519744873047,
"step": 3750
},
{
"epoch": 0.20765027322404372,
"grad_norm": 21.0,
"learning_rate": 1.1669444444444444e-05,
"loss": 0.5870630645751953,
"step": 3800
},
{
"epoch": 0.2103825136612022,
"grad_norm": 19.375,
"learning_rate": 1.1530555555555555e-05,
"loss": 0.5807292938232422,
"step": 3850
},
{
"epoch": 0.21311475409836064,
"grad_norm": 23.75,
"learning_rate": 1.1391666666666668e-05,
"loss": 0.5860625839233399,
"step": 3900
},
{
"epoch": 0.21584699453551912,
"grad_norm": 21.75,
"learning_rate": 1.1252777777777779e-05,
"loss": 0.5723530578613282,
"step": 3950
},
{
"epoch": 0.2185792349726776,
"grad_norm": 22.25,
"learning_rate": 1.1113888888888892e-05,
"loss": 0.5986472320556641,
"step": 4000
},
{
"epoch": 0.22131147540983606,
"grad_norm": 20.875,
"learning_rate": 1.0975e-05,
"loss": 0.5930126190185547,
"step": 4050
},
{
"epoch": 0.22404371584699453,
"grad_norm": 23.25,
"learning_rate": 1.0836111111111112e-05,
"loss": 0.5758544540405274,
"step": 4100
},
{
"epoch": 0.226775956284153,
"grad_norm": 22.125,
"learning_rate": 1.0697222222222223e-05,
"loss": 0.5803060913085938,
"step": 4150
},
{
"epoch": 0.22950819672131148,
"grad_norm": 22.5,
"learning_rate": 1.0558333333333335e-05,
"loss": 0.5753678894042968,
"step": 4200
},
{
"epoch": 0.23224043715846995,
"grad_norm": 21.625,
"learning_rate": 1.0419444444444445e-05,
"loss": 0.5640956497192383,
"step": 4250
},
{
"epoch": 0.23497267759562843,
"grad_norm": 21.375,
"learning_rate": 1.0280555555555555e-05,
"loss": 0.5678297424316406,
"step": 4300
},
{
"epoch": 0.23770491803278687,
"grad_norm": 23.125,
"learning_rate": 1.0141666666666668e-05,
"loss": 0.5765494537353516,
"step": 4350
},
{
"epoch": 0.24043715846994534,
"grad_norm": 24.875,
"learning_rate": 1.0002777777777779e-05,
"loss": 0.5585842895507812,
"step": 4400
},
{
"epoch": 0.24316939890710382,
"grad_norm": 27.375,
"learning_rate": 9.86388888888889e-06,
"loss": 0.5505801391601562,
"step": 4450
},
{
"epoch": 0.2459016393442623,
"grad_norm": 28.875,
"learning_rate": 9.725000000000001e-06,
"loss": 0.5651544952392578,
"step": 4500
},
{
"epoch": 0.24863387978142076,
"grad_norm": 26.5,
"learning_rate": 9.586111111111112e-06,
"loss": 0.5462136077880859,
"step": 4550
},
{
"epoch": 0.25136612021857924,
"grad_norm": 25.0,
"learning_rate": 9.447222222222223e-06,
"loss": 0.55468994140625,
"step": 4600
},
{
"epoch": 0.2540983606557377,
"grad_norm": 24.0,
"learning_rate": 9.308333333333334e-06,
"loss": 0.5691349792480469,
"step": 4650
},
{
"epoch": 0.2568306010928962,
"grad_norm": 25.75,
"learning_rate": 9.169444444444445e-06,
"loss": 0.5687982940673828,
"step": 4700
},
{
"epoch": 0.25956284153005466,
"grad_norm": 26.125,
"learning_rate": 9.030555555555556e-06,
"loss": 0.5460214233398437,
"step": 4750
},
{
"epoch": 0.26229508196721313,
"grad_norm": 26.25,
"learning_rate": 8.891666666666667e-06,
"loss": 0.5721450805664062,
"step": 4800
},
{
"epoch": 0.26229508196721313,
"eval_loss": 0.12485909461975098,
"eval_runtime": 1994.0276,
"eval_samples_per_second": 61.827,
"eval_steps_per_second": 3.865,
"step": 4800
},
{
"epoch": 0.2650273224043716,
"grad_norm": 27.375,
"learning_rate": 8.752777777777779e-06,
"loss": 0.5646157073974609,
"step": 4850
},
{
"epoch": 0.2677595628415301,
"grad_norm": 19.375,
"learning_rate": 8.61388888888889e-06,
"loss": 0.5710072326660156,
"step": 4900
},
{
"epoch": 0.27049180327868855,
"grad_norm": 20.5,
"learning_rate": 8.475000000000001e-06,
"loss": 0.5315534210205078,
"step": 4950
},
{
"epoch": 0.273224043715847,
"grad_norm": 29.25,
"learning_rate": 8.336111111111112e-06,
"loss": 0.5359284591674804,
"step": 5000
},
{
"epoch": 0.27595628415300544,
"grad_norm": 20.625,
"learning_rate": 8.197222222222223e-06,
"loss": 0.5669486236572265,
"step": 5050
},
{
"epoch": 0.2786885245901639,
"grad_norm": 34.5,
"learning_rate": 8.058333333333334e-06,
"loss": 0.5689411544799805,
"step": 5100
},
{
"epoch": 0.2814207650273224,
"grad_norm": 20.0,
"learning_rate": 7.919444444444445e-06,
"loss": 0.5391948699951172,
"step": 5150
},
{
"epoch": 0.28415300546448086,
"grad_norm": 28.875,
"learning_rate": 7.780555555555556e-06,
"loss": 0.5572643280029297,
"step": 5200
},
{
"epoch": 0.28688524590163933,
"grad_norm": 27.125,
"learning_rate": 7.641666666666667e-06,
"loss": 0.538712158203125,
"step": 5250
},
{
"epoch": 0.2896174863387978,
"grad_norm": 23.875,
"learning_rate": 7.502777777777778e-06,
"loss": 0.5496884536743164,
"step": 5300
},
{
"epoch": 0.2923497267759563,
"grad_norm": 28.125,
"learning_rate": 7.363888888888889e-06,
"loss": 0.5370260620117188,
"step": 5350
},
{
"epoch": 0.29508196721311475,
"grad_norm": 27.5,
"learning_rate": 7.225000000000001e-06,
"loss": 0.5524336242675781,
"step": 5400
},
{
"epoch": 0.2978142076502732,
"grad_norm": 29.625,
"learning_rate": 7.086111111111111e-06,
"loss": 0.546603012084961,
"step": 5450
},
{
"epoch": 0.3005464480874317,
"grad_norm": 25.0,
"learning_rate": 6.947222222222223e-06,
"loss": 0.5310775756835937,
"step": 5500
},
{
"epoch": 0.30327868852459017,
"grad_norm": 26.125,
"learning_rate": 6.808333333333333e-06,
"loss": 0.5651336669921875,
"step": 5550
},
{
"epoch": 0.30601092896174864,
"grad_norm": 27.375,
"learning_rate": 6.669444444444445e-06,
"loss": 0.5147453689575195,
"step": 5600
},
{
"epoch": 0.3087431693989071,
"grad_norm": 21.5,
"learning_rate": 6.530555555555556e-06,
"loss": 0.539222183227539,
"step": 5650
},
{
"epoch": 0.3114754098360656,
"grad_norm": 30.25,
"learning_rate": 6.391666666666667e-06,
"loss": 0.5556621170043945,
"step": 5700
},
{
"epoch": 0.31420765027322406,
"grad_norm": 23.75,
"learning_rate": 6.2527777777777785e-06,
"loss": 0.5418627166748047,
"step": 5750
},
{
"epoch": 0.31693989071038253,
"grad_norm": 26.5,
"learning_rate": 6.1138888888888895e-06,
"loss": 0.5251173782348633,
"step": 5800
},
{
"epoch": 0.319672131147541,
"grad_norm": 26.375,
"learning_rate": 5.975e-06,
"loss": 0.5350538635253906,
"step": 5850
},
{
"epoch": 0.3224043715846995,
"grad_norm": 18.5,
"learning_rate": 5.836111111111111e-06,
"loss": 0.5373062896728515,
"step": 5900
},
{
"epoch": 0.3251366120218579,
"grad_norm": 20.375,
"learning_rate": 5.697222222222223e-06,
"loss": 0.5342027282714844,
"step": 5950
},
{
"epoch": 0.32786885245901637,
"grad_norm": 25.0,
"learning_rate": 5.558333333333333e-06,
"loss": 0.5342716979980469,
"step": 6000
},
{
"epoch": 0.33060109289617484,
"grad_norm": 22.5,
"learning_rate": 5.419444444444445e-06,
"loss": 0.5382299423217773,
"step": 6050
},
{
"epoch": 0.3333333333333333,
"grad_norm": 28.375,
"learning_rate": 5.280555555555555e-06,
"loss": 0.5459404373168946,
"step": 6100
},
{
"epoch": 0.3360655737704918,
"grad_norm": 25.625,
"learning_rate": 5.141666666666667e-06,
"loss": 0.5430771636962891,
"step": 6150
},
{
"epoch": 0.33879781420765026,
"grad_norm": 21.875,
"learning_rate": 5.002777777777779e-06,
"loss": 0.5261387252807617,
"step": 6200
},
{
"epoch": 0.34153005464480873,
"grad_norm": 27.625,
"learning_rate": 4.863888888888889e-06,
"loss": 0.5376666259765625,
"step": 6250
},
{
"epoch": 0.3442622950819672,
"grad_norm": 27.75,
"learning_rate": 4.7250000000000005e-06,
"loss": 0.5197310638427735,
"step": 6300
},
{
"epoch": 0.3469945355191257,
"grad_norm": 23.0,
"learning_rate": 4.5861111111111114e-06,
"loss": 0.5165463256835937,
"step": 6350
},
{
"epoch": 0.34972677595628415,
"grad_norm": 26.75,
"learning_rate": 4.447222222222222e-06,
"loss": 0.5260678100585937,
"step": 6400
},
{
"epoch": 0.34972677595628415,
"eval_loss": 0.11765411496162415,
"eval_runtime": 1990.9759,
"eval_samples_per_second": 61.922,
"eval_steps_per_second": 3.87,
"step": 6400
},
{
"epoch": 0.3524590163934426,
"grad_norm": 26.0,
"learning_rate": 4.308333333333334e-06,
"loss": 0.5461505126953125,
"step": 6450
},
{
"epoch": 0.3551912568306011,
"grad_norm": 23.75,
"learning_rate": 4.169444444444445e-06,
"loss": 0.5273017120361329,
"step": 6500
},
{
"epoch": 0.35792349726775957,
"grad_norm": 28.75,
"learning_rate": 4.030555555555556e-06,
"loss": 0.5286589050292969,
"step": 6550
},
{
"epoch": 0.36065573770491804,
"grad_norm": 27.75,
"learning_rate": 3.891666666666667e-06,
"loss": 0.5252587127685547,
"step": 6600
},
{
"epoch": 0.3633879781420765,
"grad_norm": 26.25,
"learning_rate": 3.752777777777778e-06,
"loss": 0.5342652893066406,
"step": 6650
},
{
"epoch": 0.366120218579235,
"grad_norm": 30.125,
"learning_rate": 3.613888888888889e-06,
"loss": 0.536466178894043,
"step": 6700
},
{
"epoch": 0.36885245901639346,
"grad_norm": 27.75,
"learning_rate": 3.475e-06,
"loss": 0.519189453125,
"step": 6750
},
{
"epoch": 0.37158469945355194,
"grad_norm": 26.75,
"learning_rate": 3.3361111111111115e-06,
"loss": 0.5311366271972656,
"step": 6800
},
{
"epoch": 0.3743169398907104,
"grad_norm": 29.125,
"learning_rate": 3.1972222222222225e-06,
"loss": 0.5349716186523438,
"step": 6850
},
{
"epoch": 0.3770491803278688,
"grad_norm": 24.375,
"learning_rate": 3.058333333333334e-06,
"loss": 0.531919174194336,
"step": 6900
},
{
"epoch": 0.3797814207650273,
"grad_norm": 28.25,
"learning_rate": 2.9194444444444448e-06,
"loss": 0.5321033477783204,
"step": 6950
},
{
"epoch": 0.3825136612021858,
"grad_norm": 26.125,
"learning_rate": 2.7805555555555557e-06,
"loss": 0.5234020233154297,
"step": 7000
},
{
"epoch": 0.38524590163934425,
"grad_norm": 26.25,
"learning_rate": 2.6416666666666666e-06,
"loss": 0.5243960571289062,
"step": 7050
},
{
"epoch": 0.3879781420765027,
"grad_norm": 24.25,
"learning_rate": 2.502777777777778e-06,
"loss": 0.5315042877197266,
"step": 7100
},
{
"epoch": 0.3907103825136612,
"grad_norm": 27.625,
"learning_rate": 2.3638888888888894e-06,
"loss": 0.5547313690185547,
"step": 7150
},
{
"epoch": 0.39344262295081966,
"grad_norm": 31.375,
"learning_rate": 2.2250000000000003e-06,
"loss": 0.5182462310791016,
"step": 7200
},
{
"epoch": 0.39617486338797814,
"grad_norm": 30.0,
"learning_rate": 2.0861111111111112e-06,
"loss": 0.530079116821289,
"step": 7250
},
{
"epoch": 0.3989071038251366,
"grad_norm": 28.125,
"learning_rate": 1.947222222222222e-06,
"loss": 0.5425289916992188,
"step": 7300
},
{
"epoch": 0.4016393442622951,
"grad_norm": 28.375,
"learning_rate": 1.8083333333333335e-06,
"loss": 0.5327968597412109,
"step": 7350
},
{
"epoch": 0.40437158469945356,
"grad_norm": 26.5,
"learning_rate": 1.6694444444444447e-06,
"loss": 0.5102980804443359,
"step": 7400
},
{
"epoch": 0.40710382513661203,
"grad_norm": 23.375,
"learning_rate": 1.5305555555555556e-06,
"loss": 0.528506965637207,
"step": 7450
},
{
"epoch": 0.4098360655737705,
"grad_norm": 29.125,
"learning_rate": 1.3916666666666668e-06,
"loss": 0.52130615234375,
"step": 7500
},
{
"epoch": 0.412568306010929,
"grad_norm": 20.875,
"learning_rate": 1.2527777777777777e-06,
"loss": 0.5167626953125,
"step": 7550
},
{
"epoch": 0.41530054644808745,
"grad_norm": 28.5,
"learning_rate": 1.1138888888888888e-06,
"loss": 0.5269654464721679,
"step": 7600
},
{
"epoch": 0.4180327868852459,
"grad_norm": 28.0,
"learning_rate": 9.750000000000002e-07,
"loss": 0.5395594024658203,
"step": 7650
},
{
"epoch": 0.4207650273224044,
"grad_norm": 28.375,
"learning_rate": 8.361111111111111e-07,
"loss": 0.5267076110839843,
"step": 7700
},
{
"epoch": 0.42349726775956287,
"grad_norm": 29.875,
"learning_rate": 6.972222222222223e-07,
"loss": 0.5366734695434571,
"step": 7750
},
{
"epoch": 0.4262295081967213,
"grad_norm": 30.875,
"learning_rate": 5.583333333333333e-07,
"loss": 0.531547737121582,
"step": 7800
},
{
"epoch": 0.42896174863387976,
"grad_norm": 26.625,
"learning_rate": 4.1944444444444446e-07,
"loss": 0.5210472869873047,
"step": 7850
},
{
"epoch": 0.43169398907103823,
"grad_norm": 31.25,
"learning_rate": 2.8055555555555556e-07,
"loss": 0.5263444900512695,
"step": 7900
},
{
"epoch": 0.4344262295081967,
"grad_norm": 23.375,
"learning_rate": 1.4166666666666668e-07,
"loss": 0.5500655364990235,
"step": 7950
},
{
"epoch": 0.4371584699453552,
"grad_norm": 29.0,
"learning_rate": 2.777777777777778e-09,
"loss": 0.540992546081543,
"step": 8000
},
{
"epoch": 0.4371584699453552,
"eval_loss": 0.11738622933626175,
"eval_runtime": 1990.2366,
"eval_samples_per_second": 61.945,
"eval_steps_per_second": 3.872,
"step": 8000
}
],
"logging_steps": 50,
"max_steps": 8000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 400,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}