{ "best_global_step": 8000, "best_metric": 0.11738622933626175, "best_model_checkpoint": "./output/run_20260415_164722_truncate_hard/checkpoint-8000", "epoch": 0.4371584699453552, "eval_steps": 1600, "global_step": 8000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_loss": 1.416967749595642, "eval_runtime": 2260.5945, "eval_samples_per_second": 54.537, "eval_steps_per_second": 3.409, "step": 0 }, { "epoch": 0.00273224043715847, "grad_norm": 38.5, "learning_rate": 1.2250000000000001e-06, "loss": 2.8207272338867186, "step": 50 }, { "epoch": 0.00546448087431694, "grad_norm": 32.25, "learning_rate": 2.475e-06, "loss": 2.6559625244140626, "step": 100 }, { "epoch": 0.00819672131147541, "grad_norm": 18.875, "learning_rate": 3.7250000000000003e-06, "loss": 2.1567892456054687, "step": 150 }, { "epoch": 0.01092896174863388, "grad_norm": 18.125, "learning_rate": 4.975000000000001e-06, "loss": 1.8427839660644532, "step": 200 }, { "epoch": 0.01366120218579235, "grad_norm": 18.625, "learning_rate": 6.225000000000001e-06, "loss": 1.6992501831054687, "step": 250 }, { "epoch": 0.01639344262295082, "grad_norm": 20.5, "learning_rate": 7.475000000000001e-06, "loss": 1.576735382080078, "step": 300 }, { "epoch": 0.01912568306010929, "grad_norm": 19.625, "learning_rate": 8.725000000000002e-06, "loss": 1.5175244140625, "step": 350 }, { "epoch": 0.02185792349726776, "grad_norm": 22.375, "learning_rate": 9.975000000000002e-06, "loss": 1.461768798828125, "step": 400 }, { "epoch": 0.02459016393442623, "grad_norm": 26.375, "learning_rate": 1.1225000000000002e-05, "loss": 1.3915757751464843, "step": 450 }, { "epoch": 0.0273224043715847, "grad_norm": 20.5, "learning_rate": 1.2475000000000002e-05, "loss": 1.3361874389648438, "step": 500 }, { "epoch": 0.030054644808743168, "grad_norm": 20.0, "learning_rate": 1.3725000000000002e-05, "loss": 1.2622003936767578, "step": 550 }, { "epoch": 0.03278688524590164, "grad_norm": 22.125, "learning_rate": 1.4975000000000001e-05, "loss": 1.2655531311035155, "step": 600 }, { "epoch": 0.03551912568306011, "grad_norm": 20.375, "learning_rate": 1.6225e-05, "loss": 1.182469024658203, "step": 650 }, { "epoch": 0.03825136612021858, "grad_norm": 20.875, "learning_rate": 1.7475e-05, "loss": 1.1610452270507812, "step": 700 }, { "epoch": 0.040983606557377046, "grad_norm": 28.625, "learning_rate": 1.8725e-05, "loss": 1.1653611755371094, "step": 750 }, { "epoch": 0.04371584699453552, "grad_norm": 20.375, "learning_rate": 1.9975e-05, "loss": 1.1153118896484375, "step": 800 }, { "epoch": 0.04644808743169399, "grad_norm": 19.25, "learning_rate": 1.9863888888888892e-05, "loss": 1.09112060546875, "step": 850 }, { "epoch": 0.04918032786885246, "grad_norm": 18.375, "learning_rate": 1.9725000000000002e-05, "loss": 1.0499742126464844, "step": 900 }, { "epoch": 0.05191256830601093, "grad_norm": 18.75, "learning_rate": 1.958611111111111e-05, "loss": 1.0255325317382813, "step": 950 }, { "epoch": 0.0546448087431694, "grad_norm": 17.625, "learning_rate": 1.9447222222222224e-05, "loss": 0.9900017547607421, "step": 1000 }, { "epoch": 0.05737704918032787, "grad_norm": 17.375, "learning_rate": 1.9308333333333336e-05, "loss": 0.9526313018798828, "step": 1050 }, { "epoch": 0.060109289617486336, "grad_norm": 23.875, "learning_rate": 1.9169444444444445e-05, "loss": 0.9093154907226563, "step": 1100 }, { "epoch": 0.06284153005464481, "grad_norm": 18.375, "learning_rate": 1.9030555555555558e-05, "loss": 0.90221923828125, "step": 1150 }, { "epoch": 0.06557377049180328, "grad_norm": 18.875, "learning_rate": 1.8891666666666667e-05, "loss": 0.8817276000976563, "step": 1200 }, { "epoch": 0.06830601092896176, "grad_norm": 18.125, "learning_rate": 1.875277777777778e-05, "loss": 0.8606462097167968, "step": 1250 }, { "epoch": 0.07103825136612021, "grad_norm": 25.125, "learning_rate": 1.8613888888888893e-05, "loss": 0.8436074829101563, "step": 1300 }, { "epoch": 0.07377049180327869, "grad_norm": 14.75, "learning_rate": 1.8475000000000002e-05, "loss": 0.8204904174804688, "step": 1350 }, { "epoch": 0.07650273224043716, "grad_norm": 18.75, "learning_rate": 1.833611111111111e-05, "loss": 0.8097339630126953, "step": 1400 }, { "epoch": 0.07923497267759563, "grad_norm": 18.0, "learning_rate": 1.8197222222222224e-05, "loss": 0.8396243286132813, "step": 1450 }, { "epoch": 0.08196721311475409, "grad_norm": 20.75, "learning_rate": 1.8058333333333336e-05, "loss": 0.8059647369384766, "step": 1500 }, { "epoch": 0.08469945355191257, "grad_norm": 20.25, "learning_rate": 1.7919444444444446e-05, "loss": 0.7802545166015625, "step": 1550 }, { "epoch": 0.08743169398907104, "grad_norm": 17.625, "learning_rate": 1.7780555555555555e-05, "loss": 0.8032011413574218, "step": 1600 }, { "epoch": 0.08743169398907104, "eval_loss": 0.20313987135887146, "eval_runtime": 1993.2601, "eval_samples_per_second": 61.851, "eval_steps_per_second": 3.866, "step": 1600 }, { "epoch": 0.09016393442622951, "grad_norm": 22.625, "learning_rate": 1.7641666666666667e-05, "loss": 0.7802546691894531, "step": 1650 }, { "epoch": 0.09289617486338798, "grad_norm": 19.125, "learning_rate": 1.750277777777778e-05, "loss": 0.7605763244628906, "step": 1700 }, { "epoch": 0.09562841530054644, "grad_norm": 17.125, "learning_rate": 1.7363888888888893e-05, "loss": 0.7257496643066407, "step": 1750 }, { "epoch": 0.09836065573770492, "grad_norm": 19.875, "learning_rate": 1.7225000000000002e-05, "loss": 0.7296395874023438, "step": 1800 }, { "epoch": 0.10109289617486339, "grad_norm": 21.5, "learning_rate": 1.708611111111111e-05, "loss": 0.7457513427734375, "step": 1850 }, { "epoch": 0.10382513661202186, "grad_norm": 21.25, "learning_rate": 1.6947222222222224e-05, "loss": 0.7263921356201172, "step": 1900 }, { "epoch": 0.10655737704918032, "grad_norm": 23.625, "learning_rate": 1.6808333333333336e-05, "loss": 0.7204135131835937, "step": 1950 }, { "epoch": 0.1092896174863388, "grad_norm": 21.0, "learning_rate": 1.6669444444444446e-05, "loss": 0.7349383544921875, "step": 2000 }, { "epoch": 0.11202185792349727, "grad_norm": 19.875, "learning_rate": 1.6530555555555555e-05, "loss": 0.7568646240234375, "step": 2050 }, { "epoch": 0.11475409836065574, "grad_norm": 23.125, "learning_rate": 1.6391666666666668e-05, "loss": 0.7186477661132813, "step": 2100 }, { "epoch": 0.11748633879781421, "grad_norm": 16.75, "learning_rate": 1.625277777777778e-05, "loss": 0.6933038330078125, "step": 2150 }, { "epoch": 0.12021857923497267, "grad_norm": 25.125, "learning_rate": 1.6113888888888893e-05, "loss": 0.7022312927246094, "step": 2200 }, { "epoch": 0.12295081967213115, "grad_norm": 19.875, "learning_rate": 1.5975000000000002e-05, "loss": 0.6835511016845703, "step": 2250 }, { "epoch": 0.12568306010928962, "grad_norm": 20.125, "learning_rate": 1.583611111111111e-05, "loss": 0.7262260437011718, "step": 2300 }, { "epoch": 0.1284153005464481, "grad_norm": 18.5, "learning_rate": 1.5697222222222224e-05, "loss": 0.6783106994628906, "step": 2350 }, { "epoch": 0.13114754098360656, "grad_norm": 22.0, "learning_rate": 1.5558333333333337e-05, "loss": 0.6687103271484375, "step": 2400 }, { "epoch": 0.13387978142076504, "grad_norm": 21.5, "learning_rate": 1.5419444444444446e-05, "loss": 0.6666416931152344, "step": 2450 }, { "epoch": 0.1366120218579235, "grad_norm": 18.0, "learning_rate": 1.5280555555555555e-05, "loss": 0.6612174987792969, "step": 2500 }, { "epoch": 0.13934426229508196, "grad_norm": 23.125, "learning_rate": 1.5141666666666668e-05, "loss": 0.6176729202270508, "step": 2550 }, { "epoch": 0.14207650273224043, "grad_norm": 20.5, "learning_rate": 1.5002777777777779e-05, "loss": 0.6544680786132813, "step": 2600 }, { "epoch": 0.1448087431693989, "grad_norm": 16.75, "learning_rate": 1.4863888888888891e-05, "loss": 0.6495597839355469, "step": 2650 }, { "epoch": 0.14754098360655737, "grad_norm": 20.625, "learning_rate": 1.4725e-05, "loss": 0.6269410705566406, "step": 2700 }, { "epoch": 0.15027322404371585, "grad_norm": 19.25, "learning_rate": 1.4586111111111111e-05, "loss": 0.6502537536621094, "step": 2750 }, { "epoch": 0.15300546448087432, "grad_norm": 25.75, "learning_rate": 1.4447222222222224e-05, "loss": 0.6356121826171875, "step": 2800 }, { "epoch": 0.1557377049180328, "grad_norm": 21.125, "learning_rate": 1.4308333333333335e-05, "loss": 0.6310220336914063, "step": 2850 }, { "epoch": 0.15846994535519127, "grad_norm": 18.75, "learning_rate": 1.4169444444444444e-05, "loss": 0.6256195449829102, "step": 2900 }, { "epoch": 0.16120218579234974, "grad_norm": 19.5, "learning_rate": 1.4030555555555557e-05, "loss": 0.6188963317871093, "step": 2950 }, { "epoch": 0.16393442622950818, "grad_norm": 20.125, "learning_rate": 1.3891666666666668e-05, "loss": 0.6223039245605468, "step": 3000 }, { "epoch": 0.16666666666666666, "grad_norm": 25.25, "learning_rate": 1.3752777777777779e-05, "loss": 0.6119123840332031, "step": 3050 }, { "epoch": 0.16939890710382513, "grad_norm": 18.375, "learning_rate": 1.3613888888888891e-05, "loss": 0.6270243835449218, "step": 3100 }, { "epoch": 0.1721311475409836, "grad_norm": 21.125, "learning_rate": 1.3475e-05, "loss": 0.6205867004394531, "step": 3150 }, { "epoch": 0.17486338797814208, "grad_norm": 19.375, "learning_rate": 1.3336111111111112e-05, "loss": 0.6450627136230469, "step": 3200 }, { "epoch": 0.17486338797814208, "eval_loss": 0.140077143907547, "eval_runtime": 1996.0971, "eval_samples_per_second": 61.763, "eval_steps_per_second": 3.861, "step": 3200 }, { "epoch": 0.17759562841530055, "grad_norm": 26.625, "learning_rate": 1.3197222222222224e-05, "loss": 0.6031318664550781, "step": 3250 }, { "epoch": 0.18032786885245902, "grad_norm": 23.5, "learning_rate": 1.3058333333333335e-05, "loss": 0.5786302947998047, "step": 3300 }, { "epoch": 0.1830601092896175, "grad_norm": 20.625, "learning_rate": 1.2919444444444444e-05, "loss": 0.6212103271484375, "step": 3350 }, { "epoch": 0.18579234972677597, "grad_norm": 23.375, "learning_rate": 1.2780555555555555e-05, "loss": 0.6024067687988282, "step": 3400 }, { "epoch": 0.1885245901639344, "grad_norm": 18.625, "learning_rate": 1.2641666666666668e-05, "loss": 0.581977767944336, "step": 3450 }, { "epoch": 0.1912568306010929, "grad_norm": 21.125, "learning_rate": 1.2502777777777779e-05, "loss": 0.5875739288330079, "step": 3500 }, { "epoch": 0.19398907103825136, "grad_norm": 23.0, "learning_rate": 1.2363888888888891e-05, "loss": 0.6084017944335938, "step": 3550 }, { "epoch": 0.19672131147540983, "grad_norm": 23.25, "learning_rate": 1.2225e-05, "loss": 0.599505615234375, "step": 3600 }, { "epoch": 0.1994535519125683, "grad_norm": 24.625, "learning_rate": 1.2086111111111112e-05, "loss": 0.6103274917602539, "step": 3650 }, { "epoch": 0.20218579234972678, "grad_norm": 21.75, "learning_rate": 1.1947222222222223e-05, "loss": 0.5922727966308594, "step": 3700 }, { "epoch": 0.20491803278688525, "grad_norm": 19.75, "learning_rate": 1.1808333333333335e-05, "loss": 0.6008519744873047, "step": 3750 }, { "epoch": 0.20765027322404372, "grad_norm": 21.0, "learning_rate": 1.1669444444444444e-05, "loss": 0.5870630645751953, "step": 3800 }, { "epoch": 0.2103825136612022, "grad_norm": 19.375, "learning_rate": 1.1530555555555555e-05, "loss": 0.5807292938232422, "step": 3850 }, { "epoch": 0.21311475409836064, "grad_norm": 23.75, "learning_rate": 1.1391666666666668e-05, "loss": 0.5860625839233399, "step": 3900 }, { "epoch": 0.21584699453551912, "grad_norm": 21.75, "learning_rate": 1.1252777777777779e-05, "loss": 0.5723530578613282, "step": 3950 }, { "epoch": 0.2185792349726776, "grad_norm": 22.25, "learning_rate": 1.1113888888888892e-05, "loss": 0.5986472320556641, "step": 4000 }, { "epoch": 0.22131147540983606, "grad_norm": 20.875, "learning_rate": 1.0975e-05, "loss": 0.5930126190185547, "step": 4050 }, { "epoch": 0.22404371584699453, "grad_norm": 23.25, "learning_rate": 1.0836111111111112e-05, "loss": 0.5758544540405274, "step": 4100 }, { "epoch": 0.226775956284153, "grad_norm": 22.125, "learning_rate": 1.0697222222222223e-05, "loss": 0.5803060913085938, "step": 4150 }, { "epoch": 0.22950819672131148, "grad_norm": 22.5, "learning_rate": 1.0558333333333335e-05, "loss": 0.5753678894042968, "step": 4200 }, { "epoch": 0.23224043715846995, "grad_norm": 21.625, "learning_rate": 1.0419444444444445e-05, "loss": 0.5640956497192383, "step": 4250 }, { "epoch": 0.23497267759562843, "grad_norm": 21.375, "learning_rate": 1.0280555555555555e-05, "loss": 0.5678297424316406, "step": 4300 }, { "epoch": 0.23770491803278687, "grad_norm": 23.125, "learning_rate": 1.0141666666666668e-05, "loss": 0.5765494537353516, "step": 4350 }, { "epoch": 0.24043715846994534, "grad_norm": 24.875, "learning_rate": 1.0002777777777779e-05, "loss": 0.5585842895507812, "step": 4400 }, { "epoch": 0.24316939890710382, "grad_norm": 27.375, "learning_rate": 9.86388888888889e-06, "loss": 0.5505801391601562, "step": 4450 }, { "epoch": 0.2459016393442623, "grad_norm": 28.875, "learning_rate": 9.725000000000001e-06, "loss": 0.5651544952392578, "step": 4500 }, { "epoch": 0.24863387978142076, "grad_norm": 26.5, "learning_rate": 9.586111111111112e-06, "loss": 0.5462136077880859, "step": 4550 }, { "epoch": 0.25136612021857924, "grad_norm": 25.0, "learning_rate": 9.447222222222223e-06, "loss": 0.55468994140625, "step": 4600 }, { "epoch": 0.2540983606557377, "grad_norm": 24.0, "learning_rate": 9.308333333333334e-06, "loss": 0.5691349792480469, "step": 4650 }, { "epoch": 0.2568306010928962, "grad_norm": 25.75, "learning_rate": 9.169444444444445e-06, "loss": 0.5687982940673828, "step": 4700 }, { "epoch": 0.25956284153005466, "grad_norm": 26.125, "learning_rate": 9.030555555555556e-06, "loss": 0.5460214233398437, "step": 4750 }, { "epoch": 0.26229508196721313, "grad_norm": 26.25, "learning_rate": 8.891666666666667e-06, "loss": 0.5721450805664062, "step": 4800 }, { "epoch": 0.26229508196721313, "eval_loss": 0.12485909461975098, "eval_runtime": 1994.0276, "eval_samples_per_second": 61.827, "eval_steps_per_second": 3.865, "step": 4800 }, { "epoch": 0.2650273224043716, "grad_norm": 27.375, "learning_rate": 8.752777777777779e-06, "loss": 0.5646157073974609, "step": 4850 }, { "epoch": 0.2677595628415301, "grad_norm": 19.375, "learning_rate": 8.61388888888889e-06, "loss": 0.5710072326660156, "step": 4900 }, { "epoch": 0.27049180327868855, "grad_norm": 20.5, "learning_rate": 8.475000000000001e-06, "loss": 0.5315534210205078, "step": 4950 }, { "epoch": 0.273224043715847, "grad_norm": 29.25, "learning_rate": 8.336111111111112e-06, "loss": 0.5359284591674804, "step": 5000 }, { "epoch": 0.27595628415300544, "grad_norm": 20.625, "learning_rate": 8.197222222222223e-06, "loss": 0.5669486236572265, "step": 5050 }, { "epoch": 0.2786885245901639, "grad_norm": 34.5, "learning_rate": 8.058333333333334e-06, "loss": 0.5689411544799805, "step": 5100 }, { "epoch": 0.2814207650273224, "grad_norm": 20.0, "learning_rate": 7.919444444444445e-06, "loss": 0.5391948699951172, "step": 5150 }, { "epoch": 0.28415300546448086, "grad_norm": 28.875, "learning_rate": 7.780555555555556e-06, "loss": 0.5572643280029297, "step": 5200 }, { "epoch": 0.28688524590163933, "grad_norm": 27.125, "learning_rate": 7.641666666666667e-06, "loss": 0.538712158203125, "step": 5250 }, { "epoch": 0.2896174863387978, "grad_norm": 23.875, "learning_rate": 7.502777777777778e-06, "loss": 0.5496884536743164, "step": 5300 }, { "epoch": 0.2923497267759563, "grad_norm": 28.125, "learning_rate": 7.363888888888889e-06, "loss": 0.5370260620117188, "step": 5350 }, { "epoch": 0.29508196721311475, "grad_norm": 27.5, "learning_rate": 7.225000000000001e-06, "loss": 0.5524336242675781, "step": 5400 }, { "epoch": 0.2978142076502732, "grad_norm": 29.625, "learning_rate": 7.086111111111111e-06, "loss": 0.546603012084961, "step": 5450 }, { "epoch": 0.3005464480874317, "grad_norm": 25.0, "learning_rate": 6.947222222222223e-06, "loss": 0.5310775756835937, "step": 5500 }, { "epoch": 0.30327868852459017, "grad_norm": 26.125, "learning_rate": 6.808333333333333e-06, "loss": 0.5651336669921875, "step": 5550 }, { "epoch": 0.30601092896174864, "grad_norm": 27.375, "learning_rate": 6.669444444444445e-06, "loss": 0.5147453689575195, "step": 5600 }, { "epoch": 0.3087431693989071, "grad_norm": 21.5, "learning_rate": 6.530555555555556e-06, "loss": 0.539222183227539, "step": 5650 }, { "epoch": 0.3114754098360656, "grad_norm": 30.25, "learning_rate": 6.391666666666667e-06, "loss": 0.5556621170043945, "step": 5700 }, { "epoch": 0.31420765027322406, "grad_norm": 23.75, "learning_rate": 6.2527777777777785e-06, "loss": 0.5418627166748047, "step": 5750 }, { "epoch": 0.31693989071038253, "grad_norm": 26.5, "learning_rate": 6.1138888888888895e-06, "loss": 0.5251173782348633, "step": 5800 }, { "epoch": 0.319672131147541, "grad_norm": 26.375, "learning_rate": 5.975e-06, "loss": 0.5350538635253906, "step": 5850 }, { "epoch": 0.3224043715846995, "grad_norm": 18.5, "learning_rate": 5.836111111111111e-06, "loss": 0.5373062896728515, "step": 5900 }, { "epoch": 0.3251366120218579, "grad_norm": 20.375, "learning_rate": 5.697222222222223e-06, "loss": 0.5342027282714844, "step": 5950 }, { "epoch": 0.32786885245901637, "grad_norm": 25.0, "learning_rate": 5.558333333333333e-06, "loss": 0.5342716979980469, "step": 6000 }, { "epoch": 0.33060109289617484, "grad_norm": 22.5, "learning_rate": 5.419444444444445e-06, "loss": 0.5382299423217773, "step": 6050 }, { "epoch": 0.3333333333333333, "grad_norm": 28.375, "learning_rate": 5.280555555555555e-06, "loss": 0.5459404373168946, "step": 6100 }, { "epoch": 0.3360655737704918, "grad_norm": 25.625, "learning_rate": 5.141666666666667e-06, "loss": 0.5430771636962891, "step": 6150 }, { "epoch": 0.33879781420765026, "grad_norm": 21.875, "learning_rate": 5.002777777777779e-06, "loss": 0.5261387252807617, "step": 6200 }, { "epoch": 0.34153005464480873, "grad_norm": 27.625, "learning_rate": 4.863888888888889e-06, "loss": 0.5376666259765625, "step": 6250 }, { "epoch": 0.3442622950819672, "grad_norm": 27.75, "learning_rate": 4.7250000000000005e-06, "loss": 0.5197310638427735, "step": 6300 }, { "epoch": 0.3469945355191257, "grad_norm": 23.0, "learning_rate": 4.5861111111111114e-06, "loss": 0.5165463256835937, "step": 6350 }, { "epoch": 0.34972677595628415, "grad_norm": 26.75, "learning_rate": 4.447222222222222e-06, "loss": 0.5260678100585937, "step": 6400 }, { "epoch": 0.34972677595628415, "eval_loss": 0.11765411496162415, "eval_runtime": 1990.9759, "eval_samples_per_second": 61.922, "eval_steps_per_second": 3.87, "step": 6400 }, { "epoch": 0.3524590163934426, "grad_norm": 26.0, "learning_rate": 4.308333333333334e-06, "loss": 0.5461505126953125, "step": 6450 }, { "epoch": 0.3551912568306011, "grad_norm": 23.75, "learning_rate": 4.169444444444445e-06, "loss": 0.5273017120361329, "step": 6500 }, { "epoch": 0.35792349726775957, "grad_norm": 28.75, "learning_rate": 4.030555555555556e-06, "loss": 0.5286589050292969, "step": 6550 }, { "epoch": 0.36065573770491804, "grad_norm": 27.75, "learning_rate": 3.891666666666667e-06, "loss": 0.5252587127685547, "step": 6600 }, { "epoch": 0.3633879781420765, "grad_norm": 26.25, "learning_rate": 3.752777777777778e-06, "loss": 0.5342652893066406, "step": 6650 }, { "epoch": 0.366120218579235, "grad_norm": 30.125, "learning_rate": 3.613888888888889e-06, "loss": 0.536466178894043, "step": 6700 }, { "epoch": 0.36885245901639346, "grad_norm": 27.75, "learning_rate": 3.475e-06, "loss": 0.519189453125, "step": 6750 }, { "epoch": 0.37158469945355194, "grad_norm": 26.75, "learning_rate": 3.3361111111111115e-06, "loss": 0.5311366271972656, "step": 6800 }, { "epoch": 0.3743169398907104, "grad_norm": 29.125, "learning_rate": 3.1972222222222225e-06, "loss": 0.5349716186523438, "step": 6850 }, { "epoch": 0.3770491803278688, "grad_norm": 24.375, "learning_rate": 3.058333333333334e-06, "loss": 0.531919174194336, "step": 6900 }, { "epoch": 0.3797814207650273, "grad_norm": 28.25, "learning_rate": 2.9194444444444448e-06, "loss": 0.5321033477783204, "step": 6950 }, { "epoch": 0.3825136612021858, "grad_norm": 26.125, "learning_rate": 2.7805555555555557e-06, "loss": 0.5234020233154297, "step": 7000 }, { "epoch": 0.38524590163934425, "grad_norm": 26.25, "learning_rate": 2.6416666666666666e-06, "loss": 0.5243960571289062, "step": 7050 }, { "epoch": 0.3879781420765027, "grad_norm": 24.25, "learning_rate": 2.502777777777778e-06, "loss": 0.5315042877197266, "step": 7100 }, { "epoch": 0.3907103825136612, "grad_norm": 27.625, "learning_rate": 2.3638888888888894e-06, "loss": 0.5547313690185547, "step": 7150 }, { "epoch": 0.39344262295081966, "grad_norm": 31.375, "learning_rate": 2.2250000000000003e-06, "loss": 0.5182462310791016, "step": 7200 }, { "epoch": 0.39617486338797814, "grad_norm": 30.0, "learning_rate": 2.0861111111111112e-06, "loss": 0.530079116821289, "step": 7250 }, { "epoch": 0.3989071038251366, "grad_norm": 28.125, "learning_rate": 1.947222222222222e-06, "loss": 0.5425289916992188, "step": 7300 }, { "epoch": 0.4016393442622951, "grad_norm": 28.375, "learning_rate": 1.8083333333333335e-06, "loss": 0.5327968597412109, "step": 7350 }, { "epoch": 0.40437158469945356, "grad_norm": 26.5, "learning_rate": 1.6694444444444447e-06, "loss": 0.5102980804443359, "step": 7400 }, { "epoch": 0.40710382513661203, "grad_norm": 23.375, "learning_rate": 1.5305555555555556e-06, "loss": 0.528506965637207, "step": 7450 }, { "epoch": 0.4098360655737705, "grad_norm": 29.125, "learning_rate": 1.3916666666666668e-06, "loss": 0.52130615234375, "step": 7500 }, { "epoch": 0.412568306010929, "grad_norm": 20.875, "learning_rate": 1.2527777777777777e-06, "loss": 0.5167626953125, "step": 7550 }, { "epoch": 0.41530054644808745, "grad_norm": 28.5, "learning_rate": 1.1138888888888888e-06, "loss": 0.5269654464721679, "step": 7600 }, { "epoch": 0.4180327868852459, "grad_norm": 28.0, "learning_rate": 9.750000000000002e-07, "loss": 0.5395594024658203, "step": 7650 }, { "epoch": 0.4207650273224044, "grad_norm": 28.375, "learning_rate": 8.361111111111111e-07, "loss": 0.5267076110839843, "step": 7700 }, { "epoch": 0.42349726775956287, "grad_norm": 29.875, "learning_rate": 6.972222222222223e-07, "loss": 0.5366734695434571, "step": 7750 }, { "epoch": 0.4262295081967213, "grad_norm": 30.875, "learning_rate": 5.583333333333333e-07, "loss": 0.531547737121582, "step": 7800 }, { "epoch": 0.42896174863387976, "grad_norm": 26.625, "learning_rate": 4.1944444444444446e-07, "loss": 0.5210472869873047, "step": 7850 }, { "epoch": 0.43169398907103823, "grad_norm": 31.25, "learning_rate": 2.8055555555555556e-07, "loss": 0.5263444900512695, "step": 7900 }, { "epoch": 0.4344262295081967, "grad_norm": 23.375, "learning_rate": 1.4166666666666668e-07, "loss": 0.5500655364990235, "step": 7950 }, { "epoch": 0.4371584699453552, "grad_norm": 29.0, "learning_rate": 2.777777777777778e-09, "loss": 0.540992546081543, "step": 8000 }, { "epoch": 0.4371584699453552, "eval_loss": 0.11738622933626175, "eval_runtime": 1990.2366, "eval_samples_per_second": 61.945, "eval_steps_per_second": 3.872, "step": 8000 } ], "logging_steps": 50, "max_steps": 8000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 400, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }