{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.967105263157895, "eval_steps": 500, "global_step": 755, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06578947368421052, "grad_norm": 12741.283203125, "learning_rate": 2.368421052631579e-06, "loss": 1343.5036, "step": 10 }, { "epoch": 0.13157894736842105, "grad_norm": 3765.042724609375, "learning_rate": 5e-06, "loss": 1025.8134, "step": 20 }, { "epoch": 0.19736842105263158, "grad_norm": 3857.6552734375, "learning_rate": 7.631578947368423e-06, "loss": 931.7692, "step": 30 }, { "epoch": 0.2631578947368421, "grad_norm": 3910.090087890625, "learning_rate": 9.986052998605302e-06, "loss": 828.1059, "step": 40 }, { "epoch": 0.32894736842105265, "grad_norm": 3312.494140625, "learning_rate": 9.846582984658299e-06, "loss": 701.0779, "step": 50 }, { "epoch": 0.39473684210526316, "grad_norm": 7742.6982421875, "learning_rate": 9.707112970711298e-06, "loss": 575.7785, "step": 60 }, { "epoch": 0.4605263157894737, "grad_norm": 4383.2861328125, "learning_rate": 9.567642956764297e-06, "loss": 683.3667, "step": 70 }, { "epoch": 0.5263157894736842, "grad_norm": 4326.119140625, "learning_rate": 9.428172942817295e-06, "loss": 689.7952, "step": 80 }, { "epoch": 0.5921052631578947, "grad_norm": 3050.6357421875, "learning_rate": 9.288702928870293e-06, "loss": 529.22, "step": 90 }, { "epoch": 0.6578947368421053, "grad_norm": 8133.0595703125, "learning_rate": 9.149232914923292e-06, "loss": 493.8726, "step": 100 }, { "epoch": 0.7236842105263158, "grad_norm": 4629.41162109375, "learning_rate": 9.00976290097629e-06, "loss": 480.3524, "step": 110 }, { "epoch": 0.7894736842105263, "grad_norm": 3343.38232421875, "learning_rate": 8.87029288702929e-06, "loss": 519.6425, "step": 120 }, { "epoch": 0.8552631578947368, "grad_norm": 3955.08447265625, "learning_rate": 8.730822873082288e-06, "loss": 580.8739, "step": 130 }, { "epoch": 0.9210526315789473, "grad_norm": 4120.26123046875, "learning_rate": 8.591352859135287e-06, "loss": 566.2863, "step": 140 }, { "epoch": 0.9868421052631579, "grad_norm": 2743.97119140625, "learning_rate": 8.451882845188284e-06, "loss": 498.7171, "step": 150 }, { "epoch": 1.0526315789473684, "grad_norm": 3119.446044921875, "learning_rate": 8.312412831241283e-06, "loss": 448.7976, "step": 160 }, { "epoch": 1.118421052631579, "grad_norm": 3619.334228515625, "learning_rate": 8.172942817294282e-06, "loss": 490.538, "step": 170 }, { "epoch": 1.1842105263157894, "grad_norm": 3563.162353515625, "learning_rate": 8.033472803347281e-06, "loss": 550.9566, "step": 180 }, { "epoch": 1.25, "grad_norm": 3006.31884765625, "learning_rate": 7.89400278940028e-06, "loss": 496.8316, "step": 190 }, { "epoch": 1.3157894736842106, "grad_norm": 3445.16650390625, "learning_rate": 7.754532775453279e-06, "loss": 468.8848, "step": 200 }, { "epoch": 1.381578947368421, "grad_norm": 2652.3056640625, "learning_rate": 7.615062761506277e-06, "loss": 386.9485, "step": 210 }, { "epoch": 1.4473684210526316, "grad_norm": 7152.8544921875, "learning_rate": 7.475592747559275e-06, "loss": 406.5552, "step": 220 }, { "epoch": 1.513157894736842, "grad_norm": 2007.9990234375, "learning_rate": 7.3361227336122745e-06, "loss": 453.7567, "step": 230 }, { "epoch": 1.5789473684210527, "grad_norm": 3840.892578125, "learning_rate": 7.1966527196652726e-06, "loss": 379.3861, "step": 240 }, { "epoch": 1.6447368421052633, "grad_norm": 3999.458740234375, "learning_rate": 7.057182705718271e-06, 
"loss": 477.9281, "step": 250 }, { "epoch": 1.7105263157894737, "grad_norm": 2132.456787109375, "learning_rate": 6.91771269177127e-06, "loss": 410.921, "step": 260 }, { "epoch": 1.776315789473684, "grad_norm": 3105.357421875, "learning_rate": 6.778242677824268e-06, "loss": 442.1194, "step": 270 }, { "epoch": 1.8421052631578947, "grad_norm": 2635.864501953125, "learning_rate": 6.6387726638772664e-06, "loss": 389.496, "step": 280 }, { "epoch": 1.9078947368421053, "grad_norm": 2923.394287109375, "learning_rate": 6.499302649930266e-06, "loss": 439.763, "step": 290 }, { "epoch": 1.973684210526316, "grad_norm": 3647.169921875, "learning_rate": 6.359832635983264e-06, "loss": 406.6518, "step": 300 }, { "epoch": 2.039473684210526, "grad_norm": 2935.630126953125, "learning_rate": 6.220362622036262e-06, "loss": 331.8123, "step": 310 }, { "epoch": 2.1052631578947367, "grad_norm": 2370.527099609375, "learning_rate": 6.080892608089262e-06, "loss": 343.2197, "step": 320 }, { "epoch": 2.1710526315789473, "grad_norm": 2738.644287109375, "learning_rate": 5.94142259414226e-06, "loss": 369.9035, "step": 330 }, { "epoch": 2.236842105263158, "grad_norm": 2592.207275390625, "learning_rate": 5.801952580195258e-06, "loss": 402.548, "step": 340 }, { "epoch": 2.3026315789473686, "grad_norm": 4248.5478515625, "learning_rate": 5.662482566248258e-06, "loss": 457.1924, "step": 350 }, { "epoch": 2.3684210526315788, "grad_norm": 2736.706298828125, "learning_rate": 5.523012552301256e-06, "loss": 381.4614, "step": 360 }, { "epoch": 2.4342105263157894, "grad_norm": 2056.372802734375, "learning_rate": 5.383542538354254e-06, "loss": 363.8397, "step": 370 }, { "epoch": 2.5, "grad_norm": 2987.6806640625, "learning_rate": 5.244072524407254e-06, "loss": 432.6885, "step": 380 }, { "epoch": 2.5657894736842106, "grad_norm": 4919.654296875, "learning_rate": 5.104602510460252e-06, "loss": 395.5194, "step": 390 }, { "epoch": 2.6315789473684212, "grad_norm": 2719.568115234375, "learning_rate": 4.9651324965132506e-06, "loss": 377.963, "step": 400 }, { "epoch": 2.6973684210526314, "grad_norm": 3378.98876953125, "learning_rate": 4.825662482566249e-06, "loss": 454.2221, "step": 410 }, { "epoch": 2.763157894736842, "grad_norm": 941.3067016601562, "learning_rate": 4.6861924686192475e-06, "loss": 334.1994, "step": 420 }, { "epoch": 2.8289473684210527, "grad_norm": 3209.615478515625, "learning_rate": 4.546722454672246e-06, "loss": 386.7458, "step": 430 }, { "epoch": 2.8947368421052633, "grad_norm": 2738.612060546875, "learning_rate": 4.407252440725244e-06, "loss": 313.3205, "step": 440 }, { "epoch": 2.9605263157894735, "grad_norm": 2505.005126953125, "learning_rate": 4.267782426778243e-06, "loss": 318.3042, "step": 450 }, { "epoch": 3.026315789473684, "grad_norm": 2280.536865234375, "learning_rate": 4.128312412831242e-06, "loss": 355.1427, "step": 460 }, { "epoch": 3.0921052631578947, "grad_norm": 3746.776123046875, "learning_rate": 3.98884239888424e-06, "loss": 321.6535, "step": 470 }, { "epoch": 3.1578947368421053, "grad_norm": 2580.090087890625, "learning_rate": 3.849372384937239e-06, "loss": 389.3218, "step": 480 }, { "epoch": 3.223684210526316, "grad_norm": 2343.642578125, "learning_rate": 3.7099023709902376e-06, "loss": 359.2708, "step": 490 }, { "epoch": 3.2894736842105265, "grad_norm": 2266.267822265625, "learning_rate": 3.570432357043236e-06, "loss": 435.1246, "step": 500 }, { "epoch": 3.3552631578947367, "grad_norm": 3043.51318359375, "learning_rate": 3.4309623430962345e-06, "loss": 321.2599, "step": 510 }, { "epoch": 
3.4210526315789473, "grad_norm": 2253.009765625, "learning_rate": 3.2914923291492334e-06, "loss": 310.0588, "step": 520 }, { "epoch": 3.486842105263158, "grad_norm": 4041.337890625, "learning_rate": 3.152022315202232e-06, "loss": 302.3955, "step": 530 }, { "epoch": 3.5526315789473686, "grad_norm": 3892.867431640625, "learning_rate": 3.0125523012552303e-06, "loss": 372.87, "step": 540 }, { "epoch": 3.6184210526315788, "grad_norm": 1721.0771484375, "learning_rate": 2.873082287308229e-06, "loss": 339.0192, "step": 550 }, { "epoch": 3.6842105263157894, "grad_norm": 2730.7734375, "learning_rate": 2.7336122733612273e-06, "loss": 335.406, "step": 560 }, { "epoch": 3.75, "grad_norm": 2706.309814453125, "learning_rate": 2.594142259414226e-06, "loss": 357.9787, "step": 570 }, { "epoch": 3.8157894736842106, "grad_norm": 5227.97705078125, "learning_rate": 2.4546722454672246e-06, "loss": 362.4296, "step": 580 }, { "epoch": 3.8815789473684212, "grad_norm": 2932.572998046875, "learning_rate": 2.315202231520223e-06, "loss": 329.5911, "step": 590 }, { "epoch": 3.9473684210526314, "grad_norm": 2948.697265625, "learning_rate": 2.175732217573222e-06, "loss": 366.91, "step": 600 }, { "epoch": 4.0131578947368425, "grad_norm": 2344.489013671875, "learning_rate": 2.0362622036262205e-06, "loss": 314.6336, "step": 610 }, { "epoch": 4.078947368421052, "grad_norm": 3181.4521484375, "learning_rate": 1.8967921896792191e-06, "loss": 309.1377, "step": 620 }, { "epoch": 4.144736842105263, "grad_norm": 2467.346435546875, "learning_rate": 1.7573221757322176e-06, "loss": 349.3991, "step": 630 }, { "epoch": 4.2105263157894735, "grad_norm": 4259.5654296875, "learning_rate": 1.6178521617852163e-06, "loss": 318.5878, "step": 640 }, { "epoch": 4.276315789473684, "grad_norm": 2608.833984375, "learning_rate": 1.478382147838215e-06, "loss": 329.6976, "step": 650 }, { "epoch": 4.342105263157895, "grad_norm": 4429.38671875, "learning_rate": 1.3389121338912134e-06, "loss": 308.7365, "step": 660 }, { "epoch": 4.407894736842105, "grad_norm": 3620.8515625, "learning_rate": 1.199442119944212e-06, "loss": 353.2343, "step": 670 }, { "epoch": 4.473684210526316, "grad_norm": 3577.33203125, "learning_rate": 1.0599721059972108e-06, "loss": 304.5744, "step": 680 }, { "epoch": 4.5394736842105265, "grad_norm": 2387.5029296875, "learning_rate": 9.205020920502093e-07, "loss": 388.6656, "step": 690 }, { "epoch": 4.605263157894737, "grad_norm": 2653.211669921875, "learning_rate": 7.810320781032078e-07, "loss": 360.9954, "step": 700 }, { "epoch": 4.671052631578947, "grad_norm": 3058.907470703125, "learning_rate": 6.415620641562065e-07, "loss": 417.5069, "step": 710 }, { "epoch": 4.7368421052631575, "grad_norm": 3122.1025390625, "learning_rate": 5.020920502092051e-07, "loss": 286.6377, "step": 720 }, { "epoch": 4.802631578947368, "grad_norm": 2571.658935546875, "learning_rate": 3.6262203626220363e-07, "loss": 289.6024, "step": 730 }, { "epoch": 4.868421052631579, "grad_norm": 4391.81591796875, "learning_rate": 2.2315202231520225e-07, "loss": 301.8846, "step": 740 }, { "epoch": 4.934210526315789, "grad_norm": 2866.98486328125, "learning_rate": 8.368200836820084e-08, "loss": 348.0844, "step": 750 } ], "logging_steps": 10, "max_steps": 755, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, 
"trial_name": null, "trial_params": null }