{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 975, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03076923076923077, "grad_norm": 2.3324186560475026, "learning_rate": 5e-06, "loss": 0.9796, "step": 10 }, { "epoch": 0.06153846153846154, "grad_norm": 1.0624345768919523, "learning_rate": 5e-06, "loss": 0.8629, "step": 20 }, { "epoch": 0.09230769230769231, "grad_norm": 1.0856708355070386, "learning_rate": 5e-06, "loss": 0.8232, "step": 30 }, { "epoch": 0.12307692307692308, "grad_norm": 1.3950701294784864, "learning_rate": 5e-06, "loss": 0.8034, "step": 40 }, { "epoch": 0.15384615384615385, "grad_norm": 0.9965613643303693, "learning_rate": 5e-06, "loss": 0.7826, "step": 50 }, { "epoch": 0.18461538461538463, "grad_norm": 0.835067800401052, "learning_rate": 5e-06, "loss": 0.7723, "step": 60 }, { "epoch": 0.2153846153846154, "grad_norm": 0.781647957469068, "learning_rate": 5e-06, "loss": 0.767, "step": 70 }, { "epoch": 0.24615384615384617, "grad_norm": 1.1712549789017386, "learning_rate": 5e-06, "loss": 0.7559, "step": 80 }, { "epoch": 0.27692307692307694, "grad_norm": 1.0092940618549995, "learning_rate": 5e-06, "loss": 0.7469, "step": 90 }, { "epoch": 0.3076923076923077, "grad_norm": 0.5624307958541879, "learning_rate": 5e-06, "loss": 0.7414, "step": 100 }, { "epoch": 0.3384615384615385, "grad_norm": 0.5739018842323079, "learning_rate": 5e-06, "loss": 0.7402, "step": 110 }, { "epoch": 0.36923076923076925, "grad_norm": 0.5881069634317425, "learning_rate": 5e-06, "loss": 0.7409, "step": 120 }, { "epoch": 0.4, "grad_norm": 0.7381656504744467, "learning_rate": 5e-06, "loss": 0.7376, "step": 130 }, { "epoch": 0.4307692307692308, "grad_norm": 0.5677544829127201, "learning_rate": 5e-06, "loss": 0.7298, "step": 140 }, { "epoch": 0.46153846153846156, "grad_norm": 0.7988060295240222, "learning_rate": 5e-06, "loss": 0.734, "step": 150 }, { "epoch": 0.49230769230769234, "grad_norm": 0.6718712556340437, "learning_rate": 5e-06, "loss": 0.7301, "step": 160 }, { "epoch": 0.5230769230769231, "grad_norm": 0.6738769041614976, "learning_rate": 5e-06, "loss": 0.7316, "step": 170 }, { "epoch": 0.5538461538461539, "grad_norm": 0.5838925373137313, "learning_rate": 5e-06, "loss": 0.7253, "step": 180 }, { "epoch": 0.5846153846153846, "grad_norm": 0.6725428665358475, "learning_rate": 5e-06, "loss": 0.7241, "step": 190 }, { "epoch": 0.6153846153846154, "grad_norm": 0.5603996763654583, "learning_rate": 5e-06, "loss": 0.7228, "step": 200 }, { "epoch": 0.6461538461538462, "grad_norm": 0.7449029762390439, "learning_rate": 5e-06, "loss": 0.7204, "step": 210 }, { "epoch": 0.676923076923077, "grad_norm": 0.5236476916978245, "learning_rate": 5e-06, "loss": 0.717, "step": 220 }, { "epoch": 0.7076923076923077, "grad_norm": 0.6865974040573393, "learning_rate": 5e-06, "loss": 0.7134, "step": 230 }, { "epoch": 0.7384615384615385, "grad_norm": 0.6717184095724111, "learning_rate": 5e-06, "loss": 0.7159, "step": 240 }, { "epoch": 0.7692307692307693, "grad_norm": 0.8235556500295945, "learning_rate": 5e-06, "loss": 0.711, "step": 250 }, { "epoch": 0.8, "grad_norm": 0.5131501040264781, "learning_rate": 5e-06, "loss": 0.7165, "step": 260 }, { "epoch": 0.8307692307692308, "grad_norm": 0.5193918430229191, "learning_rate": 5e-06, "loss": 0.7055, "step": 270 }, { "epoch": 0.8615384615384616, "grad_norm": 0.5456897002979056, "learning_rate": 5e-06, "loss": 0.711, "step": 280 }, { "epoch": 0.8923076923076924, "grad_norm": 0.5750072559309753, "learning_rate": 5e-06, "loss": 0.7084, "step": 290 }, { "epoch": 0.9230769230769231, "grad_norm": 0.6116232515773206, "learning_rate": 5e-06, "loss": 0.7128, "step": 300 }, { "epoch": 0.9538461538461539, "grad_norm": 0.5681793375147939, "learning_rate": 5e-06, "loss": 0.7086, "step": 310 }, { "epoch": 0.9846153846153847, "grad_norm": 0.6343675909171526, "learning_rate": 5e-06, "loss": 0.7104, "step": 320 }, { "epoch": 1.0, "eval_loss": 0.7103046774864197, "eval_runtime": 341.5784, "eval_samples_per_second": 25.637, "eval_steps_per_second": 0.401, "step": 325 }, { "epoch": 1.0153846153846153, "grad_norm": 0.7692870192954333, "learning_rate": 5e-06, "loss": 0.6897, "step": 330 }, { "epoch": 1.0461538461538462, "grad_norm": 0.5505626206403108, "learning_rate": 5e-06, "loss": 0.6636, "step": 340 }, { "epoch": 1.0769230769230769, "grad_norm": 0.5824910302345806, "learning_rate": 5e-06, "loss": 0.665, "step": 350 }, { "epoch": 1.1076923076923078, "grad_norm": 0.6417795632315966, "learning_rate": 5e-06, "loss": 0.664, "step": 360 }, { "epoch": 1.1384615384615384, "grad_norm": 0.6909469669994084, "learning_rate": 5e-06, "loss": 0.6634, "step": 370 }, { "epoch": 1.1692307692307693, "grad_norm": 0.5616732809499269, "learning_rate": 5e-06, "loss": 0.6653, "step": 380 }, { "epoch": 1.2, "grad_norm": 0.567347803253029, "learning_rate": 5e-06, "loss": 0.6605, "step": 390 }, { "epoch": 1.2307692307692308, "grad_norm": 0.6020788318645328, "learning_rate": 5e-06, "loss": 0.6649, "step": 400 }, { "epoch": 1.2615384615384615, "grad_norm": 0.6247830402384781, "learning_rate": 5e-06, "loss": 0.6662, "step": 410 }, { "epoch": 1.2923076923076924, "grad_norm": 0.7223035560412274, "learning_rate": 5e-06, "loss": 0.6631, "step": 420 }, { "epoch": 1.323076923076923, "grad_norm": 0.7277266215407076, "learning_rate": 5e-06, "loss": 0.6596, "step": 430 }, { "epoch": 1.353846153846154, "grad_norm": 0.5531058222331595, "learning_rate": 5e-06, "loss": 0.6659, "step": 440 }, { "epoch": 1.3846153846153846, "grad_norm": 0.7560378640801586, "learning_rate": 5e-06, "loss": 0.6654, "step": 450 }, { "epoch": 1.4153846153846155, "grad_norm": 0.6002650702602214, "learning_rate": 5e-06, "loss": 0.6602, "step": 460 }, { "epoch": 1.4461538461538461, "grad_norm": 0.5446217104790005, "learning_rate": 5e-06, "loss": 0.6633, "step": 470 }, { "epoch": 1.476923076923077, "grad_norm": 0.6427854645725265, "learning_rate": 5e-06, "loss": 0.6606, "step": 480 }, { "epoch": 1.5076923076923077, "grad_norm": 0.5777163615379136, "learning_rate": 5e-06, "loss": 0.6619, "step": 490 }, { "epoch": 1.5384615384615383, "grad_norm": 0.5645927733372786, "learning_rate": 5e-06, "loss": 0.6613, "step": 500 }, { "epoch": 1.5692307692307692, "grad_norm": 0.5877237236567665, "learning_rate": 5e-06, "loss": 0.6666, "step": 510 }, { "epoch": 1.6, "grad_norm": 0.7318508117188922, "learning_rate": 5e-06, "loss": 0.6657, "step": 520 }, { "epoch": 1.6307692307692307, "grad_norm": 0.5337461204074161, "learning_rate": 5e-06, "loss": 0.6678, "step": 530 }, { "epoch": 1.6615384615384614, "grad_norm": 0.6743103368990547, "learning_rate": 5e-06, "loss": 0.6593, "step": 540 }, { "epoch": 1.6923076923076923, "grad_norm": 0.7414434681428989, "learning_rate": 5e-06, "loss": 0.6575, "step": 550 }, { "epoch": 1.7230769230769232, "grad_norm": 0.8444136346063817, "learning_rate": 5e-06, "loss": 0.6631, "step": 560 }, { "epoch": 1.7538461538461538, "grad_norm": 0.5509955001744837, "learning_rate": 5e-06, "loss": 0.6592, "step": 570 }, { "epoch": 1.7846153846153845, "grad_norm": 0.6630480503790037, "learning_rate": 5e-06, "loss": 0.6594, "step": 580 }, { "epoch": 1.8153846153846154, "grad_norm": 0.5120012670268558, "learning_rate": 5e-06, "loss": 0.6568, "step": 590 }, { "epoch": 1.8461538461538463, "grad_norm": 0.5752130113945352, "learning_rate": 5e-06, "loss": 0.6599, "step": 600 }, { "epoch": 1.876923076923077, "grad_norm": 0.5835189539855182, "learning_rate": 5e-06, "loss": 0.6591, "step": 610 }, { "epoch": 1.9076923076923076, "grad_norm": 0.5603728195599548, "learning_rate": 5e-06, "loss": 0.6629, "step": 620 }, { "epoch": 1.9384615384615385, "grad_norm": 0.6856529088244573, "learning_rate": 5e-06, "loss": 0.6586, "step": 630 }, { "epoch": 1.9692307692307693, "grad_norm": 0.5160023610012462, "learning_rate": 5e-06, "loss": 0.6603, "step": 640 }, { "epoch": 2.0, "grad_norm": 0.6268329172925445, "learning_rate": 5e-06, "loss": 0.658, "step": 650 }, { "epoch": 2.0, "eval_loss": 0.6978012323379517, "eval_runtime": 343.0061, "eval_samples_per_second": 25.53, "eval_steps_per_second": 0.399, "step": 650 }, { "epoch": 2.0307692307692307, "grad_norm": 0.7968437785991972, "learning_rate": 5e-06, "loss": 0.6088, "step": 660 }, { "epoch": 2.0615384615384613, "grad_norm": 1.1539294867806145, "learning_rate": 5e-06, "loss": 0.6083, "step": 670 }, { "epoch": 2.0923076923076924, "grad_norm": 0.9623521768346174, "learning_rate": 5e-06, "loss": 0.6149, "step": 680 }, { "epoch": 2.123076923076923, "grad_norm": 0.6040507943945268, "learning_rate": 5e-06, "loss": 0.6164, "step": 690 }, { "epoch": 2.1538461538461537, "grad_norm": 0.5851170034060879, "learning_rate": 5e-06, "loss": 0.6115, "step": 700 }, { "epoch": 2.184615384615385, "grad_norm": 0.600096702685487, "learning_rate": 5e-06, "loss": 0.6153, "step": 710 }, { "epoch": 2.2153846153846155, "grad_norm": 0.5589523777176305, "learning_rate": 5e-06, "loss": 0.6133, "step": 720 }, { "epoch": 2.246153846153846, "grad_norm": 0.5907169270955183, "learning_rate": 5e-06, "loss": 0.6131, "step": 730 }, { "epoch": 2.276923076923077, "grad_norm": 0.6628827223280479, "learning_rate": 5e-06, "loss": 0.6159, "step": 740 }, { "epoch": 2.3076923076923075, "grad_norm": 0.6215774766516785, "learning_rate": 5e-06, "loss": 0.6149, "step": 750 }, { "epoch": 2.3384615384615386, "grad_norm": 0.6649815302458084, "learning_rate": 5e-06, "loss": 0.6132, "step": 760 }, { "epoch": 2.3692307692307693, "grad_norm": 0.699779615122456, "learning_rate": 5e-06, "loss": 0.6128, "step": 770 }, { "epoch": 2.4, "grad_norm": 0.546711048417698, "learning_rate": 5e-06, "loss": 0.6164, "step": 780 }, { "epoch": 2.430769230769231, "grad_norm": 0.6327118907910535, "learning_rate": 5e-06, "loss": 0.623, "step": 790 }, { "epoch": 2.4615384615384617, "grad_norm": 0.5174866030508046, "learning_rate": 5e-06, "loss": 0.6166, "step": 800 }, { "epoch": 2.4923076923076923, "grad_norm": 0.5497949972140503, "learning_rate": 5e-06, "loss": 0.615, "step": 810 }, { "epoch": 2.523076923076923, "grad_norm": 0.5993580492245987, "learning_rate": 5e-06, "loss": 0.614, "step": 820 }, { "epoch": 2.5538461538461537, "grad_norm": 0.6442768713339674, "learning_rate": 5e-06, "loss": 0.6222, "step": 830 }, { "epoch": 2.5846153846153848, "grad_norm": 0.6428843121497717, "learning_rate": 5e-06, "loss": 0.617, "step": 840 }, { "epoch": 2.6153846153846154, "grad_norm": 0.6117392354746288, "learning_rate": 5e-06, "loss": 0.62, "step": 850 }, { "epoch": 2.646153846153846, "grad_norm": 0.7180295224765794, "learning_rate": 5e-06, "loss": 0.619, "step": 860 }, { "epoch": 2.676923076923077, "grad_norm": 0.6501140616713876, "learning_rate": 5e-06, "loss": 0.6164, "step": 870 }, { "epoch": 2.707692307692308, "grad_norm": 0.5412288061836285, "learning_rate": 5e-06, "loss": 0.6218, "step": 880 }, { "epoch": 2.7384615384615385, "grad_norm": 0.7302614499001912, "learning_rate": 5e-06, "loss": 0.6176, "step": 890 }, { "epoch": 2.769230769230769, "grad_norm": 0.6661401784715161, "learning_rate": 5e-06, "loss": 0.6217, "step": 900 }, { "epoch": 2.8, "grad_norm": 0.7263597509162613, "learning_rate": 5e-06, "loss": 0.6154, "step": 910 }, { "epoch": 2.830769230769231, "grad_norm": 0.7506085991128838, "learning_rate": 5e-06, "loss": 0.6189, "step": 920 }, { "epoch": 2.8615384615384616, "grad_norm": 0.5917421117589724, "learning_rate": 5e-06, "loss": 0.6211, "step": 930 }, { "epoch": 2.8923076923076922, "grad_norm": 0.5306708247183923, "learning_rate": 5e-06, "loss": 0.6175, "step": 940 }, { "epoch": 2.9230769230769234, "grad_norm": 0.6648394376559398, "learning_rate": 5e-06, "loss": 0.6222, "step": 950 }, { "epoch": 2.953846153846154, "grad_norm": 0.5459019838181679, "learning_rate": 5e-06, "loss": 0.6174, "step": 960 }, { "epoch": 2.9846153846153847, "grad_norm": 0.6951553333219783, "learning_rate": 5e-06, "loss": 0.6176, "step": 970 }, { "epoch": 3.0, "eval_loss": 0.6986876726150513, "eval_runtime": 342.0684, "eval_samples_per_second": 25.6, "eval_steps_per_second": 0.401, "step": 975 }, { "epoch": 3.0, "step": 975, "total_flos": 1632951934648320.0, "train_loss": 0.6751218311603253, "train_runtime": 57099.1896, "train_samples_per_second": 8.741, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 975, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1632951934648320.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }