{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 330, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09111617312072894, "grad_norm": 0.6609868407249451, "learning_rate": 2.7272727272727273e-05, "loss": 0.9616, "step": 10 }, { "epoch": 0.18223234624145787, "grad_norm": 0.47854021191596985, "learning_rate": 5.757575757575758e-05, "loss": 0.6243, "step": 20 }, { "epoch": 0.2733485193621868, "grad_norm": 0.3362863063812256, "learning_rate": 8.787878787878789e-05, "loss": 0.4652, "step": 30 }, { "epoch": 0.36446469248291574, "grad_norm": 0.2833992838859558, "learning_rate": 9.989933382359422e-05, "loss": 0.417, "step": 40 }, { "epoch": 0.45558086560364464, "grad_norm": 0.25973188877105713, "learning_rate": 9.928561894527353e-05, "loss": 0.3717, "step": 50 }, { "epoch": 0.5466970387243736, "grad_norm": 0.2526822090148926, "learning_rate": 9.812096688325354e-05, "loss": 0.3537, "step": 60 }, { "epoch": 0.6378132118451025, "grad_norm": 0.27696409821510315, "learning_rate": 9.641839665080363e-05, "loss": 0.3549, "step": 70 }, { "epoch": 0.7289293849658315, "grad_norm": 0.2620004117488861, "learning_rate": 9.419694035645751e-05, "loss": 0.3487, "step": 80 }, { "epoch": 0.8200455580865603, "grad_norm": 0.2623962461948395, "learning_rate": 9.14814304544018e-05, "loss": 0.3373, "step": 90 }, { "epoch": 0.9111617312072893, "grad_norm": 0.25898459553718567, "learning_rate": 8.83022221559489e-05, "loss": 0.3258, "step": 100 }, { "epoch": 0.9111617312072893, "eval_loss": 0.3247193992137909, "eval_runtime": 220.1924, "eval_samples_per_second": 1.771, "eval_steps_per_second": 0.445, "step": 100 }, { "epoch": 1.0, "grad_norm": 0.2992849051952362, "learning_rate": 8.469485410510545e-05, "loss": 0.3189, "step": 110 }, { "epoch": 1.0911161731207288, "grad_norm": 0.2806636691093445, "learning_rate": 8.06996511113601e-05, "loss": 0.2635, "step": 120 }, { "epoch": 1.182232346241458, "grad_norm": 0.3009299039840698, "learning_rate": 7.636127338052512e-05, "loss": 0.2591, "step": 130 }, { "epoch": 1.2733485193621867, "grad_norm": 0.2634446322917938, "learning_rate": 7.172821728253562e-05, "loss": 0.2522, "step": 140 }, { "epoch": 1.3644646924829158, "grad_norm": 0.25836703181266785, "learning_rate": 6.685227323685209e-05, "loss": 0.255, "step": 150 }, { "epoch": 1.4555808656036446, "grad_norm": 0.2524298131465912, "learning_rate": 6.178794677547137e-05, "loss": 0.2609, "step": 160 }, { "epoch": 1.5466970387243735, "grad_norm": 0.2687428593635559, "learning_rate": 5.6591849255168015e-05, "loss": 0.2486, "step": 170 }, { "epoch": 1.6378132118451025, "grad_norm": 0.2657402455806732, "learning_rate": 5.132206502986368e-05, "loss": 0.2414, "step": 180 }, { "epoch": 1.7289293849658316, "grad_norm": 0.270682156085968, "learning_rate": 4.603750215716057e-05, "loss": 0.2416, "step": 190 }, { "epoch": 1.8200455580865604, "grad_norm": 0.2766685485839844, "learning_rate": 4.0797233897138985e-05, "loss": 0.2463, "step": 200 }, { "epoch": 1.8200455580865604, "eval_loss": 0.30126097798347473, "eval_runtime": 97.3172, "eval_samples_per_second": 4.008, "eval_steps_per_second": 1.007, "step": 200 }, { "epoch": 1.9111617312072893, "grad_norm": 0.25800785422325134, "learning_rate": 3.5659838364445505e-05, "loss": 0.2402, "step": 210 }, { "epoch": 2.0, "grad_norm": 0.278238445520401, "learning_rate": 3.0682743715343564e-05, "loss": 0.2455, "step": 220 }, { "epoch": 2.091116173120729, "grad_norm": 0.25559282302856445, "learning_rate": 2.5921586189524694e-05, "loss": 0.1869, "step": 230 }, { "epoch": 2.1822323462414577, "grad_norm": 0.271758109331131, "learning_rate": 2.1429588182782144e-05, "loss": 0.182, "step": 240 }, { "epoch": 2.273348519362187, "grad_norm": 0.26463642716407776, "learning_rate": 1.725696330273575e-05, "loss": 0.1812, "step": 250 }, { "epoch": 2.364464692482916, "grad_norm": 0.26230356097221375, "learning_rate": 1.345035505816642e-05, "loss": 0.186, "step": 260 }, { "epoch": 2.4555808656036446, "grad_norm": 0.26036038994789124, "learning_rate": 1.0052315456547934e-05, "loss": 0.1816, "step": 270 }, { "epoch": 2.5466970387243735, "grad_norm": 0.24081580340862274, "learning_rate": 7.100829338251147e-06, "loss": 0.165, "step": 280 }, { "epoch": 2.6378132118451028, "grad_norm": 0.2601291239261627, "learning_rate": 4.6288897646302785e-06, "loss": 0.1747, "step": 290 }, { "epoch": 2.7289293849658316, "grad_norm": 0.26094359159469604, "learning_rate": 2.664129206497479e-06, "loss": 0.1706, "step": 300 }, { "epoch": 2.7289293849658316, "eval_loss": 0.3079453408718109, "eval_runtime": 96.434, "eval_samples_per_second": 4.044, "eval_steps_per_second": 1.016, "step": 300 }, { "epoch": 2.8200455580865604, "grad_norm": 0.25573495030403137, "learning_rate": 1.2285106557296477e-06, "loss": 0.1731, "step": 310 }, { "epoch": 2.9111617312072893, "grad_norm": 0.23953253030776978, "learning_rate": 3.380821129028489e-07, "loss": 0.1722, "step": 320 }, { "epoch": 3.0, "grad_norm": 0.28564217686653137, "learning_rate": 2.797195404247166e-09, "loss": 0.1705, "step": 330 }, { "epoch": 3.0, "step": 330, "total_flos": 5.7502645012240794e+17, "train_loss": 0.2902192401163506, "train_runtime": 4452.9364, "train_samples_per_second": 2.365, "train_steps_per_second": 0.074 } ], "logging_steps": 10, "max_steps": 330, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.7502645012240794e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }