{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 267, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05649717514124294, "grad_norm": 1.9709218740463257, "learning_rate": 4.444444444444444e-06, "loss": 2.0949, "step": 5 }, { "epoch": 0.11299435028248588, "grad_norm": 0.6447364687919617, "learning_rate": 9.999999999999999e-06, "loss": 2.0169, "step": 10 }, { "epoch": 0.1694915254237288, "grad_norm": 0.4998757839202881, "learning_rate": 1.5555555555555555e-05, "loss": 1.9899, "step": 15 }, { "epoch": 0.22598870056497175, "grad_norm": 0.45615148544311523, "learning_rate": 2.111111111111111e-05, "loss": 1.9512, "step": 20 }, { "epoch": 0.2824858757062147, "grad_norm": 0.40060025453567505, "learning_rate": 2.6666666666666667e-05, "loss": 1.8531, "step": 25 }, { "epoch": 0.3389830508474576, "grad_norm": 0.3707159161567688, "learning_rate": 2.9994859874633358e-05, "loss": 1.8622, "step": 30 }, { "epoch": 0.3954802259887006, "grad_norm": 0.37233778834342957, "learning_rate": 2.9937073913619926e-05, "loss": 1.7941, "step": 35 }, { "epoch": 0.4519774011299435, "grad_norm": 0.47624891996383667, "learning_rate": 2.981532510892707e-05, "loss": 1.6613, "step": 40 }, { "epoch": 0.5084745762711864, "grad_norm": 0.4137255847454071, "learning_rate": 2.963013480762769e-05, "loss": 1.6513, "step": 45 }, { "epoch": 0.5649717514124294, "grad_norm": 0.5203472971916199, "learning_rate": 2.9382296023022895e-05, "loss": 1.5814, "step": 50 }, { "epoch": 0.6214689265536724, "grad_norm": 0.558957576751709, "learning_rate": 2.9072870038837266e-05, "loss": 1.5635, "step": 55 }, { "epoch": 0.6779661016949152, "grad_norm": 0.5810022950172424, "learning_rate": 2.8703181864639013e-05, "loss": 1.4292, "step": 60 }, { "epoch": 0.7344632768361582, "grad_norm": 0.6703647971153259, "learning_rate": 2.827481456194563e-05, "loss": 1.377, "step": 65 }, { "epoch": 0.7909604519774012, "grad_norm": 0.6997694969177246, "learning_rate": 2.7789602465311384e-05, "loss": 1.2762, "step": 70 }, { "epoch": 0.847457627118644, "grad_norm": 0.7786891460418701, "learning_rate": 2.7249623327425187e-05, "loss": 1.2512, "step": 75 }, { "epoch": 0.903954802259887, "grad_norm": 0.9769183993339539, "learning_rate": 2.6657189421854564e-05, "loss": 1.1935, "step": 80 }, { "epoch": 0.96045197740113, "grad_norm": 1.0085625648498535, "learning_rate": 2.6014837641535285e-05, "loss": 1.1678, "step": 85 }, { "epoch": 1.0112994350282485, "grad_norm": 1.1803938150405884, "learning_rate": 2.5325318635406308e-05, "loss": 1.0728, "step": 90 }, { "epoch": 1.0677966101694916, "grad_norm": 1.2611010074615479, "learning_rate": 2.4591585029708772e-05, "loss": 0.9305, "step": 95 }, { "epoch": 1.1242937853107344, "grad_norm": 1.2549060583114624, "learning_rate": 2.3816778784387097e-05, "loss": 0.9051, "step": 100 }, { "epoch": 1.1807909604519775, "grad_norm": 1.189209222793579, "learning_rate": 2.3004217738734173e-05, "loss": 0.857, "step": 105 }, { "epoch": 1.2372881355932204, "grad_norm": 1.4436615705490112, "learning_rate": 2.2157381403894126e-05, "loss": 0.7732, "step": 110 }, { "epoch": 1.2937853107344632, "grad_norm": 1.2701079845428467, "learning_rate": 2.1279896063061422e-05, "loss": 0.7486, "step": 115 }, { "epoch": 1.3502824858757063, "grad_norm": 1.6101775169372559, "learning_rate": 2.03755192431795e-05, "loss": 0.703, "step": 120 }, { "epoch": 1.4067796610169492, "grad_norm": 1.269362211227417, "learning_rate": 1.9448123624633565e-05, "loss": 0.6978, "step": 125 }, { "epoch": 1.463276836158192, "grad_norm": 1.514592170715332, "learning_rate": 1.8501680457838582e-05, "loss": 0.6686, "step": 130 }, { "epoch": 1.5197740112994351, "grad_norm": 1.4410349130630493, "learning_rate": 1.7540242557735366e-05, "loss": 0.5763, "step": 135 }, { "epoch": 1.576271186440678, "grad_norm": 1.8117526769638062, "learning_rate": 1.6567926949014805e-05, "loss": 0.5527, "step": 140 }, { "epoch": 1.6327683615819208, "grad_norm": 1.3523154258728027, "learning_rate": 1.558889723638603e-05, "loss": 0.5502, "step": 145 }, { "epoch": 1.689265536723164, "grad_norm": 1.4709467887878418, "learning_rate": 1.4607345775381906e-05, "loss": 0.5152, "step": 150 }, { "epoch": 1.7457627118644068, "grad_norm": 1.6748976707458496, "learning_rate": 1.3627475720048966e-05, "loss": 0.5193, "step": 155 }, { "epoch": 1.8022598870056497, "grad_norm": 1.3551208972930908, "learning_rate": 1.2653483024396535e-05, "loss": 0.4585, "step": 160 }, { "epoch": 1.8587570621468927, "grad_norm": 1.7820173501968384, "learning_rate": 1.1689538474677485e-05, "loss": 0.4549, "step": 165 }, { "epoch": 1.9152542372881356, "grad_norm": 1.6256532669067383, "learning_rate": 1.073976982944116e-05, "loss": 0.4349, "step": 170 }, { "epoch": 1.9717514124293785, "grad_norm": 1.5240833759307861, "learning_rate": 9.808244143837603e-06, "loss": 0.3836, "step": 175 }, { "epoch": 2.022598870056497, "grad_norm": 1.3894063234329224, "learning_rate": 8.898950353863e-06, "loss": 0.3354, "step": 180 }, { "epoch": 2.07909604519774, "grad_norm": 1.5614291429519653, "learning_rate": 8.015782195123329e-06, "loss": 0.3003, "step": 185 }, { "epoch": 2.135593220338983, "grad_norm": 1.3802332878112793, "learning_rate": 7.1625215292607685e-06, "loss": 0.2765, "step": 190 }, { "epoch": 2.1920903954802258, "grad_norm": 1.4348934888839722, "learning_rate": 6.3428221494414976e-06, "loss": 0.2754, "step": 195 }, { "epoch": 2.248587570621469, "grad_norm": 1.1548937559127808, "learning_rate": 5.560194134252441e-06, "loss": 0.2583, "step": 200 }, { "epoch": 2.305084745762712, "grad_norm": 1.5925812721252441, "learning_rate": 4.817988817005873e-06, "loss": 0.2607, "step": 205 }, { "epoch": 2.361581920903955, "grad_norm": 1.2499586343765259, "learning_rate": 4.119384434815689e-06, "loss": 0.2518, "step": 210 }, { "epoch": 2.4180790960451977, "grad_norm": 1.2626211643218994, "learning_rate": 3.4673725188981083e-06, "loss": 0.2318, "step": 215 }, { "epoch": 2.4745762711864407, "grad_norm": 1.214537262916565, "learning_rate": 2.86474508437579e-06, "loss": 0.2329, "step": 220 }, { "epoch": 2.5310734463276834, "grad_norm": 1.0615917444229126, "learning_rate": 2.314082674440402e-06, "loss": 0.2075, "step": 225 }, { "epoch": 2.5875706214689265, "grad_norm": 1.1500240564346313, "learning_rate": 1.817743310070521e-06, "loss": 0.2267, "step": 230 }, { "epoch": 2.6440677966101696, "grad_norm": 1.0365818738937378, "learning_rate": 1.3778523926237797e-06, "loss": 0.1853, "step": 235 }, { "epoch": 2.7005649717514126, "grad_norm": 1.3539810180664062, "learning_rate": 9.962936025419755e-07, "loss": 0.2178, "step": 240 }, { "epoch": 2.7570621468926553, "grad_norm": 1.1099207401275635, "learning_rate": 6.747008331422006e-07, "loss": 0.2211, "step": 245 }, { "epoch": 2.8135593220338984, "grad_norm": 1.1119697093963623, "learning_rate": 4.1445119403485165e-07, "loss": 0.2054, "step": 250 }, { "epoch": 2.870056497175141, "grad_norm": 1.1232099533081055, "learning_rate": 2.1665911412883376e-07, "loss": 0.2283, "step": 255 }, { "epoch": 2.926553672316384, "grad_norm": 1.1913937330245972, "learning_rate": 8.217156947590066e-08, "loss": 0.1973, "step": 260 }, { "epoch": 2.983050847457627, "grad_norm": 1.1854861974716187, "learning_rate": 1.1564456389156486e-08, "loss": 0.2062, "step": 265 }, { "epoch": 3.0, "step": 267, "total_flos": 4.7428635473844634e+17, "train_loss": 0.821970669629422, "train_runtime": 236.2192, "train_samples_per_second": 35.967, "train_steps_per_second": 1.13 } ], "logging_steps": 5, "max_steps": 267, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.7428635473844634e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }