{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 199, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005037783375314861, "grad_norm": 6.495429992675781, "learning_rate": 2.5e-08, "loss": 0.0456, "num_input_tokens_seen": 14432, "step": 1 }, { "epoch": 0.02518891687657431, "grad_norm": 29.901782989501953, "learning_rate": 1.25e-07, "loss": 0.398, "num_input_tokens_seen": 77072, "step": 5 }, { "epoch": 0.05037783375314862, "grad_norm": 27.349750518798828, "learning_rate": 2.5e-07, "loss": 0.1862, "num_input_tokens_seen": 149192, "step": 10 }, { "epoch": 0.07556675062972293, "grad_norm": 5.38295841217041, "learning_rate": 3.75e-07, "loss": 0.2736, "num_input_tokens_seen": 223528, "step": 15 }, { "epoch": 0.10075566750629723, "grad_norm": 28.08098030090332, "learning_rate": 5e-07, "loss": 0.3118, "num_input_tokens_seen": 303576, "step": 20 }, { "epoch": 0.12594458438287154, "grad_norm": 16.091209411621094, "learning_rate": 6.249999999999999e-07, "loss": 0.138, "num_input_tokens_seen": 378072, "step": 25 }, { "epoch": 0.15113350125944586, "grad_norm": 8.152594566345215, "learning_rate": 7.5e-07, "loss": 0.2662, "num_input_tokens_seen": 454800, "step": 30 }, { "epoch": 0.17632241813602015, "grad_norm": 4.938981056213379, "learning_rate": 8.75e-07, "loss": 0.1581, "num_input_tokens_seen": 529928, "step": 35 }, { "epoch": 0.20151133501259447, "grad_norm": 14.633964538574219, "learning_rate": 1e-06, "loss": 0.1316, "num_input_tokens_seen": 603768, "step": 40 }, { "epoch": 0.22670025188916876, "grad_norm": 16.411048889160156, "learning_rate": 9.995133583167832e-07, "loss": 0.208, "num_input_tokens_seen": 680480, "step": 45 }, { "epoch": 0.2518891687657431, "grad_norm": 9.24244213104248, "learning_rate": 9.980543805476444e-07, "loss": 0.1438, "num_input_tokens_seen": 755408, "step": 50 }, { "epoch": 0.2770780856423174, "grad_norm": 2.8657264709472656, "learning_rate": 9.95625906690173e-07, "loss": 0.0899, "num_input_tokens_seen": 830200, "step": 55 }, { "epoch": 0.3022670025188917, "grad_norm": 6.731296062469482, "learning_rate": 9.922326639307916e-07, "loss": 0.1614, "num_input_tokens_seen": 910000, "step": 60 }, { "epoch": 0.327455919395466, "grad_norm": 4.519056797027588, "learning_rate": 9.87881257442972e-07, "loss": 0.1569, "num_input_tokens_seen": 987368, "step": 65 }, { "epoch": 0.3526448362720403, "grad_norm": 27.61652374267578, "learning_rate": 9.825801575298247e-07, "loss": 0.2221, "num_input_tokens_seen": 1064224, "step": 70 }, { "epoch": 0.3778337531486146, "grad_norm": 4.453612804412842, "learning_rate": 9.763396831360883e-07, "loss": 0.1448, "num_input_tokens_seen": 1136896, "step": 75 }, { "epoch": 0.40302267002518893, "grad_norm": 7.729987144470215, "learning_rate": 9.691719817616146e-07, "loss": 0.0765, "num_input_tokens_seen": 1215664, "step": 80 }, { "epoch": 0.4282115869017632, "grad_norm": 5.992551326751709, "learning_rate": 9.61091005815451e-07, "loss": 0.0614, "num_input_tokens_seen": 1290864, "step": 85 }, { "epoch": 0.4534005037783375, "grad_norm": 3.615497589111328, "learning_rate": 9.521124854565424e-07, "loss": 0.0916, "num_input_tokens_seen": 1364584, "step": 90 }, { "epoch": 0.47858942065491183, "grad_norm": 34.75035095214844, "learning_rate": 9.422538979739307e-07, "loss": 0.1149, "num_input_tokens_seen": 1441024, "step": 95 }, { "epoch": 0.5037783375314862, "grad_norm": 14.529001235961914, "learning_rate": 9.315344337660421e-07, "loss": 0.1291, "num_input_tokens_seen": 1518184, "step": 100 }, { "epoch": 0.5289672544080605, "grad_norm": 11.793113708496094, "learning_rate": 9.199749589852978e-07, "loss": 0.257, "num_input_tokens_seen": 1595672, "step": 105 }, { "epoch": 0.5541561712846348, "grad_norm": 18.369070053100586, "learning_rate": 9.07597974920756e-07, "loss": 0.1978, "num_input_tokens_seen": 1673760, "step": 110 }, { "epoch": 0.5793450881612091, "grad_norm": 8.60573959350586, "learning_rate": 8.944275741978494e-07, "loss": 0.1907, "num_input_tokens_seen": 1752864, "step": 115 }, { "epoch": 0.6045340050377834, "grad_norm": 4.430349349975586, "learning_rate": 8.804893938804838e-07, "loss": 0.2279, "num_input_tokens_seen": 1830584, "step": 120 }, { "epoch": 0.6297229219143576, "grad_norm": 6.285661697387695, "learning_rate": 8.658105655667819e-07, "loss": 0.1144, "num_input_tokens_seen": 1907336, "step": 125 }, { "epoch": 0.654911838790932, "grad_norm": 5.751603126525879, "learning_rate": 8.504196625756165e-07, "loss": 0.1368, "num_input_tokens_seen": 1981776, "step": 130 }, { "epoch": 0.6801007556675063, "grad_norm": 14.740753173828125, "learning_rate": 8.343466443267389e-07, "loss": 0.1961, "num_input_tokens_seen": 2058696, "step": 135 }, { "epoch": 0.7052896725440806, "grad_norm": 6.905796051025391, "learning_rate": 8.176227980227692e-07, "loss": 0.0864, "num_input_tokens_seen": 2135904, "step": 140 }, { "epoch": 0.7304785894206549, "grad_norm": 10.428398132324219, "learning_rate": 8.002806777465684e-07, "loss": 0.18, "num_input_tokens_seen": 2213328, "step": 145 }, { "epoch": 0.7556675062972292, "grad_norm": 12.141814231872559, "learning_rate": 7.823540410925433e-07, "loss": 0.0682, "num_input_tokens_seen": 2289976, "step": 150 }, { "epoch": 0.7808564231738035, "grad_norm": 13.058424949645996, "learning_rate": 7.63877783455237e-07, "loss": 0.2185, "num_input_tokens_seen": 2366048, "step": 155 }, { "epoch": 0.8060453400503779, "grad_norm": 8.26505184173584, "learning_rate": 7.448878701031142e-07, "loss": 0.0514, "num_input_tokens_seen": 2440224, "step": 160 }, { "epoch": 0.8312342569269522, "grad_norm": 18.119977951049805, "learning_rate": 7.254212661697659e-07, "loss": 0.13, "num_input_tokens_seen": 2518392, "step": 165 }, { "epoch": 0.8564231738035264, "grad_norm": 14.345734596252441, "learning_rate": 7.055158646988109e-07, "loss": 0.1626, "num_input_tokens_seen": 2591048, "step": 170 }, { "epoch": 0.8816120906801007, "grad_norm": 5.978022575378418, "learning_rate": 6.852104128825569e-07, "loss": 0.1861, "num_input_tokens_seen": 2666192, "step": 175 }, { "epoch": 0.906801007556675, "grad_norm": 6.636469841003418, "learning_rate": 6.64544436638005e-07, "loss": 0.1828, "num_input_tokens_seen": 2740216, "step": 180 }, { "epoch": 0.9319899244332494, "grad_norm": 10.074236869812012, "learning_rate": 6.435581636670153e-07, "loss": 0.1638, "num_input_tokens_seen": 2815464, "step": 185 }, { "epoch": 0.9571788413098237, "grad_norm": 5.399825572967529, "learning_rate": 6.222924451504e-07, "loss": 0.125, "num_input_tokens_seen": 2892736, "step": 190 }, { "epoch": 0.982367758186398, "grad_norm": 8.574385643005371, "learning_rate": 6.007886762283739e-07, "loss": 0.0747, "num_input_tokens_seen": 2969936, "step": 195 } ], "logging_steps": 5, "max_steps": 396, "num_input_tokens_seen": 3024200, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 16364156059648.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }