{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 199,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005037783375314861,
      "grad_norm": 6.495429992675781,
      "learning_rate": 2.5e-08,
      "loss": 0.0456,
      "num_input_tokens_seen": 14432,
      "step": 1
    },
    {
      "epoch": 0.02518891687657431,
      "grad_norm": 29.901782989501953,
      "learning_rate": 1.25e-07,
      "loss": 0.398,
      "num_input_tokens_seen": 77072,
      "step": 5
    },
    {
      "epoch": 0.05037783375314862,
      "grad_norm": 27.349750518798828,
      "learning_rate": 2.5e-07,
      "loss": 0.1862,
      "num_input_tokens_seen": 149192,
      "step": 10
    },
    {
      "epoch": 0.07556675062972293,
      "grad_norm": 5.38295841217041,
      "learning_rate": 3.75e-07,
      "loss": 0.2736,
      "num_input_tokens_seen": 223528,
      "step": 15
    },
    {
      "epoch": 0.10075566750629723,
      "grad_norm": 28.08098030090332,
      "learning_rate": 5e-07,
      "loss": 0.3118,
      "num_input_tokens_seen": 303576,
      "step": 20
    },
    {
      "epoch": 0.12594458438287154,
      "grad_norm": 16.091209411621094,
      "learning_rate": 6.249999999999999e-07,
      "loss": 0.138,
      "num_input_tokens_seen": 378072,
      "step": 25
    },
    {
      "epoch": 0.15113350125944586,
      "grad_norm": 8.152594566345215,
      "learning_rate": 7.5e-07,
      "loss": 0.2662,
      "num_input_tokens_seen": 454800,
      "step": 30
    },
    {
      "epoch": 0.17632241813602015,
      "grad_norm": 4.938981056213379,
      "learning_rate": 8.75e-07,
      "loss": 0.1581,
      "num_input_tokens_seen": 529928,
      "step": 35
    },
    {
      "epoch": 0.20151133501259447,
      "grad_norm": 14.633964538574219,
      "learning_rate": 1e-06,
      "loss": 0.1316,
      "num_input_tokens_seen": 603768,
      "step": 40
    },
    {
      "epoch": 0.22670025188916876,
      "grad_norm": 16.411048889160156,
      "learning_rate": 9.995133583167832e-07,
      "loss": 0.208,
      "num_input_tokens_seen": 680480,
      "step": 45
    },
    {
      "epoch": 0.2518891687657431,
      "grad_norm": 9.24244213104248,
      "learning_rate": 9.980543805476444e-07,
      "loss": 0.1438,
      "num_input_tokens_seen": 755408,
      "step": 50
    },
    {
      "epoch": 0.2770780856423174,
      "grad_norm": 2.8657264709472656,
      "learning_rate": 9.95625906690173e-07,
      "loss": 0.0899,
      "num_input_tokens_seen": 830200,
      "step": 55
    },
    {
      "epoch": 0.3022670025188917,
      "grad_norm": 6.731296062469482,
      "learning_rate": 9.922326639307916e-07,
      "loss": 0.1614,
      "num_input_tokens_seen": 910000,
      "step": 60
    },
    {
      "epoch": 0.327455919395466,
      "grad_norm": 4.519056797027588,
      "learning_rate": 9.87881257442972e-07,
      "loss": 0.1569,
      "num_input_tokens_seen": 987368,
      "step": 65
    },
    {
      "epoch": 0.3526448362720403,
      "grad_norm": 27.61652374267578,
      "learning_rate": 9.825801575298247e-07,
      "loss": 0.2221,
      "num_input_tokens_seen": 1064224,
      "step": 70
    },
    {
      "epoch": 0.3778337531486146,
      "grad_norm": 4.453612804412842,
      "learning_rate": 9.763396831360883e-07,
      "loss": 0.1448,
      "num_input_tokens_seen": 1136896,
      "step": 75
    },
    {
      "epoch": 0.40302267002518893,
      "grad_norm": 7.729987144470215,
      "learning_rate": 9.691719817616146e-07,
      "loss": 0.0765,
      "num_input_tokens_seen": 1215664,
      "step": 80
    },
    {
      "epoch": 0.4282115869017632,
      "grad_norm": 5.992551326751709,
      "learning_rate": 9.61091005815451e-07,
      "loss": 0.0614,
      "num_input_tokens_seen": 1290864,
      "step": 85
    },
    {
      "epoch": 0.4534005037783375,
      "grad_norm": 3.615497589111328,
      "learning_rate": 9.521124854565424e-07,
      "loss": 0.0916,
      "num_input_tokens_seen": 1364584,
      "step": 90
    },
    {
      "epoch": 0.47858942065491183,
      "grad_norm": 34.75035095214844,
      "learning_rate": 9.422538979739307e-07,
      "loss": 0.1149,
      "num_input_tokens_seen": 1441024,
      "step": 95
    },
    {
      "epoch": 0.5037783375314862,
      "grad_norm": 14.529001235961914,
      "learning_rate": 9.315344337660421e-07,
      "loss": 0.1291,
      "num_input_tokens_seen": 1518184,
      "step": 100
    },
    {
      "epoch": 0.5289672544080605,
      "grad_norm": 11.793113708496094,
      "learning_rate": 9.199749589852978e-07,
      "loss": 0.257,
      "num_input_tokens_seen": 1595672,
      "step": 105
    },
    {
      "epoch": 0.5541561712846348,
      "grad_norm": 18.369070053100586,
      "learning_rate": 9.07597974920756e-07,
      "loss": 0.1978,
      "num_input_tokens_seen": 1673760,
      "step": 110
    },
    {
      "epoch": 0.5793450881612091,
      "grad_norm": 8.60573959350586,
      "learning_rate": 8.944275741978494e-07,
      "loss": 0.1907,
      "num_input_tokens_seen": 1752864,
      "step": 115
    },
    {
      "epoch": 0.6045340050377834,
      "grad_norm": 4.430349349975586,
      "learning_rate": 8.804893938804838e-07,
      "loss": 0.2279,
      "num_input_tokens_seen": 1830584,
      "step": 120
    },
    {
      "epoch": 0.6297229219143576,
      "grad_norm": 6.285661697387695,
      "learning_rate": 8.658105655667819e-07,
      "loss": 0.1144,
      "num_input_tokens_seen": 1907336,
      "step": 125
    },
    {
      "epoch": 0.654911838790932,
      "grad_norm": 5.751603126525879,
      "learning_rate": 8.504196625756165e-07,
      "loss": 0.1368,
      "num_input_tokens_seen": 1981776,
      "step": 130
    },
    {
      "epoch": 0.6801007556675063,
      "grad_norm": 14.740753173828125,
      "learning_rate": 8.343466443267389e-07,
      "loss": 0.1961,
      "num_input_tokens_seen": 2058696,
      "step": 135
    },
    {
      "epoch": 0.7052896725440806,
      "grad_norm": 6.905796051025391,
      "learning_rate": 8.176227980227692e-07,
      "loss": 0.0864,
      "num_input_tokens_seen": 2135904,
      "step": 140
    },
    {
      "epoch": 0.7304785894206549,
      "grad_norm": 10.428398132324219,
      "learning_rate": 8.002806777465684e-07,
      "loss": 0.18,
      "num_input_tokens_seen": 2213328,
      "step": 145
    },
    {
      "epoch": 0.7556675062972292,
      "grad_norm": 12.141814231872559,
      "learning_rate": 7.823540410925433e-07,
      "loss": 0.0682,
      "num_input_tokens_seen": 2289976,
      "step": 150
    },
    {
      "epoch": 0.7808564231738035,
      "grad_norm": 13.058424949645996,
      "learning_rate": 7.63877783455237e-07,
      "loss": 0.2185,
      "num_input_tokens_seen": 2366048,
      "step": 155
    },
    {
      "epoch": 0.8060453400503779,
      "grad_norm": 8.26505184173584,
      "learning_rate": 7.448878701031142e-07,
      "loss": 0.0514,
      "num_input_tokens_seen": 2440224,
      "step": 160
    },
    {
      "epoch": 0.8312342569269522,
      "grad_norm": 18.119977951049805,
      "learning_rate": 7.254212661697659e-07,
      "loss": 0.13,
      "num_input_tokens_seen": 2518392,
      "step": 165
    },
    {
      "epoch": 0.8564231738035264,
      "grad_norm": 14.345734596252441,
      "learning_rate": 7.055158646988109e-07,
      "loss": 0.1626,
      "num_input_tokens_seen": 2591048,
      "step": 170
    },
    {
      "epoch": 0.8816120906801007,
      "grad_norm": 5.978022575378418,
      "learning_rate": 6.852104128825569e-07,
      "loss": 0.1861,
      "num_input_tokens_seen": 2666192,
      "step": 175
    },
    {
      "epoch": 0.906801007556675,
      "grad_norm": 6.636469841003418,
      "learning_rate": 6.64544436638005e-07,
      "loss": 0.1828,
      "num_input_tokens_seen": 2740216,
      "step": 180
    },
    {
      "epoch": 0.9319899244332494,
      "grad_norm": 10.074236869812012,
      "learning_rate": 6.435581636670153e-07,
      "loss": 0.1638,
      "num_input_tokens_seen": 2815464,
      "step": 185
    },
    {
      "epoch": 0.9571788413098237,
      "grad_norm": 5.399825572967529,
      "learning_rate": 6.222924451504e-07,
      "loss": 0.125,
      "num_input_tokens_seen": 2892736,
      "step": 190
    },
    {
      "epoch": 0.982367758186398,
      "grad_norm": 8.574385643005371,
      "learning_rate": 6.007886762283739e-07,
      "loss": 0.0747,
      "num_input_tokens_seen": 2969936,
      "step": 195
    }
  ],
  "logging_steps": 5,
  "max_steps": 396,
  "num_input_tokens_seen": 3024200,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 16364156059648.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}