{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9910313901345291,
  "eval_steps": 500,
  "global_step": 444,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02242152466367713,
      "grad_norm": 3.140625,
      "learning_rate": 1.5712402217176182e-05,
      "loss": 0.9259,
      "step": 5
    },
    {
      "epoch": 0.04484304932735426,
      "grad_norm": 1.1015625,
      "learning_rate": 3.535290498864641e-05,
      "loss": 0.7319,
      "step": 10
    },
    {
      "epoch": 0.06726457399103139,
      "grad_norm": 1.03125,
      "learning_rate": 5.499340776011664e-05,
      "loss": 0.6744,
      "step": 15
    },
    {
      "epoch": 0.08968609865470852,
      "grad_norm": 1.0078125,
      "learning_rate": 7.463391053158687e-05,
      "loss": 0.6524,
      "step": 20
    },
    {
      "epoch": 0.11210762331838565,
      "grad_norm": 0.97265625,
      "learning_rate": 9.427441330305709e-05,
      "loss": 0.6502,
      "step": 25
    },
    {
      "epoch": 0.13452914798206278,
      "grad_norm": 1.1875,
      "learning_rate": 0.00011391491607452732,
      "loss": 0.6209,
      "step": 30
    },
    {
      "epoch": 0.15695067264573992,
      "grad_norm": 1.2265625,
      "learning_rate": 0.00013355541884599756,
      "loss": 0.6194,
      "step": 35
    },
    {
      "epoch": 0.17937219730941703,
      "grad_norm": 1.2109375,
      "learning_rate": 0.0001531959216174678,
      "loss": 0.625,
      "step": 40
    },
    {
      "epoch": 0.20179372197309417,
      "grad_norm": 1.171875,
      "learning_rate": 0.000172836424388938,
      "loss": 0.6517,
      "step": 45
    },
    {
      "epoch": 0.2242152466367713,
      "grad_norm": 1.046875,
      "learning_rate": 0.00019247692716040823,
      "loss": 0.6418,
      "step": 50
    },
    {
      "epoch": 0.24663677130044842,
      "grad_norm": 0.95703125,
      "learning_rate": 0.00019638985101337757,
      "loss": 0.6533,
      "step": 55
    },
    {
      "epoch": 0.26905829596412556,
      "grad_norm": 1.46875,
      "learning_rate": 0.00019632820638367662,
      "loss": 0.753,
      "step": 60
    },
    {
      "epoch": 0.2914798206278027,
      "grad_norm": 0.88671875,
      "learning_rate": 0.0001962191849457307,
      "loss": 0.6835,
      "step": 65
    },
    {
      "epoch": 0.31390134529147984,
      "grad_norm": 0.95703125,
      "learning_rate": 0.0001960628569011471,
      "loss": 0.6626,
      "step": 70
    },
    {
      "epoch": 0.336322869955157,
      "grad_norm": 0.6484375,
      "learning_rate": 0.00019585932291343037,
      "loss": 0.6538,
      "step": 75
    },
    {
      "epoch": 0.35874439461883406,
      "grad_norm": 0.80859375,
      "learning_rate": 0.0001956087140431623,
      "loss": 0.6461,
      "step": 80
    },
    {
      "epoch": 0.3811659192825112,
      "grad_norm": 0.5390625,
      "learning_rate": 0.00019531119166360904,
      "loss": 0.6212,
      "step": 85
    },
    {
      "epoch": 0.40358744394618834,
      "grad_norm": 0.6328125,
      "learning_rate": 0.0001949669473568087,
      "loss": 0.6295,
      "step": 90
    },
    {
      "epoch": 0.4260089686098655,
      "grad_norm": 0.65234375,
      "learning_rate": 0.00019457620279020684,
      "loss": 0.6273,
      "step": 95
    },
    {
      "epoch": 0.4484304932735426,
      "grad_norm": 0.55078125,
      "learning_rate": 0.00019413920957391957,
      "loss": 0.6306,
      "step": 100
    },
    {
      "epoch": 0.47085201793721976,
      "grad_norm": 0.53515625,
      "learning_rate": 0.00019365624909871518,
      "loss": 0.6158,
      "step": 105
    },
    {
      "epoch": 0.49327354260089684,
      "grad_norm": 0.53515625,
      "learning_rate": 0.0001931276323548201,
      "loss": 0.6148,
      "step": 110
    },
    {
      "epoch": 0.515695067264574,
      "grad_norm": 0.51171875,
      "learning_rate": 0.00019255369973166414,
      "loss": 0.6119,
      "step": 115
    },
    {
      "epoch": 0.5381165919282511,
      "grad_norm": 0.51171875,
      "learning_rate": 0.00019193482079869564,
      "loss": 0.6052,
      "step": 120
    },
    {
      "epoch": 0.5605381165919282,
      "grad_norm": 0.53125,
      "learning_rate": 0.00019127139406740633,
      "loss": 0.5977,
      "step": 125
    },
    {
      "epoch": 0.5829596412556054,
      "grad_norm": 0.49609375,
      "learning_rate": 0.00019056384673471995,
      "loss": 0.6138,
      "step": 130
    },
    {
      "epoch": 0.6053811659192825,
      "grad_norm": 0.51953125,
      "learning_rate": 0.0001898126344079096,
      "loss": 0.5896,
      "step": 135
    },
    {
      "epoch": 0.6278026905829597,
      "grad_norm": 0.498046875,
      "learning_rate": 0.0001890182408112209,
      "loss": 0.612,
      "step": 140
    },
    {
      "epoch": 0.6502242152466368,
      "grad_norm": 0.470703125,
      "learning_rate": 0.00018818117747438989,
      "loss": 0.6001,
      "step": 145
    },
    {
      "epoch": 0.672645739910314,
      "grad_norm": 0.5390625,
      "learning_rate": 0.0001873019834032565,
      "loss": 0.6011,
      "step": 150
    },
    {
      "epoch": 0.695067264573991,
      "grad_norm": 0.53125,
      "learning_rate": 0.0001863812247326851,
      "loss": 0.5787,
      "step": 155
    },
    {
      "epoch": 0.7174887892376681,
      "grad_norm": 0.478515625,
      "learning_rate": 0.00018541949436201642,
      "loss": 0.5969,
      "step": 160
    },
    {
      "epoch": 0.7399103139013453,
      "grad_norm": 0.44921875,
      "learning_rate": 0.00018441741157328495,
      "loss": 0.5868,
      "step": 165
    },
    {
      "epoch": 0.7623318385650224,
      "grad_norm": 0.478515625,
      "learning_rate": 0.00018337562163244764,
      "loss": 0.5984,
      "step": 170
    },
    {
      "epoch": 0.7847533632286996,
      "grad_norm": 0.46875,
      "learning_rate": 0.00018229479537388148,
      "loss": 0.5867,
      "step": 175
    },
    {
      "epoch": 0.8071748878923767,
      "grad_norm": 0.50390625,
      "learning_rate": 0.0001811756287684164,
      "loss": 0.5811,
      "step": 180
    },
    {
      "epoch": 0.8295964125560538,
      "grad_norm": 0.50390625,
      "learning_rate": 0.00018001884247518247,
      "loss": 0.586,
      "step": 185
    },
    {
      "epoch": 0.852017937219731,
      "grad_norm": 0.474609375,
      "learning_rate": 0.0001788251813775596,
      "loss": 0.5717,
      "step": 190
    },
    {
      "epoch": 0.874439461883408,
      "grad_norm": 0.47265625,
      "learning_rate": 0.00017759541410352866,
      "loss": 0.5685,
      "step": 195
    },
    {
      "epoch": 0.8968609865470852,
      "grad_norm": 0.53125,
      "learning_rate": 0.00017633033253073284,
      "loss": 0.5995,
      "step": 200
    },
    {
      "epoch": 0.9192825112107623,
      "grad_norm": 0.5078125,
      "learning_rate": 0.000175030751276568,
      "loss": 0.5783,
      "step": 205
    },
    {
      "epoch": 0.9417040358744395,
      "grad_norm": 0.55078125,
      "learning_rate": 0.00017369750717363046,
      "loss": 0.565,
      "step": 210
    },
    {
      "epoch": 0.9641255605381166,
      "grad_norm": 0.45703125,
      "learning_rate": 0.00017233145873085946,
      "loss": 0.5718,
      "step": 215
    },
    {
      "epoch": 0.9865470852017937,
      "grad_norm": 0.486328125,
      "learning_rate": 0.00017093348558072226,
      "loss": 0.5827,
      "step": 220
    },
    {
      "epoch": 0.9955156950672646,
      "eval_loss": 0.5796323418617249,
      "eval_runtime": 1.9339,
      "eval_samples_per_second": 21.717,
      "eval_steps_per_second": 21.717,
      "step": 222
    },
    {
      "epoch": 1.0089686098654709,
      "grad_norm": 0.6640625,
      "learning_rate": 0.0001695044879127968,
      "loss": 0.5111,
      "step": 225
    },
    {
      "epoch": 1.031390134529148,
      "grad_norm": 0.58203125,
      "learning_rate": 0.00016804538589411738,
      "loss": 0.4244,
      "step": 230
    },
    {
      "epoch": 1.053811659192825,
      "grad_norm": 0.640625,
      "learning_rate": 0.00016655711907665626,
      "loss": 0.413,
      "step": 235
    },
    {
      "epoch": 1.0762331838565022,
      "grad_norm": 0.51171875,
      "learning_rate": 0.00016504064579232286,
      "loss": 0.419,
      "step": 240
    },
    {
      "epoch": 1.0986547085201794,
      "grad_norm": 0.5234375,
      "learning_rate": 0.00016349694253587008,
      "loss": 0.4098,
      "step": 245
    },
    {
      "epoch": 1.1210762331838564,
      "grad_norm": 0.515625,
      "learning_rate": 0.00016192700333610512,
      "loss": 0.4091,
      "step": 250
    },
    {
      "epoch": 1.1434977578475336,
      "grad_norm": 0.466796875,
      "learning_rate": 0.00016033183911580963,
      "loss": 0.4216,
      "step": 255
    },
    {
      "epoch": 1.1659192825112108,
      "grad_norm": 0.50390625,
      "learning_rate": 0.00015871247704078153,
      "loss": 0.4162,
      "step": 260
    },
    {
      "epoch": 1.188340807174888,
      "grad_norm": 0.453125,
      "learning_rate": 0.00015706995985841752,
      "loss": 0.4133,
      "step": 265
    },
    {
      "epoch": 1.210762331838565,
      "grad_norm": 0.466796875,
      "learning_rate": 0.00015540534522626195,
      "loss": 0.4118,
      "step": 270
    },
    {
      "epoch": 1.2331838565022422,
      "grad_norm": 0.46484375,
      "learning_rate": 0.00015371970503095522,
      "loss": 0.4179,
      "step": 275
    },
    {
      "epoch": 1.2556053811659194,
      "grad_norm": 0.431640625,
      "learning_rate": 0.0001520141246980191,
      "loss": 0.4271,
      "step": 280
    },
    {
      "epoch": 1.2780269058295963,
      "grad_norm": 0.46875,
      "learning_rate": 0.00015028970249292457,
      "loss": 0.419,
      "step": 285
    },
    {
      "epoch": 1.3004484304932735,
      "grad_norm": 0.427734375,
      "learning_rate": 0.00014854754881389124,
      "loss": 0.4206,
      "step": 290
    },
    {
      "epoch": 1.3228699551569507,
      "grad_norm": 0.42578125,
      "learning_rate": 0.0001467887854768745,
      "loss": 0.4206,
      "step": 295
    },
    {
      "epoch": 1.3452914798206277,
      "grad_norm": 0.45703125,
      "learning_rate": 0.00014501454499320048,
      "loss": 0.4139,
      "step": 300
    },
    {
      "epoch": 1.3677130044843049,
      "grad_norm": 0.462890625,
      "learning_rate": 0.00014322596984031366,
      "loss": 0.4201,
      "step": 305
    },
    {
      "epoch": 1.390134529147982,
      "grad_norm": 0.4609375,
      "learning_rate": 0.00014142421172610775,
      "loss": 0.4184,
      "step": 310
    },
    {
      "epoch": 1.4125560538116593,
      "grad_norm": 0.4375,
      "learning_rate": 0.00013961043084731196,
      "loss": 0.419,
      "step": 315
    },
    {
      "epoch": 1.4349775784753362,
      "grad_norm": 0.427734375,
      "learning_rate": 0.0001377857951424118,
      "loss": 0.4171,
      "step": 320
    },
    {
      "epoch": 1.4573991031390134,
      "grad_norm": 0.419921875,
      "learning_rate": 0.0001359514795395845,
      "loss": 0.4271,
      "step": 325
    },
    {
      "epoch": 1.4798206278026906,
      "grad_norm": 0.427734375,
      "learning_rate": 0.0001341086652001336,
      "loss": 0.4175,
      "step": 330
    },
    {
      "epoch": 1.5022421524663678,
      "grad_norm": 0.419921875,
      "learning_rate": 0.00013225853875790998,
      "loss": 0.4131,
      "step": 335
    },
    {
      "epoch": 1.5246636771300448,
      "grad_norm": 0.4140625,
      "learning_rate": 0.0001304022915552087,
      "loss": 0.4148,
      "step": 340
    },
    {
      "epoch": 1.547085201793722,
      "grad_norm": 0.4140625,
      "learning_rate": 0.00012854111887563427,
      "loss": 0.4027,
      "step": 345
    },
    {
      "epoch": 1.5695067264573992,
      "grad_norm": 0.423828125,
      "learning_rate": 0.00012667621917442808,
      "loss": 0.4102,
      "step": 350
    },
    {
      "epoch": 1.5919282511210762,
      "grad_norm": 0.435546875,
      "learning_rate": 0.00012480879330675288,
      "loss": 0.4117,
      "step": 355
    },
    {
      "epoch": 1.6143497757847534,
      "grad_norm": 0.412109375,
      "learning_rate": 0.00012294004375443292,
      "loss": 0.4023,
      "step": 360
    },
    {
      "epoch": 1.6367713004484306,
      "grad_norm": 0.39453125,
      "learning_rate": 0.00012107117385164573,
      "loss": 0.4082,
      "step": 365
    },
    {
      "epoch": 1.6591928251121075,
      "grad_norm": 0.412109375,
      "learning_rate": 0.00011920338701006534,
      "loss": 0.4134,
      "step": 370
    },
    {
      "epoch": 1.6816143497757847,
      "grad_norm": 0.41796875,
      "learning_rate": 0.00011733788594395609,
      "loss": 0.4118,
      "step": 375
    },
    {
      "epoch": 1.704035874439462,
      "grad_norm": 0.40625,
      "learning_rate": 0.00011547587189571459,
      "loss": 0.4061,
      "step": 380
    },
    {
      "epoch": 1.726457399103139,
      "grad_norm": 0.392578125,
      "learning_rate": 0.00011361854386236023,
      "loss": 0.4096,
      "step": 385
    },
    {
      "epoch": 1.7488789237668163,
      "grad_norm": 0.435546875,
      "learning_rate": 0.00011176709782347126,
      "loss": 0.407,
      "step": 390
    },
    {
      "epoch": 1.7713004484304933,
      "grad_norm": 0.39453125,
      "learning_rate": 0.00010992272597106382,
      "loss": 0.3893,
      "step": 395
    },
    {
      "epoch": 1.7937219730941703,
      "grad_norm": 0.4140625,
      "learning_rate": 0.0001080866159419099,
      "loss": 0.4016,
      "step": 400
    },
    {
      "epoch": 1.8161434977578477,
      "grad_norm": 0.408203125,
      "learning_rate": 0.00010625995005278866,
      "loss": 0.4006,
      "step": 405
    },
    {
      "epoch": 1.8385650224215246,
      "grad_norm": 0.38671875,
      "learning_rate": 0.0001044439045391633,
      "loss": 0.4031,
      "step": 410
    },
    {
      "epoch": 1.8609865470852018,
      "grad_norm": 0.392578125,
      "learning_rate": 0.00010263964879777389,
      "loss": 0.3909,
      "step": 415
    },
    {
      "epoch": 1.883408071748879,
      "grad_norm": 0.412109375,
      "learning_rate": 0.0001008483446336338,
      "loss": 0.3904,
      "step": 420
    },
    {
      "epoch": 1.905829596412556,
      "grad_norm": 0.41015625,
      "learning_rate": 9.907114551191456e-05,
      "loss": 0.3931,
      "step": 425
    },
    {
      "epoch": 1.9282511210762332,
      "grad_norm": 0.419921875,
      "learning_rate": 9.730919581520121e-05,
      "loss": 0.4016,
      "step": 430
    },
    {
      "epoch": 1.9506726457399104,
      "grad_norm": 0.3984375,
      "learning_rate": 9.556363010659583e-05,
      "loss": 0.3874,
      "step": 435
    },
    {
      "epoch": 1.9730941704035874,
      "grad_norm": 0.404296875,
      "learning_rate": 9.383557239914428e-05,
      "loss": 0.3942,
      "step": 440
    },
    {
      "epoch": 1.9910313901345291,
      "eval_loss": 0.5428131818771362,
      "eval_runtime": 1.9363,
      "eval_samples_per_second": 21.691,
      "eval_steps_per_second": 21.691,
      "step": 444
    }
  ],
  "logging_steps": 5,
  "max_steps": 669,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 7.145669325918044e+17,
  "train_batch_size": 100,
  "trial_name": null,
  "trial_params": null
}