{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9910313901345291, "eval_steps": 500, "global_step": 444, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02242152466367713, "grad_norm": 3.140625, "learning_rate": 1.5712402217176182e-05, "loss": 0.9259, "step": 5 }, { "epoch": 0.04484304932735426, "grad_norm": 1.1015625, "learning_rate": 3.535290498864641e-05, "loss": 0.7319, "step": 10 }, { "epoch": 0.06726457399103139, "grad_norm": 1.03125, "learning_rate": 5.499340776011664e-05, "loss": 0.6744, "step": 15 }, { "epoch": 0.08968609865470852, "grad_norm": 1.0078125, "learning_rate": 7.463391053158687e-05, "loss": 0.6524, "step": 20 }, { "epoch": 0.11210762331838565, "grad_norm": 0.97265625, "learning_rate": 9.427441330305709e-05, "loss": 0.6502, "step": 25 }, { "epoch": 0.13452914798206278, "grad_norm": 1.1875, "learning_rate": 0.00011391491607452732, "loss": 0.6209, "step": 30 }, { "epoch": 0.15695067264573992, "grad_norm": 1.2265625, "learning_rate": 0.00013355541884599756, "loss": 0.6194, "step": 35 }, { "epoch": 0.17937219730941703, "grad_norm": 1.2109375, "learning_rate": 0.0001531959216174678, "loss": 0.625, "step": 40 }, { "epoch": 0.20179372197309417, "grad_norm": 1.171875, "learning_rate": 0.000172836424388938, "loss": 0.6517, "step": 45 }, { "epoch": 0.2242152466367713, "grad_norm": 1.046875, "learning_rate": 0.00019247692716040823, "loss": 0.6418, "step": 50 }, { "epoch": 0.24663677130044842, "grad_norm": 0.95703125, "learning_rate": 0.00019638985101337757, "loss": 0.6533, "step": 55 }, { "epoch": 0.26905829596412556, "grad_norm": 1.46875, "learning_rate": 0.00019632820638367662, "loss": 0.753, "step": 60 }, { "epoch": 0.2914798206278027, "grad_norm": 0.88671875, "learning_rate": 0.0001962191849457307, "loss": 0.6835, "step": 65 }, { "epoch": 0.31390134529147984, "grad_norm": 0.95703125, "learning_rate": 0.0001960628569011471, "loss": 0.6626, "step": 70 }, { "epoch": 0.336322869955157, "grad_norm": 0.6484375, "learning_rate": 0.00019585932291343037, "loss": 0.6538, "step": 75 }, { "epoch": 0.35874439461883406, "grad_norm": 0.80859375, "learning_rate": 0.0001956087140431623, "loss": 0.6461, "step": 80 }, { "epoch": 0.3811659192825112, "grad_norm": 0.5390625, "learning_rate": 0.00019531119166360904, "loss": 0.6212, "step": 85 }, { "epoch": 0.40358744394618834, "grad_norm": 0.6328125, "learning_rate": 0.0001949669473568087, "loss": 0.6295, "step": 90 }, { "epoch": 0.4260089686098655, "grad_norm": 0.65234375, "learning_rate": 0.00019457620279020684, "loss": 0.6273, "step": 95 }, { "epoch": 0.4484304932735426, "grad_norm": 0.55078125, "learning_rate": 0.00019413920957391957, "loss": 0.6306, "step": 100 }, { "epoch": 0.47085201793721976, "grad_norm": 0.53515625, "learning_rate": 0.00019365624909871518, "loss": 0.6158, "step": 105 }, { "epoch": 0.49327354260089684, "grad_norm": 0.53515625, "learning_rate": 0.0001931276323548201, "loss": 0.6148, "step": 110 }, { "epoch": 0.515695067264574, "grad_norm": 0.51171875, "learning_rate": 0.00019255369973166414, "loss": 0.6119, "step": 115 }, { "epoch": 0.5381165919282511, "grad_norm": 0.51171875, "learning_rate": 0.00019193482079869564, "loss": 0.6052, "step": 120 }, { "epoch": 0.5605381165919282, "grad_norm": 0.53125, "learning_rate": 0.00019127139406740633, "loss": 0.5977, "step": 125 }, { "epoch": 0.5829596412556054, "grad_norm": 0.49609375, "learning_rate": 0.00019056384673471995, "loss": 0.6138, "step": 130 }, { "epoch": 0.6053811659192825, "grad_norm": 0.51953125, "learning_rate": 0.0001898126344079096, "loss": 0.5896, "step": 135 }, { "epoch": 0.6278026905829597, "grad_norm": 0.498046875, "learning_rate": 0.0001890182408112209, "loss": 0.612, "step": 140 }, { "epoch": 0.6502242152466368, "grad_norm": 0.470703125, "learning_rate": 0.00018818117747438989, "loss": 0.6001, "step": 145 }, { "epoch": 0.672645739910314, "grad_norm": 0.5390625, "learning_rate": 0.0001873019834032565, "loss": 0.6011, "step": 150 }, { "epoch": 0.695067264573991, "grad_norm": 0.53125, "learning_rate": 0.0001863812247326851, "loss": 0.5787, "step": 155 }, { "epoch": 0.7174887892376681, "grad_norm": 0.478515625, "learning_rate": 0.00018541949436201642, "loss": 0.5969, "step": 160 }, { "epoch": 0.7399103139013453, "grad_norm": 0.44921875, "learning_rate": 0.00018441741157328495, "loss": 0.5868, "step": 165 }, { "epoch": 0.7623318385650224, "grad_norm": 0.478515625, "learning_rate": 0.00018337562163244764, "loss": 0.5984, "step": 170 }, { "epoch": 0.7847533632286996, "grad_norm": 0.46875, "learning_rate": 0.00018229479537388148, "loss": 0.5867, "step": 175 }, { "epoch": 0.8071748878923767, "grad_norm": 0.50390625, "learning_rate": 0.0001811756287684164, "loss": 0.5811, "step": 180 }, { "epoch": 0.8295964125560538, "grad_norm": 0.50390625, "learning_rate": 0.00018001884247518247, "loss": 0.586, "step": 185 }, { "epoch": 0.852017937219731, "grad_norm": 0.474609375, "learning_rate": 0.0001788251813775596, "loss": 0.5717, "step": 190 }, { "epoch": 0.874439461883408, "grad_norm": 0.47265625, "learning_rate": 0.00017759541410352866, "loss": 0.5685, "step": 195 }, { "epoch": 0.8968609865470852, "grad_norm": 0.53125, "learning_rate": 0.00017633033253073284, "loss": 0.5995, "step": 200 }, { "epoch": 0.9192825112107623, "grad_norm": 0.5078125, "learning_rate": 0.000175030751276568, "loss": 0.5783, "step": 205 }, { "epoch": 0.9417040358744395, "grad_norm": 0.55078125, "learning_rate": 0.00017369750717363046, "loss": 0.565, "step": 210 }, { "epoch": 0.9641255605381166, "grad_norm": 0.45703125, "learning_rate": 0.00017233145873085946, "loss": 0.5718, "step": 215 }, { "epoch": 0.9865470852017937, "grad_norm": 0.486328125, "learning_rate": 0.00017093348558072226, "loss": 0.5827, "step": 220 }, { "epoch": 0.9955156950672646, "eval_loss": 0.5796323418617249, "eval_runtime": 1.9339, "eval_samples_per_second": 21.717, "eval_steps_per_second": 21.717, "step": 222 }, { "epoch": 1.0089686098654709, "grad_norm": 0.6640625, "learning_rate": 0.0001695044879127968, "loss": 0.5111, "step": 225 }, { "epoch": 1.031390134529148, "grad_norm": 0.58203125, "learning_rate": 0.00016804538589411738, "loss": 0.4244, "step": 230 }, { "epoch": 1.053811659192825, "grad_norm": 0.640625, "learning_rate": 0.00016655711907665626, "loss": 0.413, "step": 235 }, { "epoch": 1.0762331838565022, "grad_norm": 0.51171875, "learning_rate": 0.00016504064579232286, "loss": 0.419, "step": 240 }, { "epoch": 1.0986547085201794, "grad_norm": 0.5234375, "learning_rate": 0.00016349694253587008, "loss": 0.4098, "step": 245 }, { "epoch": 1.1210762331838564, "grad_norm": 0.515625, "learning_rate": 0.00016192700333610512, "loss": 0.4091, "step": 250 }, { "epoch": 1.1434977578475336, "grad_norm": 0.466796875, "learning_rate": 0.00016033183911580963, "loss": 0.4216, "step": 255 }, { "epoch": 1.1659192825112108, "grad_norm": 0.50390625, "learning_rate": 0.00015871247704078153, "loss": 0.4162, "step": 260 }, { "epoch": 1.188340807174888, "grad_norm": 0.453125, "learning_rate": 0.00015706995985841752, "loss": 0.4133, "step": 265 }, { "epoch": 1.210762331838565, "grad_norm": 0.466796875, "learning_rate": 0.00015540534522626195, "loss": 0.4118, "step": 270 }, { "epoch": 1.2331838565022422, "grad_norm": 0.46484375, "learning_rate": 0.00015371970503095522, "loss": 0.4179, "step": 275 }, { "epoch": 1.2556053811659194, "grad_norm": 0.431640625, "learning_rate": 0.0001520141246980191, "loss": 0.4271, "step": 280 }, { "epoch": 1.2780269058295963, "grad_norm": 0.46875, "learning_rate": 0.00015028970249292457, "loss": 0.419, "step": 285 }, { "epoch": 1.3004484304932735, "grad_norm": 0.427734375, "learning_rate": 0.00014854754881389124, "loss": 0.4206, "step": 290 }, { "epoch": 1.3228699551569507, "grad_norm": 0.42578125, "learning_rate": 0.0001467887854768745, "loss": 0.4206, "step": 295 }, { "epoch": 1.3452914798206277, "grad_norm": 0.45703125, "learning_rate": 0.00014501454499320048, "loss": 0.4139, "step": 300 }, { "epoch": 1.3677130044843049, "grad_norm": 0.462890625, "learning_rate": 0.00014322596984031366, "loss": 0.4201, "step": 305 }, { "epoch": 1.390134529147982, "grad_norm": 0.4609375, "learning_rate": 0.00014142421172610775, "loss": 0.4184, "step": 310 }, { "epoch": 1.4125560538116593, "grad_norm": 0.4375, "learning_rate": 0.00013961043084731196, "loss": 0.419, "step": 315 }, { "epoch": 1.4349775784753362, "grad_norm": 0.427734375, "learning_rate": 0.0001377857951424118, "loss": 0.4171, "step": 320 }, { "epoch": 1.4573991031390134, "grad_norm": 0.419921875, "learning_rate": 0.0001359514795395845, "loss": 0.4271, "step": 325 }, { "epoch": 1.4798206278026906, "grad_norm": 0.427734375, "learning_rate": 0.0001341086652001336, "loss": 0.4175, "step": 330 }, { "epoch": 1.5022421524663678, "grad_norm": 0.419921875, "learning_rate": 0.00013225853875790998, "loss": 0.4131, "step": 335 }, { "epoch": 1.5246636771300448, "grad_norm": 0.4140625, "learning_rate": 0.0001304022915552087, "loss": 0.4148, "step": 340 }, { "epoch": 1.547085201793722, "grad_norm": 0.4140625, "learning_rate": 0.00012854111887563427, "loss": 0.4027, "step": 345 }, { "epoch": 1.5695067264573992, "grad_norm": 0.423828125, "learning_rate": 0.00012667621917442808, "loss": 0.4102, "step": 350 }, { "epoch": 1.5919282511210762, "grad_norm": 0.435546875, "learning_rate": 0.00012480879330675288, "loss": 0.4117, "step": 355 }, { "epoch": 1.6143497757847534, "grad_norm": 0.412109375, "learning_rate": 0.00012294004375443292, "loss": 0.4023, "step": 360 }, { "epoch": 1.6367713004484306, "grad_norm": 0.39453125, "learning_rate": 0.00012107117385164573, "loss": 0.4082, "step": 365 }, { "epoch": 1.6591928251121075, "grad_norm": 0.412109375, "learning_rate": 0.00011920338701006534, "loss": 0.4134, "step": 370 }, { "epoch": 1.6816143497757847, "grad_norm": 0.41796875, "learning_rate": 0.00011733788594395609, "loss": 0.4118, "step": 375 }, { "epoch": 1.704035874439462, "grad_norm": 0.40625, "learning_rate": 0.00011547587189571459, "loss": 0.4061, "step": 380 }, { "epoch": 1.726457399103139, "grad_norm": 0.392578125, "learning_rate": 0.00011361854386236023, "loss": 0.4096, "step": 385 }, { "epoch": 1.7488789237668163, "grad_norm": 0.435546875, "learning_rate": 0.00011176709782347126, "loss": 0.407, "step": 390 }, { "epoch": 1.7713004484304933, "grad_norm": 0.39453125, "learning_rate": 0.00010992272597106382, "loss": 0.3893, "step": 395 }, { "epoch": 1.7937219730941703, "grad_norm": 0.4140625, "learning_rate": 0.0001080866159419099, "loss": 0.4016, "step": 400 }, { "epoch": 1.8161434977578477, "grad_norm": 0.408203125, "learning_rate": 0.00010625995005278866, "loss": 0.4006, "step": 405 }, { "epoch": 1.8385650224215246, "grad_norm": 0.38671875, "learning_rate": 0.0001044439045391633, "loss": 0.4031, "step": 410 }, { "epoch": 1.8609865470852018, "grad_norm": 0.392578125, "learning_rate": 0.00010263964879777389, "loss": 0.3909, "step": 415 }, { "epoch": 1.883408071748879, "grad_norm": 0.412109375, "learning_rate": 0.0001008483446336338, "loss": 0.3904, "step": 420 }, { "epoch": 1.905829596412556, "grad_norm": 0.41015625, "learning_rate": 9.907114551191456e-05, "loss": 0.3931, "step": 425 }, { "epoch": 1.9282511210762332, "grad_norm": 0.419921875, "learning_rate": 9.730919581520121e-05, "loss": 0.4016, "step": 430 }, { "epoch": 1.9506726457399104, "grad_norm": 0.3984375, "learning_rate": 9.556363010659583e-05, "loss": 0.3874, "step": 435 }, { "epoch": 1.9730941704035874, "grad_norm": 0.404296875, "learning_rate": 9.383557239914428e-05, "loss": 0.3942, "step": 440 }, { "epoch": 1.9910313901345291, "eval_loss": 0.5428131818771362, "eval_runtime": 1.9363, "eval_samples_per_second": 21.691, "eval_steps_per_second": 21.691, "step": 444 } ], "logging_steps": 5, "max_steps": 669, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.145669325918044e+17, "train_batch_size": 100, "trial_name": null, "trial_params": null }