{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4537205081669691, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004537205081669692, "grad_norm": 1.4140625, "learning_rate": 1.7632653061224493e-05, "loss": 2.0708, "step": 5 }, { "epoch": 0.009074410163339383, "grad_norm": 1.1796875, "learning_rate": 3.96734693877551e-05, "loss": 1.9893, "step": 10 }, { "epoch": 0.013611615245009074, "grad_norm": 0.85546875, "learning_rate": 6.171428571428573e-05, "loss": 1.955, "step": 15 }, { "epoch": 0.018148820326678767, "grad_norm": 1.0, "learning_rate": 8.375510204081634e-05, "loss": 1.9002, "step": 20 }, { "epoch": 0.022686025408348458, "grad_norm": 0.78515625, "learning_rate": 0.00010579591836734696, "loss": 1.8666, "step": 25 }, { "epoch": 0.02722323049001815, "grad_norm": 0.71875, "learning_rate": 0.00012783673469387758, "loss": 1.9462, "step": 30 }, { "epoch": 0.03176043557168784, "grad_norm": 0.734375, "learning_rate": 0.00014987755102040818, "loss": 1.9157, "step": 35 }, { "epoch": 0.036297640653357534, "grad_norm": 0.703125, "learning_rate": 0.00015428528732777552, "loss": 1.9, "step": 40 }, { "epoch": 0.04083484573502722, "grad_norm": 0.7265625, "learning_rate": 0.00015428355282194904, "loss": 1.9086, "step": 45 }, { "epoch": 0.045372050816696916, "grad_norm": 0.69921875, "learning_rate": 0.00015428048412333237, "loss": 1.918, "step": 50 }, { "epoch": 0.0499092558983666, "grad_norm": 0.6484375, "learning_rate": 0.00015427608130269275, "loss": 1.8871, "step": 55 }, { "epoch": 0.0544464609800363, "grad_norm": 0.66015625, "learning_rate": 0.00015427034446156347, "loss": 1.879, "step": 60 }, { "epoch": 0.05898366606170599, "grad_norm": 0.6484375, "learning_rate": 0.00015426327373224179, "loss": 1.8881, "step": 65 }, { "epoch": 0.06352087114337568, "grad_norm": 0.69921875, "learning_rate": 0.00015425486927778568, "loss": 1.8637, "step": 70 }, { "epoch": 0.06805807622504537, "grad_norm": 0.66796875, "learning_rate": 0.0001542451312920101, "loss": 1.9391, "step": 75 }, { "epoch": 0.07259528130671507, "grad_norm": 0.65234375, "learning_rate": 0.0001542340599994826, "loss": 1.9262, "step": 80 }, { "epoch": 0.07713248638838476, "grad_norm": 0.6171875, "learning_rate": 0.0001542216556555181, "loss": 1.9177, "step": 85 }, { "epoch": 0.08166969147005444, "grad_norm": 0.66015625, "learning_rate": 0.00015420791854617294, "loss": 1.9026, "step": 90 }, { "epoch": 0.08620689655172414, "grad_norm": 0.69140625, "learning_rate": 0.0001541928489882384, "loss": 1.868, "step": 95 }, { "epoch": 0.09074410163339383, "grad_norm": 0.62890625, "learning_rate": 0.0001541764473292333, "loss": 1.8556, "step": 100 }, { "epoch": 0.09528130671506352, "grad_norm": 0.69921875, "learning_rate": 0.00015415871394739605, "loss": 1.8941, "step": 105 }, { "epoch": 0.0998185117967332, "grad_norm": 0.6640625, "learning_rate": 0.00015413964925167585, "loss": 1.8843, "step": 110 }, { "epoch": 0.10435571687840291, "grad_norm": 0.609375, "learning_rate": 0.00015411925368172337, "loss": 1.8913, "step": 115 }, { "epoch": 0.1088929219600726, "grad_norm": 0.60546875, "learning_rate": 0.0001540975277078805, "loss": 1.8362, "step": 120 }, { "epoch": 0.11343012704174228, "grad_norm": 0.62109375, "learning_rate": 0.0001540744718311696, "loss": 1.851, "step": 125 }, { "epoch": 0.11796733212341198, "grad_norm": 0.671875, "learning_rate": 0.00015405008658328184, "loss": 1.8704, "step": 130 }, { "epoch": 0.12250453720508167, "grad_norm": 0.640625, "learning_rate": 0.000154024372526565, "loss": 1.8304, "step": 135 }, { "epoch": 0.12704174228675136, "grad_norm": 0.609375, "learning_rate": 0.0001539973302540106, "loss": 1.8121, "step": 140 }, { "epoch": 0.13157894736842105, "grad_norm": 0.66015625, "learning_rate": 0.00015396896038924, "loss": 1.861, "step": 145 }, { "epoch": 0.13611615245009073, "grad_norm": 0.96484375, "learning_rate": 0.0001539392635864902, "loss": 1.858, "step": 150 }, { "epoch": 0.14065335753176045, "grad_norm": 0.6328125, "learning_rate": 0.00015390824053059868, "loss": 1.8305, "step": 155 }, { "epoch": 0.14519056261343014, "grad_norm": 0.61328125, "learning_rate": 0.00015387589193698766, "loss": 1.8998, "step": 160 }, { "epoch": 0.14972776769509982, "grad_norm": 0.921875, "learning_rate": 0.00015384221855164752, "loss": 1.8725, "step": 165 }, { "epoch": 0.1542649727767695, "grad_norm": 0.59375, "learning_rate": 0.00015380722115111966, "loss": 1.8505, "step": 170 }, { "epoch": 0.1588021778584392, "grad_norm": 0.62109375, "learning_rate": 0.00015377090054247852, "loss": 1.8817, "step": 175 }, { "epoch": 0.16333938294010888, "grad_norm": 0.59765625, "learning_rate": 0.00015373325756331313, "loss": 1.8494, "step": 180 }, { "epoch": 0.16787658802177857, "grad_norm": 0.61328125, "learning_rate": 0.00015369429308170762, "loss": 1.9201, "step": 185 }, { "epoch": 0.1724137931034483, "grad_norm": 0.609375, "learning_rate": 0.00015365400799622123, "loss": 1.8467, "step": 190 }, { "epoch": 0.17695099818511797, "grad_norm": 0.703125, "learning_rate": 0.00015361240323586768, "loss": 1.8867, "step": 195 }, { "epoch": 0.18148820326678766, "grad_norm": 0.60546875, "learning_rate": 0.00015356947976009368, "loss": 1.8469, "step": 200 }, { "epoch": 0.18602540834845735, "grad_norm": 0.703125, "learning_rate": 0.00015352523855875685, "loss": 1.8678, "step": 205 }, { "epoch": 0.19056261343012704, "grad_norm": 0.6171875, "learning_rate": 0.00015347968065210273, "loss": 1.8783, "step": 210 }, { "epoch": 0.19509981851179672, "grad_norm": 0.55859375, "learning_rate": 0.00015343280709074155, "loss": 1.8663, "step": 215 }, { "epoch": 0.1996370235934664, "grad_norm": 0.6328125, "learning_rate": 0.0001533846189556237, "loss": 1.8891, "step": 220 }, { "epoch": 0.20417422867513613, "grad_norm": 0.55859375, "learning_rate": 0.000153335117358015, "loss": 1.87, "step": 225 }, { "epoch": 0.20871143375680581, "grad_norm": 0.6015625, "learning_rate": 0.00015328430343947104, "loss": 1.7773, "step": 230 }, { "epoch": 0.2132486388384755, "grad_norm": 0.58203125, "learning_rate": 0.00015323217837181068, "loss": 1.8276, "step": 235 }, { "epoch": 0.2177858439201452, "grad_norm": 0.55078125, "learning_rate": 0.00015317874335708936, "loss": 1.8301, "step": 240 }, { "epoch": 0.22232304900181488, "grad_norm": 0.5859375, "learning_rate": 0.00015312399962757098, "loss": 1.8491, "step": 245 }, { "epoch": 0.22686025408348456, "grad_norm": 0.5703125, "learning_rate": 0.0001530679484456999, "loss": 1.8976, "step": 250 }, { "epoch": 0.23139745916515425, "grad_norm": 0.57421875, "learning_rate": 0.00015301059110407148, "loss": 1.7919, "step": 255 }, { "epoch": 0.23593466424682397, "grad_norm": 0.578125, "learning_rate": 0.00015295192892540244, "loss": 1.8364, "step": 260 }, { "epoch": 0.24047186932849365, "grad_norm": 0.6640625, "learning_rate": 0.00015289196326250036, "loss": 1.846, "step": 265 }, { "epoch": 0.24500907441016334, "grad_norm": 0.58203125, "learning_rate": 0.00015283069549823237, "loss": 1.8033, "step": 270 }, { "epoch": 0.24954627949183303, "grad_norm": 0.609375, "learning_rate": 0.00015276812704549347, "loss": 1.8463, "step": 275 }, { "epoch": 0.2540834845735027, "grad_norm": 0.5703125, "learning_rate": 0.00015270425934717368, "loss": 1.7821, "step": 280 }, { "epoch": 0.25862068965517243, "grad_norm": 0.59765625, "learning_rate": 0.00015263909387612496, "loss": 1.7916, "step": 285 }, { "epoch": 0.2631578947368421, "grad_norm": 0.671875, "learning_rate": 0.0001525726321351272, "loss": 1.8298, "step": 290 }, { "epoch": 0.2676950998185118, "grad_norm": 0.5078125, "learning_rate": 0.00015250487565685353, "loss": 1.8417, "step": 295 }, { "epoch": 0.27223230490018147, "grad_norm": 0.5703125, "learning_rate": 0.000152435826003835, "loss": 1.8475, "step": 300 }, { "epoch": 0.2767695099818512, "grad_norm": 1.1328125, "learning_rate": 0.00015236548476842452, "loss": 1.7997, "step": 305 }, { "epoch": 0.2813067150635209, "grad_norm": 0.6640625, "learning_rate": 0.00015229385357276023, "loss": 1.8284, "step": 310 }, { "epoch": 0.28584392014519056, "grad_norm": 0.56640625, "learning_rate": 0.00015222093406872794, "loss": 1.8628, "step": 315 }, { "epoch": 0.29038112522686027, "grad_norm": 0.5703125, "learning_rate": 0.00015214672793792313, "loss": 1.8286, "step": 320 }, { "epoch": 0.29491833030852993, "grad_norm": 0.53125, "learning_rate": 0.00015207123689161224, "loss": 1.8207, "step": 325 }, { "epoch": 0.29945553539019965, "grad_norm": 0.74609375, "learning_rate": 0.00015199446267069304, "loss": 1.8005, "step": 330 }, { "epoch": 0.3039927404718693, "grad_norm": 0.58984375, "learning_rate": 0.0001519164070456546, "loss": 1.8659, "step": 335 }, { "epoch": 0.308529945553539, "grad_norm": 0.58984375, "learning_rate": 0.00015183707181653643, "loss": 1.8642, "step": 340 }, { "epoch": 0.31306715063520874, "grad_norm": 0.6015625, "learning_rate": 0.00015175645881288695, "loss": 1.7679, "step": 345 }, { "epoch": 0.3176043557168784, "grad_norm": 0.60546875, "learning_rate": 0.00015167456989372132, "loss": 1.8267, "step": 350 }, { "epoch": 0.3221415607985481, "grad_norm": 0.55859375, "learning_rate": 0.00015159140694747864, "loss": 1.826, "step": 355 }, { "epoch": 0.32667876588021777, "grad_norm": 0.6328125, "learning_rate": 0.0001515069718919782, "loss": 1.8233, "step": 360 }, { "epoch": 0.3312159709618875, "grad_norm": 0.51171875, "learning_rate": 0.0001514212666743755, "loss": 1.8402, "step": 365 }, { "epoch": 0.33575317604355714, "grad_norm": 0.52734375, "learning_rate": 0.00015133429327111715, "loss": 1.8067, "step": 370 }, { "epoch": 0.34029038112522686, "grad_norm": 0.5234375, "learning_rate": 0.00015124605368789542, "loss": 1.7867, "step": 375 }, { "epoch": 0.3448275862068966, "grad_norm": 0.546875, "learning_rate": 0.00015115654995960187, "loss": 1.8151, "step": 380 }, { "epoch": 0.34936479128856623, "grad_norm": 0.58984375, "learning_rate": 0.00015106578415028057, "loss": 1.7844, "step": 385 }, { "epoch": 0.35390199637023595, "grad_norm": 0.5625, "learning_rate": 0.00015097375835308036, "loss": 1.818, "step": 390 }, { "epoch": 0.3584392014519056, "grad_norm": 0.60546875, "learning_rate": 0.00015088047469020668, "loss": 1.8075, "step": 395 }, { "epoch": 0.3629764065335753, "grad_norm": 0.56640625, "learning_rate": 0.00015078593531287252, "loss": 1.8531, "step": 400 }, { "epoch": 0.367513611615245, "grad_norm": 0.73828125, "learning_rate": 0.00015069014240124903, "loss": 1.7792, "step": 405 }, { "epoch": 0.3720508166969147, "grad_norm": 0.546875, "learning_rate": 0.0001505930981644149, "loss": 1.8016, "step": 410 }, { "epoch": 0.3765880217785844, "grad_norm": 0.55859375, "learning_rate": 0.00015049480484030574, "loss": 1.7925, "step": 415 }, { "epoch": 0.3811252268602541, "grad_norm": 0.609375, "learning_rate": 0.0001503952646956623, "loss": 1.7902, "step": 420 }, { "epoch": 0.3856624319419238, "grad_norm": 0.55078125, "learning_rate": 0.00015029448002597828, "loss": 1.785, "step": 425 }, { "epoch": 0.39019963702359345, "grad_norm": 0.54296875, "learning_rate": 0.00015019245315544728, "loss": 1.8234, "step": 430 }, { "epoch": 0.39473684210526316, "grad_norm": 0.56640625, "learning_rate": 0.00015008918643690932, "loss": 1.7869, "step": 435 }, { "epoch": 0.3992740471869328, "grad_norm": 0.5703125, "learning_rate": 0.00014998468225179657, "loss": 1.8319, "step": 440 }, { "epoch": 0.40381125226860254, "grad_norm": 0.54296875, "learning_rate": 0.00014987894301007825, "loss": 1.7935, "step": 445 }, { "epoch": 0.40834845735027225, "grad_norm": 0.55859375, "learning_rate": 0.0001497719711502054, "loss": 1.7998, "step": 450 }, { "epoch": 0.4128856624319419, "grad_norm": 0.578125, "learning_rate": 0.0001496637691390543, "loss": 1.8234, "step": 455 }, { "epoch": 0.41742286751361163, "grad_norm": 1.40625, "learning_rate": 0.00014955433947186982, "loss": 1.7724, "step": 460 }, { "epoch": 0.4219600725952813, "grad_norm": 0.56640625, "learning_rate": 0.0001494436846722077, "loss": 1.7939, "step": 465 }, { "epoch": 0.426497277676951, "grad_norm": 0.55859375, "learning_rate": 0.00014933180729187652, "loss": 1.7921, "step": 470 }, { "epoch": 0.43103448275862066, "grad_norm": 0.5390625, "learning_rate": 0.0001492187099108787, "loss": 1.8269, "step": 475 }, { "epoch": 0.4355716878402904, "grad_norm": 0.55078125, "learning_rate": 0.00014910439513735114, "loss": 1.8065, "step": 480 }, { "epoch": 0.4401088929219601, "grad_norm": 0.59765625, "learning_rate": 0.00014898886560750502, "loss": 1.793, "step": 485 }, { "epoch": 0.44464609800362975, "grad_norm": 0.5859375, "learning_rate": 0.00014887212398556488, "loss": 1.8052, "step": 490 }, { "epoch": 0.44918330308529947, "grad_norm": 0.5625, "learning_rate": 0.00014875417296370742, "loss": 1.7471, "step": 495 }, { "epoch": 0.4537205081669691, "grad_norm": 0.55078125, "learning_rate": 0.00014863501526199925, "loss": 1.7734, "step": 500 }, { "epoch": 0.4537205081669691, "eval_loss": 1.7299695014953613, "eval_runtime": 4.0995, "eval_samples_per_second": 15.612, "eval_steps_per_second": 15.612, "step": 500 } ], "logging_steps": 5, "max_steps": 3306, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.097833201532928e+18, "train_batch_size": 48, "trial_name": null, "trial_params": null }