{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 606, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.024783147459727387, "grad_norm": 2.3770346641540527, "learning_rate": 8e-05, "loss": 2.289, "step": 5 }, { "epoch": 0.04956629491945477, "grad_norm": 1.5179001092910767, "learning_rate": 0.00018, "loss": 1.6219, "step": 10 }, { "epoch": 0.07434944237918216, "grad_norm": 0.6122242212295532, "learning_rate": 0.0001986577181208054, "loss": 0.9076, "step": 15 }, { "epoch": 0.09913258983890955, "grad_norm": 0.7408744096755981, "learning_rate": 0.00019697986577181208, "loss": 1.0936, "step": 20 }, { "epoch": 0.12391573729863693, "grad_norm": 0.773099958896637, "learning_rate": 0.0001953020134228188, "loss": 1.0541, "step": 25 }, { "epoch": 0.14869888475836432, "grad_norm": 0.5606175661087036, "learning_rate": 0.00019362416107382552, "loss": 0.857, "step": 30 }, { "epoch": 0.1734820322180917, "grad_norm": 0.5474881529808044, "learning_rate": 0.0001919463087248322, "loss": 0.8884, "step": 35 }, { "epoch": 0.1982651796778191, "grad_norm": 0.6062418818473816, "learning_rate": 0.00019026845637583895, "loss": 0.7933, "step": 40 }, { "epoch": 0.22304832713754646, "grad_norm": 0.5491968393325806, "learning_rate": 0.00018859060402684564, "loss": 1.0993, "step": 45 }, { "epoch": 0.24783147459727387, "grad_norm": 0.6508728861808777, "learning_rate": 0.00018691275167785236, "loss": 0.7969, "step": 50 }, { "epoch": 0.27261462205700127, "grad_norm": 0.5214135050773621, "learning_rate": 0.00018523489932885907, "loss": 0.7337, "step": 55 }, { "epoch": 0.29739776951672864, "grad_norm": 0.6263068318367004, "learning_rate": 0.0001835570469798658, "loss": 0.7676, "step": 60 }, { "epoch": 0.322180916976456, "grad_norm": 0.905588686466217, "learning_rate": 0.00018187919463087248, "loss": 0.6215, "step": 65 }, { "epoch": 0.3469640644361834, "grad_norm": 0.7323358654975891, "learning_rate": 0.0001802013422818792, "loss": 0.8739, "step": 70 }, { "epoch": 0.37174721189591076, "grad_norm": 0.7098009586334229, "learning_rate": 0.0001785234899328859, "loss": 0.8578, "step": 75 }, { "epoch": 0.3965303593556382, "grad_norm": 0.6922330260276794, "learning_rate": 0.00017684563758389263, "loss": 0.9465, "step": 80 }, { "epoch": 0.42131350681536556, "grad_norm": 0.5942096710205078, "learning_rate": 0.00017516778523489935, "loss": 0.8533, "step": 85 }, { "epoch": 0.44609665427509293, "grad_norm": 0.5012693405151367, "learning_rate": 0.00017348993288590604, "loss": 0.9949, "step": 90 }, { "epoch": 0.4708798017348203, "grad_norm": 0.6599251627922058, "learning_rate": 0.00017181208053691275, "loss": 0.6861, "step": 95 }, { "epoch": 0.49566294919454773, "grad_norm": 0.6302607655525208, "learning_rate": 0.00017013422818791947, "loss": 0.7804, "step": 100 }, { "epoch": 0.5204460966542751, "grad_norm": 0.4946972727775574, "learning_rate": 0.00016845637583892619, "loss": 0.8112, "step": 105 }, { "epoch": 0.5452292441140025, "grad_norm": 0.5818770527839661, "learning_rate": 0.0001667785234899329, "loss": 0.7273, "step": 110 }, { "epoch": 0.5700123915737298, "grad_norm": 0.8738688826560974, "learning_rate": 0.00016510067114093962, "loss": 0.8921, "step": 115 }, { "epoch": 0.5947955390334573, "grad_norm": 0.6265502572059631, "learning_rate": 0.0001634228187919463, "loss": 0.7654, "step": 120 }, { "epoch": 0.6195786864931846, "grad_norm": 0.6213403344154358, "learning_rate": 0.00016174496644295302, "loss": 0.8194, "step": 125 }, { "epoch": 0.644361833952912, "grad_norm": 0.6503012776374817, "learning_rate": 0.00016006711409395974, "loss": 0.8546, "step": 130 }, { "epoch": 0.6691449814126395, "grad_norm": 0.4928428828716278, "learning_rate": 0.00015838926174496643, "loss": 0.528, "step": 135 }, { "epoch": 0.6939281288723668, "grad_norm": 0.581334114074707, "learning_rate": 0.00015671140939597315, "loss": 0.5692, "step": 140 }, { "epoch": 0.7187112763320942, "grad_norm": 0.7810402512550354, "learning_rate": 0.0001550335570469799, "loss": 1.1702, "step": 145 }, { "epoch": 0.7434944237918215, "grad_norm": 0.6355721354484558, "learning_rate": 0.00015335570469798658, "loss": 0.6699, "step": 150 }, { "epoch": 0.7682775712515489, "grad_norm": 0.521967887878418, "learning_rate": 0.0001516778523489933, "loss": 0.6565, "step": 155 }, { "epoch": 0.7930607187112764, "grad_norm": 0.4311043322086334, "learning_rate": 0.00015000000000000001, "loss": 0.7537, "step": 160 }, { "epoch": 0.8178438661710037, "grad_norm": 0.5549885630607605, "learning_rate": 0.0001483221476510067, "loss": 0.6691, "step": 165 }, { "epoch": 0.8426270136307311, "grad_norm": 0.4813348650932312, "learning_rate": 0.00014664429530201342, "loss": 0.5279, "step": 170 }, { "epoch": 0.8674101610904585, "grad_norm": 0.6353710293769836, "learning_rate": 0.00014496644295302014, "loss": 0.6643, "step": 175 }, { "epoch": 0.8921933085501859, "grad_norm": 0.49870240688323975, "learning_rate": 0.00014328859060402685, "loss": 0.7134, "step": 180 }, { "epoch": 0.9169764560099133, "grad_norm": 0.536192774772644, "learning_rate": 0.00014161073825503357, "loss": 0.6358, "step": 185 }, { "epoch": 0.9417596034696406, "grad_norm": 0.6779680848121643, "learning_rate": 0.0001399328859060403, "loss": 0.716, "step": 190 }, { "epoch": 0.966542750929368, "grad_norm": 0.6344008445739746, "learning_rate": 0.00013825503355704698, "loss": 0.6644, "step": 195 }, { "epoch": 0.9913258983890955, "grad_norm": 0.5447779297828674, "learning_rate": 0.0001365771812080537, "loss": 0.9557, "step": 200 }, { "epoch": 1.0148698884758365, "grad_norm": 0.45053160190582275, "learning_rate": 0.0001348993288590604, "loss": 0.6475, "step": 205 }, { "epoch": 1.0396530359355638, "grad_norm": 0.3626702129840851, "learning_rate": 0.0001332214765100671, "loss": 0.3968, "step": 210 }, { "epoch": 1.0644361833952911, "grad_norm": 0.6345260739326477, "learning_rate": 0.00013154362416107384, "loss": 0.6513, "step": 215 }, { "epoch": 1.0892193308550187, "grad_norm": 0.6892905235290527, "learning_rate": 0.00012986577181208056, "loss": 0.7689, "step": 220 }, { "epoch": 1.114002478314746, "grad_norm": 0.5781119465827942, "learning_rate": 0.00012818791946308725, "loss": 0.7809, "step": 225 }, { "epoch": 1.1387856257744733, "grad_norm": 0.5030850768089294, "learning_rate": 0.00012651006711409397, "loss": 0.6872, "step": 230 }, { "epoch": 1.1635687732342008, "grad_norm": 0.7697097063064575, "learning_rate": 0.00012483221476510068, "loss": 0.58, "step": 235 }, { "epoch": 1.1883519206939281, "grad_norm": 0.6653311252593994, "learning_rate": 0.00012315436241610737, "loss": 0.7252, "step": 240 }, { "epoch": 1.2131350681536555, "grad_norm": 0.5599316358566284, "learning_rate": 0.00012147651006711409, "loss": 0.5401, "step": 245 }, { "epoch": 1.2379182156133828, "grad_norm": 0.621579110622406, "learning_rate": 0.00011979865771812082, "loss": 0.6159, "step": 250 }, { "epoch": 1.2627013630731103, "grad_norm": 0.6578307151794434, "learning_rate": 0.00011812080536912754, "loss": 0.4616, "step": 255 }, { "epoch": 1.2874845105328376, "grad_norm": 0.6472283601760864, "learning_rate": 0.00011644295302013424, "loss": 0.4225, "step": 260 }, { "epoch": 1.3122676579925652, "grad_norm": 0.5076863765716553, "learning_rate": 0.00011476510067114094, "loss": 0.5308, "step": 265 }, { "epoch": 1.3370508054522925, "grad_norm": 0.5231316089630127, "learning_rate": 0.00011308724832214766, "loss": 0.5059, "step": 270 }, { "epoch": 1.3618339529120198, "grad_norm": 0.7917241454124451, "learning_rate": 0.00011140939597315436, "loss": 0.4856, "step": 275 }, { "epoch": 1.3866171003717471, "grad_norm": 0.616142988204956, "learning_rate": 0.00010973154362416106, "loss": 0.4773, "step": 280 }, { "epoch": 1.4114002478314747, "grad_norm": 0.6872631311416626, "learning_rate": 0.0001080536912751678, "loss": 0.5274, "step": 285 }, { "epoch": 1.436183395291202, "grad_norm": 0.7652924656867981, "learning_rate": 0.00010637583892617451, "loss": 0.495, "step": 290 }, { "epoch": 1.4609665427509293, "grad_norm": 0.3928225636482239, "learning_rate": 0.00010469798657718121, "loss": 0.3075, "step": 295 }, { "epoch": 1.4857496902106568, "grad_norm": 0.5579555630683899, "learning_rate": 0.00010302013422818793, "loss": 0.5002, "step": 300 }, { "epoch": 1.5105328376703842, "grad_norm": 0.530565083026886, "learning_rate": 0.00010134228187919463, "loss": 0.712, "step": 305 }, { "epoch": 1.5353159851301115, "grad_norm": 0.9171392321586609, "learning_rate": 9.966442953020134e-05, "loss": 0.5517, "step": 310 }, { "epoch": 1.5600991325898388, "grad_norm": 0.7776666879653931, "learning_rate": 9.798657718120807e-05, "loss": 0.4571, "step": 315 }, { "epoch": 1.5848822800495663, "grad_norm": 0.672423243522644, "learning_rate": 9.630872483221477e-05, "loss": 0.5058, "step": 320 }, { "epoch": 1.6096654275092936, "grad_norm": 0.5513662695884705, "learning_rate": 9.463087248322147e-05, "loss": 0.63, "step": 325 }, { "epoch": 1.6344485749690212, "grad_norm": 0.4629589319229126, "learning_rate": 9.295302013422819e-05, "loss": 0.4052, "step": 330 }, { "epoch": 1.6592317224287485, "grad_norm": 0.6519562005996704, "learning_rate": 9.127516778523491e-05, "loss": 0.5305, "step": 335 }, { "epoch": 1.6840148698884758, "grad_norm": 0.5246181488037109, "learning_rate": 8.959731543624161e-05, "loss": 0.4193, "step": 340 }, { "epoch": 1.7087980173482031, "grad_norm": 0.7614356279373169, "learning_rate": 8.791946308724833e-05, "loss": 0.5908, "step": 345 }, { "epoch": 1.7335811648079305, "grad_norm": 0.6947569847106934, "learning_rate": 8.624161073825504e-05, "loss": 0.4335, "step": 350 }, { "epoch": 1.758364312267658, "grad_norm": 0.4542942941188812, "learning_rate": 8.456375838926175e-05, "loss": 0.6186, "step": 355 }, { "epoch": 1.7831474597273855, "grad_norm": 0.563624382019043, "learning_rate": 8.288590604026846e-05, "loss": 0.4605, "step": 360 }, { "epoch": 1.8079306071871128, "grad_norm": 0.5522857904434204, "learning_rate": 8.120805369127518e-05, "loss": 0.5918, "step": 365 }, { "epoch": 1.8327137546468402, "grad_norm": 0.3432987928390503, "learning_rate": 7.953020134228188e-05, "loss": 0.4177, "step": 370 }, { "epoch": 1.8574969021065675, "grad_norm": 0.5656675100326538, "learning_rate": 7.78523489932886e-05, "loss": 0.6676, "step": 375 }, { "epoch": 1.8822800495662948, "grad_norm": 0.5128700733184814, "learning_rate": 7.61744966442953e-05, "loss": 0.5425, "step": 380 }, { "epoch": 1.9070631970260223, "grad_norm": 0.6921593546867371, "learning_rate": 7.449664429530202e-05, "loss": 0.4439, "step": 385 }, { "epoch": 1.9318463444857497, "grad_norm": 0.6611471176147461, "learning_rate": 7.281879194630872e-05, "loss": 0.5939, "step": 390 }, { "epoch": 1.9566294919454772, "grad_norm": 0.8047774434089661, "learning_rate": 7.114093959731544e-05, "loss": 0.7568, "step": 395 }, { "epoch": 1.9814126394052045, "grad_norm": 0.6147902607917786, "learning_rate": 6.946308724832216e-05, "loss": 0.5265, "step": 400 }, { "epoch": 2.0049566294919456, "grad_norm": 0.4429222643375397, "learning_rate": 6.778523489932886e-05, "loss": 0.5753, "step": 405 }, { "epoch": 2.029739776951673, "grad_norm": 0.49188828468322754, "learning_rate": 6.610738255033558e-05, "loss": 0.4439, "step": 410 }, { "epoch": 2.0545229244114003, "grad_norm": 0.7511292695999146, "learning_rate": 6.442953020134228e-05, "loss": 0.3947, "step": 415 }, { "epoch": 2.0793060718711276, "grad_norm": 0.5809934735298157, "learning_rate": 6.2751677852349e-05, "loss": 0.3935, "step": 420 }, { "epoch": 2.104089219330855, "grad_norm": 1.1626038551330566, "learning_rate": 6.107382550335571e-05, "loss": 0.388, "step": 425 }, { "epoch": 2.1288723667905822, "grad_norm": 0.6084951758384705, "learning_rate": 5.9395973154362415e-05, "loss": 0.4371, "step": 430 }, { "epoch": 2.15365551425031, "grad_norm": 0.5379754304885864, "learning_rate": 5.771812080536914e-05, "loss": 0.3975, "step": 435 }, { "epoch": 2.1784386617100373, "grad_norm": 0.4898189902305603, "learning_rate": 5.604026845637584e-05, "loss": 0.5212, "step": 440 }, { "epoch": 2.2032218091697646, "grad_norm": 0.8490257859230042, "learning_rate": 5.436241610738255e-05, "loss": 0.3889, "step": 445 }, { "epoch": 2.228004956629492, "grad_norm": 0.8492045998573303, "learning_rate": 5.268456375838926e-05, "loss": 0.354, "step": 450 }, { "epoch": 2.2527881040892193, "grad_norm": 0.3837261497974396, "learning_rate": 5.100671140939598e-05, "loss": 0.2432, "step": 455 }, { "epoch": 2.2775712515489466, "grad_norm": 0.617027759552002, "learning_rate": 4.932885906040269e-05, "loss": 0.298, "step": 460 }, { "epoch": 2.3023543990086743, "grad_norm": 0.6203614473342896, "learning_rate": 4.76510067114094e-05, "loss": 0.4173, "step": 465 }, { "epoch": 2.3271375464684017, "grad_norm": 0.6991643309593201, "learning_rate": 4.597315436241611e-05, "loss": 0.2615, "step": 470 }, { "epoch": 2.351920693928129, "grad_norm": 0.8322747349739075, "learning_rate": 4.4295302013422824e-05, "loss": 0.354, "step": 475 }, { "epoch": 2.3767038413878563, "grad_norm": 0.8549102544784546, "learning_rate": 4.2617449664429534e-05, "loss": 0.3329, "step": 480 }, { "epoch": 2.4014869888475836, "grad_norm": 0.7914723753929138, "learning_rate": 4.0939597315436244e-05, "loss": 0.4457, "step": 485 }, { "epoch": 2.426270136307311, "grad_norm": 0.24238666892051697, "learning_rate": 3.9261744966442954e-05, "loss": 0.1957, "step": 490 }, { "epoch": 2.4510532837670382, "grad_norm": 0.7603949904441833, "learning_rate": 3.758389261744967e-05, "loss": 0.363, "step": 495 }, { "epoch": 2.4758364312267656, "grad_norm": 0.5621479153633118, "learning_rate": 3.5906040268456373e-05, "loss": 0.1987, "step": 500 }, { "epoch": 2.5006195786864933, "grad_norm": 0.6900079846382141, "learning_rate": 3.422818791946309e-05, "loss": 0.3846, "step": 505 }, { "epoch": 2.5254027261462206, "grad_norm": 0.7483950853347778, "learning_rate": 3.25503355704698e-05, "loss": 0.4089, "step": 510 }, { "epoch": 2.550185873605948, "grad_norm": 0.7797295451164246, "learning_rate": 3.087248322147651e-05, "loss": 0.3883, "step": 515 }, { "epoch": 2.5749690210656753, "grad_norm": 1.0466924905776978, "learning_rate": 2.9194630872483227e-05, "loss": 0.5279, "step": 520 }, { "epoch": 2.5997521685254026, "grad_norm": 0.9217528700828552, "learning_rate": 2.7516778523489933e-05, "loss": 0.4031, "step": 525 }, { "epoch": 2.6245353159851303, "grad_norm": 0.7957612872123718, "learning_rate": 2.5838926174496646e-05, "loss": 0.3163, "step": 530 }, { "epoch": 2.6493184634448577, "grad_norm": 0.7865810990333557, "learning_rate": 2.416107382550336e-05, "loss": 0.5586, "step": 535 }, { "epoch": 2.674101610904585, "grad_norm": 0.5190469026565552, "learning_rate": 2.248322147651007e-05, "loss": 0.2444, "step": 540 }, { "epoch": 2.6988847583643123, "grad_norm": 0.790671706199646, "learning_rate": 2.080536912751678e-05, "loss": 0.6483, "step": 545 }, { "epoch": 2.7236679058240396, "grad_norm": 0.9736090302467346, "learning_rate": 1.9127516778523493e-05, "loss": 0.3612, "step": 550 }, { "epoch": 2.748451053283767, "grad_norm": 0.7225416898727417, "learning_rate": 1.7449664429530202e-05, "loss": 0.3578, "step": 555 }, { "epoch": 2.7732342007434942, "grad_norm": 0.5790948867797852, "learning_rate": 1.5771812080536912e-05, "loss": 0.2573, "step": 560 }, { "epoch": 2.7980173482032216, "grad_norm": 0.6138997673988342, "learning_rate": 1.4093959731543624e-05, "loss": 0.5458, "step": 565 }, { "epoch": 2.8228004956629493, "grad_norm": 0.5243539810180664, "learning_rate": 1.2416107382550337e-05, "loss": 0.3168, "step": 570 }, { "epoch": 2.8475836431226766, "grad_norm": 1.29122793674469, "learning_rate": 1.0738255033557047e-05, "loss": 0.5166, "step": 575 }, { "epoch": 2.872366790582404, "grad_norm": 0.6179217100143433, "learning_rate": 9.060402684563759e-06, "loss": 0.2481, "step": 580 }, { "epoch": 2.8971499380421313, "grad_norm": 0.7408438324928284, "learning_rate": 7.382550335570471e-06, "loss": 0.2677, "step": 585 }, { "epoch": 2.9219330855018586, "grad_norm": 0.4825621247291565, "learning_rate": 5.704697986577182e-06, "loss": 0.3451, "step": 590 }, { "epoch": 2.9467162329615864, "grad_norm": 0.9304247498512268, "learning_rate": 4.026845637583892e-06, "loss": 0.3597, "step": 595 }, { "epoch": 2.9714993804213137, "grad_norm": 0.7385866045951843, "learning_rate": 2.3489932885906044e-06, "loss": 0.5111, "step": 600 }, { "epoch": 2.996282527881041, "grad_norm": 0.7714264988899231, "learning_rate": 6.711409395973154e-07, "loss": 0.3069, "step": 605 }, { "epoch": 3.0, "step": 606, "total_flos": 2.9833309199010816e+16, "train_loss": 0.5936956948327152, "train_runtime": 983.9366, "train_samples_per_second": 4.921, "train_steps_per_second": 0.616 } ], "logging_steps": 5, "max_steps": 606, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.9833309199010816e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }