{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.997827661115134, "eval_steps": 500, "global_step": 1035, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02896451846488052, "grad_norm": 1.9546706041531023, "learning_rate": 5e-06, "loss": 1.0679, "step": 10 }, { "epoch": 0.05792903692976104, "grad_norm": 5.4683536223610085, "learning_rate": 5e-06, "loss": 0.9613, "step": 20 }, { "epoch": 0.08689355539464157, "grad_norm": 1.6939976541654087, "learning_rate": 5e-06, "loss": 0.9223, "step": 30 }, { "epoch": 0.11585807385952208, "grad_norm": 1.0339182103747349, "learning_rate": 5e-06, "loss": 0.8975, "step": 40 }, { "epoch": 0.14482259232440262, "grad_norm": 0.9491704034528055, "learning_rate": 5e-06, "loss": 0.88, "step": 50 }, { "epoch": 0.17378711078928313, "grad_norm": 1.2085381768876486, "learning_rate": 5e-06, "loss": 0.8621, "step": 60 }, { "epoch": 0.20275162925416365, "grad_norm": 1.4117640839833099, "learning_rate": 5e-06, "loss": 0.8519, "step": 70 }, { "epoch": 0.23171614771904417, "grad_norm": 1.1514550106019426, "learning_rate": 5e-06, "loss": 0.8438, "step": 80 }, { "epoch": 0.2606806661839247, "grad_norm": 0.9243756138786047, "learning_rate": 5e-06, "loss": 0.8335, "step": 90 }, { "epoch": 0.28964518464880523, "grad_norm": 0.846545720738904, "learning_rate": 5e-06, "loss": 0.8269, "step": 100 }, { "epoch": 0.31860970311368575, "grad_norm": 0.5825470716035409, "learning_rate": 5e-06, "loss": 0.8194, "step": 110 }, { "epoch": 0.34757422157856627, "grad_norm": 0.7173313925098385, "learning_rate": 5e-06, "loss": 0.8195, "step": 120 }, { "epoch": 0.3765387400434468, "grad_norm": 0.7287411570445328, "learning_rate": 5e-06, "loss": 0.8156, "step": 130 }, { "epoch": 0.4055032585083273, "grad_norm": 0.7081477098458706, "learning_rate": 5e-06, "loss": 0.8116, "step": 140 }, { "epoch": 0.4344677769732078, "grad_norm": 0.8598043962323773, "learning_rate": 5e-06, "loss": 0.8135, "step": 150 }, { "epoch": 0.46343229543808834, "grad_norm": 0.5450111798416236, "learning_rate": 5e-06, "loss": 0.8074, "step": 160 }, { "epoch": 0.49239681390296886, "grad_norm": 0.939010922813204, "learning_rate": 5e-06, "loss": 0.8062, "step": 170 }, { "epoch": 0.5213613323678494, "grad_norm": 0.847543033032073, "learning_rate": 5e-06, "loss": 0.8029, "step": 180 }, { "epoch": 0.5503258508327299, "grad_norm": 0.8787853615954785, "learning_rate": 5e-06, "loss": 0.8006, "step": 190 }, { "epoch": 0.5792903692976105, "grad_norm": 1.0063082389466802, "learning_rate": 5e-06, "loss": 0.8009, "step": 200 }, { "epoch": 0.6082548877624909, "grad_norm": 0.6359141777605319, "learning_rate": 5e-06, "loss": 0.7989, "step": 210 }, { "epoch": 0.6372194062273715, "grad_norm": 0.6914513445032724, "learning_rate": 5e-06, "loss": 0.7942, "step": 220 }, { "epoch": 0.666183924692252, "grad_norm": 0.6878895702782952, "learning_rate": 5e-06, "loss": 0.7916, "step": 230 }, { "epoch": 0.6951484431571325, "grad_norm": 0.5660450961011149, "learning_rate": 5e-06, "loss": 0.7874, "step": 240 }, { "epoch": 0.724112961622013, "grad_norm": 0.5744944170750963, "learning_rate": 5e-06, "loss": 0.7853, "step": 250 }, { "epoch": 0.7530774800868936, "grad_norm": 0.8500170302127678, "learning_rate": 5e-06, "loss": 0.7868, "step": 260 }, { "epoch": 0.782041998551774, "grad_norm": 0.9550149697236748, "learning_rate": 5e-06, "loss": 0.7901, "step": 270 }, { "epoch": 0.8110065170166546, "grad_norm": 0.5546890467469614, "learning_rate": 5e-06, "loss": 0.7864, "step": 280 }, { "epoch": 0.8399710354815351, "grad_norm": 0.6331265595090024, "learning_rate": 5e-06, "loss": 0.788, "step": 290 }, { "epoch": 0.8689355539464156, "grad_norm": 0.6196206182624663, "learning_rate": 5e-06, "loss": 0.782, "step": 300 }, { "epoch": 0.8979000724112962, "grad_norm": 0.7124233685042775, "learning_rate": 5e-06, "loss": 0.7814, "step": 310 }, { "epoch": 0.9268645908761767, "grad_norm": 0.6281001430065685, "learning_rate": 5e-06, "loss": 0.7821, "step": 320 }, { "epoch": 0.9558291093410572, "grad_norm": 0.6288445991938771, "learning_rate": 5e-06, "loss": 0.7835, "step": 330 }, { "epoch": 0.9847936278059377, "grad_norm": 0.5963707636678538, "learning_rate": 5e-06, "loss": 0.785, "step": 340 }, { "epoch": 0.999275887038378, "eval_loss": 0.7745929956436157, "eval_runtime": 243.4419, "eval_samples_per_second": 38.21, "eval_steps_per_second": 0.6, "step": 345 }, { "epoch": 1.0137581462708183, "grad_norm": 0.887536909694853, "learning_rate": 5e-06, "loss": 0.8189, "step": 350 }, { "epoch": 1.0427226647356989, "grad_norm": 0.9746717296866173, "learning_rate": 5e-06, "loss": 0.7355, "step": 360 }, { "epoch": 1.0716871832005792, "grad_norm": 0.6469650977983997, "learning_rate": 5e-06, "loss": 0.734, "step": 370 }, { "epoch": 1.1006517016654598, "grad_norm": 0.7891671650419445, "learning_rate": 5e-06, "loss": 0.7328, "step": 380 }, { "epoch": 1.1296162201303404, "grad_norm": 0.6208155398032233, "learning_rate": 5e-06, "loss": 0.732, "step": 390 }, { "epoch": 1.158580738595221, "grad_norm": 0.6207300417186483, "learning_rate": 5e-06, "loss": 0.7297, "step": 400 }, { "epoch": 1.1875452570601013, "grad_norm": 0.6785729802747962, "learning_rate": 5e-06, "loss": 0.7341, "step": 410 }, { "epoch": 1.2165097755249819, "grad_norm": 0.8266342046421713, "learning_rate": 5e-06, "loss": 0.7323, "step": 420 }, { "epoch": 1.2454742939898624, "grad_norm": 0.9660234843621154, "learning_rate": 5e-06, "loss": 0.7306, "step": 430 }, { "epoch": 1.274438812454743, "grad_norm": 0.6935390944005595, "learning_rate": 5e-06, "loss": 0.7319, "step": 440 }, { "epoch": 1.3034033309196236, "grad_norm": 0.5941658732015132, "learning_rate": 5e-06, "loss": 0.7363, "step": 450 }, { "epoch": 1.332367849384504, "grad_norm": 0.590155807756852, "learning_rate": 5e-06, "loss": 0.7324, "step": 460 }, { "epoch": 1.3613323678493845, "grad_norm": 0.6352796068632335, "learning_rate": 5e-06, "loss": 0.7305, "step": 470 }, { "epoch": 1.390296886314265, "grad_norm": 0.6933255949901775, "learning_rate": 5e-06, "loss": 0.7335, "step": 480 }, { "epoch": 1.4192614047791454, "grad_norm": 0.5969978905166637, "learning_rate": 5e-06, "loss": 0.7331, "step": 490 }, { "epoch": 1.448225923244026, "grad_norm": 0.5822244996224603, "learning_rate": 5e-06, "loss": 0.7316, "step": 500 }, { "epoch": 1.4771904417089066, "grad_norm": 0.6473397313720373, "learning_rate": 5e-06, "loss": 0.7325, "step": 510 }, { "epoch": 1.5061549601737871, "grad_norm": 0.7379810742604224, "learning_rate": 5e-06, "loss": 0.7322, "step": 520 }, { "epoch": 1.5351194786386677, "grad_norm": 0.877777055252363, "learning_rate": 5e-06, "loss": 0.7267, "step": 530 }, { "epoch": 1.5640839971035483, "grad_norm": 0.702500798809481, "learning_rate": 5e-06, "loss": 0.7278, "step": 540 }, { "epoch": 1.5930485155684286, "grad_norm": 0.7082980837667808, "learning_rate": 5e-06, "loss": 0.7282, "step": 550 }, { "epoch": 1.6220130340333092, "grad_norm": 0.6193169422318507, "learning_rate": 5e-06, "loss": 0.7322, "step": 560 }, { "epoch": 1.6509775524981896, "grad_norm": 0.6349696489382688, "learning_rate": 5e-06, "loss": 0.7272, "step": 570 }, { "epoch": 1.6799420709630701, "grad_norm": 0.5808865809115293, "learning_rate": 5e-06, "loss": 0.7256, "step": 580 }, { "epoch": 1.7089065894279507, "grad_norm": 0.5965976052156886, "learning_rate": 5e-06, "loss": 0.7222, "step": 590 }, { "epoch": 1.7378711078928313, "grad_norm": 0.5710627722704887, "learning_rate": 5e-06, "loss": 0.7297, "step": 600 }, { "epoch": 1.7668356263577119, "grad_norm": 0.608816224452555, "learning_rate": 5e-06, "loss": 0.7262, "step": 610 }, { "epoch": 1.7958001448225924, "grad_norm": 0.6371768739478637, "learning_rate": 5e-06, "loss": 0.7303, "step": 620 }, { "epoch": 1.824764663287473, "grad_norm": 0.5358050463696646, "learning_rate": 5e-06, "loss": 0.7287, "step": 630 }, { "epoch": 1.8537291817523533, "grad_norm": 0.7036505719286567, "learning_rate": 5e-06, "loss": 0.7247, "step": 640 }, { "epoch": 1.882693700217234, "grad_norm": 0.6805629942635829, "learning_rate": 5e-06, "loss": 0.7294, "step": 650 }, { "epoch": 1.9116582186821143, "grad_norm": 0.6498078585112878, "learning_rate": 5e-06, "loss": 0.7289, "step": 660 }, { "epoch": 1.9406227371469948, "grad_norm": 0.5784132159085213, "learning_rate": 5e-06, "loss": 0.7296, "step": 670 }, { "epoch": 1.9695872556118754, "grad_norm": 0.5796673295424948, "learning_rate": 5e-06, "loss": 0.7277, "step": 680 }, { "epoch": 1.998551774076756, "grad_norm": 0.5936807614505102, "learning_rate": 5e-06, "loss": 0.72, "step": 690 }, { "epoch": 1.998551774076756, "eval_loss": 0.7591201663017273, "eval_runtime": 239.3101, "eval_samples_per_second": 38.87, "eval_steps_per_second": 0.61, "step": 690 }, { "epoch": 2.0275162925416366, "grad_norm": 0.715729072917497, "learning_rate": 5e-06, "loss": 0.7321, "step": 700 }, { "epoch": 2.056480811006517, "grad_norm": 0.6144874116006872, "learning_rate": 5e-06, "loss": 0.6723, "step": 710 }, { "epoch": 2.0854453294713977, "grad_norm": 0.7303697849008477, "learning_rate": 5e-06, "loss": 0.6804, "step": 720 }, { "epoch": 2.114409847936278, "grad_norm": 0.68311827138117, "learning_rate": 5e-06, "loss": 0.6797, "step": 730 }, { "epoch": 2.1433743664011584, "grad_norm": 0.5545659750688543, "learning_rate": 5e-06, "loss": 0.6733, "step": 740 }, { "epoch": 2.172338884866039, "grad_norm": 0.5612265877265745, "learning_rate": 5e-06, "loss": 0.6807, "step": 750 }, { "epoch": 2.2013034033309196, "grad_norm": 0.6652869497470263, "learning_rate": 5e-06, "loss": 0.6836, "step": 760 }, { "epoch": 2.2302679217958, "grad_norm": 0.5387482606960555, "learning_rate": 5e-06, "loss": 0.6782, "step": 770 }, { "epoch": 2.2592324402606807, "grad_norm": 0.9270487907700882, "learning_rate": 5e-06, "loss": 0.6792, "step": 780 }, { "epoch": 2.2881969587255613, "grad_norm": 0.5940033865543676, "learning_rate": 5e-06, "loss": 0.6839, "step": 790 }, { "epoch": 2.317161477190442, "grad_norm": 0.6205717881980902, "learning_rate": 5e-06, "loss": 0.6793, "step": 800 }, { "epoch": 2.3461259956553224, "grad_norm": 0.7863817807933237, "learning_rate": 5e-06, "loss": 0.6771, "step": 810 }, { "epoch": 2.3750905141202026, "grad_norm": 0.6565632670788291, "learning_rate": 5e-06, "loss": 0.6834, "step": 820 }, { "epoch": 2.404055032585083, "grad_norm": 0.6373382549439205, "learning_rate": 5e-06, "loss": 0.6828, "step": 830 }, { "epoch": 2.4330195510499637, "grad_norm": 0.6613391820459834, "learning_rate": 5e-06, "loss": 0.6835, "step": 840 }, { "epoch": 2.4619840695148443, "grad_norm": 0.5400957321029153, "learning_rate": 5e-06, "loss": 0.6852, "step": 850 }, { "epoch": 2.490948587979725, "grad_norm": 0.6001602197216074, "learning_rate": 5e-06, "loss": 0.6813, "step": 860 }, { "epoch": 2.5199131064446054, "grad_norm": 0.6758144590327261, "learning_rate": 5e-06, "loss": 0.6846, "step": 870 }, { "epoch": 2.548877624909486, "grad_norm": 0.7279056638638517, "learning_rate": 5e-06, "loss": 0.6806, "step": 880 }, { "epoch": 2.577842143374366, "grad_norm": 0.6530599852985071, "learning_rate": 5e-06, "loss": 0.6818, "step": 890 }, { "epoch": 2.606806661839247, "grad_norm": 0.6780623323564912, "learning_rate": 5e-06, "loss": 0.6825, "step": 900 }, { "epoch": 2.6357711803041273, "grad_norm": 0.5837946750071207, "learning_rate": 5e-06, "loss": 0.6825, "step": 910 }, { "epoch": 2.664735698769008, "grad_norm": 0.5602013963713363, "learning_rate": 5e-06, "loss": 0.6834, "step": 920 }, { "epoch": 2.6937002172338884, "grad_norm": 0.7041458005241192, "learning_rate": 5e-06, "loss": 0.6808, "step": 930 }, { "epoch": 2.722664735698769, "grad_norm": 0.6490957944655403, "learning_rate": 5e-06, "loss": 0.6844, "step": 940 }, { "epoch": 2.7516292541636496, "grad_norm": 0.6599285105597698, "learning_rate": 5e-06, "loss": 0.6841, "step": 950 }, { "epoch": 2.78059377262853, "grad_norm": 0.8783093936780623, "learning_rate": 5e-06, "loss": 0.6805, "step": 960 }, { "epoch": 2.8095582910934107, "grad_norm": 0.6029880285970106, "learning_rate": 5e-06, "loss": 0.6836, "step": 970 }, { "epoch": 2.838522809558291, "grad_norm": 0.6433012731687247, "learning_rate": 5e-06, "loss": 0.6802, "step": 980 }, { "epoch": 2.867487328023172, "grad_norm": 0.6070387953228661, "learning_rate": 5e-06, "loss": 0.6779, "step": 990 }, { "epoch": 2.896451846488052, "grad_norm": 0.5805918805506912, "learning_rate": 5e-06, "loss": 0.6815, "step": 1000 }, { "epoch": 2.9254163649529326, "grad_norm": 0.6364227056673066, "learning_rate": 5e-06, "loss": 0.6828, "step": 1010 }, { "epoch": 2.954380883417813, "grad_norm": 0.7905206793090467, "learning_rate": 5e-06, "loss": 0.6884, "step": 1020 }, { "epoch": 2.9833454018826937, "grad_norm": 0.6878648180858722, "learning_rate": 5e-06, "loss": 0.685, "step": 1030 }, { "epoch": 2.997827661115134, "eval_loss": 0.7575626373291016, "eval_runtime": 234.038, "eval_samples_per_second": 39.746, "eval_steps_per_second": 0.624, "step": 1035 }, { "epoch": 2.997827661115134, "step": 1035, "total_flos": 1733454169374720.0, "train_loss": 0.7466064549874568, "train_runtime": 34327.0468, "train_samples_per_second": 15.444, "train_steps_per_second": 0.03 } ], "logging_steps": 10, "max_steps": 1035, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1733454169374720.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }