{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 771, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.038910505836575876, "grad_norm": 1.1950826608910594, "learning_rate": 5e-06, "loss": 0.7894, "step": 10 }, { "epoch": 0.07782101167315175, "grad_norm": 0.8641628238228234, "learning_rate": 5e-06, "loss": 0.7077, "step": 20 }, { "epoch": 0.11673151750972763, "grad_norm": 1.0850249051640393, "learning_rate": 5e-06, "loss": 0.6773, "step": 30 }, { "epoch": 0.1556420233463035, "grad_norm": 2.1113594635780544, "learning_rate": 5e-06, "loss": 0.6672, "step": 40 }, { "epoch": 0.19455252918287938, "grad_norm": 2.7266567559510855, "learning_rate": 5e-06, "loss": 0.655, "step": 50 }, { "epoch": 0.23346303501945526, "grad_norm": 0.8866071545434335, "learning_rate": 5e-06, "loss": 0.646, "step": 60 }, { "epoch": 0.2723735408560311, "grad_norm": 1.0279911167206606, "learning_rate": 5e-06, "loss": 0.6403, "step": 70 }, { "epoch": 0.311284046692607, "grad_norm": 1.2351298423408894, "learning_rate": 5e-06, "loss": 0.6418, "step": 80 }, { "epoch": 0.35019455252918286, "grad_norm": 1.4010637045179566, "learning_rate": 5e-06, "loss": 0.6327, "step": 90 }, { "epoch": 0.38910505836575876, "grad_norm": 0.8292028754750206, "learning_rate": 5e-06, "loss": 0.6193, "step": 100 }, { "epoch": 0.4280155642023346, "grad_norm": 1.0764429621104903, "learning_rate": 5e-06, "loss": 0.6207, "step": 110 }, { "epoch": 0.4669260700389105, "grad_norm": 0.9459675747674458, "learning_rate": 5e-06, "loss": 0.6163, "step": 120 }, { "epoch": 0.5058365758754864, "grad_norm": 0.5625210449909341, "learning_rate": 5e-06, "loss": 0.6274, "step": 130 }, { "epoch": 0.5447470817120622, "grad_norm": 0.5592680038544479, "learning_rate": 5e-06, "loss": 0.6072, "step": 140 }, { "epoch": 0.5836575875486382, "grad_norm": 0.9284204912138712, "learning_rate": 5e-06, "loss": 0.6133, "step": 150 }, { "epoch": 0.622568093385214, "grad_norm": 0.5401038269116542, "learning_rate": 5e-06, "loss": 0.6082, "step": 160 }, { "epoch": 0.6614785992217899, "grad_norm": 0.6501354489369745, "learning_rate": 5e-06, "loss": 0.623, "step": 170 }, { "epoch": 0.7003891050583657, "grad_norm": 0.4374034361305582, "learning_rate": 5e-06, "loss": 0.6028, "step": 180 }, { "epoch": 0.7392996108949417, "grad_norm": 0.48237196565070833, "learning_rate": 5e-06, "loss": 0.6084, "step": 190 }, { "epoch": 0.7782101167315175, "grad_norm": 0.6671847308065324, "learning_rate": 5e-06, "loss": 0.6084, "step": 200 }, { "epoch": 0.8171206225680934, "grad_norm": 0.492851346406084, "learning_rate": 5e-06, "loss": 0.6115, "step": 210 }, { "epoch": 0.8560311284046692, "grad_norm": 0.5561699128946762, "learning_rate": 5e-06, "loss": 0.6084, "step": 220 }, { "epoch": 0.8949416342412452, "grad_norm": 0.523790379318423, "learning_rate": 5e-06, "loss": 0.6074, "step": 230 }, { "epoch": 0.933852140077821, "grad_norm": 0.4858980422996836, "learning_rate": 5e-06, "loss": 0.5988, "step": 240 }, { "epoch": 0.9727626459143969, "grad_norm": 0.614521306119893, "learning_rate": 5e-06, "loss": 0.5983, "step": 250 }, { "epoch": 1.0, "eval_loss": 0.6058223843574524, "eval_runtime": 275.8969, "eval_samples_per_second": 25.096, "eval_steps_per_second": 0.395, "step": 257 }, { "epoch": 1.0116731517509727, "grad_norm": 0.7818180596091272, "learning_rate": 5e-06, "loss": 0.5823, "step": 260 }, { "epoch": 1.0505836575875487, "grad_norm": 0.694447527091343, "learning_rate": 5e-06, "loss": 0.5545, "step": 270 }, { "epoch": 1.0894941634241244, "grad_norm": 0.9129706616389401, "learning_rate": 5e-06, "loss": 0.5519, "step": 280 }, { "epoch": 1.1284046692607004, "grad_norm": 0.5744956834175966, "learning_rate": 5e-06, "loss": 0.5524, "step": 290 }, { "epoch": 1.1673151750972763, "grad_norm": 0.5365095452402927, "learning_rate": 5e-06, "loss": 0.5522, "step": 300 }, { "epoch": 1.206225680933852, "grad_norm": 0.5366207968612337, "learning_rate": 5e-06, "loss": 0.5501, "step": 310 }, { "epoch": 1.245136186770428, "grad_norm": 0.566878607561335, "learning_rate": 5e-06, "loss": 0.5593, "step": 320 }, { "epoch": 1.2840466926070038, "grad_norm": 0.5025021953975053, "learning_rate": 5e-06, "loss": 0.5529, "step": 330 }, { "epoch": 1.3229571984435797, "grad_norm": 0.6517974108354097, "learning_rate": 5e-06, "loss": 0.5516, "step": 340 }, { "epoch": 1.3618677042801557, "grad_norm": 0.5877223133514731, "learning_rate": 5e-06, "loss": 0.5478, "step": 350 }, { "epoch": 1.4007782101167314, "grad_norm": 0.523088164773279, "learning_rate": 5e-06, "loss": 0.5648, "step": 360 }, { "epoch": 1.4396887159533074, "grad_norm": 0.618255839408507, "learning_rate": 5e-06, "loss": 0.5513, "step": 370 }, { "epoch": 1.4785992217898833, "grad_norm": 0.6260088198489807, "learning_rate": 5e-06, "loss": 0.5544, "step": 380 }, { "epoch": 1.517509727626459, "grad_norm": 0.7806658992319804, "learning_rate": 5e-06, "loss": 0.5528, "step": 390 }, { "epoch": 1.556420233463035, "grad_norm": 0.6140146431690726, "learning_rate": 5e-06, "loss": 0.5481, "step": 400 }, { "epoch": 1.595330739299611, "grad_norm": 0.5582487614699068, "learning_rate": 5e-06, "loss": 0.5545, "step": 410 }, { "epoch": 1.6342412451361867, "grad_norm": 0.542657780109806, "learning_rate": 5e-06, "loss": 0.553, "step": 420 }, { "epoch": 1.6731517509727627, "grad_norm": 0.6708086079418305, "learning_rate": 5e-06, "loss": 0.5391, "step": 430 }, { "epoch": 1.7120622568093387, "grad_norm": 0.5278094833402398, "learning_rate": 5e-06, "loss": 0.5576, "step": 440 }, { "epoch": 1.7509727626459144, "grad_norm": 0.47924313254672846, "learning_rate": 5e-06, "loss": 0.5439, "step": 450 }, { "epoch": 1.7898832684824901, "grad_norm": 0.6689950674332287, "learning_rate": 5e-06, "loss": 0.5509, "step": 460 }, { "epoch": 1.8287937743190663, "grad_norm": 0.4975766449875395, "learning_rate": 5e-06, "loss": 0.5547, "step": 470 }, { "epoch": 1.867704280155642, "grad_norm": 0.5110905522427862, "learning_rate": 5e-06, "loss": 0.5466, "step": 480 }, { "epoch": 1.9066147859922178, "grad_norm": 0.5472555938572321, "learning_rate": 5e-06, "loss": 0.5505, "step": 490 }, { "epoch": 1.9455252918287937, "grad_norm": 0.47391342497543426, "learning_rate": 5e-06, "loss": 0.548, "step": 500 }, { "epoch": 1.9844357976653697, "grad_norm": 0.5241425070111457, "learning_rate": 5e-06, "loss": 0.5559, "step": 510 }, { "epoch": 2.0, "eval_loss": 0.5962130427360535, "eval_runtime": 274.4938, "eval_samples_per_second": 25.225, "eval_steps_per_second": 0.397, "step": 514 }, { "epoch": 2.0233463035019454, "grad_norm": 0.7607891193722286, "learning_rate": 5e-06, "loss": 0.5269, "step": 520 }, { "epoch": 2.062256809338521, "grad_norm": 0.80811781065185, "learning_rate": 5e-06, "loss": 0.4971, "step": 530 }, { "epoch": 2.1011673151750974, "grad_norm": 0.6433576240535729, "learning_rate": 5e-06, "loss": 0.4955, "step": 540 }, { "epoch": 2.140077821011673, "grad_norm": 0.5189612131550143, "learning_rate": 5e-06, "loss": 0.5002, "step": 550 }, { "epoch": 2.178988326848249, "grad_norm": 0.5760279746695216, "learning_rate": 5e-06, "loss": 0.5012, "step": 560 }, { "epoch": 2.217898832684825, "grad_norm": 0.6914197290750576, "learning_rate": 5e-06, "loss": 0.4955, "step": 570 }, { "epoch": 2.2568093385214008, "grad_norm": 0.7728451901596064, "learning_rate": 5e-06, "loss": 0.5055, "step": 580 }, { "epoch": 2.2957198443579765, "grad_norm": 0.5835397819143044, "learning_rate": 5e-06, "loss": 0.503, "step": 590 }, { "epoch": 2.3346303501945527, "grad_norm": 0.5879273137062859, "learning_rate": 5e-06, "loss": 0.5027, "step": 600 }, { "epoch": 2.3735408560311284, "grad_norm": 0.5684409532220068, "learning_rate": 5e-06, "loss": 0.497, "step": 610 }, { "epoch": 2.412451361867704, "grad_norm": 0.6525031507435581, "learning_rate": 5e-06, "loss": 0.4932, "step": 620 }, { "epoch": 2.4513618677042803, "grad_norm": 0.49071643579732227, "learning_rate": 5e-06, "loss": 0.5024, "step": 630 }, { "epoch": 2.490272373540856, "grad_norm": 0.5247335428962803, "learning_rate": 5e-06, "loss": 0.4985, "step": 640 }, { "epoch": 2.529182879377432, "grad_norm": 0.48059327924155726, "learning_rate": 5e-06, "loss": 0.5107, "step": 650 }, { "epoch": 2.5680933852140075, "grad_norm": 0.5430102339057058, "learning_rate": 5e-06, "loss": 0.4941, "step": 660 }, { "epoch": 2.6070038910505837, "grad_norm": 0.534453464123415, "learning_rate": 5e-06, "loss": 0.5018, "step": 670 }, { "epoch": 2.6459143968871595, "grad_norm": 0.571745060913961, "learning_rate": 5e-06, "loss": 0.5104, "step": 680 }, { "epoch": 2.6848249027237356, "grad_norm": 0.540202566567447, "learning_rate": 5e-06, "loss": 0.4999, "step": 690 }, { "epoch": 2.7237354085603114, "grad_norm": 0.4691152648293088, "learning_rate": 5e-06, "loss": 0.5009, "step": 700 }, { "epoch": 2.762645914396887, "grad_norm": 0.6372006699442468, "learning_rate": 5e-06, "loss": 0.5055, "step": 710 }, { "epoch": 2.801556420233463, "grad_norm": 0.484555768964224, "learning_rate": 5e-06, "loss": 0.5075, "step": 720 }, { "epoch": 2.840466926070039, "grad_norm": 0.5065631150373296, "learning_rate": 5e-06, "loss": 0.4927, "step": 730 }, { "epoch": 2.8793774319066148, "grad_norm": 0.5294227842308346, "learning_rate": 5e-06, "loss": 0.51, "step": 740 }, { "epoch": 2.9182879377431905, "grad_norm": 0.541508376210009, "learning_rate": 5e-06, "loss": 0.4998, "step": 750 }, { "epoch": 2.9571984435797667, "grad_norm": 0.5130299093056558, "learning_rate": 5e-06, "loss": 0.5119, "step": 760 }, { "epoch": 2.9961089494163424, "grad_norm": 0.4915867802493192, "learning_rate": 5e-06, "loss": 0.5053, "step": 770 }, { "epoch": 3.0, "eval_loss": 0.6005940437316895, "eval_runtime": 276.4988, "eval_samples_per_second": 25.042, "eval_steps_per_second": 0.394, "step": 771 }, { "epoch": 3.0, "step": 771, "total_flos": 1291244336578560.0, "train_loss": 0.5620605509955287, "train_runtime": 45702.4745, "train_samples_per_second": 8.636, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 771, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1291244336578560.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }