{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 771,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.038910505836575876,
      "grad_norm": 1.1950826608910594,
      "learning_rate": 5e-06,
      "loss": 0.7894,
      "step": 10
    },
    {
      "epoch": 0.07782101167315175,
      "grad_norm": 0.8641628238228234,
      "learning_rate": 5e-06,
      "loss": 0.7077,
      "step": 20
    },
    {
      "epoch": 0.11673151750972763,
      "grad_norm": 1.0850249051640393,
      "learning_rate": 5e-06,
      "loss": 0.6773,
      "step": 30
    },
    {
      "epoch": 0.1556420233463035,
      "grad_norm": 2.1113594635780544,
      "learning_rate": 5e-06,
      "loss": 0.6672,
      "step": 40
    },
    {
      "epoch": 0.19455252918287938,
      "grad_norm": 2.7266567559510855,
      "learning_rate": 5e-06,
      "loss": 0.655,
      "step": 50
    },
    {
      "epoch": 0.23346303501945526,
      "grad_norm": 0.8866071545434335,
      "learning_rate": 5e-06,
      "loss": 0.646,
      "step": 60
    },
    {
      "epoch": 0.2723735408560311,
      "grad_norm": 1.0279911167206606,
      "learning_rate": 5e-06,
      "loss": 0.6403,
      "step": 70
    },
    {
      "epoch": 0.311284046692607,
      "grad_norm": 1.2351298423408894,
      "learning_rate": 5e-06,
      "loss": 0.6418,
      "step": 80
    },
    {
      "epoch": 0.35019455252918286,
      "grad_norm": 1.4010637045179566,
      "learning_rate": 5e-06,
      "loss": 0.6327,
      "step": 90
    },
    {
      "epoch": 0.38910505836575876,
      "grad_norm": 0.8292028754750206,
      "learning_rate": 5e-06,
      "loss": 0.6193,
      "step": 100
    },
    {
      "epoch": 0.4280155642023346,
      "grad_norm": 1.0764429621104903,
      "learning_rate": 5e-06,
      "loss": 0.6207,
      "step": 110
    },
    {
      "epoch": 0.4669260700389105,
      "grad_norm": 0.9459675747674458,
      "learning_rate": 5e-06,
      "loss": 0.6163,
      "step": 120
    },
    {
      "epoch": 0.5058365758754864,
      "grad_norm": 0.5625210449909341,
      "learning_rate": 5e-06,
      "loss": 0.6274,
      "step": 130
    },
    {
      "epoch": 0.5447470817120622,
      "grad_norm": 0.5592680038544479,
      "learning_rate": 5e-06,
      "loss": 0.6072,
      "step": 140
    },
    {
      "epoch": 0.5836575875486382,
      "grad_norm": 0.9284204912138712,
      "learning_rate": 5e-06,
      "loss": 0.6133,
      "step": 150
    },
    {
      "epoch": 0.622568093385214,
      "grad_norm": 0.5401038269116542,
      "learning_rate": 5e-06,
      "loss": 0.6082,
      "step": 160
    },
    {
      "epoch": 0.6614785992217899,
      "grad_norm": 0.6501354489369745,
      "learning_rate": 5e-06,
      "loss": 0.623,
      "step": 170
    },
    {
      "epoch": 0.7003891050583657,
      "grad_norm": 0.4374034361305582,
      "learning_rate": 5e-06,
      "loss": 0.6028,
      "step": 180
    },
    {
      "epoch": 0.7392996108949417,
      "grad_norm": 0.48237196565070833,
      "learning_rate": 5e-06,
      "loss": 0.6084,
      "step": 190
    },
    {
      "epoch": 0.7782101167315175,
      "grad_norm": 0.6671847308065324,
      "learning_rate": 5e-06,
      "loss": 0.6084,
      "step": 200
    },
    {
      "epoch": 0.8171206225680934,
      "grad_norm": 0.492851346406084,
      "learning_rate": 5e-06,
      "loss": 0.6115,
      "step": 210
    },
    {
      "epoch": 0.8560311284046692,
      "grad_norm": 0.5561699128946762,
      "learning_rate": 5e-06,
      "loss": 0.6084,
      "step": 220
    },
    {
      "epoch": 0.8949416342412452,
      "grad_norm": 0.523790379318423,
      "learning_rate": 5e-06,
      "loss": 0.6074,
      "step": 230
    },
    {
      "epoch": 0.933852140077821,
      "grad_norm": 0.4858980422996836,
      "learning_rate": 5e-06,
      "loss": 0.5988,
      "step": 240
    },
    {
      "epoch": 0.9727626459143969,
      "grad_norm": 0.614521306119893,
      "learning_rate": 5e-06,
      "loss": 0.5983,
      "step": 250
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.6058223843574524,
      "eval_runtime": 275.8969,
      "eval_samples_per_second": 25.096,
      "eval_steps_per_second": 0.395,
      "step": 257
    },
    {
      "epoch": 1.0116731517509727,
      "grad_norm": 0.7818180596091272,
      "learning_rate": 5e-06,
      "loss": 0.5823,
      "step": 260
    },
    {
      "epoch": 1.0505836575875487,
      "grad_norm": 0.694447527091343,
      "learning_rate": 5e-06,
      "loss": 0.5545,
      "step": 270
    },
    {
      "epoch": 1.0894941634241244,
      "grad_norm": 0.9129706616389401,
      "learning_rate": 5e-06,
      "loss": 0.5519,
      "step": 280
    },
    {
      "epoch": 1.1284046692607004,
      "grad_norm": 0.5744956834175966,
      "learning_rate": 5e-06,
      "loss": 0.5524,
      "step": 290
    },
    {
      "epoch": 1.1673151750972763,
      "grad_norm": 0.5365095452402927,
      "learning_rate": 5e-06,
      "loss": 0.5522,
      "step": 300
    },
    {
      "epoch": 1.206225680933852,
      "grad_norm": 0.5366207968612337,
      "learning_rate": 5e-06,
      "loss": 0.5501,
      "step": 310
    },
    {
      "epoch": 1.245136186770428,
      "grad_norm": 0.566878607561335,
      "learning_rate": 5e-06,
      "loss": 0.5593,
      "step": 320
    },
    {
      "epoch": 1.2840466926070038,
      "grad_norm": 0.5025021953975053,
      "learning_rate": 5e-06,
      "loss": 0.5529,
      "step": 330
    },
    {
      "epoch": 1.3229571984435797,
      "grad_norm": 0.6517974108354097,
      "learning_rate": 5e-06,
      "loss": 0.5516,
      "step": 340
    },
    {
      "epoch": 1.3618677042801557,
      "grad_norm": 0.5877223133514731,
      "learning_rate": 5e-06,
      "loss": 0.5478,
      "step": 350
    },
    {
      "epoch": 1.4007782101167314,
      "grad_norm": 0.523088164773279,
      "learning_rate": 5e-06,
      "loss": 0.5648,
      "step": 360
    },
    {
      "epoch": 1.4396887159533074,
      "grad_norm": 0.618255839408507,
      "learning_rate": 5e-06,
      "loss": 0.5513,
      "step": 370
    },
    {
      "epoch": 1.4785992217898833,
      "grad_norm": 0.6260088198489807,
      "learning_rate": 5e-06,
      "loss": 0.5544,
      "step": 380
    },
    {
      "epoch": 1.517509727626459,
      "grad_norm": 0.7806658992319804,
      "learning_rate": 5e-06,
      "loss": 0.5528,
      "step": 390
    },
    {
      "epoch": 1.556420233463035,
      "grad_norm": 0.6140146431690726,
      "learning_rate": 5e-06,
      "loss": 0.5481,
      "step": 400
    },
    {
      "epoch": 1.595330739299611,
      "grad_norm": 0.5582487614699068,
      "learning_rate": 5e-06,
      "loss": 0.5545,
      "step": 410
    },
    {
      "epoch": 1.6342412451361867,
      "grad_norm": 0.542657780109806,
      "learning_rate": 5e-06,
      "loss": 0.553,
      "step": 420
    },
    {
      "epoch": 1.6731517509727627,
      "grad_norm": 0.6708086079418305,
      "learning_rate": 5e-06,
      "loss": 0.5391,
      "step": 430
    },
    {
      "epoch": 1.7120622568093387,
      "grad_norm": 0.5278094833402398,
      "learning_rate": 5e-06,
      "loss": 0.5576,
      "step": 440
    },
    {
      "epoch": 1.7509727626459144,
      "grad_norm": 0.47924313254672846,
      "learning_rate": 5e-06,
      "loss": 0.5439,
      "step": 450
    },
    {
      "epoch": 1.7898832684824901,
      "grad_norm": 0.6689950674332287,
      "learning_rate": 5e-06,
      "loss": 0.5509,
      "step": 460
    },
    {
      "epoch": 1.8287937743190663,
      "grad_norm": 0.4975766449875395,
      "learning_rate": 5e-06,
      "loss": 0.5547,
      "step": 470
    },
    {
      "epoch": 1.867704280155642,
      "grad_norm": 0.5110905522427862,
      "learning_rate": 5e-06,
      "loss": 0.5466,
      "step": 480
    },
    {
      "epoch": 1.9066147859922178,
      "grad_norm": 0.5472555938572321,
      "learning_rate": 5e-06,
      "loss": 0.5505,
      "step": 490
    },
    {
      "epoch": 1.9455252918287937,
      "grad_norm": 0.47391342497543426,
      "learning_rate": 5e-06,
      "loss": 0.548,
      "step": 500
    },
    {
      "epoch": 1.9844357976653697,
      "grad_norm": 0.5241425070111457,
      "learning_rate": 5e-06,
      "loss": 0.5559,
      "step": 510
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.5962130427360535,
      "eval_runtime": 274.4938,
      "eval_samples_per_second": 25.225,
      "eval_steps_per_second": 0.397,
      "step": 514
    },
    {
      "epoch": 2.0233463035019454,
      "grad_norm": 0.7607891193722286,
      "learning_rate": 5e-06,
      "loss": 0.5269,
      "step": 520
    },
    {
      "epoch": 2.062256809338521,
      "grad_norm": 0.80811781065185,
      "learning_rate": 5e-06,
      "loss": 0.4971,
      "step": 530
    },
    {
      "epoch": 2.1011673151750974,
      "grad_norm": 0.6433576240535729,
      "learning_rate": 5e-06,
      "loss": 0.4955,
      "step": 540
    },
    {
      "epoch": 2.140077821011673,
      "grad_norm": 0.5189612131550143,
      "learning_rate": 5e-06,
      "loss": 0.5002,
      "step": 550
    },
    {
      "epoch": 2.178988326848249,
      "grad_norm": 0.5760279746695216,
      "learning_rate": 5e-06,
      "loss": 0.5012,
      "step": 560
    },
    {
      "epoch": 2.217898832684825,
      "grad_norm": 0.6914197290750576,
      "learning_rate": 5e-06,
      "loss": 0.4955,
      "step": 570
    },
    {
      "epoch": 2.2568093385214008,
      "grad_norm": 0.7728451901596064,
      "learning_rate": 5e-06,
      "loss": 0.5055,
      "step": 580
    },
    {
      "epoch": 2.2957198443579765,
      "grad_norm": 0.5835397819143044,
      "learning_rate": 5e-06,
      "loss": 0.503,
      "step": 590
    },
    {
      "epoch": 2.3346303501945527,
      "grad_norm": 0.5879273137062859,
      "learning_rate": 5e-06,
      "loss": 0.5027,
      "step": 600
    },
    {
      "epoch": 2.3735408560311284,
      "grad_norm": 0.5684409532220068,
      "learning_rate": 5e-06,
      "loss": 0.497,
      "step": 610
    },
    {
      "epoch": 2.412451361867704,
      "grad_norm": 0.6525031507435581,
      "learning_rate": 5e-06,
      "loss": 0.4932,
      "step": 620
    },
    {
      "epoch": 2.4513618677042803,
      "grad_norm": 0.49071643579732227,
      "learning_rate": 5e-06,
      "loss": 0.5024,
      "step": 630
    },
    {
      "epoch": 2.490272373540856,
      "grad_norm": 0.5247335428962803,
      "learning_rate": 5e-06,
      "loss": 0.4985,
      "step": 640
    },
    {
      "epoch": 2.529182879377432,
      "grad_norm": 0.48059327924155726,
      "learning_rate": 5e-06,
      "loss": 0.5107,
      "step": 650
    },
    {
      "epoch": 2.5680933852140075,
      "grad_norm": 0.5430102339057058,
      "learning_rate": 5e-06,
      "loss": 0.4941,
      "step": 660
    },
    {
      "epoch": 2.6070038910505837,
      "grad_norm": 0.534453464123415,
      "learning_rate": 5e-06,
      "loss": 0.5018,
      "step": 670
    },
    {
      "epoch": 2.6459143968871595,
      "grad_norm": 0.571745060913961,
      "learning_rate": 5e-06,
      "loss": 0.5104,
      "step": 680
    },
    {
      "epoch": 2.6848249027237356,
      "grad_norm": 0.540202566567447,
      "learning_rate": 5e-06,
      "loss": 0.4999,
      "step": 690
    },
    {
      "epoch": 2.7237354085603114,
      "grad_norm": 0.4691152648293088,
      "learning_rate": 5e-06,
      "loss": 0.5009,
      "step": 700
    },
    {
      "epoch": 2.762645914396887,
      "grad_norm": 0.6372006699442468,
      "learning_rate": 5e-06,
      "loss": 0.5055,
      "step": 710
    },
    {
      "epoch": 2.801556420233463,
      "grad_norm": 0.484555768964224,
      "learning_rate": 5e-06,
      "loss": 0.5075,
      "step": 720
    },
    {
      "epoch": 2.840466926070039,
      "grad_norm": 0.5065631150373296,
      "learning_rate": 5e-06,
      "loss": 0.4927,
      "step": 730
    },
    {
      "epoch": 2.8793774319066148,
      "grad_norm": 0.5294227842308346,
      "learning_rate": 5e-06,
      "loss": 0.51,
      "step": 740
    },
    {
      "epoch": 2.9182879377431905,
      "grad_norm": 0.541508376210009,
      "learning_rate": 5e-06,
      "loss": 0.4998,
      "step": 750
    },
    {
      "epoch": 2.9571984435797667,
      "grad_norm": 0.5130299093056558,
      "learning_rate": 5e-06,
      "loss": 0.5119,
      "step": 760
    },
    {
      "epoch": 2.9961089494163424,
      "grad_norm": 0.4915867802493192,
      "learning_rate": 5e-06,
      "loss": 0.5053,
      "step": 770
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.6005940437316895,
      "eval_runtime": 276.4988,
      "eval_samples_per_second": 25.042,
      "eval_steps_per_second": 0.394,
      "step": 771
    },
    {
      "epoch": 3.0,
      "step": 771,
      "total_flos": 1291244336578560.0,
      "train_loss": 0.5620605509955287,
      "train_runtime": 45702.4745,
      "train_samples_per_second": 8.636,
      "train_steps_per_second": 0.017
    }
  ],
  "logging_steps": 10,
  "max_steps": 771,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1291244336578560.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}