{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9961522017956392, "eval_steps": 500, "global_step": 876, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.034202650705429674, "grad_norm": 1.6804444186183018, "learning_rate": 5e-06, "loss": 0.788, "step": 10 }, { "epoch": 0.06840530141085935, "grad_norm": 1.2870091906780483, "learning_rate": 5e-06, "loss": 0.7236, "step": 20 }, { "epoch": 0.10260795211628901, "grad_norm": 2.3640555348664254, "learning_rate": 5e-06, "loss": 0.6985, "step": 30 }, { "epoch": 0.1368106028217187, "grad_norm": 2.7430590909120953, "learning_rate": 5e-06, "loss": 0.6844, "step": 40 }, { "epoch": 0.17101325352714836, "grad_norm": 0.9746905467359548, "learning_rate": 5e-06, "loss": 0.6715, "step": 50 }, { "epoch": 0.20521590423257802, "grad_norm": 0.8507207569814836, "learning_rate": 5e-06, "loss": 0.6674, "step": 60 }, { "epoch": 0.2394185549380077, "grad_norm": 1.0995524010187212, "learning_rate": 5e-06, "loss": 0.6524, "step": 70 }, { "epoch": 0.2736212056434374, "grad_norm": 0.6280153768789657, "learning_rate": 5e-06, "loss": 0.6362, "step": 80 }, { "epoch": 0.30782385634886705, "grad_norm": 0.5688437842480452, "learning_rate": 5e-06, "loss": 0.6299, "step": 90 }, { "epoch": 0.3420265070542967, "grad_norm": 1.431931070075041, "learning_rate": 5e-06, "loss": 0.629, "step": 100 }, { "epoch": 0.3762291577597264, "grad_norm": 1.6920363679606414, "learning_rate": 5e-06, "loss": 0.6307, "step": 110 }, { "epoch": 0.41043180846515603, "grad_norm": 0.580804701324865, "learning_rate": 5e-06, "loss": 0.6138, "step": 120 }, { "epoch": 0.4446344591705857, "grad_norm": 0.705027663006494, "learning_rate": 5e-06, "loss": 0.624, "step": 130 }, { "epoch": 0.4788371098760154, "grad_norm": 0.7502862161170595, "learning_rate": 5e-06, "loss": 0.6205, "step": 140 }, { "epoch": 0.5130397605814451, "grad_norm": 0.6708553413300853, "learning_rate": 5e-06, "loss": 0.616, "step": 150 }, { "epoch": 0.5472424112868748, "grad_norm": 0.8494244770944251, "learning_rate": 5e-06, "loss": 0.627, "step": 160 }, { "epoch": 0.5814450619923044, "grad_norm": 0.53626174143875, "learning_rate": 5e-06, "loss": 0.6147, "step": 170 }, { "epoch": 0.6156477126977341, "grad_norm": 0.6364357464747067, "learning_rate": 5e-06, "loss": 0.6195, "step": 180 }, { "epoch": 0.6498503634031637, "grad_norm": 0.6082222049720536, "learning_rate": 5e-06, "loss": 0.6069, "step": 190 }, { "epoch": 0.6840530141085934, "grad_norm": 0.566328820213013, "learning_rate": 5e-06, "loss": 0.6072, "step": 200 }, { "epoch": 0.718255664814023, "grad_norm": 0.49888078998310204, "learning_rate": 5e-06, "loss": 0.602, "step": 210 }, { "epoch": 0.7524583155194527, "grad_norm": 0.4747529862310286, "learning_rate": 5e-06, "loss": 0.6168, "step": 220 }, { "epoch": 0.7866609662248825, "grad_norm": 0.5260744936940637, "learning_rate": 5e-06, "loss": 0.6046, "step": 230 }, { "epoch": 0.8208636169303121, "grad_norm": 0.5136960498148959, "learning_rate": 5e-06, "loss": 0.6091, "step": 240 }, { "epoch": 0.8550662676357418, "grad_norm": 0.5616383990535931, "learning_rate": 5e-06, "loss": 0.6057, "step": 250 }, { "epoch": 0.8892689183411714, "grad_norm": 0.6884011630886375, "learning_rate": 5e-06, "loss": 0.6137, "step": 260 }, { "epoch": 0.9234715690466011, "grad_norm": 0.6448568689058315, "learning_rate": 5e-06, "loss": 0.6094, "step": 270 }, { "epoch": 0.9576742197520308, "grad_norm": 0.7693749759884462, "learning_rate": 5e-06, "loss": 0.6101, "step": 280 }, { "epoch": 0.9918768704574604, "grad_norm": 0.5239302029558732, "learning_rate": 5e-06, "loss": 0.6053, "step": 290 }, { "epoch": 0.9987174005985464, "eval_loss": 0.607873260974884, "eval_runtime": 205.6424, "eval_samples_per_second": 38.299, "eval_steps_per_second": 0.603, "step": 292 }, { "epoch": 1.0260795211628901, "grad_norm": 0.5873382254143713, "learning_rate": 5e-06, "loss": 0.6057, "step": 300 }, { "epoch": 1.0602821718683197, "grad_norm": 0.5561542049663418, "learning_rate": 5e-06, "loss": 0.5509, "step": 310 }, { "epoch": 1.0944848225737496, "grad_norm": 0.580839172660231, "learning_rate": 5e-06, "loss": 0.5532, "step": 320 }, { "epoch": 1.1286874732791792, "grad_norm": 1.1795363810290622, "learning_rate": 5e-06, "loss": 0.5608, "step": 330 }, { "epoch": 1.1628901239846088, "grad_norm": 0.5245548599902146, "learning_rate": 5e-06, "loss": 0.551, "step": 340 }, { "epoch": 1.1970927746900384, "grad_norm": 0.7325061869705158, "learning_rate": 5e-06, "loss": 0.5542, "step": 350 }, { "epoch": 1.2312954253954682, "grad_norm": 0.5758834321007024, "learning_rate": 5e-06, "loss": 0.5569, "step": 360 }, { "epoch": 1.2654980761008978, "grad_norm": 0.5517090543506249, "learning_rate": 5e-06, "loss": 0.56, "step": 370 }, { "epoch": 1.2997007268063274, "grad_norm": 0.54026871791332, "learning_rate": 5e-06, "loss": 0.5567, "step": 380 }, { "epoch": 1.3339033775117572, "grad_norm": 0.6625845470909241, "learning_rate": 5e-06, "loss": 0.5539, "step": 390 }, { "epoch": 1.3681060282171869, "grad_norm": 0.6679250166336651, "learning_rate": 5e-06, "loss": 0.5546, "step": 400 }, { "epoch": 1.4023086789226165, "grad_norm": 0.43568230991463597, "learning_rate": 5e-06, "loss": 0.5548, "step": 410 }, { "epoch": 1.436511329628046, "grad_norm": 0.7673132417511955, "learning_rate": 5e-06, "loss": 0.5539, "step": 420 }, { "epoch": 1.470713980333476, "grad_norm": 0.8267199515857039, "learning_rate": 5e-06, "loss": 0.5609, "step": 430 }, { "epoch": 1.5049166310389055, "grad_norm": 0.7420049089951272, "learning_rate": 5e-06, "loss": 0.5526, "step": 440 }, { "epoch": 1.5391192817443353, "grad_norm": 0.5711771430617684, "learning_rate": 5e-06, "loss": 0.5459, "step": 450 }, { "epoch": 1.573321932449765, "grad_norm": 0.5059415250675989, "learning_rate": 5e-06, "loss": 0.5559, "step": 460 }, { "epoch": 1.6075245831551945, "grad_norm": 0.5634757605615536, "learning_rate": 5e-06, "loss": 0.5527, "step": 470 }, { "epoch": 1.6417272338606241, "grad_norm": 0.47121211159709975, "learning_rate": 5e-06, "loss": 0.5586, "step": 480 }, { "epoch": 1.6759298845660537, "grad_norm": 0.5677050425607371, "learning_rate": 5e-06, "loss": 0.5598, "step": 490 }, { "epoch": 1.7101325352714836, "grad_norm": 0.610525785684426, "learning_rate": 5e-06, "loss": 0.5509, "step": 500 }, { "epoch": 1.7443351859769132, "grad_norm": 0.6011597954395561, "learning_rate": 5e-06, "loss": 0.5551, "step": 510 }, { "epoch": 1.778537836682343, "grad_norm": 0.4613495220552048, "learning_rate": 5e-06, "loss": 0.5492, "step": 520 }, { "epoch": 1.8127404873877726, "grad_norm": 0.5256335189092972, "learning_rate": 5e-06, "loss": 0.5528, "step": 530 }, { "epoch": 1.8469431380932022, "grad_norm": 0.4659992912811747, "learning_rate": 5e-06, "loss": 0.5438, "step": 540 }, { "epoch": 1.8811457887986318, "grad_norm": 0.49388622607101107, "learning_rate": 5e-06, "loss": 0.5516, "step": 550 }, { "epoch": 1.9153484395040614, "grad_norm": 0.49235853413145225, "learning_rate": 5e-06, "loss": 0.5499, "step": 560 }, { "epoch": 1.9495510902094912, "grad_norm": 0.48530934948386684, "learning_rate": 5e-06, "loss": 0.5491, "step": 570 }, { "epoch": 1.983753740914921, "grad_norm": 0.681088284788218, "learning_rate": 5e-06, "loss": 0.5603, "step": 580 }, { "epoch": 1.9974348011970928, "eval_loss": 0.5989111065864563, "eval_runtime": 198.5327, "eval_samples_per_second": 39.671, "eval_steps_per_second": 0.625, "step": 584 }, { "epoch": 2.0179563916203507, "grad_norm": 0.6929323229832084, "learning_rate": 5e-06, "loss": 0.5622, "step": 590 }, { "epoch": 2.0521590423257803, "grad_norm": 0.5875304861764742, "learning_rate": 5e-06, "loss": 0.4938, "step": 600 }, { "epoch": 2.08636169303121, "grad_norm": 0.6162810457588677, "learning_rate": 5e-06, "loss": 0.5075, "step": 610 }, { "epoch": 2.1205643437366395, "grad_norm": 0.5609910238130167, "learning_rate": 5e-06, "loss": 0.4983, "step": 620 }, { "epoch": 2.154766994442069, "grad_norm": 0.5851708963453293, "learning_rate": 5e-06, "loss": 0.5001, "step": 630 }, { "epoch": 2.188969645147499, "grad_norm": 0.5635902260827742, "learning_rate": 5e-06, "loss": 0.5018, "step": 640 }, { "epoch": 2.2231722958529287, "grad_norm": 0.7189631820244675, "learning_rate": 5e-06, "loss": 0.4981, "step": 650 }, { "epoch": 2.2573749465583584, "grad_norm": 0.5675999743836657, "learning_rate": 5e-06, "loss": 0.4955, "step": 660 }, { "epoch": 2.291577597263788, "grad_norm": 0.5230970038040867, "learning_rate": 5e-06, "loss": 0.5011, "step": 670 }, { "epoch": 2.3257802479692176, "grad_norm": 0.5536244794373915, "learning_rate": 5e-06, "loss": 0.5016, "step": 680 }, { "epoch": 2.359982898674647, "grad_norm": 0.4987813092132446, "learning_rate": 5e-06, "loss": 0.5107, "step": 690 }, { "epoch": 2.3941855493800768, "grad_norm": 0.5979352098881308, "learning_rate": 5e-06, "loss": 0.5026, "step": 700 }, { "epoch": 2.428388200085507, "grad_norm": 0.508657041803482, "learning_rate": 5e-06, "loss": 0.5053, "step": 710 }, { "epoch": 2.4625908507909364, "grad_norm": 0.5418322815471002, "learning_rate": 5e-06, "loss": 0.4999, "step": 720 }, { "epoch": 2.496793501496366, "grad_norm": 2.189328511686068, "learning_rate": 5e-06, "loss": 0.5039, "step": 730 }, { "epoch": 2.5309961522017956, "grad_norm": 0.5032483059171474, "learning_rate": 5e-06, "loss": 0.5086, "step": 740 }, { "epoch": 2.5651988029072252, "grad_norm": 0.6897627250322256, "learning_rate": 5e-06, "loss": 0.4967, "step": 750 }, { "epoch": 2.599401453612655, "grad_norm": 0.6125595774876328, "learning_rate": 5e-06, "loss": 0.5082, "step": 760 }, { "epoch": 2.633604104318085, "grad_norm": 0.5478411600953903, "learning_rate": 5e-06, "loss": 0.5008, "step": 770 }, { "epoch": 2.6678067550235145, "grad_norm": 0.4805016073434078, "learning_rate": 5e-06, "loss": 0.5015, "step": 780 }, { "epoch": 2.702009405728944, "grad_norm": 0.5561978743904621, "learning_rate": 5e-06, "loss": 0.5045, "step": 790 }, { "epoch": 2.7362120564343737, "grad_norm": 0.4899190841497652, "learning_rate": 5e-06, "loss": 0.5024, "step": 800 }, { "epoch": 2.7704147071398033, "grad_norm": 0.513594637059708, "learning_rate": 5e-06, "loss": 0.4987, "step": 810 }, { "epoch": 2.804617357845233, "grad_norm": 0.5314442863240219, "learning_rate": 5e-06, "loss": 0.5077, "step": 820 }, { "epoch": 2.8388200085506625, "grad_norm": 0.5397227572021858, "learning_rate": 5e-06, "loss": 0.5084, "step": 830 }, { "epoch": 2.873022659256092, "grad_norm": 0.5577858554366566, "learning_rate": 5e-06, "loss": 0.5085, "step": 840 }, { "epoch": 2.907225309961522, "grad_norm": 0.5436188950593666, "learning_rate": 5e-06, "loss": 0.51, "step": 850 }, { "epoch": 2.941427960666952, "grad_norm": 0.5548892577477849, "learning_rate": 5e-06, "loss": 0.5111, "step": 860 }, { "epoch": 2.9756306113723814, "grad_norm": 0.5037692940815492, "learning_rate": 5e-06, "loss": 0.5083, "step": 870 }, { "epoch": 2.9961522017956392, "eval_loss": 0.6033630967140198, "eval_runtime": 196.2227, "eval_samples_per_second": 40.138, "eval_steps_per_second": 0.632, "step": 876 }, { "epoch": 2.9961522017956392, "step": 876, "total_flos": 1467123247349760.0, "train_loss": 0.5650538080903493, "train_runtime": 28810.5739, "train_samples_per_second": 15.582, "train_steps_per_second": 0.03 } ], "logging_steps": 10, "max_steps": 876, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1467123247349760.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }