| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.9961522017956392, |
| "eval_steps": 500, |
| "global_step": 876, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.034202650705429674, |
| "grad_norm": 8.72109102116674, |
| "learning_rate": 5e-06, |
| "loss": 1.0718, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.06840530141085935, |
| "grad_norm": 3.3322663623660125, |
| "learning_rate": 5e-06, |
| "loss": 0.9254, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.10260795211628901, |
| "grad_norm": 4.702669152395071, |
| "learning_rate": 5e-06, |
| "loss": 0.8861, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.1368106028217187, |
| "grad_norm": 1.5324073221362737, |
| "learning_rate": 5e-06, |
| "loss": 0.8626, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.17101325352714836, |
| "grad_norm": 1.6283441090652204, |
| "learning_rate": 5e-06, |
| "loss": 0.8431, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.20521590423257802, |
| "grad_norm": 1.6064927943897758, |
| "learning_rate": 5e-06, |
| "loss": 0.826, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.2394185549380077, |
| "grad_norm": 1.0683822361301152, |
| "learning_rate": 5e-06, |
| "loss": 0.8056, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.2736212056434374, |
| "grad_norm": 1.27549981223469, |
| "learning_rate": 5e-06, |
| "loss": 0.7992, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.30782385634886705, |
| "grad_norm": 0.7781873272432135, |
| "learning_rate": 5e-06, |
| "loss": 0.7964, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.3420265070542967, |
| "grad_norm": 1.9361055434956347, |
| "learning_rate": 5e-06, |
| "loss": 0.7831, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.3762291577597264, |
| "grad_norm": 0.8445413978039268, |
| "learning_rate": 5e-06, |
| "loss": 0.7823, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.41043180846515603, |
| "grad_norm": 0.7086688497164513, |
| "learning_rate": 5e-06, |
| "loss": 0.7849, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.4446344591705857, |
| "grad_norm": 0.7956920234183803, |
| "learning_rate": 5e-06, |
| "loss": 0.778, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.4788371098760154, |
| "grad_norm": 1.0350837015570293, |
| "learning_rate": 5e-06, |
| "loss": 0.7702, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.5130397605814451, |
| "grad_norm": 0.7439380594321496, |
| "learning_rate": 5e-06, |
| "loss": 0.7717, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.5472424112868748, |
| "grad_norm": 0.6292484792677624, |
| "learning_rate": 5e-06, |
| "loss": 0.7671, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.5814450619923044, |
| "grad_norm": 0.6412867752520384, |
| "learning_rate": 5e-06, |
| "loss": 0.7587, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.6156477126977341, |
| "grad_norm": 0.6646094993947935, |
| "learning_rate": 5e-06, |
| "loss": 0.7575, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.6498503634031637, |
| "grad_norm": 0.665291201598649, |
| "learning_rate": 5e-06, |
| "loss": 0.757, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.6840530141085934, |
| "grad_norm": 0.9123265402327665, |
| "learning_rate": 5e-06, |
| "loss": 0.7583, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.718255664814023, |
| "grad_norm": 1.1094941903938924, |
| "learning_rate": 5e-06, |
| "loss": 0.7517, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.7524583155194527, |
| "grad_norm": 0.6482371740931725, |
| "learning_rate": 5e-06, |
| "loss": 0.7551, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.7866609662248825, |
| "grad_norm": 0.6555914261724218, |
| "learning_rate": 5e-06, |
| "loss": 0.7597, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.8208636169303121, |
| "grad_norm": 0.8657857653422225, |
| "learning_rate": 5e-06, |
| "loss": 0.7563, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.8550662676357418, |
| "grad_norm": 0.6407753060120058, |
| "learning_rate": 5e-06, |
| "loss": 0.7509, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.8892689183411714, |
| "grad_norm": 0.780403624876025, |
| "learning_rate": 5e-06, |
| "loss": 0.7453, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.9234715690466011, |
| "grad_norm": 0.8368374257574379, |
| "learning_rate": 5e-06, |
| "loss": 0.7464, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.9576742197520308, |
| "grad_norm": 0.664598550973498, |
| "learning_rate": 5e-06, |
| "loss": 0.7479, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.9918768704574604, |
| "grad_norm": 0.9068241317522673, |
| "learning_rate": 5e-06, |
| "loss": 0.7498, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.9987174005985464, |
| "eval_loss": 0.7466688752174377, |
| "eval_runtime": 311.1589, |
| "eval_samples_per_second": 25.318, |
| "eval_steps_per_second": 0.399, |
| "step": 292 |
| }, |
| { |
| "epoch": 1.0260795211628901, |
| "grad_norm": 0.8795675245561216, |
| "learning_rate": 5e-06, |
| "loss": 0.7504, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.0602821718683197, |
| "grad_norm": 0.7200932920136344, |
| "learning_rate": 5e-06, |
| "loss": 0.6956, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.0944848225737496, |
| "grad_norm": 0.7000347561619843, |
| "learning_rate": 5e-06, |
| "loss": 0.6999, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.1286874732791792, |
| "grad_norm": 0.7293027219793144, |
| "learning_rate": 5e-06, |
| "loss": 0.6924, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.1628901239846088, |
| "grad_norm": 0.7112121560461261, |
| "learning_rate": 5e-06, |
| "loss": 0.6954, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.1970927746900384, |
| "grad_norm": 0.6676298058618554, |
| "learning_rate": 5e-06, |
| "loss": 0.6928, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.2312954253954682, |
| "grad_norm": 0.7948139991703412, |
| "learning_rate": 5e-06, |
| "loss": 0.6926, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.2654980761008978, |
| "grad_norm": 0.6005615071846377, |
| "learning_rate": 5e-06, |
| "loss": 0.6941, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.2997007268063274, |
| "grad_norm": 0.8364878484721108, |
| "learning_rate": 5e-06, |
| "loss": 0.6973, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.3339033775117572, |
| "grad_norm": 0.6553636674363748, |
| "learning_rate": 5e-06, |
| "loss": 0.6927, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.3681060282171869, |
| "grad_norm": 0.6347311000125444, |
| "learning_rate": 5e-06, |
| "loss": 0.6896, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.4023086789226165, |
| "grad_norm": 0.6290632367689938, |
| "learning_rate": 5e-06, |
| "loss": 0.6914, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.436511329628046, |
| "grad_norm": 0.6894916281860864, |
| "learning_rate": 5e-06, |
| "loss": 0.6932, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.470713980333476, |
| "grad_norm": 0.7350261506900151, |
| "learning_rate": 5e-06, |
| "loss": 0.6975, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.5049166310389055, |
| "grad_norm": 0.7533532069494846, |
| "learning_rate": 5e-06, |
| "loss": 0.6901, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.5391192817443353, |
| "grad_norm": 0.6857720752951453, |
| "learning_rate": 5e-06, |
| "loss": 0.6908, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.573321932449765, |
| "grad_norm": 0.5643890760448684, |
| "learning_rate": 5e-06, |
| "loss": 0.6903, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.6075245831551945, |
| "grad_norm": 0.7285156581970353, |
| "learning_rate": 5e-06, |
| "loss": 0.6903, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.6417272338606241, |
| "grad_norm": 0.6041759269305311, |
| "learning_rate": 5e-06, |
| "loss": 0.6929, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.6759298845660537, |
| "grad_norm": 0.9035571829039372, |
| "learning_rate": 5e-06, |
| "loss": 0.6906, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.7101325352714836, |
| "grad_norm": 0.6211832411696856, |
| "learning_rate": 5e-06, |
| "loss": 0.689, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.7443351859769132, |
| "grad_norm": 0.65412684208557, |
| "learning_rate": 5e-06, |
| "loss": 0.6904, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.778537836682343, |
| "grad_norm": 0.6853919016199206, |
| "learning_rate": 5e-06, |
| "loss": 0.6921, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.8127404873877726, |
| "grad_norm": 0.6632587233526316, |
| "learning_rate": 5e-06, |
| "loss": 0.6901, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.8469431380932022, |
| "grad_norm": 0.674379475654248, |
| "learning_rate": 5e-06, |
| "loss": 0.6885, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.8811457887986318, |
| "grad_norm": 0.7387517241758113, |
| "learning_rate": 5e-06, |
| "loss": 0.6923, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.9153484395040614, |
| "grad_norm": 0.6488637527925505, |
| "learning_rate": 5e-06, |
| "loss": 0.6902, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.9495510902094912, |
| "grad_norm": 0.6779717849056704, |
| "learning_rate": 5e-06, |
| "loss": 0.6935, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.983753740914921, |
| "grad_norm": 0.6583316625280361, |
| "learning_rate": 5e-06, |
| "loss": 0.687, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.9974348011970928, |
| "eval_loss": 0.7336150407791138, |
| "eval_runtime": 313.4691, |
| "eval_samples_per_second": 25.132, |
| "eval_steps_per_second": 0.396, |
| "step": 584 |
| }, |
| { |
| "epoch": 2.0179563916203507, |
| "grad_norm": 0.9477386095251965, |
| "learning_rate": 5e-06, |
| "loss": 0.7037, |
| "step": 590 |
| }, |
| { |
| "epoch": 2.0521590423257803, |
| "grad_norm": 0.996102550235612, |
| "learning_rate": 5e-06, |
| "loss": 0.634, |
| "step": 600 |
| }, |
| { |
| "epoch": 2.08636169303121, |
| "grad_norm": 0.8448344638737102, |
| "learning_rate": 5e-06, |
| "loss": 0.6376, |
| "step": 610 |
| }, |
| { |
| "epoch": 2.1205643437366395, |
| "grad_norm": 0.7047506891860142, |
| "learning_rate": 5e-06, |
| "loss": 0.6403, |
| "step": 620 |
| }, |
| { |
| "epoch": 2.154766994442069, |
| "grad_norm": 0.717915631892293, |
| "learning_rate": 5e-06, |
| "loss": 0.634, |
| "step": 630 |
| }, |
| { |
| "epoch": 2.188969645147499, |
| "grad_norm": 0.7332658653764828, |
| "learning_rate": 5e-06, |
| "loss": 0.6379, |
| "step": 640 |
| }, |
| { |
| "epoch": 2.2231722958529287, |
| "grad_norm": 1.0505890513606944, |
| "learning_rate": 5e-06, |
| "loss": 0.6406, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.2573749465583584, |
| "grad_norm": 0.7382119345966844, |
| "learning_rate": 5e-06, |
| "loss": 0.6335, |
| "step": 660 |
| }, |
| { |
| "epoch": 2.291577597263788, |
| "grad_norm": 0.70658364818583, |
| "learning_rate": 5e-06, |
| "loss": 0.6357, |
| "step": 670 |
| }, |
| { |
| "epoch": 2.3257802479692176, |
| "grad_norm": 0.7046941652506434, |
| "learning_rate": 5e-06, |
| "loss": 0.6394, |
| "step": 680 |
| }, |
| { |
| "epoch": 2.359982898674647, |
| "grad_norm": 0.7318336570070835, |
| "learning_rate": 5e-06, |
| "loss": 0.642, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.3941855493800768, |
| "grad_norm": 0.8432239156367899, |
| "learning_rate": 5e-06, |
| "loss": 0.6327, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.428388200085507, |
| "grad_norm": 0.7780419527832853, |
| "learning_rate": 5e-06, |
| "loss": 0.6446, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.4625908507909364, |
| "grad_norm": 0.7542982970599986, |
| "learning_rate": 5e-06, |
| "loss": 0.6409, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.496793501496366, |
| "grad_norm": 0.5645709157421896, |
| "learning_rate": 5e-06, |
| "loss": 0.6384, |
| "step": 730 |
| }, |
| { |
| "epoch": 2.5309961522017956, |
| "grad_norm": 0.6696520450882577, |
| "learning_rate": 5e-06, |
| "loss": 0.6404, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.5651988029072252, |
| "grad_norm": 0.6097202796868465, |
| "learning_rate": 5e-06, |
| "loss": 0.6378, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.599401453612655, |
| "grad_norm": 0.8310444592987216, |
| "learning_rate": 5e-06, |
| "loss": 0.641, |
| "step": 760 |
| }, |
| { |
| "epoch": 2.633604104318085, |
| "grad_norm": 0.6861309816401907, |
| "learning_rate": 5e-06, |
| "loss": 0.6444, |
| "step": 770 |
| }, |
| { |
| "epoch": 2.6678067550235145, |
| "grad_norm": 0.6192293002458097, |
| "learning_rate": 5e-06, |
| "loss": 0.6344, |
| "step": 780 |
| }, |
| { |
| "epoch": 2.702009405728944, |
| "grad_norm": 0.7587967307120371, |
| "learning_rate": 5e-06, |
| "loss": 0.642, |
| "step": 790 |
| }, |
| { |
| "epoch": 2.7362120564343737, |
| "grad_norm": 0.610354783503826, |
| "learning_rate": 5e-06, |
| "loss": 0.6403, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.7704147071398033, |
| "grad_norm": 0.7203807608541957, |
| "learning_rate": 5e-06, |
| "loss": 0.6423, |
| "step": 810 |
| }, |
| { |
| "epoch": 2.804617357845233, |
| "grad_norm": 0.8207621169451732, |
| "learning_rate": 5e-06, |
| "loss": 0.6436, |
| "step": 820 |
| }, |
| { |
| "epoch": 2.8388200085506625, |
| "grad_norm": 0.6380822902161547, |
| "learning_rate": 5e-06, |
| "loss": 0.6436, |
| "step": 830 |
| }, |
| { |
| "epoch": 2.873022659256092, |
| "grad_norm": 0.6919410892047235, |
| "learning_rate": 5e-06, |
| "loss": 0.6428, |
| "step": 840 |
| }, |
| { |
| "epoch": 2.907225309961522, |
| "grad_norm": 0.6714077848139417, |
| "learning_rate": 5e-06, |
| "loss": 0.6419, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.941427960666952, |
| "grad_norm": 0.6977467899713985, |
| "learning_rate": 5e-06, |
| "loss": 0.64, |
| "step": 860 |
| }, |
| { |
| "epoch": 2.9756306113723814, |
| "grad_norm": 0.6287903035183255, |
| "learning_rate": 5e-06, |
| "loss": 0.6436, |
| "step": 870 |
| }, |
| { |
| "epoch": 2.9961522017956392, |
| "eval_loss": 0.7354960441589355, |
| "eval_runtime": 311.6721, |
| "eval_samples_per_second": 25.277, |
| "eval_steps_per_second": 0.398, |
| "step": 876 |
| }, |
| { |
| "epoch": 2.9961522017956392, |
| "step": 876, |
| "total_flos": 1467123247349760.0, |
| "train_loss": 0.7098228664703021, |
| "train_runtime": 51889.5388, |
| "train_samples_per_second": 8.653, |
| "train_steps_per_second": 0.017 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 876, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1467123247349760.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|