{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.945914738349268, "eval_steps": 4096, "global_step": 20480, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011823934229365849, "grad_norm": 1.1222522258758545, "learning_rate": 0.000498046875, "loss": 0.7148012518882751, "step": 256 }, { "epoch": 0.023647868458731697, "grad_norm": 1.1869175434112549, "learning_rate": 0.000998046875, "loss": 0.7245784401893616, "step": 512 }, { "epoch": 0.03547180268809755, "grad_norm": 1.4848835468292236, "learning_rate": 0.000999640996023194, "loss": 0.7249147891998291, "step": 768 }, { "epoch": 0.047295736917463395, "grad_norm": 1.7819839715957642, "learning_rate": 0.0009985588674043958, "loss": 0.7251258492469788, "step": 1024 }, { "epoch": 0.05911967114682925, "grad_norm": 0.966533899307251, "learning_rate": 0.0009967551747861387, "loss": 0.7263231873512268, "step": 1280 }, { "epoch": 0.0709436053761951, "grad_norm": 1.143156886100769, "learning_rate": 0.000994232528651847, "loss": 0.7227165699005127, "step": 1536 }, { "epoch": 0.08276753960556095, "grad_norm": 0.969417929649353, "learning_rate": 0.0009909945800260092, "loss": 0.7240616083145142, "step": 1792 }, { "epoch": 0.09459147383492679, "grad_norm": 1.5566692352294922, "learning_rate": 0.0009870460151900522, "loss": 0.714276134967804, "step": 2048 }, { "epoch": 0.10641540806429264, "grad_norm": 0.6052780151367188, "learning_rate": 0.0009823925488998885, "loss": 0.712334394454956, "step": 2304 }, { "epoch": 0.1182393422936585, "grad_norm": 1.8517088890075684, "learning_rate": 0.0009770409161149525, "loss": 0.7229606509208679, "step": 2560 }, { "epoch": 0.13006327652302435, "grad_norm": 0.7567554712295532, "learning_rate": 0.0009709988622506973, "loss": 0.7149595618247986, "step": 2816 }, { "epoch": 0.1418872107523902, "grad_norm": 1.334909439086914, "learning_rate": 0.000964275131968659, "loss": 0.7151144742965698, "step": 3072 }, { "epoch": 0.15371114498175603, "grad_norm": 1.4210789203643799, "learning_rate": 0.0009568794565203123, "loss": 0.7077174186706543, "step": 3328 }, { "epoch": 0.1655350792111219, "grad_norm": 1.216098427772522, "learning_rate": 0.0009488225396630347, "loss": 0.7150501012802124, "step": 3584 }, { "epoch": 0.17735901344048774, "grad_norm": 2.2890193462371826, "learning_rate": 0.0009401160421685646, "loss": 0.7134696841239929, "step": 3840 }, { "epoch": 0.18918294766985358, "grad_norm": 0.5908980369567871, "learning_rate": 0.0009307725649463714, "loss": 0.7133264541625977, "step": 4096 }, { "epoch": 0.18918294766985358, "eval_bleu": 0.0009799282077471887, "eval_loss": 0.7145318688322965, "eval_mse_loss": 0.7145318688322965, "step": 4096 }, { "epoch": 0.18918294766985358, "eval_bleu": 0.0009799282077471887, "eval_loss": 0.7145318688322965, "eval_mse_loss": 0.7145318688322965, "eval_runtime": 230.3288, "eval_samples_per_second": 121.535, "eval_steps_per_second": 1.902, "step": 4096 }, { "epoch": 0.20100688189921945, "grad_norm": 1.3493183851242065, "learning_rate": 0.0009208056308063659, "loss": 0.7063367366790771, "step": 4352 }, { "epoch": 0.2128308161285853, "grad_norm": 1.127218246459961, "learning_rate": 0.0009102296648873445, "loss": 0.7145900130271912, "step": 4608 }, { "epoch": 0.22465475035795113, "grad_norm": 1.356933832168579, "learning_rate": 0.0008990599737794927, "loss": 0.7130961418151855, "step": 4864 }, { "epoch": 0.236478684587317, "grad_norm": 0.5257934331893921, "learning_rate": 0.0008873127233711644, "loss": 0.713787853717804, "step": 5120 }, { "epoch": 0.24830261881668284, "grad_norm": 0.9926263093948364, "learning_rate": 0.0008750049154520011, "loss": 0.7070172429084778, "step": 5376 }, { "epoch": 0.2601265530460487, "grad_norm": 1.379420518875122, "learning_rate": 0.0008621543631062487, "loss": 0.7044810056686401, "step": 5632 }, { "epoch": 0.27195048727541454, "grad_norm": 1.3494232892990112, "learning_rate": 0.0008487796649318904, "loss": 0.7017611265182495, "step": 5888 }, { "epoch": 0.2837744215047804, "grad_norm": 1.3593381643295288, "learning_rate": 0.0008349001781229053, "loss": 0.7081840634346008, "step": 6144 }, { "epoch": 0.2955983557341462, "grad_norm": 1.4364413022994995, "learning_rate": 0.0008205359904536107, "loss": 0.6988816857337952, "step": 6400 }, { "epoch": 0.30742228996351206, "grad_norm": 1.7631044387817383, "learning_rate": 0.0008057078912056363, "loss": 0.7046327590942383, "step": 6656 }, { "epoch": 0.3192462241928779, "grad_norm": 1.3340634107589722, "learning_rate": 0.0007904373410796086, "loss": 0.6967505216598511, "step": 6912 }, { "epoch": 0.3310701584222438, "grad_norm": 1.0474779605865479, "learning_rate": 0.0007747464411350876, "loss": 0.6985277533531189, "step": 7168 }, { "epoch": 0.34289409265160964, "grad_norm": 1.2403302192687988, "learning_rate": 0.000758657900803716, "loss": 0.6996051073074341, "step": 7424 }, { "epoch": 0.3547180268809755, "grad_norm": 0.8496942520141602, "learning_rate": 0.000742195005021869, "loss": 0.6959736347198486, "step": 7680 }, { "epoch": 0.3665419611103413, "grad_norm": 0.9611047506332397, "learning_rate": 0.0007253815805303786, "loss": 0.6994542479515076, "step": 7936 }, { "epoch": 0.37836589533970716, "grad_norm": 0.5330133438110352, "learning_rate": 0.0007082419613901028, "loss": 0.7035479545593262, "step": 8192 }, { "epoch": 0.37836589533970716, "eval_bleu": 0.0010052073081668492, "eval_loss": 0.6948950419959412, "eval_mse_loss": 0.6948950419959412, "step": 8192 }, { "epoch": 0.37836589533970716, "eval_bleu": 0.0010052073081668492, "eval_loss": 0.6948950419959412, "eval_mse_loss": 0.6948950419959412, "eval_runtime": 230.6732, "eval_samples_per_second": 121.354, "eval_steps_per_second": 1.899, "step": 8192 }, { "epoch": 0.390189829569073, "grad_norm": 0.8082056045532227, "learning_rate": 0.0006908009537632514, "loss": 0.7005434036254883, "step": 8448 }, { "epoch": 0.4020137637984389, "grad_norm": 1.2990059852600098, "learning_rate": 0.0006730838000114403, "loss": 0.7007656097412109, "step": 8704 }, { "epoch": 0.41383769802780473, "grad_norm": 0.5188842415809631, "learning_rate": 0.0006551161421624341, "loss": 0.7021461129188538, "step": 8960 }, { "epoch": 0.4256616322571706, "grad_norm": 0.6316655874252319, "learning_rate": 0.0006369239847984517, "loss": 0.6990594863891602, "step": 9216 }, { "epoch": 0.4374855664865364, "grad_norm": 1.5941392183303833, "learning_rate": 0.0006185336574197479, "loss": 0.6935109496116638, "step": 9472 }, { "epoch": 0.44930950071590225, "grad_norm": 0.41938135027885437, "learning_rate": 0.0005999717763379407, "loss": 0.6998211145401001, "step": 9728 }, { "epoch": 0.4611334349452681, "grad_norm": 0.9023215770721436, "learning_rate": 0.0005812652061542363, "loss": 0.6968086361885071, "step": 9984 }, { "epoch": 0.472957369174634, "grad_norm": 1.2630209922790527, "learning_rate": 0.0005624410208783071, "loss": 0.6952697038650513, "step": 10240 }, { "epoch": 0.48478130340399983, "grad_norm": 1.2264976501464844, "learning_rate": 0.0005435264647440881, "loss": 0.6887762546539307, "step": 10496 }, { "epoch": 0.49660523763336567, "grad_norm": 0.728610098361969, "learning_rate": 0.000524548912779213, "loss": 0.6938179135322571, "step": 10752 }, { "epoch": 0.5084291718627315, "grad_norm": 0.8676101565361023, "learning_rate": 0.0005055358311851499, "loss": 0.6948530673980713, "step": 11008 }, { "epoch": 0.5202531060920974, "grad_norm": 1.1160482168197632, "learning_rate": 0.0004865147375853812, "loss": 0.691513180732727, "step": 11264 }, { "epoch": 0.5320770403214632, "grad_norm": 0.6974979043006897, "learning_rate": 0.0004675131611991607, "loss": 0.6864028573036194, "step": 11520 }, { "epoch": 0.5439009745508291, "grad_norm": 0.46719640493392944, "learning_rate": 0.0004485586029984899, "loss": 0.6954278349876404, "step": 11776 }, { "epoch": 0.5557249087801949, "grad_norm": 1.2132223844528198, "learning_rate": 0.00042967849590597266, "loss": 0.6914871335029602, "step": 12032 }, { "epoch": 0.5675488430095608, "grad_norm": 1.054893970489502, "learning_rate": 0.0004109001650911621, "loss": 0.686524510383606, "step": 12288 }, { "epoch": 0.5675488430095608, "eval_bleu": 0.0010400607906154184, "eval_loss": 0.6875994070752026, "eval_mse_loss": 0.6875994070752026, "step": 12288 }, { "epoch": 0.5675488430095608, "eval_bleu": 0.0010400607906154184, "eval_loss": 0.6875994070752026, "eval_mse_loss": 0.6875994070752026, "eval_runtime": 236.4224, "eval_samples_per_second": 118.402, "eval_steps_per_second": 1.853, "step": 12288 }, { "epoch": 0.5793727772389267, "grad_norm": 0.7405524253845215, "learning_rate": 0.0003922507884228551, "loss": 0.6839651465415955, "step": 12544 }, { "epoch": 0.5911967114682924, "grad_norm": 0.7894928455352783, "learning_rate": 0.00037375735713457723, "loss": 0.6843001842498779, "step": 12800 }, { "epoch": 0.6030206456976583, "grad_norm": 0.7159557938575745, "learning_rate": 0.00035544663676018276, "loss": 0.6922345161437988, "step": 13056 }, { "epoch": 0.6148445799270241, "grad_norm": 1.1562808752059937, "learning_rate": 0.00033734512839611255, "loss": 0.6860140562057495, "step": 13312 }, { "epoch": 0.62666851415639, "grad_norm": 0.5824499130249023, "learning_rate": 0.0003194790303463687, "loss": 0.6856705546379089, "step": 13568 }, { "epoch": 0.6384924483857558, "grad_norm": 3.080799102783203, "learning_rate": 0.00030187420020572406, "loss": 0.6847730278968811, "step": 13824 }, { "epoch": 0.6503163826151217, "grad_norm": 0.6743490695953369, "learning_rate": 0.00028455611743603626, "loss": 0.6856830716133118, "step": 14080 }, { "epoch": 0.6621403168444876, "grad_norm": 1.060508370399475, "learning_rate": 0.0002675498464898373, "loss": 0.6844523549079895, "step": 14336 }, { "epoch": 0.6739642510738534, "grad_norm": 1.611391544342041, "learning_rate": 0.0002508800005345623, "loss": 0.6842568516731262, "step": 14592 }, { "epoch": 0.6857881853032193, "grad_norm": 1.5996400117874146, "learning_rate": 0.00023457070582992562, "loss": 0.6826963424682617, "step": 14848 }, { "epoch": 0.6976121195325851, "grad_norm": 0.6100959181785583, "learning_rate": 0.00021864556680999692, "loss": 0.6834294199943542, "step": 15104 }, { "epoch": 0.709436053761951, "grad_norm": 1.0907484292984009, "learning_rate": 0.0002031276319205152, "loss": 0.6882811784744263, "step": 15360 }, { "epoch": 0.7212599879913169, "grad_norm": 1.3213837146759033, "learning_rate": 0.00018803936026088542, "loss": 0.6864408850669861, "step": 15616 }, { "epoch": 0.7330839222206826, "grad_norm": 0.40607210993766785, "learning_rate": 0.00017340258907913464, "loss": 0.6805769205093384, "step": 15872 }, { "epoch": 0.7449078564500485, "grad_norm": 0.48498424887657166, "learning_rate": 0.0001592385021668743, "loss": 0.6834226846694946, "step": 16128 }, { "epoch": 0.7567317906794143, "grad_norm": 0.4616319239139557, "learning_rate": 0.0001455675992000087, "loss": 0.6785019040107727, "step": 16384 }, { "epoch": 0.7567317906794143, "eval_bleu": 0.0009476330082336835, "eval_loss": 0.6870187982984873, "eval_mse_loss": 0.6870187982984873, "step": 16384 }, { "epoch": 0.7567317906794143, "eval_bleu": 0.0009476330082336835, "eval_loss": 0.6870187982984873, "eval_mse_loss": 0.6870187982984873, "eval_runtime": 231.2977, "eval_samples_per_second": 121.026, "eval_steps_per_second": 1.894, "step": 16384 }, { "epoch": 0.7685557249087802, "grad_norm": 1.320206642150879, "learning_rate": 0.000132409666069565, "loss": 0.6836502552032471, "step": 16640 }, { "epoch": 0.780379659138146, "grad_norm": 0.3926827311515808, "learning_rate": 0.0001197837462455823, "loss": 0.6756935119628906, "step": 16896 }, { "epoch": 0.7922035933675119, "grad_norm": 0.4777701199054718, "learning_rate": 0.00010770811321550749, "loss": 0.6764777898788452, "step": 17152 }, { "epoch": 0.8040275275968778, "grad_norm": 1.5470472574234009, "learning_rate": 9.620024403698591e-05, "loss": 0.6807876229286194, "step": 17408 }, { "epoch": 0.8158514618262436, "grad_norm": 1.4899367094039917, "learning_rate": 8.527679404332429e-05, "loss": 0.6819196343421936, "step": 17664 }, { "epoch": 0.8276753960556095, "grad_norm": 1.1178877353668213, "learning_rate": 7.495357273823544e-05, "loss": 0.6775845289230347, "step": 17920 }, { "epoch": 0.8394993302849753, "grad_norm": 1.9614344835281372, "learning_rate": 6.524552091475183e-05, "loss": 0.6840441823005676, "step": 18176 }, { "epoch": 0.8513232645143411, "grad_norm": 1.231566309928894, "learning_rate": 5.6166689031422024e-05, "loss": 0.6766948103904724, "step": 18432 }, { "epoch": 0.8631471987437069, "grad_norm": 3.0353503227233887, "learning_rate": 4.773021687709067e-05, "loss": 0.677415132522583, "step": 18688 }, { "epoch": 0.8749711329730728, "grad_norm": 2.707676887512207, "learning_rate": 3.994831455368719e-05, "loss": 0.6782501339912415, "step": 18944 }, { "epoch": 0.8867950672024387, "grad_norm": 0.49309444427490234, "learning_rate": 3.283224480455282e-05, "loss": 0.680189847946167, "step": 19200 }, { "epoch": 0.8986190014318045, "grad_norm": 1.044478178024292, "learning_rate": 2.639230671387627e-05, "loss": 0.6733763217926025, "step": 19456 }, { "epoch": 0.9104429356611704, "grad_norm": 2.4254560470581055, "learning_rate": 2.063782080083576e-05, "loss": 0.6758845448493958, "step": 19712 }, { "epoch": 0.9222668698905362, "grad_norm": 0.8764089345932007, "learning_rate": 1.557711553001523e-05, "loss": 0.6751888990402222, "step": 19968 }, { "epoch": 0.9340908041199021, "grad_norm": 0.6017519235610962, "learning_rate": 1.1217515257622269e-05, "loss": 0.6804232597351074, "step": 20224 }, { "epoch": 0.945914738349268, "grad_norm": 2.2349886894226074, "learning_rate": 7.565329630950746e-06, "loss": 0.6779721975326538, "step": 20480 }, { "epoch": 0.945914738349268, "eval_bleu": 0.0009655630043002199, "eval_loss": 0.675000424866807, "eval_mse_loss": 0.675000424866807, "step": 20480 }, { "epoch": 0.945914738349268, "eval_bleu": 0.0009655630043002199, "eval_loss": 0.675000424866807, "eval_mse_loss": 0.675000424866807, "eval_runtime": 236.8078, "eval_samples_per_second": 118.21, "eval_steps_per_second": 1.85, "step": 20480 } ], "logging_steps": 256, "max_steps": 21651, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 4096, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }