{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 772, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012961762799740765, "grad_norm": 3.271511827979835, "learning_rate": 5.76923076923077e-06, "loss": 0.5184, "step": 10 }, { "epoch": 0.02592352559948153, "grad_norm": 0.831092338338801, "learning_rate": 1.217948717948718e-05, "loss": 0.3124, "step": 20 }, { "epoch": 0.03888528839922229, "grad_norm": 0.8169740852721643, "learning_rate": 1.858974358974359e-05, "loss": 0.244, "step": 30 }, { "epoch": 0.05184705119896306, "grad_norm": 0.8022441079176706, "learning_rate": 2.5e-05, "loss": 0.2194, "step": 40 }, { "epoch": 0.06480881399870382, "grad_norm": 0.746730565456582, "learning_rate": 3.141025641025641e-05, "loss": 0.1932, "step": 50 }, { "epoch": 0.07777057679844458, "grad_norm": 0.45890023499187765, "learning_rate": 3.782051282051282e-05, "loss": 0.175, "step": 60 }, { "epoch": 0.09073233959818536, "grad_norm": 0.22457234130494264, "learning_rate": 4.423076923076923e-05, "loss": 0.1627, "step": 70 }, { "epoch": 0.10369410239792612, "grad_norm": 0.20882952808150415, "learning_rate": 4.999974385252693e-05, "loss": 0.1604, "step": 80 }, { "epoch": 0.11665586519766688, "grad_norm": 0.20503246990709634, "learning_rate": 4.996901250644663e-05, "loss": 0.1563, "step": 90 }, { "epoch": 0.12961762799740764, "grad_norm": 0.18969366345140587, "learning_rate": 4.9887123814116815e-05, "loss": 0.158, "step": 100 }, { "epoch": 0.1425793907971484, "grad_norm": 0.18632569690899564, "learning_rate": 4.9754245551823644e-05, "loss": 0.1564, "step": 110 }, { "epoch": 0.15554115359688916, "grad_norm": 0.18117655622812814, "learning_rate": 4.957064996498616e-05, "loss": 0.154, "step": 120 }, { "epoch": 0.16850291639662995, "grad_norm": 0.16588546566690052, "learning_rate": 4.9336713210370824e-05, "loss": 0.1523, "step": 130 }, { "epoch": 0.18146467919637072, "grad_norm": 0.18912029718121498, "learning_rate": 4.905291458540893e-05, "loss": 0.1511, "step": 140 }, { "epoch": 0.19442644199611148, "grad_norm": 0.17373091061587545, "learning_rate": 4.871983554619603e-05, "loss": 0.1503, "step": 150 }, { "epoch": 0.20738820479585224, "grad_norm": 0.17290195615203574, "learning_rate": 4.833815851618534e-05, "loss": 0.1514, "step": 160 }, { "epoch": 0.220349967595593, "grad_norm": 0.17919206762895043, "learning_rate": 4.7908665488015724e-05, "loss": 0.15, "step": 170 }, { "epoch": 0.23331173039533376, "grad_norm": 0.16987143452586112, "learning_rate": 4.7432236421339085e-05, "loss": 0.1481, "step": 180 }, { "epoch": 0.24627349319507452, "grad_norm": 0.18456595982528615, "learning_rate": 4.690984743992968e-05, "loss": 0.1479, "step": 190 }, { "epoch": 0.2592352559948153, "grad_norm": 0.16935904456944317, "learning_rate": 4.6342568831769154e-05, "loss": 0.1476, "step": 200 }, { "epoch": 0.27219701879455604, "grad_norm": 0.1686721280050141, "learning_rate": 4.5731562856204766e-05, "loss": 0.1456, "step": 210 }, { "epoch": 0.2851587815942968, "grad_norm": 0.17009760438578003, "learning_rate": 4.507808136267367e-05, "loss": 0.1464, "step": 220 }, { "epoch": 0.29812054439403757, "grad_norm": 0.16186269370223397, "learning_rate": 4.4383463225872e-05, "loss": 0.1454, "step": 230 }, { "epoch": 0.31108230719377833, "grad_norm": 0.1567841142418867, "learning_rate": 4.3649131602623684e-05, "loss": 0.1461, "step": 240 }, { "epoch": 0.32404406999351915, "grad_norm": 0.15669189958261343, "learning_rate": 4.2876591016069276e-05, "loss": 0.1425, "step": 250 }, { "epoch": 0.3370058327932599, "grad_norm": 0.15932597917665003, "learning_rate": 4.206742427314869e-05, "loss": 0.142, "step": 260 }, { "epoch": 0.34996759559300067, "grad_norm": 0.1619448915136352, "learning_rate": 4.122328922169354e-05, "loss": 0.1439, "step": 270 }, { "epoch": 0.36292935839274143, "grad_norm": 0.15820361686037115, "learning_rate": 4.034591535377315e-05, "loss": 0.1431, "step": 280 }, { "epoch": 0.3758911211924822, "grad_norm": 0.15375142675453407, "learning_rate": 3.9437100262253444e-05, "loss": 0.1439, "step": 290 }, { "epoch": 0.38885288399222295, "grad_norm": 0.15083396004850522, "learning_rate": 3.849870595782879e-05, "loss": 0.1394, "step": 300 }, { "epoch": 0.4018146467919637, "grad_norm": 0.15309832243642346, "learning_rate": 3.7532655054072175e-05, "loss": 0.1402, "step": 310 }, { "epoch": 0.4147764095917045, "grad_norm": 0.15212599623263634, "learning_rate": 3.65409268283205e-05, "loss": 0.1394, "step": 320 }, { "epoch": 0.42773817239144524, "grad_norm": 0.16388840607478222, "learning_rate": 3.5525553166464995e-05, "loss": 0.1393, "step": 330 }, { "epoch": 0.440699935191186, "grad_norm": 0.14965585253954922, "learning_rate": 3.4488614399955655e-05, "loss": 0.1401, "step": 340 }, { "epoch": 0.45366169799092676, "grad_norm": 0.14839886982769007, "learning_rate": 3.343223504354868e-05, "loss": 0.1388, "step": 350 }, { "epoch": 0.4666234607906675, "grad_norm": 0.15069527899368187, "learning_rate": 3.2358579442529756e-05, "loss": 0.1397, "step": 360 }, { "epoch": 0.4795852235904083, "grad_norm": 0.16024683724642488, "learning_rate": 3.1269847338331195e-05, "loss": 0.1393, "step": 370 }, { "epoch": 0.49254698639014904, "grad_norm": 0.15567268677689633, "learning_rate": 3.016826936162822e-05, "loss": 0.1372, "step": 380 }, { "epoch": 0.5055087491898899, "grad_norm": 0.152409091505418, "learning_rate": 2.905610246214846e-05, "loss": 0.1379, "step": 390 }, { "epoch": 0.5184705119896306, "grad_norm": 0.15469090492982207, "learning_rate": 2.7935625284557933e-05, "loss": 0.1363, "step": 400 }, { "epoch": 0.5314322747893714, "grad_norm": 0.15396385907985133, "learning_rate": 2.6809133499897853e-05, "loss": 0.1355, "step": 410 }, { "epoch": 0.5443940375891121, "grad_norm": 0.1509398233656185, "learning_rate": 2.567893510213716e-05, "loss": 0.1357, "step": 420 }, { "epoch": 0.5573558003888529, "grad_norm": 0.1517027742189059, "learning_rate": 2.4547345679477424e-05, "loss": 0.1356, "step": 430 }, { "epoch": 0.5703175631885936, "grad_norm": 0.1445296123427847, "learning_rate": 2.3416683670098457e-05, "loss": 0.1335, "step": 440 }, { "epoch": 0.5832793259883344, "grad_norm": 0.14623295764555178, "learning_rate": 2.22892656120648e-05, "loss": 0.134, "step": 450 }, { "epoch": 0.5962410887880751, "grad_norm": 0.1438322715440882, "learning_rate": 2.1167401397125193e-05, "loss": 0.1311, "step": 460 }, { "epoch": 0.609202851587816, "grad_norm": 0.13535718612249212, "learning_rate": 2.0053389538129257e-05, "loss": 0.1324, "step": 470 }, { "epoch": 0.6221646143875567, "grad_norm": 0.14161349156500758, "learning_rate": 1.8949512459757668e-05, "loss": 0.1326, "step": 480 }, { "epoch": 0.6351263771872975, "grad_norm": 0.15638548124890905, "learning_rate": 1.7858031822214284e-05, "loss": 0.1316, "step": 490 }, { "epoch": 0.6480881399870383, "grad_norm": 0.13753009005968514, "learning_rate": 1.678118388746118e-05, "loss": 0.1318, "step": 500 }, { "epoch": 0.661049902786779, "grad_norm": 0.13951872960880884, "learning_rate": 1.5721174937490584e-05, "loss": 0.128, "step": 510 }, { "epoch": 0.6740116655865198, "grad_norm": 0.15062860291892982, "learning_rate": 1.4680176754020627e-05, "loss": 0.1329, "step": 520 }, { "epoch": 0.6869734283862605, "grad_norm": 0.1373383252389213, "learning_rate": 1.3660322168876483e-05, "loss": 0.1314, "step": 530 }, { "epoch": 0.6999351911860013, "grad_norm": 0.1450933429238937, "learning_rate": 1.2663700694173325e-05, "loss": 0.1307, "step": 540 }, { "epoch": 0.712896953985742, "grad_norm": 0.1392322353140692, "learning_rate": 1.1692354241254183e-05, "loss": 0.1305, "step": 550 }, { "epoch": 0.7258587167854829, "grad_norm": 0.14054841459970466, "learning_rate": 1.0748272937153824e-05, "loss": 0.1312, "step": 560 }, { "epoch": 0.7388204795852236, "grad_norm": 0.14860896958457903, "learning_rate": 9.83339104716002e-06, "loss": 0.1292, "step": 570 }, { "epoch": 0.7517822423849644, "grad_norm": 0.13529195006367872, "learning_rate": 8.949583011826313e-06, "loss": 0.1289, "step": 580 }, { "epoch": 0.7647440051847051, "grad_norm": 0.1324125611532461, "learning_rate": 8.098659606555617e-06, "loss": 0.1281, "step": 590 }, { "epoch": 0.7777057679844459, "grad_norm": 0.13800340855022483, "learning_rate": 7.282364231623137e-06, "loss": 0.1276, "step": 600 }, { "epoch": 0.7906675307841866, "grad_norm": 0.13855130020381573, "learning_rate": 6.502369340239678e-06, "loss": 0.1275, "step": 610 }, { "epoch": 0.8036292935839274, "grad_norm": 0.1357606689573841, "learning_rate": 5.76027301197371e-06, "loss": 0.1296, "step": 620 }, { "epoch": 0.8165910563836681, "grad_norm": 0.137168244596666, "learning_rate": 5.057595678552596e-06, "loss": 0.1255, "step": 630 }, { "epoch": 0.829552819183409, "grad_norm": 0.13260125486622454, "learning_rate": 4.395777008751317e-06, "loss": 0.1277, "step": 640 }, { "epoch": 0.8425145819831497, "grad_norm": 0.1322898739724209, "learning_rate": 3.776172958751012e-06, "loss": 0.1276, "step": 650 }, { "epoch": 0.8554763447828905, "grad_norm": 0.14011701850297203, "learning_rate": 3.2000529940107353e-06, "loss": 0.1289, "step": 660 }, { "epoch": 0.8684381075826313, "grad_norm": 0.14213930494271382, "learning_rate": 2.668597488344232e-06, "loss": 0.1282, "step": 670 }, { "epoch": 0.881399870382372, "grad_norm": 0.14207443413888324, "learning_rate": 2.1828953055306468e-06, "loss": 0.126, "step": 680 }, { "epoch": 0.8943616331821128, "grad_norm": 0.14029242432499878, "learning_rate": 1.7439415684141063e-06, "loss": 0.1283, "step": 690 }, { "epoch": 0.9073233959818535, "grad_norm": 0.13420810652236514, "learning_rate": 1.3526356200628005e-06, "loss": 0.1275, "step": 700 }, { "epoch": 0.9202851587815943, "grad_norm": 0.13435026638625558, "learning_rate": 1.009779181164891e-06, "loss": 0.1287, "step": 710 }, { "epoch": 0.933246921581335, "grad_norm": 0.1366326068760381, "learning_rate": 7.160747074363927e-07, "loss": 0.1276, "step": 720 }, { "epoch": 0.9462086843810759, "grad_norm": 0.1369464774984695, "learning_rate": 4.7212395040647783e-07, "loss": 0.1274, "step": 730 }, { "epoch": 0.9591704471808166, "grad_norm": 0.13491429289666754, "learning_rate": 2.784267245288408e-07, "loss": 0.128, "step": 740 }, { "epoch": 0.9721322099805574, "grad_norm": 0.14545746024987635, "learning_rate": 1.3537988314516748e-07, "loss": 0.128, "step": 750 }, { "epoch": 0.9850939727802981, "grad_norm": 0.14001699440802526, "learning_rate": 4.3276505398764935e-08, "loss": 0.1261, "step": 760 }, { "epoch": 0.9980557355800389, "grad_norm": 0.1401206033433944, "learning_rate": 2.3052957642238915e-09, "loss": 0.1234, "step": 770 }, { "epoch": 1.0, "step": 772, "total_flos": 190452488863744.0, "train_loss": 0.1484613320138788, "train_runtime": 5530.0522, "train_samples_per_second": 71.428, "train_steps_per_second": 0.14 } ], "logging_steps": 10, "max_steps": 772, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 190452488863744.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }