| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.9934162399414777, |
| "eval_steps": 500, |
| "global_step": 1023, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.029261155815654718, |
| "grad_norm": 1.4050924813037304, |
| "learning_rate": 5e-06, |
| "loss": 0.8017, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.058522311631309436, |
| "grad_norm": 1.3065904000552369, |
| "learning_rate": 5e-06, |
| "loss": 0.7404, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.08778346744696415, |
| "grad_norm": 1.2979239308672723, |
| "learning_rate": 5e-06, |
| "loss": 0.708, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.11704462326261887, |
| "grad_norm": 1.0926191134123135, |
| "learning_rate": 5e-06, |
| "loss": 0.6972, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.14630577907827358, |
| "grad_norm": 1.1489051712650007, |
| "learning_rate": 5e-06, |
| "loss": 0.6873, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.1755669348939283, |
| "grad_norm": 0.9148644352688643, |
| "learning_rate": 5e-06, |
| "loss": 0.6769, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.20482809070958302, |
| "grad_norm": 0.6058128646359505, |
| "learning_rate": 5e-06, |
| "loss": 0.6704, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.23408924652523774, |
| "grad_norm": 0.41717582950682175, |
| "learning_rate": 5e-06, |
| "loss": 0.6652, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.26335040234089246, |
| "grad_norm": 0.4128490966927176, |
| "learning_rate": 5e-06, |
| "loss": 0.6623, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.29261155815654716, |
| "grad_norm": 0.43823761398400346, |
| "learning_rate": 5e-06, |
| "loss": 0.6548, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.3218727139722019, |
| "grad_norm": 0.4120722698771585, |
| "learning_rate": 5e-06, |
| "loss": 0.6632, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.3511338697878566, |
| "grad_norm": 0.3783900899120013, |
| "learning_rate": 5e-06, |
| "loss": 0.6684, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.38039502560351135, |
| "grad_norm": 0.386551090084575, |
| "learning_rate": 5e-06, |
| "loss": 0.6524, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.40965618141916604, |
| "grad_norm": 0.3597889472672021, |
| "learning_rate": 5e-06, |
| "loss": 0.6528, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.4389173372348208, |
| "grad_norm": 0.3450666491165035, |
| "learning_rate": 5e-06, |
| "loss": 0.6484, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.4681784930504755, |
| "grad_norm": 0.38386599901600177, |
| "learning_rate": 5e-06, |
| "loss": 0.6498, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.49743964886613024, |
| "grad_norm": 0.36293707173971257, |
| "learning_rate": 5e-06, |
| "loss": 0.6549, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.5267008046817849, |
| "grad_norm": 0.33541060801231504, |
| "learning_rate": 5e-06, |
| "loss": 0.6548, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.5559619604974396, |
| "grad_norm": 0.3641707608459179, |
| "learning_rate": 5e-06, |
| "loss": 0.6431, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.5852231163130943, |
| "grad_norm": 0.3425809888866301, |
| "learning_rate": 5e-06, |
| "loss": 0.6447, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.6144842721287491, |
| "grad_norm": 0.3411120604972397, |
| "learning_rate": 5e-06, |
| "loss": 0.6515, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.6437454279444038, |
| "grad_norm": 0.34977383465484474, |
| "learning_rate": 5e-06, |
| "loss": 0.6392, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.6730065837600585, |
| "grad_norm": 0.3610432531693289, |
| "learning_rate": 5e-06, |
| "loss": 0.6394, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.7022677395757132, |
| "grad_norm": 0.3260802596280787, |
| "learning_rate": 5e-06, |
| "loss": 0.646, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.731528895391368, |
| "grad_norm": 0.33967500101215686, |
| "learning_rate": 5e-06, |
| "loss": 0.6445, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.7607900512070227, |
| "grad_norm": 0.34323108098698346, |
| "learning_rate": 5e-06, |
| "loss": 0.6472, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.7900512070226774, |
| "grad_norm": 0.38672586052678504, |
| "learning_rate": 5e-06, |
| "loss": 0.6495, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.8193123628383321, |
| "grad_norm": 0.34875877597940846, |
| "learning_rate": 5e-06, |
| "loss": 0.6475, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.8485735186539868, |
| "grad_norm": 0.34020999596385737, |
| "learning_rate": 5e-06, |
| "loss": 0.6403, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.8778346744696416, |
| "grad_norm": 0.34896616600125596, |
| "learning_rate": 5e-06, |
| "loss": 0.632, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.9070958302852963, |
| "grad_norm": 0.35096132599415825, |
| "learning_rate": 5e-06, |
| "loss": 0.6454, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.936356986100951, |
| "grad_norm": 0.33844104268026326, |
| "learning_rate": 5e-06, |
| "loss": 0.6374, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.9656181419166057, |
| "grad_norm": 0.3796088669629025, |
| "learning_rate": 5e-06, |
| "loss": 0.6405, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.9948792977322605, |
| "grad_norm": 0.3566632168663002, |
| "learning_rate": 5e-06, |
| "loss": 0.6366, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.9978054133138259, |
| "eval_loss": 0.640764057636261, |
| "eval_runtime": 172.4584, |
| "eval_samples_per_second": 53.398, |
| "eval_steps_per_second": 0.417, |
| "step": 341 |
| }, |
| { |
| "epoch": 1.024140453547915, |
| "grad_norm": 0.3607362960926969, |
| "learning_rate": 5e-06, |
| "loss": 0.6258, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.0534016093635699, |
| "grad_norm": 0.36780546039151835, |
| "learning_rate": 5e-06, |
| "loss": 0.6076, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.0826627651792247, |
| "grad_norm": 0.3909882574933846, |
| "learning_rate": 5e-06, |
| "loss": 0.609, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.1119239209948792, |
| "grad_norm": 0.36418781193851457, |
| "learning_rate": 5e-06, |
| "loss": 0.607, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.141185076810534, |
| "grad_norm": 0.36669566643557633, |
| "learning_rate": 5e-06, |
| "loss": 0.611, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.1704462326261886, |
| "grad_norm": 0.3740411216691114, |
| "learning_rate": 5e-06, |
| "loss": 0.6136, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.1997073884418434, |
| "grad_norm": 0.38204393447820967, |
| "learning_rate": 5e-06, |
| "loss": 0.6121, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.2289685442574982, |
| "grad_norm": 0.32858410125693777, |
| "learning_rate": 5e-06, |
| "loss": 0.6017, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.2582297000731528, |
| "grad_norm": 0.35352828972116446, |
| "learning_rate": 5e-06, |
| "loss": 0.6115, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.2874908558888076, |
| "grad_norm": 0.3500900972976414, |
| "learning_rate": 5e-06, |
| "loss": 0.6047, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.3167520117044624, |
| "grad_norm": 0.3748496524706886, |
| "learning_rate": 5e-06, |
| "loss": 0.6064, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.346013167520117, |
| "grad_norm": 0.34864950354517366, |
| "learning_rate": 5e-06, |
| "loss": 0.606, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.3752743233357718, |
| "grad_norm": 0.3246951601284818, |
| "learning_rate": 5e-06, |
| "loss": 0.6017, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.4045354791514264, |
| "grad_norm": 0.3706517957917298, |
| "learning_rate": 5e-06, |
| "loss": 0.6061, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.4337966349670812, |
| "grad_norm": 0.34984943751515873, |
| "learning_rate": 5e-06, |
| "loss": 0.6069, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.4630577907827358, |
| "grad_norm": 0.3670659231729917, |
| "learning_rate": 5e-06, |
| "loss": 0.6006, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.4923189465983906, |
| "grad_norm": 0.35461595507153526, |
| "learning_rate": 5e-06, |
| "loss": 0.6053, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.5215801024140454, |
| "grad_norm": 0.40792822804315615, |
| "learning_rate": 5e-06, |
| "loss": 0.6085, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.5508412582297, |
| "grad_norm": 0.3660724485011459, |
| "learning_rate": 5e-06, |
| "loss": 0.608, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.5801024140453548, |
| "grad_norm": 0.3761570864865201, |
| "learning_rate": 5e-06, |
| "loss": 0.6143, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.6093635698610096, |
| "grad_norm": 0.3652441538521536, |
| "learning_rate": 5e-06, |
| "loss": 0.6046, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.6386247256766642, |
| "grad_norm": 0.3626650308681376, |
| "learning_rate": 5e-06, |
| "loss": 0.6067, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.667885881492319, |
| "grad_norm": 0.33256833438265615, |
| "learning_rate": 5e-06, |
| "loss": 0.5992, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.6971470373079738, |
| "grad_norm": 0.34639609892679385, |
| "learning_rate": 5e-06, |
| "loss": 0.6033, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.7264081931236284, |
| "grad_norm": 0.41288603809587293, |
| "learning_rate": 5e-06, |
| "loss": 0.6082, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.755669348939283, |
| "grad_norm": 0.35078522187248407, |
| "learning_rate": 5e-06, |
| "loss": 0.6034, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.784930504754938, |
| "grad_norm": 0.3500706757634982, |
| "learning_rate": 5e-06, |
| "loss": 0.6073, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.8141916605705926, |
| "grad_norm": 0.34791150473187205, |
| "learning_rate": 5e-06, |
| "loss": 0.6073, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.8434528163862471, |
| "grad_norm": 0.35855392054483914, |
| "learning_rate": 5e-06, |
| "loss": 0.6061, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.872713972201902, |
| "grad_norm": 0.3590152390035877, |
| "learning_rate": 5e-06, |
| "loss": 0.6001, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.9019751280175567, |
| "grad_norm": 0.34420267789473347, |
| "learning_rate": 5e-06, |
| "loss": 0.6069, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.9312362838332113, |
| "grad_norm": 0.353252964395989, |
| "learning_rate": 5e-06, |
| "loss": 0.6065, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.9604974396488661, |
| "grad_norm": 0.3601604912833267, |
| "learning_rate": 5e-06, |
| "loss": 0.5951, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.989758595464521, |
| "grad_norm": 0.3837579941787285, |
| "learning_rate": 5e-06, |
| "loss": 0.6092, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.9985369422092174, |
| "eval_loss": 0.6331804394721985, |
| "eval_runtime": 173.0999, |
| "eval_samples_per_second": 53.201, |
| "eval_steps_per_second": 0.416, |
| "step": 683 |
| }, |
| { |
| "epoch": 2.0190197512801755, |
| "grad_norm": 0.37406824558305324, |
| "learning_rate": 5e-06, |
| "loss": 0.5986, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.04828090709583, |
| "grad_norm": 0.3714772768326328, |
| "learning_rate": 5e-06, |
| "loss": 0.5679, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.077542062911485, |
| "grad_norm": 0.34979442520485465, |
| "learning_rate": 5e-06, |
| "loss": 0.5704, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.1068032187271397, |
| "grad_norm": 0.3654154071295783, |
| "learning_rate": 5e-06, |
| "loss": 0.5704, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.1360643745427943, |
| "grad_norm": 0.34335563724610463, |
| "learning_rate": 5e-06, |
| "loss": 0.5712, |
| "step": 730 |
| }, |
| { |
| "epoch": 2.1653255303584493, |
| "grad_norm": 0.36250439290462216, |
| "learning_rate": 5e-06, |
| "loss": 0.5778, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.194586686174104, |
| "grad_norm": 0.3320940524182815, |
| "learning_rate": 5e-06, |
| "loss": 0.5773, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.2238478419897585, |
| "grad_norm": 0.3581411603952458, |
| "learning_rate": 5e-06, |
| "loss": 0.5734, |
| "step": 760 |
| }, |
| { |
| "epoch": 2.2531089978054135, |
| "grad_norm": 0.34614899254145687, |
| "learning_rate": 5e-06, |
| "loss": 0.5731, |
| "step": 770 |
| }, |
| { |
| "epoch": 2.282370153621068, |
| "grad_norm": 0.3567591013456269, |
| "learning_rate": 5e-06, |
| "loss": 0.5719, |
| "step": 780 |
| }, |
| { |
| "epoch": 2.3116313094367227, |
| "grad_norm": 0.35393266140908636, |
| "learning_rate": 5e-06, |
| "loss": 0.572, |
| "step": 790 |
| }, |
| { |
| "epoch": 2.3408924652523773, |
| "grad_norm": 0.34358900874192705, |
| "learning_rate": 5e-06, |
| "loss": 0.5772, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.3701536210680323, |
| "grad_norm": 0.35278690571653865, |
| "learning_rate": 5e-06, |
| "loss": 0.5708, |
| "step": 810 |
| }, |
| { |
| "epoch": 2.399414776883687, |
| "grad_norm": 0.40283847792672317, |
| "learning_rate": 5e-06, |
| "loss": 0.5804, |
| "step": 820 |
| }, |
| { |
| "epoch": 2.4286759326993415, |
| "grad_norm": 0.3475369878896552, |
| "learning_rate": 5e-06, |
| "loss": 0.5735, |
| "step": 830 |
| }, |
| { |
| "epoch": 2.4579370885149965, |
| "grad_norm": 0.36659134111304026, |
| "learning_rate": 5e-06, |
| "loss": 0.5709, |
| "step": 840 |
| }, |
| { |
| "epoch": 2.487198244330651, |
| "grad_norm": 0.3328517816299331, |
| "learning_rate": 5e-06, |
| "loss": 0.5725, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.5164594001463056, |
| "grad_norm": 0.3449210883266256, |
| "learning_rate": 5e-06, |
| "loss": 0.569, |
| "step": 860 |
| }, |
| { |
| "epoch": 2.5457205559619602, |
| "grad_norm": 0.3432206214099153, |
| "learning_rate": 5e-06, |
| "loss": 0.5695, |
| "step": 870 |
| }, |
| { |
| "epoch": 2.5749817117776153, |
| "grad_norm": 0.37139001643486197, |
| "learning_rate": 5e-06, |
| "loss": 0.5757, |
| "step": 880 |
| }, |
| { |
| "epoch": 2.60424286759327, |
| "grad_norm": 0.37849506135493044, |
| "learning_rate": 5e-06, |
| "loss": 0.5727, |
| "step": 890 |
| }, |
| { |
| "epoch": 2.633504023408925, |
| "grad_norm": 0.36195620771283454, |
| "learning_rate": 5e-06, |
| "loss": 0.5797, |
| "step": 900 |
| }, |
| { |
| "epoch": 2.6627651792245794, |
| "grad_norm": 0.3395444260645495, |
| "learning_rate": 5e-06, |
| "loss": 0.5747, |
| "step": 910 |
| }, |
| { |
| "epoch": 2.692026335040234, |
| "grad_norm": 0.3494251583889423, |
| "learning_rate": 5e-06, |
| "loss": 0.5755, |
| "step": 920 |
| }, |
| { |
| "epoch": 2.7212874908558886, |
| "grad_norm": 0.3408552125872173, |
| "learning_rate": 5e-06, |
| "loss": 0.5753, |
| "step": 930 |
| }, |
| { |
| "epoch": 2.7505486466715436, |
| "grad_norm": 0.37863476717768724, |
| "learning_rate": 5e-06, |
| "loss": 0.5764, |
| "step": 940 |
| }, |
| { |
| "epoch": 2.7798098024871982, |
| "grad_norm": 0.3476516038420746, |
| "learning_rate": 5e-06, |
| "loss": 0.576, |
| "step": 950 |
| }, |
| { |
| "epoch": 2.809070958302853, |
| "grad_norm": 0.32769709791000895, |
| "learning_rate": 5e-06, |
| "loss": 0.5754, |
| "step": 960 |
| }, |
| { |
| "epoch": 2.838332114118508, |
| "grad_norm": 0.34446488612800036, |
| "learning_rate": 5e-06, |
| "loss": 0.5743, |
| "step": 970 |
| }, |
| { |
| "epoch": 2.8675932699341624, |
| "grad_norm": 0.3511137393107823, |
| "learning_rate": 5e-06, |
| "loss": 0.5728, |
| "step": 980 |
| }, |
| { |
| "epoch": 2.896854425749817, |
| "grad_norm": 0.3412142074428731, |
| "learning_rate": 5e-06, |
| "loss": 0.568, |
| "step": 990 |
| }, |
| { |
| "epoch": 2.9261155815654716, |
| "grad_norm": 0.3281557966779097, |
| "learning_rate": 5e-06, |
| "loss": 0.5732, |
| "step": 1000 |
| }, |
| { |
| "epoch": 2.9553767373811266, |
| "grad_norm": 0.4011158753397735, |
| "learning_rate": 5e-06, |
| "loss": 0.5647, |
| "step": 1010 |
| }, |
| { |
| "epoch": 2.984637893196781, |
| "grad_norm": 0.33298486513398695, |
| "learning_rate": 5e-06, |
| "loss": 0.5769, |
| "step": 1020 |
| }, |
| { |
| "epoch": 2.9934162399414777, |
| "eval_loss": 0.6332815885543823, |
| "eval_runtime": 172.1406, |
| "eval_samples_per_second": 53.497, |
| "eval_steps_per_second": 0.418, |
| "step": 1023 |
| }, |
| { |
| "epoch": 2.9934162399414777, |
| "step": 1023, |
| "total_flos": 2144724936818688.0, |
| "train_loss": 0.6140779541151731, |
| "train_runtime": 27853.5124, |
| "train_samples_per_second": 18.844, |
| "train_steps_per_second": 0.037 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1023, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2144724936818688.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |