{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 870, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06896551724137931, "grad_norm": 0.7040123343467712, "learning_rate": 6e-05, "loss": 2.4602, "step": 10 }, { "epoch": 0.13793103448275862, "grad_norm": 0.5209792256355286, "learning_rate": 0.00012666666666666666, "loss": 2.082, "step": 20 }, { "epoch": 0.20689655172413793, "grad_norm": 0.5385194420814514, "learning_rate": 0.00019333333333333333, "loss": 1.7231, "step": 30 }, { "epoch": 0.27586206896551724, "grad_norm": 0.39407166838645935, "learning_rate": 0.00019995880424308071, "loss": 1.4061, "step": 40 }, { "epoch": 0.3448275862068966, "grad_norm": 0.41344210505485535, "learning_rate": 0.00019981644273304322, "loss": 1.1996, "step": 50 }, { "epoch": 0.41379310344827586, "grad_norm": 0.7267842292785645, "learning_rate": 0.00019957255165063584, "loss": 1.0567, "step": 60 }, { "epoch": 0.4827586206896552, "grad_norm": 0.4600793719291687, "learning_rate": 0.0001992273790727949, "loss": 0.9303, "step": 70 }, { "epoch": 0.5517241379310345, "grad_norm": 0.5658743381500244, "learning_rate": 0.00019878127609622607, "loss": 0.872, "step": 80 }, { "epoch": 0.6206896551724138, "grad_norm": 0.6658106446266174, "learning_rate": 0.00019823469648028207, "loss": 0.8219, "step": 90 }, { "epoch": 0.6896551724137931, "grad_norm": 0.8075541853904724, "learning_rate": 0.0001975881961854155, "loss": 0.663, "step": 100 }, { "epoch": 0.7586206896551724, "grad_norm": 0.5715014934539795, "learning_rate": 0.00019684243280767633, "loss": 0.612, "step": 110 }, { "epoch": 0.8275862068965517, "grad_norm": 0.6722455024719238, "learning_rate": 0.00019599816490983006, "loss": 0.5205, "step": 120 }, { "epoch": 0.896551724137931, "grad_norm": 0.665215253829956, "learning_rate": 0.0001950562512497755, "loss": 0.4817, "step": 130 }, { "epoch": 0.9655172413793104, "grad_norm": 0.7088465690612793, "learning_rate": 0.00019401764990704842, "loss": 0.4479, "step": 140 }, { "epoch": 1.0, "eval_loss": 0.40697577595710754, "eval_runtime": 125.2582, "eval_samples_per_second": 8.255, "eval_steps_per_second": 2.068, "step": 145 }, { "epoch": 1.0344827586206897, "grad_norm": 0.6929165720939636, "learning_rate": 0.0001928834173082986, "loss": 0.3906, "step": 150 }, { "epoch": 1.103448275862069, "grad_norm": 0.5821729898452759, "learning_rate": 0.00019165470715273197, "loss": 0.3082, "step": 160 }, { "epoch": 1.1724137931034484, "grad_norm": 0.8513327836990356, "learning_rate": 0.0001903327692386107, "loss": 0.3073, "step": 170 }, { "epoch": 1.2413793103448276, "grad_norm": 0.5871995687484741, "learning_rate": 0.0001889189481920048, "loss": 0.2877, "step": 180 }, { "epoch": 1.3103448275862069, "grad_norm": 0.5694791078567505, "learning_rate": 0.0001874146820990887, "loss": 0.2462, "step": 190 }, { "epoch": 1.3793103448275863, "grad_norm": 0.53709876537323, "learning_rate": 0.00018582150104337326, "loss": 0.2469, "step": 200 }, { "epoch": 1.4482758620689655, "grad_norm": 0.5729167461395264, "learning_rate": 0.00018414102554936194, "loss": 0.204, "step": 210 }, { "epoch": 1.5172413793103448, "grad_norm": 0.5852716565132141, "learning_rate": 0.0001823749649342135, "loss": 0.1935, "step": 220 }, { "epoch": 1.5862068965517242, "grad_norm": 0.5843812227249146, "learning_rate": 0.000180525115569088, "loss": 0.2048, "step": 230 }, { "epoch": 1.6551724137931034, "grad_norm": 0.471234530210495, "learning_rate": 0.00017859335905194521, "loss": 0.1982, "step": 240 }, { "epoch": 1.7241379310344827, "grad_norm": 0.45676925778388977, "learning_rate": 0.00017658166029365288, "loss": 0.1631, "step": 250 }, { "epoch": 1.793103448275862, "grad_norm": 0.44925084710121155, "learning_rate": 0.00017449206551935258, "loss": 0.1504, "step": 260 }, { "epoch": 1.8620689655172413, "grad_norm": 0.4521825313568115, "learning_rate": 0.00017232670018711572, "loss": 0.1397, "step": 270 }, { "epoch": 1.9310344827586206, "grad_norm": 0.3619280755519867, "learning_rate": 0.0001700877668260065, "loss": 0.129, "step": 280 }, { "epoch": 2.0, "grad_norm": 0.46234917640686035, "learning_rate": 0.00016777754279575136, "loss": 0.1276, "step": 290 }, { "epoch": 2.0, "eval_loss": 0.14752867817878723, "eval_runtime": 112.8661, "eval_samples_per_second": 9.161, "eval_steps_per_second": 2.295, "step": 290 }, { "epoch": 2.0689655172413794, "grad_norm": 0.4741278886795044, "learning_rate": 0.00016539837797029341, "loss": 0.1097, "step": 300 }, { "epoch": 2.1379310344827585, "grad_norm": 0.46682700514793396, "learning_rate": 0.00016295269234758796, "loss": 0.1014, "step": 310 }, { "epoch": 2.206896551724138, "grad_norm": 0.48358821868896484, "learning_rate": 0.00016044297358807085, "loss": 0.096, "step": 320 }, { "epoch": 2.2758620689655173, "grad_norm": 0.4410136044025421, "learning_rate": 0.00015787177448430253, "loss": 0.1056, "step": 330 }, { "epoch": 2.344827586206897, "grad_norm": 0.46466073393821716, "learning_rate": 0.00015524171036436255, "loss": 0.09, "step": 340 }, { "epoch": 2.413793103448276, "grad_norm": 0.37848514318466187, "learning_rate": 0.00015255545643163516, "loss": 0.095, "step": 350 }, { "epoch": 2.4827586206896552, "grad_norm": 0.3724570870399475, "learning_rate": 0.00014981574504369194, "loss": 0.0897, "step": 360 }, { "epoch": 2.5517241379310347, "grad_norm": 0.4251348078250885, "learning_rate": 0.00014702536293303924, "loss": 0.0953, "step": 370 }, { "epoch": 2.6206896551724137, "grad_norm": 0.33401480317115784, "learning_rate": 0.00014418714837255764, "loss": 0.0856, "step": 380 }, { "epoch": 2.689655172413793, "grad_norm": 0.3629361689090729, "learning_rate": 0.00014130398828851625, "loss": 0.0986, "step": 390 }, { "epoch": 2.7586206896551726, "grad_norm": 0.3115435540676117, "learning_rate": 0.00013837881532409888, "loss": 0.087, "step": 400 }, { "epoch": 2.8275862068965516, "grad_norm": 0.32981187105178833, "learning_rate": 0.00013541460485642825, "loss": 0.0755, "step": 410 }, { "epoch": 2.896551724137931, "grad_norm": 0.4125177264213562, "learning_rate": 0.00013241437197012326, "loss": 0.0765, "step": 420 }, { "epoch": 2.9655172413793105, "grad_norm": 0.3462369740009308, "learning_rate": 0.00012938116839046704, "loss": 0.0754, "step": 430 }, { "epoch": 3.0, "eval_loss": 0.09198899567127228, "eval_runtime": 105.7366, "eval_samples_per_second": 9.779, "eval_steps_per_second": 2.449, "step": 435 }, { "epoch": 3.0344827586206895, "grad_norm": 0.41924241185188293, "learning_rate": 0.0001263180793793054, "loss": 0.0753, "step": 440 }, { "epoch": 3.103448275862069, "grad_norm": 0.30730345845222473, "learning_rate": 0.00012322822059683344, "loss": 0.0608, "step": 450 }, { "epoch": 3.1724137931034484, "grad_norm": 0.2622484564781189, "learning_rate": 0.00012011473493246166, "loss": 0.0537, "step": 460 }, { "epoch": 3.2413793103448274, "grad_norm": 0.4255964756011963, "learning_rate": 0.00011698078930798606, "loss": 0.0608, "step": 470 }, { "epoch": 3.310344827586207, "grad_norm": 0.32836616039276123, "learning_rate": 0.00011382957145631284, "loss": 0.0603, "step": 480 }, { "epoch": 3.3793103448275863, "grad_norm": 0.3328634798526764, "learning_rate": 0.00011066428667901523, "loss": 0.0491, "step": 490 }, { "epoch": 3.4482758620689653, "grad_norm": 0.33855220675468445, "learning_rate": 0.00010748815458601989, "loss": 0.0565, "step": 500 }, { "epoch": 3.5172413793103448, "grad_norm": 0.35493630170822144, "learning_rate": 0.00010430440582073946, "loss": 0.0516, "step": 510 }, { "epoch": 3.586206896551724, "grad_norm": 0.31457582116127014, "learning_rate": 0.00010111627877398236, "loss": 0.0483, "step": 520 }, { "epoch": 3.655172413793103, "grad_norm": 0.2784205377101898, "learning_rate": 9.7927016289982e-05, "loss": 0.0561, "step": 530 }, { "epoch": 3.7241379310344827, "grad_norm": 0.31077679991722107, "learning_rate": 9.473986236789633e-05, "loss": 0.0513, "step": 540 }, { "epoch": 3.793103448275862, "grad_norm": 0.262256383895874, "learning_rate": 9.155805886213265e-05, "loss": 0.0553, "step": 550 }, { "epoch": 3.862068965517241, "grad_norm": 0.2638000547885895, "learning_rate": 8.838484218485358e-05, "loss": 0.0488, "step": 560 }, { "epoch": 3.9310344827586206, "grad_norm": 0.2960629165172577, "learning_rate": 8.522344001401945e-05, "loss": 0.0539, "step": 570 }, { "epoch": 4.0, "grad_norm": 0.3162899911403656, "learning_rate": 8.207706801031408e-05, "loss": 0.0505, "step": 580 }, { "epoch": 4.0, "eval_loss": 0.06869391351938248, "eval_runtime": 107.7637, "eval_samples_per_second": 9.595, "eval_steps_per_second": 2.403, "step": 580 }, { "epoch": 4.068965517241379, "grad_norm": 0.23831477761268616, "learning_rate": 7.894892654629438e-05, "loss": 0.0346, "step": 590 }, { "epoch": 4.137931034482759, "grad_norm": 0.20031996071338654, "learning_rate": 7.584219745109047e-05, "loss": 0.0359, "step": 600 }, { "epoch": 4.206896551724138, "grad_norm": 0.1699780821800232, "learning_rate": 7.276004077396747e-05, "loss": 0.0401, "step": 610 }, { "epoch": 4.275862068965517, "grad_norm": 0.22816252708435059, "learning_rate": 6.970559157004097e-05, "loss": 0.0461, "step": 620 }, { "epoch": 4.344827586206897, "grad_norm": 0.2750411927700043, "learning_rate": 6.668195671141542e-05, "loss": 0.0358, "step": 630 }, { "epoch": 4.413793103448276, "grad_norm": 0.25011658668518066, "learning_rate": 6.369221172698963e-05, "loss": 0.0402, "step": 640 }, { "epoch": 4.482758620689655, "grad_norm": 0.19483405351638794, "learning_rate": 6.073939767414305e-05, "loss": 0.0317, "step": 650 }, { "epoch": 4.551724137931035, "grad_norm": 0.2074318677186966, "learning_rate": 5.782651804548538e-05, "loss": 0.034, "step": 660 }, { "epoch": 4.620689655172414, "grad_norm": 0.19932489097118378, "learning_rate": 5.495653571381554e-05, "loss": 0.0358, "step": 670 }, { "epoch": 4.689655172413794, "grad_norm": 0.19241374731063843, "learning_rate": 5.213236991839781e-05, "loss": 0.0368, "step": 680 }, { "epoch": 4.758620689655173, "grad_norm": 0.21923574805259705, "learning_rate": 4.93568932956201e-05, "loss": 0.0367, "step": 690 }, { "epoch": 4.827586206896552, "grad_norm": 0.24106892943382263, "learning_rate": 4.663292895705526e-05, "loss": 0.0424, "step": 700 }, { "epoch": 4.896551724137931, "grad_norm": 0.21531100571155548, "learning_rate": 4.396324761789672e-05, "loss": 0.0436, "step": 710 }, { "epoch": 4.9655172413793105, "grad_norm": 0.22390952706336975, "learning_rate": 4.1350564778690424e-05, "loss": 0.0415, "step": 720 }, { "epoch": 5.0, "eval_loss": 0.06067777797579765, "eval_runtime": 107.7054, "eval_samples_per_second": 9.6, "eval_steps_per_second": 2.405, "step": 725 }, { "epoch": 5.0344827586206895, "grad_norm": 0.1765316277742386, "learning_rate": 3.879753796322845e-05, "loss": 0.0304, "step": 730 }, { "epoch": 5.103448275862069, "grad_norm": 0.14481499791145325, "learning_rate": 3.630676401541466e-05, "loss": 0.028, "step": 740 }, { "epoch": 5.172413793103448, "grad_norm": 0.2429375946521759, "learning_rate": 3.388077645785186e-05, "loss": 0.0314, "step": 750 }, { "epoch": 5.241379310344827, "grad_norm": 0.1938163936138153, "learning_rate": 3.1522042914836704e-05, "loss": 0.0298, "step": 760 }, { "epoch": 5.310344827586207, "grad_norm": 0.2667997181415558, "learning_rate": 2.923296260238412e-05, "loss": 0.0299, "step": 770 }, { "epoch": 5.379310344827586, "grad_norm": 0.2025449424982071, "learning_rate": 2.7015863887833947e-05, "loss": 0.0299, "step": 780 }, { "epoch": 5.448275862068965, "grad_norm": 0.28111398220062256, "learning_rate": 2.4873001921522444e-05, "loss": 0.0319, "step": 790 }, { "epoch": 5.517241379310345, "grad_norm": 0.17337462306022644, "learning_rate": 2.2806556342927142e-05, "loss": 0.0308, "step": 800 }, { "epoch": 5.586206896551724, "grad_norm": 0.1848144233226776, "learning_rate": 2.0818629063618656e-05, "loss": 0.0307, "step": 810 }, { "epoch": 5.655172413793103, "grad_norm": 0.16859489679336548, "learning_rate": 1.8911242129274498e-05, "loss": 0.0294, "step": 820 }, { "epoch": 5.724137931034483, "grad_norm": 0.20610836148262024, "learning_rate": 1.7086335662929352e-05, "loss": 0.0288, "step": 830 }, { "epoch": 5.793103448275862, "grad_norm": 0.17157162725925446, "learning_rate": 1.5345765891554163e-05, "loss": 0.0263, "step": 840 }, { "epoch": 5.862068965517241, "grad_norm": 0.21943090856075287, "learning_rate": 1.3691303257971033e-05, "loss": 0.0336, "step": 850 }, { "epoch": 5.931034482758621, "grad_norm": 0.17992551624774933, "learning_rate": 1.2124630620024746e-05, "loss": 0.0288, "step": 860 }, { "epoch": 6.0, "grad_norm": 0.2768034040927887, "learning_rate": 1.0647341538842282e-05, "loss": 0.0327, "step": 870 }, { "epoch": 6.0, "eval_loss": 0.05765723064541817, "eval_runtime": 106.1855, "eval_samples_per_second": 9.738, "eval_steps_per_second": 2.439, "step": 870 } ], "logging_steps": 10, "max_steps": 1015, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.3175319780130816e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }