{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9932104752667313, "eval_steps": 1024, "global_step": 21504, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011823934229365849, "grad_norm": 0.5449081659317017, "learning_rate": 0.000498046875, "loss": 9.256452560424805, "step": 256 }, { "epoch": 0.023647868458731697, "grad_norm": 0.5607944130897522, "learning_rate": 0.000998046875, "loss": 5.257084369659424, "step": 512 }, { "epoch": 0.03547180268809755, "grad_norm": 0.15684783458709717, "learning_rate": 0.000999640996023194, "loss": 1.0911091566085815, "step": 768 }, { "epoch": 0.047295736917463395, "grad_norm": 0.0985269844532013, "learning_rate": 0.0009985588674043958, "loss": 0.41666868329048157, "step": 1024 }, { "epoch": 0.047295736917463395, "eval_bleu": 0.27265830639135136, "eval_ce_loss": 0.3059718095480579, "eval_loss": 0.3059718095480579, "step": 1024 }, { "epoch": 0.047295736917463395, "eval_bleu": 0.27265830639135136, "eval_ce_loss": 0.3059718095480579, "eval_loss": 0.3059718095480579, "eval_runtime": 113.3557, "eval_samples_per_second": 246.948, "eval_steps_per_second": 3.864, "step": 1024 }, { "epoch": 0.05911967114682925, "grad_norm": 0.0791890025138855, "learning_rate": 0.0009967551747861387, "loss": 0.24503706395626068, "step": 1280 }, { "epoch": 0.0709436053761951, "grad_norm": 0.07241348922252655, "learning_rate": 0.000994232528651847, "loss": 0.16681283712387085, "step": 1536 }, { "epoch": 0.08276753960556095, "grad_norm": 0.06362218409776688, "learning_rate": 0.0009909945800260092, "loss": 0.12249665707349777, "step": 1792 }, { "epoch": 0.09459147383492679, "grad_norm": 0.06142408400774002, "learning_rate": 0.0009870460151900522, "loss": 0.09425020217895508, "step": 2048 }, { "epoch": 0.09459147383492679, "eval_bleu": 0.2645654621946291, "eval_ce_loss": 0.0849783036211446, "eval_loss": 0.0849783036211446, "step": 2048 }, { "epoch": 0.09459147383492679, "eval_bleu": 0.2645654621946291, "eval_ce_loss": 0.0849783036211446, "eval_loss": 0.0849783036211446, "eval_runtime": 107.8993, "eval_samples_per_second": 259.436, "eval_steps_per_second": 4.059, "step": 2048 }, { "epoch": 0.10641540806429264, "grad_norm": 0.058095596730709076, "learning_rate": 0.0009823925488998885, "loss": 0.07650011032819748, "step": 2304 }, { "epoch": 0.1182393422936585, "grad_norm": 0.05368533730506897, "learning_rate": 0.0009770409161149525, "loss": 0.06166598200798035, "step": 2560 }, { "epoch": 0.13006327652302435, "grad_norm": 0.04492728039622307, "learning_rate": 0.0009709988622506973, "loss": 0.051953624933958054, "step": 2816 }, { "epoch": 0.1418872107523902, "grad_norm": 0.038033194839954376, "learning_rate": 0.000964275131968659, "loss": 0.0445532500743866, "step": 3072 }, { "epoch": 0.1418872107523902, "eval_bleu": 0.26919279555175596, "eval_ce_loss": 0.04093987658858027, "eval_loss": 0.04093987658858027, "step": 3072 }, { "epoch": 0.1418872107523902, "eval_bleu": 0.26919279555175596, "eval_ce_loss": 0.04093987658858027, "eval_loss": 0.04093987658858027, "eval_runtime": 110.2876, "eval_samples_per_second": 253.818, "eval_steps_per_second": 3.971, "step": 3072 }, { "epoch": 0.15371114498175603, "grad_norm": 0.036269549280405045, "learning_rate": 0.0009568794565203123, "loss": 0.03803830221295357, "step": 3328 }, { "epoch": 0.1655350792111219, "grad_norm": 0.03661005198955536, "learning_rate": 0.0009488225396630347, "loss": 0.03269872069358826, "step": 3584 }, { "epoch": 0.17735901344048774, "grad_norm": 0.03384782001376152, "learning_rate": 0.0009401160421685646, "loss": 0.02887391857802868, "step": 3840 }, { "epoch": 0.18918294766985358, "grad_norm": 0.03782425448298454, "learning_rate": 0.0009307725649463714, "loss": 0.025456363335251808, "step": 4096 }, { "epoch": 0.18918294766985358, "eval_bleu": 0.26767754742363126, "eval_ce_loss": 0.023863823824991647, "eval_loss": 0.023863823824991647, "step": 4096 }, { "epoch": 0.18918294766985358, "eval_bleu": 0.26767754742363126, "eval_ce_loss": 0.023863823824991647, "eval_loss": 0.023863823824991647, "eval_runtime": 110.9469, "eval_samples_per_second": 252.31, "eval_steps_per_second": 3.948, "step": 4096 }, { "epoch": 0.20100688189921945, "grad_norm": 0.043788664042949677, "learning_rate": 0.0009208056308063659, "loss": 0.022065965458750725, "step": 4352 }, { "epoch": 0.2128308161285853, "grad_norm": 0.04368242621421814, "learning_rate": 0.0009102296648873445, "loss": 0.020814381539821625, "step": 4608 }, { "epoch": 0.22465475035795113, "grad_norm": 0.024551063776016235, "learning_rate": 0.0008990599737794927, "loss": 0.01824565976858139, "step": 4864 }, { "epoch": 0.236478684587317, "grad_norm": 0.02257447876036167, "learning_rate": 0.0008873127233711644, "loss": 0.016040779650211334, "step": 5120 }, { "epoch": 0.236478684587317, "eval_bleu": 0.27054794345363903, "eval_ce_loss": 0.01571538393092278, "eval_loss": 0.01571538393092278, "step": 5120 }, { "epoch": 0.236478684587317, "eval_bleu": 0.27054794345363903, "eval_ce_loss": 0.01571538393092278, "eval_loss": 0.01571538393092278, "eval_runtime": 111.5952, "eval_samples_per_second": 250.844, "eval_steps_per_second": 3.925, "step": 5120 }, { "epoch": 0.24830261881668284, "grad_norm": 0.021084846928715706, "learning_rate": 0.0008750049154520011, "loss": 0.014501616358757019, "step": 5376 }, { "epoch": 0.2601265530460487, "grad_norm": 0.025962376967072487, "learning_rate": 0.0008621543631062487, "loss": 0.013666299171745777, "step": 5632 }, { "epoch": 0.27195048727541454, "grad_norm": 0.058145921677351, "learning_rate": 0.0008487796649318904, "loss": 0.012593724764883518, "step": 5888 }, { "epoch": 0.2837744215047804, "grad_norm": 0.02390468120574951, "learning_rate": 0.0008349001781229053, "loss": 0.011387365870177746, "step": 6144 }, { "epoch": 0.2837744215047804, "eval_bleu": 0.2698466629682675, "eval_ce_loss": 0.010894727121324164, "eval_loss": 0.010894727121324164, "step": 6144 }, { "epoch": 0.2837744215047804, "eval_bleu": 0.2698466629682675, "eval_ce_loss": 0.010894727121324164, "eval_loss": 0.010894727121324164, "eval_runtime": 111.9404, "eval_samples_per_second": 250.071, "eval_steps_per_second": 3.913, "step": 6144 }, { "epoch": 0.2955983557341462, "grad_norm": 0.01789081282913685, "learning_rate": 0.0008205359904536107, "loss": 0.010051091201603413, "step": 6400 }, { "epoch": 0.30742228996351206, "grad_norm": 0.024055052548646927, "learning_rate": 0.0008057078912056363, "loss": 0.009441766887903214, "step": 6656 }, { "epoch": 0.3192462241928779, "grad_norm": 0.025855517014861107, "learning_rate": 0.0007904373410796086, "loss": 0.009011849761009216, "step": 6912 }, { "epoch": 0.3310701584222438, "grad_norm": 0.02206815779209137, "learning_rate": 0.0007747464411350876, "loss": 0.007886786945164204, "step": 7168 }, { "epoch": 0.3310701584222438, "eval_bleu": 0.26735583207895347, "eval_ce_loss": 0.007830630816449092, "eval_loss": 0.007830630816449092, "step": 7168 }, { "epoch": 0.3310701584222438, "eval_bleu": 0.26735583207895347, "eval_ce_loss": 0.007830630816449092, "eval_loss": 0.007830630816449092, "eval_runtime": 115.3929, "eval_samples_per_second": 242.589, "eval_steps_per_second": 3.796, "step": 7168 }, { "epoch": 0.34289409265160964, "grad_norm": 0.020352263003587723, "learning_rate": 0.000758657900803716, "loss": 0.007837384939193726, "step": 7424 }, { "epoch": 0.3547180268809755, "grad_norm": 0.028763771057128906, "learning_rate": 0.000742195005021869, "loss": 0.007105502299964428, "step": 7680 }, { "epoch": 0.3665419611103413, "grad_norm": 0.011777768842875957, "learning_rate": 0.0007253815805303786, "loss": 0.00655593303963542, "step": 7936 }, { "epoch": 0.37836589533970716, "grad_norm": 0.014801163226366043, "learning_rate": 0.0007082419613901028, "loss": 0.006127453874796629, "step": 8192 }, { "epoch": 0.37836589533970716, "eval_bleu": 0.26779506983265766, "eval_ce_loss": 0.006108720087580229, "eval_loss": 0.006108720087580229, "step": 8192 }, { "epoch": 0.37836589533970716, "eval_bleu": 0.26779506983265766, "eval_ce_loss": 0.006108720087580229, "eval_loss": 0.006108720087580229, "eval_runtime": 111.3262, "eval_samples_per_second": 251.45, "eval_steps_per_second": 3.934, "step": 8192 }, { "epoch": 0.390189829569073, "grad_norm": 0.012766832485795021, "learning_rate": 0.0006908009537632514, "loss": 0.005777300801128149, "step": 8448 }, { "epoch": 0.4020137637984389, "grad_norm": 0.010622252710163593, "learning_rate": 0.0006730838000114403, "loss": 0.005370937753468752, "step": 8704 }, { "epoch": 0.41383769802780473, "grad_norm": 0.018417516723275185, "learning_rate": 0.0006551161421624341, "loss": 0.004708444699645042, "step": 8960 }, { "epoch": 0.4256616322571706, "grad_norm": 0.01107096392661333, "learning_rate": 0.0006369239847984517, "loss": 0.004844233393669128, "step": 9216 }, { "epoch": 0.4256616322571706, "eval_bleu": 0.27117611081612686, "eval_ce_loss": 0.004634970387913814, "eval_loss": 0.004634970387913814, "step": 9216 }, { "epoch": 0.4256616322571706, "eval_bleu": 0.27117611081612686, "eval_ce_loss": 0.004634970387913814, "eval_loss": 0.004634970387913814, "eval_runtime": 110.365, "eval_samples_per_second": 253.64, "eval_steps_per_second": 3.969, "step": 9216 }, { "epoch": 0.4374855664865364, "grad_norm": 0.016654323786497116, "learning_rate": 0.0006185336574197479, "loss": 0.004370348993688822, "step": 9472 }, { "epoch": 0.44930950071590225, "grad_norm": 0.009538416750729084, "learning_rate": 0.0005999717763379407, "loss": 0.004241208545863628, "step": 9728 }, { "epoch": 0.4611334349452681, "grad_norm": 0.019346073269844055, "learning_rate": 0.0005812652061542363, "loss": 0.004045420326292515, "step": 9984 }, { "epoch": 0.472957369174634, "grad_norm": 0.009237069636583328, "learning_rate": 0.0005624410208783071, "loss": 0.0038289830554276705, "step": 10240 }, { "epoch": 0.472957369174634, "eval_bleu": 0.2709327535478657, "eval_ce_loss": 0.0036890122933966028, "eval_loss": 0.0036890122933966028, "step": 10240 }, { "epoch": 0.472957369174634, "eval_bleu": 0.2709327535478657, "eval_ce_loss": 0.0036890122933966028, "eval_loss": 0.0036890122933966028, "eval_runtime": 109.8588, "eval_samples_per_second": 254.809, "eval_steps_per_second": 3.987, "step": 10240 }, { "epoch": 0.48478130340399983, "grad_norm": 0.018487900495529175, "learning_rate": 0.0005435264647440881, "loss": 0.0035509562585502863, "step": 10496 }, { "epoch": 0.49660523763336567, "grad_norm": 0.007689731661230326, "learning_rate": 0.000524548912779213, "loss": 0.003250380977988243, "step": 10752 }, { "epoch": 0.5084291718627315, "grad_norm": 0.013474254868924618, "learning_rate": 0.0005055358311851499, "loss": 0.003267573891207576, "step": 11008 }, { "epoch": 0.5202531060920974, "grad_norm": 0.0173040684312582, "learning_rate": 0.0004865147375853812, "loss": 0.0030927686020731926, "step": 11264 }, { "epoch": 0.5202531060920974, "eval_bleu": 0.27271556678043923, "eval_ce_loss": 0.0031219599181127023, "eval_loss": 0.0031219599181127023, "step": 11264 }, { "epoch": 0.5202531060920974, "eval_bleu": 0.27271556678043923, "eval_ce_loss": 0.0031219599181127023, "eval_loss": 0.0031219599181127023, "eval_runtime": 109.1112, "eval_samples_per_second": 256.555, "eval_steps_per_second": 4.014, "step": 11264 }, { "epoch": 0.5320770403214632, "grad_norm": 0.01329875085502863, "learning_rate": 0.0004675131611991607, "loss": 0.002897108905017376, "step": 11520 }, { "epoch": 0.5439009745508291, "grad_norm": 0.01354902796447277, "learning_rate": 0.0004485586029984899, "loss": 0.003055332228541374, "step": 11776 }, { "epoch": 0.5557249087801949, "grad_norm": 0.008980591781437397, "learning_rate": 0.00042967849590597266, "loss": 0.0028114793822169304, "step": 12032 }, { "epoch": 0.5675488430095608, "grad_norm": 0.006398347206413746, "learning_rate": 0.0004109001650911621, "loss": 0.002583935856819153, "step": 12288 }, { "epoch": 0.5675488430095608, "eval_bleu": 0.2711715005619301, "eval_ce_loss": 0.002555297637415718, "eval_loss": 0.002555297637415718, "step": 12288 }, { "epoch": 0.5675488430095608, "eval_bleu": 0.2711715005619301, "eval_ce_loss": 0.002555297637415718, "eval_loss": 0.002555297637415718, "eval_runtime": 110.1449, "eval_samples_per_second": 254.147, "eval_steps_per_second": 3.977, "step": 12288 }, { "epoch": 0.5793727772389267, "grad_norm": 0.006243300624191761, "learning_rate": 0.0003922507884228551, "loss": 0.002527546603232622, "step": 12544 }, { "epoch": 0.5911967114682924, "grad_norm": 0.013985877856612206, "learning_rate": 0.00037375735713457723, "loss": 0.002268948359414935, "step": 12800 }, { "epoch": 0.6030206456976583, "grad_norm": 0.007294178940355778, "learning_rate": 0.00035544663676018276, "loss": 0.0022150948643684387, "step": 13056 }, { "epoch": 0.6148445799270241, "grad_norm": 0.00554023077711463, "learning_rate": 0.00033734512839611255, "loss": 0.0021498501300811768, "step": 13312 }, { "epoch": 0.6148445799270241, "eval_bleu": 0.2683290253865085, "eval_ce_loss": 0.0022093608622775693, "eval_loss": 0.0022093608622775693, "step": 13312 }, { "epoch": 0.6148445799270241, "eval_bleu": 0.2683290253865085, "eval_ce_loss": 0.0022093608622775693, "eval_loss": 0.0022093608622775693, "eval_runtime": 110.7199, "eval_samples_per_second": 252.827, "eval_steps_per_second": 3.956, "step": 13312 }, { "epoch": 0.62666851415639, "grad_norm": 0.013757260516285896, "learning_rate": 0.0003194790303463687, "loss": 0.0020817620679736137, "step": 13568 }, { "epoch": 0.6384924483857558, "grad_norm": 0.019224034622311592, "learning_rate": 0.00030187420020572406, "loss": 0.002039685845375061, "step": 13824 }, { "epoch": 0.6503163826151217, "grad_norm": 0.01222301833331585, "learning_rate": 0.00028455611743603626, "loss": 0.0020018373616039753, "step": 14080 }, { "epoch": 0.6621403168444876, "grad_norm": 0.021646995097398758, "learning_rate": 0.0002675498464898373, "loss": 0.00218460732139647, "step": 14336 }, { "epoch": 0.6621403168444876, "eval_bleu": 0.26793733719293295, "eval_ce_loss": 0.0019528514221664114, "eval_loss": 0.0019528514221664114, "step": 14336 }, { "epoch": 0.6621403168444876, "eval_bleu": 0.26793733719293295, "eval_ce_loss": 0.0019528514221664114, "eval_loss": 0.0019528514221664114, "eval_runtime": 110.9086, "eval_samples_per_second": 252.397, "eval_steps_per_second": 3.949, "step": 14336 }, { "epoch": 0.6739642510738534, "grad_norm": 0.006785638630390167, "learning_rate": 0.0002508800005345623, "loss": 0.001926972414366901, "step": 14592 }, { "epoch": 0.6857881853032193, "grad_norm": 0.008890391327440739, "learning_rate": 0.00023457070582992562, "loss": 0.001943480921909213, "step": 14848 }, { "epoch": 0.6976121195325851, "grad_norm": 0.006759077310562134, "learning_rate": 0.00021864556680999692, "loss": 0.00178119249176234, "step": 15104 }, { "epoch": 0.709436053761951, "grad_norm": 0.006032236386090517, "learning_rate": 0.0002031276319205152, "loss": 0.0017275057034566998, "step": 15360 }, { "epoch": 0.709436053761951, "eval_bleu": 0.27057655967695254, "eval_ce_loss": 0.0017077088214219951, "eval_loss": 0.0017077088214219951, "step": 15360 }, { "epoch": 0.709436053761951, "eval_bleu": 0.27057655967695254, "eval_ce_loss": 0.0017077088214219951, "eval_loss": 0.0017077088214219951, "eval_runtime": 111.5469, "eval_samples_per_second": 250.953, "eval_steps_per_second": 3.927, "step": 15360 }, { "epoch": 0.7212599879913169, "grad_norm": 0.008526836521923542, "learning_rate": 0.00018803936026088542, "loss": 0.001562677789479494, "step": 15616 }, { "epoch": 0.7330839222206826, "grad_norm": 0.007385567296296358, "learning_rate": 0.00017340258907913464, "loss": 0.0016144757391884923, "step": 15872 }, { "epoch": 0.7449078564500485, "grad_norm": 0.0050244200974702835, "learning_rate": 0.0001592385021668743, "loss": 0.0016120458021759987, "step": 16128 }, { "epoch": 0.7567317906794143, "grad_norm": 0.005648311693221331, "learning_rate": 0.0001455675992000087, "loss": 0.001692429999820888, "step": 16384 }, { "epoch": 0.7567317906794143, "eval_bleu": 0.2685717071524248, "eval_ce_loss": 0.0015959215974420593, "eval_loss": 0.0015959215974420593, "step": 16384 }, { "epoch": 0.7567317906794143, "eval_bleu": 0.2685717071524248, "eval_ce_loss": 0.0015959215974420593, "eval_loss": 0.0015959215974420593, "eval_runtime": 109.7001, "eval_samples_per_second": 255.177, "eval_steps_per_second": 3.993, "step": 16384 }, { "epoch": 0.7685557249087802, "grad_norm": 0.00418821582570672, "learning_rate": 0.000132409666069565, "loss": 0.001514198025688529, "step": 16640 }, { "epoch": 0.780379659138146, "grad_norm": 0.008380233310163021, "learning_rate": 0.0001197837462455823, "loss": 0.0014031081227585673, "step": 16896 }, { "epoch": 0.7922035933675119, "grad_norm": 0.008456946350634098, "learning_rate": 0.00010770811321550749, "loss": 0.0014507079031318426, "step": 17152 }, { "epoch": 0.8040275275968778, "grad_norm": 0.01046363078057766, "learning_rate": 9.620024403698591e-05, "loss": 0.0016408010851591825, "step": 17408 }, { "epoch": 0.8040275275968778, "eval_bleu": 0.2652117482429029, "eval_ce_loss": 0.0014940057051086057, "eval_loss": 0.0014940057051086057, "step": 17408 }, { "epoch": 0.8040275275968778, "eval_bleu": 0.2652117482429029, "eval_ce_loss": 0.0014940057051086057, "eval_loss": 0.0014940057051086057, "eval_runtime": 109.1029, "eval_samples_per_second": 256.574, "eval_steps_per_second": 4.015, "step": 17408 }, { "epoch": 0.8158514618262436, "grad_norm": 0.003936219960451126, "learning_rate": 8.527679404332429e-05, "loss": 0.001487646484747529, "step": 17664 }, { "epoch": 0.8276753960556095, "grad_norm": 0.030263634398579597, "learning_rate": 7.495357273823544e-05, "loss": 0.0014763937797397375, "step": 17920 }, { "epoch": 0.8394993302849753, "grad_norm": 0.021031692624092102, "learning_rate": 6.524552091475183e-05, "loss": 0.001435705809853971, "step": 18176 }, { "epoch": 0.8513232645143411, "grad_norm": 0.010445632040500641, "learning_rate": 5.6166689031422024e-05, "loss": 0.001405209768563509, "step": 18432 }, { "epoch": 0.8513232645143411, "eval_bleu": 0.2666050549594915, "eval_ce_loss": 0.0014094488985837038, "eval_loss": 0.0014094488985837038, "step": 18432 }, { "epoch": 0.8513232645143411, "eval_bleu": 0.2666050549594915, "eval_ce_loss": 0.0014094488985837038, "eval_loss": 0.0014094488985837038, "eval_runtime": 110.9263, "eval_samples_per_second": 252.357, "eval_steps_per_second": 3.949, "step": 18432 }, { "epoch": 0.8631471987437069, "grad_norm": 0.0071349553763866425, "learning_rate": 4.773021687709067e-05, "loss": 0.0014939571265131235, "step": 18688 }, { "epoch": 0.8749711329730728, "grad_norm": 0.0075807152315974236, "learning_rate": 3.994831455368719e-05, "loss": 0.0016243808204308152, "step": 18944 }, { "epoch": 0.8867950672024387, "grad_norm": 0.003486819099634886, "learning_rate": 3.283224480455282e-05, "loss": 0.0014689115341752768, "step": 19200 }, { "epoch": 0.8986190014318045, "grad_norm": 0.004220427479594946, "learning_rate": 2.639230671387627e-05, "loss": 0.0012729011941701174, "step": 19456 }, { "epoch": 0.8986190014318045, "eval_bleu": 0.2695832110812109, "eval_ce_loss": 0.0013992262525565297, "eval_loss": 0.0013992262525565297, "step": 19456 }, { "epoch": 0.8986190014318045, "eval_bleu": 0.2695832110812109, "eval_ce_loss": 0.0013992262525565297, "eval_loss": 0.0013992262525565297, "eval_runtime": 110.239, "eval_samples_per_second": 253.93, "eval_steps_per_second": 3.973, "step": 19456 }, { "epoch": 0.9104429356611704, "grad_norm": 0.013082730583846569, "learning_rate": 2.063782080083576e-05, "loss": 0.0013599519152194262, "step": 19712 }, { "epoch": 0.9222668698905362, "grad_norm": 0.00748586468398571, "learning_rate": 1.557711553001523e-05, "loss": 0.0015039942227303982, "step": 19968 }, { "epoch": 0.9340908041199021, "grad_norm": 0.003521893871948123, "learning_rate": 1.1217515257622269e-05, "loss": 0.0014425483532249928, "step": 20224 }, { "epoch": 0.945914738349268, "grad_norm": 0.008369974792003632, "learning_rate": 7.565329630950746e-06, "loss": 0.0014663139590993524, "step": 20480 }, { "epoch": 0.945914738349268, "eval_bleu": 0.2669610603654314, "eval_ce_loss": 0.0013518765415306859, "eval_loss": 0.0013518765415306859, "step": 20480 }, { "epoch": 0.945914738349268, "eval_bleu": 0.2669610603654314, "eval_ce_loss": 0.0013518765415306859, "eval_loss": 0.0013518765415306859, "eval_runtime": 112.2477, "eval_samples_per_second": 249.386, "eval_steps_per_second": 3.902, "step": 20480 }, { "epoch": 0.9577386725786338, "grad_norm": 0.008028030395507812, "learning_rate": 4.62584445643166e-06, "loss": 0.0015790105098858476, "step": 20736 }, { "epoch": 0.9695626068079997, "grad_norm": 0.005876564886420965, "learning_rate": 2.40331404948807e-06, "loss": 0.001455229939892888, "step": 20992 }, { "epoch": 0.9813865410373654, "grad_norm": 0.008718357421457767, "learning_rate": 9.009550772663965e-07, "loss": 0.0013052865397185087, "step": 21248 }, { "epoch": 0.9932104752667313, "grad_norm": 0.0048894439823925495, "learning_rate": 1.2094190315575791e-07, "loss": 0.0014210316585376859, "step": 21504 }, { "epoch": 0.9932104752667313, "eval_bleu": 0.2713391630531368, "eval_ce_loss": 0.001370158953104858, "eval_loss": 0.001370158953104858, "step": 21504 }, { "epoch": 0.9932104752667313, "eval_bleu": 0.2713391630531368, "eval_ce_loss": 0.001370158953104858, "eval_loss": 0.001370158953104858, "eval_runtime": 108.048, "eval_samples_per_second": 259.079, "eval_steps_per_second": 4.054, "step": 21504 } ], "logging_steps": 256, "max_steps": 21651, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1024, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }