| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.945914738349268, |
| "eval_steps": 4096, |
| "global_step": 20480, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.011823934229365849, |
| "grad_norm": 2.4639666080474854, |
| "learning_rate": 0.000498046875, |
| "loss": 0.1339670717716217, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.023647868458731697, |
| "grad_norm": 1.5375629663467407, |
| "learning_rate": 0.000998046875, |
| "loss": 0.1467132717370987, |
| "step": 512 |
| }, |
| { |
| "epoch": 0.03547180268809755, |
| "grad_norm": 1.506445288658142, |
| "learning_rate": 0.000999640996023194, |
| "loss": 0.15337222814559937, |
| "step": 768 |
| }, |
| { |
| "epoch": 0.047295736917463395, |
| "grad_norm": 1.641593098640442, |
| "learning_rate": 0.0009985588674043958, |
| "loss": 0.15139617025852203, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.05911967114682925, |
| "grad_norm": 1.3137743473052979, |
| "learning_rate": 0.0009967551747861387, |
| "loss": 0.14983117580413818, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.0709436053761951, |
| "grad_norm": 1.3725732564926147, |
| "learning_rate": 0.000994232528651847, |
| "loss": 0.14979030191898346, |
| "step": 1536 |
| }, |
| { |
| "epoch": 0.08276753960556095, |
| "grad_norm": 2.0527191162109375, |
| "learning_rate": 0.0009909945800260092, |
| "loss": 0.14714637398719788, |
| "step": 1792 |
| }, |
| { |
| "epoch": 0.09459147383492679, |
| "grad_norm": 1.8648000955581665, |
| "learning_rate": 0.0009870460151900522, |
| "loss": 0.1472843438386917, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.10641540806429264, |
| "grad_norm": 1.025286078453064, |
| "learning_rate": 0.0009823925488998885, |
| "loss": 0.1452621966600418, |
| "step": 2304 |
| }, |
| { |
| "epoch": 0.1182393422936585, |
| "grad_norm": 0.9860194325447083, |
| "learning_rate": 0.0009770409161149525, |
| "loss": 0.1464366316795349, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.13006327652302435, |
| "grad_norm": 1.4449650049209595, |
| "learning_rate": 0.0009709988622506973, |
| "loss": 0.1476413458585739, |
| "step": 2816 |
| }, |
| { |
| "epoch": 0.1418872107523902, |
| "grad_norm": 1.548802375793457, |
| "learning_rate": 0.000964275131968659, |
| "loss": 0.14479056000709534, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.15371114498175603, |
| "grad_norm": 0.9572728872299194, |
| "learning_rate": 0.0009568794565203123, |
| "loss": 0.14605838060379028, |
| "step": 3328 |
| }, |
| { |
| "epoch": 0.1655350792111219, |
| "grad_norm": 1.05500066280365, |
| "learning_rate": 0.0009488225396630347, |
| "loss": 0.14362677931785583, |
| "step": 3584 |
| }, |
| { |
| "epoch": 0.17735901344048774, |
| "grad_norm": 0.7869608402252197, |
| "learning_rate": 0.0009401160421685646, |
| "loss": 0.14176346361637115, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.18918294766985358, |
| "grad_norm": 0.8075957298278809, |
| "learning_rate": 0.0009307725649463714, |
| "loss": 0.14215654134750366, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.18918294766985358, |
| "eval_bleu": 0.0007156187561984728, |
| "eval_loss": 0.1407095858291404, |
| "eval_mse_loss": 0.1407095858291404, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.18918294766985358, |
| "eval_bleu": 0.0007156187561984728, |
| "eval_loss": 0.1407095858291404, |
| "eval_mse_loss": 0.1407095858291404, |
| "eval_runtime": 172.648, |
| "eval_samples_per_second": 162.139, |
| "eval_steps_per_second": 2.537, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.20100688189921945, |
| "grad_norm": 1.2594881057739258, |
| "learning_rate": 0.0009208056308063659, |
| "loss": 0.1409526914358139, |
| "step": 4352 |
| }, |
| { |
| "epoch": 0.2128308161285853, |
| "grad_norm": 1.2300289869308472, |
| "learning_rate": 0.0009102296648873445, |
| "loss": 0.14084666967391968, |
| "step": 4608 |
| }, |
| { |
| "epoch": 0.22465475035795113, |
| "grad_norm": 1.4511398077011108, |
| "learning_rate": 0.0008990599737794927, |
| "loss": 0.13930904865264893, |
| "step": 4864 |
| }, |
| { |
| "epoch": 0.236478684587317, |
| "grad_norm": 1.1964119672775269, |
| "learning_rate": 0.0008873127233711644, |
| "loss": 0.13842026889324188, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.24830261881668284, |
| "grad_norm": 0.9578610062599182, |
| "learning_rate": 0.0008750049154520011, |
| "loss": 0.1392611265182495, |
| "step": 5376 |
| }, |
| { |
| "epoch": 0.2601265530460487, |
| "grad_norm": 0.9959284067153931, |
| "learning_rate": 0.0008621543631062487, |
| "loss": 0.1368764489889145, |
| "step": 5632 |
| }, |
| { |
| "epoch": 0.27195048727541454, |
| "grad_norm": 1.342193603515625, |
| "learning_rate": 0.0008487796649318904, |
| "loss": 0.13701793551445007, |
| "step": 5888 |
| }, |
| { |
| "epoch": 0.2837744215047804, |
| "grad_norm": 0.5825753211975098, |
| "learning_rate": 0.0008349001781229053, |
| "loss": 0.13572891056537628, |
| "step": 6144 |
| }, |
| { |
| "epoch": 0.2955983557341462, |
| "grad_norm": 1.275922417640686, |
| "learning_rate": 0.0008205359904536107, |
| "loss": 0.134858176112175, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.30742228996351206, |
| "grad_norm": 1.0904085636138916, |
| "learning_rate": 0.0008057078912056363, |
| "loss": 0.13740675151348114, |
| "step": 6656 |
| }, |
| { |
| "epoch": 0.3192462241928779, |
| "grad_norm": 0.7365452647209167, |
| "learning_rate": 0.0007904373410796086, |
| "loss": 0.13476549088954926, |
| "step": 6912 |
| }, |
| { |
| "epoch": 0.3310701584222438, |
| "grad_norm": 0.9742300510406494, |
| "learning_rate": 0.0007747464411350876, |
| "loss": 0.1339239925146103, |
| "step": 7168 |
| }, |
| { |
| "epoch": 0.34289409265160964, |
| "grad_norm": 1.1681714057922363, |
| "learning_rate": 0.000758657900803716, |
| "loss": 0.1327570527791977, |
| "step": 7424 |
| }, |
| { |
| "epoch": 0.3547180268809755, |
| "grad_norm": 1.0932402610778809, |
| "learning_rate": 0.000742195005021869, |
| "loss": 0.13453976809978485, |
| "step": 7680 |
| }, |
| { |
| "epoch": 0.3665419611103413, |
| "grad_norm": 1.7933037281036377, |
| "learning_rate": 0.0007253815805303786, |
| "loss": 0.1339324563741684, |
| "step": 7936 |
| }, |
| { |
| "epoch": 0.37836589533970716, |
| "grad_norm": 1.258001446723938, |
| "learning_rate": 0.0007082419613901028, |
| "loss": 0.13321880996227264, |
| "step": 8192 |
| }, |
| { |
| "epoch": 0.37836589533970716, |
| "eval_bleu": 0.0009159004592492724, |
| "eval_loss": 0.13241331889953242, |
| "eval_mse_loss": 0.13241331889953242, |
| "step": 8192 |
| }, |
| { |
| "epoch": 0.37836589533970716, |
| "eval_bleu": 0.0009159004592492724, |
| "eval_loss": 0.13241331889953242, |
| "eval_mse_loss": 0.13241331889953242, |
| "eval_runtime": 171.6296, |
| "eval_samples_per_second": 163.101, |
| "eval_steps_per_second": 2.552, |
| "step": 8192 |
| }, |
| { |
| "epoch": 0.390189829569073, |
| "grad_norm": 2.0500435829162598, |
| "learning_rate": 0.0006908009537632514, |
| "loss": 0.13275721669197083, |
| "step": 8448 |
| }, |
| { |
| "epoch": 0.4020137637984389, |
| "grad_norm": 1.0164549350738525, |
| "learning_rate": 0.0006730838000114403, |
| "loss": 0.13082970678806305, |
| "step": 8704 |
| }, |
| { |
| "epoch": 0.41383769802780473, |
| "grad_norm": 0.9163410663604736, |
| "learning_rate": 0.0006551161421624341, |
| "loss": 0.1323855221271515, |
| "step": 8960 |
| }, |
| { |
| "epoch": 0.4256616322571706, |
| "grad_norm": 0.8464987874031067, |
| "learning_rate": 0.0006369239847984517, |
| "loss": 0.12907665967941284, |
| "step": 9216 |
| }, |
| { |
| "epoch": 0.4374855664865364, |
| "grad_norm": 0.790055513381958, |
| "learning_rate": 0.0006185336574197479, |
| "loss": 0.13018614053726196, |
| "step": 9472 |
| }, |
| { |
| "epoch": 0.44930950071590225, |
| "grad_norm": 1.4321770668029785, |
| "learning_rate": 0.0005999717763379407, |
| "loss": 0.13014769554138184, |
| "step": 9728 |
| }, |
| { |
| "epoch": 0.4611334349452681, |
| "grad_norm": 1.1316198110580444, |
| "learning_rate": 0.0005812652061542363, |
| "loss": 0.13009950518608093, |
| "step": 9984 |
| }, |
| { |
| "epoch": 0.472957369174634, |
| "grad_norm": 2.499011516571045, |
| "learning_rate": 0.0005624410208783071, |
| "loss": 0.13046522438526154, |
| "step": 10240 |
| }, |
| { |
| "epoch": 0.48478130340399983, |
| "grad_norm": 1.3082486391067505, |
| "learning_rate": 0.0005435264647440881, |
| "loss": 0.12951838970184326, |
| "step": 10496 |
| }, |
| { |
| "epoch": 0.49660523763336567, |
| "grad_norm": 0.8309692144393921, |
| "learning_rate": 0.000524548912779213, |
| "loss": 0.1311841756105423, |
| "step": 10752 |
| }, |
| { |
| "epoch": 0.5084291718627315, |
| "grad_norm": 2.236577272415161, |
| "learning_rate": 0.0005055358311851499, |
| "loss": 0.1328016221523285, |
| "step": 11008 |
| }, |
| { |
| "epoch": 0.5202531060920974, |
| "grad_norm": 1.8365247249603271, |
| "learning_rate": 0.0004865147375853812, |
| "loss": 0.13411875069141388, |
| "step": 11264 |
| }, |
| { |
| "epoch": 0.5320770403214632, |
| "grad_norm": 1.6082954406738281, |
| "learning_rate": 0.0004675131611991607, |
| "loss": 0.13458885252475739, |
| "step": 11520 |
| }, |
| { |
| "epoch": 0.5439009745508291, |
| "grad_norm": 1.031647801399231, |
| "learning_rate": 0.0004485586029984899, |
| "loss": 0.13649903237819672, |
| "step": 11776 |
| }, |
| { |
| "epoch": 0.5557249087801949, |
| "grad_norm": 1.5296710729599, |
| "learning_rate": 0.00042967849590597266, |
| "loss": 0.1377032846212387, |
| "step": 12032 |
| }, |
| { |
| "epoch": 0.5675488430095608, |
| "grad_norm": 1.8958959579467773, |
| "learning_rate": 0.0004109001650911621, |
| "loss": 0.13875064253807068, |
| "step": 12288 |
| }, |
| { |
| "epoch": 0.5675488430095608, |
| "eval_bleu": 0.0008588915725993688, |
| "eval_loss": 0.13989224594477648, |
| "eval_mse_loss": 0.13989224594477648, |
| "step": 12288 |
| }, |
| { |
| "epoch": 0.5675488430095608, |
| "eval_bleu": 0.0008588915725993688, |
| "eval_loss": 0.13989224594477648, |
| "eval_mse_loss": 0.13989224594477648, |
| "eval_runtime": 173.5527, |
| "eval_samples_per_second": 161.294, |
| "eval_steps_per_second": 2.524, |
| "step": 12288 |
| }, |
| { |
| "epoch": 0.5793727772389267, |
| "grad_norm": 2.3443777561187744, |
| "learning_rate": 0.0003922507884228551, |
| "loss": 0.1418546885251999, |
| "step": 12544 |
| }, |
| { |
| "epoch": 0.5911967114682924, |
| "grad_norm": 1.1060763597488403, |
| "learning_rate": 0.00037375735713457723, |
| "loss": 0.1433623880147934, |
| "step": 12800 |
| }, |
| { |
| "epoch": 0.6030206456976583, |
| "grad_norm": 1.806127905845642, |
| "learning_rate": 0.00035544663676018276, |
| "loss": 0.14775635302066803, |
| "step": 13056 |
| }, |
| { |
| "epoch": 0.6148445799270241, |
| "grad_norm": 1.7125252485275269, |
| "learning_rate": 0.00033734512839611255, |
| "loss": 0.14960132539272308, |
| "step": 13312 |
| }, |
| { |
| "epoch": 0.62666851415639, |
| "grad_norm": 2.017378091812134, |
| "learning_rate": 0.0003194790303463687, |
| "loss": 0.15272539854049683, |
| "step": 13568 |
| }, |
| { |
| "epoch": 0.6384924483857558, |
| "grad_norm": 2.1264116764068604, |
| "learning_rate": 0.00030187420020572406, |
| "loss": 0.15218184888362885, |
| "step": 13824 |
| }, |
| { |
| "epoch": 0.6503163826151217, |
| "grad_norm": 1.8446365594863892, |
| "learning_rate": 0.00028455611743603626, |
| "loss": 0.1529884785413742, |
| "step": 14080 |
| }, |
| { |
| "epoch": 0.6621403168444876, |
| "grad_norm": 2.2964935302734375, |
| "learning_rate": 0.0002675498464898373, |
| "loss": 0.15051132440567017, |
| "step": 14336 |
| }, |
| { |
| "epoch": 0.6739642510738534, |
| "grad_norm": 2.4945242404937744, |
| "learning_rate": 0.0002508800005345623, |
| "loss": 0.14853885769844055, |
| "step": 14592 |
| }, |
| { |
| "epoch": 0.6857881853032193, |
| "grad_norm": 1.3437820672988892, |
| "learning_rate": 0.00023457070582992562, |
| "loss": 0.14716462790966034, |
| "step": 14848 |
| }, |
| { |
| "epoch": 0.6976121195325851, |
| "grad_norm": 1.631665587425232, |
| "learning_rate": 0.00021864556680999692, |
| "loss": 0.14523349702358246, |
| "step": 15104 |
| }, |
| { |
| "epoch": 0.709436053761951, |
| "grad_norm": 1.9889062643051147, |
| "learning_rate": 0.0002031276319205152, |
| "loss": 0.14392541348934174, |
| "step": 15360 |
| }, |
| { |
| "epoch": 0.7212599879913169, |
| "grad_norm": 2.4409310817718506, |
| "learning_rate": 0.00018803936026088542, |
| "loss": 0.14354340732097626, |
| "step": 15616 |
| }, |
| { |
| "epoch": 0.7330839222206826, |
| "grad_norm": 2.1943178176879883, |
| "learning_rate": 0.00017340258907913464, |
| "loss": 0.14236165583133698, |
| "step": 15872 |
| }, |
| { |
| "epoch": 0.7449078564500485, |
| "grad_norm": 7.1248555183410645, |
| "learning_rate": 0.0001592385021668743, |
| "loss": 0.14313820004463196, |
| "step": 16128 |
| }, |
| { |
| "epoch": 0.7567317906794143, |
| "grad_norm": 1.3160756826400757, |
| "learning_rate": 0.0001455675992000087, |
| "loss": 0.1404125690460205, |
| "step": 16384 |
| }, |
| { |
| "epoch": 0.7567317906794143, |
| "eval_bleu": 0.000725950642943874, |
| "eval_loss": 0.1416369776101145, |
| "eval_mse_loss": 0.1416369776101145, |
| "step": 16384 |
| }, |
| { |
| "epoch": 0.7567317906794143, |
| "eval_bleu": 0.000725950642943874, |
| "eval_loss": 0.1416369776101145, |
| "eval_mse_loss": 0.1416369776101145, |
| "eval_runtime": 172.4801, |
| "eval_samples_per_second": 162.297, |
| "eval_steps_per_second": 2.539, |
| "step": 16384 |
| }, |
| { |
| "epoch": 0.7685557249087802, |
| "grad_norm": 2.9946746826171875, |
| "learning_rate": 0.000132409666069565, |
| "loss": 0.13992173969745636, |
| "step": 16640 |
| }, |
| { |
| "epoch": 0.780379659138146, |
| "grad_norm": 1.9437847137451172, |
| "learning_rate": 0.0001197837462455823, |
| "loss": 0.1393483430147171, |
| "step": 16896 |
| }, |
| { |
| "epoch": 0.7922035933675119, |
| "grad_norm": 1.5408047437667847, |
| "learning_rate": 0.00010770811321550749, |
| "loss": 0.139127716422081, |
| "step": 17152 |
| }, |
| { |
| "epoch": 0.8040275275968778, |
| "grad_norm": 4.208256721496582, |
| "learning_rate": 9.620024403698591e-05, |
| "loss": 0.1388680338859558, |
| "step": 17408 |
| }, |
| { |
| "epoch": 0.8158514618262436, |
| "grad_norm": 1.5926990509033203, |
| "learning_rate": 8.527679404332429e-05, |
| "loss": 0.13790729641914368, |
| "step": 17664 |
| }, |
| { |
| "epoch": 0.8276753960556095, |
| "grad_norm": 1.5863239765167236, |
| "learning_rate": 7.495357273823544e-05, |
| "loss": 0.13762786984443665, |
| "step": 17920 |
| }, |
| { |
| "epoch": 0.8394993302849753, |
| "grad_norm": 0.7356535792350769, |
| "learning_rate": 6.524552091475183e-05, |
| "loss": 0.13656531274318695, |
| "step": 18176 |
| }, |
| { |
| "epoch": 0.8513232645143411, |
| "grad_norm": 1.0298752784729004, |
| "learning_rate": 5.6166689031422024e-05, |
| "loss": 0.13603614270687103, |
| "step": 18432 |
| }, |
| { |
| "epoch": 0.8631471987437069, |
| "grad_norm": 0.8704520463943481, |
| "learning_rate": 4.773021687709067e-05, |
| "loss": 0.13718988001346588, |
| "step": 18688 |
| }, |
| { |
| "epoch": 0.8749711329730728, |
| "grad_norm": 1.7993870973587036, |
| "learning_rate": 3.994831455368719e-05, |
| "loss": 0.13554991781711578, |
| "step": 18944 |
| }, |
| { |
| "epoch": 0.8867950672024387, |
| "grad_norm": 1.9661895036697388, |
| "learning_rate": 3.283224480455282e-05, |
| "loss": 0.13669779896736145, |
| "step": 19200 |
| }, |
| { |
| "epoch": 0.8986190014318045, |
| "grad_norm": 2.396397113800049, |
| "learning_rate": 2.639230671387627e-05, |
| "loss": 0.13444113731384277, |
| "step": 19456 |
| }, |
| { |
| "epoch": 0.9104429356611704, |
| "grad_norm": 5.158544063568115, |
| "learning_rate": 2.063782080083576e-05, |
| "loss": 0.13486656546592712, |
| "step": 19712 |
| }, |
| { |
| "epoch": 0.9222668698905362, |
| "grad_norm": 2.635748863220215, |
| "learning_rate": 1.557711553001523e-05, |
| "loss": 0.13480573892593384, |
| "step": 19968 |
| }, |
| { |
| "epoch": 0.9340908041199021, |
| "grad_norm": 1.383921504020691, |
| "learning_rate": 1.1217515257622269e-05, |
| "loss": 0.13407942652702332, |
| "step": 20224 |
| }, |
| { |
| "epoch": 0.945914738349268, |
| "grad_norm": 1.3553746938705444, |
| "learning_rate": 7.565329630950746e-06, |
| "loss": 0.13326187431812286, |
| "step": 20480 |
| }, |
| { |
| "epoch": 0.945914738349268, |
| "eval_bleu": 0.0007028356585600878, |
| "eval_loss": 0.13298223137039028, |
| "eval_mse_loss": 0.13298223137039028, |
| "step": 20480 |
| }, |
| { |
| "epoch": 0.945914738349268, |
| "eval_bleu": 0.0007028356585600878, |
| "eval_loss": 0.13298223137039028, |
| "eval_mse_loss": 0.13298223137039028, |
| "eval_runtime": 172.3836, |
| "eval_samples_per_second": 162.388, |
| "eval_steps_per_second": 2.541, |
| "step": 20480 |
| } |
| ], |
| "logging_steps": 256, |
| "max_steps": 21651, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 4096, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 64, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|