| { |
| "best_global_step": 18000, |
| "best_metric": 3.9484923051611727, |
| "best_model_checkpoint": "outputs/bert-tiny-stage2-sbert/checkpoints/checkpoint-18000", |
| "epoch": 5.0, |
| "eval_steps": 2000, |
| "global_step": 21140, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.011825922421948912, |
| "grad_norm": 37.789451599121094, |
| "learning_rate": 2.3173327027666118e-07, |
| "loss": 18.0314, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.023651844843897825, |
| "grad_norm": 43.297508239746094, |
| "learning_rate": 4.6819579096713174e-07, |
| "loss": 17.7147, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.035477767265846734, |
| "grad_norm": 36.56124496459961, |
| "learning_rate": 7.046583116576024e-07, |
| "loss": 17.0007, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.04730368968779565, |
| "grad_norm": 34.428916931152344, |
| "learning_rate": 9.411208323480729e-07, |
| "loss": 16.0667, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.05912961210974456, |
| "grad_norm": 33.16645812988281, |
| "learning_rate": 1.1775833530385434e-06, |
| "loss": 14.9131, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.07095553453169347, |
| "grad_norm": 29.064250946044922, |
| "learning_rate": 1.4140458737290142e-06, |
| "loss": 13.9449, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.08278145695364239, |
| "grad_norm": 31.42921257019043, |
| "learning_rate": 1.6505083944194847e-06, |
| "loss": 12.7957, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.0946073793755913, |
| "grad_norm": 33.341365814208984, |
| "learning_rate": 1.8869709151099552e-06, |
| "loss": 11.7288, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.10643330179754021, |
| "grad_norm": 36.34325408935547, |
| "learning_rate": 2.123433435800426e-06, |
| "loss": 10.6945, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.11825922421948912, |
| "grad_norm": 39.06941604614258, |
| "learning_rate": 2.3598959564908965e-06, |
| "loss": 9.4743, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.13008514664143803, |
| "grad_norm": 32.969947814941406, |
| "learning_rate": 2.596358477181367e-06, |
| "loss": 8.6215, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.14191106906338694, |
| "grad_norm": 33.9212760925293, |
| "learning_rate": 2.8328209978718375e-06, |
| "loss": 7.7279, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.15373699148533584, |
| "grad_norm": 32.65876007080078, |
| "learning_rate": 3.069283518562308e-06, |
| "loss": 7.1892, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.16556291390728478, |
| "grad_norm": 29.210859298706055, |
| "learning_rate": 3.3057460392527786e-06, |
| "loss": 6.9682, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.1773888363292337, |
| "grad_norm": 29.231189727783203, |
| "learning_rate": 3.5422085599432495e-06, |
| "loss": 6.4781, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.1892147587511826, |
| "grad_norm": 28.949066162109375, |
| "learning_rate": 3.77867108063372e-06, |
| "loss": 6.1271, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.2010406811731315, |
| "grad_norm": 29.826133728027344, |
| "learning_rate": 4.01513360132419e-06, |
| "loss": 6.1199, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.21286660359508042, |
| "grad_norm": 27.585041046142578, |
| "learning_rate": 4.2515961220146615e-06, |
| "loss": 5.9544, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.22469252601702933, |
| "grad_norm": 28.10279655456543, |
| "learning_rate": 4.488058642705131e-06, |
| "loss": 5.8145, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.23651844843897823, |
| "grad_norm": 26.567943572998047, |
| "learning_rate": 4.7245211633956025e-06, |
| "loss": 5.5599, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.24834437086092714, |
| "grad_norm": 24.42616081237793, |
| "learning_rate": 4.960983684086072e-06, |
| "loss": 5.2344, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.26017029328287605, |
| "grad_norm": 25.857810974121094, |
| "learning_rate": 5.197446204776543e-06, |
| "loss": 5.3013, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.27199621570482496, |
| "grad_norm": 26.047733306884766, |
| "learning_rate": 5.433908725467014e-06, |
| "loss": 5.0562, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.28382213812677387, |
| "grad_norm": 26.875659942626953, |
| "learning_rate": 5.670371246157485e-06, |
| "loss": 4.8728, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.2956480605487228, |
| "grad_norm": 21.9539737701416, |
| "learning_rate": 5.906833766847954e-06, |
| "loss": 4.7826, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.3074739829706717, |
| "grad_norm": 23.06488609313965, |
| "learning_rate": 6.143296287538426e-06, |
| "loss": 4.8806, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.3192999053926206, |
| "grad_norm": 24.24974250793457, |
| "learning_rate": 6.379758808228896e-06, |
| "loss": 4.6464, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.33112582781456956, |
| "grad_norm": 22.658571243286133, |
| "learning_rate": 6.616221328919367e-06, |
| "loss": 4.7046, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.34295175023651847, |
| "grad_norm": 21.927656173706055, |
| "learning_rate": 6.852683849609837e-06, |
| "loss": 4.5188, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.3547776726584674, |
| "grad_norm": 24.39653778076172, |
| "learning_rate": 7.089146370300309e-06, |
| "loss": 4.4968, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.3666035950804163, |
| "grad_norm": 23.591333389282227, |
| "learning_rate": 7.325608890990778e-06, |
| "loss": 4.4387, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.3784295175023652, |
| "grad_norm": 24.572961807250977, |
| "learning_rate": 7.562071411681249e-06, |
| "loss": 4.1702, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.3902554399243141, |
| "grad_norm": 22.61821174621582, |
| "learning_rate": 7.79853393237172e-06, |
| "loss": 4.2147, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.402081362346263, |
| "grad_norm": 22.490327835083008, |
| "learning_rate": 8.03499645306219e-06, |
| "loss": 3.9972, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.4139072847682119, |
| "grad_norm": 23.695873260498047, |
| "learning_rate": 8.271458973752661e-06, |
| "loss": 4.1279, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.42573320719016083, |
| "grad_norm": 24.085838317871094, |
| "learning_rate": 8.507921494443131e-06, |
| "loss": 4.0214, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.43755912961210974, |
| "grad_norm": 20.78253173828125, |
| "learning_rate": 8.744384015133602e-06, |
| "loss": 3.9161, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.44938505203405865, |
| "grad_norm": 19.800090789794922, |
| "learning_rate": 8.980846535824072e-06, |
| "loss": 3.7544, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.46121097445600756, |
| "grad_norm": 22.900514602661133, |
| "learning_rate": 9.217309056514543e-06, |
| "loss": 3.8246, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.47303689687795647, |
| "grad_norm": 22.419363021850586, |
| "learning_rate": 9.453771577205015e-06, |
| "loss": 3.7991, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.47303689687795647, |
| "eval_runtime": 46.7005, |
| "eval_samples_per_second": 0.0, |
| "eval_steps_per_second": 0.0, |
| "eval_validation_loss": 5.98806651504585, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.4848628192999054, |
| "grad_norm": 22.308876037597656, |
| "learning_rate": 9.690234097895484e-06, |
| "loss": 3.8554, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.4966887417218543, |
| "grad_norm": 23.8614501953125, |
| "learning_rate": 9.926696618585954e-06, |
| "loss": 3.8123, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.5085146641438032, |
| "grad_norm": 21.00491714477539, |
| "learning_rate": 1.0163159139276425e-05, |
| "loss": 3.5525, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.5203405865657521, |
| "grad_norm": 25.555097579956055, |
| "learning_rate": 1.0399621659966897e-05, |
| "loss": 3.5591, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.532166508987701, |
| "grad_norm": 25.4840087890625, |
| "learning_rate": 1.0636084180657367e-05, |
| "loss": 3.6293, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.5439924314096499, |
| "grad_norm": 21.117971420288086, |
| "learning_rate": 1.0872546701347836e-05, |
| "loss": 3.5831, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.5558183538315988, |
| "grad_norm": 23.38995361328125, |
| "learning_rate": 1.1109009222038308e-05, |
| "loss": 3.6007, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.5676442762535477, |
| "grad_norm": 22.385738372802734, |
| "learning_rate": 1.1345471742728777e-05, |
| "loss": 3.4225, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.5794701986754967, |
| "grad_norm": 21.53306007385254, |
| "learning_rate": 1.158193426341925e-05, |
| "loss": 3.4405, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.5912961210974456, |
| "grad_norm": 22.93678092956543, |
| "learning_rate": 1.181839678410972e-05, |
| "loss": 3.4002, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.6031220435193945, |
| "grad_norm": 20.330045700073242, |
| "learning_rate": 1.2054859304800191e-05, |
| "loss": 3.3653, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.6149479659413434, |
| "grad_norm": 21.98198699951172, |
| "learning_rate": 1.2291321825490661e-05, |
| "loss": 3.321, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.6267738883632923, |
| "grad_norm": 18.49015998840332, |
| "learning_rate": 1.252778434618113e-05, |
| "loss": 3.3042, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.6385998107852412, |
| "grad_norm": 22.69803237915039, |
| "learning_rate": 1.2764246866871602e-05, |
| "loss": 3.2117, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.6504257332071902, |
| "grad_norm": 19.658132553100586, |
| "learning_rate": 1.3000709387562072e-05, |
| "loss": 3.3423, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.6622516556291391, |
| "grad_norm": 20.783931732177734, |
| "learning_rate": 1.3237171908252545e-05, |
| "loss": 3.2494, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.674077578051088, |
| "grad_norm": 17.039609909057617, |
| "learning_rate": 1.3473634428943014e-05, |
| "loss": 3.1364, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.6859035004730369, |
| "grad_norm": 21.787738800048828, |
| "learning_rate": 1.3710096949633484e-05, |
| "loss": 3.1836, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.6977294228949859, |
| "grad_norm": 20.883773803710938, |
| "learning_rate": 1.3946559470323956e-05, |
| "loss": 3.1268, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.7095553453169348, |
| "grad_norm": 17.700597763061523, |
| "learning_rate": 1.4183021991014425e-05, |
| "loss": 3.072, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.7213812677388837, |
| "grad_norm": 20.23262596130371, |
| "learning_rate": 1.4419484511704895e-05, |
| "loss": 3.0135, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.7332071901608326, |
| "grad_norm": 19.417842864990234, |
| "learning_rate": 1.4655947032395366e-05, |
| "loss": 3.0607, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.7450331125827815, |
| "grad_norm": 19.843341827392578, |
| "learning_rate": 1.4892409553085838e-05, |
| "loss": 3.0963, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.7568590350047304, |
| "grad_norm": 20.248523712158203, |
| "learning_rate": 1.5128872073776309e-05, |
| "loss": 3.0419, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.7686849574266793, |
| "grad_norm": 24.61260986328125, |
| "learning_rate": 1.5365334594466777e-05, |
| "loss": 2.9891, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.7805108798486282, |
| "grad_norm": 16.637826919555664, |
| "learning_rate": 1.560179711515725e-05, |
| "loss": 2.9384, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.7923368022705771, |
| "grad_norm": 24.341026306152344, |
| "learning_rate": 1.583825963584772e-05, |
| "loss": 2.8918, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.804162724692526, |
| "grad_norm": 18.246440887451172, |
| "learning_rate": 1.607472215653819e-05, |
| "loss": 2.9816, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.8159886471144749, |
| "grad_norm": 19.296022415161133, |
| "learning_rate": 1.631118467722866e-05, |
| "loss": 2.9664, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.8278145695364238, |
| "grad_norm": 19.331918716430664, |
| "learning_rate": 1.6547647197919134e-05, |
| "loss": 2.8969, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.8396404919583728, |
| "grad_norm": 25.586254119873047, |
| "learning_rate": 1.6784109718609602e-05, |
| "loss": 2.9368, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.8514664143803217, |
| "grad_norm": 19.701223373413086, |
| "learning_rate": 1.7020572239300073e-05, |
| "loss": 2.8513, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.8632923368022706, |
| "grad_norm": 16.68182945251465, |
| "learning_rate": 1.7257034759990545e-05, |
| "loss": 2.9808, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.8751182592242195, |
| "grad_norm": 19.592416763305664, |
| "learning_rate": 1.7493497280681013e-05, |
| "loss": 2.8428, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.8869441816461684, |
| "grad_norm": 20.324504852294922, |
| "learning_rate": 1.7729959801371484e-05, |
| "loss": 2.8775, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.8987701040681173, |
| "grad_norm": 19.49851417541504, |
| "learning_rate": 1.7966422322061955e-05, |
| "loss": 2.739, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.9105960264900662, |
| "grad_norm": 19.18546485900879, |
| "learning_rate": 1.8202884842752427e-05, |
| "loss": 2.8277, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.9224219489120151, |
| "grad_norm": 23.6113338470459, |
| "learning_rate": 1.8439347363442898e-05, |
| "loss": 2.767, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.934247871333964, |
| "grad_norm": 19.779712677001953, |
| "learning_rate": 1.8675809884133366e-05, |
| "loss": 2.794, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.9460737937559129, |
| "grad_norm": 23.361425399780273, |
| "learning_rate": 1.8912272404823837e-05, |
| "loss": 2.7738, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.9460737937559129, |
| "eval_runtime": 47.0317, |
| "eval_samples_per_second": 0.0, |
| "eval_steps_per_second": 0.0, |
| "eval_validation_loss": 4.773771009103065, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.9578997161778618, |
| "grad_norm": 18.137535095214844, |
| "learning_rate": 1.914873492551431e-05, |
| "loss": 2.8568, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.9697256385998108, |
| "grad_norm": 18.014116287231445, |
| "learning_rate": 1.9385197446204777e-05, |
| "loss": 2.7938, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.9815515610217597, |
| "grad_norm": 17.168569564819336, |
| "learning_rate": 1.9621659966895248e-05, |
| "loss": 2.7272, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.9933774834437086, |
| "grad_norm": 17.75269889831543, |
| "learning_rate": 1.985812248758572e-05, |
| "loss": 2.7079, |
| "step": 4200 |
| }, |
| { |
| "epoch": 1.0052034058656576, |
| "grad_norm": 19.342844009399414, |
| "learning_rate": 1.9976346756548995e-05, |
| "loss": 2.6383, |
| "step": 4250 |
| }, |
| { |
| "epoch": 1.0170293282876064, |
| "grad_norm": 17.54117774963379, |
| "learning_rate": 1.9917213647921473e-05, |
| "loss": 2.6855, |
| "step": 4300 |
| }, |
| { |
| "epoch": 1.0288552507095554, |
| "grad_norm": 18.412206649780273, |
| "learning_rate": 1.9858080539293952e-05, |
| "loss": 2.6568, |
| "step": 4350 |
| }, |
| { |
| "epoch": 1.0406811731315042, |
| "grad_norm": 18.794939041137695, |
| "learning_rate": 1.979894743066643e-05, |
| "loss": 2.5981, |
| "step": 4400 |
| }, |
| { |
| "epoch": 1.0525070955534532, |
| "grad_norm": 17.26803970336914, |
| "learning_rate": 1.973981432203891e-05, |
| "loss": 2.6987, |
| "step": 4450 |
| }, |
| { |
| "epoch": 1.064333017975402, |
| "grad_norm": 15.831737518310547, |
| "learning_rate": 1.968068121341139e-05, |
| "loss": 2.6992, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.076158940397351, |
| "grad_norm": 16.746700286865234, |
| "learning_rate": 1.962154810478387e-05, |
| "loss": 2.5434, |
| "step": 4550 |
| }, |
| { |
| "epoch": 1.0879848628192998, |
| "grad_norm": 18.824857711791992, |
| "learning_rate": 1.956241499615635e-05, |
| "loss": 2.5553, |
| "step": 4600 |
| }, |
| { |
| "epoch": 1.0998107852412489, |
| "grad_norm": 16.81246566772461, |
| "learning_rate": 1.9503281887528828e-05, |
| "loss": 2.4978, |
| "step": 4650 |
| }, |
| { |
| "epoch": 1.1116367076631977, |
| "grad_norm": 18.369991302490234, |
| "learning_rate": 1.9444148778901307e-05, |
| "loss": 2.5679, |
| "step": 4700 |
| }, |
| { |
| "epoch": 1.1234626300851467, |
| "grad_norm": 19.55158805847168, |
| "learning_rate": 1.938501567027379e-05, |
| "loss": 2.4768, |
| "step": 4750 |
| }, |
| { |
| "epoch": 1.1352885525070955, |
| "grad_norm": 20.673002243041992, |
| "learning_rate": 1.9325882561646268e-05, |
| "loss": 2.5578, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.1471144749290445, |
| "grad_norm": 17.067432403564453, |
| "learning_rate": 1.9266749453018747e-05, |
| "loss": 2.4758, |
| "step": 4850 |
| }, |
| { |
| "epoch": 1.1589403973509933, |
| "grad_norm": 22.328304290771484, |
| "learning_rate": 1.9207616344391226e-05, |
| "loss": 2.5352, |
| "step": 4900 |
| }, |
| { |
| "epoch": 1.1707663197729423, |
| "grad_norm": 15.121694564819336, |
| "learning_rate": 1.9148483235763708e-05, |
| "loss": 2.5023, |
| "step": 4950 |
| }, |
| { |
| "epoch": 1.1825922421948911, |
| "grad_norm": 15.201376914978027, |
| "learning_rate": 1.9089350127136187e-05, |
| "loss": 2.4713, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.1944181646168401, |
| "grad_norm": 20.54207992553711, |
| "learning_rate": 1.9030217018508665e-05, |
| "loss": 2.486, |
| "step": 5050 |
| }, |
| { |
| "epoch": 1.206244087038789, |
| "grad_norm": 16.934635162353516, |
| "learning_rate": 1.8971083909881144e-05, |
| "loss": 2.483, |
| "step": 5100 |
| }, |
| { |
| "epoch": 1.218070009460738, |
| "grad_norm": 16.963790893554688, |
| "learning_rate": 1.8911950801253623e-05, |
| "loss": 2.4098, |
| "step": 5150 |
| }, |
| { |
| "epoch": 1.2298959318826868, |
| "grad_norm": 16.505352020263672, |
| "learning_rate": 1.8852817692626102e-05, |
| "loss": 2.5061, |
| "step": 5200 |
| }, |
| { |
| "epoch": 1.2417218543046358, |
| "grad_norm": 16.634069442749023, |
| "learning_rate": 1.879368458399858e-05, |
| "loss": 2.4597, |
| "step": 5250 |
| }, |
| { |
| "epoch": 1.2535477767265846, |
| "grad_norm": 16.373046875, |
| "learning_rate": 1.8734551475371063e-05, |
| "loss": 2.4591, |
| "step": 5300 |
| }, |
| { |
| "epoch": 1.2653736991485336, |
| "grad_norm": 21.308876037597656, |
| "learning_rate": 1.867541836674354e-05, |
| "loss": 2.3879, |
| "step": 5350 |
| }, |
| { |
| "epoch": 1.2771996215704826, |
| "grad_norm": 20.565275192260742, |
| "learning_rate": 1.861628525811602e-05, |
| "loss": 2.4146, |
| "step": 5400 |
| }, |
| { |
| "epoch": 1.2890255439924314, |
| "grad_norm": 15.853353500366211, |
| "learning_rate": 1.85571521494885e-05, |
| "loss": 2.3418, |
| "step": 5450 |
| }, |
| { |
| "epoch": 1.3008514664143802, |
| "grad_norm": 13.12362003326416, |
| "learning_rate": 1.8498019040860978e-05, |
| "loss": 2.4307, |
| "step": 5500 |
| }, |
| { |
| "epoch": 1.3126773888363292, |
| "grad_norm": 19.059667587280273, |
| "learning_rate": 1.843888593223346e-05, |
| "loss": 2.3653, |
| "step": 5550 |
| }, |
| { |
| "epoch": 1.3245033112582782, |
| "grad_norm": 17.448827743530273, |
| "learning_rate": 1.837975282360594e-05, |
| "loss": 2.3995, |
| "step": 5600 |
| }, |
| { |
| "epoch": 1.336329233680227, |
| "grad_norm": 18.326887130737305, |
| "learning_rate": 1.8320619714978418e-05, |
| "loss": 2.4527, |
| "step": 5650 |
| }, |
| { |
| "epoch": 1.3481551561021758, |
| "grad_norm": 18.03122901916504, |
| "learning_rate": 1.8261486606350896e-05, |
| "loss": 2.4547, |
| "step": 5700 |
| }, |
| { |
| "epoch": 1.3599810785241249, |
| "grad_norm": 18.269872665405273, |
| "learning_rate": 1.820235349772338e-05, |
| "loss": 2.3695, |
| "step": 5750 |
| }, |
| { |
| "epoch": 1.3718070009460739, |
| "grad_norm": 16.90838623046875, |
| "learning_rate": 1.8143220389095857e-05, |
| "loss": 2.3341, |
| "step": 5800 |
| }, |
| { |
| "epoch": 1.3836329233680227, |
| "grad_norm": 18.816362380981445, |
| "learning_rate": 1.8084087280468336e-05, |
| "loss": 2.2412, |
| "step": 5850 |
| }, |
| { |
| "epoch": 1.3954588457899715, |
| "grad_norm": 17.30527687072754, |
| "learning_rate": 1.8024954171840815e-05, |
| "loss": 2.2695, |
| "step": 5900 |
| }, |
| { |
| "epoch": 1.4072847682119205, |
| "grad_norm": 18.299711227416992, |
| "learning_rate": 1.7965821063213297e-05, |
| "loss": 2.2922, |
| "step": 5950 |
| }, |
| { |
| "epoch": 1.4191106906338695, |
| "grad_norm": 18.047449111938477, |
| "learning_rate": 1.7906687954585773e-05, |
| "loss": 2.3176, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.4191106906338695, |
| "eval_runtime": 46.9839, |
| "eval_samples_per_second": 0.0, |
| "eval_steps_per_second": 0.0, |
| "eval_validation_loss": 4.305679076455633, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.4309366130558183, |
| "grad_norm": 20.608333587646484, |
| "learning_rate": 1.784755484595825e-05, |
| "loss": 2.281, |
| "step": 6050 |
| }, |
| { |
| "epoch": 1.4427625354777673, |
| "grad_norm": 16.299001693725586, |
| "learning_rate": 1.7788421737330734e-05, |
| "loss": 2.2155, |
| "step": 6100 |
| }, |
| { |
| "epoch": 1.4545884578997161, |
| "grad_norm": 17.70014762878418, |
| "learning_rate": 1.7729288628703212e-05, |
| "loss": 2.1908, |
| "step": 6150 |
| }, |
| { |
| "epoch": 1.4664143803216652, |
| "grad_norm": 13.944992065429688, |
| "learning_rate": 1.767015552007569e-05, |
| "loss": 2.2071, |
| "step": 6200 |
| }, |
| { |
| "epoch": 1.478240302743614, |
| "grad_norm": 18.37308692932129, |
| "learning_rate": 1.761102241144817e-05, |
| "loss": 2.2617, |
| "step": 6250 |
| }, |
| { |
| "epoch": 1.490066225165563, |
| "grad_norm": 16.624183654785156, |
| "learning_rate": 1.7551889302820652e-05, |
| "loss": 2.2864, |
| "step": 6300 |
| }, |
| { |
| "epoch": 1.5018921475875118, |
| "grad_norm": 15.490421295166016, |
| "learning_rate": 1.749275619419313e-05, |
| "loss": 2.2509, |
| "step": 6350 |
| }, |
| { |
| "epoch": 1.5137180700094608, |
| "grad_norm": 15.517704010009766, |
| "learning_rate": 1.743362308556561e-05, |
| "loss": 2.1227, |
| "step": 6400 |
| }, |
| { |
| "epoch": 1.5255439924314098, |
| "grad_norm": 14.78442096710205, |
| "learning_rate": 1.737448997693809e-05, |
| "loss": 2.1919, |
| "step": 6450 |
| }, |
| { |
| "epoch": 1.5373699148533586, |
| "grad_norm": 19.766271591186523, |
| "learning_rate": 1.7315356868310567e-05, |
| "loss": 2.2072, |
| "step": 6500 |
| }, |
| { |
| "epoch": 1.5491958372753074, |
| "grad_norm": 17.84695053100586, |
| "learning_rate": 1.725622375968305e-05, |
| "loss": 2.1652, |
| "step": 6550 |
| }, |
| { |
| "epoch": 1.5610217596972564, |
| "grad_norm": 17.325145721435547, |
| "learning_rate": 1.7197090651055528e-05, |
| "loss": 2.2224, |
| "step": 6600 |
| }, |
| { |
| "epoch": 1.5728476821192054, |
| "grad_norm": 19.243274688720703, |
| "learning_rate": 1.7137957542428007e-05, |
| "loss": 2.0715, |
| "step": 6650 |
| }, |
| { |
| "epoch": 1.5846736045411542, |
| "grad_norm": 17.589859008789062, |
| "learning_rate": 1.7078824433800486e-05, |
| "loss": 2.1693, |
| "step": 6700 |
| }, |
| { |
| "epoch": 1.596499526963103, |
| "grad_norm": 14.71687126159668, |
| "learning_rate": 1.7019691325172968e-05, |
| "loss": 2.1141, |
| "step": 6750 |
| }, |
| { |
| "epoch": 1.608325449385052, |
| "grad_norm": 14.723918914794922, |
| "learning_rate": 1.6960558216545443e-05, |
| "loss": 2.1129, |
| "step": 6800 |
| }, |
| { |
| "epoch": 1.620151371807001, |
| "grad_norm": 16.5570011138916, |
| "learning_rate": 1.6901425107917922e-05, |
| "loss": 2.1001, |
| "step": 6850 |
| }, |
| { |
| "epoch": 1.6319772942289499, |
| "grad_norm": 17.945083618164062, |
| "learning_rate": 1.6842291999290404e-05, |
| "loss": 2.094, |
| "step": 6900 |
| }, |
| { |
| "epoch": 1.6438032166508987, |
| "grad_norm": 18.704225540161133, |
| "learning_rate": 1.6783158890662883e-05, |
| "loss": 2.2176, |
| "step": 6950 |
| }, |
| { |
| "epoch": 1.6556291390728477, |
| "grad_norm": 15.701910018920898, |
| "learning_rate": 1.6724025782035362e-05, |
| "loss": 2.109, |
| "step": 7000 |
| }, |
| { |
| "epoch": 1.6674550614947967, |
| "grad_norm": 16.768260955810547, |
| "learning_rate": 1.666489267340784e-05, |
| "loss": 2.0537, |
| "step": 7050 |
| }, |
| { |
| "epoch": 1.6792809839167455, |
| "grad_norm": 17.835603713989258, |
| "learning_rate": 1.6605759564780323e-05, |
| "loss": 2.0328, |
| "step": 7100 |
| }, |
| { |
| "epoch": 1.6911069063386943, |
| "grad_norm": 18.1043701171875, |
| "learning_rate": 1.6546626456152802e-05, |
| "loss": 2.1541, |
| "step": 7150 |
| }, |
| { |
| "epoch": 1.7029328287606433, |
| "grad_norm": 14.032896995544434, |
| "learning_rate": 1.648749334752528e-05, |
| "loss": 2.0164, |
| "step": 7200 |
| }, |
| { |
| "epoch": 1.7147587511825924, |
| "grad_norm": 15.934415817260742, |
| "learning_rate": 1.642836023889776e-05, |
| "loss": 2.0225, |
| "step": 7250 |
| }, |
| { |
| "epoch": 1.7265846736045412, |
| "grad_norm": 15.602225303649902, |
| "learning_rate": 1.636922713027024e-05, |
| "loss": 2.0243, |
| "step": 7300 |
| }, |
| { |
| "epoch": 1.73841059602649, |
| "grad_norm": 15.584887504577637, |
| "learning_rate": 1.631009402164272e-05, |
| "loss": 2.0152, |
| "step": 7350 |
| }, |
| { |
| "epoch": 1.750236518448439, |
| "grad_norm": 17.52799415588379, |
| "learning_rate": 1.62509609130152e-05, |
| "loss": 2.0455, |
| "step": 7400 |
| }, |
| { |
| "epoch": 1.762062440870388, |
| "grad_norm": 15.92798900604248, |
| "learning_rate": 1.6191827804387678e-05, |
| "loss": 2.0026, |
| "step": 7450 |
| }, |
| { |
| "epoch": 1.7738883632923368, |
| "grad_norm": 14.851804733276367, |
| "learning_rate": 1.6132694695760157e-05, |
| "loss": 1.9846, |
| "step": 7500 |
| }, |
| { |
| "epoch": 1.7857142857142856, |
| "grad_norm": 15.551090240478516, |
| "learning_rate": 1.607356158713264e-05, |
| "loss": 1.9594, |
| "step": 7550 |
| }, |
| { |
| "epoch": 1.7975402081362346, |
| "grad_norm": 14.651620864868164, |
| "learning_rate": 1.6014428478505118e-05, |
| "loss": 2.0523, |
| "step": 7600 |
| }, |
| { |
| "epoch": 1.8093661305581836, |
| "grad_norm": 19.447086334228516, |
| "learning_rate": 1.5955295369877596e-05, |
| "loss": 1.9751, |
| "step": 7650 |
| }, |
| { |
| "epoch": 1.8211920529801324, |
| "grad_norm": 14.130012512207031, |
| "learning_rate": 1.5896162261250075e-05, |
| "loss": 1.9898, |
| "step": 7700 |
| }, |
| { |
| "epoch": 1.8330179754020812, |
| "grad_norm": 18.4505615234375, |
| "learning_rate": 1.5837029152622554e-05, |
| "loss": 1.9658, |
| "step": 7750 |
| }, |
| { |
| "epoch": 1.8448438978240302, |
| "grad_norm": 12.992496490478516, |
| "learning_rate": 1.5777896043995033e-05, |
| "loss": 1.9976, |
| "step": 7800 |
| }, |
| { |
| "epoch": 1.8566698202459793, |
| "grad_norm": 17.20708656311035, |
| "learning_rate": 1.571876293536751e-05, |
| "loss": 1.9939, |
| "step": 7850 |
| }, |
| { |
| "epoch": 1.868495742667928, |
| "grad_norm": 14.438339233398438, |
| "learning_rate": 1.5659629826739994e-05, |
| "loss": 1.9666, |
| "step": 7900 |
| }, |
| { |
| "epoch": 1.8803216650898769, |
| "grad_norm": 16.87125015258789, |
| "learning_rate": 1.5600496718112473e-05, |
| "loss": 1.9704, |
| "step": 7950 |
| }, |
| { |
| "epoch": 1.8921475875118259, |
| "grad_norm": 17.480026245117188, |
| "learning_rate": 1.554136360948495e-05, |
| "loss": 1.9822, |
| "step": 8000 |
| }, |
| { |
| "epoch": 1.8921475875118259, |
| "eval_runtime": 47.026, |
| "eval_samples_per_second": 0.0, |
| "eval_steps_per_second": 0.0, |
| "eval_validation_loss": 4.1330844767959665, |
| "step": 8000 |
| }, |
| { |
| "epoch": 1.903973509933775, |
| "grad_norm": 15.649256706237793, |
| "learning_rate": 1.548223050085743e-05, |
| "loss": 1.8534, |
| "step": 8050 |
| }, |
| { |
| "epoch": 1.9157994323557237, |
| "grad_norm": 17.02906608581543, |
| "learning_rate": 1.5423097392229912e-05, |
| "loss": 1.856, |
| "step": 8100 |
| }, |
| { |
| "epoch": 1.9276253547776725, |
| "grad_norm": 16.321977615356445, |
| "learning_rate": 1.536396428360239e-05, |
| "loss": 1.9817, |
| "step": 8150 |
| }, |
| { |
| "epoch": 1.9394512771996215, |
| "grad_norm": 21.492490768432617, |
| "learning_rate": 1.530483117497487e-05, |
| "loss": 1.9095, |
| "step": 8200 |
| }, |
| { |
| "epoch": 1.9512771996215705, |
| "grad_norm": 18.752315521240234, |
| "learning_rate": 1.524569806634735e-05, |
| "loss": 1.9343, |
| "step": 8250 |
| }, |
| { |
| "epoch": 1.9631031220435196, |
| "grad_norm": 17.007205963134766, |
| "learning_rate": 1.518656495771983e-05, |
| "loss": 1.95, |
| "step": 8300 |
| }, |
| { |
| "epoch": 1.9749290444654684, |
| "grad_norm": 16.75872230529785, |
| "learning_rate": 1.512743184909231e-05, |
| "loss": 1.9981, |
| "step": 8350 |
| }, |
| { |
| "epoch": 1.9867549668874172, |
| "grad_norm": 18.816049575805664, |
| "learning_rate": 1.5068298740464788e-05, |
| "loss": 1.8872, |
| "step": 8400 |
| }, |
| { |
| "epoch": 1.9985808893093662, |
| "grad_norm": 16.992637634277344, |
| "learning_rate": 1.5009165631837266e-05, |
| "loss": 1.8112, |
| "step": 8450 |
| }, |
| { |
| "epoch": 2.010406811731315, |
| "grad_norm": 16.72859001159668, |
| "learning_rate": 1.4950032523209746e-05, |
| "loss": 1.8451, |
| "step": 8500 |
| }, |
| { |
| "epoch": 2.0222327341532638, |
| "grad_norm": 15.676278114318848, |
| "learning_rate": 1.4890899414582225e-05, |
| "loss": 1.8918, |
| "step": 8550 |
| }, |
| { |
| "epoch": 2.034058656575213, |
| "grad_norm": 15.531780242919922, |
| "learning_rate": 1.4831766305954705e-05, |
| "loss": 1.7837, |
| "step": 8600 |
| }, |
| { |
| "epoch": 2.045884578997162, |
| "grad_norm": 17.246252059936523, |
| "learning_rate": 1.4772633197327184e-05, |
| "loss": 1.8692, |
| "step": 8650 |
| }, |
| { |
| "epoch": 2.057710501419111, |
| "grad_norm": 13.021443367004395, |
| "learning_rate": 1.4713500088699663e-05, |
| "loss": 1.8614, |
| "step": 8700 |
| }, |
| { |
| "epoch": 2.0695364238410594, |
| "grad_norm": 15.586688041687012, |
| "learning_rate": 1.4654366980072143e-05, |
| "loss": 1.8677, |
| "step": 8750 |
| }, |
| { |
| "epoch": 2.0813623462630084, |
| "grad_norm": 19.62430191040039, |
| "learning_rate": 1.4595233871444622e-05, |
| "loss": 1.8005, |
| "step": 8800 |
| }, |
| { |
| "epoch": 2.0931882686849574, |
| "grad_norm": 15.454833984375, |
| "learning_rate": 1.4536100762817103e-05, |
| "loss": 1.8008, |
| "step": 8850 |
| }, |
| { |
| "epoch": 2.1050141911069065, |
| "grad_norm": 16.70480728149414, |
| "learning_rate": 1.4476967654189581e-05, |
| "loss": 1.8207, |
| "step": 8900 |
| }, |
| { |
| "epoch": 2.116840113528855, |
| "grad_norm": 17.584407806396484, |
| "learning_rate": 1.4417834545562062e-05, |
| "loss": 1.7491, |
| "step": 8950 |
| }, |
| { |
| "epoch": 2.128666035950804, |
| "grad_norm": 17.367647171020508, |
| "learning_rate": 1.435870143693454e-05, |
| "loss": 1.8351, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.140491958372753, |
| "grad_norm": 15.521934509277344, |
| "learning_rate": 1.4299568328307021e-05, |
| "loss": 1.7934, |
| "step": 9050 |
| }, |
| { |
| "epoch": 2.152317880794702, |
| "grad_norm": 18.928241729736328, |
| "learning_rate": 1.42404352196795e-05, |
| "loss": 1.8162, |
| "step": 9100 |
| }, |
| { |
| "epoch": 2.1641438032166507, |
| "grad_norm": 16.490169525146484, |
| "learning_rate": 1.418130211105198e-05, |
| "loss": 1.8496, |
| "step": 9150 |
| }, |
| { |
| "epoch": 2.1759697256385997, |
| "grad_norm": 16.48432731628418, |
| "learning_rate": 1.412216900242446e-05, |
| "loss": 1.7747, |
| "step": 9200 |
| }, |
| { |
| "epoch": 2.1877956480605487, |
| "grad_norm": 11.924939155578613, |
| "learning_rate": 1.406303589379694e-05, |
| "loss": 1.7665, |
| "step": 9250 |
| }, |
| { |
| "epoch": 2.1996215704824977, |
| "grad_norm": 17.498945236206055, |
| "learning_rate": 1.4003902785169417e-05, |
| "loss": 1.789, |
| "step": 9300 |
| }, |
| { |
| "epoch": 2.2114474929044468, |
| "grad_norm": 15.384320259094238, |
| "learning_rate": 1.3944769676541896e-05, |
| "loss": 1.8264, |
| "step": 9350 |
| }, |
| { |
| "epoch": 2.2232734153263953, |
| "grad_norm": 13.456559181213379, |
| "learning_rate": 1.3885636567914376e-05, |
| "loss": 1.788, |
| "step": 9400 |
| }, |
| { |
| "epoch": 2.2350993377483444, |
| "grad_norm": 24.769336700439453, |
| "learning_rate": 1.3826503459286855e-05, |
| "loss": 1.7902, |
| "step": 9450 |
| }, |
| { |
| "epoch": 2.2469252601702934, |
| "grad_norm": 18.193721771240234, |
| "learning_rate": 1.3767370350659335e-05, |
| "loss": 1.8175, |
| "step": 9500 |
| }, |
| { |
| "epoch": 2.258751182592242, |
| "grad_norm": 16.10167121887207, |
| "learning_rate": 1.3708237242031814e-05, |
| "loss": 1.8042, |
| "step": 9550 |
| }, |
| { |
| "epoch": 2.270577105014191, |
| "grad_norm": 15.939582824707031, |
| "learning_rate": 1.3649104133404295e-05, |
| "loss": 1.7767, |
| "step": 9600 |
| }, |
| { |
| "epoch": 2.28240302743614, |
| "grad_norm": 17.35470199584961, |
| "learning_rate": 1.3589971024776774e-05, |
| "loss": 1.7099, |
| "step": 9650 |
| }, |
| { |
| "epoch": 2.294228949858089, |
| "grad_norm": 16.262712478637695, |
| "learning_rate": 1.3530837916149252e-05, |
| "loss": 1.7841, |
| "step": 9700 |
| }, |
| { |
| "epoch": 2.306054872280038, |
| "grad_norm": 13.716343879699707, |
| "learning_rate": 1.3471704807521733e-05, |
| "loss": 1.87, |
| "step": 9750 |
| }, |
| { |
| "epoch": 2.3178807947019866, |
| "grad_norm": 13.402505874633789, |
| "learning_rate": 1.3412571698894212e-05, |
| "loss": 1.7485, |
| "step": 9800 |
| }, |
| { |
| "epoch": 2.3297067171239356, |
| "grad_norm": 14.37375259399414, |
| "learning_rate": 1.3353438590266692e-05, |
| "loss": 1.8367, |
| "step": 9850 |
| }, |
| { |
| "epoch": 2.3415326395458846, |
| "grad_norm": 14.258302688598633, |
| "learning_rate": 1.3294305481639171e-05, |
| "loss": 1.7925, |
| "step": 9900 |
| }, |
| { |
| "epoch": 2.3533585619678337, |
| "grad_norm": 18.176448822021484, |
| "learning_rate": 1.3235172373011651e-05, |
| "loss": 1.9135, |
| "step": 9950 |
| }, |
| { |
| "epoch": 2.3651844843897822, |
| "grad_norm": 16.076860427856445, |
| "learning_rate": 1.317603926438413e-05, |
| "loss": 1.7746, |
| "step": 10000 |
| }, |
| { |
| "epoch": 2.3651844843897822, |
| "eval_runtime": 46.8576, |
| "eval_samples_per_second": 0.0, |
| "eval_steps_per_second": 0.0, |
| "eval_validation_loss": 4.062871016729038, |
| "step": 10000 |
| }, |
| { |
| "epoch": 2.3770104068117313, |
| "grad_norm": 14.89098072052002, |
| "learning_rate": 1.311690615575661e-05, |
| "loss": 1.672, |
| "step": 10050 |
| }, |
| { |
| "epoch": 2.3888363292336803, |
| "grad_norm": 21.15306854248047, |
| "learning_rate": 1.3057773047129088e-05, |
| "loss": 1.7265, |
| "step": 10100 |
| }, |
| { |
| "epoch": 2.4006622516556293, |
| "grad_norm": 13.14006519317627, |
| "learning_rate": 1.2998639938501567e-05, |
| "loss": 1.6875, |
| "step": 10150 |
| }, |
| { |
| "epoch": 2.412488174077578, |
| "grad_norm": 16.71653175354004, |
| "learning_rate": 1.2939506829874047e-05, |
| "loss": 1.7421, |
| "step": 10200 |
| }, |
| { |
| "epoch": 2.424314096499527, |
| "grad_norm": 19.673765182495117, |
| "learning_rate": 1.2880373721246526e-05, |
| "loss": 1.7447, |
| "step": 10250 |
| }, |
| { |
| "epoch": 2.436140018921476, |
| "grad_norm": 13.806225776672363, |
| "learning_rate": 1.2821240612619006e-05, |
| "loss": 1.7335, |
| "step": 10300 |
| }, |
| { |
| "epoch": 2.447965941343425, |
| "grad_norm": 17.10091209411621, |
| "learning_rate": 1.2762107503991485e-05, |
| "loss": 1.6583, |
| "step": 10350 |
| }, |
| { |
| "epoch": 2.4597918637653735, |
| "grad_norm": 13.57816219329834, |
| "learning_rate": 1.2702974395363966e-05, |
| "loss": 1.6937, |
| "step": 10400 |
| }, |
| { |
| "epoch": 2.4716177861873225, |
| "grad_norm": 15.529336929321289, |
| "learning_rate": 1.2643841286736444e-05, |
| "loss": 1.6425, |
| "step": 10450 |
| }, |
| { |
| "epoch": 2.4834437086092715, |
| "grad_norm": 15.039297103881836, |
| "learning_rate": 1.2584708178108925e-05, |
| "loss": 1.7837, |
| "step": 10500 |
| }, |
| { |
| "epoch": 2.4952696310312206, |
| "grad_norm": 18.062923431396484, |
| "learning_rate": 1.2525575069481404e-05, |
| "loss": 1.7589, |
| "step": 10550 |
| }, |
| { |
| "epoch": 2.507095553453169, |
| "grad_norm": 14.291655540466309, |
| "learning_rate": 1.2466441960853884e-05, |
| "loss": 1.6618, |
| "step": 10600 |
| }, |
| { |
| "epoch": 2.518921475875118, |
| "grad_norm": 15.268333435058594, |
| "learning_rate": 1.2407308852226363e-05, |
| "loss": 1.6107, |
| "step": 10650 |
| }, |
| { |
| "epoch": 2.530747398297067, |
| "grad_norm": 15.746752738952637, |
| "learning_rate": 1.2348175743598842e-05, |
| "loss": 1.706, |
| "step": 10700 |
| }, |
| { |
| "epoch": 2.542573320719016, |
| "grad_norm": 14.740198135375977, |
| "learning_rate": 1.2289042634971322e-05, |
| "loss": 1.6662, |
| "step": 10750 |
| }, |
| { |
| "epoch": 2.5543992431409652, |
| "grad_norm": 18.715717315673828, |
| "learning_rate": 1.2229909526343801e-05, |
| "loss": 1.7491, |
| "step": 10800 |
| }, |
| { |
| "epoch": 2.566225165562914, |
| "grad_norm": 13.341856956481934, |
| "learning_rate": 1.2170776417716281e-05, |
| "loss": 1.615, |
| "step": 10850 |
| }, |
| { |
| "epoch": 2.578051087984863, |
| "grad_norm": 15.429610252380371, |
| "learning_rate": 1.211164330908876e-05, |
| "loss": 1.6314, |
| "step": 10900 |
| }, |
| { |
| "epoch": 2.589877010406812, |
| "grad_norm": 16.15951919555664, |
| "learning_rate": 1.2052510200461239e-05, |
| "loss": 1.6564, |
| "step": 10950 |
| }, |
| { |
| "epoch": 2.6017029328287604, |
| "grad_norm": 16.425504684448242, |
| "learning_rate": 1.1993377091833718e-05, |
| "loss": 1.6085, |
| "step": 11000 |
| }, |
| { |
| "epoch": 2.6135288552507094, |
| "grad_norm": 19.02115249633789, |
| "learning_rate": 1.1934243983206197e-05, |
| "loss": 1.6969, |
| "step": 11050 |
| }, |
| { |
| "epoch": 2.6253547776726585, |
| "grad_norm": 16.245838165283203, |
| "learning_rate": 1.1875110874578677e-05, |
| "loss": 1.5963, |
| "step": 11100 |
| }, |
| { |
| "epoch": 2.6371807000946075, |
| "grad_norm": 14.986413955688477, |
| "learning_rate": 1.1815977765951156e-05, |
| "loss": 1.6626, |
| "step": 11150 |
| }, |
| { |
| "epoch": 2.6490066225165565, |
| "grad_norm": 18.501134872436523, |
| "learning_rate": 1.1756844657323636e-05, |
| "loss": 1.715, |
| "step": 11200 |
| }, |
| { |
| "epoch": 2.660832544938505, |
| "grad_norm": 19.390989303588867, |
| "learning_rate": 1.1697711548696115e-05, |
| "loss": 1.6182, |
| "step": 11250 |
| }, |
| { |
| "epoch": 2.672658467360454, |
| "grad_norm": 16.83384132385254, |
| "learning_rate": 1.1638578440068596e-05, |
| "loss": 1.5667, |
| "step": 11300 |
| }, |
| { |
| "epoch": 2.684484389782403, |
| "grad_norm": 17.595382690429688, |
| "learning_rate": 1.1579445331441074e-05, |
| "loss": 1.6255, |
| "step": 11350 |
| }, |
| { |
| "epoch": 2.6963103122043517, |
| "grad_norm": 18.588014602661133, |
| "learning_rate": 1.1520312222813555e-05, |
| "loss": 1.6146, |
| "step": 11400 |
| }, |
| { |
| "epoch": 2.7081362346263007, |
| "grad_norm": 18.090600967407227, |
| "learning_rate": 1.1461179114186034e-05, |
| "loss": 1.5807, |
| "step": 11450 |
| }, |
| { |
| "epoch": 2.7199621570482497, |
| "grad_norm": 16.144756317138672, |
| "learning_rate": 1.1402046005558514e-05, |
| "loss": 1.571, |
| "step": 11500 |
| }, |
| { |
| "epoch": 2.7317880794701987, |
| "grad_norm": 19.271270751953125, |
| "learning_rate": 1.1342912896930993e-05, |
| "loss": 1.611, |
| "step": 11550 |
| }, |
| { |
| "epoch": 2.7436140018921478, |
| "grad_norm": 15.365574836730957, |
| "learning_rate": 1.1283779788303474e-05, |
| "loss": 1.5757, |
| "step": 11600 |
| }, |
| { |
| "epoch": 2.7554399243140963, |
| "grad_norm": 18.699979782104492, |
| "learning_rate": 1.1224646679675952e-05, |
| "loss": 1.6048, |
| "step": 11650 |
| }, |
| { |
| "epoch": 2.7672658467360454, |
| "grad_norm": 15.537507057189941, |
| "learning_rate": 1.1165513571048431e-05, |
| "loss": 1.5559, |
| "step": 11700 |
| }, |
| { |
| "epoch": 2.7790917691579944, |
| "grad_norm": 15.148637771606445, |
| "learning_rate": 1.110638046242091e-05, |
| "loss": 1.5634, |
| "step": 11750 |
| }, |
| { |
| "epoch": 2.790917691579943, |
| "grad_norm": 17.472370147705078, |
| "learning_rate": 1.1047247353793389e-05, |
| "loss": 1.5663, |
| "step": 11800 |
| }, |
| { |
| "epoch": 2.8027436140018924, |
| "grad_norm": 16.284570693969727, |
| "learning_rate": 1.098811424516587e-05, |
| "loss": 1.6274, |
| "step": 11850 |
| }, |
| { |
| "epoch": 2.814569536423841, |
| "grad_norm": 17.758365631103516, |
| "learning_rate": 1.0928981136538348e-05, |
| "loss": 1.5478, |
| "step": 11900 |
| }, |
| { |
| "epoch": 2.82639545884579, |
| "grad_norm": 14.631210327148438, |
| "learning_rate": 1.0869848027910828e-05, |
| "loss": 1.5728, |
| "step": 11950 |
| }, |
| { |
| "epoch": 2.838221381267739, |
| "grad_norm": 13.960256576538086, |
| "learning_rate": 1.0810714919283307e-05, |
| "loss": 1.5694, |
| "step": 12000 |
| }, |
| { |
| "epoch": 2.838221381267739, |
| "eval_runtime": 47.1893, |
| "eval_samples_per_second": 0.0, |
| "eval_steps_per_second": 0.0, |
| "eval_validation_loss": 3.9845195254937416, |
| "step": 12000 |
| }, |
| { |
| "epoch": 2.8500473036896876, |
| "grad_norm": 13.964838981628418, |
| "learning_rate": 1.0751581810655786e-05, |
| "loss": 1.5544, |
| "step": 12050 |
| }, |
| { |
| "epoch": 2.8618732261116366, |
| "grad_norm": 20.20077133178711, |
| "learning_rate": 1.0692448702028267e-05, |
| "loss": 1.6172, |
| "step": 12100 |
| }, |
| { |
| "epoch": 2.8736991485335857, |
| "grad_norm": 13.513507843017578, |
| "learning_rate": 1.0633315593400745e-05, |
| "loss": 1.5358, |
| "step": 12150 |
| }, |
| { |
| "epoch": 2.8855250709555347, |
| "grad_norm": 18.936565399169922, |
| "learning_rate": 1.0574182484773226e-05, |
| "loss": 1.5656, |
| "step": 12200 |
| }, |
| { |
| "epoch": 2.8973509933774837, |
| "grad_norm": 17.975814819335938, |
| "learning_rate": 1.0515049376145705e-05, |
| "loss": 1.5273, |
| "step": 12250 |
| }, |
| { |
| "epoch": 2.9091769157994323, |
| "grad_norm": 18.273731231689453, |
| "learning_rate": 1.0455916267518185e-05, |
| "loss": 1.4981, |
| "step": 12300 |
| }, |
| { |
| "epoch": 2.9210028382213813, |
| "grad_norm": 16.280357360839844, |
| "learning_rate": 1.0396783158890664e-05, |
| "loss": 1.5256, |
| "step": 12350 |
| }, |
| { |
| "epoch": 2.9328287606433303, |
| "grad_norm": 13.220331192016602, |
| "learning_rate": 1.0337650050263144e-05, |
| "loss": 1.522, |
| "step": 12400 |
| }, |
| { |
| "epoch": 2.944654683065279, |
| "grad_norm": 16.336288452148438, |
| "learning_rate": 1.0278516941635623e-05, |
| "loss": 1.465, |
| "step": 12450 |
| }, |
| { |
| "epoch": 2.956480605487228, |
| "grad_norm": 20.016876220703125, |
| "learning_rate": 1.0219383833008104e-05, |
| "loss": 1.6151, |
| "step": 12500 |
| }, |
| { |
| "epoch": 2.968306527909177, |
| "grad_norm": 17.370023727416992, |
| "learning_rate": 1.0160250724380582e-05, |
| "loss": 1.5521, |
| "step": 12550 |
| }, |
| { |
| "epoch": 2.980132450331126, |
| "grad_norm": 18.69423484802246, |
| "learning_rate": 1.010111761575306e-05, |
| "loss": 1.5657, |
| "step": 12600 |
| }, |
| { |
| "epoch": 2.991958372753075, |
| "grad_norm": 18.094669342041016, |
| "learning_rate": 1.004198450712554e-05, |
| "loss": 1.4935, |
| "step": 12650 |
| }, |
| { |
| "epoch": 3.0037842951750235, |
| "grad_norm": 15.485885620117188, |
| "learning_rate": 9.98285139849802e-06, |
| "loss": 1.4081, |
| "step": 12700 |
| }, |
| { |
| "epoch": 3.0156102175969726, |
| "grad_norm": 15.505888938903809, |
| "learning_rate": 9.9237182898705e-06, |
| "loss": 1.5243, |
| "step": 12750 |
| }, |
| { |
| "epoch": 3.0274361400189216, |
| "grad_norm": 16.799917221069336, |
| "learning_rate": 9.864585181242978e-06, |
| "loss": 1.4999, |
| "step": 12800 |
| }, |
| { |
| "epoch": 3.0392620624408706, |
| "grad_norm": 15.498218536376953, |
| "learning_rate": 9.805452072615459e-06, |
| "loss": 1.4139, |
| "step": 12850 |
| }, |
| { |
| "epoch": 3.051087984862819, |
| "grad_norm": 19.318891525268555, |
| "learning_rate": 9.746318963987937e-06, |
| "loss": 1.5522, |
| "step": 12900 |
| }, |
| { |
| "epoch": 3.062913907284768, |
| "grad_norm": 14.893320083618164, |
| "learning_rate": 9.687185855360418e-06, |
| "loss": 1.4865, |
| "step": 12950 |
| }, |
| { |
| "epoch": 3.074739829706717, |
| "grad_norm": 18.767566680908203, |
| "learning_rate": 9.628052746732897e-06, |
| "loss": 1.4755, |
| "step": 13000 |
| }, |
| { |
| "epoch": 3.0865657521286662, |
| "grad_norm": 14.623005867004395, |
| "learning_rate": 9.568919638105375e-06, |
| "loss": 1.4582, |
| "step": 13050 |
| }, |
| { |
| "epoch": 3.098391674550615, |
| "grad_norm": 14.217521667480469, |
| "learning_rate": 9.509786529477856e-06, |
| "loss": 1.4112, |
| "step": 13100 |
| }, |
| { |
| "epoch": 3.110217596972564, |
| "grad_norm": 13.287856101989746, |
| "learning_rate": 9.450653420850335e-06, |
| "loss": 1.4758, |
| "step": 13150 |
| }, |
| { |
| "epoch": 3.122043519394513, |
| "grad_norm": 14.488649368286133, |
| "learning_rate": 9.391520312222813e-06, |
| "loss": 1.4388, |
| "step": 13200 |
| }, |
| { |
| "epoch": 3.133869441816462, |
| "grad_norm": 15.88402271270752, |
| "learning_rate": 9.332387203595294e-06, |
| "loss": 1.4819, |
| "step": 13250 |
| }, |
| { |
| "epoch": 3.1456953642384105, |
| "grad_norm": 13.743453025817871, |
| "learning_rate": 9.273254094967773e-06, |
| "loss": 1.4525, |
| "step": 13300 |
| }, |
| { |
| "epoch": 3.1575212866603595, |
| "grad_norm": 16.949493408203125, |
| "learning_rate": 9.214120986340253e-06, |
| "loss": 1.4583, |
| "step": 13350 |
| }, |
| { |
| "epoch": 3.1693472090823085, |
| "grad_norm": 15.139965057373047, |
| "learning_rate": 9.154987877712732e-06, |
| "loss": 1.4714, |
| "step": 13400 |
| }, |
| { |
| "epoch": 3.1811731315042575, |
| "grad_norm": 18.97600746154785, |
| "learning_rate": 9.095854769085213e-06, |
| "loss": 1.4265, |
| "step": 13450 |
| }, |
| { |
| "epoch": 3.192999053926206, |
| "grad_norm": 16.3485107421875, |
| "learning_rate": 9.036721660457691e-06, |
| "loss": 1.454, |
| "step": 13500 |
| }, |
| { |
| "epoch": 3.204824976348155, |
| "grad_norm": 17.43102264404297, |
| "learning_rate": 8.97758855183017e-06, |
| "loss": 1.4506, |
| "step": 13550 |
| }, |
| { |
| "epoch": 3.216650898770104, |
| "grad_norm": 15.63143253326416, |
| "learning_rate": 8.91845544320265e-06, |
| "loss": 1.4055, |
| "step": 13600 |
| }, |
| { |
| "epoch": 3.228476821192053, |
| "grad_norm": 12.323601722717285, |
| "learning_rate": 8.85932233457513e-06, |
| "loss": 1.4729, |
| "step": 13650 |
| }, |
| { |
| "epoch": 3.2403027436140017, |
| "grad_norm": 17.078189849853516, |
| "learning_rate": 8.800189225947608e-06, |
| "loss": 1.4791, |
| "step": 13700 |
| }, |
| { |
| "epoch": 3.2521286660359507, |
| "grad_norm": 14.752788543701172, |
| "learning_rate": 8.741056117320089e-06, |
| "loss": 1.4962, |
| "step": 13750 |
| }, |
| { |
| "epoch": 3.2639545884578998, |
| "grad_norm": 12.897354125976562, |
| "learning_rate": 8.681923008692567e-06, |
| "loss": 1.4101, |
| "step": 13800 |
| }, |
| { |
| "epoch": 3.275780510879849, |
| "grad_norm": 12.985773086547852, |
| "learning_rate": 8.622789900065048e-06, |
| "loss": 1.4596, |
| "step": 13850 |
| }, |
| { |
| "epoch": 3.2876064333017974, |
| "grad_norm": 16.538026809692383, |
| "learning_rate": 8.563656791437527e-06, |
| "loss": 1.4207, |
| "step": 13900 |
| }, |
| { |
| "epoch": 3.2994323557237464, |
| "grad_norm": 20.395875930786133, |
| "learning_rate": 8.504523682810007e-06, |
| "loss": 1.4972, |
| "step": 13950 |
| }, |
| { |
| "epoch": 3.3112582781456954, |
| "grad_norm": 11.993943214416504, |
| "learning_rate": 8.445390574182486e-06, |
| "loss": 1.4473, |
| "step": 14000 |
| }, |
| { |
| "epoch": 3.3112582781456954, |
| "eval_runtime": 47.0766, |
| "eval_samples_per_second": 0.0, |
| "eval_steps_per_second": 0.0, |
| "eval_validation_loss": 3.9688115547590725, |
| "step": 14000 |
| }, |
| { |
| "epoch": 3.3230842005676444, |
| "grad_norm": 12.915935516357422, |
| "learning_rate": 8.386257465554965e-06, |
| "loss": 1.46, |
| "step": 14050 |
| }, |
| { |
| "epoch": 3.334910122989593, |
| "grad_norm": 19.571788787841797, |
| "learning_rate": 8.327124356927444e-06, |
| "loss": 1.5012, |
| "step": 14100 |
| }, |
| { |
| "epoch": 3.346736045411542, |
| "grad_norm": 17.244380950927734, |
| "learning_rate": 8.267991248299924e-06, |
| "loss": 1.5466, |
| "step": 14150 |
| }, |
| { |
| "epoch": 3.358561967833491, |
| "grad_norm": 18.117067337036133, |
| "learning_rate": 8.208858139672403e-06, |
| "loss": 1.5067, |
| "step": 14200 |
| }, |
| { |
| "epoch": 3.37038789025544, |
| "grad_norm": 14.23071575164795, |
| "learning_rate": 8.149725031044883e-06, |
| "loss": 1.4413, |
| "step": 14250 |
| }, |
| { |
| "epoch": 3.3822138126773886, |
| "grad_norm": 16.817371368408203, |
| "learning_rate": 8.090591922417362e-06, |
| "loss": 1.3448, |
| "step": 14300 |
| }, |
| { |
| "epoch": 3.3940397350993377, |
| "grad_norm": 21.39740753173828, |
| "learning_rate": 8.031458813789843e-06, |
| "loss": 1.3792, |
| "step": 14350 |
| }, |
| { |
| "epoch": 3.4058656575212867, |
| "grad_norm": 13.991111755371094, |
| "learning_rate": 7.972325705162321e-06, |
| "loss": 1.4019, |
| "step": 14400 |
| }, |
| { |
| "epoch": 3.4176915799432357, |
| "grad_norm": 14.572546005249023, |
| "learning_rate": 7.9131925965348e-06, |
| "loss": 1.4771, |
| "step": 14450 |
| }, |
| { |
| "epoch": 3.4295175023651847, |
| "grad_norm": 15.65616226196289, |
| "learning_rate": 7.854059487907279e-06, |
| "loss": 1.4383, |
| "step": 14500 |
| }, |
| { |
| "epoch": 3.4413434247871333, |
| "grad_norm": 16.871171951293945, |
| "learning_rate": 7.79492637927976e-06, |
| "loss": 1.4172, |
| "step": 14550 |
| }, |
| { |
| "epoch": 3.4531693472090823, |
| "grad_norm": 16.653839111328125, |
| "learning_rate": 7.735793270652238e-06, |
| "loss": 1.3284, |
| "step": 14600 |
| }, |
| { |
| "epoch": 3.4649952696310313, |
| "grad_norm": 18.008516311645508, |
| "learning_rate": 7.676660162024719e-06, |
| "loss": 1.3867, |
| "step": 14650 |
| }, |
| { |
| "epoch": 3.47682119205298, |
| "grad_norm": 21.629899978637695, |
| "learning_rate": 7.6175270533971976e-06, |
| "loss": 1.3931, |
| "step": 14700 |
| }, |
| { |
| "epoch": 3.488647114474929, |
| "grad_norm": 15.525995254516602, |
| "learning_rate": 7.558393944769677e-06, |
| "loss": 1.4429, |
| "step": 14750 |
| }, |
| { |
| "epoch": 3.500473036896878, |
| "grad_norm": 15.045352935791016, |
| "learning_rate": 7.499260836142157e-06, |
| "loss": 1.4665, |
| "step": 14800 |
| }, |
| { |
| "epoch": 3.512298959318827, |
| "grad_norm": 16.258941650390625, |
| "learning_rate": 7.440127727514636e-06, |
| "loss": 1.3232, |
| "step": 14850 |
| }, |
| { |
| "epoch": 3.524124881740776, |
| "grad_norm": 14.834844589233398, |
| "learning_rate": 7.380994618887115e-06, |
| "loss": 1.4112, |
| "step": 14900 |
| }, |
| { |
| "epoch": 3.5359508041627246, |
| "grad_norm": 18.840707778930664, |
| "learning_rate": 7.321861510259595e-06, |
| "loss": 1.3916, |
| "step": 14950 |
| }, |
| { |
| "epoch": 3.5477767265846736, |
| "grad_norm": 17.09494972229004, |
| "learning_rate": 7.2627284016320746e-06, |
| "loss": 1.3572, |
| "step": 15000 |
| }, |
| { |
| "epoch": 3.5596026490066226, |
| "grad_norm": 17.76523780822754, |
| "learning_rate": 7.203595293004554e-06, |
| "loss": 1.4414, |
| "step": 15050 |
| }, |
| { |
| "epoch": 3.571428571428571, |
| "grad_norm": 19.53270149230957, |
| "learning_rate": 7.144462184377034e-06, |
| "loss": 1.2716, |
| "step": 15100 |
| }, |
| { |
| "epoch": 3.58325449385052, |
| "grad_norm": 18.649320602416992, |
| "learning_rate": 7.085329075749513e-06, |
| "loss": 1.4043, |
| "step": 15150 |
| }, |
| { |
| "epoch": 3.595080416272469, |
| "grad_norm": 13.581181526184082, |
| "learning_rate": 7.026195967121992e-06, |
| "loss": 1.3686, |
| "step": 15200 |
| }, |
| { |
| "epoch": 3.6069063386944182, |
| "grad_norm": 21.46381950378418, |
| "learning_rate": 6.967062858494472e-06, |
| "loss": 1.3687, |
| "step": 15250 |
| }, |
| { |
| "epoch": 3.6187322611163673, |
| "grad_norm": 10.937467575073242, |
| "learning_rate": 6.907929749866951e-06, |
| "loss": 1.3183, |
| "step": 15300 |
| }, |
| { |
| "epoch": 3.630558183538316, |
| "grad_norm": 18.974475860595703, |
| "learning_rate": 6.84879664123943e-06, |
| "loss": 1.3712, |
| "step": 15350 |
| }, |
| { |
| "epoch": 3.642384105960265, |
| "grad_norm": 17.913204193115234, |
| "learning_rate": 6.78966353261191e-06, |
| "loss": 1.4006, |
| "step": 15400 |
| }, |
| { |
| "epoch": 3.654210028382214, |
| "grad_norm": 14.945576667785645, |
| "learning_rate": 6.73053042398439e-06, |
| "loss": 1.4326, |
| "step": 15450 |
| }, |
| { |
| "epoch": 3.666035950804163, |
| "grad_norm": 15.58818531036377, |
| "learning_rate": 6.671397315356869e-06, |
| "loss": 1.3116, |
| "step": 15500 |
| }, |
| { |
| "epoch": 3.677861873226112, |
| "grad_norm": 16.57988739013672, |
| "learning_rate": 6.612264206729349e-06, |
| "loss": 1.2975, |
| "step": 15550 |
| }, |
| { |
| "epoch": 3.6896877956480605, |
| "grad_norm": 13.658615112304688, |
| "learning_rate": 6.5531310981018285e-06, |
| "loss": 1.3709, |
| "step": 15600 |
| }, |
| { |
| "epoch": 3.7015137180700095, |
| "grad_norm": 16.559919357299805, |
| "learning_rate": 6.493997989474307e-06, |
| "loss": 1.3267, |
| "step": 15650 |
| }, |
| { |
| "epoch": 3.7133396404919585, |
| "grad_norm": 16.319732666015625, |
| "learning_rate": 6.434864880846786e-06, |
| "loss": 1.2947, |
| "step": 15700 |
| }, |
| { |
| "epoch": 3.725165562913907, |
| "grad_norm": 15.4765043258667, |
| "learning_rate": 6.375731772219266e-06, |
| "loss": 1.3524, |
| "step": 15750 |
| }, |
| { |
| "epoch": 3.736991485335856, |
| "grad_norm": 14.876737594604492, |
| "learning_rate": 6.316598663591745e-06, |
| "loss": 1.3092, |
| "step": 15800 |
| }, |
| { |
| "epoch": 3.748817407757805, |
| "grad_norm": 13.654143333435059, |
| "learning_rate": 6.257465554964225e-06, |
| "loss": 1.3635, |
| "step": 15850 |
| }, |
| { |
| "epoch": 3.760643330179754, |
| "grad_norm": 16.795425415039062, |
| "learning_rate": 6.198332446336705e-06, |
| "loss": 1.282, |
| "step": 15900 |
| }, |
| { |
| "epoch": 3.772469252601703, |
| "grad_norm": 12.707657814025879, |
| "learning_rate": 6.139199337709184e-06, |
| "loss": 1.3122, |
| "step": 15950 |
| }, |
| { |
| "epoch": 3.7842951750236518, |
| "grad_norm": 15.771327018737793, |
| "learning_rate": 6.080066229081664e-06, |
| "loss": 1.2944, |
| "step": 16000 |
| }, |
| { |
| "epoch": 3.7842951750236518, |
| "eval_runtime": 46.7062, |
| "eval_samples_per_second": 0.0, |
| "eval_steps_per_second": 0.0, |
| "eval_validation_loss": 3.97229260179494, |
| "step": 16000 |
| }, |
| { |
| "epoch": 3.796121097445601, |
| "grad_norm": 15.549239158630371, |
| "learning_rate": 6.020933120454144e-06, |
| "loss": 1.3878, |
| "step": 16050 |
| }, |
| { |
| "epoch": 3.80794701986755, |
| "grad_norm": 11.558497428894043, |
| "learning_rate": 5.9618000118266215e-06, |
| "loss": 1.2978, |
| "step": 16100 |
| }, |
| { |
| "epoch": 3.8197729422894984, |
| "grad_norm": 17.571189880371094, |
| "learning_rate": 5.902666903199101e-06, |
| "loss": 1.3128, |
| "step": 16150 |
| }, |
| { |
| "epoch": 3.8315988647114474, |
| "grad_norm": 16.608991622924805, |
| "learning_rate": 5.843533794571581e-06, |
| "loss": 1.317, |
| "step": 16200 |
| }, |
| { |
| "epoch": 3.8434247871333964, |
| "grad_norm": 17.64645004272461, |
| "learning_rate": 5.7844006859440605e-06, |
| "loss": 1.3225, |
| "step": 16250 |
| }, |
| { |
| "epoch": 3.8552507095553454, |
| "grad_norm": 16.340919494628906, |
| "learning_rate": 5.72526757731654e-06, |
| "loss": 1.3339, |
| "step": 16300 |
| }, |
| { |
| "epoch": 3.8670766319772945, |
| "grad_norm": 17.24504280090332, |
| "learning_rate": 5.66613446868902e-06, |
| "loss": 1.3137, |
| "step": 16350 |
| }, |
| { |
| "epoch": 3.878902554399243, |
| "grad_norm": 16.168750762939453, |
| "learning_rate": 5.607001360061499e-06, |
| "loss": 1.3128, |
| "step": 16400 |
| }, |
| { |
| "epoch": 3.890728476821192, |
| "grad_norm": 20.071321487426758, |
| "learning_rate": 5.547868251433979e-06, |
| "loss": 1.3262, |
| "step": 16450 |
| }, |
| { |
| "epoch": 3.902554399243141, |
| "grad_norm": 19.031503677368164, |
| "learning_rate": 5.488735142806457e-06, |
| "loss": 1.2235, |
| "step": 16500 |
| }, |
| { |
| "epoch": 3.9143803216650896, |
| "grad_norm": 18.36022186279297, |
| "learning_rate": 5.429602034178937e-06, |
| "loss": 1.2619, |
| "step": 16550 |
| }, |
| { |
| "epoch": 3.9262062440870387, |
| "grad_norm": 17.923831939697266, |
| "learning_rate": 5.370468925551416e-06, |
| "loss": 1.3289, |
| "step": 16600 |
| }, |
| { |
| "epoch": 3.9380321665089877, |
| "grad_norm": 13.550859451293945, |
| "learning_rate": 5.311335816923896e-06, |
| "loss": 1.2437, |
| "step": 16650 |
| }, |
| { |
| "epoch": 3.9498580889309367, |
| "grad_norm": 12.4674654006958, |
| "learning_rate": 5.2522027082963755e-06, |
| "loss": 1.2886, |
| "step": 16700 |
| }, |
| { |
| "epoch": 3.9616840113528857, |
| "grad_norm": 18.66042709350586, |
| "learning_rate": 5.193069599668855e-06, |
| "loss": 1.3309, |
| "step": 16750 |
| }, |
| { |
| "epoch": 3.9735099337748343, |
| "grad_norm": 15.646864891052246, |
| "learning_rate": 5.133936491041335e-06, |
| "loss": 1.3457, |
| "step": 16800 |
| }, |
| { |
| "epoch": 3.9853358561967833, |
| "grad_norm": 17.00884246826172, |
| "learning_rate": 5.0748033824138144e-06, |
| "loss": 1.3184, |
| "step": 16850 |
| }, |
| { |
| "epoch": 3.9971617786187323, |
| "grad_norm": 13.399270057678223, |
| "learning_rate": 5.015670273786294e-06, |
| "loss": 1.2087, |
| "step": 16900 |
| }, |
| { |
| "epoch": 4.008987701040681, |
| "grad_norm": 13.706061363220215, |
| "learning_rate": 4.956537165158773e-06, |
| "loss": 1.229, |
| "step": 16950 |
| }, |
| { |
| "epoch": 4.02081362346263, |
| "grad_norm": 22.725217819213867, |
| "learning_rate": 4.8974040565312525e-06, |
| "loss": 1.3235, |
| "step": 17000 |
| }, |
| { |
| "epoch": 4.032639545884579, |
| "grad_norm": 16.072246551513672, |
| "learning_rate": 4.838270947903731e-06, |
| "loss": 1.195, |
| "step": 17050 |
| }, |
| { |
| "epoch": 4.0444654683065275, |
| "grad_norm": 17.015745162963867, |
| "learning_rate": 4.779137839276211e-06, |
| "loss": 1.2793, |
| "step": 17100 |
| }, |
| { |
| "epoch": 4.056291390728477, |
| "grad_norm": 15.687799453735352, |
| "learning_rate": 4.7200047306486906e-06, |
| "loss": 1.2719, |
| "step": 17150 |
| }, |
| { |
| "epoch": 4.068117313150426, |
| "grad_norm": 11.79020881652832, |
| "learning_rate": 4.66087162202117e-06, |
| "loss": 1.2701, |
| "step": 17200 |
| }, |
| { |
| "epoch": 4.079943235572375, |
| "grad_norm": 14.385472297668457, |
| "learning_rate": 4.601738513393649e-06, |
| "loss": 1.2593, |
| "step": 17250 |
| }, |
| { |
| "epoch": 4.091769157994324, |
| "grad_norm": 18.47262954711914, |
| "learning_rate": 4.542605404766129e-06, |
| "loss": 1.2324, |
| "step": 17300 |
| }, |
| { |
| "epoch": 4.103595080416272, |
| "grad_norm": 22.801834106445312, |
| "learning_rate": 4.483472296138608e-06, |
| "loss": 1.2338, |
| "step": 17350 |
| }, |
| { |
| "epoch": 4.115421002838222, |
| "grad_norm": 16.11665916442871, |
| "learning_rate": 4.424339187511088e-06, |
| "loss": 1.2338, |
| "step": 17400 |
| }, |
| { |
| "epoch": 4.12724692526017, |
| "grad_norm": 18.113365173339844, |
| "learning_rate": 4.365206078883567e-06, |
| "loss": 1.2595, |
| "step": 17450 |
| }, |
| { |
| "epoch": 4.139072847682119, |
| "grad_norm": 15.56670093536377, |
| "learning_rate": 4.306072970256046e-06, |
| "loss": 1.2434, |
| "step": 17500 |
| }, |
| { |
| "epoch": 4.150898770104068, |
| "grad_norm": 18.501914978027344, |
| "learning_rate": 4.246939861628526e-06, |
| "loss": 1.268, |
| "step": 17550 |
| }, |
| { |
| "epoch": 4.162724692526017, |
| "grad_norm": 16.622150421142578, |
| "learning_rate": 4.187806753001006e-06, |
| "loss": 1.2345, |
| "step": 17600 |
| }, |
| { |
| "epoch": 4.174550614947966, |
| "grad_norm": 19.019207000732422, |
| "learning_rate": 4.128673644373484e-06, |
| "loss": 1.228, |
| "step": 17650 |
| }, |
| { |
| "epoch": 4.186376537369915, |
| "grad_norm": 11.55809211730957, |
| "learning_rate": 4.069540535745964e-06, |
| "loss": 1.235, |
| "step": 17700 |
| }, |
| { |
| "epoch": 4.1982024597918635, |
| "grad_norm": 14.763603210449219, |
| "learning_rate": 4.010407427118444e-06, |
| "loss": 1.2198, |
| "step": 17750 |
| }, |
| { |
| "epoch": 4.210028382213813, |
| "grad_norm": 17.480113983154297, |
| "learning_rate": 3.951274318490923e-06, |
| "loss": 1.263, |
| "step": 17800 |
| }, |
| { |
| "epoch": 4.2218543046357615, |
| "grad_norm": 17.487497329711914, |
| "learning_rate": 3.892141209863403e-06, |
| "loss": 1.2288, |
| "step": 17850 |
| }, |
| { |
| "epoch": 4.23368022705771, |
| "grad_norm": 14.157654762268066, |
| "learning_rate": 3.833008101235882e-06, |
| "loss": 1.2251, |
| "step": 17900 |
| }, |
| { |
| "epoch": 4.2455061494796595, |
| "grad_norm": 21.731857299804688, |
| "learning_rate": 3.773874992608362e-06, |
| "loss": 1.2796, |
| "step": 17950 |
| }, |
| { |
| "epoch": 4.257332071901608, |
| "grad_norm": 17.268417358398438, |
| "learning_rate": 3.7147418839808415e-06, |
| "loss": 1.2934, |
| "step": 18000 |
| }, |
| { |
| "epoch": 4.257332071901608, |
| "eval_runtime": 47.1593, |
| "eval_samples_per_second": 0.0, |
| "eval_steps_per_second": 0.0, |
| "eval_validation_loss": 3.9484923051611727, |
| "step": 18000 |
| }, |
| { |
| "epoch": 4.269157994323558, |
| "grad_norm": 12.740385055541992, |
| "learning_rate": 3.6556087753533203e-06, |
| "loss": 1.2197, |
| "step": 18050 |
| }, |
| { |
| "epoch": 4.280983916745506, |
| "grad_norm": 17.239517211914062, |
| "learning_rate": 3.5964756667258e-06, |
| "loss": 1.1908, |
| "step": 18100 |
| }, |
| { |
| "epoch": 4.292809839167455, |
| "grad_norm": 16.485107421875, |
| "learning_rate": 3.5373425580982795e-06, |
| "loss": 1.2549, |
| "step": 18150 |
| }, |
| { |
| "epoch": 4.304635761589404, |
| "grad_norm": 17.04962921142578, |
| "learning_rate": 3.478209449470759e-06, |
| "loss": 1.3468, |
| "step": 18200 |
| }, |
| { |
| "epoch": 4.316461684011353, |
| "grad_norm": 14.987895965576172, |
| "learning_rate": 3.419076340843238e-06, |
| "loss": 1.2323, |
| "step": 18250 |
| }, |
| { |
| "epoch": 4.328287606433301, |
| "grad_norm": 14.840313911437988, |
| "learning_rate": 3.3599432322157176e-06, |
| "loss": 1.2897, |
| "step": 18300 |
| }, |
| { |
| "epoch": 4.340113528855251, |
| "grad_norm": 17.09177589416504, |
| "learning_rate": 3.3008101235881973e-06, |
| "loss": 1.3231, |
| "step": 18350 |
| }, |
| { |
| "epoch": 4.351939451277199, |
| "grad_norm": 16.76932716369629, |
| "learning_rate": 3.241677014960677e-06, |
| "loss": 1.3587, |
| "step": 18400 |
| }, |
| { |
| "epoch": 4.363765373699149, |
| "grad_norm": 17.611955642700195, |
| "learning_rate": 3.1825439063331565e-06, |
| "loss": 1.2639, |
| "step": 18450 |
| }, |
| { |
| "epoch": 4.375591296121097, |
| "grad_norm": 11.352503776550293, |
| "learning_rate": 3.1234107977056353e-06, |
| "loss": 1.2244, |
| "step": 18500 |
| }, |
| { |
| "epoch": 4.387417218543046, |
| "grad_norm": 17.059810638427734, |
| "learning_rate": 3.064277689078115e-06, |
| "loss": 1.1932, |
| "step": 18550 |
| }, |
| { |
| "epoch": 4.3992431409649955, |
| "grad_norm": 15.7676420211792, |
| "learning_rate": 3.0051445804505946e-06, |
| "loss": 1.2072, |
| "step": 18600 |
| }, |
| { |
| "epoch": 4.411069063386944, |
| "grad_norm": 20.51708984375, |
| "learning_rate": 2.9460114718230742e-06, |
| "loss": 1.2257, |
| "step": 18650 |
| }, |
| { |
| "epoch": 4.4228949858088935, |
| "grad_norm": 9.713994979858398, |
| "learning_rate": 2.8868783631955535e-06, |
| "loss": 1.2368, |
| "step": 18700 |
| }, |
| { |
| "epoch": 4.434720908230842, |
| "grad_norm": 17.381057739257812, |
| "learning_rate": 2.8277452545680327e-06, |
| "loss": 1.2796, |
| "step": 18750 |
| }, |
| { |
| "epoch": 4.446546830652791, |
| "grad_norm": 17.901290893554688, |
| "learning_rate": 2.7686121459405123e-06, |
| "loss": 1.1617, |
| "step": 18800 |
| }, |
| { |
| "epoch": 4.45837275307474, |
| "grad_norm": 14.669180870056152, |
| "learning_rate": 2.709479037312992e-06, |
| "loss": 1.238, |
| "step": 18850 |
| }, |
| { |
| "epoch": 4.470198675496689, |
| "grad_norm": 19.362512588500977, |
| "learning_rate": 2.650345928685471e-06, |
| "loss": 1.1765, |
| "step": 18900 |
| }, |
| { |
| "epoch": 4.482024597918637, |
| "grad_norm": 16.083276748657227, |
| "learning_rate": 2.591212820057951e-06, |
| "loss": 1.2626, |
| "step": 18950 |
| }, |
| { |
| "epoch": 4.493850520340587, |
| "grad_norm": 12.657955169677734, |
| "learning_rate": 2.53207971143043e-06, |
| "loss": 1.2582, |
| "step": 19000 |
| }, |
| { |
| "epoch": 4.505676442762535, |
| "grad_norm": 19.395004272460938, |
| "learning_rate": 2.4729466028029097e-06, |
| "loss": 1.2478, |
| "step": 19050 |
| }, |
| { |
| "epoch": 4.517502365184484, |
| "grad_norm": 19.803897857666016, |
| "learning_rate": 2.413813494175389e-06, |
| "loss": 1.1628, |
| "step": 19100 |
| }, |
| { |
| "epoch": 4.529328287606433, |
| "grad_norm": 18.098979949951172, |
| "learning_rate": 2.3546803855478685e-06, |
| "loss": 1.251, |
| "step": 19150 |
| }, |
| { |
| "epoch": 4.541154210028382, |
| "grad_norm": 20.26512908935547, |
| "learning_rate": 2.295547276920348e-06, |
| "loss": 1.208, |
| "step": 19200 |
| }, |
| { |
| "epoch": 4.552980132450331, |
| "grad_norm": 11.94166088104248, |
| "learning_rate": 2.2364141682928274e-06, |
| "loss": 1.2535, |
| "step": 19250 |
| }, |
| { |
| "epoch": 4.56480605487228, |
| "grad_norm": 15.473821640014648, |
| "learning_rate": 2.177281059665307e-06, |
| "loss": 1.1903, |
| "step": 19300 |
| }, |
| { |
| "epoch": 4.5766319772942285, |
| "grad_norm": 14.091665267944336, |
| "learning_rate": 2.1181479510377862e-06, |
| "loss": 1.1725, |
| "step": 19350 |
| }, |
| { |
| "epoch": 4.588457899716178, |
| "grad_norm": 15.09231948852539, |
| "learning_rate": 2.059014842410266e-06, |
| "loss": 1.2023, |
| "step": 19400 |
| }, |
| { |
| "epoch": 4.600283822138127, |
| "grad_norm": 19.047542572021484, |
| "learning_rate": 1.999881733782745e-06, |
| "loss": 1.1607, |
| "step": 19450 |
| }, |
| { |
| "epoch": 4.612109744560076, |
| "grad_norm": 15.40837574005127, |
| "learning_rate": 1.9407486251552247e-06, |
| "loss": 1.2483, |
| "step": 19500 |
| }, |
| { |
| "epoch": 4.623935666982025, |
| "grad_norm": 16.487464904785156, |
| "learning_rate": 1.881615516527704e-06, |
| "loss": 1.1202, |
| "step": 19550 |
| }, |
| { |
| "epoch": 4.635761589403973, |
| "grad_norm": 18.49724006652832, |
| "learning_rate": 1.8224824079001836e-06, |
| "loss": 1.2428, |
| "step": 19600 |
| }, |
| { |
| "epoch": 4.647587511825923, |
| "grad_norm": 13.098505973815918, |
| "learning_rate": 1.7633492992726628e-06, |
| "loss": 1.2413, |
| "step": 19650 |
| }, |
| { |
| "epoch": 4.659413434247871, |
| "grad_norm": 14.599630355834961, |
| "learning_rate": 1.7042161906451424e-06, |
| "loss": 1.1916, |
| "step": 19700 |
| }, |
| { |
| "epoch": 4.671239356669821, |
| "grad_norm": 13.156811714172363, |
| "learning_rate": 1.6450830820176216e-06, |
| "loss": 1.1738, |
| "step": 19750 |
| }, |
| { |
| "epoch": 4.683065279091769, |
| "grad_norm": 12.79720687866211, |
| "learning_rate": 1.5859499733901013e-06, |
| "loss": 1.1718, |
| "step": 19800 |
| }, |
| { |
| "epoch": 4.694891201513718, |
| "grad_norm": 20.443012237548828, |
| "learning_rate": 1.5268168647625805e-06, |
| "loss": 1.2093, |
| "step": 19850 |
| }, |
| { |
| "epoch": 4.706717123935667, |
| "grad_norm": 14.799368858337402, |
| "learning_rate": 1.4676837561350601e-06, |
| "loss": 1.1457, |
| "step": 19900 |
| }, |
| { |
| "epoch": 4.718543046357616, |
| "grad_norm": 12.656880378723145, |
| "learning_rate": 1.4085506475075394e-06, |
| "loss": 1.1704, |
| "step": 19950 |
| }, |
| { |
| "epoch": 4.7303689687795645, |
| "grad_norm": 17.24571418762207, |
| "learning_rate": 1.349417538880019e-06, |
| "loss": 1.1767, |
| "step": 20000 |
| }, |
| { |
| "epoch": 4.7303689687795645, |
| "eval_runtime": 47.055, |
| "eval_samples_per_second": 0.0, |
| "eval_steps_per_second": 0.0, |
| "eval_validation_loss": 3.9691287893885456, |
| "step": 20000 |
| }, |
| { |
| "epoch": 4.742194891201514, |
| "grad_norm": 15.378254890441895, |
| "learning_rate": 1.2902844302524986e-06, |
| "loss": 1.1956, |
| "step": 20050 |
| }, |
| { |
| "epoch": 4.7540208136234625, |
| "grad_norm": 17.969493865966797, |
| "learning_rate": 1.2311513216249778e-06, |
| "loss": 1.1815, |
| "step": 20100 |
| }, |
| { |
| "epoch": 4.765846736045411, |
| "grad_norm": 18.55719757080078, |
| "learning_rate": 1.1720182129974573e-06, |
| "loss": 1.1376, |
| "step": 20150 |
| }, |
| { |
| "epoch": 4.7776726584673606, |
| "grad_norm": 17.548654556274414, |
| "learning_rate": 1.1128851043699367e-06, |
| "loss": 1.1839, |
| "step": 20200 |
| }, |
| { |
| "epoch": 4.789498580889309, |
| "grad_norm": 21.243549346923828, |
| "learning_rate": 1.0537519957424163e-06, |
| "loss": 1.1722, |
| "step": 20250 |
| }, |
| { |
| "epoch": 4.801324503311259, |
| "grad_norm": 12.436286926269531, |
| "learning_rate": 9.946188871148958e-07, |
| "loss": 1.2525, |
| "step": 20300 |
| }, |
| { |
| "epoch": 4.813150425733207, |
| "grad_norm": 12.564568519592285, |
| "learning_rate": 9.354857784873751e-07, |
| "loss": 1.1445, |
| "step": 20350 |
| }, |
| { |
| "epoch": 4.824976348155156, |
| "grad_norm": 19.906652450561523, |
| "learning_rate": 8.763526698598545e-07, |
| "loss": 1.1819, |
| "step": 20400 |
| }, |
| { |
| "epoch": 4.836802270577105, |
| "grad_norm": 17.435781478881836, |
| "learning_rate": 8.17219561232334e-07, |
| "loss": 1.213, |
| "step": 20450 |
| }, |
| { |
| "epoch": 4.848628192999054, |
| "grad_norm": 14.816115379333496, |
| "learning_rate": 7.580864526048135e-07, |
| "loss": 1.1366, |
| "step": 20500 |
| }, |
| { |
| "epoch": 4.860454115421003, |
| "grad_norm": 14.993414878845215, |
| "learning_rate": 6.989533439772929e-07, |
| "loss": 1.2219, |
| "step": 20550 |
| }, |
| { |
| "epoch": 4.872280037842952, |
| "grad_norm": 17.53949737548828, |
| "learning_rate": 6.398202353497723e-07, |
| "loss": 1.1501, |
| "step": 20600 |
| }, |
| { |
| "epoch": 4.8841059602649, |
| "grad_norm": 14.761266708374023, |
| "learning_rate": 5.806871267222519e-07, |
| "loss": 1.1949, |
| "step": 20650 |
| }, |
| { |
| "epoch": 4.89593188268685, |
| "grad_norm": 15.1113920211792, |
| "learning_rate": 5.215540180947313e-07, |
| "loss": 1.1757, |
| "step": 20700 |
| }, |
| { |
| "epoch": 4.907757805108798, |
| "grad_norm": 17.890682220458984, |
| "learning_rate": 4.624209094672107e-07, |
| "loss": 1.1029, |
| "step": 20750 |
| }, |
| { |
| "epoch": 4.919583727530747, |
| "grad_norm": 16.85039710998535, |
| "learning_rate": 4.032878008396902e-07, |
| "loss": 1.1426, |
| "step": 20800 |
| }, |
| { |
| "epoch": 4.9314096499526965, |
| "grad_norm": 22.656349182128906, |
| "learning_rate": 3.441546922121696e-07, |
| "loss": 1.2021, |
| "step": 20850 |
| }, |
| { |
| "epoch": 4.943235572374645, |
| "grad_norm": 19.370864868164062, |
| "learning_rate": 2.8502158358464905e-07, |
| "loss": 1.1034, |
| "step": 20900 |
| }, |
| { |
| "epoch": 4.955061494796594, |
| "grad_norm": 13.943963050842285, |
| "learning_rate": 2.258884749571285e-07, |
| "loss": 1.2271, |
| "step": 20950 |
| }, |
| { |
| "epoch": 4.966887417218543, |
| "grad_norm": 14.200597763061523, |
| "learning_rate": 1.6675536632960799e-07, |
| "loss": 1.2032, |
| "step": 21000 |
| }, |
| { |
| "epoch": 4.978713339640492, |
| "grad_norm": 12.472103118896484, |
| "learning_rate": 1.076222577020874e-07, |
| "loss": 1.2036, |
| "step": 21050 |
| }, |
| { |
| "epoch": 4.990539262062441, |
| "grad_norm": 15.718477249145508, |
| "learning_rate": 4.848914907456685e-08, |
| "loss": 1.1324, |
| "step": 21100 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 21140, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 2000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 64, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|