{
  "best_global_step": 2000,
  "best_metric": 0.7323685598172008,
  "best_model_checkpoint": "./SALAMA_NEWMEDTT/checkpoint-2000",
  "epoch": 0.7664670658682635,
  "eval_steps": 2000,
  "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0038323353293413173,
      "grad_norm": 0.13999255001544952,
      "learning_rate": 1.8e-07,
      "loss": 0.0033,
      "step": 10
    },
    {
      "epoch": 0.007664670658682635,
      "grad_norm": 0.06949484348297119,
      "learning_rate": 3.8e-07,
      "loss": 0.0033,
      "step": 20
    },
    {
      "epoch": 0.011497005988023952,
      "grad_norm": 0.10336039215326309,
      "learning_rate": 5.800000000000001e-07,
      "loss": 0.0034,
      "step": 30
    },
    {
      "epoch": 0.01532934131736527,
      "grad_norm": 2.217076063156128,
      "learning_rate": 7.8e-07,
      "loss": 0.003,
      "step": 40
    },
    {
      "epoch": 0.019161676646706587,
      "grad_norm": 1.3674581050872803,
      "learning_rate": 9.800000000000001e-07,
      "loss": 0.0042,
      "step": 50
    },
    {
      "epoch": 0.022994011976047904,
      "grad_norm": 0.8775060176849365,
      "learning_rate": 1.1800000000000001e-06,
      "loss": 0.0014,
      "step": 60
    },
    {
      "epoch": 0.02682634730538922,
      "grad_norm": 0.21147924661636353,
      "learning_rate": 1.3800000000000001e-06,
      "loss": 0.0023,
      "step": 70
    },
    {
      "epoch": 0.03065868263473054,
      "grad_norm": 0.285540908575058,
      "learning_rate": 1.5800000000000001e-06,
      "loss": 0.0038,
      "step": 80
    },
    {
      "epoch": 0.034491017964071856,
      "grad_norm": 0.5599580407142639,
      "learning_rate": 1.7800000000000001e-06,
      "loss": 0.0036,
      "step": 90
    },
    {
      "epoch": 0.03832335329341317,
      "grad_norm": 0.7187970280647278,
      "learning_rate": 1.98e-06,
      "loss": 0.0054,
      "step": 100
    },
    {
      "epoch": 0.04215568862275449,
      "grad_norm": 1.0117833614349365,
      "learning_rate": 2.1800000000000003e-06,
      "loss": 0.0035,
      "step": 110
    },
    {
      "epoch": 0.04598802395209581,
      "grad_norm": 0.15052905678749084,
      "learning_rate": 2.38e-06,
      "loss": 0.0032,
      "step": 120
    },
    {
      "epoch": 0.049820359281437125,
      "grad_norm": 0.3713392913341522,
      "learning_rate": 2.5800000000000003e-06,
      "loss": 0.0025,
      "step": 130
    },
    {
      "epoch": 0.05365269461077844,
      "grad_norm": 0.11066653579473495,
      "learning_rate": 2.7800000000000005e-06,
      "loss": 0.0017,
      "step": 140
    },
    {
      "epoch": 0.05748502994011976,
      "grad_norm": 0.7113040089607239,
      "learning_rate": 2.9800000000000003e-06,
      "loss": 0.0035,
      "step": 150
    },
    {
      "epoch": 0.06131736526946108,
      "grad_norm": 0.19436658918857574,
      "learning_rate": 3.1800000000000005e-06,
      "loss": 0.0029,
      "step": 160
    },
    {
      "epoch": 0.0651497005988024,
      "grad_norm": 0.10705593228340149,
      "learning_rate": 3.3800000000000007e-06,
      "loss": 0.0015,
      "step": 170
    },
    {
      "epoch": 0.06898203592814371,
      "grad_norm": 0.28974607586860657,
      "learning_rate": 3.58e-06,
      "loss": 0.002,
      "step": 180
    },
    {
      "epoch": 0.07281437125748504,
      "grad_norm": 0.6417028903961182,
      "learning_rate": 3.7800000000000002e-06,
      "loss": 0.002,
      "step": 190
    },
    {
      "epoch": 0.07664670658682635,
      "grad_norm": 0.4101906418800354,
      "learning_rate": 3.980000000000001e-06,
      "loss": 0.0023,
      "step": 200
    },
    {
      "epoch": 0.08047904191616767,
      "grad_norm": 0.393304705619812,
      "learning_rate": 4.18e-06,
      "loss": 0.0029,
      "step": 210
    },
    {
      "epoch": 0.08431137724550898,
      "grad_norm": 2.8025639057159424,
      "learning_rate": 4.38e-06,
      "loss": 0.0063,
      "step": 220
    },
    {
      "epoch": 0.0881437125748503,
      "grad_norm": 0.30129295587539673,
      "learning_rate": 4.58e-06,
      "loss": 0.0043,
      "step": 230
    },
    {
      "epoch": 0.09197604790419162,
      "grad_norm": 0.8991917967796326,
      "learning_rate": 4.78e-06,
      "loss": 0.0041,
      "step": 240
    },
    {
      "epoch": 0.09580838323353294,
      "grad_norm": 0.5826700329780579,
      "learning_rate": 4.980000000000001e-06,
      "loss": 0.0052,
      "step": 250
    },
    {
      "epoch": 0.09964071856287425,
      "grad_norm": 0.4652438759803772,
      "learning_rate": 5.18e-06,
      "loss": 0.0032,
      "step": 260
    },
    {
      "epoch": 0.10347305389221557,
      "grad_norm": 0.24232645332813263,
      "learning_rate": 5.380000000000001e-06,
      "loss": 0.003,
      "step": 270
    },
    {
      "epoch": 0.10730538922155688,
      "grad_norm": 0.2619079649448395,
      "learning_rate": 5.580000000000001e-06,
      "loss": 0.0035,
      "step": 280
    },
    {
      "epoch": 0.11113772455089821,
      "grad_norm": 0.36271339654922485,
      "learning_rate": 5.78e-06,
      "loss": 0.004,
      "step": 290
    },
    {
      "epoch": 0.11497005988023952,
      "grad_norm": 1.1893694400787354,
      "learning_rate": 5.98e-06,
      "loss": 0.0075,
      "step": 300
    },
    {
      "epoch": 0.11880239520958084,
      "grad_norm": 1.4707320928573608,
      "learning_rate": 6.18e-06,
      "loss": 0.0065,
      "step": 310
    },
    {
      "epoch": 0.12263473053892215,
      "grad_norm": 0.5402860045433044,
      "learning_rate": 6.380000000000001e-06,
      "loss": 0.0079,
      "step": 320
    },
    {
      "epoch": 0.12646706586826348,
      "grad_norm": 0.8445234894752502,
      "learning_rate": 6.5800000000000005e-06,
      "loss": 0.0038,
      "step": 330
    },
    {
      "epoch": 0.1302994011976048,
      "grad_norm": 1.0490593910217285,
      "learning_rate": 6.780000000000001e-06,
      "loss": 0.0065,
      "step": 340
    },
    {
      "epoch": 0.1341317365269461,
      "grad_norm": 0.7944777011871338,
      "learning_rate": 6.98e-06,
      "loss": 0.0066,
      "step": 350
    },
    {
      "epoch": 0.13796407185628742,
      "grad_norm": 0.7534486055374146,
      "learning_rate": 7.180000000000001e-06,
      "loss": 0.0049,
      "step": 360
    },
    {
      "epoch": 0.14179640718562875,
      "grad_norm": 0.8252223134040833,
      "learning_rate": 7.3800000000000005e-06,
      "loss": 0.0066,
      "step": 370
    },
    {
      "epoch": 0.14562874251497007,
      "grad_norm": 0.8292574286460876,
      "learning_rate": 7.58e-06,
      "loss": 0.0052,
      "step": 380
    },
    {
      "epoch": 0.14946107784431137,
      "grad_norm": 1.1227622032165527,
      "learning_rate": 7.78e-06,
      "loss": 0.0076,
      "step": 390
    },
    {
      "epoch": 0.1532934131736527,
      "grad_norm": 1.2978622913360596,
      "learning_rate": 7.980000000000002e-06,
      "loss": 0.0065,
      "step": 400
    },
    {
      "epoch": 0.15712574850299402,
      "grad_norm": 2.281665325164795,
      "learning_rate": 8.18e-06,
      "loss": 0.0072,
      "step": 410
    },
    {
      "epoch": 0.16095808383233534,
      "grad_norm": 0.9634031653404236,
      "learning_rate": 8.380000000000001e-06,
      "loss": 0.0066,
      "step": 420
    },
    {
      "epoch": 0.16479041916167664,
      "grad_norm": 1.3554670810699463,
      "learning_rate": 8.580000000000001e-06,
      "loss": 0.007,
      "step": 430
    },
    {
      "epoch": 0.16862275449101796,
      "grad_norm": 1.5378248691558838,
      "learning_rate": 8.78e-06,
      "loss": 0.0082,
      "step": 440
    },
    {
      "epoch": 0.1724550898203593,
      "grad_norm": 2.1895182132720947,
      "learning_rate": 8.98e-06,
      "loss": 0.0103,
      "step": 450
    },
    {
      "epoch": 0.1762874251497006,
      "grad_norm": 0.634242057800293,
      "learning_rate": 9.180000000000002e-06,
      "loss": 0.0113,
      "step": 460
    },
    {
      "epoch": 0.1801197604790419,
      "grad_norm": 0.6717728972434998,
      "learning_rate": 9.38e-06,
      "loss": 0.0057,
      "step": 470
    },
    {
      "epoch": 0.18395209580838323,
      "grad_norm": 2.004511833190918,
      "learning_rate": 9.58e-06,
      "loss": 0.0147,
      "step": 480
    },
    {
      "epoch": 0.18778443113772456,
      "grad_norm": 8.510348320007324,
      "learning_rate": 9.780000000000001e-06,
      "loss": 0.012,
      "step": 490
    },
    {
      "epoch": 0.19161676646706588,
      "grad_norm": 1.1750833988189697,
      "learning_rate": 9.980000000000001e-06,
      "loss": 0.0132,
      "step": 500
    },
    {
      "epoch": 0.19544910179640718,
      "grad_norm": 1.6581082344055176,
      "learning_rate": 9.980932203389831e-06,
      "loss": 0.0213,
      "step": 510
    },
    {
      "epoch": 0.1992814371257485,
      "grad_norm": 1.5398513078689575,
      "learning_rate": 9.959745762711866e-06,
      "loss": 0.0136,
      "step": 520
    },
    {
      "epoch": 0.20311377245508982,
      "grad_norm": 0.879449725151062,
      "learning_rate": 9.9385593220339e-06,
      "loss": 0.0119,
      "step": 530
    },
    {
      "epoch": 0.20694610778443115,
      "grad_norm": 2.3282341957092285,
      "learning_rate": 9.917372881355933e-06,
      "loss": 0.0143,
      "step": 540
    },
    {
      "epoch": 0.21077844311377245,
      "grad_norm": 2.3774726390838623,
      "learning_rate": 9.896186440677968e-06,
      "loss": 0.0159,
      "step": 550
    },
    {
      "epoch": 0.21461077844311377,
      "grad_norm": 1.5949875116348267,
      "learning_rate": 9.875000000000001e-06,
      "loss": 0.0101,
      "step": 560
    },
    {
      "epoch": 0.2184431137724551,
      "grad_norm": 1.4217886924743652,
      "learning_rate": 9.853813559322034e-06,
      "loss": 0.0137,
      "step": 570
    },
    {
      "epoch": 0.22227544910179642,
      "grad_norm": 1.0425392389297485,
      "learning_rate": 9.832627118644068e-06,
      "loss": 0.012,
      "step": 580
    },
    {
      "epoch": 0.22610778443113771,
      "grad_norm": 1.374623417854309,
      "learning_rate": 9.811440677966103e-06,
      "loss": 0.0128,
      "step": 590
    },
    {
      "epoch": 0.22994011976047904,
      "grad_norm": 2.8496975898742676,
      "learning_rate": 9.790254237288136e-06,
      "loss": 0.0122,
      "step": 600
    },
    {
      "epoch": 0.23377245508982036,
      "grad_norm": 1.7832646369934082,
      "learning_rate": 9.76906779661017e-06,
      "loss": 0.016,
      "step": 610
    },
    {
      "epoch": 0.2376047904191617,
      "grad_norm": 1.8113480806350708,
      "learning_rate": 9.747881355932204e-06,
      "loss": 0.016,
      "step": 620
    },
    {
      "epoch": 0.24143712574850298,
      "grad_norm": 1.0705426931381226,
      "learning_rate": 9.726694915254238e-06,
      "loss": 0.0149,
      "step": 630
    },
    {
      "epoch": 0.2452694610778443,
      "grad_norm": 1.5321799516677856,
      "learning_rate": 9.705508474576271e-06,
      "loss": 0.0158,
      "step": 640
    },
    {
      "epoch": 0.24910179640718563,
      "grad_norm": 2.2437734603881836,
      "learning_rate": 9.684322033898306e-06,
      "loss": 0.0176,
      "step": 650
    },
    {
      "epoch": 0.25293413173652696,
      "grad_norm": 0.9095500707626343,
      "learning_rate": 9.66313559322034e-06,
      "loss": 0.0153,
      "step": 660
    },
    {
      "epoch": 0.25676646706586825,
      "grad_norm": 1.800218105316162,
      "learning_rate": 9.641949152542374e-06,
      "loss": 0.0147,
      "step": 670
    },
    {
      "epoch": 0.2605988023952096,
      "grad_norm": 1.5202052593231201,
      "learning_rate": 9.620762711864408e-06,
      "loss": 0.0131,
      "step": 680
    },
    {
      "epoch": 0.2644311377245509,
      "grad_norm": 2.1265900135040283,
      "learning_rate": 9.59957627118644e-06,
      "loss": 0.0131,
      "step": 690
    },
    {
      "epoch": 0.2682634730538922,
      "grad_norm": 1.3656080961227417,
      "learning_rate": 9.578389830508476e-06,
      "loss": 0.0159,
      "step": 700
    },
    {
      "epoch": 0.27209580838323355,
      "grad_norm": 1.1013087034225464,
      "learning_rate": 9.557203389830509e-06,
      "loss": 0.0144,
      "step": 710
    },
    {
      "epoch": 0.27592814371257485,
      "grad_norm": 1.7320525646209717,
      "learning_rate": 9.536016949152544e-06,
      "loss": 0.013,
      "step": 720
    },
    {
      "epoch": 0.27976047904191614,
      "grad_norm": 0.7219749093055725,
      "learning_rate": 9.514830508474577e-06,
      "loss": 0.0134,
      "step": 730
    },
    {
      "epoch": 0.2835928143712575,
      "grad_norm": 1.914589285850525,
      "learning_rate": 9.49364406779661e-06,
      "loss": 0.0178,
      "step": 740
    },
    {
      "epoch": 0.2874251497005988,
      "grad_norm": 2.229616641998291,
      "learning_rate": 9.472457627118646e-06,
      "loss": 0.0163,
      "step": 750
    },
    {
      "epoch": 0.29125748502994014,
      "grad_norm": 0.7531014680862427,
      "learning_rate": 9.451271186440679e-06,
      "loss": 0.0097,
      "step": 760
    },
    {
      "epoch": 0.29508982035928144,
      "grad_norm": 1.1441811323165894,
      "learning_rate": 9.430084745762714e-06,
      "loss": 0.0121,
      "step": 770
    },
    {
      "epoch": 0.29892215568862274,
      "grad_norm": 1.730209469795227,
      "learning_rate": 9.408898305084746e-06,
      "loss": 0.0176,
      "step": 780
    },
    {
      "epoch": 0.3027544910179641,
      "grad_norm": 2.434473752975464,
      "learning_rate": 9.38771186440678e-06,
      "loss": 0.0136,
      "step": 790
    },
    {
      "epoch": 0.3065868263473054,
      "grad_norm": 1.3024921417236328,
      "learning_rate": 9.366525423728814e-06,
      "loss": 0.013,
      "step": 800
    },
    {
      "epoch": 0.3104191616766467,
      "grad_norm": 1.1504980325698853,
      "learning_rate": 9.345338983050847e-06,
      "loss": 0.0115,
      "step": 810
    },
    {
      "epoch": 0.31425149700598803,
      "grad_norm": 1.0408570766448975,
      "learning_rate": 9.324152542372882e-06,
      "loss": 0.0146,
      "step": 820
    },
    {
      "epoch": 0.31808383233532933,
      "grad_norm": 1.8448822498321533,
      "learning_rate": 9.302966101694915e-06,
      "loss": 0.0137,
      "step": 830
    },
    {
      "epoch": 0.3219161676646707,
      "grad_norm": 1.2359811067581177,
      "learning_rate": 9.28177966101695e-06,
      "loss": 0.0115,
      "step": 840
    },
    {
      "epoch": 0.325748502994012,
      "grad_norm": 1.6629658937454224,
      "learning_rate": 9.260593220338984e-06,
      "loss": 0.0185,
      "step": 850
    },
    {
      "epoch": 0.3295808383233533,
      "grad_norm": 1.3565815687179565,
      "learning_rate": 9.239406779661017e-06,
      "loss": 0.0105,
      "step": 860
    },
    {
      "epoch": 0.3334131736526946,
      "grad_norm": 0.47145602107048035,
      "learning_rate": 9.218220338983052e-06,
      "loss": 0.0124,
      "step": 870
    },
    {
      "epoch": 0.3372455089820359,
      "grad_norm": 1.452635645866394,
      "learning_rate": 9.197033898305085e-06,
      "loss": 0.0143,
      "step": 880
    },
    {
      "epoch": 0.3410778443113772,
      "grad_norm": 1.3402444124221802,
      "learning_rate": 9.17584745762712e-06,
      "loss": 0.0158,
      "step": 890
    },
    {
      "epoch": 0.3449101796407186,
      "grad_norm": 0.9056028127670288,
      "learning_rate": 9.154661016949154e-06,
      "loss": 0.0132,
      "step": 900
    },
    {
      "epoch": 0.34874251497005987,
      "grad_norm": 2.6174232959747314,
      "learning_rate": 9.133474576271187e-06,
      "loss": 0.0104,
      "step": 910
    },
    {
      "epoch": 0.3525748502994012,
      "grad_norm": 1.692396879196167,
      "learning_rate": 9.112288135593222e-06,
      "loss": 0.0139,
      "step": 920
    },
    {
      "epoch": 0.3564071856287425,
      "grad_norm": 1.1301361322402954,
      "learning_rate": 9.091101694915255e-06,
      "loss": 0.0114,
      "step": 930
    },
    {
      "epoch": 0.3602395209580838,
      "grad_norm": 0.9672715663909912,
      "learning_rate": 9.069915254237288e-06,
      "loss": 0.0128,
      "step": 940
    },
    {
      "epoch": 0.36407185628742517,
      "grad_norm": 2.274716377258301,
      "learning_rate": 9.048728813559323e-06,
      "loss": 0.0141,
      "step": 950
    },
    {
      "epoch": 0.36790419161676646,
      "grad_norm": 1.0660594701766968,
      "learning_rate": 9.027542372881357e-06,
      "loss": 0.0113,
      "step": 960
    },
    {
      "epoch": 0.37173652694610776,
      "grad_norm": 1.8312493562698364,
      "learning_rate": 9.006355932203392e-06,
      "loss": 0.0131,
      "step": 970
    },
    {
      "epoch": 0.3755688622754491,
      "grad_norm": 1.8305320739746094,
      "learning_rate": 8.985169491525423e-06,
      "loss": 0.0146,
      "step": 980
    },
    {
      "epoch": 0.3794011976047904,
      "grad_norm": 1.281570553779602,
      "learning_rate": 8.963983050847458e-06,
      "loss": 0.0145,
      "step": 990
    },
    {
      "epoch": 0.38323353293413176,
      "grad_norm": 0.8759114146232605,
      "learning_rate": 8.942796610169492e-06,
      "loss": 0.0118,
      "step": 1000
    },
    {
      "epoch": 0.38706586826347306,
      "grad_norm": 1.1629912853240967,
      "learning_rate": 8.921610169491527e-06,
      "loss": 0.0152,
      "step": 1010
    },
    {
      "epoch": 0.39089820359281435,
      "grad_norm": 1.3534737825393677,
      "learning_rate": 8.90042372881356e-06,
      "loss": 0.0133,
      "step": 1020
    },
    {
      "epoch": 0.3947305389221557,
      "grad_norm": 1.538173794746399,
      "learning_rate": 8.879237288135593e-06,
      "loss": 0.0153,
      "step": 1030
    },
    {
      "epoch": 0.398562874251497,
      "grad_norm": 0.8408123254776001,
      "learning_rate": 8.858050847457628e-06,
      "loss": 0.0117,
      "step": 1040
    },
    {
      "epoch": 0.4023952095808383,
      "grad_norm": 1.6027411222457886,
      "learning_rate": 8.836864406779662e-06,
      "loss": 0.0107,
      "step": 1050
    },
    {
      "epoch": 0.40622754491017965,
      "grad_norm": 1.9977298974990845,
      "learning_rate": 8.815677966101695e-06,
      "loss": 0.0151,
      "step": 1060
    },
    {
      "epoch": 0.41005988023952095,
      "grad_norm": 1.1906282901763916,
      "learning_rate": 8.79449152542373e-06,
      "loss": 0.0133,
      "step": 1070
    },
    {
      "epoch": 0.4138922155688623,
      "grad_norm": 1.9095267057418823,
      "learning_rate": 8.773305084745763e-06,
      "loss": 0.0141,
      "step": 1080
    },
    {
      "epoch": 0.4177245508982036,
      "grad_norm": 1.7171216011047363,
      "learning_rate": 8.752118644067798e-06,
      "loss": 0.0122,
      "step": 1090
    },
    {
      "epoch": 0.4215568862275449,
      "grad_norm": 2.373567819595337,
      "learning_rate": 8.730932203389831e-06,
      "loss": 0.0165,
      "step": 1100
    },
    {
      "epoch": 0.42538922155688624,
      "grad_norm": 1.757907509803772,
      "learning_rate": 8.709745762711865e-06,
      "loss": 0.0167,
      "step": 1110
    },
    {
      "epoch": 0.42922155688622754,
      "grad_norm": 3.8954083919525146,
      "learning_rate": 8.6885593220339e-06,
      "loss": 0.0205,
      "step": 1120
    },
    {
      "epoch": 0.43305389221556884,
      "grad_norm": 1.8802250623703003,
      "learning_rate": 8.667372881355933e-06,
      "loss": 0.0106,
      "step": 1130
    },
    {
      "epoch": 0.4368862275449102,
      "grad_norm": 1.3269938230514526,
      "learning_rate": 8.646186440677968e-06,
      "loss": 0.0101,
      "step": 1140
    },
    {
      "epoch": 0.4407185628742515,
      "grad_norm": 0.8917752504348755,
      "learning_rate": 8.625000000000001e-06,
      "loss": 0.011,
      "step": 1150
    },
    {
      "epoch": 0.44455089820359284,
      "grad_norm": 1.179168939590454,
      "learning_rate": 8.603813559322035e-06,
      "loss": 0.0132,
      "step": 1160
    },
    {
      "epoch": 0.44838323353293413,
      "grad_norm": 1.326897382736206,
      "learning_rate": 8.582627118644068e-06,
      "loss": 0.0105,
      "step": 1170
    },
    {
      "epoch": 0.45221556886227543,
      "grad_norm": 1.5681304931640625,
      "learning_rate": 8.561440677966101e-06,
      "loss": 0.0107,
      "step": 1180
    },
    {
      "epoch": 0.4560479041916168,
      "grad_norm": 0.7448801398277283,
      "learning_rate": 8.540254237288136e-06,
      "loss": 0.0105,
      "step": 1190
    },
    {
      "epoch": 0.4598802395209581,
      "grad_norm": 1.8620606660842896,
      "learning_rate": 8.51906779661017e-06,
      "loss": 0.0149,
      "step": 1200
    },
    {
      "epoch": 0.4637125748502994,
      "grad_norm": 0.7713643312454224,
      "learning_rate": 8.497881355932204e-06,
      "loss": 0.0103,
      "step": 1210
    },
    {
      "epoch": 0.4675449101796407,
      "grad_norm": 1.0003689527511597,
      "learning_rate": 8.476694915254238e-06,
      "loss": 0.0131,
      "step": 1220
    },
    {
      "epoch": 0.471377245508982,
      "grad_norm": 1.3732171058654785,
      "learning_rate": 8.455508474576271e-06,
      "loss": 0.0147,
      "step": 1230
    },
    {
      "epoch": 0.4752095808383234,
      "grad_norm": 0.6933659911155701,
      "learning_rate": 8.434322033898306e-06,
      "loss": 0.0117,
      "step": 1240
    },
    {
      "epoch": 0.47904191616766467,
      "grad_norm": 1.7515251636505127,
      "learning_rate": 8.41313559322034e-06,
      "loss": 0.0146,
      "step": 1250
    },
    {
      "epoch": 0.48287425149700597,
      "grad_norm": 1.186540126800537,
      "learning_rate": 8.391949152542374e-06,
      "loss": 0.0094,
      "step": 1260
    },
    {
      "epoch": 0.4867065868263473,
      "grad_norm": 0.7318383455276489,
      "learning_rate": 8.370762711864408e-06,
      "loss": 0.0107,
      "step": 1270
    },
    {
      "epoch": 0.4905389221556886,
      "grad_norm": 1.4842137098312378,
      "learning_rate": 8.349576271186441e-06,
      "loss": 0.016,
      "step": 1280
    },
    {
      "epoch": 0.4943712574850299,
      "grad_norm": 0.6988366842269897,
      "learning_rate": 8.328389830508476e-06,
      "loss": 0.0114,
      "step": 1290
    },
    {
      "epoch": 0.49820359281437127,
      "grad_norm": 2.0333776473999023,
      "learning_rate": 8.30720338983051e-06,
      "loss": 0.0139,
      "step": 1300
    },
    {
      "epoch": 0.5020359281437126,
      "grad_norm": 1.5975853204727173,
      "learning_rate": 8.286016949152543e-06,
      "loss": 0.0132,
      "step": 1310
    },
    {
      "epoch": 0.5058682634730539,
      "grad_norm": 1.2636547088623047,
      "learning_rate": 8.264830508474577e-06,
      "loss": 0.0131,
      "step": 1320
    },
    {
      "epoch": 0.5097005988023952,
      "grad_norm": 1.1417256593704224,
      "learning_rate": 8.24364406779661e-06,
      "loss": 0.0129,
      "step": 1330
    },
    {
      "epoch": 0.5135329341317365,
      "grad_norm": 1.3969192504882812,
      "learning_rate": 8.222457627118646e-06,
      "loss": 0.0077,
      "step": 1340
    },
    {
      "epoch": 0.5173652694610779,
      "grad_norm": 0.5558528900146484,
      "learning_rate": 8.201271186440679e-06,
      "loss": 0.0122,
      "step": 1350
    },
    {
      "epoch": 0.5211976047904192,
      "grad_norm": 1.0696184635162354,
      "learning_rate": 8.180084745762712e-06,
      "loss": 0.0099,
      "step": 1360
    },
    {
      "epoch": 0.5250299401197605,
      "grad_norm": 1.4563461542129517,
      "learning_rate": 8.158898305084746e-06,
      "loss": 0.0106,
      "step": 1370
    },
    {
      "epoch": 0.5288622754491018,
      "grad_norm": 0.995069146156311,
      "learning_rate": 8.13771186440678e-06,
      "loss": 0.0117,
      "step": 1380
    },
    {
      "epoch": 0.5326946107784432,
      "grad_norm": 1.1932368278503418,
      "learning_rate": 8.116525423728814e-06,
      "loss": 0.0119,
      "step": 1390
    },
    {
      "epoch": 0.5365269461077844,
      "grad_norm": 0.5526638031005859,
      "learning_rate": 8.095338983050847e-06,
      "loss": 0.0164,
      "step": 1400
    },
    {
      "epoch": 0.5403592814371257,
      "grad_norm": 1.2106008529663086,
      "learning_rate": 8.074152542372882e-06,
      "loss": 0.0117,
      "step": 1410
    },
    {
      "epoch": 0.5441916167664671,
      "grad_norm": 1.3328733444213867,
      "learning_rate": 8.052966101694916e-06,
      "loss": 0.0103,
      "step": 1420
    },
    {
      "epoch": 0.5480239520958083,
      "grad_norm": 1.2352383136749268,
      "learning_rate": 8.031779661016949e-06,
      "loss": 0.0132,
      "step": 1430
    },
    {
      "epoch": 0.5518562874251497,
      "grad_norm": 0.8932350277900696,
      "learning_rate": 8.010593220338984e-06,
      "loss": 0.0094,
      "step": 1440
    },
    {
      "epoch": 0.555688622754491,
      "grad_norm": 1.1897926330566406,
      "learning_rate": 7.989406779661017e-06,
      "loss": 0.0083,
      "step": 1450
    },
    {
      "epoch": 0.5595209580838323,
      "grad_norm": 3.584073543548584,
      "learning_rate": 7.968220338983052e-06,
      "loss": 0.0107,
      "step": 1460
    },
    {
      "epoch": 0.5633532934131736,
      "grad_norm": 2.2174160480499268,
      "learning_rate": 7.947033898305085e-06,
      "loss": 0.0143,
      "step": 1470
    },
    {
      "epoch": 0.567185628742515,
      "grad_norm": 1.3436836004257202,
      "learning_rate": 7.925847457627119e-06,
      "loss": 0.016,
      "step": 1480
    },
    {
      "epoch": 0.5710179640718562,
      "grad_norm": 2.2885513305664062,
      "learning_rate": 7.904661016949154e-06,
      "loss": 0.0097,
      "step": 1490
    },
    {
      "epoch": 0.5748502994011976,
      "grad_norm": 1.111118197441101,
      "learning_rate": 7.883474576271187e-06,
      "loss": 0.012,
      "step": 1500
    },
    {
      "epoch": 0.5786826347305389,
      "grad_norm": 1.1496660709381104,
      "learning_rate": 7.862288135593222e-06,
      "loss": 0.0112,
      "step": 1510
    },
    {
      "epoch": 0.5825149700598803,
      "grad_norm": 1.069238305091858,
      "learning_rate": 7.841101694915255e-06,
      "loss": 0.0143,
      "step": 1520
    },
    {
      "epoch": 0.5863473053892215,
      "grad_norm": 0.8200716972351074,
      "learning_rate": 7.819915254237289e-06,
      "loss": 0.0122,
      "step": 1530
    },
    {
      "epoch": 0.5901796407185629,
      "grad_norm": 0.911482036113739,
      "learning_rate": 7.798728813559324e-06,
      "loss": 0.0114,
      "step": 1540
    },
    {
      "epoch": 0.5940119760479042,
      "grad_norm": 0.8424916863441467,
      "learning_rate": 7.777542372881357e-06,
      "loss": 0.0111,
      "step": 1550
    },
    {
      "epoch": 0.5978443113772455,
      "grad_norm": 1.4030300378799438,
      "learning_rate": 7.75635593220339e-06,
      "loss": 0.0126,
      "step": 1560
    },
    {
      "epoch": 0.6016766467065868,
      "grad_norm": 1.4150911569595337,
      "learning_rate": 7.735169491525423e-06,
      "loss": 0.0083,
      "step": 1570
    },
    {
      "epoch": 0.6055089820359282,
      "grad_norm": 1.5504539012908936,
      "learning_rate": 7.713983050847458e-06,
      "loss": 0.0104,
      "step": 1580
    },
    {
      "epoch": 0.6093413173652694,
      "grad_norm": 2.837200403213501,
      "learning_rate": 7.692796610169492e-06,
      "loss": 0.0148,
      "step": 1590
    },
    {
      "epoch": 0.6131736526946108,
      "grad_norm": 1.2483636140823364,
      "learning_rate": 7.671610169491525e-06,
      "loss": 0.0124,
      "step": 1600
    },
    {
      "epoch": 0.6170059880239521,
      "grad_norm": 1.9936306476593018,
      "learning_rate": 7.65042372881356e-06,
      "loss": 0.0111,
      "step": 1610
    },
    {
      "epoch": 0.6208383233532934,
      "grad_norm": 1.3132163286209106,
      "learning_rate": 7.629237288135593e-06,
      "loss": 0.0122,
      "step": 1620
    },
    {
      "epoch": 0.6246706586826347,
      "grad_norm": 1.5041028261184692,
      "learning_rate": 7.6080508474576275e-06,
      "loss": 0.0114,
      "step": 1630
    },
    {
      "epoch": 0.6285029940119761,
      "grad_norm": 1.1258797645568848,
      "learning_rate": 7.586864406779662e-06,
      "loss": 0.0177,
      "step": 1640
    },
    {
      "epoch": 0.6323353293413174,
      "grad_norm": 1.5175997018814087,
      "learning_rate": 7.565677966101696e-06,
      "loss": 0.0112,
      "step": 1650
    },
    {
      "epoch": 0.6361676646706587,
      "grad_norm": 0.8304109573364258,
      "learning_rate": 7.544491525423729e-06,
      "loss": 0.0132,
      "step": 1660
    },
    {
      "epoch": 0.64,
      "grad_norm": 1.6195735931396484,
      "learning_rate": 7.523305084745763e-06,
      "loss": 0.008,
      "step": 1670
    },
    {
      "epoch": 0.6438323353293414,
      "grad_norm": 0.660678505897522,
      "learning_rate": 7.502118644067797e-06,
      "loss": 0.0095,
      "step": 1680
    },
    {
      "epoch": 0.6476646706586826,
      "grad_norm": 1.7854886054992676,
      "learning_rate": 7.4809322033898315e-06,
      "loss": 0.0154,
      "step": 1690
    },
    {
      "epoch": 0.651497005988024,
      "grad_norm": 1.3273577690124512,
      "learning_rate": 7.459745762711866e-06,
      "loss": 0.0094,
      "step": 1700
    },
    {
      "epoch": 0.6553293413173653,
      "grad_norm": 0.7021352648735046,
      "learning_rate": 7.438559322033899e-06,
      "loss": 0.0097,
      "step": 1710
    },
    {
      "epoch": 0.6591616766467066,
      "grad_norm": 1.966849446296692,
      "learning_rate": 7.417372881355933e-06,
      "loss": 0.0104,
      "step": 1720
    },
    {
      "epoch": 0.6629940119760479,
      "grad_norm": 1.0740894079208374,
      "learning_rate": 7.396186440677967e-06,
      "loss": 0.0098,
      "step": 1730
    },
    {
      "epoch": 0.6668263473053893,
      "grad_norm": 1.1839215755462646,
      "learning_rate": 7.375000000000001e-06,
      "loss": 0.0173,
      "step": 1740
    },
    {
      "epoch": 0.6706586826347305,
      "grad_norm": 1.8997067213058472,
      "learning_rate": 7.353813559322035e-06,
      "loss": 0.0104,
      "step": 1750
    },
    {
      "epoch": 0.6744910179640718,
      "grad_norm": 1.172722339630127,
      "learning_rate": 7.332627118644068e-06,
      "loss": 0.0118,
      "step": 1760
    },
    {
      "epoch": 0.6783233532934132,
      "grad_norm": 1.0804802179336548,
      "learning_rate": 7.311440677966102e-06,
      "loss": 0.0125,
      "step": 1770
    },
    {
      "epoch": 0.6821556886227544,
      "grad_norm": 1.2899483442306519,
      "learning_rate": 7.290254237288135e-06,
      "loss": 0.0113,
      "step": 1780
    },
    {
      "epoch": 0.6859880239520958,
      "grad_norm": 1.2753748893737793,
      "learning_rate": 7.2690677966101696e-06,
      "loss": 0.0076,
      "step": 1790
    },
    {
      "epoch": 0.6898203592814371,
      "grad_norm": 1.0572164058685303,
      "learning_rate": 7.247881355932204e-06,
      "loss": 0.0146,
      "step": 1800
    },
    {
      "epoch": 0.6936526946107785,
      "grad_norm": 0.9690531492233276,
      "learning_rate": 7.226694915254238e-06,
      "loss": 0.0106,
      "step": 1810
    },
    {
      "epoch": 0.6974850299401197,
      "grad_norm": 0.37759602069854736,
      "learning_rate": 7.205508474576271e-06,
      "loss": 0.0087,
      "step": 1820
    },
    {
      "epoch": 0.7013173652694611,
      "grad_norm": 1.595554232597351,
      "learning_rate": 7.184322033898305e-06,
      "loss": 0.0121,
      "step": 1830
    },
    {
      "epoch": 0.7051497005988024,
      "grad_norm": 5.621078014373779,
      "learning_rate": 7.1631355932203394e-06,
      "loss": 0.0097,
      "step": 1840
    },
    {
      "epoch": 0.7089820359281437,
      "grad_norm": 0.313541978597641,
      "learning_rate": 7.141949152542374e-06,
      "loss": 0.0105,
      "step": 1850
    },
    {
      "epoch": 0.712814371257485,
      "grad_norm": 0.7415321469306946,
      "learning_rate": 7.120762711864408e-06,
      "loss": 0.013,
      "step": 1860
    },
    {
      "epoch": 0.7166467065868264,
      "grad_norm": 0.7116707563400269,
      "learning_rate": 7.099576271186441e-06,
      "loss": 0.0089,
      "step": 1870
    },
    {
      "epoch": 0.7204790419161676,
      "grad_norm": 2.017526388168335,
      "learning_rate": 7.078389830508475e-06,
      "loss": 0.0123,
      "step": 1880
    },
    {
      "epoch": 0.724311377245509,
      "grad_norm": 1.4061137437820435,
      "learning_rate": 7.057203389830509e-06,
      "loss": 0.0077,
      "step": 1890
    },
    {
      "epoch": 0.7281437125748503,
      "grad_norm": 1.1427956819534302,
      "learning_rate": 7.0360169491525435e-06,
      "loss": 0.0074,
      "step": 1900
    },
    {
      "epoch": 0.7319760479041916,
      "grad_norm": 4.021897792816162,
      "learning_rate": 7.014830508474577e-06,
      "loss": 0.0109,
      "step": 1910
    },
    {
      "epoch": 0.7358083832335329,
      "grad_norm": 0.36056721210479736,
      "learning_rate": 6.993644067796611e-06,
      "loss": 0.0102,
      "step": 1920
    },
    {
      "epoch": 0.7396407185628743,
      "grad_norm": 0.8428685665130615,
      "learning_rate": 6.972457627118645e-06,
      "loss": 0.0084,
      "step": 1930
    },
    {
      "epoch": 0.7434730538922155,
      "grad_norm": 0.6571751236915588,
      "learning_rate": 6.951271186440679e-06,
      "loss": 0.0075,
      "step": 1940
    },
    {
      "epoch": 0.7473053892215569,
      "grad_norm": 1.2280906438827515,
      "learning_rate": 6.930084745762713e-06,
      "loss": 0.0141,
      "step": 1950
    },
    {
      "epoch": 0.7511377245508982,
      "grad_norm": 0.9602510333061218,
      "learning_rate": 6.908898305084746e-06,
      "loss": 0.0092,
      "step": 1960
    },
    {
      "epoch": 0.7549700598802396,
      "grad_norm": 0.7274242043495178,
      "learning_rate": 6.88771186440678e-06,
      "loss": 0.0061,
      "step": 1970
    },
    {
      "epoch": 0.7588023952095808,
      "grad_norm": 1.538460373878479,
      "learning_rate": 6.866525423728814e-06,
      "loss": 0.0106,
      "step": 1980
    },
    {
      "epoch": 0.7626347305389222,
      "grad_norm": 0.8836348056793213,
      "learning_rate": 6.845338983050847e-06,
      "loss": 0.0119,
      "step": 1990
    },
    {
      "epoch": 0.7664670658682635,
      "grad_norm": 1.2621474266052246,
      "learning_rate": 6.8241525423728815e-06,
      "loss": 0.0118,
      "step": 2000
    },
    {
      "epoch": 0.7664670658682635,
      "eval_loss": 0.006921224296092987,
      "eval_runtime": 15262.8627,
      "eval_samples_per_second": 1.368,
      "eval_steps_per_second": 0.171,
      "eval_wer": 0.7323685598172008,
      "step": 2000
    }
  ],
  "logging_steps": 10,
  "max_steps": 5220,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 6.531871408128e+19,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}