{ "best_global_step": 2000, "best_metric": 0.7323685598172008, "best_model_checkpoint": "./SALAMA_NEWMEDTT/checkpoint-2000", "epoch": 0.7664670658682635, "eval_steps": 2000, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0038323353293413173, "grad_norm": 0.13999255001544952, "learning_rate": 1.8e-07, "loss": 0.0033, "step": 10 }, { "epoch": 0.007664670658682635, "grad_norm": 0.06949484348297119, "learning_rate": 3.8e-07, "loss": 0.0033, "step": 20 }, { "epoch": 0.011497005988023952, "grad_norm": 0.10336039215326309, "learning_rate": 5.800000000000001e-07, "loss": 0.0034, "step": 30 }, { "epoch": 0.01532934131736527, "grad_norm": 2.217076063156128, "learning_rate": 7.8e-07, "loss": 0.003, "step": 40 }, { "epoch": 0.019161676646706587, "grad_norm": 1.3674581050872803, "learning_rate": 9.800000000000001e-07, "loss": 0.0042, "step": 50 }, { "epoch": 0.022994011976047904, "grad_norm": 0.8775060176849365, "learning_rate": 1.1800000000000001e-06, "loss": 0.0014, "step": 60 }, { "epoch": 0.02682634730538922, "grad_norm": 0.21147924661636353, "learning_rate": 1.3800000000000001e-06, "loss": 0.0023, "step": 70 }, { "epoch": 0.03065868263473054, "grad_norm": 0.285540908575058, "learning_rate": 1.5800000000000001e-06, "loss": 0.0038, "step": 80 }, { "epoch": 0.034491017964071856, "grad_norm": 0.5599580407142639, "learning_rate": 1.7800000000000001e-06, "loss": 0.0036, "step": 90 }, { "epoch": 0.03832335329341317, "grad_norm": 0.7187970280647278, "learning_rate": 1.98e-06, "loss": 0.0054, "step": 100 }, { "epoch": 0.04215568862275449, "grad_norm": 1.0117833614349365, "learning_rate": 2.1800000000000003e-06, "loss": 0.0035, "step": 110 }, { "epoch": 0.04598802395209581, "grad_norm": 0.15052905678749084, "learning_rate": 2.38e-06, "loss": 0.0032, "step": 120 }, { "epoch": 0.049820359281437125, "grad_norm": 0.3713392913341522, "learning_rate": 2.5800000000000003e-06, "loss": 0.0025, "step": 130 }, { "epoch": 0.05365269461077844, "grad_norm": 0.11066653579473495, "learning_rate": 2.7800000000000005e-06, "loss": 0.0017, "step": 140 }, { "epoch": 0.05748502994011976, "grad_norm": 0.7113040089607239, "learning_rate": 2.9800000000000003e-06, "loss": 0.0035, "step": 150 }, { "epoch": 0.06131736526946108, "grad_norm": 0.19436658918857574, "learning_rate": 3.1800000000000005e-06, "loss": 0.0029, "step": 160 }, { "epoch": 0.0651497005988024, "grad_norm": 0.10705593228340149, "learning_rate": 3.3800000000000007e-06, "loss": 0.0015, "step": 170 }, { "epoch": 0.06898203592814371, "grad_norm": 0.28974607586860657, "learning_rate": 3.58e-06, "loss": 0.002, "step": 180 }, { "epoch": 0.07281437125748504, "grad_norm": 0.6417028903961182, "learning_rate": 3.7800000000000002e-06, "loss": 0.002, "step": 190 }, { "epoch": 0.07664670658682635, "grad_norm": 0.4101906418800354, "learning_rate": 3.980000000000001e-06, "loss": 0.0023, "step": 200 }, { "epoch": 0.08047904191616767, "grad_norm": 0.393304705619812, "learning_rate": 4.18e-06, "loss": 0.0029, "step": 210 }, { "epoch": 0.08431137724550898, "grad_norm": 2.8025639057159424, "learning_rate": 4.38e-06, "loss": 0.0063, "step": 220 }, { "epoch": 0.0881437125748503, "grad_norm": 0.30129295587539673, "learning_rate": 4.58e-06, "loss": 0.0043, "step": 230 }, { "epoch": 0.09197604790419162, "grad_norm": 0.8991917967796326, "learning_rate": 4.78e-06, "loss": 0.0041, "step": 240 }, { "epoch": 0.09580838323353294, "grad_norm": 0.5826700329780579, "learning_rate": 4.980000000000001e-06, "loss": 0.0052, "step": 250 }, { "epoch": 0.09964071856287425, "grad_norm": 0.4652438759803772, "learning_rate": 5.18e-06, "loss": 0.0032, "step": 260 }, { "epoch": 0.10347305389221557, "grad_norm": 0.24232645332813263, "learning_rate": 5.380000000000001e-06, "loss": 0.003, "step": 270 }, { "epoch": 0.10730538922155688, "grad_norm": 0.2619079649448395, "learning_rate": 5.580000000000001e-06, "loss": 0.0035, "step": 280 }, { "epoch": 0.11113772455089821, "grad_norm": 0.36271339654922485, "learning_rate": 5.78e-06, "loss": 0.004, "step": 290 }, { "epoch": 0.11497005988023952, "grad_norm": 1.1893694400787354, "learning_rate": 5.98e-06, "loss": 0.0075, "step": 300 }, { "epoch": 0.11880239520958084, "grad_norm": 1.4707320928573608, "learning_rate": 6.18e-06, "loss": 0.0065, "step": 310 }, { "epoch": 0.12263473053892215, "grad_norm": 0.5402860045433044, "learning_rate": 6.380000000000001e-06, "loss": 0.0079, "step": 320 }, { "epoch": 0.12646706586826348, "grad_norm": 0.8445234894752502, "learning_rate": 6.5800000000000005e-06, "loss": 0.0038, "step": 330 }, { "epoch": 0.1302994011976048, "grad_norm": 1.0490593910217285, "learning_rate": 6.780000000000001e-06, "loss": 0.0065, "step": 340 }, { "epoch": 0.1341317365269461, "grad_norm": 0.7944777011871338, "learning_rate": 6.98e-06, "loss": 0.0066, "step": 350 }, { "epoch": 0.13796407185628742, "grad_norm": 0.7534486055374146, "learning_rate": 7.180000000000001e-06, "loss": 0.0049, "step": 360 }, { "epoch": 0.14179640718562875, "grad_norm": 0.8252223134040833, "learning_rate": 7.3800000000000005e-06, "loss": 0.0066, "step": 370 }, { "epoch": 0.14562874251497007, "grad_norm": 0.8292574286460876, "learning_rate": 7.58e-06, "loss": 0.0052, "step": 380 }, { "epoch": 0.14946107784431137, "grad_norm": 1.1227622032165527, "learning_rate": 7.78e-06, "loss": 0.0076, "step": 390 }, { "epoch": 0.1532934131736527, "grad_norm": 1.2978622913360596, "learning_rate": 7.980000000000002e-06, "loss": 0.0065, "step": 400 }, { "epoch": 0.15712574850299402, "grad_norm": 2.281665325164795, "learning_rate": 8.18e-06, "loss": 0.0072, "step": 410 }, { "epoch": 0.16095808383233534, "grad_norm": 0.9634031653404236, "learning_rate": 8.380000000000001e-06, "loss": 0.0066, "step": 420 }, { "epoch": 0.16479041916167664, "grad_norm": 1.3554670810699463, "learning_rate": 8.580000000000001e-06, "loss": 0.007, "step": 430 }, { "epoch": 0.16862275449101796, "grad_norm": 1.5378248691558838, "learning_rate": 8.78e-06, "loss": 0.0082, "step": 440 }, { "epoch": 0.1724550898203593, "grad_norm": 2.1895182132720947, "learning_rate": 8.98e-06, "loss": 0.0103, "step": 450 }, { "epoch": 0.1762874251497006, "grad_norm": 0.634242057800293, "learning_rate": 9.180000000000002e-06, "loss": 0.0113, "step": 460 }, { "epoch": 0.1801197604790419, "grad_norm": 0.6717728972434998, "learning_rate": 9.38e-06, "loss": 0.0057, "step": 470 }, { "epoch": 0.18395209580838323, "grad_norm": 2.004511833190918, "learning_rate": 9.58e-06, "loss": 0.0147, "step": 480 }, { "epoch": 0.18778443113772456, "grad_norm": 8.510348320007324, "learning_rate": 9.780000000000001e-06, "loss": 0.012, "step": 490 }, { "epoch": 0.19161676646706588, "grad_norm": 1.1750833988189697, "learning_rate": 9.980000000000001e-06, "loss": 0.0132, "step": 500 }, { "epoch": 0.19544910179640718, "grad_norm": 1.6581082344055176, "learning_rate": 9.980932203389831e-06, "loss": 0.0213, "step": 510 }, { "epoch": 0.1992814371257485, "grad_norm": 1.5398513078689575, "learning_rate": 9.959745762711866e-06, "loss": 0.0136, "step": 520 }, { "epoch": 0.20311377245508982, "grad_norm": 0.879449725151062, "learning_rate": 9.9385593220339e-06, "loss": 0.0119, "step": 530 }, { "epoch": 0.20694610778443115, "grad_norm": 2.3282341957092285, "learning_rate": 9.917372881355933e-06, "loss": 0.0143, "step": 540 }, { "epoch": 0.21077844311377245, "grad_norm": 2.3774726390838623, "learning_rate": 9.896186440677968e-06, "loss": 0.0159, "step": 550 }, { "epoch": 0.21461077844311377, "grad_norm": 1.5949875116348267, "learning_rate": 9.875000000000001e-06, "loss": 0.0101, "step": 560 }, { "epoch": 0.2184431137724551, "grad_norm": 1.4217886924743652, "learning_rate": 9.853813559322034e-06, "loss": 0.0137, "step": 570 }, { "epoch": 0.22227544910179642, "grad_norm": 1.0425392389297485, "learning_rate": 9.832627118644068e-06, "loss": 0.012, "step": 580 }, { "epoch": 0.22610778443113771, "grad_norm": 1.374623417854309, "learning_rate": 9.811440677966103e-06, "loss": 0.0128, "step": 590 }, { "epoch": 0.22994011976047904, "grad_norm": 2.8496975898742676, "learning_rate": 9.790254237288136e-06, "loss": 0.0122, "step": 600 }, { "epoch": 0.23377245508982036, "grad_norm": 1.7832646369934082, "learning_rate": 9.76906779661017e-06, "loss": 0.016, "step": 610 }, { "epoch": 0.2376047904191617, "grad_norm": 1.8113480806350708, "learning_rate": 9.747881355932204e-06, "loss": 0.016, "step": 620 }, { "epoch": 0.24143712574850298, "grad_norm": 1.0705426931381226, "learning_rate": 9.726694915254238e-06, "loss": 0.0149, "step": 630 }, { "epoch": 0.2452694610778443, "grad_norm": 1.5321799516677856, "learning_rate": 9.705508474576271e-06, "loss": 0.0158, "step": 640 }, { "epoch": 0.24910179640718563, "grad_norm": 2.2437734603881836, "learning_rate": 9.684322033898306e-06, "loss": 0.0176, "step": 650 }, { "epoch": 0.25293413173652696, "grad_norm": 0.9095500707626343, "learning_rate": 9.66313559322034e-06, "loss": 0.0153, "step": 660 }, { "epoch": 0.25676646706586825, "grad_norm": 1.800218105316162, "learning_rate": 9.641949152542374e-06, "loss": 0.0147, "step": 670 }, { "epoch": 0.2605988023952096, "grad_norm": 1.5202052593231201, "learning_rate": 9.620762711864408e-06, "loss": 0.0131, "step": 680 }, { "epoch": 0.2644311377245509, "grad_norm": 2.1265900135040283, "learning_rate": 9.59957627118644e-06, "loss": 0.0131, "step": 690 }, { "epoch": 0.2682634730538922, "grad_norm": 1.3656080961227417, "learning_rate": 9.578389830508476e-06, "loss": 0.0159, "step": 700 }, { "epoch": 0.27209580838323355, "grad_norm": 1.1013087034225464, "learning_rate": 9.557203389830509e-06, "loss": 0.0144, "step": 710 }, { "epoch": 0.27592814371257485, "grad_norm": 1.7320525646209717, "learning_rate": 9.536016949152544e-06, "loss": 0.013, "step": 720 }, { "epoch": 0.27976047904191614, "grad_norm": 0.7219749093055725, "learning_rate": 9.514830508474577e-06, "loss": 0.0134, "step": 730 }, { "epoch": 0.2835928143712575, "grad_norm": 1.914589285850525, "learning_rate": 9.49364406779661e-06, "loss": 0.0178, "step": 740 }, { "epoch": 0.2874251497005988, "grad_norm": 2.229616641998291, "learning_rate": 9.472457627118646e-06, "loss": 0.0163, "step": 750 }, { "epoch": 0.29125748502994014, "grad_norm": 0.7531014680862427, "learning_rate": 9.451271186440679e-06, "loss": 0.0097, "step": 760 }, { "epoch": 0.29508982035928144, "grad_norm": 1.1441811323165894, "learning_rate": 9.430084745762714e-06, "loss": 0.0121, "step": 770 }, { "epoch": 0.29892215568862274, "grad_norm": 1.730209469795227, "learning_rate": 9.408898305084746e-06, "loss": 0.0176, "step": 780 }, { "epoch": 0.3027544910179641, "grad_norm": 2.434473752975464, "learning_rate": 9.38771186440678e-06, "loss": 0.0136, "step": 790 }, { "epoch": 0.3065868263473054, "grad_norm": 1.3024921417236328, "learning_rate": 9.366525423728814e-06, "loss": 0.013, "step": 800 }, { "epoch": 0.3104191616766467, "grad_norm": 1.1504980325698853, "learning_rate": 9.345338983050847e-06, "loss": 0.0115, "step": 810 }, { "epoch": 0.31425149700598803, "grad_norm": 1.0408570766448975, "learning_rate": 9.324152542372882e-06, "loss": 0.0146, "step": 820 }, { "epoch": 0.31808383233532933, "grad_norm": 1.8448822498321533, "learning_rate": 9.302966101694915e-06, "loss": 0.0137, "step": 830 }, { "epoch": 0.3219161676646707, "grad_norm": 1.2359811067581177, "learning_rate": 9.28177966101695e-06, "loss": 0.0115, "step": 840 }, { "epoch": 0.325748502994012, "grad_norm": 1.6629658937454224, "learning_rate": 9.260593220338984e-06, "loss": 0.0185, "step": 850 }, { "epoch": 0.3295808383233533, "grad_norm": 1.3565815687179565, "learning_rate": 9.239406779661017e-06, "loss": 0.0105, "step": 860 }, { "epoch": 0.3334131736526946, "grad_norm": 0.47145602107048035, "learning_rate": 9.218220338983052e-06, "loss": 0.0124, "step": 870 }, { "epoch": 0.3372455089820359, "grad_norm": 1.452635645866394, "learning_rate": 9.197033898305085e-06, "loss": 0.0143, "step": 880 }, { "epoch": 0.3410778443113772, "grad_norm": 1.3402444124221802, "learning_rate": 9.17584745762712e-06, "loss": 0.0158, "step": 890 }, { "epoch": 0.3449101796407186, "grad_norm": 0.9056028127670288, "learning_rate": 9.154661016949154e-06, "loss": 0.0132, "step": 900 }, { "epoch": 0.34874251497005987, "grad_norm": 2.6174232959747314, "learning_rate": 9.133474576271187e-06, "loss": 0.0104, "step": 910 }, { "epoch": 0.3525748502994012, "grad_norm": 1.692396879196167, "learning_rate": 9.112288135593222e-06, "loss": 0.0139, "step": 920 }, { "epoch": 0.3564071856287425, "grad_norm": 1.1301361322402954, "learning_rate": 9.091101694915255e-06, "loss": 0.0114, "step": 930 }, { "epoch": 0.3602395209580838, "grad_norm": 0.9672715663909912, "learning_rate": 9.069915254237288e-06, "loss": 0.0128, "step": 940 }, { "epoch": 0.36407185628742517, "grad_norm": 2.274716377258301, "learning_rate": 9.048728813559323e-06, "loss": 0.0141, "step": 950 }, { "epoch": 0.36790419161676646, "grad_norm": 1.0660594701766968, "learning_rate": 9.027542372881357e-06, "loss": 0.0113, "step": 960 }, { "epoch": 0.37173652694610776, "grad_norm": 1.8312493562698364, "learning_rate": 9.006355932203392e-06, "loss": 0.0131, "step": 970 }, { "epoch": 0.3755688622754491, "grad_norm": 1.8305320739746094, "learning_rate": 8.985169491525423e-06, "loss": 0.0146, "step": 980 }, { "epoch": 0.3794011976047904, "grad_norm": 1.281570553779602, "learning_rate": 8.963983050847458e-06, "loss": 0.0145, "step": 990 }, { "epoch": 0.38323353293413176, "grad_norm": 0.8759114146232605, "learning_rate": 8.942796610169492e-06, "loss": 0.0118, "step": 1000 }, { "epoch": 0.38706586826347306, "grad_norm": 1.1629912853240967, "learning_rate": 8.921610169491527e-06, "loss": 0.0152, "step": 1010 }, { "epoch": 0.39089820359281435, "grad_norm": 1.3534737825393677, "learning_rate": 8.90042372881356e-06, "loss": 0.0133, "step": 1020 }, { "epoch": 0.3947305389221557, "grad_norm": 1.538173794746399, "learning_rate": 8.879237288135593e-06, "loss": 0.0153, "step": 1030 }, { "epoch": 0.398562874251497, "grad_norm": 0.8408123254776001, "learning_rate": 8.858050847457628e-06, "loss": 0.0117, "step": 1040 }, { "epoch": 0.4023952095808383, "grad_norm": 1.6027411222457886, "learning_rate": 8.836864406779662e-06, "loss": 0.0107, "step": 1050 }, { "epoch": 0.40622754491017965, "grad_norm": 1.9977298974990845, "learning_rate": 8.815677966101695e-06, "loss": 0.0151, "step": 1060 }, { "epoch": 0.41005988023952095, "grad_norm": 1.1906282901763916, "learning_rate": 8.79449152542373e-06, "loss": 0.0133, "step": 1070 }, { "epoch": 0.4138922155688623, "grad_norm": 1.9095267057418823, "learning_rate": 8.773305084745763e-06, "loss": 0.0141, "step": 1080 }, { "epoch": 0.4177245508982036, "grad_norm": 1.7171216011047363, "learning_rate": 8.752118644067798e-06, "loss": 0.0122, "step": 1090 }, { "epoch": 0.4215568862275449, "grad_norm": 2.373567819595337, "learning_rate": 8.730932203389831e-06, "loss": 0.0165, "step": 1100 }, { "epoch": 0.42538922155688624, "grad_norm": 1.757907509803772, "learning_rate": 8.709745762711865e-06, "loss": 0.0167, "step": 1110 }, { "epoch": 0.42922155688622754, "grad_norm": 3.8954083919525146, "learning_rate": 8.6885593220339e-06, "loss": 0.0205, "step": 1120 }, { "epoch": 0.43305389221556884, "grad_norm": 1.8802250623703003, "learning_rate": 8.667372881355933e-06, "loss": 0.0106, "step": 1130 }, { "epoch": 0.4368862275449102, "grad_norm": 1.3269938230514526, "learning_rate": 8.646186440677968e-06, "loss": 0.0101, "step": 1140 }, { "epoch": 0.4407185628742515, "grad_norm": 0.8917752504348755, "learning_rate": 8.625000000000001e-06, "loss": 0.011, "step": 1150 }, { "epoch": 0.44455089820359284, "grad_norm": 1.179168939590454, "learning_rate": 8.603813559322035e-06, "loss": 0.0132, "step": 1160 }, { "epoch": 0.44838323353293413, "grad_norm": 1.326897382736206, "learning_rate": 8.582627118644068e-06, "loss": 0.0105, "step": 1170 }, { "epoch": 0.45221556886227543, "grad_norm": 1.5681304931640625, "learning_rate": 8.561440677966101e-06, "loss": 0.0107, "step": 1180 }, { "epoch": 0.4560479041916168, "grad_norm": 0.7448801398277283, "learning_rate": 8.540254237288136e-06, "loss": 0.0105, "step": 1190 }, { "epoch": 0.4598802395209581, "grad_norm": 1.8620606660842896, "learning_rate": 8.51906779661017e-06, "loss": 0.0149, "step": 1200 }, { "epoch": 0.4637125748502994, "grad_norm": 0.7713643312454224, "learning_rate": 8.497881355932204e-06, "loss": 0.0103, "step": 1210 }, { "epoch": 0.4675449101796407, "grad_norm": 1.0003689527511597, "learning_rate": 8.476694915254238e-06, "loss": 0.0131, "step": 1220 }, { "epoch": 0.471377245508982, "grad_norm": 1.3732171058654785, "learning_rate": 8.455508474576271e-06, "loss": 0.0147, "step": 1230 }, { "epoch": 0.4752095808383234, "grad_norm": 0.6933659911155701, "learning_rate": 8.434322033898306e-06, "loss": 0.0117, "step": 1240 }, { "epoch": 0.47904191616766467, "grad_norm": 1.7515251636505127, "learning_rate": 8.41313559322034e-06, "loss": 0.0146, "step": 1250 }, { "epoch": 0.48287425149700597, "grad_norm": 1.186540126800537, "learning_rate": 8.391949152542374e-06, "loss": 0.0094, "step": 1260 }, { "epoch": 0.4867065868263473, "grad_norm": 0.7318383455276489, "learning_rate": 8.370762711864408e-06, "loss": 0.0107, "step": 1270 }, { "epoch": 0.4905389221556886, "grad_norm": 1.4842137098312378, "learning_rate": 8.349576271186441e-06, "loss": 0.016, "step": 1280 }, { "epoch": 0.4943712574850299, "grad_norm": 0.6988366842269897, "learning_rate": 8.328389830508476e-06, "loss": 0.0114, "step": 1290 }, { "epoch": 0.49820359281437127, "grad_norm": 2.0333776473999023, "learning_rate": 8.30720338983051e-06, "loss": 0.0139, "step": 1300 }, { "epoch": 0.5020359281437126, "grad_norm": 1.5975853204727173, "learning_rate": 8.286016949152543e-06, "loss": 0.0132, "step": 1310 }, { "epoch": 0.5058682634730539, "grad_norm": 1.2636547088623047, "learning_rate": 8.264830508474577e-06, "loss": 0.0131, "step": 1320 }, { "epoch": 0.5097005988023952, "grad_norm": 1.1417256593704224, "learning_rate": 8.24364406779661e-06, "loss": 0.0129, "step": 1330 }, { "epoch": 0.5135329341317365, "grad_norm": 1.3969192504882812, "learning_rate": 8.222457627118646e-06, "loss": 0.0077, "step": 1340 }, { "epoch": 0.5173652694610779, "grad_norm": 0.5558528900146484, "learning_rate": 8.201271186440679e-06, "loss": 0.0122, "step": 1350 }, { "epoch": 0.5211976047904192, "grad_norm": 1.0696184635162354, "learning_rate": 8.180084745762712e-06, "loss": 0.0099, "step": 1360 }, { "epoch": 0.5250299401197605, "grad_norm": 1.4563461542129517, "learning_rate": 8.158898305084746e-06, "loss": 0.0106, "step": 1370 }, { "epoch": 0.5288622754491018, "grad_norm": 0.995069146156311, "learning_rate": 8.13771186440678e-06, "loss": 0.0117, "step": 1380 }, { "epoch": 0.5326946107784432, "grad_norm": 1.1932368278503418, "learning_rate": 8.116525423728814e-06, "loss": 0.0119, "step": 1390 }, { "epoch": 0.5365269461077844, "grad_norm": 0.5526638031005859, "learning_rate": 8.095338983050847e-06, "loss": 0.0164, "step": 1400 }, { "epoch": 0.5403592814371257, "grad_norm": 1.2106008529663086, "learning_rate": 8.074152542372882e-06, "loss": 0.0117, "step": 1410 }, { "epoch": 0.5441916167664671, "grad_norm": 1.3328733444213867, "learning_rate": 8.052966101694916e-06, "loss": 0.0103, "step": 1420 }, { "epoch": 0.5480239520958083, "grad_norm": 1.2352383136749268, "learning_rate": 8.031779661016949e-06, "loss": 0.0132, "step": 1430 }, { "epoch": 0.5518562874251497, "grad_norm": 0.8932350277900696, "learning_rate": 8.010593220338984e-06, "loss": 0.0094, "step": 1440 }, { "epoch": 0.555688622754491, "grad_norm": 1.1897926330566406, "learning_rate": 7.989406779661017e-06, "loss": 0.0083, "step": 1450 }, { "epoch": 0.5595209580838323, "grad_norm": 3.584073543548584, "learning_rate": 7.968220338983052e-06, "loss": 0.0107, "step": 1460 }, { "epoch": 0.5633532934131736, "grad_norm": 2.2174160480499268, "learning_rate": 7.947033898305085e-06, "loss": 0.0143, "step": 1470 }, { "epoch": 0.567185628742515, "grad_norm": 1.3436836004257202, "learning_rate": 7.925847457627119e-06, "loss": 0.016, "step": 1480 }, { "epoch": 0.5710179640718562, "grad_norm": 2.2885513305664062, "learning_rate": 7.904661016949154e-06, "loss": 0.0097, "step": 1490 }, { "epoch": 0.5748502994011976, "grad_norm": 1.111118197441101, "learning_rate": 7.883474576271187e-06, "loss": 0.012, "step": 1500 }, { "epoch": 0.5786826347305389, "grad_norm": 1.1496660709381104, "learning_rate": 7.862288135593222e-06, "loss": 0.0112, "step": 1510 }, { "epoch": 0.5825149700598803, "grad_norm": 1.069238305091858, "learning_rate": 7.841101694915255e-06, "loss": 0.0143, "step": 1520 }, { "epoch": 0.5863473053892215, "grad_norm": 0.8200716972351074, "learning_rate": 7.819915254237289e-06, "loss": 0.0122, "step": 1530 }, { "epoch": 0.5901796407185629, "grad_norm": 0.911482036113739, "learning_rate": 7.798728813559324e-06, "loss": 0.0114, "step": 1540 }, { "epoch": 0.5940119760479042, "grad_norm": 0.8424916863441467, "learning_rate": 7.777542372881357e-06, "loss": 0.0111, "step": 1550 }, { "epoch": 0.5978443113772455, "grad_norm": 1.4030300378799438, "learning_rate": 7.75635593220339e-06, "loss": 0.0126, "step": 1560 }, { "epoch": 0.6016766467065868, "grad_norm": 1.4150911569595337, "learning_rate": 7.735169491525423e-06, "loss": 0.0083, "step": 1570 }, { "epoch": 0.6055089820359282, "grad_norm": 1.5504539012908936, "learning_rate": 7.713983050847458e-06, "loss": 0.0104, "step": 1580 }, { "epoch": 0.6093413173652694, "grad_norm": 2.837200403213501, "learning_rate": 7.692796610169492e-06, "loss": 0.0148, "step": 1590 }, { "epoch": 0.6131736526946108, "grad_norm": 1.2483636140823364, "learning_rate": 7.671610169491525e-06, "loss": 0.0124, "step": 1600 }, { "epoch": 0.6170059880239521, "grad_norm": 1.9936306476593018, "learning_rate": 7.65042372881356e-06, "loss": 0.0111, "step": 1610 }, { "epoch": 0.6208383233532934, "grad_norm": 1.3132163286209106, "learning_rate": 7.629237288135593e-06, "loss": 0.0122, "step": 1620 }, { "epoch": 0.6246706586826347, "grad_norm": 1.5041028261184692, "learning_rate": 7.6080508474576275e-06, "loss": 0.0114, "step": 1630 }, { "epoch": 0.6285029940119761, "grad_norm": 1.1258797645568848, "learning_rate": 7.586864406779662e-06, "loss": 0.0177, "step": 1640 }, { "epoch": 0.6323353293413174, "grad_norm": 1.5175997018814087, "learning_rate": 7.565677966101696e-06, "loss": 0.0112, "step": 1650 }, { "epoch": 0.6361676646706587, "grad_norm": 0.8304109573364258, "learning_rate": 7.544491525423729e-06, "loss": 0.0132, "step": 1660 }, { "epoch": 0.64, "grad_norm": 1.6195735931396484, "learning_rate": 7.523305084745763e-06, "loss": 0.008, "step": 1670 }, { "epoch": 0.6438323353293414, "grad_norm": 0.660678505897522, "learning_rate": 7.502118644067797e-06, "loss": 0.0095, "step": 1680 }, { "epoch": 0.6476646706586826, "grad_norm": 1.7854886054992676, "learning_rate": 7.4809322033898315e-06, "loss": 0.0154, "step": 1690 }, { "epoch": 0.651497005988024, "grad_norm": 1.3273577690124512, "learning_rate": 7.459745762711866e-06, "loss": 0.0094, "step": 1700 }, { "epoch": 0.6553293413173653, "grad_norm": 0.7021352648735046, "learning_rate": 7.438559322033899e-06, "loss": 0.0097, "step": 1710 }, { "epoch": 0.6591616766467066, "grad_norm": 1.966849446296692, "learning_rate": 7.417372881355933e-06, "loss": 0.0104, "step": 1720 }, { "epoch": 0.6629940119760479, "grad_norm": 1.0740894079208374, "learning_rate": 7.396186440677967e-06, "loss": 0.0098, "step": 1730 }, { "epoch": 0.6668263473053893, "grad_norm": 1.1839215755462646, "learning_rate": 7.375000000000001e-06, "loss": 0.0173, "step": 1740 }, { "epoch": 0.6706586826347305, "grad_norm": 1.8997067213058472, "learning_rate": 7.353813559322035e-06, "loss": 0.0104, "step": 1750 }, { "epoch": 0.6744910179640718, "grad_norm": 1.172722339630127, "learning_rate": 7.332627118644068e-06, "loss": 0.0118, "step": 1760 }, { "epoch": 0.6783233532934132, "grad_norm": 1.0804802179336548, "learning_rate": 7.311440677966102e-06, "loss": 0.0125, "step": 1770 }, { "epoch": 0.6821556886227544, "grad_norm": 1.2899483442306519, "learning_rate": 7.290254237288135e-06, "loss": 0.0113, "step": 1780 }, { "epoch": 0.6859880239520958, "grad_norm": 1.2753748893737793, "learning_rate": 7.2690677966101696e-06, "loss": 0.0076, "step": 1790 }, { "epoch": 0.6898203592814371, "grad_norm": 1.0572164058685303, "learning_rate": 7.247881355932204e-06, "loss": 0.0146, "step": 1800 }, { "epoch": 0.6936526946107785, "grad_norm": 0.9690531492233276, "learning_rate": 7.226694915254238e-06, "loss": 0.0106, "step": 1810 }, { "epoch": 0.6974850299401197, "grad_norm": 0.37759602069854736, "learning_rate": 7.205508474576271e-06, "loss": 0.0087, "step": 1820 }, { "epoch": 0.7013173652694611, "grad_norm": 1.595554232597351, "learning_rate": 7.184322033898305e-06, "loss": 0.0121, "step": 1830 }, { "epoch": 0.7051497005988024, "grad_norm": 5.621078014373779, "learning_rate": 7.1631355932203394e-06, "loss": 0.0097, "step": 1840 }, { "epoch": 0.7089820359281437, "grad_norm": 0.313541978597641, "learning_rate": 7.141949152542374e-06, "loss": 0.0105, "step": 1850 }, { "epoch": 0.712814371257485, "grad_norm": 0.7415321469306946, "learning_rate": 7.120762711864408e-06, "loss": 0.013, "step": 1860 }, { "epoch": 0.7166467065868264, "grad_norm": 0.7116707563400269, "learning_rate": 7.099576271186441e-06, "loss": 0.0089, "step": 1870 }, { "epoch": 0.7204790419161676, "grad_norm": 2.017526388168335, "learning_rate": 7.078389830508475e-06, "loss": 0.0123, "step": 1880 }, { "epoch": 0.724311377245509, "grad_norm": 1.4061137437820435, "learning_rate": 7.057203389830509e-06, "loss": 0.0077, "step": 1890 }, { "epoch": 0.7281437125748503, "grad_norm": 1.1427956819534302, "learning_rate": 7.0360169491525435e-06, "loss": 0.0074, "step": 1900 }, { "epoch": 0.7319760479041916, "grad_norm": 4.021897792816162, "learning_rate": 7.014830508474577e-06, "loss": 0.0109, "step": 1910 }, { "epoch": 0.7358083832335329, "grad_norm": 0.36056721210479736, "learning_rate": 6.993644067796611e-06, "loss": 0.0102, "step": 1920 }, { "epoch": 0.7396407185628743, "grad_norm": 0.8428685665130615, "learning_rate": 6.972457627118645e-06, "loss": 0.0084, "step": 1930 }, { "epoch": 0.7434730538922155, "grad_norm": 0.6571751236915588, "learning_rate": 6.951271186440679e-06, "loss": 0.0075, "step": 1940 }, { "epoch": 0.7473053892215569, "grad_norm": 1.2280906438827515, "learning_rate": 6.930084745762713e-06, "loss": 0.0141, "step": 1950 }, { "epoch": 0.7511377245508982, "grad_norm": 0.9602510333061218, "learning_rate": 6.908898305084746e-06, "loss": 0.0092, "step": 1960 }, { "epoch": 0.7549700598802396, "grad_norm": 0.7274242043495178, "learning_rate": 6.88771186440678e-06, "loss": 0.0061, "step": 1970 }, { "epoch": 0.7588023952095808, "grad_norm": 1.538460373878479, "learning_rate": 6.866525423728814e-06, "loss": 0.0106, "step": 1980 }, { "epoch": 0.7626347305389222, "grad_norm": 0.8836348056793213, "learning_rate": 6.845338983050847e-06, "loss": 0.0119, "step": 1990 }, { "epoch": 0.7664670658682635, "grad_norm": 1.2621474266052246, "learning_rate": 6.8241525423728815e-06, "loss": 0.0118, "step": 2000 }, { "epoch": 0.7664670658682635, "eval_loss": 0.006921224296092987, "eval_runtime": 15262.8627, "eval_samples_per_second": 1.368, "eval_steps_per_second": 0.171, "eval_wer": 0.7323685598172008, "step": 2000 } ], "logging_steps": 10, "max_steps": 5220, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.531871408128e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }