| { |
| "best_global_step": 7000, |
| "best_metric": 0.9755905511811024, |
| "best_model_checkpoint": "output/checkpoint-7000", |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 9048, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.016578249336870028, |
| "grad_norm": 15.383491516113281, |
| "learning_rate": 2.1633554083885212e-06, |
| "loss": 1.0204, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.033156498673740056, |
| "grad_norm": 15.810857772827148, |
| "learning_rate": 4.370860927152319e-06, |
| "loss": 0.8263, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.04973474801061008, |
| "grad_norm": 29.156631469726562, |
| "learning_rate": 6.578366445916116e-06, |
| "loss": 0.7179, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.06631299734748011, |
| "grad_norm": 13.032061576843262, |
| "learning_rate": 8.785871964679912e-06, |
| "loss": 0.4882, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.08289124668435013, |
| "grad_norm": 7.32506799697876, |
| "learning_rate": 1.099337748344371e-05, |
| "loss": 0.4156, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.09946949602122016, |
| "grad_norm": 16.674861907958984, |
| "learning_rate": 1.3200883002207508e-05, |
| "loss": 0.3548, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.11604774535809019, |
| "grad_norm": 39.721839904785156, |
| "learning_rate": 1.5408388520971304e-05, |
| "loss": 0.3145, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.13262599469496023, |
| "grad_norm": 9.635961532592773, |
| "learning_rate": 1.76158940397351e-05, |
| "loss": 0.292, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.14920424403183025, |
| "grad_norm": 3.8946573734283447, |
| "learning_rate": 1.9823399558498897e-05, |
| "loss": 0.3039, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.16578249336870027, |
| "grad_norm": 2.919046401977539, |
| "learning_rate": 1.989296102385108e-05, |
| "loss": 0.2764, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.16578249336870027, |
| "eval_accuracy": 0.9015748031496063, |
| "eval_loss": 0.25162026286125183, |
| "eval_runtime": 8.6404, |
| "eval_samples_per_second": 293.967, |
| "eval_steps_per_second": 9.259, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.1823607427055703, |
| "grad_norm": 4.6071295738220215, |
| "learning_rate": 1.9776614310645726e-05, |
| "loss": 0.2176, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.1989389920424403, |
| "grad_norm": 1.5469706058502197, |
| "learning_rate": 1.9660267597440373e-05, |
| "loss": 0.2785, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.21551724137931033, |
| "grad_norm": 18.289094924926758, |
| "learning_rate": 1.9543920884235023e-05, |
| "loss": 0.1978, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.23209549071618038, |
| "grad_norm": 16.161046981811523, |
| "learning_rate": 1.942757417102967e-05, |
| "loss": 0.2398, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.2486737400530504, |
| "grad_norm": 4.511693000793457, |
| "learning_rate": 1.9311227457824316e-05, |
| "loss": 0.1964, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.26525198938992045, |
| "grad_norm": 0.6140362620353699, |
| "learning_rate": 1.9194880744618966e-05, |
| "loss": 0.1817, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.28183023872679047, |
| "grad_norm": 34.74034881591797, |
| "learning_rate": 1.9078534031413613e-05, |
| "loss": 0.1988, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.2984084880636605, |
| "grad_norm": 2.9800262451171875, |
| "learning_rate": 1.8962187318208263e-05, |
| "loss": 0.1641, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.3149867374005305, |
| "grad_norm": 15.570615768432617, |
| "learning_rate": 1.884584060500291e-05, |
| "loss": 0.1554, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.33156498673740054, |
| "grad_norm": 1.9845480918884277, |
| "learning_rate": 1.872949389179756e-05, |
| "loss": 0.1855, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.33156498673740054, |
| "eval_accuracy": 0.9358267716535433, |
| "eval_loss": 0.18621081113815308, |
| "eval_runtime": 8.3497, |
| "eval_samples_per_second": 304.203, |
| "eval_steps_per_second": 9.581, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.34814323607427056, |
| "grad_norm": 14.85474681854248, |
| "learning_rate": 1.8613147178592206e-05, |
| "loss": 0.1725, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.3647214854111406, |
| "grad_norm": 4.149899959564209, |
| "learning_rate": 1.8496800465386856e-05, |
| "loss": 0.1513, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.3812997347480106, |
| "grad_norm": 22.695873260498047, |
| "learning_rate": 1.8380453752181503e-05, |
| "loss": 0.1767, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.3978779840848806, |
| "grad_norm": 0.17689716815948486, |
| "learning_rate": 1.8264107038976153e-05, |
| "loss": 0.1623, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.41445623342175064, |
| "grad_norm": 30.613101959228516, |
| "learning_rate": 1.81477603257708e-05, |
| "loss": 0.1682, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.43103448275862066, |
| "grad_norm": 16.982980728149414, |
| "learning_rate": 1.8031413612565446e-05, |
| "loss": 0.1508, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.44761273209549074, |
| "grad_norm": 0.1425020545721054, |
| "learning_rate": 1.7915066899360092e-05, |
| "loss": 0.1679, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.46419098143236076, |
| "grad_norm": 5.924993991851807, |
| "learning_rate": 1.7798720186154742e-05, |
| "loss": 0.1387, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.4807692307692308, |
| "grad_norm": 5.42221212387085, |
| "learning_rate": 1.768237347294939e-05, |
| "loss": 0.1627, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.4973474801061008, |
| "grad_norm": 31.555957794189453, |
| "learning_rate": 1.756602675974404e-05, |
| "loss": 0.1188, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.4973474801061008, |
| "eval_accuracy": 0.9417322834645669, |
| "eval_loss": 0.1927609145641327, |
| "eval_runtime": 8.4174, |
| "eval_samples_per_second": 301.756, |
| "eval_steps_per_second": 9.504, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.5139257294429708, |
| "grad_norm": 15.842325210571289, |
| "learning_rate": 1.7449680046538686e-05, |
| "loss": 0.1678, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.5305039787798409, |
| "grad_norm": 6.040674686431885, |
| "learning_rate": 1.7333333333333336e-05, |
| "loss": 0.1215, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.5470822281167109, |
| "grad_norm": 0.5153898596763611, |
| "learning_rate": 1.7216986620127982e-05, |
| "loss": 0.1507, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.5636604774535809, |
| "grad_norm": 18.98447608947754, |
| "learning_rate": 1.7100639906922632e-05, |
| "loss": 0.1481, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.580238726790451, |
| "grad_norm": 43.290855407714844, |
| "learning_rate": 1.698429319371728e-05, |
| "loss": 0.1422, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.596816976127321, |
| "grad_norm": 0.14761729538440704, |
| "learning_rate": 1.686794648051193e-05, |
| "loss": 0.1551, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.613395225464191, |
| "grad_norm": 96.18028259277344, |
| "learning_rate": 1.6751599767306575e-05, |
| "loss": 0.1303, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.629973474801061, |
| "grad_norm": 0.17378291487693787, |
| "learning_rate": 1.6635253054101222e-05, |
| "loss": 0.1484, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.646551724137931, |
| "grad_norm": 0.2519828677177429, |
| "learning_rate": 1.6518906340895872e-05, |
| "loss": 0.1845, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.6631299734748011, |
| "grad_norm": 37.454341888427734, |
| "learning_rate": 1.640255962769052e-05, |
| "loss": 0.1544, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.6631299734748011, |
| "eval_accuracy": 0.9551181102362205, |
| "eval_loss": 0.14238852262496948, |
| "eval_runtime": 8.4401, |
| "eval_samples_per_second": 300.943, |
| "eval_steps_per_second": 9.479, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.6797082228116711, |
| "grad_norm": 9.755090713500977, |
| "learning_rate": 1.6286212914485165e-05, |
| "loss": 0.1176, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.6962864721485411, |
| "grad_norm": 0.9808468222618103, |
| "learning_rate": 1.6169866201279815e-05, |
| "loss": 0.1234, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.7128647214854111, |
| "grad_norm": 4.4084649085998535, |
| "learning_rate": 1.6053519488074462e-05, |
| "loss": 0.0882, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.7294429708222812, |
| "grad_norm": 35.67166519165039, |
| "learning_rate": 1.5937172774869112e-05, |
| "loss": 0.1246, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.7460212201591512, |
| "grad_norm": 54.88447189331055, |
| "learning_rate": 1.582082606166376e-05, |
| "loss": 0.1504, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.7625994694960212, |
| "grad_norm": 0.32783272862434387, |
| "learning_rate": 1.570447934845841e-05, |
| "loss": 0.1419, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.7791777188328912, |
| "grad_norm": 0.9903936386108398, |
| "learning_rate": 1.5588132635253055e-05, |
| "loss": 0.1154, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.7957559681697612, |
| "grad_norm": 0.5903331637382507, |
| "learning_rate": 1.5471785922047705e-05, |
| "loss": 0.0817, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.8123342175066313, |
| "grad_norm": 6.974926471710205, |
| "learning_rate": 1.5355439208842352e-05, |
| "loss": 0.1164, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.8289124668435013, |
| "grad_norm": 25.492650985717773, |
| "learning_rate": 1.5239092495637e-05, |
| "loss": 0.1592, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.8289124668435013, |
| "eval_accuracy": 0.9641732283464567, |
| "eval_loss": 0.131427600979805, |
| "eval_runtime": 8.5238, |
| "eval_samples_per_second": 297.989, |
| "eval_steps_per_second": 9.385, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.8454907161803713, |
| "grad_norm": 24.847103118896484, |
| "learning_rate": 1.5122745782431647e-05, |
| "loss": 0.1119, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.8620689655172413, |
| "grad_norm": 2.9396579265594482, |
| "learning_rate": 1.5006399069226297e-05, |
| "loss": 0.0978, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.8786472148541115, |
| "grad_norm": 21.483346939086914, |
| "learning_rate": 1.4890052356020943e-05, |
| "loss": 0.1233, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.8952254641909815, |
| "grad_norm": 0.2358592301607132, |
| "learning_rate": 1.4773705642815592e-05, |
| "loss": 0.1582, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.9118037135278515, |
| "grad_norm": 0.7912116646766663, |
| "learning_rate": 1.465735892961024e-05, |
| "loss": 0.1432, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.9283819628647215, |
| "grad_norm": 0.2763047516345978, |
| "learning_rate": 1.4541012216404888e-05, |
| "loss": 0.0988, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.9449602122015915, |
| "grad_norm": 0.904279887676239, |
| "learning_rate": 1.4424665503199535e-05, |
| "loss": 0.1295, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.9615384615384616, |
| "grad_norm": 0.578948974609375, |
| "learning_rate": 1.4308318789994185e-05, |
| "loss": 0.0949, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.9781167108753316, |
| "grad_norm": 5.489175796508789, |
| "learning_rate": 1.4191972076788831e-05, |
| "loss": 0.1224, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.9946949602122016, |
| "grad_norm": 17.271835327148438, |
| "learning_rate": 1.4075625363583481e-05, |
| "loss": 0.1476, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.9946949602122016, |
| "eval_accuracy": 0.9535433070866142, |
| "eval_loss": 0.16435901820659637, |
| "eval_runtime": 8.2931, |
| "eval_samples_per_second": 306.28, |
| "eval_steps_per_second": 9.647, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.0112732095490715, |
| "grad_norm": 17.650592803955078, |
| "learning_rate": 1.3959278650378128e-05, |
| "loss": 0.064, |
| "step": 3050 |
| }, |
| { |
| "epoch": 1.0278514588859415, |
| "grad_norm": 0.08039892464876175, |
| "learning_rate": 1.3842931937172776e-05, |
| "loss": 0.055, |
| "step": 3100 |
| }, |
| { |
| "epoch": 1.0444297082228116, |
| "grad_norm": 36.28068542480469, |
| "learning_rate": 1.3726585223967423e-05, |
| "loss": 0.052, |
| "step": 3150 |
| }, |
| { |
| "epoch": 1.0610079575596818, |
| "grad_norm": 0.07026328891515732, |
| "learning_rate": 1.3610238510762073e-05, |
| "loss": 0.0706, |
| "step": 3200 |
| }, |
| { |
| "epoch": 1.0775862068965518, |
| "grad_norm": 6.0042853355407715, |
| "learning_rate": 1.349389179755672e-05, |
| "loss": 0.1037, |
| "step": 3250 |
| }, |
| { |
| "epoch": 1.0941644562334218, |
| "grad_norm": 0.8829227089881897, |
| "learning_rate": 1.337754508435137e-05, |
| "loss": 0.0536, |
| "step": 3300 |
| }, |
| { |
| "epoch": 1.1107427055702919, |
| "grad_norm": 54.231544494628906, |
| "learning_rate": 1.3261198371146016e-05, |
| "loss": 0.0374, |
| "step": 3350 |
| }, |
| { |
| "epoch": 1.1273209549071619, |
| "grad_norm": 0.07569246739149094, |
| "learning_rate": 1.3144851657940664e-05, |
| "loss": 0.0738, |
| "step": 3400 |
| }, |
| { |
| "epoch": 1.143899204244032, |
| "grad_norm": 0.017937984317541122, |
| "learning_rate": 1.3028504944735311e-05, |
| "loss": 0.03, |
| "step": 3450 |
| }, |
| { |
| "epoch": 1.160477453580902, |
| "grad_norm": 9.184370040893555, |
| "learning_rate": 1.2912158231529961e-05, |
| "loss": 0.0778, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.160477453580902, |
| "eval_accuracy": 0.9677165354330709, |
| "eval_loss": 0.1568383127450943, |
| "eval_runtime": 8.2686, |
| "eval_samples_per_second": 307.187, |
| "eval_steps_per_second": 9.675, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.177055702917772, |
| "grad_norm": 31.2453556060791, |
| "learning_rate": 1.2795811518324608e-05, |
| "loss": 0.04, |
| "step": 3550 |
| }, |
| { |
| "epoch": 1.193633952254642, |
| "grad_norm": 0.031678102910518646, |
| "learning_rate": 1.2679464805119258e-05, |
| "loss": 0.0687, |
| "step": 3600 |
| }, |
| { |
| "epoch": 1.210212201591512, |
| "grad_norm": 0.20173047482967377, |
| "learning_rate": 1.2563118091913904e-05, |
| "loss": 0.092, |
| "step": 3650 |
| }, |
| { |
| "epoch": 1.226790450928382, |
| "grad_norm": 0.08714820444583893, |
| "learning_rate": 1.2446771378708553e-05, |
| "loss": 0.0692, |
| "step": 3700 |
| }, |
| { |
| "epoch": 1.243368700265252, |
| "grad_norm": 22.537704467773438, |
| "learning_rate": 1.23304246655032e-05, |
| "loss": 0.0652, |
| "step": 3750 |
| }, |
| { |
| "epoch": 1.259946949602122, |
| "grad_norm": 8.494444847106934, |
| "learning_rate": 1.2214077952297849e-05, |
| "loss": 0.0541, |
| "step": 3800 |
| }, |
| { |
| "epoch": 1.276525198938992, |
| "grad_norm": 0.4346514046192169, |
| "learning_rate": 1.2097731239092496e-05, |
| "loss": 0.055, |
| "step": 3850 |
| }, |
| { |
| "epoch": 1.293103448275862, |
| "grad_norm": 0.08467918634414673, |
| "learning_rate": 1.1981384525887146e-05, |
| "loss": 0.0454, |
| "step": 3900 |
| }, |
| { |
| "epoch": 1.3096816976127321, |
| "grad_norm": 0.10869591683149338, |
| "learning_rate": 1.1865037812681792e-05, |
| "loss": 0.0438, |
| "step": 3950 |
| }, |
| { |
| "epoch": 1.3262599469496021, |
| "grad_norm": 20.05512046813965, |
| "learning_rate": 1.174869109947644e-05, |
| "loss": 0.0661, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.3262599469496021, |
| "eval_accuracy": 0.9696850393700788, |
| "eval_loss": 0.14853054285049438, |
| "eval_runtime": 8.4299, |
| "eval_samples_per_second": 301.307, |
| "eval_steps_per_second": 9.49, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.3428381962864722, |
| "grad_norm": 0.03564869612455368, |
| "learning_rate": 1.1632344386271089e-05, |
| "loss": 0.0529, |
| "step": 4050 |
| }, |
| { |
| "epoch": 1.3594164456233422, |
| "grad_norm": 0.020624179393053055, |
| "learning_rate": 1.1515997673065737e-05, |
| "loss": 0.0525, |
| "step": 4100 |
| }, |
| { |
| "epoch": 1.3759946949602122, |
| "grad_norm": 10.6432466506958, |
| "learning_rate": 1.1399650959860384e-05, |
| "loss": 0.0502, |
| "step": 4150 |
| }, |
| { |
| "epoch": 1.3925729442970822, |
| "grad_norm": 0.02102123387157917, |
| "learning_rate": 1.1283304246655034e-05, |
| "loss": 0.0582, |
| "step": 4200 |
| }, |
| { |
| "epoch": 1.4091511936339522, |
| "grad_norm": 32.820335388183594, |
| "learning_rate": 1.116695753344968e-05, |
| "loss": 0.0693, |
| "step": 4250 |
| }, |
| { |
| "epoch": 1.4257294429708223, |
| "grad_norm": 0.8974321484565735, |
| "learning_rate": 1.105061082024433e-05, |
| "loss": 0.0898, |
| "step": 4300 |
| }, |
| { |
| "epoch": 1.4423076923076923, |
| "grad_norm": 12.125022888183594, |
| "learning_rate": 1.0934264107038977e-05, |
| "loss": 0.0251, |
| "step": 4350 |
| }, |
| { |
| "epoch": 1.4588859416445623, |
| "grad_norm": 25.702117919921875, |
| "learning_rate": 1.0817917393833625e-05, |
| "loss": 0.0526, |
| "step": 4400 |
| }, |
| { |
| "epoch": 1.4754641909814323, |
| "grad_norm": 0.011364310048520565, |
| "learning_rate": 1.0701570680628272e-05, |
| "loss": 0.0376, |
| "step": 4450 |
| }, |
| { |
| "epoch": 1.4920424403183024, |
| "grad_norm": 0.02076265960931778, |
| "learning_rate": 1.0585223967422922e-05, |
| "loss": 0.0352, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.4920424403183024, |
| "eval_accuracy": 0.9708661417322835, |
| "eval_loss": 0.14908622205257416, |
| "eval_runtime": 8.3245, |
| "eval_samples_per_second": 305.125, |
| "eval_steps_per_second": 9.61, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.5086206896551724, |
| "grad_norm": 0.32766398787498474, |
| "learning_rate": 1.0468877254217569e-05, |
| "loss": 0.0315, |
| "step": 4550 |
| }, |
| { |
| "epoch": 1.5251989389920424, |
| "grad_norm": 1.0165653228759766, |
| "learning_rate": 1.0352530541012219e-05, |
| "loss": 0.0792, |
| "step": 4600 |
| }, |
| { |
| "epoch": 1.5417771883289124, |
| "grad_norm": 19.78496551513672, |
| "learning_rate": 1.0236183827806865e-05, |
| "loss": 0.0906, |
| "step": 4650 |
| }, |
| { |
| "epoch": 1.5583554376657824, |
| "grad_norm": 0.024883978068828583, |
| "learning_rate": 1.0119837114601513e-05, |
| "loss": 0.0493, |
| "step": 4700 |
| }, |
| { |
| "epoch": 1.5749336870026527, |
| "grad_norm": 0.23417970538139343, |
| "learning_rate": 1.000349040139616e-05, |
| "loss": 0.0657, |
| "step": 4750 |
| }, |
| { |
| "epoch": 1.5915119363395225, |
| "grad_norm": 0.09522684663534164, |
| "learning_rate": 9.887143688190808e-06, |
| "loss": 0.0412, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.6080901856763927, |
| "grad_norm": 0.16693036258220673, |
| "learning_rate": 9.770796974985457e-06, |
| "loss": 0.0619, |
| "step": 4850 |
| }, |
| { |
| "epoch": 1.6246684350132625, |
| "grad_norm": 25.931489944458008, |
| "learning_rate": 9.654450261780105e-06, |
| "loss": 0.0221, |
| "step": 4900 |
| }, |
| { |
| "epoch": 1.6412466843501328, |
| "grad_norm": 0.02511373534798622, |
| "learning_rate": 9.538103548574753e-06, |
| "loss": 0.0228, |
| "step": 4950 |
| }, |
| { |
| "epoch": 1.6578249336870026, |
| "grad_norm": 0.014305735006928444, |
| "learning_rate": 9.421756835369402e-06, |
| "loss": 0.0483, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.6578249336870026, |
| "eval_accuracy": 0.9692913385826771, |
| "eval_loss": 0.1490258425474167, |
| "eval_runtime": 8.5307, |
| "eval_samples_per_second": 297.75, |
| "eval_steps_per_second": 9.378, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.6744031830238728, |
| "grad_norm": 1.0724488496780396, |
| "learning_rate": 9.30541012216405e-06, |
| "loss": 0.0485, |
| "step": 5050 |
| }, |
| { |
| "epoch": 1.6909814323607426, |
| "grad_norm": 0.2601702809333801, |
| "learning_rate": 9.189063408958697e-06, |
| "loss": 0.0585, |
| "step": 5100 |
| }, |
| { |
| "epoch": 1.7075596816976129, |
| "grad_norm": 0.4412553608417511, |
| "learning_rate": 9.072716695753345e-06, |
| "loss": 0.0278, |
| "step": 5150 |
| }, |
| { |
| "epoch": 1.7241379310344827, |
| "grad_norm": 20.371938705444336, |
| "learning_rate": 8.956369982547993e-06, |
| "loss": 0.0605, |
| "step": 5200 |
| }, |
| { |
| "epoch": 1.740716180371353, |
| "grad_norm": 33.141326904296875, |
| "learning_rate": 8.840023269342641e-06, |
| "loss": 0.0902, |
| "step": 5250 |
| }, |
| { |
| "epoch": 1.7572944297082227, |
| "grad_norm": 0.3250061571598053, |
| "learning_rate": 8.72367655613729e-06, |
| "loss": 0.0552, |
| "step": 5300 |
| }, |
| { |
| "epoch": 1.773872679045093, |
| "grad_norm": 26.73221206665039, |
| "learning_rate": 8.607329842931938e-06, |
| "loss": 0.0696, |
| "step": 5350 |
| }, |
| { |
| "epoch": 1.7904509283819627, |
| "grad_norm": 0.10050135850906372, |
| "learning_rate": 8.490983129726585e-06, |
| "loss": 0.07, |
| "step": 5400 |
| }, |
| { |
| "epoch": 1.807029177718833, |
| "grad_norm": 0.03302587568759918, |
| "learning_rate": 8.374636416521233e-06, |
| "loss": 0.0495, |
| "step": 5450 |
| }, |
| { |
| "epoch": 1.8236074270557028, |
| "grad_norm": 9.589860916137695, |
| "learning_rate": 8.258289703315881e-06, |
| "loss": 0.0459, |
| "step": 5500 |
| }, |
| { |
| "epoch": 1.8236074270557028, |
| "eval_accuracy": 0.9748031496062992, |
| "eval_loss": 0.12067966908216476, |
| "eval_runtime": 8.3606, |
| "eval_samples_per_second": 303.807, |
| "eval_steps_per_second": 9.569, |
| "step": 5500 |
| }, |
| { |
| "epoch": 1.840185676392573, |
| "grad_norm": 21.569997787475586, |
| "learning_rate": 8.14194299011053e-06, |
| "loss": 0.0545, |
| "step": 5550 |
| }, |
| { |
| "epoch": 1.8567639257294428, |
| "grad_norm": 0.021630477160215378, |
| "learning_rate": 8.025596276905178e-06, |
| "loss": 0.0433, |
| "step": 5600 |
| }, |
| { |
| "epoch": 1.873342175066313, |
| "grad_norm": 0.01602640002965927, |
| "learning_rate": 7.909249563699826e-06, |
| "loss": 0.0281, |
| "step": 5650 |
| }, |
| { |
| "epoch": 1.8899204244031829, |
| "grad_norm": 0.2678263485431671, |
| "learning_rate": 7.792902850494474e-06, |
| "loss": 0.0476, |
| "step": 5700 |
| }, |
| { |
| "epoch": 1.906498673740053, |
| "grad_norm": 0.037133533507585526, |
| "learning_rate": 7.676556137289121e-06, |
| "loss": 0.0491, |
| "step": 5750 |
| }, |
| { |
| "epoch": 1.9230769230769231, |
| "grad_norm": 0.03356484696269035, |
| "learning_rate": 7.560209424083769e-06, |
| "loss": 0.0448, |
| "step": 5800 |
| }, |
| { |
| "epoch": 1.9396551724137931, |
| "grad_norm": 150.93212890625, |
| "learning_rate": 7.443862710878418e-06, |
| "loss": 0.0306, |
| "step": 5850 |
| }, |
| { |
| "epoch": 1.9562334217506632, |
| "grad_norm": 0.283079594373703, |
| "learning_rate": 7.327515997673066e-06, |
| "loss": 0.0707, |
| "step": 5900 |
| }, |
| { |
| "epoch": 1.9728116710875332, |
| "grad_norm": 8.607619285583496, |
| "learning_rate": 7.211169284467714e-06, |
| "loss": 0.0648, |
| "step": 5950 |
| }, |
| { |
| "epoch": 1.9893899204244032, |
| "grad_norm": 2.3767149448394775, |
| "learning_rate": 7.094822571262362e-06, |
| "loss": 0.0441, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.9893899204244032, |
| "eval_accuracy": 0.971259842519685, |
| "eval_loss": 0.13274061679840088, |
| "eval_runtime": 8.2266, |
| "eval_samples_per_second": 308.755, |
| "eval_steps_per_second": 9.725, |
| "step": 6000 |
| }, |
| { |
| "epoch": 2.0059681697612732, |
| "grad_norm": 0.05666418746113777, |
| "learning_rate": 6.97847585805701e-06, |
| "loss": 0.0261, |
| "step": 6050 |
| }, |
| { |
| "epoch": 2.022546419098143, |
| "grad_norm": 0.046994440257549286, |
| "learning_rate": 6.862129144851658e-06, |
| "loss": 0.0163, |
| "step": 6100 |
| }, |
| { |
| "epoch": 2.0391246684350133, |
| "grad_norm": 92.52049255371094, |
| "learning_rate": 6.745782431646306e-06, |
| "loss": 0.0156, |
| "step": 6150 |
| }, |
| { |
| "epoch": 2.055702917771883, |
| "grad_norm": 0.050654154270887375, |
| "learning_rate": 6.629435718440954e-06, |
| "loss": 0.0159, |
| "step": 6200 |
| }, |
| { |
| "epoch": 2.0722811671087533, |
| "grad_norm": 0.07930738478899002, |
| "learning_rate": 6.513089005235602e-06, |
| "loss": 0.0155, |
| "step": 6250 |
| }, |
| { |
| "epoch": 2.088859416445623, |
| "grad_norm": 0.006583095528185368, |
| "learning_rate": 6.39674229203025e-06, |
| "loss": 0.0122, |
| "step": 6300 |
| }, |
| { |
| "epoch": 2.1054376657824934, |
| "grad_norm": 0.009297781623899937, |
| "learning_rate": 6.280395578824898e-06, |
| "loss": 0.0197, |
| "step": 6350 |
| }, |
| { |
| "epoch": 2.1220159151193636, |
| "grad_norm": 0.06135769560933113, |
| "learning_rate": 6.1640488656195465e-06, |
| "loss": 0.01, |
| "step": 6400 |
| }, |
| { |
| "epoch": 2.1385941644562334, |
| "grad_norm": 0.01910097897052765, |
| "learning_rate": 6.047702152414194e-06, |
| "loss": 0.0166, |
| "step": 6450 |
| }, |
| { |
| "epoch": 2.1551724137931036, |
| "grad_norm": 0.009077299386262894, |
| "learning_rate": 5.931355439208842e-06, |
| "loss": 0.0123, |
| "step": 6500 |
| }, |
| { |
| "epoch": 2.1551724137931036, |
| "eval_accuracy": 0.9751968503937007, |
| "eval_loss": 0.1399421989917755, |
| "eval_runtime": 8.3995, |
| "eval_samples_per_second": 302.399, |
| "eval_steps_per_second": 9.524, |
| "step": 6500 |
| }, |
| { |
| "epoch": 2.1717506631299734, |
| "grad_norm": 0.005973361898213625, |
| "learning_rate": 5.8150087260034905e-06, |
| "loss": 0.006, |
| "step": 6550 |
| }, |
| { |
| "epoch": 2.1883289124668437, |
| "grad_norm": 0.1609232872724533, |
| "learning_rate": 5.698662012798139e-06, |
| "loss": 0.0244, |
| "step": 6600 |
| }, |
| { |
| "epoch": 2.2049071618037135, |
| "grad_norm": 30.786165237426758, |
| "learning_rate": 5.582315299592786e-06, |
| "loss": 0.036, |
| "step": 6650 |
| }, |
| { |
| "epoch": 2.2214854111405837, |
| "grad_norm": 0.007496810518205166, |
| "learning_rate": 5.465968586387435e-06, |
| "loss": 0.0053, |
| "step": 6700 |
| }, |
| { |
| "epoch": 2.2380636604774535, |
| "grad_norm": 86.89860534667969, |
| "learning_rate": 5.349621873182083e-06, |
| "loss": 0.021, |
| "step": 6750 |
| }, |
| { |
| "epoch": 2.2546419098143238, |
| "grad_norm": 0.007081742864102125, |
| "learning_rate": 5.23327515997673e-06, |
| "loss": 0.0184, |
| "step": 6800 |
| }, |
| { |
| "epoch": 2.2712201591511936, |
| "grad_norm": 11.790361404418945, |
| "learning_rate": 5.116928446771379e-06, |
| "loss": 0.0149, |
| "step": 6850 |
| }, |
| { |
| "epoch": 2.287798408488064, |
| "grad_norm": 0.012445084750652313, |
| "learning_rate": 5.000581733566027e-06, |
| "loss": 0.0187, |
| "step": 6900 |
| }, |
| { |
| "epoch": 2.3043766578249336, |
| "grad_norm": 0.00747014069929719, |
| "learning_rate": 4.884235020360675e-06, |
| "loss": 0.0003, |
| "step": 6950 |
| }, |
| { |
| "epoch": 2.320954907161804, |
| "grad_norm": 0.016270918771624565, |
| "learning_rate": 4.7678883071553236e-06, |
| "loss": 0.0208, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.320954907161804, |
| "eval_accuracy": 0.9755905511811024, |
| "eval_loss": 0.1440856009721756, |
| "eval_runtime": 8.3702, |
| "eval_samples_per_second": 303.457, |
| "eval_steps_per_second": 9.558, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.3375331564986737, |
| "grad_norm": 0.0076462519355118275, |
| "learning_rate": 4.651541593949971e-06, |
| "loss": 0.0134, |
| "step": 7050 |
| }, |
| { |
| "epoch": 2.354111405835544, |
| "grad_norm": 0.008808553218841553, |
| "learning_rate": 4.535194880744619e-06, |
| "loss": 0.0092, |
| "step": 7100 |
| }, |
| { |
| "epoch": 2.3706896551724137, |
| "grad_norm": 8.406987190246582, |
| "learning_rate": 4.418848167539268e-06, |
| "loss": 0.0368, |
| "step": 7150 |
| }, |
| { |
| "epoch": 2.387267904509284, |
| "grad_norm": 0.004396683536469936, |
| "learning_rate": 4.302501454333916e-06, |
| "loss": 0.0035, |
| "step": 7200 |
| }, |
| { |
| "epoch": 2.4038461538461537, |
| "grad_norm": 0.005913051310926676, |
| "learning_rate": 4.186154741128563e-06, |
| "loss": 0.0021, |
| "step": 7250 |
| }, |
| { |
| "epoch": 2.420424403183024, |
| "grad_norm": 0.00849180482327938, |
| "learning_rate": 4.069808027923212e-06, |
| "loss": 0.0135, |
| "step": 7300 |
| }, |
| { |
| "epoch": 2.437002652519894, |
| "grad_norm": 18.357879638671875, |
| "learning_rate": 3.95346131471786e-06, |
| "loss": 0.0119, |
| "step": 7350 |
| }, |
| { |
| "epoch": 2.453580901856764, |
| "grad_norm": 0.005445088259875774, |
| "learning_rate": 3.8371146015125074e-06, |
| "loss": 0.018, |
| "step": 7400 |
| }, |
| { |
| "epoch": 2.470159151193634, |
| "grad_norm": 97.43321990966797, |
| "learning_rate": 3.7207678883071557e-06, |
| "loss": 0.0329, |
| "step": 7450 |
| }, |
| { |
| "epoch": 2.486737400530504, |
| "grad_norm": 0.003963190596550703, |
| "learning_rate": 3.6044211751018036e-06, |
| "loss": 0.0385, |
| "step": 7500 |
| }, |
| { |
| "epoch": 2.486737400530504, |
| "eval_accuracy": 0.974015748031496, |
| "eval_loss": 0.1490032821893692, |
| "eval_runtime": 8.4191, |
| "eval_samples_per_second": 301.694, |
| "eval_steps_per_second": 9.502, |
| "step": 7500 |
| }, |
| { |
| "epoch": 2.503315649867374, |
| "grad_norm": 0.004599283449351788, |
| "learning_rate": 3.488074461896452e-06, |
| "loss": 0.0151, |
| "step": 7550 |
| }, |
| { |
| "epoch": 2.519893899204244, |
| "grad_norm": 0.032206956297159195, |
| "learning_rate": 3.3717277486911e-06, |
| "loss": 0.0075, |
| "step": 7600 |
| }, |
| { |
| "epoch": 2.536472148541114, |
| "grad_norm": 7.669720649719238, |
| "learning_rate": 3.255381035485748e-06, |
| "loss": 0.0329, |
| "step": 7650 |
| }, |
| { |
| "epoch": 2.553050397877984, |
| "grad_norm": 0.045368775725364685, |
| "learning_rate": 3.139034322280396e-06, |
| "loss": 0.0394, |
| "step": 7700 |
| }, |
| { |
| "epoch": 2.569628647214854, |
| "grad_norm": 0.012171960435807705, |
| "learning_rate": 3.022687609075044e-06, |
| "loss": 0.0035, |
| "step": 7750 |
| }, |
| { |
| "epoch": 2.586206896551724, |
| "grad_norm": 0.06905154883861542, |
| "learning_rate": 2.906340895869692e-06, |
| "loss": 0.0241, |
| "step": 7800 |
| }, |
| { |
| "epoch": 2.602785145888594, |
| "grad_norm": 0.00501677393913269, |
| "learning_rate": 2.78999418266434e-06, |
| "loss": 0.0073, |
| "step": 7850 |
| }, |
| { |
| "epoch": 2.6193633952254642, |
| "grad_norm": 0.04552486911416054, |
| "learning_rate": 2.673647469458988e-06, |
| "loss": 0.0064, |
| "step": 7900 |
| }, |
| { |
| "epoch": 2.635941644562334, |
| "grad_norm": 0.04517211765050888, |
| "learning_rate": 2.5573007562536362e-06, |
| "loss": 0.0223, |
| "step": 7950 |
| }, |
| { |
| "epoch": 2.6525198938992043, |
| "grad_norm": 0.008471992798149586, |
| "learning_rate": 2.440954043048284e-06, |
| "loss": 0.0241, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.6525198938992043, |
| "eval_accuracy": 0.9708661417322835, |
| "eval_loss": 0.1622270792722702, |
| "eval_runtime": 8.3014, |
| "eval_samples_per_second": 305.972, |
| "eval_steps_per_second": 9.637, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.6690981432360745, |
| "grad_norm": 0.169709712266922, |
| "learning_rate": 2.3246073298429324e-06, |
| "loss": 0.0124, |
| "step": 8050 |
| }, |
| { |
| "epoch": 2.6856763925729443, |
| "grad_norm": 1.3567891120910645, |
| "learning_rate": 2.2082606166375803e-06, |
| "loss": 0.0065, |
| "step": 8100 |
| }, |
| { |
| "epoch": 2.702254641909814, |
| "grad_norm": 18.267772674560547, |
| "learning_rate": 2.091913903432228e-06, |
| "loss": 0.0129, |
| "step": 8150 |
| }, |
| { |
| "epoch": 2.7188328912466844, |
| "grad_norm": 0.012821214273571968, |
| "learning_rate": 1.9755671902268765e-06, |
| "loss": 0.0071, |
| "step": 8200 |
| }, |
| { |
| "epoch": 2.7354111405835546, |
| "grad_norm": 0.004050163086503744, |
| "learning_rate": 1.8592204770215244e-06, |
| "loss": 0.0136, |
| "step": 8250 |
| }, |
| { |
| "epoch": 2.7519893899204244, |
| "grad_norm": 0.008309832774102688, |
| "learning_rate": 1.7428737638161724e-06, |
| "loss": 0.0136, |
| "step": 8300 |
| }, |
| { |
| "epoch": 2.768567639257294, |
| "grad_norm": 0.038879744708538055, |
| "learning_rate": 1.6265270506108205e-06, |
| "loss": 0.0036, |
| "step": 8350 |
| }, |
| { |
| "epoch": 2.7851458885941645, |
| "grad_norm": 0.045335281640291214, |
| "learning_rate": 1.5101803374054686e-06, |
| "loss": 0.0089, |
| "step": 8400 |
| }, |
| { |
| "epoch": 2.8017241379310347, |
| "grad_norm": 0.09732075035572052, |
| "learning_rate": 1.3938336242001165e-06, |
| "loss": 0.0002, |
| "step": 8450 |
| }, |
| { |
| "epoch": 2.8183023872679045, |
| "grad_norm": 0.004367977846413851, |
| "learning_rate": 1.2774869109947646e-06, |
| "loss": 0.0112, |
| "step": 8500 |
| }, |
| { |
| "epoch": 2.8183023872679045, |
| "eval_accuracy": 0.9751968503937007, |
| "eval_loss": 0.15158241987228394, |
| "eval_runtime": 8.4462, |
| "eval_samples_per_second": 300.725, |
| "eval_steps_per_second": 9.472, |
| "step": 8500 |
| }, |
| { |
| "epoch": 2.8348806366047743, |
| "grad_norm": 0.003420003689825535, |
| "learning_rate": 1.1611401977894125e-06, |
| "loss": 0.0128, |
| "step": 8550 |
| }, |
| { |
| "epoch": 2.8514588859416445, |
| "grad_norm": 0.020186182111501694, |
| "learning_rate": 1.0447934845840606e-06, |
| "loss": 0.0188, |
| "step": 8600 |
| }, |
| { |
| "epoch": 2.868037135278515, |
| "grad_norm": 0.021418306976556778, |
| "learning_rate": 9.284467713787087e-07, |
| "loss": 0.0217, |
| "step": 8650 |
| }, |
| { |
| "epoch": 2.8846153846153846, |
| "grad_norm": 0.0053920382633805275, |
| "learning_rate": 8.121000581733566e-07, |
| "loss": 0.0141, |
| "step": 8700 |
| }, |
| { |
| "epoch": 2.9011936339522544, |
| "grad_norm": 0.019735030829906464, |
| "learning_rate": 6.957533449680047e-07, |
| "loss": 0.0268, |
| "step": 8750 |
| }, |
| { |
| "epoch": 2.9177718832891246, |
| "grad_norm": 0.027690809220075607, |
| "learning_rate": 5.794066317626527e-07, |
| "loss": 0.0231, |
| "step": 8800 |
| }, |
| { |
| "epoch": 2.934350132625995, |
| "grad_norm": 1.5282634496688843, |
| "learning_rate": 4.6305991855730076e-07, |
| "loss": 0.0094, |
| "step": 8850 |
| }, |
| { |
| "epoch": 2.9509283819628647, |
| "grad_norm": 0.007764532696455717, |
| "learning_rate": 3.4671320535194885e-07, |
| "loss": 0.0053, |
| "step": 8900 |
| }, |
| { |
| "epoch": 2.967506631299735, |
| "grad_norm": 0.03694835305213928, |
| "learning_rate": 2.3036649214659686e-07, |
| "loss": 0.0089, |
| "step": 8950 |
| }, |
| { |
| "epoch": 2.9840848806366047, |
| "grad_norm": 0.009050080552697182, |
| "learning_rate": 1.1401977894124491e-07, |
| "loss": 0.0075, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.9840848806366047, |
| "eval_accuracy": 0.9748031496062992, |
| "eval_loss": 0.15235309302806854, |
| "eval_runtime": 8.4365, |
| "eval_samples_per_second": 301.073, |
| "eval_steps_per_second": 9.483, |
| "step": 9000 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 9048, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.3327379956546404e+19, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|