| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 202, |
| "global_step": 202, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.009900990099009901, |
| "grad_norm": 152.50006103515625, |
| "learning_rate": 0.0, |
| "loss": 15.6875, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.019801980198019802, |
| "grad_norm": 155.11602783203125, |
| "learning_rate": 1.8181818181818182e-05, |
| "loss": 15.8984, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.0297029702970297, |
| "grad_norm": 39.620399475097656, |
| "learning_rate": 3.6363636363636364e-05, |
| "loss": 11.7891, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.039603960396039604, |
| "grad_norm": 481.17474365234375, |
| "learning_rate": 5.4545454545454546e-05, |
| "loss": 12.0391, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.04950495049504951, |
| "grad_norm": 45.559226989746094, |
| "learning_rate": 7.272727272727273e-05, |
| "loss": 13.8828, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.0594059405940594, |
| "grad_norm": 65.59961700439453, |
| "learning_rate": 9.090909090909092e-05, |
| "loss": 16.6641, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.06930693069306931, |
| "grad_norm": 33.685943603515625, |
| "learning_rate": 0.00010909090909090909, |
| "loss": 15.5469, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.07920792079207921, |
| "grad_norm": 73.23568725585938, |
| "learning_rate": 0.00012727272727272728, |
| "loss": 19.2344, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.0891089108910891, |
| "grad_norm": 26.491865158081055, |
| "learning_rate": 0.00014545454545454546, |
| "loss": 14.6875, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.09900990099009901, |
| "grad_norm": 29.270771026611328, |
| "learning_rate": 0.00016363636363636366, |
| "loss": 13.5625, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.10891089108910891, |
| "grad_norm": 22.94576644897461, |
| "learning_rate": 0.00018181818181818183, |
| "loss": 13.8359, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.1188118811881188, |
| "grad_norm": 19.714340209960938, |
| "learning_rate": 0.0002, |
| "loss": 13.4453, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.12871287128712872, |
| "grad_norm": 29.34218406677246, |
| "learning_rate": 0.00019998782593171394, |
| "loss": 14.1328, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.13861386138613863, |
| "grad_norm": 15.325265884399414, |
| "learning_rate": 0.0001999513070203655, |
| "loss": 14.1484, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.1485148514851485, |
| "grad_norm": 15.70521354675293, |
| "learning_rate": 0.00019989045314559295, |
| "loss": 13.1875, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.15841584158415842, |
| "grad_norm": 13.77431869506836, |
| "learning_rate": 0.00019980528077049017, |
| "loss": 12.2109, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.16831683168316833, |
| "grad_norm": 48.380741119384766, |
| "learning_rate": 0.00019969581293715296, |
| "loss": 12.6484, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.1782178217821782, |
| "grad_norm": 13.91443157196045, |
| "learning_rate": 0.00019956207926044532, |
| "loss": 12.6172, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.18811881188118812, |
| "grad_norm": 9.085782051086426, |
| "learning_rate": 0.00019940411591998748, |
| "loss": 11.7734, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.19801980198019803, |
| "grad_norm": 8.792189598083496, |
| "learning_rate": 0.0001992219656503682, |
| "loss": 11.2812, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.2079207920792079, |
| "grad_norm": 9.451997756958008, |
| "learning_rate": 0.0001990156777295835, |
| "loss": 10.9844, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.21782178217821782, |
| "grad_norm": 8.6239652633667, |
| "learning_rate": 0.00019878530796570528, |
| "loss": 10.3516, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.22772277227722773, |
| "grad_norm": 9.932726860046387, |
| "learning_rate": 0.00019853091868178316, |
| "loss": 10.2031, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.2376237623762376, |
| "grad_norm": 5.901266098022461, |
| "learning_rate": 0.00019825257869898418, |
| "loss": 9.9297, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.24752475247524752, |
| "grad_norm": 4.901180744171143, |
| "learning_rate": 0.00019795036331797405, |
| "loss": 9.5938, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.25742574257425743, |
| "grad_norm": 4.704899311065674, |
| "learning_rate": 0.00019762435429854577, |
| "loss": 9.5703, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.26732673267326734, |
| "grad_norm": 4.755985736846924, |
| "learning_rate": 0.00019727463983750077, |
| "loss": 9.5312, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.27722772277227725, |
| "grad_norm": 5.313395023345947, |
| "learning_rate": 0.00019690131454478858, |
| "loss": 9.3672, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.2871287128712871, |
| "grad_norm": 6.608179092407227, |
| "learning_rate": 0.00019650447941791155, |
| "loss": 9.6875, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.297029702970297, |
| "grad_norm": 7.129603385925293, |
| "learning_rate": 0.0001960842418146016, |
| "loss": 9.6016, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.3069306930693069, |
| "grad_norm": 1075.1676025390625, |
| "learning_rate": 0.00019564071542377596, |
| "loss": 18.0156, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.31683168316831684, |
| "grad_norm": 12.196839332580566, |
| "learning_rate": 0.00019517402023478062, |
| "loss": 10.8125, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.32673267326732675, |
| "grad_norm": 6.137757301330566, |
| "learning_rate": 0.0001946842825049289, |
| "loss": 10.0625, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.33663366336633666, |
| "grad_norm": 9.860557556152344, |
| "learning_rate": 0.00019417163472534456, |
| "loss": 10.1719, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.3465346534653465, |
| "grad_norm": 9.4225435256958, |
| "learning_rate": 0.00019363621558511805, |
| "loss": 8.7197, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.3564356435643564, |
| "grad_norm": 4.244920253753662, |
| "learning_rate": 0.00019307816993378662, |
| "loss": 9.5391, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.36633663366336633, |
| "grad_norm": 4.227206707000732, |
| "learning_rate": 0.00019249764874214732, |
| "loss": 9.3281, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.37623762376237624, |
| "grad_norm": 3.867461681365967, |
| "learning_rate": 0.00019189480906141413, |
| "loss": 9.0625, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.38613861386138615, |
| "grad_norm": 5.17470121383667, |
| "learning_rate": 0.00019126981398073008, |
| "loss": 9.1094, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.39603960396039606, |
| "grad_norm": 5.798826694488525, |
| "learning_rate": 0.00019062283258304612, |
| "loss": 9.3359, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.40594059405940597, |
| "grad_norm": 4.38295841217041, |
| "learning_rate": 0.0001899540398993781, |
| "loss": 9.4219, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.4158415841584158, |
| "grad_norm": 4.414400577545166, |
| "learning_rate": 0.00018926361686145494, |
| "loss": 9.1016, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.42574257425742573, |
| "grad_norm": 4.6771559715271, |
| "learning_rate": 0.00018855175025277022, |
| "loss": 8.9219, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.43564356435643564, |
| "grad_norm": 4.436445713043213, |
| "learning_rate": 0.00018781863265805065, |
| "loss": 9.1562, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.44554455445544555, |
| "grad_norm": 3.0445127487182617, |
| "learning_rate": 0.00018706446241115537, |
| "loss": 9.0, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.45544554455445546, |
| "grad_norm": 13.921093940734863, |
| "learning_rate": 0.00018628944354141962, |
| "loss": 8.6484, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.46534653465346537, |
| "grad_norm": 7.534108638763428, |
| "learning_rate": 0.00018549378571845767, |
| "loss": 9.7734, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.4752475247524752, |
| "grad_norm": 3.2327933311462402, |
| "learning_rate": 0.00018467770419543998, |
| "loss": 9.375, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.48514851485148514, |
| "grad_norm": 4.523435115814209, |
| "learning_rate": 0.00018384141975085952, |
| "loss": 9.1406, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.49504950495049505, |
| "grad_norm": 83.07750701904297, |
| "learning_rate": 0.00018298515862880387, |
| "loss": 8.9531, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.504950495049505, |
| "grad_norm": 6.1501641273498535, |
| "learning_rate": 0.00018210915247774784, |
| "loss": 10.0547, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.5148514851485149, |
| "grad_norm": 3.1539700031280518, |
| "learning_rate": 0.00018121363828788484, |
| "loss": 9.3359, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.5247524752475248, |
| "grad_norm": 3.68802809715271, |
| "learning_rate": 0.0001802988583270126, |
| "loss": 9.1328, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.5346534653465347, |
| "grad_norm": 3.102386713027954, |
| "learning_rate": 0.00017936506007499136, |
| "loss": 8.9609, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.5445544554455446, |
| "grad_norm": 3.1378631591796875, |
| "learning_rate": 0.00017841249615679192, |
| "loss": 8.7891, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.5544554455445545, |
| "grad_norm": 3.207996129989624, |
| "learning_rate": 0.00017744142427415172, |
| "loss": 8.75, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.5643564356435643, |
| "grad_norm": 4.185523509979248, |
| "learning_rate": 0.0001764521071358577, |
| "loss": 8.5781, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.5742574257425742, |
| "grad_norm": 2.66141676902771, |
| "learning_rate": 0.00017544481238667426, |
| "loss": 8.6172, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.5841584158415841, |
| "grad_norm": 4.6184258460998535, |
| "learning_rate": 0.00017441981253493607, |
| "loss": 8.625, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.594059405940594, |
| "grad_norm": 114.2392578125, |
| "learning_rate": 0.00017337738487882508, |
| "loss": 13.5, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.6039603960396039, |
| "grad_norm": 6.742166996002197, |
| "learning_rate": 0.00017231781143135173, |
| "loss": 9.3594, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.6138613861386139, |
| "grad_norm": 2.791332483291626, |
| "learning_rate": 0.00017124137884406054, |
| "loss": 8.9375, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.6237623762376238, |
| "grad_norm": 7.167698383331299, |
| "learning_rate": 0.00017014837832948087, |
| "loss": 8.6094, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.6336633663366337, |
| "grad_norm": 4.072234153747559, |
| "learning_rate": 0.00016903910558234393, |
| "loss": 8.9688, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.6435643564356436, |
| "grad_norm": 4.949003219604492, |
| "learning_rate": 0.0001679138606995868, |
| "loss": 8.7266, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.6534653465346535, |
| "grad_norm": 70.55978393554688, |
| "learning_rate": 0.00016677294809916597, |
| "loss": 8.9453, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.6633663366336634, |
| "grad_norm": 4.001098155975342, |
| "learning_rate": 0.0001656166764377016, |
| "loss": 8.9453, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.6732673267326733, |
| "grad_norm": 3.2970945835113525, |
| "learning_rate": 0.00016444535852697508, |
| "loss": 8.7578, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.6831683168316832, |
| "grad_norm": 7.960860252380371, |
| "learning_rate": 0.00016325931124930266, |
| "loss": 8.4883, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.693069306930693, |
| "grad_norm": 4.774203300476074, |
| "learning_rate": 0.00016205885547180762, |
| "loss": 9.0859, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.7029702970297029, |
| "grad_norm": 2.7817211151123047, |
| "learning_rate": 0.00016084431595961464, |
| "loss": 9.0234, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.7128712871287128, |
| "grad_norm": 3.5456042289733887, |
| "learning_rate": 0.0001596160212879894, |
| "loss": 8.5508, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.7227722772277227, |
| "grad_norm": 2.8811049461364746, |
| "learning_rate": 0.00015837430375344766, |
| "loss": 8.3203, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.7326732673267327, |
| "grad_norm": 2.634841203689575, |
| "learning_rate": 0.00015711949928385742, |
| "loss": 8.2539, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.7425742574257426, |
| "grad_norm": 2.80352520942688, |
| "learning_rate": 0.0001558519473475585, |
| "loss": 8.3594, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.7524752475247525, |
| "grad_norm": 2.983501434326172, |
| "learning_rate": 0.000154571990861525, |
| "loss": 8.3555, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.7623762376237624, |
| "grad_norm": 2.525681972503662, |
| "learning_rate": 0.00015327997609859386, |
| "loss": 8.0117, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.7722772277227723, |
| "grad_norm": 2.6573147773742676, |
| "learning_rate": 0.0001519762525937862, |
| "loss": 8.1836, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.7821782178217822, |
| "grad_norm": 3.265751361846924, |
| "learning_rate": 0.0001506611730497459, |
| "loss": 8.1836, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.7920792079207921, |
| "grad_norm": 3.280087947845459, |
| "learning_rate": 0.00014933509324132116, |
| "loss": 8.2617, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.801980198019802, |
| "grad_norm": 2.2106235027313232, |
| "learning_rate": 0.0001479983719193151, |
| "loss": 8.0078, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.8118811881188119, |
| "grad_norm": 2.518003225326538, |
| "learning_rate": 0.0001466513707134312, |
| "loss": 8.0078, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.8217821782178217, |
| "grad_norm": 38.51225280761719, |
| "learning_rate": 0.00014529445403443965, |
| "loss": 7.7637, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.8316831683168316, |
| "grad_norm": 5.821120738983154, |
| "learning_rate": 0.0001439279889755918, |
| "loss": 8.375, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.8415841584158416, |
| "grad_norm": 3.276437282562256, |
| "learning_rate": 0.0001425523452133084, |
| "loss": 8.7188, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.8514851485148515, |
| "grad_norm": 6.970194339752197, |
| "learning_rate": 0.00014116789490716954, |
| "loss": 8.5781, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.8613861386138614, |
| "grad_norm": 94.84854125976562, |
| "learning_rate": 0.00013977501259923215, |
| "loss": 23.9531, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.8712871287128713, |
| "grad_norm": 4.508042335510254, |
| "learning_rate": 0.00013837407511270365, |
| "loss": 9.0234, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.8811881188118812, |
| "grad_norm": 2.887805223464966, |
| "learning_rate": 0.00013696546144999786, |
| "loss": 8.375, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.8910891089108911, |
| "grad_norm": 2.4554858207702637, |
| "learning_rate": 0.00013554955269020195, |
| "loss": 8.3242, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.900990099009901, |
| "grad_norm": 2.6604201793670654, |
| "learning_rate": 0.00013412673188598092, |
| "loss": 8.207, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.9108910891089109, |
| "grad_norm": 3.3090832233428955, |
| "learning_rate": 0.00013269738395994883, |
| "loss": 8.2266, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.9207920792079208, |
| "grad_norm": 2.517388105392456, |
| "learning_rate": 0.00013126189560053352, |
| "loss": 8.1797, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.9306930693069307, |
| "grad_norm": 2.1608598232269287, |
| "learning_rate": 0.00012982065515736418, |
| "loss": 7.9102, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.9405940594059405, |
| "grad_norm": 2.3422179222106934, |
| "learning_rate": 0.00012837405253620905, |
| "loss": 7.6211, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.9504950495049505, |
| "grad_norm": 2.019343852996826, |
| "learning_rate": 0.00012692247909349243, |
| "loss": 7.8633, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.9603960396039604, |
| "grad_norm": 2.547525405883789, |
| "learning_rate": 0.00012546632753041904, |
| "loss": 7.5391, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.9702970297029703, |
| "grad_norm": 2.123288869857788, |
| "learning_rate": 0.00012400599178673483, |
| "loss": 7.6953, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.9801980198019802, |
| "grad_norm": 2.303816080093384, |
| "learning_rate": 0.00012254186693415222, |
| "loss": 7.8242, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.9900990099009901, |
| "grad_norm": 2.4239020347595215, |
| "learning_rate": 0.00012107434906946982, |
| "loss": 7.5977, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 2.0720744132995605, |
| "learning_rate": 0.00011960383520741419, |
| "loss": 7.3789, |
| "step": 101 |
| }, |
| { |
| "checkpoint_runtime": 117.0071 |
| }, |
| { |
| "epoch": 1.00990099009901, |
| "grad_norm": 2.3910915851593018, |
| "learning_rate": 0.0001181307231732336, |
| "loss": 6.1016, |
| "step": 102 |
| }, |
| { |
| "epoch": 1.0198019801980198, |
| "grad_norm": 2.539257764816284, |
| "learning_rate": 0.00011665541149507239, |
| "loss": 7.4375, |
| "step": 103 |
| }, |
| { |
| "epoch": 1.0297029702970297, |
| "grad_norm": 2.1717493534088135, |
| "learning_rate": 0.0001151782992961554, |
| "loss": 7.2188, |
| "step": 104 |
| }, |
| { |
| "epoch": 1.0396039603960396, |
| "grad_norm": 2.1999051570892334, |
| "learning_rate": 0.0001136997861868109, |
| "loss": 7.7773, |
| "step": 105 |
| }, |
| { |
| "epoch": 1.0495049504950495, |
| "grad_norm": 20.015344619750977, |
| "learning_rate": 0.00011222027215636235, |
| "loss": 13.418, |
| "step": 106 |
| }, |
| { |
| "epoch": 1.0594059405940595, |
| "grad_norm": 3.3290457725524902, |
| "learning_rate": 0.00011074015746491722, |
| "loss": 7.6797, |
| "step": 107 |
| }, |
| { |
| "epoch": 1.0693069306930694, |
| "grad_norm": 2.3182663917541504, |
| "learning_rate": 0.00010925984253508279, |
| "loss": 7.4141, |
| "step": 108 |
| }, |
| { |
| "epoch": 1.0792079207920793, |
| "grad_norm": 2.7846322059631348, |
| "learning_rate": 0.00010777972784363765, |
| "loss": 7.5781, |
| "step": 109 |
| }, |
| { |
| "epoch": 1.0891089108910892, |
| "grad_norm": 3.0692903995513916, |
| "learning_rate": 0.00010630021381318915, |
| "loss": 7.207, |
| "step": 110 |
| }, |
| { |
| "epoch": 1.099009900990099, |
| "grad_norm": 2.5749456882476807, |
| "learning_rate": 0.0001048217007038446, |
| "loss": 6.8672, |
| "step": 111 |
| }, |
| { |
| "epoch": 1.108910891089109, |
| "grad_norm": 2.473937511444092, |
| "learning_rate": 0.00010334458850492763, |
| "loss": 6.7148, |
| "step": 112 |
| }, |
| { |
| "epoch": 1.118811881188119, |
| "grad_norm": 2.3984756469726562, |
| "learning_rate": 0.00010186927682676646, |
| "loss": 6.6914, |
| "step": 113 |
| }, |
| { |
| "epoch": 1.1287128712871288, |
| "grad_norm": 2.423884391784668, |
| "learning_rate": 0.00010039616479258587, |
| "loss": 7.0469, |
| "step": 114 |
| }, |
| { |
| "epoch": 1.1386138613861387, |
| "grad_norm": 3.279681921005249, |
| "learning_rate": 9.892565093053024e-05, |
| "loss": 6.5078, |
| "step": 115 |
| }, |
| { |
| "epoch": 1.1485148514851484, |
| "grad_norm": 3.0356831550598145, |
| "learning_rate": 9.745813306584781e-05, |
| "loss": 6.5273, |
| "step": 116 |
| }, |
| { |
| "epoch": 1.1584158415841583, |
| "grad_norm": 2.747650146484375, |
| "learning_rate": 9.599400821326521e-05, |
| "loss": 6.4219, |
| "step": 117 |
| }, |
| { |
| "epoch": 1.1683168316831682, |
| "grad_norm": 2.4899802207946777, |
| "learning_rate": 9.453367246958095e-05, |
| "loss": 7.0586, |
| "step": 118 |
| }, |
| { |
| "epoch": 1.1782178217821782, |
| "grad_norm": 2.7331626415252686, |
| "learning_rate": 9.307752090650759e-05, |
| "loss": 6.3047, |
| "step": 119 |
| }, |
| { |
| "epoch": 1.188118811881188, |
| "grad_norm": 2.377115488052368, |
| "learning_rate": 9.162594746379097e-05, |
| "loss": 5.9258, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.198019801980198, |
| "grad_norm": 2.8060243129730225, |
| "learning_rate": 9.017934484263584e-05, |
| "loss": 5.6406, |
| "step": 121 |
| }, |
| { |
| "epoch": 1.2079207920792079, |
| "grad_norm": 2.350632429122925, |
| "learning_rate": 8.873810439946648e-05, |
| "loss": 5.5859, |
| "step": 122 |
| }, |
| { |
| "epoch": 1.2178217821782178, |
| "grad_norm": 2.519296169281006, |
| "learning_rate": 8.73026160400512e-05, |
| "loss": 5.1758, |
| "step": 123 |
| }, |
| { |
| "epoch": 1.2277227722772277, |
| "grad_norm": 2.0400750637054443, |
| "learning_rate": 8.58732681140191e-05, |
| "loss": 5.5195, |
| "step": 124 |
| }, |
| { |
| "epoch": 1.2376237623762376, |
| "grad_norm": 1.9192492961883545, |
| "learning_rate": 8.445044730979808e-05, |
| "loss": 4.6875, |
| "step": 125 |
| }, |
| { |
| "epoch": 1.2475247524752475, |
| "grad_norm": 2.3437716960906982, |
| "learning_rate": 8.303453855000217e-05, |
| "loss": 4.4492, |
| "step": 126 |
| }, |
| { |
| "epoch": 1.2574257425742574, |
| "grad_norm": 2.027672052383423, |
| "learning_rate": 8.162592488729637e-05, |
| "loss": 4.3867, |
| "step": 127 |
| }, |
| { |
| "epoch": 1.2673267326732673, |
| "grad_norm": 2.480762243270874, |
| "learning_rate": 8.022498740076788e-05, |
| "loss": 4.582, |
| "step": 128 |
| }, |
| { |
| "epoch": 1.2772277227722773, |
| "grad_norm": 2.1822168827056885, |
| "learning_rate": 7.883210509283048e-05, |
| "loss": 4.6445, |
| "step": 129 |
| }, |
| { |
| "epoch": 1.2871287128712872, |
| "grad_norm": 2.1713345050811768, |
| "learning_rate": 7.74476547866916e-05, |
| "loss": 4.8281, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.297029702970297, |
| "grad_norm": 2.949915885925293, |
| "learning_rate": 7.607201102440822e-05, |
| "loss": 4.8945, |
| "step": 131 |
| }, |
| { |
| "epoch": 1.306930693069307, |
| "grad_norm": 5.816825866699219, |
| "learning_rate": 7.470554596556038e-05, |
| "loss": 6.4536, |
| "step": 132 |
| }, |
| { |
| "epoch": 1.316831683168317, |
| "grad_norm": 2.8560147285461426, |
| "learning_rate": 7.334862928656882e-05, |
| "loss": 6.2148, |
| "step": 133 |
| }, |
| { |
| "epoch": 1.3267326732673268, |
| "grad_norm": 1.9923173189163208, |
| "learning_rate": 7.200162808068492e-05, |
| "loss": 4.8516, |
| "step": 134 |
| }, |
| { |
| "epoch": 1.3366336633663367, |
| "grad_norm": 3.521530866622925, |
| "learning_rate": 7.06649067586789e-05, |
| "loss": 5.7188, |
| "step": 135 |
| }, |
| { |
| "epoch": 1.3465346534653464, |
| "grad_norm": 32.16452407836914, |
| "learning_rate": 6.933882695025414e-05, |
| "loss": 6.3516, |
| "step": 136 |
| }, |
| { |
| "epoch": 1.3564356435643563, |
| "grad_norm": 2.6356420516967773, |
| "learning_rate": 6.802374740621382e-05, |
| "loss": 4.4414, |
| "step": 137 |
| }, |
| { |
| "epoch": 1.3663366336633662, |
| "grad_norm": 1.9716286659240723, |
| "learning_rate": 6.672002390140615e-05, |
| "loss": 4.2656, |
| "step": 138 |
| }, |
| { |
| "epoch": 1.3762376237623761, |
| "grad_norm": 1.7040444612503052, |
| "learning_rate": 6.542800913847502e-05, |
| "loss": 3.9785, |
| "step": 139 |
| }, |
| { |
| "epoch": 1.386138613861386, |
| "grad_norm": 2.3597848415374756, |
| "learning_rate": 6.414805265244153e-05, |
| "loss": 4.4141, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.396039603960396, |
| "grad_norm": 2.488919734954834, |
| "learning_rate": 6.288050071614263e-05, |
| "loss": 4.8203, |
| "step": 141 |
| }, |
| { |
| "epoch": 1.4059405940594059, |
| "grad_norm": 2.223335027694702, |
| "learning_rate": 6.162569624655235e-05, |
| "loss": 4.1738, |
| "step": 142 |
| }, |
| { |
| "epoch": 1.4158415841584158, |
| "grad_norm": 1.9648133516311646, |
| "learning_rate": 6.0383978712010627e-05, |
| "loss": 4.0176, |
| "step": 143 |
| }, |
| { |
| "epoch": 1.4257425742574257, |
| "grad_norm": 2.9666850566864014, |
| "learning_rate": 5.9155684040385396e-05, |
| "loss": 4.4258, |
| "step": 144 |
| }, |
| { |
| "epoch": 1.4356435643564356, |
| "grad_norm": 2.051849603652954, |
| "learning_rate": 5.794114452819239e-05, |
| "loss": 4.0508, |
| "step": 145 |
| }, |
| { |
| "epoch": 1.4455445544554455, |
| "grad_norm": 1.7934975624084473, |
| "learning_rate": 5.674068875069736e-05, |
| "loss": 3.6016, |
| "step": 146 |
| }, |
| { |
| "epoch": 1.4554455445544554, |
| "grad_norm": 4.365362167358398, |
| "learning_rate": 5.5554641473024916e-05, |
| "loss": 6.043, |
| "step": 147 |
| }, |
| { |
| "epoch": 1.4653465346534653, |
| "grad_norm": 2.678313732147217, |
| "learning_rate": 5.4383323562298405e-05, |
| "loss": 5.1289, |
| "step": 148 |
| }, |
| { |
| "epoch": 1.4752475247524752, |
| "grad_norm": 1.8244177103042603, |
| "learning_rate": 5.3227051900834024e-05, |
| "loss": 3.7969, |
| "step": 149 |
| }, |
| { |
| "epoch": 1.4851485148514851, |
| "grad_norm": 1.8080298900604248, |
| "learning_rate": 5.208613930041323e-05, |
| "loss": 4.1211, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.495049504950495, |
| "grad_norm": 50.39912414550781, |
| "learning_rate": 5.09608944176561e-05, |
| "loss": 9.9336, |
| "step": 151 |
| }, |
| { |
| "epoch": 1.504950495049505, |
| "grad_norm": 2.4307775497436523, |
| "learning_rate": 4.985162167051917e-05, |
| "loss": 4.9141, |
| "step": 152 |
| }, |
| { |
| "epoch": 1.5148514851485149, |
| "grad_norm": 1.858076572418213, |
| "learning_rate": 4.875862115593951e-05, |
| "loss": 3.7559, |
| "step": 153 |
| }, |
| { |
| "epoch": 1.5247524752475248, |
| "grad_norm": 1.654653787612915, |
| "learning_rate": 4.7682188568648294e-05, |
| "loss": 3.8965, |
| "step": 154 |
| }, |
| { |
| "epoch": 1.5346534653465347, |
| "grad_norm": 1.9130183458328247, |
| "learning_rate": 4.662261512117496e-05, |
| "loss": 3.5098, |
| "step": 155 |
| }, |
| { |
| "epoch": 1.5445544554455446, |
| "grad_norm": 2.3735601902008057, |
| "learning_rate": 4.558018746506396e-05, |
| "loss": 3.4219, |
| "step": 156 |
| }, |
| { |
| "epoch": 1.5544554455445545, |
| "grad_norm": 1.6245256662368774, |
| "learning_rate": 4.455518761332576e-05, |
| "loss": 3.4375, |
| "step": 157 |
| }, |
| { |
| "epoch": 1.5643564356435644, |
| "grad_norm": 1.7158585786819458, |
| "learning_rate": 4.35478928641423e-05, |
| "loss": 4.0195, |
| "step": 158 |
| }, |
| { |
| "epoch": 1.5742574257425743, |
| "grad_norm": 1.537367343902588, |
| "learning_rate": 4.25585757258483e-05, |
| "loss": 3.1113, |
| "step": 159 |
| }, |
| { |
| "epoch": 1.5841584158415842, |
| "grad_norm": 2.29418683052063, |
| "learning_rate": 4.158750384320811e-05, |
| "loss": 3.9023, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.5940594059405941, |
| "grad_norm": 14.955975532531738, |
| "learning_rate": 4.063493992500865e-05, |
| "loss": 7.5469, |
| "step": 161 |
| }, |
| { |
| "epoch": 1.603960396039604, |
| "grad_norm": 2.4816298484802246, |
| "learning_rate": 3.9701141672987406e-05, |
| "loss": 4.8594, |
| "step": 162 |
| }, |
| { |
| "epoch": 1.613861386138614, |
| "grad_norm": 1.8471182584762573, |
| "learning_rate": 3.878636171211518e-05, |
| "loss": 3.4258, |
| "step": 163 |
| }, |
| { |
| "epoch": 1.6237623762376239, |
| "grad_norm": 1.9999842643737793, |
| "learning_rate": 3.789084752225219e-05, |
| "loss": 4.6172, |
| "step": 164 |
| }, |
| { |
| "epoch": 1.6336633663366338, |
| "grad_norm": 1.5966260433197021, |
| "learning_rate": 3.701484137119613e-05, |
| "loss": 3.7656, |
| "step": 165 |
| }, |
| { |
| "epoch": 1.6435643564356437, |
| "grad_norm": 1.9665318727493286, |
| "learning_rate": 3.615858024914048e-05, |
| "loss": 3.8867, |
| "step": 166 |
| }, |
| { |
| "epoch": 1.6534653465346536, |
| "grad_norm": 39.52935791015625, |
| "learning_rate": 3.532229580456006e-05, |
| "loss": 7.5195, |
| "step": 167 |
| }, |
| { |
| "epoch": 1.6633663366336635, |
| "grad_norm": 2.001122236251831, |
| "learning_rate": 3.450621428154235e-05, |
| "loss": 4.2148, |
| "step": 168 |
| }, |
| { |
| "epoch": 1.6732673267326734, |
| "grad_norm": 1.9016386270523071, |
| "learning_rate": 3.371055645858038e-05, |
| "loss": 3.6035, |
| "step": 169 |
| }, |
| { |
| "epoch": 1.6831683168316833, |
| "grad_norm": 8.8970947265625, |
| "learning_rate": 3.2935537588844645e-05, |
| "loss": 5.6367, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.693069306930693, |
| "grad_norm": 1.5331684350967407, |
| "learning_rate": 3.218136734194936e-05, |
| "loss": 4.3047, |
| "step": 171 |
| }, |
| { |
| "epoch": 1.702970297029703, |
| "grad_norm": 1.695820927619934, |
| "learning_rate": 3.144824974722983e-05, |
| "loss": 3.3164, |
| "step": 172 |
| }, |
| { |
| "epoch": 1.7128712871287128, |
| "grad_norm": 7.624035835266113, |
| "learning_rate": 3.0736383138545086e-05, |
| "loss": 3.7305, |
| "step": 173 |
| }, |
| { |
| "epoch": 1.7227722772277227, |
| "grad_norm": 1.5859265327453613, |
| "learning_rate": 3.0045960100621918e-05, |
| "loss": 3.2656, |
| "step": 174 |
| }, |
| { |
| "epoch": 1.7326732673267327, |
| "grad_norm": 1.5074691772460938, |
| "learning_rate": 2.937716741695392e-05, |
| "loss": 3.1797, |
| "step": 175 |
| }, |
| { |
| "epoch": 1.7425742574257426, |
| "grad_norm": 1.3923929929733276, |
| "learning_rate": 2.873018601926994e-05, |
| "loss": 3.1094, |
| "step": 176 |
| }, |
| { |
| "epoch": 1.7524752475247525, |
| "grad_norm": 1.4183502197265625, |
| "learning_rate": 2.8105190938585885e-05, |
| "loss": 3.1641, |
| "step": 177 |
| }, |
| { |
| "epoch": 1.7623762376237624, |
| "grad_norm": 1.5268654823303223, |
| "learning_rate": 2.7502351257852678e-05, |
| "loss": 2.7832, |
| "step": 178 |
| }, |
| { |
| "epoch": 1.7722772277227723, |
| "grad_norm": 1.5342729091644287, |
| "learning_rate": 2.692183006621338e-05, |
| "loss": 2.8652, |
| "step": 179 |
| }, |
| { |
| "epoch": 1.7821782178217822, |
| "grad_norm": 1.6859102249145508, |
| "learning_rate": 2.636378441488197e-05, |
| "loss": 3.1641, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.7920792079207921, |
| "grad_norm": 1.5989638566970825, |
| "learning_rate": 2.5828365274655457e-05, |
| "loss": 3.0234, |
| "step": 181 |
| }, |
| { |
| "epoch": 1.801980198019802, |
| "grad_norm": 1.5343683958053589, |
| "learning_rate": 2.5315717495071078e-05, |
| "loss": 2.5156, |
| "step": 182 |
| }, |
| { |
| "epoch": 1.811881188118812, |
| "grad_norm": 1.508675456047058, |
| "learning_rate": 2.4825979765219392e-05, |
| "loss": 2.6621, |
| "step": 183 |
| }, |
| { |
| "epoch": 1.8217821782178216, |
| "grad_norm": 19.975343704223633, |
| "learning_rate": 2.4359284576224078e-05, |
| "loss": 7.1133, |
| "step": 184 |
| }, |
| { |
| "epoch": 1.8316831683168315, |
| "grad_norm": 3.3649535179138184, |
| "learning_rate": 2.3915758185398423e-05, |
| "loss": 4.6992, |
| "step": 185 |
| }, |
| { |
| "epoch": 1.8415841584158414, |
| "grad_norm": 2.1585795879364014, |
| "learning_rate": 2.3495520582088455e-05, |
| "loss": 3.5879, |
| "step": 186 |
| }, |
| { |
| "epoch": 1.8514851485148514, |
| "grad_norm": 2.5024328231811523, |
| "learning_rate": 2.3098685455211446e-05, |
| "loss": 4.875, |
| "step": 187 |
| }, |
| { |
| "epoch": 1.8613861386138613, |
| "grad_norm": 15.675300598144531, |
| "learning_rate": 2.272536016249926e-05, |
| "loss": 7.0469, |
| "step": 188 |
| }, |
| { |
| "epoch": 1.8712871287128712, |
| "grad_norm": 2.0795493125915527, |
| "learning_rate": 2.2375645701454248e-05, |
| "loss": 4.3086, |
| "step": 189 |
| }, |
| { |
| "epoch": 1.881188118811881, |
| "grad_norm": 1.4968867301940918, |
| "learning_rate": 2.2049636682025957e-05, |
| "loss": 3.1797, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.891089108910891, |
| "grad_norm": 1.6626455783843994, |
| "learning_rate": 2.174742130101581e-05, |
| "loss": 3.2617, |
| "step": 191 |
| }, |
| { |
| "epoch": 1.900990099009901, |
| "grad_norm": 1.4873976707458496, |
| "learning_rate": 2.1469081318216833e-05, |
| "loss": 3.0488, |
| "step": 192 |
| }, |
| { |
| "epoch": 1.9108910891089108, |
| "grad_norm": 1.8592286109924316, |
| "learning_rate": 2.121469203429473e-05, |
| "loss": 3.2988, |
| "step": 193 |
| }, |
| { |
| "epoch": 1.9207920792079207, |
| "grad_norm": 1.742360234260559, |
| "learning_rate": 2.0984322270416487e-05, |
| "loss": 3.0312, |
| "step": 194 |
| }, |
| { |
| "epoch": 1.9306930693069306, |
| "grad_norm": 1.4714621305465698, |
| "learning_rate": 2.077803434963179e-05, |
| "loss": 2.6895, |
| "step": 195 |
| }, |
| { |
| "epoch": 1.9405940594059405, |
| "grad_norm": 1.268083095550537, |
| "learning_rate": 2.059588408001252e-05, |
| "loss": 2.5215, |
| "step": 196 |
| }, |
| { |
| "epoch": 1.9504950495049505, |
| "grad_norm": 1.3318591117858887, |
| "learning_rate": 2.0437920739554686e-05, |
| "loss": 2.543, |
| "step": 197 |
| }, |
| { |
| "epoch": 1.9603960396039604, |
| "grad_norm": 1.3971670866012573, |
| "learning_rate": 2.0304187062847038e-05, |
| "loss": 2.6191, |
| "step": 198 |
| }, |
| { |
| "epoch": 1.9702970297029703, |
| "grad_norm": 1.3031758069992065, |
| "learning_rate": 2.0194719229509842e-05, |
| "loss": 2.418, |
| "step": 199 |
| }, |
| { |
| "epoch": 1.9801980198019802, |
| "grad_norm": 1.3859241008758545, |
| "learning_rate": 2.0109546854407064e-05, |
| "loss": 2.5918, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.99009900990099, |
| "grad_norm": 1.3760048151016235, |
| "learning_rate": 2.0048692979634493e-05, |
| "loss": 2.6914, |
| "step": 201 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 1.2137415409088135, |
| "learning_rate": 2.0012174068286065e-05, |
| "loss": 1.6016, |
| "step": 202 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 2.703822612762451, |
| "eval_runtime": 12.5311, |
| "eval_samples_per_second": 4.309, |
| "eval_steps_per_second": 0.559, |
| "step": 202 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 202, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 101, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.5479892660867564e+18, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|