| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9997283346916599, |
| "eval_steps": 500, |
| "global_step": 920, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0010866612333604998, |
| "grad_norm": 0.4251704808550498, |
| "learning_rate": 2.173913043478261e-06, |
| "loss": 1.4269, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.0054333061668024995, |
| "grad_norm": 0.40984452500537005, |
| "learning_rate": 1.0869565217391305e-05, |
| "loss": 1.3471, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.010866612333604999, |
| "grad_norm": 0.4873634611727503, |
| "learning_rate": 2.173913043478261e-05, |
| "loss": 1.381, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.016299918500407497, |
| "grad_norm": 0.18148535913697195, |
| "learning_rate": 3.260869565217392e-05, |
| "loss": 1.3548, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.021733224667209998, |
| "grad_norm": 0.15940647336910793, |
| "learning_rate": 4.347826086956522e-05, |
| "loss": 1.3268, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.027166530834012496, |
| "grad_norm": 0.1330605316188635, |
| "learning_rate": 5.4347826086956524e-05, |
| "loss": 1.2956, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.032599837000814993, |
| "grad_norm": 0.14643550415935352, |
| "learning_rate": 6.521739130434783e-05, |
| "loss": 1.2584, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.038033143167617495, |
| "grad_norm": 0.13789279323264822, |
| "learning_rate": 7.608695652173914e-05, |
| "loss": 1.2599, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.043466449334419996, |
| "grad_norm": 0.10731768218336359, |
| "learning_rate": 8.695652173913044e-05, |
| "loss": 1.2074, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.0488997555012225, |
| "grad_norm": 0.1057871319222063, |
| "learning_rate": 9.782608695652174e-05, |
| "loss": 1.2533, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.05433306166802499, |
| "grad_norm": 0.07875920723380202, |
| "learning_rate": 0.00010869565217391305, |
| "loss": 1.1827, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.05976636783482749, |
| "grad_norm": 0.07554518893021663, |
| "learning_rate": 0.00011956521739130435, |
| "loss": 1.1878, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.06519967400162999, |
| "grad_norm": 0.07195376087029262, |
| "learning_rate": 0.00013043478260869567, |
| "loss": 1.1919, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.07063298016843249, |
| "grad_norm": 0.07321939595594293, |
| "learning_rate": 0.00014130434782608697, |
| "loss": 1.1605, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.07606628633523499, |
| "grad_norm": 0.07334785242718754, |
| "learning_rate": 0.00015217391304347827, |
| "loss": 1.1549, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.08149959250203749, |
| "grad_norm": 0.070707496165197, |
| "learning_rate": 0.00016304347826086955, |
| "loss": 1.163, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.08693289866883999, |
| "grad_norm": 0.07258864547290272, |
| "learning_rate": 0.00017391304347826088, |
| "loss": 1.1355, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.0923662048356425, |
| "grad_norm": 0.06874465839069938, |
| "learning_rate": 0.00018478260869565218, |
| "loss": 1.1351, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.097799511002445, |
| "grad_norm": 0.07678305279824389, |
| "learning_rate": 0.0001956521739130435, |
| "loss": 1.1059, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.10323281716924748, |
| "grad_norm": 0.0906973594061365, |
| "learning_rate": 0.00019999352191120556, |
| "loss": 1.1375, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.10866612333604998, |
| "grad_norm": 0.09273255388055195, |
| "learning_rate": 0.00019995393663024054, |
| "loss": 1.1329, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.11409942950285248, |
| "grad_norm": 0.08893171944910543, |
| "learning_rate": 0.0001998783792354841, |
| "loss": 1.1251, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.11953273566965499, |
| "grad_norm": 0.09204342908287226, |
| "learning_rate": 0.00019976687691905393, |
| "loss": 1.1294, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.12496604183645749, |
| "grad_norm": 0.08467143673010843, |
| "learning_rate": 0.00019961946980917456, |
| "loss": 1.1278, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.13039934800325997, |
| "grad_norm": 0.08144590635344492, |
| "learning_rate": 0.00019943621095573586, |
| "loss": 1.1154, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.1358326541700625, |
| "grad_norm": 0.07775679298443673, |
| "learning_rate": 0.00019921716631120107, |
| "loss": 1.1338, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.14126596033686498, |
| "grad_norm": 0.07462503464881824, |
| "learning_rate": 0.0001989624147068713, |
| "loss": 1.0908, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.1466992665036675, |
| "grad_norm": 0.10574377628309363, |
| "learning_rate": 0.0001986720478245153, |
| "loss": 1.0958, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.15213257267046998, |
| "grad_norm": 0.08626088216141505, |
| "learning_rate": 0.0001983461701633742, |
| "loss": 1.0946, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.1575658788372725, |
| "grad_norm": 0.08101105375257252, |
| "learning_rate": 0.00019798489900255389, |
| "loss": 1.0921, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.16299918500407498, |
| "grad_norm": 0.07368284055990211, |
| "learning_rate": 0.00019758836435881746, |
| "loss": 1.1197, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.16843249117087747, |
| "grad_norm": 0.08774531529792205, |
| "learning_rate": 0.00019715670893979414, |
| "loss": 1.1245, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.17386579733767998, |
| "grad_norm": 0.08198118641745479, |
| "learning_rate": 0.00019669008809262062, |
| "loss": 1.1441, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.17929910350448247, |
| "grad_norm": 0.07963783890679672, |
| "learning_rate": 0.0001961886697480335, |
| "loss": 1.1265, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.184732409671285, |
| "grad_norm": 0.08389788352824891, |
| "learning_rate": 0.0001956526343599335, |
| "loss": 1.1071, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.19016571583808747, |
| "grad_norm": 0.08052713262866189, |
| "learning_rate": 0.0001950821748404421, |
| "loss": 1.1001, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.19559902200489, |
| "grad_norm": 0.0775857382618487, |
| "learning_rate": 0.00019447749649047542, |
| "loss": 1.1064, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.20103232817169248, |
| "grad_norm": 0.0917075056840311, |
| "learning_rate": 0.0001938388169258587, |
| "loss": 1.0996, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.20646563433849496, |
| "grad_norm": 0.08378602725221357, |
| "learning_rate": 0.00019316636599900946, |
| "loss": 1.1174, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.21189894050529748, |
| "grad_norm": 0.1278123340196638, |
| "learning_rate": 0.00019246038571621657, |
| "loss": 1.0996, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.21733224667209997, |
| "grad_norm": 0.08960771654215978, |
| "learning_rate": 0.00019172113015054532, |
| "loss": 1.1136, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.22276555283890248, |
| "grad_norm": 0.08865525272297263, |
| "learning_rate": 0.00019094886535039982, |
| "loss": 1.0934, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.22819885900570497, |
| "grad_norm": 0.08132686975529266, |
| "learning_rate": 0.00019014386924377582, |
| "loss": 1.0969, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.23363216517250748, |
| "grad_norm": 0.08682177413199785, |
| "learning_rate": 0.00018930643153823777, |
| "loss": 1.1057, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.23906547133930997, |
| "grad_norm": 0.08366750333748893, |
| "learning_rate": 0.00018843685361665723, |
| "loss": 1.0984, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.24449877750611246, |
| "grad_norm": 0.0791018641660297, |
| "learning_rate": 0.00018753544842874887, |
| "loss": 1.124, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.24993208367291497, |
| "grad_norm": 0.08073734798833669, |
| "learning_rate": 0.00018660254037844388, |
| "loss": 1.1086, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.25536538983971746, |
| "grad_norm": 0.07872218202349608, |
| "learning_rate": 0.00018563846520714154, |
| "loss": 1.0975, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.26079869600651995, |
| "grad_norm": 0.07477291972797645, |
| "learning_rate": 0.00018464356987288013, |
| "loss": 1.0953, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.2662320021733225, |
| "grad_norm": 0.07464290895172018, |
| "learning_rate": 0.0001836182124254711, |
| "loss": 1.1009, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.271665308340125, |
| "grad_norm": 0.08337470836636104, |
| "learning_rate": 0.00018256276187764197, |
| "loss": 1.133, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.27709861450692747, |
| "grad_norm": 0.0792698664089276, |
| "learning_rate": 0.0001814775980722332, |
| "loss": 1.0848, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.28253192067372995, |
| "grad_norm": 0.07783493666541991, |
| "learning_rate": 0.00018036311154549784, |
| "loss": 1.0946, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.28796522684053244, |
| "grad_norm": 0.07936535662112612, |
| "learning_rate": 0.00017921970338655266, |
| "loss": 1.0991, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.293398533007335, |
| "grad_norm": 0.08150639145601371, |
| "learning_rate": 0.00017804778509303138, |
| "loss": 1.1032, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.29883183917413747, |
| "grad_norm": 0.07393493455934289, |
| "learning_rate": 0.00017684777842299205, |
| "loss": 1.1061, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.30426514534093996, |
| "grad_norm": 0.07425512123490186, |
| "learning_rate": 0.00017562011524313185, |
| "loss": 1.0974, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.30969845150774244, |
| "grad_norm": 0.08088846247833803, |
| "learning_rate": 0.00017436523737336402, |
| "loss": 1.1057, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.315131757674545, |
| "grad_norm": 0.07594242007128106, |
| "learning_rate": 0.00017308359642781242, |
| "loss": 1.1107, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.3205650638413475, |
| "grad_norm": 0.07364071747814806, |
| "learning_rate": 0.00017177565365228178, |
| "loss": 1.1071, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.32599837000814996, |
| "grad_norm": 0.0750961308431674, |
| "learning_rate": 0.00017044187975826124, |
| "loss": 1.0926, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.33143167617495245, |
| "grad_norm": 0.08124049903595534, |
| "learning_rate": 0.0001690827547535214, |
| "loss": 1.0779, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.33686498234175494, |
| "grad_norm": 0.07415464127126385, |
| "learning_rate": 0.0001676987677693659, |
| "loss": 1.1177, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.3422982885085575, |
| "grad_norm": 0.07932404096302531, |
| "learning_rate": 0.00016629041688459941, |
| "loss": 1.0994, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.34773159467535997, |
| "grad_norm": 0.07494379306548073, |
| "learning_rate": 0.0001648582089462756, |
| "loss": 1.1003, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.35316490084216245, |
| "grad_norm": 0.07450254891711967, |
| "learning_rate": 0.00016340265938728958, |
| "loss": 1.1003, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.35859820700896494, |
| "grad_norm": 0.07502689209720823, |
| "learning_rate": 0.0001619242920408802, |
| "loss": 1.1131, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.36403151317576743, |
| "grad_norm": 0.07340743859135564, |
| "learning_rate": 0.00016042363895210946, |
| "loss": 1.1302, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.36946481934257, |
| "grad_norm": 0.08148683177168727, |
| "learning_rate": 0.00015890124018638638, |
| "loss": 1.0876, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.37489812550937246, |
| "grad_norm": 0.07814855659102214, |
| "learning_rate": 0.0001573576436351046, |
| "loss": 1.1052, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.38033143167617495, |
| "grad_norm": 0.07793395138533396, |
| "learning_rate": 0.00015579340481846336, |
| "loss": 1.0991, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.38576473784297743, |
| "grad_norm": 0.07203762927472582, |
| "learning_rate": 0.00015420908668554298, |
| "loss": 1.1242, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.39119804400978, |
| "grad_norm": 0.08066607050998563, |
| "learning_rate": 0.00015260525941170712, |
| "loss": 1.0598, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.39663135017658246, |
| "grad_norm": 0.07563612425103881, |
| "learning_rate": 0.00015098250019340387, |
| "loss": 1.1073, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.40206465634338495, |
| "grad_norm": 0.08780896217816431, |
| "learning_rate": 0.00014934139304044033, |
| "loss": 1.1077, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.40749796251018744, |
| "grad_norm": 0.075783720101519, |
| "learning_rate": 0.0001476825285658053, |
| "loss": 1.097, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.4129312686769899, |
| "grad_norm": 0.07533667147991636, |
| "learning_rate": 0.00014600650377311522, |
| "loss": 1.0948, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.41836457484379247, |
| "grad_norm": 0.08145283761220191, |
| "learning_rate": 0.00014431392184176042, |
| "loss": 1.0902, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.42379788101059496, |
| "grad_norm": 0.07772240362555147, |
| "learning_rate": 0.00014260539190982886, |
| "loss": 1.0881, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.42923118717739744, |
| "grad_norm": 0.07228052744891678, |
| "learning_rate": 0.00014088152885488502, |
| "loss": 1.0788, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.43466449334419993, |
| "grad_norm": 0.08407870138614082, |
| "learning_rate": 0.00013914295307268396, |
| "loss": 1.1124, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.4400977995110024, |
| "grad_norm": 0.07850798498901956, |
| "learning_rate": 0.00013739029025389846, |
| "loss": 1.0906, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.44553110567780496, |
| "grad_norm": 0.07381578409984058, |
| "learning_rate": 0.00013562417115894172, |
| "loss": 1.0943, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.45096441184460745, |
| "grad_norm": 0.08024210843335, |
| "learning_rate": 0.0001338452313909644, |
| "loss": 1.0968, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.45639771801140994, |
| "grad_norm": 0.07473871189132965, |
| "learning_rate": 0.00013205411116710972, |
| "loss": 1.0913, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.4618310241782124, |
| "grad_norm": 0.0714410489594485, |
| "learning_rate": 0.0001302514550881076, |
| "loss": 1.1018, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.46726433034501497, |
| "grad_norm": 0.07909271973522737, |
| "learning_rate": 0.0001284379119062912, |
| "loss": 1.0852, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.47269763651181745, |
| "grad_norm": 0.07891638633482141, |
| "learning_rate": 0.00012661413429211957, |
| "loss": 1.0711, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.47813094267861994, |
| "grad_norm": 0.08881872489618887, |
| "learning_rate": 0.00012478077859929, |
| "loss": 1.0738, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.48356424884542243, |
| "grad_norm": 0.08624317368044378, |
| "learning_rate": 0.00012293850462852496, |
| "loss": 1.1031, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.4889975550122249, |
| "grad_norm": 0.08266232685721502, |
| "learning_rate": 0.00012108797539011847, |
| "loss": 1.1198, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.49443086117902746, |
| "grad_norm": 0.07635565938747138, |
| "learning_rate": 0.00011922985686532726, |
| "loss": 1.0873, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.49986416734582995, |
| "grad_norm": 0.07781766022202603, |
| "learning_rate": 0.00011736481776669306, |
| "loss": 1.105, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.5052974735126324, |
| "grad_norm": 0.07202406913689023, |
| "learning_rate": 0.00011549352929738142, |
| "loss": 1.1033, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.5107307796794349, |
| "grad_norm": 0.07526163912368695, |
| "learning_rate": 0.00011361666490962468, |
| "loss": 1.1015, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.5161640858462374, |
| "grad_norm": 0.07194341123415399, |
| "learning_rate": 0.00011173490006235528, |
| "loss": 1.0844, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.5215973920130399, |
| "grad_norm": 0.0778307000682154, |
| "learning_rate": 0.00010984891197811687, |
| "loss": 1.079, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.5270306981798424, |
| "grad_norm": 0.07029041166784745, |
| "learning_rate": 0.00010795937939934088, |
| "loss": 1.0832, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.532464004346645, |
| "grad_norm": 0.072860965367971, |
| "learning_rate": 0.00010606698234407586, |
| "loss": 1.1017, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.5378973105134475, |
| "grad_norm": 0.07249476012394583, |
| "learning_rate": 0.00010417240186125805, |
| "loss": 1.0848, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.54333061668025, |
| "grad_norm": 0.07190968384245135, |
| "learning_rate": 0.00010227631978561056, |
| "loss": 1.0834, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.5487639228470524, |
| "grad_norm": 0.07212243055291719, |
| "learning_rate": 0.00010037941849226032, |
| "loss": 1.0681, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.5541972290138549, |
| "grad_norm": 0.07472462127072327, |
| "learning_rate": 9.848238065115975e-05, |
| "loss": 1.0823, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.5596305351806574, |
| "grad_norm": 0.07341247607319075, |
| "learning_rate": 9.658588898140322e-05, |
| "loss": 1.0808, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.5650638413474599, |
| "grad_norm": 0.07373559114947376, |
| "learning_rate": 9.469062600552509e-05, |
| "loss": 1.0797, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.5704971475142624, |
| "grad_norm": 0.07594819742108236, |
| "learning_rate": 9.27972738038688e-05, |
| "loss": 1.0605, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.5759304536810649, |
| "grad_norm": 0.0782882995694038, |
| "learning_rate": 9.09065137691153e-05, |
| "loss": 1.1044, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.5813637598478675, |
| "grad_norm": 0.0764312523770268, |
| "learning_rate": 8.901902636105854e-05, |
| "loss": 1.0821, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.58679706601467, |
| "grad_norm": 0.07500428729495655, |
| "learning_rate": 8.713549086171691e-05, |
| "loss": 1.0875, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.5922303721814725, |
| "grad_norm": 0.07269578451839101, |
| "learning_rate": 8.525658513086857e-05, |
| "loss": 1.0827, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.5976636783482749, |
| "grad_norm": 0.0776349309988441, |
| "learning_rate": 8.33829853620986e-05, |
| "loss": 1.0835, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.6030969845150774, |
| "grad_norm": 0.07914278991896619, |
| "learning_rate": 8.15153658394458e-05, |
| "loss": 1.0854, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.6085302906818799, |
| "grad_norm": 0.07805449276137157, |
| "learning_rate": 7.965439869473664e-05, |
| "loss": 1.0899, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.6139635968486824, |
| "grad_norm": 0.07343569346455868, |
| "learning_rate": 7.780075366569374e-05, |
| "loss": 1.0689, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.6193969030154849, |
| "grad_norm": 0.0763506099137777, |
| "learning_rate": 7.595509785490617e-05, |
| "loss": 1.1015, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.6248302091822874, |
| "grad_norm": 0.07697592585435539, |
| "learning_rate": 7.411809548974792e-05, |
| "loss": 1.1086, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.63026351534909, |
| "grad_norm": 0.07858412463887798, |
| "learning_rate": 7.229040768333115e-05, |
| "loss": 1.0727, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.6356968215158925, |
| "grad_norm": 0.07677005817307439, |
| "learning_rate": 7.04726921965801e-05, |
| "loss": 1.0916, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.641130127682695, |
| "grad_norm": 0.0769443341169544, |
| "learning_rate": 6.866560320151179e-05, |
| "loss": 1.0936, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.6465634338494974, |
| "grad_norm": 0.07892506244450502, |
| "learning_rate": 6.686979104580788e-05, |
| "loss": 1.1201, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.6519967400162999, |
| "grad_norm": 0.07597394899050844, |
| "learning_rate": 6.508590201876317e-05, |
| "loss": 1.0893, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.6574300461831024, |
| "grad_norm": 0.07546777076824882, |
| "learning_rate": 6.331457811869437e-05, |
| "loss": 1.1171, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.6628633523499049, |
| "grad_norm": 0.07745516555125605, |
| "learning_rate": 6.155645682189351e-05, |
| "loss": 1.0738, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.6682966585167074, |
| "grad_norm": 0.07367510312563916, |
| "learning_rate": 5.9812170853208496e-05, |
| "loss": 1.0745, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.6737299646835099, |
| "grad_norm": 0.07824277686686397, |
| "learning_rate": 5.8082347958333625e-05, |
| "loss": 1.0905, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.6791632708503124, |
| "grad_norm": 0.0764173540069429, |
| "learning_rate": 5.6367610677892177e-05, |
| "loss": 1.1074, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.684596577017115, |
| "grad_norm": 0.07791563158504998, |
| "learning_rate": 5.466857612339229e-05, |
| "loss": 1.1194, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.6900298831839174, |
| "grad_norm": 0.07146233468425052, |
| "learning_rate": 5.298585575513648e-05, |
| "loss": 1.1077, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.6954631893507199, |
| "grad_norm": 0.07591539295717892, |
| "learning_rate": 5.1320055162165115e-05, |
| "loss": 1.0932, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.7008964955175224, |
| "grad_norm": 0.07910316647424652, |
| "learning_rate": 4.967177384431293e-05, |
| "loss": 1.1001, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.7063298016843249, |
| "grad_norm": 0.0765700985055494, |
| "learning_rate": 4.804160499645667e-05, |
| "loss": 1.1021, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.7117631078511274, |
| "grad_norm": 0.07254920739923368, |
| "learning_rate": 4.6430135295032184e-05, |
| "loss": 1.0802, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.7171964140179299, |
| "grad_norm": 0.07330038875192313, |
| "learning_rate": 4.483794468689728e-05, |
| "loss": 1.1148, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.7226297201847324, |
| "grad_norm": 0.07545496216336055, |
| "learning_rate": 4.326560618061639e-05, |
| "loss": 1.0979, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.7280630263515349, |
| "grad_norm": 0.07908854694610165, |
| "learning_rate": 4.1713685640242165e-05, |
| "loss": 1.0907, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.7334963325183375, |
| "grad_norm": 0.07579581147744809, |
| "learning_rate": 4.0182741581668593e-05, |
| "loss": 1.1085, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.73892963868514, |
| "grad_norm": 0.07729476399610574, |
| "learning_rate": 3.8673324971628357e-05, |
| "loss": 1.0798, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.7443629448519424, |
| "grad_norm": 0.07944953369737749, |
| "learning_rate": 3.718597902940717e-05, |
| "loss": 1.0869, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.7497962510187449, |
| "grad_norm": 0.07194925540581062, |
| "learning_rate": 3.5721239031346066e-05, |
| "loss": 1.0561, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.7552295571855474, |
| "grad_norm": 0.07617097405159699, |
| "learning_rate": 3.427963211820274e-05, |
| "loss": 1.0756, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.7606628633523499, |
| "grad_norm": 0.07302205190651118, |
| "learning_rate": 3.2861677105440336e-05, |
| "loss": 1.0898, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.7660961695191524, |
| "grad_norm": 0.07237255657454869, |
| "learning_rate": 3.146788429651246e-05, |
| "loss": 1.1003, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.7715294756859549, |
| "grad_norm": 0.07609168686442808, |
| "learning_rate": 3.009875529921181e-05, |
| "loss": 1.1198, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.7769627818527574, |
| "grad_norm": 0.07439558041939377, |
| "learning_rate": 2.8754782845148043e-05, |
| "loss": 1.0885, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.78239608801956, |
| "grad_norm": 0.07799568204768835, |
| "learning_rate": 2.7436450612420095e-05, |
| "loss": 1.0984, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.7878293941863624, |
| "grad_norm": 0.0743227911288043, |
| "learning_rate": 2.6144233051546796e-05, |
| "loss": 1.0821, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.7932627003531649, |
| "grad_norm": 0.07533397831281968, |
| "learning_rate": 2.4878595214718236e-05, |
| "loss": 1.0954, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.7986960065199674, |
| "grad_norm": 0.07795341702809758, |
| "learning_rate": 2.3639992588429705e-05, |
| "loss": 1.073, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.8041293126867699, |
| "grad_norm": 0.07211902034739545, |
| "learning_rate": 2.242887092955801e-05, |
| "loss": 1.0815, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.8095626188535724, |
| "grad_norm": 0.0761432386265937, |
| "learning_rate": 2.12456661049394e-05, |
| "loss": 1.094, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.8149959250203749, |
| "grad_norm": 0.07747183067459043, |
| "learning_rate": 2.0090803934506764e-05, |
| "loss": 1.0879, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.8204292311871774, |
| "grad_norm": 0.07443760717814148, |
| "learning_rate": 1.8964700038042626e-05, |
| "loss": 1.0774, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.8258625373539799, |
| "grad_norm": 0.07403731045418284, |
| "learning_rate": 1.7867759685603114e-05, |
| "loss": 1.062, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.8312958435207825, |
| "grad_norm": 0.07999273099986547, |
| "learning_rate": 1.6800377651666465e-05, |
| "loss": 1.0893, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.8367291496875849, |
| "grad_norm": 0.07415045401695812, |
| "learning_rate": 1.5762938073058853e-05, |
| "loss": 1.0838, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.8421624558543874, |
| "grad_norm": 0.07804672929693661, |
| "learning_rate": 1.475581431070865e-05, |
| "loss": 1.0701, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.8475957620211899, |
| "grad_norm": 0.07518807484213265, |
| "learning_rate": 1.3779368815278647e-05, |
| "loss": 1.0964, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.8530290681879924, |
| "grad_norm": 0.07563239778584362, |
| "learning_rate": 1.2833952996724863e-05, |
| "loss": 1.1061, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.8584623743547949, |
| "grad_norm": 0.07965844467223017, |
| "learning_rate": 1.1919907097828653e-05, |
| "loss": 1.0987, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.8638956805215974, |
| "grad_norm": 0.0766122440696586, |
| "learning_rate": 1.1037560071747732e-05, |
| "loss": 1.0821, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.8693289866883999, |
| "grad_norm": 0.07471932158959192, |
| "learning_rate": 1.01872294636304e-05, |
| "loss": 1.0928, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.8747622928552024, |
| "grad_norm": 0.07879025603430712, |
| "learning_rate": 9.369221296335006e-06, |
| "loss": 1.0742, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.8801955990220048, |
| "grad_norm": 0.07251151651596634, |
| "learning_rate": 8.58382996029652e-06, |
| "loss": 1.1022, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.8856289051888074, |
| "grad_norm": 0.08267985751565841, |
| "learning_rate": 7.831338107579056e-06, |
| "loss": 1.111, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.8910622113556099, |
| "grad_norm": 0.07983071589620737, |
| "learning_rate": 7.1120165501533e-06, |
| "loss": 1.089, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.8964955175224124, |
| "grad_norm": 0.07253898026652339, |
| "learning_rate": 6.426124162434688e-06, |
| "loss": 1.0821, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.9019288236892149, |
| "grad_norm": 0.0726664203858709, |
| "learning_rate": 5.77390778811796e-06, |
| "loss": 1.1032, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.9073621298560174, |
| "grad_norm": 0.07385826672139016, |
| "learning_rate": 5.1556021513412544e-06, |
| "loss": 1.0809, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.9127954360228199, |
| "grad_norm": 0.06998427333733385, |
| "learning_rate": 4.5714297722121106e-06, |
| "loss": 1.1018, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.9182287421896224, |
| "grad_norm": 0.0757435677420101, |
| "learning_rate": 4.021600886725263e-06, |
| "loss": 1.0771, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.9236620483564248, |
| "grad_norm": 0.0722116543100783, |
| "learning_rate": 3.5063133711014882e-06, |
| "loss": 1.0807, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.9290953545232273, |
| "grad_norm": 0.07391301677162948, |
| "learning_rate": 3.025752670574622e-06, |
| "loss": 1.1152, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.9345286606900299, |
| "grad_norm": 0.07161981606521015, |
| "learning_rate": 2.580091732652101e-06, |
| "loss": 1.0914, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.9399619668568324, |
| "grad_norm": 0.08220528509805279, |
| "learning_rate": 2.1694909448735645e-06, |
| "loss": 1.063, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.9453952730236349, |
| "grad_norm": 0.07693291624176714, |
| "learning_rate": 1.7940980770894122e-06, |
| "loss": 1.084, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.9508285791904374, |
| "grad_norm": 0.07644475605951945, |
| "learning_rate": 1.4540482282803137e-06, |
| "loss": 1.1081, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.9562618853572399, |
| "grad_norm": 0.07330126081285318, |
| "learning_rate": 1.1494637779369766e-06, |
| "loss": 1.0879, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.9616951915240424, |
| "grad_norm": 0.073595237012896, |
| "learning_rate": 8.804543420172562e-07, |
| "loss": 1.1054, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.9671284976908449, |
| "grad_norm": 0.07404663517702222, |
| "learning_rate": 6.471167334968886e-07, |
| "loss": 1.0708, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.9725618038576473, |
| "grad_norm": 0.07475084382356802, |
| "learning_rate": 4.495349275276839e-07, |
| "loss": 1.0812, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.9779951100244498, |
| "grad_norm": 0.07201213176485209, |
| "learning_rate": 2.877800312160783e-07, |
| "loss": 1.0603, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.9834284161912524, |
| "grad_norm": 0.07197972989674828, |
| "learning_rate": 1.6191025803250492e-07, |
| "loss": 1.0878, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.9888617223580549, |
| "grad_norm": 0.07408960367345192, |
| "learning_rate": 7.197090686119623e-08, |
| "loss": 1.0819, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.9942950285248574, |
| "grad_norm": 0.07112735003336768, |
| "learning_rate": 1.7994345697680547e-08, |
| "loss": 1.0836, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.9997283346916599, |
| "grad_norm": 0.07282814722420096, |
| "learning_rate": 0.0, |
| "loss": 1.0802, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.9997283346916599, |
| "eval_loss": 1.005259394645691, |
| "eval_runtime": 2.7588, |
| "eval_samples_per_second": 2.537, |
| "eval_steps_per_second": 0.725, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.9997283346916599, |
| "step": 920, |
| "total_flos": 1.1569444404854784e+16, |
| "train_loss": 1.1088063976039058, |
| "train_runtime": 18858.7381, |
| "train_samples_per_second": 3.122, |
| "train_steps_per_second": 0.049 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 920, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.1569444404854784e+16, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|