{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996222709073053, "eval_steps": 500, "global_step": 3308, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0030218327415577548, "grad_norm": 166.1145477294922, "learning_rate": 1.0000000000000002e-06, "loss": 13.8334, "step": 10 }, { "epoch": 0.0060436654831155096, "grad_norm": 81.33020782470703, "learning_rate": 2.0000000000000003e-06, "loss": 13.1022, "step": 20 }, { "epoch": 0.009065498224673264, "grad_norm": 51.09122085571289, "learning_rate": 3e-06, "loss": 12.0899, "step": 30 }, { "epoch": 0.012087330966231019, "grad_norm": 49.01457214355469, "learning_rate": 4.000000000000001e-06, "loss": 10.2942, "step": 40 }, { "epoch": 0.015109163707788774, "grad_norm": 46.00205612182617, "learning_rate": 5e-06, "loss": 9.538, "step": 50 }, { "epoch": 0.01813099644934653, "grad_norm": 41.61635971069336, "learning_rate": 6e-06, "loss": 8.3357, "step": 60 }, { "epoch": 0.021152829190904283, "grad_norm": 31.584325790405273, "learning_rate": 7e-06, "loss": 6.9115, "step": 70 }, { "epoch": 0.024174661932462038, "grad_norm": 51.664695739746094, "learning_rate": 8.000000000000001e-06, "loss": 6.7596, "step": 80 }, { "epoch": 0.027196494674019793, "grad_norm": 31.014076232910156, "learning_rate": 9e-06, "loss": 7.0298, "step": 90 }, { "epoch": 0.030218327415577548, "grad_norm": 37.792179107666016, "learning_rate": 1e-05, "loss": 6.7021, "step": 100 }, { "epoch": 0.0332401601571353, "grad_norm": 37.68498611450195, "learning_rate": 9.999760394462267e-06, "loss": 7.3545, "step": 110 }, { "epoch": 0.03626199289869306, "grad_norm": 38.4805793762207, "learning_rate": 9.999041600813393e-06, "loss": 7.0073, "step": 120 }, { "epoch": 0.03928382564025081, "grad_norm": 32.300174713134766, "learning_rate": 9.997843687944153e-06, "loss": 6.2416, "step": 130 }, { "epoch": 0.04230565838180857, "grad_norm": 29.263317108154297, "learning_rate": 9.996166770665168e-06, "loss": 5.5583, "step": 140 }, { "epoch": 0.04532749112336632, "grad_norm": 33.3656005859375, "learning_rate": 9.994011009695908e-06, "loss": 5.6737, "step": 150 }, { "epoch": 0.048349323864924076, "grad_norm": 32.699825286865234, "learning_rate": 9.991376611649278e-06, "loss": 6.0879, "step": 160 }, { "epoch": 0.05137115660648183, "grad_norm": 27.45968246459961, "learning_rate": 9.988263829011821e-06, "loss": 5.4056, "step": 170 }, { "epoch": 0.054392989348039586, "grad_norm": 25.30878448486328, "learning_rate": 9.984672960119523e-06, "loss": 5.3618, "step": 180 }, { "epoch": 0.05741482208959734, "grad_norm": 40.055721282958984, "learning_rate": 9.980604349129212e-06, "loss": 5.7602, "step": 190 }, { "epoch": 0.060436654831155096, "grad_norm": 26.245195388793945, "learning_rate": 9.976058385985575e-06, "loss": 5.186, "step": 200 }, { "epoch": 0.06345848757271286, "grad_norm": 34.81965637207031, "learning_rate": 9.971035506383791e-06, "loss": 5.5341, "step": 210 }, { "epoch": 0.0664803203142706, "grad_norm": 29.513893127441406, "learning_rate": 9.96553619172777e-06, "loss": 5.0542, "step": 220 }, { "epoch": 0.06950215305582837, "grad_norm": 32.30284118652344, "learning_rate": 9.959560969084004e-06, "loss": 5.3365, "step": 230 }, { "epoch": 0.07252398579738611, "grad_norm": 27.652576446533203, "learning_rate": 9.953110411131073e-06, "loss": 4.7513, "step": 240 }, { "epoch": 0.07554581853894388, "grad_norm": 28.387413024902344, "learning_rate": 9.946185136104736e-06, "loss": 5.4127, "step": 250 }, { "epoch": 0.07856765128050162, "grad_norm": 29.694316864013672, "learning_rate": 9.938785807738692e-06, "loss": 4.813, "step": 260 }, { "epoch": 0.08158948402205939, "grad_norm": 31.964120864868164, "learning_rate": 9.930913135200964e-06, "loss": 5.4212, "step": 270 }, { "epoch": 0.08461131676361713, "grad_norm": 23.594715118408203, "learning_rate": 9.922567873025924e-06, "loss": 5.2445, "step": 280 }, { "epoch": 0.0876331495051749, "grad_norm": 23.0896053314209, "learning_rate": 9.913750821041988e-06, "loss": 4.5194, "step": 290 }, { "epoch": 0.09065498224673264, "grad_norm": 25.44329833984375, "learning_rate": 9.904462824294945e-06, "loss": 4.6093, "step": 300 }, { "epoch": 0.0936768149882904, "grad_norm": 27.408288955688477, "learning_rate": 9.894704772966978e-06, "loss": 4.512, "step": 310 }, { "epoch": 0.09669864772984815, "grad_norm": 24.26542091369629, "learning_rate": 9.884477602291343e-06, "loss": 4.5071, "step": 320 }, { "epoch": 0.09972048047140591, "grad_norm": 35.8819694519043, "learning_rate": 9.873782292462727e-06, "loss": 4.3557, "step": 330 }, { "epoch": 0.10274231321296366, "grad_norm": 29.487594604492188, "learning_rate": 9.862619868543323e-06, "loss": 8.2236, "step": 340 }, { "epoch": 0.10576414595452142, "grad_norm": 38.13749694824219, "learning_rate": 9.850991400364557e-06, "loss": 5.1538, "step": 350 }, { "epoch": 0.10878597869607917, "grad_norm": 25.492799758911133, "learning_rate": 9.838898002424586e-06, "loss": 6.0666, "step": 360 }, { "epoch": 0.11180781143763693, "grad_norm": 31.119089126586914, "learning_rate": 9.826340833781448e-06, "loss": 5.8633, "step": 370 }, { "epoch": 0.11482964417919468, "grad_norm": 21.065149307250977, "learning_rate": 9.813321097942005e-06, "loss": 5.1017, "step": 380 }, { "epoch": 0.11785147692075244, "grad_norm": 29.40814971923828, "learning_rate": 9.79984004274658e-06, "loss": 4.9132, "step": 390 }, { "epoch": 0.12087330966231019, "grad_norm": 22.45477294921875, "learning_rate": 9.785898960249365e-06, "loss": 4.2496, "step": 400 }, { "epoch": 0.12389514240386795, "grad_norm": 19.05487060546875, "learning_rate": 9.771499186594586e-06, "loss": 5.0767, "step": 410 }, { "epoch": 0.12691697514542571, "grad_norm": 31.310686111450195, "learning_rate": 9.756642101888449e-06, "loss": 5.192, "step": 420 }, { "epoch": 0.12993880788698345, "grad_norm": 25.689640045166016, "learning_rate": 9.74132913006686e-06, "loss": 3.445, "step": 430 }, { "epoch": 0.1329606406285412, "grad_norm": 21.052574157714844, "learning_rate": 9.725561738758956e-06, "loss": 3.3354, "step": 440 }, { "epoch": 0.13598247337009897, "grad_norm": 24.987884521484375, "learning_rate": 9.709341439146452e-06, "loss": 5.0777, "step": 450 }, { "epoch": 0.13900430611165673, "grad_norm": 26.842397689819336, "learning_rate": 9.692669785818787e-06, "loss": 6.4292, "step": 460 }, { "epoch": 0.14202613885321447, "grad_norm": 35.66836166381836, "learning_rate": 9.675548376624149e-06, "loss": 5.7348, "step": 470 }, { "epoch": 0.14504797159477223, "grad_norm": 29.318471908569336, "learning_rate": 9.657978852516318e-06, "loss": 5.6924, "step": 480 }, { "epoch": 0.14806980433633, "grad_norm": 23.544092178344727, "learning_rate": 9.639962897397405e-06, "loss": 4.183, "step": 490 }, { "epoch": 0.15109163707788775, "grad_norm": 22.90180206298828, "learning_rate": 9.621502237956452e-06, "loss": 5.085, "step": 500 }, { "epoch": 0.1541134698194455, "grad_norm": 23.748275756835938, "learning_rate": 9.602598643503957e-06, "loss": 3.2694, "step": 510 }, { "epoch": 0.15713530256100325, "grad_norm": 29.096708297729492, "learning_rate": 9.583253925802283e-06, "loss": 4.2373, "step": 520 }, { "epoch": 0.160157135302561, "grad_norm": 24.87314796447754, "learning_rate": 9.563469938892023e-06, "loss": 4.8482, "step": 530 }, { "epoch": 0.16317896804411877, "grad_norm": 24.310091018676758, "learning_rate": 9.543248578914309e-06, "loss": 3.2299, "step": 540 }, { "epoch": 0.1662008007856765, "grad_norm": 24.80878257751465, "learning_rate": 9.522591783929069e-06, "loss": 4.8424, "step": 550 }, { "epoch": 0.16922263352723427, "grad_norm": 22.142215728759766, "learning_rate": 9.501501533729297e-06, "loss": 4.1786, "step": 560 }, { "epoch": 0.17224446626879203, "grad_norm": 33.77587890625, "learning_rate": 9.479979849651287e-06, "loss": 5.7505, "step": 570 }, { "epoch": 0.1752662990103498, "grad_norm": 25.414831161499023, "learning_rate": 9.45802879438091e-06, "loss": 6.3392, "step": 580 }, { "epoch": 0.17828813175190752, "grad_norm": 25.716073989868164, "learning_rate": 9.43565047175593e-06, "loss": 4.1603, "step": 590 }, { "epoch": 0.1813099644934653, "grad_norm": 25.389522552490234, "learning_rate": 9.412847026564359e-06, "loss": 3.9676, "step": 600 }, { "epoch": 0.18433179723502305, "grad_norm": 22.911640167236328, "learning_rate": 9.389620644338893e-06, "loss": 4.1508, "step": 610 }, { "epoch": 0.1873536299765808, "grad_norm": 36.27210998535156, "learning_rate": 9.365973551147453e-06, "loss": 4.691, "step": 620 }, { "epoch": 0.19037546271813854, "grad_norm": 23.555246353149414, "learning_rate": 9.341908013379832e-06, "loss": 4.7148, "step": 630 }, { "epoch": 0.1933972954596963, "grad_norm": 25.42097282409668, "learning_rate": 9.317426337530477e-06, "loss": 4.0105, "step": 640 }, { "epoch": 0.19641912820125407, "grad_norm": 24.92901611328125, "learning_rate": 9.292530869977432e-06, "loss": 5.5589, "step": 650 }, { "epoch": 0.19944096094281183, "grad_norm": 26.411352157592773, "learning_rate": 9.26722399675745e-06, "loss": 3.1881, "step": 660 }, { "epoch": 0.20246279368436956, "grad_norm": 22.39121437072754, "learning_rate": 9.24150814333732e-06, "loss": 3.9177, "step": 670 }, { "epoch": 0.20548462642592732, "grad_norm": 21.436046600341797, "learning_rate": 9.215385774381395e-06, "loss": 6.2124, "step": 680 }, { "epoch": 0.2085064591674851, "grad_norm": 42.19996643066406, "learning_rate": 9.188859393515382e-06, "loss": 4.863, "step": 690 }, { "epoch": 0.21152829190904285, "grad_norm": 24.43948745727539, "learning_rate": 9.16193154308638e-06, "loss": 6.0562, "step": 700 }, { "epoch": 0.21455012465060058, "grad_norm": 36.5896110534668, "learning_rate": 9.13460480391922e-06, "loss": 6.1878, "step": 710 }, { "epoch": 0.21757195739215834, "grad_norm": 39.19657897949219, "learning_rate": 9.106881795069116e-06, "loss": 6.4964, "step": 720 }, { "epoch": 0.2205937901337161, "grad_norm": 19.438859939575195, "learning_rate": 9.078765173570649e-06, "loss": 3.1914, "step": 730 }, { "epoch": 0.22361562287527387, "grad_norm": 26.316898345947266, "learning_rate": 9.0502576341831e-06, "loss": 4.0543, "step": 740 }, { "epoch": 0.2266374556168316, "grad_norm": 21.5406436920166, "learning_rate": 9.02136190913219e-06, "loss": 5.4649, "step": 750 }, { "epoch": 0.22965928835838936, "grad_norm": 38.014617919921875, "learning_rate": 8.99208076784822e-06, "loss": 4.6499, "step": 760 }, { "epoch": 0.23268112109994712, "grad_norm": 16.046876907348633, "learning_rate": 8.962417016700624e-06, "loss": 3.0368, "step": 770 }, { "epoch": 0.2357029538415049, "grad_norm": 25.170169830322266, "learning_rate": 8.932373498729026e-06, "loss": 4.6374, "step": 780 }, { "epoch": 0.23872478658306262, "grad_norm": 28.294591903686523, "learning_rate": 8.901953093370734e-06, "loss": 4.0344, "step": 790 }, { "epoch": 0.24174661932462038, "grad_norm": 25.618423461914062, "learning_rate": 8.871158716184784e-06, "loss": 3.9153, "step": 800 }, { "epoch": 0.24476845206617814, "grad_norm": 33.044132232666016, "learning_rate": 8.839993318572497e-06, "loss": 4.852, "step": 810 }, { "epoch": 0.2477902848077359, "grad_norm": 19.522127151489258, "learning_rate": 8.808459887494617e-06, "loss": 3.0679, "step": 820 }, { "epoch": 0.25081211754929367, "grad_norm": 17.915157318115234, "learning_rate": 8.77656144518502e-06, "loss": 3.832, "step": 830 }, { "epoch": 0.25383395029085143, "grad_norm": 18.468053817749023, "learning_rate": 8.744301048861083e-06, "loss": 2.9134, "step": 840 }, { "epoch": 0.25685578303240914, "grad_norm": 25.19109535217285, "learning_rate": 8.711681790430646e-06, "loss": 2.9987, "step": 850 }, { "epoch": 0.2598776157739669, "grad_norm": 27.227184295654297, "learning_rate": 8.678706796195694e-06, "loss": 4.7592, "step": 860 }, { "epoch": 0.26289944851552466, "grad_norm": 28.04375457763672, "learning_rate": 8.645379226552712e-06, "loss": 3.7402, "step": 870 }, { "epoch": 0.2659212812570824, "grad_norm": 21.457616806030273, "learning_rate": 8.611702275689805e-06, "loss": 4.6756, "step": 880 }, { "epoch": 0.2689431139986402, "grad_norm": 35.01508331298828, "learning_rate": 8.577679171280538e-06, "loss": 4.5315, "step": 890 }, { "epoch": 0.27196494674019794, "grad_norm": 20.160045623779297, "learning_rate": 8.543313174174601e-06, "loss": 5.2698, "step": 900 }, { "epoch": 0.2749867794817557, "grad_norm": 22.52850341796875, "learning_rate": 8.508607578085281e-06, "loss": 3.849, "step": 910 }, { "epoch": 0.27800861222331347, "grad_norm": 21.895462036132812, "learning_rate": 8.473565709273786e-06, "loss": 3.8616, "step": 920 }, { "epoch": 0.2810304449648712, "grad_norm": 16.077316284179688, "learning_rate": 8.438190926230439e-06, "loss": 3.8386, "step": 930 }, { "epoch": 0.28405227770642894, "grad_norm": 33.1984977722168, "learning_rate": 8.40248661935281e-06, "loss": 4.3994, "step": 940 }, { "epoch": 0.2870741104479867, "grad_norm": 27.1571102142334, "learning_rate": 8.366456210620756e-06, "loss": 3.1029, "step": 950 }, { "epoch": 0.29009594318954446, "grad_norm": 31.706750869750977, "learning_rate": 8.330103153268464e-06, "loss": 3.7567, "step": 960 }, { "epoch": 0.2931177759311022, "grad_norm": 24.30504608154297, "learning_rate": 8.29343093145347e-06, "loss": 3.6988, "step": 970 }, { "epoch": 0.29613960867266, "grad_norm": 24.231523513793945, "learning_rate": 8.25644305992275e-06, "loss": 3.6097, "step": 980 }, { "epoch": 0.29916144141421774, "grad_norm": 19.621383666992188, "learning_rate": 8.21914308367584e-06, "loss": 4.5566, "step": 990 }, { "epoch": 0.3021832741557755, "grad_norm": 21.627859115600586, "learning_rate": 8.181534577625088e-06, "loss": 3.7714, "step": 1000 }, { "epoch": 0.3052051068973332, "grad_norm": 14.206421852111816, "learning_rate": 8.143621146253022e-06, "loss": 4.6373, "step": 1010 }, { "epoch": 0.308226939638891, "grad_norm": 27.084983825683594, "learning_rate": 8.105406423266884e-06, "loss": 4.6538, "step": 1020 }, { "epoch": 0.31124877238044873, "grad_norm": 20.950910568237305, "learning_rate": 8.066894071250374e-06, "loss": 4.4614, "step": 1030 }, { "epoch": 0.3142706051220065, "grad_norm": 20.357742309570312, "learning_rate": 8.02808778131262e-06, "loss": 3.7694, "step": 1040 }, { "epoch": 0.31729243786356426, "grad_norm": 18.685476303100586, "learning_rate": 7.988991272734407e-06, "loss": 4.4575, "step": 1050 }, { "epoch": 0.320314270605122, "grad_norm": 24.249338150024414, "learning_rate": 7.94960829261172e-06, "loss": 4.4394, "step": 1060 }, { "epoch": 0.3233361033466798, "grad_norm": 22.846027374267578, "learning_rate": 7.909942615496613e-06, "loss": 4.7241, "step": 1070 }, { "epoch": 0.32635793608823754, "grad_norm": 30.40308952331543, "learning_rate": 7.869998043035442e-06, "loss": 5.3999, "step": 1080 }, { "epoch": 0.32937976882979525, "grad_norm": 17.647789001464844, "learning_rate": 7.829778403604512e-06, "loss": 5.0469, "step": 1090 }, { "epoch": 0.332401601571353, "grad_norm": 33.98617935180664, "learning_rate": 7.789287551943158e-06, "loss": 6.0896, "step": 1100 }, { "epoch": 0.3354234343129108, "grad_norm": 21.646024703979492, "learning_rate": 7.748529368784293e-06, "loss": 4.5196, "step": 1110 }, { "epoch": 0.33844526705446853, "grad_norm": 18.94881820678711, "learning_rate": 7.707507760482473e-06, "loss": 6.1607, "step": 1120 }, { "epoch": 0.3414670997960263, "grad_norm": 18.058412551879883, "learning_rate": 7.666226658639507e-06, "loss": 3.7909, "step": 1130 }, { "epoch": 0.34448893253758406, "grad_norm": 22.541349411010742, "learning_rate": 7.624690019727636e-06, "loss": 3.638, "step": 1140 }, { "epoch": 0.3475107652791418, "grad_norm": 23.882991790771484, "learning_rate": 7.58290182471034e-06, "loss": 4.53, "step": 1150 }, { "epoch": 0.3505325980206996, "grad_norm": 19.6879940032959, "learning_rate": 7.5408660786607976e-06, "loss": 3.6987, "step": 1160 }, { "epoch": 0.3535544307622573, "grad_norm": 20.6401309967041, "learning_rate": 7.498586810378019e-06, "loss": 2.9513, "step": 1170 }, { "epoch": 0.35657626350381505, "grad_norm": 22.658132553100586, "learning_rate": 7.456068072000731e-06, "loss": 2.8103, "step": 1180 }, { "epoch": 0.3595980962453728, "grad_norm": 23.935726165771484, "learning_rate": 7.4133139386190026e-06, "loss": 4.5498, "step": 1190 }, { "epoch": 0.3626199289869306, "grad_norm": 18.697385787963867, "learning_rate": 7.3703285078836796e-06, "loss": 5.2042, "step": 1200 }, { "epoch": 0.36564176172848833, "grad_norm": 17.5216064453125, "learning_rate": 7.3271158996136625e-06, "loss": 3.7229, "step": 1210 }, { "epoch": 0.3686635944700461, "grad_norm": 18.313034057617188, "learning_rate": 7.283680255401049e-06, "loss": 4.403, "step": 1220 }, { "epoch": 0.37168542721160386, "grad_norm": 19.784748077392578, "learning_rate": 7.240025738214193e-06, "loss": 6.1978, "step": 1230 }, { "epoch": 0.3747072599531616, "grad_norm": 33.28024673461914, "learning_rate": 7.196156531998718e-06, "loss": 4.4892, "step": 1240 }, { "epoch": 0.3777290926947193, "grad_norm": 20.449913024902344, "learning_rate": 7.152076841276527e-06, "loss": 3.6566, "step": 1250 }, { "epoch": 0.3807509254362771, "grad_norm": 19.441957473754883, "learning_rate": 7.1077908907428154e-06, "loss": 3.7812, "step": 1260 }, { "epoch": 0.38377275817783485, "grad_norm": 32.515724182128906, "learning_rate": 7.063302924861182e-06, "loss": 3.8969, "step": 1270 }, { "epoch": 0.3867945909193926, "grad_norm": 22.129140853881836, "learning_rate": 7.018617207456821e-06, "loss": 3.5997, "step": 1280 }, { "epoch": 0.3898164236609504, "grad_norm": 19.576011657714844, "learning_rate": 6.973738021307872e-06, "loss": 3.6646, "step": 1290 }, { "epoch": 0.39283825640250813, "grad_norm": 17.848796844482422, "learning_rate": 6.9286696677349455e-06, "loss": 5.9623, "step": 1300 }, { "epoch": 0.3958600891440659, "grad_norm": 15.815289497375488, "learning_rate": 6.883416466188881e-06, "loss": 3.6821, "step": 1310 }, { "epoch": 0.39888192188562366, "grad_norm": 17.62392807006836, "learning_rate": 6.837982753836755e-06, "loss": 2.8778, "step": 1320 }, { "epoch": 0.40190375462718136, "grad_norm": 34.39213180541992, "learning_rate": 6.7923728851461955e-06, "loss": 6.0046, "step": 1330 }, { "epoch": 0.4049255873687391, "grad_norm": 22.834793090820312, "learning_rate": 6.74659123146805e-06, "loss": 3.6498, "step": 1340 }, { "epoch": 0.4079474201102969, "grad_norm": 18.146869659423828, "learning_rate": 6.70064218061742e-06, "loss": 2.8181, "step": 1350 }, { "epoch": 0.41096925285185465, "grad_norm": 18.262357711791992, "learning_rate": 6.654530136453119e-06, "loss": 4.3635, "step": 1360 }, { "epoch": 0.4139910855934124, "grad_norm": 18.1636905670166, "learning_rate": 6.608259518455599e-06, "loss": 5.2127, "step": 1370 }, { "epoch": 0.4170129183349702, "grad_norm": 17.246234893798828, "learning_rate": 6.5618347613033875e-06, "loss": 5.1173, "step": 1380 }, { "epoch": 0.42003475107652793, "grad_norm": 19.54306983947754, "learning_rate": 6.5152603144480406e-06, "loss": 5.9817, "step": 1390 }, { "epoch": 0.4230565838180857, "grad_norm": 31.445457458496094, "learning_rate": 6.468540641687716e-06, "loss": 4.5568, "step": 1400 }, { "epoch": 0.4260784165596434, "grad_norm": 19.258493423461914, "learning_rate": 6.421680220739337e-06, "loss": 3.9311, "step": 1410 }, { "epoch": 0.42910024930120116, "grad_norm": 33.21185302734375, "learning_rate": 6.374683542809447e-06, "loss": 7.8417, "step": 1420 }, { "epoch": 0.4321220820427589, "grad_norm": 19.956239700317383, "learning_rate": 6.327555112163761e-06, "loss": 4.3582, "step": 1430 }, { "epoch": 0.4351439147843167, "grad_norm": 19.256486892700195, "learning_rate": 6.280299445695469e-06, "loss": 5.2, "step": 1440 }, { "epoch": 0.43816574752587445, "grad_norm": 20.045286178588867, "learning_rate": 6.232921072492319e-06, "loss": 4.3409, "step": 1450 }, { "epoch": 0.4411875802674322, "grad_norm": 24.16641616821289, "learning_rate": 6.185424533402543e-06, "loss": 4.3162, "step": 1460 }, { "epoch": 0.44420941300899, "grad_norm": 23.316164016723633, "learning_rate": 6.13781438059966e-06, "loss": 3.5112, "step": 1470 }, { "epoch": 0.44723124575054773, "grad_norm": 34.204627990722656, "learning_rate": 6.090095177146178e-06, "loss": 5.1696, "step": 1480 }, { "epoch": 0.45025307849210544, "grad_norm": 17.53434181213379, "learning_rate": 6.042271496556255e-06, "loss": 2.7874, "step": 1490 }, { "epoch": 0.4532749112336632, "grad_norm": 21.362934112548828, "learning_rate": 5.994347922357372e-06, "loss": 3.8133, "step": 1500 }, { "epoch": 0.45629674397522096, "grad_norm": 19.935638427734375, "learning_rate": 5.946329047651037e-06, "loss": 3.592, "step": 1510 }, { "epoch": 0.4593185767167787, "grad_norm": 17.95412826538086, "learning_rate": 5.8982194746725686e-06, "loss": 2.7345, "step": 1520 }, { "epoch": 0.4623404094583365, "grad_norm": 24.026193618774414, "learning_rate": 5.850023814350007e-06, "loss": 4.2519, "step": 1530 }, { "epoch": 0.46536224219989425, "grad_norm": 12.00658893585205, "learning_rate": 5.801746685862197e-06, "loss": 6.0717, "step": 1540 }, { "epoch": 0.468384074941452, "grad_norm": 14.519695281982422, "learning_rate": 5.753392716196069e-06, "loss": 2.8474, "step": 1550 }, { "epoch": 0.4714059076830098, "grad_norm": 15.277630805969238, "learning_rate": 5.704966539703185e-06, "loss": 3.6301, "step": 1560 }, { "epoch": 0.4744277404245675, "grad_norm": 17.934938430786133, "learning_rate": 5.656472797655571e-06, "loss": 4.4189, "step": 1570 }, { "epoch": 0.47744957316612524, "grad_norm": 17.185529708862305, "learning_rate": 5.60791613780088e-06, "loss": 2.7758, "step": 1580 }, { "epoch": 0.480471405907683, "grad_norm": 25.111557006835938, "learning_rate": 5.5593012139169525e-06, "loss": 4.296, "step": 1590 }, { "epoch": 0.48349323864924076, "grad_norm": 23.77570343017578, "learning_rate": 5.510632685365777e-06, "loss": 4.4462, "step": 1600 }, { "epoch": 0.4865150713907985, "grad_norm": 17.37128448486328, "learning_rate": 5.461915216646938e-06, "loss": 2.7426, "step": 1610 }, { "epoch": 0.4895369041323563, "grad_norm": 23.484580993652344, "learning_rate": 5.41315347695055e-06, "loss": 4.2378, "step": 1620 }, { "epoch": 0.49255873687391405, "grad_norm": 23.495826721191406, "learning_rate": 5.364352139709758e-06, "loss": 4.8879, "step": 1630 }, { "epoch": 0.4955805696154718, "grad_norm": 16.23356819152832, "learning_rate": 5.315515882152822e-06, "loss": 3.5359, "step": 1640 }, { "epoch": 0.4986024023570295, "grad_norm": 16.77799415588379, "learning_rate": 5.266649384854842e-06, "loss": 4.2516, "step": 1650 }, { "epoch": 0.5016242350985873, "grad_norm": 21.264799118041992, "learning_rate": 5.217757331289165e-06, "loss": 3.6844, "step": 1660 }, { "epoch": 0.5046460678401451, "grad_norm": 18.198184967041016, "learning_rate": 5.168844407378506e-06, "loss": 4.8485, "step": 1670 }, { "epoch": 0.5076679005817029, "grad_norm": 13.497072219848633, "learning_rate": 5.119915301045836e-06, "loss": 2.8835, "step": 1680 }, { "epoch": 0.5106897333232605, "grad_norm": 24.342716217041016, "learning_rate": 5.070974701765089e-06, "loss": 5.1527, "step": 1690 }, { "epoch": 0.5137115660648183, "grad_norm": 25.917234420776367, "learning_rate": 5.022027300111712e-06, "loss": 4.3981, "step": 1700 }, { "epoch": 0.516733398806376, "grad_norm": 15.280237197875977, "learning_rate": 4.973077787313099e-06, "loss": 4.4554, "step": 1710 }, { "epoch": 0.5197552315479338, "grad_norm": 17.290264129638672, "learning_rate": 4.924130854798983e-06, "loss": 5.1108, "step": 1720 }, { "epoch": 0.5227770642894916, "grad_norm": 15.63051700592041, "learning_rate": 4.875191193751803e-06, "loss": 2.8006, "step": 1730 }, { "epoch": 0.5257988970310493, "grad_norm": 15.663633346557617, "learning_rate": 4.826263494657077e-06, "loss": 3.4979, "step": 1740 }, { "epoch": 0.5288207297726071, "grad_norm": 35.42136001586914, "learning_rate": 4.777352446853863e-06, "loss": 4.9996, "step": 1750 }, { "epoch": 0.5318425625141648, "grad_norm": 23.063594818115234, "learning_rate": 4.72846273808533e-06, "loss": 3.509, "step": 1760 }, { "epoch": 0.5348643952557226, "grad_norm": 21.706233978271484, "learning_rate": 4.679599054049458e-06, "loss": 3.3899, "step": 1770 }, { "epoch": 0.5378862279972804, "grad_norm": 20.82579231262207, "learning_rate": 4.630766077949965e-06, "loss": 5.9861, "step": 1780 }, { "epoch": 0.5409080607388381, "grad_norm": 32.06898880004883, "learning_rate": 4.5819684900474484e-06, "loss": 4.3172, "step": 1790 }, { "epoch": 0.5439298934803959, "grad_norm": 16.330984115600586, "learning_rate": 4.5332109672108245e-06, "loss": 4.4365, "step": 1800 }, { "epoch": 0.5469517262219536, "grad_norm": 17.189834594726562, "learning_rate": 4.484498182469085e-06, "loss": 3.6319, "step": 1810 }, { "epoch": 0.5499735589635114, "grad_norm": 19.211336135864258, "learning_rate": 4.435834804563422e-06, "loss": 5.8999, "step": 1820 }, { "epoch": 0.5529953917050692, "grad_norm": 26.310638427734375, "learning_rate": 4.387225497499767e-06, "loss": 3.5792, "step": 1830 }, { "epoch": 0.5560172244466269, "grad_norm": 20.680715560913086, "learning_rate": 4.3386749201017856e-06, "loss": 3.4555, "step": 1840 }, { "epoch": 0.5590390571881846, "grad_norm": 15.533769607543945, "learning_rate": 4.290187725564356e-06, "loss": 6.0278, "step": 1850 }, { "epoch": 0.5620608899297423, "grad_norm": 13.684257507324219, "learning_rate": 4.2417685610076135e-06, "loss": 3.4758, "step": 1860 }, { "epoch": 0.5650827226713001, "grad_norm": 15.711587905883789, "learning_rate": 4.193422067031535e-06, "loss": 4.3166, "step": 1870 }, { "epoch": 0.5681045554128579, "grad_norm": 18.764991760253906, "learning_rate": 4.145152877271196e-06, "loss": 4.1625, "step": 1880 }, { "epoch": 0.5711263881544156, "grad_norm": 19.19873809814453, "learning_rate": 4.096965617952667e-06, "loss": 4.4233, "step": 1890 }, { "epoch": 0.5741482208959734, "grad_norm": 20.817365646362305, "learning_rate": 4.048864907449619e-06, "loss": 3.5268, "step": 1900 }, { "epoch": 0.5771700536375312, "grad_norm": 18.440645217895508, "learning_rate": 4.000855355840695e-06, "loss": 3.5747, "step": 1910 }, { "epoch": 0.5801918863790889, "grad_norm": 15.997143745422363, "learning_rate": 3.952941564467665e-06, "loss": 4.2257, "step": 1920 }, { "epoch": 0.5832137191206467, "grad_norm": 20.629562377929688, "learning_rate": 3.905128125494427e-06, "loss": 4.3136, "step": 1930 }, { "epoch": 0.5862355518622044, "grad_norm": 33.730995178222656, "learning_rate": 3.8574196214668876e-06, "loss": 4.509, "step": 1940 }, { "epoch": 0.5892573846037622, "grad_norm": 30.045576095581055, "learning_rate": 3.8098206248737486e-06, "loss": 5.139, "step": 1950 }, { "epoch": 0.59227921734532, "grad_norm": 23.693470001220703, "learning_rate": 3.7623356977082794e-06, "loss": 2.5913, "step": 1960 }, { "epoch": 0.5953010500868777, "grad_norm": 18.655092239379883, "learning_rate": 3.714969391031084e-06, "loss": 4.3328, "step": 1970 }, { "epoch": 0.5983228828284355, "grad_norm": 15.45345687866211, "learning_rate": 3.6677262445339136e-06, "loss": 3.5691, "step": 1980 }, { "epoch": 0.6013447155699932, "grad_norm": 21.302995681762695, "learning_rate": 3.6206107861045803e-06, "loss": 2.5934, "step": 1990 }, { "epoch": 0.604366548311551, "grad_norm": 13.75935173034668, "learning_rate": 3.5736275313929826e-06, "loss": 4.3405, "step": 2000 }, { "epoch": 0.6073883810531087, "grad_norm": 17.593429565429688, "learning_rate": 3.5267809833783213e-06, "loss": 4.8443, "step": 2010 }, { "epoch": 0.6104102137946664, "grad_norm": 23.467853546142578, "learning_rate": 3.4800756319375326e-06, "loss": 3.4879, "step": 2020 }, { "epoch": 0.6134320465362242, "grad_norm": 25.12725830078125, "learning_rate": 3.433515953414953e-06, "loss": 2.7966, "step": 2030 }, { "epoch": 0.616453879277782, "grad_norm": 33.0245475769043, "learning_rate": 3.387106410193308e-06, "loss": 5.8078, "step": 2040 }, { "epoch": 0.6194757120193397, "grad_norm": 18.8001651763916, "learning_rate": 3.3408514502660195e-06, "loss": 5.2049, "step": 2050 }, { "epoch": 0.6224975447608975, "grad_norm": 16.787553787231445, "learning_rate": 3.2947555068109057e-06, "loss": 3.3988, "step": 2060 }, { "epoch": 0.6255193775024552, "grad_norm": 21.532262802124023, "learning_rate": 3.248822997765295e-06, "loss": 2.815, "step": 2070 }, { "epoch": 0.628541210244013, "grad_norm": 24.630603790283203, "learning_rate": 3.203058325402599e-06, "loss": 4.3332, "step": 2080 }, { "epoch": 0.6315630429855708, "grad_norm": 16.667922973632812, "learning_rate": 3.1574658759103904e-06, "loss": 4.3038, "step": 2090 }, { "epoch": 0.6345848757271285, "grad_norm": 20.671772003173828, "learning_rate": 3.1120500189700204e-06, "loss": 3.4132, "step": 2100 }, { "epoch": 0.6376067084686863, "grad_norm": 21.932987213134766, "learning_rate": 3.066815107337815e-06, "loss": 4.1988, "step": 2110 }, { "epoch": 0.640628541210244, "grad_norm": 17.348411560058594, "learning_rate": 3.0217654764279114e-06, "loss": 3.5937, "step": 2120 }, { "epoch": 0.6436503739518018, "grad_norm": 25.625871658325195, "learning_rate": 2.9769054438967192e-06, "loss": 5.9817, "step": 2130 }, { "epoch": 0.6466722066933596, "grad_norm": 31.0660457611084, "learning_rate": 2.9322393092291256e-06, "loss": 5.6772, "step": 2140 }, { "epoch": 0.6496940394349173, "grad_norm": 20.511960983276367, "learning_rate": 2.887771353326422e-06, "loss": 4.2915, "step": 2150 }, { "epoch": 0.6527158721764751, "grad_norm": 17.798234939575195, "learning_rate": 2.8435058380959957e-06, "loss": 2.642, "step": 2160 }, { "epoch": 0.6557377049180327, "grad_norm": 18.133886337280273, "learning_rate": 2.7994470060428835e-06, "loss": 4.1208, "step": 2170 }, { "epoch": 0.6587595376595905, "grad_norm": 18.74016571044922, "learning_rate": 2.7555990798631436e-06, "loss": 4.8817, "step": 2180 }, { "epoch": 0.6617813704011483, "grad_norm": 15.885804176330566, "learning_rate": 2.711966262039145e-06, "loss": 3.3242, "step": 2190 }, { "epoch": 0.664803203142706, "grad_norm": 24.100414276123047, "learning_rate": 2.668552734436802e-06, "loss": 4.3377, "step": 2200 }, { "epoch": 0.6678250358842638, "grad_norm": 17.113306045532227, "learning_rate": 2.6253626579047653e-06, "loss": 5.7855, "step": 2210 }, { "epoch": 0.6708468686258215, "grad_norm": 33.268699645996094, "learning_rate": 2.582400171875638e-06, "loss": 3.4326, "step": 2220 }, { "epoch": 0.6738687013673793, "grad_norm": 29.673768997192383, "learning_rate": 2.5396693939692474e-06, "loss": 4.8596, "step": 2230 }, { "epoch": 0.6768905341089371, "grad_norm": 14.550185203552246, "learning_rate": 2.4971744195979985e-06, "loss": 5.1031, "step": 2240 }, { "epoch": 0.6799123668504948, "grad_norm": 32.16508102416992, "learning_rate": 2.4549193215743706e-06, "loss": 5.833, "step": 2250 }, { "epoch": 0.6829341995920526, "grad_norm": 18.873088836669922, "learning_rate": 2.4129081497205536e-06, "loss": 3.3544, "step": 2260 }, { "epoch": 0.6859560323336104, "grad_norm": 31.875137329101562, "learning_rate": 2.3711449304803174e-06, "loss": 4.0864, "step": 2270 }, { "epoch": 0.6889778650751681, "grad_norm": 27.996572494506836, "learning_rate": 2.329633666533103e-06, "loss": 4.0582, "step": 2280 }, { "epoch": 0.6919996978167259, "grad_norm": 19.299062728881836, "learning_rate": 2.288378336410398e-06, "loss": 4.2188, "step": 2290 }, { "epoch": 0.6950215305582836, "grad_norm": 21.146148681640625, "learning_rate": 2.2473828941144277e-06, "loss": 4.8756, "step": 2300 }, { "epoch": 0.6980433632998414, "grad_norm": 28.3226261138916, "learning_rate": 2.20665126873919e-06, "loss": 3.3593, "step": 2310 }, { "epoch": 0.7010651960413992, "grad_norm": 16.02470588684082, "learning_rate": 2.1661873640938818e-06, "loss": 4.1255, "step": 2320 }, { "epoch": 0.7040870287829568, "grad_norm": 21.263837814331055, "learning_rate": 2.1259950583287633e-06, "loss": 4.145, "step": 2330 }, { "epoch": 0.7071088615245146, "grad_norm": 22.879661560058594, "learning_rate": 2.086078203563439e-06, "loss": 4.7453, "step": 2340 }, { "epoch": 0.7101306942660723, "grad_norm": 15.726652145385742, "learning_rate": 2.0464406255176967e-06, "loss": 4.019, "step": 2350 }, { "epoch": 0.7131525270076301, "grad_norm": 30.606904983520508, "learning_rate": 2.0070861231448142e-06, "loss": 4.9014, "step": 2360 }, { "epoch": 0.7161743597491879, "grad_norm": 17.185054779052734, "learning_rate": 1.968018468267472e-06, "loss": 4.1918, "step": 2370 }, { "epoch": 0.7191961924907456, "grad_norm": 15.510167121887207, "learning_rate": 1.929241405216254e-06, "loss": 4.0934, "step": 2380 }, { "epoch": 0.7222180252323034, "grad_norm": 20.12055206298828, "learning_rate": 1.8907586504707776e-06, "loss": 4.701, "step": 2390 }, { "epoch": 0.7252398579738611, "grad_norm": 19.135282516479492, "learning_rate": 1.8525738923035002e-06, "loss": 2.5439, "step": 2400 }, { "epoch": 0.7282616907154189, "grad_norm": 19.167003631591797, "learning_rate": 1.8146907904262268e-06, "loss": 4.2791, "step": 2410 }, { "epoch": 0.7312835234569767, "grad_norm": 24.79986572265625, "learning_rate": 1.7771129756393545e-06, "loss": 3.4256, "step": 2420 }, { "epoch": 0.7343053561985344, "grad_norm": 20.59393310546875, "learning_rate": 1.7398440494838947e-06, "loss": 3.5206, "step": 2430 }, { "epoch": 0.7373271889400922, "grad_norm": 25.903627395629883, "learning_rate": 1.7028875838962822e-06, "loss": 4.1281, "step": 2440 }, { "epoch": 0.74034902168165, "grad_norm": 35.45489501953125, "learning_rate": 1.6662471208660392e-06, "loss": 4.0468, "step": 2450 }, { "epoch": 0.7433708544232077, "grad_norm": 20.3117618560791, "learning_rate": 1.6299261720963095e-06, "loss": 4.1749, "step": 2460 }, { "epoch": 0.7463926871647655, "grad_norm": 15.878867149353027, "learning_rate": 1.5939282186672705e-06, "loss": 4.8916, "step": 2470 }, { "epoch": 0.7494145199063232, "grad_norm": 19.15277099609375, "learning_rate": 1.5582567107025237e-06, "loss": 4.8288, "step": 2480 }, { "epoch": 0.7524363526478809, "grad_norm": 29.44374656677246, "learning_rate": 1.5229150670384057e-06, "loss": 3.3806, "step": 2490 }, { "epoch": 0.7554581853894387, "grad_norm": 23.206140518188477, "learning_rate": 1.4879066748963295e-06, "loss": 2.5563, "step": 2500 }, { "epoch": 0.7584800181309964, "grad_norm": 27.133193969726562, "learning_rate": 1.4532348895581466e-06, "loss": 3.4434, "step": 2510 }, { "epoch": 0.7615018508725542, "grad_norm": 29.599319458007812, "learning_rate": 1.4189030340445648e-06, "loss": 6.7087, "step": 2520 }, { "epoch": 0.7645236836141119, "grad_norm": 17.123348236083984, "learning_rate": 1.3849143987966646e-06, "loss": 4.9595, "step": 2530 }, { "epoch": 0.7675455163556697, "grad_norm": 16.49233627319336, "learning_rate": 1.3512722413605356e-06, "loss": 4.0857, "step": 2540 }, { "epoch": 0.7705673490972275, "grad_norm": 16.6666316986084, "learning_rate": 1.3179797860750654e-06, "loss": 4.8943, "step": 2550 }, { "epoch": 0.7735891818387852, "grad_norm": 19.440494537353516, "learning_rate": 1.2850402237629184e-06, "loss": 4.1448, "step": 2560 }, { "epoch": 0.776611014580343, "grad_norm": 14.674943923950195, "learning_rate": 1.2524567114247083e-06, "loss": 3.3491, "step": 2570 }, { "epoch": 0.7796328473219007, "grad_norm": 16.349637985229492, "learning_rate": 1.2202323719364324e-06, "loss": 3.2897, "step": 2580 }, { "epoch": 0.7826546800634585, "grad_norm": 19.67890739440918, "learning_rate": 1.1883702937501708e-06, "loss": 4.0901, "step": 2590 }, { "epoch": 0.7856765128050163, "grad_norm": 21.339618682861328, "learning_rate": 1.1568735305980694e-06, "loss": 4.1003, "step": 2600 }, { "epoch": 0.788698345546574, "grad_norm": 21.269119262695312, "learning_rate": 1.1257451011996807e-06, "loss": 3.4165, "step": 2610 }, { "epoch": 0.7917201782881318, "grad_norm": 33.041419982910156, "learning_rate": 1.0949879889726295e-06, "loss": 3.4622, "step": 2620 }, { "epoch": 0.7947420110296896, "grad_norm": 28.960115432739258, "learning_rate": 1.0646051417466801e-06, "loss": 3.4136, "step": 2630 }, { "epoch": 0.7977638437712473, "grad_norm": 24.76239013671875, "learning_rate": 1.0345994714812135e-06, "loss": 4.1335, "step": 2640 }, { "epoch": 0.800785676512805, "grad_norm": 15.773963928222656, "learning_rate": 1.0049738539861332e-06, "loss": 3.2818, "step": 2650 }, { "epoch": 0.8038075092543627, "grad_norm": 21.248395919799805, "learning_rate": 9.757311286462428e-07, "loss": 4.1348, "step": 2660 }, { "epoch": 0.8068293419959205, "grad_norm": 23.75290298461914, "learning_rate": 9.468740981491143e-07, "loss": 4.1947, "step": 2670 }, { "epoch": 0.8098511747374783, "grad_norm": 16.7280330657959, "learning_rate": 9.1840552821647e-07, "loss": 4.0364, "step": 2680 }, { "epoch": 0.812873007479036, "grad_norm": 17.696247100830078, "learning_rate": 8.903281473391152e-07, "loss": 3.3641, "step": 2690 }, { "epoch": 0.8158948402205938, "grad_norm": 16.840299606323242, "learning_rate": 8.62644646515427e-07, "loss": 5.7446, "step": 2700 }, { "epoch": 0.8189166729621515, "grad_norm": 13.25534725189209, "learning_rate": 8.353576789934436e-07, "loss": 3.3763, "step": 2710 }, { "epoch": 0.8219385057037093, "grad_norm": 19.88932991027832, "learning_rate": 8.084698600165797e-07, "loss": 3.5133, "step": 2720 }, { "epoch": 0.8249603384452671, "grad_norm": 17.921199798583984, "learning_rate": 7.819837665729596e-07, "loss": 4.1018, "step": 2730 }, { "epoch": 0.8279821711868248, "grad_norm": 29.57664680480957, "learning_rate": 7.559019371484521e-07, "loss": 3.3378, "step": 2740 }, { "epoch": 0.8310040039283826, "grad_norm": 17.720863342285156, "learning_rate": 7.302268714833622e-07, "loss": 4.1487, "step": 2750 }, { "epoch": 0.8340258366699403, "grad_norm": 17.34684944152832, "learning_rate": 7.049610303328541e-07, "loss": 3.5199, "step": 2760 }, { "epoch": 0.8370476694114981, "grad_norm": 16.739910125732422, "learning_rate": 6.80106835231113e-07, "loss": 4.2899, "step": 2770 }, { "epoch": 0.8400695021530559, "grad_norm": 17.1294002532959, "learning_rate": 6.556666682592494e-07, "loss": 3.3016, "step": 2780 }, { "epoch": 0.8430913348946136, "grad_norm": 14.801079750061035, "learning_rate": 6.316428718170037e-07, "loss": 2.4169, "step": 2790 }, { "epoch": 0.8461131676361714, "grad_norm": 19.354856491088867, "learning_rate": 6.080377483982425e-07, "loss": 3.2883, "step": 2800 }, { "epoch": 0.849135000377729, "grad_norm": 17.925838470458984, "learning_rate": 5.848535603702798e-07, "loss": 3.3497, "step": 2810 }, { "epoch": 0.8521568331192868, "grad_norm": 20.340959548950195, "learning_rate": 5.62092529757054e-07, "loss": 6.4132, "step": 2820 }, { "epoch": 0.8551786658608446, "grad_norm": 21.507797241210938, "learning_rate": 5.397568380261559e-07, "loss": 2.3404, "step": 2830 }, { "epoch": 0.8582004986024023, "grad_norm": 16.9514102935791, "learning_rate": 5.178486258797555e-07, "loss": 4.0876, "step": 2840 }, { "epoch": 0.8612223313439601, "grad_norm": 14.505171775817871, "learning_rate": 4.963699930494365e-07, "loss": 3.3715, "step": 2850 }, { "epoch": 0.8642441640855179, "grad_norm": 22.551313400268555, "learning_rate": 4.75322998094942e-07, "loss": 4.2347, "step": 2860 }, { "epoch": 0.8672659968270756, "grad_norm": 20.145078659057617, "learning_rate": 4.5470965820689384e-07, "loss": 2.5903, "step": 2870 }, { "epoch": 0.8702878295686334, "grad_norm": 17.447914123535156, "learning_rate": 4.345319490134453e-07, "loss": 3.0177, "step": 2880 }, { "epoch": 0.8733096623101911, "grad_norm": 16.10365104675293, "learning_rate": 4.147918043909405e-07, "loss": 4.764, "step": 2890 }, { "epoch": 0.8763314950517489, "grad_norm": 19.066129684448242, "learning_rate": 3.9549111627856794e-07, "loss": 4.7699, "step": 2900 }, { "epoch": 0.8793533277933067, "grad_norm": 19.604887008666992, "learning_rate": 3.766317344970288e-07, "loss": 4.1165, "step": 2910 }, { "epoch": 0.8823751605348644, "grad_norm": 17.465734481811523, "learning_rate": 3.582154665712473e-07, "loss": 2.4443, "step": 2920 }, { "epoch": 0.8853969932764222, "grad_norm": 22.400236129760742, "learning_rate": 3.402440775571364e-07, "loss": 4.0664, "step": 2930 }, { "epoch": 0.88841882601798, "grad_norm": 21.420312881469727, "learning_rate": 3.227192898724252e-07, "loss": 5.7203, "step": 2940 }, { "epoch": 0.8914406587595377, "grad_norm": 23.331478118896484, "learning_rate": 3.056427831315878e-07, "loss": 3.367, "step": 2950 }, { "epoch": 0.8944624915010955, "grad_norm": 21.29648208618164, "learning_rate": 2.890161939848535e-07, "loss": 4.1604, "step": 2960 }, { "epoch": 0.8974843242426531, "grad_norm": 15.172201156616211, "learning_rate": 2.72841115961357e-07, "loss": 4.2335, "step": 2970 }, { "epoch": 0.9005061569842109, "grad_norm": 16.736038208007812, "learning_rate": 2.5711909931640633e-07, "loss": 3.9793, "step": 2980 }, { "epoch": 0.9035279897257686, "grad_norm": 22.6779727935791, "learning_rate": 2.418516508829e-07, "loss": 2.4922, "step": 2990 }, { "epoch": 0.9065498224673264, "grad_norm": 32.2912712097168, "learning_rate": 2.270402339269162e-07, "loss": 5.6454, "step": 3000 }, { "epoch": 0.9095716552088842, "grad_norm": 18.107574462890625, "learning_rate": 2.126862680074643e-07, "loss": 5.0056, "step": 3010 }, { "epoch": 0.9125934879504419, "grad_norm": 32.63033676147461, "learning_rate": 1.9879112884043317e-07, "loss": 2.5369, "step": 3020 }, { "epoch": 0.9156153206919997, "grad_norm": 18.089956283569336, "learning_rate": 1.853561481667404e-07, "loss": 2.4556, "step": 3030 }, { "epoch": 0.9186371534335575, "grad_norm": 13.772138595581055, "learning_rate": 1.7238261362469256e-07, "loss": 3.0884, "step": 3040 }, { "epoch": 0.9216589861751152, "grad_norm": 22.537776947021484, "learning_rate": 1.5987176862657883e-07, "loss": 3.2805, "step": 3050 }, { "epoch": 0.924680818916673, "grad_norm": 30.13243865966797, "learning_rate": 1.4782481223949597e-07, "loss": 3.2507, "step": 3060 }, { "epoch": 0.9277026516582307, "grad_norm": 20.858510971069336, "learning_rate": 1.3624289907042787e-07, "loss": 4.1981, "step": 3070 }, { "epoch": 0.9307244843997885, "grad_norm": 30.669658660888672, "learning_rate": 1.2512713915559027e-07, "loss": 4.9341, "step": 3080 }, { "epoch": 0.9337463171413463, "grad_norm": 32.03891372680664, "learning_rate": 1.1447859785403359e-07, "loss": 4.8266, "step": 3090 }, { "epoch": 0.936768149882904, "grad_norm": 18.382429122924805, "learning_rate": 1.0429829574554573e-07, "loss": 3.4044, "step": 3100 }, { "epoch": 0.9397899826244618, "grad_norm": 16.341550827026367, "learning_rate": 9.458720853282977e-08, "loss": 4.1438, "step": 3110 }, { "epoch": 0.9428118153660195, "grad_norm": 32.575286865234375, "learning_rate": 8.534626694799485e-08, "loss": 5.6917, "step": 3120 }, { "epoch": 0.9458336481075772, "grad_norm": 19.515989303588867, "learning_rate": 7.657635666335317e-08, "loss": 2.5437, "step": 3130 }, { "epoch": 0.948855480849135, "grad_norm": 18.81734275817871, "learning_rate": 6.827831820653163e-08, "loss": 2.5297, "step": 3140 }, { "epoch": 0.9518773135906927, "grad_norm": 20.44892120361328, "learning_rate": 6.045294687991643e-08, "loss": 5.3046, "step": 3150 }, { "epoch": 0.9548991463322505, "grad_norm": 30.222261428833008, "learning_rate": 5.310099268443114e-08, "loss": 7.1585, "step": 3160 }, { "epoch": 0.9579209790738082, "grad_norm": 22.93487548828125, "learning_rate": 4.622316024765039e-08, "loss": 3.9296, "step": 3170 }, { "epoch": 0.960942811815366, "grad_norm": 20.129398345947266, "learning_rate": 3.982010875626885e-08, "loss": 3.2971, "step": 3180 }, { "epoch": 0.9639646445569238, "grad_norm": 20.64815330505371, "learning_rate": 3.389245189292622e-08, "loss": 4.1501, "step": 3190 }, { "epoch": 0.9669864772984815, "grad_norm": 19.435129165649414, "learning_rate": 2.8440757777385976e-08, "loss": 4.9552, "step": 3200 }, { "epoch": 0.9700083100400393, "grad_norm": 17.719867706298828, "learning_rate": 2.3465548912088298e-08, "loss": 2.6329, "step": 3210 }, { "epoch": 0.973030142781597, "grad_norm": 21.178937911987305, "learning_rate": 1.896730213207132e-08, "loss": 4.0836, "step": 3220 }, { "epoch": 0.9760519755231548, "grad_norm": 16.906330108642578, "learning_rate": 1.4946448559270964e-08, "loss": 2.397, "step": 3230 }, { "epoch": 0.9790738082647126, "grad_norm": 23.301292419433594, "learning_rate": 1.1403373561199583e-08, "loss": 4.2365, "step": 3240 }, { "epoch": 0.9820956410062703, "grad_norm": 20.07245635986328, "learning_rate": 8.338416714013254e-09, "loss": 3.444, "step": 3250 }, { "epoch": 0.9851174737478281, "grad_norm": 16.27911949157715, "learning_rate": 5.751871769965056e-09, "loss": 5.5038, "step": 3260 }, { "epoch": 0.9881393064893859, "grad_norm": 21.404827117919922, "learning_rate": 3.643986629253138e-09, "loss": 4.1734, "step": 3270 }, { "epoch": 0.9911611392309436, "grad_norm": 32.63972473144531, "learning_rate": 2.014963316257501e-09, "loss": 4.9837, "step": 3280 }, { "epoch": 0.9941829719725013, "grad_norm": 19.831165313720703, "learning_rate": 8.649579601810454e-10, "loss": 4.9867, "step": 3290 }, { "epoch": 0.997204804714059, "grad_norm": 21.53673553466797, "learning_rate": 1.9408078008431587e-10, "loss": 3.3738, "step": 3300 }, { "epoch": 0.9996222709073053, "step": 3308, "total_flos": 1.175877708593234e+19, "train_loss": 4.4771003486744005, "train_runtime": 52781.7624, "train_samples_per_second": 4.013, "train_steps_per_second": 0.063 } ], "logging_steps": 10, "max_steps": 3309, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.175877708593234e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }