{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.1272426517368622,
  "eval_steps": 1000000,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.001272426517368622,
      "grad_norm": 3.5877323150634766,
      "learning_rate": 5.725919328158799e-08,
      "loss": 0.1624,
      "step": 10
    },
    {
      "epoch": 0.002544853034737244,
      "grad_norm": 2.085402488708496,
      "learning_rate": 1.2088051915001908e-07,
      "loss": 0.1406,
      "step": 20
    },
    {
      "epoch": 0.003817279552105866,
      "grad_norm": 2.65560245513916,
      "learning_rate": 1.845018450184502e-07,
      "loss": 0.1539,
      "step": 30
    },
    {
      "epoch": 0.005089706069474488,
      "grad_norm": 4.92278528213501,
      "learning_rate": 2.4812317088688133e-07,
      "loss": 0.1836,
      "step": 40
    },
    {
      "epoch": 0.00636213258684311,
      "grad_norm": 3.515843152999878,
      "learning_rate": 3.117444967553124e-07,
      "loss": 0.1567,
      "step": 50
    },
    {
      "epoch": 0.007634559104211732,
      "grad_norm": 2.803701877593994,
      "learning_rate": 3.7536582262374346e-07,
      "loss": 0.1596,
      "step": 60
    },
    {
      "epoch": 0.008906985621580354,
      "grad_norm": 2.529664993286133,
      "learning_rate": 4.389871484921746e-07,
      "loss": 0.1436,
      "step": 70
    },
    {
      "epoch": 0.010179412138948976,
      "grad_norm": 4.315010070800781,
      "learning_rate": 5.026084743606057e-07,
      "loss": 0.175,
      "step": 80
    },
    {
      "epoch": 0.011451838656317598,
      "grad_norm": 3.158771276473999,
      "learning_rate": 5.662298002290368e-07,
      "loss": 0.1337,
      "step": 90
    },
    {
      "epoch": 0.01272426517368622,
      "grad_norm": 1.9685556888580322,
      "learning_rate": 6.298511260974679e-07,
      "loss": 0.1563,
      "step": 100
    },
    {
      "epoch": 0.013996691691054842,
      "grad_norm": 1.7232635021209717,
      "learning_rate": 6.93472451965899e-07,
      "loss": 0.1262,
      "step": 110
    },
    {
      "epoch": 0.015269118208423464,
      "grad_norm": 3.527454137802124,
      "learning_rate": 7.570937778343301e-07,
      "loss": 0.1413,
      "step": 120
    },
    {
      "epoch": 0.016541544725792084,
      "grad_norm": 3.0089797973632812,
      "learning_rate": 8.207151037027612e-07,
      "loss": 0.1524,
      "step": 130
    },
    {
      "epoch": 0.017813971243160708,
      "grad_norm": 1.7723190784454346,
      "learning_rate": 8.843364295711924e-07,
      "loss": 0.1307,
      "step": 140
    },
    {
      "epoch": 0.01908639776052933,
      "grad_norm": 1.957555890083313,
      "learning_rate": 9.479577554396234e-07,
      "loss": 0.123,
      "step": 150
    },
    {
      "epoch": 0.020358824277897952,
      "grad_norm": 1.7378482818603516,
      "learning_rate": 1.0115790813080545e-06,
      "loss": 0.1708,
      "step": 160
    },
    {
      "epoch": 0.021631250795266573,
      "grad_norm": 1.664908766746521,
      "learning_rate": 1.0752004071764857e-06,
      "loss": 0.1132,
      "step": 170
    },
    {
      "epoch": 0.022903677312635196,
      "grad_norm": 2.3183891773223877,
      "learning_rate": 1.1388217330449168e-06,
      "loss": 0.1107,
      "step": 180
    },
    {
      "epoch": 0.024176103830003817,
      "grad_norm": 2.1844465732574463,
      "learning_rate": 1.2024430589133478e-06,
      "loss": 0.0953,
      "step": 190
    },
    {
      "epoch": 0.02544853034737244,
      "grad_norm": 4.4184417724609375,
      "learning_rate": 1.266064384781779e-06,
      "loss": 0.0922,
      "step": 200
    },
    {
      "epoch": 0.02672095686474106,
      "grad_norm": 2.0239903926849365,
      "learning_rate": 1.32968571065021e-06,
      "loss": 0.0699,
      "step": 210
    },
    {
      "epoch": 0.027993383382109684,
      "grad_norm": 1.2873213291168213,
      "learning_rate": 1.3933070365186412e-06,
      "loss": 0.0717,
      "step": 220
    },
    {
      "epoch": 0.029265809899478305,
      "grad_norm": 1.9870949983596802,
      "learning_rate": 1.4569283623870722e-06,
      "loss": 0.055,
      "step": 230
    },
    {
      "epoch": 0.03053823641684693,
      "grad_norm": 0.75453120470047,
      "learning_rate": 1.5205496882555034e-06,
      "loss": 0.0558,
      "step": 240
    },
    {
      "epoch": 0.03181066293421555,
      "grad_norm": 2.1326417922973633,
      "learning_rate": 1.5841710141239346e-06,
      "loss": 0.0428,
      "step": 250
    },
    {
      "epoch": 0.03308308945158417,
      "grad_norm": 0.530195951461792,
      "learning_rate": 1.6477923399923656e-06,
      "loss": 0.0344,
      "step": 260
    },
    {
      "epoch": 0.034355515968952796,
      "grad_norm": 0.25554534792900085,
      "learning_rate": 1.7114136658607965e-06,
      "loss": 0.0342,
      "step": 270
    },
    {
      "epoch": 0.035627942486321416,
      "grad_norm": 2.5654115676879883,
      "learning_rate": 1.7750349917292275e-06,
      "loss": 0.0464,
      "step": 280
    },
    {
      "epoch": 0.03690036900369004,
      "grad_norm": 0.8986497521400452,
      "learning_rate": 1.838656317597659e-06,
      "loss": 0.0359,
      "step": 290
    },
    {
      "epoch": 0.03817279552105866,
      "grad_norm": 0.4290733337402344,
      "learning_rate": 1.90227764346609e-06,
      "loss": 0.0269,
      "step": 300
    },
    {
      "epoch": 0.039445222038427284,
      "grad_norm": 1.9061795473098755,
      "learning_rate": 1.965898969334521e-06,
      "loss": 0.0336,
      "step": 310
    },
    {
      "epoch": 0.040717648555795904,
      "grad_norm": 2.097940444946289,
      "learning_rate": 2.029520295202952e-06,
      "loss": 0.0527,
      "step": 320
    },
    {
      "epoch": 0.041990075073164525,
      "grad_norm": 2.3401193618774414,
      "learning_rate": 2.093141621071383e-06,
      "loss": 0.0254,
      "step": 330
    },
    {
      "epoch": 0.043262501590533145,
      "grad_norm": 0.4466155171394348,
      "learning_rate": 2.1567629469398143e-06,
      "loss": 0.0365,
      "step": 340
    },
    {
      "epoch": 0.044534928107901765,
      "grad_norm": 1.4632848501205444,
      "learning_rate": 2.2203842728082453e-06,
      "loss": 0.037,
      "step": 350
    },
    {
      "epoch": 0.04580735462527039,
      "grad_norm": 1.3355021476745605,
      "learning_rate": 2.2840055986766767e-06,
      "loss": 0.0409,
      "step": 360
    },
    {
      "epoch": 0.04707978114263901,
      "grad_norm": 2.0859885215759277,
      "learning_rate": 2.3476269245451077e-06,
      "loss": 0.0305,
      "step": 370
    },
    {
      "epoch": 0.04835220766000763,
      "grad_norm": 1.3638544082641602,
      "learning_rate": 2.4112482504135386e-06,
      "loss": 0.0331,
      "step": 380
    },
    {
      "epoch": 0.04962463417737625,
      "grad_norm": 0.41071686148643494,
      "learning_rate": 2.47486957628197e-06,
      "loss": 0.03,
      "step": 390
    },
    {
      "epoch": 0.05089706069474488,
      "grad_norm": 0.8771544098854065,
      "learning_rate": 2.538490902150401e-06,
      "loss": 0.0306,
      "step": 400
    },
    {
      "epoch": 0.0521694872121135,
      "grad_norm": 0.6460317373275757,
      "learning_rate": 2.602112228018832e-06,
      "loss": 0.0311,
      "step": 410
    },
    {
      "epoch": 0.05344191372948212,
      "grad_norm": 0.19301742315292358,
      "learning_rate": 2.6657335538872634e-06,
      "loss": 0.0138,
      "step": 420
    },
    {
      "epoch": 0.05471434024685074,
      "grad_norm": 2.0089757442474365,
      "learning_rate": 2.7293548797556944e-06,
      "loss": 0.0255,
      "step": 430
    },
    {
      "epoch": 0.05598676676421937,
      "grad_norm": 0.8393608927726746,
      "learning_rate": 2.7929762056241254e-06,
      "loss": 0.0206,
      "step": 440
    },
    {
      "epoch": 0.05725919328158799,
      "grad_norm": 0.8323113322257996,
      "learning_rate": 2.8565975314925564e-06,
      "loss": 0.0148,
      "step": 450
    },
    {
      "epoch": 0.05853161979895661,
      "grad_norm": 1.5697057247161865,
      "learning_rate": 2.9202188573609878e-06,
      "loss": 0.0258,
      "step": 460
    },
    {
      "epoch": 0.05980404631632523,
      "grad_norm": 0.22127273678779602,
      "learning_rate": 2.9838401832294188e-06,
      "loss": 0.0195,
      "step": 470
    },
    {
      "epoch": 0.06107647283369386,
      "grad_norm": 1.421794056892395,
      "learning_rate": 3.0474615090978497e-06,
      "loss": 0.0202,
      "step": 480
    },
    {
      "epoch": 0.06234889935106248,
      "grad_norm": 2.1492910385131836,
      "learning_rate": 3.1110828349662807e-06,
      "loss": 0.0244,
      "step": 490
    },
    {
      "epoch": 0.0636213258684311,
      "grad_norm": 0.09726544469594955,
      "learning_rate": 3.174704160834712e-06,
      "loss": 0.0257,
      "step": 500
    },
    {
      "epoch": 0.06489375238579972,
      "grad_norm": 0.5567276477813721,
      "learning_rate": 3.2383254867031427e-06,
      "loss": 0.0103,
      "step": 510
    },
    {
      "epoch": 0.06616617890316834,
      "grad_norm": 0.857915997505188,
      "learning_rate": 3.301946812571574e-06,
      "loss": 0.0198,
      "step": 520
    },
    {
      "epoch": 0.06743860542053697,
      "grad_norm": 2.1099727153778076,
      "learning_rate": 3.3655681384400055e-06,
      "loss": 0.0169,
      "step": 530
    },
    {
      "epoch": 0.06871103193790559,
      "grad_norm": 0.35235458612442017,
      "learning_rate": 3.429189464308436e-06,
      "loss": 0.0169,
      "step": 540
    },
    {
      "epoch": 0.0699834584552742,
      "grad_norm": 0.27329525351524353,
      "learning_rate": 3.4928107901768675e-06,
      "loss": 0.025,
      "step": 550
    },
    {
      "epoch": 0.07125588497264283,
      "grad_norm": 1.0509276390075684,
      "learning_rate": 3.556432116045299e-06,
      "loss": 0.0143,
      "step": 560
    },
    {
      "epoch": 0.07252831149001145,
      "grad_norm": 2.021751880645752,
      "learning_rate": 3.6200534419137294e-06,
      "loss": 0.0248,
      "step": 570
    },
    {
      "epoch": 0.07380073800738007,
      "grad_norm": 0.9538995027542114,
      "learning_rate": 3.683674767782161e-06,
      "loss": 0.0234,
      "step": 580
    },
    {
      "epoch": 0.0750731645247487,
      "grad_norm": 1.5769826173782349,
      "learning_rate": 3.7472960936505914e-06,
      "loss": 0.0173,
      "step": 590
    },
    {
      "epoch": 0.07634559104211731,
      "grad_norm": 1.1721569299697876,
      "learning_rate": 3.810917419519023e-06,
      "loss": 0.0283,
      "step": 600
    },
    {
      "epoch": 0.07761801755948594,
      "grad_norm": 0.1278325915336609,
      "learning_rate": 3.874538745387454e-06,
      "loss": 0.01,
      "step": 610
    },
    {
      "epoch": 0.07889044407685457,
      "grad_norm": 1.5084667205810547,
      "learning_rate": 3.938160071255885e-06,
      "loss": 0.0152,
      "step": 620
    },
    {
      "epoch": 0.08016287059422318,
      "grad_norm": 0.1045418232679367,
      "learning_rate": 4.001781397124316e-06,
      "loss": 0.0196,
      "step": 630
    },
    {
      "epoch": 0.08143529711159181,
      "grad_norm": 0.1617429554462433,
      "learning_rate": 4.065402722992747e-06,
      "loss": 0.0215,
      "step": 640
    },
    {
      "epoch": 0.08270772362896042,
      "grad_norm": 0.11483286321163177,
      "learning_rate": 4.129024048861178e-06,
      "loss": 0.0206,
      "step": 650
    },
    {
      "epoch": 0.08398015014632905,
      "grad_norm": 0.3859212100505829,
      "learning_rate": 4.1926453747296096e-06,
      "loss": 0.0067,
      "step": 660
    },
    {
      "epoch": 0.08525257666369768,
      "grad_norm": 0.23510600626468658,
      "learning_rate": 4.25626670059804e-06,
      "loss": 0.0213,
      "step": 670
    },
    {
      "epoch": 0.08652500318106629,
      "grad_norm": 1.86430823802948,
      "learning_rate": 4.3198880264664715e-06,
      "loss": 0.0217,
      "step": 680
    },
    {
      "epoch": 0.08779742969843492,
      "grad_norm": 1.0131075382232666,
      "learning_rate": 4.383509352334903e-06,
      "loss": 0.0143,
      "step": 690
    },
    {
      "epoch": 0.08906985621580353,
      "grad_norm": 0.29921114444732666,
      "learning_rate": 4.4471306782033335e-06,
      "loss": 0.0169,
      "step": 700
    },
    {
      "epoch": 0.09034228273317216,
      "grad_norm": 0.39632511138916016,
      "learning_rate": 4.510752004071765e-06,
      "loss": 0.0091,
      "step": 710
    },
    {
      "epoch": 0.09161470925054079,
      "grad_norm": 0.1986786276102066,
      "learning_rate": 4.5743733299401955e-06,
      "loss": 0.0077,
      "step": 720
    },
    {
      "epoch": 0.0928871357679094,
      "grad_norm": 1.939620852470398,
      "learning_rate": 4.637994655808627e-06,
      "loss": 0.0182,
      "step": 730
    },
    {
      "epoch": 0.09415956228527803,
      "grad_norm": 2.1149661540985107,
      "learning_rate": 4.701615981677058e-06,
      "loss": 0.0235,
      "step": 740
    },
    {
      "epoch": 0.09543198880264665,
      "grad_norm": 0.03329145163297653,
      "learning_rate": 4.765237307545489e-06,
      "loss": 0.0172,
      "step": 750
    },
    {
      "epoch": 0.09670441532001527,
      "grad_norm": 1.470470905303955,
      "learning_rate": 4.82885863341392e-06,
      "loss": 0.0178,
      "step": 760
    },
    {
      "epoch": 0.0979768418373839,
      "grad_norm": 1.055908441543579,
      "learning_rate": 4.892479959282352e-06,
      "loss": 0.0133,
      "step": 770
    },
    {
      "epoch": 0.0992492683547525,
      "grad_norm": 2.2330827713012695,
      "learning_rate": 4.956101285150783e-06,
      "loss": 0.0102,
      "step": 780
    },
    {
      "epoch": 0.10052169487212113,
      "grad_norm": 0.20729655027389526,
      "learning_rate": 5.019722611019214e-06,
      "loss": 0.0081,
      "step": 790
    },
    {
      "epoch": 0.10179412138948976,
      "grad_norm": 0.1920371800661087,
      "learning_rate": 5.083343936887645e-06,
      "loss": 0.0081,
      "step": 800
    },
    {
      "epoch": 0.10306654790685837,
      "grad_norm": 0.5931546092033386,
      "learning_rate": 5.1469652627560764e-06,
      "loss": 0.0313,
      "step": 810
    },
    {
      "epoch": 0.104338974424227,
      "grad_norm": 1.6505730152130127,
      "learning_rate": 5.210586588624507e-06,
      "loss": 0.01,
      "step": 820
    },
    {
      "epoch": 0.10561140094159563,
      "grad_norm": 1.3409397602081299,
      "learning_rate": 5.274207914492938e-06,
      "loss": 0.0182,
      "step": 830
    },
    {
      "epoch": 0.10688382745896424,
      "grad_norm": 0.2508680820465088,
      "learning_rate": 5.33782924036137e-06,
      "loss": 0.0108,
      "step": 840
    },
    {
      "epoch": 0.10815625397633287,
      "grad_norm": 0.7329053282737732,
      "learning_rate": 5.401450566229801e-06,
      "loss": 0.0168,
      "step": 850
    },
    {
      "epoch": 0.10942868049370148,
      "grad_norm": 0.12235169112682343,
      "learning_rate": 5.465071892098232e-06,
      "loss": 0.0149,
      "step": 860
    },
    {
      "epoch": 0.11070110701107011,
      "grad_norm": 0.058564718812704086,
      "learning_rate": 5.528693217966663e-06,
      "loss": 0.0148,
      "step": 870
    },
    {
      "epoch": 0.11197353352843874,
      "grad_norm": 0.6761524081230164,
      "learning_rate": 5.592314543835094e-06,
      "loss": 0.0131,
      "step": 880
    },
    {
      "epoch": 0.11324596004580735,
      "grad_norm": 0.7170369029045105,
      "learning_rate": 5.655935869703525e-06,
      "loss": 0.0121,
      "step": 890
    },
    {
      "epoch": 0.11451838656317598,
      "grad_norm": 0.5446187257766724,
      "learning_rate": 5.7195571955719566e-06,
      "loss": 0.0126,
      "step": 900
    },
    {
      "epoch": 0.1157908130805446,
      "grad_norm": 0.03237615525722504,
      "learning_rate": 5.783178521440387e-06,
      "loss": 0.0043,
      "step": 910
    },
    {
      "epoch": 0.11706323959791322,
      "grad_norm": 0.4705624580383301,
      "learning_rate": 5.8467998473088185e-06,
      "loss": 0.0122,
      "step": 920
    },
    {
      "epoch": 0.11833566611528185,
      "grad_norm": 0.8848108053207397,
      "learning_rate": 5.91042117317725e-06,
      "loss": 0.0184,
      "step": 930
    },
    {
      "epoch": 0.11960809263265046,
      "grad_norm": 0.5390125513076782,
      "learning_rate": 5.9740424990456805e-06,
      "loss": 0.0093,
      "step": 940
    },
    {
      "epoch": 0.12088051915001909,
      "grad_norm": 0.38235408067703247,
      "learning_rate": 6.037663824914112e-06,
      "loss": 0.0264,
      "step": 950
    },
    {
      "epoch": 0.12215294566738771,
      "grad_norm": 0.07413897663354874,
      "learning_rate": 6.1012851507825425e-06,
      "loss": 0.0154,
      "step": 960
    },
    {
      "epoch": 0.12342537218475633,
      "grad_norm": 0.08041392266750336,
      "learning_rate": 6.164906476650974e-06,
      "loss": 0.0094,
      "step": 970
    },
    {
      "epoch": 0.12469779870212495,
      "grad_norm": 0.4950060546398163,
      "learning_rate": 6.228527802519405e-06,
      "loss": 0.0078,
      "step": 980
    },
    {
      "epoch": 0.12597022521949358,
      "grad_norm": 0.028932543471455574,
      "learning_rate": 6.292149128387837e-06,
      "loss": 0.0115,
      "step": 990
    },
    {
      "epoch": 0.1272426517368622,
      "grad_norm": 1.5096838474273682,
      "learning_rate": 6.355770454256267e-06,
      "loss": 0.0165,
      "step": 1000
    }
  ],
  "logging_steps": 10,
  "max_steps": 78590,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.445812719283077e+17,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}
|
|