ANLP-Final-Project
/

m0-oplora-lr

@@ -4,236 +4,866 @@
   "best_model_checkpoint": null,
   "epoch": 1.0,
   "eval_steps": 500,
-  "global_step": 15000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.03333333333333333,
-      "grad_norm": 0.8437663912773132,
-      "learning_rate": 0.00029999160495301487,
-      "loss": 0.4576,
       "step": 500
     },
     {
-      "epoch": 0.06666666666666667,
-      "grad_norm": 1.3834174871444702,
-      "learning_rate": 0.00029894738121610755,
-      "loss": 0.3618,
       "step": 1000
     },
     {
-      "epoch": 0.1,
-      "grad_norm": 0.8362070322036743,
-      "learning_rate": 0.0002961688552258895,
-      "loss": 0.3312,
       "step": 1500
     },
     {
-      "epoch": 0.13333333333333333,
-      "grad_norm": 1.8011195659637451,
-      "learning_rate": 0.0002916883793731605,
-      "loss": 0.3265,
       "step": 2000
     },
     {
-      "epoch": 0.16666666666666666,
-      "grad_norm": 1.187129259109497,
-      "learning_rate": 0.0002855581230838202,
-      "loss": 0.3176,
       "step": 2500
     },
     {
-      "epoch": 0.2,
-      "grad_norm": 0.6343923807144165,
-      "learning_rate": 0.000277849465372452,
-      "loss": 0.3267,
       "step": 3000
     },
     {
-      "epoch": 0.23333333333333334,
-      "grad_norm": 0.5139018297195435,
-      "learning_rate": 0.00026865216372475085,
-      "loss": 0.3304,
       "step": 3500
     },
     {
-      "epoch": 0.26666666666666666,
-      "grad_norm": 0.5840966701507568,
-      "learning_rate": 0.0002580733089860996,
-      "loss": 0.3174,
       "step": 4000
     },
     {
-      "epoch": 0.3,
-      "grad_norm": 1.0480272769927979,
-      "learning_rate": 0.0002462360784252821,
-      "loss": 0.3131,
       "step": 4500
     },
     {
-      "epoch": 0.3333333333333333,
-      "grad_norm": 0.768731415271759,
-      "learning_rate": 0.00023327830149231583,
-      "loss": 0.3005,
       "step": 5000
     },
     {
-      "epoch": 0.36666666666666664,
-      "grad_norm": 0.8862756490707397,
-      "learning_rate": 0.00021935085497032568,
-      "loss": 0.2977,
       "step": 5500
     },
     {
-      "epoch": 0.4,
-      "grad_norm": 1.8197040557861328,
-      "learning_rate": 0.00020461590620786605,
-      "loss": 0.3006,
       "step": 6000
     },
     {
-      "epoch": 0.43333333333333335,
-      "grad_norm": 0.5644539594650269,
-      "learning_rate": 0.00018924502488701202,
-      "loss": 0.2891,
       "step": 6500
     },
     {
-      "epoch": 0.4666666666666667,
-      "grad_norm": 0.7333141565322876,
-      "learning_rate": 0.00017341718531326979,
-      "loss": 0.2958,
       "step": 7000
     },
     {
-      "epoch": 0.5,
-      "grad_norm": 0.8840310573577881,
-      "learning_rate": 0.00015731668248809323,
-      "loss": 0.2914,
       "step": 7500
     },
     {
-      "epoch": 0.5333333333333333,
-      "grad_norm": 0.7415375113487244,
-      "learning_rate": 0.0001411309862286835,
-      "loss": 0.284,
       "step": 8000
     },
     {
-      "epoch": 0.5666666666666667,
-      "grad_norm": 1.0699294805526733,
-      "learning_rate": 0.00012504855832110804,
-      "loss": 0.2776,
       "step": 8500
     },
     {
-      "epoch": 0.6,
-      "grad_norm": 1.0262274742126465,
-      "learning_rate": 0.00010925665812320933,
-      "loss": 0.2751,
       "step": 9000
     },
     {
-      "epoch": 0.6333333333333333,
-      "grad_norm": 1.2797510623931885,
-      "learning_rate": 9.393916216825465e-05,
-      "loss": 0.2797,
       "step": 9500
     },
     {
-      "epoch": 0.6666666666666666,
-      "grad_norm": 0.8398839831352234,
-      "learning_rate": 7.927442315726411e-05,
-      "loss": 0.2655,
       "step": 10000
     },
     {
-      "epoch": 0.7,
-      "grad_norm": 0.9251325726509094,
-      "learning_rate": 6.543319326931815e-05,
-      "loss": 0.2605,
       "step": 10500
     },
     {
-      "epoch": 0.7333333333333333,
-      "grad_norm": 0.6823338866233826,
-      "learning_rate": 5.257663597024785e-05,
-      "loss": 0.2674,
       "step": 11000
     },
     {
-      "epoch": 0.7666666666666667,
-      "grad_norm": 0.7772260904312134,
-      "learning_rate": 4.085444946965953e-05,
-      "loss": 0.2624,
       "step": 11500
     },
     {
-      "epoch": 0.8,
-      "grad_norm": 0.6518853902816772,
-      "learning_rate": 3.040312367624248e-05,
-      "loss": 0.2596,
       "step": 12000
     },
     {
-      "epoch": 0.8333333333333334,
-      "grad_norm": 0.6722842454910278,
-      "learning_rate": 2.1344350946892218e-05,
-      "loss": 0.2509,
       "step": 12500
     },
     {
-      "epoch": 0.8666666666666667,
-      "grad_norm": 1.5721765756607056,
-      "learning_rate": 1.3783609134448331e-05,
-      "loss": 0.25,
       "step": 13000
     },
     {
-      "epoch": 0.9,
-      "grad_norm": 1.592251181602478,
-      "learning_rate": 7.808933432648067e-06,
-      "loss": 0.249,
       "step": 13500
     },
     {
-      "epoch": 0.9333333333333333,
-      "grad_norm": 1.5700002908706665,
-      "learning_rate": 3.4898913185952726e-06,
-      "loss": 0.2618,
       "step": 14000
     },
     {
-      "epoch": 0.9666666666666667,
-      "grad_norm": 1.067083477973938,
-      "learning_rate": 8.767725282315785e-07,
-      "loss": 0.2653,
       "step": 14500
     },
     {
-      "epoch": 1.0,
-      "grad_norm": 0.734412670135498,
-      "learning_rate": 3.496511979950156e-12,
-      "loss": 0.2583,
       "step": 15000
     },
     {
       "epoch": 1.0,
-      "step": 15000,
-      "total_flos": 1.6839046112147866e+17,
-      "train_loss": 0.29414576873779297,
-      "train_runtime": 2822.8333,
-      "train_samples_per_second": 5.314,
-      "train_steps_per_second": 5.314
     }
   ],
   "logging_steps": 500,
-  "max_steps": 15000,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
-  "save_steps": 5000,
   "stateful_callbacks": {
     "TrainerControl": {
       "args": {
@@ -246,7 +876,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.6839046112147866e+17,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

   "best_model_checkpoint": null,
   "epoch": 1.0,
   "eval_steps": 500,
+  "global_step": 60000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.008333333333333333,
+      "grad_norm": 1.734326958656311,
+      "learning_rate": 8.316666666666665e-05,
+      "loss": 0.5142,
       "step": 500
     },
     {
+      "epoch": 0.016666666666666666,
+      "grad_norm": 0.7779222726821899,
+      "learning_rate": 0.0001665,
+      "loss": 0.3832,
       "step": 1000
     },
     {
+      "epoch": 0.025,
+      "grad_norm": 1.3082212209701538,
+      "learning_rate": 0.0002498333333333333,
+      "loss": 0.3418,
       "step": 1500
     },
     {
+      "epoch": 0.03333333333333333,
+      "grad_norm": 1.115393877029419,
+      "learning_rate": 0.000299991345997501,
+      "loss": 0.3593,
       "step": 2000
     },
     {
+      "epoch": 0.041666666666666664,
+      "grad_norm": 1.1185020208358765,
+      "learning_rate": 0.0002998932377135537,
+      "loss": 0.3316,
       "step": 2500
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 0.44978195428848267,
+      "learning_rate": 0.0002996859478276907,
+      "loss": 0.3364,
       "step": 3000
     },
     {
+      "epoch": 0.058333333333333334,
+      "grad_norm": 1.157468318939209,
+      "learning_rate": 0.0002993696273289902,
+      "loss": 0.3145,
       "step": 3500
     },
     {
+      "epoch": 0.06666666666666667,
+      "grad_norm": 1.247792363166809,
+      "learning_rate": 0.00029894450662396884,
+      "loss": 0.3201,
       "step": 4000
     },
     {
+      "epoch": 0.075,
+      "grad_norm": 0.9549034833908081,
+      "learning_rate": 0.00029841089536875444,
+      "loss": 0.3189,
       "step": 4500
     },
     {
+      "epoch": 0.08333333333333333,
+      "grad_norm": 0.8676638007164001,
+      "learning_rate": 0.00029776918224353393,
+      "loss": 0.3224,
       "step": 5000
     },
     {
+      "epoch": 0.09166666666666666,
+      "grad_norm": 1.1023201942443848,
+      "learning_rate": 0.00029701983466944016,
+      "loss": 0.3048,
       "step": 5500
     },
     {
+      "epoch": 0.1,
+      "grad_norm": 1.559238076210022,
+      "learning_rate": 0.00029616339846808443,
+      "loss": 0.3166,
       "step": 6000
     },
     {
+      "epoch": 0.10833333333333334,
+      "grad_norm": 0.6283588409423828,
+      "learning_rate": 0.0002952004974639823,
+      "loss": 0.3118,
       "step": 6500
     },
     {
+      "epoch": 0.11666666666666667,
+      "grad_norm": 0.8453167080879211,
+      "learning_rate": 0.0002941318330301624,
+      "loss": 0.2962,
       "step": 7000
     },
     {
+      "epoch": 0.125,
+      "grad_norm": 1.476069688796997,
+      "learning_rate": 0.00029295818357728945,
+      "loss": 0.3151,
       "step": 7500
     },
     {
+      "epoch": 0.13333333333333333,
+      "grad_norm": 0.8654290437698364,
+      "learning_rate": 0.000291680403986673,
+      "loss": 0.297,
       "step": 8000
     },
     {
+      "epoch": 0.14166666666666666,
+      "grad_norm": 1.922337293624878,
+      "learning_rate": 0.00029029942498757556,
+      "loss": 0.3089,
       "step": 8500
     },
     {
+      "epoch": 0.15,
+      "grad_norm": 1.667548418045044,
+      "learning_rate": 0.00028881625247927344,
+      "loss": 0.295,
       "step": 9000
     },
     {
+      "epoch": 0.15833333333333333,
+      "grad_norm": 0.7707281112670898,
+      "learning_rate": 0.0002872319667983639,
+      "loss": 0.2889,
       "step": 9500
     },
     {
+      "epoch": 0.16666666666666666,
+      "grad_norm": 1.3987542390823364,
+      "learning_rate": 0.0002855477219318523,
+      "loss": 0.2831,
       "step": 10000
     },
     {
+      "epoch": 0.175,
+      "grad_norm": 0.8910896182060242,
+      "learning_rate": 0.0002837647446765933,
+      "loss": 0.2928,
       "step": 10500
     },
     {
+      "epoch": 0.18333333333333332,
+      "grad_norm": 1.34634268283844,
+      "learning_rate": 0.0002818843337456967,
+      "loss": 0.2866,
       "step": 11000
     },
     {
+      "epoch": 0.19166666666666668,
+      "grad_norm": 2.1172733306884766,
+      "learning_rate": 0.00027990785882255076,
+      "loss": 0.2869,
       "step": 11500
     },
     {
+      "epoch": 0.2,
+      "grad_norm": 1.0616984367370605,
+      "learning_rate": 0.0002778367595631503,
+      "loss": 0.2924,
       "step": 12000
     },
     {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.6745150089263916,
+      "learning_rate": 0.0002756725445474574,
+      "loss": 0.289,
       "step": 12500
     },
     {
+      "epoch": 0.21666666666666667,
+      "grad_norm": 0.9809184670448303,
+      "learning_rate": 0.00027341679018055815,
+      "loss": 0.2821,
       "step": 13000
     },
     {
+      "epoch": 0.225,
+      "grad_norm": 0.9942313432693481,
+      "learning_rate": 0.0002710711395444158,
+      "loss": 0.2867,
       "step": 13500
     },
     {
+      "epoch": 0.23333333333333334,
+      "grad_norm": 0.7905780673027039,
+      "learning_rate": 0.00026863730120105694,
+      "loss": 0.2965,
       "step": 14000
     },
     {
+      "epoch": 0.24166666666666667,
+      "grad_norm": 0.4196673631668091,
+      "learning_rate": 0.0002661170479480619,
+      "loss": 0.2869,
       "step": 14500
     },
     {
+      "epoch": 0.25,
+      "grad_norm": 0.975186824798584,
+      "learning_rate": 0.0002635122155272666,
+      "loss": 0.276,
       "step": 15000
     },
+    {
+      "epoch": 0.25833333333333336,
+      "grad_norm": 1.2537420988082886,
+      "learning_rate": 0.00026082470128761597,
+      "loss": 0.2794,
+      "step": 15500
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 1.1740987300872803,
+      "learning_rate": 0.0002580564628031429,
+      "loss": 0.2794,
+      "step": 16000
+    },
+    {
+      "epoch": 0.275,
+      "grad_norm": 1.9777884483337402,
+      "learning_rate": 0.00025520951644707966,
+      "loss": 0.2871,
+      "step": 16500
+    },
+    {
+      "epoch": 0.2833333333333333,
+      "grad_norm": 1.662049412727356,
+      "learning_rate": 0.0002522859359231401,
+      "loss": 0.288,
+      "step": 17000
+    },
+    {
+      "epoch": 0.2916666666666667,
+      "grad_norm": 0.8431689739227295,
+      "learning_rate": 0.0002492878507550424,
+      "loss": 0.2752,
+      "step": 17500
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.1322907209396362,
+      "learning_rate": 0.00024621744473537365,
+      "loss": 0.2834,
+      "step": 18000
+    },
+    {
+      "epoch": 0.30833333333333335,
+      "grad_norm": 0.751252293586731,
+      "learning_rate": 0.00024307695433492364,
+      "loss": 0.2817,
+      "step": 18500
+    },
+    {
+      "epoch": 0.31666666666666665,
+      "grad_norm": 0.9835771918296814,
+      "learning_rate": 0.00023986866707364964,
+      "loss": 0.2651,
+      "step": 19000
+    },
+    {
+      "epoch": 0.325,
+      "grad_norm": 1.3762648105621338,
+      "learning_rate": 0.0002365949198544562,
+      "loss": 0.2679,
+      "step": 19500
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 1.1998765468597412,
+      "learning_rate": 0.0002332580972610051,
+      "loss": 0.2751,
+      "step": 20000
+    },
+    {
+      "epoch": 0.3416666666666667,
+      "grad_norm": 0.4611862599849701,
+      "learning_rate": 0.00022986062982079458,
+      "loss": 0.2779,
+      "step": 20500
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.7198874354362488,
+      "learning_rate": 0.00022640499223477339,
+      "loss": 0.2666,
+      "step": 21000
+    },
+    {
+      "epoch": 0.35833333333333334,
+      "grad_norm": 1.37840735912323,
+      "learning_rate": 0.000222893701574779,
+      "loss": 0.2698,
+      "step": 21500
+    },
+    {
+      "epoch": 0.36666666666666664,
+      "grad_norm": 1.549021601676941,
+      "learning_rate": 0.00021932931545011323,
+      "loss": 0.2672,
+      "step": 22000
+    },
+    {
+      "epoch": 0.375,
+      "grad_norm": 1.489937663078308,
+      "learning_rate": 0.00021571443014459024,
+      "loss": 0.2611,
+      "step": 22500
+    },
+    {
+      "epoch": 0.38333333333333336,
+      "grad_norm": 1.0012102127075195,
+      "learning_rate": 0.0002120516787254146,
+      "loss": 0.2613,
+      "step": 23000
+    },
+    {
+      "epoch": 0.39166666666666666,
+      "grad_norm": 0.3051554560661316,
+      "learning_rate": 0.00020834372912526596,
+      "loss": 0.2662,
+      "step": 23500
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.6066364049911499,
+      "learning_rate": 0.0002045932821989884,
+      "loss": 0.2653,
+      "step": 24000
+    },
+    {
+      "epoch": 0.4083333333333333,
+      "grad_norm": 0.881126344203949,
+      "learning_rate": 0.000200803069756299,
+      "loss": 0.2563,
+      "step": 24500
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.9933512210845947,
+      "learning_rate": 0.0001969758525719492,
+      "loss": 0.2592,
+      "step": 25000
+    },
+    {
+      "epoch": 0.425,
+      "grad_norm": 0.8219375610351562,
+      "learning_rate": 0.00019311441837478816,
+      "loss": 0.272,
+      "step": 25500
+    },
+    {
+      "epoch": 0.43333333333333335,
+      "grad_norm": 1.2546143531799316,
+      "learning_rate": 0.0001892215798171928,
+      "loss": 0.2567,
+      "step": 26000
+    },
+    {
+      "epoch": 0.44166666666666665,
+      "grad_norm": 1.4476374387741089,
+      "learning_rate": 0.00018530017242634363,
+      "loss": 0.2588,
+      "step": 26500
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.356984257698059,
+      "learning_rate": 0.0001813530525388389,
+      "loss": 0.2556,
+      "step": 27000
+    },
+    {
+      "epoch": 0.4583333333333333,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00017738309522015073,
+      "loss": 0.2477,
+      "step": 27500
+    },
+    {
+      "epoch": 0.4666666666666667,
+      "grad_norm": 1.395007848739624,
+      "learning_rate": 0.00017339319217043995,
+      "loss": 0.2498,
+      "step": 28000
+    },
+    {
+      "epoch": 0.475,
+      "grad_norm": 0.7972742319107056,
+      "learning_rate": 0.00016938624961825358,
+      "loss": 0.2473,
+      "step": 28500
+    },
+    {
+      "epoch": 0.48333333333333334,
+      "grad_norm": 0.8678386807441711,
+      "learning_rate": 0.0001653651862036404,
+      "loss": 0.2534,
+      "step": 29000
+    },
+    {
+      "epoch": 0.49166666666666664,
+      "grad_norm": 1.0032129287719727,
+      "learning_rate": 0.00016133293085222585,
+      "loss": 0.2458,
+      "step": 29500
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.9749704599380493,
+      "learning_rate": 0.00015729242064179487,
+      "loss": 0.2516,
+      "step": 30000
+    },
+    {
+      "epoch": 0.5083333333333333,
+      "grad_norm": 1.6964690685272217,
+      "learning_rate": 0.0001532465986629368,
+      "loss": 0.2452,
+      "step": 30500
+    },
+    {
+      "epoch": 0.5166666666666667,
+      "grad_norm": 1.2840642929077148,
+      "learning_rate": 0.0001491984118753108,
+      "loss": 0.247,
+      "step": 31000
+    },
+    {
+      "epoch": 0.525,
+      "grad_norm": 1.1751660108566284,
+      "learning_rate": 0.00014515080896109272,
+      "loss": 0.2561,
+      "step": 31500
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.4990275204181671,
+      "learning_rate": 0.00014110673817716756,
+      "loss": 0.2429,
+      "step": 32000
+    },
+    {
+      "epoch": 0.5416666666666666,
+      "grad_norm": 1.0772459506988525,
+      "learning_rate": 0.0001370691452076314,
+      "loss": 0.2622,
+      "step": 32500
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.0913983583450317,
+      "learning_rate": 0.00013304097101816766,
+      "loss": 0.2406,
+      "step": 33000
+    },
+    {
+      "epoch": 0.5583333333333333,
+      "grad_norm": 0.7937314510345459,
+      "learning_rate": 0.0001290251497138601,
+      "loss": 0.2317,
+      "step": 33500
+    },
+    {
+      "epoch": 0.5666666666666667,
+      "grad_norm": 0.866894006729126,
+      "learning_rate": 0.0001250246064020032,
+      "loss": 0.2381,
+      "step": 34000
+    },
+    {
+      "epoch": 0.575,
+      "grad_norm": 0.904768705368042,
+      "learning_rate": 0.000121042255061466,
+      "loss": 0.2416,
+      "step": 34500
+    },
+    {
+      "epoch": 0.5833333333333334,
+      "grad_norm": 2.5710256099700928,
+      "learning_rate": 0.00011708099642016254,
+      "loss": 0.2501,
+      "step": 35000
+    },
+    {
+      "epoch": 0.5916666666666667,
+      "grad_norm": 0.9502021074295044,
+      "learning_rate": 0.00011314371584217354,
+      "loss": 0.2462,
+      "step": 35500
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.1951791048049927,
+      "learning_rate": 0.00010923328122605982,
+      "loss": 0.2437,
+      "step": 36000
+    },
+    {
+      "epoch": 0.6083333333333333,
+      "grad_norm": 1.0552036762237549,
+      "learning_rate": 0.00010535254091589667,
+      "loss": 0.2426,
+      "step": 36500
+    },
+    {
+      "epoch": 0.6166666666666667,
+      "grad_norm": 1.1212034225463867,
+      "learning_rate": 0.00010150432162655218,
+      "loss": 0.2454,
+      "step": 37000
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.7416337132453918,
+      "learning_rate": 9.769142638472033e-05,
+      "loss": 0.2401,
+      "step": 37500
+    },
+    {
+      "epoch": 0.6333333333333333,
+      "grad_norm": 0.6174165606498718,
+      "learning_rate": 9.391663248720767e-05,
+      "loss": 0.2462,
+      "step": 38000
+    },
+    {
+      "epoch": 0.6416666666666667,
+      "grad_norm": 1.7195667028427124,
+      "learning_rate": 9.01826894779624e-05,
+      "loss": 0.2404,
+      "step": 38500
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.8892576098442078,
+      "learning_rate": 8.649231714531805e-05,
+      "loss": 0.237,
+      "step": 39000
+    },
+    {
+      "epoch": 0.6583333333333333,
+      "grad_norm": 0.5483108758926392,
+      "learning_rate": 8.284820354091123e-05,
+      "loss": 0.2458,
+      "step": 39500
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.9700618982315063,
+      "learning_rate": 7.925300302171602e-05,
+      "loss": 0.24,
+      "step": 40000
+    },
+    {
+      "epoch": 0.675,
+      "grad_norm": 0.8393011689186096,
+      "learning_rate": 7.570933431662156e-05,
+      "loss": 0.2334,
+      "step": 40500
+    },
+    {
+      "epoch": 0.6833333333333333,
+      "grad_norm": 1.7267228364944458,
+      "learning_rate": 7.221977861896105e-05,
+      "loss": 0.2454,
+      "step": 41000
+    },
+    {
+      "epoch": 0.6916666666666667,
+      "grad_norm": 1.5230600833892822,
+      "learning_rate": 6.878687770638148e-05,
+      "loss": 0.2291,
+      "step": 41500
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.8894439935684204,
+      "learning_rate": 6.541313208942333e-05,
+      "loss": 0.2304,
+      "step": 42000
+    },
+    {
+      "epoch": 0.7083333333333334,
+      "grad_norm": 0.6637595891952515,
+      "learning_rate": 6.210099919015931e-05,
+      "loss": 0.2256,
+      "step": 42500
+    },
+    {
+      "epoch": 0.7166666666666667,
+      "grad_norm": 1.5464119911193848,
+      "learning_rate": 5.8852891552218674e-05,
+      "loss": 0.2296,
+      "step": 43000
+    },
+    {
+      "epoch": 0.725,
+      "grad_norm": 1.361809492111206,
+      "learning_rate": 5.567117508350033e-05,
+      "loss": 0.225,
+      "step": 43500
+    },
+    {
+      "epoch": 0.7333333333333333,
+      "grad_norm": 1.100778341293335,
+      "learning_rate": 5.2558167332855614e-05,
+      "loss": 0.2171,
+      "step": 44000
+    },
+    {
+      "epoch": 0.7416666666666667,
+      "grad_norm": 0.8792382478713989,
+      "learning_rate": 4.951613580199569e-05,
+      "loss": 0.2253,
+      "step": 44500
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.008772611618042,
+      "learning_rate": 4.65472962938525e-05,
+      "loss": 0.2185,
+      "step": 45000
+    },
+    {
+      "epoch": 0.7583333333333333,
+      "grad_norm": 1.769035816192627,
+      "learning_rate": 4.365381129859762e-05,
+      "loss": 0.2323,
+      "step": 45500
+    },
+    {
+      "epoch": 0.7666666666666667,
+      "grad_norm": 1.0002933740615845,
+      "learning_rate": 4.0837788418493234e-05,
+      "loss": 0.2258,
+      "step": 46000
+    },
+    {
+      "epoch": 0.775,
+      "grad_norm": 0.6770097613334656,
+      "learning_rate": 3.810127883272383e-05,
+      "loss": 0.2237,
+      "step": 46500
+    },
+    {
+      "epoch": 0.7833333333333333,
+      "grad_norm": 1.1669566631317139,
+      "learning_rate": 3.544627580332564e-05,
+      "loss": 0.2106,
+      "step": 47000
+    },
+    {
+      "epoch": 0.7916666666666666,
+      "grad_norm": 0.6947309374809265,
+      "learning_rate": 3.2874713223303216e-05,
+      "loss": 0.2198,
+      "step": 47500
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.6739930510520935,
+      "learning_rate": 3.038846420798978e-05,
+      "loss": 0.2183,
+      "step": 48000
+    },
+    {
+      "epoch": 0.8083333333333333,
+      "grad_norm": 1.489799976348877,
+      "learning_rate": 2.7989339730678078e-05,
+      "loss": 0.2189,
+      "step": 48500
+    },
+    {
+      "epoch": 0.8166666666666667,
+      "grad_norm": 0.8515140414237976,
+      "learning_rate": 2.5679087303514894e-05,
+      "loss": 0.2244,
+      "step": 49000
+    },
+    {
+      "epoch": 0.825,
+      "grad_norm": 0.49479931592941284,
+      "learning_rate": 2.345938970462068e-05,
+      "loss": 0.2369,
+      "step": 49500
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "grad_norm": 0.8055633902549744,
+      "learning_rate": 2.1331863752360973e-05,
+      "loss": 0.2217,
+      "step": 50000
+    },
+    {
+      "epoch": 0.8416666666666667,
+      "grad_norm": 2.17622447013855,
+      "learning_rate": 1.9298059127662975e-05,
+      "loss": 0.2087,
+      "step": 50500
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.8723386526107788,
+      "learning_rate": 1.7359457245234156e-05,
+      "loss": 0.223,
+      "step": 51000
+    },
+    {
+      "epoch": 0.8583333333333333,
+      "grad_norm": 1.4284367561340332,
+      "learning_rate": 1.5517470174506246e-05,
+      "loss": 0.2201,
+      "step": 51500
+    },
+    {
+      "epoch": 0.8666666666666667,
+      "grad_norm": 1.5737581253051758,
+      "learning_rate": 1.3773439611089832e-05,
+      "loss": 0.224,
+      "step": 52000
+    },
+    {
+      "epoch": 0.875,
+      "grad_norm": 1.9173991680145264,
+      "learning_rate": 1.212863589948883e-05,
+      "loss": 0.2316,
+      "step": 52500
+    },
+    {
+      "epoch": 0.8833333333333333,
+      "grad_norm": 0.9126625657081604,
+      "learning_rate": 1.058425710778692e-05,
+      "loss": 0.2099,
+      "step": 53000
+    },
+    {
+      "epoch": 0.8916666666666667,
+      "grad_norm": 0.9519773721694946,
+      "learning_rate": 9.141428154979886e-06,
+      "loss": 0.2231,
+      "step": 53500
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.7559552192687988,
+      "learning_rate": 7.801199991589335e-06,
+      "loss": 0.2203,
+      "step": 54000
+    },
+    {
+      "epoch": 0.9083333333333333,
+      "grad_norm": 1.3599870204925537,
+      "learning_rate": 6.564548834154759e-06,
+      "loss": 0.2224,
+      "step": 54500
+    },
+    {
+      "epoch": 0.9166666666666666,
+      "grad_norm": 1.5323917865753174,
+      "learning_rate": 5.432375454161458e-06,
+      "loss": 0.2131,
+      "step": 55000
+    },
+    {
+      "epoch": 0.925,
+      "grad_norm": 0.8116886615753174,
+      "learning_rate": 4.405504521922571e-06,
+      "loss": 0.2122,
+      "step": 55500
+    },
+    {
+      "epoch": 0.9333333333333333,
+      "grad_norm": 0.9112006425857544,
+      "learning_rate": 3.484684005892502e-06,
+      "loss": 0.2184,
+      "step": 56000
+    },
+    {
+      "epoch": 0.9416666666666667,
+      "grad_norm": 0.6394028663635254,
+      "learning_rate": 2.6705846278499532e-06,
+      "loss": 0.238,
+      "step": 56500
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.9545285701751709,
+      "learning_rate": 1.9637993743470525e-06,
+      "loss": 0.226,
+      "step": 57000
+    },
+    {
+      "epoch": 0.9583333333333334,
+      "grad_norm": 0.7243571877479553,
+      "learning_rate": 1.36484306478033e-06,
+      "loss": 0.2236,
+      "step": 57500
+    },
+    {
+      "epoch": 0.9666666666666667,
+      "grad_norm": 0.8191093802452087,
+      "learning_rate": 8.741519763985627e-07,
+      "loss": 0.217,
+      "step": 58000
+    },
+    {
+      "epoch": 0.975,
+      "grad_norm": 1.4871535301208496,
+      "learning_rate": 4.920835265201595e-07,
+      "loss": 0.2176,
+      "step": 58500
+    },
+    {
+      "epoch": 0.9833333333333333,
+      "grad_norm": 0.9401077628135681,
+      "learning_rate": 2.1891601219199417e-07,
+      "loss": 0.2267,
+      "step": 59000
+    },
+    {
+      "epoch": 0.9916666666666667,
+      "grad_norm": 1.3500021696090698,
+      "learning_rate": 5.484840747892461e-08,
+      "loss": 0.2189,
+      "step": 59500
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.5481218695640564,
+      "learning_rate": 2.1853199250188025e-13,
+      "loss": 0.2222,
+      "step": 60000
+    },
     {
       "epoch": 1.0,
+      "step": 60000,
+      "total_flos": 6.731455156851671e+17,
+      "train_loss": 0.2601620175679525,
+      "train_runtime": 10691.1388,
+      "train_samples_per_second": 5.612,
+      "train_steps_per_second": 5.612
     }
   ],
   "logging_steps": 500,
+  "max_steps": 60000,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
+  "save_steps": 10000,
   "stateful_callbacks": {
     "TrainerControl": {
       "args": {
       "attributes": {}
     }
   },
+  "total_flos": 6.731455156851671e+17,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null