Upload best model checkpoint (step 93, loss 0.9836)

Browse files

Files changed (3) hide show

generation_config.json +8 -0
loss.txt +1 -0
trainer_state.json +701 -0

generation_config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "bos_token_id": 1,
+  "do_sample": true,
+  "eos_token_id": 2,
+  "max_length": 2048,
+  "pad_token_id": 0,
+  "transformers_version": "4.51.3"
+}

loss.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ 93,0.9836028218269348

trainer_state.json ADDED Viewed

	@@ -0,0 +1,701 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.0536254865215511,
+  "eval_steps": 1734,
+  "global_step": 93,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0005766181346403345,
+      "grad_norm": 2.40625,
+      "learning_rate": 0.0,
+      "loss": 1.2938,
+      "step": 1
+    },
+    {
+      "epoch": 0.0005766181346403345,
+      "eval_loss": 1.3029676675796509,
+      "eval_runtime": 1.482,
+      "eval_samples_per_second": 125.506,
+      "eval_steps_per_second": 2.699,
+      "step": 1
+    },
+    {
+      "epoch": 0.001153236269280669,
+      "grad_norm": 2.59375,
+      "learning_rate": 1e-05,
+      "loss": 1.2719,
+      "step": 2
+    },
+    {
+      "epoch": 0.0017298544039210033,
+      "grad_norm": 1.7265625,
+      "learning_rate": 2e-05,
+      "loss": 1.2501,
+      "step": 3
+    },
+    {
+      "epoch": 0.002306472538561338,
+      "grad_norm": 1.3359375,
+      "learning_rate": 3e-05,
+      "loss": 1.1749,
+      "step": 4
+    },
+    {
+      "epoch": 0.0028830906732016724,
+      "grad_norm": 1.203125,
+      "learning_rate": 4e-05,
+      "loss": 1.2028,
+      "step": 5
+    },
+    {
+      "epoch": 0.0034597088078420065,
+      "grad_norm": 1.1953125,
+      "learning_rate": 5e-05,
+      "loss": 1.1511,
+      "step": 6
+    },
+    {
+      "epoch": 0.004036326942482341,
+      "grad_norm": 1.1875,
+      "learning_rate": 6e-05,
+      "loss": 1.1037,
+      "step": 7
+    },
+    {
+      "epoch": 0.004612945077122676,
+      "grad_norm": 1.0390625,
+      "learning_rate": 7e-05,
+      "loss": 1.0619,
+      "step": 8
+    },
+    {
+      "epoch": 0.00518956321176301,
+      "grad_norm": 0.86328125,
+      "learning_rate": 8e-05,
+      "loss": 1.1328,
+      "step": 9
+    },
+    {
+      "epoch": 0.005766181346403345,
+      "grad_norm": 0.90625,
+      "learning_rate": 9e-05,
+      "loss": 1.0916,
+      "step": 10
+    },
+    {
+      "epoch": 0.0063427994810436785,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0001,
+      "loss": 1.0373,
+      "step": 11
+    },
+    {
+      "epoch": 0.006919417615684013,
+      "grad_norm": 0.9140625,
+      "learning_rate": 9.999999359279803e-05,
+      "loss": 1.0594,
+      "step": 12
+    },
+    {
+      "epoch": 0.007496035750324348,
+      "grad_norm": 0.7734375,
+      "learning_rate": 9.999997437119453e-05,
+      "loss": 1.0309,
+      "step": 13
+    },
+    {
+      "epoch": 0.008072653884964682,
+      "grad_norm": 0.8046875,
+      "learning_rate": 9.999994233519648e-05,
+      "loss": 1.0731,
+      "step": 14
+    },
+    {
+      "epoch": 0.008649272019605017,
+      "grad_norm": 0.79296875,
+      "learning_rate": 9.999989748481562e-05,
+      "loss": 1.0455,
+      "step": 15
+    },
+    {
+      "epoch": 0.009225890154245351,
+      "grad_norm": 0.7421875,
+      "learning_rate": 9.99998398200684e-05,
+      "loss": 1.0654,
+      "step": 16
+    },
+    {
+      "epoch": 0.009802508288885686,
+      "grad_norm": 0.7109375,
+      "learning_rate": 9.99997693409759e-05,
+      "loss": 1.0141,
+      "step": 17
+    },
+    {
+      "epoch": 0.01037912642352602,
+      "grad_norm": 0.71875,
+      "learning_rate": 9.999968604756393e-05,
+      "loss": 1.0353,
+      "step": 18
+    },
+    {
+      "epoch": 0.010955744558166355,
+      "grad_norm": 0.7109375,
+      "learning_rate": 9.999958993986302e-05,
+      "loss": 1.0461,
+      "step": 19
+    },
+    {
+      "epoch": 0.01153236269280669,
+      "grad_norm": 0.69140625,
+      "learning_rate": 9.999948101790833e-05,
+      "loss": 1.0137,
+      "step": 20
+    },
+    {
+      "epoch": 0.012108980827447022,
+      "grad_norm": 0.71484375,
+      "learning_rate": 9.999935928173971e-05,
+      "loss": 1.0216,
+      "step": 21
+    },
+    {
+      "epoch": 0.012685598962087357,
+      "grad_norm": 0.6953125,
+      "learning_rate": 9.999922473140177e-05,
+      "loss": 1.0957,
+      "step": 22
+    },
+    {
+      "epoch": 0.013262217096727692,
+      "grad_norm": 0.73046875,
+      "learning_rate": 9.999907736694379e-05,
+      "loss": 1.0038,
+      "step": 23
+    },
+    {
+      "epoch": 0.013838835231368026,
+      "grad_norm": 0.67578125,
+      "learning_rate": 9.99989171884197e-05,
+      "loss": 1.0123,
+      "step": 24
+    },
+    {
+      "epoch": 0.01441545336600836,
+      "grad_norm": 0.68359375,
+      "learning_rate": 9.99987441958881e-05,
+      "loss": 0.9884,
+      "step": 25
+    },
+    {
+      "epoch": 0.014992071500648695,
+      "grad_norm": 0.6953125,
+      "learning_rate": 9.999855838941241e-05,
+      "loss": 0.9824,
+      "step": 26
+    },
+    {
+      "epoch": 0.01556868963528903,
+      "grad_norm": 0.6875,
+      "learning_rate": 9.99983597690606e-05,
+      "loss": 1.0282,
+      "step": 27
+    },
+    {
+      "epoch": 0.016145307769929364,
+      "grad_norm": 0.6875,
+      "learning_rate": 9.999814833490542e-05,
+      "loss": 1.0501,
+      "step": 28
+    },
+    {
+      "epoch": 0.0167219259045697,
+      "grad_norm": 0.671875,
+      "learning_rate": 9.999792408702424e-05,
+      "loss": 1.0105,
+      "step": 29
+    },
+    {
+      "epoch": 0.017298544039210034,
+      "grad_norm": 0.74609375,
+      "learning_rate": 9.999768702549923e-05,
+      "loss": 1.0676,
+      "step": 30
+    },
+    {
+      "epoch": 0.017875162173850368,
+      "grad_norm": 0.6875,
+      "learning_rate": 9.999743715041714e-05,
+      "loss": 1.021,
+      "step": 31
+    },
+    {
+      "epoch": 0.018451780308490703,
+      "grad_norm": 0.67578125,
+      "learning_rate": 9.999717446186944e-05,
+      "loss": 0.9625,
+      "step": 32
+    },
+    {
+      "epoch": 0.019028398443131037,
+      "grad_norm": 0.71484375,
+      "learning_rate": 9.999689895995237e-05,
+      "loss": 1.0373,
+      "step": 33
+    },
+    {
+      "epoch": 0.019605016577771372,
+      "grad_norm": 0.68359375,
+      "learning_rate": 9.999661064476674e-05,
+      "loss": 1.0124,
+      "step": 34
+    },
+    {
+      "epoch": 0.020181634712411706,
+      "grad_norm": 0.66796875,
+      "learning_rate": 9.999630951641812e-05,
+      "loss": 0.9176,
+      "step": 35
+    },
+    {
+      "epoch": 0.02075825284705204,
+      "grad_norm": 0.68359375,
+      "learning_rate": 9.999599557501678e-05,
+      "loss": 1.0163,
+      "step": 36
+    },
+    {
+      "epoch": 0.021334870981692376,
+      "grad_norm": 0.68359375,
+      "learning_rate": 9.999566882067764e-05,
+      "loss": 1.0296,
+      "step": 37
+    },
+    {
+      "epoch": 0.02191148911633271,
+      "grad_norm": 0.671875,
+      "learning_rate": 9.999532925352036e-05,
+      "loss": 1.0034,
+      "step": 38
+    },
+    {
+      "epoch": 0.022488107250973045,
+      "grad_norm": 0.640625,
+      "learning_rate": 9.999497687366923e-05,
+      "loss": 0.9524,
+      "step": 39
+    },
+    {
+      "epoch": 0.02306472538561338,
+      "grad_norm": 0.6875,
+      "learning_rate": 9.999461168125327e-05,
+      "loss": 0.9762,
+      "step": 40
+    },
+    {
+      "epoch": 0.02364134352025371,
+      "grad_norm": 0.66015625,
+      "learning_rate": 9.999423367640626e-05,
+      "loss": 0.9876,
+      "step": 41
+    },
+    {
+      "epoch": 0.024217961654894045,
+      "grad_norm": 0.66796875,
+      "learning_rate": 9.99938428592665e-05,
+      "loss": 1.0121,
+      "step": 42
+    },
+    {
+      "epoch": 0.02479457978953438,
+      "grad_norm": 0.64453125,
+      "learning_rate": 9.99934392299771e-05,
+      "loss": 0.9703,
+      "step": 43
+    },
+    {
+      "epoch": 0.025371197924174714,
+      "grad_norm": 0.64453125,
+      "learning_rate": 9.999302278868587e-05,
+      "loss": 0.9806,
+      "step": 44
+    },
+    {
+      "epoch": 0.02594781605881505,
+      "grad_norm": 0.65625,
+      "learning_rate": 9.999259353554528e-05,
+      "loss": 1.0016,
+      "step": 45
+    },
+    {
+      "epoch": 0.026524434193455383,
+      "grad_norm": 0.6796875,
+      "learning_rate": 9.999215147071245e-05,
+      "loss": 1.0187,
+      "step": 46
+    },
+    {
+      "epoch": 0.027101052328095718,
+      "grad_norm": 0.6484375,
+      "learning_rate": 9.999169659434929e-05,
+      "loss": 0.9999,
+      "step": 47
+    },
+    {
+      "epoch": 0.027677670462736052,
+      "grad_norm": 0.6484375,
+      "learning_rate": 9.99912289066223e-05,
+      "loss": 0.994,
+      "step": 48
+    },
+    {
+      "epoch": 0.028254288597376387,
+      "grad_norm": 0.68359375,
+      "learning_rate": 9.999074840770268e-05,
+      "loss": 0.9996,
+      "step": 49
+    },
+    {
+      "epoch": 0.02883090673201672,
+      "grad_norm": 0.640625,
+      "learning_rate": 9.999025509776645e-05,
+      "loss": 0.9744,
+      "step": 50
+    },
+    {
+      "epoch": 0.029407524866657056,
+      "grad_norm": 0.6328125,
+      "learning_rate": 9.998974897699413e-05,
+      "loss": 0.9966,
+      "step": 51
+    },
+    {
+      "epoch": 0.02998414300129739,
+      "grad_norm": 0.6484375,
+      "learning_rate": 9.99892300455711e-05,
+      "loss": 0.9773,
+      "step": 52
+    },
+    {
+      "epoch": 0.030560761135937725,
+      "grad_norm": 0.6640625,
+      "learning_rate": 9.998869830368729e-05,
+      "loss": 0.993,
+      "step": 53
+    },
+    {
+      "epoch": 0.03113737927057806,
+      "grad_norm": 0.62890625,
+      "learning_rate": 9.99881537515374e-05,
+      "loss": 0.9538,
+      "step": 54
+    },
+    {
+      "epoch": 0.031713997405218394,
+      "grad_norm": 0.671875,
+      "learning_rate": 9.998759638932085e-05,
+      "loss": 0.9988,
+      "step": 55
+    },
+    {
+      "epoch": 0.03229061553985873,
+      "grad_norm": 0.65234375,
+      "learning_rate": 9.998702621724162e-05,
+      "loss": 1.0345,
+      "step": 56
+    },
+    {
+      "epoch": 0.03286723367449906,
+      "grad_norm": 0.65234375,
+      "learning_rate": 9.998644323550856e-05,
+      "loss": 1.0087,
+      "step": 57
+    },
+    {
+      "epoch": 0.0334438518091394,
+      "grad_norm": 0.65234375,
+      "learning_rate": 9.998584744433506e-05,
+      "loss": 0.9365,
+      "step": 58
+    },
+    {
+      "epoch": 0.03402046994377973,
+      "grad_norm": 0.63671875,
+      "learning_rate": 9.998523884393927e-05,
+      "loss": 0.9741,
+      "step": 59
+    },
+    {
+      "epoch": 0.03459708807842007,
+      "grad_norm": 0.6171875,
+      "learning_rate": 9.998461743454399e-05,
+      "loss": 0.9498,
+      "step": 60
+    },
+    {
+      "epoch": 0.0351737062130604,
+      "grad_norm": 0.703125,
+      "learning_rate": 9.998398321637676e-05,
+      "loss": 1.0063,
+      "step": 61
+    },
+    {
+      "epoch": 0.035750324347700736,
+      "grad_norm": 0.66015625,
+      "learning_rate": 9.998333618966978e-05,
+      "loss": 0.9967,
+      "step": 62
+    },
+    {
+      "epoch": 0.03632694248234107,
+      "grad_norm": 0.62890625,
+      "learning_rate": 9.998267635465992e-05,
+      "loss": 0.9099,
+      "step": 63
+    },
+    {
+      "epoch": 0.036903560616981405,
+      "grad_norm": 0.6640625,
+      "learning_rate": 9.998200371158881e-05,
+      "loss": 0.9456,
+      "step": 64
+    },
+    {
+      "epoch": 0.03748017875162174,
+      "grad_norm": 0.64453125,
+      "learning_rate": 9.998131826070267e-05,
+      "loss": 1.0032,
+      "step": 65
+    },
+    {
+      "epoch": 0.038056796886262075,
+      "grad_norm": 0.6484375,
+      "learning_rate": 9.99806200022525e-05,
+      "loss": 0.9238,
+      "step": 66
+    },
+    {
+      "epoch": 0.03863341502090241,
+      "grad_norm": 0.69921875,
+      "learning_rate": 9.997990893649394e-05,
+      "loss": 1.0059,
+      "step": 67
+    },
+    {
+      "epoch": 0.039210033155542744,
+      "grad_norm": 0.625,
+      "learning_rate": 9.997918506368732e-05,
+      "loss": 0.9957,
+      "step": 68
+    },
+    {
+      "epoch": 0.03978665129018308,
+      "grad_norm": 0.64453125,
+      "learning_rate": 9.997844838409767e-05,
+      "loss": 0.9871,
+      "step": 69
+    },
+    {
+      "epoch": 0.04036326942482341,
+      "grad_norm": 0.67578125,
+      "learning_rate": 9.99776988979947e-05,
+      "loss": 0.9906,
+      "step": 70
+    },
+    {
+      "epoch": 0.04093988755946375,
+      "grad_norm": 0.61328125,
+      "learning_rate": 9.997693660565284e-05,
+      "loss": 0.9593,
+      "step": 71
+    },
+    {
+      "epoch": 0.04151650569410408,
+      "grad_norm": 0.6640625,
+      "learning_rate": 9.997616150735119e-05,
+      "loss": 0.943,
+      "step": 72
+    },
+    {
+      "epoch": 0.042093123828744416,
+      "grad_norm": 0.64453125,
+      "learning_rate": 9.997537360337348e-05,
+      "loss": 0.988,
+      "step": 73
+    },
+    {
+      "epoch": 0.04266974196338475,
+      "grad_norm": 0.64453125,
+      "learning_rate": 9.997457289400825e-05,
+      "loss": 0.9691,
+      "step": 74
+    },
+    {
+      "epoch": 0.043246360098025086,
+      "grad_norm": 0.6640625,
+      "learning_rate": 9.997375937954861e-05,
+      "loss": 0.9682,
+      "step": 75
+    },
+    {
+      "epoch": 0.04382297823266542,
+      "grad_norm": 0.64453125,
+      "learning_rate": 9.997293306029243e-05,
+      "loss": 0.9736,
+      "step": 76
+    },
+    {
+      "epoch": 0.044399596367305755,
+      "grad_norm": 0.64453125,
+      "learning_rate": 9.997209393654223e-05,
+      "loss": 0.9804,
+      "step": 77
+    },
+    {
+      "epoch": 0.04497621450194609,
+      "grad_norm": 0.63671875,
+      "learning_rate": 9.997124200860528e-05,
+      "loss": 0.9651,
+      "step": 78
+    },
+    {
+      "epoch": 0.045552832636586424,
+      "grad_norm": 0.62109375,
+      "learning_rate": 9.997037727679343e-05,
+      "loss": 0.943,
+      "step": 79
+    },
+    {
+      "epoch": 0.04612945077122676,
+      "grad_norm": 0.66015625,
+      "learning_rate": 9.996949974142334e-05,
+      "loss": 0.9878,
+      "step": 80
+    },
+    {
+      "epoch": 0.046706068905867086,
+      "grad_norm": 0.66796875,
+      "learning_rate": 9.996860940281624e-05,
+      "loss": 0.956,
+      "step": 81
+    },
+    {
+      "epoch": 0.04728268704050742,
+      "grad_norm": 0.6484375,
+      "learning_rate": 9.996770626129817e-05,
+      "loss": 0.9741,
+      "step": 82
+    },
+    {
+      "epoch": 0.047859305175147755,
+      "grad_norm": 0.69140625,
+      "learning_rate": 9.996679031719973e-05,
+      "loss": 0.9553,
+      "step": 83
+    },
+    {
+      "epoch": 0.04843592330978809,
+      "grad_norm": 0.66015625,
+      "learning_rate": 9.996586157085632e-05,
+      "loss": 0.9911,
+      "step": 84
+    },
+    {
+      "epoch": 0.049012541444428424,
+      "grad_norm": 0.66015625,
+      "learning_rate": 9.996492002260794e-05,
+      "loss": 0.97,
+      "step": 85
+    },
+    {
+      "epoch": 0.04958915957906876,
+      "grad_norm": 0.66796875,
+      "learning_rate": 9.996396567279933e-05,
+      "loss": 0.9272,
+      "step": 86
+    },
+    {
+      "epoch": 0.050165777713709093,
+      "grad_norm": 0.64453125,
+      "learning_rate": 9.996299852177992e-05,
+      "loss": 1.0005,
+      "step": 87
+    },
+    {
+      "epoch": 0.05074239584834943,
+      "grad_norm": 0.6484375,
+      "learning_rate": 9.996201856990381e-05,
+      "loss": 0.9845,
+      "step": 88
+    },
+    {
+      "epoch": 0.05131901398298976,
+      "grad_norm": 0.6796875,
+      "learning_rate": 9.996102581752976e-05,
+      "loss": 1.0048,
+      "step": 89
+    },
+    {
+      "epoch": 0.0518956321176301,
+      "grad_norm": 0.61328125,
+      "learning_rate": 9.996002026502125e-05,
+      "loss": 0.9377,
+      "step": 90
+    },
+    {
+      "epoch": 0.05247225025227043,
+      "grad_norm": 0.640625,
+      "learning_rate": 9.995900191274643e-05,
+      "loss": 0.9595,
+      "step": 91
+    },
+    {
+      "epoch": 0.053048868386910766,
+      "grad_norm": 0.67578125,
+      "learning_rate": 9.995797076107818e-05,
+      "loss": 0.9824,
+      "step": 92
+    },
+    {
+      "epoch": 0.0536254865215511,
+      "grad_norm": 0.61328125,
+      "learning_rate": 9.995692681039396e-05,
+      "loss": 0.9639,
+      "step": 93
+    },
+    {
+      "epoch": 0.0536254865215511,
+      "eval_loss": 0.9836028218269348,
+      "eval_runtime": 1.427,
+      "eval_samples_per_second": 130.347,
+      "eval_steps_per_second": 2.803,
+      "step": 93
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 5202,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 1734,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.4185419194027213e+17,
+  "train_batch_size": 60,
+  "trial_name": null,
+  "trial_params": null
+}