best model

Files changed:
- checkpoint-264/optimizer.pt +3 -0
- checkpoint-264/pytorch_model.bin +3 -0
- checkpoint-264/rng_state.pth +3 -0
- checkpoint-264/scaler.pt +3 -0
- checkpoint-264/scheduler.pt +3 -0
- checkpoint-264/trainer_state.json +1882 -0
- checkpoint-264/training_args.bin +3 -0
- checkpoint-33/optimizer.pt +3 -0
- checkpoint-33/pytorch_model.bin +3 -0
- checkpoint-33/rng_state.pth +3 -0
- checkpoint-33/scaler.pt +3 -0
- checkpoint-33/scheduler.pt +3 -0
- checkpoint-33/trainer_state.json +265 -0
- checkpoint-33/training_args.bin +3 -0
- checkpoint-66/optimizer.pt +3 -0
- checkpoint-66/pytorch_model.bin +3 -0
- checkpoint-66/rng_state.pth +3 -0
- checkpoint-66/scaler.pt +3 -0
- checkpoint-66/scheduler.pt +3 -0
- checkpoint-66/trainer_state.json +496 -0
- checkpoint-66/training_args.bin +3 -0
- merges.txt +0 -0
- pytorch_model.bin +3 -0
- runs/Jul16_13-42-47_cbb1763e7b33/events.out.tfevents.1752673369.cbb1763e7b33.87.0 +3 -0
- runs/Jul16_13-58-22_cbb1763e7b33/events.out.tfevents.1752674303.cbb1763e7b33.87.1 +3 -0
- runs/Jul16_14-02-03_cbb1763e7b33/events.out.tfevents.1752674524.cbb1763e7b33.87.2 +3 -0
- runs/Jul16_14-03-33_cbb1763e7b33/events.out.tfevents.1752674613.cbb1763e7b33.87.3 +3 -0
- runs/Jul16_14-04-31_cbb1763e7b33/events.out.tfevents.1752674671.cbb1763e7b33.87.4 +3 -0
- runs/Jul16_14-04-31_cbb1763e7b33/events.out.tfevents.1752674869.cbb1763e7b33.87.5 +3 -0
- runs/Jul16_14-17-51_cbb1763e7b33/events.out.tfevents.1752675472.cbb1763e7b33.87.6 +3 -0
- runs/Jul16_14-19-15_cbb1763e7b33/events.out.tfevents.1752675556.cbb1763e7b33.87.7 +3 -0
- runs/Jul16_14-28-26_cbb1763e7b33/events.out.tfevents.1752676107.cbb1763e7b33.87.8 +3 -0
- runs/Jul16_14-28-51_cbb1763e7b33/events.out.tfevents.1752676132.cbb1763e7b33.87.9 +3 -0
- runs/Jul16_14-29-42_cbb1763e7b33/events.out.tfevents.1752676183.cbb1763e7b33.87.10 +3 -0
- special_tokens_map.json +43 -0
- tokenizer.json +0 -0
- tokenizer_config.json +169 -0
- training_args.bin +3 -0
- vocab.json +0 -0
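
This commit uploads the final model weights and tokenizer files alongside three intermediate checkpoints (steps 33, 66, and 264) and TensorBoard event files from several runs. The `vocab.json`/`merges.txt` pair suggests a byte-level BPE tokenizer (GPT-2/RoBERTa style). A minimal loading sketch, assuming a local clone of the repo and that a `config.json` already exists from an earlier commit; the task head is not identifiable from the file list, so the generic `AutoModel` class is a stand-in for the real architecture:

```python
# Sketch, not verified against this repo: model class and path are assumptions.
from transformers import AutoModel, AutoTokenizer

repo_path = "."  # placeholder: path to a local clone of this repository
tokenizer = AutoTokenizer.from_pretrained(repo_path)
model = AutoModel.from_pretrained(repo_path)
print(sum(p.numel() for p in model.parameters()), "parameters")
```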
checkpoint-264/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7e2841afef85f36085cac335d182ad1eafd7b960d2d6a8cb428bdf27cd810622
+size 1077684090
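
Each binary above is committed as a Git LFS pointer: a three-line text stub recording the spec version, the SHA-256 of the real payload, and its size in bytes. Here the optimizer state is about 1.08 GB, almost exactly twice the 539 MB of model weights below, which is consistent with an Adam-style optimizer keeping two moment buffers per parameter. A small sketch for checking a downloaded artifact against the hash in its pointer (the path is illustrative):

```python
# Sketch: verify a downloaded LFS artifact against the sha256 in its pointer.
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream the file so gigabyte-scale blobs never sit fully in memory."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk_size):
            digest.update(block)
    return digest.hexdigest()

expected = "7e2841afef85f36085cac335d182ad1eafd7b960d2d6a8cb428bdf27cd810622"
assert sha256_of("checkpoint-264/optimizer.pt") == expected
```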
checkpoint-264/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b630ce86e5360d08d3a55fda3f77c14874ecf351a2668a36b4ce68cdd918b9fa
+size 538847386
checkpoint-264/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a5ef0b078293dd1016d6eef653684df637e72f2211f814e7e581a8612ddb4255
+size 14244
checkpoint-264/scaler.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b30172cf14f5dbe00280d63e36224a9f28dc7a0e8b38a74ceb5eb284e84da363
+size 988
checkpoint-264/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f6c719f59913a6ed280e1e82f42608192483a2429f020d464d9fd0ed07d97121
+size 1064
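
Together these files form the standard `transformers` Trainer checkpoint layout: `optimizer.pt` (optimizer state), `scheduler.pt` (learning-rate scheduler state), `scaler.pt` (the AMP gradient scaler, whose presence implies fp16 mixed-precision training), `rng_state.pth` (Python/NumPy/torch RNG states for reproducible resumption), plus `trainer_state.json` and `training_args.bin`. Each `.pt`/`.pth` file is an ordinary `torch.save` artifact and can be inspected directly; a sketch, with key names reflecting the usual layout rather than anything verified here:

```python
import torch

# Inspect the serialized optimizer state; map_location avoids needing a GPU.
opt_state = torch.load("checkpoint-264/optimizer.pt", map_location="cpu")
print(opt_state.keys())  # typically dict_keys(['state', 'param_groups'])

# Resuming is then a one-liner, assuming a Trainer rebuilt with the same
# model, data, and arguments as the original run:
# trainer.train(resume_from_checkpoint="checkpoint-264")
```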
checkpoint-264/trainer_state.json ADDED
@@ -0,0 +1,1882 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 264,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {"epoch": 0.011428571428571429, "grad_norm": 118185.3203125, "learning_rate": 0.0, "loss": 0.6604, "step": 1},
+    {"epoch": 0.022857142857142857, "grad_norm": 159732.65625, "learning_rate": 1.0000000000000001e-07, "loss": 0.6194, "step": 2},
+    {"epoch": 0.03428571428571429, "grad_norm": 345344.875, "learning_rate": 2.0000000000000002e-07, "loss": 0.6201, "step": 3},
+    {"epoch": 0.045714285714285714, "grad_norm": 219192.0, "learning_rate": 3.0000000000000004e-07, "loss": 0.6499, "step": 4},
+    {"epoch": 0.05714285714285714, "grad_norm": 297524.125, "learning_rate": 4.0000000000000003e-07, "loss": 0.7443, "step": 5},
+    {"epoch": 0.06857142857142857, "grad_norm": 411552.5625, "learning_rate": 5.000000000000001e-07, "loss": 0.7532, "step": 6},
+    {"epoch": 0.08, "grad_norm": 270805.0, "learning_rate": 6.000000000000001e-07, "loss": 0.7017, "step": 7},
+    {"epoch": 0.09142857142857143, "grad_norm": 361510.84375, "learning_rate": 7.000000000000001e-07, "loss": 0.8146, "step": 8},
+    {"epoch": 0.10285714285714286, "grad_norm": 174881.5625, "learning_rate": 8.000000000000001e-07, "loss": 0.7315, "step": 9},
+    {"epoch": 0.11428571428571428, "grad_norm": 518314.0, "learning_rate": 9e-07, "loss": 0.7979, "step": 10},
+    {"epoch": 0.12571428571428572, "grad_norm": 602806.625, "learning_rate": 1.0000000000000002e-06, "loss": 0.8385, "step": 11},
+    {"epoch": 0.13714285714285715, "grad_norm": 124327.703125, "learning_rate": 1.1e-06, "loss": 0.6674, "step": 12},
+    {"epoch": 0.14857142857142858, "grad_norm": 280730.6875, "learning_rate": 1.2000000000000002e-06, "loss": 0.6828, "step": 13},
+    {"epoch": 0.16, "grad_norm": 360234.5625, "learning_rate": 1.3e-06, "loss": 0.7472, "step": 14},
+    {"epoch": 0.17142857142857143, "grad_norm": 254333.03125, "learning_rate": 1.4000000000000001e-06, "loss": 0.6993, "step": 15},
+    {"epoch": 0.18285714285714286, "grad_norm": 185385.34375, "learning_rate": 1.5e-06, "loss": 0.735, "step": 16},
+    {"epoch": 0.19428571428571428, "grad_norm": 529075.3125, "learning_rate": 1.6000000000000001e-06, "loss": 0.7743, "step": 17},
+    {"epoch": 0.2057142857142857, "grad_norm": 278783.3125, "learning_rate": 1.7000000000000002e-06, "loss": 0.7857, "step": 18},
+    {"epoch": 0.21714285714285714, "grad_norm": 278557.0, "learning_rate": 1.8e-06, "loss": 0.7316, "step": 19},
+    {"epoch": 0.22857142857142856, "grad_norm": 214512.203125, "learning_rate": 1.9e-06, "loss": 0.7191, "step": 20},
+    {"epoch": 0.24, "grad_norm": 200249.328125, "learning_rate": 2.0000000000000003e-06, "loss": 0.6521, "step": 21},
+    {"epoch": 0.25142857142857145, "grad_norm": 203009.359375, "learning_rate": 2.1000000000000002e-06, "loss": 0.7226, "step": 22},
+    {"epoch": 0.26285714285714284, "grad_norm": 271878.9375, "learning_rate": 2.2e-06, "loss": 0.6584, "step": 23},
+    {"epoch": 0.2742857142857143, "grad_norm": 113301.359375, "learning_rate": 2.3e-06, "loss": 0.7122, "step": 24},
+    {"epoch": 0.2857142857142857, "grad_norm": 392648.6875, "learning_rate": 2.4000000000000003e-06, "loss": 0.7044, "step": 25},
+    {"epoch": 0.29714285714285715, "grad_norm": 211160.40625, "learning_rate": 2.5e-06, "loss": 0.747, "step": 26},
+    {"epoch": 0.30857142857142855, "grad_norm": 344234.03125, "learning_rate": 2.6e-06, "loss": 0.7459, "step": 27},
+    {"epoch": 0.32, "grad_norm": 259680.78125, "learning_rate": 2.7e-06, "loss": 0.6424, "step": 28},
+    {"epoch": 0.3314285714285714, "grad_norm": 140965.921875, "learning_rate": 2.8000000000000003e-06, "loss": 0.7401, "step": 29},
+    {"epoch": 0.34285714285714286, "grad_norm": 374176.96875, "learning_rate": 2.9e-06, "loss": 0.6804, "step": 30},
+    {"epoch": 0.35428571428571426, "grad_norm": 170955.984375, "learning_rate": 3e-06, "loss": 0.7011, "step": 31},
+    {"epoch": 0.3657142857142857, "grad_norm": 211658.421875, "learning_rate": 3.1e-06, "loss": 0.6764, "step": 32},
+    {"epoch": 0.37714285714285717, "grad_norm": 304990.25, "learning_rate": 3.2000000000000003e-06, "loss": 0.7528, "step": 33},
+    {"epoch": 0.38857142857142857, "grad_norm": 133403.875, "learning_rate": 3.3e-06, "loss": 0.7007, "step": 34},
+    {"epoch": 0.4, "grad_norm": 232671.234375, "learning_rate": 3.4000000000000005e-06, "loss": 0.6914, "step": 35},
+    {"epoch": 0.4114285714285714, "grad_norm": 174998.578125, "learning_rate": 3.5000000000000004e-06, "loss": 0.7153, "step": 36},
+    {"epoch": 0.4228571428571429, "grad_norm": 138208.15625, "learning_rate": 3.6e-06, "loss": 0.687, "step": 37},
+    {"epoch": 0.4342857142857143, "grad_norm": 103691.7265625, "learning_rate": 3.7e-06, "loss": 0.6483, "step": 38},
+    {"epoch": 0.44571428571428573, "grad_norm": 380380.25, "learning_rate": 3.8e-06, "loss": 0.7143, "step": 39},
+    {"epoch": 0.45714285714285713, "grad_norm": 269078.53125, "learning_rate": 3.9e-06, "loss": 0.6461, "step": 40},
+    {"epoch": 0.4685714285714286, "grad_norm": 134496.34375, "learning_rate": 4.000000000000001e-06, "loss": 0.6758, "step": 41},
+    {"epoch": 0.48, "grad_norm": 428575.6875, "learning_rate": 4.1000000000000006e-06, "loss": 0.6868, "step": 42},
+    {"epoch": 0.49142857142857144, "grad_norm": 133276.71875, "learning_rate": 4.2000000000000004e-06, "loss": 0.7269, "step": 43},
+    {"epoch": 0.5028571428571429, "grad_norm": 237024.0625, "learning_rate": 4.2999999999999995e-06, "loss": 0.5928, "step": 44},
+    {"epoch": 0.5142857142857142, "grad_norm": 149670.96875, "learning_rate": 4.4e-06, "loss": 0.6346, "step": 45},
+    {"epoch": 0.5257142857142857, "grad_norm": 183600.953125, "learning_rate": 4.5e-06, "loss": 0.5803, "step": 46},
+    {"epoch": 0.5371428571428571, "grad_norm": 133982.6875, "learning_rate": 4.6e-06, "loss": 0.6558, "step": 47},
+    {"epoch": 0.5485714285714286, "grad_norm": 176806.078125, "learning_rate": 4.7e-06, "loss": 0.7226, "step": 48},
+    {"epoch": 0.56, "grad_norm": 324079.4375, "learning_rate": 4.800000000000001e-06, "loss": 0.7344, "step": 49},
+    {"epoch": 0.5714285714285714, "grad_norm": 163303.296875, "learning_rate": 4.9000000000000005e-06, "loss": 0.7074, "step": 50},
+    {"epoch": 0.5828571428571429, "grad_norm": 467207.9375, "learning_rate": 5e-06, "loss": 0.7526, "step": 51},
+    {"epoch": 0.5942857142857143, "grad_norm": 180564.609375, "learning_rate": 5.1e-06, "loss": 0.6799, "step": 52},
+    {"epoch": 0.6057142857142858, "grad_norm": 251939.953125, "learning_rate": 5.2e-06, "loss": 0.6534, "step": 53},
+    {"epoch": 0.6171428571428571, "grad_norm": 140387.640625, "learning_rate": 5.3e-06, "loss": 0.5768, "step": 54},
+    {"epoch": 0.6285714285714286, "grad_norm": 136101.828125, "learning_rate": 5.4e-06, "loss": 0.6431, "step": 55},
+    {"epoch": 0.64, "grad_norm": 157331.125, "learning_rate": 5.500000000000001e-06, "loss": 0.6337, "step": 56},
+    {"epoch": 0.6514285714285715, "grad_norm": 194157.609375, "learning_rate": 5.600000000000001e-06, "loss": 0.6755, "step": 57},
+    {"epoch": 0.6628571428571428, "grad_norm": 150589.046875, "learning_rate": 5.7000000000000005e-06, "loss": 0.6846, "step": 58},
+    {"epoch": 0.6742857142857143, "grad_norm": 159283.9375, "learning_rate": 5.8e-06, "loss": 0.6495, "step": 59},
+    {"epoch": 0.6857142857142857, "grad_norm": 150622.296875, "learning_rate": 5.9e-06, "loss": 0.675, "step": 60},
+    {"epoch": 0.6971428571428572, "grad_norm": 403344.03125, "learning_rate": 6e-06, "loss": 0.6674, "step": 61},
+    {"epoch": 0.7085714285714285, "grad_norm": 120991.046875, "learning_rate": 6.1e-06, "loss": 0.6584, "step": 62},
+    {"epoch": 0.72, "grad_norm": 125491.6171875, "learning_rate": 6.2e-06, "loss": 0.6695, "step": 63},
+    {"epoch": 0.7314285714285714, "grad_norm": 137534.640625, "learning_rate": 6.300000000000001e-06, "loss": 0.6989, "step": 64},
+    {"epoch": 0.7428571428571429, "grad_norm": 282724.15625, "learning_rate": 6.4000000000000006e-06, "loss": 0.642, "step": 65},
+    {"epoch": 0.7542857142857143, "grad_norm": 144163.921875, "learning_rate": 6.5000000000000004e-06, "loss": 0.6244, "step": 66},
+    {"epoch": 0.7657142857142857, "grad_norm": 146836.90625, "learning_rate": 6.6e-06, "loss": 0.6402, "step": 67},
+    {"epoch": 0.7771428571428571, "grad_norm": 118937.953125, "learning_rate": 6.700000000000001e-06, "loss": 0.6798, "step": 68},
+    {"epoch": 0.7885714285714286, "grad_norm": 163573.25, "learning_rate": 6.800000000000001e-06, "loss": 0.672, "step": 69},
+    {"epoch": 0.8, "grad_norm": 136640.703125, "learning_rate": 6.900000000000001e-06, "loss": 0.6546, "step": 70},
+    {"epoch": 0.8114285714285714, "grad_norm": 152148.46875, "learning_rate": 7.000000000000001e-06, "loss": 0.6608, "step": 71},
+    {"epoch": 0.8228571428571428, "grad_norm": 193317.15625, "learning_rate": 7.1e-06, "loss": 0.7177, "step": 72},
+    {"epoch": 0.8342857142857143, "grad_norm": 295872.84375, "learning_rate": 7.2e-06, "loss": 0.6906, "step": 73},
+    {"epoch": 0.8457142857142858, "grad_norm": 354559.40625, "learning_rate": 7.2999999999999996e-06, "loss": 0.6491, "step": 74},
+    {"epoch": 0.8571428571428571, "grad_norm": 272566.65625, "learning_rate": 7.4e-06, "loss": 0.593, "step": 75},
+    {"epoch": 0.8685714285714285, "grad_norm": 118028.6484375, "learning_rate": 7.5e-06, "loss": 0.602, "step": 76},
+    {"epoch": 0.88, "grad_norm": 115188.484375, "learning_rate": 7.6e-06, "loss": 0.6822, "step": 77},
+    {"epoch": 0.8914285714285715, "grad_norm": 272999.90625, "learning_rate": 7.7e-06, "loss": 0.6062, "step": 78},
+    {"epoch": 0.9028571428571428, "grad_norm": 133682.421875, "learning_rate": 7.8e-06, "loss": 0.6068, "step": 79},
+    {"epoch": 0.9142857142857143, "grad_norm": 244230.953125, "learning_rate": 7.9e-06, "loss": 0.6167, "step": 80},
+    {"epoch": 0.9257142857142857, "grad_norm": 341543.125, "learning_rate": 8.000000000000001e-06, "loss": 0.7138, "step": 81},
+    {"epoch": 0.9371428571428572, "grad_norm": 511938.4375, "learning_rate": 8.1e-06, "loss": 0.6086, "step": 82},
+    {"epoch": 0.9485714285714286, "grad_norm": 139162.578125, "learning_rate": 8.200000000000001e-06, "loss": 0.6382, "step": 83},
+    {"epoch": 0.96, "grad_norm": 294741.0, "learning_rate": 8.3e-06, "loss": 0.572, "step": 84},
+    {"epoch": 0.9714285714285714, "grad_norm": 133599.796875, "learning_rate": 8.400000000000001e-06, "loss": 0.5747, "step": 85},
+    {"epoch": 0.9828571428571429, "grad_norm": 202760.46875, "learning_rate": 8.500000000000002e-06, "loss": 0.677, "step": 86},
+    {"epoch": 0.9942857142857143, "grad_norm": 210599.0625, "learning_rate": 8.599999999999999e-06, "loss": 0.6257, "step": 87},
+    {"epoch": 1.0, "grad_norm": 235686.546875, "learning_rate": 8.7e-06, "loss": 0.3265, "step": 88},
+    {"epoch": 1.0114285714285713, "grad_norm": 202346.375, "learning_rate": 8.8e-06, "loss": 0.6414, "step": 89},
+    {"epoch": 1.022857142857143, "grad_norm": 255899.5, "learning_rate": 8.9e-06, "loss": 0.5057, "step": 90},
+    {"epoch": 1.0342857142857143, "grad_norm": 164965.328125, "learning_rate": 9e-06, "loss": 0.5368, "step": 91},
+    {"epoch": 1.0457142857142858, "grad_norm": 230824.515625, "learning_rate": 9.100000000000001e-06, "loss": 0.5639, "step": 92},
+    {"epoch": 1.0571428571428572, "grad_norm": 215165.125, "learning_rate": 9.2e-06, "loss": 0.612, "step": 93},
+    {"epoch": 1.0685714285714285, "grad_norm": 158922.203125, "learning_rate": 9.3e-06, "loss": 0.611, "step": 94},
+    {"epoch": 1.08, "grad_norm": 141139.6875, "learning_rate": 9.4e-06, "loss": 0.5533, "step": 95},
+    {"epoch": 1.0914285714285714, "grad_norm": 183250.34375, "learning_rate": 9.5e-06, "loss": 0.6086, "step": 96},
+    {"epoch": 1.1028571428571428, "grad_norm": 120192.1015625, "learning_rate": 9.600000000000001e-06, "loss": 0.5613, "step": 97},
+    {"epoch": 1.1142857142857143, "grad_norm": 229168.09375, "learning_rate": 9.7e-06, "loss": 0.6028, "step": 98},
+    {"epoch": 1.1257142857142857, "grad_norm": 136412.65625, "learning_rate": 9.800000000000001e-06, "loss": 0.6136, "step": 99},
+    {"epoch": 1.1371428571428572, "grad_norm": 173484.328125, "learning_rate": 9.900000000000002e-06, "loss": 0.6129, "step": 100},
+    {"epoch": 1.1485714285714286, "grad_norm": 157432.859375, "learning_rate": 1e-05, "loss": 0.5692, "step": 101},
+    {"epoch": 1.16, "grad_norm": 353016.96875, "learning_rate": 1.0100000000000002e-05, "loss": 0.5369, "step": 102},
+    {"epoch": 1.1714285714285715, "grad_norm": 175952.03125, "learning_rate": 1.02e-05, "loss": 0.6299, "step": 103},
+    {"epoch": 1.1828571428571428, "grad_norm": 229591.46875, "learning_rate": 1.03e-05, "loss": 0.6234, "step": 104},
+    {"epoch": 1.1942857142857144, "grad_norm": 198739.484375, "learning_rate": 1.04e-05, "loss": 0.5545, "step": 105},
+    {"epoch": 1.2057142857142857, "grad_norm": 360903.9375, "learning_rate": 1.05e-05, "loss": 0.6226, "step": 106},
+    {"epoch": 1.217142857142857, "grad_norm": 165000.71875, "learning_rate": 1.06e-05, "loss": 0.4832, "step": 107},
+    {"epoch": 1.2285714285714286, "grad_norm": 166604.828125, "learning_rate": 1.0700000000000001e-05, "loss": 0.6472, "step": 108},
+    {"epoch": 1.24, "grad_norm": 237359.609375, "learning_rate": 1.08e-05, "loss": 0.6102, "step": 109},
+    {"epoch": 1.2514285714285713, "grad_norm": 157993.421875, "learning_rate": 1.09e-05, "loss": 0.6639, "step": 110},
+    {"epoch": 1.262857142857143, "grad_norm": 318521.78125, "learning_rate": 1.1000000000000001e-05, "loss": 0.5252, "step": 111},
+    {"epoch": 1.2742857142857142, "grad_norm": 188877.15625, "learning_rate": 1.11e-05, "loss": 0.4726, "step": 112},
+    {"epoch": 1.2857142857142856, "grad_norm": 188785.421875, "learning_rate": 1.1200000000000001e-05, "loss": 0.4682, "step": 113},
+    {"epoch": 1.2971428571428572, "grad_norm": 227261.40625, "learning_rate": 1.13e-05, "loss": 0.4504, "step": 114},
+    {"epoch": 1.3085714285714285, "grad_norm": 162537.125, "learning_rate": 1.1400000000000001e-05, "loss": 0.508, "step": 115},
+    {"epoch": 1.32, "grad_norm": 181633.890625, "learning_rate": 1.1500000000000002e-05, "loss": 0.4634, "step": 116},
+    {"epoch": 1.3314285714285714, "grad_norm": 206084.078125, "learning_rate": 1.16e-05, "loss": 0.5446, "step": 117},
+    {"epoch": 1.342857142857143, "grad_norm": 237778.296875, "learning_rate": 1.1700000000000001e-05, "loss": 0.5681, "step": 118},
+    {"epoch": 1.3542857142857143, "grad_norm": 197374.0625, "learning_rate": 1.18e-05, "loss": 0.4843, "step": 119},
+    {"epoch": 1.3657142857142857, "grad_norm": 237646.25, "learning_rate": 1.19e-05, "loss": 0.6275, "step": 120},
+    {"epoch": 1.3771428571428572, "grad_norm": 248032.671875, "learning_rate": 1.2e-05, "loss": 0.6916, "step": 121},
+    {"epoch": 1.3885714285714286, "grad_norm": 185239.84375, "learning_rate": 1.2100000000000001e-05, "loss": 0.4597, "step": 122},
+    {"epoch": 1.4, "grad_norm": 322073.59375, "learning_rate": 1.22e-05, "loss": 0.4998, "step": 123},
+    {"epoch": 1.4114285714285715, "grad_norm": 196296.5625, "learning_rate": 1.23e-05, "loss": 0.469, "step": 124},
+    {"epoch": 1.4228571428571428, "grad_norm": 227619.03125, "learning_rate": 1.24e-05, "loss": 0.4645, "step": 125},
+    {"epoch": 1.4342857142857142, "grad_norm": 184984.859375, "learning_rate": 1.25e-05, "loss": 0.463, "step": 126},
+    {"epoch": 1.4457142857142857, "grad_norm": 175033.484375, "learning_rate": 1.2600000000000001e-05, "loss": 0.3954, "step": 127},
+    {"epoch": 1.457142857142857, "grad_norm": 275993.84375, "learning_rate": 1.27e-05, "loss": 0.5972, "step": 128},
+    {"epoch": 1.4685714285714286, "grad_norm": 394419.4375, "learning_rate": 1.2800000000000001e-05, "loss": 0.6099, "step": 129},
+    {"epoch": 1.48, "grad_norm": 393686.625, "learning_rate": 1.29e-05, "loss": 0.5569, "step": 130},
+    {"epoch": 1.4914285714285715, "grad_norm": 248753.34375, "learning_rate": 1.3000000000000001e-05, "loss": 0.4639, "step": 131},
+    {"epoch": 1.502857142857143, "grad_norm": 322526.90625, "learning_rate": 1.3100000000000002e-05, "loss": 0.4935, "step": 132},
+    {"epoch": 1.5142857142857142, "grad_norm": 223679.984375, "learning_rate": 1.32e-05, "loss": 0.5462, "step": 133},
+    {"epoch": 1.5257142857142858, "grad_norm": 254149.53125, "learning_rate": 1.3300000000000001e-05, "loss": 0.4753, "step": 134},
+    {"epoch": 1.5371428571428571, "grad_norm": 334813.0625, "learning_rate": 1.3400000000000002e-05, "loss": 0.5321, "step": 135},
+    {"epoch": 1.5485714285714285, "grad_norm": 336419.03125, "learning_rate": 1.3500000000000001e-05, "loss": 0.6368, "step": 136},
+    {"epoch": 1.56, "grad_norm": 206770.359375, "learning_rate": 1.3600000000000002e-05, "loss": 0.3531, "step": 137},
+    {"epoch": 1.5714285714285714, "grad_norm": 360716.1875, "learning_rate": 1.3700000000000001e-05, "loss": 0.5424, "step": 138},
+    {"epoch": 1.5828571428571427, "grad_norm": 177707.234375, "learning_rate": 1.3800000000000002e-05, "loss": 0.4255, "step": 139},
+    {"epoch": 1.5942857142857143, "grad_norm": 236608.4375, "learning_rate": 1.3900000000000002e-05, "loss": 0.4149, "step": 140},
+    {"epoch": 1.6057142857142859, "grad_norm": 222276.046875, "learning_rate": 1.4000000000000001e-05, "loss": 0.4502, "step": 141},
+    {"epoch": 1.617142857142857, "grad_norm": 240427.28125, "learning_rate": 1.4099999999999999e-05, "loss": 0.4582, "step": 142},
+    {"epoch": 1.6285714285714286, "grad_norm": 330109.34375, "learning_rate": 1.42e-05, "loss": 0.597, "step": 143},
+    {"epoch": 1.6400000000000001, "grad_norm": 203952.8125, "learning_rate": 1.43e-05, "loss": 0.421, "step": 144},
+    {"epoch": 1.6514285714285715, "grad_norm": 314465.8125, "learning_rate": 1.44e-05, "loss": 0.4334, "step": 145},
+    {"epoch": 1.6628571428571428, "grad_norm": 260632.484375, "learning_rate": 1.45e-05, "loss": 0.4155, "step": 146},
+    {"epoch": 1.6742857142857144, "grad_norm": 600515.625, "learning_rate": 1.4599999999999999e-05, "loss": 0.4835, "step": 147},
+    {"epoch": 1.6857142857142857, "grad_norm": 210723.390625, "learning_rate": 1.47e-05, "loss": 0.2872, "step": 148},
+    {"epoch": 1.697142857142857, "grad_norm": 320605.6875, "learning_rate": 1.48e-05, "loss": 0.5275, "step": 149},
+    {"epoch": 1.7085714285714286, "grad_norm": 339823.46875, "learning_rate": 1.49e-05, "loss": 0.6135, "step": 150},
+    {"epoch": 1.72, "grad_norm": 260396.15625, "learning_rate": 1.5e-05, "loss": 0.4322, "step": 151},
+    {"epoch": 1.7314285714285713, "grad_norm": 458704.375, "learning_rate": 1.51e-05, "loss": 0.508, "step": 152},
+    {"epoch": 1.7428571428571429, "grad_norm": 540310.75, "learning_rate": 1.52e-05, "loss": 0.5959, "step": 153},
+    {"epoch": 1.7542857142857144, "grad_norm": 282878.21875, "learning_rate": 1.53e-05, "loss": 0.4027, "step": 154},
+    {"epoch": 1.7657142857142856, "grad_norm": 242143.484375, "learning_rate": 1.54e-05, "loss": 0.3716, "step": 155},
+    {"epoch": 1.7771428571428571, "grad_norm": 150724.328125, "learning_rate": 1.55e-05, "loss": 0.2495, "step": 156},
+    {"epoch": 1.7885714285714287, "grad_norm": 400347.625, "learning_rate": 1.56e-05, "loss": 0.537, "step": 157},
+    {"epoch": 1.8, "grad_norm": 539187.6875, "learning_rate": 1.5700000000000002e-05, "loss": 0.3636, "step": 158},
+    {"epoch": 1.8114285714285714, "grad_norm": 268363.625, "learning_rate": 1.58e-05, "loss": 0.3159, "step": 159},
+    {"epoch": 1.822857142857143, "grad_norm": 460837.0, "learning_rate": 1.59e-05, "loss": 0.5676, "step": 160},
+    {"epoch": 1.8342857142857143, "grad_norm": 260367.046875, "learning_rate": 1.6000000000000003e-05, "loss": 0.3577, "step": 161},
+    {"epoch": 1.8457142857142856, "grad_norm": 378360.0, "learning_rate": 1.6100000000000002e-05, "loss": 0.4669, "step": 162},
+    {"epoch": 1.8571428571428572, "grad_norm": 315112.875, "learning_rate": 1.62e-05, "loss": 0.5173, "step": 163},
+    {"epoch": 1.8685714285714285, "grad_norm": 385707.71875, "learning_rate": 1.63e-05, "loss": 0.4655, "step": 164},
+    {"epoch": 1.88, "grad_norm": 221989.453125, "learning_rate": 1.6400000000000002e-05, "loss": 0.2326, "step": 165},
+    {"epoch": 1.8914285714285715, "grad_norm": 284272.3125, "learning_rate": 1.65e-05, "loss": 0.3275, "step": 166},
+    {"epoch": 1.9028571428571428, "grad_norm": 200877.9375, "learning_rate": 1.66e-05, "loss": 0.2807, "step": 167},
+    {"epoch": 1.9142857142857141, "grad_norm": 435734.4375, "learning_rate": 1.6700000000000003e-05, "loss": 0.5807, "step": 168},
+    {"epoch": 1.9257142857142857, "grad_norm": 243956.390625, "learning_rate": 1.6800000000000002e-05, "loss": 0.3503, "step": 169},
+    {"epoch": 1.9371428571428573, "grad_norm": 215639.5625, "learning_rate": 1.69e-05, "loss": 0.3334, "step": 170},
+    {"epoch": 1.9485714285714286, "grad_norm": 337599.59375, "learning_rate": 1.7000000000000003e-05, "loss": 0.3732, "step": 171},
+    {"epoch": 1.96, "grad_norm": 193279.8125, "learning_rate": 1.7100000000000002e-05, "loss": 0.2066, "step": 172},
+    {"epoch": 1.9714285714285715, "grad_norm": 292815.90625, "learning_rate": 1.7199999999999998e-05, "loss": 0.3958, "step": 173},
+    {"epoch": 1.9828571428571429, "grad_norm": 215893.390625, "learning_rate": 1.73e-05, "loss": 0.2177, "step": 174},
+    {"epoch": 1.9942857142857142, "grad_norm": 264176.84375, "learning_rate": 1.74e-05, "loss": 0.3627, "step": 175},
+    {"epoch": 2.0, "grad_norm": 338398.84375, "learning_rate": 1.75e-05, "loss": 0.2794, "step": 176},
+    {"epoch": 2.0114285714285716, "grad_norm": 382892.53125, "learning_rate": 1.76e-05, "loss": 0.42, "step": 177},
+    {"epoch": 2.0228571428571427, "grad_norm": 294513.09375, "learning_rate": 1.77e-05, "loss": 0.3591, "step": 178},
+    {"epoch": 2.0342857142857143, "grad_norm": 288106.6875, "learning_rate": 1.78e-05, "loss": 0.2242, "step": 179},
+    {"epoch": 2.045714285714286, "grad_norm": 182290.265625, "learning_rate": 1.79e-05, "loss": 0.1664, "step": 180},
+    {"epoch": 2.057142857142857, "grad_norm": 154645.578125, "learning_rate": 1.8e-05, "loss": 0.1343, "step": 181},
+    {"epoch": 2.0685714285714285, "grad_norm": 279664.71875, "learning_rate": 1.81e-05, "loss": 0.2759, "step": 182},
+    {"epoch": 2.08, "grad_norm": 172414.234375, "learning_rate": 1.8200000000000002e-05, "loss": 0.1847, "step": 183},
+    {"epoch": 2.0914285714285716, "grad_norm": 378055.875, "learning_rate": 1.83e-05, "loss": 0.2032, "step": 184},
+    {"epoch": 2.1028571428571428, "grad_norm": 154689.90625, "learning_rate": 1.84e-05, "loss": 0.1663, "step": 185},
+    {"epoch": 2.1142857142857143, "grad_norm": 203879.921875, "learning_rate": 1.85e-05, "loss": 0.1529, "step": 186},
+    {"epoch": 2.125714285714286, "grad_norm": 363326.59375, "learning_rate": 1.86e-05, "loss": 0.244, "step": 187},
+    {"epoch": 2.137142857142857, "grad_norm": 185000.40625, "learning_rate": 1.87e-05, "loss": 0.2258, "step": 188},
+    {"epoch": 2.1485714285714286, "grad_norm": 318877.1875, "learning_rate": 1.88e-05, "loss": 0.3265, "step": 189},
+    {"epoch": 2.16, "grad_norm": 482190.375, "learning_rate": 1.8900000000000002e-05, "loss": 0.3193, "step": 190},
+    {"epoch": 2.1714285714285713, "grad_norm": 215147.875, "learning_rate": 1.9e-05, "loss": 0.107, "step": 191},
+    {"epoch": 2.182857142857143, "grad_norm": 330217.3125, "learning_rate": 1.91e-05, "loss": 0.308, "step": 192},
+    {"epoch": 2.1942857142857144, "grad_norm": 386270.59375, "learning_rate": 1.9200000000000003e-05, "loss": 0.2939, "step": 193},
+    {"epoch": 2.2057142857142855, "grad_norm": 188564.09375, "learning_rate": 1.93e-05, "loss": 0.1267, "step": 194},
+    {"epoch": 2.217142857142857, "grad_norm": 484612.375, "learning_rate": 1.94e-05, "loss": 0.2262, "step": 195},
+    {"epoch": 2.2285714285714286, "grad_norm": 349719.21875, "learning_rate": 1.9500000000000003e-05, "loss": 0.2971, "step": 196},
+    {"epoch": 2.24, "grad_norm": 410038.4375, "learning_rate": 1.9600000000000002e-05, "loss": 0.217, "step": 197},
+    {"epoch": 2.2514285714285713, "grad_norm": 191728.890625, "learning_rate": 1.97e-05, "loss": 0.1174, "step": 198},
+    {"epoch": 2.262857142857143, "grad_norm": 626332.0, "learning_rate": 1.9800000000000004e-05, "loss": 0.3399, "step": 199},
+    {"epoch": 2.2742857142857145, "grad_norm": 186624.015625, "learning_rate": 1.9900000000000003e-05, "loss": 0.1435, "step": 200},
+    {"epoch": 2.2857142857142856, "grad_norm": 218461.3125, "learning_rate": 2e-05, "loss": 0.1764, "step": 201},
+    {"epoch": 2.297142857142857, "grad_norm": 245343.21875, "learning_rate": 2.01e-05, "loss": 0.1779, "step": 202},
+    {"epoch": 2.3085714285714287, "grad_norm": 159707.234375, "learning_rate": 2.0200000000000003e-05, "loss": 0.0742, "step": 203},
+    {"epoch": 2.32, "grad_norm": 359501.40625, "learning_rate": 2.0300000000000002e-05, "loss": 0.1966, "step": 204},
+    {"epoch": 2.3314285714285714, "grad_norm": 363601.65625, "learning_rate": 2.04e-05, "loss": 0.2353, "step": 205},
+    {"epoch": 2.342857142857143, "grad_norm": 240596.375, "learning_rate": 2.05e-05, "loss": 0.1005, "step": 206},
+    {"epoch": 2.354285714285714, "grad_norm": 340894.21875, "learning_rate": 2.06e-05, "loss": 0.2333, "step": 207},
+    {"epoch": 2.3657142857142857, "grad_norm": 580463.3125, "learning_rate": 2.07e-05, "loss": 0.3599, "step": 208},
+    {"epoch": 2.3771428571428572, "grad_norm": 250996.9375, "learning_rate": 2.08e-05, "loss": 0.1929, "step": 209},
+    {"epoch": 2.388571428571429, "grad_norm": 690737.5625, "learning_rate": 2.09e-05, "loss": 0.3504, "step": 210},
+    {"epoch": 2.4, "grad_norm": 289985.28125, "learning_rate": 2.1e-05, "loss": 0.1902, "step": 211},
+    {"epoch": 2.4114285714285715, "grad_norm": 361845.1875, "learning_rate": 2.11e-05, "loss": 0.2156, "step": 212},
+    {"epoch": 2.422857142857143, "grad_norm": 273145.15625, "learning_rate": 2.12e-05, "loss": 0.1493, "step": 213},
+    {"epoch": 2.434285714285714, "grad_norm": 322946.625, "learning_rate": 2.13e-05, "loss": 0.156, "step": 214},
+    {"epoch": 2.4457142857142857, "grad_norm": 246520.5, "learning_rate": 2.1400000000000002e-05, "loss": 0.2083, "step": 215},
+    {"epoch": 2.4571428571428573, "grad_norm": 226999.8125, "learning_rate": 2.15e-05, "loss": 0.1215, "step": 216},
+    {"epoch": 2.4685714285714284, "grad_norm": 334140.5625, "learning_rate": 2.16e-05, "loss": 0.271, "step": 217},
+    {"epoch": 2.48, "grad_norm": 130634.296875, "learning_rate": 2.1700000000000002e-05, "loss": 0.0671, "step": 218},
+    {"epoch": 2.4914285714285715, "grad_norm": 491331.3125, "learning_rate": 2.18e-05, "loss": 0.2711, "step": 219},
+    {"epoch": 2.5028571428571427, "grad_norm": 858986.6875, "learning_rate": 2.19e-05, "loss": 0.2869, "step": 220},
+    {"epoch": 2.5142857142857142, "grad_norm": 712107.0, "learning_rate": 2.2000000000000003e-05, "loss": 0.2121, "step": 221},
+    {"epoch": 2.525714285714286, "grad_norm": 467553.5, "learning_rate": 2.2100000000000002e-05, "loss": 0.179, "step": 222},
+    {"epoch": 2.5371428571428574, "grad_norm": 239981.125, "learning_rate": 2.22e-05, "loss": 0.1026,
"step": 223
|
| 1572 |
+
},
|
| 1573 |
+
{
|
| 1574 |
+
"epoch": 2.5485714285714285,
|
| 1575 |
+
"grad_norm": 317598.46875,
|
| 1576 |
+
"learning_rate": 2.23e-05,
|
| 1577 |
+
"loss": 0.1298,
|
| 1578 |
+
"step": 224
|
| 1579 |
+
},
|
| 1580 |
+
{
|
| 1581 |
+
"epoch": 2.56,
|
| 1582 |
+
"grad_norm": 531234.4375,
|
| 1583 |
+
"learning_rate": 2.2400000000000002e-05,
|
| 1584 |
+
"loss": 0.3043,
|
| 1585 |
+
"step": 225
|
| 1586 |
+
},
|
| 1587 |
+
{
|
| 1588 |
+
"epoch": 2.571428571428571,
|
| 1589 |
+
"grad_norm": 259380.4375,
|
| 1590 |
+
"learning_rate": 2.25e-05,
|
| 1591 |
+
"loss": 0.0951,
|
| 1592 |
+
"step": 226
|
| 1593 |
+
},
|
| 1594 |
+
{
|
| 1595 |
+
"epoch": 2.5828571428571427,
|
| 1596 |
+
"grad_norm": 188962.984375,
|
| 1597 |
+
"learning_rate": 2.26e-05,
|
| 1598 |
+
"loss": 0.1134,
|
| 1599 |
+
"step": 227
|
| 1600 |
+
},
|
| 1601 |
+
{
|
| 1602 |
+
"epoch": 2.5942857142857143,
|
| 1603 |
+
"grad_norm": 266559.78125,
|
| 1604 |
+
"learning_rate": 2.2700000000000003e-05,
|
| 1605 |
+
"loss": 0.1057,
|
| 1606 |
+
"step": 228
|
| 1607 |
+
},
|
| 1608 |
+
{
|
| 1609 |
+
"epoch": 2.605714285714286,
|
| 1610 |
+
"grad_norm": 324699.53125,
|
| 1611 |
+
"learning_rate": 2.2800000000000002e-05,
|
| 1612 |
+
"loss": 0.1209,
|
| 1613 |
+
"step": 229
|
| 1614 |
+
},
|
| 1615 |
+
{
|
| 1616 |
+
"epoch": 2.617142857142857,
|
| 1617 |
+
"grad_norm": 385137.0625,
|
| 1618 |
+
"learning_rate": 2.29e-05,
|
| 1619 |
+
"loss": 0.1874,
|
| 1620 |
+
"step": 230
|
| 1621 |
+
},
|
| 1622 |
+
{
|
| 1623 |
+
"epoch": 2.6285714285714286,
|
| 1624 |
+
"grad_norm": 382826.625,
|
| 1625 |
+
"learning_rate": 2.3000000000000003e-05,
|
| 1626 |
+
"loss": 0.217,
|
| 1627 |
+
"step": 231
|
| 1628 |
+
},
|
| 1629 |
+
{
|
| 1630 |
+
"epoch": 2.64,
|
| 1631 |
+
"grad_norm": 497464.625,
|
| 1632 |
+
"learning_rate": 2.3100000000000002e-05,
|
| 1633 |
+
"loss": 0.2488,
|
| 1634 |
+
"step": 232
|
| 1635 |
+
},
|
| 1636 |
+
{
|
| 1637 |
+
"epoch": 2.6514285714285712,
|
| 1638 |
+
"grad_norm": 243506.890625,
|
| 1639 |
+
"learning_rate": 2.32e-05,
|
| 1640 |
+
"loss": 0.1325,
|
| 1641 |
+
"step": 233
|
| 1642 |
+
},
|
| 1643 |
+
{
|
| 1644 |
+
"epoch": 2.662857142857143,
|
| 1645 |
+
"grad_norm": 399187.125,
|
| 1646 |
+
"learning_rate": 2.3300000000000004e-05,
|
| 1647 |
+
"loss": 0.2673,
|
| 1648 |
+
"step": 234
|
| 1649 |
+
},
|
| 1650 |
+
{
|
| 1651 |
+
"epoch": 2.6742857142857144,
|
| 1652 |
+
"grad_norm": 484322.15625,
|
| 1653 |
+
"learning_rate": 2.3400000000000003e-05,
|
| 1654 |
+
"loss": 0.2877,
|
| 1655 |
+
"step": 235
|
| 1656 |
+
},
|
| 1657 |
+
{
|
| 1658 |
+
"epoch": 2.685714285714286,
|
| 1659 |
+
"grad_norm": 416523.34375,
|
| 1660 |
+
"learning_rate": 2.35e-05,
|
| 1661 |
+
"loss": 0.1781,
|
| 1662 |
+
"step": 236
|
| 1663 |
+
},
|
| 1664 |
+
{
|
| 1665 |
+
"epoch": 2.697142857142857,
|
| 1666 |
+
"grad_norm": 408684.53125,
|
| 1667 |
+
"learning_rate": 2.36e-05,
|
| 1668 |
+
"loss": 0.3024,
|
| 1669 |
+
"step": 237
|
| 1670 |
+
},
|
| 1671 |
+
{
|
| 1672 |
+
"epoch": 2.7085714285714286,
|
| 1673 |
+
"grad_norm": 127704.9453125,
|
| 1674 |
+
"learning_rate": 2.37e-05,
|
| 1675 |
+
"loss": 0.0451,
|
| 1676 |
+
"step": 238
|
| 1677 |
+
},
|
| 1678 |
+
{
|
| 1679 |
+
"epoch": 2.7199999999999998,
|
| 1680 |
+
"grad_norm": 260942.40625,
|
| 1681 |
+
"learning_rate": 2.38e-05,
|
| 1682 |
+
"loss": 0.1902,
|
| 1683 |
+
"step": 239
|
| 1684 |
+
},
|
| 1685 |
+
{
|
| 1686 |
+
"epoch": 2.7314285714285713,
|
| 1687 |
+
"grad_norm": 166456.453125,
|
| 1688 |
+
"learning_rate": 2.39e-05,
|
| 1689 |
+
"loss": 0.109,
|
| 1690 |
+
"step": 240
|
| 1691 |
+
},
|
| 1692 |
+
{
|
| 1693 |
+
"epoch": 2.742857142857143,
|
| 1694 |
+
"grad_norm": 489848.5,
|
| 1695 |
+
"learning_rate": 2.4e-05,
|
| 1696 |
+
"loss": 0.1719,
|
| 1697 |
+
"step": 241
|
| 1698 |
+
},
|
| 1699 |
+
{
|
| 1700 |
+
"epoch": 2.7542857142857144,
|
| 1701 |
+
"grad_norm": 63778.44921875,
|
| 1702 |
+
"learning_rate": 2.41e-05,
|
| 1703 |
+
"loss": 0.0549,
|
| 1704 |
+
"step": 242
|
| 1705 |
+
},
|
| 1706 |
+
{
|
| 1707 |
+
"epoch": 2.7657142857142856,
|
| 1708 |
+
"grad_norm": 454851.5,
|
| 1709 |
+
"learning_rate": 2.4200000000000002e-05,
|
| 1710 |
+
"loss": 0.1278,
|
| 1711 |
+
"step": 243
|
| 1712 |
+
},
|
| 1713 |
+
{
|
| 1714 |
+
"epoch": 2.777142857142857,
|
| 1715 |
+
"grad_norm": 442328.875,
|
| 1716 |
+
"learning_rate": 2.43e-05,
|
| 1717 |
+
"loss": 0.1828,
|
| 1718 |
+
"step": 244
|
| 1719 |
+
},
|
| 1720 |
+
{
|
| 1721 |
+
"epoch": 2.7885714285714287,
|
| 1722 |
+
"grad_norm": 83840.3828125,
|
| 1723 |
+
"learning_rate": 2.44e-05,
|
| 1724 |
+
"loss": 0.0719,
|
| 1725 |
+
"step": 245
|
| 1726 |
+
},
|
| 1727 |
+
{
|
| 1728 |
+
"epoch": 2.8,
|
| 1729 |
+
"grad_norm": 503019.90625,
|
| 1730 |
+
"learning_rate": 2.45e-05,
|
| 1731 |
+
"loss": 0.1672,
|
| 1732 |
+
"step": 246
|
| 1733 |
+
},
|
| 1734 |
+
{
|
| 1735 |
+
"epoch": 2.8114285714285714,
|
| 1736 |
+
"grad_norm": 441193.40625,
|
| 1737 |
+
"learning_rate": 2.46e-05,
|
| 1738 |
+
"loss": 0.2557,
|
| 1739 |
+
"step": 247
|
| 1740 |
+
},
|
| 1741 |
+
{
|
| 1742 |
+
"epoch": 2.822857142857143,
|
| 1743 |
+
"grad_norm": 554498.75,
|
| 1744 |
+
"learning_rate": 2.47e-05,
|
| 1745 |
+
"loss": 0.1547,
|
| 1746 |
+
"step": 248
|
| 1747 |
+
},
|
| 1748 |
+
{
|
| 1749 |
+
"epoch": 2.8342857142857145,
|
| 1750 |
+
"grad_norm": 206641.640625,
|
| 1751 |
+
"learning_rate": 2.48e-05,
|
| 1752 |
+
"loss": 0.0603,
|
| 1753 |
+
"step": 249
|
| 1754 |
+
},
|
| 1755 |
+
{
|
| 1756 |
+
"epoch": 2.8457142857142856,
|
| 1757 |
+
"grad_norm": 370398.03125,
|
| 1758 |
+
"learning_rate": 2.4900000000000002e-05,
|
| 1759 |
+
"loss": 0.175,
|
| 1760 |
+
"step": 250
|
| 1761 |
+
},
|
| 1762 |
+
{
|
| 1763 |
+
"epoch": 2.857142857142857,
|
| 1764 |
+
"grad_norm": 1053230.25,
|
| 1765 |
+
"learning_rate": 2.5e-05,
|
| 1766 |
+
"loss": 0.4524,
|
| 1767 |
+
"step": 251
|
| 1768 |
+
},
|
| 1769 |
+
{
|
| 1770 |
+
"epoch": 2.8685714285714283,
|
| 1771 |
+
"grad_norm": 508066.0625,
|
| 1772 |
+
"learning_rate": 2.51e-05,
|
| 1773 |
+
"loss": 0.1333,
|
| 1774 |
+
"step": 252
|
| 1775 |
+
},
|
| 1776 |
+
{
|
| 1777 |
+
"epoch": 2.88,
|
| 1778 |
+
"grad_norm": 676064.3125,
|
| 1779 |
+
"learning_rate": 2.5200000000000003e-05,
|
| 1780 |
+
"loss": 0.1995,
|
| 1781 |
+
"step": 253
|
| 1782 |
+
},
|
| 1783 |
+
{
|
| 1784 |
+
"epoch": 2.8914285714285715,
|
| 1785 |
+
"grad_norm": 906289.125,
|
| 1786 |
+
"learning_rate": 2.5300000000000002e-05,
|
| 1787 |
+
"loss": 0.21,
|
| 1788 |
+
"step": 254
|
| 1789 |
+
},
|
| 1790 |
+
{
|
| 1791 |
+
"epoch": 2.902857142857143,
|
| 1792 |
+
"grad_norm": 97272.0234375,
|
| 1793 |
+
"learning_rate": 2.54e-05,
|
| 1794 |
+
"loss": 0.0462,
|
| 1795 |
+
"step": 255
|
| 1796 |
+
},
|
| 1797 |
+
{
|
| 1798 |
+
"epoch": 2.914285714285714,
|
| 1799 |
+
"grad_norm": 642197.8125,
|
| 1800 |
+
"learning_rate": 2.5500000000000003e-05,
|
| 1801 |
+
"loss": 0.2187,
|
| 1802 |
+
"step": 256
|
| 1803 |
+
},
|
| 1804 |
+
{
|
| 1805 |
+
"epoch": 2.9257142857142857,
|
| 1806 |
+
"grad_norm": 339229.46875,
|
| 1807 |
+
"learning_rate": 2.5600000000000002e-05,
|
| 1808 |
+
"loss": 0.081,
|
| 1809 |
+
"step": 257
|
| 1810 |
+
},
|
| 1811 |
+
{
|
| 1812 |
+
"epoch": 2.9371428571428573,
|
| 1813 |
+
"grad_norm": 586855.6875,
|
| 1814 |
+
"learning_rate": 2.57e-05,
|
| 1815 |
+
"loss": 0.1322,
|
| 1816 |
+
"step": 258
|
| 1817 |
+
},
|
| 1818 |
+
{
|
| 1819 |
+
"epoch": 2.9485714285714284,
|
| 1820 |
+
"grad_norm": 747893.125,
|
| 1821 |
+
"learning_rate": 2.58e-05,
|
| 1822 |
+
"loss": 0.1534,
|
| 1823 |
+
"step": 259
|
| 1824 |
+
},
|
| 1825 |
+
{
|
| 1826 |
+
"epoch": 2.96,
|
| 1827 |
+
"grad_norm": 221327.59375,
|
| 1828 |
+
"learning_rate": 2.5900000000000003e-05,
|
| 1829 |
+
"loss": 0.097,
|
| 1830 |
+
"step": 260
|
| 1831 |
+
},
|
| 1832 |
+
{
|
| 1833 |
+
"epoch": 2.9714285714285715,
|
| 1834 |
+
"grad_norm": 403810.5625,
|
| 1835 |
+
"learning_rate": 2.6000000000000002e-05,
|
| 1836 |
+
"loss": 0.1275,
|
| 1837 |
+
"step": 261
|
| 1838 |
+
},
|
| 1839 |
+
{
|
| 1840 |
+
"epoch": 2.982857142857143,
|
| 1841 |
+
"grad_norm": 367188.0625,
|
| 1842 |
+
"learning_rate": 2.61e-05,
|
| 1843 |
+
"loss": 0.1013,
|
| 1844 |
+
"step": 262
|
| 1845 |
+
},
|
| 1846 |
+
{
|
| 1847 |
+
"epoch": 2.994285714285714,
|
| 1848 |
+
"grad_norm": 283714.46875,
|
| 1849 |
+
"learning_rate": 2.6200000000000003e-05,
|
| 1850 |
+
"loss": 0.0684,
|
| 1851 |
+
"step": 263
|
| 1852 |
+
},
|
| 1853 |
+
{
|
| 1854 |
+
"epoch": 3.0,
|
| 1855 |
+
"grad_norm": 73199.34375,
|
| 1856 |
+
"learning_rate": 2.6300000000000002e-05,
|
| 1857 |
+
"loss": 0.0201,
|
| 1858 |
+
"step": 264
|
| 1859 |
+
}
|
| 1860 |
+
],
|
| 1861 |
+
"logging_steps": 1,
|
| 1862 |
+
"max_steps": 264,
|
| 1863 |
+
"num_input_tokens_seen": 0,
|
| 1864 |
+
"num_train_epochs": 3,
|
| 1865 |
+
"save_steps": 1000,
|
| 1866 |
+
"stateful_callbacks": {
|
| 1867 |
+
"TrainerControl": {
|
| 1868 |
+
"args": {
|
| 1869 |
+
"should_epoch_stop": false,
|
| 1870 |
+
"should_evaluate": false,
|
| 1871 |
+
"should_log": false,
|
| 1872 |
+
"should_save": true,
|
| 1873 |
+
"should_training_stop": true
|
| 1874 |
+
},
|
| 1875 |
+
"attributes": {}
|
| 1876 |
+
}
|
| 1877 |
+
},
|
| 1878 |
+
"total_flos": 0.0,
|
| 1879 |
+
"train_batch_size": 4,
|
| 1880 |
+
"trial_name": null,
|
| 1881 |
+
"trial_params": null
|
| 1882 |
+
}
|
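Since trainer_state.json is plain JSON, the training curve it records can be pulled out programmatically. A minimal sketch (the relative path assumes you have downloaded this repo locally):

```python
import json

# Read the Trainer state saved alongside the checkpoint.
with open("checkpoint-264/trainer_state.json") as f:
    state = json.load(f)

# Every log_history entry carries epoch, grad_norm, learning_rate, loss and step.
losses = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
print(f"global_step={state['global_step']}, final loss={losses[-1][1]}")
```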
checkpoint-264/training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:63b0120fce60578d9e81786e748a190bb4474283a6da60a888acb5e76ffdb7b1
+size 5304
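All of the .pt/.bin blobs in this commit are stored with Git LFS, so the diff shows only the three-line pointer file, not the binary itself. A small sketch of how such a pointer can be parsed, using the training_args.bin pointer above as input:

```python
# A Git LFS pointer is three "key value" lines: version, oid, size.
def parse_lfs_pointer(text: str) -> dict:
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    assert fields["version"].startswith("https://git-lfs.github.com/spec/")
    return {"oid": fields["oid"], "size": int(fields["size"])}

pointer = (
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:63b0120fce60578d9e81786e748a190bb4474283a6da60a888acb5e76ffdb7b1\n"
    "size 5304\n"
)
print(parse_lfs_pointer(pointer))  # the real 5304-byte file lives in LFS storage
```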
checkpoint-33/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec34d4e7ab4d7abfeef47c7f03b45f84b9e9da98966ca04b0e43765d28043e4b
+size 1077684090
checkpoint-33/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8df15fa14bcdc8a852381afaca240748bff9c761104a03b52c07b82b2acc8c53
+size 538847386
checkpoint-33/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:febb0720a7b7a4a2843bb6bc6b3c4091ef74ab3a4dc3719e96eaf91818cc6625
+size 14244
checkpoint-33/scaler.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b30172cf14f5dbe00280d63e36224a9f28dc7a0e8b38a74ceb5eb284e84da363
+size 988
checkpoint-33/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b782f9331da2cc758d1243717b84cf018be6d8153f33eb844449612c18eca0e6
+size 1064
checkpoint-33/trainer_state.json
ADDED
@@ -0,0 +1,265 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 33,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.09090909090909091,
+      "grad_norm": 411345.15625,
+      "learning_rate": 0.0,
+      "loss": 0.4991,
+      "step": 1
+    },
+    {
+      "epoch": 0.18181818181818182,
+      "grad_norm": 451981.875,
+      "learning_rate": 1.0000000000000001e-07,
+      "loss": 0.5075,
+      "step": 2
+    },
+    {
+      "epoch": 0.2727272727272727,
+      "grad_norm": 428244.09375,
+      "learning_rate": 2.0000000000000002e-07,
+      "loss": 0.4775,
+      "step": 3
+    },
+    {
+      "epoch": 0.36363636363636365,
+      "grad_norm": 446559.96875,
+      "learning_rate": 3.0000000000000004e-07,
+      "loss": 0.4627,
+      "step": 4
+    },
+    {
+      "epoch": 0.45454545454545453,
+      "grad_norm": 474946.78125,
+      "learning_rate": 4.0000000000000003e-07,
+      "loss": 0.4707,
+      "step": 5
+    },
+    {
+      "epoch": 0.5454545454545454,
+      "grad_norm": 400206.125,
+      "learning_rate": 5.000000000000001e-07,
+      "loss": 0.4979,
+      "step": 6
+    },
+    {
+      "epoch": 0.6363636363636364,
+      "grad_norm": 500328.625,
+      "learning_rate": 6.000000000000001e-07,
+      "loss": 0.4593,
+      "step": 7
+    },
+    {
+      "epoch": 0.7272727272727273,
+      "grad_norm": 424670.90625,
+      "learning_rate": 7.000000000000001e-07,
+      "loss": 0.4899,
+      "step": 8
+    },
+    {
+      "epoch": 0.8181818181818182,
+      "grad_norm": 480500.8125,
+      "learning_rate": 8.000000000000001e-07,
+      "loss": 0.449,
+      "step": 9
+    },
+    {
+      "epoch": 0.9090909090909091,
+      "grad_norm": 453084.65625,
+      "learning_rate": 9e-07,
+      "loss": 0.4767,
+      "step": 10
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 496174.34375,
+      "learning_rate": 1.0000000000000002e-06,
+      "loss": 0.4601,
+      "step": 11
+    },
+    {
+      "epoch": 1.0909090909090908,
+      "grad_norm": 417244.5,
+      "learning_rate": 1.1e-06,
+      "loss": 0.4786,
+      "step": 12
+    },
+    {
+      "epoch": 1.1818181818181819,
+      "grad_norm": 519528.6875,
+      "learning_rate": 1.2000000000000002e-06,
+      "loss": 0.4517,
+      "step": 13
+    },
+    {
+      "epoch": 1.2727272727272727,
+      "grad_norm": 389402.09375,
+      "learning_rate": 1.3e-06,
+      "loss": 0.4772,
+      "step": 14
+    },
+    {
+      "epoch": 1.3636363636363638,
+      "grad_norm": 418312.84375,
+      "learning_rate": 1.4000000000000001e-06,
+      "loss": 0.4495,
+      "step": 15
+    },
+    {
+      "epoch": 1.4545454545454546,
+      "grad_norm": 464238.53125,
+      "learning_rate": 1.5e-06,
+      "loss": 0.4699,
+      "step": 16
+    },
+    {
+      "epoch": 1.5454545454545454,
+      "grad_norm": 467524.3125,
+      "learning_rate": 1.6000000000000001e-06,
+      "loss": 0.4315,
+      "step": 17
+    },
+    {
+      "epoch": 1.6363636363636362,
+      "grad_norm": 383996.6875,
+      "learning_rate": 1.7000000000000002e-06,
+      "loss": 0.4465,
+      "step": 18
+    },
+    {
+      "epoch": 1.7272727272727273,
+      "grad_norm": 400570.375,
+      "learning_rate": 1.8e-06,
+      "loss": 0.4583,
+      "step": 19
+    },
+    {
+      "epoch": 1.8181818181818183,
+      "grad_norm": 433225.6875,
+      "learning_rate": 1.9e-06,
+      "loss": 0.4295,
+      "step": 20
+    },
+    {
+      "epoch": 1.9090909090909092,
+      "grad_norm": 372978.4375,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 0.4378,
+      "step": 21
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 366551.03125,
+      "learning_rate": 2.1000000000000002e-06,
+      "loss": 0.451,
+      "step": 22
+    },
+    {
+      "epoch": 2.090909090909091,
+      "grad_norm": 351444.25,
+      "learning_rate": 2.2e-06,
+      "loss": 0.4483,
+      "step": 23
+    },
+    {
+      "epoch": 2.1818181818181817,
+      "grad_norm": 427177.1875,
+      "learning_rate": 2.3e-06,
+      "loss": 0.398,
+      "step": 24
+    },
+    {
+      "epoch": 2.2727272727272725,
+      "grad_norm": 435540.125,
+      "learning_rate": 2.4000000000000003e-06,
+      "loss": 0.3949,
+      "step": 25
+    },
+    {
+      "epoch": 2.3636363636363638,
+      "grad_norm": 274281.46875,
+      "learning_rate": 2.5e-06,
+      "loss": 0.4489,
+      "step": 26
+    },
+    {
+      "epoch": 2.4545454545454546,
+      "grad_norm": 364169.5625,
+      "learning_rate": 2.6e-06,
+      "loss": 0.4239,
+      "step": 27
+    },
+    {
+      "epoch": 2.5454545454545454,
+      "grad_norm": 389919.0625,
+      "learning_rate": 2.7e-06,
+      "loss": 0.3796,
+      "step": 28
+    },
+    {
+      "epoch": 2.6363636363636362,
+      "grad_norm": 402486.5,
+      "learning_rate": 2.8000000000000003e-06,
+      "loss": 0.362,
+      "step": 29
+    },
+    {
+      "epoch": 2.7272727272727275,
+      "grad_norm": 351625.8125,
+      "learning_rate": 2.9e-06,
+      "loss": 0.3626,
+      "step": 30
+    },
+    {
+      "epoch": 2.8181818181818183,
+      "grad_norm": 324217.0,
+      "learning_rate": 3e-06,
+      "loss": 0.4019,
+      "step": 31
+    },
+    {
+      "epoch": 2.909090909090909,
+      "grad_norm": 324626.65625,
+      "learning_rate": 3.1e-06,
+      "loss": 0.3991,
+      "step": 32
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 350339.21875,
+      "learning_rate": 3.2000000000000003e-06,
+      "loss": 0.3578,
+      "step": 33
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 33,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 1000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}
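The learning_rate column in this 33-step run climbs from 0.0 by exactly 1e-07 per step, which looks like a linear warmup; that is an inference from the log, not a documented setting (training_args.bin would confirm the actual scheduler). A quick check of the pattern:

```python
import json

with open("checkpoint-33/trainer_state.json") as f:
    history = json.load(f)["log_history"]

# Assumed pattern from the log above: lr = (step - 1) * 1e-07.
for entry in history:
    expected = (entry["step"] - 1) * 1e-07
    assert abs(entry["learning_rate"] - expected) < 1e-12, entry
print("learning rate increases linearly by 1e-07 per step")
```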
checkpoint-33/training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4bb3465bf860bb3c1e30634e85ad7c5c85e5600cd85cdf19979bac7a31b0c03e
+size 5304
checkpoint-66/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8aa4bd45c6dcb8a63d5a7e1bce7ac3962bfb4edad2fe0e77733b514d0f28cea2
+size 1077684090
checkpoint-66/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1de198c6c2e5d2857cbb98084f2ee3eacd27439a36438ea106c6d79a1215b9bf
+size 538847386
checkpoint-66/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8586512a4566a393800b83e344d6730e6751cb11be4ad93fd9de02ed9e47b01f
+size 14244
checkpoint-66/scaler.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b30172cf14f5dbe00280d63e36224a9f28dc7a0e8b38a74ceb5eb284e84da363
+size 988
checkpoint-66/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:08311e5e26013ba10a7f8d66b2f49ef69d462d28b9abadfa05d7f9d217be8416
+size 1064
checkpoint-66/trainer_state.json
ADDED
@@ -0,0 +1,496 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 66,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.045454545454545456,
+      "grad_norm": 160962.125,
+      "learning_rate": 0.0,
+      "loss": 0.6352,
+      "step": 1
+    },
+    {
+      "epoch": 0.09090909090909091,
+      "grad_norm": 244773.53125,
+      "learning_rate": 1.0000000000000001e-07,
+      "loss": 0.7508,
+      "step": 2
+    },
+    {
+      "epoch": 0.13636363636363635,
+      "grad_norm": 318127.71875,
+      "learning_rate": 2.0000000000000002e-07,
+      "loss": 0.7731,
+      "step": 3
+    },
+    {
+      "epoch": 0.18181818181818182,
+      "grad_norm": 111222.8984375,
+      "learning_rate": 3.0000000000000004e-07,
+      "loss": 0.704,
+      "step": 4
+    },
+    {
+      "epoch": 0.22727272727272727,
+      "grad_norm": 311278.1875,
+      "learning_rate": 4.0000000000000003e-07,
+      "loss": 0.7375,
+      "step": 5
+    },
+    {
+      "epoch": 0.2727272727272727,
+      "grad_norm": 86837.1484375,
+      "learning_rate": 5.000000000000001e-07,
+      "loss": 0.6853,
+      "step": 6
+    },
+    {
+      "epoch": 0.3181818181818182,
+      "grad_norm": 172702.484375,
+      "learning_rate": 6.000000000000001e-07,
+      "loss": 0.7058,
+      "step": 7
+    },
+    {
+      "epoch": 0.36363636363636365,
+      "grad_norm": 112781.3125,
+      "learning_rate": 7.000000000000001e-07,
+      "loss": 0.6864,
+      "step": 8
+    },
+    {
+      "epoch": 0.4090909090909091,
+      "grad_norm": 174636.5625,
+      "learning_rate": 8.000000000000001e-07,
+      "loss": 0.7207,
+      "step": 9
+    },
+    {
+      "epoch": 0.45454545454545453,
+      "grad_norm": 118128.6328125,
+      "learning_rate": 9e-07,
+      "loss": 0.6841,
+      "step": 10
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 157266.96875,
+      "learning_rate": 1.0000000000000002e-06,
+      "loss": 0.7045,
+      "step": 11
+    },
+    {
+      "epoch": 0.5454545454545454,
+      "grad_norm": 70379.9609375,
+      "learning_rate": 1.1e-06,
+      "loss": 0.6865,
+      "step": 12
+    },
+    {
+      "epoch": 0.5909090909090909,
+      "grad_norm": 242274.828125,
+      "learning_rate": 1.2000000000000002e-06,
+      "loss": 0.7568,
+      "step": 13
+    },
+    {
+      "epoch": 0.6363636363636364,
+      "grad_norm": 102352.6953125,
+      "learning_rate": 1.3e-06,
+      "loss": 0.6823,
+      "step": 14
+    },
+    {
+      "epoch": 0.6818181818181818,
+      "grad_norm": 149701.515625,
+      "learning_rate": 1.4000000000000001e-06,
+      "loss": 0.7222,
+      "step": 15
+    },
+    {
+      "epoch": 0.7272727272727273,
+      "grad_norm": 220230.953125,
+      "learning_rate": 1.5e-06,
+      "loss": 0.7444,
+      "step": 16
+    },
+    {
+      "epoch": 0.7727272727272727,
+      "grad_norm": 219751.140625,
+      "learning_rate": 1.6000000000000001e-06,
+      "loss": 0.7566,
+      "step": 17
+    },
+    {
+      "epoch": 0.8181818181818182,
+      "grad_norm": 136347.578125,
+      "learning_rate": 1.7000000000000002e-06,
+      "loss": 0.7222,
+      "step": 18
+    },
+    {
+      "epoch": 0.8636363636363636,
+      "grad_norm": 229799.953125,
+      "learning_rate": 1.8e-06,
+      "loss": 0.7471,
+      "step": 19
+    },
+    {
+      "epoch": 0.9090909090909091,
+      "grad_norm": 219161.375,
+      "learning_rate": 1.9e-06,
+      "loss": 0.6964,
+      "step": 20
+    },
+    {
+      "epoch": 0.9545454545454546,
+      "grad_norm": 306108.9375,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 0.744,
+      "step": 21
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 86656.046875,
+      "learning_rate": 2.1000000000000002e-06,
+      "loss": 0.669,
+      "step": 22
+    },
+    {
+      "epoch": 1.0454545454545454,
+      "grad_norm": 189982.796875,
+      "learning_rate": 2.2e-06,
+      "loss": 0.6871,
+      "step": 23
+    },
+    {
+      "epoch": 1.0909090909090908,
+      "grad_norm": 196403.34375,
+      "learning_rate": 2.3e-06,
+      "loss": 0.7039,
+      "step": 24
+    },
+    {
+      "epoch": 1.1363636363636362,
+      "grad_norm": 106505.265625,
+      "learning_rate": 2.4000000000000003e-06,
+      "loss": 0.7048,
+      "step": 25
+    },
+    {
+      "epoch": 1.1818181818181819,
+      "grad_norm": 70174.6015625,
+      "learning_rate": 2.5e-06,
+      "loss": 0.6748,
+      "step": 26
+    },
+    {
+      "epoch": 1.2272727272727273,
+      "grad_norm": 154341.859375,
+      "learning_rate": 2.6e-06,
+      "loss": 0.7283,
+      "step": 27
+    },
+    {
+      "epoch": 1.2727272727272727,
+      "grad_norm": 90075.5703125,
+      "learning_rate": 2.7e-06,
+      "loss": 0.6827,
+      "step": 28
+    },
+    {
+      "epoch": 1.3181818181818181,
+      "grad_norm": 181959.78125,
+      "learning_rate": 2.8000000000000003e-06,
+      "loss": 0.68,
+      "step": 29
+    },
+    {
+      "epoch": 1.3636363636363638,
+      "grad_norm": 115269.859375,
+      "learning_rate": 2.9e-06,
+      "loss": 0.694,
+      "step": 30
+    },
+    {
+      "epoch": 1.4090909090909092,
+      "grad_norm": 285997.34375,
+      "learning_rate": 3e-06,
+      "loss": 0.7203,
+      "step": 31
+    },
+    {
+      "epoch": 1.4545454545454546,
+      "grad_norm": 90478.671875,
+      "learning_rate": 3.1e-06,
+      "loss": 0.6679,
+      "step": 32
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 126602.9921875,
+      "learning_rate": 3.2000000000000003e-06,
+      "loss": 0.6943,
+      "step": 33
+    },
+    {
+      "epoch": 1.5454545454545454,
+      "grad_norm": 67576.828125,
+      "learning_rate": 3.3e-06,
+      "loss": 0.6764,
+      "step": 34
+    },
+    {
+      "epoch": 1.5909090909090908,
+      "grad_norm": 90499.078125,
+      "learning_rate": 3.4000000000000005e-06,
+      "loss": 0.6837,
+      "step": 35
+    },
+    {
+      "epoch": 1.6363636363636362,
+      "grad_norm": 87432.2421875,
+      "learning_rate": 3.5000000000000004e-06,
+      "loss": 0.6906,
+      "step": 36
+    },
+    {
+      "epoch": 1.6818181818181817,
+      "grad_norm": 149016.34375,
+      "learning_rate": 3.6e-06,
+      "loss": 0.6801,
+      "step": 37
+    },
+    {
+      "epoch": 1.7272727272727273,
+      "grad_norm": 184693.59375,
+      "learning_rate": 3.7e-06,
+      "loss": 0.696,
+      "step": 38
+    },
+    {
+      "epoch": 1.7727272727272727,
+      "grad_norm": 121410.65625,
+      "learning_rate": 3.8e-06,
+      "loss": 0.6775,
+      "step": 39
+    },
+    {
+      "epoch": 1.8181818181818183,
+      "grad_norm": 183644.046875,
+      "learning_rate": 3.9e-06,
+      "loss": 0.6763,
+      "step": 40
+    },
+    {
+      "epoch": 1.8636363636363638,
+      "grad_norm": 117808.8828125,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 0.6813,
+      "step": 41
+    },
+    {
+      "epoch": 1.9090909090909092,
+      "grad_norm": 91046.4765625,
+      "learning_rate": 4.1000000000000006e-06,
+      "loss": 0.6698,
+      "step": 42
+    },
+    {
+      "epoch": 1.9545454545454546,
+      "grad_norm": 70385.1328125,
+      "learning_rate": 4.2000000000000004e-06,
+      "loss": 0.6569,
+      "step": 43
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 172729.546875,
+      "learning_rate": 4.2999999999999995e-06,
+      "loss": 0.6717,
+      "step": 44
+    },
+    {
+      "epoch": 2.0454545454545454,
+      "grad_norm": 81323.515625,
+      "learning_rate": 4.4e-06,
+      "loss": 0.6756,
+      "step": 45
+    },
+    {
+      "epoch": 2.090909090909091,
+      "grad_norm": 215954.421875,
+      "learning_rate": 4.5e-06,
+      "loss": 0.6544,
+      "step": 46
+    },
+    {
+      "epoch": 2.1363636363636362,
+      "grad_norm": 159946.03125,
+      "learning_rate": 4.6e-06,
+      "loss": 0.6802,
+      "step": 47
+    },
+    {
+      "epoch": 2.1818181818181817,
+      "grad_norm": 89950.46875,
+      "learning_rate": 4.7e-06,
+      "loss": 0.6706,
+      "step": 48
+    },
+    {
+      "epoch": 2.227272727272727,
+      "grad_norm": 132273.921875,
+      "learning_rate": 4.800000000000001e-06,
+      "loss": 0.6643,
+      "step": 49
+    },
+    {
+      "epoch": 2.2727272727272725,
+      "grad_norm": 79753.4140625,
+      "learning_rate": 4.9000000000000005e-06,
+      "loss": 0.6651,
+      "step": 50
+    },
+    {
+      "epoch": 2.3181818181818183,
+      "grad_norm": 131875.796875,
+      "learning_rate": 5e-06,
+      "loss": 0.6347,
+      "step": 51
+    },
+    {
+      "epoch": 2.3636363636363638,
+      "grad_norm": 116849.3125,
+      "learning_rate": 5.1e-06,
+      "loss": 0.6409,
+      "step": 52
+    },
+    {
+      "epoch": 2.409090909090909,
+      "grad_norm": 108762.7265625,
+      "learning_rate": 5.2e-06,
+      "loss": 0.6631,
+      "step": 53
+    },
+    {
+      "epoch": 2.4545454545454546,
+      "grad_norm": 121625.21875,
+      "learning_rate": 5.3e-06,
+      "loss": 0.6262,
+      "step": 54
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 151074.40625,
+      "learning_rate": 5.4e-06,
+      "loss": 0.6412,
+      "step": 55
+    },
+    {
+      "epoch": 2.5454545454545454,
+      "grad_norm": 77287.875,
+      "learning_rate": 5.500000000000001e-06,
+      "loss": 0.657,
+      "step": 56
+    },
+    {
+      "epoch": 2.590909090909091,
+      "grad_norm": 89415.078125,
+      "learning_rate": 5.600000000000001e-06,
+      "loss": 0.6158,
+      "step": 57
+    },
+    {
+      "epoch": 2.6363636363636362,
+      "grad_norm": 94613.78125,
+      "learning_rate": 5.7000000000000005e-06,
+      "loss": 0.6293,
+      "step": 58
+    },
+    {
+      "epoch": 2.6818181818181817,
+      "grad_norm": 82698.2890625,
+      "learning_rate": 5.8e-06,
+      "loss": 0.6181,
+      "step": 59
+    },
+    {
+      "epoch": 2.7272727272727275,
+      "grad_norm": 94982.9296875,
+      "learning_rate": 5.9e-06,
+      "loss": 0.6432,
+      "step": 60
+    },
+    {
+      "epoch": 2.7727272727272725,
+      "grad_norm": 217844.328125,
+      "learning_rate": 6e-06,
+      "loss": 0.6294,
+      "step": 61
+    },
+    {
+      "epoch": 2.8181818181818183,
+      "grad_norm": 105915.171875,
+      "learning_rate": 6.1e-06,
+      "loss": 0.6642,
+      "step": 62
+    },
+    {
+      "epoch": 2.8636363636363638,
+      "grad_norm": 105239.1953125,
+      "learning_rate": 6.2e-06,
+      "loss": 0.6337,
+      "step": 63
+    },
+    {
+      "epoch": 2.909090909090909,
+      "grad_norm": 171439.03125,
+      "learning_rate": 6.300000000000001e-06,
+      "loss": 0.6623,
+      "step": 64
+    },
+    {
+      "epoch": 2.9545454545454546,
+      "grad_norm": 85152.15625,
+      "learning_rate": 6.4000000000000006e-06,
+      "loss": 0.6153,
+      "step": 65
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 102992.1015625,
+      "learning_rate": 6.5000000000000004e-06,
+      "loss": 0.6266,
+      "step": 66
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 66,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 1000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}
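The three checkpoint directories record three separate runs rather than one run saved three times: max_steps is 33, 66 and 264, and train_batch_size drops from 16 to 4 for the long run. A short sketch comparing where each run's loss ended up:

```python
import json

for ckpt in ("checkpoint-33", "checkpoint-66", "checkpoint-264"):
    with open(f"{ckpt}/trainer_state.json") as f:
        state = json.load(f)
    final = [e for e in state["log_history"] if "loss" in e][-1]
    print(f"{ckpt}: {state['max_steps']} steps, "
          f"batch {state['train_batch_size']}, final loss {final['loss']}")
```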
checkpoint-66/training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dfa835122b5449fe11f1ec55a2faa1403e892769d235b6d070132deaa48534ea
+size 5304
merges.txt
ADDED
The diff for this file is too large to render. See raw diff.
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b630ce86e5360d08d3a55fda3f77c14874ecf351a2668a36b4ce68cdd918b9fa
+size 538847386
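pytorch_model.bin at the repo root has the same size and sha256 as checkpoint-264/pytorch_model.bin, so the exported weights appear to be the final 264-step checkpoint. Assuming it is the usual state_dict that Trainer writes, it can be inspected without the modeling code:

```python
import torch

# Load tensors only, on CPU; no GPU or model class is needed for a quick look.
state_dict = torch.load("pytorch_model.bin", map_location="cpu")
for name, tensor in list(state_dict.items())[:5]:
    print(name, tuple(tensor.shape))
```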
runs/Jul16_13-42-47_cbb1763e7b33/events.out.tfevents.1752673369.cbb1763e7b33.87.0
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75d0246771f4eca2cc53511094cc4822543da8cddcc07a55bea656a947385e58
+size 4267
runs/Jul16_13-58-22_cbb1763e7b33/events.out.tfevents.1752674303.cbb1763e7b33.87.1
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06923dbcb8255683bc34b34af45cbe610fa4209cf45cb63fa067d7cd3e766b4c
+size 4267
runs/Jul16_14-02-03_cbb1763e7b33/events.out.tfevents.1752674524.cbb1763e7b33.87.2
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0aff489290cd4ad7ac9325967c1dcc613cc45023c0e3a495f17668ea069baeac
+size 4265
runs/Jul16_14-03-33_cbb1763e7b33/events.out.tfevents.1752674613.cbb1763e7b33.87.3
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f178e3bc8d56efb7909883c1f591bd0093d847a883bda40d61dcbdd9d3f4e40a
+size 4265
runs/Jul16_14-04-31_cbb1763e7b33/events.out.tfevents.1752674671.cbb1763e7b33.87.4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:289f8ad07a6169d36c2c546efdd0a0c0f2287a3d22f6baf2fc86184e288e2d8d
+size 11431
runs/Jul16_14-04-31_cbb1763e7b33/events.out.tfevents.1752674869.cbb1763e7b33.87.5
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:efe6ec76453347aeb153e0207a11da9b3f16a96296a060ff72bc7511dba09426
+size 551
runs/Jul16_14-17-51_cbb1763e7b33/events.out.tfevents.1752675472.cbb1763e7b33.87.6
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:587ff2c891dd7c5ae79105d75a6957f276df4b72486c6c485fa2fa6b58af8051
+size 4252
runs/Jul16_14-19-15_cbb1763e7b33/events.out.tfevents.1752675556.cbb1763e7b33.87.7
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:20fe0d5ab97a06015572c0d51d5ba214dd3bbc7fba5932d612cfbdbc2da586a0
+size 18262
runs/Jul16_14-28-26_cbb1763e7b33/events.out.tfevents.1752676107.cbb1763e7b33.87.8
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae21f3dc752121a62390ccb8fa48393a511b32e72506a8fa2d57daea121d7cdd
+size 4252
runs/Jul16_14-28-51_cbb1763e7b33/events.out.tfevents.1752676132.cbb1763e7b33.87.9
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4146883453ec6f604b205483afb2a8a19dfcb036b104d1ea39018bfc527116cf
+size 4252
runs/Jul16_14-29-42_cbb1763e7b33/events.out.tfevents.1752676183.cbb1763e7b33.87.10
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:944b044ae19a79612f77d687f47b83062252e866a7edd63d0624e5bad7a923b5
+size 59802
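The runs/ directory holds one TensorBoard event file per launch; the many ~4 KB files presumably come from aborted or restarted runs, while the 59 KB file from Jul16_14-29-42 likely corresponds to the full 264-step training. They can be read back without TensorBoard's UI; a sketch (tag names are the usual Trainer ones, but verify with Tags() on your copy):

```python
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

acc = EventAccumulator("runs/Jul16_14-29-42_cbb1763e7b33")
acc.Reload()
print(acc.Tags()["scalars"])           # e.g. ['train/loss', 'train/learning_rate', ...]
for event in acc.Scalars("train/loss")[:3]:
    print(event.step, event.value)
```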
special_tokens_map.json
ADDED
@@ -0,0 +1,43 @@
+{
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<|im_start|>",
+    "<|im_end|>",
+    "<repo_name>",
+    "<reponame>",
+    "<file_sep>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<jupyter_script>",
+    "<empty_output>"
+  ],
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|endoftext|>",
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
tokenizer_config.json
ADDED
@@ -0,0 +1,169 @@
+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<repo_name>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<file_sep>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<jupyter_script>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<|im_start|>",
+    "<|im_end|>",
+    "<repo_name>",
+    "<reponame>",
+    "<file_sep>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<jupyter_script>",
+    "<empty_output>"
+  ],
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>",
+  "vocab_size": 49152
+}
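tokenizer_config.json declares a GPT2Tokenizer (byte-level BPE) with a 49,152-token vocabulary, an 8192-token model_max_length, and <|im_start|>/<|im_end|> chat markers; the <jupyter_*> and <gh_stars> tokens resemble the SmolLM/StarCoder tokenizer family. Loading it back is the standard one-liner (the "." path stands in for this repo's id on the Hub):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")  # picks up vocab.json, merges.txt, tokenizer.json
print(tok.pad_token, tok.model_max_length)        # <|endoftext|> 8192
print(tok("<|im_start|>hello<|im_end|>").input_ids)
```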
training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:63b0120fce60578d9e81786e748a190bb4474283a6da60a888acb5e76ffdb7b1
+size 5304
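training_args.bin is a pickled TrainingArguments object; note that this root copy shares its sha256 with checkpoint-264/training_args.bin, again tying the exported model to that run. Recovering the hyperparameters requires full unpickling; a sketch (on torch >= 2.6 pass weights_only=False explicitly):

```python
import torch

# TrainingArguments is an arbitrary Python object, so opt out of tensor-only loading.
args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate, args.per_device_train_batch_size, args.num_train_epochs)
```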
vocab.json
ADDED
The diff for this file is too large to render. See raw diff.