hafizhaaarama commited on
Commit
9cdc291
·
verified ·
1 Parent(s): 5571c98

Upload folder using huggingface_hub

Browse files
Files changed (44) hide show
  1. README.md +8 -6
  2. checkpoint-130/model.safetensors +1 -1
  3. checkpoint-130/optimizer.pt +1 -1
  4. checkpoint-130/scheduler.pt +1 -1
  5. checkpoint-130/trainer_state.json +50 -50
  6. checkpoint-130/training_args.bin +1 -1
  7. checkpoint-195/model.safetensors +1 -1
  8. checkpoint-195/optimizer.pt +1 -1
  9. checkpoint-195/scheduler.pt +1 -1
  10. checkpoint-195/trainer_state.json +73 -73
  11. checkpoint-195/training_args.bin +1 -1
  12. checkpoint-260/model.safetensors +3 -0
  13. checkpoint-260/optimizer.pt +3 -0
  14. checkpoint-260/rng_state.pth +3 -0
  15. checkpoint-260/scheduler.pt +3 -0
  16. checkpoint-260/special_tokens_map.json +7 -0
  17. checkpoint-260/tokenizer.json +0 -0
  18. checkpoint-260/tokenizer_config.json +56 -0
  19. checkpoint-260/trainer_state.json +248 -0
  20. checkpoint-260/training_args.bin +3 -0
  21. checkpoint-260/vocab.txt +0 -0
  22. checkpoint-325/model.safetensors +3 -0
  23. checkpoint-325/optimizer.pt +3 -0
  24. checkpoint-325/rng_state.pth +3 -0
  25. checkpoint-325/scheduler.pt +3 -0
  26. checkpoint-325/special_tokens_map.json +7 -0
  27. checkpoint-325/tokenizer.json +0 -0
  28. checkpoint-325/tokenizer_config.json +56 -0
  29. checkpoint-325/trainer_state.json +298 -0
  30. checkpoint-325/training_args.bin +3 -0
  31. checkpoint-325/vocab.txt +0 -0
  32. checkpoint-65/model.safetensors +3 -0
  33. checkpoint-65/optimizer.pt +3 -0
  34. checkpoint-65/rng_state.pth +3 -0
  35. checkpoint-65/scheduler.pt +3 -0
  36. checkpoint-65/special_tokens_map.json +7 -0
  37. checkpoint-65/tokenizer.json +0 -0
  38. checkpoint-65/tokenizer_config.json +56 -0
  39. checkpoint-65/trainer_state.json +84 -0
  40. checkpoint-65/training_args.bin +3 -0
  41. checkpoint-65/vocab.txt +0 -0
  42. model.safetensors +1 -1
  43. pytorch_model.bin +1 -1
  44. training_args.bin +1 -1
README.md CHANGED
@@ -16,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  This model is a fine-tuned version of [distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased) on the None dataset.
18
  It achieves the following results on the evaluation set:
19
- - Loss: 0.0067
20
 
21
  ## Model description
22
 
@@ -35,21 +35,23 @@ More information needed
35
  ### Training hyperparameters
36
 
37
  The following hyperparameters were used during training:
38
- - learning_rate: 2e-05
39
  - train_batch_size: 8
40
  - eval_batch_size: 8
41
  - seed: 42
42
  - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
43
  - lr_scheduler_type: linear
44
- - num_epochs: 3
45
 
46
  ### Training results
47
 
48
  | Training Loss | Epoch | Step | Validation Loss |
49
  |:-------------:|:-----:|:----:|:---------------:|
50
- | 0.1249 | 1.0 | 65 | 0.0411 |
51
- | 0.0147 | 2.0 | 130 | 0.0085 |
52
- | 0.0118 | 3.0 | 195 | 0.0067 |
 
 
53
 
54
 
55
  ### Framework versions
 
16
 
17
  This model is a fine-tuned version of [distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased) on the None dataset.
18
  It achieves the following results on the evaluation set:
19
+ - Loss: 0.0017
20
 
21
  ## Model description
22
 
 
35
  ### Training hyperparameters
36
 
37
  The following hyperparameters were used during training:
38
+ - learning_rate: 5e-05
39
  - train_batch_size: 8
40
  - eval_batch_size: 8
41
  - seed: 42
42
  - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
43
  - lr_scheduler_type: linear
44
+ - num_epochs: 5
45
 
46
  ### Training results
47
 
48
  | Training Loss | Epoch | Step | Validation Loss |
49
  |:-------------:|:-----:|:----:|:---------------:|
50
+ | 0.0119 | 1.0 | 65 | 0.0055 |
51
+ | 0.0039 | 2.0 | 130 | 0.0027 |
52
+ | 0.0029 | 3.0 | 195 | 0.0019 |
53
+ | 0.0027 | 4.0 | 260 | 0.0018 |
54
+ | 0.0026 | 5.0 | 325 | 0.0017 |
55
 
56
 
57
  ### Framework versions
checkpoint-130/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f062b9ad085c15780b1161f89a51545dbd1afcf2cb524739a52c49fe0222365c
3
  size 265491420
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b688524689f992a89d6c6033eff59f9de284a213648d036333a82ad02755cf3
3
  size 265491420
checkpoint-130/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3992ac39828d93828796d777d7dd813843f95e8809c86259dad72d03084cea5f
3
  size 531042682
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdc46c7f8faed5dfef2315b88b6794e1887f61836b7ed54ad2187c796ea03380
3
  size 531042682
checkpoint-130/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8fbe91af4c5769bfb655f93dcca687c5a783d2124b57d2ce7beb063d7751faa5
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7273f2b5fa3934f4d56fdce2f6a4334466d94af90f50cd48930ff238ed4a876a
3
  size 1064
checkpoint-130/trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "best_global_step": 130,
3
- "best_metric": 0.008523747324943542,
4
  "best_model_checkpoint": "./multitask_model/checkpoint-130",
5
  "epoch": 2.0,
6
  "eval_steps": 500,
@@ -11,116 +11,116 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.15384615384615385,
14
- "grad_norm": 9.697171211242676,
15
- "learning_rate": 1.907692307692308e-05,
16
- "loss": 2.4409,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.3076923076923077,
21
- "grad_norm": 7.024130344390869,
22
- "learning_rate": 1.8051282051282053e-05,
23
- "loss": 1.6879,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.46153846153846156,
28
- "grad_norm": 8.873391151428223,
29
- "learning_rate": 1.7025641025641026e-05,
30
- "loss": 1.0298,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.6153846153846154,
35
- "grad_norm": 5.143718719482422,
36
- "learning_rate": 1.6000000000000003e-05,
37
- "loss": 0.4906,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.7692307692307693,
42
- "grad_norm": 2.485305070877075,
43
- "learning_rate": 1.4974358974358976e-05,
44
- "loss": 0.2829,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.9230769230769231,
49
- "grad_norm": 1.282047152519226,
50
- "learning_rate": 1.3948717948717949e-05,
51
- "loss": 0.1249,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 1.0,
56
- "eval_loss": 0.04110988602042198,
57
- "eval_runtime": 27.2776,
58
- "eval_samples_per_second": 4.729,
59
- "eval_steps_per_second": 0.623,
60
  "step": 65
61
  },
62
  {
63
  "epoch": 1.0769230769230769,
64
- "grad_norm": 0.7192372679710388,
65
- "learning_rate": 1.2923076923076925e-05,
66
- "loss": 0.0663,
67
  "step": 70
68
  },
69
  {
70
  "epoch": 1.2307692307692308,
71
- "grad_norm": 0.48828813433647156,
72
- "learning_rate": 1.1897435897435898e-05,
73
- "loss": 0.0416,
74
  "step": 80
75
  },
76
  {
77
  "epoch": 1.3846153846153846,
78
- "grad_norm": 0.3217748999595642,
79
- "learning_rate": 1.0871794871794871e-05,
80
- "loss": 0.0281,
81
  "step": 90
82
  },
83
  {
84
  "epoch": 1.5384615384615383,
85
- "grad_norm": 0.2811843454837799,
86
- "learning_rate": 9.846153846153848e-06,
87
- "loss": 0.0965,
88
  "step": 100
89
  },
90
  {
91
  "epoch": 1.6923076923076923,
92
- "grad_norm": 0.2007972002029419,
93
- "learning_rate": 8.820512820512821e-06,
94
- "loss": 0.019,
95
  "step": 110
96
  },
97
  {
98
  "epoch": 1.8461538461538463,
99
- "grad_norm": 0.18534308671951294,
100
- "learning_rate": 7.794871794871796e-06,
101
- "loss": 0.0156,
102
  "step": 120
103
  },
104
  {
105
  "epoch": 2.0,
106
- "grad_norm": 0.20706918835639954,
107
- "learning_rate": 6.76923076923077e-06,
108
- "loss": 0.0147,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 2.0,
113
- "eval_loss": 0.008523747324943542,
114
- "eval_runtime": 29.1835,
115
- "eval_samples_per_second": 4.42,
116
- "eval_steps_per_second": 0.583,
117
  "step": 130
118
  }
119
  ],
120
  "logging_steps": 10,
121
- "max_steps": 195,
122
  "num_input_tokens_seen": 0,
123
- "num_train_epochs": 3,
124
  "save_steps": 500,
125
  "stateful_callbacks": {
126
  "TrainerControl": {
 
1
  {
2
  "best_global_step": 130,
3
+ "best_metric": 0.0026640458963811398,
4
  "best_model_checkpoint": "./multitask_model/checkpoint-130",
5
  "epoch": 2.0,
6
  "eval_steps": 500,
 
11
  "log_history": [
12
  {
13
  "epoch": 0.15384615384615385,
14
+ "grad_norm": 8.8089017868042,
15
+ "learning_rate": 4.861538461538462e-05,
16
+ "loss": 2.1939,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.3076923076923077,
21
+ "grad_norm": 3.6326286792755127,
22
+ "learning_rate": 4.707692307692308e-05,
23
+ "loss": 1.1303,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.46153846153846156,
28
+ "grad_norm": 2.114581823348999,
29
+ "learning_rate": 4.553846153846154e-05,
30
+ "loss": 0.3031,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.6153846153846154,
35
+ "grad_norm": 0.4926183223724365,
36
+ "learning_rate": 4.4000000000000006e-05,
37
+ "loss": 0.0667,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.7692307692307693,
42
+ "grad_norm": 0.26311376690864563,
43
+ "learning_rate": 4.2461538461538465e-05,
44
+ "loss": 0.0229,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.9230769230769231,
49
+ "grad_norm": 0.16266165673732758,
50
+ "learning_rate": 4.0923076923076925e-05,
51
+ "loss": 0.0119,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 1.0,
56
+ "eval_loss": 0.005465179681777954,
57
+ "eval_runtime": 28.3814,
58
+ "eval_samples_per_second": 4.545,
59
+ "eval_steps_per_second": 0.599,
60
  "step": 65
61
  },
62
  {
63
  "epoch": 1.0769230769230769,
64
+ "grad_norm": 0.09443770349025726,
65
+ "learning_rate": 3.9384615384615384e-05,
66
+ "loss": 0.0088,
67
  "step": 70
68
  },
69
  {
70
  "epoch": 1.2307692307692308,
71
+ "grad_norm": 0.05983053520321846,
72
+ "learning_rate": 3.784615384615385e-05,
73
+ "loss": 0.0066,
74
  "step": 80
75
  },
76
  {
77
  "epoch": 1.3846153846153846,
78
+ "grad_norm": 0.045136693865060806,
79
+ "learning_rate": 3.630769230769231e-05,
80
+ "loss": 0.0053,
81
  "step": 90
82
  },
83
  {
84
  "epoch": 1.5384615384615383,
85
+ "grad_norm": 0.05789942666888237,
86
+ "learning_rate": 3.476923076923077e-05,
87
+ "loss": 0.0994,
88
  "step": 100
89
  },
90
  {
91
  "epoch": 1.6923076923076923,
92
+ "grad_norm": 0.06267621368169785,
93
+ "learning_rate": 3.323076923076923e-05,
94
+ "loss": 0.0058,
95
  "step": 110
96
  },
97
  {
98
  "epoch": 1.8461538461538463,
99
+ "grad_norm": 0.050915952771902084,
100
+ "learning_rate": 3.1692307692307696e-05,
101
+ "loss": 0.0047,
102
  "step": 120
103
  },
104
  {
105
  "epoch": 2.0,
106
+ "grad_norm": 0.0482671745121479,
107
+ "learning_rate": 3.0153846153846155e-05,
108
+ "loss": 0.0039,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 2.0,
113
+ "eval_loss": 0.0026640458963811398,
114
+ "eval_runtime": 28.7779,
115
+ "eval_samples_per_second": 4.483,
116
+ "eval_steps_per_second": 0.591,
117
  "step": 130
118
  }
119
  ],
120
  "logging_steps": 10,
121
+ "max_steps": 325,
122
  "num_input_tokens_seen": 0,
123
+ "num_train_epochs": 5,
124
  "save_steps": 500,
125
  "stateful_callbacks": {
126
  "TrainerControl": {
checkpoint-130/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:073443c160f218e70f39940b6473a05da96d48c0d26f92a97ad9644c0b98e4b2
3
  size 5240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46098c8581f529d1e81c3e5c89a28fe04605657218470fc5b76dd1763697bb6e
3
  size 5240
checkpoint-195/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b1c3678511418bfcf53a1ffcd424b0b7ee78fbbf110612cbc0db585216624acd
3
  size 265491420
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b44de11280834fbaa65d65c7ae9c9ff744b07227cb65654b2f34e949a5b6429
3
  size 265491420
checkpoint-195/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ce3cd2e0830c2ef392de2f9c3da38c984be9100056d7b9151c7024e9d1156c32
3
  size 531042682
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69b46df0841f7eae17f9332233d59520a12a2b23078c3f39768b9c73b608c089
3
  size 531042682
checkpoint-195/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6341ae854532c35d5eb7c85e22f5fd447dd3009c5c76f080cb88f0f539b9815b
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2aec4d7f710a44a0a1412345131bacaa233a6dae2a3d643b7b1956862e1d086
3
  size 1064
checkpoint-195/trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "best_global_step": 195,
3
- "best_metric": 0.00674749631434679,
4
  "best_model_checkpoint": "./multitask_model/checkpoint-195",
5
  "epoch": 3.0,
6
  "eval_steps": 500,
@@ -11,166 +11,166 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.15384615384615385,
14
- "grad_norm": 9.697171211242676,
15
- "learning_rate": 1.907692307692308e-05,
16
- "loss": 2.4409,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.3076923076923077,
21
- "grad_norm": 7.024130344390869,
22
- "learning_rate": 1.8051282051282053e-05,
23
- "loss": 1.6879,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.46153846153846156,
28
- "grad_norm": 8.873391151428223,
29
- "learning_rate": 1.7025641025641026e-05,
30
- "loss": 1.0298,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.6153846153846154,
35
- "grad_norm": 5.143718719482422,
36
- "learning_rate": 1.6000000000000003e-05,
37
- "loss": 0.4906,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.7692307692307693,
42
- "grad_norm": 2.485305070877075,
43
- "learning_rate": 1.4974358974358976e-05,
44
- "loss": 0.2829,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.9230769230769231,
49
- "grad_norm": 1.282047152519226,
50
- "learning_rate": 1.3948717948717949e-05,
51
- "loss": 0.1249,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 1.0,
56
- "eval_loss": 0.04110988602042198,
57
- "eval_runtime": 27.2776,
58
- "eval_samples_per_second": 4.729,
59
- "eval_steps_per_second": 0.623,
60
  "step": 65
61
  },
62
  {
63
  "epoch": 1.0769230769230769,
64
- "grad_norm": 0.7192372679710388,
65
- "learning_rate": 1.2923076923076925e-05,
66
- "loss": 0.0663,
67
  "step": 70
68
  },
69
  {
70
  "epoch": 1.2307692307692308,
71
- "grad_norm": 0.48828813433647156,
72
- "learning_rate": 1.1897435897435898e-05,
73
- "loss": 0.0416,
74
  "step": 80
75
  },
76
  {
77
  "epoch": 1.3846153846153846,
78
- "grad_norm": 0.3217748999595642,
79
- "learning_rate": 1.0871794871794871e-05,
80
- "loss": 0.0281,
81
  "step": 90
82
  },
83
  {
84
  "epoch": 1.5384615384615383,
85
- "grad_norm": 0.2811843454837799,
86
- "learning_rate": 9.846153846153848e-06,
87
- "loss": 0.0965,
88
  "step": 100
89
  },
90
  {
91
  "epoch": 1.6923076923076923,
92
- "grad_norm": 0.2007972002029419,
93
- "learning_rate": 8.820512820512821e-06,
94
- "loss": 0.019,
95
  "step": 110
96
  },
97
  {
98
  "epoch": 1.8461538461538463,
99
- "grad_norm": 0.18534308671951294,
100
- "learning_rate": 7.794871794871796e-06,
101
- "loss": 0.0156,
102
  "step": 120
103
  },
104
  {
105
  "epoch": 2.0,
106
- "grad_norm": 0.20706918835639954,
107
- "learning_rate": 6.76923076923077e-06,
108
- "loss": 0.0147,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 2.0,
113
- "eval_loss": 0.008523747324943542,
114
- "eval_runtime": 29.1835,
115
- "eval_samples_per_second": 4.42,
116
- "eval_steps_per_second": 0.583,
117
  "step": 130
118
  },
119
  {
120
  "epoch": 2.1538461538461537,
121
- "grad_norm": 0.18737384676933289,
122
- "learning_rate": 5.743589743589743e-06,
123
- "loss": 0.0141,
124
  "step": 140
125
  },
126
  {
127
  "epoch": 2.3076923076923075,
128
- "grad_norm": 0.1485062688589096,
129
- "learning_rate": 4.717948717948718e-06,
130
- "loss": 0.0134,
131
  "step": 150
132
  },
133
  {
134
  "epoch": 2.4615384615384617,
135
- "grad_norm": 0.15001751482486725,
136
- "learning_rate": 3.692307692307693e-06,
137
- "loss": 0.0122,
138
  "step": 160
139
  },
140
  {
141
  "epoch": 2.6153846153846154,
142
- "grad_norm": 0.174443319439888,
143
- "learning_rate": 2.666666666666667e-06,
144
- "loss": 0.012,
145
  "step": 170
146
  },
147
  {
148
  "epoch": 2.769230769230769,
149
- "grad_norm": 0.1295892894268036,
150
- "learning_rate": 1.6410256410256412e-06,
151
- "loss": 0.012,
152
  "step": 180
153
  },
154
  {
155
  "epoch": 2.9230769230769234,
156
- "grad_norm": 0.12927618622779846,
157
- "learning_rate": 6.153846153846155e-07,
158
- "loss": 0.0118,
159
  "step": 190
160
  },
161
  {
162
  "epoch": 3.0,
163
- "eval_loss": 0.00674749631434679,
164
- "eval_runtime": 27.72,
165
- "eval_samples_per_second": 4.654,
166
- "eval_steps_per_second": 0.613,
167
  "step": 195
168
  }
169
  ],
170
  "logging_steps": 10,
171
- "max_steps": 195,
172
  "num_input_tokens_seen": 0,
173
- "num_train_epochs": 3,
174
  "save_steps": 500,
175
  "stateful_callbacks": {
176
  "TrainerControl": {
@@ -179,7 +179,7 @@
179
  "should_evaluate": false,
180
  "should_log": false,
181
  "should_save": true,
182
- "should_training_stop": true
183
  },
184
  "attributes": {}
185
  }
 
1
  {
2
  "best_global_step": 195,
3
+ "best_metric": 0.001924663782119751,
4
  "best_model_checkpoint": "./multitask_model/checkpoint-195",
5
  "epoch": 3.0,
6
  "eval_steps": 500,
 
11
  "log_history": [
12
  {
13
  "epoch": 0.15384615384615385,
14
+ "grad_norm": 8.8089017868042,
15
+ "learning_rate": 4.861538461538462e-05,
16
+ "loss": 2.1939,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.3076923076923077,
21
+ "grad_norm": 3.6326286792755127,
22
+ "learning_rate": 4.707692307692308e-05,
23
+ "loss": 1.1303,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.46153846153846156,
28
+ "grad_norm": 2.114581823348999,
29
+ "learning_rate": 4.553846153846154e-05,
30
+ "loss": 0.3031,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.6153846153846154,
35
+ "grad_norm": 0.4926183223724365,
36
+ "learning_rate": 4.4000000000000006e-05,
37
+ "loss": 0.0667,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.7692307692307693,
42
+ "grad_norm": 0.26311376690864563,
43
+ "learning_rate": 4.2461538461538465e-05,
44
+ "loss": 0.0229,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.9230769230769231,
49
+ "grad_norm": 0.16266165673732758,
50
+ "learning_rate": 4.0923076923076925e-05,
51
+ "loss": 0.0119,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 1.0,
56
+ "eval_loss": 0.005465179681777954,
57
+ "eval_runtime": 28.3814,
58
+ "eval_samples_per_second": 4.545,
59
+ "eval_steps_per_second": 0.599,
60
  "step": 65
61
  },
62
  {
63
  "epoch": 1.0769230769230769,
64
+ "grad_norm": 0.09443770349025726,
65
+ "learning_rate": 3.9384615384615384e-05,
66
+ "loss": 0.0088,
67
  "step": 70
68
  },
69
  {
70
  "epoch": 1.2307692307692308,
71
+ "grad_norm": 0.05983053520321846,
72
+ "learning_rate": 3.784615384615385e-05,
73
+ "loss": 0.0066,
74
  "step": 80
75
  },
76
  {
77
  "epoch": 1.3846153846153846,
78
+ "grad_norm": 0.045136693865060806,
79
+ "learning_rate": 3.630769230769231e-05,
80
+ "loss": 0.0053,
81
  "step": 90
82
  },
83
  {
84
  "epoch": 1.5384615384615383,
85
+ "grad_norm": 0.05789942666888237,
86
+ "learning_rate": 3.476923076923077e-05,
87
+ "loss": 0.0994,
88
  "step": 100
89
  },
90
  {
91
  "epoch": 1.6923076923076923,
92
+ "grad_norm": 0.06267621368169785,
93
+ "learning_rate": 3.323076923076923e-05,
94
+ "loss": 0.0058,
95
  "step": 110
96
  },
97
  {
98
  "epoch": 1.8461538461538463,
99
+ "grad_norm": 0.050915952771902084,
100
+ "learning_rate": 3.1692307692307696e-05,
101
+ "loss": 0.0047,
102
  "step": 120
103
  },
104
  {
105
  "epoch": 2.0,
106
+ "grad_norm": 0.0482671745121479,
107
+ "learning_rate": 3.0153846153846155e-05,
108
+ "loss": 0.0039,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 2.0,
113
+ "eval_loss": 0.0026640458963811398,
114
+ "eval_runtime": 28.7779,
115
+ "eval_samples_per_second": 4.483,
116
+ "eval_steps_per_second": 0.591,
117
  "step": 130
118
  },
119
  {
120
  "epoch": 2.1538461538461537,
121
+ "grad_norm": 0.041481729596853256,
122
+ "learning_rate": 2.8615384615384615e-05,
123
+ "loss": 0.0038,
124
  "step": 140
125
  },
126
  {
127
  "epoch": 2.3076923076923075,
128
+ "grad_norm": 0.04328610375523567,
129
+ "learning_rate": 2.7076923076923078e-05,
130
+ "loss": 0.0037,
131
  "step": 150
132
  },
133
  {
134
  "epoch": 2.4615384615384617,
135
+ "grad_norm": 0.03569851815700531,
136
+ "learning_rate": 2.5538461538461538e-05,
137
+ "loss": 0.0033,
138
  "step": 160
139
  },
140
  {
141
  "epoch": 2.6153846153846154,
142
+ "grad_norm": 0.027992915362119675,
143
+ "learning_rate": 2.4e-05,
144
+ "loss": 0.0033,
145
  "step": 170
146
  },
147
  {
148
  "epoch": 2.769230769230769,
149
+ "grad_norm": 0.026577744632959366,
150
+ "learning_rate": 2.246153846153846e-05,
151
+ "loss": 0.003,
152
  "step": 180
153
  },
154
  {
155
  "epoch": 2.9230769230769234,
156
+ "grad_norm": 0.031246010214090347,
157
+ "learning_rate": 2.0923076923076923e-05,
158
+ "loss": 0.0029,
159
  "step": 190
160
  },
161
  {
162
  "epoch": 3.0,
163
+ "eval_loss": 0.001924663782119751,
164
+ "eval_runtime": 29.0281,
165
+ "eval_samples_per_second": 4.444,
166
+ "eval_steps_per_second": 0.586,
167
  "step": 195
168
  }
169
  ],
170
  "logging_steps": 10,
171
+ "max_steps": 325,
172
  "num_input_tokens_seen": 0,
173
+ "num_train_epochs": 5,
174
  "save_steps": 500,
175
  "stateful_callbacks": {
176
  "TrainerControl": {
 
179
  "should_evaluate": false,
180
  "should_log": false,
181
  "should_save": true,
182
+ "should_training_stop": false
183
  },
184
  "attributes": {}
185
  }
checkpoint-195/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:073443c160f218e70f39940b6473a05da96d48c0d26f92a97ad9644c0b98e4b2
3
  size 5240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46098c8581f529d1e81c3e5c89a28fe04605657218470fc5b76dd1763697bb6e
3
  size 5240
checkpoint-260/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca4628d2fbe94cefe7938ba160894f0dbfb4c5dd54665cc6d896d933bac44f01
3
+ size 265491420
checkpoint-260/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:219a3bcade79f93edcabc30f95ff8506e8cb0137eff43bd4da704362b151d763
3
+ size 531042682
checkpoint-260/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a05a945c5aa1a654d7904fdade8497ed47845adcf3dfc34bd357e4a5217dd388
3
+ size 13990
checkpoint-260/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f398f2121146e0aac7fcb103a78ed42fee3d3308e81e6c93c884800519a86957
3
+ size 1064
checkpoint-260/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
checkpoint-260/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-260/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "DistilBertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
checkpoint-260/trainer_state.json ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 260,
3
+ "best_metric": 0.0018482182640582323,
4
+ "best_model_checkpoint": "./multitask_model/checkpoint-260",
5
+ "epoch": 4.0,
6
+ "eval_steps": 500,
7
+ "global_step": 260,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.15384615384615385,
14
+ "grad_norm": 8.8089017868042,
15
+ "learning_rate": 4.861538461538462e-05,
16
+ "loss": 2.1939,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.3076923076923077,
21
+ "grad_norm": 3.6326286792755127,
22
+ "learning_rate": 4.707692307692308e-05,
23
+ "loss": 1.1303,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.46153846153846156,
28
+ "grad_norm": 2.114581823348999,
29
+ "learning_rate": 4.553846153846154e-05,
30
+ "loss": 0.3031,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.6153846153846154,
35
+ "grad_norm": 0.4926183223724365,
36
+ "learning_rate": 4.4000000000000006e-05,
37
+ "loss": 0.0667,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.7692307692307693,
42
+ "grad_norm": 0.26311376690864563,
43
+ "learning_rate": 4.2461538461538465e-05,
44
+ "loss": 0.0229,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.9230769230769231,
49
+ "grad_norm": 0.16266165673732758,
50
+ "learning_rate": 4.0923076923076925e-05,
51
+ "loss": 0.0119,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 1.0,
56
+ "eval_loss": 0.005465179681777954,
57
+ "eval_runtime": 28.3814,
58
+ "eval_samples_per_second": 4.545,
59
+ "eval_steps_per_second": 0.599,
60
+ "step": 65
61
+ },
62
+ {
63
+ "epoch": 1.0769230769230769,
64
+ "grad_norm": 0.09443770349025726,
65
+ "learning_rate": 3.9384615384615384e-05,
66
+ "loss": 0.0088,
67
+ "step": 70
68
+ },
69
+ {
70
+ "epoch": 1.2307692307692308,
71
+ "grad_norm": 0.05983053520321846,
72
+ "learning_rate": 3.784615384615385e-05,
73
+ "loss": 0.0066,
74
+ "step": 80
75
+ },
76
+ {
77
+ "epoch": 1.3846153846153846,
78
+ "grad_norm": 0.045136693865060806,
79
+ "learning_rate": 3.630769230769231e-05,
80
+ "loss": 0.0053,
81
+ "step": 90
82
+ },
83
+ {
84
+ "epoch": 1.5384615384615383,
85
+ "grad_norm": 0.05789942666888237,
86
+ "learning_rate": 3.476923076923077e-05,
87
+ "loss": 0.0994,
88
+ "step": 100
89
+ },
90
+ {
91
+ "epoch": 1.6923076923076923,
92
+ "grad_norm": 0.06267621368169785,
93
+ "learning_rate": 3.323076923076923e-05,
94
+ "loss": 0.0058,
95
+ "step": 110
96
+ },
97
+ {
98
+ "epoch": 1.8461538461538463,
99
+ "grad_norm": 0.050915952771902084,
100
+ "learning_rate": 3.1692307692307696e-05,
101
+ "loss": 0.0047,
102
+ "step": 120
103
+ },
104
+ {
105
+ "epoch": 2.0,
106
+ "grad_norm": 0.0482671745121479,
107
+ "learning_rate": 3.0153846153846155e-05,
108
+ "loss": 0.0039,
109
+ "step": 130
110
+ },
111
+ {
112
+ "epoch": 2.0,
113
+ "eval_loss": 0.0026640458963811398,
114
+ "eval_runtime": 28.7779,
115
+ "eval_samples_per_second": 4.483,
116
+ "eval_steps_per_second": 0.591,
117
+ "step": 130
118
+ },
119
+ {
120
+ "epoch": 2.1538461538461537,
121
+ "grad_norm": 0.041481729596853256,
122
+ "learning_rate": 2.8615384615384615e-05,
123
+ "loss": 0.0038,
124
+ "step": 140
125
+ },
126
+ {
127
+ "epoch": 2.3076923076923075,
128
+ "grad_norm": 0.04328610375523567,
129
+ "learning_rate": 2.7076923076923078e-05,
130
+ "loss": 0.0037,
131
+ "step": 150
132
+ },
133
+ {
134
+ "epoch": 2.4615384615384617,
135
+ "grad_norm": 0.03569851815700531,
136
+ "learning_rate": 2.5538461538461538e-05,
137
+ "loss": 0.0033,
138
+ "step": 160
139
+ },
140
+ {
141
+ "epoch": 2.6153846153846154,
142
+ "grad_norm": 0.027992915362119675,
143
+ "learning_rate": 2.4e-05,
144
+ "loss": 0.0033,
145
+ "step": 170
146
+ },
147
+ {
148
+ "epoch": 2.769230769230769,
149
+ "grad_norm": 0.026577744632959366,
150
+ "learning_rate": 2.246153846153846e-05,
151
+ "loss": 0.003,
152
+ "step": 180
153
+ },
154
+ {
155
+ "epoch": 2.9230769230769234,
156
+ "grad_norm": 0.031246010214090347,
157
+ "learning_rate": 2.0923076923076923e-05,
158
+ "loss": 0.0029,
159
+ "step": 190
160
+ },
161
+ {
162
+ "epoch": 3.0,
163
+ "eval_loss": 0.001924663782119751,
164
+ "eval_runtime": 29.0281,
165
+ "eval_samples_per_second": 4.444,
166
+ "eval_steps_per_second": 0.586,
167
+ "step": 195
168
+ },
169
+ {
170
+ "epoch": 3.076923076923077,
171
+ "grad_norm": 0.03536583110690117,
172
+ "learning_rate": 1.9384615384615383e-05,
173
+ "loss": 0.3434,
174
+ "step": 200
175
+ },
176
+ {
177
+ "epoch": 3.230769230769231,
178
+ "grad_norm": 0.03196291625499725,
179
+ "learning_rate": 1.7846153846153846e-05,
180
+ "loss": 0.0032,
181
+ "step": 210
182
+ },
183
+ {
184
+ "epoch": 3.3846153846153846,
185
+ "grad_norm": 0.05442598834633827,
186
+ "learning_rate": 1.630769230769231e-05,
187
+ "loss": 0.0034,
188
+ "step": 220
189
+ },
190
+ {
191
+ "epoch": 3.5384615384615383,
192
+ "grad_norm": 0.02654326893389225,
193
+ "learning_rate": 1.4769230769230772e-05,
194
+ "loss": 0.0766,
195
+ "step": 230
196
+ },
197
+ {
198
+ "epoch": 3.6923076923076925,
199
+ "grad_norm": 0.034997936338186264,
200
+ "learning_rate": 1.3230769230769233e-05,
201
+ "loss": 0.0038,
202
+ "step": 240
203
+ },
204
+ {
205
+ "epoch": 3.8461538461538463,
206
+ "grad_norm": 0.027517210692167282,
207
+ "learning_rate": 1.1692307692307693e-05,
208
+ "loss": 0.0041,
209
+ "step": 250
210
+ },
211
+ {
212
+ "epoch": 4.0,
213
+ "grad_norm": 0.04248378798365593,
214
+ "learning_rate": 1.0153846153846154e-05,
215
+ "loss": 0.0027,
216
+ "step": 260
217
+ },
218
+ {
219
+ "epoch": 4.0,
220
+ "eval_loss": 0.0018482182640582323,
221
+ "eval_runtime": 29.088,
222
+ "eval_samples_per_second": 4.435,
223
+ "eval_steps_per_second": 0.584,
224
+ "step": 260
225
+ }
226
+ ],
227
+ "logging_steps": 10,
228
+ "max_steps": 325,
229
+ "num_input_tokens_seen": 0,
230
+ "num_train_epochs": 5,
231
+ "save_steps": 500,
232
+ "stateful_callbacks": {
233
+ "TrainerControl": {
234
+ "args": {
235
+ "should_epoch_stop": false,
236
+ "should_evaluate": false,
237
+ "should_log": false,
238
+ "should_save": true,
239
+ "should_training_stop": false
240
+ },
241
+ "attributes": {}
242
+ }
243
+ },
244
+ "total_flos": 0.0,
245
+ "train_batch_size": 8,
246
+ "trial_name": null,
247
+ "trial_params": null
248
+ }
checkpoint-260/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46098c8581f529d1e81c3e5c89a28fe04605657218470fc5b76dd1763697bb6e
3
+ size 5240
checkpoint-260/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-325/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7799fbc15e49a5c41fcc8de8c0abbd168e5d0bbb3f98e94e4342e9b6aafd3c33
3
+ size 265491420
checkpoint-325/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9168ce922a3e34d30679ca47c18c4fb2455be976cad55b55b03af815d00cd14f
3
+ size 531042682
checkpoint-325/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7ff827dc3b4e3145d896ac7a21efe1c9d9483596d5261a886232b9d11097911
3
+ size 13990
checkpoint-325/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f65fd15d19c13da73183b7d20ebcd6a2a19e7ae35c586d6625d4ab79074ffa6c
3
+ size 1064
checkpoint-325/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
checkpoint-325/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-325/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "DistilBertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
checkpoint-325/trainer_state.json ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 325,
3
+ "best_metric": 0.00168671237770468,
4
+ "best_model_checkpoint": "./multitask_model/checkpoint-325",
5
+ "epoch": 5.0,
6
+ "eval_steps": 500,
7
+ "global_step": 325,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.15384615384615385,
14
+ "grad_norm": 8.8089017868042,
15
+ "learning_rate": 4.861538461538462e-05,
16
+ "loss": 2.1939,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.3076923076923077,
21
+ "grad_norm": 3.6326286792755127,
22
+ "learning_rate": 4.707692307692308e-05,
23
+ "loss": 1.1303,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.46153846153846156,
28
+ "grad_norm": 2.114581823348999,
29
+ "learning_rate": 4.553846153846154e-05,
30
+ "loss": 0.3031,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.6153846153846154,
35
+ "grad_norm": 0.4926183223724365,
36
+ "learning_rate": 4.4000000000000006e-05,
37
+ "loss": 0.0667,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.7692307692307693,
42
+ "grad_norm": 0.26311376690864563,
43
+ "learning_rate": 4.2461538461538465e-05,
44
+ "loss": 0.0229,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.9230769230769231,
49
+ "grad_norm": 0.16266165673732758,
50
+ "learning_rate": 4.0923076923076925e-05,
51
+ "loss": 0.0119,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 1.0,
56
+ "eval_loss": 0.005465179681777954,
57
+ "eval_runtime": 28.3814,
58
+ "eval_samples_per_second": 4.545,
59
+ "eval_steps_per_second": 0.599,
60
+ "step": 65
61
+ },
62
+ {
63
+ "epoch": 1.0769230769230769,
64
+ "grad_norm": 0.09443770349025726,
65
+ "learning_rate": 3.9384615384615384e-05,
66
+ "loss": 0.0088,
67
+ "step": 70
68
+ },
69
+ {
70
+ "epoch": 1.2307692307692308,
71
+ "grad_norm": 0.05983053520321846,
72
+ "learning_rate": 3.784615384615385e-05,
73
+ "loss": 0.0066,
74
+ "step": 80
75
+ },
76
+ {
77
+ "epoch": 1.3846153846153846,
78
+ "grad_norm": 0.045136693865060806,
79
+ "learning_rate": 3.630769230769231e-05,
80
+ "loss": 0.0053,
81
+ "step": 90
82
+ },
83
+ {
84
+ "epoch": 1.5384615384615383,
85
+ "grad_norm": 0.05789942666888237,
86
+ "learning_rate": 3.476923076923077e-05,
87
+ "loss": 0.0994,
88
+ "step": 100
89
+ },
90
+ {
91
+ "epoch": 1.6923076923076923,
92
+ "grad_norm": 0.06267621368169785,
93
+ "learning_rate": 3.323076923076923e-05,
94
+ "loss": 0.0058,
95
+ "step": 110
96
+ },
97
+ {
98
+ "epoch": 1.8461538461538463,
99
+ "grad_norm": 0.050915952771902084,
100
+ "learning_rate": 3.1692307692307696e-05,
101
+ "loss": 0.0047,
102
+ "step": 120
103
+ },
104
+ {
105
+ "epoch": 2.0,
106
+ "grad_norm": 0.0482671745121479,
107
+ "learning_rate": 3.0153846153846155e-05,
108
+ "loss": 0.0039,
109
+ "step": 130
110
+ },
111
+ {
112
+ "epoch": 2.0,
113
+ "eval_loss": 0.0026640458963811398,
114
+ "eval_runtime": 28.7779,
115
+ "eval_samples_per_second": 4.483,
116
+ "eval_steps_per_second": 0.591,
117
+ "step": 130
118
+ },
119
+ {
120
+ "epoch": 2.1538461538461537,
121
+ "grad_norm": 0.041481729596853256,
122
+ "learning_rate": 2.8615384615384615e-05,
123
+ "loss": 0.0038,
124
+ "step": 140
125
+ },
126
+ {
127
+ "epoch": 2.3076923076923075,
128
+ "grad_norm": 0.04328610375523567,
129
+ "learning_rate": 2.7076923076923078e-05,
130
+ "loss": 0.0037,
131
+ "step": 150
132
+ },
133
+ {
134
+ "epoch": 2.4615384615384617,
135
+ "grad_norm": 0.03569851815700531,
136
+ "learning_rate": 2.5538461538461538e-05,
137
+ "loss": 0.0033,
138
+ "step": 160
139
+ },
140
+ {
141
+ "epoch": 2.6153846153846154,
142
+ "grad_norm": 0.027992915362119675,
143
+ "learning_rate": 2.4e-05,
144
+ "loss": 0.0033,
145
+ "step": 170
146
+ },
147
+ {
148
+ "epoch": 2.769230769230769,
149
+ "grad_norm": 0.026577744632959366,
150
+ "learning_rate": 2.246153846153846e-05,
151
+ "loss": 0.003,
152
+ "step": 180
153
+ },
154
+ {
155
+ "epoch": 2.9230769230769234,
156
+ "grad_norm": 0.031246010214090347,
157
+ "learning_rate": 2.0923076923076923e-05,
158
+ "loss": 0.0029,
159
+ "step": 190
160
+ },
161
+ {
162
+ "epoch": 3.0,
163
+ "eval_loss": 0.001924663782119751,
164
+ "eval_runtime": 29.0281,
165
+ "eval_samples_per_second": 4.444,
166
+ "eval_steps_per_second": 0.586,
167
+ "step": 195
168
+ },
169
+ {
170
+ "epoch": 3.076923076923077,
171
+ "grad_norm": 0.03536583110690117,
172
+ "learning_rate": 1.9384615384615383e-05,
173
+ "loss": 0.3434,
174
+ "step": 200
175
+ },
176
+ {
177
+ "epoch": 3.230769230769231,
178
+ "grad_norm": 0.03196291625499725,
179
+ "learning_rate": 1.7846153846153846e-05,
180
+ "loss": 0.0032,
181
+ "step": 210
182
+ },
183
+ {
184
+ "epoch": 3.3846153846153846,
185
+ "grad_norm": 0.05442598834633827,
186
+ "learning_rate": 1.630769230769231e-05,
187
+ "loss": 0.0034,
188
+ "step": 220
189
+ },
190
+ {
191
+ "epoch": 3.5384615384615383,
192
+ "grad_norm": 0.02654326893389225,
193
+ "learning_rate": 1.4769230769230772e-05,
194
+ "loss": 0.0766,
195
+ "step": 230
196
+ },
197
+ {
198
+ "epoch": 3.6923076923076925,
199
+ "grad_norm": 0.034997936338186264,
200
+ "learning_rate": 1.3230769230769233e-05,
201
+ "loss": 0.0038,
202
+ "step": 240
203
+ },
204
+ {
205
+ "epoch": 3.8461538461538463,
206
+ "grad_norm": 0.027517210692167282,
207
+ "learning_rate": 1.1692307692307693e-05,
208
+ "loss": 0.0041,
209
+ "step": 250
210
+ },
211
+ {
212
+ "epoch": 4.0,
213
+ "grad_norm": 0.04248378798365593,
214
+ "learning_rate": 1.0153846153846154e-05,
215
+ "loss": 0.0027,
216
+ "step": 260
217
+ },
218
+ {
219
+ "epoch": 4.0,
220
+ "eval_loss": 0.0018482182640582323,
221
+ "eval_runtime": 29.088,
222
+ "eval_samples_per_second": 4.435,
223
+ "eval_steps_per_second": 0.584,
224
+ "step": 260
225
+ },
226
+ {
227
+ "epoch": 4.153846153846154,
228
+ "grad_norm": 0.022280381992459297,
229
+ "learning_rate": 8.615384615384615e-06,
230
+ "loss": 0.0026,
231
+ "step": 270
232
+ },
233
+ {
234
+ "epoch": 4.3076923076923075,
235
+ "grad_norm": 0.019785910844802856,
236
+ "learning_rate": 7.076923076923076e-06,
237
+ "loss": 0.0025,
238
+ "step": 280
239
+ },
240
+ {
241
+ "epoch": 4.461538461538462,
242
+ "grad_norm": 0.024894440546631813,
243
+ "learning_rate": 5.5384615384615385e-06,
244
+ "loss": 0.0025,
245
+ "step": 290
246
+ },
247
+ {
248
+ "epoch": 4.615384615384615,
249
+ "grad_norm": 0.02627086639404297,
250
+ "learning_rate": 4.000000000000001e-06,
251
+ "loss": 0.0027,
252
+ "step": 300
253
+ },
254
+ {
255
+ "epoch": 4.769230769230769,
256
+ "grad_norm": 0.02380353771150112,
257
+ "learning_rate": 2.4615384615384615e-06,
258
+ "loss": 0.0026,
259
+ "step": 310
260
+ },
261
+ {
262
+ "epoch": 4.923076923076923,
263
+ "grad_norm": 0.02841930277645588,
264
+ "learning_rate": 9.230769230769232e-07,
265
+ "loss": 0.0026,
266
+ "step": 320
267
+ },
268
+ {
269
+ "epoch": 5.0,
270
+ "eval_loss": 0.00168671237770468,
271
+ "eval_runtime": 28.7127,
272
+ "eval_samples_per_second": 4.493,
273
+ "eval_steps_per_second": 0.592,
274
+ "step": 325
275
+ }
276
+ ],
277
+ "logging_steps": 10,
278
+ "max_steps": 325,
279
+ "num_input_tokens_seen": 0,
280
+ "num_train_epochs": 5,
281
+ "save_steps": 500,
282
+ "stateful_callbacks": {
283
+ "TrainerControl": {
284
+ "args": {
285
+ "should_epoch_stop": false,
286
+ "should_evaluate": false,
287
+ "should_log": false,
288
+ "should_save": true,
289
+ "should_training_stop": true
290
+ },
291
+ "attributes": {}
292
+ }
293
+ },
294
+ "total_flos": 0.0,
295
+ "train_batch_size": 8,
296
+ "trial_name": null,
297
+ "trial_params": null
298
+ }
checkpoint-325/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46098c8581f529d1e81c3e5c89a28fe04605657218470fc5b76dd1763697bb6e
3
+ size 5240
checkpoint-325/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-65/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cb825538b96470c8e6fef5f37164969c0f6358f82a785aaa64bcb5c172ce0f3
3
+ size 265491420
checkpoint-65/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab25565f9bed3dcf21d6900aa0235d5967053090a699eb97f6406970826f0902
3
+ size 531042682
checkpoint-65/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ed81109884212e008c22a3f3219503fe6c9802fc67f6757cce69b9a1832ae5f
3
+ size 13990
checkpoint-65/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:804ba4f1dab328314e2adde75ea376ac7ba6063fa1c99fb8442cadf8619e5cc9
3
+ size 1064
checkpoint-65/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
checkpoint-65/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-65/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "DistilBertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
checkpoint-65/trainer_state.json ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 65,
3
+ "best_metric": 0.005465179681777954,
4
+ "best_model_checkpoint": "./multitask_model/checkpoint-65",
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 65,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.15384615384615385,
14
+ "grad_norm": 8.8089017868042,
15
+ "learning_rate": 4.861538461538462e-05,
16
+ "loss": 2.1939,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.3076923076923077,
21
+ "grad_norm": 3.6326286792755127,
22
+ "learning_rate": 4.707692307692308e-05,
23
+ "loss": 1.1303,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.46153846153846156,
28
+ "grad_norm": 2.114581823348999,
29
+ "learning_rate": 4.553846153846154e-05,
30
+ "loss": 0.3031,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.6153846153846154,
35
+ "grad_norm": 0.4926183223724365,
36
+ "learning_rate": 4.4000000000000006e-05,
37
+ "loss": 0.0667,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.7692307692307693,
42
+ "grad_norm": 0.26311376690864563,
43
+ "learning_rate": 4.2461538461538465e-05,
44
+ "loss": 0.0229,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.9230769230769231,
49
+ "grad_norm": 0.16266165673732758,
50
+ "learning_rate": 4.0923076923076925e-05,
51
+ "loss": 0.0119,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 1.0,
56
+ "eval_loss": 0.005465179681777954,
57
+ "eval_runtime": 28.3814,
58
+ "eval_samples_per_second": 4.545,
59
+ "eval_steps_per_second": 0.599,
60
+ "step": 65
61
+ }
62
+ ],
63
+ "logging_steps": 10,
64
+ "max_steps": 325,
65
+ "num_input_tokens_seen": 0,
66
+ "num_train_epochs": 5,
67
+ "save_steps": 500,
68
+ "stateful_callbacks": {
69
+ "TrainerControl": {
70
+ "args": {
71
+ "should_epoch_stop": false,
72
+ "should_evaluate": false,
73
+ "should_log": false,
74
+ "should_save": true,
75
+ "should_training_stop": false
76
+ },
77
+ "attributes": {}
78
+ }
79
+ },
80
+ "total_flos": 0.0,
81
+ "train_batch_size": 8,
82
+ "trial_name": null,
83
+ "trial_params": null
84
+ }
checkpoint-65/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46098c8581f529d1e81c3e5c89a28fe04605657218470fc5b76dd1763697bb6e
3
+ size 5240
checkpoint-65/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b1c3678511418bfcf53a1ffcd424b0b7ee78fbbf110612cbc0db585216624acd
3
  size 265491420
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7799fbc15e49a5c41fcc8de8c0abbd168e5d0bbb3f98e94e4342e9b6aafd3c33
3
  size 265491420
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:42a73d0354f4a551827f18e2d69462ece69163777d0e4b293be9e21cef74917c
3
  size 265519274
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1dd544cc99f7916aef5c08cda4af7229a72e34d7cb06177d735bdb180d3c58e4
3
  size 265519274
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:073443c160f218e70f39940b6473a05da96d48c0d26f92a97ad9644c0b98e4b2
3
  size 5240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46098c8581f529d1e81c3e5c89a28fe04605657218470fc5b76dd1763697bb6e
3
  size 5240