Docty committed on
Commit
d28cee0
·
verified ·
1 Parent(s): 545909d

End of training

Browse files
.gitattributes CHANGED
@@ -37,3 +37,7 @@ image_0.png filter=lfs diff=lfs merge=lfs -text
37
  image_1.png filter=lfs diff=lfs merge=lfs -text
38
  image_2.png filter=lfs diff=lfs merge=lfs -text
39
  image_3.png filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
37
  image_1.png filter=lfs diff=lfs merge=lfs -text
38
  image_2.png filter=lfs diff=lfs merge=lfs -text
39
  image_3.png filter=lfs diff=lfs merge=lfs -text
40
+ image_4.png filter=lfs diff=lfs merge=lfs -text
41
+ image_5.png filter=lfs diff=lfs merge=lfs -text
42
+ image_6.png filter=lfs diff=lfs merge=lfs -text
43
+ image_7.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,62 +1,46 @@
1
  ---
2
- library_name: transformers
3
- license: apache-2.0
4
  base_model: google/vit-base-patch16-224-in21k
 
 
 
5
  tags:
6
- - generated_from_trainer
7
- metrics:
8
- - accuracy
9
- model-index:
10
- - name: mangoes
11
- results: []
12
  ---
13
 
14
- <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
  should probably proofread and complete it, then remove this comment. -->
16
 
17
- # mangoes
18
 
19
- This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on an unknown dataset.
20
- It achieves the following results on the evaluation set:
21
- - Loss: 0.7385
22
- - Accuracy: 0.9792
23
 
24
- ## Model description
25
 
26
- More information needed
27
 
28
- ## Intended uses & limitations
 
 
 
 
 
 
 
29
 
30
- More information needed
31
 
32
- ## Training and evaluation data
33
 
34
- More information needed
35
 
36
- ## Training procedure
37
-
38
- ### Training hyperparameters
39
 
40
- The following hyperparameters were used during training:
41
- - learning_rate: 2e-05
42
- - train_batch_size: 8
43
- - eval_batch_size: 8
44
- - seed: 1337
45
- - optimizer: Use OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
46
- - lr_scheduler_type: linear
47
- - num_epochs: 2.0
48
 
49
- ### Training results
 
 
50
 
51
- | Training Loss | Epoch | Step | Validation Loss | Accuracy |
52
- |:-------------:|:-----:|:----:|:---------------:|:--------:|
53
- | 1.0281 | 1.0 | 170 | 1.0490 | 0.9583 |
54
- | 0.7454 | 2.0 | 340 | 0.7385 | 0.9792 |
55
 
 
56
 
57
- ### Framework versions
58
 
59
- - Transformers 4.56.1
60
- - Pytorch 2.8.0+cu126
61
- - Datasets 4.0.0
62
- - Tokenizers 0.22.0
 
1
  ---
 
 
2
  base_model: google/vit-base-patch16-224-in21k
3
+ library_name: transformers
4
+ license: creativeml-openrail-m
5
+ inference: true
6
  tags:
7
+ - image-classification
 
 
 
 
 
8
  ---
9
 
10
+ <!-- This model card has been generated automatically according to the information the training script had access to. You
11
  should probably proofread and complete it, then remove this comment. -->
12
 
 
13
 
14
+ # Image Classification
 
 
 
15
 
16
+ This model is a fine-tuned version of google/vit-base-patch16-224-in21k on the Docty/Mangovariety dataset.
17
 
18
+ Some example images from the dataset are shown below.
19
 
20
+ ![img_0](./image_0.png)
21
+ ![img_1](./image_1.png)
22
+ ![img_2](./image_2.png)
23
+ ![img_3](./image_3.png)
24
+ ![img_4](./image_4.png)
25
+ ![img_5](./image_5.png)
26
+ ![img_6](./image_6.png)
27
+ ![img_7](./image_7.png)
28
 
 
29
 
 
30
 
 
31
 
32
+ ## Intended uses & limitations
 
 
33
 
34
+ #### How to use
 
 
 
 
 
 
 
35
 
36
+ ```python
37
+ # TODO: add an example code snippet for running this image-classification model
38
+ ```
39
 
40
+ #### Limitations and bias
 
 
 
41
 
42
+ [TODO: provide examples of latent issues and potential remediations]
43
 
44
+ ## Training details
45
 
46
+ [TODO: describe the data used to train the model]
 
 
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 1.0,
3
- "eval_accuracy": 0.925,
4
- "eval_loss": 1.3396402597427368,
5
- "eval_runtime": 4.0715,
6
- "eval_samples_per_second": 58.946,
7
- "eval_steps_per_second": 7.368,
8
- "total_flos": 1.0539477329117184e+17,
9
- "train_loss": 1.6162697343265309,
10
- "train_runtime": 67.3955,
11
- "train_samples_per_second": 20.179,
12
- "train_steps_per_second": 2.522
13
  }
 
1
  {
2
+ "epoch": 2.0,
3
+ "eval_accuracy": 0.9791666666666666,
4
+ "eval_loss": 0.7385169267654419,
5
+ "eval_runtime": 4.5071,
6
+ "eval_samples_per_second": 53.249,
7
+ "eval_steps_per_second": 6.656,
8
+ "total_flos": 2.1078954658234368e+17,
9
+ "train_loss": 1.1879772003959208,
10
+ "train_runtime": 144.3943,
11
+ "train_samples_per_second": 18.837,
12
+ "train_steps_per_second": 2.355
13
  }
checkpoint-170/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6c400d5ae11b7befb7cfc65d363d46ed164c34af774aab4f4604d03e3c5c0f1d
3
  size 343242432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ab69531c641f042913217e8f611cc8af2ad491b9472fd164700a7b7c9a63bcb
3
  size 343242432
checkpoint-170/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cadfbd449ff36f3ea169dce8a96f1e6ad6c7b610cbf342866dc0aa99763da6db
3
  size 686608971
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c149235ca004d00f0368794e443c054747de8b2e63e0c61434315a24a95fbf5
3
  size 686608971
checkpoint-170/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d0b8f847c4abee6cbf37f74529475bcc76504fb647877758e521590108eadde
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88fbf6c00ad9453b86326498cdabcc19864275d1be005c45025374c29581a749
3
  size 1465
checkpoint-170/trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "best_global_step": 170,
3
- "best_metric": 1.3396402597427368,
4
  "best_model_checkpoint": "./mangoes/checkpoint-170",
5
  "epoch": 1.0,
6
  "eval_steps": 500,
@@ -11,137 +11,137 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.058823529411764705,
14
- "grad_norm": 1.9543970823287964,
15
- "learning_rate": 1.8941176470588238e-05,
16
- "loss": 2.0442,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.11764705882352941,
21
- "grad_norm": 3.168637275695801,
22
- "learning_rate": 1.776470588235294e-05,
23
- "loss": 2.0181,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.17647058823529413,
28
- "grad_norm": 3.1181507110595703,
29
- "learning_rate": 1.658823529411765e-05,
30
- "loss": 1.9073,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.23529411764705882,
35
- "grad_norm": 2.7099850177764893,
36
- "learning_rate": 1.5411764705882356e-05,
37
- "loss": 1.8321,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.29411764705882354,
42
- "grad_norm": 3.0820131301879883,
43
- "learning_rate": 1.423529411764706e-05,
44
- "loss": 1.7744,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.35294117647058826,
49
- "grad_norm": 3.182910442352295,
50
- "learning_rate": 1.3058823529411766e-05,
51
- "loss": 1.7882,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 0.4117647058823529,
56
- "grad_norm": 2.999581813812256,
57
- "learning_rate": 1.1882352941176472e-05,
58
- "loss": 1.6671,
59
  "step": 70
60
  },
61
  {
62
  "epoch": 0.47058823529411764,
63
- "grad_norm": 3.62984299659729,
64
- "learning_rate": 1.0705882352941178e-05,
65
- "loss": 1.6369,
66
  "step": 80
67
  },
68
  {
69
  "epoch": 0.5294117647058824,
70
- "grad_norm": 3.2714788913726807,
71
- "learning_rate": 9.529411764705882e-06,
72
- "loss": 1.5923,
73
  "step": 90
74
  },
75
  {
76
  "epoch": 0.5882352941176471,
77
- "grad_norm": 3.1325395107269287,
78
- "learning_rate": 8.35294117647059e-06,
79
- "loss": 1.4627,
80
  "step": 100
81
  },
82
  {
83
  "epoch": 0.6470588235294118,
84
- "grad_norm": 3.435844898223877,
85
- "learning_rate": 7.176470588235295e-06,
86
- "loss": 1.5063,
87
  "step": 110
88
  },
89
  {
90
  "epoch": 0.7058823529411765,
91
- "grad_norm": 3.5805537700653076,
92
- "learning_rate": 6e-06,
93
- "loss": 1.4239,
94
  "step": 120
95
  },
96
  {
97
  "epoch": 0.7647058823529411,
98
- "grad_norm": 3.2065505981445312,
99
- "learning_rate": 4.823529411764706e-06,
100
- "loss": 1.4303,
101
  "step": 130
102
  },
103
  {
104
  "epoch": 0.8235294117647058,
105
- "grad_norm": 3.3940041065216064,
106
- "learning_rate": 3.6470588235294117e-06,
107
- "loss": 1.3697,
108
  "step": 140
109
  },
110
  {
111
  "epoch": 0.8823529411764706,
112
- "grad_norm": 2.9482526779174805,
113
- "learning_rate": 2.470588235294118e-06,
114
- "loss": 1.3706,
115
  "step": 150
116
  },
117
  {
118
  "epoch": 0.9411764705882353,
119
- "grad_norm": 3.306838035583496,
120
- "learning_rate": 1.2941176470588237e-06,
121
- "loss": 1.3454,
122
  "step": 160
123
  },
124
  {
125
  "epoch": 1.0,
126
- "grad_norm": 3.1316869258880615,
127
- "learning_rate": 1.1764705882352942e-07,
128
- "loss": 1.3072,
129
  "step": 170
130
  },
131
  {
132
  "epoch": 1.0,
133
- "eval_accuracy": 0.925,
134
- "eval_loss": 1.3396402597427368,
135
- "eval_runtime": 3.5845,
136
- "eval_samples_per_second": 66.955,
137
- "eval_steps_per_second": 8.369,
138
  "step": 170
139
  }
140
  ],
141
  "logging_steps": 10,
142
- "max_steps": 170,
143
  "num_input_tokens_seen": 0,
144
- "num_train_epochs": 1,
145
  "save_steps": 500,
146
  "stateful_callbacks": {
147
  "TrainerControl": {
@@ -150,7 +150,7 @@
150
  "should_evaluate": false,
151
  "should_log": false,
152
  "should_save": true,
153
- "should_training_stop": true
154
  },
155
  "attributes": {}
156
  }
 
1
  {
2
  "best_global_step": 170,
3
+ "best_metric": 1.0490069389343262,
4
  "best_model_checkpoint": "./mangoes/checkpoint-170",
5
  "epoch": 1.0,
6
  "eval_steps": 500,
 
11
  "log_history": [
12
  {
13
  "epoch": 0.058823529411764705,
14
+ "grad_norm": 1.9217854738235474,
15
+ "learning_rate": 1.9470588235294118e-05,
16
+ "loss": 2.044,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.11764705882352941,
21
+ "grad_norm": 3.201918363571167,
22
+ "learning_rate": 1.888235294117647e-05,
23
+ "loss": 2.0161,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.17647058823529413,
28
+ "grad_norm": 3.1314334869384766,
29
+ "learning_rate": 1.8294117647058824e-05,
30
+ "loss": 1.9021,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.23529411764705882,
35
+ "grad_norm": 2.9427032470703125,
36
+ "learning_rate": 1.7705882352941177e-05,
37
+ "loss": 1.8208,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.29411764705882354,
42
+ "grad_norm": 3.096381425857544,
43
+ "learning_rate": 1.711764705882353e-05,
44
+ "loss": 1.7553,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.35294117647058826,
49
+ "grad_norm": 3.1433303356170654,
50
+ "learning_rate": 1.6529411764705883e-05,
51
+ "loss": 1.7674,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 0.4117647058823529,
56
+ "grad_norm": 3.0195441246032715,
57
+ "learning_rate": 1.594117647058824e-05,
58
+ "loss": 1.6242,
59
  "step": 70
60
  },
61
  {
62
  "epoch": 0.47058823529411764,
63
+ "grad_norm": 3.748368740081787,
64
+ "learning_rate": 1.535294117647059e-05,
65
+ "loss": 1.5804,
66
  "step": 80
67
  },
68
  {
69
  "epoch": 0.5294117647058824,
70
+ "grad_norm": 2.8905935287475586,
71
+ "learning_rate": 1.4764705882352944e-05,
72
+ "loss": 1.5126,
73
  "step": 90
74
  },
75
  {
76
  "epoch": 0.5882352941176471,
77
+ "grad_norm": 3.1870696544647217,
78
+ "learning_rate": 1.4176470588235297e-05,
79
+ "loss": 1.3781,
80
  "step": 100
81
  },
82
  {
83
  "epoch": 0.6470588235294118,
84
+ "grad_norm": 3.208005428314209,
85
+ "learning_rate": 1.3588235294117648e-05,
86
+ "loss": 1.4037,
87
  "step": 110
88
  },
89
  {
90
  "epoch": 0.7058823529411765,
91
+ "grad_norm": 4.6087236404418945,
92
+ "learning_rate": 1.3000000000000001e-05,
93
+ "loss": 1.2771,
94
  "step": 120
95
  },
96
  {
97
  "epoch": 0.7647058823529411,
98
+ "grad_norm": 3.6908063888549805,
99
+ "learning_rate": 1.2411764705882354e-05,
100
+ "loss": 1.2711,
101
  "step": 130
102
  },
103
  {
104
  "epoch": 0.8235294117647058,
105
+ "grad_norm": 3.6166765689849854,
106
+ "learning_rate": 1.1823529411764707e-05,
107
+ "loss": 1.192,
108
  "step": 140
109
  },
110
  {
111
  "epoch": 0.8823529411764706,
112
+ "grad_norm": 3.6934988498687744,
113
+ "learning_rate": 1.123529411764706e-05,
114
+ "loss": 1.1566,
115
  "step": 150
116
  },
117
  {
118
  "epoch": 0.9411764705882353,
119
+ "grad_norm": 3.789727210998535,
120
+ "learning_rate": 1.0647058823529413e-05,
121
+ "loss": 1.1063,
122
  "step": 160
123
  },
124
  {
125
  "epoch": 1.0,
126
+ "grad_norm": 3.842630386352539,
127
+ "learning_rate": 1.0058823529411766e-05,
128
+ "loss": 1.0281,
129
  "step": 170
130
  },
131
  {
132
  "epoch": 1.0,
133
+ "eval_accuracy": 0.9583333333333334,
134
+ "eval_loss": 1.0490069389343262,
135
+ "eval_runtime": 6.306,
136
+ "eval_samples_per_second": 38.059,
137
+ "eval_steps_per_second": 4.757,
138
  "step": 170
139
  }
140
  ],
141
  "logging_steps": 10,
142
+ "max_steps": 340,
143
  "num_input_tokens_seen": 0,
144
+ "num_train_epochs": 2,
145
  "save_steps": 500,
146
  "stateful_callbacks": {
147
  "TrainerControl": {
 
150
  "should_evaluate": false,
151
  "should_log": false,
152
  "should_save": true,
153
+ "should_training_stop": false
154
  },
155
  "attributes": {}
156
  }
checkpoint-170/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:893ca9b2102cd9b45b14bed6a9a6e70a0788bcd8b2b3081bfc3f818b64ed24e0
3
  size 5777
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6254db31a7a2ebc2b5442f55a1856e38f567ff87f688b6fada24c5e7ff0ac863
3
  size 5777
checkpoint-340/config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ViTForImageClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.0,
6
+ "dtype": "float32",
7
+ "encoder_stride": 16,
8
+ "finetuning_task": "image-classification",
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.0,
11
+ "hidden_size": 768,
12
+ "id2label": {
13
+ "0": "Dosehri",
14
+ "1": "Sindhri",
15
+ "2": "Fajri",
16
+ "3": "Anwar Ratool",
17
+ "4": "Chaunsa (White)",
18
+ "5": "Langra",
19
+ "6": "Chaunsa (Black)",
20
+ "7": "Chaunsa (Summer Bahisht)"
21
+ },
22
+ "image_size": 224,
23
+ "initializer_range": 0.02,
24
+ "intermediate_size": 3072,
25
+ "label2id": {
26
+ "Anwar Ratool": "3",
27
+ "Chaunsa (Black)": "6",
28
+ "Chaunsa (Summer Bahisht)": "7",
29
+ "Chaunsa (White)": "4",
30
+ "Dosehri": "0",
31
+ "Fajri": "2",
32
+ "Langra": "5",
33
+ "Sindhri": "1"
34
+ },
35
+ "layer_norm_eps": 1e-12,
36
+ "model_type": "vit",
37
+ "num_attention_heads": 12,
38
+ "num_channels": 3,
39
+ "num_hidden_layers": 12,
40
+ "patch_size": 16,
41
+ "pooler_act": "tanh",
42
+ "pooler_output_size": 768,
43
+ "problem_type": "single_label_classification",
44
+ "qkv_bias": true,
45
+ "transformers_version": "4.56.1"
46
+ }
checkpoint-340/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2fb1fa9e80bf4eab83cc18c0c74bba63792386caef2b2130f2b5d423f7175e6c
3
+ size 343242432
checkpoint-340/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59b16c886c04057328c26eb6f50af8894e0a5f4abd74c1d246d97a20fac0e932
3
+ size 686608971
checkpoint-340/preprocessor_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": null,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.5,
8
+ 0.5,
9
+ 0.5
10
+ ],
11
+ "image_processor_type": "ViTImageProcessor",
12
+ "image_std": [
13
+ 0.5,
14
+ 0.5,
15
+ 0.5
16
+ ],
17
+ "resample": 2,
18
+ "rescale_factor": 0.00392156862745098,
19
+ "size": {
20
+ "height": 224,
21
+ "width": 224
22
+ }
23
+ }
checkpoint-340/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f404e05f6f5caffea0dc19f6a0ca097e8fa9410c78b5bb911b30fe514e69efb
3
+ size 14709
checkpoint-340/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b76435d73fd4ee716fd8f4c37788fb374f3e2219f1c374082298b0111fccbbcd
3
+ size 1465
checkpoint-340/trainer_state.json ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 340,
3
+ "best_metric": 0.7385169267654419,
4
+ "best_model_checkpoint": "./mangoes/checkpoint-340",
5
+ "epoch": 2.0,
6
+ "eval_steps": 500,
7
+ "global_step": 340,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.058823529411764705,
14
+ "grad_norm": 1.9217854738235474,
15
+ "learning_rate": 1.9470588235294118e-05,
16
+ "loss": 2.044,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.11764705882352941,
21
+ "grad_norm": 3.201918363571167,
22
+ "learning_rate": 1.888235294117647e-05,
23
+ "loss": 2.0161,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.17647058823529413,
28
+ "grad_norm": 3.1314334869384766,
29
+ "learning_rate": 1.8294117647058824e-05,
30
+ "loss": 1.9021,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.23529411764705882,
35
+ "grad_norm": 2.9427032470703125,
36
+ "learning_rate": 1.7705882352941177e-05,
37
+ "loss": 1.8208,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.29411764705882354,
42
+ "grad_norm": 3.096381425857544,
43
+ "learning_rate": 1.711764705882353e-05,
44
+ "loss": 1.7553,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.35294117647058826,
49
+ "grad_norm": 3.1433303356170654,
50
+ "learning_rate": 1.6529411764705883e-05,
51
+ "loss": 1.7674,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.4117647058823529,
56
+ "grad_norm": 3.0195441246032715,
57
+ "learning_rate": 1.594117647058824e-05,
58
+ "loss": 1.6242,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.47058823529411764,
63
+ "grad_norm": 3.748368740081787,
64
+ "learning_rate": 1.535294117647059e-05,
65
+ "loss": 1.5804,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.5294117647058824,
70
+ "grad_norm": 2.8905935287475586,
71
+ "learning_rate": 1.4764705882352944e-05,
72
+ "loss": 1.5126,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.5882352941176471,
77
+ "grad_norm": 3.1870696544647217,
78
+ "learning_rate": 1.4176470588235297e-05,
79
+ "loss": 1.3781,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.6470588235294118,
84
+ "grad_norm": 3.208005428314209,
85
+ "learning_rate": 1.3588235294117648e-05,
86
+ "loss": 1.4037,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.7058823529411765,
91
+ "grad_norm": 4.6087236404418945,
92
+ "learning_rate": 1.3000000000000001e-05,
93
+ "loss": 1.2771,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.7647058823529411,
98
+ "grad_norm": 3.6908063888549805,
99
+ "learning_rate": 1.2411764705882354e-05,
100
+ "loss": 1.2711,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.8235294117647058,
105
+ "grad_norm": 3.6166765689849854,
106
+ "learning_rate": 1.1823529411764707e-05,
107
+ "loss": 1.192,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.8823529411764706,
112
+ "grad_norm": 3.6934988498687744,
113
+ "learning_rate": 1.123529411764706e-05,
114
+ "loss": 1.1566,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.9411764705882353,
119
+ "grad_norm": 3.789727210998535,
120
+ "learning_rate": 1.0647058823529413e-05,
121
+ "loss": 1.1063,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 1.0,
126
+ "grad_norm": 3.842630386352539,
127
+ "learning_rate": 1.0058823529411766e-05,
128
+ "loss": 1.0281,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 1.0,
133
+ "eval_accuracy": 0.9583333333333334,
134
+ "eval_loss": 1.0490069389343262,
135
+ "eval_runtime": 6.306,
136
+ "eval_samples_per_second": 38.059,
137
+ "eval_steps_per_second": 4.757,
138
+ "step": 170
139
+ },
140
+ {
141
+ "epoch": 1.0588235294117647,
142
+ "grad_norm": 3.8409788608551025,
143
+ "learning_rate": 9.470588235294119e-06,
144
+ "loss": 1.0169,
145
+ "step": 180
146
+ },
147
+ {
148
+ "epoch": 1.1176470588235294,
149
+ "grad_norm": 2.461111068725586,
150
+ "learning_rate": 8.88235294117647e-06,
151
+ "loss": 0.9592,
152
+ "step": 190
153
+ },
154
+ {
155
+ "epoch": 1.1764705882352942,
156
+ "grad_norm": 3.7472541332244873,
157
+ "learning_rate": 8.294117647058825e-06,
158
+ "loss": 1.014,
159
+ "step": 200
160
+ },
161
+ {
162
+ "epoch": 1.2352941176470589,
163
+ "grad_norm": 4.744520664215088,
164
+ "learning_rate": 7.705882352941178e-06,
165
+ "loss": 0.9485,
166
+ "step": 210
167
+ },
168
+ {
169
+ "epoch": 1.2941176470588236,
170
+ "grad_norm": 2.4809184074401855,
171
+ "learning_rate": 7.11764705882353e-06,
172
+ "loss": 0.956,
173
+ "step": 220
174
+ },
175
+ {
176
+ "epoch": 1.3529411764705883,
177
+ "grad_norm": 6.709966659545898,
178
+ "learning_rate": 6.529411764705883e-06,
179
+ "loss": 0.9707,
180
+ "step": 230
181
+ },
182
+ {
183
+ "epoch": 1.4117647058823528,
184
+ "grad_norm": 4.961846828460693,
185
+ "learning_rate": 5.941176470588236e-06,
186
+ "loss": 0.8539,
187
+ "step": 240
188
+ },
189
+ {
190
+ "epoch": 1.4705882352941178,
191
+ "grad_norm": 5.209068298339844,
192
+ "learning_rate": 5.352941176470589e-06,
193
+ "loss": 0.8284,
194
+ "step": 250
195
+ },
196
+ {
197
+ "epoch": 1.5294117647058822,
198
+ "grad_norm": 3.826070547103882,
199
+ "learning_rate": 4.764705882352941e-06,
200
+ "loss": 0.8226,
201
+ "step": 260
202
+ },
203
+ {
204
+ "epoch": 1.5882352941176472,
205
+ "grad_norm": 2.8872721195220947,
206
+ "learning_rate": 4.176470588235295e-06,
207
+ "loss": 0.7727,
208
+ "step": 270
209
+ },
210
+ {
211
+ "epoch": 1.6470588235294117,
212
+ "grad_norm": 3.0581214427948,
213
+ "learning_rate": 3.5882352941176475e-06,
214
+ "loss": 0.7841,
215
+ "step": 280
216
+ },
217
+ {
218
+ "epoch": 1.7058823529411766,
219
+ "grad_norm": 4.626227855682373,
220
+ "learning_rate": 3e-06,
221
+ "loss": 0.7934,
222
+ "step": 290
223
+ },
224
+ {
225
+ "epoch": 1.7647058823529411,
226
+ "grad_norm": 2.622793436050415,
227
+ "learning_rate": 2.411764705882353e-06,
228
+ "loss": 0.7713,
229
+ "step": 300
230
+ },
231
+ {
232
+ "epoch": 1.8235294117647058,
233
+ "grad_norm": 2.549530267715454,
234
+ "learning_rate": 1.8235294117647058e-06,
235
+ "loss": 0.7459,
236
+ "step": 310
237
+ },
238
+ {
239
+ "epoch": 1.8823529411764706,
240
+ "grad_norm": 3.626901149749756,
241
+ "learning_rate": 1.235294117647059e-06,
242
+ "loss": 0.8056,
243
+ "step": 320
244
+ },
245
+ {
246
+ "epoch": 1.9411764705882353,
247
+ "grad_norm": 2.356318950653076,
248
+ "learning_rate": 6.470588235294118e-07,
249
+ "loss": 0.7665,
250
+ "step": 330
251
+ },
252
+ {
253
+ "epoch": 2.0,
254
+ "grad_norm": 4.176856517791748,
255
+ "learning_rate": 5.882352941176471e-08,
256
+ "loss": 0.7454,
257
+ "step": 340
258
+ },
259
+ {
260
+ "epoch": 2.0,
261
+ "eval_accuracy": 0.9791666666666666,
262
+ "eval_loss": 0.7385169267654419,
263
+ "eval_runtime": 3.873,
264
+ "eval_samples_per_second": 61.967,
265
+ "eval_steps_per_second": 7.746,
266
+ "step": 340
267
+ }
268
+ ],
269
+ "logging_steps": 10,
270
+ "max_steps": 340,
271
+ "num_input_tokens_seen": 0,
272
+ "num_train_epochs": 2,
273
+ "save_steps": 500,
274
+ "stateful_callbacks": {
275
+ "TrainerControl": {
276
+ "args": {
277
+ "should_epoch_stop": false,
278
+ "should_evaluate": false,
279
+ "should_log": false,
280
+ "should_save": true,
281
+ "should_training_stop": true
282
+ },
283
+ "attributes": {}
284
+ }
285
+ },
286
+ "total_flos": 2.1078954658234368e+17,
287
+ "train_batch_size": 8,
288
+ "trial_name": null,
289
+ "trial_params": null
290
+ }
checkpoint-340/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6254db31a7a2ebc2b5442f55a1856e38f567ff87f688b6fada24c5e7ff0ac863
3
+ size 5777
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 1.0,
3
- "eval_accuracy": 0.925,
4
- "eval_loss": 1.3396402597427368,
5
- "eval_runtime": 4.0715,
6
- "eval_samples_per_second": 58.946,
7
- "eval_steps_per_second": 7.368
8
  }
 
1
  {
2
+ "epoch": 2.0,
3
+ "eval_accuracy": 0.9791666666666666,
4
+ "eval_loss": 0.7385169267654419,
5
+ "eval_runtime": 4.5071,
6
+ "eval_samples_per_second": 53.249,
7
+ "eval_steps_per_second": 6.656
8
  }
image_0.png CHANGED

Git LFS Details

  • SHA256: 0be8f58b4b6d41413d300543ee67f999305c7b14354f87e6f8231b5ed4f4c4c2
  • Pointer size: 131 Bytes
  • Size of remote file: 244 kB

Git LFS Details

  • SHA256: 8a9e167d0297ccafbd8ad85d623d75766a6bcb0c3887ce9cb06f2c50360e860e
  • Pointer size: 131 Bytes
  • Size of remote file: 324 kB
image_1.png CHANGED

Git LFS Details

  • SHA256: cac2905641f3a12b78857dbd6c9911bf4353f43626460809c402e68cdda9da39
  • Pointer size: 131 Bytes
  • Size of remote file: 300 kB

Git LFS Details

  • SHA256: 0be8f58b4b6d41413d300543ee67f999305c7b14354f87e6f8231b5ed4f4c4c2
  • Pointer size: 131 Bytes
  • Size of remote file: 244 kB
image_2.png CHANGED

Git LFS Details

  • SHA256: 8a9e167d0297ccafbd8ad85d623d75766a6bcb0c3887ce9cb06f2c50360e860e
  • Pointer size: 131 Bytes
  • Size of remote file: 324 kB

Git LFS Details

  • SHA256: 73d259138a0ec2a08db25248423c25f1ad1eae52d68af8d971b70da4a4da5532
  • Pointer size: 131 Bytes
  • Size of remote file: 227 kB
image_3.png CHANGED

Git LFS Details

  • SHA256: 73d259138a0ec2a08db25248423c25f1ad1eae52d68af8d971b70da4a4da5532
  • Pointer size: 131 Bytes
  • Size of remote file: 227 kB

Git LFS Details

  • SHA256: 22f98d05c4c88283bdeacf07b9b3b4db9af66be688a423beab90550f3f4cbdf5
  • Pointer size: 131 Bytes
  • Size of remote file: 252 kB
image_4.png ADDED

Git LFS Details

  • SHA256: cac2905641f3a12b78857dbd6c9911bf4353f43626460809c402e68cdda9da39
  • Pointer size: 131 Bytes
  • Size of remote file: 300 kB
image_5.png ADDED

Git LFS Details

  • SHA256: dbc7b24ed5d7a89bd53371c1a53d21a53725f84e779411dbea500bea9ae38d80
  • Pointer size: 131 Bytes
  • Size of remote file: 247 kB
image_6.png ADDED

Git LFS Details

  • SHA256: a9c3d3600a8e6f10f1c34f137732a81e7b7a9734e4b7e0512c84f48351682267
  • Pointer size: 131 Bytes
  • Size of remote file: 220 kB
image_7.png ADDED

Git LFS Details

  • SHA256: a99f844e8943ef41a29a7dbc6747e78e6cd68c3ad618c7c879d3ce71b3760ded
  • Pointer size: 131 Bytes
  • Size of remote file: 338 kB
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 1.0,
3
- "total_flos": 1.0539477329117184e+17,
4
- "train_loss": 1.6162697343265309,
5
- "train_runtime": 67.3955,
6
- "train_samples_per_second": 20.179,
7
- "train_steps_per_second": 2.522
8
  }
 
1
  {
2
+ "epoch": 2.0,
3
+ "total_flos": 2.1078954658234368e+17,
4
+ "train_loss": 1.1879772003959208,
5
+ "train_runtime": 144.3943,
6
+ "train_samples_per_second": 18.837,
7
+ "train_steps_per_second": 2.355
8
  }
trainer_state.json CHANGED
@@ -1,156 +1,284 @@
1
  {
2
- "best_global_step": 170,
3
- "best_metric": 1.3396402597427368,
4
- "best_model_checkpoint": "./mangoes/checkpoint-170",
5
- "epoch": 1.0,
6
  "eval_steps": 500,
7
- "global_step": 170,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.058823529411764705,
14
- "grad_norm": 1.9543970823287964,
15
- "learning_rate": 1.8941176470588238e-05,
16
- "loss": 2.0442,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.11764705882352941,
21
- "grad_norm": 3.168637275695801,
22
- "learning_rate": 1.776470588235294e-05,
23
- "loss": 2.0181,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.17647058823529413,
28
- "grad_norm": 3.1181507110595703,
29
- "learning_rate": 1.658823529411765e-05,
30
- "loss": 1.9073,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.23529411764705882,
35
- "grad_norm": 2.7099850177764893,
36
- "learning_rate": 1.5411764705882356e-05,
37
- "loss": 1.8321,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.29411764705882354,
42
- "grad_norm": 3.0820131301879883,
43
- "learning_rate": 1.423529411764706e-05,
44
- "loss": 1.7744,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.35294117647058826,
49
- "grad_norm": 3.182910442352295,
50
- "learning_rate": 1.3058823529411766e-05,
51
- "loss": 1.7882,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 0.4117647058823529,
56
- "grad_norm": 2.999581813812256,
57
- "learning_rate": 1.1882352941176472e-05,
58
- "loss": 1.6671,
59
  "step": 70
60
  },
61
  {
62
  "epoch": 0.47058823529411764,
63
- "grad_norm": 3.62984299659729,
64
- "learning_rate": 1.0705882352941178e-05,
65
- "loss": 1.6369,
66
  "step": 80
67
  },
68
  {
69
  "epoch": 0.5294117647058824,
70
- "grad_norm": 3.2714788913726807,
71
- "learning_rate": 9.529411764705882e-06,
72
- "loss": 1.5923,
73
  "step": 90
74
  },
75
  {
76
  "epoch": 0.5882352941176471,
77
- "grad_norm": 3.1325395107269287,
78
- "learning_rate": 8.35294117647059e-06,
79
- "loss": 1.4627,
80
  "step": 100
81
  },
82
  {
83
  "epoch": 0.6470588235294118,
84
- "grad_norm": 3.435844898223877,
85
- "learning_rate": 7.176470588235295e-06,
86
- "loss": 1.5063,
87
  "step": 110
88
  },
89
  {
90
  "epoch": 0.7058823529411765,
91
- "grad_norm": 3.5805537700653076,
92
- "learning_rate": 6e-06,
93
- "loss": 1.4239,
94
  "step": 120
95
  },
96
  {
97
  "epoch": 0.7647058823529411,
98
- "grad_norm": 3.2065505981445312,
99
- "learning_rate": 4.823529411764706e-06,
100
- "loss": 1.4303,
101
  "step": 130
102
  },
103
  {
104
  "epoch": 0.8235294117647058,
105
- "grad_norm": 3.3940041065216064,
106
- "learning_rate": 3.6470588235294117e-06,
107
- "loss": 1.3697,
108
  "step": 140
109
  },
110
  {
111
  "epoch": 0.8823529411764706,
112
- "grad_norm": 2.9482526779174805,
113
- "learning_rate": 2.470588235294118e-06,
114
- "loss": 1.3706,
115
  "step": 150
116
  },
117
  {
118
  "epoch": 0.9411764705882353,
119
- "grad_norm": 3.306838035583496,
120
- "learning_rate": 1.2941176470588237e-06,
121
- "loss": 1.3454,
122
  "step": 160
123
  },
124
  {
125
  "epoch": 1.0,
126
- "grad_norm": 3.1316869258880615,
127
- "learning_rate": 1.1764705882352942e-07,
128
- "loss": 1.3072,
129
  "step": 170
130
  },
131
  {
132
  "epoch": 1.0,
133
- "eval_accuracy": 0.925,
134
- "eval_loss": 1.3396402597427368,
135
- "eval_runtime": 3.5845,
136
- "eval_samples_per_second": 66.955,
137
- "eval_steps_per_second": 8.369,
138
  "step": 170
139
  },
140
  {
141
- "epoch": 1.0,
142
- "step": 170,
143
- "total_flos": 1.0539477329117184e+17,
144
- "train_loss": 1.6162697343265309,
145
- "train_runtime": 67.3955,
146
- "train_samples_per_second": 20.179,
147
- "train_steps_per_second": 2.522
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  }
149
  ],
150
  "logging_steps": 10,
151
- "max_steps": 170,
152
  "num_input_tokens_seen": 0,
153
- "num_train_epochs": 1,
154
  "save_steps": 500,
155
  "stateful_callbacks": {
156
  "TrainerControl": {
@@ -164,7 +292,7 @@
164
  "attributes": {}
165
  }
166
  },
167
- "total_flos": 1.0539477329117184e+17,
168
  "train_batch_size": 8,
169
  "trial_name": null,
170
  "trial_params": null
 
1
  {
2
+ "best_global_step": 340,
3
+ "best_metric": 0.7385169267654419,
4
+ "best_model_checkpoint": "./mangoes/checkpoint-340",
5
+ "epoch": 2.0,
6
  "eval_steps": 500,
7
+ "global_step": 340,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.058823529411764705,
14
+ "grad_norm": 1.9217854738235474,
15
+ "learning_rate": 1.9470588235294118e-05,
16
+ "loss": 2.044,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.11764705882352941,
21
+ "grad_norm": 3.201918363571167,
22
+ "learning_rate": 1.888235294117647e-05,
23
+ "loss": 2.0161,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.17647058823529413,
28
+ "grad_norm": 3.1314334869384766,
29
+ "learning_rate": 1.8294117647058824e-05,
30
+ "loss": 1.9021,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.23529411764705882,
35
+ "grad_norm": 2.9427032470703125,
36
+ "learning_rate": 1.7705882352941177e-05,
37
+ "loss": 1.8208,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.29411764705882354,
42
+ "grad_norm": 3.096381425857544,
43
+ "learning_rate": 1.711764705882353e-05,
44
+ "loss": 1.7553,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.35294117647058826,
49
+ "grad_norm": 3.1433303356170654,
50
+ "learning_rate": 1.6529411764705883e-05,
51
+ "loss": 1.7674,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 0.4117647058823529,
56
+ "grad_norm": 3.0195441246032715,
57
+ "learning_rate": 1.594117647058824e-05,
58
+ "loss": 1.6242,
59
  "step": 70
60
  },
61
  {
62
  "epoch": 0.47058823529411764,
63
+ "grad_norm": 3.748368740081787,
64
+ "learning_rate": 1.535294117647059e-05,
65
+ "loss": 1.5804,
66
  "step": 80
67
  },
68
  {
69
  "epoch": 0.5294117647058824,
70
+ "grad_norm": 2.8905935287475586,
71
+ "learning_rate": 1.4764705882352944e-05,
72
+ "loss": 1.5126,
73
  "step": 90
74
  },
75
  {
76
  "epoch": 0.5882352941176471,
77
+ "grad_norm": 3.1870696544647217,
78
+ "learning_rate": 1.4176470588235297e-05,
79
+ "loss": 1.3781,
80
  "step": 100
81
  },
82
  {
83
  "epoch": 0.6470588235294118,
84
+ "grad_norm": 3.208005428314209,
85
+ "learning_rate": 1.3588235294117648e-05,
86
+ "loss": 1.4037,
87
  "step": 110
88
  },
89
  {
90
  "epoch": 0.7058823529411765,
91
+ "grad_norm": 4.6087236404418945,
92
+ "learning_rate": 1.3000000000000001e-05,
93
+ "loss": 1.2771,
94
  "step": 120
95
  },
96
  {
97
  "epoch": 0.7647058823529411,
98
+ "grad_norm": 3.6908063888549805,
99
+ "learning_rate": 1.2411764705882354e-05,
100
+ "loss": 1.2711,
101
  "step": 130
102
  },
103
  {
104
  "epoch": 0.8235294117647058,
105
+ "grad_norm": 3.6166765689849854,
106
+ "learning_rate": 1.1823529411764707e-05,
107
+ "loss": 1.192,
108
  "step": 140
109
  },
110
  {
111
  "epoch": 0.8823529411764706,
112
+ "grad_norm": 3.6934988498687744,
113
+ "learning_rate": 1.123529411764706e-05,
114
+ "loss": 1.1566,
115
  "step": 150
116
  },
117
  {
118
  "epoch": 0.9411764705882353,
119
+ "grad_norm": 3.789727210998535,
120
+ "learning_rate": 1.0647058823529413e-05,
121
+ "loss": 1.1063,
122
  "step": 160
123
  },
124
  {
125
  "epoch": 1.0,
126
+ "grad_norm": 3.842630386352539,
127
+ "learning_rate": 1.0058823529411766e-05,
128
+ "loss": 1.0281,
129
  "step": 170
130
  },
131
  {
132
  "epoch": 1.0,
133
+ "eval_accuracy": 0.9583333333333334,
134
+ "eval_loss": 1.0490069389343262,
135
+ "eval_runtime": 6.306,
136
+ "eval_samples_per_second": 38.059,
137
+ "eval_steps_per_second": 4.757,
138
  "step": 170
139
  },
140
  {
141
+ "epoch": 1.0588235294117647,
142
+ "grad_norm": 3.8409788608551025,
143
+ "learning_rate": 9.470588235294119e-06,
144
+ "loss": 1.0169,
145
+ "step": 180
146
+ },
147
+ {
148
+ "epoch": 1.1176470588235294,
149
+ "grad_norm": 2.461111068725586,
150
+ "learning_rate": 8.88235294117647e-06,
151
+ "loss": 0.9592,
152
+ "step": 190
153
+ },
154
+ {
155
+ "epoch": 1.1764705882352942,
156
+ "grad_norm": 3.7472541332244873,
157
+ "learning_rate": 8.294117647058825e-06,
158
+ "loss": 1.014,
159
+ "step": 200
160
+ },
161
+ {
162
+ "epoch": 1.2352941176470589,
163
+ "grad_norm": 4.744520664215088,
164
+ "learning_rate": 7.705882352941178e-06,
165
+ "loss": 0.9485,
166
+ "step": 210
167
+ },
168
+ {
169
+ "epoch": 1.2941176470588236,
170
+ "grad_norm": 2.4809184074401855,
171
+ "learning_rate": 7.11764705882353e-06,
172
+ "loss": 0.956,
173
+ "step": 220
174
+ },
175
+ {
176
+ "epoch": 1.3529411764705883,
177
+ "grad_norm": 6.709966659545898,
178
+ "learning_rate": 6.529411764705883e-06,
179
+ "loss": 0.9707,
180
+ "step": 230
181
+ },
182
+ {
183
+ "epoch": 1.4117647058823528,
184
+ "grad_norm": 4.961846828460693,
185
+ "learning_rate": 5.941176470588236e-06,
186
+ "loss": 0.8539,
187
+ "step": 240
188
+ },
189
+ {
190
+ "epoch": 1.4705882352941178,
191
+ "grad_norm": 5.209068298339844,
192
+ "learning_rate": 5.352941176470589e-06,
193
+ "loss": 0.8284,
194
+ "step": 250
195
+ },
196
+ {
197
+ "epoch": 1.5294117647058822,
198
+ "grad_norm": 3.826070547103882,
199
+ "learning_rate": 4.764705882352941e-06,
200
+ "loss": 0.8226,
201
+ "step": 260
202
+ },
203
+ {
204
+ "epoch": 1.5882352941176472,
205
+ "grad_norm": 2.8872721195220947,
206
+ "learning_rate": 4.176470588235295e-06,
207
+ "loss": 0.7727,
208
+ "step": 270
209
+ },
210
+ {
211
+ "epoch": 1.6470588235294117,
212
+ "grad_norm": 3.0581214427948,
213
+ "learning_rate": 3.5882352941176475e-06,
214
+ "loss": 0.7841,
215
+ "step": 280
216
+ },
217
+ {
218
+ "epoch": 1.7058823529411766,
219
+ "grad_norm": 4.626227855682373,
220
+ "learning_rate": 3e-06,
221
+ "loss": 0.7934,
222
+ "step": 290
223
+ },
224
+ {
225
+ "epoch": 1.7647058823529411,
226
+ "grad_norm": 2.622793436050415,
227
+ "learning_rate": 2.411764705882353e-06,
228
+ "loss": 0.7713,
229
+ "step": 300
230
+ },
231
+ {
232
+ "epoch": 1.8235294117647058,
233
+ "grad_norm": 2.549530267715454,
234
+ "learning_rate": 1.8235294117647058e-06,
235
+ "loss": 0.7459,
236
+ "step": 310
237
+ },
238
+ {
239
+ "epoch": 1.8823529411764706,
240
+ "grad_norm": 3.626901149749756,
241
+ "learning_rate": 1.235294117647059e-06,
242
+ "loss": 0.8056,
243
+ "step": 320
244
+ },
245
+ {
246
+ "epoch": 1.9411764705882353,
247
+ "grad_norm": 2.356318950653076,
248
+ "learning_rate": 6.470588235294118e-07,
249
+ "loss": 0.7665,
250
+ "step": 330
251
+ },
252
+ {
253
+ "epoch": 2.0,
254
+ "grad_norm": 4.176856517791748,
255
+ "learning_rate": 5.882352941176471e-08,
256
+ "loss": 0.7454,
257
+ "step": 340
258
+ },
259
+ {
260
+ "epoch": 2.0,
261
+ "eval_accuracy": 0.9791666666666666,
262
+ "eval_loss": 0.7385169267654419,
263
+ "eval_runtime": 3.873,
264
+ "eval_samples_per_second": 61.967,
265
+ "eval_steps_per_second": 7.746,
266
+ "step": 340
267
+ },
268
+ {
269
+ "epoch": 2.0,
270
+ "step": 340,
271
+ "total_flos": 2.1078954658234368e+17,
272
+ "train_loss": 1.1879772003959208,
273
+ "train_runtime": 144.3943,
274
+ "train_samples_per_second": 18.837,
275
+ "train_steps_per_second": 2.355
276
  }
277
  ],
278
  "logging_steps": 10,
279
+ "max_steps": 340,
280
  "num_input_tokens_seen": 0,
281
+ "num_train_epochs": 2,
282
  "save_steps": 500,
283
  "stateful_callbacks": {
284
  "TrainerControl": {
 
292
  "attributes": {}
293
  }
294
  },
295
+ "total_flos": 2.1078954658234368e+17,
296
  "train_batch_size": 8,
297
  "trial_name": null,
298
  "trial_params": null