mtzig committed on
Commit
48145ea
·
verified ·
1 Parent(s): ba0c41f

End of training: push final checkpoint

Browse files
Files changed (4) hide show
  1. README.md +1 -1
  2. all_results.json +15 -0
  3. train_results.json +15 -0
  4. trainer_state.json +203 -0
README.md CHANGED
@@ -14,7 +14,7 @@ should probably proofread and complete it, then remove this comment. -->
14
 
15
  This model is a fine-tuned version of [](https://huggingface.co/) on the None dataset.
16
  It achieves the following results on the evaluation set:
17
- - Loss: 10.4557
18
 
19
  ## Model description
20
 
 
14
 
15
  This model is a fine-tuned version of [](https://huggingface.co/) on the None dataset.
16
  It achieves the following results on the evaluation set:
17
+ - Loss: 10.4539
18
 
19
  ## Model description
20
 
all_results.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.9922480620155039,
3
+ "eval_epoch": 0.9922480620155039,
4
+ "eval_eval_loss": 10.453876495361328,
5
+ "eval_eval_runtime": 0.4463,
6
+ "eval_eval_samples_per_second": 11.203,
7
+ "eval_eval_steps_per_second": 4.481,
8
+ "eval_perplexity": 34678.54570620386,
9
+ "total_flos": 384752733388800.0,
10
+ "train_loss": 86.94341468811035,
11
+ "train_runtime": 61.6055,
12
+ "train_samples": 515,
13
+ "train_samples_per_second": 8.36,
14
+ "train_steps_per_second": 0.26
15
+ }
train_results.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.9922480620155039,
3
+ "eval_epoch": 0.9922480620155039,
4
+ "eval_eval_loss": 10.453876495361328,
5
+ "eval_eval_runtime": 0.4463,
6
+ "eval_eval_samples_per_second": 11.203,
7
+ "eval_eval_steps_per_second": 4.481,
8
+ "eval_perplexity": 34678.54570620386,
9
+ "total_flos": 384752733388800.0,
10
+ "train_loss": 86.94341468811035,
11
+ "train_runtime": 61.6055,
12
+ "train_samples": 515,
13
+ "train_samples_per_second": 8.36,
14
+ "train_steps_per_second": 0.26
15
+ }
trainer_state.json ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.9922480620155039,
6
+ "eval_steps": 3,
7
+ "global_step": 16,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.06201550387596899,
14
+ "grad_norm": 4.862276554107666,
15
+ "learning_rate": 0.0,
16
+ "loss": 95.3351,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.12403100775193798,
21
+ "grad_norm": 4.652088642120361,
22
+ "learning_rate": 0.0002,
23
+ "loss": 95.3436,
24
+ "step": 2
25
+ },
26
+ {
27
+ "epoch": 0.18604651162790697,
28
+ "grad_norm": 5.192637920379639,
29
+ "learning_rate": 0.00019781476007338058,
30
+ "loss": 93.3809,
31
+ "step": 3
32
+ },
33
+ {
34
+ "epoch": 0.18604651162790697,
35
+ "eval_loss": 11.580174446105957,
36
+ "eval_runtime": 0.2088,
37
+ "eval_samples_per_second": 23.944,
38
+ "eval_steps_per_second": 9.578,
39
+ "step": 3
40
+ },
41
+ {
42
+ "epoch": 0.24806201550387597,
43
+ "grad_norm": 6.524551868438721,
44
+ "learning_rate": 0.0001913545457642601,
45
+ "loss": 91.6408,
46
+ "step": 4
47
+ },
48
+ {
49
+ "epoch": 0.31007751937984496,
50
+ "grad_norm": 8.191283226013184,
51
+ "learning_rate": 0.00018090169943749476,
52
+ "loss": 90.1653,
53
+ "step": 5
54
+ },
55
+ {
56
+ "epoch": 0.37209302325581395,
57
+ "grad_norm": 9.808971405029297,
58
+ "learning_rate": 0.00016691306063588583,
59
+ "loss": 88.4358,
60
+ "step": 6
61
+ },
62
+ {
63
+ "epoch": 0.37209302325581395,
64
+ "eval_loss": 11.026472091674805,
65
+ "eval_runtime": 0.1942,
66
+ "eval_samples_per_second": 25.743,
67
+ "eval_steps_per_second": 10.297,
68
+ "step": 6
69
+ },
70
+ {
71
+ "epoch": 0.43410852713178294,
72
+ "grad_norm": 11.63279914855957,
73
+ "learning_rate": 0.00015000000000000001,
74
+ "loss": 86.722,
75
+ "step": 7
76
+ },
77
+ {
78
+ "epoch": 0.49612403100775193,
79
+ "grad_norm": 12.591235160827637,
80
+ "learning_rate": 0.00013090169943749476,
81
+ "loss": 85.2279,
82
+ "step": 8
83
+ },
84
+ {
85
+ "epoch": 0.5581395348837209,
86
+ "grad_norm": 12.81602668762207,
87
+ "learning_rate": 0.00011045284632676536,
88
+ "loss": 84.6683,
89
+ "step": 9
90
+ },
91
+ {
92
+ "epoch": 0.5581395348837209,
93
+ "eval_loss": 10.659093856811523,
94
+ "eval_runtime": 0.1895,
95
+ "eval_samples_per_second": 26.38,
96
+ "eval_steps_per_second": 10.552,
97
+ "step": 9
98
+ },
99
+ {
100
+ "epoch": 0.6201550387596899,
101
+ "grad_norm": 13.371841430664062,
102
+ "learning_rate": 8.954715367323468e-05,
103
+ "loss": 83.69,
104
+ "step": 10
105
+ },
106
+ {
107
+ "epoch": 0.6821705426356589,
108
+ "grad_norm": 13.044097900390625,
109
+ "learning_rate": 6.909830056250527e-05,
110
+ "loss": 83.3816,
111
+ "step": 11
112
+ },
113
+ {
114
+ "epoch": 0.7441860465116279,
115
+ "grad_norm": 12.994512557983398,
116
+ "learning_rate": 5.000000000000002e-05,
117
+ "loss": 83.1422,
118
+ "step": 12
119
+ },
120
+ {
121
+ "epoch": 0.7441860465116279,
122
+ "eval_loss": 10.501452445983887,
123
+ "eval_runtime": 0.213,
124
+ "eval_samples_per_second": 23.478,
125
+ "eval_steps_per_second": 9.391,
126
+ "step": 12
127
+ },
128
+ {
129
+ "epoch": 0.8062015503875969,
130
+ "grad_norm": 13.637518882751465,
131
+ "learning_rate": 3.308693936411421e-05,
132
+ "loss": 82.3943,
133
+ "step": 13
134
+ },
135
+ {
136
+ "epoch": 0.8682170542635659,
137
+ "grad_norm": 12.913551330566406,
138
+ "learning_rate": 1.9098300562505266e-05,
139
+ "loss": 82.7794,
140
+ "step": 14
141
+ },
142
+ {
143
+ "epoch": 0.9302325581395349,
144
+ "grad_norm": 13.087961196899414,
145
+ "learning_rate": 8.645454235739903e-06,
146
+ "loss": 82.4536,
147
+ "step": 15
148
+ },
149
+ {
150
+ "epoch": 0.9302325581395349,
151
+ "eval_loss": 10.455672264099121,
152
+ "eval_runtime": 0.2263,
153
+ "eval_samples_per_second": 22.094,
154
+ "eval_steps_per_second": 8.838,
155
+ "step": 15
156
+ },
157
+ {
158
+ "epoch": 0.9922480620155039,
159
+ "grad_norm": 13.101417541503906,
160
+ "learning_rate": 2.1852399266194314e-06,
161
+ "loss": 82.3338,
162
+ "step": 16
163
+ },
164
+ {
165
+ "epoch": 0.9922480620155039,
166
+ "step": 16,
167
+ "total_flos": 384752733388800.0,
168
+ "train_loss": 86.94341468811035,
169
+ "train_runtime": 61.6055,
170
+ "train_samples_per_second": 8.36,
171
+ "train_steps_per_second": 0.26
172
+ },
173
+ {
174
+ "epoch": 0.9922480620155039,
175
+ "eval_loss": 10.453876495361328,
176
+ "eval_runtime": 0.4463,
177
+ "eval_samples_per_second": 11.203,
178
+ "eval_steps_per_second": 4.481,
179
+ "step": 16
180
+ }
181
+ ],
182
+ "logging_steps": 1,
183
+ "max_steps": 16,
184
+ "num_input_tokens_seen": 0,
185
+ "num_train_epochs": 1,
186
+ "save_steps": 2000,
187
+ "stateful_callbacks": {
188
+ "TrainerControl": {
189
+ "args": {
190
+ "should_epoch_stop": false,
191
+ "should_evaluate": false,
192
+ "should_log": false,
193
+ "should_save": true,
194
+ "should_training_stop": true
195
+ },
196
+ "attributes": {}
197
+ }
198
+ },
199
+ "total_flos": 384752733388800.0,
200
+ "train_batch_size": 4,
201
+ "trial_name": null,
202
+ "trial_params": null
203
+ }