sjudicke commited on
Commit
9233e9a
·
verified ·
1 Parent(s): 04fd1f5

Full run push

Browse files
Files changed (2) hide show
  1. README.md +54 -0
  2. trainer_state.json +253 -0
README.md ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: llama2
4
+ base_model: meta-llama/Llama-2-7b-hf
5
+ tags:
6
+ - generated_from_trainer
7
+ model-index:
8
+ - name: m0-new-lr
9
+ results: []
10
+ ---
11
+
12
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
+ should probably proofread and complete it, then remove this comment. -->
14
+
15
+ # m0-new-lr
16
+
17
+ This model is a fine-tuned version of [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) on an unknown dataset.
18
+
19
+ ## Model description
20
+
21
+ More information needed
22
+
23
+ ## Intended uses & limitations
24
+
25
+ More information needed
26
+
27
+ ## Training and evaluation data
28
+
29
+ More information needed
30
+
31
+ ## Training procedure
32
+
33
+ ### Training hyperparameters
34
+
35
+ The following hyperparameters were used during training:
36
+ - learning_rate: 0.0003
37
+ - train_batch_size: 1
38
+ - eval_batch_size: 8
39
+ - seed: 42
40
 + - optimizer: AdamW (torch) with betas=(0.9, 0.999) and epsilon=1e-08; no additional optimizer arguments
41
+ - lr_scheduler_type: cosine
42
+ - lr_scheduler_warmup_ratio: 0.03
43
+ - num_epochs: 1.0
44
+
45
+ ### Training results
46
+
47
+
48
+
49
+ ### Framework versions
50
+
51
+ - Transformers 4.57.1
52
+ - Pytorch 2.8.0+cu126
53
+ - Datasets 4.0.0
54
+ - Tokenizers 0.22.1
trainer_state.json ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 15000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.03333333333333333,
14
+ "grad_norm": 0.8437663912773132,
15
+ "learning_rate": 0.00029999160495301487,
16
+ "loss": 0.4576,
17
+ "step": 500
18
+ },
19
+ {
20
+ "epoch": 0.06666666666666667,
21
+ "grad_norm": 1.3834174871444702,
22
+ "learning_rate": 0.00029894738121610755,
23
+ "loss": 0.3618,
24
+ "step": 1000
25
+ },
26
+ {
27
+ "epoch": 0.1,
28
+ "grad_norm": 0.8362070322036743,
29
+ "learning_rate": 0.0002961688552258895,
30
+ "loss": 0.3312,
31
+ "step": 1500
32
+ },
33
+ {
34
+ "epoch": 0.13333333333333333,
35
+ "grad_norm": 1.8011195659637451,
36
+ "learning_rate": 0.0002916883793731605,
37
+ "loss": 0.3265,
38
+ "step": 2000
39
+ },
40
+ {
41
+ "epoch": 0.16666666666666666,
42
+ "grad_norm": 1.187129259109497,
43
+ "learning_rate": 0.0002855581230838202,
44
+ "loss": 0.3176,
45
+ "step": 2500
46
+ },
47
+ {
48
+ "epoch": 0.2,
49
+ "grad_norm": 0.6343923807144165,
50
+ "learning_rate": 0.000277849465372452,
51
+ "loss": 0.3267,
52
+ "step": 3000
53
+ },
54
+ {
55
+ "epoch": 0.23333333333333334,
56
+ "grad_norm": 0.5139018297195435,
57
+ "learning_rate": 0.00026865216372475085,
58
+ "loss": 0.3304,
59
+ "step": 3500
60
+ },
61
+ {
62
+ "epoch": 0.26666666666666666,
63
+ "grad_norm": 0.5840966701507568,
64
+ "learning_rate": 0.0002580733089860996,
65
+ "loss": 0.3174,
66
+ "step": 4000
67
+ },
68
+ {
69
+ "epoch": 0.3,
70
+ "grad_norm": 1.0480272769927979,
71
+ "learning_rate": 0.0002462360784252821,
72
+ "loss": 0.3131,
73
+ "step": 4500
74
+ },
75
+ {
76
+ "epoch": 0.3333333333333333,
77
+ "grad_norm": 0.768731415271759,
78
+ "learning_rate": 0.00023327830149231583,
79
+ "loss": 0.3005,
80
+ "step": 5000
81
+ },
82
+ {
83
+ "epoch": 0.36666666666666664,
84
+ "grad_norm": 0.8862756490707397,
85
+ "learning_rate": 0.00021935085497032568,
86
+ "loss": 0.2977,
87
+ "step": 5500
88
+ },
89
+ {
90
+ "epoch": 0.4,
91
+ "grad_norm": 1.8197040557861328,
92
+ "learning_rate": 0.00020461590620786605,
93
+ "loss": 0.3006,
94
+ "step": 6000
95
+ },
96
+ {
97
+ "epoch": 0.43333333333333335,
98
+ "grad_norm": 0.5644539594650269,
99
+ "learning_rate": 0.00018924502488701202,
100
+ "loss": 0.2891,
101
+ "step": 6500
102
+ },
103
+ {
104
+ "epoch": 0.4666666666666667,
105
+ "grad_norm": 0.7333141565322876,
106
+ "learning_rate": 0.00017341718531326979,
107
+ "loss": 0.2958,
108
+ "step": 7000
109
+ },
110
+ {
111
+ "epoch": 0.5,
112
+ "grad_norm": 0.8840310573577881,
113
+ "learning_rate": 0.00015731668248809323,
114
+ "loss": 0.2914,
115
+ "step": 7500
116
+ },
117
+ {
118
+ "epoch": 0.5333333333333333,
119
+ "grad_norm": 0.7415375113487244,
120
+ "learning_rate": 0.0001411309862286835,
121
+ "loss": 0.284,
122
+ "step": 8000
123
+ },
124
+ {
125
+ "epoch": 0.5666666666666667,
126
+ "grad_norm": 1.0699294805526733,
127
+ "learning_rate": 0.00012504855832110804,
128
+ "loss": 0.2776,
129
+ "step": 8500
130
+ },
131
+ {
132
+ "epoch": 0.6,
133
+ "grad_norm": 1.0262274742126465,
134
+ "learning_rate": 0.00010925665812320933,
135
+ "loss": 0.2751,
136
+ "step": 9000
137
+ },
138
+ {
139
+ "epoch": 0.6333333333333333,
140
+ "grad_norm": 1.2797510623931885,
141
+ "learning_rate": 9.393916216825465e-05,
142
+ "loss": 0.2797,
143
+ "step": 9500
144
+ },
145
+ {
146
+ "epoch": 0.6666666666666666,
147
+ "grad_norm": 0.8398839831352234,
148
+ "learning_rate": 7.927442315726411e-05,
149
+ "loss": 0.2655,
150
+ "step": 10000
151
+ },
152
+ {
153
+ "epoch": 0.7,
154
+ "grad_norm": 0.9251325726509094,
155
+ "learning_rate": 6.543319326931815e-05,
156
+ "loss": 0.2605,
157
+ "step": 10500
158
+ },
159
+ {
160
+ "epoch": 0.7333333333333333,
161
+ "grad_norm": 0.6823338866233826,
162
+ "learning_rate": 5.257663597024785e-05,
163
+ "loss": 0.2674,
164
+ "step": 11000
165
+ },
166
+ {
167
+ "epoch": 0.7666666666666667,
168
+ "grad_norm": 0.7772260904312134,
169
+ "learning_rate": 4.085444946965953e-05,
170
+ "loss": 0.2624,
171
+ "step": 11500
172
+ },
173
+ {
174
+ "epoch": 0.8,
175
+ "grad_norm": 0.6518853902816772,
176
+ "learning_rate": 3.040312367624248e-05,
177
+ "loss": 0.2596,
178
+ "step": 12000
179
+ },
180
+ {
181
+ "epoch": 0.8333333333333334,
182
+ "grad_norm": 0.6722842454910278,
183
+ "learning_rate": 2.1344350946892218e-05,
184
+ "loss": 0.2509,
185
+ "step": 12500
186
+ },
187
+ {
188
+ "epoch": 0.8666666666666667,
189
+ "grad_norm": 1.5721765756607056,
190
+ "learning_rate": 1.3783609134448331e-05,
191
+ "loss": 0.25,
192
+ "step": 13000
193
+ },
194
+ {
195
+ "epoch": 0.9,
196
+ "grad_norm": 1.592251181602478,
197
+ "learning_rate": 7.808933432648067e-06,
198
+ "loss": 0.249,
199
+ "step": 13500
200
+ },
201
+ {
202
+ "epoch": 0.9333333333333333,
203
+ "grad_norm": 1.5700002908706665,
204
+ "learning_rate": 3.4898913185952726e-06,
205
+ "loss": 0.2618,
206
+ "step": 14000
207
+ },
208
+ {
209
+ "epoch": 0.9666666666666667,
210
+ "grad_norm": 1.067083477973938,
211
+ "learning_rate": 8.767725282315785e-07,
212
+ "loss": 0.2653,
213
+ "step": 14500
214
+ },
215
+ {
216
+ "epoch": 1.0,
217
+ "grad_norm": 0.734412670135498,
218
+ "learning_rate": 3.496511979950156e-12,
219
+ "loss": 0.2583,
220
+ "step": 15000
221
+ },
222
+ {
223
+ "epoch": 1.0,
224
+ "step": 15000,
225
+ "total_flos": 1.6839046112147866e+17,
226
+ "train_loss": 0.29414576873779297,
227
+ "train_runtime": 2822.8333,
228
+ "train_samples_per_second": 5.314,
229
+ "train_steps_per_second": 5.314
230
+ }
231
+ ],
232
+ "logging_steps": 500,
233
+ "max_steps": 15000,
234
+ "num_input_tokens_seen": 0,
235
+ "num_train_epochs": 1,
236
+ "save_steps": 5000,
237
+ "stateful_callbacks": {
238
+ "TrainerControl": {
239
+ "args": {
240
+ "should_epoch_stop": false,
241
+ "should_evaluate": false,
242
+ "should_log": false,
243
+ "should_save": true,
244
+ "should_training_stop": true
245
+ },
246
+ "attributes": {}
247
+ }
248
+ },
249
+ "total_flos": 1.6839046112147866e+17,
250
+ "train_batch_size": 1,
251
+ "trial_name": null,
252
+ "trial_params": null
253
+ }