nhxnnz committed on
Commit
516d064
·
verified ·
1 Parent(s): 4d71851

End of training

Browse files
README.md CHANGED
@@ -45,7 +45,7 @@ The following hyperparameters were used during training:
45
  - total_train_batch_size: 8
46
  - optimizer: Use OptimizerNames.PAGED_ADAMW_8BIT with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
47
  - lr_scheduler_type: linear
48
- - num_epochs: 3
49
  - mixed_precision_training: Native AMP
50
 
51
  ### Training results
 
45
  - total_train_batch_size: 8
46
  - optimizer: Use OptimizerNames.PAGED_ADAMW_8BIT with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
47
  - lr_scheduler_type: linear
48
+ - num_epochs: 5
49
  - mixed_precision_training: Native AMP
50
 
51
  ### Training results
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:63bc963057a86f8afa5f6de9fd9f42b884bd976ba06efda3654a50ae41657833
3
  size 7098064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db989c634262e3d7b817f801025496595cf823d8c12471a07955f7dd660fe0fc
3
  size 7098064
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 3.0,
3
- "total_flos": 2.42193908736e+18,
4
- "train_loss": 0.6797876440482455,
5
- "train_runtime": 2875.7206,
6
- "train_samples_per_second": 2.893,
7
- "train_steps_per_second": 0.362
8
  }
 
1
  {
2
+ "epoch": 5.0,
3
+ "total_flos": 4.0365651456e+18,
4
+ "train_loss": 0.3920804708079577,
5
+ "train_runtime": 3538.8693,
6
+ "train_samples_per_second": 3.918,
7
+ "train_steps_per_second": 0.49
8
  }
config.json ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "openai/whisper-small",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "gelu",
5
+ "apply_spec_augment": false,
6
+ "architectures": [
7
+ "WhisperForConditionalGeneration"
8
+ ],
9
+ "attention_dropout": 0.0,
10
+ "begin_suppress_tokens": null,
11
+ "bos_token_id": 50257,
12
+ "classifier_proj_size": 256,
13
+ "d_model": 768,
14
+ "decoder_attention_heads": 12,
15
+ "decoder_ffn_dim": 3072,
16
+ "decoder_layerdrop": 0.0,
17
+ "decoder_layers": 12,
18
+ "decoder_start_token_id": 50258,
19
+ "dropout": 0.0,
20
+ "encoder_attention_heads": 12,
21
+ "encoder_ffn_dim": 3072,
22
+ "encoder_layerdrop": 0.0,
23
+ "encoder_layers": 12,
24
+ "eos_token_id": 50257,
25
+ "forced_decoder_ids": [
26
+ [
27
+ 1,
28
+ 50259
29
+ ],
30
+ [
31
+ 2,
32
+ 50359
33
+ ],
34
+ [
35
+ 3,
36
+ 50363
37
+ ]
38
+ ],
39
+ "init_std": 0.02,
40
+ "is_encoder_decoder": true,
41
+ "mask_feature_length": 10,
42
+ "mask_feature_min_masks": 0,
43
+ "mask_feature_prob": 0.0,
44
+ "mask_time_length": 10,
45
+ "mask_time_min_masks": 2,
46
+ "mask_time_prob": 0.05,
47
+ "max_length": null,
48
+ "max_source_positions": 1500,
49
+ "max_target_positions": 448,
50
+ "median_filter_width": 7,
51
+ "model_type": "whisper",
52
+ "num_hidden_layers": 12,
53
+ "num_mel_bins": 80,
54
+ "pad_token_id": 50257,
55
+ "quantization_config": {
56
+ "_load_in_4bit": true,
57
+ "_load_in_8bit": false,
58
+ "bnb_4bit_compute_dtype": "bfloat16",
59
+ "bnb_4bit_quant_storage": "uint8",
60
+ "bnb_4bit_quant_type": "nf4",
61
+ "bnb_4bit_use_double_quant": true,
62
+ "llm_int8_enable_fp32_cpu_offload": false,
63
+ "llm_int8_has_fp16_weight": false,
64
+ "llm_int8_skip_modules": null,
65
+ "llm_int8_threshold": 6.0,
66
+ "load_in_4bit": true,
67
+ "load_in_8bit": false,
68
+ "quant_method": "bitsandbytes"
69
+ },
70
+ "scale_embedding": false,
71
+ "torch_dtype": "float16",
72
+ "transformers_version": "4.49.0.dev0",
73
+ "use_cache": true,
74
+ "use_weighted_layer_sum": false,
75
+ "vocab_size": 51865
76
+ }
generation_config.json CHANGED
@@ -250,8 +250,6 @@
250
  49870,
251
  50254,
252
  50258,
253
- 50358,
254
- 50359,
255
  50360,
256
  50361,
257
  50362
 
250
  49870,
251
  50254,
252
  50258,
 
 
253
  50360,
254
  50361,
255
  50362
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a06837e6070f7a623601458bea61b85fb65e0cf1f3c031cb8813aa17f3d8fb26
3
+ size 189759653
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 3.0,
3
- "total_flos": 2.42193908736e+18,
4
- "train_loss": 0.6797876440482455,
5
- "train_runtime": 2875.7206,
6
- "train_samples_per_second": 2.893,
7
- "train_steps_per_second": 0.362
8
  }
 
1
  {
2
+ "epoch": 5.0,
3
+ "total_flos": 4.0365651456e+18,
4
+ "train_loss": 0.3920804708079577,
5
+ "train_runtime": 3538.8693,
6
+ "train_samples_per_second": 3.918,
7
+ "train_steps_per_second": 0.49
8
  }
trainer_state.json CHANGED
@@ -1,133 +1,168 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 3.0,
5
- "eval_steps": 200,
6
- "global_step": 1041,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.2881844380403458,
13
- "grad_norm": 1.327895164489746,
14
- "learning_rate": 0.0001815561959654179,
15
- "loss": 3.1246,
16
  "step": 100
17
  },
18
  {
19
  "epoch": 0.5763688760806917,
20
- "grad_norm": 1.4119809865951538,
21
- "learning_rate": 0.00016234390009606147,
22
- "loss": 0.8831,
23
- "step": 200
24
- },
25
- {
26
- "epoch": 0.5763688760806917,
27
- "eval_runtime": 221.4793,
28
- "eval_samples_per_second": 5.585,
29
- "eval_steps_per_second": 2.795,
30
  "step": 200
31
  },
32
  {
33
  "epoch": 0.8645533141210374,
34
- "grad_norm": 1.9445453882217407,
35
- "learning_rate": 0.0001431316042267051,
36
- "loss": 0.4528,
37
  "step": 300
38
  },
39
  {
40
  "epoch": 1.1527377521613833,
41
- "grad_norm": 1.2831073999404907,
42
- "learning_rate": 0.00012391930835734872,
43
- "loss": 0.4009,
44
- "step": 400
45
- },
46
- {
47
- "epoch": 1.1527377521613833,
48
- "eval_runtime": 222.2115,
49
- "eval_samples_per_second": 5.567,
50
- "eval_steps_per_second": 2.786,
51
  "step": 400
52
  },
53
  {
54
  "epoch": 1.440922190201729,
55
- "grad_norm": 1.79501473903656,
56
- "learning_rate": 0.00010470701248799233,
57
- "loss": 0.368,
58
  "step": 500
59
  },
60
  {
61
- "epoch": 1.729106628242075,
62
- "grad_norm": 1.340496301651001,
63
- "learning_rate": 8.549471661863592e-05,
64
- "loss": 0.3721,
65
- "step": 600
66
  },
67
  {
68
  "epoch": 1.729106628242075,
69
- "eval_runtime": 223.5538,
70
- "eval_samples_per_second": 5.533,
71
- "eval_steps_per_second": 2.769,
72
  "step": 600
73
  },
74
  {
75
  "epoch": 2.0172910662824206,
76
- "grad_norm": 1.4648982286453247,
77
- "learning_rate": 6.628242074927953e-05,
78
- "loss": 0.3737,
79
  "step": 700
80
  },
81
  {
82
  "epoch": 2.3054755043227666,
83
- "grad_norm": 1.6813404560089111,
84
- "learning_rate": 4.7070124879923156e-05,
85
- "loss": 0.3306,
86
- "step": 800
87
- },
88
- {
89
- "epoch": 2.3054755043227666,
90
- "eval_runtime": 224.219,
91
- "eval_samples_per_second": 5.517,
92
- "eval_steps_per_second": 2.761,
93
  "step": 800
94
  },
95
  {
96
  "epoch": 2.5936599423631126,
97
- "grad_norm": 1.084306001663208,
98
- "learning_rate": 2.7857829010566765e-05,
99
- "loss": 0.3155,
100
  "step": 900
101
  },
102
  {
103
  "epoch": 2.881844380403458,
104
- "grad_norm": 1.6043856143951416,
105
- "learning_rate": 8.645533141210376e-06,
106
- "loss": 0.3168,
107
  "step": 1000
108
  },
109
  {
110
  "epoch": 2.881844380403458,
111
- "eval_runtime": 225.5438,
112
- "eval_samples_per_second": 5.485,
113
- "eval_steps_per_second": 2.744,
114
  "step": 1000
115
  },
116
  {
117
- "epoch": 3.0,
118
- "step": 1041,
119
- "total_flos": 2.42193908736e+18,
120
- "train_loss": 0.6797876440482455,
121
- "train_runtime": 2875.7206,
122
- "train_samples_per_second": 2.893,
123
- "train_steps_per_second": 0.362
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  }
125
  ],
126
  "logging_steps": 100,
127
- "max_steps": 1041,
128
  "num_input_tokens_seen": 0,
129
- "num_train_epochs": 3,
130
- "save_steps": 200,
131
  "stateful_callbacks": {
132
  "TrainerControl": {
133
  "args": {
@@ -140,7 +175,7 @@
140
  "attributes": {}
141
  }
142
  },
143
- "total_flos": 2.42193908736e+18,
144
  "train_batch_size": 4,
145
  "trial_name": null,
146
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 5.0,
5
+ "eval_steps": 500,
6
+ "global_step": 1735,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.2881844380403458,
13
+ "grad_norm": 1.3598850965499878,
14
+ "learning_rate": 0.00018881844380403459,
15
+ "loss": 1.6553,
16
  "step": 100
17
  },
18
  {
19
  "epoch": 0.5763688760806917,
20
+ "grad_norm": 1.4105329513549805,
21
+ "learning_rate": 0.00017729106628242076,
22
+ "loss": 0.4925,
 
 
 
 
 
 
 
23
  "step": 200
24
  },
25
  {
26
  "epoch": 0.8645533141210374,
27
+ "grad_norm": 1.7036561965942383,
28
+ "learning_rate": 0.0001657636887608069,
29
+ "loss": 0.4314,
30
  "step": 300
31
  },
32
  {
33
  "epoch": 1.1527377521613833,
34
+ "grad_norm": 1.3207303285598755,
35
+ "learning_rate": 0.0001542363112391931,
36
+ "loss": 0.3837,
 
 
 
 
 
 
 
37
  "step": 400
38
  },
39
  {
40
  "epoch": 1.440922190201729,
41
+ "grad_norm": 1.6807191371917725,
42
+ "learning_rate": 0.00014270893371757926,
43
+ "loss": 0.3506,
44
  "step": 500
45
  },
46
  {
47
+ "epoch": 1.440922190201729,
48
+ "eval_runtime": 227.6958,
49
+ "eval_samples_per_second": 5.433,
50
+ "eval_steps_per_second": 2.719,
51
+ "step": 500
52
  },
53
  {
54
  "epoch": 1.729106628242075,
55
+ "grad_norm": 1.200997233390808,
56
+ "learning_rate": 0.0001311815561959654,
57
+ "loss": 0.358,
58
  "step": 600
59
  },
60
  {
61
  "epoch": 2.0172910662824206,
62
+ "grad_norm": 1.296838402748108,
63
+ "learning_rate": 0.00011965417867435158,
64
+ "loss": 0.3602,
65
  "step": 700
66
  },
67
  {
68
  "epoch": 2.3054755043227666,
69
+ "grad_norm": 1.4795269966125488,
70
+ "learning_rate": 0.00010812680115273777,
71
+ "loss": 0.3105,
 
 
 
 
 
 
 
72
  "step": 800
73
  },
74
  {
75
  "epoch": 2.5936599423631126,
76
+ "grad_norm": 1.107097864151001,
77
+ "learning_rate": 9.659942363112392e-05,
78
+ "loss": 0.2963,
79
  "step": 900
80
  },
81
  {
82
  "epoch": 2.881844380403458,
83
+ "grad_norm": 1.2915793657302856,
84
+ "learning_rate": 8.507204610951009e-05,
85
+ "loss": 0.2954,
86
  "step": 1000
87
  },
88
  {
89
  "epoch": 2.881844380403458,
90
+ "eval_runtime": 228.8311,
91
+ "eval_samples_per_second": 5.406,
92
+ "eval_steps_per_second": 2.705,
93
  "step": 1000
94
  },
95
  {
96
+ "epoch": 3.170028818443804,
97
+ "grad_norm": 1.0209407806396484,
98
+ "learning_rate": 7.354466858789625e-05,
99
+ "loss": 0.292,
100
+ "step": 1100
101
+ },
102
+ {
103
+ "epoch": 3.4582132564841497,
104
+ "grad_norm": 1.3803058862686157,
105
+ "learning_rate": 6.201729106628242e-05,
106
+ "loss": 0.2548,
107
+ "step": 1200
108
+ },
109
+ {
110
+ "epoch": 3.7463976945244957,
111
+ "grad_norm": 1.1842936277389526,
112
+ "learning_rate": 5.048991354466859e-05,
113
+ "loss": 0.2808,
114
+ "step": 1300
115
+ },
116
+ {
117
+ "epoch": 4.034582132564841,
118
+ "grad_norm": 1.5686421394348145,
119
+ "learning_rate": 3.8962536023054756e-05,
120
+ "loss": 0.249,
121
+ "step": 1400
122
+ },
123
+ {
124
+ "epoch": 4.322766570605188,
125
+ "grad_norm": 1.2323178052902222,
126
+ "learning_rate": 2.7435158501440923e-05,
127
+ "loss": 0.2317,
128
+ "step": 1500
129
+ },
130
+ {
131
+ "epoch": 4.322766570605188,
132
+ "eval_runtime": 226.3984,
133
+ "eval_samples_per_second": 5.464,
134
+ "eval_steps_per_second": 2.734,
135
+ "step": 1500
136
+ },
137
+ {
138
+ "epoch": 4.610951008645533,
139
+ "grad_norm": 0.9450750946998596,
140
+ "learning_rate": 1.590778097982709e-05,
141
+ "loss": 0.2381,
142
+ "step": 1600
143
+ },
144
+ {
145
+ "epoch": 4.899135446685879,
146
+ "grad_norm": 1.4306625127792358,
147
+ "learning_rate": 4.380403458213257e-06,
148
+ "loss": 0.2411,
149
+ "step": 1700
150
+ },
151
+ {
152
+ "epoch": 5.0,
153
+ "step": 1735,
154
+ "total_flos": 4.0365651456e+18,
155
+ "train_loss": 0.3920804708079577,
156
+ "train_runtime": 3538.8693,
157
+ "train_samples_per_second": 3.918,
158
+ "train_steps_per_second": 0.49
159
  }
160
  ],
161
  "logging_steps": 100,
162
+ "max_steps": 1735,
163
  "num_input_tokens_seen": 0,
164
+ "num_train_epochs": 5,
165
+ "save_steps": 500,
166
  "stateful_callbacks": {
167
  "TrainerControl": {
168
  "args": {
 
175
  "attributes": {}
176
  }
177
  },
178
+ "total_flos": 4.0365651456e+18,
179
  "train_batch_size": 4,
180
  "trial_name": null,
181
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:10fe486164d02b5141ca201b6f0bd243d13bc3799a3938aa4a56750bdd9b4ffb
3
  size 5304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b30f32d48e6fc7612992dedde0fe7264e5f8905c5bd332ddc71486c1d962cef3
3
  size 5304