KrafterDen committed on
Commit
0683525
·
verified ·
1 Parent(s): d511b91

Training in progress, step 200, checkpoint

Browse files
checkpoint-200/README.md CHANGED
@@ -202,4 +202,5 @@ Carbon emissions can be estimated using the [Machine Learning Impact calculator]
202
  ### Framework versions
203
 
204
  - PEFT 0.10.0
205
- - PEFT 0.8.2
 
 
202
  ### Framework versions
203
 
204
  - PEFT 0.10.0
205
+ - PEFT 0.8.2
206
+ - PEFT 0.7.1
checkpoint-200/adapter_config.json CHANGED
@@ -3,7 +3,7 @@
3
  "auto_mapping": null,
4
  "base_model_name_or_path": "exontidev/SISUS_SIKERS",
5
  "bias": "none",
6
- "fan_in_fan_out": false,
7
  "inference_mode": true,
8
  "init_lora_weights": true,
9
  "layer_replication": null,
 
3
  "auto_mapping": null,
4
  "base_model_name_or_path": "exontidev/SISUS_SIKERS",
5
  "bias": "none",
6
+ "fan_in_fan_out": true,
7
  "inference_mode": true,
8
  "init_lora_weights": true,
9
  "layer_replication": null,
checkpoint-200/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed0f8a25c99bf0c4ece72eaba3a91a8ce363d040554758830c308008a85b8220
3
  size 9443384
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71674582f3c925fa5981ad28b64ef40fdc52f8835fc0c1b1e5d7975a9f2a9df9
3
  size 9443384
checkpoint-200/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:41a6fc26fd7b5026ca3631c40fc12ae6e5d9e13b4c209b07c2b04a2930045f4a
3
  size 18914450
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a6c13af21915fcb2a1612b1ad57e18218f8d524c9473f3630e0f05a64d19030
3
  size 18914450
checkpoint-200/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a1eea912470e6b4cd46b7542517282753ef83722547642edfbc4744d8b77eaf0
3
  size 14168
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de5ac2b40ae2cc9772fa4e3c73beb2b9702b79b18cb50371312d46079b5e66fe
3
  size 14168
checkpoint-200/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.4922698253980463,
5
  "eval_steps": 500,
6
  "global_step": 200,
7
  "is_hyper_param_search": false,
@@ -9,152 +9,152 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.42,
13
- "grad_norm": 0.1815568506717682,
14
  "learning_rate": 2.9999999999999997e-05,
15
- "loss": 3.8657,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.84,
20
- "grad_norm": 0.23455914855003357,
21
  "learning_rate": 5.9999999999999995e-05,
22
- "loss": 3.8223,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 1.25,
27
- "grad_norm": 0.32260793447494507,
28
  "learning_rate": 8.999999999999999e-05,
29
- "loss": 3.7255,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 1.67,
34
- "grad_norm": 0.38705918192863464,
35
  "learning_rate": 0.00011999999999999999,
36
- "loss": 3.4952,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 2.09,
41
- "grad_norm": 0.5273059606552124,
42
  "learning_rate": 0.00015,
43
- "loss": 3.098,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 2.51,
48
- "grad_norm": 0.6030514240264893,
49
  "learning_rate": 0.00017999999999999998,
50
- "loss": 2.5299,
51
  "step": 60
52
  },
53
  {
54
- "epoch": 2.92,
55
- "grad_norm": 0.459722101688385,
56
  "learning_rate": 0.00020999999999999998,
57
- "loss": 1.899,
58
  "step": 70
59
  },
60
  {
61
- "epoch": 3.34,
62
- "grad_norm": 0.1655016839504242,
63
  "learning_rate": 0.00023999999999999998,
64
- "loss": 1.6018,
65
  "step": 80
66
  },
67
  {
68
- "epoch": 3.76,
69
- "grad_norm": 0.10938003659248352,
70
  "learning_rate": 0.00027,
71
- "loss": 1.4726,
72
  "step": 90
73
  },
74
  {
75
- "epoch": 4.18,
76
- "grad_norm": 0.09813433140516281,
77
  "learning_rate": 0.0003,
78
- "loss": 1.4336,
79
  "step": 100
80
  },
81
  {
82
- "epoch": 0.27,
83
- "grad_norm": 0.2995990812778473,
84
  "learning_rate": 0.000285,
85
- "loss": 3.3878,
86
  "step": 110
87
  },
88
  {
89
- "epoch": 0.3,
90
- "grad_norm": 0.2459421455860138,
91
  "learning_rate": 0.00027,
92
- "loss": 3.0843,
93
  "step": 120
94
  },
95
  {
96
- "epoch": 0.32,
97
- "grad_norm": 0.2377060502767563,
98
  "learning_rate": 0.00025499999999999996,
99
- "loss": 2.8413,
100
  "step": 130
101
  },
102
  {
103
- "epoch": 0.34,
104
- "grad_norm": 0.1750001609325409,
105
  "learning_rate": 0.00023999999999999998,
106
- "loss": 2.7303,
107
  "step": 140
108
  },
109
  {
110
- "epoch": 0.37,
111
- "grad_norm": 0.1821776032447815,
112
  "learning_rate": 0.000225,
113
- "loss": 2.6535,
114
  "step": 150
115
  },
116
  {
117
- "epoch": 0.39,
118
- "grad_norm": 0.16587179899215698,
119
  "learning_rate": 0.00020999999999999998,
120
- "loss": 2.6147,
121
  "step": 160
122
  },
123
  {
124
- "epoch": 0.42,
125
- "grad_norm": 0.15111136436462402,
126
  "learning_rate": 0.000195,
127
- "loss": 2.5929,
128
  "step": 170
129
  },
130
  {
131
- "epoch": 0.44,
132
- "grad_norm": 0.13922317326068878,
133
  "learning_rate": 0.00017999999999999998,
134
- "loss": 2.554,
135
  "step": 180
136
  },
137
  {
138
- "epoch": 0.47,
139
- "grad_norm": 0.14242495596408844,
140
  "learning_rate": 0.000165,
141
- "loss": 2.5381,
142
  "step": 190
143
  },
144
  {
145
- "epoch": 0.49,
146
- "grad_norm": 0.1816890388727188,
147
  "learning_rate": 0.00015,
148
- "loss": 2.5145,
149
  "step": 200
150
  }
151
  ],
152
  "logging_steps": 10,
153
  "max_steps": 300,
154
  "num_input_tokens_seen": 0,
155
- "num_train_epochs": 1,
156
  "save_steps": 100,
157
- "total_flos": 3.2322500059336704e+16,
158
  "train_batch_size": 4,
159
  "trial_name": null,
160
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 8.35509138381201,
5
  "eval_steps": 500,
6
  "global_step": 200,
7
  "is_hyper_param_search": false,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.4177545691906005,
13
+ "grad_norm": 0.28227752447128296,
14
  "learning_rate": 2.9999999999999997e-05,
15
+ "loss": 4.1508,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.835509138381201,
20
+ "grad_norm": 0.31433430314064026,
21
  "learning_rate": 5.9999999999999995e-05,
22
+ "loss": 4.1593,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 1.2532637075718016,
27
+ "grad_norm": 0.3350953161716461,
28
  "learning_rate": 8.999999999999999e-05,
29
+ "loss": 4.0414,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 1.671018276762402,
34
+ "grad_norm": 0.2885706126689911,
35
  "learning_rate": 0.00011999999999999999,
36
+ "loss": 3.8411,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 2.0887728459530024,
41
+ "grad_norm": 0.23711609840393066,
42
  "learning_rate": 0.00015,
43
+ "loss": 3.6434,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 2.506527415143603,
48
+ "grad_norm": 0.21583135426044464,
49
  "learning_rate": 0.00017999999999999998,
50
+ "loss": 3.4636,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 2.9242819843342036,
55
+ "grad_norm": 0.18754692375659943,
56
  "learning_rate": 0.00020999999999999998,
57
+ "loss": 3.3154,
58
  "step": 70
59
  },
60
  {
61
+ "epoch": 3.342036553524804,
62
+ "grad_norm": 0.15951760113239288,
63
  "learning_rate": 0.00023999999999999998,
64
+ "loss": 3.2195,
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 3.759791122715405,
69
+ "grad_norm": 0.14639759063720703,
70
  "learning_rate": 0.00027,
71
+ "loss": 3.122,
72
  "step": 90
73
  },
74
  {
75
+ "epoch": 4.177545691906005,
76
+ "grad_norm": 0.1860765665769577,
77
  "learning_rate": 0.0003,
78
+ "loss": 3.0677,
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 4.595300261096606,
83
+ "grad_norm": 0.1737535446882248,
84
  "learning_rate": 0.000285,
85
+ "loss": 2.9992,
86
  "step": 110
87
  },
88
  {
89
+ "epoch": 5.013054830287206,
90
+ "grad_norm": 0.181383416056633,
91
  "learning_rate": 0.00027,
92
+ "loss": 2.9761,
93
  "step": 120
94
  },
95
  {
96
+ "epoch": 5.430809399477806,
97
+ "grad_norm": 0.1873219609260559,
98
  "learning_rate": 0.00025499999999999996,
99
+ "loss": 2.9281,
100
  "step": 130
101
  },
102
  {
103
+ "epoch": 5.848563968668407,
104
+ "grad_norm": 0.19864186644554138,
105
  "learning_rate": 0.00023999999999999998,
106
+ "loss": 2.9168,
107
  "step": 140
108
  },
109
  {
110
+ "epoch": 6.266318537859008,
111
+ "grad_norm": 0.22326301038265228,
112
  "learning_rate": 0.000225,
113
+ "loss": 2.8549,
114
  "step": 150
115
  },
116
  {
117
+ "epoch": 6.684073107049608,
118
+ "grad_norm": 0.2200121283531189,
119
  "learning_rate": 0.00020999999999999998,
120
+ "loss": 2.855,
121
  "step": 160
122
  },
123
  {
124
+ "epoch": 7.101827676240209,
125
+ "grad_norm": 0.2546086311340332,
126
  "learning_rate": 0.000195,
127
+ "loss": 2.8509,
128
  "step": 170
129
  },
130
  {
131
+ "epoch": 7.51958224543081,
132
+ "grad_norm": 0.26345309615135193,
133
  "learning_rate": 0.00017999999999999998,
134
+ "loss": 2.8144,
135
  "step": 180
136
  },
137
  {
138
+ "epoch": 7.93733681462141,
139
+ "grad_norm": 0.21533280611038208,
140
  "learning_rate": 0.000165,
141
+ "loss": 2.8006,
142
  "step": 190
143
  },
144
  {
145
+ "epoch": 8.35509138381201,
146
+ "grad_norm": 0.2510657012462616,
147
  "learning_rate": 0.00015,
148
+ "loss": 2.7816,
149
  "step": 200
150
  }
151
  ],
152
  "logging_steps": 10,
153
  "max_steps": 300,
154
  "num_input_tokens_seen": 0,
155
+ "num_train_epochs": 14,
156
  "save_steps": 100,
157
+ "total_flos": 3.235965641452339e+16,
158
  "train_batch_size": 4,
159
  "trial_name": null,
160
  "trial_params": null
checkpoint-200/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7189ea50d3101204ed0a14552e3e368c7afb6c90ecc8b189f33ef27c2f23742e
3
- size 4960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42329f5345a3c120af37c6fdbce453b0541524f81257e209baeb9a0b15e22c94
3
+ size 5024