baby-dev committed on
Commit
57a690f
·
verified ·
1 Parent(s): 1b745aa

Training in progress, epoch 0, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -20,13 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "v_proj",
24
  "q_proj",
25
- "down_proj",
26
  "o_proj",
27
  "gate_proj",
28
- "up_proj",
29
- "k_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "up_proj",
24
  "q_proj",
25
+ "k_proj",
26
  "o_proj",
27
  "gate_proj",
28
+ "v_proj",
29
+ "down_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:02e9e6cfd491339a63f64471a7a81cda3f53d2d51980e1b00307bca9bb403b17
3
  size 48552
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c693a75292cc1fa3a9f6f73a72febe4c6e07a58a4197545a4c4d4fd6e879b5ee
3
  size 48552
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d52d2ede02987bc8297c1be2a75b0d97e64a5099ea3b24849d0d3eca58d5dc89
3
  size 107046
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:294259cb2c0619098026d5dab6b1d13ce9ce514c68aaa0eb013550087742d2a7
3
  size 107046
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac03d4c1e9618de3b84437676fe2619fcfaff76bcfcfab780d49e69aaca826ea
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3aa86072d894b06ed15a781ebf27303cfbb0a54143c21949f75c3251dd2ec55c
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a6e8ea84c6ecd44137e4a21b32f529d3562b8f29f7c3085359bd396071d4c55
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7962e06291987b3327e85ffe1a0f48aec5a6651b8b5139bec8a1fd5526407429
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,116 +1,184 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 2.0041928721174003,
5
  "eval_steps": 500,
6
- "global_step": 239,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  {
12
  "epoch": 0.16771488469601678,
13
- "grad_norm": 0.023019764572381973,
14
  "learning_rate": 0.000233974358974359,
15
- "loss": 11.9301,
16
  "step": 20
17
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  {
19
  "epoch": 0.33542976939203356,
20
- "grad_norm": 0.05555475503206253,
21
  "learning_rate": 0.0002126068376068376,
22
- "loss": 11.9266,
23
  "step": 40
24
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  {
26
  "epoch": 0.5031446540880503,
27
- "grad_norm": 0.04104316607117653,
28
  "learning_rate": 0.00019123931623931623,
29
- "loss": 11.9229,
30
  "step": 60
31
  },
32
  {
33
- "epoch": 0.6708595387840671,
34
- "grad_norm": 0.0416753776371479,
35
- "learning_rate": 0.0001698717948717949,
36
- "loss": 11.9211,
37
- "step": 80
38
  },
39
  {
40
- "epoch": 0.8385744234800838,
41
- "grad_norm": 0.04827936738729477,
42
- "learning_rate": 0.0001485042735042735,
43
- "loss": 11.9211,
44
- "step": 100
45
  },
46
  {
47
- "epoch": 0.9979035639412998,
48
- "eval_loss": 11.919425964355469,
49
- "eval_runtime": 0.4166,
50
- "eval_samples_per_second": 242.468,
51
- "eval_steps_per_second": 62.417,
52
- "step": 119
 
 
 
 
 
 
53
  },
54
  {
55
- "epoch": 1.0062893081761006,
56
- "grad_norm": 0.11942638456821442,
57
- "learning_rate": 0.00012713675213675213,
58
- "loss": 12.3702,
59
- "step": 120
60
  },
61
  {
62
- "epoch": 1.1740041928721174,
63
- "grad_norm": 0.059040140360593796,
64
- "learning_rate": 0.00010576923076923077,
65
- "loss": 11.8371,
66
- "step": 140
67
  },
68
  {
69
- "epoch": 1.3417190775681342,
70
- "grad_norm": 0.06853944063186646,
71
- "learning_rate": 8.440170940170941e-05,
72
- "loss": 12.0029,
73
- "step": 160
74
  },
75
  {
76
- "epoch": 1.509433962264151,
77
- "grad_norm": 0.030212825164198875,
78
- "learning_rate": 6.303418803418804e-05,
79
- "loss": 11.9184,
80
- "step": 180
81
  },
82
  {
83
- "epoch": 1.6771488469601676,
84
- "grad_norm": 0.034588687121868134,
85
- "learning_rate": 4.1666666666666665e-05,
86
- "loss": 11.9407,
87
- "step": 200
88
  },
89
  {
90
- "epoch": 1.8448637316561844,
91
- "grad_norm": 0.0361829474568367,
92
- "learning_rate": 2.02991452991453e-05,
93
- "loss": 11.8561,
94
- "step": 220
95
  },
96
  {
97
- "epoch": 1.9958071278825997,
98
- "eval_loss": 11.917513847351074,
99
- "eval_runtime": 0.4181,
100
- "eval_samples_per_second": 241.592,
101
- "eval_steps_per_second": 62.192,
102
- "step": 238
103
  },
104
  {
105
- "epoch": 2.0041928721174003,
106
- "eval_loss": 11.917852401733398,
107
- "eval_runtime": 0.4182,
108
- "eval_samples_per_second": 241.517,
109
- "eval_steps_per_second": 62.173,
110
- "step": 239
111
  }
112
  ],
113
- "logging_steps": 20,
114
  "max_steps": 239,
115
  "num_input_tokens_seen": 0,
116
  "num_train_epochs": 3,
@@ -122,12 +190,12 @@
122
  "should_evaluate": false,
123
  "should_log": false,
124
  "should_save": true,
125
- "should_training_stop": true
126
  },
127
  "attributes": {}
128
  }
129
  },
130
- "total_flos": 269012385792.0,
131
  "train_batch_size": 4,
132
  "trial_name": null,
133
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9979035639412998,
5
  "eval_steps": 500,
6
+ "global_step": 119,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
+ {
12
+ "epoch": 0.041928721174004195,
13
+ "grad_norm": 0.011030412279069424,
14
+ "learning_rate": 0.00025,
15
+ "loss": 11.9318,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.08385744234800839,
20
+ "grad_norm": 0.014100322499871254,
21
+ "learning_rate": 0.00024465811965811965,
22
+ "loss": 11.9305,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.12578616352201258,
27
+ "grad_norm": 0.017396269366145134,
28
+ "learning_rate": 0.00023931623931623932,
29
+ "loss": 11.9291,
30
+ "step": 15
31
+ },
32
  {
33
  "epoch": 0.16771488469601678,
34
+ "grad_norm": 0.022825436666607857,
35
  "learning_rate": 0.000233974358974359,
36
+ "loss": 11.9293,
37
  "step": 20
38
  },
39
+ {
40
+ "epoch": 0.20964360587002095,
41
+ "grad_norm": 0.030763259157538414,
42
+ "learning_rate": 0.00022863247863247864,
43
+ "loss": 11.928,
44
+ "step": 25
45
+ },
46
+ {
47
+ "epoch": 0.25157232704402516,
48
+ "grad_norm": 0.05623968690633774,
49
+ "learning_rate": 0.0002232905982905983,
50
+ "loss": 11.9273,
51
+ "step": 30
52
+ },
53
+ {
54
+ "epoch": 0.29350104821802936,
55
+ "grad_norm": 0.0468871183693409,
56
+ "learning_rate": 0.00021794871794871795,
57
+ "loss": 11.9263,
58
+ "step": 35
59
+ },
60
  {
61
  "epoch": 0.33542976939203356,
62
+ "grad_norm": 0.05555358901619911,
63
  "learning_rate": 0.0002126068376068376,
64
+ "loss": 11.9248,
65
  "step": 40
66
  },
67
+ {
68
+ "epoch": 0.37735849056603776,
69
+ "grad_norm": 0.0784514918923378,
70
+ "learning_rate": 0.00020726495726495727,
71
+ "loss": 11.9244,
72
+ "step": 45
73
+ },
74
+ {
75
+ "epoch": 0.4192872117400419,
76
+ "grad_norm": 0.05951184406876564,
77
+ "learning_rate": 0.00020192307692307694,
78
+ "loss": 11.9228,
79
+ "step": 50
80
+ },
81
+ {
82
+ "epoch": 0.4612159329140461,
83
+ "grad_norm": 0.057042159140110016,
84
+ "learning_rate": 0.00019658119658119659,
85
+ "loss": 11.9221,
86
+ "step": 55
87
+ },
88
  {
89
  "epoch": 0.5031446540880503,
90
+ "grad_norm": 0.04163195937871933,
91
  "learning_rate": 0.00019123931623931623,
92
+ "loss": 11.9225,
93
  "step": 60
94
  },
95
  {
96
+ "epoch": 0.5450733752620545,
97
+ "grad_norm": 0.03262303024530411,
98
+ "learning_rate": 0.0001858974358974359,
99
+ "loss": 11.9226,
100
+ "step": 65
101
  },
102
  {
103
+ "epoch": 0.5870020964360587,
104
+ "grad_norm": 0.05241989716887474,
105
+ "learning_rate": 0.00018055555555555555,
106
+ "loss": 11.922,
107
+ "step": 70
108
  },
109
  {
110
+ "epoch": 0.6289308176100629,
111
+ "grad_norm": 0.06784799695014954,
112
+ "learning_rate": 0.00017521367521367522,
113
+ "loss": 11.9214,
114
+ "step": 75
115
+ },
116
+ {
117
+ "epoch": 0.6708595387840671,
118
+ "grad_norm": 0.042793747037649155,
119
+ "learning_rate": 0.0001698717948717949,
120
+ "loss": 11.9183,
121
+ "step": 80
122
  },
123
  {
124
+ "epoch": 0.7127882599580713,
125
+ "grad_norm": 0.0430237241089344,
126
+ "learning_rate": 0.00016452991452991454,
127
+ "loss": 11.9216,
128
+ "step": 85
129
  },
130
  {
131
+ "epoch": 0.7547169811320755,
132
+ "grad_norm": 0.03868071734905243,
133
+ "learning_rate": 0.00015918803418803418,
134
+ "loss": 11.9194,
135
+ "step": 90
136
  },
137
  {
138
+ "epoch": 0.7966457023060797,
139
+ "grad_norm": 0.024328265339136124,
140
+ "learning_rate": 0.00015384615384615385,
141
+ "loss": 11.9217,
142
+ "step": 95
143
  },
144
  {
145
+ "epoch": 0.8385744234800838,
146
+ "grad_norm": 0.04353172332048416,
147
+ "learning_rate": 0.0001485042735042735,
148
+ "loss": 11.9212,
149
+ "step": 100
150
  },
151
  {
152
+ "epoch": 0.8805031446540881,
153
+ "grad_norm": 0.057023949921131134,
154
+ "learning_rate": 0.00014316239316239317,
155
+ "loss": 11.92,
156
+ "step": 105
157
  },
158
  {
159
+ "epoch": 0.9224318658280922,
160
+ "grad_norm": 0.039732299745082855,
161
+ "learning_rate": 0.00013782051282051284,
162
+ "loss": 11.9183,
163
+ "step": 110
164
  },
165
  {
166
+ "epoch": 0.9643605870020965,
167
+ "grad_norm": 0.0544021911919117,
168
+ "learning_rate": 0.00013247863247863248,
169
+ "loss": 11.9203,
170
+ "step": 115
 
171
  },
172
  {
173
+ "epoch": 0.9979035639412998,
174
+ "eval_loss": 11.919066429138184,
175
+ "eval_runtime": 0.416,
176
+ "eval_samples_per_second": 242.779,
177
+ "eval_steps_per_second": 62.498,
178
+ "step": 119
179
  }
180
  ],
181
+ "logging_steps": 5,
182
  "max_steps": 239,
183
  "num_input_tokens_seen": 0,
184
  "num_train_epochs": 3,
 
190
  "should_evaluate": false,
191
  "should_log": false,
192
  "should_save": true,
193
+ "should_training_stop": false
194
  },
195
  "attributes": {}
196
  }
197
  },
198
+ "total_flos": 134180413440.0,
199
  "train_batch_size": 4,
200
  "trial_name": null,
201
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d6409c0415d59e9b8f683a402ccbf4bf65283e87400f87e225cb3fa657625637
3
  size 6776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47a2cea0175391dc361c529537163a9e0c00600decdea164c934ea89be542405
3
  size 6776