saracandu commited on
Commit
893f8eb
·
verified ·
1 Parent(s): c1ad578

Delete last-checkpoint

Browse files
last-checkpoint/config.json DELETED
@@ -1,26 +0,0 @@
1
- {
2
- "architectures": [
3
- "STLEncoderModel"
4
- ],
5
- "auto_map": {
6
- "AutoConfig": "configuration_stlenc.STLEncoderConfig",
7
- "AutoModel": "modeling_stlenc.STLEncoderModel",
8
- "AutoTokenizer": [
9
- "tokenizer_stlenc.STLTokenizer",
10
- null
11
- ]
12
- },
13
- "bos_token_id": 2,
14
- "dtype": "float32",
15
- "embedding_dim_target": 1024,
16
- "eos_token_id": 3,
17
- "hidden_size": 1024,
18
- "intermediate_size": 4096,
19
- "max_position_embeddings": 512,
20
- "model_type": "stl_encoder",
21
- "num_attention_heads": 16,
22
- "num_hidden_layers": 12,
23
- "pad_token_id": 1,
24
- "transformers_version": "4.57.3",
25
- "vocab_size": 35
26
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/configuration_stlenc.py DELETED
@@ -1,23 +0,0 @@
1
- from transformers import PretrainedConfig
2
-
3
- class STLEncoderConfig(PretrainedConfig):
4
- model_type = "stl_encoder"
5
- def __init__(
6
- self,
7
- vocab_size=35,
8
- hidden_size=1024,
9
- num_hidden_layers=12,
10
- num_attention_heads=16,
11
- intermediate_size=4096,
12
- max_position_embeddings=512,
13
- embedding_dim_target=1024,
14
- **kwargs
15
- ):
16
- super().__init__(**kwargs)
17
- self.vocab_size = vocab_size
18
- self.hidden_size = hidden_size
19
- self.num_hidden_layers = num_hidden_layers
20
- self.num_attention_heads = num_attention_heads
21
- self.intermediate_size = intermediate_size
22
- self.max_position_embeddings = max_position_embeddings
23
- self.embedding_dim_target = embedding_dim_target
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c14360134ba900d2dec38e5d1938ef436c781041120dd2b1646f4d2872f9d93
3
- size 611073224
 
 
 
 
last-checkpoint/modeling_stlenc.py DELETED
@@ -1,35 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
- from transformers import PreTrainedModel
4
- from .configuration_stlenc import STLEncoderConfig
5
-
6
- class STLEncoderModel(PreTrainedModel):
7
- config_class = STLEncoderConfig
8
-
9
- def __init__(self, config):
10
- super().__init__(config)
11
- self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
12
- self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
13
-
14
- encoder_layer = nn.TransformerEncoderLayer(
15
- d_model=config.hidden_size,
16
- nhead=config.num_attention_heads,
17
- dim_feedforward=config.intermediate_size,
18
- batch_first=True
19
- )
20
- self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=config.num_hidden_layers)
21
- self.pooler = nn.Linear(config.hidden_size, config.embedding_dim_target)
22
- self.activation = nn.Tanh()
23
- self.post_init()
24
-
25
- def forward(self, input_ids, attention_mask=None, **kwargs):
26
- batch_size, seq_length = input_ids.size()
27
- position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
28
- position_ids = position_ids.unsqueeze(0).expand(batch_size, seq_length)
29
-
30
- x = self.embeddings(input_ids) + self.position_embeddings(position_ids)
31
- padding_mask = (attention_mask == 0) if attention_mask is not None else None
32
- x = self.encoder(x, src_key_padding_mask=padding_mask)
33
-
34
- pooled_output = self.activation(self.pooler(x[:, 0, :]))
35
- return pooled_output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:99060a481ee1e9c32b48e8e0ca4159f4449a57541b54802b60aeea2b6c77354f
3
- size 1222241675
 
 
 
 
last-checkpoint/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9301fdb63c8f7e117dacb34dd6b2675f07003e4703c56505ff6c5837f6209a98
3
- size 14645
 
 
 
 
last-checkpoint/scaler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ca372268f4fa9335030c0cb7aedb6cdba75f457da50e7a4034abb1a2d0843689
3
- size 1383
 
 
 
 
last-checkpoint/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:314048210ad2604ec1b09b90e17a875d2f267f3c96d2b6a50754bb2f69863b15
3
- size 1465
 
 
 
 
last-checkpoint/special_tokens_map.json DELETED
@@ -1,30 +0,0 @@
1
- {
2
- "bos_token": {
3
- "content": "/s",
4
- "lstrip": false,
5
- "normalized": false,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "eos_token": {
10
- "content": "s",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "pad_token": {
17
- "content": "pad",
18
- "lstrip": false,
19
- "normalized": false,
20
- "rstrip": false,
21
- "single_word": false
22
- },
23
- "unk_token": {
24
- "content": "unk",
25
- "lstrip": false,
26
- "normalized": false,
27
- "rstrip": false,
28
- "single_word": false
29
- }
30
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/tokenizer_config.json DELETED
@@ -1,50 +0,0 @@
1
- {
2
- "added_tokens_decoder": {
3
- "0": {
4
- "content": "unk",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false,
9
- "special": true
10
- },
11
- "1": {
12
- "content": "pad",
13
- "lstrip": false,
14
- "normalized": false,
15
- "rstrip": false,
16
- "single_word": false,
17
- "special": true
18
- },
19
- "2": {
20
- "content": "/s",
21
- "lstrip": false,
22
- "normalized": false,
23
- "rstrip": false,
24
- "single_word": false,
25
- "special": true
26
- },
27
- "3": {
28
- "content": "s",
29
- "lstrip": false,
30
- "normalized": false,
31
- "rstrip": false,
32
- "single_word": false,
33
- "special": true
34
- }
35
- },
36
- "auto_map": {
37
- "AutoTokenizer": [
38
- "tokenizer_stlenc.STLTokenizer",
39
- null
40
- ]
41
- },
42
- "bos_token": "/s",
43
- "clean_up_tokenization_spaces": false,
44
- "eos_token": "s",
45
- "extra_special_tokens": {},
46
- "model_max_length": 512,
47
- "pad_token": "pad",
48
- "tokenizer_class": "STLTokenizer",
49
- "unk_token": "unk"
50
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/tokenizer_stlenc.py DELETED
@@ -1,93 +0,0 @@
1
- import json
2
- import os
3
- import torch
4
- from typing import Any, Dict, List, Optional, Tuple, Union
5
- from transformers import PreTrainedTokenizer, AutoTokenizer
6
-
7
- class STLTokenizer(PreTrainedTokenizer):
8
- model_type = "stl_encoder"
9
-
10
- def __init__(
11
- self,
12
- vocab_file="vocab.json",
13
- unk_token="unk",
14
- pad_token="pad",
15
- bos_token="/s",
16
- eos_token="s",
17
- model_max_length=512,
18
- **kwargs
19
- ):
20
- current_dir = os.path.dirname(__file__)
21
- full_vocab_path = os.path.join(current_dir, vocab_file)
22
-
23
- if not os.path.exists(full_vocab_path):
24
- from huggingface_hub import hf_hub_download
25
- try:
26
- full_vocab_path = hf_hub_download("saracandu/stlenc", vocab_file)
27
- except:
28
- full_vocab_path = vocab_file
29
-
30
- with open(full_vocab_path, "r", encoding="utf-8") as f:
31
- self.vocab = json.load(f)
32
-
33
- self.id_to_token = {v: k for k, v in self.vocab.items()}
34
-
35
- super().__init__(
36
- unk_token=unk_token,
37
- pad_token=pad_token,
38
- bos_token=bos_token,
39
- eos_token=eos_token,
40
- model_max_length=model_max_length,
41
- **kwargs
42
- )
43
-
44
- @property
45
- def vocab_size(self) -> int:
46
- return len(self.vocab)
47
-
48
- def get_vocab(self) -> Dict[str, int]:
49
- return dict(self.vocab)
50
-
51
- def _tokenize(self, text: str) -> List[str]:
52
- text = f'{self.bos_token} {text} {self.eos_token}'.replace(' ', '@')
53
-
54
- tokens = []
55
- i = 0
56
- while i < len(text):
57
- best_match = None
58
- for j in range(min(i + 50, len(text)), i, -1):
59
- subtoken = text[i:j]
60
- if subtoken in self.vocab:
61
- best_match = subtoken
62
- break
63
-
64
- if best_match:
65
- tokens.append(best_match)
66
- i += len(best_match)
67
- else:
68
- tokens.append(self.unk_token)
69
- i += 1
70
- return tokens
71
-
72
- def _convert_token_to_id(self, token: str) -> int:
73
- return self.vocab.get(token, self.vocab.get(self.unk_token))
74
-
75
- def _convert_id_to_token(self, index: int) -> str:
76
- return self.id_to_token.get(index, self.unk_token)
77
-
78
- def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
79
- if not os.path.isdir(save_directory):
80
- os.makedirs(save_directory)
81
-
82
- prefix = filename_prefix if filename_prefix is not None else ""
83
- vocab_file = os.path.join(save_directory, prefix + "vocab.json")
84
-
85
- with open(vocab_file, "w", encoding="utf-8") as f:
86
- json.dump(self.vocab, f, indent=2, ensure_ascii=False)
87
-
88
- return (vocab_file,)
89
-
90
- try:
91
- AutoTokenizer.register("stl_encoder", STLTokenizer)
92
- except Exception:
93
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/trainer_state.json DELETED
@@ -1,1234 +0,0 @@
1
- {
2
- "best_global_step": 1500,
3
- "best_metric": 0.020201340437836564,
4
- "best_model_checkpoint": "./stlenc-training/checkpoint-1500",
5
- "epoch": 0.3048780487804878,
6
- "eval_steps": 100,
7
- "global_step": 1500,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.0020325203252032522,
14
- "grad_norm": 0.3073508143424988,
15
- "learning_rate": 4.998170731707317e-05,
16
- "loss": 0.1335,
17
- "step": 10
18
- },
19
- {
20
- "epoch": 0.0040650406504065045,
21
- "grad_norm": 0.20867981016635895,
22
- "learning_rate": 4.996138211382114e-05,
23
- "loss": 0.0568,
24
- "step": 20
25
- },
26
- {
27
- "epoch": 0.006097560975609756,
28
- "grad_norm": 0.1618652641773224,
29
- "learning_rate": 4.994105691056911e-05,
30
- "loss": 0.0505,
31
- "step": 30
32
- },
33
- {
34
- "epoch": 0.008130081300813009,
35
- "grad_norm": 0.14075744152069092,
36
- "learning_rate": 4.9920731707317074e-05,
37
- "loss": 0.0459,
38
- "step": 40
39
- },
40
- {
41
- "epoch": 0.01016260162601626,
42
- "grad_norm": 0.15348248183727264,
43
- "learning_rate": 4.990040650406504e-05,
44
- "loss": 0.0441,
45
- "step": 50
46
- },
47
- {
48
- "epoch": 0.012195121951219513,
49
- "grad_norm": 0.1347082257270813,
50
- "learning_rate": 4.988008130081301e-05,
51
- "loss": 0.0426,
52
- "step": 60
53
- },
54
- {
55
- "epoch": 0.014227642276422764,
56
- "grad_norm": 0.1535383015871048,
57
- "learning_rate": 4.9859756097560977e-05,
58
- "loss": 0.0428,
59
- "step": 70
60
- },
61
- {
62
- "epoch": 0.016260162601626018,
63
- "grad_norm": 0.14122287929058075,
64
- "learning_rate": 4.9839430894308944e-05,
65
- "loss": 0.0395,
66
- "step": 80
67
- },
68
- {
69
- "epoch": 0.018292682926829267,
70
- "grad_norm": 0.16186140477657318,
71
- "learning_rate": 4.981910569105691e-05,
72
- "loss": 0.0397,
73
- "step": 90
74
- },
75
- {
76
- "epoch": 0.02032520325203252,
77
- "grad_norm": 0.14353497326374054,
78
- "learning_rate": 4.979878048780488e-05,
79
- "loss": 0.0387,
80
- "step": 100
81
- },
82
- {
83
- "epoch": 0.02032520325203252,
84
- "eval_cosine_similarity": 0.43845831272477015,
85
- "eval_loss": 0.023171832785010338,
86
- "eval_mse_sync": 0.023171832367424337,
87
- "eval_runtime": 7.8562,
88
- "eval_samples_per_second": 380.971,
89
- "eval_steps_per_second": 23.93,
90
- "step": 100
91
- },
92
- {
93
- "epoch": 0.022357723577235773,
94
- "grad_norm": 0.16455022990703583,
95
- "learning_rate": 4.977845528455285e-05,
96
- "loss": 0.039,
97
- "step": 110
98
- },
99
- {
100
- "epoch": 0.024390243902439025,
101
- "grad_norm": 0.14409448206424713,
102
- "learning_rate": 4.9758130081300813e-05,
103
- "loss": 0.0377,
104
- "step": 120
105
- },
106
- {
107
- "epoch": 0.026422764227642278,
108
- "grad_norm": 0.14973576366901398,
109
- "learning_rate": 4.973780487804878e-05,
110
- "loss": 0.0367,
111
- "step": 130
112
- },
113
- {
114
- "epoch": 0.028455284552845527,
115
- "grad_norm": 0.12309937924146652,
116
- "learning_rate": 4.9717479674796755e-05,
117
- "loss": 0.0358,
118
- "step": 140
119
- },
120
- {
121
- "epoch": 0.03048780487804878,
122
- "grad_norm": 0.12204054743051529,
123
- "learning_rate": 4.9697154471544716e-05,
124
- "loss": 0.0339,
125
- "step": 150
126
- },
127
- {
128
- "epoch": 0.032520325203252036,
129
- "grad_norm": 0.16795864701271057,
130
- "learning_rate": 4.967682926829268e-05,
131
- "loss": 0.0359,
132
- "step": 160
133
- },
134
- {
135
- "epoch": 0.034552845528455285,
136
- "grad_norm": 0.14515043795108795,
137
- "learning_rate": 4.965650406504066e-05,
138
- "loss": 0.034,
139
- "step": 170
140
- },
141
- {
142
- "epoch": 0.036585365853658534,
143
- "grad_norm": 0.14003875851631165,
144
- "learning_rate": 4.963617886178862e-05,
145
- "loss": 0.0351,
146
- "step": 180
147
- },
148
- {
149
- "epoch": 0.03861788617886179,
150
- "grad_norm": 0.1429203450679779,
151
- "learning_rate": 4.9615853658536585e-05,
152
- "loss": 0.035,
153
- "step": 190
154
- },
155
- {
156
- "epoch": 0.04065040650406504,
157
- "grad_norm": 0.13865381479263306,
158
- "learning_rate": 4.959552845528456e-05,
159
- "loss": 0.0337,
160
- "step": 200
161
- },
162
- {
163
- "epoch": 0.04065040650406504,
164
- "eval_cosine_similarity": 0.44644545833681326,
165
- "eval_loss": 0.022488251328468323,
166
- "eval_mse_sync": 0.02248825095382336,
167
- "eval_runtime": 7.8597,
168
- "eval_samples_per_second": 380.801,
169
- "eval_steps_per_second": 23.919,
170
- "step": 200
171
- },
172
- {
173
- "epoch": 0.042682926829268296,
174
- "grad_norm": 0.18544256687164307,
175
- "learning_rate": 4.957520325203252e-05,
176
- "loss": 0.0345,
177
- "step": 210
178
- },
179
- {
180
- "epoch": 0.044715447154471545,
181
- "grad_norm": 0.1306702345609665,
182
- "learning_rate": 4.955487804878049e-05,
183
- "loss": 0.0351,
184
- "step": 220
185
- },
186
- {
187
- "epoch": 0.046747967479674794,
188
- "grad_norm": 0.13073720037937164,
189
- "learning_rate": 4.953455284552846e-05,
190
- "loss": 0.0334,
191
- "step": 230
192
- },
193
- {
194
- "epoch": 0.04878048780487805,
195
- "grad_norm": 0.13994655013084412,
196
- "learning_rate": 4.951422764227642e-05,
197
- "loss": 0.0329,
198
- "step": 240
199
- },
200
- {
201
- "epoch": 0.0508130081300813,
202
- "grad_norm": 0.13009095191955566,
203
- "learning_rate": 4.949390243902439e-05,
204
- "loss": 0.0336,
205
- "step": 250
206
- },
207
- {
208
- "epoch": 0.052845528455284556,
209
- "grad_norm": 0.12570971250534058,
210
- "learning_rate": 4.9473577235772363e-05,
211
- "loss": 0.033,
212
- "step": 260
213
- },
214
- {
215
- "epoch": 0.054878048780487805,
216
- "grad_norm": 0.12797130644321442,
217
- "learning_rate": 4.9453252032520324e-05,
218
- "loss": 0.0323,
219
- "step": 270
220
- },
221
- {
222
- "epoch": 0.056910569105691054,
223
- "grad_norm": 0.1270533800125122,
224
- "learning_rate": 4.943292682926829e-05,
225
- "loss": 0.0324,
226
- "step": 280
227
- },
228
- {
229
- "epoch": 0.05894308943089431,
230
- "grad_norm": 0.13283374905586243,
231
- "learning_rate": 4.9412601626016266e-05,
232
- "loss": 0.0328,
233
- "step": 290
234
- },
235
- {
236
- "epoch": 0.06097560975609756,
237
- "grad_norm": 0.14629143476486206,
238
- "learning_rate": 4.9392276422764226e-05,
239
- "loss": 0.0311,
240
- "step": 300
241
- },
242
- {
243
- "epoch": 0.06097560975609756,
244
- "eval_cosine_similarity": 0.4329459800733757,
245
- "eval_loss": 0.02308308333158493,
246
- "eval_mse_sync": 0.02308308467053616,
247
- "eval_runtime": 7.902,
248
- "eval_samples_per_second": 378.765,
249
- "eval_steps_per_second": 23.791,
250
- "step": 300
251
- },
252
- {
253
- "epoch": 0.06300813008130081,
254
- "grad_norm": 0.15855424106121063,
255
- "learning_rate": 4.93719512195122e-05,
256
- "loss": 0.0324,
257
- "step": 310
258
- },
259
- {
260
- "epoch": 0.06504065040650407,
261
- "grad_norm": 0.1387602835893631,
262
- "learning_rate": 4.935162601626017e-05,
263
- "loss": 0.0312,
264
- "step": 320
265
- },
266
- {
267
- "epoch": 0.06707317073170732,
268
- "grad_norm": 0.14994463324546814,
269
- "learning_rate": 4.933130081300813e-05,
270
- "loss": 0.0342,
271
- "step": 330
272
- },
273
- {
274
- "epoch": 0.06910569105691057,
275
- "grad_norm": 0.1439775824546814,
276
- "learning_rate": 4.93109756097561e-05,
277
- "loss": 0.032,
278
- "step": 340
279
- },
280
- {
281
- "epoch": 0.07113821138211382,
282
- "grad_norm": 0.11875730007886887,
283
- "learning_rate": 4.929065040650407e-05,
284
- "loss": 0.0325,
285
- "step": 350
286
- },
287
- {
288
- "epoch": 0.07317073170731707,
289
- "grad_norm": 0.12371476739645004,
290
- "learning_rate": 4.927032520325203e-05,
291
- "loss": 0.0319,
292
- "step": 360
293
- },
294
- {
295
- "epoch": 0.07520325203252033,
296
- "grad_norm": 0.12192820757627487,
297
- "learning_rate": 4.9250000000000004e-05,
298
- "loss": 0.0312,
299
- "step": 370
300
- },
301
- {
302
- "epoch": 0.07723577235772358,
303
- "grad_norm": 0.14944523572921753,
304
- "learning_rate": 4.922967479674797e-05,
305
- "loss": 0.0325,
306
- "step": 380
307
- },
308
- {
309
- "epoch": 0.07926829268292683,
310
- "grad_norm": 0.13014467060565948,
311
- "learning_rate": 4.920934959349593e-05,
312
- "loss": 0.0296,
313
- "step": 390
314
- },
315
- {
316
- "epoch": 0.08130081300813008,
317
- "grad_norm": 0.12468204647302628,
318
- "learning_rate": 4.9189024390243907e-05,
319
- "loss": 0.0336,
320
- "step": 400
321
- },
322
- {
323
- "epoch": 0.08130081300813008,
324
- "eval_cosine_similarity": 0.45511097012655893,
325
- "eval_loss": 0.023316312581300735,
326
- "eval_mse_sync": 0.02331631100243981,
327
- "eval_runtime": 7.896,
328
- "eval_samples_per_second": 379.053,
329
- "eval_steps_per_second": 23.81,
330
- "step": 400
331
- },
332
- {
333
- "epoch": 0.08333333333333333,
334
- "grad_norm": 0.12918700277805328,
335
- "learning_rate": 4.9168699186991874e-05,
336
- "loss": 0.0307,
337
- "step": 410
338
- },
339
- {
340
- "epoch": 0.08536585365853659,
341
- "grad_norm": 0.16114428639411926,
342
- "learning_rate": 4.9148373983739835e-05,
343
- "loss": 0.0302,
344
- "step": 420
345
- },
346
- {
347
- "epoch": 0.08739837398373984,
348
- "grad_norm": 0.13520212471485138,
349
- "learning_rate": 4.912804878048781e-05,
350
- "loss": 0.0311,
351
- "step": 430
352
- },
353
- {
354
- "epoch": 0.08943089430894309,
355
- "grad_norm": 0.12596647441387177,
356
- "learning_rate": 4.9107723577235776e-05,
357
- "loss": 0.0297,
358
- "step": 440
359
- },
360
- {
361
- "epoch": 0.09146341463414634,
362
- "grad_norm": 0.10777679830789566,
363
- "learning_rate": 4.908739837398374e-05,
364
- "loss": 0.0304,
365
- "step": 450
366
- },
367
- {
368
- "epoch": 0.09349593495934959,
369
- "grad_norm": 0.127786323428154,
370
- "learning_rate": 4.906707317073171e-05,
371
- "loss": 0.0308,
372
- "step": 460
373
- },
374
- {
375
- "epoch": 0.09552845528455285,
376
- "grad_norm": 0.165998637676239,
377
- "learning_rate": 4.904674796747968e-05,
378
- "loss": 0.0294,
379
- "step": 470
380
- },
381
- {
382
- "epoch": 0.0975609756097561,
383
- "grad_norm": 0.1122402474284172,
384
- "learning_rate": 4.902642276422764e-05,
385
- "loss": 0.0305,
386
- "step": 480
387
- },
388
- {
389
- "epoch": 0.09959349593495935,
390
- "grad_norm": 0.1091734766960144,
391
- "learning_rate": 4.900609756097561e-05,
392
- "loss": 0.0323,
393
- "step": 490
394
- },
395
- {
396
- "epoch": 0.1016260162601626,
397
- "grad_norm": 0.12982085347175598,
398
- "learning_rate": 4.898577235772358e-05,
399
- "loss": 0.0297,
400
- "step": 500
401
- },
402
- {
403
- "epoch": 0.1016260162601626,
404
- "eval_cosine_similarity": 0.44197734325098587,
405
- "eval_loss": 0.022700216621160507,
406
- "eval_mse_sync": 0.02270021719246296,
407
- "eval_runtime": 7.917,
408
- "eval_samples_per_second": 378.046,
409
- "eval_steps_per_second": 23.746,
410
- "step": 500
411
- },
412
- {
413
- "epoch": 0.10365853658536585,
414
- "grad_norm": 0.14651747047901154,
415
- "learning_rate": 4.896544715447154e-05,
416
- "loss": 0.0291,
417
- "step": 510
418
- },
419
- {
420
- "epoch": 0.10569105691056911,
421
- "grad_norm": 0.13730144500732422,
422
- "learning_rate": 4.8945121951219515e-05,
423
- "loss": 0.0313,
424
- "step": 520
425
- },
426
- {
427
- "epoch": 0.10772357723577236,
428
- "grad_norm": 0.1328081637620926,
429
- "learning_rate": 4.892479674796748e-05,
430
- "loss": 0.0305,
431
- "step": 530
432
- },
433
- {
434
- "epoch": 0.10975609756097561,
435
- "grad_norm": 0.15137328207492828,
436
- "learning_rate": 4.890447154471545e-05,
437
- "loss": 0.0299,
438
- "step": 540
439
- },
440
- {
441
- "epoch": 0.11178861788617886,
442
- "grad_norm": 0.15069301426410675,
443
- "learning_rate": 4.888414634146342e-05,
444
- "loss": 0.0302,
445
- "step": 550
446
- },
447
- {
448
- "epoch": 0.11382113821138211,
449
- "grad_norm": 0.13555215299129486,
450
- "learning_rate": 4.8863821138211385e-05,
451
- "loss": 0.031,
452
- "step": 560
453
- },
454
- {
455
- "epoch": 0.11585365853658537,
456
- "grad_norm": 0.11980469524860382,
457
- "learning_rate": 4.884349593495935e-05,
458
- "loss": 0.0302,
459
- "step": 570
460
- },
461
- {
462
- "epoch": 0.11788617886178862,
463
- "grad_norm": 0.11329913884401321,
464
- "learning_rate": 4.882317073170732e-05,
465
- "loss": 0.0302,
466
- "step": 580
467
- },
468
- {
469
- "epoch": 0.11991869918699187,
470
- "grad_norm": 0.11942901462316513,
471
- "learning_rate": 4.880284552845529e-05,
472
- "loss": 0.0283,
473
- "step": 590
474
- },
475
- {
476
- "epoch": 0.12195121951219512,
477
- "grad_norm": 0.17668181657791138,
478
- "learning_rate": 4.8782520325203254e-05,
479
- "loss": 0.0298,
480
- "step": 600
481
- },
482
- {
483
- "epoch": 0.12195121951219512,
484
- "eval_cosine_similarity": 0.45993702271452247,
485
- "eval_loss": 0.022243835031986237,
486
- "eval_mse_sync": 0.02224383312391072,
487
- "eval_runtime": 7.95,
488
- "eval_samples_per_second": 376.478,
489
- "eval_steps_per_second": 23.648,
490
- "step": 600
491
- },
492
- {
493
- "epoch": 0.12398373983739837,
494
- "grad_norm": 0.14265325665473938,
495
- "learning_rate": 4.876219512195122e-05,
496
- "loss": 0.0298,
497
- "step": 610
498
- },
499
- {
500
- "epoch": 0.12601626016260162,
501
- "grad_norm": 0.1118614450097084,
502
- "learning_rate": 4.874186991869919e-05,
503
- "loss": 0.0303,
504
- "step": 620
505
- },
506
- {
507
- "epoch": 0.12804878048780488,
508
- "grad_norm": 0.11675341427326202,
509
- "learning_rate": 4.8721544715447156e-05,
510
- "loss": 0.0304,
511
- "step": 630
512
- },
513
- {
514
- "epoch": 0.13008130081300814,
515
- "grad_norm": 0.12549124658107758,
516
- "learning_rate": 4.8701219512195124e-05,
517
- "loss": 0.0297,
518
- "step": 640
519
- },
520
- {
521
- "epoch": 0.13211382113821138,
522
- "grad_norm": 0.13423743844032288,
523
- "learning_rate": 4.868089430894309e-05,
524
- "loss": 0.0289,
525
- "step": 650
526
- },
527
- {
528
- "epoch": 0.13414634146341464,
529
- "grad_norm": 0.1256653517484665,
530
- "learning_rate": 4.8660569105691065e-05,
531
- "loss": 0.0291,
532
- "step": 660
533
- },
534
- {
535
- "epoch": 0.13617886178861788,
536
- "grad_norm": 0.1281777024269104,
537
- "learning_rate": 4.8640243902439026e-05,
538
- "loss": 0.0302,
539
- "step": 670
540
- },
541
- {
542
- "epoch": 0.13821138211382114,
543
- "grad_norm": 0.1304328590631485,
544
- "learning_rate": 4.861991869918699e-05,
545
- "loss": 0.0282,
546
- "step": 680
547
- },
548
- {
549
- "epoch": 0.1402439024390244,
550
- "grad_norm": 0.1297086775302887,
551
- "learning_rate": 4.859959349593497e-05,
552
- "loss": 0.0299,
553
- "step": 690
554
- },
555
- {
556
- "epoch": 0.14227642276422764,
557
- "grad_norm": 0.13382111489772797,
558
- "learning_rate": 4.857926829268293e-05,
559
- "loss": 0.0303,
560
- "step": 700
561
- },
562
- {
563
- "epoch": 0.14227642276422764,
564
- "eval_cosine_similarity": 0.4609147342917037,
565
- "eval_loss": 0.02211085520684719,
566
- "eval_mse_sync": 0.022110854542816995,
567
- "eval_runtime": 7.95,
568
- "eval_samples_per_second": 376.479,
569
- "eval_steps_per_second": 23.648,
570
- "step": 700
571
- },
572
- {
573
- "epoch": 0.1443089430894309,
574
- "grad_norm": 0.14310456812381744,
575
- "learning_rate": 4.8558943089430895e-05,
576
- "loss": 0.0282,
577
- "step": 710
578
- },
579
- {
580
- "epoch": 0.14634146341463414,
581
- "grad_norm": 0.13597296178340912,
582
- "learning_rate": 4.853861788617887e-05,
583
- "loss": 0.0282,
584
- "step": 720
585
- },
586
- {
587
- "epoch": 0.1483739837398374,
588
- "grad_norm": 0.21410208940505981,
589
- "learning_rate": 4.851829268292683e-05,
590
- "loss": 0.03,
591
- "step": 730
592
- },
593
- {
594
- "epoch": 0.15040650406504066,
595
- "grad_norm": 0.14297862350940704,
596
- "learning_rate": 4.84979674796748e-05,
597
- "loss": 0.0288,
598
- "step": 740
599
- },
600
- {
601
- "epoch": 0.1524390243902439,
602
- "grad_norm": 0.13763341307640076,
603
- "learning_rate": 4.847764227642277e-05,
604
- "loss": 0.0293,
605
- "step": 750
606
- },
607
- {
608
- "epoch": 0.15447154471544716,
609
- "grad_norm": 0.14061112701892853,
610
- "learning_rate": 4.845731707317073e-05,
611
- "loss": 0.0294,
612
- "step": 760
613
- },
614
- {
615
- "epoch": 0.1565040650406504,
616
- "grad_norm": 0.1257963925600052,
617
- "learning_rate": 4.84369918699187e-05,
618
- "loss": 0.0292,
619
- "step": 770
620
- },
621
- {
622
- "epoch": 0.15853658536585366,
623
- "grad_norm": 0.10741665959358215,
624
- "learning_rate": 4.8416666666666673e-05,
625
- "loss": 0.0284,
626
- "step": 780
627
- },
628
- {
629
- "epoch": 0.16056910569105692,
630
- "grad_norm": 0.15043672919273376,
631
- "learning_rate": 4.8396341463414634e-05,
632
- "loss": 0.029,
633
- "step": 790
634
- },
635
- {
636
- "epoch": 0.16260162601626016,
637
- "grad_norm": 0.15317371487617493,
638
- "learning_rate": 4.83760162601626e-05,
639
- "loss": 0.0298,
640
- "step": 800
641
- },
642
- {
643
- "epoch": 0.16260162601626016,
644
- "eval_cosine_similarity": 0.4601094912733874,
645
- "eval_loss": 0.022229857742786407,
646
- "eval_mse_sync": 0.022229857477672196,
647
- "eval_runtime": 7.9894,
648
- "eval_samples_per_second": 374.619,
649
- "eval_steps_per_second": 23.531,
650
- "step": 800
651
- },
652
- {
653
- "epoch": 0.16463414634146342,
654
- "grad_norm": 0.11747121810913086,
655
- "learning_rate": 4.8355691056910576e-05,
656
- "loss": 0.029,
657
- "step": 810
658
- },
659
- {
660
- "epoch": 0.16666666666666666,
661
- "grad_norm": 0.12080563604831696,
662
- "learning_rate": 4.8335365853658536e-05,
663
- "loss": 0.029,
664
- "step": 820
665
- },
666
- {
667
- "epoch": 0.16869918699186992,
668
- "grad_norm": 0.11985825002193451,
669
- "learning_rate": 4.8315040650406504e-05,
670
- "loss": 0.0289,
671
- "step": 830
672
- },
673
- {
674
- "epoch": 0.17073170731707318,
675
- "grad_norm": 0.139726921916008,
676
- "learning_rate": 4.829471544715448e-05,
677
- "loss": 0.0289,
678
- "step": 840
679
- },
680
- {
681
- "epoch": 0.17276422764227642,
682
- "grad_norm": 0.12052454799413681,
683
- "learning_rate": 4.827439024390244e-05,
684
- "loss": 0.0278,
685
- "step": 850
686
- },
687
- {
688
- "epoch": 0.17479674796747968,
689
- "grad_norm": 0.1230531707406044,
690
- "learning_rate": 4.825406504065041e-05,
691
- "loss": 0.0282,
692
- "step": 860
693
- },
694
- {
695
- "epoch": 0.17682926829268292,
696
- "grad_norm": 0.12765666842460632,
697
- "learning_rate": 4.823373983739838e-05,
698
- "loss": 0.0277,
699
- "step": 870
700
- },
701
- {
702
- "epoch": 0.17886178861788618,
703
- "grad_norm": 0.11474256962537766,
704
- "learning_rate": 4.821341463414634e-05,
705
- "loss": 0.0259,
706
- "step": 880
707
- },
708
- {
709
- "epoch": 0.18089430894308944,
710
- "grad_norm": 0.12510469555854797,
711
- "learning_rate": 4.8193089430894315e-05,
712
- "loss": 0.0295,
713
- "step": 890
714
- },
715
- {
716
- "epoch": 0.18292682926829268,
717
- "grad_norm": 0.13411709666252136,
718
- "learning_rate": 4.817276422764228e-05,
719
- "loss": 0.0286,
720
- "step": 900
721
- },
722
- {
723
- "epoch": 0.18292682926829268,
724
- "eval_cosine_similarity": 0.47567485046354857,
725
- "eval_loss": 0.021220851689577103,
726
- "eval_mse_sync": 0.021220852660106707,
727
- "eval_runtime": 7.9662,
728
- "eval_samples_per_second": 375.714,
729
- "eval_steps_per_second": 23.6,
730
- "step": 900
731
- },
732
- {
733
- "epoch": 0.18495934959349594,
734
- "grad_norm": 0.12140695750713348,
735
- "learning_rate": 4.815243902439024e-05,
736
- "loss": 0.0271,
737
- "step": 910
738
- },
739
- {
740
- "epoch": 0.18699186991869918,
741
- "grad_norm": 0.1151667982339859,
742
- "learning_rate": 4.813211382113822e-05,
743
- "loss": 0.0261,
744
- "step": 920
745
- },
746
- {
747
- "epoch": 0.18902439024390244,
748
- "grad_norm": 0.13169187307357788,
749
- "learning_rate": 4.8111788617886184e-05,
750
- "loss": 0.0284,
751
- "step": 930
752
- },
753
- {
754
- "epoch": 0.1910569105691057,
755
- "grad_norm": 0.11413775384426117,
756
- "learning_rate": 4.8091463414634145e-05,
757
- "loss": 0.0278,
758
- "step": 940
759
- },
760
- {
761
- "epoch": 0.19308943089430894,
762
- "grad_norm": 0.13462452590465546,
763
- "learning_rate": 4.807113821138212e-05,
764
- "loss": 0.0269,
765
- "step": 950
766
- },
767
- {
768
- "epoch": 0.1951219512195122,
769
- "grad_norm": 0.0989966094493866,
770
- "learning_rate": 4.8050813008130086e-05,
771
- "loss": 0.0269,
772
- "step": 960
773
- },
774
- {
775
- "epoch": 0.19715447154471544,
776
- "grad_norm": 0.11530207097530365,
777
- "learning_rate": 4.803048780487805e-05,
778
- "loss": 0.0273,
779
- "step": 970
780
- },
781
- {
782
- "epoch": 0.1991869918699187,
783
- "grad_norm": 0.11543365567922592,
784
- "learning_rate": 4.801016260162602e-05,
785
- "loss": 0.0265,
786
- "step": 980
787
- },
788
- {
789
- "epoch": 0.20121951219512196,
790
- "grad_norm": 0.12322687357664108,
791
- "learning_rate": 4.798983739837399e-05,
792
- "loss": 0.0277,
793
- "step": 990
794
- },
795
- {
796
- "epoch": 0.2032520325203252,
797
- "grad_norm": 0.12608106434345245,
798
- "learning_rate": 4.796951219512195e-05,
799
- "loss": 0.0254,
800
- "step": 1000
801
- },
802
- {
803
- "epoch": 0.2032520325203252,
804
- "eval_cosine_similarity": 0.4821661613555485,
805
- "eval_loss": 0.021430717781186104,
806
- "eval_mse_sync": 0.021430718056257656,
807
- "eval_runtime": 8.0256,
808
- "eval_samples_per_second": 372.931,
809
- "eval_steps_per_second": 23.425,
810
- "step": 1000
811
- },
812
- {
813
- "epoch": 0.20528455284552846,
814
- "grad_norm": 0.1113312616944313,
815
- "learning_rate": 4.794918699186992e-05,
816
- "loss": 0.0277,
817
- "step": 1010
818
- },
819
- {
820
- "epoch": 0.2073170731707317,
821
- "grad_norm": 0.1282844841480255,
822
- "learning_rate": 4.792886178861789e-05,
823
- "loss": 0.0275,
824
- "step": 1020
825
- },
826
- {
827
- "epoch": 0.20934959349593496,
828
- "grad_norm": 0.10981626808643341,
829
- "learning_rate": 4.790853658536585e-05,
830
- "loss": 0.0282,
831
- "step": 1030
832
- },
833
- {
834
- "epoch": 0.21138211382113822,
835
- "grad_norm": 0.1443098783493042,
836
- "learning_rate": 4.7888211382113825e-05,
837
- "loss": 0.0268,
838
- "step": 1040
839
- },
840
- {
841
- "epoch": 0.21341463414634146,
842
- "grad_norm": 0.10926985740661621,
843
- "learning_rate": 4.786788617886179e-05,
844
- "loss": 0.0265,
845
- "step": 1050
846
- },
847
- {
848
- "epoch": 0.21544715447154472,
849
- "grad_norm": 0.12235318124294281,
850
- "learning_rate": 4.784756097560975e-05,
851
- "loss": 0.0271,
852
- "step": 1060
853
- },
854
- {
855
- "epoch": 0.21747967479674796,
856
- "grad_norm": 0.12022686749696732,
857
- "learning_rate": 4.782723577235773e-05,
858
- "loss": 0.0269,
859
- "step": 1070
860
- },
861
- {
862
- "epoch": 0.21951219512195122,
863
- "grad_norm": 0.13947460055351257,
864
- "learning_rate": 4.7806910569105695e-05,
865
- "loss": 0.0272,
866
- "step": 1080
867
- },
868
- {
869
- "epoch": 0.22154471544715448,
870
- "grad_norm": 0.10289262980222702,
871
- "learning_rate": 4.778658536585366e-05,
872
- "loss": 0.027,
873
- "step": 1090
874
- },
875
- {
876
- "epoch": 0.22357723577235772,
877
- "grad_norm": 0.10644431412220001,
878
- "learning_rate": 4.776626016260163e-05,
879
- "loss": 0.026,
880
- "step": 1100
881
- },
882
- {
883
- "epoch": 0.22357723577235772,
884
- "eval_cosine_similarity": 0.490080117004355,
885
- "eval_loss": 0.020727790892124176,
886
- "eval_mse_sync": 0.020727790696711355,
887
- "eval_runtime": 7.9802,
888
- "eval_samples_per_second": 375.055,
889
- "eval_steps_per_second": 23.558,
890
- "step": 1100
891
- },
892
- {
893
- "epoch": 0.22560975609756098,
894
- "grad_norm": 0.1217174306511879,
895
- "learning_rate": 4.77459349593496e-05,
896
- "loss": 0.0255,
897
- "step": 1110
898
- },
899
- {
900
- "epoch": 0.22764227642276422,
901
- "grad_norm": 0.13680125772953033,
902
- "learning_rate": 4.7725609756097564e-05,
903
- "loss": 0.0261,
904
- "step": 1120
905
- },
906
- {
907
- "epoch": 0.22967479674796748,
908
- "grad_norm": 0.10098998993635178,
909
- "learning_rate": 4.770528455284553e-05,
910
- "loss": 0.0266,
911
- "step": 1130
912
- },
913
- {
914
- "epoch": 0.23170731707317074,
915
- "grad_norm": 0.10652109980583191,
916
- "learning_rate": 4.76849593495935e-05,
917
- "loss": 0.0254,
918
- "step": 1140
919
- },
920
- {
921
- "epoch": 0.23373983739837398,
922
- "grad_norm": 0.14970383048057556,
923
- "learning_rate": 4.7664634146341466e-05,
924
- "loss": 0.0255,
925
- "step": 1150
926
- },
927
- {
928
- "epoch": 0.23577235772357724,
929
- "grad_norm": 0.10969521850347519,
930
- "learning_rate": 4.7644308943089434e-05,
931
- "loss": 0.0262,
932
- "step": 1160
933
- },
934
- {
935
- "epoch": 0.23780487804878048,
936
- "grad_norm": 0.18681780993938446,
937
- "learning_rate": 4.76239837398374e-05,
938
- "loss": 0.0267,
939
- "step": 1170
940
- },
941
- {
942
- "epoch": 0.23983739837398374,
943
- "grad_norm": 0.13186466693878174,
944
- "learning_rate": 4.760365853658537e-05,
945
- "loss": 0.0279,
946
- "step": 1180
947
- },
948
- {
949
- "epoch": 0.241869918699187,
950
- "grad_norm": 0.09688113629817963,
951
- "learning_rate": 4.7583333333333336e-05,
952
- "loss": 0.0259,
953
- "step": 1190
954
- },
955
- {
956
- "epoch": 0.24390243902439024,
957
- "grad_norm": 0.1350603997707367,
958
- "learning_rate": 4.75630081300813e-05,
959
- "loss": 0.0274,
960
- "step": 1200
961
- },
962
- {
963
- "epoch": 0.24390243902439024,
964
- "eval_cosine_similarity": 0.46065077879657484,
965
- "eval_loss": 0.02211129106581211,
966
- "eval_mse_sync": 0.0221112903003415,
967
- "eval_runtime": 8.0094,
968
- "eval_samples_per_second": 373.686,
969
- "eval_steps_per_second": 23.472,
970
- "step": 1200
971
- },
972
- {
973
- "epoch": 0.2459349593495935,
974
- "grad_norm": 0.13613583147525787,
975
- "learning_rate": 4.754268292682927e-05,
976
- "loss": 0.0275,
977
- "step": 1210
978
- },
979
- {
980
- "epoch": 0.24796747967479674,
981
- "grad_norm": 0.1278214156627655,
982
- "learning_rate": 4.752235772357724e-05,
983
- "loss": 0.0257,
984
- "step": 1220
985
- },
986
- {
987
- "epoch": 0.25,
988
- "grad_norm": 0.1331510692834854,
989
- "learning_rate": 4.7502032520325205e-05,
990
- "loss": 0.0251,
991
- "step": 1230
992
- },
993
- {
994
- "epoch": 0.25203252032520324,
995
- "grad_norm": 0.15280725061893463,
996
- "learning_rate": 4.748170731707317e-05,
997
- "loss": 0.0268,
998
- "step": 1240
999
- },
1000
- {
1001
- "epoch": 0.2540650406504065,
1002
- "grad_norm": 0.11740286648273468,
1003
- "learning_rate": 4.746138211382114e-05,
1004
- "loss": 0.0258,
1005
- "step": 1250
1006
- },
1007
- {
1008
- "epoch": 0.25609756097560976,
1009
- "grad_norm": 0.09965524077415466,
1010
- "learning_rate": 4.744105691056911e-05,
1011
- "loss": 0.0274,
1012
- "step": 1260
1013
- },
1014
- {
1015
- "epoch": 0.258130081300813,
1016
- "grad_norm": 0.12852798402309418,
1017
- "learning_rate": 4.7420731707317075e-05,
1018
- "loss": 0.0272,
1019
- "step": 1270
1020
- },
1021
- {
1022
- "epoch": 0.2601626016260163,
1023
- "grad_norm": 0.11415109783411026,
1024
- "learning_rate": 4.740040650406504e-05,
1025
- "loss": 0.026,
1026
- "step": 1280
1027
- },
1028
- {
1029
- "epoch": 0.2621951219512195,
1030
- "grad_norm": 0.1628946214914322,
1031
- "learning_rate": 4.738008130081301e-05,
1032
- "loss": 0.0279,
1033
- "step": 1290
1034
- },
1035
- {
1036
- "epoch": 0.26422764227642276,
1037
- "grad_norm": 0.11203841865062714,
1038
- "learning_rate": 4.735975609756098e-05,
1039
- "loss": 0.0253,
1040
- "step": 1300
1041
- },
1042
- {
1043
- "epoch": 0.26422764227642276,
1044
- "eval_cosine_similarity": 0.477201213187053,
1045
- "eval_loss": 0.021068334579467773,
1046
- "eval_mse_sync": 0.02106833454150541,
1047
- "eval_runtime": 7.9772,
1048
- "eval_samples_per_second": 375.194,
1049
- "eval_steps_per_second": 23.567,
1050
- "step": 1300
1051
- },
1052
- {
1053
- "epoch": 0.266260162601626,
1054
- "grad_norm": 0.10183481872081757,
1055
- "learning_rate": 4.7339430894308944e-05,
1056
- "loss": 0.0266,
1057
- "step": 1310
1058
- },
1059
- {
1060
- "epoch": 0.2682926829268293,
1061
- "grad_norm": 0.09649122506380081,
1062
- "learning_rate": 4.731910569105691e-05,
1063
- "loss": 0.0254,
1064
- "step": 1320
1065
- },
1066
- {
1067
- "epoch": 0.2703252032520325,
1068
- "grad_norm": 0.11728145182132721,
1069
- "learning_rate": 4.729878048780488e-05,
1070
- "loss": 0.0256,
1071
- "step": 1330
1072
- },
1073
- {
1074
- "epoch": 0.27235772357723576,
1075
- "grad_norm": 0.11149100959300995,
1076
- "learning_rate": 4.7278455284552846e-05,
1077
- "loss": 0.0247,
1078
- "step": 1340
1079
- },
1080
- {
1081
- "epoch": 0.27439024390243905,
1082
- "grad_norm": 0.12775219976902008,
1083
- "learning_rate": 4.7258130081300814e-05,
1084
- "loss": 0.0263,
1085
- "step": 1350
1086
- },
1087
- {
1088
- "epoch": 0.2764227642276423,
1089
- "grad_norm": 0.1255025416612625,
1090
- "learning_rate": 4.723780487804878e-05,
1091
- "loss": 0.0248,
1092
- "step": 1360
1093
- },
1094
- {
1095
- "epoch": 0.2784552845528455,
1096
- "grad_norm": 0.11553100496530533,
1097
- "learning_rate": 4.721747967479675e-05,
1098
- "loss": 0.0261,
1099
- "step": 1370
1100
- },
1101
- {
1102
- "epoch": 0.2804878048780488,
1103
- "grad_norm": 0.115130715072155,
1104
- "learning_rate": 4.7197154471544716e-05,
1105
- "loss": 0.0266,
1106
- "step": 1380
1107
- },
1108
- {
1109
- "epoch": 0.28252032520325204,
1110
- "grad_norm": 0.10078372061252594,
1111
- "learning_rate": 4.717682926829268e-05,
1112
- "loss": 0.0249,
1113
- "step": 1390
1114
- },
1115
- {
1116
- "epoch": 0.2845528455284553,
1117
- "grad_norm": 0.11324010044336319,
1118
- "learning_rate": 4.715650406504065e-05,
1119
- "loss": 0.0257,
1120
- "step": 1400
1121
- },
1122
- {
1123
- "epoch": 0.2845528455284553,
1124
- "eval_cosine_similarity": 0.490024376186367,
1125
- "eval_loss": 0.02175692841410637,
1126
- "eval_mse_sync": 0.021756927129920514,
1127
- "eval_runtime": 8.0149,
1128
- "eval_samples_per_second": 373.432,
1129
- "eval_steps_per_second": 23.456,
1130
- "step": 1400
1131
- },
1132
- {
1133
- "epoch": 0.2865853658536585,
1134
- "grad_norm": 0.1149812862277031,
1135
- "learning_rate": 4.7136178861788625e-05,
1136
- "loss": 0.0254,
1137
- "step": 1410
1138
- },
1139
- {
1140
- "epoch": 0.2886178861788618,
1141
- "grad_norm": 0.11180515587329865,
1142
- "learning_rate": 4.7115853658536585e-05,
1143
- "loss": 0.0266,
1144
- "step": 1420
1145
- },
1146
- {
1147
- "epoch": 0.29065040650406504,
1148
- "grad_norm": 0.11090870201587677,
1149
- "learning_rate": 4.709552845528455e-05,
1150
- "loss": 0.0252,
1151
- "step": 1430
1152
- },
1153
- {
1154
- "epoch": 0.2926829268292683,
1155
- "grad_norm": 0.15381398797035217,
1156
- "learning_rate": 4.707520325203253e-05,
1157
- "loss": 0.0267,
1158
- "step": 1440
1159
- },
1160
- {
1161
- "epoch": 0.29471544715447157,
1162
- "grad_norm": 0.11547625809907913,
1163
- "learning_rate": 4.705487804878049e-05,
1164
- "loss": 0.0259,
1165
- "step": 1450
1166
- },
1167
- {
1168
- "epoch": 0.2967479674796748,
1169
- "grad_norm": 0.12333870679140091,
1170
- "learning_rate": 4.7034552845528455e-05,
1171
- "loss": 0.0273,
1172
- "step": 1460
1173
- },
1174
- {
1175
- "epoch": 0.29878048780487804,
1176
- "grad_norm": 0.13967566192150116,
1177
- "learning_rate": 4.701422764227643e-05,
1178
- "loss": 0.0255,
1179
- "step": 1470
1180
- },
1181
- {
1182
- "epoch": 0.3008130081300813,
1183
- "grad_norm": 0.12606772780418396,
1184
- "learning_rate": 4.699390243902439e-05,
1185
- "loss": 0.0245,
1186
- "step": 1480
1187
- },
1188
- {
1189
- "epoch": 0.30284552845528456,
1190
- "grad_norm": 0.11195407062768936,
1191
- "learning_rate": 4.697357723577236e-05,
1192
- "loss": 0.0258,
1193
- "step": 1490
1194
- },
1195
- {
1196
- "epoch": 0.3048780487804878,
1197
- "grad_norm": 0.1050952821969986,
1198
- "learning_rate": 4.695325203252033e-05,
1199
- "loss": 0.0256,
1200
- "step": 1500
1201
- },
1202
- {
1203
- "epoch": 0.3048780487804878,
1204
- "eval_cosine_similarity": 0.5019669328052943,
1205
- "eval_loss": 0.020201340317726135,
1206
- "eval_mse_sync": 0.020201340437836564,
1207
- "eval_runtime": 8.0028,
1208
- "eval_samples_per_second": 373.996,
1209
- "eval_steps_per_second": 23.492,
1210
- "step": 1500
1211
- }
1212
- ],
1213
- "logging_steps": 10,
1214
- "max_steps": 24600,
1215
- "num_input_tokens_seen": 0,
1216
- "num_train_epochs": 5,
1217
- "save_steps": 100,
1218
- "stateful_callbacks": {
1219
- "TrainerControl": {
1220
- "args": {
1221
- "should_epoch_stop": false,
1222
- "should_evaluate": false,
1223
- "should_log": false,
1224
- "should_save": true,
1225
- "should_training_stop": false
1226
- },
1227
- "attributes": {}
1228
- }
1229
- },
1230
- "total_flos": 1.1221717745664e+16,
1231
- "train_batch_size": 16,
1232
- "trial_name": null,
1233
- "trial_params": null
1234
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:cb70f48c7fa1622aca3a115af51cf270a88d8053efe253371c58188a6a763ba4
3
- size 5841
 
 
 
 
last-checkpoint/vocab.json DELETED
@@ -1,37 +0,0 @@
1
- {
2
- "unk": 0,
3
- "pad": 1,
4
- "/s": 2,
5
- "s": 3,
6
- "(": 4,
7
- ")": 5,
8
- "always": 6,
9
- "eventually": 7,
10
- "until": 8,
11
- "and": 9,
12
- "or": 10,
13
- "not": 11,
14
- ">=": 12,
15
- "<=": 13,
16
- ">": 14,
17
- "<": 15,
18
- "=": 16,
19
- "x_": 17,
20
- "[": 18,
21
- "]": 19,
22
- ",": 20,
23
- "inf": 21,
24
- "-": 22,
25
- ".": 23,
26
- "0": 24,
27
- "1": 25,
28
- "2": 26,
29
- "3": 27,
30
- "4": 28,
31
- "5": 29,
32
- "6": 30,
33
- "7": 31,
34
- "8": 32,
35
- "9": 33,
36
- "@": 34
37
- }