saracandu committed on
Commit
5a5a5aa
·
verified ·
1 Parent(s): e12bade

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: mit
4
+ base_model: saracandu/stlenc-new-temp0.3
5
+ tags:
6
+ - generated_from_trainer
7
+ model-index:
8
+ - name: stlenc-new-temp0.3
9
+ results: []
10
+ ---
11
+
12
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
+ should probably proofread and complete it, then remove this comment. -->
14
+
15
+ # stlenc-new-temp0.3
16
+
17
+ This model is a fine-tuned version of [saracandu/stlenc-new-temp0.3](https://huggingface.co/saracandu/stlenc-new-temp0.3) on an unknown dataset.
18
+ It achieves the following results on the evaluation set:
19
+ - Loss: 1.3740
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 5e-06
39
+ - train_batch_size: 128
40
+ - eval_batch_size: 128
41
+ - seed: 42
42
+ - gradient_accumulation_steps: 4
43
+ - total_train_batch_size: 512
44
+ - optimizer: AdamW (fused PyTorch implementation, OptimizerNames.ADAMW_TORCH_FUSED) with betas=(0.9, 0.999), epsilon=1e-08, and no additional optimizer arguments
45
+ - lr_scheduler_type: linear
46
+ - num_epochs: 10
47
+
48
+ ### Training results
49
+
50
+ | Training Loss | Epoch | Step | Validation Loss |
51
+ |:-------------:|:------:|:----:|:---------------:|
52
+ | 2.4093 | 0.3639 | 250 | 2.5770 |
53
+ | 2.4426 | 0.7278 | 500 | 2.5822 |
54
+ | 2.4486 | 1.0917 | 750 | 2.5639 |
55
+ | 2.4435 | 1.4556 | 1000 | 2.5948 |
56
+ | 2.4516 | 1.8195 | 1250 | 2.6232 |
57
+ | 2.4462 | 2.1834 | 1500 | 2.5765 |
58
+ | 2.4458 | 2.5473 | 1750 | 2.6115 |
59
+ | 2.4496 | 2.9112 | 2000 | 2.6147 |
60
+ | 2.4366 | 3.2751 | 2250 | 2.6214 |
61
+ | 2.445 | 3.6390 | 2500 | 2.6100 |
62
+ | 2.4477 | 4.0029 | 2750 | 2.6061 |
63
+ | 2.4428 | 4.3668 | 3000 | 2.5868 |
64
+ | 2.4442 | 4.7307 | 3250 | 2.6178 |
65
+ | 2.3614 | 5.0946 | 3500 | 2.5041 |
66
+ | 1.4114 | 5.4585 | 3750 | 1.9460 |
67
+ | 1.1683 | 5.8224 | 4000 | 1.6975 |
68
+ | 1.1236 | 6.1863 | 4250 | 1.5663 |
69
+ | 1.1069 | 6.5502 | 4500 | 1.5173 |
70
+ | 1.0834 | 6.9141 | 4750 | 1.5008 |
71
+ | 1.0683 | 7.2780 | 5000 | 1.4697 |
72
+ | 1.0608 | 7.6419 | 5250 | 1.4206 |
73
+ | 1.0412 | 8.0058 | 5500 | 1.4282 |
74
+ | 1.0346 | 8.3697 | 5750 | 1.3987 |
75
+ | 1.0305 | 8.7336 | 6000 | 1.3890 |
76
+ | 1.0282 | 9.0975 | 6250 | 1.3912 |
77
+ | 1.0199 | 9.4614 | 6500 | 1.3683 |
78
+ | 1.0209 | 9.8253 | 6750 | 1.3740 |
79
+
80
+
81
+ ### Framework versions
82
+
83
+ - Transformers 4.57.3
84
+ - Pytorch 2.9.1+cu128
85
+ - Datasets 4.4.2
86
+ - Tokenizers 0.22.1
config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "STLEncoderModel"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_stlenc.STLEncoderConfig",
7
+ "AutoModel": "modeling_stlenc.STLEncoderModel",
8
+ "AutoTokenizer": [
9
+ "tokenizer_stlenc.STLTokenizer",
10
+ null
11
+ ]
12
+ },
13
+ "bos_token_id": 2,
14
+ "dtype": "float32",
15
+ "embedding_dim_target": 1024,
16
+ "eos_token_id": 3,
17
+ "hidden_size": 1024,
18
+ "intermediate_size": 4096,
19
+ "max_position_embeddings": 512,
20
+ "model_type": "stl_encoder",
21
+ "num_attention_heads": 16,
22
+ "num_hidden_layers": 12,
23
+ "pad_token_id": 1,
24
+ "transformers_version": "4.57.3",
25
+ "vocab_size": 35
26
+ }
configuration_stlenc.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig
2
+
3
class STLEncoderConfig(PretrainedConfig):
    """Hyperparameter container for the STL encoder.

    Each named argument is stored as an attribute of the same name; every
    other keyword argument is handed to ``PretrainedConfig.__init__``.

    Args:
        vocab_size: Size of the token vocabulary.
        hidden_size: Hidden width of the encoder.
        num_hidden_layers: Number of encoder layers.
        num_attention_heads: Number of attention heads per layer.
        intermediate_size: Width of the feed-forward sub-layer.
        max_position_embeddings: Number of position slots available.
        embedding_dim_target: Width of the final projected embedding.
        **kwargs: Forwarded unchanged to the base configuration class.
    """

    model_type = "stl_encoder"

    def __init__(
        self,
        vocab_size=35,
        hidden_size=1024,
        num_hidden_layers=12,
        num_attention_heads=16,
        intermediate_size=4096,
        max_position_embeddings=512,
        embedding_dim_target=1024,
        **kwargs
    ):
        # Let the base class consume the generic options first.
        super().__init__(**kwargs)

        # Then record the encoder-specific hyperparameters, in declaration order.
        encoder_settings = {
            "vocab_size": vocab_size,
            "hidden_size": hidden_size,
            "num_hidden_layers": num_hidden_layers,
            "num_attention_heads": num_attention_heads,
            "intermediate_size": intermediate_size,
            "max_position_embeddings": max_position_embeddings,
            "embedding_dim_target": embedding_dim_target,
        }
        for setting_name, setting_value in encoder_settings.items():
            setattr(self, setting_name, setting_value)
modeling_stlenc.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from transformers import PreTrainedModel
4
+ from .configuration_stlenc import STLEncoderConfig
5
+
6
class STLEncoderModel(PreTrainedModel):
    """Transformer encoder that turns a token-id sequence into one vector.

    Token and learned position embeddings are summed, passed through a stack
    of ``nn.TransformerEncoderLayer`` blocks, and the hidden state at the
    first position is fed to an MLP projection head.
    """

    config_class = STLEncoderConfig

    def __init__(self, config):
        """Build the embeddings, the encoder stack and the projection head.

        Args:
            config: Configuration providing ``vocab_size``, ``hidden_size``,
                ``max_position_embeddings``, ``num_attention_heads``,
                ``intermediate_size``, ``num_hidden_layers`` and
                ``embedding_dim_target``.
        """
        super().__init__(config)
        width = config.hidden_size

        self.embeddings = nn.Embedding(config.vocab_size, width)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, width)

        # GELU is the standard activation for modern Transformers.
        layer_template = nn.TransformerEncoderLayer(
            d_model=width,
            nhead=config.num_attention_heads,
            dim_feedforward=config.intermediate_size,
            activation="gelu",
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(
            layer_template, num_layers=config.num_hidden_layers
        )

        # Architectural boost: a deep (MLP) projection head.
        head_layers = [
            nn.Linear(width, width),
            nn.GELU(),
            nn.LayerNorm(width),
            nn.Dropout(0.1),  # helps avoid overfitting on the numeric constants
            nn.Linear(width, width // 2),
            nn.GELU(),
            nn.Linear(width // 2, config.embedding_dim_target),
        ]
        self.projector = nn.Sequential(*head_layers)

        self.post_init()

    def forward(self, input_ids, attention_mask=None, **kwargs):
        """Encode a batch of token ids into one projected vector per sequence.

        Args:
            input_ids: 2-D tensor of token ids, ``(batch, sequence)``.
            attention_mask: Optional tensor in which entries equal to 0 mark
                padding positions; when omitted no position is masked.
            **kwargs: Accepted and ignored.

        Returns:
            Tensor of shape ``(batch, embedding_dim_target)``.
        """
        n_rows, n_tokens = input_ids.size()

        # Absolute positions 0..n_tokens-1, repeated for every row of the batch.
        positions = torch.arange(n_tokens, dtype=torch.long, device=input_ids.device)
        positions = positions.unsqueeze(0).expand(n_rows, n_tokens)

        hidden = self.embeddings(input_ids) + self.position_embeddings(positions)

        # Padding mask: TransformerEncoder expects True where it must NOT attend.
        if attention_mask is None:
            ignore_positions = None
        else:
            ignore_positions = attention_mask == 0

        # Encode the sequences.
        encoded = self.encoder(hidden, src_key_padding_mask=ignore_positions)

        # Take the CLS-style summary state at index 0.
        summary_state = encoded[:, 0, :]

        # Run it through the non-linear projection head. There is deliberately
        # no final Tanh, so the output is left unbounded (original author's
        # note: this lets the scalar kernel "breathe").
        return self.projector(summary_state)
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "/s",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "s",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "pad",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "unk",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "unk",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "pad",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "/s",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "s",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "auto_map": {
37
+ "AutoTokenizer": [
38
+ "tokenizer_stlenc.STLTokenizer",
39
+ null
40
+ ]
41
+ },
42
+ "bos_token": "/s",
43
+ "clean_up_tokenization_spaces": false,
44
+ "eos_token": "s",
45
+ "extra_special_tokens": {},
46
+ "model_max_length": 512,
47
+ "pad_token": "pad",
48
+ "tokenizer_class": "STLTokenizer",
49
+ "unk_token": "unk"
50
+ }
tokenizer_stlenc.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import torch
4
+ from typing import Any, Dict, List, Optional, Tuple, Union
5
+ from transformers import PreTrainedTokenizer, AutoTokenizer
6
+
7
class STLTokenizer(PreTrainedTokenizer):
    """Greedy longest-match tokenizer over a fixed JSON vocabulary.

    The vocabulary is a ``{token: id}`` mapping loaded from ``vocab_file``.
    Input text is wrapped in the BOS/EOS tokens, spaces are rewritten to
    ``'@'``, and the result is split by always taking the longest vocabulary
    entry that matches at the current position.
    """

    model_type = "stl_encoder"

    def __init__(
        self,
        vocab_file="vocab.json",
        unk_token="unk",
        pad_token="pad",
        bos_token="/s",
        eos_token="s",
        model_max_length=512,
        **kwargs
    ):
        """Load the vocabulary and initialise the base tokenizer.

        The vocabulary is looked up next to this module first. If it is not
        there, a download from the ``saracandu/stlenc`` hub repository is
        attempted, and if that fails ``vocab_file`` is opened as given.

        Args:
            vocab_file: File name (or path) of the JSON vocabulary.
            unk_token: Token emitted for characters that match no entry.
            pad_token: Padding token.
            bos_token: Token prepended to the text in ``_tokenize``.
            eos_token: Token appended to the text in ``_tokenize``.
            model_max_length: Forwarded to ``PreTrainedTokenizer``.
            **kwargs: Forwarded to ``PreTrainedTokenizer``.

        Raises:
            OSError: If no vocabulary file can be opened.
        """
        current_dir = os.path.dirname(__file__)
        full_vocab_path = os.path.join(current_dir, vocab_file)

        if not os.path.exists(full_vocab_path):
            from huggingface_hub import hf_hub_download
            try:
                full_vocab_path = hf_hub_download("saracandu/stlenc", vocab_file)
            except Exception:
                # Best-effort download: fall back to the path as given.
                # ``Exception`` (not a bare ``except``) so that
                # KeyboardInterrupt/SystemExit still propagate.
                full_vocab_path = vocab_file

        # The vocabulary must exist before the base initialiser runs.
        with open(full_vocab_path, "r", encoding="utf-8") as f:
            self.vocab = json.load(f)

        self.id_to_token = {v: k for k, v in self.vocab.items()}
        # Longest vocabulary entry; bounds the look-ahead window in _tokenize.
        self._max_token_len = max(map(len, self.vocab), default=0)

        super().__init__(
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            model_max_length=model_max_length,
            **kwargs
        )

    @property
    def vocab_size(self) -> int:
        """Number of entries in the loaded vocabulary."""
        return len(self.vocab)

    def get_vocab(self) -> Dict[str, int]:
        """Return a copy of the ``{token: id}`` vocabulary mapping."""
        return dict(self.vocab)

    def _tokenize(self, text: str) -> List[str]:
        """Split ``text`` into vocabulary tokens by greedy longest match.

        The text is wrapped as ``"<bos> text <eos>"`` and every space is
        replaced by ``'@'`` before matching. A character at which no
        vocabulary entry starts yields one ``unk_token`` and is skipped.

        Args:
            text: Raw input string.

        Returns:
            The list of matched tokens, including the BOS/EOS wrappers.
        """
        text = f'{self.bos_token} {text} {self.eos_token}'.replace(' ', '@')

        vocab = self.vocab
        window = self._max_token_len
        text_len = len(text)

        tokens: List[str] = []
        i = 0
        while i < text_len:
            # Try the longest candidate first; nothing longer than the longest
            # vocabulary entry can ever match.
            for j in range(min(i + window, text_len), i, -1):
                candidate = text[i:j]
                if candidate in vocab:
                    tokens.append(candidate)
                    i = j
                    break
            else:
                tokens.append(self.unk_token)
                i += 1
        return tokens

    def _convert_token_to_id(self, token: str) -> int:
        """Map ``token`` to its id, falling back to the id of ``unk_token``."""
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index: int) -> str:
        """Map ``index`` to its token, falling back to ``unk_token``."""
        return self.id_to_token.get(index, self.unk_token)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """Write the vocabulary as JSON into ``save_directory``.

        Args:
            save_directory: Target directory; created if it does not exist.
            filename_prefix: Optional string prepended verbatim to
                ``"vocab.json"``.

        Returns:
            A one-element tuple containing the path of the written file.
        """
        # exist_ok avoids the check-then-create race of isdir() + makedirs().
        os.makedirs(save_directory, exist_ok=True)

        prefix = filename_prefix if filename_prefix is not None else ""
        vocab_file = os.path.join(save_directory, prefix + "vocab.json")

        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, indent=2, ensure_ascii=False)

        return (vocab_file,)


# Best-effort registration: any failure is deliberately ignored so that
# importing this module never raises.
# NOTE(review): AutoTokenizer.register is documented as taking a config class
# as its first argument; confirm that passing the string "stl_encoder" has the
# intended effect.
try:
    AutoTokenizer.register("stl_encoder", STLTokenizer)
except Exception:
    pass
vocab.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "unk": 0,
3
+ "pad": 1,
4
+ "/s": 2,
5
+ "s": 3,
6
+ "(": 4,
7
+ ")": 5,
8
+ "always": 6,
9
+ "eventually": 7,
10
+ "until": 8,
11
+ "and": 9,
12
+ "or": 10,
13
+ "not": 11,
14
+ ">=": 12,
15
+ "<=": 13,
16
+ ">": 14,
17
+ "<": 15,
18
+ "=": 16,
19
+ "x_": 17,
20
+ "[": 18,
21
+ "]": 19,
22
+ ",": 20,
23
+ "inf": 21,
24
+ "-": 22,
25
+ ".": 23,
26
+ "0": 24,
27
+ "1": 25,
28
+ "2": 26,
29
+ "3": 27,
30
+ "4": 28,
31
+ "5": 29,
32
+ "6": 30,
33
+ "7": 31,
34
+ "8": 32,
35
+ "9": 33,
36
+ "@": 34
37
+ }