Training in progress, step 23750
- model.safetensors +1 -1
- modeling_stlenc.py +21 -18
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:44b55f92fa7fc67b6a482ca1c47995642b88f2f2032baa4be347f71af6fb62c0
 size 611079728
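A minimal sketch (not part of the commit) for checking that a locally downloaded model.safetensors matches the new LFS pointer above; the file path is an assumption, and only the standard-library hashlib is used.

import hashlib

EXPECTED_OID = "44b55f92fa7fc67b6a482ca1c47995642b88f2f2032baa4be347f71af6fb62c0"
EXPECTED_SIZE = 611079728

def verify_lfs_pointer(path: str) -> bool:
    # Stream the file so the 611 MB weights never need to fit in memory at once.
    h = hashlib.sha256()
    size = 0
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
            size += len(chunk)
    # Both the digest and the byte count must match the pointer.
    return h.hexdigest() == EXPECTED_OID and size == EXPECTED_SIZE

# verify_lfs_pointer("model.safetensors")  # True if the download is intact (path is hypothetical)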
modeling_stlenc.py
CHANGED
@@ -10,40 +10,43 @@ class STLEncoderModel(PreTrainedModel):
         super().__init__(config)
         self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
         self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
-
+
         encoder_layer = nn.TransformerEncoderLayer(
             d_model=config.hidden_size,
             nhead=config.num_attention_heads,
             dim_feedforward=config.intermediate_size,
-            activation="gelu",
+            activation="gelu",  # GELU is the standard activation for modern Transformers
             batch_first=True
         )
         self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=config.num_hidden_layers)
-
-        #
+
+        # --- ARCHITECTURAL UPGRADE ---
+        # Build a deep projection head (MLP)
         self.projector = nn.Sequential(
+            nn.Linear(config.hidden_size, config.hidden_size),
+            nn.GELU(),
+            nn.LayerNorm(config.hidden_size),
+            nn.Dropout(0.1),  # Helps avoid overfitting on the numeric constants
             nn.Linear(config.hidden_size, config.hidden_size // 2),
             nn.GELU(),
-            nn.LayerNorm(config.hidden_size // 2),
             nn.Linear(config.hidden_size // 2, config.embedding_dim_target)
         )
+        # ------------------------------------
+
         self.post_init()

     def forward(self, input_ids, attention_mask=None, **kwargs):
         batch_size, seq_length = input_ids.size()
-        position_ids = torch.arange(seq_length, device=input_ids.device)
-
+        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
+        position_ids = position_ids.unsqueeze(0).expand(batch_size, seq_length)
         x = self.embeddings(input_ids) + self.position_embeddings(position_ids)
+        # Padding mask (TransformerEncoder expects True where it must NOT attend)
         padding_mask = (attention_mask == 0) if attention_mask is not None else None
+        # Encode the sequences
         sequence_output = self.encoder(x, src_key_padding_mask=padding_mask)
-
-
-
-
-
-
-        else:
-            emb = sequence_output.mean(dim=1)
-
-        return self.projector(emb)
+        # Take the CLS token (index 0)
+        cls_token = sequence_output[:, 0, :]
+        # Pass through the non-linear projection head
+        # The final Tanh is removed so the scalar kernel has room to breathe
+        pooled_output = self.projector(cls_token)
+        return pooled_output
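For reference, a self-contained sketch of the updated forward path as it reads after this commit: explicit position ids, a boolean padding mask, CLS pooling at index 0, and the deeper projection head. The concrete sizes (hidden_size=768, 12 heads, intermediate_size=3072, embedding_dim_target=128, reduced depth) are assumptions for the sketch only; the real values live in the repo's config.

import torch
import torch.nn as nn

hidden_size, num_heads, ffn_size, target_dim = 768, 12, 3072, 128  # assumed sizes

encoder_layer = nn.TransformerEncoderLayer(
    d_model=hidden_size, nhead=num_heads, dim_feedforward=ffn_size,
    activation="gelu", batch_first=True,
)
encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)  # depth reduced for the sketch
projector = nn.Sequential(
    nn.Linear(hidden_size, hidden_size), nn.GELU(), nn.LayerNorm(hidden_size), nn.Dropout(0.1),
    nn.Linear(hidden_size, hidden_size // 2), nn.GELU(),
    nn.Linear(hidden_size // 2, target_dim),
)
embeddings = nn.Embedding(1000, hidden_size)          # stand-in vocab size
position_embeddings = nn.Embedding(512, hidden_size)  # stand-in max positions

input_ids = torch.randint(0, 1000, (2, 16))
attention_mask = torch.ones(2, 16, dtype=torch.long)
attention_mask[1, 10:] = 0  # second sequence is padded after 10 tokens

batch_size, seq_length = input_ids.size()
position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand(batch_size, seq_length)
x = embeddings(input_ids) + position_embeddings(position_ids)
padding_mask = attention_mask == 0        # True where the encoder must NOT attend
hidden = encoder(x, src_key_padding_mask=padding_mask)
cls_token = hidden[:, 0, :]               # CLS pooling replaces the old mean pooling
print(projector(cls_token).shape)         # torch.Size([2, 128])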
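A hedged usage sketch for the checkpoint at this step. The repo id is a placeholder, and it assumes the repo's config.json registers STLEncoderModel under auto_map so that AutoModel with trust_remote_code picks up modeling_stlenc.py.

import torch
from transformers import AutoModel

repo_id = "<user>/<stl-encoder-repo>"  # hypothetical repo id
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
model.eval()

input_ids = torch.randint(0, model.config.vocab_size, (1, 32))
attention_mask = torch.ones_like(input_ids)
with torch.no_grad():
    emb = model(input_ids=input_ids, attention_mask=attention_mask)
print(emb.shape)  # (1, config.embedding_dim_target)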