saracandu committed
Commit 2ed168e · verified · 1 Parent(s): 7d8f5e5

Training in progress, step 23750

Files changed (2)
  1. model.safetensors +1 -1
  2. modeling_stlenc.py +21 -18
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a6bd3ebd29b873e4e06d10a6b58a955c0e341adb2cbfb4fb8256a6ead9f81b3c
+oid sha256:44b55f92fa7fc67b6a482ca1c47995642b88f2f2032baa4be347f71af6fb62c0
 size 611079728
modeling_stlenc.py CHANGED
@@ -10,40 +10,43 @@ class STLEncoderModel(PreTrainedModel):
         super().__init__(config)
         self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
         self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)

         encoder_layer = nn.TransformerEncoderLayer(
             d_model=config.hidden_size,
             nhead=config.num_attention_heads,
             dim_feedforward=config.intermediate_size,
-            activation="gelu",
+            activation="gelu",  # GELU is the standard choice for modern Transformers
             batch_first=True
         )
         self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=config.num_hidden_layers)

-        # 1024 -> 512 (Bottleneck) -> Target (1024 or 512)
+        # --- ARCHITECTURAL UPGRADE ---
+        # Build a deeper projection head (MLP)
         self.projector = nn.Sequential(
+            nn.Linear(config.hidden_size, config.hidden_size),
+            nn.GELU(),
+            nn.LayerNorm(config.hidden_size),
+            nn.Dropout(0.1),  # helps avoid overfitting on the numeric constants
             nn.Linear(config.hidden_size, config.hidden_size // 2),
             nn.GELU(),
-            nn.LayerNorm(config.hidden_size // 2),
             nn.Linear(config.hidden_size // 2, config.embedding_dim_target)
         )
+        # ------------------------------------
+
         self.post_init()

     def forward(self, input_ids, attention_mask=None, **kwargs):
         batch_size, seq_length = input_ids.size()
-        position_ids = torch.arange(seq_length, device=input_ids.device).unsqueeze(0).expand(batch_size, seq_length)
-
+        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
+        position_ids = position_ids.unsqueeze(0).expand(batch_size, seq_length)
         x = self.embeddings(input_ids) + self.position_embeddings(position_ids)
+        # Padding mask (TransformerEncoder expects True at positions it must NOT attend to)
         padding_mask = (attention_mask == 0) if attention_mask is not None else None
-
+        # Encode the sequences
         sequence_output = self.encoder(x, src_key_padding_mask=padding_mask)
-
-        if attention_mask is not None:
-            mask = attention_mask.unsqueeze(-1).expand(sequence_output.size()).float()
-            sum_embeddings = torch.sum(sequence_output * mask, 1)
-            sum_mask = torch.clamp(mask.sum(1), min=1e-9)
-            emb = sum_embeddings / sum_mask
-        else:
-            emb = sequence_output.mean(dim=1)
-
-        return self.projector(emb)
+        # Take the CLS token (index 0)
+        cls_token = sequence_output[:, 0, :]
+        # Pass it through the non-linear projection head
+        # (the final Tanh is removed to let the scalar kernel breathe)
+        pooled_output = self.projector(cls_token)
+        return pooled_output
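
For reference, below is a minimal, self-contained sketch of the new forward path introduced by this commit: a boolean padding mask built from attention_mask, CLS-style pooling at index 0, and the deeper MLP projector. The hidden size, head count, layer count, target dimension, and batch shapes are illustrative placeholders, not values read from this repository's config, and the sketch assumes the tokenizer places a CLS-like token at position 0.

import torch
import torch.nn as nn

# Illustrative dimensions (assumptions, not the repo's actual config values)
hidden_size, target_dim = 1024, 512

encoder_layer = nn.TransformerEncoderLayer(
    d_model=hidden_size, nhead=8, dim_feedforward=4096,
    activation="gelu", batch_first=True,
)
encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)

# Deeper projection head, mirroring the structure added in this commit
projector = nn.Sequential(
    nn.Linear(hidden_size, hidden_size),
    nn.GELU(),
    nn.LayerNorm(hidden_size),
    nn.Dropout(0.1),
    nn.Linear(hidden_size, hidden_size // 2),
    nn.GELU(),
    nn.Linear(hidden_size // 2, target_dim),
)

# Fake token embeddings for two sequences of length 16 (stand-in for embeddings + position embeddings)
x = torch.randn(2, 16, hidden_size)
attention_mask = torch.ones(2, 16, dtype=torch.long)
attention_mask[1, 10:] = 0                 # second sequence is padded after 10 tokens

# src_key_padding_mask is True where the encoder must NOT attend
padding_mask = attention_mask == 0
sequence_output = encoder(x, src_key_padding_mask=padding_mask)

cls_token = sequence_output[:, 0, :]       # CLS pooling at index 0
embedding = projector(cls_token)           # shape: (batch, target_dim)
print(embedding.shape)                     # torch.Size([2, 512])

Compared to the previous masked mean pooling, CLS pooling makes the summary vector depend on a single learned position; how well that works here depends on how the CLS token is used during training.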