Upload src/model.py with huggingface_hub
src/model.py  +17 -10  CHANGED
@@ -5,7 +5,11 @@ Upgrade from BART-base (140M) to Flan-T5-XL (3B).
 - Same style injection architecture (4 style vectors + fusion layer)
 - T5 encoder-decoder is native seq2seq, ideal for rewriting
 - Flan-T5 has instruction-following capability built in
-
+- bf16 training on A100 80GB (NOT fp16 — must match autocast dtype)
+
+v3b fixes:
+- Load model in bfloat16 (was fp16, causing NaN with bf16 autocast)
+- Fusion layer stays in bf16 (no manual dtype casting needed)
 """
 
 import torch
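The bf16-versus-fp16 point in this docstring is easiest to see in a minimal training-step sketch. The Hub ID google/flan-t5-xl, the AdamW optimizer, and the toy batch below are illustrative rather than taken from this repo; the point is that the weights are loaded in bfloat16 and the forward pass runs under a bfloat16 autocast context, so the two dtypes never disagree.

import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the backbone directly in bfloat16 so its parameters match the autocast dtype below.
model = T5ForConditionalGeneration.from_pretrained(
    "google/flan-t5-xl", torch_dtype=torch.bfloat16
).cuda()
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

batch = tokenizer(["rewrite: hello world"], return_tensors="pt").to("cuda")
labels = tokenizer(["hello there, world"], return_tensors="pt").input_ids.to("cuda")

# bf16 autocast: activations run in bfloat16, matching the bf16 weights (no fp16 anywhere).
with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
    loss = model(input_ids=batch.input_ids, attention_mask=batch.attention_mask, labels=labels).loss
loss.backward()
optimizer.step()

Unlike fp16, bf16 training does not need a GradScaler, which is why the loop stays this short.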
@@ -51,9 +55,10 @@ class StyleT5(nn.Module):
         config = T5Config.from_pretrained(model_name)
         config.dropout_rate = dropout
 
+        # CRITICAL: Use bfloat16 to match autocast dtype (was float16 → caused NaN)
         self.t5 = T5ForConditionalGeneration.from_pretrained(
             model_name, config=config,
-            torch_dtype=torch.float16,
+            torch_dtype=torch.bfloat16,
         )
         self.tokenizer = T5Tokenizer.from_pretrained(model_name)
 
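As a quick check of what the torch_dtype argument in this hunk changes (the Hub ID is assumed, as above): without it, from_pretrained materializes the T5 weights in float32.

import torch
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained(
    "google/flan-t5-xl", torch_dtype=torch.bfloat16
)
# Every loaded parameter is bfloat16; drop torch_dtype and this prints {torch.float32}.
print({p.dtype for p in model.parameters()})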
@@ -63,7 +68,7 @@ class StyleT5(nn.Module):
         self.dropout_rate = dropout
         self.model_name_str = model_name
 
-        # 4 trainable style embeddings
+        # 4 trainable style embeddings
         self.style_embeddings = nn.ParameterDict({
            'human_ps': nn.Parameter(torch.randn(style_dim) * 0.02),
            'human_supp': nn.Parameter(torch.randn(style_dim) * 0.02),
@@ -71,7 +76,7 @@ class StyleT5(nn.Module):
            'ai_supp': nn.Parameter(torch.randn(style_dim) * 0.02),
         })
 
-        # Style fusion layer
+        # Style fusion layer
         self.fusion = StyleFusionLayer(hidden_dim, style_dim, dropout=dropout)
 
     def get_style_embedding(self, style_keys: list) -> torch.Tensor:
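StyleFusionLayer itself is not part of this diff, so the sketch below is only a hypothetical stand-in with the same constructor and call signature (concatenation plus a linear projection is an assumption, not the repo's implementation). It also shows why the forward-pass hunk further down casts the fusion layer: a module built like this defaults to float32 parameters even when the T5 backbone was loaded in bfloat16.

import torch
import torch.nn as nn

class StyleFusionLayerSketch(nn.Module):
    """Hypothetical stand-in for StyleFusionLayer; the real implementation is not shown in this commit."""

    def __init__(self, hidden_dim: int, style_dim: int, dropout: float = 0.1):
        super().__init__()
        self.proj = nn.Linear(hidden_dim + style_dim, hidden_dim)  # float32 by default
        self.dropout = nn.Dropout(dropout)

    def forward(self, hidden_states: torch.Tensor, style_emb: torch.Tensor) -> torch.Tensor:
        # Broadcast the style vector over the sequence dimension, then fuse by concat + projection.
        batch, seq_len, _ = hidden_states.shape
        style = style_emb.expand(batch, -1).unsqueeze(1).expand(-1, seq_len, -1)
        return self.dropout(self.proj(torch.cat([hidden_states, style], dim=-1)))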
@@ -86,11 +91,13 @@ class StyleT5(nn.Module):
         )
         hidden_states = encoder_output.last_hidden_state
 
-        #
-
-
-
-
+        # Get style embedding and cast to same dtype as hidden states
+        style_emb = self.get_style_embedding(style_keys).to(hidden_states.dtype)
+
+        # Cast fusion layer to same dtype (it may be fp32 from init)
+        self.fusion = self.fusion.to(hidden_states.dtype)
+
+        fused = self.fusion(hidden_states, style_emb)
 
         encoder_output.last_hidden_state = fused
         return encoder_output
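The .to(hidden_states.dtype) casts added in this hunk guard against the usual mixed-dtype failure: applying a float32 module to bfloat16 activations raises a runtime dtype-mismatch error. A small standalone illustration:

import torch
import torch.nn as nn

hidden = torch.randn(2, 4, dtype=torch.bfloat16)
fusion = nn.Linear(4, 4)  # freshly built modules hold float32 parameters
try:
    fusion(hidden)
except RuntimeError as err:
    print(err)  # dtype mismatch between bf16 input and fp32 weight

# Casting the module first, as the forward pass above now does, makes the dtypes agree.
print(fusion.to(hidden.dtype)(hidden).dtype)  # torch.bfloat16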
@@ -125,7 +132,7 @@ class StyleT5(nn.Module):
         import os
         os.makedirs(path, exist_ok=True)
 
-        # Save T5 model
+        # Save T5 model
         self.t5.save_pretrained(os.path.join(path, 't5'))
         self.tokenizer.save_pretrained(os.path.join(path, 't5'))
 
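Because the backbone and tokenizer are written into a 't5' subdirectory, they can be reloaded straight from that folder with from_pretrained. The checkpoint path below is illustrative, and restoring the style embeddings and fusion layer is outside this snippet.

import os
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

ckpt = "checkpoints/style_t5_v3b"  # illustrative path, not from the repo
t5 = T5ForConditionalGeneration.from_pretrained(
    os.path.join(ckpt, "t5"), torch_dtype=torch.bfloat16
)
tokenizer = T5Tokenizer.from_pretrained(os.path.join(ckpt, "t5"))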