EvolphTech
/

Wildnerve-tlm01_Hybrid_Model

Text Generation

wildnerve_tlm01

Model card · Files and versions

xet

Community

WildnerveAI committed on May 16, 2025

Commit

1acab08

verified ·

1 Parent(s): 2de4a17

Upload 3 files

Browse files

Files changed (3) hide show

config.py +6 -4
model_Custm.py +22 -21
model_manager.py +2 -3

config.py CHANGED Viewed

@@ -511,12 +511,14 @@ def get_model_architecture_params():
     """Get model architecture parameters from config file"""
     if hasattr(app_config, "TRANSFORMER_CONFIG"):
         tc = app_config.TRANSFORMER_CONFIG
         return {
             "vocab_size": getattr(tc, "VOCAB_SIZE", 50257),
-            "embedding_dim": getattr(tc, "EMBEDDING_DIM", 768),
-            "num_heads": getattr(tc, "NUM_HEADS", 12),
-            "hidden_dim": getattr(tc, "HIDDEN_DIM", 768),
-            "num_layers": getattr(tc, "NUM_LAYERS", 12),
             "output_size": getattr(tc, "VOCAB_SIZE", 50257),
             "dropout": getattr(tc, "DROPOUT", 0.1),
             "max_seq_length": getattr(tc, "MAX_SEQ_LENGTH", 512)

     """Get model architecture parameters from config file"""
     if hasattr(app_config, "TRANSFORMER_CONFIG"):
         tc = app_config.TRANSFORMER_CONFIG
+        # CRITICAL: Ensure we ALWAYS get 768 for embedding_dim and hidden_dim
+        # This avoids issues with dimension mismatches between 512 and 768
         return {
             "vocab_size": getattr(tc, "VOCAB_SIZE", 50257),
+            "embedding_dim": 768,  # Fixed to 768 to prevent mismatches
+            "num_heads": 12,       # 12 heads works with 768 (768/12=64)
+            "hidden_dim": 768,     # Fixed to 768 to prevent mismatches
+            "num_layers": getattr(tc, "NUM_LAYERS", 12),
             "output_size": getattr(tc, "VOCAB_SIZE", 50257),
             "dropout": getattr(tc, "DROPOUT", 0.1),
             "max_seq_length": getattr(tc, "MAX_SEQ_LENGTH", 512)

model_Custm.py CHANGED Viewed

@@ -283,19 +283,17 @@ class Wildnerve_tlm01(nn.Module, AbstractModel):
     def forward(
         self,
-        src: torch.Tensor = None,
-        tgt: Optional[torch.Tensor] = None,
-        token_type_ids: Optional[torch.Tensor] = None,
-        src_mask: Optional[torch.Tensor] = None,  # Make sure to include this parameter
-        tgt_mask: Optional[torch.Tensor] = None,
-        src_key_padding_mask: Optional[torch.Tensor] = None,
-        tgt_key_padding_mask: Optional[torch.Tensor] = None,
-        return_sequence: bool = False,
-        # Add Hugging Face compatibility parameters
-        input_ids: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        labels: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, Tuple[torch.Tensor, ...], ModelOutput]:
         try:
             # Log input shapes for debugging
             logger.info(f"Input shapes - src: {src.shape if src is not None else None}, tgt: {tgt.shape if tgt is not None else None}")
@@ -312,13 +310,8 @@ class Wildnerve_tlm01(nn.Module, AbstractModel):
             src_embeddings = self.pos_encoder(src_embeddings)
             # Pass through encoder layers
-            memory = src_embeddings
-            # Ensure memory maintains 3 dimensions [batch_size, seq_length, hidden_dim]
-            if memory.dim() == 2:
-                memory = memory.unsqueeze(1)
-            # Use self.transformer_encoder instead of self.encoder_layers (which doesn't exist)
-            encoded_src = self.transformer_encoder(memory)
             if src.size(1) > 256 and hasattr(self, 'hybrid_attention'):
                 # Prepare inputs for hybrid attention
@@ -347,7 +340,15 @@ class Wildnerve_tlm01(nn.Module, AbstractModel):
                 encoded_src = hybrid_outputs
             # Pass through decoder layers
-            output = encoded_src
             # Ensure output maintains 3 dimensions [batch_size, seq_length, hidden_dim]
             if output.dim() == 2:
                 output = output.unsqueeze(1)

     def forward(
         self,
+        input_ids=None,
+        attention_mask=None,
+        labels=None,
+        src=None,
+        tgt=None,
+        src_key_padding_mask=None,
+        tgt_key_padding_mask=None,
+        memory_key_padding_mask=None,
+        return_sequence=False,
+        **kwargs
+    ):
         try:
             # Log input shapes for debugging
             logger.info(f"Input shapes - src: {src.shape if src is not None else None}, tgt: {tgt.shape if tgt is not None else None}")
             src_embeddings = self.pos_encoder(src_embeddings)
             # Pass through encoder layers
+            memory = self.transformer_encoder(src_embeddings,
+                src_key_padding_mask=src_key_padding_mask)
             if src.size(1) > 256 and hasattr(self, 'hybrid_attention'):
                 # Prepare inputs for hybrid attention
                 encoded_src = hybrid_outputs
             # Pass through decoder layers
+            if tgt is not None:
+                tgt_embeddings = self.tgt_embedding(tgt)
+                tgt_embeddings = self.pos_decoder(tgt_embeddings)
+                output = self.transformer_decoder(tgt_embeddings, memory,
+                    tgt_key_padding_mask=tgt_key_padding_mask,
+                    memory_key_padding_mask=memory_key_padding_mask)
+            else:
+                output = memory
             # Ensure output maintains 3 dimensions [batch_size, seq_length, hidden_dim]
             if output.dim() == 2:
                 output = output.unsqueeze(1)

model_manager.py CHANGED Viewed

@@ -344,7 +344,7 @@ class ModelManager:
         # Create embedding for input text
         input_embedding = self.embedding_model.encode(input_text)
-        # NEW: Process input through SmartHybridAttention for enhanced understanding
         if hasattr(self, 'smart_attention') and self.smart_attention:
             try:
                 # Convert embedding to tensor format needed by attention
@@ -352,8 +352,7 @@ class ModelManager:
                 input_tensor = torch.tensor(input_embedding).unsqueeze(0).unsqueeze(0)  # [1, 1, dim]
                 # Process through attention mechanism to extract key patterns
-                # This helps identify which parts of input are most relevant
-                enhanced = self.smart_attention(
                     query=input_tensor,
                     key=input_tensor,
                     value=input_tensor

         # Create embedding for input text
         input_embedding = self.embedding_model.encode(input_text)
+        # Process input through SmartHybridAttention for enhanced understanding
         if hasattr(self, 'smart_attention') and self.smart_attention:
             try:
                 # Convert embedding to tensor format needed by attention
                 input_tensor = torch.tensor(input_embedding).unsqueeze(0).unsqueeze(0)  # [1, 1, dim]
                 # Process through attention mechanism to extract key patterns
+                enhanced, _ = self.smart_attention(  # FIXED: Properly unpack tuple
                     query=input_tensor,
                     key=input_tensor,
                     value=input_tensor