EvolphTech
/

Wildnerve-tlm01_Hybrid_Model

Text Generation

wildnerve_tlm01

Model card | Files and versions

xet

Community

WildnerveAI committed on May 16, 2025

Commit

2de4a17

verified ·

1 Parent(s): b9e3afb

Upload 2 files

Browse files

Files changed (2) hide show

model_Custm.py +76 -261
model_manager.py +28 -1

model_Custm.py CHANGED Viewed

@@ -263,6 +263,9 @@ class Wildnerve_tlm01(nn.Module, AbstractModel):
         self.classifier = nn.Linear(embedding_dim, self.vocab_size)
         self.dropout_layer = nn.Dropout(dropout)
         self.init_weights()
     def init_weights(self) -> None:
@@ -280,17 +283,19 @@ class Wildnerve_tlm01(nn.Module, AbstractModel):
     def forward(
         self,
-        input_ids=None,
-        attention_mask=None,
-        labels=None,
-        src=None,
-        tgt=None,
-        src_key_padding_mask=None,
-        tgt_key_padding_mask=None,
-        memory_key_padding_mask=None,
-        return_sequence=False,
-        **kwargs
-    ):
         try:
             # Log input shapes for debugging
             logger.info(f"Input shapes - src: {src.shape if src is not None else None}, tgt: {tgt.shape if tgt is not None else None}")
@@ -308,288 +313,98 @@ class Wildnerve_tlm01(nn.Module, AbstractModel):
             # Pass through encoder layers
             memory = src_embeddings
-            for enc_layer in self.encoder_layers:
-                memory = enc_layer(memory)
-            # Pass through decoder layers (ensuring we maintain 3D shape)
-            # This maintains [batch_size, seq_length, hidden_dim]
-            output = memory
-            for dec_layer in self.decoder_layers:
-                output = dec_layer(output)
-            # Apply final projection to vocabulary space
-            # This should result in [batch_size, seq_length, vocab_size]
-            output = self.final_layer(output)
-            # CRITICAL: Ensure we keep the 3D shape for language modeling
-            # Check if we have a 2D tensor and reshape if needed
-            if output.dim() == 2:
-                # If 2D tensor [batch_size, vocab_size], reshape to 3D [batch_size, 1, vocab_size]
-                batch_size, vocab_size = output.shape
-                logger.info(f"Reshaping 2D output {output.shape} to 3D tensor")
-                output = output.unsqueeze(1)  # Add sequence dimension
-                logger.info(f"Reshaped output: {output.shape}")
-            # Record the output shape and dimensions for debugging
-            logger.info(f"Output shape: {output.shape}, dimensions: {output.dim()}")
-            # Calculate loss if labels are provided
-            loss = None
-            if labels is not None:
-                # Check output shape
-                if output.dim() == 3:  # [batch_size, seq_length, vocab_size]
-                    batch_size, seq_length, vocab_size = output.shape
-                    logger.info(f"3D tensor: batch_size={batch_size}, seq_length={seq_length}, vocab_size={vocab_size}")
-                    # Check if labels are flattened
-                    expected_flattened_size = batch_size * seq_length
-                    is_flattened_labels = (labels.dim() == 1 and labels.size(0) == expected_flattened_size)
-                    if is_flattened_labels:
-                        # Reshape output to match flattened labels
-                        output_reshaped = output.reshape(-1, vocab_size)
-                        loss_fct = nn.CrossEntropyLoss()
-                        loss = loss_fct(output_reshaped, labels)
-                    else:
-                        # If labels are 2D [batch_size, seq_length], reshape them
-                        if labels.dim() == 2:
-                            # Need to reshape labels to 1D for CrossEntropyLoss
-                            labels_reshaped = labels.reshape(-1)
-                            output_reshaped = output.reshape(-1, vocab_size)
-                            loss_fct = nn.CrossEntropyLoss()
-                            loss = loss_fct(output_reshaped, labels_reshaped)
-                        else:
-                            loss_fct = nn.CrossEntropyLoss()
-                            loss = loss_fct(output.view(-1, vocab_size), labels.view(-1))
-                elif output.dim() == 2:  # [batch_size, vocab_size]
-                    batch_size, vocab_size = output.shape
-                    logger.info(f"2D tensor: batch_size={batch_size}, vocab_size={vocab_size}")
-                    # Handle 2D output with 1D labels (single prediction per sequence)
-                    if labels.dim() == 1 and labels.size(0) == batch_size:
-                        loss_fct = nn.CrossEntropyLoss()
-                        loss = loss_fct(output, labels)
-                    else:
-                        # If we have 1D labels but with seq_length*batch_size elements
-                        logger.warning(f"Label shape {labels.shape} incompatible with output {output.shape}")
-                        # Take just the first token prediction for each sequence
-                        labels_reshaped = labels.view(batch_size, -1)[:, 0]
-                        loss_fct = nn.CrossEntropyLoss()
-                        loss = loss_fct(output, labels_reshaped)
-            # Return the proper format
-            if loss is not None:
-                logger.info(f"Returning loss tensor: {loss.item()}")
-                return loss, output
-            else:
-                return output
-        except Exception as e:
-            logger.error(f"Error in forward pass: {str(e)}")
-            logger.error(f"Traceback: {traceback.format_exc()}")
-            # Log input shapes for debugging
-            logger.error(f"Input shapes - src: {src.shape if src is not None else None}, input_ids: {input_ids.shape if input_ids is not None else None}")
-            # Ensure we return a proper tuple with correct types even in error case
-            dummy_output = torch.zeros(1)
-            dummy_loss = torch.tensor(float('nan'))
-            return dummy_loss, dummy_output
-    def forward(
-        self,
-        src: torch.Tensor = None,
-        tgt: Optional[torch.Tensor] = None,
-        token_type_ids: Optional[torch.Tensor] = None,
-        src_mask: Optional[torch.Tensor] = None,
-        tgt_mask: Optional[torch.Tensor] = None,
-        src_key_padding_mask: Optional[torch.Tensor] = None,
-        tgt_key_padding_mask: Optional[torch.Tensor] = None,
-        return_sequence: bool = False,
-        # Add Hugging Face compatibility parameters
-        input_ids: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        labels: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, Tuple[torch.Tensor, ...], ModelOutput]:
-        try:
-            # Add this at the start of the forward method to log input shapes
-            logger.info(f"Input shapes - src: {src.shape if src is not None else None}, tgt: {tgt.shape if tgt is not None else None}")
-            # Use Hugging Face parameters if provided
-            if src is None and input_ids is not None:
-                src = input_ids
-            if src_key_padding_mask is None and attention_mask is not None:
-                src_key_padding_mask = attention_mask
-            # Handle input shape - our layers expect batch_first=True format
-            if src.dim() == 2:
-                # src is already [batch_size, seq_len]
-                pass
-            elif src.dim() == 3 and src.size(0) > src.size(1):
-                # src is [seq_len, batch_size, dim] - need to transpose
-                src = src.transpose(0, 1)
-            # ----------------------------
-            # Encoder: Custom processing of source
-            # ----------------------------
-            src_emb = self.embedding(src) * math.sqrt(self.embedding_dim)
-            src_emb = self.pos_encoder(src_emb.transpose(0, 1)).transpose(0, 1)  # Apply positional encoding
-            # Use hybrid attention if sequence length is above the threshold
             if src.size(1) > 256 and hasattr(self, 'hybrid_attention'):
                 # Prepare inputs for hybrid attention
-                query = src_emb.transpose(0, 1)  # Ensure shape is [seq_len, batch, dim]
                 key = query
                 value = query
-                # Apply smart hybrid attention - FIX: properly handle any return format
                 hybrid_outputs = self.hybrid_attention(
                     query=query,
                     key=key,
                     value=value,
                     key_padding_mask=src_key_padding_mask,
-                    attn_mask=src_mask,
                     prompt_length=src.size(1),
-                    prompt_complexity=0.5  # Default value, can be computed based on input
                 )
-                # FIX: Handle all possible return types from hybrid_attention
-                if isinstance(hybrid_outputs, tuple):
-                    # If it returns a tuple, the first element is the attended output
-                    attended_output = hybrid_outputs[0]
-                    logger.debug(f"Hybrid attention returned tuple of length {len(hybrid_outputs)}")
-                else:
-                    # If it returns a tensor directly
-                    attended_output = hybrid_outputs
-                    logger.debug("Hybrid attention returned single tensor")
-                # Convert back to expected format
-                encoded_src = attended_output.transpose(0, 1)
-            else:
-                # Use standard transformer encoder for shorter sequences
-                encoded_src = self.transformer_encoder(src_emb, mask=src_mask, src_key_padding_mask=src_key_padding_mask)
-            # Process through adapter layer
-            adapted = self.adapter(encoded_src)
-            # ----------------------------
-            # Decoder / Output
-            # ----------------------------
-            if tgt is not None:
-                # Handle tgt shape for batch_first format
-                if tgt.dim() == 2:
-                    # tgt is already [batch_size, seq_len]
-                    pass
-                elif tgt.dim() == 3 and tgt.size(0) > tgt.size(1):
-                    # tgt is [seq_len, batch_size, dim] - need to transpose
-                    tgt = tgt.transpose(0, 1)
-                tgt_emb = self.tgt_embedding(tgt) * math.sqrt(self.embedding_dim)
-                tgt_emb = self.pos_decoder(tgt_emb.transpose(0, 1)).transpose(0, 1)  # Apply positional encoding
-                decoded = self.transformer_decoder(
-                    tgt_emb,
-                    adapted,
-                    tgt_mask=tgt_mask,
-                    memory_key_padding_mask=src_key_padding_mask,
-                    tgt_key_padding_mask=tgt_key_padding_mask
-                )
-                output = self.classifier(decoded)  # [batch_size, seq_len, output_size]
-                if not return_sequence:
-                    output = output.mean(dim=1)  # Average over sequence dimension
-            else:
-                # For encoder-only tasks (e.g., classification)
-                if self.pooling_mode == "mean":
-                    pooled = encoded_src.mean(dim=1)
-                elif self.pooling_mode == "max":
-                    pooled = torch.max(encoded_src, dim=1)[0]
-                elif self.pooling_mode == "cls":
-                    pooled = encoded_src[:, 0]  # Use first token (CLS) - batch_first format
-                else:
-                    pooled = encoded_src.mean(dim=1)
-                pooled = self.dropout_layer(pooled)
-                output = self.classifier(pooled)
             # Calculate loss if labels are provided
             loss = None
             if labels is not None:
-                # More defensive shape handling - check shape dimensions first
-                if not isinstance(output, torch.Tensor):
-                    logger.error(f"Output is not a tensor, got {type(output)}")
-                    return torch.tensor(0.0), output  # Return dummy loss
-                # Check output shape and handle multiple possible dimensions
-                output_dim = output.dim()
-                logger.info(f"Output shape: {output.shape}, dimensions: {output_dim}")
-                if output_dim == 3:  # [batch_size, seq_length, vocab_size]
-                    batch_size, seq_length, vocab_size = output.shape
-                    # Debug logging
-                    logger.info(f"3D tensor: batch_size={batch_size}, seq_length={seq_length}, vocab_size={vocab_size}")
-                    # Check labels shape
-                    expected_flattened_size = batch_size * seq_length
-                    is_flattened_labels = (labels.dim() == 1 and labels.size(0) == expected_flattened_size)
-                    # Reshape output from [batch_size, seq_length, vocab_size] to [batch_size*seq_length, vocab_size]
-                    output_reshaped = output.reshape(-1, vocab_size)
-                    # Calculate loss
-                    loss_fct = nn.CrossEntropyLoss()
-                    loss = loss_fct(output_reshaped, labels)
-                elif output_dim == 2:  # [batch_size, vocab_size]
-                    # Already shaped for loss calculation
-                    batch_size, vocab_size = output.shape
-                    logger.info(f"2D tensor: batch_size={batch_size}, vocab_size={vocab_size}")
-                    # Need to reshape labels to 1D - this is the critical fix
-                    if labels.dim() > 1:  # If labels are multi-dimensional
-                        # Language modeling usually needs the last token prediction
-                        # Get the last token's label from each sequence
-                        labels = labels[:, -1]  # Take the last token from each sequence
-                        logger.info(f"Reshaped labels to {labels.shape}")
-                    # Now calculate loss with properly shaped tensors
-                    loss_fct = nn.CrossEntropyLoss()
-                    loss = loss_fct(output, labels)
-                else:
-                    logger.error(f"Unexpected output dimensions: {output_dim}")
-                    # Create a dummy loss to avoid breaking the training loop
-                    loss = torch.tensor(0.0, requires_grad=True, device=output.device)
             # Return the proper format
             if loss is not None:
-                logger.info(f"Returning loss tensor: {loss.item()}")
                 return loss, output
             else:
                 return output
         except Exception as e:
-            # Detailed error logging for debugging
-            logger.error(f"Error in forward pass: {e}")
             logger.error(f"Traceback: {traceback.format_exc()}")
-            logger.error(f"Input shapes - src: {src.shape if src is not None else None}, "
-                       f"input_ids: {input_ids.shape if input_ids is not None else None}")
-            # Create minimal output to prevent cascading errors, matching expected return format
-            batch_size = src.shape[0] if src is not None else (input_ids.shape[0] if input_ids is not None else 1)
-            dummy_output = torch.zeros((batch_size, self.output_size), device=next(self.parameters()).device)
-            # Return in expected format to avoid "too many values to unpack" errors
-            if labels is not None:
-                # Match (loss, logits) format
-                dummy_loss = torch.tensor(999.0, device=next(self.parameters()).device)
-                return (dummy_loss, dummy_output)
-            else:
-                # Match object with logits attribute
-                class SimpleModelOutput:
-                    def __init__(self, logits):
-                        self.logits = logits
-                return SimpleModelOutput(dummy_output)
     # Add sentence transformer methods
     def encode_sentences(self, sentences, batch_size=32, normalize_embeddings=True):
         """Encode sentences into vectors (sentence transformer functionality)"""

         self.classifier = nn.Linear(embedding_dim, self.vocab_size)
         self.dropout_layer = nn.Dropout(dropout)
+        # Standard linear projection: maps only the last dimension (hidden_dim -> vocab_size) and preserves the leading dimensions, so 3D input stays 3D:
+        self.final_layer = nn.Linear(hidden_dim, vocab_size)
         self.init_weights()
     def init_weights(self) -> None:
     def forward(
         self,
+        src: torch.Tensor = None,
+        tgt: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        src_mask: Optional[torch.Tensor] = None,  # Make sure to include this parameter
+        tgt_mask: Optional[torch.Tensor] = None,
+        src_key_padding_mask: Optional[torch.Tensor] = None,
+        tgt_key_padding_mask: Optional[torch.Tensor] = None,
+        return_sequence: bool = False,
+        # Add Hugging Face compatibility parameters
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, ...], ModelOutput]:
         try:
             # Log input shapes for debugging
             logger.info(f"Input shapes - src: {src.shape if src is not None else None}, tgt: {tgt.shape if tgt is not None else None}")
             # Pass through encoder layers
             memory = src_embeddings
+            # Ensure memory maintains 3 dimensions [batch_size, seq_length, hidden_dim]
+            if memory.dim() == 2:
+                memory = memory.unsqueeze(1)
+            # Use self.transformer_encoder instead of self.encoder_layers (which doesn't exist)
+            encoded_src = self.transformer_encoder(memory)
             if src.size(1) > 256 and hasattr(self, 'hybrid_attention'):
                 # Prepare inputs for hybrid attention
+                query = src_embeddings.transpose(0, 1)
                 key = query
                 value = query
+                # IMPORTANT: Initialize src_mask if it's None
+                if src_mask is None and src is not None:
+                    # Create a default mask that allows all tokens to attend to all other tokens
+                    src_seq_len = src.size(1)
+                    src_mask = torch.zeros((src_seq_len, src_seq_len), device=src.device, dtype=torch.bool)
+                # Actually using the hybrid attention here!
                 hybrid_outputs = self.hybrid_attention(
                     query=query,
                     key=key,
                     value=value,
                     key_padding_mask=src_key_padding_mask,
+                    attn_mask=src_mask,  # Now src_mask is properly defined
                     prompt_length=src.size(1),
+                    prompt_complexity=0.5
                 )
+                # Process the hybrid attention outputs
+                encoded_src = hybrid_outputs
+            # Pass through decoder layers
+            output = encoded_src
+            # Ensure output maintains 3 dimensions [batch_size, seq_length, hidden_dim]
+            if output.dim() == 2:
+                output = output.unsqueeze(1)
+            # Apply final projection to vocabulary space
+            output = self.final_layer(output)
+            # CRITICAL: Ensure output is always 3D [batch_size, seq_length, vocab_size]
+            if output.dim() == 2:
+                # If 2D tensor [batch_size, vocab_size], reshape to 3D [batch_size, 1, vocab_size]
+                batch_size, vocab_size = output.shape
+                logger.info(f"2D tensor: batch_size={batch_size}, vocab_size={vocab_size}")
+                output = output.unsqueeze(1)  # Add sequence dimension
+                logger.info(f"Reshaped 2D output to 3D tensor: {output.shape}")
+            # Record the output shape and dimensions for debugging
+            logger.info(f"Output shape: {output.shape}, dimensions: {output.dim()}")
             # Calculate loss if labels are provided
             loss = None
             if labels is not None:
+                # Reshape labels to 1D if needed
+                if labels.dim() > 1:
+                    labels = labels.reshape(-1)
+                    logger.info(f"Reshaped labels to {labels.shape}")
+                # Calculate loss with properly shaped tensors
+                batch_size, seq_length, vocab_size = output.shape
+                loss_fct = nn.CrossEntropyLoss()
+                loss = loss_fct(output.reshape(-1, vocab_size), labels)
+                logger.info(f"Returning loss tensor: {loss.item()}")
             # Return the proper format
             if loss is not None:
                 return loss, output
             else:
                 return output
         except Exception as e:
+            logger.error(f"Error in forward pass: {str(e)}")
             logger.error(f"Traceback: {traceback.format_exc()}")
+            # Log input shapes for debugging
+            logger.error(f"Input shapes - src: {src.shape if src is not None else None}, input_ids: {input_ids.shape if input_ids is not None else None}")
+            # Create minimal dummy outputs in correct format
+            dummy_batch = 1
+            if src is not None:
+                dummy_batch = src.shape[0]
+            elif input_ids is not None:
+                dummy_batch = input_ids.shape[0]
+            # CRITICAL: Return a proper 3D tensor even in error case
+            dummy_output = torch.zeros((dummy_batch, 1, self.vocab_size), device=next(self.parameters()).device)
+            dummy_loss = torch.tensor(float('nan'), device=next(self.parameters()).device)
+            return dummy_loss, dummy_output
     # Add sentence transformer methods
     def encode_sentences(self, sentences, batch_size=32, normalize_embeddings=True):
         """Encode sentences into vectors (sentence transformer functionality)"""

model_manager.py CHANGED Viewed

@@ -341,7 +341,34 @@ class ModelManager:
             return model
     def route_input(self, input_text: str) -> dict:
         input_embedding = self.embedding_model.encode(input_text)
         similarities = {}
         for spec in self.specializations:
             model = self.get_model(spec)
@@ -778,4 +805,4 @@ def register_models():
         return True
     except Exception as e:
         logger.error(f"Failed to register models: {e}")
-        return False

             return model
     def route_input(self, input_text: str) -> dict:
+        # Create embedding for input text
         input_embedding = self.embedding_model.encode(input_text)
+        # NEW: Process input through SmartHybridAttention for enhanced understanding
+        if hasattr(self, 'smart_attention') and self.smart_attention:
+            try:
+                # Convert embedding to tensor format needed by attention
+                import torch
+                input_tensor = torch.tensor(input_embedding).unsqueeze(0).unsqueeze(0)  # [1, 1, dim]
+                # Process through attention mechanism to extract key patterns
+                # This helps identify which parts of input are most relevant
+                enhanced = self.smart_attention(
+                    query=input_tensor,
+                    key=input_tensor,
+                    value=input_tensor
+                )
+                # Convert back to numpy for similarity calculations
+                if isinstance(enhanced, torch.Tensor):
+                    enhanced_embedding = enhanced.squeeze().cpu().numpy()
+                    # Use enhanced embedding for similarity calculation
+                    input_embedding = enhanced_embedding
+                    logger.info("Using SmartHybridAttention for enhanced prompt routing")
+            except Exception as e:
+                logger.warning(f"Error using SmartHybridAttention: {e}")
+        # Continue with existing similarity calculation
         similarities = {}
         for spec in self.specializations:
             model = self.get_model(spec)
         return True
     except Exception as e:
         logger.error(f"Failed to register models: {e}")
+        return False