Update modeling.py
modeling.py  CHANGED: +453 -101

@@ -1,130 +1,482 @@
(Removed: the previous modeling.py, old lines 1-130. Only fragments of the deleted code are recoverable from this view:

- the old module docstring ("GSLM Model") and its imports, including import os;
- an __init__ whose removed defaults included num_layers=12, dim_feedforward=4096, pad_idx=0, share_input_output_embed=True, activation="relu", architecture="transformer_lm_big", and **kwargs, documented by an Args block covering num_layers, dim_feedforward, dropout, attention_dropout, max_seq_length, pad_idx, share_input_output_embed, activation, and architecture;
- configuration serialization helpers ending in output[key] = value, output['model_type'] = self.model_type, return output, plus a to_json_string method returning json.dumps(self.to_dict(), indent=2, sort_keys=True);
- a loader that branched on os.path.isdir(pretrained_model_name_or_path) and applied keyword overrides via setattr(config, key, value).

A sketch of the recoverable serialization helpers follows; the full rewritten file appears after it.)
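From these fragments, the removed serialization helpers can be sketched roughly as follows (the body of to_dict is an assumption; only its closing lines and to_json_string survive verbatim):

    def to_dict(self):
        """Convert configuration to a dictionary (reconstruction; the attribute loop is assumed)."""
        output = {}
        for key, value in self.__dict__.items():
            output[key] = value
        output['model_type'] = self.model_type
        return output

    def to_json_string(self):
        """Convert configuration to JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True)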
"""
GSLM Unit Language Model - HuggingFace Compatible Implementation
Based on fairseq's transformer_lm_big architecture
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import os
import json
from typing import Optional, Tuple, Dict, Union, List
from dataclasses import dataclass

# Import config - handle both local and remote imports
try:
    from .config import GSLMConfig
except ImportError:
    # Fallback for when the file is accessed directly
    from config import GSLMConfig

# Import or define the output classes
@dataclass
class BaseModelOutput:
    last_hidden_state: torch.FloatTensor
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None

@dataclass
class CausalLMOutput:
    loss: Optional[torch.FloatTensor] = None
    logits: Union[torch.FloatTensor, List[torch.FloatTensor]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding for transformer models."""

    def __init__(self, d_model: int, max_len: int = 5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Add positional encoding to input tensor."""
        return x + self.pe[:, :x.size(1)]


class MultiheadAttention(nn.Module):
    """Multi-head attention mechanism."""

    def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.0):
        super().__init__()
        assert embed_dim % num_heads == 0
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.scaling = self.head_dim ** -0.5

        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.attn_dropout = nn.Dropout(dropout)

    def forward(
        self,
        query: torch.Tensor,
        key: Optional[torch.Tensor] = None,
        value: Optional[torch.Tensor] = None,
        attn_mask: Optional[torch.Tensor] = None,
        key_padding_mask: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            query: [batch_size, tgt_len, embed_dim]
            key: [batch_size, src_len, embed_dim]
            value: [batch_size, src_len, embed_dim]
            attn_mask: [tgt_len, src_len] or [batch_size * num_heads, tgt_len, src_len]
            key_padding_mask: [batch_size, src_len]
        """
        if key is None:
            key = query
        if value is None:
            value = query

        batch_size, tgt_len, embed_dim = query.size()
        src_len = key.size(1)

        # Project and reshape
        q = self.q_proj(query) * self.scaling
        k = self.k_proj(key)
        v = self.v_proj(value)

        q = q.view(batch_size, tgt_len, self.num_heads, self.head_dim).transpose(1, 2)
        k = k.view(batch_size, src_len, self.num_heads, self.head_dim).transpose(1, 2)
        v = v.view(batch_size, src_len, self.num_heads, self.head_dim).transpose(1, 2)

        # Compute attention scores
        attn_weights = torch.matmul(q, k.transpose(-2, -1))

        # Apply masks
        if attn_mask is not None:
            if attn_mask.dim() == 2:
                attn_mask = attn_mask.unsqueeze(0).unsqueeze(0)
            attn_weights = attn_weights + attn_mask

        if key_padding_mask is not None:
            attn_weights = attn_weights.masked_fill(
                key_padding_mask.unsqueeze(1).unsqueeze(2),
                float('-inf')
            )

        # Softmax
        attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).type_as(attn_weights)
        attn_weights = self.attn_dropout(attn_weights)

        # Apply attention to values
        attn_output = torch.matmul(attn_weights, v)
        attn_output = attn_output.transpose(1, 2).contiguous().view(
            batch_size, tgt_len, embed_dim
        )
        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights
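
# Mask conventions (illustrative example, inferred from the code above): attn_mask is
# additive (float, -inf at disallowed positions) while key_padding_mask is boolean
# with True marking padded positions. A toy call:
#
#   mha = MultiheadAttention(embed_dim=16, num_heads=4)
#   x = torch.randn(2, 5, 16)
#   causal = torch.triu(torch.full((5, 5), float('-inf')), diagonal=1)
#   pad = torch.zeros(2, 5, dtype=torch.bool)          # no padding in this toy batch
#   out, weights = mha(x, attn_mask=causal, key_padding_mask=pad)
#   # out: (2, 5, 16); weights: (2, 4, 5, 5)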


class TransformerDecoderLayer(nn.Module):
    """Transformer decoder layer."""

    def __init__(
        self,
        d_model: int,
        nhead: int,
        dim_feedforward: int = 2048,
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation: str = "relu"
    ):
        super().__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=attention_dropout)

        # Feedforward network
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        # Layer normalization
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # Dropout modules
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        # Activation
        self.activation = F.relu if activation == "relu" else F.gelu

    def forward(
        self,
        x: torch.Tensor,
        self_attn_mask: Optional[torch.Tensor] = None,
        self_attn_padding_mask: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """
        Args:
            x: [batch_size, seq_len, d_model]
            self_attn_mask: [seq_len, seq_len]
            self_attn_padding_mask: [batch_size, seq_len]
        """
        # Self-attention block
        residual = x
        x = self.norm1(x)
        x, _ = self.self_attn(x, x, x, self_attn_mask, self_attn_padding_mask)
        x = self.dropout1(x)
        x = residual + x

        # Feedforward block
        residual = x
        x = self.norm2(x)
        x = self.linear2(self.dropout(self.activation(self.linear1(x))))
        x = self.dropout2(x)
        x = residual + x

        return x


class GSLMForCausalLM(nn.Module):
    """
    GSLM Unit Language Model - Transformer LM Big Architecture
    HuggingFace compatible version with modified forward API
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.d_model = config.d_model
        self.vocab_size = config.vocab_size
        self.pad_idx = getattr(config, 'pad_idx', 0)
        self.max_seq_length = config.max_seq_length

        # Create transformer module container for compatibility
        self.transformer = nn.Module()

        # Token embeddings (wte for compatibility)
        self.transformer.wte = nn.Embedding(config.vocab_size, config.d_model, padding_idx=self.pad_idx)
        self.embed_scale = math.sqrt(config.d_model)

        # Positional encoding
        self.pos_encoder = PositionalEncoding(config.d_model, config.max_seq_length)

        # Transformer decoder layers (h for compatibility)
        self.transformer.h = nn.ModuleList([
            TransformerDecoderLayer(
                config.d_model,
                config.nhead,
                config.dim_feedforward,
                config.dropout,
                config.attention_dropout
            ) for _ in range(config.num_layers)
        ])

        # Final layer norm (ln_f for compatibility)
        self.transformer.ln_f = nn.LayerNorm(config.d_model)

        # Output projection (coch_head for compatibility)
        if config.share_input_output_embed:
            self.coch_head = lambda x: F.linear(x, self.transformer.wte.weight)
        else:
            self.coch_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

        # Dropout
        self.transformer.drop = nn.Dropout(config.dropout)

        # Future heads not supported in GSLM
        self.future_heads = None

        # Initialize parameters
        self.init_weights()

    def init_weights(self):
        """Initialize model parameters."""
        # Initialize embeddings
        nn.init.normal_(self.transformer.wte.weight, mean=0, std=self.d_model ** -0.5)
        nn.init.constant_(self.transformer.wte.weight[self.pad_idx], 0)

        # Initialize output projection if not shared
        if not self.config.share_input_output_embed:
            nn.init.normal_(self.coch_head.weight, mean=0, std=self.d_model ** -0.5)

    def _create_causal_mask(self, seq_len: int, device) -> torch.Tensor:
        """Create causal attention mask."""
        mask = torch.triu(
            torch.full((seq_len, seq_len), float('-inf'), device=device),
            diagonal=1
        )
        return mask
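
    # Illustration (not part of the module logic): for seq_len = 3 the mask is
    #   [[0., -inf, -inf],
    #    [0.,   0., -inf],
    #    [0.,   0.,   0.]]
    # It is added to the attention scores, so each position attends only to itself
    # and to earlier positions.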

    def forward(
        self,
        seq,
        tgt=None,
        output_logits=False,
        output_hidden_states=False,
        return_dict=False,
        up_until_layer=None
    ):
        """
        Forward pass compatible with the specified API.

        Args:
            seq: torch.Tensor of shape (b, t) - input token IDs
            tgt: torch.Tensor of shape (b, t) or None - target token IDs
            output_logits: bool - whether to output logits
            output_hidden_states: bool - whether to output all hidden states
            return_dict: bool - whether to return dictionary output
            up_until_layer: int or None - stop at a specific layer

        Returns:
            Depends on return_dict and the other flags
        """
        batch_size, seq_len = seq.shape
        device = seq.device

        # Create causal mask
        causal_mask = self._create_causal_mask(seq_len, device)

        # Create padding mask
        padding_mask = seq.eq(self.pad_idx)

        # Token embeddings
        tok_emb = self.transformer.wte(seq) * self.embed_scale

        # Add positional encoding (sinusoidal, not learned)
        x = self.pos_encoder(tok_emb)
        x = self.transformer.drop(x)

        all_hidden_states = []

        # Pass through transformer layers
        for block_idx, block in enumerate(self.transformer.h):
            # Save hidden state before block
            if output_hidden_states:
                all_hidden_states.append(x)

            # Check if we should stop early
            if up_until_layer is not None and block_idx == up_until_layer:
                break

            # Forward the block
            x = block(x, causal_mask, padding_mask)

        # Append the last hidden state if we didn't exit early
        if output_hidden_states and (up_until_layer is None or block_idx == len(self.transformer.h) - 1):
            all_hidden_states.append(x)

        # If only hidden states requested
        if output_hidden_states and not output_logits and tgt is None:
            model_output = BaseModelOutput(
                last_hidden_state=x,
                hidden_states=tuple(all_hidden_states) if all_hidden_states else None,
            )
            return model_output

        # Final layer norm
        x = self.transformer.ln_f(x)

        # Compute logits
        logits = self.coch_head(x)

        # Compute loss if targets provided
        if tgt is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = tgt[..., 1:].contiguous()

            loss = F.cross_entropy(
                shift_logits.reshape(-1, self.config.vocab_size),
                shift_labels.reshape(-1),
                ignore_index=self.pad_idx
            )

            if return_dict:
                if output_logits:
                    # For compatibility, wrap single logits in list
                    all_logits = [logits]

                if output_hidden_states:
                    model_output = CausalLMOutput(
                        loss=loss,
                        logits=all_logits if output_logits else logits,
                        hidden_states=tuple(all_hidden_states) if all_hidden_states else None,
                    )
                else:
                    model_output = CausalLMOutput(
                        loss=loss,
                        logits=all_logits if output_logits else logits,
                    )
                return model_output

            return logits, loss

        return logits, None
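
    # Usage sketch (illustrative; `model` and the tensors are stand-ins): with targets
    # the method returns (logits, loss); without targets it returns (logits, None);
    # with only output_hidden_states=True it returns a BaseModelOutput.
    #
    #   seq = torch.randint(0, model.vocab_size, (2, 16))
    #   logits, loss = model(seq, tgt=seq)                     # training-style call
    #   hidden = model(seq, output_hidden_states=True)         # BaseModelOutput
    #   out = model(seq, tgt=seq, return_dict=True)            # CausalLMOutput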

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, config=None, **kwargs):
        """Load model from pretrained weights."""
        import os
        from huggingface_hub import hf_hub_download

        # Load config if not provided
        if config is None:
            if os.path.isdir(pretrained_model_name_or_path):
                config_path = os.path.join(pretrained_model_name_or_path, "config.json")
                config = GSLMConfig.from_pretrained(config_path)
            else:
                # Download config from hub
                config_path = hf_hub_download(
                    repo_id=pretrained_model_name_or_path,
                    filename="config.json"
                )
                config = GSLMConfig.from_pretrained(config_path)

        # Create model
        model = cls(config)

        # Load weights
        if os.path.isdir(pretrained_model_name_or_path):
            weights_file = os.path.join(pretrained_model_name_or_path, "model.safetensors")
        else:
            # Download weights from hub
            weights_file = hf_hub_download(
                repo_id=pretrained_model_name_or_path,
                filename="model.safetensors"
            )

        if weights_file.endswith('.safetensors'):
            from safetensors.torch import load_file
            state_dict = load_file(weights_file)
        else:
            state_dict = torch.load(weights_file, map_location='cpu')

        model.load_state_dict(state_dict)

        return model
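
    # Loading sketch (illustrative; the repo id below is a placeholder, not a real
    # checkpoint name):
    #
    #   model = GSLMForCausalLM.from_pretrained("user/gslm-unit-lm")   # from the Hub
    #   model = GSLMForCausalLM.from_pretrained("./local_checkpoint")  # from a local dir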

    @torch.no_grad()
    def generate(
        self,
        input_ids: torch.Tensor,
        max_length: int = 100,
        temperature: float = 1.0,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
        pad_token_id: Optional[int] = None,
        eos_token_id: Optional[int] = None
    ) -> torch.Tensor:
        """Generate sequences using the language model."""
        if pad_token_id is None:
            pad_token_id = self.pad_idx

        batch_size = input_ids.shape[0]
        device = input_ids.device

        # Keep track of which sequences are done
        unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=device)

        while input_ids.shape[1] < max_length:
            # Forward pass
            logits, _ = self.forward(input_ids)
            next_token_logits = logits[:, -1, :]

            # Apply temperature
            if temperature != 1.0:
                next_token_logits = next_token_logits / temperature

            # Apply top-k sampling
            if top_k is not None:
                indices_to_remove = next_token_logits < torch.topk(next_token_logits, top_k)[0][..., -1, None]
                next_token_logits[indices_to_remove] = -float('inf')

            # Apply top-p (nucleus) sampling
            if top_p is not None:
                sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                # Remove tokens with cumulative probability above the threshold
                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices_to_remove.scatter(
                    dim=-1, index=sorted_indices, src=sorted_indices_to_remove
                )
                next_token_logits[indices_to_remove] = -float('inf')

            # Sample from the distribution
            probs = F.softmax(next_token_logits, dim=-1)
            next_tokens = torch.multinomial(probs, num_samples=1).squeeze(-1)

            # Update unfinished sequences
            if eos_token_id is not None:
                tokens_to_add = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
                unfinished_sequences = unfinished_sequences * (next_tokens != eos_token_id).long()
            else:
                tokens_to_add = next_tokens

            # Concatenate tokens
            input_ids = torch.cat([input_ids, tokens_to_add.unsqueeze(-1)], dim=-1)

            # Stop if all sequences are finished
            if eos_token_id is not None and unfinished_sequences.sum() == 0:
                break

        return input_ids
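

# Minimal end-to-end sketch (illustrative only, with toy values; it assumes GSLMConfig
# accepts these keyword arguments and provides defaults for the remaining fields such
# as dropout, attention_dropout, share_input_output_embed, and pad_idx):
if __name__ == "__main__":
    config = GSLMConfig(vocab_size=104, d_model=1024, nhead=16, num_layers=12,
                        dim_feedforward=4096, max_seq_length=2048)
    model = GSLMForCausalLM(config)
    units = torch.randint(1, config.vocab_size, (1, 8))   # a short prompt of unit IDs
    generated = model.generate(units, max_length=32, temperature=1.0, top_k=20)
    print(generated.shape)  # (1, 32)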