suchirsalhan committed on
Commit
3850015
·
verified ·
1 Parent(s): 56a67a7

Fix pico_decoder.py: __init__ defaults, ZeroDivisionError, _tied_weights_keys

Browse files
Files changed (2)
  1. config.json +8 -8
  2. pico_decoder.py +88 -106
config.json CHANGED
@@ -1,4 +1,12 @@
1
  {
 
 
 
 
 
 
 
 
2
  "n_layers": 14,
3
  "d_model": 768,
4
  "vocab_size": 32768,
@@ -10,14 +18,6 @@
10
  "activation_hidden_dim": 3072,
11
  "norm_eps": 1e-05,
12
  "dropout": 0.1,
13
- "architectures": [
14
- "PicoDecoderHF"
15
- ],
16
- "model_type": "pico_decoder",
17
- "auto_map": {
18
- "AutoConfig": "pico_decoder.PicoDecoderHFConfig",
19
- "AutoModelForCausalLM": "pico_decoder.PicoDecoderHF"
20
- },
21
  "torch_dtype": "float32",
22
  "transformers_version": "4.48.3"
23
  }
 
1
  {
2
+ "architectures": [
3
+ "PicoDecoderHF"
4
+ ],
5
+ "model_type": "pico_decoder",
6
+ "auto_map": {
7
+ "AutoConfig": "pico_decoder.PicoDecoderHFConfig",
8
+ "AutoModelForCausalLM": "pico_decoder.PicoDecoderHF"
9
+ },
10
  "n_layers": 14,
11
  "d_model": 768,
12
  "vocab_size": 32768,
 
18
  "activation_hidden_dim": 3072,
19
  "norm_eps": 1e-05,
20
  "dropout": 0.1,
 
 
 
 
 
 
 
 
21
  "torch_dtype": "float32",
22
  "transformers_version": "4.48.3"
23
  }
pico_decoder.py CHANGED
@@ -1,27 +1,3 @@
1
- """
2
- Pico Decoder: A Lightweight Causal Transformer Language Model
3
- Implementation from https://github.com/pico-lm/pico-train/blob/main/src/model/pico_decoder.py
4
- Pico Decoder uses a simple LLAMA-style transformer architecture, written for clarity and educational purposes.
5
-
6
- Everything is written with a modular design for easy modification and experimentation.
7
-
8
- Key features:
9
- - RMSNorm for layer normalization
10
- - Rotary Positional Embeddings (RoPE)
11
- - Multi-head attention with KV-cache support
12
- - SwiGLU activation function
13
- - Residual connections throughout
14
- - KV-cache for faster autoregressive generation
15
-
16
- References:
17
- - RoPE: https://arxiv.org/abs/2104.09864
18
- - SwiGLU: https://arxiv.org/abs/2002.05202
19
- - LLAMA: https://arxiv.org/abs/2302.13971
20
-
21
- Adapted from:
22
- - OLMO: https://github.com/allenai/OLMo
23
- - LLAMA: https://github.com/meta/llama
24
- """
25
 
26
  from dataclasses import asdict
27
  from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
@@ -50,8 +26,7 @@ class RMSNorm(torch.nn.Module):
50
  return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
51
 
52
  def forward(self, x):
53
- output = self._norm(x.float()).type_as(x)
54
- return output * self.weight
55
 
56
 
57
  class RoPE(nn.Module):
@@ -61,69 +36,64 @@ class RoPE(nn.Module):
61
  super().__init__()
62
  self.theta = config.position_emb_theta
63
  self.dim = config.d_model // config.attention_n_heads
64
- max_seq_len = config.max_seq_len
65
  if RoPE._freqs_cis_tensor is None:
66
- RoPE._freqs_cis_tensor = self._setup_freqs_cis(max_seq_len, self.theta, self.dim)
67
- self.register_buffer("_freqs_cis", self._freqs_cis_tensor, persistent=False)
 
 
68
 
69
  @classmethod
70
  def _setup_freqs_cis(cls, seq_len, theta, dim):
71
  _freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
72
- positions = torch.arange(seq_len)
73
- freqs = torch.outer(positions, _freqs)
74
  return torch.polar(torch.ones_like(freqs), freqs)
75
 
76
  def get_freqs_cis(self, input_shape, start_pos, end_pos):
77
- _freqs_cis = self._freqs_cis[start_pos:end_pos]
78
  ndim = len(input_shape)
79
  assert 0 <= 1 < ndim
80
- assert _freqs_cis.shape == (input_shape[1], input_shape[-1])
81
  shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(input_shape)]
82
- return _freqs_cis.view(*shape)
83
 
84
  def forward(self, queries, keys, start_pos=0):
85
- queries_ = torch.view_as_complex(queries.float().reshape(*queries.shape[:-1], -1, 2))
86
- keys_ = torch.view_as_complex(keys.float().reshape(*keys.shape[:-1], -1, 2))
87
- input_shape = queries_.shape
88
- freqs_start_pos = start_pos
89
- freqs_end_pos = freqs_start_pos + queries_.shape[1]
90
- freqs_cis = self.get_freqs_cis(input_shape, freqs_start_pos, freqs_end_pos)
91
- queries_rotated = torch.view_as_real(queries_ * freqs_cis).flatten(3)
92
- keys_rotated = torch.view_as_real(keys_ * freqs_cis).flatten(3)
93
- return queries_rotated.type_as(queries), keys_rotated.type_as(keys)
94
 
95
 
96
  class Attention(nn.Module):
97
  def __init__(self, config):
98
  super().__init__()
99
- self.n_heads = config.attention_n_heads
100
- self.n_kv_heads = config.attention_n_kv_heads
101
  self.batch_size = config.batch_size
102
  self.max_seq_len = config.max_seq_len
103
- d_model = config.d_model
104
- self.head_dim = d_model // self.n_heads
105
- self.n_rep = self.n_heads // self.n_kv_heads
106
- self.q_proj = nn.Linear(d_model, self.n_heads * self.head_dim, bias=False)
107
- self.k_proj = nn.Linear(d_model, self.n_kv_heads * self.head_dim, bias=False)
108
- self.v_proj = nn.Linear(d_model, self.n_kv_heads * self.head_dim, bias=False)
109
- self.o_proj = nn.Linear(self.n_heads * self.head_dim, d_model, bias=False)
110
- self.rope = RoPE(config)
111
 
112
  def forward(self, input, mask=None, past_key_values=None, use_cache=False):
113
  bsz, seq_len, _ = input.shape
114
- _queries, _keys, _values = self.q_proj(input), self.k_proj(input), self.v_proj(input)
115
- queries = _queries.view(bsz, seq_len, self.n_heads, self.head_dim)
116
- keys = _keys.view( bsz, seq_len, self.n_kv_heads, self.head_dim)
117
- values = _values.view( bsz, seq_len, self.n_kv_heads, self.head_dim)
118
  start_pos = past_key_values[0].shape[1] if past_key_values is not None else 0
119
  queries, keys = self.rope(queries, keys, start_pos)
120
  if past_key_values is not None:
121
  keys = torch.cat([past_key_values[0], keys], dim=1)
122
  values = torch.cat([past_key_values[1], values], dim=1)
123
- if use_cache:
124
- cached_keys, cached_values = keys, values
125
- else:
126
- cached_keys = cached_values = None
127
  queries = queries.transpose(1, 2)
128
  keys = keys.transpose(1, 2)
129
  values = values.transpose(1, 2)
@@ -132,15 +102,14 @@ class Attention(nn.Module):
132
  keys = keys.repeat_interleave(self.n_rep, dim=-3)
133
  values = values.repeat_interleave(self.n_rep, dim=-3)
134
  apply_gqa = False
135
- backends = [SDPBackend.CUDNN_ATTENTION, SDPBackend.MATH]
136
- with sdpa_kernel(backends=backends):
137
- attn_output = F.scaled_dot_product_attention(
138
  queries.contiguous(), keys.contiguous(), values.contiguous(),
139
- attn_mask=mask.to(queries.dtype),
140
  enable_gqa=apply_gqa,
141
  )
142
- attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, seq_len, -1)
143
- return self.o_proj(attn_output), (cached_keys, cached_values)
144
 
145
 
146
  class SwiGLU(nn.Module):
@@ -163,25 +132,24 @@ class PicoDecoderBlock(nn.Module):
163
  self.swiglu_norm = RMSNorm(config)
164
 
165
  def forward(self, input, mask=None, past_key_values=None, use_cache=False):
166
- attention_output, cached_key_values = self.attention(
167
  self.attention_norm(input), mask=mask,
168
  past_key_values=past_key_values, use_cache=use_cache,
169
  )
170
- h = input + attention_output
171
- out = h + self.swiglu(self.swiglu_norm(h))
172
- return out, cached_key_values
173
 
174
 
175
  class PicoDecoder(nn.Module):
176
  def __init__(self, model_config):
177
  super().__init__()
178
  self.config = model_config
179
- self.embedding_proj = nn.Embedding(self.config.vocab_size, self.config.d_model)
180
  self.layers = nn.ModuleList(
181
- [PicoDecoderBlock(self.config) for _ in range(self.config.n_layers)]
182
  )
183
- self.output_norm = RMSNorm(self.config)
184
- self.de_embedding_proj = nn.Linear(self.config.d_model, self.config.vocab_size, bias=False)
185
 
186
  def convert_to_hf_model(self):
187
  hf_config = PicoDecoderHFConfig.from_dataclass(self.config)
@@ -195,8 +163,7 @@ class PicoDecoder(nn.Module):
195
  start_pos = 0 if past_key_values is None else past_key_values[0][0].shape[1]
196
  mask = None
197
  if seq_len > 1:
198
- mask = torch.full((seq_len, seq_len), float("-inf"))
199
- mask = torch.triu(mask, diagonal=1)
200
  if past_key_values is not None:
201
  mask = torch.hstack([torch.zeros((seq_len, start_pos)), mask])
202
  mask = mask.to(h.device)
@@ -206,19 +173,15 @@ class PicoDecoder(nn.Module):
206
  h, layer_cached = layer(h, mask=mask, past_key_values=layer_past, use_cache=use_cache)
207
  if use_cache:
208
  cached_key_values += (layer_cached,)
209
- h = self.output_norm(h)
210
- logits = self.de_embedding_proj(h).float()
211
- return logits, cached_key_values
212
 
213
 
214
  class PicoDecoderHFConfig(PretrainedConfig):
215
- """Config class for the Pico Decoder HuggingFace wrapper."""
216
-
 
217
  model_type = "pico_decoder"
218
 
219
- # Defaults match generate_configs.py MODEL_BASE exactly.
220
- # The guard on attention_n_kv_heads fixes ZeroDivisionError when the field
221
- # is missing or null in config.json from older checkpoints.
222
  def __init__(
223
  self,
224
  n_layers: int = 14,
@@ -234,20 +197,21 @@ class PicoDecoderHFConfig(PretrainedConfig):
234
  dropout: float = 0.1,
235
  **kwargs,
236
  ):
 
237
  if not attention_n_kv_heads:
238
  attention_n_kv_heads = attention_n_heads
239
  super().__init__(**kwargs)
240
- self.n_layers = n_layers
241
- self.d_model = d_model
242
- self.vocab_size = vocab_size
243
- self.attention_n_heads = attention_n_heads
244
- self.attention_n_kv_heads = attention_n_kv_heads
245
- self.max_seq_len = max_seq_len
246
- self.batch_size = batch_size
247
- self.position_emb_theta = position_emb_theta
248
- self.activation_hidden_dim = activation_hidden_dim
249
- self.norm_eps = norm_eps
250
- self.dropout = dropout
251
 
252
  @classmethod
253
  def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "PicoDecoderHFConfig":
@@ -264,22 +228,40 @@ class PicoDecoderHFConfig(PretrainedConfig):
264
 
265
 
266
  class PicoDecoderHF(PreTrainedModel):
 
 
267
  """
268
- HuggingFace wrapper for BeetleLM PicoDecoder.
269
- Usage: AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True)
270
- """
271
- config_class = PicoDecoderHFConfig
272
- _no_split_modules = ["PicoBlock", "Attention", "SwiGLU", "RMSNorm"]
273
- _tied_weights_keys = []
274
  def __init__(self, config: PicoDecoderHFConfig):
275
  super().__init__(config)
276
  self.pico_decoder = PicoDecoder(config)
277
 
278
- def forward(self, input_ids, past_key_values=None, use_cache=False, **kwargs):
279
- logits, past_key_values = self.pico_decoder(input_ids, past_key_values, use_cache)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
  if use_cache:
281
- return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values)
282
- return CausalLMOutput(logits=logits)
 
 
 
283
 
284
 
285
  PicoDecoderHFConfig.register_for_auto_class()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
  from dataclasses import asdict
3
  from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
 
26
  return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
27
 
28
  def forward(self, x):
29
+ return self._norm(x.float()).type_as(x) * self.weight
 
30
 
31
 
32
  class RoPE(nn.Module):
 
36
  super().__init__()
37
  self.theta = config.position_emb_theta
38
  self.dim = config.d_model // config.attention_n_heads
 
39
  if RoPE._freqs_cis_tensor is None:
40
+ RoPE._freqs_cis_tensor = self._setup_freqs_cis(
41
+ config.max_seq_len, self.theta, self.dim
42
+ )
43
+ self.register_buffer("_freqs_cis", RoPE._freqs_cis_tensor, persistent=False)
44
 
45
  @classmethod
46
  def _setup_freqs_cis(cls, seq_len, theta, dim):
47
  _freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
48
+ freqs = torch.outer(torch.arange(seq_len), _freqs)
 
49
  return torch.polar(torch.ones_like(freqs), freqs)
50
 
51
  def get_freqs_cis(self, input_shape, start_pos, end_pos):
52
+ _f = self._freqs_cis[start_pos:end_pos]
53
  ndim = len(input_shape)
54
  assert 0 <= 1 < ndim
55
+ assert _f.shape == (input_shape[1], input_shape[-1])
56
  shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(input_shape)]
57
+ return _f.view(*shape)
58
 
59
  def forward(self, queries, keys, start_pos=0):
60
+ q_ = torch.view_as_complex(queries.float().reshape(*queries.shape[:-1], -1, 2))
61
+ k_ = torch.view_as_complex(keys.float().reshape(*keys.shape[:-1], -1, 2))
62
+ fc = self.get_freqs_cis(q_.shape, start_pos, start_pos + q_.shape[1])
63
+ return (
64
+ torch.view_as_real(q_ * fc).flatten(3).type_as(queries),
65
+ torch.view_as_real(k_ * fc).flatten(3).type_as(keys),
66
+ )
 
 
67
 
68
 
69
  class Attention(nn.Module):
70
  def __init__(self, config):
71
  super().__init__()
72
+ self.n_heads = config.attention_n_heads
73
+ self.n_kv_heads = config.attention_n_kv_heads
74
  self.batch_size = config.batch_size
75
  self.max_seq_len = config.max_seq_len
76
+ d = config.d_model
77
+ self.head_dim = d // self.n_heads
78
+ self.n_rep = self.n_heads // self.n_kv_heads
79
+ self.q_proj = nn.Linear(d, self.n_heads * self.head_dim, bias=False)
80
+ self.k_proj = nn.Linear(d, self.n_kv_heads * self.head_dim, bias=False)
81
+ self.v_proj = nn.Linear(d, self.n_kv_heads * self.head_dim, bias=False)
82
+ self.o_proj = nn.Linear(self.n_heads * self.head_dim, d, bias=False)
83
+ self.rope = RoPE(config)
84
 
85
  def forward(self, input, mask=None, past_key_values=None, use_cache=False):
86
  bsz, seq_len, _ = input.shape
87
+ queries = self.q_proj(input).view(bsz, seq_len, self.n_heads, self.head_dim)
88
+ keys = self.k_proj(input).view(bsz, seq_len, self.n_kv_heads, self.head_dim)
89
+ values = self.v_proj(input).view(bsz, seq_len, self.n_kv_heads, self.head_dim)
 
90
  start_pos = past_key_values[0].shape[1] if past_key_values is not None else 0
91
  queries, keys = self.rope(queries, keys, start_pos)
92
  if past_key_values is not None:
93
  keys = torch.cat([past_key_values[0], keys], dim=1)
94
  values = torch.cat([past_key_values[1], values], dim=1)
95
+ cached_keys = keys if use_cache else None
96
+ cached_values = values if use_cache else None
 
 
97
  queries = queries.transpose(1, 2)
98
  keys = keys.transpose(1, 2)
99
  values = values.transpose(1, 2)
 
102
  keys = keys.repeat_interleave(self.n_rep, dim=-3)
103
  values = values.repeat_interleave(self.n_rep, dim=-3)
104
  apply_gqa = False
105
+ with sdpa_kernel(backends=[SDPBackend.CUDNN_ATTENTION, SDPBackend.MATH]):
106
+ out = F.scaled_dot_product_attention(
 
107
  queries.contiguous(), keys.contiguous(), values.contiguous(),
108
+ attn_mask=mask.to(queries.dtype) if mask is not None else None,
109
  enable_gqa=apply_gqa,
110
  )
111
+ out = out.transpose(1, 2).contiguous().view(bsz, seq_len, -1)
112
+ return self.o_proj(out), (cached_keys, cached_values)
113
 
114
 
115
  class SwiGLU(nn.Module):
 
132
  self.swiglu_norm = RMSNorm(config)
133
 
134
  def forward(self, input, mask=None, past_key_values=None, use_cache=False):
135
+ attn_out, cached = self.attention(
136
  self.attention_norm(input), mask=mask,
137
  past_key_values=past_key_values, use_cache=use_cache,
138
  )
139
+ h = input + attn_out
140
+ return h + self.swiglu(self.swiglu_norm(h)), cached
 
141
 
142
 
143
  class PicoDecoder(nn.Module):
144
  def __init__(self, model_config):
145
  super().__init__()
146
  self.config = model_config
147
+ self.embedding_proj = nn.Embedding(model_config.vocab_size, model_config.d_model)
148
  self.layers = nn.ModuleList(
149
+ [PicoDecoderBlock(model_config) for _ in range(model_config.n_layers)]
150
  )
151
+ self.output_norm = RMSNorm(model_config)
152
+ self.de_embedding_proj = nn.Linear(model_config.d_model, model_config.vocab_size, bias=False)
153
 
154
  def convert_to_hf_model(self):
155
  hf_config = PicoDecoderHFConfig.from_dataclass(self.config)
 
163
  start_pos = 0 if past_key_values is None else past_key_values[0][0].shape[1]
164
  mask = None
165
  if seq_len > 1:
166
+ mask = torch.triu(torch.full((seq_len, seq_len), float("-inf")), diagonal=1)
 
167
  if past_key_values is not None:
168
  mask = torch.hstack([torch.zeros((seq_len, start_pos)), mask])
169
  mask = mask.to(h.device)
 
173
  h, layer_cached = layer(h, mask=mask, past_key_values=layer_past, use_cache=use_cache)
174
  if use_cache:
175
  cached_key_values += (layer_cached,)
176
+ return self.de_embedding_proj(self.output_norm(h)).float(), cached_key_values
 
 
177
 
178
 
179
  class PicoDecoderHFConfig(PretrainedConfig):
180
+ """HuggingFace config for BeetleLM PicoDecoder.
181
+ Defaults match generate_configs.py MODEL_BASE exactly.
182
+ """
183
  model_type = "pico_decoder"
184
 
 
 
 
185
  def __init__(
186
  self,
187
  n_layers: int = 14,
 
197
  dropout: float = 0.1,
198
  **kwargs,
199
  ):
200
+ # Fix: guard against None/0/missing attention_n_kv_heads in old config.json
201
  if not attention_n_kv_heads:
202
  attention_n_kv_heads = attention_n_heads
203
  super().__init__(**kwargs)
204
+ self.n_layers = n_layers
205
+ self.d_model = d_model
206
+ self.vocab_size = vocab_size
207
+ self.attention_n_heads = attention_n_heads
208
+ self.attention_n_kv_heads = attention_n_kv_heads
209
+ self.max_seq_len = max_seq_len
210
+ self.batch_size = batch_size
211
+ self.position_emb_theta = position_emb_theta
212
+ self.activation_hidden_dim = activation_hidden_dim
213
+ self.norm_eps = norm_eps
214
+ self.dropout = dropout
215
 
216
  @classmethod
217
  def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "PicoDecoderHFConfig":
 
228
 
229
 
230
  class PicoDecoderHF(PreTrainedModel):
231
+ """HuggingFace wrapper for BeetleLM PicoDecoder.
232
+ Load with: AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True)
233
  """
234
+ config_class = PicoDecoderHFConfig
235
+ _no_split_modules = ["PicoDecoderBlock", "Attention", "SwiGLU", "RMSNorm"]
236
+ _tied_weights_keys = [] # Fix: required by transformers >= 4.38
237
+
 
 
238
  def __init__(self, config: PicoDecoderHFConfig):
239
  super().__init__(config)
240
  self.pico_decoder = PicoDecoder(config)
241
 
242
+ def get_input_embeddings(self):
243
+ return self.pico_decoder.embedding_proj
244
+
245
+ def set_input_embeddings(self, value):
246
+ self.pico_decoder.embedding_proj = value
247
+
248
+ def forward(self, input_ids=None, past_key_values=None, use_cache=False,
249
+ labels=None, **kwargs):
250
+ logits, new_past = self.pico_decoder(input_ids, past_key_values, use_cache)
251
+ loss = None
252
+ if labels is not None:
253
+ shift_logits = logits[:, :-1].contiguous()
254
+ shift_labels = labels[:, 1:].contiguous().clamp(0, self.config.vocab_size - 1)
255
+ loss = F.cross_entropy(
256
+ shift_logits.view(-1, self.config.vocab_size),
257
+ shift_labels.view(-1),
258
+ )
259
  if use_cache:
260
+ return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=new_past)
261
+ return CausalLMOutput(loss=loss, logits=logits)
262
+
263
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
264
+ return {"input_ids": input_ids, "past_key_values": past_key_values, "use_cache": True}
265
 
266
 
267
  PicoDecoderHFConfig.register_for_auto_class()