Spaces:

Nihal2000
/

autoSLM

Sleeping

File size: 10,625 Bytes

6145f50

# src/model_architecture.py
import math
from dataclasses import dataclass
from typing import Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F


@dataclass
class AutomotiveSLMConfig:
    model_name: str = "Automotive-SLM-Edge-3M"
    d_model: int = 256
    n_layer: int = 4
    n_head: int = 4
    vocab_size: int = 50257
    n_positions: int = 256
    use_moe: bool = True
    n_experts: int = 4
    expert_capacity: int = 2
    moe_intermediate_size: int = 384
    router_aux_loss_coef: float = 0.01
    rotary_dim: int = 64
    rope_base: float = 10000.0
    dropout: float = 0.05
    layer_norm_epsilon: float = 1e-5
    # generation defaults (UI can override)
    max_gen_length: int = 50
    temperature: float = 0.8
    top_p: float = 0.9
    top_k: int = 50
    repetition_penalty: float = 1.1


class RotaryEmbedding(nn.Module):
    def __init__(self, dim: int, base: float = 10000.0):
        super().__init__()
        self.dim = dim
        self.base = base

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: [B, T, H, Dh] or [B, T, D]; we’ll only apply to last dim if even
        # This is a simple RoPE helper that returns sinusoid cache given T, not applied directly.
        # For simplicity, we’ll generate cos/sin in the attention module per step.
        return x


def apply_rotary(q, k, cos, sin):
    # q,k: [B, T, H, Dh]; cos/sin: [T, 1, 1, Dh]
    q1, q2 = q[..., ::2], q[..., 1::2]
    k1, k2 = k[..., ::2], k[..., 1::2]
    q_rot = torch.stack([q1 * cos - q2 * sin, q2 * cos + q1 * sin], dim=-1).flatten(-2)
    k_rot = torch.stack([k1 * cos - k2 * sin, k2 * cos + k1 * sin], dim=-1).flatten(-2)
    return q_rot, k_rot


def build_rope_cache(T: int, dim: int, base: float, device, dtype):
    # returns cos, sin: [T, 1, 1, dim]
    position = torch.arange(T, device=device, dtype=dtype).unsqueeze(1)
    idx = torch.arange(dim // 2, device=device, dtype=dtype)
    inv_freq = 1.0 / (base ** (idx / (dim // 2)))
    freqs = position * inv_freq.unsqueeze(0)  # [T, dim/2]
    cos = torch.cos(freqs).unsqueeze(1).unsqueeze(1).repeat(1, 1, 1, 1)
    sin = torch.sin(freqs).unsqueeze(1).unsqueeze(1).repeat(1, 1, 1, 1)
    # Interleave to shape [T, 1, 1, dim] as (cos, sin) split usage
    cos = torch.stack([cos, cos], dim=-1).reshape(T, 1, 1, dim)
    sin = torch.stack([sin, sin], dim=-1).reshape(T, 1, 1, dim)
    return cos, sin


class MultiHeadSelfAttention(nn.Module):
    def __init__(self, config: AutomotiveSLMConfig):
        super().__init__()
        assert config.d_model % config.n_head == 0
        self.d_model = config.d_model
        self.n_head = config.n_head
        self.head_dim = config.d_model // config.n_head

        self.qkv = nn.Linear(config.d_model, 3 * config.d_model, bias=True)
        self.o_proj = nn.Linear(config.d_model, config.d_model, bias=True)
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)

        self.rotary_dim = min(config.rotary_dim, self.head_dim)
        self.rope_base = config.rope_base

        self.register_buffer("mask", None, persistent=False)

    def _causal_mask(self, T: int, device):
        if self.mask is not None and self.mask.size(0) >= T:
            return self.mask[:T, :T]
        mask = torch.full((T, T), float("-inf"), device=device)
        mask = torch.triu(mask, diagonal=1)
        self.mask = mask
        return mask

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, T, C = x.size()
        qkv = self.qkv(x)  # [B,T,3C]
        q, k, v = qkv.split(C, dim=-1)

        # [B,T,H,Dh]
        q = q.view(B, T, self.n_head, self.head_dim)
        k = k.view(B, T, self.n_head, self.head_dim)
        v = v.view(B, T, self.n_head, self.head_dim)

        # Apply RoPE to first rotary_dim of q,k
        if self.rotary_dim > 0:
            cos, sin = build_rope_cache(T, self.rotary_dim, self.rope_base, x.device, x.dtype)
            q_rot, k_rot = apply_rotary(q[..., :self.rotary_dim], k[..., :self.rotary_dim], cos, sin)
            q = torch.cat([q_rot, q[..., self.rotary_dim:]], dim=-1)
            k = torch.cat([k_rot, k[..., self.rotary_dim:]], dim=-1)

        # attention scores
        att = torch.einsum("bthd,bshd->bhts", q, k) / math.sqrt(self.head_dim)
        att = att + self._causal_mask(T, x.device)
        att = F.softmax(att, dim=-1)
        att = self.attn_dropout(att)

        y = torch.einsum("bhts,bshd->bthd", att, v).contiguous()
        y = y.view(B, T, C)
        y = self.o_proj(y)
        y = self.resid_dropout(y)
        return y


class FeedForward(nn.Module):
    def __init__(self, config: AutomotiveSLMConfig):
        super().__init__()
        hidden = 4 * config.d_model
        self.net = nn.Sequential(
            nn.Linear(config.d_model, hidden),
            nn.GELU(),
            nn.Dropout(config.dropout),
            nn.Linear(hidden, config.d_model),
            nn.Dropout(config.dropout),
        )

    def forward(self, x):
        return self.net(x)


class MoEExpert(nn.Module):
    def __init__(self, d_model: int, hidden_size: int, dropout: float):
        super().__init__()
        self.w1 = nn.Linear(d_model, hidden_size, bias=False)
        self.w2 = nn.Linear(hidden_size, d_model, bias=False)
        self.act = nn.GELU()
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        return self.drop(self.w2(self.act(self.w1(x))))


class Top1Router(nn.Module):
    def __init__(self, d_model: int, n_experts: int):
        super().__init__()
        self.w = nn.Linear(d_model, n_experts, bias=False)

    def forward(self, x):
        # x: [B,T,D]
        logits = self.w(x)  # [B,T,E]
        probs = F.softmax(logits, dim=-1)
        top1 = torch.argmax(probs, dim=-1)  # [B,T]
        return top1, probs


class MoE(nn.Module):
    def __init__(self, config: AutomotiveSLMConfig):
        super().__init__()
        self.n_experts = config.n_experts
        self.router = Top1Router(config.d_model, config.n_experts)
        self.experts = nn.ModuleList(
            [MoEExpert(config.d_model, config.moe_intermediate_size, config.dropout) for _ in range(config.n_experts)]
        )

    def forward(self, x):
        B, T, D = x.shape
        assign, probs = self.router(x)  # [B,T], [B,T,E]
        y = torch.zeros_like(x)

        for e_idx in range(self.n_experts):
            mask = (assign == e_idx).unsqueeze(-1)  # [B,T,1]
            if mask.any():
                expert_in = x[mask.expand_as(x)].view(-1, D)
                expert_out = self.experts[e_idx](expert_in)
                y[mask.expand_as(y)] = expert_out.view(-1, D)
        return y


class TransformerBlock(nn.Module):
    def __init__(self, config: AutomotiveSLMConfig):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.attn = MultiHeadSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.use_moe = config.use_moe
        if self.use_moe:
            self.ff = MoE(config)
        else:
            self.ff = FeedForward(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.ff(self.ln_2(x))
        return x


class AutomotiveSLM(nn.Module):
    def __init__(self, config: AutomotiveSLMConfig):
        super().__init__()
        self.config = config
        self.embed = nn.Embedding(config.vocab_size, config.d_model)
        self.pos_embed = nn.Embedding(config.n_positions, config.d_model)
        self.drop = nn.Dropout(config.dropout)
        self.blocks = nn.ModuleList([TransformerBlock(config) for _ in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

        self.apply(self._init_weights)

    def _init_weights(self, m: nn.Module):
        if isinstance(m, nn.Linear):
            nn.init.normal_(m.weight, mean=0.0, std=0.02)
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.Embedding):
            nn.init.normal_(m.weight, mean=0.0, std=0.02)

    def forward(self, input_ids: torch.Tensor):
        B, T = input_ids.shape
        pos = torch.arange(0, T, device=input_ids.device).unsqueeze(0)  # [1,T]
        x = self.embed(input_ids) + self.pos_embed(pos)
        x = self.drop(x)
        for blk in self.blocks:
            x = blk(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        return {"logits": logits}

    @torch.no_grad()
    def generate(
        self,
        input_ids: torch.Tensor,
        max_new_tokens: int = 50,
        temperature: float = 0.8,
        top_p: float = 0.9,
        top_k: int = 50,
        eos_token_id: Optional[int] = None,
        pad_token_id: Optional[int] = None,
        do_sample: bool = True,
    ):
        self.eval()
        device = next(self.parameters()).device
        seq = input_ids.to(device)
        for _ in range(max_new_tokens):
            out = self.forward(seq)
            logits = out["logits"][:, -1, :]  # [B, V]
            logits = logits / max(temperature, 1e-6)

            if top_k is not None and top_k > 0:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits = torch.where(
                    logits < v[:, [-1]], torch.full_like(logits, -float("inf")), logits
                )

            probs = F.softmax(logits, dim=-1)

            if top_p is not None and 0 < top_p < 1.0:
                sorted_probs, sorted_idx = torch.sort(probs, descending=True, dim=-1)
                cumsum = torch.cumsum(sorted_probs, dim=-1)
                mask = cumsum <= top_p
                # ensure at least one token
                mask[..., 0] = True
                filtered = torch.where(mask, sorted_probs, torch.zeros_like(sorted_probs))
                filtered = filtered / filtered.sum(dim=-1, keepdim=True)
                next_id_sorted = torch.multinomial(filtered, num_samples=1)
                next_id = torch.gather(sorted_idx, -1, next_id_sorted)
            else:
                if do_sample:
                    next_id = torch.multinomial(probs, num_samples=1)
                else:
                    next_id = torch.argmax(probs, dim=-1, keepdim=True)

            if eos_token_id is not None and (next_id == eos_token_id).all():
                seq = torch.cat([seq, next_id], dim=1)
                break

            seq = torch.cat([seq, next_id], dim=1)

        return seq