# coding=utf-8
# modeling_sam2.py
import math
from typing import Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.modeling_utils import PreTrainedModel
from transformers.configuration_utils import PretrainedConfig
# -----------------------------
# Config
# -----------------------------
class Sam2Config(PretrainedConfig):
    model_type = "sam2"

    def __init__(
        self,
        vocab_size=50257,
        d_model=384,
        n_layers=6,
        n_heads=6,
        ff_mult=4.0,
        dropout=0.1,
        pad_token_id=50256,  # default GPT-2 eos
        bos_token_id=50256,
        eos_token_id=50256,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.ff_mult = ff_mult
        self.dropout = dropout
        # Token ids and remaining common attributes are handled by PretrainedConfig.
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
# -----------------------------
# Building blocks
# -----------------------------
class RMSNorm(nn.Module):
    def __init__(self, d, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(d))

    def forward(self, x):
        # Scale by the inverse RMS over the last dimension (no mean subtraction, no bias).
        norm = x.pow(2).mean(-1, keepdim=True).add(self.eps).rsqrt()
        return self.weight * x * norm
class MHA(nn.Module):
    def __init__(self, d_model, n_heads, dropout=0.0):
        super().__init__()
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        self.q_proj = nn.Linear(d_model, d_model, bias=False)
        self.k_proj = nn.Linear(d_model, d_model, bias=False)
        self.v_proj = nn.Linear(d_model, d_model, bias=False)
        self.out_proj = nn.Linear(d_model, d_model, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attn_mask=None):
        B, T, C = x.shape
        # (B, T, C) -> (B, n_heads, T, head_dim)
        q = self.q_proj(x).view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(x).view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(x).view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        # Causal mask: position i may only attend to positions <= i.
        causal = torch.triu(torch.ones(T, T, device=x.device, dtype=torch.bool), diagonal=1)
        scores = scores.masked_fill(causal, float("-inf"))
        if attn_mask is not None:
            # attn_mask: (B, T) with 1 = keep, 0 = padding. Fill with the dtype minimum
            # rather than -inf so fully masked rows do not softmax to NaN.
            key_mask = attn_mask.unsqueeze(1).unsqueeze(2)
            scores = scores.masked_fill(~key_mask.bool(), torch.finfo(scores.dtype).min)
        attn = F.softmax(scores, dim=-1)
        out = torch.matmul(self.dropout(attn), v).transpose(1, 2).contiguous().view(B, T, C)
        return self.out_proj(out)
class SwiGLU(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.0):
        super().__init__()
        self.w1 = nn.Linear(d_model, d_ff, bias=False)
        self.w2 = nn.Linear(d_model, d_ff, bias=False)
        self.w3 = nn.Linear(d_ff, d_model, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # SiLU-gated feed-forward, projected back to d_model.
        return self.w3(self.dropout(F.silu(self.w1(x)) * self.w2(x)))
class Block(nn.Module):
    def __init__(self, d_model, n_heads, ff_mult, dropout=0.0):
        super().__init__()
        self.norm1 = RMSNorm(d_model)
        self.attn = MHA(d_model, n_heads, dropout=dropout)
        self.norm2 = RMSNorm(d_model)
        self.ff = SwiGLU(d_model, int(ff_mult * d_model), dropout=dropout)
        self.drop = nn.Dropout(dropout)

    def forward(self, x, attn_mask=None):
        # Pre-norm residual layout: x + Attn(Norm(x)), then x + FF(Norm(x)).
        x = x + self.drop(self.attn(self.norm1(x), attn_mask=attn_mask))
        x = x + self.drop(self.ff(self.norm2(x)))
        return x
# -----------------------------
# Main model
# -----------------------------
class Sam2PreTrainedModel(PreTrainedModel):
    config_class = Sam2Config
    base_model_prefix = "sam2"
    supports_gradient_checkpointing = False

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
class Sam2Model(Sam2PreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: Sam2Config):
        super().__init__(config)
        self.embed = nn.Embedding(config.vocab_size, config.d_model)
        self.blocks = nn.ModuleList([
            Block(config.d_model, config.n_heads, config.ff_mult, dropout=config.dropout)
            for _ in range(config.n_layers)
        ])
        self.norm = RMSNorm(config.d_model)
        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
        # Tie the output projection to the input embedding matrix.
        self.lm_head.weight = self.embed.weight
        self.dropout = nn.Dropout(config.dropout)
        self.post_init()

    def get_input_embeddings(self):
        return self.embed

    def set_input_embeddings(self, value):
        self.embed = value

    def get_output_embeddings(self):
        return self.lm_head
    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        x = self.dropout(self.embed(input_ids))
        for blk in self.blocks:
            x = blk(x, attn_mask=attention_mask)
        x = self.norm(x)
        logits = self.lm_head(x)

        loss = None
        if labels is not None:
            # Standard causal LM shift: predict token t+1 from positions <= t.
            shift_logits = logits[:, :-1, :].contiguous()
            shift_labels = labels[:, 1:].contiguous()
            loss_fct = CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                            shift_labels.view(-1))

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=None,
            hidden_states=None,
            attentions=None,
        )
# -----------------------------
# AutoModel registration
# -----------------------------
from transformers import AutoConfig, AutoModelForCausalLM
AutoConfig.register("sam2", Sam2Config)
AutoModelForCausalLM.register(Sam2Config, Sam2Model)
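# -----------------------------
# Usage example (illustrative)
# -----------------------------
# A minimal smoke test, kept as a sketch rather than part of the model code:
# it assumes only torch and transformers are installed, uses arbitrary tensor
# shapes and an arbitrary local directory name ("sam2-tiny"), and relies on
# the registration above succeeding in your transformers version.
if __name__ == "__main__":
    config = Sam2Config()
    model = Sam2Model(config)
    model.eval()

    # Random in-vocab token ids; the last two positions of the second sequence
    # are treated as padding via the attention mask.
    input_ids = torch.randint(0, config.vocab_size, (2, 16))
    attention_mask = torch.ones(2, 16, dtype=torch.long)
    attention_mask[1, -2:] = 0

    # Reuse the inputs as labels; padded positions are ignored through -100.
    labels = input_ids.masked_fill(attention_mask == 0, -100)

    with torch.no_grad():
        out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    print("logits shape:", tuple(out.logits.shape), "loss:", float(out.loss))

    # Round-trip through save_pretrained/from_pretrained to confirm that the
    # AutoModelForCausalLM registration is picked up for model_type "sam2".
    model.save_pretrained("sam2-tiny")
    reloaded = AutoModelForCausalLM.from_pretrained("sam2-tiny")
    print("reloaded class:", type(reloaded).__name__)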