# SykoLLM / modeling_syko.py
# Upgrade: scaled up to ~15M parameters and a 256-token block size.
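# A compact GPT-style decoder-only Transformer wrapped in the Hugging Face
# PreTrainedModel API so it serializes with save_pretrained()/from_pretrained().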
import torch
import torch.nn as nn
from torch.nn import functional as F
from transformers import PretrainedConfig, PreTrainedModel
from transformers.modeling_outputs import CausalLMOutputWithPast
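

# Model hyperparameters, stored in a PretrainedConfig so they are saved and
# loaded alongside the weights.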
class SykoConfig(PretrainedConfig):
    model_type = "syko"

    def __init__(
        self,
        vocab_size=4096,
        n_embd=384,      # increased (was 256)
        n_layer=8,       # increased (was 6)
        n_head=6,        # adjusted (384 / 64 = 6)
        block_size=256,  # increased (was 64) -> longer context
        dropout=0.2,
        **kwargs
    ):
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.block_size = block_size
        self.dropout = dropout
        # Aliases so generic transformers utilities can read the standard names.
        self.num_hidden_layers = n_layer
        self.hidden_size = n_embd
        self.num_attention_heads = n_head
        super().__init__(**kwargs)
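

# One head of causal (masked) self-attention.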
class Head(nn.Module):
    def __init__(self, n_embd, head_size, block_size, dropout):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular mask buffer; moves with the module across devices.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        # Standard scaled dot-product attention: scale by 1/sqrt(head_size),
        # i.e. k's last dimension, not the full embedding width C.
        wei = q @ k.transpose(-2, -1) * (k.shape[-1] ** -0.5)
        # The mask must be sliced dynamically to the incoming sequence length T.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        out = wei @ v
        return out
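

# A minimal optional sketch (not wired into the model): assuming PyTorch >= 2.0,
# the same causal attention can be computed with the fused kernel. With
# is_causal=True, F.scaled_dot_product_attention applies the same triangular
# mask and 1/sqrt(head_size) scaling as Head.forward above.
def _fused_head_forward(head, x):
    q, k, v = head.query(x), head.key(x), head.value(x)
    return F.scaled_dot_product_attention(
        q, k, v,
        dropout_p=head.dropout.p if head.training else 0.0,
        is_causal=True,
    )


# Runs n_head heads in parallel and concatenates their outputs back to n_embd
# (head_size * n_head == n_embd), followed by an output projection.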
class MultiHeadAttention(nn.Module):
    def __init__(self, n_head, head_size, n_embd, block_size, dropout):
        super().__init__()
        self.heads = nn.ModuleList([Head(n_embd, head_size, block_size, dropout) for _ in range(n_head)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out
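

# Position-wise feed-forward network with the usual 4x expansion and GELU.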
class FeedForward(nn.Module):
    def __init__(self, n_embd, dropout):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.GELU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)
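

# Pre-norm transformer block: LayerNorm feeds each sublayer, with residual
# connections around both attention and the feed-forward network.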
class Block(nn.Module):
    def __init__(self, n_embd, n_head, block_size, dropout):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size, n_embd, block_size, dropout)
        self.ffwd = FeedForward(n_embd, dropout)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x
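

# The full model: token + learned position embeddings, a stack of blocks,
# a final LayerNorm, and a linear head over the vocabulary.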
class SykoForCausalLM(PreTrainedModel):
    config_class = SykoConfig

    def __init__(self, config):
        super().__init__(config)
        self.vocab_size = config.vocab_size
        self.n_embd = config.n_embd
        self.block_size = config.block_size
        self.n_head = config.n_head
        self.n_layer = config.n_layer
        self.dropout = config.dropout
        self.token_embedding_table = nn.Embedding(self.vocab_size, self.n_embd)
        self.position_embedding_table = nn.Embedding(self.block_size, self.n_embd)
        self.blocks = nn.Sequential(*[Block(self.n_embd, self.n_head, self.block_size, self.dropout) for _ in range(self.n_layer)])
        self.ln_f = nn.LayerNorm(self.n_embd)
        self.lm_head = nn.Linear(self.n_embd, self.vocab_size)
        self.apply(self._init_weights)

    def get_input_embeddings(self):
        return self.token_embedding_table

    def set_input_embeddings(self, new_embeddings):
        self.token_embedding_table = new_embeddings

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, input_ids, labels=None, **kwargs):
        idx = input_ids
        B, T = idx.shape
        device = idx.device
        # Safety check: if the context length T exceeds block_size, keep only
        # the most recent block_size tokens.
        if T > self.block_size:
            idx = idx[:, -self.block_size:]
            T = self.block_size
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        tok_emb = self.token_embedding_table(idx)
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        loss = None
        if labels is not None:
            # If idx was truncated above, truncate labels to match. Labels are
            # compared position-by-position (no internal shift), so the caller
            # is expected to supply already-aligned targets.
            if labels.shape[1] > T:
                labels = labels[:, -T:]
            B, T, C = logits.shape
            loss = F.cross_entropy(logits.view(B * T, C), labels.view(B * T))
        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=None,
            hidden_states=None,
            attentions=None,
        )

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        return {"input_ids": input_ids}